Hard-coding the default namespace breaks queries when the namespace is empty, so make it a parameter.
The new --namespace (-n) parameter defaults to http://www.w3.org/1999/xhtml because it is easier to pass "" on the command line than to type out that long string.
This commit is contained in:
parent
55eb7c1fc0
commit
b536026f58
8 changed files with 23 additions and 15 deletions
|
@ -53,6 +53,7 @@ namespace duck {
|
||||||
("agent", po::value<std::string>()->default_value(DEFAULT_USER_AGENT), "User agent that will be passed to the server")
|
("agent", po::value<std::string>()->default_value(DEFAULT_USER_AGENT), "User agent that will be passed to the server")
|
||||||
("model,m", po::value<std::string>(), "Read XPath expressions from the specified file instead of command line")
|
("model,m", po::value<std::string>(), "Read XPath expressions from the specified file instead of command line")
|
||||||
("from-code,f", po::value<std::string>()->default_value(""), "Force source charset to this, disregard any charset reported by the server")
|
("from-code,f", po::value<std::string>()->default_value(""), "Force source charset to this, disregard any charset reported by the server")
|
||||||
|
("namespace,n", po::value<std::string>()->default_value("http://www.w3.org/1999/xhtml"), "Default namespace for XPath queries (try empty string if you get no results)")
|
||||||
;
|
;
|
||||||
po::options_description positional_options("Positional options");
|
po::options_description positional_options("Positional options");
|
||||||
positional_options.add_options()
|
positional_options.add_options()
|
||||||
|
|
|
@ -105,13 +105,15 @@ namespace {
|
||||||
std::vector<std::string> queries;
|
std::vector<std::string> queries;
|
||||||
queries.reserve(1);
|
queries.reserve(1);
|
||||||
queries.push_back(std::move(xpath_str));
|
queries.push_back(std::move(xpath_str));
|
||||||
auto results = xpath->run_query(html, queries);
|
auto results = xpath->run_query(html, queries, parVarMap["namespace"].as<std::string>());
|
||||||
for (const auto& lst : results[0]) {
|
for (const auto& lst : results[0]) {
|
||||||
std::cout << lst.first << ": " << lst.second << '\n';
|
std::cout << lst.first << ": " << lst.second << '\n';
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void load_from_model (const boost::program_options::variables_map& parVarMap, duck::XPathPtr xpath) {
|
void load_from_model (const boost::program_options::variables_map& parVarMap, duck::XPathPtr xpath) {
|
||||||
|
using std::string;
|
||||||
|
|
||||||
#if !defined(NDEBUG)
|
#if !defined(NDEBUG)
|
||||||
std::cout << " -- XPath model mode --\n";
|
std::cout << " -- XPath model mode --\n";
|
||||||
if (parVarMap.count("input-url"))
|
if (parVarMap.count("input-url"))
|
||||||
|
@ -126,7 +128,7 @@ namespace {
|
||||||
std::string(parVarMap["agent"].as<std::string>()),
|
std::string(parVarMap["agent"].as<std::string>()),
|
||||||
std::string(parVarMap["from-code"].as<std::string>())
|
std::string(parVarMap["from-code"].as<std::string>())
|
||||||
);
|
);
|
||||||
duck::sl::apply(ast, duck::sl::HtmlPoolBaseSP(&html_pool), xpath);
|
duck::sl::apply(ast, duck::sl::HtmlPoolBaseSP(&html_pool), xpath, string(parVarMap["namespace"].as<string>()));
|
||||||
//auto list = duck::get_xpath_definitions(*ast);
|
//auto list = duck::get_xpath_definitions(*ast);
|
||||||
|
|
||||||
//std::vector<std::string> expressions;
|
//std::vector<std::string> expressions;
|
||||||
|
|
|
@ -397,7 +397,8 @@ namespace duck { namespace sl {
|
||||||
std::vector<std::string> apply (
|
std::vector<std::string> apply (
|
||||||
const ScrapNode& node,
|
const ScrapNode& node,
|
||||||
HtmlPoolBaseSP html_pool,
|
HtmlPoolBaseSP html_pool,
|
||||||
XPathPtr xpath
|
XPathPtr xpath,
|
||||||
|
std::string&& parDefNamespace
|
||||||
) {
|
) {
|
||||||
using std::placeholders::_1;
|
using std::placeholders::_1;
|
||||||
|
|
||||||
|
@ -411,7 +412,7 @@ namespace duck { namespace sl {
|
||||||
retval.reserve(apply_entries.size());
|
retval.reserve(apply_entries.size());
|
||||||
|
|
||||||
std::cout << "-------------- visiting done ----------------\n";
|
std::cout << "-------------- visiting done ----------------\n";
|
||||||
XPathRunner xpath_runner(html_pool, xpath);
|
XPathRunner xpath_runner(html_pool, xpath, std::move(parDefNamespace));
|
||||||
|
|
||||||
for (auto& apply_entry : apply_entries) {
|
for (auto& apply_entry : apply_entries) {
|
||||||
std::string name(apply_entry.mustache_name);
|
std::string name(apply_entry.mustache_name);
|
||||||
|
|
|
@ -28,7 +28,8 @@ namespace duck { namespace sl {
|
||||||
std::vector<std::string> apply (
|
std::vector<std::string> apply (
|
||||||
const ScrapNode& node,
|
const ScrapNode& node,
|
||||||
HtmlPoolBaseSP html_pool,
|
HtmlPoolBaseSP html_pool,
|
||||||
XPathPtr xpath
|
XPathPtr xpath,
|
||||||
|
std::string&& parDefNamespace
|
||||||
);
|
);
|
||||||
}} //namespace duck::sl
|
}} //namespace duck::sl
|
||||||
|
|
||||||
|
|
|
@ -47,8 +47,9 @@ namespace duck { namespace sl {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
XPathRunner::XPathRunner (HtmlPoolBaseSP html_pool, XPathPtr& parXPath) :
|
XPathRunner::XPathRunner (HtmlPoolBaseSP html_pool, XPathPtr& parXPath, std::string&& parDefNamespace) :
|
||||||
m_cached_results(),
|
m_cached_results(),
|
||||||
|
m_def_namespace(std::move(parDefNamespace)),
|
||||||
m_pool(html_pool),
|
m_pool(html_pool),
|
||||||
m_xpath(parXPath)
|
m_xpath(parXPath)
|
||||||
{
|
{
|
||||||
|
@ -74,7 +75,7 @@ namespace duck { namespace sl {
|
||||||
#endif
|
#endif
|
||||||
const std::string* html = m_pool->GetByID(id);
|
const std::string* html = m_pool->GetByID(id);
|
||||||
|
|
||||||
curr_vec = m_xpath->run_query(*html, std::string(parQuery));
|
curr_vec = m_xpath->run_query(*html, std::string(parQuery), m_def_namespace);
|
||||||
std::cout << "First time for this query, result cached now\n";
|
std::cout << "First time for this query, result cached now\n";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -28,7 +28,7 @@
|
||||||
namespace duck { namespace sl {
|
namespace duck { namespace sl {
|
||||||
class XPathRunner {
|
class XPathRunner {
|
||||||
public:
|
public:
|
||||||
XPathRunner (HtmlPoolBaseSP html_pool, XPathPtr& parXPath);
|
XPathRunner (HtmlPoolBaseSP html_pool, XPathPtr& parXPath, std::string&& parDefNamespace);
|
||||||
~XPathRunner();
|
~XPathRunner();
|
||||||
|
|
||||||
const std::vector<std::string>& query (
|
const std::vector<std::string>& query (
|
||||||
|
@ -40,6 +40,7 @@ namespace duck { namespace sl {
|
||||||
struct XPathKey;
|
struct XPathKey;
|
||||||
|
|
||||||
std::map<XPathKey, std::vector<std::string>> m_cached_results;
|
std::map<XPathKey, std::vector<std::string>> m_cached_results;
|
||||||
|
std::string m_def_namespace;
|
||||||
HtmlPoolBaseSP m_pool;
|
HtmlPoolBaseSP m_pool;
|
||||||
XPathPtr m_xpath;
|
XPathPtr m_xpath;
|
||||||
};
|
};
|
||||||
|
|
|
@ -52,17 +52,18 @@ namespace duck {
|
||||||
|
|
||||||
XPath::~XPath() = default;
|
XPath::~XPath() = default;
|
||||||
|
|
||||||
auto XPath::run_query (const std::string& parXML, const std::vector<std::string>& parQueries) -> BatchResults {
|
auto XPath::run_query (const std::string& parXML, const std::vector<std::string>& parQueries, const std::string& parDefNamespace) -> BatchResults {
|
||||||
XQilla& xqilla = m_xqilla;
|
XQilla& xqilla = m_xqilla;
|
||||||
XercesConfiguration xconfig;
|
XercesConfiguration xconfig;
|
||||||
xercesc::MemBufInputSource input_buf(reinterpret_cast<const XMLByte*>(parXML.c_str()), parXML.size(), "n/a", false);
|
xercesc::MemBufInputSource input_buf(reinterpret_cast<const XMLByte*>(parXML.c_str()), parXML.size(), "n/a", false);
|
||||||
BatchResults retval;
|
BatchResults retval;
|
||||||
try {
|
try {
|
||||||
AutoDelete<DynamicContext> context(xqilla.createContext(XQilla::XQUERY3_UPDATE, &xconfig));
|
AutoDelete<DynamicContext> context(xqilla.createContext(XQilla::XQUERY3_UPDATE, &xconfig));
|
||||||
//see http://xqilla.sourceforge.net/docs/simple-api/classStaticContext.html#adc869a84712459fa49db67fe837c9b01
|
|
||||||
context->setDefaultElementAndTypeNS(u"http://www.w3.org/1999/xhtml");
|
|
||||||
Node::Ptr ptr = context->parseDocument(input_buf);
|
Node::Ptr ptr = context->parseDocument(input_buf);
|
||||||
context->setContextItem(ptr);
|
context->setContextItem(ptr);
|
||||||
|
//see http://xqilla.sourceforge.net/docs/simple-api/classStaticContext.html#adc869a84712459fa49db67fe837c9b01
|
||||||
|
AutoDeleteArray<XMLCh> ns_wide = xercesc::XMLString::transcode(parDefNamespace.c_str());
|
||||||
|
context->setDefaultElementAndTypeNS(ns_wide);
|
||||||
|
|
||||||
for (const auto& xpath : parQueries) {
|
for (const auto& xpath : parQueries) {
|
||||||
AutoDeleteArray<XMLCh> xpath_wide = xercesc::XMLString::transcode(xpath.c_str());
|
AutoDeleteArray<XMLCh> xpath_wide = xercesc::XMLString::transcode(xpath.c_str());
|
||||||
|
@ -90,8 +91,8 @@ namespace duck {
|
||||||
return retval;
|
return retval;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<std::string> XPath::run_query (const std::string& parXML, const std::string& parQuery) {
|
std::vector<std::string> XPath::run_query (const std::string& parXML, const std::string& parQuery, const std::string& parDefNamespace) {
|
||||||
auto query_res = run_query(parXML, std::vector<std::string>{parQuery});
|
auto query_res = run_query(parXML, std::vector<std::string>{parQuery}, parDefNamespace);
|
||||||
if (query_res.empty() or query_res.front().empty()) {
|
if (query_res.empty() or query_res.front().empty()) {
|
||||||
return std::vector<std::string>();
|
return std::vector<std::string>();
|
||||||
}
|
}
|
||||||
|
|
|
@ -42,8 +42,8 @@ namespace duck {
|
||||||
XPath();
|
XPath();
|
||||||
~XPath();
|
~XPath();
|
||||||
|
|
||||||
BatchResults run_query ( const std::string& parXML, const std::vector<std::string>& parQueries );
|
BatchResults run_query ( const std::string& parXML, const std::vector<std::string>& parQueries, const std::string& parDefNamespace );
|
||||||
std::vector<std::string> run_query ( const std::string& parXML, const std::string& parQuery );
|
std::vector<std::string> run_query ( const std::string& parXML, const std::string& parQuery, const std::string& parDefNamespace );
|
||||||
|
|
||||||
private:
|
private:
|
||||||
XQilla m_xqilla;
|
XQilla m_xqilla;
|
||||||
|
|
Loading…
Add table
Reference in a new issue