From b536026f587168b4271247827c4d36245efa2aea Mon Sep 17 00:00:00 2001 From: King_DuckZ Date: Wed, 1 Apr 2020 03:09:45 +0200 Subject: [PATCH] Setting a default namespace breaks queries when namespace is empty, so make it a parameter. The new --namespace (-n) parameter defaults to http://www.w3.org/1999/xhtml because it's easier to set it to "" on the command line than to that long string. --- src/commandline.cpp | 1 + src/main.cpp | 6 ++++-- src/scraplang/apply.cpp | 5 +++-- src/scraplang/apply.hpp | 3 ++- src/scraplang/xpath_runner.cpp | 5 +++-- src/scraplang/xpath_runner.hpp | 3 ++- src/xpath.cpp | 11 ++++++----- src/xpath.hpp | 4 ++-- 8 files changed, 23 insertions(+), 15 deletions(-) diff --git a/src/commandline.cpp b/src/commandline.cpp index a86d166..bc6ca8e 100644 --- a/src/commandline.cpp +++ b/src/commandline.cpp @@ -53,6 +53,7 @@ namespace duck { ("agent", po::value()->default_value(DEFAULT_USER_AGENT), "User agent that will be passed to the server") ("model,m", po::value(), "Read XPath expressions from the specified file instead of command line") ("from-code,f", po::value()->default_value(""), "Force source charset to this, disregard any charset reported by the server") + ("namespace,n", po::value()->default_value("http://www.w3.org/1999/xhtml"), "Default namespace for XPath queries (try empty string if you get no results)") ; po::options_description positional_options("Positional options"); positional_options.add_options() diff --git a/src/main.cpp b/src/main.cpp index 03b2577..0abec27 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -105,13 +105,15 @@ namespace { std::vector queries; queries.reserve(1); queries.push_back(std::move(xpath_str)); - auto results = xpath->run_query(html, queries); + auto results = xpath->run_query(html, queries, parVarMap["namespace"].as()); for (const auto& lst : results[0]) { std::cout << lst.first << ": " << lst.second << '\n'; } } void load_from_model (const boost::program_options::variables_map& parVarMap, duck::XPathPtr xpath) { + using std::string; + #if !defined(NDEBUG) std::cout << " -- XPath model mode --\n"; if (parVarMap.count("input-url")) @@ -126,7 +128,7 @@ namespace { std::string(parVarMap["agent"].as()), std::string(parVarMap["from-code"].as()) ); - duck::sl::apply(ast, duck::sl::HtmlPoolBaseSP(&html_pool), xpath); + duck::sl::apply(ast, duck::sl::HtmlPoolBaseSP(&html_pool), xpath, string(parVarMap["namespace"].as())); //auto list = duck::get_xpath_definitions(*ast); //std::vector expressions; diff --git a/src/scraplang/apply.cpp b/src/scraplang/apply.cpp index e9dddf2..dbfe159 100644 --- a/src/scraplang/apply.cpp +++ b/src/scraplang/apply.cpp @@ -397,7 +397,8 @@ namespace duck { namespace sl { std::vector apply ( const ScrapNode& node, HtmlPoolBaseSP html_pool, - XPathPtr xpath + XPathPtr xpath, + std::string&& parDefNamespace ) { using std::placeholders::_1; @@ -411,7 +412,7 @@ namespace duck { namespace sl { retval.reserve(apply_entries.size()); std::cout << "-------------- visiting done ----------------\n"; - XPathRunner xpath_runner(html_pool, xpath); + XPathRunner xpath_runner(html_pool, xpath, std::move(parDefNamespace)); for (auto& apply_entry : apply_entries) { std::string name(apply_entry.mustache_name); diff --git a/src/scraplang/apply.hpp b/src/scraplang/apply.hpp index 18efb4d..fe6de2e 100644 --- a/src/scraplang/apply.hpp +++ b/src/scraplang/apply.hpp @@ -28,7 +28,8 @@ namespace duck { namespace sl { std::vector apply ( const ScrapNode& node, HtmlPoolBaseSP html_pool, - XPathPtr xpath + XPathPtr xpath, + std::string&& parDefNamespace ); }} //namespace duck::sl diff --git a/src/scraplang/xpath_runner.cpp b/src/scraplang/xpath_runner.cpp index dabcf5e..3df82f5 100644 --- a/src/scraplang/xpath_runner.cpp +++ b/src/scraplang/xpath_runner.cpp @@ -47,8 +47,9 @@ namespace duck { namespace sl { } }; - XPathRunner::XPathRunner (HtmlPoolBaseSP html_pool, XPathPtr& parXPath) : + XPathRunner::XPathRunner (HtmlPoolBaseSP html_pool, XPathPtr& parXPath, std::string&& parDefNamespace) : m_cached_results(), + m_def_namespace(std::move(parDefNamespace)), m_pool(html_pool), m_xpath(parXPath) { @@ -74,7 +75,7 @@ namespace duck { namespace sl { #endif const std::string* html = m_pool->GetByID(id); - curr_vec = m_xpath->run_query(*html, std::string(parQuery)); + curr_vec = m_xpath->run_query(*html, std::string(parQuery), m_def_namespace); std::cout << "First time for this query, result cached now\n"; } diff --git a/src/scraplang/xpath_runner.hpp b/src/scraplang/xpath_runner.hpp index ce84e53..321a40d 100644 --- a/src/scraplang/xpath_runner.hpp +++ b/src/scraplang/xpath_runner.hpp @@ -28,7 +28,7 @@ namespace duck { namespace sl { class XPathRunner { public: - XPathRunner (HtmlPoolBaseSP html_pool, XPathPtr& parXPath); + XPathRunner (HtmlPoolBaseSP html_pool, XPathPtr& parXPath, std::string&& parDefNamespace); ~XPathRunner(); const std::vector& query ( @@ -40,6 +40,7 @@ namespace duck { namespace sl { struct XPathKey; std::map> m_cached_results; + std::string m_def_namespace; HtmlPoolBaseSP m_pool; XPathPtr m_xpath; }; diff --git a/src/xpath.cpp b/src/xpath.cpp index f51ff42..7612718 100644 --- a/src/xpath.cpp +++ b/src/xpath.cpp @@ -52,17 +52,18 @@ namespace duck { XPath::~XPath() = default; - auto XPath::run_query (const std::string& parXML, const std::vector& parQueries) -> BatchResults { + auto XPath::run_query (const std::string& parXML, const std::vector& parQueries, const std::string& parDefNamespace) -> BatchResults { XQilla& xqilla = m_xqilla; XercesConfiguration xconfig; xercesc::MemBufInputSource input_buf(reinterpret_cast(parXML.c_str()), parXML.size(), "n/a", false); BatchResults retval; try { AutoDelete context(xqilla.createContext(XQilla::XQUERY3_UPDATE, &xconfig)); - //see http://xqilla.sourceforge.net/docs/simple-api/classStaticContext.html#adc869a84712459fa49db67fe837c9b01 - context->setDefaultElementAndTypeNS(u"http://www.w3.org/1999/xhtml"); Node::Ptr ptr = context->parseDocument(input_buf); context->setContextItem(ptr); + //see http://xqilla.sourceforge.net/docs/simple-api/classStaticContext.html#adc869a84712459fa49db67fe837c9b01 + AutoDeleteArray ns_wide = xercesc::XMLString::transcode(parDefNamespace.c_str()); + context->setDefaultElementAndTypeNS(ns_wide); for (const auto& xpath : parQueries) { AutoDeleteArray xpath_wide = xercesc::XMLString::transcode(xpath.c_str()); @@ -90,8 +91,8 @@ namespace duck { return retval; } - std::vector XPath::run_query (const std::string& parXML, const std::string& parQuery) { - auto query_res = run_query(parXML, std::vector{parQuery}); + std::vector XPath::run_query (const std::string& parXML, const std::string& parQuery, const std::string& parDefNamespace) { + auto query_res = run_query(parXML, std::vector{parQuery}, parDefNamespace); if (query_res.empty() or query_res.front().empty()) { return std::vector(); } diff --git a/src/xpath.hpp b/src/xpath.hpp index 6166218..7428d25 100644 --- a/src/xpath.hpp +++ b/src/xpath.hpp @@ -42,8 +42,8 @@ namespace duck { XPath(); ~XPath(); - BatchResults run_query ( const std::string& parXML, const std::vector& parQueries ); - std::vector run_query ( const std::string& parXML, const std::string& parQuery ); + BatchResults run_query ( const std::string& parXML, const std::vector& parQueries, const std::string& parDefNamespace ); + std::vector run_query ( const std::string& parXML, const std::string& parQuery, const std::string& parDefNamespace ); private: XQilla m_xqilla;