Hard-coding a default namespace breaks queries when the document's namespace is empty, so make it a parameter.

The new --namespace (-n) parameter defaults to http://www.w3.org/1999/xhtml
because it's easier to pass "" on the command line than to type out that
long string.
This commit is contained in:
King_DuckZ 2020-04-01 03:09:45 +02:00
parent 55eb7c1fc0
commit b536026f58
8 changed files with 23 additions and 15 deletions

View file

@ -53,6 +53,7 @@ namespace duck {
("agent", po::value<std::string>()->default_value(DEFAULT_USER_AGENT), "User agent that will be passed to the server") ("agent", po::value<std::string>()->default_value(DEFAULT_USER_AGENT), "User agent that will be passed to the server")
("model,m", po::value<std::string>(), "Read XPath expressions from the specified file instead of command line") ("model,m", po::value<std::string>(), "Read XPath expressions from the specified file instead of command line")
("from-code,f", po::value<std::string>()->default_value(""), "Force source charset to this, disregard any charset reported by the server") ("from-code,f", po::value<std::string>()->default_value(""), "Force source charset to this, disregard any charset reported by the server")
("namespace,n", po::value<std::string>()->default_value("http://www.w3.org/1999/xhtml"), "Default namespace for XPath queries (try empty string if you get no results)")
; ;
po::options_description positional_options("Positional options"); po::options_description positional_options("Positional options");
positional_options.add_options() positional_options.add_options()

View file

@ -105,13 +105,15 @@ namespace {
std::vector<std::string> queries; std::vector<std::string> queries;
queries.reserve(1); queries.reserve(1);
queries.push_back(std::move(xpath_str)); queries.push_back(std::move(xpath_str));
auto results = xpath->run_query(html, queries); auto results = xpath->run_query(html, queries, parVarMap["namespace"].as<std::string>());
for (const auto& lst : results[0]) { for (const auto& lst : results[0]) {
std::cout << lst.first << ": " << lst.second << '\n'; std::cout << lst.first << ": " << lst.second << '\n';
} }
} }
void load_from_model (const boost::program_options::variables_map& parVarMap, duck::XPathPtr xpath) { void load_from_model (const boost::program_options::variables_map& parVarMap, duck::XPathPtr xpath) {
using std::string;
#if !defined(NDEBUG) #if !defined(NDEBUG)
std::cout << " -- XPath model mode --\n"; std::cout << " -- XPath model mode --\n";
if (parVarMap.count("input-url")) if (parVarMap.count("input-url"))
@ -126,7 +128,7 @@ namespace {
std::string(parVarMap["agent"].as<std::string>()), std::string(parVarMap["agent"].as<std::string>()),
std::string(parVarMap["from-code"].as<std::string>()) std::string(parVarMap["from-code"].as<std::string>())
); );
duck::sl::apply(ast, duck::sl::HtmlPoolBaseSP(&html_pool), xpath); duck::sl::apply(ast, duck::sl::HtmlPoolBaseSP(&html_pool), xpath, string(parVarMap["namespace"].as<string>()));
//auto list = duck::get_xpath_definitions(*ast); //auto list = duck::get_xpath_definitions(*ast);
//std::vector<std::string> expressions; //std::vector<std::string> expressions;

View file

@ -397,7 +397,8 @@ namespace duck { namespace sl {
std::vector<std::string> apply ( std::vector<std::string> apply (
const ScrapNode& node, const ScrapNode& node,
HtmlPoolBaseSP html_pool, HtmlPoolBaseSP html_pool,
XPathPtr xpath XPathPtr xpath,
std::string&& parDefNamespace
) { ) {
using std::placeholders::_1; using std::placeholders::_1;
@ -411,7 +412,7 @@ namespace duck { namespace sl {
retval.reserve(apply_entries.size()); retval.reserve(apply_entries.size());
std::cout << "-------------- visiting done ----------------\n"; std::cout << "-------------- visiting done ----------------\n";
XPathRunner xpath_runner(html_pool, xpath); XPathRunner xpath_runner(html_pool, xpath, std::move(parDefNamespace));
for (auto& apply_entry : apply_entries) { for (auto& apply_entry : apply_entries) {
std::string name(apply_entry.mustache_name); std::string name(apply_entry.mustache_name);

View file

@ -28,7 +28,8 @@ namespace duck { namespace sl {
std::vector<std::string> apply ( std::vector<std::string> apply (
const ScrapNode& node, const ScrapNode& node,
HtmlPoolBaseSP html_pool, HtmlPoolBaseSP html_pool,
XPathPtr xpath XPathPtr xpath,
std::string&& parDefNamespace
); );
}} //namespace duck::sl }} //namespace duck::sl

View file

@ -47,8 +47,9 @@ namespace duck { namespace sl {
} }
}; };
XPathRunner::XPathRunner (HtmlPoolBaseSP html_pool, XPathPtr& parXPath) : XPathRunner::XPathRunner (HtmlPoolBaseSP html_pool, XPathPtr& parXPath, std::string&& parDefNamespace) :
m_cached_results(), m_cached_results(),
m_def_namespace(std::move(parDefNamespace)),
m_pool(html_pool), m_pool(html_pool),
m_xpath(parXPath) m_xpath(parXPath)
{ {
@ -74,7 +75,7 @@ namespace duck { namespace sl {
#endif #endif
const std::string* html = m_pool->GetByID(id); const std::string* html = m_pool->GetByID(id);
curr_vec = m_xpath->run_query(*html, std::string(parQuery)); curr_vec = m_xpath->run_query(*html, std::string(parQuery), m_def_namespace);
std::cout << "First time for this query, result cached now\n"; std::cout << "First time for this query, result cached now\n";
} }

View file

@ -28,7 +28,7 @@
namespace duck { namespace sl { namespace duck { namespace sl {
class XPathRunner { class XPathRunner {
public: public:
XPathRunner (HtmlPoolBaseSP html_pool, XPathPtr& parXPath); XPathRunner (HtmlPoolBaseSP html_pool, XPathPtr& parXPath, std::string&& parDefNamespace);
~XPathRunner(); ~XPathRunner();
const std::vector<std::string>& query ( const std::vector<std::string>& query (
@ -40,6 +40,7 @@ namespace duck { namespace sl {
struct XPathKey; struct XPathKey;
std::map<XPathKey, std::vector<std::string>> m_cached_results; std::map<XPathKey, std::vector<std::string>> m_cached_results;
std::string m_def_namespace;
HtmlPoolBaseSP m_pool; HtmlPoolBaseSP m_pool;
XPathPtr m_xpath; XPathPtr m_xpath;
}; };

View file

@ -52,17 +52,18 @@ namespace duck {
XPath::~XPath() = default; XPath::~XPath() = default;
auto XPath::run_query (const std::string& parXML, const std::vector<std::string>& parQueries) -> BatchResults { auto XPath::run_query (const std::string& parXML, const std::vector<std::string>& parQueries, const std::string& parDefNamespace) -> BatchResults {
XQilla& xqilla = m_xqilla; XQilla& xqilla = m_xqilla;
XercesConfiguration xconfig; XercesConfiguration xconfig;
xercesc::MemBufInputSource input_buf(reinterpret_cast<const XMLByte*>(parXML.c_str()), parXML.size(), "n/a", false); xercesc::MemBufInputSource input_buf(reinterpret_cast<const XMLByte*>(parXML.c_str()), parXML.size(), "n/a", false);
BatchResults retval; BatchResults retval;
try { try {
AutoDelete<DynamicContext> context(xqilla.createContext(XQilla::XQUERY3_UPDATE, &xconfig)); AutoDelete<DynamicContext> context(xqilla.createContext(XQilla::XQUERY3_UPDATE, &xconfig));
//see http://xqilla.sourceforge.net/docs/simple-api/classStaticContext.html#adc869a84712459fa49db67fe837c9b01
context->setDefaultElementAndTypeNS(u"http://www.w3.org/1999/xhtml");
Node::Ptr ptr = context->parseDocument(input_buf); Node::Ptr ptr = context->parseDocument(input_buf);
context->setContextItem(ptr); context->setContextItem(ptr);
//see http://xqilla.sourceforge.net/docs/simple-api/classStaticContext.html#adc869a84712459fa49db67fe837c9b01
AutoDeleteArray<XMLCh> ns_wide = xercesc::XMLString::transcode(parDefNamespace.c_str());
context->setDefaultElementAndTypeNS(ns_wide);
for (const auto& xpath : parQueries) { for (const auto& xpath : parQueries) {
AutoDeleteArray<XMLCh> xpath_wide = xercesc::XMLString::transcode(xpath.c_str()); AutoDeleteArray<XMLCh> xpath_wide = xercesc::XMLString::transcode(xpath.c_str());
@ -90,8 +91,8 @@ namespace duck {
return retval; return retval;
} }
std::vector<std::string> XPath::run_query (const std::string& parXML, const std::string& parQuery) { std::vector<std::string> XPath::run_query (const std::string& parXML, const std::string& parQuery, const std::string& parDefNamespace) {
auto query_res = run_query(parXML, std::vector<std::string>{parQuery}); auto query_res = run_query(parXML, std::vector<std::string>{parQuery}, parDefNamespace);
if (query_res.empty() or query_res.front().empty()) { if (query_res.empty() or query_res.front().empty()) {
return std::vector<std::string>(); return std::vector<std::string>();
} }

View file

@ -42,8 +42,8 @@ namespace duck {
XPath(); XPath();
~XPath(); ~XPath();
BatchResults run_query ( const std::string& parXML, const std::vector<std::string>& parQueries ); BatchResults run_query ( const std::string& parXML, const std::vector<std::string>& parQueries, const std::string& parDefNamespace );
std::vector<std::string> run_query ( const std::string& parXML, const std::string& parQuery ); std::vector<std::string> run_query ( const std::string& parXML, const std::string& parQuery, const std::string& parDefNamespace );
private: private:
XQilla m_xqilla; XQilla m_xqilla;