Hard-coding the default namespace breaks queries when the namespace is empty, so make it a parameter.

The new --namespace (-n) parameter defaults to http://www.w3.org/1999/xhtml
because it's easier to pass "" on the command line than to type out that
long string.
This commit is contained in:
King_DuckZ 2020-04-01 03:09:45 +02:00
parent 55eb7c1fc0
commit b536026f58
8 changed files with 23 additions and 15 deletions

View file

@ -53,6 +53,7 @@ namespace duck {
("agent", po::value<std::string>()->default_value(DEFAULT_USER_AGENT), "User agent that will be passed to the server")
("model,m", po::value<std::string>(), "Read XPath expressions from the specified file instead of command line")
("from-code,f", po::value<std::string>()->default_value(""), "Force source charset to this, disregard any charset reported by the server")
("namespace,n", po::value<std::string>()->default_value("http://www.w3.org/1999/xhtml"), "Default namespace for XPath queries (try empty string if you get no results)")
;
po::options_description positional_options("Positional options");
positional_options.add_options()

View file

@ -105,13 +105,15 @@ namespace {
std::vector<std::string> queries;
queries.reserve(1);
queries.push_back(std::move(xpath_str));
auto results = xpath->run_query(html, queries);
auto results = xpath->run_query(html, queries, parVarMap["namespace"].as<std::string>());
for (const auto& lst : results[0]) {
std::cout << lst.first << ": " << lst.second << '\n';
}
}
void load_from_model (const boost::program_options::variables_map& parVarMap, duck::XPathPtr xpath) {
using std::string;
#if !defined(NDEBUG)
std::cout << " -- XPath model mode --\n";
if (parVarMap.count("input-url"))
@ -126,7 +128,7 @@ namespace {
std::string(parVarMap["agent"].as<std::string>()),
std::string(parVarMap["from-code"].as<std::string>())
);
duck::sl::apply(ast, duck::sl::HtmlPoolBaseSP(&html_pool), xpath);
duck::sl::apply(ast, duck::sl::HtmlPoolBaseSP(&html_pool), xpath, string(parVarMap["namespace"].as<string>()));
//auto list = duck::get_xpath_definitions(*ast);
//std::vector<std::string> expressions;

View file

@ -397,7 +397,8 @@ namespace duck { namespace sl {
std::vector<std::string> apply (
const ScrapNode& node,
HtmlPoolBaseSP html_pool,
XPathPtr xpath
XPathPtr xpath,
std::string&& parDefNamespace
) {
using std::placeholders::_1;
@ -411,7 +412,7 @@ namespace duck { namespace sl {
retval.reserve(apply_entries.size());
std::cout << "-------------- visiting done ----------------\n";
XPathRunner xpath_runner(html_pool, xpath);
XPathRunner xpath_runner(html_pool, xpath, std::move(parDefNamespace));
for (auto& apply_entry : apply_entries) {
std::string name(apply_entry.mustache_name);

View file

@ -28,7 +28,8 @@ namespace duck { namespace sl {
std::vector<std::string> apply (
const ScrapNode& node,
HtmlPoolBaseSP html_pool,
XPathPtr xpath
XPathPtr xpath,
std::string&& parDefNamespace
);
}} //namespace duck::sl

View file

@ -47,8 +47,9 @@ namespace duck { namespace sl {
}
};
XPathRunner::XPathRunner (HtmlPoolBaseSP html_pool, XPathPtr& parXPath) :
XPathRunner::XPathRunner (HtmlPoolBaseSP html_pool, XPathPtr& parXPath, std::string&& parDefNamespace) :
m_cached_results(),
m_def_namespace(std::move(parDefNamespace)),
m_pool(html_pool),
m_xpath(parXPath)
{
@ -74,7 +75,7 @@ namespace duck { namespace sl {
#endif
const std::string* html = m_pool->GetByID(id);
curr_vec = m_xpath->run_query(*html, std::string(parQuery));
curr_vec = m_xpath->run_query(*html, std::string(parQuery), m_def_namespace);
std::cout << "First time for this query, result cached now\n";
}

View file

@ -28,7 +28,7 @@
namespace duck { namespace sl {
class XPathRunner {
public:
XPathRunner (HtmlPoolBaseSP html_pool, XPathPtr& parXPath);
XPathRunner (HtmlPoolBaseSP html_pool, XPathPtr& parXPath, std::string&& parDefNamespace);
~XPathRunner();
const std::vector<std::string>& query (
@ -40,6 +40,7 @@ namespace duck { namespace sl {
struct XPathKey;
std::map<XPathKey, std::vector<std::string>> m_cached_results;
std::string m_def_namespace;
HtmlPoolBaseSP m_pool;
XPathPtr m_xpath;
};

View file

@ -52,17 +52,18 @@ namespace duck {
XPath::~XPath() = default;
auto XPath::run_query (const std::string& parXML, const std::vector<std::string>& parQueries) -> BatchResults {
auto XPath::run_query (const std::string& parXML, const std::vector<std::string>& parQueries, const std::string& parDefNamespace) -> BatchResults {
XQilla& xqilla = m_xqilla;
XercesConfiguration xconfig;
xercesc::MemBufInputSource input_buf(reinterpret_cast<const XMLByte*>(parXML.c_str()), parXML.size(), "n/a", false);
BatchResults retval;
try {
AutoDelete<DynamicContext> context(xqilla.createContext(XQilla::XQUERY3_UPDATE, &xconfig));
//see http://xqilla.sourceforge.net/docs/simple-api/classStaticContext.html#adc869a84712459fa49db67fe837c9b01
context->setDefaultElementAndTypeNS(u"http://www.w3.org/1999/xhtml");
Node::Ptr ptr = context->parseDocument(input_buf);
context->setContextItem(ptr);
//see http://xqilla.sourceforge.net/docs/simple-api/classStaticContext.html#adc869a84712459fa49db67fe837c9b01
AutoDeleteArray<XMLCh> ns_wide = xercesc::XMLString::transcode(parDefNamespace.c_str());
context->setDefaultElementAndTypeNS(ns_wide);
for (const auto& xpath : parQueries) {
AutoDeleteArray<XMLCh> xpath_wide = xercesc::XMLString::transcode(xpath.c_str());
@ -90,8 +91,8 @@ namespace duck {
return retval;
}
std::vector<std::string> XPath::run_query (const std::string& parXML, const std::string& parQuery) {
auto query_res = run_query(parXML, std::vector<std::string>{parQuery});
std::vector<std::string> XPath::run_query (const std::string& parXML, const std::string& parQuery, const std::string& parDefNamespace) {
auto query_res = run_query(parXML, std::vector<std::string>{parQuery}, parDefNamespace);
if (query_res.empty() or query_res.front().empty()) {
return std::vector<std::string>();
}

View file

@ -42,8 +42,8 @@ namespace duck {
XPath();
~XPath();
BatchResults run_query ( const std::string& parXML, const std::vector<std::string>& parQueries );
std::vector<std::string> run_query ( const std::string& parXML, const std::string& parQuery );
BatchResults run_query ( const std::string& parXML, const std::vector<std::string>& parQueries, const std::string& parDefNamespace );
std::vector<std::string> run_query ( const std::string& parXML, const std::string& parQuery, const std::string& parDefNamespace );
private:
XQilla m_xqilla;