Hard-coding a default namespace breaks XPath queries against documents whose elements have an empty namespace, so make the default namespace a parameter.
The new --namespace (-n) parameter defaults to http://www.w3.org/1999/xhtml because it is easier to pass "" on the command line than to type that long URI.
This commit is contained in:
parent
55eb7c1fc0
commit
b536026f58
8 changed files with 23 additions and 15 deletions
|
@ -53,6 +53,7 @@ namespace duck {
|
|||
("agent", po::value<std::string>()->default_value(DEFAULT_USER_AGENT), "User agent that will be passed to the server")
|
||||
("model,m", po::value<std::string>(), "Read XPath expressions from the specified file instead of command line")
|
||||
("from-code,f", po::value<std::string>()->default_value(""), "Force source charset to this, disregard any charset reported by the server")
|
||||
("namespace,n", po::value<std::string>()->default_value("http://www.w3.org/1999/xhtml"), "Default namespace for XPath queries (try empty string if you get no results)")
|
||||
;
|
||||
po::options_description positional_options("Positional options");
|
||||
positional_options.add_options()
|
||||
|
|
|
@ -105,13 +105,15 @@ namespace {
|
|||
std::vector<std::string> queries;
|
||||
queries.reserve(1);
|
||||
queries.push_back(std::move(xpath_str));
|
||||
auto results = xpath->run_query(html, queries);
|
||||
auto results = xpath->run_query(html, queries, parVarMap["namespace"].as<std::string>());
|
||||
for (const auto& lst : results[0]) {
|
||||
std::cout << lst.first << ": " << lst.second << '\n';
|
||||
}
|
||||
}
|
||||
|
||||
void load_from_model (const boost::program_options::variables_map& parVarMap, duck::XPathPtr xpath) {
|
||||
using std::string;
|
||||
|
||||
#if !defined(NDEBUG)
|
||||
std::cout << " -- XPath model mode --\n";
|
||||
if (parVarMap.count("input-url"))
|
||||
|
@ -126,7 +128,7 @@ namespace {
|
|||
std::string(parVarMap["agent"].as<std::string>()),
|
||||
std::string(parVarMap["from-code"].as<std::string>())
|
||||
);
|
||||
duck::sl::apply(ast, duck::sl::HtmlPoolBaseSP(&html_pool), xpath);
|
||||
duck::sl::apply(ast, duck::sl::HtmlPoolBaseSP(&html_pool), xpath, string(parVarMap["namespace"].as<string>()));
|
||||
//auto list = duck::get_xpath_definitions(*ast);
|
||||
|
||||
//std::vector<std::string> expressions;
|
||||
|
|
|
@ -397,7 +397,8 @@ namespace duck { namespace sl {
|
|||
std::vector<std::string> apply (
|
||||
const ScrapNode& node,
|
||||
HtmlPoolBaseSP html_pool,
|
||||
XPathPtr xpath
|
||||
XPathPtr xpath,
|
||||
std::string&& parDefNamespace
|
||||
) {
|
||||
using std::placeholders::_1;
|
||||
|
||||
|
@ -411,7 +412,7 @@ namespace duck { namespace sl {
|
|||
retval.reserve(apply_entries.size());
|
||||
|
||||
std::cout << "-------------- visiting done ----------------\n";
|
||||
XPathRunner xpath_runner(html_pool, xpath);
|
||||
XPathRunner xpath_runner(html_pool, xpath, std::move(parDefNamespace));
|
||||
|
||||
for (auto& apply_entry : apply_entries) {
|
||||
std::string name(apply_entry.mustache_name);
|
||||
|
|
|
@ -28,7 +28,8 @@ namespace duck { namespace sl {
|
|||
std::vector<std::string> apply (
|
||||
const ScrapNode& node,
|
||||
HtmlPoolBaseSP html_pool,
|
||||
XPathPtr xpath
|
||||
XPathPtr xpath,
|
||||
std::string&& parDefNamespace
|
||||
);
|
||||
}} //namespace duck::sl
|
||||
|
||||
|
|
|
@ -47,8 +47,9 @@ namespace duck { namespace sl {
|
|||
}
|
||||
};
|
||||
|
||||
XPathRunner::XPathRunner (HtmlPoolBaseSP html_pool, XPathPtr& parXPath) :
|
||||
XPathRunner::XPathRunner (HtmlPoolBaseSP html_pool, XPathPtr& parXPath, std::string&& parDefNamespace) :
|
||||
m_cached_results(),
|
||||
m_def_namespace(std::move(parDefNamespace)),
|
||||
m_pool(html_pool),
|
||||
m_xpath(parXPath)
|
||||
{
|
||||
|
@ -74,7 +75,7 @@ namespace duck { namespace sl {
|
|||
#endif
|
||||
const std::string* html = m_pool->GetByID(id);
|
||||
|
||||
curr_vec = m_xpath->run_query(*html, std::string(parQuery));
|
||||
curr_vec = m_xpath->run_query(*html, std::string(parQuery), m_def_namespace);
|
||||
std::cout << "First time for this query, result cached now\n";
|
||||
}
|
||||
|
||||
|
|
|
@ -28,7 +28,7 @@
|
|||
namespace duck { namespace sl {
|
||||
class XPathRunner {
|
||||
public:
|
||||
XPathRunner (HtmlPoolBaseSP html_pool, XPathPtr& parXPath);
|
||||
XPathRunner (HtmlPoolBaseSP html_pool, XPathPtr& parXPath, std::string&& parDefNamespace);
|
||||
~XPathRunner();
|
||||
|
||||
const std::vector<std::string>& query (
|
||||
|
@ -40,6 +40,7 @@ namespace duck { namespace sl {
|
|||
struct XPathKey;
|
||||
|
||||
std::map<XPathKey, std::vector<std::string>> m_cached_results;
|
||||
std::string m_def_namespace;
|
||||
HtmlPoolBaseSP m_pool;
|
||||
XPathPtr m_xpath;
|
||||
};
|
||||
|
|
|
@ -52,17 +52,18 @@ namespace duck {
|
|||
|
||||
XPath::~XPath() = default;
|
||||
|
||||
auto XPath::run_query (const std::string& parXML, const std::vector<std::string>& parQueries) -> BatchResults {
|
||||
auto XPath::run_query (const std::string& parXML, const std::vector<std::string>& parQueries, const std::string& parDefNamespace) -> BatchResults {
|
||||
XQilla& xqilla = m_xqilla;
|
||||
XercesConfiguration xconfig;
|
||||
xercesc::MemBufInputSource input_buf(reinterpret_cast<const XMLByte*>(parXML.c_str()), parXML.size(), "n/a", false);
|
||||
BatchResults retval;
|
||||
try {
|
||||
AutoDelete<DynamicContext> context(xqilla.createContext(XQilla::XQUERY3_UPDATE, &xconfig));
|
||||
//see http://xqilla.sourceforge.net/docs/simple-api/classStaticContext.html#adc869a84712459fa49db67fe837c9b01
|
||||
context->setDefaultElementAndTypeNS(u"http://www.w3.org/1999/xhtml");
|
||||
Node::Ptr ptr = context->parseDocument(input_buf);
|
||||
context->setContextItem(ptr);
|
||||
//see http://xqilla.sourceforge.net/docs/simple-api/classStaticContext.html#adc869a84712459fa49db67fe837c9b01
|
||||
AutoDeleteArray<XMLCh> ns_wide = xercesc::XMLString::transcode(parDefNamespace.c_str());
|
||||
context->setDefaultElementAndTypeNS(ns_wide);
|
||||
|
||||
for (const auto& xpath : parQueries) {
|
||||
AutoDeleteArray<XMLCh> xpath_wide = xercesc::XMLString::transcode(xpath.c_str());
|
||||
|
@ -90,8 +91,8 @@ namespace duck {
|
|||
return retval;
|
||||
}
|
||||
|
||||
std::vector<std::string> XPath::run_query (const std::string& parXML, const std::string& parQuery) {
|
||||
auto query_res = run_query(parXML, std::vector<std::string>{parQuery});
|
||||
std::vector<std::string> XPath::run_query (const std::string& parXML, const std::string& parQuery, const std::string& parDefNamespace) {
|
||||
auto query_res = run_query(parXML, std::vector<std::string>{parQuery}, parDefNamespace);
|
||||
if (query_res.empty() or query_res.front().empty()) {
|
||||
return std::vector<std::string>();
|
||||
}
|
||||
|
|
|
@ -42,8 +42,8 @@ namespace duck {
|
|||
XPath();
|
||||
~XPath();
|
||||
|
||||
BatchResults run_query ( const std::string& parXML, const std::vector<std::string>& parQueries );
|
||||
std::vector<std::string> run_query ( const std::string& parXML, const std::string& parQuery );
|
||||
BatchResults run_query ( const std::string& parXML, const std::vector<std::string>& parQueries, const std::string& parDefNamespace );
|
||||
std::vector<std::string> run_query ( const std::string& parXML, const std::string& parQuery, const std::string& parDefNamespace );
|
||||
|
||||
private:
|
||||
XQilla m_xqilla;
|
||||
|
|
Loading…
Add table
Reference in a new issue