Working on better scraplang support; still not there, though.

This commit is contained in:
King_DuckZ 2020-02-18 10:27:52 +01:00
parent 430886085c
commit 60d6c2cb61
15 changed files with 275 additions and 106 deletions

View file

@ -28,6 +28,7 @@ add_executable(${PROJECT_NAME}
src/html_pool.cpp src/html_pool.cpp
src/htmlretrieve.cpp src/htmlretrieve.cpp
src/commandline.cpp src/commandline.cpp
src/scraplang/parse_exports.cpp
src/scraplang/parse.cpp src/scraplang/parse.cpp
src/scraplang/apply.cpp src/scraplang/apply.cpp
src/xpath.cpp src/xpath.cpp

View file

@ -31,19 +31,6 @@
namespace duck { namespace duck {
namespace { namespace {
// Removes every "<script" ... "</script>" span from html, in place.
// Matching is a plain case-sensitive substring search. If an opening
// "<script" has no closing "</script>" after it, everything from that
// opening to the end of the string is dropped.
void dropScriptTags (std::string& html) {
	const std::string opening("<script");
	const std::string closing("</script>");

	std::string::size_type from = html.find(opening);
	while (from != std::string::npos) {
		assert(from < html.size());
		const std::string::size_type closing_pos = html.find(closing, from + opening.size());
		if (closing_pos == std::string::npos) {
			// Unterminated script: cut the tail off entirely.
			html.erase(from);
		}
		else {
			html.erase(from, closing_pos + closing.size() - from);
		}
		// The text after the erased span now starts at 'from', so resume there.
		from = html.find(opening, from);
	}
}
bool isHttps (const std::string_view& parUrl) { bool isHttps (const std::string_view& parUrl) {
const char protocol[] = "https://"; const char protocol[] = "https://";
@ -56,7 +43,6 @@ namespace duck {
} //unnamed namespace } //unnamed namespace
std::string clean_html (std::string&& html) { std::string clean_html (std::string&& html) {
dropScriptTags(html);
// Initialize a Tidy document // Initialize a Tidy document
TidyDoc tidyDoc = tidyCreate(); TidyDoc tidyDoc = tidyCreate();

View file

@ -22,6 +22,7 @@
#include "scraplang.hpp" #include "scraplang.hpp"
#include "html_pool.hpp" #include "html_pool.hpp"
#include "read_all.hpp" #include "read_all.hpp"
#include "safe_stack_object.hpp"
#include <iostream> #include <iostream>
#include <string> #include <string>
#include <fstream> #include <fstream>
@ -32,8 +33,8 @@
namespace { namespace {
void dump_string ( const std::string& parPathDest, const std::string& parData ); void dump_string ( const std::string& parPathDest, const std::string& parData );
void load_from_commandline ( const boost::program_options::variables_map& parVarMap ); void load_from_commandline ( const boost::program_options::variables_map& parVarMap, duck::XPathPtr xpath );
void load_from_model ( const boost::program_options::variables_map& parVarMap ); void load_from_model ( const boost::program_options::variables_map& parVarMap, duck::XPathPtr xpath );
} //unnamed namespace } //unnamed namespace
int main (int argc, char* argv[]) { int main (int argc, char* argv[]) {
@ -51,10 +52,11 @@ int main (int argc, char* argv[]) {
} }
try { try {
curry::SafeStackObject<duck::XPath> query;
if (vm.count("model")) if (vm.count("model"))
load_from_model(vm); load_from_model(vm, query);
else else
load_from_commandline(vm); load_from_commandline(vm, query);
} }
catch (const duck::ParseError& err) { catch (const duck::ParseError& err) {
std::cerr << err.what() << std::endl; std::cerr << err.what() << std::endl;
@ -74,7 +76,7 @@ namespace {
*os << parData; *os << parData;
} }
void load_from_commandline (const boost::program_options::variables_map& parVarMap) { void load_from_commandline (const boost::program_options::variables_map& parVarMap, duck::XPathPtr xpath) {
const auto& vm = parVarMap; const auto& vm = parVarMap;
const auto url = vm["input-url"].as<std::string>(); const auto url = vm["input-url"].as<std::string>();
@ -91,25 +93,25 @@ namespace {
dump_string(vm["dump"].as<std::string>(), html); dump_string(vm["dump"].as<std::string>(), html);
} }
const std::string xpath = parVarMap["xpath"].as<std::string>(); const std::string xpath_str = parVarMap["xpath"].as<std::string>();
#if !defined(NDEBUG) #if !defined(NDEBUG)
std::cout << " -- XPath direct mode --\n"; std::cout << " -- XPath direct mode --\n";
std::cout << "URL : " << parVarMap["input-url"].as<std::string>() << "\n"; std::cout << "URL : " << parVarMap["input-url"].as<std::string>() << "\n";
std::cout << "XPath: " << xpath << std::endl; std::cout << "XPath: " << xpath_str << std::endl;
std::cout << "Agent: " << parVarMap["agent"].as<std::string>() << std::endl; std::cout << "Agent: " << parVarMap["agent"].as<std::string>() << std::endl;
#endif #endif
std::vector<std::string> queries; std::vector<std::string> queries;
queries.reserve(1); queries.reserve(1);
queries.push_back(std::move(xpath)); queries.push_back(std::move(xpath_str));
auto results = duck::xpath_query(html, queries); auto results = xpath->run_query(html, queries);
for (const auto& lst : results[0]) { for (const auto& lst : results[0]) {
std::cout << lst.first << ": " << lst.second << '\n'; std::cout << lst.first << ": " << lst.second << '\n';
} }
} }
void load_from_model (const boost::program_options::variables_map& parVarMap) { void load_from_model (const boost::program_options::variables_map& parVarMap, duck::XPathPtr xpath) {
#if !defined(NDEBUG) #if !defined(NDEBUG)
std::cout << " -- XPath model mode --\n"; std::cout << " -- XPath model mode --\n";
if (parVarMap.count("input-url")) if (parVarMap.count("input-url"))
@ -121,7 +123,7 @@ namespace {
auto ast = duck::sl::parse(script); auto ast = duck::sl::parse(script);
duck::HtmlPool html_pool(std::string(parVarMap["agent"].as<std::string>())); duck::HtmlPool html_pool(std::string(parVarMap["agent"].as<std::string>()));
duck::sl::apply(ast, duck::sl::HtmlPoolBaseSP(&html_pool)); duck::sl::apply(ast, duck::sl::HtmlPoolBaseSP(&html_pool), xpath);
//auto list = duck::get_xpath_definitions(*ast); //auto list = duck::get_xpath_definitions(*ast);
//std::vector<std::string> expressions; //std::vector<std::string> expressions;

View file

@ -396,7 +396,8 @@ namespace duck { namespace sl {
std::vector<std::string> apply ( std::vector<std::string> apply (
const ScrapNode& node, const ScrapNode& node,
HtmlPoolBaseSP html_pool HtmlPoolBaseSP html_pool,
XPathPtr xpath
) { ) {
using std::placeholders::_1; using std::placeholders::_1;
@ -410,7 +411,7 @@ namespace duck { namespace sl {
retval.reserve(apply_entries.size()); retval.reserve(apply_entries.size());
std::cout << "-------------- visiting done ----------------\n"; std::cout << "-------------- visiting done ----------------\n";
XPathRunner xpath_runner(html_pool); XPathRunner xpath_runner(html_pool, xpath);
for (auto& apply_entry : apply_entries) { for (auto& apply_entry : apply_entries) {
std::string name(apply_entry.mustache_name); std::string name(apply_entry.mustache_name);

View file

@ -21,10 +21,15 @@
#include "scrap_node.hpp" #include "scrap_node.hpp"
#include "scraplang/html_pool_base.hpp" #include "scraplang/html_pool_base.hpp"
#include "xpath_fwd.hpp"
#include <string> #include <string>
namespace duck { namespace sl { namespace duck { namespace sl {
std::vector<std::string> apply (const ScrapNode& node, HtmlPoolBaseSP html_pool); std::vector<std::string> apply (
const ScrapNode& node,
HtmlPoolBaseSP html_pool,
XPathPtr xpath
);
}} //namespace duck::sl }} //namespace duck::sl
#endif #endif

View file

@ -17,7 +17,9 @@
*/ */
#include "parse.hpp" #include "parse.hpp"
#include "element_def.hpp" #include "scraplang/parse_exports.hpp"
#include "scraplang/scrapgrammar.hpp"
#include "scraplang/element_def.hpp"
#include <boost/spirit/include/qi.hpp> #include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix_stl.hpp> #include <boost/spirit/include/phoenix_stl.hpp>
#include <boost/spirit/include/phoenix_fusion.hpp> #include <boost/spirit/include/phoenix_fusion.hpp>
@ -27,10 +29,8 @@
#if !defined(NDEBUG) #if !defined(NDEBUG)
# include <iostream> # include <iostream>
#endif #endif
#include <boost/variant/apply_visitor.hpp>
#include <stdexcept> #include <stdexcept>
namespace qi = boost::spirit::qi;
namespace sp = boost::spirit; namespace sp = boost::spirit;
BOOST_FUSION_ADAPT_STRUCT( BOOST_FUSION_ADAPT_STRUCT(
@ -67,66 +67,8 @@ BOOST_FUSION_ADAPT_STRUCT(
) )
namespace duck { namespace sl { namespace duck { namespace sl {
namespace { std::vector<ScrapNode> parse (std::string_view parData) {
template <typename I, typename Skipper> ScrapGrammar<std::string_view::const_iterator, boost::spirit::qi::ascii::blank_type> gramm;
class ScrapGrammar : public qi::grammar<I, std::vector<ScrapNode>(), Skipper> {
public:
ScrapGrammar() : ScrapGrammar::base_type(start) {
using qi::char_;
using qi::lexeme;
using qi::alpha;
using qi::alnum;
using qi::graph;
using qi::attr;
using qi::eol;
using qi::eoi;
using qi::lit;
using qi::string;
using qi::as_string;
using qi::no_skip;
start = *eol >> (from_block | apply_block | mustache_block) % +eol >> *eol >> eoi;
from_block = lit("from") >> source_info >> +eol >> assignment_list >> +eol >> "end";
source_info = ((url | string("-")) >> attr(SourceInfo::URL)) | (mustache_like_token >> attr(SourceInfo::Token));
url = -(+alpha >> string("://")) >> alpha >> *graph;
mustache_like_token = "{{" >> identifier >> "}}";
quoted_string %= lexeme['"' >> +(char_ - '"') >> '"'];
xpath_assignment %= identifier >>
-(lit("default") >> '(' >> quoted_string >> ')') >> "=" >>
as_string[lexeme[+(graph | char_(" \t"))]];
identifier %= lexeme[(alpha | char_('_')) >> *(-char_('.') >> +(alnum | char_('_')))];
apply_block = lit("apply") >> mustache_like_token >> "to" >> source_info >> +eol >>
assignment_list >> +eol >> "end";
struct_block = "struct" >> identifier >> +eol >> assignment_list >> +eol >> "end";
mustache_block %= as_string[lit("==") >> identifier] >> eol >>
as_string[no_skip[+(!lit("==end") >> char_)]] >> "==end";
assignment_list = (xpath_assignment | struct_block) % +eol;
}
private:
template <typename F>
using RuleType = qi::rule<I, F, Skipper>;
RuleType<std::vector<ScrapNode>()> start;
RuleType<FromBlock()> from_block;
RuleType<std::string()> url;
RuleType<std::string()> mustache_like_token;
RuleType<std::string()> quoted_string;
RuleType<XPathElement()> xpath_assignment;
RuleType<std::string()> identifier;
RuleType<SourceInfo()> source_info;
RuleType<ApplyBlock()> apply_block;
RuleType<StructBlock()> struct_block;
RuleType<MustacheBlock()> mustache_block;
RuleType<std::vector<StructItem>()> assignment_list;
};
} //unnamed namespace
std::vector<ScrapNode> parse (const std::string& parData) {
ScrapGrammar<std::string::const_iterator, boost::spirit::qi::ascii::blank_type> gramm;
auto it_start = parData.cbegin(); auto it_start = parData.cbegin();
std::vector<ScrapNode> retval; std::vector<ScrapNode> retval;

View file

@ -20,10 +20,10 @@
#define idBE96C2D49C4C413888A79EAEB2B9C0FA #define idBE96C2D49C4C413888A79EAEB2B9C0FA
#include "scrap_node.hpp" #include "scrap_node.hpp"
#include <string> #include <string_view>
namespace duck { namespace sl { namespace duck { namespace sl {
std::vector<ScrapNode> parse ( const std::string& parData ); std::vector<ScrapNode> parse ( std::string_view parData );
}} //namespace duck::sl }} //namespace duck::sl
#endif #endif

View file

@ -0,0 +1,51 @@
/* Copyright (C) 2017-2020 Michele Santullo
*
* This file is part of DuckScraper.
*
* DuckScraper is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* DuckScraper is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
*/
#include "scraplang/parse_exports.hpp"
#include <string>
#include <vector>
// Explicit instantiation definitions for the Boost.Spirit Qi templates used by
// the scraplang parser. Each line here mirrors an `extern template` declaration
// in scraplang/parse_exports.hpp; the two lists must be kept in sync.
//
// NOTE(review): this grammar instantiation passes only <iterator, skipper>,
// while ScrapGrammar derives from qi::grammar<I, std::vector<ScrapNode>(), Skipper>.
// It therefore looks like a different specialisation from the one ScrapGrammar
// uses -- confirm.
template class boost::spirit::qi::grammar<std::basic_string<char>::const_iterator, boost::spirit::qi::ascii::blank_type>;
// One instantiation per attribute signature used by ScrapGrammar's rules.
// NOTE(review): the first template argument of each qi::rule below is
// std::basic_string<char> itself, whereas ScrapGrammar declares its rules as
// qi::rule<I, F, Skipper> with I being an iterator type. Presumably
// `::const_iterator` was intended -- confirm.
template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::ApplyBlock(), boost::spirit::qi::ascii::blank_type>;
template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::FromBlock(), boost::spirit::qi::ascii::blank_type>;
template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::MustacheBlock(), boost::spirit::qi::ascii::blank_type>;
template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::SourceInfo(), boost::spirit::qi::ascii::blank_type>;
template class boost::spirit::qi::rule<std::basic_string<char>, std::string(), boost::spirit::qi::ascii::blank_type>;
template class boost::spirit::qi::rule<std::basic_string<char>, std::vector<duck::sl::ScrapNode>(), boost::spirit::qi::ascii::blank_type>;
template class boost::spirit::qi::rule<std::basic_string<char>, std::vector<duck::sl::StructItem>(), boost::spirit::qi::ascii::blank_type>;
template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::StructBlock(), boost::spirit::qi::ascii::blank_type>;
template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::XPathElement(), boost::spirit::qi::ascii::blank_type>;
// phrase_parse over std::string iterators, using ScrapGrammar as the parser,
// a blank skipper, and a std::vector<ScrapNode> as the output attribute.
// NOTE(review): parse() in parse.cpp appears to build its grammar with
// std::string_view::const_iterator, so this std::string-iterator instantiation
// may not be the one actually called -- verify.
template bool boost::spirit::qi::phrase_parse<
std::basic_string<char>::const_iterator,
duck::sl::ScrapGrammar<std::basic_string<char>::const_iterator, boost::spirit::qi::ascii::blank_type>,
boost::spirit::ascii::blank_type,
std::vector<duck::sl::ScrapNode>
> (
std::basic_string<char>::const_iterator&,
std::basic_string<char>::const_iterator,
duck::sl::ScrapGrammar<
std::basic_string<char>::const_iterator,
boost::spirit::qi::ascii::blank_type
> const&,
boost::spirit::ascii::blank_type const&,
std::vector<duck::sl::ScrapNode>&
);
// Trait enabling operator>> (shift_right) in the qi domain.
template struct boost::spirit::use_operator<boost::spirit::qi::domain, boost::proto::tag::shift_right>;

View file

@ -0,0 +1,54 @@
/* Copyright (C) 2017-2020 Michele Santullo
*
* This file is part of DuckScraper.
*
* DuckScraper is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* DuckScraper is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include "scraplang/scrap_node.hpp"
#include "scraplang/scrapgrammar.hpp"
#include <string>
#include <vector>
// Explicit instantiation declarations: they tell every translation unit that
// includes this header not to implicitly instantiate the templates listed
// below. The matching definitions are expected in scraplang/parse_exports.cpp,
// and the two lists must be kept in sync.
//
// NOTE(review): this grammar declaration passes only <iterator, skipper>,
// while ScrapGrammar derives from qi::grammar<I, std::vector<ScrapNode>(), Skipper>.
// It therefore looks like a different specialisation from the one ScrapGrammar
// uses -- confirm.
extern template class boost::spirit::qi::grammar<std::basic_string<char>::const_iterator, boost::spirit::qi::ascii::blank_type>;
// One declaration per attribute signature used by ScrapGrammar's rules.
// NOTE(review): the first template argument of each qi::rule below is
// std::basic_string<char> itself, whereas ScrapGrammar declares its rules as
// qi::rule<I, F, Skipper> with I being an iterator type. Presumably
// `::const_iterator` was intended -- confirm.
extern template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::ApplyBlock(), boost::spirit::qi::ascii::blank_type>;
extern template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::FromBlock(), boost::spirit::qi::ascii::blank_type>;
extern template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::MustacheBlock(), boost::spirit::qi::ascii::blank_type>;
extern template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::SourceInfo(), boost::spirit::qi::ascii::blank_type>;
extern template class boost::spirit::qi::rule<std::basic_string<char>, std::string(), boost::spirit::qi::ascii::blank_type>;
extern template class boost::spirit::qi::rule<std::basic_string<char>, std::vector<duck::sl::ScrapNode>(), boost::spirit::qi::ascii::blank_type>;
extern template class boost::spirit::qi::rule<std::basic_string<char>, std::vector<duck::sl::StructItem>(), boost::spirit::qi::ascii::blank_type>;
extern template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::StructBlock(), boost::spirit::qi::ascii::blank_type>;
extern template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::XPathElement(), boost::spirit::qi::ascii::blank_type>;
// phrase_parse over std::string iterators, using ScrapGrammar as the parser,
// a blank skipper, and a std::vector<ScrapNode> as the output attribute.
// NOTE(review): parse() in parse.cpp appears to build its grammar with
// std::string_view::const_iterator, so this std::string-iterator declaration
// may not cover the call actually made -- verify.
extern template bool boost::spirit::qi::phrase_parse<
std::basic_string<char>::const_iterator,
duck::sl::ScrapGrammar<std::basic_string<char>::const_iterator, boost::spirit::qi::ascii::blank_type>,
boost::spirit::ascii::blank_type,
std::vector<duck::sl::ScrapNode>
> (
std::basic_string<char>::const_iterator&,
std::basic_string<char>::const_iterator,
duck::sl::ScrapGrammar<
std::basic_string<char>::const_iterator,
boost::spirit::qi::ascii::blank_type
> const&,
boost::spirit::ascii::blank_type const&,
std::vector<duck::sl::ScrapNode>&
);
// Trait enabling operator>> (shift_right) in the qi domain.
extern template struct boost::spirit::use_operator<boost::spirit::qi::domain, boost::proto::tag::shift_right>;

View file

@ -0,0 +1,81 @@
/* Copyright (C) 2017-2020 Michele Santullo
*
* This file is part of DuckScraper.
*
* DuckScraper is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* DuckScraper is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include <boost/spirit/include/qi.hpp>
// NOTE(review): this header includes only <boost/spirit/include/qi.hpp>.
// ScrapNode, FromBlock, SourceInfo, XPathElement, ApplyBlock, StructBlock,
// MustacheBlock, StructItem, std::vector and std::string must therefore
// already be declared by whoever includes it -- confirm the include order at
// every use site.
namespace duck::sl {
// Short alias for the Qi namespace; visible to everything in duck::sl that
// includes this header.
namespace qi = ::boost::spirit::qi;
// Boost.Spirit Qi grammar for scraplang scripts.
//
// Template parameters:
//   I        iterator type over the input text
//   Skipper  skip parser applied between tokens. The instantiations visible
//            alongside this header use ascii::blank_type; the rules match
//            line breaks explicitly through `eol`.
//
// A successful parse yields a std::vector<ScrapNode>.
template <typename I, typename Skipper>
class ScrapGrammar : public qi::grammar<I, std::vector<ScrapNode>(), Skipper> {
public:
// Wires up all rules; `start` is the entry rule handed to the base grammar.
ScrapGrammar() : ScrapGrammar::base_type(start) {
using qi::char_;
using qi::lexeme;
using qi::alpha;
using qi::alnum;
using qi::graph;
using qi::attr;
using qi::eol;
using qi::eoi;
using qi::lit;
using qi::string;
using qi::as_string;
using qi::no_skip;
// A script: optional leading line breaks, then from/apply/mustache blocks
// separated by one or more line breaks, optional trailing line breaks, and
// finally end of input (so trailing unparsed text makes the parse fail).
start = *eol >> (from_block | apply_block | mustache_block) % +eol >> *eol >> eoi;
// from <source> NEWLINE+ <assignments> NEWLINE+ end
from_block = lit("from") >> source_info >> +eol >> assignment_list >> +eol >> "end";
// A source is either a URL or a bare "-" (both tagged SourceInfo::URL), or a
// {{token}} (tagged SourceInfo::Token).
source_info = ((url | string("-")) >> attr(SourceInfo::URL)) | (mustache_like_token >> attr(SourceInfo::Token));
// Optional "<letters>://" scheme, then a letter, then any run of printable
// non-space characters.
url = -(+alpha >> string("://")) >> alpha >> *graph;
// {{identifier}} -- only the identifier is kept as the attribute.
mustache_like_token = "{{" >> identifier >> "}}";
// Double-quoted text with at least one character; there is no escape
// handling, so the string ends at the first '"'.
quoted_string %= lexeme['"' >> +(char_ - '"') >> '"'];
// identifier [default("...")] = <expression>
// The expression is read as raw text made of printable characters, spaces
// and tabs, i.e. up to the end of the line.
xpath_assignment %= identifier >>
-(lit("default") >> '(' >> quoted_string >> ')') >> "=" >>
as_string[lexeme[+(graph | char_(" \t"))]];
// Starts with a letter or '_', followed by groups of letters/digits/'_' that
// may each be preceded by a single '.'.
identifier %= lexeme[(alpha | char_('_')) >> *(-char_('.') >> +(alnum | char_('_')))];
// apply {{token}} to <source> NEWLINE+ <assignments> NEWLINE+ end
apply_block = lit("apply") >> mustache_like_token >> "to" >> source_info >> +eol >>
assignment_list >> +eol >> "end";
// struct <identifier> NEWLINE+ <assignments> NEWLINE+ end
// Reachable only through assignment_list, so struct blocks nest inside
// from/apply/struct blocks and never appear at the top level.
struct_block = "struct" >> identifier >> +eol >> assignment_list >> +eol >> "end";
// ==<identifier> NEWLINE <raw text> ==end
// The body is captured with skipping disabled, one character at a time, until
// the literal "==end" is next in the input.
mustache_block %= as_string[lit("==") >> identifier] >> eol >>
as_string[no_skip[+(!lit("==end") >> char_)]] >> "==end";
// One or more xpath assignments or nested struct blocks, separated by line
// breaks.
assignment_list = (xpath_assignment | struct_block) % +eol;
}
private:
// Rule with this grammar's iterator and skipper; F is the attribute signature.
template <typename F>
using RuleType = qi::rule<I, F, Skipper>;
RuleType<std::vector<ScrapNode>()> start;
RuleType<FromBlock()> from_block;
RuleType<std::string()> url;
RuleType<std::string()> mustache_like_token;
RuleType<std::string()> quoted_string;
RuleType<XPathElement()> xpath_assignment;
RuleType<std::string()> identifier;
RuleType<SourceInfo()> source_info;
RuleType<ApplyBlock()> apply_block;
RuleType<StructBlock()> struct_block;
RuleType<MustacheBlock()> mustache_block;
RuleType<std::vector<StructItem>()> assignment_list;
};
} //namespace duck::sl

View file

@ -47,9 +47,10 @@ namespace duck { namespace sl {
} }
}; };
XPathRunner::XPathRunner (HtmlPoolBaseSP html_pool) : XPathRunner::XPathRunner (HtmlPoolBaseSP html_pool, XPathPtr& parXPath) :
m_cached_results(), m_cached_results(),
m_pool(html_pool) m_pool(html_pool),
m_xpath(parXPath)
{ {
} }
@ -73,7 +74,7 @@ namespace duck { namespace sl {
#endif #endif
const std::string* html = m_pool->GetByID(id); const std::string* html = m_pool->GetByID(id);
curr_vec = xpath_query(*html, std::string(parQuery)); curr_vec = m_xpath->run_query(*html, std::string(parQuery));
std::cout << "First time for this query, result cached now\n"; std::cout << "First time for this query, result cached now\n";
} }

View file

@ -20,6 +20,7 @@
#define id46DB8F4F85E2417E9AF0B1A410240D4F #define id46DB8F4F85E2417E9AF0B1A410240D4F
#include "html_pool_base.hpp" #include "html_pool_base.hpp"
#include "xpath_fwd.hpp"
#include <map> #include <map>
#include <string_view> #include <string_view>
#include <string> #include <string>
@ -27,7 +28,7 @@
namespace duck { namespace sl { namespace duck { namespace sl {
class XPathRunner { class XPathRunner {
public: public:
explicit XPathRunner (HtmlPoolBaseSP html_pool); XPathRunner (HtmlPoolBaseSP html_pool, XPathPtr& parXPath);
~XPathRunner(); ~XPathRunner();
const std::vector<std::string>& query ( const std::vector<std::string>& query (
@ -40,6 +41,7 @@ namespace duck { namespace sl {
std::map<XPathKey, std::vector<std::string>> m_cached_results; std::map<XPathKey, std::vector<std::string>> m_cached_results;
HtmlPoolBaseSP m_pool; HtmlPoolBaseSP m_pool;
XPathPtr m_xpath;
}; };
}} //namespace duck::sl }} //namespace duck::sl

View file

@ -47,7 +47,11 @@ namespace duck {
} }
} //unnamed namespace } //unnamed namespace
XPathBatchResults xpath_query (const std::string& parXML, const std::vector<std::string>& parQueries) { XPath::XPath() = default;
XPath::~XPath() = default;
auto XPath::run_query (const std::string& parXML, const std::vector<std::string>& parQueries) -> BatchResults {
XQilla& xqilla = m_xqilla; XQilla& xqilla = m_xqilla;
XercesConfiguration xconfig; XercesConfiguration xconfig;
AutoDelete<DynamicContext> context(xqilla.createContext(XQilla::XQUERY_UPDATE, &xconfig)); AutoDelete<DynamicContext> context(xqilla.createContext(XQilla::XQUERY_UPDATE, &xconfig));
@ -61,7 +65,7 @@ namespace duck {
} }
context->setContextItem(ptr); context->setContextItem(ptr);
XPathBatchResults retval; BatchResults retval;
for (const auto& xpath : parQueries) { for (const auto& xpath : parQueries) {
AutoDelete<XQQuery> query(xqilla.parse(X(xpath.c_str()))); AutoDelete<XQQuery> query(xqilla.parse(X(xpath.c_str())));
context->setContextPosition(1); context->setContextPosition(1);
@ -75,11 +79,11 @@ namespace duck {
} }
retval.push_back(std::move(new_lst)); retval.push_back(std::move(new_lst));
} }
return std::move(retval); return retval;
} }
std::vector<std::string> xpath_query (const std::string& parXML, const std::string& parQuery) { std::vector<std::string> XPath::run_query (const std::string& parXML, const std::string& parQuery) {
auto query_res = xpath_query(parXML, std::vector<std::string>{parQuery}); auto query_res = run_query(parXML, std::vector<std::string>{parQuery});
if (query_res.empty() or query_res.front().empty()) { if (query_res.empty() or query_res.front().empty()) {
return std::vector<std::string>(); return std::vector<std::string>();
} }

View file

@ -19,6 +19,7 @@
#ifndef id21E0A6F345D24C5D83D3B1F74EC810F7 #ifndef id21E0A6F345D24C5D83D3B1F74EC810F7
#define id21E0A6F345D24C5D83D3B1F74EC810F7 #define id21E0A6F345D24C5D83D3B1F74EC810F7
#include "xpath_fwd.hpp"
#include <string> #include <string>
#include <vector> #include <vector>
#include <exception> #include <exception>
@ -26,8 +27,6 @@
#include <xqilla/xqilla-simple.hpp> #include <xqilla/xqilla-simple.hpp>
namespace duck { namespace duck {
typedef std::vector<std::vector<std::pair<std::string, std::string>>> XPathBatchResults;
class ParseError : public std::exception { class ParseError : public std::exception {
public: public:
ParseError ( int parLine, int parColumn, std::string parMessage ); ParseError ( int parLine, int parColumn, std::string parMessage );
@ -36,8 +35,19 @@ namespace duck {
std::vector<char> m_msg; std::vector<char> m_msg;
}; };
XPathBatchResults xpath_query ( const std::string& parXML, const std::vector<std::string>& parQueries ); class XPath : public Kakoune::SafeCountable {
std::vector<std::string> xpath_query ( const std::string& parXML, const std::string& parQuery ); public:
typedef std::vector<std::vector<std::pair<std::string, std::string>>> BatchResults;
XPath();
~XPath();
BatchResults run_query ( const std::string& parXML, const std::vector<std::string>& parQueries );
std::vector<std::string> run_query ( const std::string& parXML, const std::string& parQuery );
private:
XQilla m_xqilla;
};
} //namespace duck } //namespace duck
#endif #endif

29
src/xpath_fwd.hpp Normal file
View file

@ -0,0 +1,29 @@
/* Copyright (C) 2015-2020 Michele Santullo
*
* This file is part of DuckScraper.
*
* DuckScraper is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* DuckScraper is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef id08062CD6C4904D94BFF57990C44B6CCB
#define id08062CD6C4904D94BFF57990C44B6CCB
#include "kakoune/safe_ptr.hh"
namespace duck {
class XPath;
using XPathPtr = Kakoune::SafePtr<XPath>;
} //namespace duck
#endif