Work in progress on better scraplang support; not finished yet.
This commit is contained in:
parent
430886085c
commit
60d6c2cb61
15 changed files with 275 additions and 106 deletions
|
@ -28,6 +28,7 @@ add_executable(${PROJECT_NAME}
|
|||
src/html_pool.cpp
|
||||
src/htmlretrieve.cpp
|
||||
src/commandline.cpp
|
||||
src/scraplang/parse_exports.cpp
|
||||
src/scraplang/parse.cpp
|
||||
src/scraplang/apply.cpp
|
||||
src/xpath.cpp
|
||||
|
|
|
@ -31,19 +31,6 @@
|
|||
|
||||
namespace duck {
|
||||
namespace {
|
||||
// Removes every "<script ...> ... </script>" element from html, in place.
//
// Tag names are matched ASCII case-insensitively, so "<SCRIPT>" and
// "</Script>" are recognised as well as the lowercase spelling.
// If an opening tag has no matching closing tag, everything from that
// opening tag up to the end of the string is dropped.
void dropScriptTags (std::string& html) {
	typedef std::string::size_type size_type;

	const std::string open_tag("<script");
	const std::string close_tag("</script>");

	// Hand-rolled ASCII lowering: keeps the comparison locale-independent
	// and safe for bytes outside the 7-bit range.
	const auto equal_nocase = [](char parA, char parB) {
		const auto lower = [](char c) {
			return (c >= 'A' && c <= 'Z') ? static_cast<char>(c - 'A' + 'a') : c;
		};
		return lower(parA) == lower(parB);
	};

	// Case-insensitive equivalent of html.find(needle, from).
	const auto find_nocase = [&html, &equal_nocase](const std::string& needle, size_type from) -> size_type {
		if (from >= html.size())
			return std::string::npos;
		const auto start = html.begin() + static_cast<std::string::difference_type>(from);
		const auto found = std::search(start, html.end(), needle.begin(), needle.end(), equal_nocase);
		if (found == html.end())
			return std::string::npos;
		return static_cast<size_type>(found - html.begin());
	};

	size_type open_index = 0;
	while (std::string::npos != (open_index = find_nocase(open_tag, open_index))) {
		const size_type close_index = find_nocase(close_tag, open_index + open_tag.size());
		if (close_index == std::string::npos) {
			// Unterminated script: nothing after it can be trusted, drop the tail
			html.erase(open_index);
			break;
		}
		// Erasing shifts the remaining text to open_index, so the next search
		// resumes from the same position.
		html.erase(open_index, close_index + close_tag.size() - open_index);
	}
}
|
||||
|
||||
bool isHttps (const std::string_view& parUrl) {
|
||||
const char protocol[] = "https://";
|
||||
|
@ -56,7 +43,6 @@ namespace duck {
|
|||
} //unnamed namespace
|
||||
|
||||
std::string clean_html (std::string&& html) {
|
||||
dropScriptTags(html);
|
||||
|
||||
// Initialize a Tidy document
|
||||
TidyDoc tidyDoc = tidyCreate();
|
||||
|
|
24
src/main.cpp
24
src/main.cpp
|
@ -22,6 +22,7 @@
|
|||
#include "scraplang.hpp"
|
||||
#include "html_pool.hpp"
|
||||
#include "read_all.hpp"
|
||||
#include "safe_stack_object.hpp"
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <fstream>
|
||||
|
@ -32,8 +33,8 @@
|
|||
|
||||
namespace {
|
||||
void dump_string ( const std::string& parPathDest, const std::string& parData );
|
||||
void load_from_commandline ( const boost::program_options::variables_map& parVarMap );
|
||||
void load_from_model ( const boost::program_options::variables_map& parVarMap );
|
||||
void load_from_commandline ( const boost::program_options::variables_map& parVarMap, duck::XPathPtr xpath );
|
||||
void load_from_model ( const boost::program_options::variables_map& parVarMap, duck::XPathPtr xpath );
|
||||
} //unnamed namespace
|
||||
|
||||
int main (int argc, char* argv[]) {
|
||||
|
@ -51,10 +52,11 @@ int main (int argc, char* argv[]) {
|
|||
}
|
||||
|
||||
try {
|
||||
curry::SafeStackObject<duck::XPath> query;
|
||||
if (vm.count("model"))
|
||||
load_from_model(vm);
|
||||
load_from_model(vm, query);
|
||||
else
|
||||
load_from_commandline(vm);
|
||||
load_from_commandline(vm, query);
|
||||
}
|
||||
catch (const duck::ParseError& err) {
|
||||
std::cerr << err.what() << std::endl;
|
||||
|
@ -74,7 +76,7 @@ namespace {
|
|||
*os << parData;
|
||||
}
|
||||
|
||||
void load_from_commandline (const boost::program_options::variables_map& parVarMap) {
|
||||
void load_from_commandline (const boost::program_options::variables_map& parVarMap, duck::XPathPtr xpath) {
|
||||
const auto& vm = parVarMap;
|
||||
const auto url = vm["input-url"].as<std::string>();
|
||||
|
||||
|
@ -91,25 +93,25 @@ namespace {
|
|||
dump_string(vm["dump"].as<std::string>(), html);
|
||||
}
|
||||
|
||||
const std::string xpath = parVarMap["xpath"].as<std::string>();
|
||||
const std::string xpath_str = parVarMap["xpath"].as<std::string>();
|
||||
|
||||
#if !defined(NDEBUG)
|
||||
std::cout << " -- XPath direct mode --\n";
|
||||
std::cout << "URL : " << parVarMap["input-url"].as<std::string>() << "\n";
|
||||
std::cout << "XPath: " << xpath << std::endl;
|
||||
std::cout << "XPath: " << xpath_str << std::endl;
|
||||
std::cout << "Agent: " << parVarMap["agent"].as<std::string>() << std::endl;
|
||||
#endif
|
||||
|
||||
std::vector<std::string> queries;
|
||||
queries.reserve(1);
|
||||
queries.push_back(std::move(xpath));
|
||||
auto results = duck::xpath_query(html, queries);
|
||||
queries.push_back(std::move(xpath_str));
|
||||
auto results = xpath->run_query(html, queries);
|
||||
for (const auto& lst : results[0]) {
|
||||
std::cout << lst.first << ": " << lst.second << '\n';
|
||||
}
|
||||
}
|
||||
|
||||
void load_from_model (const boost::program_options::variables_map& parVarMap) {
|
||||
void load_from_model (const boost::program_options::variables_map& parVarMap, duck::XPathPtr xpath) {
|
||||
#if !defined(NDEBUG)
|
||||
std::cout << " -- XPath model mode --\n";
|
||||
if (parVarMap.count("input-url"))
|
||||
|
@ -121,7 +123,7 @@ namespace {
|
|||
auto ast = duck::sl::parse(script);
|
||||
|
||||
duck::HtmlPool html_pool(std::string(parVarMap["agent"].as<std::string>()));
|
||||
duck::sl::apply(ast, duck::sl::HtmlPoolBaseSP(&html_pool));
|
||||
duck::sl::apply(ast, duck::sl::HtmlPoolBaseSP(&html_pool), xpath);
|
||||
//auto list = duck::get_xpath_definitions(*ast);
|
||||
|
||||
//std::vector<std::string> expressions;
|
||||
|
|
|
@ -396,7 +396,8 @@ namespace duck { namespace sl {
|
|||
|
||||
std::vector<std::string> apply (
|
||||
const ScrapNode& node,
|
||||
HtmlPoolBaseSP html_pool
|
||||
HtmlPoolBaseSP html_pool,
|
||||
XPathPtr xpath
|
||||
) {
|
||||
using std::placeholders::_1;
|
||||
|
||||
|
@ -410,7 +411,7 @@ namespace duck { namespace sl {
|
|||
retval.reserve(apply_entries.size());
|
||||
|
||||
std::cout << "-------------- visiting done ----------------\n";
|
||||
XPathRunner xpath_runner(html_pool);
|
||||
XPathRunner xpath_runner(html_pool, xpath);
|
||||
|
||||
for (auto& apply_entry : apply_entries) {
|
||||
std::string name(apply_entry.mustache_name);
|
||||
|
|
|
@ -21,10 +21,15 @@
|
|||
|
||||
#include "scrap_node.hpp"
|
||||
#include "scraplang/html_pool_base.hpp"
|
||||
#include "xpath_fwd.hpp"
|
||||
#include <string>
|
||||
|
||||
namespace duck { namespace sl {
|
||||
std::vector<std::string> apply (const ScrapNode& node, HtmlPoolBaseSP html_pool);
|
||||
std::vector<std::string> apply (
|
||||
const ScrapNode& node,
|
||||
HtmlPoolBaseSP html_pool,
|
||||
XPathPtr xpath
|
||||
);
|
||||
}} //namespace duck::sl
|
||||
|
||||
#endif
|
||||
|
|
|
@ -17,7 +17,9 @@
|
|||
*/
|
||||
|
||||
#include "parse.hpp"
|
||||
#include "element_def.hpp"
|
||||
#include "scraplang/parse_exports.hpp"
|
||||
#include "scraplang/scrapgrammar.hpp"
|
||||
#include "scraplang/element_def.hpp"
|
||||
#include <boost/spirit/include/qi.hpp>
|
||||
#include <boost/spirit/include/phoenix_stl.hpp>
|
||||
#include <boost/spirit/include/phoenix_fusion.hpp>
|
||||
|
@ -27,10 +29,8 @@
|
|||
#if !defined(NDEBUG)
|
||||
# include <iostream>
|
||||
#endif
|
||||
#include <boost/variant/apply_visitor.hpp>
|
||||
#include <stdexcept>
|
||||
|
||||
namespace qi = boost::spirit::qi;
|
||||
namespace sp = boost::spirit;
|
||||
|
||||
BOOST_FUSION_ADAPT_STRUCT(
|
||||
|
@ -67,66 +67,8 @@ BOOST_FUSION_ADAPT_STRUCT(
|
|||
)
|
||||
|
||||
namespace duck { namespace sl {
|
||||
namespace {
|
||||
// Boost.Spirit Qi grammar for scraplang scripts.
// I is the iterator type of the input, Skipper is the skip parser type.
// The start rule synthesises a std::vector<ScrapNode>.
template <typename I, typename Skipper>
|
||||
class ScrapGrammar : public qi::grammar<I, std::vector<ScrapNode>(), Skipper> {
|
||||
public:
|
||||
ScrapGrammar() : ScrapGrammar::base_type(start) {
|
||||
using qi::char_;
|
||||
using qi::lexeme;
|
||||
using qi::alpha;
|
||||
using qi::alnum;
|
||||
using qi::graph;
|
||||
using qi::attr;
|
||||
using qi::eol;
|
||||
using qi::eoi;
|
||||
using qi::lit;
|
||||
using qi::string;
|
||||
using qi::as_string;
|
||||
using qi::no_skip;
|
||||
|
||||
// Whole script: optional leading newlines, then from/apply/mustache
// blocks separated by one or more newlines, optional trailing newlines,
// and finally end of input.
start = *eol >> (from_block | apply_block | mustache_block) % +eol >> *eol >> eoi;
|
||||
// "from" <source> NEWLINE+ <assignments> NEWLINE+ "end"
from_block = lit("from") >> source_info >> +eol >> assignment_list >> +eol >> "end";
|
||||
// Either a url or a literal "-" (both tagged SourceInfo::URL), or a
// {{token}} (tagged SourceInfo::Token).
source_info = ((url | string("-")) >> attr(SourceInfo::URL)) | (mustache_like_token >> attr(SourceInfo::Token));
|
||||
// Optional "<letters>://" scheme, then a letter followed by any
// printable non-space characters.
url = -(+alpha >> string("://")) >> alpha >> *graph;
|
||||
// An identifier wrapped in double curly braces.
mustache_like_token = "{{" >> identifier >> "}}";
|
||||
// Non-empty text between double quotes; there is no escape sequence
// for embedding a '"'.
quoted_string %= lexeme['"' >> +(char_ - '"') >> '"'];
|
||||
// <identifier> [ default("<text>") ] = <rest of the line>
xpath_assignment %= identifier >>
|
||||
-(lit("default") >> '(' >> quoted_string >> ')') >> "=" >>
|
||||
as_string[lexeme[+(graph | char_(" \t"))]];
|
||||
// Starts with a letter or '_', followed by groups of letters, digits
// or '_', each group optionally preceded by a single '.'.
identifier %= lexeme[(alpha | char_('_')) >> *(-char_('.') >> +(alnum | char_('_')))];
|
||||
|
||||
// "apply" {{name}} "to" <source> NEWLINE+ <assignments> NEWLINE+ "end"
apply_block = lit("apply") >> mustache_like_token >> "to" >> source_info >> +eol >>
|
||||
assignment_list >> +eol >> "end";
|
||||
// "struct" <identifier> NEWLINE+ <assignments> NEWLINE+ "end"
struct_block = "struct" >> identifier >> +eol >> assignment_list >> +eol >> "end";
|
||||
|
||||
// "==" <identifier> NEWLINE, then raw text taken verbatim (skipper
// disabled) up to the first "==end".
mustache_block %= as_string[lit("==") >> identifier] >> eol >>
|
||||
as_string[no_skip[+(!lit("==end") >> char_)]] >> "==end";
|
||||
|
||||
// XPath assignments and nested struct blocks, separated by newlines.
assignment_list = (xpath_assignment | struct_block) % +eol;
|
||||
}
|
||||
|
||||
private:
|
||||
// Shorthand for a rule over this grammar's iterator and skipper;
// F is the rule signature, e.g. std::string().
template <typename F>
|
||||
using RuleType = qi::rule<I, F, Skipper>;
|
||||
|
||||
RuleType<std::vector<ScrapNode>()> start;
|
||||
RuleType<FromBlock()> from_block;
|
||||
RuleType<std::string()> url;
|
||||
RuleType<std::string()> mustache_like_token;
|
||||
RuleType<std::string()> quoted_string;
|
||||
RuleType<XPathElement()> xpath_assignment;
|
||||
RuleType<std::string()> identifier;
|
||||
RuleType<SourceInfo()> source_info;
|
||||
RuleType<ApplyBlock()> apply_block;
|
||||
RuleType<StructBlock()> struct_block;
|
||||
RuleType<MustacheBlock()> mustache_block;
|
||||
RuleType<std::vector<StructItem>()> assignment_list;
|
||||
};
|
||||
} //unnamed namespace
|
||||
|
||||
std::vector<ScrapNode> parse (const std::string& parData) {
|
||||
ScrapGrammar<std::string::const_iterator, boost::spirit::qi::ascii::blank_type> gramm;
|
||||
std::vector<ScrapNode> parse (std::string_view parData) {
|
||||
ScrapGrammar<std::string_view::const_iterator, boost::spirit::qi::ascii::blank_type> gramm;
|
||||
auto it_start = parData.cbegin();
|
||||
|
||||
std::vector<ScrapNode> retval;
|
||||
|
|
|
@ -20,10 +20,10 @@
|
|||
#define idBE96C2D49C4C413888A79EAEB2B9C0FA
|
||||
|
||||
#include "scrap_node.hpp"
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
|
||||
namespace duck { namespace sl {
|
||||
std::vector<ScrapNode> parse ( const std::string& parData );
|
||||
std::vector<ScrapNode> parse ( std::string_view parData );
|
||||
}} //namespace duck::sl
|
||||
|
||||
#endif
|
||||
|
|
51
src/scraplang/parse_exports.cpp
Normal file
51
src/scraplang/parse_exports.cpp
Normal file
|
@ -0,0 +1,51 @@
|
|||
/* Copyright (C) 2017-2020 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "scraplang/parse_exports.hpp"
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// Explicit instantiation definitions for the Boost.Spirit templates used by
// the scraplang parser. The matching "extern template" declarations live in
// scraplang/parse_exports.hpp.
//
// NOTE(review): this grammar instantiation has no signature argument, while
// ScrapGrammar derives from qi::grammar<I, std::vector<ScrapNode>(), Skipper>.
// It therefore looks like a different specialisation from the one the
// grammar actually uses - confirm.
template class boost::spirit::qi::grammar<std::basic_string<char>::const_iterator, boost::spirit::qi::ascii::blank_type>;
|
||||
|
||||
// NOTE(review): the first template argument of qi::rule is the iterator type
// (ScrapGrammar declares its rules as qi::rule<I, F, Skipper>). The lines
// below pass std::basic_string<char> itself rather than
// std::basic_string<char>::const_iterator, so they presumably do not match
// the rules ScrapGrammar instantiates - confirm.
template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::ApplyBlock(), boost::spirit::qi::ascii::blank_type>;
|
||||
template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::FromBlock(), boost::spirit::qi::ascii::blank_type>;
|
||||
template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::MustacheBlock(), boost::spirit::qi::ascii::blank_type>;
|
||||
template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::SourceInfo(), boost::spirit::qi::ascii::blank_type>;
|
||||
template class boost::spirit::qi::rule<std::basic_string<char>, std::string(), boost::spirit::qi::ascii::blank_type>;
|
||||
template class boost::spirit::qi::rule<std::basic_string<char>, std::vector<duck::sl::ScrapNode>(), boost::spirit::qi::ascii::blank_type>;
|
||||
template class boost::spirit::qi::rule<std::basic_string<char>, std::vector<duck::sl::StructItem>(), boost::spirit::qi::ascii::blank_type>;
|
||||
template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::StructBlock(), boost::spirit::qi::ascii::blank_type>;
|
||||
template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::XPathElement(), boost::spirit::qi::ascii::blank_type>;
|
||||
|
||||
// phrase_parse over std::string iterators with the blank skipper, producing
// a std::vector<ScrapNode>.
// NOTE(review): duck::sl::parse() builds its grammar with
// std::string_view::const_iterator, so this instantiation may not be the one
// that parse() ends up calling - confirm.
template bool boost::spirit::qi::phrase_parse<
|
||||
std::basic_string<char>::const_iterator,
|
||||
duck::sl::ScrapGrammar<std::basic_string<char>::const_iterator, boost::spirit::qi::ascii::blank_type>,
|
||||
boost::spirit::ascii::blank_type,
|
||||
std::vector<duck::sl::ScrapNode>
|
||||
> (
|
||||
std::basic_string<char>::const_iterator&,
|
||||
std::basic_string<char>::const_iterator,
|
||||
duck::sl::ScrapGrammar<
|
||||
std::basic_string<char>::const_iterator,
|
||||
boost::spirit::qi::ascii::blank_type
|
||||
> const&,
|
||||
boost::spirit::ascii::blank_type const&,
|
||||
std::vector<duck::sl::ScrapNode>&
|
||||
);
|
||||
|
||||
// Spirit's trait enabling operator>> (sequence) for the qi domain.
template struct boost::spirit::use_operator<boost::spirit::qi::domain, boost::proto::tag::shift_right>;
|
54
src/scraplang/parse_exports.hpp
Normal file
54
src/scraplang/parse_exports.hpp
Normal file
|
@ -0,0 +1,54 @@
|
|||
/* Copyright (C) 2017-2020 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "scraplang/scrap_node.hpp"
|
||||
#include "scraplang/scrapgrammar.hpp"
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// Explicit instantiation declarations: they tell including translation units
// not to instantiate these Boost.Spirit templates implicitly. The matching
// definitions are in scraplang/parse_exports.cpp.
//
// NOTE(review): this grammar declaration has no signature argument, while
// ScrapGrammar derives from qi::grammar<I, std::vector<ScrapNode>(), Skipper>.
// It therefore looks like a different specialisation from the one the
// grammar actually uses - confirm.
extern template class boost::spirit::qi::grammar<std::basic_string<char>::const_iterator, boost::spirit::qi::ascii::blank_type>;
|
||||
|
||||
// NOTE(review): the first template argument of qi::rule is the iterator type
// (ScrapGrammar declares its rules as qi::rule<I, F, Skipper>). The lines
// below pass std::basic_string<char> itself rather than
// std::basic_string<char>::const_iterator, so they presumably do not match
// the rules ScrapGrammar instantiates - confirm.
extern template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::ApplyBlock(), boost::spirit::qi::ascii::blank_type>;
|
||||
extern template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::FromBlock(), boost::spirit::qi::ascii::blank_type>;
|
||||
extern template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::MustacheBlock(), boost::spirit::qi::ascii::blank_type>;
|
||||
extern template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::SourceInfo(), boost::spirit::qi::ascii::blank_type>;
|
||||
extern template class boost::spirit::qi::rule<std::basic_string<char>, std::string(), boost::spirit::qi::ascii::blank_type>;
|
||||
extern template class boost::spirit::qi::rule<std::basic_string<char>, std::vector<duck::sl::ScrapNode>(), boost::spirit::qi::ascii::blank_type>;
|
||||
extern template class boost::spirit::qi::rule<std::basic_string<char>, std::vector<duck::sl::StructItem>(), boost::spirit::qi::ascii::blank_type>;
|
||||
extern template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::StructBlock(), boost::spirit::qi::ascii::blank_type>;
|
||||
extern template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::XPathElement(), boost::spirit::qi::ascii::blank_type>;
|
||||
|
||||
// phrase_parse over std::string iterators with the blank skipper, producing
// a std::vector<ScrapNode>.
// NOTE(review): duck::sl::parse() builds its grammar with
// std::string_view::const_iterator, so this declaration may not cover the
// call parse() actually makes - confirm.
extern template bool boost::spirit::qi::phrase_parse<
|
||||
std::basic_string<char>::const_iterator,
|
||||
duck::sl::ScrapGrammar<std::basic_string<char>::const_iterator, boost::spirit::qi::ascii::blank_type>,
|
||||
boost::spirit::ascii::blank_type,
|
||||
std::vector<duck::sl::ScrapNode>
|
||||
> (
|
||||
std::basic_string<char>::const_iterator&,
|
||||
std::basic_string<char>::const_iterator,
|
||||
duck::sl::ScrapGrammar<
|
||||
std::basic_string<char>::const_iterator,
|
||||
boost::spirit::qi::ascii::blank_type
|
||||
> const&,
|
||||
boost::spirit::ascii::blank_type const&,
|
||||
std::vector<duck::sl::ScrapNode>&
|
||||
);
|
||||
|
||||
// Spirit's trait enabling operator>> (sequence) for the qi domain.
extern template struct boost::spirit::use_operator<boost::spirit::qi::domain, boost::proto::tag::shift_right>;
|
81
src/scraplang/scrapgrammar.hpp
Normal file
81
src/scraplang/scrapgrammar.hpp
Normal file
|
@ -0,0 +1,81 @@
|
|||
/* Copyright (C) 2017-2020 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <boost/spirit/include/qi.hpp>
|
||||
|
||||
namespace duck::sl {
|
||||
namespace qi = ::boost::spirit::qi;
|
||||
|
||||
// Boost.Spirit Qi grammar for scraplang scripts.
// I is the iterator type of the input, Skipper is the skip parser type.
// The start rule synthesises a std::vector<ScrapNode>.
//
// NOTE(review): this header names ScrapNode, FromBlock, SourceInfo,
// XPathElement, ApplyBlock, StructBlock, MustacheBlock, StructItem,
// std::string and std::vector, but its only include appears to be
// <boost/spirit/include/qi.hpp>. It seems to rely on the including file
// pulling in scraplang/scrap_node.hpp first - confirm.
template <typename I, typename Skipper>
|
||||
class ScrapGrammar : public qi::grammar<I, std::vector<ScrapNode>(), Skipper> {
|
||||
public:
|
||||
ScrapGrammar() : ScrapGrammar::base_type(start) {
|
||||
using qi::char_;
|
||||
using qi::lexeme;
|
||||
using qi::alpha;
|
||||
using qi::alnum;
|
||||
using qi::graph;
|
||||
using qi::attr;
|
||||
using qi::eol;
|
||||
using qi::eoi;
|
||||
using qi::lit;
|
||||
using qi::string;
|
||||
using qi::as_string;
|
||||
using qi::no_skip;
|
||||
|
||||
// Whole script: optional leading newlines, then from/apply/mustache
// blocks separated by one or more newlines, optional trailing newlines,
// and finally end of input.
start = *eol >> (from_block | apply_block | mustache_block) % +eol >> *eol >> eoi;
|
||||
// "from" <source> NEWLINE+ <assignments> NEWLINE+ "end"
from_block = lit("from") >> source_info >> +eol >> assignment_list >> +eol >> "end";
|
||||
// Either a url or a literal "-" (both tagged SourceInfo::URL), or a
// {{token}} (tagged SourceInfo::Token).
source_info = ((url | string("-")) >> attr(SourceInfo::URL)) | (mustache_like_token >> attr(SourceInfo::Token));
|
||||
// Optional "<letters>://" scheme, then a letter followed by any
// printable non-space characters.
url = -(+alpha >> string("://")) >> alpha >> *graph;
|
||||
// An identifier wrapped in double curly braces.
mustache_like_token = "{{" >> identifier >> "}}";
|
||||
// Non-empty text between double quotes; there is no escape sequence
// for embedding a '"'.
quoted_string %= lexeme['"' >> +(char_ - '"') >> '"'];
|
||||
// <identifier> [ default("<text>") ] = <rest of the line>
xpath_assignment %= identifier >>
|
||||
-(lit("default") >> '(' >> quoted_string >> ')') >> "=" >>
|
||||
as_string[lexeme[+(graph | char_(" \t"))]];
|
||||
// Starts with a letter or '_', followed by groups of letters, digits
// or '_', each group optionally preceded by a single '.'.
identifier %= lexeme[(alpha | char_('_')) >> *(-char_('.') >> +(alnum | char_('_')))];
|
||||
|
||||
// "apply" {{name}} "to" <source> NEWLINE+ <assignments> NEWLINE+ "end"
apply_block = lit("apply") >> mustache_like_token >> "to" >> source_info >> +eol >>
|
||||
assignment_list >> +eol >> "end";
|
||||
// "struct" <identifier> NEWLINE+ <assignments> NEWLINE+ "end"
struct_block = "struct" >> identifier >> +eol >> assignment_list >> +eol >> "end";
|
||||
|
||||
// "==" <identifier> NEWLINE, then raw text taken verbatim (skipper
// disabled) up to the first "==end".
mustache_block %= as_string[lit("==") >> identifier] >> eol >>
|
||||
as_string[no_skip[+(!lit("==end") >> char_)]] >> "==end";
|
||||
|
||||
// XPath assignments and nested struct blocks, separated by newlines.
assignment_list = (xpath_assignment | struct_block) % +eol;
|
||||
}
|
||||
|
||||
private:
|
||||
// Shorthand for a rule over this grammar's iterator and skipper;
// F is the rule signature, e.g. std::string().
template <typename F>
|
||||
using RuleType = qi::rule<I, F, Skipper>;
|
||||
|
||||
RuleType<std::vector<ScrapNode>()> start;
|
||||
RuleType<FromBlock()> from_block;
|
||||
RuleType<std::string()> url;
|
||||
RuleType<std::string()> mustache_like_token;
|
||||
RuleType<std::string()> quoted_string;
|
||||
RuleType<XPathElement()> xpath_assignment;
|
||||
RuleType<std::string()> identifier;
|
||||
RuleType<SourceInfo()> source_info;
|
||||
RuleType<ApplyBlock()> apply_block;
|
||||
RuleType<StructBlock()> struct_block;
|
||||
RuleType<MustacheBlock()> mustache_block;
|
||||
RuleType<std::vector<StructItem>()> assignment_list;
|
||||
};
|
||||
} //namespace duck::sl
|
|
@ -47,9 +47,10 @@ namespace duck { namespace sl {
|
|||
}
|
||||
};
|
||||
|
||||
XPathRunner::XPathRunner (HtmlPoolBaseSP html_pool) :
|
||||
XPathRunner::XPathRunner (HtmlPoolBaseSP html_pool, XPathPtr& parXPath) :
|
||||
m_cached_results(),
|
||||
m_pool(html_pool)
|
||||
m_pool(html_pool),
|
||||
m_xpath(parXPath)
|
||||
{
|
||||
}
|
||||
|
||||
|
@ -73,7 +74,7 @@ namespace duck { namespace sl {
|
|||
#endif
|
||||
const std::string* html = m_pool->GetByID(id);
|
||||
|
||||
curr_vec = xpath_query(*html, std::string(parQuery));
|
||||
curr_vec = m_xpath->run_query(*html, std::string(parQuery));
|
||||
std::cout << "First time for this query, result cached now\n";
|
||||
}
|
||||
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
#define id46DB8F4F85E2417E9AF0B1A410240D4F
|
||||
|
||||
#include "html_pool_base.hpp"
|
||||
#include "xpath_fwd.hpp"
|
||||
#include <map>
|
||||
#include <string_view>
|
||||
#include <string>
|
||||
|
@ -27,7 +28,7 @@
|
|||
namespace duck { namespace sl {
|
||||
class XPathRunner {
|
||||
public:
|
||||
explicit XPathRunner (HtmlPoolBaseSP html_pool);
|
||||
XPathRunner (HtmlPoolBaseSP html_pool, XPathPtr& parXPath);
|
||||
~XPathRunner();
|
||||
|
||||
const std::vector<std::string>& query (
|
||||
|
@ -40,6 +41,7 @@ namespace duck { namespace sl {
|
|||
|
||||
std::map<XPathKey, std::vector<std::string>> m_cached_results;
|
||||
HtmlPoolBaseSP m_pool;
|
||||
XPathPtr m_xpath;
|
||||
};
|
||||
}} //namespace duck::sl
|
||||
|
||||
|
|
|
@ -47,7 +47,11 @@ namespace duck {
|
|||
}
|
||||
} //unnamed namespace
|
||||
|
||||
XPathBatchResults xpath_query (const std::string& parXML, const std::vector<std::string>& parQueries) {
|
||||
XPath::XPath() = default;
|
||||
|
||||
XPath::~XPath() = default;
|
||||
|
||||
auto XPath::run_query (const std::string& parXML, const std::vector<std::string>& parQueries) -> BatchResults {
|
||||
XQilla& xqilla = m_xqilla;
|
||||
XercesConfiguration xconfig;
|
||||
AutoDelete<DynamicContext> context(xqilla.createContext(XQilla::XQUERY_UPDATE, &xconfig));
|
||||
|
@ -61,7 +65,7 @@ namespace duck {
|
|||
}
|
||||
context->setContextItem(ptr);
|
||||
|
||||
XPathBatchResults retval;
|
||||
BatchResults retval;
|
||||
for (const auto& xpath : parQueries) {
|
||||
AutoDelete<XQQuery> query(xqilla.parse(X(xpath.c_str())));
|
||||
context->setContextPosition(1);
|
||||
|
@ -75,11 +79,11 @@ namespace duck {
|
|||
}
|
||||
retval.push_back(std::move(new_lst));
|
||||
}
|
||||
return std::move(retval);
|
||||
return retval;
|
||||
}
|
||||
|
||||
std::vector<std::string> xpath_query (const std::string& parXML, const std::string& parQuery) {
|
||||
auto query_res = xpath_query(parXML, std::vector<std::string>{parQuery});
|
||||
std::vector<std::string> XPath::run_query (const std::string& parXML, const std::string& parQuery) {
|
||||
auto query_res = run_query(parXML, std::vector<std::string>{parQuery});
|
||||
if (query_res.empty() or query_res.front().empty()) {
|
||||
return std::vector<std::string>();
|
||||
}
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
#ifndef id21E0A6F345D24C5D83D3B1F74EC810F7
|
||||
#define id21E0A6F345D24C5D83D3B1F74EC810F7
|
||||
|
||||
#include "xpath_fwd.hpp"
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <exception>
|
||||
|
@ -26,8 +27,6 @@
|
|||
#include <xqilla/xqilla-simple.hpp>
|
||||
|
||||
namespace duck {
|
||||
typedef std::vector<std::vector<std::pair<std::string, std::string>>> XPathBatchResults;
|
||||
|
||||
class ParseError : public std::exception {
|
||||
public:
|
||||
ParseError ( int parLine, int parColumn, std::string parMessage );
|
||||
|
@ -36,8 +35,19 @@ namespace duck {
|
|||
std::vector<char> m_msg;
|
||||
};
|
||||
|
||||
XPathBatchResults xpath_query ( const std::string& parXML, const std::vector<std::string>& parQueries );
|
||||
std::vector<std::string> xpath_query ( const std::string& parXML, const std::string& parQuery );
|
||||
class XPath : public Kakoune::SafeCountable {
|
||||
public:
|
||||
typedef std::vector<std::vector<std::pair<std::string, std::string>>> BatchResults;
|
||||
|
||||
XPath();
|
||||
~XPath();
|
||||
|
||||
BatchResults run_query ( const std::string& parXML, const std::vector<std::string>& parQueries );
|
||||
std::vector<std::string> run_query ( const std::string& parXML, const std::string& parQuery );
|
||||
|
||||
private:
|
||||
XQilla m_xqilla;
|
||||
};
|
||||
} //namespace duck
|
||||
|
||||
#endif
|
||||
|
|
29
src/xpath_fwd.hpp
Normal file
29
src/xpath_fwd.hpp
Normal file
|
@ -0,0 +1,29 @@
|
|||
/* Copyright (C) 2015-2020 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef id08062CD6C4904D94BFF57990C44B6CCB
|
||||
#define id08062CD6C4904D94BFF57990C44B6CCB
|
||||
|
||||
#include "kakoune/safe_ptr.hh"
|
||||
|
||||
// Forward declarations for code that only needs to pass an XPath object
// around, without including the full xpath.hpp header.
namespace duck {
|
||||
// Defined in xpath.hpp.
class XPath;
|
||||
// Kakoune::SafePtr handle to an XPath object.
using XPathPtr = Kakoune::SafePtr<XPath>;
|
||||
} //namespace duck
|
||||
|
||||
#endif
|
Loading…
Add table
Reference in a new issue