Working on better scraplang support, still not there though.
This commit is contained in:
parent
430886085c
commit
60d6c2cb61
15 changed files with 275 additions and 106 deletions
|
@ -28,6 +28,7 @@ add_executable(${PROJECT_NAME}
|
||||||
src/html_pool.cpp
|
src/html_pool.cpp
|
||||||
src/htmlretrieve.cpp
|
src/htmlretrieve.cpp
|
||||||
src/commandline.cpp
|
src/commandline.cpp
|
||||||
|
src/scraplang/parse_exports.cpp
|
||||||
src/scraplang/parse.cpp
|
src/scraplang/parse.cpp
|
||||||
src/scraplang/apply.cpp
|
src/scraplang/apply.cpp
|
||||||
src/xpath.cpp
|
src/xpath.cpp
|
||||||
|
|
|
@ -31,19 +31,6 @@
|
||||||
|
|
||||||
namespace duck {
|
namespace duck {
|
||||||
namespace {
|
namespace {
|
||||||
// Lower-cases a single ASCII letter; every other byte is returned unchanged.
// Hand-rolled so that tag matching never depends on the current locale.
char asciiLower (char c) {
	return (c >= 'A' && c <= 'Z') ? static_cast<char>(c - 'A' + 'a') : c;
}

// Returns the index of the first ASCII case-insensitive occurrence of `tag`
// in `html`, at or after `from`, that is a whole tag name: it must be followed
// by whitespace, '/', '>' or the end of the input. Returns std::string::npos
// when there is no such occurrence.
size_t findTagStart (const std::string& html, const std::string& tag, size_t from) {
	const auto same_letter = [](char a, char b) { return asciiLower(a) == asciiLower(b); };

	while (from < html.size()) {
		const auto hit = std::search(
			html.begin() + static_cast<std::string::difference_type>(from),
			html.end(),
			tag.begin(),
			tag.end(),
			same_letter
		);
		if (hit == html.end())
			break;

		const size_t index = static_cast<size_t>(hit - html.begin());
		const size_t after = index + tag.size();
		if (after >= html.size())
			return index;

		const char next = html[after];
		if (next == '>' || next == '/' || next == ' ' || next == '\t' || next == '\n' || next == '\r' || next == '\f')
			return index;

		// Only a prefix of a longer name (e.g. "<scripted-el"): keep looking.
		from = index + 1;
	}
	return std::string::npos;
}

// Removes every <script ...> ... </script> element from `html`, in place.
//
// Tag names are matched ASCII case-insensitively and must be complete names,
// so "<SCRIPT>" is dropped while "<scripted-el>" is left alone. The closing
// tag may carry whitespace before its '>'. An opening tag with no matching
// closing tag causes everything from that opening tag to the end of the
// string to be removed.
void dropScriptTags (std::string& html) {
	const std::string open_tag("<script");
	const std::string close_tag("</script");

	size_t open_index = 0;
	// After each erase the text that followed the element now starts at
	// open_index, so the search resumes from that same position.
	while (html.npos != (open_index = findTagStart(html, open_tag, open_index))) {
		size_t erase_end = html.size();

		const size_t close_index = findTagStart(html, close_tag, open_index + open_tag.size());
		if (close_index != html.npos) {
			const size_t close_end = html.find('>', close_index + close_tag.size());
			if (close_end != html.npos)
				erase_end = close_end + 1;
		}

		html.erase(open_index, erase_end - open_index);
	}
}
|
|
||||||
|
|
||||||
bool isHttps (const std::string_view& parUrl) {
|
bool isHttps (const std::string_view& parUrl) {
|
||||||
const char protocol[] = "https://";
|
const char protocol[] = "https://";
|
||||||
|
@ -56,7 +43,6 @@ namespace duck {
|
||||||
} //unnamed namespace
|
} //unnamed namespace
|
||||||
|
|
||||||
std::string clean_html (std::string&& html) {
|
std::string clean_html (std::string&& html) {
|
||||||
dropScriptTags(html);
|
|
||||||
|
|
||||||
// Initialize a Tidy document
|
// Initialize a Tidy document
|
||||||
TidyDoc tidyDoc = tidyCreate();
|
TidyDoc tidyDoc = tidyCreate();
|
||||||
|
|
24
src/main.cpp
24
src/main.cpp
|
@ -22,6 +22,7 @@
|
||||||
#include "scraplang.hpp"
|
#include "scraplang.hpp"
|
||||||
#include "html_pool.hpp"
|
#include "html_pool.hpp"
|
||||||
#include "read_all.hpp"
|
#include "read_all.hpp"
|
||||||
|
#include "safe_stack_object.hpp"
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
|
@ -32,8 +33,8 @@
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
void dump_string ( const std::string& parPathDest, const std::string& parData );
|
void dump_string ( const std::string& parPathDest, const std::string& parData );
|
||||||
void load_from_commandline ( const boost::program_options::variables_map& parVarMap );
|
void load_from_commandline ( const boost::program_options::variables_map& parVarMap, duck::XPathPtr xpath );
|
||||||
void load_from_model ( const boost::program_options::variables_map& parVarMap );
|
void load_from_model ( const boost::program_options::variables_map& parVarMap, duck::XPathPtr xpath );
|
||||||
} //unnamed namespace
|
} //unnamed namespace
|
||||||
|
|
||||||
int main (int argc, char* argv[]) {
|
int main (int argc, char* argv[]) {
|
||||||
|
@ -51,10 +52,11 @@ int main (int argc, char* argv[]) {
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
curry::SafeStackObject<duck::XPath> query;
|
||||||
if (vm.count("model"))
|
if (vm.count("model"))
|
||||||
load_from_model(vm);
|
load_from_model(vm, query);
|
||||||
else
|
else
|
||||||
load_from_commandline(vm);
|
load_from_commandline(vm, query);
|
||||||
}
|
}
|
||||||
catch (const duck::ParseError& err) {
|
catch (const duck::ParseError& err) {
|
||||||
std::cerr << err.what() << std::endl;
|
std::cerr << err.what() << std::endl;
|
||||||
|
@ -74,7 +76,7 @@ namespace {
|
||||||
*os << parData;
|
*os << parData;
|
||||||
}
|
}
|
||||||
|
|
||||||
void load_from_commandline (const boost::program_options::variables_map& parVarMap) {
|
void load_from_commandline (const boost::program_options::variables_map& parVarMap, duck::XPathPtr xpath) {
|
||||||
const auto& vm = parVarMap;
|
const auto& vm = parVarMap;
|
||||||
const auto url = vm["input-url"].as<std::string>();
|
const auto url = vm["input-url"].as<std::string>();
|
||||||
|
|
||||||
|
@ -91,25 +93,25 @@ namespace {
|
||||||
dump_string(vm["dump"].as<std::string>(), html);
|
dump_string(vm["dump"].as<std::string>(), html);
|
||||||
}
|
}
|
||||||
|
|
||||||
const std::string xpath = parVarMap["xpath"].as<std::string>();
|
const std::string xpath_str = parVarMap["xpath"].as<std::string>();
|
||||||
|
|
||||||
#if !defined(NDEBUG)
|
#if !defined(NDEBUG)
|
||||||
std::cout << " -- XPath direct mode --\n";
|
std::cout << " -- XPath direct mode --\n";
|
||||||
std::cout << "URL : " << parVarMap["input-url"].as<std::string>() << "\n";
|
std::cout << "URL : " << parVarMap["input-url"].as<std::string>() << "\n";
|
||||||
std::cout << "XPath: " << xpath << std::endl;
|
std::cout << "XPath: " << xpath_str << std::endl;
|
||||||
std::cout << "Agent: " << parVarMap["agent"].as<std::string>() << std::endl;
|
std::cout << "Agent: " << parVarMap["agent"].as<std::string>() << std::endl;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
std::vector<std::string> queries;
|
std::vector<std::string> queries;
|
||||||
queries.reserve(1);
|
queries.reserve(1);
|
||||||
queries.push_back(std::move(xpath));
|
queries.push_back(std::move(xpath_str));
|
||||||
auto results = duck::xpath_query(html, queries);
|
auto results = xpath->run_query(html, queries);
|
||||||
for (const auto& lst : results[0]) {
|
for (const auto& lst : results[0]) {
|
||||||
std::cout << lst.first << ": " << lst.second << '\n';
|
std::cout << lst.first << ": " << lst.second << '\n';
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void load_from_model (const boost::program_options::variables_map& parVarMap) {
|
void load_from_model (const boost::program_options::variables_map& parVarMap, duck::XPathPtr xpath) {
|
||||||
#if !defined(NDEBUG)
|
#if !defined(NDEBUG)
|
||||||
std::cout << " -- XPath model mode --\n";
|
std::cout << " -- XPath model mode --\n";
|
||||||
if (parVarMap.count("input-url"))
|
if (parVarMap.count("input-url"))
|
||||||
|
@ -121,7 +123,7 @@ namespace {
|
||||||
auto ast = duck::sl::parse(script);
|
auto ast = duck::sl::parse(script);
|
||||||
|
|
||||||
duck::HtmlPool html_pool(std::string(parVarMap["agent"].as<std::string>()));
|
duck::HtmlPool html_pool(std::string(parVarMap["agent"].as<std::string>()));
|
||||||
duck::sl::apply(ast, duck::sl::HtmlPoolBaseSP(&html_pool));
|
duck::sl::apply(ast, duck::sl::HtmlPoolBaseSP(&html_pool), xpath);
|
||||||
//auto list = duck::get_xpath_definitions(*ast);
|
//auto list = duck::get_xpath_definitions(*ast);
|
||||||
|
|
||||||
//std::vector<std::string> expressions;
|
//std::vector<std::string> expressions;
|
||||||
|
|
|
@ -396,7 +396,8 @@ namespace duck { namespace sl {
|
||||||
|
|
||||||
std::vector<std::string> apply (
|
std::vector<std::string> apply (
|
||||||
const ScrapNode& node,
|
const ScrapNode& node,
|
||||||
HtmlPoolBaseSP html_pool
|
HtmlPoolBaseSP html_pool,
|
||||||
|
XPathPtr xpath
|
||||||
) {
|
) {
|
||||||
using std::placeholders::_1;
|
using std::placeholders::_1;
|
||||||
|
|
||||||
|
@ -410,7 +411,7 @@ namespace duck { namespace sl {
|
||||||
retval.reserve(apply_entries.size());
|
retval.reserve(apply_entries.size());
|
||||||
|
|
||||||
std::cout << "-------------- visiting done ----------------\n";
|
std::cout << "-------------- visiting done ----------------\n";
|
||||||
XPathRunner xpath_runner(html_pool);
|
XPathRunner xpath_runner(html_pool, xpath);
|
||||||
|
|
||||||
for (auto& apply_entry : apply_entries) {
|
for (auto& apply_entry : apply_entries) {
|
||||||
std::string name(apply_entry.mustache_name);
|
std::string name(apply_entry.mustache_name);
|
||||||
|
|
|
@ -21,10 +21,15 @@
|
||||||
|
|
||||||
#include "scrap_node.hpp"
|
#include "scrap_node.hpp"
|
||||||
#include "scraplang/html_pool_base.hpp"
|
#include "scraplang/html_pool_base.hpp"
|
||||||
|
#include "xpath_fwd.hpp"
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
namespace duck { namespace sl {
|
namespace duck { namespace sl {
|
||||||
std::vector<std::string> apply (const ScrapNode& node, HtmlPoolBaseSP html_pool);
|
std::vector<std::string> apply (
|
||||||
|
const ScrapNode& node,
|
||||||
|
HtmlPoolBaseSP html_pool,
|
||||||
|
XPathPtr xpath
|
||||||
|
);
|
||||||
}} //namespace duck::sl
|
}} //namespace duck::sl
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -17,7 +17,9 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "parse.hpp"
|
#include "parse.hpp"
|
||||||
#include "element_def.hpp"
|
#include "scraplang/parse_exports.hpp"
|
||||||
|
#include "scraplang/scrapgrammar.hpp"
|
||||||
|
#include "scraplang/element_def.hpp"
|
||||||
#include <boost/spirit/include/qi.hpp>
|
#include <boost/spirit/include/qi.hpp>
|
||||||
#include <boost/spirit/include/phoenix_stl.hpp>
|
#include <boost/spirit/include/phoenix_stl.hpp>
|
||||||
#include <boost/spirit/include/phoenix_fusion.hpp>
|
#include <boost/spirit/include/phoenix_fusion.hpp>
|
||||||
|
@ -27,10 +29,8 @@
|
||||||
#if !defined(NDEBUG)
|
#if !defined(NDEBUG)
|
||||||
# include <iostream>
|
# include <iostream>
|
||||||
#endif
|
#endif
|
||||||
#include <boost/variant/apply_visitor.hpp>
|
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
|
|
||||||
namespace qi = boost::spirit::qi;
|
|
||||||
namespace sp = boost::spirit;
|
namespace sp = boost::spirit;
|
||||||
|
|
||||||
BOOST_FUSION_ADAPT_STRUCT(
|
BOOST_FUSION_ADAPT_STRUCT(
|
||||||
|
@ -67,66 +67,8 @@ BOOST_FUSION_ADAPT_STRUCT(
|
||||||
)
|
)
|
||||||
|
|
||||||
namespace duck { namespace sl {
|
namespace duck { namespace sl {
|
||||||
namespace {
|
std::vector<ScrapNode> parse (std::string_view parData) {
|
||||||
template <typename I, typename Skipper>
|
ScrapGrammar<std::string_view::const_iterator, boost::spirit::qi::ascii::blank_type> gramm;
|
||||||
class ScrapGrammar : public qi::grammar<I, std::vector<ScrapNode>(), Skipper> {
|
|
||||||
public:
|
|
||||||
ScrapGrammar() : ScrapGrammar::base_type(start) {
|
|
||||||
using qi::char_;
|
|
||||||
using qi::lexeme;
|
|
||||||
using qi::alpha;
|
|
||||||
using qi::alnum;
|
|
||||||
using qi::graph;
|
|
||||||
using qi::attr;
|
|
||||||
using qi::eol;
|
|
||||||
using qi::eoi;
|
|
||||||
using qi::lit;
|
|
||||||
using qi::string;
|
|
||||||
using qi::as_string;
|
|
||||||
using qi::no_skip;
|
|
||||||
|
|
||||||
start = *eol >> (from_block | apply_block | mustache_block) % +eol >> *eol >> eoi;
|
|
||||||
from_block = lit("from") >> source_info >> +eol >> assignment_list >> +eol >> "end";
|
|
||||||
source_info = ((url | string("-")) >> attr(SourceInfo::URL)) | (mustache_like_token >> attr(SourceInfo::Token));
|
|
||||||
url = -(+alpha >> string("://")) >> alpha >> *graph;
|
|
||||||
mustache_like_token = "{{" >> identifier >> "}}";
|
|
||||||
quoted_string %= lexeme['"' >> +(char_ - '"') >> '"'];
|
|
||||||
xpath_assignment %= identifier >>
|
|
||||||
-(lit("default") >> '(' >> quoted_string >> ')') >> "=" >>
|
|
||||||
as_string[lexeme[+(graph | char_(" \t"))]];
|
|
||||||
identifier %= lexeme[(alpha | char_('_')) >> *(-char_('.') >> +(alnum | char_('_')))];
|
|
||||||
|
|
||||||
apply_block = lit("apply") >> mustache_like_token >> "to" >> source_info >> +eol >>
|
|
||||||
assignment_list >> +eol >> "end";
|
|
||||||
struct_block = "struct" >> identifier >> +eol >> assignment_list >> +eol >> "end";
|
|
||||||
|
|
||||||
mustache_block %= as_string[lit("==") >> identifier] >> eol >>
|
|
||||||
as_string[no_skip[+(!lit("==end") >> char_)]] >> "==end";
|
|
||||||
|
|
||||||
assignment_list = (xpath_assignment | struct_block) % +eol;
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
|
||||||
template <typename F>
|
|
||||||
using RuleType = qi::rule<I, F, Skipper>;
|
|
||||||
|
|
||||||
RuleType<std::vector<ScrapNode>()> start;
|
|
||||||
RuleType<FromBlock()> from_block;
|
|
||||||
RuleType<std::string()> url;
|
|
||||||
RuleType<std::string()> mustache_like_token;
|
|
||||||
RuleType<std::string()> quoted_string;
|
|
||||||
RuleType<XPathElement()> xpath_assignment;
|
|
||||||
RuleType<std::string()> identifier;
|
|
||||||
RuleType<SourceInfo()> source_info;
|
|
||||||
RuleType<ApplyBlock()> apply_block;
|
|
||||||
RuleType<StructBlock()> struct_block;
|
|
||||||
RuleType<MustacheBlock()> mustache_block;
|
|
||||||
RuleType<std::vector<StructItem>()> assignment_list;
|
|
||||||
};
|
|
||||||
} //unnamed namespace
|
|
||||||
|
|
||||||
std::vector<ScrapNode> parse (const std::string& parData) {
|
|
||||||
ScrapGrammar<std::string::const_iterator, boost::spirit::qi::ascii::blank_type> gramm;
|
|
||||||
auto it_start = parData.cbegin();
|
auto it_start = parData.cbegin();
|
||||||
|
|
||||||
std::vector<ScrapNode> retval;
|
std::vector<ScrapNode> retval;
|
||||||
|
|
|
@ -20,10 +20,10 @@
|
||||||
#define idBE96C2D49C4C413888A79EAEB2B9C0FA
|
#define idBE96C2D49C4C413888A79EAEB2B9C0FA
|
||||||
|
|
||||||
#include "scrap_node.hpp"
|
#include "scrap_node.hpp"
|
||||||
#include <string>
|
#include <string_view>
|
||||||
|
|
||||||
namespace duck { namespace sl {
|
namespace duck { namespace sl {
|
||||||
std::vector<ScrapNode> parse ( const std::string& parData );
|
std::vector<ScrapNode> parse ( std::string_view parData );
|
||||||
}} //namespace duck::sl
|
}} //namespace duck::sl
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
51
src/scraplang/parse_exports.cpp
Normal file
51
src/scraplang/parse_exports.cpp
Normal file
|
@ -0,0 +1,51 @@
|
||||||
|
/* Copyright (C) 2017-2020 Michele Santullo
|
||||||
|
*
|
||||||
|
* This file is part of DuckScraper.
|
||||||
|
*
|
||||||
|
* DuckScraper is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* DuckScraper is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "scraplang/parse_exports.hpp"
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
// Explicit instantiation definitions for the Boost.Spirit Qi templates listed
// below. scraplang/parse_exports.hpp carries the matching `extern template`
// declarations.
// NOTE(review): this grammar specialization names no attribute signature, while
// ScrapGrammar derives from qi::grammar<I, std::vector<ScrapNode>(), Skipper> --
// presumably both are meant to be the same specialization; confirm.
template class boost::spirit::qi::grammar<std::basic_string<char>::const_iterator, boost::spirit::qi::ascii::blank_type>;
|
||||||
|
|
||||||
|
// One rule instantiation per attribute type used by ScrapGrammar's rules.
// NOTE(review): the first template argument here is std::basic_string<char>
// itself, whereas ScrapGrammar declares its rules as qi::rule<I, F, Skipper>
// with I being the iterator type -- looks like `::const_iterator` is missing;
// confirm before relying on these instantiations.
template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::ApplyBlock(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::FromBlock(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::MustacheBlock(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::SourceInfo(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
template class boost::spirit::qi::rule<std::basic_string<char>, std::string(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
template class boost::spirit::qi::rule<std::basic_string<char>, std::vector<duck::sl::ScrapNode>(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
template class boost::spirit::qi::rule<std::basic_string<char>, std::vector<duck::sl::StructItem>(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::StructBlock(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::XPathElement(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
|
||||||
|
// phrase_parse() instantiated for a std::string const_iterator range, parsed
// with ScrapGrammar and the ascii blank skipper into a std::vector<ScrapNode>.
// NOTE(review): duck::sl::parse() builds its grammar over
// std::string_view::const_iterator, which is not necessarily the same type as
// the iterator named here -- confirm this instantiation is the one being used.
template bool boost::spirit::qi::phrase_parse<
|
||||||
|
std::basic_string<char>::const_iterator,
|
||||||
|
duck::sl::ScrapGrammar<std::basic_string<char>::const_iterator, boost::spirit::qi::ascii::blank_type>,
|
||||||
|
boost::spirit::ascii::blank_type,
|
||||||
|
std::vector<duck::sl::ScrapNode>
|
||||||
|
> (
|
||||||
|
std::basic_string<char>::const_iterator&,
|
||||||
|
std::basic_string<char>::const_iterator,
|
||||||
|
duck::sl::ScrapGrammar<
|
||||||
|
std::basic_string<char>::const_iterator,
|
||||||
|
boost::spirit::qi::ascii::blank_type
|
||||||
|
> const&,
|
||||||
|
boost::spirit::ascii::blank_type const&,
|
||||||
|
std::vector<duck::sl::ScrapNode>&
|
||||||
|
);
|
||||||
|
|
||||||
|
// NOTE(review): presumably instantiated because ScrapGrammar's rules are built
// almost entirely with the `>>` sequence operator -- confirm it has an effect.
template struct boost::spirit::use_operator<boost::spirit::qi::domain, boost::proto::tag::shift_right>;
|
54
src/scraplang/parse_exports.hpp
Normal file
54
src/scraplang/parse_exports.hpp
Normal file
|
@ -0,0 +1,54 @@
|
||||||
|
/* Copyright (C) 2017-2020 Michele Santullo
|
||||||
|
*
|
||||||
|
* This file is part of DuckScraper.
|
||||||
|
*
|
||||||
|
* DuckScraper is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* DuckScraper is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "scraplang/scrap_node.hpp"
|
||||||
|
#include "scraplang/scrapgrammar.hpp"
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
// Explicit instantiation declarations (`extern template`) for the Boost.Spirit
// Qi templates listed below; the corresponding definitions are in
// scraplang/parse_exports.cpp.
// NOTE(review): this grammar specialization names no attribute signature, while
// ScrapGrammar derives from qi::grammar<I, std::vector<ScrapNode>(), Skipper> --
// presumably both are meant to be the same specialization; confirm.
extern template class boost::spirit::qi::grammar<std::basic_string<char>::const_iterator, boost::spirit::qi::ascii::blank_type>;
|
||||||
|
|
||||||
|
// One rule declaration per attribute type used by ScrapGrammar's rules.
// NOTE(review): the first template argument here is std::basic_string<char>
// itself, whereas ScrapGrammar declares its rules as qi::rule<I, F, Skipper>
// with I being the iterator type -- looks like `::const_iterator` is missing;
// confirm, and keep this list in sync with parse_exports.cpp.
extern template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::ApplyBlock(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
extern template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::FromBlock(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
extern template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::MustacheBlock(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
extern template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::SourceInfo(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
extern template class boost::spirit::qi::rule<std::basic_string<char>, std::string(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
extern template class boost::spirit::qi::rule<std::basic_string<char>, std::vector<duck::sl::ScrapNode>(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
extern template class boost::spirit::qi::rule<std::basic_string<char>, std::vector<duck::sl::StructItem>(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
extern template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::StructBlock(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
extern template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::XPathElement(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
|
||||||
|
// phrase_parse() declared for a std::string const_iterator range, parsed with
// ScrapGrammar and the ascii blank skipper into a std::vector<ScrapNode>.
// NOTE(review): duck::sl::parse() builds its grammar over
// std::string_view::const_iterator, which is not necessarily the same type as
// the iterator named here -- confirm this declaration matches the call site.
extern template bool boost::spirit::qi::phrase_parse<
|
||||||
|
std::basic_string<char>::const_iterator,
|
||||||
|
duck::sl::ScrapGrammar<std::basic_string<char>::const_iterator, boost::spirit::qi::ascii::blank_type>,
|
||||||
|
boost::spirit::ascii::blank_type,
|
||||||
|
std::vector<duck::sl::ScrapNode>
|
||||||
|
> (
|
||||||
|
std::basic_string<char>::const_iterator&,
|
||||||
|
std::basic_string<char>::const_iterator,
|
||||||
|
duck::sl::ScrapGrammar<
|
||||||
|
std::basic_string<char>::const_iterator,
|
||||||
|
boost::spirit::qi::ascii::blank_type
|
||||||
|
> const&,
|
||||||
|
boost::spirit::ascii::blank_type const&,
|
||||||
|
std::vector<duck::sl::ScrapNode>&
|
||||||
|
);
|
||||||
|
|
||||||
|
// NOTE(review): presumably declared because ScrapGrammar's rules are built
// almost entirely with the `>>` sequence operator -- confirm it has an effect.
extern template struct boost::spirit::use_operator<boost::spirit::qi::domain, boost::proto::tag::shift_right>;
|
81
src/scraplang/scrapgrammar.hpp
Normal file
81
src/scraplang/scrapgrammar.hpp
Normal file
|
@ -0,0 +1,81 @@
|
||||||
|
/* Copyright (C) 2017-2020 Michele Santullo
|
||||||
|
*
|
||||||
|
* This file is part of DuckScraper.
|
||||||
|
*
|
||||||
|
* DuckScraper is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* DuckScraper is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <boost/spirit/include/qi.hpp>
|
||||||
|
|
||||||
|
namespace duck::sl {
|
||||||
|
namespace qi = ::boost::spirit::qi;
|
||||||
|
|
||||||
|
// Boost.Spirit Qi grammar for scraplang scripts.
//
// I is the input iterator type and Skipper the skip parser; the synthesized
// attribute of the whole grammar is a std::vector<ScrapNode>.
//
// NOTE(review): this header only includes <boost/spirit/include/qi.hpp>, so
// ScrapNode, FromBlock, ApplyBlock, StructBlock, MustacheBlock, SourceInfo,
// XPathElement, StructItem, std::vector and std::string must already be
// declared by whoever includes it -- confirm the include order is intended.
template <typename I, typename Skipper>
|
||||||
|
class ScrapGrammar : public qi::grammar<I, std::vector<ScrapNode>(), Skipper> {
|
||||||
|
public:
|
||||||
|
// Builds every rule; `start` is registered as the grammar's entry rule.
ScrapGrammar() : ScrapGrammar::base_type(start) {
|
||||||
|
using qi::char_;
|
||||||
|
using qi::lexeme;
|
||||||
|
using qi::alpha;
|
||||||
|
using qi::alnum;
|
||||||
|
using qi::graph;
|
||||||
|
using qi::attr;
|
||||||
|
using qi::eol;
|
||||||
|
using qi::eoi;
|
||||||
|
using qi::lit;
|
||||||
|
using qi::string;
|
||||||
|
using qi::as_string;
|
||||||
|
using qi::no_skip;
|
||||||
|
|
||||||
|
// A script is: optional leading newlines, one or more top-level blocks
// (from / apply / mustache) separated by one or more newlines, optional
// trailing newlines, then end of input.
start = *eol >> (from_block | apply_block | mustache_block) % +eol >> *eol >> eoi;
|
||||||
|
// from <source> NEWLINE+ <assignments> NEWLINE+ end
from_block = lit("from") >> source_info >> +eol >> assignment_list >> +eol >> "end";
|
||||||
|
// A source is either a url or the literal "-" (both tagged SourceInfo::URL),
// or a {{token}} (tagged SourceInfo::Token).
source_info = ((url | string("-")) >> attr(SourceInfo::URL)) | (mustache_like_token >> attr(SourceInfo::Token));
|
||||||
|
// Optional "<letters>://" scheme, then a letter, then any run of graph characters.
url = -(+alpha >> string("://")) >> alpha >> *graph;
|
||||||
|
// {{identifier}}
mustache_like_token = "{{" >> identifier >> "}}";
|
||||||
|
// Non-empty double-quoted text with no embedded '"'; lexeme[] disables
// skipping inside the quotes.
quoted_string %= lexeme['"' >> +(char_ - '"') >> '"'];
|
||||||
|
// identifier [default("text")] = <rest of the line>
// The right-hand side is taken as one string of graph characters, spaces and tabs.
xpath_assignment %= identifier >>
|
||||||
|
-(lit("default") >> '(' >> quoted_string >> ')') >> "=" >>
|
||||||
|
as_string[lexeme[+(graph | char_(" \t"))]];
|
||||||
|
// A letter or '_', followed by zero or more groups made of an optional '.'
// and at least one alphanumeric or '_' character.
identifier %= lexeme[(alpha | char_('_')) >> *(-char_('.') >> +(alnum | char_('_')))];
|
||||||
|
|
||||||
|
// apply {{token}} to <source> NEWLINE+ <assignments> NEWLINE+ end
apply_block = lit("apply") >> mustache_like_token >> "to" >> source_info >> +eol >>
|
||||||
|
assignment_list >> +eol >> "end";
|
||||||
|
// struct <identifier> NEWLINE+ <assignments> NEWLINE+ end
struct_block = "struct" >> identifier >> +eol >> assignment_list >> +eol >> "end";
|
||||||
|
|
||||||
|
// ==<identifier> NEWLINE <raw text> ==end
// no_skip[] keeps the raw text verbatim (the skipper is not applied); the
// text runs up to, but not including, the first "==end".
mustache_block %= as_string[lit("==") >> identifier] >> eol >>
|
||||||
|
as_string[no_skip[+(!lit("==end") >> char_)]] >> "==end";
|
||||||
|
|
||||||
|
// One or more xpath assignments or nested struct blocks, separated by newlines.
assignment_list = (xpath_assignment | struct_block) % +eol;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
// Shorthand for a rule over this grammar's iterator and skipper with
// signature F.
template <typename F>
|
||||||
|
using RuleType = qi::rule<I, F, Skipper>;
|
||||||
|
|
||||||
|
RuleType<std::vector<ScrapNode>()> start;
|
||||||
|
RuleType<FromBlock()> from_block;
|
||||||
|
RuleType<std::string()> url;
|
||||||
|
RuleType<std::string()> mustache_like_token;
|
||||||
|
RuleType<std::string()> quoted_string;
|
||||||
|
RuleType<XPathElement()> xpath_assignment;
|
||||||
|
RuleType<std::string()> identifier;
|
||||||
|
RuleType<SourceInfo()> source_info;
|
||||||
|
RuleType<ApplyBlock()> apply_block;
|
||||||
|
RuleType<StructBlock()> struct_block;
|
||||||
|
RuleType<MustacheBlock()> mustache_block;
|
||||||
|
RuleType<std::vector<StructItem>()> assignment_list;
|
||||||
|
};
|
||||||
|
} //namespace duck::sl
|
|
@ -47,9 +47,10 @@ namespace duck { namespace sl {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// Constructs a runner with an empty result cache that keeps the given HTML
// pool handle; the two-argument form additionally keeps a copy of the XPath
// handle in m_xpath.
// NOTE(review): parXPath is taken by non-const reference but is only copied
// into m_xpath here -- presumably a const reference or by-value parameter
// would do; confirm against Kakoune::SafePtr's copy constructor.
XPathRunner::XPathRunner (HtmlPoolBaseSP html_pool) :
|
XPathRunner::XPathRunner (HtmlPoolBaseSP html_pool, XPathPtr& parXPath) :
|
||||||
m_cached_results(),
|
m_cached_results(),
|
||||||
m_pool(html_pool)
|
m_pool(html_pool),
|
||||||
|
m_xpath(parXPath)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -73,7 +74,7 @@ namespace duck { namespace sl {
|
||||||
#endif
|
#endif
|
||||||
const std::string* html = m_pool->GetByID(id);
|
const std::string* html = m_pool->GetByID(id);
|
||||||
|
|
||||||
curr_vec = xpath_query(*html, std::string(parQuery));
|
curr_vec = m_xpath->run_query(*html, std::string(parQuery));
|
||||||
std::cout << "First time for this query, result cached now\n";
|
std::cout << "First time for this query, result cached now\n";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -20,6 +20,7 @@
|
||||||
#define id46DB8F4F85E2417E9AF0B1A410240D4F
|
#define id46DB8F4F85E2417E9AF0B1A410240D4F
|
||||||
|
|
||||||
#include "html_pool_base.hpp"
|
#include "html_pool_base.hpp"
|
||||||
|
#include "xpath_fwd.hpp"
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <string_view>
|
#include <string_view>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
@ -27,7 +28,7 @@
|
||||||
namespace duck { namespace sl {
|
namespace duck { namespace sl {
|
||||||
class XPathRunner {
|
class XPathRunner {
|
||||||
public:
|
public:
|
||||||
explicit XPathRunner (HtmlPoolBaseSP html_pool);
|
XPathRunner (HtmlPoolBaseSP html_pool, XPathPtr& parXPath);
|
||||||
~XPathRunner();
|
~XPathRunner();
|
||||||
|
|
||||||
const std::vector<std::string>& query (
|
const std::vector<std::string>& query (
|
||||||
|
@ -40,6 +41,7 @@ namespace duck { namespace sl {
|
||||||
|
|
||||||
std::map<XPathKey, std::vector<std::string>> m_cached_results;
|
std::map<XPathKey, std::vector<std::string>> m_cached_results;
|
||||||
HtmlPoolBaseSP m_pool;
|
HtmlPoolBaseSP m_pool;
|
||||||
|
XPathPtr m_xpath;
|
||||||
};
|
};
|
||||||
}} //namespace duck::sl
|
}} //namespace duck::sl
|
||||||
|
|
||||||
|
|
|
@ -47,7 +47,11 @@ namespace duck {
|
||||||
}
|
}
|
||||||
} //unnamed namespace
|
} //unnamed namespace
|
||||||
|
|
||||||
XPathBatchResults xpath_query (const std::string& parXML, const std::vector<std::string>& parQueries) {
|
XPath::XPath() = default;
|
||||||
|
|
||||||
|
XPath::~XPath() = default;
|
||||||
|
|
||||||
|
auto XPath::run_query (const std::string& parXML, const std::vector<std::string>& parQueries) -> BatchResults {
|
||||||
XQilla& xqilla = m_xqilla;
|
XQilla& xqilla = m_xqilla;
|
||||||
XercesConfiguration xconfig;
|
XercesConfiguration xconfig;
|
||||||
AutoDelete<DynamicContext> context(xqilla.createContext(XQilla::XQUERY_UPDATE, &xconfig));
|
AutoDelete<DynamicContext> context(xqilla.createContext(XQilla::XQUERY_UPDATE, &xconfig));
|
||||||
|
@ -61,7 +65,7 @@ namespace duck {
|
||||||
}
|
}
|
||||||
context->setContextItem(ptr);
|
context->setContextItem(ptr);
|
||||||
|
|
||||||
XPathBatchResults retval;
|
BatchResults retval;
|
||||||
for (const auto& xpath : parQueries) {
|
for (const auto& xpath : parQueries) {
|
||||||
AutoDelete<XQQuery> query(xqilla.parse(X(xpath.c_str())));
|
AutoDelete<XQQuery> query(xqilla.parse(X(xpath.c_str())));
|
||||||
context->setContextPosition(1);
|
context->setContextPosition(1);
|
||||||
|
@ -75,11 +79,11 @@ namespace duck {
|
||||||
}
|
}
|
||||||
retval.push_back(std::move(new_lst));
|
retval.push_back(std::move(new_lst));
|
||||||
}
|
}
|
||||||
return std::move(retval);
|
return retval;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<std::string> xpath_query (const std::string& parXML, const std::string& parQuery) {
|
std::vector<std::string> XPath::run_query (const std::string& parXML, const std::string& parQuery) {
|
||||||
auto query_res = xpath_query(parXML, std::vector<std::string>{parQuery});
|
auto query_res = run_query(parXML, std::vector<std::string>{parQuery});
|
||||||
if (query_res.empty() or query_res.front().empty()) {
|
if (query_res.empty() or query_res.front().empty()) {
|
||||||
return std::vector<std::string>();
|
return std::vector<std::string>();
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,6 +19,7 @@
|
||||||
#ifndef id21E0A6F345D24C5D83D3B1F74EC810F7
|
#ifndef id21E0A6F345D24C5D83D3B1F74EC810F7
|
||||||
#define id21E0A6F345D24C5D83D3B1F74EC810F7
|
#define id21E0A6F345D24C5D83D3B1F74EC810F7
|
||||||
|
|
||||||
|
#include "xpath_fwd.hpp"
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <exception>
|
#include <exception>
|
||||||
|
@ -26,8 +27,6 @@
|
||||||
#include <xqilla/xqilla-simple.hpp>
|
#include <xqilla/xqilla-simple.hpp>
|
||||||
|
|
||||||
namespace duck {
|
namespace duck {
|
||||||
typedef std::vector<std::vector<std::pair<std::string, std::string>>> XPathBatchResults;
|
|
||||||
|
|
||||||
class ParseError : public std::exception {
|
class ParseError : public std::exception {
|
||||||
public:
|
public:
|
||||||
ParseError ( int parLine, int parColumn, std::string parMessage );
|
ParseError ( int parLine, int parColumn, std::string parMessage );
|
||||||
|
@ -36,8 +35,19 @@ namespace duck {
|
||||||
std::vector<char> m_msg;
|
std::vector<char> m_msg;
|
||||||
};
|
};
|
||||||
|
|
||||||
XPathBatchResults xpath_query ( const std::string& parXML, const std::vector<std::string>& parQueries );
|
// Runs XPath queries against a document passed in as a string, using the
// XQilla instance it owns (m_xqilla).
// Derives from Kakoune::SafeCountable; callers hold it through XPathPtr, which
// xpath_fwd.hpp defines as Kakoune::SafePtr<XPath>.
class XPath : public Kakoune::SafeCountable {
|
||||||
std::vector<std::string> xpath_query ( const std::string& parXML, const std::string& parQuery );
|
public:
|
||||||
|
// Result of a batch of queries: one inner vector per query, each holding
// (string, string) pairs.
// NOTE(review): the meaning of the pair's two halves is not visible here --
// main.cpp prints them as "first: second"; confirm against xpath.cpp.
typedef std::vector<std::vector<std::pair<std::string, std::string>>> BatchResults;
|
||||||
|
|
||||||
|
XPath();
|
||||||
|
~XPath();
|
||||||
|
|
||||||
|
// Runs every query in parQueries against parXML and returns one result list
// per query, in the order the queries were given.
BatchResults run_query ( const std::string& parXML, const std::vector<std::string>& parQueries );
|
||||||
|
// Single-query convenience overload: forwards to the batch overload with a
// one-element query list and returns an empty vector when that yields nothing.
std::vector<std::string> run_query ( const std::string& parXML, const std::string& parQuery );
|
||||||
|
|
||||||
|
private:
|
||||||
|
// XQilla instance used by run_query() to create contexts and parse queries.
XQilla m_xqilla;
|
||||||
|
};
|
||||||
} //namespace duck
|
} //namespace duck
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
29
src/xpath_fwd.hpp
Normal file
29
src/xpath_fwd.hpp
Normal file
|
@ -0,0 +1,29 @@
|
||||||
|
/* Copyright (C) 2015-2020 Michele Santullo
|
||||||
|
*
|
||||||
|
* This file is part of DuckScraper.
|
||||||
|
*
|
||||||
|
* DuckScraper is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* DuckScraper is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef id08062CD6C4904D94BFF57990C44B6CCB
|
||||||
|
#define id08062CD6C4904D94BFF57990C44B6CCB
|
||||||
|
|
||||||
|
#include "kakoune/safe_ptr.hh"
|
||||||
|
|
||||||
|
// Forward declarations for the XPath query class, so that headers which only
// pass an XPath handle around can include this file instead of xpath.hpp
// (which includes <xqilla/xqilla-simple.hpp>).
namespace duck {
|
||||||
|
class XPath;
|
||||||
|
// Handle type used to pass an XPath instance between components.
using XPathPtr = Kakoune::SafePtr<XPath>;
|
||||||
|
} //namespace duck
|
||||||
|
|
||||||
|
#endif
|
Loading…
Add table
Reference in a new issue