diff --git a/CMakeLists.txt b/CMakeLists.txt index d3eb079..37c6456 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,6 +28,7 @@ add_executable(${PROJECT_NAME} src/html_pool.cpp src/htmlretrieve.cpp src/commandline.cpp + src/scraplang/parse_exports.cpp src/scraplang/parse.cpp src/scraplang/apply.cpp src/xpath.cpp diff --git a/src/htmlretrieve.cpp b/src/htmlretrieve.cpp index 8ec9f5b..6a7cf6c 100644 --- a/src/htmlretrieve.cpp +++ b/src/htmlretrieve.cpp @@ -31,19 +31,6 @@ namespace duck { namespace { - void dropScriptTags (std::string& html) { - size_t open_index = 0; - const std::string open_tag(""); - - while (html.npos != (open_index = html.find(open_tag, open_index))) { - assert(open_index < html.size()); - auto close_index = html.find(close_tag, open_index + open_tag.size()); - if (close_index == html.npos) - close_index = html.size(); - html.erase(open_index, std::min(html.size(), close_index + close_tag.size()) - open_index); - } - } bool isHttps (const std::string_view& parUrl) { const char protocol[] = "https://"; @@ -56,7 +43,6 @@ namespace duck { } //unnamed namespace std::string clean_html (std::string&& html) { - dropScriptTags(html); // Initialize a Tidy document TidyDoc tidyDoc = tidyCreate(); diff --git a/src/main.cpp b/src/main.cpp index f3dd735..97c8050 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -22,6 +22,7 @@ #include "scraplang.hpp" #include "html_pool.hpp" #include "read_all.hpp" +#include "safe_stack_object.hpp" #include #include #include @@ -32,8 +33,8 @@ namespace { void dump_string ( const std::string& parPathDest, const std::string& parData ); - void load_from_commandline ( const boost::program_options::variables_map& parVarMap ); - void load_from_model ( const boost::program_options::variables_map& parVarMap ); + void load_from_commandline ( const boost::program_options::variables_map& parVarMap, duck::XPathPtr xpath ); + void load_from_model ( const boost::program_options::variables_map& parVarMap, duck::XPathPtr xpath ); } //unnamed namespace int main (int argc, char* argv[]) { @@ -51,10 +52,11 @@ int main (int argc, char* argv[]) { } try { + curry::SafeStackObject query; if (vm.count("model")) - load_from_model(vm); + load_from_model(vm, query); else - load_from_commandline(vm); + load_from_commandline(vm, query); } catch (const duck::ParseError& err) { std::cerr << err.what() << std::endl; @@ -74,7 +76,7 @@ namespace { *os << parData; } - void load_from_commandline (const boost::program_options::variables_map& parVarMap) { + void load_from_commandline (const boost::program_options::variables_map& parVarMap, duck::XPathPtr xpath) { const auto& vm = parVarMap; const auto url = vm["input-url"].as(); @@ -91,25 +93,25 @@ namespace { dump_string(vm["dump"].as(), html); } - const std::string xpath = parVarMap["xpath"].as(); + const std::string xpath_str = parVarMap["xpath"].as(); #if !defined(NDEBUG) std::cout << " -- XPath direct mode --\n"; std::cout << "URL : " << parVarMap["input-url"].as() << "\n"; - std::cout << "XPath: " << xpath << std::endl; + std::cout << "XPath: " << xpath_str << std::endl; std::cout << "Agent: " << parVarMap["agent"].as() << std::endl; #endif std::vector queries; queries.reserve(1); - queries.push_back(std::move(xpath)); - auto results = duck::xpath_query(html, queries); + queries.push_back(std::move(xpath_str)); + auto results = xpath->run_query(html, queries); for (const auto& lst : results[0]) { std::cout << lst.first << ": " << lst.second << '\n'; } } - void load_from_model (const boost::program_options::variables_map& parVarMap) { + void load_from_model (const boost::program_options::variables_map& parVarMap, duck::XPathPtr xpath) { #if !defined(NDEBUG) std::cout << " -- XPath model mode --\n"; if (parVarMap.count("input-url")) @@ -121,7 +123,7 @@ namespace { auto ast = duck::sl::parse(script); duck::HtmlPool html_pool(std::string(parVarMap["agent"].as())); - duck::sl::apply(ast, duck::sl::HtmlPoolBaseSP(&html_pool)); + duck::sl::apply(ast, duck::sl::HtmlPoolBaseSP(&html_pool), xpath); //auto list = duck::get_xpath_definitions(*ast); //std::vector expressions; diff --git a/src/scraplang/apply.cpp b/src/scraplang/apply.cpp index f054f49..e9dddf2 100644 --- a/src/scraplang/apply.cpp +++ b/src/scraplang/apply.cpp @@ -396,7 +396,8 @@ namespace duck { namespace sl { std::vector apply ( const ScrapNode& node, - HtmlPoolBaseSP html_pool + HtmlPoolBaseSP html_pool, + XPathPtr xpath ) { using std::placeholders::_1; @@ -410,7 +411,7 @@ namespace duck { namespace sl { retval.reserve(apply_entries.size()); std::cout << "-------------- visiting done ----------------\n"; - XPathRunner xpath_runner(html_pool); + XPathRunner xpath_runner(html_pool, xpath); for (auto& apply_entry : apply_entries) { std::string name(apply_entry.mustache_name); diff --git a/src/scraplang/apply.hpp b/src/scraplang/apply.hpp index 5017909..18efb4d 100644 --- a/src/scraplang/apply.hpp +++ b/src/scraplang/apply.hpp @@ -21,10 +21,15 @@ #include "scrap_node.hpp" #include "scraplang/html_pool_base.hpp" +#include "xpath_fwd.hpp" #include namespace duck { namespace sl { - std::vector apply (const ScrapNode& node, HtmlPoolBaseSP html_pool); + std::vector apply ( + const ScrapNode& node, + HtmlPoolBaseSP html_pool, + XPathPtr xpath + ); }} //namespace duck::sl #endif diff --git a/src/scraplang/parse.cpp b/src/scraplang/parse.cpp index 34d56a5..9005f65 100644 --- a/src/scraplang/parse.cpp +++ b/src/scraplang/parse.cpp @@ -17,7 +17,9 @@ */ #include "parse.hpp" -#include "element_def.hpp" +#include "scraplang/parse_exports.hpp" +#include "scraplang/scrapgrammar.hpp" +#include "scraplang/element_def.hpp" #include #include #include @@ -27,10 +29,8 @@ #if !defined(NDEBUG) # include #endif -#include #include -namespace qi = boost::spirit::qi; namespace sp = boost::spirit; BOOST_FUSION_ADAPT_STRUCT( @@ -67,66 +67,8 @@ BOOST_FUSION_ADAPT_STRUCT( ) namespace duck { namespace sl { - namespace { - template - class ScrapGrammar : public qi::grammar(), Skipper> { - public: - ScrapGrammar() : ScrapGrammar::base_type(start) { - using qi::char_; - using qi::lexeme; - using qi::alpha; - using qi::alnum; - using qi::graph; - using qi::attr; - using qi::eol; - using qi::eoi; - using qi::lit; - using qi::string; - using qi::as_string; - using qi::no_skip; - - start = *eol >> (from_block | apply_block | mustache_block) % +eol >> *eol >> eoi; - from_block = lit("from") >> source_info >> +eol >> assignment_list >> +eol >> "end"; - source_info = ((url | string("-")) >> attr(SourceInfo::URL)) | (mustache_like_token >> attr(SourceInfo::Token)); - url = -(+alpha >> string("://")) >> alpha >> *graph; - mustache_like_token = "{{" >> identifier >> "}}"; - quoted_string %= lexeme['"' >> +(char_ - '"') >> '"']; - xpath_assignment %= identifier >> - -(lit("default") >> '(' >> quoted_string >> ')') >> "=" >> - as_string[lexeme[+(graph | char_(" \t"))]]; - identifier %= lexeme[(alpha | char_('_')) >> *(-char_('.') >> +(alnum | char_('_')))]; - - apply_block = lit("apply") >> mustache_like_token >> "to" >> source_info >> +eol >> - assignment_list >> +eol >> "end"; - struct_block = "struct" >> identifier >> +eol >> assignment_list >> +eol >> "end"; - - mustache_block %= as_string[lit("==") >> identifier] >> eol >> - as_string[no_skip[+(!lit("==end") >> char_)]] >> "==end"; - - assignment_list = (xpath_assignment | struct_block) % +eol; - } - - private: - template - using RuleType = qi::rule; - - RuleType()> start; - RuleType from_block; - RuleType url; - RuleType mustache_like_token; - RuleType quoted_string; - RuleType xpath_assignment; - RuleType identifier; - RuleType source_info; - RuleType apply_block; - RuleType struct_block; - RuleType mustache_block; - RuleType()> assignment_list; - }; - } //unnamed namespace - - std::vector parse (const std::string& parData) { - ScrapGrammar gramm; + std::vector parse (std::string_view parData) { + ScrapGrammar gramm; auto it_start = parData.cbegin(); std::vector retval; diff --git a/src/scraplang/parse.hpp b/src/scraplang/parse.hpp index c6a6696..c87c227 100644 --- a/src/scraplang/parse.hpp +++ b/src/scraplang/parse.hpp @@ -20,10 +20,10 @@ #define idBE96C2D49C4C413888A79EAEB2B9C0FA #include "scrap_node.hpp" -#include +#include namespace duck { namespace sl { - std::vector parse ( const std::string& parData ); + std::vector parse ( std::string_view parData ); }} //namespace duck::sl #endif diff --git a/src/scraplang/parse_exports.cpp b/src/scraplang/parse_exports.cpp new file mode 100644 index 0000000..6be24ad --- /dev/null +++ b/src/scraplang/parse_exports.cpp @@ -0,0 +1,51 @@ +/* Copyright (C) 2017-2020 Michele Santullo + * + * This file is part of DuckScraper. + * + * DuckScraper is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * DuckScraper is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with DuckScraper. If not, see . + */ + +#include "scraplang/parse_exports.hpp" +#include +#include + +template class boost::spirit::qi::grammar::const_iterator, boost::spirit::qi::ascii::blank_type>; + +template class boost::spirit::qi::rule, duck::sl::ApplyBlock(), boost::spirit::qi::ascii::blank_type>; +template class boost::spirit::qi::rule, duck::sl::FromBlock(), boost::spirit::qi::ascii::blank_type>; +template class boost::spirit::qi::rule, duck::sl::MustacheBlock(), boost::spirit::qi::ascii::blank_type>; +template class boost::spirit::qi::rule, duck::sl::SourceInfo(), boost::spirit::qi::ascii::blank_type>; +template class boost::spirit::qi::rule, std::string(), boost::spirit::qi::ascii::blank_type>; +template class boost::spirit::qi::rule, std::vector(), boost::spirit::qi::ascii::blank_type>; +template class boost::spirit::qi::rule, std::vector(), boost::spirit::qi::ascii::blank_type>; +template class boost::spirit::qi::rule, duck::sl::StructBlock(), boost::spirit::qi::ascii::blank_type>; +template class boost::spirit::qi::rule, duck::sl::XPathElement(), boost::spirit::qi::ascii::blank_type>; + +template bool boost::spirit::qi::phrase_parse< + std::basic_string::const_iterator, + duck::sl::ScrapGrammar::const_iterator, boost::spirit::qi::ascii::blank_type>, + boost::spirit::ascii::blank_type, + std::vector +> ( + std::basic_string::const_iterator&, + std::basic_string::const_iterator, + duck::sl::ScrapGrammar< + std::basic_string::const_iterator, + boost::spirit::qi::ascii::blank_type + > const&, + boost::spirit::ascii::blank_type const&, + std::vector& +); + +template struct boost::spirit::use_operator; diff --git a/src/scraplang/parse_exports.hpp b/src/scraplang/parse_exports.hpp new file mode 100644 index 0000000..7e4106f --- /dev/null +++ b/src/scraplang/parse_exports.hpp @@ -0,0 +1,54 @@ +/* Copyright (C) 2017-2020 Michele Santullo + * + * This file is part of DuckScraper. + * + * DuckScraper is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * DuckScraper is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with DuckScraper. If not, see . + */ + +#pragma once + +#include "scraplang/scrap_node.hpp" +#include "scraplang/scrapgrammar.hpp" +#include +#include + +extern template class boost::spirit::qi::grammar::const_iterator, boost::spirit::qi::ascii::blank_type>; + +extern template class boost::spirit::qi::rule, duck::sl::ApplyBlock(), boost::spirit::qi::ascii::blank_type>; +extern template class boost::spirit::qi::rule, duck::sl::FromBlock(), boost::spirit::qi::ascii::blank_type>; +extern template class boost::spirit::qi::rule, duck::sl::MustacheBlock(), boost::spirit::qi::ascii::blank_type>; +extern template class boost::spirit::qi::rule, duck::sl::SourceInfo(), boost::spirit::qi::ascii::blank_type>; +extern template class boost::spirit::qi::rule, std::string(), boost::spirit::qi::ascii::blank_type>; +extern template class boost::spirit::qi::rule, std::vector(), boost::spirit::qi::ascii::blank_type>; +extern template class boost::spirit::qi::rule, std::vector(), boost::spirit::qi::ascii::blank_type>; +extern template class boost::spirit::qi::rule, duck::sl::StructBlock(), boost::spirit::qi::ascii::blank_type>; +extern template class boost::spirit::qi::rule, duck::sl::XPathElement(), boost::spirit::qi::ascii::blank_type>; + +extern template bool boost::spirit::qi::phrase_parse< + std::basic_string::const_iterator, + duck::sl::ScrapGrammar::const_iterator, boost::spirit::qi::ascii::blank_type>, + boost::spirit::ascii::blank_type, + std::vector +> ( + std::basic_string::const_iterator&, + std::basic_string::const_iterator, + duck::sl::ScrapGrammar< + std::basic_string::const_iterator, + boost::spirit::qi::ascii::blank_type + > const&, + boost::spirit::ascii::blank_type const&, + std::vector& +); + +extern template struct boost::spirit::use_operator; diff --git a/src/scraplang/scrapgrammar.hpp b/src/scraplang/scrapgrammar.hpp new file mode 100644 index 0000000..3259123 --- /dev/null +++ b/src/scraplang/scrapgrammar.hpp @@ -0,0 +1,81 @@ +/* Copyright (C) 2017-2020 Michele Santullo + * + * This file is part of DuckScraper. + * + * DuckScraper is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * DuckScraper is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with DuckScraper. If not, see . + */ + +#pragma once + +#include + +namespace duck::sl { + namespace qi = ::boost::spirit::qi; + + template + class ScrapGrammar : public qi::grammar(), Skipper> { + public: + ScrapGrammar() : ScrapGrammar::base_type(start) { + using qi::char_; + using qi::lexeme; + using qi::alpha; + using qi::alnum; + using qi::graph; + using qi::attr; + using qi::eol; + using qi::eoi; + using qi::lit; + using qi::string; + using qi::as_string; + using qi::no_skip; + + start = *eol >> (from_block | apply_block | mustache_block) % +eol >> *eol >> eoi; + from_block = lit("from") >> source_info >> +eol >> assignment_list >> +eol >> "end"; + source_info = ((url | string("-")) >> attr(SourceInfo::URL)) | (mustache_like_token >> attr(SourceInfo::Token)); + url = -(+alpha >> string("://")) >> alpha >> *graph; + mustache_like_token = "{{" >> identifier >> "}}"; + quoted_string %= lexeme['"' >> +(char_ - '"') >> '"']; + xpath_assignment %= identifier >> + -(lit("default") >> '(' >> quoted_string >> ')') >> "=" >> + as_string[lexeme[+(graph | char_(" \t"))]]; + identifier %= lexeme[(alpha | char_('_')) >> *(-char_('.') >> +(alnum | char_('_')))]; + + apply_block = lit("apply") >> mustache_like_token >> "to" >> source_info >> +eol >> + assignment_list >> +eol >> "end"; + struct_block = "struct" >> identifier >> +eol >> assignment_list >> +eol >> "end"; + + mustache_block %= as_string[lit("==") >> identifier] >> eol >> + as_string[no_skip[+(!lit("==end") >> char_)]] >> "==end"; + + assignment_list = (xpath_assignment | struct_block) % +eol; + } + + private: + template + using RuleType = qi::rule; + + RuleType()> start; + RuleType from_block; + RuleType url; + RuleType mustache_like_token; + RuleType quoted_string; + RuleType xpath_assignment; + RuleType identifier; + RuleType source_info; + RuleType apply_block; + RuleType struct_block; + RuleType mustache_block; + RuleType()> assignment_list; + }; +} //namespace duck::sl diff --git a/src/scraplang/xpath_runner.cpp b/src/scraplang/xpath_runner.cpp index 84014da..dabcf5e 100644 --- a/src/scraplang/xpath_runner.cpp +++ b/src/scraplang/xpath_runner.cpp @@ -47,9 +47,10 @@ namespace duck { namespace sl { } }; - XPathRunner::XPathRunner (HtmlPoolBaseSP html_pool) : + XPathRunner::XPathRunner (HtmlPoolBaseSP html_pool, XPathPtr& parXPath) : m_cached_results(), - m_pool(html_pool) + m_pool(html_pool), + m_xpath(parXPath) { } @@ -73,7 +74,7 @@ namespace duck { namespace sl { #endif const std::string* html = m_pool->GetByID(id); - curr_vec = xpath_query(*html, std::string(parQuery)); + curr_vec = m_xpath->run_query(*html, std::string(parQuery)); std::cout << "First time for this query, result cached now\n"; } diff --git a/src/scraplang/xpath_runner.hpp b/src/scraplang/xpath_runner.hpp index 925b1d5..ce84e53 100644 --- a/src/scraplang/xpath_runner.hpp +++ b/src/scraplang/xpath_runner.hpp @@ -20,6 +20,7 @@ #define id46DB8F4F85E2417E9AF0B1A410240D4F #include "html_pool_base.hpp" +#include "xpath_fwd.hpp" #include #include #include @@ -27,7 +28,7 @@ namespace duck { namespace sl { class XPathRunner { public: - explicit XPathRunner (HtmlPoolBaseSP html_pool); + XPathRunner (HtmlPoolBaseSP html_pool, XPathPtr& parXPath); ~XPathRunner(); const std::vector& query ( @@ -40,6 +41,7 @@ namespace duck { namespace sl { std::map> m_cached_results; HtmlPoolBaseSP m_pool; + XPathPtr m_xpath; }; }} //namespace duck::sl diff --git a/src/xpath.cpp b/src/xpath.cpp index ff935d3..75d90a4 100644 --- a/src/xpath.cpp +++ b/src/xpath.cpp @@ -47,7 +47,11 @@ namespace duck { } } //unnamed namespace - XPathBatchResults xpath_query (const std::string& parXML, const std::vector& parQueries) { + XPath::XPath() = default; + + XPath::~XPath() = default; + + auto XPath::run_query (const std::string& parXML, const std::vector& parQueries) -> BatchResults { XQilla& xqilla = m_xqilla; XercesConfiguration xconfig; AutoDelete context(xqilla.createContext(XQilla::XQUERY_UPDATE, &xconfig)); @@ -61,7 +65,7 @@ namespace duck { } context->setContextItem(ptr); - XPathBatchResults retval; + BatchResults retval; for (const auto& xpath : parQueries) { AutoDelete query(xqilla.parse(X(xpath.c_str()))); context->setContextPosition(1); @@ -75,11 +79,11 @@ namespace duck { } retval.push_back(std::move(new_lst)); } - return std::move(retval); + return retval; } - std::vector xpath_query (const std::string& parXML, const std::string& parQuery) { - auto query_res = xpath_query(parXML, std::vector{parQuery}); + std::vector XPath::run_query (const std::string& parXML, const std::string& parQuery) { + auto query_res = run_query(parXML, std::vector{parQuery}); if (query_res.empty() or query_res.front().empty()) { return std::vector(); } diff --git a/src/xpath.hpp b/src/xpath.hpp index 4d5031d..6166218 100644 --- a/src/xpath.hpp +++ b/src/xpath.hpp @@ -19,6 +19,7 @@ #ifndef id21E0A6F345D24C5D83D3B1F74EC810F7 #define id21E0A6F345D24C5D83D3B1F74EC810F7 +#include "xpath_fwd.hpp" #include #include #include @@ -26,8 +27,6 @@ #include namespace duck { - typedef std::vector>> XPathBatchResults; - class ParseError : public std::exception { public: ParseError ( int parLine, int parColumn, std::string parMessage ); @@ -36,8 +35,19 @@ namespace duck { std::vector m_msg; }; - XPathBatchResults xpath_query ( const std::string& parXML, const std::vector& parQueries ); - std::vector xpath_query ( const std::string& parXML, const std::string& parQuery ); + class XPath : public Kakoune::SafeCountable { + public: + typedef std::vector>> BatchResults; + + XPath(); + ~XPath(); + + BatchResults run_query ( const std::string& parXML, const std::vector& parQueries ); + std::vector run_query ( const std::string& parXML, const std::string& parQuery ); + + private: + XQilla m_xqilla; + }; } //namespace duck #endif diff --git a/src/xpath_fwd.hpp b/src/xpath_fwd.hpp new file mode 100644 index 0000000..732dc29 --- /dev/null +++ b/src/xpath_fwd.hpp @@ -0,0 +1,29 @@ +/* Copyright (C) 2015-2020 Michele Santullo + * + * This file is part of DuckScraper. + * + * DuckScraper is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * DuckScraper is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with DuckScraper. If not, see . + */ + +#ifndef id08062CD6C4904D94BFF57990C44B6CCB +#define id08062CD6C4904D94BFF57990C44B6CCB + +#include "kakoune/safe_ptr.hh" + +namespace duck { + class XPath; + using XPathPtr = Kakoune::SafePtr; +} //namespace duck + +#endif