146 lines
5 KiB
C++
146 lines
5 KiB
C++
/* Copyright (C) 2017 Michele Santullo
 *
 * This file is part of DuckScraper.
 *
 * DuckScraper is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * DuckScraper is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with DuckScraper.  If not, see <http://www.gnu.org/licenses/>.
 */
|
|
|
|
#include "parse.hpp"
|
|
#include "element_def.hpp"
|
|
#include <boost/spirit/include/qi.hpp>
|
|
#include <boost/spirit/include/phoenix_stl.hpp>
|
|
#include <boost/spirit/include/phoenix_fusion.hpp>
|
|
#include <boost/fusion/adapted/struct.hpp>
|
|
#include <boost/fusion/adapted/std_pair.hpp>
|
|
#include <utility>
|
|
#if !defined(NDEBUG)
|
|
# include <iostream>
|
|
#endif
|
|
#include <boost/variant/apply_visitor.hpp>
|
|
#include <stdexcept>
|
|
|
|
namespace qi = boost::spirit::qi;
|
|
namespace sp = boost::spirit;
|
|
|
|
BOOST_FUSION_ADAPT_STRUCT(
|
|
duck::sl::SourceInfo,
|
|
(std::string, value)
|
|
(duck::sl::SourceInfo::Type, type)
|
|
)
|
|
BOOST_FUSION_ADAPT_STRUCT(
|
|
duck::sl::FromBlock,
|
|
(duck::sl::SourceInfo, source)
|
|
(std::vector<duck::sl::StructItem>, xpaths)
|
|
)
|
|
BOOST_FUSION_ADAPT_STRUCT(
|
|
duck::sl::StructBlock,
|
|
(std::string, name)
|
|
(std::vector<duck::sl::StructItem>, xpaths)
|
|
)
|
|
BOOST_FUSION_ADAPT_STRUCT(
|
|
duck::sl::ApplyBlock,
|
|
(std::string, mustache_model)
|
|
(duck::sl::SourceInfo, source)
|
|
(std::vector<duck::sl::StructItem>, xpaths)
|
|
)
|
|
BOOST_FUSION_ADAPT_STRUCT(
|
|
duck::sl::MustacheBlock,
|
|
(std::string, name)
|
|
(std::string, content)
|
|
)
|
|
BOOST_FUSION_ADAPT_STRUCT(
|
|
duck::sl::XPathElement,
|
|
(std::string, name)
|
|
(std::optional<std::string>, def_val)
|
|
(std::string, xpath)
|
|
)
|
|
|
|
namespace duck { namespace sl {
|
|
namespace {
|
|
template <typename I, typename Skipper>
|
|
class ScrapGrammar : public qi::grammar<I, std::vector<ScrapNode>(), Skipper> {
|
|
public:
|
|
ScrapGrammar() : ScrapGrammar::base_type(start) {
|
|
using qi::char_;
|
|
using qi::lexeme;
|
|
using qi::alpha;
|
|
using qi::alnum;
|
|
using qi::graph;
|
|
using qi::attr;
|
|
using qi::eol;
|
|
using qi::eoi;
|
|
using qi::lit;
|
|
using qi::string;
|
|
using qi::as_string;
|
|
using qi::no_skip;
|
|
|
|
start = *eol >> (from_block | apply_block | mustache_block) % +eol >> *eol >> eoi;
|
|
from_block = lit("from") >> source_info >> +eol >> assignment_list >> +eol >> "end";
|
|
source_info = (url >> attr(SourceInfo::URL)) | (mustache_like_token >> attr(SourceInfo::Token));
|
|
url = -(+alpha >> string("://")) >> alpha >> *graph;
|
|
mustache_like_token = "{{" >> identifier >> "}}";
|
|
quoted_string %= lexeme['"' >> +(char_ - '"') >> '"'];
|
|
xpath_assignment %= identifier >>
|
|
-(lit("default") >> '(' >> quoted_string >> ')') >> "=" >>
|
|
as_string[lexeme[+(graph | char_(" \t"))]];
|
|
identifier %= lexeme[(alpha | char_('_')) >> *(-char_('.') >> +(alnum | char_('_')))];
|
|
|
|
apply_block = lit("apply") >> mustache_like_token >> "to" >> source_info >> +eol >>
|
|
assignment_list >> +eol >> "end";
|
|
struct_block = "struct" >> identifier >> +eol >> assignment_list >> +eol >> "end";
|
|
|
|
mustache_block %= as_string[lit("==") >> identifier] >> eol >>
|
|
as_string[no_skip[+(!lit("==end") >> char_)]] >> "==end";
|
|
|
|
assignment_list = (xpath_assignment | struct_block) % +eol;
|
|
}
|
|
|
|
private:
|
|
template <typename F>
|
|
using RuleType = qi::rule<I, F, Skipper>;
|
|
|
|
RuleType<std::vector<ScrapNode>()> start;
|
|
RuleType<FromBlock()> from_block;
|
|
RuleType<std::string()> url;
|
|
RuleType<std::string()> mustache_like_token;
|
|
RuleType<std::string()> quoted_string;
|
|
RuleType<XPathElement()> xpath_assignment;
|
|
RuleType<std::string()> identifier;
|
|
RuleType<SourceInfo()> source_info;
|
|
RuleType<ApplyBlock()> apply_block;
|
|
RuleType<StructBlock()> struct_block;
|
|
RuleType<MustacheBlock()> mustache_block;
|
|
RuleType<std::vector<StructItem>()> assignment_list;
|
|
};
|
|
} //unnamed namespace
|
|
|
|
std::vector<ScrapNode> parse (const std::string& parData) {
|
|
ScrapGrammar<std::string::const_iterator, boost::spirit::qi::ascii::blank_type> gramm;
|
|
auto it_start = parData.cbegin();
|
|
|
|
std::vector<ScrapNode> retval;
|
|
const bool ok = qi::phrase_parse(it_start, parData.cend(), gramm, sp::ascii::blank, retval);
|
|
|
|
std::cout << "parse ok: " << std::boolalpha << ok << '\n';
|
|
std::cout << "end == it: " << std::boolalpha << (parData.cend() == it_start) << '\n';
|
|
std::cout << "begin == it: " << std::boolalpha << (parData.cbegin() == it_start) << '\n';
|
|
std::cout << "parse distance: " << std::distance(parData.cbegin(), it_start) << '\n';
|
|
std::cout << "all distance: " << std::distance(parData.cbegin(), parData.cend()) << " (size: " << parData.size() << ")\n";
|
|
|
|
if (parData.cend() != it_start or not ok) {
|
|
throw std::runtime_error("Error parsing input script");
|
|
}
|
|
return retval;
|
|
}
|
|
}} //namespace duck::sl
|