duckscraper/src/scraplang/parse.cpp

146 lines
5 KiB
C++

/* Copyright (C) 2017 Michele Santullo
*
* This file is part of DuckScraper.
*
* DuckScraper is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* DuckScraper is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
*/
#include "parse.hpp"
#include "element_def.hpp"
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix_stl.hpp>
#include <boost/spirit/include/phoenix_fusion.hpp>
#include <boost/fusion/adapted/struct.hpp>
#include <boost/fusion/adapted/std_pair.hpp>
#include <utility>
#if !defined(NDEBUG)
# include <iostream>
#endif
#include <boost/variant/apply_visitor.hpp>
#include <stdexcept>
namespace qi = boost::spirit::qi;
namespace sp = boost::spirit;
// The adaptations below expose the scraplang AST structs as Boost.Fusion
// sequences. The grammar rules in this file declare these structs as their
// attribute types, and the fields are listed here in the same order in which
// the corresponding rule produces its sub-attributes, so do not reorder them
// without updating the matching rule.

// Filled by the source_info rule: the matched text followed by the
// SourceInfo::Type tag injected with attr().
BOOST_FUSION_ADAPT_STRUCT(
duck::sl::SourceInfo,
(std::string, value)
(duck::sl::SourceInfo::Type, type)
)
// Filled by the from_block rule: source_info, then assignment_list.
BOOST_FUSION_ADAPT_STRUCT(
duck::sl::FromBlock,
(duck::sl::SourceInfo, source)
(std::vector<duck::sl::StructItem>, xpaths)
)
// Filled by the struct_block rule: identifier, then assignment_list.
BOOST_FUSION_ADAPT_STRUCT(
duck::sl::StructBlock,
(std::string, name)
(std::vector<duck::sl::StructItem>, xpaths)
)
// Filled by the apply_block rule: mustache_like_token, source_info, then
// assignment_list.
BOOST_FUSION_ADAPT_STRUCT(
duck::sl::ApplyBlock,
(std::string, mustache_model)
(duck::sl::SourceInfo, source)
(std::vector<duck::sl::StructItem>, xpaths)
)
// Filled by the mustache_block rule: the block name, then the raw text found
// before the "==end" terminator.
BOOST_FUSION_ADAPT_STRUCT(
duck::sl::MustacheBlock,
(std::string, name)
(std::string, content)
)
// Filled by the xpath_assignment rule: identifier, the optional
// default("...") value, then the text on the right-hand side of '='.
BOOST_FUSION_ADAPT_STRUCT(
duck::sl::XPathElement,
(std::string, name)
(std::optional<std::string>, def_val)
(std::string, xpath)
)
namespace duck { namespace sl {
namespace {
/**
 * Boost.Spirit Qi grammar for a whole scraplang script.
 *
 * The synthesized attribute of the grammar is the list of top-level nodes
 * (std::vector<ScrapNode>). Iterator is the input iterator type and Skipper
 * is the skip parser type shared by every rule.
 *
 * NOTE(review): url and mustache_like_token are not wrapped in lexeme, so the
 * skipper is applied between their elements - confirm this is intended.
 */
template <typename Iterator, typename Skipper>
class ScrapGrammar : public qi::grammar<Iterator, std::vector<ScrapNode>(), Skipper> {
public:
    ScrapGrammar() : ScrapGrammar::base_type(start) {
        // --- Whole script -------------------------------------------------
        // Optional leading newlines, one or more top-level blocks separated
        // by newlines, optional trailing newlines, then end of input.
        start = *qi::eol
            >> (from_block | apply_block | mustache_block) % +qi::eol
            >> *qi::eol
            >> qi::eoi;

        // --- Top-level blocks ---------------------------------------------
        // from <source> NEWLINE <assignments> NEWLINE end
        from_block = qi::lit("from") >> source_info >> +qi::eol
            >> assignment_list >> +qi::eol >> "end";

        // apply {{model}} to <source> NEWLINE <assignments> NEWLINE end
        apply_block = qi::lit("apply") >> mustache_like_token >> "to" >> source_info >> +qi::eol
            >> assignment_list >> +qi::eol >> "end";

        // ==name NEWLINE <raw text> ==end
        // The body is read with skipping disabled, one character at a time,
        // until the "==end" marker is seen.
        mustache_block %= qi::as_string[qi::lit("==") >> identifier] >> qi::eol
            >> qi::as_string[qi::no_skip[+(!qi::lit("==end") >> qi::char_)]] >> "==end";

        // --- Assignments and nested structs -------------------------------
        // struct <name> NEWLINE <assignments> NEWLINE end
        struct_block = "struct" >> identifier >> +qi::eol >> assignment_list >> +qi::eol >> "end";

        // Newline-separated mix of xpath assignments and nested structs.
        assignment_list = (xpath_assignment | struct_block) % +qi::eol;

        // name [default("value")] = <rest of the line>
        xpath_assignment %= identifier
            >> -(qi::lit("default") >> '(' >> quoted_string >> ')')
            >> "="
            >> qi::as_string[qi::lexeme[+(qi::graph | qi::char_(" \t"))]];

        // --- Sources ------------------------------------------------------
        // Either a url or a {{token}}; attr() appends the matching type tag.
        source_info = (url >> qi::attr(SourceInfo::URL))
            | (mustache_like_token >> qi::attr(SourceInfo::Token));

        // Optional "<letters>://" prefix, then a letter followed by any
        // number of graph characters.
        url = -(+qi::alpha >> qi::string("://")) >> qi::alpha >> *qi::graph;

        // {{identifier}}; only the identifier contributes to the attribute.
        mustache_like_token = "{{" >> identifier >> "}}";

        // --- Lexical elements ---------------------------------------------
        // Starts with a letter or '_'; continues with groups of
        // alphanumerics/'_' each optionally preceded by a single '.'.
        identifier %= qi::lexeme[(qi::alpha | qi::char_('_')) >> *(-qi::char_('.') >> +(qi::alnum | qi::char_('_')))];

        // Non-empty run of characters between double quotes; no escape
        // sequences are recognised.
        quoted_string %= qi::lexeme['"' >> +(qi::char_ - '"') >> '"'];
    }

private:
    // Shorthand for a rule over this grammar's iterator and skipper with the
    // given attribute signature.
    template <typename Signature>
    using RuleType = qi::rule<Iterator, Signature, Skipper>;

    RuleType<std::vector<ScrapNode>()> start;
    RuleType<FromBlock()> from_block;
    RuleType<std::string()> url;
    RuleType<std::string()> mustache_like_token;
    RuleType<std::string()> quoted_string;
    RuleType<XPathElement()> xpath_assignment;
    RuleType<std::string()> identifier;
    RuleType<SourceInfo()> source_info;
    RuleType<ApplyBlock()> apply_block;
    RuleType<StructBlock()> struct_block;
    RuleType<MustacheBlock()> mustache_block;
    RuleType<std::vector<StructItem>()> assignment_list;
};
} //unnamed namespace
/**
 * Parses a complete scraplang script.
 *
 * The text is run through ScrapGrammar with ascii::blank as the skipper, so
 * newlines are seen by the grammar rather than skipped.
 *
 * @param parData  full text of the script to parse.
 * @return the top-level nodes collected by the grammar.
 * @throws std::runtime_error if the grammar does not match or if it stops
 *         before consuming all of parData.
 */
std::vector<ScrapNode> parse (const std::string& parData) {
    ScrapGrammar<std::string::const_iterator, boost::spirit::qi::ascii::blank_type> gramm;
    auto it_start = parData.cbegin();
    std::vector<ScrapNode> retval;
    // phrase_parse advances it_start to the first character it did not consume.
    const bool ok = qi::phrase_parse(it_start, parData.cend(), gramm, sp::ascii::blank, retval);

#if !defined(NDEBUG)
    // Parser diagnostics. <iostream> is only included when NDEBUG is not
    // defined, so this trace must be compiled out under the same condition;
    // it also keeps release builds from writing to stdout.
    std::cout << "parse ok: " << std::boolalpha << ok << '\n';
    std::cout << "end == it: " << std::boolalpha << (parData.cend() == it_start) << '\n';
    std::cout << "begin == it: " << std::boolalpha << (parData.cbegin() == it_start) << '\n';
    std::cout << "parse distance: " << std::distance(parData.cbegin(), it_start) << '\n';
    std::cout << "all distance: " << std::distance(parData.cbegin(), parData.cend()) << " (size: " << parData.size() << ")\n";
#endif

    // A successful match that leaves input behind is still an error.
    if (parData.cend() != it_start or not ok) {
        throw std::runtime_error("Error parsing input script");
    }
    return retval;
}
}} //namespace duck::sl