WiP reworking the AST interpreter.
This commit is contained in:
parent
29f8fe299e
commit
fcb25ed456
9 changed files with 92 additions and 437 deletions
|
@ -30,8 +30,6 @@ add_executable(${PROJECT_NAME}
|
|||
src/commandline.cpp
|
||||
src/scraplang/parse.cpp
|
||||
src/scraplang/apply.cpp
|
||||
src/scraplang/xpath_manager.cpp
|
||||
src/scraplang/variables.cpp
|
||||
src/xpath.cpp
|
||||
)
|
||||
|
||||
|
|
|
@ -16,191 +16,41 @@
|
|||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
//#define APPLY_VERBOSE
|
||||
#define APPLY_VERBOSE
|
||||
|
||||
#include "apply.hpp"
|
||||
#include "mstch/mstch.hpp"
|
||||
#include "variables.hpp"
|
||||
#include "html_pool_base.hpp"
|
||||
#include "xpath_manager.hpp"
|
||||
#include <boost/variant/apply_visitor.hpp>
|
||||
#include <map>
|
||||
#include "scrap_node.hpp"
|
||||
#if defined(APPLY_VERBOSE)
|
||||
# include <iostream>
|
||||
# include "stream_scrap_node.hpp"
|
||||
#endif
|
||||
#include <map>
|
||||
#include <boost/variant/apply_visitor.hpp>
|
||||
|
||||
namespace duck { namespace sl {
|
||||
#if defined(APPLY_VERBOSE)
|
||||
std::ostream& operator<< (std::ostream& stream, XPathElement xpath) {
|
||||
stream << "xpath \"" << xpath.name << "\": \"" << xpath.xpath << "\" ";
|
||||
if (xpath.def_val)
|
||||
stream << "default: \"" << *xpath.def_val << '"';
|
||||
else
|
||||
stream << "no default";
|
||||
return stream;
|
||||
}
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
typedef std::map<std::string, std::string> MustacheMap;
|
||||
typedef Kakoune::SafePtr<Variables> VariablesSP;
|
||||
typedef Kakoune::SafePtr<XPathManager> XPathManagerSP;
|
||||
typedef Kakoune::SafePtr<const XPathManager> XPathManagerCSP;
|
||||
|
||||
struct ApplyBlockContext {
|
||||
mstch::map context;
|
||||
SourceInfo target;
|
||||
std::string name;
|
||||
};
|
||||
|
||||
class StructVisitor : public boost::static_visitor<> {
|
||||
struct XPathEntry {
|
||||
std::string value;
|
||||
class DictBuilder : public boost::static_visitor<> {
|
||||
public:
|
||||
StructVisitor (VariablesSP vars, XPathManagerCSP xpath_man) :
|
||||
m_variables(vars),
|
||||
m_xpath_man(xpath_man)
|
||||
{
|
||||
assert(m_variables);
|
||||
assert(m_xpath_man);
|
||||
}
|
||||
|
||||
void operator() (const XPathElement& v) {
|
||||
#if defined(APPLY_VERBOSE)
|
||||
std::cout << '\t' << v << '\n';
|
||||
#endif
|
||||
m_variables->add_xpath(v);
|
||||
}
|
||||
|
||||
void operator() (const StructBlock& v) {
|
||||
#if defined(APPLY_VERBOSE)
|
||||
std::cout << "\tstruct " << v.name << '\n';
|
||||
for (auto& xpath : v.xpaths) {
|
||||
std::cout << "\t\t" << xpath << '\n';
|
||||
}
|
||||
#endif
|
||||
|
||||
//for (const auto& xpath : v.xpaths) {
|
||||
// submap[xpath.name] = m_xpath_man->extract_one(xpath);
|
||||
//}
|
||||
assert(false);
|
||||
}
|
||||
|
||||
mstch::map&& steal_context() { return std::move(m_context); }
|
||||
DictBuilder (
|
||||
|
||||
private:
|
||||
VariablesSP m_variables;
|
||||
XPathManagerCSP m_xpath_man;
|
||||
mstch::map m_context;
|
||||
XPathSink m_xpaths;
|
||||
std::map<std::string, std::string> m_vars;
|
||||
};
|
||||
|
||||
// Visitor applied to ScrapNode variants. While walking the tree it records
// mustache templates by name, registers xpath entries into its Variables
// instance and collects one ApplyBlockContext per apply block.
class NodeVisitor : public boost::static_visitor<> {
public:
	// Builds the visitor around the given html pool. The pool must be valid.
	explicit NodeVisitor (HtmlPoolBaseSP html_pool) :
		m_xpath_man(html_pool),
		m_variables(XPathManagerSP(&m_xpath_man))
	{
		assert(html_pool);
	}

	// A list of nodes: visit every child in order with this same visitor.
	void operator() (const std::vector<ScrapNode>& nodes) {
		for (auto& child : nodes) {
			boost::apply_visitor(*this, child);
		}
	}

	// A "from" block: register each of its xpath entries as a variable.
	void operator() (const FromBlock& from_block) {
#if defined(APPLY_VERBOSE)
		std::cout << "--- FromBlock\n"
			<< "\tsource: " << from_block.source.value
			<< ", type " << static_cast<int>(from_block.source.type) << '\n';
#endif

		for (auto& entry : from_block.xpaths) {
#if defined(APPLY_VERBOSE)
			std::cout << '\t' << entry << '\n';
#endif
			m_variables.add_xpath(entry);
			// TODO: and what am I supposed to do with the block's source??
			assert(false);
		}
	}

	// An "apply" block: run a StructVisitor over its entries, then store the
	// resulting context together with the model name and the target source.
	void operator() (const ApplyBlock& apply_block) {
#if defined(APPLY_VERBOSE)
		std::cout << "--- ApplyBlock\n"
			<< "\tmustache model: " << apply_block.mustache_model << '\n'
			<< "\tsource: " << apply_block.source.value
			<< ", type " << static_cast<int>(apply_block.source.type) << '\n';
#endif

		StructVisitor entry_visitor(
			VariablesSP(&m_variables),
			XPathManagerCSP(&m_xpath_man)
		);
		for (auto& entry : apply_block.xpaths) {
			boost::apply_visitor(entry_visitor, entry);
		}

		ApplyBlockContext collected;
		collected.name = apply_block.mustache_model;
		collected.target = apply_block.source;
		collected.context = entry_visitor.steal_context();
		m_apply_blocks.push_back(std::move(collected));
	}

	// A mustache block: remember its content under its name, replacing any
	// previous template stored with the same name.
	void operator() (const MustacheBlock& mustache_block) {
#if defined(APPLY_VERBOSE)
		std::cout << "--- MustacheBlock\n"
			<< "\tname: " << mustache_block.name << '\n'
			<< "\tcontent: " << mustache_block.content << '\n';
#endif

		m_mustaches[mustache_block.name] = mustache_block.content;
	}

	// Read-only access to what has been collected so far.
	const MustacheMap& mustaches() const {
		return m_mustaches;
	}
	const Variables& variables() const {
		return m_variables;
	}
	const std::vector<ApplyBlockContext>& apply_blocks() const {
		return m_apply_blocks;
	}

private:
	MustacheMap m_mustaches;
	// Declared before m_variables on purpose: m_variables is constructed
	// from a pointer to m_xpath_man.
	XPathManager m_xpath_man;
	Variables m_variables;
	std::vector<ApplyBlockContext> m_apply_blocks;
};
|
||||
|
||||
std::string resolve_source_ifn (const Variables& variables, const SourceInfo& src) {
|
||||
switch (src.type) {
|
||||
case SourceInfo::URL:
|
||||
return src.value;
|
||||
case SourceInfo::Token:
|
||||
return variables.resolve_string(src.value);
|
||||
}
|
||||
assert(false);
|
||||
return src.value;
|
||||
}
|
||||
} //unnamed namespace
|
||||
|
||||
std::vector<std::string> apply (const ScrapNode& node, HtmlPoolBaseSP html_pool) {
|
||||
assert(html_pool);
|
||||
std::vector<std::string> apply (
|
||||
const ScrapNode& node,
|
||||
HtmlPoolBaseSP html_pool
|
||||
) {
|
||||
DictBuilder dict_builder(html_pool);
|
||||
|
||||
NodeVisitor visitor(html_pool);
|
||||
boost::apply_visitor(visitor, node);
|
||||
|
||||
const Variables& vars = visitor.variables();
|
||||
std::vector<std::string> retval;
|
||||
for (auto& apply_block : visitor.apply_blocks()) {
|
||||
std::cout << "looping on apply block \"" << apply_block.name << "\"\n";
|
||||
std::string src_url = resolve_source_ifn(vars, apply_block.target);
|
||||
|
||||
std::string mustache_model_name = vars.resolve_string(apply_block.name, src_url);
|
||||
const std::string& mustache_model = visitor.mustaches().at(mustache_model_name);
|
||||
//std::vector<SourceInfo> src_urls = vars.resolve_array(apply_block.target);
|
||||
|
||||
//for (const auto& src_url : src_urls) {
|
||||
// const auto html_id = html_pool->GetOrAdd(vars.resolve_string(src_url));
|
||||
// const auto* const html = html_pool->GetByID(html_id);
|
||||
//}
|
||||
}
|
||||
|
||||
return retval;
|
||||
boost::apply_visitor(dict_builder, node);
|
||||
}
|
||||
}} //namespace duck::sl
|
||||
|
|
|
@ -37,10 +37,6 @@ namespace duck { namespace sl {
|
|||
// Describes where a value comes from: either a literal URL or a token that
// still has to be resolved.
struct SourceInfo {
	enum Type { URL, Token };

	// Default construction now yields an empty URL source. Previously `type`
	// was left indeterminate by `SourceInfo s;`, so reading it (for example
	// in a switch) was undefined behaviour. URL is the enum's zero value,
	// which is what value-initialisation (`SourceInfo()`) already produced.
	SourceInfo() = default;

	// Building from a string marks the source as a Token.
	SourceInfo (std::string&& val) : value(std::move(val)), type(Token) {}
	SourceInfo (const std::string& val) : value(val), type(Token) {}

	std::string value;
	Type type = URL;
};
|
||||
|
|
72
src/scraplang/stream_scrap_node.hpp
Normal file
72
src/scraplang/stream_scrap_node.hpp
Normal file
|
@ -0,0 +1,72 @@
|
|||
/* Copyright (C) 2015 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef idDB3415BA82504C00A2DAF0274BA9AC92
|
||||
#define idDB3415BA82504C00A2DAF0274BA9AC92
|
||||
|
||||
#include "scrap_node.hpp"
|
||||
#include <iostream>
|
||||
|
||||
namespace duck { namespace sl {
|
||||
std::ostream& operator<< (std::ostream& stream, XPathElement xpath) {
|
||||
stream << "XPathElement \"" << xpath.name << "\": \"" <<
|
||||
xpath.xpath << "\" ";
|
||||
|
||||
if (xpath.def_val)
|
||||
stream << "default: \"" << *xpath.def_val << '"';
|
||||
else
|
||||
stream << "no default";
|
||||
return stream;
|
||||
}
|
||||
|
||||
std::ostream& operator<< (std::ostream& stream, const SourceInfo& src) {
|
||||
stream << "SourceInfo with ";
|
||||
switch (src.type) {
|
||||
case SourceInfo::URL:
|
||||
stream << "URL \"" << src.value << "\"";
|
||||
break;
|
||||
case SourceInfo::Token:
|
||||
stream << "value \"" << src.value << "\"";
|
||||
break;
|
||||
default:
|
||||
stream << "invalid content";
|
||||
}
|
||||
return stream;
|
||||
}
|
||||
|
||||
// Writes a summary of a FromBlock (its source and how many xpath entries it
// holds) to `stream` and returns `stream`.
// Marked inline because this definition lives in a header; a non-inline
// definition would be duplicated in every including translation unit.
inline std::ostream& operator<< (std::ostream& stream, const FromBlock& blk) {
	stream << "FromBlock: " << blk.source << ", " <<
		blk.xpaths.size() << " xpath entries";
	return stream;
}
|
||||
|
||||
// Writes a summary of a StructBlock (its name and how many xpath entries it
// holds) to `stream` and returns `stream`.
// Marked inline because this definition lives in a header; a non-inline
// definition would be duplicated in every including translation unit.
inline std::ostream& operator<< (std::ostream& stream, const StructBlock& strct) {
	stream << "StructBlock \"" << strct.name << "\" with " <<
		strct.xpaths.size() << " xpath entries";
	return stream;
}
|
||||
|
||||
// Writes a summary of an ApplyBlock (mustache model name, source and element
// count) to `stream` and returns `stream`.
// Marked inline because this definition lives in a header; a non-inline
// definition would be duplicated in every including translation unit.
inline std::ostream& operator<< (std::ostream& stream, const ApplyBlock& app) {
	stream << "ApplyBlock for \"" << app.mustache_model << "\": " <<
		app.source << ", " <<
		app.xpaths.size() << " elements";
	return stream;
}
|
||||
}} //namespace duck::sl
|
||||
|
||||
#endif
|
|
@ -1,96 +0,0 @@
|
|||
/* Copyright (C) 2015 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "variables.hpp"
|
||||
#include "scrap_node.hpp"
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <stdexcept>
|
||||
#include <cassert>
|
||||
|
||||
namespace duck { namespace sl {
|
||||
namespace {
|
||||
} //unnamed namespace
|
||||
|
||||
// Stores the xpath manager later used by resolve_string() to evaluate xpath
// entries.
Variables::Variables (XPathManagerSP parXPathMan) : m_xpath_man(parXPathMan) {}
|
||||
|
||||
Variables::~Variables() = default;
|
||||
|
||||
std::string Variables::resolve_string (
|
||||
const std::string& parName,
|
||||
const std::string& parUrl
|
||||
) const {
|
||||
std::cout << "resolving string \"" << parName << "\"...\n";
|
||||
|
||||
assert(not parName.empty());
|
||||
if (parName.empty())
|
||||
throw std::runtime_error("Unable to resolve invalid empty source info");
|
||||
|
||||
const auto& ret_variant = m_vars.at(parName);
|
||||
switch (ret_variant.which()) {
|
||||
case TypeXPathEntry:
|
||||
{
|
||||
const auto& entry = boost::get<XPathEntry>(ret_variant);
|
||||
assert(entry.url < m_urls.size());
|
||||
return m_xpath_man->extract_one(
|
||||
entry.xpath,
|
||||
parUrl
|
||||
entry.def_val,
|
||||
);
|
||||
}
|
||||
case TypeStringEntry:
|
||||
return boost::get<std::string>(ret_variant);
|
||||
default:
|
||||
assert(false);
|
||||
}
|
||||
|
||||
throw std::runtime_error("Invalid variant type in resolve_string");
|
||||
}
|
||||
|
||||
void Variables::add_xpath (const XPathElement& parVal) {
|
||||
m_vars[parVal.name] = XPathEntry{
|
||||
parVal.def_val,
|
||||
parVal.xpath
|
||||
};
|
||||
}
|
||||
|
||||
void Variables::add_struct (const StructBlock& parVal) {
|
||||
m_vars[parVal.name] = StructureEntry {
|
||||
boost::copy_range<std::vector<VariableEntry>>(parVal.xpaths)
|
||||
};
|
||||
}
|
||||
|
||||
// Registers (or overwrites) the variable `parName` with a plain string value,
// logging the assignment to standard output first. `parVal` is moved from.
void Variables::add_string (const std::string& parName, std::string&& parVal) {
	std::cout << "Setting " << parName << " = " << parVal << " as string\n";

	auto& slot = m_vars[parName];
	slot = std::move(parVal);
}
|
||||
|
||||
std::size_t Variables::url_index_add_ifn (std::string&& parURL) {
|
||||
auto it_found = std::find(m_urls.begin(), m_urls.end(), parURL);
|
||||
if (m_urls.end() != it_found) {
|
||||
return static_cast<std::size_t>(std::distance(m_urls.begin(), it_found));
|
||||
}
|
||||
else {
|
||||
m_urls.push_back(std::move(parURL));
|
||||
return m_urls.size() - 1;
|
||||
}
|
||||
}
|
||||
}} //namespace duck::sl
|
|
@ -1,81 +0,0 @@
|
|||
/* Copyright (C) 2015 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef id700FA165E5194907867EB4C02C4C1385
|
||||
#define id700FA165E5194907867EB4C02C4C1385
|
||||
|
||||
#include "mstch/mstch.hpp"
|
||||
#include "kakoune/safe_ptr.hh"
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <boost/variant.hpp>
|
||||
|
||||
namespace duck { namespace sl {
|
||||
struct StructBlock;
|
||||
struct XPathElement;
|
||||
struct XPathManager;
|
||||
|
||||
typedef Kakoune::SafePtr<XPathManager> XPathManagerSP;
|
||||
|
||||
class Variables : public Kakoune::SafeCountable {
|
||||
public:
|
||||
explicit Variables (XPathManagerSP parXPathMan);
|
||||
~Variables();
|
||||
|
||||
std::string resolve_string (
|
||||
const std::string& parName,
|
||||
const std::string& parUrl
|
||||
) const;
|
||||
|
||||
void add_xpath (const XPathElement& parVal);
|
||||
void add_struct (const StructBlock& parVal);
|
||||
void add_string (const std::string& parName, std::string&& parVal);
|
||||
|
||||
private:
|
||||
struct XPathEntry {
|
||||
std::optional<std::string> def_val;
|
||||
std::string xpath;
|
||||
};
|
||||
|
||||
struct StructureEntry;
|
||||
|
||||
typedef boost::variant<
|
||||
XPathEntry,
|
||||
std::string,
|
||||
boost::recursive_wrapper<StructureEntry>
|
||||
> VariableEntry;
|
||||
|
||||
enum TypeVariableEntries {
|
||||
TypeXPathEntry,
|
||||
TypeStringEntry,
|
||||
TypeStructureEntry
|
||||
};
|
||||
|
||||
struct StructureEntry {
|
||||
std::vector<VariableEntry> struct_entries;
|
||||
};
|
||||
|
||||
std::size_t url_index_add_ifn (std::string&& parURL);
|
||||
|
||||
std::map<std::string, VariableEntry> m_vars;
|
||||
XPathManagerSP m_xpath_man;
|
||||
};
|
||||
}} //namespace duck::sl
|
||||
|
||||
#endif
|
|
@ -1,38 +0,0 @@
|
|||
/* Copyright (C) 2015 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "xpath_manager.hpp"
|
||||
#include "htmlretrieve.hpp"
|
||||
#include "xpath.hpp"
|
||||
|
||||
namespace duck { namespace sl {
|
||||
// Stores the html pool that extract_one() reads documents from.
XPathManager::XPathManager (HtmlPoolBaseSP parHtmlPool) : m_html_pool(parHtmlPool) {}
|
||||
|
||||
std::string XPathManager::extract_one (
|
||||
const std::string& parXPath,
|
||||
const std::string& parURL,
|
||||
const std::optional<std::string>& parDefault
|
||||
) const {
|
||||
std::string html = *m_html_pool->GetByName(parURL);
|
||||
html = duck::clean_html(std::move(html));
|
||||
return duck::xpath_query(html, parXPath);
|
||||
}
|
||||
}} //namespace duck::sl
|
|
@ -1,46 +0,0 @@
|
|||
/* Copyright (C) 2015 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef id69826186710D4048BF6810202EDF310D
|
||||
#define id69826186710D4048BF6810202EDF310D
|
||||
|
||||
#include "scraplang/html_pool_base.hpp"
|
||||
#include "scraplang/scrap_node.hpp"
|
||||
#include "kakoune/safe_ptr.hh"
|
||||
#include <string>
|
||||
#include <optional>
|
||||
|
||||
namespace duck { namespace sl {
|
||||
class XPathManager : public Kakoune::SafeCountable {
|
||||
public:
|
||||
explicit XPathManager (HtmlPoolBaseSP parHtmlPool);
|
||||
~XPathManager() = default;
|
||||
|
||||
std::string extract_one (
|
||||
const std::string& parXPath,
|
||||
const std::string& parURL,
|
||||
const std::optional<std::string>& parDefault
|
||||
) const;
|
||||
HtmlPoolBaseSP html_pool() const;
|
||||
|
||||
private:
|
||||
HtmlPoolBaseSP m_html_pool;
|
||||
};
|
||||
}} //namespace duck::sl
|
||||
|
||||
#endif
|
|
@ -77,11 +77,11 @@ namespace duck {
|
|||
}
|
||||
|
||||
std::string xpath_query (const std::string& parXML, const std::string& parQuery) {
|
||||
auto retval = xpath_query(parXML, {parQuery});
|
||||
if (retval.empty())
|
||||
auto retval = xpath_query(parXML, std::vector<std::string>{parQuery});
|
||||
if (retval.empty() or retval.front().empty())
|
||||
return std::string();
|
||||
else
|
||||
return retval.front().second;
|
||||
return retval.front().front().second;
|
||||
}
|
||||
|
||||
ParseError::ParseError (int parLine, int parColumn, std::string parMessage) {
|
||||
|
|
Loading…
Add table
Reference in a new issue