Writing the code to go from tree to mustache dictionary.

This commit is contained in:
King_DuckZ 2018-01-17 23:24:35 +00:00
parent 41bb315b02
commit 6dffe9b848
5 changed files with 178 additions and 16 deletions

View file

@ -31,6 +31,7 @@ add_executable(${PROJECT_NAME}
src/scraplang/parse.cpp
src/scraplang/apply.cpp
src/xpath.cpp
src/scraplang/xpath_runner.cpp
)
target_include_directories(${PROJECT_NAME} SYSTEM

View file

@ -22,6 +22,7 @@
#include "mstch/mstch.hpp"
#include "html_pool_base.hpp"
#include "scrap_node.hpp"
#include "xpath_runner.hpp"
#if defined(APPLY_VERBOSE)
# include "stream_scrap_node.hpp"
#endif
@ -29,6 +30,9 @@
#include <boost/variant/apply_visitor.hpp>
#include <string_view>
namespace std {
} //namespace std
namespace duck { namespace sl {
#if defined(APPLY_VERBOSE)
#endif
@ -110,7 +114,7 @@ namespace duck { namespace sl {
class DictBuilder : public boost::static_visitor<> {
public:
explicit DictBuilder (HtmlPoolBaseSP parHtmlPool) :
explicit DictBuilder() :
m_current_mustache_name(nullptr),
m_current_mustache(nullptr)
{
@ -168,13 +172,68 @@ namespace duck { namespace sl {
const std::string* m_current_mustache_name;
MustacheEntry* m_current_mustache;
};
/// Builds the mustache dictionary for a single entry node.
///
/// Work in progress: for now this only sanity-checks the xpath pointers
/// stored in the node and hands back an empty dictionary.
///
/// @param parNode   node whose `xpaths` list is walked
/// @param parSrc    not consumed yet (presumably the source address the
///                  queries should run against - TODO confirm)
/// @param parRunner not consumed yet (presumably the runner that will
///                  execute the queries - TODO confirm)
/// @return the dictionary built so far, which is always empty at this stage
mstch::map to_mustache_dict_recursive (const EntryNode& parNode, std::string_view parSrc, XPathRunner& parRunner) {
	//neither argument is used yet, keep the compiler quiet about it
	static_cast<void>(parSrc);
	static_cast<void>(parRunner);

	mstch::map retval;
	for (const XPathElement* xpath : parNode.xpaths) {
		assert(xpath);
	}
	//a non-void function must return: flowing off the end is undefined behaviour
	return retval;
}
/// Flattens every entry of parNodes into one mustache dictionary.
///
/// For each entry, every xpath is run through parRunner against the
/// entry's URL and stored under the xpath's name:
///  - exactly one result: stored as a single value
///  - several results: stored as an mstch::array of values
///  - no results: the xpath's default value if it has one, otherwise an
///    empty string
/// Every struct of the entry is then converted with struct_to_mustache()
/// and stored under the struct's name.
///
/// All entries write into the same dictionary, so an entry processed later
/// overwrites equally named keys written by an earlier one.
///
/// @param parNodes  entries to convert; each one is asserted to have a
///                  non-null source and an empty name
/// @param parRunner object used to execute the xpath queries
/// @return the populated dictionary
mstch::map to_mustache_map (const EntryNodeList& parNodes, XPathRunner& parRunner) {
	mstch::map retval;
	for (auto& entry : parNodes) {
		//check the pointer before the log line below dereferences it
		assert(entry.first);
		assert(entry.second.name.empty());
		std::cout << "Analyzing entry " << *entry.first << '\n';

		std::string_view src_url;
		if (SourceInfo::URL == entry.first->type) {
			src_url = entry.first->value;
		}
		else {
			assert(false); //not implemented
		}

		//mstch::map curr_entry_map = to_mustache_dict_recursive(
		for (const XPathElement* xpath : entry.second.xpaths) {
			assert(xpath);
			std::cout << "Running query for \"" << xpath->name << "\"\n";

			//query() returns a reference, bind to it rather than copying the
			//whole vector; no other query runs before the last use below
			const auto& results = parRunner.query(src_url, xpath->xpath);
			if (results.size() == 1) {
				retval[xpath->name] = results.front();
			}
			else if (results.size() > 1) {
				mstch::array values;
				values.reserve(results.size());
				std::copy(results.begin(), results.end(), std::back_inserter(values));
				retval[xpath->name] = std::move(values);
			}
			else if (xpath->def_val) {
				retval[xpath->name] = *xpath->def_val;
			}
			else {
				retval[xpath->name] = std::string();
			}
		}

		for (auto& curr_struct : entry.second.structs) {
			assert(not curr_struct.name.empty());
			retval[curr_struct.name] = struct_to_mustache(curr_struct, src_url, parRunner);
		}
	}
	return retval;
}
} //unnamed namespace
std::vector<std::string> apply (
const ScrapNode& node,
HtmlPoolBaseSP html_pool
) {
DictBuilder dict_builder(html_pool);
using std::placeholders::_1;
DictBuilder dict_builder;
boost::apply_visitor(dict_builder, node);
std::vector<std::string> retval;
@ -182,8 +241,12 @@ namespace duck { namespace sl {
const MustacheEntryMap& mustaches = dict_builder.mustache_entries();
retval.reserve(mustaches.size());
std::cout << "-------------- visiting done ----------------\n";
XPathRunner xpath_runner(html_pool);
mstch::map mustache_ctx = to_mustache_map(global_entries, xpath_runner);
for (auto& must : mustaches) {
}
//for (auto& itm : dict_builder.global_entries()) {
// std::cout << "item: \"" << itm.first << "\", \"" <<
// itm.second->xpath << "\"\n";

View file

@ -34,19 +34,14 @@ namespace duck { namespace sl {
return stream;
}
std::ostream& operator<< (std::ostream& stream, const SourceInfo& src) {
stream << "SourceInfo with ";
switch (src.type) {
case SourceInfo::URL:
stream << "URL \"" << src.value << "\"";
break;
case SourceInfo::Token:
stream << "value \"" << src.value << "\"";
break;
default:
stream << "invalid content";
}
return stream;
/// Writes a short textual form of a SourceInfo to a stream.
///
/// A URL is written wrapped in double quotes, a token is written with a
/// leading '$'; any other type produces a fixed error text.
///
/// @param parStream stream to write to
/// @param parInfo   value to print
/// @return parStream, to allow chaining
std::ostream& operator<< (std::ostream& parStream, const duck::sl::SourceInfo& parInfo) {
	using duck::sl::SourceInfo;

	switch (parInfo.type) {
	case SourceInfo::URL:
		parStream << '"' << parInfo.value << '"';
		break;
	case SourceInfo::Token:
		parStream << '$' << parInfo.value;
		break;
	default:
		parStream << "Invalid SourceInfo type";
		break;
	}
	return parStream;
}
std::ostream& operator<< (std::ostream& stream, const FromBlock& blk) {

View file

@ -0,0 +1,57 @@
/* Copyright (C) 2015 Michele Santullo
*
* This file is part of DuckScraper.
*
* DuckScraper is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* DuckScraper is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
*/
#include "xpath_runner.hpp"
#include <cassert>
namespace duck { namespace sl {
/// Key pairing a source address with an xpath query, used as the key type
/// of the runner's result map.
struct XPathRunner::XPathKey {
	/// @param parSrc   source address, asserted to be non-empty
	/// @param parQuery xpath query text
	XPathKey (const std::string& parSrc, const std::string& parQuery) :
		source_address(parSrc),
		xpath_query(parQuery)
	{
		assert(not source_address.empty());
	}

	/// Orders keys by query first, falling back to the source address
	/// when the two queries are equal.
	bool operator< (const XPathKey& parOther) const {
		if (xpath_query != parOther.xpath_query)
			return xpath_query < parOther.xpath_query;
		return source_address < parOther.source_address;
	}

	std::string source_address;
	std::string xpath_query;
};
/// Creates a runner that keeps its own copy of the given html pool handle.
///
/// @param html_pool handle stored in m_pool
XPathRunner::XPathRunner (HtmlPoolBaseSP html_pool) :
	m_pool(html_pool)
{
	//m_cached_results is default-constructed, so it starts out empty
}
//Defaulted here rather than in the header: XPathKey is only forward-declared
//there, while at this point in the file it is a complete type.
XPathRunner::~XPathRunner() = default;
/// Placeholder implementation: no query is executed yet.
///
/// @param parSrc   currently ignored
/// @param parQuery currently ignored
/// @return a reference to an empty vector with static storage duration,
///         so the reference remains valid after the call returns
const std::vector<std::string>& XPathRunner::query (
	std::string_view parSrc,
	std::string_view parQuery
) {
	//not implemented yet: neither argument is consulted
	static_cast<void>(parSrc);
	static_cast<void>(parQuery);

	static const std::vector<std::string> no_results;
	return no_results;
}
}} //namespace duck::sl

View file

@ -0,0 +1,46 @@
/* Copyright (C) 2015 Michele Santullo
*
* This file is part of DuckScraper.
*
* DuckScraper is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* DuckScraper is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef id46DB8F4F85E2417E9AF0B1A410240D4F
#define id46DB8F4F85E2417E9AF0B1A410240D4F
#include "html_pool_base.hpp"
#include <map>
#include <string>
#include <string_view>
#include <vector>
namespace duck { namespace sl {
//Runs xpath queries against sources, holding a handle to an html pool
//and a map of query results keyed by (source, query).
class XPathRunner {
public:
//Stores html_pool in the new object.
explicit XPathRunner (HtmlPoolBaseSP html_pool);
//Declared here and defined in the .cpp file.
~XPathRunner();
//Returns the results of running parQuery against parSrc as a reference
//to a vector of strings.
//NOTE(review): presumably the reference points into m_cached_results and
//stays valid for the lifetime of the runner - confirm once implemented.
const std::vector<std::string>& query (
std::string_view parSrc,
std::string_view parQuery
);
private:
//Only forward-declared here; the definition lives in the .cpp file.
struct XPathKey;
//Result lists indexed by XPathKey.
std::map<XPathKey, std::vector<std::string>> m_cached_results;
//Html pool handle received by the constructor.
HtmlPoolBaseSP m_pool;
};
}} //namespace duck::sl
#endif