Writing the code to go from tree to mustache dictionary.
This commit is contained in:
parent
41bb315b02
commit
6dffe9b848
5 changed files with 178 additions and 16 deletions
|
@ -31,6 +31,7 @@ add_executable(${PROJECT_NAME}
|
|||
src/scraplang/parse.cpp
|
||||
src/scraplang/apply.cpp
|
||||
src/xpath.cpp
|
||||
src/scraplang/xpath_runner.cpp
|
||||
)
|
||||
|
||||
target_include_directories(${PROJECT_NAME} SYSTEM
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
#include "mstch/mstch.hpp"
|
||||
#include "html_pool_base.hpp"
|
||||
#include "scrap_node.hpp"
|
||||
#include "xpath_runner.hpp"
|
||||
#if defined(APPLY_VERBOSE)
|
||||
# include "stream_scrap_node.hpp"
|
||||
#endif
|
||||
|
@ -29,6 +30,9 @@
|
|||
#include <boost/variant/apply_visitor.hpp>
|
||||
#include <string_view>
|
||||
|
||||
namespace std {
|
||||
} //namespace std
|
||||
|
||||
namespace duck { namespace sl {
|
||||
#if defined(APPLY_VERBOSE)
|
||||
#endif
|
||||
|
@ -110,7 +114,7 @@ namespace duck { namespace sl {
|
|||
|
||||
class DictBuilder : public boost::static_visitor<> {
|
||||
public:
|
||||
explicit DictBuilder (HtmlPoolBaseSP parHtmlPool) :
|
||||
explicit DictBuilder() :
|
||||
m_current_mustache_name(nullptr),
|
||||
m_current_mustache(nullptr)
|
||||
{
|
||||
|
@ -168,13 +172,68 @@ namespace duck { namespace sl {
|
|||
const std::string* m_current_mustache_name;
|
||||
MustacheEntry* m_current_mustache;
|
||||
};
|
||||
|
||||
//Converts one entry node into a mustache dictionary.
//
//Work in progress: at the moment only the xpath pointers stored in parNode
//are sanity-checked and an empty map is returned; parSrc and parRunner are
//accepted for the upcoming implementation but are not read yet.
//
//parNode:   node whose xpaths are inspected.
//parSrc:    source address the queries are meant to run against (unused).
//parRunner: query runner meant to resolve the xpaths (unused).
//returns:   the dictionary built for parNode (currently always empty).
mstch::map to_mustache_dict_recursive (const EntryNode& parNode, std::string_view parSrc, XPathRunner& parRunner) {
	mstch::map retval;
	for (const XPathElement* xpath : parNode.xpaths) {
		assert(xpath);
	}
	//Flowing off the end of a non-void function is undefined behaviour,
	//so the (still empty) result has to be returned explicitly.
	return retval;
}
|
||||
|
||||
//Builds the mustache context out of a list of (source, entry node) pairs.
//
//For every entry the source is expected to be a URL (other source types hit
//the "not implemented" assert). Each xpath of the entry is run through
//parRunner and stored in the returned map under the xpath's name:
// - exactly one result: stored as a single string
// - several results:    stored as an mstch::array of strings
// - no result:          the xpath's default value if it has one, otherwise
//                       an empty string
//Every struct of the entry is converted with struct_to_mustache and stored
//under the struct's name.
//
//All entries write into the same map, so a name that appears in more than
//one entry keeps the value written last.
//
//parNodes:  entries to convert; each source pointer must be non-null and
//           each top-level node must be unnamed.
//parRunner: runner used to resolve the xpath queries.
//returns:   the populated mustache map.
mstch::map to_mustache_map (const EntryNodeList& parNodes, XPathRunner& parRunner) {
	mstch::map retval;
	for (auto& entry : parNodes) {
		//Validate the source pointer before it gets dereferenced by the
		//log line below.
		assert(entry.first);
		assert(entry.second.name.empty());
		std::cout << "Analyzing entry " << *entry.first << '\n';

		std::string_view src_url;
		if (SourceInfo::URL == entry.first->type) {
			src_url = entry.first->value;
		}
		else {
			assert(false); //not implemented
		}

		//mstch::map curr_entry_map = to_mustache_dict_recursive(
		for (const XPathElement* xpath : entry.second.xpaths) {
			assert(xpath);
			std::cout << "Running query for \"" << xpath->name << "\"\n";
			//query() hands back a const reference; bind to it instead of
			//copying the whole result vector.
			const auto& results = parRunner.query(src_url, xpath->xpath);
			if (results.size() == 1) {
				retval[xpath->name] = results.front();
			}
			else if (results.size() > 1) {
				mstch::array values;
				values.reserve(results.size());
				std::copy(results.begin(), results.end(), std::back_inserter(values));
				retval[xpath->name] = std::move(values);
			}
			else if (xpath->def_val) {
				retval[xpath->name] = *xpath->def_val;
			}
			else {
				retval[xpath->name] = std::string();
			}
		}

		for (auto& curr_struct : entry.second.structs) {
			assert(not curr_struct.name.empty());
			retval[curr_struct.name] = struct_to_mustache(curr_struct, src_url, parRunner);
		}
	}

	return retval;
}
|
||||
} //unnamed namespace
|
||||
|
||||
std::vector<std::string> apply (
|
||||
const ScrapNode& node,
|
||||
HtmlPoolBaseSP html_pool
|
||||
) {
|
||||
DictBuilder dict_builder(html_pool);
|
||||
using std::placeholders::_1;
|
||||
|
||||
DictBuilder dict_builder;
|
||||
boost::apply_visitor(dict_builder, node);
|
||||
|
||||
std::vector<std::string> retval;
|
||||
|
@ -182,8 +241,12 @@ namespace duck { namespace sl {
|
|||
const MustacheEntryMap& mustaches = dict_builder.mustache_entries();
|
||||
retval.reserve(mustaches.size());
|
||||
|
||||
|
||||
std::cout << "-------------- visiting done ----------------\n";
|
||||
XPathRunner xpath_runner(html_pool);
|
||||
mstch::map mustache_ctx = to_mustache_map(global_entries, xpath_runner);
|
||||
for (auto& must : mustaches) {
|
||||
}
|
||||
|
||||
//for (auto& itm : dict_builder.global_entries()) {
|
||||
// std::cout << "item: \"" << itm.first << "\", \"" <<
|
||||
// itm.second->xpath << "\"\n";
|
||||
|
|
|
@ -34,19 +34,14 @@ namespace duck { namespace sl {
|
|||
return stream;
|
||||
}
|
||||
|
||||
std::ostream& operator<< (std::ostream& stream, const SourceInfo& src) {
|
||||
stream << "SourceInfo with ";
|
||||
switch (src.type) {
|
||||
case SourceInfo::URL:
|
||||
stream << "URL \"" << src.value << "\"";
|
||||
break;
|
||||
case SourceInfo::Token:
|
||||
stream << "value \"" << src.value << "\"";
|
||||
break;
|
||||
default:
|
||||
stream << "invalid content";
|
||||
}
|
||||
return stream;
|
||||
//Writes a short textual form of a SourceInfo to parStream:
// - URL sources are printed as their value wrapped in double quotes
// - Token sources are printed as '$' followed by their value
// - anything else is reported as "Invalid SourceInfo type"
//Returns parStream so that insertions can be chained.
std::ostream& operator<< (std::ostream& parStream, const duck::sl::SourceInfo& parInfo) {
	using duck::sl::SourceInfo;

	switch (parInfo.type) {
	case SourceInfo::URL:
		parStream << '"' << parInfo.value << '"';
		break;
	case SourceInfo::Token:
		parStream << '$' << parInfo.value;
		break;
	default:
		parStream << "Invalid SourceInfo type";
		break;
	}
	return parStream;
}
|
||||
|
||||
std::ostream& operator<< (std::ostream& stream, const FromBlock& blk) {
|
||||
|
|
57
src/scraplang/xpath_runner.cpp
Normal file
57
src/scraplang/xpath_runner.cpp
Normal file
|
@ -0,0 +1,57 @@
|
|||
/* Copyright (C) 2015 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "xpath_runner.hpp"
|
||||
#include <cassert>
|
||||
|
||||
namespace duck { namespace sl {
|
||||
//Key identifying one xpath query run against one source; it is the key type
//of XPathRunner's result cache.
struct XPathRunner::XPathKey {
	//Takes string views so that a key can be built directly from the
	//std::string_view arguments XPathRunner::query() receives (std::string
	//is only explicitly constructible from a view). Callers holding a
	//std::string can still pass it unchanged.
	//parSrc must not be empty.
	XPathKey (std::string_view parSrc, std::string_view parQuery) :
		source_address(parSrc),
		xpath_query(parQuery)
	{
		assert(not source_address.empty());
	}

	std::string source_address;
	std::string xpath_query;

	//Lexicographic order on (xpath_query, source_address): the query text
	//decides first, the source address only breaks ties.
	bool operator< (const XPathKey& parOther) const {
		if (xpath_query != parOther.xpath_query)
			return xpath_query < parOther.xpath_query;
		return source_address < parOther.source_address;
	}
};
|
||||
|
||||
XPathRunner::XPathRunner (HtmlPoolBaseSP html_pool) :
|
||||
m_cached_results(),
|
||||
m_pool(html_pool)
|
||||
{
|
||||
}
|
||||
|
||||
//Defaulted here, in the translation unit that defines XPathKey, rather than
//in the header where XPathKey is only forward-declared.
XPathRunner::~XPathRunner() = default;
|
||||
|
||||
//Placeholder implementation: both arguments are ignored and a reference to
//a function-local, always empty vector is returned.
//
//parSrc:   address of the source the query refers to (currently unused).
//parQuery: xpath expression to evaluate (currently unused).
//returns:  reference to an empty list of results.
const std::vector<std::string>& XPathRunner::query (
	std::string_view parSrc,
	std::string_view parQuery
) {
	static const std::vector<std::string> placeholder_results;
	return placeholder_results;
}
|
||||
}} //namespace duck::sl
|
46
src/scraplang/xpath_runner.hpp
Normal file
46
src/scraplang/xpath_runner.hpp
Normal file
|
@ -0,0 +1,46 @@
|
|||
/* Copyright (C) 2015 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef id46DB8F4F85E2417E9AF0B1A410240D4F
|
||||
#define id46DB8F4F85E2417E9AF0B1A410240D4F
|
||||
|
||||
#include "html_pool_base.hpp"
|
||||
#include <map>
|
||||
#include <string_view>
|
||||
#include <string>
|
||||
|
||||
namespace duck { namespace sl {
|
||||
class XPathRunner {
|
||||
public:
|
||||
explicit XPathRunner (HtmlPoolBaseSP html_pool);
|
||||
~XPathRunner();
|
||||
|
||||
const std::vector<std::string>& query (
|
||||
std::string_view parSrc,
|
||||
std::string_view parQuery
|
||||
);
|
||||
|
||||
private:
|
||||
struct XPathKey;
|
||||
|
||||
std::map<XPathKey, std::vector<std::string>> m_cached_results;
|
||||
HtmlPoolBaseSP m_pool;
|
||||
};
|
||||
}} //namespace duck::sl
|
||||
|
||||
#endif
|
Loading…
Reference in a new issue