From 6dffe9b8482d2ca35b4398451112bc8bb73576a7 Mon Sep 17 00:00:00 2001 From: King_DuckZ Date: Wed, 17 Jan 2018 23:24:35 +0000 Subject: [PATCH] Writing the code to go from tree to mustache dictionary. --- CMakeLists.txt | 1 + src/scraplang/apply.cpp | 69 +++++++++++++++++++++++++++-- src/scraplang/stream_scrap_node.hpp | 21 ++++----- src/scraplang/xpath_runner.cpp | 57 ++++++++++++++++++++++++ src/scraplang/xpath_runner.hpp | 46 +++++++++++++++++++ 5 files changed, 178 insertions(+), 16 deletions(-) create mode 100644 src/scraplang/xpath_runner.cpp create mode 100644 src/scraplang/xpath_runner.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index b450978..8e50bff 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,6 +31,7 @@ add_executable(${PROJECT_NAME} src/scraplang/parse.cpp src/scraplang/apply.cpp src/xpath.cpp + src/scraplang/xpath_runner.cpp ) target_include_directories(${PROJECT_NAME} SYSTEM diff --git a/src/scraplang/apply.cpp b/src/scraplang/apply.cpp index 513f4a9..9de4514 100644 --- a/src/scraplang/apply.cpp +++ b/src/scraplang/apply.cpp @@ -22,6 +22,7 @@ #include "mstch/mstch.hpp" #include "html_pool_base.hpp" #include "scrap_node.hpp" +#include "xpath_runner.hpp" #if defined(APPLY_VERBOSE) # include "stream_scrap_node.hpp" #endif @@ -29,6 +30,9 @@ #include #include +namespace std { +} //namespace std + namespace duck { namespace sl { #if defined(APPLY_VERBOSE) #endif @@ -110,7 +114,7 @@ namespace duck { namespace sl { class DictBuilder : public boost::static_visitor<> { public: - explicit DictBuilder (HtmlPoolBaseSP parHtmlPool) : + explicit DictBuilder() : m_current_mustache_name(nullptr), m_current_mustache(nullptr) { @@ -168,13 +172,68 @@ namespace duck { namespace sl { const std::string* m_current_mustache_name; MustacheEntry* m_current_mustache; }; + + mstch::map to_mustache_dict_recursive (const EntryNode& parNode, std::string_view parSrc, XPAthRunner& parRunner) { + mstch::map retval; + for (const XPathElement* xpath : parNode.xpaths) { + 
assert(xpath); + } + } + + mstch::map to_mustache_map (const EntryNodeList& parNodes, XPathRunner& parRunner) { + mstch::map retval; + for (auto& entry : parNodes) { + assert(entry.second.name.empty()); + std::cout << "Analyzing entry " << *entry.first << '\n'; + + assert(entry.first); + std::string_view src_url; + if (SourceInfo::URL == entry.first->type) { + src_url = entry.first->value; + } + else { + assert(false); //not implemented + } + + //mstch::map curr_entry_map = to_mustache_dict_recursive( + for (const XPathElement* xpath : entry.second.xpaths) { + assert(xpath); + std::cout << "Running query for \"" << xpath->name << "\"\n"; + auto results = parRunner.query(src_url, xpath->xpath); + if (results.size() == 1) { + retval[xpath->name] = results.front(); + } + else if (results.size() > 1) { + mstch::array values; + values.reserve(results.size()); + std::copy(results.begin(), results.end(), std::back_inserter(values)); + retval[xpath->name] = std::move(values); + } + else if (xpath->def_val) { + retval[xpath->name] = *xpath->def_val; + } + else { + retval[xpath->name] = std::string(); + } + } + + for (auto& curr_struct : entry.second.structs) { + assert(not curr_struct.name.empty()); + retval[curr_struct.name] = struct_to_mustache(curr_struct, src_url, parRunner); + } + } + + return retval; + } } //unnamed namespace std::vector apply ( const ScrapNode& node, HtmlPoolBaseSP html_pool ) { - DictBuilder dict_builder(html_pool); + using std::placeholders::_1; + + DictBuilder dict_builder; boost::apply_visitor(dict_builder, node); std::vector retval; @@ -182,8 +241,12 @@ namespace duck { namespace sl { const MustacheEntryMap& mustaches = dict_builder.mustache_entries(); retval.reserve(mustaches.size()); - std::cout << "-------------- visiting done ----------------\n"; + XPathRunner xpath_runner(html_pool); + mstch::map mustache_ctx = to_mustache_map(global_entries, xpath_runner); + for (auto& must : mustaches) { + } + //for (auto& itm : 
dict_builder.global_entries()) { // std::cout << "item: \"" << itm.first << "\", \"" << // itm.second->xpath << "\"\n"; diff --git a/src/scraplang/stream_scrap_node.hpp b/src/scraplang/stream_scrap_node.hpp index be15348..630744a 100644 --- a/src/scraplang/stream_scrap_node.hpp +++ b/src/scraplang/stream_scrap_node.hpp @@ -34,19 +34,14 @@ namespace duck { namespace sl { return stream; } - std::ostream& operator<< (std::ostream& stream, const SourceInfo& src) { - stream << "SourceInfo with "; - switch (src.type) { - case SourceInfo::URL: - stream << "URL \"" << src.value << "\""; - break; - case SourceInfo::Token: - stream << "value \"" << src.value << "\""; - break; - default: - stream << "invalid content"; - } - return stream; + std::ostream& operator<< (std::ostream& parStream, const duck::sl::SourceInfo& parInfo) { + if (duck::sl::SourceInfo::URL == parInfo.type) + parStream << '"' << parInfo.value << '"'; + else if (duck::sl::SourceInfo::Token == parInfo.type) + parStream << '$' << parInfo.value; + else + parStream << "Invalid SourceInfo type"; + return parStream; } std::ostream& operator<< (std::ostream& stream, const FromBlock& blk) { diff --git a/src/scraplang/xpath_runner.cpp b/src/scraplang/xpath_runner.cpp new file mode 100644 index 0000000..c19a6a5 --- /dev/null +++ b/src/scraplang/xpath_runner.cpp @@ -0,0 +1,57 @@ +/* Copyright (C) 2015 Michele Santullo + * + * This file is part of DuckScraper. + * + * DuckScraper is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * DuckScraper is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with DuckScraper. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "xpath_runner.hpp" +#include + +namespace duck { namespace sl { + struct XPathRunner::XPathKey { + XPathKey (const std::string& parSrc, const std::string& parQuery) : + source_address(parSrc), + xpath_query(parQuery) + { + assert(not source_address.empty()); + } + + std::string source_address; + std::string xpath_query; + + bool operator< (const XPathKey& parOther) const { + return ( + xpath_query == parOther.xpath_query and + source_address < parOther.source_address + ) or (xpath_query < parOther.xpath_query); + } + }; + + XPathRunner::XPathRunner (HtmlPoolBaseSP html_pool) : + m_cached_results(), + m_pool(html_pool) + { + } + + XPathRunner::~XPathRunner() = default; + + const std::vector& XPathRunner::query ( + std::string_view parSrc, + std::string_view parQuery + ) { + static std::vector deleme; + return deleme; + } +}} //namespace duck::sl diff --git a/src/scraplang/xpath_runner.hpp b/src/scraplang/xpath_runner.hpp new file mode 100644 index 0000000..925b1d5 --- /dev/null +++ b/src/scraplang/xpath_runner.hpp @@ -0,0 +1,46 @@ +/* Copyright (C) 2015 Michele Santullo + * + * This file is part of DuckScraper. + * + * DuckScraper is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * DuckScraper is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with DuckScraper. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#ifndef id46DB8F4F85E2417E9AF0B1A410240D4F +#define id46DB8F4F85E2417E9AF0B1A410240D4F + +#include "html_pool_base.hpp" +#include +#include +#include + +namespace duck { namespace sl { + class XPathRunner { + public: + explicit XPathRunner (HtmlPoolBaseSP html_pool); + ~XPathRunner(); + + const std::vector& query ( + std::string_view parSrc, + std::string_view parQuery + ); + + private: + struct XPathKey; + + std::map> m_cached_results; + HtmlPoolBaseSP m_pool; + }; +}} //namespace duck::sl + +#endif