From b028e8c492c278a1296876a3a4a3fa7b3fcc0b76 Mon Sep 17 00:00:00 2001 From: King_DuckZ Date: Thu, 8 Feb 2018 00:57:16 +0000 Subject: [PATCH] Lots of crap but it works. I'll improve code as I go. --- src/htmlretrieve.cpp | 6 +++--- src/htmlretrieve.hpp | 3 ++- src/scraplang/apply.cpp | 13 +++++++++++-- src/scraplang/html_pool_base.hpp | 3 ++- src/scraplang/xpath_runner.cpp | 29 ++++++++++++++++++++++++----- src/xpath.cpp | 18 ++++++++++++------ src/xpath.hpp | 2 +- 7 files changed, 55 insertions(+), 19 deletions(-) diff --git a/src/htmlretrieve.cpp b/src/htmlretrieve.cpp index b466fd3..0c9ef66 100644 --- a/src/htmlretrieve.cpp +++ b/src/htmlretrieve.cpp @@ -45,7 +45,7 @@ namespace duck { } } - bool isHttps (const std::string& parUrl) { + bool isHttps (const std::string_view& parUrl) { const char protocol[] = "https://"; const size_t protocolLen = sizeof(protocol) / sizeof(protocol[0]) - 1; if (parUrl.size() < protocolLen) @@ -103,7 +103,7 @@ namespace duck { } - std::string fetch_html (const std::string& parSource, std::string parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost) { + std::string fetch_html (const std::string_view& parSource, std::string parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost) { using curl::curl_easy; using curl::curl_pair; using curl::curl_ios; @@ -111,7 +111,7 @@ namespace duck { std::ostringstream oss; curl_ios wr(oss); curl_easy easy(wr); - easy.add(curl_pair(CURLOPT_URL, parSource)); + easy.add(curl_pair(CURLOPT_URL, std::string(parSource))); if (isHttps(parSource)) { easy.add(curl_pair(CURLOPT_SSL_VERIFYPEER, parSslVerifyPeer)); easy.add(curl_pair(CURLOPT_SSL_VERIFYHOST, parSslVerifyHost)); diff --git a/src/htmlretrieve.hpp b/src/htmlretrieve.hpp index ea1123d..578585a 100644 --- a/src/htmlretrieve.hpp +++ b/src/htmlretrieve.hpp @@ -20,9 +20,10 @@ #define idC6776D903059465191FFB64FCFD6B86A #include +#include namespace duck { - std::string fetch_html ( const std::string& parSource, std::string parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost ); + std::string fetch_html ( const std::string_view& parSource, std::string parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost ); std::string clean_html ( std::string&& html ); } //namespace duck diff --git a/src/scraplang/apply.cpp b/src/scraplang/apply.cpp index 58fdda6..edcb188 100644 --- a/src/scraplang/apply.cpp +++ b/src/scraplang/apply.cpp @@ -72,6 +72,8 @@ namespace duck { namespace sl { content(""), mustache_name(parMstchName) { + assert(apply_to); + assert(not apply_to->value.empty()); } ApplyEntry (ApplyEntry&&) = default; ApplyEntry& operator=(ApplyEntry&&) = default; @@ -150,6 +152,7 @@ namespace duck { namespace sl { #if defined(APPLY_VERBOSE) std::cout << parVal << '\n'; #endif + assert(not parVal.source.value.empty()); m_apply_entries.emplace_back(&parVal.source, parVal.mustache_model); store_entry_subtree(parVal.xpaths, m_apply_entries.back().content); } @@ -353,9 +356,15 @@ namespace duck { namespace sl { for (auto& apply_entry : apply_entries) { EntryNodeList entry_node {std::make_pair(apply_entry.apply_to, apply_entry.content)}; mstch::map entry_ctx = to_mustache_map(entry_node, xpath_runner); - //std::cout << "Raw mustache for \"" << must.first << "\":\n" << - // must.second.text << "\nRendered mustache:\n"; std::string name(apply_entry.mustache_name); + + std::cout << "context size: " << entry_ctx.size() << '\n'; + for (auto& ctx_itm : entry_ctx) { + std::cout << '\t' << ctx_itm.first << '\n'; + } + std::cout << "Raw mustache for \"" << name << "\":\n" << + mustaches.at(name).text << "\nRendered mustache:\n"; + std::cout << mstch::render(mustaches.at(name).text, entry_ctx) << std::endl; } diff --git a/src/scraplang/html_pool_base.hpp b/src/scraplang/html_pool_base.hpp index ec659be..8b22459 100644 --- a/src/scraplang/html_pool_base.hpp +++ b/src/scraplang/html_pool_base.hpp @@ -21,10 +21,11 @@ #include "implem/ResourcePool.hpp" #include "kakoune/safe_ptr.hh" +#include namespace duck { namespace sl { namespace implem { - typedef duckutil::ResourcePool HtmlPoolBase; + typedef duckutil::ResourcePool HtmlPoolBase; } //namespace implem class HtmlPoolBase : public implem::HtmlPoolBase, public Kakoune::SafeCountable { diff --git a/src/scraplang/xpath_runner.cpp b/src/scraplang/xpath_runner.cpp index 058f9bd..432df09 100644 --- a/src/scraplang/xpath_runner.cpp +++ b/src/scraplang/xpath_runner.cpp @@ -17,13 +17,15 @@ */ #include "xpath_runner.hpp" +#include "xpath.hpp" #include +#include namespace duck { namespace sl { struct XPathRunner::XPathKey { - XPathKey (const std::string& parSrc, const std::string& parQuery) : - source_address(parSrc), - xpath_query(parQuery) + XPathKey (const std::string_view& parSrc, const std::string_view& parQuery) : + source_address(std::string(parSrc)), + xpath_query(std::string(parQuery)) { assert(not source_address.empty()); } @@ -51,7 +53,24 @@ namespace duck { namespace sl { std::string_view parSrc, std::string_view parQuery ) { - static std::vector deleme {"hello", "world"}; - return deleme; + std::cout << "XPathRunner::query()\n"; + auto ins_retval = m_cached_results.insert(std::make_pair(XPathKey(parSrc, parQuery), std::vector())); + const bool inserted = ins_retval.second; + assert(ins_retval.first != m_cached_results.end()); + std::vector& curr_vec = ins_retval.first->second; + + if (inserted) { + const auto id = m_pool->AddResource(parSrc); + const std::string* html = m_pool->GetByID(id); + + curr_vec = xpath_query(*html, std::string(parQuery)); + } + + std::cout << "returning " << curr_vec.size() << " items: "; + for (auto& i : curr_vec) { + std:: cout << '"' << i << "\", "; + } + std::cout << '\n'; + return curr_vec; } }} //namespace duck::sl diff --git a/src/xpath.cpp b/src/xpath.cpp index 6c34279..0ef3db0 100644 --- a/src/xpath.cpp +++ b/src/xpath.cpp @@ -76,12 +76,18 @@ namespace duck { return std::move(retval); } - std::string xpath_query (const std::string& parXML, const std::string& parQuery) { - auto retval = xpath_query(parXML, std::vector{parQuery}); - if (retval.empty() or retval.front().empty()) - return std::string(); - else - return retval.front().front().second; + std::vector xpath_query (const std::string& parXML, const std::string& parQuery) { + auto query_res = xpath_query(parXML, std::vector{parQuery}); + if (query_res.empty() or query_res.front().empty()) { + return std::vector(); + } + else { + std::vector retval; + const std::vector>& src = query_res.front(); + retval.reserve(src.size()); + std::transform(src.begin(), src.end(), std::back_inserter(retval), [](const auto& pair) { return pair.second; }); + return retval; + } } ParseError::ParseError (int parLine, int parColumn, std::string parMessage) { diff --git a/src/xpath.hpp b/src/xpath.hpp index d95d1f2..12ce69c 100644 --- a/src/xpath.hpp +++ b/src/xpath.hpp @@ -36,7 +36,7 @@ namespace duck { }; XPathBatchResults xpath_query ( const std::string& parXML, const std::vector& parQueries ); - std::string xpath_query ( const std::string& parXML, const std::string& parQuery ); + std::vector xpath_query ( const std::string& parXML, const std::string& parQuery ); } //namespace duck #endif