From 5d2c5863a5e48a54fe95bffb8f9d6d29e0c727d9 Mon Sep 17 00:00:00 2001 From: King_DuckZ Date: Thu, 15 Feb 2018 10:29:05 +0000 Subject: [PATCH] Making ApplyBlocks work with {{variable}} sources. --- sample.scrap | 20 +++---- src/html_pool.cpp | 12 ++++ src/scraplang/apply.cpp | 103 +++++++++++++++++++++++++++------ src/scraplang/xpath_runner.cpp | 4 +- 4 files changed, 110 insertions(+), 29 deletions(-) diff --git a/sample.scrap b/sample.scrap index cb2b184..15460b4 100644 --- a/sample.scrap +++ b/sample.scrap @@ -1,17 +1,17 @@ from http://sid-story.wikia.com/wiki/Album - pages = //blah/blah/text() + pages = //section/header/h2/a/@href end -apply {{sidian_info_model}} to {{pages}} - struct Sidians - sidian_name default("n/a") = //table[@class="wikitable sortable"]/tr/td[4]/a/text() - activ_probability default("0") = //table[@class="wikitable sortable"]/tr/td[3]/text() +apply {{test_mustache}} to {{pages}} + struct paragraphs + paragraph = //section/header/h2/a/text() end - something_else = /html/head/text() end -==sidian_info_model -{{#Sidians}} -{{sidian_name}} {{activ_probability}} -{{/Sidians}} +==test_mustache +Paragraphs: {{#paragraphs}} +- {{paragraph}} +{{/paragraphs}} + +kthx bye! ==end diff --git a/src/html_pool.cpp b/src/html_pool.cpp index 756222a..c964884 100644 --- a/src/html_pool.cpp +++ b/src/html_pool.cpp @@ -24,6 +24,12 @@ #include #include +//#define HTML_ALWAYS_STDIN + +#if !defined(NDEBUG) && defined(HTML_ALWAYS_STDIN) +# define HTML_ALWAYS_STDIN_ENABLED +#endif + namespace duck { HtmlPool::HtmlPool (std::string&& agent_name) : m_agent(std::move(agent_name)) @@ -33,7 +39,13 @@ namespace duck { auto HtmlPool::OnResourceLoad (ResourceObjectParameterType parRes) -> ResourceType* { std::unique_ptr html; + std::cout << "Fetching html from \"" << parRes << "\"\n"; + +#if defined(HTML_ALWAYS_STDIN_ENABLED) + if (true) { +#else if (parRes == "-") { +#endif html = std::make_unique(read_all(std::cin)); } else { diff --git a/src/scraplang/apply.cpp b/src/scraplang/apply.cpp index edcb188..f054f49 100644 --- a/src/scraplang/apply.cpp +++ b/src/scraplang/apply.cpp @@ -29,6 +29,7 @@ #include #include #include +#include namespace std { } //namespace std @@ -52,13 +53,14 @@ namespace duck { namespace sl { using MustacheEntryMap = std::map; struct EntryNode { - EntryNode (const EntryNode&) = default; - explicit EntryNode (const std::string& parName) : + explicit EntryNode (const std::string_view& parName) : name(parName) { } EntryNode (EntryNode&&) = default; + EntryNode (const EntryNode&) = default; EntryNode& operator= (EntryNode&&) = default; + EntryNode& operator= (const EntryNode&) = default; std::string_view name; std::vector structs; @@ -85,6 +87,7 @@ namespace duck { namespace sl { class StructItemExtractor : public boost::static_visitor<> { public: + StructItemExtractor() = delete; explicit StructItemExtractor (EntryNode& parRoot) : m_root(parRoot) { @@ -145,7 +148,8 @@ namespace duck { namespace sl { EntryNode("") )); - store_entry_subtree(parVal.xpaths, m_global_entries.back().second); + EntryNode& curr_node = m_global_entries.back().second; + store_entry_subtree(parVal.xpaths, curr_node); } void operator() (const ApplyBlock& parVal) { @@ -232,6 +236,39 @@ namespace duck { namespace sl { const std::size_t m_expected_size; }; + const std::vector& query_xpath_by_name ( + const EntryNodeList& parNodes, + const std::string_view& parName, + XPathRunner& parRunner + ) { + for (auto& curr_node : parNodes) { + assert(curr_node.first); + const SourceInfo& source = *curr_node.first; + const EntryNode& entry = curr_node.second; + assert(entry.name.empty()); + + auto it_found = std::find_if( + entry.xpaths.begin(), + entry.xpaths.end(), + [&parName](const auto& xpath_elem) { + return xpath_elem->name == parName; + } + ); + + if (it_found != entry.xpaths.end()) { + const XPathElement* const val = *it_found; + assert(val); + return parRunner.query(source.value, val->xpath); + } + } + + static const std::vector empty_retval; + std::cout << "query_xpath_by_name(parNodes, \"" << parName << + "\", parRunner) -> nothing found" << std::endl; + assert(false); //throw? + return empty_retval; + } + std::size_t largest_array_size_in (mstch::map& parMap) { typedef ItemCountingVisitor ITC; using boost::apply_visitor; @@ -318,11 +355,14 @@ namespace duck { namespace sl { assert(entry.first); std::string_view src_url; - if (SourceInfo::URL == entry.first->type) { + + switch (entry.first->type) { + case SourceInfo::URL: src_url = entry.first->value; - } - else { - assert(false); //not implemented + break; + case SourceInfo::Token: + default: + assert(false); //not reached } mstch::map curr_entry_map = to_mustache_dict_recursive(entry.second, src_url, parRunner); @@ -332,6 +372,26 @@ namespace duck { namespace sl { return retval; } + + void exec_apply_block ( + const SourceInfo& parSourceInfo, + const EntryNode& parEntryNode, + const MustacheEntry& parMustache, + XPathRunner& parXPathRunner + ) { + EntryNodeList entry_node {std::make_pair(&parSourceInfo, parEntryNode)}; + mstch::map entry_ctx = to_mustache_map(entry_node, parXPathRunner); + for (auto& ctx : parMustache.context) { + entry_ctx[ctx.first] = ctx.second; + } + + std::cout << "context size: " << entry_ctx.size() << '\n'; + for (auto& ctx_itm : entry_ctx) { + std::cout << '\t' << ctx_itm.first << '\n'; + } + + std::cout << mstch::render(parMustache.text, entry_ctx) << std::endl; + } } //unnamed namespace std::vector apply ( @@ -351,21 +411,30 @@ namespace duck { namespace sl { std::cout << "-------------- visiting done ----------------\n"; XPathRunner xpath_runner(html_pool); - mstch::map mustache_ctx = to_mustache_map(global_entries, xpath_runner); for (auto& apply_entry : apply_entries) { - EntryNodeList entry_node {std::make_pair(apply_entry.apply_to, apply_entry.content)}; - mstch::map entry_ctx = to_mustache_map(entry_node, xpath_runner); std::string name(apply_entry.mustache_name); + const auto& mustache = mustaches.at(name); + if (SourceInfo::Token == apply_entry.apply_to->type) { + std::vector sources = + query_xpath_by_name(global_entries, apply_entry.apply_to->value, xpath_runner); - std::cout << "context size: " << entry_ctx.size() << '\n'; - for (auto& ctx_itm : entry_ctx) { - std::cout << '\t' << ctx_itm.first << '\n'; + for (auto& source : sources) { + SourceInfo new_source; + new_source.value = source; + new_source.type = SourceInfo::URL; + + EntryNode new_node(apply_entry.content.name); + new_node.structs = apply_entry.content.structs; + new_node.xpaths = apply_entry.content.xpaths; + + exec_apply_block(new_source, new_node, mustache, xpath_runner); + } + } + else { + assert(apply_entry.apply_to); + exec_apply_block(*apply_entry.apply_to, apply_entry.content, mustache, xpath_runner); } - std::cout << "Raw mustache for \"" << name << "\":\n" << - mustaches.at(name).text << "\nRendered mustache:\n"; - - std::cout << mstch::render(mustaches.at(name).text, entry_ctx) << std::endl; } return retval; diff --git a/src/scraplang/xpath_runner.cpp b/src/scraplang/xpath_runner.cpp index 432df09..d1167e6 100644 --- a/src/scraplang/xpath_runner.cpp +++ b/src/scraplang/xpath_runner.cpp @@ -53,7 +53,7 @@ namespace duck { namespace sl { std::string_view parSrc, std::string_view parQuery ) { - std::cout << "XPathRunner::query()\n"; + std::cout << "XPathRunner::query() - \"" << parQuery << "\"\n"; auto ins_retval = m_cached_results.insert(std::make_pair(XPathKey(parSrc, parQuery), std::vector())); const bool inserted = ins_retval.second; assert(ins_retval.first != m_cached_results.end()); @@ -68,7 +68,7 @@ namespace duck { namespace sl { std::cout << "returning " << curr_vec.size() << " items: "; for (auto& i : curr_vec) { - std:: cout << '"' << i << "\", "; + std::cout << '"' << i << "\", "; } std::cout << '\n'; return curr_vec;