Trying to get scraplang implemented

Lots of changes that I made on the train, with little
time to tidy them up.
Use c++17 (for std::optional)
Clean up the cmake script a bit
Get rid of unused stuff
Skeleton implementation of some classes for scraplang
This commit is contained in:
King_DuckZ 2018-01-10 11:09:56 +00:00
parent c31d317d51
commit f0e7a1d136
32 changed files with 1765 additions and 288 deletions

1
.gitignore vendored
View file

@ -1 +1,2 @@
build/
tags

3
.gitmodules vendored
View file

@ -4,3 +4,6 @@
[submodule "lib/tidy"]
path = lib/tidy
url = https://github.com/htacg/tidy-html5.git
[submodule "lib/mstch"]
path = lib/mstch
url = https://github.com/KingDuckZ/mstch.git

View file

@ -8,8 +8,11 @@ include(GetGitRevisionDescription)
find_package(PugiXML REQUIRED)
find_package(Boost 1.32.0 COMPONENTS program_options)
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -std=c++11 -Wall -Wextra -g -O0 -fno-omit-frame-pointer")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -std=c++11 -Wall -Wextra -g -O3 -fomit-frame-pointer")
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wall -Wextra -g -O0 -fno-omit-frame-pointer")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Wall -Wextra -g -O3 -fomit-frame-pointer")
set(DEFAULT_USER_AGENT "DuckScraper")
set(PROJECT_VERSION_BETA "1")
@ -20,25 +23,30 @@ configure_file(
"${PROJECT_BINARY_DIR}/${PROJECT_NAME}Config.h"
)
include_directories(SYSTEM
lib/tidy/include
${PUGIXML_INCLUDE_DIR}
lib/curlcpp/include
${Boost_INCLUDE_DIRS}
)
include_directories(
src/
"${PROJECT_BINARY_DIR}"
)
add_executable(${PROJECT_NAME}
src/main.cpp
src/html_pool.cpp
src/htmlretrieve.cpp
src/commandline.cpp
src/scraplang/scraplang.cpp
src/scraplang/parse.cpp
src/scraplang/apply.cpp
src/scraplang/xpath_manager.cpp
src/scraplang/variables.cpp
src/xpath.cpp
)
target_include_directories(${PROJECT_NAME} SYSTEM
PRIVATE lib/tidy/include
PRIVATE ${PUGIXML_INCLUDE_DIR}
PRIVATE lib/curlcpp/include
PRIVATE ${Boost_INCLUDE_DIRS}
PRIVATE lib/mstch/include
)
target_include_directories(${PROJECT_NAME}
PRIVATE src/
PRIVATE "${PROJECT_BINARY_DIR}"
)
if (BUILD_SHARED_TIDY)
set(TIDY_LIB "tidy-share")
else(BUILD_SHARED_TIDY)
@ -46,10 +54,15 @@ else(BUILD_SHARED_TIDY)
endif(BUILD_SHARED_TIDY)
target_link_libraries(${PROJECT_NAME}
${TIDY_LIB}
${PUGIXML_LIBRARIES}
curlcpp
${Boost_LIBRARIES}
PRIVATE ${TIDY_LIB}
PRIVATE ${PUGIXML_LIBRARIES}
PRIVATE curlcpp
PRIVATE ${Boost_LIBRARIES}
PRIVATE mstch
)
target_compile_definitions(${PROJECT_NAME}
PRIVATE $<$<CONFIG:DEBUG>:KAK_DEBUG>
)
#unset those variables so cmake files from dependencies won't complain about
@ -62,3 +75,4 @@ unset(PROJECT_VERSION)
set(BUILD_SHARED_LIB ${BUILD_SHARED_TIDY}) #for tidy
add_subdirectory(lib/tidy)
add_subdirectory(lib/curlcpp)
add_subdirectory(lib/mstch)

1
lib/mstch Submodule

@ -0,0 +1 @@
Subproject commit 45122d1d515c90a54d509d4b2d8d9279348518f5

17
sample.scrap Normal file
View file

@ -0,0 +1,17 @@
from http://sid-story.wikia.com/wiki/Album
pages = //blah/blah/text()
end
apply {{sidian_info_model}} to {{pages}}
struct Sidians
sidian_name default("n/a") = //table[@class="wikitable sortable"]/tr/td[4]/a/text()
activ_probability default("0") = //table[@class="wikitable sortable"]/tr/td[3]/text()
end
something_else = /html/head/text()
end
==sidian_info_model
{{#Sidians}}
{{sidian_name}} {{activ_probability}}
{{/Sidians}}
==end

View file

@ -52,6 +52,7 @@ namespace duck {
po::options_description query_options("Query options");
query_options.add_options()
("agent", po::value<std::string>()->default_value(DEFAULT_USER_AGENT), "User agent that will be passed to the server")
("model,m", po::value<std::string>(), "Read XPath expressions from the specified file instead of command line")
;
po::options_description positional_options("Positional options");
positional_options.add_options()
@ -86,6 +87,7 @@ namespace duck {
std::cout << "redistribute it under certain conditions.\n"; //type `show c' for details.
std::cout << '\n';
std::cout << "Usage: " << PROGRAM_NAME << " [options...] <url> <xpath>\n";
std::cout << " " << PROGRAM_NAME << " [options...] --model <path> <url>\n";
std::cout << "You can pass - as the url to read from stdin\n";
std::cout << visible;
return true;
@ -96,11 +98,14 @@ namespace duck {
return true;
}
if (parVarMap.count("input-url") == 0) {
if (parVarMap.count("input-url") == 0 and parVarMap.count("model") == 0) {
throw std::invalid_argument("No input URL specified");
}
if (parVarMap.count("xpath") == 0) {
throw std::invalid_argument("No XPath expression specified");
if (not (parVarMap.count("xpath") or parVarMap.count("model"))) {
throw std::invalid_argument("No XPath expression specified and no input model given");
}
else if (parVarMap.count("xpath") and parVarMap.count("model")) {
throw std::invalid_argument("Received both model and XPath expression, but only one of the two is allowed");
}
return false;
}

View file

@ -16,34 +16,31 @@
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef idBE96C2D49C4C413888A79EAEB2B9C0FA
#define idBE96C2D49C4C413888A79EAEB2B9C0FA
#include <vector>
#include "html_pool.hpp"
#include "htmlretrieve.hpp"
#include <string>
#include <memory>
#include <utility>
namespace duck {
struct ScrapNode;
struct element_def;
HtmlPool::HtmlPool (std::string&& agent_name) :
m_agent(std::move(agent_name))
{
}
class ScrapNodePtr {
public:
explicit ScrapNodePtr ( ScrapNode* parPtr );
ScrapNodePtr ( ScrapNodePtr&& parOther );
~ScrapNodePtr ( void ) noexcept;
auto HtmlPool::OnResourceLoad (ResourceObjectParameterType parRes) -> ResourceType* {
auto html = std::make_unique<std::string>(
fetch_html(parRes, m_agent, false, false)
);
*html = duck::clean_html(std::move(*html));
return html.release();
}
ScrapNode& operator* ( void ) { return *m_ptr; }
const ScrapNode& operator* ( void ) const { return *m_ptr; }
ScrapNode& operator-> ( void ) { return *m_ptr; }
const ScrapNode& operator-> ( void ) const { return *m_ptr; }
void HtmlPool::OnResourceDestroy (ResourceNameParamType, ResourceType* parRes) noexcept {
delete parRes;
}
private:
std::unique_ptr<ScrapNode> m_ptr;
};
ScrapNodePtr parse_scraplang ( const std::string& parData );
std::vector<element_def> get_xpath_definitions ( const ScrapNode& parAST );
auto HtmlPool::GetResourceNameFromResourceObject (ResourceObjectParameterType parRes) -> ResourceNameType {
return parRes;
}
} //namespace duck
#endif

42
src/html_pool.hpp Normal file
View file

@ -0,0 +1,42 @@
/* Copyright (C) 2015 Michele Santullo
*
* This file is part of DuckScraper.
*
* DuckScraper is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* DuckScraper is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef idCDCACC393BE24CBD94A3B5E2985984A3
#define idCDCACC393BE24CBD94A3B5E2985984A3
#include "scraplang/html_pool_base.hpp"
namespace duck {
class HtmlPool : public ::duck::sl::HtmlPoolBase {
typedef ::duck::sl::HtmlPoolBase::ResourceType ResourceType;
typedef ::duck::sl::HtmlPoolBase::ResourceNameType ResourceNameType;
typedef ::duck::sl::HtmlPoolBase::ResourceObjectParameterType ResourceObjectParameterType;
typedef ::duck::sl::HtmlPoolBase::ResourceNameParamType ResourceNameParamType;
virtual ResourceType* OnResourceLoad (ResourceObjectParameterType parRes);
virtual void OnResourceDestroy (ResourceNameParamType parName, ResourceType* parRes) noexcept;
virtual ResourceNameType GetResourceNameFromResourceObject (ResourceObjectParameterType parRes);
std::string m_agent;
public:
explicit HtmlPool (std::string&& agent_name);
};
} //namespace duck
#endif

115
src/kakoune/ref_ptr.hh Normal file
View file

@ -0,0 +1,115 @@
#ifndef ref_ptr_hh_INCLUDED
#define ref_ptr_hh_INCLUDED
#include <utility>
namespace Kakoune
{
// Base class for objects managed through RefPtr with the default policy.
// The counter starts at zero; the first RefPtr that acquires the object
// brings it to one.
struct RefCountable
{
    int refcount = 0;
    virtual ~RefCountable() = default;
};

// Default RefPtr policy: plain (non-atomic) intrusive counting, deleting
// the object when the last reference is dropped.
struct RefCountablePolicy
{
    static void inc_ref(RefCountable* r, void*) noexcept { ++r->refcount; }
    static void dec_ref(RefCountable* r, void*) { if (--r->refcount == 0) delete r; }
    static void ptr_moved(RefCountable*, void*, void*) noexcept {}
};

// Smart pointer that delegates reference bookkeeping to Policy.
//
// Policy must provide static inc_ref(obj, ptr), dec_ref(obj, ptr) and
// ptr_moved(obj, from, to); the extra void* arguments identify the RefPtr
// instance(s) involved.
template<typename T, typename Policy = RefCountablePolicy>
struct RefPtr
{
    RefPtr() = default;
    explicit RefPtr(T* ptr) : m_ptr(ptr) { acquire(); }
    ~RefPtr() { release(); }
    RefPtr(const RefPtr& other) : m_ptr(other.m_ptr) { acquire(); }
    // Steals the reference held by other; no inc/dec is performed, the
    // policy is only told that the reference changed owner.
    RefPtr(RefPtr&& other)
        noexcept(noexcept(std::declval<RefPtr>().moved(nullptr)))
        : m_ptr(other.m_ptr) { other.m_ptr = nullptr; moved(&other); }

    RefPtr& operator=(const RefPtr& other)
    {
        if (other.m_ptr != m_ptr)
        {
            release();
            m_ptr = other.m_ptr;
            acquire();
        }
        return *this;
    }

    RefPtr& operator=(RefPtr&& other)
    {
        // Self-move must be a no-op: without this guard the reference
        // would be released (possibly deleting the object) and this
        // pointer would then null itself out.
        if (this == &other)
            return *this;

        release();
        m_ptr = other.m_ptr;
        other.m_ptr = nullptr;
        moved(&other);
        return *this;
    }

    RefPtr& operator=(T* ptr)
    {
        if (ptr != m_ptr)
        {
            release();
            m_ptr = ptr;
            acquire();
        }
        return *this;
    }

    [[gnu::always_inline]]
    T* operator->() const { return m_ptr; }
    [[gnu::always_inline]]
    T& operator*() const { return *m_ptr; }
    [[gnu::always_inline]]
    T* get() const { return m_ptr; }
    [[gnu::always_inline]]
    explicit operator bool() const { return m_ptr; }

    // Drops the current reference (if any) and acquires ptr instead.
    void reset(T* ptr = nullptr)
    {
        if (ptr == m_ptr)
            return;
        release();
        m_ptr = ptr;
        acquire();
    }

    friend bool operator==(const RefPtr& lhs, const RefPtr& rhs) { return lhs.m_ptr == rhs.m_ptr; }
    friend bool operator!=(const RefPtr& lhs, const RefPtr& rhs) { return lhs.m_ptr != rhs.m_ptr; }

private:
    T* m_ptr = nullptr;

    [[gnu::always_inline]]
    void acquire()
    {
        if (m_ptr)
            Policy::inc_ref(m_ptr, this);
    }

    [[gnu::always_inline]]
    void release()
    {
        if (m_ptr)
            Policy::dec_ref(m_ptr, this);
    }

    [[gnu::always_inline]]
    void moved(void* from)
        noexcept(noexcept(Policy::ptr_moved(nullptr, nullptr, nullptr)))
    {
        if (m_ptr)
            Policy::ptr_moved(m_ptr, from, this);
    }
};
}
#endif // ref_ptr_hh_INCLUDED

109
src/kakoune/safe_ptr.hh Normal file
View file

@ -0,0 +1,109 @@
#ifndef safe_ptr_hh_INCLUDED
#define safe_ptr_hh_INCLUDED
// #define SAFE_PTR_TRACK_CALLSTACKS
//King_DuckZ:
#include <cassert>
#define kak_assert(a) assert(a)
//#include "assert.hh"
#include "ref_ptr.hh"
#include <type_traits>
#include <utility>
#ifdef SAFE_PTR_TRACK_CALLSTACKS
#include "backtrace.hh"
#include "vector.hh"
#include <algorithm>
#endif
namespace Kakoune
{
// *** SafePtr: objects that assert nobody references them when they die ***
// When KAK_DEBUG is defined the class counts the SafePtr instances pointing
// at it; without KAK_DEBUG it has no members at all.
class SafeCountable
{
public:
#ifdef KAK_DEBUG
SafeCountable() : m_count(0) {}
// A moved-to object starts with a zero count: the source's count is not
// carried over.
SafeCountable (SafeCountable&&) : m_count(0) {}
~SafeCountable()
{
// Dying while a SafePtr still points here is the bug this class detects.
kak_assert(m_count == 0);
#ifdef SAFE_PTR_TRACK_CALLSTACKS
kak_assert(m_callstacks.empty());
#endif
}
private:
friend struct SafeCountablePolicy;
#ifdef SAFE_PTR_TRACK_CALLSTACKS
// One entry per live SafePtr: the address of the pointer object plus a
// Backtrace member (declared in backtrace.hh, not visible here).
struct Callstack
{
Callstack(void* p) : ptr(p) {}
void* ptr;
Backtrace bt;
};
mutable Vector<Callstack> m_callstacks;
#endif
// mutable so that pointers to const objects can be counted too.
mutable int m_count;
#endif
};
// RefPtr policy used by SafePtr: it only does bookkeeping and never deletes
// the pointee. All three functions are empty unless KAK_DEBUG is defined.
struct SafeCountablePolicy
{
#ifdef KAK_DEBUG
// Called when a SafePtr (at address ptr) starts pointing at sc.
static void inc_ref(const SafeCountable* sc, void* ptr) noexcept
{
++sc->m_count;
#ifdef SAFE_PTR_TRACK_CALLSTACKS
sc->m_callstacks.emplace_back(ptr);
#else
static_cast<void>(ptr);
#endif
}
// Called when a SafePtr (at address ptr) stops pointing at sc.
static void dec_ref(const SafeCountable* sc, void* ptr) noexcept
{
--sc->m_count;
kak_assert(sc->m_count >= 0);
#ifdef SAFE_PTR_TRACK_CALLSTACKS
auto it = std::find_if(sc->m_callstacks.begin(), sc->m_callstacks.end(),
[=](const SafeCountable::Callstack& cs) { return cs.ptr == ptr; });
kak_assert(it != sc->m_callstacks.end());
sc->m_callstacks.erase(it);
#else
static_cast<void>(ptr);
#endif
}
// Called when a reference moves from the SafePtr at `from` to the one at
// `to`; the count is unchanged, only the recorded pointer address is updated.
static void ptr_moved(const SafeCountable* sc, void* from, void* to) noexcept
{
#ifdef SAFE_PTR_TRACK_CALLSTACKS
auto it = std::find_if(sc->m_callstacks.begin(), sc->m_callstacks.end(),
[=](const SafeCountable::Callstack& cs) { return cs.ptr == from; });
kak_assert(it != sc->m_callstacks.end());
it->ptr = to;
#else
static_cast<void>(sc);
static_cast<void>(from);
static_cast<void>(to);
#endif
}
#else
static void inc_ref(const SafeCountable*, void*) noexcept {}
static void dec_ref(const SafeCountable*, void*) noexcept {}
static void ptr_moved(const SafeCountable*, void*, void*) noexcept {}
#endif
};
// Non-owning pointer to a SafeCountable-derived object.
template<typename T>
using SafePtr = RefPtr<T, SafeCountablePolicy>;
}
#endif // safe_ptr_hh_INCLUDED

View file

@ -27,9 +27,15 @@
#include <memory>
#include <iterator>
#include <stdexcept>
#include "scraplang.hpp"
#include "html_pool.hpp"
namespace {
void dump_string ( const std::string& parPathDest, const std::string& parData );
std::string read_all ( std::istream& parStream );
std::string read_all ( std::istream&& parStream );
void load_from_commandline ( const boost::program_options::variables_map& parVarMap );
void load_from_model ( const boost::program_options::variables_map& parVarMap );
} //unnamed namespace
int main (int argc, char* argv[]) {
@ -46,54 +52,20 @@ int main (int argc, char* argv[]) {
return 2;
}
const auto url = vm["input-url"].as<std::string>();
const auto xpath = vm["xpath"].as<std::string>();
#if !defined(NDEBUG)
std::cout << "URL : " << url << "\n";
std::cout << "XPath: " << xpath << std::endl;
std::cout << "Agent: " << vm["agent"].as<std::string>() << std::endl;
#endif
std::string html;
if ("-" != url) {
html = duck::fetch_html(url, vm["agent"].as<std::string>(), false, false);
}
else {
std::cin >> std::noskipws;
std::istream_iterator<char> it(std::cin);
std::istream_iterator<char> end;
html = std::string(it, end);
}
if (vm.count("dump-raw")) {
dump_string(vm["dump-raw"].as<std::string>(), html);
}
html = duck::clean_html(std::move(html));
if (vm.count("dump")) {
dump_string(vm["dump"].as<std::string>(), html);
}
try {
std::vector<std::string> queries;
queries.reserve(1);
queries.push_back(std::move(xpath));
auto results = duck::xpath_query(html, queries);
for (const auto& lst : results[0]) {
std::cout << lst.first << ": " << lst.second << '\n';
}
if (vm.count("model"))
load_from_model(vm);
else
load_from_commandline(vm);
}
catch (const duck::ParseError& err) {
std::cerr << err.what() << std::endl;
return 1;
}
return 0;
}
namespace {
void dump_string (const std::string& parPathDest, const std::string& parData) {
std::unique_ptr<std::ofstream> ofs;
const bool use_stdout = ("-" == parPathDest);
@ -103,4 +75,84 @@ namespace {
std::ostream* const os = (use_stdout ? &std::cout : ofs.get());
*os << parData;
}
//Reads everything left in parStream, whitespace included, and returns it
//as a single string. The skipws flag is cleared on the stream and stays
//cleared after the call.
std::string read_all (std::istream& parStream) {
    using CharIterator = std::istream_iterator<char>;

    parStream.unsetf(std::ios_base::skipws);
    std::string contents;
    contents.assign(CharIterator(parStream), CharIterator());
    return contents;
}
//Convenience overload so that temporaries (eg: a freshly opened ifstream)
//can be passed directly.
std::string read_all (std::istream&& parStream) {
    std::istream& stream_ref = parStream;
    return read_all(stream_ref);
}
//Direct mode: fetches the page at "input-url" (or reads it from stdin when
//the url is "-"), cleans it up and runs the single XPath expression given
//on the command line, printing every result as "first: second".
//The raw and cleaned HTML are dumped when "dump-raw"/"dump" are present.
void load_from_commandline (const boost::program_options::variables_map& parVarMap) {
    //The map outlives this function, so referencing the stored values
    //avoids copying them on every lookup.
    const std::string& url = parVarMap["input-url"].as<std::string>();
    const std::string& agent = parVarMap["agent"].as<std::string>();

    std::string html;
    if ("-" != url) {
        html = duck::fetch_html(url, agent, false, false);
    }
    else {
        html = read_all(std::cin);
    }
    if (parVarMap.count("dump-raw")) {
        dump_string(parVarMap["dump-raw"].as<std::string>(), html);
    }
    html = duck::clean_html(std::move(html));
    if (parVarMap.count("dump")) {
        dump_string(parVarMap["dump"].as<std::string>(), html);
    }

    //Deliberately not const: a const string cannot be moved from, and the
    //std::move() below would silently degrade to a copy.
    std::string xpath = parVarMap["xpath"].as<std::string>();
#if !defined(NDEBUG)
    std::cout << " -- XPath direct mode --\n";
    std::cout << "URL : " << url << "\n";
    std::cout << "XPath: " << xpath << std::endl;
    std::cout << "Agent: " << agent << std::endl;
#endif
    std::vector<std::string> queries;
    queries.reserve(1);
    queries.push_back(std::move(xpath));
    auto results = duck::xpath_query(html, queries);
    //Only one query was submitted, so only the first result list is printed.
    for (const auto& lst : results[0]) {
        std::cout << lst.first << ": " << lst.second << '\n';
    }
}
//Model mode: reads the scraplang script named by the "model" option,
//parses it and applies it using an HtmlPool configured with the "agent"
//option. The value returned by duck::sl::apply() is currently ignored.
void load_from_model (const boost::program_options::variables_map& parVarMap) {
    const auto& options = parVarMap;
#if !defined(NDEBUG)
    std::cout << " -- XPath model mode --\n";
    if (options.count("input-url"))
        std::cout << "URL : " << options["input-url"].as<std::string>() << "\n";
    std::cout << "Model: " << options["model"].as<std::string>() << std::endl;
    std::cout << "Agent: " << options["agent"].as<std::string>() << std::endl;
#endif
    const std::string script = read_all(
        std::ifstream(options["model"].as<std::string>())
    );
    auto ast = duck::sl::parse(script);

    //HtmlPool wants an rvalue, hence the explicit copy of the agent name.
    std::string agent_name(options["agent"].as<std::string>());
    duck::HtmlPool html_pool(std::move(agent_name));
    duck::sl::apply(ast, duck::sl::HtmlPoolBaseSP(&html_pool));

    //Previous implementation, kept for reference:
    //auto list = duck::get_xpath_definitions(*ast);
    //std::vector<std::string> expressions;
    //expressions.reserve(list.size());
    //for (duck::element_def& elem : list) {
    // expressions.push_back(std::move(elem.xpath));
    //}
    //auto results = duck::xpath_query(parXML, expressions);
    //duck::print_results(std::cout, *ast, list, results);
    //for (const auto& list : results) {
    // std::cout << "------\n";
    // for (const auto& result : list) {
    // std::cout << result.first << ": " << result.second << '\n';
    // }
    //}
}
} //unnamed namespace

25
src/scraplang.hpp Normal file
View file

@ -0,0 +1,25 @@
/* Copyright (C) 2017 Michele Santullo
*
* This file is part of DuckScraper.
*
* DuckScraper is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* DuckScraper is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef id8483FDE5CA0E4F40BDE0F4469AC2DF79
#define id8483FDE5CA0E4F40BDE0F4469AC2DF79
#include "scraplang/parse.hpp"
#include "scraplang/apply.hpp"
#endif

188
src/scraplang/apply.cpp Normal file
View file

@ -0,0 +1,188 @@
/* Copyright (C) 2015 Michele Santullo
*
* This file is part of DuckScraper.
*
* DuckScraper is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* DuckScraper is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
*/
//#define APPLY_VERBOSE
#include "apply.hpp"
#include "mstch/mstch.hpp"
#include "variables.hpp"
#include "html_pool_base.hpp"
#include "xpath_manager.hpp"
#include <boost/variant/apply_visitor.hpp>
#include <map>
#if defined(APPLY_VERBOSE)
# include <iostream>
#endif
namespace duck { namespace sl {
#if defined(APPLY_VERBOSE)
std::ostream& operator<< (std::ostream& stream, XPathElement xpath) {
stream << "xpath \"" << xpath.name << "\": \"" << xpath.xpath << "\" ";
if (xpath.def_val)
stream << "default: \"" << *xpath.def_val << '"';
else
stream << "no default";
return stream;
}
#endif
namespace {
typedef std::map<std::string, std::string> MustacheMap;
typedef Kakoune::SafePtr<Variables> VariablesSP;
typedef Kakoune::SafePtr<const Variables> VariablesCSP;
typedef Kakoune::SafePtr<XPathManager> XPathManagerSP;
typedef Kakoune::SafePtr<const XPathManager> XPathManagerCSP;
//Data collected for one ApplyBlock, consumed by apply().
struct ApplyBlockContext {
//Mustache context produced by StructVisitor for this block
mstch::map context;
//Passed to Variables::resolve_array() to obtain the sources to load
SourceInfo target;
//Passed to Variables::resolve_string(); the result is used as the key
//into the mustache templates map
std::string name;
};
//Visitor for the items of an ApplyBlock: turns XPathElement and StructBlock
//nodes into entries of a mustache context.
class StructVisitor : public boost::static_visitor<> {
public:
    //Both pointers must be non-null and must outlive the visitor.
    StructVisitor (VariablesCSP vars, XPathManagerCSP xpath_man) :
        m_variables(vars),
        m_xpath_man(xpath_man)
    {
        assert(m_variables);
        assert(m_xpath_man);
    }

    //A top-level xpath becomes a context entry keyed by its name.
    void operator() (const XPathElement& v) {
#if defined(APPLY_VERBOSE)
        std::cout << '\t' << v << '\n';
#endif
        m_context[v.name] = m_xpath_man->extract_one(v);
    }

    //A struct becomes a nested map keyed by the struct name, holding one
    //entry per xpath declared inside it.
    void operator() (const StructBlock& v) {
#if defined(APPLY_VERBOSE)
        std::cout << "\tstruct " << v.name << '\n';
        for (auto& xpath : v.xpaths) {
            std::cout << "\t\t" << xpath << '\n';
        }
#endif
        mstch::map submap;
        for (const auto& xpath : v.xpaths) {
            submap[xpath.name] = m_xpath_man->extract_one(xpath);
        }
        //The nested map used to be dropped here, so struct fields never
        //made it into the context.
        m_context[v.name] = std::move(submap);
    }

    //Hands the accumulated context over to the caller; the visitor should
    //not be used for further visits afterwards.
    mstch::map&& steal_context() { return std::move(m_context); }

private:
    VariablesCSP m_variables; //NOTE(review): stored but not read by any method visible here
    XPathManagerCSP m_xpath_man;
    mstch::map m_context;
};
//Visitor for the top-level nodes of a scraplang AST. It collects variables
//from FromBlocks, mustache templates from MustacheBlocks and one
//ApplyBlockContext per ApplyBlock.
class NodeVisitor : public boost::static_visitor<> {
public:
    //html_pool must be non-null; it is handed to the XPathManager.
    explicit NodeVisitor (HtmlPoolBaseSP html_pool) :
        m_xpath_man(html_pool)
    {
        assert(html_pool);
    }

    //Visits every child node in order.
    void operator() (const std::vector<ScrapNode>& v) {
        for (auto& itm : v) {
            boost::apply_visitor(*this, itm);
        }
    }

    //Registers every xpath of the block as a variable bound to its source.
    void operator() (const FromBlock& v) {
#if defined(APPLY_VERBOSE)
        std::cout << "--- FromBlock\n";
        std::cout << "\tsource: " << v.source.value << ", type " << static_cast<int>(v.source.type) << '\n';
#endif
        for (auto& xpath : v.xpaths) {
#if defined(APPLY_VERBOSE)
            std::cout << '\t' << xpath << '\n';
#endif
            m_variables.add_xpath(v.source, xpath);
        }
    }

    //Builds the mustache context for the block and records it together
    //with the template name and the source it applies to.
    void operator() (const ApplyBlock& v) {
#if defined(APPLY_VERBOSE)
        std::cout << "--- ApplyBlock\n";
        std::cout << "\tmustache model: " << v.mustache_model << '\n';
        std::cout << "\tsource: " << v.source.value << ", type " << static_cast<int>(v.source.type) << '\n';
#endif
        auto struct_visitor = StructVisitor(
            VariablesCSP(&m_variables),
            XPathManagerCSP(&m_xpath_man)
        );
        for (auto& xpath : v.xpaths) {
            boost::apply_visitor(struct_visitor, xpath);
        }

        ApplyBlockContext abctx;
        abctx.context = struct_visitor.steal_context();
        //apply() reads both of these; they used to be left default
        //constructed, making the template lookup use an empty name.
        //NOTE(review): assumes mustache_model is assignable to std::string
        //and source is a SourceInfo - confirm against scrap_node.hpp
        abctx.name = v.mustache_model;
        abctx.target = v.source;
        m_apply_blocks.push_back(std::move(abctx));
    }

    //Stores the template body under its name; a later block with the same
    //name replaces the earlier one.
    void operator() (const MustacheBlock& v) {
#if defined(APPLY_VERBOSE)
        std::cout << "--- MustacheBlock\n";
        std::cout << "\tname: " << v.name << '\n';
        std::cout << "\tcontent: " << v.content << '\n';
#endif
        m_mustaches[v.name] = v.content;
    }

    const MustacheMap& mustaches() const { return m_mustaches; }
    const Variables& variables() const { return m_variables; }
    const std::vector<ApplyBlockContext>& apply_blocks() const { return m_apply_blocks; }

private:
    MustacheMap m_mustaches;
    Variables m_variables;
    XPathManager m_xpath_man;
    std::vector<ApplyBlockContext> m_apply_blocks;
};
} //unnamed namespace
//Walks the AST rooted at node, then, for every apply block found, looks up
//its mustache template and loads each source it targets through html_pool.
//Skeleton: nothing is rendered yet and the returned vector is always empty.
std::vector<std::string> apply (const ScrapNode& node, HtmlPoolBaseSP html_pool) {
    assert(html_pool);

    NodeVisitor visitor(html_pool);
    boost::apply_visitor(visitor, node);

    const auto& blocks = visitor.apply_blocks();
    std::vector<std::string> retval;
    for (auto block_it = blocks.begin(); block_it != blocks.end(); ++block_it) {
        //at() throws std::out_of_range if the template was never declared
        std::string model_name = visitor.variables().resolve_string(block_it->name);
        const std::string& mustache_model = visitor.mustaches().at(model_name);
        static_cast<void>(mustache_model);

        std::vector<SourceInfo> sources = visitor.variables().resolve_array(block_it->target);
        for (const auto& source : sources) {
            const auto html_id = html_pool->GetOrAdd(visitor.variables().resolve_string(source));
            const auto* const html = html_pool->GetByID(html_id);
            static_cast<void>(html);
        }
    }
    return retval;
}
}} //namespace duck::sl

30
src/scraplang/apply.hpp Normal file
View file

@ -0,0 +1,30 @@
/* Copyright (C) 2015 Michele Santullo
*
* This file is part of DuckScraper.
*
* DuckScraper is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* DuckScraper is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef idC73DBB42FB76433BAFC0B73EAC3B70FF
#define idC73DBB42FB76433BAFC0B73EAC3B70FF
#include "scrap_node.hpp"
#include "scraplang/html_pool_base.hpp"
#include <string>
namespace duck { namespace sl {
//Processes the scraplang AST rooted at node, loading the HTML sources it
//refers to through html_pool (which must be non-null).
//NOTE(review): the implementation in apply.cpp never appends to the returned
//vector, so callers currently always get an empty result.
std::vector<std::string> apply (const ScrapNode& node, HtmlPoolBaseSP html_pool);
}} //namespace duck::sl
#endif

View file

@ -1,4 +1,4 @@
/* Copyright (C) 2015 Michele Santullo
/* Copyright (C) 2017 Michele Santullo
*
* This file is part of DuckScraper.
*
@ -19,22 +19,15 @@
#ifndef id3875B5F868524EC3A1B83971D4A85777
#define id3875B5F868524EC3A1B83971D4A85777
#include "element_types.hpp"
#include <string>
namespace duck {
enum ElementTypes {
ElementType_String,
ElementType_Integer,
ElementType_Boolean,
ElementType_Null,
ElementType_Double
};
struct element_def {
namespace duck { namespace sl {
struct ElementDef {
std::string name;
std::string xpath;
ElementTypes type;
};
} //namespace duck
}} //namespace duck::sl
#endif

View file

@ -0,0 +1,32 @@
/* Copyright (C) 2017 Michele Santullo
*
* This file is part of DuckScraper.
*
* DuckScraper is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* DuckScraper is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef id1AC876186C4B48DD900399994C27A741
#define id1AC876186C4B48DD900399994C27A741
namespace duck { namespace sl {
//Tags for the type of an element (see ElementDef::type).
//NOTE(review): presumably the type a scraped value should be interpreted as;
//confirm against the code consuming ElementDef.
enum ElementTypes {
ElementType_String,
ElementType_Integer,
ElementType_Boolean,
ElementType_Null,
ElementType_Double
};
}} //namespace duck::sl
#endif

View file

@ -0,0 +1,39 @@
/* Copyright (C) 2015 Michele Santullo
*
* This file is part of DuckScraper.
*
* DuckScraper is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* DuckScraper is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef idDD58822D7D8B4AA7A0DD16B1CDEF413E
#define idDD58822D7D8B4AA7A0DD16B1CDEF413E
#include "implem/ResourcePool.hpp"
#include "kakoune/safe_ptr.hh"
namespace duck { namespace sl {
namespace implem {
typedef duckutil::ResourcePool<std::string, std::string> HtmlPoolBase;
} //namespace implem
//Resource pool with std::string for both resource and name type that can
//also be pointed to by a Kakoune::SafePtr (see HtmlPoolBaseSP).
class HtmlPoolBase : public implem::HtmlPoolBase, public Kakoune::SafeCountable {
public:
//Re-export the constructors and assignment operators of the pool.
using implem::HtmlPoolBase::HtmlPoolBase;
using implem::HtmlPoolBase::operator=;
};
typedef Kakoune::SafePtr<HtmlPoolBase> HtmlPoolBaseSP;
}} //namespace duck::sl
#endif

View file

@ -0,0 +1,121 @@
/* Copyright (C) 2015 Michele Santullo
*
* This file is part of DuckScraper.
*
* DuckScraper is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* DuckScraper is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef id1A180A0568E84FD88D57FAB82C69600E
#define id1A180A0568E84FD88D57FAB82C69600E
#include "SaltedIndex.hpp"
#include <map>
#include <cstdint>
#include <cassert>
#include <type_traits>
#include <vector>
#include <algorithm>
namespace duckutil {
namespace Implem {
// Bookkeeping record tying a resource to its name, a reference count and
// an ID. It stores raw pointers only and never frees them; copying is
// disabled. Ordering and equality are defined by the name alone.
template <typename Res, typename Name, typename IDT>
class ResourceResNameWrapper {
public:
    using IDType = IDT;

    ResourceResNameWrapper ( const Name* parName, Res* parRes, IDType parID );
    ResourceResNameWrapper ( const ResourceResNameWrapper& ) = delete;
    // The wrapper must not be destroyed while still referenced.
    ~ResourceResNameWrapper ( void ) { assert(0 == m_refcount); }
    ResourceResNameWrapper& operator= (const ResourceResNameWrapper&) = delete;

    Res& GetResource ( void ) { return *m_resource; }
    const Res& GetResource ( void ) const { return *m_resource; }
    const Name& GetName ( void ) const { return *m_name; }

    void Retain ( void ) { ++m_refcount; }
    // Returns true when the last reference has just been dropped.
    bool Release ( void ) {
        assert(m_refcount > 0);
        return 0 == --m_refcount;
    }
    uint32_t GetRefCount ( void ) const { return m_refcount; }
    uint32_t GetResourceID ( void ) const { return m_resId; }
    bool IsEmpty ( void ) const { return nullptr == m_resource; }
    void Reset ( void ) {
        m_resource = nullptr;
        m_name = nullptr;
        m_resId = 0;
        m_refcount = 0;
    }
    void DropRefCount ( void ) { m_refcount = 0; }

    bool operator== ( const ResourceResNameWrapper& parOther ) const { return GetName() == parOther.GetName(); }
    bool operator!= ( const ResourceResNameWrapper& parOther ) const { return !(*this == parOther); }
    bool operator< ( const ResourceResNameWrapper& parOther ) const { return GetName() < parOther.GetName(); }
    bool operator> ( const ResourceResNameWrapper& parOther ) const { return parOther < *this; }
    bool operator>= ( const ResourceResNameWrapper& parOther ) const { return !(*this < parOther); }
    bool operator<= ( const ResourceResNameWrapper& parOther ) const { return !(parOther < *this); }

private:
    Res* m_resource;
    const Name* m_name;
    uint16_t m_refcount;
    uint16_t m_resId;
};
} //namespace Implem
// Abstract, non-copyable pool of resources of type Res, addressable both by
// name (Name) and by numeric ID. Derived classes supply loading, destruction
// and naming through the three pure virtual hooks.
// Object is the type a resource is requested with; it defaults to Name.
// NOTE(review): the member definitions live in ResourcePool.inl and are not
// visible here, so the exact semantics of the public methods should be
// checked there.
template <typename Res, typename Name, typename Object=Name>
class ResourcePool {
public:
typedef uint32_t IDType;
typedef Name ResourceNameType;
private:
typedef Implem::ResourceResNameWrapper<Res, Name, IDType> ResourceWrapperType;
typedef std::map<Name, ResourceWrapperType*> ResourceMapType;
typedef std::vector<ResourceWrapperType*> ResourceVectorType;
protected:
// Fundamental types are passed by value, everything else by const reference.
typedef typename std::conditional<std::is_fundamental<ResourceNameType>::value, ResourceNameType, const ResourceNameType&>::type ResourceNameParamType;
typedef typename std::conditional<std::is_fundamental<Object>::value, Object, const Object&>::type ResourceObjectParameterType;
public:
typedef Res ResourceType;
typedef Object ResourceObjectType;
ResourcePool ( void ) = default;
ResourcePool ( const ResourcePool& ) = delete;
virtual ~ResourcePool ( void ) = default;
ResourcePool& operator= (const ResourcePool&) = delete;
ResourceType* GetByName ( ResourceNameParamType parName );
const ResourceType* GetByName ( ResourceNameParamType parName ) const;
IDType GetOrAdd ( ResourceObjectParameterType parObjectName );
ResourceType* GetByID ( IDType parID );
const ResourceType* GetByID ( IDType parID ) const;
bool IsEmpty ( void ) const;
IDType AddResource ( ResourceObjectParameterType parRes );
void ReleaseResource ( IDType parRes );
void ReleaseResourceByName ( ResourceNameParamType parName );
void Dispose ( void ) noexcept;
protected:
void Dispose_IgnoreReferenceCount ( void );
// Hooks to be implemented by the concrete pool.
virtual ResourceType* OnResourceLoad ( ResourceObjectParameterType parRes ) = 0;
virtual void OnResourceDestroy ( ResourceNameParamType parName, ResourceType* parRes ) noexcept = 0;
virtual ResourceNameType GetResourceNameFromResourceObject ( ResourceObjectParameterType parRes ) = 0;
private:
bool ReleaseResource ( typename ResourceMapType::iterator& parMapRes, typename ResourceVectorType::iterator& parVecRes );
ResourceMapType m_mapContainer; //For accesses by name
ResourceVectorType m_linearContainer; //For accesses by index
};
} //namespace duckutil
#include "ResourcePool.inl"
#endif

View file

@ -0,0 +1,243 @@
/* Copyright (C) 2015 Michele Santullo
*
* This file is part of DuckScraper.
*
* DuckScraper is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* DuckScraper is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
*/
namespace duckutil {
namespace Implem {
///---------------------------------------------------------------------
///---------------------------------------------------------------------
/// Wraps parRes under the name pointed to by parName and the ID parID.
/// Both pointers must be non-null; the wrapper starts with a reference
/// count of zero.
template <typename Res, typename Name, typename IDT>
ResourceResNameWrapper<Res, Name, IDT>::ResourceResNameWrapper (const Name* parName, Res* parRes, IDType parID) {
	assert(nullptr != parRes);
	assert(nullptr != parName);

	m_name = parName;
	m_resource = parRes;
	//NOTE(review): the ID is narrowed to 16 bits here while the pool uses a
	//32 bit IDType - confirm this matches the declared width of m_resId
	m_resId = static_cast<uint16_t>(parID);
	m_refcount = 0;
}
///---------------------------------------------------------------------
///---------------------------------------------------------------------
/// Removes every trailing null pointer from parVector.
///
/// Only the run of nulls at the end is dropped: null entries that come
/// before the last non-null element are left where they are. An empty
/// vector, or one that does not end with a null, is not modified.
template <typename V>
inline void TrimTrailingNulls (V& parVector) {
	//std::bind1st was removed in C++17, a lambda does the same job
	const auto firstNonNull = std::find_if(
		parVector.rbegin(),
		parVector.rend(),
		[](const typename V::value_type& parItem) { return nullptr != parItem; }
	);
	const std::size_t nullsCount = static_cast<std::size_t>(firstNonNull - parVector.rbegin());
	assert(nullsCount <= parVector.size());
	if (nullsCount) {
		assert(nullptr == parVector.back());
		parVector.resize(parVector.size() - nullsCount);
	}
}
} //namespace Implem
///-------------------------------------------------------------------------
///-------------------------------------------------------------------------
/// Returns the ID of the resource named after parObjectName, loading it
/// through AddResource() when the pool does not hold it yet.
///
/// When the resource is already pooled its ID is derived from its position
/// in the linear container; the reference count is not touched in that case.
template <typename Res, typename Name, typename Object>
typename ResourcePool<Res, Name, Object>::IDType ResourcePool<Res, Name, Object>::GetOrAdd (ResourceObjectParameterType parObjectName) {
	const ResourceNameType resName = GetResourceNameFromResourceObject(parObjectName);
	const auto found = m_mapContainer.find(resName);
	if (m_mapContainer.end() == found)
		return AddResource(parObjectName);

	//IDs are 1-based positions in the linear container
	const auto slot = std::find(m_linearContainer.begin(), m_linearContainer.end(), found->second);
	assert(m_linearContainer.end() != slot);
	return static_cast<IDType>(slot - m_linearContainer.begin() + 1);
}
///-------------------------------------------------------------------------
///-------------------------------------------------------------------------
/// Looks a resource up by name.
///
/// Returns a pointer to the pooled resource, or nullptr when no resource
/// with that name is in the pool.
template <typename Res, typename Name, typename Object>
typename ResourcePool<Res, Name, Object>::ResourceType* ResourcePool<Res, Name, Object>::GetByName (ResourceNameParamType parName) {
	const auto itFind = m_mapContainer.find(parName);
	if (m_mapContainer.end() == itFind or nullptr == itFind->second)
		return nullptr;

	//The map stores wrappers: hand out the wrapped resource like GetByID()
	//does, a wrapper pointer is not convertible to ResourceType*
	return &itFind->second->GetResource();
}
///-------------------------------------------------------------------------
///-------------------------------------------------------------------------
/// Looks a resource up by name (read-only overload).
///
/// Returns a pointer to the pooled resource, or nullptr when no resource
/// with that name is in the pool.
template <typename Res, typename Name, typename Object>
const typename ResourcePool<Res, Name, Object>::ResourceType* ResourcePool<Res, Name, Object>::GetByName (ResourceNameParamType parName) const {
	const auto itFind = m_mapContainer.find(parName);
	if (m_mapContainer.end() == itFind or nullptr == itFind->second)
		return nullptr;

	//The map stores wrappers: hand out the wrapped resource like GetByID()
	//does, a wrapper pointer is not convertible to ResourceType*
	return &itFind->second->GetResource();
}
///-------------------------------------------------------------------------
///-------------------------------------------------------------------------
/// Looks a resource up by its 1-based ID.
///
/// Returns nullptr when parID is 0, is past the end of the pool, or refers
/// to a slot whose resource has already been released.
template <typename Res, typename Name, typename Object>
typename ResourcePool<Res, Name, Object>::ResourceType* ResourcePool<Res, Name, Object>::GetByID (IDType parID) {
	assert(parID > 0);
	if (0 == parID)
		return nullptr;

	const auto index = static_cast<std::size_t>(parID - 1);
	if (index >= m_linearContainer.size())
		return nullptr;

	//Releasing a resource leaves a null hole in the vector unless it was the
	//last element, so the slot itself has to be checked too
	ResourceWrapperType* const res = m_linearContainer[index];
	return (res ? &res->GetResource() : nullptr);
}
///-------------------------------------------------------------------------
///-------------------------------------------------------------------------
/// Looks a resource up by its 1-based ID (read-only overload).
///
/// Returns nullptr when parID is 0, is past the end of the pool, or refers
/// to a slot whose resource has already been released.
template <typename Res, typename Name, typename Object>
const typename ResourcePool<Res, Name, Object>::ResourceType* ResourcePool<Res, Name, Object>::GetByID (IDType parID) const {
	assert(parID > 0);
	if (0 == parID)
		return nullptr;

	const auto index = static_cast<std::size_t>(parID - 1);
	if (index >= m_linearContainer.size())
		return nullptr;

	//Releasing a resource leaves a null hole in the vector unless it was the
	//last element, so the slot itself has to be checked too
	ResourceWrapperType* const res = m_linearContainer[index];
	return (res ? &res->GetResource() : nullptr);
}
///-------------------------------------------------------------------------
///-------------------------------------------------------------------------
/// Adds the resource described by parRes to the pool and returns its ID.
///
/// If a resource with the same name is already pooled it is retained once
/// more and its existing ID is returned. Otherwise OnResourceLoad() is asked
/// to create it; when that yields a null pointer nothing is stored and 0 is
/// returned.
template <typename Res, typename Name, typename Object>
typename ResourcePool<Res, Name, Object>::IDType ResourcePool<Res, Name, Object>::AddResource (ResourceObjectParameterType parRes) {
	const ResourceNameType name = GetResourceNameFromResourceObject(parRes);

	const typename ResourceMapType::iterator itKnown = m_mapContainer.find(name);
	if (m_mapContainer.end() != itKnown) {
		//Already pooled: take one more reference and reuse its ID
		ResourceWrapperType* const known = itKnown->second;
		assert(nullptr != known);
		assert(not known->IsEmpty());
		known->Retain();
		return known->GetResourceID();
	}

	assert(m_mapContainer.end() == m_mapContainer.find(name));
	ResourceType* const loaded = OnResourceLoad(parRes);
	if (not loaded)
		return 0;

	//The map entry is created first because the wrapper keeps a pointer to
	//the key stored inside the map
	const std::pair<typename ResourceMapType::iterator, bool> inserted = m_mapContainer.insert(std::pair<ResourceNameType, ResourceWrapperType*>(name, nullptr));
	const IDType newID = static_cast<IDType>(m_linearContainer.size() + 1);
	ResourceWrapperType* const wrapper = new ResourceWrapperType(&inserted.first->first, loaded, newID);
	assert(nullptr != wrapper);
	m_linearContainer.push_back(wrapper);
	inserted.first->second = wrapper;
	wrapper->Retain();
	return newID;
}
///-------------------------------------------------------------------------
///-------------------------------------------------------------------------
/// Drops one reference from the resource identified by parRes.
///
/// parRes must be a valid, still occupied, 1-based ID. When the private
/// overload reports that the resource has been destroyed, its wrapper is
/// deleted, its map entry erased and trailing empty slots are trimmed.
template <typename Res, typename Name, typename Object>
void ResourcePool<Res, Name, Object>::ReleaseResource (IDType parRes) {
	assert(parRes > 0);
	assert(static_cast<std::size_t>(parRes) <= m_linearContainer.size());

	auto vecSlot = m_linearContainer.begin() + (parRes - 1);
	assert(nullptr != *vecSlot);
	assert(vecSlot - m_linearContainer.begin() == static_cast<int>(parRes - 1));

	auto mapSlot = m_mapContainer.find((*vecSlot)->GetName());
	assert(m_mapContainer.end() != mapSlot);

	if (not ReleaseResource(mapSlot, vecSlot))
		return;

	//The vector slot has been nulled already, the map still owns the wrapper
	delete mapSlot->second;
	m_mapContainer.erase(mapSlot);
	Implem::TrimTrailingNulls(m_linearContainer);
}
///-------------------------------------------------------------------------
///-------------------------------------------------------------------------
/// Drops one reference from the resource called parName.
///
/// A resource with that name must be in the pool. When the private overload
/// reports that the resource has been destroyed, its wrapper is deleted, its
/// map entry erased and trailing empty slots are trimmed.
template <typename Res, typename Name, typename Object>
void ResourcePool<Res, Name, Object>::ReleaseResourceByName (ResourceNameParamType parName) {
	typename ResourceMapType::iterator rele = m_mapContainer.find(parName);
	assert(m_mapContainer.end() != rele);
	const IDType resId = rele->second->GetResourceID();
	assert(static_cast<std::size_t>(resId) <= m_linearContainer.size());
	assert(resId > 0);

	//The private overload takes both iterators by non-const reference, so a
	//named iterator is required: a temporary cannot bind to that parameter
	typename ResourceVectorType::iterator releVec = m_linearContainer.begin() + (resId - 1);
	if (ReleaseResource(rele, releVec)) {
		delete rele->second;
		m_mapContainer.erase(rele);
		Implem::TrimTrailingNulls(m_linearContainer);
	}
}
///-------------------------------------------------------------------------
///-------------------------------------------------------------------------
/// Shared implementation of the two public release functions.
///
/// parMapRes and parVecRes must refer to the same wrapper. Release() is
/// called on it; when that returns true OnResourceDestroy() is invoked, the
/// wrapper is reset and its vector slot is set to null.
/// Returns true if the resource has been destroyed, false otherwise. The
/// wrapper object itself is not deleted here.
template <typename Res, typename Name, typename Object>
bool ResourcePool<Res, Name, Object>::ReleaseResource (typename ResourceMapType::iterator& parMapRes, typename ResourceVectorType::iterator& parVecRes) {
	assert(parMapRes->second == *parVecRes);
	assert(nullptr != *parVecRes);

	ResourceWrapperType& wrapper = **parVecRes;
	assert(not wrapper.IsEmpty());

	//An empty wrapper is left alone, and so is one that Release() says must
	//be kept
	if (wrapper.IsEmpty() or not wrapper.Release())
		return false;

	this->OnResourceDestroy(wrapper.GetName(), &wrapper.GetResource());
	wrapper.Reset();
	*parVecRes = nullptr;
	return true;
}
///-------------------------------------------------------------------------
///-------------------------------------------------------------------------
/// Tells whether the pool holds nothing at all: true only when both the
/// by-name and the by-index containers are empty.
template <typename Res, typename Name, typename Object>
bool ResourcePool<Res, Name, Object>::IsEmpty() const {
	if (not m_mapContainer.empty())
		return false;
	return m_linearContainer.empty();
}
///-------------------------------------------------------------------------
///-------------------------------------------------------------------------
/// Destroys every resource still in the pool and empties both containers.
///
/// OnResourceDestroy() is invoked for each occupied slot before its wrapper
/// is deleted; slots that are already null are skipped.
template <typename Res, typename Name, typename Object>
void ResourcePool<Res, Name, Object>::Dispose() noexcept {
	for (ResourceWrapperType* wrapper : m_linearContainer) {
		if (wrapper)
			this->OnResourceDestroy(wrapper->GetName(), &wrapper->GetResource());
		//Deleting a null slot is a no-op
		delete wrapper;
	}
	m_mapContainer.clear();
	m_linearContainer.clear();
}
///-------------------------------------------------------------------------
///-------------------------------------------------------------------------
/// Drops the reference count of every pooled resource, then disposes of the
/// whole pool through Dispose().
template <typename Res, typename Name, typename Object>
void ResourcePool<Res, Name, Object>::Dispose_IgnoreReferenceCount() {
	for (auto& currItem : m_linearContainer) {
		//Released resources leave null holes in the vector: skip them, as
		//Dispose() does, instead of dereferencing a null pointer
		if (nullptr != currItem)
			currItem->DropRefCount();
	}
	Dispose();
}
} //namespace duckutil

View file

@ -0,0 +1,68 @@
/* Copyright (C) 2015 Michele Santullo
*
* This file is part of DuckScraper.
*
* DuckScraper is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* DuckScraper is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef id8D3B62D447574A23A82F8E9C60A629BD
#define id8D3B62D447574A23A82F8E9C60A629BD
#include <cstddef>
namespace duckutil {
/// Integer of type T split into an index part of IndexBitSize bits and a
/// salt part of SaltBitSize bits.
///
/// The two parts must add up to the exact bit size of T and must both be at
/// least one bit wide. The parts can be read and written separately, or the
/// whole value can be read at once through GetSaltedIndex(); comparisons
/// are made on the whole value.
template <typename T, std::size_t IndexBitSize, std::size_t SaltBitSize=sizeof(T)*8-IndexBitSize>
class SaltedIndex {
	static_assert((SaltBitSize + IndexBitSize) == sizeof(T) * 8, "Type size is too small");
	static_assert(SaltBitSize > 0, "Invalid salt size");
	static_assert(IndexBitSize > 0, "Invalid index size");
public:
	enum {
		SaltSize = SaltBitSize,
		IndexSize = IndexBitSize,
		//The shifts are done on an unsigned 64 bit operand: "1 << N" is an
		//int expression and overflows as soon as N reaches 31, which made
		//any 64 bit T unusable
		MaxSalt = (1ull << SaltBitSize) - 1,
		MaxIndex = (1ull << IndexBitSize) - 1
	};

	SaltedIndex ( void );
	SaltedIndex ( const SaltedIndex& parOther );
	explicit SaltedIndex ( T parIndex );
	SaltedIndex ( T parSalt, T parIndex );
	~SaltedIndex ( void );

	T GetSaltOnly ( void ) const { return m_salt; }
	T GetIndexOnly ( void ) const { return m_index; }
	T GetSaltedIndex ( void ) const { return m_saltedIndex; }
	void SetSalt ( T parSalt );
	void SetIndex ( T parIndex );
	T IncreaseSalt ( void );

	bool operator== ( const SaltedIndex& parOther ) const { return GetSaltedIndex() == parOther.GetSaltedIndex(); }
	bool operator!= ( const SaltedIndex& parOther ) const { return GetSaltedIndex() != parOther.GetSaltedIndex(); }
	bool operator< ( const SaltedIndex& parOther ) const { return GetSaltedIndex() < parOther.GetSaltedIndex(); }

private:
	//The two bit-fields and the whole value share the same storage
	union {
		struct {
			T m_index : IndexBitSize;
			T m_salt : SaltBitSize;
		};
		T m_saltedIndex;
	};
};
} //namespace duckutil
#include "SaltedIndex.inl"
#endif

View file

@ -0,0 +1,86 @@
/* Copyright (C) 2015 Michele Santullo
*
* This file is part of DuckScraper.
*
* DuckScraper is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* DuckScraper is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
*/
namespace duckutil {
///-------------------------------------------------------------------------
///-------------------------------------------------------------------------
template <typename T, std::size_t IndexBitSize, std::size_t SaltBitSize>
SaltedIndex<T, IndexBitSize, SaltBitSize>::SaltedIndex() :
m_saltedIndex(0)
{
}
///-------------------------------------------------------------------------
///-------------------------------------------------------------------------
template <typename T, std::size_t IndexBitSize, std::size_t SaltBitSize>
SaltedIndex<T, IndexBitSize, SaltBitSize>::SaltedIndex (const SaltedIndex& parOther) :
m_saltedIndex(parOther.GetSaltedIndex())
{
}
///-------------------------------------------------------------------------
///-------------------------------------------------------------------------
template <typename T, std::size_t IndexBitSize, std::size_t SaltBitSize>
SaltedIndex<T, IndexBitSize, SaltBitSize>::SaltedIndex (T parIndex) :
m_saltedIndex(parIndex)
{
Assert(m_saltedIndex <= MaxIndex);
Assert(m_salt == 0);
}
///-------------------------------------------------------------------------
///-------------------------------------------------------------------------
/// Builds a value from separate salt and index parts.
///
/// Both bit-fields are initialised directly; the assertions check that each
/// argument fits in its part.
/// NOTE(review): m_index and m_salt live in an anonymous struct inside the
/// anonymous union, so initialising both here relies on compiler support
/// for that layout - confirm on every compiler the project targets.
template <typename T, std::size_t IndexBitSize, std::size_t SaltBitSize>
SaltedIndex<T, IndexBitSize, SaltBitSize>::SaltedIndex (T parSalt, T parIndex) :
m_index(parIndex),
m_salt(parSalt)
{
Assert(parSalt <= MaxSalt);
Assert(parIndex <= MaxIndex);
}
///-------------------------------------------------------------------------
///-------------------------------------------------------------------------
/// Nothing to release: the class only holds an integer value.
template <typename T, std::size_t IndexBitSize, std::size_t SaltBitSize>
SaltedIndex<T, IndexBitSize, SaltBitSize>::~SaltedIndex() = default;
///-------------------------------------------------------------------------
///-------------------------------------------------------------------------
/// Replaces the salt part, leaving the index part as it is.
/// parSalt must not exceed MaxSalt.
template <typename T, std::size_t IndexBitSize, std::size_t SaltBitSize>
void SaltedIndex<T, IndexBitSize, SaltBitSize>::SetSalt (T parSalt) {
	Assert(parSalt <= MaxSalt);
	this->m_salt = parSalt;
}
///-------------------------------------------------------------------------
///-------------------------------------------------------------------------
/// Replaces the index part, leaving the salt part as it is.
/// parIndex must not exceed MaxIndex.
template <typename T, std::size_t IndexBitSize, std::size_t SaltBitSize>
void SaltedIndex<T, IndexBitSize, SaltBitSize>::SetIndex (T parIndex) {
	Assert(parIndex <= MaxIndex);
	this->m_index = parIndex;
}
///-------------------------------------------------------------------------
///-------------------------------------------------------------------------
/// Increments the salt part by one and returns the new salt.
/// The salt must be below MaxSalt before the call.
template <typename T, std::size_t IndexBitSize, std::size_t SaltBitSize>
T SaltedIndex<T, IndexBitSize, SaltBitSize>::IncreaseSalt() {
	Assert(m_salt < MaxSalt);
	++m_salt;
	//The function is declared as returning T: flowing off its end without a
	//return statement is undefined behaviour
	return m_salt;
}
} //namespace duckutil

168
src/scraplang/parse.cpp Normal file
View file

@ -0,0 +1,168 @@
/* Copyright (C) 2017 Michele Santullo
*
* This file is part of DuckScraper.
*
* DuckScraper is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* DuckScraper is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
*/
#include "parse.hpp"
#include "element_def.hpp"
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix_stl.hpp>
#include <boost/spirit/include/phoenix_fusion.hpp>
#include <boost/fusion/adapted/struct.hpp>
#include <boost/fusion/adapted/std_pair.hpp>
#include <utility>
#if !defined(NDEBUG)
# include <iostream>
#endif
#include <boost/variant/apply_visitor.hpp>
#include <stdexcept>
namespace qi = boost::spirit::qi;
namespace sp = boost::spirit;
//Fusion adaptations of the AST structs declared in scrap_node.hpp.
//Each one lists the members of the struct, with their types, in the order
//in which they are declared there.

//SourceInfo: the source text followed by its kind (URL or token)
BOOST_FUSION_ADAPT_STRUCT(
duck::sl::SourceInfo,
(std::string, value)
(duck::sl::SourceInfo::Type, type)
)
//FromBlock: the source followed by its xpath assignments
BOOST_FUSION_ADAPT_STRUCT(
duck::sl::FromBlock,
(duck::sl::SourceInfo, source)
(std::vector<duck::sl::XPathElement>, xpaths)
)
//StructBlock: the struct name followed by its xpath assignments
BOOST_FUSION_ADAPT_STRUCT(
duck::sl::StructBlock,
(std::string, name)
(std::vector<duck::sl::XPathElement>, xpaths)
)
//ApplyBlock: mustache model name, source, then xpath assignments or structs
BOOST_FUSION_ADAPT_STRUCT(
duck::sl::ApplyBlock,
(std::string, mustache_model)
(duck::sl::SourceInfo, source)
(std::vector<duck::sl::StructItem>, xpaths)
)
//MustacheBlock: the block name followed by its raw content
BOOST_FUSION_ADAPT_STRUCT(
duck::sl::MustacheBlock,
(std::string, name)
(std::string, content)
)
//XPathElement: name, optional default value, xpath expression
BOOST_FUSION_ADAPT_STRUCT(
duck::sl::XPathElement,
(std::string, name)
(std::optional<std::string>, def_val)
(std::string, xpath)
)
namespace duck { namespace sl {
namespace {
//Symbol table mapping the type keywords to their ElementTypes value.
//It is not referenced by ScrapGrammar below.
struct ElementTypeSymbol : qi::symbols<char, ElementTypes> {
	ElementTypeSymbol() {
		add("string", ElementType_String);
		add("integer", ElementType_Integer);
		add("boolean", ElementType_Boolean);
		add("null", ElementType_Null);
		add("double", ElementType_Double);
	}
};
//Spirit.Qi grammar for scraplang scripts.
//I is the iterator type of the input and Skipper the skip parser type; the
//synthesized attribute is the list of top-level nodes of the script.
//Line ends are matched explicitly by the rules through eol.
template <typename I, typename Skipper>
class ScrapGrammar : public qi::grammar<I, std::vector<ScrapNode>(), Skipper> {
public:
ScrapGrammar() : ScrapGrammar::base_type(start) {
using qi::char_;
using qi::lexeme;
using qi::alpha;
using qi::alnum;
using qi::graph;
using qi::attr;
using qi::eol;
using qi::eoi;
using qi::lit;
using qi::string;
using qi::as_string;
using qi::no_skip;
//A script is a list of blocks separated by one or more line ends, with
//optional line ends before and after, and it must reach the end of input
start = *eol >> (from_block | apply_block | mustache_block) % +eol >> *eol >> eoi;
//"from <source>", then one xpath assignment per line, closed by "end"
from_block = lit("from") >> source_info >> +eol >> (xpath_assignment % +eol) >> +eol >> "end";
//A source is either a URL or a {{token}}; attr() records which one matched
source_info = (url >> attr(SourceInfo::URL)) | (mustache_like_token >> attr(SourceInfo::Token));
//Optional "scheme://" followed by a letter and any printable characters
//NOTE(review): this rule is not wrapped in lexeme[], so the skipper runs
//between the characters it matches - confirm that is intended
url = -(+alpha >> string("://")) >> alpha >> *graph;
//An identifier enclosed in double curly braces
mustache_like_token = "{{" >> identifier >> "}}";
//Non-empty text in double quotes; the quotes are not part of the attribute
quoted_string %= lexeme['"' >> +(char_ - '"') >> '"'];
//name [default("value")] = xpath
//NOTE(review): the trailing +graph is not in lexeme[] either - confirm how
//xpath expressions containing blanks are expected to be read
xpath_assignment = identifier >> -(lit("default") >> '(' >> quoted_string >> ')') >> "=" >> +graph;
//A letter or underscore followed by letters, digits or underscores
identifier %= lexeme[(alpha | char_('_')) >> *(alnum | char_('_'))];
//"apply {{model}} to <source>", then xpath assignments or struct blocks,
//one per line, closed by "end"
apply_block = lit("apply") >> mustache_like_token >> "to" >> source_info >> +eol >>
((xpath_assignment | struct_block) % +eol) >> +eol >> "end";
//"struct <name>", then one xpath assignment per line, closed by "end"
struct_block = "struct" >> identifier >> +eol >> (xpath_assignment % +eol) >> +eol >> "end";
//"==<name>" on its own line, then raw text taken without skipping up to
//the closing "==end"
mustache_block %= as_string[lit("==") >> identifier] >> eol >>
as_string[no_skip[+(!lit("==end") >> char_)]] >> "==end";
}
private:
//Shorthand for a rule of this grammar with signature F
template <typename F>
using RuleType = qi::rule<I, F, Skipper>;
RuleType<std::vector<ScrapNode>()> start;
RuleType<FromBlock()> from_block;
RuleType<std::string()> url;
RuleType<std::string()> mustache_like_token;
RuleType<std::string()> quoted_string;
RuleType<XPathElement()> xpath_assignment;
RuleType<std::string()> identifier;
RuleType<SourceInfo()> source_info;
RuleType<ApplyBlock()> apply_block;
RuleType<StructBlock()> struct_block;
RuleType<MustacheBlock()> mustache_block;
};
} //unnamed namespace
//Parses the scraplang script in parData and returns its top-level nodes.
//Blanks are skipped between tokens. Throws std::runtime_error when the
//grammar does not match or when part of the input is left unparsed.
std::vector<ScrapNode> parse (const std::string& parData) {
	ScrapGrammar<std::string::const_iterator, boost::spirit::qi::ascii::blank_type> gramm;
	auto it_start = parData.cbegin();
	std::vector<ScrapNode> retval;
	const bool ok = qi::phrase_parse(it_start, parData.cend(), gramm, sp::ascii::blank, retval);

	//<iostream> is only included when NDEBUG is not defined, so the
	//diagnostics have to be guarded in the same way or release builds break
#if !defined(NDEBUG)
	std::cout << "parse ok: " << std::boolalpha << ok << '\n';
	std::cout << "end == it: " << std::boolalpha << (parData.cend() == it_start) << '\n';
	std::cout << "begin == it: " << std::boolalpha << (parData.cbegin() == it_start) << '\n';
	std::cout << "parse distance: " << std::distance(parData.cbegin(), it_start) << '\n';
	std::cout << "all distance: " << std::distance(parData.cbegin(), parData.cend()) << " (size: " << parData.size() << ")\n";
#endif

	//A partial match is an error too: the whole script must be consumed
	if (parData.cend() != it_start or not ok) {
		throw std::runtime_error("Error parsing input script");
	}
	return retval;
}
// std::vector<element_def> get_xpath_definitions (const ScrapNode& parAST) {
// std::vector<element_def> retval;
// implem::XPathVisitor xpath_vis(&retval);
// boost::apply_visitor(xpath_vis, parAST);
// return std::move(retval);
// }
//
// void print_results (std::ostream& parOut, const ScrapNode& parAST, const std::vector<element_def>& parOutcome, const ResultList& parResList) {
//#if !defined(NDEBUG)
// std::cout << "print_results()...\n";
//#endif
// implem::ResultPrinter printer(&parOut, &parOutcome, & parResList);
// boost::apply_visitor(printer, parAST);
// }
}} //namespace duck::sl

30
src/scraplang/parse.hpp Normal file
View file

@ -0,0 +1,30 @@
/* Copyright (C) 2017 Michele Santullo
*
* This file is part of DuckScraper.
*
* DuckScraper is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* DuckScraper is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef idBE96C2D49C4C413888A79EAEB2B9C0FA
#define idBE96C2D49C4C413888A79EAEB2B9C0FA
#include "scrap_node.hpp"
#include <string>
namespace duck { namespace sl {
//Parses the scraplang script in parData and returns its top-level nodes.
//Throws std::runtime_error when the script cannot be parsed completely
//(see parse.cpp).
std::vector<ScrapNode> parse ( const std::string& parData );
//std::vector<element_def> get_xpath_definitions ( const ScrapNode& parAST );
}} //namespace duck::sl
#endif

View file

@ -0,0 +1,94 @@
/* Copyright (C) 2015 Michele Santullo
*
* This file is part of DuckScraper.
*
* DuckScraper is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* DuckScraper is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef id9919CCB09DDD429C8128632F13D370ED
#define id9919CCB09DDD429C8128632F13D370ED
//#include "element_def.hpp"
#include <boost/spirit/include/support_extended_variant.hpp>
#include <string>
#include <vector>
#include <map>
#include <optional>
#include <utility>
namespace duck { namespace sl {
//One "name = xpath" assignment of a script.
//The member order is mirrored by the fusion adaptation in parse.cpp.
struct XPathElement {
std::string name; //identifier the xpath result is assigned to
std::optional<std::string> def_val; //value of the optional default("...") clause
std::string xpath; //xpath expression on the right of '='
};
//Source a block reads from: either a URL or the name of a {{token}}.
//The member order is mirrored by the fusion adaptation in parse.cpp.
struct SourceInfo {
	enum Type { URL, Token };

	SourceInfo() = default;
	//Building a SourceInfo from a plain string marks it as a token
	SourceInfo (std::string&& val) : value(std::move(val)), type(Token) {}
	SourceInfo (const std::string& val) : value(val), type(Token) {}

	std::string value; //the URL text or the token name
	//Initialised explicitly so that a default-initialised object never holds
	//an indeterminate type; URL is what value-initialisation already gave
	Type type = URL;
};
//A "from <source> ... end" block: a source and its xpath assignments.
struct FromBlock {
SourceInfo source;
std::vector<XPathElement> xpaths;
};
//A "struct <name> ... end" block: a name and its xpath assignments.
struct StructBlock {
std::string name;
std::vector<XPathElement> xpaths;
};
//One entry of an apply block: either a single xpath assignment or a
//nested struct block.
struct StructItem : boost::spirit::extended_variant<
XPathElement,
StructBlock
> {
StructItem() : base_type() {}
StructItem (const XPathElement& value) : base_type(value) {}
StructItem (const StructBlock& value) : base_type(value) {}
using base_type::operator=;
};
//An "apply {{model}} to <source> ... end" block.
struct ApplyBlock {
std::string mustache_model; //identifier found inside the {{...}} after "apply"
SourceInfo source;
std::vector<StructItem> xpaths; //xpath assignments and struct blocks, in script order
};
//A "==<name> ... ==end" block holding raw text.
struct MustacheBlock {
std::string name; //identifier following the opening "=="
std::string content; //text between the opening line and "==end"
};
//Node of the scraplang AST: a list of nodes, a from block, an apply block
//or a mustache block. The list alternative goes through recursive_wrapper
//because it refers to ScrapNode itself.
struct ScrapNode : boost::spirit::extended_variant<
boost::recursive_wrapper<std::vector<ScrapNode>>,
FromBlock,
ApplyBlock,
MustacheBlock
> {
ScrapNode() : base_type() {}
ScrapNode (const std::vector<ScrapNode>& value) : base_type(value) {}
ScrapNode (const FromBlock& value) : base_type(value) {}
ScrapNode (const ApplyBlock& value) : base_type(value) {}
ScrapNode (const MustacheBlock& value) : base_type(value) {}
using base_type::operator=;
};
}} //namespace duck::sl
#endif

View file

@ -1,75 +0,0 @@
/* Copyright (C) 2015 Michele Santullo
*
* This file is part of DuckScraper.
*
* DuckScraper is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* DuckScraper is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef id9919CCB09DDD429C8128632F13D370ED
#define id9919CCB09DDD429C8128632F13D370ED
#include "scraplang_element.hpp"
#include <boost/spirit/include/support_extended_variant.hpp>
#include <string>
#include <vector>
#include <map>
namespace duck {
struct ScrapNode;
namespace implem {
//map and array hold elements and element can hold either of them, hence
//the forward declarations and the recursive_wrapper alternatives
struct map;
struct array;
//A value: a map, an array, a string, an int or a double
struct element : boost::spirit::extended_variant<
boost::recursive_wrapper<map>,
boost::recursive_wrapper<array>,
std::string,
int,
double
>
{
element ( void ) = default;
element ( const map& parOther ) : base_type(parOther) {}
element ( const array& parOther ) : base_type(parOther) {}
element ( const std::string& parOther ) : base_type(parOther) {}
element ( double parOther ) : base_type(parOther) {}
element ( int parOther ) : base_type(parOther) {}
};
//String-keyed collection of elements
struct map : std::map<std::string, element> {
};
//Sequence of elements
struct array : std::vector<element> {
};
//Sequence of AST nodes
struct node_list {
std::vector<ScrapNode> nodes;
};
} //namespace implem
//Node of the AST: an element definition, a map or a list of nodes
struct ScrapNode : boost::spirit::extended_variant<
element_def,
implem::map,
implem::node_list
>
{
ScrapNode ( void ) = default;
ScrapNode ( const element_def& parOther ) : base_type(parOther) {}
ScrapNode ( const implem::map& parOther ) : base_type(parOther) {}
ScrapNode ( const implem::node_list& parOther ) : base_type(parOther) {}
};
} //namespace duck
#endif

View file

@ -1,119 +0,0 @@
/* Copyright (C) 2015 Michele Santullo
*
* This file is part of DuckScraper.
*
* DuckScraper is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* DuckScraper is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
*/
#include "scraplang.hpp"
#include "scrapast.hpp"
#include "scraplang_visit_xpath.hpp"
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix_stl.hpp>
#include <boost/spirit/include/phoenix_fusion.hpp>
#include <boost/fusion/adapted/struct.hpp>
#include <boost/fusion/adapted/std_pair.hpp>
#include <utility>
#include <boost/variant/apply_visitor.hpp>
namespace qi = boost::spirit::qi;
namespace sp = boost::spirit;
//Fusion adaptations of the structs the grammar below builds.

//element_def: name, xpath expression and type
BOOST_FUSION_ADAPT_STRUCT(
duck::element_def,
(std::string, name)
(std::string, xpath)
(duck::ElementTypes, type)
)
//node_list: its only member, the list of nodes
BOOST_FUSION_ADAPT_STRUCT(
duck::implem::node_list,
(std::vector<duck::ScrapNode>, nodes)
)
namespace duck {
namespace {
//Symbol table mapping the type keywords that follow "as" in an xpath
//definition to their ElementTypes value.
struct ElementTypeSymbol : qi::symbols<char, ElementTypes> {
	ElementTypeSymbol() {
		add("string", ElementType_String);
		add("integer", ElementType_Integer);
		add("boolean", ElementType_Boolean);
		add("null", ElementType_Null);
		add("double", ElementType_Double);
	}
};
//Spirit.Qi grammar for the previous scraplang syntax.
//I is the iterator type of the input; white space is skipped with
//ascii::space and the synthesized attribute is a single ScrapNode.
template <typename I>
struct ScrapGrammar : qi::grammar<I, ScrapNode(), sp::ascii::space_type> {
ScrapGrammar() : ScrapGrammar::base_type(start) {
using qi::lit;
using qi::char_;
using qi::lexeme;
using qi::double_;
using qi::int_;
using qi::eps;
start = whole;
//Any number of xpath definitions, optionally followed by a map
whole = eps >> *xpath_definition >> -map;
//name = "xpath" as type
xpath_definition = identifier >> lit('=') >> string >> "as" >> data_type;
//A letter or underscore followed by letters, digits or underscores
identifier = (char_('a', 'z') | char_('A', 'Z') | '_') >> *(char_('a', 'z') | char_('A', 'Z') | '_' | char_('0', '9'));
//Non-empty text in double quotes; the quotes are not part of the attribute
string %= lexeme['"' >> +(char_ - '"') >> '"'];
//{ name = value, ... } with at least one entry
map = lit('{') >> ((identifier >> lit('=') >> value) % lit(',')) >> lit('}');
//[ value, ... ]
array = lit('[') >> *(value % lit(',')) >> lit(']');
//NOTE(review): double_ is tried before int_ - confirm whether int_ can
//ever be the alternative that matches
value = string | double_ | int_ | array | map | identifier;
}
qi::rule<I, ScrapNode(), sp::ascii::space_type> start;
qi::rule<I, implem::node_list(), sp::ascii::space_type> whole;
qi::rule<I, element_def(), sp::ascii::space_type> xpath_definition;
qi::rule<I, std::string(), sp::ascii::space_type> identifier;
qi::rule<I, std::string(), sp::ascii::space_type> string;
qi::rule<I, implem::map(), sp::ascii::space_type> map;
qi::rule<I, implem::array(), sp::ascii::space_type> array;
qi::rule<I, implem::element(), sp::ascii::space_type> value;
ElementTypeSymbol data_type;
};
} //unnamed namespace
//Parses parData with ScrapGrammar and returns the resulting AST.
//NOTE(review): the outcome of phrase_parse is not checked, so the node is
//returned even if the input did not match - confirm callers expect that.
ScrapNodePtr parse_scraplang (const std::string& parData) {
	ScrapGrammar<std::string::const_iterator> gramm;
	ScrapNodePtr retval(new ScrapNode);
	auto it_start = parData.cbegin();
	qi::phrase_parse(it_start, parData.cend(), gramm, sp::ascii::space, *retval);
	//A local is moved implicitly when returned; an explicit std::move would
	//only prevent copy elision
	return retval;
}
//Visits parAST with an XPathVisitor and returns the element definitions
//the visitor stored in the vector it was given.
std::vector<element_def> get_xpath_definitions (const ScrapNode& parAST) {
	std::vector<element_def> retval;
	implem::XPathVisitor xpath_vis(&retval);
	boost::apply_visitor(xpath_vis, parAST);
	//A local is moved implicitly when returned; an explicit std::move would
	//only prevent copy elision
	return retval;
}
//Stores parPtr in m_ptr.
//NOTE(review): presumably m_ptr takes ownership of the node - confirm
//against the declaration of ScrapNodePtr.
ScrapNodePtr::ScrapNodePtr (ScrapNode* parPtr) :
m_ptr(parPtr)
{
}
//Move constructor: m_ptr is move-constructed from parOther's m_ptr.
ScrapNodePtr::ScrapNodePtr (ScrapNodePtr&& parOther) :
m_ptr(std::move(parOther.m_ptr))
{
}
//Empty body: any clean-up is left to the destructors of the members.
ScrapNodePtr::~ScrapNodePtr() noexcept {
}
} //namespace duck

View file

@ -0,0 +1,29 @@
#ifndef idB20734D678524FAA8AC94F2AB2FDAA94
#define idB20734D678524FAA8AC94F2AB2FDAA94
#include "scrapast.hpp"
#include <vector>
namespace duck {
	//Nested lists of string pairs, handed to ResultPrinter as its results
	typedef std::vector<std::vector<std::pair<std::string, std::string>>> ResultList;
	//Former, misspelled name of ResultList, kept so that code written
	//against it still compiles
	typedef ResultList ResulList;

	struct element_def;

	namespace implem {
		//Visitor over the AST node alternatives (element_def, map,
		//node_list). It keeps non-owning pointers to the queries and to the
		//results it is constructed with.
		class ResultPrinter {
		public:
			typedef void result_type;

			explicit ResultPrinter ( const std::vector<element_def>* parQueries, const ResultList* parResults );

			void operator() ( const element_def& parElem );
			void operator() ( const implem::map& parMap );
			void operator() ( const node_list& parNodes );

		private:
			const std::vector<element_def>* const m_queries;
			const ResultList* const m_results;
		};
	} //namespace implem
} //namespace duck
#endif

View file

@ -0,0 +1,43 @@
/* Copyright (C) 2015 Michele Santullo
*
* This file is part of DuckScraper.
*
* DuckScraper is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* DuckScraper is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
*/
#include "variables.hpp"
#include "scrap_node.hpp"
namespace duck::sl {
	//Special members are compiler-generated, but defined in this translation
	//unit rather than in the header.
	Variables::Variables() = default;
	Variables::~Variables() = default;

	//Not implemented yet: trips an assertion when assertions are enabled,
	//otherwise falls through and returns an empty string.
	std::string Variables::resolve_string (const SourceInfo& parName) const {
		assert(false);
		return {};
	}

	//Not implemented yet: trips an assertion when assertions are enabled,
	//otherwise falls through and returns an empty vector.
	std::vector<SourceInfo> Variables::resolve_array (const SourceInfo& parName) const {
		assert(false);
		return {};
	}

	//Not implemented yet: only asserts.
	void Variables::add_xpath (const SourceInfo& parSource, const XPathElement& parVal) {
		assert(false);
	}

	//Not implemented yet: only asserts.
	void Variables::add_struct (const SourceInfo& parSource, const StructBlock& parVal) {
		assert(false);
	}
} //namespace duck::sl

View file

@ -0,0 +1,49 @@
/* Copyright (C) 2015 Michele Santullo
*
* This file is part of DuckScraper.
*
* DuckScraper is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* DuckScraper is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef id700FA165E5194907867EB4C02C4C1385
#define id700FA165E5194907867EB4C02C4C1385
#include "mstch/mstch.hpp"
#include "kakoune/safe_ptr.hh"
#include <string>
#include <map>
#include <vector>
namespace duck::sl {
	//Only used by reference in the declarations below, so forward
	//declarations are enough in this header.
	struct XPathElement;
	struct StructBlock;
	struct SourceInfo;

	//Interface for resolving scraplang names to values and for registering
	//xpath and struct entries. Derives from Kakoune::SafeCountable.
	class Variables : public Kakoune::SafeCountable {
	public:
		Variables();
		~Variables();

		//Lookups by name: one yields a single string, the other a list of
		//SourceInfo items.
		std::string resolve_string (const SourceInfo& parName) const;
		std::vector<SourceInfo> resolve_array (const SourceInfo& parName) const;

		//Registration of new entries, each identified by a SourceInfo.
		void add_xpath (const SourceInfo& parSource, const XPathElement& parVal);
		void add_struct (const SourceInfo& parSource, const StructBlock& parVal);

	private:
		//TODO: storage is still missing; a std::map keyed by std::string was
		//started here but never completed.
	};
} //namespace duck::sl
#endif

View file

@ -0,0 +1,34 @@
/* Copyright (C) 2015 Michele Santullo
*
* This file is part of DuckScraper.
*
* DuckScraper is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* DuckScraper is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
*/
#include "xpath_manager.hpp"
#include "scrap_node.hpp"
namespace duck { namespace sl {
XPathManager::XPathManager (HtmlPoolBaseSP parHtmlPool) :
m_html_pool(parHtmlPool)
{
}
std::string XPathManager::extract_one (
const XPathElement& parXPath
) const {
assert(false);
return std::string();
}
}} //namespace duck::sl

View file

@ -0,0 +1,43 @@
/* Copyright (C) 2015 Michele Santullo
*
* This file is part of DuckScraper.
*
* DuckScraper is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* DuckScraper is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef id69826186710D4048BF6810202EDF310D
#define id69826186710D4048BF6810202EDF310D
#include "scraplang/html_pool_base.hpp"
#include "scraplang/scrap_node.hpp"
#include "kakoune/safe_ptr.hh"
#include <string>
namespace duck::sl {
	struct XPathElement;

	//Holds an html pool handle and exposes xpath extraction on top of it.
	//Derives from Kakoune::SafeCountable.
	class XPathManager : public Kakoune::SafeCountable {
	public:
		//parHtmlPool: handle kept in m_html_pool.
		explicit XPathManager (HtmlPoolBaseSP parHtmlPool);
		~XPathManager() = default;

		//Takes one xpath element and returns a string for it.
		std::string extract_one (const XPathElement& parXPath) const;

		//Accessor returning an HtmlPoolBaseSP by value.
		HtmlPoolBaseSP html_pool() const;

	private:
		HtmlPoolBaseSP m_html_pool;
	};
} //namespace duck::sl
#endif