Compare commits
41 commits
Author | SHA1 | Date | |
---|---|---|---|
fa08abd00d | |||
d64a4af105 | |||
9cd2608406 | |||
5a9e4e09a4 | |||
329ccef6ef | |||
4958a83ddb | |||
830ab42c49 | |||
32f87e5185 | |||
b79d758e8e | |||
b536026f58 | |||
55eb7c1fc0 | |||
bdb858de5a | |||
6e35c880a4 | |||
33866b3d6b | |||
54ac44b81d | |||
3dcbd48067 | |||
5de2dfbe70 | |||
d97cf03a34 | |||
7170347969 | |||
60d6c2cb61 | |||
430886085c | |||
9dba8043f1 | |||
494364c22e | |||
5d2c5863a5 | |||
b028e8c492 | |||
a6916f6179 | |||
1d750ad2f9 | |||
76f403b3ce | |||
84a599e771 | |||
79ac7534f2 | |||
a9ff092401 | |||
8d2c9f9013 | |||
b39621ea51 | |||
6dffe9b848 | |||
41bb315b02 | |||
2fd4daf52c | |||
26b912d66c | |||
3572803f66 | |||
fcb25ed456 | |||
29f8fe299e | |||
f0e7a1d136 |
48 changed files with 2997 additions and 346 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -1 +1,3 @@
|
||||||
build/
|
build/
|
||||||
|
tags
|
||||||
|
compile_commands.json
|
||||||
|
|
3
.gitmodules
vendored
3
.gitmodules
vendored
|
@ -4,3 +4,6 @@
|
||||||
[submodule "lib/tidy"]
|
[submodule "lib/tidy"]
|
||||||
path = lib/tidy
|
path = lib/tidy
|
||||||
url = https://github.com/htacg/tidy-html5.git
|
url = https://github.com/htacg/tidy-html5.git
|
||||||
|
[submodule "lib/mstch"]
|
||||||
|
path = lib/mstch
|
||||||
|
url = https://github.com/KingDuckZ/mstch.git
|
||||||
|
|
|
@ -5,11 +5,15 @@ project(duckscraper VERSION 0.2.1 LANGUAGES CXX)
|
||||||
option(BUILD_SHARED_TIDY "Wheter you want to build tidy-html5 as a shared library" OFF)
|
option(BUILD_SHARED_TIDY "Wheter you want to build tidy-html5 as a shared library" OFF)
|
||||||
|
|
||||||
include(GetGitRevisionDescription)
|
include(GetGitRevisionDescription)
|
||||||
find_package(PugiXML REQUIRED)
|
|
||||||
find_package(Boost 1.32.0 COMPONENTS program_options)
|
find_package(Boost 1.32.0 COMPONENTS program_options)
|
||||||
|
find_package(XQilla 2.3.3 REQUIRED)
|
||||||
|
find_package(Iconv REQUIRED)
|
||||||
|
|
||||||
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -std=c++11 -Wall -Wextra -g -O0 -fno-omit-frame-pointer")
|
set(CMAKE_CXX_STANDARD 17)
|
||||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -std=c++11 -Wall -Wextra -g -O3 -fomit-frame-pointer")
|
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||||
|
|
||||||
|
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wall -Wextra -g -O0 -fno-omit-frame-pointer")
|
||||||
|
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Wall -Wextra -g -O3 -fomit-frame-pointer")
|
||||||
|
|
||||||
set(DEFAULT_USER_AGENT "DuckScraper")
|
set(DEFAULT_USER_AGENT "DuckScraper")
|
||||||
set(PROJECT_VERSION_BETA "1")
|
set(PROJECT_VERSION_BETA "1")
|
||||||
|
@ -20,23 +24,30 @@ configure_file(
|
||||||
"${PROJECT_BINARY_DIR}/${PROJECT_NAME}Config.h"
|
"${PROJECT_BINARY_DIR}/${PROJECT_NAME}Config.h"
|
||||||
)
|
)
|
||||||
|
|
||||||
include_directories(SYSTEM
|
|
||||||
lib/tidy/include
|
|
||||||
${PUGIXML_INCLUDE_DIR}
|
|
||||||
lib/curlcpp/include
|
|
||||||
${Boost_INCLUDE_DIRS}
|
|
||||||
)
|
|
||||||
include_directories(
|
|
||||||
src/
|
|
||||||
"${PROJECT_BINARY_DIR}"
|
|
||||||
)
|
|
||||||
|
|
||||||
add_executable(${PROJECT_NAME}
|
add_executable(${PROJECT_NAME}
|
||||||
src/main.cpp
|
src/main.cpp
|
||||||
|
src/html_pool.cpp
|
||||||
src/htmlretrieve.cpp
|
src/htmlretrieve.cpp
|
||||||
src/commandline.cpp
|
src/commandline.cpp
|
||||||
src/scraplang/scraplang.cpp
|
src/scraplang/parse_exports.cpp
|
||||||
|
src/scraplang/parse.cpp
|
||||||
|
src/scraplang/apply.cpp
|
||||||
|
src/scraplang/xpath_runner.cpp
|
||||||
src/xpath.cpp
|
src/xpath.cpp
|
||||||
|
src/read_all.cpp
|
||||||
|
src/iconv_wrapper.cpp
|
||||||
|
)
|
||||||
|
|
||||||
|
target_include_directories(${PROJECT_NAME} SYSTEM
|
||||||
|
PRIVATE lib/tidy/include
|
||||||
|
PRIVATE ${PUGIXML_INCLUDE_DIR}
|
||||||
|
PRIVATE lib/curlcpp/include
|
||||||
|
PRIVATE ${Boost_INCLUDE_DIRS}
|
||||||
|
PRIVATE lib/mstch/include
|
||||||
|
)
|
||||||
|
target_include_directories(${PROJECT_NAME}
|
||||||
|
PRIVATE src/
|
||||||
|
PRIVATE "${PROJECT_BINARY_DIR}"
|
||||||
)
|
)
|
||||||
|
|
||||||
if (BUILD_SHARED_TIDY)
|
if (BUILD_SHARED_TIDY)
|
||||||
|
@ -46,10 +57,16 @@ else(BUILD_SHARED_TIDY)
|
||||||
endif(BUILD_SHARED_TIDY)
|
endif(BUILD_SHARED_TIDY)
|
||||||
|
|
||||||
target_link_libraries(${PROJECT_NAME}
|
target_link_libraries(${PROJECT_NAME}
|
||||||
${TIDY_LIB}
|
PRIVATE ${TIDY_LIB}
|
||||||
${PUGIXML_LIBRARIES}
|
PRIVATE ${PUGIXML_LIBRARIES}
|
||||||
curlcpp
|
PRIVATE curlcpp
|
||||||
${Boost_LIBRARIES}
|
PRIVATE ${Boost_LIBRARIES}
|
||||||
|
PRIVATE mstch
|
||||||
|
PRIVATE XQilla::XQilla
|
||||||
|
)
|
||||||
|
|
||||||
|
target_compile_definitions(${PROJECT_NAME}
|
||||||
|
PRIVATE $<$<CONFIG:DEBUG>:KAK_DEBUG>
|
||||||
)
|
)
|
||||||
|
|
||||||
#unset those variables so cmake files from dependencies won't complain about
|
#unset those variables so cmake files from dependencies won't complain about
|
||||||
|
@ -62,3 +79,4 @@ unset(PROJECT_VERSION)
|
||||||
set(BUILD_SHARED_LIB ${BUILD_SHARED_TIDY}) #for tidy
|
set(BUILD_SHARED_LIB ${BUILD_SHARED_TIDY}) #for tidy
|
||||||
add_subdirectory(lib/tidy)
|
add_subdirectory(lib/tidy)
|
||||||
add_subdirectory(lib/curlcpp)
|
add_subdirectory(lib/curlcpp)
|
||||||
|
add_subdirectory(lib/mstch)
|
||||||
|
|
28
cmake/Modules/FindXQilla.cmake
Normal file
28
cmake/Modules/FindXQilla.cmake
Normal file
|
@ -0,0 +1,28 @@
|
||||||
|
# Find the XQilla library
# originally taken from
# https://github.com/rug-compling/alpinocorpus/blob/master/cmake/FindXQilla.cmake
#
# Result variables:
#   XQilla_FOUND / XQILLA_FOUND - true when both the header and the library
#                                 were located
#   XQILLA_INCLUDE_DIR          - directory containing xqilla/xqilla-simple.hpp
#   XQILLA_LIBRARY              - full path to the xqilla library
#   XQILLA_LIBRARIES            - same as XQILLA_LIBRARY (legacy variable)
#
# Imported target:
#   XQilla::XQilla              - carries the include directory and links
#                                 XercesC::XercesC transitively
#
# NOTE(review): no VERSION_VAR is passed to
# find_package_handle_standard_args(), so a version requested by the caller
# (e.g. find_package(XQilla 2.3.3)) is reported but not verified.

find_path(XQILLA_INCLUDE_DIR NAMES xqilla/xqilla-simple.hpp)
find_library(XQILLA_LIBRARY NAMES xqilla)

include(FindPackageHandleStandardArgs)
# The first argument must match the name used in find_package(XQilla ...);
# a mismatching name ("XQILLA") makes newer CMake versions emit a warning.
find_package_handle_standard_args(
  XQilla
  REQUIRED_VARS
    XQILLA_INCLUDE_DIR
    XQILLA_LIBRARY
)

set(XQILLA_LIBRARIES ${XQILLA_LIBRARY})
mark_as_advanced(XQILLA_INCLUDE_DIR XQILLA_LIBRARY)

if(XQilla_FOUND)
  # XQilla is built on top of Xerces-C, so consumers always need it as well.
  find_package(XercesC REQUIRED)

  if(NOT TARGET XQilla::XQilla)
    add_library(XQilla::XQilla UNKNOWN IMPORTED)
    set_target_properties(XQilla::XQilla PROPERTIES
      INTERFACE_INCLUDE_DIRECTORIES "${XQILLA_INCLUDE_DIR}"
      IMPORTED_LOCATION "${XQILLA_LIBRARY}"
      INTERFACE_LINK_LIBRARIES XercesC::XercesC
    )
  endif()
endif()
|
|
@ -1 +1 @@
|
||||||
Subproject commit 194fdb0ced92a993a60cd0810610845a12023e82
|
Subproject commit 0c2f06df81c6cb24fad11fd12d69a2dd19360285
|
1
lib/mstch
Submodule
1
lib/mstch
Submodule
|
@ -0,0 +1 @@
|
||||||
|
Subproject commit 45122d1d515c90a54d509d4b2d8d9279348518f5
|
2
lib/tidy
2
lib/tidy
|
@ -1 +1 @@
|
||||||
Subproject commit 67192ba77e539539d15cc716303ac686bacddd61
|
Subproject commit d1b906991a7587688d384b648c55731f9be52506
|
94
map_form.txt
Normal file
94
map_form.txt
Normal file
|
@ -0,0 +1,94 @@
|
||||||
|
```
|
||||||
|
apply {{mustache_name}} to {{pages}}
|
||||||
|
A = /html/head/text()
|
||||||
|
struct B
|
||||||
|
C default("n/a") = //table[@class="wikitable sortable"]/tr/td[4]/a/text()
|
||||||
|
D default("0") = //table[@class="wikitable sortable"]/tr/td[3]/text()
|
||||||
|
struct E
|
||||||
|
F = /html/head/inner_names/text()
|
||||||
|
G = /html/head/inner_probabilities/text()
|
||||||
|
end
|
||||||
|
H = /html/head/inner_names/text()
|
||||||
|
end
|
||||||
|
I = /html/head/inner_names/text()
|
||||||
|
end
|
||||||
|
|
||||||
|
==mustache_name
|
||||||
|
blah
|
||||||
|
==end
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
The above should result in the following:
|
||||||
|
|
||||||
|
```
|
||||||
|
A[]
|
||||||
|
B[] --- C
|
||||||
|
--- D
|
||||||
|
--- E[] --- F
|
||||||
|
--- G
|
||||||
|
--- H
|
||||||
|
--- I[]
|
||||||
|
```
|
||||||
|
|
||||||
|
For example, given these query results:
|
||||||
|
|
||||||
|
```
|
||||||
|
A[] = {a1, a2, a3}
|
||||||
|
C = c1
|
||||||
|
D[] = {d1, d2}
|
||||||
|
F[] = {f1, f2, f3}
|
||||||
|
G = g1
|
||||||
|
H = h1
|
||||||
|
I = i1
|
||||||
|
```
|
||||||
|
|
||||||
|
then the complete result in tree form shall be:
|
||||||
|
|
||||||
|
```
|
||||||
|
{
|
||||||
|
A => [a1, a2, a3],
|
||||||
|
B => [
|
||||||
|
{
|
||||||
|
C => c1,
|
||||||
|
D => d1,
|
||||||
|
E => [
|
||||||
|
{
|
||||||
|
F => f1,
|
||||||
|
G => g1
|
||||||
|
}, {
|
||||||
|
F => f2,
|
||||||
|
G => ""
|
||||||
|
}, {
|
||||||
|
F => f3,
|
||||||
|
G => ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
H => h1
|
||||||
|
}, {
|
||||||
|
C => "",
|
||||||
|
D => d2,
|
||||||
|
E => [
|
||||||
|
{
|
||||||
|
F => f1,
|
||||||
|
G => g1
|
||||||
|
}, {
|
||||||
|
F => f2,
|
||||||
|
G => ""
|
||||||
|
}, {
|
||||||
|
F => f3,
|
||||||
|
G => ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
H => ""
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Please note that:
|
||||||
|
|
||||||
|
* arrays inside a struct turn the struct itself into an array, while its items become just single item values
|
||||||
|
* if a struct contains no arrays, then the struct shall not become an array; otherwise the struct generates an array with as many elements as the largest array in the struct itself
|
||||||
|
* there are as many of any one struct as the size of the largest array inside it
|
||||||
|
* nested structs get duplicated in every outer struct they are part of; in the example above E has as many elements as there are items in F (3, the larger of F and G), and the whole array of E is duplicated in every element of B
|
17
sample.scrap
Normal file
17
sample.scrap
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
from http://sid-story.wikia.com/wiki/Album
|
||||||
|
pages = //section/header/h2/a/@href
|
||||||
|
end
|
||||||
|
|
||||||
|
apply {{test_mustache}} to {{pages}}
|
||||||
|
struct paragraphs
|
||||||
|
paragraph = //section/header/h2/a/text()
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
==test_mustache
|
||||||
|
Paragraphs: {{#paragraphs}}
|
||||||
|
- {{paragraph}}
|
||||||
|
{{/paragraphs}}
|
||||||
|
|
||||||
|
kthx bye!
|
||||||
|
==end
|
|
@ -47,11 +47,13 @@ namespace duck {
|
||||||
("help,h", "Produces this help message")
|
("help,h", "Produces this help message")
|
||||||
("version", "Prints the program's version and quits")
|
("version", "Prints the program's version and quits")
|
||||||
("dump,d", po::value<std::string>(), "Cleans the retrieved html and saves it to the named file; use - for stdout")
|
("dump,d", po::value<std::string>(), "Cleans the retrieved html and saves it to the named file; use - for stdout")
|
||||||
("dump-raw,D", po::value<std::string>(), "Saves the retrieved html to the named file; use - for stdout")
|
|
||||||
;
|
;
|
||||||
po::options_description query_options("Query options");
|
po::options_description query_options("Query options");
|
||||||
query_options.add_options()
|
query_options.add_options()
|
||||||
("agent", po::value<std::string>()->default_value(DEFAULT_USER_AGENT), "User agent that will be passed to the server")
|
("agent", po::value<std::string>()->default_value(DEFAULT_USER_AGENT), "User agent that will be passed to the server")
|
||||||
|
("model,m", po::value<std::string>(), "Read XPath expressions from the specified file instead of command line")
|
||||||
|
("from-code,f", po::value<std::string>()->default_value(""), "Force source charset to this, disregard any charset reported by the server")
|
||||||
|
("namespace,n", po::value<std::string>()->default_value("http://www.w3.org/1999/xhtml"), "Default namespace for XPath queries (try empty string if you get no results)")
|
||||||
;
|
;
|
||||||
po::options_description positional_options("Positional options");
|
po::options_description positional_options("Positional options");
|
||||||
positional_options.add_options()
|
positional_options.add_options()
|
||||||
|
@ -86,6 +88,7 @@ namespace duck {
|
||||||
std::cout << "redistribute it under certain conditions.\n"; //type `show c' for details.
|
std::cout << "redistribute it under certain conditions.\n"; //type `show c' for details.
|
||||||
std::cout << '\n';
|
std::cout << '\n';
|
||||||
std::cout << "Usage: " << PROGRAM_NAME << " [options...] <url> <xpath>\n";
|
std::cout << "Usage: " << PROGRAM_NAME << " [options...] <url> <xpath>\n";
|
||||||
|
std::cout << " " << PROGRAM_NAME << " [options...] --model <path> <url>\n";
|
||||||
std::cout << "You can pass - as the url to read from stdin\n";
|
std::cout << "You can pass - as the url to read from stdin\n";
|
||||||
std::cout << visible;
|
std::cout << visible;
|
||||||
return true;
|
return true;
|
||||||
|
@ -96,11 +99,14 @@ namespace duck {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (parVarMap.count("input-url") == 0) {
|
if (parVarMap.count("input-url") == 0 and parVarMap.count("model") == 0) {
|
||||||
throw std::invalid_argument("No input URL specified");
|
throw std::invalid_argument("No input URL specified");
|
||||||
}
|
}
|
||||||
if (parVarMap.count("xpath") == 0) {
|
if (not (parVarMap.count("xpath") or parVarMap.count("model"))) {
|
||||||
throw std::invalid_argument("No XPath expression specified");
|
throw std::invalid_argument("No XPath expression specified and no input model given");
|
||||||
|
}
|
||||||
|
else if (parVarMap.count("xpath") and parVarMap.count("model")) {
|
||||||
|
throw std::invalid_argument("Received both model and XPath expression, but only one of the two is allowed");
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
61
src/html_pool.cpp
Normal file
61
src/html_pool.cpp
Normal file
|
@ -0,0 +1,61 @@
|
||||||
|
/* Copyright (C) 2015 Michele Santullo
|
||||||
|
*
|
||||||
|
* This file is part of DuckScraper.
|
||||||
|
*
|
||||||
|
* DuckScraper is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* DuckScraper is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "html_pool.hpp"
|
||||||
|
#include "htmlretrieve.hpp"
|
||||||
|
#include "read_all.hpp"
|
||||||
|
#include <string>
|
||||||
|
#include <memory>
|
||||||
|
#include <utility>
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
namespace duck {
|
||||||
|
HtmlPool::HtmlPool (std::string&& agent_name, std::string&& src_charset) :
|
||||||
|
m_agent(std::move(agent_name)),
|
||||||
|
m_src_charset(std::move(src_charset)),
|
||||||
|
m_iconv(m_src_charset, "utf-8", IconvWrapper::ModeIgnore)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
HtmlPool::~HtmlPool() noexcept = default;
|
||||||
|
|
||||||
|
auto HtmlPool::OnResourceLoad (ResourceObjectParameterType parRes) -> ResourceType* {
|
||||||
|
std::cout << "OnResourceLoad(): fetching html from \"" << parRes << "\"\n";
|
||||||
|
|
||||||
|
std::unique_ptr<std::string> utf8_html;
|
||||||
|
if (parRes == "-") {
|
||||||
|
utf8_html = std::make_unique<std::string>(read_all(std::cin));
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
utf8_html = std::make_unique<std::string>(
|
||||||
|
fetch_html(parRes, m_agent, false, false)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
*utf8_html = duck::clean_html(m_iconv.conv_char(*utf8_html), "utf-8");
|
||||||
|
return utf8_html.release();
|
||||||
|
}
|
||||||
|
|
||||||
|
void HtmlPool::OnResourceDestroy (ResourceNameParamType, ResourceType* parRes) noexcept {
|
||||||
|
delete parRes;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto HtmlPool::GetResourceNameFromResourceObject (ResourceObjectParameterType parRes) -> ResourceNameType {
|
||||||
|
return parRes;
|
||||||
|
}
|
||||||
|
} //namespace duck
|
46
src/html_pool.hpp
Normal file
46
src/html_pool.hpp
Normal file
|
@ -0,0 +1,46 @@
|
||||||
|
/* Copyright (C) 2015 Michele Santullo
|
||||||
|
*
|
||||||
|
* This file is part of DuckScraper.
|
||||||
|
*
|
||||||
|
* DuckScraper is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* DuckScraper is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef idCDCACC393BE24CBD94A3B5E2985984A3
|
||||||
|
#define idCDCACC393BE24CBD94A3B5E2985984A3
|
||||||
|
|
||||||
|
#include "scraplang/html_pool_base.hpp"
|
||||||
|
#include "iconv_wrapper.hpp"
|
||||||
|
|
||||||
|
namespace duck {
|
||||||
|
class HtmlPool : public ::duck::sl::HtmlPoolBase {
|
||||||
|
typedef ::duck::sl::HtmlPoolBase::ResourceType ResourceType;
|
||||||
|
typedef ::duck::sl::HtmlPoolBase::ResourceNameType ResourceNameType;
|
||||||
|
typedef ::duck::sl::HtmlPoolBase::ResourceObjectParameterType ResourceObjectParameterType;
|
||||||
|
typedef ::duck::sl::HtmlPoolBase::ResourceNameParamType ResourceNameParamType;
|
||||||
|
|
||||||
|
virtual ResourceType* OnResourceLoad (ResourceObjectParameterType parRes);
|
||||||
|
virtual void OnResourceDestroy (ResourceNameParamType parName, ResourceType* parRes) noexcept;
|
||||||
|
virtual ResourceNameType GetResourceNameFromResourceObject (ResourceObjectParameterType parRes);
|
||||||
|
|
||||||
|
std::string m_agent;
|
||||||
|
std::string m_src_charset;
|
||||||
|
IconvWrapper m_iconv;
|
||||||
|
|
||||||
|
public:
|
||||||
|
HtmlPool(std::string&& agent_name, std::string&& src_charset);
|
||||||
|
~HtmlPool() noexcept;
|
||||||
|
};
|
||||||
|
} //namespace duck
|
||||||
|
|
||||||
|
#endif
|
|
@ -1,4 +1,4 @@
|
||||||
/* Copyright (C) 2015 Michele Santullo
|
/* Copyright (C) 2015-2020 Michele Santullo
|
||||||
*
|
*
|
||||||
* This file is part of DuckScraper.
|
* This file is part of DuckScraper.
|
||||||
*
|
*
|
||||||
|
@ -28,24 +28,51 @@
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
|
#include <cctype>
|
||||||
|
#include <iterator>
|
||||||
|
|
||||||
namespace duck {
|
namespace duck {
|
||||||
namespace {
|
namespace {
|
||||||
void dropScriptTags (std::string& html) {
|
std::string make_lowercase (std::string_view in) {
|
||||||
size_t open_index = 0;
|
std::string out;
|
||||||
const std::string open_tag("<script");
|
std::transform(in.begin(), in.end(), std::back_inserter(out), [](unsigned char c){return std::tolower(c);});
|
||||||
const std::string close_tag("</script>");
|
return out;
|
||||||
|
|
||||||
while (html.npos != (open_index = html.find(open_tag, open_index))) {
|
|
||||||
assert(open_index < html.size());
|
|
||||||
auto close_index = html.find(close_tag, open_index + open_tag.size());
|
|
||||||
if (close_index == html.npos)
|
|
||||||
close_index = html.size();
|
|
||||||
html.erase(open_index, std::min(html.size(), close_index + close_tag.size()) - open_index);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool isHttps (const std::string& parUrl) {
|
// Maps a charset name (compared case-insensitively) to the matching Tidy
// encoding constant. Throws std::runtime_error when the name is not one of
// the entries in the table below.
TidyEncodingOptions charset_to_enum (std::string_view name) {
	struct CharsetEntry {
		const char* label;
		TidyEncodingOptions encoding;
	};

	// TidyEncLatin0 and TidyEncLatin1 are intentionally absent: no charset
	// name has been assigned to them yet.
	static const CharsetEntry known_charsets[] = {
		{"ascii", TidyEncAscii},
		{"utf-8", TidyEncUtf8},
#ifndef NO_NATIVE_ISO2022_SUPPORT
		{"iso-2022-cn", TidyEncIso2022},
#endif
		{"mac", TidyEncMac},
		{"windows-1252", TidyEncWin1252},
		{"ibm858", TidyEncIbm858},
		{"utf-16le", TidyEncUtf16le},
		{"utf-16be", TidyEncUtf16be},
		{"utf-16", TidyEncUtf16},
		{"big-5", TidyEncBig5},
		{"shift-jis", TidyEncShiftjis},
		{"shift_jis", TidyEncShiftjis}
	};

	const std::string wanted = make_lowercase(name);
	for (const CharsetEntry& entry : known_charsets) {
		if (wanted == entry.label)
			return entry.encoding;
	}

	throw std::runtime_error("Failed to recognise \"" + std::string(name) + "\" as a valid source charset");
}
|
||||||
|
|
||||||
|
bool isHttps (const std::string_view& parUrl) {
|
||||||
const char protocol[] = "https://";
|
const char protocol[] = "https://";
|
||||||
const size_t protocolLen = sizeof(protocol) / sizeof(protocol[0]) - 1;
|
const size_t protocolLen = sizeof(protocol) / sizeof(protocol[0]) - 1;
|
||||||
if (parUrl.size() < protocolLen)
|
if (parUrl.size() < protocolLen)
|
||||||
|
@ -55,8 +82,7 @@ namespace duck {
|
||||||
}
|
}
|
||||||
} //unnamed namespace
|
} //unnamed namespace
|
||||||
|
|
||||||
std::string clean_html (std::string&& html) {
|
std::string clean_html (std::string&& html, OptString src_charset) {
|
||||||
dropScriptTags(html);
|
|
||||||
|
|
||||||
// Initialize a Tidy document
|
// Initialize a Tidy document
|
||||||
TidyDoc tidyDoc = tidyCreate();
|
TidyDoc tidyDoc = tidyCreate();
|
||||||
|
@ -68,8 +94,21 @@ namespace duck {
|
||||||
&& tidyOptSetBool(tidyDoc, TidyQuiet, yes)
|
&& tidyOptSetBool(tidyDoc, TidyQuiet, yes)
|
||||||
&& tidyOptSetBool(tidyDoc, TidyNumEntities, yes)
|
&& tidyOptSetBool(tidyDoc, TidyNumEntities, yes)
|
||||||
&& tidyOptSetBool(tidyDoc, TidyShowWarnings, no)
|
&& tidyOptSetBool(tidyDoc, TidyShowWarnings, no)
|
||||||
|
&& tidyOptSetBool(tidyDoc, TidyEscapeScripts, yes)
|
||||||
|
&& tidyOptSetInt(tidyDoc, TidyNewline, TidyLF)
|
||||||
|
&& tidyOptSetBool(tidyDoc,TidyFixUri, yes)
|
||||||
|
&& tidyOptSetBool(tidyDoc, TidyHideComments, yes)
|
||||||
|
&& tidyOptSetBool(tidyDoc, TidyPPrintTabs, yes)
|
||||||
|
&& tidyOptSetBool(tidyDoc, TidyLiteralAttribs, no)
|
||||||
|
&& tidyOptSetBool(tidyDoc, TidyPunctWrap, no)
|
||||||
|
&& tidyOptSetInt(tidyDoc, TidyOutCharEncoding, TidyEncUtf8)
|
||||||
|
&& tidyOptSetBool(tidyDoc, TidyMetaCharset, yes)
|
||||||
|
&& tidyOptSetValue(tidyDoc, TidyDoctype, "omit")
|
||||||
&& tidyOptSetBool(tidyDoc, TidyDropPropAttrs, yes);
|
&& tidyOptSetBool(tidyDoc, TidyDropPropAttrs, yes);
|
||||||
|
|
||||||
|
if (src_charset)
|
||||||
|
tidyOptSetInt(tidyDoc, TidyInCharEncoding, charset_to_enum(*src_charset));
|
||||||
|
|
||||||
int tidyResponseCode = -1;
|
int tidyResponseCode = -1;
|
||||||
|
|
||||||
// Parse input
|
// Parse input
|
||||||
|
@ -103,14 +142,15 @@ namespace duck {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
std::string fetch_html (const std::string& parSource, std::string parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost) {
|
std::string fetch_html (const std::string_view& parSource, std::string parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost) {
|
||||||
using curl::curl_easy;
|
using curl::curl_easy;
|
||||||
using curl::curl_pair;
|
using curl::curl_pair;
|
||||||
|
using curl::curl_ios;
|
||||||
|
|
||||||
std::ostringstream oss;
|
std::ostringstream oss;
|
||||||
curl_writer wr(oss);
|
curl_ios<std::ostream> wr(oss);
|
||||||
curl_easy easy(wr);
|
curl_easy easy(wr);
|
||||||
easy.add(curl_pair<CURLoption, std::string>(CURLOPT_URL, parSource));
|
easy.add(curl_pair<CURLoption, std::string>(CURLOPT_URL, std::string(parSource)));
|
||||||
if (isHttps(parSource)) {
|
if (isHttps(parSource)) {
|
||||||
easy.add(curl_pair<CURLoption, bool>(CURLOPT_SSL_VERIFYPEER, parSslVerifyPeer));
|
easy.add(curl_pair<CURLoption, bool>(CURLOPT_SSL_VERIFYPEER, parSslVerifyPeer));
|
||||||
easy.add(curl_pair<CURLoption, bool>(CURLOPT_SSL_VERIFYHOST, parSslVerifyHost));
|
easy.add(curl_pair<CURLoption, bool>(CURLOPT_SSL_VERIFYHOST, parSslVerifyHost));
|
||||||
|
@ -118,6 +158,9 @@ namespace duck {
|
||||||
easy.add(curl_pair<CURLoption, std::string>(CURLOPT_USERAGENT, parUserAgent));
|
easy.add(curl_pair<CURLoption, std::string>(CURLOPT_USERAGENT, parUserAgent));
|
||||||
easy.add(curl_pair<CURLoption, long>(CURLOPT_FOLLOWLOCATION, 1L));
|
easy.add(curl_pair<CURLoption, long>(CURLOPT_FOLLOWLOCATION, 1L));
|
||||||
|
|
||||||
|
easy.add(curl_pair<CURLoption, std::string>(CURLOPT_ACCEPT_ENCODING, "gzip"));
|
||||||
|
easy.add(curl_pair<CURLoption, long>(CURLOPT_HTTP_CONTENT_DECODING, 1L));
|
||||||
|
|
||||||
//try {
|
//try {
|
||||||
easy.perform();
|
easy.perform();
|
||||||
//}
|
//}
|
||||||
|
@ -127,6 +170,7 @@ namespace duck {
|
||||||
//return 1;
|
//return 1;
|
||||||
//}
|
//}
|
||||||
|
|
||||||
|
//return FetchedHtml(oss.str(), easy.get_info<CURLINFO_CONTENT_TYPE>().get());
|
||||||
return oss.str();
|
return oss.str();
|
||||||
}
|
}
|
||||||
} //namespace duck
|
} //namespace duck
|
||||||
|
|
|
@ -20,10 +20,14 @@
|
||||||
#define idC6776D903059465191FFB64FCFD6B86A
|
#define idC6776D903059465191FFB64FCFD6B86A
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <string_view>
|
||||||
|
#include <optional>
|
||||||
|
|
||||||
namespace duck {
|
namespace duck {
|
||||||
std::string fetch_html ( const std::string& parSource, std::string parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost );
|
typedef std::optional<std::string_view> OptString;
|
||||||
std::string clean_html ( std::string&& html );
|
|
||||||
|
std::string fetch_html ( const std::string_view& parSource, std::string parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost );
|
||||||
|
std::string clean_html ( std::string&& html, OptString src_charset );
|
||||||
} //namespace duck
|
} //namespace duck
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
143
src/iconv_wrapper.cpp
Normal file
143
src/iconv_wrapper.cpp
Normal file
|
@ -0,0 +1,143 @@
|
||||||
|
/* Copyright (C) 2015 Michele Santullo
|
||||||
|
*
|
||||||
|
* This file is part of DuckScraper.
|
||||||
|
*
|
||||||
|
* DuckScraper is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* DuckScraper is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "iconv_wrapper.hpp"
|
||||||
|
#include <iconv.h>
|
||||||
|
#include <cassert>
|
||||||
|
#include <algorithm>
|
||||||
|
|
||||||
|
namespace duck {
|
||||||
|
namespace {
	// Deleter for std::unique_ptr that owns an iconv_t conversion descriptor.
	// Both the null handle and the (iconv_t)-1 error value returned by a
	// failed iconv_open() are left untouched.
	class IconvDeleter {
	public:
		using pointer = iconv_t;

		void operator() (pointer& resource) {
			const bool is_null = not resource;
			const bool is_invalid = (reinterpret_cast<iconv_t>(-1) == resource);
			if (is_null or is_invalid)
				return;

			iconv_close(resource);
			resource = static_cast<iconv_t>(0);
		}
	};
} //unnamed namespace
|
||||||
|
|
||||||
|
typedef std::unique_ptr<iconv_t, IconvDeleter> UniqueIconv;
|
||||||
|
|
||||||
|
IconvBadSequence::IconvBadSequence (const std::string& message) :
|
||||||
|
std::domain_error(message)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
IconvOpenFailure::IconvOpenFailure (const std::string& message) :
|
||||||
|
std::logic_error(message)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
struct IconvWrapper::LocalData {
|
||||||
|
UniqueIconv context;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Opens an iconv conversion descriptor from charset `from` to charset `to`.
// ModeIgnore appends "//IGNORE" and ModeTransliterate appends "//TRANSLIT"
// to the target charset; any other mode leaves it unchanged.
// Throws IconvOpenFailure when iconv_open() reports an error.
IconvWrapper::IconvWrapper(const std::string& from, std::string to, Mode mode) :
	m_local(std::make_unique<LocalData>())
{
	switch (mode) {
	case ModeIgnore:
		to += "//IGNORE";
		break;
	case ModeTransliterate:
		to += "//TRANSLIT";
		break;
	case ModeDefault:
	default:
		break;
	}

	const iconv_t handle = iconv_open(to.c_str(), from.c_str());
	// Save errno right away: the string building below may allocate and is
	// allowed to change errno before it gets reported.
	const int open_errno = errno;
	m_local->context.reset(handle);

	if (reinterpret_cast<iconv_t>(-1) == handle) {
		auto msg = std::string("Failed to create an iconv context for \"") +
			from + "\" to \"" + to + "\" conversion (error code " +
			std::to_string(open_errno);
		if (EINVAL == open_errno)
			msg += " EINVAL";
		msg += ")";

		throw IconvOpenFailure(msg);
	}
}

IconvWrapper::~IconvWrapper() noexcept = default;
|
||||||
|
|
||||||
|
void IconvWrapper::conv (
|
||||||
|
const char* buff,
|
||||||
|
std::size_t len,
|
||||||
|
PtrGetterFunc get_ptr,
|
||||||
|
SizeGetterFunc get_size,
|
||||||
|
ReallocFunc realloc,
|
||||||
|
std::size_t grow_hint
|
||||||
|
) {
|
||||||
|
assert(buff);
|
||||||
|
assert(len);
|
||||||
|
|
||||||
|
const constexpr std::size_t def_inc = 16;
|
||||||
|
const constexpr std::size_t iconv_err = static_cast<std::size_t>(-1);
|
||||||
|
|
||||||
|
std::size_t nchars;
|
||||||
|
std::size_t inbytesleft = len;
|
||||||
|
char* inbuff = const_cast<char*>(buff);
|
||||||
|
std::ptrdiff_t out_offset = 0;
|
||||||
|
std::size_t grow_factor = grow_hint;
|
||||||
|
std::size_t outbytesleft;
|
||||||
|
|
||||||
|
do {
|
||||||
|
realloc(std::max(inbytesleft * grow_factor, def_inc) + get_size());
|
||||||
|
assert(get_size() > static_cast<std::size_t>(out_offset));
|
||||||
|
outbytesleft = get_size() - out_offset;
|
||||||
|
char* outbuff = get_ptr() + out_offset;
|
||||||
|
|
||||||
|
const auto old_inbytesleft = inbytesleft;
|
||||||
|
const auto old_outbytesleft = outbytesleft;
|
||||||
|
|
||||||
|
nchars = ::iconv(
|
||||||
|
m_local->context.get(),
|
||||||
|
&inbuff,
|
||||||
|
&inbytesleft,
|
||||||
|
&outbuff,
|
||||||
|
&outbytesleft
|
||||||
|
);
|
||||||
|
if (iconv_err == nchars) {
|
||||||
|
const auto pos_str = std::to_string(len - inbytesleft);
|
||||||
|
switch (errno) {
|
||||||
|
case EILSEQ:
|
||||||
|
throw IconvBadSequence("Invalid input multibyte sequence at byte " + pos_str);
|
||||||
|
case EINVAL:
|
||||||
|
throw IconvBadSequence("Incomplete input multibyte sequence at byte " + pos_str);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
out_offset = std::distance(get_ptr(), outbuff);
|
||||||
|
assert(out_offset >= 0);
|
||||||
|
|
||||||
|
const auto in_diff = old_inbytesleft - inbytesleft;
|
||||||
|
const auto out_diff = old_outbytesleft - outbytesleft;
|
||||||
|
grow_factor = std::max<std::size_t>(1, out_diff / in_diff);
|
||||||
|
} while (iconv_err == nchars and E2BIG == errno);
|
||||||
|
|
||||||
|
assert(outbytesleft < get_size());
|
||||||
|
realloc(get_size() - outbytesleft);
|
||||||
|
}
|
||||||
|
} //namespace duck
|
87
src/iconv_wrapper.hpp
Normal file
87
src/iconv_wrapper.hpp
Normal file
|
@ -0,0 +1,87 @@
|
||||||
|
/* Copyright (C) 2015 Michele Santullo
|
||||||
|
*
|
||||||
|
* This file is part of DuckScraper.
|
||||||
|
*
|
||||||
|
* DuckScraper is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* DuckScraper is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <memory>
|
||||||
|
#include <string>
|
||||||
|
#include <string_view>
|
||||||
|
#include <functional>
|
||||||
|
#include <stdexcept>
|
||||||
|
|
||||||
|
namespace duck {
	// Raised by the byte-level conversion when iconv reports an invalid
	// (EILSEQ) or an incomplete (EINVAL) multibyte sequence in the input.
	class IconvBadSequence : public std::domain_error {
	public:
		explicit IconvBadSequence(const std::string& message);
	};

	// NOTE(review): presumably raised by the IconvWrapper constructor when the
	// conversion descriptor cannot be opened - confirm in iconv_wrapper.cpp.
	class IconvOpenFailure : public std::logic_error {
	public:
		explicit IconvOpenFailure(const std::string& message);
	};

	// Converts text between two encodings. The typed conv<>() front-end
	// adapts a std::basic_string to the untyped, callback-based conv() that
	// does the real work on raw bytes.
	class IconvWrapper {
		// Callbacks through which the byte-level conv() reaches the output
		// buffer without knowing its character type.
		using PtrGetterFunc = std::function<char*()>;
		using ReallocFunc = std::function<void(std::size_t)>;
		using SizeGetterFunc = std::function<std::size_t()>;

	public:
		// NOTE(review): the names suggest iconv's //TRANSLIT and //IGNORE
		// suffixes; the mapping is not visible here - confirm.
		enum Mode {
			ModeTransliterate,
			ModeIgnore,
			ModeDefault
		};

		IconvWrapper (const std::string& from, std::string to, Mode mode=ModeDefault);
		~IconvWrapper() noexcept;

		// Converts `text` (characters of type CIn) and returns the result as
		// a string of COut characters.
		template <typename CIn, typename COut>
		std::basic_string<COut> conv (std::basic_string_view<CIn> text);

		// Shorthand for the char -> char case.
		std::string conv_char (std::string_view text) {
			return conv<char, char>(text);
		}

	private:
		struct LocalData;

		// Byte-level conversion of `len` bytes at `buff`. The output buffer
		// is accessed only through the three callbacks (sizes in bytes);
		// `grow_hint` seeds the estimate of output bytes per input byte.
		void conv (
			const char* buff,
			std::size_t len,
			PtrGetterFunc get_ptr,
			SizeGetterFunc get_size,
			ReallocFunc realloc,
			std::size_t grow_hint
		);

		std::unique_ptr<LocalData> m_local;
	};

	template <typename CIn, typename COut>
	std::basic_string<COut> IconvWrapper::conv (std::basic_string_view<CIn> text) {
		using out_string = std::basic_string<COut>;

		out_string converted;
		// Nothing to convert: hand back an empty string without touching iconv.
		if (text.empty())
			return converted;

		// The byte-level routine sees the output string as a resizable byte
		// buffer; these adapters translate between bytes and COut units.
		const PtrGetterFunc out_ptr = [&converted]() {
			return reinterpret_cast<char*>(converted.data());
		};
		const SizeGetterFunc out_size = [&converted]() -> std::size_t {
			return converted.size() * sizeof(COut);
		};
		const ReallocFunc out_resize = [&converted](std::size_t byte_count) {
			// Round up so a partial trailing unit still gets storage.
			converted.resize((byte_count + sizeof(COut) - 1) / sizeof(COut));
		};

		const char* const in_bytes = reinterpret_cast<const char*>(text.data());
		const std::size_t in_byte_count = text.size() * sizeof(CIn);
		// Integer ratio of the character sizes (0 when COut is narrower than CIn).
		const std::size_t grow_hint = sizeof(COut) / sizeof(CIn);

		this->conv(in_bytes, in_byte_count, out_ptr, out_size, out_resize, grow_hint);
		return converted;
	}
} //namespace duck
|
115
src/kakoune/ref_ptr.hh
Normal file
115
src/kakoune/ref_ptr.hh
Normal file
|
@ -0,0 +1,115 @@
|
||||||
|
#ifndef ref_ptr_hh_INCLUDED
|
||||||
|
#define ref_ptr_hh_INCLUDED
|
||||||
|
|
||||||
|
#include <utility>
|
||||||
|
|
||||||
|
namespace Kakoune
{

// Base class for objects managed through RefPtr with the default policy:
// the object carries its own reference count.
struct RefCountable
{
    int refcount = 0;
    virtual ~RefCountable() = default;
};

// Default RefPtr policy: plain intrusive counting, the object is deleted
// when its count drops to zero. The second argument of each hook is the
// address of the RefPtr performing the operation (unused here).
struct RefCountablePolicy
{
    static void inc_ref(RefCountable* r, void*) noexcept { ++r->refcount; }
    static void dec_ref(RefCountable* r, void*) { if (--r->refcount == 0) delete r; }
    static void ptr_moved(RefCountable*, void*, void*) noexcept {}
};

// Intrusive smart pointer. Policy supplies three static hooks:
//   inc_ref(obj, ref_ptr_address)   - a RefPtr starts referencing obj
//   dec_ref(obj, ref_ptr_address)   - a RefPtr stops referencing obj
//   ptr_moved(obj, from, to)        - a reference moved between two RefPtrs
template<typename T, typename Policy = RefCountablePolicy>
struct RefPtr
{
    RefPtr() = default;
    explicit RefPtr(T* ptr) : m_ptr(ptr) { acquire(); }
    ~RefPtr() { release(); }
    RefPtr(const RefPtr& other) : m_ptr(other.m_ptr) { acquire(); }
    RefPtr(RefPtr&& other)
        noexcept(noexcept(std::declval<RefPtr>().moved(nullptr)))
        : m_ptr(other.m_ptr) { other.m_ptr = nullptr; moved(&other); }

    // Copy assignment. `other` may live inside the object currently pointed
    // to (e.g. `head = head->next`), so its pointer is read and acquired
    // before the old reference is dropped.
    RefPtr& operator=(const RefPtr& other)
    {
        assign(other.m_ptr);
        return *this;
    }

    // Move assignment. The pointer is taken out of `other` before the old
    // reference is released, because releasing may destroy `other` itself
    // when it is a member of the old pointee. Self-move leaves the pointer
    // unchanged.
    RefPtr& operator=(RefPtr&& other)
    {
        T* const incoming = other.m_ptr;
        other.m_ptr = nullptr;
        release();
        m_ptr = incoming;
        // Only the address of `other` is handed to the policy, never its content.
        moved(&other);
        return *this;
    }

    RefPtr& operator=(T* ptr)
    {
        assign(ptr);
        return *this;
    }

    [[gnu::always_inline]]
    T* operator->() const { return m_ptr; }
    [[gnu::always_inline]]
    T& operator*() const { return *m_ptr; }

    [[gnu::always_inline]]
    T* get() const { return m_ptr; }

    [[gnu::always_inline]]
    explicit operator bool() const { return m_ptr; }

    // Points this RefPtr at `ptr` (or at nothing), releasing the previous
    // reference. A no-op when `ptr` is already the current pointer.
    void reset(T* ptr = nullptr)
    {
        assign(ptr);
    }

    friend bool operator==(const RefPtr& lhs, const RefPtr& rhs) { return lhs.m_ptr == rhs.m_ptr; }
    friend bool operator!=(const RefPtr& lhs, const RefPtr& rhs) { return lhs.m_ptr != rhs.m_ptr; }

private:
    T* m_ptr = nullptr;

    // Shared implementation of copy-assign / operator=(T*) / reset().
    // The new object is acquired BEFORE the old one is released so that
    // releasing the old object can never invalidate `ptr` or its source.
    void assign(T* ptr)
    {
        if (ptr == m_ptr)
            return;
        T* const previous = m_ptr;
        m_ptr = ptr;
        acquire();
        if (previous)
            Policy::dec_ref(previous, this);
    }

    [[gnu::always_inline]]
    void acquire()
    {
        if (m_ptr)
            Policy::inc_ref(m_ptr, this);
    }

    [[gnu::always_inline]]
    void release()
    {
        if (m_ptr)
            Policy::dec_ref(m_ptr, this);
    }

    [[gnu::always_inline]]
    void moved(void* from)
        noexcept(noexcept(Policy::ptr_moved(nullptr, nullptr, nullptr)))
    {
        if (m_ptr)
            Policy::ptr_moved(m_ptr, from, this);
    }
};

}
|
||||||
|
|
||||||
|
#endif // ref_ptr_hh_INCLUDED
|
109
src/kakoune/safe_ptr.hh
Normal file
109
src/kakoune/safe_ptr.hh
Normal file
|
@ -0,0 +1,109 @@
|
||||||
|
#ifndef safe_ptr_hh_INCLUDED
|
||||||
|
#define safe_ptr_hh_INCLUDED
|
||||||
|
|
||||||
|
// #define SAFE_PTR_TRACK_CALLSTACKS
|
||||||
|
|
||||||
|
//King_DuckZ:
|
||||||
|
#include <cassert>
|
||||||
|
#define kak_assert(a) assert(a)
|
||||||
|
|
||||||
|
//#include "assert.hh"
|
||||||
|
#include "ref_ptr.hh"
|
||||||
|
|
||||||
|
#include <type_traits>
|
||||||
|
#include <utility>
|
||||||
|
|
||||||
|
#ifdef SAFE_PTR_TRACK_CALLSTACKS
|
||||||
|
#include "backtrace.hh"
|
||||||
|
#include "vector.hh"
|
||||||
|
#include <algorithm>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
namespace Kakoune
|
||||||
|
{
|
||||||
|
|
||||||
|
// *** SafePtr: objects that assert nobody references them when they die ***
|
||||||
|
|
||||||
|
class SafeCountable
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
#ifdef KAK_DEBUG
|
||||||
|
SafeCountable() : m_count(0) {}
|
||||||
|
SafeCountable (SafeCountable&&) : m_count(0) {}
|
||||||
|
~SafeCountable()
|
||||||
|
{
|
||||||
|
kak_assert(m_count == 0);
|
||||||
|
#ifdef SAFE_PTR_TRACK_CALLSTACKS
|
||||||
|
kak_assert(m_callstacks.empty());
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
friend struct SafeCountablePolicy;
|
||||||
|
#ifdef SAFE_PTR_TRACK_CALLSTACKS
|
||||||
|
struct Callstack
|
||||||
|
{
|
||||||
|
Callstack(void* p) : ptr(p) {}
|
||||||
|
void* ptr;
|
||||||
|
Backtrace bt;
|
||||||
|
};
|
||||||
|
|
||||||
|
mutable Vector<Callstack> m_callstacks;
|
||||||
|
#endif
|
||||||
|
mutable int m_count;
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
|
struct SafeCountablePolicy
|
||||||
|
{
|
||||||
|
#ifdef KAK_DEBUG
|
||||||
|
static void inc_ref(const SafeCountable* sc, void* ptr) noexcept
|
||||||
|
{
|
||||||
|
++sc->m_count;
|
||||||
|
#ifdef SAFE_PTR_TRACK_CALLSTACKS
|
||||||
|
sc->m_callstacks.emplace_back(ptr);
|
||||||
|
#else
|
||||||
|
static_cast<void>(ptr);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
static void dec_ref(const SafeCountable* sc, void* ptr) noexcept
|
||||||
|
{
|
||||||
|
--sc->m_count;
|
||||||
|
kak_assert(sc->m_count >= 0);
|
||||||
|
#ifdef SAFE_PTR_TRACK_CALLSTACKS
|
||||||
|
auto it = std::find_if(sc->m_callstacks.begin(), sc->m_callstacks.end(),
|
||||||
|
[=](const SafeCountable::Callstack& cs) { return cs.ptr == ptr; });
|
||||||
|
kak_assert(it != sc->m_callstacks.end());
|
||||||
|
sc->m_callstacks.erase(it);
|
||||||
|
#else
|
||||||
|
static_cast<void>(ptr);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
static void ptr_moved(const SafeCountable* sc, void* from, void* to) noexcept
|
||||||
|
{
|
||||||
|
#ifdef SAFE_PTR_TRACK_CALLSTACKS
|
||||||
|
auto it = std::find_if(sc->m_callstacks.begin(), sc->m_callstacks.end(),
|
||||||
|
[=](const SafeCountable::Callstack& cs) { return cs.ptr == from; });
|
||||||
|
kak_assert(it != sc->m_callstacks.end());
|
||||||
|
it->ptr = to;
|
||||||
|
#else
|
||||||
|
static_cast<void>(sc);
|
||||||
|
static_cast<void>(from);
|
||||||
|
static_cast<void>(to);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
static void inc_ref(const SafeCountable*, void*) noexcept {}
|
||||||
|
static void dec_ref(const SafeCountable*, void*) noexcept {}
|
||||||
|
static void ptr_moved(const SafeCountable*, void*, void*) noexcept {}
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
using SafePtr = RefPtr<T, SafeCountablePolicy>;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // safe_ptr_hh_INCLUDED
|
124
src/main.cpp
124
src/main.cpp
|
@ -16,20 +16,24 @@
|
||||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "htmlretrieve.hpp"
|
|
||||||
#include "commandline.hpp"
|
#include "commandline.hpp"
|
||||||
#include "xpath.hpp"
|
#include "xpath.hpp"
|
||||||
|
#include "scraplang.hpp"
|
||||||
|
#include "html_pool.hpp"
|
||||||
|
#include "read_all.hpp"
|
||||||
|
#include "safe_stack_object.hpp"
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
#include <ciso646>
|
#include <ciso646>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <iterator>
|
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
void dump_string ( const std::string& parPathDest, const std::string& parData );
|
void dump_string ( const std::string& parPathDest, const std::string& parData );
|
||||||
|
void load_from_commandline ( const boost::program_options::variables_map& parVarMap, duck::XPathPtr xpath );
|
||||||
|
void load_from_model ( const boost::program_options::variables_map& parVarMap, duck::XPathPtr xpath );
|
||||||
} //unnamed namespace
|
} //unnamed namespace
|
||||||
|
|
||||||
int main (int argc, char* argv[]) {
|
int main (int argc, char* argv[]) {
|
||||||
|
@ -46,54 +50,25 @@ int main (int argc, char* argv[]) {
|
||||||
return 2;
|
return 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto url = vm["input-url"].as<std::string>();
|
|
||||||
const auto xpath = vm["xpath"].as<std::string>();
|
|
||||||
#if !defined(NDEBUG)
|
|
||||||
std::cout << "URL : " << url << "\n";
|
|
||||||
std::cout << "XPath: " << xpath << std::endl;
|
|
||||||
std::cout << "Agent: " << vm["agent"].as<std::string>() << std::endl;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
std::string html;
|
|
||||||
|
|
||||||
if ("-" != url) {
|
|
||||||
html = duck::fetch_html(url, vm["agent"].as<std::string>(), false, false);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
std::cin >> std::noskipws;
|
|
||||||
std::istream_iterator<char> it(std::cin);
|
|
||||||
std::istream_iterator<char> end;
|
|
||||||
html = std::string(it, end);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (vm.count("dump-raw")) {
|
|
||||||
dump_string(vm["dump-raw"].as<std::string>(), html);
|
|
||||||
}
|
|
||||||
|
|
||||||
html = duck::clean_html(std::move(html));
|
|
||||||
if (vm.count("dump")) {
|
|
||||||
dump_string(vm["dump"].as<std::string>(), html);
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
std::vector<std::string> queries;
|
curry::SafeStackObject<duck::XPath> query;
|
||||||
queries.reserve(1);
|
if (vm.count("model"))
|
||||||
queries.push_back(std::move(xpath));
|
load_from_model(vm, query);
|
||||||
auto results = duck::xpath_query(html, queries);
|
else
|
||||||
for (const auto& lst : results[0]) {
|
load_from_commandline(vm, query);
|
||||||
std::cout << lst.first << ": " << lst.second << '\n';
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
catch (const duck::ParseError& err) {
|
catch (const duck::ParseError& err) {
|
||||||
std::cerr << err.what() << std::endl;
|
std::cerr << err.what() << std::endl;
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
catch (const std::runtime_error& err) {
|
||||||
|
std::cerr << err.what() << std::endl;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
void dump_string (const std::string& parPathDest, const std::string& parData) {
|
void dump_string (const std::string& parPathDest, const std::string& parData) {
|
||||||
std::unique_ptr<std::ofstream> ofs;
|
std::unique_ptr<std::ofstream> ofs;
|
||||||
const bool use_stdout = ("-" == parPathDest);
|
const bool use_stdout = ("-" == parPathDest);
|
||||||
|
@ -103,4 +78,73 @@ namespace {
|
||||||
std::ostream* const os = (use_stdout ? &std::cout : ofs.get());
|
std::ostream* const os = (use_stdout ? &std::cout : ofs.get());
|
||||||
*os << parData;
|
*os << parData;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void load_from_commandline (const boost::program_options::variables_map& parVarMap, duck::XPathPtr xpath) {
|
||||||
|
using std::string;
|
||||||
|
|
||||||
|
const auto& vm = parVarMap;
|
||||||
|
const auto url = vm["input-url"].as<string>();
|
||||||
|
|
||||||
|
duck::HtmlPool html_pool(
|
||||||
|
string(parVarMap["agent"].as<string>()),
|
||||||
|
string(parVarMap["from-code"].as<string>())
|
||||||
|
);
|
||||||
|
const auto in_html_id = html_pool.GetOrAdd(url);
|
||||||
|
string html = *html_pool.GetByID(in_html_id);
|
||||||
|
if (vm.count("dump")) {
|
||||||
|
dump_string(vm["dump"].as<string>(), html);
|
||||||
|
}
|
||||||
|
|
||||||
|
const string xpath_str = parVarMap["xpath"].as<string>();
|
||||||
|
|
||||||
|
#if !defined(NDEBUG)
|
||||||
|
std::cout << " -- XPath direct mode --\n";
|
||||||
|
std::cout << "URL : " << parVarMap["input-url"].as<string>() << "\n";
|
||||||
|
std::cout << "XPath: " << xpath_str << std::endl;
|
||||||
|
std::cout << "Agent: " << parVarMap["agent"].as<string>() << std::endl;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
std::vector<string> queries;
|
||||||
|
queries.reserve(1);
|
||||||
|
queries.push_back(std::move(xpath_str));
|
||||||
|
auto results = xpath->run_query(html, queries, parVarMap["namespace"].as<string>());
|
||||||
|
for (const auto& lst : results[0]) {
|
||||||
|
std::cout << lst.first << ": " << lst.second << '\n';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void load_from_model (const boost::program_options::variables_map& parVarMap, duck::XPathPtr xpath) {
|
||||||
|
using std::string;
|
||||||
|
|
||||||
|
#if !defined(NDEBUG)
|
||||||
|
std::cout << " -- XPath model mode --\n";
|
||||||
|
if (parVarMap.count("input-url"))
|
||||||
|
std::cout << "URL : " << parVarMap["input-url"].as<string>() << "\n";
|
||||||
|
std::cout << "Model: " << parVarMap["model"].as<string>() << std::endl;
|
||||||
|
std::cout << "Agent: " << parVarMap["agent"].as<string>() << std::endl;
|
||||||
|
#endif
|
||||||
|
const string script = duck::read_all(parVarMap["model"].as<string>());
|
||||||
|
auto ast = duck::sl::parse(script);
|
||||||
|
|
||||||
|
duck::HtmlPool html_pool(
|
||||||
|
string(parVarMap["agent"].as<string>()),
|
||||||
|
string(parVarMap["from-code"].as<string>())
|
||||||
|
);
|
||||||
|
duck::sl::apply(ast, duck::sl::HtmlPoolBaseSP(&html_pool), xpath, string(parVarMap["namespace"].as<string>()));
|
||||||
|
//auto list = duck::get_xpath_definitions(*ast);
|
||||||
|
|
||||||
|
//std::vector<string> expressions;
|
||||||
|
//expressions.reserve(list.size());
|
||||||
|
//for (duck::element_def& elem : list) {
|
||||||
|
// expressions.push_back(std::move(elem.xpath));
|
||||||
|
//}
|
||||||
|
//auto results = duck::xpath_query(parXML, expressions);
|
||||||
|
//duck::print_results(std::cout, *ast, list, results);
|
||||||
|
//for (const auto& list : results) {
|
||||||
|
// std::cout << "------\n";
|
||||||
|
// for (const auto& result : list) {
|
||||||
|
// std::cout << result.first << ": " << result.second << '\n';
|
||||||
|
// }
|
||||||
|
//}
|
||||||
|
}
|
||||||
} //unnamed namespace
|
} //unnamed namespace
|
||||||
|
|
38
src/read_all.cpp
Normal file
38
src/read_all.cpp
Normal file
|
@ -0,0 +1,38 @@
|
||||||
|
/* Copyright (C) 2015 Michele Santullo
|
||||||
|
*
|
||||||
|
* This file is part of DuckScraper.
|
||||||
|
*
|
||||||
|
* DuckScraper is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* DuckScraper is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "read_all.hpp"
|
||||||
|
#include <fstream>
|
||||||
|
#include <iterator>
|
||||||
|
|
||||||
|
namespace duck {
	// Returns everything that can still be read from parStream, whitespace
	// included. The data is pulled straight from the stream buffer, so the
	// stream's formatting flags (e.g. skipws) are left untouched.
	std::string read_all (std::istream& parStream) {
		const std::istreambuf_iterator<char> first(parStream);
		const std::istreambuf_iterator<char> last;
		return std::string(first, last);
	}

	// Overload for temporaries, e.g. read_all(std::istringstream(text)).
	std::string read_all (std::istream&& parStream) {
		// A named rvalue reference is an lvalue: this calls the overload above.
		return read_all(parStream);
	}

	// Returns the whole content of the file at parPath.
	// Throws std::ios_base::failure (a std::runtime_error) when the file
	// cannot be opened, instead of silently returning an empty string.
	std::string read_all (const std::string& parPath) {
		std::ifstream file(parPath);
		if (!file.is_open()) {
			throw std::ios_base::failure("Unable to open \"" + parPath + "\" for reading");
		}
		return read_all(file);
	}
} //namespace duck
|
|
@ -16,34 +16,22 @@
|
||||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef idBE96C2D49C4C413888A79EAEB2B9C0FA
|
#ifndef id0768F384342E4FD58028BE415A725169
|
||||||
#define idBE96C2D49C4C413888A79EAEB2B9C0FA
|
#define id0768F384342E4FD58028BE415A725169
|
||||||
|
|
||||||
#include <vector>
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <memory>
|
|
||||||
|
namespace std {
|
||||||
|
template <typename Char, typename Traits>
|
||||||
|
class basic_istream;
|
||||||
|
|
||||||
|
typedef basic_istream<char> istream;
|
||||||
|
} //namespace std
|
||||||
|
|
||||||
namespace duck {
|
namespace duck {
|
||||||
struct ScrapNode;
|
std::string read_all ( std::istream& parStream );
|
||||||
struct element_def;
|
std::string read_all ( std::istream&& parStream );
|
||||||
|
std::string read_all ( const std::string& parPath );
|
||||||
class ScrapNodePtr {
|
|
||||||
public:
|
|
||||||
explicit ScrapNodePtr ( ScrapNode* parPtr );
|
|
||||||
ScrapNodePtr ( ScrapNodePtr&& parOther );
|
|
||||||
~ScrapNodePtr ( void ) noexcept;
|
|
||||||
|
|
||||||
ScrapNode& operator* ( void ) { return *m_ptr; }
|
|
||||||
const ScrapNode& operator* ( void ) const { return *m_ptr; }
|
|
||||||
ScrapNode& operator-> ( void ) { return *m_ptr; }
|
|
||||||
const ScrapNode& operator-> ( void ) const { return *m_ptr; }
|
|
||||||
|
|
||||||
private:
|
|
||||||
std::unique_ptr<ScrapNode> m_ptr;
|
|
||||||
};
|
|
||||||
|
|
||||||
ScrapNodePtr parse_scraplang ( const std::string& parData );
|
|
||||||
std::vector<element_def> get_xpath_definitions ( const ScrapNode& parAST );
|
|
||||||
} //namespace duck
|
} //namespace duck
|
||||||
|
|
||||||
#endif
|
#endif
|
104
src/safe_stack_object.hpp
Normal file
104
src/safe_stack_object.hpp
Normal file
|
@ -0,0 +1,104 @@
|
||||||
|
/*
|
||||||
|
Copyright 2016, 2017 Michele "King_DuckZ" Santullo
|
||||||
|
|
||||||
|
This file is part of MyCurry.
|
||||||
|
|
||||||
|
MyCurry is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
MyCurry is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with MyCurry. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "kakoune/safe_ptr.hh"
|
||||||
|
#include <utility>
|
||||||
|
|
||||||
|
namespace curry {
|
||||||
|
template <typename T>
|
||||||
|
class SafeStackObject {
|
||||||
|
public:
|
||||||
|
typedef Kakoune::SafePtr<T> safe_ptr;
|
||||||
|
|
||||||
|
SafeStackObject();
|
||||||
|
SafeStackObject (SafeStackObject&& parOther);
|
||||||
|
SafeStackObject (const SafeStackObject& parOther) = delete;
|
||||||
|
template <typename... Args> explicit SafeStackObject (Args&&... parArgs);
|
||||||
|
~SafeStackObject() noexcept = default;
|
||||||
|
|
||||||
|
SafeStackObject& operator= (SafeStackObject&& parOther) = delete;
|
||||||
|
SafeStackObject& operator= (const SafeStackObject& parOther) = delete;
|
||||||
|
|
||||||
|
operator Kakoune::SafePtr<T>&();
|
||||||
|
template <typename U>
|
||||||
|
operator Kakoune::SafePtr<U>();
|
||||||
|
T& operator*();
|
||||||
|
safe_ptr& operator->();
|
||||||
|
|
||||||
|
private:
|
||||||
|
T m_obj;
|
||||||
|
safe_ptr m_obj_ptr;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
SafeStackObject<T>::SafeStackObject() :
|
||||||
|
m_obj(),
|
||||||
|
m_obj_ptr(&m_obj)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
SafeStackObject<T>::SafeStackObject (SafeStackObject&& parOther) :
|
||||||
|
m_obj(std::move(parOther.m_obj)),
|
||||||
|
m_obj_ptr(&m_obj)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
template <typename... Args>
|
||||||
|
SafeStackObject<T>::SafeStackObject (Args&&... parArgs) :
|
||||||
|
m_obj(std::forward<Args>(parArgs)...),
|
||||||
|
m_obj_ptr(&m_obj)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
//template <typename T>
|
||||||
|
//SafeStackObject& SafeStackObject<T>::operator= (SafeStackObject&& parOther) {
|
||||||
|
// m_obj = std::move(parOther.m_obj);
|
||||||
|
// m_obj_ptr = std::move(parOther.m_obj_ptr);
|
||||||
|
// m_ob
|
||||||
|
//}
|
||||||
|
|
||||||
|
//template <typename T>
|
||||||
|
//SafeStackObject& SafeStackObject<T>::operator= (const SafeStackObject& parOther) {
|
||||||
|
//}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
SafeStackObject<T>::operator Kakoune::SafePtr<T>&() {
|
||||||
|
return m_obj_ptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
template <typename U>
|
||||||
|
SafeStackObject<T>::operator Kakoune::SafePtr<U>() {
|
||||||
|
return Kakoune::SafePtr<U>(&m_obj);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
T& SafeStackObject<T>::operator*() {
|
||||||
|
return *m_obj_ptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
auto SafeStackObject<T>::operator->() -> safe_ptr& {
|
||||||
|
return m_obj_ptr;
|
||||||
|
}
|
||||||
|
} //namespace curry
|
25
src/scraplang.hpp
Normal file
25
src/scraplang.hpp
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
/* Copyright (C) 2017 Michele Santullo
|
||||||
|
*
|
||||||
|
* This file is part of DuckScraper.
|
||||||
|
*
|
||||||
|
* DuckScraper is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* DuckScraper is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef id8483FDE5CA0E4F40BDE0F4469AC2DF79
|
||||||
|
#define id8483FDE5CA0E4F40BDE0F4469AC2DF79
|
||||||
|
|
||||||
|
#include "scraplang/parse.hpp"
|
||||||
|
#include "scraplang/apply.hpp"
|
||||||
|
|
||||||
|
#endif
|
517
src/scraplang/apply.cpp
Normal file
517
src/scraplang/apply.cpp
Normal file
|
@ -0,0 +1,517 @@
|
||||||
|
/* Copyright (C) 2015 Michele Santullo
|
||||||
|
*
|
||||||
|
* This file is part of DuckScraper.
|
||||||
|
*
|
||||||
|
* DuckScraper is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* DuckScraper is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define APPLY_VERBOSE
|
||||||
|
|
||||||
|
#include "apply.hpp"
|
||||||
|
#include "mstch/mstch.hpp"
|
||||||
|
#include "html_pool_base.hpp"
|
||||||
|
#include "scrap_node.hpp"
|
||||||
|
#include "xpath_runner.hpp"
|
||||||
|
#if defined(APPLY_VERBOSE)
|
||||||
|
# include "stream_scrap_node.hpp"
|
||||||
|
#endif
|
||||||
|
#include <map>
|
||||||
|
#include <boost/variant/apply_visitor.hpp>
|
||||||
|
#include <string_view>
|
||||||
|
#include <list>
|
||||||
|
#include <functional>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <cassert>
|
||||||
|
|
||||||
|
namespace duck { namespace sl {
|
||||||
|
#if defined(APPLY_VERBOSE)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
struct EntryNode;
|
||||||
|
|
||||||
|
struct MustacheEntry {
|
||||||
|
std::string text;
|
||||||
|
mstch::map context;
|
||||||
|
};
|
||||||
|
|
||||||
|
using EntryNodeList = std::vector<std::pair<
|
||||||
|
const SourceInfo*,
|
||||||
|
EntryNode
|
||||||
|
>>;
|
||||||
|
using MustacheEntryMap = std::map<std::string, MustacheEntry>;
|
||||||
|
typedef std::function<void(std::size_t)> FixLengthCommand;
|
||||||
|
|
||||||
|
struct EntryNode {
|
||||||
|
explicit EntryNode (const std::string_view& parName) :
|
||||||
|
name(parName)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
EntryNode (EntryNode&&) = default;
|
||||||
|
EntryNode (const EntryNode&) = default;
|
||||||
|
EntryNode& operator= (EntryNode&&) = default;
|
||||||
|
EntryNode& operator= (const EntryNode&) = default;
|
||||||
|
|
||||||
|
std::string_view name;
|
||||||
|
std::vector<EntryNode> structs;
|
||||||
|
std::vector<const XPathElement*> xpaths;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ApplyEntry {
|
||||||
|
ApplyEntry (const ApplyEntry&) = default;
|
||||||
|
ApplyEntry (const SourceInfo* parAppTo, std::string_view parMstchName) :
|
||||||
|
apply_to(parAppTo),
|
||||||
|
content(""),
|
||||||
|
mustache_name(parMstchName)
|
||||||
|
{
|
||||||
|
assert(apply_to);
|
||||||
|
assert(not apply_to->value.empty());
|
||||||
|
}
|
||||||
|
ApplyEntry (ApplyEntry&&) = default;
|
||||||
|
ApplyEntry& operator=(ApplyEntry&&) = default;
|
||||||
|
|
||||||
|
const SourceInfo* apply_to;
|
||||||
|
EntryNode content;
|
||||||
|
std::string_view mustache_name;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Variant visitor that copies the shape of a struct item into an EntryNode
// tree: XPath elements are recorded by address on the current node, nested
// struct blocks become child nodes that are filled recursively.
class StructItemExtractor : public boost::static_visitor<> {
public:
	StructItemExtractor() = delete;

	explicit StructItemExtractor (EntryNode& parRoot) :
		m_root(parRoot)
	{ }

	void operator() (const XPathElement& parVal) {
		m_root.xpaths.push_back(&parVal);
	}

	void operator() (const StructBlock& parVal) {
		m_root.structs.emplace_back(parVal.name);
		// The child just appended is the target of everything inside the block.
		EntryNode& child = m_root.structs.back();
		StructItemExtractor child_extractor(child);
		for (auto& item : parVal.xpaths) {
			boost::apply_visitor(child_extractor, item);
		}
	}

private:
	EntryNode& m_root;
};
|
||||||
|
|
||||||
|
// Variant visitor that turns a mstch::node into an array of at least
// m_expected_size items:
//  - an empty array is resized to m_expected_size default-constructed nodes;
//  - a shorter array is padded with copies of its last item;
//  - any non-array value is wrapped into a one-item array first.
// The result is stored in m_retval and returned as an rvalue reference to
// it, so it must be consumed before the visitor is used again or destroyed.
class FillWithClonesVisitor : public boost::static_visitor<mstch::node&&> {
public:
	explicit FillWithClonesVisitor (std::size_t exp_size) :
		m_expected_size(exp_size)
	{ }

	virtual mstch::node&& operator()(mstch::array&& parOut) {
		if (parOut.empty()) {
			parOut.resize(m_expected_size);
		}
		else if (parOut.size() < m_expected_size) {
			// Copy the last item before growing: a reference to
			// parOut.back() would dangle as soon as the array reallocates.
			const mstch::node last_item = parOut.back();
			parOut.resize(m_expected_size, last_item);
		}
		m_retval = std::move(parOut);
		return std::move(m_retval);
	}

	template <typename T>
	mstch::node&& operator()(T&& parOut) {
		mstch::array retval;
		retval.reserve(m_expected_size);
		retval.push_back(std::move(parOut));
		return (*this)(std::move(retval));
	}

protected:
	mstch::node m_retval;
	std::size_t m_expected_size;
};
|
||||||
|
|
||||||
|
class FillWithStringsVisitor : public FillWithClonesVisitor {
|
||||||
|
public:
|
||||||
|
FillWithStringsVisitor (const std::string& parVal, std::size_t exp_size) :
|
||||||
|
FillWithClonesVisitor(exp_size),
|
||||||
|
m_value(parVal)
|
||||||
|
{ }
|
||||||
|
|
||||||
|
mstch::node&& operator()(mstch::array&& parOut) override {
|
||||||
|
std::fill_n(
|
||||||
|
std::back_inserter(parOut),
|
||||||
|
std::max(m_expected_size, parOut.size()) - parOut.size(),
|
||||||
|
m_value.get()
|
||||||
|
);
|
||||||
|
m_retval = std::move(parOut);
|
||||||
|
return std::move(m_retval);
|
||||||
|
}
|
||||||
|
|
||||||
|
using FillWithClonesVisitor::operator();
|
||||||
|
|
||||||
|
private:
|
||||||
|
std::reference_wrapper<const std::string> m_value;
|
||||||
|
};
|
||||||
|
|
||||||
|
//Replaces the value stored under parKey with an array of at least
//parTotal items, using parDefault for every missing item. A value that
//is not an array yet becomes the first item of the new array. The key is
//created if it's not in parMap.
void fill_with_defaults (mstch::map& parMap, std::size_t parTotal, const std::string& parKey, const std::string& parDefault) {
	mstch::node& slot = parMap[parKey];
	FillWithStringsVisitor padder(parDefault, parTotal);
	slot = boost::apply_visitor(padder, std::move(slot));
}
|
||||||
|
|
||||||
|
//Replaces the value stored under parKey with an array of at least
//parTotal items, repeating its last item as many times as needed. A
//value that is not an array yet becomes the first item of the new array.
//The key is created if it's not in parMap.
void fill_with_last_item_clones (mstch::map& parMap, std::size_t parTotal, const std::string& parKey) {
	mstch::node& slot = parMap[parKey];
	FillWithClonesVisitor padder(parTotal);
	slot = boost::apply_visitor(padder, std::move(slot));
}
|
||||||
|
|
||||||
|
mstch::map to_mustache_dict_recursive (
|
||||||
|
const EntryNode& parNode,
|
||||||
|
std::string_view parSrc,
|
||||||
|
XPathRunner& parRunner
|
||||||
|
);
|
||||||
|
|
||||||
|
void store_entry_subtree (
|
||||||
|
const std::vector<StructItem>& parXPaths,
|
||||||
|
EntryNode& parCurrList
|
||||||
|
) {
|
||||||
|
for (auto& itm : parXPaths) {
|
||||||
|
StructItemExtractor extractor(parCurrList);
|
||||||
|
boost::apply_visitor(extractor, itm);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//Visitor that walks the scrap AST and sorts what it finds into three
//collections: the entries of "from" blocks, the entries of "apply"
//blocks and the mustache templates, the latter indexed by name.
//Sources are stored by address, so the visited AST has to stay alive for
//as long as the collected data is in use.
class DictBuilder : public boost::static_visitor<> {
public:
	explicit DictBuilder() :
		m_current_mustache_name(nullptr),
		m_current_mustache(nullptr)
	{
	}

	//A list of nodes is visited item by item, in order
	void operator() (const std::vector<ScrapNode>& parVal) {
		for (const ScrapNode& child : parVal) {
			boost::apply_visitor(*this, child);
		}
	}

	//Adds a new unnamed global entry for the block's source and fills
	//it with the block's xpaths
	void operator() (const FromBlock& parVal) {
#if defined(APPLY_VERBOSE)
		std::cout << parVal << '\n';
#endif
		m_global_entries.emplace_back(std::make_pair(
			&parVal.source,
			EntryNode("")
		));
		store_entry_subtree(parVal.xpaths, m_global_entries.back().second);
	}

	//Adds a new apply entry, tied to the block's source and mustache
	//model, and fills its content with the block's xpaths
	void operator() (const ApplyBlock& parVal) {
#if defined(APPLY_VERBOSE)
		std::cout << parVal << '\n';
#endif
		assert(not parVal.source.value.empty());
		m_apply_entries.emplace_back(&parVal.source, parVal.mustache_model);
		ApplyEntry& new_entry = m_apply_entries.back();
		store_entry_subtree(parVal.xpaths, new_entry.content);
	}

	//Stores the text of a mustache block. When the block's name differs
	//from the current one, the entry with that name is reset to an empty
	//MustacheEntry and becomes the current one.
	void operator() (const MustacheBlock& parVal) {
#if defined(APPLY_VERBOSE)
		std::cout << "Mustache block \"" << parVal.name << "\"\n";
#endif
		const bool is_new_block =
			(nullptr == m_current_mustache_name) or
			(*m_current_mustache_name != parVal.name);

		if (is_new_block) {
			m_mustaches[parVal.name] = MustacheEntry();
			const auto entry_it = m_mustaches.find(parVal.name);
			m_current_mustache_name = &entry_it->first;
			m_current_mustache = &entry_it->second;
		}

		m_current_mustache->text = parVal.content;
	}

	const EntryNodeList& global_entries() const {
		return m_global_entries;
	}

	const MustacheEntryMap& mustache_entries() const {
		return m_mustaches;
	}

	const std::vector<ApplyEntry>& apply_entries() const {
		return m_apply_entries;
	}

private:
	EntryNodeList m_global_entries;
	std::vector<ApplyEntry> m_apply_entries;
	MustacheEntryMap m_mustaches;
	//Key and value of the most recently selected item of m_mustaches
	const std::string* m_current_mustache_name;
	MustacheEntry* m_current_mustache;
};
|
||||||
|
|
||||||
|
//Visitor telling how many items a mustache node holds: the length of the
//array for array nodes, 1 for any other kind of node.
struct ItemCountingVisitor : public boost::static_visitor<std::size_t> {
	std::size_t operator()(const mstch::array& parItem) const {
		return parItem.size();
	}

	template <typename T>
	std::size_t operator()(const T&) const {
		return 1;
	}
};
|
||||||
|
|
||||||
|
//Turns a set of named values into an array of maps ("array of structs").
//The array is created with parExpectedSize empty maps; every call then
//stores one named value into those maps.
class ArraysToStructArrayVisitor : public boost::static_visitor<> {
public:
	explicit ArraysToStructArrayVisitor (std::size_t parExpectedSize) :
		m_array(parExpectedSize, mstch::node(mstch::map())),
		m_expected_size(parExpectedSize)
	{
	}

	//Array values are spread out: item number N goes into map number N.
	//The array is expected to be no longer than the size given to the
	//constructor.
	void operator()(const std::string& parName, const mstch::array& parItem) {
		std::size_t slot = 0;
		for (const mstch::node& value : parItem) {
			boost::get<mstch::map>(m_array[slot])[parName] = value;
			++slot;
		}
	}

	//Any other value is stored into the first map only
	template <typename T>
	void operator()(const std::string& parName, const T& parItem) {
		mstch::map& first_map = boost::get<mstch::map>(m_array[0]);
		first_map[parName] = parItem;
	}

	//Moves the result out of the visitor: the lone map when the expected
	//size is 1, the whole array of maps otherwise.
	mstch::node steal_struct() {
		if (1 != m_expected_size)
			return mstch::node(std::move(m_array));

		return mstch::node(std::move(m_array[0]));
	}

private:
	mstch::array m_array;
	const std::size_t m_expected_size;
};
|
||||||
|
|
||||||
|
const std::vector<std::string>& query_xpath_by_name (
|
||||||
|
const EntryNodeList& parNodes,
|
||||||
|
const std::string_view& parName,
|
||||||
|
XPathRunner& parRunner
|
||||||
|
) {
|
||||||
|
for (auto& curr_node : parNodes) {
|
||||||
|
assert(curr_node.first);
|
||||||
|
const SourceInfo& source = *curr_node.first;
|
||||||
|
const EntryNode& entry = curr_node.second;
|
||||||
|
assert(entry.name.empty());
|
||||||
|
|
||||||
|
auto it_found = std::find_if(
|
||||||
|
entry.xpaths.begin(),
|
||||||
|
entry.xpaths.end(),
|
||||||
|
[&parName](const auto& xpath_elem) {
|
||||||
|
return xpath_elem->name == parName;
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
if (it_found != entry.xpaths.end()) {
|
||||||
|
const XPathElement* const val = *it_found;
|
||||||
|
assert(val);
|
||||||
|
return parRunner.query(source.value, val->xpath);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static const std::vector<std::string> empty_retval;
|
||||||
|
std::cout << "query_xpath_by_name(parNodes, \"" << parName <<
|
||||||
|
"\", parRunner) -> nothing found" << std::endl;
|
||||||
|
assert(false); //throw?
|
||||||
|
return empty_retval;
|
||||||
|
}
|
||||||
|
|
||||||
|
//Returns the highest item count found among the values of parMap, as
//counted by ItemCountingVisitor (arrays count as their length, anything
//else as 1). An empty map yields 0.
std::size_t largest_array_size_in (const mstch::map& parMap) {
	std::size_t largest = 0;
	for (const auto& entry : parMap) {
		const std::size_t count = boost::apply_visitor(ItemCountingVisitor(), entry.second);
		if (largest < count)
			largest = count;
	}
	return largest;
}
|
||||||
|
|
||||||
|
//Runs every xpath of parNode on parSrc and stores the outcome in parOut
//under the xpath's name: a plain string for one result, an array for
//several, the xpath's default value (or an empty string if it has none)
//for no results at all.
//For each xpath a command is also appended to parFixCommands: when
//invoked with a length, it pads that value to that length, using the
//default value if there is one and clones of the last item otherwise.
//The commands only hold references to parOut and to the xpath's data, so
//they must be run while those are still alive.
void fill_with_xpaths (
	mstch::map& parOut,
	std::vector<FixLengthCommand>& parFixCommands,
	const EntryNode& parNode,
	std::string_view parSrc,
	XPathRunner& parRunner
) {
	using std::placeholders::_1;

	for (const XPathElement* xpath : parNode.xpaths) {
		assert(xpath);
		std::cout << "Running query for \"" << xpath->name << "\"\n";
		auto results = parRunner.query(parSrc, xpath->xpath);
		const bool has_default = static_cast<bool>(xpath->def_val);

		mstch::node& slot = parOut[xpath->name];
		switch (results.size()) {
		case 0:
			if (has_default)
				slot = *xpath->def_val;
			else
				slot = std::string();
			break;
		case 1:
			slot = results.front();
			break;
		default:
			slot = mstch::array(results.begin(), results.end());
			break;
		}

		if (has_default) {
			parFixCommands.push_back(std::bind(
				&fill_with_defaults,
				std::ref(parOut),
				_1,
				std::cref(xpath->name),
				std::cref(*xpath->def_val)
			));
		}
		else {
			parFixCommands.push_back(std::bind(
				&fill_with_last_item_clones,
				std::ref(parOut),
				_1,
				std::cref(xpath->name)
			));
		}
	}
}
|
||||||
|
|
||||||
|
//Builds the dictionary of every struct nested in parNode and stores it
//in parOut under the struct's name. Each dictionary is first rearranged
//by ArraysToStructArrayVisitor, so what gets stored is either a single
//map or an array of maps.
void fill_with_structs (
	mstch::map& parOut,
	const EntryNode& parNode,
	std::string_view parSrc,
	XPathRunner& parRunner
) {
	for (const auto& child : parNode.structs) {
		assert(not child.name.empty());

		mstch::map fields = to_mustache_dict_recursive(child, parSrc, parRunner);
		ArraysToStructArrayVisitor collector(largest_array_size_in(fields));

		for (auto&& field : fields) {
			const auto& field_name = field.first;
			auto store_field = [&collector, &field_name](const auto& value) {
				collector(field_name, value);
			};
			boost::apply_visitor(store_field, field.second);
		}

		parOut[std::string(child.name)] = collector.steal_struct();
	}
}
|
||||||
|
|
||||||
|
//Builds the mustache dictionary for parNode: first its xpaths, then its
//nested structs. Once everything is in place, the values coming from the
//xpaths are padded to the length of the largest value in the dictionary.
mstch::map to_mustache_dict_recursive (
	const EntryNode& parNode,
	std::string_view parSrc,
	XPathRunner& parRunner
) {
	mstch::map dict;
	std::vector<FixLengthCommand> pending_fixes;

	fill_with_xpaths(dict, pending_fixes, parNode, parSrc, parRunner);
	fill_with_structs(dict, parNode, parSrc, parRunner);

	//The target length can only be known after all the values are in
	const std::size_t target_size = largest_array_size_in(dict);
	std::for_each(
		pending_fixes.begin(),
		pending_fixes.end(),
		[target_size](const FixLengthCommand& fix) { fix(target_size); }
	);

	return dict;
}
|
||||||
|
|
||||||
|
//Builds a single mustache dictionary out of all the given entries. Every
//entry must be unnamed and must refer to a URL source; any other source
//type trips an assertion.
//When two entries produce the same key, the value from the entry that
//comes later in parNodes is the one that is kept.
mstch::map to_mustache_map (const EntryNodeList& parNodes, XPathRunner& parRunner) {
	mstch::map retval;
	for (auto& entry : parNodes) {
		//The source is dereferenced right below, so check it first
		assert(entry.first);
		assert(entry.second.name.empty());
		std::cout << "Analyzing entry " << *entry.first << '\n';

		std::string_view src_url;

		switch (entry.first->type) {
		case SourceInfo::URL:
			src_url = entry.first->value;
			break;
		case SourceInfo::Token:
		default:
			assert(false); //not reached
		}

		mstch::map curr_entry_map = to_mustache_dict_recursive(entry.second, src_url, parRunner);
		//merge() only takes over the keys that curr_entry_map doesn't
		//have yet, so the values of the current entry win over those
		//collected so far
		curr_entry_map.merge(std::move(retval));
		retval.swap(curr_entry_map);
	}

	return retval;
}
|
||||||
|
|
||||||
|
void exec_apply_block (
|
||||||
|
const SourceInfo& parSourceInfo,
|
||||||
|
const EntryNode& parEntryNode,
|
||||||
|
const MustacheEntry& parMustache,
|
||||||
|
XPathRunner& parXPathRunner
|
||||||
|
) {
|
||||||
|
EntryNodeList entry_node {std::make_pair(&parSourceInfo, parEntryNode)};
|
||||||
|
mstch::map entry_ctx = to_mustache_map(entry_node, parXPathRunner);
|
||||||
|
for (auto& ctx : parMustache.context) {
|
||||||
|
entry_ctx[ctx.first] = ctx.second;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout << "context size: " << entry_ctx.size() << '\n';
|
||||||
|
for (auto& ctx_itm : entry_ctx) {
|
||||||
|
std::cout << '\t' << ctx_itm.first << '\n';
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout << mstch::render(parMustache.text, entry_ctx) << std::endl;
|
||||||
|
}
|
||||||
|
} //unnamed namespace
|
||||||
|
|
||||||
|
std::vector<std::string> apply (
|
||||||
|
const ScrapNode& node,
|
||||||
|
HtmlPoolBaseSP html_pool,
|
||||||
|
XPathPtr xpath,
|
||||||
|
std::string&& parDefNamespace
|
||||||
|
) {
|
||||||
|
using std::placeholders::_1;
|
||||||
|
|
||||||
|
DictBuilder dict_builder;
|
||||||
|
boost::apply_visitor(dict_builder, node);
|
||||||
|
|
||||||
|
std::vector<std::string> retval;
|
||||||
|
const EntryNodeList& global_entries = dict_builder.global_entries();
|
||||||
|
const MustacheEntryMap& mustaches = dict_builder.mustache_entries();
|
||||||
|
const std::vector<ApplyEntry> apply_entries = dict_builder.apply_entries();
|
||||||
|
retval.reserve(apply_entries.size());
|
||||||
|
|
||||||
|
std::cout << "-------------- visiting done ----------------\n";
|
||||||
|
XPathRunner xpath_runner(html_pool, xpath, std::move(parDefNamespace));
|
||||||
|
|
||||||
|
for (auto& apply_entry : apply_entries) {
|
||||||
|
std::string name(apply_entry.mustache_name);
|
||||||
|
const auto& mustache = mustaches.at(name);
|
||||||
|
if (SourceInfo::Token == apply_entry.apply_to->type) {
|
||||||
|
std::vector<std::string> sources =
|
||||||
|
query_xpath_by_name(global_entries, apply_entry.apply_to->value, xpath_runner);
|
||||||
|
|
||||||
|
for (auto& source : sources) {
|
||||||
|
SourceInfo new_source;
|
||||||
|
new_source.value = source;
|
||||||
|
new_source.type = SourceInfo::URL;
|
||||||
|
|
||||||
|
EntryNode new_node(apply_entry.content.name);
|
||||||
|
new_node.structs = apply_entry.content.structs;
|
||||||
|
new_node.xpaths = apply_entry.content.xpaths;
|
||||||
|
|
||||||
|
exec_apply_block(new_source, new_node, mustache, xpath_runner);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
assert(apply_entry.apply_to);
|
||||||
|
exec_apply_block(*apply_entry.apply_to, apply_entry.content, mustache, xpath_runner);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return retval;
|
||||||
|
}
|
||||||
|
}} //namespace duck::sl
|
36
src/scraplang/apply.hpp
Normal file
36
src/scraplang/apply.hpp
Normal file
|
@ -0,0 +1,36 @@
|
||||||
|
/* Copyright (C) 2015 Michele Santullo
|
||||||
|
*
|
||||||
|
* This file is part of DuckScraper.
|
||||||
|
*
|
||||||
|
* DuckScraper is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* DuckScraper is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef idC73DBB42FB76433BAFC0B73EAC3B70FF
|
||||||
|
#define idC73DBB42FB76433BAFC0B73EAC3B70FF
|
||||||
|
|
||||||
|
#include "scrap_node.hpp"
|
||||||
|
#include "scraplang/html_pool_base.hpp"
|
||||||
|
#include "xpath_fwd.hpp"
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
namespace duck { namespace sl {
|
||||||
|
//Visits the given scrap AST, executes every apply block it contains and
//prints each rendered mustache template to standard output.
//html_pool, xpath and parDefNamespace are handed to the XPathRunner that
//performs the xpath queries; parDefNamespace is moved from.
//NOTE(review): the implementation never adds anything to the returned
//vector, so it is currently always empty - confirm this is intended.
std::vector<std::string> apply (
	const ScrapNode& node,
	HtmlPoolBaseSP html_pool,
	XPathPtr xpath,
	std::string&& parDefNamespace
);
|
||||||
|
}} //namespace duck::sl
|
||||||
|
|
||||||
|
#endif
|
|
@ -1,4 +1,4 @@
|
||||||
/* Copyright (C) 2015 Michele Santullo
|
/* Copyright (C) 2017 Michele Santullo
|
||||||
*
|
*
|
||||||
* This file is part of DuckScraper.
|
* This file is part of DuckScraper.
|
||||||
*
|
*
|
||||||
|
@ -19,22 +19,15 @@
|
||||||
#ifndef id3875B5F868524EC3A1B83971D4A85777
|
#ifndef id3875B5F868524EC3A1B83971D4A85777
|
||||||
#define id3875B5F868524EC3A1B83971D4A85777
|
#define id3875B5F868524EC3A1B83971D4A85777
|
||||||
|
|
||||||
|
#include "element_types.hpp"
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
namespace duck {
|
namespace duck { namespace sl {
|
||||||
enum ElementTypes {
|
struct ElementDef {
|
||||||
ElementType_String,
|
|
||||||
ElementType_Integer,
|
|
||||||
ElementType_Boolean,
|
|
||||||
ElementType_Null,
|
|
||||||
ElementType_Double
|
|
||||||
};
|
|
||||||
|
|
||||||
struct element_def {
|
|
||||||
std::string name;
|
std::string name;
|
||||||
std::string xpath;
|
std::string xpath;
|
||||||
ElementTypes type;
|
ElementTypes type;
|
||||||
};
|
};
|
||||||
} //namespace duck
|
}} //namespace duck::sl
|
||||||
|
|
||||||
#endif
|
#endif
|
32
src/scraplang/element_types.hpp
Normal file
32
src/scraplang/element_types.hpp
Normal file
|
@ -0,0 +1,32 @@
|
||||||
|
/* Copyright (C) 2017 Michele Santullo
|
||||||
|
*
|
||||||
|
* This file is part of DuckScraper.
|
||||||
|
*
|
||||||
|
* DuckScraper is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* DuckScraper is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef id1AC876186C4B48DD900399994C27A741
|
||||||
|
#define id1AC876186C4B48DD900399994C27A741
|
||||||
|
|
||||||
|
namespace duck { namespace sl {
|
||||||
|
//Kinds of value an element can be declared as.
//NOTE(review): presumably this is the type the value scraped for an
//element gets interpreted as - confirm against the code using it.
enum ElementTypes {
	ElementType_String,
	ElementType_Integer,
	ElementType_Boolean,
	ElementType_Null,
	ElementType_Double
};
|
||||||
|
}} //namespace duck::sl
|
||||||
|
|
||||||
|
#endif
|
40
src/scraplang/html_pool_base.hpp
Normal file
40
src/scraplang/html_pool_base.hpp
Normal file
|
@ -0,0 +1,40 @@
|
||||||
|
/* Copyright (C) 2015 Michele Santullo
|
||||||
|
*
|
||||||
|
* This file is part of DuckScraper.
|
||||||
|
*
|
||||||
|
* DuckScraper is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* DuckScraper is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef idDD58822D7D8B4AA7A0DD16B1CDEF413E
|
||||||
|
#define idDD58822D7D8B4AA7A0DD16B1CDEF413E
|
||||||
|
|
||||||
|
#include "implem/ResourcePool.hpp"
|
||||||
|
#include "kakoune/safe_ptr.hh"
|
||||||
|
#include <string_view>
|
||||||
|
|
||||||
|
namespace duck { namespace sl {
|
||||||
|
namespace implem {
|
||||||
|
typedef duckutil::ResourcePool<std::string, std::string_view> HtmlPoolBase;
|
||||||
|
} //namespace implem
|
||||||
|
|
||||||
|
//Resource pool of std::string resources, named by std::string and loaded
//from std::string_view objects, that is also a Kakoune::SafeCountable so
//that it can be held through Kakoune::SafePtr.
//It adds no members of its own: constructors and assignment operators
//are the ones of the underlying pool.
class HtmlPoolBase : public implem::HtmlPoolBase, public Kakoune::SafeCountable {
public:
	using implem::HtmlPoolBase::HtmlPoolBase;
	using implem::HtmlPoolBase::operator=;
};
|
||||||
|
|
||||||
|
typedef Kakoune::SafePtr<HtmlPoolBase> HtmlPoolBaseSP;
|
||||||
|
}} //namespace duck::sl
|
||||||
|
|
||||||
|
#endif
|
121
src/scraplang/implem/ResourcePool.hpp
Normal file
121
src/scraplang/implem/ResourcePool.hpp
Normal file
|
@ -0,0 +1,121 @@
|
||||||
|
/* Copyright (C) 2015 Michele Santullo
|
||||||
|
*
|
||||||
|
* This file is part of DuckScraper.
|
||||||
|
*
|
||||||
|
* DuckScraper is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* DuckScraper is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef id1A180A0568E84FD88D57FAB82C69600E
|
||||||
|
#define id1A180A0568E84FD88D57FAB82C69600E
|
||||||
|
|
||||||
|
#include "SaltedIndex.hpp"
|
||||||
|
#include <map>
|
||||||
|
#include <cstdint>
|
||||||
|
#include <cassert>
|
||||||
|
#include <type_traits>
|
||||||
|
#include <vector>
|
||||||
|
#include <algorithm>
|
||||||
|
|
||||||
|
namespace duckutil {
|
||||||
|
namespace Implem {
|
||||||
|
//Bookkeeping record for one pooled resource: pointers to the resource
//and to its name, a reference count and the resource id.
//The destructor only asserts that the reference count is back to zero;
//nothing in this class deletes the resource or the name.
//Comparison operators compare the names only.
//NOTE(review): the reference count and the id are stored in 16 bits even
//though IDType can be wider, so ids above 65535 would not survive the
//round trip through GetResourceID() - confirm this limit is acceptable.
template <typename Res, typename Name, typename IDT>
class ResourceResNameWrapper {
public:
	typedef IDT IDType;

	ResourceResNameWrapper ( const Name* parName, Res* parRes, IDType parID );
	ResourceResNameWrapper ( const ResourceResNameWrapper& ) = delete;
	~ResourceResNameWrapper ( void ) { assert(0 == m_refcount); }

	ResourceResNameWrapper& operator= (const ResourceResNameWrapper&) = delete;

	Res& GetResource ( void ) { return *m_resource; }
	const Res& GetResource ( void ) const { return *m_resource; }
	const Name& GetName ( void ) const { return *m_name; }
	void Retain ( void ) { ++m_refcount; }
	//Returns true when the last reference has just been released
	bool Release ( void ) { assert(m_refcount > 0); --m_refcount; return (0 == m_refcount); }
	uint32_t GetRefCount ( void ) const { return m_refcount; }
	uint32_t GetResourceID ( void ) const { return m_resId; }
	bool IsEmpty ( void ) const { return nullptr == m_resource; }
	//Forgets resource and name and zeroes both the count and the id
	void Reset ( void ) { m_resource = nullptr; m_name = nullptr; m_refcount = m_resId = 0; }
	//Zeroes the reference count without touching the resource
	void DropRefCount ( void ) { m_refcount = 0; }

	bool operator== ( const ResourceResNameWrapper& parOther ) const { return (GetName() == parOther.GetName()); }
	bool operator!= ( const ResourceResNameWrapper& parOther ) const { return not (GetName() == parOther.GetName()); }
	bool operator< ( const ResourceResNameWrapper& parOther ) const { return (GetName() < parOther.GetName()); }
	bool operator> ( const ResourceResNameWrapper& parOther ) const { return (parOther.GetName() < GetName()); }
	bool operator>= ( const ResourceResNameWrapper& parOther ) const { return not (GetName() < parOther.GetName()); }
	bool operator<= ( const ResourceResNameWrapper& parOther ) const { return not (parOther.GetName() < GetName()); }

private:
	Res* m_resource;
	const Name* m_name;
	uint16_t m_refcount;
	uint16_t m_resId;
};
|
||||||
|
} //namespace Implem
|
||||||
|
|
||||||
|
template <typename Res, typename Name, typename Object=Name>
|
||||||
|
class ResourcePool {
|
||||||
|
public:
|
||||||
|
typedef uint32_t IDType;
|
||||||
|
typedef Name ResourceNameType;
|
||||||
|
private:
|
||||||
|
typedef Implem::ResourceResNameWrapper<Res, Name, IDType> ResourceWrapperType;
|
||||||
|
typedef std::map<Name, ResourceWrapperType*> ResourceMapType;
|
||||||
|
typedef std::vector<ResourceWrapperType*> ResourceVectorType;
|
||||||
|
protected:
|
||||||
|
typedef typename std::conditional<std::is_fundamental<ResourceNameType>::value, ResourceNameType, const ResourceNameType&>::type ResourceNameParamType;
|
||||||
|
typedef typename std::conditional<std::is_fundamental<Object>::value, Object, const Object&>::type ResourceObjectParameterType;
|
||||||
|
public:
|
||||||
|
typedef Res ResourceType;
|
||||||
|
typedef Object ResourceObjectType;
|
||||||
|
|
||||||
|
ResourcePool ( void ) = default;
|
||||||
|
ResourcePool ( const ResourcePool& ) = delete;
|
||||||
|
virtual ~ResourcePool ( void ) = default;
|
||||||
|
|
||||||
|
ResourcePool& operator= (const ResourcePool&) = delete;
|
||||||
|
|
||||||
|
ResourceType* GetByName ( ResourceNameParamType parName );
|
||||||
|
const ResourceType* GetByName ( ResourceNameParamType parName ) const;
|
||||||
|
IDType GetOrAdd ( ResourceObjectParameterType parObjectName );
|
||||||
|
ResourceType* GetByID ( IDType parID );
|
||||||
|
const ResourceType* GetByID ( IDType parID ) const;
|
||||||
|
bool IsEmpty ( void ) const;
|
||||||
|
|
||||||
|
IDType AddResource ( ResourceObjectParameterType parRes );
|
||||||
|
void ReleaseResource ( IDType parRes );
|
||||||
|
void ReleaseResourceByName ( ResourceNameParamType parName );
|
||||||
|
|
||||||
|
void Dispose ( void ) noexcept;
|
||||||
|
|
||||||
|
protected:
|
||||||
|
void Dispose_IgnoreReferenceCount ( void );
|
||||||
|
|
||||||
|
virtual ResourceType* OnResourceLoad ( ResourceObjectParameterType parRes ) = 0;
|
||||||
|
virtual void OnResourceDestroy ( ResourceNameParamType parName, ResourceType* parRes ) noexcept = 0;
|
||||||
|
virtual ResourceNameType GetResourceNameFromResourceObject ( ResourceObjectParameterType parRes ) = 0;
|
||||||
|
|
||||||
|
private:
|
||||||
|
bool ReleaseResource ( typename ResourceMapType::iterator& parMapRes, typename ResourceVectorType::iterator& parVecRes );
|
||||||
|
|
||||||
|
ResourceMapType m_mapContainer; //For accesses by name
|
||||||
|
ResourceVectorType m_linearContainer; //For accesses by index
|
||||||
|
};
|
||||||
|
} //namespace duckutil
|
||||||
|
|
||||||
|
#include "ResourcePool.inl"
|
||||||
|
|
||||||
|
#endif
|
243
src/scraplang/implem/ResourcePool.inl
Normal file
243
src/scraplang/implem/ResourcePool.inl
Normal file
|
@ -0,0 +1,243 @@
|
||||||
|
/* Copyright (C) 2015 Michele Santullo
|
||||||
|
*
|
||||||
|
* This file is part of DuckScraper.
|
||||||
|
*
|
||||||
|
* DuckScraper is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* DuckScraper is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
namespace duckutil {
|
||||||
|
namespace Implem {
|
||||||
|
///---------------------------------------------------------------------
|
||||||
|
///---------------------------------------------------------------------
|
||||||
|
template <typename Res, typename Name, typename IDT>
|
||||||
|
ResourceResNameWrapper<Res, Name, IDT>::ResourceResNameWrapper (const Name* parName, Res* parRes, IDType parID) {
|
||||||
|
assert(nullptr != parRes);
|
||||||
|
assert(nullptr != parName);
|
||||||
|
|
||||||
|
m_resource = parRes;
|
||||||
|
m_name = parName;
|
||||||
|
m_refcount = 0;
|
||||||
|
m_resId = static_cast<uint16_t>(parID);
|
||||||
|
}
|
||||||
|
|
||||||
|
///---------------------------------------------------------------------
|
||||||
|
///---------------------------------------------------------------------
|
||||||
|
//Shrinks parVector so that it no longer ends with null items. Nulls that
//come before the last non-null item are left where they are; a vector
//made of nulls only ends up empty.
template <typename V>
inline void TrimTrailingNulls (V& parVector) {
	typedef typename V::value_type ValueType;

	//Walk backwards up to the first non-null item: the distance covered
	//is the number of trailing nulls. A lambda is used in place of
	//std::bind1st, which is no longer part of the standard library as
	//of C++17.
	const std::size_t nullsCount = static_cast<std::size_t>(
		std::find_if(
			parVector.rbegin(),
			parVector.rend(),
			[](const ValueType& parItem) { return nullptr != parItem; }
		) - parVector.rbegin()
	);
	assert(nullsCount <= parVector.size());
	if (nullsCount) {
		assert(nullptr == parVector.back());
		parVector.resize(parVector.size() - nullsCount);
	}
}
|
||||||
|
} //namespace Implem
|
||||||
|
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
template <typename Res, typename Name, typename Object>
|
||||||
|
typename ResourcePool<Res, Name, Object>::IDType ResourcePool<Res, Name, Object>::GetOrAdd (ResourceObjectParameterType parObjectName) {
|
||||||
|
const ResourceNameType name = GetResourceNameFromResourceObject(parObjectName);
|
||||||
|
typename ResourceMapType::const_iterator itFind = m_mapContainer.find(name);
|
||||||
|
IDType retVal;
|
||||||
|
if (m_mapContainer.end() == itFind) {
|
||||||
|
retVal = AddResource(parObjectName);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
typename ResourceVectorType::const_iterator itVecFind = std::find(m_linearContainer.begin(), m_linearContainer.end(), itFind->second);
|
||||||
|
assert(m_linearContainer.end() != itVecFind);
|
||||||
|
retVal = static_cast<IDType>(itVecFind - m_linearContainer.begin() + 1);
|
||||||
|
}
|
||||||
|
return retVal;
|
||||||
|
}
|
||||||
|
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
template <typename Res, typename Name, typename Object>
|
||||||
|
typename ResourcePool<Res, Name, Object>::ResourceType* ResourcePool<Res, Name, Object>::GetByName (ResourceNameParamType parName) {
|
||||||
|
typename ResourceMapType::iterator itFind = m_mapContainer.find(parName);
|
||||||
|
if (m_mapContainer.end() == itFind)
|
||||||
|
return nullptr;
|
||||||
|
else
|
||||||
|
return &itFind->second->GetResource();
|
||||||
|
}
|
||||||
|
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
template <typename Res, typename Name, typename Object>
|
||||||
|
const typename ResourcePool<Res, Name, Object>::ResourceType* ResourcePool<Res, Name, Object>::GetByName (ResourceNameParamType parName) const {
|
||||||
|
typename ResourceMapType::const_iterator itFind = m_mapContainer.find(parName);
|
||||||
|
if (m_mapContainer.end() == itFind)
|
||||||
|
return nullptr;
|
||||||
|
else
|
||||||
|
return &itFind->second->GetResource();
|
||||||
|
}
|
||||||
|
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
template <typename Res, typename Name, typename Object>
|
||||||
|
typename ResourcePool<Res, Name, Object>::ResourceType* ResourcePool<Res, Name, Object>::GetByID (IDType parID) {
|
||||||
|
assert(parID > 0);
|
||||||
|
if (0 == parID)
|
||||||
|
return nullptr;
|
||||||
|
|
||||||
|
const auto index = static_cast<std::size_t>(parID - 1);
|
||||||
|
if (index < m_linearContainer.size()) {
|
||||||
|
ResourceWrapperType* res = m_linearContainer[index];
|
||||||
|
return &res->GetResource();
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
template <typename Res, typename Name, typename Object>
|
||||||
|
const typename ResourcePool<Res, Name, Object>::ResourceType* ResourcePool<Res, Name, Object>::GetByID (IDType parID) const {
|
||||||
|
assert(parID > 0);
|
||||||
|
if (0 == parID)
|
||||||
|
return nullptr;
|
||||||
|
|
||||||
|
const auto index = static_cast<std::size_t>(parID - 1);
|
||||||
|
if (index < m_linearContainer.size()) {
|
||||||
|
ResourceWrapperType* res = m_linearContainer[index];
|
||||||
|
return &res->GetResource();
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
template <typename Res, typename Name, typename Object>
|
||||||
|
typename ResourcePool<Res, Name, Object>::IDType ResourcePool<Res, Name, Object>::AddResource (ResourceObjectParameterType parRes) {
|
||||||
|
const ResourceNameType name = GetResourceNameFromResourceObject(parRes);
|
||||||
|
typename ResourceMapType::iterator itPreExisting = m_mapContainer.find(name);
|
||||||
|
if (m_mapContainer.end() != itPreExisting) {
|
||||||
|
// if (itPreExisting->IsEmpty()) {
|
||||||
|
// OnResourceReload(name);
|
||||||
|
assert(nullptr != itPreExisting->second);
|
||||||
|
assert(not itPreExisting->second->IsEmpty());
|
||||||
|
itPreExisting->second->Retain();
|
||||||
|
return itPreExisting->second->GetResourceID();
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
assert(m_mapContainer.end() == m_mapContainer.find(name));
|
||||||
|
ResourceType* const newRes = OnResourceLoad(parRes);
|
||||||
|
if (newRes) {
|
||||||
|
std::pair<typename ResourceMapType::iterator, bool> newIt = m_mapContainer.insert(std::pair<ResourceNameType, ResourceWrapperType*>(name, nullptr));
|
||||||
|
|
||||||
|
IDType newID = static_cast<IDType>(m_linearContainer.size() + 1);
|
||||||
|
ResourceWrapperType* const newWrapper = new ResourceWrapperType(&newIt.first->first, newRes, newID);
|
||||||
|
assert(nullptr != newWrapper);
|
||||||
|
m_linearContainer.push_back(newWrapper);
|
||||||
|
newIt.first->second = newWrapper;
|
||||||
|
newWrapper->Retain();
|
||||||
|
return newID;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
template <typename Res, typename Name, typename Object>
|
||||||
|
void ResourcePool<Res, Name, Object>::ReleaseResource (IDType parRes) {
|
||||||
|
assert(parRes > 0);
|
||||||
|
assert(static_cast<std::size_t>(parRes) <= m_linearContainer.size());
|
||||||
|
|
||||||
|
typename ResourceVectorType::iterator rele = m_linearContainer.begin() + (parRes - 1);
|
||||||
|
assert(nullptr != *rele);
|
||||||
|
assert(rele - m_linearContainer.begin() == static_cast<int>(parRes - 1));
|
||||||
|
|
||||||
|
typename ResourceMapType::iterator relemap = m_mapContainer.find((*rele)->GetName());
|
||||||
|
assert(m_mapContainer.end() != relemap);
|
||||||
|
|
||||||
|
if (ReleaseResource(relemap, rele)) {
|
||||||
|
delete relemap->second;
|
||||||
|
m_mapContainer.erase(relemap);
|
||||||
|
Implem::TrimTrailingNulls(m_linearContainer);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
template <typename Res, typename Name, typename Object>
|
||||||
|
void ResourcePool<Res, Name, Object>::ReleaseResourceByName (ResourceNameParamType parName) {
|
||||||
|
typename ResourceMapType::iterator rele = m_mapContainer.find(parName);
|
||||||
|
assert(m_mapContainer.end() != rele);
|
||||||
|
|
||||||
|
const IDType resId = rele->second->GetResourceID();
|
||||||
|
assert(static_cast<std::size_t>(resId) <= m_linearContainer.size());
|
||||||
|
assert(resId > 0);
|
||||||
|
|
||||||
|
if (ReleaseResource(rele, m_linearContainer.begin() + (resId - 1))) {
|
||||||
|
delete rele->second;
|
||||||
|
m_mapContainer.erase(rele);
|
||||||
|
Implem::TrimTrailingNulls(m_linearContainer);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
template <typename Res, typename Name, typename Object>
|
||||||
|
bool ResourcePool<Res, Name, Object>::ReleaseResource (typename ResourceMapType::iterator& parMapRes, typename ResourceVectorType::iterator& parVecRes) {
|
||||||
|
assert(parMapRes->second == *parVecRes);
|
||||||
|
assert(nullptr != *parVecRes);
|
||||||
|
ResourceWrapperType& currRes = **parVecRes;
|
||||||
|
|
||||||
|
assert(not currRes.IsEmpty());
|
||||||
|
if (not currRes.IsEmpty()) {
|
||||||
|
if (currRes.Release()) {
|
||||||
|
this->OnResourceDestroy(currRes.GetName(), &currRes.GetResource());
|
||||||
|
currRes.Reset();
|
||||||
|
*parVecRes = nullptr;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
template <typename Res, typename Name, typename Object>
|
||||||
|
bool ResourcePool<Res, Name, Object>::IsEmpty() const {
|
||||||
|
return m_mapContainer.empty() and m_linearContainer.empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
template <typename Res, typename Name, typename Object>
|
||||||
|
void ResourcePool<Res, Name, Object>::Dispose() noexcept {
|
||||||
|
for (auto& currItem : m_linearContainer) {
|
||||||
|
if (nullptr != currItem)
|
||||||
|
this->OnResourceDestroy(currItem->GetName(), &currItem->GetResource());
|
||||||
|
delete currItem;
|
||||||
|
}
|
||||||
|
m_linearContainer.clear();
|
||||||
|
m_mapContainer.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
template <typename Res, typename Name, typename Object>
|
||||||
|
void ResourcePool<Res, Name, Object>::Dispose_IgnoreReferenceCount() {
|
||||||
|
for (auto& currItem : m_linearContainer) {
|
||||||
|
currItem->DropRefCount();
|
||||||
|
}
|
||||||
|
Dispose();
|
||||||
|
}
|
||||||
|
} //namespace duckcore
|
68
src/scraplang/implem/SaltedIndex.hpp
Normal file
68
src/scraplang/implem/SaltedIndex.hpp
Normal file
|
@ -0,0 +1,68 @@
|
||||||
|
/* Copyright (C) 2015 Michele Santullo
|
||||||
|
*
|
||||||
|
* This file is part of DuckScraper.
|
||||||
|
*
|
||||||
|
* DuckScraper is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* DuckScraper is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef id8D3B62D447574A23A82F8E9C60A629BD
|
||||||
|
#define id8D3B62D447574A23A82F8E9C60A629BD
|
||||||
|
|
||||||
|
#include <cstddef>
|
||||||
|
|
||||||
|
namespace duckutil {
	/// An index and a salt packed together into a single value of type T.
	///
	/// The low IndexBitSize bits hold the index and the remaining SaltBitSize
	/// bits hold the salt (see the bit-fields in the private union); the two
	/// sizes must add up to exactly the width of T. The combined value can be
	/// read back with GetSaltedIndex() and is what the comparison operators
	/// use.
	template <typename T, std::size_t IndexBitSize, std::size_t SaltBitSize=sizeof(T)*8-IndexBitSize>
	class SaltedIndex {
		static_assert((SaltBitSize + IndexBitSize) == sizeof(T) * 8, "Type size is too small");
		static_assert(SaltBitSize > 0, "Invalid salt size");
		static_assert(IndexBitSize > 0, "Invalid index size");
	public:
		enum {
			SaltSize = SaltBitSize,
			IndexSize = IndexBitSize,
			// The masks are computed in unsigned long long rather than int:
			// `1 << N` overflows int as soon as a field is 31 bits or wider,
			// which made the class unusable with such layouts.
			MaxSalt = (1ULL << SaltBitSize) - 1,
			MaxIndex = (1ULL << IndexBitSize) - 1
		};

		SaltedIndex ( void );
		SaltedIndex ( const SaltedIndex& parOther );
		explicit SaltedIndex ( T parIndex );
		SaltedIndex ( T parSalt, T parIndex );
		~SaltedIndex ( void );

		/// Salt part only.
		T GetSaltOnly ( void ) const { return m_salt; }
		/// Index part only.
		T GetIndexOnly ( void ) const { return m_index; }
		/// Whole packed value (salt and index together).
		T GetSaltedIndex ( void ) const { return m_saltedIndex; }
		void SetSalt ( T parSalt );
		void SetIndex ( T parIndex );
		T IncreaseSalt ( void );

		// All comparisons are made on the packed value.
		bool operator== ( const SaltedIndex& parOther ) const { return GetSaltedIndex() == parOther.GetSaltedIndex(); }
		bool operator!= ( const SaltedIndex& parOther ) const { return GetSaltedIndex() != parOther.GetSaltedIndex(); }
		bool operator< ( const SaltedIndex& parOther ) const { return GetSaltedIndex() < parOther.GetSaltedIndex(); }

	private:
		// The bit-fields and m_saltedIndex share the same storage.
		union {
			struct {
				T m_index : IndexBitSize;
				T m_salt : SaltBitSize;
			};
			T m_saltedIndex;
		};
	};
} //namespace duckutil
|
||||||
|
|
||||||
|
#include "SaltedIndex.inl"
|
||||||
|
|
||||||
|
#endif
|
86
src/scraplang/implem/SaltedIndex.inl
Normal file
86
src/scraplang/implem/SaltedIndex.inl
Normal file
|
@ -0,0 +1,86 @@
|
||||||
|
/* Copyright (C) 2015 Michele Santullo
|
||||||
|
*
|
||||||
|
* This file is part of DuckScraper.
|
||||||
|
*
|
||||||
|
* DuckScraper is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* DuckScraper is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
namespace duckutil {
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
template <typename T, std::size_t IndexBitSize, std::size_t SaltBitSize>
|
||||||
|
SaltedIndex<T, IndexBitSize, SaltBitSize>::SaltedIndex() :
|
||||||
|
m_saltedIndex(0)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
template <typename T, std::size_t IndexBitSize, std::size_t SaltBitSize>
|
||||||
|
SaltedIndex<T, IndexBitSize, SaltBitSize>::SaltedIndex (const SaltedIndex& parOther) :
|
||||||
|
m_saltedIndex(parOther.GetSaltedIndex())
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
template <typename T, std::size_t IndexBitSize, std::size_t SaltBitSize>
|
||||||
|
SaltedIndex<T, IndexBitSize, SaltBitSize>::SaltedIndex (T parIndex) :
|
||||||
|
m_saltedIndex(parIndex)
|
||||||
|
{
|
||||||
|
Assert(m_saltedIndex <= MaxIndex);
|
||||||
|
Assert(m_salt == 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
template <typename T, std::size_t IndexBitSize, std::size_t SaltBitSize>
|
||||||
|
SaltedIndex<T, IndexBitSize, SaltBitSize>::SaltedIndex (T parSalt, T parIndex) :
|
||||||
|
m_index(parIndex),
|
||||||
|
m_salt(parSalt)
|
||||||
|
{
|
||||||
|
Assert(parSalt <= MaxSalt);
|
||||||
|
Assert(parIndex <= MaxIndex);
|
||||||
|
}
|
||||||
|
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
template <typename T, std::size_t IndexBitSize, std::size_t SaltBitSize>
|
||||||
|
SaltedIndex<T, IndexBitSize, SaltBitSize>::~SaltedIndex() {
|
||||||
|
}
|
||||||
|
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
template <typename T, std::size_t IndexBitSize, std::size_t SaltBitSize>
|
||||||
|
void SaltedIndex<T, IndexBitSize, SaltBitSize>::SetSalt (T parSalt) {
|
||||||
|
Assert(parSalt <= MaxSalt);
|
||||||
|
m_salt = parSalt;
|
||||||
|
}
|
||||||
|
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
template <typename T, std::size_t IndexBitSize, std::size_t SaltBitSize>
|
||||||
|
void SaltedIndex<T, IndexBitSize, SaltBitSize>::SetIndex (T parIndex) {
|
||||||
|
Assert(parIndex <= MaxIndex);
|
||||||
|
m_index = parIndex;
|
||||||
|
}
|
||||||
|
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
///-------------------------------------------------------------------------
|
||||||
|
template <typename T, std::size_t IndexBitSize, std::size_t SaltBitSize>
|
||||||
|
T SaltedIndex<T, IndexBitSize, SaltBitSize>::IncreaseSalt() {
|
||||||
|
Assert(m_salt < MaxSalt);
|
||||||
|
++m_salt;
|
||||||
|
}
|
||||||
|
} //namespace duckutil
|
90
src/scraplang/parse.cpp
Normal file
90
src/scraplang/parse.cpp
Normal file
|
@ -0,0 +1,90 @@
|
||||||
|
/* Copyright (C) 2017 Michele Santullo
|
||||||
|
*
|
||||||
|
* This file is part of DuckScraper.
|
||||||
|
*
|
||||||
|
* DuckScraper is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* DuckScraper is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "parse.hpp"
|
||||||
|
#include "scraplang/parse_exports.hpp"
|
||||||
|
#include "scraplang/scrapgrammar.hpp"
|
||||||
|
#include "scraplang/element_def.hpp"
|
||||||
|
#include <boost/spirit/include/qi.hpp>
|
||||||
|
#include <boost/spirit/include/phoenix_stl.hpp>
|
||||||
|
#include <boost/spirit/include/phoenix_fusion.hpp>
|
||||||
|
#include <boost/fusion/adapted/struct.hpp>
|
||||||
|
#include <boost/fusion/adapted/std_pair.hpp>
|
||||||
|
#include <utility>
|
||||||
|
#if !defined(NDEBUG)
|
||||||
|
# include <iostream>
|
||||||
|
#endif
|
||||||
|
#include <stdexcept>
|
||||||
|
|
||||||
|
namespace sp = boost::spirit;
|
||||||
|
|
||||||
|
BOOST_FUSION_ADAPT_STRUCT(
|
||||||
|
duck::sl::SourceInfo,
|
||||||
|
(std::string, value)
|
||||||
|
(duck::sl::SourceInfo::Type, type)
|
||||||
|
)
|
||||||
|
BOOST_FUSION_ADAPT_STRUCT(
|
||||||
|
duck::sl::FromBlock,
|
||||||
|
(duck::sl::SourceInfo, source)
|
||||||
|
(std::vector<duck::sl::StructItem>, xpaths)
|
||||||
|
)
|
||||||
|
BOOST_FUSION_ADAPT_STRUCT(
|
||||||
|
duck::sl::StructBlock,
|
||||||
|
(std::string, name)
|
||||||
|
(std::vector<duck::sl::StructItem>, xpaths)
|
||||||
|
)
|
||||||
|
BOOST_FUSION_ADAPT_STRUCT(
|
||||||
|
duck::sl::ApplyBlock,
|
||||||
|
(std::string, mustache_model)
|
||||||
|
(duck::sl::SourceInfo, source)
|
||||||
|
(std::vector<duck::sl::StructItem>, xpaths)
|
||||||
|
)
|
||||||
|
BOOST_FUSION_ADAPT_STRUCT(
|
||||||
|
duck::sl::MustacheBlock,
|
||||||
|
(std::string, name)
|
||||||
|
(std::string, content)
|
||||||
|
)
|
||||||
|
BOOST_FUSION_ADAPT_STRUCT(
|
||||||
|
duck::sl::XPathElement,
|
||||||
|
(std::string, name)
|
||||||
|
(std::optional<std::string>, def_val)
|
||||||
|
(std::string, xpath)
|
||||||
|
)
|
||||||
|
|
||||||
|
namespace duck { namespace sl {
|
||||||
|
typedef kamokan::IniCommentSkipper<std::string_view::const_iterator> skipper_type;
|
||||||
|
|
||||||
|
std::vector<ScrapNode> parse (std::string_view parData) {
|
||||||
|
ScrapGrammar<std::string_view::const_iterator, skipper_type> gramm;
|
||||||
|
auto it_start = parData.cbegin();
|
||||||
|
|
||||||
|
std::vector<ScrapNode> retval;
|
||||||
|
const bool ok = qi::phrase_parse(it_start, parData.cend(), gramm, skipper_type(), retval);
|
||||||
|
|
||||||
|
std::cout << "parse ok: " << std::boolalpha << ok << '\n';
|
||||||
|
std::cout << "end == it: " << std::boolalpha << (parData.cend() == it_start) << '\n';
|
||||||
|
std::cout << "begin == it: " << std::boolalpha << (parData.cbegin() == it_start) << '\n';
|
||||||
|
std::cout << "parse distance: " << std::distance(parData.cbegin(), it_start) << '\n';
|
||||||
|
std::cout << "all distance: " << std::distance(parData.cbegin(), parData.cend()) << " (size: " << parData.size() << ")\n";
|
||||||
|
|
||||||
|
if (parData.cend() != it_start or not ok) {
|
||||||
|
throw std::runtime_error("Error parsing input script");
|
||||||
|
}
|
||||||
|
return retval;
|
||||||
|
}
|
||||||
|
}} //namespace duck::sl
|
29
src/scraplang/parse.hpp
Normal file
29
src/scraplang/parse.hpp
Normal file
|
@ -0,0 +1,29 @@
|
||||||
|
/* Copyright (C) 2017 Michele Santullo
|
||||||
|
*
|
||||||
|
* This file is part of DuckScraper.
|
||||||
|
*
|
||||||
|
* DuckScraper is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* DuckScraper is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef idBE96C2D49C4C413888A79EAEB2B9C0FA
|
||||||
|
#define idBE96C2D49C4C413888A79EAEB2B9C0FA
|
||||||
|
|
||||||
|
#include "scrap_node.hpp"
|
||||||
|
#include <string_view>
|
||||||
|
|
||||||
|
namespace duck { namespace sl {
|
||||||
|
std::vector<ScrapNode> parse ( std::string_view parData );
|
||||||
|
}} //namespace duck::sl
|
||||||
|
|
||||||
|
#endif
|
51
src/scraplang/parse_exports.cpp
Normal file
51
src/scraplang/parse_exports.cpp
Normal file
|
@ -0,0 +1,51 @@
|
||||||
|
/* Copyright (C) 2017-2020 Michele Santullo
|
||||||
|
*
|
||||||
|
* This file is part of DuckScraper.
|
||||||
|
*
|
||||||
|
* DuckScraper is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* DuckScraper is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "scraplang/parse_exports.hpp"
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
template class boost::spirit::qi::grammar<std::basic_string<char>::const_iterator, boost::spirit::qi::ascii::blank_type>;
|
||||||
|
|
||||||
|
template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::ApplyBlock(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::FromBlock(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::MustacheBlock(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::SourceInfo(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
template class boost::spirit::qi::rule<std::basic_string<char>, std::string(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
template class boost::spirit::qi::rule<std::basic_string<char>, std::vector<duck::sl::ScrapNode>(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
template class boost::spirit::qi::rule<std::basic_string<char>, std::vector<duck::sl::StructItem>(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::StructBlock(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::XPathElement(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
|
||||||
|
template bool boost::spirit::qi::phrase_parse<
|
||||||
|
std::basic_string<char>::const_iterator,
|
||||||
|
duck::sl::ScrapGrammar<std::basic_string<char>::const_iterator, boost::spirit::qi::ascii::blank_type>,
|
||||||
|
boost::spirit::ascii::blank_type,
|
||||||
|
std::vector<duck::sl::ScrapNode>
|
||||||
|
> (
|
||||||
|
std::basic_string<char>::const_iterator&,
|
||||||
|
std::basic_string<char>::const_iterator,
|
||||||
|
duck::sl::ScrapGrammar<
|
||||||
|
std::basic_string<char>::const_iterator,
|
||||||
|
boost::spirit::qi::ascii::blank_type
|
||||||
|
> const&,
|
||||||
|
boost::spirit::ascii::blank_type const&,
|
||||||
|
std::vector<duck::sl::ScrapNode>&
|
||||||
|
);
|
||||||
|
|
||||||
|
template struct boost::spirit::use_operator<boost::spirit::qi::domain, boost::proto::tag::shift_right>;
|
54
src/scraplang/parse_exports.hpp
Normal file
54
src/scraplang/parse_exports.hpp
Normal file
|
@ -0,0 +1,54 @@
|
||||||
|
/* Copyright (C) 2017-2020 Michele Santullo
|
||||||
|
*
|
||||||
|
* This file is part of DuckScraper.
|
||||||
|
*
|
||||||
|
* DuckScraper is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* DuckScraper is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "scraplang/scrap_node.hpp"
|
||||||
|
#include "scraplang/scrapgrammar.hpp"
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
extern template class boost::spirit::qi::grammar<std::basic_string<char>::const_iterator, boost::spirit::qi::ascii::blank_type>;
|
||||||
|
|
||||||
|
extern template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::ApplyBlock(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
extern template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::FromBlock(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
extern template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::MustacheBlock(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
extern template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::SourceInfo(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
extern template class boost::spirit::qi::rule<std::basic_string<char>, std::string(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
extern template class boost::spirit::qi::rule<std::basic_string<char>, std::vector<duck::sl::ScrapNode>(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
extern template class boost::spirit::qi::rule<std::basic_string<char>, std::vector<duck::sl::StructItem>(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
extern template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::StructBlock(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
extern template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::XPathElement(), boost::spirit::qi::ascii::blank_type>;
|
||||||
|
|
||||||
|
extern template bool boost::spirit::qi::phrase_parse<
|
||||||
|
std::basic_string<char>::const_iterator,
|
||||||
|
duck::sl::ScrapGrammar<std::basic_string<char>::const_iterator, boost::spirit::qi::ascii::blank_type>,
|
||||||
|
boost::spirit::ascii::blank_type,
|
||||||
|
std::vector<duck::sl::ScrapNode>
|
||||||
|
> (
|
||||||
|
std::basic_string<char>::const_iterator&,
|
||||||
|
std::basic_string<char>::const_iterator,
|
||||||
|
duck::sl::ScrapGrammar<
|
||||||
|
std::basic_string<char>::const_iterator,
|
||||||
|
boost::spirit::qi::ascii::blank_type
|
||||||
|
> const&,
|
||||||
|
boost::spirit::ascii::blank_type const&,
|
||||||
|
std::vector<duck::sl::ScrapNode>&
|
||||||
|
);
|
||||||
|
|
||||||
|
extern template struct boost::spirit::use_operator<boost::spirit::qi::domain, boost::proto::tag::shift_right>;
|
92
src/scraplang/scrap_node.hpp
Normal file
92
src/scraplang/scrap_node.hpp
Normal file
|
@ -0,0 +1,92 @@
|
||||||
|
/* Copyright (C) 2015 Michele Santullo
|
||||||
|
*
|
||||||
|
* This file is part of DuckScraper.
|
||||||
|
*
|
||||||
|
* DuckScraper is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* DuckScraper is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef id9919CCB09DDD429C8128632F13D370ED
|
||||||
|
#define id9919CCB09DDD429C8128632F13D370ED
|
||||||
|
|
||||||
|
//#include "element_def.hpp"
|
||||||
|
#include <boost/spirit/include/support_extended_variant.hpp>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
#include <map>
|
||||||
|
#include <optional>
|
||||||
|
#include <utility>
|
||||||
|
|
||||||
|
namespace duck { namespace sl {
|
||||||
|
struct XPathElement {
|
||||||
|
std::string name;
|
||||||
|
std::optional<std::string> def_val;
|
||||||
|
std::string xpath;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct StructBlock;
|
||||||
|
|
||||||
|
struct StructItem : boost::spirit::extended_variant<
|
||||||
|
XPathElement,
|
||||||
|
boost::recursive_wrapper<StructBlock>
|
||||||
|
> {
|
||||||
|
StructItem() : base_type() {}
|
||||||
|
StructItem (const XPathElement& value) : base_type(value) {}
|
||||||
|
StructItem (const StructBlock& value) : base_type(value) {}
|
||||||
|
using base_type::operator=;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct StructBlock {
|
||||||
|
std::string name;
|
||||||
|
std::vector<StructItem> xpaths;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct SourceInfo {
|
||||||
|
enum Type { URL, Token };
|
||||||
|
|
||||||
|
std::string value;
|
||||||
|
Type type;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct FromBlock {
|
||||||
|
SourceInfo source;
|
||||||
|
std::vector<StructItem> xpaths;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ApplyBlock {
|
||||||
|
std::string mustache_model;
|
||||||
|
SourceInfo source;
|
||||||
|
std::vector<StructItem> xpaths;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct MustacheBlock {
|
||||||
|
std::string name;
|
||||||
|
std::string content;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ScrapNode : boost::spirit::extended_variant<
|
||||||
|
boost::recursive_wrapper<std::vector<ScrapNode>>,
|
||||||
|
FromBlock,
|
||||||
|
ApplyBlock,
|
||||||
|
MustacheBlock
|
||||||
|
> {
|
||||||
|
ScrapNode() : base_type() {}
|
||||||
|
ScrapNode (const std::vector<ScrapNode>& value) : base_type(value) {}
|
||||||
|
ScrapNode (const FromBlock& value) : base_type(value) {}
|
||||||
|
ScrapNode (const ApplyBlock& value) : base_type(value) {}
|
||||||
|
ScrapNode (const MustacheBlock& value) : base_type(value) {}
|
||||||
|
using base_type::operator=;
|
||||||
|
};
|
||||||
|
}} //namespace duck::sl
|
||||||
|
|
||||||
|
#endif
|
|
@ -1,75 +0,0 @@
|
||||||
/* Copyright (C) 2015 Michele Santullo
|
|
||||||
*
|
|
||||||
* This file is part of DuckScraper.
|
|
||||||
*
|
|
||||||
* DuckScraper is free software: you can redistribute it and/or modify
|
|
||||||
* it under the terms of the GNU General Public License as published by
|
|
||||||
* the Free Software Foundation, either version 3 of the License, or
|
|
||||||
* (at your option) any later version.
|
|
||||||
*
|
|
||||||
* DuckScraper is distributed in the hope that it will be useful,
|
|
||||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
* GNU General Public License for more details.
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU General Public License
|
|
||||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef id9919CCB09DDD429C8128632F13D370ED
|
|
||||||
#define id9919CCB09DDD429C8128632F13D370ED
|
|
||||||
|
|
||||||
#include "scraplang_element.hpp"
|
|
||||||
#include <boost/spirit/include/support_extended_variant.hpp>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
#include <map>
|
|
||||||
|
|
||||||
namespace duck {
|
|
||||||
struct ScrapNode;
|
|
||||||
|
|
||||||
namespace implem {
|
|
||||||
struct map;
|
|
||||||
struct array;
|
|
||||||
|
|
||||||
struct element : boost::spirit::extended_variant<
|
|
||||||
boost::recursive_wrapper<map>,
|
|
||||||
boost::recursive_wrapper<array>,
|
|
||||||
std::string,
|
|
||||||
int,
|
|
||||||
double
|
|
||||||
>
|
|
||||||
{
|
|
||||||
element ( void ) = default;
|
|
||||||
element ( const map& parOther ) : base_type(parOther) {}
|
|
||||||
element ( const array& parOther ) : base_type(parOther) {}
|
|
||||||
element ( const std::string& parOther ) : base_type(parOther) {}
|
|
||||||
element ( double parOther ) : base_type(parOther) {}
|
|
||||||
element ( int parOther ) : base_type(parOther) {}
|
|
||||||
};
|
|
||||||
|
|
||||||
struct map : std::map<std::string, element> {
|
|
||||||
};
|
|
||||||
|
|
||||||
struct array : std::vector<element> {
|
|
||||||
};
|
|
||||||
|
|
||||||
struct node_list {
|
|
||||||
std::vector<ScrapNode> nodes;
|
|
||||||
};
|
|
||||||
} //namespace implem
|
|
||||||
|
|
||||||
struct ScrapNode : boost::spirit::extended_variant<
|
|
||||||
element_def,
|
|
||||||
implem::map,
|
|
||||||
implem::node_list
|
|
||||||
>
|
|
||||||
{
|
|
||||||
ScrapNode ( void ) = default;
|
|
||||||
ScrapNode ( const element_def& parOther ) : base_type(parOther) {}
|
|
||||||
ScrapNode ( const implem::map& parOther ) : base_type(parOther) {}
|
|
||||||
ScrapNode ( const implem::node_list& parOther ) : base_type(parOther) {}
|
|
||||||
};
|
|
||||||
} //namespace duck
|
|
||||||
|
|
||||||
#endif
|
|
108
src/scraplang/scrapgrammar.hpp
Normal file
108
src/scraplang/scrapgrammar.hpp
Normal file
|
@ -0,0 +1,108 @@
|
||||||
|
/* Copyright (C) 2017-2020 Michele Santullo
|
||||||
|
*
|
||||||
|
* This file is part of DuckScraper.
|
||||||
|
*
|
||||||
|
* DuckScraper is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* DuckScraper is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <boost/spirit/include/qi.hpp>
|
||||||
|
#include <boost/spirit/include/phoenix_operator.hpp>
|
||||||
|
|
||||||
|
namespace kamokan {
|
||||||
|
template <typename Iterator>
|
||||||
|
struct IniCommentSkipper : boost::spirit::qi::grammar<Iterator> {
|
||||||
|
IniCommentSkipper() :
|
||||||
|
IniCommentSkipper::base_type(skipping),
|
||||||
|
first_char(true)
|
||||||
|
{
|
||||||
|
namespace px = boost::phoenix;
|
||||||
|
using boost::spirit::qi::blank;
|
||||||
|
using boost::spirit::qi::lit;
|
||||||
|
using boost::spirit::qi::eol;
|
||||||
|
using boost::spirit::qi::char_;
|
||||||
|
using boost::spirit::qi::eps;
|
||||||
|
|
||||||
|
skipping = comment | blank;
|
||||||
|
comment = (eps(px::cref(first_char) == true) | eol) >>
|
||||||
|
*blank >> lit("#")[px::ref(first_char) = false] >>
|
||||||
|
*(!eol >> char_);
|
||||||
|
}
|
||||||
|
|
||||||
|
boost::spirit::qi::rule<Iterator> skipping;
|
||||||
|
boost::spirit::qi::rule<Iterator> comment;
|
||||||
|
bool first_char;
|
||||||
|
};
|
||||||
|
} //namespace kamokan
|
||||||
|
|
||||||
|
namespace duck::sl {
|
||||||
|
namespace qi = ::boost::spirit::qi;
|
||||||
|
|
||||||
|
template <typename I, typename Skipper>
|
||||||
|
class ScrapGrammar : public qi::grammar<I, std::vector<ScrapNode>(), Skipper> {
|
||||||
|
public:
|
||||||
|
ScrapGrammar() : ScrapGrammar::base_type(start) {
|
||||||
|
using qi::char_;
|
||||||
|
using qi::lexeme;
|
||||||
|
using qi::alpha;
|
||||||
|
using qi::alnum;
|
||||||
|
using qi::graph;
|
||||||
|
using qi::attr;
|
||||||
|
using qi::eol;
|
||||||
|
using qi::eoi;
|
||||||
|
using qi::lit;
|
||||||
|
using qi::string;
|
||||||
|
using qi::as_string;
|
||||||
|
using qi::no_skip;
|
||||||
|
|
||||||
|
start = *eol >> (from_block | apply_block | mustache_block) % +eol >> *eol >> eoi;
|
||||||
|
from_block = lit("from") >> source_info >> +eol >> assignment_list >> +eol >> "end";
|
||||||
|
source_info = ((url | string("-")) >> attr(SourceInfo::URL)) | (mustache_like_token >> attr(SourceInfo::Token));
|
||||||
|
url = -(+alpha >> string("://")) >> alpha >> *graph;
|
||||||
|
mustache_like_token = "{{" >> identifier >> "}}";
|
||||||
|
quoted_string %= lexeme['"' >> *(char_ - '"') >> '"'];
|
||||||
|
xpath_assignment %= identifier >>
|
||||||
|
-(lit("default") >> '(' >> quoted_string >> ')') >> "=" >>
|
||||||
|
as_string[lexeme[+(graph | char_(" \t"))]];
|
||||||
|
identifier %= lexeme[(alpha | char_('_')) >> *(-char_('.') >> +(alnum | char_('_')))];
|
||||||
|
|
||||||
|
apply_block = lit("apply") >> mustache_like_token >> "to" >> source_info >> +eol >>
|
||||||
|
assignment_list >> +eol >> "end";
|
||||||
|
struct_block = "struct" >> identifier >> +eol >> assignment_list >> +eol >> "end";
|
||||||
|
|
||||||
|
mustache_block %= as_string[lit("==") >> identifier] >> eol >>
|
||||||
|
as_string[no_skip[+(!lit("==end") >> char_)]] >> "==end";
|
||||||
|
|
||||||
|
assignment_list = (xpath_assignment | struct_block) % +eol;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
template <typename F>
|
||||||
|
using RuleType = qi::rule<I, F, Skipper>;
|
||||||
|
|
||||||
|
RuleType<std::vector<ScrapNode>()> start;
|
||||||
|
RuleType<FromBlock()> from_block;
|
||||||
|
RuleType<std::string()> url;
|
||||||
|
RuleType<std::string()> mustache_like_token;
|
||||||
|
RuleType<std::string()> quoted_string;
|
||||||
|
RuleType<XPathElement()> xpath_assignment;
|
||||||
|
RuleType<std::string()> identifier;
|
||||||
|
RuleType<SourceInfo()> source_info;
|
||||||
|
RuleType<ApplyBlock()> apply_block;
|
||||||
|
RuleType<StructBlock()> struct_block;
|
||||||
|
RuleType<MustacheBlock()> mustache_block;
|
||||||
|
RuleType<std::vector<StructItem>()> assignment_list;
|
||||||
|
};
|
||||||
|
} //namespace duck::sl
|
|
@ -1,119 +0,0 @@
|
||||||
/* Copyright (C) 2015 Michele Santullo
|
|
||||||
*
|
|
||||||
* This file is part of DuckScraper.
|
|
||||||
*
|
|
||||||
* DuckScraper is free software: you can redistribute it and/or modify
|
|
||||||
* it under the terms of the GNU General Public License as published by
|
|
||||||
* the Free Software Foundation, either version 3 of the License, or
|
|
||||||
* (at your option) any later version.
|
|
||||||
*
|
|
||||||
* DuckScraper is distributed in the hope that it will be useful,
|
|
||||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
* GNU General Public License for more details.
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU General Public License
|
|
||||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "scraplang.hpp"
|
|
||||||
#include "scrapast.hpp"
|
|
||||||
#include "scraplang_visit_xpath.hpp"
|
|
||||||
#include <boost/spirit/include/qi.hpp>
|
|
||||||
#include <boost/spirit/include/phoenix_stl.hpp>
|
|
||||||
#include <boost/spirit/include/phoenix_fusion.hpp>
|
|
||||||
#include <boost/fusion/adapted/struct.hpp>
|
|
||||||
#include <boost/fusion/adapted/std_pair.hpp>
|
|
||||||
#include <utility>
|
|
||||||
|
|
||||||
#include <boost/variant/apply_visitor.hpp>
|
|
||||||
|
|
||||||
namespace qi = boost::spirit::qi;
|
|
||||||
namespace sp = boost::spirit;
|
|
||||||
|
|
||||||
BOOST_FUSION_ADAPT_STRUCT(
|
|
||||||
duck::element_def,
|
|
||||||
(std::string, name)
|
|
||||||
(std::string, xpath)
|
|
||||||
(duck::ElementTypes, type)
|
|
||||||
)
|
|
||||||
BOOST_FUSION_ADAPT_STRUCT(
|
|
||||||
duck::implem::node_list,
|
|
||||||
(std::vector<duck::ScrapNode>, nodes)
|
|
||||||
)
|
|
||||||
|
|
||||||
namespace duck {
|
|
||||||
namespace {
|
|
||||||
struct ElementTypeSymbol : qi::symbols<char, ElementTypes> {
|
|
||||||
ElementTypeSymbol() {
|
|
||||||
add
|
|
||||||
("string", ElementType_String)
|
|
||||||
("integer", ElementType_Integer)
|
|
||||||
("boolean", ElementType_Boolean)
|
|
||||||
("null", ElementType_Null)
|
|
||||||
("double", ElementType_Double)
|
|
||||||
;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
template <typename I>
|
|
||||||
struct ScrapGrammar : qi::grammar<I, ScrapNode(), sp::ascii::space_type> {
|
|
||||||
ScrapGrammar() : ScrapGrammar::base_type(start) {
|
|
||||||
using qi::lit;
|
|
||||||
using qi::char_;
|
|
||||||
using qi::lexeme;
|
|
||||||
using qi::double_;
|
|
||||||
using qi::int_;
|
|
||||||
using qi::eps;
|
|
||||||
|
|
||||||
start = whole;
|
|
||||||
whole = eps >> *xpath_definition >> -map;
|
|
||||||
xpath_definition = identifier >> lit('=') >> string >> "as" >> data_type;
|
|
||||||
identifier = (char_('a', 'z') | char_('A', 'Z') | '_') >> *(char_('a', 'z') | char_('A', 'Z') | '_' | char_('0', '9'));
|
|
||||||
string %= lexeme['"' >> +(char_ - '"') >> '"'];
|
|
||||||
map = lit('{') >> ((identifier >> lit('=') >> value) % lit(',')) >> lit('}');
|
|
||||||
array = lit('[') >> *(value % lit(',')) >> lit(']');
|
|
||||||
value = string | double_ | int_ | array | map | identifier;
|
|
||||||
}
|
|
||||||
|
|
||||||
qi::rule<I, ScrapNode(), sp::ascii::space_type> start;
|
|
||||||
qi::rule<I, implem::node_list(), sp::ascii::space_type> whole;
|
|
||||||
qi::rule<I, element_def(), sp::ascii::space_type> xpath_definition;
|
|
||||||
qi::rule<I, std::string(), sp::ascii::space_type> identifier;
|
|
||||||
qi::rule<I, std::string(), sp::ascii::space_type> string;
|
|
||||||
qi::rule<I, implem::map(), sp::ascii::space_type> map;
|
|
||||||
qi::rule<I, implem::array(), sp::ascii::space_type> array;
|
|
||||||
qi::rule<I, implem::element(), sp::ascii::space_type> value;
|
|
||||||
ElementTypeSymbol data_type;
|
|
||||||
};
|
|
||||||
} //unnamed namespace
|
|
||||||
|
|
||||||
ScrapNodePtr parse_scraplang (const std::string& parData) {
|
|
||||||
ScrapGrammar<std::string::const_iterator> gramm;
|
|
||||||
ScrapNodePtr retval(new ScrapNode);
|
|
||||||
auto it_start = parData.cbegin();
|
|
||||||
|
|
||||||
qi::phrase_parse(it_start, parData.cend(), gramm, sp::ascii::space, *retval);
|
|
||||||
return std::move(retval);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<element_def> get_xpath_definitions (const ScrapNode& parAST) {
|
|
||||||
std::vector<element_def> retval;
|
|
||||||
implem::XPathVisitor xpath_vis(&retval);
|
|
||||||
boost::apply_visitor(xpath_vis, parAST);
|
|
||||||
return std::move(retval);
|
|
||||||
}
|
|
||||||
|
|
||||||
ScrapNodePtr::ScrapNodePtr (ScrapNode* parPtr) :
|
|
||||||
m_ptr(parPtr)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
ScrapNodePtr::ScrapNodePtr (ScrapNodePtr&& parOther) :
|
|
||||||
m_ptr(std::move(parOther.m_ptr))
|
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
ScrapNodePtr::~ScrapNodePtr() noexcept {
|
|
||||||
}
|
|
||||||
} //namespace duck
|
|
0
src/scraplang/scraplang_print_results.cpp
Normal file
0
src/scraplang/scraplang_print_results.cpp
Normal file
29
src/scraplang/scraplang_print_results.hpp
Normal file
29
src/scraplang/scraplang_print_results.hpp
Normal file
|
@ -0,0 +1,29 @@
|
||||||
|
#ifndef idB20734D678524FAA8AC94F2AB2FDAA94
|
||||||
|
#define idB20734D678524FAA8AC94F2AB2FDAA94
|
||||||
|
|
||||||
|
#include "scrapast.hpp"
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace duck {
|
||||||
|
typedef std::vector<std::vector<std::pair<std::string, std::string>>> ResulList;
|
||||||
|
struct element_def;
|
||||||
|
|
||||||
|
namespace implem {
|
||||||
|
class ResultPrinter {
|
||||||
|
public:
|
||||||
|
typedef void result_type;
|
||||||
|
|
||||||
|
explicit ResultPrinter ( const std::vector<element_def>* parQueries, const ResultList* parResults );
|
||||||
|
|
||||||
|
void operator() ( const element_def& parElem );
|
||||||
|
void operator() ( const implem::map& parMap );
|
||||||
|
void operator() ( const node_list& parNodes );
|
||||||
|
|
||||||
|
private:
|
||||||
|
const std::vector<element_def>* const m_queries;
|
||||||
|
const ResulList* const m_results;
|
||||||
|
};
|
||||||
|
} //namespace implem
|
||||||
|
} //namespace duck
|
||||||
|
|
||||||
|
#endif
|
67
src/scraplang/stream_scrap_node.hpp
Normal file
67
src/scraplang/stream_scrap_node.hpp
Normal file
|
@ -0,0 +1,67 @@
|
||||||
|
/* Copyright (C) 2015 Michele Santullo
|
||||||
|
*
|
||||||
|
* This file is part of DuckScraper.
|
||||||
|
*
|
||||||
|
* DuckScraper is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* DuckScraper is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef idDB3415BA82504C00A2DAF0274BA9AC92
|
||||||
|
#define idDB3415BA82504C00A2DAF0274BA9AC92
|
||||||
|
|
||||||
|
#include "scrap_node.hpp"
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
namespace duck { namespace sl {
|
||||||
|
std::ostream& operator<< (std::ostream& stream, XPathElement xpath) {
|
||||||
|
stream << "XPathElement \"" << xpath.name << "\": \"" <<
|
||||||
|
xpath.xpath << "\" ";
|
||||||
|
|
||||||
|
if (xpath.def_val)
|
||||||
|
stream << "default: \"" << *xpath.def_val << '"';
|
||||||
|
else
|
||||||
|
stream << "no default";
|
||||||
|
return stream;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::ostream& operator<< (std::ostream& parStream, const duck::sl::SourceInfo& parInfo) {
|
||||||
|
if (duck::sl::SourceInfo::URL == parInfo.type)
|
||||||
|
parStream << '"' << parInfo.value << '"';
|
||||||
|
else if (duck::sl::SourceInfo::Token == parInfo.type)
|
||||||
|
parStream << '$' << parInfo.value;
|
||||||
|
else
|
||||||
|
parStream << "Invalid SourceInfo type";
|
||||||
|
return parStream;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::ostream& operator<< (std::ostream& stream, const FromBlock& blk) {
|
||||||
|
stream << "FromBlock: " << blk.source << ", " <<
|
||||||
|
blk.xpaths.size() << " xpath entries";
|
||||||
|
return stream;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::ostream& operator<< (std::ostream& stream, const StructBlock& strct) {
|
||||||
|
stream << "StructBlock \"" << strct.name << "\" with " <<
|
||||||
|
strct.xpaths.size() << " xpath entries";
|
||||||
|
return stream;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::ostream& operator<< (std::ostream& stream, const ApplyBlock& app) {
|
||||||
|
stream << "ApplyBlock for \"" << app.mustache_model << "\": " <<
|
||||||
|
app.source << ", " <<
|
||||||
|
app.xpaths.size() << " elements";
|
||||||
|
return stream;
|
||||||
|
}
|
||||||
|
}} //namespace duck::sl
|
||||||
|
|
||||||
|
#endif
|
89
src/scraplang/xpath_runner.cpp
Normal file
89
src/scraplang/xpath_runner.cpp
Normal file
|
@ -0,0 +1,89 @@
|
||||||
|
/* Copyright (C) 2015 Michele Santullo
|
||||||
|
*
|
||||||
|
* This file is part of DuckScraper.
|
||||||
|
*
|
||||||
|
* DuckScraper is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* DuckScraper is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "xpath_runner.hpp"
|
||||||
|
#include "xpath.hpp"
|
||||||
|
#include <cassert>
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
//#define HTML_ALWAYS_STDIN
|
||||||
|
|
||||||
|
#if !defined(NDEBUG) && defined(HTML_ALWAYS_STDIN)
|
||||||
|
# define HTML_ALWAYS_STDIN_ENABLED
|
||||||
|
#endif
|
||||||
|
|
||||||
|
namespace duck { namespace sl {
|
||||||
|
struct XPathRunner::XPathKey {
	// Copies both views into owned strings; the source must not be empty
	// (checked in debug builds only).
	XPathKey (const std::string_view& parSrc, const std::string_view& parQuery) :
		source_address(parSrc),
		xpath_query(parQuery)
	{
		assert(not source_address.empty());
	}

	std::string source_address;
	std::string xpath_query;

	// Lexicographic ordering: by query first, then by source address.
	bool operator< (const XPathKey& parOther) const {
		if (xpath_query != parOther.xpath_query)
			return xpath_query < parOther.xpath_query;
		return source_address < parOther.source_address;
	}
};
|
||||||
|
|
||||||
|
// Stores the html pool, the xpath engine handle and the default namespace;
// the result cache starts out empty (default-constructed).
XPathRunner::XPathRunner (HtmlPoolBaseSP html_pool, XPathPtr& parXPath, std::string&& parDefNamespace) :
	m_def_namespace(std::move(parDefNamespace)),
	m_pool(html_pool),
	m_xpath(parXPath)
{
}

// Defined in this file, where XPathKey (the cache's key type) is complete.
XPathRunner::~XPathRunner() = default;
|
||||||
|
|
||||||
|
const std::vector<std::string>& XPathRunner::query (
|
||||||
|
std::string_view parSrc,
|
||||||
|
std::string_view parQuery
|
||||||
|
) {
|
||||||
|
std::cout << "XPathRunner::query(\"" << parSrc << "\", \"" << parQuery << "\")\"\n";
|
||||||
|
auto ins_retval = m_cached_results.insert(std::make_pair(XPathKey(parSrc, parQuery), std::vector<std::string>()));
|
||||||
|
const bool inserted = ins_retval.second;
|
||||||
|
assert(ins_retval.first != m_cached_results.end());
|
||||||
|
std::vector<std::string>& curr_vec = ins_retval.first->second;
|
||||||
|
|
||||||
|
if (inserted) {
|
||||||
|
#if defined(HTML_ALWAYS_STDIN_ENABLED)
|
||||||
|
const auto id = m_pool->GetOrAdd("-");
|
||||||
|
#else
|
||||||
|
const auto id = m_pool->GetOrAdd(parSrc);
|
||||||
|
#endif
|
||||||
|
const std::string* html = m_pool->GetByID(id);
|
||||||
|
|
||||||
|
curr_vec = m_xpath->run_query(*html, std::string(parQuery), m_def_namespace);
|
||||||
|
std::cout << "First time for this query, result cached now\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout << "returning " << curr_vec.size() << " items: ";
|
||||||
|
for (auto& i : curr_vec) {
|
||||||
|
std::cout << '"' << i << "\", ";
|
||||||
|
}
|
||||||
|
std::cout << '\n';
|
||||||
|
return curr_vec;
|
||||||
|
}
|
||||||
|
}} //namespace duck::sl
|
49
src/scraplang/xpath_runner.hpp
Normal file
49
src/scraplang/xpath_runner.hpp
Normal file
|
@ -0,0 +1,49 @@
|
||||||
|
/* Copyright (C) 2015 Michele Santullo
|
||||||
|
*
|
||||||
|
* This file is part of DuckScraper.
|
||||||
|
*
|
||||||
|
* DuckScraper is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* DuckScraper is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef id46DB8F4F85E2417E9AF0B1A410240D4F
|
||||||
|
#define id46DB8F4F85E2417E9AF0B1A410240D4F
|
||||||
|
|
||||||
|
#include "html_pool_base.hpp"
|
||||||
|
#include "xpath_fwd.hpp"
|
||||||
|
#include <map>
|
||||||
|
#include <string_view>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
namespace duck { namespace sl {
|
||||||
|
class XPathRunner {
|
||||||
|
public:
|
||||||
|
XPathRunner (HtmlPoolBaseSP html_pool, XPathPtr& parXPath, std::string&& parDefNamespace);
|
||||||
|
~XPathRunner();
|
||||||
|
|
||||||
|
const std::vector<std::string>& query (
|
||||||
|
std::string_view parSrc,
|
||||||
|
std::string_view parQuery
|
||||||
|
);
|
||||||
|
|
||||||
|
private:
|
||||||
|
struct XPathKey;
|
||||||
|
|
||||||
|
std::map<XPathKey, std::vector<std::string>> m_cached_results;
|
||||||
|
std::string m_def_namespace;
|
||||||
|
HtmlPoolBaseSP m_pool;
|
||||||
|
XPathPtr m_xpath;
|
||||||
|
};
|
||||||
|
}} //namespace duck::sl
|
||||||
|
|
||||||
|
#endif
|
|
@ -17,7 +17,11 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "xpath.hpp"
|
#include "xpath.hpp"
|
||||||
#include <pugixml.hpp>
|
#include <xercesc/framework/MemBufInputSource.hpp>
|
||||||
|
#include <xercesc/util/XMLString.hpp>
|
||||||
|
#include <xqilla/exceptions/XQException.hpp>
|
||||||
|
#include <xqilla/exceptions/XMLParseException.hpp>
|
||||||
|
#include <xqilla/context/ContextHelpers.hpp>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
@ -45,35 +49,64 @@ namespace duck {
|
||||||
}
|
}
|
||||||
} //unnamed namespace
|
} //unnamed namespace
|
||||||
|
|
||||||
XPathBatchResults xpath_query (const std::string& parXML, const std::vector<std::string>& parQueries) {
|
XPath::XPath() = default;
|
||||||
pugi::xml_document doc;
|
|
||||||
std::istringstream iss(parXML);
|
XPath::~XPath() = default;
|
||||||
pugi::xml_parse_result result(doc.load(iss));
|
|
||||||
if (not result) {
|
auto XPath::run_query (const std::string& parXML, const std::vector<std::string>& parQueries, const std::string& parDefNamespace) -> BatchResults {
|
||||||
auto line_col = line_col_from_offset(result.offset, parXML);
|
XQilla& xqilla = m_xqilla;
|
||||||
throw ParseError(line_col.first, line_col.second, result.description());
|
XercesConfiguration xconfig;
|
||||||
|
xercesc::MemBufInputSource input_buf(reinterpret_cast<const XMLByte*>(parXML.c_str()), parXML.size(), "n/a", false);
|
||||||
|
BatchResults retval;
|
||||||
|
try {
|
||||||
|
AutoDelete<DynamicContext> context(xqilla.createContext(XQilla::XQUERY3, &xconfig));
|
||||||
|
xconfig.populateStaticContext(context);
|
||||||
|
Node::Ptr ptr = context->parseDocument(input_buf);
|
||||||
|
context->setContextItem(ptr);
|
||||||
|
//see http://xqilla.sourceforge.net/docs/simple-api/classStaticContext.html#adc869a84712459fa49db67fe837c9b01
|
||||||
|
AutoDeleteArray<XMLCh> ns_wide = xercesc::XMLString::transcode(parDefNamespace.c_str());
|
||||||
|
context->setDefaultElementAndTypeNS(ns_wide);
|
||||||
|
|
||||||
|
for (const auto& xpath : parQueries) {
|
||||||
|
AutoDelete<XQQuery> query(nullptr);
|
||||||
|
{
|
||||||
|
AutoContextInfoReset resetter(context);
|
||||||
|
AutoDeleteArray<XMLCh> xpath_wide = xercesc::XMLString::transcode(xpath.c_str());
|
||||||
|
query.set(xqilla.parse(xpath_wide, context, nullptr, XQilla::NO_ADOPT_CONTEXT));
|
||||||
}
|
}
|
||||||
|
|
||||||
XPathBatchResults retval;
|
Result result = query->execute(context);
|
||||||
for (const auto& xpath : parQueries) {
|
Item::Ptr item;
|
||||||
pugi::xpath_node_set xpathRes = doc.select_nodes(xpath.c_str());
|
|
||||||
std::vector<std::pair<std::string, std::string>> new_lst;
|
std::vector<std::pair<std::string, std::string>> new_lst;
|
||||||
for (pugi::xpath_node_set::const_iterator itFind(xpathRes.begin()), itFindEND(xpathRes.end()); itFind != itFindEND; ++itFind) {
|
while(nullptr != (item = result->next(context))) {
|
||||||
const pugi::xpath_node& node = *itFind;
|
new_lst.push_back(std::make_pair(std::string(), UTF8(item->asString(context))));
|
||||||
std::pair<std::string, std::string> new_itm;
|
|
||||||
if (node.node()) {
|
|
||||||
new_itm.first = std::string(node.node().name());
|
|
||||||
new_itm.second = std::string(node.node().value());
|
|
||||||
}
|
|
||||||
else if (node.attribute()) {
|
|
||||||
new_itm.first = std::string(node.attribute().name());
|
|
||||||
new_itm.second = std::string(node.attribute().value());
|
|
||||||
}
|
|
||||||
new_lst.push_back(std::move(new_itm));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
retval.push_back(std::move(new_lst));
|
retval.push_back(std::move(new_lst));
|
||||||
}
|
}
|
||||||
return std::move(retval);
|
}
|
||||||
|
catch (const XMLParseException& err) {
|
||||||
|
throw ParseError(err.getXQueryLine(), err.getXQueryColumn(), xercesc::XMLString::transcode(err.getError()));
|
||||||
|
}
|
||||||
|
catch (const XQException& err) {
|
||||||
|
throw std::runtime_error(xercesc::XMLString::transcode(err.getError()));
|
||||||
|
}
|
||||||
|
|
||||||
|
return retval;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::string> XPath::run_query (const std::string& parXML, const std::string& parQuery, const std::string& parDefNamespace) {
|
||||||
|
auto query_res = run_query(parXML, std::vector<std::string>{parQuery}, parDefNamespace);
|
||||||
|
if (query_res.empty() or query_res.front().empty()) {
|
||||||
|
return std::vector<std::string>();
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
std::vector<std::string> retval;
|
||||||
|
const std::vector<std::pair<std::string, std::string>>& src = query_res.front();
|
||||||
|
retval.reserve(src.size());
|
||||||
|
std::transform(src.begin(), src.end(), std::back_inserter(retval), [](const auto& pair) { return pair.second; });
|
||||||
|
return retval;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ParseError::ParseError (int parLine, int parColumn, std::string parMessage) {
|
ParseError::ParseError (int parLine, int parColumn, std::string parMessage) {
|
||||||
|
|
|
@ -19,14 +19,14 @@
|
||||||
#ifndef id21E0A6F345D24C5D83D3B1F74EC810F7
|
#ifndef id21E0A6F345D24C5D83D3B1F74EC810F7
|
||||||
#define id21E0A6F345D24C5D83D3B1F74EC810F7
|
#define id21E0A6F345D24C5D83D3B1F74EC810F7
|
||||||
|
|
||||||
|
#include "xpath_fwd.hpp"
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <exception>
|
#include <exception>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
|
#include <xqilla/xqilla-simple.hpp>
|
||||||
|
|
||||||
namespace duck {
|
namespace duck {
|
||||||
typedef std::vector<std::vector<std::pair<std::string, std::string>>> XPathBatchResults;
|
|
||||||
|
|
||||||
class ParseError : public std::exception {
|
class ParseError : public std::exception {
|
||||||
public:
|
public:
|
||||||
ParseError ( int parLine, int parColumn, std::string parMessage );
|
ParseError ( int parLine, int parColumn, std::string parMessage );
|
||||||
|
@ -35,7 +35,19 @@ namespace duck {
|
||||||
std::vector<char> m_msg;
|
std::vector<char> m_msg;
|
||||||
};
|
};
|
||||||
|
|
||||||
XPathBatchResults xpath_query ( const std::string& parXML, const std::vector<std::string>& parQueries );
|
class XPath : public Kakoune::SafeCountable {
|
||||||
|
public:
|
||||||
|
typedef std::vector<std::vector<std::pair<std::string, std::string>>> BatchResults;
|
||||||
|
|
||||||
|
XPath();
|
||||||
|
~XPath();
|
||||||
|
|
||||||
|
BatchResults run_query ( const std::string& parXML, const std::vector<std::string>& parQueries, const std::string& parDefNamespace );
|
||||||
|
std::vector<std::string> run_query ( const std::string& parXML, const std::string& parQuery, const std::string& parDefNamespace );
|
||||||
|
|
||||||
|
private:
|
||||||
|
XQilla m_xqilla;
|
||||||
|
};
|
||||||
} //namespace duck
|
} //namespace duck
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
29
src/xpath_fwd.hpp
Normal file
29
src/xpath_fwd.hpp
Normal file
|
@ -0,0 +1,29 @@
|
||||||
|
/* Copyright (C) 2015-2020 Michele Santullo
|
||||||
|
*
|
||||||
|
* This file is part of DuckScraper.
|
||||||
|
*
|
||||||
|
* DuckScraper is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* DuckScraper is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef id08062CD6C4904D94BFF57990C44B6CCB
|
||||||
|
#define id08062CD6C4904D94BFF57990C44B6CCB
|
||||||
|
|
||||||
|
#include "kakoune/safe_ptr.hh"
|
||||||
|
|
||||||
|
// Forward declaration of duck::XPath and of the pointer alias used to pass
// it around, so that code can name XPathPtr without including xpath.hpp.
namespace duck {
|
||||||
|
class XPath;
|
||||||
|
// Kakoune safe pointer to an XPath object
using XPathPtr = Kakoune::SafePtr<XPath>;
|
||||||
|
} //namespace duck
|
||||||
|
|
||||||
|
#endif
|
Loading…
Add table
Reference in a new issue