Compare commits
41 commits
Author | SHA1 | Date | |
---|---|---|---|
fa08abd00d | |||
d64a4af105 | |||
9cd2608406 | |||
5a9e4e09a4 | |||
329ccef6ef | |||
4958a83ddb | |||
830ab42c49 | |||
32f87e5185 | |||
b79d758e8e | |||
b536026f58 | |||
55eb7c1fc0 | |||
bdb858de5a | |||
6e35c880a4 | |||
33866b3d6b | |||
54ac44b81d | |||
3dcbd48067 | |||
5de2dfbe70 | |||
d97cf03a34 | |||
7170347969 | |||
60d6c2cb61 | |||
430886085c | |||
9dba8043f1 | |||
494364c22e | |||
5d2c5863a5 | |||
b028e8c492 | |||
a6916f6179 | |||
1d750ad2f9 | |||
76f403b3ce | |||
84a599e771 | |||
79ac7534f2 | |||
a9ff092401 | |||
8d2c9f9013 | |||
b39621ea51 | |||
6dffe9b848 | |||
41bb315b02 | |||
2fd4daf52c | |||
26b912d66c | |||
3572803f66 | |||
fcb25ed456 | |||
29f8fe299e | |||
f0e7a1d136 |
48 changed files with 2997 additions and 346 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -1 +1,3 @@
|
|||
build/
|
||||
tags
|
||||
compile_commands.json
|
||||
|
|
3
.gitmodules
vendored
3
.gitmodules
vendored
|
@ -4,3 +4,6 @@
|
|||
[submodule "lib/tidy"]
|
||||
path = lib/tidy
|
||||
url = https://github.com/htacg/tidy-html5.git
|
||||
[submodule "lib/mstch"]
|
||||
path = lib/mstch
|
||||
url = https://github.com/KingDuckZ/mstch.git
|
||||
|
|
|
@ -5,11 +5,15 @@ project(duckscraper VERSION 0.2.1 LANGUAGES CXX)
|
|||
option(BUILD_SHARED_TIDY "Wheter you want to build tidy-html5 as a shared library" OFF)
|
||||
|
||||
include(GetGitRevisionDescription)
|
||||
find_package(PugiXML REQUIRED)
|
||||
find_package(Boost 1.32.0 COMPONENTS program_options)
|
||||
find_package(XQilla 2.3.3 REQUIRED)
|
||||
find_package(Iconv REQUIRED)
|
||||
|
||||
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -std=c++11 -Wall -Wextra -g -O0 -fno-omit-frame-pointer")
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -std=c++11 -Wall -Wextra -g -O3 -fomit-frame-pointer")
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
|
||||
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wall -Wextra -g -O0 -fno-omit-frame-pointer")
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Wall -Wextra -g -O3 -fomit-frame-pointer")
|
||||
|
||||
set(DEFAULT_USER_AGENT "DuckScraper")
|
||||
set(PROJECT_VERSION_BETA "1")
|
||||
|
@ -20,23 +24,30 @@ configure_file(
|
|||
"${PROJECT_BINARY_DIR}/${PROJECT_NAME}Config.h"
|
||||
)
|
||||
|
||||
include_directories(SYSTEM
|
||||
lib/tidy/include
|
||||
${PUGIXML_INCLUDE_DIR}
|
||||
lib/curlcpp/include
|
||||
${Boost_INCLUDE_DIRS}
|
||||
)
|
||||
include_directories(
|
||||
src/
|
||||
"${PROJECT_BINARY_DIR}"
|
||||
)
|
||||
|
||||
add_executable(${PROJECT_NAME}
|
||||
src/main.cpp
|
||||
src/html_pool.cpp
|
||||
src/htmlretrieve.cpp
|
||||
src/commandline.cpp
|
||||
src/scraplang/scraplang.cpp
|
||||
src/scraplang/parse_exports.cpp
|
||||
src/scraplang/parse.cpp
|
||||
src/scraplang/apply.cpp
|
||||
src/scraplang/xpath_runner.cpp
|
||||
src/xpath.cpp
|
||||
src/read_all.cpp
|
||||
src/iconv_wrapper.cpp
|
||||
)
|
||||
|
||||
target_include_directories(${PROJECT_NAME} SYSTEM
|
||||
PRIVATE lib/tidy/include
|
||||
PRIVATE ${PUGIXML_INCLUDE_DIR}
|
||||
PRIVATE lib/curlcpp/include
|
||||
PRIVATE ${Boost_INCLUDE_DIRS}
|
||||
PRIVATE lib/mstch/include
|
||||
)
|
||||
target_include_directories(${PROJECT_NAME}
|
||||
PRIVATE src/
|
||||
PRIVATE "${PROJECT_BINARY_DIR}"
|
||||
)
|
||||
|
||||
if (BUILD_SHARED_TIDY)
|
||||
|
@ -46,10 +57,16 @@ else(BUILD_SHARED_TIDY)
|
|||
endif(BUILD_SHARED_TIDY)
|
||||
|
||||
target_link_libraries(${PROJECT_NAME}
|
||||
${TIDY_LIB}
|
||||
${PUGIXML_LIBRARIES}
|
||||
curlcpp
|
||||
${Boost_LIBRARIES}
|
||||
PRIVATE ${TIDY_LIB}
|
||||
PRIVATE ${PUGIXML_LIBRARIES}
|
||||
PRIVATE curlcpp
|
||||
PRIVATE ${Boost_LIBRARIES}
|
||||
PRIVATE mstch
|
||||
PRIVATE XQilla::XQilla
|
||||
)
|
||||
|
||||
target_compile_definitions(${PROJECT_NAME}
|
||||
PRIVATE $<$<CONFIG:DEBUG>:KAK_DEBUG>
|
||||
)
|
||||
|
||||
#unset those variables so cmake files from dependencies won't complain about
|
||||
|
@ -62,3 +79,4 @@ unset(PROJECT_VERSION)
|
|||
set(BUILD_SHARED_LIB ${BUILD_SHARED_TIDY}) #for tidy
|
||||
add_subdirectory(lib/tidy)
|
||||
add_subdirectory(lib/curlcpp)
|
||||
add_subdirectory(lib/mstch)
|
||||
|
|
28
cmake/Modules/FindXQilla.cmake
Normal file
28
cmake/Modules/FindXQilla.cmake
Normal file
|
@ -0,0 +1,28 @@
|
|||
# Find the XQilla library
# originally taken from
# https://github.com/rug-compling/alpinocorpus/blob/master/cmake/FindXQilla.cmake
#
# Result variables:
#   XQilla_FOUND       - true when both the header and the library were located
#                        (XQILLA_FOUND is set as well for older callers)
#   XQILLA_INCLUDE_DIR - directory that contains xqilla/xqilla-simple.hpp
#   XQILLA_LIBRARY     - full path to the xqilla library
#   XQILLA_LIBRARIES   - same value as XQILLA_LIBRARY
#
# Imported target:
#   XQilla::XQilla     - carries the include directory, the library location
#                        and a link dependency on XercesC::XercesC
#
# NOTE(review): nothing in this module extracts the installed XQilla version,
# so a version passed to find_package(XQilla <version>) is not verified.

find_path(XQILLA_INCLUDE_DIR
  NAMES xqilla/xqilla-simple.hpp
)
find_library(XQILLA_LIBRARY
  NAMES xqilla
)

include(FindPackageHandleStandardArgs)
# The first argument must match the name used in find_package(XQilla ...):
# that is how the REQUIRED and QUIET flags of the caller reach this helper.
find_package_handle_standard_args(XQilla
  REQUIRED_VARS
    XQILLA_LIBRARY
    XQILLA_INCLUDE_DIR
)

set(XQILLA_LIBRARIES ${XQILLA_LIBRARY})
mark_as_advanced(XQILLA_INCLUDE_DIR XQILLA_LIBRARY)

if(XQilla_FOUND)
  # The imported target links against Xerces-C, so it has to be available.
  find_package(XercesC REQUIRED)

  if(NOT TARGET XQilla::XQilla)
    add_library(XQilla::XQilla UNKNOWN IMPORTED)
    set_target_properties(XQilla::XQilla PROPERTIES
      INTERFACE_INCLUDE_DIRECTORIES "${XQILLA_INCLUDE_DIR}"
      IMPORTED_LOCATION "${XQILLA_LIBRARY}"
      INTERFACE_LINK_LIBRARIES XercesC::XercesC
    )
  endif()
endif()
|
|
@ -1 +1 @@
|
|||
Subproject commit 194fdb0ced92a993a60cd0810610845a12023e82
|
||||
Subproject commit 0c2f06df81c6cb24fad11fd12d69a2dd19360285
|
1
lib/mstch
Submodule
1
lib/mstch
Submodule
|
@ -0,0 +1 @@
|
|||
Subproject commit 45122d1d515c90a54d509d4b2d8d9279348518f5
|
2
lib/tidy
2
lib/tidy
|
@ -1 +1 @@
|
|||
Subproject commit 67192ba77e539539d15cc716303ac686bacddd61
|
||||
Subproject commit d1b906991a7587688d384b648c55731f9be52506
|
94
map_form.txt
Normal file
94
map_form.txt
Normal file
|
@ -0,0 +1,94 @@
|
|||
```
|
||||
apply {{mustache_name}} to {{pages}}
|
||||
A = /html/head/text()
|
||||
struct B
|
||||
C default("n/a") = //table[@class="wikitable sortable"]/tr/td[4]/a/text()
|
||||
D default("0") = //table[@class="wikitable sortable"]/tr/td[3]/text()
|
||||
struct E
|
||||
F = /html/head/inner_names/text()
|
||||
G = /html/head/inner_probabilities/text()
|
||||
end
|
||||
H = /html/head/inner_names/text()
|
||||
end
|
||||
I = /html/head/inner_names/text()
|
||||
end
|
||||
|
||||
==mustache_name
|
||||
blah
|
||||
==end
|
||||
```
|
||||
|
||||
|
||||
The above should result in the following:
|
||||
|
||||
```
|
||||
A[]
|
||||
B[] --- C
|
||||
--- D
|
||||
--- E[] --- F
|
||||
--- G
|
||||
--- H
|
||||
--- I[]
|
||||
```
|
||||
|
||||
For example, given these query results:
|
||||
|
||||
```
|
||||
A[] = {a1, a2, a3}
|
||||
C = c1
|
||||
D[] = {d1, d2}
|
||||
F[] = {f1, f2, f3}
|
||||
G = g1
|
||||
H = h1
|
||||
I = i1
|
||||
```
|
||||
|
||||
then the complete result in tree form shall be:
|
||||
|
||||
```
|
||||
{
|
||||
A => [a1, a2, a3],
|
||||
B => [
|
||||
{
|
||||
C => c1,
|
||||
D => d1,
|
||||
E => [
|
||||
{
|
||||
F => f1,
|
||||
G => g1
|
||||
}, {
|
||||
F => f2,
|
||||
G => ""
|
||||
}, {
|
||||
F => f3,
|
||||
G => ""
|
||||
}
|
||||
],
|
||||
H => h1
|
||||
}, {
|
||||
C => "",
|
||||
D => d2,
|
||||
E => [
|
||||
{
|
||||
F => f1,
|
||||
G => g1
|
||||
}, {
|
||||
F => f2,
|
||||
G => ""
|
||||
}, {
|
||||
F => f3,
|
||||
G => ""
|
||||
}
|
||||
],
|
||||
H => ""
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
Please note that:
|
||||
|
||||
* arrays inside a struct turn the struct itself into an array, while its items become just single item values
|
||||
* if a struct contains no arrays, then the struct shall not become an array - that is, a struct generates an array with as many elements as the largest array it contains, and stays a single struct when it contains none
|
||||
* there are as many of any one struct as the size of the largest array inside it
|
||||
* nested structs get duplicated in every outer struct they are part of; in the example above E has as many elements as there are items in F (3, the largest between F and G), and the whole array of E is duplicated in every element of B
|
17
sample.scrap
Normal file
17
sample.scrap
Normal file
|
@ -0,0 +1,17 @@
|
|||
from http://sid-story.wikia.com/wiki/Album
|
||||
pages = //section/header/h2/a/@href
|
||||
end
|
||||
|
||||
apply {{test_mustache}} to {{pages}}
|
||||
struct paragraphs
|
||||
paragraph = //section/header/h2/a/text()
|
||||
end
|
||||
end
|
||||
|
||||
==test_mustache
|
||||
Paragraphs: {{#paragraphs}}
|
||||
- {{paragraph}}
|
||||
{{/paragraphs}}
|
||||
|
||||
kthx bye!
|
||||
==end
|
|
@ -47,11 +47,13 @@ namespace duck {
|
|||
("help,h", "Produces this help message")
|
||||
("version", "Prints the program's version and quits")
|
||||
("dump,d", po::value<std::string>(), "Cleans the retrieved html and saves it to the named file; use - for stdout")
|
||||
("dump-raw,D", po::value<std::string>(), "Saves the retrieved html to the named file; use - for stdout")
|
||||
;
|
||||
po::options_description query_options("Query options");
|
||||
query_options.add_options()
|
||||
("agent", po::value<std::string>()->default_value(DEFAULT_USER_AGENT), "User agent that will be passed to the server")
|
||||
("model,m", po::value<std::string>(), "Read XPath expressions from the specified file instead of command line")
|
||||
("from-code,f", po::value<std::string>()->default_value(""), "Force source charset to this, disregard any charset reported by the server")
|
||||
("namespace,n", po::value<std::string>()->default_value("http://www.w3.org/1999/xhtml"), "Default namespace for XPath queries (try empty string if you get no results)")
|
||||
;
|
||||
po::options_description positional_options("Positional options");
|
||||
positional_options.add_options()
|
||||
|
@ -86,6 +88,7 @@ namespace duck {
|
|||
std::cout << "redistribute it under certain conditions.\n"; //type `show c' for details.
|
||||
std::cout << '\n';
|
||||
std::cout << "Usage: " << PROGRAM_NAME << " [options...] <url> <xpath>\n";
|
||||
std::cout << " " << PROGRAM_NAME << " [options...] --model <path> <url>\n";
|
||||
std::cout << "You can pass - as the url to read from stdin\n";
|
||||
std::cout << visible;
|
||||
return true;
|
||||
|
@ -96,11 +99,14 @@ namespace duck {
|
|||
return true;
|
||||
}
|
||||
|
||||
if (parVarMap.count("input-url") == 0) {
|
||||
if (parVarMap.count("input-url") == 0 and parVarMap.count("model") == 0) {
|
||||
throw std::invalid_argument("No input URL specified");
|
||||
}
|
||||
if (parVarMap.count("xpath") == 0) {
|
||||
throw std::invalid_argument("No XPath expression specified");
|
||||
if (not (parVarMap.count("xpath") or parVarMap.count("model"))) {
|
||||
throw std::invalid_argument("No XPath expression specified and no input model given");
|
||||
}
|
||||
else if (parVarMap.count("xpath") and parVarMap.count("model")) {
|
||||
throw std::invalid_argument("Received both model and XPath expression, but only one of the two is allowed");
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
|
61
src/html_pool.cpp
Normal file
61
src/html_pool.cpp
Normal file
|
@ -0,0 +1,61 @@
|
|||
/* Copyright (C) 2015 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "html_pool.hpp"
|
||||
#include "htmlretrieve.hpp"
|
||||
#include "read_all.hpp"
|
||||
#include <string>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <iostream>
|
||||
|
||||
namespace duck {
|
||||
HtmlPool::HtmlPool (std::string&& agent_name, std::string&& src_charset) :
|
||||
m_agent(std::move(agent_name)),
|
||||
m_src_charset(std::move(src_charset)),
|
||||
m_iconv(m_src_charset, "utf-8", IconvWrapper::ModeIgnore)
|
||||
{
|
||||
}
|
||||
|
||||
HtmlPool::~HtmlPool() noexcept = default;
|
||||
|
||||
auto HtmlPool::OnResourceLoad (ResourceObjectParameterType parRes) -> ResourceType* {
|
||||
std::cout << "OnResourceLoad(): fetching html from \"" << parRes << "\"\n";
|
||||
|
||||
std::unique_ptr<std::string> utf8_html;
|
||||
if (parRes == "-") {
|
||||
utf8_html = std::make_unique<std::string>(read_all(std::cin));
|
||||
}
|
||||
else {
|
||||
utf8_html = std::make_unique<std::string>(
|
||||
fetch_html(parRes, m_agent, false, false)
|
||||
);
|
||||
}
|
||||
|
||||
*utf8_html = duck::clean_html(m_iconv.conv_char(*utf8_html), "utf-8");
|
||||
return utf8_html.release();
|
||||
}
|
||||
|
||||
void HtmlPool::OnResourceDestroy (ResourceNameParamType, ResourceType* parRes) noexcept {
|
||||
delete parRes;
|
||||
}
|
||||
|
||||
auto HtmlPool::GetResourceNameFromResourceObject (ResourceObjectParameterType parRes) -> ResourceNameType {
|
||||
return parRes;
|
||||
}
|
||||
} //namespace duck
|
46
src/html_pool.hpp
Normal file
46
src/html_pool.hpp
Normal file
|
@ -0,0 +1,46 @@
|
|||
/* Copyright (C) 2015 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef idCDCACC393BE24CBD94A3B5E2985984A3
|
||||
#define idCDCACC393BE24CBD94A3B5E2985984A3
|
||||
|
||||
#include "scraplang/html_pool_base.hpp"
|
||||
#include "iconv_wrapper.hpp"
|
||||
|
||||
namespace duck {
|
||||
class HtmlPool : public ::duck::sl::HtmlPoolBase {
|
||||
typedef ::duck::sl::HtmlPoolBase::ResourceType ResourceType;
|
||||
typedef ::duck::sl::HtmlPoolBase::ResourceNameType ResourceNameType;
|
||||
typedef ::duck::sl::HtmlPoolBase::ResourceObjectParameterType ResourceObjectParameterType;
|
||||
typedef ::duck::sl::HtmlPoolBase::ResourceNameParamType ResourceNameParamType;
|
||||
|
||||
virtual ResourceType* OnResourceLoad (ResourceObjectParameterType parRes);
|
||||
virtual void OnResourceDestroy (ResourceNameParamType parName, ResourceType* parRes) noexcept;
|
||||
virtual ResourceNameType GetResourceNameFromResourceObject (ResourceObjectParameterType parRes);
|
||||
|
||||
std::string m_agent;
|
||||
std::string m_src_charset;
|
||||
IconvWrapper m_iconv;
|
||||
|
||||
public:
|
||||
HtmlPool(std::string&& agent_name, std::string&& src_charset);
|
||||
~HtmlPool() noexcept;
|
||||
};
|
||||
} //namespace duck
|
||||
|
||||
#endif
|
|
@ -1,4 +1,4 @@
|
|||
/* Copyright (C) 2015 Michele Santullo
|
||||
/* Copyright (C) 2015-2020 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
|
@ -28,24 +28,51 @@
|
|||
#include <memory>
|
||||
#include <cassert>
|
||||
#include <utility>
|
||||
#include <cctype>
|
||||
#include <iterator>
|
||||
|
||||
namespace duck {
|
||||
namespace {
|
||||
void dropScriptTags (std::string& html) {
|
||||
size_t open_index = 0;
|
||||
const std::string open_tag("<script");
|
||||
const std::string close_tag("</script>");
|
||||
|
||||
while (html.npos != (open_index = html.find(open_tag, open_index))) {
|
||||
assert(open_index < html.size());
|
||||
auto close_index = html.find(close_tag, open_index + open_tag.size());
|
||||
if (close_index == html.npos)
|
||||
close_index = html.size();
|
||||
html.erase(open_index, std::min(html.size(), close_index + close_tag.size()) - open_index);
|
||||
}
|
||||
std::string make_lowercase (std::string_view in) {
|
||||
std::string out;
|
||||
std::transform(in.begin(), in.end(), std::back_inserter(out), [](unsigned char c){return std::tolower(c);});
|
||||
return out;
|
||||
}
|
||||
|
||||
bool isHttps (const std::string& parUrl) {
|
||||
// Maps a charset name (case-insensitive) to the matching Tidy encoding id.
// Throws std::runtime_error when the name is not recognised.
TidyEncodingOptions charset_to_enum (std::string_view name) {
	const std::string lower_name = make_lowercase(name);

	if (lower_name == "ascii")
		return TidyEncAscii;
	// Tidy calls ISO-8859-15 "latin0" and ISO-8859-1 "latin1"
	else if (lower_name == "iso-8859-15" or lower_name == "latin0")
		return TidyEncLatin0;
	else if (lower_name == "iso-8859-1" or lower_name == "latin1")
		return TidyEncLatin1;
	else if (lower_name == "utf-8")
		return TidyEncUtf8;
#ifndef NO_NATIVE_ISO2022_SUPPORT
	else if (lower_name == "iso-2022-cn")
		return TidyEncIso2022;
#endif
	else if (lower_name == "mac")
		return TidyEncMac;
	else if (lower_name == "windows-1252")
		return TidyEncWin1252;
	else if (lower_name == "ibm858")
		return TidyEncIbm858;
	else if (lower_name == "utf-16le")
		return TidyEncUtf16le;
	else if (lower_name == "utf-16be")
		return TidyEncUtf16be;
	else if (lower_name == "utf-16")
		return TidyEncUtf16;
	// "big5" is the commonly used spelling; "big-5" is kept for callers
	// that already rely on it
	else if (lower_name == "big5" or lower_name == "big-5")
		return TidyEncBig5;
	else if (lower_name == "shift-jis" or lower_name == "shift_jis")
		return TidyEncShiftjis;

	throw std::runtime_error("Failed to recognise \"" + std::string(name) + "\" as a valid source charset");
}
|
||||
|
||||
bool isHttps (const std::string_view& parUrl) {
|
||||
const char protocol[] = "https://";
|
||||
const size_t protocolLen = sizeof(protocol) / sizeof(protocol[0]) - 1;
|
||||
if (parUrl.size() < protocolLen)
|
||||
|
@ -55,8 +82,7 @@ namespace duck {
|
|||
}
|
||||
} //unnamed namespace
|
||||
|
||||
std::string clean_html (std::string&& html) {
|
||||
dropScriptTags(html);
|
||||
std::string clean_html (std::string&& html, OptString src_charset) {
|
||||
|
||||
// Initialize a Tidy document
|
||||
TidyDoc tidyDoc = tidyCreate();
|
||||
|
@ -68,8 +94,21 @@ namespace duck {
|
|||
&& tidyOptSetBool(tidyDoc, TidyQuiet, yes)
|
||||
&& tidyOptSetBool(tidyDoc, TidyNumEntities, yes)
|
||||
&& tidyOptSetBool(tidyDoc, TidyShowWarnings, no)
|
||||
&& tidyOptSetBool(tidyDoc, TidyEscapeScripts, yes)
|
||||
&& tidyOptSetInt(tidyDoc, TidyNewline, TidyLF)
|
||||
&& tidyOptSetBool(tidyDoc,TidyFixUri, yes)
|
||||
&& tidyOptSetBool(tidyDoc, TidyHideComments, yes)
|
||||
&& tidyOptSetBool(tidyDoc, TidyPPrintTabs, yes)
|
||||
&& tidyOptSetBool(tidyDoc, TidyLiteralAttribs, no)
|
||||
&& tidyOptSetBool(tidyDoc, TidyPunctWrap, no)
|
||||
&& tidyOptSetInt(tidyDoc, TidyOutCharEncoding, TidyEncUtf8)
|
||||
&& tidyOptSetBool(tidyDoc, TidyMetaCharset, yes)
|
||||
&& tidyOptSetValue(tidyDoc, TidyDoctype, "omit")
|
||||
&& tidyOptSetBool(tidyDoc, TidyDropPropAttrs, yes);
|
||||
|
||||
if (src_charset)
|
||||
tidyOptSetInt(tidyDoc, TidyInCharEncoding, charset_to_enum(*src_charset));
|
||||
|
||||
int tidyResponseCode = -1;
|
||||
|
||||
// Parse input
|
||||
|
@ -103,14 +142,15 @@ namespace duck {
|
|||
}
|
||||
|
||||
|
||||
std::string fetch_html (const std::string& parSource, std::string parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost) {
|
||||
std::string fetch_html (const std::string_view& parSource, std::string parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost) {
|
||||
using curl::curl_easy;
|
||||
using curl::curl_pair;
|
||||
using curl::curl_ios;
|
||||
|
||||
std::ostringstream oss;
|
||||
curl_writer wr(oss);
|
||||
curl_ios<std::ostream> wr(oss);
|
||||
curl_easy easy(wr);
|
||||
easy.add(curl_pair<CURLoption, std::string>(CURLOPT_URL, parSource));
|
||||
easy.add(curl_pair<CURLoption, std::string>(CURLOPT_URL, std::string(parSource)));
|
||||
if (isHttps(parSource)) {
|
||||
easy.add(curl_pair<CURLoption, bool>(CURLOPT_SSL_VERIFYPEER, parSslVerifyPeer));
|
||||
easy.add(curl_pair<CURLoption, bool>(CURLOPT_SSL_VERIFYHOST, parSslVerifyHost));
|
||||
|
@ -118,6 +158,9 @@ namespace duck {
|
|||
easy.add(curl_pair<CURLoption, std::string>(CURLOPT_USERAGENT, parUserAgent));
|
||||
easy.add(curl_pair<CURLoption, long>(CURLOPT_FOLLOWLOCATION, 1L));
|
||||
|
||||
easy.add(curl_pair<CURLoption, std::string>(CURLOPT_ACCEPT_ENCODING, "gzip"));
|
||||
easy.add(curl_pair<CURLoption, long>(CURLOPT_HTTP_CONTENT_DECODING, 1L));
|
||||
|
||||
//try {
|
||||
easy.perform();
|
||||
//}
|
||||
|
@ -127,6 +170,7 @@ namespace duck {
|
|||
//return 1;
|
||||
//}
|
||||
|
||||
//return FetchedHtml(oss.str(), easy.get_info<CURLINFO_CONTENT_TYPE>().get());
|
||||
return oss.str();
|
||||
}
|
||||
} //namespace duck
|
||||
|
|
|
@ -20,10 +20,14 @@
|
|||
#define idC6776D903059465191FFB64FCFD6B86A
|
||||
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <optional>
|
||||
|
||||
namespace duck {
|
||||
std::string fetch_html ( const std::string& parSource, std::string parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost );
|
||||
std::string clean_html ( std::string&& html );
|
||||
typedef std::optional<std::string_view> OptString;
|
||||
|
||||
std::string fetch_html ( const std::string_view& parSource, std::string parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost );
|
||||
std::string clean_html ( std::string&& html, OptString src_charset );
|
||||
} //namespace duck
|
||||
|
||||
#endif
|
||||
|
|
143
src/iconv_wrapper.cpp
Normal file
143
src/iconv_wrapper.cpp
Normal file
|
@ -0,0 +1,143 @@
|
|||
/* Copyright (C) 2015 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "iconv_wrapper.hpp"
|
||||
#include <iconv.h>
|
||||
#include <cassert>
|
||||
#include <algorithm>
|
||||
|
||||
namespace duck {
|
||||
namespace {
	// Deleter that lets std::unique_ptr own an iconv_t conversion descriptor.
	// The handle is taken by value: std::unique_ptr invokes its deleter with
	// the stored pointer as an rvalue, which cannot bind to a non-const
	// lvalue reference.
	class IconvDeleter {
	public:
		typedef iconv_t pointer;

		void operator() (pointer resource) const {
			// (iconv_t)-1 is the value iconv_open() returns on failure,
			// it must never be passed to iconv_close()
			if (resource and reinterpret_cast<iconv_t>(-1) != resource) {
				iconv_close(resource);
			}
		}
	};
} //unnamed namespace

typedef std::unique_ptr<iconv_t, IconvDeleter> UniqueIconv;
|
||||
|
||||
IconvBadSequence::IconvBadSequence (const std::string& message) :
|
||||
std::domain_error(message)
|
||||
{
|
||||
}
|
||||
|
||||
IconvOpenFailure::IconvOpenFailure (const std::string& message) :
|
||||
std::logic_error(message)
|
||||
{
|
||||
}
|
||||
|
||||
struct IconvWrapper::LocalData {
|
||||
UniqueIconv context;
|
||||
};
|
||||
|
||||
// Opens an iconv context converting from charset `from` to charset `to`.
// `mode` selects the suffix appended to the destination charset name:
// "//IGNORE" for ModeIgnore, "//TRANSLIT" for ModeTransliterate, nothing
// otherwise. Throws IconvOpenFailure when iconv_open() fails.
IconvWrapper::IconvWrapper(const std::string& from, std::string to, Mode mode) :
	m_local(std::make_unique<LocalData>())
{
	switch (mode) {
	case ModeIgnore:
		to += "//IGNORE";
		break;
	case ModeTransliterate:
		to += "//TRANSLIT";
		break;
	case ModeDefault:
	default:
		break;
	}

	const iconv_t handle = iconv_open(to.c_str(), from.c_str());
	// Save errno right away: the string operations that build the error
	// message below are allowed to overwrite it.
	const int open_errno = errno;
	m_local->context.reset(handle);

	if (reinterpret_cast<iconv_t>(-1) == handle) {
		auto msg = std::string("Failed to create an iconv context for \"") +
			from + "\" to \"" + to + "\" conversion (error code " +
			std::to_string(open_errno);
		if (EINVAL == open_errno)
			msg += " EINVAL";
		msg += ")";

		throw IconvOpenFailure(msg);
	}
}

IconvWrapper::~IconvWrapper() noexcept = default;
|
||||
|
||||
void IconvWrapper::conv (
|
||||
const char* buff,
|
||||
std::size_t len,
|
||||
PtrGetterFunc get_ptr,
|
||||
SizeGetterFunc get_size,
|
||||
ReallocFunc realloc,
|
||||
std::size_t grow_hint
|
||||
) {
|
||||
assert(buff);
|
||||
assert(len);
|
||||
|
||||
const constexpr std::size_t def_inc = 16;
|
||||
const constexpr std::size_t iconv_err = static_cast<std::size_t>(-1);
|
||||
|
||||
std::size_t nchars;
|
||||
std::size_t inbytesleft = len;
|
||||
char* inbuff = const_cast<char*>(buff);
|
||||
std::ptrdiff_t out_offset = 0;
|
||||
std::size_t grow_factor = grow_hint;
|
||||
std::size_t outbytesleft;
|
||||
|
||||
do {
|
||||
realloc(std::max(inbytesleft * grow_factor, def_inc) + get_size());
|
||||
assert(get_size() > static_cast<std::size_t>(out_offset));
|
||||
outbytesleft = get_size() - out_offset;
|
||||
char* outbuff = get_ptr() + out_offset;
|
||||
|
||||
const auto old_inbytesleft = inbytesleft;
|
||||
const auto old_outbytesleft = outbytesleft;
|
||||
|
||||
nchars = ::iconv(
|
||||
m_local->context.get(),
|
||||
&inbuff,
|
||||
&inbytesleft,
|
||||
&outbuff,
|
||||
&outbytesleft
|
||||
);
|
||||
if (iconv_err == nchars) {
|
||||
const auto pos_str = std::to_string(len - inbytesleft);
|
||||
switch (errno) {
|
||||
case EILSEQ:
|
||||
throw IconvBadSequence("Invalid input multibyte sequence at byte " + pos_str);
|
||||
case EINVAL:
|
||||
throw IconvBadSequence("Incomplete input multibyte sequence at byte " + pos_str);
|
||||
}
|
||||
}
|
||||
|
||||
out_offset = std::distance(get_ptr(), outbuff);
|
||||
assert(out_offset >= 0);
|
||||
|
||||
const auto in_diff = old_inbytesleft - inbytesleft;
|
||||
const auto out_diff = old_outbytesleft - outbytesleft;
|
||||
grow_factor = std::max<std::size_t>(1, out_diff / in_diff);
|
||||
} while (iconv_err == nchars and E2BIG == errno);
|
||||
|
||||
assert(outbytesleft < get_size());
|
||||
realloc(get_size() - outbytesleft);
|
||||
}
|
||||
} //namespace duck
|
87
src/iconv_wrapper.hpp
Normal file
87
src/iconv_wrapper.hpp
Normal file
|
@ -0,0 +1,87 @@
|
|||
/* Copyright (C) 2015 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <functional>
|
||||
#include <stdexcept>
|
||||
|
||||
namespace duck {
|
||||
// Error type for input that iconv cannot convert.
class IconvBadSequence : public std::domain_error
{
public:
	explicit IconvBadSequence(const std::string& message);
};

// Error type for a conversion context that could not be opened.
class IconvOpenFailure : public std::logic_error
{
public:
	explicit IconvOpenFailure(const std::string& message);
};
|
||||
|
||||
// C++ wrapper around an iconv conversion context. The iconv handle itself
// lives in LocalData, which is only defined in the implementation file.
class IconvWrapper {
	// Callbacks through which the low-level conv() reaches the output
	// buffer owned by its caller.
	using PtrGetterFunc = std::function<char*()>;
	using ReallocFunc = std::function<void(std::size_t)>;
	using SizeGetterFunc = std::function<std::size_t()>;

public:
	enum Mode {
		ModeTransliterate, ModeIgnore, ModeDefault
	};

	IconvWrapper (const std::string& from, std::string to, Mode mode=ModeDefault);
	~IconvWrapper() noexcept;

	// Converts text and returns the result as a string of COut characters.
	template <typename CIn, typename COut>
	std::basic_string<COut> conv (std::basic_string_view<CIn> text);

	// Shorthand for the char to char conversion.
	std::string conv_char (std::string_view text) {
		return conv<char, char>(text);
	}

private:
	struct LocalData;

	// Byte-oriented conversion routine used by the template overload.
	void conv (
		const char* buff,
		std::size_t len,
		PtrGetterFunc get_ptr,
		SizeGetterFunc get_size,
		ReallocFunc realloc,
		std::size_t grow_hint
	);

	std::unique_ptr<LocalData> m_local;
};
|
||||
|
||||
template <typename CIn, typename COut>
|
||||
std::basic_string<COut> IconvWrapper::conv (std::basic_string_view<CIn> text) {
|
||||
typedef std::basic_string<COut> string;
|
||||
|
||||
if (text.empty())
|
||||
return {};
|
||||
|
||||
string retval;
|
||||
this->conv(
|
||||
reinterpret_cast<const char*>(text.data()),
|
||||
text.size() * sizeof(CIn),
|
||||
[&retval](){return reinterpret_cast<char*>(retval.data());},
|
||||
[&retval]()->std::size_t {return retval.size() * sizeof(COut);},
|
||||
[&retval](std::size_t sz){retval.resize((sz + sizeof(COut) - 1) / sizeof(COut));},
|
||||
sizeof(COut) / sizeof(CIn)
|
||||
);
|
||||
return retval;
|
||||
}
|
||||
} //namespace duck
|
115
src/kakoune/ref_ptr.hh
Normal file
115
src/kakoune/ref_ptr.hh
Normal file
|
@ -0,0 +1,115 @@
|
|||
#ifndef ref_ptr_hh_INCLUDED
|
||||
#define ref_ptr_hh_INCLUDED
|
||||
|
||||
#include <utility>
|
||||
|
||||
namespace Kakoune
{

// Base class for objects managed through RefPtr with the default policy:
// it carries the reference counter, and its virtual destructor lets the
// policy delete derived objects through a RefCountable pointer.
struct RefCountable
{
    int refcount = 0;
    virtual ~RefCountable() = default;
};

// Default RefPtr policy: plain (non-atomic) counting, the object is deleted
// when the counter drops to zero. The void* arguments identify the RefPtr
// involved and are not used by this policy.
struct RefCountablePolicy
{
    static void inc_ref(RefCountable* r, void*) noexcept { ++r->refcount; }
    static void dec_ref(RefCountable* r, void*) { if (--r->refcount == 0) delete r; }
    static void ptr_moved(RefCountable*, void*, void*) noexcept {}
};

// Intrusive smart pointer. Policy provides three static functions:
//   inc_ref(ptr, ref_ptr)     - a RefPtr started referencing ptr
//   dec_ref(ptr, ref_ptr)     - a RefPtr stopped referencing ptr
//   ptr_moved(ptr, from, to)  - the reference moved between two RefPtr
template<typename T, typename Policy = RefCountablePolicy>
struct RefPtr
{
    RefPtr() = default;
    explicit RefPtr(T* ptr) : m_ptr(ptr) { acquire(); }
    ~RefPtr() { release(); }
    RefPtr(const RefPtr& other) : m_ptr(other.m_ptr) { acquire(); }
    RefPtr(RefPtr&& other)
        noexcept(noexcept(std::declval<RefPtr>().moved(nullptr)))
        : m_ptr(other.m_ptr) { other.m_ptr = nullptr; moved(&other); }

    RefPtr& operator=(const RefPtr& other)
    {
        if (other.m_ptr != m_ptr)
            assign(other.m_ptr);
        return *this;
    }

    RefPtr& operator=(RefPtr&& other)
    {
        // Moving onto itself must leave the pointer untouched.
        if (this == &other)
            return *this;

        // Take over the new pointee before dropping the old one: dropping
        // the old pointee may destroy `other` when `other` is one of its
        // members.
        T* old = m_ptr;
        m_ptr = other.m_ptr;
        other.m_ptr = nullptr;
        moved(&other);
        if (old)
            Policy::dec_ref(old, this);
        return *this;
    }

    RefPtr& operator=(T* ptr)
    {
        if (ptr != m_ptr)
            assign(ptr);
        return *this;
    }

    [[gnu::always_inline]]
    T* operator->() const { return m_ptr; }
    [[gnu::always_inline]]
    T& operator*() const { return *m_ptr; }

    [[gnu::always_inline]]
    T* get() const { return m_ptr; }

    [[gnu::always_inline]]
    explicit operator bool() const { return m_ptr != nullptr; }

    void reset(T* ptr = nullptr)
    {
        if (ptr == m_ptr)
            return;
        assign(ptr);
    }

    friend bool operator==(const RefPtr& lhs, const RefPtr& rhs) { return lhs.m_ptr == rhs.m_ptr; }
    friend bool operator!=(const RefPtr& lhs, const RefPtr& rhs) { return lhs.m_ptr != rhs.m_ptr; }

private:
    T* m_ptr = nullptr;

    // Replaces the pointee with ptr, which must differ from m_ptr. The new
    // pointee is referenced before the old one is dropped, so that ptr
    // stays valid even when the old pointee was the only thing keeping it
    // alive.
    void assign(T* ptr)
    {
        T* old = m_ptr;
        m_ptr = ptr;
        acquire();
        if (old)
            Policy::dec_ref(old, this);
    }

    [[gnu::always_inline]]
    void acquire()
    {
        if (m_ptr)
            Policy::inc_ref(m_ptr, this);
    }

    [[gnu::always_inline]]
    void release()
    {
        if (m_ptr)
            Policy::dec_ref(m_ptr, this);
    }

    [[gnu::always_inline]]
    void moved(void* from)
        noexcept(noexcept(Policy::ptr_moved(nullptr, nullptr, nullptr)))
    {
        if (m_ptr)
            Policy::ptr_moved(m_ptr, from, this);
    }
};

}
|
||||
|
||||
#endif // ref_ptr_hh_INCLUDED
|
109
src/kakoune/safe_ptr.hh
Normal file
109
src/kakoune/safe_ptr.hh
Normal file
|
@ -0,0 +1,109 @@
|
|||
#ifndef safe_ptr_hh_INCLUDED
|
||||
#define safe_ptr_hh_INCLUDED
|
||||
|
||||
// #define SAFE_PTR_TRACK_CALLSTACKS
|
||||
|
||||
//King_DuckZ:
|
||||
#include <cassert>
|
||||
#define kak_assert(a) assert(a)
|
||||
|
||||
//#include "assert.hh"
|
||||
#include "ref_ptr.hh"
|
||||
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
|
||||
#ifdef SAFE_PTR_TRACK_CALLSTACKS
|
||||
#include "backtrace.hh"
|
||||
#include "vector.hh"
|
||||
#include <algorithm>
|
||||
#endif
|
||||
|
||||
namespace Kakoune
|
||||
{
|
||||
|
||||
// *** SafePtr: objects that assert nobody references them when they die ***
|
||||
|
||||
// Base class for objects that are observed through SafePtr.
// When KAK_DEBUG is defined it counts the live observers and asserts, on
// destruction, that none is left. Without KAK_DEBUG the class is empty.
class SafeCountable
{
public:
#ifdef KAK_DEBUG
    SafeCountable() {}
    // A freshly move-constructed object has no observers of its own: the
    // count (and recorded callstacks, if any) of the source is not carried over.
    SafeCountable (SafeCountable&&) {}
    ~SafeCountable()
    {
        kak_assert(m_count == 0);
    #ifdef SAFE_PTR_TRACK_CALLSTACKS
        kak_assert(m_callstacks.empty());
    #endif
    }

private:
    friend struct SafeCountablePolicy;
    #ifdef SAFE_PTR_TRACK_CALLSTACKS
    // Backtrace captured when an observer was registered, keyed by the
    // address of that observer.
    struct Callstack
    {
        Callstack(void* p) : ptr(p) {}
        void* ptr;
        Backtrace bt;
    };

    mutable Vector<Callstack> m_callstacks;
    #endif
    // Number of live observers; mutable so that const objects can be observed.
    mutable int m_count = 0;
#endif
};
|
||||
|
||||
struct SafeCountablePolicy
|
||||
{
|
||||
#ifdef KAK_DEBUG
|
||||
static void inc_ref(const SafeCountable* sc, void* ptr) noexcept
|
||||
{
|
||||
++sc->m_count;
|
||||
#ifdef SAFE_PTR_TRACK_CALLSTACKS
|
||||
sc->m_callstacks.emplace_back(ptr);
|
||||
#else
|
||||
static_cast<void>(ptr);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void dec_ref(const SafeCountable* sc, void* ptr) noexcept
|
||||
{
|
||||
--sc->m_count;
|
||||
kak_assert(sc->m_count >= 0);
|
||||
#ifdef SAFE_PTR_TRACK_CALLSTACKS
|
||||
auto it = std::find_if(sc->m_callstacks.begin(), sc->m_callstacks.end(),
|
||||
[=](const SafeCountable::Callstack& cs) { return cs.ptr == ptr; });
|
||||
kak_assert(it != sc->m_callstacks.end());
|
||||
sc->m_callstacks.erase(it);
|
||||
#else
|
||||
static_cast<void>(ptr);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ptr_moved(const SafeCountable* sc, void* from, void* to) noexcept
|
||||
{
|
||||
#ifdef SAFE_PTR_TRACK_CALLSTACKS
|
||||
auto it = std::find_if(sc->m_callstacks.begin(), sc->m_callstacks.end(),
|
||||
[=](const SafeCountable::Callstack& cs) { return cs.ptr == from; });
|
||||
kak_assert(it != sc->m_callstacks.end());
|
||||
it->ptr = to;
|
||||
#else
|
||||
static_cast<void>(sc);
|
||||
static_cast<void>(from);
|
||||
static_cast<void>(to);
|
||||
#endif
|
||||
}
|
||||
#else
|
||||
static void inc_ref(const SafeCountable*, void*) noexcept {}
|
||||
static void dec_ref(const SafeCountable*, void*) noexcept {}
|
||||
static void ptr_moved(const SafeCountable*, void*, void*) noexcept {}
|
||||
#endif
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
using SafePtr = RefPtr<T, SafeCountablePolicy>;
|
||||
|
||||
}
|
||||
|
||||
#endif // safe_ptr_hh_INCLUDED
|
124
src/main.cpp
124
src/main.cpp
|
@ -16,20 +16,24 @@
|
|||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "htmlretrieve.hpp"
|
||||
#include "commandline.hpp"
|
||||
#include "xpath.hpp"
|
||||
#include "scraplang.hpp"
|
||||
#include "html_pool.hpp"
|
||||
#include "read_all.hpp"
|
||||
#include "safe_stack_object.hpp"
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <fstream>
|
||||
#include <utility>
|
||||
#include <ciso646>
|
||||
#include <memory>
|
||||
#include <iterator>
|
||||
#include <stdexcept>
|
||||
|
||||
namespace {
|
||||
void dump_string ( const std::string& parPathDest, const std::string& parData );
|
||||
void load_from_commandline ( const boost::program_options::variables_map& parVarMap, duck::XPathPtr xpath );
|
||||
void load_from_model ( const boost::program_options::variables_map& parVarMap, duck::XPathPtr xpath );
|
||||
} //unnamed namespace
|
||||
|
||||
int main (int argc, char* argv[]) {
|
||||
|
@ -46,54 +50,25 @@ int main (int argc, char* argv[]) {
|
|||
return 2;
|
||||
}
|
||||
|
||||
const auto url = vm["input-url"].as<std::string>();
|
||||
const auto xpath = vm["xpath"].as<std::string>();
|
||||
#if !defined(NDEBUG)
|
||||
std::cout << "URL : " << url << "\n";
|
||||
std::cout << "XPath: " << xpath << std::endl;
|
||||
std::cout << "Agent: " << vm["agent"].as<std::string>() << std::endl;
|
||||
#endif
|
||||
|
||||
std::string html;
|
||||
|
||||
if ("-" != url) {
|
||||
html = duck::fetch_html(url, vm["agent"].as<std::string>(), false, false);
|
||||
}
|
||||
else {
|
||||
std::cin >> std::noskipws;
|
||||
std::istream_iterator<char> it(std::cin);
|
||||
std::istream_iterator<char> end;
|
||||
html = std::string(it, end);
|
||||
}
|
||||
|
||||
if (vm.count("dump-raw")) {
|
||||
dump_string(vm["dump-raw"].as<std::string>(), html);
|
||||
}
|
||||
|
||||
html = duck::clean_html(std::move(html));
|
||||
if (vm.count("dump")) {
|
||||
dump_string(vm["dump"].as<std::string>(), html);
|
||||
}
|
||||
|
||||
try {
|
||||
std::vector<std::string> queries;
|
||||
queries.reserve(1);
|
||||
queries.push_back(std::move(xpath));
|
||||
auto results = duck::xpath_query(html, queries);
|
||||
for (const auto& lst : results[0]) {
|
||||
std::cout << lst.first << ": " << lst.second << '\n';
|
||||
}
|
||||
curry::SafeStackObject<duck::XPath> query;
|
||||
if (vm.count("model"))
|
||||
load_from_model(vm, query);
|
||||
else
|
||||
load_from_commandline(vm, query);
|
||||
}
|
||||
catch (const duck::ParseError& err) {
|
||||
std::cerr << err.what() << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
catch (const std::runtime_error& err) {
|
||||
std::cerr << err.what() << std::endl;
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
void dump_string (const std::string& parPathDest, const std::string& parData) {
|
||||
std::unique_ptr<std::ofstream> ofs;
|
||||
const bool use_stdout = ("-" == parPathDest);
|
||||
|
@ -103,4 +78,73 @@ namespace {
|
|||
std::ostream* const os = (use_stdout ? &std::cout : ofs.get());
|
||||
*os << parData;
|
||||
}
|
||||
|
||||
// "XPath direct mode": fetches the document named by the "input-url" option
// through an HtmlPool, optionally dumps it (option "dump"), runs the single
// query given by the "xpath" option and prints every result to stdout as
// "<first>: <second>".
//
// parVarMap is expected to hold "input-url", "agent", "from-code", "xpath"
// and "namespace"; "dump" is optional.
void load_from_commandline (const boost::program_options::variables_map& parVarMap, duck::XPathPtr xpath) {
	using std::string;

	const auto& vm = parVarMap;
	const auto url = vm["input-url"].as<string>();

	duck::HtmlPool html_pool(
		string(vm["agent"].as<string>()),
		string(vm["from-code"].as<string>())
	);
	const auto in_html_id = html_pool.GetOrAdd(url);
	string html = *html_pool.GetByID(in_html_id);
	if (vm.count("dump")) {
		dump_string(vm["dump"].as<string>(), html);
	}

	//Deliberately not const: std::move() on a const string would silently
	//fall back to a copy when the query is pushed into the vector below.
	string xpath_str = vm["xpath"].as<string>();

#if !defined(NDEBUG)
	std::cout << " -- XPath direct mode --\n";
	std::cout << "URL : " << url << "\n";
	std::cout << "XPath: " << xpath_str << std::endl;
	std::cout << "Agent: " << vm["agent"].as<string>() << std::endl;
#endif

	std::vector<string> queries;
	queries.reserve(1);
	queries.push_back(std::move(xpath_str));
	auto results = xpath->run_query(html, queries, vm["namespace"].as<string>());
	//One query was submitted, so only the first result list is printed
	for (const auto& lst : results[0]) {
		std::cout << lst.first << ": " << lst.second << '\n';
	}
}
|
||||
|
||||
// "XPath model mode": reads the scraplang script named by the "model"
// option, parses it and applies it using a freshly created HtmlPool
// (options "agent" and "from-code") and the "namespace" option as default
// namespace.
void load_from_model (const boost::program_options::variables_map& parVarMap, duck::XPathPtr xpath) {
	using std::string;

	const auto& opts = parVarMap;

#if !defined(NDEBUG)
	std::cout << " -- XPath model mode --\n";
	if (opts.count("input-url"))
		std::cout << "URL : " << opts["input-url"].as<string>() << "\n";
	std::cout << "Model: " << opts["model"].as<string>() << std::endl;
	std::cout << "Agent: " << opts["agent"].as<string>() << std::endl;
#endif

	const string model_path = opts["model"].as<string>();
	const string script = duck::read_all(model_path);
	auto ast = duck::sl::parse(script);

	duck::HtmlPool html_pool(
		string(opts["agent"].as<string>()),
		string(opts["from-code"].as<string>())
	);

	//html_pool lives on this stack frame and is only lent to apply()
	duck::sl::apply(
		ast,
		duck::sl::HtmlPoolBaseSP(&html_pool),
		xpath,
		string(opts["namespace"].as<string>())
	);
}
|
||||
} //unnamed namespace
|
||||
|
|
38
src/read_all.cpp
Normal file
38
src/read_all.cpp
Normal file
|
@ -0,0 +1,38 @@
|
|||
/* Copyright (C) 2015 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "read_all.hpp"
|
||||
#include <fstream>
|
||||
#include <iterator>
|
||||
|
||||
namespace duck {
	// Returns everything that is left in parStream, byte for byte.
	//
	// The data is pulled straight from the stream buffer, so whitespace is
	// preserved without changing the stream's formatting flags (the caller's
	// skipws setting is left alone) and without the per-character overhead
	// of formatted extraction. A stream that is not in a good state yields
	// an empty string. The stream's state bits are not modified.
	std::string read_all (std::istream& parStream) {
		typedef std::istreambuf_iterator<char> CharIterator;

		if (!parStream.good())
			return std::string();
		return std::string(CharIterator(parStream), CharIterator());
	}

	// Overload for temporaries, e.g. read_all(std::ifstream(path)).
	std::string read_all (std::istream&& parStream) {
		return read_all(parStream);
	}

	// Returns the whole content of the file at parPath. A file that cannot
	// be opened yields an empty string.
	std::string read_all (const std::string& parPath) {
		return read_all(std::ifstream(parPath));
	}
} //namespace duck
|
|
@ -16,34 +16,22 @@
|
|||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef idBE96C2D49C4C413888A79EAEB2B9C0FA
|
||||
#define idBE96C2D49C4C413888A79EAEB2B9C0FA
|
||||
#ifndef id0768F384342E4FD58028BE415A725169
|
||||
#define id0768F384342E4FD58028BE415A725169
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <memory>
|
||||
|
||||
namespace std {
|
||||
template <typename Char, typename Traits>
|
||||
class basic_istream;
|
||||
|
||||
typedef basic_istream<char> istream;
|
||||
} //namespace std
|
||||
|
||||
namespace duck {
|
||||
struct ScrapNode;
|
||||
struct element_def;
|
||||
|
||||
class ScrapNodePtr {
|
||||
public:
|
||||
explicit ScrapNodePtr ( ScrapNode* parPtr );
|
||||
ScrapNodePtr ( ScrapNodePtr&& parOther );
|
||||
~ScrapNodePtr ( void ) noexcept;
|
||||
|
||||
ScrapNode& operator* ( void ) { return *m_ptr; }
|
||||
const ScrapNode& operator* ( void ) const { return *m_ptr; }
|
||||
ScrapNode& operator-> ( void ) { return *m_ptr; }
|
||||
const ScrapNode& operator-> ( void ) const { return *m_ptr; }
|
||||
|
||||
private:
|
||||
std::unique_ptr<ScrapNode> m_ptr;
|
||||
};
|
||||
|
||||
ScrapNodePtr parse_scraplang ( const std::string& parData );
|
||||
std::vector<element_def> get_xpath_definitions ( const ScrapNode& parAST );
|
||||
std::string read_all ( std::istream& parStream );
|
||||
std::string read_all ( std::istream&& parStream );
|
||||
std::string read_all ( const std::string& parPath );
|
||||
} //namespace duck
|
||||
|
||||
#endif
|
104
src/safe_stack_object.hpp
Normal file
104
src/safe_stack_object.hpp
Normal file
|
@ -0,0 +1,104 @@
|
|||
/*
|
||||
Copyright 2016, 2017 Michele "King_DuckZ" Santullo
|
||||
|
||||
This file is part of MyCurry.
|
||||
|
||||
MyCurry is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
MyCurry is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with MyCurry. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "kakoune/safe_ptr.hh"
#include <type_traits>
#include <utility>
|
||||
|
||||
namespace curry {
|
||||
template <typename T>
|
||||
class SafeStackObject {
|
||||
public:
|
||||
typedef Kakoune::SafePtr<T> safe_ptr;
|
||||
|
||||
SafeStackObject();
|
||||
SafeStackObject (SafeStackObject&& parOther);
|
||||
SafeStackObject (const SafeStackObject& parOther) = delete;
|
||||
template <typename... Args> explicit SafeStackObject (Args&&... parArgs);
|
||||
~SafeStackObject() noexcept = default;
|
||||
|
||||
SafeStackObject& operator= (SafeStackObject&& parOther) = delete;
|
||||
SafeStackObject& operator= (const SafeStackObject& parOther) = delete;
|
||||
|
||||
operator Kakoune::SafePtr<T>&();
|
||||
template <typename U>
|
||||
operator Kakoune::SafePtr<U>();
|
||||
T& operator*();
|
||||
safe_ptr& operator->();
|
||||
|
||||
private:
|
||||
T m_obj;
|
||||
safe_ptr m_obj_ptr;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
SafeStackObject<T>::SafeStackObject() :
|
||||
m_obj(),
|
||||
m_obj_ptr(&m_obj)
|
||||
{
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
SafeStackObject<T>::SafeStackObject (SafeStackObject&& parOther) :
|
||||
m_obj(std::move(parOther.m_obj)),
|
||||
m_obj_ptr(&m_obj)
|
||||
{
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
template <typename... Args>
|
||||
SafeStackObject<T>::SafeStackObject (Args&&... parArgs) :
|
||||
m_obj(std::forward<Args>(parArgs)...),
|
||||
m_obj_ptr(&m_obj)
|
||||
{
|
||||
}
|
||||
|
||||
//template <typename T>
|
||||
//SafeStackObject& SafeStackObject<T>::operator= (SafeStackObject&& parOther) {
|
||||
// m_obj = std::move(parOther.m_obj);
|
||||
// m_obj_ptr = std::move(parOther.m_obj_ptr);
|
||||
// m_ob
|
||||
//}
|
||||
|
||||
//template <typename T>
|
||||
//SafeStackObject& SafeStackObject<T>::operator= (const SafeStackObject& parOther) {
|
||||
//}
|
||||
|
||||
template <typename T>
|
||||
SafeStackObject<T>::operator Kakoune::SafePtr<T>&() {
|
||||
return m_obj_ptr;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
template <typename U>
|
||||
SafeStackObject<T>::operator Kakoune::SafePtr<U>() {
|
||||
return Kakoune::SafePtr<U>(&m_obj);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
T& SafeStackObject<T>::operator*() {
|
||||
return *m_obj_ptr;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
auto SafeStackObject<T>::operator->() -> safe_ptr& {
|
||||
return m_obj_ptr;
|
||||
}
|
||||
} //namespace curry
|
25
src/scraplang.hpp
Normal file
25
src/scraplang.hpp
Normal file
|
@ -0,0 +1,25 @@
|
|||
/* Copyright (C) 2017 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef id8483FDE5CA0E4F40BDE0F4469AC2DF79
|
||||
#define id8483FDE5CA0E4F40BDE0F4469AC2DF79
|
||||
|
||||
#include "scraplang/parse.hpp"
|
||||
#include "scraplang/apply.hpp"
|
||||
|
||||
#endif
|
517
src/scraplang/apply.cpp
Normal file
517
src/scraplang/apply.cpp
Normal file
|
@ -0,0 +1,517 @@
|
|||
/* Copyright (C) 2015 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#define APPLY_VERBOSE
|
||||
|
||||
#include "apply.hpp"
|
||||
#include "mstch/mstch.hpp"
|
||||
#include "html_pool_base.hpp"
|
||||
#include "scrap_node.hpp"
|
||||
#include "xpath_runner.hpp"
|
||||
#if defined(APPLY_VERBOSE)
|
||||
# include "stream_scrap_node.hpp"
|
||||
#endif
|
||||
#include <map>
|
||||
#include <boost/variant/apply_visitor.hpp>
|
||||
#include <string_view>
|
||||
#include <list>
|
||||
#include <functional>
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
|
||||
namespace duck { namespace sl {
|
||||
#if defined(APPLY_VERBOSE)
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
struct EntryNode;
|
||||
|
||||
struct MustacheEntry {
|
||||
std::string text;
|
||||
mstch::map context;
|
||||
};
|
||||
|
||||
using EntryNodeList = std::vector<std::pair<
|
||||
const SourceInfo*,
|
||||
EntryNode
|
||||
>>;
|
||||
using MustacheEntryMap = std::map<std::string, MustacheEntry>;
|
||||
typedef std::function<void(std::size_t)> FixLengthCommand;
|
||||
|
||||
struct EntryNode {
|
||||
explicit EntryNode (const std::string_view& parName) :
|
||||
name(parName)
|
||||
{
|
||||
}
|
||||
EntryNode (EntryNode&&) = default;
|
||||
EntryNode (const EntryNode&) = default;
|
||||
EntryNode& operator= (EntryNode&&) = default;
|
||||
EntryNode& operator= (const EntryNode&) = default;
|
||||
|
||||
std::string_view name;
|
||||
std::vector<EntryNode> structs;
|
||||
std::vector<const XPathElement*> xpaths;
|
||||
};
|
||||
|
||||
struct ApplyEntry {
|
||||
ApplyEntry (const ApplyEntry&) = default;
|
||||
ApplyEntry (const SourceInfo* parAppTo, std::string_view parMstchName) :
|
||||
apply_to(parAppTo),
|
||||
content(""),
|
||||
mustache_name(parMstchName)
|
||||
{
|
||||
assert(apply_to);
|
||||
assert(not apply_to->value.empty());
|
||||
}
|
||||
ApplyEntry (ApplyEntry&&) = default;
|
||||
ApplyEntry& operator=(ApplyEntry&&) = default;
|
||||
|
||||
const SourceInfo* apply_to;
|
||||
EntryNode content;
|
||||
std::string_view mustache_name;
|
||||
};
|
||||
|
||||
//Visitor that files struct items under an EntryNode: xpath elements are
//recorded by address, struct blocks become child nodes which are filled
//recursively.
class StructItemExtractor : public boost::static_visitor<> {
public:
	StructItemExtractor() = delete;
	explicit StructItemExtractor (EntryNode& parRoot) : m_root(parRoot) {}

	void operator() (const XPathElement& parVal) {
		m_root.xpaths.push_back(&parVal);
	}

	void operator() (const StructBlock& parVal) {
		//The child visitor only appends to the child's own vectors, so
		//the reference taken here stays valid throughout the loop.
		EntryNode& child = m_root.structs.emplace_back(parVal.name);
		StructItemExtractor child_visitor(child);
		for (auto& child_item : parVal.xpaths) {
			boost::apply_visitor(child_visitor, child_item);
		}
	}

private:
	EntryNode& m_root;
};
|
||||
|
||||
//Visitor that turns a node into an array of (at least) exp_size items.
//An empty array is resized to exp_size default nodes; a non-empty array
//that is too short is padded with copies of its last item; any other node
//type is first wrapped into a one-item array and then padded the same way.
//The returned rvalue reference refers to m_retval, so it is only valid
//while the visitor is alive.
class FillWithClonesVisitor : public boost::static_visitor<mstch::node&&> {
public:
	explicit FillWithClonesVisitor (std::size_t exp_size) :
		m_expected_size(exp_size)
	{ }
	virtual mstch::node&& operator()(mstch::array&& parOut) {
		if (parOut.empty()) {
			parOut.resize(m_expected_size);
		}
		else if (parOut.size() < m_expected_size) {
			//Copy the last item before growing the array: a reference
			//to parOut.back() would dangle as soon as the storage gets
			//reallocated.
			const mstch::node last_item(parOut.back());
			parOut.resize(m_expected_size, last_item);
		}
		m_retval = std::move(parOut);
		return std::move(m_retval);
	}

	template <typename T>
	mstch::node&& operator()(T&& parOut) {
		mstch::array retval;
		retval.reserve(m_expected_size);
		retval.push_back(std::move(parOut));
		return (*this)(std::move(retval));
	}

protected:
	mstch::node m_retval;
	std::size_t m_expected_size;
};
|
||||
|
||||
class FillWithStringsVisitor : public FillWithClonesVisitor {
|
||||
public:
|
||||
FillWithStringsVisitor (const std::string& parVal, std::size_t exp_size) :
|
||||
FillWithClonesVisitor(exp_size),
|
||||
m_value(parVal)
|
||||
{ }
|
||||
|
||||
mstch::node&& operator()(mstch::array&& parOut) override {
|
||||
std::fill_n(
|
||||
std::back_inserter(parOut),
|
||||
std::max(m_expected_size, parOut.size()) - parOut.size(),
|
||||
m_value.get()
|
||||
);
|
||||
m_retval = std::move(parOut);
|
||||
return std::move(m_retval);
|
||||
}
|
||||
|
||||
using FillWithClonesVisitor::operator();
|
||||
|
||||
private:
|
||||
std::reference_wrapper<const std::string> m_value;
|
||||
};
|
||||
|
||||
//Replaces parMap[parKey] with an array of at least parTotal items, padding
//with parDefault where items are missing.
void fill_with_defaults (mstch::map& parMap, std::size_t parTotal, const std::string& parKey, const std::string& parDefault) {
	FillWithStringsVisitor padder(parDefault, parTotal);
	mstch::node& slot = parMap[parKey];
	slot = boost::apply_visitor(padder, std::move(slot));
}
|
||||
|
||||
//Replaces parMap[parKey] with an array of at least parTotal items, padding
//with clones of its last item where items are missing.
void fill_with_last_item_clones (mstch::map& parMap, std::size_t parTotal, const std::string& parKey) {
	FillWithClonesVisitor padder(parTotal);
	mstch::node& slot = parMap[parKey];
	slot = boost::apply_visitor(padder, std::move(slot));
}
|
||||
|
||||
mstch::map to_mustache_dict_recursive (
|
||||
const EntryNode& parNode,
|
||||
std::string_view parSrc,
|
||||
XPathRunner& parRunner
|
||||
);
|
||||
|
||||
void store_entry_subtree (
|
||||
const std::vector<StructItem>& parXPaths,
|
||||
EntryNode& parCurrList
|
||||
) {
|
||||
for (auto& itm : parXPaths) {
|
||||
StructItemExtractor extractor(parCurrList);
|
||||
boost::apply_visitor(extractor, itm);
|
||||
}
|
||||
}
|
||||
|
||||
//Visitor that walks the parsed script and sorts what it finds into three
//collections: global "from" entries, "apply" entries and mustache
//templates indexed by name. The collected data points into the visited
//nodes, which must therefore outlive the builder.
class DictBuilder : public boost::static_visitor<> {
public:
	explicit DictBuilder() = default;

	void operator() (const std::vector<ScrapNode>& parVal) {
		for (const auto& child : parVal) {
			boost::apply_visitor(*this, child);
		}
	}

	void operator() (const FromBlock& parVal) {
#if defined(APPLY_VERBOSE)
		std::cout << parVal << '\n';
#endif
		//global entries are anonymous, hence the empty name
		m_global_entries.emplace_back(&parVal.source, EntryNode(""));
		store_entry_subtree(parVal.xpaths, m_global_entries.back().second);
	}

	void operator() (const ApplyBlock& parVal) {
#if defined(APPLY_VERBOSE)
		std::cout << parVal << '\n';
#endif
		assert(not parVal.source.value.empty());
		m_apply_entries.emplace_back(&parVal.source, parVal.mustache_model);
		store_entry_subtree(parVal.xpaths, m_apply_entries.back().content);
	}

	void operator() (const MustacheBlock& parVal) {
#if defined(APPLY_VERBOSE)
		std::cout << "Mustache block \"" << parVal.name << "\"\n";
#endif
		//Unless this block carries the same name as the current template,
		//a fresh (or reset) entry becomes the current one.
		if (not m_current_mustache_name or *m_current_mustache_name != parVal.name) {
			const auto inserted = m_mustaches.insert_or_assign(parVal.name, MustacheEntry());
			m_current_mustache_name = &inserted.first->first;
			m_current_mustache = &inserted.first->second;
		}

		m_current_mustache->text = parVal.content;
	}

	const EntryNodeList& global_entries() const { return m_global_entries; }
	const MustacheEntryMap& mustache_entries() const { return m_mustaches; }
	const std::vector<ApplyEntry>& apply_entries() const { return m_apply_entries; }

private:
	EntryNodeList m_global_entries;
	std::vector<ApplyEntry> m_apply_entries;
	MustacheEntryMap m_mustaches;
	//key and value of the template most recently selected in m_mustaches
	const std::string* m_current_mustache_name = nullptr;
	MustacheEntry* m_current_mustache = nullptr;
};
|
||||
|
||||
//Visitor returning how many items a node stands for: the length of an
//array, 1 for any other node type.
struct ItemCountingVisitor : public boost::static_visitor<std::size_t> {
	std::size_t operator()(const mstch::array& parItem) const {
		return parItem.size();
	}

	template <typename T>
	std::size_t operator()(const T&) const {
		return 1;
	}
};
|
||||
|
||||
//Transposes a set of named arrays into an array of maps: item z of the
//array named N ends up as entry N of the z-th map. Values that are not
//arrays are stored in the first map only.
class ArraysToStructArrayVisitor : public boost::static_visitor<> {
public:
	explicit ArraysToStructArrayVisitor (std::size_t parExpectedSize) :
		m_array(parExpectedSize, mstch::map()),
		m_expected_size(parExpectedSize)
	{
	}

	void operator()(const std::string& parName, const mstch::array& parItem) {
		//parItem is expected to hold no more than parExpectedSize items
		std::size_t index = 0;
		for (const auto& value : parItem) {
			boost::get<mstch::map>(m_array[index])[parName] = value;
			++index;
		}
	}

	template <typename T>
	void operator()(const std::string& parName, const T& parItem) {
		boost::get<mstch::map>(m_array[0])[parName] = parItem;
	}

	//Gives away the collected data: the only map when the expected size
	//is 1, the whole array otherwise. The visitor must not be used
	//afterwards.
	mstch::node steal_struct() {
		if (1 != m_expected_size)
			return mstch::node(std::move(m_array));
		return mstch::node(std::move(m_array[0]));
	}

private:
	mstch::array m_array;
	const std::size_t m_expected_size;
};
|
||||
|
||||
const std::vector<std::string>& query_xpath_by_name (
|
||||
const EntryNodeList& parNodes,
|
||||
const std::string_view& parName,
|
||||
XPathRunner& parRunner
|
||||
) {
|
||||
for (auto& curr_node : parNodes) {
|
||||
assert(curr_node.first);
|
||||
const SourceInfo& source = *curr_node.first;
|
||||
const EntryNode& entry = curr_node.second;
|
||||
assert(entry.name.empty());
|
||||
|
||||
auto it_found = std::find_if(
|
||||
entry.xpaths.begin(),
|
||||
entry.xpaths.end(),
|
||||
[&parName](const auto& xpath_elem) {
|
||||
return xpath_elem->name == parName;
|
||||
}
|
||||
);
|
||||
|
||||
if (it_found != entry.xpaths.end()) {
|
||||
const XPathElement* const val = *it_found;
|
||||
assert(val);
|
||||
return parRunner.query(source.value, val->xpath);
|
||||
}
|
||||
}
|
||||
|
||||
static const std::vector<std::string> empty_retval;
|
||||
std::cout << "query_xpath_by_name(parNodes, \"" << parName <<
|
||||
"\", parRunner) -> nothing found" << std::endl;
|
||||
assert(false); //throw?
|
||||
return empty_retval;
|
||||
}
|
||||
|
||||
//Returns the largest item count (see ItemCountingVisitor) found among the
//values of parMap, or 0 when the map is empty.
std::size_t largest_array_size_in (const mstch::map& parMap) {
	std::size_t largest = 0;
	for (const auto& key_value : parMap) {
		const std::size_t curr_count =
			boost::apply_visitor(ItemCountingVisitor(), key_value.second);
		if (largest < curr_count)
			largest = curr_count;
	}
	return largest;
}
|
||||
|
||||
//Runs every xpath query of parNode against parSrc and stores the outcome
//in parOut under the query's name: the lone result as a string, several
//results as an array, and no result as the element's default value when
//it has one or as an empty string otherwise.
//For each query a command is appended to parFixCommands; when later
//invoked with a length it pads the stored value to that length. The
//commands hold references to parOut and to the xpath elements, so they
//must be run while those are still alive.
void fill_with_xpaths (
	mstch::map& parOut,
	std::vector<FixLengthCommand>& parFixCommands,
	const EntryNode& parNode,
	std::string_view parSrc,
	XPathRunner& parRunner
) {
	for (const XPathElement* elem : parNode.xpaths) {
		assert(elem);
		std::cout << "Running query for \"" << elem->name << "\"\n";
		auto found = parRunner.query(parSrc, elem->xpath);

		mstch::node& slot = parOut[elem->name];
		if (found.size() > 1) {
			mstch::array values;
			values.reserve(found.size());
			for (const auto& item : found) {
				values.push_back(item);
			}
			slot = std::move(values);
		}
		else if (found.size() == 1) {
			slot = found.front();
		}
		else if (elem->def_val) {
			slot = *elem->def_val;
		}
		else {
			slot = std::string();
		}

		if (elem->def_val) {
			parFixCommands.push_back([&parOut, elem](std::size_t parTotal) {
				fill_with_defaults(parOut, parTotal, elem->name, *elem->def_val);
			});
		}
		else {
			parFixCommands.push_back([&parOut, elem](std::size_t parTotal) {
				fill_with_last_item_clones(parOut, parTotal, elem->name);
			});
		}
	}
}
|
||||
|
||||
//Builds the dictionary of every struct nested in parNode and stores it
//in parOut under the struct's name, after transposing it from a map of
//arrays into an array of maps (see ArraysToStructArrayVisitor).
void fill_with_structs (
	mstch::map& parOut,
	const EntryNode& parNode,
	std::string_view parSrc,
	XPathRunner& parRunner
) {
	for (const EntryNode& child : parNode.structs) {
		assert(not child.name.empty());

		auto child_dict = to_mustache_dict_recursive(child, parSrc, parRunner);
		ArraysToStructArrayVisitor transposer(largest_array_size_in(child_dict));

		for (auto&& key_value : child_dict) {
			const auto& key = key_value.first;
			auto forward_to_transposer = [&transposer, &key](const auto& value) {
				transposer(key, value);
			};
			boost::apply_visitor(forward_to_transposer, key_value.second);
		}

		parOut[std::string(child.name)] = transposer.steal_struct();
	}
}
|
||||
|
||||
//Builds the mustache dictionary for parNode: query results first, nested
//structs next, then every query value is padded to the length of the
//largest item in the dictionary.
mstch::map to_mustache_dict_recursive (
	const EntryNode& parNode,
	std::string_view parSrc,
	XPathRunner& parRunner
) {
	mstch::map dict;
	std::vector<FixLengthCommand> pending_fixes;

	fill_with_xpaths(dict, pending_fixes, parNode, parSrc, parRunner);
	fill_with_structs(dict, parNode, parSrc, parRunner);

	//the target length is measured once, before any padding takes place
	const std::size_t target_length = largest_array_size_in(dict);
	for (const FixLengthCommand& fix : pending_fixes) {
		fix(target_length);
	}

	return dict;
}
|
||||
|
||||
//Builds one mustache dictionary out of all the entries in parNodes.
//Entries are processed in order and, when two of them define the same
//key, the value from the later entry is the one that is kept.
//Only sources of type URL are supported: anything else trips an
//assertion (and, in builds without assertions, is queried with an empty
//source string).
mstch::map to_mustache_map (const EntryNodeList& parNodes, XPathRunner& parRunner) {
	mstch::map retval;
	for (auto& entry : parNodes) {
		//Check the pointer before it gets dereferenced for the first
		//time by the log line below.
		assert(entry.first);
		assert(entry.second.name.empty());
		std::cout << "Analyzing entry " << *entry.first << '\n';

		std::string_view src_url;

		switch (entry.first->type) {
		case SourceInfo::URL:
			src_url = entry.first->value;
			break;
		case SourceInfo::Token:
		default:
			assert(false); //not reached
		}

		mstch::map curr_entry_map = to_mustache_dict_recursive(entry.second, src_url, parRunner);
		//merge() only transfers keys that curr_entry_map lacks, so
		//freshly extracted values take precedence over older ones
		curr_entry_map.merge(std::move(retval));
		retval.swap(curr_entry_map);
	}

	return retval;
}
|
||||
|
||||
void exec_apply_block (
|
||||
const SourceInfo& parSourceInfo,
|
||||
const EntryNode& parEntryNode,
|
||||
const MustacheEntry& parMustache,
|
||||
XPathRunner& parXPathRunner
|
||||
) {
|
||||
EntryNodeList entry_node {std::make_pair(&parSourceInfo, parEntryNode)};
|
||||
mstch::map entry_ctx = to_mustache_map(entry_node, parXPathRunner);
|
||||
for (auto& ctx : parMustache.context) {
|
||||
entry_ctx[ctx.first] = ctx.second;
|
||||
}
|
||||
|
||||
std::cout << "context size: " << entry_ctx.size() << '\n';
|
||||
for (auto& ctx_itm : entry_ctx) {
|
||||
std::cout << '\t' << ctx_itm.first << '\n';
|
||||
}
|
||||
|
||||
std::cout << mstch::render(parMustache.text, entry_ctx) << std::endl;
|
||||
}
|
||||
} //unnamed namespace
|
||||
|
||||
std::vector<std::string> apply (
|
||||
const ScrapNode& node,
|
||||
HtmlPoolBaseSP html_pool,
|
||||
XPathPtr xpath,
|
||||
std::string&& parDefNamespace
|
||||
) {
|
||||
using std::placeholders::_1;
|
||||
|
||||
DictBuilder dict_builder;
|
||||
boost::apply_visitor(dict_builder, node);
|
||||
|
||||
std::vector<std::string> retval;
|
||||
const EntryNodeList& global_entries = dict_builder.global_entries();
|
||||
const MustacheEntryMap& mustaches = dict_builder.mustache_entries();
|
||||
const std::vector<ApplyEntry> apply_entries = dict_builder.apply_entries();
|
||||
retval.reserve(apply_entries.size());
|
||||
|
||||
std::cout << "-------------- visiting done ----------------\n";
|
||||
XPathRunner xpath_runner(html_pool, xpath, std::move(parDefNamespace));
|
||||
|
||||
for (auto& apply_entry : apply_entries) {
|
||||
std::string name(apply_entry.mustache_name);
|
||||
const auto& mustache = mustaches.at(name);
|
||||
if (SourceInfo::Token == apply_entry.apply_to->type) {
|
||||
std::vector<std::string> sources =
|
||||
query_xpath_by_name(global_entries, apply_entry.apply_to->value, xpath_runner);
|
||||
|
||||
for (auto& source : sources) {
|
||||
SourceInfo new_source;
|
||||
new_source.value = source;
|
||||
new_source.type = SourceInfo::URL;
|
||||
|
||||
EntryNode new_node(apply_entry.content.name);
|
||||
new_node.structs = apply_entry.content.structs;
|
||||
new_node.xpaths = apply_entry.content.xpaths;
|
||||
|
||||
exec_apply_block(new_source, new_node, mustache, xpath_runner);
|
||||
}
|
||||
}
|
||||
else {
|
||||
assert(apply_entry.apply_to);
|
||||
exec_apply_block(*apply_entry.apply_to, apply_entry.content, mustache, xpath_runner);
|
||||
}
|
||||
}
|
||||
|
||||
return retval;
|
||||
}
|
||||
}} //namespace duck::sl
|
36
src/scraplang/apply.hpp
Normal file
36
src/scraplang/apply.hpp
Normal file
|
@ -0,0 +1,36 @@
|
|||
/* Copyright (C) 2015 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef idC73DBB42FB76433BAFC0B73EAC3B70FF
|
||||
#define idC73DBB42FB76433BAFC0B73EAC3B70FF
|
||||
|
||||
#include "scrap_node.hpp"
|
||||
#include "scraplang/html_pool_base.hpp"
|
||||
#include "xpath_fwd.hpp"
|
||||
#include <string>
|
||||
|
||||
namespace duck { namespace sl {
|
||||
std::vector<std::string> apply (
|
||||
const ScrapNode& node,
|
||||
HtmlPoolBaseSP html_pool,
|
||||
XPathPtr xpath,
|
||||
std::string&& parDefNamespace
|
||||
);
|
||||
}} //namespace duck::sl
|
||||
|
||||
#endif
|
|
@ -1,4 +1,4 @@
|
|||
/* Copyright (C) 2015 Michele Santullo
|
||||
/* Copyright (C) 2017 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
|
@ -19,22 +19,15 @@
|
|||
#ifndef id3875B5F868524EC3A1B83971D4A85777
|
||||
#define id3875B5F868524EC3A1B83971D4A85777
|
||||
|
||||
#include "element_types.hpp"
|
||||
#include <string>
|
||||
|
||||
namespace duck {
|
||||
enum ElementTypes {
|
||||
ElementType_String,
|
||||
ElementType_Integer,
|
||||
ElementType_Boolean,
|
||||
ElementType_Null,
|
||||
ElementType_Double
|
||||
};
|
||||
|
||||
struct element_def {
|
||||
namespace duck { namespace sl {
|
||||
struct ElementDef {
|
||||
std::string name;
|
||||
std::string xpath;
|
||||
ElementTypes type;
|
||||
};
|
||||
} //namespace duck
|
||||
}} //namespace duck::sl
|
||||
|
||||
#endif
|
32
src/scraplang/element_types.hpp
Normal file
32
src/scraplang/element_types.hpp
Normal file
|
@ -0,0 +1,32 @@
|
|||
/* Copyright (C) 2017 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef id1AC876186C4B48DD900399994C27A741
|
||||
#define id1AC876186C4B48DD900399994C27A741
|
||||
|
||||
namespace duck { namespace sl {
|
||||
//Kinds of value an element can be declared as.
//The numeric values are spelled out; they match the implicit numbering.
enum ElementTypes {
	ElementType_String = 0,
	ElementType_Integer = 1,
	ElementType_Boolean = 2,
	ElementType_Null = 3,
	ElementType_Double = 4
};
|
||||
}} //namespace duck::sl
|
||||
|
||||
#endif
|
40
src/scraplang/html_pool_base.hpp
Normal file
40
src/scraplang/html_pool_base.hpp
Normal file
|
@ -0,0 +1,40 @@
|
|||
/* Copyright (C) 2015 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef idDD58822D7D8B4AA7A0DD16B1CDEF413E
|
||||
#define idDD58822D7D8B4AA7A0DD16B1CDEF413E
|
||||
|
||||
#include "implem/ResourcePool.hpp"
|
||||
#include "kakoune/safe_ptr.hh"
|
||||
#include <string_view>
|
||||
|
||||
namespace duck { namespace sl {
|
||||
namespace implem {
|
||||
typedef duckutil::ResourcePool<std::string, std::string_view> HtmlPoolBase;
|
||||
} //namespace implem
|
||||
|
||||
class HtmlPoolBase : public implem::HtmlPoolBase, public Kakoune::SafeCountable {
|
||||
public:
|
||||
using implem::HtmlPoolBase::HtmlPoolBase;
|
||||
using implem::HtmlPoolBase::operator=;
|
||||
};
|
||||
|
||||
typedef Kakoune::SafePtr<HtmlPoolBase> HtmlPoolBaseSP;
|
||||
}} //namespace duck::sl
|
||||
|
||||
#endif
|
121
src/scraplang/implem/ResourcePool.hpp
Normal file
121
src/scraplang/implem/ResourcePool.hpp
Normal file
|
@ -0,0 +1,121 @@
|
|||
/* Copyright (C) 2015 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef id1A180A0568E84FD88D57FAB82C69600E
|
||||
#define id1A180A0568E84FD88D57FAB82C69600E
|
||||
|
||||
#include "SaltedIndex.hpp"
|
||||
#include <map>
|
||||
#include <cstdint>
|
||||
#include <cassert>
|
||||
#include <type_traits>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
|
||||
namespace duckutil {
|
||||
namespace Implem {
|
||||
//Book-keeping record for one pooled resource: a pointer to the resource, a
//pointer to its name, its id and a reference count.
//The wrapper does not delete what it points to; its destructor only asserts
//that the reference count went back to zero. Comparison operators order
//wrappers by name.
template <typename Res, typename Name, typename IDT>
class ResourceResNameWrapper {
public:
	typedef IDT IDType;

	ResourceResNameWrapper ( const Name* parName, Res* parRes, IDType parID );
	ResourceResNameWrapper ( const ResourceResNameWrapper& ) = delete;
	~ResourceResNameWrapper ( void ) { assert(0 == m_refcount); }

	ResourceResNameWrapper& operator= (const ResourceResNameWrapper&) = delete;

	Res& GetResource ( void ) { return *m_resource; }
	const Res& GetResource ( void ) const { return *m_resource; }
	const Name& GetName ( void ) const { return *m_name; }
	void Retain ( void ) { ++m_refcount; }
	//Returns true when the last reference was dropped
	bool Release ( void ) { assert(m_refcount > 0); --m_refcount; return (0 == m_refcount); }
	uint32_t GetRefCount ( void ) const { return m_refcount; }
	IDType GetResourceID ( void ) const { return m_resId; }
	bool IsEmpty ( void ) const { return nullptr == m_resource; }
	void Reset ( void ) { m_resource = nullptr; m_name = nullptr; m_refcount = 0; m_resId = 0; }
	void DropRefCount ( void ) { m_refcount = 0; }

	bool operator== ( const ResourceResNameWrapper& parOther ) const { return (GetName() == parOther.GetName()); }
	bool operator!= ( const ResourceResNameWrapper& parOther ) const { return not (GetName() == parOther.GetName()); }
	bool operator< ( const ResourceResNameWrapper& parOther ) const { return (GetName() < parOther.GetName()); }
	bool operator> ( const ResourceResNameWrapper& parOther ) const { return (parOther.GetName() < GetName()); }
	bool operator>= ( const ResourceResNameWrapper& parOther ) const { return not (GetName() < parOther.GetName()); }
	bool operator<= ( const ResourceResNameWrapper& parOther ) const { return not (parOther.GetName() < GetName()); }

private:
	Res* m_resource;
	const Name* m_name;
	//Same width as the value reported by GetRefCount()
	uint32_t m_refcount;
	//Stored with the full width of IDType so that ids are never narrowed
	IDType m_resId;
};
|
||||
} //namespace Implem
|
||||
|
||||
template <typename Res, typename Name, typename Object=Name>
|
||||
class ResourcePool {
|
||||
public:
|
||||
typedef uint32_t IDType;
|
||||
typedef Name ResourceNameType;
|
||||
private:
|
||||
typedef Implem::ResourceResNameWrapper<Res, Name, IDType> ResourceWrapperType;
|
||||
typedef std::map<Name, ResourceWrapperType*> ResourceMapType;
|
||||
typedef std::vector<ResourceWrapperType*> ResourceVectorType;
|
||||
protected:
|
||||
typedef typename std::conditional<std::is_fundamental<ResourceNameType>::value, ResourceNameType, const ResourceNameType&>::type ResourceNameParamType;
|
||||
typedef typename std::conditional<std::is_fundamental<Object>::value, Object, const Object&>::type ResourceObjectParameterType;
|
||||
public:
|
||||
typedef Res ResourceType;
|
||||
typedef Object ResourceObjectType;
|
||||
|
||||
ResourcePool ( void ) = default;
|
||||
ResourcePool ( const ResourcePool& ) = delete;
|
||||
virtual ~ResourcePool ( void ) = default;
|
||||
|
||||
ResourcePool& operator= (const ResourcePool&) = delete;
|
||||
|
||||
ResourceType* GetByName ( ResourceNameParamType parName );
|
||||
const ResourceType* GetByName ( ResourceNameParamType parName ) const;
|
||||
IDType GetOrAdd ( ResourceObjectParameterType parObjectName );
|
||||
ResourceType* GetByID ( IDType parID );
|
||||
const ResourceType* GetByID ( IDType parID ) const;
|
||||
bool IsEmpty ( void ) const;
|
||||
|
||||
IDType AddResource ( ResourceObjectParameterType parRes );
|
||||
void ReleaseResource ( IDType parRes );
|
||||
void ReleaseResourceByName ( ResourceNameParamType parName );
|
||||
|
||||
void Dispose ( void ) noexcept;
|
||||
|
||||
protected:
|
||||
void Dispose_IgnoreReferenceCount ( void );
|
||||
|
||||
virtual ResourceType* OnResourceLoad ( ResourceObjectParameterType parRes ) = 0;
|
||||
virtual void OnResourceDestroy ( ResourceNameParamType parName, ResourceType* parRes ) noexcept = 0;
|
||||
virtual ResourceNameType GetResourceNameFromResourceObject ( ResourceObjectParameterType parRes ) = 0;
|
||||
|
||||
private:
|
||||
bool ReleaseResource ( typename ResourceMapType::iterator& parMapRes, typename ResourceVectorType::iterator& parVecRes );
|
||||
|
||||
ResourceMapType m_mapContainer; //For accesses by name
|
||||
ResourceVectorType m_linearContainer; //For accesses by index
|
||||
};
|
||||
} //namespace duckutil
|
||||
|
||||
#include "ResourcePool.inl"
|
||||
|
||||
#endif
|
243
src/scraplang/implem/ResourcePool.inl
Normal file
243
src/scraplang/implem/ResourcePool.inl
Normal file
|
@ -0,0 +1,243 @@
|
|||
/* Copyright (C) 2015 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
namespace duckutil {
|
||||
namespace Implem {
|
||||
///---------------------------------------------------------------------
|
||||
///---------------------------------------------------------------------
|
||||
template <typename Res, typename Name, typename IDT>
|
||||
ResourceResNameWrapper<Res, Name, IDT>::ResourceResNameWrapper (const Name* parName, Res* parRes, IDType parID) {
|
||||
assert(nullptr != parRes);
|
||||
assert(nullptr != parName);
|
||||
|
||||
m_resource = parRes;
|
||||
m_name = parName;
|
||||
m_refcount = 0;
|
||||
m_resId = static_cast<uint16_t>(parID);
|
||||
}
|
||||
|
||||
///---------------------------------------------------------------------
|
||||
///---------------------------------------------------------------------
|
||||
//Removes the run of null elements at the end of parVector, if any.
//Null elements that are followed by a non-null one are left in place.
//A lambda is used as the predicate: std::bind1st is not available in C++17.
template <typename V>
inline void TrimTrailingNulls (V& parVector) {
	const auto lastNonNull = std::find_if(
		parVector.rbegin(),
		parVector.rend(),
		[](const typename V::value_type& parItem) { return nullptr != parItem; }
	);
	const std::size_t nullsCount = static_cast<std::size_t>(lastNonNull - parVector.rbegin());
	assert(nullsCount <= parVector.size());
	if (nullsCount) {
		assert(nullptr == parVector.back());
		parVector.resize(parVector.size() - nullsCount);
	}
}
|
||||
} //namespace Implem
|
||||
|
||||
///-------------------------------------------------------------------------
|
||||
///-------------------------------------------------------------------------
|
||||
template <typename Res, typename Name, typename Object>
|
||||
typename ResourcePool<Res, Name, Object>::IDType ResourcePool<Res, Name, Object>::GetOrAdd (ResourceObjectParameterType parObjectName) {
|
||||
const ResourceNameType name = GetResourceNameFromResourceObject(parObjectName);
|
||||
typename ResourceMapType::const_iterator itFind = m_mapContainer.find(name);
|
||||
IDType retVal;
|
||||
if (m_mapContainer.end() == itFind) {
|
||||
retVal = AddResource(parObjectName);
|
||||
}
|
||||
else {
|
||||
typename ResourceVectorType::const_iterator itVecFind = std::find(m_linearContainer.begin(), m_linearContainer.end(), itFind->second);
|
||||
assert(m_linearContainer.end() != itVecFind);
|
||||
retVal = static_cast<IDType>(itVecFind - m_linearContainer.begin() + 1);
|
||||
}
|
||||
return retVal;
|
||||
}
|
||||
|
||||
///-------------------------------------------------------------------------
|
||||
///-------------------------------------------------------------------------
|
||||
template <typename Res, typename Name, typename Object>
|
||||
typename ResourcePool<Res, Name, Object>::ResourceType* ResourcePool<Res, Name, Object>::GetByName (ResourceNameParamType parName) {
|
||||
typename ResourceMapType::iterator itFind = m_mapContainer.find(parName);
|
||||
if (m_mapContainer.end() == itFind)
|
||||
return nullptr;
|
||||
else
|
||||
return &itFind->second->GetResource();
|
||||
}
|
||||
|
||||
///-------------------------------------------------------------------------
|
||||
///-------------------------------------------------------------------------
|
||||
template <typename Res, typename Name, typename Object>
|
||||
const typename ResourcePool<Res, Name, Object>::ResourceType* ResourcePool<Res, Name, Object>::GetByName (ResourceNameParamType parName) const {
|
||||
typename ResourceMapType::const_iterator itFind = m_mapContainer.find(parName);
|
||||
if (m_mapContainer.end() == itFind)
|
||||
return nullptr;
|
||||
else
|
||||
return &itFind->second->GetResource();
|
||||
}
|
||||
|
||||
///-------------------------------------------------------------------------
|
||||
///-------------------------------------------------------------------------
|
||||
template <typename Res, typename Name, typename Object>
|
||||
typename ResourcePool<Res, Name, Object>::ResourceType* ResourcePool<Res, Name, Object>::GetByID (IDType parID) {
|
||||
assert(parID > 0);
|
||||
if (0 == parID)
|
||||
return nullptr;
|
||||
|
||||
const auto index = static_cast<std::size_t>(parID - 1);
|
||||
if (index < m_linearContainer.size()) {
|
||||
ResourceWrapperType* res = m_linearContainer[index];
|
||||
return &res->GetResource();
|
||||
}
|
||||
else {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
///-------------------------------------------------------------------------
|
||||
///-------------------------------------------------------------------------
|
||||
template <typename Res, typename Name, typename Object>
|
||||
const typename ResourcePool<Res, Name, Object>::ResourceType* ResourcePool<Res, Name, Object>::GetByID (IDType parID) const {
|
||||
assert(parID > 0);
|
||||
if (0 == parID)
|
||||
return nullptr;
|
||||
|
||||
const auto index = static_cast<std::size_t>(parID - 1);
|
||||
if (index < m_linearContainer.size()) {
|
||||
ResourceWrapperType* res = m_linearContainer[index];
|
||||
return &res->GetResource();
|
||||
}
|
||||
else {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
///-------------------------------------------------------------------------
|
||||
///-------------------------------------------------------------------------
|
||||
template <typename Res, typename Name, typename Object>
|
||||
typename ResourcePool<Res, Name, Object>::IDType ResourcePool<Res, Name, Object>::AddResource (ResourceObjectParameterType parRes) {
|
||||
const ResourceNameType name = GetResourceNameFromResourceObject(parRes);
|
||||
typename ResourceMapType::iterator itPreExisting = m_mapContainer.find(name);
|
||||
if (m_mapContainer.end() != itPreExisting) {
|
||||
// if (itPreExisting->IsEmpty()) {
|
||||
// OnResourceReload(name);
|
||||
assert(nullptr != itPreExisting->second);
|
||||
assert(not itPreExisting->second->IsEmpty());
|
||||
itPreExisting->second->Retain();
|
||||
return itPreExisting->second->GetResourceID();
|
||||
}
|
||||
else {
|
||||
assert(m_mapContainer.end() == m_mapContainer.find(name));
|
||||
ResourceType* const newRes = OnResourceLoad(parRes);
|
||||
if (newRes) {
|
||||
std::pair<typename ResourceMapType::iterator, bool> newIt = m_mapContainer.insert(std::pair<ResourceNameType, ResourceWrapperType*>(name, nullptr));
|
||||
|
||||
IDType newID = static_cast<IDType>(m_linearContainer.size() + 1);
|
||||
ResourceWrapperType* const newWrapper = new ResourceWrapperType(&newIt.first->first, newRes, newID);
|
||||
assert(nullptr != newWrapper);
|
||||
m_linearContainer.push_back(newWrapper);
|
||||
newIt.first->second = newWrapper;
|
||||
newWrapper->Retain();
|
||||
return newID;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
///-------------------------------------------------------------------------
|
||||
///-------------------------------------------------------------------------
|
||||
template <typename Res, typename Name, typename Object>
|
||||
void ResourcePool<Res, Name, Object>::ReleaseResource (IDType parRes) {
|
||||
assert(parRes > 0);
|
||||
assert(static_cast<std::size_t>(parRes) <= m_linearContainer.size());
|
||||
|
||||
typename ResourceVectorType::iterator rele = m_linearContainer.begin() + (parRes - 1);
|
||||
assert(nullptr != *rele);
|
||||
assert(rele - m_linearContainer.begin() == static_cast<int>(parRes - 1));
|
||||
|
||||
typename ResourceMapType::iterator relemap = m_mapContainer.find((*rele)->GetName());
|
||||
assert(m_mapContainer.end() != relemap);
|
||||
|
||||
if (ReleaseResource(relemap, rele)) {
|
||||
delete relemap->second;
|
||||
m_mapContainer.erase(relemap);
|
||||
Implem::TrimTrailingNulls(m_linearContainer);
|
||||
}
|
||||
}
|
||||
|
||||
///-------------------------------------------------------------------------
|
||||
///-------------------------------------------------------------------------
|
||||
template <typename Res, typename Name, typename Object>
|
||||
void ResourcePool<Res, Name, Object>::ReleaseResourceByName (ResourceNameParamType parName) {
|
||||
typename ResourceMapType::iterator rele = m_mapContainer.find(parName);
|
||||
assert(m_mapContainer.end() != rele);
|
||||
|
||||
const IDType resId = rele->second->GetResourceID();
|
||||
assert(static_cast<std::size_t>(resId) <= m_linearContainer.size());
|
||||
assert(resId > 0);
|
||||
|
||||
if (ReleaseResource(rele, m_linearContainer.begin() + (resId - 1))) {
|
||||
delete rele->second;
|
||||
m_mapContainer.erase(rele);
|
||||
Implem::TrimTrailingNulls(m_linearContainer);
|
||||
}
|
||||
}
|
||||
|
||||
///-------------------------------------------------------------------------
|
||||
///-------------------------------------------------------------------------
|
||||
template <typename Res, typename Name, typename Object>
|
||||
bool ResourcePool<Res, Name, Object>::ReleaseResource (typename ResourceMapType::iterator& parMapRes, typename ResourceVectorType::iterator& parVecRes) {
|
||||
assert(parMapRes->second == *parVecRes);
|
||||
assert(nullptr != *parVecRes);
|
||||
ResourceWrapperType& currRes = **parVecRes;
|
||||
|
||||
assert(not currRes.IsEmpty());
|
||||
if (not currRes.IsEmpty()) {
|
||||
if (currRes.Release()) {
|
||||
this->OnResourceDestroy(currRes.GetName(), &currRes.GetResource());
|
||||
currRes.Reset();
|
||||
*parVecRes = nullptr;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
///-------------------------------------------------------------------------
|
||||
///-------------------------------------------------------------------------
|
||||
template <typename Res, typename Name, typename Object>
|
||||
bool ResourcePool<Res, Name, Object>::IsEmpty() const {
|
||||
return m_mapContainer.empty() and m_linearContainer.empty();
|
||||
}
|
||||
|
||||
///-------------------------------------------------------------------------
|
||||
///-------------------------------------------------------------------------
|
||||
template <typename Res, typename Name, typename Object>
|
||||
void ResourcePool<Res, Name, Object>::Dispose() noexcept {
|
||||
for (auto& currItem : m_linearContainer) {
|
||||
if (nullptr != currItem)
|
||||
this->OnResourceDestroy(currItem->GetName(), &currItem->GetResource());
|
||||
delete currItem;
|
||||
}
|
||||
m_linearContainer.clear();
|
||||
m_mapContainer.clear();
|
||||
}
|
||||
|
||||
///-------------------------------------------------------------------------
|
||||
///-------------------------------------------------------------------------
|
||||
template <typename Res, typename Name, typename Object>
|
||||
void ResourcePool<Res, Name, Object>::Dispose_IgnoreReferenceCount() {
|
||||
for (auto& currItem : m_linearContainer) {
|
||||
currItem->DropRefCount();
|
||||
}
|
||||
Dispose();
|
||||
}
|
||||
} //namespace duckutil
|
68
src/scraplang/implem/SaltedIndex.hpp
Normal file
68
src/scraplang/implem/SaltedIndex.hpp
Normal file
|
@ -0,0 +1,68 @@
|
|||
/* Copyright (C) 2015 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef id8D3B62D447574A23A82F8E9C60A629BD
|
||||
#define id8D3B62D447574A23A82F8E9C60A629BD
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
namespace duckutil {
|
||||
//Integer of type T split into two bit-fields, an index and a salt, that
//can also be read back as a single value.
//IndexBitSize and SaltBitSize must both be non-zero and together they must
//cover every bit of T.
//NOTE(review): MaxSalt and MaxIndex are computed with `1 << N` on an int, so
//bit sizes close to or above the width of int do not look representable -
//confirm before instantiating with wide fields.
template <typename T, std::size_t IndexBitSize, std::size_t SaltBitSize=sizeof(T)*8-IndexBitSize>
class SaltedIndex {
	static_assert(IndexBitSize > 0, "Invalid index size");
	static_assert(SaltBitSize > 0, "Invalid salt size");
	static_assert((SaltBitSize + IndexBitSize) == sizeof(T) * 8, "Type size is too small");
public:
	enum {
		SaltSize = SaltBitSize,
		IndexSize = IndexBitSize,
		MaxSalt = (1 << SaltBitSize) - 1,
		MaxIndex = (1 << IndexBitSize) - 1
	};

	SaltedIndex();
	SaltedIndex (const SaltedIndex& parOther);
	explicit SaltedIndex (T parIndex);
	SaltedIndex (T parSalt, T parIndex);
	~SaltedIndex();

	T GetSaltOnly() const { return m_salt; }
	T GetIndexOnly() const { return m_index; }
	T GetSaltedIndex() const { return m_saltedIndex; }
	void SetSalt (T parSalt);
	void SetIndex (T parIndex);
	T IncreaseSalt();

	//Comparisons look at the combined value
	bool operator== (const SaltedIndex& parOther) const { return m_saltedIndex == parOther.m_saltedIndex; }
	bool operator!= (const SaltedIndex& parOther) const { return m_saltedIndex != parOther.m_saltedIndex; }
	bool operator< (const SaltedIndex& parOther) const { return m_saltedIndex < parOther.m_saltedIndex; }

private:
	union {
		struct {
			T m_index : IndexBitSize;
			T m_salt : SaltBitSize;
		};
		T m_saltedIndex;
	};
};
|
||||
} //namespace duckutil
|
||||
|
||||
#include "SaltedIndex.inl"
|
||||
|
||||
#endif
|
86
src/scraplang/implem/SaltedIndex.inl
Normal file
86
src/scraplang/implem/SaltedIndex.inl
Normal file
|
@ -0,0 +1,86 @@
|
|||
/* Copyright (C) 2015 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
namespace duckutil {
|
||||
///-------------------------------------------------------------------------
|
||||
///-------------------------------------------------------------------------
|
||||
template <typename T, std::size_t IndexBitSize, std::size_t SaltBitSize>
|
||||
SaltedIndex<T, IndexBitSize, SaltBitSize>::SaltedIndex() :
|
||||
m_saltedIndex(0)
|
||||
{
|
||||
}
|
||||
|
||||
///-------------------------------------------------------------------------
|
||||
///-------------------------------------------------------------------------
|
||||
template <typename T, std::size_t IndexBitSize, std::size_t SaltBitSize>
|
||||
SaltedIndex<T, IndexBitSize, SaltBitSize>::SaltedIndex (const SaltedIndex& parOther) :
|
||||
m_saltedIndex(parOther.GetSaltedIndex())
|
||||
{
|
||||
}
|
||||
|
||||
///-------------------------------------------------------------------------
|
||||
///-------------------------------------------------------------------------
|
||||
template <typename T, std::size_t IndexBitSize, std::size_t SaltBitSize>
|
||||
SaltedIndex<T, IndexBitSize, SaltBitSize>::SaltedIndex (T parIndex) :
|
||||
m_saltedIndex(parIndex)
|
||||
{
|
||||
Assert(m_saltedIndex <= MaxIndex);
|
||||
Assert(m_salt == 0);
|
||||
}
|
||||
|
||||
///-------------------------------------------------------------------------
|
||||
///-------------------------------------------------------------------------
|
||||
template <typename T, std::size_t IndexBitSize, std::size_t SaltBitSize>
|
||||
SaltedIndex<T, IndexBitSize, SaltBitSize>::SaltedIndex (T parSalt, T parIndex) :
|
||||
m_index(parIndex),
|
||||
m_salt(parSalt)
|
||||
{
|
||||
Assert(parSalt <= MaxSalt);
|
||||
Assert(parIndex <= MaxIndex);
|
||||
}
|
||||
|
||||
///-------------------------------------------------------------------------
|
||||
///-------------------------------------------------------------------------
|
||||
template <typename T, std::size_t IndexBitSize, std::size_t SaltBitSize>
|
||||
SaltedIndex<T, IndexBitSize, SaltBitSize>::~SaltedIndex() {
|
||||
}
|
||||
|
||||
///-------------------------------------------------------------------------
|
||||
///-------------------------------------------------------------------------
|
||||
template <typename T, std::size_t IndexBitSize, std::size_t SaltBitSize>
|
||||
void SaltedIndex<T, IndexBitSize, SaltBitSize>::SetSalt (T parSalt) {
|
||||
Assert(parSalt <= MaxSalt);
|
||||
m_salt = parSalt;
|
||||
}
|
||||
|
||||
///-------------------------------------------------------------------------
|
||||
///-------------------------------------------------------------------------
|
||||
template <typename T, std::size_t IndexBitSize, std::size_t SaltBitSize>
|
||||
void SaltedIndex<T, IndexBitSize, SaltBitSize>::SetIndex (T parIndex) {
|
||||
Assert(parIndex <= MaxIndex);
|
||||
m_index = parIndex;
|
||||
}
|
||||
|
||||
///-------------------------------------------------------------------------
|
||||
///-------------------------------------------------------------------------
|
||||
template <typename T, std::size_t IndexBitSize, std::size_t SaltBitSize>
|
||||
T SaltedIndex<T, IndexBitSize, SaltBitSize>::IncreaseSalt() {
|
||||
Assert(m_salt < MaxSalt);
|
||||
++m_salt;
|
||||
}
|
||||
} //namespace duckutil
|
90
src/scraplang/parse.cpp
Normal file
90
src/scraplang/parse.cpp
Normal file
|
@ -0,0 +1,90 @@
|
|||
/* Copyright (C) 2017 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "parse.hpp"
|
||||
#include "scraplang/parse_exports.hpp"
|
||||
#include "scraplang/scrapgrammar.hpp"
|
||||
#include "scraplang/element_def.hpp"
|
||||
#include <boost/spirit/include/qi.hpp>
|
||||
#include <boost/spirit/include/phoenix_stl.hpp>
|
||||
#include <boost/spirit/include/phoenix_fusion.hpp>
|
||||
#include <boost/fusion/adapted/struct.hpp>
|
||||
#include <boost/fusion/adapted/std_pair.hpp>
|
||||
#include <utility>
|
||||
#if !defined(NDEBUG)
|
||||
# include <iostream>
|
||||
#endif
|
||||
#include <stdexcept>
|
||||
|
||||
namespace sp = boost::spirit;
|
||||
|
||||
// Boost.Fusion adapters for the AST structs, so the Spirit rules can fill
// them member by member. The member order listed here is the order in which
// the matching rule's sub-attributes are assigned.

// SourceInfo: text of the source followed by its kind.
BOOST_FUSION_ADAPT_STRUCT(
	duck::sl::SourceInfo,
	(std::string, value)
	(duck::sl::SourceInfo::Type, type)
)

// FromBlock: source followed by its list of items.
BOOST_FUSION_ADAPT_STRUCT(
	duck::sl::FromBlock,
	(duck::sl::SourceInfo, source)
	(std::vector<duck::sl::StructItem>, xpaths)
)

// StructBlock: name followed by its list of items.
BOOST_FUSION_ADAPT_STRUCT(
	duck::sl::StructBlock,
	(std::string, name)
	(std::vector<duck::sl::StructItem>, xpaths)
)

// ApplyBlock: mustache model name, source, then the list of items.
BOOST_FUSION_ADAPT_STRUCT(
	duck::sl::ApplyBlock,
	(std::string, mustache_model)
	(duck::sl::SourceInfo, source)
	(std::vector<duck::sl::StructItem>, xpaths)
)

// MustacheBlock: name followed by the raw block content.
BOOST_FUSION_ADAPT_STRUCT(
	duck::sl::MustacheBlock,
	(std::string, name)
	(std::string, content)
)

// XPathElement: name, optional default value, then the xpath expression.
BOOST_FUSION_ADAPT_STRUCT(
	duck::sl::XPathElement,
	(std::string, name)
	(std::optional<std::string>, def_val)
	(std::string, xpath)
)
|
||||
|
||||
namespace duck { namespace sl {
|
||||
typedef kamokan::IniCommentSkipper<std::string_view::const_iterator> skipper_type;
|
||||
|
||||
std::vector<ScrapNode> parse (std::string_view parData) {
|
||||
ScrapGrammar<std::string_view::const_iterator, skipper_type> gramm;
|
||||
auto it_start = parData.cbegin();
|
||||
|
||||
std::vector<ScrapNode> retval;
|
||||
const bool ok = qi::phrase_parse(it_start, parData.cend(), gramm, skipper_type(), retval);
|
||||
|
||||
std::cout << "parse ok: " << std::boolalpha << ok << '\n';
|
||||
std::cout << "end == it: " << std::boolalpha << (parData.cend() == it_start) << '\n';
|
||||
std::cout << "begin == it: " << std::boolalpha << (parData.cbegin() == it_start) << '\n';
|
||||
std::cout << "parse distance: " << std::distance(parData.cbegin(), it_start) << '\n';
|
||||
std::cout << "all distance: " << std::distance(parData.cbegin(), parData.cend()) << " (size: " << parData.size() << ")\n";
|
||||
|
||||
if (parData.cend() != it_start or not ok) {
|
||||
throw std::runtime_error("Error parsing input script");
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
}} //namespace duck::sl
|
29
src/scraplang/parse.hpp
Normal file
29
src/scraplang/parse.hpp
Normal file
|
@ -0,0 +1,29 @@
|
|||
/* Copyright (C) 2017 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef idBE96C2D49C4C413888A79EAEB2B9C0FA
|
||||
#define idBE96C2D49C4C413888A79EAEB2B9C0FA
|
||||
|
||||
#include "scrap_node.hpp"
|
||||
#include <string_view>
|
||||
|
||||
namespace duck { namespace sl {
|
||||
// Parses the script text in parData and returns its top-level nodes.
// The definition in parse.cpp throws std::runtime_error when the text does
// not match the grammar or is not consumed completely.
std::vector<ScrapNode> parse ( std::string_view parData );
|
||||
}} //namespace duck::sl
|
||||
|
||||
#endif
|
51
src/scraplang/parse_exports.cpp
Normal file
51
src/scraplang/parse_exports.cpp
Normal file
|
@ -0,0 +1,51 @@
|
|||
/* Copyright (C) 2017-2020 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "scraplang/parse_exports.hpp"
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
template class boost::spirit::qi::grammar<std::basic_string<char>::const_iterator, boost::spirit::qi::ascii::blank_type>;
|
||||
|
||||
template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::ApplyBlock(), boost::spirit::qi::ascii::blank_type>;
|
||||
template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::FromBlock(), boost::spirit::qi::ascii::blank_type>;
|
||||
template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::MustacheBlock(), boost::spirit::qi::ascii::blank_type>;
|
||||
template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::SourceInfo(), boost::spirit::qi::ascii::blank_type>;
|
||||
template class boost::spirit::qi::rule<std::basic_string<char>, std::string(), boost::spirit::qi::ascii::blank_type>;
|
||||
template class boost::spirit::qi::rule<std::basic_string<char>, std::vector<duck::sl::ScrapNode>(), boost::spirit::qi::ascii::blank_type>;
|
||||
template class boost::spirit::qi::rule<std::basic_string<char>, std::vector<duck::sl::StructItem>(), boost::spirit::qi::ascii::blank_type>;
|
||||
template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::StructBlock(), boost::spirit::qi::ascii::blank_type>;
|
||||
template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::XPathElement(), boost::spirit::qi::ascii::blank_type>;
|
||||
|
||||
template bool boost::spirit::qi::phrase_parse<
|
||||
std::basic_string<char>::const_iterator,
|
||||
duck::sl::ScrapGrammar<std::basic_string<char>::const_iterator, boost::spirit::qi::ascii::blank_type>,
|
||||
boost::spirit::ascii::blank_type,
|
||||
std::vector<duck::sl::ScrapNode>
|
||||
> (
|
||||
std::basic_string<char>::const_iterator&,
|
||||
std::basic_string<char>::const_iterator,
|
||||
duck::sl::ScrapGrammar<
|
||||
std::basic_string<char>::const_iterator,
|
||||
boost::spirit::qi::ascii::blank_type
|
||||
> const&,
|
||||
boost::spirit::ascii::blank_type const&,
|
||||
std::vector<duck::sl::ScrapNode>&
|
||||
);
|
||||
|
||||
template struct boost::spirit::use_operator<boost::spirit::qi::domain, boost::proto::tag::shift_right>;
|
54
src/scraplang/parse_exports.hpp
Normal file
54
src/scraplang/parse_exports.hpp
Normal file
|
@ -0,0 +1,54 @@
|
|||
/* Copyright (C) 2017-2020 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "scraplang/scrap_node.hpp"
|
||||
#include "scraplang/scrapgrammar.hpp"
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
extern template class boost::spirit::qi::grammar<std::basic_string<char>::const_iterator, boost::spirit::qi::ascii::blank_type>;
|
||||
|
||||
extern template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::ApplyBlock(), boost::spirit::qi::ascii::blank_type>;
|
||||
extern template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::FromBlock(), boost::spirit::qi::ascii::blank_type>;
|
||||
extern template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::MustacheBlock(), boost::spirit::qi::ascii::blank_type>;
|
||||
extern template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::SourceInfo(), boost::spirit::qi::ascii::blank_type>;
|
||||
extern template class boost::spirit::qi::rule<std::basic_string<char>, std::string(), boost::spirit::qi::ascii::blank_type>;
|
||||
extern template class boost::spirit::qi::rule<std::basic_string<char>, std::vector<duck::sl::ScrapNode>(), boost::spirit::qi::ascii::blank_type>;
|
||||
extern template class boost::spirit::qi::rule<std::basic_string<char>, std::vector<duck::sl::StructItem>(), boost::spirit::qi::ascii::blank_type>;
|
||||
extern template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::StructBlock(), boost::spirit::qi::ascii::blank_type>;
|
||||
extern template class boost::spirit::qi::rule<std::basic_string<char>, duck::sl::XPathElement(), boost::spirit::qi::ascii::blank_type>;
|
||||
|
||||
extern template bool boost::spirit::qi::phrase_parse<
|
||||
std::basic_string<char>::const_iterator,
|
||||
duck::sl::ScrapGrammar<std::basic_string<char>::const_iterator, boost::spirit::qi::ascii::blank_type>,
|
||||
boost::spirit::ascii::blank_type,
|
||||
std::vector<duck::sl::ScrapNode>
|
||||
> (
|
||||
std::basic_string<char>::const_iterator&,
|
||||
std::basic_string<char>::const_iterator,
|
||||
duck::sl::ScrapGrammar<
|
||||
std::basic_string<char>::const_iterator,
|
||||
boost::spirit::qi::ascii::blank_type
|
||||
> const&,
|
||||
boost::spirit::ascii::blank_type const&,
|
||||
std::vector<duck::sl::ScrapNode>&
|
||||
);
|
||||
|
||||
extern template struct boost::spirit::use_operator<boost::spirit::qi::domain, boost::proto::tag::shift_right>;
|
92
src/scraplang/scrap_node.hpp
Normal file
92
src/scraplang/scrap_node.hpp
Normal file
|
@ -0,0 +1,92 @@
|
|||
/* Copyright (C) 2015 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef id9919CCB09DDD429C8128632F13D370ED
|
||||
#define id9919CCB09DDD429C8128632F13D370ED
|
||||
|
||||
//#include "element_def.hpp"
|
||||
#include <boost/spirit/include/support_extended_variant.hpp>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <optional>
|
||||
#include <utility>
|
||||
|
||||
namespace duck { namespace sl {
|
||||
// One "name = xpath" assignment, with an optional default value.
struct XPathElement {
	std::string name;                    // identifier on the left of '='
	std::optional<std::string> def_val;  // text given in default("..."), if any
	std::string xpath;                   // expression on the right of '='
};
|
||||
|
||||
struct StructBlock;
|
||||
|
||||
struct StructItem : boost::spirit::extended_variant<
|
||||
XPathElement,
|
||||
boost::recursive_wrapper<StructBlock>
|
||||
> {
|
||||
StructItem() : base_type() {}
|
||||
StructItem (const XPathElement& value) : base_type(value) {}
|
||||
StructItem (const StructBlock& value) : base_type(value) {}
|
||||
using base_type::operator=;
|
||||
};
|
||||
|
||||
struct StructBlock {
|
||||
std::string name;
|
||||
std::vector<StructItem> xpaths;
|
||||
};
|
||||
|
||||
// Where a block reads its input from.
struct SourceInfo {
	// URL:   value is a url (or "-"), as produced by the grammar's url branch.
	// Token: value is the identifier taken from a {{...}} token.
	enum Type { URL, Token };

	std::string value;
	// Defaulted so that a default-initialised SourceInfo never carries an
	// indeterminate enum value (reading one is undefined behaviour).
	Type type = URL;
};
|
||||
|
||||
struct FromBlock {
|
||||
SourceInfo source;
|
||||
std::vector<StructItem> xpaths;
|
||||
};
|
||||
|
||||
struct ApplyBlock {
|
||||
std::string mustache_model;
|
||||
SourceInfo source;
|
||||
std::vector<StructItem> xpaths;
|
||||
};
|
||||
|
||||
// A "==<name> ... ==end" block holding raw template text.
struct MustacheBlock {
	std::string name;     // identifier following "=="
	std::string content;  // everything up to "==end", taken without skipping
};
|
||||
|
||||
// A top-level node of a parsed script: a list of further nodes, or one of
// the three block kinds.
struct ScrapNode : boost::spirit::extended_variant<
	boost::recursive_wrapper<std::vector<ScrapNode>>,
	FromBlock,
	ApplyBlock,
	MustacheBlock
> {
	ScrapNode() : base_type() {}
	ScrapNode (const std::vector<ScrapNode>& value) : base_type(value) {}
	ScrapNode (const FromBlock& value) : base_type(value) {}
	ScrapNode (const ApplyBlock& value) : base_type(value) {}
	ScrapNode (const MustacheBlock& value) : base_type(value) {}

	// Re-export the variant's assignment operators.
	using base_type::operator=;
};
|
||||
}} //namespace duck::sl
|
||||
|
||||
#endif
|
|
@ -1,75 +0,0 @@
|
|||
/* Copyright (C) 2015 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef id9919CCB09DDD429C8128632F13D370ED
|
||||
#define id9919CCB09DDD429C8128632F13D370ED
|
||||
|
||||
#include "scraplang_element.hpp"
|
||||
#include <boost/spirit/include/support_extended_variant.hpp>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
|
||||
namespace duck {
|
||||
struct ScrapNode;
|
||||
|
||||
namespace implem {
|
||||
struct map;
|
||||
struct array;
|
||||
|
||||
struct element : boost::spirit::extended_variant<
|
||||
boost::recursive_wrapper<map>,
|
||||
boost::recursive_wrapper<array>,
|
||||
std::string,
|
||||
int,
|
||||
double
|
||||
>
|
||||
{
|
||||
element ( void ) = default;
|
||||
element ( const map& parOther ) : base_type(parOther) {}
|
||||
element ( const array& parOther ) : base_type(parOther) {}
|
||||
element ( const std::string& parOther ) : base_type(parOther) {}
|
||||
element ( double parOther ) : base_type(parOther) {}
|
||||
element ( int parOther ) : base_type(parOther) {}
|
||||
};
|
||||
|
||||
struct map : std::map<std::string, element> {
|
||||
};
|
||||
|
||||
struct array : std::vector<element> {
|
||||
};
|
||||
|
||||
struct node_list {
|
||||
std::vector<ScrapNode> nodes;
|
||||
};
|
||||
} //namespace implem
|
||||
|
||||
struct ScrapNode : boost::spirit::extended_variant<
|
||||
element_def,
|
||||
implem::map,
|
||||
implem::node_list
|
||||
>
|
||||
{
|
||||
ScrapNode ( void ) = default;
|
||||
ScrapNode ( const element_def& parOther ) : base_type(parOther) {}
|
||||
ScrapNode ( const implem::map& parOther ) : base_type(parOther) {}
|
||||
ScrapNode ( const implem::node_list& parOther ) : base_type(parOther) {}
|
||||
};
|
||||
} //namespace duck
|
||||
|
||||
#endif
|
108
src/scraplang/scrapgrammar.hpp
Normal file
108
src/scraplang/scrapgrammar.hpp
Normal file
|
@ -0,0 +1,108 @@
|
|||
/* Copyright (C) 2017-2020 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <boost/spirit/include/qi.hpp>
|
||||
#include <boost/spirit/include/phoenix_operator.hpp>
|
||||
|
||||
namespace kamokan {
|
||||
template <typename Iterator>
|
||||
struct IniCommentSkipper : boost::spirit::qi::grammar<Iterator> {
|
||||
IniCommentSkipper() :
|
||||
IniCommentSkipper::base_type(skipping),
|
||||
first_char(true)
|
||||
{
|
||||
namespace px = boost::phoenix;
|
||||
using boost::spirit::qi::blank;
|
||||
using boost::spirit::qi::lit;
|
||||
using boost::spirit::qi::eol;
|
||||
using boost::spirit::qi::char_;
|
||||
using boost::spirit::qi::eps;
|
||||
|
||||
skipping = comment | blank;
|
||||
comment = (eps(px::cref(first_char) == true) | eol) >>
|
||||
*blank >> lit("#")[px::ref(first_char) = false] >>
|
||||
*(!eol >> char_);
|
||||
}
|
||||
|
||||
boost::spirit::qi::rule<Iterator> skipping;
|
||||
boost::spirit::qi::rule<Iterator> comment;
|
||||
bool first_char;
|
||||
};
|
||||
} //namespace kamokan
|
||||
|
||||
namespace duck::sl {
|
||||
namespace qi = ::boost::spirit::qi;
|
||||
|
||||
template <typename I, typename Skipper>
|
||||
class ScrapGrammar : public qi::grammar<I, std::vector<ScrapNode>(), Skipper> {
|
||||
public:
|
||||
ScrapGrammar() : ScrapGrammar::base_type(start) {
|
||||
using qi::char_;
|
||||
using qi::lexeme;
|
||||
using qi::alpha;
|
||||
using qi::alnum;
|
||||
using qi::graph;
|
||||
using qi::attr;
|
||||
using qi::eol;
|
||||
using qi::eoi;
|
||||
using qi::lit;
|
||||
using qi::string;
|
||||
using qi::as_string;
|
||||
using qi::no_skip;
|
||||
|
||||
start = *eol >> (from_block | apply_block | mustache_block) % +eol >> *eol >> eoi;
|
||||
from_block = lit("from") >> source_info >> +eol >> assignment_list >> +eol >> "end";
|
||||
source_info = ((url | string("-")) >> attr(SourceInfo::URL)) | (mustache_like_token >> attr(SourceInfo::Token));
|
||||
url = -(+alpha >> string("://")) >> alpha >> *graph;
|
||||
mustache_like_token = "{{" >> identifier >> "}}";
|
||||
quoted_string %= lexeme['"' >> *(char_ - '"') >> '"'];
|
||||
xpath_assignment %= identifier >>
|
||||
-(lit("default") >> '(' >> quoted_string >> ')') >> "=" >>
|
||||
as_string[lexeme[+(graph | char_(" \t"))]];
|
||||
identifier %= lexeme[(alpha | char_('_')) >> *(-char_('.') >> +(alnum | char_('_')))];
|
||||
|
||||
apply_block = lit("apply") >> mustache_like_token >> "to" >> source_info >> +eol >>
|
||||
assignment_list >> +eol >> "end";
|
||||
struct_block = "struct" >> identifier >> +eol >> assignment_list >> +eol >> "end";
|
||||
|
||||
mustache_block %= as_string[lit("==") >> identifier] >> eol >>
|
||||
as_string[no_skip[+(!lit("==end") >> char_)]] >> "==end";
|
||||
|
||||
assignment_list = (xpath_assignment | struct_block) % +eol;
|
||||
}
|
||||
|
||||
private:
|
||||
template <typename F>
|
||||
using RuleType = qi::rule<I, F, Skipper>;
|
||||
|
||||
RuleType<std::vector<ScrapNode>()> start;
|
||||
RuleType<FromBlock()> from_block;
|
||||
RuleType<std::string()> url;
|
||||
RuleType<std::string()> mustache_like_token;
|
||||
RuleType<std::string()> quoted_string;
|
||||
RuleType<XPathElement()> xpath_assignment;
|
||||
RuleType<std::string()> identifier;
|
||||
RuleType<SourceInfo()> source_info;
|
||||
RuleType<ApplyBlock()> apply_block;
|
||||
RuleType<StructBlock()> struct_block;
|
||||
RuleType<MustacheBlock()> mustache_block;
|
||||
RuleType<std::vector<StructItem>()> assignment_list;
|
||||
};
|
||||
} //namespace duck::sl
|
|
@ -1,119 +0,0 @@
|
|||
/* Copyright (C) 2015 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "scraplang.hpp"
|
||||
#include "scrapast.hpp"
|
||||
#include "scraplang_visit_xpath.hpp"
|
||||
#include <boost/spirit/include/qi.hpp>
|
||||
#include <boost/spirit/include/phoenix_stl.hpp>
|
||||
#include <boost/spirit/include/phoenix_fusion.hpp>
|
||||
#include <boost/fusion/adapted/struct.hpp>
|
||||
#include <boost/fusion/adapted/std_pair.hpp>
|
||||
#include <utility>
|
||||
|
||||
#include <boost/variant/apply_visitor.hpp>
|
||||
|
||||
namespace qi = boost::spirit::qi;
|
||||
namespace sp = boost::spirit;
|
||||
|
||||
BOOST_FUSION_ADAPT_STRUCT(
|
||||
duck::element_def,
|
||||
(std::string, name)
|
||||
(std::string, xpath)
|
||||
(duck::ElementTypes, type)
|
||||
)
|
||||
BOOST_FUSION_ADAPT_STRUCT(
|
||||
duck::implem::node_list,
|
||||
(std::vector<duck::ScrapNode>, nodes)
|
||||
)
|
||||
|
||||
namespace duck {
|
||||
namespace {
|
||||
struct ElementTypeSymbol : qi::symbols<char, ElementTypes> {
|
||||
ElementTypeSymbol() {
|
||||
add
|
||||
("string", ElementType_String)
|
||||
("integer", ElementType_Integer)
|
||||
("boolean", ElementType_Boolean)
|
||||
("null", ElementType_Null)
|
||||
("double", ElementType_Double)
|
||||
;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename I>
|
||||
struct ScrapGrammar : qi::grammar<I, ScrapNode(), sp::ascii::space_type> {
|
||||
ScrapGrammar() : ScrapGrammar::base_type(start) {
|
||||
using qi::lit;
|
||||
using qi::char_;
|
||||
using qi::lexeme;
|
||||
using qi::double_;
|
||||
using qi::int_;
|
||||
using qi::eps;
|
||||
|
||||
start = whole;
|
||||
whole = eps >> *xpath_definition >> -map;
|
||||
xpath_definition = identifier >> lit('=') >> string >> "as" >> data_type;
|
||||
identifier = (char_('a', 'z') | char_('A', 'Z') | '_') >> *(char_('a', 'z') | char_('A', 'Z') | '_' | char_('0', '9'));
|
||||
string %= lexeme['"' >> +(char_ - '"') >> '"'];
|
||||
map = lit('{') >> ((identifier >> lit('=') >> value) % lit(',')) >> lit('}');
|
||||
array = lit('[') >> *(value % lit(',')) >> lit(']');
|
||||
value = string | double_ | int_ | array | map | identifier;
|
||||
}
|
||||
|
||||
qi::rule<I, ScrapNode(), sp::ascii::space_type> start;
|
||||
qi::rule<I, implem::node_list(), sp::ascii::space_type> whole;
|
||||
qi::rule<I, element_def(), sp::ascii::space_type> xpath_definition;
|
||||
qi::rule<I, std::string(), sp::ascii::space_type> identifier;
|
||||
qi::rule<I, std::string(), sp::ascii::space_type> string;
|
||||
qi::rule<I, implem::map(), sp::ascii::space_type> map;
|
||||
qi::rule<I, implem::array(), sp::ascii::space_type> array;
|
||||
qi::rule<I, implem::element(), sp::ascii::space_type> value;
|
||||
ElementTypeSymbol data_type;
|
||||
};
|
||||
} //unnamed namespace
|
||||
|
||||
ScrapNodePtr parse_scraplang (const std::string& parData) {
|
||||
ScrapGrammar<std::string::const_iterator> gramm;
|
||||
ScrapNodePtr retval(new ScrapNode);
|
||||
auto it_start = parData.cbegin();
|
||||
|
||||
qi::phrase_parse(it_start, parData.cend(), gramm, sp::ascii::space, *retval);
|
||||
return std::move(retval);
|
||||
}
|
||||
|
||||
std::vector<element_def> get_xpath_definitions (const ScrapNode& parAST) {
|
||||
std::vector<element_def> retval;
|
||||
implem::XPathVisitor xpath_vis(&retval);
|
||||
boost::apply_visitor(xpath_vis, parAST);
|
||||
return std::move(retval);
|
||||
}
|
||||
|
||||
ScrapNodePtr::ScrapNodePtr (ScrapNode* parPtr) :
|
||||
m_ptr(parPtr)
|
||||
{
|
||||
}
|
||||
|
||||
ScrapNodePtr::ScrapNodePtr (ScrapNodePtr&& parOther) :
|
||||
m_ptr(std::move(parOther.m_ptr))
|
||||
{
|
||||
}
|
||||
|
||||
ScrapNodePtr::~ScrapNodePtr() noexcept {
|
||||
}
|
||||
} //namespace duck
|
0
src/scraplang/scraplang_print_results.cpp
Normal file
0
src/scraplang/scraplang_print_results.cpp
Normal file
29
src/scraplang/scraplang_print_results.hpp
Normal file
29
src/scraplang/scraplang_print_results.hpp
Normal file
|
@ -0,0 +1,29 @@
|
|||
#ifndef idB20734D678524FAA8AC94F2AB2FDAA94
|
||||
#define idB20734D678524FAA8AC94F2AB2FDAA94
|
||||
|
||||
#include "scrapast.hpp"
|
||||
#include <vector>
|
||||
|
||||
namespace duck {
|
||||
typedef std::vector<std::vector<std::pair<std::string, std::string>>> ResulList;
|
||||
struct element_def;
|
||||
|
||||
namespace implem {
|
||||
class ResultPrinter {
|
||||
public:
|
||||
typedef void result_type;
|
||||
|
||||
explicit ResultPrinter ( const std::vector<element_def>* parQueries, const ResultList* parResults );
|
||||
|
||||
void operator() ( const element_def& parElem );
|
||||
void operator() ( const implem::map& parMap );
|
||||
void operator() ( const node_list& parNodes );
|
||||
|
||||
private:
|
||||
const std::vector<element_def>* const m_queries;
|
||||
const ResulList* const m_results;
|
||||
};
|
||||
} //namespace implem
|
||||
} //namespace duck
|
||||
|
||||
#endif
|
67
src/scraplang/stream_scrap_node.hpp
Normal file
67
src/scraplang/stream_scrap_node.hpp
Normal file
|
@ -0,0 +1,67 @@
|
|||
/* Copyright (C) 2015 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef idDB3415BA82504C00A2DAF0274BA9AC92
|
||||
#define idDB3415BA82504C00A2DAF0274BA9AC92
|
||||
|
||||
#include "scrap_node.hpp"
|
||||
#include <iostream>
|
||||
|
||||
namespace duck { namespace sl {
|
||||
std::ostream& operator<< (std::ostream& stream, XPathElement xpath) {
|
||||
stream << "XPathElement \"" << xpath.name << "\": \"" <<
|
||||
xpath.xpath << "\" ";
|
||||
|
||||
if (xpath.def_val)
|
||||
stream << "default: \"" << *xpath.def_val << '"';
|
||||
else
|
||||
stream << "no default";
|
||||
return stream;
|
||||
}
|
||||
|
||||
std::ostream& operator<< (std::ostream& parStream, const duck::sl::SourceInfo& parInfo) {
|
||||
if (duck::sl::SourceInfo::URL == parInfo.type)
|
||||
parStream << '"' << parInfo.value << '"';
|
||||
else if (duck::sl::SourceInfo::Token == parInfo.type)
|
||||
parStream << '$' << parInfo.value;
|
||||
else
|
||||
parStream << "Invalid SourceInfo type";
|
||||
return parStream;
|
||||
}
|
||||
|
||||
std::ostream& operator<< (std::ostream& stream, const FromBlock& blk) {
|
||||
stream << "FromBlock: " << blk.source << ", " <<
|
||||
blk.xpaths.size() << " xpath entries";
|
||||
return stream;
|
||||
}
|
||||
|
||||
std::ostream& operator<< (std::ostream& stream, const StructBlock& strct) {
|
||||
stream << "StructBlock \"" << strct.name << "\" with " <<
|
||||
strct.xpaths.size() << " xpath entries";
|
||||
return stream;
|
||||
}
|
||||
|
||||
std::ostream& operator<< (std::ostream& stream, const ApplyBlock& app) {
|
||||
stream << "ApplyBlock for \"" << app.mustache_model << "\": " <<
|
||||
app.source << ", " <<
|
||||
app.xpaths.size() << " elements";
|
||||
return stream;
|
||||
}
|
||||
}} //namespace duck::sl
|
||||
|
||||
#endif
|
89
src/scraplang/xpath_runner.cpp
Normal file
89
src/scraplang/xpath_runner.cpp
Normal file
|
@ -0,0 +1,89 @@
|
|||
/* Copyright (C) 2015 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "xpath_runner.hpp"
|
||||
#include "xpath.hpp"
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
|
||||
//#define HTML_ALWAYS_STDIN
|
||||
|
||||
#if !defined(NDEBUG) && defined(HTML_ALWAYS_STDIN)
|
||||
# define HTML_ALWAYS_STDIN_ENABLED
|
||||
#endif
|
||||
|
||||
namespace duck { namespace sl {
|
||||
struct XPathRunner::XPathKey {
|
||||
XPathKey (const std::string_view& parSrc, const std::string_view& parQuery) :
|
||||
source_address(std::string(parSrc)),
|
||||
xpath_query(std::string(parQuery))
|
||||
{
|
||||
assert(not source_address.empty());
|
||||
}
|
||||
|
||||
std::string source_address;
|
||||
std::string xpath_query;
|
||||
|
||||
bool operator< (const XPathKey& parOther) const {
|
||||
return (
|
||||
xpath_query == parOther.xpath_query and
|
||||
source_address < parOther.source_address
|
||||
) or (xpath_query < parOther.xpath_query);
|
||||
}
|
||||
};
|
||||
|
||||
XPathRunner::XPathRunner (HtmlPoolBaseSP html_pool, XPathPtr& parXPath, std::string&& parDefNamespace) :
|
||||
m_cached_results(),
|
||||
m_def_namespace(std::move(parDefNamespace)),
|
||||
m_pool(html_pool),
|
||||
m_xpath(parXPath)
|
||||
{
|
||||
}
|
||||
|
||||
XPathRunner::~XPathRunner() = default;
|
||||
|
||||
const std::vector<std::string>& XPathRunner::query (
|
||||
std::string_view parSrc,
|
||||
std::string_view parQuery
|
||||
) {
|
||||
std::cout << "XPathRunner::query(\"" << parSrc << "\", \"" << parQuery << "\")\"\n";
|
||||
auto ins_retval = m_cached_results.insert(std::make_pair(XPathKey(parSrc, parQuery), std::vector<std::string>()));
|
||||
const bool inserted = ins_retval.second;
|
||||
assert(ins_retval.first != m_cached_results.end());
|
||||
std::vector<std::string>& curr_vec = ins_retval.first->second;
|
||||
|
||||
if (inserted) {
|
||||
#if defined(HTML_ALWAYS_STDIN_ENABLED)
|
||||
const auto id = m_pool->GetOrAdd("-");
|
||||
#else
|
||||
const auto id = m_pool->GetOrAdd(parSrc);
|
||||
#endif
|
||||
const std::string* html = m_pool->GetByID(id);
|
||||
|
||||
curr_vec = m_xpath->run_query(*html, std::string(parQuery), m_def_namespace);
|
||||
std::cout << "First time for this query, result cached now\n";
|
||||
}
|
||||
|
||||
std::cout << "returning " << curr_vec.size() << " items: ";
|
||||
for (auto& i : curr_vec) {
|
||||
std::cout << '"' << i << "\", ";
|
||||
}
|
||||
std::cout << '\n';
|
||||
return curr_vec;
|
||||
}
|
||||
}} //namespace duck::sl
|
49
src/scraplang/xpath_runner.hpp
Normal file
49
src/scraplang/xpath_runner.hpp
Normal file
|
@ -0,0 +1,49 @@
|
|||
/* Copyright (C) 2015 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef id46DB8F4F85E2417E9AF0B1A410240D4F
|
||||
#define id46DB8F4F85E2417E9AF0B1A410240D4F
|
||||
|
||||
#include "html_pool_base.hpp"
|
||||
#include "xpath_fwd.hpp"
|
||||
#include <map>
|
||||
#include <string_view>
|
||||
#include <string>
|
||||
|
||||
namespace duck { namespace sl {
|
||||
class XPathRunner {
|
||||
public:
|
||||
XPathRunner (HtmlPoolBaseSP html_pool, XPathPtr& parXPath, std::string&& parDefNamespace);
|
||||
~XPathRunner();
|
||||
|
||||
const std::vector<std::string>& query (
|
||||
std::string_view parSrc,
|
||||
std::string_view parQuery
|
||||
);
|
||||
|
||||
private:
|
||||
struct XPathKey;
|
||||
|
||||
std::map<XPathKey, std::vector<std::string>> m_cached_results;
|
||||
std::string m_def_namespace;
|
||||
HtmlPoolBaseSP m_pool;
|
||||
XPathPtr m_xpath;
|
||||
};
|
||||
}} //namespace duck::sl
|
||||
|
||||
#endif
|
|
@ -17,7 +17,11 @@
|
|||
*/
|
||||
|
||||
#include "xpath.hpp"
|
||||
#include <pugixml.hpp>
|
||||
#include <xercesc/framework/MemBufInputSource.hpp>
|
||||
#include <xercesc/util/XMLString.hpp>
|
||||
#include <xqilla/exceptions/XQException.hpp>
|
||||
#include <xqilla/exceptions/XMLParseException.hpp>
|
||||
#include <xqilla/context/ContextHelpers.hpp>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <algorithm>
|
||||
|
@ -45,35 +49,64 @@ namespace duck {
|
|||
}
|
||||
} //unnamed namespace
|
||||
|
||||
XPathBatchResults xpath_query (const std::string& parXML, const std::vector<std::string>& parQueries) {
|
||||
pugi::xml_document doc;
|
||||
std::istringstream iss(parXML);
|
||||
pugi::xml_parse_result result(doc.load(iss));
|
||||
if (not result) {
|
||||
auto line_col = line_col_from_offset(result.offset, parXML);
|
||||
throw ParseError(line_col.first, line_col.second, result.description());
|
||||
XPath::XPath() = default;
|
||||
|
||||
XPath::~XPath() = default;
|
||||
|
||||
auto XPath::run_query (const std::string& parXML, const std::vector<std::string>& parQueries, const std::string& parDefNamespace) -> BatchResults {
|
||||
XQilla& xqilla = m_xqilla;
|
||||
XercesConfiguration xconfig;
|
||||
xercesc::MemBufInputSource input_buf(reinterpret_cast<const XMLByte*>(parXML.c_str()), parXML.size(), "n/a", false);
|
||||
BatchResults retval;
|
||||
try {
|
||||
AutoDelete<DynamicContext> context(xqilla.createContext(XQilla::XQUERY3, &xconfig));
|
||||
xconfig.populateStaticContext(context);
|
||||
Node::Ptr ptr = context->parseDocument(input_buf);
|
||||
context->setContextItem(ptr);
|
||||
//see http://xqilla.sourceforge.net/docs/simple-api/classStaticContext.html#adc869a84712459fa49db67fe837c9b01
|
||||
AutoDeleteArray<XMLCh> ns_wide = xercesc::XMLString::transcode(parDefNamespace.c_str());
|
||||
context->setDefaultElementAndTypeNS(ns_wide);
|
||||
|
||||
for (const auto& xpath : parQueries) {
|
||||
AutoDelete<XQQuery> query(nullptr);
|
||||
{
|
||||
AutoContextInfoReset resetter(context);
|
||||
AutoDeleteArray<XMLCh> xpath_wide = xercesc::XMLString::transcode(xpath.c_str());
|
||||
query.set(xqilla.parse(xpath_wide, context, nullptr, XQilla::NO_ADOPT_CONTEXT));
|
||||
}
|
||||
|
||||
XPathBatchResults retval;
|
||||
for (const auto& xpath : parQueries) {
|
||||
pugi::xpath_node_set xpathRes = doc.select_nodes(xpath.c_str());
|
||||
Result result = query->execute(context);
|
||||
Item::Ptr item;
|
||||
std::vector<std::pair<std::string, std::string>> new_lst;
|
||||
for (pugi::xpath_node_set::const_iterator itFind(xpathRes.begin()), itFindEND(xpathRes.end()); itFind != itFindEND; ++itFind) {
|
||||
const pugi::xpath_node& node = *itFind;
|
||||
std::pair<std::string, std::string> new_itm;
|
||||
if (node.node()) {
|
||||
new_itm.first = std::string(node.node().name());
|
||||
new_itm.second = std::string(node.node().value());
|
||||
}
|
||||
else if (node.attribute()) {
|
||||
new_itm.first = std::string(node.attribute().name());
|
||||
new_itm.second = std::string(node.attribute().value());
|
||||
}
|
||||
new_lst.push_back(std::move(new_itm));
|
||||
while(nullptr != (item = result->next(context))) {
|
||||
new_lst.push_back(std::make_pair(std::string(), UTF8(item->asString(context))));
|
||||
}
|
||||
|
||||
retval.push_back(std::move(new_lst));
|
||||
}
|
||||
return std::move(retval);
|
||||
}
|
||||
catch (const XMLParseException& err) {
|
||||
throw ParseError(err.getXQueryLine(), err.getXQueryColumn(), xercesc::XMLString::transcode(err.getError()));
|
||||
}
|
||||
catch (const XQException& err) {
|
||||
throw std::runtime_error(xercesc::XMLString::transcode(err.getError()));
|
||||
}
|
||||
|
||||
return retval;
|
||||
}
|
||||
|
||||
std::vector<std::string> XPath::run_query (const std::string& parXML, const std::string& parQuery, const std::string& parDefNamespace) {
|
||||
auto query_res = run_query(parXML, std::vector<std::string>{parQuery}, parDefNamespace);
|
||||
if (query_res.empty() or query_res.front().empty()) {
|
||||
return std::vector<std::string>();
|
||||
}
|
||||
else {
|
||||
std::vector<std::string> retval;
|
||||
const std::vector<std::pair<std::string, std::string>>& src = query_res.front();
|
||||
retval.reserve(src.size());
|
||||
std::transform(src.begin(), src.end(), std::back_inserter(retval), [](const auto& pair) { return pair.second; });
|
||||
return retval;
|
||||
}
|
||||
}
|
||||
|
||||
ParseError::ParseError (int parLine, int parColumn, std::string parMessage) {
|
||||
|
|
|
@ -19,14 +19,14 @@
|
|||
#ifndef id21E0A6F345D24C5D83D3B1F74EC810F7
|
||||
#define id21E0A6F345D24C5D83D3B1F74EC810F7
|
||||
|
||||
#include "xpath_fwd.hpp"
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <exception>
|
||||
#include <utility>
|
||||
#include <xqilla/xqilla-simple.hpp>
|
||||
|
||||
namespace duck {
|
||||
typedef std::vector<std::vector<std::pair<std::string, std::string>>> XPathBatchResults;
|
||||
|
||||
class ParseError : public std::exception {
|
||||
public:
|
||||
ParseError ( int parLine, int parColumn, std::string parMessage );
|
||||
|
@ -35,7 +35,19 @@ namespace duck {
|
|||
std::vector<char> m_msg;
|
||||
};
|
||||
|
||||
XPathBatchResults xpath_query ( const std::string& parXML, const std::vector<std::string>& parQueries );
|
||||
class XPath : public Kakoune::SafeCountable {
|
||||
public:
|
||||
typedef std::vector<std::vector<std::pair<std::string, std::string>>> BatchResults;
|
||||
|
||||
XPath();
|
||||
~XPath();
|
||||
|
||||
BatchResults run_query ( const std::string& parXML, const std::vector<std::string>& parQueries, const std::string& parDefNamespace );
|
||||
std::vector<std::string> run_query ( const std::string& parXML, const std::string& parQuery, const std::string& parDefNamespace );
|
||||
|
||||
private:
|
||||
XQilla m_xqilla;
|
||||
};
|
||||
} //namespace duck
|
||||
|
||||
#endif
|
||||
|
|
29
src/xpath_fwd.hpp
Normal file
29
src/xpath_fwd.hpp
Normal file
|
@ -0,0 +1,29 @@
|
|||
/* Copyright (C) 2015-2020 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef id08062CD6C4904D94BFF57990C44B6CCB
|
||||
#define id08062CD6C4904D94BFF57990C44B6CCB
|
||||
|
||||
#include "kakoune/safe_ptr.hh"
|
||||
|
||||
namespace duck {
|
||||
class XPath;
|
||||
using XPathPtr = Kakoune::SafePtr<XPath>;
|
||||
} //namespace duck
|
||||
|
||||
#endif
|
Loading…
Add table
Reference in a new issue