Working example.
Invoke it with, e.g.: ./scraper http://www.dilbert.com '//div[@class='\''STR_Image'\'']/a/img/@src'
This commit is contained in:
parent
aa015ddd6a
commit
cb00e484fa
4 changed files with 50 additions and 10 deletions
3
.gitmodules
vendored
Normal file
3
.gitmodules
vendored
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
[submodule "lib/curlcpp"]
|
||||||
|
path = lib/curlcpp
|
||||||
|
url = https://github.com/JosephP91/curlcpp.git
|
|
@ -3,15 +3,17 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
|
||||||
project(scraper CXX)
|
project(scraper CXX)
|
||||||
|
|
||||||
add_subdirectory(lib/tidy)
|
add_subdirectory(lib/tidy)
|
||||||
|
add_subdirectory(lib/curlcpp)
|
||||||
|
|
||||||
find_package(PugiXML REQUIRED)
|
find_package(PugiXML REQUIRED)
|
||||||
#find_package(CURL REQUIRED)
|
|
||||||
#${CURL_INCLUDE_DIR}
|
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -std=c++11 -Wall -Wextra -g -O0 -fno-omit-frame-pointer")
|
||||||
#${CURL_LIBRARIES}
|
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -std=c++11 -Wall -Wextra -g -O3 -fomit-frame-pointer")
|
||||||
|
|
||||||
include_directories(SYSTEM
|
include_directories(SYSTEM
|
||||||
lib/tidy/include
|
lib/tidy/include
|
||||||
${PUGIXML_INCLUDE_DIR}
|
${PUGIXML_INCLUDE_DIR}
|
||||||
|
${CURLCPP_SOURCE_DIR}/include
|
||||||
)
|
)
|
||||||
|
|
||||||
add_executable(${PROJECT_NAME}
|
add_executable(${PROJECT_NAME}
|
||||||
|
@ -21,4 +23,5 @@ add_executable(${PROJECT_NAME}
|
||||||
target_link_libraries(${PROJECT_NAME}
|
target_link_libraries(${PROJECT_NAME}
|
||||||
tidy
|
tidy
|
||||||
${PUGIXML_LIBRARIES}
|
${PUGIXML_LIBRARIES}
|
||||||
|
curlcpp
|
||||||
)
|
)
|
||||||
|
|
1
lib/curlcpp
Submodule
1
lib/curlcpp
Submodule
|
@ -0,0 +1 @@
|
||||||
|
Subproject commit 05bad3db527ee3c76730d42104ddaaa8a6376a3f
|
47
src/main.cpp
47
src/main.cpp
|
@ -5,6 +5,9 @@
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <pugixml.hpp>
|
#include <pugixml.hpp>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
|
#include <curl_easy.h>
|
||||||
|
#include <cstring>
|
||||||
|
#include <stack>
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
const char g_testData[] = {
|
const char g_testData[] = {
|
||||||
|
@ -113,7 +116,8 @@ namespace {
|
||||||
std::string cleanHTML(const std::string &html){
|
std::string cleanHTML(const std::string &html){
|
||||||
// Initialize a Tidy document
|
// Initialize a Tidy document
|
||||||
TidyDoc tidyDoc = tidyCreate();
|
TidyDoc tidyDoc = tidyCreate();
|
||||||
TidyBuffer tidyOutputBuffer = {0};
|
TidyBuffer tidyOutputBuffer;
|
||||||
|
std::memset(&tidyOutputBuffer, 0, sizeof(TidyBuffer));
|
||||||
|
|
||||||
// Configure Tidy
|
// Configure Tidy
|
||||||
// The flags tell Tidy to output XML and disable showing warnings
|
// The flags tell Tidy to output XML and disable showing warnings
|
||||||
|
@ -150,9 +154,34 @@ namespace {
|
||||||
} //unnamed namespace
|
} //unnamed namespace
|
||||||
|
|
||||||
int main (int argc, char* argv[]) {
|
int main (int argc, char* argv[]) {
|
||||||
const std::string tidyHtml(cleanHTML(g_testData));
|
std::string tidyHtml;
|
||||||
std::cout << tidyHtml << std::endl;
|
if (argc != 3) {
|
||||||
std::cout << "XPath: " << argv[1] << std::endl;
|
std::cerr << "Usage: scraper <URL> <XPath>" << std::endl;
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
const char* const& url = argv[1];
|
||||||
|
const char* const& xpath = argv[2];
|
||||||
|
|
||||||
|
std::cout << "URL : " << url << "\n";
|
||||||
|
std::cout << "XPath: " << xpath << std::endl;
|
||||||
|
|
||||||
|
{
|
||||||
|
std::ostringstream oss;
|
||||||
|
curl::curl_easy easy(oss);
|
||||||
|
easy.add(curl::curl_pair<CURLoption, std::string>(CURLOPT_URL, url));
|
||||||
|
easy.add(curl::curl_pair<CURLoption, std::string>(CURLOPT_USERAGENT, "duckscraper"));
|
||||||
|
easy.add(curl::curl_pair<CURLoption, long>(CURLOPT_FOLLOWLOCATION, 1L));
|
||||||
|
try {
|
||||||
|
easy.perform();
|
||||||
|
}
|
||||||
|
catch (curl_error& err) {
|
||||||
|
std::stack<std::pair<std::string, std::string> > errors = err.what();
|
||||||
|
err.print_traceback();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
tidyHtml = cleanHTML(oss.str());
|
||||||
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
pugi::xml_document doc;
|
pugi::xml_document doc;
|
||||||
|
@ -163,11 +192,15 @@ int main (int argc, char* argv[]) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
pugi::xpath_node_set xpathRes = doc.select_nodes(argv[1]);
|
pugi::xpath_node_set xpathRes = doc.select_nodes(xpath);
|
||||||
for (pugi::xpath_node_set::const_iterator itFind(xpathRes.begin()), itFindEND(xpathRes.end()); itFind != itFindEND; ++itFind) {
|
for (pugi::xpath_node_set::const_iterator itFind(xpathRes.begin()), itFindEND(xpathRes.end()); itFind != itFindEND; ++itFind) {
|
||||||
const pugi::xpath_node& node = *itFind;
|
const pugi::xpath_node& node = *itFind;
|
||||||
std::cout << node.node().name() << ": ";
|
if (node.node()) {
|
||||||
std::cout << node.node().attribute("name").value() << "\n";
|
std::cout << node.node().name() << ": " << node.node().value() << "\n";
|
||||||
|
}
|
||||||
|
else if (node.attribute()) {
|
||||||
|
std::cout << node.attribute().name() << ": " << node.attribute().value() << "\n";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
|
|
Loading…
Reference in a new issue