Working example.

Invoke it with, e.g.:
./scraper http://www.dilbert.com '//div[@class='\''STR_Image'\'']/a/img/@src'
This commit is contained in:
King_DuckZ 2014-06-07 20:44:43 +02:00
parent aa015ddd6a
commit cb00e484fa
4 changed files with 50 additions and 10 deletions

3
.gitmodules vendored Normal file
View file

@ -0,0 +1,3 @@
[submodule "lib/curlcpp"]
path = lib/curlcpp
url = https://github.com/JosephP91/curlcpp.git

View file

@ -3,15 +3,17 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
project(scraper CXX)
add_subdirectory(lib/tidy)
add_subdirectory(lib/curlcpp)
find_package(PugiXML REQUIRED)
#find_package(CURL REQUIRED)
#${CURL_INCLUDE_DIR}
#${CURL_LIBRARIES}
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -std=c++11 -Wall -Wextra -g -O0 -fno-omit-frame-pointer")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -std=c++11 -Wall -Wextra -g -O3 -fomit-frame-pointer")
include_directories(SYSTEM
lib/tidy/include
${PUGIXML_INCLUDE_DIR}
${CURLCPP_SOURCE_DIR}/include
)
add_executable(${PROJECT_NAME}
@ -21,4 +23,5 @@ add_executable(${PROJECT_NAME}
target_link_libraries(${PROJECT_NAME}
tidy
${PUGIXML_LIBRARIES}
curlcpp
)

1
lib/curlcpp Submodule

@ -0,0 +1 @@
Subproject commit 05bad3db527ee3c76730d42104ddaaa8a6376a3f

View file

@ -5,6 +5,9 @@
#include <string>
#include <pugixml.hpp>
#include <sstream>
#include <curl_easy.h>
#include <cstring>
#include <stack>
namespace {
const char g_testData[] = {
@ -113,7 +116,8 @@ namespace {
std::string cleanHTML(const std::string &html){
// Initialize a Tidy document
TidyDoc tidyDoc = tidyCreate();
TidyBuffer tidyOutputBuffer = {0};
TidyBuffer tidyOutputBuffer;
std::memset(&tidyOutputBuffer, 0, sizeof(TidyBuffer));
// Configure Tidy
// The flags tell Tidy to output XML and disable showing warnings
@ -150,9 +154,34 @@ namespace {
} //unnamed namespace
int main (int argc, char* argv[]) {
const std::string tidyHtml(cleanHTML(g_testData));
std::cout << tidyHtml << std::endl;
std::cout << "XPath: " << argv[1] << std::endl;
std::string tidyHtml;
if (argc != 3) {
std::cerr << "Usage: scraper <URL> <XPath>" << std::endl;
return 2;
}
const char* const& url = argv[1];
const char* const& xpath = argv[2];
std::cout << "URL : " << url << "\n";
std::cout << "XPath: " << xpath << std::endl;
{
std::ostringstream oss;
curl::curl_easy easy(oss);
easy.add(curl::curl_pair<CURLoption, std::string>(CURLOPT_URL, url));
easy.add(curl::curl_pair<CURLoption, std::string>(CURLOPT_USERAGENT, "duckscraper"));
easy.add(curl::curl_pair<CURLoption, long>(CURLOPT_FOLLOWLOCATION, 1L));
try {
easy.perform();
}
catch (curl_error& err) {
std::stack<std::pair<std::string, std::string> > errors = err.what();
err.print_traceback();
return 1;
}
tidyHtml = cleanHTML(oss.str());
}
{
pugi::xml_document doc;
@ -163,11 +192,15 @@ int main (int argc, char* argv[]) {
return 1;
}
pugi::xpath_node_set xpathRes = doc.select_nodes(argv[1]);
pugi::xpath_node_set xpathRes = doc.select_nodes(xpath);
for (pugi::xpath_node_set::const_iterator itFind(xpathRes.begin()), itFindEND(xpathRes.end()); itFind != itFindEND; ++itFind) {
const pugi::xpath_node& node = *itFind;
std::cout << node.node().name() << ": ";
std::cout << node.node().attribute("name").value() << "\n";
if (node.node()) {
std::cout << node.node().name() << ": " << node.node().value() << "\n";
}
else if (node.attribute()) {
std::cout << node.attribute().name() << ": " << node.attribute().value() << "\n";
}
}
}
return 0;