From cb00e484fa39cc0c803d123eaa90b082cf4baa64 Mon Sep 17 00:00:00 2001 From: King_DuckZ Date: Sat, 7 Jun 2014 20:44:43 +0200 Subject: [PATCH] Working example. Invoke it with e.g.: ./scraper http://www.dilbert.com '//div[@class='\''STR_Image'\'']/a/img/@src' --- .gitmodules | 3 +++ CMakeLists.txt | 9 ++++++--- lib/curlcpp | 1 + src/main.cpp | 47 ++++++++++++++++++++++++++++++++++++++++------- 4 files changed, 50 insertions(+), 10 deletions(-) create mode 100644 .gitmodules create mode 160000 lib/curlcpp diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..b14b78f --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "lib/curlcpp"] + path = lib/curlcpp + url = https://github.com/JosephP91/curlcpp.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 5c48e2f..46e33a2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,15 +3,17 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/") project(scraper CXX) add_subdirectory(lib/tidy) +add_subdirectory(lib/curlcpp) find_package(PugiXML REQUIRED) -#find_package(CURL REQUIRED) -#${CURL_INCLUDE_DIR} -#${CURL_LIBRARIES} + +set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -std=c++11 -Wall -Wextra -g -O0 -fno-omit-frame-pointer") +set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -std=c++11 -Wall -Wextra -g -O3 -fomit-frame-pointer") include_directories(SYSTEM lib/tidy/include ${PUGIXML_INCLUDE_DIR} + ${CURLCPP_SOURCE_DIR}/include ) add_executable(${PROJECT_NAME} @@ -21,4 +23,5 @@ add_executable(${PROJECT_NAME} target_link_libraries(${PROJECT_NAME} tidy ${PUGIXML_LIBRARIES} + curlcpp ) diff --git a/lib/curlcpp b/lib/curlcpp new file mode 160000 index 0000000..05bad3d --- /dev/null +++ b/lib/curlcpp @@ -0,0 +1 @@ +Subproject commit 05bad3db527ee3c76730d42104ddaaa8a6376a3f diff --git a/src/main.cpp b/src/main.cpp index 3b43c50..f1c8c88 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -5,6 +5,9 @@ #include #include #include +#include +#include +#include namespace { 
const char g_testData[] = { @@ -113,7 +116,8 @@ namespace { std::string cleanHTML(const std::string &html){ // Initialize a Tidy document TidyDoc tidyDoc = tidyCreate(); - TidyBuffer tidyOutputBuffer = {0}; + TidyBuffer tidyOutputBuffer; + std::memset(&tidyOutputBuffer, 0, sizeof(TidyBuffer)); // Configure Tidy // The flags tell Tidy to output XML and disable showing warnings @@ -150,9 +154,34 @@ namespace { } //unnamed namespace int main (int argc, char* argv[]) { - const std::string tidyHtml(cleanHTML(g_testData)); - std::cout << tidyHtml << std::endl; - std::cout << "XPath: " << argv[1] << std::endl; + std::string tidyHtml; + if (argc != 3) { + std::cerr << "Usage: scraper " << std::endl; + return 2; + } + + const char* const& url = argv[1]; + const char* const& xpath = argv[2]; + + std::cout << "URL : " << url << "\n"; + std::cout << "XPath: " << xpath << std::endl; + + { + std::ostringstream oss; + curl::curl_easy easy(oss); + easy.add(curl::curl_pair(CURLOPT_URL, url)); + easy.add(curl::curl_pair(CURLOPT_USERAGENT, "duckscraper")); + easy.add(curl::curl_pair(CURLOPT_FOLLOWLOCATION, 1L)); + try { + easy.perform(); + } + catch (curl_error& err) { + std::stack > errors = err.what(); + err.print_traceback(); + return 1; + } + tidyHtml = cleanHTML(oss.str()); + } { pugi::xml_document doc; @@ -163,11 +192,15 @@ int main (int argc, char* argv[]) { return 1; } - pugi::xpath_node_set xpathRes = doc.select_nodes(argv[1]); + pugi::xpath_node_set xpathRes = doc.select_nodes(xpath); for (pugi::xpath_node_set::const_iterator itFind(xpathRes.begin()), itFindEND(xpathRes.end()); itFind != itFindEND; ++itFind) { const pugi::xpath_node& node = *itFind; - std::cout << node.node().name() << ": "; - std::cout << node.node().attribute("name").value() << "\n"; + if (node.node()) { + std::cout << node.node().name() << ": " << node.node().value() << "\n"; + } + else if (node.attribute()) { + std::cout << node.attribute().name() << ": " << node.attribute().value() << "\n"; + } } } 
return 0;