Working example.

Invoke it with, e.g.: ./scraper http://www.dilbert.com '//div[@class='\''STR_Image'\'']/a/img/@src'
2014-06-07 20:44:43 +02:00 · 2014-06-07 20:44:43 +02:00 · cb00e484fa
commit cb00e484fa
parent aa015ddd6a
4 changed files with 50 additions and 10 deletions
--- a/.gitmodules
+++ b/.gitmodules
@ -0,0 +1,3 @@
+[submodule "lib/curlcpp"]
+	path = lib/curlcpp
+	url = https://github.com/JosephP91/curlcpp.git
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -3,15 +3,17 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
 project(scraper CXX)

 add_subdirectory(lib/tidy)
+add_subdirectory(lib/curlcpp)

 find_package(PugiXML REQUIRED)
-#find_package(CURL REQUIRED)
-#${CURL_INCLUDE_DIR}
-#${CURL_LIBRARIES}
+
+set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -std=c++11 -Wall -Wextra -g -O0 -fno-omit-frame-pointer")
+set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -std=c++11 -Wall -Wextra -g -O3 -fomit-frame-pointer")

 include_directories(SYSTEM
 	lib/tidy/include
 	${PUGIXML_INCLUDE_DIR}
+	${CURLCPP_SOURCE_DIR}/include
 )

 add_executable(${PROJECT_NAME}
@ -21,4 +23,5 @@ add_executable(${PROJECT_NAME}
 target_link_libraries(${PROJECT_NAME}
 	tidy
 	${PUGIXML_LIBRARIES}
+	curlcpp
 )
--- a/lib/curlcpp
+++ b/lib/curlcpp
@ -0,0 +1 @@
+Subproject commit 05bad3db527ee3c76730d42104ddaaa8a6376a3f
--- a/src/main.cpp
+++ b/src/main.cpp
@ -5,6 +5,9 @@
 #include <string>
 #include <pugixml.hpp>
 #include <sstream>
+#include <curl_easy.h>
+#include <cstring>
+#include <stack>

 namespace {
 	const char g_testData[] = {
@ -113,7 +116,8 @@ namespace {
 	std::string cleanHTML(const std::string &html){
 		// Initialize a Tidy document
 		TidyDoc tidyDoc = tidyCreate();
-		TidyBuffer tidyOutputBuffer = {0};
+		TidyBuffer tidyOutputBuffer;
+		std::memset(&tidyOutputBuffer, 0, sizeof(TidyBuffer));

 		// Configure Tidy
 		// The flags tell Tidy to output XML and disable showing warnings
@ -150,9 +154,34 @@ namespace {
 } //unnamed namespace

 int main (int argc, char* argv[]) {
-	const std::string tidyHtml(cleanHTML(g_testData));
-	std::cout << tidyHtml << std::endl;
-	std::cout << "XPath: " << argv[1] << std::endl;
+	std::string tidyHtml;
+	if (argc != 3) {
+		std::cerr << "Usage: scraper <URL> <XPath>" << std::endl;
+		return 2;
+	}
+
+	const char* const& url = argv[1];
+	const char* const& xpath = argv[2];
+
+	std::cout << "URL  : " << url << "\n";
+	std::cout << "XPath: " << xpath << std::endl;
+
+	{
+		std::ostringstream oss;
+		curl::curl_easy easy(oss);
+		easy.add(curl::curl_pair<CURLoption, std::string>(CURLOPT_URL, url));
+		easy.add(curl::curl_pair<CURLoption, std::string>(CURLOPT_USERAGENT, "duckscraper"));
+		easy.add(curl::curl_pair<CURLoption, long>(CURLOPT_FOLLOWLOCATION, 1L));
+		try {
+			easy.perform();
+		}
+		catch (curl_error& err) {
+			std::stack<std::pair<std::string, std::string> > errors = err.what();
+			err.print_traceback();
+			return 1;
+		}
+		tidyHtml = cleanHTML(oss.str());
+	}

 	{
 		pugi::xml_document doc;
@ -163,11 +192,15 @@ int main (int argc, char* argv[]) {
 			return 1;
 		}

-		pugi::xpath_node_set xpathRes = doc.select_nodes(argv[1]);
+		pugi::xpath_node_set xpathRes = doc.select_nodes(xpath);
 		for (pugi::xpath_node_set::const_iterator itFind(xpathRes.begin()), itFindEND(xpathRes.end()); itFind != itFindEND; ++itFind) {
 			const pugi::xpath_node& node = *itFind;
-			std::cout << node.node().name() << ": ";
-			std::cout << node.node().attribute("name").value() << "\n";
+			if (node.node()) {
+				std::cout << node.node().name() << ": " << node.node().value() << "\n";
+			}
+			else if (node.attribute()) {
+				std::cout << node.attribute().name() << ": " << node.attribute().value() << "\n";
+			}
 		}
 	}
 	return 0;
				`@ -0,0 +1 @@`
				`Subproject commit 05bad3db527ee3c76730d42104ddaaa8a6376a3f`