Refactoring to put HTML retrieval & cleaning into a separate file.

This version should also be capable of retrieving data from HTTPS URLs.
2014-06-07 22:07:13 +02:00 · 2014-06-07 22:07:13 +02:00 · 0e077a4930
commit 0e077a4930
parent cb00e484fa
7 changed files with 298 additions and 168 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,23 +1,40 @@
 cmake_minimum_required(VERSION 2.8 FATAL_ERROR)
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
-project(scraper CXX)
+project(duckscraper CXX)

 add_subdirectory(lib/tidy)
 add_subdirectory(lib/curlcpp)

+include(GetGitRevisionDescription)
 find_package(PugiXML REQUIRED)

 set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -std=c++11 -Wall -Wextra -g -O0 -fno-omit-frame-pointer")
 set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -std=c++11 -Wall -Wextra -g -O3 -fomit-frame-pointer")

+# Values substituted into the config header template by configure_file() below.
+set(DEFAULT_USER_AGENT "DuckScraper")
+set(PROJECT_VERSION_MAJOR "0")
+set(PROJECT_VERSION_MINOR "1")
+# Becomes VERSION_BETA in the generated header, which main.cpp tests with #if.
+set(PROJECT_VERSION_BETA "1")
+# Stores the current ref name in GIT_REFSPEC and the HEAD hash in PROJECT_VERSION_GIT.
+get_git_head_revision(GIT_REFSPEC PROJECT_VERSION_GIT)
+
+# Generate the config header in the build tree. @ONLY limits substitution to
+# at-sign delimited references, so a literal dollar-brace sequence in the
+# template would be left untouched.
+configure_file(
+	"${PROJECT_SOURCE_DIR}/src/${PROJECT_NAME}Config.h.in"
+	"${PROJECT_BINARY_DIR}/${PROJECT_NAME}Config.h"
+	@ONLY
+)
+
 include_directories(SYSTEM
 	lib/tidy/include
 	${PUGIXML_INCLUDE_DIR}
 	${CURLCPP_SOURCE_DIR}/include
 )
+include_directories(
+	src/
+	"${PROJECT_BINARY_DIR}"
+)

 add_executable(${PROJECT_NAME}
 	src/main.cpp
+	src/htmlretrieve.cpp
 )

 target_link_libraries(${PROJECT_NAME}
--- a/cmake/Modules/GetGitRevisionDescription.cmake
+++ b/cmake/Modules/GetGitRevisionDescription.cmake
@ -0,0 +1,123 @@
+# - Returns a version string from Git
+#
+# These functions force a re-configure on each git commit so that you can
+# trust the values of the variables in your build system.
+#
+#  get_git_head_revision(<refspecvar> <hashvar> [<additional arguments to git describe> ...])
+#
+# Returns the refspec and sha hash of the current head revision
+#
+#  git_describe(<var> [<additional arguments to git describe> ...])
+#
+# Returns the results of git describe on the source tree, adjusting
+# the output so that it tests false if an error occurs.
+#
+#  git_get_exact_tag(<var> [<additional arguments to git describe> ...])
+#
+# Returns the results of git describe --exact-match on the source tree,
+# adjusting the output so that it tests false if there was no exact
+# matching tag.
+#
+# Requires CMake 2.6 or newer (uses the 'function' command)
+#
+# Original Author:
+# 2009-2010 Ryan Pavlik <rpavlik@iastate.edu> <abiryan@ryand.net>
+# http://academic.cleardefinition.com
+# Iowa State University HCI Graduate Program/VRAC
+#
+# Copyright Iowa State University 2009-2010.
+# Distributed under the Boost Software License, Version 1.0.
+# (See accompanying file LICENSE_1_0.txt or copy at
+# http://www.boost.org/LICENSE_1_0.txt)
+
+if(__get_git_revision_description)
+	return()
+endif()
+set(__get_git_revision_description YES)
+
+# We must run the following at "include" time, not at function call time,
+# to find the path to this module rather than the path to a calling list file
+get_filename_component(_gitdescmoddir ${CMAKE_CURRENT_LIST_FILE} PATH)
+
+# get_git_head_revision(<refspecvar> <hashvar>)
+#
+# Looks for a .git entry starting at CMAKE_SOURCE_DIR and walking up through
+# the parent directories, copies HEAD into the build tree and stores the
+# current ref name and commit hash in the caller's <refspecvar> and <hashvar>.
+# Both outputs are set to a value ending in -NOTFOUND (which tests false in
+# if()) when no .git entry or no HEAD file can be found.
+function(get_git_head_revision _refspecvar _hashvar)
+	set(GIT_PARENT_DIR "${CMAKE_SOURCE_DIR}")
+	set(GIT_DIR "${GIT_PARENT_DIR}/.git")
+	while(NOT EXISTS "${GIT_DIR}")	# .git dir not found, search parent directories
+		set(GIT_PREVIOUS_PARENT "${GIT_PARENT_DIR}")
+		# Quoted so that a path containing spaces or semicolons stays one argument.
+		get_filename_component(GIT_PARENT_DIR "${GIT_PARENT_DIR}" PATH)
+		if("${GIT_PARENT_DIR}" STREQUAL "${GIT_PREVIOUS_PARENT}")
+			# We have reached the root directory, we are not in git
+			set(${_refspecvar} "GITDIR-NOTFOUND" PARENT_SCOPE)
+			set(${_hashvar} "GITDIR-NOTFOUND" PARENT_SCOPE)
+			return()
+		endif()
+		set(GIT_DIR "${GIT_PARENT_DIR}/.git")
+	endwhile()
+
+	# Scratch directory in the build tree for the copied git metadata.
+	set(GIT_DATA "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/git-data")
+	if(NOT EXISTS "${GIT_DATA}")
+		file(MAKE_DIRECTORY "${GIT_DATA}")
+	endif()
+
+	if(NOT EXISTS "${GIT_DIR}/HEAD")
+		# Report the failure explicitly instead of leaving the outputs unset.
+		set(${_refspecvar} "HEAD-NOTFOUND" PARENT_SCOPE)
+		set(${_hashvar} "HEAD-NOTFOUND" PARENT_SCOPE)
+		return()
+	endif()
+	set(HEAD_FILE "${GIT_DATA}/HEAD")
+	configure_file("${GIT_DIR}/HEAD" "${HEAD_FILE}" COPYONLY)
+
+	# Instantiate the helper script with the paths computed above and run it;
+	# it defines HEAD_REF and HEAD_HASH in this scope.
+	configure_file("${_gitdescmoddir}/GetGitRevisionDescription.cmake.in"
+		"${GIT_DATA}/grabRef.cmake"
+		@ONLY)
+	include("${GIT_DATA}/grabRef.cmake")
+
+	set(${_refspecvar} "${HEAD_REF}" PARENT_SCOPE)
+	set(${_hashvar} "${HEAD_HASH}" PARENT_SCOPE)
+endfunction()
+
+# git_describe(<var> [<additional arguments to git describe> ...])
+#
+# Runs "git describe <HEAD hash> <additional arguments>" in CMAKE_SOURCE_DIR
+# and stores the whitespace-stripped output in <var>. Every failure is
+# reported as a value ending in -NOTFOUND so that it tests false in if().
+function(git_describe _var)
+	if(NOT GIT_FOUND)
+		find_package(Git QUIET)
+	endif()
+	# Resolve the HEAD commit first; describe is run against that hash.
+	get_git_head_revision(head_refspec head_hash)
+
+	if(NOT GIT_FOUND)
+		set(${_var} "GIT-NOTFOUND" PARENT_SCOPE)
+		return()
+	elseif(NOT head_hash)
+		set(${_var} "HEAD-HASH-NOTFOUND" PARENT_SCOPE)
+		return()
+	endif()
+
+	# TODO: sanitize the extra arguments before handing them to git.
+	execute_process(
+		COMMAND "${GIT_EXECUTABLE}" describe ${head_hash} ${ARGN}
+		WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+		RESULT_VARIABLE describe_status
+		OUTPUT_VARIABLE describe_output
+		ERROR_QUIET
+		OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+	if(describe_status EQUAL 0)
+		set(${_var} "${describe_output}" PARENT_SCOPE)
+	else()
+		# Append the exit status and the NOTFOUND marker to whatever git printed.
+		set(${_var} "${describe_output}-${describe_status}-NOTFOUND" PARENT_SCOPE)
+	endif()
+endfunction()
+
+# git_get_exact_tag(<var> [<additional arguments to git describe> ...])
+#
+# Forwards to git_describe() with --exact-match prepended to the extra
+# arguments and hands the result back to the caller in <var>.
+function(git_get_exact_tag _var)
+	git_describe(exact_tag --exact-match ${ARGN})
+	set(${_var} "${exact_tag}" PARENT_SCOPE)
+endfunction()
--- a/cmake/Modules/GetGitRevisionDescription.cmake.in
+++ b/cmake/Modules/GetGitRevisionDescription.cmake.in
@ -0,0 +1,38 @@
+# 
+# Internal file for GetGitRevisionDescription.cmake
+#
+# Requires CMake 2.6 or newer (uses the 'function' command)
+#
+# Original Author:
+# 2009-2010 Ryan Pavlik <rpavlik@iastate.edu> <abiryan@ryand.net>
+# http://academic.cleardefinition.com
+# Iowa State University HCI Graduate Program/VRAC
+#
+# Copyright Iowa State University 2009-2010.
+# Distributed under the Boost Software License, Version 1.0.
+# (See accompanying file LICENSE_1_0.txt or copy at
+# http://www.boost.org/LICENSE_1_0.txt)
+
+# Start without a hash; it is filled in below once HEAD has been resolved.
+set(HEAD_HASH)
+
+file(READ "@HEAD_FILE@" HEAD_CONTENTS LIMIT 1024)
+
+string(STRIP "${HEAD_CONTENTS}" HEAD_CONTENTS)
+if(HEAD_CONTENTS MATCHES "ref")
+	# named branch
+	string(REPLACE "ref: " "" HEAD_REF "${HEAD_CONTENTS}")
+	if(EXISTS "@GIT_DIR@/${HEAD_REF}")
+		configure_file("@GIT_DIR@/${HEAD_REF}" "@GIT_DATA@/head-ref" COPYONLY)
+	else()
+		# No loose ref file: drop any copy left by an earlier configure run so
+		# that a stale hash is never reported.
+		file(REMOVE "@GIT_DATA@/head-ref")
+		if(EXISTS "@GIT_DIR@/packed-refs")
+			# The ref may have been packed (e.g. by "git gc"); each packed-refs
+			# entry has the form "<hash> <refname>".
+			# NOTE(review): HEAD_REF is used as a regular expression here, so a
+			# branch name containing regex metacharacters may not match.
+			configure_file("@GIT_DIR@/packed-refs" "@GIT_DATA@/packed-refs" COPYONLY)
+			file(READ "@GIT_DATA@/packed-refs" PACKED_REFS)
+			if(PACKED_REFS MATCHES "([0-9a-f]+) ${HEAD_REF}(\r|\n|$)")
+				set(HEAD_HASH "${CMAKE_MATCH_1}")
+			endif()
+		endif()
+	endif()
+else()
+	# detached HEAD
+	configure_file("@GIT_DIR@/HEAD" "@GIT_DATA@/head-ref" COPYONLY)
+endif()
+
+# Read the hash from the copied ref file unless it was already resolved. When
+# the ref could not be resolved at all, HEAD_HASH stays empty instead of
+# aborting the configure step on a missing file.
+if(NOT HEAD_HASH AND EXISTS "@GIT_DATA@/head-ref")
+	file(READ "@GIT_DATA@/head-ref" HEAD_HASH LIMIT 1024)
+	string(STRIP "${HEAD_HASH}" HEAD_HASH)
+endif()
--- a/src/duckscraperConfig.h.in
+++ b/src/duckscraperConfig.h.in
@ -0,0 +1,11 @@
+#ifndef idE2B0CC679C2B47AD928F00D45AEBDCBD
+#define idE2B0CC679C2B47AD928F00D45AEBDCBD
+
+// Template for the build-time configuration header. CMake's configure_file()
+// replaces every at-sign delimited placeholder below with the value of the
+// CMake variable of the same name (see the top-level CMakeLists.txt).
+
+// User agent used when the caller does not supply one: "<name>/<major>.<minor>".
+#define DEFAULT_USER_AGENT "@DEFAULT_USER_AGENT@/@PROJECT_VERSION_MAJOR@.@PROJECT_VERSION_MINOR@"
+// Name of the CMake project, as a string literal.
+#define PROGRAM_NAME "@PROJECT_NAME@"
+// Version components, expanded as bare tokens rather than strings.
+#define VERSION_MAJOR @PROJECT_VERSION_MAJOR@
+#define VERSION_MINOR @PROJECT_VERSION_MINOR@
+// Tested with #if in main.cpp to decide whether to print the "b" suffix.
+#define VERSION_BETA @PROJECT_VERSION_BETA@
+// Hash variable filled in by get_git_head_revision() at configure time.
+#define VERSION_GIT "@PROJECT_VERSION_GIT@"
+
+#endif
--- a/src/htmlretrieve.cpp
+++ b/src/htmlretrieve.cpp
@ -0,0 +1,88 @@
+#include "htmlretrieve.hpp"
+#include "duckscraperConfig.h"
+#include <ciso646>
+#include <tidy/tidy.h>
+#include <tidy/buffio.h>
+#include <curl_easy.h>
+#include <algorithm>
+#include <cctype>
+#include <cstring>
+#include <sstream>
+#include <stack>
+#include <stdexcept>
+#include <string>
+
+namespace duck {
+	namespace {
+		// Runs the given HTML through HTML Tidy and returns the cleaned-up
+		// document, serialised as XML, as a string.
+		//
+		// Throws std::runtime_error when Tidy cannot be configured or reports
+		// a negative status while parsing, repairing or saving the document.
+		std::string cleanHTML(const std::string &html) {
+			// Initialize a Tidy document
+			TidyDoc tidyDoc = tidyCreate();
+			TidyBuffer tidyOutputBuffer;
+			std::memset(&tidyOutputBuffer, 0, sizeof(TidyBuffer));
+
+			// Configure Tidy
+			// The flags tell Tidy to output XML and disable showing warnings
+			const bool configSuccess = tidyOptSetBool(tidyDoc, TidyXmlOut, yes)
+				&& tidyOptSetBool(tidyDoc, TidyQuiet, yes)
+				&& tidyOptSetBool(tidyDoc, TidyNumEntities, yes)
+				&& tidyOptSetBool(tidyDoc, TidyShowWarnings, no);
+
+			// Stays negative when configuration failed, so the error path
+			// below is taken without parsing anything.
+			int tidyResponseCode = -1;
+
+			// Parse input
+			if (configSuccess)
+				tidyResponseCode = tidyParseString(tidyDoc, html.c_str());
+
+			// Process HTML
+			if (tidyResponseCode >= 0)
+				tidyResponseCode = tidyCleanAndRepair(tidyDoc);
+
+			// Output the HTML to our buffer
+			if (tidyResponseCode >= 0)
+				tidyResponseCode = tidySaveBuffer(tidyDoc, &tidyOutputBuffer);
+
+			// Any errors from Tidy?
+			if (tidyResponseCode < 0) {
+				// Release Tidy's resources before unwinding so they do not leak.
+				// The buffer is only freed if Tidy actually allocated it.
+				if (tidyOutputBuffer.bp)
+					tidyBufFree(&tidyOutputBuffer);
+				tidyRelease(tidyDoc);
+
+				// Format the code into the message; adding an int to a string
+				// literal would only offset the pointer.
+				std::ostringstream message;
+				message << "Tidy encountered an error while parsing an HTML response. Tidy response code: " << tidyResponseCode;
+				throw std::runtime_error(message.str());
+			}
+
+			// Grab the result from the buffer and then free Tidy's memory.
+			// An untouched buffer has a null data pointer, which must not be
+			// passed to std::string.
+			std::string tidyResult;
+			if (tidyOutputBuffer.bp) {
+				tidyResult = reinterpret_cast<const char*>(tidyOutputBuffer.bp);
+				tidyBufFree(&tidyOutputBuffer);
+			}
+			tidyRelease(tidyDoc);
+
+			return tidyResult;
+		}
+
+		// Returns true when parUrl begins with the "https://" scheme prefix.
+		// The comparison ignores letter case because URI schemes are
+		// case-insensitive (RFC 3986, section 3.1), so "HTTPS://host" is
+		// accepted as well.
+		bool isHttps (const std::string& parUrl) {
+			const char protocol[] = "https://";
+			// Number of characters in the prefix, excluding the terminating NUL.
+			const size_t protocolLen = sizeof(protocol) / sizeof(protocol[0]) - 1;
+			if (parUrl.size() < protocolLen)
+				return false;
+
+			for (size_t index = 0; index < protocolLen; ++index) {
+				// Go through unsigned char: std::tolower is undefined for
+				// negative values other than EOF.
+				const int lowered = std::tolower(static_cast<unsigned char>(parUrl[index]));
+				if (lowered != protocol[index])
+					return false;
+			}
+			return true;
+		}
+	} //unnamed namespace
+
+	// Downloads parSource through curl, sending parUserAgent as the user
+	// agent and following redirects, and returns the response body after it
+	// has been passed through cleanHTML().
+	//
+	// For URLs recognised by isHttps(), parSslVerifyPeer and parSslVerifyHost
+	// switch certificate and host-name verification on or off; for any other
+	// URL the two flags are ignored. Nothing is caught here: whatever
+	// easy.perform() or cleanHTML() throws propagates to the caller.
+	std::string getCleanHtml (const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost) {
+		std::ostringstream oss;
+		curl::curl_easy easy(oss);
+		easy.add(curl::curl_pair<CURLoption, std::string>(CURLOPT_URL, parSource));
+		if (isHttps(parSource)) {
+			// libcurl reads both options as long values. Host verification is
+			// enabled with 2; the value 1 does not turn the check on.
+			const long verifyPeer = parSslVerifyPeer ? 1L : 0L;
+			const long verifyHost = parSslVerifyHost ? 2L : 0L;
+			easy.add(curl::curl_pair<CURLoption, long>(CURLOPT_SSL_VERIFYPEER, verifyPeer));
+			easy.add(curl::curl_pair<CURLoption, long>(CURLOPT_SSL_VERIFYHOST, verifyHost));
+		}
+		easy.add(curl::curl_pair<CURLoption, std::string>(CURLOPT_USERAGENT, parUserAgent));
+		easy.add(curl::curl_pair<CURLoption, long>(CURLOPT_FOLLOWLOCATION, 1L));
+
+		// The response body is written into oss by the transfer.
+		easy.perform();
+
+		return cleanHTML(oss.str());
+	}
+
+	// Convenience overload: behaves like the four-argument getCleanHtml()
+	// with the user agent fixed to DEFAULT_USER_AGENT from
+	// duckscraperConfig.h.
+	std::string getCleanHtml (const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost) {
+		const std::string defaultAgent(DEFAULT_USER_AGENT);
+		return getCleanHtml(parSource, defaultAgent, parSslVerifyPeer, parSslVerifyHost);
+	}
+} //namespace duck
--- a/src/htmlretrieve.hpp
+++ b/src/htmlretrieve.hpp
@ -0,0 +1,11 @@
+#ifndef idC6776D903059465191FFB64FCFD6B86A
+#define idC6776D903059465191FFB64FCFD6B86A
+
+#include <string>
+
+namespace duck {
+	// Retrieves the document at parSource, identifying itself with
+	// parUserAgent, and returns it after clean-up by HTML Tidy. The two SSL
+	// flags are applied only when parSource starts with "https://".
+	std::string getCleanHtml ( const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost );
+	// Same as above, using the DEFAULT_USER_AGENT defined in the generated
+	// duckscraperConfig.h as the user agent.
+	std::string getCleanHtml ( const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost );
+} //namespace duck
+
+#endif
--- a/src/main.cpp
+++ b/src/main.cpp
@ -1,161 +1,18 @@
+#include "htmlretrieve.hpp"
+#include "duckscraperConfig.h"
 #include <iostream>
-#include <ciso646>
-#include <tidy/tidy.h>
-#include <tidy/buffio.h>
 #include <string>
 #include <pugixml.hpp>
 #include <sstream>
-#include <curl_easy.h>
-#include <cstring>
-#include <stack>
-
-namespace {
-	const char g_testData[] = {
-		"<!DOCTYPE html>\n"
-		"<html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n"
-		"<title>LibReSSL</title>\n"
-		"<link rel=\"stylesheet\" href=\"http://www.libressl.org/libre-ssl.css\">\n"
-		"<meta charset=\"utf-8\">\n"
-		"<meta name=\"resource-type\" content=\"document\">\n"
-		"<meta name=\"description\" content=\"the main LibreSSL page\">\n"
-		"<meta name=\"keywords\" content=\"LibreSSL,main\"/\n"
-		"<meta name=\"distribution\" content=\"global\">\n"
-		"</head>\n"
-		"<body>\n"
-		"<section class=\"libre-ssl\">\n"
-		"<hgroup>\n"
-		"<header>\n"
-		"<h1>LibReSSL</h1>\n"
-		"</header>\n"
-		"</hgroup>\n"
-		"<section class=\"introduction\">\n"
-		"<p>LibReSSL is a <strong>FREE</strong> version of the SSL/TLS protocol\n"
-		"forked from <a\n"
-		"href=\"http://www.youtube.com/watch?v=mRCGDUsdRDU\">OpenSSL</a></p>\n"
-		"<p>\n"
-		"At the moment we are <a\n"
-		"href=\"http://freshbsd.org/search?project=openbsd&q=file.name:libssl\">too\n"
-		"busy deleting and rewriting code</a> to make a decent web page. No we\n"
-		"don't want help making web pages, thank you.\n"
-		"</p>\n"
-		"<p>Check back here soon for updates.</p>\n"
-		"</section>\n"
-		"<section class=\"resources\">\n"
-		"<hgroup>\n"
-		"<header>\n"
-		"<h2>Resources:</h2>\n"
-		"</header>\n"
-		"</hgroup>\n"
-		"<section class=\"open-bsd\">\n"
-		"<hgroup>\n"
-		"<header>\n"
-		"<h3>For OpenBSD</h3>\n"
-		"</header>\n"
-		"</hgroup>\n"
-		"<ul class=\"indent-list\">\n"
-		"<li><a href=\"http://www.openbsd.org/ftp.html\">FTP</a></li>\n"
-		"<li><a href=\"http://www.openbsd.org/anoncvs.html\">AnonCVS</a></li>\n"
-		"<li><a href=\"http://www.openbsd.org/cgi-bin/cvsweb/src/lib/libssl\">CVSWeb</a></li>\n"
-		"</ul>\n"
-		"</section>\n"
-		"<section>\n"
-		"<hgroup>\n"
-		"<header>\n"
-		"<h3>For other OS's</h3>\n"
-		"</header>\n"
-		"</hgroup>\n"
-		"<p>\n"
-		"<ul class=\"indent-list\">\n"
-		"<li><a href=\"http://www.openbsd.org/papers/bsdcan14-libressl\">LibReSSL Presentation by Bob Beck from BSDCAN 2014</a></li>\n"
-		"</ul>\n"
-		"</p>\n"
-		"<p>\n"
-		"Multi OS support will happen once we have\n"
-		"</p>\n"
-		"<ul>\n"
-		"<li><a href=\"http://en.wikipedia.org/wiki/Flensing\">Flensed</a>,\n"
-		"refactored, rewritten, and fixed enough of the code so we have stable\n"
-		"baseline that we trust and can be maintained/improved.</li>\n"
-		"<li>The right Portability team in place.</li>\n"
-		"<li>A <a href=\"http://www.openbsdfoundation.org/\">Stable Commitment of\n"
-		"Funding</a> to support an increased development and porting\n"
-		"effort. </li>\n"
-		"</ul>\n"
-		"<p>\n"
-		"We know you all want this tomorrow. We are working as fast as we can\n"
-		"but our primary focus is good software that we trust to run\n"
-		"ourselves. <em>We don't want to break your heart</em>.\n"
-		"</p>\n"
-		"</section>\n"
-		"</section>\n"
-		"<section class=\"acknowledgements\">\n"
-		"<p>\n"
-		"LibReSSL is primarily developed by <a\n"
-		"href=\"http://www.openbsd.org/\">the OpenBSD Project</a>, and its first\n"
-		"inclusion into an operating system will be in <a\n"
-		"href=\"http://www.openbsd.org/56.html\">OpenBSD 5.6</a>.\n"
-		"</p>\n"
-		"<p>\n"
-		"LibReSSL is supported financially by <a\n"
-		"href=\"http://www.openbsdfoundation.org/\">The OpenBSD Foundation</a> as\n"
-		"well as by the <a href=\"http://www.openbsd.org/donations.html\">The\n"
-		"OpenBSD Project</a>. Please consider <a\n"
-		"href=\"http://www.openbsdfoundation.org/donations.html\">donating</a> to\n"
-		"support our efforts.<br>\n"
-		"</p>\n"
-		"</section>\n"
-		"<section class=\"we-love-web-devs-especially-those-that-write-blink-tags-for-us\">\n"
-		"<p>This page scientifically designed to annoy web hipsters. <a\n"
-		"href=\"http://www.openbsdfoundation.org/donations.html\">Donate now</a>\n"
-		"to stop the Comic Sans and Blink Tags</p>\n"
-		"</section>\n"
-		"</section>\n"
-		"</body></html>\n"
-	};
-
-	std::string cleanHTML(const std::string &html){
-		// Initialize a Tidy document
-		TidyDoc tidyDoc = tidyCreate();
-		TidyBuffer tidyOutputBuffer;
-		std::memset(&tidyOutputBuffer, 0, sizeof(TidyBuffer));
-
-		// Configure Tidy
-		// The flags tell Tidy to output XML and disable showing warnings
-		bool configSuccess = tidyOptSetBool(tidyDoc, TidyXmlOut, yes)
-			&& tidyOptSetBool(tidyDoc, TidyQuiet, yes)
-			&& tidyOptSetBool(tidyDoc, TidyNumEntities, yes)
-			&& tidyOptSetBool(tidyDoc, TidyShowWarnings, no);
-
-		int tidyResponseCode = -1;
-
-		// Parse input
-		if (configSuccess)
-			tidyResponseCode = tidyParseString(tidyDoc, html.c_str());
-
-		// Process HTML
-		if (tidyResponseCode >= 0)
-			tidyResponseCode = tidyCleanAndRepair(tidyDoc);
-
-		// Output the HTML to our buffer
-		if (tidyResponseCode >= 0)
-			tidyResponseCode = tidySaveBuffer(tidyDoc, &tidyOutputBuffer);
-
-		// Any errors from Tidy?
-		if (tidyResponseCode < 0)
-			throw ("Tidy encountered an error while parsing an HTML response. Tidy response code: " + tidyResponseCode);
-
-		// Grab the result from the buffer and then free Tidy's memory
-		std::string tidyResult = (char*)tidyOutputBuffer.bp;
-		tidyBufFree(&tidyOutputBuffer);
-		tidyRelease(tidyDoc);
-
-		return tidyResult;
-	}
-} //unnamed namespace

 int main (int argc, char* argv[]) {
-	std::string tidyHtml;
 	if (argc != 3) {
+		std::cerr << PROGRAM_NAME << "v" << VERSION_MAJOR << "." << VERSION_MINOR;
+#if VERSION_BETA
+		std::cerr << "b";
+#endif
+		std::cerr << " git revision " << VERSION_GIT << "\n";
+		std::cerr << "Default user agent is \"" << DEFAULT_USER_AGENT << "\"\n";
 		std::cerr << "Usage: scraper <URL> <XPath>" << std::endl;
 		return 2;
 	}
@ -166,22 +23,7 @@ int main (int argc, char* argv[]) {
 	std::cout << "URL  : " << url << "\n";
 	std::cout << "XPath: " << xpath << std::endl;

-	{
-		std::ostringstream oss;
-		curl::curl_easy easy(oss);
-		easy.add(curl::curl_pair<CURLoption, std::string>(CURLOPT_URL, url));
-		easy.add(curl::curl_pair<CURLoption, std::string>(CURLOPT_USERAGENT, "duckscraper"));
-		easy.add(curl::curl_pair<CURLoption, long>(CURLOPT_FOLLOWLOCATION, 1L));
-		try {
-			easy.perform();
-		}
-		catch (curl_error& err) {
-			std::stack<std::pair<std::string, std::string> > errors = err.what();
-			err.print_traceback();
-			return 1;
-		}
-		tidyHtml = cleanHTML(oss.str());
-	}
+	std::string tidyHtml = duck::getCleanHtml(url, false, false);

 	{
 		pugi::xml_document doc;