diff --git a/CMakeLists.txt b/CMakeLists.txt
index 46e33a2..d649a2a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,23 +1,40 @@
cmake_minimum_required(VERSION 2.8 FATAL_ERROR)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
-project(scraper CXX)
+project(duckscraper CXX)
add_subdirectory(lib/tidy)
add_subdirectory(lib/curlcpp)
+# Provides get_git_head_revision(), used below to record the current git HEAD.
+include(GetGitRevisionDescription)
find_package(PugiXML REQUIRED)
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -std=c++11 -Wall -Wextra -g -O0 -fno-omit-frame-pointer")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -std=c++11 -Wall -Wextra -g -O3 -fomit-frame-pointer")
+# Values substituted into the configuration header template by configure_file() below.
+set(DEFAULT_USER_AGENT "DuckScraper")
+set(PROJECT_VERSION_MAJOR "0")
+set(PROJECT_VERSION_MINOR "1")
+set(PROJECT_VERSION_BETA "1")
+# GIT_REFSPEC receives the ref named by HEAD, PROJECT_VERSION_GIT the HEAD revision.
+get_git_head_revision(GIT_REFSPEC PROJECT_VERSION_GIT)
+
+# Generate <project name>Config.h in the build tree from the template in src/.
+configure_file(
+ "${PROJECT_SOURCE_DIR}/src/${PROJECT_NAME}Config.h.in"
+ "${PROJECT_BINARY_DIR}/${PROJECT_NAME}Config.h"
+)
+
include_directories(SYSTEM
lib/tidy/include
${PUGIXML_INCLUDE_DIR}
${CURLCPP_SOURCE_DIR}/include
)
+# src/ holds the project headers; the binary dir holds the generated Config.h.
+include_directories(
+ src/
+ "${PROJECT_BINARY_DIR}"
+)
add_executable(${PROJECT_NAME}
src/main.cpp
+ src/htmlretrieve.cpp
)
target_link_libraries(${PROJECT_NAME}
diff --git a/cmake/Modules/GetGitRevisionDescription.cmake b/cmake/Modules/GetGitRevisionDescription.cmake
new file mode 100644
index 0000000..1bf0230
--- /dev/null
+++ b/cmake/Modules/GetGitRevisionDescription.cmake
@@ -0,0 +1,123 @@
+# - Returns a version string from Git
+#
+# These functions force a re-configure on each git commit so that you can
+# trust the values of the variables in your build system.
+#
+# get_git_head_revision(<refspecvar> <hashvar> [<additional arguments to git describe> ...])
+#
+# Returns the refspec and sha hash of the current head revision
+#
+# git_describe(<var> [<additional arguments to git describe> ...])
+#
+# Returns the results of git describe on the source tree, and adjusting
+# the output so that it tests false if an error occurs.
+#
+# git_get_exact_tag(<var> [<additional arguments to git describe> ...])
+#
+# Returns the results of git describe --exact-match on the source tree,
+# and adjusting the output so that it tests false if there was no exact
+# matching tag.
+#
+# Requires CMake 2.6 or newer (uses the 'function' command)
+#
+# Original Author:
+# 2009-2010 Ryan Pavlik
+# http://academic.cleardefinition.com
+# Iowa State University HCI Graduate Program/VRAC
+#
+# Copyright Iowa State University 2009-2010.
+# Distributed under the Boost Software License, Version 1.0.
+# (See accompanying file LICENSE_1_0.txt or copy at
+# http://www.boost.org/LICENSE_1_0.txt)
+
+# Include guard: stop processing this module if it has already been included.
+if(__get_git_revision_description)
+ return()
+endif()
+set(__get_git_revision_description YES)
+
+# We must run the following at "include" time, not at function call time,
+# to find the path to this module rather than the path to a calling list file
+# (_gitdescmoddir is read by get_git_head_revision() to locate
+# GetGitRevisionDescription.cmake.in).
+get_filename_component(_gitdescmoddir ${CMAKE_CURRENT_LIST_FILE} PATH)
+
+# get_git_head_revision(<refspecvar> <hashvar>)
+#
+# Looks for a ".git" entry in CMAKE_SOURCE_DIR, then in each parent directory,
+# and reports the current HEAD through two variables set in the caller's scope:
+#   <refspecvar> - HEAD_REF as set by the generated grabRef.cmake script
+#   <hashvar>    - HEAD_HASH as set by the generated grabRef.cmake script
+# Failure values (both test false in if()):
+#   GITDIR-NOTFOUND - no ".git" entry exists up to the filesystem root
+#   HEAD-NOTFOUND   - ".git" exists but contains no HEAD file (for example
+#                     when ".git" is a file rather than a directory)
+# HEAD is copied with configure_file(), so CMake re-configures when it changes.
+function(get_git_head_revision _refspecvar _hashvar)
+    set(GIT_PARENT_DIR "${CMAKE_SOURCE_DIR}")
+    set(GIT_DIR "${GIT_PARENT_DIR}/.git")
+    while(NOT EXISTS "${GIT_DIR}") # .git not found here, try the parent directory
+        set(GIT_PREVIOUS_PARENT "${GIT_PARENT_DIR}")
+        get_filename_component(GIT_PARENT_DIR "${GIT_PARENT_DIR}" PATH)
+        if(GIT_PARENT_DIR STREQUAL GIT_PREVIOUS_PARENT)
+            # The parent of the root is the root itself: we are not inside a git tree
+            set(${_refspecvar} "GITDIR-NOTFOUND" PARENT_SCOPE)
+            set(${_hashvar} "GITDIR-NOTFOUND" PARENT_SCOPE)
+            return()
+        endif()
+        set(GIT_DIR "${GIT_PARENT_DIR}/.git")
+    endwhile()
+
+    # Scratch directory for the copies of git's files and the generated script
+    set(GIT_DATA "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/git-data")
+    if(NOT EXISTS "${GIT_DATA}")
+        file(MAKE_DIRECTORY "${GIT_DATA}")
+    endif()
+
+    if(NOT EXISTS "${GIT_DIR}/HEAD")
+        # Report the failure explicitly instead of leaving the outputs unset
+        set(${_refspecvar} "HEAD-NOTFOUND" PARENT_SCOPE)
+        set(${_hashvar} "HEAD-NOTFOUND" PARENT_SCOPE)
+        return()
+    endif()
+    set(HEAD_FILE "${GIT_DATA}/HEAD")
+    configure_file("${GIT_DIR}/HEAD" "${HEAD_FILE}" COPYONLY)
+
+    # Instantiate the helper script with the paths computed above and run it;
+    # it sets HEAD_REF and HEAD_HASH in this function's scope.
+    configure_file("${_gitdescmoddir}/GetGitRevisionDescription.cmake.in"
+        "${GIT_DATA}/grabRef.cmake"
+        @ONLY)
+    include("${GIT_DATA}/grabRef.cmake")
+
+    set(${_refspecvar} "${HEAD_REF}" PARENT_SCOPE)
+    set(${_hashvar} "${HEAD_HASH}" PARENT_SCOPE)
+endfunction()
+
+# git_describe(<var> [<additional arguments to git describe> ...])
+#
+# Runs "git describe <HEAD hash> <additional arguments>" in CMAKE_SOURCE_DIR and
+# stores the whitespace-stripped output in <var> in the caller's scope.
+# Failure values, all of which test false in if():
+#   GIT-NOTFOUND                  - no git executable was found
+#   HEAD-HASH-NOTFOUND            - get_git_head_revision() gave no usable hash
+#   <output>-<status>-NOTFOUND    - git describe exited with a non-zero status
+function(git_describe _var)
+    if(NOT GIT_FOUND)
+        find_package(Git QUIET)
+    endif()
+    # HEAD is resolved before the GIT_FOUND check, exactly as before
+    get_git_head_revision(refspec hash)
+    if(NOT GIT_FOUND)
+        set(${_var} "GIT-NOTFOUND" PARENT_SCOPE)
+        return()
+    endif()
+    if(NOT hash)
+        set(${_var} "HEAD-HASH-NOTFOUND" PARENT_SCOPE)
+        return()
+    endif()
+
+    # TODO: the extra arguments in ARGN are forwarded to git without sanitizing
+    execute_process(
+        COMMAND "${GIT_EXECUTABLE}" describe ${hash} ${ARGN}
+        WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+        RESULT_VARIABLE describe_status
+        OUTPUT_VARIABLE describe_output
+        ERROR_QUIET
+        OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+    if(describe_status EQUAL 0)
+        set(${_var} "${describe_output}" PARENT_SCOPE)
+    else()
+        # Append the exit status and -NOTFOUND so the result tests false
+        set(${_var} "${describe_output}-${describe_status}-NOTFOUND" PARENT_SCOPE)
+    endif()
+endfunction()
+
+# git_get_exact_tag(<var> [<additional arguments to git describe> ...])
+#
+# Calls git_describe() with --exact-match placed before the caller's extra
+# arguments and stores its result in <var> in the caller's scope.
+function(git_get_exact_tag _var)
+    git_describe(exact_tag --exact-match ${ARGN})
+    set(${_var} "${exact_tag}" PARENT_SCOPE)
+endfunction()
diff --git a/cmake/Modules/GetGitRevisionDescription.cmake.in b/cmake/Modules/GetGitRevisionDescription.cmake.in
new file mode 100644
index 0000000..888ce13
--- /dev/null
+++ b/cmake/Modules/GetGitRevisionDescription.cmake.in
@@ -0,0 +1,38 @@
+#
+# Internal file for GetGitRevisionDescription.cmake
+#
+# Requires CMake 2.6 or newer (uses the 'function' command)
+#
+# Original Author:
+# 2009-2010 Ryan Pavlik
+# http://academic.cleardefinition.com
+# Iowa State University HCI Graduate Program/VRAC
+#
+# Copyright Iowa State University 2009-2010.
+# Distributed under the Boost Software License, Version 1.0.
+# (See accompanying file LICENSE_1_0.txt or copy at
+# http://www.boost.org/LICENSE_1_0.txt)
+
+# Result variable: stays empty unless a hash (or a ref usable in its place) is found.
+set(HEAD_HASH)
+
+file(READ "@HEAD_FILE@" HEAD_CONTENTS LIMIT 1024)
+
+string(STRIP "${HEAD_CONTENTS}" HEAD_CONTENTS)
+if(HEAD_CONTENTS MATCHES "ref")
+    # named branch: HEAD holds "ref: <refname>"
+    string(REPLACE "ref: " "" HEAD_REF "${HEAD_CONTENTS}")
+    if(EXISTS "@GIT_DIR@/${HEAD_REF}")
+        # loose ref file, its content is read as the hash below
+        configure_file("@GIT_DIR@/${HEAD_REF}" "@GIT_DATA@/head-ref" COPYONLY)
+    elseif(EXISTS "@GIT_DIR@/logs/${HEAD_REF}")
+        # only the ref log exists: track it and report the ref name in place of a hash
+        configure_file("@GIT_DIR@/logs/${HEAD_REF}" "@GIT_DATA@/head-ref" COPYONLY)
+        set(HEAD_HASH "${HEAD_REF}")
+    elseif(EXISTS "@GIT_DIR@/packed-refs")
+        # No loose ref and no log: look the ref up in packed-refs, whose entries
+        # have the form "<hash> <refname>". The name is compared as a plain
+        # string so that regex characters in branch names cannot break the match.
+        configure_file("@GIT_DIR@/packed-refs" "@GIT_DATA@/packed-refs" COPYONLY)
+        file(STRINGS "@GIT_DATA@/packed-refs" PACKED_REF_LINES REGEX "^[0-9a-fA-F]+ ")
+        foreach(PACKED_REF_LINE ${PACKED_REF_LINES})
+            string(REGEX REPLACE "^[0-9a-fA-F]+ " "" PACKED_REF_NAME "${PACKED_REF_LINE}")
+            if("x${PACKED_REF_NAME}" STREQUAL "x${HEAD_REF}")
+                string(REGEX REPLACE " .*" "" HEAD_HASH "${PACKED_REF_LINE}")
+            endif()
+        endforeach()
+    endif()
+else()
+    # detached HEAD
+    configure_file("@GIT_DIR@/HEAD" "@GIT_DATA@/head-ref" COPYONLY)
+endif()
+
+# Read the copied ref only if one was produced; otherwise HEAD_HASH stays
+# empty instead of aborting the configure step on a missing file.
+if(NOT HEAD_HASH AND EXISTS "@GIT_DATA@/head-ref")
+    file(READ "@GIT_DATA@/head-ref" HEAD_HASH LIMIT 1024)
+    string(STRIP "${HEAD_HASH}" HEAD_HASH)
+endif()
diff --git a/src/duckscraperConfig.h.in b/src/duckscraperConfig.h.in
new file mode 100644
index 0000000..053575e
--- /dev/null
+++ b/src/duckscraperConfig.h.in
@@ -0,0 +1,11 @@
+#ifndef idE2B0CC679C2B47AD928F00D45AEBDCBD
+#define idE2B0CC679C2B47AD928F00D45AEBDCBD
+
+/* Template for the generated configuration header. The placeholders below are
+ * replaced by CMake's configure_file() with variables set in CMakeLists.txt. */
+
+/* User agent string: the product name followed by "/major.minor". */
+#define DEFAULT_USER_AGENT "@DEFAULT_USER_AGENT@/@PROJECT_VERSION_MAJOR@.@PROJECT_VERSION_MINOR@"
+/* Name of the CMake project, as a string literal. */
+#define PROGRAM_NAME "@PROJECT_NAME@"
+/* Version components, expanded as bare tokens rather than string literals. */
+#define VERSION_MAJOR @PROJECT_VERSION_MAJOR@
+#define VERSION_MINOR @PROJECT_VERSION_MINOR@
+/* Expanded unquoted so that it can be tested with #if. */
+#define VERSION_BETA @PROJECT_VERSION_BETA@
+/* Git HEAD revision reported by get_git_head_revision() at configure time. */
+#define VERSION_GIT "@PROJECT_VERSION_GIT@"
+
+#endif
diff --git a/src/htmlretrieve.cpp b/src/htmlretrieve.cpp
new file mode 100644
index 0000000..d5fcdfa
--- /dev/null
+++ b/src/htmlretrieve.cpp
@@ -0,0 +1,88 @@
+#include "htmlretrieve.hpp"
+#include "duckscraperConfig.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include <stdexcept>
+
+namespace duck {
+ namespace {
+ /// Runs HTML Tidy over @p html and returns the cleaned-up document.
+ ///
+ /// Tidy is configured for XML output with numeric entities, quiet mode and
+ /// warnings disabled. The input is handed to Tidy as a NUL-terminated string.
+ ///
+ /// @param html  Raw markup to clean.
+ /// @return      The bytes Tidy wrote to its output buffer (empty if none).
+ /// @throws std::runtime_error  if an option could not be set or any Tidy
+ ///         stage returned a negative response code; the message contains
+ ///         that code (-1 when configuration failed before parsing).
+ std::string cleanHTML(const std::string &html) {
+     // Initialize a Tidy document and a zeroed output buffer
+     TidyDoc tidyDoc = tidyCreate();
+     TidyBuffer tidyOutputBuffer;
+     std::memset(&tidyOutputBuffer, 0, sizeof(TidyBuffer));
+
+     // Configure Tidy
+     // The flags tell Tidy to output XML and disable showing warnings
+     const bool configSuccess = tidyOptSetBool(tidyDoc, TidyXmlOut, yes)
+         && tidyOptSetBool(tidyDoc, TidyQuiet, yes)
+         && tidyOptSetBool(tidyDoc, TidyNumEntities, yes)
+         && tidyOptSetBool(tidyDoc, TidyShowWarnings, no);
+
+     int tidyResponseCode = -1;
+
+     // Parse input
+     if (configSuccess)
+         tidyResponseCode = tidyParseString(tidyDoc, html.c_str());
+
+     // Process HTML
+     if (tidyResponseCode >= 0)
+         tidyResponseCode = tidyCleanAndRepair(tidyDoc);
+
+     // Output the HTML to our buffer
+     if (tidyResponseCode >= 0)
+         tidyResponseCode = tidySaveBuffer(tidyDoc, &tidyOutputBuffer);
+
+     // Copy the result out before releasing anything. bp is still the null
+     // pointer from the memset above if Tidy never wrote to the buffer, and a
+     // std::string must not be built from a null pointer.
+     std::string tidyResult;
+     if (tidyResponseCode >= 0 && tidyOutputBuffer.bp)
+         tidyResult.assign(reinterpret_cast<const char*>(tidyOutputBuffer.bp), tidyOutputBuffer.size);
+
+     // Release Tidy's memory on every path, including the error path below.
+     // The buffer is only freed when Tidy actually allocated storage for it.
+     if (tidyOutputBuffer.bp)
+         tidyBufFree(&tidyOutputBuffer);
+     tidyRelease(tidyDoc);
+
+     // Any errors from Tidy? Build the message with a stream: adding an int
+     // to a string literal offsets the pointer instead of appending the number.
+     if (tidyResponseCode < 0) {
+         std::ostringstream message;
+         message << "Tidy encountered an error while parsing an HTML response. Tidy response code: " << tidyResponseCode;
+         throw std::runtime_error(message.str());
+     }
+
+     return tidyResult;
+ }
+
+ /// Returns true when @p parUrl starts with the exact, lower-case prefix
+ /// "https://"; shorter strings and any other scheme yield false.
+ bool isHttps (const std::string& parUrl) {
+     const std::string scheme("https://");
+     // compare() clamps the range to the URL's length, so a URL shorter than
+     // the prefix simply compares unequal.
+     return parUrl.compare(0, scheme.size(), scheme) == 0;
+ }
+ } //unnamed namespace
+
+ /// Downloads @p parSource with a curl easy handle and returns the response
+ /// body after passing it through cleanHTML().
+ ///
+ /// @param parSource         URL to fetch.
+ /// @param parUserAgent      Value sent as the user agent.
+ /// @param parSslVerifyPeer  Passed to CURLOPT_SSL_VERIFYPEER for URLs that isHttps() accepts.
+ /// @param parSslVerifyHost  Passed to CURLOPT_SSL_VERIFYHOST for URLs that isHttps() accepts.
+ /// @return                  The cleaned document produced by cleanHTML().
+ ///
+ /// Errors raised by perform() are not handled here (the try/catch below is
+ /// commented out), nor is anything thrown by cleanHTML().
+ std::string getCleanHtml (const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost) {
+ // The response body is written into this stream by the easy handle
+ std::ostringstream oss;
+ curl::curl_easy easy(oss);
+ easy.add(curl::curl_pair(CURLOPT_URL, parSource));
+ // The SSL options are left at their defaults unless the URL starts with "https://"
+ if (isHttps(parSource)) {
+ easy.add(curl::curl_pair(CURLOPT_SSL_VERIFYPEER, parSslVerifyPeer));
+ // NOTE(review): libcurl documents 0 and 2 as the values for
+ // CURLOPT_SSL_VERIFYHOST; a bool yields 0 or 1 - confirm how 1 is handled.
+ easy.add(curl::curl_pair(CURLOPT_SSL_VERIFYHOST, parSslVerifyHost));
+ }
+ easy.add(curl::curl_pair(CURLOPT_USERAGENT, parUserAgent));
+ // Follow HTTP redirects
+ easy.add(curl::curl_pair(CURLOPT_FOLLOWLOCATION, 1L));
+
+ //try {
+ easy.perform();
+ //}
+ //catch (curl_error& err) {
+ //std::stack > errors = err.what();
+ //err.print_traceback();
+ //return 1;
+ //}
+ return cleanHTML(oss.str());
+ }
+
+ /// Convenience overload: forwards to the four-argument getCleanHtml() with
+ /// DEFAULT_USER_AGENT as the user agent.
+ std::string getCleanHtml (const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost) {
+     const std::string defaultUserAgent(DEFAULT_USER_AGENT);
+     return getCleanHtml(parSource, defaultUserAgent, parSslVerifyPeer, parSslVerifyHost);
+ }
+} //namespace duck
diff --git a/src/htmlretrieve.hpp b/src/htmlretrieve.hpp
new file mode 100644
index 0000000..08d48a2
--- /dev/null
+++ b/src/htmlretrieve.hpp
@@ -0,0 +1,11 @@
+#ifndef idC6776D903059465191FFB64FCFD6B86A
+#define idC6776D903059465191FFB64FCFD6B86A
+
+#include
+
+namespace duck {
+ /// Fetches parSource with the given user agent and returns the response
+ /// after it has been cleaned up by HTML Tidy. The two SSL flags are applied
+ /// only to URLs starting with "https://".
+ std::string getCleanHtml ( const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost );
+ /// Same as above, using the DEFAULT_USER_AGENT from the generated configuration header.
+ std::string getCleanHtml ( const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost );
+} //namespace duck
+
+#endif
diff --git a/src/main.cpp b/src/main.cpp
index f1c8c88..58fde18 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1,161 +1,18 @@
+#include "htmlretrieve.hpp"
+#include "duckscraperConfig.h"
#include
-#include
-#include
-#include
#include
#include
#include
-#include
-#include
-#include
-
-namespace {
- const char g_testData[] = {
- "\n"
- " \n"
- "LibReSSL \n"
- " \n"
- " \n"
- " \n"
- " \n"
- " \n"
- "\n"
- "\n"
- "\n"
- "\n"
- "\n"
- " \n"
- "\n"
- "LibReSSL is a FREE version of the SSL/TLS protocol\n"
- "forked from OpenSSL
\n"
- "\n"
- "At the moment we are too\n"
- "busy deleting and rewriting code to make a decent web page. No we\n"
- "don't want help making web pages, thank you.\n"
- "
\n"
- "Check back here soon for updates.
\n"
- " \n"
- "\n"
- "\n"
- "\n"
- " \n"
- "\n"
- "\n"
- "\n"
- "For OpenBSD \n"
- " \n"
- " \n"
- "\n"
- " \n"
- "\n"
- "\n"
- "\n"
- "For other OS's \n"
- " \n"
- " \n"
- "\n"
- "
\n"
- "
\n"
- "\n"
- "Multi OS support will happen once we have\n"
- "
\n"
- "\n"
- "Flensed ,\n"
- "refactored, rewritten, and fixed enough of the code so we have stable\n"
- "baseline that we trust and can be maintained/improved. \n"
- "The right Portability team in place. \n"
- "A Stable Commitment of\n"
- "Funding to support an increased development and porting\n"
- "effort. \n"
- " \n"
- "\n"
- "We know you all want this tomorrow. We are working as fast as we can\n"
- "but our primary focus is good software that we trust to run\n"
- "ourselves. We don't want to break your heart .\n"
- "
\n"
- "\n"
- "\n"
- "\n"
- "\n"
- "\n"
- "\n"
- };
-
- std::string cleanHTML(const std::string &html){
- // Initialize a Tidy document
- TidyDoc tidyDoc = tidyCreate();
- TidyBuffer tidyOutputBuffer;
- std::memset(&tidyOutputBuffer, 0, sizeof(TidyBuffer));
-
- // Configure Tidy
- // The flags tell Tidy to output XML and disable showing warnings
- bool configSuccess = tidyOptSetBool(tidyDoc, TidyXmlOut, yes)
- && tidyOptSetBool(tidyDoc, TidyQuiet, yes)
- && tidyOptSetBool(tidyDoc, TidyNumEntities, yes)
- && tidyOptSetBool(tidyDoc, TidyShowWarnings, no);
-
- int tidyResponseCode = -1;
-
- // Parse input
- if (configSuccess)
- tidyResponseCode = tidyParseString(tidyDoc, html.c_str());
-
- // Process HTML
- if (tidyResponseCode >= 0)
- tidyResponseCode = tidyCleanAndRepair(tidyDoc);
-
- // Output the HTML to our buffer
- if (tidyResponseCode >= 0)
- tidyResponseCode = tidySaveBuffer(tidyDoc, &tidyOutputBuffer);
-
- // Any errors from Tidy?
- if (tidyResponseCode < 0)
- throw ("Tidy encountered an error while parsing an HTML response. Tidy response code: " + tidyResponseCode);
-
- // Grab the result from the buffer and then free Tidy's memory
- std::string tidyResult = (char*)tidyOutputBuffer.bp;
- tidyBufFree(&tidyOutputBuffer);
- tidyRelease(tidyDoc);
-
- return tidyResult;
- }
-} //unnamed namespace
int main (int argc, char* argv[]) {
- std::string tidyHtml;
if (argc != 3) {
+ std::cerr << PROGRAM_NAME << "v" << VERSION_MAJOR << "." << VERSION_MINOR;
+#if VERSION_BETA
+ std::cerr << "b";
+#endif
+ std::cerr << " git revision " << VERSION_GIT << "\n";
+ std::cerr << "Default user agent is \"" << DEFAULT_USER_AGENT << "\"\n";
std::cerr << "Usage: scraper " << std::endl;
return 2;
}
@@ -166,22 +23,7 @@ int main (int argc, char* argv[]) {
std::cout << "URL : " << url << "\n";
std::cout << "XPath: " << xpath << std::endl;
- {
- std::ostringstream oss;
- curl::curl_easy easy(oss);
- easy.add(curl::curl_pair(CURLOPT_URL, url));
- easy.add(curl::curl_pair(CURLOPT_USERAGENT, "duckscraper"));
- easy.add(curl::curl_pair(CURLOPT_FOLLOWLOCATION, 1L));
- try {
- easy.perform();
- }
- catch (curl_error& err) {
- std::stack > errors = err.what();
- err.print_traceback();
- return 1;
- }
- tidyHtml = cleanHTML(oss.str());
- }
+ std::string tidyHtml = duck::getCleanHtml(url, false, false);
{
pugi::xml_document doc;