From 0e077a4930ed29fc5a4fe2e79c83cbb80d0d646e Mon Sep 17 00:00:00 2001 From: King_DuckZ Date: Sat, 7 Jun 2014 22:07:13 +0200 Subject: [PATCH] Refactoring to put html retrieval & cleaning into a separate file. This version should also be capable of retrieving data from https urls. --- CMakeLists.txt | 19 +- cmake/Modules/GetGitRevisionDescription.cmake | 123 ++++++++++++ .../GetGitRevisionDescription.cmake.in | 38 ++++ src/duckscraperConfig.h.in | 11 ++ src/htmlretrieve.cpp | 88 +++++++++ src/htmlretrieve.hpp | 11 ++ src/main.cpp | 176 +----------------- 7 files changed, 298 insertions(+), 168 deletions(-) create mode 100644 cmake/Modules/GetGitRevisionDescription.cmake create mode 100644 cmake/Modules/GetGitRevisionDescription.cmake.in create mode 100644 src/duckscraperConfig.h.in create mode 100644 src/htmlretrieve.cpp create mode 100644 src/htmlretrieve.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 46e33a2..d649a2a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,23 +1,40 @@ cmake_minimum_required(VERSION 2.8 FATAL_ERROR) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/") -project(scraper CXX) +project(duckscraper CXX) add_subdirectory(lib/tidy) add_subdirectory(lib/curlcpp) +include(GetGitRevisionDescription) find_package(PugiXML REQUIRED) set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -std=c++11 -Wall -Wextra -g -O0 -fno-omit-frame-pointer") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -std=c++11 -Wall -Wextra -g -O3 -fomit-frame-pointer") +set(DEFAULT_USER_AGENT "DuckScraper") +set(PROJECT_VERSION_MAJOR "0") +set(PROJECT_VERSION_MINOR "1") +set(PROJECT_VERSION_BETA "1") +get_git_head_revision(GIT_REFSPEC PROJECT_VERSION_GIT) + +configure_file( + "${PROJECT_SOURCE_DIR}/src/${PROJECT_NAME}Config.h.in" + "${PROJECT_BINARY_DIR}/${PROJECT_NAME}Config.h" +) + include_directories(SYSTEM lib/tidy/include ${PUGIXML_INCLUDE_DIR} ${CURLCPP_SOURCE_DIR}/include ) +include_directories( + src/ + "${PROJECT_BINARY_DIR}" +) add_executable(${PROJECT_NAME} src/main.cpp + src/htmlretrieve.cpp ) target_link_libraries(${PROJECT_NAME} diff --git a/cmake/Modules/GetGitRevisionDescription.cmake b/cmake/Modules/GetGitRevisionDescription.cmake new file mode 100644 index 0000000..1bf0230 --- /dev/null +++ b/cmake/Modules/GetGitRevisionDescription.cmake @@ -0,0 +1,123 @@ +# - Returns a version string from Git +# +# These functions force a re-configure on each git commit so that you can +# trust the values of the variables in your build system. +# +# get_git_head_revision( [ ...]) +# +# Returns the refspec and sha hash of the current head revision +# +# git_describe( [ ...]) +# +# Returns the results of git describe on the source tree, and adjusting +# the output so that it tests false if an error occurs. +# +# git_get_exact_tag( [ ...]) +# +# Returns the results of git describe --exact-match on the source tree, +# and adjusting the output so that it tests false if there was no exact +# matching tag. +# +# Requires CMake 2.6 or newer (uses the 'function' command) +# +# Original Author: +# 2009-2010 Ryan Pavlik +# http://academic.cleardefinition.com +# Iowa State University HCI Graduate Program/VRAC +# +# Copyright Iowa State University 2009-2010. +# Distributed under the Boost Software License, Version 1.0. +# (See accompanying file LICENSE_1_0.txt or copy at +# http://www.boost.org/LICENSE_1_0.txt) + +if(__get_git_revision_description) + return() +endif() +set(__get_git_revision_description YES) + +# We must run the following at "include" time, not at function call time, +# to find the path to this module rather than the path to a calling list file +get_filename_component(_gitdescmoddir ${CMAKE_CURRENT_LIST_FILE} PATH) + +function(get_git_head_revision _refspecvar _hashvar) + set(GIT_PARENT_DIR "${CMAKE_SOURCE_DIR}") + set(GIT_DIR "${GIT_PARENT_DIR}/.git") + while(NOT EXISTS "${GIT_DIR}") # .git dir not found, search parent directories + set(GIT_PREVIOUS_PARENT "${GIT_PARENT_DIR}") + get_filename_component(GIT_PARENT_DIR ${GIT_PARENT_DIR} PATH) + if(GIT_PARENT_DIR STREQUAL GIT_PREVIOUS_PARENT) + # We have reached the root directory, we are not in git + set(${_refspecvar} "GITDIR-NOTFOUND" PARENT_SCOPE) + set(${_hashvar} "GITDIR-NOTFOUND" PARENT_SCOPE) + return() + endif() + set(GIT_DIR "${GIT_PARENT_DIR}/.git") + endwhile() + set(GIT_DATA "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/git-data") + if(NOT EXISTS "${GIT_DATA}") + file(MAKE_DIRECTORY "${GIT_DATA}") + endif() + + if(NOT EXISTS "${GIT_DIR}/HEAD") + return() + endif() + set(HEAD_FILE "${GIT_DATA}/HEAD") + configure_file("${GIT_DIR}/HEAD" "${HEAD_FILE}" COPYONLY) + + configure_file("${_gitdescmoddir}/GetGitRevisionDescription.cmake.in" + "${GIT_DATA}/grabRef.cmake" + @ONLY) + include("${GIT_DATA}/grabRef.cmake") + + set(${_refspecvar} "${HEAD_REF}" PARENT_SCOPE) + set(${_hashvar} "${HEAD_HASH}" PARENT_SCOPE) +endfunction() + +function(git_describe _var) + if(NOT GIT_FOUND) + find_package(Git QUIET) + endif() + get_git_head_revision(refspec hash) + if(NOT GIT_FOUND) + set(${_var} "GIT-NOTFOUND" PARENT_SCOPE) + return() + endif() + if(NOT hash) + set(${_var} "HEAD-HASH-NOTFOUND" PARENT_SCOPE) + return() + endif() + + # TODO sanitize + #if((${ARGN}" MATCHES "&&") OR + # (ARGN MATCHES "||") OR + # (ARGN MATCHES "\\;")) + # message("Please report the following error to the project!") + # message(FATAL_ERROR "Looks like someone's doing something nefarious with git_describe! Passed arguments ${ARGN}") + #endif() + + #message(STATUS "Arguments to execute_process: ${ARGN}") + + execute_process(COMMAND + "${GIT_EXECUTABLE}" + describe + ${hash} + ${ARGN} + WORKING_DIRECTORY + "${CMAKE_SOURCE_DIR}" + RESULT_VARIABLE + res + OUTPUT_VARIABLE + out + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) + if(NOT res EQUAL 0) + set(out "${out}-${res}-NOTFOUND") + endif() + + set(${_var} "${out}" PARENT_SCOPE) +endfunction() + +function(git_get_exact_tag _var) + git_describe(out --exact-match ${ARGN}) + set(${_var} "${out}" PARENT_SCOPE) +endfunction() diff --git a/cmake/Modules/GetGitRevisionDescription.cmake.in b/cmake/Modules/GetGitRevisionDescription.cmake.in new file mode 100644 index 0000000..888ce13 --- /dev/null +++ b/cmake/Modules/GetGitRevisionDescription.cmake.in @@ -0,0 +1,38 @@ +# +# Internal file for GetGitRevisionDescription.cmake +# +# Requires CMake 2.6 or newer (uses the 'function' command) +# +# Original Author: +# 2009-2010 Ryan Pavlik +# http://academic.cleardefinition.com +# Iowa State University HCI Graduate Program/VRAC +# +# Copyright Iowa State University 2009-2010. +# Distributed under the Boost Software License, Version 1.0. +# (See accompanying file LICENSE_1_0.txt or copy at +# http://www.boost.org/LICENSE_1_0.txt) + +set(HEAD_HASH) + +file(READ "@HEAD_FILE@" HEAD_CONTENTS LIMIT 1024) + +string(STRIP "${HEAD_CONTENTS}" HEAD_CONTENTS) +if(HEAD_CONTENTS MATCHES "ref") + # named branch + string(REPLACE "ref: " "" HEAD_REF "${HEAD_CONTENTS}") + if(EXISTS "@GIT_DIR@/${HEAD_REF}") + configure_file("@GIT_DIR@/${HEAD_REF}" "@GIT_DATA@/head-ref" COPYONLY) + elseif(EXISTS "@GIT_DIR@/logs/${HEAD_REF}") + configure_file("@GIT_DIR@/logs/${HEAD_REF}" "@GIT_DATA@/head-ref" COPYONLY) + set(HEAD_HASH "${HEAD_REF}") + endif() +else() + # detached HEAD + configure_file("@GIT_DIR@/HEAD" "@GIT_DATA@/head-ref" COPYONLY) +endif() + +if(NOT HEAD_HASH) + file(READ "@GIT_DATA@/head-ref" HEAD_HASH LIMIT 1024) + string(STRIP "${HEAD_HASH}" HEAD_HASH) +endif() diff --git a/src/duckscraperConfig.h.in b/src/duckscraperConfig.h.in new file mode 100644 index 0000000..053575e --- /dev/null +++ b/src/duckscraperConfig.h.in @@ -0,0 +1,11 @@ +#ifndef idE2B0CC679C2B47AD928F00D45AEBDCBD +#define idE2B0CC679C2B47AD928F00D45AEBDCBD + +#define DEFAULT_USER_AGENT "@DEFAULT_USER_AGENT@/@PROJECT_VERSION_MAJOR@.@PROJECT_VERSION_MINOR@" +#define PROGRAM_NAME "@PROJECT_NAME@" +#define VERSION_MAJOR @PROJECT_VERSION_MAJOR@ +#define VERSION_MINOR @PROJECT_VERSION_MINOR@ +#define VERSION_BETA @PROJECT_VERSION_BETA@ +#define VERSION_GIT "@PROJECT_VERSION_GIT@" + +#endif diff --git a/src/htmlretrieve.cpp b/src/htmlretrieve.cpp new file mode 100644 index 0000000..d5fcdfa --- /dev/null +++ b/src/htmlretrieve.cpp @@ -0,0 +1,88 @@ +#include "htmlretrieve.hpp" +#include "duckscraperConfig.h" +#include +#include +#include +#include +#include +#include +#include +#include + +namespace duck { + namespace { + std::string cleanHTML(const std::string &html) { + // Initialize a Tidy document + TidyDoc tidyDoc = tidyCreate(); + TidyBuffer tidyOutputBuffer; + std::memset(&tidyOutputBuffer, 0, sizeof(TidyBuffer)); + + // Configure Tidy + // The flags tell Tidy to output XML and disable showing warnings + bool configSuccess = tidyOptSetBool(tidyDoc, TidyXmlOut, yes) + && tidyOptSetBool(tidyDoc, TidyQuiet, yes) + && tidyOptSetBool(tidyDoc, TidyNumEntities, yes) + && tidyOptSetBool(tidyDoc, TidyShowWarnings, no); + + int tidyResponseCode = -1; + + // Parse input + if (configSuccess) + tidyResponseCode = tidyParseString(tidyDoc, html.c_str()); + + // Process HTML + if (tidyResponseCode >= 0) + tidyResponseCode = tidyCleanAndRepair(tidyDoc); + + // Output the HTML to our buffer + if (tidyResponseCode >= 0) + tidyResponseCode = tidySaveBuffer(tidyDoc, &tidyOutputBuffer); + + // Any errors from Tidy? + if (tidyResponseCode < 0) + throw ("Tidy encountered an error while parsing an HTML response. Tidy response code: " + tidyResponseCode); + + // Grab the result from the buffer and then free Tidy's memory + std::string tidyResult = (char*)tidyOutputBuffer.bp; + tidyBufFree(&tidyOutputBuffer); + tidyRelease(tidyDoc); + + return tidyResult; + } + + bool isHttps (const std::string& parUrl) { + const char protocol[] = "https://"; + const size_t protocolLen = sizeof(protocol) / sizeof(protocol[0]) - 1; + if (parUrl.size() < protocolLen) + return false; + + return std::equal(protocol, protocol + protocolLen, parUrl.begin()); + } + } //unnamed namespace + + std::string getCleanHtml (const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost) { + std::ostringstream oss; + curl::curl_easy easy(oss); + easy.add(curl::curl_pair(CURLOPT_URL, parSource)); + if (isHttps(parSource)) { + easy.add(curl::curl_pair(CURLOPT_SSL_VERIFYPEER, parSslVerifyPeer)); + easy.add(curl::curl_pair(CURLOPT_SSL_VERIFYHOST, parSslVerifyHost)); + } + easy.add(curl::curl_pair(CURLOPT_USERAGENT, parUserAgent)); + easy.add(curl::curl_pair(CURLOPT_FOLLOWLOCATION, 1L)); + + //try { + easy.perform(); + //} + //catch (curl_error& err) { + //std::stack > errors = err.what(); + //err.print_traceback(); + //return 1; + //} + return cleanHTML(oss.str()); + } + + std::string getCleanHtml (const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost) { + return getCleanHtml(parSource, std::string(DEFAULT_USER_AGENT), parSslVerifyPeer, parSslVerifyHost); + } +} //namespace duck diff --git a/src/htmlretrieve.hpp b/src/htmlretrieve.hpp new file mode 100644 index 0000000..08d48a2 --- /dev/null +++ b/src/htmlretrieve.hpp @@ -0,0 +1,11 @@ +#ifndef idC6776D903059465191FFB64FCFD6B86A +#define idC6776D903059465191FFB64FCFD6B86A + +#include + +namespace duck { + std::string getCleanHtml ( const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost ); + std::string getCleanHtml ( const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost ); +} //namespace duck + +#endif diff --git a/src/main.cpp b/src/main.cpp index f1c8c88..58fde18 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,161 +1,18 @@ +#include "htmlretrieve.hpp" +#include "duckscraperConfig.h" #include -#include -#include -#include #include #include #include -#include -#include -#include - -namespace { - const char g_testData[] = { - "\n" - "\n" - "LibReSSL\n" - "\n" - "\n" - "\n" - "\n" - "\n" - "\n" - "\n" - "
\n" - "
\n" - "
\n" - "

LibReSSL

\n" - "
\n" - "
\n" - "
\n" - "

LibReSSL is a FREE version of the SSL/TLS protocol\n" - "forked from OpenSSL

\n" - "

\n" - "At the moment we are too\n" - "busy deleting and rewriting code to make a decent web page. No we\n" - "don't want help making web pages, thank you.\n" - "

\n" - "

Check back here soon for updates.

\n" - "
\n" - "
\n" - "
\n" - "
\n" - "

Resources:

\n" - "
\n" - "
\n" - "
\n" - "
\n" - "
\n" - "

For OpenBSD

\n" - "
\n" - "
\n" - "\n" - "
\n" - "
\n" - "
\n" - "
\n" - "

For other OS's

\n" - "
\n" - "
\n" - "

\n" - "

\n" - "

\n" - "

\n" - "Multi OS support will happen once we have\n" - "

\n" - "
    \n" - "
  • Flensed,\n" - "refactored, rewritten, and fixed enough of the code so we have stable\n" - "baseline that we trust and can be maintained/improved.
  • \n" - "
  • The right Portability team in place.
  • \n" - "
  • A Stable Commitment of\n" - "Funding to support an increased development and porting\n" - "effort.
  • \n" - "
\n" - "

\n" - "We know you all want this tomorrow. We are working as fast as we can\n" - "but our primary focus is good software that we trust to run\n" - "ourselves. We don't want to break your heart.\n" - "

\n" - "
\n" - "
\n" - "
\n" - "

\n" - "LibReSSL is primarily developed by the OpenBSD Project, and its first\n" - "inclusion into an operating system will be in OpenBSD 5.6.\n" - "

\n" - "

\n" - "LibReSSL is supported financially by The OpenBSD Foundation as\n" - "well as by the The\n" - "OpenBSD Project. Please consider donating to\n" - "support our efforts.
\n" - "

\n" - "
\n" - "\n" - "
\n" - "\n" - }; - - std::string cleanHTML(const std::string &html){ - // Initialize a Tidy document - TidyDoc tidyDoc = tidyCreate(); - TidyBuffer tidyOutputBuffer; - std::memset(&tidyOutputBuffer, 0, sizeof(TidyBuffer)); - - // Configure Tidy - // The flags tell Tidy to output XML and disable showing warnings - bool configSuccess = tidyOptSetBool(tidyDoc, TidyXmlOut, yes) - && tidyOptSetBool(tidyDoc, TidyQuiet, yes) - && tidyOptSetBool(tidyDoc, TidyNumEntities, yes) - && tidyOptSetBool(tidyDoc, TidyShowWarnings, no); - - int tidyResponseCode = -1; - - // Parse input - if (configSuccess) - tidyResponseCode = tidyParseString(tidyDoc, html.c_str()); - - // Process HTML - if (tidyResponseCode >= 0) - tidyResponseCode = tidyCleanAndRepair(tidyDoc); - - // Output the HTML to our buffer - if (tidyResponseCode >= 0) - tidyResponseCode = tidySaveBuffer(tidyDoc, &tidyOutputBuffer); - - // Any errors from Tidy? - if (tidyResponseCode < 0) - throw ("Tidy encountered an error while parsing an HTML response. Tidy response code: " + tidyResponseCode); - - // Grab the result from the buffer and then free Tidy's memory - std::string tidyResult = (char*)tidyOutputBuffer.bp; - tidyBufFree(&tidyOutputBuffer); - tidyRelease(tidyDoc); - - return tidyResult; - } -} //unnamed namespace int main (int argc, char* argv[]) { - std::string tidyHtml; if (argc != 3) { + std::cerr << PROGRAM_NAME << "v" << VERSION_MAJOR << "." << VERSION_MINOR; +#if VERSION_BETA + std::cerr << "b"; +#endif + std::cerr << " git revision " << VERSION_GIT << "\n"; + std::cerr << "Default user agent is \"" << DEFAULT_USER_AGENT << "\"\n"; std::cerr << "Usage: scraper " << std::endl; return 2; } @@ -166,22 +23,7 @@ int main (int argc, char* argv[]) { std::cout << "URL : " << url << "\n"; std::cout << "XPath: " << xpath << std::endl; - { - std::ostringstream oss; - curl::curl_easy easy(oss); - easy.add(curl::curl_pair(CURLOPT_URL, url)); - easy.add(curl::curl_pair(CURLOPT_USERAGENT, "duckscraper")); - easy.add(curl::curl_pair(CURLOPT_FOLLOWLOCATION, 1L)); - try { - easy.perform(); - } - catch (curl_error& err) { - std::stack > errors = err.what(); - err.print_traceback(); - return 1; - } - tidyHtml = cleanHTML(oss.str()); - } + std::string tidyHtml = duck::getCleanHtml(url, false, false); { pugi::xml_document doc;