Refactoring to put HTML retrieval & cleaning into a separate file.

This version should also be capable of retrieving data from HTTPS URLs.
This commit is contained in:
King_DuckZ 2014-06-07 22:07:13 +02:00
parent cb00e484fa
commit 0e077a4930
7 changed files with 298 additions and 168 deletions

View file

@ -1,23 +1,40 @@
# Top-level build script: configures the bundled libraries, generates the
# version/config header and declares the scraper executable.
cmake_minimum_required(VERSION 2.8 FATAL_ERROR)
# Make the helper modules shipped under cmake/Modules/ visible to include()
# and find_package().
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
# NOTE(review): two project() calls appear back to back; presumably these are
# the old and the new side of a rename and only one should remain - confirm.
project(scraper CXX)
project(duckscraper CXX)
# Bundled third-party libraries built as part of this tree.
add_subdirectory(lib/tidy)
add_subdirectory(lib/curlcpp)
# Provides get_git_head_revision(), used below to stamp the git revision.
include(GetGitRevisionDescription)
find_package(PugiXML REQUIRED)
# Per-configuration compiler flags: C++11 with warnings in both; Debug is
# unoptimised and keeps frame pointers, Release uses -O3 and omits them.
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -std=c++11 -Wall -Wextra -g -O0 -fno-omit-frame-pointer")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -std=c++11 -Wall -Wextra -g -O3 -fomit-frame-pointer")
# Values substituted into the generated config header.
set(DEFAULT_USER_AGENT "DuckScraper")
set(PROJECT_VERSION_MAJOR "0")
set(PROJECT_VERSION_MINOR "1")
set(PROJECT_VERSION_BETA "1")
# Stores the current HEAD refspec in GIT_REFSPEC and its hash in
# PROJECT_VERSION_GIT.
get_git_head_revision(GIT_REFSPEC PROJECT_VERSION_GIT)
# Generate <project name>Config.h in the build tree from the template in src/.
configure_file(
"${PROJECT_SOURCE_DIR}/src/${PROJECT_NAME}Config.h.in"
"${PROJECT_BINARY_DIR}/${PROJECT_NAME}Config.h"
)
# Third-party headers are added as SYSTEM include directories.
include_directories(SYSTEM
lib/tidy/include
${PUGIXML_INCLUDE_DIR}
${CURLCPP_SOURCE_DIR}/include
)
# Project headers, plus the build tree where the generated config header lives.
include_directories(
src/
"${PROJECT_BINARY_DIR}"
)
# The executable is named after the project.
add_executable(${PROJECT_NAME}
src/main.cpp
src/htmlretrieve.cpp
)
target_link_libraries(${PROJECT_NAME}

View file

@ -0,0 +1,123 @@
# - Returns a version string from Git
#
# These functions force a re-configure on each git commit so that you can
# trust the values of the variables in your build system.
#
# get_git_head_revision(<refspecvar> <hashvar> [<additional arguments to git describe> ...])
#
# Returns the refspec and sha hash of the current head revision
#
# git_describe(<var> [<additional arguments to git describe> ...])
#
# Returns the results of git describe on the source tree, and adjusting
# the output so that it tests false if an error occurs.
#
# git_get_exact_tag(<var> [<additional arguments to git describe> ...])
#
# Returns the results of git describe --exact-match on the source tree,
# and adjusting the output so that it tests false if there was no exact
# matching tag.
#
# Requires CMake 2.6 or newer (uses the 'function' command)
#
# Original Author:
# 2009-2010 Ryan Pavlik <rpavlik@iastate.edu> <abiryan@ryand.net>
# http://academic.cleardefinition.com
# Iowa State University HCI Graduate Program/VRAC
#
# Copyright Iowa State University 2009-2010.
# Distributed under the Boost Software License, Version 1.0.
# (See accompanying file LICENSE_1_0.txt or copy at
# http://www.boost.org/LICENSE_1_0.txt)
# Include guard: if this module has already been processed in the current
# scope, stop here instead of redefining everything.
if(__get_git_revision_description)
return()
endif()
set(__get_git_revision_description YES)
# We must run the following at "include" time, not at function call time,
# to find the path to this module rather than the path to a calling list file
# (the directory is needed later to locate the .cmake.in template).
get_filename_component(_gitdescmoddir ${CMAKE_CURRENT_LIST_FILE} PATH)
# get_git_head_revision(<refspecvar> <hashvar>)
#
# Looks for a ".git" entry starting at CMAKE_SOURCE_DIR and walking up through
# the parent directories, copies its HEAD file into the build tree, then
# generates and includes a helper script (grabRef.cmake) that resolves HEAD.
#
#   _refspecvar - name of the caller variable that receives HEAD_REF
#   _hashvar    - name of the caller variable that receives HEAD_HASH
#
# Both outputs are set to "GITDIR-NOTFOUND" when no ".git" entry is found.
# The local names GIT_DIR, GIT_DATA and HEAD_FILE are substituted into the
# helper template, so they must not be renamed.
function(get_git_head_revision _refspecvar _hashvar)
set(GIT_PARENT_DIR "${CMAKE_SOURCE_DIR}")
set(GIT_DIR "${GIT_PARENT_DIR}/.git")
while(NOT EXISTS "${GIT_DIR}") # .git dir not found, search parent directories
set(GIT_PREVIOUS_PARENT "${GIT_PARENT_DIR}")
get_filename_component(GIT_PARENT_DIR ${GIT_PARENT_DIR} PATH)
# Taking the parent of the root yields the root again, which ends the search.
if(GIT_PARENT_DIR STREQUAL GIT_PREVIOUS_PARENT)
# We have reached the root directory, we are not in git
set(${_refspecvar} "GITDIR-NOTFOUND" PARENT_SCOPE)
set(${_hashvar} "GITDIR-NOTFOUND" PARENT_SCOPE)
return()
endif()
set(GIT_DIR "${GIT_PARENT_DIR}/.git")
endwhile()
# Scratch directory in the build tree for the copied git metadata.
set(GIT_DATA "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/git-data")
if(NOT EXISTS "${GIT_DATA}")
file(MAKE_DIRECTORY "${GIT_DATA}")
endif()
# NOTE(review): on this path the function returns without assigning either
# output variable, so the caller keeps whatever values it already had.
if(NOT EXISTS "${GIT_DIR}/HEAD")
return()
endif()
set(HEAD_FILE "${GIT_DATA}/HEAD")
# Snapshot HEAD into the build tree; per the module header, the intent is that
# a new commit forces a re-configure.
configure_file("${GIT_DIR}/HEAD" "${HEAD_FILE}" COPYONLY)
# Instantiate the helper script with the paths computed above (@ONLY keeps its
# ${...} references intact) and run it; it defines HEAD_REF and HEAD_HASH.
configure_file("${_gitdescmoddir}/GetGitRevisionDescription.cmake.in"
"${GIT_DATA}/grabRef.cmake"
@ONLY)
include("${GIT_DATA}/grabRef.cmake")
set(${_refspecvar} "${HEAD_REF}" PARENT_SCOPE)
set(${_hashvar} "${HEAD_HASH}" PARENT_SCOPE)
endfunction()
# git_describe(<var> [<additional arguments to git describe> ...])
#
# Runs "git describe <head hash> <extra args>" from CMAKE_SOURCE_DIR and
# stores the stripped output in <var>. Every failure yields a value ending in
# "-NOTFOUND", so the result tests false in if():
#   GIT-NOTFOUND               - no git executable was found
#   HEAD-HASH-NOTFOUND         - the HEAD hash could not be determined
#   <output>-<status>-NOTFOUND - git describe exited with a non-zero status
function(git_describe _var)
	# Only search for git when an earlier lookup has not already found it.
	if(NOT GIT_FOUND)
		find_package(Git QUIET)
	endif()

	get_git_head_revision(_head_refspec _head_hash)

	if(NOT GIT_FOUND)
		set(${_var} "GIT-NOTFOUND" PARENT_SCOPE)
		return()
	endif()

	if(NOT _head_hash)
		set(${_var} "HEAD-HASH-NOTFOUND" PARENT_SCOPE)
		return()
	endif()

	# TODO sanitize: the extra arguments in ARGN are forwarded to git as-is.
	execute_process(
		COMMAND "${GIT_EXECUTABLE}" describe ${_head_hash} ${ARGN}
		WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
		RESULT_VARIABLE _describe_status
		OUTPUT_VARIABLE _describe_output
		ERROR_QUIET
		OUTPUT_STRIP_TRAILING_WHITESPACE)

	if(_describe_status EQUAL 0)
		set(${_var} "${_describe_output}" PARENT_SCOPE)
	else()
		# Append the status and a NOTFOUND suffix so the value tests false.
		set(${_var} "${_describe_output}-${_describe_status}-NOTFOUND" PARENT_SCOPE)
	endif()
endfunction()
# git_get_exact_tag(<var> [<additional arguments to git describe> ...])
#
# Thin wrapper around git_describe() that adds --exact-match in front of the
# caller's extra arguments and hands the result back through <var>.
function(git_get_exact_tag _var)
	git_describe(_exact_tag --exact-match ${ARGN})
	set(${_var} "${_exact_tag}" PARENT_SCOPE)
endfunction()

View file

@ -0,0 +1,38 @@
#
# Internal file for GetGitRevisionDescription.cmake
#
# Requires CMake 2.6 or newer (uses the 'function' command)
#
# Original Author:
# 2009-2010 Ryan Pavlik <rpavlik@iastate.edu> <abiryan@ryand.net>
# http://academic.cleardefinition.com
# Iowa State University HCI Graduate Program/VRAC
#
# Copyright Iowa State University 2009-2010.
# Distributed under the Boost Software License, Version 1.0.
# (See accompanying file LICENSE_1_0.txt or copy at
# http://www.boost.org/LICENSE_1_0.txt)
# Resolves the HEAD snapshot taken by get_git_head_revision() into
#   HEAD_REF  - the ref HEAD points at (left untouched for a detached HEAD)
#   HEAD_HASH - the commit hash, or "HEAD-HASH-NOTFOUND" when it cannot be
#               determined (the value tests false in if())
# The @...@ placeholders are filled in when this template is configured.

# Start from a clean slate so a value leaking in from the including scope is
# never mistaken for a result.
set(HEAD_HASH)
# Set to TRUE only when this run produced a fresh head-ref file; prevents a
# stale file left over from an earlier configure from being read.
set(HEAD_REF_COPIED FALSE)

file(READ "@HEAD_FILE@" HEAD_CONTENTS LIMIT 1024)
string(STRIP "${HEAD_CONTENTS}" HEAD_CONTENTS)

if(HEAD_CONTENTS MATCHES "ref")
	# named branch
	string(REPLACE "ref: " "" HEAD_REF "${HEAD_CONTENTS}")
	if(EXISTS "@GIT_DIR@/${HEAD_REF}")
		# Loose ref: the file holds the hash. Copying it through configure_file
		# also makes CMake re-run when the branch moves.
		configure_file("@GIT_DIR@/${HEAD_REF}" "@GIT_DATA@/head-ref" COPYONLY)
		set(HEAD_REF_COPIED TRUE)
	else()
		# The ref may have been packed (e.g. by "git gc"); look it up in
		# packed-refs, whose entries have the form "<hash> <refname>".
		if(EXISTS "@GIT_DIR@/packed-refs")
			configure_file("@GIT_DIR@/packed-refs" "@GIT_DATA@/packed-refs" COPYONLY)
			file(STRINGS "@GIT_DATA@/packed-refs" PACKED_REF_LINES)
			foreach(PACKED_REF_LINE ${PACKED_REF_LINES})
				if("${PACKED_REF_LINE}" MATCHES "^([0-9a-fA-F]+) (.+)$")
					# Exact comparison: avoids regex metacharacters in branch
					# names and prefix matches against longer ref names.
					if("${CMAKE_MATCH_2}" STREQUAL "${HEAD_REF}")
						set(HEAD_HASH "${CMAKE_MATCH_1}")
					endif()
				endif()
			endforeach()
		endif()
		# Last resort, kept from the previous behaviour: when only the reflog
		# exists, track it for re-configuration and report the ref name.
		if(NOT HEAD_HASH AND EXISTS "@GIT_DIR@/logs/${HEAD_REF}")
			configure_file("@GIT_DIR@/logs/${HEAD_REF}" "@GIT_DATA@/head-ref" COPYONLY)
			set(HEAD_HASH "${HEAD_REF}")
		endif()
	endif()
else()
	# detached HEAD
	configure_file("@GIT_DIR@/HEAD" "@GIT_DATA@/head-ref" COPYONLY)
	set(HEAD_REF_COPIED TRUE)
endif()

if(NOT HEAD_HASH)
	if(HEAD_REF_COPIED)
		file(READ "@GIT_DATA@/head-ref" HEAD_HASH LIMIT 1024)
		string(STRIP "${HEAD_HASH}" HEAD_HASH)
	else()
		# Nothing to read (e.g. a repository without commits): report a value
		# that tests false instead of failing on a missing file.
		set(HEAD_HASH "HEAD-HASH-NOTFOUND")
	endif()
endif()

View file

@ -0,0 +1,11 @@
// Build-time configuration header. The placeholders below are filled in by
// CMake's configure_file() when the project is configured.
#ifndef idE2B0CC679C2B47AD928F00D45AEBDCBD
#define idE2B0CC679C2B47AD928F00D45AEBDCBD
// User agent string of the form "name/major.minor".
#define DEFAULT_USER_AGENT "@DEFAULT_USER_AGENT@/@PROJECT_VERSION_MAJOR@.@PROJECT_VERSION_MINOR@"
// Name of the CMake project, as a string literal.
#define PROGRAM_NAME "@PROJECT_NAME@"
// Version components, expanded without quotes so they can be used in
// preprocessor conditionals.
#define VERSION_MAJOR @PROJECT_VERSION_MAJOR@
#define VERSION_MINOR @PROJECT_VERSION_MINOR@
#define VERSION_BETA @PROJECT_VERSION_BETA@
// Git revision recorded at configure time, as a string literal.
#define VERSION_GIT "@PROJECT_VERSION_GIT@"
#endif

88
src/htmlretrieve.cpp Normal file
View file

@ -0,0 +1,88 @@
#include "htmlretrieve.hpp"
#include "duckscraperConfig.h"
#include <ciso646>
#include <tidy/tidy.h>
#include <tidy/buffio.h>
#include <sstream>
#include <curl_easy.h>
#include <cstring>
#include <stack>
#include <algorithm>
#include <stdexcept>
namespace duck {
namespace {
std::string cleanHTML(const std::string &html) {
// Initialize a Tidy document
TidyDoc tidyDoc = tidyCreate();
TidyBuffer tidyOutputBuffer;
std::memset(&tidyOutputBuffer, 0, sizeof(TidyBuffer));
// Configure Tidy
// The flags tell Tidy to output XML and disable showing warnings
bool configSuccess = tidyOptSetBool(tidyDoc, TidyXmlOut, yes)
&& tidyOptSetBool(tidyDoc, TidyQuiet, yes)
&& tidyOptSetBool(tidyDoc, TidyNumEntities, yes)
&& tidyOptSetBool(tidyDoc, TidyShowWarnings, no);
int tidyResponseCode = -1;
// Parse input
if (configSuccess)
tidyResponseCode = tidyParseString(tidyDoc, html.c_str());
// Process HTML
if (tidyResponseCode >= 0)
tidyResponseCode = tidyCleanAndRepair(tidyDoc);
// Output the HTML to our buffer
if (tidyResponseCode >= 0)
tidyResponseCode = tidySaveBuffer(tidyDoc, &tidyOutputBuffer);
// Any errors from Tidy?
if (tidyResponseCode < 0)
throw ("Tidy encountered an error while parsing an HTML response. Tidy response code: " + tidyResponseCode);
// Grab the result from the buffer and then free Tidy's memory
std::string tidyResult = (char*)tidyOutputBuffer.bp;
tidyBufFree(&tidyOutputBuffer);
tidyRelease(tidyDoc);
return tidyResult;
}
// Tells whether parUrl starts with the "https://" scheme prefix.
//
// URI schemes are case-insensitive, so "HTTPS://" and "Https://" are accepted
// as well. Only ASCII letters are folded, which keeps the check independent
// of the current locale.
//
// parUrl  - the URL to inspect; may be empty
// returns - true if the URL begins with "https://" in any letter case
bool isHttps (const std::string& parUrl) {
	const char protocol[] = "https://";
	// Number of characters in the prefix, without the terminating NUL
	const std::size_t protocolLen = sizeof(protocol) / sizeof(protocol[0]) - 1;

	// Too short to contain the prefix; also keeps std::equal within bounds
	if (parUrl.size() < protocolLen)
		return false;

	return std::equal(protocol, protocol + protocolLen, parUrl.begin(),
		[](char parExpected, char parActual) {
			const char lowered = (parActual >= 'A' && parActual <= 'Z')
				? static_cast<char>(parActual - 'A' + 'a')
				: parActual;
			return parExpected == lowered;
		}
	);
}
} //unnamed namespace
// Downloads parSource with libcurl and returns the body cleaned by Tidy.
//
// parSource        - URL to fetch; redirects are followed
// parUserAgent     - value sent as the User-Agent header
// parSslVerifyPeer - verify the server certificate (https URLs only)
// parSslVerifyHost - verify that the certificate matches the host name
//                    (https URLs only)
// returns          - the response body after it has gone through cleanHTML()
//
// Nothing is caught here: whatever the transfer or the cleaning step throws
// propagates to the caller.
std::string getCleanHtml (const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost) {
	std::ostringstream oss;
	curl::curl_easy easy(oss);

	easy.add(curl::curl_pair<CURLoption, std::string>(CURLOPT_URL, parSource));
	if (isHttps(parSource)) {
		// libcurl reads these options as long. For VERIFYHOST the value that
		// enables the host name check is 2, not 1.
		easy.add(curl::curl_pair<CURLoption, long>(CURLOPT_SSL_VERIFYPEER, parSslVerifyPeer ? 1L : 0L));
		easy.add(curl::curl_pair<CURLoption, long>(CURLOPT_SSL_VERIFYHOST, parSslVerifyHost ? 2L : 0L));
	}
	easy.add(curl::curl_pair<CURLoption, std::string>(CURLOPT_USERAGENT, parUserAgent));
	easy.add(curl::curl_pair<CURLoption, long>(CURLOPT_FOLLOWLOCATION, 1L));

	// The response body is written into oss
	easy.perform();

	return cleanHTML(oss.str());
}
std::string getCleanHtml (const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost) {
return getCleanHtml(parSource, std::string(DEFAULT_USER_AGENT), parSslVerifyPeer, parSslVerifyHost);
}
} //namespace duck

11
src/htmlretrieve.hpp Normal file
View file

@ -0,0 +1,11 @@
#ifndef idC6776D903059465191FFB64FCFD6B86A
#define idC6776D903059465191FFB64FCFD6B86A
#include <string>
namespace duck {
// Fetches parSource over the network (following redirects), sending
// parUserAgent as the User-Agent, and returns the response after it has been
// cleaned up by HTML Tidy. The two SSL flags control peer and host
// verification and are only applied to https URLs. Errors are reported
// through exceptions.
std::string getCleanHtml ( const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost );
// Same as above, using the project's default user agent (DEFAULT_USER_AGENT
// from the generated config header).
std::string getCleanHtml ( const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost );
} //namespace duck
#endif

View file

@ -1,161 +1,18 @@
#include "htmlretrieve.hpp"
#include "duckscraperConfig.h"
#include <iostream>
#include <ciso646>
#include <tidy/tidy.h>
#include <tidy/buffio.h>
#include <string>
#include <pugixml.hpp>
#include <sstream>
#include <curl_easy.h>
#include <cstring>
#include <stack>
namespace {
const char g_testData[] = {
"<!DOCTYPE html>\n"
"<html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n"
"<title>LibReSSL</title>\n"
"<link rel=\"stylesheet\" href=\"http://www.libressl.org/libre-ssl.css\">\n"
"<meta charset=\"utf-8\">\n"
"<meta name=\"resource-type\" content=\"document\">\n"
"<meta name=\"description\" content=\"the main LibreSSL page\">\n"
"<meta name=\"keywords\" content=\"LibreSSL,main\"/\n"
"<meta name=\"distribution\" content=\"global\">\n"
"</head>\n"
"<body>\n"
"<section class=\"libre-ssl\">\n"
"<hgroup>\n"
"<header>\n"
"<h1>LibReSSL</h1>\n"
"</header>\n"
"</hgroup>\n"
"<section class=\"introduction\">\n"
"<p>LibReSSL is a <strong>FREE</strong> version of the SSL/TLS protocol\n"
"forked from <a\n"
"href=\"http://www.youtube.com/watch?v=mRCGDUsdRDU\">OpenSSL</a></p>\n"
"<p>\n"
"At the moment we are <a\n"
"href=\"http://freshbsd.org/search?project=openbsd&q=file.name:libssl\">too\n"
"busy deleting and rewriting code</a> to make a decent web page. No we\n"
"don't want help making web pages, thank you.\n"
"</p>\n"
"<p>Check back here soon for updates.</p>\n"
"</section>\n"
"<section class=\"resources\">\n"
"<hgroup>\n"
"<header>\n"
"<h2>Resources:</h2>\n"
"</header>\n"
"</hgroup>\n"
"<section class=\"open-bsd\">\n"
"<hgroup>\n"
"<header>\n"
"<h3>For OpenBSD</h3>\n"
"</header>\n"
"</hgroup>\n"
"<ul class=\"indent-list\">\n"
"<li><a href=\"http://www.openbsd.org/ftp.html\">FTP</a></li>\n"
"<li><a href=\"http://www.openbsd.org/anoncvs.html\">AnonCVS</a></li>\n"
"<li><a href=\"http://www.openbsd.org/cgi-bin/cvsweb/src/lib/libssl\">CVSWeb</a></li>\n"
"</ul>\n"
"</section>\n"
"<section>\n"
"<hgroup>\n"
"<header>\n"
"<h3>For other OS's</h3>\n"
"</header>\n"
"</hgroup>\n"
"<p>\n"
"<ul class=\"indent-list\">\n"
"<li><a href=\"http://www.openbsd.org/papers/bsdcan14-libressl\">LibReSSL Presentation by Bob Beck from BSDCAN 2014</a></li>\n"
"</ul>\n"
"</p>\n"
"<p>\n"
"Multi OS support will happen once we have\n"
"</p>\n"
"<ul>\n"
"<li><a href=\"http://en.wikipedia.org/wiki/Flensing\">Flensed</a>,\n"
"refactored, rewritten, and fixed enough of the code so we have stable\n"
"baseline that we trust and can be maintained/improved.</li>\n"
"<li>The right Portability team in place.</li>\n"
"<li>A <a href=\"http://www.openbsdfoundation.org/\">Stable Commitment of\n"
"Funding</a> to support an increased development and porting\n"
"effort. </li>\n"
"</ul>\n"
"<p>\n"
"We know you all want this tomorrow. We are working as fast as we can\n"
"but our primary focus is good software that we trust to run\n"
"ourselves. <em>We don't want to break your heart</em>.\n"
"</p>\n"
"</section>\n"
"</section>\n"
"<section class=\"acknowledgements\">\n"
"<p>\n"
"LibReSSL is primarily developed by <a\n"
"href=\"http://www.openbsd.org/\">the OpenBSD Project</a>, and its first\n"
"inclusion into an operating system will be in <a\n"
"href=\"http://www.openbsd.org/56.html\">OpenBSD 5.6</a>.\n"
"</p>\n"
"<p>\n"
"LibReSSL is supported financially by <a\n"
"href=\"http://www.openbsdfoundation.org/\">The OpenBSD Foundation</a> as\n"
"well as by the <a href=\"http://www.openbsd.org/donations.html\">The\n"
"OpenBSD Project</a>. Please consider <a\n"
"href=\"http://www.openbsdfoundation.org/donations.html\">donating</a> to\n"
"support our efforts.<br>\n"
"</p>\n"
"</section>\n"
"<section class=\"we-love-web-devs-especially-those-that-write-blink-tags-for-us\">\n"
"<p>This page scientifically designed to annoy web hipsters. <a\n"
"href=\"http://www.openbsdfoundation.org/donations.html\">Donate now</a>\n"
"to stop the Comic Sans and Blink Tags</p>\n"
"</section>\n"
"</section>\n"
"</body></html>\n"
};
// Runs the given markup through HTML Tidy and returns the repaired document,
// serialised as XML with numeric entities.
std::string cleanHTML(const std::string &html){
// Initialize a Tidy document
TidyDoc tidyDoc = tidyCreate();
TidyBuffer tidyOutputBuffer;
// Start from an all-zero buffer structure
std::memset(&tidyOutputBuffer, 0, sizeof(TidyBuffer));
// Configure Tidy
// The flags tell Tidy to output XML and disable showing warnings
bool configSuccess = tidyOptSetBool(tidyDoc, TidyXmlOut, yes)
&& tidyOptSetBool(tidyDoc, TidyQuiet, yes)
&& tidyOptSetBool(tidyDoc, TidyNumEntities, yes)
&& tidyOptSetBool(tidyDoc, TidyShowWarnings, no);
// Stays negative when configuration failed, so every later step is skipped
int tidyResponseCode = -1;
// Parse input
if (configSuccess)
tidyResponseCode = tidyParseString(tidyDoc, html.c_str());
// Process HTML
if (tidyResponseCode >= 0)
tidyResponseCode = tidyCleanAndRepair(tidyDoc);
// Output the HTML to our buffer
if (tidyResponseCode >= 0)
tidyResponseCode = tidySaveBuffer(tidyDoc, &tidyOutputBuffer);
// Any errors from Tidy?
// NOTE(review): the expression below adds an int to a string literal, which
// offsets the pointer instead of appending the code to the message; with a
// negative code the thrown const char* points before the literal.
// NOTE(review): tidyDoc and tidyOutputBuffer are not released on this path.
if (tidyResponseCode < 0)
throw ("Tidy encountered an error while parsing an HTML response. Tidy response code: " + tidyResponseCode);
// Grab the result from the buffer and then free Tidy's memory
// NOTE(review): assumes bp is non-null and NUL-terminated here - confirm.
std::string tidyResult = (char*)tidyOutputBuffer.bp;
tidyBufFree(&tidyOutputBuffer);
tidyRelease(tidyDoc);
return tidyResult;
}
} //unnamed namespace
int main (int argc, char* argv[]) {
std::string tidyHtml;
if (argc != 3) {
std::cerr << PROGRAM_NAME << "v" << VERSION_MAJOR << "." << VERSION_MINOR;
#if VERSION_BETA
std::cerr << "b";
#endif
std::cerr << " git revision " << VERSION_GIT << "\n";
std::cerr << "Default user agent is \"" << DEFAULT_USER_AGENT << "\"\n";
std::cerr << "Usage: scraper <URL> <XPath>" << std::endl;
return 2;
}
@ -166,22 +23,7 @@ int main (int argc, char* argv[]) {
std::cout << "URL : " << url << "\n";
std::cout << "XPath: " << xpath << std::endl;
{
std::ostringstream oss;
curl::curl_easy easy(oss);
easy.add(curl::curl_pair<CURLoption, std::string>(CURLOPT_URL, url));
easy.add(curl::curl_pair<CURLoption, std::string>(CURLOPT_USERAGENT, "duckscraper"));
easy.add(curl::curl_pair<CURLoption, long>(CURLOPT_FOLLOWLOCATION, 1L));
try {
easy.perform();
}
catch (curl_error& err) {
std::stack<std::pair<std::string, std::string> > errors = err.what();
err.print_traceback();
return 1;
}
tidyHtml = cleanHTML(oss.str());
}
std::string tidyHtml = duck::getCleanHtml(url, false, false);
{
pugi::xml_document doc;