Refactor HTML retrieval and cleaning into a separate file.
This version should also be able to retrieve data from HTTPS URLs.
This commit is contained in:
parent
cb00e484fa
commit
0e077a4930
7 changed files with 298 additions and 168 deletions
|
@ -1,23 +1,40 @@
|
|||
cmake_minimum_required(VERSION 2.8 FATAL_ERROR)
|
||||
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
|
||||
project(scraper CXX)
|
||||
project(duckscraper CXX)
|
||||
|
||||
add_subdirectory(lib/tidy)
|
||||
add_subdirectory(lib/curlcpp)
|
||||
|
||||
include(GetGitRevisionDescription)
|
||||
find_package(PugiXML REQUIRED)
|
||||
|
||||
# Language level and warnings must apply to every configuration, including an
# empty CMAKE_BUILD_TYPE, RelWithDebInfo and MinSizeRel; previously only Debug
# and Release received -std=c++11.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Wextra")
# Per-configuration tuning: unoptimised with frame pointers for debugging,
# fully optimised (still with debug symbols) for release.
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g -O0 -fno-omit-frame-pointer")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -g -O3 -fomit-frame-pointer")
|
||||
|
||||
set(DEFAULT_USER_AGENT "DuckScraper")
|
||||
set(PROJECT_VERSION_MAJOR "0")
|
||||
set(PROJECT_VERSION_MINOR "1")
|
||||
set(PROJECT_VERSION_BETA "1")
|
||||
get_git_head_revision(GIT_REFSPEC PROJECT_VERSION_GIT)
|
||||
|
||||
configure_file(
|
||||
"${PROJECT_SOURCE_DIR}/src/${PROJECT_NAME}Config.h.in"
|
||||
"${PROJECT_BINARY_DIR}/${PROJECT_NAME}Config.h"
|
||||
)
|
||||
|
||||
include_directories(SYSTEM
|
||||
lib/tidy/include
|
||||
${PUGIXML_INCLUDE_DIR}
|
||||
${CURLCPP_SOURCE_DIR}/include
|
||||
)
|
||||
include_directories(
|
||||
src/
|
||||
"${PROJECT_BINARY_DIR}"
|
||||
)
|
||||
|
||||
add_executable(${PROJECT_NAME}
|
||||
src/main.cpp
|
||||
src/htmlretrieve.cpp
|
||||
)
|
||||
|
||||
target_link_libraries(${PROJECT_NAME}
|
||||
|
|
123
cmake/Modules/GetGitRevisionDescription.cmake
Normal file
123
cmake/Modules/GetGitRevisionDescription.cmake
Normal file
|
@ -0,0 +1,123 @@
|
|||
# - Returns a version string from Git
|
||||
#
|
||||
# These functions force a re-configure on each git commit so that you can
|
||||
# trust the values of the variables in your build system.
|
||||
#
|
||||
# get_git_head_revision(<refspecvar> <hashvar> [<additional arguments to git describe> ...])
|
||||
#
|
||||
# Returns the refspec and sha hash of the current head revision
|
||||
#
|
||||
# git_describe(<var> [<additional arguments to git describe> ...])
|
||||
#
|
||||
# Returns the results of git describe on the source tree, and adjusting
|
||||
# the output so that it tests false if an error occurs.
|
||||
#
|
||||
# git_get_exact_tag(<var> [<additional arguments to git describe> ...])
|
||||
#
|
||||
# Returns the results of git describe --exact-match on the source tree,
|
||||
# and adjusting the output so that it tests false if there was no exact
|
||||
# matching tag.
|
||||
#
|
||||
# Requires CMake 2.6 or newer (uses the 'function' command)
|
||||
#
|
||||
# Original Author:
|
||||
# 2009-2010 Ryan Pavlik <rpavlik@iastate.edu> <abiryan@ryand.net>
|
||||
# http://academic.cleardefinition.com
|
||||
# Iowa State University HCI Graduate Program/VRAC
|
||||
#
|
||||
# Copyright Iowa State University 2009-2010.
|
||||
# Distributed under the Boost Software License, Version 1.0.
|
||||
# (See accompanying file LICENSE_1_0.txt or copy at
|
||||
# http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
if(__get_git_revision_description)
|
||||
return()
|
||||
endif()
|
||||
set(__get_git_revision_description YES)
|
||||
|
||||
# We must run the following at "include" time, not at function call time,
|
||||
# to find the path to this module rather than the path to a calling list file
|
||||
get_filename_component(_gitdescmoddir ${CMAKE_CURRENT_LIST_FILE} PATH)
|
||||
|
||||
# get_git_head_revision(<refspecvar> <hashvar>)
#
# Locates the .git entry for the source tree and reports the current HEAD.
#   <refspecvar>  receives HEAD_REF as set by the generated grabRef.cmake
#   <hashvar>     receives HEAD_HASH as set by the generated grabRef.cmake
# Both are set to "GITDIR-NOTFOUND" when no .git entry exists in
# CMAKE_SOURCE_DIR or any of its parent directories.
# NOTE(review): when .git exists but contains no HEAD file the function
# returns without assigning either output variable.
function(get_git_head_revision _refspecvar _hashvar)
|
||||
set(GIT_PARENT_DIR "${CMAKE_SOURCE_DIR}")
|
||||
set(GIT_DIR "${GIT_PARENT_DIR}/.git")
|
||||
while(NOT EXISTS "${GIT_DIR}") # .git dir not found, search parent directories
|
||||
set(GIT_PREVIOUS_PARENT "${GIT_PARENT_DIR}")
|
||||
get_filename_component(GIT_PARENT_DIR ${GIT_PARENT_DIR} PATH)
|
||||
# The parent of the filesystem root is the root itself, so an unchanged
# value means the upward walk is exhausted.
if(GIT_PARENT_DIR STREQUAL GIT_PREVIOUS_PARENT)
|
||||
# We have reached the root directory, we are not in git
|
||||
set(${_refspecvar} "GITDIR-NOTFOUND" PARENT_SCOPE)
|
||||
set(${_hashvar} "GITDIR-NOTFOUND" PARENT_SCOPE)
|
||||
return()
|
||||
endif()
|
||||
set(GIT_DIR "${GIT_PARENT_DIR}/.git")
|
||||
endwhile()
|
||||
# Scratch directory inside the build tree that receives copies of the git
# metadata files and the generated helper script.
set(GIT_DATA "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/git-data")
|
||||
if(NOT EXISTS "${GIT_DATA}")
|
||||
file(MAKE_DIRECTORY "${GIT_DATA}")
|
||||
endif()
|
||||
|
||||
if(NOT EXISTS "${GIT_DIR}/HEAD")
|
||||
return()
|
||||
endif()
|
||||
set(HEAD_FILE "${GIT_DATA}/HEAD")
|
||||
# Copy HEAD via configure_file; per this module's header comment these copies
# are what force a re-configure on each git commit.
configure_file("${GIT_DIR}/HEAD" "${HEAD_FILE}" COPYONLY)
|
||||
|
||||
# Instantiate the helper template. It reads HEAD_FILE, GIT_DIR and GIT_DATA by
# name, so those variable names must not change.
configure_file("${_gitdescmoddir}/GetGitRevisionDescription.cmake.in"
|
||||
"${GIT_DATA}/grabRef.cmake"
|
||||
@ONLY)
|
||||
# Running the generated script defines HEAD_REF and HEAD_HASH in this scope.
include("${GIT_DATA}/grabRef.cmake")
|
||||
|
||||
set(${_refspecvar} "${HEAD_REF}" PARENT_SCOPE)
|
||||
set(${_hashvar} "${HEAD_HASH}" PARENT_SCOPE)
|
||||
endfunction()
|
||||
|
||||
# git_describe(<var> [<additional arguments to git describe> ...])
#
# Runs "git describe <head-hash> <extra args>" in CMAKE_SOURCE_DIR and stores
# the stripped output in <var>. Every failure is reported as a string ending
# in "-NOTFOUND":
#   "GIT-NOTFOUND"            no git executable was found
#   "HEAD-HASH-NOTFOUND"      get_git_head_revision produced no hash
#   "<out>-<res>-NOTFOUND"    git describe exited with non-zero status <res>
function(git_describe _var)
|
||||
if(NOT GIT_FOUND)
|
||||
find_package(Git QUIET)
|
||||
endif()
|
||||
# Only the hash is used below; refspec is requested because the helper takes
# two output variables.
get_git_head_revision(refspec hash)
|
||||
if(NOT GIT_FOUND)
|
||||
set(${_var} "GIT-NOTFOUND" PARENT_SCOPE)
|
||||
return()
|
||||
endif()
|
||||
if(NOT hash)
|
||||
set(${_var} "HEAD-HASH-NOTFOUND" PARENT_SCOPE)
|
||||
return()
|
||||
endif()
|
||||
|
||||
# TODO sanitize
|
||||
#if((${ARGN}" MATCHES "&&") OR
|
||||
# (ARGN MATCHES "||") OR
|
||||
# (ARGN MATCHES "\\;"))
|
||||
# message("Please report the following error to the project!")
|
||||
# message(FATAL_ERROR "Looks like someone's doing something nefarious with git_describe! Passed arguments ${ARGN}")
|
||||
#endif()
|
||||
|
||||
#message(STATUS "Arguments to execute_process: ${ARGN}")
|
||||
|
||||
# Describe the HEAD hash; the caller's extra arguments are appended unchanged.
# stderr is discarded, the exit status lands in res and stdout in out.
execute_process(COMMAND
|
||||
"${GIT_EXECUTABLE}"
|
||||
describe
|
||||
${hash}
|
||||
${ARGN}
|
||||
WORKING_DIRECTORY
|
||||
"${CMAKE_SOURCE_DIR}"
|
||||
RESULT_VARIABLE
|
||||
res
|
||||
OUTPUT_VARIABLE
|
||||
out
|
||||
ERROR_QUIET
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
# Append "-NOTFOUND" so the result tests false in if(), as promised in the
# module header.
if(NOT res EQUAL 0)
|
||||
set(out "${out}-${res}-NOTFOUND")
|
||||
endif()
|
||||
|
||||
set(${_var} "${out}" PARENT_SCOPE)
|
||||
endfunction()
|
||||
|
||||
# git_get_exact_tag(<var> [<additional arguments to git describe> ...])
#
# Thin wrapper around git_describe that adds --exact-match, so <var> only
# receives a usable value when HEAD sits exactly on a tag; otherwise it
# receives whatever "-NOTFOUND" string git_describe produced.
function(git_get_exact_tag _var)
	git_describe(_exact_tag --exact-match ${ARGN})
	set(${_var} "${_exact_tag}" PARENT_SCOPE)
endfunction()
|
38
cmake/Modules/GetGitRevisionDescription.cmake.in
Normal file
38
cmake/Modules/GetGitRevisionDescription.cmake.in
Normal file
|
@ -0,0 +1,38 @@
|
|||
#
|
||||
# Internal file for GetGitRevisionDescription.cmake
|
||||
#
|
||||
# Requires CMake 2.6 or newer (uses the 'function' command)
|
||||
#
|
||||
# Original Author:
|
||||
# 2009-2010 Ryan Pavlik <rpavlik@iastate.edu> <abiryan@ryand.net>
|
||||
# http://academic.cleardefinition.com
|
||||
# Iowa State University HCI Graduate Program/VRAC
|
||||
#
|
||||
# Copyright Iowa State University 2009-2010.
|
||||
# Distributed under the Boost Software License, Version 1.0.
|
||||
# (See accompanying file LICENSE_1_0.txt or copy at
|
||||
# http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
# Generated helper: works out HEAD_REF and HEAD_HASH from the copied HEAD file.
# The placeholders for the HEAD copy, the git directory and the scratch
# directory are filled in by get_git_head_revision when it configures this
# template.

# Start from an unset hash so the fallback read at the bottom can detect
# whether a branch already provided one.
set(HEAD_HASH)
|
||||
|
||||
file(READ "@HEAD_FILE@" HEAD_CONTENTS LIMIT 1024)
|
||||
|
||||
string(STRIP "${HEAD_CONTENTS}" HEAD_CONTENTS)
|
||||
if(HEAD_CONTENTS MATCHES "ref")
|
||||
# named branch
|
||||
string(REPLACE "ref: " "" HEAD_REF "${HEAD_CONTENTS}")
|
||||
if(EXISTS "@GIT_DIR@/${HEAD_REF}")
|
||||
configure_file("@GIT_DIR@/${HEAD_REF}" "@GIT_DATA@/head-ref" COPYONLY)
|
||||
elseif(EXISTS "@GIT_DIR@/logs/${HEAD_REF}")
|
||||
configure_file("@GIT_DIR@/logs/${HEAD_REF}" "@GIT_DATA@/head-ref" COPYONLY)
|
||||
# NOTE(review): this stores the ref name, not a commit hash, in HEAD_HASH.
set(HEAD_HASH "${HEAD_REF}")
|
||||
endif()
|
||||
# NOTE(review): when neither file exists nothing is copied to head-ref here,
# so the read below depends on a copy left by an earlier run - confirm.
|
||||
else()
|
||||
# detached HEAD
|
||||
configure_file("@GIT_DIR@/HEAD" "@GIT_DATA@/head-ref" COPYONLY)
|
||||
endif()
|
||||
|
||||
# No hash chosen above: take it from the copied ref file.
if(NOT HEAD_HASH)
|
||||
file(READ "@GIT_DATA@/head-ref" HEAD_HASH LIMIT 1024)
|
||||
string(STRIP "${HEAD_HASH}" HEAD_HASH)
|
||||
endif()
|
11
src/duckscraperConfig.h.in
Normal file
11
src/duckscraperConfig.h.in
Normal file
|
@ -0,0 +1,11 @@
|
|||
/* Build-time configuration header template. CMake's configure_file() copies
 * it to <build dir>/<project name>Config.h and replaces each placeholder with
 * the value set in the top-level CMakeLists.txt. */
#ifndef idE2B0CC679C2B47AD928F00D45AEBDCBD
|
||||
#define idE2B0CC679C2B47AD928F00D45AEBDCBD
|
||||
|
||||
/* User-Agent string used when the caller supplies none: "<agent>/<major>.<minor>". */
#define DEFAULT_USER_AGENT "@DEFAULT_USER_AGENT@/@PROJECT_VERSION_MAJOR@.@PROJECT_VERSION_MINOR@"
|
||||
/* Project name as passed to project() in CMakeLists.txt. */
#define PROGRAM_NAME "@PROJECT_NAME@"
|
||||
/* Numeric version components; expanded without quotes so they work in #if. */
#define VERSION_MAJOR @PROJECT_VERSION_MAJOR@
|
||||
#define VERSION_MINOR @PROJECT_VERSION_MINOR@
|
||||
/* Tested with #if in main.cpp to append a "b" suffix to the printed version. */
#define VERSION_BETA @PROJECT_VERSION_BETA@
|
||||
/* HEAD hash reported by get_git_head_revision() at configure time. */
#define VERSION_GIT "@PROJECT_VERSION_GIT@"
|
||||
|
||||
#endif
|
88
src/htmlretrieve.cpp
Normal file
88
src/htmlretrieve.cpp
Normal file
|
@ -0,0 +1,88 @@
|
|||
#include "htmlretrieve.hpp"
|
||||
#include "duckscraperConfig.h"
|
||||
#include <ciso646>
|
||||
#include <tidy/tidy.h>
|
||||
#include <tidy/buffio.h>
|
||||
#include <sstream>
|
||||
#include <curl_easy.h>
|
||||
#include <cstring>
|
||||
#include <stack>
|
||||
#include <algorithm>
|
||||
|
||||
namespace duck {
|
||||
namespace {
|
||||
std::string cleanHTML(const std::string &html) {
|
||||
// Initialize a Tidy document
|
||||
TidyDoc tidyDoc = tidyCreate();
|
||||
TidyBuffer tidyOutputBuffer;
|
||||
std::memset(&tidyOutputBuffer, 0, sizeof(TidyBuffer));
|
||||
|
||||
// Configure Tidy
|
||||
// The flags tell Tidy to output XML and disable showing warnings
|
||||
bool configSuccess = tidyOptSetBool(tidyDoc, TidyXmlOut, yes)
|
||||
&& tidyOptSetBool(tidyDoc, TidyQuiet, yes)
|
||||
&& tidyOptSetBool(tidyDoc, TidyNumEntities, yes)
|
||||
&& tidyOptSetBool(tidyDoc, TidyShowWarnings, no);
|
||||
|
||||
int tidyResponseCode = -1;
|
||||
|
||||
// Parse input
|
||||
if (configSuccess)
|
||||
tidyResponseCode = tidyParseString(tidyDoc, html.c_str());
|
||||
|
||||
// Process HTML
|
||||
if (tidyResponseCode >= 0)
|
||||
tidyResponseCode = tidyCleanAndRepair(tidyDoc);
|
||||
|
||||
// Output the HTML to our buffer
|
||||
if (tidyResponseCode >= 0)
|
||||
tidyResponseCode = tidySaveBuffer(tidyDoc, &tidyOutputBuffer);
|
||||
|
||||
// Any errors from Tidy?
|
||||
if (tidyResponseCode < 0)
|
||||
throw ("Tidy encountered an error while parsing an HTML response. Tidy response code: " + tidyResponseCode);
|
||||
|
||||
// Grab the result from the buffer and then free Tidy's memory
|
||||
std::string tidyResult = (char*)tidyOutputBuffer.bp;
|
||||
tidyBufFree(&tidyOutputBuffer);
|
||||
tidyRelease(tidyDoc);
|
||||
|
||||
return tidyResult;
|
||||
}
|
||||
|
||||
// Reports whether `parUrl` begins with the "https://" scheme prefix.
// URI schemes are case-insensitive (RFC 3986, section 3.1), so "HTTPS://host"
// is recognised as well. Strings shorter than the prefix yield false.
bool isHttps (const std::string& parUrl) {
	const char protocol[] = "https://";
	// Array length minus the terminating NUL
	const size_t protocolLen = sizeof(protocol) / sizeof(protocol[0]) - 1;
	if (parUrl.size() < protocolLen)
		return false;

	return std::equal(protocol, protocol + protocolLen, parUrl.begin(),
		[](char expected, char actual) {
			// ASCII-only fold: locale independent, and the only letters in
			// the prefix are plain ASCII anyway.
			const char lowered = (actual >= 'A' && actual <= 'Z')
				? static_cast<char>(actual - 'A' + 'a')
				: actual;
			return expected == lowered;
		});
}
|
||||
} //unnamed namespace
|
||||
|
||||
std::string getCleanHtml (const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost) {
|
||||
std::ostringstream oss;
|
||||
curl::curl_easy easy(oss);
|
||||
easy.add(curl::curl_pair<CURLoption, std::string>(CURLOPT_URL, parSource));
|
||||
if (isHttps(parSource)) {
|
||||
easy.add(curl::curl_pair<CURLoption, bool>(CURLOPT_SSL_VERIFYPEER, parSslVerifyPeer));
|
||||
easy.add(curl::curl_pair<CURLoption, bool>(CURLOPT_SSL_VERIFYHOST, parSslVerifyHost));
|
||||
}
|
||||
easy.add(curl::curl_pair<CURLoption, std::string>(CURLOPT_USERAGENT, parUserAgent));
|
||||
easy.add(curl::curl_pair<CURLoption, long>(CURLOPT_FOLLOWLOCATION, 1L));
|
||||
|
||||
//try {
|
||||
easy.perform();
|
||||
//}
|
||||
//catch (curl_error& err) {
|
||||
//std::stack<std::pair<std::string, std::string> > errors = err.what();
|
||||
//err.print_traceback();
|
||||
//return 1;
|
||||
//}
|
||||
return cleanHTML(oss.str());
|
||||
}
|
||||
|
||||
std::string getCleanHtml (const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost) {
|
||||
return getCleanHtml(parSource, std::string(DEFAULT_USER_AGENT), parSslVerifyPeer, parSslVerifyHost);
|
||||
}
|
||||
} //namespace duck
|
11
src/htmlretrieve.hpp
Normal file
11
src/htmlretrieve.hpp
Normal file
|
@ -0,0 +1,11 @@
|
|||
#ifndef idC6776D903059465191FFB64FCFD6B86A
|
||||
#define idC6776D903059465191FFB64FCFD6B86A
|
||||
|
||||
#include <string>
|
||||
|
||||
// Public interface for fetching a page and normalising its markup.
namespace duck {
|
||||
// Downloads parSource, sending parUserAgent as the User-Agent, and returns
// the body cleaned by HTML Tidy as XML. The two SSL flags are only applied
// when parSource starts with "https://".
std::string getCleanHtml ( const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost );
|
||||
// Same as above, using DEFAULT_USER_AGENT from the generated config header.
std::string getCleanHtml ( const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost );
|
||||
} //namespace duck
|
||||
|
||||
#endif
|
176
src/main.cpp
176
src/main.cpp
|
@ -1,161 +1,18 @@
|
|||
#include "htmlretrieve.hpp"
|
||||
#include "duckscraperConfig.h"
|
||||
#include <iostream>
|
||||
#include <ciso646>
|
||||
#include <tidy/tidy.h>
|
||||
#include <tidy/buffio.h>
|
||||
#include <string>
|
||||
#include <pugixml.hpp>
|
||||
#include <sstream>
|
||||
#include <curl_easy.h>
|
||||
#include <cstring>
|
||||
#include <stack>
|
||||
|
||||
namespace {
|
||||
const char g_testData[] = {
|
||||
"<!DOCTYPE html>\n"
|
||||
"<html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n"
|
||||
"<title>LibReSSL</title>\n"
|
||||
"<link rel=\"stylesheet\" href=\"http://www.libressl.org/libre-ssl.css\">\n"
|
||||
"<meta charset=\"utf-8\">\n"
|
||||
"<meta name=\"resource-type\" content=\"document\">\n"
|
||||
"<meta name=\"description\" content=\"the main LibreSSL page\">\n"
|
||||
"<meta name=\"keywords\" content=\"LibreSSL,main\"/\n"
|
||||
"<meta name=\"distribution\" content=\"global\">\n"
|
||||
"</head>\n"
|
||||
"<body>\n"
|
||||
"<section class=\"libre-ssl\">\n"
|
||||
"<hgroup>\n"
|
||||
"<header>\n"
|
||||
"<h1>LibReSSL</h1>\n"
|
||||
"</header>\n"
|
||||
"</hgroup>\n"
|
||||
"<section class=\"introduction\">\n"
|
||||
"<p>LibReSSL is a <strong>FREE</strong> version of the SSL/TLS protocol\n"
|
||||
"forked from <a\n"
|
||||
"href=\"http://www.youtube.com/watch?v=mRCGDUsdRDU\">OpenSSL</a></p>\n"
|
||||
"<p>\n"
|
||||
"At the moment we are <a\n"
|
||||
"href=\"http://freshbsd.org/search?project=openbsd&q=file.name:libssl\">too\n"
|
||||
"busy deleting and rewriting code</a> to make a decent web page. No we\n"
|
||||
"don't want help making web pages, thank you.\n"
|
||||
"</p>\n"
|
||||
"<p>Check back here soon for updates.</p>\n"
|
||||
"</section>\n"
|
||||
"<section class=\"resources\">\n"
|
||||
"<hgroup>\n"
|
||||
"<header>\n"
|
||||
"<h2>Resources:</h2>\n"
|
||||
"</header>\n"
|
||||
"</hgroup>\n"
|
||||
"<section class=\"open-bsd\">\n"
|
||||
"<hgroup>\n"
|
||||
"<header>\n"
|
||||
"<h3>For OpenBSD</h3>\n"
|
||||
"</header>\n"
|
||||
"</hgroup>\n"
|
||||
"<ul class=\"indent-list\">\n"
|
||||
"<li><a href=\"http://www.openbsd.org/ftp.html\">FTP</a></li>\n"
|
||||
"<li><a href=\"http://www.openbsd.org/anoncvs.html\">AnonCVS</a></li>\n"
|
||||
"<li><a href=\"http://www.openbsd.org/cgi-bin/cvsweb/src/lib/libssl\">CVSWeb</a></li>\n"
|
||||
"</ul>\n"
|
||||
"</section>\n"
|
||||
"<section>\n"
|
||||
"<hgroup>\n"
|
||||
"<header>\n"
|
||||
"<h3>For other OS's</h3>\n"
|
||||
"</header>\n"
|
||||
"</hgroup>\n"
|
||||
"<p>\n"
|
||||
"<ul class=\"indent-list\">\n"
|
||||
"<li><a href=\"http://www.openbsd.org/papers/bsdcan14-libressl\">LibReSSL Presentation by Bob Beck from BSDCAN 2014</a></li>\n"
|
||||
"</ul>\n"
|
||||
"</p>\n"
|
||||
"<p>\n"
|
||||
"Multi OS support will happen once we have\n"
|
||||
"</p>\n"
|
||||
"<ul>\n"
|
||||
"<li><a href=\"http://en.wikipedia.org/wiki/Flensing\">Flensed</a>,\n"
|
||||
"refactored, rewritten, and fixed enough of the code so we have stable\n"
|
||||
"baseline that we trust and can be maintained/improved.</li>\n"
|
||||
"<li>The right Portability team in place.</li>\n"
|
||||
"<li>A <a href=\"http://www.openbsdfoundation.org/\">Stable Commitment of\n"
|
||||
"Funding</a> to support an increased development and porting\n"
|
||||
"effort. </li>\n"
|
||||
"</ul>\n"
|
||||
"<p>\n"
|
||||
"We know you all want this tomorrow. We are working as fast as we can\n"
|
||||
"but our primary focus is good software that we trust to run\n"
|
||||
"ourselves. <em>We don't want to break your heart</em>.\n"
|
||||
"</p>\n"
|
||||
"</section>\n"
|
||||
"</section>\n"
|
||||
"<section class=\"acknowledgements\">\n"
|
||||
"<p>\n"
|
||||
"LibReSSL is primarily developed by <a\n"
|
||||
"href=\"http://www.openbsd.org/\">the OpenBSD Project</a>, and its first\n"
|
||||
"inclusion into an operating system will be in <a\n"
|
||||
"href=\"http://www.openbsd.org/56.html\">OpenBSD 5.6</a>.\n"
|
||||
"</p>\n"
|
||||
"<p>\n"
|
||||
"LibReSSL is supported financially by <a\n"
|
||||
"href=\"http://www.openbsdfoundation.org/\">The OpenBSD Foundation</a> as\n"
|
||||
"well as by the <a href=\"http://www.openbsd.org/donations.html\">The\n"
|
||||
"OpenBSD Project</a>. Please consider <a\n"
|
||||
"href=\"http://www.openbsdfoundation.org/donations.html\">donating</a> to\n"
|
||||
"support our efforts.<br>\n"
|
||||
"</p>\n"
|
||||
"</section>\n"
|
||||
"<section class=\"we-love-web-devs-especially-those-that-write-blink-tags-for-us\">\n"
|
||||
"<p>This page scientifically designed to annoy web hipsters. <a\n"
|
||||
"href=\"http://www.openbsdfoundation.org/donations.html\">Donate now</a>\n"
|
||||
"to stop the Comic Sans and Blink Tags</p>\n"
|
||||
"</section>\n"
|
||||
"</section>\n"
|
||||
"</body></html>\n"
|
||||
};
|
||||
|
||||
std::string cleanHTML(const std::string &html){
|
||||
// Initialize a Tidy document
|
||||
TidyDoc tidyDoc = tidyCreate();
|
||||
TidyBuffer tidyOutputBuffer;
|
||||
std::memset(&tidyOutputBuffer, 0, sizeof(TidyBuffer));
|
||||
|
||||
// Configure Tidy
|
||||
// The flags tell Tidy to output XML and disable showing warnings
|
||||
bool configSuccess = tidyOptSetBool(tidyDoc, TidyXmlOut, yes)
|
||||
&& tidyOptSetBool(tidyDoc, TidyQuiet, yes)
|
||||
&& tidyOptSetBool(tidyDoc, TidyNumEntities, yes)
|
||||
&& tidyOptSetBool(tidyDoc, TidyShowWarnings, no);
|
||||
|
||||
int tidyResponseCode = -1;
|
||||
|
||||
// Parse input
|
||||
if (configSuccess)
|
||||
tidyResponseCode = tidyParseString(tidyDoc, html.c_str());
|
||||
|
||||
// Process HTML
|
||||
if (tidyResponseCode >= 0)
|
||||
tidyResponseCode = tidyCleanAndRepair(tidyDoc);
|
||||
|
||||
// Output the HTML to our buffer
|
||||
if (tidyResponseCode >= 0)
|
||||
tidyResponseCode = tidySaveBuffer(tidyDoc, &tidyOutputBuffer);
|
||||
|
||||
// Any errors from Tidy?
|
||||
if (tidyResponseCode < 0)
|
||||
throw ("Tidy encountered an error while parsing an HTML response. Tidy response code: " + tidyResponseCode);
|
||||
|
||||
// Grab the result from the buffer and then free Tidy's memory
|
||||
std::string tidyResult = (char*)tidyOutputBuffer.bp;
|
||||
tidyBufFree(&tidyOutputBuffer);
|
||||
tidyRelease(tidyDoc);
|
||||
|
||||
return tidyResult;
|
||||
}
|
||||
} //unnamed namespace
|
||||
|
||||
int main (int argc, char* argv[]) {
|
||||
std::string tidyHtml;
|
||||
if (argc != 3) {
|
||||
std::cerr << PROGRAM_NAME << "v" << VERSION_MAJOR << "." << VERSION_MINOR;
|
||||
#if VERSION_BETA
|
||||
std::cerr << "b";
|
||||
#endif
|
||||
std::cerr << " git revision " << VERSION_GIT << "\n";
|
||||
std::cerr << "Default user agent is \"" << DEFAULT_USER_AGENT << "\"\n";
|
||||
std::cerr << "Usage: scraper <URL> <XPath>" << std::endl;
|
||||
return 2;
|
||||
}
|
||||
|
@ -166,22 +23,7 @@ int main (int argc, char* argv[]) {
|
|||
std::cout << "URL : " << url << "\n";
|
||||
std::cout << "XPath: " << xpath << std::endl;
|
||||
|
||||
{
|
||||
std::ostringstream oss;
|
||||
curl::curl_easy easy(oss);
|
||||
easy.add(curl::curl_pair<CURLoption, std::string>(CURLOPT_URL, url));
|
||||
easy.add(curl::curl_pair<CURLoption, std::string>(CURLOPT_USERAGENT, "duckscraper"));
|
||||
easy.add(curl::curl_pair<CURLoption, long>(CURLOPT_FOLLOWLOCATION, 1L));
|
||||
try {
|
||||
easy.perform();
|
||||
}
|
||||
catch (curl_error& err) {
|
||||
std::stack<std::pair<std::string, std::string> > errors = err.what();
|
||||
err.print_traceback();
|
||||
return 1;
|
||||
}
|
||||
tidyHtml = cleanHTML(oss.str());
|
||||
}
|
||||
std::string tidyHtml = duck::getCleanHtml(url, false, false);
|
||||
|
||||
{
|
||||
pugi::xml_document doc;
|
||||
|
|
Loading…
Reference in a new issue