At the moment you still need to specify some parameter for the URL, even if it's not needed. The good news is that the value doesn't have to be a valid URL, so any string will do.
121 lines
3.8 KiB
121 lines
3.8 KiB
#include "htmlretrieve.hpp"
#include "duckscraperConfig.h"
#include <ciso646>
#include <tidy.h>
#include <tidybuffio.h>
#include <sstream>
#include <curl_easy.h>
#include <cstring>
#include <stack>
#include <algorithm>
#include <memory>
#include <cassert>
#include <utility>
#include <stdexcept>
#include <string>
namespace duck {
namespace {
/**
 * Removes every script element from html, in place.
 *
 * An element starts at any occurrence of "<script" and ends right after the
 * next "</script>". Both markers are matched ASCII case-insensitively, since
 * HTML tag names are not case sensitive (e.g. <SCRIPT>...</Script>).
 * If an opening marker has no matching closing marker, everything from the
 * opening marker to the end of the string is removed.
 *
 * @param html  Markup to strip; modified in place.
 */
void dropScriptTags (std::string& html) {
    typedef std::string::size_type size_type;

    const std::string open_tag("<script");
    const std::string close_tag("</script>");

    // ASCII-only case folding: tag names are plain ASCII, so no locale is involved.
    const auto same_ci = [](char parA, char parB) -> bool {
        const auto lower = [](char parC) -> char {
            return (parC >= 'A' && parC <= 'Z') ? static_cast<char>(parC - 'A' + 'a') : parC;
        };
        return lower(parA) == lower(parB);
    };

    // Case-insensitive equivalent of html.find(needle, from).
    const auto find_ci = [&html, &same_ci](const std::string& parNeedle, size_type parFrom) -> size_type {
        if (parFrom >= html.size())
            return std::string::npos;
        const auto start = html.begin() + static_cast<std::string::difference_type>(parFrom);
        const auto hit = std::search(start, html.end(), parNeedle.begin(), parNeedle.end(), same_ci);
        return (html.end() == hit) ? std::string::npos : static_cast<size_type>(hit - html.begin());
    };

    size_type open_index = 0;
    while (std::string::npos != (open_index = find_ci(open_tag, open_index))) {
        const size_type close_index = find_ci(close_tag, open_index + open_tag.size());
        // Unterminated element: drop everything up to the end of the document.
        const size_type erase_end = (std::string::npos == close_index)
            ? html.size()
            : close_index + close_tag.size();
        html.erase(open_index, erase_end - open_index);
        // open_index is left unchanged: the text that followed the element now starts there.
    }
}
/**
 * Tells whether parUrl begins with the "https://" scheme prefix.
 *
 * The comparison is ASCII case-insensitive because URI schemes are not case
 * sensitive, so "HTTPS://host" is recognised as well.
 *
 * @param parUrl  String to inspect; it does not need to be a valid URL.
 * @return        true if parUrl starts with "https://", false otherwise
 *                (including when parUrl is shorter than the prefix).
 */
bool isHttps (const std::string& parUrl) {
    static const char scheme[] = "https://";
    const std::string::size_type schemeLen = sizeof(scheme) - 1; // exclude the trailing NUL

    if (parUrl.size() < schemeLen)
        return false;

    // scheme is already lowercase, so only the candidate character needs folding.
    return std::equal(scheme, scheme + schemeLen, parUrl.begin(),
        [](char parExpected, char parActual) -> bool {
            const char folded = (parActual >= 'A' && parActual <= 'Z')
                ? static_cast<char>(parActual - 'A' + 'a')
                : parActual;
            return parExpected == folded;
        });
}
} //unnamed namespace
std::string cleanHTML (std::string&& html) {
// Initialize a Tidy document
TidyDoc tidyDoc = tidyCreate();
TidyBuffer tidyOutputBuffer = {nullptr, nullptr, 0, 0, 0};
// Configure Tidy
// The flags tell Tidy to output XML and disable showing warnings
bool configSuccess = tidyOptSetBool(tidyDoc, TidyXmlOut, yes)
&& tidyOptSetBool(tidyDoc, TidyQuiet, yes)
&& tidyOptSetBool(tidyDoc, TidyNumEntities, yes)
&& tidyOptSetBool(tidyDoc, TidyShowWarnings, no);
int tidyResponseCode = -1;
// Parse input
if (configSuccess) {
tidyResponseCode = tidyParseString(tidyDoc, html.c_str());
// Process HTML
if (tidyResponseCode >= 0)
tidyResponseCode = tidyCleanAndRepair(tidyDoc);
if (tidyResponseCode >= 0)
tidyResponseCode = tidyRunDiagnostics(tidyDoc);
if (tidyResponseCode > 1)
tidyResponseCode = (tidyOptSetBool(tidyDoc, TidyForceOutput, yes) ? tidyResponseCode : -1);
// Output the HTML to our buffer
if (tidyResponseCode >= 0)
tidyResponseCode = tidySaveBuffer(tidyDoc, &tidyOutputBuffer);
// Any errors from Tidy?
if (tidyResponseCode < 0)
throw ("Tidy encountered an error while parsing an HTML response. Tidy response code: " + tidyResponseCode);
// Grab the result from the buffer and then free Tidy's memory
std::string tidyResult = (char*)tidyOutputBuffer.bp;
return tidyResult;
// Configures a curl::curl_easy handle for parSource, then passes whatever was
// collected in the output stream to cleanHTML() and returns the result.
//
// parSource         URL handed to CURLOPT_URL.
// parUserAgent      Value handed to CURLOPT_USERAGENT.
// parSslVerifyPeer  Value for CURLOPT_SSL_VERIFYPEER (only set for https:// sources).
// parSslVerifyHost  Value for CURLOPT_SSL_VERIFYHOST (only set for https:// sources).
// parDumpRaw        Tested for truthiness after the download.
//
// NOTE(review): this block looks truncated. No statement that actually
// performs the transfer is visible, the body of `if (parDumpRaw) {` is empty,
// and the closing braces of both `if` blocks and of the function are missing.
// Restore them from the original file before relying on this code.
std::string getCleanHtml (const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw) {
using curl::curl_easy;
using curl::curl_pair;
// The writer appends the received data to oss.
std::ostringstream oss;
curl_writer wr(oss);
curl_easy easy(wr);
easy.add(curl_pair<CURLoption, std::string>(CURLOPT_URL, parSource));
// The certificate verification options are only set for https:// sources.
if (isHttps(parSource)) {
easy.add(curl_pair<CURLoption, bool>(CURLOPT_SSL_VERIFYPEER, parSslVerifyPeer));
// NOTE(review): libcurl documents CURLOPT_SSL_VERIFYHOST as a long where 2
// enables the check; confirm how the wrapper forwards a bool here.
easy.add(curl_pair<CURLoption, bool>(CURLOPT_SSL_VERIFYHOST, parSslVerifyHost));
easy.add(curl_pair<CURLoption, std::string>(CURLOPT_USERAGENT, parUserAgent));
// Follow HTTP redirects.
easy.add(curl_pair<CURLoption, long>(CURLOPT_FOLLOWLOCATION, 1L));
// Disabled error handling, kept as in the original.
//try {
//catch (curl_error& err) {
//std::stack<std::pair<std::string, std::string> > errors = err.what();
//return 1;
// Copy the collected data out of the stream.
std::string raw_data(oss.str());
// NOTE(review): presumably raw_data was meant to be handed to parDumpRaw
// here; the call is not visible in this chunk.
if (parDumpRaw) {
// Hand the raw data over to Tidy; raw_data is not used afterwards.
return cleanHTML(std::move(raw_data));
/**
 * Convenience overload of getCleanHtml() that uses DEFAULT_USER_AGENT as the
 * user agent string and forwards every other argument unchanged.
 *
 * @param parSource         URL to retrieve.
 * @param parSslVerifyPeer  Forwarded to the five-argument overload.
 * @param parSslVerifyHost  Forwarded to the five-argument overload.
 * @param parDumpRaw        Forwarded to the five-argument overload.
 * @return                  Whatever the five-argument overload returns.
 */
std::string getCleanHtml (const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw) {
    return getCleanHtml(parSource, std::string(DEFAULT_USER_AGENT), parSslVerifyPeer, parSslVerifyHost, parDumpRaw);
}
} //namespace duck