duckscraper/src/htmlretrieve.cpp
King_DuckZ 49aa62815a Allow piping.
Atm you still need to specify some parameter for the url, even if
it's not needed. The good news is that the value doesn't have to
be a valid URL, so any string will do.
2015-09-28 23:37:42 +02:00

121 lines
3.8 KiB
C++

#include "htmlretrieve.hpp"
#include "duckscraperConfig.h"
#include <algorithm>
#include <cassert>
#include <ciso646>
#include <cstring>
#include <memory>
#include <sstream>
#include <stack>
#include <stdexcept>
#include <string>
#include <utility>
#include <tidy.h>
#include <tidybuffio.h>
#include <curl_easy.h>
namespace duck {
namespace {
// Removes every script element from html, editing the string in place.
//
// A script element is taken to begin at the literal text "<script" and to end
// right after the next "</script>"; the match is case-sensitive. When an
// opening tag has no closing counterpart, everything from that opening tag up
// to the end of the string is discarded.
void dropScriptTags (std::string& html) {
	const char scriptBegin[] = "<script";
	const char scriptEnd[] = "</script>";
	const std::string::size_type beginLen = sizeof(scriptBegin) - 1;
	const std::string::size_type endLen = sizeof(scriptEnd) - 1;

	std::string::size_type cursor = html.find(scriptBegin);
	while (cursor != std::string::npos) {
		const std::string::size_type closing = html.find(scriptEnd, cursor + beginLen);
		if (closing == std::string::npos) {
			// Unterminated script: cut off the remainder of the document.
			html.erase(cursor);
			return;
		}

		html.erase(cursor, closing + endLen - cursor);
		// The erase shifted the tail to the left, so resume at the same offset.
		cursor = html.find(scriptBegin, cursor);
	}
}
// Tells whether parUrl begins with the "https://" scheme prefix.
//
// URL schemes are case-insensitive, so "HTTPS://host" and "Https://host" are
// accepted as well as the canonical lowercase spelling. Only the prefix is
// inspected; the rest of the string is not validated in any way.
//
// parUrl: the string to inspect; may be empty or shorter than the prefix.
// Returns true when the first 8 characters spell "https://" in any case.
bool isHttps (const std::string& parUrl) {
	const char protocol[] = "https://";
	const size_t protocolLen = sizeof(protocol) / sizeof(protocol[0]) - 1;
	if (parUrl.size() < protocolLen)
		return false;

	// ASCII-only lowering: the expected prefix is already lowercase, and this
	// keeps the comparison independent from the current locale.
	const auto matchesIgnoringCase = [](char parExpected, char parActual) {
		const char lowered = (parActual >= 'A' && parActual <= 'Z')
			? static_cast<char>(parActual - 'A' + 'a')
			: parActual;
		return parExpected == lowered;
	};
	return std::equal(protocol, protocol + protocolLen, parUrl.begin(), matchesIgnoringCase);
}
} //unnamed namespace
std::string cleanHTML (std::string&& html) {
dropScriptTags(html);
// Initialize a Tidy document
TidyDoc tidyDoc = tidyCreate();
TidyBuffer tidyOutputBuffer = {nullptr, nullptr, 0, 0, 0};
// Configure Tidy
// The flags tell Tidy to output XML and disable showing warnings
bool configSuccess = tidyOptSetBool(tidyDoc, TidyXmlOut, yes)
&& tidyOptSetBool(tidyDoc, TidyQuiet, yes)
&& tidyOptSetBool(tidyDoc, TidyNumEntities, yes)
&& tidyOptSetBool(tidyDoc, TidyShowWarnings, no);
int tidyResponseCode = -1;
// Parse input
if (configSuccess) {
tidyResponseCode = tidyParseString(tidyDoc, html.c_str());
}
// Process HTML
if (tidyResponseCode >= 0)
tidyResponseCode = tidyCleanAndRepair(tidyDoc);
if (tidyResponseCode >= 0)
tidyResponseCode = tidyRunDiagnostics(tidyDoc);
if (tidyResponseCode > 1)
tidyResponseCode = (tidyOptSetBool(tidyDoc, TidyForceOutput, yes) ? tidyResponseCode : -1);
// Output the HTML to our buffer
if (tidyResponseCode >= 0)
tidyResponseCode = tidySaveBuffer(tidyDoc, &tidyOutputBuffer);
// Any errors from Tidy?
if (tidyResponseCode < 0)
throw ("Tidy encountered an error while parsing an HTML response. Tidy response code: " + tidyResponseCode);
// Grab the result from the buffer and then free Tidy's memory
std::string tidyResult = (char*)tidyOutputBuffer.bp;
tidyBufFree(&tidyOutputBuffer);
tidyRelease(tidyDoc);
return tidyResult;
}
// Downloads parSource through libcurl and returns the body cleaned by
// cleanHTML.
//
// parSource:        passed to libcurl as CURLOPT_URL.
// parUserAgent:     passed to libcurl as CURLOPT_USERAGENT.
// parSslVerifyPeer: whether to verify the peer certificate; only applied when
//                   parSource is an https URL (see isHttps).
// parSslVerifyHost: whether to verify that the certificate matches the host;
//                   only applied when parSource is an https URL.
// parDumpRaw:       optional callback; when set it receives the raw downloaded
//                   data before any cleaning takes place.
// Returns the cleaned document produced by cleanHTML.
// Errors raised by the transfer or by cleanHTML are not caught here and
// propagate to the caller.
std::string getCleanHtml (const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw) {
	using curl::curl_easy;
	using curl::curl_pair;

	std::ostringstream oss;
	curl_writer wr(oss);
	curl_easy easy(wr);

	easy.add(curl_pair<CURLoption, std::string>(CURLOPT_URL, parSource));
	if (isHttps(parSource)) {
		// libcurl reads both options as long values, so they are passed as
		// long rather than bool. CURLOPT_SSL_VERIFYHOST uses 2 (not 1) to
		// enable host name verification.
		easy.add(curl_pair<CURLoption, long>(CURLOPT_SSL_VERIFYPEER, parSslVerifyPeer ? 1L : 0L));
		easy.add(curl_pair<CURLoption, long>(CURLOPT_SSL_VERIFYHOST, parSslVerifyHost ? 2L : 0L));
	}
	easy.add(curl_pair<CURLoption, std::string>(CURLOPT_USERAGENT, parUserAgent));
	easy.add(curl_pair<CURLoption, long>(CURLOPT_FOLLOWLOCATION, 1L));

	// Transfer failures are deliberately left unhandled at this level.
	easy.perform();

	std::string raw_data(oss.str());
	if (parDumpRaw) {
		parDumpRaw(raw_data);
	}
	return cleanHTML(std::move(raw_data));
}
std::string getCleanHtml (const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw) {
return getCleanHtml(parSource, std::string(DEFAULT_USER_AGENT), parSslVerifyPeer, parSslVerifyHost, parDumpRaw);
}
} //namespace duck