King_DuckZ
49aa62815a
Atm you still need to specify some parameter for the url, even if it's not needed. The good news is that the value doesn't have to be a valid URL, so any string will do.
121 lines
3.8 KiB
C++
121 lines
3.8 KiB
C++
#include "htmlretrieve.hpp"
|
|
#include "duckscraperConfig.h"
|
|
#include <ciso646>
|
|
#include <tidy.h>
|
|
#include <tidybuffio.h>
|
|
#include <sstream>
|
|
#include <curl_easy.h>
|
|
#include <cstring>
|
|
#include <stack>
|
|
#include <algorithm>
|
|
#include <memory>
|
|
#include <cassert>
|
|
#include <utility>
|
|
|
|
namespace duck {
|
|
namespace {
|
|
/**
 * Strips every script element out of @p html, in place.
 *
 * A removed region starts at an occurrence of "<script" and ends right after
 * the next "</script>". If no closing tag follows, everything from the opening
 * tag to the end of the string is dropped. Both tags are matched without
 * regard to ASCII case, because HTML tag names are case-insensitive
 * ("<SCRIPT>" and "</Script>" must be removed as well).
 *
 * The opening tag is matched as a plain prefix, so any tag whose name merely
 * begins with "script" is treated the same way.
 *
 * @param html Document text to filter; replaced with the filtered text.
 */
void dropScriptTags (std::string& html) {
	const std::string open_tag("<script");
	const std::string close_tag("</script>");

	// ASCII-only, locale-independent comparison: tag names are plain ASCII and
	// must not be affected by the process locale.
	const auto same_letter = [](char parLeft, char parRight) {
		const auto lower = [](char parChar) {
			return (parChar >= 'A' and parChar <= 'Z') ? static_cast<char>(parChar - 'A' + 'a') : parChar;
		};
		return lower(parLeft) == lower(parRight);
	};

	typedef std::string::difference_type diff_type;

	// Build the result in a single pass instead of erasing from the middle of
	// the string once per script element (which shifts the whole tail every time).
	std::string kept;
	kept.reserve(html.size());

	const auto end = html.cend();
	auto cursor = html.cbegin();
	while (cursor != end) {
		const auto open_it = std::search(cursor, end, open_tag.begin(), open_tag.end(), same_letter);
		kept.append(cursor, open_it);
		if (open_it == end)
			break;

		const auto close_it = std::search(
			open_it + static_cast<diff_type>(open_tag.size()),
			end,
			close_tag.begin(),
			close_tag.end(),
			same_letter
		);
		// An unterminated script element swallows the rest of the document.
		cursor = (close_it == end ? end : close_it + static_cast<diff_type>(close_tag.size()));
	}

	html.swap(kept);
}
|
|
|
|
/**
 * Tells whether @p parUrl begins with the "https://" scheme prefix.
 *
 * The comparison ignores ASCII case, since URL schemes are case-insensitive
 * ("HTTPS://host" denotes the same scheme as "https://host").
 *
 * @param parUrl URL (or any string) to inspect.
 * @return true if the first eight characters spell "https://" in any letter
 *         case; false otherwise, including when the string is shorter than
 *         the prefix.
 */
bool isHttps (const std::string& parUrl) {
	const char protocol[] = "https://";
	// Exclude the terminating NUL from the compared length.
	const std::size_t protocolLen = sizeof(protocol) / sizeof(protocol[0]) - 1;
	if (parUrl.size() < protocolLen)
		return false;

	// protocol is already lowercase, so only the URL side needs folding.
	// The fold is ASCII-only and therefore independent of the process locale.
	return std::equal(protocol, protocol + protocolLen, parUrl.begin(), [](char parExpected, char parActual) {
		const char folded = (parActual >= 'A' and parActual <= 'Z') ? static_cast<char>(parActual - 'A' + 'a') : parActual;
		return parExpected == folded;
	});
}
|
|
} //unnamed namespace
|
|
|
|
std::string cleanHTML (std::string&& html) {
|
|
dropScriptTags(html);
|
|
|
|
// Initialize a Tidy document
|
|
TidyDoc tidyDoc = tidyCreate();
|
|
TidyBuffer tidyOutputBuffer = {nullptr, nullptr, 0, 0, 0};
|
|
|
|
// Configure Tidy
|
|
// The flags tell Tidy to output XML and disable showing warnings
|
|
bool configSuccess = tidyOptSetBool(tidyDoc, TidyXmlOut, yes)
|
|
&& tidyOptSetBool(tidyDoc, TidyQuiet, yes)
|
|
&& tidyOptSetBool(tidyDoc, TidyNumEntities, yes)
|
|
&& tidyOptSetBool(tidyDoc, TidyShowWarnings, no);
|
|
|
|
int tidyResponseCode = -1;
|
|
|
|
// Parse input
|
|
if (configSuccess) {
|
|
tidyResponseCode = tidyParseString(tidyDoc, html.c_str());
|
|
}
|
|
|
|
// Process HTML
|
|
if (tidyResponseCode >= 0)
|
|
tidyResponseCode = tidyCleanAndRepair(tidyDoc);
|
|
|
|
if (tidyResponseCode >= 0)
|
|
tidyResponseCode = tidyRunDiagnostics(tidyDoc);
|
|
if (tidyResponseCode > 1)
|
|
tidyResponseCode = (tidyOptSetBool(tidyDoc, TidyForceOutput, yes) ? tidyResponseCode : -1);
|
|
|
|
// Output the HTML to our buffer
|
|
if (tidyResponseCode >= 0)
|
|
tidyResponseCode = tidySaveBuffer(tidyDoc, &tidyOutputBuffer);
|
|
|
|
// Any errors from Tidy?
|
|
if (tidyResponseCode < 0)
|
|
throw ("Tidy encountered an error while parsing an HTML response. Tidy response code: " + tidyResponseCode);
|
|
|
|
// Grab the result from the buffer and then free Tidy's memory
|
|
std::string tidyResult = (char*)tidyOutputBuffer.bp;
|
|
tidyBufFree(&tidyOutputBuffer);
|
|
tidyRelease(tidyDoc);
|
|
|
|
return tidyResult;
|
|
}
|
|
|
|
std::string getCleanHtml (const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw) {
|
|
using curl::curl_easy;
|
|
using curl::curl_pair;
|
|
|
|
std::ostringstream oss;
|
|
curl_writer wr(oss);
|
|
curl_easy easy(wr);
|
|
easy.add(curl_pair<CURLoption, std::string>(CURLOPT_URL, parSource));
|
|
if (isHttps(parSource)) {
|
|
easy.add(curl_pair<CURLoption, bool>(CURLOPT_SSL_VERIFYPEER, parSslVerifyPeer));
|
|
easy.add(curl_pair<CURLoption, bool>(CURLOPT_SSL_VERIFYHOST, parSslVerifyHost));
|
|
}
|
|
easy.add(curl_pair<CURLoption, std::string>(CURLOPT_USERAGENT, parUserAgent));
|
|
easy.add(curl_pair<CURLoption, long>(CURLOPT_FOLLOWLOCATION, 1L));
|
|
|
|
//try {
|
|
easy.perform();
|
|
//}
|
|
//catch (curl_error& err) {
|
|
//std::stack<std::pair<std::string, std::string> > errors = err.what();
|
|
//err.print_traceback();
|
|
//return 1;
|
|
//}
|
|
|
|
std::string raw_data(oss.str());
|
|
if (parDumpRaw) {
|
|
parDumpRaw(raw_data);
|
|
}
|
|
return cleanHTML(std::move(raw_data));
|
|
}
|
|
|
|
std::string getCleanHtml (const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw) {
|
|
return getCleanHtml(parSource, std::string(DEFAULT_USER_AGENT), parSslVerifyPeer, parSslVerifyHost, parDumpRaw);
|
|
}
|
|
} //namespace duck
|