duckscraper/src/htmlretrieve.cpp

#include "htmlretrieve.hpp"
#include "duckscraperConfig.h"
#include <ciso646>
#include <tidy.h>
#include <tidybuffio.h>
#include <sstream>
#include <curl_easy.h>
#include <cstring>
#include <stack>
#include <algorithm>
#include <memory>
#include <cassert>
#include <utility>
#include <stdexcept>
#include <string>

namespace duck {
	namespace {
		// Removes every script element from html, in place.
		//
		// A script element is taken to start at "<script" and to end right
		// after the next "</script>". Both markers are matched without regard
		// to ASCII letter case, because HTML tag names are case-insensitive
		// ("<SCRIPT>" and "</Script>" must be recognised too). When an opening
		// marker has no closing marker after it, everything from the opening
		// marker to the end of the string is removed.
		void dropScriptTags (std::string& html) {
			const std::string open_tag("<script");
			const std::string close_tag("</script>");

			// Compares one character of the document against one character of
			// a tag literal. Only the document side is folded: both literals
			// above are already lowercase.
			const auto same_letter = [](char parHtmlChar, char parTagChar) {
				const char lowered = (parHtmlChar >= 'A' && parHtmlChar <= 'Z')
					? static_cast<char>(parHtmlChar - 'A' + 'a')
					: parHtmlChar;
				return lowered == parTagChar;
			};

			// Case-insensitive counterpart of html.find(tag, from).
			const auto find_tag = [&html, &same_letter](const std::string& tag, size_t from) -> size_t {
				if (from >= html.size())
					return std::string::npos;
				const auto first = html.begin() + static_cast<std::string::difference_type>(from);
				const auto hit = std::search(first, html.end(), tag.begin(), tag.end(), same_letter);
				if (hit == html.end())
					return std::string::npos;
				return static_cast<size_t>(hit - html.begin());
			};

			size_t open_index = 0;
			// After an erase the text that followed the element now starts at
			// open_index, so the search resumes from the same position.
			while (std::string::npos != (open_index = find_tag(open_tag, open_index))) {
				assert(open_index < html.size());
				auto close_index = find_tag(close_tag, open_index + open_tag.size());
				if (close_index == std::string::npos)
					close_index = html.size();
				// The std::min clamp covers the unterminated case, where
				// close_index + close_tag.size() would run past the end.
				html.erase(open_index, std::min(html.size(), close_index + close_tag.size()) - open_index);
			}
		}

		// Tells whether parUrl starts with the exact, lowercase "https://"
		// prefix. The comparison is case-sensitive and no whitespace is
		// skipped.
		bool isHttps (const std::string& parUrl) {
			static const char scheme[] = "https://";
			const size_t schemeLen = sizeof(scheme) - 1; // leave out the terminating NUL

			// compare() clamps the inspected range to the length of parUrl, so
			// a URL shorter than the prefix simply compares unequal.
			return 0 == parUrl.compare(0, schemeLen, scheme);
		}
	} //unnamed namespace

	std::string cleanHTML (std::string&& html) {
		dropScriptTags(html);

		// Initialize a Tidy document
		TidyDoc tidyDoc = tidyCreate();
		TidyBuffer tidyOutputBuffer = {nullptr, nullptr, 0, 0, 0};

		// Configure Tidy
		// The flags tell Tidy to output XML and disable showing warnings
		bool configSuccess = tidyOptSetBool(tidyDoc, TidyXmlOut, yes)
			&& tidyOptSetBool(tidyDoc, TidyQuiet, yes)
			&& tidyOptSetBool(tidyDoc, TidyNumEntities, yes)
			&& tidyOptSetBool(tidyDoc, TidyShowWarnings, no);

		int tidyResponseCode = -1;

		// Parse input
		if (configSuccess) {
			tidyResponseCode = tidyParseString(tidyDoc, html.c_str());
		}

		// Process HTML
		if (tidyResponseCode >= 0)
			tidyResponseCode = tidyCleanAndRepair(tidyDoc);

		if (tidyResponseCode >= 0)
			tidyResponseCode = tidyRunDiagnostics(tidyDoc);
		if (tidyResponseCode > 1)
			tidyResponseCode = (tidyOptSetBool(tidyDoc, TidyForceOutput, yes) ? tidyResponseCode : -1);

		// Output the HTML to our buffer
		if (tidyResponseCode >= 0)
			tidyResponseCode = tidySaveBuffer(tidyDoc, &tidyOutputBuffer);

		// Any errors from Tidy?
		if (tidyResponseCode < 0)
			throw ("Tidy encountered an error while parsing an HTML response. Tidy response code: " + tidyResponseCode);

		// Grab the result from the buffer and then free Tidy's memory
		std::string tidyResult = (char*)tidyOutputBuffer.bp;
		tidyBufFree(&tidyOutputBuffer);
		tidyRelease(tidyDoc);

		return tidyResult;
	}

	// Downloads parSource with libcurl (following redirects, identifying as
	// parUserAgent) and returns the response body after it has been passed
	// through cleanHTML().
	//
	// parSslVerifyPeer / parSslVerifyHost: applied only when parSource starts
	//   with "https://"; they switch certificate and host name verification
	//   on or off.
	// parDumpRaw: when it tests true, it is called with the raw, uncleaned
	//   response body before cleaning.
	//
	// Errors raised by the transfer or by cleanHTML() are not handled here
	// and reach the caller.
	std::string getCleanHtml (const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw) {
		using curl::curl_easy;
		using curl::curl_pair;

		std::ostringstream oss;
		curl_writer wr(oss);
		curl_easy easy(wr);
		easy.add(curl_pair<CURLoption, std::string>(CURLOPT_URL, parSource));
		if (isHttps(parSource)) {
			// libcurl reads both options as long. For VERIFYHOST the value
			// that enables the check is 2; 1 is rejected by a range of
			// libcurl releases, so a bool must not be passed straight through.
			easy.add(curl_pair<CURLoption, long>(CURLOPT_SSL_VERIFYPEER, parSslVerifyPeer ? 1L : 0L));
			easy.add(curl_pair<CURLoption, long>(CURLOPT_SSL_VERIFYHOST, parSslVerifyHost ? 2L : 0L));
		}
		easy.add(curl_pair<CURLoption, std::string>(CURLOPT_USERAGENT, parUserAgent));
		easy.add(curl_pair<CURLoption, long>(CURLOPT_FOLLOWLOCATION, 1L));

		// No try/catch on purpose: a failed transfer is left for the caller
		// to deal with.
		easy.perform();

		std::string raw_data(oss.str());
		if (parDumpRaw) {
			parDumpRaw(raw_data);
		}
		return cleanHTML(std::move(raw_data));
	}

	std::string getCleanHtml (const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw) {
		return getCleanHtml(parSource, std::string(DEFAULT_USER_AGENT), parSslVerifyPeer, parSslVerifyHost, parDumpRaw);
	}
} //namespace duck