Reviewer-amended patch. Only added ('+') lines differ from the submitted change:
  * duck::cleanHTML: the error message is built with std::to_string instead of
    adding an int to a string literal; Tidy's document and buffer are released
    before throwing; the output buffer is only read when Tidy filled it.
  * main.cpp getCleanHtml: plain `return tidyHtml;` instead of std::move.
Indentation is reconstructed. Include names and template arguments that were
missing from the submitted text are left exactly as submitted. The blob ids on
the index lines are those of the submitted patch.

diff --git a/src/htmlretrieve.cpp b/src/htmlretrieve.cpp
index 73e0487..fc014ed 100644
--- a/src/htmlretrieve.cpp
+++ b/src/htmlretrieve.cpp
@@ -28,52 +28,6 @@ namespace duck {
 			}
 		}
 
-		std::string cleanHTML (std::string&& html) {
-			dropScriptTags(html);
-
-			// Initialize a Tidy document
-			TidyDoc tidyDoc = tidyCreate();
-			TidyBuffer tidyOutputBuffer = {nullptr, nullptr, 0, 0, 0};
-
-			// Configure Tidy
-			// The flags tell Tidy to output XML and disable showing warnings
-			bool configSuccess = tidyOptSetBool(tidyDoc, TidyXmlOut, yes)
-				&& tidyOptSetBool(tidyDoc, TidyQuiet, yes)
-				&& tidyOptSetBool(tidyDoc, TidyNumEntities, yes)
-				&& tidyOptSetBool(tidyDoc, TidyShowWarnings, no);
-
-			int tidyResponseCode = -1;
-
-			// Parse input
-			if (configSuccess) {
-				tidyResponseCode = tidyParseString(tidyDoc, html.c_str());
-			}
-
-			// Process HTML
-			if (tidyResponseCode >= 0)
-				tidyResponseCode = tidyCleanAndRepair(tidyDoc);
-
-			if (tidyResponseCode >= 0)
-				tidyResponseCode = tidyRunDiagnostics(tidyDoc);
-			if (tidyResponseCode > 1)
-				tidyResponseCode = (tidyOptSetBool(tidyDoc, TidyForceOutput, yes) ? tidyResponseCode : -1);
-
-			// Output the HTML to our buffer
-			if (tidyResponseCode >= 0)
-				tidyResponseCode = tidySaveBuffer(tidyDoc, &tidyOutputBuffer);
-
-			// Any errors from Tidy?
-			if (tidyResponseCode < 0)
-				throw ("Tidy encountered an error while parsing an HTML response. \nTidy response code: " + tidyResponseCode);
-
-			// Grab the result from the buffer and then free Tidy's memory
-			std::string tidyResult = (char*)tidyOutputBuffer.bp;
-			tidyBufFree(&tidyOutputBuffer);
-			tidyRelease(tidyDoc);
-
-			return tidyResult;
-		}
-
 		bool isHttps (const std::string& parUrl) {
 			const char protocol[] = "https://";
 			const size_t protocolLen = sizeof(protocol) / sizeof(protocol[0]) - 1;
@@ -84,6 +38,71 @@ namespace duck {
 		}
 	} //unnamed namespace
 
+	// Runs HTML Tidy over the given markup (after dropScriptTags() has been
+	// applied to it) and returns the cleaned document serialised as XML.
+	// Throws a const char* message carrying Tidy's response code when Tidy
+	// reports a failure; Tidy's document and buffer are released first.
+	std::string cleanHTML (std::string&& html) {
+		dropScriptTags(html);
+
+		// Initialize a Tidy document
+		TidyDoc tidyDoc = tidyCreate();
+		TidyBuffer tidyOutputBuffer = {nullptr, nullptr, 0, 0, 0};
+
+		// Configure Tidy
+		// The flags tell Tidy to output XML and disable showing warnings
+		bool configSuccess = tidyOptSetBool(tidyDoc, TidyXmlOut, yes)
+			&& tidyOptSetBool(tidyDoc, TidyQuiet, yes)
+			&& tidyOptSetBool(tidyDoc, TidyNumEntities, yes)
+			&& tidyOptSetBool(tidyDoc, TidyShowWarnings, no);
+
+		int tidyResponseCode = -1;
+
+		// Parse input
+		if (configSuccess) {
+			tidyResponseCode = tidyParseString(tidyDoc, html.c_str());
+		}
+
+		// Process HTML
+		if (tidyResponseCode >= 0)
+			tidyResponseCode = tidyCleanAndRepair(tidyDoc);
+
+		if (tidyResponseCode >= 0)
+			tidyResponseCode = tidyRunDiagnostics(tidyDoc);
+		if (tidyResponseCode > 1)
+			tidyResponseCode = (tidyOptSetBool(tidyDoc, TidyForceOutput, yes) ? tidyResponseCode : -1);
+
+		// Output the HTML to our buffer
+		if (tidyResponseCode >= 0)
+			tidyResponseCode = tidySaveBuffer(tidyDoc, &tidyOutputBuffer);
+
+		// Grab the result from the buffer and then free Tidy's memory. This is
+		// done before the error check so the throw below cannot leak the
+		// document, and the buffer is only touched when it holds data
+		// (building a std::string from a null pointer is undefined behaviour).
+		std::string tidyResult;
+		if (tidyOutputBuffer.bp) {
+			if (tidyResponseCode >= 0)
+				tidyResult = reinterpret_cast<const char*>(tidyOutputBuffer.bp);
+			tidyBufFree(&tidyOutputBuffer);
+		}
+		tidyRelease(tidyDoc);
+
+		// Any errors from Tidy?
+		if (tidyResponseCode < 0) {
+			// "literal" + int is pointer arithmetic, not concatenation, so the
+			// message is built as a std::string. It lives in a thread-local so
+			// the thrown pointer stays valid and the exception type seen by
+			// callers is still const char*.
+			static thread_local std::string tidyErrorMessage;
+			tidyErrorMessage = "Tidy encountered an error while parsing an HTML response. \nTidy response code: "
+				+ std::to_string(tidyResponseCode);
+			throw tidyErrorMessage.c_str();
+		}
+
+		return tidyResult;
+	}
+
 	std::string getCleanHtml (const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw) {
 		using curl::curl_easy;
 		using curl::curl_pair;
diff --git a/src/htmlretrieve.hpp b/src/htmlretrieve.hpp
index e244b46..6aad5ba 100644
--- a/src/htmlretrieve.hpp
+++ b/src/htmlretrieve.hpp
@@ -8,6 +8,8 @@ namespace duck {
 	using DumpRawFunc = std::function;
 	std::string getCleanHtml ( const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw=DumpRawFunc() );
 	std::string getCleanHtml ( const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw=DumpRawFunc() );
+
+	std::string cleanHTML ( std::string&& html );
 } //namespace duck
 
 #endif
diff --git a/src/main.cpp b/src/main.cpp
index f395f12..460960d 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -10,6 +10,8 @@
 #include
 #include
 #include
+#include
+#include
 
 #define STRINGIZE_IMPL(s) #s
 #define STRINGIZE(s) STRINGIZE_IMPL(s)
@@ -83,6 +85,37 @@ namespace {
 		std::ostream* const os = (use_stdout ? &std::cout : ofs.get());
 		*os << parData;
 	}
+
+	// Returns tidied markup for the run: fetched from parUrl through
+	// duck::getCleanHtml() when stdin is a terminal, otherwise read in full
+	// from stdin and passed through duck::cleanHTML().
+	std::string getCleanHtml (const std::string& parUrl, const po::variables_map& parVarMap) {
+		std::string tidyHtml;
+
+		if (isatty(fileno(stdin))) {
+			tidyHtml = duck::getCleanHtml(
+				parUrl,
+				false,
+				false,
+				(parVarMap.count("dump-raw") ?
+					std::bind(&dump_string, parVarMap["dump-raw"].as(), std::placeholders::_1)
+				:
+					duck::DumpRawFunc()
+				)
+			);
+		}
+		else {
+			// noskipws keeps whitespace so the piped document is read verbatim
+			std::cin >> std::noskipws;
+			std::istream_iterator it(std::cin);
+			std::istream_iterator end;
+			std::string results(it, end);
+			tidyHtml = duck::cleanHTML(std::move(results));
+		}
+
+		// Plain return: std::move on a local would only disable copy elision
+		return tidyHtml;
+	}
 } //unnamed namespace
 
 int main (int argc, char* argv[]) {
@@ -97,16 +130,7 @@ int main (int argc, char* argv[]) {
 	std::cout << "XPath: " << xpath << std::endl;
 #endif
 
-	const std::string tidyHtml = duck::getCleanHtml(
-		url,
-		false,
-		false,
-		(vm.count("dump-raw") ?
-			std::bind(&dump_string, vm["dump-raw"].as(), std::placeholders::_1)
-		:
-			duck::DumpRawFunc()
-		)
-	);
+	const std::string tidyHtml(getCleanHtml(url, vm));
 	if (vm.count("dump")) {
 		dump_string(vm["dump"].as(), tidyHtml);
 	}