diff --git a/src/htmlretrieve.cpp b/src/htmlretrieve.cpp
index 73e0487..fc014ed 100644
--- a/src/htmlretrieve.cpp
+++ b/src/htmlretrieve.cpp
@@ -28,52 +28,6 @@ namespace duck {
}
}
- std::string cleanHTML (std::string&& html) {
- dropScriptTags(html);
-
- // Initialize a Tidy document
- TidyDoc tidyDoc = tidyCreate();
- TidyBuffer tidyOutputBuffer = {nullptr, nullptr, 0, 0, 0};
-
- // Configure Tidy
- // The flags tell Tidy to output XML and disable showing warnings
- bool configSuccess = tidyOptSetBool(tidyDoc, TidyXmlOut, yes)
- && tidyOptSetBool(tidyDoc, TidyQuiet, yes)
- && tidyOptSetBool(tidyDoc, TidyNumEntities, yes)
- && tidyOptSetBool(tidyDoc, TidyShowWarnings, no);
-
- int tidyResponseCode = -1;
-
- // Parse input
- if (configSuccess) {
- tidyResponseCode = tidyParseString(tidyDoc, html.c_str());
- }
-
- // Process HTML
- if (tidyResponseCode >= 0)
- tidyResponseCode = tidyCleanAndRepair(tidyDoc);
-
- if (tidyResponseCode >= 0)
- tidyResponseCode = tidyRunDiagnostics(tidyDoc);
- if (tidyResponseCode > 1)
- tidyResponseCode = (tidyOptSetBool(tidyDoc, TidyForceOutput, yes) ? tidyResponseCode : -1);
-
- // Output the HTML to our buffer
- if (tidyResponseCode >= 0)
- tidyResponseCode = tidySaveBuffer(tidyDoc, &tidyOutputBuffer);
-
- // Any errors from Tidy?
- if (tidyResponseCode < 0)
- throw ("Tidy encountered an error while parsing an HTML response. Tidy response code: " + tidyResponseCode);
-
- // Grab the result from the buffer and then free Tidy's memory
- std::string tidyResult = (char*)tidyOutputBuffer.bp;
- tidyBufFree(&tidyOutputBuffer);
- tidyRelease(tidyDoc);
-
- return tidyResult;
- }
-
bool isHttps (const std::string& parUrl) {
const char protocol[] = "https://";
const size_t protocolLen = sizeof(protocol) / sizeof(protocol[0]) - 1;
@@ -84,6 +38,52 @@ namespace duck {
}
} //unnamed namespace
+	/// Runs dropScriptTags() on html, then has HTML Tidy parse, repair and re-emit it as XML.
+	/// @param html markup to clean; handed to dropScriptTags() before Tidy parses it.
+	/// @return the contents of Tidy's output buffer, or an empty string if nothing was written.
+	/// @throws std::runtime_error if configuring, parsing, repairing or saving fails.
+	std::string cleanHTML (std::string&& html) {
+		// Releases the Tidy buffer and document on every exit path, including the throw below.
+		struct TidyGuard {
+			TidyDoc doc;
+			TidyBuffer buffer;
+			~TidyGuard() {
+				// Skip the free when Tidy never wrote to the buffer (all fields are still zero).
+				if (buffer.bp)
+					tidyBufFree(&buffer);
+				tidyRelease(doc);
+			}
+		};
+
+		dropScriptTags(html);
+		TidyGuard tidy = {tidyCreate(), {nullptr, nullptr, 0, 0, 0}};
+
+		// The flags tell Tidy to output XML with numeric entities and to keep warnings quiet
+		const bool configSuccess = tidyOptSetBool(tidy.doc, TidyXmlOut, yes)
+			&& tidyOptSetBool(tidy.doc, TidyQuiet, yes)
+			&& tidyOptSetBool(tidy.doc, TidyNumEntities, yes)
+			&& tidyOptSetBool(tidy.doc, TidyShowWarnings, no);
+
+		// Each stage runs only if the previous one did not fail (negative code)
+		int tidyResponseCode = (configSuccess ? tidyParseString(tidy.doc, html.c_str()) : -1);
+		if (tidyResponseCode >= 0)
+			tidyResponseCode = tidyCleanAndRepair(tidy.doc);
+		if (tidyResponseCode >= 0)
+			tidyResponseCode = tidyRunDiagnostics(tidy.doc);
+		// Above 1 Tidy reports errors in the document; force output so a result is still produced
+		if (tidyResponseCode > 1)
+			tidyResponseCode = (tidyOptSetBool(tidy.doc, TidyForceOutput, yes) ? tidyResponseCode : -1);
+		if (tidyResponseCode >= 0)
+			tidyResponseCode = tidySaveBuffer(tidy.doc, &tidy.buffer);
+
+		// to_string builds the text; literal + int would only offset the pointer. Needs <stdexcept>.
+		if (tidyResponseCode < 0)
+			throw std::runtime_error("Tidy encountered an error while parsing an HTML response. Tidy response code: " + std::to_string(tidyResponseCode));
+
+		// A null bp (nothing written) must not reach std::string's C-string constructor
+		return (tidy.buffer.bp ? std::string(reinterpret_cast<const char*>(tidy.buffer.bp)) : std::string());
+	}
+
std::string getCleanHtml (const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw) {
using curl::curl_easy;
using curl::curl_pair;
diff --git a/src/htmlretrieve.hpp b/src/htmlretrieve.hpp
index e244b46..6aad5ba 100644
--- a/src/htmlretrieve.hpp
+++ b/src/htmlretrieve.hpp
@@ -8,6 +8,8 @@ namespace duck {
using DumpRawFunc = std::function;
std::string getCleanHtml ( const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw=DumpRawFunc() );
std::string getCleanHtml ( const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw=DumpRawFunc() );
+
+	// Passes html through dropScriptTags(), then has HTML Tidy parse, repair and save it,
+	// and returns the contents of Tidy's output buffer as a string.
+	// Throws if any Tidy step reports a failure (negative response code).
+ std::string cleanHTML ( std::string&& html );
} //namespace duck
#endif
diff --git a/src/main.cpp b/src/main.cpp
index f395f12..460960d 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -10,6 +10,8 @@
#include
#include
#include
+#include
+#include
#define STRINGIZE_IMPL(s) #s
#define STRINGIZE(s) STRINGIZE_IMPL(s)
@@ -83,6 +85,32 @@ namespace {
std::ostream* const os = (use_stdout ? &std::cout : ofs.get());
*os << parData;
}
+
+	// Produces the tidied markup for this run. When stdin is not a terminal the
+	// piped data is cleaned directly; otherwise parUrl is downloaded and cleaned.
+	std::string getCleanHtml (const std::string& parUrl, const po::variables_map& parVarMap) {
+		if (!isatty(fileno(stdin))) {
+			// istreambuf_iterator reads the stream character by character without
+			// skipping whitespace, so std::cin's skipws flag does not have to be
+			// switched off (and left off) for the rest of the program.
+			const std::istreambuf_iterator<char> inputEnd;
+			std::string rawInput(std::istreambuf_iterator<char>(std::cin), inputEnd);
+			return duck::cleanHTML(std::move(rawInput));
+		}
+
+		// Interactive stdin: fetch the page with both SSL verification flags off.
+		// The dump-raw callback is bound to dump_string only when "dump-raw" was given.
+		return duck::getCleanHtml(
+			parUrl,
+			false,
+			false,
+			(parVarMap.count("dump-raw") ?
+				std::bind(&dump_string, parVarMap["dump-raw"].as(), std::placeholders::_1)
+			:
+				duck::DumpRawFunc()
+			)
+		);
+	}
} //unnamed namespace
int main (int argc, char* argv[]) {
@@ -97,16 +125,7 @@ int main (int argc, char* argv[]) {
std::cout << "XPath: " << xpath << std::endl;
#endif
- const std::string tidyHtml = duck::getCleanHtml(
- url,
- false,
- false,
- (vm.count("dump-raw") ?
- std::bind(&dump_string, vm["dump-raw"].as(), std::placeholders::_1)
- :
- duck::DumpRawFunc()
- )
- );
+ const std::string tidyHtml(getCleanHtml(url, vm));
if (vm.count("dump")) {
dump_string(vm["dump"].as(), tidyHtml);
}