Allow piping.
At the moment you still need to pass a URL argument even when the input is piped in and the URL is not needed. The good news is that the value doesn't have to be a valid URL, so any string will do.
This commit is contained in:
parent
943e760ffd
commit
49aa62815a
3 changed files with 77 additions and 56 deletions
|
@ -28,52 +28,6 @@ namespace duck {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string cleanHTML (std::string&& html) {
|
|
||||||
dropScriptTags(html);
|
|
||||||
|
|
||||||
// Initialize a Tidy document
|
|
||||||
TidyDoc tidyDoc = tidyCreate();
|
|
||||||
TidyBuffer tidyOutputBuffer = {nullptr, nullptr, 0, 0, 0};
|
|
||||||
|
|
||||||
// Configure Tidy
|
|
||||||
// The flags tell Tidy to output XML and disable showing warnings
|
|
||||||
bool configSuccess = tidyOptSetBool(tidyDoc, TidyXmlOut, yes)
|
|
||||||
&& tidyOptSetBool(tidyDoc, TidyQuiet, yes)
|
|
||||||
&& tidyOptSetBool(tidyDoc, TidyNumEntities, yes)
|
|
||||||
&& tidyOptSetBool(tidyDoc, TidyShowWarnings, no);
|
|
||||||
|
|
||||||
int tidyResponseCode = -1;
|
|
||||||
|
|
||||||
// Parse input
|
|
||||||
if (configSuccess) {
|
|
||||||
tidyResponseCode = tidyParseString(tidyDoc, html.c_str());
|
|
||||||
}
|
|
||||||
|
|
||||||
// Process HTML
|
|
||||||
if (tidyResponseCode >= 0)
|
|
||||||
tidyResponseCode = tidyCleanAndRepair(tidyDoc);
|
|
||||||
|
|
||||||
if (tidyResponseCode >= 0)
|
|
||||||
tidyResponseCode = tidyRunDiagnostics(tidyDoc);
|
|
||||||
if (tidyResponseCode > 1)
|
|
||||||
tidyResponseCode = (tidyOptSetBool(tidyDoc, TidyForceOutput, yes) ? tidyResponseCode : -1);
|
|
||||||
|
|
||||||
// Output the HTML to our buffer
|
|
||||||
if (tidyResponseCode >= 0)
|
|
||||||
tidyResponseCode = tidySaveBuffer(tidyDoc, &tidyOutputBuffer);
|
|
||||||
|
|
||||||
// Any errors from Tidy?
|
|
||||||
if (tidyResponseCode < 0)
|
|
||||||
throw ("Tidy encountered an error while parsing an HTML response. Tidy response code: " + tidyResponseCode);
|
|
||||||
|
|
||||||
// Grab the result from the buffer and then free Tidy's memory
|
|
||||||
std::string tidyResult = (char*)tidyOutputBuffer.bp;
|
|
||||||
tidyBufFree(&tidyOutputBuffer);
|
|
||||||
tidyRelease(tidyDoc);
|
|
||||||
|
|
||||||
return tidyResult;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool isHttps (const std::string& parUrl) {
|
bool isHttps (const std::string& parUrl) {
|
||||||
const char protocol[] = "https://";
|
const char protocol[] = "https://";
|
||||||
const size_t protocolLen = sizeof(protocol) / sizeof(protocol[0]) - 1;
|
const size_t protocolLen = sizeof(protocol) / sizeof(protocol[0]) - 1;
|
||||||
|
@ -84,6 +38,52 @@ namespace duck {
|
||||||
}
|
}
|
||||||
} //unnamed namespace
|
} //unnamed namespace
|
||||||
|
|
||||||
|
std::string cleanHTML (std::string&& html) {
|
||||||
|
dropScriptTags(html);
|
||||||
|
|
||||||
|
// Initialize a Tidy document
|
||||||
|
TidyDoc tidyDoc = tidyCreate();
|
||||||
|
TidyBuffer tidyOutputBuffer = {nullptr, nullptr, 0, 0, 0};
|
||||||
|
|
||||||
|
// Configure Tidy
|
||||||
|
// The flags tell Tidy to output XML and disable showing warnings
|
||||||
|
bool configSuccess = tidyOptSetBool(tidyDoc, TidyXmlOut, yes)
|
||||||
|
&& tidyOptSetBool(tidyDoc, TidyQuiet, yes)
|
||||||
|
&& tidyOptSetBool(tidyDoc, TidyNumEntities, yes)
|
||||||
|
&& tidyOptSetBool(tidyDoc, TidyShowWarnings, no);
|
||||||
|
|
||||||
|
int tidyResponseCode = -1;
|
||||||
|
|
||||||
|
// Parse input
|
||||||
|
if (configSuccess) {
|
||||||
|
tidyResponseCode = tidyParseString(tidyDoc, html.c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Process HTML
|
||||||
|
if (tidyResponseCode >= 0)
|
||||||
|
tidyResponseCode = tidyCleanAndRepair(tidyDoc);
|
||||||
|
|
||||||
|
if (tidyResponseCode >= 0)
|
||||||
|
tidyResponseCode = tidyRunDiagnostics(tidyDoc);
|
||||||
|
if (tidyResponseCode > 1)
|
||||||
|
tidyResponseCode = (tidyOptSetBool(tidyDoc, TidyForceOutput, yes) ? tidyResponseCode : -1);
|
||||||
|
|
||||||
|
// Output the HTML to our buffer
|
||||||
|
if (tidyResponseCode >= 0)
|
||||||
|
tidyResponseCode = tidySaveBuffer(tidyDoc, &tidyOutputBuffer);
|
||||||
|
|
||||||
|
// Any errors from Tidy?
|
||||||
|
if (tidyResponseCode < 0)
|
||||||
|
throw ("Tidy encountered an error while parsing an HTML response. Tidy response code: " + tidyResponseCode);
|
||||||
|
|
||||||
|
// Grab the result from the buffer and then free Tidy's memory
|
||||||
|
std::string tidyResult = (char*)tidyOutputBuffer.bp;
|
||||||
|
tidyBufFree(&tidyOutputBuffer);
|
||||||
|
tidyRelease(tidyDoc);
|
||||||
|
|
||||||
|
return tidyResult;
|
||||||
|
}
|
||||||
|
|
||||||
std::string getCleanHtml (const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw) {
|
std::string getCleanHtml (const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw) {
|
||||||
using curl::curl_easy;
|
using curl::curl_easy;
|
||||||
using curl::curl_pair;
|
using curl::curl_pair;
|
||||||
|
|
|
@ -8,6 +8,8 @@ namespace duck {
|
||||||
using DumpRawFunc = std::function<void(const std::string&)>;
|
using DumpRawFunc = std::function<void(const std::string&)>;
|
||||||
std::string getCleanHtml ( const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw=DumpRawFunc() );
|
std::string getCleanHtml ( const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw=DumpRawFunc() );
|
||||||
std::string getCleanHtml ( const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw=DumpRawFunc() );
|
std::string getCleanHtml ( const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw=DumpRawFunc() );
|
||||||
|
|
||||||
|
std::string cleanHTML ( std::string&& html );
|
||||||
} //namespace duck
|
} //namespace duck
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
39
src/main.cpp
39
src/main.cpp
|
@ -10,6 +10,8 @@
|
||||||
#include <boost/program_options.hpp>
|
#include <boost/program_options.hpp>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <functional>
|
#include <functional>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <iterator>
|
||||||
|
|
||||||
#define STRINGIZE_IMPL(s) #s
|
#define STRINGIZE_IMPL(s) #s
|
||||||
#define STRINGIZE(s) STRINGIZE_IMPL(s)
|
#define STRINGIZE(s) STRINGIZE_IMPL(s)
|
||||||
|
@ -83,6 +85,32 @@ namespace {
|
||||||
std::ostream* const os = (use_stdout ? &std::cout : ofs.get());
|
std::ostream* const os = (use_stdout ? &std::cout : ofs.get());
|
||||||
*os << parData;
|
*os << parData;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::string getCleanHtml (const std::string& parUrl, const po::variables_map& parVarMap) {
|
||||||
|
std::string tidyHtml;
|
||||||
|
|
||||||
|
if (isatty(fileno(stdin))) {
|
||||||
|
tidyHtml = duck::getCleanHtml(
|
||||||
|
parUrl,
|
||||||
|
false,
|
||||||
|
false,
|
||||||
|
(parVarMap.count("dump-raw") ?
|
||||||
|
std::bind(&dump_string, parVarMap["dump-raw"].as<std::string>(), std::placeholders::_1)
|
||||||
|
:
|
||||||
|
duck::DumpRawFunc()
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
std::cin >> std::noskipws;
|
||||||
|
std::istream_iterator<char> it(std::cin);
|
||||||
|
std::istream_iterator<char> end;
|
||||||
|
std::string results(it, end);
|
||||||
|
tidyHtml = duck::cleanHTML(std::move(results));
|
||||||
|
}
|
||||||
|
|
||||||
|
return std::move(tidyHtml);
|
||||||
|
}
|
||||||
} //unnamed namespace
|
} //unnamed namespace
|
||||||
|
|
||||||
int main (int argc, char* argv[]) {
|
int main (int argc, char* argv[]) {
|
||||||
|
@ -97,16 +125,7 @@ int main (int argc, char* argv[]) {
|
||||||
std::cout << "XPath: " << xpath << std::endl;
|
std::cout << "XPath: " << xpath << std::endl;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
const std::string tidyHtml = duck::getCleanHtml(
|
const std::string tidyHtml(getCleanHtml(url, vm));
|
||||||
url,
|
|
||||||
false,
|
|
||||||
false,
|
|
||||||
(vm.count("dump-raw") ?
|
|
||||||
std::bind(&dump_string, vm["dump-raw"].as<std::string>(), std::placeholders::_1)
|
|
||||||
:
|
|
||||||
duck::DumpRawFunc()
|
|
||||||
)
|
|
||||||
);
|
|
||||||
if (vm.count("dump")) {
|
if (vm.count("dump")) {
|
||||||
dump_string(vm["dump"].as<std::string>(), tidyHtml);
|
dump_string(vm["dump"].as<std::string>(), tidyHtml);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Reference in a new issue