From 943e760ffd4c2ddd976664bc9a54393554ca131e Mon Sep 17 00:00:00 2001 From: King_DuckZ Date: Mon, 28 Sep 2015 23:24:23 +0200 Subject: [PATCH] Add dump parameters. Allows to dump both raw and cleaned up html. --- src/htmlretrieve.cpp | 14 ++++++++++---- src/htmlretrieve.hpp | 6 ++++-- src/main.cpp | 32 +++++++++++++++++++++++++++++--- 3 files changed, 43 insertions(+), 9 deletions(-) diff --git a/src/htmlretrieve.cpp b/src/htmlretrieve.cpp index b560d4f..73e0487 100644 --- a/src/htmlretrieve.cpp +++ b/src/htmlretrieve.cpp @@ -10,6 +10,7 @@ #include #include #include +#include namespace duck { namespace { @@ -83,7 +84,7 @@ namespace duck { } } //unnamed namespace - std::string getCleanHtml (const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost) { + std::string getCleanHtml (const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw) { using curl::curl_easy; using curl::curl_pair; @@ -106,10 +107,15 @@ namespace duck { //err.print_traceback(); //return 1; //} - return cleanHTML(oss.str()); + + std::string raw_data(oss.str()); + if (parDumpRaw) { + parDumpRaw(raw_data); + } + return cleanHTML(std::move(raw_data)); } - std::string getCleanHtml (const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost) { - return getCleanHtml(parSource, std::string(DEFAULT_USER_AGENT), parSslVerifyPeer, parSslVerifyHost); + std::string getCleanHtml (const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw) { + return getCleanHtml(parSource, std::string(DEFAULT_USER_AGENT), parSslVerifyPeer, parSslVerifyHost, parDumpRaw); } } //namespace duck diff --git a/src/htmlretrieve.hpp b/src/htmlretrieve.hpp index 08d48a2..e244b46 100644 --- a/src/htmlretrieve.hpp +++ b/src/htmlretrieve.hpp @@ -2,10 +2,12 @@ #define idC6776D903059465191FFB64FCFD6B86A #include +#include namespace duck { - std::string getCleanHtml ( const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost ); - std::string getCleanHtml ( const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost ); + using DumpRawFunc = std::function; + std::string getCleanHtml ( const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw=DumpRawFunc() ); + std::string getCleanHtml ( const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw=DumpRawFunc() ); } //namespace duck #endif diff --git a/src/main.cpp b/src/main.cpp index 8b254f3..f395f12 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -4,10 +4,12 @@ #include #include #include +#include #include #include #include -#include +#include +#include #define STRINGIZE_IMPL(s) #s #define STRINGIZE(s) STRINGIZE_IMPL(s) @@ -26,11 +28,13 @@ namespace { #endif ; - bool parse_commandline (int parArgc, char* parArgv[], po::variables_map parVarMap) { + bool parse_commandline (int parArgc, char* parArgv[], po::variables_map& parVarMap) { po::options_description desc("General"); desc.add_options() ("help,h", "Produces this help message") ("version", "Prints the program's version and quits") + ("dump,d", po::value(), "Cleans the retrieved html and saves it to the named file; use - for stdout") + ("dump-raw,D", po::value(), "Saves the retrieved html to the named file; use - for stdout") ; po::options_description positional_options("Positional options"); positional_options.add_options() @@ -69,6 +73,16 @@ namespace { } return false; } + + void dump_string (const std::string& parPathDest, const std::string& parData) { + std::unique_ptr ofs; + const bool use_stdout = ("-" == parPathDest); + if (not use_stdout) { + ofs.reset(new std::ofstream(parPathDest)); + } + std::ostream* const os = (use_stdout ? &std::cout : ofs.get()); + *os << parData; + } } //unnamed namespace int main (int argc, char* argv[]) { @@ -83,7 +97,19 @@ int main (int argc, char* argv[]) { std::cout << "XPath: " << xpath << std::endl; #endif - std::string tidyHtml = duck::getCleanHtml(vm["input-url"].as(), false, false); + const std::string tidyHtml = duck::getCleanHtml( + url, + false, + false, + (vm.count("dump-raw") ? + std::bind(&dump_string, vm["dump-raw"].as(), std::placeholders::_1) + : + duck::DumpRawFunc() + ) + ); + if (vm.count("dump")) { + dump_string(vm["dump"].as(), tidyHtml); + } { pugi::xml_document doc;