diff --git a/src/htmlretrieve.cpp b/src/htmlretrieve.cpp
index b560d4f..73e0487 100644
--- a/src/htmlretrieve.cpp
+++ b/src/htmlretrieve.cpp
@@ -10,6 +10,7 @@
#include
#include
#include
+#include
namespace duck {
namespace {
@@ -83,7 +84,7 @@ namespace duck {
}
} //unnamed namespace
- std::string getCleanHtml (const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost) {
+ std::string getCleanHtml (const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw) {
using curl::curl_easy;
using curl::curl_pair;
@@ -106,10 +107,15 @@ namespace duck {
//err.print_traceback();
//return 1;
//}
- return cleanHTML(oss.str());
+
+ std::string raw_data(oss.str());
+ if (parDumpRaw) {
+ parDumpRaw(raw_data);
+ }
+ return cleanHTML(std::move(raw_data));
}
- std::string getCleanHtml (const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost) {
- return getCleanHtml(parSource, std::string(DEFAULT_USER_AGENT), parSslVerifyPeer, parSslVerifyHost);
+ // Convenience overload: supplies DEFAULT_USER_AGENT as the user agent and
+ // forwards every other argument to the five-parameter getCleanHtml().
+ std::string getCleanHtml (const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw) {
+     // parDumpRaw is a by-value copy owned by this call; move it on instead of copying it again.
+     return getCleanHtml(parSource, std::string(DEFAULT_USER_AGENT), parSslVerifyPeer, parSslVerifyHost, std::move(parDumpRaw));
}
} //namespace duck
diff --git a/src/htmlretrieve.hpp b/src/htmlretrieve.hpp
index 08d48a2..e244b46 100644
--- a/src/htmlretrieve.hpp
+++ b/src/htmlretrieve.hpp
@@ -2,10 +2,12 @@
#define idC6776D903059465191FFB64FCFD6B86A
#include
+#include
namespace duck {
- std::string getCleanHtml ( const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost );
- std::string getCleanHtml ( const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost );
+ // Optional callback that receives the retrieved data, as a std::string, before it is cleaned.
+ // NOTE(review): the std::function signature is missing in this listing; the
+ // implementation calls it with a single std::string argument - confirm the exact type.
+ using DumpRawFunc = std::function;
+ // Returns the result of cleaning the data retrieved for parSource.
+ // parDumpRaw is only invoked when it is non-empty; the default (empty) value disables the dump.
+ // NOTE(review): parSslVerifyPeer/parSslVerifyHost presumably toggle curl's SSL verification - confirm.
+ std::string getCleanHtml ( const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw=DumpRawFunc() );
+ // Same as above, with DEFAULT_USER_AGENT used as the user agent.
+ std::string getCleanHtml ( const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw=DumpRawFunc() );
} //namespace duck
#endif
diff --git a/src/main.cpp b/src/main.cpp
index 8b254f3..f395f12 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -4,10 +4,12 @@
#include
#include
#include
+#include
#include
#include
#include
-#include
+#include
+#include
#define STRINGIZE_IMPL(s) #s
#define STRINGIZE(s) STRINGIZE_IMPL(s)
@@ -26,11 +28,13 @@ namespace {
#endif
;
- bool parse_commandline (int parArgc, char* parArgv[], po::variables_map parVarMap) {
+ bool parse_commandline (int parArgc, char* parArgv[], po::variables_map& parVarMap) {
po::options_description desc("General");
desc.add_options()
("help,h", "Produces this help message")
("version", "Prints the program's version and quits")
+ ("dump,d", po::value(), "Cleans the retrieved html and saves it to the named file; use - for stdout")
+ ("dump-raw,D", po::value(), "Saves the retrieved html to the named file; use - for stdout")
;
po::options_description positional_options("Positional options");
positional_options.add_options()
@@ -69,6 +73,16 @@ namespace {
}
return false;
}
+
+ // Writes parData verbatim to parPathDest; the special path "-" selects
+ // standard output instead of a file.
+ // A destination that cannot be opened is reported on stderr and skipped, so
+ // a failed dump is visible to the user but does not abort the program.
+ void dump_string (const std::string& parPathDest, const std::string& parData) {
+     if ("-" == parPathDest) {
+         std::cout << parData;
+         return;
+     }
+     // A local stream is sufficient: the file is closed when ofs goes out of scope.
+     std::ofstream ofs(parPathDest);
+     if (not ofs) {
+         std::cerr << "Unable to open \"" << parPathDest << "\" for writing\n";
+         return;
+     }
+     ofs << parData;
+ }
} //unnamed namespace
int main (int argc, char* argv[]) {
@@ -83,7 +97,19 @@ int main (int argc, char* argv[]) {
std::cout << "XPath: " << xpath << std::endl;
#endif
- std::string tidyHtml = duck::getCleanHtml(vm["input-url"].as(), false, false);
+ const std::string tidyHtml = duck::getCleanHtml(
+ url,
+ false,
+ false,
+ (vm.count("dump-raw") ?
+ std::bind(&dump_string, vm["dump-raw"].as(), std::placeholders::_1)
+ :
+ duck::DumpRawFunc()
+ )
+ );
+ if (vm.count("dump")) {
+ dump_string(vm["dump"].as(), tidyHtml);
+ }
{
pugi::xml_document doc;