Add dump parameters.
Allows dumping both the raw and the cleaned-up HTML.
This commit is contained in:
parent
00150938dd
commit
943e760ffd
3 changed files with 43 additions and 9 deletions
|
@ -10,6 +10,7 @@
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
|
#include <utility>
|
||||||
|
|
||||||
namespace duck {
|
namespace duck {
|
||||||
namespace {
|
namespace {
|
||||||
|
@ -83,7 +84,7 @@ namespace duck {
|
||||||
}
|
}
|
||||||
} //unnamed namespace
|
} //unnamed namespace
|
||||||
|
|
||||||
std::string getCleanHtml (const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost) {
|
std::string getCleanHtml (const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw) {
|
||||||
using curl::curl_easy;
|
using curl::curl_easy;
|
||||||
using curl::curl_pair;
|
using curl::curl_pair;
|
||||||
|
|
||||||
|
@ -106,10 +107,15 @@ namespace duck {
|
||||||
//err.print_traceback();
|
//err.print_traceback();
|
||||||
//return 1;
|
//return 1;
|
||||||
//}
|
//}
|
||||||
return cleanHTML(oss.str());
|
|
||||||
|
std::string raw_data(oss.str());
|
||||||
|
if (parDumpRaw) {
|
||||||
|
parDumpRaw(raw_data);
|
||||||
|
}
|
||||||
|
return cleanHTML(std::move(raw_data));
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string getCleanHtml (const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost) {
|
std::string getCleanHtml (const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw) {
|
||||||
return getCleanHtml(parSource, std::string(DEFAULT_USER_AGENT), parSslVerifyPeer, parSslVerifyHost);
|
return getCleanHtml(parSource, std::string(DEFAULT_USER_AGENT), parSslVerifyPeer, parSslVerifyHost, parDumpRaw);
|
||||||
}
|
}
|
||||||
} //namespace duck
|
} //namespace duck
|
||||||
|
|
|
@ -2,10 +2,12 @@
|
||||||
#define idC6776D903059465191FFB64FCFD6B86A
|
#define idC6776D903059465191FFB64FCFD6B86A
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <functional>
|
||||||
|
|
||||||
namespace duck {
|
namespace duck {
|
||||||
std::string getCleanHtml ( const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost );
|
using DumpRawFunc = std::function<void(const std::string&)>;
|
||||||
std::string getCleanHtml ( const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost );
|
std::string getCleanHtml ( const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw=DumpRawFunc() );
|
||||||
|
std::string getCleanHtml ( const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw=DumpRawFunc() );
|
||||||
} //namespace duck
|
} //namespace duck
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
32
src/main.cpp
32
src/main.cpp
|
@ -4,10 +4,12 @@
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <pugixml.hpp>
|
#include <pugixml.hpp>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
|
#include <fstream>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
#include <ciso646>
|
#include <ciso646>
|
||||||
#include <boost/program_options.hpp>
|
#include <boost/program_options.hpp>
|
||||||
#include <unistd.h>
|
#include <memory>
|
||||||
|
#include <functional>
|
||||||
|
|
||||||
#define STRINGIZE_IMPL(s) #s
|
#define STRINGIZE_IMPL(s) #s
|
||||||
#define STRINGIZE(s) STRINGIZE_IMPL(s)
|
#define STRINGIZE(s) STRINGIZE_IMPL(s)
|
||||||
|
@ -26,11 +28,13 @@ namespace {
|
||||||
#endif
|
#endif
|
||||||
;
|
;
|
||||||
|
|
||||||
bool parse_commandline (int parArgc, char* parArgv[], po::variables_map parVarMap) {
|
bool parse_commandline (int parArgc, char* parArgv[], po::variables_map& parVarMap) {
|
||||||
po::options_description desc("General");
|
po::options_description desc("General");
|
||||||
desc.add_options()
|
desc.add_options()
|
||||||
("help,h", "Produces this help message")
|
("help,h", "Produces this help message")
|
||||||
("version", "Prints the program's version and quits")
|
("version", "Prints the program's version and quits")
|
||||||
|
("dump,d", po::value<std::string>(), "Cleans the retrieved html and saves it to the named file; use - for stdout")
|
||||||
|
("dump-raw,D", po::value<std::string>(), "Saves the retrieved html to the named file; use - for stdout")
|
||||||
;
|
;
|
||||||
po::options_description positional_options("Positional options");
|
po::options_description positional_options("Positional options");
|
||||||
positional_options.add_options()
|
positional_options.add_options()
|
||||||
|
@ -69,6 +73,16 @@ namespace {
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Writes parData verbatim to the destination named by parPathDest.
//
// parPathDest: path of the file to create/truncate, or "-" to write to
//              standard output instead of a file.
// parData:     the text to dump; it is written as-is, with no trailing
//              newline added.
//
// Failures are reported on standard error and never thrown: dumping is a
// diagnostic aid, so a bad path must not abort the rest of the program.
void dump_string (const std::string& parPathDest, const std::string& parData) {
	if ("-" == parPathDest) {
		std::cout << parData;
		return;
	}

	// A stack-allocated stream is enough here; it is closed (and flushed)
	// automatically when it goes out of scope.
	std::ofstream ofs(parPathDest);
	if (not ofs) {
		// Without this check the write below would silently do nothing.
		std::cerr << "Unable to open \"" << parPathDest << "\" for writing\n";
		return;
	}

	ofs << parData;
	ofs.flush();
	if (not ofs) {
		std::cerr << "Error while writing to \"" << parPathDest << "\"\n";
	}
}
|
||||||
} //unnamed namespace
|
} //unnamed namespace
|
||||||
|
|
||||||
int main (int argc, char* argv[]) {
|
int main (int argc, char* argv[]) {
|
||||||
|
@ -83,7 +97,19 @@ int main (int argc, char* argv[]) {
|
||||||
std::cout << "XPath: " << xpath << std::endl;
|
std::cout << "XPath: " << xpath << std::endl;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
std::string tidyHtml = duck::getCleanHtml(vm["input-url"].as<std::string>(), false, false);
|
const std::string tidyHtml = duck::getCleanHtml(
|
||||||
|
url,
|
||||||
|
false,
|
||||||
|
false,
|
||||||
|
(vm.count("dump-raw") ?
|
||||||
|
std::bind(&dump_string, vm["dump-raw"].as<std::string>(), std::placeholders::_1)
|
||||||
|
:
|
||||||
|
duck::DumpRawFunc()
|
||||||
|
)
|
||||||
|
);
|
||||||
|
if (vm.count("dump")) {
|
||||||
|
dump_string(vm["dump"].as<std::string>(), tidyHtml);
|
||||||
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
pugi::xml_document doc;
|
pugi::xml_document doc;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue