duckscraper/src/main.cpp
King_DuckZ 943e760ffd Add dump parameters.
Allows dumping both the raw and the cleaned-up HTML.
2015-09-28 23:24:23 +02:00

158 lines
4.5 KiB
C++

#include "htmlretrieve.hpp"
#include "duckscraperConfig.h"
#include <ciso646>
#include <cstddef>
#include <exception>
#include <fstream>
#include <functional>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include <utility>
#include <boost/program_options.hpp>
#include <pugixml.hpp>
// Two-step stringification: the extra level makes the preprocessor expand the
// argument (for example a VERSION_* macro) before # turns it into a string
// literal.
#define STRINGIZE_IMPL(s) #s
#define STRINGIZE(s) STRINGIZE_IMPL(s)
// Short alias for the Boost.Program_options namespace.
namespace po = boost::program_options;
namespace {
// (line, column) pair, as returned by line_col_from_offset().
typedef std::pair<int, int> LineColType;
// Forward declaration; the function is defined further down in this file.
LineColType line_col_from_offset ( ptrdiff_t parOffset, const std::string& parData );
// Version banner assembled at compile time from adjacent string literals:
// "<PROGRAM_NAME> v<major>.<minor>", followed by "b" when VERSION_BETA is
// non-zero. PROGRAM_NAME and the VERSION_* macros are not defined in this file
// (presumably they come from duckscraperConfig.h - confirm).
const char* const g_version_string =
PROGRAM_NAME " v" STRINGIZE(VERSION_MAJOR) "." STRINGIZE(VERSION_MINOR)
#if VERSION_BETA
"b"
#endif
;
bool parse_commandline (int parArgc, char* parArgv[], po::variables_map& parVarMap) {
po::options_description desc("General");
desc.add_options()
("help,h", "Produces this help message")
("version", "Prints the program's version and quits")
("dump,d", po::value<std::string>(), "Cleans the retrieved html and saves it to the named file; use - for stdout")
("dump-raw,D", po::value<std::string>(), "Saves the retrieved html to the named file; use - for stdout")
;
po::options_description positional_options("Positional options");
positional_options.add_options()
("input-url", po::value<std::string>(), "Input URL")
("xpath", po::value<std::string>(), "XPath expression")
;
po::options_description all("Available options");
all.add(desc).add(positional_options);
po::positional_options_description pd;
pd.add("input-url", 1).add("xpath", 1);
po::store(po::command_line_parser(parArgc, parArgv).options(all).positional(pd).run(), parVarMap);
po::notify(parVarMap);
if (parVarMap.count("help")) {
po::options_description visible("Available options");
visible.add(desc);
std::cout << "Usage: " << PROGRAM_NAME << " [options...] <url> <xpath>\n";
std::cout << visible;
return true;
}
else if (parVarMap.count("version")) {
std::cout << g_version_string;
std::cout << " git revision " << VERSION_GIT << "\n";
return true;
}
if (parVarMap.count("input-url") == 0) {
std::cerr << "No input url specified, use --help for help" << std::endl;
//return 2;
return true;
}
if (parVarMap.count("xpath") == 0) {
std::cerr << "No XPath expression specified, use --help for help" << std::endl;
//return 2;
return true;
}
return false;
}
/// Writes parData to the destination named by parPathDest.
///
/// Failures are reported on stderr rather than thrown, so a failed dump does
/// not abort the caller.
///
/// @param parPathDest path of the file to (over)write, or "-" for stdout
/// @param parData     the text to write
void dump_string (const std::string& parPathDest, const std::string& parData) {
    if ("-" == parPathDest) {
        std::cout << parData;
        return;
    }

    std::ofstream ofs(parPathDest);
    if (not ofs) {
        std::cerr << "Unable to open \"" << parPathDest << "\" for writing" << std::endl;
        return;
    }

    ofs << parData;
    // Flush explicitly so that a write error is detected here instead of
    // being lost in the destructor.
    ofs.flush();
    if (not ofs) {
        std::cerr << "Error writing to \"" << parPathDest << "\"" << std::endl;
    }
}
} //unnamed namespace
int main (int argc, char* argv[]) {
po::variables_map vm;
if (parse_commandline(argc, argv, vm)) {
return 0;
}
const auto url = vm["input-url"].as<std::string>();
const auto xpath = vm["xpath"].as<std::string>();
#if !defined(NDEBUG)
std::cout << "URL : " << url << "\n";
std::cout << "XPath: " << xpath << std::endl;
#endif
const std::string tidyHtml = duck::getCleanHtml(
url,
false,
false,
(vm.count("dump-raw") ?
std::bind(&dump_string, vm["dump-raw"].as<std::string>(), std::placeholders::_1)
:
duck::DumpRawFunc()
)
);
if (vm.count("dump")) {
dump_string(vm["dump"].as<std::string>(), tidyHtml);
}
{
pugi::xml_document doc;
std::istringstream iss(tidyHtml);
pugi::xml_parse_result result(doc.load(iss));
if (not result) {
auto line_col = line_col_from_offset(result.offset, tidyHtml);
std::cerr << "Error parsing the source XML at line " <<
line_col.first << " col " << line_col.second << ":\n" <<
result.description() << std::endl;
return 1;
}
pugi::xpath_node_set xpathRes = doc.select_nodes(xpath.c_str());
for (pugi::xpath_node_set::const_iterator itFind(xpathRes.begin()), itFindEND(xpathRes.end()); itFind != itFindEND; ++itFind) {
const pugi::xpath_node& node = *itFind;
if (node.node()) {
std::cout << node.node().name() << ": " << node.node().value() << "\n";
}
else if (node.attribute()) {
std::cout << node.attribute().name() << ": " << node.attribute().value() << "\n";
}
}
}
return 0;
}
namespace {
/// Translates a character offset inside parData into a 1-based
/// (line, column) position.
///
/// A '\n' ends the current line: the character after it is at column 1 of
/// the next line. Offsets past the end of parData are clamped to the end;
/// negative offsets are treated as 0 and yield (1, 1).
///
/// @param parOffset number of characters preceding the wanted position
/// @param parData   the text the offset refers to
/// @return the position as a std::pair<int, int> (the type LineColType
///         aliases), first = line, second = column
std::pair<int, int> line_col_from_offset (ptrdiff_t parOffset, const std::string& parData) {
    // Testing the signed offset for "non-zero" would never stop on a negative
    // value, so clamp it into [0, parData.size()] up front.
    const size_t wanted = (parOffset > 0 ? static_cast<size_t>(parOffset) : 0);
    const size_t stop = (wanted < parData.size() ? wanted : parData.size());

    int line = 1;
    int column = 1;
    for (size_t index = 0; index < stop; ++index) {
        if ('\n' == parData[index]) {
            ++line;
            column = 1;
        }
        else {
            ++column;
        }
    }
    return std::make_pair(line, column);
}
} //unnamed namespace