158 lines
4.5 KiB
C++
158 lines
4.5 KiB
C++
#include "htmlretrieve.hpp"
|
|
#include "duckscraperConfig.h"
|
|
#include <iostream>
|
|
#include <string>
|
|
#include <pugixml.hpp>
|
|
#include <sstream>
|
|
#include <fstream>
|
|
#include <utility>
|
|
#include <ciso646>
|
|
#include <boost/program_options.hpp>
|
|
#include <memory>
|
|
#include <functional>
|
|
|
|
// Two-step stringification: STRINGIZE(X) first lets the preprocessor expand X
// (if X is itself a macro) and only then turns the result into a string
// literal. STRINGIZE_IMPL alone would quote the macro's name, not its value.
#define STRINGIZE_IMPL(token) #token
#define STRINGIZE(token) STRINGIZE_IMPL(token)
|
|
|
|
namespace po = boost::program_options;
|
|
|
|
namespace {
|
|
// A (line, column) position: `first` is the line, `second` is the column.
using LineColType = std::pair<int, int>;

// Translates a character offset inside parData into a (line, column) position.
// Only declared here; the definition lives further down in this file.
LineColType line_col_from_offset (ptrdiff_t parOffset, const std::string& parData);
|
|
|
|
// Version banner printed by the --version option. It is assembled at compile
// time from adjacent string literals: PROGRAM_NAME, " v", the stringified
// VERSION_MAJOR, ".", the stringified VERSION_MINOR and, only when
// VERSION_BETA evaluates to non-zero, a trailing "b".
// NOTE(review): PROGRAM_NAME and the VERSION_* macros are not defined in this
// file; presumably they come from duckscraperConfig.h - confirm.
const char* const g_version_string =
    PROGRAM_NAME " v" STRINGIZE(VERSION_MAJOR) "." STRINGIZE(VERSION_MINOR)
#if VERSION_BETA
    "b"
#endif
;
|
|
|
|
bool parse_commandline (int parArgc, char* parArgv[], po::variables_map& parVarMap) {
|
|
po::options_description desc("General");
|
|
desc.add_options()
|
|
("help,h", "Produces this help message")
|
|
("version", "Prints the program's version and quits")
|
|
("dump,d", po::value<std::string>(), "Cleans the retrieved html and saves it to the named file; use - for stdout")
|
|
("dump-raw,D", po::value<std::string>(), "Saves the retrieved html to the named file; use - for stdout")
|
|
;
|
|
po::options_description positional_options("Positional options");
|
|
positional_options.add_options()
|
|
("input-url", po::value<std::string>(), "Input URL")
|
|
("xpath", po::value<std::string>(), "XPath expression")
|
|
;
|
|
po::options_description all("Available options");
|
|
all.add(desc).add(positional_options);
|
|
po::positional_options_description pd;
|
|
pd.add("input-url", 1).add("xpath", 1);
|
|
po::store(po::command_line_parser(parArgc, parArgv).options(all).positional(pd).run(), parVarMap);
|
|
po::notify(parVarMap);
|
|
|
|
if (parVarMap.count("help")) {
|
|
po::options_description visible("Available options");
|
|
visible.add(desc);
|
|
std::cout << "Usage: " << PROGRAM_NAME << " [options...] <url> <xpath>\n";
|
|
std::cout << visible;
|
|
return true;
|
|
}
|
|
else if (parVarMap.count("version")) {
|
|
std::cout << g_version_string;
|
|
std::cout << " git revision " << VERSION_GIT << "\n";
|
|
return true;
|
|
}
|
|
|
|
if (parVarMap.count("input-url") == 0) {
|
|
std::cerr << "No input url specified, use --help for help" << std::endl;
|
|
//return 2;
|
|
return true;
|
|
}
|
|
if (parVarMap.count("xpath") == 0) {
|
|
std::cerr << "No XPath expression specified, use --help for help" << std::endl;
|
|
//return 2;
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// Writes parData to the destination named by parPathDest.
//
// parPathDest: path of the file to (over)write, or "-" to write to stdout.
// parData:     the text to write, emitted verbatim.
//
// If the destination file cannot be opened, a message is printed on stderr
// and nothing is written; the function still returns normally so that a
// failed dump does not abort the rest of the program.
void dump_string (const std::string& parPathDest, const std::string& parData) {
    if ("-" == parPathDest) {
        std::cout << parData;
        return;
    }

    std::ofstream ofs(parPathDest);
    if (not ofs) {
        // Without this check the data would be streamed into a failed stream
        // and silently lost.
        std::cerr << "Unable to open \"" << parPathDest << "\" for writing" << std::endl;
        return;
    }
    ofs << parData;
}
|
|
} //unnamed namespace
|
|
|
|
int main (int argc, char* argv[]) {
|
|
po::variables_map vm;
|
|
if (parse_commandline(argc, argv, vm)) {
|
|
return 0;
|
|
}
|
|
const auto url = vm["input-url"].as<std::string>();
|
|
const auto xpath = vm["xpath"].as<std::string>();
|
|
#if !defined(NDEBUG)
|
|
std::cout << "URL : " << url << "\n";
|
|
std::cout << "XPath: " << xpath << std::endl;
|
|
#endif
|
|
|
|
const std::string tidyHtml = duck::getCleanHtml(
|
|
url,
|
|
false,
|
|
false,
|
|
(vm.count("dump-raw") ?
|
|
std::bind(&dump_string, vm["dump-raw"].as<std::string>(), std::placeholders::_1)
|
|
:
|
|
duck::DumpRawFunc()
|
|
)
|
|
);
|
|
if (vm.count("dump")) {
|
|
dump_string(vm["dump"].as<std::string>(), tidyHtml);
|
|
}
|
|
|
|
{
|
|
pugi::xml_document doc;
|
|
std::istringstream iss(tidyHtml);
|
|
pugi::xml_parse_result result(doc.load(iss));
|
|
if (not result) {
|
|
auto line_col = line_col_from_offset(result.offset, tidyHtml);
|
|
std::cerr << "Error parsing the source XML at line " <<
|
|
line_col.first << " col " << line_col.second << ":\n" <<
|
|
result.description() << std::endl;
|
|
return 1;
|
|
}
|
|
|
|
pugi::xpath_node_set xpathRes = doc.select_nodes(xpath.c_str());
|
|
for (pugi::xpath_node_set::const_iterator itFind(xpathRes.begin()), itFindEND(xpathRes.end()); itFind != itFindEND; ++itFind) {
|
|
const pugi::xpath_node& node = *itFind;
|
|
if (node.node()) {
|
|
std::cout << node.node().name() << ": " << node.node().value() << "\n";
|
|
}
|
|
else if (node.attribute()) {
|
|
std::cout << node.attribute().name() << ": " << node.attribute().value() << "\n";
|
|
}
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
namespace {
|
|
// Converts a character offset inside parData into a (line, column) position.
//
// parOffset: number of characters of parData that precede the position of
//            interest. Values below zero are treated as 0 and values past the
//            end of parData are treated as parData.size().
// parData:   the text the offset refers to.
//
// Returns a pair whose `first` is the line and whose `second` is the column,
// both starting at 1. Every '\n' before the position starts a new line and
// resets the column to 1; any other character advances the column by one.
//
// The return type is spelled out in full; it is the very type the file-level
// LineColType alias names.
std::pair<int, int> line_col_from_offset (ptrdiff_t parOffset, const std::string& parData) {
    // Clamp the offset into [0, parData.size()]. A negative offset would
    // otherwise never count down to zero and the scan would run to the end of
    // the text, reporting a position unrelated to the request.
    size_t char_count = 0;
    if (parOffset > 0) {
        char_count = static_cast<size_t>(parOffset);
        if (char_count > parData.size()) {
            char_count = parData.size();
        }
    }

    int line = 1;
    int column = 1;
    for (size_t index = 0; index < char_count; ++index) {
        if (parData[index] == '\n') {
            ++line;
            column = 1;
        }
        else {
            ++column;
        }
    }
    return std::make_pair(line, column);
}
|
|
} //unnamed namespace
|