Don't detect if it's a tty - only read from stdin when url is -
This commit is contained in:
parent
db1311839d
commit
c304ffbbf0
3 changed files with 41 additions and 70 deletions
|
@ -1,5 +1,4 @@
|
||||||
#include "htmlretrieve.hpp"
|
#include "htmlretrieve.hpp"
|
||||||
#include "duckscraperConfig.h"
|
|
||||||
#include <ciso646>
|
#include <ciso646>
|
||||||
#include <tidy.h>
|
#include <tidy.h>
|
||||||
#include <tidybuffio.h>
|
#include <tidybuffio.h>
|
||||||
|
@ -38,7 +37,7 @@ namespace duck {
|
||||||
}
|
}
|
||||||
} //unnamed namespace
|
} //unnamed namespace
|
||||||
|
|
||||||
std::string cleanHTML (std::string&& html) {
|
std::string clean_html (std::string&& html) {
|
||||||
dropScriptTags(html);
|
dropScriptTags(html);
|
||||||
|
|
||||||
// Initialize a Tidy document
|
// Initialize a Tidy document
|
||||||
|
@ -50,7 +49,8 @@ namespace duck {
|
||||||
bool configSuccess = tidyOptSetBool(tidyDoc, TidyXmlOut, yes)
|
bool configSuccess = tidyOptSetBool(tidyDoc, TidyXmlOut, yes)
|
||||||
&& tidyOptSetBool(tidyDoc, TidyQuiet, yes)
|
&& tidyOptSetBool(tidyDoc, TidyQuiet, yes)
|
||||||
&& tidyOptSetBool(tidyDoc, TidyNumEntities, yes)
|
&& tidyOptSetBool(tidyDoc, TidyNumEntities, yes)
|
||||||
&& tidyOptSetBool(tidyDoc, TidyShowWarnings, no);
|
&& tidyOptSetBool(tidyDoc, TidyShowWarnings, no)
|
||||||
|
&& tidyOptSetBool(tidyDoc, TidyDropPropAttrs, yes);
|
||||||
|
|
||||||
int tidyResponseCode = -1;
|
int tidyResponseCode = -1;
|
||||||
|
|
||||||
|
@ -84,7 +84,8 @@ namespace duck {
|
||||||
return tidyResult;
|
return tidyResult;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string getCleanHtml (const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw) {
|
|
||||||
|
std::string fetch_html (const std::string& parSource, std::string parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost) {
|
||||||
using curl::curl_easy;
|
using curl::curl_easy;
|
||||||
using curl::curl_pair;
|
using curl::curl_pair;
|
||||||
|
|
||||||
|
@ -108,14 +109,6 @@ namespace duck {
|
||||||
//return 1;
|
//return 1;
|
||||||
//}
|
//}
|
||||||
|
|
||||||
std::string raw_data(oss.str());
|
return oss.str();
|
||||||
if (parDumpRaw) {
|
|
||||||
parDumpRaw(raw_data);
|
|
||||||
}
|
|
||||||
return cleanHTML(std::move(raw_data));
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string getCleanHtml (const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw) {
|
|
||||||
return getCleanHtml(parSource, std::string(DEFAULT_USER_AGENT), parSslVerifyPeer, parSslVerifyHost, parDumpRaw);
|
|
||||||
}
|
}
|
||||||
} //namespace duck
|
} //namespace duck
|
||||||
|
|
|
@ -2,14 +2,10 @@
|
||||||
#define idC6776D903059465191FFB64FCFD6B86A
|
#define idC6776D903059465191FFB64FCFD6B86A
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <functional>
|
|
||||||
|
|
||||||
namespace duck {
|
namespace duck {
|
||||||
using DumpRawFunc = std::function<void(const std::string&)>;
|
std::string fetch_html ( const std::string& parSource, std::string parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost );
|
||||||
std::string getCleanHtml ( const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw=DumpRawFunc() );
|
std::string clean_html ( std::string&& html );
|
||||||
std::string getCleanHtml ( const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw=DumpRawFunc() );
|
|
||||||
|
|
||||||
std::string cleanHTML ( std::string&& html );
|
|
||||||
} //namespace duck
|
} //namespace duck
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
84
src/main.cpp
84
src/main.cpp
|
@ -10,9 +10,8 @@
|
||||||
#include <boost/program_options.hpp>
|
#include <boost/program_options.hpp>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <functional>
|
#include <functional>
|
||||||
#include <unistd.h>
|
|
||||||
#include <sys/stat.h>
|
|
||||||
#include <iterator>
|
#include <iterator>
|
||||||
|
#include <stdexcept>
|
||||||
|
|
||||||
#define STRINGIZE_IMPL(s) #s
|
#define STRINGIZE_IMPL(s) #s
|
||||||
#define STRINGIZE(s) STRINGIZE_IMPL(s)
|
#define STRINGIZE(s) STRINGIZE_IMPL(s)
|
||||||
|
@ -55,6 +54,7 @@ namespace {
|
||||||
po::options_description visible("Available options");
|
po::options_description visible("Available options");
|
||||||
visible.add(desc);
|
visible.add(desc);
|
||||||
std::cout << "Usage: " << PROGRAM_NAME << " [options...] <url> <xpath>\n";
|
std::cout << "Usage: " << PROGRAM_NAME << " [options...] <url> <xpath>\n";
|
||||||
|
std::cout << "You can pass - as the url to read from stdin\n";
|
||||||
std::cout << visible;
|
std::cout << visible;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -65,14 +65,10 @@ namespace {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (parVarMap.count("input-url") == 0) {
|
if (parVarMap.count("input-url") == 0) {
|
||||||
std::cerr << "No input url specified, use --help for help" << std::endl;
|
throw std::invalid_argument("No input URL specified");
|
||||||
//return 2;
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
if (parVarMap.count("xpath") == 0) {
|
if (parVarMap.count("xpath") == 0) {
|
||||||
std::cerr << "No XPath expression specified, use --help for help" << std::endl;
|
throw std::invalid_argument("No XPath expression specified");
|
||||||
//return 2;
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -86,50 +82,20 @@ namespace {
|
||||||
std::ostream* const os = (use_stdout ? &std::cout : ofs.get());
|
std::ostream* const os = (use_stdout ? &std::cout : ofs.get());
|
||||||
*os << parData;
|
*os << parData;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string getCleanHtml (const std::string& parUrl, const po::variables_map& parVarMap) {
|
|
||||||
std::string tidyHtml;
|
|
||||||
struct stat stats;
|
|
||||||
|
|
||||||
const int r = fstat(fileno(stdin), &stats);
|
|
||||||
bool interactive = true;
|
|
||||||
if (r < 0) {
|
|
||||||
//TODO: error
|
|
||||||
interactive = false;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
interactive = static_cast<bool>(S_ISCHR(stats.st_mode));
|
|
||||||
}
|
|
||||||
|
|
||||||
if (interactive) {
|
|
||||||
tidyHtml = duck::getCleanHtml(
|
|
||||||
parUrl,
|
|
||||||
false,
|
|
||||||
false,
|
|
||||||
(parVarMap.count("dump-raw") ?
|
|
||||||
std::bind(&dump_string, parVarMap["dump-raw"].as<std::string>(), std::placeholders::_1)
|
|
||||||
:
|
|
||||||
duck::DumpRawFunc()
|
|
||||||
)
|
|
||||||
);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
std::cin >> std::noskipws;
|
|
||||||
std::istream_iterator<char> it(std::cin);
|
|
||||||
std::istream_iterator<char> end;
|
|
||||||
std::string results(it, end);
|
|
||||||
tidyHtml = duck::cleanHTML(std::move(results));
|
|
||||||
}
|
|
||||||
|
|
||||||
return std::move(tidyHtml);
|
|
||||||
}
|
|
||||||
} //unnamed namespace
|
} //unnamed namespace
|
||||||
|
|
||||||
int main (int argc, char* argv[]) {
|
int main (int argc, char* argv[]) {
|
||||||
po::variables_map vm;
|
po::variables_map vm;
|
||||||
if (parse_commandline(argc, argv, vm)) {
|
try {
|
||||||
return 0;
|
if (parse_commandline(argc, argv, vm)) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
catch (const std::invalid_argument& err) {
|
||||||
|
std::cerr << err.what() << "\nUse --help for help" << std::endl;
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
|
||||||
const auto url = vm["input-url"].as<std::string>();
|
const auto url = vm["input-url"].as<std::string>();
|
||||||
const auto xpath = vm["xpath"].as<std::string>();
|
const auto xpath = vm["xpath"].as<std::string>();
|
||||||
#if !defined(NDEBUG)
|
#if !defined(NDEBUG)
|
||||||
|
@ -137,17 +103,33 @@ int main (int argc, char* argv[]) {
|
||||||
std::cout << "XPath: " << xpath << std::endl;
|
std::cout << "XPath: " << xpath << std::endl;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
const std::string tidyHtml(getCleanHtml(url, vm));
|
std::string html;
|
||||||
|
|
||||||
|
if ("-" != url) {
|
||||||
|
html = duck::fetch_html(url, DEFAULT_USER_AGENT, false, false);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
std::cin >> std::noskipws;
|
||||||
|
std::istream_iterator<char> it(std::cin);
|
||||||
|
std::istream_iterator<char> end;
|
||||||
|
html = std::string(it, end);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (vm.count("dump-raw")) {
|
||||||
|
dump_string(vm["dump-raw"].as<std::string>(), html);
|
||||||
|
}
|
||||||
|
|
||||||
|
html = duck::clean_html(std::move(html));
|
||||||
if (vm.count("dump")) {
|
if (vm.count("dump")) {
|
||||||
dump_string(vm["dump"].as<std::string>(), tidyHtml);
|
dump_string(vm["dump"].as<std::string>(), html);
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
pugi::xml_document doc;
|
pugi::xml_document doc;
|
||||||
std::istringstream iss(tidyHtml);
|
std::istringstream iss(html);
|
||||||
pugi::xml_parse_result result(doc.load(iss));
|
pugi::xml_parse_result result(doc.load(iss));
|
||||||
if (not result) {
|
if (not result) {
|
||||||
auto line_col = line_col_from_offset(result.offset, tidyHtml);
|
auto line_col = line_col_from_offset(result.offset, html);
|
||||||
std::cerr << "Error parsing the source XML at line " <<
|
std::cerr << "Error parsing the source XML at line " <<
|
||||||
line_col.first << " col " << line_col.second << ":\n" <<
|
line_col.first << " col " << line_col.second << ":\n" <<
|
||||||
result.description() << std::endl;
|
result.description() << std::endl;
|
||||||
|
|
Loading…
Reference in a new issue