Don't detect whether stdin is a TTY - only read from stdin when the URL is "-"

This commit is contained in:
King_DuckZ 2015-09-29 21:01:33 +02:00
parent db1311839d
commit c304ffbbf0
3 changed files with 41 additions and 70 deletions

View file

@ -1,5 +1,4 @@
#include "htmlretrieve.hpp"
#include "duckscraperConfig.h"
#include <ciso646>
#include <tidy.h>
#include <tidybuffio.h>
@ -38,7 +37,7 @@ namespace duck {
}
} //unnamed namespace
std::string cleanHTML (std::string&& html) {
std::string clean_html (std::string&& html) {
dropScriptTags(html);
// Initialize a Tidy document
@ -50,7 +49,8 @@ namespace duck {
bool configSuccess = tidyOptSetBool(tidyDoc, TidyXmlOut, yes)
&& tidyOptSetBool(tidyDoc, TidyQuiet, yes)
&& tidyOptSetBool(tidyDoc, TidyNumEntities, yes)
&& tidyOptSetBool(tidyDoc, TidyShowWarnings, no);
&& tidyOptSetBool(tidyDoc, TidyShowWarnings, no)
&& tidyOptSetBool(tidyDoc, TidyDropPropAttrs, yes);
int tidyResponseCode = -1;
@ -84,7 +84,8 @@ namespace duck {
return tidyResult;
}
std::string getCleanHtml (const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw) {
std::string fetch_html (const std::string& parSource, std::string parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost) {
using curl::curl_easy;
using curl::curl_pair;
@ -108,14 +109,6 @@ namespace duck {
//return 1;
//}
std::string raw_data(oss.str());
if (parDumpRaw) {
parDumpRaw(raw_data);
}
return cleanHTML(std::move(raw_data));
}
std::string getCleanHtml (const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw) {
return getCleanHtml(parSource, std::string(DEFAULT_USER_AGENT), parSslVerifyPeer, parSslVerifyHost, parDumpRaw);
return oss.str();
}
} //namespace duck

View file

@ -2,14 +2,10 @@
#define idC6776D903059465191FFB64FCFD6B86A
#include <string>
#include <functional>
namespace duck {
using DumpRawFunc = std::function<void(const std::string&)>;
std::string getCleanHtml ( const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw=DumpRawFunc() );
std::string getCleanHtml ( const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw=DumpRawFunc() );
std::string cleanHTML ( std::string&& html );
std::string fetch_html ( const std::string& parSource, std::string parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost );
std::string clean_html ( std::string&& html );
} //namespace duck
#endif

View file

@ -10,9 +10,8 @@
#include <boost/program_options.hpp>
#include <memory>
#include <functional>
#include <unistd.h>
#include <sys/stat.h>
#include <iterator>
#include <stdexcept>
#define STRINGIZE_IMPL(s) #s
#define STRINGIZE(s) STRINGIZE_IMPL(s)
@ -55,6 +54,7 @@ namespace {
po::options_description visible("Available options");
visible.add(desc);
std::cout << "Usage: " << PROGRAM_NAME << " [options...] <url> <xpath>\n";
std::cout << "You can pass - as the url to read from stdin\n";
std::cout << visible;
return true;
}
@ -65,14 +65,10 @@ namespace {
}
if (parVarMap.count("input-url") == 0) {
std::cerr << "No input url specified, use --help for help" << std::endl;
//return 2;
return true;
throw std::invalid_argument("No input URL specified");
}
if (parVarMap.count("xpath") == 0) {
std::cerr << "No XPath expression specified, use --help for help" << std::endl;
//return 2;
return true;
throw std::invalid_argument("No XPath expression specified");
}
return false;
}
@ -86,50 +82,20 @@ namespace {
std::ostream* const os = (use_stdout ? &std::cout : ofs.get());
*os << parData;
}
// Return tidied HTML for the scraper, taken either from parUrl or from stdin.
//
// The source is chosen by inspecting stdin: when fstat() succeeds and reports
// a character device, the page is fetched from parUrl through
// duck::getCleanHtml (with both SSL verification flags passed as false).
// Otherwise - including when fstat() fails - the whole of stdin is read and
// passed to duck::cleanHTML.
//
// parVarMap is consulted for the optional "dump-raw" entry; when present, the
// raw (untidied) HTML is handed to dump_string with that entry's value. This
// now happens on the stdin path as well, where the option used to be silently
// ignored.
std::string getCleanHtml (const std::string& parUrl, const po::variables_map& parVarMap) {
	const bool has_dump_raw = (parVarMap.count("dump-raw") != 0);

	struct stat stats;
	// A failed fstat() is treated as "not interactive", i.e. stdin is read.
	//TODO: report the fstat() error instead of falling back silently
	const bool interactive = (fstat(fileno(stdin), &stats) >= 0) && S_ISCHR(stats.st_mode);

	if (interactive) {
		duck::DumpRawFunc dump_raw;
		if (has_dump_raw) {
			dump_raw = std::bind(&dump_string, parVarMap["dump-raw"].as<std::string>(), std::placeholders::_1);
		}
		return duck::getCleanHtml(parUrl, false, false, dump_raw);
	}

	// Disable whitespace skipping so the input is copied verbatim.
	std::cin >> std::noskipws;
	std::istream_iterator<char> it(std::cin);
	std::istream_iterator<char> end;
	std::string raw_html(it, end);

	if (has_dump_raw) {
		dump_string(parVarMap["dump-raw"].as<std::string>(), raw_html);
	}

	// Returning the prvalue directly avoids the pessimizing
	// `return std::move(local)` the previous version used.
	return duck::cleanHTML(std::move(raw_html));
}
} //unnamed namespace
int main (int argc, char* argv[]) {
po::variables_map vm;
if (parse_commandline(argc, argv, vm)) {
return 0;
try {
if (parse_commandline(argc, argv, vm)) {
return 0;
}
}
catch (const std::invalid_argument& err) {
std::cerr << err.what() << "\nUse --help for help" << std::endl;
return 2;
}
const auto url = vm["input-url"].as<std::string>();
const auto xpath = vm["xpath"].as<std::string>();
#if !defined(NDEBUG)
@ -137,17 +103,33 @@ int main (int argc, char* argv[]) {
std::cout << "XPath: " << xpath << std::endl;
#endif
const std::string tidyHtml(getCleanHtml(url, vm));
std::string html;
if ("-" != url) {
html = duck::fetch_html(url, DEFAULT_USER_AGENT, false, false);
}
else {
std::cin >> std::noskipws;
std::istream_iterator<char> it(std::cin);
std::istream_iterator<char> end;
html = std::string(it, end);
}
if (vm.count("dump-raw")) {
dump_string(vm["dump-raw"].as<std::string>(), html);
}
html = duck::clean_html(std::move(html));
if (vm.count("dump")) {
dump_string(vm["dump"].as<std::string>(), tidyHtml);
dump_string(vm["dump"].as<std::string>(), html);
}
{
pugi::xml_document doc;
std::istringstream iss(tidyHtml);
std::istringstream iss(html);
pugi::xml_parse_result result(doc.load(iss));
if (not result) {
auto line_col = line_col_from_offset(result.offset, tidyHtml);
auto line_col = line_col_from_offset(result.offset, html);
std::cerr << "Error parsing the source XML at line " <<
line_col.first << " col " << line_col.second << ":\n" <<
result.description() << std::endl;