Don't detect whether stdin is a tty - only read from stdin when the url is "-"

This commit is contained in:
King_DuckZ 2015-09-29 21:01:33 +02:00
parent db1311839d
commit c304ffbbf0
3 changed files with 41 additions and 70 deletions

View file

@ -1,5 +1,4 @@
#include "htmlretrieve.hpp" #include "htmlretrieve.hpp"
#include "duckscraperConfig.h"
#include <ciso646> #include <ciso646>
#include <tidy.h> #include <tidy.h>
#include <tidybuffio.h> #include <tidybuffio.h>
@ -38,7 +37,7 @@ namespace duck {
} }
} //unnamed namespace } //unnamed namespace
std::string cleanHTML (std::string&& html) { std::string clean_html (std::string&& html) {
dropScriptTags(html); dropScriptTags(html);
// Initialize a Tidy document // Initialize a Tidy document
@ -50,7 +49,8 @@ namespace duck {
bool configSuccess = tidyOptSetBool(tidyDoc, TidyXmlOut, yes) bool configSuccess = tidyOptSetBool(tidyDoc, TidyXmlOut, yes)
&& tidyOptSetBool(tidyDoc, TidyQuiet, yes) && tidyOptSetBool(tidyDoc, TidyQuiet, yes)
&& tidyOptSetBool(tidyDoc, TidyNumEntities, yes) && tidyOptSetBool(tidyDoc, TidyNumEntities, yes)
&& tidyOptSetBool(tidyDoc, TidyShowWarnings, no); && tidyOptSetBool(tidyDoc, TidyShowWarnings, no)
&& tidyOptSetBool(tidyDoc, TidyDropPropAttrs, yes);
int tidyResponseCode = -1; int tidyResponseCode = -1;
@ -84,7 +84,8 @@ namespace duck {
return tidyResult; return tidyResult;
} }
std::string getCleanHtml (const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw) {
std::string fetch_html (const std::string& parSource, std::string parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost) {
using curl::curl_easy; using curl::curl_easy;
using curl::curl_pair; using curl::curl_pair;
@ -108,14 +109,6 @@ namespace duck {
//return 1; //return 1;
//} //}
std::string raw_data(oss.str()); return oss.str();
if (parDumpRaw) {
parDumpRaw(raw_data);
}
return cleanHTML(std::move(raw_data));
}
std::string getCleanHtml (const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw) {
return getCleanHtml(parSource, std::string(DEFAULT_USER_AGENT), parSslVerifyPeer, parSslVerifyHost, parDumpRaw);
} }
} //namespace duck } //namespace duck

View file

@ -2,14 +2,10 @@
#define idC6776D903059465191FFB64FCFD6B86A #define idC6776D903059465191FFB64FCFD6B86A
#include <string> #include <string>
#include <functional>
namespace duck { namespace duck {
using DumpRawFunc = std::function<void(const std::string&)>; std::string fetch_html ( const std::string& parSource, std::string parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost );
std::string getCleanHtml ( const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw=DumpRawFunc() ); std::string clean_html ( std::string&& html );
std::string getCleanHtml ( const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw=DumpRawFunc() );
std::string cleanHTML ( std::string&& html );
} //namespace duck } //namespace duck
#endif #endif

View file

@ -10,9 +10,8 @@
#include <boost/program_options.hpp> #include <boost/program_options.hpp>
#include <memory> #include <memory>
#include <functional> #include <functional>
#include <unistd.h>
#include <sys/stat.h>
#include <iterator> #include <iterator>
#include <stdexcept>
#define STRINGIZE_IMPL(s) #s #define STRINGIZE_IMPL(s) #s
#define STRINGIZE(s) STRINGIZE_IMPL(s) #define STRINGIZE(s) STRINGIZE_IMPL(s)
@ -55,6 +54,7 @@ namespace {
po::options_description visible("Available options"); po::options_description visible("Available options");
visible.add(desc); visible.add(desc);
std::cout << "Usage: " << PROGRAM_NAME << " [options...] <url> <xpath>\n"; std::cout << "Usage: " << PROGRAM_NAME << " [options...] <url> <xpath>\n";
std::cout << "You can pass - as the url to read from stdin\n";
std::cout << visible; std::cout << visible;
return true; return true;
} }
@ -65,14 +65,10 @@ namespace {
} }
if (parVarMap.count("input-url") == 0) { if (parVarMap.count("input-url") == 0) {
std::cerr << "No input url specified, use --help for help" << std::endl; throw std::invalid_argument("No input URL specified");
//return 2;
return true;
} }
if (parVarMap.count("xpath") == 0) { if (parVarMap.count("xpath") == 0) {
std::cerr << "No XPath expression specified, use --help for help" << std::endl; throw std::invalid_argument("No XPath expression specified");
//return 2;
return true;
} }
return false; return false;
} }
@ -86,50 +82,20 @@ namespace {
std::ostream* const os = (use_stdout ? &std::cout : ofs.get()); std::ostream* const os = (use_stdout ? &std::cout : ofs.get());
*os << parData; *os << parData;
} }
// Returns the tidied HTML for this run.
//
// parUrl    - source passed on to duck::getCleanHtml when stdin is a
//             character device (e.g. a terminal).
// parVarMap - parsed command line; only the optional "dump-raw" entry is read
//             here. When present, its string value is passed to dump_string
//             together with the raw (not yet tidied) input.
//
// When stdin is not a character device, or its type cannot be determined,
// the whole of stdin is read and tidied instead and parUrl is ignored.
std::string getCleanHtml (const std::string& parUrl, const po::variables_map& parVarMap) {
    const bool wants_raw_dump = (parVarMap.count("dump-raw") != 0);

    // A failing fstat leaves the kind of stdin unknown; in that case we keep
    // treating it as piped input rather than aborting.
    //TODO: report the fstat error
    struct stat stats;
    const bool interactive =
        (fstat(fileno(stdin), &stats) >= 0) && static_cast<bool>(S_ISCHR(stats.st_mode));

    if (interactive) {
        // An empty DumpRawFunc means "do not dump".
        duck::DumpRawFunc dump_raw;
        if (wants_raw_dump) {
            dump_raw = std::bind(&dump_string, parVarMap["dump-raw"].as<std::string>(), std::placeholders::_1);
        }
        // The two `false` arguments are parSslVerifyPeer and parSslVerifyHost.
        return duck::getCleanHtml(parUrl, false, false, std::move(dump_raw));
    }

    // Slurp stdin byte-for-byte. istreambuf_iterator does unformatted reads,
    // so no whitespace is skipped and std::cin's format flags are untouched.
    std::string raw_input((std::istreambuf_iterator<char>(std::cin)), std::istreambuf_iterator<char>());

    // Honour --dump-raw for piped input as well, before the data is moved away.
    if (wants_raw_dump) {
        dump_string(parVarMap["dump-raw"].as<std::string>(), raw_input);
    }

    // Returning the call result directly avoids the std::move on a local,
    // which would have prevented copy elision.
    return duck::cleanHTML(std::move(raw_input));
}
} //unnamed namespace } //unnamed namespace
int main (int argc, char* argv[]) { int main (int argc, char* argv[]) {
po::variables_map vm; po::variables_map vm;
if (parse_commandline(argc, argv, vm)) { try {
return 0; if (parse_commandline(argc, argv, vm)) {
return 0;
}
} }
catch (const std::invalid_argument& err) {
std::cerr << err.what() << "\nUse --help for help" << std::endl;
return 2;
}
const auto url = vm["input-url"].as<std::string>(); const auto url = vm["input-url"].as<std::string>();
const auto xpath = vm["xpath"].as<std::string>(); const auto xpath = vm["xpath"].as<std::string>();
#if !defined(NDEBUG) #if !defined(NDEBUG)
@ -137,17 +103,33 @@ int main (int argc, char* argv[]) {
std::cout << "XPath: " << xpath << std::endl; std::cout << "XPath: " << xpath << std::endl;
#endif #endif
const std::string tidyHtml(getCleanHtml(url, vm)); std::string html;
if ("-" != url) {
html = duck::fetch_html(url, DEFAULT_USER_AGENT, false, false);
}
else {
std::cin >> std::noskipws;
std::istream_iterator<char> it(std::cin);
std::istream_iterator<char> end;
html = std::string(it, end);
}
if (vm.count("dump-raw")) {
dump_string(vm["dump-raw"].as<std::string>(), html);
}
html = duck::clean_html(std::move(html));
if (vm.count("dump")) { if (vm.count("dump")) {
dump_string(vm["dump"].as<std::string>(), tidyHtml); dump_string(vm["dump"].as<std::string>(), html);
} }
{ {
pugi::xml_document doc; pugi::xml_document doc;
std::istringstream iss(tidyHtml); std::istringstream iss(html);
pugi::xml_parse_result result(doc.load(iss)); pugi::xml_parse_result result(doc.load(iss));
if (not result) { if (not result) {
auto line_col = line_col_from_offset(result.offset, tidyHtml); auto line_col = line_col_from_offset(result.offset, html);
std::cerr << "Error parsing the source XML at line " << std::cerr << "Error parsing the source XML at line " <<
line_col.first << " col " << line_col.second << ":\n" << line_col.first << " col " << line_col.second << ":\n" <<
result.description() << std::endl; result.description() << std::endl;