From c304ffbbf02f0e407adbdf54f299183e480302f7 Mon Sep 17 00:00:00 2001 From: King_DuckZ Date: Tue, 29 Sep 2015 21:01:33 +0200 Subject: [PATCH] Don't detect if it's a tty - only read from stdin when url is - --- src/htmlretrieve.cpp | 19 ++++------ src/htmlretrieve.hpp | 8 ++--- src/main.cpp | 84 +++++++++++++++++--------------------------- 3 files changed, 41 insertions(+), 70 deletions(-) diff --git a/src/htmlretrieve.cpp b/src/htmlretrieve.cpp index fc014ed..7e12a87 100644 --- a/src/htmlretrieve.cpp +++ b/src/htmlretrieve.cpp @@ -1,5 +1,4 @@ #include "htmlretrieve.hpp" -#include "duckscraperConfig.h" #include #include #include @@ -38,7 +37,7 @@ namespace duck { } } //unnamed namespace - std::string cleanHTML (std::string&& html) { + std::string clean_html (std::string&& html) { dropScriptTags(html); // Initialize a Tidy document @@ -50,7 +49,8 @@ namespace duck { bool configSuccess = tidyOptSetBool(tidyDoc, TidyXmlOut, yes) && tidyOptSetBool(tidyDoc, TidyQuiet, yes) && tidyOptSetBool(tidyDoc, TidyNumEntities, yes) - && tidyOptSetBool(tidyDoc, TidyShowWarnings, no); + && tidyOptSetBool(tidyDoc, TidyShowWarnings, no) + && tidyOptSetBool(tidyDoc, TidyDropPropAttrs, yes); int tidyResponseCode = -1; @@ -84,7 +84,8 @@ namespace duck { return tidyResult; } - std::string getCleanHtml (const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw) { + + std::string fetch_html (const std::string& parSource, std::string parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost) { using curl::curl_easy; using curl::curl_pair; @@ -108,14 +109,6 @@ namespace duck { //return 1; //} - std::string raw_data(oss.str()); - if (parDumpRaw) { - parDumpRaw(raw_data); - } - return cleanHTML(std::move(raw_data)); - } - - std::string getCleanHtml (const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw) { - return getCleanHtml(parSource, std::string(DEFAULT_USER_AGENT), parSslVerifyPeer, parSslVerifyHost, parDumpRaw); + return oss.str(); } } //namespace duck diff --git a/src/htmlretrieve.hpp b/src/htmlretrieve.hpp index 6aad5ba..b14810f 100644 --- a/src/htmlretrieve.hpp +++ b/src/htmlretrieve.hpp @@ -2,14 +2,10 @@ #define idC6776D903059465191FFB64FCFD6B86A #include -#include namespace duck { - using DumpRawFunc = std::function; - std::string getCleanHtml ( const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw=DumpRawFunc() ); - std::string getCleanHtml ( const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw=DumpRawFunc() ); - - std::string cleanHTML ( std::string&& html ); + std::string fetch_html ( const std::string& parSource, std::string parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost ); + std::string clean_html ( std::string&& html ); } //namespace duck #endif diff --git a/src/main.cpp b/src/main.cpp index 2f6b028..c0ce648 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -10,9 +10,8 @@ #include #include #include -#include -#include #include +#include #define STRINGIZE_IMPL(s) #s #define STRINGIZE(s) STRINGIZE_IMPL(s) @@ -55,6 +54,7 @@ namespace { po::options_description visible("Available options"); visible.add(desc); std::cout << "Usage: " << PROGRAM_NAME << " [options...] \n"; + std::cout << "You can pass - as the url to read from stdin\n"; std::cout << visible; return true; } @@ -65,14 +65,10 @@ namespace { } if (parVarMap.count("input-url") == 0) { - std::cerr << "No input url specified, use --help for help" << std::endl; - //return 2; - return true; + throw std::invalid_argument("No input URL specified"); } if (parVarMap.count("xpath") == 0) { - std::cerr << "No XPath expression specified, use --help for help" << std::endl; - //return 2; - return true; + throw std::invalid_argument("No XPath expression specified"); } return false; } @@ -86,50 +82,20 @@ namespace { std::ostream* const os = (use_stdout ? &std::cout : ofs.get()); *os << parData; } - - std::string getCleanHtml (const std::string& parUrl, const po::variables_map& parVarMap) { - std::string tidyHtml; - struct stat stats; - - const int r = fstat(fileno(stdin), &stats); - bool interactive = true; - if (r < 0) { - //TODO: error - interactive = false; - } - else { - interactive = static_cast(S_ISCHR(stats.st_mode)); - } - - if (interactive) { - tidyHtml = duck::getCleanHtml( - parUrl, - false, - false, - (parVarMap.count("dump-raw") ? - std::bind(&dump_string, parVarMap["dump-raw"].as(), std::placeholders::_1) - : - duck::DumpRawFunc() - ) - ); - } - else { - std::cin >> std::noskipws; - std::istream_iterator it(std::cin); - std::istream_iterator end; - std::string results(it, end); - tidyHtml = duck::cleanHTML(std::move(results)); - } - - return std::move(tidyHtml); - } } //unnamed namespace int main (int argc, char* argv[]) { po::variables_map vm; - if (parse_commandline(argc, argv, vm)) { - return 0; + try { + if (parse_commandline(argc, argv, vm)) { + return 0; + } } + catch (const std::invalid_argument& err) { + std::cerr << err.what() << "\nUse --help for help" << std::endl; + return 2; + } + const auto url = vm["input-url"].as(); const auto xpath = vm["xpath"].as(); #if !defined(NDEBUG) @@ -137,17 +103,33 @@ int main (int argc, char* argv[]) { std::cout << "XPath: " << xpath << std::endl; #endif - const std::string tidyHtml(getCleanHtml(url, vm)); + std::string html; + + if ("-" != url) { + html = duck::fetch_html(url, DEFAULT_USER_AGENT, false, false); + } + else { + std::cin >> std::noskipws; + std::istream_iterator it(std::cin); + std::istream_iterator end; + html = std::string(it, end); + } + + if (vm.count("dump-raw")) { + dump_string(vm["dump-raw"].as(), html); + } + + html = duck::clean_html(std::move(html)); if (vm.count("dump")) { - dump_string(vm["dump"].as(), tidyHtml); + dump_string(vm["dump"].as(), html); } { pugi::xml_document doc; - std::istringstream iss(tidyHtml); + std::istringstream iss(html); pugi::xml_parse_result result(doc.load(iss)); if (not result) { - auto line_col = line_col_from_offset(result.offset, tidyHtml); + auto line_col = line_col_from_offset(result.offset, html); std::cerr << "Error parsing the source XML at line " << line_col.first << " col " << line_col.second << ":\n" << result.description() << std::endl;