Don't detect whether stdin is a tty - only read from stdin when the url is "-"
This commit is contained in:
parent
db1311839d
commit
c304ffbbf0
3 changed files with 41 additions and 70 deletions
|
@ -1,5 +1,4 @@
|
|||
#include "htmlretrieve.hpp"
|
||||
#include "duckscraperConfig.h"
|
||||
#include <ciso646>
|
||||
#include <tidy.h>
|
||||
#include <tidybuffio.h>
|
||||
|
@ -38,7 +37,7 @@ namespace duck {
|
|||
}
|
||||
} //unnamed namespace
|
||||
|
||||
std::string cleanHTML (std::string&& html) {
|
||||
std::string clean_html (std::string&& html) {
|
||||
dropScriptTags(html);
|
||||
|
||||
// Initialize a Tidy document
|
||||
|
@ -50,7 +49,8 @@ namespace duck {
|
|||
bool configSuccess = tidyOptSetBool(tidyDoc, TidyXmlOut, yes)
|
||||
&& tidyOptSetBool(tidyDoc, TidyQuiet, yes)
|
||||
&& tidyOptSetBool(tidyDoc, TidyNumEntities, yes)
|
||||
&& tidyOptSetBool(tidyDoc, TidyShowWarnings, no);
|
||||
&& tidyOptSetBool(tidyDoc, TidyShowWarnings, no)
|
||||
&& tidyOptSetBool(tidyDoc, TidyDropPropAttrs, yes);
|
||||
|
||||
int tidyResponseCode = -1;
|
||||
|
||||
|
@ -84,7 +84,8 @@ namespace duck {
|
|||
return tidyResult;
|
||||
}
|
||||
|
||||
std::string getCleanHtml (const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw) {
|
||||
|
||||
std::string fetch_html (const std::string& parSource, std::string parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost) {
|
||||
using curl::curl_easy;
|
||||
using curl::curl_pair;
|
||||
|
||||
|
@ -108,14 +109,6 @@ namespace duck {
|
|||
//return 1;
|
||||
//}
|
||||
|
||||
std::string raw_data(oss.str());
|
||||
if (parDumpRaw) {
|
||||
parDumpRaw(raw_data);
|
||||
}
|
||||
return cleanHTML(std::move(raw_data));
|
||||
}
|
||||
|
||||
std::string getCleanHtml (const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw) {
|
||||
return getCleanHtml(parSource, std::string(DEFAULT_USER_AGENT), parSslVerifyPeer, parSslVerifyHost, parDumpRaw);
|
||||
return oss.str();
|
||||
}
|
||||
} //namespace duck
|
||||
|
|
|
@ -2,14 +2,10 @@
|
|||
#define idC6776D903059465191FFB64FCFD6B86A
|
||||
|
||||
#include <string>
|
||||
#include <functional>
|
||||
|
||||
namespace duck {
|
||||
using DumpRawFunc = std::function<void(const std::string&)>;
|
||||
std::string getCleanHtml ( const std::string& parSource, const std::string& parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw=DumpRawFunc() );
|
||||
std::string getCleanHtml ( const std::string& parSource, bool parSslVerifyPeer, bool parSslVerifyHost, DumpRawFunc parDumpRaw=DumpRawFunc() );
|
||||
|
||||
std::string cleanHTML ( std::string&& html );
|
||||
std::string fetch_html ( const std::string& parSource, std::string parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost );
|
||||
std::string clean_html ( std::string&& html );
|
||||
} //namespace duck
|
||||
|
||||
#endif
|
||||
|
|
84
src/main.cpp
84
src/main.cpp
|
@ -10,9 +10,8 @@
|
|||
#include <boost/program_options.hpp>
|
||||
#include <memory>
|
||||
#include <functional>
|
||||
#include <unistd.h>
|
||||
#include <sys/stat.h>
|
||||
#include <iterator>
|
||||
#include <stdexcept>
|
||||
|
||||
#define STRINGIZE_IMPL(s) #s
|
||||
#define STRINGIZE(s) STRINGIZE_IMPL(s)
|
||||
|
@ -55,6 +54,7 @@ namespace {
|
|||
po::options_description visible("Available options");
|
||||
visible.add(desc);
|
||||
std::cout << "Usage: " << PROGRAM_NAME << " [options...] <url> <xpath>\n";
|
||||
std::cout << "You can pass - as the url to read from stdin\n";
|
||||
std::cout << visible;
|
||||
return true;
|
||||
}
|
||||
|
@ -65,14 +65,10 @@ namespace {
|
|||
}
|
||||
|
||||
if (parVarMap.count("input-url") == 0) {
|
||||
std::cerr << "No input url specified, use --help for help" << std::endl;
|
||||
//return 2;
|
||||
return true;
|
||||
throw std::invalid_argument("No input URL specified");
|
||||
}
|
||||
if (parVarMap.count("xpath") == 0) {
|
||||
std::cerr << "No XPath expression specified, use --help for help" << std::endl;
|
||||
//return 2;
|
||||
return true;
|
||||
throw std::invalid_argument("No XPath expression specified");
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
@ -86,50 +82,20 @@ namespace {
|
|||
std::ostream* const os = (use_stdout ? &std::cout : ofs.get());
|
||||
*os << parData;
|
||||
}
|
||||
|
||||
std::string getCleanHtml (const std::string& parUrl, const po::variables_map& parVarMap) {
|
||||
std::string tidyHtml;
|
||||
struct stat stats;
|
||||
|
||||
const int r = fstat(fileno(stdin), &stats);
|
||||
bool interactive = true;
|
||||
if (r < 0) {
|
||||
//TODO: error
|
||||
interactive = false;
|
||||
}
|
||||
else {
|
||||
interactive = static_cast<bool>(S_ISCHR(stats.st_mode));
|
||||
}
|
||||
|
||||
if (interactive) {
|
||||
tidyHtml = duck::getCleanHtml(
|
||||
parUrl,
|
||||
false,
|
||||
false,
|
||||
(parVarMap.count("dump-raw") ?
|
||||
std::bind(&dump_string, parVarMap["dump-raw"].as<std::string>(), std::placeholders::_1)
|
||||
:
|
||||
duck::DumpRawFunc()
|
||||
)
|
||||
);
|
||||
}
|
||||
else {
|
||||
std::cin >> std::noskipws;
|
||||
std::istream_iterator<char> it(std::cin);
|
||||
std::istream_iterator<char> end;
|
||||
std::string results(it, end);
|
||||
tidyHtml = duck::cleanHTML(std::move(results));
|
||||
}
|
||||
|
||||
return std::move(tidyHtml);
|
||||
}
|
||||
} //unnamed namespace
|
||||
|
||||
int main (int argc, char* argv[]) {
|
||||
po::variables_map vm;
|
||||
if (parse_commandline(argc, argv, vm)) {
|
||||
return 0;
|
||||
try {
|
||||
if (parse_commandline(argc, argv, vm)) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
catch (const std::invalid_argument& err) {
|
||||
std::cerr << err.what() << "\nUse --help for help" << std::endl;
|
||||
return 2;
|
||||
}
|
||||
|
||||
const auto url = vm["input-url"].as<std::string>();
|
||||
const auto xpath = vm["xpath"].as<std::string>();
|
||||
#if !defined(NDEBUG)
|
||||
|
@ -137,17 +103,33 @@ int main (int argc, char* argv[]) {
|
|||
std::cout << "XPath: " << xpath << std::endl;
|
||||
#endif
|
||||
|
||||
const std::string tidyHtml(getCleanHtml(url, vm));
|
||||
std::string html;
|
||||
|
||||
if ("-" != url) {
|
||||
html = duck::fetch_html(url, DEFAULT_USER_AGENT, false, false);
|
||||
}
|
||||
else {
|
||||
std::cin >> std::noskipws;
|
||||
std::istream_iterator<char> it(std::cin);
|
||||
std::istream_iterator<char> end;
|
||||
html = std::string(it, end);
|
||||
}
|
||||
|
||||
if (vm.count("dump-raw")) {
|
||||
dump_string(vm["dump-raw"].as<std::string>(), html);
|
||||
}
|
||||
|
||||
html = duck::clean_html(std::move(html));
|
||||
if (vm.count("dump")) {
|
||||
dump_string(vm["dump"].as<std::string>(), tidyHtml);
|
||||
dump_string(vm["dump"].as<std::string>(), html);
|
||||
}
|
||||
|
||||
{
|
||||
pugi::xml_document doc;
|
||||
std::istringstream iss(tidyHtml);
|
||||
std::istringstream iss(html);
|
||||
pugi::xml_parse_result result(doc.load(iss));
|
||||
if (not result) {
|
||||
auto line_col = line_col_from_offset(result.offset, tidyHtml);
|
||||
auto line_col = line_col_from_offset(result.offset, html);
|
||||
std::cerr << "Error parsing the source XML at line " <<
|
||||
line_col.first << " col " << line_col.second << ":\n" <<
|
||||
result.description() << std::endl;
|
||||
|
|
Loading…
Reference in a new issue