Add --from-code option for users to force the source charset.

This commit is contained in:
King_DuckZ 2020-02-19 17:21:20 +01:00
parent 54ac44b81d
commit 33866b3d6b
6 changed files with 74 additions and 15 deletions

View file

@ -52,6 +52,7 @@ namespace duck {
query_options.add_options() query_options.add_options()
("agent", po::value<std::string>()->default_value(DEFAULT_USER_AGENT), "User agent that will be passed to the server") ("agent", po::value<std::string>()->default_value(DEFAULT_USER_AGENT), "User agent that will be passed to the server")
("model,m", po::value<std::string>(), "Read XPath expressions from the specified file instead of command line") ("model,m", po::value<std::string>(), "Read XPath expressions from the specified file instead of command line")
("from-code,f", po::value<std::string>()->default_value(""), "Force source charset to this, disregard any charset reported by the server")
; ;
po::options_description positional_options("Positional options"); po::options_description positional_options("Positional options");
positional_options.add_options() positional_options.add_options()

View file

@ -25,27 +25,29 @@
#include <iostream> #include <iostream>
namespace duck { namespace duck {
HtmlPool::HtmlPool (std::string&& agent_name) : HtmlPool::HtmlPool (std::string&& agent_name, std::string&& src_charset) :
m_agent(std::move(agent_name)) m_agent(std::move(agent_name)),
m_src_charset(std::move(src_charset))
{ {
} }
auto HtmlPool::OnResourceLoad (ResourceObjectParameterType parRes) -> ResourceType* { HtmlPool::~HtmlPool() noexcept = default;
std::unique_ptr<std::string> html;
auto HtmlPool::OnResourceLoad (ResourceObjectParameterType parRes) -> ResourceType* {
std::cout << "OnResourceLoad(): fetching html from \"" << parRes << "\"\n"; std::cout << "OnResourceLoad(): fetching html from \"" << parRes << "\"\n";
std::unique_ptr<std::string> utf8_html;
if (parRes == "-") { if (parRes == "-") {
html = std::make_unique<std::string>(read_all(std::cin)); utf8_html = std::make_unique<std::string>(read_all(std::cin));
} }
else { else {
html = std::make_unique<std::string>( utf8_html = std::make_unique<std::string>(
fetch_html(parRes, m_agent, false, false) fetch_html(parRes, m_agent, false, false)
); );
} }
*html = duck::clean_html(std::move(*html)); *utf8_html = duck::clean_html(std::move(*utf8_html), (m_src_charset.empty() ? OptString() : OptString(m_src_charset)));
return html.release(); return utf8_html.release();
} }
void HtmlPool::OnResourceDestroy (ResourceNameParamType, ResourceType* parRes) noexcept { void HtmlPool::OnResourceDestroy (ResourceNameParamType, ResourceType* parRes) noexcept {

View file

@ -33,9 +33,11 @@ namespace duck {
virtual ResourceNameType GetResourceNameFromResourceObject (ResourceObjectParameterType parRes); virtual ResourceNameType GetResourceNameFromResourceObject (ResourceObjectParameterType parRes);
std::string m_agent; std::string m_agent;
std::string m_src_charset;
public: public:
explicit HtmlPool (std::string&& agent_name); HtmlPool(std::string&& agent_name, std::string&& src_charset);
~HtmlPool() noexcept;
}; };
} //namespace duck } //namespace duck

View file

@ -1,4 +1,4 @@
/* Copyright (C) 2015 Michele Santullo /* Copyright (C) 2015-2020 Michele Santullo
* *
* This file is part of DuckScraper. * This file is part of DuckScraper.
* *
@ -28,9 +28,49 @@
#include <memory> #include <memory>
#include <cassert> #include <cassert>
#include <utility> #include <utility>
#include <cctype>
#include <iterator>
namespace duck { namespace duck {
namespace { namespace {
// Returns a copy of `in` with every character passed through std::tolower.
// Each character is widened to unsigned char first, because calling
// std::tolower with a negative char value is undefined behaviour.
std::string make_lowercase (std::string_view in) {
	std::string lowered(in);
	for (char& ch : lowered) {
		ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
	}
	return lowered;
}
// Maps a user-supplied charset name onto the matching Tidy encoding id.
//
// The comparison is case-insensitive: `name` is lowercased with
// make_lowercase() and then matched against the table below.
//
// @param name charset name, e.g. "UTF-8" or "windows-1252"
// @return the TidyEncodingOptions value associated with `name`
// @throws std::runtime_error if `name` matches none of the known spellings
TidyEncodingOptions charset_to_enum (std::string_view name) {
	struct CharsetAlias {
		std::string_view name;
		TidyEncodingOptions encoding;
	};

	// Every spelling must be lowercase, since it is compared against the
	// lowercased input. Besides the names that were accepted historically,
	// the table lists the registered IANA names and Tidy's own short option
	// names for the same encodings (latin0 is ISO-8859-15, latin1 is
	// ISO-8859-1), so those inputs are no longer rejected.
	static const CharsetAlias known_charsets[] = {
		{"ascii", TidyEncAscii},
		{"us-ascii", TidyEncAscii},
		{"iso-8859-15", TidyEncLatin0},
		{"latin0", TidyEncLatin0},
		{"iso-8859-1", TidyEncLatin1},
		{"latin1", TidyEncLatin1},
		{"utf-8", TidyEncUtf8},
		{"utf8", TidyEncUtf8},
#ifndef NO_NATIVE_ISO2022_SUPPORT
		{"iso-2022-cn", TidyEncIso2022},
#endif
		{"mac", TidyEncMac},
		{"macintosh", TidyEncMac},
		{"windows-1252", TidyEncWin1252},
		{"ibm858", TidyEncIbm858},
		{"utf-16le", TidyEncUtf16le},
		{"utf-16be", TidyEncUtf16be},
		{"utf-16", TidyEncUtf16},
		{"big-5", TidyEncBig5},
		{"big5", TidyEncBig5},
		{"shift-jis", TidyEncShiftjis},
		{"shift_jis", TidyEncShiftjis}
	};

	const std::string lower_name = make_lowercase(name);
	for (const CharsetAlias& alias : known_charsets) {
		if (lower_name == alias.name)
			return alias.encoding;
	}

	// Report the name exactly as the user typed it, not the lowercased copy.
	throw std::runtime_error("Failed to recognise \"" + std::string(name) + "\" as a valid source charset");
}
bool isHttps (const std::string_view& parUrl) { bool isHttps (const std::string_view& parUrl) {
const char protocol[] = "https://"; const char protocol[] = "https://";
@ -42,7 +82,7 @@ namespace duck {
} }
} //unnamed namespace } //unnamed namespace
std::string clean_html (std::string&& html) { std::string clean_html (std::string&& html, OptString src_charset) {
// Initialize a Tidy document // Initialize a Tidy document
TidyDoc tidyDoc = tidyCreate(); TidyDoc tidyDoc = tidyCreate();
@ -61,8 +101,13 @@ namespace duck {
&& tidyOptSetBool(tidyDoc, TidyPPrintTabs, yes) && tidyOptSetBool(tidyDoc, TidyPPrintTabs, yes)
&& tidyOptSetBool(tidyDoc, TidyLiteralAttribs, no) && tidyOptSetBool(tidyDoc, TidyLiteralAttribs, no)
&& tidyOptSetBool(tidyDoc, TidyPunctWrap, no) && tidyOptSetBool(tidyDoc, TidyPunctWrap, no)
&& tidyOptSetInt(tidyDoc, TidyOutCharEncoding, TidyEncUtf8)
&& tidyOptSetBool(tidyDoc, TidyMetaCharset, yes)
&& tidyOptSetBool(tidyDoc, TidyDropPropAttrs, yes); && tidyOptSetBool(tidyDoc, TidyDropPropAttrs, yes);
if (src_charset)
tidyOptSetInt(tidyDoc, TidyInCharEncoding, charset_to_enum(*src_charset));
int tidyResponseCode = -1; int tidyResponseCode = -1;
// Parse input // Parse input
@ -121,6 +166,7 @@ namespace duck {
//return 1; //return 1;
//} //}
//return FetchedHtml(oss.str(), easy.get_info<CURLINFO_CONTENT_TYPE>().get());
return oss.str(); return oss.str();
} }
} //namespace duck } //namespace duck

View file

@ -21,10 +21,13 @@
#include <string> #include <string>
#include <string_view> #include <string_view>
#include <optional>
namespace duck { namespace duck {
typedef std::optional<std::string_view> OptString;
std::string fetch_html ( const std::string_view& parSource, std::string parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost ); std::string fetch_html ( const std::string_view& parSource, std::string parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost );
std::string clean_html ( std::string&& html ); std::string clean_html ( std::string&& html, OptString src_charset );
} //namespace duck } //namespace duck
#endif #endif

View file

@ -83,10 +83,12 @@ namespace {
const auto& vm = parVarMap; const auto& vm = parVarMap;
const auto url = vm["input-url"].as<std::string>(); const auto url = vm["input-url"].as<std::string>();
duck::HtmlPool html_pool(std::string(parVarMap["agent"].as<std::string>())); duck::HtmlPool html_pool(
std::string(parVarMap["agent"].as<std::string>()),
std::string(parVarMap["from-code"].as<std::string>())
);
const auto in_html_id = html_pool.GetOrAdd(url); const auto in_html_id = html_pool.GetOrAdd(url);
std::string html = *html_pool.GetByID(in_html_id); std::string html = *html_pool.GetByID(in_html_id);
if (vm.count("dump")) { if (vm.count("dump")) {
dump_string(vm["dump"].as<std::string>(), html); dump_string(vm["dump"].as<std::string>(), html);
} }
@ -120,7 +122,10 @@ namespace {
const std::string script = duck::read_all(parVarMap["model"].as<std::string>()); const std::string script = duck::read_all(parVarMap["model"].as<std::string>());
auto ast = duck::sl::parse(script); auto ast = duck::sl::parse(script);
duck::HtmlPool html_pool(std::string(parVarMap["agent"].as<std::string>())); duck::HtmlPool html_pool(
std::string(parVarMap["agent"].as<std::string>()),
std::string(parVarMap["from-code"].as<std::string>())
);
duck::sl::apply(ast, duck::sl::HtmlPoolBaseSP(&html_pool), xpath); duck::sl::apply(ast, duck::sl::HtmlPoolBaseSP(&html_pool), xpath);
//auto list = duck::get_xpath_definitions(*ast); //auto list = duck::get_xpath_definitions(*ast);