Add --from-code option for users to force the source charset.

This commit is contained in:
King_DuckZ 2020-02-19 17:21:20 +01:00
parent 54ac44b81d
commit 33866b3d6b
6 changed files with 74 additions and 15 deletions

View file

@ -52,6 +52,7 @@ namespace duck {
query_options.add_options()
("agent", po::value<std::string>()->default_value(DEFAULT_USER_AGENT), "User agent that will be passed to the server")
("model,m", po::value<std::string>(), "Read XPath expressions from the specified file instead of command line")
("from-code,f", po::value<std::string>()->default_value(""), "Force source charset to this, disregard any charset reported by the server")
;
po::options_description positional_options("Positional options");
positional_options.add_options()

View file

@ -25,27 +25,29 @@
#include <iostream>
namespace duck {
HtmlPool::HtmlPool (std::string&& agent_name) :
m_agent(std::move(agent_name))
HtmlPool::HtmlPool (std::string&& agent_name, std::string&& src_charset) :
m_agent(std::move(agent_name)),
m_src_charset(std::move(src_charset))
{
}
auto HtmlPool::OnResourceLoad (ResourceObjectParameterType parRes) -> ResourceType* {
std::unique_ptr<std::string> html;
HtmlPool::~HtmlPool() noexcept = default;
auto HtmlPool::OnResourceLoad (ResourceObjectParameterType parRes) -> ResourceType* {
std::cout << "OnResourceLoad(): fetching html from \"" << parRes << "\"\n";
std::unique_ptr<std::string> utf8_html;
if (parRes == "-") {
html = std::make_unique<std::string>(read_all(std::cin));
utf8_html = std::make_unique<std::string>(read_all(std::cin));
}
else {
html = std::make_unique<std::string>(
utf8_html = std::make_unique<std::string>(
fetch_html(parRes, m_agent, false, false)
);
}
*html = duck::clean_html(std::move(*html));
return html.release();
*utf8_html = duck::clean_html(std::move(*utf8_html), (m_src_charset.empty() ? OptString() : OptString(m_src_charset)));
return utf8_html.release();
}
void HtmlPool::OnResourceDestroy (ResourceNameParamType, ResourceType* parRes) noexcept {

View file

@ -33,9 +33,11 @@ namespace duck {
virtual ResourceNameType GetResourceNameFromResourceObject (ResourceObjectParameterType parRes);
std::string m_agent;
std::string m_src_charset;
public:
explicit HtmlPool (std::string&& agent_name);
HtmlPool(std::string&& agent_name, std::string&& src_charset);
~HtmlPool() noexcept;
};
} //namespace duck

View file

@ -1,4 +1,4 @@
/* Copyright (C) 2015 Michele Santullo
/* Copyright (C) 2015-2020 Michele Santullo
*
* This file is part of DuckScraper.
*
@ -28,9 +28,49 @@
#include <memory>
#include <cassert>
#include <utility>
#include <cctype>
#include <iterator>
namespace duck {
namespace {
// Returns a copy of `in` in which every character has been passed through
// std::tolower. Each char is first converted to unsigned char, because
// std::tolower has undefined behaviour for negative values other than EOF.
std::string make_lowercase (std::string_view in) {
	std::string lowered;
	for (const unsigned char ch : in) {
		lowered.push_back(static_cast<char>(std::tolower(ch)));
	}
	return lowered;
}
// Translates a user-supplied charset name (e.g. the value of --from-code)
// into the matching TidyEncodingOptions value.
//
// The comparison is case-insensitive: `name` is lowercased through
// make_lowercase() before being matched. Besides the names accepted
// historically ("ascii", "utf-8", "iso-2022-cn", "mac", "windows-1252",
// "ibm858", "utf-16le", "utf-16be", "utf-16", "big-5", "shift-jis",
// "shift_jis"), common alternative spellings are recognised so that users
// can pass the names they would normally find in an HTTP header or give to
// iconv.
//
// Throws std::runtime_error if `name` does not match any known charset.
TidyEncodingOptions charset_to_enum (std::string_view name) {
	const std::string lower_name = make_lowercase(name);

	if (lower_name == "ascii" or lower_name == "us-ascii")
		return TidyEncAscii;
	// Tidy's "latin0" encoding is ISO-8859-15 (also known as Latin-9).
	if (lower_name == "iso-8859-15" or lower_name == "latin0" or lower_name == "latin9")
		return TidyEncLatin0;
	// Tidy's "latin1" encoding is ISO-8859-1.
	if (lower_name == "iso-8859-1" or lower_name == "latin1")
		return TidyEncLatin1;
	if (lower_name == "utf-8" or lower_name == "utf8")
		return TidyEncUtf8;
#ifndef NO_NATIVE_ISO2022_SUPPORT
	// Only available when Tidy was built with native ISO-2022 support.
	if (lower_name == "iso-2022-cn")
		return TidyEncIso2022;
#endif
	if (lower_name == "mac" or lower_name == "macintosh")
		return TidyEncMac;
	if (lower_name == "windows-1252" or lower_name == "cp1252")
		return TidyEncWin1252;
	if (lower_name == "ibm858")
		return TidyEncIbm858;
	if (lower_name == "utf-16le")
		return TidyEncUtf16le;
	if (lower_name == "utf-16be")
		return TidyEncUtf16be;
	if (lower_name == "utf-16")
		return TidyEncUtf16;
	// "big5" is the registered name; "big-5" is kept for backward compatibility.
	if (lower_name == "big5" or lower_name == "big-5")
		return TidyEncBig5;
	if (lower_name == "shift-jis" or lower_name == "shift_jis")
		return TidyEncShiftjis;

	// Report the name exactly as the user typed it, not the lowercased copy.
	throw std::runtime_error("Failed to recognise \"" + std::string(name) + "\" as a valid source charset");
}
bool isHttps (const std::string_view& parUrl) {
const char protocol[] = "https://";
@ -42,7 +82,7 @@ namespace duck {
}
} //unnamed namespace
std::string clean_html (std::string&& html) {
std::string clean_html (std::string&& html, OptString src_charset) {
// Initialize a Tidy document
TidyDoc tidyDoc = tidyCreate();
@ -61,8 +101,13 @@ namespace duck {
&& tidyOptSetBool(tidyDoc, TidyPPrintTabs, yes)
&& tidyOptSetBool(tidyDoc, TidyLiteralAttribs, no)
&& tidyOptSetBool(tidyDoc, TidyPunctWrap, no)
&& tidyOptSetInt(tidyDoc, TidyOutCharEncoding, TidyEncUtf8)
&& tidyOptSetBool(tidyDoc, TidyMetaCharset, yes)
&& tidyOptSetBool(tidyDoc, TidyDropPropAttrs, yes);
if (src_charset)
tidyOptSetInt(tidyDoc, TidyInCharEncoding, charset_to_enum(*src_charset));
int tidyResponseCode = -1;
// Parse input
@ -121,6 +166,7 @@ namespace duck {
//return 1;
//}
//return FetchedHtml(oss.str(), easy.get_info<CURLINFO_CONTENT_TYPE>().get());
return oss.str();
}
} //namespace duck

View file

@ -21,10 +21,13 @@
#include <string>
#include <string_view>
#include <optional>
namespace duck {
typedef std::optional<std::string_view> OptString;
std::string fetch_html ( const std::string_view& parSource, std::string parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost );
std::string clean_html ( std::string&& html );
std::string clean_html ( std::string&& html, OptString src_charset );
} //namespace duck
#endif

View file

@ -83,10 +83,12 @@ namespace {
const auto& vm = parVarMap;
const auto url = vm["input-url"].as<std::string>();
duck::HtmlPool html_pool(std::string(parVarMap["agent"].as<std::string>()));
duck::HtmlPool html_pool(
std::string(parVarMap["agent"].as<std::string>()),
std::string(parVarMap["from-code"].as<std::string>())
);
const auto in_html_id = html_pool.GetOrAdd(url);
std::string html = *html_pool.GetByID(in_html_id);
if (vm.count("dump")) {
dump_string(vm["dump"].as<std::string>(), html);
}
@ -120,7 +122,10 @@ namespace {
const std::string script = duck::read_all(parVarMap["model"].as<std::string>());
auto ast = duck::sl::parse(script);
duck::HtmlPool html_pool(std::string(parVarMap["agent"].as<std::string>()));
duck::HtmlPool html_pool(
std::string(parVarMap["agent"].as<std::string>()),
std::string(parVarMap["from-code"].as<std::string>())
);
duck::sl::apply(ast, duck::sl::HtmlPoolBaseSP(&html_pool), xpath);
//auto list = duck::get_xpath_definitions(*ast);