From 33866b3d6b69fa50d7c97ef41d2b8fb8d1116f11 Mon Sep 17 00:00:00 2001 From: King_DuckZ Date: Wed, 19 Feb 2020 17:21:20 +0100 Subject: [PATCH] Add --from-code option for users to force the source charset. --- src/commandline.cpp | 1 + src/html_pool.cpp | 18 +++++++++------- src/html_pool.hpp | 4 +++- src/htmlretrieve.cpp | 50 ++++++++++++++++++++++++++++++++++++++++++-- src/htmlretrieve.hpp | 5 ++++- src/main.cpp | 11 +++++++--- 6 files changed, 74 insertions(+), 15 deletions(-) diff --git a/src/commandline.cpp b/src/commandline.cpp index e25ee59..a86d166 100644 --- a/src/commandline.cpp +++ b/src/commandline.cpp @@ -52,6 +52,7 @@ namespace duck { query_options.add_options() ("agent", po::value()->default_value(DEFAULT_USER_AGENT), "User agent that will be passed to the server") ("model,m", po::value(), "Read XPath expressions from the specified file instead of command line") + ("from-code,f", po::value()->default_value(""), "Force source charset to this, disregard any charset reported by the server") ; po::options_description positional_options("Positional options"); positional_options.add_options() diff --git a/src/html_pool.cpp b/src/html_pool.cpp index 8f43901..a8031f7 100644 --- a/src/html_pool.cpp +++ b/src/html_pool.cpp @@ -25,27 +25,29 @@ #include namespace duck { - HtmlPool::HtmlPool (std::string&& agent_name) : - m_agent(std::move(agent_name)) + HtmlPool::HtmlPool (std::string&& agent_name, std::string&& src_charset) : + m_agent(std::move(agent_name)), + m_src_charset(std::move(src_charset)) { } - auto HtmlPool::OnResourceLoad (ResourceObjectParameterType parRes) -> ResourceType* { - std::unique_ptr html; + HtmlPool::~HtmlPool() noexcept = default; + auto HtmlPool::OnResourceLoad (ResourceObjectParameterType parRes) -> ResourceType* { std::cout << "OnResourceLoad(): fetching html from \"" << parRes << "\"\n"; + std::unique_ptr utf8_html; if (parRes == "-") { - html = std::make_unique(read_all(std::cin)); + utf8_html = std::make_unique(read_all(std::cin)); } else { - html = std::make_unique( + utf8_html = std::make_unique( fetch_html(parRes, m_agent, false, false) ); } - *html = duck::clean_html(std::move(*html)); - return html.release(); + *utf8_html = duck::clean_html(std::move(*utf8_html), (m_src_charset.empty() ? OptString() : OptString(m_src_charset))); + return utf8_html.release(); } void HtmlPool::OnResourceDestroy (ResourceNameParamType, ResourceType* parRes) noexcept { diff --git a/src/html_pool.hpp b/src/html_pool.hpp index d08d860..ea95fd3 100644 --- a/src/html_pool.hpp +++ b/src/html_pool.hpp @@ -33,9 +33,11 @@ namespace duck { virtual ResourceNameType GetResourceNameFromResourceObject (ResourceObjectParameterType parRes); std::string m_agent; + std::string m_src_charset; public: - explicit HtmlPool (std::string&& agent_name); + HtmlPool(std::string&& agent_name, std::string&& src_charset); + ~HtmlPool() noexcept; }; } //namespace duck diff --git a/src/htmlretrieve.cpp b/src/htmlretrieve.cpp index 6a7cf6c..bd17eb0 100644 --- a/src/htmlretrieve.cpp +++ b/src/htmlretrieve.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2015 Michele Santullo +/* Copyright (C) 2015-2020 Michele Santullo * * This file is part of DuckScraper. * @@ -28,9 +28,49 @@ #include #include #include +#include +#include namespace duck { namespace { + std::string make_lowercase (std::string_view in) { + std::string out; + std::transform(in.begin(), in.end(), std::back_inserter(out), [](unsigned char c){return std::tolower(c);}); + return out; + } + + TidyEncodingOptions charset_to_enum (std::string_view name) { + const std::string lower_name = make_lowercase(name); + if (lower_name == "ascii") + return TidyEncAscii; + //else if (lower_name == "???") + // return TidyEncLatin0; + //else if (lower_name == "???") + // return TidyEncLatin1; + else if (lower_name == "utf-8") + return TidyEncUtf8; +#ifndef NO_NATIVE_ISO2022_SUPPORT + else if (lower_name == "iso-2022-cn") + return TidyEncIso2022; +#endif + else if (lower_name == "mac") + return TidyEncMac; + else if (lower_name == "windows-1252") + return TidyEncWin1252; + else if (lower_name == "ibm858") + return TidyEncIbm858; + else if (lower_name == "utf-16le") + return TidyEncUtf16le; + else if (lower_name == "utf-16be") + return TidyEncUtf16be; + else if (lower_name == "utf-16") + return TidyEncUtf16; + else if (lower_name == "big-5") + return TidyEncBig5; + else if (lower_name == "shift-jis" or lower_name == "shift_jis") + return TidyEncShiftjis; + throw std::runtime_error("Failed to recognise \"" + std::string(name) + "\" as a valid source charset"); + } bool isHttps (const std::string_view& parUrl) { const char protocol[] = "https://"; @@ -42,7 +82,7 @@ namespace duck { } } //unnamed namespace - std::string clean_html (std::string&& html) { + std::string clean_html (std::string&& html, OptString src_charset) { // Initialize a Tidy document TidyDoc tidyDoc = tidyCreate(); @@ -61,8 +101,13 @@ namespace duck { && tidyOptSetBool(tidyDoc, TidyPPrintTabs, yes) && tidyOptSetBool(tidyDoc, TidyLiteralAttribs, no) && tidyOptSetBool(tidyDoc, TidyPunctWrap, no) + && tidyOptSetInt(tidyDoc, TidyOutCharEncoding, TidyEncUtf8) + && tidyOptSetBool(tidyDoc, TidyMetaCharset, yes) && tidyOptSetBool(tidyDoc, TidyDropPropAttrs, yes); + if (src_charset) + tidyOptSetInt(tidyDoc, TidyInCharEncoding, charset_to_enum(*src_charset)); + int tidyResponseCode = -1; // Parse input @@ -121,6 +166,7 @@ namespace duck { //return 1; //} + //return FetchedHtml(oss.str(), easy.get_info().get()); return oss.str(); } } //namespace duck diff --git a/src/htmlretrieve.hpp b/src/htmlretrieve.hpp index 578585a..881747f 100644 --- a/src/htmlretrieve.hpp +++ b/src/htmlretrieve.hpp @@ -21,10 +21,13 @@ #include #include +#include namespace duck { + typedef std::optional OptString; + std::string fetch_html ( const std::string_view& parSource, std::string parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost ); - std::string clean_html ( std::string&& html ); + std::string clean_html ( std::string&& html, OptString src_charset ); } //namespace duck #endif diff --git a/src/main.cpp b/src/main.cpp index 04c9aec..03b2577 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -83,10 +83,12 @@ namespace { const auto& vm = parVarMap; const auto url = vm["input-url"].as(); - duck::HtmlPool html_pool(std::string(parVarMap["agent"].as())); + duck::HtmlPool html_pool( + std::string(parVarMap["agent"].as()), + std::string(parVarMap["from-code"].as()) + ); const auto in_html_id = html_pool.GetOrAdd(url); std::string html = *html_pool.GetByID(in_html_id); - if (vm.count("dump")) { dump_string(vm["dump"].as(), html); } @@ -120,7 +122,10 @@ namespace { const std::string script = duck::read_all(parVarMap["model"].as()); auto ast = duck::sl::parse(script); - duck::HtmlPool html_pool(std::string(parVarMap["agent"].as())); + duck::HtmlPool html_pool( + std::string(parVarMap["agent"].as()), + std::string(parVarMap["from-code"].as()) + ); duck::sl::apply(ast, duck::sl::HtmlPoolBaseSP(&html_pool), xpath); //auto list = duck::get_xpath_definitions(*ast);