From fa08abd00d79d86123b122c078be9c8a1132912d Mon Sep 17 00:00:00 2001 From: King_DuckZ Date: Thu, 2 Apr 2020 19:50:28 +0200 Subject: [PATCH] Use iconv for converting html *before* passing input to tidyhtml. Tidyhtml seems to be unable to convert from iso-8859-1 and I suspect there will be many more failures in the future. So instead just make sure all input to it is utf-8 and tell tidy to assume its input is always utf-8. --- CMakeLists.txt | 2 + src/html_pool.cpp | 5 +- src/html_pool.hpp | 2 + src/iconv_wrapper.cpp | 143 ++++++++++++++++++++++++++++++++++++++++++ src/iconv_wrapper.hpp | 87 +++++++++++++++++++++++++ 5 files changed, 237 insertions(+), 2 deletions(-) create mode 100644 src/iconv_wrapper.cpp create mode 100644 src/iconv_wrapper.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 8f9717f..fa8ba88 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,6 +7,7 @@ option(BUILD_SHARED_TIDY "Wheter you want to build tidy-html5 as a shared librar include(GetGitRevisionDescription) find_package(Boost 1.32.0 COMPONENTS program_options) find_package(XQilla 2.3.3 REQUIRED) +find_package(Iconv REQUIRED) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) @@ -34,6 +35,7 @@ add_executable(${PROJECT_NAME} src/scraplang/xpath_runner.cpp src/xpath.cpp src/read_all.cpp + src/iconv_wrapper.cpp ) target_include_directories(${PROJECT_NAME} SYSTEM diff --git a/src/html_pool.cpp b/src/html_pool.cpp index a8031f7..8f24400 100644 --- a/src/html_pool.cpp +++ b/src/html_pool.cpp @@ -27,7 +27,8 @@ namespace duck { HtmlPool::HtmlPool (std::string&& agent_name, std::string&& src_charset) : m_agent(std::move(agent_name)), - m_src_charset(std::move(src_charset)) + m_src_charset(std::move(src_charset)), + m_iconv(m_src_charset, "utf-8", IconvWrapper::ModeIgnore) { } @@ -46,7 +47,7 @@ namespace duck { ); } - *utf8_html = duck::clean_html(std::move(*utf8_html), (m_src_charset.empty() ? 
OptString() : OptString(m_src_charset))); + *utf8_html = duck::clean_html(m_iconv.conv_char(*utf8_html), "utf-8"); return utf8_html.release(); } diff --git a/src/html_pool.hpp b/src/html_pool.hpp index ea95fd3..2b7acca 100644 --- a/src/html_pool.hpp +++ b/src/html_pool.hpp @@ -20,6 +20,7 @@ #define idCDCACC393BE24CBD94A3B5E2985984A3 #include "scraplang/html_pool_base.hpp" +#include "iconv_wrapper.hpp" namespace duck { class HtmlPool : public ::duck::sl::HtmlPoolBase { @@ -34,6 +35,7 @@ namespace duck { std::string m_agent; std::string m_src_charset; + IconvWrapper m_iconv; public: HtmlPool(std::string&& agent_name, std::string&& src_charset); diff --git a/src/iconv_wrapper.cpp b/src/iconv_wrapper.cpp new file mode 100644 index 0000000..9c149fb --- /dev/null +++ b/src/iconv_wrapper.cpp @@ -0,0 +1,143 @@ +/* Copyright (C) 2015 Michele Santullo + * + * This file is part of DuckScraper. + * + * DuckScraper is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * DuckScraper is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
+ */ + +#include "iconv_wrapper.hpp" +#include +#include +#include + +namespace duck { + namespace { + class IconvDeleter { + public: + typedef iconv_t pointer; + + void operator() (pointer& resource) { + if (resource and reinterpret_cast(-1) != resource) { + iconv_close(resource); + resource = static_cast(0); + } + } + }; + } //unnamed namespace + + typedef std::unique_ptr UniqueIconv; + + IconvBadSequence::IconvBadSequence (const std::string& message) : + std::domain_error(message) + { + } + + IconvOpenFailure::IconvOpenFailure (const std::string& message) : + std::logic_error(message) + { + } + + struct IconvWrapper::LocalData { + UniqueIconv context; + }; + + IconvWrapper::IconvWrapper(const std::string& from, std::string to, Mode mode) : + m_local(std::make_unique()) + { + switch (mode) { + case ModeIgnore: + to += "//IGNORE"; + break; + case ModeTransliterate: + to += "//TRANSLIT"; + break; + case ModeDefault: + default: + break; + } + + m_local->context.reset(iconv_open(to.c_str(), from.c_str())); + if (reinterpret_cast(-1) == m_local->context.get()) { + auto msg = std::string("Failed to create an iconv context for \"") + + from + "\" to \"" + to + "\" conversion (error code " + + std::to_string(errno); + if (EINVAL == errno) + msg += " EINVAL"; + msg += ")"; + + throw IconvOpenFailure(msg); + } + } + + IconvWrapper::~IconvWrapper() noexcept = default; + + void IconvWrapper::conv ( + const char* buff, + std::size_t len, + PtrGetterFunc get_ptr, + SizeGetterFunc get_size, + ReallocFunc realloc, + std::size_t grow_hint + ) { + assert(buff); + assert(len); + + const constexpr std::size_t def_inc = 16; + const constexpr std::size_t iconv_err = static_cast(-1); + + std::size_t nchars; + std::size_t inbytesleft = len; + char* inbuff = const_cast(buff); + std::ptrdiff_t out_offset = 0; + std::size_t grow_factor = grow_hint; + std::size_t outbytesleft; + + do { + realloc(std::max(inbytesleft * grow_factor, def_inc) + get_size()); + assert(get_size() > 
static_cast(out_offset)); + outbytesleft = get_size() - out_offset; + char* outbuff = get_ptr() + out_offset; + + const auto old_inbytesleft = inbytesleft; + const auto old_outbytesleft = outbytesleft; + + nchars = ::iconv( + m_local->context.get(), + &inbuff, + &inbytesleft, + &outbuff, + &outbytesleft + ); + if (iconv_err == nchars) { + const auto pos_str = std::to_string(len - inbytesleft); + switch (errno) { + case EILSEQ: + throw IconvBadSequence("Invalid input multibyte sequence at byte " + pos_str); + case EINVAL: + throw IconvBadSequence("Incomplete input multibyte sequence at byte " + pos_str); + } + } + + out_offset = std::distance(get_ptr(), outbuff); + assert(out_offset >= 0); + + const auto in_diff = old_inbytesleft - inbytesleft; + const auto out_diff = old_outbytesleft - outbytesleft; + grow_factor = std::max(1, out_diff / in_diff); + } while (iconv_err == nchars and E2BIG == errno); + + assert(outbytesleft < get_size()); + realloc(get_size() - outbytesleft); + } +} //namespace duck diff --git a/src/iconv_wrapper.hpp b/src/iconv_wrapper.hpp new file mode 100644 index 0000000..9680c7f --- /dev/null +++ b/src/iconv_wrapper.hpp @@ -0,0 +1,87 @@ +/* Copyright (C) 2015 Michele Santullo + * + * This file is part of DuckScraper. + * + * DuckScraper is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * DuckScraper is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
+ */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace duck { + class IconvBadSequence : public std::domain_error { + public: + explicit IconvBadSequence(const std::string& message); + }; + class IconvOpenFailure : public std::logic_error { + public: + explicit IconvOpenFailure(const std::string& message); + }; + + class IconvWrapper { + typedef std::function PtrGetterFunc; + typedef std::function ReallocFunc; + typedef std::function SizeGetterFunc; + public: + enum Mode { + ModeTransliterate, ModeIgnore, ModeDefault + }; + + IconvWrapper (const std::string& from, std::string to, Mode mode=ModeDefault); + ~IconvWrapper() noexcept; + + template + std::basic_string conv (std::basic_string_view text); + + std::string conv_char (std::string_view text) {return conv(text);} + + private: + struct LocalData; + + void conv ( + const char* buff, + std::size_t len, + PtrGetterFunc get_ptr, + SizeGetterFunc get_size, + ReallocFunc realloc, + std::size_t grow_hint + ); + + std::unique_ptr m_local; + }; + + template + std::basic_string IconvWrapper::conv (std::basic_string_view text) { + typedef std::basic_string string; + + if (text.empty()) + return {}; + + string retval; + this->conv( + reinterpret_cast(text.data()), + text.size() * sizeof(CIn), + [&retval](){return reinterpret_cast(retval.data());}, + [&retval]()->std::size_t {return retval.size() * sizeof(COut);}, + [&retval](std::size_t sz){retval.resize((sz + sizeof(COut) - 1) / sizeof(COut));}, + sizeof(COut) / sizeof(CIn) + ); + return retval; + } +} //namespace duck