Use iconv for converting html *before* passing input to tidyhtml.

Tidyhtml seems to be unable to convert from iso-8859-1 and I suspect
there will be many more failures in the future. So instead just
make sure all input to it is utf-8 and tell tidy to assume its
input is always utf-8.
This commit is contained in:
King_DuckZ 2020-04-02 19:50:28 +02:00
parent d64a4af105
commit fa08abd00d
5 changed files with 237 additions and 2 deletions

View file

@ -7,6 +7,7 @@ option(BUILD_SHARED_TIDY "Wheter you want to build tidy-html5 as a shared librar
include(GetGitRevisionDescription)
find_package(Boost 1.32.0 COMPONENTS program_options)
find_package(XQilla 2.3.3 REQUIRED)
find_package(Iconv REQUIRED)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
@ -34,6 +35,7 @@ add_executable(${PROJECT_NAME}
src/scraplang/xpath_runner.cpp
src/xpath.cpp
src/read_all.cpp
src/iconv_wrapper.cpp
)
target_include_directories(${PROJECT_NAME} SYSTEM

View file

@ -27,7 +27,8 @@
namespace duck {
HtmlPool::HtmlPool (std::string&& agent_name, std::string&& src_charset) :
m_agent(std::move(agent_name)),
m_src_charset(std::move(src_charset))
m_src_charset(std::move(src_charset)),
m_iconv(m_src_charset, "utf-8", IconvWrapper::ModeIgnore)
{
}
@ -46,7 +47,7 @@ namespace duck {
);
}
*utf8_html = duck::clean_html(std::move(*utf8_html), (m_src_charset.empty() ? OptString() : OptString(m_src_charset)));
*utf8_html = duck::clean_html(m_iconv.conv_char(*utf8_html), "utf-8");
return utf8_html.release();
}

View file

@ -20,6 +20,7 @@
#define idCDCACC393BE24CBD94A3B5E2985984A3
#include "scraplang/html_pool_base.hpp"
#include "iconv_wrapper.hpp"
namespace duck {
class HtmlPool : public ::duck::sl::HtmlPoolBase {
@ -34,6 +35,7 @@ namespace duck {
std::string m_agent;
std::string m_src_charset;
IconvWrapper m_iconv;
public:
HtmlPool(std::string&& agent_name, std::string&& src_charset);

143
src/iconv_wrapper.cpp Normal file
View file

@ -0,0 +1,143 @@
/* Copyright (C) 2015 Michele Santullo
*
* This file is part of DuckScraper.
*
* DuckScraper is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* DuckScraper is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
*/
#include "iconv_wrapper.hpp"
#include <iconv.h>
#include <cassert>
#include <algorithm>
namespace duck {
namespace {
	/**
	 * Deleter that lets std::unique_ptr own an iconv_t conversion descriptor.
	 *
	 * The handle is received by value: std::unique_ptr invokes its deleter
	 * as get_deleter()(get()), that is with an rvalue, which cannot bind to
	 * a non-const lvalue reference parameter.
	 */
	class IconvDeleter {
	public:
		typedef iconv_t pointer;

		void operator() (pointer resource) const noexcept {
			// iconv_open() reports failure with (iconv_t)-1. That value is
			// not null, so unique_ptr would still pass it to the deleter; it
			// must never reach iconv_close().
			if (resource and reinterpret_cast<iconv_t>(-1) != resource) {
				iconv_close(resource);
			}
		}
	};
} //unnamed namespace

typedef std::unique_ptr<iconv_t, IconvDeleter> UniqueIconv;
IconvBadSequence::IconvBadSequence (const std::string& message) :
std::domain_error(message)
{
}
IconvOpenFailure::IconvOpenFailure (const std::string& message) :
std::logic_error(message)
{
}
// Private state of IconvWrapper. It is only forward-declared in
// iconv_wrapper.hpp and defined here, which keeps <iconv.h> out of that header.
struct IconvWrapper::LocalData {
// Owning handle to the iconv conversion descriptor.
UniqueIconv context;
};
/**
 * Opens an iconv conversion descriptor that converts text from the `from`
 * encoding into the `to` encoding.
 *
 * @param from name of the source encoding, passed to iconv_open() unchanged.
 * @param to   name of the destination encoding; a suffix may be appended to
 *             it depending on `mode`.
 * @param mode ModeIgnore appends "//IGNORE" to `to`, ModeTransliterate
 *             appends "//TRANSLIT", any other value leaves `to` untouched.
 * @throws IconvOpenFailure if iconv_open() fails. The message contains both
 *         encoding names and the errno value reported by iconv_open().
 */
IconvWrapper::IconvWrapper(const std::string& from, std::string to, Mode mode) :
	m_local(std::make_unique<LocalData>())
{
	switch (mode) {
	case ModeIgnore:
		to += "//IGNORE";
		break;
	case ModeTransliterate:
		to += "//TRANSLIT";
		break;
	case ModeDefault:
	default:
		break;
	}

	const iconv_t handle = iconv_open(to.c_str(), from.c_str());
	if (reinterpret_cast<iconv_t>(-1) == handle) {
		// Snapshot errno before doing anything else: building the message
		// below allocates memory, and that may overwrite errno.
		const int open_errno = errno;

		auto msg = std::string("Failed to create an iconv context for \"") +
			from + "\" to \"" + to + "\" conversion (error code " +
			std::to_string(open_errno);
		if (EINVAL == open_errno)
			msg += " EINVAL";
		msg += ")";
		throw IconvOpenFailure(msg);
	}

	// Only a successfully opened descriptor is ever stored.
	m_local->context.reset(handle);
}
// Defaulted here rather than in the header: LocalData is only forward-declared
// there, and destroying m_local (std::unique_ptr<LocalData>) needs the complete type.
IconvWrapper::~IconvWrapper() noexcept = default;
/**
 * Converts `len` bytes starting at `buff` with the iconv descriptor owned by
 * this object, writing the result into a caller-managed, growable buffer.
 *
 * The output buffer is accessed only through the three callbacks, so this
 * function does not depend on the container type that owns the bytes.
 *
 * @param buff      input bytes; must not be null.
 * @param len       number of input bytes; must not be zero.
 * @param get_ptr   returns the current start of the output buffer. It is
 *                  queried again after every call to `realloc`.
 * @param get_size  returns the current size of the output buffer, in bytes.
 * @param realloc   resizes the output buffer to the given size, in bytes.
 * @param grow_hint initial estimate of how many output bytes are needed per
 *                  input byte; zero is treated as one.
 * @throws IconvBadSequence if iconv() reports an invalid (EILSEQ) or an
 *         incomplete (EINVAL) multibyte sequence in the input.
 *
 * On return the output buffer has been resized so that it ends right after
 * the last byte written by iconv().
 */
void IconvWrapper::conv (
	const char* buff,
	std::size_t len,
	PtrGetterFunc get_ptr,
	SizeGetterFunc get_size,
	ReallocFunc realloc,
	std::size_t grow_hint
) {
	assert(buff);
	assert(len);

	constexpr std::size_t def_inc = 16;
	constexpr std::size_t iconv_err = static_cast<std::size_t>(-1);

	std::size_t inbytesleft = len;
	// iconv() takes a pointer to non-const input, hence the cast.
	char* inbuff = const_cast<char*>(buff);
	std::ptrdiff_t out_offset = 0;
	// A hint of zero (output unit smaller than the input unit) would make the
	// first allocation fall back to def_inc regardless of the input length.
	std::size_t grow_factor = std::max<std::size_t>(1, grow_hint);
	std::size_t outbytesleft = 0;
	int conv_errno = 0;

	do {
		// Always add room on top of what the buffer already holds.
		realloc(std::max(inbytesleft * grow_factor, def_inc) + get_size());
		assert(get_size() > static_cast<std::size_t>(out_offset));
		outbytesleft = get_size() - out_offset;
		// The buffer may have moved, so rebuild the write position from the
		// offset instead of keeping the old pointer.
		char* outbuff = get_ptr() + out_offset;

		const auto old_inbytesleft = inbytesleft;
		const auto old_outbytesleft = outbytesleft;

		const std::size_t nchars = ::iconv(
			m_local->context.get(),
			&inbuff,
			&inbytesleft,
			&outbuff,
			&outbytesleft
		);
		// Snapshot errno right away: the calls below (string building,
		// callbacks) are allowed to overwrite it.
		conv_errno = (iconv_err == nchars ? errno : 0);

		if (iconv_err == nchars) {
			const auto pos_str = std::to_string(len - inbytesleft);
			switch (conv_errno) {
			case EILSEQ:
				throw IconvBadSequence("Invalid input multibyte sequence at byte " + pos_str);
			case EINVAL:
				throw IconvBadSequence("Incomplete input multibyte sequence at byte " + pos_str);
			default:
				// E2BIG is handled by looping once more with a bigger buffer.
				// NOTE(review): any other errno ends the loop and whatever
				// was converted so far is returned without an error.
				break;
			}
		}

		out_offset = std::distance(get_ptr(), outbuff);
		assert(out_offset >= 0);

		// Adapt the growth estimate to the expansion ratio just observed.
		// Skip it when no input was consumed to avoid dividing by zero.
		const auto in_diff = old_inbytesleft - inbytesleft;
		const auto out_diff = old_outbytesleft - outbytesleft;
		if (in_diff > 0)
			grow_factor = std::max<std::size_t>(1, out_diff / in_diff);
	} while (E2BIG == conv_errno);

	// Drop the unused tail of the output buffer.
	assert(outbytesleft < get_size());
	realloc(get_size() - outbytesleft);
}
} //namespace duck

87
src/iconv_wrapper.hpp Normal file
View file

@ -0,0 +1,87 @@
/* Copyright (C) 2015 Michele Santullo
*
* This file is part of DuckScraper.
*
* DuckScraper is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* DuckScraper is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include <memory>
#include <string>
#include <string_view>
#include <functional>
#include <stdexcept>
namespace duck {
/**
 * Thrown by IconvWrapper's conversion functions when iconv() reports an
 * invalid or incomplete multibyte sequence in the input.
 */
class IconvBadSequence : public std::domain_error {
public:
explicit IconvBadSequence(const std::string& message);
};
/**
 * Thrown by the IconvWrapper constructor when iconv_open() fails to create a
 * conversion descriptor for the requested pair of encodings.
 */
class IconvOpenFailure : public std::logic_error {
public:
explicit IconvOpenFailure(const std::string& message);
};
/**
 * Converts text between two character encodings through iconv.
 *
 * The conversion descriptor is opened by the constructor and kept in a
 * LocalData object that is only forward-declared in this header.
 */
class IconvWrapper {
	// Callbacks through which the private conv() overload drives the
	// caller-owned output buffer.
	using PtrGetterFunc = std::function<char*()>;
	using ReallocFunc = std::function<void(std::size_t)>;
	using SizeGetterFunc = std::function<std::size_t()>;

public:
	// Selects the suffix the constructor appends to the destination encoding
	// name: "//TRANSLIT", "//IGNORE" or nothing at all, respectively.
	enum Mode {
		ModeTransliterate,
		ModeIgnore,
		ModeDefault
	};

	// Opens the descriptor converting from `from` to `to`; throws
	// IconvOpenFailure when that is not possible.
	IconvWrapper (const std::string& from, std::string to, Mode mode=ModeDefault);
	~IconvWrapper() noexcept;

	// Converts `text`, seen as raw bytes, and returns the result as a string
	// of COut units. An empty input gives an empty string.
	template <typename CIn, typename COut>
	std::basic_string<COut> conv (std::basic_string_view<CIn> text);

	// Shorthand for the common char-to-char conversion.
	std::string conv_char (std::string_view text) {
		return conv<char, char>(text);
	}

private:
	struct LocalData;

	// Byte-level implementation shared by every instantiation of the public
	// conv() template.
	void conv (
		const char* buff,
		std::size_t len,
		PtrGetterFunc get_ptr,
		SizeGetterFunc get_size,
		ReallocFunc realloc,
		std::size_t grow_hint
	);

	std::unique_ptr<LocalData> m_local;
};
/**
 * Converts `text` and returns the result as a string of COut units.
 *
 * The input is handed to the byte-level conv() overload as raw bytes; the
 * returned string doubles as the output buffer and is resized, in whole COut
 * units, through the callbacks passed along. Empty input yields an empty
 * string without touching the converter.
 */
template <typename CIn, typename COut>
std::basic_string<COut> IconvWrapper::conv (std::basic_string_view<CIn> text) {
	using OutString = std::basic_string<COut>;

	OutString converted;
	if (not text.empty()) {
		const auto out_start = [&converted]() {
			return reinterpret_cast<char*>(converted.data());
		};
		const auto out_bytes = [&converted]() -> std::size_t {
			return converted.size() * sizeof(COut);
		};
		// Sizes arrive in bytes; round up to a whole number of COut units.
		const auto resize_bytes = [&converted](std::size_t byte_count) {
			converted.resize((byte_count + sizeof(COut) - 1) / sizeof(COut));
		};

		const char* const raw_input = reinterpret_cast<const char*>(text.data());
		const std::size_t raw_len = text.size() * sizeof(CIn);

		this->conv(
			raw_input,
			raw_len,
			out_start,
			out_bytes,
			resize_bytes,
			sizeof(COut) / sizeof(CIn)
		);
	}
	return converted;
}
} //namespace duck