Use iconv for converting html *before* passing input to tidyhtml.
Tidyhtml seems to be unable to convert from ISO-8859-1, and I suspect there will be many more failures like this in the future. So instead, just make sure all input passed to it is UTF-8 and tell tidy to assume its input is always UTF-8.
This commit is contained in:
parent
d64a4af105
commit
fa08abd00d
5 changed files with 237 additions and 2 deletions
|
@ -7,6 +7,7 @@ option(BUILD_SHARED_TIDY "Wheter you want to build tidy-html5 as a shared librar
|
|||
include(GetGitRevisionDescription)
|
||||
find_package(Boost 1.32.0 COMPONENTS program_options)
|
||||
find_package(XQilla 2.3.3 REQUIRED)
|
||||
find_package(Iconv REQUIRED)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
|
@ -34,6 +35,7 @@ add_executable(${PROJECT_NAME}
|
|||
src/scraplang/xpath_runner.cpp
|
||||
src/xpath.cpp
|
||||
src/read_all.cpp
|
||||
src/iconv_wrapper.cpp
|
||||
)
|
||||
|
||||
target_include_directories(${PROJECT_NAME} SYSTEM
|
||||
|
|
|
@ -27,7 +27,8 @@
|
|||
namespace duck {
|
||||
HtmlPool::HtmlPool (std::string&& agent_name, std::string&& src_charset) :
|
||||
m_agent(std::move(agent_name)),
|
||||
m_src_charset(std::move(src_charset))
|
||||
m_src_charset(std::move(src_charset)),
|
||||
m_iconv(m_src_charset, "utf-8", IconvWrapper::ModeIgnore)
|
||||
{
|
||||
}
|
||||
|
||||
|
@ -46,7 +47,7 @@ namespace duck {
|
|||
);
|
||||
}
|
||||
|
||||
*utf8_html = duck::clean_html(std::move(*utf8_html), (m_src_charset.empty() ? OptString() : OptString(m_src_charset)));
|
||||
*utf8_html = duck::clean_html(m_iconv.conv_char(*utf8_html), "utf-8");
|
||||
return utf8_html.release();
|
||||
}
|
||||
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
#define idCDCACC393BE24CBD94A3B5E2985984A3
|
||||
|
||||
#include "scraplang/html_pool_base.hpp"
|
||||
#include "iconv_wrapper.hpp"
|
||||
|
||||
namespace duck {
|
||||
class HtmlPool : public ::duck::sl::HtmlPoolBase {
|
||||
|
@ -34,6 +35,7 @@ namespace duck {
|
|||
|
||||
std::string m_agent;
|
||||
std::string m_src_charset;
|
||||
IconvWrapper m_iconv;
|
||||
|
||||
public:
|
||||
HtmlPool(std::string&& agent_name, std::string&& src_charset);
|
||||
|
|
143
src/iconv_wrapper.cpp
Normal file
143
src/iconv_wrapper.cpp
Normal file
|
@ -0,0 +1,143 @@
|
|||
/* Copyright (C) 2015 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "iconv_wrapper.hpp"

#include <iconv.h>

#include <algorithm>
#include <cassert>
#include <cerrno>
#include <cstddef>
#include <iterator>
|
||||
|
||||
namespace duck {
|
||||
namespace {
	/// Deleter that lets std::unique_ptr own an iconv_t conversion descriptor.
	///
	/// A null descriptor and the (iconv_t)-1 value that iconv_open() returns
	/// on failure are both treated as "nothing to close".
	class IconvDeleter {
	public:
		typedef iconv_t pointer;

		// The descriptor is taken by value: std::unique_ptr requires
		// get_deleter()(get()) to be well-formed, and the prvalue returned by
		// get() cannot bind to a non-const lvalue reference.
		void operator() (pointer resource) const noexcept {
			if (resource and reinterpret_cast<iconv_t>(-1) != resource) {
				iconv_close(resource);
			}
		}
	};
} //unnamed namespace

/// Owning handle for an iconv conversion descriptor.
typedef std::unique_ptr<iconv_t, IconvDeleter> UniqueIconv;
|
||||
|
||||
IconvBadSequence::IconvBadSequence (const std::string& message) :
|
||||
std::domain_error(message)
|
||||
{
|
||||
}
|
||||
|
||||
IconvOpenFailure::IconvOpenFailure (const std::string& message) :
|
||||
std::logic_error(message)
|
||||
{
|
||||
}
|
||||
|
||||
/// Implementation state of IconvWrapper, reached through m_local.
struct IconvWrapper::LocalData {
	/// Conversion descriptor set by the constructor from iconv_open().
	UniqueIconv context;
};
|
||||
|
||||
/// Opens an iconv conversion descriptor translating from @p from to @p to.
///
/// @param from  Name of the source encoding, handed to iconv_open() as is.
/// @param to    Name of the target encoding; depending on @p mode a
///              "//IGNORE" or "//TRANSLIT" suffix is appended to it.
/// @param mode  Selects which suffix, if any, gets appended to @p to.
/// @throws IconvOpenFailure if iconv_open() returns (iconv_t)-1.
IconvWrapper::IconvWrapper(const std::string& from, std::string to, Mode mode) :
	m_local(std::make_unique<LocalData>())
{
	switch (mode) {
	case ModeIgnore:
		to += "//IGNORE";
		break;
	case ModeTransliterate:
		to += "//TRANSLIT";
		break;
	case ModeDefault:
	default:
		break;
	}

	const iconv_t descriptor = iconv_open(to.c_str(), from.c_str());
	// Snapshot errno immediately: assembling the message below allocates and
	// could overwrite it before it is reported.
	const int open_errno = errno;
	m_local->context.reset(descriptor);

	if (reinterpret_cast<iconv_t>(-1) == descriptor) {
		auto msg = std::string("Failed to create an iconv context for \"") +
			from + "\" to \"" + to + "\" conversion (error code " +
			std::to_string(open_errno);
		if (EINVAL == open_errno)
			msg += " EINVAL";
		msg += ")";

		throw IconvOpenFailure(msg);
	}
}
|
||||
|
||||
// Defaulted in this translation unit, where LocalData is a complete type: the
// header only forward-declares it, and destroying m_local (a
// std::unique_ptr<LocalData>) needs the full definition.
IconvWrapper::~IconvWrapper() noexcept = default;
|
||||
|
||||
void IconvWrapper::conv (
|
||||
const char* buff,
|
||||
std::size_t len,
|
||||
PtrGetterFunc get_ptr,
|
||||
SizeGetterFunc get_size,
|
||||
ReallocFunc realloc,
|
||||
std::size_t grow_hint
|
||||
) {
|
||||
assert(buff);
|
||||
assert(len);
|
||||
|
||||
const constexpr std::size_t def_inc = 16;
|
||||
const constexpr std::size_t iconv_err = static_cast<std::size_t>(-1);
|
||||
|
||||
std::size_t nchars;
|
||||
std::size_t inbytesleft = len;
|
||||
char* inbuff = const_cast<char*>(buff);
|
||||
std::ptrdiff_t out_offset = 0;
|
||||
std::size_t grow_factor = grow_hint;
|
||||
std::size_t outbytesleft;
|
||||
|
||||
do {
|
||||
realloc(std::max(inbytesleft * grow_factor, def_inc) + get_size());
|
||||
assert(get_size() > static_cast<std::size_t>(out_offset));
|
||||
outbytesleft = get_size() - out_offset;
|
||||
char* outbuff = get_ptr() + out_offset;
|
||||
|
||||
const auto old_inbytesleft = inbytesleft;
|
||||
const auto old_outbytesleft = outbytesleft;
|
||||
|
||||
nchars = ::iconv(
|
||||
m_local->context.get(),
|
||||
&inbuff,
|
||||
&inbytesleft,
|
||||
&outbuff,
|
||||
&outbytesleft
|
||||
);
|
||||
if (iconv_err == nchars) {
|
||||
const auto pos_str = std::to_string(len - inbytesleft);
|
||||
switch (errno) {
|
||||
case EILSEQ:
|
||||
throw IconvBadSequence("Invalid input multibyte sequence at byte " + pos_str);
|
||||
case EINVAL:
|
||||
throw IconvBadSequence("Incomplete input multibyte sequence at byte " + pos_str);
|
||||
}
|
||||
}
|
||||
|
||||
out_offset = std::distance(get_ptr(), outbuff);
|
||||
assert(out_offset >= 0);
|
||||
|
||||
const auto in_diff = old_inbytesleft - inbytesleft;
|
||||
const auto out_diff = old_outbytesleft - outbytesleft;
|
||||
grow_factor = std::max<std::size_t>(1, out_diff / in_diff);
|
||||
} while (iconv_err == nchars and E2BIG == errno);
|
||||
|
||||
assert(outbytesleft < get_size());
|
||||
realloc(get_size() - outbytesleft);
|
||||
}
|
||||
} //namespace duck
|
87
src/iconv_wrapper.hpp
Normal file
87
src/iconv_wrapper.hpp
Normal file
|
@ -0,0 +1,87 @@
|
|||
/* Copyright (C) 2015 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
* DuckScraper is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* DuckScraper is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <functional>
|
||||
#include <stdexcept>
|
||||
|
||||
namespace duck {
|
||||
/// Exception derived from std::domain_error, thrown by IconvWrapper when
/// iconv() rejects the input with EILSEQ or EINVAL.
class IconvBadSequence : public std::domain_error {
public:
	/// @param message  Text forwarded to the std::domain_error base.
	explicit IconvBadSequence(const std::string& message);
};
|
||||
/// Exception derived from std::logic_error, thrown by IconvWrapper's
/// constructor when iconv_open() fails.
class IconvOpenFailure : public std::logic_error {
public:
	/// @param message  Text forwarded to the std::logic_error base.
	explicit IconvOpenFailure(const std::string& message);
};
|
||||
|
||||
/// C++ front-end over an iconv conversion descriptor, converting whole
/// strings from one encoding to another.
class IconvWrapper {
	// Callbacks through which the private conv() reaches the output buffer.
	using PtrGetterFunc = std::function<char*()>;
	using ReallocFunc = std::function<void(std::size_t)>;
	using SizeGetterFunc = std::function<std::size_t()>;

public:
	/// Selects the suffix the constructor appends to the target encoding.
	enum Mode {
		ModeTransliterate, ///< append "//TRANSLIT"
		ModeIgnore,        ///< append "//IGNORE"
		ModeDefault        ///< append nothing
	};

	/// @throws IconvOpenFailure if the conversion descriptor cannot be opened.
	IconvWrapper (const std::string& from, std::string to, Mode mode=ModeDefault);
	~IconvWrapper() noexcept;

	/// Converts @p text and returns the result as a string of COut units.
	template <typename CIn, typename COut>
	std::basic_string<COut> conv (std::basic_string_view<CIn> text);

	/// Shorthand for conv<char, char>().
	std::string conv_char (std::string_view text) {
		return conv<char, char>(text);
	}

private:
	struct LocalData;

	/// Byte-level conversion routine behind the public conv() template.
	void conv (
		const char* buff,
		std::size_t len,
		PtrGetterFunc get_ptr,
		SizeGetterFunc get_size,
		ReallocFunc realloc,
		std::size_t grow_hint
	);

	std::unique_ptr<LocalData> m_local;
};
|
||||
|
||||
template <typename CIn, typename COut>
|
||||
std::basic_string<COut> IconvWrapper::conv (std::basic_string_view<CIn> text) {
|
||||
typedef std::basic_string<COut> string;
|
||||
|
||||
if (text.empty())
|
||||
return {};
|
||||
|
||||
string retval;
|
||||
this->conv(
|
||||
reinterpret_cast<const char*>(text.data()),
|
||||
text.size() * sizeof(CIn),
|
||||
[&retval](){return reinterpret_cast<char*>(retval.data());},
|
||||
[&retval]()->std::size_t {return retval.size() * sizeof(COut);},
|
||||
[&retval](std::size_t sz){retval.resize((sz + sizeof(COut) - 1) / sizeof(COut));},
|
||||
sizeof(COut) / sizeof(CIn)
|
||||
);
|
||||
return retval;
|
||||
}
|
||||
} //namespace duck
|
Loading…
Reference in a new issue