Use iconv for converting html *before* passing input to tidyhtml.
Tidyhtml seems unable to convert from ISO-8859-1, and I suspect other encodings will fail in the same way. So instead, convert all input to UTF-8 before handing it over, and tell tidy to assume its input is always UTF-8.
This commit is contained in:
parent
d64a4af105
commit
fa08abd00d
5 changed files with 237 additions and 2 deletions
|
@ -7,6 +7,7 @@ option(BUILD_SHARED_TIDY "Wheter you want to build tidy-html5 as a shared librar
|
||||||
include(GetGitRevisionDescription)
|
include(GetGitRevisionDescription)
|
||||||
find_package(Boost 1.32.0 COMPONENTS program_options)
|
find_package(Boost 1.32.0 COMPONENTS program_options)
|
||||||
find_package(XQilla 2.3.3 REQUIRED)
|
find_package(XQilla 2.3.3 REQUIRED)
|
||||||
|
find_package(Iconv REQUIRED)
|
||||||
|
|
||||||
set(CMAKE_CXX_STANDARD 17)
|
set(CMAKE_CXX_STANDARD 17)
|
||||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||||
|
@ -34,6 +35,7 @@ add_executable(${PROJECT_NAME}
|
||||||
src/scraplang/xpath_runner.cpp
|
src/scraplang/xpath_runner.cpp
|
||||||
src/xpath.cpp
|
src/xpath.cpp
|
||||||
src/read_all.cpp
|
src/read_all.cpp
|
||||||
|
src/iconv_wrapper.cpp
|
||||||
)
|
)
|
||||||
|
|
||||||
target_include_directories(${PROJECT_NAME} SYSTEM
|
target_include_directories(${PROJECT_NAME} SYSTEM
|
||||||
|
|
|
@ -27,7 +27,8 @@
|
||||||
namespace duck {
|
namespace duck {
|
||||||
HtmlPool::HtmlPool (std::string&& agent_name, std::string&& src_charset) :
|
HtmlPool::HtmlPool (std::string&& agent_name, std::string&& src_charset) :
|
||||||
m_agent(std::move(agent_name)),
|
m_agent(std::move(agent_name)),
|
||||||
m_src_charset(std::move(src_charset))
|
m_src_charset(std::move(src_charset)),
|
||||||
|
m_iconv(m_src_charset, "utf-8", IconvWrapper::ModeIgnore)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -46,7 +47,7 @@ namespace duck {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
*utf8_html = duck::clean_html(std::move(*utf8_html), (m_src_charset.empty() ? OptString() : OptString(m_src_charset)));
|
*utf8_html = duck::clean_html(m_iconv.conv_char(*utf8_html), "utf-8");
|
||||||
return utf8_html.release();
|
return utf8_html.release();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -20,6 +20,7 @@
|
||||||
#define idCDCACC393BE24CBD94A3B5E2985984A3
|
#define idCDCACC393BE24CBD94A3B5E2985984A3
|
||||||
|
|
||||||
#include "scraplang/html_pool_base.hpp"
|
#include "scraplang/html_pool_base.hpp"
|
||||||
|
#include "iconv_wrapper.hpp"
|
||||||
|
|
||||||
namespace duck {
|
namespace duck {
|
||||||
class HtmlPool : public ::duck::sl::HtmlPoolBase {
|
class HtmlPool : public ::duck::sl::HtmlPoolBase {
|
||||||
|
@ -34,6 +35,7 @@ namespace duck {
|
||||||
|
|
||||||
std::string m_agent;
|
std::string m_agent;
|
||||||
std::string m_src_charset;
|
std::string m_src_charset;
|
||||||
|
IconvWrapper m_iconv;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
HtmlPool(std::string&& agent_name, std::string&& src_charset);
|
HtmlPool(std::string&& agent_name, std::string&& src_charset);
|
||||||
|
|
143
src/iconv_wrapper.cpp
Normal file
143
src/iconv_wrapper.cpp
Normal file
|
@ -0,0 +1,143 @@
|
||||||
|
/* Copyright (C) 2015 Michele Santullo
|
||||||
|
*
|
||||||
|
* This file is part of DuckScraper.
|
||||||
|
*
|
||||||
|
* DuckScraper is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* DuckScraper is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "iconv_wrapper.hpp"
|
||||||
|
#include <iconv.h>
|
||||||
|
#include <cassert>
|
||||||
|
#include <algorithm>
|
||||||
|
|
||||||
|
namespace duck {
|
||||||
|
namespace {
|
||||||
|
// Deleter that lets std::unique_ptr own an iconv_t conversion descriptor.
// The nested `pointer` alias makes unique_ptr store the iconv_t handle itself
// instead of an iconv_t*.
class IconvDeleter {
public:
	using pointer = iconv_t;

	// Closes `resource` unless it is null or the (iconv_t)-1 value that
	// iconv_open() returns on failure.
	// The handle is taken by value: std::unique_ptr requires
	// get_deleter()(get()) to be well-formed and get() yields a prvalue,
	// which cannot bind to a non-const lvalue reference. unique_ptr clears
	// its own stored pointer, so there is nothing to reset here.
	void operator() (pointer resource) const noexcept {
		if (resource and reinterpret_cast<iconv_t>(-1) != resource) {
			iconv_close(resource);
		}
	}
};
|
||||||
|
} //unnamed namespace
|
||||||
|
|
||||||
|
// Owning handle for an iconv conversion descriptor; released through IconvDeleter.
using UniqueIconv = std::unique_ptr<iconv_t, IconvDeleter>;
|
||||||
|
|
||||||
|
IconvBadSequence::IconvBadSequence (const std::string& message) :
|
||||||
|
std::domain_error(message)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
IconvOpenFailure::IconvOpenFailure (const std::string& message) :
|
||||||
|
std::logic_error(message)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
// Implementation-only state of IconvWrapper. It is defined in this file so
// that the header does not have to include <iconv.h>.
struct IconvWrapper::LocalData {
	// Conversion descriptor obtained from iconv_open() in the constructor.
	UniqueIconv context;
};
|
||||||
|
|
||||||
|
// Opens an iconv conversion descriptor that converts text from encoding
// `from` to encoding `to`.
//
// from: source encoding name, passed to iconv_open() unchanged.
// to:   target encoding name; "//IGNORE" or "//TRANSLIT" is appended to it
//       for ModeIgnore and ModeTransliterate respectively, nothing otherwise.
// mode: selects which suffix (if any) is appended to `to`.
//
// Throws IconvOpenFailure when iconv_open() fails. The message contains both
// encoding names and the errno value reported by iconv_open().
IconvWrapper::IconvWrapper(const std::string& from, std::string to, Mode mode) :
	m_local(std::make_unique<LocalData>())
{
	switch (mode) {
	case ModeIgnore:
		to += "//IGNORE";
		break;
	case ModeTransliterate:
		to += "//TRANSLIT";
		break;
	case ModeDefault:
	default:
		break;
	}

	const iconv_t handle = iconv_open(to.c_str(), from.c_str());
	if (reinterpret_cast<iconv_t>(-1) == handle) {
		// Capture errno before anything else runs: building the message
		// below allocates, and library calls are allowed to modify errno.
		const int open_errno = errno;

		auto msg = std::string("Failed to create an iconv context for \"") +
			from + "\" to \"" + to + "\" conversion (error code " +
			std::to_string(open_errno);
		if (EINVAL == open_errno)
			msg += " EINVAL";
		msg += ")";

		throw IconvOpenFailure(msg);
	}

	// Only a valid descriptor is ever stored in the owning pointer.
	m_local->context.reset(handle);
}
|
||||||
|
|
||||||
|
// Defaulted here rather than in the header: LocalData is only a complete type
// in this file, and destroying the std::unique_ptr<LocalData> member needs it.
IconvWrapper::~IconvWrapper() noexcept = default;
|
||||||
|
|
||||||
|
void IconvWrapper::conv (
|
||||||
|
const char* buff,
|
||||||
|
std::size_t len,
|
||||||
|
PtrGetterFunc get_ptr,
|
||||||
|
SizeGetterFunc get_size,
|
||||||
|
ReallocFunc realloc,
|
||||||
|
std::size_t grow_hint
|
||||||
|
) {
|
||||||
|
assert(buff);
|
||||||
|
assert(len);
|
||||||
|
|
||||||
|
const constexpr std::size_t def_inc = 16;
|
||||||
|
const constexpr std::size_t iconv_err = static_cast<std::size_t>(-1);
|
||||||
|
|
||||||
|
std::size_t nchars;
|
||||||
|
std::size_t inbytesleft = len;
|
||||||
|
char* inbuff = const_cast<char*>(buff);
|
||||||
|
std::ptrdiff_t out_offset = 0;
|
||||||
|
std::size_t grow_factor = grow_hint;
|
||||||
|
std::size_t outbytesleft;
|
||||||
|
|
||||||
|
do {
|
||||||
|
realloc(std::max(inbytesleft * grow_factor, def_inc) + get_size());
|
||||||
|
assert(get_size() > static_cast<std::size_t>(out_offset));
|
||||||
|
outbytesleft = get_size() - out_offset;
|
||||||
|
char* outbuff = get_ptr() + out_offset;
|
||||||
|
|
||||||
|
const auto old_inbytesleft = inbytesleft;
|
||||||
|
const auto old_outbytesleft = outbytesleft;
|
||||||
|
|
||||||
|
nchars = ::iconv(
|
||||||
|
m_local->context.get(),
|
||||||
|
&inbuff,
|
||||||
|
&inbytesleft,
|
||||||
|
&outbuff,
|
||||||
|
&outbytesleft
|
||||||
|
);
|
||||||
|
if (iconv_err == nchars) {
|
||||||
|
const auto pos_str = std::to_string(len - inbytesleft);
|
||||||
|
switch (errno) {
|
||||||
|
case EILSEQ:
|
||||||
|
throw IconvBadSequence("Invalid input multibyte sequence at byte " + pos_str);
|
||||||
|
case EINVAL:
|
||||||
|
throw IconvBadSequence("Incomplete input multibyte sequence at byte " + pos_str);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
out_offset = std::distance(get_ptr(), outbuff);
|
||||||
|
assert(out_offset >= 0);
|
||||||
|
|
||||||
|
const auto in_diff = old_inbytesleft - inbytesleft;
|
||||||
|
const auto out_diff = old_outbytesleft - outbytesleft;
|
||||||
|
grow_factor = std::max<std::size_t>(1, out_diff / in_diff);
|
||||||
|
} while (iconv_err == nchars and E2BIG == errno);
|
||||||
|
|
||||||
|
assert(outbytesleft < get_size());
|
||||||
|
realloc(get_size() - outbytesleft);
|
||||||
|
}
|
||||||
|
} //namespace duck
|
87
src/iconv_wrapper.hpp
Normal file
87
src/iconv_wrapper.hpp
Normal file
|
@ -0,0 +1,87 @@
|
||||||
|
/* Copyright (C) 2015 Michele Santullo
|
||||||
|
*
|
||||||
|
* This file is part of DuckScraper.
|
||||||
|
*
|
||||||
|
* DuckScraper is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* DuckScraper is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <memory>
|
||||||
|
#include <string>
|
||||||
|
#include <string_view>
|
||||||
|
#include <functional>
|
||||||
|
#include <stdexcept>
|
||||||
|
|
||||||
|
namespace duck {
|
||||||
|
// Thrown by IconvWrapper when the input contains an invalid or incomplete
// multibyte sequence.
class IconvBadSequence : public std::domain_error {
public:
	// message: text later returned by what().
	explicit IconvBadSequence(const std::string& message);
};
|
||||||
|
// Thrown by IconvWrapper's constructor when the conversion descriptor cannot
// be created.
class IconvOpenFailure : public std::logic_error {
public:
	// message: text later returned by what().
	explicit IconvOpenFailure(const std::string& message);
};
|
||||||
|
|
||||||
|
// Converts text between character encodings through an iconv conversion
// descriptor that is opened once, in the constructor, and reused for every
// conversion.
class IconvWrapper {
	// Callbacks through which the private conv() overload drives a
	// caller-owned output buffer.
	using PtrGetterFunc = std::function<char*()>;
	using ReallocFunc = std::function<void(std::size_t)>;
	using SizeGetterFunc = std::function<std::size_t()>;

public:
	// Selects the suffix the constructor appends to the target encoding:
	// "//TRANSLIT", "//IGNORE" or nothing.
	enum Mode {
		ModeTransliterate,
		ModeIgnore,
		ModeDefault
	};

	// from: source encoding name; to: target encoding name.
	// Throws IconvOpenFailure when the descriptor cannot be created.
	IconvWrapper (const std::string& from, std::string to, Mode mode=ModeDefault);
	~IconvWrapper() noexcept;

	// Converts `text` and returns the result as a string of COut units.
	// An empty `text` yields an empty string.
	template <typename CIn, typename COut>
	std::basic_string<COut> conv (std::basic_string_view<CIn> text);

	// Shorthand for conv<char, char>().
	std::string conv_char (std::string_view text) {
		return conv<char, char>(text);
	}

private:
	// Defined in the implementation file.
	struct LocalData;

	// Byte-level conversion used by the public template; the three callbacks
	// give it access to the output buffer.
	void conv (
		const char* buff,
		std::size_t len,
		PtrGetterFunc get_ptr,
		SizeGetterFunc get_size,
		ReallocFunc realloc,
		std::size_t grow_hint
	);

	std::unique_ptr<LocalData> m_local;
};
|
||||||
|
|
||||||
|
template <typename CIn, typename COut>
|
||||||
|
std::basic_string<COut> IconvWrapper::conv (std::basic_string_view<CIn> text) {
|
||||||
|
typedef std::basic_string<COut> string;
|
||||||
|
|
||||||
|
if (text.empty())
|
||||||
|
return {};
|
||||||
|
|
||||||
|
string retval;
|
||||||
|
this->conv(
|
||||||
|
reinterpret_cast<const char*>(text.data()),
|
||||||
|
text.size() * sizeof(CIn),
|
||||||
|
[&retval](){return reinterpret_cast<char*>(retval.data());},
|
||||||
|
[&retval]()->std::size_t {return retval.size() * sizeof(COut);},
|
||||||
|
[&retval](std::size_t sz){retval.resize((sz + sizeof(COut) - 1) / sizeof(COut));},
|
||||||
|
sizeof(COut) / sizeof(CIn)
|
||||||
|
);
|
||||||
|
return retval;
|
||||||
|
}
|
||||||
|
} //namespace duck
|
Loading…
Reference in a new issue