diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8f9717f..fa8ba88 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,6 +7,7 @@ option(BUILD_SHARED_TIDY "Wheter you want to build tidy-html5 as a shared librar
include(GetGitRevisionDescription)
find_package(Boost 1.32.0 COMPONENTS program_options)
find_package(XQilla 2.3.3 REQUIRED)
+find_package(Iconv REQUIRED)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
@@ -34,6 +35,7 @@ add_executable(${PROJECT_NAME}
src/scraplang/xpath_runner.cpp
src/xpath.cpp
src/read_all.cpp
+ src/iconv_wrapper.cpp
)
target_include_directories(${PROJECT_NAME} SYSTEM
diff --git a/src/html_pool.cpp b/src/html_pool.cpp
index a8031f7..8f24400 100644
--- a/src/html_pool.cpp
+++ b/src/html_pool.cpp
@@ -27,7 +27,8 @@
namespace duck {
HtmlPool::HtmlPool (std::string&& agent_name, std::string&& src_charset) :
m_agent(std::move(agent_name)),
- m_src_charset(std::move(src_charset))
+ m_src_charset(std::move(src_charset)), // declared before m_iconv, so it is already initialised when m_iconv reads it
+ m_iconv(m_src_charset, "utf-8", IconvWrapper::ModeIgnore) // NOTE(review): an empty m_src_charset goes to iconv_open() as-is, and a failed open throws IconvOpenFailure from here - confirm callers expect that
{
}
@@ -46,7 +47,7 @@ namespace duck {
);
}
- *utf8_html = duck::clean_html(std::move(*utf8_html), (m_src_charset.empty() ? OptString() : OptString(m_src_charset)));
+ *utf8_html = duck::clean_html(m_iconv.conv_char(*utf8_html), "utf-8");
return utf8_html.release();
}
diff --git a/src/html_pool.hpp b/src/html_pool.hpp
index ea95fd3..2b7acca 100644
--- a/src/html_pool.hpp
+++ b/src/html_pool.hpp
@@ -20,6 +20,7 @@
#define idCDCACC393BE24CBD94A3B5E2985984A3
#include "scraplang/html_pool_base.hpp"
+#include "iconv_wrapper.hpp"
namespace duck {
class HtmlPool : public ::duck::sl::HtmlPoolBase {
@@ -34,6 +35,7 @@ namespace duck {
std::string m_agent;
std::string m_src_charset;
+ IconvWrapper m_iconv;
public:
HtmlPool(std::string&& agent_name, std::string&& src_charset);
diff --git a/src/iconv_wrapper.cpp b/src/iconv_wrapper.cpp
new file mode 100644
index 0000000..9c149fb
--- /dev/null
+++ b/src/iconv_wrapper.cpp
@@ -0,0 +1,143 @@
+/* Copyright (C) 2015 Michele Santullo
+ *
+ * This file is part of DuckScraper.
+ *
+ * DuckScraper is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * DuckScraper is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "iconv_wrapper.hpp"
+#include
+#include
+#include
+
+namespace duck {
+ namespace {
+ // Deleter for std::unique_ptr that closes an iconv descriptor; null and (iconv_t)-1 are skipped.
+ class IconvDeleter {
+ public:
+     typedef iconv_t pointer; // makes unique_ptr store the iconv_t handle itself
+     // Taken by value: unique_ptr invokes its deleter with an rvalue, which cannot bind to pointer&.
+     void operator() (pointer resource) const {
+         if (resource and reinterpret_cast<iconv_t>(-1) != resource) {
+             iconv_close(resource);
+         }
+     }
+ };
+ } //unnamed namespace
+
+ typedef std::unique_ptr<iconv_t, IconvDeleter> UniqueIconv; // owning iconv handle, released through IconvDeleter
+
+ // Builds the exception, handing the message to the std::domain_error base.
+ IconvBadSequence::IconvBadSequence (const std::string& message) :
+     std::domain_error{message}
+ {}
+
+ // Builds the exception, handing the message to the std::logic_error base.
+ IconvOpenFailure::IconvOpenFailure (const std::string& message) :
+     std::logic_error{message}
+ {}
+
+ struct IconvWrapper::LocalData { // defined only in this file; the header just forward-declares it
+ UniqueIconv context; // iconv descriptor opened by the IconvWrapper constructor
+ };
+
+ // Opens a from -> to iconv context, appending the suffix selected by mode; throws IconvOpenFailure.
+ IconvWrapper::IconvWrapper(const std::string& from, std::string to, Mode mode) :
+     m_local(std::make_unique<LocalData>())
+ {
+     switch (mode) {
+     case ModeIgnore:
+         to += "//IGNORE";
+         break;
+     case ModeTransliterate:
+         to += "//TRANSLIT";
+         break;
+     case ModeDefault:
+     default:
+         break;
+     }
+
+     m_local->context.reset(iconv_open(to.c_str(), from.c_str()));
+     if (reinterpret_cast<iconv_t>(-1) == m_local->context.get()) {
+         // Snapshot errno first: building the message allocates and may overwrite it.
+         const int open_errno = errno;
+         auto msg = std::string("Failed to create an iconv context for \"") +
+             from + "\" to \"" + to + "\" conversion (error code " +
+             std::to_string(open_errno);
+         msg += (EINVAL == open_errno ? " EINVAL)" : ")");
+         throw IconvOpenFailure(msg);
+     }
+ }
+
+ IconvWrapper::~IconvWrapper() noexcept = default; // defaulted here, after LocalData is defined in this file, because the header only forward-declares it
+
+ // Converts len bytes at buff, enlarging the caller's buffer via realloc() until the output fits.
+ // Throws IconvBadSequence on EILSEQ/EINVAL; finally trims the buffer to the bytes written.
+ void IconvWrapper::conv (
+     const char* buff,
+     std::size_t len,
+     PtrGetterFunc get_ptr,
+     SizeGetterFunc get_size,
+     ReallocFunc realloc,
+     std::size_t grow_hint
+ ) {
+     assert(buff);
+     assert(len);
+
+     constexpr std::size_t def_inc = 16;
+     constexpr std::size_t iconv_err = static_cast<std::size_t>(-1);
+
+     std::size_t nchars;
+     int conv_errno = 0;
+     std::size_t inbytesleft = len;
+     char* inbuff = const_cast<char*>(buff); // iconv() takes a non-const char** for its input
+     std::ptrdiff_t out_offset = 0;
+     std::size_t grow_factor = grow_hint;
+     std::size_t outbytesleft;
+
+     // Each pass adds room for the remaining input (at least def_inc bytes) and resumes at out_offset.
+     do {
+         realloc(std::max<std::size_t>(inbytesleft * grow_factor, def_inc) + get_size());
+         assert(get_size() > static_cast<std::size_t>(out_offset));
+         outbytesleft = get_size() - out_offset;
+         char* outbuff = get_ptr() + out_offset;
+
+         const auto old_inbytesleft = inbytesleft;
+         const auto old_outbytesleft = outbytesleft;
+
+         nchars = ::iconv(m_local->context.get(), &inbuff, &inbytesleft, &outbuff, &outbytesleft);
+         // Snapshot errno at once: the calls below (allocation, callbacks) may overwrite it.
+         conv_errno = (iconv_err == nchars ? errno : 0);
+         if (EILSEQ == conv_errno or EINVAL == conv_errno) {
+             const auto pos_str = std::to_string(len - inbytesleft);
+             if (EILSEQ == conv_errno)
+                 throw IconvBadSequence("Invalid input multibyte sequence at byte " + pos_str);
+             throw IconvBadSequence("Incomplete input multibyte sequence at byte " + pos_str);
+         }
+
+         out_offset = std::distance(get_ptr(), outbuff);
+         assert(out_offset >= 0);
+
+         const auto in_diff = old_inbytesleft - inbytesleft;
+         const auto out_diff = old_outbytesleft - outbytesleft;
+         // No input consumed means no ratio to measure (and a division by zero): keep the old factor.
+         if (in_diff != 0)
+             grow_factor = std::max<std::size_t>(1, out_diff / in_diff);
+     } while (E2BIG == conv_errno); // any other errno value ends the loop without an exception
+
+     assert(outbytesleft < get_size());
+     // Drop the unused tail so the buffer ends at the last converted byte.
+     realloc(get_size() - outbytesleft);
+ }
+} //namespace duck
diff --git a/src/iconv_wrapper.hpp b/src/iconv_wrapper.hpp
new file mode 100644
index 0000000..9680c7f
--- /dev/null
+++ b/src/iconv_wrapper.hpp
@@ -0,0 +1,87 @@
+/* Copyright (C) 2015 Michele Santullo
+ *
+ * This file is part of DuckScraper.
+ *
+ * DuckScraper is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * DuckScraper is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with DuckScraper. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+namespace duck {
+ class IconvBadSequence : public std::domain_error { // thrown by IconvWrapper::conv when iconv() reports EILSEQ or EINVAL
+ public:
+ explicit IconvBadSequence(const std::string& message); // message is forwarded to the std::domain_error base
+ };
+ class IconvOpenFailure : public std::logic_error { // thrown by the IconvWrapper constructor when iconv_open() fails
+ public:
+ explicit IconvOpenFailure(const std::string& message); // message is forwarded to the std::logic_error base
+ };
+
+ class IconvWrapper { // converts text between two character sets through one iconv descriptor
+     typedef std::function<char*()> PtrGetterFunc; // returns the start of the output buffer
+     typedef std::function<void(std::size_t)> ReallocFunc; // resizes the output buffer to hold the given byte count
+     typedef std::function<std::size_t()> SizeGetterFunc; // returns the output buffer size in bytes
+ public:
+     enum Mode { // suffix appended to the target charset: "//TRANSLIT", "//IGNORE" or none
+         ModeTransliterate, ModeIgnore, ModeDefault
+     };
+     // Opens the from -> to conversion; throws IconvOpenFailure when iconv_open() fails.
+     IconvWrapper (const std::string& from, std::string to, Mode mode=ModeDefault);
+     ~IconvWrapper() noexcept;
+     // Converts text and returns it as a string of COut; throws IconvBadSequence on bad input.
+     template <typename COut, typename CIn>
+     std::basic_string<COut> conv (std::basic_string_view<CIn> text);
+     // Shortcut for conv<char>() on a narrow-character view.
+     std::string conv_char (std::string_view text) {return conv<char>(text);}
+
+ private:
+     struct LocalData; // only forward-declared here; defined in the implementation file
+     // Byte-level worker behind the template; the three callbacks expose the caller's output buffer.
+     void conv (
+         const char* buff,
+         std::size_t len,
+         PtrGetterFunc get_ptr,
+         SizeGetterFunc get_size,
+         ReallocFunc realloc,
+         std::size_t grow_hint
+     );
+
+     std::unique_ptr<LocalData> m_local;
+ };
+
+ // Converts text to the target charset, returning the result as a string of COut units.
+ template <typename COut, typename CIn>
+ std::basic_string<COut> IconvWrapper::conv (std::basic_string_view<CIn> text) {
+     if (text.empty()) // the byte-level conv() asserts a non-zero length
+         return {};
+     // Growth hint: output bytes expected per input byte; clamped so a narrower COut never yields 0.
+     constexpr std::size_t grow_hint = (sizeof(COut) > sizeof(CIn) ? sizeof(COut) / sizeof(CIn) : 1);
+     std::basic_string<COut> retval;
+     this->conv(
+         reinterpret_cast<const char*>(text.data()),
+         text.size() * sizeof(CIn),
+         [&retval](){return reinterpret_cast<char*>(retval.data());},
+         [&retval]()->std::size_t {return retval.size() * sizeof(COut);},
+         [&retval](std::size_t sz){retval.resize((sz + sizeof(COut) - 1) / sizeof(COut));},
+         grow_hint
+     );
+     return retval;
+ }
+} //namespace duck