Add --from-code option for users to force the source charset.
This commit is contained in:
parent
54ac44b81d
commit
33866b3d6b
6 changed files with 74 additions and 15 deletions
|
@ -52,6 +52,7 @@ namespace duck {
|
|||
query_options.add_options()
|
||||
("agent", po::value<std::string>()->default_value(DEFAULT_USER_AGENT), "User agent that will be passed to the server")
|
||||
("model,m", po::value<std::string>(), "Read XPath expressions from the specified file instead of command line")
|
||||
("from-code,f", po::value<std::string>()->default_value(""), "Force source charset to this, disregard any charset reported by the server")
|
||||
;
|
||||
po::options_description positional_options("Positional options");
|
||||
positional_options.add_options()
|
||||
|
|
|
@ -25,27 +25,29 @@
|
|||
#include <iostream>
|
||||
|
||||
namespace duck {
|
||||
HtmlPool::HtmlPool (std::string&& agent_name) :
|
||||
m_agent(std::move(agent_name))
|
||||
HtmlPool::HtmlPool (std::string&& agent_name, std::string&& src_charset) :
|
||||
m_agent(std::move(agent_name)),
|
||||
m_src_charset(std::move(src_charset))
|
||||
{
|
||||
}
|
||||
|
||||
auto HtmlPool::OnResourceLoad (ResourceObjectParameterType parRes) -> ResourceType* {
|
||||
std::unique_ptr<std::string> html;
|
||||
HtmlPool::~HtmlPool() noexcept = default;
|
||||
|
||||
auto HtmlPool::OnResourceLoad (ResourceObjectParameterType parRes) -> ResourceType* {
|
||||
std::cout << "OnResourceLoad(): fetching html from \"" << parRes << "\"\n";
|
||||
|
||||
std::unique_ptr<std::string> utf8_html;
|
||||
if (parRes == "-") {
|
||||
html = std::make_unique<std::string>(read_all(std::cin));
|
||||
utf8_html = std::make_unique<std::string>(read_all(std::cin));
|
||||
}
|
||||
else {
|
||||
html = std::make_unique<std::string>(
|
||||
utf8_html = std::make_unique<std::string>(
|
||||
fetch_html(parRes, m_agent, false, false)
|
||||
);
|
||||
}
|
||||
|
||||
*html = duck::clean_html(std::move(*html));
|
||||
return html.release();
|
||||
*utf8_html = duck::clean_html(std::move(*utf8_html), (m_src_charset.empty() ? OptString() : OptString(m_src_charset)));
|
||||
return utf8_html.release();
|
||||
}
|
||||
|
||||
void HtmlPool::OnResourceDestroy (ResourceNameParamType, ResourceType* parRes) noexcept {
|
||||
|
|
|
@ -33,9 +33,11 @@ namespace duck {
|
|||
virtual ResourceNameType GetResourceNameFromResourceObject (ResourceObjectParameterType parRes);
|
||||
|
||||
std::string m_agent;
|
||||
std::string m_src_charset;
|
||||
|
||||
public:
|
||||
explicit HtmlPool (std::string&& agent_name);
|
||||
HtmlPool(std::string&& agent_name, std::string&& src_charset);
|
||||
~HtmlPool() noexcept;
|
||||
};
|
||||
} //namespace duck
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* Copyright (C) 2015 Michele Santullo
|
||||
/* Copyright (C) 2015-2020 Michele Santullo
|
||||
*
|
||||
* This file is part of DuckScraper.
|
||||
*
|
||||
|
@ -28,9 +28,49 @@
|
|||
#include <memory>
|
||||
#include <cassert>
|
||||
#include <utility>
|
||||
#include <cctype>
|
||||
#include <iterator>
|
||||
|
||||
namespace duck {
|
||||
namespace {
|
||||
// Returns a copy of `in` with every character converted to lowercase using
// std::tolower (i.e. according to the current C locale).
//
// Each character is passed to std::tolower as an unsigned char, because
// calling it with a negative char value is undefined behaviour.
std::string make_lowercase (std::string_view in) {
	std::string out;
	// The result has exactly as many characters as the input, so allocate once.
	out.reserve(in.size());
	for (const unsigned char c : in)
		out.push_back(static_cast<char>(std::tolower(c)));
	return out;
}
|
||||
|
||||
// Maps a user supplied charset name (as given to --from-code) to the matching
// Tidy input encoding.
//
// The comparison is case-insensitive. Besides the spellings that were always
// accepted, the commonly used registered names "us-ascii", "utf8",
// "macintosh" and "big5" are recognised as well, so that users are not forced
// to guess the one spelling this function happens to know.
//
// @param name charset name, any letter case
// @return the TidyEncodingOptions value corresponding to `name`
// @throws std::runtime_error if `name` is not a recognised charset
TidyEncodingOptions charset_to_enum (std::string_view name) {
	struct CharsetAlias {
		std::string_view name;
		TidyEncodingOptions encoding;
	};

	// All names in this table must be lowercase: they are compared against
	// the lowercased input.
	//
	// NOTE(review): TidyEncLatin0 and TidyEncLatin1 are still unmapped because
	// their charset names have not been decided yet (presumably iso-8859-15
	// and iso-8859-1 respectively - confirm against the Tidy documentation
	// before adding them).
	static constexpr CharsetAlias aliases[] = {
		{"ascii", TidyEncAscii},
		{"us-ascii", TidyEncAscii},
		{"utf-8", TidyEncUtf8},
		{"utf8", TidyEncUtf8},
#ifndef NO_NATIVE_ISO2022_SUPPORT
		{"iso-2022-cn", TidyEncIso2022},
#endif
		{"mac", TidyEncMac},
		{"macintosh", TidyEncMac},
		{"windows-1252", TidyEncWin1252},
		{"ibm858", TidyEncIbm858},
		{"utf-16le", TidyEncUtf16le},
		{"utf-16be", TidyEncUtf16be},
		{"utf-16", TidyEncUtf16},
		{"big-5", TidyEncBig5},
		{"big5", TidyEncBig5},
		{"shift-jis", TidyEncShiftjis},
		{"shift_jis", TidyEncShiftjis},
	};

	const std::string lower_name = make_lowercase(name);
	for (const auto& alias : aliases) {
		if (alias.name == lower_name)
			return alias.encoding;
	}

	// Report the name exactly as the user typed it, not the lowercased copy.
	throw std::runtime_error("Failed to recognise \"" + std::string(name) + "\" as a valid source charset");
}
|
||||
|
||||
bool isHttps (const std::string_view& parUrl) {
|
||||
const char protocol[] = "https://";
|
||||
|
@ -42,7 +82,7 @@ namespace duck {
|
|||
}
|
||||
} //unnamed namespace
|
||||
|
||||
std::string clean_html (std::string&& html) {
|
||||
std::string clean_html (std::string&& html, OptString src_charset) {
|
||||
|
||||
// Initialize a Tidy document
|
||||
TidyDoc tidyDoc = tidyCreate();
|
||||
|
@ -61,8 +101,13 @@ namespace duck {
|
|||
&& tidyOptSetBool(tidyDoc, TidyPPrintTabs, yes)
|
||||
&& tidyOptSetBool(tidyDoc, TidyLiteralAttribs, no)
|
||||
&& tidyOptSetBool(tidyDoc, TidyPunctWrap, no)
|
||||
&& tidyOptSetInt(tidyDoc, TidyOutCharEncoding, TidyEncUtf8)
|
||||
&& tidyOptSetBool(tidyDoc, TidyMetaCharset, yes)
|
||||
&& tidyOptSetBool(tidyDoc, TidyDropPropAttrs, yes);
|
||||
|
||||
if (src_charset)
|
||||
tidyOptSetInt(tidyDoc, TidyInCharEncoding, charset_to_enum(*src_charset));
|
||||
|
||||
int tidyResponseCode = -1;
|
||||
|
||||
// Parse input
|
||||
|
@ -121,6 +166,7 @@ namespace duck {
|
|||
//return 1;
|
||||
//}
|
||||
|
||||
//return FetchedHtml(oss.str(), easy.get_info<CURLINFO_CONTENT_TYPE>().get());
|
||||
return oss.str();
|
||||
}
|
||||
} //namespace duck
|
||||
|
|
|
@ -21,10 +21,13 @@
|
|||
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <optional>
|
||||
|
||||
namespace duck {
|
||||
typedef std::optional<std::string_view> OptString;
|
||||
|
||||
std::string fetch_html ( const std::string_view& parSource, std::string parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost );
|
||||
std::string clean_html ( std::string&& html );
|
||||
std::string clean_html ( std::string&& html, OptString src_charset );
|
||||
} //namespace duck
|
||||
|
||||
#endif
|
||||
|
|
11
src/main.cpp
11
src/main.cpp
|
@ -83,10 +83,12 @@ namespace {
|
|||
const auto& vm = parVarMap;
|
||||
const auto url = vm["input-url"].as<std::string>();
|
||||
|
||||
duck::HtmlPool html_pool(std::string(parVarMap["agent"].as<std::string>()));
|
||||
duck::HtmlPool html_pool(
|
||||
std::string(parVarMap["agent"].as<std::string>()),
|
||||
std::string(parVarMap["from-code"].as<std::string>())
|
||||
);
|
||||
const auto in_html_id = html_pool.GetOrAdd(url);
|
||||
std::string html = *html_pool.GetByID(in_html_id);
|
||||
|
||||
if (vm.count("dump")) {
|
||||
dump_string(vm["dump"].as<std::string>(), html);
|
||||
}
|
||||
|
@ -120,7 +122,10 @@ namespace {
|
|||
const std::string script = duck::read_all(parVarMap["model"].as<std::string>());
|
||||
auto ast = duck::sl::parse(script);
|
||||
|
||||
duck::HtmlPool html_pool(std::string(parVarMap["agent"].as<std::string>()));
|
||||
duck::HtmlPool html_pool(
|
||||
std::string(parVarMap["agent"].as<std::string>()),
|
||||
std::string(parVarMap["from-code"].as<std::string>())
|
||||
);
|
||||
duck::sl::apply(ast, duck::sl::HtmlPoolBaseSP(&html_pool), xpath);
|
||||
//auto list = duck::get_xpath_definitions(*ast);
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue