Add --from-code option for users to force the source charset.
This commit is contained in:
parent
54ac44b81d
commit
33866b3d6b
6 changed files with 74 additions and 15 deletions
|
@ -52,6 +52,7 @@ namespace duck {
|
||||||
query_options.add_options()
|
query_options.add_options()
|
||||||
("agent", po::value<std::string>()->default_value(DEFAULT_USER_AGENT), "User agent that will be passed to the server")
|
("agent", po::value<std::string>()->default_value(DEFAULT_USER_AGENT), "User agent that will be passed to the server")
|
||||||
("model,m", po::value<std::string>(), "Read XPath expressions from the specified file instead of command line")
|
("model,m", po::value<std::string>(), "Read XPath expressions from the specified file instead of command line")
|
||||||
|
("from-code,f", po::value<std::string>()->default_value(""), "Force source charset to this, disregard any charset reported by the server")
|
||||||
;
|
;
|
||||||
po::options_description positional_options("Positional options");
|
po::options_description positional_options("Positional options");
|
||||||
positional_options.add_options()
|
positional_options.add_options()
|
||||||
|
|
|
@ -25,27 +25,29 @@
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
|
||||||
namespace duck {
|
namespace duck {
|
||||||
HtmlPool::HtmlPool (std::string&& agent_name) :
|
HtmlPool::HtmlPool (std::string&& agent_name, std::string&& src_charset) :
|
||||||
m_agent(std::move(agent_name))
|
m_agent(std::move(agent_name)),
|
||||||
|
m_src_charset(std::move(src_charset))
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
auto HtmlPool::OnResourceLoad (ResourceObjectParameterType parRes) -> ResourceType* {
|
HtmlPool::~HtmlPool() noexcept = default;
|
||||||
std::unique_ptr<std::string> html;
|
|
||||||
|
|
||||||
|
auto HtmlPool::OnResourceLoad (ResourceObjectParameterType parRes) -> ResourceType* {
|
||||||
std::cout << "OnResourceLoad(): fetching html from \"" << parRes << "\"\n";
|
std::cout << "OnResourceLoad(): fetching html from \"" << parRes << "\"\n";
|
||||||
|
|
||||||
|
std::unique_ptr<std::string> utf8_html;
|
||||||
if (parRes == "-") {
|
if (parRes == "-") {
|
||||||
html = std::make_unique<std::string>(read_all(std::cin));
|
utf8_html = std::make_unique<std::string>(read_all(std::cin));
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
html = std::make_unique<std::string>(
|
utf8_html = std::make_unique<std::string>(
|
||||||
fetch_html(parRes, m_agent, false, false)
|
fetch_html(parRes, m_agent, false, false)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
*html = duck::clean_html(std::move(*html));
|
*utf8_html = duck::clean_html(std::move(*utf8_html), (m_src_charset.empty() ? OptString() : OptString(m_src_charset)));
|
||||||
return html.release();
|
return utf8_html.release();
|
||||||
}
|
}
|
||||||
|
|
||||||
void HtmlPool::OnResourceDestroy (ResourceNameParamType, ResourceType* parRes) noexcept {
|
void HtmlPool::OnResourceDestroy (ResourceNameParamType, ResourceType* parRes) noexcept {
|
||||||
|
|
|
@ -33,9 +33,11 @@ namespace duck {
|
||||||
virtual ResourceNameType GetResourceNameFromResourceObject (ResourceObjectParameterType parRes);
|
virtual ResourceNameType GetResourceNameFromResourceObject (ResourceObjectParameterType parRes);
|
||||||
|
|
||||||
std::string m_agent;
|
std::string m_agent;
|
||||||
|
std::string m_src_charset;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
explicit HtmlPool (std::string&& agent_name);
|
HtmlPool(std::string&& agent_name, std::string&& src_charset);
|
||||||
|
~HtmlPool() noexcept;
|
||||||
};
|
};
|
||||||
} //namespace duck
|
} //namespace duck
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
/* Copyright (C) 2015 Michele Santullo
|
/* Copyright (C) 2015-2020 Michele Santullo
|
||||||
*
|
*
|
||||||
* This file is part of DuckScraper.
|
* This file is part of DuckScraper.
|
||||||
*
|
*
|
||||||
|
@ -28,9 +28,49 @@
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
|
#include <cctype>
|
||||||
|
#include <iterator>
|
||||||
|
|
||||||
namespace duck {
|
namespace duck {
|
||||||
namespace {
|
namespace {
|
||||||
|
// Returns a copy of `in` with every character passed through std::tolower.
//
// Each character is widened to unsigned char before the std::tolower call
// and the int result is narrowed back to char explicitly. The output buffer
// is sized once up front, so building the result needs a single allocation.
std::string make_lowercase (std::string_view in) {
	std::string out;
	out.reserve(in.size());
	for (const char c : in) {
		out.push_back(static_cast<char>(std::tolower(static_cast<unsigned char>(c))));
	}
	return out;
}
|
||||||
|
|
||||||
|
// Translates a user-supplied charset name into the TidyEncodingOptions value
// to use as the Tidy input encoding.
//
// Matching is case-insensitive (the name is lowercased with make_lowercase
// before lookup). Besides the spellings that were accepted originally, the
// table carries the common iconv/IANA-style aliases ("utf8", "big5",
// "iso-8859-1", ...) and the Latin-0 / Latin-1 encodings.
//
// Throws std::runtime_error if the name matches none of the known aliases.
TidyEncodingOptions charset_to_enum (std::string_view name) {
	struct CharsetAlias {
		std::string_view alias;
		TidyEncodingOptions encoding;
	};

	// All aliases must be spelled in lowercase: they are compared against the
	// lowercased input.
	static const CharsetAlias known_charsets[] = {
		{"ascii", TidyEncAscii},
		{"us-ascii", TidyEncAscii},
		{"latin0", TidyEncLatin0},
		{"iso-8859-15", TidyEncLatin0},
		{"latin1", TidyEncLatin1},
		{"iso-8859-1", TidyEncLatin1},
		{"utf-8", TidyEncUtf8},
		{"utf8", TidyEncUtf8},
#ifndef NO_NATIVE_ISO2022_SUPPORT
		{"iso-2022-cn", TidyEncIso2022},
#endif
		{"mac", TidyEncMac},
		{"macintosh", TidyEncMac},
		{"windows-1252", TidyEncWin1252},
		{"cp1252", TidyEncWin1252},
		{"ibm858", TidyEncIbm858},
		{"utf-16le", TidyEncUtf16le},
		{"utf-16be", TidyEncUtf16be},
		{"utf-16", TidyEncUtf16},
		{"big-5", TidyEncBig5},
		{"big5", TidyEncBig5},
		{"shift-jis", TidyEncShiftjis},
		{"shift_jis", TidyEncShiftjis}
	};

	const std::string lower_name = make_lowercase(name);
	for (const CharsetAlias& candidate : known_charsets) {
		if (candidate.alias == lower_name)
			return candidate.encoding;
	}

	// The original (non-lowercased) spelling is reported back to the user.
	throw std::runtime_error("Failed to recognise \"" + std::string(name) + "\" as a valid source charset");
}
|
||||||
|
|
||||||
bool isHttps (const std::string_view& parUrl) {
|
bool isHttps (const std::string_view& parUrl) {
|
||||||
const char protocol[] = "https://";
|
const char protocol[] = "https://";
|
||||||
|
@ -42,7 +82,7 @@ namespace duck {
|
||||||
}
|
}
|
||||||
} //unnamed namespace
|
} //unnamed namespace
|
||||||
|
|
||||||
std::string clean_html (std::string&& html) {
|
std::string clean_html (std::string&& html, OptString src_charset) {
|
||||||
|
|
||||||
// Initialize a Tidy document
|
// Initialize a Tidy document
|
||||||
TidyDoc tidyDoc = tidyCreate();
|
TidyDoc tidyDoc = tidyCreate();
|
||||||
|
@ -61,8 +101,13 @@ namespace duck {
|
||||||
&& tidyOptSetBool(tidyDoc, TidyPPrintTabs, yes)
|
&& tidyOptSetBool(tidyDoc, TidyPPrintTabs, yes)
|
||||||
&& tidyOptSetBool(tidyDoc, TidyLiteralAttribs, no)
|
&& tidyOptSetBool(tidyDoc, TidyLiteralAttribs, no)
|
||||||
&& tidyOptSetBool(tidyDoc, TidyPunctWrap, no)
|
&& tidyOptSetBool(tidyDoc, TidyPunctWrap, no)
|
||||||
|
&& tidyOptSetInt(tidyDoc, TidyOutCharEncoding, TidyEncUtf8)
|
||||||
|
&& tidyOptSetBool(tidyDoc, TidyMetaCharset, yes)
|
||||||
&& tidyOptSetBool(tidyDoc, TidyDropPropAttrs, yes);
|
&& tidyOptSetBool(tidyDoc, TidyDropPropAttrs, yes);
|
||||||
|
|
||||||
|
if (src_charset)
|
||||||
|
tidyOptSetInt(tidyDoc, TidyInCharEncoding, charset_to_enum(*src_charset));
|
||||||
|
|
||||||
int tidyResponseCode = -1;
|
int tidyResponseCode = -1;
|
||||||
|
|
||||||
// Parse input
|
// Parse input
|
||||||
|
@ -121,6 +166,7 @@ namespace duck {
|
||||||
//return 1;
|
//return 1;
|
||||||
//}
|
//}
|
||||||
|
|
||||||
|
//return FetchedHtml(oss.str(), easy.get_info<CURLINFO_CONTENT_TYPE>().get());
|
||||||
return oss.str();
|
return oss.str();
|
||||||
}
|
}
|
||||||
} //namespace duck
|
} //namespace duck
|
||||||
|
|
|
@ -21,10 +21,13 @@
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <string_view>
|
#include <string_view>
|
||||||
|
#include <optional>
|
||||||
|
|
||||||
namespace duck {
|
namespace duck {
|
||||||
|
typedef std::optional<std::string_view> OptString;
|
||||||
|
|
||||||
std::string fetch_html ( const std::string_view& parSource, std::string parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost );
|
std::string fetch_html ( const std::string_view& parSource, std::string parUserAgent, bool parSslVerifyPeer, bool parSslVerifyHost );
|
||||||
std::string clean_html ( std::string&& html );
|
std::string clean_html ( std::string&& html, OptString src_charset );
|
||||||
} //namespace duck
|
} //namespace duck
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
11
src/main.cpp
11
src/main.cpp
|
@ -83,10 +83,12 @@ namespace {
|
||||||
const auto& vm = parVarMap;
|
const auto& vm = parVarMap;
|
||||||
const auto url = vm["input-url"].as<std::string>();
|
const auto url = vm["input-url"].as<std::string>();
|
||||||
|
|
||||||
duck::HtmlPool html_pool(std::string(parVarMap["agent"].as<std::string>()));
|
duck::HtmlPool html_pool(
|
||||||
|
std::string(parVarMap["agent"].as<std::string>()),
|
||||||
|
std::string(parVarMap["from-code"].as<std::string>())
|
||||||
|
);
|
||||||
const auto in_html_id = html_pool.GetOrAdd(url);
|
const auto in_html_id = html_pool.GetOrAdd(url);
|
||||||
std::string html = *html_pool.GetByID(in_html_id);
|
std::string html = *html_pool.GetByID(in_html_id);
|
||||||
|
|
||||||
if (vm.count("dump")) {
|
if (vm.count("dump")) {
|
||||||
dump_string(vm["dump"].as<std::string>(), html);
|
dump_string(vm["dump"].as<std::string>(), html);
|
||||||
}
|
}
|
||||||
|
@ -120,7 +122,10 @@ namespace {
|
||||||
const std::string script = duck::read_all(parVarMap["model"].as<std::string>());
|
const std::string script = duck::read_all(parVarMap["model"].as<std::string>());
|
||||||
auto ast = duck::sl::parse(script);
|
auto ast = duck::sl::parse(script);
|
||||||
|
|
||||||
duck::HtmlPool html_pool(std::string(parVarMap["agent"].as<std::string>()));
|
duck::HtmlPool html_pool(
|
||||||
|
std::string(parVarMap["agent"].as<std::string>()),
|
||||||
|
std::string(parVarMap["from-code"].as<std::string>())
|
||||||
|
);
|
||||||
duck::sl::apply(ast, duck::sl::HtmlPoolBaseSP(&html_pool), xpath);
|
duck::sl::apply(ast, duck::sl::HtmlPoolBaseSP(&html_pool), xpath);
|
||||||
//auto list = duck::get_xpath_definitions(*ast);
|
//auto list = duck::get_xpath_definitions(*ast);
|
||||||
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue