Quick dirty fix to avoid invalid characters in scripts.

Note that with this change scripts are stripped away, so
you won't find any <script></script> pair in the html.

Also print some more detailed info about errors.
This commit is contained in:
King_DuckZ 2015-03-01 05:03:12 +01:00
parent 3bfea89568
commit 44992458ac
2 changed files with 58 additions and 5 deletions

View file

@ -8,14 +8,34 @@
#include <cstring> #include <cstring>
#include <stack> #include <stack>
#include <algorithm> #include <algorithm>
#include <memory>
#include <cassert>
namespace duck { namespace duck {
namespace { namespace {
// Removes every script element from html, in place.
//
// A script element starts at any occurrence of "<script" and ends right after
// the next "</script>". If an opening tag has no matching closing tag,
// everything from the opening tag to the end of the string is removed.
// Tag names are matched without regard to ASCII case, so "<SCRIPT>" and
// "</Script>" are recognised as well.
//
// html: the markup to clean up; on return it holds the text with all script
//       elements dropped.
void dropScriptTags (std::string& html) {
	typedef std::string::difference_type diff_type;

	const std::string open_tag("<script");
	const std::string close_tag("</script>");
	const diff_type open_len = static_cast<diff_type>(open_tag.size());
	const diff_type close_len = static_cast<diff_type>(close_tag.size());

	// The tags above are lowercase, so only the html side needs folding.
	// Plain ASCII folding is enough here and avoids locale-dependent tolower().
	const auto same_letter = [](char in_html, char in_tag) {
		const char folded = (in_html >= 'A' && in_html <= 'Z' ?
			static_cast<char>(in_html - 'A' + 'a') : in_html);
		return folded == in_tag;
	};

	// Copy the pieces worth keeping into a fresh buffer in a single pass,
	// instead of erasing from the middle of html once per script element.
	std::string kept;
	kept.reserve(html.size());

	const auto html_end = html.cend();
	auto cursor = html.cbegin();
	while (cursor != html_end) {
		const auto open_it = std::search(cursor, html_end, open_tag.begin(), open_tag.end(), same_letter);
		kept.append(cursor, open_it);
		if (open_it == html_end)
			break;

		const auto close_it = std::search(open_it + open_len, html_end, close_tag.begin(), close_tag.end(), same_letter);
		// An unterminated script swallows the rest of the document
		cursor = (close_it == html_end ? html_end : close_it + close_len);
	}

	html.swap(kept);
}
std::string cleanHTML (std::string&& html) {
dropScriptTags(html);
std::unique_ptr<char[]> html_copy(new char[html.size()]);
std::copy(html.begin(), html.end(), html_copy.get());
// Initialize a Tidy document // Initialize a Tidy document
TidyDoc tidyDoc = tidyCreate(); TidyDoc tidyDoc = tidyCreate();
TidyBuffer tidyOutputBuffer; TidyBuffer tidyOutputBuffer;
std::memset(&tidyOutputBuffer, 0, sizeof(TidyBuffer)); tidyBufInit(&tidyOutputBuffer);
// Configure Tidy // Configure Tidy
// The flags tell Tidy to output XML and disable showing warnings // The flags tell Tidy to output XML and disable showing warnings
@ -27,8 +47,10 @@ namespace duck {
int tidyResponseCode = -1; int tidyResponseCode = -1;
// Parse input // Parse input
if (configSuccess) if (configSuccess) {
tidyResponseCode = tidyParseString(tidyDoc, html.c_str()); tidyBufAppend(&tidyOutputBuffer, html_copy.get(), html.size());
tidyResponseCode = tidyParseBuffer(tidyDoc, &tidyOutputBuffer);
}
// Process HTML // Process HTML
if (tidyResponseCode >= 0) if (tidyResponseCode >= 0)

View file

@ -4,6 +4,14 @@
#include <string> #include <string>
#include <pugixml.hpp> #include <pugixml.hpp>
#include <sstream> #include <sstream>
#include <utility>
#include <ciso646>
namespace {
	// Position inside a text buffer: first is the line, second is the column.
	using LineColType = std::pair<int, int>;

	// Translates an offset into parData into a line/column position.
	// Only declared here; the definition comes after main().
	LineColType line_col_from_offset (
		ptrdiff_t parOffset,
		const std::string& parData
	);
} //unnamed namespace
int main (int argc, char* argv[]) { int main (int argc, char* argv[]) {
if (argc != 3) { if (argc != 3) {
@ -30,7 +38,10 @@ int main (int argc, char* argv[]) {
std::istringstream iss(tidyHtml); std::istringstream iss(tidyHtml);
pugi::xml_parse_result result(doc.load(iss)); pugi::xml_parse_result result(doc.load(iss));
if (not result) { if (not result) {
std::cerr << "Error parsing the source XML"; auto line_col = line_col_from_offset(result.offset, tidyHtml);
std::cerr << "Error parsing the source XML at line " <<
line_col.first << " col " << line_col.second << ":\n" <<
result.description() << std::endl;
return 1; return 1;
} }
@ -47,3 +58,23 @@ int main (int argc, char* argv[]) {
} }
return 0; return 0;
} }
namespace {
	// Translates an offset into parData into a (line, column) position.
	//
	// Both numbers start at 1. Every '\n' met before the offset moves to the
	// next line and resets the column to 1; any other character advances the
	// column by one.
	//
	// parOffset: number of characters of parData that precede the position.
	//            Values past the end of parData yield the position right
	//            after the last character, and so do negative values.
	// parData:   the text the offset refers to.
	//
	// Returns a pair with the line in first and the column in second.
	LineColType line_col_from_offset (ptrdiff_t parOffset, const std::string& parData) {
		// Work out up front how many characters have to be walked: never
		// more than the text holds, and the whole text for a negative offset.
		const size_t available = parData.size();
		size_t to_scan = available;
		if (parOffset >= 0 && static_cast<size_t>(parOffset) < available)
			to_scan = static_cast<size_t>(parOffset);

		int curr_line = 1;
		int curr_col = 1;
		for (size_t z = 0; z < to_scan; ++z) {
			if ('\n' != parData[z]) {
				++curr_col;
			}
			else {
				++curr_line;
				curr_col = 1;
			}
		}
		return LineColType(curr_line, curr_col);
	}
} //unnamed namespace