/// Removes every script element from `html`, in place.
///
/// Everything from an opening "<script" marker up to and including the next
/// "</script>" is erased. If an opening marker has no matching closing tag,
/// the rest of the string (from the marker onwards) is erased.
///
/// Matching is a plain, case-sensitive substring search: "<SCRIPT>" is left
/// untouched, and the text is not parsed as HTML in any way.
///
/// @param html  Document text to be modified.
void dropScriptTags (std::string& html) {
	// The opening marker deliberately has no '>' so that tags carrying
	// attributes (e.g. <script type="...">) are matched as well.
	const std::string open_tag("<script");
	const std::string close_tag("</script>");

	size_t open_index = 0;
	// After an erase the following text slides down to open_index, so the
	// search simply resumes from the same position.
	while (html.npos != (open_index = html.find(open_tag, open_index))) {
		assert(open_index < html.size());
		const size_t close_index = html.find(close_tag, open_index + open_tag.size());
		if (close_index == html.npos) {
			// Unterminated script: drop everything up to the end of the text.
			html.erase(open_index);
			break;
		}
		html.erase(open_index, close_index + close_tag.size() - open_index);
	}
}
namespace {
	// (line, column) pair as returned by line_col_from_offset.
	// NOTE(review): the template arguments are missing from the visible text;
	// <int, int> is assumed because the function returns make_pair of two ints.
	typedef std::pair<int, int> LineColType;

	/// Translates a character offset into a 1-based (line, column) position.
	///
	/// Characters of parData before the offset are scanned: every '\n'
	/// starts a new line and resets the column to 1, any other character
	/// advances the column by one.
	///
	/// @param parOffset  Offset into parData. Values <= 0 yield (1, 1);
	///                   values past the end are clamped to parData.size().
	/// @param parData    The text the offset refers to.
	/// @return           Pair of (line, column), both starting at 1.
	LineColType line_col_from_offset (ptrdiff_t parOffset, const std::string& parData) {
		int line = 1;
		int chara = 1;

		// A negative offset must not be treated as "keep scanning": without
		// this guard it would walk the whole buffer and report its end.
		if (parOffset <= 0)
			return std::make_pair(line, chara);

		size_t limit = static_cast<size_t>(parOffset);
		if (limit > parData.size())
			limit = parData.size();

		for (size_t index = 0; index < limit; ++index) {
			if (parData[index] == '\n') {
				chara = 1;
				++line;
			}
			else {
				++chara;
			}
		}
		return std::make_pair(line, chara);
	}
} //unnamed namespace