Quick and dirty fix to avoid invalid characters in scripts.
Note that with this change scripts are stripped away, so you won't find any <script></script> pair in the HTML. It also prints more detailed information about errors.
This commit is contained in:
parent
3bfea89568
commit
44992458ac
2 changed files with 58 additions and 5 deletions
|
@ -8,14 +8,34 @@
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <stack>
|
#include <stack>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include <memory>
|
||||||
|
#include <cassert>
|
||||||
|
|
||||||
namespace duck {
|
namespace duck {
|
||||||
namespace {
|
namespace {
|
||||||
std::string cleanHTML(const std::string &html) {
|
// Removes every script element from html, in place.
//
// A script element starts at the text "<script" and ends right after the
// next "</script>". Both markers are matched without regard to ASCII case,
// because HTML tag names are case-insensitive ("<SCRIPT>" and "</Script>"
// must be stripped too). When an opening marker has no closing marker,
// everything from the opening marker to the end of html is removed.
//
// html: document text to filter; replaced by the filtered text on return.
void dropScriptTags (std::string& html) {
	const std::string open_tag("<script");
	const std::string close_tag("</script>");

	// ASCII-only folding: the markers are plain ASCII, so no locale is needed.
	const auto equal_ignoring_ascii_case = [](char lhs, char rhs) {
		const auto lower = [](char c) {
			return (c >= 'A' && c <= 'Z') ? static_cast<char>(c - 'A' + 'a') : c;
		};
		return lower(lhs) == lower(rhs);
	};

	// Index of the first occurrence of tag at or after from, or html.size()
	// when there is none. from must not exceed html.size().
	const auto find_tag = [&html, &equal_ignoring_ascii_case](const std::string& tag, size_t from) -> size_t {
		const auto hit = std::search(html.begin() + from, html.end(),
			tag.begin(), tag.end(), equal_ignoring_ascii_case);
		return static_cast<size_t>(hit - html.begin());
	};

	// Copy the text between script elements into a fresh buffer in one pass
	// instead of erasing from the middle of html once per element.
	std::string kept;
	kept.reserve(html.size());

	size_t cursor = 0;
	while (cursor < html.size()) {
		const size_t open_index = find_tag(open_tag, cursor);
		kept.append(html, cursor, open_index - cursor);
		if (open_index == html.size())
			break;

		// A match lies fully inside html, so this start index is in range.
		const size_t close_index = find_tag(close_tag, open_index + open_tag.size());
		// close_index is html.size() when the element is never closed;
		// the clamp then drops the rest of the document.
		cursor = std::min(html.size(), close_index + close_tag.size());
	}

	html.swap(kept);
}
|
||||||
|
|
||||||
|
std::string cleanHTML (std::string&& html) {
|
||||||
|
dropScriptTags(html);
|
||||||
|
std::unique_ptr<char[]> html_copy(new char[html.size()]);
|
||||||
|
std::copy(html.begin(), html.end(), html_copy.get());
|
||||||
|
|
||||||
// Initialize a Tidy document
|
// Initialize a Tidy document
|
||||||
TidyDoc tidyDoc = tidyCreate();
|
TidyDoc tidyDoc = tidyCreate();
|
||||||
TidyBuffer tidyOutputBuffer;
|
TidyBuffer tidyOutputBuffer;
|
||||||
std::memset(&tidyOutputBuffer, 0, sizeof(TidyBuffer));
|
tidyBufInit(&tidyOutputBuffer);
|
||||||
|
|
||||||
// Configure Tidy
|
// Configure Tidy
|
||||||
// The flags tell Tidy to output XML and disable showing warnings
|
// The flags tell Tidy to output XML and disable showing warnings
|
||||||
|
@ -27,8 +47,10 @@ namespace duck {
|
||||||
int tidyResponseCode = -1;
|
int tidyResponseCode = -1;
|
||||||
|
|
||||||
// Parse input
|
// Parse input
|
||||||
if (configSuccess)
|
if (configSuccess) {
|
||||||
tidyResponseCode = tidyParseString(tidyDoc, html.c_str());
|
tidyBufAppend(&tidyOutputBuffer, html_copy.get(), html.size());
|
||||||
|
tidyResponseCode = tidyParseBuffer(tidyDoc, &tidyOutputBuffer);
|
||||||
|
}
|
||||||
|
|
||||||
// Process HTML
|
// Process HTML
|
||||||
if (tidyResponseCode >= 0)
|
if (tidyResponseCode >= 0)
|
||||||
|
|
33
src/main.cpp
33
src/main.cpp
|
@ -4,6 +4,14 @@
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <pugixml.hpp>
|
#include <pugixml.hpp>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
|
#include <utility>
|
||||||
|
#include <ciso646>
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
typedef std::pair<int, int> LineColType;
|
||||||
|
|
||||||
|
LineColType line_col_from_offset ( ptrdiff_t parOffset, const std::string& parData );
|
||||||
|
} //unnamed namespace
|
||||||
|
|
||||||
int main (int argc, char* argv[]) {
|
int main (int argc, char* argv[]) {
|
||||||
if (argc != 3) {
|
if (argc != 3) {
|
||||||
|
@ -30,7 +38,10 @@ int main (int argc, char* argv[]) {
|
||||||
std::istringstream iss(tidyHtml);
|
std::istringstream iss(tidyHtml);
|
||||||
pugi::xml_parse_result result(doc.load(iss));
|
pugi::xml_parse_result result(doc.load(iss));
|
||||||
if (not result) {
|
if (not result) {
|
||||||
std::cerr << "Error parsing the source XML";
|
auto line_col = line_col_from_offset(result.offset, tidyHtml);
|
||||||
|
std::cerr << "Error parsing the source XML at line " <<
|
||||||
|
line_col.first << " col " << line_col.second << ":\n" <<
|
||||||
|
result.description() << std::endl;
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -47,3 +58,23 @@ int main (int argc, char* argv[]) {
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
// Converts a character offset into parData to a 1-based (line, column) pair.
//
// parOffset: number of characters of parData that precede the position of
//            interest. Values below zero are treated as zero and values past
//            the end of parData are treated as parData.size(), so the result
//            always refers to a position inside (or just past) the data.
// parData:   the text the offset refers to; lines are separated by '\n'.
//
// Returns the pair (line, column). The return type is the one the file
// aliases as LineColType.
std::pair<int, int> line_col_from_offset (ptrdiff_t parOffset, const std::string& parData) {
	// Clamp first: a negative offset would otherwise never count down to
	// zero and the scan would report the end of the data instead.
	size_t limit = 0;
	if (parOffset > 0) {
		limit = static_cast<size_t>(parOffset);
		if (limit > parData.size())
			limit = parData.size();
	}

	int line = 1;
	int chara = 1;
	for (size_t index = 0; index < limit; ++index) {
		if (parData[index] == '\n') {
			// The character after a newline is column 1 of the next line.
			chara = 1;
			++line;
		}
		else {
			++chara;
		}
	}
	return std::make_pair(line, chara);
}
|
||||||
|
} //unnamed namespace
|
||||||
|
|
Loading…
Reference in a new issue