Quick dirty fix to avoid invalid characters in scripts.
Note that with this change scripts are stripped away, so you won't find any <script></script> pairs in the HTML. Also print more detailed information about errors.
This commit is contained in:
parent
3bfea89568
commit
44992458ac
2 changed files with 58 additions and 5 deletions
|
@ -8,14 +8,34 @@
|
|||
#include <cstring>
|
||||
#include <stack>
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include <cassert>
|
||||
|
||||
namespace duck {
|
||||
namespace {
|
||||
std::string cleanHTML(const std::string &html) {
|
||||
// Removes every <script ...> ... </script> element from html, in place.
//
// Tag names are matched case-insensitively (ASCII only), because HTML tag
// names are not case-sensitive: <SCRIPT>, <Script> and </SCRIPT> are all
// treated like their lowercase forms.
//
// An opening tag with no matching closing tag causes everything from that
// opening tag up to the end of the string to be removed.
//
// After each removal the search resumes at the position where the text was
// erased, so an opening tag that only comes into existence once the
// surrounding text is joined together is removed as well.
void dropScriptTags (std::string& html) {
    const std::string open_tag("<script");
    const std::string close_tag("</script>");

    // ASCII-only lowercase; deliberately locale independent.
    const auto to_lower = [](char c) -> char {
        return (c >= 'A' && c <= 'Z') ? static_cast<char>(c - 'A' + 'a') : c;
    };

    // Case-insensitive equivalent of haystack.find(needle, from).
    // needle must already be lowercase. Returns npos when there is no match.
    const auto find_ci = [&to_lower](const std::string& haystack, const std::string& needle, size_t from) -> size_t {
        if (from >= haystack.size())
            return std::string::npos;

        const auto first = haystack.begin() + static_cast<std::string::difference_type>(from);
        const auto hit = std::search(
            first, haystack.end(),
            needle.begin(), needle.end(),
            [&to_lower](char in_text, char in_tag) { return to_lower(in_text) == in_tag; }
        );
        if (hit == haystack.end())
            return std::string::npos;
        return static_cast<size_t>(hit - haystack.begin());
    };

    size_t open_index = 0;
    while (std::string::npos != (open_index = find_ci(html, open_tag, open_index))) {
        assert(open_index < html.size());

        // Look for the closing tag only after the opening tag's name.
        size_t close_index = find_ci(html, close_tag, open_index + open_tag.size());
        if (close_index == std::string::npos)
            close_index = html.size();  // unterminated script: drop the rest

        // Clamp so that the unterminated case does not run past the end.
        const size_t erase_end = std::min(html.size(), close_index + close_tag.size());
        html.erase(open_index, erase_end - open_index);
    }
}
|
||||
|
||||
std::string cleanHTML (std::string&& html) {
|
||||
dropScriptTags(html);
|
||||
std::unique_ptr<char[]> html_copy(new char[html.size()]);
|
||||
std::copy(html.begin(), html.end(), html_copy.get());
|
||||
|
||||
// Initialize a Tidy document
|
||||
TidyDoc tidyDoc = tidyCreate();
|
||||
TidyBuffer tidyOutputBuffer;
|
||||
std::memset(&tidyOutputBuffer, 0, sizeof(TidyBuffer));
|
||||
tidyBufInit(&tidyOutputBuffer);
|
||||
|
||||
// Configure Tidy
|
||||
// The flags tell Tidy to output XML and disable showing warnings
|
||||
|
@ -27,8 +47,10 @@ namespace duck {
|
|||
int tidyResponseCode = -1;
|
||||
|
||||
// Parse input
|
||||
if (configSuccess)
|
||||
tidyResponseCode = tidyParseString(tidyDoc, html.c_str());
|
||||
if (configSuccess) {
|
||||
tidyBufAppend(&tidyOutputBuffer, html_copy.get(), html.size());
|
||||
tidyResponseCode = tidyParseBuffer(tidyDoc, &tidyOutputBuffer);
|
||||
}
|
||||
|
||||
// Process HTML
|
||||
if (tidyResponseCode >= 0)
|
||||
|
|
33
src/main.cpp
33
src/main.cpp
|
@ -4,6 +4,14 @@
|
|||
#include <string>
|
||||
#include <pugixml.hpp>
|
||||
#include <sstream>
|
||||
#include <utility>
|
||||
#include <ciso646>
|
||||
|
||||
namespace {
|
||||
typedef std::pair<int, int> LineColType;
|
||||
|
||||
LineColType line_col_from_offset ( ptrdiff_t parOffset, const std::string& parData );
|
||||
} //unnamed namespace
|
||||
|
||||
int main (int argc, char* argv[]) {
|
||||
if (argc != 3) {
|
||||
|
@ -30,7 +38,10 @@ int main (int argc, char* argv[]) {
|
|||
std::istringstream iss(tidyHtml);
|
||||
pugi::xml_parse_result result(doc.load(iss));
|
||||
if (not result) {
|
||||
std::cerr << "Error parsing the source XML";
|
||||
auto line_col = line_col_from_offset(result.offset, tidyHtml);
|
||||
std::cerr << "Error parsing the source XML at line " <<
|
||||
line_col.first << " col " << line_col.second << ":\n" <<
|
||||
result.description() << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
@ -47,3 +58,23 @@ int main (int argc, char* argv[]) {
|
|||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
namespace {
|
||||
// Translates a character offset into parData into a (line, column) pair.
// Both numbers start at 1. Each '\n' that is walked over moves to the next
// line and resets the column to 1; any other character advances the column.
// Walking stops when parOffset characters have been consumed or when the
// end of parData is reached, whichever happens first.
LineColType line_col_from_offset (ptrdiff_t parOffset, const std::string& parData) {
    int row = 1;
    int column = 1;

    for (size_t pos = 0; parOffset != 0 && pos < parData.size(); ++pos, --parOffset) {
        if ('\n' != parData[pos]) {
            ++column;
            continue;
        }

        // Line break: start counting columns again on the next line.
        ++row;
        column = 1;
    }

    return LineColType(row, column);
}
|
||||
} //unnamed namespace
|
||||
|
|
Loading…
Reference in a new issue