From 00150938dd9a22db637dd1c4e1742b1183a03e8b Mon Sep 17 00:00:00 2001 From: King_DuckZ Date: Mon, 28 Sep 2015 22:59:09 +0200 Subject: [PATCH] Fix the html cleaning code that was not really cleaning. --- src/htmlretrieve.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/htmlretrieve.cpp b/src/htmlretrieve.cpp index ac15768..b560d4f 100644 --- a/src/htmlretrieve.cpp +++ b/src/htmlretrieve.cpp @@ -29,13 +29,10 @@ namespace duck { std::string cleanHTML (std::string&& html) { dropScriptTags(html); - std::unique_ptr html_copy(new char[html.size()]); - std::copy(html.begin(), html.end(), html_copy.get()); // Initialize a Tidy document TidyDoc tidyDoc = tidyCreate(); - TidyBuffer tidyOutputBuffer; - tidyBufInit(&tidyOutputBuffer); + TidyBuffer tidyOutputBuffer = {nullptr, nullptr, 0, 0, 0}; // Configure Tidy // The flags tell Tidy to output XML and disable showing warnings @@ -48,14 +45,18 @@ namespace duck { // Parse input if (configSuccess) { - tidyBufAppend(&tidyOutputBuffer, html_copy.get(), html.size()); - tidyResponseCode = tidyParseBuffer(tidyDoc, &tidyOutputBuffer); + tidyResponseCode = tidyParseString(tidyDoc, html.c_str()); } // Process HTML if (tidyResponseCode >= 0) tidyResponseCode = tidyCleanAndRepair(tidyDoc); + if (tidyResponseCode >= 0) + tidyResponseCode = tidyRunDiagnostics(tidyDoc); + if (tidyResponseCode > 1) + tidyResponseCode = (tidyOptSetBool(tidyDoc, TidyForceOutput, yes) ? tidyResponseCode : -1); + // Output the HTML to our buffer if (tidyResponseCode >= 0) tidyResponseCode = tidySaveBuffer(tidyDoc, &tidyOutputBuffer);