From bdd50d2267cb77690d35931e3ab1f3f567b6c614 Mon Sep 17 00:00:00 2001
From: King_DuckZ
Date: Thu, 1 Oct 2015 14:18:02 +0200
Subject: [PATCH] Refactor xpath query into a separate function.

---
Review notes (text between the "---" line and the diff is not part of the
commit message):
 * xpath_query() returns its local result by name; the previous
   "return std::move(retval)" prevented copy elision.
 * The ParseError message no longer ends in a newline. main() appends
   std::endl when it prints what(), so the error output is again the same
   as the one produced by the code removed from main.cpp below.
 * ParseError::what() is marked override.
 * One-line comments were placed on added lines that used to be blank, so
   the hunk sizes and the diffstat are unchanged.
 * NOTE(review): in this copy of the patch everything that was written in
   angle brackets (header names, template arguments, the author e-mail) is
   missing and has been left as found. From the usage visible in the hunks,
   LineColType is presumably std::pair<int, int>, the result pairs
   presumably hold two std::string and m_msg is presumably
   std::vector<char> -- confirm against the repository.
 * NOTE(review): indentation and blank context lines were lost as well and
   are reconstructed here; the blob ids on the "index" lines still describe
   the original post-image.

 CMakeLists.txt |  1 +
 src/main.cpp   | 56 +++++++++----------------------------
 src/xpath.cpp  | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++
 src/xpath.hpp  | 23 ++++++++++++++++
 4 files changed, 112 insertions(+), 43 deletions(-)
 create mode 100644 src/xpath.cpp
 create mode 100644 src/xpath.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f60afee..a0f9781 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,6 +36,7 @@ add_executable(${PROJECT_NAME}
 	src/htmlretrieve.cpp
 	src/commandline.cpp
 	src/scraplang/scraplang.cpp
+	src/xpath.cpp
 )
 
 if (BUILD_SHARED_TIDY)
diff --git a/src/main.cpp b/src/main.cpp
index f324c6c..2a24def 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1,9 +1,8 @@
 #include "htmlretrieve.hpp"
 #include "commandline.hpp"
+#include "xpath.hpp"
 #include
 #include
-#include
-#include
 #include
 #include
 #include
@@ -12,9 +11,6 @@
 #include
 
 namespace {
-	typedef std::pair LineColType;
-
-	LineColType line_col_from_offset ( ptrdiff_t parOffset, const std::string& parData );
 	void dump_string ( const std::string& parPathDest, const std::string& parData );
 } //unnamed namespace
 
@@ -61,50 +57,24 @@ int main (int argc, char* argv[]) {
 		dump_string(vm["dump"].as(), html);
 	}
 
-	{
-		pugi::xml_document doc;
-		std::istringstream iss(html);
-		pugi::xml_parse_result result(doc.load(iss));
-		if (not result) {
-			auto line_col = line_col_from_offset(result.offset, html);
-			std::cerr << "Error parsing the source XML at line " <<
-				line_col.first << " col " << line_col.second << ":\n" <<
-				result.description() << std::endl;
-			return 1;
-		}
-
-		pugi::xpath_node_set xpathRes = doc.select_nodes(xpath.c_str());
-		for (pugi::xpath_node_set::const_iterator itFind(xpathRes.begin()), itFindEND(xpathRes.end()); itFind != itFindEND; ++itFind) {
-			const pugi::xpath_node& node = *itFind;
-			if (node.node()) {
-				std::cout << node.node().name() << ": " << node.node().value() << "\n";
-			}
-			else if (node.attribute()) {
-				std::cout << node.attribute().name() << ": " << node.attribute().value() << "\n";
-			}
+	try {
+		std::vector queries;
+		queries.reserve(1);
+		queries.push_back(std::move(xpath));
+		auto results = duck::xpath_query(html, queries);
+		for (const auto& lst : results[0]) { //one query was submitted, so there is exactly one result list
+			std::cout << lst.first << ": " << lst.second << '\n';
 		}
 	}
+	catch (const duck::ParseError& err) {
+		std::cerr << err.what() << std::endl;
+		return 1;
+	}
+
 	return 0;
 }
 
 namespace {
-	LineColType line_col_from_offset (ptrdiff_t parOffset, const std::string& parData) {
-		size_t index = 0;
-		int line = 1;
-		int chara = 1;
-		while (parOffset and index < parData.size()) {
-			if (parData[index] == '\n') {
-				chara = 1;
-				++line;
-			}
-			else {
-				++chara;
-			}
-			++index;
-			--parOffset;
-		}
-		return std::make_pair(line, chara);
-	}
 
 	void dump_string (const std::string& parPathDest, const std::string& parData) {
 		std::unique_ptr ofs;
diff --git a/src/xpath.cpp b/src/xpath.cpp
new file mode 100644
index 0000000..14e8890
--- /dev/null
+++ b/src/xpath.cpp
@@ -0,0 +1,75 @@
+#include "xpath.hpp"
+#include
+#include
+#include
+#include
+
+namespace duck {
+	namespace {
+		typedef std::pair LineColType;
+		//Walks parOffset characters into parData and returns the 1-based (line, column) reached.
+		LineColType line_col_from_offset (ptrdiff_t parOffset, const std::string& parData) {
+			size_t index = 0;
+			int line = 1;
+			int chara = 1;
+			while (parOffset and index < parData.size()) {
+				if (parData[index] == '\n') {
+					chara = 1;
+					++line;
+				}
+				else {
+					++chara;
+				}
+				++index;
+				--parOffset;
+			}
+			return std::make_pair(line, chara);
+		}
+	} //unnamed namespace
+	//Parses parXML once, then evaluates every query against it; throws ParseError if parsing fails.
+	XPathBatchResults xpath_query (const std::string& parXML, const std::vector& parQueries) {
+		pugi::xml_document doc;
+		std::istringstream iss(parXML);
+		pugi::xml_parse_result result(doc.load(iss));
+		if (not result) {
+			auto line_col = line_col_from_offset(result.offset, parXML);
+			throw ParseError(line_col.first, line_col.second, result.description());
+		}
+		//One list of (name, value) pairs is appended per query, in query order.
+		XPathBatchResults retval;
+		for (const auto& xpath : parQueries) {
+			pugi::xpath_node_set xpathRes = doc.select_nodes(xpath.c_str());
+			std::vector> new_lst;
+			for (pugi::xpath_node_set::const_iterator itFind(xpathRes.begin()), itFindEND(xpathRes.end()); itFind != itFindEND; ++itFind) {
+				const pugi::xpath_node& node = *itFind;
+				std::pair new_itm;
+				if (node.node()) {
+					new_itm.first = std::string(node.node().name());
+					new_itm.second = std::string(node.node().value());
+				}
+				else if (node.attribute()) {
+					new_itm.first = std::string(node.attribute().name());
+					new_itm.second = std::string(node.attribute().value());
+				}
+				new_lst.push_back(std::move(new_itm));
+			}
+			retval.push_back(std::move(new_lst));
+		}
+		return retval;
+	}
+	//Builds the what() text; it is stored NUL-terminated because what() hands out m_msg.data().
+	ParseError::ParseError (int parLine, int parColumn, std::string parMessage) {
+		std::ostringstream oss;
+		oss << "Error parsing the source XML at line " <<
+			parLine << " col " << parColumn << ":\n" <<
+			parMessage;
+		auto msg = oss.str();
+		m_msg.resize(msg.size() + 1);
+		std::copy(msg.begin(), msg.end(), m_msg.begin());
+		m_msg[m_msg.size() - 1] = '\0';
+	}
+	//Returns the message assembled by the constructor.
+	const char* ParseError::what() const noexcept {
+		return m_msg.data();
+	}
+} //namespace duck
diff --git a/src/xpath.hpp b/src/xpath.hpp
new file mode 100644
index 0000000..45c0bb9
--- /dev/null
+++ b/src/xpath.hpp
@@ -0,0 +1,23 @@
+#ifndef id21E0A6F345D24C5D83D3B1F74EC810F7
+#define id21E0A6F345D24C5D83D3B1F74EC810F7
+
+#include
+#include
+#include
+#include
+
+namespace duck {
+	typedef std::vector>> XPathBatchResults;
+	//Thrown by xpath_query() when the XML source cannot be parsed; what() names the line and column.
+	class ParseError : public std::exception {
+	public:
+		ParseError ( int parLine, int parColumn, std::string parMessage );
+		virtual const char* what ( void ) const noexcept override;
+	private:
+		std::vector m_msg;
+	};
+	//Runs each query in parQueries on parXML; entry i of the result holds the (name, value) pairs matched by query i.
+	XPathBatchResults xpath_query ( const std::string& parXML, const std::vector& parQueries );
+} //namespace duck
+
+#endif