Refactor xpath query into a separate function.

This commit is contained in:
King_DuckZ 2015-10-01 14:18:02 +02:00
parent c9db1d8ba3
commit bdd50d2267
4 changed files with 112 additions and 43 deletions

View file

@ -36,6 +36,7 @@ add_executable(${PROJECT_NAME}
src/htmlretrieve.cpp src/htmlretrieve.cpp
src/commandline.cpp src/commandline.cpp
src/scraplang/scraplang.cpp src/scraplang/scraplang.cpp
src/xpath.cpp
) )
if (BUILD_SHARED_TIDY) if (BUILD_SHARED_TIDY)

View file

@ -1,9 +1,8 @@
#include "htmlretrieve.hpp" #include "htmlretrieve.hpp"
#include "commandline.hpp" #include "commandline.hpp"
#include "xpath.hpp"
#include <iostream> #include <iostream>
#include <string> #include <string>
#include <pugixml.hpp>
#include <sstream>
#include <fstream> #include <fstream>
#include <utility> #include <utility>
#include <ciso646> #include <ciso646>
@ -12,9 +11,6 @@
#include <stdexcept> #include <stdexcept>
namespace { namespace {
typedef std::pair<int, int> LineColType;
LineColType line_col_from_offset ( ptrdiff_t parOffset, const std::string& parData );
void dump_string ( const std::string& parPathDest, const std::string& parData ); void dump_string ( const std::string& parPathDest, const std::string& parData );
} //unnamed namespace } //unnamed namespace
@ -61,50 +57,24 @@ int main (int argc, char* argv[]) {
dump_string(vm["dump"].as<std::string>(), html); dump_string(vm["dump"].as<std::string>(), html);
} }
{ try {
pugi::xml_document doc; std::vector<std::string> queries;
std::istringstream iss(html); queries.reserve(1);
pugi::xml_parse_result result(doc.load(iss)); queries.push_back(std::move(xpath));
if (not result) { auto results = duck::xpath_query(html, queries);
auto line_col = line_col_from_offset(result.offset, html); for (const auto& lst : results[0]) {
std::cerr << "Error parsing the source XML at line " << std::cout << lst.first << ": " << lst.second << '\n';
line_col.first << " col " << line_col.second << ":\n" <<
result.description() << std::endl;
return 1;
}
pugi::xpath_node_set xpathRes = doc.select_nodes(xpath.c_str());
for (pugi::xpath_node_set::const_iterator itFind(xpathRes.begin()), itFindEND(xpathRes.end()); itFind != itFindEND; ++itFind) {
const pugi::xpath_node& node = *itFind;
if (node.node()) {
std::cout << node.node().name() << ": " << node.node().value() << "\n";
}
else if (node.attribute()) {
std::cout << node.attribute().name() << ": " << node.attribute().value() << "\n";
}
} }
} }
catch (const duck::ParseError& err) {
std::cerr << err.what() << std::endl;
return 1;
}
return 0; return 0;
} }
namespace { namespace {
// Converts a character offset into parData to a (line, column) pair.
// Both values start at 1; every '\n' that is walked over starts a new line
// and resets the column to 1. The walk stops at the end of parData, so an
// offset past the end yields the position right after the last character.
// An offset that is zero or negative yields (1, 1).
LineColType line_col_from_offset (ptrdiff_t parOffset, const std::string& parData) {
    int line = 1;
    int chara = 1;
    // "parOffset > 0" rather than a plain truthiness test: a negative offset
    // would otherwise never reach zero and the whole input would be scanned.
    for (size_t index = 0; parOffset > 0 and index < parData.size(); ++index, --parOffset) {
        if ('\n' == parData[index]) {
            ++line;
            chara = 1;
        }
        else {
            ++chara;
        }
    }
    return std::make_pair(line, chara);
}
void dump_string (const std::string& parPathDest, const std::string& parData) { void dump_string (const std::string& parPathDest, const std::string& parData) {
std::unique_ptr<std::ofstream> ofs; std::unique_ptr<std::ofstream> ofs;

75
src/xpath.cpp Normal file
View file

@ -0,0 +1,75 @@
#include "xpath.hpp"
#include <pugixml.hpp>
#include <sstream>
#include <stdexcept>
#include <algorithm>
namespace duck {
namespace {
// (line, column) position inside a text buffer; both values start at 1.
typedef std::pair<int, int> LineColType;

// Converts a character offset into parData to a (line, column) pair.
// Both values start at 1; every '\n' that is walked over starts a new line
// and resets the column to 1. The walk stops at the end of parData, so an
// offset past the end yields the position right after the last character.
// An offset that is zero or negative yields (1, 1).
LineColType line_col_from_offset (ptrdiff_t parOffset, const std::string& parData) {
    int line = 1;
    int chara = 1;
    // "parOffset > 0" rather than a plain truthiness test: a negative offset
    // would otherwise never reach zero and the whole input would be scanned.
    for (size_t index = 0; parOffset > 0 and index < parData.size(); ++index, --parOffset) {
        if ('\n' == parData[index]) {
            ++line;
            chara = 1;
        }
        else {
            ++chara;
        }
    }
    return std::make_pair(line, chara);
}
} //unnamed namespace
/// Parses parXML once and evaluates every XPath expression in parQueries
/// against the resulting document.
///
/// @param parXML     XML source text to parse.
/// @param parQueries XPath expressions to evaluate, in order.
/// @return One list per query, in the same order as parQueries. Each list
///         holds a (name, value) pair for every node or attribute the query
///         selected.
/// @throws ParseError if parXML cannot be parsed; the message carries the
///         line and column computed from the parser's error offset.
///
/// NOTE(review): pugixml reports a malformed XPath expression by throwing
/// pugi::xpath_exception when it is built with exceptions enabled; that
/// exception is not handled here - confirm callers expect it.
XPathBatchResults xpath_query (const std::string& parXML, const std::vector<std::string>& parQueries) {
    pugi::xml_document doc;
    std::istringstream iss(parXML);
    const pugi::xml_parse_result result(doc.load(iss));
    if (not result) {
        const auto line_col = line_col_from_offset(result.offset, parXML);
        throw ParseError(line_col.first, line_col.second, result.description());
    }

    XPathBatchResults retval;
    retval.reserve(parQueries.size());
    for (const auto& xpath : parQueries) {
        const pugi::xpath_node_set xpathRes = doc.select_nodes(xpath.c_str());

        std::vector<std::pair<std::string, std::string>> new_lst;
        new_lst.reserve(xpathRes.size());
        for (const pugi::xpath_node& node : xpathRes) {
            if (node.node()) {
                new_lst.emplace_back(node.node().name(), node.node().value());
            }
            else if (node.attribute()) {
                new_lst.emplace_back(node.attribute().name(), node.attribute().value());
            }
            // A null xpath_node has neither a name nor a value: skip it
            // instead of reporting an empty ("", "") entry.
        }
        retval.push_back(std::move(new_lst));
    }
    // Returned by name so the compiler can elide the copy; wrapping the
    // local in std::move() would prevent that.
    return retval;
}
/// Formats the text later returned by what().
///
/// @param parLine    line number written into the message.
/// @param parColumn  column number written into the message.
/// @param parMessage description of the problem, appended on its own line.
ParseError::ParseError (int parLine, int parColumn, std::string parMessage) {
    std::ostringstream text;
    text << "Error parsing the source XML at line " << parLine;
    text << " col " << parColumn << ":\n";
    text << parMessage << '\n';

    const std::string formatted = text.str();
    // what() hands m_msg.data() out as a C string, so the terminating
    // '\0' has to be stored explicitly after the characters.
    m_msg.assign(formatted.begin(), formatted.end());
    m_msg.push_back('\0');
}
/// Returns the message text assembled by the constructor.
/// The constructor stores a trailing '\0' in m_msg, so the buffer can be
/// read as a C string. The pointer refers to storage held in m_msg.
const char* ParseError::what() const noexcept {
return m_msg.data();
}
} //namespace duck

23
src/xpath.hpp Normal file
View file

@ -0,0 +1,23 @@
#ifndef id21E0A6F345D24C5D83D3B1F74EC810F7
#define id21E0A6F345D24C5D83D3B1F74EC810F7
#include <string>
#include <vector>
#include <exception>
#include <utility>
namespace duck {
/// Result of a batch of XPath queries as produced by xpath_query(): the outer
/// vector holds one list per query, and each list holds (name, value) string
/// pairs.
typedef std::vector<std::vector<std::pair<std::string, std::string>>> XPathBatchResults;
/// Exception describing a failure to parse an XML source.
/// The message returned by what() contains the line, the column and the
/// description given to the constructor.
class ParseError : public std::exception {
public:
    /// @param parLine    line number to report in the message.
    /// @param parColumn  column number to report in the message.
    /// @param parMessage human readable description of the problem.
    ParseError ( int parLine, int parColumn, std::string parMessage );

    /// Returns the formatted error message as a C string.
    /// Marked override so a signature mismatch with std::exception::what()
    /// is caught at compile time.
    const char* what ( void ) const noexcept override;

private:
    // Formatted message, stored together with its terminating '\0'.
    std::vector<char> m_msg;
};
/// Parses parXML and evaluates each XPath expression in parQueries against it.
/// @param parXML     XML source text.
/// @param parQueries XPath expressions to evaluate.
/// @return One list of (name, value) pairs per query, in the order of parQueries.
/// @throws ParseError if parXML cannot be parsed.
XPathBatchResults xpath_query ( const std::string& parXML, const std::vector<std::string>& parQueries );
} //namespace duck
#endif