Refactor xpath query into a separate function.
This commit is contained in:
parent
c9db1d8ba3
commit
bdd50d2267
4 changed files with 112 additions and 43 deletions
|
@ -36,6 +36,7 @@ add_executable(${PROJECT_NAME}
|
|||
src/htmlretrieve.cpp
|
||||
src/commandline.cpp
|
||||
src/scraplang/scraplang.cpp
|
||||
src/xpath.cpp
|
||||
)
|
||||
|
||||
if (BUILD_SHARED_TIDY)
|
||||
|
|
56
src/main.cpp
56
src/main.cpp
|
@ -1,9 +1,8 @@
|
|||
#include "htmlretrieve.hpp"
|
||||
#include "commandline.hpp"
|
||||
#include "xpath.hpp"
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <pugixml.hpp>
|
||||
#include <sstream>
|
||||
#include <fstream>
|
||||
#include <utility>
|
||||
#include <ciso646>
|
||||
|
@ -12,9 +11,6 @@
|
|||
#include <stdexcept>
|
||||
|
||||
namespace {
|
||||
typedef std::pair<int, int> LineColType;
|
||||
|
||||
LineColType line_col_from_offset ( ptrdiff_t parOffset, const std::string& parData );
|
||||
void dump_string ( const std::string& parPathDest, const std::string& parData );
|
||||
} //unnamed namespace
|
||||
|
||||
|
@ -61,50 +57,24 @@ int main (int argc, char* argv[]) {
|
|||
dump_string(vm["dump"].as<std::string>(), html);
|
||||
}
|
||||
|
||||
{
|
||||
pugi::xml_document doc;
|
||||
std::istringstream iss(html);
|
||||
pugi::xml_parse_result result(doc.load(iss));
|
||||
if (not result) {
|
||||
auto line_col = line_col_from_offset(result.offset, html);
|
||||
std::cerr << "Error parsing the source XML at line " <<
|
||||
line_col.first << " col " << line_col.second << ":\n" <<
|
||||
result.description() << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
pugi::xpath_node_set xpathRes = doc.select_nodes(xpath.c_str());
|
||||
for (pugi::xpath_node_set::const_iterator itFind(xpathRes.begin()), itFindEND(xpathRes.end()); itFind != itFindEND; ++itFind) {
|
||||
const pugi::xpath_node& node = *itFind;
|
||||
if (node.node()) {
|
||||
std::cout << node.node().name() << ": " << node.node().value() << "\n";
|
||||
}
|
||||
else if (node.attribute()) {
|
||||
std::cout << node.attribute().name() << ": " << node.attribute().value() << "\n";
|
||||
}
|
||||
try {
|
||||
std::vector<std::string> queries;
|
||||
queries.reserve(1);
|
||||
queries.push_back(std::move(xpath));
|
||||
auto results = duck::xpath_query(html, queries);
|
||||
for (const auto& lst : results[0]) {
|
||||
std::cout << lst.first << ": " << lst.second << '\n';
|
||||
}
|
||||
}
|
||||
catch (const duck::ParseError& err) {
|
||||
std::cerr << err.what() << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
namespace {
|
||||
// Converts a character offset into parData to a (line, column) pair.
//
// Both line and column start at 1. Every '\n' met before the offset bumps the
// line and resets the column to 1; any other character advances the column.
// An offset past the end of parData is clamped to the end of the data.
// A zero or negative offset yields (1, 1): a negative value carries no
// position, so it must not be allowed to walk the whole buffer.
LineColType line_col_from_offset (ptrdiff_t parOffset, const std::string& parData) {
	int line = 1;
	int chara = 1;
	if (parOffset > 0) {
		const size_t wanted = static_cast<size_t>(parOffset);
		// Never read past the end of the buffer, whatever the offset claims.
		const size_t limit = (wanted < parData.size() ? wanted : parData.size());
		for (size_t index = 0; index < limit; ++index) {
			if (parData[index] == '\n') {
				chara = 1;
				++line;
			}
			else {
				++chara;
			}
		}
	}
	return LineColType(line, chara);
}
|
||||
|
||||
void dump_string (const std::string& parPathDest, const std::string& parData) {
|
||||
std::unique_ptr<std::ofstream> ofs;
|
||||
|
|
75
src/xpath.cpp
Normal file
75
src/xpath.cpp
Normal file
|
@ -0,0 +1,75 @@
|
|||
#include "xpath.hpp"
|
||||
#include <pugixml.hpp>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <algorithm>
|
||||
|
||||
namespace duck {
|
||||
namespace {
	// (line, column) position inside a text buffer; both values are 1-based.
	using LineColType = std::pair<int, int>;

	// Converts a character offset into parData to a (line, column) pair.
	//
	// Both line and column start at 1. Every '\n' met before the offset bumps
	// the line and resets the column to 1; any other character advances the
	// column. An offset past the end of parData is clamped to the end of the
	// data. A zero or negative offset yields (1, 1): a negative value carries
	// no position, so it must not be allowed to walk the whole buffer.
	LineColType line_col_from_offset (ptrdiff_t parOffset, const std::string& parData) {
		int line = 1;
		int chara = 1;
		if (parOffset > 0) {
			const size_t wanted = static_cast<size_t>(parOffset);
			// Never read past the end of the buffer, whatever the offset claims.
			const size_t limit = (wanted < parData.size() ? wanted : parData.size());
			for (size_t index = 0; index < limit; ++index) {
				if (parData[index] == '\n') {
					chara = 1;
					++line;
				}
				else {
					++chara;
				}
			}
		}
		return LineColType(line, chara);
	}
} //unnamed namespace
|
||||
|
||||
// Parses parXML once and runs every XPath expression in parQueries against it.
//
// Returns one list per query, in the same order as parQueries. Each list holds
// one (name, value) pair per match: the node's name/value when the match is a
// node, the attribute's name/value when it is an attribute, and an empty pair
// otherwise (so the list length always equals the number of matches).
//
// Throws ParseError, carrying the line/column of the failure, when parXML
// cannot be parsed.
// NOTE(review): an invalid XPath expression presumably makes select_nodes throw
// pugi::xpath_exception in a default pugixml build; it is not caught here —
// confirm callers are prepared for it.
XPathBatchResults xpath_query (const std::string& parXML, const std::vector<std::string>& parQueries) {
	pugi::xml_document doc;
	std::istringstream iss(parXML);
	const pugi::xml_parse_result result(doc.load(iss));
	if (!result) {
		const auto line_col = line_col_from_offset(result.offset, parXML);
		throw ParseError(line_col.first, line_col.second, result.description());
	}

	XPathBatchResults retval;
	retval.reserve(parQueries.size());
	for (const auto& xpath : parQueries) {
		const pugi::xpath_node_set xpathRes = doc.select_nodes(xpath.c_str());
		std::vector<std::pair<std::string, std::string>> new_lst;
		new_lst.reserve(xpathRes.size());
		for (const pugi::xpath_node& node : xpathRes) {
			if (node.node()) {
				new_lst.emplace_back(node.node().name(), node.node().value());
			}
			else if (node.attribute()) {
				new_lst.emplace_back(node.attribute().name(), node.attribute().value());
			}
			else {
				// Keep a placeholder so results stay aligned with the matches.
				new_lst.emplace_back();
			}
		}
		retval.push_back(std::move(new_lst));
	}
	// Plain return: wrapping a local in std::move would inhibit copy elision.
	return retval;
}
|
||||
|
||||
// Builds the text later returned by what() from the failure position and the
// parser's own description.
//
// parLine / parColumn locate the error in the source XML; parMessage is the
// human-readable reason. The text is stored NUL-terminated in m_msg so what()
// can hand it out as a C string. No trailing newline is appended: whoever
// prints the message decides how to terminate the line.
ParseError::ParseError (int parLine, int parColumn, std::string parMessage) {
	std::ostringstream oss;
	oss << "Error parsing the source XML at line " <<
		parLine << " col " << parColumn << ":\n" <<
		parMessage;
	const std::string msg = oss.str();
	m_msg.assign(msg.begin(), msg.end());
	m_msg.push_back('\0');
}
|
||||
|
||||
const char* ParseError::what() const noexcept {
|
||||
return m_msg.data();
|
||||
}
|
||||
} //namespace duck
|
23
src/xpath.hpp
Normal file
23
src/xpath.hpp
Normal file
|
@ -0,0 +1,23 @@
|
|||
#ifndef id21E0A6F345D24C5D83D3B1F74EC810F7
|
||||
#define id21E0A6F345D24C5D83D3B1F74EC810F7
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <exception>
|
||||
#include <utility>
|
||||
|
||||
namespace duck {
|
||||
typedef std::vector<std::vector<std::pair<std::string, std::string>>> XPathBatchResults;
|
||||
|
||||
// Exception reporting a failure to parse the source XML, built from the
// line/column where parsing stopped and a description of the problem.
class ParseError : public std::exception {
public:
	// parLine / parColumn: position of the error; parMessage: reason text.
	ParseError ( int parLine, int parColumn, std::string parMessage );
	// Returns the stored message. Marked override so that any drift from
	// std::exception::what()'s signature is a compile error, not a silent
	// non-override.
	const char* what ( void ) const noexcept override;
private:
	// Character buffer holding the message handed out by what().
	std::vector<char> m_msg;
};
|
||||
|
||||
XPathBatchResults xpath_query ( const std::string& parXML, const std::vector<std::string>& parQueries );
|
||||
} //namespace duck
|
||||
|
||||
#endif
|
Loading…
Reference in a new issue