From 430886085c09fcff5adc1461cb0a8599c3ee9163 Mon Sep 17 00:00:00 2001 From: King_DuckZ Date: Tue, 18 Feb 2020 10:19:51 +0100 Subject: [PATCH] Use XQilla and Xerces-c from the system instead of pugixml. I don't think this commit works or even compiles, I have too many changes and I have to start committing from somewhere. At the same time I don't want to make a "lots of changes here and there" kind of commit. --- .gitignore | 1 + CMakeLists.txt | 3 +- cmake/Modules/FindXQilla.cmake | 28 +++++++++ src/safe_stack_object.hpp | 104 +++++++++++++++++++++++++++++++++ src/xpath.cpp | 42 ++++++------- src/xpath.hpp | 1 + 6 files changed, 158 insertions(+), 21 deletions(-) create mode 100644 cmake/Modules/FindXQilla.cmake create mode 100644 src/safe_stack_object.hpp diff --git a/.gitignore b/.gitignore index 332b722..f8ea2be 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ build/ tags +compile_commands.json diff --git a/CMakeLists.txt b/CMakeLists.txt index dfba656..d3eb079 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,8 +5,8 @@ project(duckscraper VERSION 0.2.1 LANGUAGES CXX) option(BUILD_SHARED_TIDY "Wheter you want to build tidy-html5 as a shared library" OFF) include(GetGitRevisionDescription) -find_package(PugiXML REQUIRED) find_package(Boost 1.32.0 COMPONENTS program_options) +find_package(XQilla 2.3.3 REQUIRED) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) @@ -59,6 +59,7 @@ target_link_libraries(${PROJECT_NAME} PRIVATE curlcpp PRIVATE ${Boost_LIBRARIES} PRIVATE mstch + PRIVATE XQilla::XQilla ) target_compile_definitions(${PROJECT_NAME} diff --git a/cmake/Modules/FindXQilla.cmake b/cmake/Modules/FindXQilla.cmake new file mode 100644 index 0000000..9201536 --- /dev/null +++ b/cmake/Modules/FindXQilla.cmake @@ -0,0 +1,28 @@ +# Find the XQilla library +# originally taken from +# https://github.com/rug-compling/alpinocorpus/blob/master/cmake/FindXQilla.cmake + +find_path(XQILLA_INCLUDE_DIR NAMES xqilla/xqilla-simple.hpp) 
+find_library(XQILLA_LIBRARY NAMES xqilla) +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args( + XQILLA + DEFAULT_MSG + XQILLA_INCLUDE_DIR + XQILLA_LIBRARY +) +set(XQILLA_LIBRARIES ${XQILLA_LIBRARY}) +mark_as_advanced(XQILLA_INCLUDE_DIR XQILLA_LIBRARY) + +if (XQILLA_FOUND) + find_package(XercesC REQUIRED) + + if (NOT TARGET XQilla::XQilla) + add_library(XQilla::XQilla UNKNOWN IMPORTED) + set_target_properties(XQilla::XQilla PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${XQILLA_INCLUDE_DIR}" + IMPORTED_LOCATION "${XQILLA_LIBRARY}" + INTERFACE_LINK_LIBRARIES XercesC::XercesC + ) + endif() +endif() diff --git a/src/safe_stack_object.hpp b/src/safe_stack_object.hpp new file mode 100644 index 0000000..1738501 --- /dev/null +++ b/src/safe_stack_object.hpp @@ -0,0 +1,104 @@ +/* + Copyright 2016, 2017 Michele "King_DuckZ" Santullo + + This file is part of MyCurry. + + MyCurry is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + MyCurry is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with MyCurry. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +#include "kakoune/safe_ptr.hh" +#include + +namespace curry { + template + class SafeStackObject { + public: + typedef Kakoune::SafePtr safe_ptr; + + SafeStackObject(); + SafeStackObject (SafeStackObject&& parOther); + SafeStackObject (const SafeStackObject& parOther) = delete; + template explicit SafeStackObject (Args&&... 
parArgs); + ~SafeStackObject() noexcept = default; + + SafeStackObject& operator= (SafeStackObject&& parOther) = delete; + SafeStackObject& operator= (const SafeStackObject& parOther) = delete; + + operator Kakoune::SafePtr&(); + template + operator Kakoune::SafePtr(); + T& operator*(); + safe_ptr& operator->(); + + private: + T m_obj; + safe_ptr m_obj_ptr; + }; + + template + SafeStackObject::SafeStackObject() : + m_obj(), + m_obj_ptr(&m_obj) + { + } + + template + SafeStackObject::SafeStackObject (SafeStackObject&& parOther) : + m_obj(std::move(parOther.m_obj)), + m_obj_ptr(&m_obj) + { + } + + template + template + SafeStackObject::SafeStackObject (Args&&... parArgs) : + m_obj(std::forward(parArgs)...), + m_obj_ptr(&m_obj) + { + } + + //template + //SafeStackObject& SafeStackObject::operator= (SafeStackObject&& parOther) { + // m_obj = std::move(parOther.m_obj); + // m_obj_ptr = std::move(parOther.m_obj_ptr); + // m_ob + //} + + //template + //SafeStackObject& SafeStackObject::operator= (const SafeStackObject& parOther) { + //} + + template + SafeStackObject::operator Kakoune::SafePtr&() { + return m_obj_ptr; + } + + template + template + SafeStackObject::operator Kakoune::SafePtr() { + return Kakoune::SafePtr(&m_obj); + } + + template + T& SafeStackObject::operator*() { + return *m_obj_ptr; + } + + template + auto SafeStackObject::operator->() -> safe_ptr& { + return m_obj_ptr; + } +} //namespace curry diff --git a/src/xpath.cpp b/src/xpath.cpp index 0ef3db0..ff935d3 100644 --- a/src/xpath.cpp +++ b/src/xpath.cpp @@ -17,7 +17,9 @@ */ #include "xpath.hpp" -#include +#include +#include +#include #include #include #include @@ -46,30 +48,30 @@ namespace duck { } //unnamed namespace XPathBatchResults xpath_query (const std::string& parXML, const std::vector& parQueries) { - pugi::xml_document doc; - std::istringstream iss(parXML); - pugi::xml_parse_result result(doc.load(iss)); - if (not result) { - auto line_col = line_col_from_offset(result.offset, parXML); - throw 
ParseError(line_col.first, line_col.second, result.description()); + XQilla& xqilla = m_xqilla; + XercesConfiguration xconfig; + AutoDelete context(xqilla.createContext(XQilla::XQUERY_UPDATE, &xconfig)); + xercesc::MemBufInputSource input_buf(reinterpret_cast(parXML.c_str()), parXML.size(), "n/a", false); + Node::Ptr ptr; + try { + ptr = context->parseDocument(input_buf); } + catch (const XMLParseException& err) { + throw ParseError(err.getXQueryLine(), err.getXQueryColumn(), xercesc::XMLString::transcode(err.getError())); + } + context->setContextItem(ptr); XPathBatchResults retval; for (const auto& xpath : parQueries) { - pugi::xpath_node_set xpathRes = doc.select_nodes(xpath.c_str()); + AutoDelete query(xqilla.parse(X(xpath.c_str()))); + context->setContextPosition(1); + context->setContextSize(1); + + Result result = query->execute(context); + Item::Ptr item; std::vector> new_lst; - for (pugi::xpath_node_set::const_iterator itFind(xpathRes.begin()), itFindEND(xpathRes.end()); itFind != itFindEND; ++itFind) { - const pugi::xpath_node& node = *itFind; - std::pair new_itm; - if (node.node()) { - new_itm.first = std::string(node.node().name()); - new_itm.second = std::string(node.node().value()); - } - else if (node.attribute()) { - new_itm.first = std::string(node.attribute().name()); - new_itm.second = std::string(node.attribute().value()); - } - new_lst.push_back(std::move(new_itm)); + while(nullptr != (item = result->next(context))) { + new_lst.push_back(std::make_pair(std::string(), UTF8(item->asString(context)))); } retval.push_back(std::move(new_lst)); } diff --git a/src/xpath.hpp b/src/xpath.hpp index 12ce69c..4d5031d 100644 --- a/src/xpath.hpp +++ b/src/xpath.hpp @@ -23,6 +23,7 @@ #include #include #include +#include namespace duck { typedef std::vector>> XPathBatchResults;