diff --git a/CMakeLists.txt b/CMakeLists.txt index 416b4a8..5c48e2f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,15 +4,14 @@ project(scraper CXX) add_subdirectory(lib/tidy) -find_package(LibXML++ REQUIRED) -find_package(LibXml2 REQUIRED) +find_package(PugiXML REQUIRED) #find_package(CURL REQUIRED) #${CURL_INCLUDE_DIR} #${CURL_LIBRARIES} include_directories(SYSTEM lib/tidy/include - ${LibXML++_INCLUDE_DIRS} + ${PUGIXML_INCLUDE_DIR} ) add_executable(${PROJECT_NAME} @@ -21,5 +20,5 @@ add_executable(${PROJECT_NAME} target_link_libraries(${PROJECT_NAME} tidy - ${LibXML++_LIBRARIES} + ${PUGIXML_LIBRARIES} ) diff --git a/cmake/Modules/FindGlib.cmake b/cmake/Modules/FindGlib.cmake deleted file mode 100644 index b00d63e..0000000 --- a/cmake/Modules/FindGlib.cmake +++ /dev/null @@ -1,39 +0,0 @@ -# - Try to find Glib-2.0 -# Once done, this will define -# -# Glib_FOUND - system has Glib -# Glib_INCLUDE_DIRS - the Glib include directories -# Glib_LIBRARIES - link these to use Glib - -include(LibFindMacros) - -# Use pkg-config to get hints about paths -libfind_pkg_check_modules(Glib_PKGCONF glib-2.0) - -# Main include dir -find_path(Glib_INCLUDE_DIR - NAMES glib.h - PATHS ${Glib_PKGCONF_INCLUDE_DIRS} - PATH_SUFFIXES glib-2.0 -) - -# Glib-related libraries also use a separate config header, which is in lib dir -find_path(GlibConfig_INCLUDE_DIR - NAMES glibconfig.h - PATHS ${Glib_PKGCONF_INCLUDE_DIRS} /usr - PATH_SUFFIXES lib/glib-2.0/include ../lib/glib-2.0/include -) - -# Finally the library itself -find_library(Glib_LIBRARY - NAMES glib-2.0 - PATHS ${Glib_PKGCONF_LIBRARY_DIRS} -) - -# Set the include dir variables and the libraries and let libfind_process do the rest. -# NOTE: Singular variables for this library, plural for libraries this this lib depends on. 
-set(Glib_PROCESS_INCLUDES Glib_INCLUDE_DIR GlibConfig_INCLUDE_DIR) -set(Glib_PROCESS_LIBS Glib_LIBRARY) -libfind_process(Glib) - - diff --git a/cmake/Modules/FindGlibmm.cmake b/cmake/Modules/FindGlibmm.cmake deleted file mode 100644 index 8431c0e..0000000 --- a/cmake/Modules/FindGlibmm.cmake +++ /dev/null @@ -1,39 +0,0 @@ -# - Try to find Glibmm-2.4 -# Once done, this will define -# -# Glibmm_FOUND - system has Glibmm -# Glibmm_INCLUDE_DIRS - the Glibmm include directories -# Glibmm_LIBRARIES - link these to use Glibmm - -include(LibFindMacros) - -# Dependencies -libfind_package(Glibmm Glib) -libfind_package(Glibmm SigC++) - -# Use pkg-config to get hints about paths -libfind_pkg_check_modules(Glibmm_PKGCONF glibmm-2.4) - -# Main include dir -find_path(Glibmm_INCLUDE_DIR - NAMES glibmm/main.h - PATHS ${Glibmm_PKGCONF_INCLUDE_DIRS} - PATH_SUFFIXES glibmm-2.4 -) - -# Glib-related libraries also use a separate config header, which is in lib dir -find_path(GlibmmConfig_INCLUDE_DIR - NAMES glibmmconfig.h - PATHS ${Glibmm_PKGCONF_INCLUDE_DIRS} /usr - PATH_SUFFIXES lib/glibmm-2.4/include ../lib/glibmm-2.4/include -) - -# Finally the library itself -find_library(Glibmm_LIBRARY - NAMES glibmm-2.4 - PATHS ${Glibmm_PKGCONF_LIBRARY_DIRS} -) - -set(Glibmm_PROCESS_INCLUDES GlibmmConfig_INCLUDE_DIR) -libfind_process(Glibmm) - diff --git a/cmake/Modules/FindLibXML++.cmake b/cmake/Modules/FindLibXML++.cmake deleted file mode 100644 index e92543e..0000000 --- a/cmake/Modules/FindLibXML++.cmake +++ /dev/null @@ -1,38 +0,0 @@ -# - Try to find LibXML++ 2.6 -# Once done, this will define -# -# LibXML++_FOUND - system has LibXML++ -# LibXML++_INCLUDE_DIRS - the LibXML++ include directories -# LibXML++_LIBRARIES - link these to use LibXML++ - -include(LibFindMacros) - -# Dependencies -libfind_package(LibXML++ LibXML2) -libfind_package(LibXML++ Glibmm) - -# Use pkg-config to get hints about paths -libfind_pkg_check_modules(LibXML++_PKGCONF libxml++-2.6) - -# Main include dir 
-find_path(LibXML++_INCLUDE_DIR - NAMES libxml++/libxml++.h - PATHS ${LibXML++_PKGCONF_INCLUDE_DIRS} - PATH_SUFFIXES libxml++-2.6 -) - -# Glib-related libraries also use a separate config header, which is in lib dir -find_path(LibXML++Config_INCLUDE_DIR - NAMES libxml++config.h - PATHS ${LibXML++_PKGCONF_INCLUDE_DIRS} /usr - PATH_SUFFIXES lib/libxml++-2.6/include ../lib/libxml++-2.6/include -) - -# Finally the library itself -find_library(LibXML++_LIBRARY - NAMES xml++-2.6 - PATHS ${LibXML++_PKGCONF_LIBRARY_DIRS} -) - -set(LibXML++_PROCESS_INCLUDES LibXML++Config_INCLUDE_DIR) -libfind_process(LibXML++) diff --git a/cmake/Modules/FindLibXML2.cmake b/cmake/Modules/FindLibXML2.cmake deleted file mode 100644 index 8163e70..0000000 --- a/cmake/Modules/FindLibXML2.cmake +++ /dev/null @@ -1,30 +0,0 @@ -# - Try to find LibXML2 -# Once done, this will define -# -# LibXML2_FOUND - system has LibXML2 -# LibXML2_INCLUDE_DIRS - the LibXML2 include directories -# LibXML2_LIBRARIES - link these to use LibXML2 -# -# See documentation on how to write CMake scripts at -# http://www.cmake.org/Wiki/CMake:How_To_Find_Libraries - -include(LibFindMacros) - -libfind_pkg_check_modules(LibXML2_PKGCONF libxml-2.0) - -find_path(LibXML2_INCLUDE_DIR - NAMES libxml/xpath.h - PATHS ${LibXML2_PKGCONF_INCLUDE_DIRS} - PATHS ${LibXML2_PKGCONF_INCLUDE_DIRS}/libxml2 - PATH_SUFFIXES libxml2 -) - -find_library(LibXML2_LIBRARY - NAMES xml2 - PATHS ${LibXML2_PKGCONF_LIBRARY_DIRS} -) - -set(LibXML2_PROCESS_INCLUDES LibXML2_INCLUDE_DIR) -set(LibXML2_PROCESS_LIBS LibXML2_LIBRARY) -libfind_process(LibXML2) - diff --git a/cmake/Modules/FindPugiXML.cmake b/cmake/Modules/FindPugiXML.cmake new file mode 100644 index 0000000..6c241b5 --- /dev/null +++ b/cmake/Modules/FindPugiXML.cmake @@ -0,0 +1,29 @@ +# Find the pugixml XML parsing library. 
+# +# Sets the usual variables expected for find_package scripts: +# +# PUGIXML_INCLUDE_DIR - header location +# PUGIXML_LIBRARIES - library to link against +# PUGIXML_FOUND - true if pugixml was found. + +find_path (PUGIXML_INCLUDE_DIR + NAMES pugixml.hpp + PATHS ${PUGIXML_HOME}/include) +find_library (PUGIXML_LIBRARY + NAMES pugixml + PATHS ${PUGIXML_HOME}/lib) + +# Support the REQUIRED and QUIET arguments, and set PUGIXML_FOUND if found. +include (FindPackageHandleStandardArgs) +FIND_PACKAGE_HANDLE_STANDARD_ARGS (PugiXML DEFAULT_MSG PUGIXML_LIBRARY + PUGIXML_INCLUDE_DIR) + +if (PUGIXML_FOUND) + set (PUGIXML_LIBRARIES ${PUGIXML_LIBRARY}) + message (STATUS "PugiXML include = ${PUGIXML_INCLUDE_DIR}") + message (STATUS "PugiXML library = ${PUGIXML_LIBRARY}") +else () + message (STATUS "No PugiXML found") +endif() + +mark_as_advanced (PUGIXML_LIBRARY PUGIXML_INCLUDE_DIR) diff --git a/cmake/Modules/FindSigC++.cmake b/cmake/Modules/FindSigC++.cmake deleted file mode 100644 index 0d7ea0c..0000000 --- a/cmake/Modules/FindSigC++.cmake +++ /dev/null @@ -1,36 +0,0 @@ -# - Try to find SigC++-2.0 -# Once done, this will define -# -# SigC++_FOUND - system has SigC++ -# SigC++_INCLUDE_DIRS - the SigC++ include directories -# SigC++_LIBRARIES - link these to use SigC++ - -include(LibFindMacros) - -# Use pkg-config to get hints about paths -libfind_pkg_check_modules(SigC++_PKGCONF sigc++-2.0) - -# Main include dir -find_path(SigC++_INCLUDE_DIR - NAMES sigc++/sigc++.h - HINTS ${SigC++_PKGCONF_INCLUDE_DIRS} - PATH_SUFFIXES sigc++-2.0 -) - -# Glib-related libraries also use a separate config header, which is in lib dir -find_path(SigC++Config_INCLUDE_DIR - NAMES sigc++config.h - HINTS ${SigC++_PKGCONF_INCLUDE_DIRS} /usr - PATH_SUFFIXES lib/sigc++-2.0/include ../lib/sigc++-2.0/include -) - -# Finally the library itself -find_library(SigC++_LIBRARY - NAMES sigc-2.0 - HINTS ${SigC++_PKGCONF_LIBRARY_DIRS} -) - -set(SigC++_PROCESS_INCLUDES SigC++_INCLUDE_DIR SigC++Config_INCLUDE_DIR) 
-set(SigC++_PROCESS_LIBS SigC++_LIBRARY) -libfind_process(SigC++) - diff --git a/cmake/Modules/LibFindMacros.cmake b/cmake/Modules/LibFindMacros.cmake deleted file mode 100644 index 3ef5844..0000000 --- a/cmake/Modules/LibFindMacros.cmake +++ /dev/null @@ -1,266 +0,0 @@ -# Version 2.2 -# Public Domain, originally written by Lasse Kärkkäinen -# Maintained at https://github.com/Tronic/cmake-modules -# Please send your improvements as pull requests on Github. - -# Find another package and make it a dependency of the current package. -# This also automatically forwards the "REQUIRED" argument. -# Usage: libfind_package( [extra args to find_package]) -macro (libfind_package PREFIX PKG) - set(${PREFIX}_args ${PKG} ${ARGN}) - if (${PREFIX}_FIND_REQUIRED) - set(${PREFIX}_args ${${PREFIX}_args} REQUIRED) - endif() - find_package(${${PREFIX}_args}) - set(${PREFIX}_DEPENDENCIES ${${PREFIX}_DEPENDENCIES};${PKG}) - unset(${PREFIX}_args) -endmacro() - -# A simple wrapper to make pkg-config searches a bit easier. -# Works the same as CMake's internal pkg_check_modules but is always quiet. -macro (libfind_pkg_check_modules) - find_package(PkgConfig QUIET) - if (PKG_CONFIG_FOUND) - pkg_check_modules(${ARGN} QUIET) - endif() -endmacro() - -# Avoid useless copy&pasta by doing what most simple libraries do anyway: -# pkg-config, find headers, find library. -# Usage: libfind_pkg_detect( FIND_PATH [other args] FIND_LIBRARY [other args]) -# E.g. 
libfind_pkg_detect(SDL2 sdl2 FIND_PATH SDL.h PATH_SUFFIXES SDL2 FIND_LIBRARY SDL2) -function (libfind_pkg_detect PREFIX) - # Parse arguments - set(argname pkgargs) - foreach (i ${ARGN}) - if ("${i}" STREQUAL "FIND_PATH") - set(argname pathargs) - elseif ("${i}" STREQUAL "FIND_LIBRARY") - set(argname libraryargs) - else() - set(${argname} ${${argname}} ${i}) - endif() - endforeach() - if (NOT pkgargs) - message(FATAL_ERROR "libfind_pkg_detect requires at least a pkg_config package name to be passed.") - endif() - # Find library - libfind_pkg_check_modules(${PREFIX}_PKGCONF ${pkgargs}) - if (pathargs) - find_path(${PREFIX}_INCLUDE_DIR NAMES ${pathargs} HINTS ${${PREFIX}_PKGCONF_INCLUDE_DIRS}) - endif() - if (libraryargs) - find_library(${PREFIX}_LIBRARY NAMES ${libraryargs} HINTS ${${PREFIX}_PKGCONF_LIBRARY_DIRS}) - endif() -endfunction() - -# Extracts a version #define from a version.h file, output stored to _VERSION. -# Usage: libfind_version_header(Foobar foobar/version.h FOOBAR_VERSION_STR) -# Fourth argument "QUIET" may be used for silently testing different define names. -# This function does nothing if the version variable is already defined. 
-function (libfind_version_header PREFIX VERSION_H DEFINE_NAME) - # Skip processing if we already have a version or if the include dir was not found - if (${PREFIX}_VERSION OR NOT ${PREFIX}_INCLUDE_DIR) - return() - endif() - set(quiet ${${PREFIX}_FIND_QUIETLY}) - # Process optional arguments - foreach(arg ${ARGN}) - if (arg STREQUAL "QUIET") - set(quiet TRUE) - else() - message(AUTHOR_WARNING "Unknown argument ${arg} to libfind_version_header ignored.") - endif() - endforeach() - # Read the header and parse for version number - set(filename "${${PREFIX}_INCLUDE_DIR}/${VERSION_H}") - if (NOT EXISTS ${filename}) - if (NOT quiet) - message(AUTHOR_WARNING "Unable to find ${${PREFIX}_INCLUDE_DIR}/${VERSION_H}") - endif() - return() - endif() - file(READ "${filename}" header) - string(REGEX REPLACE ".*#[ \t]*define[ \t]*${DEFINE_NAME}[ \t]*\"([^\n]*)\".*" "\\1" match "${header}") - # No regex match? - if (match STREQUAL header) - if (NOT quiet) - message(AUTHOR_WARNING "Unable to find \#define ${DEFINE_NAME} \"\" from ${${PREFIX}_INCLUDE_DIR}/${VERSION_H}") - endif() - return() - endif() - # Export the version string - set(${PREFIX}_VERSION "${match}" PARENT_SCOPE) -endfunction() - -# Do the final processing once the paths have been detected. -# If include dirs are needed, ${PREFIX}_PROCESS_INCLUDES should be set to contain -# all the variables, each of which contain one include directory. -# Ditto for ${PREFIX}_PROCESS_LIBS and library files. -# Will set ${PREFIX}_FOUND, ${PREFIX}_INCLUDE_DIRS and ${PREFIX}_LIBRARIES. -# Also handles errors in case library detection was required, etc. -function (libfind_process PREFIX) - # Skip processing if already processed during this configuration run - if (${PREFIX}_FOUND) - return() - endif() - - set(found TRUE) # Start with the assumption that the package was found - - # Did we find any files? Did we miss includes? These are for formatting better error messages. 
- set(some_files FALSE) - set(missing_headers FALSE) - - # Shorthands for some variables that we need often - set(quiet ${${PREFIX}_FIND_QUIETLY}) - set(required ${${PREFIX}_FIND_REQUIRED}) - set(exactver ${${PREFIX}_FIND_VERSION_EXACT}) - set(findver "${${PREFIX}_FIND_VERSION}") - set(version "${${PREFIX}_VERSION}") - - # Lists of config option names (all, includes, libs) - unset(configopts) - set(includeopts ${${PREFIX}_PROCESS_INCLUDES}) - set(libraryopts ${${PREFIX}_PROCESS_LIBS}) - - # Process deps to add to - foreach (i ${PREFIX} ${${PREFIX}_DEPENDENCIES}) - if (DEFINED ${i}_INCLUDE_OPTS OR DEFINED ${i}_LIBRARY_OPTS) - # The package seems to export option lists that we can use, woohoo! - list(APPEND includeopts ${${i}_INCLUDE_OPTS}) - list(APPEND libraryopts ${${i}_LIBRARY_OPTS}) - else() - # If plural forms don't exist or they equal singular forms - if ((NOT DEFINED ${i}_INCLUDE_DIRS AND NOT DEFINED ${i}_LIBRARIES) OR - ({i}_INCLUDE_DIR STREQUAL ${i}_INCLUDE_DIRS AND ${i}_LIBRARY STREQUAL ${i}_LIBRARIES)) - # Singular forms can be used - if (DEFINED ${i}_INCLUDE_DIR) - list(APPEND includeopts ${i}_INCLUDE_DIR) - endif() - if (DEFINED ${i}_LIBRARY) - list(APPEND libraryopts ${i}_LIBRARY) - endif() - else() - # Oh no, we don't know the option names - message(FATAL_ERROR "We couldn't determine config variable names for ${i} includes and libs. Aieeh!") - endif() - endif() - endforeach() - - if (includeopts) - list(REMOVE_DUPLICATES includeopts) - endif() - - if (libraryopts) - list(REMOVE_DUPLICATES libraryopts) - endif() - - string(REGEX REPLACE ".*[ ;]([^ ;]*(_INCLUDE_DIRS|_LIBRARIES))" "\\1" tmp "${includeopts} ${libraryopts}") - if (NOT tmp STREQUAL "${includeopts} ${libraryopts}") - message(AUTHOR_WARNING "Plural form ${tmp} found in config options of ${PREFIX}. This works as before but is now deprecated. 
Please only use singular forms INCLUDE_DIR and LIBRARY, and update your find scripts for LibFindMacros > 2.0 automatic dependency system (most often you can simply remove the PROCESS variables entirely).") - endif() - - # Include/library names separated by spaces (notice: not CMake lists) - unset(includes) - unset(libs) - - # Process all includes and set found false if any are missing - foreach (i ${includeopts}) - list(APPEND configopts ${i}) - if (NOT "${${i}}" STREQUAL "${i}-NOTFOUND") - list(APPEND includes "${${i}}") - else() - set(found FALSE) - set(missing_headers TRUE) - endif() - endforeach() - - # Process all libraries and set found false if any are missing - foreach (i ${libraryopts}) - list(APPEND configopts ${i}) - if (NOT "${${i}}" STREQUAL "${i}-NOTFOUND") - list(APPEND libs "${${i}}") - else() - set (found FALSE) - endif() - endforeach() - - # Version checks - if (found AND findver) - if (NOT version) - message(WARNING "The find module for ${PREFIX} does not provide version information, so we'll just assume that it is OK. 
Please fix the module or remove package version requirements to get rid of this warning.") - elseif (version VERSION_LESS findver OR (exactver AND NOT version VERSION_EQUAL findver)) - set(found FALSE) - set(version_unsuitable TRUE) - endif() - endif() - - # If all-OK, hide all config options, export variables, print status and exit - if (found) - foreach (i ${configopts}) - mark_as_advanced(${i}) - endforeach() - if (NOT quiet) - message(STATUS "Found ${PREFIX} ${${PREFIX}_VERSION}") - if (LIBFIND_DEBUG) - message(STATUS " ${PREFIX}_DEPENDENCIES=${${PREFIX}_DEPENDENCIES}") - message(STATUS " ${PREFIX}_INCLUDE_OPTS=${includeopts}") - message(STATUS " ${PREFIX}_INCLUDE_DIRS=${includes}") - message(STATUS " ${PREFIX}_LIBRARY_OPTS=${libraryopts}") - message(STATUS " ${PREFIX}_LIBRARIES=${libs}") - endif() - set (${PREFIX}_INCLUDE_OPTS ${includeopts} PARENT_SCOPE) - set (${PREFIX}_LIBRARY_OPTS ${libraryopts} PARENT_SCOPE) - set (${PREFIX}_INCLUDE_DIRS ${includes} PARENT_SCOPE) - set (${PREFIX}_LIBRARIES ${libs} PARENT_SCOPE) - set (${PREFIX}_FOUND TRUE PARENT_SCOPE) - endif() - return() - endif() - - # Format messages for debug info and the type of error - set(vars "Relevant CMake configuration variables:\n") - foreach (i ${configopts}) - mark_as_advanced(CLEAR ${i}) - set(val ${${i}}) - if ("${val}" STREQUAL "${i}-NOTFOUND") - set (val "") - elseif (val AND NOT EXISTS ${val}) - set (val "${val} (does not exist)") - else() - set(some_files TRUE) - endif() - set(vars "${vars} ${i}=${val}\n") - endforeach() - set(vars "${vars}You may use CMake GUI, cmake -D or ccmake to modify the values. 
Delete CMakeCache.txt to discard all values and force full re-detection if necessary.\n") - if (version_unsuitable) - set(msg "${PREFIX} ${${PREFIX}_VERSION} was found but") - if (exactver) - set(msg "${msg} only version ${findver} is acceptable.") - else() - set(msg "${msg} version ${findver} is the minimum requirement.") - endif() - else() - if (missing_headers) - set(msg "We could not find development headers for ${PREFIX}. Do you have the necessary dev package installed?") - elseif (some_files) - set(msg "We only found some files of ${PREFIX}, not all of them. Perhaps your installation is incomplete or maybe we just didn't look in the right place?") - if(findver) - set(msg "${msg} This could also be caused by incompatible version (if it helps, at least ${PREFIX} ${findver} should work).") - endif() - else() - set(msg "We were unable to find package ${PREFIX}.") - endif() - endif() - - # Fatal error out if REQUIRED - if (required) - set(msg "REQUIRED PACKAGE NOT FOUND\n${msg} This package is REQUIRED and you need to install it or adjust CMake configuration in order to continue building ${CMAKE_PROJECT_NAME}.") - message(FATAL_ERROR "${msg}\n${vars}") - endif() - # Otherwise just print a nasty warning - if (NOT quiet) - message(WARNING "WARNING: MISSING PACKAGE\n${msg} This package is NOT REQUIRED and you may ignore this warning but by doing so you may miss some functionality of ${CMAKE_PROJECT_NAME}. 
\n${vars}")
-  endif()
-endfunction()
-
diff --git a/src/main.cpp b/src/main.cpp
index 7f12284..3b43c50 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -3,7 +3,8 @@
 #include
 #include
 #include
-#include
+#include
+#include
 
 namespace {
     const char g_testData[] = {
@@ -151,19 +152,39 @@ namespace {
 int main (int argc, char* argv[]) {
+    // The XPath expression is a mandatory argument: without this guard
+    // argv[1] is a null pointer, and both the stream insertion and the
+    // select_nodes() call below would be handed that null pointer.
+    if (argc < 2) {
+        std::cerr << "Usage: " << (argc > 0 ? argv[0] : "scraper") << " <xpath>\n";
+        return 1;
+    }
+
     const std::string tidyHtml(cleanHTML(g_testData));
     std::cout << tidyHtml << std::endl;
+    std::cout << "XPath: " << argv[1] << std::endl;
 
-    //{
-    //    xmlpp::DomParser doc;
+    {
+        pugi::xml_document doc;
+        std::istringstream iss(tidyHtml);
+        pugi::xml_parse_result result(doc.load(iss));
+        if (not result) {
+            std::cerr << "Error parsing the source XML";
+            return 1;
+        }
 
-    //    // 'response' contains your HTML
-    //    doc.parse_memory(tidyHtml);
-
-    //    xmlpp::Document* const document = doc.get_document();
-    //    xmlpp::Element* const root = document->get_root_node();
-
-    //    xmlpp::NodeSet elemns = root->find(argv[1]);
-    //    std::cout << elemns[0]->get_line() << std::endl;
-    //    std::cout << elemns.size() << std::endl;
-    //}
+        // argv[1] is user input. NOTE(review): pugixml's default
+        // (exception-enabled) build reports a malformed expression by
+        // throwing pugi::xpath_exception; confirm the build does not define
+        // PUGIXML_NO_EXCEPTIONS, in which case this handler must go.
+        try {
+            pugi::xpath_node_set xpathRes = doc.select_nodes(argv[1]);
+            for (pugi::xpath_node_set::const_iterator itFind(xpathRes.begin()), itFindEND(xpathRes.end()); itFind != itFindEND; ++itFind) {
+                const pugi::xpath_node& node = *itFind;
+                std::cout << node.node().name() << ": ";
+                std::cout << node.node().attribute("name").value() << "\n";
+            }
+        }
+        catch (const pugi::xpath_exception& err) {
+            std::cerr << "Invalid XPath expression: " << err.what() << "\n";
+            return 1;
+        }
+    }
     return 0;
 }