diff --git a/CMakeLists.txt b/CMakeLists.txt
index a0a425e..416b4a8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,25 @@
 cmake_minimum_required(VERSION 2.8 FATAL_ERROR)
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
 project(scraper CXX)
 
+add_subdirectory(lib/tidy)
+
+find_package(LibXML++ REQUIRED)
+find_package(LibXml2 REQUIRED)
+#find_package(CURL REQUIRED)
+#${CURL_INCLUDE_DIR}
+#${CURL_LIBRARIES}
+
+include_directories(SYSTEM
+	lib/tidy/include
+	${LibXML++_INCLUDE_DIRS}
+)
+
 add_executable(${PROJECT_NAME}
 	src/main.cpp
 )
+
+target_link_libraries(${PROJECT_NAME}
+	tidy
+	${LibXML++_LIBRARIES}
+)
diff --git a/cmake/Modules/FindGlib.cmake b/cmake/Modules/FindGlib.cmake
new file mode 100644
index 0000000..b00d63e
--- /dev/null
+++ b/cmake/Modules/FindGlib.cmake
@@ -0,0 +1,39 @@
+# - Try to find Glib-2.0
+# Once done, this will define
+#
+#  Glib_FOUND - system has Glib
+#  Glib_INCLUDE_DIRS - the Glib include directories
+#  Glib_LIBRARIES - link these to use Glib
+
+include(LibFindMacros)
+
+# Use pkg-config to get hints about paths
+libfind_pkg_check_modules(Glib_PKGCONF glib-2.0)
+
+# Main include dir (pkg-config results are HINTS so they are tried before system defaults)
+find_path(Glib_INCLUDE_DIR
+  NAMES glib.h
+  HINTS ${Glib_PKGCONF_INCLUDE_DIRS}
+  PATH_SUFFIXES glib-2.0
+)
+
+# Glib-related libraries also use a separate config header, which is in lib dir
+find_path(GlibConfig_INCLUDE_DIR
+  NAMES glibconfig.h
+  HINTS ${Glib_PKGCONF_INCLUDE_DIRS} PATHS /usr
+  PATH_SUFFIXES lib/glib-2.0/include ../lib/glib-2.0/include
+)
+
+# Finally the library itself
+find_library(Glib_LIBRARY
+  NAMES glib-2.0
+  HINTS ${Glib_PKGCONF_LIBRARY_DIRS}
+)
+
+# Set the include dir variables and the libraries and let libfind_process do the rest.
+# NOTE: Singular variables for this library, plural for libraries this lib depends on.
+set(Glib_PROCESS_INCLUDES Glib_INCLUDE_DIR GlibConfig_INCLUDE_DIR)
+set(Glib_PROCESS_LIBS Glib_LIBRARY)
+libfind_process(Glib)
+
+
diff --git a/cmake/Modules/FindGlibmm.cmake b/cmake/Modules/FindGlibmm.cmake
new file mode 100644
index 0000000..8431c0e
--- /dev/null
+++ b/cmake/Modules/FindGlibmm.cmake
@@ -0,0 +1,39 @@
+# - Try to find Glibmm-2.4
+# Once done, this will define
+#
+#  Glibmm_FOUND - system has Glibmm
+#  Glibmm_INCLUDE_DIRS - the Glibmm include directories
+#  Glibmm_LIBRARIES - link these to use Glibmm
+
+include(LibFindMacros)
+
+# Dependencies (recorded in Glibmm_DEPENDENCIES by libfind_package)
+libfind_package(Glibmm Glib)
+libfind_package(Glibmm SigC++)
+
+# Use pkg-config to get hints about paths
+libfind_pkg_check_modules(Glibmm_PKGCONF glibmm-2.4)
+
+# Main include dir (pkg-config results are HINTS so they are tried before system defaults)
+find_path(Glibmm_INCLUDE_DIR
+  NAMES glibmm/main.h
+  HINTS ${Glibmm_PKGCONF_INCLUDE_DIRS}
+  PATH_SUFFIXES glibmm-2.4
+)
+
+# Glib-related libraries also use a separate config header, which is in lib dir
+find_path(GlibmmConfig_INCLUDE_DIR
+  NAMES glibmmconfig.h
+  HINTS ${Glibmm_PKGCONF_INCLUDE_DIRS} PATHS /usr
+  PATH_SUFFIXES lib/glibmm-2.4/include ../lib/glibmm-2.4/include
+)
+
+# Finally the library itself
+find_library(Glibmm_LIBRARY
+  NAMES glibmm-2.4
+  HINTS ${Glibmm_PKGCONF_LIBRARY_DIRS}
+)
+
+set(Glibmm_PROCESS_INCLUDES GlibmmConfig_INCLUDE_DIR)
+libfind_process(Glibmm)
+
diff --git a/cmake/Modules/FindLibXML++.cmake b/cmake/Modules/FindLibXML++.cmake
new file mode 100644
index 0000000..e92543e
--- /dev/null
+++ b/cmake/Modules/FindLibXML++.cmake
@@ -0,0 +1,38 @@
+# - Try to find LibXML++ 2.6
+# Once done, this will define
+#
+#  LibXML++_FOUND - system has LibXML++
+#  LibXML++_INCLUDE_DIRS - the LibXML++ include directories
+#  LibXML++_LIBRARIES - link these to use LibXML++
+
+include(LibFindMacros)
+
+# Dependencies (recorded in LibXML++_DEPENDENCIES by libfind_package)
+libfind_package(LibXML++ LibXML2)
+libfind_package(LibXML++ Glibmm)
+
+# Use pkg-config to get hints about paths
+libfind_pkg_check_modules(LibXML++_PKGCONF libxml++-2.6)
+
+# Main include dir
+find_path(LibXML++_INCLUDE_DIR
+  NAMES libxml++/libxml++.h
+  HINTS ${LibXML++_PKGCONF_INCLUDE_DIRS}
+  PATH_SUFFIXES libxml++-2.6
+)
+
+# libxml++ also uses a separate config header, which is looked up under the lib dir
+find_path(LibXML++Config_INCLUDE_DIR
+  NAMES libxml++config.h
+  HINTS ${LibXML++_PKGCONF_INCLUDE_DIRS} PATHS /usr
+  PATH_SUFFIXES lib/libxml++-2.6/include ../lib/libxml++-2.6/include
+)
+
+# Finally the library itself
+find_library(LibXML++_LIBRARY
+  NAMES xml++-2.6
+  HINTS ${LibXML++_PKGCONF_LIBRARY_DIRS}
+)
+
+set(LibXML++_PROCESS_INCLUDES LibXML++Config_INCLUDE_DIR)
+libfind_process(LibXML++)
diff --git a/cmake/Modules/FindLibXML2.cmake b/cmake/Modules/FindLibXML2.cmake
new file mode 100644
index 0000000..8163e70
--- /dev/null
+++ b/cmake/Modules/FindLibXML2.cmake
@@ -0,0 +1,30 @@
+# - Try to find LibXML2
+# Once done, this will define
+#
+#  LibXML2_FOUND - system has LibXML2
+#  LibXML2_INCLUDE_DIRS - the LibXML2 include directories
+#  LibXML2_LIBRARIES - link these to use LibXML2
+#
+# See documentation on how to write CMake scripts at
+# http://www.cmake.org/Wiki/CMake:How_To_Find_Libraries
+
+include(LibFindMacros)
+
+libfind_pkg_check_modules(LibXML2_PKGCONF libxml-2.0)
+
+find_path(LibXML2_INCLUDE_DIR
+  NAMES libxml/xpath.h
+  HINTS ${LibXML2_PKGCONF_INCLUDE_DIRS}
+  PATHS ${LibXML2_PKGCONF_INCLUDE_DIRS}/libxml2
+  PATH_SUFFIXES libxml2
+)
+
+find_library(LibXML2_LIBRARY
+  NAMES xml2
+  HINTS ${LibXML2_PKGCONF_LIBRARY_DIRS}
+)
+
+set(LibXML2_PROCESS_INCLUDES LibXML2_INCLUDE_DIR)
+set(LibXML2_PROCESS_LIBS LibXML2_LIBRARY)
+libfind_process(LibXML2)
+
diff --git a/cmake/Modules/FindSigC++.cmake b/cmake/Modules/FindSigC++.cmake
new file mode 100644
index 0000000..0d7ea0c
--- /dev/null
+++ b/cmake/Modules/FindSigC++.cmake
@@ -0,0 +1,36 @@
+# - Try to find SigC++-2.0
+# Once done, this will define
+#
+#  SigC++_FOUND - system has SigC++
+#  SigC++_INCLUDE_DIRS - the SigC++ include directories
+#  SigC++_LIBRARIES - link these to use SigC++
+
+include(LibFindMacros)
+
+# Ask pkg-config about sigc++-2.0; its answers only serve as search hints below
+libfind_pkg_check_modules(SigC++_PKGCONF sigc++-2.0)
+
+# Directory containing sigc++/sigc++.h
+find_path(SigC++_INCLUDE_DIR
+  NAMES sigc++/sigc++.h
+  HINTS ${SigC++_PKGCONF_INCLUDE_DIRS}
+  PATH_SUFFIXES sigc++-2.0
+)
+
+# Directory containing sigc++config.h, looked up under lib/sigc++-2.0/include
+find_path(SigC++Config_INCLUDE_DIR
+  NAMES sigc++config.h
+  HINTS ${SigC++_PKGCONF_INCLUDE_DIRS} /usr
+  PATH_SUFFIXES lib/sigc++-2.0/include ../lib/sigc++-2.0/include
+)
+
+# The library to link against
+find_library(SigC++_LIBRARY
+  NAMES sigc-2.0
+  HINTS ${SigC++_PKGCONF_LIBRARY_DIRS}
+)
+
+set(SigC++_PROCESS_INCLUDES SigC++_INCLUDE_DIR SigC++Config_INCLUDE_DIR)
+set(SigC++_PROCESS_LIBS SigC++_LIBRARY)
+libfind_process(SigC++)
+
diff --git a/cmake/Modules/LibFindMacros.cmake b/cmake/Modules/LibFindMacros.cmake
new file mode 100644
index 0000000..3ef5844
--- /dev/null
+++ b/cmake/Modules/LibFindMacros.cmake
@@ -0,0 +1,266 @@
+# Version 2.2
+# Public Domain, originally written by Lasse Kärkkäinen
+# Maintained at https://github.com/Tronic/cmake-modules
+# Please send your improvements as pull requests on Github.
+
+# Runs find_package(PKG ...) and appends PKG to ${PREFIX}_DEPENDENCIES.
+# REQUIRED is added to the call when ${PREFIX}_FIND_REQUIRED is set.
+# Usage: libfind_package(PREFIX PKG [extra args to find_package])
+macro (libfind_package PREFIX PKG)
+  set(${PREFIX}_args ${PKG} ${ARGN})
+  if (${PREFIX}_FIND_REQUIRED)
+    list(APPEND ${PREFIX}_args REQUIRED)
+  endif()
+  find_package(${${PREFIX}_args})
+  list(APPEND ${PREFIX}_DEPENDENCIES ${PKG})
+  unset(${PREFIX}_args)
+endmacro()
+
+# Thin wrapper around pkg_check_modules that always passes QUIET and does
+# nothing at all when the PkgConfig package itself cannot be found.
+macro (libfind_pkg_check_modules)
+  find_package(PkgConfig QUIET)
+  if (PKG_CONFIG_FOUND)
+    pkg_check_modules(${ARGN} QUIET)
+  endif()
+endmacro()
+
+# Shortcut for the usual sequence: pkg-config, then find_path, then find_library.
+# Arguments before FIND_PATH go to pkg-config; those after FIND_PATH / FIND_LIBRARY
+# are handed to find_path / find_library (right after NAMES) for ${PREFIX}_INCLUDE_DIR / ${PREFIX}_LIBRARY.
+# E.g. libfind_pkg_detect(SDL2 sdl2 FIND_PATH SDL.h PATH_SUFFIXES SDL2 FIND_LIBRARY SDL2)
+function (libfind_pkg_detect PREFIX)
+  # Sort the arguments into three buckets, switching bucket at each keyword
+  set(argname pkgargs)
+  foreach (i ${ARGN})
+    if ("${i}" STREQUAL "FIND_PATH")
+      set(argname pathargs)
+    elseif ("${i}" STREQUAL "FIND_LIBRARY")
+      set(argname libraryargs)
+    else()
+      list(APPEND ${argname} ${i})
+    endif()
+  endforeach()
+  if (NOT pkgargs)
+    message(FATAL_ERROR "libfind_pkg_detect requires at least a pkg_config package name to be passed.")
+  endif()
+  # Query pkg-config, then look for the header dir and the library using its hints
+  libfind_pkg_check_modules(${PREFIX}_PKGCONF ${pkgargs})
+  if (pathargs)
+    find_path(${PREFIX}_INCLUDE_DIR NAMES ${pathargs} HINTS ${${PREFIX}_PKGCONF_INCLUDE_DIRS})
+  endif()
+  if (libraryargs)
+    find_library(${PREFIX}_LIBRARY NAMES ${libraryargs} HINTS ${${PREFIX}_PKGCONF_LIBRARY_DIRS})
+  endif()
+endfunction()
+
+# Reads a quoted version string from a #define in a header and stores it in ${PREFIX}_VERSION.
+# Usage: libfind_version_header(Foobar foobar/version.h FOOBAR_VERSION_STR)
+# An optional fourth argument "QUIET" suppresses the warnings, e.g. when probing several define names.
+# Returns immediately if ${PREFIX}_VERSION is already set or ${PREFIX}_INCLUDE_DIR is not.
+function (libfind_version_header PREFIX VERSION_H DEFINE_NAME)
+  # Skip processing if we already have a version or if the include dir was not found
+  if (${PREFIX}_VERSION OR NOT ${PREFIX}_INCLUDE_DIR)
+    return()
+  endif()
+  set(quiet ${${PREFIX}_FIND_QUIETLY})
+  # Process optional arguments
+  foreach(arg ${ARGN})
+    if (arg STREQUAL "QUIET")
+      set(quiet TRUE)
+    else()
+      message(AUTHOR_WARNING "Unknown argument ${arg} to libfind_version_header ignored.")
+    endif()
+  endforeach()
+  # Read the header and parse for version number
+  set(filename "${${PREFIX}_INCLUDE_DIR}/${VERSION_H}")
+  if (NOT EXISTS ${filename})
+    if (NOT quiet)
+      message(AUTHOR_WARNING "Unable to find ${${PREFIX}_INCLUDE_DIR}/${VERSION_H}")
+    endif()
+    return()
+  endif()
+  file(READ "${filename}" header)
+  string(REGEX REPLACE ".*#[ \t]*define[ \t]*${DEFINE_NAME}[ \t]*\"([^\n]*)\".*" "\\1" match "${header}")
+  # No regex match? (REGEX REPLACE returns its input unchanged in that case)
+  if (match STREQUAL header)
+    if (NOT quiet)
+      message(AUTHOR_WARNING "Unable to find \#define ${DEFINE_NAME} \"\" from ${${PREFIX}_INCLUDE_DIR}/${VERSION_H}")
+    endif()
+    return()
+  endif()
+  # Export the version string
+  set(${PREFIX}_VERSION "${match}" PARENT_SCOPE)
+endfunction()
+
+# Do the final processing once the paths have been detected.
+# If include dirs are needed, ${PREFIX}_PROCESS_INCLUDES should be set to contain
+# all the variables, each of which contain one include directory.
+# Ditto for ${PREFIX}_PROCESS_LIBS and library files.
+# Will set ${PREFIX}_FOUND, ${PREFIX}_INCLUDE_DIRS and ${PREFIX}_LIBRARIES (plus the *_OPTS name lists).
+# On failure: FATAL_ERROR if the package was REQUIRED, otherwise a WARNING unless QUIET.
+function (libfind_process PREFIX)
+  # Skip processing if already processed during this configuration run
+  if (${PREFIX}_FOUND)
+    return()
+  endif()
+
+  set(found TRUE)  # Start with the assumption that the package was found
+
+  # Did we find any files? Did we miss includes? These are for formatting better error messages.
+  set(some_files FALSE)
+  set(missing_headers FALSE)
+
+  # Shorthands for some variables that we need often
+  set(quiet ${${PREFIX}_FIND_QUIETLY})
+  set(required ${${PREFIX}_FIND_REQUIRED})
+  set(exactver ${${PREFIX}_FIND_VERSION_EXACT})
+  set(findver "${${PREFIX}_FIND_VERSION}")
+  set(version "${${PREFIX}_VERSION}")
+
+  # Lists of config option names (all, includes, libs)
+  unset(configopts)
+  set(includeopts ${${PREFIX}_PROCESS_INCLUDES})
+  set(libraryopts ${${PREFIX}_PROCESS_LIBS})
+
+  # Collect option names from this package itself and from each of its dependencies
+  foreach (i ${PREFIX} ${${PREFIX}_DEPENDENCIES})
+    if (DEFINED ${i}_INCLUDE_OPTS OR DEFINED ${i}_LIBRARY_OPTS)
+      # The package seems to export option lists that we can use, woohoo!
+      list(APPEND includeopts ${${i}_INCLUDE_OPTS})
+      list(APPEND libraryopts ${${i}_LIBRARY_OPTS})
+    else()
+      # If plural forms don't exist or they equal singular forms
+      if ((NOT DEFINED ${i}_INCLUDE_DIRS AND NOT DEFINED ${i}_LIBRARIES) OR
+          (${i}_INCLUDE_DIR STREQUAL ${i}_INCLUDE_DIRS AND ${i}_LIBRARY STREQUAL ${i}_LIBRARIES))
+        # Singular forms can be used
+        if (DEFINED ${i}_INCLUDE_DIR)
+          list(APPEND includeopts ${i}_INCLUDE_DIR)
+        endif()
+        if (DEFINED ${i}_LIBRARY)
+          list(APPEND libraryopts ${i}_LIBRARY)
+        endif()
+      else()
+        # Oh no, we don't know the option names
+        message(FATAL_ERROR "We couldn't determine config variable names for ${i} includes and libs. Aieeh!")
+      endif()
+    endif()
+  endforeach()
+
+  if (includeopts)
+    list(REMOVE_DUPLICATES includeopts)
+  endif()
+
+  if (libraryopts)
+    list(REMOVE_DUPLICATES libraryopts)
+  endif()
+
+  string(REGEX REPLACE ".*[ ;]([^ ;]*(_INCLUDE_DIRS|_LIBRARIES))" "\\1" tmp "${includeopts} ${libraryopts}")
+  if (NOT tmp STREQUAL "${includeopts} ${libraryopts}")
+    message(AUTHOR_WARNING "Plural form ${tmp} found in config options of ${PREFIX}. This works as before but is now deprecated. Please only use singular forms INCLUDE_DIR and LIBRARY, and update your find scripts for LibFindMacros > 2.0 automatic dependency system (most often you can simply remove the PROCESS variables entirely).")
+  endif()
+
+  # Resolved include directories and library files (built up as CMake lists below)
+  unset(includes)
+  unset(libs)
+
+  # Process all includes and set found false if any are missing
+  foreach (i ${includeopts})
+    list(APPEND configopts ${i})
+    if (NOT "${${i}}" STREQUAL "${i}-NOTFOUND")
+      list(APPEND includes "${${i}}")
+    else()
+      set(found FALSE)
+      set(missing_headers TRUE)
+    endif()
+  endforeach()
+
+  # Process all libraries and set found false if any are missing
+  foreach (i ${libraryopts})
+    list(APPEND configopts ${i})
+    if (NOT "${${i}}" STREQUAL "${i}-NOTFOUND")
+      list(APPEND libs "${${i}}")
+    else()
+      set (found FALSE)
+    endif()
+  endforeach()
+
+  # Version checks
+  if (found AND findver)
+    if (NOT version)
+      message(WARNING "The find module for ${PREFIX} does not provide version information, so we'll just assume that it is OK. Please fix the module or remove package version requirements to get rid of this warning.")
+    elseif (version VERSION_LESS findver OR (exactver AND NOT version VERSION_EQUAL findver))
+      set(found FALSE)
+      set(version_unsuitable TRUE)
+    endif()
+  endif()
+
+  # If all-OK: hide config options, export variables (also for QUIET finds), print status and exit
+  if (found)
+    foreach (i ${configopts})
+      mark_as_advanced(${i})
+    endforeach()
+    set (${PREFIX}_INCLUDE_OPTS ${includeopts} PARENT_SCOPE)
+    set (${PREFIX}_LIBRARY_OPTS ${libraryopts} PARENT_SCOPE)
+    set (${PREFIX}_INCLUDE_DIRS ${includes} PARENT_SCOPE)
+    set (${PREFIX}_LIBRARIES ${libs} PARENT_SCOPE)
+    set (${PREFIX}_FOUND TRUE PARENT_SCOPE)
+    if (NOT quiet)
+      message(STATUS "Found ${PREFIX} ${${PREFIX}_VERSION}")
+      if (LIBFIND_DEBUG)
+        message(STATUS " ${PREFIX}_DEPENDENCIES=${${PREFIX}_DEPENDENCIES}")
+        message(STATUS " ${PREFIX}_INCLUDE_OPTS=${includeopts}")
+        message(STATUS " ${PREFIX}_INCLUDE_DIRS=${includes}")
+        message(STATUS " ${PREFIX}_LIBRARY_OPTS=${libraryopts}")
+        message(STATUS " ${PREFIX}_LIBRARIES=${libs}")
+      endif()
+    endif()
+    return()
+  endif()
+
+  # Format messages for debug info and the type of error
+  set(vars "Relevant CMake configuration variables:\n")
+  foreach (i ${configopts})
+    mark_as_advanced(CLEAR ${i})
+    set(val ${${i}})
+    if ("${val}" STREQUAL "${i}-NOTFOUND")
+      set (val "")
+    elseif (val AND NOT EXISTS ${val})
+      set (val "${val} (does not exist)")
+    else()
+      set(some_files TRUE)
+    endif()
+    set(vars "${vars} ${i}=${val}\n")
+  endforeach()
+  set(vars "${vars}You may use CMake GUI, cmake -D or ccmake to modify the values. Delete CMakeCache.txt to discard all values and force full re-detection if necessary.\n")
+  if (version_unsuitable)
+    set(msg "${PREFIX} ${${PREFIX}_VERSION} was found but")
+    if (exactver)
+      set(msg "${msg} only version ${findver} is acceptable.")
+    else()
+      set(msg "${msg} version ${findver} is the minimum requirement.")
+    endif()
+  else()
+    if (missing_headers)
+      set(msg "We could not find development headers for ${PREFIX}. Do you have the necessary dev package installed?")
+    elseif (some_files)
+      set(msg "We only found some files of ${PREFIX}, not all of them. Perhaps your installation is incomplete or maybe we just didn't look in the right place?")
+      if(findver)
+        set(msg "${msg} This could also be caused by incompatible version (if it helps, at least ${PREFIX} ${findver} should work).")
+      endif()
+    else()
+      set(msg "We were unable to find package ${PREFIX}.")
+    endif()
+  endif()
+
+  # Fatal error out if REQUIRED
+  if (required)
+    set(msg "REQUIRED PACKAGE NOT FOUND\n${msg} This package is REQUIRED and you need to install it or adjust CMake configuration in order to continue building ${CMAKE_PROJECT_NAME}.")
+    message(FATAL_ERROR "${msg}\n${vars}")
+  endif()
+  # Otherwise just print a nasty warning
+  if (NOT quiet)
+    message(WARNING "WARNING: MISSING PACKAGE\n${msg} This package is NOT REQUIRED and you may ignore this warning but by doing so you may miss some functionality of ${CMAKE_PROJECT_NAME}. \n${vars}")
+  endif()
+endfunction()
+
diff --git a/src/main.cpp b/src/main.cpp
index ae17ce0..7f12284 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1,5 +1,11 @@
 #include
 #include
+#include
+#include
+#include
+#include
+#include <sstream>
+#include <stdexcept>
 
 namespace {
 	const char g_testData[] = {
@@ -104,9 +110,72 @@ namespace {
 		"\n"
 		"\n"
 	};
+
+	// Runs html through HTML Tidy and returns the repaired markup as XML text.
+	// Throws std::runtime_error when configuration fails or a Tidy step returns a negative code.
+	std::string cleanHTML(const std::string &html){
+		// Initialize a Tidy document
+		TidyDoc tidyDoc = tidyCreate();
+		TidyBuffer tidyOutputBuffer = {0};
+
+		// Configure Tidy
+		// The flags tell Tidy to output XML and disable showing warnings
+		bool configSuccess = tidyOptSetBool(tidyDoc, TidyXmlOut, yes)
+			&& tidyOptSetBool(tidyDoc, TidyQuiet, yes)
+			&& tidyOptSetBool(tidyDoc, TidyNumEntities, yes)
+			&& tidyOptSetBool(tidyDoc, TidyShowWarnings, no);
+
+		int tidyResponseCode = -1;
+
+		// Parse input
+		if (configSuccess)
+			tidyResponseCode = tidyParseString(tidyDoc, html.c_str());
+
+		// Process HTML
+		if (tidyResponseCode >= 0)
+			tidyResponseCode = tidyCleanAndRepair(tidyDoc);
+
+		// Output the HTML to our buffer
+		if (tidyResponseCode >= 0)
+			tidyResponseCode = tidySaveBuffer(tidyDoc, &tidyOutputBuffer);
+
+		// Copy the result out before anything is freed. bp was zero-initialised above,
+		// so guard against it still being null instead of building a string from it.
+		std::string tidyResult;
+		if (tidyResponseCode >= 0 && tidyOutputBuffer.bp)
+			tidyResult = reinterpret_cast<const char*>(tidyOutputBuffer.bp);
+
+		// Free Tidy's memory on every path, including the error path below
+		tidyBufFree(&tidyOutputBuffer);
+		tidyRelease(tidyDoc);
+
+		// Any errors from Tidy?
+		if (tidyResponseCode < 0) {
+			std::ostringstream message;
+			message << "Tidy encountered an error while parsing an HTML response. Tidy response code: " << tidyResponseCode;
+			throw std::runtime_error(message.str());
+		}
+
+		return tidyResult;
+	}
 } //unnamed namespace
 
 int main (int argc, char* argv[]) {
-	std::cout << g_testData << std::endl;
+	const std::string tidyHtml(cleanHTML(g_testData));
+	std::cout << tidyHtml << std::endl;
+
+	//{
+	//	xmlpp::DomParser doc;
+
+	//	// 'response' contains your HTML
+	//	doc.parse_memory(tidyHtml);
+
+	//	xmlpp::Document* const document = doc.get_document();
+	//	xmlpp::Element* const root = document->get_root_node();
+
+	//	xmlpp::NodeSet elemns = root->find(argv[1]);
+	//	std::cout << elemns[0]->get_line() << std::endl;
+	//	std::cout << elemns.size() << std::endl;
+	//}
 	return 0;
 }