diff --git a/CMakeLists.txt b/CMakeLists.txt index bc3be9c..5f36ecf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,19 +1,15 @@ -cmake_minimum_required(VERSION 2.8 FATAL_ERROR) +cmake_minimum_required(VERSION 3.0 FATAL_ERROR) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/") -project(duckscraper CXX) - -add_subdirectory(lib/tidy) -add_subdirectory(lib/curlcpp) +project(duckscraper VERSION 0.2 LANGUAGES CXX) include(GetGitRevisionDescription) find_package(PugiXML REQUIRED) +find_package(Boost 1.32.0 REQUIRED COMPONENTS program_options #[[required: src/main.cpp uses boost::program_options unconditionally, so fail at configure time like PugiXML does]]) set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -std=c++11 -Wall -Wextra -g -O0 -fno-omit-frame-pointer") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -std=c++11 -Wall -Wextra -g -O3 -fomit-frame-pointer") set(DEFAULT_USER_AGENT "DuckScraper") -set(PROJECT_VERSION_MAJOR "0") -set(PROJECT_VERSION_MINOR "1") set(PROJECT_VERSION_BETA "1") get_git_head_revision(GIT_REFSPEC PROJECT_VERSION_GIT) @@ -25,7 +21,8 @@ configure_file( include_directories(SYSTEM lib/tidy/include ${PUGIXML_INCLUDE_DIR} - ${CURLCPP_SOURCE_DIR}/include + lib/curlcpp/include + ${Boost_INCLUDE_DIRS} ) include_directories( src/ @@ -41,4 +38,15 @@ target_link_libraries(${PROJECT_NAME} tidy-share ${PUGIXML_LIBRARIES} curlcpp + ${Boost_LIBRARIES} ) + +#unset those variables so cmake files from dependencies won't complain about +#new/old policy. Those unset statements can be removed once both libraries bump +#their cmake_minimum_required to 3.0+. 
+unset(PROJECT_VERSION_MAJOR) +unset(PROJECT_VERSION_MINOR) +unset(PROJECT_VERSION) +set(BUILD_SHARED_LIB ON) #for tidy +add_subdirectory(lib/tidy) +add_subdirectory(lib/curlcpp) diff --git a/src/main.cpp b/src/main.cpp index abb9160..8b254f3 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -6,32 +6,84 @@ #include #include #include +#include +#include + +#define STRINGIZE_IMPL(s) #s +#define STRINGIZE(s) STRINGIZE_IMPL(s) + +namespace po = boost::program_options; namespace { typedef std::pair LineColType; LineColType line_col_from_offset ( ptrdiff_t parOffset, const std::string& parData ); + + const char* const g_version_string = + PROGRAM_NAME " v" STRINGIZE(VERSION_MAJOR) "." STRINGIZE(VERSION_MINOR) +#if VERSION_BETA + "b" +#endif + ; + /* Parses the command line into parVarMap. parVarMap is taken by reference so that the values stored and notified here are visible to the caller, which reads "input-url" and "xpath" from it afterwards. Returns true when the caller should stop without scraping (after printing help or version, or when input-url or xpath is missing) and false when both positional values are present. NOTE(review): the missing-argument paths also return true, so main exits with status 0 where the pre-patch code returned 2; consider reporting a distinct error result. */ + bool parse_commandline (int parArgc, char* parArgv[], po::variables_map& parVarMap) { + po::options_description desc("General"); + desc.add_options() + ("help,h", "Produces this help message") + ("version", "Prints the program's version and quits") + ; + po::options_description positional_options("Positional options"); + positional_options.add_options() + ("input-url", po::value(), "Input URL") + ("xpath", po::value(), "XPath expression") + ; + po::options_description all("Available options"); + all.add(desc).add(positional_options); + po::positional_options_description pd; + pd.add("input-url", 1).add("xpath", 1); + po::store(po::command_line_parser(parArgc, parArgv).options(all).positional(pd).run(), parVarMap); + po::notify(parVarMap); + + if (parVarMap.count("help")) { + po::options_description visible("Available options"); + visible.add(desc); + std::cout << "Usage: " << PROGRAM_NAME << " [options...] 
\n"; + std::cout << visible; + return true; + } + else if (parVarMap.count("version")) { + std::cout << g_version_string; + std::cout << " git revision " << VERSION_GIT << "\n"; + return true; + } + + if (parVarMap.count("input-url") == 0) { + std::cerr << "No input url specified, use --help for help" << std::endl; + //return 2; + return true; + } + if (parVarMap.count("xpath") == 0) { + std::cerr << "No XPath expression specified, use --help for help" << std::endl; + //return 2; + return true; + } + return false; + } } //unnamed namespace int main (int argc, char* argv[]) { - if (argc != 3) { - std::cerr << PROGRAM_NAME << "v" << VERSION_MAJOR << "." << VERSION_MINOR; -#if VERSION_BETA - std::cerr << "b"; -#endif - std::cerr << " git revision " << VERSION_GIT << "\n"; - std::cerr << "Default user agent is \"" << DEFAULT_USER_AGENT << "\"\n"; - std::cerr << "Usage: scraper " << std::endl; - return 2; + po::variables_map vm; + if (parse_commandline(argc, argv, vm)) { + return 0; } - - const char* const& url = argv[1]; - const char* const& xpath = argv[2]; - + const auto url = vm["input-url"].as(); + const auto xpath = vm["xpath"].as(); +#if !defined(NDEBUG) std::cout << "URL : " << url << "\n"; std::cout << "XPath: " << xpath << std::endl; +#endif - std::string tidyHtml = duck::getCleanHtml(url, false, false); + std::string tidyHtml = duck::getCleanHtml(vm["input-url"].as(), false, false); { pugi::xml_document doc; @@ -45,7 +97,7 @@ int main (int argc, char* argv[]) { return 1; } - pugi::xpath_node_set xpathRes = doc.select_nodes(xpath); + pugi::xpath_node_set xpathRes = doc.select_nodes(xpath.c_str()); for (pugi::xpath_node_set::const_iterator itFind(xpathRes.begin()), itFindEND(xpathRes.end()); itFind != itFindEND; ++itFind) { const pugi::xpath_node& node = *itFind; if (node.node()) {