Working example.
Invoke it with, e.g.: ./scraper http://www.dilbert.com '//div[@class='\''STR_Image'\'']/a/img/@src'
This commit is contained in:
parent
aa015ddd6a
commit
cb00e484fa
4 changed files with 50 additions and 10 deletions
3
.gitmodules
vendored
Normal file
3
.gitmodules
vendored
Normal file
|
@ -0,0 +1,3 @@
|
|||
[submodule "lib/curlcpp"]
|
||||
path = lib/curlcpp
|
||||
url = https://github.com/JosephP91/curlcpp.git
|
|
@ -3,15 +3,17 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
|
|||
project(scraper CXX)
|
||||
|
||||
add_subdirectory(lib/tidy)
|
||||
add_subdirectory(lib/curlcpp)
|
||||
|
||||
find_package(PugiXML REQUIRED)
|
||||
#find_package(CURL REQUIRED)
|
||||
#${CURL_INCLUDE_DIR}
|
||||
#${CURL_LIBRARIES}
|
||||
|
||||
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -std=c++11 -Wall -Wextra -g -O0 -fno-omit-frame-pointer")
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -std=c++11 -Wall -Wextra -g -O3 -fomit-frame-pointer")
|
||||
|
||||
include_directories(SYSTEM
|
||||
lib/tidy/include
|
||||
${PUGIXML_INCLUDE_DIR}
|
||||
${CURLCPP_SOURCE_DIR}/include
|
||||
)
|
||||
|
||||
add_executable(${PROJECT_NAME}
|
||||
|
@ -21,4 +23,5 @@ add_executable(${PROJECT_NAME}
|
|||
target_link_libraries(${PROJECT_NAME}
|
||||
tidy
|
||||
${PUGIXML_LIBRARIES}
|
||||
curlcpp
|
||||
)
|
||||
|
|
1
lib/curlcpp
Submodule
1
lib/curlcpp
Submodule
|
@ -0,0 +1 @@
|
|||
Subproject commit 05bad3db527ee3c76730d42104ddaaa8a6376a3f
|
47
src/main.cpp
47
src/main.cpp
|
@ -5,6 +5,9 @@
|
|||
#include <string>
|
||||
#include <pugixml.hpp>
|
||||
#include <sstream>
|
||||
#include <curl_easy.h>
|
||||
#include <cstring>
|
||||
#include <stack>
|
||||
|
||||
namespace {
|
||||
const char g_testData[] = {
|
||||
|
@ -113,7 +116,8 @@ namespace {
|
|||
std::string cleanHTML(const std::string &html){
|
||||
// Initialize a Tidy document
|
||||
TidyDoc tidyDoc = tidyCreate();
|
||||
TidyBuffer tidyOutputBuffer = {0};
|
||||
TidyBuffer tidyOutputBuffer;
|
||||
std::memset(&tidyOutputBuffer, 0, sizeof(TidyBuffer));
|
||||
|
||||
// Configure Tidy
|
||||
// The flags tell Tidy to output XML and disable showing warnings
|
||||
|
@ -150,9 +154,34 @@ namespace {
|
|||
} //unnamed namespace
|
||||
|
||||
int main (int argc, char* argv[]) {
|
||||
const std::string tidyHtml(cleanHTML(g_testData));
|
||||
std::cout << tidyHtml << std::endl;
|
||||
std::cout << "XPath: " << argv[1] << std::endl;
|
||||
std::string tidyHtml;
|
||||
if (argc != 3) {
|
||||
std::cerr << "Usage: scraper <URL> <XPath>" << std::endl;
|
||||
return 2;
|
||||
}
|
||||
|
||||
const char* const& url = argv[1];
|
||||
const char* const& xpath = argv[2];
|
||||
|
||||
std::cout << "URL : " << url << "\n";
|
||||
std::cout << "XPath: " << xpath << std::endl;
|
||||
|
||||
{
|
||||
std::ostringstream oss;
|
||||
curl::curl_easy easy(oss);
|
||||
easy.add(curl::curl_pair<CURLoption, std::string>(CURLOPT_URL, url));
|
||||
easy.add(curl::curl_pair<CURLoption, std::string>(CURLOPT_USERAGENT, "duckscraper"));
|
||||
easy.add(curl::curl_pair<CURLoption, long>(CURLOPT_FOLLOWLOCATION, 1L));
|
||||
try {
|
||||
easy.perform();
|
||||
}
|
||||
catch (curl_error& err) {
|
||||
std::stack<std::pair<std::string, std::string> > errors = err.what();
|
||||
err.print_traceback();
|
||||
return 1;
|
||||
}
|
||||
tidyHtml = cleanHTML(oss.str());
|
||||
}
|
||||
|
||||
{
|
||||
pugi::xml_document doc;
|
||||
|
@ -163,11 +192,15 @@ int main (int argc, char* argv[]) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
pugi::xpath_node_set xpathRes = doc.select_nodes(argv[1]);
|
||||
pugi::xpath_node_set xpathRes = doc.select_nodes(xpath);
|
||||
for (pugi::xpath_node_set::const_iterator itFind(xpathRes.begin()), itFindEND(xpathRes.end()); itFind != itFindEND; ++itFind) {
|
||||
const pugi::xpath_node& node = *itFind;
|
||||
std::cout << node.node().name() << ": ";
|
||||
std::cout << node.node().attribute("name").value() << "\n";
|
||||
if (node.node()) {
|
||||
std::cout << node.node().name() << ": " << node.node().value() << "\n";
|
||||
}
|
||||
else if (node.attribute()) {
|
||||
std::cout << node.attribute().name() << ": " << node.attribute().value() << "\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
|
|
Loading…
Reference in a new issue