diff --git a/CMakeLists.txt b/CMakeLists.txt index 29d6f2b..f60afee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,6 +35,7 @@ add_executable(${PROJECT_NAME} src/main.cpp src/htmlretrieve.cpp src/commandline.cpp + src/scraplang/scraplang.cpp ) if (BUILD_SHARED_TIDY) diff --git a/src/scraplang/scrapast.hpp b/src/scraplang/scrapast.hpp new file mode 100644 index 0000000..70b19d6 --- /dev/null +++ b/src/scraplang/scrapast.hpp @@ -0,0 +1,57 @@ +#ifndef id9919CCB09DDD429C8128632F13D370ED +#define id9919CCB09DDD429C8128632F13D370ED + +#include "scraplang_element.hpp" +#include +#include +#include +#include + +namespace duck { + struct ScrapNode; + + namespace implem { + struct map; + struct array; + + /* Value variant: one of two recursive_wrapper alternatives, a std::string, an int or a double. Every converting constructor simply forwards its argument to the variant base. */ struct element : boost::spirit::extended_variant< + boost::recursive_wrapper, + boost::recursive_wrapper, + std::string, + int, + double + > + { + element ( void ) = default; + element ( const map& parOther ) : base_type(parOther) {} + element ( const array& parOther ) : base_type(parOther) {} + element ( const std::string& parOther ) : base_type(parOther) {} + element ( double parOther ) : base_type(parOther) {} + element ( int parOther ) : base_type(parOther) {} + }; + + /* Associative container type used as the map alternative of element and ScrapNode. */ struct map : std::map { + }; + + /* Sequence container type used as the array alternative of element. */ struct array : std::vector { + }; + + /* Holds a sequence of child nodes in the member nodes. */ struct node_list { + std::vector nodes; + }; + } //namespace implem + + /* Top-level AST node: a variant over element_def, implem::map and implem::node_list, implicitly constructible from each of them. */ struct ScrapNode : boost::spirit::extended_variant< + element_def, + implem::map, + implem::node_list + > + { + ScrapNode ( void ) = default; + ScrapNode ( const element_def& parOther ) : base_type(parOther) {} + ScrapNode ( const implem::map& parOther ) : base_type(parOther) {} + ScrapNode ( const implem::node_list& parOther ) : base_type(parOther) {} + }; +} //namespace duck + +#endif diff --git a/src/scraplang/scraplang.cpp b/src/scraplang/scraplang.cpp new file mode 100644 index 0000000..15bac0a --- /dev/null +++ b/src/scraplang/scraplang.cpp @@ -0,0 +1,88 @@ +#include "scraplang.hpp" +#include "scrapast.hpp" +#include "scraplang_visit_xpath.hpp" 
+#include +#include +#include +#include +#include +#include + +#include + +namespace qi = boost::spirit::qi; +namespace sp = boost::spirit; + +/* Adapt duck::element_def (name, xpath, type) as a Boost.Fusion sequence so the grammar can fill it member by member. */ BOOST_FUSION_ADAPT_STRUCT( + duck::element_def, + (std::string, name) + (std::string, xpath) + (duck::ElementTypes, type) +) +/* Adapt duck::implem::node_list (single member: nodes) as a Boost.Fusion sequence. */ BOOST_FUSION_ADAPT_STRUCT( + duck::implem::node_list, + (std::vector, nodes) +) + +namespace duck { + namespace { + /* Symbol table mapping the keywords "string", "integer", "boolean", "null" and "double" to the matching ElementType_* enumerator. */ struct ElementTypeSymbol : qi::symbols { + ElementTypeSymbol() { + add + ("string", ElementType_String) + ("integer", ElementType_Integer) + ("boolean", ElementType_Boolean) + ("null", ElementType_Null) + ("double", ElementType_Double) + ; + } + }; + + template + /* Spirit.Qi grammar for the scraping language: zero or more xpath definitions of the form identifier = "quoted string" as data_type, optionally followed by a single { key = value, ... } map. */ struct ScrapGrammar : qi::grammar { + ScrapGrammar() : ScrapGrammar::base_type(start) { + using qi::lit; + using qi::char_; + using qi::lexeme; + using qi::double_; + using qi::int_; + using qi::eps; + + start = whole; + whole = eps >> *xpath_definition >> -map; + xpath_definition = identifier >> lit('=') >> string >> "as" >> data_type; + identifier = (char_('a', 'z') | char_('A', 'Z') | '_') >> *(char_('a', 'z') | char_('A', 'Z') | '_' | char_('0', '9')); + string %= lexeme['"' >> +(char_ - '"') >> '"']; + map = lit('{') >> ((identifier >> lit('=') >> value) % lit(',')) >> lit('}'); + /* NOTE(review): the Kleene star around a comma list accepts several lists back to back, e.g. [1,2 3]; an optional list, -(value % ','), is probably what was meant - confirm. */ array = lit('[') >> *(value % lit(',')) >> lit(']'); + /* NOTE(review): qi::double_ also matches plain integer text, so the int_ alternative placed after it is presumably never selected; a strict real parser would keep the two apart - confirm. */ value = string | double_ | int_ | array | map | identifier; + } + + qi::rule start; + qi::rule whole; + qi::rule xpath_definition; + qi::rule identifier; + qi::rule string; + qi::rule map; + qi::rule array; + qi::rule value; + ElementTypeSymbol data_type; + }; + } //unnamed namespace + + /* Parses parData with ScrapGrammar, skipping ASCII whitespace, into a newly allocated ScrapNode and returns it. NOTE(review): the result of qi::phrase_parse is not checked, so a failed or partial parse still yields a node. The local is returned by name (no std::move) so copy elision stays possible. */ std::unique_ptr parse_scraplang (const std::string& parData) { + ScrapGrammar gramm; + std::unique_ptr retval(new ScrapNode); + auto it_start = parData.cbegin(); + + qi::phrase_parse(it_start, parData.cend(), gramm, sp::ascii::space, *retval); + return retval; + } + + /* Applies implem::XPathVisitor to parAST and returns the element_def entries the visitor collected. The local is returned by name (no std::move) so copy elision stays possible. */ std::vector get_xpath_definitions (const ScrapNode& parAST) { + std::vector retval; + implem::XPathVisitor xpath_vis(&retval); + 
boost::apply_visitor(xpath_vis, parAST); + return retval; + } +} //namespace duck diff --git a/src/scraplang/scraplang.hpp b/src/scraplang/scraplang.hpp new file mode 100644 index 0000000..f3b4f79 --- /dev/null +++ b/src/scraplang/scraplang.hpp @@ -0,0 +1,16 @@ +#ifndef idBE96C2D49C4C413888A79EAEB2B9C0FA +#define idBE96C2D49C4C413888A79EAEB2B9C0FA + +#include +#include +#include + +namespace duck { + struct ScrapNode; + struct element_def; + + std::unique_ptr parse_scraplang ( const std::string& parData ); + std::vector get_xpath_definitions ( const ScrapNode& parAST ); +} //namespace duck + +#endif diff --git a/src/scraplang/scraplang_element.hpp b/src/scraplang/scraplang_element.hpp new file mode 100644 index 0000000..4dc4b72 --- /dev/null +++ b/src/scraplang/scraplang_element.hpp @@ -0,0 +1,22 @@ +#ifndef id3875B5F868524EC3A1B83971D4A85777 +#define id3875B5F868524EC3A1B83971D4A85777 + +#include + +namespace duck { + /* Data types an element definition can be declared as. */ enum ElementTypes { + ElementType_String, + ElementType_Integer, + ElementType_Boolean, + ElementType_Null, + ElementType_Double + }; + + /* One xpath definition: its name, its xpath expression and its declared type. */ struct element_def { + std::string name; + std::string xpath; + ElementTypes type; + }; +} //namespace duck + +#endif diff --git a/src/scraplang/scraplang_visit_xpath.hpp b/src/scraplang/scraplang_visit_xpath.hpp new file mode 100644 index 0000000..5bba99a --- /dev/null +++ b/src/scraplang/scraplang_visit_xpath.hpp @@ -0,0 +1,44 @@ +#ifndef id7648347E8EE84E65B69018880358C8DF +#define id7648347E8EE84E65B69018880358C8DF + +#include "scrapast.hpp" +#include + +namespace duck { + namespace implem { + /* Visitor that appends every element_def it reaches to the vector supplied at construction; maps are ignored and node lists are visited node by node. The vector is held by pointer and is not owned. */ class XPathVisitor { + public: + typedef void result_type; + + explicit XPathVisitor ( std::vector* parElements ); + + void operator() ( const element_def& parElem ); + void operator() ( const implem::map& parMap ); + void operator() ( const node_list& parNodes ); + + private: + std::vector* const m_elements; + }; + + inline XPathVisitor::XPathVisitor (std::vector* parElements) : + m_elements(parElements) + { + 
} + + inline void XPathVisitor::operator() (const element_def& parElem) { + m_elements->push_back(parElem); + } + + inline void XPathVisitor::operator() (const implem::map&) { + return; + } + + inline void XPathVisitor::operator() (const node_list& parNodes) { + for (const auto& node : parNodes.nodes) { + boost::apply_visitor(*this, node); + } + } + } //namespace implem +} //namespace duck + +#endif