#!/usr/bin/env ruby
#coding: utf-8

# Downloads the per-letter dream-interpretation pages from HOST_URL, extracts
# every entry (description, number and optional sub-descriptions) and dumps
# the resulting hash as YAML to "sogno_diz.yml" next to this script.

require 'pathname'
# Directory containing this script (symlinks resolved), with a trailing slash.
$APP_PATH = File.join(File.dirname(Pathname.new(__FILE__).realpath), "/")

require 'rubygems'
require 'net/http'
require 'nokogiri'
require 'yaml'

# One dictionary entry. Sub-entries reuse the same struct with only the
# first three members set.
DREAM_INFO = Struct.new(:strDesc, :strMeaning, :nNumber, :aSubDescs, :strReferenceURL)

# Replaces accented letters with their plain ASCII counterpart.
#
# Accented vowels are only replaced where `\B` matches right after them
# (i.e. the vowel is not followed by a word boundary), so an accent on the
# last letter of a word is left alone. Cedillas and tildes (ç, ñ) are
# replaced wherever they appear.
#
# str     - any object; it is converted with #to_s.
# Returns a new String; the argument itself is never modified.
def DropIntermediateAccents(str)
  accents = {
    [/á\B/, /à\B/, /â\B/, /ä\B/, /ã\B/] => 'a',
    [/Ã\B/, /Ä\B/, /Â\B/, /À\B/, /Á\B/] => 'A',
    [/é\B/, /è\B/, /ê\B/, /ë\B/] => 'e',
    [/Ë\B/, /É\B/, /È\B/, /Ê\B/] => 'E',
    [/í\B/, /ì\B/, /î\B/, /ï\B/] => 'i',
    [/Í\B/, /Î\B/, /Ì\B/, /Ï\B/] => 'I',
    [/ó\B/, /ò\B/, /ô\B/, /ö\B/, /õ\B/] => 'o',
    [/Õ\B/, /Ö\B/, /Ô\B/, /Ò\B/, /Ó\B/] => 'O',
    [/ú\B/, /ù\B/, /û\B/, /ü\B/] => 'u',
    [/Ú\B/, /Û\B/, /Ù\B/, /Ü\B/] => 'U',
    [/ç/] => 'c',
    [/Ç/] => 'C',
    [/ñ/] => 'n',
    [/Ñ/] => 'N'
  }

  # String#to_s returns the receiver itself, so work on a copy: gsub! must
  # not alter (or fail on a frozen) caller-owned string.
  strRet = str.to_s.dup
  accents.each do |aPatterns, strPlain|
    aPatterns.each { |regAccent| strRet.gsub!(regAccent, strPlain) }
  end
  return strRet #.gsub(/[^\w\s.:,;@#§\[\]()=?!^"£$%&@°\\\/=*+-]\B/, "")
end

# Normalises a text fragment taken from the page.
#
# The text is stripped, passed through DropIntermediateAccents, then line
# breaks are removed, runs of two or more whitespace characters collapse to
# a single space and the typographic apostrophe becomes a plain one.
#
# strText - the raw String.
# Returns the cleaned, stripped String.
def GetPurifiedString(strText)
  strRet = DropIntermediateAccents(strText.strip())
  # The replacements run in insertion order: line breaks are deleted before
  # whitespace runs are collapsed.
  hCleaning = {"" => /\r|\n/, " " => /\s{2,}/, "'" => "’"}
  hCleaning.each do |strReplace, regMatch|
    strRet.gsub! regMatch, strReplace
  end
  return strRet.strip
end

# Splits a "details" line into its sub-entries.
#
# The line is consumed left to right; each match of the sub-element pattern
# yields a description, a meaning and a number.
#
# strSectionRaw - the details text.
# Returns an Array of DREAM_INFO (strDesc, strMeaning, nNumber set), or nil
# when nothing matched.
# Raises RuntimeError ("Critical error") when a description or meaning is
# empty or consists only of digits.
def ExtractSubsections(strSectionRaw)
  aSubsections = []
  regSubElement = /^\W*([^0-9;:,.-]+)[\s;:,.-]+([^0-9;:=-]+?)\b\W+(\d+)/
  regInteger = /^\d+$/
  # Characters that are unexpected in a clean field; a hit is only reported
  # on stderr, it does not abort the extraction.
  regSuspicious = /[-.:,;@#§+\\!"£$%&()=?^°\[\]*0-9]/

  mSubElement = regSubElement.match(strSectionRaw)
  nStart = 0
  while mSubElement do
    diNew = DREAM_INFO.new(GetPurifiedString(mSubElement[1]),
                           GetPurifiedString(mSubElement[2]),
                           mSubElement[3].to_i())
    aSubsections << diNew

    # Keep this a real boolean: `=~` yields an Integer index, and combining
    # an Integer with true/false through `|` raises TypeError.
    bCriticalError = !(regInteger =~ diNew.strMeaning).nil? ||
                     !(regInteger =~ diNew.strDesc).nil? ||
                     diNew.strMeaning.empty? ||
                     diNew.strDesc.empty?

    $stderr.puts "M: " + diNew.strMeaning if bCriticalError || regSuspicious =~ diNew.strMeaning
    $stderr.puts "D: " + diNew.strDesc if bCriticalError || regSuspicious =~ diNew.strDesc
    raise "Critical error" if bCriticalError
    # puts "#{diNew.strDesc}: #{diNew.strMeaning} (#{diNew.nNumber})"

    # end(0) is relative to the substring that was just matched, hence the
    # running offset into the full line.
    nStart += mSubElement.end(0)
    mSubElement = regSubElement.match(strSectionRaw[nStart..-1])
  end

  if aSubsections.length == 0
    return nil
  else
    return aSubsections
  end
end

# Extracts all dictionary entries from one HTML page.
#
# Every text node selected by the XPath below is purified and classified as
# a "head" line (description = number) or a "details" line (sub-entries).
# A details line is attached to the most recent head entry; any other
# matching line starts a new entry. Each new description is also printed to
# stdout as progress output.
#
# strReferenceUrl - stored in each entry as strReferenceURL.
# strHtml         - the page markup.
# Returns a Hash mapping description => DREAM_INFO.
# Raises RuntimeError ("A new item was expected") when a line that has to
# start a new entry does not match the head pattern.
def ExtractElementsFromHtml(strReferenceUrl, strHtml)
  # The following query fetches the text nodes next to <br/> nodes inside
  # the parent node "table"
  # (ref: http://stackoverflow.com/questions/1485356/how-to-get-xpath-of-text-between-br-or-br)
  strXPathBR = "//table[@class=\"centraleUnico\"]/br/following-sibling::text() | //table[@class=\"centraleUnico\"]//br/preceding-sibling::text()"
  regHead = /^(.+?)\b.*=\s*(\d+)/
  regDetails = /^.+?:.+?,\s*\d+/
  diPrevItem = nil
  hRet = {}

  doc = Nokogiri::HTML(strHtml)
  doc.xpath(strXPathBR).each do |link|
    strText = GetPurifiedString(link.content)
    bMatchHead, bMatchDetails = regHead =~ strText, regDetails =~ strText
    next unless bMatchHead || bMatchDetails

    if diPrevItem && bMatchDetails then
      diPrevItem.aSubDescs = ExtractSubsections(strText)
      # raise "An item's details were expected" unless mExtract
    else
      mExtract = regHead.match(strText)
      raise "A new item was expected" unless mExtract
      diPrevItem = DREAM_INFO.new(mExtract[1], nil, mExtract[2].to_i(), nil, strReferenceUrl)
      hRet[diPrevItem.strDesc] = diPrevItem
      print diPrevItem.strDesc + " "
    end
  end
  # strHtml.force_encoding("iso-8859-1").encode("UTF-8")
  return hRet
end

# Letters for which a page is requested.
aLetters = %w{a b c d e f g h i l m n o p q r s t u v z}
HOST_URL = "www.metropolino.com"

hElements = {}
conn = Net::HTTP.new(HOST_URL, 80)
aLetters.each do |strLetter|
  strPageAddress = "/smorfia/interpretazione-dei-sogni-" + strLetter + ".asp"
  # Net::HTTP#get returns just the response object; the page markup is read
  # from its body.
  response = conn.get(strPageAddress)
  if response.is_a? Net::HTTPSuccess then
    hElements.merge! ExtractElementsFromHtml(HOST_URL + strPageAddress, response.body)
  else
    puts "Error while retrieving #{strPageAddress} (Error #{response.code}: #{response.message})"
    response.error!
  end
end

# Terminate the progress line written by ExtractElementsFromHtml.
puts
File.open(File.join($APP_PATH, "sogno_diz.yml"), "w") do |fDst|
  fDst.write(YAML::dump(hElements))
end