cercasogno/cercasogno_dizmaker.rb
2017-01-06 12:41:43 +00:00

129 lines
4.2 KiB
Ruby
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env ruby
#coding: utf-8
require 'pathname'
# Directory that contains this script, computed from the real (symlink-resolved)
# path of __FILE__ and joined with a trailing "/". Stored in a global so the
# output file can later be created next to the script.
$APP_PATH = File.join(File.dirname(Pathname.new(__FILE__).realpath), "/")
require 'rubygems'
require 'net/http'
require 'nokogiri'
require 'yaml'
# Record describing one scraped entry.
#   strDesc         - description text; top-level entries are keyed by it in the result hash
#   strMeaning      - meaning text (set for sub-entries, nil for top-level entries)
#   nNumber         - Integer parsed from the digits found in the page text
#   aSubDescs       - Array of DREAM_INFO sub-entries, or nil when there are none
#   strReferenceURL - address of the page the entry came from (nil for sub-entries)
DREAM_INFO = Struct.new(:strDesc, :strMeaning, :nNumber, :aSubDescs, :strReferenceURL)
# Strips accents that occur inside a word and always flattens c-cedilla and
# n-tilde to their plain letters.
#
# Every vowel pattern ends in \B (non-word-boundary), so it only matches an
# accented vowel that is followed by another word character; an accent on the
# last letter of a word is therefore kept. The cedilla and tilde patterns have
# no \B and are replaced wherever they appear.
#
# @param str [#to_s] text to normalise; the argument itself is never modified
# @return [String] a new string with the replacements applied
def DropIntermediateAccents(str)
  accents = {
    /[áàâäã]\B/ => 'a', /[ÃÄÂÀÁ]\B/ => 'A',
    /[éèêë]\B/  => 'e', /[ËÉÈÊ]\B/  => 'E',
    /[íìîï]\B/  => 'i', /[ÍÎÌÏ]\B/  => 'I',
    /[óòôöõ]\B/ => 'o', /[ÕÖÔÒÓ]\B/ => 'O',
    /[úùûü]\B/  => 'u', /[ÚÛÙÜ]\B/  => 'U',
    /ç/ => 'c', /Ç/ => 'C',
    /ñ/ => 'n', /Ñ/ => 'N'
  }

  # Work on a private copy: String#to_s returns the receiver itself (so gsub!
  # would clobber the caller's string) and the result may be frozen (frozen
  # literals, nil.to_s on recent Rubies), which would make gsub! raise.
  strRet = str.to_s.dup
  accents.each { |regAccent, strPlain| strRet.gsub!(regAccent, strPlain) }
  strRet
end
# Normalises a piece of scraped text: trims it, drops intermediate accents
# (via DropIntermediateAccents), deletes CR/LF characters, collapses runs of
# two or more whitespace characters into one space and converts typographic
# apostrophes to a plain "'".
#
# Line breaks are deleted, not replaced by a space, so "foo\nbar" becomes
# "foobar".
#
# @param strText [#to_s] raw text (nil is treated as an empty string)
# @return [String] the cleaned, trimmed text
def GetPurifiedString(strText)
  strRet = DropIntermediateAccents(strText.to_s.strip)

  # Ordered cleaning rules: [pattern, replacement].
  aCleaning = [
    [/\r|\n/, ""],
    [/\s{2,}/, " "],
    # NOTE(review): the pattern of this rule was an empty string in the
    # reviewed source, which makes gsub insert an apostrophe between every
    # character. It presumably held a non-printing apostrophe character;
    # U+2019 and U+0092 are both decodings of the CP1252 byte 0x92 -- confirm
    # against the scraped pages.
    [/[\u2019\u0092]/, "'"]
  ]
  aCleaning.each do |regMatch, strReplace|
    strRet = strRet.gsub(regMatch, strReplace)
  end

  strRet.strip
end
# Parses a details line into a list of sub-entries.
#
# The text is consumed left to right; each match of the sub-element pattern
# yields a description, a meaning and a number, and parsing resumes right after
# the matched text. Description and meaning are cleaned with GetPurifiedString.
#
# Fields that contain punctuation or digits are reported on $stderr ("M: ..."
# for the meaning, "D: ..." for the description) but are still accepted.
#
# @param strSectionRaw [String, nil] raw details text
# @return [Array<DREAM_INFO>, nil] the parsed sub-entries, or nil when the text
#   contains none
# @raise [RuntimeError] "Critical error" when a parsed description or meaning
#   is empty or consists only of digits
def ExtractSubsections(strSectionRaw)
  regSubElement = /^\W*([^0-9;:,.-]+)[\s;:,.-]+([^0-9;:=-]+?)\b\W+(\d+)/
  regInteger = /^\d+$/
  regSuspicious = /[-.:,;@#§+\\!"£$%&()=?^°\[\]*0-9]/

  aSubsections = []
  strRest = strSectionRaw
  while (mSubElement = regSubElement.match(strRest))
    diNew = DREAM_INFO.new(GetPurifiedString(mSubElement[1]),
                           GetPurifiedString(mSubElement[2]),
                           mSubElement[3].to_i)
    aSubsections << diNew

    # any? always yields true/false. Combining the Integer index returned by
    # =~ with a boolean through |= would raise TypeError and hide the
    # diagnostics below.
    bCriticalError = [diNew.strMeaning, diNew.strDesc].any? do |strField|
      strField.empty? || regInteger =~ strField
    end

    $stderr.puts "M: " + diNew.strMeaning if bCriticalError || regSuspicious =~ diNew.strMeaning
    $stderr.puts "D: " + diNew.strDesc if bCriticalError || regSuspicious =~ diNew.strDesc
    raise "Critical error" if bCriticalError

    # Continue with whatever follows the text matched so far.
    strRest = strRest[mSubElement.end(0)..-1]
  end

  aSubsections.empty? ? nil : aSubsections
end
# Builds the entries found in one HTML page.
#
# The page is parsed with Nokogiri and every text node selected by the XPath
# below is cleaned with GetPurifiedString and classified:
#   * head line    - matches "<text> ... = <digits>"; starts a new entry whose
#                    description is the text up to the first word boundary
#   * details line - matches "<text>:<text>, <digits>"; when an entry has
#                    already been started, the line is parsed by
#                    ExtractSubsections and stored as that entry's aSubDescs
# A line that matches both patterns is treated as details whenever a previous
# entry exists. Lines that match neither pattern are skipped.
#
# Each new description is also printed to stdout, followed by a space, as a
# progress indicator.
#
# @param strReferenceUrl [String] address stored in every entry created here
# @param strHtml [String] HTML source of the page
# @return [Hash{String => DREAM_INFO}] entries keyed by description; a repeated
#   description replaces the earlier entry
# @raise [RuntimeError] "A new item was expected" when a details-looking line
#   appears before any entry and does not match the head pattern
def ExtractElementsFromHtml(strReferenceUrl, strHtml)
  # Selects the text nodes that follow a <br> which is a direct child of the
  # "centraleUnico" table, plus the text nodes that precede any <br> below it
  # (ref: http://stackoverflow.com/questions/1485356/how-to-get-xpath-of-text-between-br-or-br)
  strXPathBR = '//table[@class="centraleUnico"]/br/following-sibling::text() | //table[@class="centraleUnico"]//br/preceding-sibling::text()'
  regHead = /^(.+?)\b.*=\s*(\d+)/
  regDetails = /^.+?:.+?,\s*\d+/

  diPrevItem = nil
  hRet = {}
  Nokogiri::HTML(strHtml).xpath(strXPathBR).each do |nodeText|
    strText = GetPurifiedString(nodeText.content)
    mHead = regHead.match(strText)
    bMatchDetails = regDetails =~ strText
    next unless mHead || bMatchDetails

    if diPrevItem && bMatchDetails
      diPrevItem.aSubDescs = ExtractSubsections(strText)
    else
      raise "A new item was expected" unless mHead
      diPrevItem = DREAM_INFO.new(mHead[1], nil, mHead[2].to_i, nil, strReferenceUrl)
      hRet[diPrevItem.strDesc] = diPrevItem
      print diPrevItem.strDesc + " "
    end
  end

  hRet
end
# Script body: download one page per initial letter, merge the entries scraped
# from every page and dump the result as YAML to "sogno_diz.yml" in $APP_PATH.
# The letter list omits j, k, w, x and y, so no page is requested for them.
aLetters = %w{a b c d e f g h i l m n o p q r s t u v z}
HOST_URL = "www.metropolino.com"
hElements = {}
conn = Net::HTTP.new(HOST_URL, 80)
aLetters.each do |strLetter|
  strPageAddress = "/smorfia/interpretazione-dei-sogni-" + strLetter + ".asp"
  # Net::HTTP#get returns a single Net::HTTPResponse. Destructuring it as
  # "response, body" leaves the body nil on current Rubies, which produced an
  # empty dictionary, so the page source is read from the response itself.
  response = conn.get(strPageAddress)
  if response.is_a? Net::HTTPSuccess
    hElements.merge! ExtractElementsFromHtml(HOST_URL + strPageAddress, response.body)
  else
    puts "Error while retrieving #{strPageAddress} (Error #{response.code}: #{response.message})"
    # Raises the exception that corresponds to the HTTP status, aborting the run.
    response.error!
  end
end
# Terminates the line of space-separated descriptions printed while scraping.
puts
File.open(File.join($APP_PATH, "sogno_diz.yml"), "w") do |fDst|
  fDst.write(YAML::dump(hElements))
end