cercasogno/cercasogno_dizmaker.rb

130 lines
4.2 KiB
Ruby
Raw Permalink Normal View History

2017-01-06 12:41:43 +00:00
#!/usr/bin/env ruby
#coding: utf-8
require 'pathname'
# Directory that contains this script (symlinks resolved through realpath),
# joined with "/" so it ends in a path separator. The output YAML file is
# written into this directory at the end of the script.
$APP_PATH = File.join(File.dirname(Pathname.new(__FILE__).realpath), "/")
require 'rubygems'
require 'net/http'
require 'nokogiri'
require 'yaml'
# Record describing one scraped entry.
#   strDesc         - entry description; also used as the key of the result hash
#   strMeaning      - meaning text (set for sub-entries, nil for head entries)
#   nNumber         - number associated with the entry, stored as an Integer
#   aSubDescs       - Array of DREAM_INFO sub-entries, or nil when there are none
#   strReferenceURL - address of the page the head entry was read from
DREAM_INFO = Struct.new(:strDesc, :strMeaning, :nNumber, :aSubDescs, :strReferenceURL)
# Replaces accented vowels that occur inside a word with their plain letter,
# while an accented vowel that closes a word is kept (for example "càsa"
# becomes "casa" but "città" is returned unchanged). Cedillas and tildes
# (ç, Ç, ñ, Ñ) are always replaced, wherever they appear.
#
# @param str [#to_s] text to normalise; the argument itself is never modified
# @return [String] a new string with the replacements applied
def DropIntermediateAccents(str)
  # The vowel patterns end in \B (non-word-boundary), so they only match when
  # the accented letter is directly followed by another word character.
  accents = {
    /[áàâäã]\B/ => 'a',
    /[ÃÄÂÀÁ]\B/ => 'A',
    /[éèêë]\B/  => 'e',
    /[ËÉÈÊ]\B/  => 'E',
    /[íìîï]\B/  => 'i',
    /[ÍÎÌÏ]\B/  => 'I',
    /[óòôöõ]\B/ => 'o',
    /[ÕÖÔÒÓ]\B/ => 'O',
    /[úùûü]\B/  => 'u',
    /[ÚÛÙÜ]\B/  => 'U',
    /ç/ => 'c', /Ç/ => 'C',
    /ñ/ => 'n', /Ñ/ => 'N'
  }
  # String#to_s returns the receiver itself, so copy it first: otherwise the
  # in-place substitutions below would alter the caller's string and would
  # raise FrozenError when that string is frozen.
  strRet = str.to_s.dup
  accents.each do |regAccent, strPlain|
    strRet.gsub!(regAccent, strPlain)
  end
  strRet
end
# Normalises a text fragment: trims it, drops mid-word accents via
# DropIntermediateAccents, removes line breaks, collapses runs of whitespace
# into a single space and turns typographic apostrophes into a plain "'".
#
# @param strText [String] raw text
# @return [String] the cleaned text, without leading or trailing whitespace
def GetPurifiedString(strText)
  strRet = DropIntermediateAccents(strText.strip)
  # Order matters: line breaks are deleted outright (not turned into spaces)
  # before the remaining whitespace runs are collapsed.
  strRet.gsub!(/\r|\n/, "")
  strRet.gsub!(/\s{2,}/, " ")
  # The apostrophe rule used to have an empty pattern, and substituting on ""
  # inserts "'" at every character position of the text.
  # NOTE(review): the pattern was presumably an invisible typographic
  # apostrophe (U+0092, as produced by Windows-1252 pages read as Latin-1);
  # U+2019 is covered as well -- confirm against the scraped pages.
  strRet.gsub!(/[\u0092\u2019]/, "'")
  strRet.strip
end
# Splits a details line into its sub-entries. Each sub-entry is matched as
# "<description> <separator> <meaning> <number>" and matching restarts on the
# text that follows the previous match until nothing more matches.
#
# Fields that contain punctuation or digits are echoed to $stderr as a
# warning ("M: " for the meaning, "D: " for the description).
#
# @param strSectionRaw [String] cleaned text of the details line
# @return [Array<DREAM_INFO>, nil] the sub-entries found, or nil when none
# @raise [RuntimeError] "Critical error" when a description or meaning is
#   empty or consists of digits only
def ExtractSubsections(strSectionRaw)
  regSubElement = /^\W*([^0-9;:,.-]+)[\s;:,.-]+([^0-9;:=-]+?)\b\W+(\d+)/
  regInteger = /^\d+$/
  regSuspicious = /[-.:,;@#§+\\!"£$%&()=?^°\[\]*0-9]/
  aSubsections = []
  strRest = strSectionRaw
  while (mSubElement = regSubElement.match(strRest))
    diNew = DREAM_INFO.new(GetPurifiedString(mSubElement[1]), GetPurifiedString(mSubElement[2]), mSubElement[3].to_i)
    aSubsections << diNew
    # Keep these as real booleans: =~ yields an Integer or nil, and combining
    # an Integer with a boolean through | raises TypeError.
    bDigitsOnly = !(regInteger =~ diNew.strMeaning).nil? || !(regInteger =~ diNew.strDesc).nil?
    bCriticalError = bDigitsOnly || diNew.strMeaning.empty? || diNew.strDesc.empty?
    $stderr.puts "M: " + diNew.strMeaning if bCriticalError || regSuspicious =~ diNew.strMeaning
    $stderr.puts "D: " + diNew.strDesc if bCriticalError || regSuspicious =~ diNew.strDesc
    raise "Critical error" if bCriticalError
    # Continue on whatever follows this match; the pattern is anchored with ^,
    # so it has to be applied to the remainder as a fresh string.
    strRest = mSubElement.post_match
  end
  aSubsections.empty? ? nil : aSubsections
end
# Parses one page and collects its entries.
#
# Every selected text node is cleaned with GetPurifiedString and then treated
# either as a head line ("<description> ... = <number>"), which starts a new
# entry, or as a details line ("...: ..., <number>"), whose sub-entries are
# attached to the most recent head entry. Text matching neither form is
# skipped. The description of each new entry is printed as progress output.
#
# @param strReferenceUrl [String] address stored in every head entry
# @param strHtml [String] HTML source of the page
# @return [Hash{String => DREAM_INFO}] head entries keyed by description
# @raise [RuntimeError] "A new item was expected" when a line has to start a
#   new entry but does not have the head form
def ExtractElementsFromHtml(strReferenceUrl, strHtml)
  # Selects the text nodes that follow a <br> placed directly inside the
  # table, plus the text nodes that precede any <br> below the table
  # (ref: http://stackoverflow.com/questions/1485356/how-to-get-xpath-of-text-between-br-or-br)
  strXPathBR = "//table[@class=\"centraleUnico\"]/br/following-sibling::text() | //table[@class=\"centraleUnico\"]//br/preceding-sibling::text()"
  regHead = /^(.+?)\b.*=\s*(\d+)/
  regDetails = /^.+?:.+?,\s*\d+/
  hEntries = {}
  diCurrent = nil
  Nokogiri::HTML(strHtml).xpath(strXPathBR).each do |ndText|
    strLine = GetPurifiedString(ndText.content)
    mHead = regHead.match(strLine)
    bDetails = !(regDetails =~ strLine).nil?
    next unless mHead || bDetails

    # A details line belongs to the latest head entry, when there is one.
    if diCurrent && bDetails
      diCurrent.aSubDescs = ExtractSubsections(strLine)
      next
    end

    raise "A new item was expected" unless mHead
    diCurrent = DREAM_INFO.new(mHead[1], nil, mHead[2].to_i, nil, strReferenceUrl)
    hEntries[diCurrent.strDesc] = diCurrent
    print diCurrent.strDesc + " "
  end
  hEntries
end
# Script body: downloads one index page per letter, merges the entries found
# on every page and dumps the resulting hash as YAML next to this script.
# The letters j, k, w, x and y are not requested.
aLetters = %w{a b c d e f g h i l m n o p q r s t u v z}
HOST_URL = "www.metropolino.com"
hElements = Hash.new
conn = Net::HTTP.new(HOST_URL, 80)
aLetters.each do |strLetter|
  strPageAddress = "/smorfia/interpretazione-dei-sogni-" + strLetter + ".asp"
  # Net::HTTP#get returns a single Net::HTTPResponse, so the page source has
  # to be read from the response body; destructuring the return value into
  # two variables leaves the second one nil and nothing would be parsed.
  response = conn.get(strPageAddress)
  if response.is_a?(Net::HTTPSuccess)
    hElements.merge!(ExtractElementsFromHtml(HOST_URL + strPageAddress, response.body))
  else
    puts "Error while retrieving #{strPageAddress} (Error #{response.code}: #{response.message})"
    # Raises the exception that corresponds to the HTTP status and so aborts
    # the run.
    response.error!
  end
end
# Ends the line of progress output printed while parsing.
puts
File.open(File.join($APP_PATH, "sogno_diz.yml"), "w") do |fDst|
  fDst.write(YAML::dump(hElements))
end