129 lines
4.2 KiB
Ruby
129 lines
4.2 KiB
Ruby
#!/usr/bin/env ruby
|
||
#coding: utf-8
|
||
|
||
require 'pathname'
|
||
# Directory that contains this script (symlinks resolved via realpath), with a
# trailing "/" appended; used below as the base directory for the YAML output.
$APP_PATH = File.join(File.dirname(Pathname.new(__FILE__).realpath), "/")
|
||
|
||
require 'rubygems'
|
||
require 'net/http'
|
||
require 'nokogiri'
|
||
require 'yaml'
|
||
|
||
# Record for one scraped entry (used both for top-level items and for their
# sub-entries):
#   strDesc         - description text of the entry (key of the result hash for
#                     top-level items)
#   strMeaning      - meaning text; only filled in for sub-entries, nil for
#                     top-level items
#   nNumber         - integer number associated with the entry
#   aSubDescs       - Array of DREAM_INFO sub-entries, or nil when none were found
#   strReferenceURL - address of the page the top-level item was read from
#                     (left nil for sub-entries)
DREAM_INFO = Struct.new(:strDesc, :strMeaning, :nNumber, :aSubDescs, :strReferenceURL)
|
||
|
||
# Replaces accented vowels that occur *inside* a word with their plain ASCII
# letter, leaving a word-final accented vowel untouched (the `\B` after the
# vowel only matches when the vowel is not followed by a word boundary).
# Cedillas and tilde-n (ç, Ç, ñ, Ñ) are replaced wherever they appear.
#
# str - any object; it is converted with #to_s (so nil yields "").
#
# Returns a new String. The argument itself is never modified, so frozen
# strings and strings still in use by the caller are safe to pass in.
def DropIntermediateAccents(str)
  hAccents = {
    /[áàâäã]\B/ => 'a',
    /[ÃÄÂÀÁ]\B/ => 'A',
    /[éèêë]\B/  => 'e',
    /[ËÉÈÊ]\B/  => 'E',
    /[íìîï]\B/  => 'i',
    /[ÍÎÌÏ]\B/  => 'I',
    /[óòôöõ]\B/ => 'o',
    /[ÕÖÔÒÓ]\B/ => 'O',
    /[úùûü]\B/  => 'u',
    /[ÚÛÙÜ]\B/  => 'U',
    /ç/ => 'c', /Ç/ => 'C',
    /ñ/ => 'n', /Ñ/ => 'N'
  }

  # String#to_s returns the receiver itself, so work on a copy before using
  # the in-place gsub!.
  strRet = str.to_s.dup
  hAccents.each do |regAccent, strPlain|
    strRet.gsub!(regAccent, strPlain)
  end
  strRet
end
|
||
|
||
# Normalises a raw text fragment:
#   1. trims it and drops intermediate accents (DropIntermediateAccents),
#   2. removes every carriage return / line feed,
#   3. collapses each run of two or more whitespace characters to one space,
#   4. replaces the typographic apostrophe (’) with a plain one ('),
#   5. trims the result again.
#
# strText - the String to clean (must respond to #strip).
#
# Returns the cleaned String.
def GetPurifiedString(strText)
  strClean = DropIntermediateAccents(strText.strip)
  strClean = strClean.gsub(/\r|\n/, "")
  strClean = strClean.gsub(/\s{2,}/, " ")
  strClean = strClean.gsub("’", "'")
  strClean.strip
end
|
||
|
||
# Splits a "details" line into its sub-entries.
#
# The line is consumed left to right; each sub-entry is expected to look like
#   <description> <separator(s)> <meaning> <non-word chars> <number>
# where description and meaning contain no digits. Every match becomes a
# DREAM_INFO with strDesc, strMeaning (both cleaned with GetPurifiedString)
# and nNumber filled in.
#
# Entries whose description or meaning still contains punctuation or digits
# are reported on $stderr ("M: ..." / "D: ...") but kept.
#
# strSectionRaw - the String holding the whole details line.
#
# Returns an Array of DREAM_INFO, or nil when nothing could be extracted.
# Raises RuntimeError ("Critical error") when a description or meaning ends up
# empty or consisting only of digits.
def ExtractSubsections(strSectionRaw)
  regSubElement = /^\W*([^0-9;:,.-]+)[\s;:,.-]+([^0-9;:=-]+?)\b\W+(\d+)/
  regInteger = /^\d+$/
  # Characters that hint at a badly split entry; only used for the warnings.
  regSuspicious = /[-.:,;@#§+\\!"£$%&()=?^°\[\]*0-9]/

  aSubsections = []
  nStart = 0
  mSubElement = regSubElement.match(strSectionRaw)
  while mSubElement
    diNew = DREAM_INFO.new(GetPurifiedString(mSubElement[1]),
                           GetPurifiedString(mSubElement[2]),
                           mSubElement[3].to_i)
    aSubsections << diNew

    # Build a real boolean: `=~` yields a match index, which cannot be
    # combined with true/false through `|`.
    bCriticalError = [diNew.strMeaning, diNew.strDesc].any? do |strField|
      strField.empty? || !(regInteger =~ strField).nil?
    end

    $stderr.puts "M: " + diNew.strMeaning if bCriticalError || regSuspicious =~ diNew.strMeaning
    $stderr.puts "D: " + diNew.strDesc if bCriticalError || regSuspicious =~ diNew.strDesc
    raise "Critical error" if bCriticalError

    # Continue right after the text consumed by this match; the pattern is
    # anchored with ^, so it is re-applied to the remaining tail.
    nStart += mSubElement.end(0)
    mSubElement = regSubElement.match(strSectionRaw[nStart..-1])
  end

  aSubsections.empty? ? nil : aSubsections
end
|
||
|
||
# Parses one HTML page and collects the items it lists.
#
# The text nodes around the <br> elements of the table with class
# "centraleUnico" are visited in document order. After cleaning with
# GetPurifiedString each text is classified as:
#   * a details line ("...: ..., <number>") - when an item has already been
#     seen, its sub-entries are parsed with ExtractSubsections and stored in
#     that item's aSubDescs;
#   * otherwise a head line ("<name> ... = <number>") - starts a new item,
#     stored in the result under its description (and echoed to stdout).
# Texts matching neither pattern are skipped.
#
# strReferenceUrl - address recorded in every item created from this page.
# strHtml         - the page markup handed to Nokogiri::HTML.
#
# Returns a Hash mapping item description => DREAM_INFO.
# Raises RuntimeError ("A new item was expected") when a details-looking line
# shows up before any item and does not match the head pattern either.
def ExtractElementsFromHtml(strReferenceUrl, strHtml)
  # The following query fetches the text around <br> nodes inside the table
  # (ref: http://stackoverflow.com/questions/1485356/how-to-get-xpath-of-text-between-br-or-br)
  strXPathBR = "//table[@class=\"centraleUnico\"]/br/following-sibling::text() | //table[@class=\"centraleUnico\"]//br/preceding-sibling::text()"

  regHead = /^(.+?)\b.*=\s*(\d+)/
  regDetails = /^.+?:.+?,\s*\d+/

  diPrevItem = nil
  hRet = {}
  doc = Nokogiri::HTML(strHtml)
  doc.xpath(strXPathBR).each do |node|
    strText = GetPurifiedString(node.content)
    mHead = regHead.match(strText)
    bMatchDetails = regDetails =~ strText
    next unless mHead || bMatchDetails

    if diPrevItem && bMatchDetails
      # Details always attach to the most recently created item.
      diPrevItem.aSubDescs = ExtractSubsections(strText)
    else
      raise "A new item was expected" unless mHead
      diPrevItem = DREAM_INFO.new(mHead[1], nil, mHead[2].to_i, nil, strReferenceUrl)
      hRet[diPrevItem.strDesc] = diPrevItem
      print diPrevItem.strDesc + " "
    end
  end

  hRet
end
|
||
|
||
# Driver: download one index page per letter, merge all extracted items and
# dump them as YAML next to this script.

# Letters for which a page is requested (j, k, w, x and y are not in the list).
aLetters = %w{a b c d e f g h i l m n o p q r s t u v z}

HOST_URL = "www.metropolino.com"
hElements = Hash.new
conn = Net::HTTP.new(HOST_URL, 80)
aLetters.each do |strLetter|
  strPageAddress = "/smorfia/interpretazione-dei-sogni-" + strLetter + ".asp"
  # Net::HTTP#get returns only the response object; the page markup has to be
  # read from its body (destructuring the return value leaves the body nil).
  response = conn.get(strPageAddress)
  if response.is_a? Net::HTTPSuccess then
    hElements.merge! ExtractElementsFromHtml(HOST_URL + strPageAddress, response.body)
  else
    puts "Error while retrieving #{strPageAddress} (Error #{response.code}: #{response.message})"
    # Raises the exception matching the HTTP status, aborting the run.
    response.error!
  end
end
# Terminate the line of item names printed while parsing.
puts

File.open(File.join($APP_PATH, "sogno_diz.yml"), "w") do |fDst|
  fDst.write(YAML::dump(hElements))
end
|