130 lines
4.2 KiB
Ruby
130 lines
4.2 KiB
Ruby
|
#!/usr/bin/env ruby
|
|||
|
#coding: utf-8
|
|||
|
|
|||
|
require 'pathname'
|
|||
|
# Directory containing this script (symlinks resolved), with a trailing "/".
$APP_PATH = File.join(Pathname.new(__FILE__).realpath.dirname.to_s, "/")
|
|||
|
|
|||
|
require 'rubygems'
|
|||
|
require 'net/http'
|
|||
|
require 'nokogiri'
|
|||
|
require 'yaml'
|
|||
|
|
|||
|
# Record describing one scraped entry.
#   strDesc         - entry description (used as the key of the result hash)
#   strMeaning      - meaning text; only set for sub-entries
#   nNumber         - number attached to the entry
#   aSubDescs       - Array of DREAM_INFO sub-entries, or nil
#   strReferenceURL - address of the page the entry was read from
DREAM_INFO = Struct.new(
  :strDesc,
  :strMeaning,
  :nNumber,
  :aSubDescs,
  :strReferenceURL
)
|
|||
|
|
|||
|
# Replacement table used by DropIntermediateAccents: each key is a list of
# patterns, each value the plain letter substituted for every match.
# The vowel patterns end in \B, so an accented vowel is replaced only when the
# position right after it is NOT a word boundary; the cedilla and tilde
# patterns carry no such restriction and are replaced everywhere.
ACCENT_REPLACEMENTS = {
  [/á\B/, /à\B/, /â\B/, /ä\B/, /ã\B/] => 'a',
  [/Ã\B/, /Ä\B/, /Â\B/, /À\B/, /Á\B/] => 'A',
  [/é\B/, /è\B/, /ê\B/, /ë\B/] => 'e',
  [/Ë\B/, /É\B/, /È\B/, /Ê\B/] => 'E',
  [/í\B/, /ì\B/, /î\B/, /ï\B/] => 'i',
  [/Í\B/, /Î\B/, /Ì\B/, /Ï\B/] => 'I',
  [/ó\B/, /ò\B/, /ô\B/, /ö\B/, /õ\B/] => 'o',
  [/Õ\B/, /Ö\B/, /Ô\B/, /Ò\B/, /Ó\B/] => 'O',
  [/ú\B/, /ù\B/, /û\B/, /ü\B/] => 'u',
  [/Ú\B/, /Û\B/, /Ù\B/, /Ü\B/] => 'U',
  [/ç/] => 'c', [/Ç/] => 'C',
  [/ñ/] => 'n', [/Ñ/] => 'N'
}.freeze

# Returns a copy of +str+ (converted with to_s) in which the accented letters
# listed in ACCENT_REPLACEMENTS are replaced by their plain counterparts.
#
# str - any object; it is converted with to_s first.
#
# The argument itself is never modified: the substitutions run on a duplicate,
# so the caller's string is left intact and frozen inputs are accepted.
def DropIntermediateAccents(str)
  # to_s returns the receiver itself for a String, hence the explicit dup
  # before the in-place substitutions below.
  strRet = str.to_s.dup
  ACCENT_REPLACEMENTS.each do |aPatterns, strPlain|
    aPatterns.each { |regAccent| strRet.gsub!(regAccent, strPlain) }
  end
  # Previously disabled extra cleanup step, kept for reference:
  # .gsub(/[^\w\s.:,;@#§\[\]()=?!^"£$%&@°\\\/=*+-]\B/, "")
  strRet
end
|
|||
|
|
|||
|
# Normalises a piece of scraped text.
#
# Steps, in order: strip the input, pass it through DropIntermediateAccents,
# delete every carriage return / line feed, collapse each run of two or more
# whitespace characters into one space, turn the typographic apostrophe into a
# plain one, and strip the result again.
#
# strText - String to clean.
#
# Returns the cleaned String.
def GetPurifiedString(strText)
  strClean = DropIntermediateAccents(strText.strip)
  strClean = strClean.gsub(/\r|\n/, "")
  strClean = strClean.gsub(/\s{2,}/, " ")
  strClean = strClean.gsub("’", "'")
  strClean.strip
end
|
|||
|
|
|||
|
# Parses a details line into a list of DREAM_INFO sub-entries.
#
# The text is consumed left to right: every match of regSubElement yields a
# description (capture 1), a meaning (capture 2) and a number (capture 3), and
# the search then resumes on the text that follows the match.
#
# strSectionRaw - text to scan.
#
# Returns an Array of DREAM_INFO (strDesc, strMeaning and nNumber filled in),
# or nil when nothing matched.
# Raises RuntimeError ("Critical error") when a cleaned description or meaning
# is empty or matches regInteger (digits only).
def ExtractSubsections(strSectionRaw)
  regSubElement = /^\W*([^0-9;:,.-]+)[\s;:,.-]+([^0-9;:=-]+?)\b\W+(\d+)/
  regInteger = /^\d+$/
  # Characters whose presence in a parsed field is reported on stderr as a
  # warning; on their own they do not abort the parse.
  regSuspicious = /[-.:,;@#§+\\!"£$%&()=?^°\[\]*0-9]/

  aSubsections = []
  strRemaining = strSectionRaw
  while (mSubElement = regSubElement.match(strRemaining))
    diNew = DREAM_INFO.new(GetPurifiedString(mSubElement[1]),
                           GetPurifiedString(mSubElement[2]),
                           mSubElement[3].to_i)

    # any? reduces the checks to a real boolean. `=~` returns an Integer offset
    # on success, and combining that offset with a boolean through `|` raises
    # TypeError instead of reaching the intended "Critical error".
    bCriticalError = [diNew.strMeaning, diNew.strDesc].any? do |strField|
      strField.empty? || regInteger =~ strField
    end

    $stderr.puts "M: " + diNew.strMeaning if bCriticalError || regSuspicious =~ diNew.strMeaning
    $stderr.puts "D: " + diNew.strDesc if bCriticalError || regSuspicious =~ diNew.strDesc
    raise "Critical error" if bCriticalError

    aSubsections << diNew
    # Continue right after the text consumed by this match.
    strRemaining = mSubElement.post_match
  end

  aSubsections.empty? ? nil : aSubsections
end
|
|||
|
|
|||
|
# Builds the entries found in one HTML page.
#
# The page is parsed with Nokogiri and the text nodes selected by strXPathBR
# are visited in order. Each one is cleaned with GetPurifiedString and then
# classified:
#   * a line matching regDetails while an entry is already open supplies that
#     entry's sub-entries (via ExtractSubsections);
#   * any other line matching regHead or regDetails opens a new entry, whose
#     description and number come from regHead;
#   * lines matching neither pattern are skipped.
#
# strReferenceUrl - stored in every entry created from this page.
# strHtml         - HTML source of the page.
#
# Returns a Hash mapping each entry's strDesc to its DREAM_INFO.
# Raises RuntimeError ("A new item was expected") when a new entry has to be
# opened from a line that does not match regHead.
# Side effect: prints every new entry's description to stdout.
def ExtractElementsFromHtml(strReferenceUrl, strHtml)
  # Selects the text nodes that follow a <br> placed directly under the table,
  # plus the text nodes that precede any <br> inside it
  # (ref: http://stackoverflow.com/questions/1485356/how-to-get-xpath-of-text-between-br-or-br)
  strXPathBR = "//table[@class=\"centraleUnico\"]/br/following-sibling::text() | //table[@class=\"centraleUnico\"]//br/preceding-sibling::text()"

  regHead = /^(.+?)\b.*=\s*(\d+)/
  regDetails = /^.+?:.+?,\s*\d+/

  hItems = {}
  diCurrent = nil
  Nokogiri::HTML(strHtml).xpath(strXPathBR).each do |ndText|
    strLine = GetPurifiedString(ndText.content)
    mHead = regHead.match(strLine)
    bLooksLikeDetails = regDetails =~ strLine
    next unless mHead || bLooksLikeDetails

    if diCurrent && bLooksLikeDetails
      diCurrent.aSubDescs = ExtractSubsections(strLine)
    else
      raise "A new item was expected" unless mHead
      diCurrent = DREAM_INFO.new(mHead[1], nil, mHead[2].to_i, nil, strReferenceUrl)
      hItems[diCurrent.strDesc] = diCurrent
      print diCurrent.strDesc + " "
    end
  end

  # strHtml.force_encoding("iso-8859-1").encode("UTF-8")
  hItems
end
|
|||
|
|
|||
|
# Driver: downloads one page per initial letter, merges the entries parsed from
# each page and writes the combined result as YAML next to the script.
aLetters = %w{a b c d e f g h i l m n o p q r s t u v z}

HOST_URL = "www.metropolino.com"
hElements = Hash.new
conn = Net::HTTP.new(HOST_URL, 80)
aLetters.each do |strLetter|
  strPageAddress = "/smorfia/interpretazione-dei-sogni-" + strLetter + ".asp"
  # Net::HTTP#get returns only the response object; the page source has to be
  # read from response.body (the old "response, body = get(...)" pair form
  # leaves the body nil on current Ruby versions).
  response = conn.get(strPageAddress)
  if response.is_a? Net::HTTPSuccess
    hElements.merge! ExtractElementsFromHtml(HOST_URL + strPageAddress, response.body)
  else
    puts "Error while retrieving #{strPageAddress} (Error #{response.code}: #{response.message})"
    # Raises the exception that corresponds to the HTTP status.
    response.error!
  end
end
# Ends the line of descriptions printed while parsing.
puts

File.open(File.join($APP_PATH, "sogno_diz.yml"), "w") do |fDst|
  fDst.write(YAML::dump(hElements))
end
|