cercasogno/cercasogno_dizmaker.rb

#!/usr/bin/env ruby
#coding: utf-8

require 'pathname'
$APP_PATH = File.join(File.dirname(Pathname.new(__FILE__).realpath), "/")

require 'rubygems'
require 'net/http'
require 'nokogiri'
require 'yaml'

# Record describing one dictionary entry (or one sub-entry of an entry).
#   strDesc         - entry name / sub-entry description
#   strMeaning      - meaning text; filled in for sub-entries, nil for top-level entries
#   nNumber         - integer parsed from the text next to the entry
#   aSubDescs       - Array of DREAM_INFO sub-entries, or nil when there are none
#   strReferenceURL - address of the page the entry was read from (top-level entries only)
# Instances are serialised with YAML.dump at the end of the script.
DREAM_INFO = Struct.new(:strDesc, :strMeaning, :nNumber, :aSubDescs, :strReferenceURL)

# Strips the accent from accented vowels that are NOT the last letter of a
# word, and always maps c-cedilla / n-tilde to plain "c" / "n".
#
# The vowel patterns end in \B (non-word-boundary), so they only match when
# the accented vowel is directly followed by another word character; a
# word-final accent such as the one in "città" is therefore left untouched.
#
# str - any object; it is converted with to_s first.
#
# Returns a new String. The argument itself is never modified: String#to_s
# returns the receiver, so the value is duplicated before the in-place
# substitutions run (this also makes frozen strings and nil acceptable).
def DropIntermediateAccents(str)
	hAccents = {
		/[áàâäã]\B/ => 'a', /[ÃÄÂÀÁ]\B/ => 'A',
		/[éèêë]\B/ => 'e', /[ËÉÈÊ]\B/ => 'E',
		/[íìîï]\B/ => 'i', /[ÍÎÌÏ]\B/ => 'I',
		/[óòôöõ]\B/ => 'o', /[ÕÖÔÒÓ]\B/ => 'O',
		/[úùûü]\B/ => 'u', /[ÚÛÙÜ]\B/ => 'U',
		/ç/ => 'c', /Ç/ => 'C',
		/ñ/ => 'n', /Ñ/ => 'N'
	}
	strRet = str.to_s.dup
	hAccents.each do |regAccent, strPlain|
		strRet.gsub!(regAccent, strPlain)
	end
	strRet
end

# Normalises a raw text fragment for parsing.
#
# Steps, in order: trim, drop intermediate accents (DropIntermediateAccents),
# delete every CR/LF character (nothing is inserted in its place), collapse
# runs of two or more whitespace characters into one space, turn the
# typographic apostrophe into a plain ASCII one, and trim again.
#
# strText - the String to clean.
#
# Returns the cleaned String.
def GetPurifiedString(strText)
	strClean = DropIntermediateAccents(strText.strip)
	strClean = strClean.gsub(/\r|\n/, "")
	strClean = strClean.gsub(/\s{2,}/, " ")
	strClean = strClean.gsub("’", "'")
	strClean.strip
end

# Splits a "details" line into its individual sub-entries.
#
# Each sub-entry is recognised by regSubElement as: a description, one or more
# separator characters, a meaning, and a trailing integer. Description and
# meaning are cleaned with GetPurifiedString.
#
# strSectionRaw - the text to parse.
#
# Returns an Array of DREAM_INFO (strDesc, strMeaning and nNumber filled in),
# or nil when no sub-entry is found.
# Raises RuntimeError ("Critical error") when a cleaned description or meaning
# is empty or consists only of digits. Values that merely contain punctuation
# or digits are reported on $stderr but still accepted.
def ExtractSubsections(strSectionRaw)
	regSubElement = /^\W*([^0-9;:,.-]+)[\s;:,.-]+([^0-9;:=-]+?)\b\W+(\d+)/
	regInteger = /^\d+$/
	regSuspicious = /[-.:,;@#§+\\!"£$%&()=?^°\[\]*0-9]/

	aSubsections = []
	strRest = strSectionRaw
	# The pattern is anchored with ^, so it is re-applied to the text remaining
	# after each match rather than scanned across the whole string.
	while (mSubElement = regSubElement.match(strRest))
		diNew = DREAM_INFO.new(GetPurifiedString(mSubElement[1]), GetPurifiedString(mSubElement[2]), mSubElement[3].to_i)
		aSubsections << diNew

		# Kept strictly boolean: =~ yields an Integer or nil, and mixing that
		# with |= and a boolean would raise TypeError instead of reporting.
		bCriticalError = [diNew.strMeaning, diNew.strDesc].any? { |strField|
			strField.empty? || regInteger =~ strField
		}
		$stderr.puts "M: #{diNew.strMeaning}" if bCriticalError || regSuspicious =~ diNew.strMeaning
		$stderr.puts "D: #{diNew.strDesc}" if bCriticalError || regSuspicious =~ diNew.strDesc
		raise "Critical error" if bCriticalError

		strRest = mSubElement.post_match
	end

	aSubsections.empty? ? nil : aSubsections
end

# Parses one dictionary page and collects its entries.
#
# The text nodes next to br elements inside the "centraleUnico" table are
# visited in order. After cleaning with GetPurifiedString a node is either
#   * a head line (name ... = number), which starts a new DREAM_INFO, or
#   * a details line (desc: meaning, number ...), whose sub-entries (see
#     ExtractSubsections) are attached to the most recent head.
# Nodes matching neither pattern are skipped. Every new entry name is echoed
# to stdout as progress output.
#
# strReferenceUrl - stored verbatim in every DREAM_INFO created here.
# strHtml         - page markup handed to Nokogiri::HTML.
#
# Returns a Hash mapping entry name => DREAM_INFO.
# Raises RuntimeError ("A new item was expected") when a details line shows up
# before any head line has been seen.
def ExtractElementsFromHtml(strReferenceUrl, strHtml)
	# Selects the text nodes that sit next to br elements of the table
	# (ref: http://stackoverflow.com/questions/1485356/how-to-get-xpath-of-text-between-br-or-br)
	strXPathBR = "//table[@class=\"centraleUnico\"]/br/following-sibling::text() | //table[@class=\"centraleUnico\"]//br/preceding-sibling::text()"
	regHead = /^(.+?)\b.*=\s*(\d+)/
	regDetails = /^.+?:.+?,\s*\d+/

	hRet = {}
	diPrevItem = nil
	Nokogiri::HTML(strHtml).xpath(strXPathBR).each do |link|
		strText = GetPurifiedString(link.content)
		mHead = regHead.match(strText)
		bIsDetails = !regDetails.match(strText).nil?
		next unless mHead || bIsDetails

		# Once a head exists, anything that looks like details belongs to it.
		# NOTE(review): a later details line replaces aSubDescs instead of
		# appending to it - confirm this is intended.
		if diPrevItem && bIsDetails
			diPrevItem.aSubDescs = ExtractSubsections(strText)
			next
		end

		raise "A new item was expected" unless mHead
		diPrevItem = DREAM_INFO.new(mHead[1], nil, mHead[2].to_i, nil, strReferenceUrl)
		hRet[diPrevItem.strDesc] = diPrevItem
		print "#{diPrevItem.strDesc} "
	end

	hRet
end

# Driver: downloads one index page per letter listed below, merges the parsed
# entries into a single hash and writes it as YAML next to this script.
aLetters = %w{a b c d e f g h i l m n o p q r s t u v z}

HOST_URL = "www.metropolino.com"
hElements = Hash.new
conn = Net::HTTP.new(HOST_URL, 80)
aLetters.each do |strLetter|
	strPageAddress = "/smorfia/interpretazione-dei-sogni-" + strLetter + ".asp"
	# Net::HTTP#get returns just the response object; the page markup has to be
	# read from its body (destructuring the result leaves the page nil).
	response = conn.get(strPageAddress)
	if response.is_a? Net::HTTPSuccess then
		hElements.merge! ExtractElementsFromHtml(HOST_URL + strPageAddress, response.body)
	else
		puts "Error while retrieving #{strPageAddress} (Error #{response.code}: #{response.message})"
		# Raises the exception matching the HTTP status, aborting the run.
		response.error!
	end
end
# Terminates the progress line printed while parsing.
puts

File.open(File.join($APP_PATH, "sogno_diz.yml"), "w") do |fDst|
	fDst.write(YAML::dump(hElements))
end