# -*- coding: utf-8 -*-
'''Scrape the Finnish MSN hourly weather forecast page.

getweather() turns the page HTML into a list of (datetime, weather state)
pairs.  Running the module as a script executes a few self-tests and then
fetches and prints the live forecast.
'''

import datetime
import urllib
import re

from BeautifulSoup import BeautifulSoup

# Every state that text_to_state() can return.
weather_states = ['cloud', 'fog', 'rain', 'snow', 'storm', 'suncloud', 'sun', 'unknown']

# (Finnish keyword, state) pairs.  Order matters: the first keyword found in
# the forecast text wins, so 'sade' (rain) beats a later 'selkeää' (clear).
fi_msn_states = [
    ('lumi', 'snow'),
    ('sade', 'rain'),
    ('ukkosta', 'storm'),
    (u'puolipilvistä', 'suncloud'),
    ('sumu', 'fog'),
    ('pilvi', 'cloud'),
    (u'selkeää', 'sun')
]

# Weekday names, Monday first, so the list index equals date.weekday().
fi_msn_days = ['maanantai', 'tiistai', 'keskiviikko', 'torstai', 'perjantai', 'lauantai', 'sunnuntai']

# Month-name stems, January first.  Matched as prefixes so that 'elo',
# 'elokuu' and 'elokuuta' all resolve to August.
fi_msn_months = ['tammi', 'helmi', 'maalis', 'huhti', 'touko', u'kesä', u'heinä', 'elo', 'syys', 'loka', 'marras', 'joulu']


def _to_text(value):
    '''Return value as text, decoding byte strings as UTF-8.'''
    if isinstance(value, bytes):
        return value.decode('utf-8', 'replace')
    return value


def _index_by_prefix(names, token):
    '''Return the index of the first entry in names that token starts with.

    Raises ValueError when no entry matches.
    '''
    for index, name in enumerate(names):
        if token.startswith(name):
            return index
    raise ValueError('unrecognised name: %r' % (token,))


def text_to_state(text_in):
    '''Convert a textual representation to weather state.

    Matching is case-insensitive and keyword based (see fi_msn_states).
    Returns 'unknown' when text_in is None or contains no known keyword.
    '''
    if text_in is None:
        return 'unknown'
    text = _to_text(text_in).lower()
    for keyword, state in fi_msn_states:
        if keyword in text:
            return state
    return 'unknown'


def inner_get_date(text_in, date_now=None):
    '''Parse a Finnish date such as "20. elokuu" or "torstai 20. elokuu".

    text_in  -- "[weekday] day. month"; the weekday is optional.
    date_now -- object with .year and .month used to choose the year;
                defaults to the current local time.

    Returns a datetime.date.  Raises ValueError when the text cannot be
    parsed, names an impossible date, or carries a weekday that does not
    match the resulting date.
    '''
    tokens = _to_text(text_in).lower().split()
    if len(tokens) == 2:
        weekday_text = None
        day_text, month_text = tokens
    elif len(tokens) == 3:
        weekday_text, day_text, month_text = tokens
    else:
        raise ValueError('expected "[weekday] day. month", got %r' % (text_in,))

    dom_number = int(day_text.strip('.'))
    month_number = _index_by_prefix(fi_msn_months, month_text) + 1

    if date_now is None:
        date_now = datetime.datetime.now()
    result_year = date_now.year
    # A January date seen in December belongs to the following year.
    if date_now.month == 12 and month_number == 1:
        result_year += 1

    date = datetime.date(result_year, month_number, dom_number)
    if weekday_text is not None:
        if _index_by_prefix(fi_msn_days, weekday_text) != date.weekday():
            raise ValueError('weekday does not match date: %r' % (text_in,))
    return date


def get_date(text_in, date_now=None):
    '''Best-effort wrapper around inner_get_date().

    Returns the parsed datetime.date, or None when text_in is empty, None
    or cannot be parsed.
    '''
    if not text_in:
        return None
    try:
        return inner_get_date(text_in, date_now)
    except ValueError:
        return None


def get_time(text_in):
    '''Return the hour part of an "HH:MM" string as an int.

    Raises ValueError when the hour is not numeric.
    '''
    return int(text_in.split(':')[0])


def getweather(html):
    '''Returns a list of pairs: (datetime, weather state)

    The page is a sequence of 'hf_outline' divs.  A div containing an
    'hf_date' element starts a new day; every other div holding a <ul> is
    one forecast row for the most recent day seen.  The timestamp is None
    when that day's date could not be parsed.  Rows without a readable
    hour are skipped.
    '''
    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
    current_date = None
    results = []
    for row in soup.findAll('div', {'class': 'hf_outline'}):
        # Check for a day-row
        hf_date = row.find('div', {'class': 'hf_date'})
        if hf_date is not None:
            spans = hf_date.findAll('span')
            # The date text is read from the second span; a missing span is
            # treated like an unparseable date.
            date_text = spans[1].string if len(spans) > 1 else None
            current_date = get_date(date_text)
            continue

        # Parse forecast row
        ul = row.find('ul')
        if ul is None:
            continue
        time_cell = ul.find('li', {'class': 'hf_time'})
        if time_cell is None or time_cell.string is None:
            continue
        try:
            hour = get_time(time_cell.string)
        except ValueError:
            continue

        forecast_cell = ul.find('li', {'class': 'hf_forecast'})
        forecast_text = None
        if forecast_cell is not None:
            forecast_text = forecast_cell.string
        state = text_to_state(forecast_text)

        if current_date is None:
            timestamp = None
        else:
            timestamp = datetime.datetime(current_date.year, current_date.month,
                                          current_date.day, hour, 0, 0)
        results.append((timestamp, state))
    return results


if __name__ == '__main__':
    print('Unit tests')
    assert text_to_state(u'Melkein selkeää') == 'sun'
    assert text_to_state(u'Enimmäkseen pilvistä') == 'cloud'
    assert text_to_state(u'Sadekuuroja/selkeää') == 'rain'
    assert text_to_state(u'foobarplaaplöö') == 'unknown'
    assert text_to_state(None) == 'unknown'

    ref_date = datetime.date(2009, 7, 1)
    assert get_date('torstai 20. elokuu', ref_date) == datetime.date(2009, 8, 20)
    assert get_date(u'perjantai 10. heinäkuu', ref_date) == datetime.date(2009, 7, 10)
    assert get_date(u'tiistai 10. heinäkuu', ref_date) is None
    assert get_date('lauantai 22. elokuu', ref_date) == datetime.date(2009, 8, 22)
    assert get_date('20. elo', ref_date) == datetime.date(2009, 8, 20)
    assert get_date(None, ref_date) is None

    # Live check against the real page (needs network access).
    response = urllib.urlopen('http://saa.fi.msn.com/hourly.aspx?wealocations=wc:32669&q=Tampere%2c+L%C3%A4nsi-Suomi+forecast:hourly')
    try:
        html = response.read()
    finally:
        response.close()
    results = getweather(html)
    for timestamp, state in results:
        print('%s %s' % (timestamp, state))
    assert len(results) > 3