"""Fetch and parse route suggestions from the atlas.tripplanner.fi planner."""

import urllib
import re

from BeautifulSoup import BeautifulSoup

# Base address of the planner; the encoded query string is appended to it.
_PLANNER_URL = "http://atlas.tripplanner.fi/tkl/fi/?"

# Start address used by both the CGI entry point and the command-line demo.
_DEFAULT_ORIGIN = "Opiskelijankatu 4"

# Text that consists solely of a clock time such as "7.05" or "12:30",
# optionally surrounded by whitespace.
_TIME_PATTERN = re.compile(r"^\s*[0-9]?[0-9][.:][0-9][0-9]\s*$")

# A 1-3 character alphanumeric identifier following "line=" in the markup.
_ROUTE_PATTERN = re.compile(r"line=([0-9a-zA-Z]{1,3})")


class ParseError(Exception):
    """Raised when the expected results table is not found in the document."""


def makeurl(fromplace, toplace, fromnum=None, tonum=None):
    """Build the planner query URL for a trip from *fromplace* to *toplace*.

    Args:
        fromplace: Value sent as the ``keya`` query parameter.
        toplace: Value sent as the ``keyb`` query parameter.
        fromnum: Optional value sent (stringified) as the ``a`` parameter.
        tonum: Optional value sent (stringified) as the ``b`` parameter.

    Returns:
        The complete URL as a string.
    """
    # NOTE(review): the meaning of the fixed vm/vp/vs/vr/vn values is not
    # visible in this file -- presumably planner display options; confirm
    # against the service before changing them.
    params = {'keya': fromplace,
              'keyb': toplace,
              'vm': '1',
              'vp': '2',
              'vs': '3',
              'vr': '1',
              'vn': '5'}
    if fromnum is not None:
        params['a'] = str(fromnum)
    if tonum is not None:
        params['b'] = str(tonum)
    return _PLANNER_URL + urllib.urlencode(params)


def _fetch(url):
    """Return the body of *url*, closing the handle even if reading fails."""
    response = urllib.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


def _find_route_table(soup):
    """Return the first table whose second cell contains "Matkaan".

    Raises:
        ParseError: If no table in *soup* matches.
    """
    for table in soup.findAll("table"):
        cells = table.findAll("td")
        if len(cells) >= 2 and "Matkaan" in str(cells[1]):
            return table
    raise ParseError("no table with 'Matkaan' in its second cell")


def getroutes(html):
    """Extract route suggestions from a planner result page.

    Args:
        html: Markup of the result page.

    Returns:
        A list of ``(leaves, arrives, route)`` tuples, one per table row
        that has at least seven cells, a time in the third and fourth
        cells and at least one ``line=`` identifier in the seventh.
        ``leaves`` and ``arrives`` are the matched time strings and
        ``route`` is the list of identifiers found.

    Raises:
        ParseError: If the results table cannot be located.
    """
    table = _find_route_table(BeautifulSoup(html))

    results = []
    for row in table.findAll("tr"):
        columns = row.findAll("td")
        if len(columns) < 7:
            continue
        leaves = columns[2].find(text=_TIME_PATTERN)
        arrives = columns[3].find(text=_TIME_PATTERN)
        route = _ROUTE_PATTERN.findall(str(columns[6]))
        # find() yields None when nothing matches; test that *before*
        # converting to str, otherwise the truthy string "None" would slip
        # through and be reported as a time.
        if leaves is None or arrives is None or not route:
            continue
        results.append((str(leaves), str(arrives), route))
    return results


def cgimain(toplace, tonum=None):
    """Print a text/plain CGI response listing routes to *toplace*.

    Writes the header, then one line per route in the form
    ``<leaves> <arrives> <comma-separated route ids>``.

    Args:
        toplace: Destination passed to :func:`makeurl`.
        tonum: Optional destination number passed to :func:`makeurl`.
    """
    # print appends "\n", which completes the blank line ending the headers.
    # Single-argument parenthesised prints produce the same output as the
    # Python 2 print statement.
    print("Content-type: text/plain\r\n\r")
    url = makeurl(_DEFAULT_ORIGIN, toplace, tonum=tonum)
    for leaves, arrives, route in getroutes(_fetch(url)):
        print("%s %s %s" % (leaves, arrives, ','.join(route)))


if __name__ == "__main__":
    print(getroutes(_fetch(makeurl(_DEFAULT_ORIGIN, "Ammattikorkeakoulu"))))