Created
December 2, 2012 21:35
-
-
Save RAbraham/4191170 to your computer and use it in GitHub Desktop.
Scraping the Toronto municipal government website for upcoming meetings based on input search text
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
def search_toronto_site(text,committee,from_date,to_date,item_status):
    """Run an agenda-item search on app.toronto.ca and return the result page.

    Opens the search page with twill, fills in form number 1 and submits it.

    Args:
        text: value for the "word_or_phrase" form field.
        committee: value for the "decision_body" form field; the field is
            left untouched when this is falsy (e.g. an empty string).
        from_date: value for the "fromDate" form field.
        to_date: value for the "toDate" form field.
        item_status: value for the "item_status" form field.

    Returns:
        Whatever ``twill.commands.show()`` returns after the submit; the
        caller in this file treats it as the HTML of the results page.
    """
    import twill.commands as crawler

    form_number = 1
    start_url = "http://app.toronto.ca/tmmis/findAgendaItem.do?function=doPrepare"

    # Collect the (field, value) pairs in the order they must be filled in.
    field_values = [("word_or_phrase", text)]
    if committee:
        field_values.append(("decision_body", committee))
    field_values.extend([
        ("fromDate", from_date),
        ("toDate", to_date),
        ("item_status", item_status),
    ])

    # Load the search page, populate the search form, then submit it.
    crawler.go(start_url)
    for field_name, field_value in field_values:
        crawler.fv(form_number, field_name, field_value)
    crawler.submit()

    page = crawler.show()
    return page
def extract_agenda_items(html_page):
    """Parse a search-results page into a list of agenda-item dicts.

    Looks up the table with id "searchResultsTable" and converts every row
    after the header row into a dict.

    Args:
        html_page: markup of the search results page.

    Returns:
        A list of dicts, one per result row, with the keys 'meeting_date',
        'item_num', 'item_url', 'title' and 'committee'. The list is empty
        when ``html_page`` is empty/None or contains no results table.

    Raises:
        AttributeError: if a result row lacks one of the expected cells.
        IndexError: if a result row contains no parenthesised argument to
            build the item URL from.
    """
    # Nothing to parse; bail out before touching the parser at all.
    if not html_page:
        return []

    from bs4 import BeautifulSoup

    FIRST_ROW_AFTER_HEADER = 1

    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(html_page, "html.parser")

    results_table = soup.find("table", {"id": "searchResultsTable"})
    if results_table is None:
        # No results table on the page: report zero items instead of
        # crashing on None.find_all.
        return []

    rows = results_table.find_all("tr")[FIRST_ROW_AFTER_HEADER:]

    agenda_items = []
    for result in rows:
        meeting_date = result.find("td", {"class": "meetingDate"}).get_text()
        item_num = result.find("td", {"class": "reference"}).find("a").get_text()
        # Takes the text between the first "(" and the following ")" in the
        # row's markup and drops its first and last character.
        # NOTE(review): presumably a quoted path inside a javascript call on
        # the row's link -- confirm against the live page markup.
        item_url = "http://app.toronto.ca" + str(result).split('(')[1].split(')')[0][1:-1]
        title = result.find("td", {"class": "agendaItemTitle"}).get_text()
        committee = result.find("td", {"class": "decisionBodyName"}).get_text()

        agenda_items.append({'meeting_date': meeting_date,
                             'item_num': item_num,
                             'item_url': item_url,
                             'title': title,
                             'committee': committee})

    return agenda_items
#############################################################################################
# CONSTANTS
# Sample search parameters. Only the commented-out example call below uses
# them; the active call passes empty strings for every field.
#############################################################################################
TEXT = 'Announcements'
WORD_GRAFFITI = 'Request'
COMMITTEE = 'Aboriginal Affairs Committee (2010-2014)'
COM_GRAFFITI = "Graffiti Panel"
FROM_DATE = "2012-11-30"
TO_DATE = "2012-11-30"
ITEM_STATUS = "Adopted"

# Example of a narrowed search using the constants above:
#results_html = search_toronto_site(WORD_GRAFFITI,COM_GRAFFITI,FROM_DATE,TO_DATE,ITEM_STATUS)
results_html = search_toronto_site("", "", "", "", "")
agenda_items = extract_agenda_items(results_html)

print(results_html)
print("Total Results: %s" % (len(agenda_items)))
# Call-style print with a single argument behaves the same on Python 2 and 3;
# the former `print json.dumps(...)` statement was a SyntaxError on Python 3.
print(json.dumps(agenda_items, sort_keys=True, indent=4, separators=(',', ': ')))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment