Last active
March 10, 2017 22:26
-
-
Save greglinch/5197267b6ff8fcb19192ba5443f1f71d to your computer and use it in GitHub Desktop.
Converts HTML table from congressional bio directory to a csv. For downloading images, see https://gist.github.com/greglinch/608001fa0ae39834af18354c9e8c6f09
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv

from bs4 import BeautifulSoup
'''
Prereqs:
- Go to the congressional bio directory: http://bioguide.congress.gov/biosearch/biosearch.asp
- Search with the parameters you want
- Inspect the results table element and copy its HTML
- Paste the HTML into a file and wrap it with <html></html> tags (this step may be optional)
'''
def _extract_member(row):
    '''Return (id, lastname, statepostal, chamber) for one table row.

    - id: the segment of the first link's href that follows its first '='
    - lastname: the first link's text up to the first comma, title-cased
    - statepostal: text of the second-to-last <td>
    - chamber: text of the third <td>

    Raises IndexError, KeyError or AttributeError when the row lacks the
    expected link, href, '=' separator, cells or link text.
    '''
    ## look each tag list up once instead of once per field
    link = row.find_all('a')[0]
    cells = row.find_all('td')
    member_id = link['href'].split('=')[1]
    member_lastname = link.string.split(',')[0].title()
    member_statepostal = cells[-2].string
    member_chamber = cells[2].string
    return member_id, member_lastname, member_statepostal, member_chamber


def convert_html_to_csv(file_in='FILENAME.html', new_file='FILENAME.csv'):
    '''Convert the rows of a saved HTML table into a CSV file.

    Reads file_in, treats the first <tr> as the header row and skips it,
    then writes one CSV line per remaining row under the header
    id,lastname,statepostal,chamber. Rows that cannot be parsed are
    printed and skipped; the row counts are printed as progress output.

    Args:
        file_in: path of the HTML file to read.
        new_file: path of the CSV file to write (overwritten if present).

    Both files are opened as UTF-8. A cell whose text is missing is
    written as an empty field.
    '''
    ## read and parse the HTML
    with open(file_in, 'r', encoding='utf-8') as read_file:
        soup = BeautifulSoup(read_file.read(), 'html.parser')

    ## drop the header row; slicing (unlike pop(0)) is safe when there are no rows
    rows = soup.find_all('tr')[1:]

    ## data row count
    print('Total data rows:\t' + str(len(rows)) + '\n')

    records = []
    for row in rows:
        try:
            records.append(_extract_member(row))
        except (AttributeError, IndexError, KeyError):
            ## only parse failures are skipped; anything else should surface
            print('Error:\t\t' + str(row) + '\n')

    print('Rows processed:\t\t' + str(len(records)) + '\n')

    ## write to the csv; csv.writer quotes values that contain commas or quotes
    with open(new_file, 'w', encoding='utf-8', newline='') as write_file:
        writer = csv.writer(write_file, lineterminator='\n')
        writer.writerow(['id', 'lastname', 'statepostal', 'chamber'])
        writer.writerows(records)
## execute the function only when run as a script, not when imported
if __name__ == '__main__':
    convert_html_to_csv()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment