greglinch · March 10, 2017 22:26
diff --git a/convert_congress.py b/convert_congress.py
 from bs4 import BeautifulSoup


 '''
 Prereqs:
 - Go to the congressional bio directory: http://bioguide.congress.gov/biosearch/biosearch.asp
 - Search with the parameters you want
 - Use "Inspect element" on the results and copy the HTML
 - Paste it into a file and (optionally) wrap it with <html></html> tags

 '''

 def convert_html_to_csv(file_in='FILENAME.html', new_file='FILENAME.csv'):
    """Convert a saved HTML results table into a four-column CSV file.

    Every <tr> after the first (treated as the header row) becomes one CSV
    line with the columns id, lastname, statepostal, chamber:

    - id: the text after the first '=' in the href of the row's first link
    - lastname: the first link's text up to the first comma, title-cased
    - statepostal: the text of the second-to-last <td>
    - chamber: the text of the third <td>

    Rows that do not have that shape are printed as errors and skipped.

    Args:
        file_in: path of the HTML file to read.
        new_file: path of the CSV file to write (overwritten if present).
    """
    ## read the saved markup
    with open(file_in, 'r') as read_file:
        html_doc = read_file.read()

    ## HTML parser
    soup = BeautifulSoup(html_doc, 'html.parser')

    ## find all rows and drop the header row; slicing (unlike pop(0))
    ## does not raise when the document contains no rows at all
    rows = soup.find_all('tr')[1:]

    ## data row count
    print('Total data rows:\t' + str(len(rows)) + '\n')

    ## collect the lines in a list and join once at the end
    lines = ['id,lastname,statepostal,chamber\n']

    rows_processed = 0

    for row in rows:
        try:
            link = row.find_all('a')[0]
            cells = row.find_all('td')
            member_id = link['href'].split('=')[1]
            member_lastname = link.string.split(',')[0].title()
            member_statepostal = cells[-2].string
            member_chamber = cells[2].string
        except (AttributeError, IndexError, KeyError):
            ## missing link/cell, href without '=', or a link whose
            ## .string is None -- report the row and keep going
            print('Error:\t\t' + str(row) + '\n')
            continue

        lines.append('%s,%s,%s,%s\n' % (member_id, member_lastname, member_statepostal, member_chamber))
        rows_processed += 1

    print('Rows processed:\t\t' + str(rows_processed) + '\n')

    ## write to the csv
    with open(new_file, 'w') as write_file:
        write_file.write(''.join(lines))

 ## run the conversion; this call sits at module level, so it executes
 ## whenever the file is run (or imported)
 convert_html_to_csv()
	from bs4 import BeautifulSoup


	'''
	Prereqs:
	- Go to the congressional bio directory: http://bioguide.congress.gov/biosearch/biosearch.asp
	- Search with the parameters you want
	- Use "Inspect element" on the results and copy the HTML
	- Paste it into a file and (optionally) wrap it with <html></html> tags

	'''

	def convert_html_to_csv(file_in='FILENAME.html', new_file='FILENAME.csv'):
	    """Convert a saved HTML results table into a four-column CSV file.

	    Every <tr> after the first (treated as the header row) becomes one CSV
	    line with the columns id, lastname, statepostal, chamber:

	    - id: the text after the first '=' in the href of the row's first link
	    - lastname: the first link's text up to the first comma, title-cased
	    - statepostal: the text of the second-to-last <td>
	    - chamber: the text of the third <td>

	    Rows that do not have that shape are printed as errors and skipped.

	    Args:
	        file_in: path of the HTML file to read.
	        new_file: path of the CSV file to write (overwritten if present).
	    """
	    ## read the saved markup
	    with open(file_in, 'r') as read_file:
	        html_doc = read_file.read()

	    ## HTML parser
	    soup = BeautifulSoup(html_doc, 'html.parser')

	    ## find all rows and drop the header row; slicing (unlike pop(0))
	    ## does not raise when the document contains no rows at all
	    rows = soup.find_all('tr')[1:]

	    ## data row count
	    print('Total data rows:\t' + str(len(rows)) + '\n')

	    ## collect the lines in a list and join once at the end
	    lines = ['id,lastname,statepostal,chamber\n']

	    rows_processed = 0

	    for row in rows:
	        try:
	            link = row.find_all('a')[0]
	            cells = row.find_all('td')
	            member_id = link['href'].split('=')[1]
	            member_lastname = link.string.split(',')[0].title()
	            member_statepostal = cells[-2].string
	            member_chamber = cells[2].string
	        except (AttributeError, IndexError, KeyError):
	            ## missing link/cell, href without '=', or a link whose
	            ## .string is None -- report the row and keep going
	            print('Error:\t\t' + str(row) + '\n')
	            continue

	        lines.append('%s,%s,%s,%s\n' % (member_id, member_lastname, member_statepostal, member_chamber))
	        rows_processed += 1

	    print('Rows processed:\t\t' + str(rows_processed) + '\n')

	    ## write to the csv
	    with open(new_file, 'w') as write_file:
	        write_file.write(''.join(lines))

	## run the conversion; this call sits at module level, so it executes
	## whenever the file is run (or imported)
	convert_html_to_csv()