grahamanderson · May 20, 2019 00:56
diff --git a/permits.py b/permits.py
 import pandas as pd
 import json
 import numpy as np
 import re
 import requests
 import requests_cache
 from sqlalchemy import create_engine
 from sqlalchemy.engine.url import URL
 from sqlalchemy.orm.session import sessionmaker
 requests_cache.install_cache('./data/requests_cache', backend='sqlite', expire_after=86400)
 import settings as s

 # https://www.census.gov/construction/bps/msaannual.html

 def construction_by_metro(url,category):
    # https://www.census.gov/construction/bps/txt/tb3v2017.txt
    # https://www.census.gov/construction/bps/txt/tb3u201502.txt
    rows = []
    backup_by = 1
    begin_data = 10
    end_data = -4

    #r = requests.get(url)
    r = requests.get(url, proxies=s.proxies,
                     verify=False)

    if r.status_code == 404:
        raise ValueError('404 Error: {}'.format(url))

    for index, line in enumerate(r.text.split("\n")[begin_data:end_data]):

        # Clean
        line = line.replace('*', '')  # # Atlantic City-Hammonton,* NJ
        line = re.sub(r'([A-z]) ([A-z])', '\\1_\\2', line)  # 24 32 Little Rock => 24 32 Little_Rock
        line = re.sub(r'([A-z])\s+-\s+([A-z])', '\\1-\\2', line)  # Little Rock - North Little => Little Rock-North Little
        line = line.strip()

        # Create Rows
        row = [
            i.strip()
            for i in line.split(' ')
            if i != ''
        ]
        rows.append(row)

    ####
    # Fix Bad Rows
    for index, row in enumerate(rows):

        # State is first element ['AR','149', '147', ... ],
        if re.search(r'[A-Z]{2}', row[0]):
            rows[index - backup_by] = rows[index - backup_by] + row
            rows.pop(index)

        # Row was cut short ['999','13980', 'Blacksburg-Christiansburg-Radford']
        elif len(row) == 7 and row[0].isnumeric():
            rows[index - 1] = rows[index - 1] + row
            rows.pop(index)

    df = pd.DataFrame(rows)

    if category == 'u':
        df = df.iloc[:, 0:10]
        cols = ['CSA', 'CBSA', 'Name', 'States', '{}_Total', '{}_Unit_1', '{}_Units_2', '{}_Units_3_4', '{}_Units_5+',
                '{}_Structure_5+Units']
        cols = ([
            col.format(category)
            for col in cols
        ])
        df.columns = cols
        return df

    elif category == 'v':
        df = df.iloc[:, 0:9]
        cols = ['CSA', 'CBSA', 'Name', 'States', '{}_Total', '{}_Unit_1', '{}_Units_2', '{}_Units_3_4', '{}_Units_5+']
        cols = ([
            col.format(category)
            for col in cols
        ])
        df.columns = cols
        return df
    else:
        return 'you need to pass in u (for housing units) or v (for housing valuation)'


 def metro_construction_by_year(category='v', years=[2010,2017]):
    mdf = pd.DataFrame([])
    url = 'https://www.census.gov/construction/bps/txt/tb3{0}{1}{2}.txt'

    # Loop over years
    for year in years:

        # Loop over Months
        for i in range(1, 13):
            month = str(i).zfill(2)
            url = url.format(category, year, month)
            df = construction_by_metro(url, category)
            df.to_csv(
                    '{0}/tb3{1}{2}{3}.csv'.format(
                        './data/permits/metro/csv', category, year, month),
                    index=False)

            mdf = mdf.append(df)

            # Return Dataframe for single year
            mdf.to_csv(
                '{0}/tb3{1}{2}{3}.csv'.format(
                    './data/permits/metro/csv', category, year, '_y'),
                index=False)

    #####
    # Export Dataframe for All Years

    mdf.to_csv(
        '{0}/tb3{1}{2}_{3}.csv'.format(
            './data/permits/metro/csv', category, min(years), max(years)),
        index=False)

    # Pickle the Dataframe
    pkl_path = '{0}/tb3{1}{2}_{3}.pkl'.format('./data/permits/metro', category, min(years), max(years))
    mdf.to_pickle(pkl_path)

    # Send to Database
    mdf.to_sql('metro_permits_{}'.format(category), s.engine, if_exists='replace')

    # Return all years
    return mdf


 if __name__ == '__main__':
   df = metro_construction_by_year('v', list(range(2010, 2019)))
	import pandas as pd
	import json
	import numpy as np
	import re
	import requests
	import requests_cache
	from sqlalchemy import create_engine
	from sqlalchemy.engine.url import URL
	from sqlalchemy.orm.session import sessionmaker
	requests_cache.install_cache('./data/requests_cache', backend='sqlite', expire_after=86400)
	import settings as s

	# https://www.census.gov/construction/bps/msaannual.html

	def construction_by_metro(url,category):
	# https://www.census.gov/construction/bps/txt/tb3v2017.txt
	# https://www.census.gov/construction/bps/txt/tb3u201502.txt
	rows = []
	backup_by = 1
	begin_data = 10
	end_data = -4

	#r = requests.get(url)
	r = requests.get(url, proxies=s.proxies,
	verify=False)

	if r.status_code == 404:
	raise ValueError('404 Error: {}'.format(url))

	for index, line in enumerate(r.text.split("\n")[begin_data:end_data]):

	# Clean
	line = line.replace('', '') # # Atlantic City-Hammonton, NJ
	line = re.sub(r'([A-z]) ([A-z])', '\\1_\\2', line) # 24 32 Little Rock => 24 32 Little_Rock
	line = re.sub(r'([A-z])\s+-\s+([A-z])', '\\1-\\2', line) # Little Rock - North Little => Little Rock-North Little
	line = line.strip()

	# Create Rows
	row = [
	i.strip()
	for i in line.split(' ')
	if i != ''
	]
	rows.append(row)

	####
	# Fix Bad Rows
	for index, row in enumerate(rows):

	# State is first element ['AR','149', '147', ... ],
	if re.search(r'[A-Z]{2}', row[0]):
	rows[index - backup_by] = rows[index - backup_by] + row
	rows.pop(index)

	# Row was cut short ['999','13980', 'Blacksburg-Christiansburg-Radford']
	elif len(row) == 7 and row[0].isnumeric():
	rows[index - 1] = rows[index - 1] + row
	rows.pop(index)

	df = pd.DataFrame(rows)

	if category == 'u':
	df = df.iloc[:, 0:10]
	cols = ['CSA', 'CBSA', 'Name', 'States', '{}_Total', '{}_Unit_1', '{}_Units_2', '{}_Units_3_4', '{}_Units_5+',
	'{}_Structure_5+Units']
	cols = ([
	col.format(category)
	for col in cols
	])
	df.columns = cols
	return df

	elif category == 'v':
	df = df.iloc[:, 0:9]
	cols = ['CSA', 'CBSA', 'Name', 'States', '{}_Total', '{}_Unit_1', '{}_Units_2', '{}_Units_3_4', '{}_Units_5+']
	cols = ([
	col.format(category)
	for col in cols
	])
	df.columns = cols
	return df
	else:
	return 'you need to pass in u (for housing units) or v (for housing valuation)'


	def metro_construction_by_year(category='v', years=[2010,2017]):
	mdf = pd.DataFrame([])
	url = 'https://www.census.gov/construction/bps/txt/tb3{0}{1}{2}.txt'

	# Loop over years
	for year in years:

	# Loop over Months
	for i in range(1, 13):
	month = str(i).zfill(2)
	url = url.format(category, year, month)
	df = construction_by_metro(url, category)
	df.to_csv(
	'{0}/tb3{1}{2}{3}.csv'.format(
	'./data/permits/metro/csv', category, year, month),
	index=False)

	mdf = mdf.append(df)

	# Return Dataframe for single year
	mdf.to_csv(
	'{0}/tb3{1}{2}{3}.csv'.format(
	'./data/permits/metro/csv', category, year, '_y'),
	index=False)

	#####
	# Export Dataframe for All Years

	mdf.to_csv(
	'{0}/tb3{1}{2}_{3}.csv'.format(
	'./data/permits/metro/csv', category, min(years), max(years)),
	index=False)

	# Pickle the Dataframe
	pkl_path = '{0}/tb3{1}{2}_{3}.pkl'.format('./data/permits/metro', category, min(years), max(years))
	mdf.to_pickle(pkl_path)

	# Send to Database
	mdf.to_sql('metro_permits_{}'.format(category), s.engine, if_exists='replace')

	# Return all years
	return mdf


	if __name__ == '__main__':
	df = metro_construction_by_year('v', list(range(2010, 2019)))