Skip to content

Instantly share code, notes, and snippets.

@grahamanderson
Last active May 20, 2019 00:56
Show Gist options
  • Save grahamanderson/d6fc4a3bfc59950d7f7055360d4ac342 to your computer and use it in GitHub Desktop.
Save grahamanderson/d6fc4a3bfc59950d7f7055360d4ac342 to your computer and use it in GitHub Desktop.
Census Permit Text files have dodgy formatting. See https://www.census.gov/construction/bps/msamonthly.html. On occasion, text data needs reformatting. If you're looking for indications of economic growth, metro permit data is crucial.
import pandas as pd
import json
import numpy as np
import re
import requests
import requests_cache
from sqlalchemy import create_engine
from sqlalchemy.engine.url import URL
from sqlalchemy.orm.session import sessionmaker
requests_cache.install_cache('./data/requests_cache', backend='sqlite', expire_after=86400)
import settings as s
# https://www.census.gov/construction/bps/msaannual.html
def construction_by_metro(url,category):
# https://www.census.gov/construction/bps/txt/tb3v2017.txt
# https://www.census.gov/construction/bps/txt/tb3u201502.txt
rows = []
backup_by = 1
begin_data = 10
end_data = -4
#r = requests.get(url)
r = requests.get(url, proxies=s.proxies,
verify=False)
if r.status_code == 404:
raise ValueError('404 Error: {}'.format(url))
for index, line in enumerate(r.text.split("\n")[begin_data:end_data]):
# Clean
line = line.replace('*', '') # # Atlantic City-Hammonton,* NJ
line = re.sub(r'([A-z]) ([A-z])', '\\1_\\2', line) # 24 32 Little Rock => 24 32 Little_Rock
line = re.sub(r'([A-z])\s+-\s+([A-z])', '\\1-\\2', line) # Little Rock - North Little => Little Rock-North Little
line = line.strip()
# Create Rows
row = [
i.strip()
for i in line.split(' ')
if i != ''
]
rows.append(row)
####
# Fix Bad Rows
for index, row in enumerate(rows):
# State is first element ['AR','149', '147', ... ],
if re.search(r'[A-Z]{2}', row[0]):
rows[index - backup_by] = rows[index - backup_by] + row
rows.pop(index)
# Row was cut short ['999','13980', 'Blacksburg-Christiansburg-Radford']
elif len(row) == 7 and row[0].isnumeric():
rows[index - 1] = rows[index - 1] + row
rows.pop(index)
df = pd.DataFrame(rows)
if category == 'u':
df = df.iloc[:, 0:10]
cols = ['CSA', 'CBSA', 'Name', 'States', '{}_Total', '{}_Unit_1', '{}_Units_2', '{}_Units_3_4', '{}_Units_5+',
'{}_Structure_5+Units']
cols = ([
col.format(category)
for col in cols
])
df.columns = cols
return df
elif category == 'v':
df = df.iloc[:, 0:9]
cols = ['CSA', 'CBSA', 'Name', 'States', '{}_Total', '{}_Unit_1', '{}_Units_2', '{}_Units_3_4', '{}_Units_5+']
cols = ([
col.format(category)
for col in cols
])
df.columns = cols
return df
else:
return 'you need to pass in u (for housing units) or v (for housing valuation)'
def metro_construction_by_year(category='v', years=[2010,2017]):
mdf = pd.DataFrame([])
url = 'https://www.census.gov/construction/bps/txt/tb3{0}{1}{2}.txt'
# Loop over years
for year in years:
# Loop over Months
for i in range(1, 13):
month = str(i).zfill(2)
url = url.format(category, year, month)
df = construction_by_metro(url, category)
df.to_csv(
'{0}/tb3{1}{2}{3}.csv'.format(
'./data/permits/metro/csv', category, year, month),
index=False)
mdf = mdf.append(df)
# Return Dataframe for single year
mdf.to_csv(
'{0}/tb3{1}{2}{3}.csv'.format(
'./data/permits/metro/csv', category, year, '_y'),
index=False)
#####
# Export Dataframe for All Years
mdf.to_csv(
'{0}/tb3{1}{2}_{3}.csv'.format(
'./data/permits/metro/csv', category, min(years), max(years)),
index=False)
# Pickle the Dataframe
pkl_path = '{0}/tb3{1}{2}_{3}.pkl'.format('./data/permits/metro', category, min(years), max(years))
mdf.to_pickle(pkl_path)
# Send to Database
mdf.to_sql('metro_permits_{}'.format(category), s.engine, if_exists='replace')
# Return all years
return mdf
if __name__ == '__main__':
df = metro_construction_by_year('v', list(range(2010, 2019)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment