Last active
May 20, 2019 00:56
-
-
Save grahamanderson/d6fc4a3bfc59950d7f7055360d4ac342 to your computer and use it in GitHub Desktop.
Census Permit Text files have dodgy formatting. See https://www.census.gov/construction/bps/msamonthly.html. On occasion, text data needs reformatting. If you're looking for indications of economic growth, metro permit data is crucial.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import json | |
import numpy as np | |
import re | |
import requests | |
import requests_cache | |
from sqlalchemy import create_engine | |
from sqlalchemy.engine.url import URL | |
from sqlalchemy.orm.session import sessionmaker | |
requests_cache.install_cache('./data/requests_cache', backend='sqlite', expire_after=86400) | |
import settings as s | |
# https://www.census.gov/construction/bps/msaannual.html | |
def construction_by_metro(url,category): | |
# https://www.census.gov/construction/bps/txt/tb3v2017.txt | |
# https://www.census.gov/construction/bps/txt/tb3u201502.txt | |
rows = [] | |
backup_by = 1 | |
begin_data = 10 | |
end_data = -4 | |
#r = requests.get(url) | |
r = requests.get(url, proxies=s.proxies, | |
verify=False) | |
if r.status_code == 404: | |
raise ValueError('404 Error: {}'.format(url)) | |
for index, line in enumerate(r.text.split("\n")[begin_data:end_data]): | |
# Clean | |
line = line.replace('*', '') # # Atlantic City-Hammonton,* NJ | |
line = re.sub(r'([A-z]) ([A-z])', '\\1_\\2', line) # 24 32 Little Rock => 24 32 Little_Rock | |
line = re.sub(r'([A-z])\s+-\s+([A-z])', '\\1-\\2', line) # Little Rock - North Little => Little Rock-North Little | |
line = line.strip() | |
# Create Rows | |
row = [ | |
i.strip() | |
for i in line.split(' ') | |
if i != '' | |
] | |
rows.append(row) | |
#### | |
# Fix Bad Rows | |
for index, row in enumerate(rows): | |
# State is first element ['AR','149', '147', ... ], | |
if re.search(r'[A-Z]{2}', row[0]): | |
rows[index - backup_by] = rows[index - backup_by] + row | |
rows.pop(index) | |
# Row was cut short ['999','13980', 'Blacksburg-Christiansburg-Radford'] | |
elif len(row) == 7 and row[0].isnumeric(): | |
rows[index - 1] = rows[index - 1] + row | |
rows.pop(index) | |
df = pd.DataFrame(rows) | |
if category == 'u': | |
df = df.iloc[:, 0:10] | |
cols = ['CSA', 'CBSA', 'Name', 'States', '{}_Total', '{}_Unit_1', '{}_Units_2', '{}_Units_3_4', '{}_Units_5+', | |
'{}_Structure_5+Units'] | |
cols = ([ | |
col.format(category) | |
for col in cols | |
]) | |
df.columns = cols | |
return df | |
elif category == 'v': | |
df = df.iloc[:, 0:9] | |
cols = ['CSA', 'CBSA', 'Name', 'States', '{}_Total', '{}_Unit_1', '{}_Units_2', '{}_Units_3_4', '{}_Units_5+'] | |
cols = ([ | |
col.format(category) | |
for col in cols | |
]) | |
df.columns = cols | |
return df | |
else: | |
return 'you need to pass in u (for housing units) or v (for housing valuation)' | |
def metro_construction_by_year(category='v', years=[2010,2017]): | |
mdf = pd.DataFrame([]) | |
url = 'https://www.census.gov/construction/bps/txt/tb3{0}{1}{2}.txt' | |
# Loop over years | |
for year in years: | |
# Loop over Months | |
for i in range(1, 13): | |
month = str(i).zfill(2) | |
url = url.format(category, year, month) | |
df = construction_by_metro(url, category) | |
df.to_csv( | |
'{0}/tb3{1}{2}{3}.csv'.format( | |
'./data/permits/metro/csv', category, year, month), | |
index=False) | |
mdf = mdf.append(df) | |
# Return Dataframe for single year | |
mdf.to_csv( | |
'{0}/tb3{1}{2}{3}.csv'.format( | |
'./data/permits/metro/csv', category, year, '_y'), | |
index=False) | |
##### | |
# Export Dataframe for All Years | |
mdf.to_csv( | |
'{0}/tb3{1}{2}_{3}.csv'.format( | |
'./data/permits/metro/csv', category, min(years), max(years)), | |
index=False) | |
# Pickle the Dataframe | |
pkl_path = '{0}/tb3{1}{2}_{3}.pkl'.format('./data/permits/metro', category, min(years), max(years)) | |
mdf.to_pickle(pkl_path) | |
# Send to Database | |
mdf.to_sql('metro_permits_{}'.format(category), s.engine, if_exists='replace') | |
# Return all years | |
return mdf | |
if __name__ == '__main__': | |
df = metro_construction_by_year('v', list(range(2010, 2019))) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment