gunessenturk · September 20, 2018 03:55
diff --git a/TDI_W2.ipynb b/TDI_W2.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import datetime as ddtt\n",
    "import numpy as np\n",
    "import folium"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get reported crime data\n",
    "df = pd.read_csv('LA_Crime_Data_from_2010_to_11sep2018.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the rows whose 'Premise Description' is one of the street-like\n",
    "# premises listed below. A trailing '*' is part of the literal being matched.\n",
    "df = df.loc[df['Premise Description'].isin([\n",
    "    'STREET',\n",
    "    'PARKING LOT',\n",
    "    'SIDEWALK',\n",
    "    'PARK/PLAYGROUND',\n",
    "    'ALLEY',\n",
    "    'GAS STATION',\n",
    "    'BUS STOP',\n",
    "    'OTHER/OUTSIDE',\n",
    "    'PEDESTRIAN OVERCROSSING',\n",
    "    'UNDERPASS/BRIDGE*',\n",
    "    'SKATEBOARD FACILITY/SKATEBOARD PARK*',\n",
    "])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Clean column 'Location ': remove every '(' and ')' from the text.\n",
    "# regex=True is passed explicitly because Series.str.replace treats the\n",
    "# pattern as a literal string by default from pandas 2.0 on.\n",
    "df['Location '] = df['Location '].str.replace(r'[()]', '', regex=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add columns 'Latitude', 'Longitude', 'Year Occurred', 'Month Occurred', 'Day Occurred'.\n",
    "# Every pattern below is a regular expression, so regex=True is passed explicitly\n",
    "# (pandas >= 2.0 matches literally by default) and the patterns are raw strings.\n",
    "# Latitude is the text before the comma, longitude the text after it.\n",
    "df['Latitude'] = df['Location '].str.replace(r',.*', '', regex=True).astype('float')\n",
    "df['Longitude'] = df['Location '].str.replace(r'.*,', '', regex=True).astype('float')\n",
    "# 'Date Occurred' is still text here; it is parsed with format '%m/%d/%Y' further down.\n",
    "df['Year Occurred'] = df['Date Occurred'].str.replace(r'.*/', '', regex=True).astype('int')\n",
    "df['Month Occurred'] = ( df['Date Occurred']\n",
    "                              .str.replace(r'/.*/.*', '', regex=True).astype('int') )\n",
    "df['Day Occurred'] = ( df['Date Occurred']\n",
    "                            .str.replace(r'^\\d\\d/', '', regex=True)\n",
    "                            .str.replace(r'/.*', '', regex=True).astype('int') )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert 'Date Occurred' and 'Date Reported' from MM/DD/YYYY text to datetime\n",
    "for date_column in ('Date Occurred', 'Date Reported'):\n",
    "    df[date_column] = pd.to_datetime(df[date_column], format='%m/%d/%Y')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add column 'Weekday Occurred' (Monday=0 ... Sunday=6).\n",
    "# The vectorised .dt accessor replaces a per-row Python call.\n",
    "df['Weekday Occurred'] = df['Date Occurred'].dt.weekday"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The raw 'Location ' text is no longer needed once Latitude/Longitude exist.\n",
    "# errors='ignore' keeps this cell safe to re-run after the column is gone.\n",
    "df = df.drop(columns=['Location '], errors='ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop rows with zero lat or lon.\n",
    "# .copy() makes the result an independent frame, so adding columns to it\n",
    "# afterwards does not raise SettingWithCopyWarning.\n",
    "df = df[(df['Longitude'] != 0) & (df['Latitude'] != 0)].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Placeholder PUMA code; 0 marks rows that have not been looked up yet\n",
    "df['PUMA'] = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count of 'DR Number' entries per 'Area Name', as a two-column frame\n",
    "df_crime_area = df.groupby('Area Name')['DR Number'].count().reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Whole days between January 1, 2010 and September 11, 2018.\n",
    "# Timedelta.days is already a plain int; dividing a Timedelta by np.timedelta64\n",
    "# yields a Python float, which has no .astype().\n",
    "num_days = (pd.Timestamp('September 11, 2018') - pd.Timestamp('January 1, 2010')).days"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot the rounded number of reported crimes per day, by area name\n",
    "\n",
    "plt.figure(figsize=(16,10))\n",
    "plt.subplot(1, 1, 1)\n",
    "bars = plt.bar(x=df_crime_area['Area Name'],\n",
    "               height=np.round(df_crime_area['DR Number'] / num_days))\n",
    "# Hide all tick marks and the y tick labels; keep the x tick labels.\n",
    "# tick_params expects booleans: the strings 'off' and 'on' are both truthy.\n",
    "plt.tick_params(top=False, bottom=False, left=False,\n",
    "                right=False, labelleft=False, labelbottom=True)\n",
    "plt.title('Number of Crimes per Day - Los Angeles City\\n (street or similar premise)',\n",
    "          fontsize=28)\n",
    "# Write each bar's value in white, 0.5 below the top of the bar\n",
    "for bar in bars:\n",
    "    plt.gca().text(bar.get_x() + bar.get_width()/2, bar.get_height() - 0.5,\n",
    "                   '{:.0f}'.format(bar.get_height()),\n",
    "                   ha='center', color='w', fontsize=20)\n",
    "\n",
    "# Rotate and enlarge the x tick labels\n",
    "for item in plt.gca().xaxis.get_ticklabels():\n",
    "    item.set_rotation(70)\n",
    "    item.set_fontsize(24)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rows with 'Year Occurred' 2018 and 'Month Occurred' 1\n",
    "df_18_jan = df[df['Year Occurred'].eq(2018) & df['Month Occurred'].eq(1)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Base map centred on [33.9829, -118.3338].\n",
    "# Alternative tile set kept for reference: tiles=\"Stamen Toner\"\n",
    "LA_map_2018_jan = folium.Map(\n",
    "    location=[33.9829, -118.3338],\n",
    "    tiles='cartodbpositron',\n",
    "    zoom_start=10,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add one small red circle per row of df_18_jan; the popup shows its 'Area Name'\n",
    "for lat, lon, area_name in zip(df_18_jan['Latitude'],\n",
    "                               df_18_jan['Longitude'],\n",
    "                               df_18_jan['Area Name']):\n",
    "    marker = folium.CircleMarker([lat, lon], radius=1, color='red', popup=area_name)\n",
    "    marker.add_to(LA_map_2018_jan)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# LA_map_2018_jan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.iloc[215:220]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "\n",
    "\n",
    "def getPUMA(lonlat, timeout=30):\n",
    "    \"\"\"Look up the PUMA code for a point through the Census TIGERweb REST service.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    lonlat : str\n",
    "        Point geometry as 'longitude, latitude', e.g. \"-118.3157, 34.0454\".\n",
    "    timeout : float, optional\n",
    "        Seconds to wait for the HTTP request (default 30).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    The 'PUMA' attribute of the first feature in the response; -2 when the\n",
    "    response holds no feature or no PUMA value; -1 when the request fails\n",
    "    or comes back with an HTTP error status.\n",
    "    \"\"\"\n",
    "    params = {\n",
    "        'f': 'pjson',\n",
    "        'geometry': lonlat,\n",
    "        'geometryType': 'esriGeometryPoint',\n",
    "        # NOTE(review): 4265 is not the usual WGS84 (4326) or NAD83 (4269)\n",
    "        # code - confirm the intended spatial reference of the input points.\n",
    "        'inSR': 4265,\n",
    "        'spatialRel': 'esriSpatialRelIntersects',\n",
    "        'outFields': 'BASENAME,STATE,PUMA',\n",
    "        'returnGeometry': 'false',\n",
    "    }\n",
    "    url = 'https://tigerweb.geo.census.gov/arcgis/rest/services/TIGERweb/PUMA_TAD_TAZ_UGA_ZCTA/MapServer/0/query'\n",
    "    try:\n",
    "        # Without a timeout a single stalled request would hang a whole batch.\n",
    "        response = requests.get(url, params=params, timeout=timeout)\n",
    "    except requests.RequestException:\n",
    "        return -1\n",
    "    if not response:  # a Response is falsy for 4xx/5xx status codes\n",
    "        return -1\n",
    "    try:\n",
    "        features = response.json().get('features') or []\n",
    "    except ValueError:  # body was not valid JSON\n",
    "        return -2\n",
    "    if not features:  # no feature returned for this point\n",
    "        return -2\n",
    "    PUMA = features[0].get('attributes', {}).get('PUMA')\n",
    "    return PUMA if PUMA else -2\n",
    "\n",
    "\n",
    "getPUMA(\"-118.3157, 34.0454\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Takes too long\n",
    "###  df['PUMA'] = df[['Longitude', 'Latitude']].apply(lambda x: getPUMA(str(x[0]) + ', ' + str(x[1])), axis=1)\n",
    "#getPUMA(str(df['Longitude']) + ', ' + str(df['Latitude']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fill 'PUMA' for the first 2000 rows of df, one batch at a time.\n",
    "# Each row costs one getPUMA call, so this cell is slow.\n",
    "PUMA_LOOKUP_ROWS = 2000\n",
    "PUMA_BATCH_SIZE = 200\n",
    "\n",
    "# Address the column by name instead of relying on it being the last one.\n",
    "puma_col = df.columns.get_loc('PUMA')\n",
    "for start in range(0, PUMA_LOOKUP_ROWS, PUMA_BATCH_SIZE):\n",
    "    stop = start + PUMA_BATCH_SIZE\n",
    "    batch = df.iloc[start:stop]\n",
    "    if batch.empty:  # fewer rows than PUMA_LOOKUP_ROWS\n",
    "        break\n",
    "    # int() keeps the column numeric whatever type getPUMA hands back.\n",
    "    # NOTE(review): assumes PUMA codes consist of digits only - confirm.\n",
    "    df.iloc[start:stop, puma_col] = [\n",
    "        int(getPUMA(str(lon) + ', ' + str(lat)))\n",
    "        for lon, lat in zip(batch['Longitude'], batch['Latitude'])\n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# First 2000 rows of df\n",
    "df_2000 = df.head(2000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import dill\n",
    "# dill.dump(df_2000, open('df_LA_wPUMA_first2000.pkd', 'wb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_2000 = dill.load(open('df_LA_wPUMA_first2000.pkd', 'rb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_2000.drop(['MO Codes', 'Victim Age', 'Victim Sex','Victim Descent', \n",
    "#               'Crime Code 1', 'Crime Code 2', 'Crime Code 3','Crime Code 4'], axis=1, inplace=True)\n",
    "# df_2000.drop(['Address', 'Cross Street'], axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Crime data from Jan 2016 only.\n",
    "# .copy() makes this an independent frame, so dropping columns and assigning\n",
    "# PUMA values on it does not raise SettingWithCopyWarning.\n",
    "df_Jan2016 = df[(df['Year Occurred']==2016) & (df['Month Occurred']==1)].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove the listed columns; rebinding instead of inplace=True keeps the\n",
    "# lineage of df_Jan2016 explicit.\n",
    "df_Jan2016 = df_Jan2016.drop(\n",
    "    columns=['MO Codes', 'Victim Age', 'Victim Sex', 'Victim Descent',\n",
    "             'Crime Code 1', 'Crime Code 2', 'Crime Code 3', 'Crime Code 4',\n",
    "             'Address', 'Cross Street'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fill 'PUMA' for every row of df_Jan2016, one batch at a time.\n",
    "# Each row costs one getPUMA call, so this cell is slow.\n",
    "JAN2016_BATCH_SIZE = 500\n",
    "\n",
    "# Address the column by name instead of relying on it being the last one.\n",
    "puma_col = df_Jan2016.columns.get_loc('PUMA')\n",
    "for start in range(0, len(df_Jan2016), JAN2016_BATCH_SIZE):\n",
    "    stop = start + JAN2016_BATCH_SIZE\n",
    "    batch = df_Jan2016.iloc[start:stop]\n",
    "    # int() keeps the column numeric whatever type getPUMA hands back.\n",
    "    # NOTE(review): assumes PUMA codes consist of digits only - confirm.\n",
    "    df_Jan2016.iloc[start:stop, puma_col] = [\n",
    "        int(getPUMA(str(lon) + ', ' + str(lat)))\n",
    "        for lon, lat in zip(batch['Longitude'], batch['Latitude'])\n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import dill\n",
    "# dill.dump(df_Jan2016, open('df_LA_wPUMA_jan2016.pkd', 'wb'))\n",
    "# df_Jan2016 = dill.load(open('df_LA_wPUMA_jan2016.pkd', 'rb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_Jan2016['PUMA'] = df_Jan2016['PUMA'].astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count 'DR Number' entries per PUMA, sort ascending by that count and drop the\n",
    "# first N_LOWEST_DROPPED rows.\n",
    "# NOTE(review): presumably the dropped rows are the lookup sentinels and PUMAs\n",
    "# with very few incidents - confirm why the cut-off is 16.\n",
    "N_LOWEST_DROPPED = 16\n",
    "\n",
    "df_Jan2016_by_PUMA = (\n",
    "    df_Jan2016.groupby('PUMA')['DR Number'].count().reset_index()\n",
    "    .sort_values(by='DR Number')\n",
    "    .iloc[N_LOWEST_DROPPED:]\n",
    "    .copy()  # independent frame, so the column assignments below are warning-free\n",
    ")\n",
    "df_Jan2016_by_PUMA['PUMA'] = df_Jan2016_by_PUMA['PUMA'].astype(int)\n",
    "# Text version of the code, used as a categorical x axis when plotting\n",
    "df_Jan2016_by_PUMA['PUMA_str'] = df_Jan2016_by_PUMA['PUMA'].astype(str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot the Jan 2016 number of reported crimes by PUMA\n",
    "\n",
    "plt.figure(figsize=(16,10))\n",
    "plt.subplot(1, 1, 1)\n",
    "bars = plt.bar(x=df_Jan2016_by_PUMA['PUMA_str'],\n",
    "               height=df_Jan2016_by_PUMA['DR Number'])\n",
    "# Hide all tick marks and the y tick labels; keep the x tick labels.\n",
    "# tick_params expects booleans: the strings 'off' and 'on' are both truthy.\n",
    "plt.tick_params(top=False, bottom=False, left=False,\n",
    "                right=False, labelleft=False, labelbottom=True)\n",
    "plt.title('Jan 2016 Total Number of Crimes - Los Angeles City\\n (street or similar premise)',\n",
    "          fontsize=28)\n",
    "# Write each bar's value in black, 2.0 above the top of the bar\n",
    "for bar in bars:\n",
    "    plt.gca().text(bar.get_x() + bar.get_width()/2, bar.get_height() + 2.0,\n",
    "                   '{:.0f}'.format(bar.get_height()),\n",
    "                   ha='center', color='k', fontsize=16)\n",
    "\n",
    "# Rotate and enlarge the x tick labels\n",
    "for item in plt.gca().xaxis.get_ticklabels():\n",
    "    item.set_rotation(70)\n",
    "    item.set_fontsize(24)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(df_Jan2016_by_PUMA)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# acs_mean must exist before the merge. On a fresh kernel the cells that load\n",
    "# the ACS file sit further down the notebook, so build the per-PUMA means here.\n",
    "acs_ca = pd.read_csv('csv_hca/ss16hca.csv')\n",
    "acs_mean = acs_ca.groupby(['PUMA'])[['HINCP', 'NPF', 'GRPIP', 'NOC']].mean().reset_index()\n",
    "\n",
    "# Inner join: keep only PUMAs present in both the crime counts and the ACS means\n",
    "combined = pd.merge(df_Jan2016_by_PUMA, acs_mean, how='inner', left_on='PUMA', right_on='PUMA')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "combined"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_crimes_vs(column, xlabel):\n",
    "    \"\"\"Scatter combined['DR Number'] against combined[column] on a new figure.\n",
    "\n",
    "    column : name of a column of `combined` used for the x axis.\n",
    "    xlabel : text shown as the x-axis label.\n",
    "    \"\"\"\n",
    "    plt.figure()  # one figure per variable, so the plots do not overlay\n",
    "    plt.scatter(combined[column], combined['DR Number'])\n",
    "    plt.title('Number of Crimes (Jan 2016)', fontsize=14)\n",
    "    plt.xlabel(xlabel, fontsize=14)\n",
    "\n",
    "\n",
    "for column, xlabel in [\n",
    "    ('HINCP', 'Average Household Income'),\n",
    "    ('NPF', 'Average Number of Persons in Family'),\n",
    "    ('GRPIP', 'Rent as % of Household Income'),\n",
    "    ('NOC', 'Average Number of Own Children in Household'),\n",
    "]:\n",
    "    plot_crimes_vs(column, xlabel)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get reported American Community Survey data for California, housing level.\n",
    "# This has to run before acs_mean is computed from it.\n",
    "acs_ca = pd.read_csv('csv_hca/ss16hca.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mean of HINCP, NPF, GRPIP and NOC per PUMA\n",
    "acs_mean = acs_ca.groupby(['PUMA'])[['HINCP', 'NPF', 'GRPIP', 'NOC']].mean().reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "acs_ca.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "acs_ca.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "acs_ca.iloc[:, 4:20].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# monthly rent\n",
    "# $1 to $99999 (Rounded and top-coded)\n",
    "acs_ca['RNTP'].hist()  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Total payment on all second and junior mortgages and home equity loans (monthly amount)\n",
    "# $1 to $99999 (Rounded and top-coded)\n",
    "acs_ca['SMP'].hist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Property value\n",
    "# $1 to $9999999 (Rounded and top-coded)\n",
    "acs_ca['VALP'].hist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# When structure first built\n",
    "acs_ca['YBL'].hist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Family income (past 12 months)\n",
    "# Note: Use ADJINC to adjust FINCP to constant dollars.\n",
    "acs_ca['FINCP'].hist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Household income (past 12 months)\n",
    "# Note: Use ADJINC to adjust HINCP to constant dollars.\n",
    "acs_ca['HINCP'].hist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import this"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Gross rent (monthly amount)\n",
    "# $1 - $99999 (Components are rounded)\n",
    "# Note: Use ADJHSG to adjust GRNTP to constant dollars.\n",
    "acs_ca['GRNTP'].hist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Gross rent as a percentage of household income past 12 months\n",
    "# Value 101 stands for \"101% or more\"\n",
    "acs_ca['GRPIP'].hist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# When moved into this house or apartment\n",
    "acs_ca['MV'].hist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of own children in household (unweighted)\n",
    "acs_ca['NOC'].hist(log=True, bins=14)  # it ranges from 0 to 13, with one hh w/ 11, 12, and 13 kids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(acs_ca[acs_ca['NOC']==10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of persons in family (unweighted)\n",
    "acs_ca['NPF'].hist(log=True, bins=19)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "acs_ca['NPF'].max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Grandparent headed household with no parent present\n",
    "'''b .N/A (GQ/vacant)\n",
    "0 .Not a grandparent headed household with no parent present 1 .Grandparent headed household with no parent present'''\n",
    "acs_ca['NPP'].hist(log=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(acs_ca[acs_ca['NPP'] == 1]) *100 / len(acs_ca)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Property taxes (yearly amount)\n",
    "# $10000+(Top-coded)\n",
    "acs_ca['TAXP'].hist(bins = 68)\n",
    "# Note: No adjustment factor is applied to TAXP."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }