Last active
July 21, 2019 21:04
-
-
Save shuozhang1985/c1acdbee962b412e4127be02bd05d6a1 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
import os | |
#### yellow taxi: feature selection
os.chdir("/Users/shuozhang/Desktop/capstone/yellow taxi")
yellow_raw = pd.read_csv("yellow_tripdata_2014-05.csv", index_col=False)
# strip spaces from column names
yellow_raw.columns = [c.replace(" ", "") for c in yellow_raw.columns]
# .copy() so the column assignments below do not trigger pandas'
# chained-assignment (SettingWithCopy) warning on a view of yellow_raw
yellow = yellow_raw[['pickup_datetime', 'pickup_latitude', 'pickup_longitude']].copy()
yellow.columns = ['Date', 'Lat', 'Lon']  # rename for future merge
yellow["Type"] = "yellow"  # EDA analysis label
# keep the "YYYY-MM-DD HH" prefix and normalize the separator to "-",
# producing the "YYYY-MM-DD-HH" key used to merge with weather
yellow["Date"] = yellow["Date"].str[0:13].replace(r"\s+", "-", regex=True)
#### green taxi: feature selection
os.chdir("/Users/shuozhang/Desktop/capstone/green taxi")
green_raw = pd.read_csv("green_tripdata_2014-05.csv", index_col=False)
# strip spaces from column names
green_raw.columns = [c.replace(" ", "") for c in green_raw.columns]
# the first 3 columns of this file are mislabeled: the pickup datetime
# actually arrives under 'VendorID', so copy it into the right column
green_raw['lpep_pickup_datetime'] = green_raw['VendorID']
green_raw.index = np.arange(green_raw.shape[0])  # rebuild the broken index
# .copy() so the column assignments below do not trigger pandas'
# chained-assignment (SettingWithCopy) warning on a view of green_raw
green = green_raw[['lpep_pickup_datetime', 'Pickup_longitude', 'Pickup_latitude']].copy()
green.columns = ['Date', 'Lon', 'Lat']  # rename for future merge
green["Type"] = "green"  # EDA analysis label
# keep the "YYYY-MM-DD HH" prefix and normalize to "YYYY-MM-DD-HH"
green["Date"] = green["Date"].str[0:13].replace(r"\s+", "-", regex=True)
#### uber: feature selection
# bug fix: index_col is a pd.read_csv argument, not an os.chdir one —
# the original os.chdir(..., index_col=False) raises TypeError
os.chdir("/Users/shuozhang/Desktop/capstone/uber")
uber = pd.read_csv("uber-raw-data-may14.csv", index_col=False)
uber.columns = ['Date', 'Lat', 'Lon', 'Base']
uber.drop(['Base'], axis=1, inplace=True)  # dispatch base is not needed
uber["Type"] = "uber"  # EDA analysis label
# normalize to the "YYYY-MM-DD-HH" key used by the taxi frames
uber["Date"] = pd.to_datetime(uber["Date"]).dt.strftime('%Y-%m-%d-%H')
#### weather: feature selection
os.chdir("/Users/shuozhang/Desktop/capstone")
weather = pd.read_csv("weather_2014_04.csv", index_col=False)
# strip spaces from column names
weather.columns = [c.replace(" ", "") for c in weather.columns]
weather = weather.drop_duplicates(subset='IDX', keep='first')
# parse the raw YYYYMMDDHHMM stamp and reformat straight to the hourly
# merge key; the original re-ran pd.to_datetime on the already-parsed
# column a second time for no gain
weather["YR--MODAHRMN"] = pd.to_datetime(
    weather["YR--MODAHRMN"], format="%Y%m%d%H%M"
).dt.strftime('%Y-%m-%d-%H')
# drop unused measurements, skipping any that are absent from this file
weather.drop([col for col in ["SPD", "DEWP", "IDX", "SD"] if col in weather],
             axis=1, inplace=True)
weather.columns = ['Date', 'Temp', 'Rain']
weather = weather.fillna(0)  # treat missing Temp/Rain as 0
#### data merge
# stack the three ride sources, then attach hourly weather via the
# shared "YYYY-MM-DD-HH" Date key (left join keeps every ride row)
rides = pd.concat([yellow, green, uber], axis=0)
dataall = rides.merge(weather, how='left', on=['Date'])
dataall.to_csv('may_data_all.csv', sep='\t')
#### apply the same process to april, june, july, august, september
#### dimension reduction: transfer coordinates to zipcode
os.chdir("/Users/shuozhang/Desktop/data")
raw_data = pd.read_csv('may_data_all.csv', sep='\t', index_col=False)
# bug fix: the original filtered on undefined names may.Lon/may.Lat —
# the frame just read is raw_data. Vectorized bounding box covering a
# wide range around New York City.
in_nyc = raw_data['Lon'].between(-75, -72) & raw_data['Lat'].between(40, 41.5)
data = raw_data.loc[in_nyc].copy()
# round lat/lon to 3 decimals to reduce the number of geocoding API calls;
# Series.round replaces the py2-only map() objects of the original
data['Lat'] = data['Lat'].round(3).astype(str)
data['Lon'] = data['Lon'].astype(float).round(3).astype(str)
# "lat,lon" key used to join geocoded zipcodes back on later
data["Coord"] = data["Lat"] + "," + data["Lon"]
# bug fix: to_csv takes index=, not index_col= (a read_csv argument)
data.to_csv('may_data.csv', sep='\t', index=False)
##### apply the same process to april, june, july, august, september
#### Google map API call
apr = pd.read_csv('apr_data.csv', sep='\t', index_col=False)
may = pd.read_csv('may_data.csv', sep='\t', index_col=False)
june = pd.read_csv('june_data.csv', sep='\t', index_col=False)
july = pd.read_csv('july_data.csv', sep='\t', index_col=False)
aug = pd.read_csv('aug_data.csv', sep='\t', index_col=False)
sep = pd.read_csv('sep_data.csv', sep='\t', index_col=False)
# unique "lat,lon" strings across all months; pd.unique preserves
# first-seen order, unlike the original set(), so the geocoding output
# order is reproducible between runs
ls_coord = pd.concat(
    [m["Coord"] for m in (apr, may, june, july, aug, sep)],
    ignore_index=True,
)
df = pd.DataFrame(ls_coord.unique())
#### use google map api to read zipcode
from geopy.geocoders import Nominatim
import re
import time
import csv

geolocator = Nominatim()
# open the output file once instead of reopening it on every iteration
with open(r"zipcode_output.csv", 'a') as f:
    writer = csv.writer(f, delimiter='\t')
    # bug fix: the original indexed df[n], which selects a COLUMN label,
    # not a row — it raises KeyError for n >= 1. df has a single column
    # named 0 holding the "lat,lon" strings; iterate that column.
    for coord in df[0]:
        location = geolocator.reverse(coord, timeout=50)
        # guard the None geocoder result too — the original dereferenced
        # location.address before checking, which crashes on a miss
        if location is None or location.address is None:
            zipcode = "NaN"
        else:
            # first 5-digit group bounded by non-digits = the US ZIP code;
            # replaces the py2-only str(filter(str.isdigit, ...)) dance
            matches = re.findall(r"\D(\d{5})\D", location.address)
            zipcode = matches[0] if matches else "NaN"
        output = str(coord) + "," + zipcode
        print(output)
        time.sleep(1)  # throttle to respect the geocoder's rate limit
        writer.writerow([output])
#### data merge
# bug fix: the geocoding loop writes 'zipcode_output.csv'; the original
# read 'zipoutput.csv' — use the same filename so the pipeline connects.
# dtype=str (not numpy 'S10' bytes) keeps leading zeros AND lets .isin
# compare against str under Python 3.
zipcode = pd.read_csv('zipcode_output.csv', sep='\t', index_col=False,
                      dtype={'zipcode': str})
may_raw = pd.read_csv('may_data.csv', sep='\t', index_col=False)
may = pd.merge(may_raw, zipcode, how='left', on=['Coord'])
#### apply the same process to april, june, july, august, september
nyc_zipcode = pd.read_csv('nyc_zipcode.csv', sep='\t', index_col=False,
                          dtype={'Zipcode': str})
may = may[may['zipcode'].isin(nyc_zipcode['Zipcode'])]  # filter nyc-only zipcode
# demand = ride count per (hour, weather, zipcode) cell
may_aggre = may.groupby(['Date', 'Temp', 'Rain', 'zipcode']).size().reset_index(name='count')
# bug fix: to_csv takes index=, not index_col= (a read_csv argument)
may_aggre.to_csv('may_final_data.csv', sep='\t', index=False)
#### apply the same process to april, june, july, august, september, and finally combine together as data_final
#### add sub features
import numpy as np
import pandas as pd
import os

os.chdir("/Users/shuozhang/Desktop/data")
# dtype=str (not numpy 'S10' bytes) keeps zipcodes as plain strings under py3
data_final = pd.read_csv('data_all_final.csv', sep='\t', encoding='utf-8',
                         index_col=0, dtype={'zipcode': str})
date = pd.to_datetime(data_final['Date'], format='%Y-%m-%d-%H')
# calendar sub-features for the demand model
data_final['Month'] = date.dt.month
data_final['Day'] = date.dt.day
data_final['Hour'] = date.dt.hour
# .dt.weekday gives Monday=0 directly; no DatetimeIndex detour needed
data_final['Weekday'] = date.dt.weekday
data_model = data_final[['Month', 'Weekday', 'Hour', 'Temp', 'Rain', 'zipcode', 'count']]
# bug fix: to_csv takes index=, not index_col= (a read_csv argument)
data_model.to_csv('nycmodeldata.csv', sep='\t', encoding='utf-8', index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Why are you filtering the zipcode at line 120?