Last active
July 21, 2019 21:04
-
-
Save shuozhang1985/c1acdbee962b412e4127be02bd05d6a1 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
import os | |
#### yellow taxi: feature selection
os.chdir("/Users/shuozhang/Desktop/capstone/yellow taxi")
yellow_raw = pd.read_csv("yellow_tripdata_2014-05.csv", index_col=False)
# strip spaces from column names
yellow_raw.columns = [c.replace(" ", "") for c in yellow_raw.columns]
# .copy() so the column assignments below do not trigger pandas'
# chained-assignment (SettingWithCopy) warning on a view of yellow_raw
yellow = yellow_raw[['pickup_datetime', 'pickup_latitude', 'pickup_longitude']].copy()
yellow.columns = ['Date', 'Lat', 'Lon']  # rename for future merge
yellow["Type"] = "yellow"  # EDA analysis label
# keep the "YYYY-MM-DD HH" prefix and normalize the separator to "-",
# producing the "YYYY-MM-DD-HH" key used to merge with weather
yellow["Date"] = yellow["Date"].str[0:13].replace(r"\s+", "-", regex=True)
#### green taxi: feature selection
os.chdir("/Users/shuozhang/Desktop/capstone/green taxi")
green_raw = pd.read_csv("green_tripdata_2014-05.csv", index_col=False)
# strip spaces from column names
green_raw.columns = [c.replace(" ", "") for c in green_raw.columns]
# the first 3 columns of this file are mislabeled: the pickup datetime
# actually arrives under 'VendorID', so copy it into the right column
green_raw['lpep_pickup_datetime'] = green_raw['VendorID']
green_raw.index = np.arange(green_raw.shape[0])  # rebuild the broken index
# .copy() so the column assignments below do not trigger pandas'
# chained-assignment (SettingWithCopy) warning on a view of green_raw
green = green_raw[['lpep_pickup_datetime', 'Pickup_longitude', 'Pickup_latitude']].copy()
green.columns = ['Date', 'Lon', 'Lat']  # rename for future merge
green["Type"] = "green"  # EDA analysis label
# keep the "YYYY-MM-DD HH" prefix and normalize to "YYYY-MM-DD-HH"
green["Date"] = green["Date"].str[0:13].replace(r"\s+", "-", regex=True)
#### uber: feature selection
# bug fix: index_col is a pd.read_csv argument, not an os.chdir one —
# the original os.chdir(..., index_col=False) raises TypeError
os.chdir("/Users/shuozhang/Desktop/capstone/uber")
uber = pd.read_csv("uber-raw-data-may14.csv", index_col=False)
uber.columns = ['Date', 'Lat', 'Lon', 'Base']
uber.drop(['Base'], axis=1, inplace=True)  # dispatch base is not needed
uber["Type"] = "uber"  # EDA analysis label
# normalize to the "YYYY-MM-DD-HH" key used by the taxi frames
uber["Date"] = pd.to_datetime(uber["Date"]).dt.strftime('%Y-%m-%d-%H')
#### weather: feature selection
os.chdir("/Users/shuozhang/Desktop/capstone")
weather = pd.read_csv("weather_2014_04.csv", index_col=False)
# strip spaces from column names
weather.columns = [c.replace(" ", "") for c in weather.columns]
weather = weather.drop_duplicates(subset='IDX', keep='first')
# parse the raw YYYYMMDDHHMM stamp and reformat straight to the hourly
# merge key; the original re-ran pd.to_datetime on the already-parsed
# column a second time for no gain
weather["YR--MODAHRMN"] = pd.to_datetime(
    weather["YR--MODAHRMN"], format="%Y%m%d%H%M"
).dt.strftime('%Y-%m-%d-%H')
# drop unused measurements, skipping any that are absent from this file
weather.drop([col for col in ["SPD", "DEWP", "IDX", "SD"] if col in weather],
             axis=1, inplace=True)
weather.columns = ['Date', 'Temp', 'Rain']
weather = weather.fillna(0)  # treat missing Temp/Rain as 0
#### data merge
# stack the three ride sources, then attach hourly weather via the
# shared "YYYY-MM-DD-HH" Date key (left join keeps every ride row)
rides = pd.concat([yellow, green, uber], axis=0)
dataall = rides.merge(weather, how='left', on=['Date'])
dataall.to_csv('may_data_all.csv', sep='\t')
#### apply the same process to april, june, july, august, september
#### dimension reduction: transfer coordinates to zipcode
os.chdir("/Users/shuozhang/Desktop/data")
raw_data = pd.read_csv('may_data_all.csv', sep='\t', index_col=False)
# bug fix: the original filtered on undefined names may.Lon/may.Lat —
# the frame just read is raw_data. Vectorized bounding box covering a
# wide range around New York City.
in_nyc = raw_data['Lon'].between(-75, -72) & raw_data['Lat'].between(40, 41.5)
data = raw_data.loc[in_nyc].copy()
# round lat/lon to 3 decimals to reduce the number of geocoding API calls;
# Series.round replaces the py2-only map() objects of the original
data['Lat'] = data['Lat'].round(3).astype(str)
data['Lon'] = data['Lon'].astype(float).round(3).astype(str)
# "lat,lon" key used to join geocoded zipcodes back on later
data["Coord"] = data["Lat"] + "," + data["Lon"]
# bug fix: to_csv takes index=, not index_col= (a read_csv argument)
data.to_csv('may_data.csv', sep='\t', index=False)
##### apply the same process to april, june, july, august, september
#### Google map API call
apr = pd.read_csv('apr_data.csv', sep='\t', index_col=False)
may = pd.read_csv('may_data.csv', sep='\t', index_col=False)
june = pd.read_csv('june_data.csv', sep='\t', index_col=False)
july = pd.read_csv('july_data.csv', sep='\t', index_col=False)
aug = pd.read_csv('aug_data.csv', sep='\t', index_col=False)
sep = pd.read_csv('sep_data.csv', sep='\t', index_col=False)
# unique "lat,lon" strings across all months; pd.unique preserves
# first-seen order, unlike the original set(), so the geocoding output
# order is reproducible between runs
ls_coord = pd.concat(
    [m["Coord"] for m in (apr, may, june, july, aug, sep)],
    ignore_index=True,
)
df = pd.DataFrame(ls_coord.unique())
#### use google map api to read zipcode
from geopy.geocoders import Nominatim
import re
import time
import csv

geolocator = Nominatim()
# open the output file once instead of reopening it on every iteration
with open(r"zipcode_output.csv", 'a') as f:
    writer = csv.writer(f, delimiter='\t')
    # bug fix: the original indexed df[n], which selects a COLUMN label,
    # not a row — it raises KeyError for n >= 1. df has a single column
    # named 0 holding the "lat,lon" strings; iterate that column.
    for coord in df[0]:
        location = geolocator.reverse(coord, timeout=50)
        # guard the None geocoder result too — the original dereferenced
        # location.address before checking, which crashes on a miss
        if location is None or location.address is None:
            zipcode = "NaN"
        else:
            # first 5-digit group bounded by non-digits = the US ZIP code;
            # replaces the py2-only str(filter(str.isdigit, ...)) dance
            matches = re.findall(r"\D(\d{5})\D", location.address)
            zipcode = matches[0] if matches else "NaN"
        output = str(coord) + "," + zipcode
        print(output)
        time.sleep(1)  # throttle to respect the geocoder's rate limit
        writer.writerow([output])
#### data merge
# bug fix: the geocoding loop writes 'zipcode_output.csv'; the original
# read 'zipoutput.csv' — use the same filename so the pipeline connects.
# dtype=str (not numpy 'S10' bytes) keeps leading zeros AND lets .isin
# compare against str under Python 3.
zipcode = pd.read_csv('zipcode_output.csv', sep='\t', index_col=False,
                      dtype={'zipcode': str})
may_raw = pd.read_csv('may_data.csv', sep='\t', index_col=False)
may = pd.merge(may_raw, zipcode, how='left', on=['Coord'])
#### apply the same process to april, june, july, august, september
nyc_zipcode = pd.read_csv('nyc_zipcode.csv', sep='\t', index_col=False,
                          dtype={'Zipcode': str})
may = may[may['zipcode'].isin(nyc_zipcode['Zipcode'])]  # filter nyc-only zipcode
# demand = ride count per (hour, weather, zipcode) cell
may_aggre = may.groupby(['Date', 'Temp', 'Rain', 'zipcode']).size().reset_index(name='count')
# bug fix: to_csv takes index=, not index_col= (a read_csv argument)
may_aggre.to_csv('may_final_data.csv', sep='\t', index=False)
#### apply the same process to april, june, july, august, september, and finally combine together as data_final
#### add sub features
import numpy as np
import pandas as pd
import os

os.chdir("/Users/shuozhang/Desktop/data")
# dtype=str (not numpy 'S10' bytes) keeps zipcodes as plain strings under py3
data_final = pd.read_csv('data_all_final.csv', sep='\t', encoding='utf-8',
                         index_col=0, dtype={'zipcode': str})
date = pd.to_datetime(data_final['Date'], format='%Y-%m-%d-%H')
# calendar sub-features for the demand model
data_final['Month'] = date.dt.month
data_final['Day'] = date.dt.day
data_final['Hour'] = date.dt.hour
# .dt.weekday gives Monday=0 directly; no DatetimeIndex detour needed
data_final['Weekday'] = date.dt.weekday
data_model = data_final[['Month', 'Weekday', 'Hour', 'Temp', 'Rain', 'zipcode', 'count']]
# bug fix: to_csv takes index=, not index_col= (a read_csv argument)
data_model.to_csv('nycmodeldata.csv', sep='\t', encoding='utf-8', index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Why are you filtering the zipcode at line 120?