#import os
#import pandas as pd
#import datetime as dt

####you want to optimize the conversion and create a summary report####
def raw_converter_main(raw_dir, wrt_dir, wrt_massive=False):
    """This function is designed to condense the raw csv files, made by the
    'download_csv.py' script held by Christian Glunk and the Deepfield
    Connect team, into a more manageable form"""
    #print everything ('all') to console without a print() statement
    from IPython.core.interactiveshell import InteractiveShell
    InteractiveShell.ast_node_interactivity = 'all'

    import os
    import pandas as pd
    import datetime as dt

    os.chdir(raw_dir)
    csv_list = os.listdir(raw_dir)

    #relevant variables in the raw csvs to extract
    extract_vars = ['AN1', 'DTC', 'HM1', 'HM2',
                    'HM3', 'HM4', 'T1', 'T2',
                    'T3', 'T4', 'latitude',
                    'longitude', 'measurement_time']

    #for DevID column indexing
    b_count = 0
    p_count = 0

    #NOTE: only the first two files are processed here (debug slice?)
    for csv in csv_list[0:2]:
        with open(csv) as f:
            #import csv, parse datetime and set as index
            raw_df = pd.read_csv(f)
            #parse_dates=True

            #draw relevant columns from raw_df (copy to avoid SettingWithCopy warnings)
            df = raw_df[extract_vars].copy()

            #Numerizing instruments: the device IMEI is taken from the file name
            imei = csv[7:-4]
            df.insert(0, column='IMEI', value=imei)
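            #assumption (not from the original): the slice csv[7:-4] presumes raw file
            #names of the form '<7-char prefix><IMEI>.csv', e.g. a hypothetical
            #'rawcsv_866425030000000.csv' would yield imei == '866425030000000'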
            #rename the initial columns
            new_names = {'AN1': 'Volts', 'DTC': 'DigiTemp', 'measurement_time': 'DateTime',
                         'longitude': 'Lng', 'latitude': 'Lat'}
            df = df.rename(columns=new_names)

            print("During OPEN_csv_df")
            print(df)

            #DateTime as datetime
            df['DateTime'] = pd.to_datetime(df['DateTime'])

            #fill missing lat/lng values for the sake of row completeness
            cols = ['Lat', 'Lng']
            df[cols] = df[cols].ffill()
            df[cols] = df[cols].bfill()
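            #illustrative example (not from the original): with Lat == [NaN, 48.1, NaN, 48.2],
            #ffill gives [NaN, 48.1, 48.1, 48.2] and the subsequent bfill fills the
            #leading gap, yielding [48.1, 48.1, 48.1, 48.2]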
            #call cleaning functions
            df = rc_remove(df, imei)
            df = var_insert(df, imei)

            #write new csvs, performing some last cleaning steps
            if 'HM2' in df:
                #insert premium_count for later indexing based on individual number,
                #clear out any remaining NaNs, set DateTime as index
                """can be done at a later step"""
                #p_count += 1
                #df.insert(0, column='DevID', value=p_count)
                #df = df[pd.notnull(df['T2'])]
                #df = df[pd.notnull(df['T1'])]

                #write to wrt_dir
                print("writing premium_", imei)
                df.to_csv(wrt_dir+'premium/premium_'+imei+'.csv', index=False)
            else:
                #insert basic_count for later indexing on individual number,
                #clear out any remaining NaNs
                """can be done in a later step"""
                #b_count += 1
                #df.insert(0, column='DevID', value=b_count)
                #df = df[pd.notnull(df['T1'])]

                #write to wrt_dir
                print("writing basic_", imei)
                df.to_csv(wrt_dir+'basic/basic_'+imei+'.csv', index=False)
    #if wrt_massive:
        #call massive writing function


def rc_remove(df, imei):
    """This function removes rows, columns, and values from the raw_df.
    It also assigns non-ambiguous temp column names: in the raw files some
    temperature measurements are in T1, then switch to T2, etc."""
    import pandas as pd

    #remove rows where Voltage = Null
    df = df[pd.notnull(df['Volts'])]
    #keep the filter above during raw conversion, but
    #drop rows where voltage values are -255.00
    #(UNSURE WHY THIS '-255.0' value EXISTS)
    """df = df[df['Volts'] != -255.0]
    keep for raw conversion, but remove or change during
    manipulation and analysis"""

    #keep the columns that are at least 30% populated
    df = df.dropna(thresh=int(len(df)*.30), axis=1)
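    #illustrative example (not from the original): with 1000 rows the threshold is
    #int(1000*.30) == 300, so any column with fewer than 300 non-NaN values is dropped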
    #This removes the ambiguity in the names of the T# columns of certain csvs, in which the
    #temperature measurements go into several columns (rows with NaN values in these columns
    #can be removed in a later step)
    vars_len = len(df.columns.values)
    cols = list(df.columns)
    if vars_len < 9:
        cols[-4] = 'T1'
    else:
        cols[-4] = 'T2'
        cols[-5] = 'T1'
    #assign a fresh column index rather than mutating df.columns.values in place,
    #which pandas does not support reliably
    df.columns = cols

    print('during REMOVE df')
    return df


def var_insert(df, imei):
    """This function shifts around columns and inserts new variables
    into the dataframe"""
    import math
    from datetime import timedelta

    #insert calculated volumetric water content (VWC) measurements, and the change in volumetric
    #water content (dVWC) for two subsequent values under DateTime if their difference == 30min;
    #populate the dVWC column with the difference of the adjoining values under VWC.
    last_col = len(df.columns.values) - 1
    df.insert(last_col, column='VWC', value=None)
    df['VWC'] = .000494*(df['Volts']*1000) - .544
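    #worked example (not from the original) of the calibration line above:
    #Volts == 1.2 -> VWC = .000494*(1.2*1000) - .544 = .5928 - .544 ≈ .049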
    ##Take the following into account during manipulation
    """df.insert(last_col+1, column='dVWC', value=None)
    df.loc[(df['DateTime'].shift(-1) - df['DateTime']) == timedelta(minutes=30),
           'dVWC'] = df['VWC'].shift(-1) - df['VWC']
    #Then shift those values down by one row, remove the top row holding NaN,
    #and fill the rest of the NaNs with the preceding real value
    df['dVWC'] = df['dVWC'].shift(+1)
    df = df[1:]
    df['dVWC'] = df['dVWC'].ffill()"""

    #insert 'Month', 'Week', 'Day' columns... these can be set as indices
    #used for simpler visualization  ##better to add during other algos
    """df.insert(1, column="Month", value=df['DateTime'].dt.strftime('%b'))
    df.insert(2, column="Week", value=df['DateTime'].dt.week)
    df.insert(3, column="Day", value=df['DateTime'].dt.day)"""

    return df


raw_dir = 'C:/Users/MAR8RNG/BOSCH-PROJECTS/1-Humidity/python/raw_CSVs/'
wrt_dir = 'C:/Users/MAR8RNG/BOSCH-PROJECTS/1-Humidity/python/condensed_CSVs/'
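#the writer expects 'premium/' and 'basic/' subfolders under wrt_dir, and pandas'
#to_csv will not create missing directories; as a minimal sketch (not part of the
#original script), create them up front:
import os
os.makedirs(wrt_dir + 'premium', exist_ok=True)
os.makedirs(wrt_dir + 'basic', exist_ok=True)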
raw_converter_main(raw_dir, wrt_dir)