Created
April 16, 2020 17:27
-
-
Save romeokienzler/badcd9a08afc8e360d8053d9c0ca157a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": "# The code was removed by Watson Studio for sharing." | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": "import types\nimport pandas as pd\nfrom botocore.client import Config\nimport ibm_boto3\n\n\ndef __iter__(self):\n    # Module-level stub that always returns 0; no visible cell calls it.\n    return 0\n\n\n# NOTE(review): credentials_1 is not defined in any visible cell - presumably it was set in the\n# first cell, whose code Watson Studio removed before sharing. Confirm before a fresh-kernel run.\nclient = ibm_boto3.client(\n    service_name='s3',\n    ibm_api_key_id=credentials_1['IBM_API_KEY_ID'],\n    ibm_auth_endpoint=credentials_1['IBM_AUTH_ENDPOINT'],\n    config=Config(signature_version='oauth'),\n    endpoint_url=credentials_1['ENDPOINT'],\n)\n\n# Bucket name used by the download and upload cells.\nbucket = credentials_1['BUCKET']" | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": "# Fetch the object stored under key 'cases.csv' in the bucket and write it to a local file of the same name.\nclient.download_file(\n    Filename='cases.csv',\n    Key='cases.csv',\n    Bucket=bucket,\n)" | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": "# Parse the downloaded file; the feature-engineering cell starts from df_raw.\ndf_raw = pd.read_csv(filepath_or_buffer='cases.csv')" | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": "['Austria',\n 'Belgium',\n 'Canada',\n 'Czechia',\n 'Denmark',\n 'Dominican_Republic',\n 'Ecuador',\n 'France',\n 'Germany',\n 'Hungary',\n 'Iran',\n 'Ireland',\n 'Israel',\n 'Italy',\n 'Netherlands',\n 'Norway',\n 'Portugal',\n 'Romania',\n 'Spain',\n 'Sweden',\n 'Switzerland',\n 'Turkey',\n 'United_Kingdom',\n 'United_States_of_America']" | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": "window_size = 10\ncountry_col = 'countriesAndTerritories'\n\n\ndef per_country(frame, column):\n    # Per-country groupby view of a single column, so shift/cumsum/sum never cross a country boundary.\n    return frame.groupby(country_col)[column]\n\n\n# sort_values returns a new frame, so df_raw is left untouched and this cell can be re-run.\n# Rows stay ordered by date only; the per-country grouping below keeps each lag inside one country.\ndf = df_raw.sort_values(['year', 'month', 'day'], ascending=[True, True, True])\n\n#df = df[df['countriesAndTerritories']=='Italy']\n\ndf['ts'] = pd.to_datetime(df[['year', 'month', 'day']])\n\n# A plain Series.shift(1) on this date-ordered frame takes the previous ROW, which usually belongs\n# to a different country. Shifting inside each country group compares a country with itself.\n# NOTE(review): a lag of i rows equals i days only if every country has exactly one row per day\n# with no gaps - confirm for cases.csv.\nprevious_cases = per_country(df, 'cases').shift(1)\ndf['increase_cases'] = df['cases'] - previous_cases\ndf['increase_ratio'] = df['cases'] / previous_cases\n\n# NOTE(review): deaths*100 looks like a fixed 1% fatality-rate assumption - confirm.\ndf['cases_estimated'] = df['deaths'] * 100\n\n# Running total per country, not across the whole frame.\ndf['total_cases_estimated'] = per_country(df, 'cases_estimated').cumsum()\n\n# NOTE(review): column '2018' is presumably the 2018 population of the country - confirm.\ndf['percentage_infected'] = 100 / df['2018'] * df['total_cases_estimated']\ndf['percentage_died'] = 100 / df['2018'] * df['deaths']\n\nlagged_columns = ['cases', 'percentage_died', 'deaths']\nlags = range(1, window_size + 1)\n\n# Lag features: the value of each column i rows earlier for the same country.\nfor column in lagged_columns:\n    for i in lags:\n        df[column + '_' + str(i) + '_days_before'] = per_country(df, column).shift(i)\n\n# Mean of the previous window_size lags (the current row is excluded).\n# The result is NaN wherever a country has fewer than window_size earlier rows.\nfor column in lagged_columns:\n    lag_total = sum(df[column + '_' + str(i) + '_days_before'] for i in lags)\n    df[column + '_' + str(window_size) + '_day_average'] = lag_total / window_size\n\n#for i in range(1,window_size+1):\n#    df = df[df['deaths_'+str(i)+'_days_before']>0]\n\n#df = df[df['Cases']>100]\n\n#df = df[df['ts']>'2020-04-01']\n\n# Select the column before summing so that non-numeric columns such as ts are never aggregated.\ngroup_by_deaths = per_country(df, 'percentage_died').sum()\nbad_countries = list(group_by_deaths[group_by_deaths > 0.001].index)\n\n#df = df[df['countriesAndTerritories'].isin(bad_countries)]\n#df\nbad_countries\n\n\n#Germany 1607\n#Italy 15253\n#Netherlands 1538\n#Spain 11570\n#United_Kingdom 4972\n#United_States_of_America 10973" | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": "# Write the feature frame to a local CSV; the DataFrame index is written as the first column.\ndf.to_csv(path_or_buf='cases_features.csv')" | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": "# Upload the local feature file to the bucket under the key 'cases_features.csv'.\nclient.upload_file(\n    Bucket=bucket,\n    Key='cases_features.csv',\n    Filename='cases_features.csv',\n)" | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3.6", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.9" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi Romeo,
I have a question about section 5: you first sort the CSV by date and then use the shift function to calculate the increase_cases column. Doesn't that compare cases from different countries rather than from different days? As I read it, I would assume we want to compare the cases within the same country.