Last active
June 24, 2024 12:33
-
-
Save rcsmit/8f2954eef8009c747df47dbca885e32e to your computer and use it in GitHub Desktop.
Get from Eurostat : Deaths by week, sex and 5-year age group
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_data_eurostat(): | |
"""Get from Eurostat : Deaths by week, sex and 5-year age group | |
Data from https://ec.europa.eu/eurostat/databrowser/product/view/demo_r_mwk_05?lang=en | |
https://ec.europa.eu/eurostat/databrowser/bookmark/fbd80cd8-7b96-4ad9-98be-1358dd80f191?lang=en | |
https://ec.europa.eu/eurostat/api/dissemination/sdmx/2.1/dataflow/ESTAT/DEMO_R_MWK_05/1.0?references=descendants&detail=referencepartial&format=sdmx_2.1_generic&compressed=true | |
Returns: | |
df: dataframe with weekly mortality in 5 year tranches | |
DATAFLOW LAST UPDATE freq age sex ... OBS_VALUE OBS_FLAG age_sex jaar weeknr | |
0 ESTAT:DEMO_R_MWK_05(1.0) 24/06/24 23:00:00 W TOTAL F ... 1868.0 p TOTAL_F 2000 1 | |
1 ESTAT:DEMO_R_MWK_05(1.0) 24/06/24 23:00:00 W TOTAL M ... 1699.0 p TOTAL_M 2000 1 | |
2 ESTAT:DEMO_R_MWK_05(1.0) 24/06/24 23:00:00 W TOTAL T ... 3567.0 p TOTAL_T 2000 1 | |
... | |
80071 ESTAT:DEMO_R_MWK_05(1.0) 24/06/24 23:00:00 W Y_LT5 M ... 8.0 p Y_LT5_M 2024 19 | |
80072 ESTAT:DEMO_R_MWK_05(1.0) 24/06/24 23:00:00 W Y_LT5 T ... 12.0 p Y_LT5_T 2024 19 | |
W = Weekly | |
NL = Netherlands | |
NR = Number | |
P = Provisory | |
""" | |
code = "DEMO_R_MWK_05" | |
# ['freq', 'age', 'sex', 'unit', 'geo'] | |
# pars = eurostat.get_pars(code) | |
# result : ['freq', 'age', 'sex', 'unit', 'geo'] | |
# for p in pars: | |
# par_values = eurostat.get_par_values(code,p) | |
# print (f"{p} ------------------------------") | |
# print (par_values) | |
my_filter_pars = {'beginPeriod': 2015, 'geo': ['NL']} # beginPeriod is ignored somehow | |
flags = True | |
if flags: | |
df = eurostat.get_data_df(code, flags=True, filter_pars=my_filter_pars, verbose=True, reverse_time=False) | |
print (df) | |
# Identify value and flag columns | |
value_columns = [col for col in df.columns if col.endswith('_value')] | |
flag_columns = [col for col in df.columns if col.endswith('_flag')] | |
# Melt the value columns | |
df_values = df.melt(id_vars=['freq', 'age', 'sex', 'unit', 'geo\\TIME_PERIOD'], | |
value_vars=value_columns, | |
var_name='TIME_PERIOD', value_name='OBS_VALUE') | |
# Remove '_value' suffix from TIME_PERIOD column | |
df_values['TIME_PERIOD'] = df_values['TIME_PERIOD'].str.replace('_value', '') | |
# Melt the flag columns | |
df_flags = df.melt(id_vars=['freq', 'age', 'sex', 'unit', 'geo\\TIME_PERIOD'], | |
value_vars=flag_columns, | |
var_name='TIME_PERIOD', value_name='OBS_FLAG') | |
# Remove '_flag' suffix from TIME_PERIOD column | |
df_flags['TIME_PERIOD'] = df_flags['TIME_PERIOD'].str.replace('_flag', '') | |
# Merge the values and flags dataframes | |
df_long = pd.merge(df_values, df_flags, on=['freq', 'age', 'sex', 'unit', 'geo\\TIME_PERIOD', 'TIME_PERIOD']) | |
# Add additional columns | |
df_long['DATAFLOW'] = 'ESTAT:DEMO_R_MWK_05(1.0)' | |
df_long['LAST UPDATE'] = '14/06/24 23:00:00' | |
# Rename the columns to match the desired output | |
df_long.rename(columns={'geo\\TIME_PERIOD': 'geo'}, inplace=True) | |
# Filter out rows with None values in OBS_VALUE | |
df_long = df_long[df_long['OBS_VALUE'].notna()] | |
# Reorder the columns | |
df_long = df_long[['DATAFLOW', 'LAST UPDATE', 'freq', 'age', 'sex', 'unit', 'geo', 'TIME_PERIOD', 'OBS_VALUE', 'OBS_FLAG']] | |
else: | |
df = eurostat.get_data_df(code, flags=False, filter_pars=my_filter_pars, verbose=True, reverse_time=False) | |
print (df) | |
# Melt the dataframe to long format | |
df_long = df.melt(id_vars=['freq', 'age', 'sex', 'unit', r'geo\TIME_PERIOD'], | |
var_name='TIME_PERIOD', value_name='OBS_VALUE') | |
# Add additional columns, made to be reverse compatible with older code | |
df_long['DATAFLOW'] = 'ESTAT:DEMO_R_MWK_05(1.0)' | |
df_long['LAST UPDATE'] = '24/06/24 23:00:00' | |
#df_long['OBS_FLAG'] = 'p' | |
# Rename the columns to match the desired output | |
df_long.rename(columns={'geo\\TIME_PERIOD': 'geo'}, inplace=True) | |
# Filter out rows with None values in OBS_VALUE | |
df_long = df_long[df_long['OBS_VALUE'].notna()] | |
# Reorder the columns | |
df_long = df_long[['DATAFLOW', 'LAST UPDATE', 'freq', 'age', 'sex', 'unit', 'geo', 'TIME_PERIOD', 'OBS_VALUE', 'OBS_FLAG']] | |
df_long["age_sex"] = df_long["age"] + "_" +df_long["sex"] | |
df_long["jaar"] = (df_long["TIME_PERIOD"].str[:4]).astype(int) | |
df_long["weeknr"] = (df_long["TIME_PERIOD"].str[6:]).astype(int) | |
# Display the resulting dataframe | |
print (df_long) | |
return (df_long) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment