Created
August 9, 2023 17:32
-
-
Save MaxHalford/5d3da23bbf8bf76e9f23c62a4f2539a9 to your computer and use it in GitHub Desktop.
Metric decomposition
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Metric decomposition" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"- https://docs.google.com/spreadsheets/d/1-hYJesNCMlCyANPOPeLijg1sCfv-6nREPtkHtTVEwd4/edit#gid=0\n", | |
"- https://observablehq.com/@carbonfact/diff" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"https://chat.openai.com/share/6a5e1c60-b9a2-42bb-9b23-39e61358b577" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 49, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[0.09999999999999999, 0.068]" | |
] | |
}, | |
"execution_count": 49, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# denominators\n", | |
"D = [\n", | |
" [1000, 2000],\n", | |
" [800, 1200]\n", | |
"]\n", | |
"# shares\n", | |
"S = [\n", | |
" [dj / sum(d) for j, dj in enumerate(d)]\n", | |
" for i, d in enumerate(D)\n", | |
"]\n", | |
"# numerators\n", | |
"N = [\n", | |
" [150, 150],\n", | |
" [88, 48]\n", | |
"]\n", | |
"# ratios\n", | |
"R = [\n", | |
" [ni / di for ni, di in zip(n, d)]\n", | |
" for n, d in zip(N, D)\n", | |
"]\n", | |
"# shares x ratios\n", | |
"SR = [\n", | |
" [ri * si for ri, si in zip(r, s)]\n", | |
" for r, s in zip(R, S)\n", | |
"]\n", | |
"# global means\n", | |
"M = list(map(sum, SR))\n", | |
"M" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 60, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"-0.03199999999999999" | |
] | |
}, | |
"execution_count": 60, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"inner = [\n", | |
" S[1][0] * (R[1][0] - R[0][0]),\n", | |
" S[1][1] * (R[1][1] - R[0][1]),\n", | |
"]\n", | |
"mix = [\n", | |
" (S[1][0] - S[0][0]) * (R[0][0] - M[0]),\n", | |
" (S[1][1] - S[0][1]) * (R[0][1] - M[0])\n", | |
"]\n", | |
"sum(inner) + sum(mix)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 67, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[0.09999999999999999, 0.068]" | |
] | |
}, | |
"execution_count": 67, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"KPI = [\n", | |
" sum(ri * si for ri, si in zip(r, s))\n", | |
" for r, s in zip(R, S)\n", | |
"]\n", | |
"KPI" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 70, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.068" | |
] | |
}, | |
"execution_count": 70, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"(\n", | |
" KPI[0] +\n", | |
" sum(S[1][i] * (R[1][i] - R[0][i]) for i in range(2)) +\n", | |
" sum((S[1][i] - S[0][i]) * (R[0][i] - M[0]) for i in range(2))\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"True" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# denominators\n", | |
"D = [\n", | |
" [1000, 2000],\n", | |
" [800, 1200]\n", | |
"]\n", | |
"# shares\n", | |
"S = [\n", | |
" [dj / sum(d) for j, dj in enumerate(d)]\n", | |
" for i, d in enumerate(D)\n", | |
"]\n", | |
"# numerators\n", | |
"N = [\n", | |
" [150, 150],\n", | |
" [88, 48]\n", | |
"]\n", | |
"# ratios\n", | |
"R = [\n", | |
" [ni / di for ni, di in zip(n, d)]\n", | |
" for n, d in zip(N, D)\n", | |
"]\n", | |
"# shares x ratios\n", | |
"SR = [\n", | |
" [ri * si for ri, si in zip(r, s)]\n", | |
" for r, s in zip(R, S)\n", | |
"]\n", | |
"# global means\n", | |
"M = list(map(sum, SR))\n", | |
"# KPIs\n", | |
"KPI = [\n", | |
" sum(ri * si for ri, si in zip(r, s))\n", | |
" for r, s in zip(R, S)\n", | |
"]\n", | |
"# decomposed KPI\n", | |
"KPI[1] == (\n", | |
" KPI[0] +\n", | |
" sum(S[1][i] * (R[1][i] - R[0][i]) for i in range(2)) +\n", | |
" sum((S[1][i] - S[0][i]) * (R[0][i] - KPI[0]) for i in range(2))\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.068" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"M[1]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.068" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"KPI[1]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"---" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>person</th>\n", | |
" <th>claim_type</th>\n", | |
" <th>date</th>\n", | |
" <th>year</th>\n", | |
" <th>amount</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>John</td>\n", | |
" <td>Dentist</td>\n", | |
" <td>2021-04-08</td>\n", | |
" <td>2021</td>\n", | |
" <td>129.66</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>Jane</td>\n", | |
" <td>Dentist</td>\n", | |
" <td>2021-09-03</td>\n", | |
" <td>2021</td>\n", | |
" <td>127.07</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>Jane</td>\n", | |
" <td>Physiotherapy</td>\n", | |
" <td>2021-02-07</td>\n", | |
" <td>2021</td>\n", | |
" <td>125.27</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>Michael</td>\n", | |
" <td>Dentist</td>\n", | |
" <td>2021-12-21</td>\n", | |
" <td>2021</td>\n", | |
" <td>122.45</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>Michael</td>\n", | |
" <td>Physiotherapy</td>\n", | |
" <td>2021-10-09</td>\n", | |
" <td>2021</td>\n", | |
" <td>132.82</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" person claim_type date year amount\n", | |
"0 John Dentist 2021-04-08 2021 129.66\n", | |
"1 Jane Dentist 2021-09-03 2021 127.07\n", | |
"2 Jane Physiotherapy 2021-02-07 2021 125.27\n", | |
"3 Michael Dentist 2021-12-21 2021 122.45\n", | |
"4 Michael Physiotherapy 2021-10-09 2021 132.82" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import pandas as pd\n", | |
"import random\n", | |
"\n", | |
"random.seed(42)\n", | |
"\n", | |
"# Function to generate a random cost based on the claim type and year\n", | |
"def generate_claim_cost(claim_type, year):\n", | |
" if claim_type == 'Dentist':\n", | |
" base_cost = 100\n", | |
" elif claim_type == 'Psychiatrist':\n", | |
" base_cost = 150\n", | |
" elif claim_type == 'General Physician':\n", | |
" base_cost = 80\n", | |
" elif claim_type == 'Physiotherapy':\n", | |
" base_cost = 120\n", | |
" else:\n", | |
" base_cost = 50\n", | |
"\n", | |
" # Adjust cost based on year\n", | |
" if year == 2021:\n", | |
" base_cost *= 1.2\n", | |
" elif year == 2023:\n", | |
" base_cost *= 1.5\n", | |
"\n", | |
" # Add some random variation\n", | |
" cost = random.uniform(base_cost - 20, base_cost + 20)\n", | |
" return round(cost, 2)\n", | |
"\n", | |
"# Generating sample data\n", | |
"claim_types = ['Dentist', 'Psychiatrist', 'General Physician', 'Physiotherapy']\n", | |
"years = [2021, 2022, 2023]\n", | |
"people = ['John', 'Jane', 'Michael', 'Emily', 'William', 'Emma', 'Daniel', 'Olivia', 'Lucas', 'Ava']\n", | |
"\n", | |
"data = []\n", | |
"for year in years:\n", | |
" for person in people:\n", | |
" num_claims = random.randint(1, 5) # Random number of claims per person per year\n", | |
" for _ in range(num_claims):\n", | |
" claim_type = random.choice(claim_types)\n", | |
" cost = generate_claim_cost(claim_type, year)\n", | |
" date = pd.to_datetime(f\"{random.randint(1, 12)}/{random.randint(1, 28)}/{year}\", format='%m/%d/%Y')\n", | |
" data.append([person, claim_type, date, year, cost])\n", | |
"\n", | |
"# Create the DataFrame\n", | |
"columns = ['person', 'claim_type', 'date', 'year', 'amount']\n", | |
"claims_df = pd.DataFrame(data, columns=columns)\n", | |
"\n", | |
"# Display the DataFrame\n", | |
"claims_df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"80" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"len(claims_df)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"| person | claim_type | date | year | amount |\n", | |
"|:---------|:--------------|:--------------------|-------:|---------:|\n", | |
"| John | Dentist | 2021-04-08 00:00:00 | 2021 | 129.66 |\n", | |
"| Jane | Dentist | 2021-09-03 00:00:00 | 2021 | 127.07 |\n", | |
"| Jane | Physiotherapy | 2021-02-07 00:00:00 | 2021 | 125.27 |\n", | |
"| Michael | Dentist | 2021-12-21 00:00:00 | 2021 | 122.45 |\n", | |
"| Michael | Physiotherapy | 2021-10-09 00:00:00 | 2021 | 132.82 |\n" | |
] | |
} | |
], | |
"source": [ | |
"print(claims_df.head().to_markdown(index=False))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Sums" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"- Total cost\n", | |
"- Total footprint" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"| year | sum | diff |\n", | |
"|-------:|--------:|--------:|\n", | |
"| 2021 | 3814.54 | nan |\n", | |
"| 2022 | 2890.29 | -924.25 |\n", | |
"| 2023 | 4178.03 | 1287.74 |\n" | |
] | |
} | |
], | |
"source": [ | |
"sums = claims_df.groupby('year')['amount'].sum()\n", | |
"sums = pd.DataFrame({'sum': sums, 'diff': sums - sums.shift()})\n", | |
"print(sums.to_markdown())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"| year | claim_type | sum | diff |\n", | |
"|-------:|:------------------|--------:|--------:|\n", | |
"| 2021 | Dentist | 1104.42 | nan |\n", | |
"| 2021 | General Physician | 594.44 | nan |\n", | |
"| 2021 | Physiotherapy | 801.78 | nan |\n", | |
"| 2021 | Psychiatrist | 1313.9 | nan |\n", | |
"| 2022 | Dentist | 622.48 | -481.94 |\n", | |
"| 2022 | General Physician | 749.08 | 154.64 |\n", | |
"| 2022 | Physiotherapy | 339.45 | -462.33 |\n", | |
"| 2022 | Psychiatrist | 1179.28 | -134.62 |\n", | |
"| 2023 | Dentist | 1440.99 | 818.51 |\n", | |
"| 2023 | General Physician | 826.18 | 77.1 |\n", | |
"| 2023 | Physiotherapy | 1049.15 | 709.7 |\n", | |
"| 2023 | Psychiatrist | 861.71 | -317.57 |\n" | |
] | |
} | |
], | |
"source": [ | |
"print(pd.DataFrame({\n", | |
" 'sum': (s := claims_df.groupby(['year', 'claim_type'])['amount'].sum()),\n", | |
" 'diff': (\n", | |
" s - s.groupby('claim_type').shift()\n", | |
" )\n", | |
"}).reset_index().to_markdown(index=False))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>year</th>\n", | |
" <th>claim_type</th>\n", | |
" <th>mean</th>\n", | |
" <th>count</th>\n", | |
" <th>sum</th>\n", | |
" <th>mean_lag</th>\n", | |
" <th>count_lag</th>\n", | |
" <th>inner</th>\n", | |
" <th>mix</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>2021</td>\n", | |
" <td>Dentist</td>\n", | |
" <td>122.713333</td>\n", | |
" <td>9</td>\n", | |
" <td>1104.42</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2021</td>\n", | |
" <td>General Physician</td>\n", | |
" <td>99.073333</td>\n", | |
" <td>6</td>\n", | |
" <td>594.44</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>2021</td>\n", | |
" <td>Physiotherapy</td>\n", | |
" <td>133.630000</td>\n", | |
" <td>6</td>\n", | |
" <td>801.78</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>2021</td>\n", | |
" <td>Psychiatrist</td>\n", | |
" <td>187.700000</td>\n", | |
" <td>7</td>\n", | |
" <td>1313.90</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2022</td>\n", | |
" <td>Dentist</td>\n", | |
" <td>103.746667</td>\n", | |
" <td>6</td>\n", | |
" <td>622.48</td>\n", | |
" <td>122.713333</td>\n", | |
" <td>9.0</td>\n", | |
" <td>-170.700000</td>\n", | |
" <td>-311.240000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>2022</td>\n", | |
" <td>General Physician</td>\n", | |
" <td>83.231111</td>\n", | |
" <td>9</td>\n", | |
" <td>749.08</td>\n", | |
" <td>99.073333</td>\n", | |
" <td>6.0</td>\n", | |
" <td>-95.053333</td>\n", | |
" <td>249.693333</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>2022</td>\n", | |
" <td>Physiotherapy</td>\n", | |
" <td>113.150000</td>\n", | |
" <td>3</td>\n", | |
" <td>339.45</td>\n", | |
" <td>133.630000</td>\n", | |
" <td>6.0</td>\n", | |
" <td>-122.880000</td>\n", | |
" <td>-339.450000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>2022</td>\n", | |
" <td>Psychiatrist</td>\n", | |
" <td>147.410000</td>\n", | |
" <td>8</td>\n", | |
" <td>1179.28</td>\n", | |
" <td>187.700000</td>\n", | |
" <td>7.0</td>\n", | |
" <td>-282.030000</td>\n", | |
" <td>147.410000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>2023</td>\n", | |
" <td>Dentist</td>\n", | |
" <td>160.110000</td>\n", | |
" <td>9</td>\n", | |
" <td>1440.99</td>\n", | |
" <td>103.746667</td>\n", | |
" <td>6.0</td>\n", | |
" <td>338.180000</td>\n", | |
" <td>480.330000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>2023</td>\n", | |
" <td>General Physician</td>\n", | |
" <td>118.025714</td>\n", | |
" <td>7</td>\n", | |
" <td>826.18</td>\n", | |
" <td>83.231111</td>\n", | |
" <td>9.0</td>\n", | |
" <td>313.151429</td>\n", | |
" <td>-236.051429</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>2023</td>\n", | |
" <td>Physiotherapy</td>\n", | |
" <td>174.858333</td>\n", | |
" <td>6</td>\n", | |
" <td>1049.15</td>\n", | |
" <td>113.150000</td>\n", | |
" <td>3.0</td>\n", | |
" <td>185.125000</td>\n", | |
" <td>524.575000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>2023</td>\n", | |
" <td>Psychiatrist</td>\n", | |
" <td>215.427500</td>\n", | |
" <td>4</td>\n", | |
" <td>861.71</td>\n", | |
" <td>147.410000</td>\n", | |
" <td>8.0</td>\n", | |
" <td>544.140000</td>\n", | |
" <td>-861.710000</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" year claim_type mean count sum mean_lag \\\n", | |
"0 2021 Dentist 122.713333 9 1104.42 NaN \n", | |
"1 2021 General Physician 99.073333 6 594.44 NaN \n", | |
"2 2021 Physiotherapy 133.630000 6 801.78 NaN \n", | |
"3 2021 Psychiatrist 187.700000 7 1313.90 NaN \n", | |
"4 2022 Dentist 103.746667 6 622.48 122.713333 \n", | |
"5 2022 General Physician 83.231111 9 749.08 99.073333 \n", | |
"6 2022 Physiotherapy 113.150000 3 339.45 133.630000 \n", | |
"7 2022 Psychiatrist 147.410000 8 1179.28 187.700000 \n", | |
"8 2023 Dentist 160.110000 9 1440.99 103.746667 \n", | |
"9 2023 General Physician 118.025714 7 826.18 83.231111 \n", | |
"10 2023 Physiotherapy 174.858333 6 1049.15 113.150000 \n", | |
"11 2023 Psychiatrist 215.427500 4 861.71 147.410000 \n", | |
"\n", | |
" count_lag inner mix \n", | |
"0 NaN NaN NaN \n", | |
"1 NaN NaN NaN \n", | |
"2 NaN NaN NaN \n", | |
"3 NaN NaN NaN \n", | |
"4 9.0 -170.700000 -311.240000 \n", | |
"5 6.0 -95.053333 249.693333 \n", | |
"6 6.0 -122.880000 -339.450000 \n", | |
"7 7.0 -282.030000 147.410000 \n", | |
"8 6.0 338.180000 480.330000 \n", | |
"9 9.0 313.151429 -236.051429 \n", | |
"10 3.0 185.125000 524.575000 \n", | |
"11 8.0 544.140000 -861.710000 " | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"metric = 'amount'\n", | |
"period = 'year'\n", | |
"dimension = 'claim_type'\n", | |
"\n", | |
"totals = (\n", | |
" claims_df\n", | |
" .groupby([period, dimension])\n", | |
" [metric]\n", | |
" .agg(['mean', 'count', 'sum'])\n", | |
" .reset_index()\n", | |
" .sort_values(period)\n", | |
")\n", | |
"\n", | |
"totals['mean_lag'] = totals.groupby(dimension)['mean'].shift(1)\n", | |
"totals['count_lag'] = totals.groupby(dimension)['count'].shift(1)\n", | |
"totals['inner'] = totals.eval('(mean - mean_lag) * count_lag')\n", | |
"totals['mix'] = totals.eval('(count - count_lag) * mean')\n", | |
"totals" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"| year | 0 |\n", | |
"|-------:|--------:|\n", | |
"| 2021 | 0 |\n", | |
"| 2022 | -924.25 |\n", | |
"| 2023 | 1287.74 |\n" | |
] | |
} | |
], | |
"source": [ | |
"print(totals.groupby('year').apply(lambda x: (x.inner + x.mix).sum()).to_markdown())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"| person | claim_type | date | year | amount | status |\n", | |
"|:---------|:-------------|:--------------------|-------:|---------:|:---------|\n", | |
"| Jane | Dentist | 2023-03-26 00:00:00 | 2023 | 136.11 | NEW |\n", | |
"| Megan | Dentist | 2023-06-01 00:00:00 | 2023 | 138.99 | EXISTING |\n", | |
"| John | Psychiatrist | 2021-10-09 00:00:00 | 2021 | 168.82 | NEW |\n", | |
"| Emily | Psychiatrist | 2024-10-16 00:00:00 | 2024 | 132.29 | EXISTING |\n", | |
"| Michael | Dentist | 2023-10-15 00:00:00 | 2023 | 145.39 | NEW |\n" | |
] | |
} | |
], | |
"source": [ | |
"import collections\n", | |
"import random\n", | |
"import names # This library generates human names\n", | |
"import pandas as pd\n", | |
"\n", | |
"random.seed(42)\n", | |
"\n", | |
"# Function to generate a random cost based on the claim type and year\n", | |
"def generate_claim_cost(claim_type, year):\n", | |
" if claim_type == 'Dentist':\n", | |
" base_cost = 100\n", | |
" elif claim_type == 'Psychiatrist':\n", | |
" base_cost = 150\n", | |
"\n", | |
" # Adjust cost based on year\n", | |
" if year == 2021:\n", | |
" base_cost *= 1.2\n", | |
" elif year == 2023:\n", | |
" base_cost *= 1.5\n", | |
"\n", | |
" # Add some random variation\n", | |
" cost = random.uniform(base_cost - 20, base_cost + 20)\n", | |
" return round(cost, 2)\n", | |
"\n", | |
"# Generating sample data\n", | |
"claim_types = ['Dentist', 'Psychiatrist']\n", | |
"years = [2021, 2022, 2023, 2024]\n", | |
"people = ['John', 'Jane', 'Michael', 'Emily', 'William']\n", | |
"\n", | |
"data = []\n", | |
"for year in years:\n", | |
" new_people = (\n", | |
" [names.get_first_name() for _ in range(random.randint(1, 3))]\n", | |
" if year > 2021\n", | |
" else []\n", | |
" )\n", | |
" existing_people = [person for person in people if random.random() > 0.3]\n", | |
" people_this_year = existing_people + new_people\n", | |
" people.extend(new_people)\n", | |
"\n", | |
" for person in people_this_year:\n", | |
" num_claims = random.randint(1, 5) # Random number of claims per existing customer per year\n", | |
" for _ in range(num_claims):\n", | |
" claim_type = random.choice(claim_types)\n", | |
" cost = generate_claim_cost(claim_type, year)\n", | |
" date = pd.to_datetime(f\"{random.randint(1, 12)}/{random.randint(1, 28)}/{year}\", format='%m/%d/%Y')\n", | |
" data.append([person, claim_type, date, year, cost])\n", | |
"\n", | |
"# Create the DataFrame\n", | |
"columns = ['person', 'claim_type', 'date', 'year', 'amount']\n", | |
"claims_df = pd.DataFrame(data, columns=columns)\n", | |
"\n", | |
"# Indicate whether people are existing, new, or returning\n", | |
"years_seen = collections.defaultdict(set)\n", | |
"statuses = []\n", | |
"for claim in claims_df.to_dict(orient='records'):\n", | |
" years_seen[claim['person']].add(claim['year'])\n", | |
" if claim['year'] - 1 in years_seen[claim['person']]:\n", | |
" statuses.append('EXISTING')\n", | |
" elif any(year < claim['year'] for year in years_seen[claim['person']]):\n", | |
" statuses.append('RETURNING')\n", | |
" elif not {year for year in years_seen[claim['person']] if year != claim['year']}:\n", | |
" statuses.append('NEW')\n", | |
"\n", | |
"claims_df['status'] = statuses\n", | |
"\n", | |
"print(claims_df.sample(5).to_markdown(index=False))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"| year | 0 |\n", | |
"|-------:|--------:|\n", | |
"| 2021 | 0 |\n", | |
"| 2022 | -924.25 |\n", | |
"| 2023 | 1287.74 |\n" | |
] | |
} | |
], | |
"source": [ | |
"print(totals.groupby('year').apply(lambda x: (x.inner + x.mix).sum()).to_markdown())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"| year | sum | diff |\n", | |
"|-------:|--------:|---------:|\n", | |
"| 2021 | 1312.28 | nan |\n", | |
"| 2022 | 676.06 | -636.22 |\n", | |
"| 2023 | 5191.31 | 4515.25 |\n", | |
"| 2024 | 1966.73 | -3224.58 |\n" | |
] | |
} | |
], | |
"source": [ | |
"sums = claims_df.groupby('year')['amount'].sum()\n", | |
"sums = pd.DataFrame({'sum': sums, 'diff': sums - sums.shift()})\n", | |
"print(sums.to_markdown())" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Ratios\n", | |
"\n", | |
"- Cost by claim\n", | |
"- Cost by user\n", | |
"- Footprint by product\n", | |
"- Footprint by gram" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"| year | average | diff |\n", | |
"|-------:|----------:|---------:|\n", | |
"| 2021 | 145.809 | nan |\n", | |
"| 2022 | 112.677 | -33.1322 |\n", | |
"| 2023 | 173.044 | 60.367 |\n", | |
"| 2024 | 122.921 | -50.123 |\n" | |
] | |
} | |
], | |
"source": [ | |
"averages = claims_df.groupby('year')['amount'].mean()\n", | |
"averages = pd.DataFrame({'average': averages, 'diff': averages - averages.shift()})\n", | |
"print(averages.to_markdown())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"| year | claim_type | sum | count |\n", | |
"|-------:|:-------------|--------:|--------:|\n", | |
"| 2021 | Dentist | 614.36 | 5 |\n", | |
"| 2021 | Psychiatrist | 697.92 | 4 |\n", | |
"| 2022 | Dentist | 393.5 | 4 |\n", | |
"| 2022 | Psychiatrist | 282.56 | 2 |\n", | |
"| 2023 | Dentist | 2967.3 | 20 |\n" | |
] | |
} | |
], | |
"source": [ | |
"metric = 'amount'\n", | |
"period = 'year'\n", | |
"dimension = 'claim_type'\n", | |
"\n", | |
"decomp = (\n", | |
" claims_df\n", | |
" .groupby([period, dimension], dropna=True)\n", | |
" [metric].agg(['sum', 'count'])\n", | |
" .reset_index()\n", | |
" .sort_values(period)\n", | |
")\n", | |
"print(decomp.head().to_markdown(index=False))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>year</th>\n", | |
" <th>claim_type</th>\n", | |
" <th>sum</th>\n", | |
" <th>count</th>\n", | |
" <th>mean</th>\n", | |
" <th>share</th>\n", | |
" <th>global_mean</th>\n", | |
" <th>mean_lag</th>\n", | |
" <th>share_lag</th>\n", | |
" <th>global_mean_lag</th>\n", | |
" <th>inner</th>\n", | |
" <th>mix</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>2021</td>\n", | |
" <td>Dentist</td>\n", | |
" <td>614.36</td>\n", | |
" <td>5</td>\n", | |
" <td>122.87200</td>\n", | |
" <td>0.555556</td>\n", | |
" <td>145.808889</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2021</td>\n", | |
" <td>Psychiatrist</td>\n", | |
" <td>697.92</td>\n", | |
" <td>4</td>\n", | |
" <td>174.48000</td>\n", | |
" <td>0.444444</td>\n", | |
" <td>145.808889</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>2022</td>\n", | |
" <td>Dentist</td>\n", | |
" <td>393.50</td>\n", | |
" <td>4</td>\n", | |
" <td>98.37500</td>\n", | |
" <td>0.666667</td>\n", | |
" <td>112.676667</td>\n", | |
" <td>122.872</td>\n", | |
" <td>0.555556</td>\n", | |
" <td>145.808889</td>\n", | |
" <td>-16.331333</td>\n", | |
" <td>-2.548543</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>2022</td>\n", | |
" <td>Psychiatrist</td>\n", | |
" <td>282.56</td>\n", | |
" <td>2</td>\n", | |
" <td>141.28000</td>\n", | |
" <td>0.333333</td>\n", | |
" <td>112.676667</td>\n", | |
" <td>174.480</td>\n", | |
" <td>0.444444</td>\n", | |
" <td>145.808889</td>\n", | |
" <td>-11.066667</td>\n", | |
" <td>-3.185679</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2023</td>\n", | |
" <td>Dentist</td>\n", | |
" <td>2967.30</td>\n", | |
" <td>20</td>\n", | |
" <td>148.36500</td>\n", | |
" <td>0.666667</td>\n", | |
" <td>173.043667</td>\n", | |
" <td>98.375</td>\n", | |
" <td>0.666667</td>\n", | |
" <td>112.676667</td>\n", | |
" <td>33.326667</td>\n", | |
" <td>-0.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>2023</td>\n", | |
" <td>Psychiatrist</td>\n", | |
" <td>2224.01</td>\n", | |
" <td>10</td>\n", | |
" <td>222.40100</td>\n", | |
" <td>0.333333</td>\n", | |
" <td>173.043667</td>\n", | |
" <td>141.280</td>\n", | |
" <td>0.333333</td>\n", | |
" <td>112.676667</td>\n", | |
" <td>27.040333</td>\n", | |
" <td>0.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>2024</td>\n", | |
" <td>Dentist</td>\n", | |
" <td>781.46</td>\n", | |
" <td>8</td>\n", | |
" <td>97.68250</td>\n", | |
" <td>0.500000</td>\n", | |
" <td>122.920625</td>\n", | |
" <td>148.365</td>\n", | |
" <td>0.666667</td>\n", | |
" <td>173.043667</td>\n", | |
" <td>-25.341250</td>\n", | |
" <td>4.113111</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>2024</td>\n", | |
" <td>Psychiatrist</td>\n", | |
" <td>1185.27</td>\n", | |
" <td>8</td>\n", | |
" <td>148.15875</td>\n", | |
" <td>0.500000</td>\n", | |
" <td>122.920625</td>\n", | |
" <td>222.401</td>\n", | |
" <td>0.333333</td>\n", | |
" <td>173.043667</td>\n", | |
" <td>-37.121125</td>\n", | |
" <td>8.226222</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" year claim_type sum count mean share global_mean \\\n", | |
"0 2021 Dentist 614.36 5 122.87200 0.555556 145.808889 \n", | |
"1 2021 Psychiatrist 697.92 4 174.48000 0.444444 145.808889 \n", | |
"2 2022 Dentist 393.50 4 98.37500 0.666667 112.676667 \n", | |
"3 2022 Psychiatrist 282.56 2 141.28000 0.333333 112.676667 \n", | |
"4 2023 Dentist 2967.30 20 148.36500 0.666667 173.043667 \n", | |
"5 2023 Psychiatrist 2224.01 10 222.40100 0.333333 173.043667 \n", | |
"6 2024 Dentist 781.46 8 97.68250 0.500000 122.920625 \n", | |
"7 2024 Psychiatrist 1185.27 8 148.15875 0.500000 122.920625 \n", | |
"\n", | |
" mean_lag share_lag global_mean_lag inner mix \n", | |
"0 NaN NaN NaN NaN NaN \n", | |
"1 NaN NaN NaN NaN NaN \n", | |
"2 122.872 0.555556 145.808889 -16.331333 -2.548543 \n", | |
"3 174.480 0.444444 145.808889 -11.066667 -3.185679 \n", | |
"4 98.375 0.666667 112.676667 33.326667 -0.000000 \n", | |
"5 141.280 0.333333 112.676667 27.040333 0.000000 \n", | |
"6 148.365 0.666667 173.043667 -25.341250 4.113111 \n", | |
"7 222.401 0.333333 173.043667 -37.121125 8.226222 " | |
] | |
}, | |
"execution_count": 28, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"decomp['mean'] = decomp.eval('sum / count')\n", | |
"decomp['share'] = decomp['count'] / decomp.groupby('year')['count'].transform('sum')\n", | |
"decomp['global_mean'] = (\n", | |
" decomp.groupby('year')['sum'].transform('sum') /\n", | |
" decomp.groupby('year')['count'].transform('sum')\n", | |
")\n", | |
"decomp['mean_lag'] = decomp.groupby(dimension)['mean'].shift(1)\n", | |
"decomp['share_lag'] = decomp.groupby(dimension)['share'].shift(1)\n", | |
"decomp['global_mean_lag'] = decomp.groupby(dimension)['global_mean'].shift(1)\n", | |
"decomp['inner'] = decomp.eval('share * (mean - mean_lag)')\n", | |
"decomp['mix'] = decomp.eval('(share - share_lag) * (mean_lag - global_mean_lag)')\n", | |
"decomp" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"| year | 0 |\n", | |
"|-------:|---------:|\n", | |
"| 2021 | 0 |\n", | |
"| 2022 | -33.1322 |\n", | |
"| 2023 | 60.367 |\n", | |
"| 2024 | -50.123 |\n" | |
] | |
} | |
], | |
"source": [ | |
"print(decomp.groupby('year').apply(lambda x: (x.inner + x.mix).sum()).to_markdown())" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### DuckDB" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 94, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"ename": "CatalogException", | |
"evalue": "Catalog Error: Table with name \"claims\" already exists!", | |
"output_type": "error", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[0;31mCatalogException\u001b[0m Traceback (most recent call last)", | |
"\u001b[1;32m/Users/max/projects/maxhalford.github.io/decomp.ipynb Cell 15\u001b[0m in \u001b[0;36m3\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/max/projects/maxhalford.github.io/decomp.ipynb#X40sZmlsZQ%3D%3D?line=0'>1</a>\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mduckdb\u001b[39;00m\n\u001b[0;32m----> <a href='vscode-notebook-cell:/Users/max/projects/maxhalford.github.io/decomp.ipynb#X40sZmlsZQ%3D%3D?line=2'>3</a>\u001b[0m duckdb\u001b[39m.\u001b[39;49msql(\u001b[39m\"\u001b[39;49m\u001b[39mCREATE TABLE claims AS SELECT * FROM claims_df\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n", | |
"\u001b[0;31mCatalogException\u001b[0m: Catalog Error: Table with name \"claims\" already exists!" | |
] | |
} | |
], | |
"source": [ | |
"import duckdb\n", | |
"\n", | |
"duckdb.sql(\"CREATE TABLE claims AS SELECT * FROM claims_df\")" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.11.0" | |
}, | |
"orig_nbformat": 4 | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment