Created
February 22, 2023 08:46
-
-
Save AhmedAlhallag/d6ee8c6dfd782c0838174a17223d28aa to your computer and use it in GitHub Desktop.
Desktop/DataCleaningLab2_1.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "# KH4005CEM Lab 1.2: Detecting & Handling Missing Values " | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Note: This step assumes that you grabed the values (as an .xlsx file or as a .csv file) from somewhere\n# either from kaggle, or web scraped\n\n\n# importing Vlues dataset using pandas library \nimport pandas as pd \n\n# df = pd.read_csv(\"dataset.csv\")\ndf = pd.read_csv(\"dataset_InLab.csv\")\n", | |
"execution_count": 272, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "print(df)\ndf\n", | |
"execution_count": 270, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": " ID City Items Price Payment Tax Transaction\n0 1 Colombo 3 500.0 ? 0.25 yes\n1 2 Singapore 2 1500.0 0 0.31 yes\n2 3 ? 4 2000.0 1 1.20 yes\n3 4 Tokyo 4300 1300.0 # 0.80 no\n4 5 Melbourne 6 1.3 -- 0.50 no\n5 6 Singapore 4 1800.0 1 0.30 no\n6 7 ? 5 999999999.0 1 0.40 yes\n", | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "execute_result", | |
"execution_count": 270, | |
"data": { | |
"text/plain": " ID City Items Price Payment Tax Transaction\n0 1 Colombo 3 500.0 ? 0.25 yes\n1 2 Singapore 2 1500.0 0 0.31 yes\n2 3 ? 4 2000.0 1 1.20 yes\n3 4 Tokyo 4300 1300.0 # 0.80 no\n4 5 Melbourne 6 1.3 -- 0.50 no\n5 6 Singapore 4 1800.0 1 0.30 no\n6 7 ? 5 999999999.0 1 0.40 yes", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>ID</th>\n <th>City</th>\n <th>Items</th>\n <th>Price</th>\n <th>Payment</th>\n <th>Tax</th>\n <th>Transaction</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>Colombo</td>\n <td>3</td>\n <td>500.0</td>\n <td>?</td>\n <td>0.25</td>\n <td>yes</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2</td>\n <td>Singapore</td>\n <td>2</td>\n <td>1500.0</td>\n <td>0</td>\n <td>0.31</td>\n <td>yes</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3</td>\n <td>?</td>\n <td>4</td>\n <td>2000.0</td>\n <td>1</td>\n <td>1.20</td>\n <td>yes</td>\n </tr>\n <tr>\n <th>3</th>\n <td>4</td>\n <td>Tokyo</td>\n <td>4300</td>\n <td>1300.0</td>\n <td>#</td>\n <td>0.80</td>\n <td>no</td>\n </tr>\n <tr>\n <th>4</th>\n <td>5</td>\n <td>Melbourne</td>\n <td>6</td>\n <td>1.3</td>\n <td>--</td>\n <td>0.50</td>\n <td>no</td>\n </tr>\n <tr>\n <th>5</th>\n <td>6</td>\n <td>Singapore</td>\n <td>4</td>\n <td>1800.0</td>\n <td>1</td>\n <td>0.30</td>\n <td>no</td>\n </tr>\n <tr>\n <th>6</th>\n <td>7</td>\n <td>?</td>\n <td>5</td>\n <td>999999999.0</td>\n <td>1</td>\n <td>0.40</td>\n <td>yes</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "## Intro to Regular Expression\n- Check the regex quick guide on Notion Page\n- Replace any missing valus, such as: special symbols, empty strings, speacial words (NA, null, etc..) with a pythonic NaN data type\n- NaN (Not a Number) data types are defined in the numpy library\n- Converting missing values to NaN types will allow you to generate a summary of columns containing missing values" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "import numpy as np\n\n# replace empty strings and Question marks with a nan\ndf.replace(r\"\\?|^$\",\"\", regex=True)\n", | |
"execution_count": 273, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 273, | |
"data": { | |
"text/plain": " ID City Items Price Payment Tax Transaction\n0 1 Colombo 3 500.0 0.25 yes\n1 2 Singapore 2 1500.0 0 0.31 yes\n2 3 4 2000.0 1 1.20 yes\n3 4 Tokyo 4300 1300.0 # 0.80 no\n4 5 Melbourne 6 1.3 -- 0.50 no\n5 6 Singapore 4 1800.0 1 0.30 no\n6 7 5 999999999.0 1 0.40 yes", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>ID</th>\n <th>City</th>\n <th>Items</th>\n <th>Price</th>\n <th>Payment</th>\n <th>Tax</th>\n <th>Transaction</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>Colombo</td>\n <td>3</td>\n <td>500.0</td>\n <td></td>\n <td>0.25</td>\n <td>yes</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2</td>\n <td>Singapore</td>\n <td>2</td>\n <td>1500.0</td>\n <td>0</td>\n <td>0.31</td>\n <td>yes</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3</td>\n <td></td>\n <td>4</td>\n <td>2000.0</td>\n <td>1</td>\n <td>1.20</td>\n <td>yes</td>\n </tr>\n <tr>\n <th>3</th>\n <td>4</td>\n <td>Tokyo</td>\n <td>4300</td>\n <td>1300.0</td>\n <td>#</td>\n <td>0.80</td>\n <td>no</td>\n </tr>\n <tr>\n <th>4</th>\n <td>5</td>\n <td>Melbourne</td>\n <td>6</td>\n <td>1.3</td>\n <td>--</td>\n <td>0.50</td>\n <td>no</td>\n </tr>\n <tr>\n <th>5</th>\n <td>6</td>\n <td>Singapore</td>\n <td>4</td>\n <td>1800.0</td>\n <td>1</td>\n <td>0.30</td>\n <td>no</td>\n </tr>\n <tr>\n <th>6</th>\n <td>7</td>\n <td></td>\n <td>5</td>\n <td>999999999.0</td>\n <td>1</td>\n <td>0.40</td>\n <td>yes</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "### Generate a summary report of Missing Values" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df.isna().sum()\n", | |
"execution_count": 256, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 256, | |
"data": { | |
"text/plain": "ID 0\nCity 2\nItems 0\nPrice 0\nPayment 1\nTax 0\nTransaction 0\ndtype: int64" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "### Grab list of column names that contains missing values" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df.columns[df.isna().any()]\n# Convert it into a list\ndf.columns[df.isna().any()].to_list()\n", | |
"execution_count": 257, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 257, | |
"data": { | |
"text/plain": "['City', 'Payment']" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df", | |
"execution_count": 258, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 258, | |
"data": { | |
"text/plain": " ID City Items Price Payment Tax Transaction\n0 1 Colombo 3 500.0 NaN 0.25 yes\n1 2 Singapore 2 1500.0 0 0.31 yes\n2 3 NaN 4 2000.0 1 1.20 yes\n3 4 Tokyo 4300 1300.0 # 0.80 no\n4 5 Melbourne 6 1.3 -- 0.50 no\n5 6 Singapore 4 1800.0 1 0.30 no\n6 7 NaN 5 999999999.0 1 0.40 yes", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>ID</th>\n <th>City</th>\n <th>Items</th>\n <th>Price</th>\n <th>Payment</th>\n <th>Tax</th>\n <th>Transaction</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>Colombo</td>\n <td>3</td>\n <td>500.0</td>\n <td>NaN</td>\n <td>0.25</td>\n <td>yes</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2</td>\n <td>Singapore</td>\n <td>2</td>\n <td>1500.0</td>\n <td>0</td>\n <td>0.31</td>\n <td>yes</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3</td>\n <td>NaN</td>\n <td>4</td>\n <td>2000.0</td>\n <td>1</td>\n <td>1.20</td>\n <td>yes</td>\n </tr>\n <tr>\n <th>3</th>\n <td>4</td>\n <td>Tokyo</td>\n <td>4300</td>\n <td>1300.0</td>\n <td>#</td>\n <td>0.80</td>\n <td>no</td>\n </tr>\n <tr>\n <th>4</th>\n <td>5</td>\n <td>Melbourne</td>\n <td>6</td>\n <td>1.3</td>\n <td>--</td>\n <td>0.50</td>\n <td>no</td>\n </tr>\n <tr>\n <th>5</th>\n <td>6</td>\n <td>Singapore</td>\n <td>4</td>\n <td>1800.0</td>\n <td>1</td>\n <td>0.30</td>\n <td>no</td>\n </tr>\n <tr>\n <th>6</th>\n <td>7</td>\n <td>NaN</td>\n <td>5</td>\n <td>999999999.0</td>\n <td>1</td>\n <td>0.40</td>\n <td>yes</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "### Put all of the previous work to Action \n#### Create a function that replaces missing values & one that detects them" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "def detect_missing(df):\n df = convert_missing_to(df, np.nan)\n cols_with_missing = df.columns[df.isna().any()].tolist()\n if cols_with_missing:\n return True, cols_with_missing \n return False, None\n\ndef convert_missing_to(df, cri):\n return df.replace(r\"\\?|^$\",cri, regex=True)\n\n \n ", | |
"execution_count": 274, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Step 1: detect?\ndetect_missing(df)\n\n# Step 2: convert missing to NaN\ndf_withNan = convert_missing_to(df,np.nan)\n\n# Step 3: Generate summary \ndf_withNan.isna().sum()\n\n", | |
"execution_count": 275, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 275, | |
"data": { | |
"text/plain": "ID 0\nCity 2\nItems 0\nPrice 0\nPayment 1\nTax 0\nTransaction 0\ndtype: int64" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "## Handle Missing " | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "### To handle missing data, we can get the mode for categorical columns (\"City\") or the mean for numerical columns (Payment)\n- Note: there are other ways as well. We will focus on the statistical ones for this lab" | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "### Before we get the mode for the City column, we need to make sure missing values are removed (generating a temporary version of the dataframe to get the mode from)" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# convert missing to empty (empty strings are easier to exclude than the nans)\ndf_empty = convert_missing_to(df,\"\")", | |
"execution_count": 276, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df_empty", | |
"execution_count": 277, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 277, | |
"data": { | |
"text/plain": " ID City Items Price Payment Tax Transaction\n0 1 Colombo 3 500.0 0.25 yes\n1 2 Singapore 2 1500.0 0 0.31 yes\n2 3 4 2000.0 1 1.20 yes\n3 4 Tokyo 4300 1300.0 # 0.80 no\n4 5 Melbourne 6 1.3 -- 0.50 no\n5 6 Singapore 4 1800.0 1 0.30 no\n6 7 5 999999999.0 1 0.40 yes", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>ID</th>\n <th>City</th>\n <th>Items</th>\n <th>Price</th>\n <th>Payment</th>\n <th>Tax</th>\n <th>Transaction</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>Colombo</td>\n <td>3</td>\n <td>500.0</td>\n <td></td>\n <td>0.25</td>\n <td>yes</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2</td>\n <td>Singapore</td>\n <td>2</td>\n <td>1500.0</td>\n <td>0</td>\n <td>0.31</td>\n <td>yes</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3</td>\n <td></td>\n <td>4</td>\n <td>2000.0</td>\n <td>1</td>\n <td>1.20</td>\n <td>yes</td>\n </tr>\n <tr>\n <th>3</th>\n <td>4</td>\n <td>Tokyo</td>\n <td>4300</td>\n <td>1300.0</td>\n <td>#</td>\n <td>0.80</td>\n <td>no</td>\n </tr>\n <tr>\n <th>4</th>\n <td>5</td>\n <td>Melbourne</td>\n <td>6</td>\n <td>1.3</td>\n <td>--</td>\n <td>0.50</td>\n <td>no</td>\n </tr>\n <tr>\n <th>5</th>\n <td>6</td>\n <td>Singapore</td>\n <td>4</td>\n <td>1800.0</td>\n <td>1</td>\n <td>0.30</td>\n <td>no</td>\n </tr>\n <tr>\n <th>6</th>\n <td>7</td>\n <td></td>\n <td>5</td>\n <td>999999999.0</td>\n <td>1</td>\n <td>0.40</td>\n <td>yes</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# create a copy of the City column that do not contain missing values (delete missing values)\n\nCity_unique = []\nfor val in df_empty[\"City\"].to_list():\n if val:\n City_unique.append(val)", | |
"execution_count": 278, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "City_unique", | |
"execution_count": 279, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 279, | |
"data": { | |
"text/plain": "['Colombo', 'Singapore', 'Tokyo', 'Melbourne', 'Singapore']" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# import stats from scipy to get the mode method \nfrom scipy import stats", | |
"execution_count": 280, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "mode_city = stats.mode(City_unique)[0][0]\n", | |
"execution_count": 281, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# generate another final copy of the dataframe, and replace the missing values from the original table with an \n# alias that is unique to us \ndf_final = convert_missing_to(df,\"TO BE REPLACED\")\n\n# now replace that alias with the mode value\ndf_final['City'] = df_final['City'].apply( lambda x: x.replace(\"TO BE REPLACED\",mode_city))", | |
"execution_count": 282, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df_final", | |
"execution_count": 283, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 283, | |
"data": { | |
"text/plain": " ID City Items Price Payment Tax Transaction\n0 1 Colombo 3 500.0 TO BE REPLACED 0.25 yes\n1 2 Singapore 2 1500.0 0 0.31 yes\n2 3 Singapore 4 2000.0 1 1.20 yes\n3 4 Tokyo 4300 1300.0 # 0.80 no\n4 5 Melbourne 6 1.3 -- 0.50 no\n5 6 Singapore 4 1800.0 1 0.30 no\n6 7 Singapore 5 999999999.0 1 0.40 yes", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>ID</th>\n <th>City</th>\n <th>Items</th>\n <th>Price</th>\n <th>Payment</th>\n <th>Tax</th>\n <th>Transaction</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>Colombo</td>\n <td>3</td>\n <td>500.0</td>\n <td>TO BE REPLACED</td>\n <td>0.25</td>\n <td>yes</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2</td>\n <td>Singapore</td>\n <td>2</td>\n <td>1500.0</td>\n <td>0</td>\n <td>0.31</td>\n <td>yes</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3</td>\n <td>Singapore</td>\n <td>4</td>\n <td>2000.0</td>\n <td>1</td>\n <td>1.20</td>\n <td>yes</td>\n </tr>\n <tr>\n <th>3</th>\n <td>4</td>\n <td>Tokyo</td>\n <td>4300</td>\n <td>1300.0</td>\n <td>#</td>\n <td>0.80</td>\n <td>no</td>\n </tr>\n <tr>\n <th>4</th>\n <td>5</td>\n <td>Melbourne</td>\n <td>6</td>\n <td>1.3</td>\n <td>--</td>\n <td>0.50</td>\n <td>no</td>\n </tr>\n <tr>\n <th>5</th>\n <td>6</td>\n <td>Singapore</td>\n <td>4</td>\n <td>1800.0</td>\n <td>1</td>\n <td>0.30</td>\n <td>no</td>\n </tr>\n <tr>\n <th>6</th>\n <td>7</td>\n <td>Singapore</td>\n <td>5</td>\n <td>999999999.0</td>\n <td>1</td>\n <td>0.40</td>\n <td>yes</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "### Activity: Do the same steps for the 'Payment\" Column\n- Note: you would want to replace missing values (after aliasing them) with the \"mean\" this time! since these are numerical.\n- We can continue on the df_empty and operate on the payment column" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# solution\n# Get mean for list of payments without the missing values \n\nPayment_unique = []\n\nfor val in df_empty[\"Payment\"].to_list():\n if val:\n Payment_unique.append(val)\n \nprint(Payment_unique)\n", | |
"execution_count": 230, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "['0', '1', '0', '0', '1', '1']\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "Payment_unique = list(map(int, Payment_unique))\nprint(Payment_unique)", | |
"execution_count": 231, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "[0, 1, 0, 0, 1, 1]\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# solution\n\nmean_payment = np.mean(Payment_unique)\ndf_final = convert_missing_to(df_final,\"TO BE REPLACED\")\ndf_final['Payment'] = df_final['Payment'].apply( lambda x: x.replace(\"TO BE REPLACED\",str(mean_payment)))", | |
"execution_count": 234, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df_final", | |
"execution_count": 237, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 237, | |
"data": { | |
"text/plain": " ID City Items Price Payment Tax Transaction\n0 1 Colombo 3 500.0 0.5 0.25 yes\n1 2 Singapore 2 1500.0 0 0.31 yes\n2 3 Singapore 4 2000.0 1 1.20 yes\n3 4 Tokyo 4300 1300.0 0 0.80 no\n4 5 Melbourne 6 1.3 0 0.50 no\n5 6 Singapore 4 1800.0 1 0.30 no\n6 7 Singapore 5 700.0 1 0.40 yes", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>ID</th>\n <th>City</th>\n <th>Items</th>\n <th>Price</th>\n <th>Payment</th>\n <th>Tax</th>\n <th>Transaction</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>Colombo</td>\n <td>3</td>\n <td>500.0</td>\n <td>0.5</td>\n <td>0.25</td>\n <td>yes</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2</td>\n <td>Singapore</td>\n <td>2</td>\n <td>1500.0</td>\n <td>0</td>\n <td>0.31</td>\n <td>yes</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3</td>\n <td>Singapore</td>\n <td>4</td>\n <td>2000.0</td>\n <td>1</td>\n <td>1.20</td>\n <td>yes</td>\n </tr>\n <tr>\n <th>3</th>\n <td>4</td>\n <td>Tokyo</td>\n <td>4300</td>\n <td>1300.0</td>\n <td>0</td>\n <td>0.80</td>\n <td>no</td>\n </tr>\n <tr>\n <th>4</th>\n <td>5</td>\n <td>Melbourne</td>\n <td>6</td>\n <td>1.3</td>\n <td>0</td>\n <td>0.50</td>\n <td>no</td>\n </tr>\n <tr>\n <th>5</th>\n <td>6</td>\n <td>Singapore</td>\n <td>4</td>\n <td>1800.0</td>\n <td>1</td>\n <td>0.30</td>\n <td>no</td>\n </tr>\n <tr>\n <th>6</th>\n <td>7</td>\n <td>Singapore</td>\n <td>5</td>\n <td>700.0</td>\n <td>1</td>\n <td>0.40</td>\n <td>yes</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "### Detecting & Handling Noise/Outliers (next lab)" | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3", | |
"language": "python" | |
}, | |
"language_info": { | |
"name": "python", | |
"version": "3.8.8", | |
"mimetype": "text/x-python", | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"pygments_lexer": "ipython3", | |
"nbconvert_exporter": "python", | |
"file_extension": ".py" | |
}, | |
"gist": { | |
"id": "", | |
"data": { | |
"description": "Desktop/DataCleaningLab2_1.ipynb", | |
"public": true | |
} | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment