{
"cells": [
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn import datasets, cross_validation, linear_model, metrics\n",
"import pandas as pd\n",
"\n",
"%matplotlib inline\n",
"# Load in pre-packaged data from sklearn and convert to Pandas DF\n",
"iris = datasets.load_iris()\n",
"\n",
"# Convert feature data to DataFrame\n",
"irisdf = pd.DataFrame(iris.data, columns=iris.feature_names)\n",
"\n",
"# Convert outcome data to DataFrame\n",
"iris_outcome = pd.DataFrame(iris.target, columns=['species'])\n",
"\n",
"# We need to put the outcome and feature data together if we want to explore them\n",
"# They have matching indices, so let's join them by index\n",
"irisdf = irisdf.join(iris_outcome)\n",
"\n",
"# The column names are annoying, so let's change them\n",
"irisdf.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Create a Logistic Regression model to predict the species type\n",
"features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']\n",
"X = irisdf[irisdf.species.isin([0, 1, 2])][features]\n",
"y = irisdf[irisdf.species.isin([0, 1, 2])]['species']\n",
"X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=.5, random_state=5)\n",
"logit = linear_model.LogisticRegression(multi_class='ovr', class_weight={0:1, 1:1, 2:1}).fit(X_train, y_train)"
]
},
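{
"cell_type": "markdown",
"metadata": {},
"source": [
"*A quick aside: with `multi_class='ovr'`, sklearn fits one binary one-vs-rest classifier per species, which is also why it is natural to look at per-class confusion matrices later on. A minimal check of that structure (just illustrative, using the fitted `logit` above) is to look at the coefficient matrix: one row per class, one column per feature.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# One row of coefficients per one-vs-rest classifier (3 classes x 4 features)\n",
"print logit.classes_\n",
"print logit.coef_.shape"
]
},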
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"y_pred = logit.predict(X_test)\n",
"y_pred_proba = logit.predict_proba(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[22, 0, 0],\n",
" [ 0, 21, 8],\n",
" [ 0, 0, 24]])"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"metrics.confusion_matrix(y_test, y_pred)"
]
},
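{
"cell_type": "markdown",
"metadata": {},
"source": [
"*How to read the matrix above: rows are the actual species and columns are the predicted species, so the 8 means eight class-1 (versicolor) flowers were predicted as class 2 (virginica). As an optional readability helper (the `cm` variable here is just for illustration), the matrix can be wrapped in a labeled DataFrame.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Optional: label the confusion matrix rows (actual) and columns (predicted)\n",
"# with the species names carried on the original sklearn dataset object\n",
"cm = metrics.confusion_matrix(y_test, y_pred)\n",
"pd.DataFrame(cm, index=iris.target_names, columns=iris.target_names)"
]
},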
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[53, 0],\n",
" [ 0, 22]])"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"replacement = {0:1, 1:0, 2:0}\n",
"metrics.confusion_matrix(y_test.replace(replacement), \n",
" pd.Series(y_pred).replace(replacement))"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[46, 0],\n",
" [ 8, 21]])"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"replacement = {0:0, 1:1, 2:0}\n",
"metrics.confusion_matrix(y_test.replace(replacement), \n",
" pd.Series(y_pred).replace(replacement))"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[43, 8],\n",
" [ 0, 24]])"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"replacement = {0:0, 1:0, 2:1}\n",
"metrics.confusion_matrix(y_test.replace(replacement), \n",
" pd.Series(y_pred).replace(replacement))"
]
},
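{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Each 2x2 matrix above is just the one-vs-rest view of the full 3x3 matrix for a single class. As a quick sanity check (the `cm2`, `tn`, `fp`, `fn`, `tp` names are only for this illustration), the class-2 precision and recall reported further down can be derived by hand from the last matrix: precision = 24 / (24 + 8) = 0.75 and recall = 24 / (24 + 0) = 1.00.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Derive class-2 precision and recall from its one-vs-rest confusion matrix;\n",
"# rows are actual (not-2, 2) and columns are predicted (not-2, 2)\n",
"cm2 = metrics.confusion_matrix(y_test.replace({0:0, 1:0, 2:1}),\n",
"                               pd.Series(y_pred).replace({0:0, 1:0, 2:1}))\n",
"tn, fp, fn, tp = cm2.ravel()\n",
"print 'precision:', tp / float(tp + fp)\n",
"print 'recall:', tp / float(tp + fn)"
]
},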
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.893333333333\n"
]
}
],
"source": [
"print metrics.accuracy_score(y_test, y_pred)"
]
},
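{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Cross-check: accuracy is simply the diagonal of the confusion matrix divided by the number of test samples, (22 + 21 + 24) / 75 = 0.893... The `cm` variable below is just a local name for this check.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Accuracy = correctly classified counts (diagonal) over all test samples\n",
"cm = metrics.confusion_matrix(y_test, y_pred)\n",
"print cm.trace() / float(cm.sum())"
]
},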
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 0 1.00 1.00 1.00 22\n",
" 1 1.00 0.72 0.84 29\n",
" 2 0.75 1.00 0.86 24\n",
"\n",
"avg / total 0.92 0.89 0.89 75\n",
"\n"
]
}
],
"source": [
"# The simpler way, however, is to let sklearn summarize what we are interested in\n",
"print metrics.classification_report(y_test, y_pred)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**The precision on class 2 is weak while its recall is maxed out. Let's down-weight class 2.**"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Refit the Logistic Regression model, this time with a lower weight on class 2\n",
"features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']\n",
"X = irisdf[irisdf.species.isin([0, 1, 2])][features]\n",
"y = irisdf[irisdf.species.isin([0, 1, 2])]['species']\n",
"X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=.5, random_state=5)\n",
"logit = linear_model.LogisticRegression(multi_class='ovr', class_weight={0:1, 1:1, 2:.4}).fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"y_pred = logit.predict(X_test)\n",
"y_pred_proba = logit.predict_proba(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.946666666667\n"
]
}
],
"source": [
"print metrics.accuracy_score(y_test, y_pred)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 0 1.00 1.00 1.00 22\n",
" 1 0.96 0.90 0.93 29\n",
" 2 0.88 0.96 0.92 24\n",
"\n",
"avg / total 0.95 0.95 0.95 75\n",
"\n"
]
}
],
"source": [
"# Again, let sklearn summarize what we are interested in\n",
"print metrics.classification_report(y_test, y_pred)"
]
},
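{
"cell_type": "markdown",
"metadata": {},
"source": [
"*The 0.4 weight on class 2 was picked by hand. A rough sketch of picking it more systematically is to refit the model for a few arbitrary candidate weights and compare scores; ideally this would use a separate validation split or cross-validation rather than the test set, which is reused here purely for illustration.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Illustrative weight sweep for class 2 (candidate weights chosen arbitrarily;\n",
"# reuses the test split, so treat the numbers as a demo, not model selection)\n",
"for w in [1.0, 0.8, 0.6, 0.4, 0.2]:\n",
"    model = linear_model.LogisticRegression(multi_class='ovr',\n",
"                                            class_weight={0: 1, 1: 1, 2: w}).fit(X_train, y_train)\n",
"    print w, metrics.accuracy_score(y_test, model.predict(X_test))"
]
}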
],
"metadata": {
"kernelspec": {
"display_name": "Python [Root]",
"language": "python",
"name": "Python [Root]"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 0
}