philip-bl · April 20, 2017 16:51
diff --git a/pandas_suck_no_create_index.ipynb b/pandas_suck_no_create_index.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from random import randint"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "NUM_RECORDS = 100000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>str</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>int</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>a0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>a1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>a2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>a3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>a4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    str\n",
       "int    \n",
       "0    a0\n",
       "1    a1\n",
       "2    a2\n",
       "3    a3\n",
       "4    a4"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "str_by_int = pd.DataFrame.from_records(\n",
    "    ({\"int\": i, \"str\": \"a\" + str(i)} for i in range(NUM_RECORDS)),\n",
    "    index=\"int\"\n",
    ")\n",
    "str_by_int.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Incorrect way\n",
    "\n",
    "This is the way to select rows by a column value that is suggested on Stack Overflow as the\n",
    "answer to many questions, e.g. [1](https://stackoverflow.com/questions/17071871/select-rows-from-a-dataframe-based-on-values-in-a-column-in-pandas) [2](https://stackoverflow.com/questions/31756340/selecting-rows-from-a-dataframe-based-on-values-in-multiple-columns-in-pandas). It is slow because the boolean mask compares every row of the column on each lookup, and that is why it's bad."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_int_without_rev_lookup():\n",
    "    random_str = \"a\" + str(randint(0, NUM_RECORDS - 1))\n",
    "    matching_int = str_by_int.loc[str_by_int[\"str\"] == random_str]\n",
    "    return matching_int"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1000 loops, best of 1: 6.9 ms per loop\n"
     ]
    }
   ],
   "source": [
    "%timeit -n 1000 -r 1 get_int_without_rev_lookup()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Correct way\n",
    "\n",
    "This is the correct way. It is also the reason why pandas sucks compared to SQL.\n",
    "In SQL we can do this using ``CREATE INDEX``, without creating any additional variables or tables.\n",
    "In pandas we need to write this boilerplate code: build a second DataFrame that is indexed by the column we want to look up."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>int</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>str</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a0</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>a1</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>a2</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>a3</th>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>a4</th>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     int\n",
       "str     \n",
       "a0     0\n",
       "a1     1\n",
       "a2     2\n",
       "a3     3\n",
       "a4     4"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "int_by_str = str_by_int.reset_index().set_index(\"str\", verify_integrity=True)\n",
    "int_by_str.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_int_with_rev_lookup():\n",
    "    random_str = \"a\" + str(randint(0, NUM_RECORDS - 1))\n",
    "    matching_int = int_by_str.loc[random_str]\n",
    "    return matching_int"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1000 loops, best of 1: 92.3 µs per loop\n"
     ]
    }
   ],
   "source": [
    "%timeit -n 1000 -r 1 get_int_with_rev_lookup()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "6.9 milliseconds vs 92.3 microseconds. The correct way is about 75 times faster for a table with 100000 rows.\n",
    "And the difference will be even more significant for bigger tables, because the boolean-mask lookup has to compare every row."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"from random import randint"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"NUM_RECORDS = 100000"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>str</th>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>int</th>\n",
	" <th></th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>a0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>a1</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>a2</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>a3</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>a4</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" str\n",
	"int \n",
	"0 a0\n",
	"1 a1\n",
	"2 a2\n",
	"3 a3\n",
	"4 a4"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"str_by_int = pd.DataFrame.from_records(\n",
	" ({\"int\": i, \"str\": \"a\" + str(i)} for i in range(NUM_RECORDS)),\n",
	" index=\"int\"\n",
	")\n",
	"str_by_int.head()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Incorrect way\n",
	"\n",
	"This is the way to select rows by a column value that is suggested on Stack Overflow as the\n",
	"answer to many questions, e.g. [1](https://stackoverflow.com/questions/17071871/select-rows-from-a-dataframe-based-on-values-in-a-column-in-pandas) [2](https://stackoverflow.com/questions/31756340/selecting-rows-from-a-dataframe-based-on-values-in-multiple-columns-in-pandas). It is slow because the boolean mask compares every row of the column on each lookup, and that is why it's bad."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def get_int_without_rev_lookup():\n",
	" random_str = \"a\" + str(randint(0, NUM_RECORDS - 1))\n",
	" matching_int = str_by_int.loc[str_by_int[\"str\"] == random_str]\n",
	" return matching_int"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"scrolled": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"1000 loops, best of 1: 6.9 ms per loop\n"
	]
	}
	],
	"source": [
	"%timeit -n 1000 -r 1 get_int_without_rev_lookup()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Correct way\n",
	"\n",
	"This is the correct way. It is also the reason why pandas sucks compared to SQL.\n",
	"In SQL we can do this using ``CREATE INDEX``, without creating any additional variables or tables.\n",
	"In pandas we need to write this boilerplate code: build a second DataFrame that is indexed by the column we want to look up."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"scrolled": true
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>int</th>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>str</th>\n",
	" <th></th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>a0</th>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>a1</th>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>a2</th>\n",
	" <td>2</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>a3</th>\n",
	" <td>3</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>a4</th>\n",
	" <td>4</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" int\n",
	"str \n",
	"a0 0\n",
	"a1 1\n",
	"a2 2\n",
	"a3 3\n",
	"a4 4"
	]
	},
	"execution_count": 6,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"int_by_str = str_by_int.reset_index().set_index(\"str\", verify_integrity=True)\n",
	"int_by_str.head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def get_int_with_rev_lookup():\n",
	" random_str = \"a\" + str(randint(0, NUM_RECORDS - 1))\n",
	" matching_int = int_by_str.loc[random_str]\n",
	" return matching_int"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {
	"scrolled": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"1000 loops, best of 1: 92.3 µs per loop\n"
	]
	}
	],
	"source": [
	"%timeit -n 1000 -r 1 get_int_with_rev_lookup()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"6.9 milliseconds vs 92.3 microseconds. The correct way is about 75 times faster for a table with 100000 rows.\n",
	"And the difference will be even more significant for bigger tables, because the boolean-mask lookup has to compare every row."
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.0"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}