{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Timing Experiment" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", "import matplotlib.colors as mcolors\n", "import matplotlib.pyplot as plt\n", "import warnings\n", "import detect_simpsons_paradox as dsp\n", "import sp_data_util as sp_dat\n", "import time" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We will draw samples from a number of clusters according to a Gaussian Mixture Model and add both continuous and categorical noise values.\n", "\n", "First we have to set up the number of samples, the number of clusters and the number of extra values, and fix the random seed so that the random draws below are repeatable.\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# seed NumPy's global random state so that the random draws below are repeatable\n", "np.random.seed(0)\n", "\n", "# set the data size\n", "N = int(10**5)\n", "# number of clusters in the mixture\n", "num_clusters = 32\n", "# number of extra continuous attributes and of extra categorical attributes (5 of each)\n", "numExtra = 5" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "First, we generate cluster means that roughly follow a positive trend, which will help us ensure that Simpson's Paradox (SP) occurs throughout the dataset." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXwAAAD8CAYAAAB0IB+mAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4wLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvFvnyVgAAFYdJREFUeJzt3X+sXGd95/H3t8Zkb6HtDcSbJjehDtR4lZLWpqPAKm03JW2dpIiYaIWSXbVpG9VFAhW6lSu7VIKtVkp2DaWtWqUyJSUUCKXEMRFNa0KCiraSU65xZJsEQ5ImxDcmvt1gQOUqdZzv/jHnwtjc6zvjOfPjzPN+SVd35pkz93kmx/nMzPc85zmRmUiSJt8PjHoAkqThMPAlqRAGviQVwsCXpEIY+JJUCANfkgph4EtSIQx8SSqEgS9JhXhRtxtGxO3AG4Fjmfmaqu09wG8C89Vmv5+Z91aPbQduBk4Cv52Ze1bq47zzzsu1a9f2Mn5JKt6+ffv+NTPXrLRd14EPfAj4M+DDp7W/PzPf29kQEZcCNwA/AVwIfDYiXp2ZJ8/Uwdq1a5mdne1hSJKkiHiym+26Lulk5ueBZ7vc/Drg45n5XGb+C/AocHm3fUmS6ldHDf/tEXEgIm6PiHOrthngqY5tjlRtkqQR6TfwbwNeBWwAjgLv6/UPRMSWiJiNiNn5+fmVnyBJOit9BX5mPpOZJzPzBeADfK9sMwdc3LHpRVXbUn9jZ2a2MrO1Zs2KxxwkSWepr8CPiAs67r4ZOFTdvge4ISLOiYhLgHXAP/fTlySpP71My7wTuBI4LyKOAO8GroyIDUACTwC/BZCZX4qITwAPA88Db1tpho5Uit3759ix5zBPH1/gwukptm5az+aNHuLS4MU4XfGq1Wql0zI1yXbvn2P7roMsnPje55+p1au45frLDH2dtYjYl5mtlbbzTFtpiHbsOXxK2AMsnDjJjj2HRzQilcTAl4bo6eMLPbVLdTLwpSG6cHqqp3apTga+NERbN61navWqU9qmVq9i66b1IxqRStLLWjqS+rR4YNZZOhoFA18ass0bZwx4jYQlHUkqhIEvSYUw8CWpEAa+JBXCwJekQhj4klQIA1+SCmHgS1IhDHxJKoSBL0mFMPAlqRAGviQVwsCXpEIY+JJUCJdHljRyu/fPeY2AIej6E35E3B4RxyLiUEfbjoj4ckQciIi7I2K6al8bEQsR8VD18xeDGLyk5tu9f47tuw4yd3yBBOaOL7B910F2758b9dAmTi8lnQ8BV5/Wdh/wmsz8SeArwPaOxx7LzA3Vz1v7G6akSbVjz2EWTpw8pW3hxEl27Dk8ohFNrq4DPzM/Dzx7WttnMvP56u5e4KIaxyapAE8fX+ipXWevzoO2vwH8fcf9SyJif0T8Y0T87HJPiogtETEbEbPz8/M1DkdSE1w4PdVTu85eLYEfEe8Cngc+WjUdBV6RmRuB/wF8LCJ+eKnnZubOzGxlZmvNmjV1DEdSg2zdtJ6p1atOaZtavYqtm9aPaESTq+9ZOhHxa8AbgasyMwEy8znguer2voh4DHg1MNtvf5Imy+JsHGfpDF5fgR8RVwO/B/yXzPxOR/sa4NnMPBkRrwTWAY/3NVJJE2vzxhkDfgi6DvyIuBO4EjgvIo4A76Y9K+cc4L6IANhbzcj5OeAPI+IE8ALw1sx8dsk/LEkaiq4DPzNvXKL5g8tsexdw19kOSpJUP5dWkKRCGPiSVAjX0pHGlOvLqG4GvjSGFteXWVxyYHF9GcDQ11mzpCONIdeX0SAY+NIYcn0ZDYKBL40h15fRIBj40hhyfRkNggdtpTHk+jIaBANfGlOuL6O6WdKRpEIY+JJUCANfkgph4EtSIQx8SSqEgS9JhTDwJakQBr4kFcITryRphIZ53QMDX5JGZNjXPeippBMRt0fEsYg41NH2soi4LyK+Wv0+t2qPiPjTiHg0Ig5ExGvrHrwkNdmwr3vQaw3
/Q8DVp7VtA+7PzHXA/dV9gGuAddXPFuC2sx+mJE2eYV/3oKfAz8zPA8+e1nwdcEd1+w5gc0f7h7NtLzAdERf0M1hJmiTDvu5BHbN0zs/Mo9XtrwPnV7dngKc6tjtStUmSGP51D2o9aJuZGRHZy3MiYgvtkg+veMUr6hyOJNVmELNphn3dgzoC/5mIuCAzj1Ylm2NV+xxwccd2F1Vtp8jMncBOgFar1dObhSQNwyBn0wzzugd1lHTuAW6qbt8EfKqj/Ver2TqvB77ZUfqRpMYY9myaQenpE35E3AlcCZwXEUeAdwO3Ap+IiJuBJ4G3VJvfC1wLPAp8B/j1msYsSUM17Nk0g9JT4Gfmjcs8dNUS2ybwtrMZlCSNkwunp5hbItwHNZtmUFxLR5JWMOzZNIPi0gqStIJhz6YZFANfkrowzNk0g2LgS8sY5iqG0jAY+NIShr2KoTQMBr60hDPNuzbwx4/fxrpj4EtLmJR51yXw21j3nJYpLWHYqxjq7E3KWbDDYOBLS5iUedcl8NtY9yzpSEuYlHnXdRvHWvmknAU7DAa+tIxJmHddp3GtlW/dtP6UcYHfxpZjSUdSV8a1Vr554wy3XH8ZM9NTBDAzPcUt11/mm/US/IQvqSvjXCv321h3/IQvqSvOXGo+A19SV5y51HyWdCR1xZlLzWfgS+qatfJms6QjSYUw8CWpEJZ0pB6N49mmUjcMfKkH43q2qdSNvks6EbE+Ih7q+PlWRLwzIt4TEXMd7dfWMWBplMb1bFOpG31/ws/Mw8AGgIhYBcwBdwO/Drw/M9/bbx/SuBjns02lldRd0rkKeCwzn4yImv+0NHqjWpnR4waqQ92zdG4A7uy4//aIOBARt0fEuTX3JQ3dKM42XTxuMHd8geR7xw12758bWJ+aTLUFfkS8GHgT8LdV023Aq2iXe44C71vmeVsiYjYiZufn5+sajjQQo1iZ0eMGqkudJZ1rgC9m5jMAi78BIuIDwKeXelJm7gR2ArRaraxxPNJADPtsU48bqC51lnRupKOcExEXdDz2ZuBQjX1JxXCVStWllsCPiJcAvwjs6mj+PxFxMCIOAD8P/E4dfUmlcZVK1aWWkk5m/hvw8tPafqWOvy2VzlUqVRfPtJUawFUqVQcXT5OkQhj4klQIA1+SCmHgS1IhDHxJKoSBL0mFcFqmpK65amezGfhSn0oJQa/21XyWdKQ+lLR0sat2Np+BL/WhpBB01c7ms6QjLaObUk1JITiqq32pPn7Cl5bQbammpKWLXbWz+Qx8aQndlmpKCsFRXO1L9bKkIy2h21JNaUsXu2pnsxn40hJ6qVcbgmoKSzrSEkoq1agcfsKXllBaqUZlMPClZViq0aSxpCNJhTDwJakQtZV0IuIJ4NvASeD5zGxFxMuAvwHWAk8Ab8nMb9TVpySpe3V/wv/5zNyQma3q/jbg/sxcB9xf3ZfUILv3z3HFrQ9wyba/44pbH5jIheFKMeiSznXAHdXtO4DNA+5PUo1KWg20BHUGfgKfiYh9EbGlajs/M49Wt78OnH/6kyJiS0TMRsTs/Px8jcOR1K+SVgMtQZ3TMn8mM+ci4j8C90XElzsfzMyMiDz9SZm5E9gJ0Gq1vu9xSaNT0mqgJajtE35mzlW/jwF3A5cDz0TEBQDV72N19Sdp8EpaDbQEtQR+RLwkIn5o8TbwS8Ah4B7gpmqzm4BP1dGftBwPMNbLJSYmS10lnfOBuyNi8W9+LDP/ISK+AHwiIm4GngTeUlN/0vfxmqv1c4mJyRKZ41M2b7VaOTs7O+phqKGuuPWBJVe4nJme4p+2vWEEI5KGIyL2dUyHX5Zn2mpieIBROjMXT1Mj/cHug9z54FOczGRVBDe+7mKvuSqtwE/4apw/2H2Qj+z9GiercuTJTD6y92usffmUBxilMzDw1Th3PvjUku17H/+G11yVzsCSjhrn5DITDU5muoa9dAZ+wlfjrGpP/+26XVKbga/GufF1F/fULqnNko4a539
tvgzg+2bpLLZLWponXklSw3V74pWf8CWpJrv3z431MhQGviTVoAlrOXnQVpJq0ISLxfgJv2Dj/vVTapImrOXkJ/xCea1SqV5NuFiMgV+oJnz9lJqkCReLsaRTqCZ8/ZSapAkXizHwC+VSwlL9xn0tJ0s6hWrC109J9fITfqGa8PVTUr0M/IKN+9dPSfXqu6QTERdHxOci4uGI+FJEvKNqf09EzEXEQ9XPtf0PV9Lu/XNccesDXLLt77ji1gecSquu1fEJ/3ngdzPzixHxQ8C+iLiveuz9mfneGvqQBqZJJ6A14fR9ja++P+Fn5tHM/GJ1+9vAI4D/8tQITTsBzfMn1I9aZ+lExFpgI/Bg1fT2iDgQEbdHxLl19iXVoWkB6vkT6kdtgR8RLwXuAt6Zmd8CbgNeBWwAjgLvW+Z5WyJiNiJm5+fn6xqO1JWmBWgTTt/X+Kol8CNiNe2w/2hm7gLIzGcy82RmvgB8ALh8qedm5s7MbGVma82aNXUMR3hgr1tNC1DPn1A/6pilE8AHgUcy84862i/o2OzNwKF++1J3mlaXHqWmBejmjTPccv1lzExPEcDM9BS3XH+ZB2zVlTpm6VwB/ApwMCIeqtp+H7gxIjYACTwB/FYNfakLZ6pLGwynauIJaJ4/obPVd+Bn5v8FYomH7u33b+vsNK0ufbphT5M0QFUK19KZQE2rS3eyHCUNjoE/gZpWl+7U7TRJD0pLvXMtnQnUxLr0om7KUZ5tKp0dA39CNbUu3c06/R6Uls6OJR2NlW7KUU0/KC2NioGvsdLNPPMmH5SWRsmSjsbOSuWorZvWn1LDh+YclB6WJq0AquEx8NU4TT4oPQwe1NZyDHw1UlMPSg+DB7W1HGv40oTxoLaWY+BLE8aD2lqOga/G8Oza7jT5TGsNljV8NYIHIrvnQW0tx8BXI3ggsjce1NZSLOmoETwQKfXPwFcjeCBS6l+RJZ1hn4V4pv48I7I7nl0r9a+4wB/2wb8z9QdM/IHIut7QPBAp9S8yc9Rj+K5Wq5Wzs7MD7eOKWx9Ycvndmekp/mnbG4baHzDUsQzb6W920P5U7kW3pXpFxL7MbK20XXE1/GEf/DtTf5N+ILLbq1dJGo6JKOn0Ujbo5gIbdVqpv2GOZdgm/Q1NapqBf8KPiKsj4nBEPBoR2+r++71e9HrYZyGeqb9JPyPSmTXSeBlo4EfEKuDPgWuAS4EbI+LSOvvotWzQzQU26nSm/oY9lmGb9Dc0qWkGXdK5HHg0Mx8HiIiPA9cBD9fVwdmUDYZ9FuKZ+pvkMyKdWSONl0EH/gzwVMf9I8Dr6uxg2DV59WaS39Ckphn5LJ2I2BIRsxExOz8/3/PzLRtIUncGHfhzwMUd9y+q2r4rM3dmZiszW2vWrOm5g0mvg0tSXQZd0vkCsC4iLqEd9DcA/63uTiwbSNLKBhr4mfl8RLwd2AOsAm7PzC8Nsk9J0tIGfuJVZt4L3DvofiRJZzbyg7aSpOEw8CWpEAa+JBXCwJekQhj4klQIA1+SCmHgS1IhDHxJKoSBL0mFMPAlqRAGviQVwsCXpEIY+JJUCANfkgph4EtSIQx8SSqEgS9JhTDwJakQA7/Eofq3e/8cO/Yc5unjC1w4PcXWTeu9aLuknhn4Y273/jm27zrIwomTAMwdX2D7roMAhr6knljSGXM79hz+btgvWjhxkh17Do9oRJKaqq/Aj4gdEfHliDgQEXdHxHTVvjYiFiLioernL+oZbnmePr7QU7skLaffT/j3Aa/JzJ8EvgJs73jssczcUP28tc9+inXh9FRP7ZK0nL4CPzM/k5nPV3f3Ahf1PyR12rppPVOrV53SNrV6FVs3rR/RiCQ1VZ01/N8A/r7j/iURsT8i/jEifna5J0XEloiYjYjZ+fn5GoczGTZvnOGW6y9jZnqKAGamp7jl+ss8YCupZ5GZZ94g4rPAjy7x0Lsy81PVNu8CWsD1mZkRcQ7w0sz8fxHx08Bu4Ccy81tn6qvVauX
s7OzZvA5JKlZE7MvM1krbrTgtMzN/YYWOfg14I3BVVu8emfkc8Fx1e19EPAa8GjDNJWlE+p2lczXwe8CbMvM7He1rImJVdfuVwDrg8X76kiT1p98Tr/4MOAe4LyIA9lYzcn4O+MOIOAG8ALw1M5/tsy9JUh/6CvzM/PFl2u8C7urnb0uS6uWZtpJUiBVn6QxTRMwDT/bwlPOAfx3QcMadr71Mpb72Ul83dPfafywz16z0h8Yq8HsVEbPdTEWaRL52X3tJSn3dUO9rt6QjSYUw8CWpEE0P/J2jHsAI+drLVOprL/V1Q42vvdE1fElS95r+CV+S1KXGBn5EXB0RhyPi0YjYNurxDFJEXBwRn4uIhyPiSxHxjqr9ZRFxX0R8tfp97qjHOggRsapaefXT1f1LIuLBat//TUS8eNRjHISImI6IT1YXGXokIv5zQfv8d6p/64ci4s6I+A+Tut8j4vaIOBYRhzraltzP0fan1X+DAxHx2l76amTgV+v0/DlwDXApcGNEXDraUQ3U88DvZualwOuBt1Wvdxtwf2auA+6v7k+idwCPdNz/38D7qzO9vwHcPJJRDd6fAP+Qmf8J+Cna/w0mfp9HxAzw20ArM18DrAJuYHL3+4eAq09rW24/X0N7bbJ1wBbgtl46amTgA5cDj2bm45n578DHgetGPKaBycyjmfnF6va3af+PP0P7Nd9RbXYHsHk0IxyciLgI+GXgL6v7AbwB+GS1yaS+7h+hvSbVBwEy898z8zgF7PPKi4CpiHgR8IPAUSZ0v2fm54HT1xpbbj9fB3w42/YC0xFxQbd9NTXwZ4CnOu4fqdomXkSsBTYCDwLnZ+bR6qGvA+ePaFiD9Me0V2R9obr/cuB4x5XWJnXfXwLMA39VlbP+MiJeQgH7PDPngPcCX6Md9N8E9lHGfl+03H7uK/uaGvhFioiX0l6U7p2nX0ymuhbBRE25iog3Ascyc9+oxzICLwJeC9yWmRuBf+O08s0k7nOAql59He03vQuBl/D9JY9i1Lmfmxr4c8DFHfcvqtomVkSsph32H83MXVXzM4tf56rfx0Y1vgG5AnhTRDxBu2z3Btp17enqqz5M7r4/AhzJzAer+5+k/QYw6fsc4BeAf8nM+cw8Aeyi/W+hhP2+aLn93Ff2NTXwvwCsq47av5j2AZ17Rjymganq1h8EHsnMP+p46B7gpur2TcCnhj22QcrM7Zl5UWaupb2PH8jM/w58Dviv1WYT97oBMvPrwFMRsXi1+quAh5nwfV75GvD6iPjB6t/+4muf+P3eYbn9fA/wq9VsndcD3+wo/awsMxv5A1wLfAV4jPb1dUc+pgG+1p+h/ZXuAPBQ9XMt7Xr2/cBXgc8CLxv1WAf43+BK4NPV7VcC/ww8CvwtcM6oxzeg17yB9mVBD9C+LvS5pexz4H8CXwYOAX9N+0JLE7nfgTtpH6s4Qfub3c3L7WcgaM9QfAw4SHsmU9d9eaatJBWiqSUdSVKPDHxJKoSBL0mFMPAlqRAGviQVwsCXpEIY+JJUCANfkgrx/wGgwfpwqLMOXgAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# the first two cluster means are fixed\n", "mu_fixed = np.asarray([[1,1],[5,5]])\n", "\n", "variance = 1000\n", "\n", "# generate the rest of the means in one vectorized draw: x is a random integer in [10, 99)\n", "# and y is normally distributed around x, so the means follow a positive trend\n", "num_random = max(num_clusters - 2, 0)\n", "mu_x = np.random.randint(10, 99, size=num_random)\n", "mu_y = np.random.normal(mu_x, np.sqrt(variance))\n", "mu = np.vstack([mu_fixed, np.column_stack([mu_x, mu_y])])\n", "\n", "plt.scatter(mu[:,0],mu[:,1])\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Next we use a function built into our package that takes the list of cluster means and a covariance matrix and generates the data set, together with the extra noise attributes." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# covariance of each cluster; it is passed to np.random.multivariate_normal, so it\n", "# must be symmetric and positive semi-definite. The negative off-diagonal gives a\n", "# negative trend inside each cluster, opposite to the positive trend of the means.\n", "cov = [[.6,-.5],[-.5,.6]]\n", "\n", "# call mixed_regression_sp_extra to generate the data set\n", "latent_df = sp_dat.mixed_regression_sp_extra(N,mu,cov, numExtra)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([3, 3, 3, 2, 2, 2, 4, 0, 1, 2, 1, 3, 0, 3, 4, 3, 4, 0, 3, 4])" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.random.choice(range(5),20,)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 26\n", "1 27\n", "2 0\n", "3 8\n", "4 8\n", "Name: cluster, dtype: int64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "latent_df['cluster'].head()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# scatter x1 against x2, coloured by the cluster label of each row\n", "plt.scatter(\n", "    x=latent_df['x1'],\n", "    y=latent_df['x2'],\n", "    c=latent_df['cluster'],\n", "    marker='o',\n", ")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(100000, 13)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# confirm the dimensions of the data: (rows, columns)\n", "latent_df.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Since we store the data in a pandas dataframe, we can easily sample a subset of the rows and we can check how that works:" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "10000\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
x1x2clustercon_0con_1con_2con_3con_4cat_0cat_1cat_2cat_3cat_4
4778740.2853558.279074442.821083-5.9459829.11658447.626833-7.2336637054279196
9721034.1705754.50437929-17.100323-29.930051-157.99335726.58118174.0245407264231360
4252119.381192-21.54557628-168.137523-76.675448146.536801168.084883178.4688239216668976
9645251.97514261.5562681135.396721109.82601098.528868-37.957128-179.428724584705471
4711542.53239440.353842691.51516099.488643-24.809279-95.285213145.3792525295827579
\n", "
" ], "text/plain": [ " x1 x2 cluster con_0 con_1 con_2 \\\n", "47787 40.285355 8.279074 4 42.821083 -5.945982 9.116584 \n", "97210 34.170575 4.504379 29 -17.100323 -29.930051 -157.993357 \n", "42521 19.381192 -21.545576 28 -168.137523 -76.675448 146.536801 \n", "96452 51.975142 61.556268 11 35.396721 109.826010 98.528868 \n", "47115 42.532394 40.353842 6 91.515160 99.488643 -24.809279 \n", "\n", " con_3 con_4 cat_0 cat_1 cat_2 cat_3 cat_4 \n", "47787 47.626833 -7.233663 70 54 27 91 96 \n", "97210 26.581181 74.024540 72 64 23 13 60 \n", "42521 168.084883 178.468823 92 16 66 89 76 \n", "96452 -37.957128 -179.428724 58 4 70 54 71 \n", "47115 -95.285213 145.379252 52 95 82 75 79 " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "subset_df = latent_df.sample(frac=.1) \n", "print(len(subset_df))\n", "subset_df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we can run the timing experiment on samples of increasing size, from 10% of the rows up to the whole dataset." ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# fractions of the data set to time: 0.1, 0.2, ..., 1.0 (the whole data set)\n", "data_portions = np.linspace(.1,1,10)\n", "\n", "# one [portion, elapsed seconds] entry per timed run\n", "time_data = []\n", "\n", "for cur_portion in data_portions:\n", "    # draw the sample before starting the clock so that only the detection is timed\n", "    cur_sample = latent_df.sample(frac=cur_portion)\n", "    # perf_counter is the clock intended for measuring short intervals\n", "    start_time = time.perf_counter()\n", "    dsp.detect_simpsons_paradox(cur_sample)\n", "    time_data.append([cur_portion, (time.perf_counter() - start_time)])" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
portion of datatime
00.13.679361
10.23.910151
20.33.797533
30.44.676100
40.53.796321
50.64.221745
60.74.523880
70.84.056846
80.95.073282
91.05.017606
\n", "
" ], "text/plain": [ " portion of data time\n", "0 0.1 3.679361\n", "1 0.2 3.910151\n", "2 0.3 3.797533\n", "3 0.4 4.676100\n", "4 0.5 3.796321\n", "5 0.6 4.221745\n", "6 0.7 4.523880\n", "7 0.8 4.056846\n", "8 0.9 5.073282\n", "9 1.0 5.017606" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "time_res = pd.DataFrame(data = time_data, columns =['portion of data','time'])\n", "time_res # show the results" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Timing each portion just once is not very indicative, so we repeat the experiment and then compute statistics over the repeats. We repeat it 4 more times, for a total of 5 runs per portion." ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "num_repeats = 4\n", "\n", "# keep the repeats in their own list so that re-running this cell\n", "# does not keep appending more rows to time_data\n", "repeat_time_data = []\n", "\n", "for cur_portion in np.repeat(data_portions,num_repeats):\n", "    # draw the sample before starting the clock so that only the detection is timed\n", "    cur_sample = latent_df.sample(frac=cur_portion)\n", "    # perf_counter is the clock intended for measuring short intervals\n", "    start_time = time.perf_counter()\n", "    dsp.detect_simpsons_paradox(cur_sample)\n", "    repeat_time_data.append([cur_portion, (time.perf_counter() - start_time)])" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "50" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# first run followed by the repeats: 10 + 10 * num_repeats rows\n", "time_res = pd.DataFrame(data = time_data + repeat_time_data, columns =['portion','time'])\n", "len(time_res)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we have 50 rows in our result table and we can compute the statistics that we want. First, we group the results by the portion of the data, so that we can compute the mean and variance over all of the trials for each portion." ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
time
countmeanstdmin25%50%75%max
portion
0.15.04.4464970.8267053.6793613.8126364.0618625.3137675.364857
0.25.04.2569700.2597003.9101514.0607274.3799424.4052774.528751
0.35.04.7047180.9293173.7975334.1715794.3235875.0942946.136595
0.45.03.9368460.4188503.6664403.7356023.7491043.8569854.676100
0.55.04.4561020.7529063.7963214.0263594.1291144.6460395.682674
0.65.04.5403010.6072444.1374534.1732154.2217454.5925805.576510
0.75.04.5452140.5089784.0063324.0979224.5238804.9115305.186406
0.85.04.2892200.5945783.7915773.8390774.0568464.5526315.205971
0.95.04.3183490.4531693.9688184.0457364.0999374.4039755.073282
1.05.04.4962780.5736713.9036253.9983214.4045085.0176065.157329
\n", "
" ], "text/plain": [ " time \\\n", " count mean std min 25% 50% 75% \n", "portion \n", "0.1 5.0 4.446497 0.826705 3.679361 3.812636 4.061862 5.313767 \n", "0.2 5.0 4.256970 0.259700 3.910151 4.060727 4.379942 4.405277 \n", "0.3 5.0 4.704718 0.929317 3.797533 4.171579 4.323587 5.094294 \n", "0.4 5.0 3.936846 0.418850 3.666440 3.735602 3.749104 3.856985 \n", "0.5 5.0 4.456102 0.752906 3.796321 4.026359 4.129114 4.646039 \n", "0.6 5.0 4.540301 0.607244 4.137453 4.173215 4.221745 4.592580 \n", "0.7 5.0 4.545214 0.508978 4.006332 4.097922 4.523880 4.911530 \n", "0.8 5.0 4.289220 0.594578 3.791577 3.839077 4.056846 4.552631 \n", "0.9 5.0 4.318349 0.453169 3.968818 4.045736 4.099937 4.403975 \n", "1.0 5.0 4.496278 0.573671 3.903625 3.998321 4.404508 5.017606 \n", "\n", " \n", " max \n", "portion \n", "0.1 5.364857 \n", "0.2 4.528751 \n", "0.3 6.136595 \n", "0.4 4.676100 \n", "0.5 5.682674 \n", "0.6 5.576510 \n", "0.7 5.186406 \n", "0.8 5.205971 \n", "0.9 5.073282 \n", "1.0 5.157329 " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# one group per portion; each group holds all timed runs for that portion\n", "time_repeats = time_res.groupby(by='portion')\n", "time_repeats.describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can plot the mean time for each portion to see if there is a clear trend." ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# mean time per portion, drawn with the default line plot\n", "mean_times = time_repeats.mean()\n", "mean_times.plot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 2 }