{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Informatics 2 - Foundations of Data Science\n", "\n", "# Hypothesis testing\n", "\n", "David Sterratt, 2020-2024" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import matplotlib.pyplot as plt\n", "from scipy.stats import binom\n", "import pandas as pd\n", "import seaborn as sns\n", "from scipy.stats import expon\n", "from scipy.stats import norm\n", "from scipy.stats import chi2\n", "from scipy import stats\n", "import pandas as pd\n", "import matplotlib\n", "matplotlib.rcParams['figure.dpi'] = 300" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Swain vs. Alabama" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "population_proportions = [.26, .74]\n", "population_proportions" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Example of np.random.choice\n", "# np.random.choice(['Black', 'Non-black'], 100, True, population_proportions)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def panel_number_black():\n", " panel = np.random.choice(['Black', 'Non-black'], 100, True, population_proportions)\n", " return(np.sum(panel == 'Black'))\n", "\n", "panel_number_black()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Statistical simulation" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "k = 10000\n", "panels = np.zeros(k)\n", "\n", "for i in np.arange(k):\n", " panels[i] = panel_number_black()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(6, 2))\n", "plt.subplot(1, 2, 1)\n", "hist_panels = plt.hist(panels, bins=np.arange(5.5,40.), density=True)\n", "plt.vlines( 8, 0, 0.02, color='red')\n", "plt.vlines(15, 0, 0.02, color='magenta')\n", "plt.vlines(20, 0, 0.02, color='yellow')\n", "\n", "\n", "plt.title('Simulation')\n", "\n", "plt.xlabel('Number of Black people on panel')\n", "plt.ylabel('Relative frequency')\n", "\n", "plt.subplot(1, 2, 2)\n", "\n", "n = 100\n", "p = population_proportions[0]\n", "x = np.arange(0, 40, 1)\n", "\n", "plt.plot(x, binom.pmf(x, n, p), 'o', ms=4, label='binom pmf')\n", "plt.plot(x, norm.pdf(x, n*p, np.sqrt(n*p*(1-p))))\n", "\n", "ax = plt.gca()\n", "ax.vlines(x, 0, binom.pmf(x, n, p), alpha=0.5)\n", "\n", "#plt.scatter(8, 0.005, color='red')\n", "plt.vlines( 8, 0, 0.02, color='red')\n", "plt.vlines(15, 0, 0.02, color='magenta')\n", "plt.vlines(20, 0, 0.02, color='yellow')\n", "\n", "\n", "plt.title('Binomial distribution')\n", "plt.ylabel('Probability')\n", "plt.xlabel('Number of Black people on panel')\n", "plt.tight_layout()\n", "plt.savefig('swain-versus-alabama.pdf')\n", "plt.savefig('swain-versus-alabama.png')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### p-values from simulation\n", "\n", "Firstly, with original data $t_0=8$:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pvalue_8 = np.sum(hist_panels[0][np.where(hist_panels[1] < 8)])\n", "pvalue_8" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then with artificial data $t_0=20$:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pvalue_20 = np.sum(hist_panels[0][np.where(hist_panels[1] < 20)])\n", "pvalue_20" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Finally with artificial data $t_0=15$." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pvalue_15 = np.sum(hist_panels[0][np.where(hist_panels[1] < 15)])\n", "pvalue_15" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "hist_panels[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "hist_panels[1]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "np.min(panels)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### p-values from Z-test\n", "\n", "Since $n$ is large we can use the normal approximation: $\\mu=np$ and $\\sigma =\\sqrt{np(1-p)}$. We then compute $z = \\frac{t_0-\\mu}{\\sigma}$, and find _p_-value from the cumulative distribution of the normal function (aka \"Standard normal curve areas\" in statistical tables)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "n = 100\n", "p = population_proportions[0]\n", "mu = n*p\n", "sigma = np.sqrt(n*p*(1-p))\n", "z = (8 - mu)/sigma\n", "norm.cdf(z)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "z = (20 - mu)/sigma\n", "norm.cdf(z)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "t_0 = [8, 15, 20]\n", "p_values = pd.DataFrame({'t_0': t_0,\n", " 'Simulation': [pvalue_8, pvalue_15, pvalue_20],\n", " 'Binomial': binom.cdf(t_0, n=n, p=p),\n", " 'Normal': norm.cdf(t_0, loc=n*p, scale=np.sqrt(n*p*(1-p)))})\n", "\n", "# p_values.to_latex('swain-p-values.tex', float_format='%3.3g')\n", "p_values" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Rejection regions" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "mu = n*p\n", "sigma = np.sqrt(n*p*(1-p))\n", "x = np.arange(0, 40, 0.1)\n", "\n", "plt.figure(figsize=(6,2.5))\n", "\n", "## One tailed\n", "plt.subplot(1,2,1)\n", "plt.plot(x, norm.pdf(x, mu, sigma), color='blue')\n", "\n", "# x_lower = np.linspace(0, mu + sigma*norm.isf(1-0.1))\n", "# plt.fill_between(x_lower, x_lower*0, norm.pdf(x_lower, mu, sigma), color='brown', label='p<0.1')\n", "# x_lower = np.linspace(0, mu + sigma*norm.isf(1-0.05))\n", "# plt.fill_between(x_lower, x_lower*0, norm.pdf(x_lower, mu, sigma), color='orange', label='p<0.05')\n", "# x_lower = np.linspace(0, mu + sigma*norm.isf(1-0.01))\n", "# plt.fill_between(x_lower, x_lower*0, norm.pdf(x_lower, mu, sigma), color='yellow', label='p<0.01')\n", "\n", "x_lower = mu + sigma*norm.isf(1-0.1)\n", "plt.vlines(x_lower, 0, norm.pdf(x_lower, mu, sigma), color='brown', label='p<0.1')\n", "x_lower = mu + sigma*norm.isf(1-0.05)\n", "plt.vlines(x_lower, 0, norm.pdf(x_lower, mu, sigma), color='orange', label='p<0.05')\n", "x_lower = mu + sigma*norm.isf(1-0.01)\n", "plt.vlines(x_lower, 0, norm.pdf(x_lower, mu, sigma), color='lightblue', label='p<0.01')\n", "\n", "plt.scatter(15, 0, color='magenta')\n", "plt.xlabel('$T_0$')\n", "plt.title('Lower-tailed')\n", "plt.legend()\n", "\n", "# Two tailed\n", "plt.subplot(1,2,2)\n", "\n", "plt.plot(x, norm.pdf(x, mu, sigma), color='blue')\n", "\n", "# x_lower = np.linspace(0, mu + sigma*norm.isf(1-0.05))\n", "# plt.fill_between(x_lower, x_lower*0, norm.pdf(x_lower, mu, sigma), color='brown', label='p<0.1')\n", "# x_upper = np.linspace(40, mu + sigma*norm.isf(0.05))\n", "# plt.fill_between(x_upper, x_upper*0, norm.pdf(x_upper, mu, sigma), color='brown')\n", "\n", "x_lower = mu + sigma*norm.isf(1-0.05)\n", "plt.vlines(x_lower, 0, norm.pdf(x_lower, mu, sigma), color='brown', label='p<0.1')\n", "x_upper = mu + sigma*norm.isf(0.05)\n", "plt.vlines(x_upper, 0, norm.pdf(x_upper, mu, sigma), color='brown')\n", "\n", "# x_lower = np.linspace(0, mu + sigma*norm.isf(1-0.025))\n", "# plt.fill_between(x_lower, x_lower*0, norm.pdf(x_lower, mu, sigma), color='orange', label='p<0.05')\n", "# x_upper = np.linspace(40, mu + sigma*norm.isf(0.025))\n", "# plt.fill_between(x_upper, x_upper*0, norm.pdf(x_upper, mu, sigma), color='orange')\n", "\n", "x_lower = mu + sigma*norm.isf(1-0.025)\n", "plt.vlines(x_lower, 0, norm.pdf(x_lower, mu, sigma), color='orange', label='p<0.05')\n", "x_upper = mu + sigma*norm.isf(0.025)\n", "plt.vlines(x_upper, 0, norm.pdf(x_upper, mu, sigma), color='orange')\n", "\n", "# x_lower = np.linspace(0, mu + sigma*norm.isf(1-0.005))\n", "# plt.fill_between(x_lower, x_lower*0, norm.pdf(x_lower, mu, sigma), color='yellow', label='p<0.01')\n", "# x_upper = np.linspace(40, mu + sigma*norm.isf(0.005))\n", "# plt.fill_between(x_upper, x_upper*0, norm.pdf(x_upper, mu, sigma), color='yellow')\n", "\n", "x_lower = mu + sigma*norm.isf(1-0.005)\n", "plt.vlines(x_lower, 0, norm.pdf(x_lower, mu, sigma), color='lightblue', label='p<0.01')\n", "x_upper = mu + sigma*norm.isf(0.005)\n", "plt.vlines(x_upper, 0, norm.pdf(x_upper, mu, sigma), color='lightblue')\n", "\n", "\n", "plt.scatter(15, 0, color='magenta')\n", "plt.xlabel('$T_0$')\n", "plt.title('Two-tailed')\n", "plt.legend()\n", "\n", "plt.tight_layout()\n", "plt.savefig('rejection-regions.png')\n", "plt.savefig('rejection-regions.pdf')\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Multiple categories - Alameda county\n", "\n", "### Data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dat = pd.DataFrame({'Population %': {'Caucasian': 54, 'Black/AA': 18, 'Hispanic': 12,\n", " 'Asian/PI': 15, 'Other': 1},\n", " 'Observed panel numbers': {'Caucasian': 780, 'Black/AA': 117, 'Hispanic': 114,\n", " 'Asian/PI': 384, 'Other': 58}})\n", "dat = dat.transpose()\n", "# dat['Total'] = dat.sum~(1)\n", "\n", "dat.loc['Expected panel numbers'] = dat.loc['Population %']/100*dat.loc['Observed panel numbers'].sum()\n", "dat.loc['(Observed-Expected)^2/Expected'] = \\\n", " np.power(dat.loc['Expected panel numbers'] - dat.loc['Observed panel numbers'], 2) / \\\n", " dat.loc['Expected panel numbers']\n", "\n", "dat['Total'] = dat.sum(1)\n", "\n", "# dat.to_latex('alameda-cell-counts.tex', float_format='%2.2f')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Statistical simulation of $\\chi^2$ statistic" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "K = 1000 # Number of repetitions\n", "n_obs = dat.drop('Total', axis=1).loc['Observed panel numbers']\n", "p = dat.drop('Total', axis=1).loc['Population %']/100\n", "k = len(p) # Number of categories\n", "n = np.sum(n_obs)\n", "n_exp = n*p\n", "chi2 = np.zeros(K)\n", "\n", "for j in range(K):\n", " # Sample\n", " n_sim = np.random.multinomial(n, p)\n", " chi2[j] = np.sum(np.power((n_sim - n_exp), 2)/n_exp)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Plot the distribution from the simulation, along with the theoretical distribution ($\\chi^2$ with $k-1$ degrees of freedom)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(4, 3))\n", "plt.hist(chi2, 50, density=True, label='Simulation')\n", "x = np.arange(0, 30)\n", "plt.plot(x, stats.chi2.pdf(x, k-1), label='Prob. distribution')\n", "plt.legend()\n", "plt.xlabel('$\\chi^2$')\n", "plt.tight_layout()\n", "plt.savefig('alameda-chi2-dist.pdf')\n", "plt.savefig('alameda-chi2-dist.png')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# stats.chi2.pdf(x, 2)\n", "# stats.chi2.pdf(x, k-1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Homogeneity" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "n_obs = pd.DataFrame({'Female': {'Depressed': round(2078*1.44/100), 'Not depressed': round(2078*(100-1.44)/100)},\n", " 'Male': {'Depressed': round(1675*0.7/100), 'Not depressed': round(1675*(100-0.7)/100)}})\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "n_obs_tot = n_obs.copy()\n", "n_obs_tot['Total'] = n_obs.sum(1)\n", "n_obs_tot.loc['Total'] = n_obs_tot.sum(0)\n", "p_Depressed = n_obs_tot.loc['Depressed', 'Total']/n_obs_tot.loc['Total', 'Total']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# p_Depressed = np.mean(n_obs_tot.loc['Depressed', ['Female', 'Male']]/n_obs_tot.loc['Total', ['Female', 'Male']])\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "p_Depressed" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "n_exp = pd.DataFrame({'Depressed': p_Depressed * n_obs_tot.loc['Total', ['Female', 'Male']],\n", " 'Not depressed': (1 - p_Depressed) * n_obs_tot.loc['Total', ['Female', 'Male']]}).transpose()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# n_exp.to_latex('contingency-table-exp.tex', float_format='%2.2f')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "chisq = np.sum(np.sum(np.power(n_exp - n_obs, 2)/n_exp))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "chisq" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "1 - stats.chi2.cdf(chisq, df=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "1 - stats.chi2.cdf(chisq, 2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "chisq*0.79" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ni = n_obs.sum(1)\n", "nj = n_obs.sum(0)\n", "n = ni.sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "n_exp = np.array([ni]).transpose() * np.array(nj)/n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.17" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 4 }