{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Informatics 2 - Foundations of Data Science\n",
    "\n",
    "# Logistic regression\n",
    "\n",
    "David Sterratt, 2020-2024"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "import matplotlib\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.pipeline import Pipeline\n",
    "from scipy.special import expit\n",
    "\n",
    "\n",
    "matplotlib.rcParams['figure.dpi'] = 200"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Logistic function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot the logistic function f(x) = expit(x), alone and with shifted/rescaled arguments.\n",
    "x = np.arange(-10, 10, 0.1)\n",
    "\n",
    "fig, (ax_plain, ax_shifted) = plt.subplots(1, 2, figsize=(6, 2))\n",
    "\n",
    "# Left panel: the logistic function itself\n",
    "ax_plain.plot(x, expit(x), label='f(x)')\n",
    "\n",
    "# Right panel: f(x-4) and f(x/2-2) = f((x-4)/2), both centred on x = 4\n",
    "ax_shifted.plot(x, expit(x-4), label='f(x-4)')\n",
    "ax_shifted.plot(x, expit(x/2-2), label='f(x/2-2)')\n",
    "\n",
    "for panel in (ax_plain, ax_shifted):\n",
    "    panel.legend()\n",
    "    panel.set_xlabel('x')\n",
    "    panel.set_ylabel('f(x)')\n",
    "\n",
    "# Write the files once, after both panels exist; saving after the first panel as well\n",
    "# only produced files that were overwritten straight away.\n",
    "fig.tight_layout()\n",
    "fig.savefig('logistic.pdf')\n",
    "fig.savefig('logistic.png')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Credit Example\n",
    "\n",
    "This is a nice example, but the variables (apart, perhaps, from Age) need log transforms, which makes the explanation harder. The data are a cleaned version of the UCI credit scoring dataset; see https://github.com/davidcsterratt/Credit_Shiny and https://nycdatascience.com/blog/student-works/credit-card-approval-analysis/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Loading and cleaning\n",
    "\n",
    "# '?' is read as a missing value\n",
    "credit = pd.read_csv('Credit_Approval.csv', na_values=['?'])\n",
    "\n",
    "# Recode '+' as 1 and '-' as 0 wherever they occur in the frame\n",
    "for symbol, flag in (('+', 1), ('-', 0)):\n",
    "    credit.replace(symbol, flag, inplace=True)\n",
    "credit.replace('?', pd.NA, inplace=True)\n",
    "\n",
    "# Recode Gender (a -> 0, b -> 1) and Employed (f -> 0, t -> 1), one letter at a time\n",
    "for column, letters in (('Gender', 'ab'), ('Employed', 'ft')):\n",
    "    for code, letter in enumerate(letters):\n",
    "        credit[column] = credit[column].replace(letter, code)\n",
    "\n",
    "# Keep only complete rows, then summarise the resulting frame\n",
    "credit.dropna(inplace=True)\n",
    "credit.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Drop irrelevant (we suppose!) ZipCode\n",
    "credit.drop(columns=['ZipCode'], inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mean of every numeric column within each Gender code\n",
    "gender_groups = credit.groupby(by='Gender')\n",
    "gender_groups.mean(numeric_only=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sns.pairplot creates its own figure, so no plt.figure() call precedes it: a figure\n",
    "# opened here would stay empty and show up as a blank plot in the cell output.\n",
    "sns.pairplot(credit.drop(['Debt', 'YearsEmployed'], axis=1), hue='Approved')\n",
    "plt.tight_layout()\n",
    "plt.savefig('credit-pairplot.png')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace Income and CreditScore by log(1 + value), so that a raw value of 0 maps to 0.\n",
    "# NOTE(review): LogIncome is base 10 but LogCreditScore is the natural log -- confirm this is intended.\n",
    "credit['LogIncome'] = np.log10(1 + credit['Income'])\n",
    "credit['LogCreditScore'] = np.log(1 + credit['CreditScore'])\n",
    "credit.drop(columns=['Income', 'CreditScore'], inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pair plot of the current columns (without Debt and YearsEmployed), coloured by approval\n",
    "pairplot_frame = credit.drop(columns=['Debt', 'YearsEmployed'])\n",
    "sns.pairplot(pairplot_frame, hue='Approved')\n",
    "plt.savefig('credit-pairplot.png')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pairwise correlations of the columns shown in the pair plot\n",
    "credit.drop(columns=['Debt', 'YearsEmployed']).corr()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Applicants ordered by Age; this sorted frame is what the model fits below use\n",
    "credit_sorted = credit.sort_values(by='Age')\n",
    "\n",
    "# Scatter a fixed random subset of 50 applicants (random_state makes it repeatable)\n",
    "age_sample = credit_sorted.sample(n=50, random_state=1)\n",
    "plt.figure(figsize=(3, 1.5))\n",
    "sns.scatterplot(data=age_sample, x='Age', y='Approved')\n",
    "plt.tight_layout()\n",
    "for figure_path in ('credit-age.pdf', 'credit-age.png'):\n",
    "    plt.savefig(figure_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Logistic regression of Approved on Age alone\n",
    "X = credit_sorted[['Age']].to_numpy(copy=True)\n",
    "y = credit_sorted['Approved'].to_numpy()\n",
    "clf = LogisticRegression(random_state=0).fit(X, y)\n",
    "\n",
    "# intercept_ is a length-1 array: take element 0 so that the '%' formatting below gets a\n",
    "# plain number instead of relying on implicit array-to-float conversion (deprecated in\n",
    "# NumPy 1.25). This matches the intercept_[0] indexing used elsewhere in the notebook.\n",
    "beta0 = clf.intercept_[0]\n",
    "beta1 = clf.coef_[0][0]\n",
    "\n",
    "plt.figure(figsize=(3, 2))\n",
    "sns.scatterplot(x='Age', y='Approved', data=credit_sorted.sample(50, random_state=1))\n",
    "# Fitted probability of approval (second column of predict_proba) against Age\n",
    "plt.plot(X, clf.predict_proba(X)[:,1])\n",
    "plt.annotate('$\\\\hat\\\\beta_0=%2.3f$'%(beta0), (20, 0.8))\n",
    "plt.annotate('$\\\\hat\\\\beta_1=%2.3f$'%(beta1), (20, 0.6))\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('credit-age-lr.pdf')\n",
    "plt.savefig('credit-age-lr.png')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Report the fitted coefficients as plain numbers: intercept_ is a length-1 array, so\n",
    "# element 0 is taken before formatting with '%2.3f'.\n",
    "beta0 = clf.intercept_[0]\n",
    "beta1 = clf.coef_[0][0]\n",
    "# offset = -beta0/beta1 is the x at which beta0 + beta1*x = 0, i.e. predicted probability 0.5\n",
    "print('beta_0 = %2.3f; beta_1 = %2.3f; offset = %2.3f'%(beta0, beta1, -beta0/beta1))\n",
    "# f(beta_0) is the predicted probability at x = 0\n",
    "print('f(beta_0) = %2.3f'%(expit(beta0)))\n",
    "\n",
    "clf.coef_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Employment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mean of Approved for each Employed code (pivot_table aggregates with the mean by default)\n",
    "contingency_table = credit.pivot_table(values='Approved', index='Employed')\n",
    "approved_share = contingency_table['Approved']\n",
    "contingency_table['Not approved'] = 1 - approved_share\n",
    "# Odds = share approved / share not approved\n",
    "contingency_table['Approval odds'] = approved_share/contingency_table['Not approved']\n",
    "contingency_table.to_latex('credit-employment-contingency.tex', float_format='%2.2f')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Odds ratio: approval odds for Employed == 1 divided by approval odds for Employed == 0\n",
    "approval_odds = contingency_table['Approval odds']\n",
    "odds_ratio = approval_odds.loc[1]/approval_odds.loc[0]\n",
    "odds_ratio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Natural log of the employment odds ratio\n",
    "log_odds_ratio = np.log(odds_ratio)\n",
    "log_odds_ratio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Logistic regression of Approved on the Employed code alone\n",
    "X = credit_sorted[['Employed']].to_numpy(copy=True)\n",
    "y = credit_sorted['Approved'].to_numpy()\n",
    "clf = LogisticRegression(random_state=0)\n",
    "clf.fit(X, y)\n",
    "\n",
    "# Fitted probability of approval (second column of predict_proba) for every row of X\n",
    "approval_probability = clf.predict_proba(X)[:, 1]\n",
    "sns.relplot(data=credit_sorted.sample(n=50, random_state=1), x='Employed', y='Approved')\n",
    "plt.plot(X, approval_probability)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Both Age and Employed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Logistic regression of Approved on Age and Employed together\n",
    "X = credit_sorted[['Age', 'Employed']].to_numpy(copy=True)\n",
    "y = credit_sorted['Approved'].to_numpy()\n",
    "logr = LogisticRegression(random_state=0)\n",
    "logr.fit(X, y)\n",
    "\n",
    "# exp(coefficient) is computed here but only the raw coefficients are displayed\n",
    "exp_coefficients = np.exp(logr.coef_)[0]\n",
    "logr.coef_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Coefficient vector: intercept first, then the first two entries of coef_[0]\n",
    "beta = np.concatenate([logr.intercept_[:1], logr.coef_[0][:2]])\n",
    "beta"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Table of coefficients and their exponentials, one row per entry of beta\n",
    "coefficient_columns = {'Variable': ['Intercept', 'Age', 'Employed'],\n",
    "                       'Coefficient': beta,\n",
    "                       'Odds or OR': np.exp(beta)}\n",
    "row_labels = ['$\\\\hat\\\\beta_0$', '$\\\\hat\\\\beta_1$', '$\\\\hat\\\\beta_2$']\n",
    "credit_age_employment = pd.DataFrame(coefficient_columns, index=row_labels)\n",
    "# credit_age_employment.to_latex('credit-employment-age-coeffs.tex', float_format='%2.3f')\n",
    "credit_age_employment"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Age and LogIncome"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Logistic regression of Approved on Age and LogIncome\n",
    "X = credit_sorted[['Age', 'LogIncome']].to_numpy(copy=True)\n",
    "y = credit_sorted['Approved'].to_numpy()\n",
    "logr = LogisticRegression(random_state=0)\n",
    "logr.fit(X, y)\n",
    "\n",
    "# Only the last expression is displayed: the mean accuracy on the data the model was fitted to\n",
    "exp_coefficients = np.exp(logr.coef_)[0]\n",
    "logr.score(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.scatterplot(data=credit_sorted, x='Age', y='LogIncome', hue='Approved')\n",
    "x1 = np.arange(10, 90, 10)\n",
    "beta0 = logr.intercept_\n",
    "beta1, beta2 = logr.coef_[0]\n",
    "\n",
    "# Lines of constant log odds c, from beta0 + beta1*x1 + beta2*x2 = c solved for x2.\n",
    "# c = 0 (black) is where the odds are 1; c = log(3) and c = log(1/3) (gray) are odds 3 and 1/3.\n",
    "for c, line_colour in ((0, 'black'), (np.log(3), 'gray'), (np.log(1/3), 'gray')):\n",
    "    x2 = (c - beta0-beta1*x1)/beta2\n",
    "    plt.plot(x1, x2, color=line_colour)\n",
    "\n",
    "plt.ylim([0, 6])\n",
    "plt.xlabel('Age (years)')\n",
    "plt.ylabel('Log (base 10) of Income')\n",
    "\n",
    "for figure_path in ('credit-age-logincome-decision-boundary.pdf',\n",
    "                    'credit-age-logincome-decision-boundary.png'):\n",
    "    plt.savefig(figure_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Scaled Logistic regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Logistic regression preceded by a StandardScaler step, scored on the data it was fitted to\n",
    "scaled_lr_steps = [('scaler', StandardScaler()), ('lr', LogisticRegression())]\n",
    "logr_scaled = Pipeline(scaled_lr_steps)\n",
    "logr_scaled.fit(X, y)\n",
    "logr_scaled.score(X, y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Comparison with KNN"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# k-nearest-neighbours classifier on the unscaled features, scored on the data it was fitted to\n",
    "k = 15\n",
    "neigh = KNeighborsClassifier(n_neighbors=k).fit(X, y)\n",
    "neigh.score(X, y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### KNN  scaled"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# k-nearest-neighbours preceded by a StandardScaler step, scored on the data it was fitted to\n",
    "k = 11\n",
    "scaled_knn_steps = [('scaler', StandardScaler()), ('knn', KNeighborsClassifier(n_neighbors=k))]\n",
    "neigh_scaled = Pipeline(scaled_knn_steps)\n",
    "neigh_scaled.fit(X, y)\n",
    "neigh_scaled.score(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Decision boundaries of logistic regression (left) and scaled KNN (right) over the\n",
    "# (Age, LogIncome) plane, each drawn on top of the data.\n",
    "x_min, x_max = 10, 80\n",
    "y_min, y_max =  0,  6\n",
    "xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),\n",
    "                     np.arange(y_min, y_max, 0.1))\n",
    "\n",
    "fig, ax = plt.subplots(1, 2, sharex='col', sharey='row', figsize=(6, 3))\n",
    "\n",
    "# Left: the logistic boundary is the straight line where the log odds are zero,\n",
    "# intercept + coef_age*Age + coef_logincome*LogIncome = 0. It is computed from logr\n",
    "# directly, so this cell does not depend on beta0/beta1/beta2/x1 left over from an\n",
    "# earlier cell, and no grid prediction is needed for it.\n",
    "sns.scatterplot(x='Age', y='LogIncome', hue='Approved', data=credit_sorted, ax=ax[0])\n",
    "coef_age, coef_logincome = logr.coef_[0]\n",
    "boundary_age = np.array([x_min, x_max])\n",
    "boundary_logincome = -(logr.intercept_[0] + coef_age*boundary_age)/coef_logincome\n",
    "ax[0].plot(boundary_age, boundary_logincome, color='black')\n",
    "ax[0].set_ylim([y_min, y_max])\n",
    "\n",
    "# Right: classify every grid point with the scaled KNN model and draw the contour at\n",
    "# 0.5, which separates the points predicted as 0 from those predicted as 1.\n",
    "sns.scatterplot(x='Age', y='LogIncome', hue='Approved', data=credit_sorted, ax=ax[1])\n",
    "C = neigh_scaled.predict(np.c_[xx.ravel(), yy.ravel()])\n",
    "C = C.reshape(xx.shape)\n",
    "ax[1].contour(xx, yy, C, levels=[0.5])\n",
    "\n",
    "fig.tight_layout()\n",
    "fig.savefig('logistic-knn.png')\n",
    "fig.savefig('logistic-knn.pdf')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Bootstrapping confidence intervals\n",
    "We'll go back to the age and employment example."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bootstrap the Age + Employed model: refit on B resamples of the data drawn with\n",
    "# replacement, keeping the intercept and both coefficients from every refit.\n",
    "np.random.seed(0)\n",
    "B = 1000\n",
    "n = len(credit_sorted)\n",
    "\n",
    "# One row of B zeros per coefficient\n",
    "beta0star, beta1star, beta2star = np.zeros((3, B))\n",
    "\n",
    "for i in range(B):\n",
    "    credit_sample = credit_sorted.sample(n, replace=True)\n",
    "    X = credit_sample[['Age', 'Employed']].to_numpy(copy=True)\n",
    "    y = credit_sample['Approved'].to_numpy()\n",
    "    clf = LogisticRegression().fit(X, y)\n",
    "    beta0star[i] = clf.intercept_[0]\n",
    "    beta1star[i], beta2star[i] = clf.coef_[0]\n",
    "\n",
    "# Exponentiated coefficients: odds for the intercept, odds ratios for the features\n",
    "Odds_star, OR_age_star, OR_employment_star = (\n",
    "    pd.Series(np.exp(draws)) for draws in (beta0star, beta1star, beta2star))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 5% and 95% quantiles of the bootstrapped intercept odds\n",
    "Odds_star.quantile(q=[0.05, 0.95])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 5% and 95% quantiles of the bootstrapped odds ratio for Age\n",
    "OR_age_star.quantile(q=[0.05, 0.95])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 5% and 95% quantiles of the bootstrapped odds ratio for Employed\n",
    "OR_employment_star.quantile(q=[0.05, 0.95])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=[6,2], dpi=200)\n",
    "\n",
    "# One panel per coefficient: (bootstrap draws, exponentiated series, height of the\n",
    "# quantile markers, x-axis label)\n",
    "panels = ((beta0star, Odds_star, 10, 'odds (Intercept)'),\n",
    "          (beta1star, OR_age_star, 45, 'OR (Year of age)'),\n",
    "          (beta2star, OR_employment_star, 0.5, 'OR (Employment)'))\n",
    "\n",
    "for position, (draws, ratios, marker_top, axis_label) in enumerate(panels, start=1):\n",
    "    plt.subplot(1, 3, position)\n",
    "    plt.hist(np.exp(draws), bins=20, density=True)\n",
    "    # Orange lines mark the 5% and 95% quantiles\n",
    "    plt.vlines(ratios.quantile([0.05, 0.95]), 0, marker_top, color='orange')\n",
    "    plt.xlabel(axis_label)\n",
    "\n",
    "plt.tight_layout()\n",
    "for figure_path in ('credit-employment-age-bootstrap.pdf', 'credit-employment-age-bootstrap.png'):\n",
    "    plt.savefig(figure_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Making logistic regression transparent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "credit_sorted.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Fit on Employed, Age and LogIncome, then spell the model out as a hand-computable score\n",
    "X = credit_sorted[['Employed', 'Age', 'LogIncome']].to_numpy(copy=True)\n",
    "y = credit_sorted['Approved'].to_numpy()\n",
    "logr = LogisticRegression(random_state=0).fit(X, y)\n",
    "\n",
    "# coef_[0] follows the column order of X. intercept_ is a length-1 array, so element 0\n",
    "# is taken to format and display the threshold as a plain number.\n",
    "coef_employed, coef_age, coef_logincome = logr.coef_[0]\n",
    "threshold = -logr.intercept_[0]\n",
    "\n",
    "print(\"If you are in employment you score %2.3f, if not you score 0\"%(coef_employed))\n",
    "print(\"Multiply your age by %2.3f and add the result to your score\"%(coef_age))\n",
    "print(\"Round your income to the nearest 1000. Multiply the number of zeros in this figure by %2.3f and add the result to your score\"%(coef_logincome))\n",
    "# The weighted sum exceeds -intercept exactly when the log odds are positive\n",
    "print(\"If you scored more than %2.3f, your credit will be approved\"%(threshold))\n",
    "\n",
    "threshold\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Logistic regression on full dataset\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the frame for the full model. The columns to drop are passed by keyword:\n",
    "# DataFrame.drop no longer accepts the axis as a second positional argument\n",
    "# (drop(labels, 1) raises TypeError from pandas 2.0).\n",
    "credit_full = credit.drop(columns=['Citizen', 'DriversLicense', 'PriorDefault', 'EducationLevel',\n",
    "                                   'Ethnicity', 'BankCustomer', 'Married'])\n",
    "\n",
    "# Replace Debt and YearsEmployed by log10(1 + value)\n",
    "credit_full['LogDebt'] = np.log10(credit_full['Debt'] + 1)\n",
    "credit_full['LogYearsEmployed'] = np.log10(credit_full['YearsEmployed'] + 1)\n",
    "credit_full = credit_full.drop(columns=['Debt', 'YearsEmployed'])\n",
    "\n",
    "sns.pairplot(credit_full, hue='Approved')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Model on every column of credit_full except the target, Gender included\n",
    "features_gender = credit_full.columns.drop('Approved')\n",
    "X_gender = credit_full[features_gender].to_numpy(copy=True)\n",
    "# Take the target from credit_full too, so X and y come from the same frame\n",
    "y = credit_full['Approved'].to_numpy()\n",
    "\n",
    "logr_gender = LogisticRegression(random_state=0).fit(X_gender, y)\n",
    "\n",
    "# Intercept and coefficients in one Series labelled by feature name. Series.append was\n",
    "# removed in pandas 2.0, so the two parts are joined with pd.concat; intercept_[0]\n",
    "# stores the intercept as a number rather than as a length-1 array.\n",
    "coeffs_gender = pd.concat([\n",
    "    pd.Series({'Intercept': logr_gender.intercept_[0]}),\n",
    "    pd.Series(logr_gender.coef_[0], index=features_gender),\n",
    "])\n",
    "\n",
    "logr_gender.score(X_gender, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "coeffs_gender"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same model with the Gender column left out\n",
    "features_nogender = credit_full.columns.drop(['Approved', 'Gender'])\n",
    "X_nogender = credit_full[features_nogender].to_numpy(copy=True)\n",
    "# Take the target from credit_full too, so X and y come from the same frame\n",
    "y = credit_full['Approved'].to_numpy()\n",
    "\n",
    "logr_nogender = LogisticRegression(random_state=0).fit(X_nogender, y)\n",
    "\n",
    "# Intercept and coefficients in one Series labelled by feature name. Series.append was\n",
    "# removed in pandas 2.0, so the two parts are joined with pd.concat; intercept_[0]\n",
    "# stores the intercept as a number rather than as a length-1 array.\n",
    "coeffs_nogender = pd.concat([\n",
    "    pd.Series({'Intercept': logr_nogender.intercept_[0]}),\n",
    "    pd.Series(logr_nogender.coef_[0], index=features_nogender),\n",
    "])\n",
    "\n",
    "logr_nogender.score(X_nogender, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "coeffs_nogender"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Two hypothetical applicants who differ only in their Gender code\n",
    "personA = pd.Series({'Gender': 1,\n",
    "                     'Age': 30,\n",
    "                     'Employed': 1,\n",
    "                     'LogIncome': 3.1,\n",
    "                     'LogCreditScore': 0,\n",
    "                     'LogDebt': 0,\n",
    "                     'LogYearsEmployed': np.log10(5)})\n",
    "\n",
    "personB = personA.copy()\n",
    "personB['Gender'] = 0\n",
    "\n",
    "# Log odds of approval under the model fitted without Gender. Each person is restricted\n",
    "# to that model's features explicitly (rather than relying on the unmatched 'Gender'\n",
    "# entry becoming NaN and being skipped by the sum), and both results are printed --\n",
    "# as bare expressions only the last one would be displayed.\n",
    "weights_nogender = coeffs_nogender.drop('Intercept')\n",
    "for person in (personA, personB):\n",
    "    print(np.sum(person[weights_nogender.index] * weights_nogender) + coeffs_nogender['Intercept'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Log odds of approval under the model fitted with Gender. The intercept must come from\n",
    "# the same model as the weights (coeffs_gender), not from the Gender-free model.\n",
    "weights_gender = coeffs_gender.drop('Intercept')\n",
    "for person in (personA, personB):\n",
    "    print(np.sum(person * weights_gender) + coeffs_gender['Intercept'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Predictions of both models, stored on credit_full for the per-gender comparison.\n",
    "# The Gender-free model is evaluated with 1 added to its intercept, i.e. to every log odds.\n",
    "# NOTE(review): the reason for the +1 shift is not stated in the notebook -- confirm.\n",
    "# The shift is applied to the decision scores instead of being written back into\n",
    "# logr_nogender.intercept_, so re-running this cell does not add a further +1 each time.\n",
    "NOGENDER_LOG_ODDS_SHIFT = 1\n",
    "\n",
    "credit_full['ApprovedLRgender'] = logr_gender.predict(X_gender)\n",
    "\n",
    "# Same rule as predict(): the second class is chosen where the (shifted) score is positive\n",
    "shifted_scores = logr_nogender.decision_function(X_nogender) + NOGENDER_LOG_ODDS_SHIFT\n",
    "credit_full['ApprovedLRnogender'] = logr_nogender.classes_[(shifted_scores > 0).astype(int)]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column means within each Gender code, including the two prediction columns\n",
    "credit_full.groupby(by='Gender').mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Bootstrapping, Age, Employment and LogIncome"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bootstrap the Age + Employed + LogIncome model: refit on B resamples of the data drawn\n",
    "# with replacement, keeping the intercept and the three coefficients from every refit.\n",
    "B = 1000\n",
    "n = len(credit_sorted)\n",
    "\n",
    "# One row of B zeros per coefficient\n",
    "beta0star, beta1star, beta2star, beta3star = np.zeros((4, B))\n",
    "\n",
    "for i in range(B):\n",
    "    credit_sample = credit_sorted.sample(n, replace=True)\n",
    "    X = credit_sample[['Age', 'Employed', 'LogIncome']].to_numpy(copy=True)\n",
    "    y = credit_sample['Approved'].to_numpy()\n",
    "    clf = LogisticRegression().fit(X, y)\n",
    "    beta0star[i] = clf.intercept_[0]\n",
    "    beta1star[i], beta2star[i], beta3star[i] = clf.coef_[0]\n",
    "\n",
    "# Exponentiated coefficients: odds for the intercept, odds ratios for the features\n",
    "Odds_star, OR_age_star, OR_employment_star, OR_logincome_star = (\n",
    "    pd.Series(np.exp(draws)) for draws in (beta0star, beta1star, beta2star, beta3star))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=[6,2], dpi=300)\n",
    "\n",
    "# One panel per coefficient: (bootstrap draws, exponentiated series, height of the\n",
    "# quantile markers, x-axis label)\n",
    "panels = ((beta0star, Odds_star, 10, 'odds (Intercept)'),\n",
    "          (beta1star, OR_age_star, 45, 'OR (Year of age)'),\n",
    "          (beta2star, OR_employment_star, 0.5, 'OR (Employment)'),\n",
    "          (beta3star, OR_logincome_star, 0.5, 'OR (Log Income)'))\n",
    "\n",
    "for position, (draws, ratios, marker_top, axis_label) in enumerate(panels, start=1):\n",
    "    plt.subplot(1, 4, position)\n",
    "    plt.hist(np.exp(draws), bins=20, density=True)\n",
    "    # Orange lines mark the 5% and 95% quantiles\n",
    "    plt.vlines(ratios.quantile([0.05, 0.95]), 0, marker_top, color='orange')\n",
    "    plt.xlabel(axis_label)\n",
    "\n",
    "plt.tight_layout()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 5% and 95% quantiles of the bootstrapped odds ratio for Age\n",
    "OR_age_star.quantile(q=[0.05, 0.95])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 5% and 95% quantiles of the bootstrapped odds ratio for Employed\n",
    "OR_employment_star.quantile(q=[0.05, 0.95])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 5% and 95% quantiles of the bootstrapped odds ratio for LogIncome\n",
    "OR_logincome_star.quantile(q=[0.05, 0.95])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.17"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}