{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Informatics 2 - Foundations of Data Science\n",
    "\n",
    "# Clustering with K-means and basic Hierarchical clustering\n",
    "\n",
    "David Sterratt, 2020-2024"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import numpy as np\n",
    "import scipy\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib\n",
    "from scipy.cluster import hierarchy\n",
    "\n",
    "matplotlib.rcParams['savefig.dpi'] = 300\n",
    "matplotlib.rcParams['font.size'] = 12\n",
    "matplotlib.rcParams['figure.figsize'] = [3, 3]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data\n",
    "We'll use the [oranges and lemons data set](http://homepages.inf.ed.ac.uk/imurray2/teaching/oranges_and_lemons/) collected by Iain Murray."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dat = pd.read_csv('fruits.csv')\n",
    "dat.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.figure(figsize=(5, 5))\n",
    "ax = sns.scatterplot(x='Width (cm)', y='Height (cm)', data=dat)\n",
    "ax.set_aspect(1)\n",
    "ax.set_xlim([2, 10])\n",
    "plt.tight_layout()\n",
    "#fig.savefig('fruit-raw.png')\n",
    "#fig.savefig('fruit-raw.pdf')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Functions to perform K-means clustering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def sqdist(m, x):\n",
    "    n = x.shape[0]\n",
    "    D = x.shape[1]\n",
    "    K = m.shape[0]\n",
    "    sumsq = np.zeros((n, K))\n",
    "    for i in range(D):\n",
    "        sumsq = sumsq + np.power(np.subtract.outer(x[:,i], m[:,i]), 2)\n",
    "    return(sumsq)\n",
    "\n",
    "def kmeans(x, K, maxiter, verbose=True):\n",
    "    n = x.shape[0]\n",
    "    D = x.shape[1]\n",
    "    \n",
    "    ## Initialisation\n",
    "    \n",
    "    # Choose random data points as cluster centres\n",
    "    m = x[np.random.choice(range(n), K, replace=False), :]\n",
    "    sqd = sqdist(x, m)\n",
    "    c = sqd.argmin(0)\n",
    "        \n",
    "    i = 0\n",
    "    converged = False\n",
    "    c0 = np.zeros(n)\n",
    "    ## Iterate\n",
    "    while (not converged) and (i < maxiter):\n",
    "        i = i + 1\n",
    "        ## Assign each point to closest cluster centre\n",
    "\n",
    "        ## Compute distances\n",
    "        sqd = sqdist(x, m)\n",
    "        c = sqd.argmin(0)\n",
    "\n",
    "        ## Recompute means\n",
    "        for k in range(K):\n",
    "            m[k,:] = np.mean(x[k == c,:], 0)     \n",
    "        \n",
    "        if ((c0 == c).all()):\n",
    "            if verbose:\n",
    "                print(\"Converged after \" + str(i) + \" iterations\")\n",
    "            converged = True\n",
    "            \n",
    "        c0 = c.copy() ## Keep copy of old cluster assignments to check for convergence\n",
    "\n",
    "    return((c, m))\n",
    "\n",
    "def kmeans_iteration(x, m):\n",
    "    K = m.shape[0]\n",
    "    \n",
    "    ## Compute distances\n",
    "    sqd = sqdist(x, m)\n",
    "    c = sqd.argmin(0)\n",
    "\n",
    "    ## Recompute means\n",
    "    for k in range(K):\n",
    "        m[k,:] = np.mean(x[k == c,:], 0)\n",
    "    \n",
    "    return((c, m))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cluster the data\n",
    "\n",
    "Our K-means algorithm doesn't standardise the data, so we need to do this ourselves first."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x = dat[['Width (cm)', 'Height (cm)']].to_numpy()\n",
    "z = (x - x.mean(0))/x.std(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dat['Cluster'], zbar = kmeans(z, 5, 100)\n",
    "xbar = pd.DataFrame(zbar*np.array(x.std()) + np.array(x.mean()), columns = ['Width (cm)', 'Height (cm)'])\n",
    "xbar['Cluster'] = list(range(xbar.shape[0]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.figure(figsize=(5, 5))\n",
    "ax = sns.scatterplot(x='Width (cm)', y='Height (cm)', hue='Cluster', style='Cluster', data=dat,\n",
    "                     palette='colorblind')\n",
    "# sns.scatterplot(x=xbar[:,0], y=xbar[:,1], color='black', data=dat, palette='colorblind')\n",
    "ax.set_xlim([2, 10])\n",
    "ax.set_aspect(1)\n",
    "\n",
    "\n",
    "# plt.legend(bbox_to_anchor=(1.01, 1),borderaxespad=0)\n",
    "plt.tight_layout()\n",
    "##fig.savefig('fruit-clustered.png')\n",
    "##fig.savefig('fruit-clustered.pdf')\n",
    "\n",
    "sns.scatterplot(x='Width (cm)', y='Height (cm)', color='black', style='Cluster', data=xbar, palette='colorblind',\n",
    "               legend=False, sizes=(150,150), size='Cluster', alpha=0.5)\n",
    "##fig.savefig('fruit-clustered-with-centres.png')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.figure(figsize=(5, 5))\n",
    "\n",
    "ax = sns.scatterplot(x='Width (cm)', y='Height (cm)', hue='Fruit', style='Fruit', data=dat, palette='colorblind')\n",
    "# plt.legend(bbox_to_anchor=(1.01, 1),borderaxespad=0)\n",
    "ax.set_xlim([2, 10])\n",
    "ax.set_aspect(1)\n",
    "ax.legend(loc='upper left')\n",
    "plt.tight_layout()\n",
    "\n",
    "#fig.savefig('fruit-original.png')\n",
    "#fig.savefig('fruit-original.pdf')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Kmeans step-by-step"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def sum_sq_err(m, x, c):\n",
    "    n = x.shape[0]\n",
    "    E = 0\n",
    "    for i in range(n):\n",
    "        E = E + np.sum(np.power(x[i,:] - m[c[i],:], 2))\n",
    "    return(E)\n",
    "\n",
    "def plot_kmeans(c, zbar, z):\n",
    "    fig = plt.figure(figsize=(5, 5))\n",
    "    ax = sns.scatterplot(x=z[:,0], y=z[:,1], hue=c, style=c, palette='colorblind')\n",
    "    sns.scatterplot(x=zbar[:,0], y=zbar[:,1], color='black', style=list(range(zbar.shape[0])),\n",
    "                        palette='colorblind', legend=False, sizes=(150,150), size=2, alpha=0.5)\n",
    "    ax.set_xlim([-3, 3])\n",
    "    ax.set_ylim([-3, 3])\n",
    "    ax.set_aspect(1)\n",
    "    plt.annotate(\"E = \" + '%3.2f'%(sum_sq_err(zbar, z, c)), (1, -2.5))\n",
    "    plt.xlabel('Width')\n",
    "    plt.ylabel('Height')\n",
    "    plt.tight_layout()\n",
    "\n",
    "\n",
    "x = dat[['Width (cm)', 'Height (cm)']].to_numpy()\n",
    "z = (x - x.mean(0))/x.std(0)\n",
    "c, zbar = kmeans(z, 5, 0)\n",
    "i = 1\n",
    "plot_kmeans(c, zbar, z)\n",
    "plt.savefig('iteration' + str(i) + '.png')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "i = i + 1\n",
    "c, zbar = kmeans_iteration(z, zbar)\n",
    "plot_kmeans(c, zbar, z)\n",
    "plt.savefig('iteration' + str(i) + '.png')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "c, zbar = kmeans_iteration(z, zbar)\n",
    "plot_kmeans(c, zbar, z)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "c, zbar = kmeans_iteration(z, zbar)\n",
    "plot_kmeans(c, zbar, z)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "c, zbar = kmeans_iteration(z, zbar)\n",
    "plot_kmeans(c, zbar, z)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "c, zbar = kmeans_iteration(z, zbar)\n",
    "plot_kmeans(c, zbar, z)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluation of Kmeans"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "c, zbar = kmeans(z, 5, 10)\n",
    "plot_kmeans(c, zbar, z)\n",
    "# plt.savefig('kmeans-evaluation1.png')\n",
    "# plt.savefig('kmeans-evaluation1.pdf')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "c, zbar = kmeans(z, 5, 10)\n",
    "plot_kmeans(c, zbar, z)\n",
    "# plt.savefig('kmeans-evaluation2.png')\n",
    "# plt.savefig('kmeans-evaluation2.pdf')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Multiple attempts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def kmeans_multiple(z, K, maxiter, n_attempts):\n",
    "    c0, zbar0 = kmeans(z, K, maxiter, False)\n",
    "    E0 = sum_sq_err(zbar0, z, c0)\n",
    "    for i in range(n_attempts):\n",
    "        c, zbar = kmeans(z, K, maxiter, False)\n",
    "        E = sum_sq_err(zbar0, z, c0)\n",
    "        if E < E0:\n",
    "            c0 = c.copy()\n",
    "            zbar0 = zbar.copy()\n",
    "            E0 = E\n",
    "    return(c0, zbar0, E0)\n",
    "\n",
    "def scree(z, Ks, maxiter, n_attempts):\n",
    "    E = np.zeros(len(Ks))\n",
    "    for i in range(len(Ks)):\n",
    "        c, zbar, E[i] = kmeans_multiple(z, Ks[i], maxiter, n_attempts)\n",
    "    return(pd.DataFrame({'K': Ks, 'E': E}))\n",
    "\n",
    "out = scree(z, np.arange(1, 7), 10, 1000)\n",
    "out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=[4, 3])\n",
    "sns.lineplot(x='K', y='E', data=out)\n",
    "plt.ylim([0, 120])\n",
    "plt.annotate('Elbow', (5, 13))\n",
    "plt.tight_layout()\n",
    "plt.savefig('kmeans-scree.png')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Hierarchical clustering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(10, 11))\n",
    "hc = hierarchy.linkage(x, 'centroid')\n",
    "hierarchy.dendrogram(hc, orientation='right', labels=np.array(dat.Type), leaf_font_size=10,\n",
    "                    color_threshold=1.0)\n",
    "plt.xlabel('Inter-cluster distance')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('fruit-hierarchical.png')\n",
    "plt.savefig('fruit-hierarchical.pdf')\n",
    "\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.17"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}