{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Informatics 2 - Foundations of Data Science\n", "\n", "# Clustering with K-means and basic Hierarchical clustering\n", "\n", "David Sterratt, 2020-2024" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import seaborn as sns\n", "import numpy as np\n", "import scipy\n", "import matplotlib.pyplot as plt\n", "import matplotlib\n", "from scipy.cluster import hierarchy\n", "\n", "matplotlib.rcParams['savefig.dpi'] = 300\n", "matplotlib.rcParams['font.size'] = 12\n", "matplotlib.rcParams['figure.figsize'] = [3, 3]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load data\n", "We'll use the [oranges and lemons data set](http://homepages.inf.ed.ac.uk/imurray2/teaching/oranges_and_lemons/) collected by Iain Murray." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dat = pd.read_csv('fruits.csv')\n", "dat.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig = plt.figure(figsize=(5, 5))\n", "ax = sns.scatterplot(x='Width (cm)', y='Height (cm)', data=dat)\n", "ax.set_aspect(1)\n", "ax.set_xlim([2, 10])\n", "plt.tight_layout()\n", "#fig.savefig('fruit-raw.png')\n", "#fig.savefig('fruit-raw.pdf')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Functions to perform K-means clustering" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def sqdist(m, x):\n", " n = x.shape[0]\n", " D = x.shape[1]\n", " K = m.shape[0]\n", " sumsq = np.zeros((n, K))\n", " for i in range(D):\n", " sumsq = sumsq + np.power(np.subtract.outer(x[:,i], m[:,i]), 2)\n", " return(sumsq)\n", "\n", "def kmeans(x, K, maxiter, verbose=True):\n", " n = x.shape[0]\n", " D = x.shape[1]\n", " \n", " ## Initialisation\n", " \n", " # Choose random data points as cluster centres\n", " m = x[np.random.choice(range(n), K, replace=False), :]\n", " sqd = sqdist(x, m)\n", " c = sqd.argmin(0)\n", " \n", " i = 0\n", " converged = False\n", " c0 = np.zeros(n)\n", " ## Iterate\n", " while (not converged) and (i < maxiter):\n", " i = i + 1\n", " ## Assign each point to closest cluster centre\n", "\n", " ## Compute distances\n", " sqd = sqdist(x, m)\n", " c = sqd.argmin(0)\n", "\n", " ## Recompute means\n", " for k in range(K):\n", " m[k,:] = np.mean(x[k == c,:], 0) \n", " \n", " if ((c0 == c).all()):\n", " if verbose:\n", " print(\"Converged after \" + str(i) + \" iterations\")\n", " converged = True\n", " \n", " c0 = c.copy() ## Keep copy of old cluster assignments to check for convergence\n", "\n", " return((c, m))\n", "\n", "def kmeans_iteration(x, m):\n", " K = m.shape[0]\n", " \n", " ## Compute distances\n", " sqd = sqdist(x, m)\n", " c = sqd.argmin(0)\n", "\n", " ## Recompute means\n", " for k in range(K):\n", " m[k,:] = np.mean(x[k == c,:], 0)\n", " \n", " return((c, m))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Cluster the data\n", "\n", "Our K-means algorithm doesn't standardise the data, so we need to do this ourselves first." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "x = dat[['Width (cm)', 'Height (cm)']].to_numpy()\n", "z = (x - x.mean(0))/x.std(0)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dat['Cluster'], zbar = kmeans(z, 5, 100)\n", "xbar = pd.DataFrame(zbar*np.array(x.std()) + np.array(x.mean()), columns = ['Width (cm)', 'Height (cm)'])\n", "xbar['Cluster'] = list(range(xbar.shape[0]))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig = plt.figure(figsize=(5, 5))\n", "ax = sns.scatterplot(x='Width (cm)', y='Height (cm)', hue='Cluster', style='Cluster', data=dat,\n", " palette='colorblind')\n", "# sns.scatterplot(x=xbar[:,0], y=xbar[:,1], color='black', data=dat, palette='colorblind')\n", "ax.set_xlim([2, 10])\n", "ax.set_aspect(1)\n", "\n", "\n", "# plt.legend(bbox_to_anchor=(1.01, 1),borderaxespad=0)\n", "plt.tight_layout()\n", "##fig.savefig('fruit-clustered.png')\n", "##fig.savefig('fruit-clustered.pdf')\n", "\n", "sns.scatterplot(x='Width (cm)', y='Height (cm)', color='black', style='Cluster', data=xbar, palette='colorblind',\n", " legend=False, sizes=(150,150), size='Cluster', alpha=0.5)\n", "##fig.savefig('fruit-clustered-with-centres.png')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig = plt.figure(figsize=(5, 5))\n", "\n", "ax = sns.scatterplot(x='Width (cm)', y='Height (cm)', hue='Fruit', style='Fruit', data=dat, palette='colorblind')\n", "# plt.legend(bbox_to_anchor=(1.01, 1),borderaxespad=0)\n", "ax.set_xlim([2, 10])\n", "ax.set_aspect(1)\n", "ax.legend(loc='upper left')\n", "plt.tight_layout()\n", "\n", "#fig.savefig('fruit-original.png')\n", "#fig.savefig('fruit-original.pdf')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Kmeans step-by-step" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def sum_sq_err(m, x, c):\n", " n = x.shape[0]\n", " E = 0\n", " for i in range(n):\n", " E = E + np.sum(np.power(x[i,:] - m[c[i],:], 2))\n", " return(E)\n", "\n", "def plot_kmeans(c, zbar, z):\n", " fig = plt.figure(figsize=(5, 5))\n", " ax = sns.scatterplot(x=z[:,0], y=z[:,1], hue=c, style=c, palette='colorblind')\n", " sns.scatterplot(x=zbar[:,0], y=zbar[:,1], color='black', style=list(range(zbar.shape[0])),\n", " palette='colorblind', legend=False, sizes=(150,150), size=2, alpha=0.5)\n", " ax.set_xlim([-3, 3])\n", " ax.set_ylim([-3, 3])\n", " ax.set_aspect(1)\n", " plt.annotate(\"E = \" + '%3.2f'%(sum_sq_err(zbar, z, c)), (1, -2.5))\n", " plt.xlabel('Width')\n", " plt.ylabel('Height')\n", " plt.tight_layout()\n", "\n", "\n", "x = dat[['Width (cm)', 'Height (cm)']].to_numpy()\n", "z = (x - x.mean(0))/x.std(0)\n", "c, zbar = kmeans(z, 5, 0)\n", "i = 1\n", "plot_kmeans(c, zbar, z)\n", "plt.savefig('iteration' + str(i) + '.png')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "i = i + 1\n", "c, zbar = kmeans_iteration(z, zbar)\n", "plot_kmeans(c, zbar, z)\n", "plt.savefig('iteration' + str(i) + '.png')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "c, zbar = kmeans_iteration(z, zbar)\n", "plot_kmeans(c, zbar, z)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "c, zbar = kmeans_iteration(z, zbar)\n", "plot_kmeans(c, zbar, z)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "c, zbar = kmeans_iteration(z, zbar)\n", "plot_kmeans(c, zbar, z)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "c, zbar = kmeans_iteration(z, zbar)\n", "plot_kmeans(c, zbar, z)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Evaluation of Kmeans" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "c, zbar = kmeans(z, 5, 10)\n", "plot_kmeans(c, zbar, z)\n", "# plt.savefig('kmeans-evaluation1.png')\n", "# plt.savefig('kmeans-evaluation1.pdf')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "c, zbar = kmeans(z, 5, 10)\n", "plot_kmeans(c, zbar, z)\n", "# plt.savefig('kmeans-evaluation2.png')\n", "# plt.savefig('kmeans-evaluation2.pdf')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Multiple attempts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def kmeans_multiple(z, K, maxiter, n_attempts):\n", " c0, zbar0 = kmeans(z, K, maxiter, False)\n", " E0 = sum_sq_err(zbar0, z, c0)\n", " for i in range(n_attempts):\n", " c, zbar = kmeans(z, K, maxiter, False)\n", " E = sum_sq_err(zbar0, z, c0)\n", " if E < E0:\n", " c0 = c.copy()\n", " zbar0 = zbar.copy()\n", " E0 = E\n", " return(c0, zbar0, E0)\n", "\n", "def scree(z, Ks, maxiter, n_attempts):\n", " E = np.zeros(len(Ks))\n", " for i in range(len(Ks)):\n", " c, zbar, E[i] = kmeans_multiple(z, Ks[i], maxiter, n_attempts)\n", " return(pd.DataFrame({'K': Ks, 'E': E}))\n", "\n", "out = scree(z, np.arange(1, 7), 10, 1000)\n", "out" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=[4, 3])\n", "sns.lineplot(x='K', y='E', data=out)\n", "plt.ylim([0, 120])\n", "plt.annotate('Elbow', (5, 13))\n", "plt.tight_layout()\n", "plt.savefig('kmeans-scree.png')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Hierarchical clustering" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(10, 11))\n", "hc = hierarchy.linkage(x, 'centroid')\n", "hierarchy.dendrogram(hc, orientation='right', labels=np.array(dat.Type), leaf_font_size=10,\n", " color_threshold=1.0)\n", "plt.xlabel('Inter-cluster distance')\n", "\n", "plt.tight_layout()\n", "plt.savefig('fruit-hierarchical.png')\n", "plt.savefig('fruit-hierarchical.pdf')\n", "\n", "plt.show()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.17" } }, "nbformat": 4, "nbformat_minor": 4 }