{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# TP1 KMEANS\n", "\n", "On nous propose de coder l'algorithme des kmeans afin de faire du clustering sur 2 classes puis plus de 2 classes.\n", "Plus tard, on utilisera notre algorithme pour segmenter une image sur l'information de couleur." ] }, { "cell_type": "code", "execution_count": 379, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import scipy.spatial" ] }, { "cell_type": "code", "execution_count": 380, "metadata": {}, "outputs": [], "source": [ "# mean = [1,2,3,4]\n", "# sd = [0.25, 0.25, 0.1, 0.2]\n", "clusters = 2\n", "mean = np.random.randint(5, size=clusters)\n", "sd = [0.25, 0.25, 0.3]\n", "dim = 2\n", "nb = 50\n", "K= clusters" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Fonctions à utiliser pour le clustering" ] }, { "cell_type": "code", "execution_count": 381, "metadata": {}, "outputs": [], "source": [ "def gen_points(mean=1,sd=0.5, nb=100, dim=2, clusters=2):\n", " size = []\n", " # for i in range(0,dim):\n", " size.append(nb)\n", " size.append(dim)\n", " points = np.random.normal(mean[0],sd[0],size=size)\n", " for i in range(1,clusters):\n", " points = np.concatenate((points,np.random.normal(mean[i],sd[i],size=size)),axis=0)\n", " \n", " return points" ] }, { "cell_type": "code", "execution_count": 382, "metadata": {}, "outputs": [], "source": [ "def distance(points,Pc): \n", " return scipy.spatial.distance.cdist(points[:,:], Pc[:,:])" ] }, { "cell_type": "code", "execution_count": 383, "metadata": {}, "outputs": [], "source": [ "def kmeans(points = [0,0], K = 1, nb=1, dim=2):\n", " # Initialisation K prototypes\n", " Pc_index = []\n", " Pc_save = np.zeros([K,dim])\n", " clusters = []\n", " iter = 0\n", " eps = 0.1\n", "\n", " for i in range(0,K):\n", " Pc_index.append(np.random.randint(0,nb*dim))\n", " Pc = points[Pc_index,:]\n", "\n", " while (np.mean(distance(Pc,Pc_save)) > eps and iter < 10):\n", " iter += 1\n", " Pc_save = Pc\n", " # print(Pc)\n", " # print(points[:,:Pc.shape[0]])\n", " dist = distance(points=points[:,:Pc.shape[0]],Pc=Pc)\n", " clust = np.argmin(dist, axis=1)\n", " clust = np.expand_dims(clust, axis=0)\n", " points = np.append(points[:,:Pc.shape[0]], clust.T, axis=1)\n", " # print(points)\n", " Pc = np.zeros([K,dim])\n", " index = np.array([])\n", "\n", " for n in range(0,2*nb):\n", " for k in range(0,K):\n", " index = np.append(index, (clust==k).sum())\n", " if points[n,-1] == k:\n", " # print(points)\n", " # print(Pc)\n", " Pc[k,:] = np.add(Pc[k,:], points[n,:-1])\n", "\n", " for k in range(0,K):\n", " Pc[k,:] = np.divide(Pc[k,:],index[k])\n", "\n", " # print(Pc)\n", " return Pc, points\n" ] }, { "cell_type": "code", "execution_count": 384, "metadata": {}, "outputs": [], "source": [ "colors=['red', 'green','yellow','blue','purple', 'orange']\n", "def visualisation(points, Pc=[0,0], dim=2, K=1):\n", " if(dim==2):\n", " for k in range(0,K):\n", " for n in range(0,len(points)):\n", " plt.plot(points[n,0], points[n,1], 'o', color=colors[int(points[n,-1])])\n", " plt.plot(Pc[:,0],Pc[:,1],'r+')\n", " plt.grid(True)\n", " plt.axis([min(mean)-1,max(mean)+1,min(mean)-1,max(mean)+1])" ] }, { "cell_type": "code", "execution_count": 385, "metadata": {}, "outputs": [], "source": [ "points = gen_points(mean,sd,nb,dim,clusters)\n", "# print(points.shape)\n", "# print(points.mean(axis=0))\n", "# print(points)" ] }, { "cell_type": "code", "execution_count": 386, "metadata": {}, "outputs": [], "source": [ "dist = distance(points,points)\n", "# print(dist)" ] }, { "cell_type": "code", "execution_count": 387, "metadata": {}, "outputs": [], "source": [ "Pc, clusters = kmeans(points,K=K,nb=nb,dim=dim)\n", "# print(Pc)\n", "# print(clusters)\n" ] }, { "cell_type": "code", "execution_count": 388, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[ 2.00659379 2.0037594 ]\n", " [-0.05586229 -0.02372516]]\n", "[0 2]\n" ] }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "visualisation(clusters, Pc, dim=dim, K=K)\n", "print(Pc)\n", "print(mean)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.8.10 64-bit", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" } } }, "nbformat": 4, "nbformat_minor": 2 }