corr 2 training

master
higepi 2 years ago
parent c11c8cc2bb
commit 6d392763fc

@@ -0,0 +1,192 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.decomposition import IncrementalPCA # inital reduction\n",
"from sklearn.manifold import TSNE # final reduction\n",
"import numpy as np "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from gensim.models import Word2Vec"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"p_1 = [\"le\", \"chat\", \"mange\", \"la\", \"souris\"]\n",
"p_2 = [\"le\", \"chien\", \"regarde\", \"le\", \"canard\"]\n",
"p_3 = [\"le\",\"canard\", \"regarde\", \"le\", \"chat\"]\n",
"corpus = [p_1, p_2, p_3]"
]
},
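{
"cell_type": "markdown",
"metadata": {},
"source": [
"Word2Vec expects the corpus as a list of tokenized sentences (a list of lists of strings), which is why each phrase above is already split into words. As a minimal sketch, raw sentences could be turned into the same format with a simple lowercase whitespace split (assumed to be enough for this toy corpus):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# minimal sketch: rebuild the list-of-token-lists format from raw strings\n",
"# (whitespace tokenization is assumed to be enough for this toy corpus)\n",
"raw_sentences = [\"le chat mange la souris\", \"le chien regarde le canard\"]\n",
"tokenized = [s.lower().split() for s in raw_sentences]\n",
"print(tokenized)"
]
},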
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(7, 75)"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model = Word2Vec(min_count=1, vector_size=5)\n",
"model.build_vocab(corpus) # prepare the model vocabulary\n",
"model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)"
]
},
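{
"cell_type": "markdown",
"metadata": {},
"source": [
"After training, the learned vocabulary and vectors live in `model.wv` (a gensim `KeyedVectors` object). As a quick sanity check (a sketch), one can list the vocabulary, look at the shape of the vector matrix, and ask for the cosine similarity between two words:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# sketch: quick sanity check on the trained model\n",
"print(model.wv.index_to_key) # the vocabulary (8 unique words in this corpus)\n",
"print(model.wv.vectors.shape) # expected (8, 5): one 5-dimensional vector per word\n",
"print(model.wv.similarity('chat', 'chien')) # cosine similarity between two words"
]
},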
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[-0.03632035 0.0575316 0.01983747 -0.1657043 -0.18897638]\n",
"[-0.06811106 -0.01892909 0.1153778 -0.15044104 -0.07872642]\n",
"[0.14623405 0.10140646 0.13515887 0.01525312 0.12701929]\n",
"[('souris', 0.7668752074241638), ('canard', 0.6178626418113708)]\n",
"[('le', 0.45935946702957153), ('mange', 0.17478135228157043)]\n"
]
}
],
"source": [
"print(model.wv['chat'])\n",
"print(model.wv['souris'])\n",
"print(model.wv['chien'])\n",
"\n",
"print(model.wv.most_similar(positive=['chat'], topn=2))\n",
"print(model.wv.most_similar(positive=['regarde'], topn=2))"
]
},
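{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before projecting the vectors in 2D, note that t-SNE's `perplexity` parameter must be strictly smaller than the number of samples; with a vocabulary this small, scikit-learn's default (30) raises `ValueError: perplexity must be less than n_samples`. A small sketch of the check, before calling the reduction below with an explicit `perplexity`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# sketch: check how many vectors will be projected and pick a valid perplexity\n",
"n_vectors = len(model.wv)\n",
"safe_perplexity = min(30, n_vectors - 1)\n",
"print(n_vectors, safe_perplexity)"
]
},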
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "perplexity must be less than n_samples",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_19020\\1722442376.py\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 15\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 17\u001b[1;33m \u001b[0mx_vals\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_vals\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlabels\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mreduce_dimensions\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 18\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 19\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mplot_with_plotly\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mx_vals\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_vals\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlabels\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mplot_in_notebook\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_19020\\1722442376.py\u001b[0m in \u001b[0;36mreduce_dimensions\u001b[1;34m(model)\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[1;31m# reduce using t-SNE\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 9\u001b[0m \u001b[0mtsne\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mTSNE\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mn_components\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mnum_dimensions\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mrandom_state\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 10\u001b[1;33m \u001b[0mvectors\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtsne\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit_transform\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvectors\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 11\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[0mx_vals\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mv\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mv\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mvectors\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mc:\\Users\\Sasa\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\sklearn\\manifold\\_t_sne.py\u001b[0m in \u001b[0;36mfit_transform\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 1120\u001b[0m \u001b[0mEmbedding\u001b[0m \u001b[0mof\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mtraining\u001b[0m \u001b[0mdata\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mlow\u001b[0m\u001b[1;33m-\u001b[0m\u001b[0mdimensional\u001b[0m \u001b[0mspace\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1121\u001b[0m \"\"\"\n\u001b[1;32m-> 1122\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_check_params_vs_input\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1123\u001b[0m \u001b[0membedding\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_fit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1124\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0membedding_\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0membedding\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mc:\\Users\\Sasa\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\sklearn\\manifold\\_t_sne.py\u001b[0m in \u001b[0;36m_check_params_vs_input\u001b[1;34m(self, X)\u001b[0m\n\u001b[0;32m 791\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_check_params_vs_input\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mX\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 792\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mperplexity\u001b[0m \u001b[1;33m>=\u001b[0m \u001b[0mX\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 793\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"perplexity must be less than n_samples\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 794\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 795\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_fit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mskip_num_points\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mValueError\u001b[0m: perplexity must be less than n_samples"
]
}
],
"source": [
"\n",
"def reduce_dimensions(model):\n",
" num_dimensions = 2 # final num dimensions (2D, 3D, etc)\n",
"\n",
" # extract the words & their vectors, as numpy arrays\n",
" vectors = np.asarray(model.wv.vectors)\n",
" labels = np.asarray(model.wv.index_to_key) # fixed-width numpy strings\n",
"\n",
" # reduce using t-SNE\n",
" tsne = TSNE(n_components=num_dimensions, random_state=0)\n",
" vectors = tsne.fit_transform(vectors)\n",
"\n",
" x_vals = [v[0] for v in vectors]\n",
" y_vals = [v[1] for v in vectors]\n",
" return x_vals, y_vals, labels\n",
"\n",
"\n",
"x_vals, y_vals, labels = reduce_dimensions(model)\n",
"\n",
"def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):\n",
" from plotly.offline import init_notebook_mode, iplot, plot\n",
" import plotly.graph_objs as go\n",
"\n",
" trace = go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)\n",
" data = [trace]\n",
"\n",
" if plot_in_notebook:\n",
" init_notebook_mode(connected=True)\n",
" iplot(data, filename='word-embedding-plot')\n",
" else:\n",
" plot(data, filename='word-embedding-plot.html')\n",
"\n",
"\n",
"def plot_with_matplotlib(x_vals, y_vals, labels):\n",
" import matplotlib.pyplot as plt\n",
" import random\n",
"\n",
" random.seed(0)\n",
"\n",
" plt.figure(figsize=(12, 12))\n",
" plt.scatter(x_vals, y_vals)\n",
"\n",
" #\n",
" # Label randomly subsampled 25 data points\n",
" #\n",
" indices = list(range(len(labels)))\n",
" selected_indices = random.sample(indices, 25)\n",
" for i in selected_indices:\n",
" plt.annotate(labels[i], (x_vals[i], y_vals[i]))\n",
"\n",
"try:\n",
" get_ipython()\n",
"except Exception:\n",
" plot_function = plot_with_matplotlib\n",
"else:\n",
" plot_function = plot_with_plotly\n",
"\n",
"plot_function(x_vals, y_vals, labels)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.4 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.4"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "2ef431f6525756fa8a44688585fa332ef3b2e5fcfe8fe75df35bbf7028a8b511"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}