"""Train a tiny Word2Vec model on a toy French corpus and plot its embeddings.

The script:

1. trains a 5-dimensional Word2Vec model on three short tokenised sentences,
2. prints a few word vectors and nearest neighbours,
3. projects every word vector to 2-D with t-SNE, and
4. draws the projection with plotly (inside IPython/Jupyter) or matplotlib
   (plain Python).
"""

import numpy as np
from gensim.models import Word2Vec
from sklearn.decomposition import IncrementalPCA  # initial reduction (not used below)
from sklearn.manifold import TSNE  # final reduction


def reduce_dimensions(model, num_dimensions=2, perplexity=30.0):
    """Project the model's word vectors to a low-dimensional space with t-SNE.

    Args:
        model: Trained model exposing ``model.wv.vectors`` (one row per word)
            and ``model.wv.index_to_key`` (the matching words).
        num_dimensions: Number of t-SNE output components. Only the first two
            components are returned, so this must be at least 2.
        perplexity: Requested t-SNE perplexity. scikit-learn requires the
            perplexity to be strictly smaller than the number of samples, so
            the value actually used is capped at ``n_samples - 1``.

    Returns:
        A tuple ``(x_vals, y_vals, labels)``: two lists with the first and
        second embedded coordinate of every word, and a numpy array holding
        the words in the same order.

    Raises:
        ValueError: If the model holds fewer than two word vectors, in which
            case no valid perplexity exists.
    """
    # extract the words & their vectors, as numpy arrays
    vectors = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index_to_key)  # fixed-width numpy strings

    n_samples = vectors.shape[0]
    if n_samples < 2:
        raise ValueError(
            f"t-SNE needs at least 2 word vectors, got {n_samples}"
        )

    # A toy vocabulary is far smaller than the usual perplexity of 30; without
    # this cap TSNE raises "perplexity must be less than n_samples".
    effective_perplexity = min(perplexity, n_samples - 1)

    tsne = TSNE(
        n_components=num_dimensions,
        perplexity=effective_perplexity,
        random_state=0,  # fixed seed so the layout is reproducible
    )
    embedded = tsne.fit_transform(vectors)

    x_vals = embedded[:, 0].tolist()
    y_vals = embedded[:, 1].tolist()
    return x_vals, y_vals, labels


def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    """Draw the words as a plotly text scatter.

    Args:
        x_vals: X coordinate of every word.
        y_vals: Y coordinate of every word.
        labels: Words to draw, in the same order as the coordinates.
        plot_in_notebook: When true, render inline via ``iplot``; otherwise
            write the figure to ``word-embedding-plot.html`` via ``plot``.
    """
    # Imported lazily so plotly is only required when this backend is chosen.
    from plotly.offline import init_notebook_mode, iplot, plot
    import plotly.graph_objs as go

    data = [go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)]

    if plot_in_notebook:
        init_notebook_mode(connected=True)
        iplot(data, filename='word-embedding-plot')
    else:
        plot(data, filename='word-embedding-plot.html')


def plot_with_matplotlib(x_vals, y_vals, labels):
    """Draw the words as a matplotlib scatter and annotate a random subset.

    At most 25 points are labelled; when there are fewer than 25 words every
    point is labelled.

    Args:
        x_vals: X coordinate of every word.
        y_vals: Y coordinate of every word.
        labels: Words, in the same order as the coordinates.
    """
    # Imported lazily so matplotlib is only required when this backend is chosen.
    import random

    import matplotlib.pyplot as plt

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    # Label a randomly subsampled set of at most 25 data points. A private,
    # seeded generator keeps the choice reproducible without reseeding the
    # global ``random`` module; the cap avoids ``random.sample`` raising when
    # the vocabulary holds fewer than 25 words.
    rng = random.Random(0)
    sample_size = min(25, len(labels))
    for i in rng.sample(range(len(labels)), sample_size):
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

    # Outside a notebook nothing is displayed until show() is called.
    plt.show()


# Three tokenised toy sentences (eight distinct tokens in total).
p_1 = ["le", "chat", "mange", "la", "souris"]
p_2 = ["le", "chien", "regarde", "le", "canard"]
p_3 = ["le", "canard", "regarde", "le", "chat"]
corpus = [p_1, p_2, p_3]

# min_count=1 keeps every token, even those that occur only once.
model = Word2Vec(min_count=1, vector_size=5)
model.build_vocab(corpus)  # prepare the model vocabulary
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

print(model.wv['chat'])
print(model.wv['souris'])
print(model.wv['chien'])

print(model.wv.most_similar(positive=['chat'], topn=2))
print(model.wv.most_similar(positive=['regarde'], topn=2))

x_vals, y_vals, labels = reduce_dimensions(model)

# ``get_ipython`` only exists as a builtin inside IPython/Jupyter; the
# NameError raised elsewhere selects the matplotlib fallback.
try:
    get_ipython()  # noqa: F821
except NameError:
    plot_function = plot_with_matplotlib
else:
    plot_function = plot_with_plotly

plot_function(x_vals, y_vals, labels)