{ "cells": [ { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from sklearn.decomposition import IncrementalPCA # inital reduction\n", "from sklearn.manifold import TSNE # final reduction\n", "import numpy as np " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from gensim.models import Word2Vec" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "p_1 = [\"le\", \"chat\", \"mange\", \"la\", \"souris\"]\n", "p_2 = [\"le\", \"chien\", \"regarde\", \"le\", \"canard\"]\n", "p_3 = [\"le\",\"canard\", \"regarde\", \"le\", \"chat\"]\n", "corpus = [p_1, p_2, p_3]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(7, 75)" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model = Word2Vec(min_count=1, vector_size=5)\n", "model.build_vocab(corpus) # prepare the model vocabulary\n", "model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[-0.03632035 0.0575316 0.01983747 -0.1657043 -0.18897638]\n", "[-0.06811106 -0.01892909 0.1153778 -0.15044104 -0.07872642]\n", "[0.14623405 0.10140646 0.13515887 0.01525312 0.12701929]\n", "[('souris', 0.7668752074241638), ('canard', 0.6178626418113708)]\n", "[('le', 0.45935946702957153), ('mange', 0.17478135228157043)]\n" ] } ], "source": [ "print(model.wv['chat'])\n", "print(model.wv['souris'])\n", "print(model.wv['chien'])\n", "\n", "print(model.wv.most_similar(positive=['chat'], topn=2))\n", "print(model.wv.most_similar(positive=['regarde'], topn=2))" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "ename": "ValueError", "evalue": "perplexity must be less than n_samples", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_19020\\1722442376.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 15\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 17\u001b[1;33m \u001b[0mx_vals\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_vals\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlabels\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mreduce_dimensions\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 18\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 19\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mplot_with_plotly\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mx_vals\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_vals\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlabels\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mplot_in_notebook\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_19020\\1722442376.py\u001b[0m in \u001b[0;36mreduce_dimensions\u001b[1;34m(model)\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[1;31m# reduce using t-SNE\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 9\u001b[0m \u001b[0mtsne\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mTSNE\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mn_components\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mnum_dimensions\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mrandom_state\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 10\u001b[1;33m \u001b[0mvectors\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtsne\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit_transform\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvectors\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 11\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[0mx_vals\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mv\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mv\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mvectors\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32mc:\\Users\\Sasa\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\sklearn\\manifold\\_t_sne.py\u001b[0m in \u001b[0;36mfit_transform\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 1120\u001b[0m \u001b[0mEmbedding\u001b[0m \u001b[0mof\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mtraining\u001b[0m \u001b[0mdata\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mlow\u001b[0m\u001b[1;33m-\u001b[0m\u001b[0mdimensional\u001b[0m \u001b[0mspace\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1121\u001b[0m \"\"\"\n\u001b[1;32m-> 1122\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_check_params_vs_input\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1123\u001b[0m \u001b[0membedding\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_fit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1124\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0membedding_\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0membedding\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32mc:\\Users\\Sasa\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\sklearn\\manifold\\_t_sne.py\u001b[0m in \u001b[0;36m_check_params_vs_input\u001b[1;34m(self, X)\u001b[0m\n\u001b[0;32m 791\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_check_params_vs_input\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mX\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 792\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mperplexity\u001b[0m \u001b[1;33m>=\u001b[0m \u001b[0mX\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 793\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"perplexity must be less than n_samples\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 794\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 795\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_fit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mskip_num_points\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;31mValueError\u001b[0m: perplexity must be less than n_samples" ] } ], "source": [ "\n", "def reduce_dimensions(model):\n", " num_dimensions = 2 # final num dimensions (2D, 3D, etc)\n", "\n", " # extract the words & their vectors, as numpy arrays\n", " vectors = np.asarray(model.wv.vectors)\n", " labels = np.asarray(model.wv.index_to_key) # fixed-width numpy strings\n", "\n", " # reduce using t-SNE\n", " tsne = TSNE(n_components=num_dimensions, random_state=0)\n", " vectors = tsne.fit_transform(vectors)\n", "\n", " x_vals = [v[0] for v in vectors]\n", " y_vals = [v[1] for v in vectors]\n", " return x_vals, y_vals, labels\n", "\n", "\n", "x_vals, y_vals, labels = reduce_dimensions(model)\n", "\n", "def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):\n", " from plotly.offline import init_notebook_mode, iplot, plot\n", " import plotly.graph_objs as go\n", "\n", " trace = go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)\n", " data = [trace]\n", "\n", " if plot_in_notebook:\n", " init_notebook_mode(connected=True)\n", " iplot(data, filename='word-embedding-plot')\n", " else:\n", " plot(data, filename='word-embedding-plot.html')\n", "\n", "\n", "def plot_with_matplotlib(x_vals, y_vals, labels):\n", " import matplotlib.pyplot as plt\n", " import random\n", "\n", " random.seed(0)\n", "\n", " plt.figure(figsize=(12, 12))\n", " plt.scatter(x_vals, y_vals)\n", "\n", " #\n", " # Label randomly subsampled 25 data points\n", " #\n", " indices = list(range(len(labels)))\n", " selected_indices = random.sample(indices, 25)\n", " for i in selected_indices:\n", " plt.annotate(labels[i], (x_vals[i], y_vals[i]))\n", "\n", "try:\n", " get_ipython()\n", "except Exception:\n", " plot_function = plot_with_matplotlib\n", "else:\n", " plot_function = plot_with_plotly\n", "\n", "plot_function(x_vals, y_vals, labels)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.9.4 64-bit", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.4" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "2ef431f6525756fa8a44688585fa332ef3b2e5fcfe8fe75df35bbf7028a8b511" } } }, "nbformat": 4, "nbformat_minor": 2 }