corr 2 training

higepi 2022-10-07 13:05:56 +02:00
parent c11c8cc2bb
commit 6d392763fc

wordvec_model.ipynb (new file, 192 lines added)

@@ -0,0 +1,192 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.decomposition import IncrementalPCA  # initial reduction\n",
"from sklearn.manifold import TSNE  # final reduction\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from gensim.models import Word2Vec"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"p_1 = [\"le\", \"chat\", \"mange\", \"la\", \"souris\"]\n",
"p_2 = [\"le\", \"chien\", \"regarde\", \"le\", \"canard\"]\n",
"p_3 = [\"le\",\"canard\", \"regarde\", \"le\", \"chat\"]\n",
"corpus = [p_1, p_2, p_3]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(7, 75)"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model = Word2Vec(min_count=1, vector_size=5)\n",
"model.build_vocab(corpus) # prepare the model vocabulary\n",
"model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)"
]
},
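{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Added check (a minimal sketch, not part of the original notebook): inspect\n",
"# the vocabulary that build_vocab() extracted from the toy corpus before\n",
"# looking at individual vectors below.\n",
"print(len(model.wv))          # number of unique tokens in the vocabulary\n",
"print(model.wv.index_to_key)  # the tokens themselves, ordered by frequency"
]
},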
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[-0.03632035 0.0575316 0.01983747 -0.1657043 -0.18897638]\n",
"[-0.06811106 -0.01892909 0.1153778 -0.15044104 -0.07872642]\n",
"[0.14623405 0.10140646 0.13515887 0.01525312 0.12701929]\n",
"[('souris', 0.7668752074241638), ('canard', 0.6178626418113708)]\n",
"[('le', 0.45935946702957153), ('mange', 0.17478135228157043)]\n"
]
}
],
"source": [
"print(model.wv['chat'])\n",
"print(model.wv['souris'])\n",
"print(model.wv['chien'])\n",
"\n",
"print(model.wv.most_similar(positive=['chat'], topn=2))\n",
"print(model.wv.most_similar(positive=['regarde'], topn=2))"
]
},
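{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Added comparison (a hedged sketch, not part of the original notebook): the\n",
"# IncrementalPCA import above is labelled 'initial reduction', so this cell\n",
"# projects the same word vectors to 2-D with plain linear PCA. With only\n",
"# 5-dimensional vectors and a tiny vocabulary, this is already usable on its own.\n",
"word_vectors = np.asarray(model.wv.vectors)\n",
"pca = IncrementalPCA(n_components=2)\n",
"word_vectors_2d = pca.fit_transform(word_vectors)\n",
"print(word_vectors_2d.shape)"
]
},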
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
],
"source": [
"def reduce_dimensions(model):\n",
"    num_dimensions = 2  # final num dimensions (2D, 3D, etc)\n",
"\n",
"    # extract the words & their vectors, as numpy arrays\n",
"    vectors = np.asarray(model.wv.vectors)\n",
"    labels = np.asarray(model.wv.index_to_key)  # fixed-width numpy strings\n",
"\n",
"    # reduce using t-SNE; the perplexity must stay below the number of\n",
"    # samples, and this toy vocabulary only has a handful of words\n",
"    perplexity = min(30, len(vectors) - 1)\n",
"    tsne = TSNE(n_components=num_dimensions, random_state=0, perplexity=perplexity)\n",
"    vectors = tsne.fit_transform(vectors)\n",
"\n",
"    x_vals = [v[0] for v in vectors]\n",
"    y_vals = [v[1] for v in vectors]\n",
"    return x_vals, y_vals, labels\n",
"\n",
"\n",
"x_vals, y_vals, labels = reduce_dimensions(model)\n",
"\n",
"\n",
"def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):\n",
"    from plotly.offline import init_notebook_mode, iplot, plot\n",
"    import plotly.graph_objs as go\n",
"\n",
"    trace = go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)\n",
"    data = [trace]\n",
"\n",
"    if plot_in_notebook:\n",
"        init_notebook_mode(connected=True)\n",
"        iplot(data, filename='word-embedding-plot')\n",
"    else:\n",
"        plot(data, filename='word-embedding-plot.html')\n",
"\n",
"\n",
"def plot_with_matplotlib(x_vals, y_vals, labels):\n",
"    import matplotlib.pyplot as plt\n",
"    import random\n",
"\n",
"    random.seed(0)\n",
"\n",
"    plt.figure(figsize=(12, 12))\n",
"    plt.scatter(x_vals, y_vals)\n",
"\n",
"    # label up to 25 randomly subsampled data points\n",
"    indices = list(range(len(labels)))\n",
"    selected_indices = random.sample(indices, min(25, len(indices)))\n",
"    for i in selected_indices:\n",
"        plt.annotate(labels[i], (x_vals[i], y_vals[i]))\n",
"\n",
"\n",
"# use the interactive plotly backend inside a notebook, matplotlib otherwise\n",
"try:\n",
"    get_ipython()\n",
"except Exception:\n",
"    plot_function = plot_with_matplotlib\n",
"else:\n",
"    plot_function = plot_with_plotly\n",
"\n",
"plot_function(x_vals, y_vals, labels)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.4 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.4"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "2ef431f6525756fa8a44688585fa332ef3b2e5fcfe8fe75df35bbf7028a8b511"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}