w2v article

This commit is contained in:
higepi 2022-09-26 13:27:41 +02:00
parent d900c7e01d
commit 7a2c703abb
4 changed files with 300 additions and 2 deletions

View file

@ -1,2 +1,6 @@
TP de Word Embedding pour la revue 3EI.
# TP de Word Embedding pour la revue 3EI.
## Méthodes de plongement lexical
CBOW et Skip-gram
## Utilisation de Gensim pour Word2Vec
https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#sphx-glr-auto-examples-tutorials-run-word2vec-py

Binary file not shown.

287
wordvec.ipynb Normal file
View file

@ -0,0 +1,287 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import gensim.downloader as api\n",
"wv = api.load('word2vec-google-news-300')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Affichage des premiers mots du dictionnaire"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"word #0/3000000 is </s>\n",
"word #1/3000000 is in\n",
"word #2/3000000 is for\n",
"word #3/3000000 is that\n",
"word #4/3000000 is is\n"
]
}
],
"source": [
"# Affichage de quelques mots du vocabulaire\n",
"for index, word in enumerate(wv.index_to_key):\n",
" if index == 5:\n",
" break\n",
" print(f\"word #{index}/{len(wv.index_to_key)} is {word}\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ 1.25976562e-01 2.97851562e-02 8.60595703e-03 1.39648438e-01\n",
" -2.56347656e-02 -3.61328125e-02 1.11816406e-01 -1.98242188e-01\n",
" 5.12695312e-02 3.63281250e-01 -2.42187500e-01 -3.02734375e-01\n",
" -1.77734375e-01 -2.49023438e-02 -1.67968750e-01 -1.69921875e-01\n",
" 3.46679688e-02 5.21850586e-03 4.63867188e-02 1.28906250e-01\n",
" 1.36718750e-01 1.12792969e-01 5.95703125e-02 1.36718750e-01\n",
" 1.01074219e-01 -1.76757812e-01 -2.51953125e-01 5.98144531e-02\n",
" 3.41796875e-01 -3.11279297e-02 1.04492188e-01 6.17675781e-02\n",
" 1.24511719e-01 4.00390625e-01 -3.22265625e-01 8.39843750e-02\n",
" 3.90625000e-02 5.85937500e-03 7.03125000e-02 1.72851562e-01\n",
" 1.38671875e-01 -2.31445312e-01 2.83203125e-01 1.42578125e-01\n",
" 3.41796875e-01 -2.39257812e-02 -1.09863281e-01 3.32031250e-02\n",
" -5.46875000e-02 1.53198242e-02 -1.62109375e-01 1.58203125e-01\n",
" -2.59765625e-01 2.01416016e-02 -1.63085938e-01 1.35803223e-03\n",
" -1.44531250e-01 -5.68847656e-02 4.29687500e-02 -2.46582031e-02\n",
" 1.85546875e-01 4.47265625e-01 9.58251953e-03 1.31835938e-01\n",
" 9.86328125e-02 -1.85546875e-01 -1.00097656e-01 -1.33789062e-01\n",
" -1.25000000e-01 2.83203125e-01 1.23046875e-01 5.32226562e-02\n",
" -1.77734375e-01 8.59375000e-02 -2.18505859e-02 2.05078125e-02\n",
" -1.39648438e-01 2.51464844e-02 1.38671875e-01 -1.05468750e-01\n",
" 1.38671875e-01 8.88671875e-02 -7.51953125e-02 -2.13623047e-02\n",
" 1.72851562e-01 4.63867188e-02 -2.65625000e-01 8.91113281e-03\n",
" 1.49414062e-01 3.78417969e-02 2.38281250e-01 -1.24511719e-01\n",
" -2.17773438e-01 -1.81640625e-01 2.97851562e-02 5.71289062e-02\n",
" -2.89306641e-02 1.24511719e-02 9.66796875e-02 -2.31445312e-01\n",
" 5.81054688e-02 6.68945312e-02 7.08007812e-02 -3.08593750e-01\n",
" -2.14843750e-01 1.45507812e-01 -4.27734375e-01 -9.39941406e-03\n",
" 1.54296875e-01 -7.66601562e-02 2.89062500e-01 2.77343750e-01\n",
" -4.86373901e-04 -1.36718750e-01 3.24218750e-01 -2.46093750e-01\n",
" -3.03649902e-03 -2.11914062e-01 1.25000000e-01 2.69531250e-01\n",
" 2.04101562e-01 8.25195312e-02 -2.01171875e-01 -1.60156250e-01\n",
" -3.78417969e-02 -1.20117188e-01 1.15234375e-01 -4.10156250e-02\n",
" -3.95507812e-02 -8.98437500e-02 6.34765625e-03 2.03125000e-01\n",
" 1.86523438e-01 2.73437500e-01 6.29882812e-02 1.41601562e-01\n",
" -9.81445312e-02 1.38671875e-01 1.82617188e-01 1.73828125e-01\n",
" 1.73828125e-01 -2.37304688e-01 1.78710938e-01 6.34765625e-02\n",
" 2.36328125e-01 -2.08984375e-01 8.74023438e-02 -1.66015625e-01\n",
" -7.91015625e-02 2.43164062e-01 -8.88671875e-02 1.26953125e-01\n",
" -2.16796875e-01 -1.73828125e-01 -3.59375000e-01 -8.25195312e-02\n",
" -6.49414062e-02 5.07812500e-02 1.35742188e-01 -7.47070312e-02\n",
" -1.64062500e-01 1.15356445e-02 4.45312500e-01 -2.15820312e-01\n",
" -1.11328125e-01 -1.92382812e-01 1.70898438e-01 -1.25000000e-01\n",
" 2.65502930e-03 1.92382812e-01 -1.74804688e-01 1.39648438e-01\n",
" 2.92968750e-01 1.13281250e-01 5.95703125e-02 -6.39648438e-02\n",
" 9.96093750e-02 -2.72216797e-02 1.96533203e-02 4.27246094e-02\n",
" -2.46093750e-01 6.39648438e-02 -2.25585938e-01 -1.68945312e-01\n",
" 2.89916992e-03 8.20312500e-02 3.41796875e-01 4.32128906e-02\n",
" 1.32812500e-01 1.42578125e-01 7.61718750e-02 5.98144531e-02\n",
" -1.19140625e-01 2.74658203e-03 -6.29882812e-02 -2.72216797e-02\n",
" -4.82177734e-03 -8.20312500e-02 -2.49023438e-02 -4.00390625e-01\n",
" -1.06933594e-01 4.24804688e-02 7.76367188e-02 -1.16699219e-01\n",
" 7.37304688e-02 -9.22851562e-02 1.07910156e-01 1.58203125e-01\n",
" 4.24804688e-02 1.26953125e-01 3.61328125e-02 2.67578125e-01\n",
" -1.01074219e-01 -3.02734375e-01 -5.76171875e-02 5.05371094e-02\n",
" 5.26428223e-04 -2.07031250e-01 -1.38671875e-01 -8.97216797e-03\n",
" -2.78320312e-02 -1.41601562e-01 2.07031250e-01 -1.58203125e-01\n",
" 1.27929688e-01 1.49414062e-01 -2.24609375e-02 -8.44726562e-02\n",
" 1.22558594e-01 2.15820312e-01 -2.13867188e-01 -3.12500000e-01\n",
" -3.73046875e-01 4.08935547e-03 1.07421875e-01 1.06933594e-01\n",
" 7.32421875e-02 8.97216797e-03 -3.88183594e-02 -1.29882812e-01\n",
" 1.49414062e-01 -2.14843750e-01 -1.83868408e-03 9.91210938e-02\n",
" 1.57226562e-01 -1.14257812e-01 -2.05078125e-01 9.91210938e-02\n",
" 3.69140625e-01 -1.97265625e-01 3.54003906e-02 1.09375000e-01\n",
" 1.31835938e-01 1.66992188e-01 2.35351562e-01 1.04980469e-01\n",
" -4.96093750e-01 -1.64062500e-01 -1.56250000e-01 -5.22460938e-02\n",
" 1.03027344e-01 2.43164062e-01 -1.88476562e-01 5.07812500e-02\n",
" -9.37500000e-02 -6.68945312e-02 2.27050781e-02 7.61718750e-02\n",
" 2.89062500e-01 3.10546875e-01 -5.37109375e-02 2.28515625e-01\n",
" 2.51464844e-02 6.78710938e-02 -1.21093750e-01 -2.15820312e-01\n",
" -2.73437500e-01 -3.07617188e-02 -3.37890625e-01 1.53320312e-01\n",
" 2.33398438e-01 -2.08007812e-01 3.73046875e-01 8.20312500e-02\n",
" 2.51953125e-01 -7.61718750e-02 -4.66308594e-02 -2.23388672e-02\n",
" 2.99072266e-02 -5.93261719e-02 -4.66918945e-03 -2.44140625e-01\n",
" -2.09960938e-01 -2.87109375e-01 -4.54101562e-02 -1.77734375e-01\n",
" -2.79296875e-01 -8.59375000e-02 9.13085938e-02 2.51953125e-01]\n"
]
}
],
"source": [
"# Conversion du mot en vecteur\n",
"vec_king = wv['king']\n",
"print(vec_king)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Similarités entre les mots"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[('lumber', 0.678311824798584), ('timber', 0.6622092127799988), ('softwoods', 0.649677038192749)]\n",
"[('fleur', 0.6820620894432068), ('jardin', 0.6616703271865845), (\"c'est_le\", 0.6514866352081299)]\n"
]
}
],
"source": [
"# N mots les plus proches du mot cible\n",
"print(wv.most_similar(positive=['wood'], topn=3))\n",
"print(wv.most_similar(positive=['bois'], topn=3))"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"'wood'\t'pine'\tSimilarité : 0.59\n",
"'wood'\t'leaf'\tSimilarité : 0.30\n",
"'wood'\t'plank'\tSimilarité : 0.26\n"
]
}
],
"source": [
"# Similarité entre des paires de mots\n",
"pairs = [\n",
" ('wood', 'pine'), \n",
" ('wood', 'leaf'),\n",
" ('wood', 'plank') \n",
"]\n",
"for w1, w2 in pairs:\n",
" print('%r\\t%r\\tSimilarité : %.2f' % (w1, w2, wv.similarity(w1, w2)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Le modèle a été entraîné sur un corpus anglais : il ne donne donc pas de résultats pertinents pour les mots des autres langues (voir le résultat obtenu pour « bois » ci-dessus)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Exemple d'opération sur les mots"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[('mother', 0.8671472072601318)]\n"
]
}
],
"source": [
"# Opérations sur les mots\n",
"vec_father = wv['father']\n",
"vec_man = wv['man']\n",
"vec_woman = wv['woman']\n",
"\n",
"result = wv.most_similar(positive=(vec_father - vec_man + vec_woman), topn=1)\n",
"print(result)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"On obtient bien le mot \"mother\" (« mère »)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"On peut également jouer au jeu de l'intrus"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"iron\n"
]
}
],
"source": [
"print(wv.doesnt_match(['wood', 'oak', 'tree', 'iron', 'leaf']))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.4 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.4"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "2ef431f6525756fa8a44688585fa332ef3b2e5fcfe8fe75df35bbf7028a8b511"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View file

@ -9,3 +9,10 @@ wv = api.load('word2vec-google-news-300')
# Print the 5 nearest neighbours of an English word and then of its French
# counterpart, so the two result lists can be compared side by side.
for query_word in ('car', 'voiture'):
    print(wv.most_similar(positive=[query_word], topn=5))

# Word-vector arithmetic: father - man + woman.
# The three lookups are kept under their own names at module level.
vec_father, vec_man, vec_woman = (wv[word] for word in ('father', 'man', 'woman'))
analogy_vector = vec_father - vec_man + vec_woman

# The combined raw vector (not a vocabulary key) is passed as the query;
# only the single best match is requested.
result = wv.most_similar(positive=analogy_vector, topn=1)
print(result)