w2v article
This commit is contained in:
parent
d900c7e01d
commit
7a2c703abb
4 changed files with 300 additions and 2 deletions
|
@ -1,2 +1,6 @@
|
||||||
TP de Word Embedding pour la revue 3EI.
|
# TP de Word Embedding pour la revue 3EI.
|
||||||
|
## Méthodes de plongement lexical
|
||||||
|
CBOW et Skip-gram
|
||||||
|
|
||||||
|
## Utilisation de Gensim pour Word2Vec
|
||||||
https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#sphx-glr-auto-examples-tutorials-run-word2vec-py
|
https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#sphx-glr-auto-examples-tutorials-run-word2vec-py
|
Binary file not shown.
287
wordvec.ipynb
Normal file
287
wordvec.ipynb
Normal file
|
@ -0,0 +1,287 @@
|
||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import gensim.downloader as api\n",
|
||||||
|
"wv = api.load('word2vec-google-news-300')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Affichage des premiers mots du dictionnaire"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"word #0/3000000 is </s>\n",
|
||||||
|
"word #1/3000000 is in\n",
|
||||||
|
"word #2/3000000 is for\n",
|
||||||
|
"word #3/3000000 is that\n",
|
||||||
|
"word #4/3000000 is is\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Affichage de quelques mots du vocabulaire\n",
|
||||||
|
"for index, word in enumerate(wv.index_to_key):\n",
|
||||||
|
" if index == 5:\n",
|
||||||
|
" break\n",
|
||||||
|
" print(f\"word #{index}/{len(wv.index_to_key)} is {word}\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[ 1.25976562e-01 2.97851562e-02 8.60595703e-03 1.39648438e-01\n",
|
||||||
|
" -2.56347656e-02 -3.61328125e-02 1.11816406e-01 -1.98242188e-01\n",
|
||||||
|
" 5.12695312e-02 3.63281250e-01 -2.42187500e-01 -3.02734375e-01\n",
|
||||||
|
" -1.77734375e-01 -2.49023438e-02 -1.67968750e-01 -1.69921875e-01\n",
|
||||||
|
" 3.46679688e-02 5.21850586e-03 4.63867188e-02 1.28906250e-01\n",
|
||||||
|
" 1.36718750e-01 1.12792969e-01 5.95703125e-02 1.36718750e-01\n",
|
||||||
|
" 1.01074219e-01 -1.76757812e-01 -2.51953125e-01 5.98144531e-02\n",
|
||||||
|
" 3.41796875e-01 -3.11279297e-02 1.04492188e-01 6.17675781e-02\n",
|
||||||
|
" 1.24511719e-01 4.00390625e-01 -3.22265625e-01 8.39843750e-02\n",
|
||||||
|
" 3.90625000e-02 5.85937500e-03 7.03125000e-02 1.72851562e-01\n",
|
||||||
|
" 1.38671875e-01 -2.31445312e-01 2.83203125e-01 1.42578125e-01\n",
|
||||||
|
" 3.41796875e-01 -2.39257812e-02 -1.09863281e-01 3.32031250e-02\n",
|
||||||
|
" -5.46875000e-02 1.53198242e-02 -1.62109375e-01 1.58203125e-01\n",
|
||||||
|
" -2.59765625e-01 2.01416016e-02 -1.63085938e-01 1.35803223e-03\n",
|
||||||
|
" -1.44531250e-01 -5.68847656e-02 4.29687500e-02 -2.46582031e-02\n",
|
||||||
|
" 1.85546875e-01 4.47265625e-01 9.58251953e-03 1.31835938e-01\n",
|
||||||
|
" 9.86328125e-02 -1.85546875e-01 -1.00097656e-01 -1.33789062e-01\n",
|
||||||
|
" -1.25000000e-01 2.83203125e-01 1.23046875e-01 5.32226562e-02\n",
|
||||||
|
" -1.77734375e-01 8.59375000e-02 -2.18505859e-02 2.05078125e-02\n",
|
||||||
|
" -1.39648438e-01 2.51464844e-02 1.38671875e-01 -1.05468750e-01\n",
|
||||||
|
" 1.38671875e-01 8.88671875e-02 -7.51953125e-02 -2.13623047e-02\n",
|
||||||
|
" 1.72851562e-01 4.63867188e-02 -2.65625000e-01 8.91113281e-03\n",
|
||||||
|
" 1.49414062e-01 3.78417969e-02 2.38281250e-01 -1.24511719e-01\n",
|
||||||
|
" -2.17773438e-01 -1.81640625e-01 2.97851562e-02 5.71289062e-02\n",
|
||||||
|
" -2.89306641e-02 1.24511719e-02 9.66796875e-02 -2.31445312e-01\n",
|
||||||
|
" 5.81054688e-02 6.68945312e-02 7.08007812e-02 -3.08593750e-01\n",
|
||||||
|
" -2.14843750e-01 1.45507812e-01 -4.27734375e-01 -9.39941406e-03\n",
|
||||||
|
" 1.54296875e-01 -7.66601562e-02 2.89062500e-01 2.77343750e-01\n",
|
||||||
|
" -4.86373901e-04 -1.36718750e-01 3.24218750e-01 -2.46093750e-01\n",
|
||||||
|
" -3.03649902e-03 -2.11914062e-01 1.25000000e-01 2.69531250e-01\n",
|
||||||
|
" 2.04101562e-01 8.25195312e-02 -2.01171875e-01 -1.60156250e-01\n",
|
||||||
|
" -3.78417969e-02 -1.20117188e-01 1.15234375e-01 -4.10156250e-02\n",
|
||||||
|
" -3.95507812e-02 -8.98437500e-02 6.34765625e-03 2.03125000e-01\n",
|
||||||
|
" 1.86523438e-01 2.73437500e-01 6.29882812e-02 1.41601562e-01\n",
|
||||||
|
" -9.81445312e-02 1.38671875e-01 1.82617188e-01 1.73828125e-01\n",
|
||||||
|
" 1.73828125e-01 -2.37304688e-01 1.78710938e-01 6.34765625e-02\n",
|
||||||
|
" 2.36328125e-01 -2.08984375e-01 8.74023438e-02 -1.66015625e-01\n",
|
||||||
|
" -7.91015625e-02 2.43164062e-01 -8.88671875e-02 1.26953125e-01\n",
|
||||||
|
" -2.16796875e-01 -1.73828125e-01 -3.59375000e-01 -8.25195312e-02\n",
|
||||||
|
" -6.49414062e-02 5.07812500e-02 1.35742188e-01 -7.47070312e-02\n",
|
||||||
|
" -1.64062500e-01 1.15356445e-02 4.45312500e-01 -2.15820312e-01\n",
|
||||||
|
" -1.11328125e-01 -1.92382812e-01 1.70898438e-01 -1.25000000e-01\n",
|
||||||
|
" 2.65502930e-03 1.92382812e-01 -1.74804688e-01 1.39648438e-01\n",
|
||||||
|
" 2.92968750e-01 1.13281250e-01 5.95703125e-02 -6.39648438e-02\n",
|
||||||
|
" 9.96093750e-02 -2.72216797e-02 1.96533203e-02 4.27246094e-02\n",
|
||||||
|
" -2.46093750e-01 6.39648438e-02 -2.25585938e-01 -1.68945312e-01\n",
|
||||||
|
" 2.89916992e-03 8.20312500e-02 3.41796875e-01 4.32128906e-02\n",
|
||||||
|
" 1.32812500e-01 1.42578125e-01 7.61718750e-02 5.98144531e-02\n",
|
||||||
|
" -1.19140625e-01 2.74658203e-03 -6.29882812e-02 -2.72216797e-02\n",
|
||||||
|
" -4.82177734e-03 -8.20312500e-02 -2.49023438e-02 -4.00390625e-01\n",
|
||||||
|
" -1.06933594e-01 4.24804688e-02 7.76367188e-02 -1.16699219e-01\n",
|
||||||
|
" 7.37304688e-02 -9.22851562e-02 1.07910156e-01 1.58203125e-01\n",
|
||||||
|
" 4.24804688e-02 1.26953125e-01 3.61328125e-02 2.67578125e-01\n",
|
||||||
|
" -1.01074219e-01 -3.02734375e-01 -5.76171875e-02 5.05371094e-02\n",
|
||||||
|
" 5.26428223e-04 -2.07031250e-01 -1.38671875e-01 -8.97216797e-03\n",
|
||||||
|
" -2.78320312e-02 -1.41601562e-01 2.07031250e-01 -1.58203125e-01\n",
|
||||||
|
" 1.27929688e-01 1.49414062e-01 -2.24609375e-02 -8.44726562e-02\n",
|
||||||
|
" 1.22558594e-01 2.15820312e-01 -2.13867188e-01 -3.12500000e-01\n",
|
||||||
|
" -3.73046875e-01 4.08935547e-03 1.07421875e-01 1.06933594e-01\n",
|
||||||
|
" 7.32421875e-02 8.97216797e-03 -3.88183594e-02 -1.29882812e-01\n",
|
||||||
|
" 1.49414062e-01 -2.14843750e-01 -1.83868408e-03 9.91210938e-02\n",
|
||||||
|
" 1.57226562e-01 -1.14257812e-01 -2.05078125e-01 9.91210938e-02\n",
|
||||||
|
" 3.69140625e-01 -1.97265625e-01 3.54003906e-02 1.09375000e-01\n",
|
||||||
|
" 1.31835938e-01 1.66992188e-01 2.35351562e-01 1.04980469e-01\n",
|
||||||
|
" -4.96093750e-01 -1.64062500e-01 -1.56250000e-01 -5.22460938e-02\n",
|
||||||
|
" 1.03027344e-01 2.43164062e-01 -1.88476562e-01 5.07812500e-02\n",
|
||||||
|
" -9.37500000e-02 -6.68945312e-02 2.27050781e-02 7.61718750e-02\n",
|
||||||
|
" 2.89062500e-01 3.10546875e-01 -5.37109375e-02 2.28515625e-01\n",
|
||||||
|
" 2.51464844e-02 6.78710938e-02 -1.21093750e-01 -2.15820312e-01\n",
|
||||||
|
" -2.73437500e-01 -3.07617188e-02 -3.37890625e-01 1.53320312e-01\n",
|
||||||
|
" 2.33398438e-01 -2.08007812e-01 3.73046875e-01 8.20312500e-02\n",
|
||||||
|
" 2.51953125e-01 -7.61718750e-02 -4.66308594e-02 -2.23388672e-02\n",
|
||||||
|
" 2.99072266e-02 -5.93261719e-02 -4.66918945e-03 -2.44140625e-01\n",
|
||||||
|
" -2.09960938e-01 -2.87109375e-01 -4.54101562e-02 -1.77734375e-01\n",
|
||||||
|
" -2.79296875e-01 -8.59375000e-02 9.13085938e-02 2.51953125e-01]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Conversion du mot en vecteur\n",
|
||||||
|
"vec_king = wv['king']\n",
|
||||||
|
"print(vec_king)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Similarités entre les mots"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 14,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[('lumber', 0.678311824798584), ('timber', 0.6622092127799988), ('softwoods', 0.649677038192749)]\n",
|
||||||
|
"[('fleur', 0.6820620894432068), ('jardin', 0.6616703271865845), (\"c'est_le\", 0.6514866352081299)]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# N mots les plus proches du mot cible\n",
|
||||||
|
"print(wv.most_similar(positive=['wood'], topn=3))\n",
|
||||||
|
"print(wv.most_similar(positive=['bois'], topn=3))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 22,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"'wood'\t'pine'\tSimilarité : 0.59\n",
|
||||||
|
"'wood'\t'leaf'\tSimilarité : 0.30\n",
|
||||||
|
"'wood'\t'plank'\tSimilarité : 0.26\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Similarité entre des paires de mots\n",
|
||||||
|
"pairs = [\n",
|
||||||
|
" ('wood', 'pine'), \n",
|
||||||
|
" ('wood', 'leaf'),\n",
|
||||||
|
" ('wood', 'plank') \n",
|
||||||
|
"]\n",
|
||||||
|
"for w1, w2 in pairs:\n",
|
||||||
|
" print('%r\\t%r\\tSimilarité : %.2f' % (w1, w2, wv.similarity(w1, w2)))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Le modèle a été entrainé en anglais donc il ne fonctionne pas dans les autres langues"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Exemple d'opération sur les mots"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 25,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[('mother', 0.8671472072601318)]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Opérations sur les mots\n",
|
||||||
|
"vec_father = wv['father']\n",
|
||||||
|
"vec_man = wv['man']\n",
|
||||||
|
"vec_woman = wv['woman']\n",
|
||||||
|
"\n",
|
||||||
|
"result = wv.most_similar(positive=(vec_father - vec_man + vec_woman), topn=1)\n",
|
||||||
|
"print(result)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"On obtient bien le mot \"maman\" "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"On peut également jouer au jeu de l'intru"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 21,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"iron\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(wv.doesnt_match(['wood', 'oak', 'tree', 'iron', 'leaf']))"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3.9.4 64-bit",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.4"
|
||||||
|
},
|
||||||
|
"orig_nbformat": 4,
|
||||||
|
"vscode": {
|
||||||
|
"interpreter": {
|
||||||
|
"hash": "2ef431f6525756fa8a44688585fa332ef3b2e5fcfe8fe75df35bbf7028a8b511"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
|
@ -9,3 +9,10 @@ wv = api.load('word2vec-google-news-300')
|
||||||
|
|
||||||
print(wv.most_similar(positive=['car'], topn=5))
|
print(wv.most_similar(positive=['car'], topn=5))
|
||||||
print(wv.most_similar(positive=['voiture'], topn=5))
|
print(wv.most_similar(positive=['voiture'], topn=5))
|
||||||
|
|
||||||
|
vec_father = wv['father']
|
||||||
|
vec_man = wv['man']
|
||||||
|
vec_woman = wv['woman']
|
||||||
|
|
||||||
|
result = wv.most_similar(positive=(vec_father - vec_man + vec_woman), topn=1)
|
||||||
|
print(result)
|
Loading…
Reference in a new issue