In [3]:
from sklearn.decomposition import IncrementalPCA    # inital reduction
from sklearn.manifold import TSNE                   # final reduction
import numpy as np   

In [None]:
from gensim.models import Word2Vec

In [4]:
p_1 = ["le", "chat", "mange", "la", "souris"]
p_2 = ["le", "chien", "regarde", "le", "canard"]
p_3 = ["le","canard", "regarde", "le", "chat"]
corpus = [p_1, p_2, p_3]

In [12]:
model = Word2Vec(min_count=1, vector_size=5)
model.build_vocab(corpus)  # prepare the model vocabulary
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

(7, 75)

In [15]:
print(model.wv['chat'])
print(model.wv['souris'])
print(model.wv['chien'])

print(model.wv.most_similar(positive=['chat'], topn=2))
print(model.wv.most_similar(positive=['regarde'], topn=2))

[-0.03632035  0.0575316   0.01983747 -0.1657043  -0.18897638]
[-0.06811106 -0.01892909  0.1153778  -0.15044104 -0.07872642]
[0.14623405 0.10140646 0.13515887 0.01525312 0.12701929]
[('souris', 0.7668752074241638), ('canard', 0.6178626418113708)]
[('le', 0.45935946702957153), ('mange', 0.17478135228157043)]


In [16]:

def reduce_dimensions(model):
    num_dimensions = 2  # final num dimensions (2D, 3D, etc)

    # extract the words & their vectors, as numpy arrays
    vectors = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index_to_key)  # fixed-width numpy strings

    # reduce using t-SNE
    tsne = TSNE(n_components=num_dimensions, random_state=0)
    vectors = tsne.fit_transform(vectors)

    x_vals = [v[0] for v in vectors]
    y_vals = [v[1] for v in vectors]
    return x_vals, y_vals, labels


x_vals, y_vals, labels = reduce_dimensions(model)

def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    from plotly.offline import init_notebook_mode, iplot, plot
    import plotly.graph_objs as go

    trace = go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)
    data = [trace]

    if plot_in_notebook:
        init_notebook_mode(connected=True)
        iplot(data, filename='word-embedding-plot')
    else:
        plot(data, filename='word-embedding-plot.html')


def plot_with_matplotlib(x_vals, y_vals, labels):
    import matplotlib.pyplot as plt
    import random

    random.seed(0)

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    #
    # Label randomly subsampled 25 data points
    #
    indices = list(range(len(labels)))
    selected_indices = random.sample(indices, 25)
    for i in selected_indices:
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

try:
    get_ipython()
except Exception:
    plot_function = plot_with_matplotlib
else:
    plot_function = plot_with_plotly

plot_function(x_vals, y_vals, labels)

ValueError: perplexity must be less than n_samples