# TP_WordEmbedding/Exo.py

import pandas as pd
import matplotlib.pyplot as plt

# Toy corpus: three short French sentences, one string per document.
document_1 = "le chat mange la souris"
document_2 = "le chien regarde le canard"
document_3 = "le canard regarde le chat"

corpus = (document_1, document_2, document_3)

# Build the vocabulary: every distinct word, in order of first appearance.
# dict.fromkeys de-duplicates while keeping insertion order, which avoids
# re-scanning a growing list with `in` for every single token.
vocabulary = list(
    dict.fromkeys(word for doc in corpus for word in doc.split(" "))
)

# Corpus-wide histogram: number of occurrences of each vocabulary word.

# Start every vocabulary word at zero; keys follow the order of `vocabulary`.
freq = dict.fromkeys(vocabulary, 0)

# Tally every token of every document.
for doc in corpus:
    for word in doc.split(" "):
        freq[word] = freq[word] + 1

print(freq)

# One bar per word, with the x tick labels kept horizontal (rot=0).
df = pd.DataFrame({'freq': freq.values()}, index=freq.keys())
ax = df.plot.bar(rot=0)
plt.show()

# Per-document histogram: one row per document, one column per vocabulary word.
import numpy as np

V = len(vocabulary)
D = len(corpus)

# NOTE: despite its name, this matrix only holds raw term counts here; no
# inverse-document-frequency weighting is applied in this section.
tf_idf = np.zeros([D, V])

# Resolve each word's column once up front instead of calling
# vocabulary.index(w) (a linear search) for every token.
# Vocabulary entries are unique, so this matches list.index exactly.
column_of = {word: col for col, word in enumerate(vocabulary)}

for i, d in enumerate(corpus):
    for w in d.split(" "):
        j = column_of[w]
        tf_idf[i, j] += 1

print(tf_idf)