TP_WordEmbedding/Exo.py
2022-09-22 17:41:54 +02:00

47 lines
No EOL
1,005 B
Python

import pandas as pd
import matplotlib.pyplot as plt
# Toy corpus: three short French sentences, one document each.
document_1 = "le chat mange la souris"
document_2 = "le chien regarde le canard"
document_3 = "le canard regarde le chat"
corpus = (document_1, document_2, document_3)

# Build the vocabulary: every distinct word of the corpus, listed in order of
# first appearance. Documents are tokenised by splitting on a single space.
# dict.fromkeys() drops duplicates while keeping insertion order, which avoids
# re-scanning a growing list for every word.
vocabulary = list(
    dict.fromkeys(word for document in corpus for word in document.split(" "))
)
# Simple histogram over the whole corpus: how many times each word occurs.
# Every vocabulary word starts at zero, so the dict keeps the vocabulary order.
freq = dict.fromkeys(vocabulary, 0)
# Count the occurrences, tokenising each document on single spaces.
for d in corpus:
    for w in d.split(" "):
        freq[w] += 1
print(freq)
# Bar chart of the corpus-wide counts: one row (bar) per word, a single
# 'freq' column, x-axis labels left unrotated.
df = pd.DataFrame(data={'freq': list(freq.values())}, index=list(freq))
ax = df.plot(kind='bar', rot=0)
plt.show()
# Per-document histogram: one row per document, one column per vocabulary word.
import numpy as np

V = len(vocabulary)
D = len(corpus)
# NOTE: despite its name, `tf_idf` holds raw term counts at this point; no
# IDF weighting is applied in this section.
tf_idf = np.zeros([D, V])
# Resolve each word's column once instead of calling vocabulary.index() (a
# linear search) for every token.
word_to_column = {word: col for col, word in enumerate(vocabulary)}
for i, d in enumerate(corpus):
    for w in d.split(" "):
        j = word_to_column[w]
        tf_idf[i, j] += 1
print(tf_idf)