"""Bag-of-words demo on a tiny French corpus.

Builds the vocabulary, counts word occurrences over the whole corpus
(shown as a bar chart), then builds a per-document term-count matrix.
"""

from typing import Dict, Iterable, List, Sequence

import numpy as np
import pandas as pd

document_1 = "le chat mange la souris"
document_2 = "le chien regarde le canard"
document_3 = "le canard regarde le chat"
corpus = (document_1, document_2, document_3)


def tokenize(document: str) -> List[str]:
    """Split *document* into words on runs of whitespace.

    ``str.split()`` without an argument is used on purpose: splitting on a
    single ``" "`` would yield empty-string "words" for repeated, leading
    or trailing spaces.
    """
    return document.split()


def build_vocabulary(documents: Iterable[str]) -> List[str]:
    """Return the distinct words of *documents* in first-occurrence order."""
    # dict keys are unique and keep insertion order, so this deduplicates
    # without a linear membership scan for every word.
    return list(dict.fromkeys(w for d in documents for w in tokenize(d)))


def count_frequencies(
    documents: Iterable[str], vocabulary: Iterable[str]
) -> Dict[str, int]:
    """Count occurrences of each vocabulary word over all *documents*.

    The returned dict has one key per vocabulary word, in vocabulary order,
    including words that never occur (count 0).

    Raises:
        KeyError: if a document contains a word missing from *vocabulary*.
    """
    # Initialise the histogram, then count the occurrences.
    freq = dict.fromkeys(vocabulary, 0)
    for document in documents:
        for word in tokenize(document):
            freq[word] += 1
    return freq


def term_count_matrix(
    documents: Sequence[str], vocabulary: Sequence[str]
) -> np.ndarray:
    """Return a float matrix of raw term counts, one row per document.

    Entry ``[i, j]`` is the number of times ``vocabulary[j]`` occurs in
    ``documents[i]``. These are plain counts: no IDF weighting is applied.

    Raises:
        KeyError: if a document contains a word missing from *vocabulary*.
    """
    # Map each word to its column once instead of calling list.index() for
    # every token; setdefault keeps the first position, like list.index().
    column_of = {}
    for j, word in enumerate(vocabulary):
        column_of.setdefault(word, j)

    counts = np.zeros([len(documents), len(vocabulary)])
    for i, document in enumerate(documents):
        for word in tokenize(document):
            counts[i, column_of[word]] += 1
    return counts


def plot_frequencies(freq: Dict[str, int]) -> None:
    """Display a bar chart of the word frequencies in *freq*."""
    # Imported here so the counting helpers above can be imported and used
    # on machines without a plotting backend.
    import matplotlib.pyplot as plt

    df = pd.DataFrame({'freq': list(freq.values())}, index=list(freq.keys()))
    df.plot.bar(rot=0)
    plt.show()


def main() -> None:
    """Run the demo: print and plot corpus counts, then print the matrix."""
    # Build the dictionary (vocabulary).
    vocabulary = build_vocabulary(corpus)

    # Simple histogram over the whole corpus.
    freq = count_frequencies(corpus, vocabulary)
    print(freq)
    plot_frequencies(freq)

    # One histogram per document.
    print(term_count_matrix(corpus, vocabulary))


if __name__ == "__main__":
    main()