47 lines
1,005 B
Python
47 lines
1,005 B
Python
|
import pandas as pd
|
||
|
import matplotlib.pyplot as plt
|
||
|
|
||
|
# Toy corpus: three short French sentences, one per document.
document_1 = "le chat mange la souris"
document_2 = "le chien regarde le canard"
document_3 = "le canard regarde le chat"

corpus = (document_1, document_2, document_3)

# Build the vocabulary: every distinct word of the corpus, kept in order of
# first appearance. dict.fromkeys de-duplicates while preserving insertion
# order, which avoids a linear `in` scan of a list for every token.
vocabulary = list(
    dict.fromkeys(word for document in corpus for word in document.split(" "))
)
|
||
|
|
||
|
# Simple word-frequency histogram over the whole corpus.

# Start every vocabulary word at zero so the histogram keeps the
# vocabulary's order, then count each occurrence in every document.
freq = dict.fromkeys(vocabulary, 0)
for document in corpus:
    for word in document.split(" "):
        freq[word] += 1

print(freq)
|
||
|
# Show the corpus-wide histogram as a bar chart: one bar per word, with
# horizontal tick labels (rot=0).
words = list(freq)
counts = list(freq.values())
df = pd.DataFrame({'freq': counts}, index=words)
ax = df.plot.bar(rot=0)
plt.show()
|
||
|
|
||
|
# Per-document histogram: one row per document, one column per vocabulary word.
import numpy as np

V = len(vocabulary)
D = len(corpus)

# NOTE: despite its name, this matrix holds raw term counts only; no
# inverse-document-frequency weighting is applied in this code.
tf_idf = np.zeros([D, V])

# Map each word to its column once, instead of a linear vocabulary.index()
# scan for every token of every document.
column_of = {word: col for col, word in enumerate(vocabulary)}

for i, document in enumerate(corpus):
    for word in document.split(" "):
        tf_idf[i, column_of[word]] += 1

print(tf_idf)
|