master
parent
a207c71687
commit
b452b1b253
@ -0,0 +1,47 @@
|
||||
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Toy corpus: three short French sentences, one document each.
document_1 = "le chat mange la souris"
document_2 = "le chien regarde le canard"
document_3 = "le canard regarde le chat"

corpus = (document_1, document_2, document_3)

# Build the vocabulary: every distinct word of the corpus, listed in order of
# first appearance.  dict.fromkeys() de-duplicates while preserving insertion
# order, and split() without an argument collapses runs of whitespace so a
# stray double space never yields an empty-string "word".
vocabulary = list(dict.fromkeys(w for d in corpus for w in d.split()))
# Simple histogram over the whole corpus: occurrences of each word.

# Start every vocabulary word at zero so the dict follows the vocabulary order.
freq = dict.fromkeys(vocabulary, 0)

# Count the occurrences, document by document.
for d in corpus:
    for w in d.split():
        # .get() keeps the count going for a token that is missing from
        # `vocabulary` instead of raising KeyError.
        freq[w] = freq.get(w, 0) + 1

print(freq)
# Bar chart of the corpus-wide word counts, one bar per word.
# The dict views are materialised as lists so pandas receives plain sequences
# for both the column data and the index.
df = pd.DataFrame({'freq': list(freq.values())}, index=list(freq.keys()))
ax = df.plot.bar(rot=0)  # rot=0: no rotation of the tick labels
plt.show()
# Per-document histogram: one row per document, one column per vocabulary
# word; each cell holds how many times that word occurs in that document.
# NOTE: despite its name, `tf_idf` only contains raw term counts here; no
# inverse-document-frequency weighting is applied in this script.
V = len(vocabulary)
D = len(corpus)

tf_idf = np.zeros([D, V])

# Resolve each word's column once, rather than scanning the vocabulary list
# with list.index() for every token.
column_of = {w: j for j, w in enumerate(vocabulary)}

for i, d in enumerate(corpus):
    for w in d.split():
        tf_idf[i, column_of[w]] += 1

print(tf_idf)
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue