-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtf_idf.py
49 lines (45 loc) · 1.73 KB
/
tf_idf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import collections
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# Default number of top-scoring features kept by extract_topn_from_vector
# when the caller does not pass an explicit ``topn``.
top_n_words = 25
def sort_coo(coo_matrix):
    """Return the matrix's (column, value) pairs, highest value first.

    Pairs are built by zipping ``coo_matrix.col`` with ``coo_matrix.data``.
    They are ranked by value and then by column index, both descending, so
    entries with equal values still come out in a deterministic order.
    """
    def rank(entry):
        column, value = entry
        return value, column

    entries = list(zip(coo_matrix.col, coo_matrix.data))
    entries.sort(key=rank, reverse=True)
    return entries
def extract_topn_from_vector(feature_names, sorted_items, topn=top_n_words):
    """Map the leading ``topn`` features of ``sorted_items`` to their scores.

    Args:
        feature_names: Indexable collection translating a feature index into
            its name.
        sorted_items: Sequence of ``(feature_index, score)`` pairs; only the
            first ``topn`` of them are used, in the order given.
        topn: How many leading pairs to keep.

    Returns:
        A dict of ``feature name -> score rounded to 3 decimals``, inserted
        in the same order as the pairs appear in ``sorted_items``.
    """
    return {
        feature_names[index]: round(weight, 3)
        for index, weight in sorted_items[:topn]
    }
def tf_idf(groupings):
    """Pick the ten highest-scoring TF-IDF terms for every group of sentences.

    Every group is flattened into one space-joined document. The documents
    are vectorized together with ``CountVectorizer``, weighted with
    ``TfidfTransformer``, and the ten best-ranked terms of each document are
    collected.

    Args:
        groupings: Mapping from a group label to an iterable of sentences.
            Each sentence is assumed to be a sequence of string tokens
            (TODO confirm against callers); a plain string sentence would be
            split into single characters.

    Returns:
        A ``(all_key_words, cluster_words)`` tuple. ``all_key_words`` is the
        set of every selected keyword across all groups. ``cluster_words``
        maps each label to its list of keywords, in ranked order. Empty
        ``groupings`` yields ``(set(), {})``.
    """
    if not groupings:
        # Nothing to vectorize; fitting CountVectorizer on an empty corpus
        # would raise instead of producing an empty result.
        return (set(), {})

    labels = []
    topics_input = []
    for label, sentences in groupings.items():
        # Concatenate all sentences of the group into one flat token list.
        tokens = [token for sentence in sentences for token in sentence]
        labels.append(label)
        topics_input.append(" ".join(tokens))

    # max_df=0.9 ignores terms that occur in more than 90% of the documents.
    # NOTE(review): with a single group scikit-learn appears to reject this
    # max_df/min_df combination -- confirm whether one-group input can occur.
    cv = CountVectorizer(max_df=0.9, max_features=10000)
    word_count_vector = cv.fit_transform(topics_input)
    tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    tfidf_transformer.fit(word_count_vector)

    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement when present and fall back only on older releases.
    if hasattr(cv, "get_feature_names_out"):
        feature_names = cv.get_feature_names_out()
    else:
        feature_names = cv.get_feature_names()

    all_key_words = set()
    cluster_words = {}
    for label, doc in zip(labels, topics_input):
        tf_idf_vector = tfidf_transformer.transform(cv.transform([doc]))
        sorted_items = sort_coo(tf_idf_vector.tocoo())
        keywords = list(extract_topn_from_vector(feature_names, sorted_items, 10))
        cluster_words[label] = keywords
        all_key_words.update(keywords)
    return (all_key_words, cluster_words)