topicmodel.py

'''A simple topic model using singular value decomposition
applied to a corpus of CNN stories.
'''
import json
from collections import Counter

import numpy as np
from scipy.cluster.vq import kmeans2

# from numpy.linalg import svd
from svd import svd

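# Note: the local `svd` module is assumed to provide a truncated SVD with
# signature svd(matrix, k) returning (sigma, U, V), since that is how it is
# called below. Swapping in numpy.linalg.svd (commented out above) would
# require adapting the call site: numpy returns (U, s, Vh) and computes the
# full decomposition rather than only the top k singular triples.
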
def normalize(matrix):
    '''Normalize a document-term matrix according to a local and a
    global normalization factor.

    For this we chose a simple logarithmic local normalization with a
    global normalization based on entropy.
    '''
    num_words, num_docs = matrix.shape
    local_factors = np.log(np.ones(matrix.shape) + matrix.copy())

    probabilities = matrix.copy()
    row_sums = np.sum(matrix, axis=1)

    # every word must occur at least once across the corpus
    assert all(x > 0 for x in row_sums)

    # divide each word's row by its total count, giving the distribution
    # of that word across the documents
    probabilities = (probabilities.T / row_sums).T

    entropies = (probabilities * np.ma.log(probabilities).filled(0) /
                 np.log(num_docs))
    global_factors = np.ones(num_words) + np.sum(entropies, axis=1)

    # scale each word's row of local factors by that word's global factor
    normalized_matrix = (local_factors.T * global_factors).T
    return normalized_matrix

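# A minimal sketch of the effect on a hypothetical 2x2 count matrix
# (toy data, not from the corpus):
#
#   >>> counts = np.array([[1.0, 1.0],    # word 0: once in each story
#   ...                    [2.0, 0.0]])   # word 1: only in story 0
#   >>> normalize(counts)
#   array([[0.        , 0.        ],
#          [1.09861229, 0.        ]])
#
# Word 0 is spread evenly across the corpus, so its entropy is maximal
# and its global factor is zero, wiping it out; word 1 is concentrated
# in one story, so its global factor is 1 and its log-count survives.
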
def make_document_term_matrix(documents):
    '''Return the document-term matrix for the given list of stories.

    Arguments:
        documents: a list of dictionaries of the form

            {
                'words': [string],
                'text': string,
            }

        The list of words may include repetitions.

    Returns:
        A document-term matrix. Entry [i, j] is the count of word i
        in story j.
    '''
    words = all_words(documents)
    word_to_index = dict((word, i) for i, word in enumerate(words))
    index_to_word = dict(enumerate(words))
    index_to_document = dict(enumerate(documents))

    matrix = np.zeros((len(words), len(documents)))
    for doc_id, document in enumerate(documents):
        doc_words = Counter(document['words'])
        for word, count in doc_words.items():
            matrix[word_to_index[word], doc_id] = count

    return matrix, (index_to_word, index_to_document)

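# For example, on a hypothetical two-story corpus (toy data):
#
#   >>> docs = [{'words': ['cat', 'cat', 'dog'], 'text': 'story 0'},
#   ...         {'words': ['dog'], 'text': 'story 1'}]
#   >>> matrix, (index_to_word, index_to_document) = \
#   ...     make_document_term_matrix(docs)
#   >>> matrix          # rows are the sorted words 'cat', 'dog'
#   array([[2., 0.],
#          [1., 1.]])
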
def cluster(vectors):
    '''Cluster the rows of the given matrix via k-means, with as many
    clusters as the vectors have dimensions.'''
    return kmeans2(vectors, k=len(vectors[0]))

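# kmeans2 returns a pair (centroids, labels), where labels[i] is the index
# of the cluster containing row i. Choosing k = len(vectors[0]) ties the
# number of clusters to the dimension of the projected vectors, i.e. to
# the number of singular values kept.
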
def all_words(documents):
    '''Return a sorted list of all unique words in the input documents.'''
    words = set()
    for entry in documents:
        words |= set(entry['words'])
    return sorted(words)

def load(filename='all_stories.json'):
    with open(filename, 'r') as infile:
        return json.load(infile)

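# The JSON file is assumed to contain a list of story dictionaries with
# 'words' and 'text' keys, matching the shape documented above.
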
def cluster_stories(documents, k=10):
    '''Cluster a set of documents using a simple SVD-based topic model.

    Arguments:
        documents: a list of dictionaries of the form

            {
                'words': [string],
                'text': string,
            }

        k: the number of singular values to compute.

    Returns:
        A pair (word_clusters, document_clusters), where word_clusters
        is a clustering over the set of all words in all documents, and
        document_clusters is a clustering over the set of documents.
    '''
    matrix, (index_to_word, index_to_document) = make_document_term_matrix(
        documents)
    matrix = normalize(matrix)
    sigma, U, V = svd(matrix, k=k)

    projected_documents = np.dot(matrix.T, U)
    projected_words = np.dot(matrix, V.T)

    document_centers, document_clustering = cluster(projected_documents)
    word_centers, word_clustering = cluster(projected_words)

    word_clusters = tuple(
        tuple(index_to_word[i]
              for (i, x) in enumerate(word_clustering) if x == j)
        for j in range(len(set(word_clustering)))
    )
    document_clusters = tuple(
        tuple(index_to_document[i]['text']
              for (i, x) in enumerate(document_clustering) if x == j)
        for j in range(len(set(document_clustering)))
    )

    return word_clusters, document_clusters

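# Dimension check for the projections in cluster_stories, assuming the
# shapes implied by the products: matrix is (num_words, num_docs), U is
# (num_words, k), and V is (k, num_docs), so matrix.T.dot(U) embeds each
# story as a k-dimensional row vector and matrix.dot(V.T) does the same
# for each word. Both clusterings happen in this shared latent space.
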
if __name__ == "__main__":
    word_clusters, document_clusters = cluster_stories(load())
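    # A minimal way to inspect the output (printing only the first ten
    # words per cluster, an arbitrary choice to keep it readable):
    for i, words_in_cluster in enumerate(word_clusters):
        print('word cluster %d: %s' % (i, ', '.join(words_in_cluster[:10])))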