topicmodel.py

'''A simple topic model using singular value decomposition
applied to a corpus of CNN stories.
'''
import json
from collections import Counter

import numpy as np
from scipy.cluster.vq import kmeans2

# from numpy.linalg import svd
from svd import svd

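# Note: the local `svd` module is assumed to provide a truncated SVD with
# signature svd(matrix, k) returning (sigma, U, V), since that is how it is
# called below. Swapping in numpy.linalg.svd (commented out above) would
# require adapting the call site: numpy returns (U, s, Vh) and computes the
# full decomposition rather than only the top k singular triples.
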
def normalize(matrix):
    '''Normalize a document-term matrix according to a local and a
    global normalization factor.

    For this we chose a simple logarithmic local normalization with a
    global normalization based on entropy.
    '''
    num_words, num_docs = matrix.shape
    local_factors = np.log(np.ones(matrix.shape) + matrix.copy())

    probabilities = matrix.copy()
    row_sums = np.sum(matrix, axis=1)

    # every word must occur at least once across the corpus
    assert all(x > 0 for x in row_sums)

    # divide each word's row by its total count, giving the distribution
    # of that word across the documents
    probabilities = (probabilities.T / row_sums).T

    entropies = (probabilities * np.ma.log(probabilities).filled(0) /
                 np.log(num_docs))
    global_factors = np.ones(num_words) + np.sum(entropies, axis=1)

    # scale each word's row of local factors by that word's global factor
    normalized_matrix = (local_factors.T * global_factors).T
    return normalized_matrix

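# A minimal sketch of the effect on a hypothetical 2x2 count matrix
# (toy data, not from the corpus):
#
#   >>> counts = np.array([[1.0, 1.0],    # word 0: once in each story
#   ...                    [2.0, 0.0]])   # word 1: only in story 0
#   >>> normalize(counts)
#   array([[0.        , 0.        ],
#          [1.09861229, 0.        ]])
#
# Word 0 is spread evenly across the corpus, so its entropy is maximal
# and its global factor is zero, wiping it out; word 1 is concentrated
# in one story, so its global factor is 1 and its log-count survives.
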
def make_document_term_matrix(documents):
    '''Return the document-term matrix for the given list of stories.

    Arguments:
        documents: a list of dictionaries of the form

            {
                'words': [string],
                'text': string,
            }

        The list of words may include repetitions.

    Returns:
        A document-term matrix. Entry [i, j] is the count of word i
        in story j.
    '''
    words = all_words(documents)
    word_to_index = dict((word, i) for i, word in enumerate(words))
    index_to_word = dict(enumerate(words))
    index_to_document = dict(enumerate(documents))

    matrix = np.zeros((len(words), len(documents)))
    for doc_id, document in enumerate(documents):
        doc_words = Counter(document['words'])
        for word, count in doc_words.items():
            matrix[word_to_index[word], doc_id] = count

    return matrix, (index_to_word, index_to_document)

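# For example, on a hypothetical two-story corpus (toy data):
#
#   >>> docs = [{'words': ['cat', 'cat', 'dog'], 'text': 'story 0'},
#   ...         {'words': ['dog'], 'text': 'story 1'}]
#   >>> matrix, (index_to_word, index_to_document) = \
#   ...     make_document_term_matrix(docs)
#   >>> matrix          # rows are the sorted words 'cat', 'dog'
#   array([[2., 0.],
#          [1., 1.]])
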
def cluster(vectors):
    '''Cluster the rows of the given matrix via k-means, with as many
    clusters as the vectors have dimensions.'''
    return kmeans2(vectors, k=len(vectors[0]))

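# kmeans2 returns a pair (centroids, labels), where labels[i] is the index
# of the cluster containing row i. Choosing k = len(vectors[0]) ties the
# number of clusters to the dimension of the projected vectors, i.e. to
# the number of singular values kept.
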
def all_words(documents):
    '''Return a sorted list of all unique words in the input documents.'''
    words = set()
    for entry in documents:
        words |= set(entry['words'])
    return sorted(words)

def load(filename='all_stories.json'):
    with open(filename, 'r') as infile:
        return json.load(infile)

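# The JSON file is assumed to contain a list of story dictionaries with
# 'words' and 'text' keys, matching the shape documented above.
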
def cluster_stories(documents, k=10):
    '''Cluster a set of documents using a simple SVD-based topic model.

    Arguments:
        documents: a list of dictionaries of the form

            {
                'words': [string],
                'text': string,
            }

        k: the number of singular values to compute.

    Returns:
        A pair (word_clusters, document_clusters), where word_clusters
        is a clustering over the set of all words in all documents, and
        document_clusters is a clustering over the set of documents.
    '''
    matrix, (index_to_word, index_to_document) = make_document_term_matrix(
        documents)
    matrix = normalize(matrix)
    sigma, U, V = svd(matrix, k=k)

    projected_documents = np.dot(matrix.T, U)
    projected_words = np.dot(matrix, V.T)

    document_centers, document_clustering = cluster(projected_documents)
    word_centers, word_clustering = cluster(projected_words)

    word_clusters = tuple(
        tuple(index_to_word[i]
              for (i, x) in enumerate(word_clustering) if x == j)
        for j in range(len(set(word_clustering)))
    )
    document_clusters = tuple(
        tuple(index_to_document[i]['text']
              for (i, x) in enumerate(document_clustering) if x == j)
        for j in range(len(set(document_clustering)))
    )

    return word_clusters, document_clusters

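# Dimension check for the projections in cluster_stories, assuming the
# shapes implied by the products: matrix is (num_words, num_docs), U is
# (num_words, k), and V is (k, num_docs), so matrix.T.dot(U) embeds each
# story as a k-dimensional row vector and matrix.dot(V.T) does the same
# for each word. Both clusterings happen in this shared latent space.
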
if __name__ == "__main__":
    word_clusters, document_clusters = cluster_stories(load())
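    # A minimal way to inspect the output (printing only the first ten
    # words per cluster, an arbitrary choice to keep it readable):
    for i, words_in_cluster in enumerate(word_clusters):
        print('word cluster %d: %s' % (i, ', '.join(words_in_cluster[:10])))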