- Notifications
You must be signed in to change notification settings - Fork 296
/
Copy pathtopicmodel_test.py
109 lines (92 loc) · 2.69 KB
/
topicmodel_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
fromassertpyimportassert_that
importnumpy
importrandom
fromtopicmodelimportall_words
fromtopicmodelimportcluster_stories
fromtopicmodelimportmake_document_term_matrix
# Absolute tolerance used when comparing matrix entries with is_close_to.
EPSILON = 1e-9
def test_all_words_empty():
    """all_words on an empty list of documents returns an empty list."""
    no_documents = []
    vocabulary = all_words(no_documents)
    assert_that(vocabulary).is_equal_to([])
def test_all_words_single_doc():
    """all_words on one document returns its words as ['a', 'b', 'c'].

    The document lists its words as b, c, a, so the expected result is in a
    different order than the input.
    """
    only_doc = {'words': ['b', 'c', 'a']}
    expected_vocabulary = ['a', 'b', 'c']

    vocabulary = all_words([only_doc])

    assert_that(vocabulary).is_equal_to(expected_vocabulary)
def test_all_words_many_docs():
    """all_words over three overlapping documents returns a..e once each.

    The documents share several words; the expected result lists every
    distinct word exactly once, as ['a', 'b', 'c', 'd', 'e'].
    """
    word_lists = (
        ['b', 'c', 'a'],
        ['b', 'd', 'a'],
        ['b', 'd', 'e'],
    )
    corpus = [{'words': words} for words in word_lists]

    vocabulary = all_words(corpus)

    assert_that(vocabulary).is_equal_to(['a', 'b', 'c', 'd', 'e'])
def test_make_document_term_matrix_empty():
    """make_document_term_matrix([]) returns empty lookups and a 0x0 matrix.

    The function returns ``matrix, (index_to_word, index_to_document)``; with
    no documents both index mappings must be empty dicts and the matrix must
    equal ``numpy.zeros((0, 0))`` in shape as well as content.
    """
    matrix, (index_to_word, index_to_document) = make_document_term_matrix([])

    assert_that(index_to_document).is_equal_to({})
    assert_that(index_to_word).is_equal_to({})

    # Do not hand NumPy arrays to is_equal_to: an ``!=`` comparison of arrays
    # is elementwise, and the truth value of the resulting *empty* array is
    # deprecated/ambiguous in NumPy, so the check would either pass for any
    # empty array regardless of shape or raise. array_equal compares both
    # shape and elements and returns a plain bool.
    expected_matrix = numpy.zeros((0, 0))
    assert_that(numpy.array_equal(matrix, expected_matrix)).is_true()
def test_make_document_term_matrix():
    """make_document_term_matrix builds the expected lookups and matrix.

    Three small documents over the words a..e are passed in. The test checks
    that index_to_document enumerates the documents in input order, that
    index_to_word enumerates 'a'..'e', and that the matrix matches the
    expected 5x3 array of 0/1 entries to within EPSILON.
    """
    doc1 = {'words': ['b', 'c', 'a']}
    doc2 = {'words': ['b', 'd', 'a']}
    doc3 = {'words': ['b', 'd', 'e']}
    documents = [doc1, doc2, doc3]

    matrix, (index_to_word, index_to_document) = make_document_term_matrix(
        documents)

    assert_that(index_to_document).is_equal_to(dict(enumerate(documents)))
    assert_that(index_to_word).is_equal_to(dict(enumerate('abcde')))

    # One row per word (a..e) and one column per document (doc1..doc3);
    # an entry is 1 where the word occurs in the document.
    expected_matrix = numpy.array([
        [1, 1, 0],
        [1, 1, 1],
        [1, 0, 0],
        [0, 1, 1],
        [0, 0, 1],
    ])

    # zip() stops at the shorter input, so without a shape check a matrix
    # with too few entries (even an empty one) would pass the loop below.
    assert_that(matrix.shape).is_equal_to(expected_matrix.shape)

    for actual, expected in zip(matrix.flatten(), expected_matrix.flatten()):
        assert_that(actual).is_close_to(expected, EPSILON)
def test_cluster_stories():
    """cluster_stories with k=2 separates the a-e stories from the w-z ones.

    Six documents are supplied: doc1, doc2 and doc6 use only the words a..e,
    while doc3, doc4 and doc5 use only w..z. The returned word clusters and
    document clusters must be exactly those two groups, in any order.
    """
    # Both RNGs are pinned before clustering; presumably cluster_stories
    # depends on randomness (not visible from this test) -- verify if changed.
    random.seed(1)
    numpy.random.seed(1)

    fixtures = (
        ('doc1', ['b', 'c', 'a', 'd', 'e', 'c']),
        ('doc2', ['b', 'd', 'a', 'e', 'e', 'c']),
        ('doc3', ['x', 'y', 'z', 'x', 'y', 'w']),
        ('doc4', ['w', 'y', 'z', 'y', 'z']),
        ('doc5', ['z', 'w', 'z', 'w', 'w']),
        ('doc6', ['c', 'c', 'a', 'e', 'e']),
    )
    stories = [{'words': words, 'text': text} for text, words in fixtures]

    word_clusters, document_clusters = cluster_stories(stories, k=2)

    expected_word_clusters = (
        ('a', 'b', 'c', 'd', 'e'),
        ('w', 'x', 'y', 'z'),
    )
    expected_document_clusters = (
        ('doc1', 'doc2', 'doc6'),
        ('doc3', 'doc4', 'doc5'),
    )

    # Converting to sets makes the comparison independent of cluster order.
    assert_that(set(word_clusters)).contains_only(*expected_word_clusters)
    assert_that(set(document_clusters)).contains_only(
        *expected_document_clusters)