- Notifications
You must be signed in to change notification settings - Fork 46.7k
/
Copy pathtop_k_frequent_words.py
100 lines (84 loc) · 3.09 KB
/
top_k_frequent_words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
"""
Finds the top K most frequent words from the provided word list.
This implementation aims to show how to solve the problem using the Heap class
already present in this repository.
Computing order statistics is, in fact, a typical usage of heaps.
This is mostly shown for educational purposes, since the problem can be solved
in a few lines using collections.Counter from the Python standard library:
from collections import Counter
def top_k_frequent_words(words, k_value):
return [x[0] for x in Counter(words).most_common(k_value)]
"""
fromcollectionsimportCounter
fromfunctoolsimporttotal_ordering
fromdata_structures.heap.heapimportHeap
@total_ordering
classWordCount:
def__init__(self, word: str, count: int) ->None:
self.word=word
self.count=count
def__eq__(self, other: object) ->bool:
"""
>>> WordCount('a', 1).__eq__(WordCount('b', 1))
True
>>> WordCount('a', 1).__eq__(WordCount('a', 1))
True
>>> WordCount('a', 1).__eq__(WordCount('a', 2))
False
>>> WordCount('a', 1).__eq__(WordCount('b', 2))
False
>>> WordCount('a', 1).__eq__(1)
NotImplemented
"""
ifnotisinstance(other, WordCount):
returnNotImplemented
returnself.count==other.count
def__lt__(self, other: object) ->bool:
"""
>>> WordCount('a', 1).__lt__(WordCount('b', 1))
False
>>> WordCount('a', 1).__lt__(WordCount('a', 1))
False
>>> WordCount('a', 1).__lt__(WordCount('a', 2))
True
>>> WordCount('a', 1).__lt__(WordCount('b', 2))
True
>>> WordCount('a', 2).__lt__(WordCount('a', 1))
False
>>> WordCount('a', 2).__lt__(WordCount('b', 1))
False
>>> WordCount('a', 1).__lt__(1)
NotImplemented
"""
ifnotisinstance(other, WordCount):
returnNotImplemented
returnself.count<other.count
deftop_k_frequent_words(words: list[str], k_value: int) ->list[str]:
"""
Returns the `k_value` most frequently occurring words,
in non-increasing order of occurrence.
In this context, a word is defined as an element in the provided list.
In case `k_value` is greater than the number of distinct words, a value of k equal
to the number of distinct words will be considered, instead.
>>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 3)
['c', 'a', 'b']
>>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 2)
['c', 'a']
>>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 1)
['c']
>>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 0)
[]
>>> top_k_frequent_words([], 1)
[]
>>> top_k_frequent_words(['a', 'a'], 2)
['a']
"""
heap: Heap[WordCount] =Heap()
count_by_word=Counter(words)
heap.build_max_heap(
[WordCount(word, count) forword, countincount_by_word.items()]
)
return [heap.extract_max().wordfor_inrange(min(k_value, len(count_by_word)))]
if__name__=="__main__":
importdoctest
doctest.testmod()