import mmh3 from random_things import string_generator class HashMap: def __init__(self, num_of_buckets=4, load_factor=0.75): assert (num_of_buckets >= 1) self.num_of_buckets = num_of_buckets self.mp = [[] for i in range(num_of_buckets)] self.LOAD_FACTOR = load_factor self.num_of_elements_in_ht = 0 self.longest_chain = [-1, -1] def bucket(self, key, num_of_buckets): k = mmh3.hash(key) return k % num_of_buckets # return HashMap.my_own_hash_function(key) % 3 def _does_it_need_resizing(self): return self.num_of_elements_in_ht / self.num_of_buckets >= self.LOAD_FACTOR def resize(self): if self._does_it_need_resizing(): new_hash_table_size = self.num_of_buckets * 2 new_hash_table = [[] for i in range(new_hash_table_size)] for linkedlist in self.mp: for element in linkedlist: bucket_index = self.bucket(element[0], new_hash_table_size) new_hash_table[bucket_index].append(element) if len(new_hash_table[bucket_index]) > self.longest_chain[0]: self.longest_chain = [len(new_hash_table[bucket_index]), new_hash_table[bucket_index]] self.num_of_buckets = new_hash_table_size self.mp = new_hash_table def set(self, key, value): bucket_index = self.bucket(key, self.num_of_buckets) element = self._get(self.mp[bucket_index], key) if element: element[1] = value return element self.mp[bucket_index].append([key, value]) if len(self.mp[bucket_index]) > self.longest_chain[0]: self.longest_chain = [len(self.mp[bucket_index]), self.mp[bucket_index]] self.num_of_elements_in_ht += 1 self.resize() return [key, value] def _get(self, bucket, key): for ele in bucket: if ele[0] == key: return ele return None def get(self, key): bucket_index = self.bucket(key, self.num_of_buckets) get_resp = self._get(self.mp[bucket_index], key) return get_resp[1] if get_resp else None def remove(self, key): bucket_index = self.bucket(key, self.num_of_buckets) for ele in self.mp[bucket_index]: if ele[0] == key: self.mp[bucket_index].remove(ele) self.resize() self.num_of_elements_in_ht -= 1 if len(self.mp[bucket_index]) < self.longest_chain[0]: self.longest_chain = [len(self.mp[bucket_index]), self.mp[bucket_index]] return def get_the_entire_map(self): return self.mp.copy() @staticmethod def my_own_hash_function(key): return 0 def get_longest_chain(self): return self.longest_chain def num_of_elements(self): return self.num_of_elements_in_ht def keys(self): k = [] for linkedlist in self.mp: for element in linkedlist: k.append(element[0]) return k def main(): mp = HashMap(2, load_factor=1) # mp.set("my_string", "my_string") # mp.set("my_string2", "my_string") # mp.set("my_string3", "my_string") # mp.set("my_string4", "my_string") # mp.set("my_string5", "my_string") # mp.set("my_string6", "my_string") # mp.set("my_string7", "my_string") # mp.set("my_string8", "my_string") # print(f"Size of map {len(mp.get_the_entire_map())} and number of elements is : {mp.num_of_elements()}") import time import random start = time.time() while True: for my_count, my_string in enumerate(string_generator(20, random.randint(10, 50000))): mp.set(my_string, my_string) mp.set(my_string, my_string) if time.time() - start >= 1: print( f"Size of map {len(mp.get_the_entire_map())} and number of elements is : {mp.num_of_elements()}. Longest chain {mp.get_longest_chain()[0]}") time.sleep(0.200) # start = time.time() for key in mp.keys(): mp.remove(key) if time.time() - start >= 1: print( f"Size of map {len(mp.get_the_entire_map())} and number of elements is : {mp.num_of_elements()}. Longest chain {mp.get_longest_chain()[0]}") time.sleep(0.200) start = time.time() mp = HashMap(2) # for idx, val in enumerate(mp.get_the_entire_map()): # print(idx, len(val)) # print(mp.set("my_string", "my_string1")) # print(mp.get("my_string")) if __name__ == "__main__": main()
1 Answer
cite your references
from random_things import string_generator
What is the meaning of this? We need a # comment
in the source code describing its provenance.
Based on pep-8 and how you wrote this, you're telling the Gentle Reader that this dep comes from a library outside of the current project. If that's not accurate, use isort to clarify your intent.
(OTOH your reference to mmh3 was very clear, thank you, as MurmurHash is available on pypi.)
extra parens
def __init__(self, num_of_buckets=4, load_factor=0.75): ... self.num_of_buckets = num_of_buckets
This is perfectly nice and bog standard.
Consider renaming the first parameter so we assign like this:
self.num_of_buckets = initial_num_of_buckets
That would stress that the caller is only offering a hint, and is not bound to that many buckets over the map's lifetime.
assert (num_of_buckets >= 1)
No. Please elide the extra (
)
parentheses. Suppose we get a feature request in future for displaying the actual number upon failure. We wouldn't want to trick some poor maintenance engineer into accidentally writing the following erroneous code:
assert (num_of_buckets >= 1, num_of_buckets)
comments where appropriate
self.longest_chain = [-1, -1]
That's rather cryptic. Offer a hint about how to interpret that 2-tuple. And consider using tuple
rather than list
. Bonus points for a self-describing namedtuple.
Later on the whole cryptic ele[0]
thing is tedious and would similarly benefit from a self-describing datastructure.
self.mp = [[] for i in range(num_of_buckets)]
Sorry, you lost me there. No idea how I should mentally pronounce "mp". Please offer the Gentle Reader a clue about those two words.
(I really hope it isn't a vowel-less form of "map". To avoid shadowing a builtin we conventionally append an _
underscore, giving map_
.)
Similarly, tell us about "ht" (num_of_elements_in_ht
). Wouldn't you like to use the conventional name of size
?
EDIT Further down in the code you reveal it is a Hash Table. I'd been expecting Hash Map based on the class name, whatever.
use object attributes
def bucket(self, key, num_of_buckets): k = mmh3.hash(key) return k % num_of_buckets
Ummm, sorry, you lost me there. Why did we pass in 2nd parameter? Instead of return k % self.num_of_buckets
?
EDIT OIC, resize
could have doubled self.num_of_buckets
early, but instead chose to do it late. Consider doing it earlier, prior to looping.
# return HashMap.my_own_hash_function(key) % 3
Commented out code is fine during debugging. But you're requesting review for a PR merge-to-main, and that's the right time to prune such things from the source before showing it to others.
use accurate names
for linkedlist in self.mp:
I see what the identifier is trying to tell me, but it's not clear to me that we have a linked list of nodes that each have a node.next
field. If you had wanted that, I would have expected the import of deque.
def resize(self):
Consider renaming this to maybe_resize
, since often the call will be a no-op.
use a None sentinel
def _get(self, bucket, key): ... return None
Wow, I didn't see that one coming. A HashMap
is like a dict
, but one cannot store a {"foo": None}
value?!? And set
doesn't complain when given a None
?
App-level programmers are going to want to use None
. Recommend you create your own none_sentinel = object()
so you can return
and test against that.
@staticmethod def my_own_hash_function(key): return 0
World's greatest hash function, I know. In a """docstring""" you might spell out that this is for testing. Plus this gives you a great opportunity to specify what you expect of a hash function, e.g. whether negative return values are acceptable. Minimally you should annotate:
def my_own_hash_function(key: str) -> int:
public attributes
def get_longest_chain(self): return self.longest_chain
Ok, I bit my tongue on the map .copy() thing, but this is even more silly. This public method is absolutely not pulling its weight, given that it offers access to a public attribute. Either rename to self._longest_chain
or delete this method.
def num_of_elements(self): return self.num_of_elements_in_ht
Yup, not getting that one, either. I would expect this container to conform to the Sized
protocol, so that len(my_map)
would be how an app developer accesses this. Suggesting a name of def __len__
.
offer a generator
def keys(self):
This method is lovely, thank you.
It allocates storage proportional to map size. Consider offering a yield element[0]
generator instead. If someone really needs a list they can just request list(my_map.keys())
, same as they would for a dict
.
loop where appropriate
# mp.set("my_string8", "my_string")
Same remark as above -- review time is when we clean up comments.
But even before you commented it out, a for
loop would have been a much more convenient way to stimulate the system-under-test.
stress the code
enumerate(string_generator(20, ...
I'm guessing we insert twenty keys.
That's great during manual testing when you're trying to initially get things working, so you're not visually overwhelmed by a data dump.
Now that it works and you're obtaining timings, I encourage you to fill it with many more keys. That way if an operation isn't O(1) or O(N) it will show up in the timing data. I'm concerned this code may be rather on the slow side.
Publishing a self-evaluating test suite as part of this codebase wouldn't hurt.
This code appears to achieve its objectives.
Given a few trivial renames I would be willing to delegate or accept maintenance tasks on it.
- \$\begingroup\$Hi. Thank you for all the valuable suggestions. I agree with all of them. There are few things I did to check if my map is working correctly. I have two queries 1. What is the right place to resize() a hashmap? 2. I have tried benchmarking and as the hash function seems to be deciding factor. Do you see this implementation being much slower than the official stdlib dict? I do understand it being implemented in C and having bindings to python. There are a lot of optimizations that would be in the native dict as well. So far I am not accounting for real edge cases.\$\endgroup\$– piepiCommentedAug 12, 2023 at 12:28