forked from TheAlgorithms/Python
- Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathjaro_winkler.py
80 lines (67 loc) · 2.13 KB
/
jaro_winkler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
"""https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance"""
defjaro_winkler(str1: str, str2: str) ->float:
"""
Jaro-Winkler distance is a string metric measuring an edit distance between two
sequences.
Output value is between 0.0 and 1.0.
>>> jaro_winkler("martha", "marhta")
0.9611111111111111
>>> jaro_winkler("CRATE", "TRACE")
0.7333333333333334
>>> jaro_winkler("test", "dbdbdbdb")
0.0
>>> jaro_winkler("test", "test")
1.0
>>> jaro_winkler("hello world", "HeLLo W0rlD")
0.6363636363636364
>>> jaro_winkler("test", "")
0.0
>>> jaro_winkler("hello", "world")
0.4666666666666666
>>> jaro_winkler("hell**o", "*world")
0.4365079365079365
"""
defget_matched_characters(_str1: str, _str2: str) ->str:
matched= []
limit=min(len(_str1), len(_str2)) //2
fori, charinenumerate(_str1):
left=int(max(0, i-limit))
right=int(min(i+limit+1, len(_str2)))
ifcharin_str2[left:right]:
matched.append(char)
_str2= (
f"{_str2[0 : _str2.index(char)]}{_str2[_str2.index(char) +1 :]}"
)
return"".join(matched)
# matching characters
matching_1=get_matched_characters(str1, str2)
matching_2=get_matched_characters(str2, str1)
match_count=len(matching_1)
# transposition
transpositions= (
len([(c1, c2) forc1, c2inzip(matching_1, matching_2) ifc1!=c2]) //2
)
ifnotmatch_count:
jaro=0.0
else:
jaro= (
1
/3
* (
match_count/len(str1)
+match_count/len(str2)
+ (match_count-transpositions) /match_count
)
)
# common prefix up to 4 characters
prefix_len=0
forc1, c2inzip(str1[:4], str2[:4]):
ifc1==c2:
prefix_len+=1
else:
break
returnjaro+0.1*prefix_len* (1-jaro)
if__name__=="__main__":
importdoctest
doctest.testmod()
print(jaro_winkler("hello", "world"))