forked from llvm/llvm-project
- Notifications
You must be signed in to change notification settings - Fork 339
/
Copy pathunicode-case-fold.py
executable file
·143 lines (116 loc) · 4.56 KB
/
unicode-case-fold.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env python
"""
Unicode case folding database conversion utility
Parses the database and generates a C++ function which implements the case
folding algorithm. The database entries are of the form:
<code>; <status>; <mapping>; # <name>
<status> can be one of four characters:
C - Common mappings
S - mappings for Simple case folding
F - mappings for Full case folding
T - special case for Turkish I characters
Right now this generates a function which implements simple case folding (C+S
entries).
"""
from __future__ importprint_function
importsys
importre
try:
fromurllib.requestimporturlopen
exceptImportError:
fromurllib2importurlopen
# This variable will body of the mappings function
body=""
# Reads file line-by-line, extracts Common and Simple case fold mappings and
# returns a (from_char, to_char, from_name) tuple.
defmappings(f):
previous_from=-1
expr=re.compile(r'^(.*); [CS]; (.*); # (.*)')
forlineinf:
m=expr.match(line)
ifnotm: continue
from_char=int(m.group(1), 16)
to_char=int(m.group(2), 16)
from_name=m.group(3)
iffrom_char<=previous_from:
raiseException("Duplicate or unsorted characters in input")
yieldfrom_char, to_char, from_name
previous_from=from_char
# Computes the shift (to_char - from_char) in a mapping.
defshift(mapping):
returnmapping[1] -mapping[0]
# Computes the stride (from_char2 - from_char1) of two mappings.
defstride2(mapping1, mapping2):
returnmapping2[0] -mapping1[0]
# Computes the stride of a list of mappings. The list should have at least two
# mappings. All mappings in the list are assumed to have the same stride.
defstride(block):
returnstride2(block[0], block[1])
# b is a list of mappings. All the mappings are assumed to have the same
# shift and the stride between adjecant mappings (if any) is constant.
defdump_block(b):
globalbody
iflen(b) ==1:
# Special case for handling blocks of length 1. We don't even need to
# emit the "if (C < X) return C" check below as all characters in this
# range will be caught by the "C < X" check emitted by the first
# non-trivial block.
body+=" // {2}\n if (C == {0:#06x})\n return {1:#06x};\n".format(*b[0])
return
first=b[0][0]
last=first+stride(b) * (len(b)-1)
modulo=first%stride(b)
# All characters before this block map to themselves.
body+=" if (C < {0:#06x})\n return C;\n".format(first)
body+=" // {0} characters\n".format(len(b))
# Generic pattern: check upper bound (lower bound is checked by the "if"
# above) and modulo of C, return C+shift.
pattern=" if (C <= {0:#06x} && C % {1} == {2})\n return C + {3};\n"
ifstride(b) ==2andshift(b[0]) ==1andmodulo==0:
# Special case:
# We can elide the modulo-check because the expression "C|1" will map
# the intervening characters to themselves.
pattern=" if (C <= {0:#06x})\n return C | 1;\n"
elifstride(b) ==1:
# Another special case: X % 1 is always zero, so don't emit the
# modulo-check.
pattern=" if (C <= {0:#06x})\n return C + {3};\n"
body+=pattern.format(last, stride(b), modulo, shift(b[0]))
current_block= []
f=urlopen(sys.argv[1])
forminmappings(f):
iflen(current_block) ==0:
current_block.append(m)
continue
ifshift(current_block[0]) !=shift(m):
# Incompatible shift, start a new block.
dump_block(current_block)
current_block= [m]
continue
iflen(current_block) ==1orstride(current_block) ==stride2(current_block[-1], m):
current_block.append(m)
continue
# Incompatible stride, start a new block.
dump_block(current_block)
current_block= [m]
f.close()
dump_block(current_block)
print('//===---------- Support/UnicodeCaseFold.cpp -------------------------------===//')
print('//')
print('// This file was generated by utils/unicode-case-fold.py from the Unicode')
print('// case folding database at')
print('// ', sys.argv[1])
print('//')
print('// To regenerate this file, run:')
print('// utils/unicode-case-fold.py \\')
print('// "{}" \\'.format(sys.argv[1]))
print('// > lib/Support/UnicodeCaseFold.cpp')
print('//')
print('//===----------------------------------------------------------------------===//')
print('')
print('#include "llvm/Support/Unicode.h"')
print('')
print("int llvm::sys::unicode::foldCharSimple(int C) {")
print(body)
print(" return C;")
print("}")