- Notifications
You must be signed in to change notification settings - Fork 1.9k
/
Copy pathSuffixArray.java
179 lines (154 loc) · 6.07 KB
/
SuffixArray.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
packagecom.jwetherell.algorithms.data_structures;
importjava.util.ArrayList;
importjava.util.Collections;
importjava.util.Comparator;
/**
* In computer science, a suffix array is a sorted array of all suffixes of a string.
* It is a data structure used, among others, in full text indices, data compression
* algorithms and within the field of bibliometrics.
* <p>
* @see <a href="https://en.wikipedia.org/wiki/Suffix_array">Suffix Array (Wikipedia)</a>
* <p>
* NOTE: This implementation returns starting indexes instead of full suffixes
* <br>
* @author Jakub Szarawarski <kubaszarawarski@gmail.com>
* @author Justin Wetherell <phishman3579@gmail.com>
*/
publicclassSuffixArray {
privatestaticfinalStringBuilderSTRING_BUILDER = newStringBuilder();
privatestaticfinalcharDEFAULT_END_SEQ_CHAR = '$';
privatefinalcharendSeqChar;
privateStringstring;
privateArrayList<Integer> suffixArray;
privateArrayList<Integer> KMRarray;
publicSuffixArray(CharSequencesequence) {
this(sequence, DEFAULT_END_SEQ_CHAR);
}
publicSuffixArray(CharSequencesequence, charendChar) {
endSeqChar = endChar;
string = buildStringWithEndChar(sequence);
}
publicArrayList<Integer> getSuffixArray() {
if (suffixArray == null)
KMRalgorithm();
returnsuffixArray;
}
/**
* @return inverted suffix array
*/
publicArrayList<Integer> getKMRarray() {
if (KMRarray == null)
KMRalgorithm();
returnKMRarray;
}
publicStringgetString(){
returnstring;
}
/**
* Creates suffix array using KMR algorithm with O(n log^2 n) complexity.
*
* For radius r:
* KMR[i] == k,
* when string[i..i+r-1] is kth r-letter substring of string sorted lexicographically
* KMR is counted for radius = 1,2,4,8 ...
* KMR for radius bigger than string length is the inverted suffix array
*/
privatevoidKMRalgorithm() {
finalintlength = string.length();
ArrayList<KMRsWithIndex> KMRinvertedList = newArrayList<KMRsWithIndex>();
ArrayList<Integer> KMR = getBasicKMR(length);
intradius = 1;
while (radius < length) {
KMRinvertedList = getKMRinvertedList(KMR, radius, length);
KMR = getKMR(KMRinvertedList, length);
radius *= 2;
}
KMRarray = newArrayList<Integer>(KMR.subList(0, length));
suffixArray = newArrayList<Integer>();
for (KMRsWithIndexkmr : KMRinvertedList) {
suffixArray.add(newInteger(kmr.index));
}
}
/**
* Creates KMR array for new radius from nearly inverted array.
* Elements from inverted array need to be grouped by substring tey represent.
*
* @param KMRinvertedList indexes are nearly inverted KMR array
* @param length string length
* @return KMR array for new radius
*/
privateArrayList<Integer> getKMR(ArrayList<KMRsWithIndex> KMRinvertedList, intlength) {
finalArrayList<Integer> KMR = newArrayList<Integer>(length*2);
for (inti=0; i<2*length; i++)
KMR.add(newInteger(-1));
intcounter = 0;
for (inti=0; i<length; i++){
if(i>0 && substringsAreEqual(KMRinvertedList, i))
counter++;
KMR.set(KMRinvertedList.get(i).index, newInteger(counter));
}
returnKMR;
}
privatebooleansubstringsAreEqual(ArrayList<KMRsWithIndex> KMRinvertedList, inti) {
return (KMRinvertedList.get(i-1).beginKMR.equals(KMRinvertedList.get(i).beginKMR) == false) ||
(KMRinvertedList.get(i-1).endKMR.equals(KMRinvertedList.get(i).endKMR) == false);
}
/**
* helper method to create KMR array for radius = radius from KMR array for radius = radius/2
*
* @param KMR KMR array for radius = radius/2
* @param radius new radius
* @param length string length
* @return list of KMRsWithIndex which indexes are nearly inverted KMR array
*/
privateArrayList<KMRsWithIndex> getKMRinvertedList(ArrayList<Integer> KMR, intradius, intlength) {
finalArrayList<KMRsWithIndex> KMRinvertedList = newArrayList<KMRsWithIndex>();
for (inti=0; i<length; i++)
KMRinvertedList.add(newKMRsWithIndex(KMR.get(i), KMR.get(i+radius), i));
Collections.sort(KMRinvertedList,
newComparator<KMRsWithIndex>() {
@Override
publicintcompare(KMRsWithIndexA, KMRsWithIndexB) {
if (A.beginKMR.equals(B.beginKMR) == false)
returnA.beginKMR.compareTo(B.beginKMR);
if (A.endKMR.equals(B.endKMR) == false)
returnA.endKMR.compareTo(B.endKMR);
returnA.index.compareTo(B.index);
}
}
);
returnKMRinvertedList;
}
/**
* KMR array for radius=1, instead of initial natural numbers ascii codes are used
*
* @param length length of string
* @return pseudo KMR array for radius=1
*/
privateArrayList<Integer> getBasicKMR(intlength) {
finalArrayList<Integer> result = newArrayList<Integer>(length*2);
finalchar[] characters = string.toCharArray();
for (inti=0; i<length; i++)
result.add(newInteger(characters[i]));
for (inti=0; i<length; i++)
result.add(newInteger(-1));
returnresult;
}
privateStringbuildStringWithEndChar(CharSequencesequence) {
STRING_BUILDER.setLength(0);
STRING_BUILDER.append(sequence);
if (STRING_BUILDER.indexOf(String.valueOf(endSeqChar)) < 0)
STRING_BUILDER.append(endSeqChar);
returnSTRING_BUILDER.toString();
}
privateclassKMRsWithIndex{
IntegerbeginKMR;
IntegerendKMR;
Integerindex;
KMRsWithIndex(Integerbegin, Integerend, Integerindex){
this.beginKMR = begin;
this.endKMR = end;
this.index = index;
}
}
}