1 // Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
2 //
3 // Permission is hereby granted, free of charge, to any person
4 // obtaining a copy of this software and associated documentation
5 // files (the "Software"), to deal in the Software without
6 // restriction, including without limitation the rights to use,
7 // copy, modify, merge, publish, distribute, sublicense, and/or sell
8 // copies of the Software, and to permit persons to whom the
9 // Software is furnished to do so, subject to the following
10 // conditions:
11 //
12 // The above copyright notice and this permission notice shall be
13 // included in all copies or substantial portions of the Software.
14 //
15 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17 // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
19 // HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
20 // WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 // OTHER DEALINGS IN THE SOFTWARE.
23 //
24 // ChangeLog:
25 // 2016-07-22 - Initial commit and adaption to use PagedArray.
26 //                --Samuel Huang <huangs@chromium.org>
27 
28 #include "courgette/third_party/divsufsort/divsufsort_private.h"
29 
30 #include <stdlib.h>
31 
/* Sizes, in entries, of the two bucket tables used by the functions below. */
#define BUCKET_A_SIZE (ALPHABET_SIZE)
#define BUCKET_B_SIZE (ALPHABET_SIZE * ALPHABET_SIZE)

/* Accessors for the bucket tables. Each expands to an lvalue and expects an
   array named bucket_A / bucket_B to be in scope at the point of use.
   BUCKET_B and BUCKET_BSTAR share the single bucket_B table with transposed
   indices: BUCKET_B(c0, c1) lives at row c1, column c0, while
   BUCKET_BSTAR(c0, c1) lives at row c0, column c1. */
#define BUCKET_A(_c0) (bucket_A[(_c0)])
#if ALPHABET_SIZE == 256
/* Shift/or form of the generic indexing below (equivalent for character
   values in [0, 256)). */
#define BUCKET_B(_c0, _c1) (bucket_B[((_c1) << 8) | (_c0)])
#define BUCKET_BSTAR(_c0, _c1) (bucket_B[((_c0) << 8) | (_c1)])
#else
#define BUCKET_B(_c0, _c1) (bucket_B[(_c1) * ALPHABET_SIZE + (_c0)])
#define BUCKET_BSTAR(_c0, _c1) (bucket_B[(_c0) * ALPHABET_SIZE + (_c1)])
#endif
43 
44 namespace divsuf {
45 
46 /*- Private Functions -*/
47 
48 namespace {
49 
50 /* Sorts suffixes of type B*. */
51 saidx_t
sort_typeBstar(const sauchar_t * T,saidx_it SA,saidx_t * bucket_A,saidx_t * bucket_B,saidx_t n)52 sort_typeBstar(const sauchar_t *T, saidx_it SA,
53                saidx_t *bucket_A, saidx_t *bucket_B,
54                saidx_t n) {
55   saidx_it PAb, ISAb, buf;
56   saidx_t i, j, k, t, m, bufsize;
57   saint_t c0, c1;
58 
59   /* Initialize bucket arrays. */
60   for(i = 0; i < static_cast<saidx_t>(BUCKET_A_SIZE); ++i) { bucket_A[i] = 0; }
61   for(i = 0; i < static_cast<saidx_t>(BUCKET_B_SIZE); ++i) { bucket_B[i] = 0; }
62 
63   /* Count the number of occurrences of the first one or two characters of each
64      type A, B and B* suffix. Moreover, store the beginning position of all
65      type B* suffixes into the array SA. */
66   for(i = n - 1, m = n, c0 = T[n - 1]; 0 <= i;) {
67     /* type A suffix. */
68     do { ++BUCKET_A(c1 = c0); } while((0 <= --i) && ((c0 = T[i]) >= c1));
69     if(0 <= i) {
70       /* type B* suffix. */
71       ++BUCKET_BSTAR(c0, c1);
72       SA[--m] = i;
73       /* type B suffix. */
74       for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) {
75         ++BUCKET_B(c0, c1);
76       }
77     }
78   }
79   m = n - m;
80 /*
81 note:
82   A type B* suffix is lexicographically smaller than a type B suffix that
83   begins with the same first two characters.
84 */
85 
86   /* Calculate the index of start/end point of each bucket. */
87   for(c0 = 0, i = 0, j = 0; c0 < static_cast<saint_t>(ALPHABET_SIZE); ++c0) {
88     t = i + BUCKET_A(c0);
89     BUCKET_A(c0) = i + j; /* start point */
90     i = t + BUCKET_B(c0, c0);
91     for(c1 = c0 + 1; c1 < static_cast<saint_t>(ALPHABET_SIZE); ++c1) {
92       j += BUCKET_BSTAR(c0, c1);
93       BUCKET_BSTAR(c0, c1) = j; /* end point */
94       i += BUCKET_B(c0, c1);
95     }
96   }
97 
98   if(0 < m) {
99     /* Sort the type B* suffixes by their first two characters. */
100     PAb = SA + n - m; ISAb = SA + m;
101     for(i = m - 2; 0 <= i; --i) {
102       t = PAb[i], c0 = T[t], c1 = T[t + 1];
103       SA[--BUCKET_BSTAR(c0, c1)] = i;
104     }
105     t = PAb[m - 1], c0 = T[t], c1 = T[t + 1];
106     SA[--BUCKET_BSTAR(c0, c1)] = m - 1;
107 
108     /* Sort the type B* substrings using sssort. */
109     buf = SA + m, bufsize = n - (2 * m);
110     for(c0 = ALPHABET_SIZE - 2, j = m; 0 < j; --c0) {
111       for(c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) {
112         i = BUCKET_BSTAR(c0, c1);
113         if(1 < (j - i)) {
114           sssort(T, PAb, SA + i, SA + j,
115                  buf, bufsize, 2, n, *(SA + i) == (m - 1));
116         }
117       }
118     }
119 
120     /* Compute ranks of type B* substrings. */
121     for(i = m - 1; 0 <= i; --i) {
122       if(0 <= SA[i]) {
123         j = i;
124         do { ISAb[SA[i]] = i; } while((0 <= --i) && (0 <= SA[i]));
125         SA[i + 1] = i - j;
126         if(i <= 0) { break; }
127       }
128       j = i;
129       do { ISAb[SA[i] = ~SA[i]] = j; } while(SA[--i] < 0);
130       ISAb[SA[i]] = j;
131     }
132 
133     /* Construct the inverse suffix array of type B* suffixes using trsort. */
134     trsort(ISAb, SA, m, 1);
135 
136     /* Set the sorted order of tyoe B* suffixes. */
137     for(i = n - 1, j = m, c0 = T[n - 1]; 0 <= i;) {
138       for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) >= c1); --i, c1 = c0) { }
139       if(0 <= i) {
140         t = i;
141         for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) { }
142         SA[ISAb[--j]] = ((t == 0) || (1 < (t - i))) ? t : ~t;
143       }
144     }
145 
146     /* Calculate the index of start/end point of each bucket. */
147     BUCKET_B(ALPHABET_SIZE - 1, ALPHABET_SIZE - 1) = n; /* end point */
148     for(c0 = ALPHABET_SIZE - 2, k = m - 1; 0 <= c0; --c0) {
149       i = BUCKET_A(c0 + 1) - 1;
150       for(c1 = ALPHABET_SIZE - 1; c0 < c1; --c1) {
151         t = i - BUCKET_B(c0, c1);
152         BUCKET_B(c0, c1) = i; /* end point */
153 
154         /* Move all type B* suffixes to the correct position. */
155         for(i = t, j = BUCKET_BSTAR(c0, c1);
156             j <= k;
157             --i, --k) { SA[i] = SA[k]; }
158       }
159       BUCKET_BSTAR(c0, c0 + 1) = i - BUCKET_B(c0, c0) + 1; /* start point */
160       BUCKET_B(c0, c0) = i; /* end point */
161     }
162   }
163 
164   return m;
165 }
166 
167 /* Constructs the suffix array by using the sorted order of type B* suffixes. */
168 void
construct_SA(const sauchar_t * T,saidx_it SA,saidx_t * bucket_A,saidx_t * bucket_B,saidx_t n,saidx_t m)169 construct_SA(const sauchar_t *T, saidx_it SA,
170              saidx_t *bucket_A, saidx_t *bucket_B,
171              saidx_t n, saidx_t m) {
172   saidx_it i, j, k;
173   saidx_t s;
174   saint_t c0, c1, c2;
175 
176   if(0 < m) {
177     /* Construct the sorted order of type B suffixes by using
178        the sorted order of type B* suffixes. */
179     for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
180       /* Scan the suffix array from right to left. */
181       for (i = SA + BUCKET_BSTAR(c1, c1 + 1), j = SA + BUCKET_A(c1 + 1) - 1,
182           k = nullptr, c2 = -1;
183            i <= j; --j) {
184         if(0 < (s = *j)) {
185           assert(T[s] == c1);
186           assert(((s + 1) < n) && (T[s] <= T[s + 1]));
187           assert(T[s - 1] <= T[s]);
188           *j = ~s;
189           c0 = T[--s];
190           if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
191           if(c0 != c2) {
192             if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
193             k = SA + BUCKET_B(c2 = c0, c1);
194           }
195           assert(k < j);
196           *k-- = s;
197         } else {
198           assert(((s == 0) && (T[s] == c1)) || (s < 0));
199           *j = ~s;
200         }
201       }
202     }
203   }
204 
205   /* Construct the suffix array by using
206      the sorted order of type B suffixes. */
207   k = SA + BUCKET_A(c2 = T[n - 1]);
208   *k++ = (T[n - 2] < c2) ? ~(n - 1) : (n - 1);
209   /* Scan the suffix array from left to right. */
210   for(i = SA, j = SA + n; i < j; ++i) {
211     if(0 < (s = *i)) {
212       assert(T[s - 1] >= T[s]);
213       c0 = T[--s];
214       if((s == 0) || (T[s - 1] < c0)) { s = ~s; }
215       if(c0 != c2) {
216         BUCKET_A(c2) = k - SA;
217         k = SA + BUCKET_A(c2 = c0);
218       }
219       assert(i < k);
220       *k++ = s;
221     } else {
222       assert(s < 0);
223       *i = ~s;
224     }
225   }
226 }
227 
228 }  // namespace
229 
230 /*---------------------------------------------------------------------------*/
231 
232 /*- Function -*/
233 
234 saint_t
divsufsort(const sauchar_t * T,saidx_it SA,saidx_t n)235 divsufsort(const sauchar_t *T, saidx_it SA, saidx_t n) {
236   saidx_t *bucket_A, *bucket_B;
237   saidx_t m;
238   saint_t err = 0;
239 
240   /* Check arguments. */
241   if ((T == nullptr) || (SA == nullptr) || (n < 0)) {
242     return -1;
243   } else if (n == 0) {
244     return 0;
245   } else if (n == 1) {
246     SA[0] = 0;
247     return 0;
248   } else if (n == 2) {
249     m = (T[0] < T[1]);
250     SA[m ^ 1] = 0, SA[m] = 1;
251     return 0;
252   }
253 
254   bucket_A = (saidx_t *)malloc(BUCKET_A_SIZE * sizeof(saidx_t));
255   bucket_B = (saidx_t *)malloc(BUCKET_B_SIZE * sizeof(saidx_t));
256 
257   /* Suffixsort. */
258   if ((bucket_A != nullptr) && (bucket_B != nullptr)) {
259     m = sort_typeBstar(T, SA, bucket_A, bucket_B, n);
260     construct_SA(T, SA, bucket_A, bucket_B, n, m);
261   } else {
262     err = -2;
263   }
264 
265   free(bucket_B);
266   free(bucket_A);
267 
268   return err;
269 }
270 
divsufsort_include_empty(const sauchar_t * T,saidx_it SA,saidx_t n)271 saint_t divsufsort_include_empty(const sauchar_t *T, saidx_it SA, saidx_t n) {
272   SA[0] = n;  // Manually add the empty string suffix.
273   return divsufsort(T, SA + 1, n);
274 }
275 
276 }  // namespace divsuf
277