1 /* Byte-wise substring search, using the Two-Way algorithm.
2 * Copyright (C) 2008, 2010 Eric Blake
3 * Permission to use, copy, modify, and distribute this software
4 * is freely granted, provided that this notice is preserved.
5 */
6
7
8 /* Before including this file, you need to include <string.h>, and define:
9 RETURN_TYPE A macro that expands to the return type.
10 AVAILABLE(h, h_l, j, n_l) A macro that returns nonzero if there are
11 at least N_L bytes left starting at
12 H[J]. H is 'unsigned char *', H_L, J,
13 and N_L are 'size_t'; H_L is an
14 lvalue. For NUL-terminated searches,
15 H_L can be modified each iteration to
16 avoid having to compute the end of H
17 up front.
18
19 For case-insensitivity, you may optionally define:
20 CMP_FUNC(p1, p2, l) A macro that returns 0 iff the first L
21 characters of P1 and P2 are equal.
22 CANON_ELEMENT(c) A macro that canonicalizes an element
23 right after it has been fetched from
24 one of the two strings. The argument
25 is an 'unsigned char'; the result must
26 be an 'unsigned char' as well.
27
28 This file undefines the macros documented above, and defines
29 LONG_NEEDLE_THRESHOLD.
30 */
31
32 #include <limits.h>
33
34 /*
35 Python 2.7 (the only Python 2.x version supported as of now and until 2020)
36 is built on windows with Visual Studio 2008 C compiler. That dictates that
37 the compiler which must be used by authors of third party Python modules.
38 See https://mail.python.org/pipermail/distutils-sig/2014-September/024885.html
39
40 Unfortunately this version of Visual Studio doesn't claim to be C99 compatible
41 and in particular it lacks the stdint.h header. So we have to replace it with
42 a public domain version.
43
44 Visual Studio 2010 and later have stdint.h.
45 */
46
47 #ifdef _MSC_VER
48 #if _MSC_VER <= 1500
49 #include "win32/stdint.h"
50 #endif
51 #else
52 #include <stdint.h>
53 #endif
54
55 /* We use the Two-Way string matching algorithm, which guarantees
56 linear complexity with constant space. Additionally, for long
57 needles, we also use a bad character shift table similar to the
58 Boyer-Moore algorithm to achieve improved (potentially sub-linear)
59 performance.
60
61 See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260
62 and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
63 */
64
65 /* Point at which computing a bad-byte shift table is likely to be
66 worthwhile. Small needles should not compute a table, since it
67 adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a
68 speedup no greater than a factor of NEEDLE_LEN. The larger the
69 needle, the better the potential performance gain. On the other
70 hand, on non-POSIX systems with CHAR_BIT larger than eight, the
71 memory required for the table is prohibitive. */
72 #if CHAR_BIT < 10
73 # define LONG_NEEDLE_THRESHOLD 32U
74 #else
75 # define LONG_NEEDLE_THRESHOLD SIZE_MAX
76 #endif
77
78 #define MAX(a, b) ((a < b) ? (b) : (a))
79
80 #ifndef CANON_ELEMENT
81 # define CANON_ELEMENT(c) c
82 #endif
83 #ifndef CMP_FUNC
84 # define CMP_FUNC memcmp
85 #endif
86
87 /* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN.
88 Return the index of the first byte in the right half, and set
89 *PERIOD to the global period of the right half.
90
91 The global period of a string is the smallest index (possibly its
92 length) at which all remaining bytes in the string are repetitions
93 of the prefix (the last repetition may be a subset of the prefix).
94
95 When NEEDLE is factored into two halves, a local period is the
96 length of the smallest word that shares a suffix with the left half
97 and shares a prefix with the right half. All factorizations of a
98 non-empty NEEDLE have a local period of at least 1 and no greater
99 than NEEDLE_LEN.
100
101 A critical factorization has the property that the local period
102 equals the global period. All strings have at least one critical
103 factorization with the left half smaller than the global period.
104
105 Given an ordered alphabet, a critical factorization can be computed
106 in linear time, with 2 * NEEDLE_LEN comparisons, by computing the
107 larger of two ordered maximal suffixes. The ordered maximal
108 suffixes are determined by lexicographic comparison of
109 periodicity. */
110 static size_t
critical_factorization(const unsigned char * needle,size_t needle_len,size_t * period)111 critical_factorization (const unsigned char *needle, size_t needle_len,
112 size_t *period)
113 {
114 /* Index of last byte of left half, or SIZE_MAX. */
115 size_t max_suffix, max_suffix_rev;
116 size_t j; /* Index into NEEDLE for current candidate suffix. */
117 size_t k; /* Offset into current period. */
118 size_t p; /* Intermediate period. */
119 unsigned char a, b; /* Current comparison bytes. */
120
121 /* Invariants:
122 0 <= j < NEEDLE_LEN - 1
123 -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed)
124 min(max_suffix, max_suffix_rev) < global period of NEEDLE
125 1 <= p <= global period of NEEDLE
126 p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j]
127 1 <= k <= p
128 */
129
130 /* Perform lexicographic search. */
131 max_suffix = SIZE_MAX;
132 j = 0;
133 k = p = 1;
134 while (j + k < needle_len)
135 {
136 a = CANON_ELEMENT (needle[j + k]);
137 b = CANON_ELEMENT (needle[(size_t)(max_suffix + k)]);
138 if (a < b)
139 {
140 /* Suffix is smaller, period is entire prefix so far. */
141 j += k;
142 k = 1;
143 p = j - max_suffix;
144 }
145 else if (a == b)
146 {
147 /* Advance through repetition of the current period. */
148 if (k != p)
149 ++k;
150 else
151 {
152 j += p;
153 k = 1;
154 }
155 }
156 else /* b < a */
157 {
158 /* Suffix is larger, start over from current location. */
159 max_suffix = j++;
160 k = p = 1;
161 }
162 }
163 *period = p;
164
165 /* Perform reverse lexicographic search. */
166 max_suffix_rev = SIZE_MAX;
167 j = 0;
168 k = p = 1;
169 while (j + k < needle_len)
170 {
171 a = CANON_ELEMENT (needle[j + k]);
172 b = CANON_ELEMENT (needle[max_suffix_rev + k]);
173 if (b < a)
174 {
175 /* Suffix is smaller, period is entire prefix so far. */
176 j += k;
177 k = 1;
178 p = j - max_suffix_rev;
179 }
180 else if (a == b)
181 {
182 /* Advance through repetition of the current period. */
183 if (k != p)
184 ++k;
185 else
186 {
187 j += p;
188 k = 1;
189 }
190 }
191 else /* a < b */
192 {
193 /* Suffix is larger, start over from current location. */
194 max_suffix_rev = j++;
195 k = p = 1;
196 }
197 }
198
199 /* Choose the longer suffix. Return the first byte of the right
200 half, rather than the last byte of the left half. */
201 if (max_suffix_rev + 1 < max_suffix + 1)
202 return max_suffix + 1;
203 *period = p;
204 return max_suffix_rev + 1;
205 }
206
207 /* Return the first location of non-empty NEEDLE within HAYSTACK, or
208 NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This
209 method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD.
210 Performance is guaranteed to be linear, with an initialization cost
211 of 2 * NEEDLE_LEN comparisons.
212
213 If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
214 most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.
215 If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
216 HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. */
217 static RETURN_TYPE
two_way_short_needle(const unsigned char * haystack,size_t haystack_len,const unsigned char * needle,size_t needle_len)218 two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
219 const unsigned char *needle, size_t needle_len)
220 {
221 size_t i; /* Index into current byte of NEEDLE. */
222 size_t j; /* Index into current window of HAYSTACK. */
223 size_t period; /* The period of the right half of needle. */
224 size_t suffix; /* The index of the right half of needle. */
225
226 /* Factor the needle into two halves, such that the left half is
227 smaller than the global period, and the right half is
228 periodic (with a period as large as NEEDLE_LEN - suffix). */
229 suffix = critical_factorization (needle, needle_len, &period);
230
231 /* Perform the search. Each iteration compares the right half
232 first. */
233 if (CMP_FUNC (needle, needle + period, suffix) == 0)
234 {
235 /* Entire needle is periodic; a mismatch can only advance by the
236 period, so use memory to avoid rescanning known occurrences
237 of the period. */
238 size_t memory = 0;
239 j = 0;
240 while (AVAILABLE (haystack, haystack_len, j, needle_len))
241 {
242 /* Scan for matches in right half. */
243 i = MAX (suffix, memory);
244 while (i < needle_len && (CANON_ELEMENT (needle[i])
245 == CANON_ELEMENT (haystack[i + j])))
246 ++i;
247 if (needle_len <= i)
248 {
249 /* Scan for matches in left half. */
250 i = suffix - 1;
251 while (memory < i + 1 && (CANON_ELEMENT (needle[i])
252 == CANON_ELEMENT (haystack[i + j])))
253 --i;
254 if (i + 1 < memory + 1)
255 return (RETURN_TYPE) (haystack + j);
256 /* No match, so remember how many repetitions of period
257 on the right half were scanned. */
258 j += period;
259 memory = needle_len - period;
260 }
261 else
262 {
263 j += i - suffix + 1;
264 memory = 0;
265 }
266 }
267 }
268 else
269 {
270 /* The two halves of needle are distinct; no extra memory is
271 required, and any mismatch results in a maximal shift. */
272 period = MAX (suffix, needle_len - suffix) + 1;
273 j = 0;
274 while (AVAILABLE (haystack, haystack_len, j, needle_len))
275 {
276 /* Scan for matches in right half. */
277 i = suffix;
278 while (i < needle_len && (CANON_ELEMENT (needle[i])
279 == CANON_ELEMENT (haystack[i + j])))
280 ++i;
281 if (needle_len <= i)
282 {
283 /* Scan for matches in left half. */
284 i = suffix - 1;
285 while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
286 == CANON_ELEMENT (haystack[i + j])))
287 --i;
288 if (i == SIZE_MAX)
289 return (RETURN_TYPE) (haystack + j);
290 j += period;
291 }
292 else
293 j += i - suffix + 1;
294 }
295 }
296 return NULL;
297 }
298
299 /* Return the first location of non-empty NEEDLE within HAYSTACK, or
300 NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This
301 method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN.
302 Performance is guaranteed to be linear, with an initialization cost
303 of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations.
304
305 If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
306 most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching,
307 and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible.
308 If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
309 HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
310 sublinear performance is not possible. */
311 static RETURN_TYPE
two_way_long_needle(const unsigned char * haystack,size_t haystack_len,const unsigned char * needle,size_t needle_len)312 two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
313 const unsigned char *needle, size_t needle_len)
314 {
315 size_t i; /* Index into current byte of NEEDLE. */
316 size_t j; /* Index into current window of HAYSTACK. */
317 size_t period; /* The period of the right half of needle. */
318 size_t suffix; /* The index of the right half of needle. */
319 size_t shift_table[1U << CHAR_BIT]; /* See below. */
320
321 /* Factor the needle into two halves, such that the left half is
322 smaller than the global period, and the right half is
323 periodic (with a period as large as NEEDLE_LEN - suffix). */
324 suffix = critical_factorization (needle, needle_len, &period);
325
326 /* Populate shift_table. For each possible byte value c,
327 shift_table[c] is the distance from the last occurrence of c to
328 the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE.
329 shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0. */
330 for (i = 0; i < 1U << CHAR_BIT; i++)
331 shift_table[i] = needle_len;
332 for (i = 0; i < needle_len; i++)
333 shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1;
334
335 /* Perform the search. Each iteration compares the right half
336 first. */
337 if (CMP_FUNC (needle, needle + period, suffix) == 0)
338 {
339 /* Entire needle is periodic; a mismatch can only advance by the
340 period, so use memory to avoid rescanning known occurrences
341 of the period. */
342 size_t memory = 0;
343 size_t shift;
344 j = 0;
345 while (AVAILABLE (haystack, haystack_len, j, needle_len))
346 {
347 /* Check the last byte first; if it does not match, then
348 shift to the next possible match location. */
349 shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
350 if (0 < shift)
351 {
352 if (memory && shift < period)
353 {
354 /* Since needle is periodic, but the last period has
355 a byte out of place, there can be no match until
356 after the mismatch. */
357 shift = needle_len - period;
358 }
359 memory = 0;
360 j += shift;
361 continue;
362 }
363 /* Scan for matches in right half. The last byte has
364 already been matched, by virtue of the shift table. */
365 i = MAX (suffix, memory);
366 while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
367 == CANON_ELEMENT (haystack[i + j])))
368 ++i;
369 if (needle_len - 1 <= i)
370 {
371 /* Scan for matches in left half. */
372 i = suffix - 1;
373 while (memory < i + 1 && (CANON_ELEMENT (needle[i])
374 == CANON_ELEMENT (haystack[i + j])))
375 --i;
376 if (i + 1 < memory + 1)
377 return (RETURN_TYPE) (haystack + j);
378 /* No match, so remember how many repetitions of period
379 on the right half were scanned. */
380 j += period;
381 memory = needle_len - period;
382 }
383 else
384 {
385 j += i - suffix + 1;
386 memory = 0;
387 }
388 }
389 }
390 else
391 {
392 /* The two halves of needle are distinct; no extra memory is
393 required, and any mismatch results in a maximal shift. */
394 size_t shift;
395 period = MAX (suffix, needle_len - suffix) + 1;
396 j = 0;
397 while (AVAILABLE (haystack, haystack_len, j, needle_len))
398 {
399 /* Check the last byte first; if it does not match, then
400 shift to the next possible match location. */
401 shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
402 if (0 < shift)
403 {
404 j += shift;
405 continue;
406 }
407 /* Scan for matches in right half. The last byte has
408 already been matched, by virtue of the shift table. */
409 i = suffix;
410 while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
411 == CANON_ELEMENT (haystack[i + j])))
412 ++i;
413 if (needle_len - 1 <= i)
414 {
415 /* Scan for matches in left half. */
416 i = suffix - 1;
417 while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
418 == CANON_ELEMENT (haystack[i + j])))
419 --i;
420 if (i == SIZE_MAX)
421 return (RETURN_TYPE) (haystack + j);
422 j += period;
423 }
424 else
425 j += i - suffix + 1;
426 }
427 }
428 return NULL;
429 }
430
431 #undef AVAILABLE
432 #undef CANON_ELEMENT
433 #undef CMP_FUNC
434 #undef MAX
435 #undef RETURN_TYPE
436