1 /* Byte-wise substring search, using the Two-Way algorithm.
2  * Copyright (C) 2008, 2010 Eric Blake
3  * Permission to use, copy, modify, and distribute this software
4  * is freely granted, provided that this notice is preserved.
5  */
6 
7 
8 /* Before including this file, you need to include <string.h>, and define:
9      RETURN_TYPE		A macro that expands to the return type.
10      AVAILABLE(h, h_l, j, n_l)	A macro that returns nonzero if there are
11 				at least N_L bytes left starting at
12 				H[J].  H is 'unsigned char *', H_L, J,
13 				and N_L are 'size_t'; H_L is an
14 				lvalue.  For NUL-terminated searches,
15 				H_L can be modified each iteration to
16 				avoid having to compute the end of H
17 				up front.
18 
19   For case-insensitivity, you may optionally define:
20      CMP_FUNC(p1, p2, l)	A macro that returns 0 iff the first L
21 				characters of P1 and P2 are equal.
22      CANON_ELEMENT(c)		A macro that canonicalizes an element
23 				right after it has been fetched from
24 				one of the two strings.  The argument
25 				is an 'unsigned char'; the result must
26 				be an 'unsigned char' as well.
27 
28   This file undefines the macros documented above, and defines
29   LONG_NEEDLE_THRESHOLD.
30 */
31 
32 #include <limits.h>
33 
34 /*
35   Python 2.7 (the only Python 2.x version supported as of now and until 2020)
36   is built on windows with Visual Studio 2008 C compiler. That dictates that
37   the compiler which must be used by authors of third party Python modules.
38   See https://mail.python.org/pipermail/distutils-sig/2014-September/024885.html
39 
40   Unfortunately this version of Visual Studio doesn't claim to be C99 compatible
41   and in particular it lacks the stdint.h header. So we have to replace it with
42   a public domain version.
43 
44   Visual Studio 2010 and later have stdint.h.
45 */
46 
47 #ifdef _MSC_VER
48   #if _MSC_VER <= 1500
49     #include "win32/stdint.h"
50   #endif
51 #else
52   #include <stdint.h>
53 #endif
54 
55 /* We use the Two-Way string matching algorithm, which guarantees
56    linear complexity with constant space.  Additionally, for long
57    needles, we also use a bad character shift table similar to the
58    Boyer-Moore algorithm to achieve improved (potentially sub-linear)
59    performance.
60 
61    See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260
62    and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
63 */
64 
65 /* Point at which computing a bad-byte shift table is likely to be
66    worthwhile.  Small needles should not compute a table, since it
67    adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a
68    speedup no greater than a factor of NEEDLE_LEN.  The larger the
69    needle, the better the potential performance gain.  On the other
70    hand, on non-POSIX systems with CHAR_BIT larger than eight, the
71    memory required for the table is prohibitive.  */
72 #if CHAR_BIT < 10
73 # define LONG_NEEDLE_THRESHOLD 32U
74 #else
75 # define LONG_NEEDLE_THRESHOLD SIZE_MAX
76 #endif
77 
78 #define MAX(a, b) ((a < b) ? (b) : (a))
79 
80 #ifndef CANON_ELEMENT
81 # define CANON_ELEMENT(c) c
82 #endif
83 #ifndef CMP_FUNC
84 # define CMP_FUNC memcmp
85 #endif
86 
87 /* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN.
88    Return the index of the first byte in the right half, and set
89    *PERIOD to the global period of the right half.
90 
91    The global period of a string is the smallest index (possibly its
92    length) at which all remaining bytes in the string are repetitions
93    of the prefix (the last repetition may be a subset of the prefix).
94 
95    When NEEDLE is factored into two halves, a local period is the
96    length of the smallest word that shares a suffix with the left half
97    and shares a prefix with the right half.  All factorizations of a
98    non-empty NEEDLE have a local period of at least 1 and no greater
99    than NEEDLE_LEN.
100 
101    A critical factorization has the property that the local period
102    equals the global period.  All strings have at least one critical
103    factorization with the left half smaller than the global period.
104 
105    Given an ordered alphabet, a critical factorization can be computed
106    in linear time, with 2 * NEEDLE_LEN comparisons, by computing the
107    larger of two ordered maximal suffixes.  The ordered maximal
108    suffixes are determined by lexicographic comparison of
109    periodicity.  */
110 static size_t
critical_factorization(const unsigned char * needle,size_t needle_len,size_t * period)111 critical_factorization (const unsigned char *needle, size_t needle_len,
112 			size_t *period)
113 {
114   /* Index of last byte of left half, or SIZE_MAX.  */
115   size_t max_suffix, max_suffix_rev;
116   size_t j; /* Index into NEEDLE for current candidate suffix.  */
117   size_t k; /* Offset into current period.  */
118   size_t p; /* Intermediate period.  */
119   unsigned char a, b; /* Current comparison bytes.  */
120 
121   /* Invariants:
122      0 <= j < NEEDLE_LEN - 1
123      -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed)
124      min(max_suffix, max_suffix_rev) < global period of NEEDLE
125      1 <= p <= global period of NEEDLE
126      p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j]
127      1 <= k <= p
128   */
129 
130   /* Perform lexicographic search.  */
131   max_suffix = SIZE_MAX;
132   j = 0;
133   k = p = 1;
134   while (j + k < needle_len)
135     {
136       a = CANON_ELEMENT (needle[j + k]);
137       b = CANON_ELEMENT (needle[(size_t)(max_suffix + k)]);
138       if (a < b)
139 	{
140 	  /* Suffix is smaller, period is entire prefix so far.  */
141 	  j += k;
142 	  k = 1;
143 	  p = j - max_suffix;
144 	}
145       else if (a == b)
146 	{
147 	  /* Advance through repetition of the current period.  */
148 	  if (k != p)
149 	    ++k;
150 	  else
151 	    {
152 	      j += p;
153 	      k = 1;
154 	    }
155 	}
156       else /* b < a */
157 	{
158 	  /* Suffix is larger, start over from current location.  */
159 	  max_suffix = j++;
160 	  k = p = 1;
161 	}
162     }
163   *period = p;
164 
165   /* Perform reverse lexicographic search.  */
166   max_suffix_rev = SIZE_MAX;
167   j = 0;
168   k = p = 1;
169   while (j + k < needle_len)
170     {
171       a = CANON_ELEMENT (needle[j + k]);
172       b = CANON_ELEMENT (needle[max_suffix_rev + k]);
173       if (b < a)
174 	{
175 	  /* Suffix is smaller, period is entire prefix so far.  */
176 	  j += k;
177 	  k = 1;
178 	  p = j - max_suffix_rev;
179 	}
180       else if (a == b)
181 	{
182 	  /* Advance through repetition of the current period.  */
183 	  if (k != p)
184 	    ++k;
185 	  else
186 	    {
187 	      j += p;
188 	      k = 1;
189 	    }
190 	}
191       else /* a < b */
192 	{
193 	  /* Suffix is larger, start over from current location.  */
194 	  max_suffix_rev = j++;
195 	  k = p = 1;
196 	}
197     }
198 
199   /* Choose the longer suffix.  Return the first byte of the right
200      half, rather than the last byte of the left half.  */
201   if (max_suffix_rev + 1 < max_suffix + 1)
202     return max_suffix + 1;
203   *period = p;
204   return max_suffix_rev + 1;
205 }
206 
207 /* Return the first location of non-empty NEEDLE within HAYSTACK, or
208    NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
209    method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD.
210    Performance is guaranteed to be linear, with an initialization cost
211    of 2 * NEEDLE_LEN comparisons.
212 
213    If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
214    most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.
215    If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
216    HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.  */
217 static RETURN_TYPE
two_way_short_needle(const unsigned char * haystack,size_t haystack_len,const unsigned char * needle,size_t needle_len)218 two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
219 		      const unsigned char *needle, size_t needle_len)
220 {
221   size_t i; /* Index into current byte of NEEDLE.  */
222   size_t j; /* Index into current window of HAYSTACK.  */
223   size_t period; /* The period of the right half of needle.  */
224   size_t suffix; /* The index of the right half of needle.  */
225 
226   /* Factor the needle into two halves, such that the left half is
227      smaller than the global period, and the right half is
228      periodic (with a period as large as NEEDLE_LEN - suffix).  */
229   suffix = critical_factorization (needle, needle_len, &period);
230 
231   /* Perform the search.  Each iteration compares the right half
232      first.  */
233   if (CMP_FUNC (needle, needle + period, suffix) == 0)
234     {
235       /* Entire needle is periodic; a mismatch can only advance by the
236 	 period, so use memory to avoid rescanning known occurrences
237 	 of the period.  */
238       size_t memory = 0;
239       j = 0;
240       while (AVAILABLE (haystack, haystack_len, j, needle_len))
241 	{
242 	  /* Scan for matches in right half.  */
243 	  i = MAX (suffix, memory);
244 	  while (i < needle_len && (CANON_ELEMENT (needle[i])
245 				    == CANON_ELEMENT (haystack[i + j])))
246 	    ++i;
247 	  if (needle_len <= i)
248 	    {
249 	      /* Scan for matches in left half.  */
250 	      i = suffix - 1;
251 	      while (memory < i + 1 && (CANON_ELEMENT (needle[i])
252 					== CANON_ELEMENT (haystack[i + j])))
253 		--i;
254 	      if (i + 1 < memory + 1)
255 		return (RETURN_TYPE) (haystack + j);
256 	      /* No match, so remember how many repetitions of period
257 		 on the right half were scanned.  */
258 	      j += period;
259 	      memory = needle_len - period;
260 	    }
261 	  else
262 	    {
263 	      j += i - suffix + 1;
264 	      memory = 0;
265 	    }
266 	}
267     }
268   else
269     {
270       /* The two halves of needle are distinct; no extra memory is
271 	 required, and any mismatch results in a maximal shift.  */
272       period = MAX (suffix, needle_len - suffix) + 1;
273       j = 0;
274       while (AVAILABLE (haystack, haystack_len, j, needle_len))
275 	{
276 	  /* Scan for matches in right half.  */
277 	  i = suffix;
278 	  while (i < needle_len && (CANON_ELEMENT (needle[i])
279 				    == CANON_ELEMENT (haystack[i + j])))
280 	    ++i;
281 	  if (needle_len <= i)
282 	    {
283 	      /* Scan for matches in left half.  */
284 	      i = suffix - 1;
285 	      while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
286 				       == CANON_ELEMENT (haystack[i + j])))
287 		--i;
288 	      if (i == SIZE_MAX)
289 		return (RETURN_TYPE) (haystack + j);
290 	      j += period;
291 	    }
292 	  else
293 	    j += i - suffix + 1;
294 	}
295     }
296   return NULL;
297 }
298 
299 /* Return the first location of non-empty NEEDLE within HAYSTACK, or
300    NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
301    method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN.
302    Performance is guaranteed to be linear, with an initialization cost
303    of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations.
304 
305    If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
306    most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching,
307    and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible.
308    If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
309    HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
310    sublinear performance is not possible.  */
311 static RETURN_TYPE
two_way_long_needle(const unsigned char * haystack,size_t haystack_len,const unsigned char * needle,size_t needle_len)312 two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
313 		     const unsigned char *needle, size_t needle_len)
314 {
315   size_t i; /* Index into current byte of NEEDLE.  */
316   size_t j; /* Index into current window of HAYSTACK.  */
317   size_t period; /* The period of the right half of needle.  */
318   size_t suffix; /* The index of the right half of needle.  */
319   size_t shift_table[1U << CHAR_BIT]; /* See below.  */
320 
321   /* Factor the needle into two halves, such that the left half is
322      smaller than the global period, and the right half is
323      periodic (with a period as large as NEEDLE_LEN - suffix).  */
324   suffix = critical_factorization (needle, needle_len, &period);
325 
326   /* Populate shift_table.  For each possible byte value c,
327      shift_table[c] is the distance from the last occurrence of c to
328      the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE.
329      shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0.  */
330   for (i = 0; i < 1U << CHAR_BIT; i++)
331     shift_table[i] = needle_len;
332   for (i = 0; i < needle_len; i++)
333     shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1;
334 
335   /* Perform the search.  Each iteration compares the right half
336      first.  */
337   if (CMP_FUNC (needle, needle + period, suffix) == 0)
338     {
339       /* Entire needle is periodic; a mismatch can only advance by the
340 	 period, so use memory to avoid rescanning known occurrences
341 	 of the period.  */
342       size_t memory = 0;
343       size_t shift;
344       j = 0;
345       while (AVAILABLE (haystack, haystack_len, j, needle_len))
346 	{
347 	  /* Check the last byte first; if it does not match, then
348 	     shift to the next possible match location.  */
349 	  shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
350 	  if (0 < shift)
351 	    {
352 	      if (memory && shift < period)
353 		{
354 		  /* Since needle is periodic, but the last period has
355 		     a byte out of place, there can be no match until
356 		     after the mismatch.  */
357 		  shift = needle_len - period;
358 		}
359 	      memory = 0;
360 	      j += shift;
361 	      continue;
362 	    }
363 	  /* Scan for matches in right half.  The last byte has
364 	     already been matched, by virtue of the shift table.  */
365 	  i = MAX (suffix, memory);
366 	  while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
367 					== CANON_ELEMENT (haystack[i + j])))
368 	    ++i;
369 	  if (needle_len - 1 <= i)
370 	    {
371 	      /* Scan for matches in left half.  */
372 	      i = suffix - 1;
373 	      while (memory < i + 1 && (CANON_ELEMENT (needle[i])
374 					== CANON_ELEMENT (haystack[i + j])))
375 		--i;
376 	      if (i + 1 < memory + 1)
377 		return (RETURN_TYPE) (haystack + j);
378 	      /* No match, so remember how many repetitions of period
379 		 on the right half were scanned.  */
380 	      j += period;
381 	      memory = needle_len - period;
382 	    }
383 	  else
384 	    {
385 	      j += i - suffix + 1;
386 	      memory = 0;
387 	    }
388 	}
389     }
390   else
391     {
392       /* The two halves of needle are distinct; no extra memory is
393 	 required, and any mismatch results in a maximal shift.  */
394       size_t shift;
395       period = MAX (suffix, needle_len - suffix) + 1;
396       j = 0;
397       while (AVAILABLE (haystack, haystack_len, j, needle_len))
398 	{
399 	  /* Check the last byte first; if it does not match, then
400 	     shift to the next possible match location.  */
401 	  shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
402 	  if (0 < shift)
403 	    {
404 	      j += shift;
405 	      continue;
406 	    }
407 	  /* Scan for matches in right half.  The last byte has
408 	     already been matched, by virtue of the shift table.  */
409 	  i = suffix;
410 	  while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
411 					== CANON_ELEMENT (haystack[i + j])))
412 	    ++i;
413 	  if (needle_len - 1 <= i)
414 	    {
415 	      /* Scan for matches in left half.  */
416 	      i = suffix - 1;
417 	      while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
418 				       == CANON_ELEMENT (haystack[i + j])))
419 		--i;
420 	      if (i == SIZE_MAX)
421 		return (RETURN_TYPE) (haystack + j);
422 	      j += period;
423 	    }
424 	  else
425 	    j += i - suffix + 1;
426 	}
427     }
428   return NULL;
429 }
430 
431 #undef AVAILABLE
432 #undef CANON_ELEMENT
433 #undef CMP_FUNC
434 #undef MAX
435 #undef RETURN_TYPE
436