1 /* Searching in a string. 2 Copyright (C) 2005-2011 Free Software Foundation, Inc. 3 Written by Bruno Haible <bruno@clisp.org>, 2005. 4 5 This program is free software: you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 3 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 17 18 #include <config.h> 19 20 /* Specification. */ 21 #include <string.h> 22 23 #include <stdbool.h> 24 #include <stddef.h> /* for NULL, in case a nonstandard string.h lacks it */ 25 26 #include "malloca.h" 27 #include "mbuiter.h" 28 29 /* Knuth-Morris-Pratt algorithm. */ 30 #define UNIT unsigned char 31 #define CANON_ELEMENT(c) c 32 #include "str-kmp.h" 33 34 /* Knuth-Morris-Pratt algorithm. 35 See http://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm 36 Return a boolean indicating success: 37 Return true and set *RESULTP if the search was completed. 38 Return false if it was aborted because not enough memory was available. */ 39 static bool 40 knuth_morris_pratt_multibyte (const char *haystack, const char *needle, 41 const char **resultp) 42 { 43 size_t m = mbslen (needle); 44 mbchar_t *needle_mbchars; 45 size_t *table; 46 47 /* Allocate room for needle_mbchars and the table. */ 48 char *memory = (char *) nmalloca (m, sizeof (mbchar_t) + sizeof (size_t)); 49 if (memory == NULL) 50 return false; 51 needle_mbchars = (mbchar_t *) memory; 52 table = (size_t *) (memory + m * sizeof (mbchar_t)); 53 54 /* Fill needle_mbchars. */ 55 { 56 mbui_iterator_t iter; 57 size_t j; 58 59 j = 0; 60 for (mbui_init (iter, needle); mbui_avail (iter); mbui_advance (iter), j++) 61 mb_copy (&needle_mbchars[j], &mbui_cur (iter)); 62 } 63 64 /* Fill the table. 65 For 0 < i < m: 66 0 < table[i] <= i is defined such that 67 forall 0 < x < table[i]: needle[x..i-1] != needle[0..i-1-x], 68 and table[i] is as large as possible with this property. 69 This implies: 70 1) For 0 < i < m: 71 If table[i] < i, 72 needle[table[i]..i-1] = needle[0..i-1-table[i]]. 73 2) For 0 < i < m: 74 rhaystack[0..i-1] == needle[0..i-1] 75 and exists h, i <= h < m: rhaystack[h] != needle[h] 76 implies 77 forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1]. 78 table[0] remains uninitialized. */ 79 { 80 size_t i, j; 81 82 /* i = 1: Nothing to verify for x = 0. */ 83 table[1] = 1; 84 j = 0; 85 86 for (i = 2; i < m; i++) 87 { 88 /* Here: j = i-1 - table[i-1]. 89 The inequality needle[x..i-1] != needle[0..i-1-x] is known to hold 90 for x < table[i-1], by induction. 91 Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */ 92 mbchar_t *b = &needle_mbchars[i - 1]; 93 94 for (;;) 95 { 96 /* Invariants: The inequality needle[x..i-1] != needle[0..i-1-x] 97 is known to hold for x < i-1-j. 98 Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */ 99 if (mb_equal (*b, needle_mbchars[j])) 100 { 101 /* Set table[i] := i-1-j. */ 102 table[i] = i - ++j; 103 break; 104 } 105 /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds 106 for x = i-1-j, because 107 needle[i-1] != needle[j] = needle[i-1-x]. */ 108 if (j == 0) 109 { 110 /* The inequality holds for all possible x. */ 111 table[i] = i; 112 break; 113 } 114 /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds 115 for i-1-j < x < i-1-j+table[j], because for these x: 116 needle[x..i-2] 117 = needle[x-(i-1-j)..j-1] 118 != needle[0..j-1-(x-(i-1-j))] (by definition of table[j]) 119 = needle[0..i-2-x], 120 hence needle[x..i-1] != needle[0..i-1-x]. 121 Furthermore 122 needle[i-1-j+table[j]..i-2] 123 = needle[table[j]..j-1] 124 = needle[0..j-1-table[j]] (by definition of table[j]). */ 125 j = j - table[j]; 126 } 127 /* Here: j = i - table[i]. */ 128 } 129 } 130 131 /* Search, using the table to accelerate the processing. */ 132 { 133 size_t j; 134 mbui_iterator_t rhaystack; 135 mbui_iterator_t phaystack; 136 137 *resultp = NULL; 138 j = 0; 139 mbui_init (rhaystack, haystack); 140 mbui_init (phaystack, haystack); 141 /* Invariant: phaystack = rhaystack + j. */ 142 while (mbui_avail (phaystack)) 143 if (mb_equal (needle_mbchars[j], mbui_cur (phaystack))) 144 { 145 j++; 146 mbui_advance (phaystack); 147 if (j == m) 148 { 149 /* The entire needle has been found. */ 150 *resultp = mbui_cur_ptr (rhaystack); 151 break; 152 } 153 } 154 else if (j > 0) 155 { 156 /* Found a match of needle[0..j-1], mismatch at needle[j]. */ 157 size_t count = table[j]; 158 j -= count; 159 for (; count > 0; count--) 160 { 161 if (!mbui_avail (rhaystack)) 162 abort (); 163 mbui_advance (rhaystack); 164 } 165 } 166 else 167 { 168 /* Found a mismatch at needle[0] already. */ 169 if (!mbui_avail (rhaystack)) 170 abort (); 171 mbui_advance (rhaystack); 172 mbui_advance (phaystack); 173 } 174 } 175 176 freea (memory); 177 return true; 178 } 179 180 /* Find the first occurrence of the character string NEEDLE in the character 181 string HAYSTACK. Return NULL if NEEDLE is not found in HAYSTACK. */ 182 char * 183 mbsstr (const char *haystack, const char *needle) 184 { 185 /* Be careful not to look at the entire extent of haystack or needle 186 until needed. This is useful because of these two cases: 187 - haystack may be very long, and a match of needle found early, 188 - needle may be very long, and not even a short initial segment of 189 needle may be found in haystack. */ 190 if (MB_CUR_MAX > 1) 191 { 192 mbui_iterator_t iter_needle; 193 194 mbui_init (iter_needle, needle); 195 if (mbui_avail (iter_needle)) 196 { 197 /* Minimizing the worst-case complexity: 198 Let n = mbslen(haystack), m = mbslen(needle). 199 The naïve algorithm is O(n*m) worst-case. 200 The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a 201 memory allocation. 202 To achieve linear complexity and yet amortize the cost of the 203 memory allocation, we activate the Knuth-Morris-Pratt algorithm 204 only once the naïve algorithm has already run for some time; more 205 precisely, when 206 - the outer loop count is >= 10, 207 - the average number of comparisons per outer loop is >= 5, 208 - the total number of comparisons is >= m. 209 But we try it only once. If the memory allocation attempt failed, 210 we don't retry it. */ 211 bool try_kmp = true; 212 size_t outer_loop_count = 0; 213 size_t comparison_count = 0; 214 size_t last_ccount = 0; /* last comparison count */ 215 mbui_iterator_t iter_needle_last_ccount; /* = needle + last_ccount */ 216 217 mbui_iterator_t iter_haystack; 218 219 mbui_init (iter_needle_last_ccount, needle); 220 mbui_init (iter_haystack, haystack); 221 for (;; mbui_advance (iter_haystack)) 222 { 223 if (!mbui_avail (iter_haystack)) 224 /* No match. */ 225 return NULL; 226 227 /* See whether it's advisable to use an asymptotically faster 228 algorithm. */ 229 if (try_kmp 230 && outer_loop_count >= 10 231 && comparison_count >= 5 * outer_loop_count) 232 { 233 /* See if needle + comparison_count now reaches the end of 234 needle. */ 235 size_t count = comparison_count - last_ccount; 236 for (; 237 count > 0 && mbui_avail (iter_needle_last_ccount); 238 count--) 239 mbui_advance (iter_needle_last_ccount); 240 last_ccount = comparison_count; 241 if (!mbui_avail (iter_needle_last_ccount)) 242 { 243 /* Try the Knuth-Morris-Pratt algorithm. */ 244 const char *result; 245 bool success = 246 knuth_morris_pratt_multibyte (haystack, needle, 247 &result); 248 if (success) 249 return (char *) result; 250 try_kmp = false; 251 } 252 } 253 254 outer_loop_count++; 255 comparison_count++; 256 if (mb_equal (mbui_cur (iter_haystack), mbui_cur (iter_needle))) 257 /* The first character matches. */ 258 { 259 mbui_iterator_t rhaystack; 260 mbui_iterator_t rneedle; 261 262 memcpy (&rhaystack, &iter_haystack, sizeof (mbui_iterator_t)); 263 mbui_advance (rhaystack); 264 265 mbui_init (rneedle, needle); 266 if (!mbui_avail (rneedle)) 267 abort (); 268 mbui_advance (rneedle); 269 270 for (;; mbui_advance (rhaystack), mbui_advance (rneedle)) 271 { 272 if (!mbui_avail (rneedle)) 273 /* Found a match. */ 274 return (char *) mbui_cur_ptr (iter_haystack); 275 if (!mbui_avail (rhaystack)) 276 /* No match. */ 277 return NULL; 278 comparison_count++; 279 if (!mb_equal (mbui_cur (rhaystack), mbui_cur (rneedle))) 280 /* Nothing in this round. */ 281 break; 282 } 283 } 284 } 285 } 286 else 287 return (char *) haystack; 288 } 289 else 290 { 291 if (*needle != '\0') 292 { 293 /* Minimizing the worst-case complexity: 294 Let n = strlen(haystack), m = strlen(needle). 295 The naïve algorithm is O(n*m) worst-case. 296 The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a 297 memory allocation. 298 To achieve linear complexity and yet amortize the cost of the 299 memory allocation, we activate the Knuth-Morris-Pratt algorithm 300 only once the naïve algorithm has already run for some time; more 301 precisely, when 302 - the outer loop count is >= 10, 303 - the average number of comparisons per outer loop is >= 5, 304 - the total number of comparisons is >= m. 305 But we try it only once. If the memory allocation attempt failed, 306 we don't retry it. */ 307 bool try_kmp = true; 308 size_t outer_loop_count = 0; 309 size_t comparison_count = 0; 310 size_t last_ccount = 0; /* last comparison count */ 311 const char *needle_last_ccount = needle; /* = needle + last_ccount */ 312 313 /* Speed up the following searches of needle by caching its first 314 character. */ 315 char b = *needle++; 316 317 for (;; haystack++) 318 { 319 if (*haystack == '\0') 320 /* No match. */ 321 return NULL; 322 323 /* See whether it's advisable to use an asymptotically faster 324 algorithm. */ 325 if (try_kmp 326 && outer_loop_count >= 10 327 && comparison_count >= 5 * outer_loop_count) 328 { 329 /* See if needle + comparison_count now reaches the end of 330 needle. */ 331 if (needle_last_ccount != NULL) 332 { 333 needle_last_ccount += 334 strnlen (needle_last_ccount, 335 comparison_count - last_ccount); 336 if (*needle_last_ccount == '\0') 337 needle_last_ccount = NULL; 338 last_ccount = comparison_count; 339 } 340 if (needle_last_ccount == NULL) 341 { 342 /* Try the Knuth-Morris-Pratt algorithm. */ 343 const unsigned char *result; 344 bool success = 345 knuth_morris_pratt ((const unsigned char *) haystack, 346 (const unsigned char *) (needle - 1), 347 strlen (needle - 1), 348 &result); 349 if (success) 350 return (char *) result; 351 try_kmp = false; 352 } 353 } 354 355 outer_loop_count++; 356 comparison_count++; 357 if (*haystack == b) 358 /* The first character matches. */ 359 { 360 const char *rhaystack = haystack + 1; 361 const char *rneedle = needle; 362 363 for (;; rhaystack++, rneedle++) 364 { 365 if (*rneedle == '\0') 366 /* Found a match. */ 367 return (char *) haystack; 368 if (*rhaystack == '\0') 369 /* No match. */ 370 return NULL; 371 comparison_count++; 372 if (*rhaystack != *rneedle) 373 /* Nothing in this round. */ 374 break; 375 } 376 } 377 } 378 } 379 else 380 return (char *) haystack; 381 } 382 } 383