1 /* Searching in a string. 2 Copyright (C) 2005-2010 Free Software Foundation, Inc. 3 Written by Bruno Haible <bruno@clisp.org>, 2005. 4 5 This program is free software: you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 3 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 17 18 #include <config.h> 19 20 /* Specification. */ 21 #include <string.h> 22 23 #include <stdbool.h> 24 #include <stddef.h> /* for NULL, in case a nonstandard string.h lacks it */ 25 26 #include "malloca.h" 27 #include "mbuiter.h" 28 29 /* Knuth-Morris-Pratt algorithm. */ 30 #define CANON_ELEMENT(c) c 31 #include "str-kmp.h" 32 33 /* Knuth-Morris-Pratt algorithm. 34 See http://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm 35 Return a boolean indicating success: 36 Return true and set *RESULTP if the search was completed. 37 Return false if it was aborted because not enough memory was available. */ 38 static bool 39 knuth_morris_pratt_multibyte (const char *haystack, const char *needle, 40 const char **resultp) 41 { 42 size_t m = mbslen (needle); 43 mbchar_t *needle_mbchars; 44 size_t *table; 45 46 /* Allocate room for needle_mbchars and the table. */ 47 char *memory = (char *) nmalloca (m, sizeof (mbchar_t) + sizeof (size_t)); 48 if (memory == NULL) 49 return false; 50 needle_mbchars = (mbchar_t *) memory; 51 table = (size_t *) (memory + m * sizeof (mbchar_t)); 52 53 /* Fill needle_mbchars. */ 54 { 55 mbui_iterator_t iter; 56 size_t j; 57 58 j = 0; 59 for (mbui_init (iter, needle); mbui_avail (iter); mbui_advance (iter), j++) 60 mb_copy (&needle_mbchars[j], &mbui_cur (iter)); 61 } 62 63 /* Fill the table. 64 For 0 < i < m: 65 0 < table[i] <= i is defined such that 66 forall 0 < x < table[i]: needle[x..i-1] != needle[0..i-1-x], 67 and table[i] is as large as possible with this property. 68 This implies: 69 1) For 0 < i < m: 70 If table[i] < i, 71 needle[table[i]..i-1] = needle[0..i-1-table[i]]. 72 2) For 0 < i < m: 73 rhaystack[0..i-1] == needle[0..i-1] 74 and exists h, i <= h < m: rhaystack[h] != needle[h] 75 implies 76 forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1]. 77 table[0] remains uninitialized. */ 78 { 79 size_t i, j; 80 81 /* i = 1: Nothing to verify for x = 0. */ 82 table[1] = 1; 83 j = 0; 84 85 for (i = 2; i < m; i++) 86 { 87 /* Here: j = i-1 - table[i-1]. 88 The inequality needle[x..i-1] != needle[0..i-1-x] is known to hold 89 for x < table[i-1], by induction. 90 Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */ 91 mbchar_t *b = &needle_mbchars[i - 1]; 92 93 for (;;) 94 { 95 /* Invariants: The inequality needle[x..i-1] != needle[0..i-1-x] 96 is known to hold for x < i-1-j. 97 Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */ 98 if (mb_equal (*b, needle_mbchars[j])) 99 { 100 /* Set table[i] := i-1-j. */ 101 table[i] = i - ++j; 102 break; 103 } 104 /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds 105 for x = i-1-j, because 106 needle[i-1] != needle[j] = needle[i-1-x]. */ 107 if (j == 0) 108 { 109 /* The inequality holds for all possible x. */ 110 table[i] = i; 111 break; 112 } 113 /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds 114 for i-1-j < x < i-1-j+table[j], because for these x: 115 needle[x..i-2] 116 = needle[x-(i-1-j)..j-1] 117 != needle[0..j-1-(x-(i-1-j))] (by definition of table[j]) 118 = needle[0..i-2-x], 119 hence needle[x..i-1] != needle[0..i-1-x]. 120 Furthermore 121 needle[i-1-j+table[j]..i-2] 122 = needle[table[j]..j-1] 123 = needle[0..j-1-table[j]] (by definition of table[j]). */ 124 j = j - table[j]; 125 } 126 /* Here: j = i - table[i]. */ 127 } 128 } 129 130 /* Search, using the table to accelerate the processing. */ 131 { 132 size_t j; 133 mbui_iterator_t rhaystack; 134 mbui_iterator_t phaystack; 135 136 *resultp = NULL; 137 j = 0; 138 mbui_init (rhaystack, haystack); 139 mbui_init (phaystack, haystack); 140 /* Invariant: phaystack = rhaystack + j. */ 141 while (mbui_avail (phaystack)) 142 if (mb_equal (needle_mbchars[j], mbui_cur (phaystack))) 143 { 144 j++; 145 mbui_advance (phaystack); 146 if (j == m) 147 { 148 /* The entire needle has been found. */ 149 *resultp = mbui_cur_ptr (rhaystack); 150 break; 151 } 152 } 153 else if (j > 0) 154 { 155 /* Found a match of needle[0..j-1], mismatch at needle[j]. */ 156 size_t count = table[j]; 157 j -= count; 158 for (; count > 0; count--) 159 { 160 if (!mbui_avail (rhaystack)) 161 abort (); 162 mbui_advance (rhaystack); 163 } 164 } 165 else 166 { 167 /* Found a mismatch at needle[0] already. */ 168 if (!mbui_avail (rhaystack)) 169 abort (); 170 mbui_advance (rhaystack); 171 mbui_advance (phaystack); 172 } 173 } 174 175 freea (memory); 176 return true; 177 } 178 179 /* Find the first occurrence of the character string NEEDLE in the character 180 string HAYSTACK. Return NULL if NEEDLE is not found in HAYSTACK. */ 181 char * 182 mbsstr (const char *haystack, const char *needle) 183 { 184 /* Be careful not to look at the entire extent of haystack or needle 185 until needed. This is useful because of these two cases: 186 - haystack may be very long, and a match of needle found early, 187 - needle may be very long, and not even a short initial segment of 188 needle may be found in haystack. */ 189 if (MB_CUR_MAX > 1) 190 { 191 mbui_iterator_t iter_needle; 192 193 mbui_init (iter_needle, needle); 194 if (mbui_avail (iter_needle)) 195 { 196 /* Minimizing the worst-case complexity: 197 Let n = mbslen(haystack), m = mbslen(needle). 198 The naïve algorithm is O(n*m) worst-case. 199 The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a 200 memory allocation. 201 To achieve linear complexity and yet amortize the cost of the 202 memory allocation, we activate the Knuth-Morris-Pratt algorithm 203 only once the naïve algorithm has already run for some time; more 204 precisely, when 205 - the outer loop count is >= 10, 206 - the average number of comparisons per outer loop is >= 5, 207 - the total number of comparisons is >= m. 208 But we try it only once. If the memory allocation attempt failed, 209 we don't retry it. */ 210 bool try_kmp = true; 211 size_t outer_loop_count = 0; 212 size_t comparison_count = 0; 213 size_t last_ccount = 0; /* last comparison count */ 214 mbui_iterator_t iter_needle_last_ccount; /* = needle + last_ccount */ 215 216 mbui_iterator_t iter_haystack; 217 218 mbui_init (iter_needle_last_ccount, needle); 219 mbui_init (iter_haystack, haystack); 220 for (;; mbui_advance (iter_haystack)) 221 { 222 if (!mbui_avail (iter_haystack)) 223 /* No match. */ 224 return NULL; 225 226 /* See whether it's advisable to use an asymptotically faster 227 algorithm. */ 228 if (try_kmp 229 && outer_loop_count >= 10 230 && comparison_count >= 5 * outer_loop_count) 231 { 232 /* See if needle + comparison_count now reaches the end of 233 needle. */ 234 size_t count = comparison_count - last_ccount; 235 for (; 236 count > 0 && mbui_avail (iter_needle_last_ccount); 237 count--) 238 mbui_advance (iter_needle_last_ccount); 239 last_ccount = comparison_count; 240 if (!mbui_avail (iter_needle_last_ccount)) 241 { 242 /* Try the Knuth-Morris-Pratt algorithm. */ 243 const char *result; 244 bool success = 245 knuth_morris_pratt_multibyte (haystack, needle, 246 &result); 247 if (success) 248 return (char *) result; 249 try_kmp = false; 250 } 251 } 252 253 outer_loop_count++; 254 comparison_count++; 255 if (mb_equal (mbui_cur (iter_haystack), mbui_cur (iter_needle))) 256 /* The first character matches. */ 257 { 258 mbui_iterator_t rhaystack; 259 mbui_iterator_t rneedle; 260 261 memcpy (&rhaystack, &iter_haystack, sizeof (mbui_iterator_t)); 262 mbui_advance (rhaystack); 263 264 mbui_init (rneedle, needle); 265 if (!mbui_avail (rneedle)) 266 abort (); 267 mbui_advance (rneedle); 268 269 for (;; mbui_advance (rhaystack), mbui_advance (rneedle)) 270 { 271 if (!mbui_avail (rneedle)) 272 /* Found a match. */ 273 return (char *) mbui_cur_ptr (iter_haystack); 274 if (!mbui_avail (rhaystack)) 275 /* No match. */ 276 return NULL; 277 comparison_count++; 278 if (!mb_equal (mbui_cur (rhaystack), mbui_cur (rneedle))) 279 /* Nothing in this round. */ 280 break; 281 } 282 } 283 } 284 } 285 else 286 return (char *) haystack; 287 } 288 else 289 { 290 if (*needle != '\0') 291 { 292 /* Minimizing the worst-case complexity: 293 Let n = strlen(haystack), m = strlen(needle). 294 The naïve algorithm is O(n*m) worst-case. 295 The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a 296 memory allocation. 297 To achieve linear complexity and yet amortize the cost of the 298 memory allocation, we activate the Knuth-Morris-Pratt algorithm 299 only once the naïve algorithm has already run for some time; more 300 precisely, when 301 - the outer loop count is >= 10, 302 - the average number of comparisons per outer loop is >= 5, 303 - the total number of comparisons is >= m. 304 But we try it only once. If the memory allocation attempt failed, 305 we don't retry it. */ 306 bool try_kmp = true; 307 size_t outer_loop_count = 0; 308 size_t comparison_count = 0; 309 size_t last_ccount = 0; /* last comparison count */ 310 const char *needle_last_ccount = needle; /* = needle + last_ccount */ 311 312 /* Speed up the following searches of needle by caching its first 313 character. */ 314 char b = *needle++; 315 316 for (;; haystack++) 317 { 318 if (*haystack == '\0') 319 /* No match. */ 320 return NULL; 321 322 /* See whether it's advisable to use an asymptotically faster 323 algorithm. */ 324 if (try_kmp 325 && outer_loop_count >= 10 326 && comparison_count >= 5 * outer_loop_count) 327 { 328 /* See if needle + comparison_count now reaches the end of 329 needle. */ 330 if (needle_last_ccount != NULL) 331 { 332 needle_last_ccount += 333 strnlen (needle_last_ccount, 334 comparison_count - last_ccount); 335 if (*needle_last_ccount == '\0') 336 needle_last_ccount = NULL; 337 last_ccount = comparison_count; 338 } 339 if (needle_last_ccount == NULL) 340 { 341 /* Try the Knuth-Morris-Pratt algorithm. */ 342 const char *result; 343 bool success = 344 knuth_morris_pratt_unibyte (haystack, needle - 1, 345 &result); 346 if (success) 347 return (char *) result; 348 try_kmp = false; 349 } 350 } 351 352 outer_loop_count++; 353 comparison_count++; 354 if (*haystack == b) 355 /* The first character matches. */ 356 { 357 const char *rhaystack = haystack + 1; 358 const char *rneedle = needle; 359 360 for (;; rhaystack++, rneedle++) 361 { 362 if (*rneedle == '\0') 363 /* Found a match. */ 364 return (char *) haystack; 365 if (*rhaystack == '\0') 366 /* No match. */ 367 return NULL; 368 comparison_count++; 369 if (*rhaystack != *rneedle) 370 /* Nothing in this round. */ 371 break; 372 } 373 } 374 } 375 } 376 else 377 return (char *) haystack; 378 } 379 } 380