1 /* 2 tre-match-parallel.c - TRE parallel regex matching engine 3 4 This software is released under a BSD-style license. 5 See the file LICENSE for details and copyright. 6 7 */ 8 9 /* 10 This algorithm searches for matches basically by reading characters 11 in the searched string one by one, starting at the beginning. All 12 matching paths in the TNFA are traversed in parallel. When two or 13 more paths reach the same state, exactly one is chosen according to 14 tag ordering rules; if returning submatches is not required it does 15 not matter which path is chosen. 16 17 The worst case time required for finding the leftmost and longest 18 match, or determining that there is no match, is always linearly 19 dependent on the length of the text being searched. 20 21 This algorithm cannot handle TNFAs with back referencing nodes. 22 See `tre-match-backtrack.c'. 23 */ 24 25 26 #ifdef HAVE_CONFIG_H 27 #include <config.h> 28 #endif /* HAVE_CONFIG_H */ 29 30 /* Unset TRE_USE_ALLOCA to avoid using the stack to hold all the state 31 info while running */ 32 #undef TRE_USE_ALLOCA 33 34 #ifdef TRE_USE_ALLOCA 35 /* AIX requires this to be the first thing in the file. */ 36 #ifndef __GNUC__ 37 # if HAVE_ALLOCA_H 38 # include <alloca.h> 39 # else 40 # ifdef _AIX 41 #pragma alloca 42 # else 43 # ifndef alloca /* predefined by HP cc +Olibcalls */ 44 char *alloca (); 45 # endif 46 # endif 47 # endif 48 #endif 49 #endif /* TRE_USE_ALLOCA */ 50 51 #include <assert.h> 52 #include <stdlib.h> 53 #include <string.h> 54 #ifdef HAVE_WCHAR_H 55 #include <wchar.h> 56 #endif /* HAVE_WCHAR_H */ 57 #ifdef HAVE_WCTYPE_H 58 #include <wctype.h> 59 #endif /* HAVE_WCTYPE_H */ 60 #ifndef TRE_WCHAR 61 #include <ctype.h> 62 #endif /* !TRE_WCHAR */ 63 #ifdef HAVE_MALLOC_H 64 #include <malloc.h> 65 #endif /* HAVE_MALLOC_H */ 66 67 #include "tre-internal.h" 68 #include "tre-match-utils.h" 69 #include "tre.h" 70 #include "xmalloc.h" 71 72 73 74 typedef struct { 75 tre_tnfa_transition_t *state; 76 tre_tag_t *tags; 77 } tre_tnfa_reach_t; 78 79 typedef struct { 80 int pos; 81 tre_tag_t **tags; 82 } tre_reach_pos_t; 83 84 85 #ifdef TRE_DEBUG 86 static void 87 tre_print_reach1(tre_tnfa_transition_t *state, tre_tag_t *tags, int num_tags) 88 { 89 DPRINT((" %p", (void *)state)); 90 if (num_tags > 0) 91 { 92 DPRINT(("/")); 93 tre_print_tags(tags, num_tags); 94 } 95 } 96 97 static void 98 tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_reach_t *reach, int num_tags) 99 { 100 while (reach->state != NULL) 101 { 102 tre_print_reach1(reach->state, reach->tags, num_tags); 103 reach++; 104 } 105 DPRINT(("\n")); 106 107 } 108 #endif /* TRE_DEBUG */ 109 110 reg_errcode_t 111 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, 112 tre_str_type_t type, tre_tag_t *match_tags, int eflags, 113 int *match_end_ofs) 114 { 115 /* State variables required by GET_NEXT_WCHAR. */ 116 tre_char_t prev_c = 0, next_c = 0; 117 const char *str_byte = string; 118 int pos = -1; 119 unsigned int pos_add_next = 1; 120 #ifdef TRE_WCHAR 121 const wchar_t *str_wide = string; 122 #ifdef TRE_MBSTATE 123 mbstate_t mbstate; 124 #endif /* TRE_MBSTATE */ 125 #endif /* TRE_WCHAR */ 126 int reg_notbol = eflags & REG_NOTBOL; 127 int reg_noteol = eflags & REG_NOTEOL; 128 int reg_newline = tnfa->cflags & REG_NEWLINE; 129 130 char *buf; 131 tre_tnfa_transition_t *trans_i; 132 tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i; 133 tre_reach_pos_t *reach_pos; 134 int *tag_i; 135 int num_tags, i; 136 137 int match_eo = -1; /* end offset of match (-1 if no match found yet) */ 138 #ifdef TRE_DEBUG 139 int once; 140 #endif /* TRE_DEBUG */ 141 tre_tag_t *tmp_tags = NULL; 142 tre_tag_t *tmp_iptr; 143 int tbytes; 144 int touch = 1; 145 146 #ifdef TRE_MBSTATE 147 memset(&mbstate, '\0', sizeof(mbstate)); 148 #endif /* TRE_MBSTATE */ 149 150 DPRINT(("tre_tnfa_run_parallel, input type %d\n", type)); 151 152 if (!match_tags) 153 num_tags = 0; 154 else 155 num_tags = tnfa->num_tags; 156 157 /* Allocate memory for temporary data required for matching. This needs to 158 be done for every matching operation to be thread safe. This allocates 159 everything in a single large block from the stack frame using alloca() 160 or with malloc() if alloca is unavailable. */ 161 { 162 int rbytes, pbytes, total_bytes; 163 char *tmp_buf; 164 /* Compute the length of the block we need. */ 165 tbytes = sizeof(*tmp_tags) * num_tags; 166 rbytes = sizeof(*reach_next) * (tnfa->num_states + 1); 167 pbytes = sizeof(*reach_pos) * tnfa->num_states; 168 total_bytes = 169 (sizeof(long) - 1) * 4 /* for alignment paddings */ 170 + (rbytes + tbytes * tnfa->num_states) * 2 + tbytes + pbytes; 171 172 DPRINT(("tre_tnfa_run_parallel, allocate %d bytes\n", total_bytes)); 173 /* Allocate the memory. */ 174 #ifdef TRE_USE_ALLOCA 175 buf = alloca(total_bytes); 176 #else /* !TRE_USE_ALLOCA */ 177 buf = xmalloc((unsigned)total_bytes); 178 #endif /* !TRE_USE_ALLOCA */ 179 if (buf == NULL) 180 return REG_ESPACE; 181 memset(buf, 0, (size_t)total_bytes); 182 183 /* Get the various pointers within tmp_buf (properly aligned). */ 184 tmp_tags = (void *)buf; 185 tmp_buf = buf + tbytes; 186 tmp_buf += ALIGN(tmp_buf, long); 187 reach_next = (void *)tmp_buf; 188 tmp_buf += rbytes; 189 tmp_buf += ALIGN(tmp_buf, long); 190 reach = (void *)tmp_buf; 191 tmp_buf += rbytes; 192 tmp_buf += ALIGN(tmp_buf, long); 193 reach_pos = (void *)tmp_buf; 194 tmp_buf += pbytes; 195 tmp_buf += ALIGN(tmp_buf, long); 196 for (i = 0; i < tnfa->num_states; i++) 197 { 198 reach[i].tags = (void *)tmp_buf; 199 tmp_buf += tbytes; 200 reach_next[i].tags = (void *)tmp_buf; 201 tmp_buf += tbytes; 202 } 203 } 204 205 for (i = 0; i < tnfa->num_states; i++) 206 reach_pos[i].pos = -1; 207 208 /* If only one character can start a match, find it first. */ 209 if (tnfa->first_char >= 0 && str_byte) 210 { 211 const char *orig_str = str_byte; 212 int first = tnfa->first_char; 213 int found_high_bit = 0; 214 215 216 if (type == STR_BYTE) 217 { 218 if (len >= 0) 219 str_byte = memchr(orig_str, first, (size_t)len); 220 else 221 str_byte = strchr(orig_str, first); 222 } 223 else if (type == STR_MBS) 224 { 225 /* 226 * If the match character is ASCII, try to match the character 227 * directly, but if a high bit character is found, we stop there. 228 */ 229 if (first < 0x80) 230 { 231 if (len >= 0) 232 { 233 int i; 234 for (i = 0; ; str_byte++, i++) 235 { 236 if (i >= len) 237 { 238 str_byte = NULL; 239 break; 240 } 241 if (*str_byte == first) 242 break; 243 if (*str_byte & 0x80) 244 { 245 found_high_bit = 1; 246 break; 247 } 248 } 249 } 250 else 251 { 252 for (; ; str_byte++) 253 { 254 if (!*str_byte) 255 { 256 str_byte = NULL; 257 break; 258 } 259 if (*str_byte == first) 260 break; 261 if (*str_byte & 0x80) 262 { 263 found_high_bit = 1; 264 break; 265 } 266 } 267 } 268 } 269 else 270 { 271 if (len >= 0) 272 { 273 int i; 274 for (i = 0; ; str_byte++, i++) 275 { 276 if (i >= len) 277 { 278 str_byte = NULL; 279 break; 280 } 281 if (*str_byte & 0x80) 282 { 283 found_high_bit = 1; 284 break; 285 } 286 } 287 } 288 else 289 { 290 for (; ; str_byte++) 291 { 292 if (!*str_byte) 293 { 294 str_byte = NULL; 295 break; 296 } 297 if (*str_byte & 0x80) 298 { 299 found_high_bit = 1; 300 break; 301 } 302 } 303 } 304 } 305 } 306 if (str_byte == NULL) 307 { 308 #ifndef TRE_USE_ALLOCA 309 if (buf) 310 xfree(buf); 311 #endif /* !TRE_USE_ALLOCA */ 312 return REG_NOMATCH; 313 } 314 DPRINT(("skipped %lu chars\n", (unsigned long)(str_byte - orig_str))); 315 if (!found_high_bit) 316 { 317 if (str_byte >= orig_str + 1) 318 prev_c = (unsigned char)*(str_byte - 1); 319 next_c = (unsigned char)*str_byte; 320 pos = str_byte - orig_str; 321 if (len < 0 || pos < len) 322 str_byte++; 323 } 324 else 325 { 326 if (str_byte == orig_str) 327 goto no_first_optimization; 328 /* 329 * Back up one character, fix up the position, then call 330 * GET_NEXT_WCHAR() to process the multibyte character. 331 */ 332 /* no need to set prev_c, since GET_NEXT_WCHAR will overwrite */ 333 next_c = (unsigned char)*(str_byte - 1); 334 pos = (str_byte - 1) - orig_str; 335 GET_NEXT_WCHAR(); 336 } 337 } 338 else 339 { 340 no_first_optimization: 341 GET_NEXT_WCHAR(); 342 pos = 0; 343 } 344 345 DPRINT(("length: %d\n", len)); 346 DPRINT(("pos:chr/code | states and tags\n")); 347 DPRINT(("-------------+------------------------------------------------\n")); 348 349 reach_next_i = reach_next; 350 while (/*CONSTCOND*/1) 351 { 352 /* If no match found yet, add the initial states to `reach_next'. */ 353 if (match_eo < 0) 354 { 355 DPRINT((" init >")); 356 trans_i = tnfa->initial; 357 while (trans_i->state != NULL) 358 { 359 if (reach_pos[trans_i->state_id].pos < pos) 360 { 361 if (trans_i->assertions 362 && CHECK_ASSERTIONS(trans_i->assertions)) 363 { 364 DPRINT(("assertion failed\n")); 365 trans_i++; 366 continue; 367 } 368 369 DPRINT((" %p", (void *)trans_i->state)); 370 reach_next_i->state = trans_i->state; 371 memset(reach_next_i->tags, 0, tbytes); 372 tag_i = trans_i->tags; 373 if (tag_i) 374 { 375 while (*tag_i >= 0) 376 { 377 if (*tag_i < num_tags) 378 tre_tag_set(reach_next_i->tags, *tag_i, pos, touch); 379 tag_i++; 380 } 381 touch++; 382 } 383 if (reach_next_i->state == tnfa->final) 384 { 385 DPRINT((" found empty match\n")); 386 match_eo = pos; 387 memcpy(match_tags, reach_next_i->tags, tbytes); 388 } 389 reach_pos[trans_i->state_id].pos = pos; 390 reach_pos[trans_i->state_id].tags = &reach_next_i->tags; 391 reach_next_i++; 392 } 393 trans_i++; 394 } 395 DPRINT(("\n")); 396 reach_next_i->state = NULL; 397 } 398 else 399 { 400 if (num_tags == 0 || reach_next_i == reach_next) 401 /*�We have found a match. */ 402 break; 403 } 404 405 /* Check for end of string. */ 406 if (len < 0) 407 { 408 if (next_c == L'\0') 409 break; 410 } 411 else 412 { 413 if (pos >= len) 414 break; 415 } 416 417 GET_NEXT_WCHAR(); 418 419 #ifdef TRE_DEBUG 420 DPRINT(("%3d:%2lc/%05d |", pos - 1, (tre_cint_t)prev_c, (int)prev_c)); 421 tre_print_reach(tnfa, reach_next, num_tags); 422 #endif /* TRE_DEBUG */ 423 424 /* Swap `reach' and `reach_next'. */ 425 reach_i = reach; 426 reach = reach_next; 427 reach_next = reach_i; 428 429 #ifdef TRE_DEBUG 430 once = 0; 431 #endif /* TRE_DEBUG */ 432 433 /* For each state in `reach' see if there is a transition leaving with 434 the current input symbol to a state not yet in `reach_next', and 435 add the destination states to `reach_next'. */ 436 reach_next_i = reach_next; 437 for (reach_i = reach; reach_i->state; reach_i++) 438 { 439 for (trans_i = reach_i->state; trans_i->state; trans_i++) 440 { 441 /* Does this transition match the input symbol? */ 442 if (trans_i->code_min <= (tre_cint_t)prev_c && 443 trans_i->code_max >= (tre_cint_t)prev_c) 444 { 445 if (trans_i->assertions 446 && (CHECK_ASSERTIONS(trans_i->assertions) 447 || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags))) 448 { 449 DPRINT(("assertion failed\n")); 450 continue; 451 } 452 453 /* Compute the tags after this transition. */ 454 memcpy(tmp_tags, reach_i->tags, tbytes); 455 tag_i = trans_i->tags; 456 if (tag_i != NULL) 457 { 458 while (*tag_i >= 0) 459 { 460 if (*tag_i < num_tags) 461 tre_tag_set(tmp_tags, *tag_i, pos, touch); 462 tag_i++; 463 } 464 touch++; 465 } 466 467 /* For each new transition, weed out those that don't 468 fulfill the minimal matching conditions. */ 469 if (tnfa->num_minimals && match_eo >= 0) 470 { 471 int skip = 0; 472 #ifdef TRE_DEBUG 473 if (!once) 474 { 475 DPRINT(("Checking minimal conditions: match_eo=%d " 476 "match_tags=", match_eo)); 477 tre_print_tags(match_tags, num_tags); 478 DPRINT(("\n")); 479 once++; 480 } 481 #endif /* TRE_DEBUG */ 482 for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2) 483 { 484 int end = tnfa->minimal_tags[i]; 485 int start = tnfa->minimal_tags[i + 1]; 486 DPRINT((" Minimal start %d, end %d\n", start, end)); 487 if (tre_minimal_tag_order(start, end, match_tags, 488 tmp_tags) > 0) 489 { 490 skip = 1; 491 break; 492 } 493 } 494 if (skip) 495 { 496 #ifdef TRE_DEBUG 497 DPRINT((" Throwing out")); 498 tre_print_reach1(reach_i->state, tmp_tags, 499 num_tags); 500 DPRINT(("\n")); 501 #endif /* TRE_DEBUG */ 502 continue; 503 } 504 } 505 506 if (reach_pos[trans_i->state_id].pos < pos) 507 { 508 /* Found an unvisited node. */ 509 reach_next_i->state = trans_i->state; 510 tmp_iptr = reach_next_i->tags; 511 reach_next_i->tags = tmp_tags; 512 tmp_tags = tmp_iptr; 513 reach_pos[trans_i->state_id].pos = pos; 514 reach_pos[trans_i->state_id].tags = &reach_next_i->tags; 515 516 if (reach_next_i->state == tnfa->final 517 && (match_eo == -1 518 || (num_tags > 0 519 && tre_tag_get(reach_next_i->tags, 0) <= 520 tre_tag_get(match_tags, 0)))) 521 { 522 #ifdef TRE_DEBUG 523 DPRINT((" found match")); 524 tre_print_reach1(trans_i->state, reach_next_i->tags, num_tags); 525 DPRINT(("\n")); 526 #endif /* TRE_DEBUG */ 527 match_eo = pos; 528 memcpy(match_tags, reach_next_i->tags, tbytes); 529 } 530 reach_next_i++; 531 532 } 533 else 534 { 535 assert(reach_pos[trans_i->state_id].pos == pos); 536 /* Another path has also reached this state. We choose 537 the winner by examining the tag values for both 538 paths. */ 539 if (tre_tag_order(num_tags, tnfa->tag_directions, 540 tmp_tags, 541 *reach_pos[trans_i->state_id].tags)) 542 { 543 /* The new path wins. */ 544 tmp_iptr = *reach_pos[trans_i->state_id].tags; 545 *reach_pos[trans_i->state_id].tags = tmp_tags; 546 if (trans_i->state == tnfa->final) 547 { 548 #ifdef TRE_DEBUG 549 DPRINT((" found better match")); 550 tre_print_reach1(trans_i->state, tmp_tags, num_tags); 551 DPRINT(("\n")); 552 #endif /* TRE_DEBUG */ 553 match_eo = pos; 554 memcpy(match_tags, tmp_tags, tbytes); 555 } 556 tmp_tags = tmp_iptr; 557 } 558 } 559 } 560 } 561 } 562 reach_next_i->state = NULL; 563 } 564 565 DPRINT(("match end offset = %d\n", match_eo)); 566 567 *match_end_ofs = match_eo; 568 #ifdef TRE_DEBUG 569 if (match_tags) 570 { 571 DPRINT(("Winning tags=")); 572 tre_print_tags_all(match_tags, num_tags); 573 DPRINT((" touch=%d\n", touch)); 574 } 575 #endif /* TRE_DEBUG */ 576 577 #ifndef TRE_USE_ALLOCA 578 if (buf) 579 xfree(buf); 580 #endif /* !TRE_USE_ALLOCA */ 581 582 return match_eo >= 0 ? REG_OK : REG_NOMATCH; 583 } 584 585 /* EOF */ 586