1 /* Extended regular expression matching and search library. 2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 5 6 This program is free software; you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 2, or (at your option) 9 any later version. 10 11 This program is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 You should have received a copy of the GNU General Public License along 17 with this program; if not, write to the Free Software Foundation, 18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 19 20 static void re_string_construct_common (const char *str, Idx len, 21 re_string_t *pstr, 22 REG_TRANSLATE_TYPE trans, bool icase, 23 const re_dfa_t *dfa) internal_function; 24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa, 25 const re_node_set *nodes, 26 re_hashval_t hash) internal_function; 27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa, 28 const re_node_set *nodes, 29 unsigned int context, 30 re_hashval_t hash) internal_function; 31 32 /* Functions for string operation. */ 33 34 /* This function allocate the buffers. It is necessary to call 35 re_string_reconstruct before using the object. */ 36 37 static reg_errcode_t 38 internal_function 39 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len, 40 REG_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa) 41 { 42 reg_errcode_t ret; 43 Idx init_buf_len; 44 45 /* Ensure at least one character fits into the buffers. */ 46 if (init_len < dfa->mb_cur_max) 47 init_len = dfa->mb_cur_max; 48 init_buf_len = (len + 1 < init_len) ? len + 1: init_len; 49 re_string_construct_common (str, len, pstr, trans, icase, dfa); 50 51 ret = re_string_realloc_buffers (pstr, init_buf_len); 52 if (BE (ret != REG_NOERROR, 0)) 53 return ret; 54 55 pstr->word_char = dfa->word_char; 56 pstr->word_ops_used = dfa->word_ops_used; 57 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str; 58 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len; 59 pstr->valid_raw_len = pstr->valid_len; 60 return REG_NOERROR; 61 } 62 63 /* This function allocate the buffers, and initialize them. */ 64 65 static reg_errcode_t 66 internal_function 67 re_string_construct (re_string_t *pstr, const char *str, Idx len, 68 REG_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa) 69 { 70 reg_errcode_t ret; 71 memset (pstr, '\0', sizeof (re_string_t)); 72 re_string_construct_common (str, len, pstr, trans, icase, dfa); 73 74 if (len > 0) 75 { 76 ret = re_string_realloc_buffers (pstr, len + 1); 77 if (BE (ret != REG_NOERROR, 0)) 78 return ret; 79 } 80 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str; 81 82 if (icase) 83 { 84 #ifdef RE_ENABLE_I18N 85 if (dfa->mb_cur_max > 1) 86 { 87 while (1) 88 { 89 ret = build_wcs_upper_buffer (pstr); 90 if (BE (ret != REG_NOERROR, 0)) 91 return ret; 92 if (pstr->valid_raw_len >= len) 93 break; 94 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max) 95 break; 96 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2); 97 if (BE (ret != REG_NOERROR, 0)) 98 return ret; 99 } 100 } 101 else 102 #endif /* RE_ENABLE_I18N */ 103 build_upper_buffer (pstr); 104 } 105 else 106 { 107 #ifdef RE_ENABLE_I18N 108 if (dfa->mb_cur_max > 1) 109 build_wcs_buffer (pstr); 110 else 111 #endif /* RE_ENABLE_I18N */ 112 { 113 if (trans != NULL) 114 re_string_translate_buffer (pstr); 115 else 116 { 117 pstr->valid_len = pstr->bufs_len; 118 pstr->valid_raw_len = pstr->bufs_len; 119 } 120 } 121 } 122 123 return REG_NOERROR; 124 } 125 126 /* Helper functions for re_string_allocate, and re_string_construct. */ 127 128 static reg_errcode_t 129 internal_function 130 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len) 131 { 132 #ifdef RE_ENABLE_I18N 133 if (pstr->mb_cur_max > 1) 134 { 135 wint_t *new_wcs = re_xrealloc (pstr->wcs, wint_t, new_buf_len); 136 if (BE (new_wcs == NULL, 0)) 137 return REG_ESPACE; 138 pstr->wcs = new_wcs; 139 if (pstr->offsets != NULL) 140 { 141 Idx *new_offsets = re_xrealloc (pstr->offsets, Idx, new_buf_len); 142 if (BE (new_offsets == NULL, 0)) 143 return REG_ESPACE; 144 pstr->offsets = new_offsets; 145 } 146 } 147 #endif /* RE_ENABLE_I18N */ 148 if (pstr->mbs_allocated) 149 { 150 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char, 151 new_buf_len); 152 if (BE (new_mbs == NULL, 0)) 153 return REG_ESPACE; 154 pstr->mbs = new_mbs; 155 } 156 pstr->bufs_len = new_buf_len; 157 return REG_NOERROR; 158 } 159 160 161 static void 162 internal_function 163 re_string_construct_common (const char *str, Idx len, re_string_t *pstr, 164 REG_TRANSLATE_TYPE trans, bool icase, 165 const re_dfa_t *dfa) 166 { 167 pstr->raw_mbs = (const unsigned char *) str; 168 pstr->len = len; 169 pstr->raw_len = len; 170 pstr->trans = (unsigned REG_TRANSLATE_TYPE) trans; 171 pstr->icase = icase; 172 pstr->mbs_allocated = (trans != NULL || icase); 173 pstr->mb_cur_max = dfa->mb_cur_max; 174 pstr->is_utf8 = dfa->is_utf8; 175 pstr->map_notascii = dfa->map_notascii; 176 pstr->stop = pstr->len; 177 pstr->raw_stop = pstr->stop; 178 } 179 180 #ifdef RE_ENABLE_I18N 181 182 /* Build wide character buffer PSTR->WCS. 183 If the byte sequence of the string are: 184 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3> 185 Then wide character buffer will be: 186 <wc1> , WEOF , <wc2> , WEOF , <wc3> 187 We use WEOF for padding, they indicate that the position isn't 188 a first byte of a multibyte character. 189 190 Note that this function assumes PSTR->VALID_LEN elements are already 191 built and starts from PSTR->VALID_LEN. */ 192 193 static void 194 internal_function 195 build_wcs_buffer (re_string_t *pstr) 196 { 197 #ifdef _LIBC 198 unsigned char buf[MB_LEN_MAX]; 199 assert (MB_LEN_MAX >= pstr->mb_cur_max); 200 #else 201 unsigned char buf[64]; 202 #endif 203 mbstate_t prev_st; 204 Idx byte_idx, end_idx, remain_len; 205 size_t mbclen; 206 207 /* Build the buffers from pstr->valid_len to either pstr->len or 208 pstr->bufs_len. */ 209 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 210 for (byte_idx = pstr->valid_len; byte_idx < end_idx;) 211 { 212 wchar_t wc; 213 const char *p; 214 215 remain_len = end_idx - byte_idx; 216 prev_st = pstr->cur_state; 217 /* Apply the translation if we need. */ 218 if (BE (pstr->trans != NULL, 0)) 219 { 220 int i, ch; 221 222 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i) 223 { 224 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i]; 225 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch]; 226 } 227 p = (const char *) buf; 228 } 229 else 230 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx; 231 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state); 232 if (BE (mbclen == (size_t) -2, 0)) 233 { 234 /* The buffer doesn't have enough space, finish to build. */ 235 pstr->cur_state = prev_st; 236 break; 237 } 238 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0)) 239 { 240 /* We treat these cases as a singlebyte character. */ 241 mbclen = 1; 242 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; 243 if (BE (pstr->trans != NULL, 0)) 244 wc = pstr->trans[wc]; 245 pstr->cur_state = prev_st; 246 } 247 248 /* Write wide character and padding. */ 249 pstr->wcs[byte_idx++] = wc; 250 /* Write paddings. */ 251 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 252 pstr->wcs[byte_idx++] = WEOF; 253 } 254 pstr->valid_len = byte_idx; 255 pstr->valid_raw_len = byte_idx; 256 } 257 258 /* Build wide character buffer PSTR->WCS like build_wcs_buffer, 259 but for REG_ICASE. */ 260 261 static reg_errcode_t 262 internal_function 263 build_wcs_upper_buffer (re_string_t *pstr) 264 { 265 mbstate_t prev_st; 266 Idx src_idx, byte_idx, end_idx, remain_len; 267 size_t mbclen; 268 #ifdef _LIBC 269 char buf[MB_LEN_MAX]; 270 assert (MB_LEN_MAX >= pstr->mb_cur_max); 271 #else 272 char buf[64]; 273 #endif 274 275 byte_idx = pstr->valid_len; 276 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 277 278 /* The following optimization assumes that ASCII characters can be 279 mapped to wide characters with a simple cast. */ 280 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed) 281 { 282 while (byte_idx < end_idx) 283 { 284 wchar_t wc; 285 286 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]) 287 && mbsinit (&pstr->cur_state)) 288 { 289 /* In case of a singlebyte character. */ 290 pstr->mbs[byte_idx] 291 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]); 292 /* The next step uses the assumption that wchar_t is encoded 293 ASCII-safe: all ASCII values can be converted like this. */ 294 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx]; 295 ++byte_idx; 296 continue; 297 } 298 299 remain_len = end_idx - byte_idx; 300 prev_st = pstr->cur_state; 301 mbclen = mbrtowc (&wc, 302 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx 303 + byte_idx), remain_len, &pstr->cur_state); 304 if (BE ((size_t) (mbclen + 2) > 2, 1)) 305 { 306 wchar_t wcu = wc; 307 if (iswlower (wc)) 308 { 309 size_t mbcdlen; 310 311 wcu = towupper (wc); 312 mbcdlen = wcrtomb (buf, wcu, &prev_st); 313 if (BE (mbclen == mbcdlen, 1)) 314 memcpy (pstr->mbs + byte_idx, buf, mbclen); 315 else 316 { 317 src_idx = byte_idx; 318 goto offsets_needed; 319 } 320 } 321 else 322 memcpy (pstr->mbs + byte_idx, 323 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen); 324 pstr->wcs[byte_idx++] = wcu; 325 /* Write paddings. */ 326 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 327 pstr->wcs[byte_idx++] = WEOF; 328 } 329 else if (mbclen == (size_t) -1 || mbclen == 0) 330 { 331 /* It is an invalid character or '\0'. Just use the byte. */ 332 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; 333 pstr->mbs[byte_idx] = ch; 334 /* And also cast it to wide char. */ 335 pstr->wcs[byte_idx++] = (wchar_t) ch; 336 if (BE (mbclen == (size_t) -1, 0)) 337 pstr->cur_state = prev_st; 338 } 339 else 340 { 341 /* The buffer doesn't have enough space, finish to build. */ 342 pstr->cur_state = prev_st; 343 break; 344 } 345 } 346 pstr->valid_len = byte_idx; 347 pstr->valid_raw_len = byte_idx; 348 return REG_NOERROR; 349 } 350 else 351 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;) 352 { 353 wchar_t wc; 354 const char *p; 355 offsets_needed: 356 remain_len = end_idx - byte_idx; 357 prev_st = pstr->cur_state; 358 if (BE (pstr->trans != NULL, 0)) 359 { 360 int i, ch; 361 362 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i) 363 { 364 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i]; 365 buf[i] = pstr->trans[ch]; 366 } 367 p = (const char *) buf; 368 } 369 else 370 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx; 371 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state); 372 if (BE ((size_t) (mbclen + 2) > 2, 1)) 373 { 374 wchar_t wcu = wc; 375 if (iswlower (wc)) 376 { 377 size_t mbcdlen; 378 379 wcu = towupper (wc); 380 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st); 381 if (BE (mbclen == mbcdlen, 1)) 382 memcpy (pstr->mbs + byte_idx, buf, mbclen); 383 else if (mbcdlen != (size_t) -1) 384 { 385 size_t i; 386 387 if (byte_idx + mbcdlen > pstr->bufs_len) 388 { 389 pstr->cur_state = prev_st; 390 break; 391 } 392 393 if (pstr->offsets == NULL) 394 { 395 pstr->offsets = re_xmalloc (Idx, pstr->bufs_len); 396 397 if (pstr->offsets == NULL) 398 return REG_ESPACE; 399 } 400 if (!pstr->offsets_needed) 401 { 402 for (i = 0; i < (size_t) byte_idx; ++i) 403 pstr->offsets[i] = i; 404 pstr->offsets_needed = 1; 405 } 406 407 memcpy (pstr->mbs + byte_idx, buf, mbcdlen); 408 pstr->wcs[byte_idx] = wcu; 409 pstr->offsets[byte_idx] = src_idx; 410 for (i = 1; i < mbcdlen; ++i) 411 { 412 pstr->offsets[byte_idx + i] 413 = src_idx + (i < mbclen ? i : mbclen - 1); 414 pstr->wcs[byte_idx + i] = WEOF; 415 } 416 pstr->len += mbcdlen - mbclen; 417 if (pstr->raw_stop > src_idx) 418 pstr->stop += mbcdlen - mbclen; 419 end_idx = (pstr->bufs_len > pstr->len) 420 ? pstr->len : pstr->bufs_len; 421 byte_idx += mbcdlen; 422 src_idx += mbclen; 423 continue; 424 } 425 else 426 memcpy (pstr->mbs + byte_idx, p, mbclen); 427 } 428 else 429 memcpy (pstr->mbs + byte_idx, p, mbclen); 430 431 if (BE (pstr->offsets_needed != 0, 0)) 432 { 433 size_t i; 434 for (i = 0; i < mbclen; ++i) 435 pstr->offsets[byte_idx + i] = src_idx + i; 436 } 437 src_idx += mbclen; 438 439 pstr->wcs[byte_idx++] = wcu; 440 /* Write paddings. */ 441 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 442 pstr->wcs[byte_idx++] = WEOF; 443 } 444 else if (mbclen == (size_t) -1 || mbclen == 0) 445 { 446 /* It is an invalid character or '\0'. Just use the byte. */ 447 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx]; 448 449 if (BE (pstr->trans != NULL, 0)) 450 ch = pstr->trans [ch]; 451 pstr->mbs[byte_idx] = ch; 452 453 if (BE (pstr->offsets_needed != 0, 0)) 454 pstr->offsets[byte_idx] = src_idx; 455 ++src_idx; 456 457 /* And also cast it to wide char. */ 458 pstr->wcs[byte_idx++] = (wchar_t) ch; 459 if (BE (mbclen == (size_t) -1, 0)) 460 pstr->cur_state = prev_st; 461 } 462 else 463 { 464 /* The buffer doesn't have enough space, finish to build. */ 465 pstr->cur_state = prev_st; 466 break; 467 } 468 } 469 pstr->valid_len = byte_idx; 470 pstr->valid_raw_len = src_idx; 471 return REG_NOERROR; 472 } 473 474 /* Skip characters until the index becomes greater than NEW_RAW_IDX. 475 Return the index. */ 476 477 static Idx 478 internal_function 479 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc) 480 { 481 mbstate_t prev_st; 482 Idx rawbuf_idx; 483 size_t mbclen; 484 wchar_t wc = 0; 485 486 /* Skip the characters which are not necessary to check. */ 487 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len; 488 rawbuf_idx < new_raw_idx;) 489 { 490 Idx remain_len; 491 remain_len = pstr->len - rawbuf_idx; 492 prev_st = pstr->cur_state; 493 mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx, 494 remain_len, &pstr->cur_state); 495 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0)) 496 { 497 /* We treat these cases as a singlebyte character. */ 498 mbclen = 1; 499 pstr->cur_state = prev_st; 500 } 501 /* Then proceed the next character. */ 502 rawbuf_idx += mbclen; 503 } 504 *last_wc = (wint_t) wc; 505 return rawbuf_idx; 506 } 507 #endif /* RE_ENABLE_I18N */ 508 509 /* Build the buffer PSTR->MBS, and apply the translation if we need. 510 This function is used in case of REG_ICASE. */ 511 512 static void 513 internal_function 514 build_upper_buffer (re_string_t *pstr) 515 { 516 Idx char_idx, end_idx; 517 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 518 519 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx) 520 { 521 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx]; 522 if (BE (pstr->trans != NULL, 0)) 523 ch = pstr->trans[ch]; 524 if (islower (ch)) 525 pstr->mbs[char_idx] = toupper (ch); 526 else 527 pstr->mbs[char_idx] = ch; 528 } 529 pstr->valid_len = char_idx; 530 pstr->valid_raw_len = char_idx; 531 } 532 533 /* Apply TRANS to the buffer in PSTR. */ 534 535 static void 536 internal_function 537 re_string_translate_buffer (re_string_t *pstr) 538 { 539 Idx buf_idx, end_idx; 540 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 541 542 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx) 543 { 544 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx]; 545 pstr->mbs[buf_idx] = pstr->trans[ch]; 546 } 547 548 pstr->valid_len = buf_idx; 549 pstr->valid_raw_len = buf_idx; 550 } 551 552 /* This function re-construct the buffers. 553 Concretely, convert to wide character in case of pstr->mb_cur_max > 1, 554 convert to upper case in case of REG_ICASE, apply translation. */ 555 556 static reg_errcode_t 557 internal_function 558 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags) 559 { 560 Idx offset; 561 562 if (BE (pstr->raw_mbs_idx <= idx, 0)) 563 offset = idx - pstr->raw_mbs_idx; 564 else 565 { 566 /* Reset buffer. */ 567 #ifdef RE_ENABLE_I18N 568 if (pstr->mb_cur_max > 1) 569 memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); 570 #endif /* RE_ENABLE_I18N */ 571 pstr->len = pstr->raw_len; 572 pstr->stop = pstr->raw_stop; 573 pstr->valid_len = 0; 574 pstr->raw_mbs_idx = 0; 575 pstr->valid_raw_len = 0; 576 pstr->offsets_needed = 0; 577 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF 578 : CONTEXT_NEWLINE | CONTEXT_BEGBUF); 579 if (!pstr->mbs_allocated) 580 pstr->mbs = (unsigned char *) pstr->raw_mbs; 581 offset = idx; 582 } 583 584 if (BE (offset != 0, 1)) 585 { 586 /* Are the characters which are already checked remain? */ 587 if (BE (offset < pstr->valid_raw_len, 1) 588 #ifdef RE_ENABLE_I18N 589 /* Handling this would enlarge the code too much. 590 Accept a slowdown in that case. */ 591 && pstr->offsets_needed == 0 592 #endif 593 ) 594 { 595 /* Yes, move them to the front of the buffer. */ 596 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags); 597 #ifdef RE_ENABLE_I18N 598 if (pstr->mb_cur_max > 1) 599 memmove (pstr->wcs, pstr->wcs + offset, 600 (pstr->valid_len - offset) * sizeof (wint_t)); 601 #endif /* RE_ENABLE_I18N */ 602 if (BE (pstr->mbs_allocated, 0)) 603 memmove (pstr->mbs, pstr->mbs + offset, 604 pstr->valid_len - offset); 605 pstr->valid_len -= offset; 606 pstr->valid_raw_len -= offset; 607 #if DEBUG 608 assert (pstr->valid_len > 0); 609 #endif 610 } 611 else 612 { 613 /* No, skip all characters until IDX. */ 614 #ifdef RE_ENABLE_I18N 615 if (BE (pstr->offsets_needed, 0)) 616 { 617 pstr->len = pstr->raw_len - idx + offset; 618 pstr->stop = pstr->raw_stop - idx + offset; 619 pstr->offsets_needed = 0; 620 } 621 #endif 622 pstr->valid_len = 0; 623 pstr->valid_raw_len = 0; 624 #ifdef RE_ENABLE_I18N 625 if (pstr->mb_cur_max > 1) 626 { 627 Idx wcs_idx; 628 wint_t wc = WEOF; 629 630 if (pstr->is_utf8) 631 { 632 const unsigned char *raw, *p, *q, *end; 633 634 /* Special case UTF-8. Multi-byte chars start with any 635 byte other than 0x80 - 0xbf. */ 636 raw = pstr->raw_mbs + pstr->raw_mbs_idx; 637 end = raw + (offset - pstr->mb_cur_max); 638 for (p = raw + offset - 1; p >= end; --p) 639 if ((*p & 0xc0) != 0x80) 640 { 641 mbstate_t cur_state; 642 wchar_t wc2; 643 Idx mlen = raw + pstr->len - p; 644 unsigned char buf[6]; 645 size_t mbclen; 646 647 q = p; 648 if (BE (pstr->trans != NULL, 0)) 649 { 650 int i = mlen < 6 ? mlen : 6; 651 while (--i >= 0) 652 buf[i] = pstr->trans[p[i]]; 653 q = buf; 654 } 655 /* XXX Don't use mbrtowc, we know which conversion 656 to use (UTF-8 -> UCS4). */ 657 memset (&cur_state, 0, sizeof (cur_state)); 658 mbclen = mbrtowc (&wc2, (const char *) p, mlen, 659 &cur_state); 660 if (raw + offset - p <= mbclen && mbclen < (size_t) -2) 661 { 662 memset (&pstr->cur_state, '\0', 663 sizeof (mbstate_t)); 664 pstr->valid_len = mbclen - (raw + offset - p); 665 wc = wc2; 666 } 667 break; 668 } 669 } 670 671 if (wc == WEOF) 672 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx; 673 if (BE (pstr->valid_len, 0)) 674 { 675 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx) 676 pstr->wcs[wcs_idx] = WEOF; 677 if (pstr->mbs_allocated) 678 memset (pstr->mbs, -1, pstr->valid_len); 679 } 680 pstr->valid_raw_len = pstr->valid_len; 681 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0) 682 && IS_WIDE_WORD_CHAR (wc)) 683 ? CONTEXT_WORD 684 : ((IS_WIDE_NEWLINE (wc) 685 && pstr->newline_anchor) 686 ? CONTEXT_NEWLINE : 0)); 687 } 688 else 689 #endif /* RE_ENABLE_I18N */ 690 { 691 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1]; 692 if (pstr->trans) 693 c = pstr->trans[c]; 694 pstr->tip_context = (bitset_contain (pstr->word_char, c) 695 ? CONTEXT_WORD 696 : ((IS_NEWLINE (c) && pstr->newline_anchor) 697 ? CONTEXT_NEWLINE : 0)); 698 } 699 } 700 if (!BE (pstr->mbs_allocated, 0)) 701 pstr->mbs += offset; 702 } 703 pstr->raw_mbs_idx = idx; 704 pstr->len -= offset; 705 pstr->stop -= offset; 706 707 /* Then build the buffers. */ 708 #ifdef RE_ENABLE_I18N 709 if (pstr->mb_cur_max > 1) 710 { 711 if (pstr->icase) 712 { 713 reg_errcode_t ret = build_wcs_upper_buffer (pstr); 714 if (BE (ret != REG_NOERROR, 0)) 715 return ret; 716 } 717 else 718 build_wcs_buffer (pstr); 719 } 720 else 721 #endif /* RE_ENABLE_I18N */ 722 if (BE (pstr->mbs_allocated, 0)) 723 { 724 if (pstr->icase) 725 build_upper_buffer (pstr); 726 else if (pstr->trans != NULL) 727 re_string_translate_buffer (pstr); 728 } 729 else 730 pstr->valid_len = pstr->len; 731 732 pstr->cur_idx = 0; 733 return REG_NOERROR; 734 } 735 736 static unsigned char 737 internal_function __attribute ((pure)) 738 re_string_peek_byte_case (const re_string_t *pstr, Idx idx) 739 { 740 int ch; 741 Idx off; 742 743 /* Handle the common (easiest) cases first. */ 744 if (BE (!pstr->mbs_allocated, 1)) 745 return re_string_peek_byte (pstr, idx); 746 747 #ifdef RE_ENABLE_I18N 748 if (pstr->mb_cur_max > 1 749 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx)) 750 return re_string_peek_byte (pstr, idx); 751 #endif 752 753 off = pstr->cur_idx + idx; 754 #ifdef RE_ENABLE_I18N 755 if (pstr->offsets_needed) 756 off = pstr->offsets[off]; 757 #endif 758 759 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off]; 760 761 #ifdef RE_ENABLE_I18N 762 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I 763 this function returns CAPITAL LETTER I instead of first byte of 764 DOTLESS SMALL LETTER I. The latter would confuse the parser, 765 since peek_byte_case doesn't advance cur_idx in any way. */ 766 if (pstr->offsets_needed && !isascii (ch)) 767 return re_string_peek_byte (pstr, idx); 768 #endif 769 770 return ch; 771 } 772 773 static unsigned char 774 internal_function __attribute ((pure)) 775 re_string_fetch_byte_case (re_string_t *pstr) 776 { 777 if (BE (!pstr->mbs_allocated, 1)) 778 return re_string_fetch_byte (pstr); 779 780 #ifdef RE_ENABLE_I18N 781 if (pstr->offsets_needed) 782 { 783 Idx off; 784 int ch; 785 786 /* For tr_TR.UTF-8 [[:islower:]] there is 787 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip 788 in that case the whole multi-byte character and return 789 the original letter. On the other side, with 790 [[: DOTLESS SMALL LETTER I return [[:I, as doing 791 anything else would complicate things too much. */ 792 793 if (!re_string_first_byte (pstr, pstr->cur_idx)) 794 return re_string_fetch_byte (pstr); 795 796 off = pstr->offsets[pstr->cur_idx]; 797 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off]; 798 799 if (! isascii (ch)) 800 return re_string_fetch_byte (pstr); 801 802 re_string_skip_bytes (pstr, 803 re_string_char_size_at (pstr, pstr->cur_idx)); 804 return ch; 805 } 806 #endif 807 808 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++]; 809 } 810 811 static void 812 internal_function 813 re_string_destruct (re_string_t *pstr) 814 { 815 #ifdef RE_ENABLE_I18N 816 re_free (pstr->wcs); 817 re_free (pstr->offsets); 818 #endif /* RE_ENABLE_I18N */ 819 if (pstr->mbs_allocated) 820 re_free (pstr->mbs); 821 } 822 823 /* Return the context at IDX in INPUT. */ 824 825 static unsigned int 826 internal_function 827 re_string_context_at (const re_string_t *input, Idx idx, int eflags) 828 { 829 int c; 830 if (BE (! REG_VALID_INDEX (idx), 0)) 831 /* In this case, we use the value stored in input->tip_context, 832 since we can't know the character in input->mbs[-1] here. */ 833 return input->tip_context; 834 if (BE (idx == input->len, 0)) 835 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF 836 : CONTEXT_NEWLINE | CONTEXT_ENDBUF); 837 #ifdef RE_ENABLE_I18N 838 if (input->mb_cur_max > 1) 839 { 840 wint_t wc; 841 Idx wc_idx = idx; 842 while(input->wcs[wc_idx] == WEOF) 843 { 844 #ifdef DEBUG 845 /* It must not happen. */ 846 assert (REG_VALID_INDEX (wc_idx)); 847 #endif 848 --wc_idx; 849 if (! REG_VALID_INDEX (wc_idx)) 850 return input->tip_context; 851 } 852 wc = input->wcs[wc_idx]; 853 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc)) 854 return CONTEXT_WORD; 855 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor 856 ? CONTEXT_NEWLINE : 0); 857 } 858 else 859 #endif 860 { 861 c = re_string_byte_at (input, idx); 862 if (bitset_contain (input->word_char, c)) 863 return CONTEXT_WORD; 864 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0; 865 } 866 } 867 868 /* Functions for set operation. */ 869 870 static reg_errcode_t 871 internal_function 872 re_node_set_alloc (re_node_set *set, Idx size) 873 { 874 set->alloc = size; 875 set->nelem = 0; 876 set->elems = re_xmalloc (Idx, size); 877 if (BE (set->elems == NULL, 0)) 878 return REG_ESPACE; 879 return REG_NOERROR; 880 } 881 882 static reg_errcode_t 883 internal_function 884 re_node_set_init_1 (re_node_set *set, Idx elem) 885 { 886 set->alloc = 1; 887 set->nelem = 1; 888 set->elems = re_malloc (Idx, 1); 889 if (BE (set->elems == NULL, 0)) 890 { 891 set->alloc = set->nelem = 0; 892 return REG_ESPACE; 893 } 894 set->elems[0] = elem; 895 return REG_NOERROR; 896 } 897 898 static reg_errcode_t 899 internal_function 900 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2) 901 { 902 set->alloc = 2; 903 set->elems = re_malloc (Idx, 2); 904 if (BE (set->elems == NULL, 0)) 905 return REG_ESPACE; 906 if (elem1 == elem2) 907 { 908 set->nelem = 1; 909 set->elems[0] = elem1; 910 } 911 else 912 { 913 set->nelem = 2; 914 if (elem1 < elem2) 915 { 916 set->elems[0] = elem1; 917 set->elems[1] = elem2; 918 } 919 else 920 { 921 set->elems[0] = elem2; 922 set->elems[1] = elem1; 923 } 924 } 925 return REG_NOERROR; 926 } 927 928 static reg_errcode_t 929 internal_function 930 re_node_set_init_copy (re_node_set *dest, const re_node_set *src) 931 { 932 dest->nelem = src->nelem; 933 if (src->nelem > 0) 934 { 935 dest->alloc = dest->nelem; 936 dest->elems = re_malloc (Idx, dest->alloc); 937 if (BE (dest->elems == NULL, 0)) 938 { 939 dest->alloc = dest->nelem = 0; 940 return REG_ESPACE; 941 } 942 memcpy (dest->elems, src->elems, src->nelem * sizeof dest->elems[0]); 943 } 944 else 945 re_node_set_init_empty (dest); 946 return REG_NOERROR; 947 } 948 949 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to 950 DEST. Return value indicate the error code or REG_NOERROR if succeeded. 951 Note: We assume dest->elems is NULL, when dest->alloc is 0. */ 952 953 static reg_errcode_t 954 internal_function 955 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1, 956 const re_node_set *src2) 957 { 958 Idx i1, i2, is, id, delta, sbase; 959 if (src1->nelem == 0 || src2->nelem == 0) 960 return REG_NOERROR; 961 962 /* We need dest->nelem + 2 * elems_in_intersection; this is a 963 conservative estimate. */ 964 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc) 965 { 966 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc; 967 Idx *new_elems; 968 if (sizeof (Idx) < 3 969 && (new_alloc < dest->alloc 970 || ((Idx) (src1->nelem + src2->nelem) < src1->nelem))) 971 return REG_ESPACE; 972 new_elems = re_xrealloc (dest->elems, Idx, new_alloc); 973 if (BE (new_elems == NULL, 0)) 974 return REG_ESPACE; 975 dest->elems = new_elems; 976 dest->alloc = new_alloc; 977 } 978 979 /* Find the items in the intersection of SRC1 and SRC2, and copy 980 into the top of DEST those that are not already in DEST itself. */ 981 sbase = dest->nelem + src1->nelem + src2->nelem; 982 i1 = src1->nelem - 1; 983 i2 = src2->nelem - 1; 984 id = dest->nelem - 1; 985 for (;;) 986 { 987 if (src1->elems[i1] == src2->elems[i2]) 988 { 989 /* Try to find the item in DEST. Maybe we could binary search? */ 990 while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1]) 991 --id; 992 993 if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1]) 994 dest->elems[--sbase] = src1->elems[i1]; 995 996 if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2)) 997 break; 998 } 999 1000 /* Lower the highest of the two items. */ 1001 else if (src1->elems[i1] < src2->elems[i2]) 1002 { 1003 if (! REG_VALID_INDEX (--i2)) 1004 break; 1005 } 1006 else 1007 { 1008 if (! REG_VALID_INDEX (--i1)) 1009 break; 1010 } 1011 } 1012 1013 id = dest->nelem - 1; 1014 is = dest->nelem + src1->nelem + src2->nelem - 1; 1015 delta = is - sbase + 1; 1016 1017 /* Now copy. When DELTA becomes zero, the remaining 1018 DEST elements are already in place; this is more or 1019 less the same loop that is in re_node_set_merge. */ 1020 dest->nelem += delta; 1021 if (delta > 0 && REG_VALID_INDEX (id)) 1022 for (;;) 1023 { 1024 if (dest->elems[is] > dest->elems[id]) 1025 { 1026 /* Copy from the top. */ 1027 dest->elems[id + delta--] = dest->elems[is--]; 1028 if (delta == 0) 1029 break; 1030 } 1031 else 1032 { 1033 /* Slide from the bottom. */ 1034 dest->elems[id + delta] = dest->elems[id]; 1035 if (! REG_VALID_INDEX (--id)) 1036 break; 1037 } 1038 } 1039 1040 /* Copy remaining SRC elements. */ 1041 memcpy (dest->elems, dest->elems + sbase, delta * sizeof dest->elems[0]); 1042 1043 return REG_NOERROR; 1044 } 1045 1046 /* Calculate the union set of the sets SRC1 and SRC2. And store it to 1047 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */ 1048 1049 static reg_errcode_t 1050 internal_function 1051 re_node_set_init_union (re_node_set *dest, const re_node_set *src1, 1052 const re_node_set *src2) 1053 { 1054 Idx i1, i2, id; 1055 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0) 1056 { 1057 dest->alloc = src1->nelem + src2->nelem; 1058 if (sizeof (Idx) < 2 && dest->alloc < src1->nelem) 1059 return REG_ESPACE; 1060 dest->elems = re_xmalloc (Idx, dest->alloc); 1061 if (BE (dest->elems == NULL, 0)) 1062 return REG_ESPACE; 1063 } 1064 else 1065 { 1066 if (src1 != NULL && src1->nelem > 0) 1067 return re_node_set_init_copy (dest, src1); 1068 else if (src2 != NULL && src2->nelem > 0) 1069 return re_node_set_init_copy (dest, src2); 1070 else 1071 re_node_set_init_empty (dest); 1072 return REG_NOERROR; 1073 } 1074 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;) 1075 { 1076 if (src1->elems[i1] > src2->elems[i2]) 1077 { 1078 dest->elems[id++] = src2->elems[i2++]; 1079 continue; 1080 } 1081 if (src1->elems[i1] == src2->elems[i2]) 1082 ++i2; 1083 dest->elems[id++] = src1->elems[i1++]; 1084 } 1085 if (i1 < src1->nelem) 1086 { 1087 memcpy (dest->elems + id, src1->elems + i1, 1088 (src1->nelem - i1) * sizeof dest->elems[0]); 1089 id += src1->nelem - i1; 1090 } 1091 else if (i2 < src2->nelem) 1092 { 1093 memcpy (dest->elems + id, src2->elems + i2, 1094 (src2->nelem - i2) * sizeof dest->elems[0]); 1095 id += src2->nelem - i2; 1096 } 1097 dest->nelem = id; 1098 return REG_NOERROR; 1099 } 1100 1101 /* Calculate the union set of the sets DEST and SRC. And store it to 1102 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */ 1103 1104 static reg_errcode_t 1105 internal_function 1106 re_node_set_merge (re_node_set *dest, const re_node_set *src) 1107 { 1108 Idx is, id, sbase, delta; 1109 if (src == NULL || src->nelem == 0) 1110 return REG_NOERROR; 1111 if (sizeof (Idx) < 3 1112 && ((Idx) (2 * src->nelem) < src->nelem 1113 || (Idx) (2 * src->nelem + dest->nelem) < dest->nelem)) 1114 return REG_ESPACE; 1115 if (dest->alloc < 2 * src->nelem + dest->nelem) 1116 { 1117 Idx new_alloc = src->nelem + dest->alloc; 1118 Idx *new_buffer; 1119 if (sizeof (Idx) < 4 && new_alloc < dest->alloc) 1120 return REG_ESPACE; 1121 new_buffer = re_x2realloc (dest->elems, Idx, &new_alloc); 1122 if (BE (new_buffer == NULL, 0)) 1123 return REG_ESPACE; 1124 dest->elems = new_buffer; 1125 dest->alloc = new_alloc; 1126 } 1127 1128 if (BE (dest->nelem == 0, 0)) 1129 { 1130 dest->nelem = src->nelem; 1131 memcpy (dest->elems, src->elems, src->nelem * sizeof dest->elems[0]); 1132 return REG_NOERROR; 1133 } 1134 1135 /* Copy into the top of DEST the items of SRC that are not 1136 found in DEST. Maybe we could binary search in DEST? */ 1137 for (sbase = dest->nelem + 2 * src->nelem, 1138 is = src->nelem - 1, id = dest->nelem - 1; 1139 REG_VALID_INDEX (is) && REG_VALID_INDEX (id); ) 1140 { 1141 if (dest->elems[id] == src->elems[is]) 1142 is--, id--; 1143 else if (dest->elems[id] < src->elems[is]) 1144 dest->elems[--sbase] = src->elems[is--]; 1145 else /* if (dest->elems[id] > src->elems[is]) */ 1146 --id; 1147 } 1148 1149 if (REG_VALID_INDEX (is)) 1150 { 1151 /* If DEST is exhausted, the remaining items of SRC must be unique. */ 1152 sbase -= is + 1; 1153 memcpy (dest->elems + sbase, src->elems, 1154 (is + 1) * sizeof dest->elems[0]); 1155 } 1156 1157 id = dest->nelem - 1; 1158 is = dest->nelem + 2 * src->nelem - 1; 1159 delta = is - sbase + 1; 1160 if (delta == 0) 1161 return REG_NOERROR; 1162 1163 /* Now copy. When DELTA becomes zero, the remaining 1164 DEST elements are already in place. */ 1165 dest->nelem += delta; 1166 for (;;) 1167 { 1168 if (dest->elems[is] > dest->elems[id]) 1169 { 1170 /* Copy from the top. */ 1171 dest->elems[id + delta--] = dest->elems[is--]; 1172 if (delta == 0) 1173 break; 1174 } 1175 else 1176 { 1177 /* Slide from the bottom. */ 1178 dest->elems[id + delta] = dest->elems[id]; 1179 if (! REG_VALID_INDEX (--id)) 1180 { 1181 /* Copy remaining SRC elements. */ 1182 memcpy (dest->elems, dest->elems + sbase, 1183 delta * sizeof dest->elems[0]); 1184 break; 1185 } 1186 } 1187 } 1188 1189 return REG_NOERROR; 1190 } 1191 1192 /* Insert the new element ELEM to the re_node_set* SET. 1193 SET should not already have ELEM. 1194 Return true if successful. */ 1195 1196 static bool 1197 internal_function 1198 re_node_set_insert (re_node_set *set, Idx elem) 1199 { 1200 Idx idx; 1201 /* In case the set is empty. */ 1202 if (set->alloc == 0) 1203 return re_node_set_init_1 (set, elem) == REG_NOERROR; 1204 1205 if (BE (set->nelem, 0) == 0) 1206 { 1207 /* We already guaranteed above that set->alloc != 0. */ 1208 set->elems[0] = elem; 1209 ++set->nelem; 1210 return true; 1211 } 1212 1213 /* Realloc if we need. */ 1214 if (set->alloc == set->nelem) 1215 { 1216 Idx *new_elems = re_x2realloc (set->elems, Idx, &set->alloc); 1217 if (BE (new_elems == NULL, 0)) 1218 return false; 1219 set->elems = new_elems; 1220 } 1221 1222 /* Move the elements which follows the new element. Test the 1223 first element separately to skip a check in the inner loop. */ 1224 if (elem < set->elems[0]) 1225 { 1226 idx = 0; 1227 for (idx = set->nelem; idx > 0; idx--) 1228 set->elems[idx] = set->elems[idx - 1]; 1229 } 1230 else 1231 { 1232 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--) 1233 set->elems[idx] = set->elems[idx - 1]; 1234 } 1235 1236 /* Insert the new element. */ 1237 set->elems[idx] = elem; 1238 ++set->nelem; 1239 return true; 1240 } 1241 1242 /* Insert the new element ELEM to the re_node_set* SET. 1243 SET should not already have any element greater than or equal to ELEM. 1244 Return true if successful. */ 1245 1246 static bool 1247 internal_function 1248 re_node_set_insert_last (re_node_set *set, Idx elem) 1249 { 1250 /* Realloc if we need. */ 1251 if (set->alloc == set->nelem) 1252 { 1253 Idx *new_elems; 1254 new_elems = re_x2realloc (set->elems, Idx, &set->alloc); 1255 if (BE (new_elems == NULL, 0)) 1256 return false; 1257 set->elems = new_elems; 1258 } 1259 1260 /* Insert the new element. */ 1261 set->elems[set->nelem++] = elem; 1262 return true; 1263 } 1264 1265 /* Compare two node sets SET1 and SET2. 1266 Return true if SET1 and SET2 are equivalent. */ 1267 1268 static bool 1269 internal_function __attribute ((pure)) 1270 re_node_set_compare (const re_node_set *set1, const re_node_set *set2) 1271 { 1272 Idx i; 1273 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem) 1274 return false; 1275 for (i = set1->nelem ; REG_VALID_INDEX (--i) ; ) 1276 if (set1->elems[i] != set2->elems[i]) 1277 return false; 1278 return true; 1279 } 1280 1281 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */ 1282 1283 static Idx 1284 internal_function __attribute ((pure)) 1285 re_node_set_contains (const re_node_set *set, Idx elem) 1286 { 1287 __re_size_t idx, right, mid; 1288 if (! REG_VALID_NONZERO_INDEX (set->nelem)) 1289 return 0; 1290 1291 /* Binary search the element. */ 1292 idx = 0; 1293 right = set->nelem - 1; 1294 while (idx < right) 1295 { 1296 mid = (idx + right) / 2; 1297 if (set->elems[mid] < elem) 1298 idx = mid + 1; 1299 else 1300 right = mid; 1301 } 1302 return set->elems[idx] == elem ? idx + 1 : 0; 1303 } 1304 1305 static void 1306 internal_function 1307 re_node_set_remove_at (re_node_set *set, Idx idx) 1308 { 1309 if (idx < 0 || idx >= set->nelem) 1310 return; 1311 --set->nelem; 1312 for (; idx < set->nelem; idx++) 1313 set->elems[idx] = set->elems[idx + 1]; 1314 } 1315 1316 1317 /* Add the token TOKEN to dfa->nodes, and return the index of the token. 1318 Or return REG_MISSING if an error occurred. */ 1319 1320 static Idx 1321 internal_function 1322 re_dfa_add_node (re_dfa_t *dfa, re_token_t token) 1323 { 1324 int type = token.type; 1325 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0)) 1326 { 1327 Idx new_nodes_alloc = dfa->nodes_alloc; 1328 Idx *new_nexts, *new_indices; 1329 re_node_set *new_edests, *new_eclosures; 1330 1331 re_token_t *new_nodes = re_x2realloc (dfa->nodes, re_token_t, 1332 &new_nodes_alloc); 1333 if (BE (new_nodes == NULL, 0)) 1334 return REG_MISSING; 1335 dfa->nodes = new_nodes; 1336 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc); 1337 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc); 1338 new_edests = re_xrealloc (dfa->edests, re_node_set, new_nodes_alloc); 1339 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc); 1340 if (BE (new_nexts == NULL || new_indices == NULL 1341 || new_edests == NULL || new_eclosures == NULL, 0)) 1342 return REG_MISSING; 1343 dfa->nexts = new_nexts; 1344 dfa->org_indices = new_indices; 1345 dfa->edests = new_edests; 1346 dfa->eclosures = new_eclosures; 1347 dfa->nodes_alloc = new_nodes_alloc; 1348 } 1349 dfa->nodes[dfa->nodes_len] = token; 1350 dfa->nodes[dfa->nodes_len].constraint = 0; 1351 #ifdef RE_ENABLE_I18N 1352 dfa->nodes[dfa->nodes_len].accept_mb = 1353 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET; 1354 #endif 1355 dfa->nexts[dfa->nodes_len] = REG_MISSING; 1356 re_node_set_init_empty (dfa->edests + dfa->nodes_len); 1357 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len); 1358 return dfa->nodes_len++; 1359 } 1360 1361 static inline re_hashval_t 1362 internal_function 1363 calc_state_hash (const re_node_set *nodes, unsigned int context) 1364 { 1365 re_hashval_t hash = nodes->nelem + context; 1366 Idx i; 1367 for (i = 0 ; i < nodes->nelem ; i++) 1368 hash += nodes->elems[i]; 1369 return hash; 1370 } 1371 1372 /* Search for the state whose node_set is equivalent to NODES. 1373 Return the pointer to the state, if we found it in the DFA. 1374 Otherwise create the new one and return it. In case of an error 1375 return NULL and set the error code in ERR. 1376 Note: - We assume NULL as the invalid state, then it is possible that 1377 return value is NULL and ERR is REG_NOERROR. 1378 - We never return non-NULL value in case of any errors, it is for 1379 optimization. */ 1380 1381 static re_dfastate_t* 1382 internal_function 1383 re_acquire_state (reg_errcode_t *err, re_dfa_t *dfa, const re_node_set *nodes) 1384 { 1385 re_hashval_t hash; 1386 re_dfastate_t *new_state; 1387 struct re_state_table_entry *spot; 1388 Idx i; 1389 #ifdef lint 1390 /* Suppress bogus uninitialized-variable warnings. */ 1391 *err = REG_NOERROR; 1392 #endif 1393 if (BE (nodes->nelem == 0, 0)) 1394 { 1395 *err = REG_NOERROR; 1396 return NULL; 1397 } 1398 hash = calc_state_hash (nodes, 0); 1399 spot = dfa->state_table + (hash & dfa->state_hash_mask); 1400 1401 for (i = 0 ; i < spot->num ; i++) 1402 { 1403 re_dfastate_t *state = spot->array[i]; 1404 if (hash != state->hash) 1405 continue; 1406 if (re_node_set_compare (&state->nodes, nodes)) 1407 return state; 1408 } 1409 1410 /* There are no appropriate state in the dfa, create the new one. */ 1411 new_state = create_ci_newstate (dfa, nodes, hash); 1412 if (BE (new_state != NULL, 1)) 1413 return new_state; 1414 else 1415 { 1416 *err = REG_ESPACE; 1417 return NULL; 1418 } 1419 } 1420 1421 /* Search for the state whose node_set is equivalent to NODES and 1422 whose context is equivalent to CONTEXT. 1423 Return the pointer to the state, if we found it in the DFA. 1424 Otherwise create the new one and return it. In case of an error 1425 return NULL and set the error code in ERR. 1426 Note: - We assume NULL as the invalid state, then it is possible that 1427 return value is NULL and ERR is REG_NOERROR. 1428 - We never return non-NULL value in case of any errors, it is for 1429 optimization. */ 1430 1431 static re_dfastate_t* 1432 internal_function 1433 re_acquire_state_context (reg_errcode_t *err, re_dfa_t *dfa, 1434 const re_node_set *nodes, unsigned int context) 1435 { 1436 re_hashval_t hash; 1437 re_dfastate_t *new_state; 1438 struct re_state_table_entry *spot; 1439 Idx i; 1440 #ifdef lint 1441 /* Suppress bogus uninitialized-variable warnings. */ 1442 *err = REG_NOERROR; 1443 #endif 1444 if (nodes->nelem == 0) 1445 { 1446 *err = REG_NOERROR; 1447 return NULL; 1448 } 1449 hash = calc_state_hash (nodes, context); 1450 spot = dfa->state_table + (hash & dfa->state_hash_mask); 1451 1452 for (i = 0 ; i < spot->num ; i++) 1453 { 1454 re_dfastate_t *state = spot->array[i]; 1455 if (state->hash == hash 1456 && state->context == context 1457 && re_node_set_compare (state->entrance_nodes, nodes)) 1458 return state; 1459 } 1460 /* There are no appropriate state in `dfa', create the new one. */ 1461 new_state = create_cd_newstate (dfa, nodes, context, hash); 1462 if (BE (new_state != NULL, 1)) 1463 return new_state; 1464 else 1465 { 1466 *err = REG_ESPACE; 1467 return NULL; 1468 } 1469 } 1470 1471 /* Finish initialization of the new state NEWSTATE, and using its hash value 1472 HASH put in the appropriate bucket of DFA's state table. Return value 1473 indicates the error code if failed. */ 1474 1475 static reg_errcode_t 1476 internal_function 1477 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate, re_hashval_t hash) 1478 { 1479 struct re_state_table_entry *spot; 1480 reg_errcode_t err; 1481 Idx i; 1482 1483 newstate->hash = hash; 1484 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem); 1485 if (BE (err != REG_NOERROR, 0)) 1486 return REG_ESPACE; 1487 for (i = 0; i < newstate->nodes.nelem; i++) 1488 { 1489 Idx elem = newstate->nodes.elems[i]; 1490 if (!IS_EPSILON_NODE (dfa->nodes[elem].type)) 1491 { 1492 bool ok = re_node_set_insert_last (&newstate->non_eps_nodes, elem); 1493 if (BE (! ok, 0)) 1494 return REG_ESPACE; 1495 } 1496 } 1497 1498 spot = dfa->state_table + (hash & dfa->state_hash_mask); 1499 if (BE (spot->alloc <= spot->num, 0)) 1500 { 1501 Idx new_alloc = spot->num; 1502 re_dfastate_t **new_array = re_x2realloc (spot->array, re_dfastate_t *, 1503 &new_alloc); 1504 if (BE (new_array == NULL, 0)) 1505 return REG_ESPACE; 1506 spot->array = new_array; 1507 spot->alloc = new_alloc; 1508 } 1509 spot->array[spot->num++] = newstate; 1510 return REG_NOERROR; 1511 } 1512 1513 /* Create the new state which is independ of contexts. 1514 Return the new state if succeeded, otherwise return NULL. */ 1515 1516 static re_dfastate_t * 1517 internal_function 1518 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes, 1519 re_hashval_t hash) 1520 { 1521 Idx i; 1522 reg_errcode_t err; 1523 re_dfastate_t *newstate; 1524 1525 newstate = re_calloc (re_dfastate_t, 1); 1526 if (BE (newstate == NULL, 0)) 1527 return NULL; 1528 err = re_node_set_init_copy (&newstate->nodes, nodes); 1529 if (BE (err != REG_NOERROR, 0)) 1530 { 1531 re_free (newstate); 1532 return NULL; 1533 } 1534 1535 newstate->entrance_nodes = &newstate->nodes; 1536 for (i = 0 ; i < nodes->nelem ; i++) 1537 { 1538 re_token_t *node = dfa->nodes + nodes->elems[i]; 1539 re_token_type_t type = node->type; 1540 if (type == CHARACTER && !node->constraint) 1541 continue; 1542 #ifdef RE_ENABLE_I18N 1543 newstate->accept_mb |= node->accept_mb; 1544 #endif /* RE_ENABLE_I18N */ 1545 1546 /* If the state has the halt node, the state is a halt state. */ 1547 if (type == END_OF_RE) 1548 newstate->halt = 1; 1549 else if (type == OP_BACK_REF) 1550 newstate->has_backref = 1; 1551 else if (type == ANCHOR || node->constraint) 1552 newstate->has_constraint = 1; 1553 } 1554 err = register_state (dfa, newstate, hash); 1555 if (BE (err != REG_NOERROR, 0)) 1556 { 1557 free_state (newstate); 1558 newstate = NULL; 1559 } 1560 return newstate; 1561 } 1562 1563 /* Create the new state which is depend on the context CONTEXT. 1564 Return the new state if succeeded, otherwise return NULL. */ 1565 1566 static re_dfastate_t * 1567 internal_function 1568 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes, 1569 unsigned int context, re_hashval_t hash) 1570 { 1571 Idx i, nctx_nodes = 0; 1572 reg_errcode_t err; 1573 re_dfastate_t *newstate; 1574 1575 newstate = re_calloc (re_dfastate_t, 1); 1576 if (BE (newstate == NULL, 0)) 1577 return NULL; 1578 err = re_node_set_init_copy (&newstate->nodes, nodes); 1579 if (BE (err != REG_NOERROR, 0)) 1580 { 1581 re_free (newstate); 1582 return NULL; 1583 } 1584 1585 newstate->context = context; 1586 newstate->entrance_nodes = &newstate->nodes; 1587 1588 for (i = 0 ; i < nodes->nelem ; i++) 1589 { 1590 unsigned int constraint = 0; 1591 re_token_t *node = dfa->nodes + nodes->elems[i]; 1592 re_token_type_t type = node->type; 1593 if (node->constraint) 1594 constraint = node->constraint; 1595 1596 if (type == CHARACTER && !constraint) 1597 continue; 1598 #ifdef RE_ENABLE_I18N 1599 newstate->accept_mb |= node->accept_mb; 1600 #endif /* RE_ENABLE_I18N */ 1601 1602 /* If the state has the halt node, the state is a halt state. */ 1603 if (type == END_OF_RE) 1604 newstate->halt = 1; 1605 else if (type == OP_BACK_REF) 1606 newstate->has_backref = 1; 1607 else if (type == ANCHOR) 1608 constraint = node->opr.ctx_type; 1609 1610 if (constraint) 1611 { 1612 if (newstate->entrance_nodes == &newstate->nodes) 1613 { 1614 newstate->entrance_nodes = re_malloc (re_node_set, 1); 1615 if (BE (newstate->entrance_nodes == NULL, 0)) 1616 { 1617 free_state (newstate); 1618 return NULL; 1619 } 1620 re_node_set_init_copy (newstate->entrance_nodes, nodes); 1621 nctx_nodes = 0; 1622 newstate->has_constraint = 1; 1623 } 1624 1625 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context)) 1626 { 1627 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes); 1628 ++nctx_nodes; 1629 } 1630 } 1631 } 1632 err = register_state (dfa, newstate, hash); 1633 if (BE (err != REG_NOERROR, 0)) 1634 { 1635 free_state (newstate); 1636 newstate = NULL; 1637 } 1638 return newstate; 1639 } 1640 1641 static void 1642 internal_function 1643 free_state (re_dfastate_t *state) 1644 { 1645 re_node_set_free (&state->non_eps_nodes); 1646 re_node_set_free (&state->inveclosure); 1647 if (state->entrance_nodes != &state->nodes) 1648 { 1649 re_node_set_free (state->entrance_nodes); 1650 re_free (state->entrance_nodes); 1651 } 1652 re_node_set_free (&state->nodes); 1653 re_free (state->word_trtable); 1654 re_free (state->trtable); 1655 re_free (state); 1656 } 1657