1 /* Extended regular expression matching and search library. 2 Copyright (C) 2002-2012 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 5 6 This program is free software; you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 3, or (at your option) 9 any later version. 10 11 This program is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 You should have received a copy of the GNU General Public License along 17 with this program; if not, see <http://www.gnu.org/licenses/>. */ 18 19 #include "verify.h" 20 #include "intprops.h" 21 static void re_string_construct_common (const char *str, Idx len, 22 re_string_t *pstr, 23 RE_TRANSLATE_TYPE trans, bool icase, 24 const re_dfa_t *dfa) internal_function; 25 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa, 26 const re_node_set *nodes, 27 re_hashval_t hash) internal_function; 28 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa, 29 const re_node_set *nodes, 30 unsigned int context, 31 re_hashval_t hash) internal_function; 32 33 /* Functions for string operation. */ 34 35 /* This function allocate the buffers. It is necessary to call 36 re_string_reconstruct before using the object. */ 37 38 static reg_errcode_t 39 internal_function __attribute_warn_unused_result__ 40 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len, 41 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa) 42 { 43 reg_errcode_t ret; 44 Idx init_buf_len; 45 46 /* Ensure at least one character fits into the buffers. */ 47 if (init_len < dfa->mb_cur_max) 48 init_len = dfa->mb_cur_max; 49 init_buf_len = (len + 1 < init_len) ? len + 1: init_len; 50 re_string_construct_common (str, len, pstr, trans, icase, dfa); 51 52 ret = re_string_realloc_buffers (pstr, init_buf_len); 53 if (BE (ret != REG_NOERROR, 0)) 54 return ret; 55 56 pstr->word_char = dfa->word_char; 57 pstr->word_ops_used = dfa->word_ops_used; 58 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str; 59 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len; 60 pstr->valid_raw_len = pstr->valid_len; 61 return REG_NOERROR; 62 } 63 64 /* This function allocate the buffers, and initialize them. */ 65 66 static reg_errcode_t 67 internal_function __attribute_warn_unused_result__ 68 re_string_construct (re_string_t *pstr, const char *str, Idx len, 69 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa) 70 { 71 reg_errcode_t ret; 72 memset (pstr, '\0', sizeof (re_string_t)); 73 re_string_construct_common (str, len, pstr, trans, icase, dfa); 74 75 if (len > 0) 76 { 77 ret = re_string_realloc_buffers (pstr, len + 1); 78 if (BE (ret != REG_NOERROR, 0)) 79 return ret; 80 } 81 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str; 82 83 if (icase) 84 { 85 #ifdef RE_ENABLE_I18N 86 if (dfa->mb_cur_max > 1) 87 { 88 while (1) 89 { 90 ret = build_wcs_upper_buffer (pstr); 91 if (BE (ret != REG_NOERROR, 0)) 92 return ret; 93 if (pstr->valid_raw_len >= len) 94 break; 95 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max) 96 break; 97 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2); 98 if (BE (ret != REG_NOERROR, 0)) 99 return ret; 100 } 101 } 102 else 103 #endif /* RE_ENABLE_I18N */ 104 build_upper_buffer (pstr); 105 } 106 else 107 { 108 #ifdef RE_ENABLE_I18N 109 if (dfa->mb_cur_max > 1) 110 build_wcs_buffer (pstr); 111 else 112 #endif /* RE_ENABLE_I18N */ 113 { 114 if (trans != NULL) 115 re_string_translate_buffer (pstr); 116 else 117 { 118 pstr->valid_len = pstr->bufs_len; 119 pstr->valid_raw_len = pstr->bufs_len; 120 } 121 } 122 } 123 124 return REG_NOERROR; 125 } 126 127 /* Helper functions for re_string_allocate, and re_string_construct. */ 128 129 static reg_errcode_t 130 internal_function __attribute_warn_unused_result__ 131 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len) 132 { 133 #ifdef RE_ENABLE_I18N 134 if (pstr->mb_cur_max > 1) 135 { 136 wint_t *new_wcs; 137 138 /* Avoid overflow in realloc. */ 139 const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx)); 140 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_buf_len, 0)) 141 return REG_ESPACE; 142 143 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len); 144 if (BE (new_wcs == NULL, 0)) 145 return REG_ESPACE; 146 pstr->wcs = new_wcs; 147 if (pstr->offsets != NULL) 148 { 149 Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len); 150 if (BE (new_offsets == NULL, 0)) 151 return REG_ESPACE; 152 pstr->offsets = new_offsets; 153 } 154 } 155 #endif /* RE_ENABLE_I18N */ 156 if (pstr->mbs_allocated) 157 { 158 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char, 159 new_buf_len); 160 if (BE (new_mbs == NULL, 0)) 161 return REG_ESPACE; 162 pstr->mbs = new_mbs; 163 } 164 pstr->bufs_len = new_buf_len; 165 return REG_NOERROR; 166 } 167 168 169 static void 170 internal_function 171 re_string_construct_common (const char *str, Idx len, re_string_t *pstr, 172 RE_TRANSLATE_TYPE trans, bool icase, 173 const re_dfa_t *dfa) 174 { 175 pstr->raw_mbs = (const unsigned char *) str; 176 pstr->len = len; 177 pstr->raw_len = len; 178 pstr->trans = trans; 179 pstr->icase = icase; 180 pstr->mbs_allocated = (trans != NULL || icase); 181 pstr->mb_cur_max = dfa->mb_cur_max; 182 pstr->is_utf8 = dfa->is_utf8; 183 pstr->map_notascii = dfa->map_notascii; 184 pstr->stop = pstr->len; 185 pstr->raw_stop = pstr->stop; 186 } 187 188 #ifdef RE_ENABLE_I18N 189 190 /* Build wide character buffer PSTR->WCS. 191 If the byte sequence of the string are: 192 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3> 193 Then wide character buffer will be: 194 <wc1> , WEOF , <wc2> , WEOF , <wc3> 195 We use WEOF for padding, they indicate that the position isn't 196 a first byte of a multibyte character. 197 198 Note that this function assumes PSTR->VALID_LEN elements are already 199 built and starts from PSTR->VALID_LEN. */ 200 201 static void 202 internal_function 203 build_wcs_buffer (re_string_t *pstr) 204 { 205 #ifdef _LIBC 206 unsigned char buf[MB_LEN_MAX]; 207 assert (MB_LEN_MAX >= pstr->mb_cur_max); 208 #else 209 unsigned char buf[64]; 210 #endif 211 mbstate_t prev_st; 212 Idx byte_idx, end_idx, remain_len; 213 size_t mbclen; 214 215 /* Build the buffers from pstr->valid_len to either pstr->len or 216 pstr->bufs_len. */ 217 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 218 for (byte_idx = pstr->valid_len; byte_idx < end_idx;) 219 { 220 wchar_t wc; 221 const char *p; 222 223 remain_len = end_idx - byte_idx; 224 prev_st = pstr->cur_state; 225 /* Apply the translation if we need. */ 226 if (BE (pstr->trans != NULL, 0)) 227 { 228 int i, ch; 229 230 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i) 231 { 232 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i]; 233 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch]; 234 } 235 p = (const char *) buf; 236 } 237 else 238 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx; 239 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state); 240 if (BE (mbclen == (size_t) -1 || mbclen == 0 241 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len), 0)) 242 { 243 /* We treat these cases as a singlebyte character. */ 244 mbclen = 1; 245 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; 246 if (BE (pstr->trans != NULL, 0)) 247 wc = pstr->trans[wc]; 248 pstr->cur_state = prev_st; 249 } 250 else if (BE (mbclen == (size_t) -2, 0)) 251 { 252 /* The buffer doesn't have enough space, finish to build. */ 253 pstr->cur_state = prev_st; 254 break; 255 } 256 257 /* Write wide character and padding. */ 258 pstr->wcs[byte_idx++] = wc; 259 /* Write paddings. */ 260 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 261 pstr->wcs[byte_idx++] = WEOF; 262 } 263 pstr->valid_len = byte_idx; 264 pstr->valid_raw_len = byte_idx; 265 } 266 267 /* Build wide character buffer PSTR->WCS like build_wcs_buffer, 268 but for REG_ICASE. */ 269 270 static reg_errcode_t 271 internal_function __attribute_warn_unused_result__ 272 build_wcs_upper_buffer (re_string_t *pstr) 273 { 274 mbstate_t prev_st; 275 Idx src_idx, byte_idx, end_idx, remain_len; 276 size_t mbclen; 277 #ifdef _LIBC 278 char buf[MB_LEN_MAX]; 279 assert (MB_LEN_MAX >= pstr->mb_cur_max); 280 #else 281 char buf[64]; 282 #endif 283 284 byte_idx = pstr->valid_len; 285 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 286 287 /* The following optimization assumes that ASCII characters can be 288 mapped to wide characters with a simple cast. */ 289 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed) 290 { 291 while (byte_idx < end_idx) 292 { 293 wchar_t wc; 294 295 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]) 296 && mbsinit (&pstr->cur_state)) 297 { 298 /* In case of a singlebyte character. */ 299 pstr->mbs[byte_idx] 300 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]); 301 /* The next step uses the assumption that wchar_t is encoded 302 ASCII-safe: all ASCII values can be converted like this. */ 303 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx]; 304 ++byte_idx; 305 continue; 306 } 307 308 remain_len = end_idx - byte_idx; 309 prev_st = pstr->cur_state; 310 mbclen = __mbrtowc (&wc, 311 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx 312 + byte_idx), remain_len, &pstr->cur_state); 313 if (BE (mbclen < (size_t) -2, 1)) 314 { 315 wchar_t wcu = wc; 316 if (iswlower (wc)) 317 { 318 size_t mbcdlen; 319 320 wcu = towupper (wc); 321 mbcdlen = wcrtomb (buf, wcu, &prev_st); 322 if (BE (mbclen == mbcdlen, 1)) 323 memcpy (pstr->mbs + byte_idx, buf, mbclen); 324 else 325 { 326 src_idx = byte_idx; 327 goto offsets_needed; 328 } 329 } 330 else 331 memcpy (pstr->mbs + byte_idx, 332 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen); 333 pstr->wcs[byte_idx++] = wcu; 334 /* Write paddings. */ 335 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 336 pstr->wcs[byte_idx++] = WEOF; 337 } 338 else if (mbclen == (size_t) -1 || mbclen == 0 339 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len)) 340 { 341 /* It is an invalid character, an incomplete character 342 at the end of the string, or '\0'. Just use the byte. */ 343 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; 344 pstr->mbs[byte_idx] = ch; 345 /* And also cast it to wide char. */ 346 pstr->wcs[byte_idx++] = (wchar_t) ch; 347 if (BE (mbclen == (size_t) -1, 0)) 348 pstr->cur_state = prev_st; 349 } 350 else 351 { 352 /* The buffer doesn't have enough space, finish to build. */ 353 pstr->cur_state = prev_st; 354 break; 355 } 356 } 357 pstr->valid_len = byte_idx; 358 pstr->valid_raw_len = byte_idx; 359 return REG_NOERROR; 360 } 361 else 362 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;) 363 { 364 wchar_t wc; 365 const char *p; 366 offsets_needed: 367 remain_len = end_idx - byte_idx; 368 prev_st = pstr->cur_state; 369 if (BE (pstr->trans != NULL, 0)) 370 { 371 int i, ch; 372 373 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i) 374 { 375 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i]; 376 buf[i] = pstr->trans[ch]; 377 } 378 p = (const char *) buf; 379 } 380 else 381 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx; 382 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state); 383 if (BE (mbclen < (size_t) -2, 1)) 384 { 385 wchar_t wcu = wc; 386 if (iswlower (wc)) 387 { 388 size_t mbcdlen; 389 390 wcu = towupper (wc); 391 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st); 392 if (BE (mbclen == mbcdlen, 1)) 393 memcpy (pstr->mbs + byte_idx, buf, mbclen); 394 else if (mbcdlen != (size_t) -1) 395 { 396 size_t i; 397 398 if (byte_idx + mbcdlen > pstr->bufs_len) 399 { 400 pstr->cur_state = prev_st; 401 break; 402 } 403 404 if (pstr->offsets == NULL) 405 { 406 pstr->offsets = re_malloc (Idx, pstr->bufs_len); 407 408 if (pstr->offsets == NULL) 409 return REG_ESPACE; 410 } 411 if (!pstr->offsets_needed) 412 { 413 for (i = 0; i < (size_t) byte_idx; ++i) 414 pstr->offsets[i] = i; 415 pstr->offsets_needed = 1; 416 } 417 418 memcpy (pstr->mbs + byte_idx, buf, mbcdlen); 419 pstr->wcs[byte_idx] = wcu; 420 pstr->offsets[byte_idx] = src_idx; 421 for (i = 1; i < mbcdlen; ++i) 422 { 423 pstr->offsets[byte_idx + i] 424 = src_idx + (i < mbclen ? i : mbclen - 1); 425 pstr->wcs[byte_idx + i] = WEOF; 426 } 427 pstr->len += mbcdlen - mbclen; 428 if (pstr->raw_stop > src_idx) 429 pstr->stop += mbcdlen - mbclen; 430 end_idx = (pstr->bufs_len > pstr->len) 431 ? pstr->len : pstr->bufs_len; 432 byte_idx += mbcdlen; 433 src_idx += mbclen; 434 continue; 435 } 436 else 437 memcpy (pstr->mbs + byte_idx, p, mbclen); 438 } 439 else 440 memcpy (pstr->mbs + byte_idx, p, mbclen); 441 442 if (BE (pstr->offsets_needed != 0, 0)) 443 { 444 size_t i; 445 for (i = 0; i < mbclen; ++i) 446 pstr->offsets[byte_idx + i] = src_idx + i; 447 } 448 src_idx += mbclen; 449 450 pstr->wcs[byte_idx++] = wcu; 451 /* Write paddings. */ 452 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 453 pstr->wcs[byte_idx++] = WEOF; 454 } 455 else if (mbclen == (size_t) -1 || mbclen == 0 456 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len)) 457 { 458 /* It is an invalid character or '\0'. Just use the byte. */ 459 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx]; 460 461 if (BE (pstr->trans != NULL, 0)) 462 ch = pstr->trans [ch]; 463 pstr->mbs[byte_idx] = ch; 464 465 if (BE (pstr->offsets_needed != 0, 0)) 466 pstr->offsets[byte_idx] = src_idx; 467 ++src_idx; 468 469 /* And also cast it to wide char. */ 470 pstr->wcs[byte_idx++] = (wchar_t) ch; 471 if (BE (mbclen == (size_t) -1, 0)) 472 pstr->cur_state = prev_st; 473 } 474 else 475 { 476 /* The buffer doesn't have enough space, finish to build. */ 477 pstr->cur_state = prev_st; 478 break; 479 } 480 } 481 pstr->valid_len = byte_idx; 482 pstr->valid_raw_len = src_idx; 483 return REG_NOERROR; 484 } 485 486 /* Skip characters until the index becomes greater than NEW_RAW_IDX. 487 Return the index. */ 488 489 static Idx 490 internal_function 491 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc) 492 { 493 mbstate_t prev_st; 494 Idx rawbuf_idx; 495 size_t mbclen; 496 wint_t wc = WEOF; 497 498 /* Skip the characters which are not necessary to check. */ 499 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len; 500 rawbuf_idx < new_raw_idx;) 501 { 502 wchar_t wc2; 503 Idx remain_len = pstr->raw_len - rawbuf_idx; 504 prev_st = pstr->cur_state; 505 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx, 506 remain_len, &pstr->cur_state); 507 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0)) 508 { 509 /* We treat these cases as a single byte character. */ 510 if (mbclen == 0 || remain_len == 0) 511 wc = L'\0'; 512 else 513 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx); 514 mbclen = 1; 515 pstr->cur_state = prev_st; 516 } 517 else 518 wc = wc2; 519 /* Then proceed the next character. */ 520 rawbuf_idx += mbclen; 521 } 522 *last_wc = wc; 523 return rawbuf_idx; 524 } 525 #endif /* RE_ENABLE_I18N */ 526 527 /* Build the buffer PSTR->MBS, and apply the translation if we need. 528 This function is used in case of REG_ICASE. */ 529 530 static void 531 internal_function 532 build_upper_buffer (re_string_t *pstr) 533 { 534 Idx char_idx, end_idx; 535 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 536 537 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx) 538 { 539 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx]; 540 if (BE (pstr->trans != NULL, 0)) 541 ch = pstr->trans[ch]; 542 if (islower (ch)) 543 pstr->mbs[char_idx] = toupper (ch); 544 else 545 pstr->mbs[char_idx] = ch; 546 } 547 pstr->valid_len = char_idx; 548 pstr->valid_raw_len = char_idx; 549 } 550 551 /* Apply TRANS to the buffer in PSTR. */ 552 553 static void 554 internal_function 555 re_string_translate_buffer (re_string_t *pstr) 556 { 557 Idx buf_idx, end_idx; 558 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 559 560 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx) 561 { 562 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx]; 563 pstr->mbs[buf_idx] = pstr->trans[ch]; 564 } 565 566 pstr->valid_len = buf_idx; 567 pstr->valid_raw_len = buf_idx; 568 } 569 570 /* This function re-construct the buffers. 571 Concretely, convert to wide character in case of pstr->mb_cur_max > 1, 572 convert to upper case in case of REG_ICASE, apply translation. */ 573 574 static reg_errcode_t 575 internal_function __attribute_warn_unused_result__ 576 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags) 577 { 578 Idx offset; 579 580 if (BE (pstr->raw_mbs_idx <= idx, 0)) 581 offset = idx - pstr->raw_mbs_idx; 582 else 583 { 584 /* Reset buffer. */ 585 #ifdef RE_ENABLE_I18N 586 if (pstr->mb_cur_max > 1) 587 memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); 588 #endif /* RE_ENABLE_I18N */ 589 pstr->len = pstr->raw_len; 590 pstr->stop = pstr->raw_stop; 591 pstr->valid_len = 0; 592 pstr->raw_mbs_idx = 0; 593 pstr->valid_raw_len = 0; 594 pstr->offsets_needed = 0; 595 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF 596 : CONTEXT_NEWLINE | CONTEXT_BEGBUF); 597 if (!pstr->mbs_allocated) 598 pstr->mbs = (unsigned char *) pstr->raw_mbs; 599 offset = idx; 600 } 601 602 if (BE (offset != 0, 1)) 603 { 604 /* Should the already checked characters be kept? */ 605 if (BE (offset < pstr->valid_raw_len, 1)) 606 { 607 /* Yes, move them to the front of the buffer. */ 608 #ifdef RE_ENABLE_I18N 609 if (BE (pstr->offsets_needed, 0)) 610 { 611 Idx low = 0, high = pstr->valid_len, mid; 612 do 613 { 614 mid = (high + low) / 2; 615 if (pstr->offsets[mid] > offset) 616 high = mid; 617 else if (pstr->offsets[mid] < offset) 618 low = mid + 1; 619 else 620 break; 621 } 622 while (low < high); 623 if (pstr->offsets[mid] < offset) 624 ++mid; 625 pstr->tip_context = re_string_context_at (pstr, mid - 1, 626 eflags); 627 /* This can be quite complicated, so handle specially 628 only the common and easy case where the character with 629 different length representation of lower and upper 630 case is present at or after offset. */ 631 if (pstr->valid_len > offset 632 && mid == offset && pstr->offsets[mid] == offset) 633 { 634 memmove (pstr->wcs, pstr->wcs + offset, 635 (pstr->valid_len - offset) * sizeof (wint_t)); 636 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset); 637 pstr->valid_len -= offset; 638 pstr->valid_raw_len -= offset; 639 for (low = 0; low < pstr->valid_len; low++) 640 pstr->offsets[low] = pstr->offsets[low + offset] - offset; 641 } 642 else 643 { 644 /* Otherwise, just find out how long the partial multibyte 645 character at offset is and fill it with WEOF/255. */ 646 pstr->len = pstr->raw_len - idx + offset; 647 pstr->stop = pstr->raw_stop - idx + offset; 648 pstr->offsets_needed = 0; 649 while (mid > 0 && pstr->offsets[mid - 1] == offset) 650 --mid; 651 while (mid < pstr->valid_len) 652 if (pstr->wcs[mid] != WEOF) 653 break; 654 else 655 ++mid; 656 if (mid == pstr->valid_len) 657 pstr->valid_len = 0; 658 else 659 { 660 pstr->valid_len = pstr->offsets[mid] - offset; 661 if (pstr->valid_len) 662 { 663 for (low = 0; low < pstr->valid_len; ++low) 664 pstr->wcs[low] = WEOF; 665 memset (pstr->mbs, 255, pstr->valid_len); 666 } 667 } 668 pstr->valid_raw_len = pstr->valid_len; 669 } 670 } 671 else 672 #endif 673 { 674 pstr->tip_context = re_string_context_at (pstr, offset - 1, 675 eflags); 676 #ifdef RE_ENABLE_I18N 677 if (pstr->mb_cur_max > 1) 678 memmove (pstr->wcs, pstr->wcs + offset, 679 (pstr->valid_len - offset) * sizeof (wint_t)); 680 #endif /* RE_ENABLE_I18N */ 681 if (BE (pstr->mbs_allocated, 0)) 682 memmove (pstr->mbs, pstr->mbs + offset, 683 pstr->valid_len - offset); 684 pstr->valid_len -= offset; 685 pstr->valid_raw_len -= offset; 686 #if DEBUG 687 assert (pstr->valid_len > 0); 688 #endif 689 } 690 } 691 else 692 { 693 #ifdef RE_ENABLE_I18N 694 /* No, skip all characters until IDX. */ 695 Idx prev_valid_len = pstr->valid_len; 696 697 if (BE (pstr->offsets_needed, 0)) 698 { 699 pstr->len = pstr->raw_len - idx + offset; 700 pstr->stop = pstr->raw_stop - idx + offset; 701 pstr->offsets_needed = 0; 702 } 703 #endif 704 pstr->valid_len = 0; 705 #ifdef RE_ENABLE_I18N 706 if (pstr->mb_cur_max > 1) 707 { 708 Idx wcs_idx; 709 wint_t wc = WEOF; 710 711 if (pstr->is_utf8) 712 { 713 const unsigned char *raw, *p, *end; 714 715 /* Special case UTF-8. Multi-byte chars start with any 716 byte other than 0x80 - 0xbf. */ 717 raw = pstr->raw_mbs + pstr->raw_mbs_idx; 718 end = raw + (offset - pstr->mb_cur_max); 719 if (end < pstr->raw_mbs) 720 end = pstr->raw_mbs; 721 p = raw + offset - 1; 722 #ifdef _LIBC 723 /* We know the wchar_t encoding is UCS4, so for the simple 724 case, ASCII characters, skip the conversion step. */ 725 if (isascii (*p) && BE (pstr->trans == NULL, 1)) 726 { 727 memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); 728 /* pstr->valid_len = 0; */ 729 wc = (wchar_t) *p; 730 } 731 else 732 #endif 733 for (; p >= end; --p) 734 if ((*p & 0xc0) != 0x80) 735 { 736 mbstate_t cur_state; 737 wchar_t wc2; 738 Idx mlen = raw + pstr->len - p; 739 unsigned char buf[6]; 740 size_t mbclen; 741 742 const unsigned char *pp = p; 743 if (BE (pstr->trans != NULL, 0)) 744 { 745 int i = mlen < 6 ? mlen : 6; 746 while (--i >= 0) 747 buf[i] = pstr->trans[p[i]]; 748 pp = buf; 749 } 750 /* XXX Don't use mbrtowc, we know which conversion 751 to use (UTF-8 -> UCS4). */ 752 memset (&cur_state, 0, sizeof (cur_state)); 753 mbclen = __mbrtowc (&wc2, (const char *) pp, mlen, 754 &cur_state); 755 if (raw + offset - p <= mbclen 756 && mbclen < (size_t) -2) 757 { 758 memset (&pstr->cur_state, '\0', 759 sizeof (mbstate_t)); 760 pstr->valid_len = mbclen - (raw + offset - p); 761 wc = wc2; 762 } 763 break; 764 } 765 } 766 767 if (wc == WEOF) 768 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx; 769 if (wc == WEOF) 770 pstr->tip_context 771 = re_string_context_at (pstr, prev_valid_len - 1, eflags); 772 else 773 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0) 774 && IS_WIDE_WORD_CHAR (wc)) 775 ? CONTEXT_WORD 776 : ((IS_WIDE_NEWLINE (wc) 777 && pstr->newline_anchor) 778 ? CONTEXT_NEWLINE : 0)); 779 if (BE (pstr->valid_len, 0)) 780 { 781 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx) 782 pstr->wcs[wcs_idx] = WEOF; 783 if (pstr->mbs_allocated) 784 memset (pstr->mbs, 255, pstr->valid_len); 785 } 786 pstr->valid_raw_len = pstr->valid_len; 787 } 788 else 789 #endif /* RE_ENABLE_I18N */ 790 { 791 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1]; 792 pstr->valid_raw_len = 0; 793 if (pstr->trans) 794 c = pstr->trans[c]; 795 pstr->tip_context = (bitset_contain (pstr->word_char, c) 796 ? CONTEXT_WORD 797 : ((IS_NEWLINE (c) && pstr->newline_anchor) 798 ? CONTEXT_NEWLINE : 0)); 799 } 800 } 801 if (!BE (pstr->mbs_allocated, 0)) 802 pstr->mbs += offset; 803 } 804 pstr->raw_mbs_idx = idx; 805 pstr->len -= offset; 806 pstr->stop -= offset; 807 808 /* Then build the buffers. */ 809 #ifdef RE_ENABLE_I18N 810 if (pstr->mb_cur_max > 1) 811 { 812 if (pstr->icase) 813 { 814 reg_errcode_t ret = build_wcs_upper_buffer (pstr); 815 if (BE (ret != REG_NOERROR, 0)) 816 return ret; 817 } 818 else 819 build_wcs_buffer (pstr); 820 } 821 else 822 #endif /* RE_ENABLE_I18N */ 823 if (BE (pstr->mbs_allocated, 0)) 824 { 825 if (pstr->icase) 826 build_upper_buffer (pstr); 827 else if (pstr->trans != NULL) 828 re_string_translate_buffer (pstr); 829 } 830 else 831 pstr->valid_len = pstr->len; 832 833 pstr->cur_idx = 0; 834 return REG_NOERROR; 835 } 836 837 static unsigned char 838 internal_function __attribute ((pure)) 839 re_string_peek_byte_case (const re_string_t *pstr, Idx idx) 840 { 841 int ch; 842 Idx off; 843 844 /* Handle the common (easiest) cases first. */ 845 if (BE (!pstr->mbs_allocated, 1)) 846 return re_string_peek_byte (pstr, idx); 847 848 #ifdef RE_ENABLE_I18N 849 if (pstr->mb_cur_max > 1 850 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx)) 851 return re_string_peek_byte (pstr, idx); 852 #endif 853 854 off = pstr->cur_idx + idx; 855 #ifdef RE_ENABLE_I18N 856 if (pstr->offsets_needed) 857 off = pstr->offsets[off]; 858 #endif 859 860 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off]; 861 862 #ifdef RE_ENABLE_I18N 863 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I 864 this function returns CAPITAL LETTER I instead of first byte of 865 DOTLESS SMALL LETTER I. The latter would confuse the parser, 866 since peek_byte_case doesn't advance cur_idx in any way. */ 867 if (pstr->offsets_needed && !isascii (ch)) 868 return re_string_peek_byte (pstr, idx); 869 #endif 870 871 return ch; 872 } 873 874 static unsigned char 875 internal_function 876 re_string_fetch_byte_case (re_string_t *pstr) 877 { 878 if (BE (!pstr->mbs_allocated, 1)) 879 return re_string_fetch_byte (pstr); 880 881 #ifdef RE_ENABLE_I18N 882 if (pstr->offsets_needed) 883 { 884 Idx off; 885 int ch; 886 887 /* For tr_TR.UTF-8 [[:islower:]] there is 888 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip 889 in that case the whole multi-byte character and return 890 the original letter. On the other side, with 891 [[: DOTLESS SMALL LETTER I return [[:I, as doing 892 anything else would complicate things too much. */ 893 894 if (!re_string_first_byte (pstr, pstr->cur_idx)) 895 return re_string_fetch_byte (pstr); 896 897 off = pstr->offsets[pstr->cur_idx]; 898 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off]; 899 900 if (! isascii (ch)) 901 return re_string_fetch_byte (pstr); 902 903 re_string_skip_bytes (pstr, 904 re_string_char_size_at (pstr, pstr->cur_idx)); 905 return ch; 906 } 907 #endif 908 909 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++]; 910 } 911 912 static void 913 internal_function 914 re_string_destruct (re_string_t *pstr) 915 { 916 #ifdef RE_ENABLE_I18N 917 re_free (pstr->wcs); 918 re_free (pstr->offsets); 919 #endif /* RE_ENABLE_I18N */ 920 if (pstr->mbs_allocated) 921 re_free (pstr->mbs); 922 } 923 924 /* Return the context at IDX in INPUT. */ 925 926 static unsigned int 927 internal_function 928 re_string_context_at (const re_string_t *input, Idx idx, int eflags) 929 { 930 int c; 931 if (BE (! REG_VALID_INDEX (idx), 0)) 932 /* In this case, we use the value stored in input->tip_context, 933 since we can't know the character in input->mbs[-1] here. */ 934 return input->tip_context; 935 if (BE (idx == input->len, 0)) 936 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF 937 : CONTEXT_NEWLINE | CONTEXT_ENDBUF); 938 #ifdef RE_ENABLE_I18N 939 if (input->mb_cur_max > 1) 940 { 941 wint_t wc; 942 Idx wc_idx = idx; 943 while(input->wcs[wc_idx] == WEOF) 944 { 945 #ifdef DEBUG 946 /* It must not happen. */ 947 assert (REG_VALID_INDEX (wc_idx)); 948 #endif 949 --wc_idx; 950 if (! REG_VALID_INDEX (wc_idx)) 951 return input->tip_context; 952 } 953 wc = input->wcs[wc_idx]; 954 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc)) 955 return CONTEXT_WORD; 956 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor 957 ? CONTEXT_NEWLINE : 0); 958 } 959 else 960 #endif 961 { 962 c = re_string_byte_at (input, idx); 963 if (bitset_contain (input->word_char, c)) 964 return CONTEXT_WORD; 965 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0; 966 } 967 } 968 969 /* Functions for set operation. */ 970 971 static reg_errcode_t 972 internal_function __attribute_warn_unused_result__ 973 re_node_set_alloc (re_node_set *set, Idx size) 974 { 975 set->alloc = size; 976 set->nelem = 0; 977 set->elems = re_malloc (Idx, size); 978 if (BE (set->elems == NULL, 0)) 979 return REG_ESPACE; 980 return REG_NOERROR; 981 } 982 983 static reg_errcode_t 984 internal_function __attribute_warn_unused_result__ 985 re_node_set_init_1 (re_node_set *set, Idx elem) 986 { 987 set->alloc = 1; 988 set->nelem = 1; 989 set->elems = re_malloc (Idx, 1); 990 if (BE (set->elems == NULL, 0)) 991 { 992 set->alloc = set->nelem = 0; 993 return REG_ESPACE; 994 } 995 set->elems[0] = elem; 996 return REG_NOERROR; 997 } 998 999 static reg_errcode_t 1000 internal_function __attribute_warn_unused_result__ 1001 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2) 1002 { 1003 set->alloc = 2; 1004 set->elems = re_malloc (Idx, 2); 1005 if (BE (set->elems == NULL, 0)) 1006 return REG_ESPACE; 1007 if (elem1 == elem2) 1008 { 1009 set->nelem = 1; 1010 set->elems[0] = elem1; 1011 } 1012 else 1013 { 1014 set->nelem = 2; 1015 if (elem1 < elem2) 1016 { 1017 set->elems[0] = elem1; 1018 set->elems[1] = elem2; 1019 } 1020 else 1021 { 1022 set->elems[0] = elem2; 1023 set->elems[1] = elem1; 1024 } 1025 } 1026 return REG_NOERROR; 1027 } 1028 1029 static reg_errcode_t 1030 internal_function __attribute_warn_unused_result__ 1031 re_node_set_init_copy (re_node_set *dest, const re_node_set *src) 1032 { 1033 dest->nelem = src->nelem; 1034 if (src->nelem > 0) 1035 { 1036 dest->alloc = dest->nelem; 1037 dest->elems = re_malloc (Idx, dest->alloc); 1038 if (BE (dest->elems == NULL, 0)) 1039 { 1040 dest->alloc = dest->nelem = 0; 1041 return REG_ESPACE; 1042 } 1043 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx)); 1044 } 1045 else 1046 re_node_set_init_empty (dest); 1047 return REG_NOERROR; 1048 } 1049 1050 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to 1051 DEST. Return value indicate the error code or REG_NOERROR if succeeded. 1052 Note: We assume dest->elems is NULL, when dest->alloc is 0. */ 1053 1054 static reg_errcode_t 1055 internal_function __attribute_warn_unused_result__ 1056 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1, 1057 const re_node_set *src2) 1058 { 1059 Idx i1, i2, is, id, delta, sbase; 1060 if (src1->nelem == 0 || src2->nelem == 0) 1061 return REG_NOERROR; 1062 1063 /* We need dest->nelem + 2 * elems_in_intersection; this is a 1064 conservative estimate. */ 1065 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc) 1066 { 1067 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc; 1068 Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc); 1069 if (BE (new_elems == NULL, 0)) 1070 return REG_ESPACE; 1071 dest->elems = new_elems; 1072 dest->alloc = new_alloc; 1073 } 1074 1075 /* Find the items in the intersection of SRC1 and SRC2, and copy 1076 into the top of DEST those that are not already in DEST itself. */ 1077 sbase = dest->nelem + src1->nelem + src2->nelem; 1078 i1 = src1->nelem - 1; 1079 i2 = src2->nelem - 1; 1080 id = dest->nelem - 1; 1081 for (;;) 1082 { 1083 if (src1->elems[i1] == src2->elems[i2]) 1084 { 1085 /* Try to find the item in DEST. Maybe we could binary search? */ 1086 while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1]) 1087 --id; 1088 1089 if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1]) 1090 dest->elems[--sbase] = src1->elems[i1]; 1091 1092 if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2)) 1093 break; 1094 } 1095 1096 /* Lower the highest of the two items. */ 1097 else if (src1->elems[i1] < src2->elems[i2]) 1098 { 1099 if (! REG_VALID_INDEX (--i2)) 1100 break; 1101 } 1102 else 1103 { 1104 if (! REG_VALID_INDEX (--i1)) 1105 break; 1106 } 1107 } 1108 1109 id = dest->nelem - 1; 1110 is = dest->nelem + src1->nelem + src2->nelem - 1; 1111 delta = is - sbase + 1; 1112 1113 /* Now copy. When DELTA becomes zero, the remaining 1114 DEST elements are already in place; this is more or 1115 less the same loop that is in re_node_set_merge. */ 1116 dest->nelem += delta; 1117 if (delta > 0 && REG_VALID_INDEX (id)) 1118 for (;;) 1119 { 1120 if (dest->elems[is] > dest->elems[id]) 1121 { 1122 /* Copy from the top. */ 1123 dest->elems[id + delta--] = dest->elems[is--]; 1124 if (delta == 0) 1125 break; 1126 } 1127 else 1128 { 1129 /* Slide from the bottom. */ 1130 dest->elems[id + delta] = dest->elems[id]; 1131 if (! REG_VALID_INDEX (--id)) 1132 break; 1133 } 1134 } 1135 1136 /* Copy remaining SRC elements. */ 1137 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx)); 1138 1139 return REG_NOERROR; 1140 } 1141 1142 /* Calculate the union set of the sets SRC1 and SRC2. And store it to 1143 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */ 1144 1145 static reg_errcode_t 1146 internal_function __attribute_warn_unused_result__ 1147 re_node_set_init_union (re_node_set *dest, const re_node_set *src1, 1148 const re_node_set *src2) 1149 { 1150 Idx i1, i2, id; 1151 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0) 1152 { 1153 dest->alloc = src1->nelem + src2->nelem; 1154 dest->elems = re_malloc (Idx, dest->alloc); 1155 if (BE (dest->elems == NULL, 0)) 1156 return REG_ESPACE; 1157 } 1158 else 1159 { 1160 if (src1 != NULL && src1->nelem > 0) 1161 return re_node_set_init_copy (dest, src1); 1162 else if (src2 != NULL && src2->nelem > 0) 1163 return re_node_set_init_copy (dest, src2); 1164 else 1165 re_node_set_init_empty (dest); 1166 return REG_NOERROR; 1167 } 1168 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;) 1169 { 1170 if (src1->elems[i1] > src2->elems[i2]) 1171 { 1172 dest->elems[id++] = src2->elems[i2++]; 1173 continue; 1174 } 1175 if (src1->elems[i1] == src2->elems[i2]) 1176 ++i2; 1177 dest->elems[id++] = src1->elems[i1++]; 1178 } 1179 if (i1 < src1->nelem) 1180 { 1181 memcpy (dest->elems + id, src1->elems + i1, 1182 (src1->nelem - i1) * sizeof (Idx)); 1183 id += src1->nelem - i1; 1184 } 1185 else if (i2 < src2->nelem) 1186 { 1187 memcpy (dest->elems + id, src2->elems + i2, 1188 (src2->nelem - i2) * sizeof (Idx)); 1189 id += src2->nelem - i2; 1190 } 1191 dest->nelem = id; 1192 return REG_NOERROR; 1193 } 1194 1195 /* Calculate the union set of the sets DEST and SRC. And store it to 1196 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */ 1197 1198 static reg_errcode_t 1199 internal_function __attribute_warn_unused_result__ 1200 re_node_set_merge (re_node_set *dest, const re_node_set *src) 1201 { 1202 Idx is, id, sbase, delta; 1203 if (src == NULL || src->nelem == 0) 1204 return REG_NOERROR; 1205 if (dest->alloc < 2 * src->nelem + dest->nelem) 1206 { 1207 Idx new_alloc = 2 * (src->nelem + dest->alloc); 1208 Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc); 1209 if (BE (new_buffer == NULL, 0)) 1210 return REG_ESPACE; 1211 dest->elems = new_buffer; 1212 dest->alloc = new_alloc; 1213 } 1214 1215 if (BE (dest->nelem == 0, 0)) 1216 { 1217 dest->nelem = src->nelem; 1218 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx)); 1219 return REG_NOERROR; 1220 } 1221 1222 /* Copy into the top of DEST the items of SRC that are not 1223 found in DEST. Maybe we could binary search in DEST? */ 1224 for (sbase = dest->nelem + 2 * src->nelem, 1225 is = src->nelem - 1, id = dest->nelem - 1; 1226 REG_VALID_INDEX (is) && REG_VALID_INDEX (id); ) 1227 { 1228 if (dest->elems[id] == src->elems[is]) 1229 is--, id--; 1230 else if (dest->elems[id] < src->elems[is]) 1231 dest->elems[--sbase] = src->elems[is--]; 1232 else /* if (dest->elems[id] > src->elems[is]) */ 1233 --id; 1234 } 1235 1236 if (REG_VALID_INDEX (is)) 1237 { 1238 /* If DEST is exhausted, the remaining items of SRC must be unique. */ 1239 sbase -= is + 1; 1240 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx)); 1241 } 1242 1243 id = dest->nelem - 1; 1244 is = dest->nelem + 2 * src->nelem - 1; 1245 delta = is - sbase + 1; 1246 if (delta == 0) 1247 return REG_NOERROR; 1248 1249 /* Now copy. When DELTA becomes zero, the remaining 1250 DEST elements are already in place. */ 1251 dest->nelem += delta; 1252 for (;;) 1253 { 1254 if (dest->elems[is] > dest->elems[id]) 1255 { 1256 /* Copy from the top. */ 1257 dest->elems[id + delta--] = dest->elems[is--]; 1258 if (delta == 0) 1259 break; 1260 } 1261 else 1262 { 1263 /* Slide from the bottom. */ 1264 dest->elems[id + delta] = dest->elems[id]; 1265 if (! REG_VALID_INDEX (--id)) 1266 { 1267 /* Copy remaining SRC elements. */ 1268 memcpy (dest->elems, dest->elems + sbase, 1269 delta * sizeof (Idx)); 1270 break; 1271 } 1272 } 1273 } 1274 1275 return REG_NOERROR; 1276 } 1277 1278 /* Insert the new element ELEM to the re_node_set* SET. 1279 SET should not already have ELEM. 1280 Return true if successful. */ 1281 1282 static bool 1283 internal_function __attribute_warn_unused_result__ 1284 re_node_set_insert (re_node_set *set, Idx elem) 1285 { 1286 Idx idx; 1287 /* In case the set is empty. */ 1288 if (set->alloc == 0) 1289 return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1); 1290 1291 if (BE (set->nelem, 0) == 0) 1292 { 1293 /* We already guaranteed above that set->alloc != 0. */ 1294 set->elems[0] = elem; 1295 ++set->nelem; 1296 return true; 1297 } 1298 1299 /* Realloc if we need. */ 1300 if (set->alloc == set->nelem) 1301 { 1302 Idx *new_elems; 1303 set->alloc = set->alloc * 2; 1304 new_elems = re_realloc (set->elems, Idx, set->alloc); 1305 if (BE (new_elems == NULL, 0)) 1306 return false; 1307 set->elems = new_elems; 1308 } 1309 1310 /* Move the elements which follows the new element. Test the 1311 first element separately to skip a check in the inner loop. */ 1312 if (elem < set->elems[0]) 1313 { 1314 idx = 0; 1315 for (idx = set->nelem; idx > 0; idx--) 1316 set->elems[idx] = set->elems[idx - 1]; 1317 } 1318 else 1319 { 1320 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--) 1321 set->elems[idx] = set->elems[idx - 1]; 1322 } 1323 1324 /* Insert the new element. */ 1325 set->elems[idx] = elem; 1326 ++set->nelem; 1327 return true; 1328 } 1329 1330 /* Insert the new element ELEM to the re_node_set* SET. 1331 SET should not already have any element greater than or equal to ELEM. 1332 Return true if successful. */ 1333 1334 static bool 1335 internal_function __attribute_warn_unused_result__ 1336 re_node_set_insert_last (re_node_set *set, Idx elem) 1337 { 1338 /* Realloc if we need. */ 1339 if (set->alloc == set->nelem) 1340 { 1341 Idx *new_elems; 1342 set->alloc = (set->alloc + 1) * 2; 1343 new_elems = re_realloc (set->elems, Idx, set->alloc); 1344 if (BE (new_elems == NULL, 0)) 1345 return false; 1346 set->elems = new_elems; 1347 } 1348 1349 /* Insert the new element. */ 1350 set->elems[set->nelem++] = elem; 1351 return true; 1352 } 1353 1354 /* Compare two node sets SET1 and SET2. 1355 Return true if SET1 and SET2 are equivalent. */ 1356 1357 static bool 1358 internal_function __attribute ((pure)) 1359 re_node_set_compare (const re_node_set *set1, const re_node_set *set2) 1360 { 1361 Idx i; 1362 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem) 1363 return false; 1364 for (i = set1->nelem ; REG_VALID_INDEX (--i) ; ) 1365 if (set1->elems[i] != set2->elems[i]) 1366 return false; 1367 return true; 1368 } 1369 1370 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */ 1371 1372 static Idx 1373 internal_function __attribute ((pure)) 1374 re_node_set_contains (const re_node_set *set, Idx elem) 1375 { 1376 __re_size_t idx, right, mid; 1377 if (! REG_VALID_NONZERO_INDEX (set->nelem)) 1378 return 0; 1379 1380 /* Binary search the element. */ 1381 idx = 0; 1382 right = set->nelem - 1; 1383 while (idx < right) 1384 { 1385 mid = (idx + right) / 2; 1386 if (set->elems[mid] < elem) 1387 idx = mid + 1; 1388 else 1389 right = mid; 1390 } 1391 return set->elems[idx] == elem ? idx + 1 : 0; 1392 } 1393 1394 static void 1395 internal_function 1396 re_node_set_remove_at (re_node_set *set, Idx idx) 1397 { 1398 verify (! TYPE_SIGNED (Idx)); 1399 /* if (idx < 0) 1400 return; */ 1401 if (idx >= set->nelem) 1402 return; 1403 --set->nelem; 1404 for (; idx < set->nelem; idx++) 1405 set->elems[idx] = set->elems[idx + 1]; 1406 } 1407 1408 1409 /* Add the token TOKEN to dfa->nodes, and return the index of the token. 1410 Or return REG_MISSING if an error occurred. */ 1411 1412 static Idx 1413 internal_function 1414 re_dfa_add_node (re_dfa_t *dfa, re_token_t token) 1415 { 1416 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0)) 1417 { 1418 size_t new_nodes_alloc = dfa->nodes_alloc * 2; 1419 Idx *new_nexts, *new_indices; 1420 re_node_set *new_edests, *new_eclosures; 1421 re_token_t *new_nodes; 1422 1423 /* Avoid overflows in realloc. */ 1424 const size_t max_object_size = MAX (sizeof (re_token_t), 1425 MAX (sizeof (re_node_set), 1426 sizeof (Idx))); 1427 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_nodes_alloc, 0)) 1428 return REG_MISSING; 1429 1430 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc); 1431 if (BE (new_nodes == NULL, 0)) 1432 return REG_MISSING; 1433 dfa->nodes = new_nodes; 1434 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc); 1435 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc); 1436 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc); 1437 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc); 1438 if (BE (new_nexts == NULL || new_indices == NULL 1439 || new_edests == NULL || new_eclosures == NULL, 0)) 1440 return REG_MISSING; 1441 dfa->nexts = new_nexts; 1442 dfa->org_indices = new_indices; 1443 dfa->edests = new_edests; 1444 dfa->eclosures = new_eclosures; 1445 dfa->nodes_alloc = new_nodes_alloc; 1446 } 1447 dfa->nodes[dfa->nodes_len] = token; 1448 dfa->nodes[dfa->nodes_len].constraint = 0; 1449 #ifdef RE_ENABLE_I18N 1450 { 1451 int type = token.type; 1452 dfa->nodes[dfa->nodes_len].accept_mb = 1453 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET; 1454 } 1455 #endif 1456 dfa->nexts[dfa->nodes_len] = REG_MISSING; 1457 re_node_set_init_empty (dfa->edests + dfa->nodes_len); 1458 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len); 1459 return dfa->nodes_len++; 1460 } 1461 1462 static inline re_hashval_t 1463 internal_function 1464 calc_state_hash (const re_node_set *nodes, unsigned int context) 1465 { 1466 re_hashval_t hash = nodes->nelem + context; 1467 Idx i; 1468 for (i = 0 ; i < nodes->nelem ; i++) 1469 hash += nodes->elems[i]; 1470 return hash; 1471 } 1472 1473 /* Search for the state whose node_set is equivalent to NODES. 1474 Return the pointer to the state, if we found it in the DFA. 1475 Otherwise create the new one and return it. In case of an error 1476 return NULL and set the error code in ERR. 1477 Note: - We assume NULL as the invalid state, then it is possible that 1478 return value is NULL and ERR is REG_NOERROR. 1479 - We never return non-NULL value in case of any errors, it is for 1480 optimization. */ 1481 1482 static re_dfastate_t * 1483 internal_function __attribute_warn_unused_result__ 1484 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa, 1485 const re_node_set *nodes) 1486 { 1487 re_hashval_t hash; 1488 re_dfastate_t *new_state; 1489 struct re_state_table_entry *spot; 1490 Idx i; 1491 #ifdef lint 1492 /* Suppress bogus uninitialized-variable warnings. */ 1493 *err = REG_NOERROR; 1494 #endif 1495 if (BE (nodes->nelem == 0, 0)) 1496 { 1497 *err = REG_NOERROR; 1498 return NULL; 1499 } 1500 hash = calc_state_hash (nodes, 0); 1501 spot = dfa->state_table + (hash & dfa->state_hash_mask); 1502 1503 for (i = 0 ; i < spot->num ; i++) 1504 { 1505 re_dfastate_t *state = spot->array[i]; 1506 if (hash != state->hash) 1507 continue; 1508 if (re_node_set_compare (&state->nodes, nodes)) 1509 return state; 1510 } 1511 1512 /* There are no appropriate state in the dfa, create the new one. */ 1513 new_state = create_ci_newstate (dfa, nodes, hash); 1514 if (BE (new_state == NULL, 0)) 1515 *err = REG_ESPACE; 1516 1517 return new_state; 1518 } 1519 1520 /* Search for the state whose node_set is equivalent to NODES and 1521 whose context is equivalent to CONTEXT. 1522 Return the pointer to the state, if we found it in the DFA. 1523 Otherwise create the new one and return it. In case of an error 1524 return NULL and set the error code in ERR. 1525 Note: - We assume NULL as the invalid state, then it is possible that 1526 return value is NULL and ERR is REG_NOERROR. 1527 - We never return non-NULL value in case of any errors, it is for 1528 optimization. */ 1529 1530 static re_dfastate_t * 1531 internal_function __attribute_warn_unused_result__ 1532 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa, 1533 const re_node_set *nodes, unsigned int context) 1534 { 1535 re_hashval_t hash; 1536 re_dfastate_t *new_state; 1537 struct re_state_table_entry *spot; 1538 Idx i; 1539 #ifdef lint 1540 /* Suppress bogus uninitialized-variable warnings. */ 1541 *err = REG_NOERROR; 1542 #endif 1543 if (nodes->nelem == 0) 1544 { 1545 *err = REG_NOERROR; 1546 return NULL; 1547 } 1548 hash = calc_state_hash (nodes, context); 1549 spot = dfa->state_table + (hash & dfa->state_hash_mask); 1550 1551 for (i = 0 ; i < spot->num ; i++) 1552 { 1553 re_dfastate_t *state = spot->array[i]; 1554 if (state->hash == hash 1555 && state->context == context 1556 && re_node_set_compare (state->entrance_nodes, nodes)) 1557 return state; 1558 } 1559 /* There are no appropriate state in 'dfa', create the new one. */ 1560 new_state = create_cd_newstate (dfa, nodes, context, hash); 1561 if (BE (new_state == NULL, 0)) 1562 *err = REG_ESPACE; 1563 1564 return new_state; 1565 } 1566 1567 /* Finish initialization of the new state NEWSTATE, and using its hash value 1568 HASH put in the appropriate bucket of DFA's state table. Return value 1569 indicates the error code if failed. */ 1570 1571 static reg_errcode_t 1572 __attribute_warn_unused_result__ 1573 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate, 1574 re_hashval_t hash) 1575 { 1576 struct re_state_table_entry *spot; 1577 reg_errcode_t err; 1578 Idx i; 1579 1580 newstate->hash = hash; 1581 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem); 1582 if (BE (err != REG_NOERROR, 0)) 1583 return REG_ESPACE; 1584 for (i = 0; i < newstate->nodes.nelem; i++) 1585 { 1586 Idx elem = newstate->nodes.elems[i]; 1587 if (!IS_EPSILON_NODE (dfa->nodes[elem].type)) 1588 if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem)) 1589 return REG_ESPACE; 1590 } 1591 1592 spot = dfa->state_table + (hash & dfa->state_hash_mask); 1593 if (BE (spot->alloc <= spot->num, 0)) 1594 { 1595 Idx new_alloc = 2 * spot->num + 2; 1596 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *, 1597 new_alloc); 1598 if (BE (new_array == NULL, 0)) 1599 return REG_ESPACE; 1600 spot->array = new_array; 1601 spot->alloc = new_alloc; 1602 } 1603 spot->array[spot->num++] = newstate; 1604 return REG_NOERROR; 1605 } 1606 1607 static void 1608 free_state (re_dfastate_t *state) 1609 { 1610 re_node_set_free (&state->non_eps_nodes); 1611 re_node_set_free (&state->inveclosure); 1612 if (state->entrance_nodes != &state->nodes) 1613 { 1614 re_node_set_free (state->entrance_nodes); 1615 re_free (state->entrance_nodes); 1616 } 1617 re_node_set_free (&state->nodes); 1618 re_free (state->word_trtable); 1619 re_free (state->trtable); 1620 re_free (state); 1621 } 1622 1623 /* Create the new state which is independent of contexts. 1624 Return the new state if succeeded, otherwise return NULL. */ 1625 1626 static re_dfastate_t * 1627 internal_function __attribute_warn_unused_result__ 1628 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes, 1629 re_hashval_t hash) 1630 { 1631 Idx i; 1632 reg_errcode_t err; 1633 re_dfastate_t *newstate; 1634 1635 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1); 1636 if (BE (newstate == NULL, 0)) 1637 return NULL; 1638 err = re_node_set_init_copy (&newstate->nodes, nodes); 1639 if (BE (err != REG_NOERROR, 0)) 1640 { 1641 re_free (newstate); 1642 return NULL; 1643 } 1644 1645 newstate->entrance_nodes = &newstate->nodes; 1646 for (i = 0 ; i < nodes->nelem ; i++) 1647 { 1648 re_token_t *node = dfa->nodes + nodes->elems[i]; 1649 re_token_type_t type = node->type; 1650 if (type == CHARACTER && !node->constraint) 1651 continue; 1652 #ifdef RE_ENABLE_I18N 1653 newstate->accept_mb |= node->accept_mb; 1654 #endif /* RE_ENABLE_I18N */ 1655 1656 /* If the state has the halt node, the state is a halt state. */ 1657 if (type == END_OF_RE) 1658 newstate->halt = 1; 1659 else if (type == OP_BACK_REF) 1660 newstate->has_backref = 1; 1661 else if (type == ANCHOR || node->constraint) 1662 newstate->has_constraint = 1; 1663 } 1664 err = register_state (dfa, newstate, hash); 1665 if (BE (err != REG_NOERROR, 0)) 1666 { 1667 free_state (newstate); 1668 newstate = NULL; 1669 } 1670 return newstate; 1671 } 1672 1673 /* Create the new state which is depend on the context CONTEXT. 1674 Return the new state if succeeded, otherwise return NULL. */ 1675 1676 static re_dfastate_t * 1677 internal_function __attribute_warn_unused_result__ 1678 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes, 1679 unsigned int context, re_hashval_t hash) 1680 { 1681 Idx i, nctx_nodes = 0; 1682 reg_errcode_t err; 1683 re_dfastate_t *newstate; 1684 1685 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1); 1686 if (BE (newstate == NULL, 0)) 1687 return NULL; 1688 err = re_node_set_init_copy (&newstate->nodes, nodes); 1689 if (BE (err != REG_NOERROR, 0)) 1690 { 1691 re_free (newstate); 1692 return NULL; 1693 } 1694 1695 newstate->context = context; 1696 newstate->entrance_nodes = &newstate->nodes; 1697 1698 for (i = 0 ; i < nodes->nelem ; i++) 1699 { 1700 re_token_t *node = dfa->nodes + nodes->elems[i]; 1701 re_token_type_t type = node->type; 1702 unsigned int constraint = node->constraint; 1703 1704 if (type == CHARACTER && !constraint) 1705 continue; 1706 #ifdef RE_ENABLE_I18N 1707 newstate->accept_mb |= node->accept_mb; 1708 #endif /* RE_ENABLE_I18N */ 1709 1710 /* If the state has the halt node, the state is a halt state. */ 1711 if (type == END_OF_RE) 1712 newstate->halt = 1; 1713 else if (type == OP_BACK_REF) 1714 newstate->has_backref = 1; 1715 1716 if (constraint) 1717 { 1718 if (newstate->entrance_nodes == &newstate->nodes) 1719 { 1720 newstate->entrance_nodes = re_malloc (re_node_set, 1); 1721 if (BE (newstate->entrance_nodes == NULL, 0)) 1722 { 1723 free_state (newstate); 1724 return NULL; 1725 } 1726 if (re_node_set_init_copy (newstate->entrance_nodes, nodes) 1727 != REG_NOERROR) 1728 return NULL; 1729 nctx_nodes = 0; 1730 newstate->has_constraint = 1; 1731 } 1732 1733 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context)) 1734 { 1735 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes); 1736 ++nctx_nodes; 1737 } 1738 } 1739 } 1740 err = register_state (dfa, newstate, hash); 1741 if (BE (err != REG_NOERROR, 0)) 1742 { 1743 free_state (newstate); 1744 newstate = NULL; 1745 } 1746 return newstate; 1747 } 1748