1 /* -*- buffer-read-only: t -*- vi: set ro: */ 2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */ 3 /* Extended regular expression matching and search library. 4 Copyright (C) 2002-2011 Free Software Foundation, Inc. 5 This file is part of the GNU C Library. 6 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 7 8 This program is free software; you can redistribute it and/or modify 9 it under the terms of the GNU General Public License as published by 10 the Free Software Foundation; either version 3, or (at your option) 11 any later version. 12 13 This program is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public License along 19 with this program; if not, write to the Free Software Foundation, 20 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 21 22 #include "verify.h" 23 #include "intprops.h" 24 static void re_string_construct_common (const char *str, Idx len, 25 re_string_t *pstr, 26 RE_TRANSLATE_TYPE trans, bool icase, 27 const re_dfa_t *dfa) internal_function; 28 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa, 29 const re_node_set *nodes, 30 re_hashval_t hash) internal_function; 31 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa, 32 const re_node_set *nodes, 33 unsigned int context, 34 re_hashval_t hash) internal_function; 35 36 /* Functions for string operation. */ 37 38 /* This function allocate the buffers. It is necessary to call 39 re_string_reconstruct before using the object. */ 40 41 static reg_errcode_t 42 internal_function __attribute_warn_unused_result__ 43 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len, 44 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa) 45 { 46 reg_errcode_t ret; 47 Idx init_buf_len; 48 49 /* Ensure at least one character fits into the buffers. */ 50 if (init_len < dfa->mb_cur_max) 51 init_len = dfa->mb_cur_max; 52 init_buf_len = (len + 1 < init_len) ? len + 1: init_len; 53 re_string_construct_common (str, len, pstr, trans, icase, dfa); 54 55 ret = re_string_realloc_buffers (pstr, init_buf_len); 56 if (BE (ret != REG_NOERROR, 0)) 57 return ret; 58 59 pstr->word_char = dfa->word_char; 60 pstr->word_ops_used = dfa->word_ops_used; 61 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str; 62 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len; 63 pstr->valid_raw_len = pstr->valid_len; 64 return REG_NOERROR; 65 } 66 67 /* This function allocate the buffers, and initialize them. */ 68 69 static reg_errcode_t 70 internal_function __attribute_warn_unused_result__ 71 re_string_construct (re_string_t *pstr, const char *str, Idx len, 72 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa) 73 { 74 reg_errcode_t ret; 75 memset (pstr, '\0', sizeof (re_string_t)); 76 re_string_construct_common (str, len, pstr, trans, icase, dfa); 77 78 if (len > 0) 79 { 80 ret = re_string_realloc_buffers (pstr, len + 1); 81 if (BE (ret != REG_NOERROR, 0)) 82 return ret; 83 } 84 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str; 85 86 if (icase) 87 { 88 #ifdef RE_ENABLE_I18N 89 if (dfa->mb_cur_max > 1) 90 { 91 while (1) 92 { 93 ret = build_wcs_upper_buffer (pstr); 94 if (BE (ret != REG_NOERROR, 0)) 95 return ret; 96 if (pstr->valid_raw_len >= len) 97 break; 98 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max) 99 break; 100 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2); 101 if (BE (ret != REG_NOERROR, 0)) 102 return ret; 103 } 104 } 105 else 106 #endif /* RE_ENABLE_I18N */ 107 build_upper_buffer (pstr); 108 } 109 else 110 { 111 #ifdef RE_ENABLE_I18N 112 if (dfa->mb_cur_max > 1) 113 build_wcs_buffer (pstr); 114 else 115 #endif /* RE_ENABLE_I18N */ 116 { 117 if (trans != NULL) 118 re_string_translate_buffer (pstr); 119 else 120 { 121 pstr->valid_len = pstr->bufs_len; 122 pstr->valid_raw_len = pstr->bufs_len; 123 } 124 } 125 } 126 127 return REG_NOERROR; 128 } 129 130 /* Helper functions for re_string_allocate, and re_string_construct. */ 131 132 static reg_errcode_t 133 internal_function __attribute_warn_unused_result__ 134 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len) 135 { 136 #ifdef RE_ENABLE_I18N 137 if (pstr->mb_cur_max > 1) 138 { 139 wint_t *new_wcs; 140 141 /* Avoid overflow. */ 142 size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx)); 143 if (BE (SIZE_MAX / max_object_size < new_buf_len, 0)) 144 return REG_ESPACE; 145 146 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len); 147 if (BE (new_wcs == NULL, 0)) 148 return REG_ESPACE; 149 pstr->wcs = new_wcs; 150 if (pstr->offsets != NULL) 151 { 152 Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len); 153 if (BE (new_offsets == NULL, 0)) 154 return REG_ESPACE; 155 pstr->offsets = new_offsets; 156 } 157 } 158 #endif /* RE_ENABLE_I18N */ 159 if (pstr->mbs_allocated) 160 { 161 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char, 162 new_buf_len); 163 if (BE (new_mbs == NULL, 0)) 164 return REG_ESPACE; 165 pstr->mbs = new_mbs; 166 } 167 pstr->bufs_len = new_buf_len; 168 return REG_NOERROR; 169 } 170 171 172 static void 173 internal_function 174 re_string_construct_common (const char *str, Idx len, re_string_t *pstr, 175 RE_TRANSLATE_TYPE trans, bool icase, 176 const re_dfa_t *dfa) 177 { 178 pstr->raw_mbs = (const unsigned char *) str; 179 pstr->len = len; 180 pstr->raw_len = len; 181 pstr->trans = trans; 182 pstr->icase = icase; 183 pstr->mbs_allocated = (trans != NULL || icase); 184 pstr->mb_cur_max = dfa->mb_cur_max; 185 pstr->is_utf8 = dfa->is_utf8; 186 pstr->map_notascii = dfa->map_notascii; 187 pstr->stop = pstr->len; 188 pstr->raw_stop = pstr->stop; 189 } 190 191 #ifdef RE_ENABLE_I18N 192 193 /* Build wide character buffer PSTR->WCS. 194 If the byte sequence of the string are: 195 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3> 196 Then wide character buffer will be: 197 <wc1> , WEOF , <wc2> , WEOF , <wc3> 198 We use WEOF for padding, they indicate that the position isn't 199 a first byte of a multibyte character. 200 201 Note that this function assumes PSTR->VALID_LEN elements are already 202 built and starts from PSTR->VALID_LEN. */ 203 204 static void 205 internal_function 206 build_wcs_buffer (re_string_t *pstr) 207 { 208 #ifdef _LIBC 209 unsigned char buf[MB_LEN_MAX]; 210 assert (MB_LEN_MAX >= pstr->mb_cur_max); 211 #else 212 unsigned char buf[64]; 213 #endif 214 mbstate_t prev_st; 215 Idx byte_idx, end_idx, remain_len; 216 size_t mbclen; 217 218 /* Build the buffers from pstr->valid_len to either pstr->len or 219 pstr->bufs_len. */ 220 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 221 for (byte_idx = pstr->valid_len; byte_idx < end_idx;) 222 { 223 wchar_t wc; 224 const char *p; 225 226 remain_len = end_idx - byte_idx; 227 prev_st = pstr->cur_state; 228 /* Apply the translation if we need. */ 229 if (BE (pstr->trans != NULL, 0)) 230 { 231 int i, ch; 232 233 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i) 234 { 235 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i]; 236 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch]; 237 } 238 p = (const char *) buf; 239 } 240 else 241 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx; 242 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state); 243 if (BE (mbclen == (size_t) -2, 0)) 244 { 245 /* The buffer doesn't have enough space, finish to build. */ 246 pstr->cur_state = prev_st; 247 break; 248 } 249 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0)) 250 { 251 /* We treat these cases as a singlebyte character. */ 252 mbclen = 1; 253 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; 254 if (BE (pstr->trans != NULL, 0)) 255 wc = pstr->trans[wc]; 256 pstr->cur_state = prev_st; 257 } 258 259 /* Write wide character and padding. */ 260 pstr->wcs[byte_idx++] = wc; 261 /* Write paddings. */ 262 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 263 pstr->wcs[byte_idx++] = WEOF; 264 } 265 pstr->valid_len = byte_idx; 266 pstr->valid_raw_len = byte_idx; 267 } 268 269 /* Build wide character buffer PSTR->WCS like build_wcs_buffer, 270 but for REG_ICASE. */ 271 272 static reg_errcode_t 273 internal_function __attribute_warn_unused_result__ 274 build_wcs_upper_buffer (re_string_t *pstr) 275 { 276 mbstate_t prev_st; 277 Idx src_idx, byte_idx, end_idx, remain_len; 278 size_t mbclen; 279 #ifdef _LIBC 280 char buf[MB_LEN_MAX]; 281 assert (MB_LEN_MAX >= pstr->mb_cur_max); 282 #else 283 char buf[64]; 284 #endif 285 286 byte_idx = pstr->valid_len; 287 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 288 289 /* The following optimization assumes that ASCII characters can be 290 mapped to wide characters with a simple cast. */ 291 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed) 292 { 293 while (byte_idx < end_idx) 294 { 295 wchar_t wc; 296 297 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]) 298 && mbsinit (&pstr->cur_state)) 299 { 300 /* In case of a singlebyte character. */ 301 pstr->mbs[byte_idx] 302 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]); 303 /* The next step uses the assumption that wchar_t is encoded 304 ASCII-safe: all ASCII values can be converted like this. */ 305 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx]; 306 ++byte_idx; 307 continue; 308 } 309 310 remain_len = end_idx - byte_idx; 311 prev_st = pstr->cur_state; 312 mbclen = __mbrtowc (&wc, 313 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx 314 + byte_idx), remain_len, &pstr->cur_state); 315 if (BE (mbclen < (size_t) -2, 1)) 316 { 317 wchar_t wcu = wc; 318 if (iswlower (wc)) 319 { 320 size_t mbcdlen; 321 322 wcu = towupper (wc); 323 mbcdlen = wcrtomb (buf, wcu, &prev_st); 324 if (BE (mbclen == mbcdlen, 1)) 325 memcpy (pstr->mbs + byte_idx, buf, mbclen); 326 else 327 { 328 src_idx = byte_idx; 329 goto offsets_needed; 330 } 331 } 332 else 333 memcpy (pstr->mbs + byte_idx, 334 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen); 335 pstr->wcs[byte_idx++] = wcu; 336 /* Write paddings. */ 337 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 338 pstr->wcs[byte_idx++] = WEOF; 339 } 340 else if (mbclen == (size_t) -1 || mbclen == 0) 341 { 342 /* It is an invalid character or '\0'. Just use the byte. */ 343 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; 344 pstr->mbs[byte_idx] = ch; 345 /* And also cast it to wide char. */ 346 pstr->wcs[byte_idx++] = (wchar_t) ch; 347 if (BE (mbclen == (size_t) -1, 0)) 348 pstr->cur_state = prev_st; 349 } 350 else 351 { 352 /* The buffer doesn't have enough space, finish to build. */ 353 pstr->cur_state = prev_st; 354 break; 355 } 356 } 357 pstr->valid_len = byte_idx; 358 pstr->valid_raw_len = byte_idx; 359 return REG_NOERROR; 360 } 361 else 362 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;) 363 { 364 wchar_t wc; 365 const char *p; 366 offsets_needed: 367 remain_len = end_idx - byte_idx; 368 prev_st = pstr->cur_state; 369 if (BE (pstr->trans != NULL, 0)) 370 { 371 int i, ch; 372 373 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i) 374 { 375 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i]; 376 buf[i] = pstr->trans[ch]; 377 } 378 p = (const char *) buf; 379 } 380 else 381 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx; 382 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state); 383 if (BE (mbclen < (size_t) -2, 1)) 384 { 385 wchar_t wcu = wc; 386 if (iswlower (wc)) 387 { 388 size_t mbcdlen; 389 390 wcu = towupper (wc); 391 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st); 392 if (BE (mbclen == mbcdlen, 1)) 393 memcpy (pstr->mbs + byte_idx, buf, mbclen); 394 else if (mbcdlen != (size_t) -1) 395 { 396 size_t i; 397 398 if (byte_idx + mbcdlen > pstr->bufs_len) 399 { 400 pstr->cur_state = prev_st; 401 break; 402 } 403 404 if (pstr->offsets == NULL) 405 { 406 pstr->offsets = re_malloc (Idx, pstr->bufs_len); 407 408 if (pstr->offsets == NULL) 409 return REG_ESPACE; 410 } 411 if (!pstr->offsets_needed) 412 { 413 for (i = 0; i < (size_t) byte_idx; ++i) 414 pstr->offsets[i] = i; 415 pstr->offsets_needed = 1; 416 } 417 418 memcpy (pstr->mbs + byte_idx, buf, mbcdlen); 419 pstr->wcs[byte_idx] = wcu; 420 pstr->offsets[byte_idx] = src_idx; 421 for (i = 1; i < mbcdlen; ++i) 422 { 423 pstr->offsets[byte_idx + i] 424 = src_idx + (i < mbclen ? i : mbclen - 1); 425 pstr->wcs[byte_idx + i] = WEOF; 426 } 427 pstr->len += mbcdlen - mbclen; 428 if (pstr->raw_stop > src_idx) 429 pstr->stop += mbcdlen - mbclen; 430 end_idx = (pstr->bufs_len > pstr->len) 431 ? pstr->len : pstr->bufs_len; 432 byte_idx += mbcdlen; 433 src_idx += mbclen; 434 continue; 435 } 436 else 437 memcpy (pstr->mbs + byte_idx, p, mbclen); 438 } 439 else 440 memcpy (pstr->mbs + byte_idx, p, mbclen); 441 442 if (BE (pstr->offsets_needed != 0, 0)) 443 { 444 size_t i; 445 for (i = 0; i < mbclen; ++i) 446 pstr->offsets[byte_idx + i] = src_idx + i; 447 } 448 src_idx += mbclen; 449 450 pstr->wcs[byte_idx++] = wcu; 451 /* Write paddings. */ 452 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 453 pstr->wcs[byte_idx++] = WEOF; 454 } 455 else if (mbclen == (size_t) -1 || mbclen == 0) 456 { 457 /* It is an invalid character or '\0'. Just use the byte. */ 458 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx]; 459 460 if (BE (pstr->trans != NULL, 0)) 461 ch = pstr->trans [ch]; 462 pstr->mbs[byte_idx] = ch; 463 464 if (BE (pstr->offsets_needed != 0, 0)) 465 pstr->offsets[byte_idx] = src_idx; 466 ++src_idx; 467 468 /* And also cast it to wide char. */ 469 pstr->wcs[byte_idx++] = (wchar_t) ch; 470 if (BE (mbclen == (size_t) -1, 0)) 471 pstr->cur_state = prev_st; 472 } 473 else 474 { 475 /* The buffer doesn't have enough space, finish to build. */ 476 pstr->cur_state = prev_st; 477 break; 478 } 479 } 480 pstr->valid_len = byte_idx; 481 pstr->valid_raw_len = src_idx; 482 return REG_NOERROR; 483 } 484 485 /* Skip characters until the index becomes greater than NEW_RAW_IDX. 486 Return the index. */ 487 488 static Idx 489 internal_function 490 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc) 491 { 492 mbstate_t prev_st; 493 Idx rawbuf_idx; 494 size_t mbclen; 495 wint_t wc = WEOF; 496 497 /* Skip the characters which are not necessary to check. */ 498 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len; 499 rawbuf_idx < new_raw_idx;) 500 { 501 wchar_t wc2; 502 Idx remain_len; 503 remain_len = pstr->len - rawbuf_idx; 504 prev_st = pstr->cur_state; 505 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx, 506 remain_len, &pstr->cur_state); 507 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0)) 508 { 509 /* We treat these cases as a single byte character. */ 510 if (mbclen == 0 || remain_len == 0) 511 wc = L'\0'; 512 else 513 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx); 514 mbclen = 1; 515 pstr->cur_state = prev_st; 516 } 517 else 518 wc = wc2; 519 /* Then proceed the next character. */ 520 rawbuf_idx += mbclen; 521 } 522 *last_wc = wc; 523 return rawbuf_idx; 524 } 525 #endif /* RE_ENABLE_I18N */ 526 527 /* Build the buffer PSTR->MBS, and apply the translation if we need. 528 This function is used in case of REG_ICASE. */ 529 530 static void 531 internal_function 532 build_upper_buffer (re_string_t *pstr) 533 { 534 Idx char_idx, end_idx; 535 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 536 537 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx) 538 { 539 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx]; 540 if (BE (pstr->trans != NULL, 0)) 541 ch = pstr->trans[ch]; 542 if (islower (ch)) 543 pstr->mbs[char_idx] = toupper (ch); 544 else 545 pstr->mbs[char_idx] = ch; 546 } 547 pstr->valid_len = char_idx; 548 pstr->valid_raw_len = char_idx; 549 } 550 551 /* Apply TRANS to the buffer in PSTR. */ 552 553 static void 554 internal_function 555 re_string_translate_buffer (re_string_t *pstr) 556 { 557 Idx buf_idx, end_idx; 558 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 559 560 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx) 561 { 562 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx]; 563 pstr->mbs[buf_idx] = pstr->trans[ch]; 564 } 565 566 pstr->valid_len = buf_idx; 567 pstr->valid_raw_len = buf_idx; 568 } 569 570 /* This function re-construct the buffers. 571 Concretely, convert to wide character in case of pstr->mb_cur_max > 1, 572 convert to upper case in case of REG_ICASE, apply translation. */ 573 574 static reg_errcode_t 575 internal_function __attribute_warn_unused_result__ 576 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags) 577 { 578 Idx offset; 579 580 if (BE (pstr->raw_mbs_idx <= idx, 0)) 581 offset = idx - pstr->raw_mbs_idx; 582 else 583 { 584 /* Reset buffer. */ 585 #ifdef RE_ENABLE_I18N 586 if (pstr->mb_cur_max > 1) 587 memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); 588 #endif /* RE_ENABLE_I18N */ 589 pstr->len = pstr->raw_len; 590 pstr->stop = pstr->raw_stop; 591 pstr->valid_len = 0; 592 pstr->raw_mbs_idx = 0; 593 pstr->valid_raw_len = 0; 594 pstr->offsets_needed = 0; 595 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF 596 : CONTEXT_NEWLINE | CONTEXT_BEGBUF); 597 if (!pstr->mbs_allocated) 598 pstr->mbs = (unsigned char *) pstr->raw_mbs; 599 offset = idx; 600 } 601 602 if (BE (offset != 0, 1)) 603 { 604 /* Should the already checked characters be kept? */ 605 if (BE (offset < pstr->valid_raw_len, 1)) 606 { 607 /* Yes, move them to the front of the buffer. */ 608 #ifdef RE_ENABLE_I18N 609 if (BE (pstr->offsets_needed, 0)) 610 { 611 Idx low = 0, high = pstr->valid_len, mid; 612 do 613 { 614 mid = (high + low) / 2; 615 if (pstr->offsets[mid] > offset) 616 high = mid; 617 else if (pstr->offsets[mid] < offset) 618 low = mid + 1; 619 else 620 break; 621 } 622 while (low < high); 623 if (pstr->offsets[mid] < offset) 624 ++mid; 625 pstr->tip_context = re_string_context_at (pstr, mid - 1, 626 eflags); 627 /* This can be quite complicated, so handle specially 628 only the common and easy case where the character with 629 different length representation of lower and upper 630 case is present at or after offset. */ 631 if (pstr->valid_len > offset 632 && mid == offset && pstr->offsets[mid] == offset) 633 { 634 memmove (pstr->wcs, pstr->wcs + offset, 635 (pstr->valid_len - offset) * sizeof (wint_t)); 636 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset); 637 pstr->valid_len -= offset; 638 pstr->valid_raw_len -= offset; 639 for (low = 0; low < pstr->valid_len; low++) 640 pstr->offsets[low] = pstr->offsets[low + offset] - offset; 641 } 642 else 643 { 644 /* Otherwise, just find out how long the partial multibyte 645 character at offset is and fill it with WEOF/255. */ 646 pstr->len = pstr->raw_len - idx + offset; 647 pstr->stop = pstr->raw_stop - idx + offset; 648 pstr->offsets_needed = 0; 649 while (mid > 0 && pstr->offsets[mid - 1] == offset) 650 --mid; 651 while (mid < pstr->valid_len) 652 if (pstr->wcs[mid] != WEOF) 653 break; 654 else 655 ++mid; 656 if (mid == pstr->valid_len) 657 pstr->valid_len = 0; 658 else 659 { 660 pstr->valid_len = pstr->offsets[mid] - offset; 661 if (pstr->valid_len) 662 { 663 for (low = 0; low < pstr->valid_len; ++low) 664 pstr->wcs[low] = WEOF; 665 memset (pstr->mbs, 255, pstr->valid_len); 666 } 667 } 668 pstr->valid_raw_len = pstr->valid_len; 669 } 670 } 671 else 672 #endif 673 { 674 pstr->tip_context = re_string_context_at (pstr, offset - 1, 675 eflags); 676 #ifdef RE_ENABLE_I18N 677 if (pstr->mb_cur_max > 1) 678 memmove (pstr->wcs, pstr->wcs + offset, 679 (pstr->valid_len - offset) * sizeof (wint_t)); 680 #endif /* RE_ENABLE_I18N */ 681 if (BE (pstr->mbs_allocated, 0)) 682 memmove (pstr->mbs, pstr->mbs + offset, 683 pstr->valid_len - offset); 684 pstr->valid_len -= offset; 685 pstr->valid_raw_len -= offset; 686 #if DEBUG 687 assert (pstr->valid_len > 0); 688 #endif 689 } 690 } 691 else 692 { 693 #ifdef RE_ENABLE_I18N 694 /* No, skip all characters until IDX. */ 695 Idx prev_valid_len = pstr->valid_len; 696 697 if (BE (pstr->offsets_needed, 0)) 698 { 699 pstr->len = pstr->raw_len - idx + offset; 700 pstr->stop = pstr->raw_stop - idx + offset; 701 pstr->offsets_needed = 0; 702 } 703 #endif 704 pstr->valid_len = 0; 705 #ifdef RE_ENABLE_I18N 706 if (pstr->mb_cur_max > 1) 707 { 708 Idx wcs_idx; 709 wint_t wc = WEOF; 710 711 if (pstr->is_utf8) 712 { 713 const unsigned char *raw, *p, *end; 714 715 /* Special case UTF-8. Multi-byte chars start with any 716 byte other than 0x80 - 0xbf. */ 717 raw = pstr->raw_mbs + pstr->raw_mbs_idx; 718 end = raw + (offset - pstr->mb_cur_max); 719 if (end < pstr->raw_mbs) 720 end = pstr->raw_mbs; 721 p = raw + offset - 1; 722 #ifdef _LIBC 723 /* We know the wchar_t encoding is UCS4, so for the simple 724 case, ASCII characters, skip the conversion step. */ 725 if (isascii (*p) && BE (pstr->trans == NULL, 1)) 726 { 727 memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); 728 /* pstr->valid_len = 0; */ 729 wc = (wchar_t) *p; 730 } 731 else 732 #endif 733 for (; p >= end; --p) 734 if ((*p & 0xc0) != 0x80) 735 { 736 mbstate_t cur_state; 737 wchar_t wc2; 738 Idx mlen = raw + pstr->len - p; 739 size_t mbclen; 740 741 #if 0 /* dead code: buf is set but never used */ 742 unsigned char buf[6]; 743 if (BE (pstr->trans != NULL, 0)) 744 { 745 int i = mlen < 6 ? mlen : 6; 746 while (--i >= 0) 747 buf[i] = pstr->trans[p[i]]; 748 } 749 #endif 750 /* XXX Don't use mbrtowc, we know which conversion 751 to use (UTF-8 -> UCS4). */ 752 memset (&cur_state, 0, sizeof (cur_state)); 753 mbclen = __mbrtowc (&wc2, (const char *) p, mlen, 754 &cur_state); 755 if (raw + offset - p <= mbclen 756 && mbclen < (size_t) -2) 757 { 758 memset (&pstr->cur_state, '\0', 759 sizeof (mbstate_t)); 760 pstr->valid_len = mbclen - (raw + offset - p); 761 wc = wc2; 762 } 763 break; 764 } 765 } 766 767 if (wc == WEOF) 768 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx; 769 if (wc == WEOF) 770 pstr->tip_context 771 = re_string_context_at (pstr, prev_valid_len - 1, eflags); 772 else 773 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0) 774 && IS_WIDE_WORD_CHAR (wc)) 775 ? CONTEXT_WORD 776 : ((IS_WIDE_NEWLINE (wc) 777 && pstr->newline_anchor) 778 ? CONTEXT_NEWLINE : 0)); 779 if (BE (pstr->valid_len, 0)) 780 { 781 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx) 782 pstr->wcs[wcs_idx] = WEOF; 783 if (pstr->mbs_allocated) 784 memset (pstr->mbs, 255, pstr->valid_len); 785 } 786 pstr->valid_raw_len = pstr->valid_len; 787 } 788 else 789 #endif /* RE_ENABLE_I18N */ 790 { 791 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1]; 792 pstr->valid_raw_len = 0; 793 if (pstr->trans) 794 c = pstr->trans[c]; 795 pstr->tip_context = (bitset_contain (pstr->word_char, c) 796 ? CONTEXT_WORD 797 : ((IS_NEWLINE (c) && pstr->newline_anchor) 798 ? CONTEXT_NEWLINE : 0)); 799 } 800 } 801 if (!BE (pstr->mbs_allocated, 0)) 802 pstr->mbs += offset; 803 } 804 pstr->raw_mbs_idx = idx; 805 pstr->len -= offset; 806 pstr->stop -= offset; 807 808 /* Then build the buffers. */ 809 #ifdef RE_ENABLE_I18N 810 if (pstr->mb_cur_max > 1) 811 { 812 if (pstr->icase) 813 { 814 reg_errcode_t ret = build_wcs_upper_buffer (pstr); 815 if (BE (ret != REG_NOERROR, 0)) 816 return ret; 817 } 818 else 819 build_wcs_buffer (pstr); 820 } 821 else 822 #endif /* RE_ENABLE_I18N */ 823 if (BE (pstr->mbs_allocated, 0)) 824 { 825 if (pstr->icase) 826 build_upper_buffer (pstr); 827 else if (pstr->trans != NULL) 828 re_string_translate_buffer (pstr); 829 } 830 else 831 pstr->valid_len = pstr->len; 832 833 pstr->cur_idx = 0; 834 return REG_NOERROR; 835 } 836 837 static unsigned char 838 internal_function __attribute ((pure)) 839 re_string_peek_byte_case (const re_string_t *pstr, Idx idx) 840 { 841 int ch; 842 Idx off; 843 844 /* Handle the common (easiest) cases first. */ 845 if (BE (!pstr->mbs_allocated, 1)) 846 return re_string_peek_byte (pstr, idx); 847 848 #ifdef RE_ENABLE_I18N 849 if (pstr->mb_cur_max > 1 850 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx)) 851 return re_string_peek_byte (pstr, idx); 852 #endif 853 854 off = pstr->cur_idx + idx; 855 #ifdef RE_ENABLE_I18N 856 if (pstr->offsets_needed) 857 off = pstr->offsets[off]; 858 #endif 859 860 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off]; 861 862 #ifdef RE_ENABLE_I18N 863 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I 864 this function returns CAPITAL LETTER I instead of first byte of 865 DOTLESS SMALL LETTER I. The latter would confuse the parser, 866 since peek_byte_case doesn't advance cur_idx in any way. */ 867 if (pstr->offsets_needed && !isascii (ch)) 868 return re_string_peek_byte (pstr, idx); 869 #endif 870 871 return ch; 872 } 873 874 static unsigned char 875 internal_function __attribute ((pure)) 876 re_string_fetch_byte_case (re_string_t *pstr) 877 { 878 if (BE (!pstr->mbs_allocated, 1)) 879 return re_string_fetch_byte (pstr); 880 881 #ifdef RE_ENABLE_I18N 882 if (pstr->offsets_needed) 883 { 884 Idx off; 885 int ch; 886 887 /* For tr_TR.UTF-8 [[:islower:]] there is 888 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip 889 in that case the whole multi-byte character and return 890 the original letter. On the other side, with 891 [[: DOTLESS SMALL LETTER I return [[:I, as doing 892 anything else would complicate things too much. */ 893 894 if (!re_string_first_byte (pstr, pstr->cur_idx)) 895 return re_string_fetch_byte (pstr); 896 897 off = pstr->offsets[pstr->cur_idx]; 898 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off]; 899 900 if (! isascii (ch)) 901 return re_string_fetch_byte (pstr); 902 903 re_string_skip_bytes (pstr, 904 re_string_char_size_at (pstr, pstr->cur_idx)); 905 return ch; 906 } 907 #endif 908 909 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++]; 910 } 911 912 static void 913 internal_function 914 re_string_destruct (re_string_t *pstr) 915 { 916 #ifdef RE_ENABLE_I18N 917 re_free (pstr->wcs); 918 re_free (pstr->offsets); 919 #endif /* RE_ENABLE_I18N */ 920 if (pstr->mbs_allocated) 921 re_free (pstr->mbs); 922 } 923 924 /* Return the context at IDX in INPUT. */ 925 926 static unsigned int 927 internal_function 928 re_string_context_at (const re_string_t *input, Idx idx, int eflags) 929 { 930 int c; 931 if (BE (! REG_VALID_INDEX (idx), 0)) 932 /* In this case, we use the value stored in input->tip_context, 933 since we can't know the character in input->mbs[-1] here. */ 934 return input->tip_context; 935 if (BE (idx == input->len, 0)) 936 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF 937 : CONTEXT_NEWLINE | CONTEXT_ENDBUF); 938 #ifdef RE_ENABLE_I18N 939 if (input->mb_cur_max > 1) 940 { 941 wint_t wc; 942 Idx wc_idx = idx; 943 while(input->wcs[wc_idx] == WEOF) 944 { 945 #ifdef DEBUG 946 /* It must not happen. */ 947 assert (REG_VALID_INDEX (wc_idx)); 948 #endif 949 --wc_idx; 950 if (! REG_VALID_INDEX (wc_idx)) 951 return input->tip_context; 952 } 953 wc = input->wcs[wc_idx]; 954 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc)) 955 return CONTEXT_WORD; 956 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor 957 ? CONTEXT_NEWLINE : 0); 958 } 959 else 960 #endif 961 { 962 c = re_string_byte_at (input, idx); 963 if (bitset_contain (input->word_char, c)) 964 return CONTEXT_WORD; 965 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0; 966 } 967 } 968 969 /* Functions for set operation. */ 970 971 static reg_errcode_t 972 internal_function __attribute_warn_unused_result__ 973 re_node_set_alloc (re_node_set *set, Idx size) 974 { 975 set->alloc = size; 976 set->nelem = 0; 977 set->elems = re_malloc (Idx, size); 978 if (BE (set->elems == NULL, 0)) 979 return REG_ESPACE; 980 return REG_NOERROR; 981 } 982 983 static reg_errcode_t 984 internal_function __attribute_warn_unused_result__ 985 re_node_set_init_1 (re_node_set *set, Idx elem) 986 { 987 set->alloc = 1; 988 set->nelem = 1; 989 set->elems = re_malloc (Idx, 1); 990 if (BE (set->elems == NULL, 0)) 991 { 992 set->alloc = set->nelem = 0; 993 return REG_ESPACE; 994 } 995 set->elems[0] = elem; 996 return REG_NOERROR; 997 } 998 999 static reg_errcode_t 1000 internal_function __attribute_warn_unused_result__ 1001 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2) 1002 { 1003 set->alloc = 2; 1004 set->elems = re_malloc (Idx, 2); 1005 if (BE (set->elems == NULL, 0)) 1006 return REG_ESPACE; 1007 if (elem1 == elem2) 1008 { 1009 set->nelem = 1; 1010 set->elems[0] = elem1; 1011 } 1012 else 1013 { 1014 set->nelem = 2; 1015 if (elem1 < elem2) 1016 { 1017 set->elems[0] = elem1; 1018 set->elems[1] = elem2; 1019 } 1020 else 1021 { 1022 set->elems[0] = elem2; 1023 set->elems[1] = elem1; 1024 } 1025 } 1026 return REG_NOERROR; 1027 } 1028 1029 static reg_errcode_t 1030 internal_function __attribute_warn_unused_result__ 1031 re_node_set_init_copy (re_node_set *dest, const re_node_set *src) 1032 { 1033 dest->nelem = src->nelem; 1034 if (src->nelem > 0) 1035 { 1036 dest->alloc = dest->nelem; 1037 dest->elems = re_malloc (Idx, dest->alloc); 1038 if (BE (dest->elems == NULL, 0)) 1039 { 1040 dest->alloc = dest->nelem = 0; 1041 return REG_ESPACE; 1042 } 1043 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx)); 1044 } 1045 else 1046 re_node_set_init_empty (dest); 1047 return REG_NOERROR; 1048 } 1049 1050 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to 1051 DEST. Return value indicate the error code or REG_NOERROR if succeeded. 1052 Note: We assume dest->elems is NULL, when dest->alloc is 0. */ 1053 1054 static reg_errcode_t 1055 internal_function __attribute_warn_unused_result__ 1056 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1, 1057 const re_node_set *src2) 1058 { 1059 Idx i1, i2, is, id, delta, sbase; 1060 if (src1->nelem == 0 || src2->nelem == 0) 1061 return REG_NOERROR; 1062 1063 /* We need dest->nelem + 2 * elems_in_intersection; this is a 1064 conservative estimate. */ 1065 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc) 1066 { 1067 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc; 1068 Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc); 1069 if (BE (new_elems == NULL, 0)) 1070 return REG_ESPACE; 1071 dest->elems = new_elems; 1072 dest->alloc = new_alloc; 1073 } 1074 1075 /* Find the items in the intersection of SRC1 and SRC2, and copy 1076 into the top of DEST those that are not already in DEST itself. */ 1077 sbase = dest->nelem + src1->nelem + src2->nelem; 1078 i1 = src1->nelem - 1; 1079 i2 = src2->nelem - 1; 1080 id = dest->nelem - 1; 1081 for (;;) 1082 { 1083 if (src1->elems[i1] == src2->elems[i2]) 1084 { 1085 /* Try to find the item in DEST. Maybe we could binary search? */ 1086 while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1]) 1087 --id; 1088 1089 if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1]) 1090 dest->elems[--sbase] = src1->elems[i1]; 1091 1092 if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2)) 1093 break; 1094 } 1095 1096 /* Lower the highest of the two items. */ 1097 else if (src1->elems[i1] < src2->elems[i2]) 1098 { 1099 if (! REG_VALID_INDEX (--i2)) 1100 break; 1101 } 1102 else 1103 { 1104 if (! REG_VALID_INDEX (--i1)) 1105 break; 1106 } 1107 } 1108 1109 id = dest->nelem - 1; 1110 is = dest->nelem + src1->nelem + src2->nelem - 1; 1111 delta = is - sbase + 1; 1112 1113 /* Now copy. When DELTA becomes zero, the remaining 1114 DEST elements are already in place; this is more or 1115 less the same loop that is in re_node_set_merge. */ 1116 dest->nelem += delta; 1117 if (delta > 0 && REG_VALID_INDEX (id)) 1118 for (;;) 1119 { 1120 if (dest->elems[is] > dest->elems[id]) 1121 { 1122 /* Copy from the top. */ 1123 dest->elems[id + delta--] = dest->elems[is--]; 1124 if (delta == 0) 1125 break; 1126 } 1127 else 1128 { 1129 /* Slide from the bottom. */ 1130 dest->elems[id + delta] = dest->elems[id]; 1131 if (! REG_VALID_INDEX (--id)) 1132 break; 1133 } 1134 } 1135 1136 /* Copy remaining SRC elements. */ 1137 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx)); 1138 1139 return REG_NOERROR; 1140 } 1141 1142 /* Calculate the union set of the sets SRC1 and SRC2. And store it to 1143 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */ 1144 1145 static reg_errcode_t 1146 internal_function __attribute_warn_unused_result__ 1147 re_node_set_init_union (re_node_set *dest, const re_node_set *src1, 1148 const re_node_set *src2) 1149 { 1150 Idx i1, i2, id; 1151 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0) 1152 { 1153 dest->alloc = src1->nelem + src2->nelem; 1154 dest->elems = re_malloc (Idx, dest->alloc); 1155 if (BE (dest->elems == NULL, 0)) 1156 return REG_ESPACE; 1157 } 1158 else 1159 { 1160 if (src1 != NULL && src1->nelem > 0) 1161 return re_node_set_init_copy (dest, src1); 1162 else if (src2 != NULL && src2->nelem > 0) 1163 return re_node_set_init_copy (dest, src2); 1164 else 1165 re_node_set_init_empty (dest); 1166 return REG_NOERROR; 1167 } 1168 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;) 1169 { 1170 if (src1->elems[i1] > src2->elems[i2]) 1171 { 1172 dest->elems[id++] = src2->elems[i2++]; 1173 continue; 1174 } 1175 if (src1->elems[i1] == src2->elems[i2]) 1176 ++i2; 1177 dest->elems[id++] = src1->elems[i1++]; 1178 } 1179 if (i1 < src1->nelem) 1180 { 1181 memcpy (dest->elems + id, src1->elems + i1, 1182 (src1->nelem - i1) * sizeof (Idx)); 1183 id += src1->nelem - i1; 1184 } 1185 else if (i2 < src2->nelem) 1186 { 1187 memcpy (dest->elems + id, src2->elems + i2, 1188 (src2->nelem - i2) * sizeof (Idx)); 1189 id += src2->nelem - i2; 1190 } 1191 dest->nelem = id; 1192 return REG_NOERROR; 1193 } 1194 1195 /* Calculate the union set of the sets DEST and SRC. And store it to 1196 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */ 1197 1198 static reg_errcode_t 1199 internal_function __attribute_warn_unused_result__ 1200 re_node_set_merge (re_node_set *dest, const re_node_set *src) 1201 { 1202 Idx is, id, sbase, delta; 1203 if (src == NULL || src->nelem == 0) 1204 return REG_NOERROR; 1205 if (dest->alloc < 2 * src->nelem + dest->nelem) 1206 { 1207 Idx new_alloc = 2 * (src->nelem + dest->alloc); 1208 Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc); 1209 if (BE (new_buffer == NULL, 0)) 1210 return REG_ESPACE; 1211 dest->elems = new_buffer; 1212 dest->alloc = new_alloc; 1213 } 1214 1215 if (BE (dest->nelem == 0, 0)) 1216 { 1217 dest->nelem = src->nelem; 1218 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx)); 1219 return REG_NOERROR; 1220 } 1221 1222 /* Copy into the top of DEST the items of SRC that are not 1223 found in DEST. Maybe we could binary search in DEST? */ 1224 for (sbase = dest->nelem + 2 * src->nelem, 1225 is = src->nelem - 1, id = dest->nelem - 1; 1226 REG_VALID_INDEX (is) && REG_VALID_INDEX (id); ) 1227 { 1228 if (dest->elems[id] == src->elems[is]) 1229 is--, id--; 1230 else if (dest->elems[id] < src->elems[is]) 1231 dest->elems[--sbase] = src->elems[is--]; 1232 else /* if (dest->elems[id] > src->elems[is]) */ 1233 --id; 1234 } 1235 1236 if (REG_VALID_INDEX (is)) 1237 { 1238 /* If DEST is exhausted, the remaining items of SRC must be unique. */ 1239 sbase -= is + 1; 1240 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx)); 1241 } 1242 1243 id = dest->nelem - 1; 1244 is = dest->nelem + 2 * src->nelem - 1; 1245 delta = is - sbase + 1; 1246 if (delta == 0) 1247 return REG_NOERROR; 1248 1249 /* Now copy. When DELTA becomes zero, the remaining 1250 DEST elements are already in place. */ 1251 dest->nelem += delta; 1252 for (;;) 1253 { 1254 if (dest->elems[is] > dest->elems[id]) 1255 { 1256 /* Copy from the top. */ 1257 dest->elems[id + delta--] = dest->elems[is--]; 1258 if (delta == 0) 1259 break; 1260 } 1261 else 1262 { 1263 /* Slide from the bottom. */ 1264 dest->elems[id + delta] = dest->elems[id]; 1265 if (! REG_VALID_INDEX (--id)) 1266 { 1267 /* Copy remaining SRC elements. */ 1268 memcpy (dest->elems, dest->elems + sbase, 1269 delta * sizeof (Idx)); 1270 break; 1271 } 1272 } 1273 } 1274 1275 return REG_NOERROR; 1276 } 1277 1278 /* Insert the new element ELEM to the re_node_set* SET. 1279 SET should not already have ELEM. 1280 Return true if successful. */ 1281 1282 static bool 1283 internal_function __attribute_warn_unused_result__ 1284 re_node_set_insert (re_node_set *set, Idx elem) 1285 { 1286 Idx idx; 1287 /* In case the set is empty. */ 1288 if (set->alloc == 0) 1289 return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1); 1290 1291 if (BE (set->nelem, 0) == 0) 1292 { 1293 /* We already guaranteed above that set->alloc != 0. */ 1294 set->elems[0] = elem; 1295 ++set->nelem; 1296 return true; 1297 } 1298 1299 /* Realloc if we need. */ 1300 if (set->alloc == set->nelem) 1301 { 1302 Idx *new_elems; 1303 set->alloc = set->alloc * 2; 1304 new_elems = re_realloc (set->elems, Idx, set->alloc); 1305 if (BE (new_elems == NULL, 0)) 1306 return false; 1307 set->elems = new_elems; 1308 } 1309 1310 /* Move the elements which follows the new element. Test the 1311 first element separately to skip a check in the inner loop. */ 1312 if (elem < set->elems[0]) 1313 { 1314 idx = 0; 1315 for (idx = set->nelem; idx > 0; idx--) 1316 set->elems[idx] = set->elems[idx - 1]; 1317 } 1318 else 1319 { 1320 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--) 1321 set->elems[idx] = set->elems[idx - 1]; 1322 } 1323 1324 /* Insert the new element. */ 1325 set->elems[idx] = elem; 1326 ++set->nelem; 1327 return true; 1328 } 1329 1330 /* Insert the new element ELEM to the re_node_set* SET. 1331 SET should not already have any element greater than or equal to ELEM. 1332 Return true if successful. */ 1333 1334 static bool 1335 internal_function __attribute_warn_unused_result__ 1336 re_node_set_insert_last (re_node_set *set, Idx elem) 1337 { 1338 /* Realloc if we need. */ 1339 if (set->alloc == set->nelem) 1340 { 1341 Idx *new_elems; 1342 set->alloc = (set->alloc + 1) * 2; 1343 new_elems = re_realloc (set->elems, Idx, set->alloc); 1344 if (BE (new_elems == NULL, 0)) 1345 return false; 1346 set->elems = new_elems; 1347 } 1348 1349 /* Insert the new element. */ 1350 set->elems[set->nelem++] = elem; 1351 return true; 1352 } 1353 1354 /* Compare two node sets SET1 and SET2. 1355 Return true if SET1 and SET2 are equivalent. */ 1356 1357 static bool 1358 internal_function __attribute ((pure)) 1359 re_node_set_compare (const re_node_set *set1, const re_node_set *set2) 1360 { 1361 Idx i; 1362 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem) 1363 return false; 1364 for (i = set1->nelem ; REG_VALID_INDEX (--i) ; ) 1365 if (set1->elems[i] != set2->elems[i]) 1366 return false; 1367 return true; 1368 } 1369 1370 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */ 1371 1372 static Idx 1373 internal_function __attribute ((pure)) 1374 re_node_set_contains (const re_node_set *set, Idx elem) 1375 { 1376 __re_size_t idx, right, mid; 1377 if (! REG_VALID_NONZERO_INDEX (set->nelem)) 1378 return 0; 1379 1380 /* Binary search the element. */ 1381 idx = 0; 1382 right = set->nelem - 1; 1383 while (idx < right) 1384 { 1385 mid = (idx + right) / 2; 1386 if (set->elems[mid] < elem) 1387 idx = mid + 1; 1388 else 1389 right = mid; 1390 } 1391 return set->elems[idx] == elem ? idx + 1 : 0; 1392 } 1393 1394 static void 1395 internal_function 1396 re_node_set_remove_at (re_node_set *set, Idx idx) 1397 { 1398 verify (! TYPE_SIGNED (Idx)); 1399 /* if (idx < 0) 1400 return; */ 1401 if (idx >= set->nelem) 1402 return; 1403 --set->nelem; 1404 for (; idx < set->nelem; idx++) 1405 set->elems[idx] = set->elems[idx + 1]; 1406 } 1407 1408 1409 /* Add the token TOKEN to dfa->nodes, and return the index of the token. 1410 Or return REG_MISSING if an error occurred. */ 1411 1412 static Idx 1413 internal_function 1414 re_dfa_add_node (re_dfa_t *dfa, re_token_t token) 1415 { 1416 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0)) 1417 { 1418 size_t new_nodes_alloc = dfa->nodes_alloc * 2; 1419 Idx *new_nexts, *new_indices; 1420 re_node_set *new_edests, *new_eclosures; 1421 re_token_t *new_nodes; 1422 size_t max_object_size = 1423 MAX (sizeof (re_token_t), 1424 MAX (sizeof (re_node_set), 1425 sizeof (Idx))); 1426 1427 /* Avoid overflows. */ 1428 if (BE (SIZE_MAX / 2 / max_object_size < dfa->nodes_alloc, 0)) 1429 return REG_MISSING; 1430 1431 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc); 1432 if (BE (new_nodes == NULL, 0)) 1433 return REG_MISSING; 1434 dfa->nodes = new_nodes; 1435 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc); 1436 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc); 1437 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc); 1438 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc); 1439 if (BE (new_nexts == NULL || new_indices == NULL 1440 || new_edests == NULL || new_eclosures == NULL, 0)) 1441 return REG_MISSING; 1442 dfa->nexts = new_nexts; 1443 dfa->org_indices = new_indices; 1444 dfa->edests = new_edests; 1445 dfa->eclosures = new_eclosures; 1446 dfa->nodes_alloc = new_nodes_alloc; 1447 } 1448 dfa->nodes[dfa->nodes_len] = token; 1449 dfa->nodes[dfa->nodes_len].constraint = 0; 1450 #ifdef RE_ENABLE_I18N 1451 { 1452 int type = token.type; 1453 dfa->nodes[dfa->nodes_len].accept_mb = 1454 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET; 1455 } 1456 #endif 1457 dfa->nexts[dfa->nodes_len] = REG_MISSING; 1458 re_node_set_init_empty (dfa->edests + dfa->nodes_len); 1459 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len); 1460 return dfa->nodes_len++; 1461 } 1462 1463 static inline re_hashval_t 1464 internal_function 1465 calc_state_hash (const re_node_set *nodes, unsigned int context) 1466 { 1467 re_hashval_t hash = nodes->nelem + context; 1468 Idx i; 1469 for (i = 0 ; i < nodes->nelem ; i++) 1470 hash += nodes->elems[i]; 1471 return hash; 1472 } 1473 1474 /* Search for the state whose node_set is equivalent to NODES. 1475 Return the pointer to the state, if we found it in the DFA. 1476 Otherwise create the new one and return it. In case of an error 1477 return NULL and set the error code in ERR. 1478 Note: - We assume NULL as the invalid state, then it is possible that 1479 return value is NULL and ERR is REG_NOERROR. 1480 - We never return non-NULL value in case of any errors, it is for 1481 optimization. */ 1482 1483 static re_dfastate_t * 1484 internal_function __attribute_warn_unused_result__ 1485 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa, 1486 const re_node_set *nodes) 1487 { 1488 re_hashval_t hash; 1489 re_dfastate_t *new_state; 1490 struct re_state_table_entry *spot; 1491 Idx i; 1492 #ifdef lint 1493 /* Suppress bogus uninitialized-variable warnings. */ 1494 *err = REG_NOERROR; 1495 #endif 1496 if (BE (nodes->nelem == 0, 0)) 1497 { 1498 *err = REG_NOERROR; 1499 return NULL; 1500 } 1501 hash = calc_state_hash (nodes, 0); 1502 spot = dfa->state_table + (hash & dfa->state_hash_mask); 1503 1504 for (i = 0 ; i < spot->num ; i++) 1505 { 1506 re_dfastate_t *state = spot->array[i]; 1507 if (hash != state->hash) 1508 continue; 1509 if (re_node_set_compare (&state->nodes, nodes)) 1510 return state; 1511 } 1512 1513 /* There are no appropriate state in the dfa, create the new one. */ 1514 new_state = create_ci_newstate (dfa, nodes, hash); 1515 if (BE (new_state == NULL, 0)) 1516 *err = REG_ESPACE; 1517 1518 return new_state; 1519 } 1520 1521 /* Search for the state whose node_set is equivalent to NODES and 1522 whose context is equivalent to CONTEXT. 1523 Return the pointer to the state, if we found it in the DFA. 1524 Otherwise create the new one and return it. In case of an error 1525 return NULL and set the error code in ERR. 1526 Note: - We assume NULL as the invalid state, then it is possible that 1527 return value is NULL and ERR is REG_NOERROR. 1528 - We never return non-NULL value in case of any errors, it is for 1529 optimization. */ 1530 1531 static re_dfastate_t * 1532 internal_function __attribute_warn_unused_result__ 1533 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa, 1534 const re_node_set *nodes, unsigned int context) 1535 { 1536 re_hashval_t hash; 1537 re_dfastate_t *new_state; 1538 struct re_state_table_entry *spot; 1539 Idx i; 1540 #ifdef lint 1541 /* Suppress bogus uninitialized-variable warnings. */ 1542 *err = REG_NOERROR; 1543 #endif 1544 if (nodes->nelem == 0) 1545 { 1546 *err = REG_NOERROR; 1547 return NULL; 1548 } 1549 hash = calc_state_hash (nodes, context); 1550 spot = dfa->state_table + (hash & dfa->state_hash_mask); 1551 1552 for (i = 0 ; i < spot->num ; i++) 1553 { 1554 re_dfastate_t *state = spot->array[i]; 1555 if (state->hash == hash 1556 && state->context == context 1557 && re_node_set_compare (state->entrance_nodes, nodes)) 1558 return state; 1559 } 1560 /* There are no appropriate state in `dfa', create the new one. */ 1561 new_state = create_cd_newstate (dfa, nodes, context, hash); 1562 if (BE (new_state == NULL, 0)) 1563 *err = REG_ESPACE; 1564 1565 return new_state; 1566 } 1567 1568 /* Finish initialization of the new state NEWSTATE, and using its hash value 1569 HASH put in the appropriate bucket of DFA's state table. Return value 1570 indicates the error code if failed. */ 1571 1572 static reg_errcode_t 1573 __attribute_warn_unused_result__ 1574 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate, 1575 re_hashval_t hash) 1576 { 1577 struct re_state_table_entry *spot; 1578 reg_errcode_t err; 1579 Idx i; 1580 1581 newstate->hash = hash; 1582 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem); 1583 if (BE (err != REG_NOERROR, 0)) 1584 return REG_ESPACE; 1585 for (i = 0; i < newstate->nodes.nelem; i++) 1586 { 1587 Idx elem = newstate->nodes.elems[i]; 1588 if (!IS_EPSILON_NODE (dfa->nodes[elem].type)) 1589 if (BE (! re_node_set_insert_last (&newstate->non_eps_nodes, elem), 0)) 1590 return REG_ESPACE; 1591 } 1592 1593 spot = dfa->state_table + (hash & dfa->state_hash_mask); 1594 if (BE (spot->alloc <= spot->num, 0)) 1595 { 1596 Idx new_alloc = 2 * spot->num + 2; 1597 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *, 1598 new_alloc); 1599 if (BE (new_array == NULL, 0)) 1600 return REG_ESPACE; 1601 spot->array = new_array; 1602 spot->alloc = new_alloc; 1603 } 1604 spot->array[spot->num++] = newstate; 1605 return REG_NOERROR; 1606 } 1607 1608 static void 1609 free_state (re_dfastate_t *state) 1610 { 1611 re_node_set_free (&state->non_eps_nodes); 1612 re_node_set_free (&state->inveclosure); 1613 if (state->entrance_nodes != &state->nodes) 1614 { 1615 re_node_set_free (state->entrance_nodes); 1616 re_free (state->entrance_nodes); 1617 } 1618 re_node_set_free (&state->nodes); 1619 re_free (state->word_trtable); 1620 re_free (state->trtable); 1621 re_free (state); 1622 } 1623 1624 /* Create the new state which is independ of contexts. 1625 Return the new state if succeeded, otherwise return NULL. */ 1626 1627 static re_dfastate_t * 1628 internal_function __attribute_warn_unused_result__ 1629 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes, 1630 re_hashval_t hash) 1631 { 1632 Idx i; 1633 reg_errcode_t err; 1634 re_dfastate_t *newstate; 1635 1636 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1); 1637 if (BE (newstate == NULL, 0)) 1638 return NULL; 1639 err = re_node_set_init_copy (&newstate->nodes, nodes); 1640 if (BE (err != REG_NOERROR, 0)) 1641 { 1642 re_free (newstate); 1643 return NULL; 1644 } 1645 1646 newstate->entrance_nodes = &newstate->nodes; 1647 for (i = 0 ; i < nodes->nelem ; i++) 1648 { 1649 re_token_t *node = dfa->nodes + nodes->elems[i]; 1650 re_token_type_t type = node->type; 1651 if (type == CHARACTER && !node->constraint) 1652 continue; 1653 #ifdef RE_ENABLE_I18N 1654 newstate->accept_mb |= node->accept_mb; 1655 #endif /* RE_ENABLE_I18N */ 1656 1657 /* If the state has the halt node, the state is a halt state. */ 1658 if (type == END_OF_RE) 1659 newstate->halt = 1; 1660 else if (type == OP_BACK_REF) 1661 newstate->has_backref = 1; 1662 else if (type == ANCHOR || node->constraint) 1663 newstate->has_constraint = 1; 1664 } 1665 err = register_state (dfa, newstate, hash); 1666 if (BE (err != REG_NOERROR, 0)) 1667 { 1668 free_state (newstate); 1669 newstate = NULL; 1670 } 1671 return newstate; 1672 } 1673 1674 /* Create the new state which is depend on the context CONTEXT. 1675 Return the new state if succeeded, otherwise return NULL. */ 1676 1677 static re_dfastate_t * 1678 internal_function __attribute_warn_unused_result__ 1679 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes, 1680 unsigned int context, re_hashval_t hash) 1681 { 1682 Idx i, nctx_nodes = 0; 1683 reg_errcode_t err; 1684 re_dfastate_t *newstate; 1685 1686 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1); 1687 if (BE (newstate == NULL, 0)) 1688 return NULL; 1689 err = re_node_set_init_copy (&newstate->nodes, nodes); 1690 if (BE (err != REG_NOERROR, 0)) 1691 { 1692 re_free (newstate); 1693 return NULL; 1694 } 1695 1696 newstate->context = context; 1697 newstate->entrance_nodes = &newstate->nodes; 1698 1699 for (i = 0 ; i < nodes->nelem ; i++) 1700 { 1701 re_token_t *node = dfa->nodes + nodes->elems[i]; 1702 re_token_type_t type = node->type; 1703 unsigned int constraint = node->constraint; 1704 1705 if (type == CHARACTER && !constraint) 1706 continue; 1707 #ifdef RE_ENABLE_I18N 1708 newstate->accept_mb |= node->accept_mb; 1709 #endif /* RE_ENABLE_I18N */ 1710 1711 /* If the state has the halt node, the state is a halt state. */ 1712 if (type == END_OF_RE) 1713 newstate->halt = 1; 1714 else if (type == OP_BACK_REF) 1715 newstate->has_backref = 1; 1716 1717 if (constraint) 1718 { 1719 if (newstate->entrance_nodes == &newstate->nodes) 1720 { 1721 newstate->entrance_nodes = re_malloc (re_node_set, 1); 1722 if (BE (newstate->entrance_nodes == NULL, 0)) 1723 { 1724 free_state (newstate); 1725 return NULL; 1726 } 1727 if (re_node_set_init_copy (newstate->entrance_nodes, nodes) 1728 != REG_NOERROR) 1729 return NULL; 1730 nctx_nodes = 0; 1731 newstate->has_constraint = 1; 1732 } 1733 1734 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context)) 1735 { 1736 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes); 1737 ++nctx_nodes; 1738 } 1739 } 1740 } 1741 err = register_state (dfa, newstate, hash); 1742 if (BE (err != REG_NOERROR, 0)) 1743 { 1744 free_state (newstate); 1745 newstate = NULL; 1746 } 1747 return newstate; 1748 } 1749