1 /* -*- buffer-read-only: t -*- vi: set ro: */ 2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */ 3 /* Extended regular expression matching and search library. 4 Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free 5 Software Foundation, Inc. 6 This file is part of the GNU C Library. 7 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 8 9 This program is free software; you can redistribute it and/or modify 10 it under the terms of the GNU General Public License as published by 11 the Free Software Foundation; either version 3, or (at your option) 12 any later version. 13 14 This program is distributed in the hope that it will be useful, 15 but WITHOUT ANY WARRANTY; without even the implied warranty of 16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 GNU General Public License for more details. 18 19 You should have received a copy of the GNU General Public License along 20 with this program; if not, write to the Free Software Foundation, 21 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 22 23 #include "verify.h" 24 #include "intprops.h" 25 static void re_string_construct_common (const char *str, Idx len, 26 re_string_t *pstr, 27 RE_TRANSLATE_TYPE trans, bool icase, 28 const re_dfa_t *dfa) internal_function; 29 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa, 30 const re_node_set *nodes, 31 re_hashval_t hash) internal_function; 32 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa, 33 const re_node_set *nodes, 34 unsigned int context, 35 re_hashval_t hash) internal_function; 36 37 /* Functions for string operation. */ 38 39 /* This function allocate the buffers. It is necessary to call 40 re_string_reconstruct before using the object. */ 41 42 static reg_errcode_t 43 internal_function __attribute_warn_unused_result__ 44 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len, 45 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa) 46 { 47 reg_errcode_t ret; 48 Idx init_buf_len; 49 50 /* Ensure at least one character fits into the buffers. */ 51 if (init_len < dfa->mb_cur_max) 52 init_len = dfa->mb_cur_max; 53 init_buf_len = (len + 1 < init_len) ? len + 1: init_len; 54 re_string_construct_common (str, len, pstr, trans, icase, dfa); 55 56 ret = re_string_realloc_buffers (pstr, init_buf_len); 57 if (BE (ret != REG_NOERROR, 0)) 58 return ret; 59 60 pstr->word_char = dfa->word_char; 61 pstr->word_ops_used = dfa->word_ops_used; 62 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str; 63 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len; 64 pstr->valid_raw_len = pstr->valid_len; 65 return REG_NOERROR; 66 } 67 68 /* This function allocate the buffers, and initialize them. */ 69 70 static reg_errcode_t 71 internal_function __attribute_warn_unused_result__ 72 re_string_construct (re_string_t *pstr, const char *str, Idx len, 73 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa) 74 { 75 reg_errcode_t ret; 76 memset (pstr, '\0', sizeof (re_string_t)); 77 re_string_construct_common (str, len, pstr, trans, icase, dfa); 78 79 if (len > 0) 80 { 81 ret = re_string_realloc_buffers (pstr, len + 1); 82 if (BE (ret != REG_NOERROR, 0)) 83 return ret; 84 } 85 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str; 86 87 if (icase) 88 { 89 #ifdef RE_ENABLE_I18N 90 if (dfa->mb_cur_max > 1) 91 { 92 while (1) 93 { 94 ret = build_wcs_upper_buffer (pstr); 95 if (BE (ret != REG_NOERROR, 0)) 96 return ret; 97 if (pstr->valid_raw_len >= len) 98 break; 99 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max) 100 break; 101 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2); 102 if (BE (ret != REG_NOERROR, 0)) 103 return ret; 104 } 105 } 106 else 107 #endif /* RE_ENABLE_I18N */ 108 build_upper_buffer (pstr); 109 } 110 else 111 { 112 #ifdef RE_ENABLE_I18N 113 if (dfa->mb_cur_max > 1) 114 build_wcs_buffer (pstr); 115 else 116 #endif /* RE_ENABLE_I18N */ 117 { 118 if (trans != NULL) 119 re_string_translate_buffer (pstr); 120 else 121 { 122 pstr->valid_len = pstr->bufs_len; 123 pstr->valid_raw_len = pstr->bufs_len; 124 } 125 } 126 } 127 128 return REG_NOERROR; 129 } 130 131 /* Helper functions for re_string_allocate, and re_string_construct. */ 132 133 static reg_errcode_t 134 internal_function __attribute_warn_unused_result__ 135 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len) 136 { 137 #ifdef RE_ENABLE_I18N 138 if (pstr->mb_cur_max > 1) 139 { 140 wint_t *new_wcs; 141 142 /* Avoid overflow. */ 143 size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx)); 144 if (BE (SIZE_MAX / max_object_size < new_buf_len, 0)) 145 return REG_ESPACE; 146 147 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len); 148 if (BE (new_wcs == NULL, 0)) 149 return REG_ESPACE; 150 pstr->wcs = new_wcs; 151 if (pstr->offsets != NULL) 152 { 153 Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len); 154 if (BE (new_offsets == NULL, 0)) 155 return REG_ESPACE; 156 pstr->offsets = new_offsets; 157 } 158 } 159 #endif /* RE_ENABLE_I18N */ 160 if (pstr->mbs_allocated) 161 { 162 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char, 163 new_buf_len); 164 if (BE (new_mbs == NULL, 0)) 165 return REG_ESPACE; 166 pstr->mbs = new_mbs; 167 } 168 pstr->bufs_len = new_buf_len; 169 return REG_NOERROR; 170 } 171 172 173 static void 174 internal_function 175 re_string_construct_common (const char *str, Idx len, re_string_t *pstr, 176 RE_TRANSLATE_TYPE trans, bool icase, 177 const re_dfa_t *dfa) 178 { 179 pstr->raw_mbs = (const unsigned char *) str; 180 pstr->len = len; 181 pstr->raw_len = len; 182 pstr->trans = trans; 183 pstr->icase = icase; 184 pstr->mbs_allocated = (trans != NULL || icase); 185 pstr->mb_cur_max = dfa->mb_cur_max; 186 pstr->is_utf8 = dfa->is_utf8; 187 pstr->map_notascii = dfa->map_notascii; 188 pstr->stop = pstr->len; 189 pstr->raw_stop = pstr->stop; 190 } 191 192 #ifdef RE_ENABLE_I18N 193 194 /* Build wide character buffer PSTR->WCS. 195 If the byte sequence of the string are: 196 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3> 197 Then wide character buffer will be: 198 <wc1> , WEOF , <wc2> , WEOF , <wc3> 199 We use WEOF for padding, they indicate that the position isn't 200 a first byte of a multibyte character. 201 202 Note that this function assumes PSTR->VALID_LEN elements are already 203 built and starts from PSTR->VALID_LEN. */ 204 205 static void 206 internal_function 207 build_wcs_buffer (re_string_t *pstr) 208 { 209 #ifdef _LIBC 210 unsigned char buf[MB_LEN_MAX]; 211 assert (MB_LEN_MAX >= pstr->mb_cur_max); 212 #else 213 unsigned char buf[64]; 214 #endif 215 mbstate_t prev_st; 216 Idx byte_idx, end_idx, remain_len; 217 size_t mbclen; 218 219 /* Build the buffers from pstr->valid_len to either pstr->len or 220 pstr->bufs_len. */ 221 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 222 for (byte_idx = pstr->valid_len; byte_idx < end_idx;) 223 { 224 wchar_t wc; 225 const char *p; 226 227 remain_len = end_idx - byte_idx; 228 prev_st = pstr->cur_state; 229 /* Apply the translation if we need. */ 230 if (BE (pstr->trans != NULL, 0)) 231 { 232 int i, ch; 233 234 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i) 235 { 236 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i]; 237 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch]; 238 } 239 p = (const char *) buf; 240 } 241 else 242 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx; 243 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state); 244 if (BE (mbclen == (size_t) -2, 0)) 245 { 246 /* The buffer doesn't have enough space, finish to build. */ 247 pstr->cur_state = prev_st; 248 break; 249 } 250 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0)) 251 { 252 /* We treat these cases as a singlebyte character. */ 253 mbclen = 1; 254 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; 255 if (BE (pstr->trans != NULL, 0)) 256 wc = pstr->trans[wc]; 257 pstr->cur_state = prev_st; 258 } 259 260 /* Write wide character and padding. */ 261 pstr->wcs[byte_idx++] = wc; 262 /* Write paddings. */ 263 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 264 pstr->wcs[byte_idx++] = WEOF; 265 } 266 pstr->valid_len = byte_idx; 267 pstr->valid_raw_len = byte_idx; 268 } 269 270 /* Build wide character buffer PSTR->WCS like build_wcs_buffer, 271 but for REG_ICASE. */ 272 273 static reg_errcode_t 274 internal_function __attribute_warn_unused_result__ 275 build_wcs_upper_buffer (re_string_t *pstr) 276 { 277 mbstate_t prev_st; 278 Idx src_idx, byte_idx, end_idx, remain_len; 279 size_t mbclen; 280 #ifdef _LIBC 281 char buf[MB_LEN_MAX]; 282 assert (MB_LEN_MAX >= pstr->mb_cur_max); 283 #else 284 char buf[64]; 285 #endif 286 287 byte_idx = pstr->valid_len; 288 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 289 290 /* The following optimization assumes that ASCII characters can be 291 mapped to wide characters with a simple cast. */ 292 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed) 293 { 294 while (byte_idx < end_idx) 295 { 296 wchar_t wc; 297 298 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]) 299 && mbsinit (&pstr->cur_state)) 300 { 301 /* In case of a singlebyte character. */ 302 pstr->mbs[byte_idx] 303 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]); 304 /* The next step uses the assumption that wchar_t is encoded 305 ASCII-safe: all ASCII values can be converted like this. */ 306 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx]; 307 ++byte_idx; 308 continue; 309 } 310 311 remain_len = end_idx - byte_idx; 312 prev_st = pstr->cur_state; 313 mbclen = __mbrtowc (&wc, 314 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx 315 + byte_idx), remain_len, &pstr->cur_state); 316 if (BE (mbclen < (size_t) -2, 1)) 317 { 318 wchar_t wcu = wc; 319 if (iswlower (wc)) 320 { 321 size_t mbcdlen; 322 323 wcu = towupper (wc); 324 mbcdlen = wcrtomb (buf, wcu, &prev_st); 325 if (BE (mbclen == mbcdlen, 1)) 326 memcpy (pstr->mbs + byte_idx, buf, mbclen); 327 else 328 { 329 src_idx = byte_idx; 330 goto offsets_needed; 331 } 332 } 333 else 334 memcpy (pstr->mbs + byte_idx, 335 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen); 336 pstr->wcs[byte_idx++] = wcu; 337 /* Write paddings. */ 338 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 339 pstr->wcs[byte_idx++] = WEOF; 340 } 341 else if (mbclen == (size_t) -1 || mbclen == 0) 342 { 343 /* It is an invalid character or '\0'. Just use the byte. */ 344 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; 345 pstr->mbs[byte_idx] = ch; 346 /* And also cast it to wide char. */ 347 pstr->wcs[byte_idx++] = (wchar_t) ch; 348 if (BE (mbclen == (size_t) -1, 0)) 349 pstr->cur_state = prev_st; 350 } 351 else 352 { 353 /* The buffer doesn't have enough space, finish to build. */ 354 pstr->cur_state = prev_st; 355 break; 356 } 357 } 358 pstr->valid_len = byte_idx; 359 pstr->valid_raw_len = byte_idx; 360 return REG_NOERROR; 361 } 362 else 363 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;) 364 { 365 wchar_t wc; 366 const char *p; 367 offsets_needed: 368 remain_len = end_idx - byte_idx; 369 prev_st = pstr->cur_state; 370 if (BE (pstr->trans != NULL, 0)) 371 { 372 int i, ch; 373 374 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i) 375 { 376 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i]; 377 buf[i] = pstr->trans[ch]; 378 } 379 p = (const char *) buf; 380 } 381 else 382 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx; 383 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state); 384 if (BE (mbclen < (size_t) -2, 1)) 385 { 386 wchar_t wcu = wc; 387 if (iswlower (wc)) 388 { 389 size_t mbcdlen; 390 391 wcu = towupper (wc); 392 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st); 393 if (BE (mbclen == mbcdlen, 1)) 394 memcpy (pstr->mbs + byte_idx, buf, mbclen); 395 else if (mbcdlen != (size_t) -1) 396 { 397 size_t i; 398 399 if (byte_idx + mbcdlen > pstr->bufs_len) 400 { 401 pstr->cur_state = prev_st; 402 break; 403 } 404 405 if (pstr->offsets == NULL) 406 { 407 pstr->offsets = re_malloc (Idx, pstr->bufs_len); 408 409 if (pstr->offsets == NULL) 410 return REG_ESPACE; 411 } 412 if (!pstr->offsets_needed) 413 { 414 for (i = 0; i < (size_t) byte_idx; ++i) 415 pstr->offsets[i] = i; 416 pstr->offsets_needed = 1; 417 } 418 419 memcpy (pstr->mbs + byte_idx, buf, mbcdlen); 420 pstr->wcs[byte_idx] = wcu; 421 pstr->offsets[byte_idx] = src_idx; 422 for (i = 1; i < mbcdlen; ++i) 423 { 424 pstr->offsets[byte_idx + i] 425 = src_idx + (i < mbclen ? i : mbclen - 1); 426 pstr->wcs[byte_idx + i] = WEOF; 427 } 428 pstr->len += mbcdlen - mbclen; 429 if (pstr->raw_stop > src_idx) 430 pstr->stop += mbcdlen - mbclen; 431 end_idx = (pstr->bufs_len > pstr->len) 432 ? pstr->len : pstr->bufs_len; 433 byte_idx += mbcdlen; 434 src_idx += mbclen; 435 continue; 436 } 437 else 438 memcpy (pstr->mbs + byte_idx, p, mbclen); 439 } 440 else 441 memcpy (pstr->mbs + byte_idx, p, mbclen); 442 443 if (BE (pstr->offsets_needed != 0, 0)) 444 { 445 size_t i; 446 for (i = 0; i < mbclen; ++i) 447 pstr->offsets[byte_idx + i] = src_idx + i; 448 } 449 src_idx += mbclen; 450 451 pstr->wcs[byte_idx++] = wcu; 452 /* Write paddings. */ 453 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 454 pstr->wcs[byte_idx++] = WEOF; 455 } 456 else if (mbclen == (size_t) -1 || mbclen == 0) 457 { 458 /* It is an invalid character or '\0'. Just use the byte. */ 459 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx]; 460 461 if (BE (pstr->trans != NULL, 0)) 462 ch = pstr->trans [ch]; 463 pstr->mbs[byte_idx] = ch; 464 465 if (BE (pstr->offsets_needed != 0, 0)) 466 pstr->offsets[byte_idx] = src_idx; 467 ++src_idx; 468 469 /* And also cast it to wide char. */ 470 pstr->wcs[byte_idx++] = (wchar_t) ch; 471 if (BE (mbclen == (size_t) -1, 0)) 472 pstr->cur_state = prev_st; 473 } 474 else 475 { 476 /* The buffer doesn't have enough space, finish to build. */ 477 pstr->cur_state = prev_st; 478 break; 479 } 480 } 481 pstr->valid_len = byte_idx; 482 pstr->valid_raw_len = src_idx; 483 return REG_NOERROR; 484 } 485 486 /* Skip characters until the index becomes greater than NEW_RAW_IDX. 487 Return the index. */ 488 489 static Idx 490 internal_function 491 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc) 492 { 493 mbstate_t prev_st; 494 Idx rawbuf_idx; 495 size_t mbclen; 496 wint_t wc = WEOF; 497 498 /* Skip the characters which are not necessary to check. */ 499 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len; 500 rawbuf_idx < new_raw_idx;) 501 { 502 wchar_t wc2; 503 Idx remain_len; 504 remain_len = pstr->len - rawbuf_idx; 505 prev_st = pstr->cur_state; 506 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx, 507 remain_len, &pstr->cur_state); 508 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0)) 509 { 510 /* We treat these cases as a single byte character. */ 511 if (mbclen == 0 || remain_len == 0) 512 wc = L'\0'; 513 else 514 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx); 515 mbclen = 1; 516 pstr->cur_state = prev_st; 517 } 518 else 519 wc = wc2; 520 /* Then proceed the next character. */ 521 rawbuf_idx += mbclen; 522 } 523 *last_wc = wc; 524 return rawbuf_idx; 525 } 526 #endif /* RE_ENABLE_I18N */ 527 528 /* Build the buffer PSTR->MBS, and apply the translation if we need. 529 This function is used in case of REG_ICASE. */ 530 531 static void 532 internal_function 533 build_upper_buffer (re_string_t *pstr) 534 { 535 Idx char_idx, end_idx; 536 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 537 538 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx) 539 { 540 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx]; 541 if (BE (pstr->trans != NULL, 0)) 542 ch = pstr->trans[ch]; 543 if (islower (ch)) 544 pstr->mbs[char_idx] = toupper (ch); 545 else 546 pstr->mbs[char_idx] = ch; 547 } 548 pstr->valid_len = char_idx; 549 pstr->valid_raw_len = char_idx; 550 } 551 552 /* Apply TRANS to the buffer in PSTR. */ 553 554 static void 555 internal_function 556 re_string_translate_buffer (re_string_t *pstr) 557 { 558 Idx buf_idx, end_idx; 559 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 560 561 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx) 562 { 563 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx]; 564 pstr->mbs[buf_idx] = pstr->trans[ch]; 565 } 566 567 pstr->valid_len = buf_idx; 568 pstr->valid_raw_len = buf_idx; 569 } 570 571 /* This function re-construct the buffers. 572 Concretely, convert to wide character in case of pstr->mb_cur_max > 1, 573 convert to upper case in case of REG_ICASE, apply translation. */ 574 575 static reg_errcode_t 576 internal_function __attribute_warn_unused_result__ 577 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags) 578 { 579 Idx offset; 580 581 if (BE (pstr->raw_mbs_idx <= idx, 0)) 582 offset = idx - pstr->raw_mbs_idx; 583 else 584 { 585 /* Reset buffer. */ 586 #ifdef RE_ENABLE_I18N 587 if (pstr->mb_cur_max > 1) 588 memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); 589 #endif /* RE_ENABLE_I18N */ 590 pstr->len = pstr->raw_len; 591 pstr->stop = pstr->raw_stop; 592 pstr->valid_len = 0; 593 pstr->raw_mbs_idx = 0; 594 pstr->valid_raw_len = 0; 595 pstr->offsets_needed = 0; 596 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF 597 : CONTEXT_NEWLINE | CONTEXT_BEGBUF); 598 if (!pstr->mbs_allocated) 599 pstr->mbs = (unsigned char *) pstr->raw_mbs; 600 offset = idx; 601 } 602 603 if (BE (offset != 0, 1)) 604 { 605 /* Should the already checked characters be kept? */ 606 if (BE (offset < pstr->valid_raw_len, 1)) 607 { 608 /* Yes, move them to the front of the buffer. */ 609 #ifdef RE_ENABLE_I18N 610 if (BE (pstr->offsets_needed, 0)) 611 { 612 Idx low = 0, high = pstr->valid_len, mid; 613 do 614 { 615 mid = (high + low) / 2; 616 if (pstr->offsets[mid] > offset) 617 high = mid; 618 else if (pstr->offsets[mid] < offset) 619 low = mid + 1; 620 else 621 break; 622 } 623 while (low < high); 624 if (pstr->offsets[mid] < offset) 625 ++mid; 626 pstr->tip_context = re_string_context_at (pstr, mid - 1, 627 eflags); 628 /* This can be quite complicated, so handle specially 629 only the common and easy case where the character with 630 different length representation of lower and upper 631 case is present at or after offset. */ 632 if (pstr->valid_len > offset 633 && mid == offset && pstr->offsets[mid] == offset) 634 { 635 memmove (pstr->wcs, pstr->wcs + offset, 636 (pstr->valid_len - offset) * sizeof (wint_t)); 637 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset); 638 pstr->valid_len -= offset; 639 pstr->valid_raw_len -= offset; 640 for (low = 0; low < pstr->valid_len; low++) 641 pstr->offsets[low] = pstr->offsets[low + offset] - offset; 642 } 643 else 644 { 645 /* Otherwise, just find out how long the partial multibyte 646 character at offset is and fill it with WEOF/255. */ 647 pstr->len = pstr->raw_len - idx + offset; 648 pstr->stop = pstr->raw_stop - idx + offset; 649 pstr->offsets_needed = 0; 650 while (mid > 0 && pstr->offsets[mid - 1] == offset) 651 --mid; 652 while (mid < pstr->valid_len) 653 if (pstr->wcs[mid] != WEOF) 654 break; 655 else 656 ++mid; 657 if (mid == pstr->valid_len) 658 pstr->valid_len = 0; 659 else 660 { 661 pstr->valid_len = pstr->offsets[mid] - offset; 662 if (pstr->valid_len) 663 { 664 for (low = 0; low < pstr->valid_len; ++low) 665 pstr->wcs[low] = WEOF; 666 memset (pstr->mbs, 255, pstr->valid_len); 667 } 668 } 669 pstr->valid_raw_len = pstr->valid_len; 670 } 671 } 672 else 673 #endif 674 { 675 pstr->tip_context = re_string_context_at (pstr, offset - 1, 676 eflags); 677 #ifdef RE_ENABLE_I18N 678 if (pstr->mb_cur_max > 1) 679 memmove (pstr->wcs, pstr->wcs + offset, 680 (pstr->valid_len - offset) * sizeof (wint_t)); 681 #endif /* RE_ENABLE_I18N */ 682 if (BE (pstr->mbs_allocated, 0)) 683 memmove (pstr->mbs, pstr->mbs + offset, 684 pstr->valid_len - offset); 685 pstr->valid_len -= offset; 686 pstr->valid_raw_len -= offset; 687 #if DEBUG 688 assert (pstr->valid_len > 0); 689 #endif 690 } 691 } 692 else 693 { 694 #ifdef RE_ENABLE_I18N 695 /* No, skip all characters until IDX. */ 696 Idx prev_valid_len = pstr->valid_len; 697 698 if (BE (pstr->offsets_needed, 0)) 699 { 700 pstr->len = pstr->raw_len - idx + offset; 701 pstr->stop = pstr->raw_stop - idx + offset; 702 pstr->offsets_needed = 0; 703 } 704 #endif 705 pstr->valid_len = 0; 706 #ifdef RE_ENABLE_I18N 707 if (pstr->mb_cur_max > 1) 708 { 709 Idx wcs_idx; 710 wint_t wc = WEOF; 711 712 if (pstr->is_utf8) 713 { 714 const unsigned char *raw, *p, *end; 715 716 /* Special case UTF-8. Multi-byte chars start with any 717 byte other than 0x80 - 0xbf. */ 718 raw = pstr->raw_mbs + pstr->raw_mbs_idx; 719 end = raw + (offset - pstr->mb_cur_max); 720 if (end < pstr->raw_mbs) 721 end = pstr->raw_mbs; 722 p = raw + offset - 1; 723 #ifdef _LIBC 724 /* We know the wchar_t encoding is UCS4, so for the simple 725 case, ASCII characters, skip the conversion step. */ 726 if (isascii (*p) && BE (pstr->trans == NULL, 1)) 727 { 728 memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); 729 /* pstr->valid_len = 0; */ 730 wc = (wchar_t) *p; 731 } 732 else 733 #endif 734 for (; p >= end; --p) 735 if ((*p & 0xc0) != 0x80) 736 { 737 mbstate_t cur_state; 738 wchar_t wc2; 739 Idx mlen = raw + pstr->len - p; 740 size_t mbclen; 741 742 #if 0 /* dead code: buf is set but never used */ 743 unsigned char buf[6]; 744 if (BE (pstr->trans != NULL, 0)) 745 { 746 int i = mlen < 6 ? mlen : 6; 747 while (--i >= 0) 748 buf[i] = pstr->trans[p[i]]; 749 } 750 #endif 751 /* XXX Don't use mbrtowc, we know which conversion 752 to use (UTF-8 -> UCS4). */ 753 memset (&cur_state, 0, sizeof (cur_state)); 754 mbclen = __mbrtowc (&wc2, (const char *) p, mlen, 755 &cur_state); 756 if (raw + offset - p <= mbclen 757 && mbclen < (size_t) -2) 758 { 759 memset (&pstr->cur_state, '\0', 760 sizeof (mbstate_t)); 761 pstr->valid_len = mbclen - (raw + offset - p); 762 wc = wc2; 763 } 764 break; 765 } 766 } 767 768 if (wc == WEOF) 769 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx; 770 if (wc == WEOF) 771 pstr->tip_context 772 = re_string_context_at (pstr, prev_valid_len - 1, eflags); 773 else 774 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0) 775 && IS_WIDE_WORD_CHAR (wc)) 776 ? CONTEXT_WORD 777 : ((IS_WIDE_NEWLINE (wc) 778 && pstr->newline_anchor) 779 ? CONTEXT_NEWLINE : 0)); 780 if (BE (pstr->valid_len, 0)) 781 { 782 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx) 783 pstr->wcs[wcs_idx] = WEOF; 784 if (pstr->mbs_allocated) 785 memset (pstr->mbs, 255, pstr->valid_len); 786 } 787 pstr->valid_raw_len = pstr->valid_len; 788 } 789 else 790 #endif /* RE_ENABLE_I18N */ 791 { 792 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1]; 793 pstr->valid_raw_len = 0; 794 if (pstr->trans) 795 c = pstr->trans[c]; 796 pstr->tip_context = (bitset_contain (pstr->word_char, c) 797 ? CONTEXT_WORD 798 : ((IS_NEWLINE (c) && pstr->newline_anchor) 799 ? CONTEXT_NEWLINE : 0)); 800 } 801 } 802 if (!BE (pstr->mbs_allocated, 0)) 803 pstr->mbs += offset; 804 } 805 pstr->raw_mbs_idx = idx; 806 pstr->len -= offset; 807 pstr->stop -= offset; 808 809 /* Then build the buffers. */ 810 #ifdef RE_ENABLE_I18N 811 if (pstr->mb_cur_max > 1) 812 { 813 if (pstr->icase) 814 { 815 reg_errcode_t ret = build_wcs_upper_buffer (pstr); 816 if (BE (ret != REG_NOERROR, 0)) 817 return ret; 818 } 819 else 820 build_wcs_buffer (pstr); 821 } 822 else 823 #endif /* RE_ENABLE_I18N */ 824 if (BE (pstr->mbs_allocated, 0)) 825 { 826 if (pstr->icase) 827 build_upper_buffer (pstr); 828 else if (pstr->trans != NULL) 829 re_string_translate_buffer (pstr); 830 } 831 else 832 pstr->valid_len = pstr->len; 833 834 pstr->cur_idx = 0; 835 return REG_NOERROR; 836 } 837 838 static unsigned char 839 internal_function __attribute ((pure)) 840 re_string_peek_byte_case (const re_string_t *pstr, Idx idx) 841 { 842 int ch; 843 Idx off; 844 845 /* Handle the common (easiest) cases first. */ 846 if (BE (!pstr->mbs_allocated, 1)) 847 return re_string_peek_byte (pstr, idx); 848 849 #ifdef RE_ENABLE_I18N 850 if (pstr->mb_cur_max > 1 851 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx)) 852 return re_string_peek_byte (pstr, idx); 853 #endif 854 855 off = pstr->cur_idx + idx; 856 #ifdef RE_ENABLE_I18N 857 if (pstr->offsets_needed) 858 off = pstr->offsets[off]; 859 #endif 860 861 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off]; 862 863 #ifdef RE_ENABLE_I18N 864 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I 865 this function returns CAPITAL LETTER I instead of first byte of 866 DOTLESS SMALL LETTER I. The latter would confuse the parser, 867 since peek_byte_case doesn't advance cur_idx in any way. */ 868 if (pstr->offsets_needed && !isascii (ch)) 869 return re_string_peek_byte (pstr, idx); 870 #endif 871 872 return ch; 873 } 874 875 static unsigned char 876 internal_function __attribute ((pure)) 877 re_string_fetch_byte_case (re_string_t *pstr) 878 { 879 if (BE (!pstr->mbs_allocated, 1)) 880 return re_string_fetch_byte (pstr); 881 882 #ifdef RE_ENABLE_I18N 883 if (pstr->offsets_needed) 884 { 885 Idx off; 886 int ch; 887 888 /* For tr_TR.UTF-8 [[:islower:]] there is 889 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip 890 in that case the whole multi-byte character and return 891 the original letter. On the other side, with 892 [[: DOTLESS SMALL LETTER I return [[:I, as doing 893 anything else would complicate things too much. */ 894 895 if (!re_string_first_byte (pstr, pstr->cur_idx)) 896 return re_string_fetch_byte (pstr); 897 898 off = pstr->offsets[pstr->cur_idx]; 899 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off]; 900 901 if (! isascii (ch)) 902 return re_string_fetch_byte (pstr); 903 904 re_string_skip_bytes (pstr, 905 re_string_char_size_at (pstr, pstr->cur_idx)); 906 return ch; 907 } 908 #endif 909 910 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++]; 911 } 912 913 static void 914 internal_function 915 re_string_destruct (re_string_t *pstr) 916 { 917 #ifdef RE_ENABLE_I18N 918 re_free (pstr->wcs); 919 re_free (pstr->offsets); 920 #endif /* RE_ENABLE_I18N */ 921 if (pstr->mbs_allocated) 922 re_free (pstr->mbs); 923 } 924 925 /* Return the context at IDX in INPUT. */ 926 927 static unsigned int 928 internal_function 929 re_string_context_at (const re_string_t *input, Idx idx, int eflags) 930 { 931 int c; 932 if (BE (! REG_VALID_INDEX (idx), 0)) 933 /* In this case, we use the value stored in input->tip_context, 934 since we can't know the character in input->mbs[-1] here. */ 935 return input->tip_context; 936 if (BE (idx == input->len, 0)) 937 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF 938 : CONTEXT_NEWLINE | CONTEXT_ENDBUF); 939 #ifdef RE_ENABLE_I18N 940 if (input->mb_cur_max > 1) 941 { 942 wint_t wc; 943 Idx wc_idx = idx; 944 while(input->wcs[wc_idx] == WEOF) 945 { 946 #ifdef DEBUG 947 /* It must not happen. */ 948 assert (REG_VALID_INDEX (wc_idx)); 949 #endif 950 --wc_idx; 951 if (! REG_VALID_INDEX (wc_idx)) 952 return input->tip_context; 953 } 954 wc = input->wcs[wc_idx]; 955 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc)) 956 return CONTEXT_WORD; 957 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor 958 ? CONTEXT_NEWLINE : 0); 959 } 960 else 961 #endif 962 { 963 c = re_string_byte_at (input, idx); 964 if (bitset_contain (input->word_char, c)) 965 return CONTEXT_WORD; 966 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0; 967 } 968 } 969 970 /* Functions for set operation. */ 971 972 static reg_errcode_t 973 internal_function __attribute_warn_unused_result__ 974 re_node_set_alloc (re_node_set *set, Idx size) 975 { 976 set->alloc = size; 977 set->nelem = 0; 978 set->elems = re_malloc (Idx, size); 979 if (BE (set->elems == NULL, 0)) 980 return REG_ESPACE; 981 return REG_NOERROR; 982 } 983 984 static reg_errcode_t 985 internal_function __attribute_warn_unused_result__ 986 re_node_set_init_1 (re_node_set *set, Idx elem) 987 { 988 set->alloc = 1; 989 set->nelem = 1; 990 set->elems = re_malloc (Idx, 1); 991 if (BE (set->elems == NULL, 0)) 992 { 993 set->alloc = set->nelem = 0; 994 return REG_ESPACE; 995 } 996 set->elems[0] = elem; 997 return REG_NOERROR; 998 } 999 1000 static reg_errcode_t 1001 internal_function __attribute_warn_unused_result__ 1002 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2) 1003 { 1004 set->alloc = 2; 1005 set->elems = re_malloc (Idx, 2); 1006 if (BE (set->elems == NULL, 0)) 1007 return REG_ESPACE; 1008 if (elem1 == elem2) 1009 { 1010 set->nelem = 1; 1011 set->elems[0] = elem1; 1012 } 1013 else 1014 { 1015 set->nelem = 2; 1016 if (elem1 < elem2) 1017 { 1018 set->elems[0] = elem1; 1019 set->elems[1] = elem2; 1020 } 1021 else 1022 { 1023 set->elems[0] = elem2; 1024 set->elems[1] = elem1; 1025 } 1026 } 1027 return REG_NOERROR; 1028 } 1029 1030 static reg_errcode_t 1031 internal_function __attribute_warn_unused_result__ 1032 re_node_set_init_copy (re_node_set *dest, const re_node_set *src) 1033 { 1034 dest->nelem = src->nelem; 1035 if (src->nelem > 0) 1036 { 1037 dest->alloc = dest->nelem; 1038 dest->elems = re_malloc (Idx, dest->alloc); 1039 if (BE (dest->elems == NULL, 0)) 1040 { 1041 dest->alloc = dest->nelem = 0; 1042 return REG_ESPACE; 1043 } 1044 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx)); 1045 } 1046 else 1047 re_node_set_init_empty (dest); 1048 return REG_NOERROR; 1049 } 1050 1051 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to 1052 DEST. Return value indicate the error code or REG_NOERROR if succeeded. 1053 Note: We assume dest->elems is NULL, when dest->alloc is 0. */ 1054 1055 static reg_errcode_t 1056 internal_function __attribute_warn_unused_result__ 1057 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1, 1058 const re_node_set *src2) 1059 { 1060 Idx i1, i2, is, id, delta, sbase; 1061 if (src1->nelem == 0 || src2->nelem == 0) 1062 return REG_NOERROR; 1063 1064 /* We need dest->nelem + 2 * elems_in_intersection; this is a 1065 conservative estimate. */ 1066 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc) 1067 { 1068 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc; 1069 Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc); 1070 if (BE (new_elems == NULL, 0)) 1071 return REG_ESPACE; 1072 dest->elems = new_elems; 1073 dest->alloc = new_alloc; 1074 } 1075 1076 /* Find the items in the intersection of SRC1 and SRC2, and copy 1077 into the top of DEST those that are not already in DEST itself. */ 1078 sbase = dest->nelem + src1->nelem + src2->nelem; 1079 i1 = src1->nelem - 1; 1080 i2 = src2->nelem - 1; 1081 id = dest->nelem - 1; 1082 for (;;) 1083 { 1084 if (src1->elems[i1] == src2->elems[i2]) 1085 { 1086 /* Try to find the item in DEST. Maybe we could binary search? */ 1087 while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1]) 1088 --id; 1089 1090 if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1]) 1091 dest->elems[--sbase] = src1->elems[i1]; 1092 1093 if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2)) 1094 break; 1095 } 1096 1097 /* Lower the highest of the two items. */ 1098 else if (src1->elems[i1] < src2->elems[i2]) 1099 { 1100 if (! REG_VALID_INDEX (--i2)) 1101 break; 1102 } 1103 else 1104 { 1105 if (! REG_VALID_INDEX (--i1)) 1106 break; 1107 } 1108 } 1109 1110 id = dest->nelem - 1; 1111 is = dest->nelem + src1->nelem + src2->nelem - 1; 1112 delta = is - sbase + 1; 1113 1114 /* Now copy. When DELTA becomes zero, the remaining 1115 DEST elements are already in place; this is more or 1116 less the same loop that is in re_node_set_merge. */ 1117 dest->nelem += delta; 1118 if (delta > 0 && REG_VALID_INDEX (id)) 1119 for (;;) 1120 { 1121 if (dest->elems[is] > dest->elems[id]) 1122 { 1123 /* Copy from the top. */ 1124 dest->elems[id + delta--] = dest->elems[is--]; 1125 if (delta == 0) 1126 break; 1127 } 1128 else 1129 { 1130 /* Slide from the bottom. */ 1131 dest->elems[id + delta] = dest->elems[id]; 1132 if (! REG_VALID_INDEX (--id)) 1133 break; 1134 } 1135 } 1136 1137 /* Copy remaining SRC elements. */ 1138 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx)); 1139 1140 return REG_NOERROR; 1141 } 1142 1143 /* Calculate the union set of the sets SRC1 and SRC2. And store it to 1144 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */ 1145 1146 static reg_errcode_t 1147 internal_function __attribute_warn_unused_result__ 1148 re_node_set_init_union (re_node_set *dest, const re_node_set *src1, 1149 const re_node_set *src2) 1150 { 1151 Idx i1, i2, id; 1152 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0) 1153 { 1154 dest->alloc = src1->nelem + src2->nelem; 1155 dest->elems = re_malloc (Idx, dest->alloc); 1156 if (BE (dest->elems == NULL, 0)) 1157 return REG_ESPACE; 1158 } 1159 else 1160 { 1161 if (src1 != NULL && src1->nelem > 0) 1162 return re_node_set_init_copy (dest, src1); 1163 else if (src2 != NULL && src2->nelem > 0) 1164 return re_node_set_init_copy (dest, src2); 1165 else 1166 re_node_set_init_empty (dest); 1167 return REG_NOERROR; 1168 } 1169 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;) 1170 { 1171 if (src1->elems[i1] > src2->elems[i2]) 1172 { 1173 dest->elems[id++] = src2->elems[i2++]; 1174 continue; 1175 } 1176 if (src1->elems[i1] == src2->elems[i2]) 1177 ++i2; 1178 dest->elems[id++] = src1->elems[i1++]; 1179 } 1180 if (i1 < src1->nelem) 1181 { 1182 memcpy (dest->elems + id, src1->elems + i1, 1183 (src1->nelem - i1) * sizeof (Idx)); 1184 id += src1->nelem - i1; 1185 } 1186 else if (i2 < src2->nelem) 1187 { 1188 memcpy (dest->elems + id, src2->elems + i2, 1189 (src2->nelem - i2) * sizeof (Idx)); 1190 id += src2->nelem - i2; 1191 } 1192 dest->nelem = id; 1193 return REG_NOERROR; 1194 } 1195 1196 /* Calculate the union set of the sets DEST and SRC. And store it to 1197 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */ 1198 1199 static reg_errcode_t 1200 internal_function __attribute_warn_unused_result__ 1201 re_node_set_merge (re_node_set *dest, const re_node_set *src) 1202 { 1203 Idx is, id, sbase, delta; 1204 if (src == NULL || src->nelem == 0) 1205 return REG_NOERROR; 1206 if (dest->alloc < 2 * src->nelem + dest->nelem) 1207 { 1208 Idx new_alloc = 2 * (src->nelem + dest->alloc); 1209 Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc); 1210 if (BE (new_buffer == NULL, 0)) 1211 return REG_ESPACE; 1212 dest->elems = new_buffer; 1213 dest->alloc = new_alloc; 1214 } 1215 1216 if (BE (dest->nelem == 0, 0)) 1217 { 1218 dest->nelem = src->nelem; 1219 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx)); 1220 return REG_NOERROR; 1221 } 1222 1223 /* Copy into the top of DEST the items of SRC that are not 1224 found in DEST. Maybe we could binary search in DEST? */ 1225 for (sbase = dest->nelem + 2 * src->nelem, 1226 is = src->nelem - 1, id = dest->nelem - 1; 1227 REG_VALID_INDEX (is) && REG_VALID_INDEX (id); ) 1228 { 1229 if (dest->elems[id] == src->elems[is]) 1230 is--, id--; 1231 else if (dest->elems[id] < src->elems[is]) 1232 dest->elems[--sbase] = src->elems[is--]; 1233 else /* if (dest->elems[id] > src->elems[is]) */ 1234 --id; 1235 } 1236 1237 if (REG_VALID_INDEX (is)) 1238 { 1239 /* If DEST is exhausted, the remaining items of SRC must be unique. */ 1240 sbase -= is + 1; 1241 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx)); 1242 } 1243 1244 id = dest->nelem - 1; 1245 is = dest->nelem + 2 * src->nelem - 1; 1246 delta = is - sbase + 1; 1247 if (delta == 0) 1248 return REG_NOERROR; 1249 1250 /* Now copy. When DELTA becomes zero, the remaining 1251 DEST elements are already in place. */ 1252 dest->nelem += delta; 1253 for (;;) 1254 { 1255 if (dest->elems[is] > dest->elems[id]) 1256 { 1257 /* Copy from the top. */ 1258 dest->elems[id + delta--] = dest->elems[is--]; 1259 if (delta == 0) 1260 break; 1261 } 1262 else 1263 { 1264 /* Slide from the bottom. */ 1265 dest->elems[id + delta] = dest->elems[id]; 1266 if (! REG_VALID_INDEX (--id)) 1267 { 1268 /* Copy remaining SRC elements. */ 1269 memcpy (dest->elems, dest->elems + sbase, 1270 delta * sizeof (Idx)); 1271 break; 1272 } 1273 } 1274 } 1275 1276 return REG_NOERROR; 1277 } 1278 1279 /* Insert the new element ELEM to the re_node_set* SET. 1280 SET should not already have ELEM. 1281 Return true if successful. */ 1282 1283 static bool 1284 internal_function __attribute_warn_unused_result__ 1285 re_node_set_insert (re_node_set *set, Idx elem) 1286 { 1287 Idx idx; 1288 /* In case the set is empty. */ 1289 if (set->alloc == 0) 1290 return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1); 1291 1292 if (BE (set->nelem, 0) == 0) 1293 { 1294 /* We already guaranteed above that set->alloc != 0. */ 1295 set->elems[0] = elem; 1296 ++set->nelem; 1297 return true; 1298 } 1299 1300 /* Realloc if we need. */ 1301 if (set->alloc == set->nelem) 1302 { 1303 Idx *new_elems; 1304 set->alloc = set->alloc * 2; 1305 new_elems = re_realloc (set->elems, Idx, set->alloc); 1306 if (BE (new_elems == NULL, 0)) 1307 return false; 1308 set->elems = new_elems; 1309 } 1310 1311 /* Move the elements which follows the new element. Test the 1312 first element separately to skip a check in the inner loop. */ 1313 if (elem < set->elems[0]) 1314 { 1315 idx = 0; 1316 for (idx = set->nelem; idx > 0; idx--) 1317 set->elems[idx] = set->elems[idx - 1]; 1318 } 1319 else 1320 { 1321 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--) 1322 set->elems[idx] = set->elems[idx - 1]; 1323 } 1324 1325 /* Insert the new element. */ 1326 set->elems[idx] = elem; 1327 ++set->nelem; 1328 return true; 1329 } 1330 1331 /* Insert the new element ELEM to the re_node_set* SET. 1332 SET should not already have any element greater than or equal to ELEM. 1333 Return true if successful. */ 1334 1335 static bool 1336 internal_function __attribute_warn_unused_result__ 1337 re_node_set_insert_last (re_node_set *set, Idx elem) 1338 { 1339 /* Realloc if we need. */ 1340 if (set->alloc == set->nelem) 1341 { 1342 Idx *new_elems; 1343 set->alloc = (set->alloc + 1) * 2; 1344 new_elems = re_realloc (set->elems, Idx, set->alloc); 1345 if (BE (new_elems == NULL, 0)) 1346 return false; 1347 set->elems = new_elems; 1348 } 1349 1350 /* Insert the new element. */ 1351 set->elems[set->nelem++] = elem; 1352 return true; 1353 } 1354 1355 /* Compare two node sets SET1 and SET2. 1356 Return true if SET1 and SET2 are equivalent. */ 1357 1358 static bool 1359 internal_function __attribute ((pure)) 1360 re_node_set_compare (const re_node_set *set1, const re_node_set *set2) 1361 { 1362 Idx i; 1363 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem) 1364 return false; 1365 for (i = set1->nelem ; REG_VALID_INDEX (--i) ; ) 1366 if (set1->elems[i] != set2->elems[i]) 1367 return false; 1368 return true; 1369 } 1370 1371 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */ 1372 1373 static Idx 1374 internal_function __attribute ((pure)) 1375 re_node_set_contains (const re_node_set *set, Idx elem) 1376 { 1377 __re_size_t idx, right, mid; 1378 if (! REG_VALID_NONZERO_INDEX (set->nelem)) 1379 return 0; 1380 1381 /* Binary search the element. */ 1382 idx = 0; 1383 right = set->nelem - 1; 1384 while (idx < right) 1385 { 1386 mid = (idx + right) / 2; 1387 if (set->elems[mid] < elem) 1388 idx = mid + 1; 1389 else 1390 right = mid; 1391 } 1392 return set->elems[idx] == elem ? idx + 1 : 0; 1393 } 1394 1395 static void 1396 internal_function 1397 re_node_set_remove_at (re_node_set *set, Idx idx) 1398 { 1399 verify (! TYPE_SIGNED (Idx)); 1400 /* if (idx < 0) 1401 return; */ 1402 if (idx >= set->nelem) 1403 return; 1404 --set->nelem; 1405 for (; idx < set->nelem; idx++) 1406 set->elems[idx] = set->elems[idx + 1]; 1407 } 1408 1409 1410 /* Add the token TOKEN to dfa->nodes, and return the index of the token. 1411 Or return REG_MISSING if an error occurred. */ 1412 1413 static Idx 1414 internal_function 1415 re_dfa_add_node (re_dfa_t *dfa, re_token_t token) 1416 { 1417 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0)) 1418 { 1419 size_t new_nodes_alloc = dfa->nodes_alloc * 2; 1420 Idx *new_nexts, *new_indices; 1421 re_node_set *new_edests, *new_eclosures; 1422 re_token_t *new_nodes; 1423 size_t max_object_size = 1424 MAX (sizeof (re_token_t), 1425 MAX (sizeof (re_node_set), 1426 sizeof (Idx))); 1427 1428 /* Avoid overflows. */ 1429 if (BE (SIZE_MAX / 2 / max_object_size < dfa->nodes_alloc, 0)) 1430 return REG_MISSING; 1431 1432 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc); 1433 if (BE (new_nodes == NULL, 0)) 1434 return REG_MISSING; 1435 dfa->nodes = new_nodes; 1436 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc); 1437 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc); 1438 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc); 1439 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc); 1440 if (BE (new_nexts == NULL || new_indices == NULL 1441 || new_edests == NULL || new_eclosures == NULL, 0)) 1442 return REG_MISSING; 1443 dfa->nexts = new_nexts; 1444 dfa->org_indices = new_indices; 1445 dfa->edests = new_edests; 1446 dfa->eclosures = new_eclosures; 1447 dfa->nodes_alloc = new_nodes_alloc; 1448 } 1449 dfa->nodes[dfa->nodes_len] = token; 1450 dfa->nodes[dfa->nodes_len].constraint = 0; 1451 #ifdef RE_ENABLE_I18N 1452 { 1453 int type = token.type; 1454 dfa->nodes[dfa->nodes_len].accept_mb = 1455 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET; 1456 } 1457 #endif 1458 dfa->nexts[dfa->nodes_len] = REG_MISSING; 1459 re_node_set_init_empty (dfa->edests + dfa->nodes_len); 1460 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len); 1461 return dfa->nodes_len++; 1462 } 1463 1464 static inline re_hashval_t 1465 internal_function 1466 calc_state_hash (const re_node_set *nodes, unsigned int context) 1467 { 1468 re_hashval_t hash = nodes->nelem + context; 1469 Idx i; 1470 for (i = 0 ; i < nodes->nelem ; i++) 1471 hash += nodes->elems[i]; 1472 return hash; 1473 } 1474 1475 /* Search for the state whose node_set is equivalent to NODES. 1476 Return the pointer to the state, if we found it in the DFA. 1477 Otherwise create the new one and return it. In case of an error 1478 return NULL and set the error code in ERR. 1479 Note: - We assume NULL as the invalid state, then it is possible that 1480 return value is NULL and ERR is REG_NOERROR. 1481 - We never return non-NULL value in case of any errors, it is for 1482 optimization. */ 1483 1484 static re_dfastate_t * 1485 internal_function __attribute_warn_unused_result__ 1486 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa, 1487 const re_node_set *nodes) 1488 { 1489 re_hashval_t hash; 1490 re_dfastate_t *new_state; 1491 struct re_state_table_entry *spot; 1492 Idx i; 1493 #ifdef lint 1494 /* Suppress bogus uninitialized-variable warnings. */ 1495 *err = REG_NOERROR; 1496 #endif 1497 if (BE (nodes->nelem == 0, 0)) 1498 { 1499 *err = REG_NOERROR; 1500 return NULL; 1501 } 1502 hash = calc_state_hash (nodes, 0); 1503 spot = dfa->state_table + (hash & dfa->state_hash_mask); 1504 1505 for (i = 0 ; i < spot->num ; i++) 1506 { 1507 re_dfastate_t *state = spot->array[i]; 1508 if (hash != state->hash) 1509 continue; 1510 if (re_node_set_compare (&state->nodes, nodes)) 1511 return state; 1512 } 1513 1514 /* There are no appropriate state in the dfa, create the new one. */ 1515 new_state = create_ci_newstate (dfa, nodes, hash); 1516 if (BE (new_state == NULL, 0)) 1517 *err = REG_ESPACE; 1518 1519 return new_state; 1520 } 1521 1522 /* Search for the state whose node_set is equivalent to NODES and 1523 whose context is equivalent to CONTEXT. 1524 Return the pointer to the state, if we found it in the DFA. 1525 Otherwise create the new one and return it. In case of an error 1526 return NULL and set the error code in ERR. 1527 Note: - We assume NULL as the invalid state, then it is possible that 1528 return value is NULL and ERR is REG_NOERROR. 1529 - We never return non-NULL value in case of any errors, it is for 1530 optimization. */ 1531 1532 static re_dfastate_t * 1533 internal_function __attribute_warn_unused_result__ 1534 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa, 1535 const re_node_set *nodes, unsigned int context) 1536 { 1537 re_hashval_t hash; 1538 re_dfastate_t *new_state; 1539 struct re_state_table_entry *spot; 1540 Idx i; 1541 #ifdef lint 1542 /* Suppress bogus uninitialized-variable warnings. */ 1543 *err = REG_NOERROR; 1544 #endif 1545 if (nodes->nelem == 0) 1546 { 1547 *err = REG_NOERROR; 1548 return NULL; 1549 } 1550 hash = calc_state_hash (nodes, context); 1551 spot = dfa->state_table + (hash & dfa->state_hash_mask); 1552 1553 for (i = 0 ; i < spot->num ; i++) 1554 { 1555 re_dfastate_t *state = spot->array[i]; 1556 if (state->hash == hash 1557 && state->context == context 1558 && re_node_set_compare (state->entrance_nodes, nodes)) 1559 return state; 1560 } 1561 /* There are no appropriate state in `dfa', create the new one. */ 1562 new_state = create_cd_newstate (dfa, nodes, context, hash); 1563 if (BE (new_state == NULL, 0)) 1564 *err = REG_ESPACE; 1565 1566 return new_state; 1567 } 1568 1569 /* Finish initialization of the new state NEWSTATE, and using its hash value 1570 HASH put in the appropriate bucket of DFA's state table. Return value 1571 indicates the error code if failed. */ 1572 1573 static reg_errcode_t 1574 __attribute_warn_unused_result__ 1575 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate, 1576 re_hashval_t hash) 1577 { 1578 struct re_state_table_entry *spot; 1579 reg_errcode_t err; 1580 Idx i; 1581 1582 newstate->hash = hash; 1583 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem); 1584 if (BE (err != REG_NOERROR, 0)) 1585 return REG_ESPACE; 1586 for (i = 0; i < newstate->nodes.nelem; i++) 1587 { 1588 Idx elem = newstate->nodes.elems[i]; 1589 if (!IS_EPSILON_NODE (dfa->nodes[elem].type)) 1590 if (BE (! re_node_set_insert_last (&newstate->non_eps_nodes, elem), 0)) 1591 return REG_ESPACE; 1592 } 1593 1594 spot = dfa->state_table + (hash & dfa->state_hash_mask); 1595 if (BE (spot->alloc <= spot->num, 0)) 1596 { 1597 Idx new_alloc = 2 * spot->num + 2; 1598 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *, 1599 new_alloc); 1600 if (BE (new_array == NULL, 0)) 1601 return REG_ESPACE; 1602 spot->array = new_array; 1603 spot->alloc = new_alloc; 1604 } 1605 spot->array[spot->num++] = newstate; 1606 return REG_NOERROR; 1607 } 1608 1609 static void 1610 free_state (re_dfastate_t *state) 1611 { 1612 re_node_set_free (&state->non_eps_nodes); 1613 re_node_set_free (&state->inveclosure); 1614 if (state->entrance_nodes != &state->nodes) 1615 { 1616 re_node_set_free (state->entrance_nodes); 1617 re_free (state->entrance_nodes); 1618 } 1619 re_node_set_free (&state->nodes); 1620 re_free (state->word_trtable); 1621 re_free (state->trtable); 1622 re_free (state); 1623 } 1624 1625 /* Create the new state which is independ of contexts. 1626 Return the new state if succeeded, otherwise return NULL. */ 1627 1628 static re_dfastate_t * 1629 internal_function __attribute_warn_unused_result__ 1630 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes, 1631 re_hashval_t hash) 1632 { 1633 Idx i; 1634 reg_errcode_t err; 1635 re_dfastate_t *newstate; 1636 1637 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1); 1638 if (BE (newstate == NULL, 0)) 1639 return NULL; 1640 err = re_node_set_init_copy (&newstate->nodes, nodes); 1641 if (BE (err != REG_NOERROR, 0)) 1642 { 1643 re_free (newstate); 1644 return NULL; 1645 } 1646 1647 newstate->entrance_nodes = &newstate->nodes; 1648 for (i = 0 ; i < nodes->nelem ; i++) 1649 { 1650 re_token_t *node = dfa->nodes + nodes->elems[i]; 1651 re_token_type_t type = node->type; 1652 if (type == CHARACTER && !node->constraint) 1653 continue; 1654 #ifdef RE_ENABLE_I18N 1655 newstate->accept_mb |= node->accept_mb; 1656 #endif /* RE_ENABLE_I18N */ 1657 1658 /* If the state has the halt node, the state is a halt state. */ 1659 if (type == END_OF_RE) 1660 newstate->halt = 1; 1661 else if (type == OP_BACK_REF) 1662 newstate->has_backref = 1; 1663 else if (type == ANCHOR || node->constraint) 1664 newstate->has_constraint = 1; 1665 } 1666 err = register_state (dfa, newstate, hash); 1667 if (BE (err != REG_NOERROR, 0)) 1668 { 1669 free_state (newstate); 1670 newstate = NULL; 1671 } 1672 return newstate; 1673 } 1674 1675 /* Create the new state which is depend on the context CONTEXT. 1676 Return the new state if succeeded, otherwise return NULL. */ 1677 1678 static re_dfastate_t * 1679 internal_function __attribute_warn_unused_result__ 1680 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes, 1681 unsigned int context, re_hashval_t hash) 1682 { 1683 Idx i, nctx_nodes = 0; 1684 reg_errcode_t err; 1685 re_dfastate_t *newstate; 1686 1687 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1); 1688 if (BE (newstate == NULL, 0)) 1689 return NULL; 1690 err = re_node_set_init_copy (&newstate->nodes, nodes); 1691 if (BE (err != REG_NOERROR, 0)) 1692 { 1693 re_free (newstate); 1694 return NULL; 1695 } 1696 1697 newstate->context = context; 1698 newstate->entrance_nodes = &newstate->nodes; 1699 1700 for (i = 0 ; i < nodes->nelem ; i++) 1701 { 1702 re_token_t *node = dfa->nodes + nodes->elems[i]; 1703 re_token_type_t type = node->type; 1704 unsigned int constraint = node->constraint; 1705 1706 if (type == CHARACTER && !constraint) 1707 continue; 1708 #ifdef RE_ENABLE_I18N 1709 newstate->accept_mb |= node->accept_mb; 1710 #endif /* RE_ENABLE_I18N */ 1711 1712 /* If the state has the halt node, the state is a halt state. */ 1713 if (type == END_OF_RE) 1714 newstate->halt = 1; 1715 else if (type == OP_BACK_REF) 1716 newstate->has_backref = 1; 1717 1718 if (constraint) 1719 { 1720 if (newstate->entrance_nodes == &newstate->nodes) 1721 { 1722 newstate->entrance_nodes = re_malloc (re_node_set, 1); 1723 if (BE (newstate->entrance_nodes == NULL, 0)) 1724 { 1725 free_state (newstate); 1726 return NULL; 1727 } 1728 if (re_node_set_init_copy (newstate->entrance_nodes, nodes) 1729 != REG_NOERROR) 1730 return NULL; 1731 nctx_nodes = 0; 1732 newstate->has_constraint = 1; 1733 } 1734 1735 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context)) 1736 { 1737 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes); 1738 ++nctx_nodes; 1739 } 1740 } 1741 } 1742 err = register_state (dfa, newstate, hash); 1743 if (BE (err != REG_NOERROR, 0)) 1744 { 1745 free_state (newstate); 1746 newstate = NULL; 1747 } 1748 return newstate; 1749 } 1750