1 /* -*- buffer-read-only: t -*- vi: set ro: */ 2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */ 3 /* Extended regular expression matching and search library. 4 Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free 5 Software Foundation, Inc. 6 This file is part of the GNU C Library. 7 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 8 9 This program is free software; you can redistribute it and/or modify 10 it under the terms of the GNU General Public License as published by 11 the Free Software Foundation; either version 3, or (at your option) 12 any later version. 13 14 This program is distributed in the hope that it will be useful, 15 but WITHOUT ANY WARRANTY; without even the implied warranty of 16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 GNU General Public License for more details. 18 19 You should have received a copy of the GNU General Public License along 20 with this program; if not, write to the Free Software Foundation, 21 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 22 23 #include "verify.h" 24 #include "intprops.h" 25 static void re_string_construct_common (const char *str, Idx len, 26 re_string_t *pstr, 27 RE_TRANSLATE_TYPE trans, bool icase, 28 const re_dfa_t *dfa) internal_function; 29 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa, 30 const re_node_set *nodes, 31 re_hashval_t hash) internal_function; 32 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa, 33 const re_node_set *nodes, 34 unsigned int context, 35 re_hashval_t hash) internal_function; 36 37 /* Functions for string operation. */ 38 39 /* This function allocate the buffers. It is necessary to call 40 re_string_reconstruct before using the object. */ 41 42 static reg_errcode_t 43 internal_function __attribute_warn_unused_result__ 44 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len, 45 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa) 46 { 47 reg_errcode_t ret; 48 Idx init_buf_len; 49 50 /* Ensure at least one character fits into the buffers. */ 51 if (init_len < dfa->mb_cur_max) 52 init_len = dfa->mb_cur_max; 53 init_buf_len = (len + 1 < init_len) ? len + 1: init_len; 54 re_string_construct_common (str, len, pstr, trans, icase, dfa); 55 56 ret = re_string_realloc_buffers (pstr, init_buf_len); 57 if (BE (ret != REG_NOERROR, 0)) 58 return ret; 59 60 pstr->word_char = dfa->word_char; 61 pstr->word_ops_used = dfa->word_ops_used; 62 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str; 63 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len; 64 pstr->valid_raw_len = pstr->valid_len; 65 return REG_NOERROR; 66 } 67 68 /* This function allocate the buffers, and initialize them. */ 69 70 static reg_errcode_t 71 internal_function __attribute_warn_unused_result__ 72 re_string_construct (re_string_t *pstr, const char *str, Idx len, 73 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa) 74 { 75 reg_errcode_t ret; 76 memset (pstr, '\0', sizeof (re_string_t)); 77 re_string_construct_common (str, len, pstr, trans, icase, dfa); 78 79 if (len > 0) 80 { 81 ret = re_string_realloc_buffers (pstr, len + 1); 82 if (BE (ret != REG_NOERROR, 0)) 83 return ret; 84 } 85 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str; 86 87 if (icase) 88 { 89 #ifdef RE_ENABLE_I18N 90 if (dfa->mb_cur_max > 1) 91 { 92 while (1) 93 { 94 ret = build_wcs_upper_buffer (pstr); 95 if (BE (ret != REG_NOERROR, 0)) 96 return ret; 97 if (pstr->valid_raw_len >= len) 98 break; 99 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max) 100 break; 101 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2); 102 if (BE (ret != REG_NOERROR, 0)) 103 return ret; 104 } 105 } 106 else 107 #endif /* RE_ENABLE_I18N */ 108 build_upper_buffer (pstr); 109 } 110 else 111 { 112 #ifdef RE_ENABLE_I18N 113 if (dfa->mb_cur_max > 1) 114 build_wcs_buffer (pstr); 115 else 116 #endif /* RE_ENABLE_I18N */ 117 { 118 if (trans != NULL) 119 re_string_translate_buffer (pstr); 120 else 121 { 122 pstr->valid_len = pstr->bufs_len; 123 pstr->valid_raw_len = pstr->bufs_len; 124 } 125 } 126 } 127 128 return REG_NOERROR; 129 } 130 131 /* Helper functions for re_string_allocate, and re_string_construct. */ 132 133 static reg_errcode_t 134 internal_function __attribute_warn_unused_result__ 135 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len) 136 { 137 #ifdef RE_ENABLE_I18N 138 if (pstr->mb_cur_max > 1) 139 { 140 wint_t *new_wcs; 141 142 /* Avoid overflow. */ 143 size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx)); 144 if (BE (SIZE_MAX / max_object_size < new_buf_len, 0)) 145 return REG_ESPACE; 146 147 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len); 148 if (BE (new_wcs == NULL, 0)) 149 return REG_ESPACE; 150 pstr->wcs = new_wcs; 151 if (pstr->offsets != NULL) 152 { 153 Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len); 154 if (BE (new_offsets == NULL, 0)) 155 return REG_ESPACE; 156 pstr->offsets = new_offsets; 157 } 158 } 159 #endif /* RE_ENABLE_I18N */ 160 if (pstr->mbs_allocated) 161 { 162 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char, 163 new_buf_len); 164 if (BE (new_mbs == NULL, 0)) 165 return REG_ESPACE; 166 pstr->mbs = new_mbs; 167 } 168 pstr->bufs_len = new_buf_len; 169 return REG_NOERROR; 170 } 171 172 173 static void 174 internal_function 175 re_string_construct_common (const char *str, Idx len, re_string_t *pstr, 176 RE_TRANSLATE_TYPE trans, bool icase, 177 const re_dfa_t *dfa) 178 { 179 pstr->raw_mbs = (const unsigned char *) str; 180 pstr->len = len; 181 pstr->raw_len = len; 182 pstr->trans = trans; 183 pstr->icase = icase; 184 pstr->mbs_allocated = (trans != NULL || icase); 185 pstr->mb_cur_max = dfa->mb_cur_max; 186 pstr->is_utf8 = dfa->is_utf8; 187 pstr->map_notascii = dfa->map_notascii; 188 pstr->stop = pstr->len; 189 pstr->raw_stop = pstr->stop; 190 } 191 192 #ifdef RE_ENABLE_I18N 193 194 /* Build wide character buffer PSTR->WCS. 195 If the byte sequence of the string are: 196 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3> 197 Then wide character buffer will be: 198 <wc1> , WEOF , <wc2> , WEOF , <wc3> 199 We use WEOF for padding, they indicate that the position isn't 200 a first byte of a multibyte character. 201 202 Note that this function assumes PSTR->VALID_LEN elements are already 203 built and starts from PSTR->VALID_LEN. */ 204 205 static void 206 internal_function 207 build_wcs_buffer (re_string_t *pstr) 208 { 209 #ifdef _LIBC 210 unsigned char buf[MB_LEN_MAX]; 211 assert (MB_LEN_MAX >= pstr->mb_cur_max); 212 #else 213 unsigned char buf[64]; 214 #endif 215 mbstate_t prev_st; 216 Idx byte_idx, end_idx, remain_len; 217 size_t mbclen; 218 219 /* Build the buffers from pstr->valid_len to either pstr->len or 220 pstr->bufs_len. */ 221 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 222 for (byte_idx = pstr->valid_len; byte_idx < end_idx;) 223 { 224 wchar_t wc; 225 const char *p; 226 227 remain_len = end_idx - byte_idx; 228 prev_st = pstr->cur_state; 229 /* Apply the translation if we need. */ 230 if (BE (pstr->trans != NULL, 0)) 231 { 232 int i, ch; 233 234 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i) 235 { 236 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i]; 237 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch]; 238 } 239 p = (const char *) buf; 240 } 241 else 242 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx; 243 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state); 244 if (BE (mbclen == (size_t) -2, 0)) 245 { 246 /* The buffer doesn't have enough space, finish to build. */ 247 pstr->cur_state = prev_st; 248 break; 249 } 250 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0)) 251 { 252 /* We treat these cases as a singlebyte character. */ 253 mbclen = 1; 254 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; 255 if (BE (pstr->trans != NULL, 0)) 256 wc = pstr->trans[wc]; 257 pstr->cur_state = prev_st; 258 } 259 260 /* Write wide character and padding. */ 261 pstr->wcs[byte_idx++] = wc; 262 /* Write paddings. */ 263 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 264 pstr->wcs[byte_idx++] = WEOF; 265 } 266 pstr->valid_len = byte_idx; 267 pstr->valid_raw_len = byte_idx; 268 } 269 270 /* Build wide character buffer PSTR->WCS like build_wcs_buffer, 271 but for REG_ICASE. */ 272 273 static reg_errcode_t 274 internal_function __attribute_warn_unused_result__ 275 build_wcs_upper_buffer (re_string_t *pstr) 276 { 277 mbstate_t prev_st; 278 Idx src_idx, byte_idx, end_idx, remain_len; 279 size_t mbclen; 280 #ifdef _LIBC 281 char buf[MB_LEN_MAX]; 282 assert (MB_LEN_MAX >= pstr->mb_cur_max); 283 #else 284 char buf[64]; 285 #endif 286 287 byte_idx = pstr->valid_len; 288 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 289 290 /* The following optimization assumes that ASCII characters can be 291 mapped to wide characters with a simple cast. */ 292 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed) 293 { 294 while (byte_idx < end_idx) 295 { 296 wchar_t wc; 297 298 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]) 299 && mbsinit (&pstr->cur_state)) 300 { 301 /* In case of a singlebyte character. */ 302 pstr->mbs[byte_idx] 303 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]); 304 /* The next step uses the assumption that wchar_t is encoded 305 ASCII-safe: all ASCII values can be converted like this. */ 306 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx]; 307 ++byte_idx; 308 continue; 309 } 310 311 remain_len = end_idx - byte_idx; 312 prev_st = pstr->cur_state; 313 mbclen = __mbrtowc (&wc, 314 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx 315 + byte_idx), remain_len, &pstr->cur_state); 316 if (BE (mbclen < (size_t) -2, 1)) 317 { 318 wchar_t wcu = wc; 319 if (iswlower (wc)) 320 { 321 size_t mbcdlen; 322 323 wcu = towupper (wc); 324 mbcdlen = wcrtomb (buf, wcu, &prev_st); 325 if (BE (mbclen == mbcdlen, 1)) 326 memcpy (pstr->mbs + byte_idx, buf, mbclen); 327 else 328 { 329 src_idx = byte_idx; 330 goto offsets_needed; 331 } 332 } 333 else 334 memcpy (pstr->mbs + byte_idx, 335 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen); 336 pstr->wcs[byte_idx++] = wcu; 337 /* Write paddings. */ 338 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 339 pstr->wcs[byte_idx++] = WEOF; 340 } 341 else if (mbclen == (size_t) -1 || mbclen == 0) 342 { 343 /* It is an invalid character or '\0'. Just use the byte. */ 344 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; 345 pstr->mbs[byte_idx] = ch; 346 /* And also cast it to wide char. */ 347 pstr->wcs[byte_idx++] = (wchar_t) ch; 348 if (BE (mbclen == (size_t) -1, 0)) 349 pstr->cur_state = prev_st; 350 } 351 else 352 { 353 /* The buffer doesn't have enough space, finish to build. */ 354 pstr->cur_state = prev_st; 355 break; 356 } 357 } 358 pstr->valid_len = byte_idx; 359 pstr->valid_raw_len = byte_idx; 360 return REG_NOERROR; 361 } 362 else 363 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;) 364 { 365 wchar_t wc; 366 const char *p; 367 offsets_needed: 368 remain_len = end_idx - byte_idx; 369 prev_st = pstr->cur_state; 370 if (BE (pstr->trans != NULL, 0)) 371 { 372 int i, ch; 373 374 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i) 375 { 376 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i]; 377 buf[i] = pstr->trans[ch]; 378 } 379 p = (const char *) buf; 380 } 381 else 382 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx; 383 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state); 384 if (BE (mbclen < (size_t) -2, 1)) 385 { 386 wchar_t wcu = wc; 387 if (iswlower (wc)) 388 { 389 size_t mbcdlen; 390 391 wcu = towupper (wc); 392 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st); 393 if (BE (mbclen == mbcdlen, 1)) 394 memcpy (pstr->mbs + byte_idx, buf, mbclen); 395 else if (mbcdlen != (size_t) -1) 396 { 397 size_t i; 398 399 if (byte_idx + mbcdlen > pstr->bufs_len) 400 { 401 pstr->cur_state = prev_st; 402 break; 403 } 404 405 if (pstr->offsets == NULL) 406 { 407 pstr->offsets = re_malloc (Idx, pstr->bufs_len); 408 409 if (pstr->offsets == NULL) 410 return REG_ESPACE; 411 } 412 if (!pstr->offsets_needed) 413 { 414 for (i = 0; i < (size_t) byte_idx; ++i) 415 pstr->offsets[i] = i; 416 pstr->offsets_needed = 1; 417 } 418 419 memcpy (pstr->mbs + byte_idx, buf, mbcdlen); 420 pstr->wcs[byte_idx] = wcu; 421 pstr->offsets[byte_idx] = src_idx; 422 for (i = 1; i < mbcdlen; ++i) 423 { 424 pstr->offsets[byte_idx + i] 425 = src_idx + (i < mbclen ? i : mbclen - 1); 426 pstr->wcs[byte_idx + i] = WEOF; 427 } 428 pstr->len += mbcdlen - mbclen; 429 if (pstr->raw_stop > src_idx) 430 pstr->stop += mbcdlen - mbclen; 431 end_idx = (pstr->bufs_len > pstr->len) 432 ? pstr->len : pstr->bufs_len; 433 byte_idx += mbcdlen; 434 src_idx += mbclen; 435 continue; 436 } 437 else 438 memcpy (pstr->mbs + byte_idx, p, mbclen); 439 } 440 else 441 memcpy (pstr->mbs + byte_idx, p, mbclen); 442 443 if (BE (pstr->offsets_needed != 0, 0)) 444 { 445 size_t i; 446 for (i = 0; i < mbclen; ++i) 447 pstr->offsets[byte_idx + i] = src_idx + i; 448 } 449 src_idx += mbclen; 450 451 pstr->wcs[byte_idx++] = wcu; 452 /* Write paddings. */ 453 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 454 pstr->wcs[byte_idx++] = WEOF; 455 } 456 else if (mbclen == (size_t) -1 || mbclen == 0) 457 { 458 /* It is an invalid character or '\0'. Just use the byte. */ 459 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx]; 460 461 if (BE (pstr->trans != NULL, 0)) 462 ch = pstr->trans [ch]; 463 pstr->mbs[byte_idx] = ch; 464 465 if (BE (pstr->offsets_needed != 0, 0)) 466 pstr->offsets[byte_idx] = src_idx; 467 ++src_idx; 468 469 /* And also cast it to wide char. */ 470 pstr->wcs[byte_idx++] = (wchar_t) ch; 471 if (BE (mbclen == (size_t) -1, 0)) 472 pstr->cur_state = prev_st; 473 } 474 else 475 { 476 /* The buffer doesn't have enough space, finish to build. */ 477 pstr->cur_state = prev_st; 478 break; 479 } 480 } 481 pstr->valid_len = byte_idx; 482 pstr->valid_raw_len = src_idx; 483 return REG_NOERROR; 484 } 485 486 /* Skip characters until the index becomes greater than NEW_RAW_IDX. 487 Return the index. */ 488 489 static Idx 490 internal_function 491 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc) 492 { 493 mbstate_t prev_st; 494 Idx rawbuf_idx; 495 size_t mbclen; 496 wint_t wc = WEOF; 497 498 /* Skip the characters which are not necessary to check. */ 499 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len; 500 rawbuf_idx < new_raw_idx;) 501 { 502 wchar_t wc2; 503 Idx remain_len; 504 remain_len = pstr->len - rawbuf_idx; 505 prev_st = pstr->cur_state; 506 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx, 507 remain_len, &pstr->cur_state); 508 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0)) 509 { 510 /* We treat these cases as a single byte character. */ 511 if (mbclen == 0 || remain_len == 0) 512 wc = L'\0'; 513 else 514 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx); 515 mbclen = 1; 516 pstr->cur_state = prev_st; 517 } 518 else 519 wc = wc2; 520 /* Then proceed the next character. */ 521 rawbuf_idx += mbclen; 522 } 523 *last_wc = wc; 524 return rawbuf_idx; 525 } 526 #endif /* RE_ENABLE_I18N */ 527 528 /* Build the buffer PSTR->MBS, and apply the translation if we need. 529 This function is used in case of REG_ICASE. */ 530 531 static void 532 internal_function 533 build_upper_buffer (re_string_t *pstr) 534 { 535 Idx char_idx, end_idx; 536 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 537 538 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx) 539 { 540 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx]; 541 if (BE (pstr->trans != NULL, 0)) 542 ch = pstr->trans[ch]; 543 if (islower (ch)) 544 pstr->mbs[char_idx] = toupper (ch); 545 else 546 pstr->mbs[char_idx] = ch; 547 } 548 pstr->valid_len = char_idx; 549 pstr->valid_raw_len = char_idx; 550 } 551 552 /* Apply TRANS to the buffer in PSTR. */ 553 554 static void 555 internal_function 556 re_string_translate_buffer (re_string_t *pstr) 557 { 558 Idx buf_idx, end_idx; 559 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 560 561 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx) 562 { 563 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx]; 564 pstr->mbs[buf_idx] = pstr->trans[ch]; 565 } 566 567 pstr->valid_len = buf_idx; 568 pstr->valid_raw_len = buf_idx; 569 } 570 571 /* This function re-construct the buffers. 572 Concretely, convert to wide character in case of pstr->mb_cur_max > 1, 573 convert to upper case in case of REG_ICASE, apply translation. */ 574 575 static reg_errcode_t 576 internal_function __attribute_warn_unused_result__ 577 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags) 578 { 579 Idx offset; 580 581 if (BE (pstr->raw_mbs_idx <= idx, 0)) 582 offset = idx - pstr->raw_mbs_idx; 583 else 584 { 585 /* Reset buffer. */ 586 #ifdef RE_ENABLE_I18N 587 if (pstr->mb_cur_max > 1) 588 memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); 589 #endif /* RE_ENABLE_I18N */ 590 pstr->len = pstr->raw_len; 591 pstr->stop = pstr->raw_stop; 592 pstr->valid_len = 0; 593 pstr->raw_mbs_idx = 0; 594 pstr->valid_raw_len = 0; 595 pstr->offsets_needed = 0; 596 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF 597 : CONTEXT_NEWLINE | CONTEXT_BEGBUF); 598 if (!pstr->mbs_allocated) 599 pstr->mbs = (unsigned char *) pstr->raw_mbs; 600 offset = idx; 601 } 602 603 if (BE (offset != 0, 1)) 604 { 605 /* Should the already checked characters be kept? */ 606 if (BE (offset < pstr->valid_raw_len, 1)) 607 { 608 /* Yes, move them to the front of the buffer. */ 609 #ifdef RE_ENABLE_I18N 610 if (BE (pstr->offsets_needed, 0)) 611 { 612 Idx low = 0, high = pstr->valid_len, mid; 613 do 614 { 615 mid = (high + low) / 2; 616 if (pstr->offsets[mid] > offset) 617 high = mid; 618 else if (pstr->offsets[mid] < offset) 619 low = mid + 1; 620 else 621 break; 622 } 623 while (low < high); 624 if (pstr->offsets[mid] < offset) 625 ++mid; 626 pstr->tip_context = re_string_context_at (pstr, mid - 1, 627 eflags); 628 /* This can be quite complicated, so handle specially 629 only the common and easy case where the character with 630 different length representation of lower and upper 631 case is present at or after offset. */ 632 if (pstr->valid_len > offset 633 && mid == offset && pstr->offsets[mid] == offset) 634 { 635 memmove (pstr->wcs, pstr->wcs + offset, 636 (pstr->valid_len - offset) * sizeof (wint_t)); 637 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset); 638 pstr->valid_len -= offset; 639 pstr->valid_raw_len -= offset; 640 for (low = 0; low < pstr->valid_len; low++) 641 pstr->offsets[low] = pstr->offsets[low + offset] - offset; 642 } 643 else 644 { 645 /* Otherwise, just find out how long the partial multibyte 646 character at offset is and fill it with WEOF/255. */ 647 pstr->len = pstr->raw_len - idx + offset; 648 pstr->stop = pstr->raw_stop - idx + offset; 649 pstr->offsets_needed = 0; 650 while (mid > 0 && pstr->offsets[mid - 1] == offset) 651 --mid; 652 while (mid < pstr->valid_len) 653 if (pstr->wcs[mid] != WEOF) 654 break; 655 else 656 ++mid; 657 if (mid == pstr->valid_len) 658 pstr->valid_len = 0; 659 else 660 { 661 pstr->valid_len = pstr->offsets[mid] - offset; 662 if (pstr->valid_len) 663 { 664 for (low = 0; low < pstr->valid_len; ++low) 665 pstr->wcs[low] = WEOF; 666 memset (pstr->mbs, 255, pstr->valid_len); 667 } 668 } 669 pstr->valid_raw_len = pstr->valid_len; 670 } 671 } 672 else 673 #endif 674 { 675 pstr->tip_context = re_string_context_at (pstr, offset - 1, 676 eflags); 677 #ifdef RE_ENABLE_I18N 678 if (pstr->mb_cur_max > 1) 679 memmove (pstr->wcs, pstr->wcs + offset, 680 (pstr->valid_len - offset) * sizeof (wint_t)); 681 #endif /* RE_ENABLE_I18N */ 682 if (BE (pstr->mbs_allocated, 0)) 683 memmove (pstr->mbs, pstr->mbs + offset, 684 pstr->valid_len - offset); 685 pstr->valid_len -= offset; 686 pstr->valid_raw_len -= offset; 687 #if DEBUG 688 assert (pstr->valid_len > 0); 689 #endif 690 } 691 } 692 else 693 { 694 #ifdef RE_ENABLE_I18N 695 /* No, skip all characters until IDX. */ 696 Idx prev_valid_len = pstr->valid_len; 697 698 if (BE (pstr->offsets_needed, 0)) 699 { 700 pstr->len = pstr->raw_len - idx + offset; 701 pstr->stop = pstr->raw_stop - idx + offset; 702 pstr->offsets_needed = 0; 703 } 704 #endif 705 pstr->valid_len = 0; 706 #ifdef RE_ENABLE_I18N 707 if (pstr->mb_cur_max > 1) 708 { 709 Idx wcs_idx; 710 wint_t wc = WEOF; 711 712 if (pstr->is_utf8) 713 { 714 const unsigned char *raw, *p, *end; 715 716 /* Special case UTF-8. Multi-byte chars start with any 717 byte other than 0x80 - 0xbf. */ 718 raw = pstr->raw_mbs + pstr->raw_mbs_idx; 719 end = raw + (offset - pstr->mb_cur_max); 720 if (end < pstr->raw_mbs) 721 end = pstr->raw_mbs; 722 p = raw + offset - 1; 723 #ifdef _LIBC 724 /* We know the wchar_t encoding is UCS4, so for the simple 725 case, ASCII characters, skip the conversion step. */ 726 if (isascii (*p) && BE (pstr->trans == NULL, 1)) 727 { 728 memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); 729 /* pstr->valid_len = 0; */ 730 wc = (wchar_t) *p; 731 } 732 else 733 #endif 734 for (; p >= end; --p) 735 if ((*p & 0xc0) != 0x80) 736 { 737 mbstate_t cur_state; 738 wchar_t wc2; 739 Idx mlen = raw + pstr->len - p; 740 unsigned char buf[6]; 741 size_t mbclen; 742 743 if (BE (pstr->trans != NULL, 0)) 744 { 745 int i = mlen < 6 ? mlen : 6; 746 while (--i >= 0) 747 buf[i] = pstr->trans[p[i]]; 748 } 749 /* XXX Don't use mbrtowc, we know which conversion 750 to use (UTF-8 -> UCS4). */ 751 memset (&cur_state, 0, sizeof (cur_state)); 752 mbclen = __mbrtowc (&wc2, (const char *) p, mlen, 753 &cur_state); 754 if (raw + offset - p <= mbclen 755 && mbclen < (size_t) -2) 756 { 757 memset (&pstr->cur_state, '\0', 758 sizeof (mbstate_t)); 759 pstr->valid_len = mbclen - (raw + offset - p); 760 wc = wc2; 761 } 762 break; 763 } 764 } 765 766 if (wc == WEOF) 767 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx; 768 if (wc == WEOF) 769 pstr->tip_context 770 = re_string_context_at (pstr, prev_valid_len - 1, eflags); 771 else 772 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0) 773 && IS_WIDE_WORD_CHAR (wc)) 774 ? CONTEXT_WORD 775 : ((IS_WIDE_NEWLINE (wc) 776 && pstr->newline_anchor) 777 ? CONTEXT_NEWLINE : 0)); 778 if (BE (pstr->valid_len, 0)) 779 { 780 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx) 781 pstr->wcs[wcs_idx] = WEOF; 782 if (pstr->mbs_allocated) 783 memset (pstr->mbs, 255, pstr->valid_len); 784 } 785 pstr->valid_raw_len = pstr->valid_len; 786 } 787 else 788 #endif /* RE_ENABLE_I18N */ 789 { 790 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1]; 791 pstr->valid_raw_len = 0; 792 if (pstr->trans) 793 c = pstr->trans[c]; 794 pstr->tip_context = (bitset_contain (pstr->word_char, c) 795 ? CONTEXT_WORD 796 : ((IS_NEWLINE (c) && pstr->newline_anchor) 797 ? CONTEXT_NEWLINE : 0)); 798 } 799 } 800 if (!BE (pstr->mbs_allocated, 0)) 801 pstr->mbs += offset; 802 } 803 pstr->raw_mbs_idx = idx; 804 pstr->len -= offset; 805 pstr->stop -= offset; 806 807 /* Then build the buffers. */ 808 #ifdef RE_ENABLE_I18N 809 if (pstr->mb_cur_max > 1) 810 { 811 if (pstr->icase) 812 { 813 reg_errcode_t ret = build_wcs_upper_buffer (pstr); 814 if (BE (ret != REG_NOERROR, 0)) 815 return ret; 816 } 817 else 818 build_wcs_buffer (pstr); 819 } 820 else 821 #endif /* RE_ENABLE_I18N */ 822 if (BE (pstr->mbs_allocated, 0)) 823 { 824 if (pstr->icase) 825 build_upper_buffer (pstr); 826 else if (pstr->trans != NULL) 827 re_string_translate_buffer (pstr); 828 } 829 else 830 pstr->valid_len = pstr->len; 831 832 pstr->cur_idx = 0; 833 return REG_NOERROR; 834 } 835 836 static unsigned char 837 internal_function __attribute ((pure)) 838 re_string_peek_byte_case (const re_string_t *pstr, Idx idx) 839 { 840 int ch; 841 Idx off; 842 843 /* Handle the common (easiest) cases first. */ 844 if (BE (!pstr->mbs_allocated, 1)) 845 return re_string_peek_byte (pstr, idx); 846 847 #ifdef RE_ENABLE_I18N 848 if (pstr->mb_cur_max > 1 849 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx)) 850 return re_string_peek_byte (pstr, idx); 851 #endif 852 853 off = pstr->cur_idx + idx; 854 #ifdef RE_ENABLE_I18N 855 if (pstr->offsets_needed) 856 off = pstr->offsets[off]; 857 #endif 858 859 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off]; 860 861 #ifdef RE_ENABLE_I18N 862 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I 863 this function returns CAPITAL LETTER I instead of first byte of 864 DOTLESS SMALL LETTER I. The latter would confuse the parser, 865 since peek_byte_case doesn't advance cur_idx in any way. */ 866 if (pstr->offsets_needed && !isascii (ch)) 867 return re_string_peek_byte (pstr, idx); 868 #endif 869 870 return ch; 871 } 872 873 static unsigned char 874 internal_function __attribute ((pure)) 875 re_string_fetch_byte_case (re_string_t *pstr) 876 { 877 if (BE (!pstr->mbs_allocated, 1)) 878 return re_string_fetch_byte (pstr); 879 880 #ifdef RE_ENABLE_I18N 881 if (pstr->offsets_needed) 882 { 883 Idx off; 884 int ch; 885 886 /* For tr_TR.UTF-8 [[:islower:]] there is 887 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip 888 in that case the whole multi-byte character and return 889 the original letter. On the other side, with 890 [[: DOTLESS SMALL LETTER I return [[:I, as doing 891 anything else would complicate things too much. */ 892 893 if (!re_string_first_byte (pstr, pstr->cur_idx)) 894 return re_string_fetch_byte (pstr); 895 896 off = pstr->offsets[pstr->cur_idx]; 897 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off]; 898 899 if (! isascii (ch)) 900 return re_string_fetch_byte (pstr); 901 902 re_string_skip_bytes (pstr, 903 re_string_char_size_at (pstr, pstr->cur_idx)); 904 return ch; 905 } 906 #endif 907 908 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++]; 909 } 910 911 static void 912 internal_function 913 re_string_destruct (re_string_t *pstr) 914 { 915 #ifdef RE_ENABLE_I18N 916 re_free (pstr->wcs); 917 re_free (pstr->offsets); 918 #endif /* RE_ENABLE_I18N */ 919 if (pstr->mbs_allocated) 920 re_free (pstr->mbs); 921 } 922 923 /* Return the context at IDX in INPUT. */ 924 925 static unsigned int 926 internal_function 927 re_string_context_at (const re_string_t *input, Idx idx, int eflags) 928 { 929 int c; 930 if (BE (! REG_VALID_INDEX (idx), 0)) 931 /* In this case, we use the value stored in input->tip_context, 932 since we can't know the character in input->mbs[-1] here. */ 933 return input->tip_context; 934 if (BE (idx == input->len, 0)) 935 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF 936 : CONTEXT_NEWLINE | CONTEXT_ENDBUF); 937 #ifdef RE_ENABLE_I18N 938 if (input->mb_cur_max > 1) 939 { 940 wint_t wc; 941 Idx wc_idx = idx; 942 while(input->wcs[wc_idx] == WEOF) 943 { 944 #ifdef DEBUG 945 /* It must not happen. */ 946 assert (REG_VALID_INDEX (wc_idx)); 947 #endif 948 --wc_idx; 949 if (! REG_VALID_INDEX (wc_idx)) 950 return input->tip_context; 951 } 952 wc = input->wcs[wc_idx]; 953 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc)) 954 return CONTEXT_WORD; 955 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor 956 ? CONTEXT_NEWLINE : 0); 957 } 958 else 959 #endif 960 { 961 c = re_string_byte_at (input, idx); 962 if (bitset_contain (input->word_char, c)) 963 return CONTEXT_WORD; 964 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0; 965 } 966 } 967 968 /* Functions for set operation. */ 969 970 static reg_errcode_t 971 internal_function __attribute_warn_unused_result__ 972 re_node_set_alloc (re_node_set *set, Idx size) 973 { 974 set->alloc = size; 975 set->nelem = 0; 976 set->elems = re_malloc (Idx, size); 977 if (BE (set->elems == NULL, 0)) 978 return REG_ESPACE; 979 return REG_NOERROR; 980 } 981 982 static reg_errcode_t 983 internal_function __attribute_warn_unused_result__ 984 re_node_set_init_1 (re_node_set *set, Idx elem) 985 { 986 set->alloc = 1; 987 set->nelem = 1; 988 set->elems = re_malloc (Idx, 1); 989 if (BE (set->elems == NULL, 0)) 990 { 991 set->alloc = set->nelem = 0; 992 return REG_ESPACE; 993 } 994 set->elems[0] = elem; 995 return REG_NOERROR; 996 } 997 998 static reg_errcode_t 999 internal_function __attribute_warn_unused_result__ 1000 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2) 1001 { 1002 set->alloc = 2; 1003 set->elems = re_malloc (Idx, 2); 1004 if (BE (set->elems == NULL, 0)) 1005 return REG_ESPACE; 1006 if (elem1 == elem2) 1007 { 1008 set->nelem = 1; 1009 set->elems[0] = elem1; 1010 } 1011 else 1012 { 1013 set->nelem = 2; 1014 if (elem1 < elem2) 1015 { 1016 set->elems[0] = elem1; 1017 set->elems[1] = elem2; 1018 } 1019 else 1020 { 1021 set->elems[0] = elem2; 1022 set->elems[1] = elem1; 1023 } 1024 } 1025 return REG_NOERROR; 1026 } 1027 1028 static reg_errcode_t 1029 internal_function __attribute_warn_unused_result__ 1030 re_node_set_init_copy (re_node_set *dest, const re_node_set *src) 1031 { 1032 dest->nelem = src->nelem; 1033 if (src->nelem > 0) 1034 { 1035 dest->alloc = dest->nelem; 1036 dest->elems = re_malloc (Idx, dest->alloc); 1037 if (BE (dest->elems == NULL, 0)) 1038 { 1039 dest->alloc = dest->nelem = 0; 1040 return REG_ESPACE; 1041 } 1042 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx)); 1043 } 1044 else 1045 re_node_set_init_empty (dest); 1046 return REG_NOERROR; 1047 } 1048 1049 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to 1050 DEST. Return value indicate the error code or REG_NOERROR if succeeded. 1051 Note: We assume dest->elems is NULL, when dest->alloc is 0. */ 1052 1053 static reg_errcode_t 1054 internal_function __attribute_warn_unused_result__ 1055 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1, 1056 const re_node_set *src2) 1057 { 1058 Idx i1, i2, is, id, delta, sbase; 1059 if (src1->nelem == 0 || src2->nelem == 0) 1060 return REG_NOERROR; 1061 1062 /* We need dest->nelem + 2 * elems_in_intersection; this is a 1063 conservative estimate. */ 1064 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc) 1065 { 1066 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc; 1067 Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc); 1068 if (BE (new_elems == NULL, 0)) 1069 return REG_ESPACE; 1070 dest->elems = new_elems; 1071 dest->alloc = new_alloc; 1072 } 1073 1074 /* Find the items in the intersection of SRC1 and SRC2, and copy 1075 into the top of DEST those that are not already in DEST itself. */ 1076 sbase = dest->nelem + src1->nelem + src2->nelem; 1077 i1 = src1->nelem - 1; 1078 i2 = src2->nelem - 1; 1079 id = dest->nelem - 1; 1080 for (;;) 1081 { 1082 if (src1->elems[i1] == src2->elems[i2]) 1083 { 1084 /* Try to find the item in DEST. Maybe we could binary search? */ 1085 while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1]) 1086 --id; 1087 1088 if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1]) 1089 dest->elems[--sbase] = src1->elems[i1]; 1090 1091 if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2)) 1092 break; 1093 } 1094 1095 /* Lower the highest of the two items. */ 1096 else if (src1->elems[i1] < src2->elems[i2]) 1097 { 1098 if (! REG_VALID_INDEX (--i2)) 1099 break; 1100 } 1101 else 1102 { 1103 if (! REG_VALID_INDEX (--i1)) 1104 break; 1105 } 1106 } 1107 1108 id = dest->nelem - 1; 1109 is = dest->nelem + src1->nelem + src2->nelem - 1; 1110 delta = is - sbase + 1; 1111 1112 /* Now copy. When DELTA becomes zero, the remaining 1113 DEST elements are already in place; this is more or 1114 less the same loop that is in re_node_set_merge. */ 1115 dest->nelem += delta; 1116 if (delta > 0 && REG_VALID_INDEX (id)) 1117 for (;;) 1118 { 1119 if (dest->elems[is] > dest->elems[id]) 1120 { 1121 /* Copy from the top. */ 1122 dest->elems[id + delta--] = dest->elems[is--]; 1123 if (delta == 0) 1124 break; 1125 } 1126 else 1127 { 1128 /* Slide from the bottom. */ 1129 dest->elems[id + delta] = dest->elems[id]; 1130 if (! REG_VALID_INDEX (--id)) 1131 break; 1132 } 1133 } 1134 1135 /* Copy remaining SRC elements. */ 1136 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx)); 1137 1138 return REG_NOERROR; 1139 } 1140 1141 /* Calculate the union set of the sets SRC1 and SRC2. And store it to 1142 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */ 1143 1144 static reg_errcode_t 1145 internal_function __attribute_warn_unused_result__ 1146 re_node_set_init_union (re_node_set *dest, const re_node_set *src1, 1147 const re_node_set *src2) 1148 { 1149 Idx i1, i2, id; 1150 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0) 1151 { 1152 dest->alloc = src1->nelem + src2->nelem; 1153 dest->elems = re_malloc (Idx, dest->alloc); 1154 if (BE (dest->elems == NULL, 0)) 1155 return REG_ESPACE; 1156 } 1157 else 1158 { 1159 if (src1 != NULL && src1->nelem > 0) 1160 return re_node_set_init_copy (dest, src1); 1161 else if (src2 != NULL && src2->nelem > 0) 1162 return re_node_set_init_copy (dest, src2); 1163 else 1164 re_node_set_init_empty (dest); 1165 return REG_NOERROR; 1166 } 1167 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;) 1168 { 1169 if (src1->elems[i1] > src2->elems[i2]) 1170 { 1171 dest->elems[id++] = src2->elems[i2++]; 1172 continue; 1173 } 1174 if (src1->elems[i1] == src2->elems[i2]) 1175 ++i2; 1176 dest->elems[id++] = src1->elems[i1++]; 1177 } 1178 if (i1 < src1->nelem) 1179 { 1180 memcpy (dest->elems + id, src1->elems + i1, 1181 (src1->nelem - i1) * sizeof (Idx)); 1182 id += src1->nelem - i1; 1183 } 1184 else if (i2 < src2->nelem) 1185 { 1186 memcpy (dest->elems + id, src2->elems + i2, 1187 (src2->nelem - i2) * sizeof (Idx)); 1188 id += src2->nelem - i2; 1189 } 1190 dest->nelem = id; 1191 return REG_NOERROR; 1192 } 1193 1194 /* Calculate the union set of the sets DEST and SRC. And store it to 1195 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */ 1196 1197 static reg_errcode_t 1198 internal_function __attribute_warn_unused_result__ 1199 re_node_set_merge (re_node_set *dest, const re_node_set *src) 1200 { 1201 Idx is, id, sbase, delta; 1202 if (src == NULL || src->nelem == 0) 1203 return REG_NOERROR; 1204 if (dest->alloc < 2 * src->nelem + dest->nelem) 1205 { 1206 Idx new_alloc = 2 * (src->nelem + dest->alloc); 1207 Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc); 1208 if (BE (new_buffer == NULL, 0)) 1209 return REG_ESPACE; 1210 dest->elems = new_buffer; 1211 dest->alloc = new_alloc; 1212 } 1213 1214 if (BE (dest->nelem == 0, 0)) 1215 { 1216 dest->nelem = src->nelem; 1217 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx)); 1218 return REG_NOERROR; 1219 } 1220 1221 /* Copy into the top of DEST the items of SRC that are not 1222 found in DEST. Maybe we could binary search in DEST? */ 1223 for (sbase = dest->nelem + 2 * src->nelem, 1224 is = src->nelem - 1, id = dest->nelem - 1; 1225 REG_VALID_INDEX (is) && REG_VALID_INDEX (id); ) 1226 { 1227 if (dest->elems[id] == src->elems[is]) 1228 is--, id--; 1229 else if (dest->elems[id] < src->elems[is]) 1230 dest->elems[--sbase] = src->elems[is--]; 1231 else /* if (dest->elems[id] > src->elems[is]) */ 1232 --id; 1233 } 1234 1235 if (REG_VALID_INDEX (is)) 1236 { 1237 /* If DEST is exhausted, the remaining items of SRC must be unique. */ 1238 sbase -= is + 1; 1239 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx)); 1240 } 1241 1242 id = dest->nelem - 1; 1243 is = dest->nelem + 2 * src->nelem - 1; 1244 delta = is - sbase + 1; 1245 if (delta == 0) 1246 return REG_NOERROR; 1247 1248 /* Now copy. When DELTA becomes zero, the remaining 1249 DEST elements are already in place. */ 1250 dest->nelem += delta; 1251 for (;;) 1252 { 1253 if (dest->elems[is] > dest->elems[id]) 1254 { 1255 /* Copy from the top. */ 1256 dest->elems[id + delta--] = dest->elems[is--]; 1257 if (delta == 0) 1258 break; 1259 } 1260 else 1261 { 1262 /* Slide from the bottom. */ 1263 dest->elems[id + delta] = dest->elems[id]; 1264 if (! REG_VALID_INDEX (--id)) 1265 { 1266 /* Copy remaining SRC elements. */ 1267 memcpy (dest->elems, dest->elems + sbase, 1268 delta * sizeof (Idx)); 1269 break; 1270 } 1271 } 1272 } 1273 1274 return REG_NOERROR; 1275 } 1276 1277 /* Insert the new element ELEM to the re_node_set* SET. 1278 SET should not already have ELEM. 1279 Return true if successful. */ 1280 1281 static bool 1282 internal_function __attribute_warn_unused_result__ 1283 re_node_set_insert (re_node_set *set, Idx elem) 1284 { 1285 Idx idx; 1286 /* In case the set is empty. */ 1287 if (set->alloc == 0) 1288 return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1); 1289 1290 if (BE (set->nelem, 0) == 0) 1291 { 1292 /* We already guaranteed above that set->alloc != 0. */ 1293 set->elems[0] = elem; 1294 ++set->nelem; 1295 return true; 1296 } 1297 1298 /* Realloc if we need. */ 1299 if (set->alloc == set->nelem) 1300 { 1301 Idx *new_elems; 1302 set->alloc = set->alloc * 2; 1303 new_elems = re_realloc (set->elems, Idx, set->alloc); 1304 if (BE (new_elems == NULL, 0)) 1305 return false; 1306 set->elems = new_elems; 1307 } 1308 1309 /* Move the elements which follows the new element. Test the 1310 first element separately to skip a check in the inner loop. */ 1311 if (elem < set->elems[0]) 1312 { 1313 idx = 0; 1314 for (idx = set->nelem; idx > 0; idx--) 1315 set->elems[idx] = set->elems[idx - 1]; 1316 } 1317 else 1318 { 1319 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--) 1320 set->elems[idx] = set->elems[idx - 1]; 1321 } 1322 1323 /* Insert the new element. */ 1324 set->elems[idx] = elem; 1325 ++set->nelem; 1326 return true; 1327 } 1328 1329 /* Insert the new element ELEM to the re_node_set* SET. 1330 SET should not already have any element greater than or equal to ELEM. 1331 Return true if successful. */ 1332 1333 static bool 1334 internal_function __attribute_warn_unused_result__ 1335 re_node_set_insert_last (re_node_set *set, Idx elem) 1336 { 1337 /* Realloc if we need. */ 1338 if (set->alloc == set->nelem) 1339 { 1340 Idx *new_elems; 1341 set->alloc = (set->alloc + 1) * 2; 1342 new_elems = re_realloc (set->elems, Idx, set->alloc); 1343 if (BE (new_elems == NULL, 0)) 1344 return false; 1345 set->elems = new_elems; 1346 } 1347 1348 /* Insert the new element. */ 1349 set->elems[set->nelem++] = elem; 1350 return true; 1351 } 1352 1353 /* Compare two node sets SET1 and SET2. 1354 Return true if SET1 and SET2 are equivalent. */ 1355 1356 static bool 1357 internal_function __attribute ((pure)) 1358 re_node_set_compare (const re_node_set *set1, const re_node_set *set2) 1359 { 1360 Idx i; 1361 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem) 1362 return false; 1363 for (i = set1->nelem ; REG_VALID_INDEX (--i) ; ) 1364 if (set1->elems[i] != set2->elems[i]) 1365 return false; 1366 return true; 1367 } 1368 1369 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */ 1370 1371 static Idx 1372 internal_function __attribute ((pure)) 1373 re_node_set_contains (const re_node_set *set, Idx elem) 1374 { 1375 __re_size_t idx, right, mid; 1376 if (! REG_VALID_NONZERO_INDEX (set->nelem)) 1377 return 0; 1378 1379 /* Binary search the element. */ 1380 idx = 0; 1381 right = set->nelem - 1; 1382 while (idx < right) 1383 { 1384 mid = (idx + right) / 2; 1385 if (set->elems[mid] < elem) 1386 idx = mid + 1; 1387 else 1388 right = mid; 1389 } 1390 return set->elems[idx] == elem ? idx + 1 : 0; 1391 } 1392 1393 static void 1394 internal_function 1395 re_node_set_remove_at (re_node_set *set, Idx idx) 1396 { 1397 verify (! TYPE_SIGNED (Idx)); 1398 /* if (idx < 0) 1399 return; */ 1400 if (idx >= set->nelem) 1401 return; 1402 --set->nelem; 1403 for (; idx < set->nelem; idx++) 1404 set->elems[idx] = set->elems[idx + 1]; 1405 } 1406 1407 1408 /* Add the token TOKEN to dfa->nodes, and return the index of the token. 1409 Or return REG_MISSING if an error occurred. */ 1410 1411 static Idx 1412 internal_function 1413 re_dfa_add_node (re_dfa_t *dfa, re_token_t token) 1414 { 1415 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0)) 1416 { 1417 size_t new_nodes_alloc = dfa->nodes_alloc * 2; 1418 Idx *new_nexts, *new_indices; 1419 re_node_set *new_edests, *new_eclosures; 1420 re_token_t *new_nodes; 1421 size_t max_object_size = 1422 MAX (sizeof (re_token_t), 1423 MAX (sizeof (re_node_set), 1424 sizeof (Idx))); 1425 1426 /* Avoid overflows. */ 1427 if (BE (SIZE_MAX / 2 / max_object_size < dfa->nodes_alloc, 0)) 1428 return REG_MISSING; 1429 1430 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc); 1431 if (BE (new_nodes == NULL, 0)) 1432 return REG_MISSING; 1433 dfa->nodes = new_nodes; 1434 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc); 1435 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc); 1436 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc); 1437 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc); 1438 if (BE (new_nexts == NULL || new_indices == NULL 1439 || new_edests == NULL || new_eclosures == NULL, 0)) 1440 return REG_MISSING; 1441 dfa->nexts = new_nexts; 1442 dfa->org_indices = new_indices; 1443 dfa->edests = new_edests; 1444 dfa->eclosures = new_eclosures; 1445 dfa->nodes_alloc = new_nodes_alloc; 1446 } 1447 dfa->nodes[dfa->nodes_len] = token; 1448 dfa->nodes[dfa->nodes_len].constraint = 0; 1449 #ifdef RE_ENABLE_I18N 1450 { 1451 int type = token.type; 1452 dfa->nodes[dfa->nodes_len].accept_mb = 1453 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET; 1454 } 1455 #endif 1456 dfa->nexts[dfa->nodes_len] = REG_MISSING; 1457 re_node_set_init_empty (dfa->edests + dfa->nodes_len); 1458 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len); 1459 return dfa->nodes_len++; 1460 } 1461 1462 static inline re_hashval_t 1463 internal_function 1464 calc_state_hash (const re_node_set *nodes, unsigned int context) 1465 { 1466 re_hashval_t hash = nodes->nelem + context; 1467 Idx i; 1468 for (i = 0 ; i < nodes->nelem ; i++) 1469 hash += nodes->elems[i]; 1470 return hash; 1471 } 1472 1473 /* Search for the state whose node_set is equivalent to NODES. 1474 Return the pointer to the state, if we found it in the DFA. 1475 Otherwise create the new one and return it. In case of an error 1476 return NULL and set the error code in ERR. 1477 Note: - We assume NULL as the invalid state, then it is possible that 1478 return value is NULL and ERR is REG_NOERROR. 1479 - We never return non-NULL value in case of any errors, it is for 1480 optimization. */ 1481 1482 static re_dfastate_t * 1483 internal_function __attribute_warn_unused_result__ 1484 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa, 1485 const re_node_set *nodes) 1486 { 1487 re_hashval_t hash; 1488 re_dfastate_t *new_state; 1489 struct re_state_table_entry *spot; 1490 Idx i; 1491 #ifdef lint 1492 /* Suppress bogus uninitialized-variable warnings. */ 1493 *err = REG_NOERROR; 1494 #endif 1495 if (BE (nodes->nelem == 0, 0)) 1496 { 1497 *err = REG_NOERROR; 1498 return NULL; 1499 } 1500 hash = calc_state_hash (nodes, 0); 1501 spot = dfa->state_table + (hash & dfa->state_hash_mask); 1502 1503 for (i = 0 ; i < spot->num ; i++) 1504 { 1505 re_dfastate_t *state = spot->array[i]; 1506 if (hash != state->hash) 1507 continue; 1508 if (re_node_set_compare (&state->nodes, nodes)) 1509 return state; 1510 } 1511 1512 /* There are no appropriate state in the dfa, create the new one. */ 1513 new_state = create_ci_newstate (dfa, nodes, hash); 1514 if (BE (new_state == NULL, 0)) 1515 *err = REG_ESPACE; 1516 1517 return new_state; 1518 } 1519 1520 /* Search for the state whose node_set is equivalent to NODES and 1521 whose context is equivalent to CONTEXT. 1522 Return the pointer to the state, if we found it in the DFA. 1523 Otherwise create the new one and return it. In case of an error 1524 return NULL and set the error code in ERR. 1525 Note: - We assume NULL as the invalid state, then it is possible that 1526 return value is NULL and ERR is REG_NOERROR. 1527 - We never return non-NULL value in case of any errors, it is for 1528 optimization. */ 1529 1530 static re_dfastate_t * 1531 internal_function __attribute_warn_unused_result__ 1532 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa, 1533 const re_node_set *nodes, unsigned int context) 1534 { 1535 re_hashval_t hash; 1536 re_dfastate_t *new_state; 1537 struct re_state_table_entry *spot; 1538 Idx i; 1539 #ifdef lint 1540 /* Suppress bogus uninitialized-variable warnings. */ 1541 *err = REG_NOERROR; 1542 #endif 1543 if (nodes->nelem == 0) 1544 { 1545 *err = REG_NOERROR; 1546 return NULL; 1547 } 1548 hash = calc_state_hash (nodes, context); 1549 spot = dfa->state_table + (hash & dfa->state_hash_mask); 1550 1551 for (i = 0 ; i < spot->num ; i++) 1552 { 1553 re_dfastate_t *state = spot->array[i]; 1554 if (state->hash == hash 1555 && state->context == context 1556 && re_node_set_compare (state->entrance_nodes, nodes)) 1557 return state; 1558 } 1559 /* There are no appropriate state in `dfa', create the new one. */ 1560 new_state = create_cd_newstate (dfa, nodes, context, hash); 1561 if (BE (new_state == NULL, 0)) 1562 *err = REG_ESPACE; 1563 1564 return new_state; 1565 } 1566 1567 /* Finish initialization of the new state NEWSTATE, and using its hash value 1568 HASH put in the appropriate bucket of DFA's state table. Return value 1569 indicates the error code if failed. */ 1570 1571 static reg_errcode_t 1572 __attribute_warn_unused_result__ 1573 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate, 1574 re_hashval_t hash) 1575 { 1576 struct re_state_table_entry *spot; 1577 reg_errcode_t err; 1578 Idx i; 1579 1580 newstate->hash = hash; 1581 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem); 1582 if (BE (err != REG_NOERROR, 0)) 1583 return REG_ESPACE; 1584 for (i = 0; i < newstate->nodes.nelem; i++) 1585 { 1586 Idx elem = newstate->nodes.elems[i]; 1587 if (!IS_EPSILON_NODE (dfa->nodes[elem].type)) 1588 if (BE (! re_node_set_insert_last (&newstate->non_eps_nodes, elem), 0)) 1589 return REG_ESPACE; 1590 } 1591 1592 spot = dfa->state_table + (hash & dfa->state_hash_mask); 1593 if (BE (spot->alloc <= spot->num, 0)) 1594 { 1595 Idx new_alloc = 2 * spot->num + 2; 1596 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *, 1597 new_alloc); 1598 if (BE (new_array == NULL, 0)) 1599 return REG_ESPACE; 1600 spot->array = new_array; 1601 spot->alloc = new_alloc; 1602 } 1603 spot->array[spot->num++] = newstate; 1604 return REG_NOERROR; 1605 } 1606 1607 static void 1608 free_state (re_dfastate_t *state) 1609 { 1610 re_node_set_free (&state->non_eps_nodes); 1611 re_node_set_free (&state->inveclosure); 1612 if (state->entrance_nodes != &state->nodes) 1613 { 1614 re_node_set_free (state->entrance_nodes); 1615 re_free (state->entrance_nodes); 1616 } 1617 re_node_set_free (&state->nodes); 1618 re_free (state->word_trtable); 1619 re_free (state->trtable); 1620 re_free (state); 1621 } 1622 1623 /* Create the new state which is independ of contexts. 1624 Return the new state if succeeded, otherwise return NULL. */ 1625 1626 static re_dfastate_t * 1627 internal_function __attribute_warn_unused_result__ 1628 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes, 1629 re_hashval_t hash) 1630 { 1631 Idx i; 1632 reg_errcode_t err; 1633 re_dfastate_t *newstate; 1634 1635 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1); 1636 if (BE (newstate == NULL, 0)) 1637 return NULL; 1638 err = re_node_set_init_copy (&newstate->nodes, nodes); 1639 if (BE (err != REG_NOERROR, 0)) 1640 { 1641 re_free (newstate); 1642 return NULL; 1643 } 1644 1645 newstate->entrance_nodes = &newstate->nodes; 1646 for (i = 0 ; i < nodes->nelem ; i++) 1647 { 1648 re_token_t *node = dfa->nodes + nodes->elems[i]; 1649 re_token_type_t type = node->type; 1650 if (type == CHARACTER && !node->constraint) 1651 continue; 1652 #ifdef RE_ENABLE_I18N 1653 newstate->accept_mb |= node->accept_mb; 1654 #endif /* RE_ENABLE_I18N */ 1655 1656 /* If the state has the halt node, the state is a halt state. */ 1657 if (type == END_OF_RE) 1658 newstate->halt = 1; 1659 else if (type == OP_BACK_REF) 1660 newstate->has_backref = 1; 1661 else if (type == ANCHOR || node->constraint) 1662 newstate->has_constraint = 1; 1663 } 1664 err = register_state (dfa, newstate, hash); 1665 if (BE (err != REG_NOERROR, 0)) 1666 { 1667 free_state (newstate); 1668 newstate = NULL; 1669 } 1670 return newstate; 1671 } 1672 1673 /* Create the new state which is depend on the context CONTEXT. 1674 Return the new state if succeeded, otherwise return NULL. */ 1675 1676 static re_dfastate_t * 1677 internal_function __attribute_warn_unused_result__ 1678 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes, 1679 unsigned int context, re_hashval_t hash) 1680 { 1681 Idx i, nctx_nodes = 0; 1682 reg_errcode_t err; 1683 re_dfastate_t *newstate; 1684 1685 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1); 1686 if (BE (newstate == NULL, 0)) 1687 return NULL; 1688 err = re_node_set_init_copy (&newstate->nodes, nodes); 1689 if (BE (err != REG_NOERROR, 0)) 1690 { 1691 re_free (newstate); 1692 return NULL; 1693 } 1694 1695 newstate->context = context; 1696 newstate->entrance_nodes = &newstate->nodes; 1697 1698 for (i = 0 ; i < nodes->nelem ; i++) 1699 { 1700 re_token_t *node = dfa->nodes + nodes->elems[i]; 1701 re_token_type_t type = node->type; 1702 unsigned int constraint = node->constraint; 1703 1704 if (type == CHARACTER && !constraint) 1705 continue; 1706 #ifdef RE_ENABLE_I18N 1707 newstate->accept_mb |= node->accept_mb; 1708 #endif /* RE_ENABLE_I18N */ 1709 1710 /* If the state has the halt node, the state is a halt state. */ 1711 if (type == END_OF_RE) 1712 newstate->halt = 1; 1713 else if (type == OP_BACK_REF) 1714 newstate->has_backref = 1; 1715 1716 if (constraint) 1717 { 1718 if (newstate->entrance_nodes == &newstate->nodes) 1719 { 1720 newstate->entrance_nodes = re_malloc (re_node_set, 1); 1721 if (BE (newstate->entrance_nodes == NULL, 0)) 1722 { 1723 free_state (newstate); 1724 return NULL; 1725 } 1726 if (re_node_set_init_copy (newstate->entrance_nodes, nodes) 1727 != REG_NOERROR) 1728 return NULL; 1729 nctx_nodes = 0; 1730 newstate->has_constraint = 1; 1731 } 1732 1733 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context)) 1734 { 1735 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes); 1736 ++nctx_nodes; 1737 } 1738 } 1739 } 1740 err = register_state (dfa, newstate, hash); 1741 if (BE (err != REG_NOERROR, 0)) 1742 { 1743 free_state (newstate); 1744 newstate = NULL; 1745 } 1746 return newstate; 1747 } 1748