1 /* $NetBSD: ure.c,v 1.1.1.3 2010/12/12 15:22:07 adam Exp $ */ 2 3 /* OpenLDAP: pkg/ldap/libraries/liblunicode/ure/ure.c,v 1.17.2.5 2010/04/13 20:23:04 kurt Exp */ 4 /* This work is part of OpenLDAP Software <http://www.openldap.org/>. 5 * 6 * Copyright 1998-2010 The OpenLDAP Foundation. 7 * All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted only as authorized by the OpenLDAP 11 * Public License. 12 * 13 * A copy of this license is available in file LICENSE in the 14 * top-level directory of the distribution or, alternatively, at 15 * <http://www.OpenLDAP.org/license.html>. 16 */ 17 /* Copyright 1997, 1998, 1999 Computing Research Labs, 18 * New Mexico State University 19 * 20 * Permission is hereby granted, free of charge, to any person obtaining a 21 * copy of this software and associated documentation files (the "Software"), 22 * to deal in the Software without restriction, including without limitation 23 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 24 * and/or sell copies of the Software, and to permit persons to whom the 25 * Software is furnished to do so, subject to the following conditions: 26 * 27 * The above copyright notice and this permission notice shall be included in 28 * all copies or substantial portions of the Software. 29 * 30 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 31 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 32 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 33 * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY 34 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 35 * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR 36 * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 37 */ 38 /* Id: ure.c,v 1.2 1999/09/21 15:47:43 mleisher Exp" */ 39 40 #include "portable.h" 41 42 #include <ac/stdlib.h> 43 #include <ac/string.h> 44 #include <ac/unistd.h> 45 46 #include "ure.h" 47 48 /* 49 * Flags used internally in the DFA. 50 */ 51 #define _URE_DFA_CASEFOLD 0x01 52 #define _URE_DFA_BLANKLINE 0x02 53 54 static unsigned long cclass_flags[] = { 55 0, 56 _URE_NONSPACING, 57 _URE_COMBINING, 58 _URE_NUMDIGIT, 59 _URE_NUMOTHER, 60 _URE_SPACESEP, 61 _URE_LINESEP, 62 _URE_PARASEP, 63 _URE_CNTRL, 64 _URE_PUA, 65 _URE_UPPER, 66 _URE_LOWER, 67 _URE_TITLE, 68 _URE_MODIFIER, 69 _URE_OTHERLETTER, 70 _URE_DASHPUNCT, 71 _URE_OPENPUNCT, 72 _URE_CLOSEPUNCT, 73 _URE_OTHERPUNCT, 74 _URE_MATHSYM, 75 _URE_CURRENCYSYM, 76 _URE_OTHERSYM, 77 _URE_LTR, 78 _URE_RTL, 79 _URE_EURONUM, 80 _URE_EURONUMSEP, 81 _URE_EURONUMTERM, 82 _URE_ARABNUM, 83 _URE_COMMONSEP, 84 _URE_BLOCKSEP, 85 _URE_SEGMENTSEP, 86 _URE_WHITESPACE, 87 _URE_OTHERNEUT, 88 }; 89 90 /* 91 * Symbol types for the DFA. 92 */ 93 #define _URE_ANY_CHAR 1 94 #define _URE_CHAR 2 95 #define _URE_CCLASS 3 96 #define _URE_NCCLASS 4 97 #define _URE_BOL_ANCHOR 5 98 #define _URE_EOL_ANCHOR 6 99 100 /* 101 * Op codes for converting the NFA to a DFA. 102 */ 103 #define _URE_SYMBOL 10 104 #define _URE_PAREN 11 105 #define _URE_QUEST 12 106 #define _URE_STAR 13 107 #define _URE_PLUS 14 108 #define _URE_ONE 15 109 #define _URE_AND 16 110 #define _URE_OR 17 111 112 #define _URE_NOOP 0xffff 113 114 #define _URE_REGSTART 0x8000 115 #define _URE_REGEND 0x4000 116 117 /* 118 * Structure used to handle a compacted range of characters. 119 */ 120 typedef struct { 121 ucs4_t min_code; 122 ucs4_t max_code; 123 } _ure_range_t; 124 125 typedef struct { 126 _ure_range_t *ranges; 127 ucs2_t ranges_used; 128 ucs2_t ranges_size; 129 } _ure_ccl_t; 130 131 typedef union { 132 ucs4_t chr; 133 _ure_ccl_t ccl; 134 } _ure_sym_t; 135 136 /* 137 * This is a general element structure used for expressions and stack 138 * elements. 139 */ 140 typedef struct { 141 ucs2_t reg; 142 ucs2_t onstack; 143 ucs2_t type; 144 ucs2_t lhs; 145 ucs2_t rhs; 146 } _ure_elt_t; 147 148 /* 149 * This is a structure used to track a list or a stack of states. 150 */ 151 typedef struct { 152 ucs2_t *slist; 153 ucs2_t slist_size; 154 ucs2_t slist_used; 155 } _ure_stlist_t; 156 157 /* 158 * Structure to track the list of unique states for a symbol 159 * during reduction. 160 */ 161 typedef struct { 162 ucs2_t id; 163 ucs2_t type; 164 unsigned long mods; 165 unsigned long props; 166 _ure_sym_t sym; 167 _ure_stlist_t states; 168 } _ure_symtab_t; 169 170 /* 171 * Structure to hold a single state. 172 */ 173 typedef struct { 174 ucs2_t id; 175 ucs2_t accepting; 176 ucs2_t pad; 177 _ure_stlist_t st; 178 _ure_elt_t *trans; 179 ucs2_t trans_size; 180 ucs2_t trans_used; 181 } _ure_state_t; 182 183 /* 184 * Structure used for keeping lists of states. 185 */ 186 typedef struct { 187 _ure_state_t *states; 188 ucs2_t states_size; 189 ucs2_t states_used; 190 } _ure_statetable_t; 191 192 /* 193 * Structure to track pairs of DFA states when equivalent states are 194 * merged. 195 */ 196 typedef struct { 197 ucs2_t l; 198 ucs2_t r; 199 } _ure_equiv_t; 200 201 /* 202 * Structure used for constructing the NFA and reducing to a minimal DFA. 203 */ 204 typedef struct _ure_buffer_t { 205 int reducing; 206 int error; 207 unsigned long flags; 208 209 _ure_stlist_t stack; 210 211 /* 212 * Table of unique symbols encountered. 213 */ 214 _ure_symtab_t *symtab; 215 ucs2_t symtab_size; 216 ucs2_t symtab_used; 217 218 /* 219 * Tracks the unique expressions generated for the NFA and when the NFA is 220 * reduced. 221 */ 222 _ure_elt_t *expr; 223 ucs2_t expr_used; 224 ucs2_t expr_size; 225 226 /* 227 * The reduced table of unique groups of NFA states. 228 */ 229 _ure_statetable_t states; 230 231 /* 232 * Tracks states when equivalent states are merged. 233 */ 234 _ure_equiv_t *equiv; 235 ucs2_t equiv_used; 236 ucs2_t equiv_size; 237 } _ure_buffer_t; 238 239 typedef struct { 240 ucs2_t symbol; 241 ucs2_t next_state; 242 } _ure_trans_t; 243 244 typedef struct { 245 ucs2_t accepting; 246 ucs2_t ntrans; 247 _ure_trans_t *trans; 248 } _ure_dstate_t; 249 250 typedef struct _ure_dfa_t { 251 unsigned long flags; 252 253 _ure_symtab_t *syms; 254 ucs2_t nsyms; 255 256 _ure_dstate_t *states; 257 ucs2_t nstates; 258 259 _ure_trans_t *trans; 260 ucs2_t ntrans; 261 } _ure_dfa_t; 262 263 /************************************************************************* 264 * 265 * Functions. 266 * 267 *************************************************************************/ 268 269 static void 270 _ure_memmove(char *dest, char *src, unsigned long bytes) 271 { 272 long i, j; 273 274 i = (long) bytes; 275 j = i & 7; 276 i = (i + 7) >> 3; 277 278 /* 279 * Do a memmove using Ye Olde Duff's Device for efficiency. 280 */ 281 if (src < dest) { 282 src += bytes; 283 dest += bytes; 284 285 switch (j) { 286 case 0: do { 287 *--dest = *--src; 288 case 7: *--dest = *--src; 289 case 6: *--dest = *--src; 290 case 5: *--dest = *--src; 291 case 4: *--dest = *--src; 292 case 3: *--dest = *--src; 293 case 2: *--dest = *--src; 294 case 1: *--dest = *--src; 295 } while (--i > 0); 296 } 297 } else if (src > dest) { 298 switch (j) { 299 case 0: do { 300 *dest++ = *src++; 301 case 7: *dest++ = *src++; 302 case 6: *dest++ = *src++; 303 case 5: *dest++ = *src++; 304 case 4: *dest++ = *src++; 305 case 3: *dest++ = *src++; 306 case 2: *dest++ = *src++; 307 case 1: *dest++ = *src++; 308 } while (--i > 0); 309 } 310 } 311 } 312 313 static void 314 _ure_push(ucs2_t v, _ure_buffer_t *b) 315 { 316 _ure_stlist_t *s; 317 318 if (b == 0) 319 return; 320 321 /* 322 * If the `reducing' parameter is non-zero, check to see if the value 323 * passed is already on the stack. 324 */ 325 if (b->reducing != 0 && b->expr[v].onstack != 0) 326 return; 327 328 s = &b->stack; 329 if (s->slist_used == s->slist_size) { 330 if (s->slist_size == 0) 331 s->slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3); 332 else 333 s->slist = (ucs2_t *) realloc((char *) s->slist, 334 sizeof(ucs2_t) * (s->slist_size + 8)); 335 s->slist_size += 8; 336 } 337 s->slist[s->slist_used++] = v; 338 339 /* 340 * If the `reducing' parameter is non-zero, flag the element as being on 341 * the stack. 342 */ 343 if (b->reducing != 0) 344 b->expr[v].onstack = 1; 345 } 346 347 static ucs2_t 348 _ure_peek(_ure_buffer_t *b) 349 { 350 if (b == 0 || b->stack.slist_used == 0) 351 return _URE_NOOP; 352 353 return b->stack.slist[b->stack.slist_used - 1]; 354 } 355 356 static ucs2_t 357 _ure_pop(_ure_buffer_t *b) 358 { 359 ucs2_t v; 360 361 if (b == 0 || b->stack.slist_used == 0) 362 return _URE_NOOP; 363 364 v = b->stack.slist[--b->stack.slist_used]; 365 if (b->reducing) 366 b->expr[v].onstack = 0; 367 368 return v; 369 } 370 371 /************************************************************************* 372 * 373 * Start symbol parse functions. 374 * 375 *************************************************************************/ 376 377 /* 378 * Parse a comma-separated list of integers that represent character 379 * properties. Combine them into a mask that is returned in the `mask' 380 * variable, and return the number of characters consumed. 381 */ 382 static unsigned long 383 _ure_prop_list(ucs2_t *pp, unsigned long limit, unsigned long *mask, 384 _ure_buffer_t *b) 385 { 386 unsigned long n, m; 387 ucs2_t *sp, *ep; 388 389 sp = pp; 390 ep = sp + limit; 391 392 for (m = n = 0; b->error == _URE_OK && sp < ep; sp++) { 393 if (*sp == ',') { 394 /* 395 * Encountered a comma, so select the next character property flag 396 * and reset the number. 397 */ 398 m |= cclass_flags[n]; 399 n = 0; 400 } else if (*sp >= '0' && *sp <= '9') 401 /* 402 * Encountered a digit, so start or continue building the cardinal 403 * that represents the character property flag. 404 */ 405 n = (n * 10) + (*sp - '0'); 406 else 407 /* 408 * Encountered something that is not part of the property list. 409 * Indicate that we are done. 410 */ 411 break; 412 413 /* 414 * If a property number greater than 32 occurs, then there is a 415 * problem. Most likely a missing comma separator. 416 */ 417 if (n > 32) 418 b->error = _URE_INVALID_PROPERTY; 419 } 420 421 if (n != 0) 422 m |= cclass_flags[n]; 423 424 /* 425 * Set the mask that represents the group of character properties. 426 */ 427 *mask = m; 428 429 /* 430 * Return the number of characters consumed. 431 */ 432 return sp - pp; 433 } 434 435 /* 436 * Collect a hex number with 1 to 4 digits and return the number 437 * of characters used. 438 */ 439 static unsigned long 440 _ure_hex(ucs2_t *np, unsigned long limit, ucs4_t *n) 441 { 442 ucs2_t i; 443 ucs2_t *sp, *ep; 444 ucs4_t nn; 445 446 sp = np; 447 ep = sp + limit; 448 449 for (nn = 0, i = 0; i < 4 && sp < ep; i++, sp++) { 450 if (*sp >= '0' && *sp <= '9') 451 nn = (nn << 4) + (*sp - '0'); 452 else if (*sp >= 'A' && *sp <= 'F') 453 nn = (nn << 4) + ((*sp - 'A') + 10); 454 else if (*sp >= 'a' && *sp <= 'f') 455 nn = (nn << 4) + ((*sp - 'a') + 10); 456 else 457 /* 458 * Encountered something that is not a hex digit. 459 */ 460 break; 461 } 462 463 /* 464 * Assign the character code collected and return the number of 465 * characters used. 466 */ 467 *n = nn; 468 469 return sp - np; 470 } 471 472 /* 473 * Insert a range into a character class, removing duplicates and ordering 474 * them in increasing range-start order. 475 */ 476 static void 477 _ure_add_range(_ure_ccl_t *ccl, _ure_range_t *r, _ure_buffer_t *b) 478 { 479 ucs2_t i; 480 ucs4_t tmp; 481 _ure_range_t *rp; 482 483 /* 484 * If the `casefold' flag is set, then make sure both endpoints of the 485 * range are converted to lower case. 486 */ 487 if (b->flags & _URE_DFA_CASEFOLD) { 488 r->min_code = _ure_tolower(r->min_code); 489 r->max_code = _ure_tolower(r->max_code); 490 } 491 492 /* 493 * Swap the range endpoints if they are not in increasing order. 494 */ 495 if (r->min_code > r->max_code) { 496 tmp = r->min_code; 497 r->min_code = r->max_code; 498 r->max_code = tmp; 499 } 500 501 for (i = 0, rp = ccl->ranges; 502 i < ccl->ranges_used && r->min_code < rp->min_code; i++, rp++) ; 503 504 /* 505 * Check for a duplicate. 506 */ 507 if (i < ccl->ranges_used && 508 r->min_code == rp->min_code && r->max_code == rp->max_code) 509 return; 510 511 if (ccl->ranges_used == ccl->ranges_size) { 512 if (ccl->ranges_size == 0) 513 ccl->ranges = (_ure_range_t *) malloc(sizeof(_ure_range_t) << 3); 514 else 515 ccl->ranges = (_ure_range_t *) 516 realloc((char *) ccl->ranges, 517 sizeof(_ure_range_t) * (ccl->ranges_size + 8)); 518 ccl->ranges_size += 8; 519 } 520 521 rp = ccl->ranges + ccl->ranges_used; 522 523 if (i < ccl->ranges_used) 524 _ure_memmove((char *) (rp + 1), (char *) rp, 525 sizeof(_ure_range_t) * (ccl->ranges_used - i)); 526 527 ccl->ranges_used++; 528 rp->min_code = r->min_code; 529 rp->max_code = r->max_code; 530 } 531 532 #define _URE_ALPHA_MASK (_URE_UPPER|_URE_LOWER|_URE_OTHERLETTER|\ 533 _URE_MODIFIER|_URE_TITLE|_URE_NONSPACING|_URE_COMBINING) 534 #define _URE_ALNUM_MASK (_URE_ALPHA_MASK|_URE_NUMDIGIT) 535 #define _URE_PUNCT_MASK (_URE_DASHPUNCT|_URE_OPENPUNCT|_URE_CLOSEPUNCT|\ 536 _URE_OTHERPUNCT) 537 #define _URE_GRAPH_MASK (_URE_NUMDIGIT|_URE_NUMOTHER|_URE_ALPHA_MASK|\ 538 _URE_MATHSYM|_URE_CURRENCYSYM|_URE_OTHERSYM) 539 #define _URE_PRINT_MASK (_URE_GRAPH_MASK|_URE_SPACESEP) 540 #define _URE_SPACE_MASK (_URE_SPACESEP|_URE_LINESEP|_URE_PARASEP) 541 542 typedef void (*_ure_cclsetup_t)( 543 _ure_symtab_t *sym, 544 unsigned long mask, 545 _ure_buffer_t *b 546 ); 547 548 typedef struct { 549 ucs2_t key; 550 unsigned long len; 551 unsigned long next; 552 _ure_cclsetup_t func; 553 unsigned long mask; 554 } _ure_trie_t; 555 556 static void 557 _ure_ccl_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b) 558 { 559 sym->props |= mask; 560 } 561 562 static void 563 _ure_space_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b) 564 { 565 _ure_range_t range; 566 567 sym->props |= mask; 568 569 /* 570 * Add the additional characters needed for handling isspace(). 571 */ 572 range.min_code = range.max_code = '\t'; 573 _ure_add_range(&sym->sym.ccl, &range, b); 574 range.min_code = range.max_code = '\r'; 575 _ure_add_range(&sym->sym.ccl, &range, b); 576 range.min_code = range.max_code = '\n'; 577 _ure_add_range(&sym->sym.ccl, &range, b); 578 range.min_code = range.max_code = '\f'; 579 _ure_add_range(&sym->sym.ccl, &range, b); 580 range.min_code = range.max_code = 0xfeff; 581 _ure_add_range(&sym->sym.ccl, &range, b); 582 } 583 584 static void 585 _ure_xdigit_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b) 586 { 587 _ure_range_t range; 588 589 /* 590 * Add the additional characters needed for handling isxdigit(). 591 */ 592 range.min_code = '0'; 593 range.max_code = '9'; 594 _ure_add_range(&sym->sym.ccl, &range, b); 595 range.min_code = 'A'; 596 range.max_code = 'F'; 597 _ure_add_range(&sym->sym.ccl, &range, b); 598 range.min_code = 'a'; 599 range.max_code = 'f'; 600 _ure_add_range(&sym->sym.ccl, &range, b); 601 } 602 603 static _ure_trie_t cclass_trie[] = { 604 {0x003a, 1, 1, 0, 0}, 605 {0x0061, 9, 10, 0, 0}, 606 {0x0063, 8, 19, 0, 0}, 607 {0x0064, 7, 24, 0, 0}, 608 {0x0067, 6, 29, 0, 0}, 609 {0x006c, 5, 34, 0, 0}, 610 {0x0070, 4, 39, 0, 0}, 611 {0x0073, 3, 49, 0, 0}, 612 {0x0075, 2, 54, 0, 0}, 613 {0x0078, 1, 59, 0, 0}, 614 {0x006c, 1, 11, 0, 0}, 615 {0x006e, 2, 13, 0, 0}, 616 {0x0070, 1, 16, 0, 0}, 617 {0x0075, 1, 14, 0, 0}, 618 {0x006d, 1, 15, 0, 0}, 619 {0x003a, 1, 16, _ure_ccl_setup, _URE_ALNUM_MASK}, 620 {0x0068, 1, 17, 0, 0}, 621 {0x0061, 1, 18, 0, 0}, 622 {0x003a, 1, 19, _ure_ccl_setup, _URE_ALPHA_MASK}, 623 {0x006e, 1, 20, 0, 0}, 624 {0x0074, 1, 21, 0, 0}, 625 {0x0072, 1, 22, 0, 0}, 626 {0x006c, 1, 23, 0, 0}, 627 {0x003a, 1, 24, _ure_ccl_setup, _URE_CNTRL}, 628 {0x0069, 1, 25, 0, 0}, 629 {0x0067, 1, 26, 0, 0}, 630 {0x0069, 1, 27, 0, 0}, 631 {0x0074, 1, 28, 0, 0}, 632 {0x003a, 1, 29, _ure_ccl_setup, _URE_NUMDIGIT}, 633 {0x0072, 1, 30, 0, 0}, 634 {0x0061, 1, 31, 0, 0}, 635 {0x0070, 1, 32, 0, 0}, 636 {0x0068, 1, 33, 0, 0}, 637 {0x003a, 1, 34, _ure_ccl_setup, _URE_GRAPH_MASK}, 638 {0x006f, 1, 35, 0, 0}, 639 {0x0077, 1, 36, 0, 0}, 640 {0x0065, 1, 37, 0, 0}, 641 {0x0072, 1, 38, 0, 0}, 642 {0x003a, 1, 39, _ure_ccl_setup, _URE_LOWER}, 643 {0x0072, 2, 41, 0, 0}, 644 {0x0075, 1, 45, 0, 0}, 645 {0x0069, 1, 42, 0, 0}, 646 {0x006e, 1, 43, 0, 0}, 647 {0x0074, 1, 44, 0, 0}, 648 {0x003a, 1, 45, _ure_ccl_setup, _URE_PRINT_MASK}, 649 {0x006e, 1, 46, 0, 0}, 650 {0x0063, 1, 47, 0, 0}, 651 {0x0074, 1, 48, 0, 0}, 652 {0x003a, 1, 49, _ure_ccl_setup, _URE_PUNCT_MASK}, 653 {0x0070, 1, 50, 0, 0}, 654 {0x0061, 1, 51, 0, 0}, 655 {0x0063, 1, 52, 0, 0}, 656 {0x0065, 1, 53, 0, 0}, 657 {0x003a, 1, 54, _ure_space_setup, _URE_SPACE_MASK}, 658 {0x0070, 1, 55, 0, 0}, 659 {0x0070, 1, 56, 0, 0}, 660 {0x0065, 1, 57, 0, 0}, 661 {0x0072, 1, 58, 0, 0}, 662 {0x003a, 1, 59, _ure_ccl_setup, _URE_UPPER}, 663 {0x0064, 1, 60, 0, 0}, 664 {0x0069, 1, 61, 0, 0}, 665 {0x0067, 1, 62, 0, 0}, 666 {0x0069, 1, 63, 0, 0}, 667 {0x0074, 1, 64, 0, 0}, 668 {0x003a, 1, 65, _ure_xdigit_setup, 0}, 669 }; 670 671 /* 672 * Probe for one of the POSIX colon delimited character classes in the static 673 * trie. 674 */ 675 static unsigned long 676 _ure_posix_ccl(ucs2_t *cp, unsigned long limit, _ure_symtab_t *sym, 677 _ure_buffer_t *b) 678 { 679 int i; 680 unsigned long n; 681 _ure_trie_t *tp; 682 ucs2_t *sp, *ep; 683 684 /* 685 * If the number of characters left is less than 7, then this cannot be 686 * interpreted as one of the colon delimited classes. 687 */ 688 if (limit < 7) 689 return 0; 690 691 sp = cp; 692 ep = sp + limit; 693 tp = cclass_trie; 694 for (i = 0; sp < ep && i < 8; i++, sp++) { 695 n = tp->len; 696 697 for (; n > 0 && tp->key != *sp; tp++, n--) ; 698 699 if (n == 0) 700 return 0; 701 702 if (*sp == ':' && (i == 6 || i == 7)) { 703 sp++; 704 break; 705 } 706 if (sp + 1 < ep) 707 tp = cclass_trie + tp->next; 708 } 709 if (tp->func == 0) 710 return 0; 711 712 (*tp->func)(sym, tp->mask, b); 713 714 return sp - cp; 715 } 716 717 /* 718 * Construct a list of ranges and return the number of characters consumed. 719 */ 720 static unsigned long 721 _ure_cclass(ucs2_t *cp, unsigned long limit, _ure_symtab_t *symp, 722 _ure_buffer_t *b) 723 { 724 int range_end; 725 unsigned long n; 726 ucs2_t *sp, *ep; 727 ucs4_t c, last; 728 _ure_ccl_t *cclp; 729 _ure_range_t range; 730 731 sp = cp; 732 ep = sp + limit; 733 734 if (*sp == '^') { 735 symp->type = _URE_NCCLASS; 736 sp++; 737 } else 738 symp->type = _URE_CCLASS; 739 740 for (last = 0, range_end = 0; 741 b->error == _URE_OK && sp < ep && *sp != ']'; ) { 742 c = *sp++; 743 if (c == '\\') { 744 if (sp == ep) { 745 /* 746 * The EOS was encountered when expecting the reverse solidus 747 * to be followed by the character it is escaping. Set an 748 * error code and return the number of characters consumed up 749 * to this point. 750 */ 751 b->error = _URE_UNEXPECTED_EOS; 752 return sp - cp; 753 } 754 755 c = *sp++; 756 switch (c) { 757 case 'a': 758 c = 0x07; 759 break; 760 case 'b': 761 c = 0x08; 762 break; 763 case 'f': 764 c = 0x0c; 765 break; 766 case 'n': 767 c = 0x0a; 768 break; 769 case 'r': 770 c = 0x0d; 771 break; 772 case 't': 773 c = 0x09; 774 break; 775 case 'v': 776 c = 0x0b; 777 break; 778 case 'p': 779 case 'P': 780 sp += _ure_prop_list(sp, ep - sp, &symp->props, b); 781 /* 782 * Invert the bit mask of the properties if this is a negated 783 * character class or if 'P' is used to specify a list of 784 * character properties that should *not* match in a 785 * character class. 786 */ 787 if (c == 'P') 788 symp->props = ~symp->props; 789 continue; 790 break; 791 case 'x': 792 case 'X': 793 case 'u': 794 case 'U': 795 if (sp < ep && 796 ((*sp >= '0' && *sp <= '9') || 797 (*sp >= 'A' && *sp <= 'F') || 798 (*sp >= 'a' && *sp <= 'f'))) 799 sp += _ure_hex(sp, ep - sp, &c); 800 } 801 } else if (c == ':') { 802 /* 803 * Probe for a POSIX colon delimited character class. 804 */ 805 sp--; 806 if ((n = _ure_posix_ccl(sp, ep - sp, symp, b)) == 0) 807 sp++; 808 else { 809 sp += n; 810 continue; 811 } 812 } 813 814 cclp = &symp->sym.ccl; 815 816 /* 817 * Check to see if the current character is a low surrogate that needs 818 * to be combined with a preceding high surrogate. 819 */ 820 if (last != 0) { 821 if (c >= 0xdc00 && c <= 0xdfff) 822 /* 823 * Construct the UTF16 character code. 824 */ 825 c = 0x10000 + (((last & 0x03ff) << 10) | (c & 0x03ff)); 826 else { 827 /* 828 * Add the isolated high surrogate to the range. 829 */ 830 if (range_end == 1) 831 range.max_code = last & 0xffff; 832 else 833 range.min_code = range.max_code = last & 0xffff; 834 835 _ure_add_range(cclp, &range, b); 836 range_end = 0; 837 } 838 } 839 840 /* 841 * Clear the last character code. 842 */ 843 last = 0; 844 845 /* 846 * This slightly awkward code handles the different cases needed to 847 * construct a range. 848 */ 849 if (c >= 0xd800 && c <= 0xdbff) { 850 /* 851 * If the high surrogate is followed by a range indicator, simply 852 * add it as the range start. Otherwise, save it in case the next 853 * character is a low surrogate. 854 */ 855 if (*sp == '-') { 856 sp++; 857 range.min_code = c; 858 range_end = 1; 859 } else 860 last = c; 861 } else if (range_end == 1) { 862 range.max_code = c; 863 _ure_add_range(cclp, &range, b); 864 range_end = 0; 865 } else { 866 range.min_code = range.max_code = c; 867 if (*sp == '-') { 868 sp++; 869 range_end = 1; 870 } else 871 _ure_add_range(cclp, &range, b); 872 } 873 } 874 875 if (sp < ep && *sp == ']') 876 sp++; 877 else 878 /* 879 * The parse was not terminated by the character class close symbol 880 * (']'), so set an error code. 881 */ 882 b->error = _URE_CCLASS_OPEN; 883 884 return sp - cp; 885 } 886 887 /* 888 * Probe for a low surrogate hex code. 889 */ 890 static unsigned long 891 _ure_probe_ls(ucs2_t *ls, unsigned long limit, ucs4_t *c) 892 { 893 ucs4_t i, code; 894 ucs2_t *sp, *ep; 895 896 for (i = code = 0, sp = ls, ep = sp + limit; i < 4 && sp < ep; sp++) { 897 if (*sp >= '0' && *sp <= '9') 898 code = (code << 4) + (*sp - '0'); 899 else if (*sp >= 'A' && *sp <= 'F') 900 code = (code << 4) + ((*sp - 'A') + 10); 901 else if (*sp >= 'a' && *sp <= 'f') 902 code = (code << 4) + ((*sp - 'a') + 10); 903 else 904 break; 905 } 906 907 *c = code; 908 return (0xdc00 <= code && code <= 0xdfff) ? sp - ls : 0; 909 } 910 911 static unsigned long 912 _ure_compile_symbol(ucs2_t *sym, unsigned long limit, _ure_symtab_t *symp, 913 _ure_buffer_t *b) 914 { 915 ucs4_t c; 916 ucs2_t *sp, *ep; 917 918 sp = sym; 919 ep = sym + limit; 920 921 if ((c = *sp++) == '\\') { 922 923 if (sp == ep) { 924 /* 925 * The EOS was encountered when expecting the reverse solidus to 926 * be followed by the character it is escaping. Set an error code 927 * and return the number of characters consumed up to this point. 928 */ 929 b->error = _URE_UNEXPECTED_EOS; 930 return sp - sym; 931 } 932 933 c = *sp++; 934 switch (c) { 935 case 'p': 936 case 'P': 937 symp->type = (c == 'p') ? _URE_CCLASS : _URE_NCCLASS; 938 sp += _ure_prop_list(sp, ep - sp, &symp->props, b); 939 break; 940 case 'a': 941 symp->type = _URE_CHAR; 942 symp->sym.chr = 0x07; 943 break; 944 case 'b': 945 symp->type = _URE_CHAR; 946 symp->sym.chr = 0x08; 947 break; 948 case 'f': 949 symp->type = _URE_CHAR; 950 symp->sym.chr = 0x0c; 951 break; 952 case 'n': 953 symp->type = _URE_CHAR; 954 symp->sym.chr = 0x0a; 955 break; 956 case 'r': 957 symp->type = _URE_CHAR; 958 symp->sym.chr = 0x0d; 959 break; 960 case 't': 961 symp->type = _URE_CHAR; 962 symp->sym.chr = 0x09; 963 break; 964 case 'v': 965 symp->type = _URE_CHAR; 966 symp->sym.chr = 0x0b; 967 break; 968 case 'x': 969 case 'X': 970 case 'u': 971 case 'U': 972 /* 973 * Collect between 1 and 4 digits representing a UCS2 code. Fall 974 * through to the next case. 975 */ 976 if (sp < ep && 977 ((*sp >= '0' && *sp <= '9') || 978 (*sp >= 'A' && *sp <= 'F') || 979 (*sp >= 'a' && *sp <= 'f'))) 980 sp += _ure_hex(sp, ep - sp, &c); 981 /* FALLTHROUGH */ 982 default: 983 /* 984 * Simply add an escaped character here. 985 */ 986 symp->type = _URE_CHAR; 987 symp->sym.chr = c; 988 } 989 } else if (c == '^' || c == '$') 990 /* 991 * Handle the BOL and EOL anchors. This actually consists simply of 992 * setting a flag that indicates that the user supplied anchor match 993 * function should be called. This needs to be done instead of simply 994 * matching line/paragraph separators because beginning-of-text and 995 * end-of-text tests are needed as well. 996 */ 997 symp->type = (c == '^') ? _URE_BOL_ANCHOR : _URE_EOL_ANCHOR; 998 else if (c == '[') 999 /* 1000 * Construct a character class. 1001 */ 1002 sp += _ure_cclass(sp, ep - sp, symp, b); 1003 else if (c == '.') 1004 symp->type = _URE_ANY_CHAR; 1005 else { 1006 symp->type = _URE_CHAR; 1007 symp->sym.chr = c; 1008 } 1009 1010 /* 1011 * If the symbol type happens to be a character and is a high surrogate, 1012 * then probe forward to see if it is followed by a low surrogate that 1013 * needs to be added. 1014 */ 1015 if (sp < ep && symp->type == _URE_CHAR && 1016 0xd800 <= symp->sym.chr && symp->sym.chr <= 0xdbff) { 1017 1018 if (0xdc00 <= *sp && *sp <= 0xdfff) { 1019 symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) | 1020 (*sp & 0x03ff)); 1021 sp++; 1022 } else if (*sp == '\\' && (*(sp + 1) == 'x' || *(sp + 1) == 'X' || 1023 *(sp + 1) == 'u' || *(sp + 1) == 'U')) { 1024 sp += _ure_probe_ls(sp + 2, ep - (sp + 2), &c); 1025 if (0xdc00 <= c && c <= 0xdfff) { 1026 /* 1027 * Take into account the \[xu] in front of the hex code. 1028 */ 1029 sp += 2; 1030 symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) | 1031 (c & 0x03ff)); 1032 } 1033 } 1034 } 1035 1036 /* 1037 * Last, make sure any _URE_CHAR type symbols are changed to lower case if 1038 * the `casefold' flag is set. 1039 */ 1040 if ((b->flags & _URE_DFA_CASEFOLD) && symp->type == _URE_CHAR) 1041 symp->sym.chr = _ure_tolower(symp->sym.chr); 1042 1043 /* 1044 * If the symbol constructed is anything other than one of the anchors, 1045 * make sure the _URE_DFA_BLANKLINE flag is removed. 1046 */ 1047 if (symp->type != _URE_BOL_ANCHOR && symp->type != _URE_EOL_ANCHOR) 1048 b->flags &= ~_URE_DFA_BLANKLINE; 1049 1050 /* 1051 * Return the number of characters consumed. 1052 */ 1053 return sp - sym; 1054 } 1055 1056 static int 1057 _ure_sym_neq(_ure_symtab_t *a, _ure_symtab_t *b) 1058 { 1059 if (a->type != b->type || a->mods != b->mods || a->props != b->props) 1060 return 1; 1061 1062 if (a->type == _URE_CCLASS || a->type == _URE_NCCLASS) { 1063 if (a->sym.ccl.ranges_used != b->sym.ccl.ranges_used) 1064 return 1; 1065 if (a->sym.ccl.ranges_used > 0 && 1066 memcmp((char *) a->sym.ccl.ranges, (char *) b->sym.ccl.ranges, 1067 sizeof(_ure_range_t) * a->sym.ccl.ranges_used) != 0) 1068 return 1; 1069 } else if (a->type == _URE_CHAR && a->sym.chr != b->sym.chr) 1070 return 1; 1071 return 0; 1072 } 1073 1074 /* 1075 * Construct a symbol, but only keep unique symbols. 1076 */ 1077 static ucs2_t 1078 _ure_make_symbol(ucs2_t *sym, unsigned long limit, unsigned long *consumed, 1079 _ure_buffer_t *b) 1080 { 1081 ucs2_t i; 1082 _ure_symtab_t *sp, symbol; 1083 1084 /* 1085 * Build the next symbol so we can test to see if it is already in the 1086 * symbol table. 1087 */ 1088 (void) memset((char *) &symbol, '\0', sizeof(_ure_symtab_t)); 1089 *consumed = _ure_compile_symbol(sym, limit, &symbol, b); 1090 1091 /* 1092 * Check to see if the symbol exists. 1093 */ 1094 for (i = 0, sp = b->symtab; 1095 i < b->symtab_used && _ure_sym_neq(&symbol, sp); i++, sp++) ; 1096 1097 if (i < b->symtab_used) { 1098 /* 1099 * Free up any ranges used for the symbol. 1100 */ 1101 if ((symbol.type == _URE_CCLASS || symbol.type == _URE_NCCLASS) && 1102 symbol.sym.ccl.ranges_size > 0) 1103 free((char *) symbol.sym.ccl.ranges); 1104 1105 return b->symtab[i].id; 1106 } 1107 1108 /* 1109 * Need to add the new symbol. 1110 */ 1111 if (b->symtab_used == b->symtab_size) { 1112 if (b->symtab_size == 0) 1113 b->symtab = (_ure_symtab_t *) malloc(sizeof(_ure_symtab_t) << 3); 1114 else 1115 b->symtab = (_ure_symtab_t *) 1116 realloc((char *) b->symtab, 1117 sizeof(_ure_symtab_t) * (b->symtab_size + 8)); 1118 sp = b->symtab + b->symtab_size; 1119 (void) memset((char *) sp, '\0', sizeof(_ure_symtab_t) << 3); 1120 b->symtab_size += 8; 1121 } 1122 1123 symbol.id = b->symtab_used++; 1124 (void) AC_MEMCPY((char *) &b->symtab[symbol.id], (char *) &symbol, 1125 sizeof(_ure_symtab_t)); 1126 1127 return symbol.id; 1128 } 1129 1130 /************************************************************************* 1131 * 1132 * End symbol parse functions. 1133 * 1134 *************************************************************************/ 1135 1136 static ucs2_t 1137 _ure_make_expr(ucs2_t type, ucs2_t lhs, ucs2_t rhs, _ure_buffer_t *b) 1138 { 1139 ucs2_t i; 1140 1141 if (b == 0) 1142 return _URE_NOOP; 1143 1144 /* 1145 * Determine if the expression already exists or not. 1146 */ 1147 for (i = 0; i < b->expr_used; i++) { 1148 if (b->expr[i].type == type && b->expr[i].lhs == lhs && 1149 b->expr[i].rhs == rhs) 1150 break; 1151 } 1152 if (i < b->expr_used) 1153 return i; 1154 1155 /* 1156 * Need to add a new expression. 1157 */ 1158 if (b->expr_used == b->expr_size) { 1159 if (b->expr_size == 0) 1160 b->expr = (_ure_elt_t *) malloc(sizeof(_ure_elt_t) << 3); 1161 else 1162 b->expr = (_ure_elt_t *) 1163 realloc((char *) b->expr, 1164 sizeof(_ure_elt_t) * (b->expr_size + 8)); 1165 b->expr_size += 8; 1166 } 1167 1168 b->expr[b->expr_used].onstack = 0; 1169 b->expr[b->expr_used].type = type; 1170 b->expr[b->expr_used].lhs = lhs; 1171 b->expr[b->expr_used].rhs = rhs; 1172 1173 return b->expr_used++; 1174 } 1175 1176 static unsigned char spmap[] = { 1177 0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 1178 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 1179 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 1180 }; 1181 1182 #define _ure_isspecial(cc) ((cc) > 0x20 && (cc) < 0x7f && \ 1183 (spmap[(cc) >> 3] & (1 << ((cc) & 7)))) 1184 1185 /* 1186 * Convert the regular expression into an NFA in a form that will be easy to 1187 * reduce to a DFA. The starting state for the reduction will be returned. 1188 */ 1189 static ucs2_t 1190 _ure_re2nfa(ucs2_t *re, unsigned long relen, _ure_buffer_t *b) 1191 { 1192 ucs2_t c, state, top, sym, *sp, *ep; 1193 unsigned long used; 1194 1195 state = _URE_NOOP; 1196 1197 sp = re; 1198 ep = sp + relen; 1199 while (b->error == _URE_OK && sp < ep) { 1200 c = *sp++; 1201 switch (c) { 1202 case '(': 1203 _ure_push(_URE_PAREN, b); 1204 break; 1205 case ')': 1206 /* 1207 * Check for the case of too many close parentheses. 1208 */ 1209 if (_ure_peek(b) == _URE_NOOP) { 1210 b->error = _URE_UNBALANCED_GROUP; 1211 break; 1212 } 1213 1214 while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR) 1215 /* 1216 * Make an expression with the AND or OR operator and its right 1217 * hand side. 1218 */ 1219 state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b); 1220 1221 /* 1222 * Remove the _URE_PAREN off the stack. 1223 */ 1224 (void) _ure_pop(b); 1225 break; 1226 case '*': 1227 state = _ure_make_expr(_URE_STAR, state, _URE_NOOP, b); 1228 break; 1229 case '+': 1230 state = _ure_make_expr(_URE_PLUS, state, _URE_NOOP, b); 1231 break; 1232 case '?': 1233 state = _ure_make_expr(_URE_QUEST, state, _URE_NOOP, b); 1234 break; 1235 case '|': 1236 while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR) 1237 /* 1238 * Make an expression with the AND or OR operator and its right 1239 * hand side. 1240 */ 1241 state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b); 1242 1243 _ure_push(state, b); 1244 _ure_push(_URE_OR, b); 1245 break; 1246 default: 1247 sp--; 1248 sym = _ure_make_symbol(sp, ep - sp, &used, b); 1249 sp += used; 1250 state = _ure_make_expr(_URE_SYMBOL, sym, _URE_NOOP, b); 1251 break; 1252 } 1253 1254 if (c != '(' && c != '|' && sp < ep && 1255 (!_ure_isspecial(*sp) || *sp == '(')) { 1256 _ure_push(state, b); 1257 _ure_push(_URE_AND, b); 1258 } 1259 } 1260 while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR) 1261 /* 1262 * Make an expression with the AND or OR operator and its right 1263 * hand side. 1264 */ 1265 state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b); 1266 1267 if (b->stack.slist_used > 0) 1268 b->error = _URE_UNBALANCED_GROUP; 1269 1270 return (b->error == _URE_OK) ? state : _URE_NOOP; 1271 } 1272 1273 static void 1274 _ure_add_symstate(ucs2_t sym, ucs2_t state, _ure_buffer_t *b) 1275 { 1276 ucs2_t i, *stp; 1277 _ure_symtab_t *sp; 1278 1279 /* 1280 * Locate the symbol in the symbol table so the state can be added. 1281 * If the symbol doesn't exist, then a real problem exists. 1282 */ 1283 for (i = 0, sp = b->symtab; i < b->symtab_used && sym != sp->id; 1284 i++, sp++) ; 1285 1286 /* 1287 * Now find out if the state exists in the symbol's state list. 1288 */ 1289 for (i = 0, stp = sp->states.slist; 1290 i < sp->states.slist_used && state > *stp; i++, stp++) ; 1291 1292 if (i == sp->states.slist_used || state < *stp) { 1293 /* 1294 * Need to add the state in order. 1295 */ 1296 if (sp->states.slist_used == sp->states.slist_size) { 1297 if (sp->states.slist_size == 0) 1298 sp->states.slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3); 1299 else 1300 sp->states.slist = (ucs2_t *) 1301 realloc((char *) sp->states.slist, 1302 sizeof(ucs2_t) * (sp->states.slist_size + 8)); 1303 sp->states.slist_size += 8; 1304 } 1305 if (i < sp->states.slist_used) 1306 (void) _ure_memmove((char *) (sp->states.slist + i + 1), 1307 (char *) (sp->states.slist + i), 1308 sizeof(ucs2_t) * (sp->states.slist_used - i)); 1309 sp->states.slist[i] = state; 1310 sp->states.slist_used++; 1311 } 1312 } 1313 1314 static ucs2_t 1315 _ure_add_state(ucs2_t nstates, ucs2_t *states, _ure_buffer_t *b) 1316 { 1317 ucs2_t i; 1318 _ure_state_t *sp; 1319 1320 for (i = 0, sp = b->states.states; i < b->states.states_used; i++, sp++) { 1321 if (sp->st.slist_used == nstates && 1322 memcmp((char *) states, (char *) sp->st.slist, 1323 sizeof(ucs2_t) * nstates) == 0) 1324 break; 1325 } 1326 1327 if (i == b->states.states_used) { 1328 /* 1329 * Need to add a new DFA state (set of NFA states). 1330 */ 1331 if (b->states.states_used == b->states.states_size) { 1332 if (b->states.states_size == 0) 1333 b->states.states = (_ure_state_t *) 1334 malloc(sizeof(_ure_state_t) << 3); 1335 else 1336 b->states.states = (_ure_state_t *) 1337 realloc((char *) b->states.states, 1338 sizeof(_ure_state_t) * (b->states.states_size + 8)); 1339 sp = b->states.states + b->states.states_size; 1340 (void) memset((char *) sp, '\0', sizeof(_ure_state_t) << 3); 1341 b->states.states_size += 8; 1342 } 1343 1344 sp = b->states.states + b->states.states_used++; 1345 sp->id = i; 1346 1347 if (sp->st.slist_used + nstates > sp->st.slist_size) { 1348 if (sp->st.slist_size == 0) 1349 sp->st.slist = (ucs2_t *) 1350 malloc(sizeof(ucs2_t) * (sp->st.slist_used + nstates)); 1351 else 1352 sp->st.slist = (ucs2_t *) 1353 realloc((char *) sp->st.slist, 1354 sizeof(ucs2_t) * (sp->st.slist_used + nstates)); 1355 sp->st.slist_size = sp->st.slist_used + nstates; 1356 } 1357 sp->st.slist_used = nstates; 1358 (void) AC_MEMCPY((char *) sp->st.slist, (char *) states, 1359 sizeof(ucs2_t) * nstates); 1360 } 1361 1362 /* 1363 * Return the ID of the DFA state representing a group of NFA states. 1364 */ 1365 return i; 1366 } 1367 1368 static void 1369 _ure_reduce(ucs2_t start, _ure_buffer_t *b) 1370 { 1371 ucs2_t i, j, state, eval, syms, rhs; 1372 ucs2_t s1, s2, ns1, ns2; 1373 _ure_state_t *sp; 1374 _ure_symtab_t *smp; 1375 1376 b->reducing = 1; 1377 1378 /* 1379 * Add the starting state for the reduction. 1380 */ 1381 _ure_add_state(1, &start, b); 1382 1383 /* 1384 * Process each set of NFA states that get created. 1385 */ 1386 for (i = 0; i < b->states.states_used; i++) { 1387 sp = b->states.states + i; 1388 1389 /* 1390 * Push the current states on the stack. 1391 */ 1392 for (j = 0; j < sp->st.slist_used; j++) 1393 _ure_push(sp->st.slist[j], b); 1394 1395 /* 1396 * Reduce the NFA states. 1397 */ 1398 for (j = sp->accepting = syms = 0; j < b->stack.slist_used; j++) { 1399 state = b->stack.slist[j]; 1400 eval = 1; 1401 1402 /* 1403 * This inner loop is the iterative equivalent of recursively 1404 * reducing subexpressions generated as a result of a reduction. 1405 */ 1406 while (eval) { 1407 switch (b->expr[state].type) { 1408 case _URE_SYMBOL: 1409 ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b); 1410 _ure_add_symstate(b->expr[state].lhs, ns1, b); 1411 syms++; 1412 eval = 0; 1413 break; 1414 case _URE_ONE: 1415 sp->accepting = 1; 1416 eval = 0; 1417 break; 1418 case _URE_QUEST: 1419 s1 = b->expr[state].lhs; 1420 ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b); 1421 state = _ure_make_expr(_URE_OR, ns1, s1, b); 1422 break; 1423 case _URE_PLUS: 1424 s1 = b->expr[state].lhs; 1425 ns1 = _ure_make_expr(_URE_STAR, s1, _URE_NOOP, b); 1426 state = _ure_make_expr(_URE_AND, s1, ns1, b); 1427 break; 1428 case _URE_STAR: 1429 s1 = b->expr[state].lhs; 1430 ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b); 1431 ns2 = _ure_make_expr(_URE_PLUS, s1, _URE_NOOP, b); 1432 state = _ure_make_expr(_URE_OR, ns1, ns2, b); 1433 break; 1434 case _URE_OR: 1435 s1 = b->expr[state].lhs; 1436 s2 = b->expr[state].rhs; 1437 _ure_push(s1, b); 1438 _ure_push(s2, b); 1439 eval = 0; 1440 break; 1441 case _URE_AND: 1442 s1 = b->expr[state].lhs; 1443 s2 = b->expr[state].rhs; 1444 switch (b->expr[s1].type) { 1445 case _URE_SYMBOL: 1446 _ure_add_symstate(b->expr[s1].lhs, s2, b); 1447 syms++; 1448 eval = 0; 1449 break; 1450 case _URE_ONE: 1451 state = s2; 1452 break; 1453 case _URE_QUEST: 1454 ns1 = b->expr[s1].lhs; 1455 ns2 = _ure_make_expr(_URE_AND, ns1, s2, b); 1456 state = _ure_make_expr(_URE_OR, s2, ns2, b); 1457 break; 1458 case _URE_PLUS: 1459 ns1 = b->expr[s1].lhs; 1460 ns2 = _ure_make_expr(_URE_OR, s2, state, b); 1461 state = _ure_make_expr(_URE_AND, ns1, ns2, b); 1462 break; 1463 case _URE_STAR: 1464 ns1 = b->expr[s1].lhs; 1465 ns2 = _ure_make_expr(_URE_AND, ns1, state, b); 1466 state = _ure_make_expr(_URE_OR, s2, ns2, b); 1467 break; 1468 case _URE_OR: 1469 ns1 = b->expr[s1].lhs; 1470 ns2 = b->expr[s1].rhs; 1471 ns1 = _ure_make_expr(_URE_AND, ns1, s2, b); 1472 ns2 = _ure_make_expr(_URE_AND, ns2, s2, b); 1473 state = _ure_make_expr(_URE_OR, ns1, ns2, b); 1474 break; 1475 case _URE_AND: 1476 ns1 = b->expr[s1].lhs; 1477 ns2 = b->expr[s1].rhs; 1478 ns2 = _ure_make_expr(_URE_AND, ns2, s2, b); 1479 state = _ure_make_expr(_URE_AND, ns1, ns2, b); 1480 break; 1481 } 1482 } 1483 } 1484 } 1485 1486 /* 1487 * Clear the state stack. 1488 */ 1489 while (_ure_pop(b) != _URE_NOOP) ; 1490 1491 /* 1492 * Reset the state pointer because the reduction may have moved it 1493 * during a reallocation. 1494 */ 1495 sp = b->states.states + i; 1496 1497 /* 1498 * Generate the DFA states for the symbols collected during the 1499 * current reduction. 1500 */ 1501 if (sp->trans_used + syms > sp->trans_size) { 1502 if (sp->trans_size == 0) 1503 sp->trans = (_ure_elt_t *) 1504 malloc(sizeof(_ure_elt_t) * (sp->trans_used + syms)); 1505 else 1506 sp->trans = (_ure_elt_t *) 1507 realloc((char *) sp->trans, 1508 sizeof(_ure_elt_t) * (sp->trans_used + syms)); 1509 sp->trans_size = sp->trans_used + syms; 1510 } 1511 1512 /* 1513 * Go through the symbol table and generate the DFA state transitions 1514 * for each symbol that has collected NFA states. 1515 */ 1516 for (j = syms = 0, smp = b->symtab; j < b->symtab_used; j++, smp++) { 1517 sp = b->states.states + i; 1518 1519 if (smp->states.slist_used > 0) { 1520 sp->trans[syms].lhs = smp->id; 1521 rhs = _ure_add_state(smp->states.slist_used, 1522 smp->states.slist, b); 1523 /* 1524 * Reset the state pointer in case the reallocation moves it 1525 * in memory. 1526 */ 1527 sp = b->states.states + i; 1528 sp->trans[syms].rhs = rhs; 1529 1530 smp->states.slist_used = 0; 1531 syms++; 1532 } 1533 } 1534 1535 /* 1536 * Set the number of transitions actually used. 1537 */ 1538 sp->trans_used = syms; 1539 } 1540 b->reducing = 0; 1541 } 1542 1543 static void 1544 _ure_add_equiv(ucs2_t l, ucs2_t r, _ure_buffer_t *b) 1545 { 1546 ucs2_t tmp; 1547 1548 l = b->states.states[l].id; 1549 r = b->states.states[r].id; 1550 1551 if (l == r) 1552 return; 1553 1554 if (l > r) { 1555 tmp = l; 1556 l = r; 1557 r = tmp; 1558 } 1559 1560 /* 1561 * Check to see if the equivalence pair already exists. 1562 */ 1563 for (tmp = 0; tmp < b->equiv_used && 1564 (b->equiv[tmp].l != l || b->equiv[tmp].r != r); 1565 tmp++) ; 1566 1567 if (tmp < b->equiv_used) 1568 return; 1569 1570 if (b->equiv_used == b->equiv_size) { 1571 if (b->equiv_size == 0) 1572 b->equiv = (_ure_equiv_t *) malloc(sizeof(_ure_equiv_t) << 3); 1573 else 1574 b->equiv = (_ure_equiv_t *) realloc((char *) b->equiv, 1575 sizeof(_ure_equiv_t) * 1576 (b->equiv_size + 8)); 1577 b->equiv_size += 8; 1578 } 1579 b->equiv[b->equiv_used].l = l; 1580 b->equiv[b->equiv_used].r = r; 1581 b->equiv_used++; 1582 } 1583 1584 /* 1585 * Merge the DFA states that are equivalent. 1586 */ 1587 static void 1588 _ure_merge_equiv(_ure_buffer_t *b) 1589 { 1590 ucs2_t i, j, k, eq, done; 1591 _ure_state_t *sp1, *sp2, *ls, *rs; 1592 1593 for (i = 0; i < b->states.states_used; i++) { 1594 sp1 = b->states.states + i; 1595 if (sp1->id != i) 1596 continue; 1597 for (j = 0; j < i; j++) { 1598 sp2 = b->states.states + j; 1599 if (sp2->id != j) 1600 continue; 1601 b->equiv_used = 0; 1602 _ure_add_equiv(i, j, b); 1603 for (eq = 0, done = 0; eq < b->equiv_used; eq++) { 1604 ls = b->states.states + b->equiv[eq].l; 1605 rs = b->states.states + b->equiv[eq].r; 1606 if (ls->accepting != rs->accepting || 1607 ls->trans_used != rs->trans_used) { 1608 done = 1; 1609 break; 1610 } 1611 for (k = 0; k < ls->trans_used && 1612 ls->trans[k].lhs == rs->trans[k].lhs; k++) ; 1613 if (k < ls->trans_used) { 1614 done = 1; 1615 break; 1616 } 1617 1618 for (k = 0; k < ls->trans_used; k++) 1619 _ure_add_equiv(ls->trans[k].rhs, rs->trans[k].rhs, b); 1620 } 1621 if (done == 0) 1622 break; 1623 } 1624 for (eq = 0; j < i && eq < b->equiv_used; eq++) 1625 b->states.states[b->equiv[eq].r].id = 1626 b->states.states[b->equiv[eq].l].id; 1627 } 1628 1629 /* 1630 * Renumber the states appropriately. 1631 */ 1632 for (i = eq = 0, sp1 = b->states.states; i < b->states.states_used; 1633 sp1++, i++) 1634 sp1->id = (sp1->id == i) ? eq++ : b->states.states[sp1->id].id; 1635 } 1636 1637 /************************************************************************* 1638 * 1639 * API. 1640 * 1641 *************************************************************************/ 1642 1643 ure_buffer_t 1644 ure_buffer_create(void) 1645 { 1646 ure_buffer_t b; 1647 1648 b = (ure_buffer_t) calloc(1, sizeof(_ure_buffer_t)); 1649 1650 return b; 1651 } 1652 1653 void 1654 ure_buffer_free(ure_buffer_t buf) 1655 { 1656 unsigned long i; 1657 1658 if (buf == 0) 1659 return; 1660 1661 if (buf->stack.slist_size > 0) 1662 free((char *) buf->stack.slist); 1663 1664 if (buf->expr_size > 0) 1665 free((char *) buf->expr); 1666 1667 for (i = 0; i < buf->symtab_size; i++) { 1668 if (buf->symtab[i].states.slist_size > 0) 1669 free((char *) buf->symtab[i].states.slist); 1670 } 1671 1672 if (buf->symtab_size > 0) 1673 free((char *) buf->symtab); 1674 1675 for (i = 0; i < buf->states.states_size; i++) { 1676 if (buf->states.states[i].trans_size > 0) 1677 free((char *) buf->states.states[i].trans); 1678 if (buf->states.states[i].st.slist_size > 0) 1679 free((char *) buf->states.states[i].st.slist); 1680 } 1681 1682 if (buf->states.states_size > 0) 1683 free((char *) buf->states.states); 1684 1685 if (buf->equiv_size > 0) 1686 free((char *) buf->equiv); 1687 1688 free((char *) buf); 1689 } 1690 1691 ure_dfa_t 1692 ure_compile(ucs2_t *re, unsigned long relen, int casefold, ure_buffer_t buf) 1693 { 1694 ucs2_t i, j, state; 1695 _ure_state_t *sp; 1696 _ure_dstate_t *dsp; 1697 _ure_trans_t *tp; 1698 ure_dfa_t dfa; 1699 1700 if (re == 0 || *re == 0 || relen == 0 || buf == 0) 1701 return 0; 1702 1703 /* 1704 * Reset the various fields of the compilation buffer. Default the flags 1705 * to indicate the presense of the "^$" pattern. If any other pattern 1706 * occurs, then this flag will be removed. This is done to catch this 1707 * special pattern and handle it specially when matching. 1708 */ 1709 buf->flags = _URE_DFA_BLANKLINE | ((casefold) ? _URE_DFA_CASEFOLD : 0); 1710 buf->reducing = 0; 1711 buf->stack.slist_used = 0; 1712 buf->expr_used = 0; 1713 1714 for (i = 0; i < buf->symtab_used; i++) 1715 buf->symtab[i].states.slist_used = 0; 1716 buf->symtab_used = 0; 1717 1718 for (i = 0; i < buf->states.states_used; i++) { 1719 buf->states.states[i].st.slist_used = 0; 1720 buf->states.states[i].trans_used = 0; 1721 } 1722 buf->states.states_used = 0; 1723 1724 /* 1725 * Construct the NFA. If this stage returns a 0, then an error occured or 1726 * an empty expression was passed. 1727 */ 1728 if ((state = _ure_re2nfa(re, relen, buf)) == _URE_NOOP) 1729 return 0; 1730 1731 /* 1732 * Do the expression reduction to get the initial DFA. 1733 */ 1734 _ure_reduce(state, buf); 1735 1736 /* 1737 * Merge all the equivalent DFA states. 1738 */ 1739 _ure_merge_equiv(buf); 1740 1741 /* 1742 * Construct the minimal DFA. 1743 */ 1744 dfa = (ure_dfa_t) malloc(sizeof(_ure_dfa_t)); 1745 (void) memset((char *) dfa, '\0', sizeof(_ure_dfa_t)); 1746 1747 dfa->flags = buf->flags & (_URE_DFA_CASEFOLD|_URE_DFA_BLANKLINE); 1748 1749 /* 1750 * Free up the NFA state groups and transfer the symbols from the buffer 1751 * to the DFA. 1752 */ 1753 for (i = 0; i < buf->symtab_size; i++) { 1754 if (buf->symtab[i].states.slist_size > 0) 1755 free((char *) buf->symtab[i].states.slist); 1756 } 1757 dfa->syms = buf->symtab; 1758 dfa->nsyms = buf->symtab_used; 1759 1760 buf->symtab_used = buf->symtab_size = 0; 1761 1762 /* 1763 * Collect the total number of states and transitions needed for the DFA. 1764 */ 1765 for (i = state = 0, sp = buf->states.states; i < buf->states.states_used; 1766 i++, sp++) { 1767 if (sp->id == state) { 1768 dfa->nstates++; 1769 dfa->ntrans += sp->trans_used; 1770 state++; 1771 } 1772 } 1773 1774 /* 1775 * Allocate enough space for the states and transitions. 1776 */ 1777 dfa->states = (_ure_dstate_t *) malloc(sizeof(_ure_dstate_t) * 1778 dfa->nstates); 1779 dfa->trans = (_ure_trans_t *) malloc(sizeof(_ure_trans_t) * dfa->ntrans); 1780 1781 /* 1782 * Actually transfer the DFA states from the buffer. 1783 */ 1784 dsp = dfa->states; 1785 tp = dfa->trans; 1786 for (i = state = 0, sp = buf->states.states; i < buf->states.states_used; 1787 i++, sp++) { 1788 if (sp->id == state) { 1789 dsp->trans = tp; 1790 dsp->ntrans = sp->trans_used; 1791 dsp->accepting = sp->accepting; 1792 1793 /* 1794 * Add the transitions for the state. 1795 */ 1796 for (j = 0; j < dsp->ntrans; j++, tp++) { 1797 tp->symbol = sp->trans[j].lhs; 1798 tp->next_state = buf->states.states[sp->trans[j].rhs].id; 1799 } 1800 1801 dsp++; 1802 state++; 1803 } 1804 } 1805 1806 return dfa; 1807 } 1808 1809 void 1810 ure_dfa_free(ure_dfa_t dfa) 1811 { 1812 ucs2_t i; 1813 1814 if (dfa == 0) 1815 return; 1816 1817 for (i = 0; i < dfa->nsyms; i++) { 1818 if ((dfa->syms[i].type == _URE_CCLASS || 1819 dfa->syms[i].type == _URE_NCCLASS) && 1820 dfa->syms[i].sym.ccl.ranges_size > 0) 1821 free((char *) dfa->syms[i].sym.ccl.ranges); 1822 } 1823 if (dfa->nsyms > 0) 1824 free((char *) dfa->syms); 1825 1826 if (dfa->nstates > 0) 1827 free((char *) dfa->states); 1828 if (dfa->ntrans > 0) 1829 free((char *) dfa->trans); 1830 free((char *) dfa); 1831 } 1832 1833 void 1834 ure_write_dfa(ure_dfa_t dfa, FILE *out) 1835 { 1836 ucs2_t i, j, k, h, l; 1837 _ure_dstate_t *sp; 1838 _ure_symtab_t *sym; 1839 _ure_range_t *rp; 1840 1841 if (dfa == 0 || out == 0) 1842 return; 1843 1844 /* 1845 * Write all the different character classes. 1846 */ 1847 for (i = 0, sym = dfa->syms; i < dfa->nsyms; i++, sym++) { 1848 if (sym->type == _URE_CCLASS || sym->type == _URE_NCCLASS) { 1849 fprintf(out, "C%hd = ", sym->id); 1850 if (sym->sym.ccl.ranges_used > 0) { 1851 putc('[', out); 1852 if (sym->type == _URE_NCCLASS) 1853 putc('^', out); 1854 } 1855 if (sym->props != 0) { 1856 if (sym->type == _URE_NCCLASS) 1857 fprintf(out, "\\P"); 1858 else 1859 fprintf(out, "\\p"); 1860 for (k = h = 0; k < 32; k++) { 1861 if (sym->props & (1 << k)) { 1862 if (h != 0) 1863 putc(',', out); 1864 fprintf(out, "%hd", k + 1); 1865 h = 1; 1866 } 1867 } 1868 } 1869 /* 1870 * Dump the ranges. 1871 */ 1872 for (k = 0, rp = sym->sym.ccl.ranges; 1873 k < sym->sym.ccl.ranges_used; k++, rp++) { 1874 /* 1875 * Check for UTF16 characters. 1876 */ 1877 if (0x10000 <= rp->min_code && 1878 rp->min_code <= 0x10ffff) { 1879 h = (ucs2_t) (((rp->min_code - 0x10000) >> 10) + 0xd800); 1880 l = (ucs2_t) (((rp->min_code - 0x10000) & 1023) + 0xdc00); 1881 fprintf(out, "\\x%04hX\\x%04hX", h, l); 1882 } else 1883 fprintf(out, "\\x%04lX", rp->min_code & 0xffff); 1884 if (rp->max_code != rp->min_code) { 1885 putc('-', out); 1886 if (rp->max_code >= 0x10000 && 1887 rp->max_code <= 0x10ffff) { 1888 h = (ucs2_t) (((rp->max_code - 0x10000) >> 10) + 0xd800); 1889 l = (ucs2_t) (((rp->max_code - 0x10000) & 1023) + 0xdc00); 1890 fprintf(out, "\\x%04hX\\x%04hX", h, l); 1891 } else 1892 fprintf(out, "\\x%04lX", rp->max_code & 0xffff); 1893 } 1894 } 1895 if (sym->sym.ccl.ranges_used > 0) 1896 putc(']', out); 1897 putc('\n', out); 1898 } 1899 } 1900 1901 for (i = 0, sp = dfa->states; i < dfa->nstates; i++, sp++) { 1902 fprintf(out, "S%hd = ", i); 1903 if (sp->accepting) { 1904 fprintf(out, "1 "); 1905 if (sp->ntrans) 1906 fprintf(out, "| "); 1907 } 1908 for (j = 0; j < sp->ntrans; j++) { 1909 if (j > 0) 1910 fprintf(out, "| "); 1911 1912 sym = dfa->syms + sp->trans[j].symbol; 1913 switch (sym->type) { 1914 case _URE_CHAR: 1915 if (0x10000 <= sym->sym.chr && sym->sym.chr <= 0x10ffff) { 1916 /* 1917 * Take care of UTF16 characters. 1918 */ 1919 h = (ucs2_t) (((sym->sym.chr - 0x10000) >> 10) + 0xd800); 1920 l = (ucs2_t) (((sym->sym.chr - 0x10000) & 1023) + 0xdc00); 1921 fprintf(out, "\\x%04hX\\x%04hX ", h, l); 1922 } else 1923 fprintf(out, "\\x%04lX ", sym->sym.chr & 0xffff); 1924 break; 1925 case _URE_ANY_CHAR: 1926 fprintf(out, "<any> "); 1927 break; 1928 case _URE_BOL_ANCHOR: 1929 fprintf(out, "<bol-anchor> "); 1930 break; 1931 case _URE_EOL_ANCHOR: 1932 fprintf(out, "<eol-anchor> "); 1933 break; 1934 case _URE_CCLASS: 1935 case _URE_NCCLASS: 1936 fprintf(out, "[C%hd] ", sym->id); 1937 break; 1938 } 1939 fprintf(out, "S%hd", sp->trans[j].next_state); 1940 if (j + 1 < sp->ntrans) 1941 putc(' ', out); 1942 } 1943 putc('\n', out); 1944 } 1945 } 1946 1947 #define _ure_issep(cc) ((cc) == '\n' || (cc) == '\r' || (cc) == 0x2028 ||\ 1948 (cc) == 0x2029) 1949 1950 int 1951 ure_exec(ure_dfa_t dfa, int flags, ucs2_t *text, unsigned long textlen, 1952 unsigned long *match_start, unsigned long *match_end) 1953 { 1954 int i, j, matched, found, skip; 1955 unsigned long ms, me; 1956 ucs4_t c; 1957 ucs2_t *sp, *ep, *lp; 1958 _ure_dstate_t *stp; 1959 _ure_symtab_t *sym; 1960 _ure_range_t *rp; 1961 1962 if (dfa == 0 || text == 0) 1963 return 0; 1964 1965 /* 1966 * Handle the special case of an empty string matching the "^$" pattern. 1967 */ 1968 if (textlen == 0 && (dfa->flags & _URE_DFA_BLANKLINE)) { 1969 *match_start = *match_end = 0; 1970 return 1; 1971 } 1972 1973 sp = text; 1974 ep = sp + textlen; 1975 1976 ms = me = ~0; 1977 1978 stp = dfa->states; 1979 1980 for (found = skip = 0; found == 0 && sp < ep; ) { 1981 lp = sp; 1982 c = *sp++; 1983 1984 /* 1985 * Check to see if this is a high surrogate that should be 1986 * combined with a following low surrogate. 1987 */ 1988 if (sp < ep && 0xd800 <= c && c <= 0xdbff && 1989 0xdc00 <= *sp && *sp <= 0xdfff) 1990 c = 0x10000 + (((c & 0x03ff) << 10) | (*sp++ & 0x03ff)); 1991 1992 /* 1993 * Determine if the character is non-spacing and should be skipped. 1994 */ 1995 if (_ure_matches_properties(_URE_NONSPACING, c) && 1996 (flags & URE_IGNORE_NONSPACING)) { 1997 sp++; 1998 continue; 1999 } 2000 2001 if (dfa->flags & _URE_DFA_CASEFOLD) 2002 c = _ure_tolower(c); 2003 2004 /* 2005 * See if one of the transitions matches. 2006 */ 2007 for (i = 0, matched = 0; matched == 0 && i < stp->ntrans; i++) { 2008 sym = dfa->syms + stp->trans[i].symbol; 2009 switch (sym->type) { 2010 case _URE_ANY_CHAR: 2011 if ((flags & URE_DOT_MATCHES_SEPARATORS) || 2012 !_ure_issep(c)) 2013 matched = 1; 2014 break; 2015 case _URE_CHAR: 2016 if (c == sym->sym.chr) 2017 matched = 1; 2018 break; 2019 case _URE_BOL_ANCHOR: 2020 if (lp == text) { 2021 sp = lp; 2022 matched = 1; 2023 } else if (_ure_issep(c)) { 2024 if (c == '\r' && sp < ep && *sp == '\n') 2025 sp++; 2026 lp = sp; 2027 matched = 1; 2028 } 2029 break; 2030 case _URE_EOL_ANCHOR: 2031 if (_ure_issep(c)) { 2032 /* 2033 * Put the pointer back before the separator so the match 2034 * end position will be correct. This case will also 2035 * cause the `sp' pointer to be advanced over the current 2036 * separator once the match end point has been recorded. 2037 */ 2038 sp = lp; 2039 matched = 1; 2040 } 2041 break; 2042 case _URE_CCLASS: 2043 case _URE_NCCLASS: 2044 if (sym->props != 0) 2045 matched = _ure_matches_properties(sym->props, c); 2046 for (j = 0, rp = sym->sym.ccl.ranges; 2047 j < sym->sym.ccl.ranges_used; j++, rp++) { 2048 if (rp->min_code <= c && c <= rp->max_code) 2049 matched = 1; 2050 } 2051 if (sym->type == _URE_NCCLASS) 2052 matched = !matched; 2053 break; 2054 } 2055 2056 if (matched) { 2057 if (ms == ~0UL) 2058 ms = lp - text; 2059 else 2060 me = sp - text; 2061 stp = dfa->states + stp->trans[i].next_state; 2062 2063 /* 2064 * If the match was an EOL anchor, adjust the pointer past the 2065 * separator that caused the match. The correct match 2066 * position has been recorded already. 2067 */ 2068 if (sym->type == _URE_EOL_ANCHOR) { 2069 /* 2070 * Skip the character that caused the match. 2071 */ 2072 sp++; 2073 2074 /* 2075 * Handle the infamous CRLF situation. 2076 */ 2077 if (sp < ep && c == '\r' && *sp == '\n') 2078 sp++; 2079 } 2080 } 2081 } 2082 2083 if (matched == 0) { 2084 if (stp->accepting == 0) { 2085 /* 2086 * If the last state was not accepting, then reset 2087 * and start over. 2088 */ 2089 stp = dfa->states; 2090 ms = me = ~0; 2091 } else 2092 /* 2093 * The last state was accepting, so terminate the matching 2094 * loop to avoid more work. 2095 */ 2096 found = 1; 2097 } else if (sp == ep) { 2098 if (!stp->accepting) { 2099 /* 2100 * This ugly hack is to make sure the end-of-line anchors 2101 * match when the source text hits the end. This is only done 2102 * if the last subexpression matches. 2103 */ 2104 for (i = 0; found == 0 && i < stp->ntrans; i++) { 2105 sym = dfa->syms + stp->trans[i].symbol; 2106 if (sym->type ==_URE_EOL_ANCHOR) { 2107 stp = dfa->states + stp->trans[i].next_state; 2108 if (stp->accepting) { 2109 me = sp - text; 2110 found = 1; 2111 } else 2112 break; 2113 } 2114 } 2115 } else { 2116 /* 2117 * Make sure any conditions that match all the way to the end 2118 * of the string match. 2119 */ 2120 found = 1; 2121 me = sp - text; 2122 } 2123 } 2124 } 2125 2126 if (found == 0) 2127 ms = me = ~0; 2128 2129 *match_start = ms; 2130 *match_end = me; 2131 2132 return (ms != ~0UL) ? 1 : 0; 2133 } 2134