1 /* 2 * regexp.c: generic and extensible Regular Expression engine 3 * 4 * Basically designed with the purpose of compiling regexps for 5 * the variety of validation/schemas mechanisms now available in 6 * XML related specifications these include: 7 * - XML-1.0 DTD validation 8 * - XML Schemas structure part 1 9 * - XML Schemas Datatypes part 2 especially Appendix F 10 * - RELAX-NG/TREX i.e. the counter proposal 11 * 12 * See Copyright for the status of this software. 13 * 14 * Daniel Veillard <veillard@redhat.com> 15 */ 16 17 #define IN_LIBXML 18 #include "libxml.h" 19 20 #ifdef LIBXML_REGEXP_ENABLED 21 22 /* #define DEBUG_ERR */ 23 24 #include <stdio.h> 25 #include <string.h> 26 #ifdef HAVE_LIMITS_H 27 #include <limits.h> 28 #endif 29 #ifdef HAVE_STDINT_H 30 #include <stdint.h> 31 #endif 32 33 #include <libxml/tree.h> 34 #include <libxml/parserInternals.h> 35 #include <libxml/xmlregexp.h> 36 #include <libxml/xmlautomata.h> 37 #include <libxml/xmlunicode.h> 38 39 #ifndef INT_MAX 40 #define INT_MAX 123456789 /* easy to flag and big enough for our needs */ 41 #endif 42 #ifndef SIZE_MAX 43 #define SIZE_MAX ((size_t) -1) 44 #endif 45 46 /* #define DEBUG_REGEXP_GRAPH */ 47 /* #define DEBUG_REGEXP_EXEC */ 48 /* #define DEBUG_PUSH */ 49 /* #define DEBUG_COMPACTION */ 50 51 #define MAX_PUSH 10000000 52 53 #ifdef ERROR 54 #undef ERROR 55 #endif 56 #define ERROR(str) \ 57 ctxt->error = XML_REGEXP_COMPILE_ERROR; \ 58 xmlRegexpErrCompile(ctxt, str); 59 #define NEXT ctxt->cur++ 60 #define CUR (*(ctxt->cur)) 61 #define NXT(index) (ctxt->cur[index]) 62 63 #define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l) 64 #define NEXTL(l) ctxt->cur += l; 65 #define XML_REG_STRING_SEPARATOR '|' 66 /* 67 * Need PREV to check on a '-' within a Character Group. May only be used 68 * when it's guaranteed that cur is not at the beginning of ctxt->string! 69 */ 70 #define PREV (ctxt->cur[-1]) 71 72 /** 73 * TODO: 74 * 75 * macro to flag unimplemented blocks 76 */ 77 #define TODO \ 78 xmlGenericError(xmlGenericErrorContext, \ 79 "Unimplemented block at %s:%d\n", \ 80 __FILE__, __LINE__); 81 82 /************************************************************************ 83 * * 84 * Datatypes and structures * 85 * * 86 ************************************************************************/ 87 88 /* 89 * Note: the order of the enums below is significant, do not shuffle 90 */ 91 typedef enum { 92 XML_REGEXP_EPSILON = 1, 93 XML_REGEXP_CHARVAL, 94 XML_REGEXP_RANGES, 95 XML_REGEXP_SUBREG, /* used for () sub regexps */ 96 XML_REGEXP_STRING, 97 XML_REGEXP_ANYCHAR, /* . */ 98 XML_REGEXP_ANYSPACE, /* \s */ 99 XML_REGEXP_NOTSPACE, /* \S */ 100 XML_REGEXP_INITNAME, /* \l */ 101 XML_REGEXP_NOTINITNAME, /* \L */ 102 XML_REGEXP_NAMECHAR, /* \c */ 103 XML_REGEXP_NOTNAMECHAR, /* \C */ 104 XML_REGEXP_DECIMAL, /* \d */ 105 XML_REGEXP_NOTDECIMAL, /* \D */ 106 XML_REGEXP_REALCHAR, /* \w */ 107 XML_REGEXP_NOTREALCHAR, /* \W */ 108 XML_REGEXP_LETTER = 100, 109 XML_REGEXP_LETTER_UPPERCASE, 110 XML_REGEXP_LETTER_LOWERCASE, 111 XML_REGEXP_LETTER_TITLECASE, 112 XML_REGEXP_LETTER_MODIFIER, 113 XML_REGEXP_LETTER_OTHERS, 114 XML_REGEXP_MARK, 115 XML_REGEXP_MARK_NONSPACING, 116 XML_REGEXP_MARK_SPACECOMBINING, 117 XML_REGEXP_MARK_ENCLOSING, 118 XML_REGEXP_NUMBER, 119 XML_REGEXP_NUMBER_DECIMAL, 120 XML_REGEXP_NUMBER_LETTER, 121 XML_REGEXP_NUMBER_OTHERS, 122 XML_REGEXP_PUNCT, 123 XML_REGEXP_PUNCT_CONNECTOR, 124 XML_REGEXP_PUNCT_DASH, 125 XML_REGEXP_PUNCT_OPEN, 126 XML_REGEXP_PUNCT_CLOSE, 127 XML_REGEXP_PUNCT_INITQUOTE, 128 XML_REGEXP_PUNCT_FINQUOTE, 129 XML_REGEXP_PUNCT_OTHERS, 130 XML_REGEXP_SEPAR, 131 XML_REGEXP_SEPAR_SPACE, 132 XML_REGEXP_SEPAR_LINE, 133 XML_REGEXP_SEPAR_PARA, 134 XML_REGEXP_SYMBOL, 135 XML_REGEXP_SYMBOL_MATH, 136 XML_REGEXP_SYMBOL_CURRENCY, 137 XML_REGEXP_SYMBOL_MODIFIER, 138 XML_REGEXP_SYMBOL_OTHERS, 139 XML_REGEXP_OTHER, 140 XML_REGEXP_OTHER_CONTROL, 141 XML_REGEXP_OTHER_FORMAT, 142 XML_REGEXP_OTHER_PRIVATE, 143 XML_REGEXP_OTHER_NA, 144 XML_REGEXP_BLOCK_NAME 145 } xmlRegAtomType; 146 147 typedef enum { 148 XML_REGEXP_QUANT_EPSILON = 1, 149 XML_REGEXP_QUANT_ONCE, 150 XML_REGEXP_QUANT_OPT, 151 XML_REGEXP_QUANT_MULT, 152 XML_REGEXP_QUANT_PLUS, 153 XML_REGEXP_QUANT_ONCEONLY, 154 XML_REGEXP_QUANT_ALL, 155 XML_REGEXP_QUANT_RANGE 156 } xmlRegQuantType; 157 158 typedef enum { 159 XML_REGEXP_START_STATE = 1, 160 XML_REGEXP_FINAL_STATE, 161 XML_REGEXP_TRANS_STATE, 162 XML_REGEXP_SINK_STATE, 163 XML_REGEXP_UNREACH_STATE 164 } xmlRegStateType; 165 166 typedef enum { 167 XML_REGEXP_MARK_NORMAL = 0, 168 XML_REGEXP_MARK_START, 169 XML_REGEXP_MARK_VISITED 170 } xmlRegMarkedType; 171 172 typedef struct _xmlRegRange xmlRegRange; 173 typedef xmlRegRange *xmlRegRangePtr; 174 175 struct _xmlRegRange { 176 int neg; /* 0 normal, 1 not, 2 exclude */ 177 xmlRegAtomType type; 178 int start; 179 int end; 180 xmlChar *blockName; 181 }; 182 183 typedef struct _xmlRegAtom xmlRegAtom; 184 typedef xmlRegAtom *xmlRegAtomPtr; 185 186 typedef struct _xmlAutomataState xmlRegState; 187 typedef xmlRegState *xmlRegStatePtr; 188 189 struct _xmlRegAtom { 190 int no; 191 xmlRegAtomType type; 192 xmlRegQuantType quant; 193 int min; 194 int max; 195 196 void *valuep; 197 void *valuep2; 198 int neg; 199 int codepoint; 200 xmlRegStatePtr start; 201 xmlRegStatePtr start0; 202 xmlRegStatePtr stop; 203 int maxRanges; 204 int nbRanges; 205 xmlRegRangePtr *ranges; 206 void *data; 207 }; 208 209 typedef struct _xmlRegCounter xmlRegCounter; 210 typedef xmlRegCounter *xmlRegCounterPtr; 211 212 struct _xmlRegCounter { 213 int min; 214 int max; 215 }; 216 217 typedef struct _xmlRegTrans xmlRegTrans; 218 typedef xmlRegTrans *xmlRegTransPtr; 219 220 struct _xmlRegTrans { 221 xmlRegAtomPtr atom; 222 int to; 223 int counter; 224 int count; 225 int nd; 226 }; 227 228 struct _xmlAutomataState { 229 xmlRegStateType type; 230 xmlRegMarkedType mark; 231 xmlRegMarkedType markd; 232 xmlRegMarkedType reached; 233 int no; 234 int maxTrans; 235 int nbTrans; 236 xmlRegTrans *trans; 237 /* knowing states pointing to us can speed things up */ 238 int maxTransTo; 239 int nbTransTo; 240 int *transTo; 241 }; 242 243 typedef struct _xmlAutomata xmlRegParserCtxt; 244 typedef xmlRegParserCtxt *xmlRegParserCtxtPtr; 245 246 #define AM_AUTOMATA_RNG 1 247 248 struct _xmlAutomata { 249 xmlChar *string; 250 xmlChar *cur; 251 252 int error; 253 int neg; 254 255 xmlRegStatePtr start; 256 xmlRegStatePtr end; 257 xmlRegStatePtr state; 258 259 xmlRegAtomPtr atom; 260 261 int maxAtoms; 262 int nbAtoms; 263 xmlRegAtomPtr *atoms; 264 265 int maxStates; 266 int nbStates; 267 xmlRegStatePtr *states; 268 269 int maxCounters; 270 int nbCounters; 271 xmlRegCounter *counters; 272 273 int determinist; 274 int negs; 275 int flags; 276 277 int depth; 278 }; 279 280 struct _xmlRegexp { 281 xmlChar *string; 282 int nbStates; 283 xmlRegStatePtr *states; 284 int nbAtoms; 285 xmlRegAtomPtr *atoms; 286 int nbCounters; 287 xmlRegCounter *counters; 288 int determinist; 289 int flags; 290 /* 291 * That's the compact form for determinists automatas 292 */ 293 int nbstates; 294 int *compact; 295 void **transdata; 296 int nbstrings; 297 xmlChar **stringMap; 298 }; 299 300 typedef struct _xmlRegExecRollback xmlRegExecRollback; 301 typedef xmlRegExecRollback *xmlRegExecRollbackPtr; 302 303 struct _xmlRegExecRollback { 304 xmlRegStatePtr state;/* the current state */ 305 int index; /* the index in the input stack */ 306 int nextbranch; /* the next transition to explore in that state */ 307 int *counts; /* save the automata state if it has some */ 308 }; 309 310 typedef struct _xmlRegInputToken xmlRegInputToken; 311 typedef xmlRegInputToken *xmlRegInputTokenPtr; 312 313 struct _xmlRegInputToken { 314 xmlChar *value; 315 void *data; 316 }; 317 318 struct _xmlRegExecCtxt { 319 int status; /* execution status != 0 indicate an error */ 320 int determinist; /* did we find an indeterministic behaviour */ 321 xmlRegexpPtr comp; /* the compiled regexp */ 322 xmlRegExecCallbacks callback; 323 void *data; 324 325 xmlRegStatePtr state;/* the current state */ 326 int transno; /* the current transition on that state */ 327 int transcount; /* the number of chars in char counted transitions */ 328 329 /* 330 * A stack of rollback states 331 */ 332 int maxRollbacks; 333 int nbRollbacks; 334 xmlRegExecRollback *rollbacks; 335 336 /* 337 * The state of the automata if any 338 */ 339 int *counts; 340 341 /* 342 * The input stack 343 */ 344 int inputStackMax; 345 int inputStackNr; 346 int index; 347 int *charStack; 348 const xmlChar *inputString; /* when operating on characters */ 349 xmlRegInputTokenPtr inputStack;/* when operating on strings */ 350 351 /* 352 * error handling 353 */ 354 int errStateNo; /* the error state number */ 355 xmlRegStatePtr errState; /* the error state */ 356 xmlChar *errString; /* the string raising the error */ 357 int *errCounts; /* counters at the error state */ 358 int nbPush; 359 }; 360 361 #define REGEXP_ALL_COUNTER 0x123456 362 #define REGEXP_ALL_LAX_COUNTER 0x123457 363 364 static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top); 365 static void xmlRegFreeState(xmlRegStatePtr state); 366 static void xmlRegFreeAtom(xmlRegAtomPtr atom); 367 static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr); 368 static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint); 369 static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, 370 int neg, int start, int end, const xmlChar *blockName); 371 372 void xmlAutomataSetFlags(xmlAutomataPtr am, int flags); 373 374 /************************************************************************ 375 * * 376 * Regexp memory error handler * 377 * * 378 ************************************************************************/ 379 /** 380 * xmlRegexpErrMemory: 381 * @extra: extra information 382 * 383 * Handle an out of memory condition 384 */ 385 static void 386 xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra) 387 { 388 const char *regexp = NULL; 389 if (ctxt != NULL) { 390 regexp = (const char *) ctxt->string; 391 ctxt->error = XML_ERR_NO_MEMORY; 392 } 393 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP, 394 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra, 395 regexp, NULL, 0, 0, 396 "Memory allocation failed : %s\n", extra); 397 } 398 399 /** 400 * xmlRegexpErrCompile: 401 * @extra: extra information 402 * 403 * Handle a compilation failure 404 */ 405 static void 406 xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra) 407 { 408 const char *regexp = NULL; 409 int idx = 0; 410 411 if (ctxt != NULL) { 412 regexp = (const char *) ctxt->string; 413 idx = ctxt->cur - ctxt->string; 414 ctxt->error = XML_REGEXP_COMPILE_ERROR; 415 } 416 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP, 417 XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra, 418 regexp, NULL, idx, 0, 419 "failed to compile: %s\n", extra); 420 } 421 422 /************************************************************************ 423 * * 424 * Allocation/Deallocation * 425 * * 426 ************************************************************************/ 427 428 static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt); 429 430 /** 431 * xmlRegCalloc2: 432 * @dim1: size of first dimension 433 * @dim2: size of second dimension 434 * @elemSize: size of element 435 * 436 * Allocate a two-dimensional array and set all elements to zero. 437 * 438 * Returns the new array or NULL in case of error. 439 */ 440 static void* 441 xmlRegCalloc2(size_t dim1, size_t dim2, size_t elemSize) { 442 size_t totalSize; 443 void *ret; 444 445 /* Check for overflow */ 446 if (dim1 > SIZE_MAX / dim2 / elemSize) 447 return (NULL); 448 totalSize = dim1 * dim2 * elemSize; 449 ret = xmlMalloc(totalSize); 450 if (ret != NULL) 451 memset(ret, 0, totalSize); 452 return (ret); 453 } 454 455 /** 456 * xmlRegEpxFromParse: 457 * @ctxt: the parser context used to build it 458 * 459 * Allocate a new regexp and fill it with the result from the parser 460 * 461 * Returns the new regexp or NULL in case of error 462 */ 463 static xmlRegexpPtr 464 xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) { 465 xmlRegexpPtr ret; 466 467 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp)); 468 if (ret == NULL) { 469 xmlRegexpErrMemory(ctxt, "compiling regexp"); 470 return(NULL); 471 } 472 memset(ret, 0, sizeof(xmlRegexp)); 473 ret->string = ctxt->string; 474 ret->nbStates = ctxt->nbStates; 475 ret->states = ctxt->states; 476 ret->nbAtoms = ctxt->nbAtoms; 477 ret->atoms = ctxt->atoms; 478 ret->nbCounters = ctxt->nbCounters; 479 ret->counters = ctxt->counters; 480 ret->determinist = ctxt->determinist; 481 ret->flags = ctxt->flags; 482 if (ret->determinist == -1) { 483 xmlRegexpIsDeterminist(ret); 484 } 485 486 if ((ret->determinist != 0) && 487 (ret->nbCounters == 0) && 488 (ctxt->negs == 0) && 489 (ret->atoms != NULL) && 490 (ret->atoms[0] != NULL) && 491 (ret->atoms[0]->type == XML_REGEXP_STRING)) { 492 int i, j, nbstates = 0, nbatoms = 0; 493 int *stateRemap; 494 int *stringRemap; 495 int *transitions; 496 void **transdata; 497 xmlChar **stringMap; 498 xmlChar *value; 499 500 /* 501 * Switch to a compact representation 502 * 1/ counting the effective number of states left 503 * 2/ counting the unique number of atoms, and check that 504 * they are all of the string type 505 * 3/ build a table state x atom for the transitions 506 */ 507 508 stateRemap = xmlMalloc(ret->nbStates * sizeof(int)); 509 if (stateRemap == NULL) { 510 xmlRegexpErrMemory(ctxt, "compiling regexp"); 511 xmlFree(ret); 512 return(NULL); 513 } 514 for (i = 0;i < ret->nbStates;i++) { 515 if (ret->states[i] != NULL) { 516 stateRemap[i] = nbstates; 517 nbstates++; 518 } else { 519 stateRemap[i] = -1; 520 } 521 } 522 #ifdef DEBUG_COMPACTION 523 printf("Final: %d states\n", nbstates); 524 #endif 525 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *)); 526 if (stringMap == NULL) { 527 xmlRegexpErrMemory(ctxt, "compiling regexp"); 528 xmlFree(stateRemap); 529 xmlFree(ret); 530 return(NULL); 531 } 532 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int)); 533 if (stringRemap == NULL) { 534 xmlRegexpErrMemory(ctxt, "compiling regexp"); 535 xmlFree(stringMap); 536 xmlFree(stateRemap); 537 xmlFree(ret); 538 return(NULL); 539 } 540 for (i = 0;i < ret->nbAtoms;i++) { 541 if ((ret->atoms[i]->type == XML_REGEXP_STRING) && 542 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) { 543 value = ret->atoms[i]->valuep; 544 for (j = 0;j < nbatoms;j++) { 545 if (xmlStrEqual(stringMap[j], value)) { 546 stringRemap[i] = j; 547 break; 548 } 549 } 550 if (j >= nbatoms) { 551 stringRemap[i] = nbatoms; 552 stringMap[nbatoms] = xmlStrdup(value); 553 if (stringMap[nbatoms] == NULL) { 554 for (i = 0;i < nbatoms;i++) 555 xmlFree(stringMap[i]); 556 xmlFree(stringRemap); 557 xmlFree(stringMap); 558 xmlFree(stateRemap); 559 xmlFree(ret); 560 return(NULL); 561 } 562 nbatoms++; 563 } 564 } else { 565 xmlFree(stateRemap); 566 xmlFree(stringRemap); 567 for (i = 0;i < nbatoms;i++) 568 xmlFree(stringMap[i]); 569 xmlFree(stringMap); 570 xmlFree(ret); 571 return(NULL); 572 } 573 } 574 #ifdef DEBUG_COMPACTION 575 printf("Final: %d atoms\n", nbatoms); 576 #endif 577 transitions = (int *) xmlRegCalloc2(nbstates + 1, nbatoms + 1, 578 sizeof(int)); 579 if (transitions == NULL) { 580 xmlFree(stateRemap); 581 xmlFree(stringRemap); 582 for (i = 0;i < nbatoms;i++) 583 xmlFree(stringMap[i]); 584 xmlFree(stringMap); 585 xmlFree(ret); 586 return(NULL); 587 } 588 589 /* 590 * Allocate the transition table. The first entry for each 591 * state corresponds to the state type. 592 */ 593 transdata = NULL; 594 595 for (i = 0;i < ret->nbStates;i++) { 596 int stateno, atomno, targetno, prev; 597 xmlRegStatePtr state; 598 xmlRegTransPtr trans; 599 600 stateno = stateRemap[i]; 601 if (stateno == -1) 602 continue; 603 state = ret->states[i]; 604 605 transitions[stateno * (nbatoms + 1)] = state->type; 606 607 for (j = 0;j < state->nbTrans;j++) { 608 trans = &(state->trans[j]); 609 if ((trans->to == -1) || (trans->atom == NULL)) 610 continue; 611 atomno = stringRemap[trans->atom->no]; 612 if ((trans->atom->data != NULL) && (transdata == NULL)) { 613 transdata = (void **) xmlRegCalloc2(nbstates, nbatoms, 614 sizeof(void *)); 615 if (transdata == NULL) { 616 xmlRegexpErrMemory(ctxt, "compiling regexp"); 617 break; 618 } 619 } 620 targetno = stateRemap[trans->to]; 621 /* 622 * if the same atom can generate transitions to 2 different 623 * states then it means the automata is not deterministic and 624 * the compact form can't be used ! 625 */ 626 prev = transitions[stateno * (nbatoms + 1) + atomno + 1]; 627 if (prev != 0) { 628 if (prev != targetno + 1) { 629 ret->determinist = 0; 630 #ifdef DEBUG_COMPACTION 631 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n", 632 i, j, trans->atom->no, trans->to, atomno, targetno); 633 printf(" previous to is %d\n", prev); 634 #endif 635 if (transdata != NULL) 636 xmlFree(transdata); 637 xmlFree(transitions); 638 xmlFree(stateRemap); 639 xmlFree(stringRemap); 640 for (i = 0;i < nbatoms;i++) 641 xmlFree(stringMap[i]); 642 xmlFree(stringMap); 643 goto not_determ; 644 } 645 } else { 646 #if 0 647 printf("State %d trans %d: atom %d to %d : %d to %d\n", 648 i, j, trans->atom->no, trans->to, atomno, targetno); 649 #endif 650 transitions[stateno * (nbatoms + 1) + atomno + 1] = 651 targetno + 1; /* to avoid 0 */ 652 if (transdata != NULL) 653 transdata[stateno * nbatoms + atomno] = 654 trans->atom->data; 655 } 656 } 657 } 658 ret->determinist = 1; 659 #ifdef DEBUG_COMPACTION 660 /* 661 * Debug 662 */ 663 for (i = 0;i < nbstates;i++) { 664 for (j = 0;j < nbatoms + 1;j++) { 665 printf("%02d ", transitions[i * (nbatoms + 1) + j]); 666 } 667 printf("\n"); 668 } 669 printf("\n"); 670 #endif 671 /* 672 * Cleanup of the old data 673 */ 674 if (ret->states != NULL) { 675 for (i = 0;i < ret->nbStates;i++) 676 xmlRegFreeState(ret->states[i]); 677 xmlFree(ret->states); 678 } 679 ret->states = NULL; 680 ret->nbStates = 0; 681 if (ret->atoms != NULL) { 682 for (i = 0;i < ret->nbAtoms;i++) 683 xmlRegFreeAtom(ret->atoms[i]); 684 xmlFree(ret->atoms); 685 } 686 ret->atoms = NULL; 687 ret->nbAtoms = 0; 688 689 ret->compact = transitions; 690 ret->transdata = transdata; 691 ret->stringMap = stringMap; 692 ret->nbstrings = nbatoms; 693 ret->nbstates = nbstates; 694 xmlFree(stateRemap); 695 xmlFree(stringRemap); 696 } 697 not_determ: 698 ctxt->string = NULL; 699 ctxt->nbStates = 0; 700 ctxt->states = NULL; 701 ctxt->nbAtoms = 0; 702 ctxt->atoms = NULL; 703 ctxt->nbCounters = 0; 704 ctxt->counters = NULL; 705 return(ret); 706 } 707 708 /** 709 * xmlRegNewParserCtxt: 710 * @string: the string to parse 711 * 712 * Allocate a new regexp parser context 713 * 714 * Returns the new context or NULL in case of error 715 */ 716 static xmlRegParserCtxtPtr 717 xmlRegNewParserCtxt(const xmlChar *string) { 718 xmlRegParserCtxtPtr ret; 719 720 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt)); 721 if (ret == NULL) 722 return(NULL); 723 memset(ret, 0, sizeof(xmlRegParserCtxt)); 724 if (string != NULL) 725 ret->string = xmlStrdup(string); 726 ret->cur = ret->string; 727 ret->neg = 0; 728 ret->negs = 0; 729 ret->error = 0; 730 ret->determinist = -1; 731 return(ret); 732 } 733 734 /** 735 * xmlRegNewRange: 736 * @ctxt: the regexp parser context 737 * @neg: is that negative 738 * @type: the type of range 739 * @start: the start codepoint 740 * @end: the end codepoint 741 * 742 * Allocate a new regexp range 743 * 744 * Returns the new range or NULL in case of error 745 */ 746 static xmlRegRangePtr 747 xmlRegNewRange(xmlRegParserCtxtPtr ctxt, 748 int neg, xmlRegAtomType type, int start, int end) { 749 xmlRegRangePtr ret; 750 751 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange)); 752 if (ret == NULL) { 753 xmlRegexpErrMemory(ctxt, "allocating range"); 754 return(NULL); 755 } 756 ret->neg = neg; 757 ret->type = type; 758 ret->start = start; 759 ret->end = end; 760 return(ret); 761 } 762 763 /** 764 * xmlRegFreeRange: 765 * @range: the regexp range 766 * 767 * Free a regexp range 768 */ 769 static void 770 xmlRegFreeRange(xmlRegRangePtr range) { 771 if (range == NULL) 772 return; 773 774 if (range->blockName != NULL) 775 xmlFree(range->blockName); 776 xmlFree(range); 777 } 778 779 /** 780 * xmlRegCopyRange: 781 * @range: the regexp range 782 * 783 * Copy a regexp range 784 * 785 * Returns the new copy or NULL in case of error. 786 */ 787 static xmlRegRangePtr 788 xmlRegCopyRange(xmlRegParserCtxtPtr ctxt, xmlRegRangePtr range) { 789 xmlRegRangePtr ret; 790 791 if (range == NULL) 792 return(NULL); 793 794 ret = xmlRegNewRange(ctxt, range->neg, range->type, range->start, 795 range->end); 796 if (ret == NULL) 797 return(NULL); 798 if (range->blockName != NULL) { 799 ret->blockName = xmlStrdup(range->blockName); 800 if (ret->blockName == NULL) { 801 xmlRegexpErrMemory(ctxt, "allocating range"); 802 xmlRegFreeRange(ret); 803 return(NULL); 804 } 805 } 806 return(ret); 807 } 808 809 /** 810 * xmlRegNewAtom: 811 * @ctxt: the regexp parser context 812 * @type: the type of atom 813 * 814 * Allocate a new atom 815 * 816 * Returns the new atom or NULL in case of error 817 */ 818 static xmlRegAtomPtr 819 xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) { 820 xmlRegAtomPtr ret; 821 822 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom)); 823 if (ret == NULL) { 824 xmlRegexpErrMemory(ctxt, "allocating atom"); 825 return(NULL); 826 } 827 memset(ret, 0, sizeof(xmlRegAtom)); 828 ret->type = type; 829 ret->quant = XML_REGEXP_QUANT_ONCE; 830 ret->min = 0; 831 ret->max = 0; 832 return(ret); 833 } 834 835 /** 836 * xmlRegFreeAtom: 837 * @atom: the regexp atom 838 * 839 * Free a regexp atom 840 */ 841 static void 842 xmlRegFreeAtom(xmlRegAtomPtr atom) { 843 int i; 844 845 if (atom == NULL) 846 return; 847 848 for (i = 0;i < atom->nbRanges;i++) 849 xmlRegFreeRange(atom->ranges[i]); 850 if (atom->ranges != NULL) 851 xmlFree(atom->ranges); 852 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL)) 853 xmlFree(atom->valuep); 854 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL)) 855 xmlFree(atom->valuep2); 856 if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL)) 857 xmlFree(atom->valuep); 858 xmlFree(atom); 859 } 860 861 /** 862 * xmlRegCopyAtom: 863 * @ctxt: the regexp parser context 864 * @atom: the original atom 865 * 866 * Allocate a new regexp range 867 * 868 * Returns the new atom or NULL in case of error 869 */ 870 static xmlRegAtomPtr 871 xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) { 872 xmlRegAtomPtr ret; 873 874 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom)); 875 if (ret == NULL) { 876 xmlRegexpErrMemory(ctxt, "copying atom"); 877 return(NULL); 878 } 879 memset(ret, 0, sizeof(xmlRegAtom)); 880 ret->type = atom->type; 881 ret->quant = atom->quant; 882 ret->min = atom->min; 883 ret->max = atom->max; 884 if (atom->nbRanges > 0) { 885 int i; 886 887 ret->ranges = (xmlRegRangePtr *) xmlMalloc(sizeof(xmlRegRangePtr) * 888 atom->nbRanges); 889 if (ret->ranges == NULL) { 890 xmlRegexpErrMemory(ctxt, "copying atom"); 891 goto error; 892 } 893 for (i = 0;i < atom->nbRanges;i++) { 894 ret->ranges[i] = xmlRegCopyRange(ctxt, atom->ranges[i]); 895 if (ret->ranges[i] == NULL) 896 goto error; 897 ret->nbRanges = i + 1; 898 } 899 } 900 return(ret); 901 902 error: 903 xmlRegFreeAtom(ret); 904 return(NULL); 905 } 906 907 static xmlRegStatePtr 908 xmlRegNewState(xmlRegParserCtxtPtr ctxt) { 909 xmlRegStatePtr ret; 910 911 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState)); 912 if (ret == NULL) { 913 xmlRegexpErrMemory(ctxt, "allocating state"); 914 return(NULL); 915 } 916 memset(ret, 0, sizeof(xmlRegState)); 917 ret->type = XML_REGEXP_TRANS_STATE; 918 ret->mark = XML_REGEXP_MARK_NORMAL; 919 return(ret); 920 } 921 922 /** 923 * xmlRegFreeState: 924 * @state: the regexp state 925 * 926 * Free a regexp state 927 */ 928 static void 929 xmlRegFreeState(xmlRegStatePtr state) { 930 if (state == NULL) 931 return; 932 933 if (state->trans != NULL) 934 xmlFree(state->trans); 935 if (state->transTo != NULL) 936 xmlFree(state->transTo); 937 xmlFree(state); 938 } 939 940 /** 941 * xmlRegFreeParserCtxt: 942 * @ctxt: the regexp parser context 943 * 944 * Free a regexp parser context 945 */ 946 static void 947 xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) { 948 int i; 949 if (ctxt == NULL) 950 return; 951 952 if (ctxt->string != NULL) 953 xmlFree(ctxt->string); 954 if (ctxt->states != NULL) { 955 for (i = 0;i < ctxt->nbStates;i++) 956 xmlRegFreeState(ctxt->states[i]); 957 xmlFree(ctxt->states); 958 } 959 if (ctxt->atoms != NULL) { 960 for (i = 0;i < ctxt->nbAtoms;i++) 961 xmlRegFreeAtom(ctxt->atoms[i]); 962 xmlFree(ctxt->atoms); 963 } 964 if (ctxt->counters != NULL) 965 xmlFree(ctxt->counters); 966 xmlFree(ctxt); 967 } 968 969 /************************************************************************ 970 * * 971 * Display of Data structures * 972 * * 973 ************************************************************************/ 974 975 static void 976 xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) { 977 switch (type) { 978 case XML_REGEXP_EPSILON: 979 fprintf(output, "epsilon "); break; 980 case XML_REGEXP_CHARVAL: 981 fprintf(output, "charval "); break; 982 case XML_REGEXP_RANGES: 983 fprintf(output, "ranges "); break; 984 case XML_REGEXP_SUBREG: 985 fprintf(output, "subexpr "); break; 986 case XML_REGEXP_STRING: 987 fprintf(output, "string "); break; 988 case XML_REGEXP_ANYCHAR: 989 fprintf(output, "anychar "); break; 990 case XML_REGEXP_ANYSPACE: 991 fprintf(output, "anyspace "); break; 992 case XML_REGEXP_NOTSPACE: 993 fprintf(output, "notspace "); break; 994 case XML_REGEXP_INITNAME: 995 fprintf(output, "initname "); break; 996 case XML_REGEXP_NOTINITNAME: 997 fprintf(output, "notinitname "); break; 998 case XML_REGEXP_NAMECHAR: 999 fprintf(output, "namechar "); break; 1000 case XML_REGEXP_NOTNAMECHAR: 1001 fprintf(output, "notnamechar "); break; 1002 case XML_REGEXP_DECIMAL: 1003 fprintf(output, "decimal "); break; 1004 case XML_REGEXP_NOTDECIMAL: 1005 fprintf(output, "notdecimal "); break; 1006 case XML_REGEXP_REALCHAR: 1007 fprintf(output, "realchar "); break; 1008 case XML_REGEXP_NOTREALCHAR: 1009 fprintf(output, "notrealchar "); break; 1010 case XML_REGEXP_LETTER: 1011 fprintf(output, "LETTER "); break; 1012 case XML_REGEXP_LETTER_UPPERCASE: 1013 fprintf(output, "LETTER_UPPERCASE "); break; 1014 case XML_REGEXP_LETTER_LOWERCASE: 1015 fprintf(output, "LETTER_LOWERCASE "); break; 1016 case XML_REGEXP_LETTER_TITLECASE: 1017 fprintf(output, "LETTER_TITLECASE "); break; 1018 case XML_REGEXP_LETTER_MODIFIER: 1019 fprintf(output, "LETTER_MODIFIER "); break; 1020 case XML_REGEXP_LETTER_OTHERS: 1021 fprintf(output, "LETTER_OTHERS "); break; 1022 case XML_REGEXP_MARK: 1023 fprintf(output, "MARK "); break; 1024 case XML_REGEXP_MARK_NONSPACING: 1025 fprintf(output, "MARK_NONSPACING "); break; 1026 case XML_REGEXP_MARK_SPACECOMBINING: 1027 fprintf(output, "MARK_SPACECOMBINING "); break; 1028 case XML_REGEXP_MARK_ENCLOSING: 1029 fprintf(output, "MARK_ENCLOSING "); break; 1030 case XML_REGEXP_NUMBER: 1031 fprintf(output, "NUMBER "); break; 1032 case XML_REGEXP_NUMBER_DECIMAL: 1033 fprintf(output, "NUMBER_DECIMAL "); break; 1034 case XML_REGEXP_NUMBER_LETTER: 1035 fprintf(output, "NUMBER_LETTER "); break; 1036 case XML_REGEXP_NUMBER_OTHERS: 1037 fprintf(output, "NUMBER_OTHERS "); break; 1038 case XML_REGEXP_PUNCT: 1039 fprintf(output, "PUNCT "); break; 1040 case XML_REGEXP_PUNCT_CONNECTOR: 1041 fprintf(output, "PUNCT_CONNECTOR "); break; 1042 case XML_REGEXP_PUNCT_DASH: 1043 fprintf(output, "PUNCT_DASH "); break; 1044 case XML_REGEXP_PUNCT_OPEN: 1045 fprintf(output, "PUNCT_OPEN "); break; 1046 case XML_REGEXP_PUNCT_CLOSE: 1047 fprintf(output, "PUNCT_CLOSE "); break; 1048 case XML_REGEXP_PUNCT_INITQUOTE: 1049 fprintf(output, "PUNCT_INITQUOTE "); break; 1050 case XML_REGEXP_PUNCT_FINQUOTE: 1051 fprintf(output, "PUNCT_FINQUOTE "); break; 1052 case XML_REGEXP_PUNCT_OTHERS: 1053 fprintf(output, "PUNCT_OTHERS "); break; 1054 case XML_REGEXP_SEPAR: 1055 fprintf(output, "SEPAR "); break; 1056 case XML_REGEXP_SEPAR_SPACE: 1057 fprintf(output, "SEPAR_SPACE "); break; 1058 case XML_REGEXP_SEPAR_LINE: 1059 fprintf(output, "SEPAR_LINE "); break; 1060 case XML_REGEXP_SEPAR_PARA: 1061 fprintf(output, "SEPAR_PARA "); break; 1062 case XML_REGEXP_SYMBOL: 1063 fprintf(output, "SYMBOL "); break; 1064 case XML_REGEXP_SYMBOL_MATH: 1065 fprintf(output, "SYMBOL_MATH "); break; 1066 case XML_REGEXP_SYMBOL_CURRENCY: 1067 fprintf(output, "SYMBOL_CURRENCY "); break; 1068 case XML_REGEXP_SYMBOL_MODIFIER: 1069 fprintf(output, "SYMBOL_MODIFIER "); break; 1070 case XML_REGEXP_SYMBOL_OTHERS: 1071 fprintf(output, "SYMBOL_OTHERS "); break; 1072 case XML_REGEXP_OTHER: 1073 fprintf(output, "OTHER "); break; 1074 case XML_REGEXP_OTHER_CONTROL: 1075 fprintf(output, "OTHER_CONTROL "); break; 1076 case XML_REGEXP_OTHER_FORMAT: 1077 fprintf(output, "OTHER_FORMAT "); break; 1078 case XML_REGEXP_OTHER_PRIVATE: 1079 fprintf(output, "OTHER_PRIVATE "); break; 1080 case XML_REGEXP_OTHER_NA: 1081 fprintf(output, "OTHER_NA "); break; 1082 case XML_REGEXP_BLOCK_NAME: 1083 fprintf(output, "BLOCK "); break; 1084 } 1085 } 1086 1087 static void 1088 xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) { 1089 switch (type) { 1090 case XML_REGEXP_QUANT_EPSILON: 1091 fprintf(output, "epsilon "); break; 1092 case XML_REGEXP_QUANT_ONCE: 1093 fprintf(output, "once "); break; 1094 case XML_REGEXP_QUANT_OPT: 1095 fprintf(output, "? "); break; 1096 case XML_REGEXP_QUANT_MULT: 1097 fprintf(output, "* "); break; 1098 case XML_REGEXP_QUANT_PLUS: 1099 fprintf(output, "+ "); break; 1100 case XML_REGEXP_QUANT_RANGE: 1101 fprintf(output, "range "); break; 1102 case XML_REGEXP_QUANT_ONCEONLY: 1103 fprintf(output, "onceonly "); break; 1104 case XML_REGEXP_QUANT_ALL: 1105 fprintf(output, "all "); break; 1106 } 1107 } 1108 static void 1109 xmlRegPrintRange(FILE *output, xmlRegRangePtr range) { 1110 fprintf(output, " range: "); 1111 if (range->neg) 1112 fprintf(output, "negative "); 1113 xmlRegPrintAtomType(output, range->type); 1114 fprintf(output, "%c - %c\n", range->start, range->end); 1115 } 1116 1117 static void 1118 xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) { 1119 fprintf(output, " atom: "); 1120 if (atom == NULL) { 1121 fprintf(output, "NULL\n"); 1122 return; 1123 } 1124 if (atom->neg) 1125 fprintf(output, "not "); 1126 xmlRegPrintAtomType(output, atom->type); 1127 xmlRegPrintQuantType(output, atom->quant); 1128 if (atom->quant == XML_REGEXP_QUANT_RANGE) 1129 fprintf(output, "%d-%d ", atom->min, atom->max); 1130 if (atom->type == XML_REGEXP_STRING) 1131 fprintf(output, "'%s' ", (char *) atom->valuep); 1132 if (atom->type == XML_REGEXP_CHARVAL) 1133 fprintf(output, "char %c\n", atom->codepoint); 1134 else if (atom->type == XML_REGEXP_RANGES) { 1135 int i; 1136 fprintf(output, "%d entries\n", atom->nbRanges); 1137 for (i = 0; i < atom->nbRanges;i++) 1138 xmlRegPrintRange(output, atom->ranges[i]); 1139 } else if (atom->type == XML_REGEXP_SUBREG) { 1140 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no); 1141 } else { 1142 fprintf(output, "\n"); 1143 } 1144 } 1145 1146 static void 1147 xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) { 1148 fprintf(output, " trans: "); 1149 if (trans == NULL) { 1150 fprintf(output, "NULL\n"); 1151 return; 1152 } 1153 if (trans->to < 0) { 1154 fprintf(output, "removed\n"); 1155 return; 1156 } 1157 if (trans->nd != 0) { 1158 if (trans->nd == 2) 1159 fprintf(output, "last not determinist, "); 1160 else 1161 fprintf(output, "not determinist, "); 1162 } 1163 if (trans->counter >= 0) { 1164 fprintf(output, "counted %d, ", trans->counter); 1165 } 1166 if (trans->count == REGEXP_ALL_COUNTER) { 1167 fprintf(output, "all transition, "); 1168 } else if (trans->count >= 0) { 1169 fprintf(output, "count based %d, ", trans->count); 1170 } 1171 if (trans->atom == NULL) { 1172 fprintf(output, "epsilon to %d\n", trans->to); 1173 return; 1174 } 1175 if (trans->atom->type == XML_REGEXP_CHARVAL) 1176 fprintf(output, "char %c ", trans->atom->codepoint); 1177 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to); 1178 } 1179 1180 static void 1181 xmlRegPrintState(FILE *output, xmlRegStatePtr state) { 1182 int i; 1183 1184 fprintf(output, " state: "); 1185 if (state == NULL) { 1186 fprintf(output, "NULL\n"); 1187 return; 1188 } 1189 if (state->type == XML_REGEXP_START_STATE) 1190 fprintf(output, "START "); 1191 if (state->type == XML_REGEXP_FINAL_STATE) 1192 fprintf(output, "FINAL "); 1193 1194 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans); 1195 for (i = 0;i < state->nbTrans; i++) { 1196 xmlRegPrintTrans(output, &(state->trans[i])); 1197 } 1198 } 1199 1200 #ifdef DEBUG_REGEXP_GRAPH 1201 static void 1202 xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) { 1203 int i; 1204 1205 fprintf(output, " ctxt: "); 1206 if (ctxt == NULL) { 1207 fprintf(output, "NULL\n"); 1208 return; 1209 } 1210 fprintf(output, "'%s' ", ctxt->string); 1211 if (ctxt->error) 1212 fprintf(output, "error "); 1213 if (ctxt->neg) 1214 fprintf(output, "neg "); 1215 fprintf(output, "\n"); 1216 fprintf(output, "%d atoms:\n", ctxt->nbAtoms); 1217 for (i = 0;i < ctxt->nbAtoms; i++) { 1218 fprintf(output, " %02d ", i); 1219 xmlRegPrintAtom(output, ctxt->atoms[i]); 1220 } 1221 if (ctxt->atom != NULL) { 1222 fprintf(output, "current atom:\n"); 1223 xmlRegPrintAtom(output, ctxt->atom); 1224 } 1225 fprintf(output, "%d states:", ctxt->nbStates); 1226 if (ctxt->start != NULL) 1227 fprintf(output, " start: %d", ctxt->start->no); 1228 if (ctxt->end != NULL) 1229 fprintf(output, " end: %d", ctxt->end->no); 1230 fprintf(output, "\n"); 1231 for (i = 0;i < ctxt->nbStates; i++) { 1232 xmlRegPrintState(output, ctxt->states[i]); 1233 } 1234 fprintf(output, "%d counters:\n", ctxt->nbCounters); 1235 for (i = 0;i < ctxt->nbCounters; i++) { 1236 fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min, 1237 ctxt->counters[i].max); 1238 } 1239 } 1240 #endif 1241 1242 /************************************************************************ 1243 * * 1244 * Finite Automata structures manipulations * 1245 * * 1246 ************************************************************************/ 1247 1248 static void 1249 xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom, 1250 int neg, xmlRegAtomType type, int start, int end, 1251 xmlChar *blockName) { 1252 xmlRegRangePtr range; 1253 1254 if (atom == NULL) { 1255 ERROR("add range: atom is NULL"); 1256 return; 1257 } 1258 if (atom->type != XML_REGEXP_RANGES) { 1259 ERROR("add range: atom is not ranges"); 1260 return; 1261 } 1262 if (atom->maxRanges == 0) { 1263 atom->maxRanges = 4; 1264 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges * 1265 sizeof(xmlRegRangePtr)); 1266 if (atom->ranges == NULL) { 1267 xmlRegexpErrMemory(ctxt, "adding ranges"); 1268 atom->maxRanges = 0; 1269 return; 1270 } 1271 } else if (atom->nbRanges >= atom->maxRanges) { 1272 xmlRegRangePtr *tmp; 1273 atom->maxRanges *= 2; 1274 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges * 1275 sizeof(xmlRegRangePtr)); 1276 if (tmp == NULL) { 1277 xmlRegexpErrMemory(ctxt, "adding ranges"); 1278 atom->maxRanges /= 2; 1279 return; 1280 } 1281 atom->ranges = tmp; 1282 } 1283 range = xmlRegNewRange(ctxt, neg, type, start, end); 1284 if (range == NULL) 1285 return; 1286 range->blockName = blockName; 1287 atom->ranges[atom->nbRanges++] = range; 1288 1289 } 1290 1291 static int 1292 xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) { 1293 if (ctxt->maxCounters == 0) { 1294 ctxt->maxCounters = 4; 1295 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters * 1296 sizeof(xmlRegCounter)); 1297 if (ctxt->counters == NULL) { 1298 xmlRegexpErrMemory(ctxt, "allocating counter"); 1299 ctxt->maxCounters = 0; 1300 return(-1); 1301 } 1302 } else if (ctxt->nbCounters >= ctxt->maxCounters) { 1303 xmlRegCounter *tmp; 1304 ctxt->maxCounters *= 2; 1305 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters * 1306 sizeof(xmlRegCounter)); 1307 if (tmp == NULL) { 1308 xmlRegexpErrMemory(ctxt, "allocating counter"); 1309 ctxt->maxCounters /= 2; 1310 return(-1); 1311 } 1312 ctxt->counters = tmp; 1313 } 1314 ctxt->counters[ctxt->nbCounters].min = -1; 1315 ctxt->counters[ctxt->nbCounters].max = -1; 1316 return(ctxt->nbCounters++); 1317 } 1318 1319 static int 1320 xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) { 1321 if (atom == NULL) { 1322 ERROR("atom push: atom is NULL"); 1323 return(-1); 1324 } 1325 if (ctxt->maxAtoms == 0) { 1326 ctxt->maxAtoms = 4; 1327 ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms * 1328 sizeof(xmlRegAtomPtr)); 1329 if (ctxt->atoms == NULL) { 1330 xmlRegexpErrMemory(ctxt, "pushing atom"); 1331 ctxt->maxAtoms = 0; 1332 return(-1); 1333 } 1334 } else if (ctxt->nbAtoms >= ctxt->maxAtoms) { 1335 xmlRegAtomPtr *tmp; 1336 ctxt->maxAtoms *= 2; 1337 tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms * 1338 sizeof(xmlRegAtomPtr)); 1339 if (tmp == NULL) { 1340 xmlRegexpErrMemory(ctxt, "allocating counter"); 1341 ctxt->maxAtoms /= 2; 1342 return(-1); 1343 } 1344 ctxt->atoms = tmp; 1345 } 1346 atom->no = ctxt->nbAtoms; 1347 ctxt->atoms[ctxt->nbAtoms++] = atom; 1348 return(0); 1349 } 1350 1351 static void 1352 xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target, 1353 int from) { 1354 if (target->maxTransTo == 0) { 1355 target->maxTransTo = 8; 1356 target->transTo = (int *) xmlMalloc(target->maxTransTo * 1357 sizeof(int)); 1358 if (target->transTo == NULL) { 1359 xmlRegexpErrMemory(ctxt, "adding transition"); 1360 target->maxTransTo = 0; 1361 return; 1362 } 1363 } else if (target->nbTransTo >= target->maxTransTo) { 1364 int *tmp; 1365 target->maxTransTo *= 2; 1366 tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo * 1367 sizeof(int)); 1368 if (tmp == NULL) { 1369 xmlRegexpErrMemory(ctxt, "adding transition"); 1370 target->maxTransTo /= 2; 1371 return; 1372 } 1373 target->transTo = tmp; 1374 } 1375 target->transTo[target->nbTransTo] = from; 1376 target->nbTransTo++; 1377 } 1378 1379 static void 1380 xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 1381 xmlRegAtomPtr atom, xmlRegStatePtr target, 1382 int counter, int count) { 1383 1384 int nrtrans; 1385 1386 if (state == NULL) { 1387 ERROR("add state: state is NULL"); 1388 return; 1389 } 1390 if (target == NULL) { 1391 ERROR("add state: target is NULL"); 1392 return; 1393 } 1394 /* 1395 * Other routines follow the philosophy 'When in doubt, add a transition' 1396 * so we check here whether such a transition is already present and, if 1397 * so, silently ignore this request. 1398 */ 1399 1400 for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) { 1401 xmlRegTransPtr trans = &(state->trans[nrtrans]); 1402 if ((trans->atom == atom) && 1403 (trans->to == target->no) && 1404 (trans->counter == counter) && 1405 (trans->count == count)) { 1406 #ifdef DEBUG_REGEXP_GRAPH 1407 printf("Ignoring duplicate transition from %d to %d\n", 1408 state->no, target->no); 1409 #endif 1410 return; 1411 } 1412 } 1413 1414 if (state->maxTrans == 0) { 1415 state->maxTrans = 8; 1416 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans * 1417 sizeof(xmlRegTrans)); 1418 if (state->trans == NULL) { 1419 xmlRegexpErrMemory(ctxt, "adding transition"); 1420 state->maxTrans = 0; 1421 return; 1422 } 1423 } else if (state->nbTrans >= state->maxTrans) { 1424 xmlRegTrans *tmp; 1425 state->maxTrans *= 2; 1426 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans * 1427 sizeof(xmlRegTrans)); 1428 if (tmp == NULL) { 1429 xmlRegexpErrMemory(ctxt, "adding transition"); 1430 state->maxTrans /= 2; 1431 return; 1432 } 1433 state->trans = tmp; 1434 } 1435 #ifdef DEBUG_REGEXP_GRAPH 1436 printf("Add trans from %d to %d ", state->no, target->no); 1437 if (count == REGEXP_ALL_COUNTER) 1438 printf("all transition\n"); 1439 else if (count >= 0) 1440 printf("count based %d\n", count); 1441 else if (counter >= 0) 1442 printf("counted %d\n", counter); 1443 else if (atom == NULL) 1444 printf("epsilon transition\n"); 1445 else if (atom != NULL) 1446 xmlRegPrintAtom(stdout, atom); 1447 #endif 1448 1449 state->trans[state->nbTrans].atom = atom; 1450 state->trans[state->nbTrans].to = target->no; 1451 state->trans[state->nbTrans].counter = counter; 1452 state->trans[state->nbTrans].count = count; 1453 state->trans[state->nbTrans].nd = 0; 1454 state->nbTrans++; 1455 xmlRegStateAddTransTo(ctxt, target, state->no); 1456 } 1457 1458 static int 1459 xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) { 1460 if (state == NULL) return(-1); 1461 if (ctxt->maxStates == 0) { 1462 ctxt->maxStates = 4; 1463 ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates * 1464 sizeof(xmlRegStatePtr)); 1465 if (ctxt->states == NULL) { 1466 xmlRegexpErrMemory(ctxt, "adding state"); 1467 ctxt->maxStates = 0; 1468 return(-1); 1469 } 1470 } else if (ctxt->nbStates >= ctxt->maxStates) { 1471 xmlRegStatePtr *tmp; 1472 ctxt->maxStates *= 2; 1473 tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates * 1474 sizeof(xmlRegStatePtr)); 1475 if (tmp == NULL) { 1476 xmlRegexpErrMemory(ctxt, "adding state"); 1477 ctxt->maxStates /= 2; 1478 return(-1); 1479 } 1480 ctxt->states = tmp; 1481 } 1482 state->no = ctxt->nbStates; 1483 ctxt->states[ctxt->nbStates++] = state; 1484 return(0); 1485 } 1486 1487 /** 1488 * xmlFAGenerateAllTransition: 1489 * @ctxt: a regexp parser context 1490 * @from: the from state 1491 * @to: the target state or NULL for building a new one 1492 * @lax: 1493 * 1494 */ 1495 static void 1496 xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt, 1497 xmlRegStatePtr from, xmlRegStatePtr to, 1498 int lax) { 1499 if (to == NULL) { 1500 to = xmlRegNewState(ctxt); 1501 xmlRegStatePush(ctxt, to); 1502 ctxt->state = to; 1503 } 1504 if (lax) 1505 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER); 1506 else 1507 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER); 1508 } 1509 1510 /** 1511 * xmlFAGenerateEpsilonTransition: 1512 * @ctxt: a regexp parser context 1513 * @from: the from state 1514 * @to: the target state or NULL for building a new one 1515 * 1516 */ 1517 static void 1518 xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt, 1519 xmlRegStatePtr from, xmlRegStatePtr to) { 1520 if (to == NULL) { 1521 to = xmlRegNewState(ctxt); 1522 xmlRegStatePush(ctxt, to); 1523 ctxt->state = to; 1524 } 1525 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1); 1526 } 1527 1528 /** 1529 * xmlFAGenerateCountedEpsilonTransition: 1530 * @ctxt: a regexp parser context 1531 * @from: the from state 1532 * @to: the target state or NULL for building a new one 1533 * counter: the counter for that transition 1534 * 1535 */ 1536 static void 1537 xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt, 1538 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1539 if (to == NULL) { 1540 to = xmlRegNewState(ctxt); 1541 xmlRegStatePush(ctxt, to); 1542 ctxt->state = to; 1543 } 1544 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1); 1545 } 1546 1547 /** 1548 * xmlFAGenerateCountedTransition: 1549 * @ctxt: a regexp parser context 1550 * @from: the from state 1551 * @to: the target state or NULL for building a new one 1552 * counter: the counter for that transition 1553 * 1554 */ 1555 static void 1556 xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt, 1557 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1558 if (to == NULL) { 1559 to = xmlRegNewState(ctxt); 1560 xmlRegStatePush(ctxt, to); 1561 ctxt->state = to; 1562 } 1563 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter); 1564 } 1565 1566 /** 1567 * xmlFAGenerateTransitions: 1568 * @ctxt: a regexp parser context 1569 * @from: the from state 1570 * @to: the target state or NULL for building a new one 1571 * @atom: the atom generating the transition 1572 * 1573 * Returns 0 if success and -1 in case of error. 1574 */ 1575 static int 1576 xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, 1577 xmlRegStatePtr to, xmlRegAtomPtr atom) { 1578 xmlRegStatePtr end; 1579 int nullable = 0; 1580 1581 if (atom == NULL) { 1582 ERROR("generate transition: atom == NULL"); 1583 return(-1); 1584 } 1585 if (atom->type == XML_REGEXP_SUBREG) { 1586 /* 1587 * this is a subexpression handling one should not need to 1588 * create a new node except for XML_REGEXP_QUANT_RANGE. 1589 */ 1590 if (xmlRegAtomPush(ctxt, atom) < 0) { 1591 return(-1); 1592 } 1593 if ((to != NULL) && (atom->stop != to) && 1594 (atom->quant != XML_REGEXP_QUANT_RANGE)) { 1595 /* 1596 * Generate an epsilon transition to link to the target 1597 */ 1598 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); 1599 #ifdef DV 1600 } else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) && 1601 (atom->quant != XML_REGEXP_QUANT_ONCE)) { 1602 to = xmlRegNewState(ctxt); 1603 xmlRegStatePush(ctxt, to); 1604 ctxt->state = to; 1605 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); 1606 #endif 1607 } 1608 switch (atom->quant) { 1609 case XML_REGEXP_QUANT_OPT: 1610 atom->quant = XML_REGEXP_QUANT_ONCE; 1611 /* 1612 * transition done to the state after end of atom. 1613 * 1. set transition from atom start to new state 1614 * 2. set transition from atom end to this state. 1615 */ 1616 if (to == NULL) { 1617 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0); 1618 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, 1619 ctxt->state); 1620 } else { 1621 xmlFAGenerateEpsilonTransition(ctxt, atom->start, to); 1622 } 1623 break; 1624 case XML_REGEXP_QUANT_MULT: 1625 atom->quant = XML_REGEXP_QUANT_ONCE; 1626 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop); 1627 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1628 break; 1629 case XML_REGEXP_QUANT_PLUS: 1630 atom->quant = XML_REGEXP_QUANT_ONCE; 1631 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1632 break; 1633 case XML_REGEXP_QUANT_RANGE: { 1634 int counter; 1635 xmlRegStatePtr inter, newstate; 1636 1637 /* 1638 * create the final state now if needed 1639 */ 1640 if (to != NULL) { 1641 newstate = to; 1642 } else { 1643 newstate = xmlRegNewState(ctxt); 1644 xmlRegStatePush(ctxt, newstate); 1645 } 1646 1647 /* 1648 * The principle here is to use counted transition 1649 * to avoid explosion in the number of states in the 1650 * graph. This is clearly more complex but should not 1651 * be exploitable at runtime. 1652 */ 1653 if ((atom->min == 0) && (atom->start0 == NULL)) { 1654 xmlRegAtomPtr copy; 1655 /* 1656 * duplicate a transition based on atom to count next 1657 * occurrences after 1. We cannot loop to atom->start 1658 * directly because we need an epsilon transition to 1659 * newstate. 1660 */ 1661 /* ???? For some reason it seems we never reach that 1662 case, I suppose this got optimized out before when 1663 building the automata */ 1664 copy = xmlRegCopyAtom(ctxt, atom); 1665 if (copy == NULL) 1666 return(-1); 1667 copy->quant = XML_REGEXP_QUANT_ONCE; 1668 copy->min = 0; 1669 copy->max = 0; 1670 1671 if (xmlFAGenerateTransitions(ctxt, atom->start, NULL, copy) 1672 < 0) 1673 return(-1); 1674 inter = ctxt->state; 1675 counter = xmlRegGetCounter(ctxt); 1676 ctxt->counters[counter].min = atom->min - 1; 1677 ctxt->counters[counter].max = atom->max - 1; 1678 /* count the number of times we see it again */ 1679 xmlFAGenerateCountedEpsilonTransition(ctxt, inter, 1680 atom->stop, counter); 1681 /* allow a way out based on the count */ 1682 xmlFAGenerateCountedTransition(ctxt, inter, 1683 newstate, counter); 1684 /* and also allow a direct exit for 0 */ 1685 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 1686 newstate); 1687 } else { 1688 /* 1689 * either we need the atom at least once or there 1690 * is an atom->start0 allowing to easily plug the 1691 * epsilon transition. 1692 */ 1693 counter = xmlRegGetCounter(ctxt); 1694 ctxt->counters[counter].min = atom->min - 1; 1695 ctxt->counters[counter].max = atom->max - 1; 1696 /* count the number of times we see it again */ 1697 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop, 1698 atom->start, counter); 1699 /* allow a way out based on the count */ 1700 xmlFAGenerateCountedTransition(ctxt, atom->stop, 1701 newstate, counter); 1702 /* and if needed allow a direct exit for 0 */ 1703 if (atom->min == 0) 1704 xmlFAGenerateEpsilonTransition(ctxt, atom->start0, 1705 newstate); 1706 1707 } 1708 atom->min = 0; 1709 atom->max = 0; 1710 atom->quant = XML_REGEXP_QUANT_ONCE; 1711 ctxt->state = newstate; 1712 } 1713 default: 1714 break; 1715 } 1716 return(0); 1717 } 1718 if ((atom->min == 0) && (atom->max == 0) && 1719 (atom->quant == XML_REGEXP_QUANT_RANGE)) { 1720 /* 1721 * we can discard the atom and generate an epsilon transition instead 1722 */ 1723 if (to == NULL) { 1724 to = xmlRegNewState(ctxt); 1725 if (to != NULL) 1726 xmlRegStatePush(ctxt, to); 1727 else { 1728 return(-1); 1729 } 1730 } 1731 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1732 ctxt->state = to; 1733 xmlRegFreeAtom(atom); 1734 return(0); 1735 } 1736 if (to == NULL) { 1737 to = xmlRegNewState(ctxt); 1738 if (to != NULL) 1739 xmlRegStatePush(ctxt, to); 1740 else { 1741 return(-1); 1742 } 1743 } 1744 end = to; 1745 if ((atom->quant == XML_REGEXP_QUANT_MULT) || 1746 (atom->quant == XML_REGEXP_QUANT_PLUS)) { 1747 /* 1748 * Do not pollute the target state by adding transitions from 1749 * it as it is likely to be the shared target of multiple branches. 1750 * So isolate with an epsilon transition. 1751 */ 1752 xmlRegStatePtr tmp; 1753 1754 tmp = xmlRegNewState(ctxt); 1755 if (tmp != NULL) 1756 xmlRegStatePush(ctxt, tmp); 1757 else { 1758 return(-1); 1759 } 1760 xmlFAGenerateEpsilonTransition(ctxt, tmp, to); 1761 to = tmp; 1762 } 1763 if (xmlRegAtomPush(ctxt, atom) < 0) { 1764 return(-1); 1765 } 1766 if ((atom->quant == XML_REGEXP_QUANT_RANGE) && 1767 (atom->min == 0) && (atom->max > 0)) { 1768 nullable = 1; 1769 atom->min = 1; 1770 if (atom->max == 1) 1771 atom->quant = XML_REGEXP_QUANT_OPT; 1772 } 1773 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1); 1774 ctxt->state = end; 1775 switch (atom->quant) { 1776 case XML_REGEXP_QUANT_OPT: 1777 atom->quant = XML_REGEXP_QUANT_ONCE; 1778 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1779 break; 1780 case XML_REGEXP_QUANT_MULT: 1781 atom->quant = XML_REGEXP_QUANT_ONCE; 1782 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1783 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); 1784 break; 1785 case XML_REGEXP_QUANT_PLUS: 1786 atom->quant = XML_REGEXP_QUANT_ONCE; 1787 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); 1788 break; 1789 case XML_REGEXP_QUANT_RANGE: 1790 if (nullable) 1791 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1792 break; 1793 default: 1794 break; 1795 } 1796 return(0); 1797 } 1798 1799 /** 1800 * xmlFAReduceEpsilonTransitions: 1801 * @ctxt: a regexp parser context 1802 * @fromnr: the from state 1803 * @tonr: the to state 1804 * @counter: should that transition be associated to a counted 1805 * 1806 */ 1807 static void 1808 xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr, 1809 int tonr, int counter) { 1810 int transnr; 1811 xmlRegStatePtr from; 1812 xmlRegStatePtr to; 1813 1814 #ifdef DEBUG_REGEXP_GRAPH 1815 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr); 1816 #endif 1817 from = ctxt->states[fromnr]; 1818 if (from == NULL) 1819 return; 1820 to = ctxt->states[tonr]; 1821 if (to == NULL) 1822 return; 1823 if ((to->mark == XML_REGEXP_MARK_START) || 1824 (to->mark == XML_REGEXP_MARK_VISITED)) 1825 return; 1826 1827 to->mark = XML_REGEXP_MARK_VISITED; 1828 if (to->type == XML_REGEXP_FINAL_STATE) { 1829 #ifdef DEBUG_REGEXP_GRAPH 1830 printf("State %d is final, so %d becomes final\n", tonr, fromnr); 1831 #endif 1832 from->type = XML_REGEXP_FINAL_STATE; 1833 } 1834 for (transnr = 0;transnr < to->nbTrans;transnr++) { 1835 if (to->trans[transnr].to < 0) 1836 continue; 1837 if (to->trans[transnr].atom == NULL) { 1838 /* 1839 * Don't remove counted transitions 1840 * Don't loop either 1841 */ 1842 if (to->trans[transnr].to != fromnr) { 1843 if (to->trans[transnr].count >= 0) { 1844 int newto = to->trans[transnr].to; 1845 1846 xmlRegStateAddTrans(ctxt, from, NULL, 1847 ctxt->states[newto], 1848 -1, to->trans[transnr].count); 1849 } else { 1850 #ifdef DEBUG_REGEXP_GRAPH 1851 printf("Found epsilon trans %d from %d to %d\n", 1852 transnr, tonr, to->trans[transnr].to); 1853 #endif 1854 if (to->trans[transnr].counter >= 0) { 1855 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1856 to->trans[transnr].to, 1857 to->trans[transnr].counter); 1858 } else { 1859 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1860 to->trans[transnr].to, 1861 counter); 1862 } 1863 } 1864 } 1865 } else { 1866 int newto = to->trans[transnr].to; 1867 1868 if (to->trans[transnr].counter >= 0) { 1869 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1870 ctxt->states[newto], 1871 to->trans[transnr].counter, -1); 1872 } else { 1873 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1874 ctxt->states[newto], counter, -1); 1875 } 1876 } 1877 } 1878 to->mark = XML_REGEXP_MARK_NORMAL; 1879 } 1880 1881 /** 1882 * xmlFAEliminateSimpleEpsilonTransitions: 1883 * @ctxt: a regexp parser context 1884 * 1885 * Eliminating general epsilon transitions can get costly in the general 1886 * algorithm due to the large amount of generated new transitions and 1887 * associated comparisons. However for simple epsilon transition used just 1888 * to separate building blocks when generating the automata this can be 1889 * reduced to state elimination: 1890 * - if there exists an epsilon from X to Y 1891 * - if there is no other transition from X 1892 * then X and Y are semantically equivalent and X can be eliminated 1893 * If X is the start state then make Y the start state, else replace the 1894 * target of all transitions to X by transitions to Y. 1895 */ 1896 static void 1897 xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { 1898 int statenr, i, j, newto; 1899 xmlRegStatePtr state, tmp; 1900 1901 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1902 state = ctxt->states[statenr]; 1903 if (state == NULL) 1904 continue; 1905 if (state->nbTrans != 1) 1906 continue; 1907 if (state->type == XML_REGEXP_UNREACH_STATE) 1908 continue; 1909 /* is the only transition out a basic transition */ 1910 if ((state->trans[0].atom == NULL) && 1911 (state->trans[0].to >= 0) && 1912 (state->trans[0].to != statenr) && 1913 (state->trans[0].counter < 0) && 1914 (state->trans[0].count < 0)) { 1915 newto = state->trans[0].to; 1916 1917 if (state->type == XML_REGEXP_START_STATE) { 1918 #ifdef DEBUG_REGEXP_GRAPH 1919 printf("Found simple epsilon trans from start %d to %d\n", 1920 statenr, newto); 1921 #endif 1922 } else { 1923 #ifdef DEBUG_REGEXP_GRAPH 1924 printf("Found simple epsilon trans from %d to %d\n", 1925 statenr, newto); 1926 #endif 1927 for (i = 0;i < state->nbTransTo;i++) { 1928 tmp = ctxt->states[state->transTo[i]]; 1929 for (j = 0;j < tmp->nbTrans;j++) { 1930 if (tmp->trans[j].to == statenr) { 1931 #ifdef DEBUG_REGEXP_GRAPH 1932 printf("Changed transition %d on %d to go to %d\n", 1933 j, tmp->no, newto); 1934 #endif 1935 tmp->trans[j].to = -1; 1936 xmlRegStateAddTrans(ctxt, tmp, tmp->trans[j].atom, 1937 ctxt->states[newto], 1938 tmp->trans[j].counter, 1939 tmp->trans[j].count); 1940 } 1941 } 1942 } 1943 if (state->type == XML_REGEXP_FINAL_STATE) 1944 ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE; 1945 /* eliminate the transition completely */ 1946 state->nbTrans = 0; 1947 1948 state->type = XML_REGEXP_UNREACH_STATE; 1949 1950 } 1951 1952 } 1953 } 1954 } 1955 /** 1956 * xmlFAEliminateEpsilonTransitions: 1957 * @ctxt: a regexp parser context 1958 * 1959 */ 1960 static void 1961 xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { 1962 int statenr, transnr; 1963 xmlRegStatePtr state; 1964 int has_epsilon; 1965 1966 if (ctxt->states == NULL) return; 1967 1968 /* 1969 * Eliminate simple epsilon transition and the associated unreachable 1970 * states. 1971 */ 1972 xmlFAEliminateSimpleEpsilonTransitions(ctxt); 1973 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1974 state = ctxt->states[statenr]; 1975 if ((state != NULL) && (state->type == XML_REGEXP_UNREACH_STATE)) { 1976 #ifdef DEBUG_REGEXP_GRAPH 1977 printf("Removed unreachable state %d\n", statenr); 1978 #endif 1979 xmlRegFreeState(state); 1980 ctxt->states[statenr] = NULL; 1981 } 1982 } 1983 1984 has_epsilon = 0; 1985 1986 /* 1987 * Build the completed transitions bypassing the epsilons 1988 * Use a marking algorithm to avoid loops 1989 * Mark sink states too. 1990 * Process from the latest states backward to the start when 1991 * there is long cascading epsilon chains this minimize the 1992 * recursions and transition compares when adding the new ones 1993 */ 1994 for (statenr = ctxt->nbStates - 1;statenr >= 0;statenr--) { 1995 state = ctxt->states[statenr]; 1996 if (state == NULL) 1997 continue; 1998 if ((state->nbTrans == 0) && 1999 (state->type != XML_REGEXP_FINAL_STATE)) { 2000 state->type = XML_REGEXP_SINK_STATE; 2001 } 2002 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2003 if ((state->trans[transnr].atom == NULL) && 2004 (state->trans[transnr].to >= 0)) { 2005 if (state->trans[transnr].to == statenr) { 2006 state->trans[transnr].to = -1; 2007 #ifdef DEBUG_REGEXP_GRAPH 2008 printf("Removed loopback epsilon trans %d on %d\n", 2009 transnr, statenr); 2010 #endif 2011 } else if (state->trans[transnr].count < 0) { 2012 int newto = state->trans[transnr].to; 2013 2014 #ifdef DEBUG_REGEXP_GRAPH 2015 printf("Found epsilon trans %d from %d to %d\n", 2016 transnr, statenr, newto); 2017 #endif 2018 has_epsilon = 1; 2019 state->trans[transnr].to = -2; 2020 state->mark = XML_REGEXP_MARK_START; 2021 xmlFAReduceEpsilonTransitions(ctxt, statenr, 2022 newto, state->trans[transnr].counter); 2023 state->mark = XML_REGEXP_MARK_NORMAL; 2024 #ifdef DEBUG_REGEXP_GRAPH 2025 } else { 2026 printf("Found counted transition %d on %d\n", 2027 transnr, statenr); 2028 #endif 2029 } 2030 } 2031 } 2032 } 2033 /* 2034 * Eliminate the epsilon transitions 2035 */ 2036 if (has_epsilon) { 2037 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2038 state = ctxt->states[statenr]; 2039 if (state == NULL) 2040 continue; 2041 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2042 xmlRegTransPtr trans = &(state->trans[transnr]); 2043 if ((trans->atom == NULL) && 2044 (trans->count < 0) && 2045 (trans->to >= 0)) { 2046 trans->to = -1; 2047 } 2048 } 2049 } 2050 } 2051 2052 /* 2053 * Use this pass to detect unreachable states too 2054 */ 2055 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2056 state = ctxt->states[statenr]; 2057 if (state != NULL) 2058 state->reached = XML_REGEXP_MARK_NORMAL; 2059 } 2060 state = ctxt->states[0]; 2061 if (state != NULL) 2062 state->reached = XML_REGEXP_MARK_START; 2063 while (state != NULL) { 2064 xmlRegStatePtr target = NULL; 2065 state->reached = XML_REGEXP_MARK_VISITED; 2066 /* 2067 * Mark all states reachable from the current reachable state 2068 */ 2069 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2070 if ((state->trans[transnr].to >= 0) && 2071 ((state->trans[transnr].atom != NULL) || 2072 (state->trans[transnr].count >= 0))) { 2073 int newto = state->trans[transnr].to; 2074 2075 if (ctxt->states[newto] == NULL) 2076 continue; 2077 if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) { 2078 ctxt->states[newto]->reached = XML_REGEXP_MARK_START; 2079 target = ctxt->states[newto]; 2080 } 2081 } 2082 } 2083 2084 /* 2085 * find the next accessible state not explored 2086 */ 2087 if (target == NULL) { 2088 for (statenr = 1;statenr < ctxt->nbStates;statenr++) { 2089 state = ctxt->states[statenr]; 2090 if ((state != NULL) && (state->reached == 2091 XML_REGEXP_MARK_START)) { 2092 target = state; 2093 break; 2094 } 2095 } 2096 } 2097 state = target; 2098 } 2099 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2100 state = ctxt->states[statenr]; 2101 if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) { 2102 #ifdef DEBUG_REGEXP_GRAPH 2103 printf("Removed unreachable state %d\n", statenr); 2104 #endif 2105 xmlRegFreeState(state); 2106 ctxt->states[statenr] = NULL; 2107 } 2108 } 2109 2110 } 2111 2112 static int 2113 xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) { 2114 int ret = 0; 2115 2116 if ((range1->type == XML_REGEXP_RANGES) || 2117 (range2->type == XML_REGEXP_RANGES) || 2118 (range2->type == XML_REGEXP_SUBREG) || 2119 (range1->type == XML_REGEXP_SUBREG) || 2120 (range1->type == XML_REGEXP_STRING) || 2121 (range2->type == XML_REGEXP_STRING)) 2122 return(-1); 2123 2124 /* put them in order */ 2125 if (range1->type > range2->type) { 2126 xmlRegRangePtr tmp; 2127 2128 tmp = range1; 2129 range1 = range2; 2130 range2 = tmp; 2131 } 2132 if ((range1->type == XML_REGEXP_ANYCHAR) || 2133 (range2->type == XML_REGEXP_ANYCHAR)) { 2134 ret = 1; 2135 } else if ((range1->type == XML_REGEXP_EPSILON) || 2136 (range2->type == XML_REGEXP_EPSILON)) { 2137 return(0); 2138 } else if (range1->type == range2->type) { 2139 if (range1->type != XML_REGEXP_CHARVAL) 2140 ret = 1; 2141 else if ((range1->end < range2->start) || 2142 (range2->end < range1->start)) 2143 ret = 0; 2144 else 2145 ret = 1; 2146 } else if (range1->type == XML_REGEXP_CHARVAL) { 2147 int codepoint; 2148 int neg = 0; 2149 2150 /* 2151 * just check all codepoints in the range for acceptance, 2152 * this is usually way cheaper since done only once at 2153 * compilation than testing over and over at runtime or 2154 * pushing too many states when evaluating. 2155 */ 2156 if (((range1->neg == 0) && (range2->neg != 0)) || 2157 ((range1->neg != 0) && (range2->neg == 0))) 2158 neg = 1; 2159 2160 for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) { 2161 ret = xmlRegCheckCharacterRange(range2->type, codepoint, 2162 0, range2->start, range2->end, 2163 range2->blockName); 2164 if (ret < 0) 2165 return(-1); 2166 if (((neg == 1) && (ret == 0)) || 2167 ((neg == 0) && (ret == 1))) 2168 return(1); 2169 } 2170 return(0); 2171 } else if ((range1->type == XML_REGEXP_BLOCK_NAME) || 2172 (range2->type == XML_REGEXP_BLOCK_NAME)) { 2173 if (range1->type == range2->type) { 2174 ret = xmlStrEqual(range1->blockName, range2->blockName); 2175 } else { 2176 /* 2177 * comparing a block range with anything else is way 2178 * too costly, and maintaining the table is like too much 2179 * memory too, so let's force the automata to save state 2180 * here. 2181 */ 2182 return(1); 2183 } 2184 } else if ((range1->type < XML_REGEXP_LETTER) || 2185 (range2->type < XML_REGEXP_LETTER)) { 2186 if ((range1->type == XML_REGEXP_ANYSPACE) && 2187 (range2->type == XML_REGEXP_NOTSPACE)) 2188 ret = 0; 2189 else if ((range1->type == XML_REGEXP_INITNAME) && 2190 (range2->type == XML_REGEXP_NOTINITNAME)) 2191 ret = 0; 2192 else if ((range1->type == XML_REGEXP_NAMECHAR) && 2193 (range2->type == XML_REGEXP_NOTNAMECHAR)) 2194 ret = 0; 2195 else if ((range1->type == XML_REGEXP_DECIMAL) && 2196 (range2->type == XML_REGEXP_NOTDECIMAL)) 2197 ret = 0; 2198 else if ((range1->type == XML_REGEXP_REALCHAR) && 2199 (range2->type == XML_REGEXP_NOTREALCHAR)) 2200 ret = 0; 2201 else { 2202 /* same thing to limit complexity */ 2203 return(1); 2204 } 2205 } else { 2206 ret = 0; 2207 /* range1->type < range2->type here */ 2208 switch (range1->type) { 2209 case XML_REGEXP_LETTER: 2210 /* all disjoint except in the subgroups */ 2211 if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) || 2212 (range2->type == XML_REGEXP_LETTER_LOWERCASE) || 2213 (range2->type == XML_REGEXP_LETTER_TITLECASE) || 2214 (range2->type == XML_REGEXP_LETTER_MODIFIER) || 2215 (range2->type == XML_REGEXP_LETTER_OTHERS)) 2216 ret = 1; 2217 break; 2218 case XML_REGEXP_MARK: 2219 if ((range2->type == XML_REGEXP_MARK_NONSPACING) || 2220 (range2->type == XML_REGEXP_MARK_SPACECOMBINING) || 2221 (range2->type == XML_REGEXP_MARK_ENCLOSING)) 2222 ret = 1; 2223 break; 2224 case XML_REGEXP_NUMBER: 2225 if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) || 2226 (range2->type == XML_REGEXP_NUMBER_LETTER) || 2227 (range2->type == XML_REGEXP_NUMBER_OTHERS)) 2228 ret = 1; 2229 break; 2230 case XML_REGEXP_PUNCT: 2231 if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) || 2232 (range2->type == XML_REGEXP_PUNCT_DASH) || 2233 (range2->type == XML_REGEXP_PUNCT_OPEN) || 2234 (range2->type == XML_REGEXP_PUNCT_CLOSE) || 2235 (range2->type == XML_REGEXP_PUNCT_INITQUOTE) || 2236 (range2->type == XML_REGEXP_PUNCT_FINQUOTE) || 2237 (range2->type == XML_REGEXP_PUNCT_OTHERS)) 2238 ret = 1; 2239 break; 2240 case XML_REGEXP_SEPAR: 2241 if ((range2->type == XML_REGEXP_SEPAR_SPACE) || 2242 (range2->type == XML_REGEXP_SEPAR_LINE) || 2243 (range2->type == XML_REGEXP_SEPAR_PARA)) 2244 ret = 1; 2245 break; 2246 case XML_REGEXP_SYMBOL: 2247 if ((range2->type == XML_REGEXP_SYMBOL_MATH) || 2248 (range2->type == XML_REGEXP_SYMBOL_CURRENCY) || 2249 (range2->type == XML_REGEXP_SYMBOL_MODIFIER) || 2250 (range2->type == XML_REGEXP_SYMBOL_OTHERS)) 2251 ret = 1; 2252 break; 2253 case XML_REGEXP_OTHER: 2254 if ((range2->type == XML_REGEXP_OTHER_CONTROL) || 2255 (range2->type == XML_REGEXP_OTHER_FORMAT) || 2256 (range2->type == XML_REGEXP_OTHER_PRIVATE)) 2257 ret = 1; 2258 break; 2259 default: 2260 if ((range2->type >= XML_REGEXP_LETTER) && 2261 (range2->type < XML_REGEXP_BLOCK_NAME)) 2262 ret = 0; 2263 else { 2264 /* safety net ! */ 2265 return(1); 2266 } 2267 } 2268 } 2269 if (((range1->neg == 0) && (range2->neg != 0)) || 2270 ((range1->neg != 0) && (range2->neg == 0))) 2271 ret = !ret; 2272 return(ret); 2273 } 2274 2275 /** 2276 * xmlFACompareAtomTypes: 2277 * @type1: an atom type 2278 * @type2: an atom type 2279 * 2280 * Compares two atoms type to check whether they intersect in some ways, 2281 * this is used by xmlFACompareAtoms only 2282 * 2283 * Returns 1 if they may intersect and 0 otherwise 2284 */ 2285 static int 2286 xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) { 2287 if ((type1 == XML_REGEXP_EPSILON) || 2288 (type1 == XML_REGEXP_CHARVAL) || 2289 (type1 == XML_REGEXP_RANGES) || 2290 (type1 == XML_REGEXP_SUBREG) || 2291 (type1 == XML_REGEXP_STRING) || 2292 (type1 == XML_REGEXP_ANYCHAR)) 2293 return(1); 2294 if ((type2 == XML_REGEXP_EPSILON) || 2295 (type2 == XML_REGEXP_CHARVAL) || 2296 (type2 == XML_REGEXP_RANGES) || 2297 (type2 == XML_REGEXP_SUBREG) || 2298 (type2 == XML_REGEXP_STRING) || 2299 (type2 == XML_REGEXP_ANYCHAR)) 2300 return(1); 2301 2302 if (type1 == type2) return(1); 2303 2304 /* simplify subsequent compares by making sure type1 < type2 */ 2305 if (type1 > type2) { 2306 xmlRegAtomType tmp = type1; 2307 type1 = type2; 2308 type2 = tmp; 2309 } 2310 switch (type1) { 2311 case XML_REGEXP_ANYSPACE: /* \s */ 2312 /* can't be a letter, number, mark, punctuation, symbol */ 2313 if ((type2 == XML_REGEXP_NOTSPACE) || 2314 ((type2 >= XML_REGEXP_LETTER) && 2315 (type2 <= XML_REGEXP_LETTER_OTHERS)) || 2316 ((type2 >= XML_REGEXP_NUMBER) && 2317 (type2 <= XML_REGEXP_NUMBER_OTHERS)) || 2318 ((type2 >= XML_REGEXP_MARK) && 2319 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2320 ((type2 >= XML_REGEXP_PUNCT) && 2321 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2322 ((type2 >= XML_REGEXP_SYMBOL) && 2323 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) 2324 ) return(0); 2325 break; 2326 case XML_REGEXP_NOTSPACE: /* \S */ 2327 break; 2328 case XML_REGEXP_INITNAME: /* \l */ 2329 /* can't be a number, mark, separator, punctuation, symbol or other */ 2330 if ((type2 == XML_REGEXP_NOTINITNAME) || 2331 ((type2 >= XML_REGEXP_NUMBER) && 2332 (type2 <= XML_REGEXP_NUMBER_OTHERS)) || 2333 ((type2 >= XML_REGEXP_MARK) && 2334 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2335 ((type2 >= XML_REGEXP_SEPAR) && 2336 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2337 ((type2 >= XML_REGEXP_PUNCT) && 2338 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2339 ((type2 >= XML_REGEXP_SYMBOL) && 2340 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2341 ((type2 >= XML_REGEXP_OTHER) && 2342 (type2 <= XML_REGEXP_OTHER_NA)) 2343 ) return(0); 2344 break; 2345 case XML_REGEXP_NOTINITNAME: /* \L */ 2346 break; 2347 case XML_REGEXP_NAMECHAR: /* \c */ 2348 /* can't be a mark, separator, punctuation, symbol or other */ 2349 if ((type2 == XML_REGEXP_NOTNAMECHAR) || 2350 ((type2 >= XML_REGEXP_MARK) && 2351 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2352 ((type2 >= XML_REGEXP_PUNCT) && 2353 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2354 ((type2 >= XML_REGEXP_SEPAR) && 2355 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2356 ((type2 >= XML_REGEXP_SYMBOL) && 2357 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2358 ((type2 >= XML_REGEXP_OTHER) && 2359 (type2 <= XML_REGEXP_OTHER_NA)) 2360 ) return(0); 2361 break; 2362 case XML_REGEXP_NOTNAMECHAR: /* \C */ 2363 break; 2364 case XML_REGEXP_DECIMAL: /* \d */ 2365 /* can't be a letter, mark, separator, punctuation, symbol or other */ 2366 if ((type2 == XML_REGEXP_NOTDECIMAL) || 2367 (type2 == XML_REGEXP_REALCHAR) || 2368 ((type2 >= XML_REGEXP_LETTER) && 2369 (type2 <= XML_REGEXP_LETTER_OTHERS)) || 2370 ((type2 >= XML_REGEXP_MARK) && 2371 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2372 ((type2 >= XML_REGEXP_PUNCT) && 2373 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2374 ((type2 >= XML_REGEXP_SEPAR) && 2375 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2376 ((type2 >= XML_REGEXP_SYMBOL) && 2377 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2378 ((type2 >= XML_REGEXP_OTHER) && 2379 (type2 <= XML_REGEXP_OTHER_NA)) 2380 )return(0); 2381 break; 2382 case XML_REGEXP_NOTDECIMAL: /* \D */ 2383 break; 2384 case XML_REGEXP_REALCHAR: /* \w */ 2385 /* can't be a mark, separator, punctuation, symbol or other */ 2386 if ((type2 == XML_REGEXP_NOTDECIMAL) || 2387 ((type2 >= XML_REGEXP_MARK) && 2388 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2389 ((type2 >= XML_REGEXP_PUNCT) && 2390 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2391 ((type2 >= XML_REGEXP_SEPAR) && 2392 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2393 ((type2 >= XML_REGEXP_SYMBOL) && 2394 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2395 ((type2 >= XML_REGEXP_OTHER) && 2396 (type2 <= XML_REGEXP_OTHER_NA)) 2397 )return(0); 2398 break; 2399 case XML_REGEXP_NOTREALCHAR: /* \W */ 2400 break; 2401 /* 2402 * at that point we know both type 1 and type2 are from 2403 * character categories are ordered and are different, 2404 * it becomes simple because this is a partition 2405 */ 2406 case XML_REGEXP_LETTER: 2407 if (type2 <= XML_REGEXP_LETTER_OTHERS) 2408 return(1); 2409 return(0); 2410 case XML_REGEXP_LETTER_UPPERCASE: 2411 case XML_REGEXP_LETTER_LOWERCASE: 2412 case XML_REGEXP_LETTER_TITLECASE: 2413 case XML_REGEXP_LETTER_MODIFIER: 2414 case XML_REGEXP_LETTER_OTHERS: 2415 return(0); 2416 case XML_REGEXP_MARK: 2417 if (type2 <= XML_REGEXP_MARK_ENCLOSING) 2418 return(1); 2419 return(0); 2420 case XML_REGEXP_MARK_NONSPACING: 2421 case XML_REGEXP_MARK_SPACECOMBINING: 2422 case XML_REGEXP_MARK_ENCLOSING: 2423 return(0); 2424 case XML_REGEXP_NUMBER: 2425 if (type2 <= XML_REGEXP_NUMBER_OTHERS) 2426 return(1); 2427 return(0); 2428 case XML_REGEXP_NUMBER_DECIMAL: 2429 case XML_REGEXP_NUMBER_LETTER: 2430 case XML_REGEXP_NUMBER_OTHERS: 2431 return(0); 2432 case XML_REGEXP_PUNCT: 2433 if (type2 <= XML_REGEXP_PUNCT_OTHERS) 2434 return(1); 2435 return(0); 2436 case XML_REGEXP_PUNCT_CONNECTOR: 2437 case XML_REGEXP_PUNCT_DASH: 2438 case XML_REGEXP_PUNCT_OPEN: 2439 case XML_REGEXP_PUNCT_CLOSE: 2440 case XML_REGEXP_PUNCT_INITQUOTE: 2441 case XML_REGEXP_PUNCT_FINQUOTE: 2442 case XML_REGEXP_PUNCT_OTHERS: 2443 return(0); 2444 case XML_REGEXP_SEPAR: 2445 if (type2 <= XML_REGEXP_SEPAR_PARA) 2446 return(1); 2447 return(0); 2448 case XML_REGEXP_SEPAR_SPACE: 2449 case XML_REGEXP_SEPAR_LINE: 2450 case XML_REGEXP_SEPAR_PARA: 2451 return(0); 2452 case XML_REGEXP_SYMBOL: 2453 if (type2 <= XML_REGEXP_SYMBOL_OTHERS) 2454 return(1); 2455 return(0); 2456 case XML_REGEXP_SYMBOL_MATH: 2457 case XML_REGEXP_SYMBOL_CURRENCY: 2458 case XML_REGEXP_SYMBOL_MODIFIER: 2459 case XML_REGEXP_SYMBOL_OTHERS: 2460 return(0); 2461 case XML_REGEXP_OTHER: 2462 if (type2 <= XML_REGEXP_OTHER_NA) 2463 return(1); 2464 return(0); 2465 case XML_REGEXP_OTHER_CONTROL: 2466 case XML_REGEXP_OTHER_FORMAT: 2467 case XML_REGEXP_OTHER_PRIVATE: 2468 case XML_REGEXP_OTHER_NA: 2469 return(0); 2470 default: 2471 break; 2472 } 2473 return(1); 2474 } 2475 2476 /** 2477 * xmlFAEqualAtoms: 2478 * @atom1: an atom 2479 * @atom2: an atom 2480 * @deep: if not set only compare string pointers 2481 * 2482 * Compares two atoms to check whether they are the same exactly 2483 * this is used to remove equivalent transitions 2484 * 2485 * Returns 1 if same and 0 otherwise 2486 */ 2487 static int 2488 xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) { 2489 int ret = 0; 2490 2491 if (atom1 == atom2) 2492 return(1); 2493 if ((atom1 == NULL) || (atom2 == NULL)) 2494 return(0); 2495 2496 if (atom1->type != atom2->type) 2497 return(0); 2498 switch (atom1->type) { 2499 case XML_REGEXP_EPSILON: 2500 ret = 0; 2501 break; 2502 case XML_REGEXP_STRING: 2503 if (!deep) 2504 ret = (atom1->valuep == atom2->valuep); 2505 else 2506 ret = xmlStrEqual((xmlChar *)atom1->valuep, 2507 (xmlChar *)atom2->valuep); 2508 break; 2509 case XML_REGEXP_CHARVAL: 2510 ret = (atom1->codepoint == atom2->codepoint); 2511 break; 2512 case XML_REGEXP_RANGES: 2513 /* too hard to do in the general case */ 2514 ret = 0; 2515 default: 2516 break; 2517 } 2518 return(ret); 2519 } 2520 2521 /** 2522 * xmlFACompareAtoms: 2523 * @atom1: an atom 2524 * @atom2: an atom 2525 * @deep: if not set only compare string pointers 2526 * 2527 * Compares two atoms to check whether they intersect in some ways, 2528 * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only 2529 * 2530 * Returns 1 if yes and 0 otherwise 2531 */ 2532 static int 2533 xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) { 2534 int ret = 1; 2535 2536 if (atom1 == atom2) 2537 return(1); 2538 if ((atom1 == NULL) || (atom2 == NULL)) 2539 return(0); 2540 2541 if ((atom1->type == XML_REGEXP_ANYCHAR) || 2542 (atom2->type == XML_REGEXP_ANYCHAR)) 2543 return(1); 2544 2545 if (atom1->type > atom2->type) { 2546 xmlRegAtomPtr tmp; 2547 tmp = atom1; 2548 atom1 = atom2; 2549 atom2 = tmp; 2550 } 2551 if (atom1->type != atom2->type) { 2552 ret = xmlFACompareAtomTypes(atom1->type, atom2->type); 2553 /* if they can't intersect at the type level break now */ 2554 if (ret == 0) 2555 return(0); 2556 } 2557 switch (atom1->type) { 2558 case XML_REGEXP_STRING: 2559 if (!deep) 2560 ret = (atom1->valuep != atom2->valuep); 2561 else { 2562 xmlChar *val1 = (xmlChar *)atom1->valuep; 2563 xmlChar *val2 = (xmlChar *)atom2->valuep; 2564 int compound1 = (xmlStrchr(val1, '|') != NULL); 2565 int compound2 = (xmlStrchr(val2, '|') != NULL); 2566 2567 /* Ignore negative match flag for ##other namespaces */ 2568 if (compound1 != compound2) 2569 return(0); 2570 2571 ret = xmlRegStrEqualWildcard(val1, val2); 2572 } 2573 break; 2574 case XML_REGEXP_EPSILON: 2575 goto not_determinist; 2576 case XML_REGEXP_CHARVAL: 2577 if (atom2->type == XML_REGEXP_CHARVAL) { 2578 ret = (atom1->codepoint == atom2->codepoint); 2579 } else { 2580 ret = xmlRegCheckCharacter(atom2, atom1->codepoint); 2581 if (ret < 0) 2582 ret = 1; 2583 } 2584 break; 2585 case XML_REGEXP_RANGES: 2586 if (atom2->type == XML_REGEXP_RANGES) { 2587 int i, j, res; 2588 xmlRegRangePtr r1, r2; 2589 2590 /* 2591 * need to check that none of the ranges eventually matches 2592 */ 2593 for (i = 0;i < atom1->nbRanges;i++) { 2594 for (j = 0;j < atom2->nbRanges;j++) { 2595 r1 = atom1->ranges[i]; 2596 r2 = atom2->ranges[j]; 2597 res = xmlFACompareRanges(r1, r2); 2598 if (res == 1) { 2599 ret = 1; 2600 goto done; 2601 } 2602 } 2603 } 2604 ret = 0; 2605 } 2606 break; 2607 default: 2608 goto not_determinist; 2609 } 2610 done: 2611 if (atom1->neg != atom2->neg) { 2612 ret = !ret; 2613 } 2614 if (ret == 0) 2615 return(0); 2616 not_determinist: 2617 return(1); 2618 } 2619 2620 /** 2621 * xmlFARecurseDeterminism: 2622 * @ctxt: a regexp parser context 2623 * 2624 * Check whether the associated regexp is determinist, 2625 * should be called after xmlFAEliminateEpsilonTransitions() 2626 * 2627 */ 2628 static int 2629 xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 2630 int to, xmlRegAtomPtr atom) { 2631 int ret = 1; 2632 int res; 2633 int transnr, nbTrans; 2634 xmlRegTransPtr t1; 2635 int deep = 1; 2636 2637 if (state == NULL) 2638 return(ret); 2639 if (state->markd == XML_REGEXP_MARK_VISITED) 2640 return(ret); 2641 2642 if (ctxt->flags & AM_AUTOMATA_RNG) 2643 deep = 0; 2644 2645 /* 2646 * don't recurse on transitions potentially added in the course of 2647 * the elimination. 2648 */ 2649 nbTrans = state->nbTrans; 2650 for (transnr = 0;transnr < nbTrans;transnr++) { 2651 t1 = &(state->trans[transnr]); 2652 /* 2653 * check transitions conflicting with the one looked at 2654 */ 2655 if (t1->atom == NULL) { 2656 if (t1->to < 0) 2657 continue; 2658 state->markd = XML_REGEXP_MARK_VISITED; 2659 res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], 2660 to, atom); 2661 if (res == 0) { 2662 ret = 0; 2663 /* t1->nd = 1; */ 2664 } 2665 continue; 2666 } 2667 if (t1->to != to) 2668 continue; 2669 if (xmlFACompareAtoms(t1->atom, atom, deep)) { 2670 ret = 0; 2671 /* mark the transition as non-deterministic */ 2672 t1->nd = 1; 2673 } 2674 } 2675 return(ret); 2676 } 2677 2678 /** 2679 * xmlFAFinishRecurseDeterminism: 2680 * @ctxt: a regexp parser context 2681 * 2682 * Reset flags after checking determinism. 2683 */ 2684 static void 2685 xmlFAFinishRecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) { 2686 int transnr, nbTrans; 2687 2688 if (state == NULL) 2689 return; 2690 if (state->markd != XML_REGEXP_MARK_VISITED) 2691 return; 2692 state->markd = 0; 2693 2694 nbTrans = state->nbTrans; 2695 for (transnr = 0; transnr < nbTrans; transnr++) { 2696 xmlRegTransPtr t1 = &state->trans[transnr]; 2697 if ((t1->atom == NULL) && (t1->to >= 0)) 2698 xmlFAFinishRecurseDeterminism(ctxt, ctxt->states[t1->to]); 2699 } 2700 } 2701 2702 /** 2703 * xmlFAComputesDeterminism: 2704 * @ctxt: a regexp parser context 2705 * 2706 * Check whether the associated regexp is determinist, 2707 * should be called after xmlFAEliminateEpsilonTransitions() 2708 * 2709 */ 2710 static int 2711 xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) { 2712 int statenr, transnr; 2713 xmlRegStatePtr state; 2714 xmlRegTransPtr t1, t2, last; 2715 int i; 2716 int ret = 1; 2717 int deep = 1; 2718 2719 #ifdef DEBUG_REGEXP_GRAPH 2720 printf("xmlFAComputesDeterminism\n"); 2721 xmlRegPrintCtxt(stdout, ctxt); 2722 #endif 2723 if (ctxt->determinist != -1) 2724 return(ctxt->determinist); 2725 2726 if (ctxt->flags & AM_AUTOMATA_RNG) 2727 deep = 0; 2728 2729 /* 2730 * First cleanup the automata removing cancelled transitions 2731 */ 2732 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2733 state = ctxt->states[statenr]; 2734 if (state == NULL) 2735 continue; 2736 if (state->nbTrans < 2) 2737 continue; 2738 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2739 t1 = &(state->trans[transnr]); 2740 /* 2741 * Determinism checks in case of counted or all transitions 2742 * will have to be handled separately 2743 */ 2744 if (t1->atom == NULL) { 2745 /* t1->nd = 1; */ 2746 continue; 2747 } 2748 if (t1->to == -1) /* eliminated */ 2749 continue; 2750 for (i = 0;i < transnr;i++) { 2751 t2 = &(state->trans[i]); 2752 if (t2->to == -1) /* eliminated */ 2753 continue; 2754 if (t2->atom != NULL) { 2755 if (t1->to == t2->to) { 2756 /* 2757 * Here we use deep because we want to keep the 2758 * transitions which indicate a conflict 2759 */ 2760 if (xmlFAEqualAtoms(t1->atom, t2->atom, deep) && 2761 (t1->counter == t2->counter) && 2762 (t1->count == t2->count)) 2763 t2->to = -1; /* eliminated */ 2764 } 2765 } 2766 } 2767 } 2768 } 2769 2770 /* 2771 * Check for all states that there aren't 2 transitions 2772 * with the same atom and a different target. 2773 */ 2774 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2775 state = ctxt->states[statenr]; 2776 if (state == NULL) 2777 continue; 2778 if (state->nbTrans < 2) 2779 continue; 2780 last = NULL; 2781 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2782 t1 = &(state->trans[transnr]); 2783 /* 2784 * Determinism checks in case of counted or all transitions 2785 * will have to be handled separately 2786 */ 2787 if (t1->atom == NULL) { 2788 continue; 2789 } 2790 if (t1->to == -1) /* eliminated */ 2791 continue; 2792 for (i = 0;i < transnr;i++) { 2793 t2 = &(state->trans[i]); 2794 if (t2->to == -1) /* eliminated */ 2795 continue; 2796 if (t2->atom != NULL) { 2797 /* 2798 * But here we don't use deep because we want to 2799 * find transitions which indicate a conflict 2800 */ 2801 if (xmlFACompareAtoms(t1->atom, t2->atom, 1)) { 2802 ret = 0; 2803 /* mark the transitions as non-deterministic ones */ 2804 t1->nd = 1; 2805 t2->nd = 1; 2806 last = t1; 2807 } 2808 } else if (t1->to != -1) { 2809 /* 2810 * do the closure in case of remaining specific 2811 * epsilon transitions like choices or all 2812 */ 2813 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], 2814 t2->to, t2->atom); 2815 xmlFAFinishRecurseDeterminism(ctxt, ctxt->states[t1->to]); 2816 /* don't shortcut the computation so all non deterministic 2817 transition get marked down 2818 if (ret == 0) 2819 return(0); 2820 */ 2821 if (ret == 0) { 2822 t1->nd = 1; 2823 /* t2->nd = 1; */ 2824 last = t1; 2825 } 2826 } 2827 } 2828 /* don't shortcut the computation so all non deterministic 2829 transition get marked down 2830 if (ret == 0) 2831 break; */ 2832 } 2833 2834 /* 2835 * mark specifically the last non-deterministic transition 2836 * from a state since there is no need to set-up rollback 2837 * from it 2838 */ 2839 if (last != NULL) { 2840 last->nd = 2; 2841 } 2842 2843 /* don't shortcut the computation so all non deterministic 2844 transition get marked down 2845 if (ret == 0) 2846 break; */ 2847 } 2848 2849 ctxt->determinist = ret; 2850 return(ret); 2851 } 2852 2853 /************************************************************************ 2854 * * 2855 * Routines to check input against transition atoms * 2856 * * 2857 ************************************************************************/ 2858 2859 static int 2860 xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg, 2861 int start, int end, const xmlChar *blockName) { 2862 int ret = 0; 2863 2864 switch (type) { 2865 case XML_REGEXP_STRING: 2866 case XML_REGEXP_SUBREG: 2867 case XML_REGEXP_RANGES: 2868 case XML_REGEXP_EPSILON: 2869 return(-1); 2870 case XML_REGEXP_ANYCHAR: 2871 ret = ((codepoint != '\n') && (codepoint != '\r')); 2872 break; 2873 case XML_REGEXP_CHARVAL: 2874 ret = ((codepoint >= start) && (codepoint <= end)); 2875 break; 2876 case XML_REGEXP_NOTSPACE: 2877 neg = !neg; 2878 /* Falls through. */ 2879 case XML_REGEXP_ANYSPACE: 2880 ret = ((codepoint == '\n') || (codepoint == '\r') || 2881 (codepoint == '\t') || (codepoint == ' ')); 2882 break; 2883 case XML_REGEXP_NOTINITNAME: 2884 neg = !neg; 2885 /* Falls through. */ 2886 case XML_REGEXP_INITNAME: 2887 ret = (IS_LETTER(codepoint) || 2888 (codepoint == '_') || (codepoint == ':')); 2889 break; 2890 case XML_REGEXP_NOTNAMECHAR: 2891 neg = !neg; 2892 /* Falls through. */ 2893 case XML_REGEXP_NAMECHAR: 2894 ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) || 2895 (codepoint == '.') || (codepoint == '-') || 2896 (codepoint == '_') || (codepoint == ':') || 2897 IS_COMBINING(codepoint) || IS_EXTENDER(codepoint)); 2898 break; 2899 case XML_REGEXP_NOTDECIMAL: 2900 neg = !neg; 2901 /* Falls through. */ 2902 case XML_REGEXP_DECIMAL: 2903 ret = xmlUCSIsCatNd(codepoint); 2904 break; 2905 case XML_REGEXP_REALCHAR: 2906 neg = !neg; 2907 /* Falls through. */ 2908 case XML_REGEXP_NOTREALCHAR: 2909 ret = xmlUCSIsCatP(codepoint); 2910 if (ret == 0) 2911 ret = xmlUCSIsCatZ(codepoint); 2912 if (ret == 0) 2913 ret = xmlUCSIsCatC(codepoint); 2914 break; 2915 case XML_REGEXP_LETTER: 2916 ret = xmlUCSIsCatL(codepoint); 2917 break; 2918 case XML_REGEXP_LETTER_UPPERCASE: 2919 ret = xmlUCSIsCatLu(codepoint); 2920 break; 2921 case XML_REGEXP_LETTER_LOWERCASE: 2922 ret = xmlUCSIsCatLl(codepoint); 2923 break; 2924 case XML_REGEXP_LETTER_TITLECASE: 2925 ret = xmlUCSIsCatLt(codepoint); 2926 break; 2927 case XML_REGEXP_LETTER_MODIFIER: 2928 ret = xmlUCSIsCatLm(codepoint); 2929 break; 2930 case XML_REGEXP_LETTER_OTHERS: 2931 ret = xmlUCSIsCatLo(codepoint); 2932 break; 2933 case XML_REGEXP_MARK: 2934 ret = xmlUCSIsCatM(codepoint); 2935 break; 2936 case XML_REGEXP_MARK_NONSPACING: 2937 ret = xmlUCSIsCatMn(codepoint); 2938 break; 2939 case XML_REGEXP_MARK_SPACECOMBINING: 2940 ret = xmlUCSIsCatMc(codepoint); 2941 break; 2942 case XML_REGEXP_MARK_ENCLOSING: 2943 ret = xmlUCSIsCatMe(codepoint); 2944 break; 2945 case XML_REGEXP_NUMBER: 2946 ret = xmlUCSIsCatN(codepoint); 2947 break; 2948 case XML_REGEXP_NUMBER_DECIMAL: 2949 ret = xmlUCSIsCatNd(codepoint); 2950 break; 2951 case XML_REGEXP_NUMBER_LETTER: 2952 ret = xmlUCSIsCatNl(codepoint); 2953 break; 2954 case XML_REGEXP_NUMBER_OTHERS: 2955 ret = xmlUCSIsCatNo(codepoint); 2956 break; 2957 case XML_REGEXP_PUNCT: 2958 ret = xmlUCSIsCatP(codepoint); 2959 break; 2960 case XML_REGEXP_PUNCT_CONNECTOR: 2961 ret = xmlUCSIsCatPc(codepoint); 2962 break; 2963 case XML_REGEXP_PUNCT_DASH: 2964 ret = xmlUCSIsCatPd(codepoint); 2965 break; 2966 case XML_REGEXP_PUNCT_OPEN: 2967 ret = xmlUCSIsCatPs(codepoint); 2968 break; 2969 case XML_REGEXP_PUNCT_CLOSE: 2970 ret = xmlUCSIsCatPe(codepoint); 2971 break; 2972 case XML_REGEXP_PUNCT_INITQUOTE: 2973 ret = xmlUCSIsCatPi(codepoint); 2974 break; 2975 case XML_REGEXP_PUNCT_FINQUOTE: 2976 ret = xmlUCSIsCatPf(codepoint); 2977 break; 2978 case XML_REGEXP_PUNCT_OTHERS: 2979 ret = xmlUCSIsCatPo(codepoint); 2980 break; 2981 case XML_REGEXP_SEPAR: 2982 ret = xmlUCSIsCatZ(codepoint); 2983 break; 2984 case XML_REGEXP_SEPAR_SPACE: 2985 ret = xmlUCSIsCatZs(codepoint); 2986 break; 2987 case XML_REGEXP_SEPAR_LINE: 2988 ret = xmlUCSIsCatZl(codepoint); 2989 break; 2990 case XML_REGEXP_SEPAR_PARA: 2991 ret = xmlUCSIsCatZp(codepoint); 2992 break; 2993 case XML_REGEXP_SYMBOL: 2994 ret = xmlUCSIsCatS(codepoint); 2995 break; 2996 case XML_REGEXP_SYMBOL_MATH: 2997 ret = xmlUCSIsCatSm(codepoint); 2998 break; 2999 case XML_REGEXP_SYMBOL_CURRENCY: 3000 ret = xmlUCSIsCatSc(codepoint); 3001 break; 3002 case XML_REGEXP_SYMBOL_MODIFIER: 3003 ret = xmlUCSIsCatSk(codepoint); 3004 break; 3005 case XML_REGEXP_SYMBOL_OTHERS: 3006 ret = xmlUCSIsCatSo(codepoint); 3007 break; 3008 case XML_REGEXP_OTHER: 3009 ret = xmlUCSIsCatC(codepoint); 3010 break; 3011 case XML_REGEXP_OTHER_CONTROL: 3012 ret = xmlUCSIsCatCc(codepoint); 3013 break; 3014 case XML_REGEXP_OTHER_FORMAT: 3015 ret = xmlUCSIsCatCf(codepoint); 3016 break; 3017 case XML_REGEXP_OTHER_PRIVATE: 3018 ret = xmlUCSIsCatCo(codepoint); 3019 break; 3020 case XML_REGEXP_OTHER_NA: 3021 /* ret = xmlUCSIsCatCn(codepoint); */ 3022 /* Seems it doesn't exist anymore in recent Unicode releases */ 3023 ret = 0; 3024 break; 3025 case XML_REGEXP_BLOCK_NAME: 3026 ret = xmlUCSIsBlock(codepoint, (const char *) blockName); 3027 break; 3028 } 3029 if (neg) 3030 return(!ret); 3031 return(ret); 3032 } 3033 3034 static int 3035 xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) { 3036 int i, ret = 0; 3037 xmlRegRangePtr range; 3038 3039 if ((atom == NULL) || (!IS_CHAR(codepoint))) 3040 return(-1); 3041 3042 switch (atom->type) { 3043 case XML_REGEXP_SUBREG: 3044 case XML_REGEXP_EPSILON: 3045 return(-1); 3046 case XML_REGEXP_CHARVAL: 3047 return(codepoint == atom->codepoint); 3048 case XML_REGEXP_RANGES: { 3049 int accept = 0; 3050 3051 for (i = 0;i < atom->nbRanges;i++) { 3052 range = atom->ranges[i]; 3053 if (range->neg == 2) { 3054 ret = xmlRegCheckCharacterRange(range->type, codepoint, 3055 0, range->start, range->end, 3056 range->blockName); 3057 if (ret != 0) 3058 return(0); /* excluded char */ 3059 } else if (range->neg) { 3060 ret = xmlRegCheckCharacterRange(range->type, codepoint, 3061 0, range->start, range->end, 3062 range->blockName); 3063 if (ret == 0) 3064 accept = 1; 3065 else 3066 return(0); 3067 } else { 3068 ret = xmlRegCheckCharacterRange(range->type, codepoint, 3069 0, range->start, range->end, 3070 range->blockName); 3071 if (ret != 0) 3072 accept = 1; /* might still be excluded */ 3073 } 3074 } 3075 return(accept); 3076 } 3077 case XML_REGEXP_STRING: 3078 printf("TODO: XML_REGEXP_STRING\n"); 3079 return(-1); 3080 case XML_REGEXP_ANYCHAR: 3081 case XML_REGEXP_ANYSPACE: 3082 case XML_REGEXP_NOTSPACE: 3083 case XML_REGEXP_INITNAME: 3084 case XML_REGEXP_NOTINITNAME: 3085 case XML_REGEXP_NAMECHAR: 3086 case XML_REGEXP_NOTNAMECHAR: 3087 case XML_REGEXP_DECIMAL: 3088 case XML_REGEXP_NOTDECIMAL: 3089 case XML_REGEXP_REALCHAR: 3090 case XML_REGEXP_NOTREALCHAR: 3091 case XML_REGEXP_LETTER: 3092 case XML_REGEXP_LETTER_UPPERCASE: 3093 case XML_REGEXP_LETTER_LOWERCASE: 3094 case XML_REGEXP_LETTER_TITLECASE: 3095 case XML_REGEXP_LETTER_MODIFIER: 3096 case XML_REGEXP_LETTER_OTHERS: 3097 case XML_REGEXP_MARK: 3098 case XML_REGEXP_MARK_NONSPACING: 3099 case XML_REGEXP_MARK_SPACECOMBINING: 3100 case XML_REGEXP_MARK_ENCLOSING: 3101 case XML_REGEXP_NUMBER: 3102 case XML_REGEXP_NUMBER_DECIMAL: 3103 case XML_REGEXP_NUMBER_LETTER: 3104 case XML_REGEXP_NUMBER_OTHERS: 3105 case XML_REGEXP_PUNCT: 3106 case XML_REGEXP_PUNCT_CONNECTOR: 3107 case XML_REGEXP_PUNCT_DASH: 3108 case XML_REGEXP_PUNCT_OPEN: 3109 case XML_REGEXP_PUNCT_CLOSE: 3110 case XML_REGEXP_PUNCT_INITQUOTE: 3111 case XML_REGEXP_PUNCT_FINQUOTE: 3112 case XML_REGEXP_PUNCT_OTHERS: 3113 case XML_REGEXP_SEPAR: 3114 case XML_REGEXP_SEPAR_SPACE: 3115 case XML_REGEXP_SEPAR_LINE: 3116 case XML_REGEXP_SEPAR_PARA: 3117 case XML_REGEXP_SYMBOL: 3118 case XML_REGEXP_SYMBOL_MATH: 3119 case XML_REGEXP_SYMBOL_CURRENCY: 3120 case XML_REGEXP_SYMBOL_MODIFIER: 3121 case XML_REGEXP_SYMBOL_OTHERS: 3122 case XML_REGEXP_OTHER: 3123 case XML_REGEXP_OTHER_CONTROL: 3124 case XML_REGEXP_OTHER_FORMAT: 3125 case XML_REGEXP_OTHER_PRIVATE: 3126 case XML_REGEXP_OTHER_NA: 3127 case XML_REGEXP_BLOCK_NAME: 3128 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0, 3129 (const xmlChar *)atom->valuep); 3130 if (atom->neg) 3131 ret = !ret; 3132 break; 3133 } 3134 return(ret); 3135 } 3136 3137 /************************************************************************ 3138 * * 3139 * Saving and restoring state of an execution context * 3140 * * 3141 ************************************************************************/ 3142 3143 #ifdef DEBUG_REGEXP_EXEC 3144 static void 3145 xmlFARegDebugExec(xmlRegExecCtxtPtr exec) { 3146 printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index); 3147 if (exec->inputStack != NULL) { 3148 int i; 3149 printf(": "); 3150 for (i = 0;(i < 3) && (i < exec->inputStackNr);i++) 3151 printf("%s ", (const char *) 3152 exec->inputStack[exec->inputStackNr - (i + 1)].value); 3153 } else { 3154 printf(": %s", &(exec->inputString[exec->index])); 3155 } 3156 printf("\n"); 3157 } 3158 #endif 3159 3160 static void 3161 xmlFARegExecSave(xmlRegExecCtxtPtr exec) { 3162 #ifdef DEBUG_REGEXP_EXEC 3163 printf("saving "); 3164 exec->transno++; 3165 xmlFARegDebugExec(exec); 3166 exec->transno--; 3167 #endif 3168 #ifdef MAX_PUSH 3169 if (exec->nbPush > MAX_PUSH) { 3170 return; 3171 } 3172 exec->nbPush++; 3173 #endif 3174 3175 if (exec->maxRollbacks == 0) { 3176 exec->maxRollbacks = 4; 3177 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks * 3178 sizeof(xmlRegExecRollback)); 3179 if (exec->rollbacks == NULL) { 3180 xmlRegexpErrMemory(NULL, "saving regexp"); 3181 exec->maxRollbacks = 0; 3182 return; 3183 } 3184 memset(exec->rollbacks, 0, 3185 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 3186 } else if (exec->nbRollbacks >= exec->maxRollbacks) { 3187 xmlRegExecRollback *tmp; 3188 int len = exec->maxRollbacks; 3189 3190 exec->maxRollbacks *= 2; 3191 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks, 3192 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 3193 if (tmp == NULL) { 3194 xmlRegexpErrMemory(NULL, "saving regexp"); 3195 exec->maxRollbacks /= 2; 3196 return; 3197 } 3198 exec->rollbacks = tmp; 3199 tmp = &exec->rollbacks[len]; 3200 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback)); 3201 } 3202 exec->rollbacks[exec->nbRollbacks].state = exec->state; 3203 exec->rollbacks[exec->nbRollbacks].index = exec->index; 3204 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1; 3205 if (exec->comp->nbCounters > 0) { 3206 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 3207 exec->rollbacks[exec->nbRollbacks].counts = (int *) 3208 xmlMalloc(exec->comp->nbCounters * sizeof(int)); 3209 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 3210 xmlRegexpErrMemory(NULL, "saving regexp"); 3211 exec->status = -5; 3212 return; 3213 } 3214 } 3215 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts, 3216 exec->comp->nbCounters * sizeof(int)); 3217 } 3218 exec->nbRollbacks++; 3219 } 3220 3221 static void 3222 xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) { 3223 if (exec->nbRollbacks <= 0) { 3224 exec->status = -1; 3225 #ifdef DEBUG_REGEXP_EXEC 3226 printf("rollback failed on empty stack\n"); 3227 #endif 3228 return; 3229 } 3230 exec->nbRollbacks--; 3231 exec->state = exec->rollbacks[exec->nbRollbacks].state; 3232 exec->index = exec->rollbacks[exec->nbRollbacks].index; 3233 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch; 3234 if (exec->comp->nbCounters > 0) { 3235 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 3236 fprintf(stderr, "exec save: allocation failed"); 3237 exec->status = -6; 3238 return; 3239 } 3240 if (exec->counts) { 3241 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts, 3242 exec->comp->nbCounters * sizeof(int)); 3243 } 3244 } 3245 3246 #ifdef DEBUG_REGEXP_EXEC 3247 printf("restored "); 3248 xmlFARegDebugExec(exec); 3249 #endif 3250 } 3251 3252 /************************************************************************ 3253 * * 3254 * Verifier, running an input against a compiled regexp * 3255 * * 3256 ************************************************************************/ 3257 3258 static int 3259 xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) { 3260 xmlRegExecCtxt execval; 3261 xmlRegExecCtxtPtr exec = &execval; 3262 int ret, codepoint = 0, len, deter; 3263 3264 exec->inputString = content; 3265 exec->index = 0; 3266 exec->nbPush = 0; 3267 exec->determinist = 1; 3268 exec->maxRollbacks = 0; 3269 exec->nbRollbacks = 0; 3270 exec->rollbacks = NULL; 3271 exec->status = 0; 3272 exec->comp = comp; 3273 exec->state = comp->states[0]; 3274 exec->transno = 0; 3275 exec->transcount = 0; 3276 exec->inputStack = NULL; 3277 exec->inputStackMax = 0; 3278 if (comp->nbCounters > 0) { 3279 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)); 3280 if (exec->counts == NULL) { 3281 xmlRegexpErrMemory(NULL, "running regexp"); 3282 return(-1); 3283 } 3284 memset(exec->counts, 0, comp->nbCounters * sizeof(int)); 3285 } else 3286 exec->counts = NULL; 3287 while ((exec->status == 0) && (exec->state != NULL) && 3288 ((exec->inputString[exec->index] != 0) || 3289 ((exec->state != NULL) && 3290 (exec->state->type != XML_REGEXP_FINAL_STATE)))) { 3291 xmlRegTransPtr trans; 3292 xmlRegAtomPtr atom; 3293 3294 /* 3295 * If end of input on non-terminal state, rollback, however we may 3296 * still have epsilon like transition for counted transitions 3297 * on counters, in that case don't break too early. Additionally, 3298 * if we are working on a range like "AB{0,2}", where B is not present, 3299 * we don't want to break. 3300 */ 3301 len = 1; 3302 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) { 3303 /* 3304 * if there is a transition, we must check if 3305 * atom allows minOccurs of 0 3306 */ 3307 if (exec->transno < exec->state->nbTrans) { 3308 trans = &exec->state->trans[exec->transno]; 3309 if (trans->to >=0) { 3310 atom = trans->atom; 3311 if (!((atom->min == 0) && (atom->max > 0))) 3312 goto rollback; 3313 } 3314 } else 3315 goto rollback; 3316 } 3317 3318 exec->transcount = 0; 3319 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 3320 trans = &exec->state->trans[exec->transno]; 3321 if (trans->to < 0) 3322 continue; 3323 atom = trans->atom; 3324 ret = 0; 3325 deter = 1; 3326 if (trans->count >= 0) { 3327 int count; 3328 xmlRegCounterPtr counter; 3329 3330 if (exec->counts == NULL) { 3331 exec->status = -1; 3332 goto error; 3333 } 3334 /* 3335 * A counted transition. 3336 */ 3337 3338 count = exec->counts[trans->count]; 3339 counter = &exec->comp->counters[trans->count]; 3340 #ifdef DEBUG_REGEXP_EXEC 3341 printf("testing count %d: val %d, min %d, max %d\n", 3342 trans->count, count, counter->min, counter->max); 3343 #endif 3344 ret = ((count >= counter->min) && (count <= counter->max)); 3345 if ((ret) && (counter->min != counter->max)) 3346 deter = 0; 3347 } else if (atom == NULL) { 3348 fprintf(stderr, "epsilon transition left at runtime\n"); 3349 exec->status = -2; 3350 break; 3351 } else if (exec->inputString[exec->index] != 0) { 3352 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 3353 ret = xmlRegCheckCharacter(atom, codepoint); 3354 if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) { 3355 xmlRegStatePtr to = comp->states[trans->to]; 3356 3357 /* 3358 * this is a multiple input sequence 3359 * If there is a counter associated increment it now. 3360 * before potentially saving and rollback 3361 * do not increment if the counter is already over the 3362 * maximum limit in which case get to next transition 3363 */ 3364 if (trans->counter >= 0) { 3365 xmlRegCounterPtr counter; 3366 3367 if ((exec->counts == NULL) || 3368 (exec->comp == NULL) || 3369 (exec->comp->counters == NULL)) { 3370 exec->status = -1; 3371 goto error; 3372 } 3373 counter = &exec->comp->counters[trans->counter]; 3374 if (exec->counts[trans->counter] >= counter->max) 3375 continue; /* for loop on transitions */ 3376 3377 #ifdef DEBUG_REGEXP_EXEC 3378 printf("Increasing count %d\n", trans->counter); 3379 #endif 3380 exec->counts[trans->counter]++; 3381 } 3382 if (exec->state->nbTrans > exec->transno + 1) { 3383 xmlFARegExecSave(exec); 3384 } 3385 exec->transcount = 1; 3386 do { 3387 /* 3388 * Try to progress as much as possible on the input 3389 */ 3390 if (exec->transcount == atom->max) { 3391 break; 3392 } 3393 exec->index += len; 3394 /* 3395 * End of input: stop here 3396 */ 3397 if (exec->inputString[exec->index] == 0) { 3398 exec->index -= len; 3399 break; 3400 } 3401 if (exec->transcount >= atom->min) { 3402 int transno = exec->transno; 3403 xmlRegStatePtr state = exec->state; 3404 3405 /* 3406 * The transition is acceptable save it 3407 */ 3408 exec->transno = -1; /* trick */ 3409 exec->state = to; 3410 xmlFARegExecSave(exec); 3411 exec->transno = transno; 3412 exec->state = state; 3413 } 3414 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 3415 len); 3416 ret = xmlRegCheckCharacter(atom, codepoint); 3417 exec->transcount++; 3418 } while (ret == 1); 3419 if (exec->transcount < atom->min) 3420 ret = 0; 3421 3422 /* 3423 * If the last check failed but one transition was found 3424 * possible, rollback 3425 */ 3426 if (ret < 0) 3427 ret = 0; 3428 if (ret == 0) { 3429 goto rollback; 3430 } 3431 if (trans->counter >= 0) { 3432 if (exec->counts == NULL) { 3433 exec->status = -1; 3434 goto error; 3435 } 3436 #ifdef DEBUG_REGEXP_EXEC 3437 printf("Decreasing count %d\n", trans->counter); 3438 #endif 3439 exec->counts[trans->counter]--; 3440 } 3441 } else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) { 3442 /* 3443 * we don't match on the codepoint, but minOccurs of 0 3444 * says that's ok. Setting len to 0 inhibits stepping 3445 * over the codepoint. 3446 */ 3447 exec->transcount = 1; 3448 len = 0; 3449 ret = 1; 3450 } 3451 } else if ((atom->min == 0) && (atom->max > 0)) { 3452 /* another spot to match when minOccurs is 0 */ 3453 exec->transcount = 1; 3454 len = 0; 3455 ret = 1; 3456 } 3457 if (ret == 1) { 3458 if ((trans->nd == 1) || 3459 ((trans->count >= 0) && (deter == 0) && 3460 (exec->state->nbTrans > exec->transno + 1))) { 3461 #ifdef DEBUG_REGEXP_EXEC 3462 if (trans->nd == 1) 3463 printf("Saving on nd transition atom %d for %c at %d\n", 3464 trans->atom->no, codepoint, exec->index); 3465 else 3466 printf("Saving on counted transition count %d for %c at %d\n", 3467 trans->count, codepoint, exec->index); 3468 #endif 3469 xmlFARegExecSave(exec); 3470 } 3471 if (trans->counter >= 0) { 3472 xmlRegCounterPtr counter; 3473 3474 /* make sure we don't go over the counter maximum value */ 3475 if ((exec->counts == NULL) || 3476 (exec->comp == NULL) || 3477 (exec->comp->counters == NULL)) { 3478 exec->status = -1; 3479 goto error; 3480 } 3481 counter = &exec->comp->counters[trans->counter]; 3482 if (exec->counts[trans->counter] >= counter->max) 3483 continue; /* for loop on transitions */ 3484 #ifdef DEBUG_REGEXP_EXEC 3485 printf("Increasing count %d\n", trans->counter); 3486 #endif 3487 exec->counts[trans->counter]++; 3488 } 3489 if ((trans->count >= 0) && 3490 (trans->count < REGEXP_ALL_COUNTER)) { 3491 if (exec->counts == NULL) { 3492 exec->status = -1; 3493 goto error; 3494 } 3495 #ifdef DEBUG_REGEXP_EXEC 3496 printf("resetting count %d on transition\n", 3497 trans->count); 3498 #endif 3499 exec->counts[trans->count] = 0; 3500 } 3501 #ifdef DEBUG_REGEXP_EXEC 3502 printf("entering state %d\n", trans->to); 3503 #endif 3504 exec->state = comp->states[trans->to]; 3505 exec->transno = 0; 3506 if (trans->atom != NULL) { 3507 exec->index += len; 3508 } 3509 goto progress; 3510 } else if (ret < 0) { 3511 exec->status = -4; 3512 break; 3513 } 3514 } 3515 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 3516 rollback: 3517 /* 3518 * Failed to find a way out 3519 */ 3520 exec->determinist = 0; 3521 #ifdef DEBUG_REGEXP_EXEC 3522 printf("rollback from state %d on %d:%c\n", exec->state->no, 3523 codepoint,codepoint); 3524 #endif 3525 xmlFARegExecRollBack(exec); 3526 } 3527 progress: 3528 continue; 3529 } 3530 error: 3531 if (exec->rollbacks != NULL) { 3532 if (exec->counts != NULL) { 3533 int i; 3534 3535 for (i = 0;i < exec->maxRollbacks;i++) 3536 if (exec->rollbacks[i].counts != NULL) 3537 xmlFree(exec->rollbacks[i].counts); 3538 } 3539 xmlFree(exec->rollbacks); 3540 } 3541 if (exec->state == NULL) 3542 return(-1); 3543 if (exec->counts != NULL) 3544 xmlFree(exec->counts); 3545 if (exec->status == 0) 3546 return(1); 3547 if (exec->status == -1) { 3548 if (exec->nbPush > MAX_PUSH) 3549 return(-1); 3550 return(0); 3551 } 3552 return(exec->status); 3553 } 3554 3555 /************************************************************************ 3556 * * 3557 * Progressive interface to the verifier one atom at a time * 3558 * * 3559 ************************************************************************/ 3560 #ifdef DEBUG_ERR 3561 static void testerr(xmlRegExecCtxtPtr exec); 3562 #endif 3563 3564 /** 3565 * xmlRegNewExecCtxt: 3566 * @comp: a precompiled regular expression 3567 * @callback: a callback function used for handling progresses in the 3568 * automata matching phase 3569 * @data: the context data associated to the callback in this context 3570 * 3571 * Build a context used for progressive evaluation of a regexp. 3572 * 3573 * Returns the new context 3574 */ 3575 xmlRegExecCtxtPtr 3576 xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) { 3577 xmlRegExecCtxtPtr exec; 3578 3579 if (comp == NULL) 3580 return(NULL); 3581 if ((comp->compact == NULL) && (comp->states == NULL)) 3582 return(NULL); 3583 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt)); 3584 if (exec == NULL) { 3585 xmlRegexpErrMemory(NULL, "creating execution context"); 3586 return(NULL); 3587 } 3588 memset(exec, 0, sizeof(xmlRegExecCtxt)); 3589 exec->inputString = NULL; 3590 exec->index = 0; 3591 exec->determinist = 1; 3592 exec->maxRollbacks = 0; 3593 exec->nbRollbacks = 0; 3594 exec->rollbacks = NULL; 3595 exec->status = 0; 3596 exec->comp = comp; 3597 if (comp->compact == NULL) 3598 exec->state = comp->states[0]; 3599 exec->transno = 0; 3600 exec->transcount = 0; 3601 exec->callback = callback; 3602 exec->data = data; 3603 if (comp->nbCounters > 0) { 3604 /* 3605 * For error handling, exec->counts is allocated twice the size 3606 * the second half is used to store the data in case of rollback 3607 */ 3608 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int) 3609 * 2); 3610 if (exec->counts == NULL) { 3611 xmlRegexpErrMemory(NULL, "creating execution context"); 3612 xmlFree(exec); 3613 return(NULL); 3614 } 3615 memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2); 3616 exec->errCounts = &exec->counts[comp->nbCounters]; 3617 } else { 3618 exec->counts = NULL; 3619 exec->errCounts = NULL; 3620 } 3621 exec->inputStackMax = 0; 3622 exec->inputStackNr = 0; 3623 exec->inputStack = NULL; 3624 exec->errStateNo = -1; 3625 exec->errString = NULL; 3626 exec->nbPush = 0; 3627 return(exec); 3628 } 3629 3630 /** 3631 * xmlRegFreeExecCtxt: 3632 * @exec: a regular expression evaluation context 3633 * 3634 * Free the structures associated to a regular expression evaluation context. 3635 */ 3636 void 3637 xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) { 3638 if (exec == NULL) 3639 return; 3640 3641 if (exec->rollbacks != NULL) { 3642 if (exec->counts != NULL) { 3643 int i; 3644 3645 for (i = 0;i < exec->maxRollbacks;i++) 3646 if (exec->rollbacks[i].counts != NULL) 3647 xmlFree(exec->rollbacks[i].counts); 3648 } 3649 xmlFree(exec->rollbacks); 3650 } 3651 if (exec->counts != NULL) 3652 xmlFree(exec->counts); 3653 if (exec->inputStack != NULL) { 3654 int i; 3655 3656 for (i = 0;i < exec->inputStackNr;i++) { 3657 if (exec->inputStack[i].value != NULL) 3658 xmlFree(exec->inputStack[i].value); 3659 } 3660 xmlFree(exec->inputStack); 3661 } 3662 if (exec->errString != NULL) 3663 xmlFree(exec->errString); 3664 xmlFree(exec); 3665 } 3666 3667 static void 3668 xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value, 3669 void *data) { 3670 #ifdef DEBUG_PUSH 3671 printf("saving value: %d:%s\n", exec->inputStackNr, value); 3672 #endif 3673 if (exec->inputStackMax == 0) { 3674 exec->inputStackMax = 4; 3675 exec->inputStack = (xmlRegInputTokenPtr) 3676 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken)); 3677 if (exec->inputStack == NULL) { 3678 xmlRegexpErrMemory(NULL, "pushing input string"); 3679 exec->inputStackMax = 0; 3680 return; 3681 } 3682 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) { 3683 xmlRegInputTokenPtr tmp; 3684 3685 exec->inputStackMax *= 2; 3686 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack, 3687 exec->inputStackMax * sizeof(xmlRegInputToken)); 3688 if (tmp == NULL) { 3689 xmlRegexpErrMemory(NULL, "pushing input string"); 3690 exec->inputStackMax /= 2; 3691 return; 3692 } 3693 exec->inputStack = tmp; 3694 } 3695 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value); 3696 exec->inputStack[exec->inputStackNr].data = data; 3697 exec->inputStackNr++; 3698 exec->inputStack[exec->inputStackNr].value = NULL; 3699 exec->inputStack[exec->inputStackNr].data = NULL; 3700 } 3701 3702 /** 3703 * xmlRegStrEqualWildcard: 3704 * @expStr: the string to be evaluated 3705 * @valStr: the validation string 3706 * 3707 * Checks if both strings are equal or have the same content. "*" 3708 * can be used as a wildcard in @valStr; "|" is used as a separator of 3709 * substrings in both @expStr and @valStr. 3710 * 3711 * Returns 1 if the comparison is satisfied and the number of substrings 3712 * is equal, 0 otherwise. 3713 */ 3714 3715 static int 3716 xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) { 3717 if (expStr == valStr) return(1); 3718 if (expStr == NULL) return(0); 3719 if (valStr == NULL) return(0); 3720 do { 3721 /* 3722 * Eval if we have a wildcard for the current item. 3723 */ 3724 if (*expStr != *valStr) { 3725 /* if one of them starts with a wildcard make valStr be it */ 3726 if (*valStr == '*') { 3727 const xmlChar *tmp; 3728 3729 tmp = valStr; 3730 valStr = expStr; 3731 expStr = tmp; 3732 } 3733 if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) { 3734 do { 3735 if (*valStr == XML_REG_STRING_SEPARATOR) 3736 break; 3737 valStr++; 3738 } while (*valStr != 0); 3739 continue; 3740 } else 3741 return(0); 3742 } 3743 expStr++; 3744 valStr++; 3745 } while (*valStr != 0); 3746 if (*expStr != 0) 3747 return (0); 3748 else 3749 return (1); 3750 } 3751 3752 /** 3753 * xmlRegCompactPushString: 3754 * @exec: a regexp execution context 3755 * @comp: the precompiled exec with a compact table 3756 * @value: a string token input 3757 * @data: data associated to the token to reuse in callbacks 3758 * 3759 * Push one input token in the execution context 3760 * 3761 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 3762 * a negative value in case of error. 3763 */ 3764 static int 3765 xmlRegCompactPushString(xmlRegExecCtxtPtr exec, 3766 xmlRegexpPtr comp, 3767 const xmlChar *value, 3768 void *data) { 3769 int state = exec->index; 3770 int i, target; 3771 3772 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL)) 3773 return(-1); 3774 3775 if (value == NULL) { 3776 /* 3777 * are we at a final state ? 3778 */ 3779 if (comp->compact[state * (comp->nbstrings + 1)] == 3780 XML_REGEXP_FINAL_STATE) 3781 return(1); 3782 return(0); 3783 } 3784 3785 #ifdef DEBUG_PUSH 3786 printf("value pushed: %s\n", value); 3787 #endif 3788 3789 /* 3790 * Examine all outside transitions from current state 3791 */ 3792 for (i = 0;i < comp->nbstrings;i++) { 3793 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 3794 if ((target > 0) && (target <= comp->nbstates)) { 3795 target--; /* to avoid 0 */ 3796 if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) { 3797 exec->index = target; 3798 if ((exec->callback != NULL) && (comp->transdata != NULL)) { 3799 exec->callback(exec->data, value, 3800 comp->transdata[state * comp->nbstrings + i], data); 3801 } 3802 #ifdef DEBUG_PUSH 3803 printf("entering state %d\n", target); 3804 #endif 3805 if (comp->compact[target * (comp->nbstrings + 1)] == 3806 XML_REGEXP_SINK_STATE) 3807 goto error; 3808 3809 if (comp->compact[target * (comp->nbstrings + 1)] == 3810 XML_REGEXP_FINAL_STATE) 3811 return(1); 3812 return(0); 3813 } 3814 } 3815 } 3816 /* 3817 * Failed to find an exit transition out from current state for the 3818 * current token 3819 */ 3820 #ifdef DEBUG_PUSH 3821 printf("failed to find a transition for %s on state %d\n", value, state); 3822 #endif 3823 error: 3824 if (exec->errString != NULL) 3825 xmlFree(exec->errString); 3826 exec->errString = xmlStrdup(value); 3827 exec->errStateNo = state; 3828 exec->status = -1; 3829 #ifdef DEBUG_ERR 3830 testerr(exec); 3831 #endif 3832 return(-1); 3833 } 3834 3835 /** 3836 * xmlRegExecPushStringInternal: 3837 * @exec: a regexp execution context or NULL to indicate the end 3838 * @value: a string token input 3839 * @data: data associated to the token to reuse in callbacks 3840 * @compound: value was assembled from 2 strings 3841 * 3842 * Push one input token in the execution context 3843 * 3844 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 3845 * a negative value in case of error. 3846 */ 3847 static int 3848 xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value, 3849 void *data, int compound) { 3850 xmlRegTransPtr trans; 3851 xmlRegAtomPtr atom; 3852 int ret; 3853 int final = 0; 3854 int progress = 1; 3855 3856 if (exec == NULL) 3857 return(-1); 3858 if (exec->comp == NULL) 3859 return(-1); 3860 if (exec->status != 0) 3861 return(exec->status); 3862 3863 if (exec->comp->compact != NULL) 3864 return(xmlRegCompactPushString(exec, exec->comp, value, data)); 3865 3866 if (value == NULL) { 3867 if (exec->state->type == XML_REGEXP_FINAL_STATE) 3868 return(1); 3869 final = 1; 3870 } 3871 3872 #ifdef DEBUG_PUSH 3873 printf("value pushed: %s\n", value); 3874 #endif 3875 /* 3876 * If we have an active rollback stack push the new value there 3877 * and get back to where we were left 3878 */ 3879 if ((value != NULL) && (exec->inputStackNr > 0)) { 3880 xmlFARegExecSaveInputString(exec, value, data); 3881 value = exec->inputStack[exec->index].value; 3882 data = exec->inputStack[exec->index].data; 3883 #ifdef DEBUG_PUSH 3884 printf("value loaded: %s\n", value); 3885 #endif 3886 } 3887 3888 while ((exec->status == 0) && 3889 ((value != NULL) || 3890 ((final == 1) && 3891 (exec->state->type != XML_REGEXP_FINAL_STATE)))) { 3892 3893 /* 3894 * End of input on non-terminal state, rollback, however we may 3895 * still have epsilon like transition for counted transitions 3896 * on counters, in that case don't break too early. 3897 */ 3898 if ((value == NULL) && (exec->counts == NULL)) 3899 goto rollback; 3900 3901 exec->transcount = 0; 3902 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 3903 trans = &exec->state->trans[exec->transno]; 3904 if (trans->to < 0) 3905 continue; 3906 atom = trans->atom; 3907 ret = 0; 3908 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 3909 int i; 3910 int count; 3911 xmlRegTransPtr t; 3912 xmlRegCounterPtr counter; 3913 3914 ret = 0; 3915 3916 #ifdef DEBUG_PUSH 3917 printf("testing all lax %d\n", trans->count); 3918 #endif 3919 /* 3920 * Check all counted transitions from the current state 3921 */ 3922 if ((value == NULL) && (final)) { 3923 ret = 1; 3924 } else if (value != NULL) { 3925 for (i = 0;i < exec->state->nbTrans;i++) { 3926 t = &exec->state->trans[i]; 3927 if ((t->counter < 0) || (t == trans)) 3928 continue; 3929 counter = &exec->comp->counters[t->counter]; 3930 count = exec->counts[t->counter]; 3931 if ((count < counter->max) && 3932 (t->atom != NULL) && 3933 (xmlStrEqual(value, t->atom->valuep))) { 3934 ret = 0; 3935 break; 3936 } 3937 if ((count >= counter->min) && 3938 (count < counter->max) && 3939 (t->atom != NULL) && 3940 (xmlStrEqual(value, t->atom->valuep))) { 3941 ret = 1; 3942 break; 3943 } 3944 } 3945 } 3946 } else if (trans->count == REGEXP_ALL_COUNTER) { 3947 int i; 3948 int count; 3949 xmlRegTransPtr t; 3950 xmlRegCounterPtr counter; 3951 3952 ret = 1; 3953 3954 #ifdef DEBUG_PUSH 3955 printf("testing all %d\n", trans->count); 3956 #endif 3957 /* 3958 * Check all counted transitions from the current state 3959 */ 3960 for (i = 0;i < exec->state->nbTrans;i++) { 3961 t = &exec->state->trans[i]; 3962 if ((t->counter < 0) || (t == trans)) 3963 continue; 3964 counter = &exec->comp->counters[t->counter]; 3965 count = exec->counts[t->counter]; 3966 if ((count < counter->min) || (count > counter->max)) { 3967 ret = 0; 3968 break; 3969 } 3970 } 3971 } else if (trans->count >= 0) { 3972 int count; 3973 xmlRegCounterPtr counter; 3974 3975 /* 3976 * A counted transition. 3977 */ 3978 3979 count = exec->counts[trans->count]; 3980 counter = &exec->comp->counters[trans->count]; 3981 #ifdef DEBUG_PUSH 3982 printf("testing count %d: val %d, min %d, max %d\n", 3983 trans->count, count, counter->min, counter->max); 3984 #endif 3985 ret = ((count >= counter->min) && (count <= counter->max)); 3986 } else if (atom == NULL) { 3987 fprintf(stderr, "epsilon transition left at runtime\n"); 3988 exec->status = -2; 3989 break; 3990 } else if (value != NULL) { 3991 ret = xmlRegStrEqualWildcard(atom->valuep, value); 3992 if (atom->neg) { 3993 ret = !ret; 3994 if (!compound) 3995 ret = 0; 3996 } 3997 if ((ret == 1) && (trans->counter >= 0)) { 3998 xmlRegCounterPtr counter; 3999 int count; 4000 4001 count = exec->counts[trans->counter]; 4002 counter = &exec->comp->counters[trans->counter]; 4003 if (count >= counter->max) 4004 ret = 0; 4005 } 4006 4007 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 4008 xmlRegStatePtr to = exec->comp->states[trans->to]; 4009 4010 /* 4011 * this is a multiple input sequence 4012 */ 4013 if (exec->state->nbTrans > exec->transno + 1) { 4014 if (exec->inputStackNr <= 0) { 4015 xmlFARegExecSaveInputString(exec, value, data); 4016 } 4017 xmlFARegExecSave(exec); 4018 } 4019 exec->transcount = 1; 4020 do { 4021 /* 4022 * Try to progress as much as possible on the input 4023 */ 4024 if (exec->transcount == atom->max) { 4025 break; 4026 } 4027 exec->index++; 4028 value = exec->inputStack[exec->index].value; 4029 data = exec->inputStack[exec->index].data; 4030 #ifdef DEBUG_PUSH 4031 printf("value loaded: %s\n", value); 4032 #endif 4033 4034 /* 4035 * End of input: stop here 4036 */ 4037 if (value == NULL) { 4038 exec->index --; 4039 break; 4040 } 4041 if (exec->transcount >= atom->min) { 4042 int transno = exec->transno; 4043 xmlRegStatePtr state = exec->state; 4044 4045 /* 4046 * The transition is acceptable save it 4047 */ 4048 exec->transno = -1; /* trick */ 4049 exec->state = to; 4050 if (exec->inputStackNr <= 0) { 4051 xmlFARegExecSaveInputString(exec, value, data); 4052 } 4053 xmlFARegExecSave(exec); 4054 exec->transno = transno; 4055 exec->state = state; 4056 } 4057 ret = xmlStrEqual(value, atom->valuep); 4058 exec->transcount++; 4059 } while (ret == 1); 4060 if (exec->transcount < atom->min) 4061 ret = 0; 4062 4063 /* 4064 * If the last check failed but one transition was found 4065 * possible, rollback 4066 */ 4067 if (ret < 0) 4068 ret = 0; 4069 if (ret == 0) { 4070 goto rollback; 4071 } 4072 } 4073 } 4074 if (ret == 1) { 4075 if ((exec->callback != NULL) && (atom != NULL) && 4076 (data != NULL)) { 4077 exec->callback(exec->data, atom->valuep, 4078 atom->data, data); 4079 } 4080 if (exec->state->nbTrans > exec->transno + 1) { 4081 if (exec->inputStackNr <= 0) { 4082 xmlFARegExecSaveInputString(exec, value, data); 4083 } 4084 xmlFARegExecSave(exec); 4085 } 4086 if (trans->counter >= 0) { 4087 #ifdef DEBUG_PUSH 4088 printf("Increasing count %d\n", trans->counter); 4089 #endif 4090 exec->counts[trans->counter]++; 4091 } 4092 if ((trans->count >= 0) && 4093 (trans->count < REGEXP_ALL_COUNTER)) { 4094 #ifdef DEBUG_REGEXP_EXEC 4095 printf("resetting count %d on transition\n", 4096 trans->count); 4097 #endif 4098 exec->counts[trans->count] = 0; 4099 } 4100 #ifdef DEBUG_PUSH 4101 printf("entering state %d\n", trans->to); 4102 #endif 4103 if ((exec->comp->states[trans->to] != NULL) && 4104 (exec->comp->states[trans->to]->type == 4105 XML_REGEXP_SINK_STATE)) { 4106 /* 4107 * entering a sink state, save the current state as error 4108 * state. 4109 */ 4110 if (exec->errString != NULL) 4111 xmlFree(exec->errString); 4112 exec->errString = xmlStrdup(value); 4113 exec->errState = exec->state; 4114 memcpy(exec->errCounts, exec->counts, 4115 exec->comp->nbCounters * sizeof(int)); 4116 } 4117 exec->state = exec->comp->states[trans->to]; 4118 exec->transno = 0; 4119 if (trans->atom != NULL) { 4120 if (exec->inputStack != NULL) { 4121 exec->index++; 4122 if (exec->index < exec->inputStackNr) { 4123 value = exec->inputStack[exec->index].value; 4124 data = exec->inputStack[exec->index].data; 4125 #ifdef DEBUG_PUSH 4126 printf("value loaded: %s\n", value); 4127 #endif 4128 } else { 4129 value = NULL; 4130 data = NULL; 4131 #ifdef DEBUG_PUSH 4132 printf("end of input\n"); 4133 #endif 4134 } 4135 } else { 4136 value = NULL; 4137 data = NULL; 4138 #ifdef DEBUG_PUSH 4139 printf("end of input\n"); 4140 #endif 4141 } 4142 } 4143 goto progress; 4144 } else if (ret < 0) { 4145 exec->status = -4; 4146 break; 4147 } 4148 } 4149 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 4150 rollback: 4151 /* 4152 * if we didn't yet rollback on the current input 4153 * store the current state as the error state. 4154 */ 4155 if ((progress) && (exec->state != NULL) && 4156 (exec->state->type != XML_REGEXP_SINK_STATE)) { 4157 progress = 0; 4158 if (exec->errString != NULL) 4159 xmlFree(exec->errString); 4160 exec->errString = xmlStrdup(value); 4161 exec->errState = exec->state; 4162 if (exec->comp->nbCounters) 4163 memcpy(exec->errCounts, exec->counts, 4164 exec->comp->nbCounters * sizeof(int)); 4165 } 4166 4167 /* 4168 * Failed to find a way out 4169 */ 4170 exec->determinist = 0; 4171 xmlFARegExecRollBack(exec); 4172 if ((exec->inputStack != NULL ) && (exec->status == 0)) { 4173 value = exec->inputStack[exec->index].value; 4174 data = exec->inputStack[exec->index].data; 4175 #ifdef DEBUG_PUSH 4176 printf("value loaded: %s\n", value); 4177 #endif 4178 } 4179 } 4180 continue; 4181 progress: 4182 progress = 1; 4183 continue; 4184 } 4185 if (exec->status == 0) { 4186 return(exec->state->type == XML_REGEXP_FINAL_STATE); 4187 } 4188 #ifdef DEBUG_ERR 4189 if (exec->status < 0) { 4190 testerr(exec); 4191 } 4192 #endif 4193 return(exec->status); 4194 } 4195 4196 /** 4197 * xmlRegExecPushString: 4198 * @exec: a regexp execution context or NULL to indicate the end 4199 * @value: a string token input 4200 * @data: data associated to the token to reuse in callbacks 4201 * 4202 * Push one input token in the execution context 4203 * 4204 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 4205 * a negative value in case of error. 4206 */ 4207 int 4208 xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value, 4209 void *data) { 4210 return(xmlRegExecPushStringInternal(exec, value, data, 0)); 4211 } 4212 4213 /** 4214 * xmlRegExecPushString2: 4215 * @exec: a regexp execution context or NULL to indicate the end 4216 * @value: the first string token input 4217 * @value2: the second string token input 4218 * @data: data associated to the token to reuse in callbacks 4219 * 4220 * Push one input token in the execution context 4221 * 4222 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 4223 * a negative value in case of error. 4224 */ 4225 int 4226 xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value, 4227 const xmlChar *value2, void *data) { 4228 xmlChar buf[150]; 4229 int lenn, lenp, ret; 4230 xmlChar *str; 4231 4232 if (exec == NULL) 4233 return(-1); 4234 if (exec->comp == NULL) 4235 return(-1); 4236 if (exec->status != 0) 4237 return(exec->status); 4238 4239 if (value2 == NULL) 4240 return(xmlRegExecPushString(exec, value, data)); 4241 4242 lenn = strlen((char *) value2); 4243 lenp = strlen((char *) value); 4244 4245 if (150 < lenn + lenp + 2) { 4246 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 4247 if (str == NULL) { 4248 exec->status = -1; 4249 return(-1); 4250 } 4251 } else { 4252 str = buf; 4253 } 4254 memcpy(&str[0], value, lenp); 4255 str[lenp] = XML_REG_STRING_SEPARATOR; 4256 memcpy(&str[lenp + 1], value2, lenn); 4257 str[lenn + lenp + 1] = 0; 4258 4259 if (exec->comp->compact != NULL) 4260 ret = xmlRegCompactPushString(exec, exec->comp, str, data); 4261 else 4262 ret = xmlRegExecPushStringInternal(exec, str, data, 1); 4263 4264 if (str != buf) 4265 xmlFree(str); 4266 return(ret); 4267 } 4268 4269 /** 4270 * xmlRegExecGetValues: 4271 * @exec: a regexp execution context 4272 * @err: error extraction or normal one 4273 * @nbval: pointer to the number of accepted values IN/OUT 4274 * @nbneg: return number of negative transitions 4275 * @values: pointer to the array of acceptable values 4276 * @terminal: return value if this was a terminal state 4277 * 4278 * Extract information from the regexp execution, internal routine to 4279 * implement xmlRegExecNextValues() and xmlRegExecErrInfo() 4280 * 4281 * Returns: 0 in case of success or -1 in case of error. 4282 */ 4283 static int 4284 xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err, 4285 int *nbval, int *nbneg, 4286 xmlChar **values, int *terminal) { 4287 int maxval; 4288 int nb = 0; 4289 4290 if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) || 4291 (values == NULL) || (*nbval <= 0)) 4292 return(-1); 4293 4294 maxval = *nbval; 4295 *nbval = 0; 4296 *nbneg = 0; 4297 if ((exec->comp != NULL) && (exec->comp->compact != NULL)) { 4298 xmlRegexpPtr comp; 4299 int target, i, state; 4300 4301 comp = exec->comp; 4302 4303 if (err) { 4304 if (exec->errStateNo == -1) return(-1); 4305 state = exec->errStateNo; 4306 } else { 4307 state = exec->index; 4308 } 4309 if (terminal != NULL) { 4310 if (comp->compact[state * (comp->nbstrings + 1)] == 4311 XML_REGEXP_FINAL_STATE) 4312 *terminal = 1; 4313 else 4314 *terminal = 0; 4315 } 4316 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) { 4317 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 4318 if ((target > 0) && (target <= comp->nbstates) && 4319 (comp->compact[(target - 1) * (comp->nbstrings + 1)] != 4320 XML_REGEXP_SINK_STATE)) { 4321 values[nb++] = comp->stringMap[i]; 4322 (*nbval)++; 4323 } 4324 } 4325 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) { 4326 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 4327 if ((target > 0) && (target <= comp->nbstates) && 4328 (comp->compact[(target - 1) * (comp->nbstrings + 1)] == 4329 XML_REGEXP_SINK_STATE)) { 4330 values[nb++] = comp->stringMap[i]; 4331 (*nbneg)++; 4332 } 4333 } 4334 } else { 4335 int transno; 4336 xmlRegTransPtr trans; 4337 xmlRegAtomPtr atom; 4338 xmlRegStatePtr state; 4339 4340 if (terminal != NULL) { 4341 if (exec->state->type == XML_REGEXP_FINAL_STATE) 4342 *terminal = 1; 4343 else 4344 *terminal = 0; 4345 } 4346 4347 if (err) { 4348 if (exec->errState == NULL) return(-1); 4349 state = exec->errState; 4350 } else { 4351 if (exec->state == NULL) return(-1); 4352 state = exec->state; 4353 } 4354 for (transno = 0; 4355 (transno < state->nbTrans) && (nb < maxval); 4356 transno++) { 4357 trans = &state->trans[transno]; 4358 if (trans->to < 0) 4359 continue; 4360 atom = trans->atom; 4361 if ((atom == NULL) || (atom->valuep == NULL)) 4362 continue; 4363 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 4364 /* this should not be reached but ... */ 4365 TODO; 4366 } else if (trans->count == REGEXP_ALL_COUNTER) { 4367 /* this should not be reached but ... */ 4368 TODO; 4369 } else if (trans->counter >= 0) { 4370 xmlRegCounterPtr counter = NULL; 4371 int count; 4372 4373 if (err) 4374 count = exec->errCounts[trans->counter]; 4375 else 4376 count = exec->counts[trans->counter]; 4377 if (exec->comp != NULL) 4378 counter = &exec->comp->counters[trans->counter]; 4379 if ((counter == NULL) || (count < counter->max)) { 4380 if (atom->neg) 4381 values[nb++] = (xmlChar *) atom->valuep2; 4382 else 4383 values[nb++] = (xmlChar *) atom->valuep; 4384 (*nbval)++; 4385 } 4386 } else { 4387 if ((exec->comp != NULL) && (exec->comp->states[trans->to] != NULL) && 4388 (exec->comp->states[trans->to]->type != 4389 XML_REGEXP_SINK_STATE)) { 4390 if (atom->neg) 4391 values[nb++] = (xmlChar *) atom->valuep2; 4392 else 4393 values[nb++] = (xmlChar *) atom->valuep; 4394 (*nbval)++; 4395 } 4396 } 4397 } 4398 for (transno = 0; 4399 (transno < state->nbTrans) && (nb < maxval); 4400 transno++) { 4401 trans = &state->trans[transno]; 4402 if (trans->to < 0) 4403 continue; 4404 atom = trans->atom; 4405 if ((atom == NULL) || (atom->valuep == NULL)) 4406 continue; 4407 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 4408 continue; 4409 } else if (trans->count == REGEXP_ALL_COUNTER) { 4410 continue; 4411 } else if (trans->counter >= 0) { 4412 continue; 4413 } else { 4414 if ((exec->comp->states[trans->to] != NULL) && 4415 (exec->comp->states[trans->to]->type == 4416 XML_REGEXP_SINK_STATE)) { 4417 if (atom->neg) 4418 values[nb++] = (xmlChar *) atom->valuep2; 4419 else 4420 values[nb++] = (xmlChar *) atom->valuep; 4421 (*nbneg)++; 4422 } 4423 } 4424 } 4425 } 4426 return(0); 4427 } 4428 4429 /** 4430 * xmlRegExecNextValues: 4431 * @exec: a regexp execution context 4432 * @nbval: pointer to the number of accepted values IN/OUT 4433 * @nbneg: return number of negative transitions 4434 * @values: pointer to the array of acceptable values 4435 * @terminal: return value if this was a terminal state 4436 * 4437 * Extract information from the regexp execution, 4438 * the parameter @values must point to an array of @nbval string pointers 4439 * on return nbval will contain the number of possible strings in that 4440 * state and the @values array will be updated with them. The string values 4441 * returned will be freed with the @exec context and don't need to be 4442 * deallocated. 4443 * 4444 * Returns: 0 in case of success or -1 in case of error. 4445 */ 4446 int 4447 xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg, 4448 xmlChar **values, int *terminal) { 4449 return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal)); 4450 } 4451 4452 /** 4453 * xmlRegExecErrInfo: 4454 * @exec: a regexp execution context generating an error 4455 * @string: return value for the error string 4456 * @nbval: pointer to the number of accepted values IN/OUT 4457 * @nbneg: return number of negative transitions 4458 * @values: pointer to the array of acceptable values 4459 * @terminal: return value if this was a terminal state 4460 * 4461 * Extract error information from the regexp execution, the parameter 4462 * @string will be updated with the value pushed and not accepted, 4463 * the parameter @values must point to an array of @nbval string pointers 4464 * on return nbval will contain the number of possible strings in that 4465 * state and the @values array will be updated with them. The string values 4466 * returned will be freed with the @exec context and don't need to be 4467 * deallocated. 4468 * 4469 * Returns: 0 in case of success or -1 in case of error. 4470 */ 4471 int 4472 xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string, 4473 int *nbval, int *nbneg, xmlChar **values, int *terminal) { 4474 if (exec == NULL) 4475 return(-1); 4476 if (string != NULL) { 4477 if (exec->status != 0) 4478 *string = exec->errString; 4479 else 4480 *string = NULL; 4481 } 4482 return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal)); 4483 } 4484 4485 #ifdef DEBUG_ERR 4486 static void testerr(xmlRegExecCtxtPtr exec) { 4487 const xmlChar *string; 4488 xmlChar *values[5]; 4489 int nb = 5; 4490 int nbneg; 4491 int terminal; 4492 xmlRegExecErrInfo(exec, &string, &nb, &nbneg, &values[0], &terminal); 4493 } 4494 #endif 4495 4496 #if 0 4497 static int 4498 xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) { 4499 xmlRegTransPtr trans; 4500 xmlRegAtomPtr atom; 4501 int ret; 4502 int codepoint, len; 4503 4504 if (exec == NULL) 4505 return(-1); 4506 if (exec->status != 0) 4507 return(exec->status); 4508 4509 while ((exec->status == 0) && 4510 ((exec->inputString[exec->index] != 0) || 4511 (exec->state->type != XML_REGEXP_FINAL_STATE))) { 4512 4513 /* 4514 * End of input on non-terminal state, rollback, however we may 4515 * still have epsilon like transition for counted transitions 4516 * on counters, in that case don't break too early. 4517 */ 4518 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) 4519 goto rollback; 4520 4521 exec->transcount = 0; 4522 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 4523 trans = &exec->state->trans[exec->transno]; 4524 if (trans->to < 0) 4525 continue; 4526 atom = trans->atom; 4527 ret = 0; 4528 if (trans->count >= 0) { 4529 int count; 4530 xmlRegCounterPtr counter; 4531 4532 /* 4533 * A counted transition. 4534 */ 4535 4536 count = exec->counts[trans->count]; 4537 counter = &exec->comp->counters[trans->count]; 4538 #ifdef DEBUG_REGEXP_EXEC 4539 printf("testing count %d: val %d, min %d, max %d\n", 4540 trans->count, count, counter->min, counter->max); 4541 #endif 4542 ret = ((count >= counter->min) && (count <= counter->max)); 4543 } else if (atom == NULL) { 4544 fprintf(stderr, "epsilon transition left at runtime\n"); 4545 exec->status = -2; 4546 break; 4547 } else if (exec->inputString[exec->index] != 0) { 4548 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 4549 ret = xmlRegCheckCharacter(atom, codepoint); 4550 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 4551 xmlRegStatePtr to = exec->comp->states[trans->to]; 4552 4553 /* 4554 * this is a multiple input sequence 4555 */ 4556 if (exec->state->nbTrans > exec->transno + 1) { 4557 xmlFARegExecSave(exec); 4558 } 4559 exec->transcount = 1; 4560 do { 4561 /* 4562 * Try to progress as much as possible on the input 4563 */ 4564 if (exec->transcount == atom->max) { 4565 break; 4566 } 4567 exec->index += len; 4568 /* 4569 * End of input: stop here 4570 */ 4571 if (exec->inputString[exec->index] == 0) { 4572 exec->index -= len; 4573 break; 4574 } 4575 if (exec->transcount >= atom->min) { 4576 int transno = exec->transno; 4577 xmlRegStatePtr state = exec->state; 4578 4579 /* 4580 * The transition is acceptable save it 4581 */ 4582 exec->transno = -1; /* trick */ 4583 exec->state = to; 4584 xmlFARegExecSave(exec); 4585 exec->transno = transno; 4586 exec->state = state; 4587 } 4588 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 4589 len); 4590 ret = xmlRegCheckCharacter(atom, codepoint); 4591 exec->transcount++; 4592 } while (ret == 1); 4593 if (exec->transcount < atom->min) 4594 ret = 0; 4595 4596 /* 4597 * If the last check failed but one transition was found 4598 * possible, rollback 4599 */ 4600 if (ret < 0) 4601 ret = 0; 4602 if (ret == 0) { 4603 goto rollback; 4604 } 4605 } 4606 } 4607 if (ret == 1) { 4608 if (exec->state->nbTrans > exec->transno + 1) { 4609 xmlFARegExecSave(exec); 4610 } 4611 /* 4612 * restart count for expressions like this ((abc){2})* 4613 */ 4614 if (trans->count >= 0) { 4615 #ifdef DEBUG_REGEXP_EXEC 4616 printf("Reset count %d\n", trans->count); 4617 #endif 4618 exec->counts[trans->count] = 0; 4619 } 4620 if (trans->counter >= 0) { 4621 #ifdef DEBUG_REGEXP_EXEC 4622 printf("Increasing count %d\n", trans->counter); 4623 #endif 4624 exec->counts[trans->counter]++; 4625 } 4626 #ifdef DEBUG_REGEXP_EXEC 4627 printf("entering state %d\n", trans->to); 4628 #endif 4629 exec->state = exec->comp->states[trans->to]; 4630 exec->transno = 0; 4631 if (trans->atom != NULL) { 4632 exec->index += len; 4633 } 4634 goto progress; 4635 } else if (ret < 0) { 4636 exec->status = -4; 4637 break; 4638 } 4639 } 4640 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 4641 rollback: 4642 /* 4643 * Failed to find a way out 4644 */ 4645 exec->determinist = 0; 4646 xmlFARegExecRollBack(exec); 4647 } 4648 progress: 4649 continue; 4650 } 4651 } 4652 #endif 4653 /************************************************************************ 4654 * * 4655 * Parser for the Schemas Datatype Regular Expressions * 4656 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs * 4657 * * 4658 ************************************************************************/ 4659 4660 /** 4661 * xmlFAIsChar: 4662 * @ctxt: a regexp parser context 4663 * 4664 * [10] Char ::= [^.\?*+()|#x5B#x5D] 4665 */ 4666 static int 4667 xmlFAIsChar(xmlRegParserCtxtPtr ctxt) { 4668 int cur; 4669 int len; 4670 4671 cur = CUR_SCHAR(ctxt->cur, len); 4672 if ((cur == '.') || (cur == '\\') || (cur == '?') || 4673 (cur == '*') || (cur == '+') || (cur == '(') || 4674 (cur == ')') || (cur == '|') || (cur == 0x5B) || 4675 (cur == 0x5D) || (cur == 0)) 4676 return(-1); 4677 return(cur); 4678 } 4679 4680 /** 4681 * xmlFAParseCharProp: 4682 * @ctxt: a regexp parser context 4683 * 4684 * [27] charProp ::= IsCategory | IsBlock 4685 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation | 4686 * Separators | Symbols | Others 4687 * [29] Letters ::= 'L' [ultmo]? 4688 * [30] Marks ::= 'M' [nce]? 4689 * [31] Numbers ::= 'N' [dlo]? 4690 * [32] Punctuation ::= 'P' [cdseifo]? 4691 * [33] Separators ::= 'Z' [slp]? 4692 * [34] Symbols ::= 'S' [mcko]? 4693 * [35] Others ::= 'C' [cfon]? 4694 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+ 4695 */ 4696 static void 4697 xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) { 4698 int cur; 4699 xmlRegAtomType type = (xmlRegAtomType) 0; 4700 xmlChar *blockName = NULL; 4701 4702 cur = CUR; 4703 if (cur == 'L') { 4704 NEXT; 4705 cur = CUR; 4706 if (cur == 'u') { 4707 NEXT; 4708 type = XML_REGEXP_LETTER_UPPERCASE; 4709 } else if (cur == 'l') { 4710 NEXT; 4711 type = XML_REGEXP_LETTER_LOWERCASE; 4712 } else if (cur == 't') { 4713 NEXT; 4714 type = XML_REGEXP_LETTER_TITLECASE; 4715 } else if (cur == 'm') { 4716 NEXT; 4717 type = XML_REGEXP_LETTER_MODIFIER; 4718 } else if (cur == 'o') { 4719 NEXT; 4720 type = XML_REGEXP_LETTER_OTHERS; 4721 } else { 4722 type = XML_REGEXP_LETTER; 4723 } 4724 } else if (cur == 'M') { 4725 NEXT; 4726 cur = CUR; 4727 if (cur == 'n') { 4728 NEXT; 4729 /* nonspacing */ 4730 type = XML_REGEXP_MARK_NONSPACING; 4731 } else if (cur == 'c') { 4732 NEXT; 4733 /* spacing combining */ 4734 type = XML_REGEXP_MARK_SPACECOMBINING; 4735 } else if (cur == 'e') { 4736 NEXT; 4737 /* enclosing */ 4738 type = XML_REGEXP_MARK_ENCLOSING; 4739 } else { 4740 /* all marks */ 4741 type = XML_REGEXP_MARK; 4742 } 4743 } else if (cur == 'N') { 4744 NEXT; 4745 cur = CUR; 4746 if (cur == 'd') { 4747 NEXT; 4748 /* digital */ 4749 type = XML_REGEXP_NUMBER_DECIMAL; 4750 } else if (cur == 'l') { 4751 NEXT; 4752 /* letter */ 4753 type = XML_REGEXP_NUMBER_LETTER; 4754 } else if (cur == 'o') { 4755 NEXT; 4756 /* other */ 4757 type = XML_REGEXP_NUMBER_OTHERS; 4758 } else { 4759 /* all numbers */ 4760 type = XML_REGEXP_NUMBER; 4761 } 4762 } else if (cur == 'P') { 4763 NEXT; 4764 cur = CUR; 4765 if (cur == 'c') { 4766 NEXT; 4767 /* connector */ 4768 type = XML_REGEXP_PUNCT_CONNECTOR; 4769 } else if (cur == 'd') { 4770 NEXT; 4771 /* dash */ 4772 type = XML_REGEXP_PUNCT_DASH; 4773 } else if (cur == 's') { 4774 NEXT; 4775 /* open */ 4776 type = XML_REGEXP_PUNCT_OPEN; 4777 } else if (cur == 'e') { 4778 NEXT; 4779 /* close */ 4780 type = XML_REGEXP_PUNCT_CLOSE; 4781 } else if (cur == 'i') { 4782 NEXT; 4783 /* initial quote */ 4784 type = XML_REGEXP_PUNCT_INITQUOTE; 4785 } else if (cur == 'f') { 4786 NEXT; 4787 /* final quote */ 4788 type = XML_REGEXP_PUNCT_FINQUOTE; 4789 } else if (cur == 'o') { 4790 NEXT; 4791 /* other */ 4792 type = XML_REGEXP_PUNCT_OTHERS; 4793 } else { 4794 /* all punctuation */ 4795 type = XML_REGEXP_PUNCT; 4796 } 4797 } else if (cur == 'Z') { 4798 NEXT; 4799 cur = CUR; 4800 if (cur == 's') { 4801 NEXT; 4802 /* space */ 4803 type = XML_REGEXP_SEPAR_SPACE; 4804 } else if (cur == 'l') { 4805 NEXT; 4806 /* line */ 4807 type = XML_REGEXP_SEPAR_LINE; 4808 } else if (cur == 'p') { 4809 NEXT; 4810 /* paragraph */ 4811 type = XML_REGEXP_SEPAR_PARA; 4812 } else { 4813 /* all separators */ 4814 type = XML_REGEXP_SEPAR; 4815 } 4816 } else if (cur == 'S') { 4817 NEXT; 4818 cur = CUR; 4819 if (cur == 'm') { 4820 NEXT; 4821 type = XML_REGEXP_SYMBOL_MATH; 4822 /* math */ 4823 } else if (cur == 'c') { 4824 NEXT; 4825 type = XML_REGEXP_SYMBOL_CURRENCY; 4826 /* currency */ 4827 } else if (cur == 'k') { 4828 NEXT; 4829 type = XML_REGEXP_SYMBOL_MODIFIER; 4830 /* modifiers */ 4831 } else if (cur == 'o') { 4832 NEXT; 4833 type = XML_REGEXP_SYMBOL_OTHERS; 4834 /* other */ 4835 } else { 4836 /* all symbols */ 4837 type = XML_REGEXP_SYMBOL; 4838 } 4839 } else if (cur == 'C') { 4840 NEXT; 4841 cur = CUR; 4842 if (cur == 'c') { 4843 NEXT; 4844 /* control */ 4845 type = XML_REGEXP_OTHER_CONTROL; 4846 } else if (cur == 'f') { 4847 NEXT; 4848 /* format */ 4849 type = XML_REGEXP_OTHER_FORMAT; 4850 } else if (cur == 'o') { 4851 NEXT; 4852 /* private use */ 4853 type = XML_REGEXP_OTHER_PRIVATE; 4854 } else if (cur == 'n') { 4855 NEXT; 4856 /* not assigned */ 4857 type = XML_REGEXP_OTHER_NA; 4858 } else { 4859 /* all others */ 4860 type = XML_REGEXP_OTHER; 4861 } 4862 } else if (cur == 'I') { 4863 const xmlChar *start; 4864 NEXT; 4865 cur = CUR; 4866 if (cur != 's') { 4867 ERROR("IsXXXX expected"); 4868 return; 4869 } 4870 NEXT; 4871 start = ctxt->cur; 4872 cur = CUR; 4873 if (((cur >= 'a') && (cur <= 'z')) || 4874 ((cur >= 'A') && (cur <= 'Z')) || 4875 ((cur >= '0') && (cur <= '9')) || 4876 (cur == 0x2D)) { 4877 NEXT; 4878 cur = CUR; 4879 while (((cur >= 'a') && (cur <= 'z')) || 4880 ((cur >= 'A') && (cur <= 'Z')) || 4881 ((cur >= '0') && (cur <= '9')) || 4882 (cur == 0x2D)) { 4883 NEXT; 4884 cur = CUR; 4885 } 4886 } 4887 type = XML_REGEXP_BLOCK_NAME; 4888 blockName = xmlStrndup(start, ctxt->cur - start); 4889 } else { 4890 ERROR("Unknown char property"); 4891 return; 4892 } 4893 if (ctxt->atom == NULL) { 4894 ctxt->atom = xmlRegNewAtom(ctxt, type); 4895 if (ctxt->atom != NULL) 4896 ctxt->atom->valuep = blockName; 4897 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4898 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4899 type, 0, 0, blockName); 4900 } 4901 } 4902 4903 /** 4904 * xmlFAParseCharClassEsc: 4905 * @ctxt: a regexp parser context 4906 * 4907 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc ) 4908 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E] 4909 * [25] catEsc ::= '\p{' charProp '}' 4910 * [26] complEsc ::= '\P{' charProp '}' 4911 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW]) 4912 */ 4913 static void 4914 xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) { 4915 int cur; 4916 4917 if (CUR == '.') { 4918 if (ctxt->atom == NULL) { 4919 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR); 4920 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4921 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4922 XML_REGEXP_ANYCHAR, 0, 0, NULL); 4923 } 4924 NEXT; 4925 return; 4926 } 4927 if (CUR != '\\') { 4928 ERROR("Escaped sequence: expecting \\"); 4929 return; 4930 } 4931 NEXT; 4932 cur = CUR; 4933 if (cur == 'p') { 4934 NEXT; 4935 if (CUR != '{') { 4936 ERROR("Expecting '{'"); 4937 return; 4938 } 4939 NEXT; 4940 xmlFAParseCharProp(ctxt); 4941 if (CUR != '}') { 4942 ERROR("Expecting '}'"); 4943 return; 4944 } 4945 NEXT; 4946 } else if (cur == 'P') { 4947 NEXT; 4948 if (CUR != '{') { 4949 ERROR("Expecting '{'"); 4950 return; 4951 } 4952 NEXT; 4953 xmlFAParseCharProp(ctxt); 4954 if (ctxt->atom != NULL) 4955 ctxt->atom->neg = 1; 4956 if (CUR != '}') { 4957 ERROR("Expecting '}'"); 4958 return; 4959 } 4960 NEXT; 4961 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') || 4962 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') || 4963 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') || 4964 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) || 4965 (cur == 0x5E)) { 4966 if (ctxt->atom == NULL) { 4967 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 4968 if (ctxt->atom != NULL) { 4969 switch (cur) { 4970 case 'n': 4971 ctxt->atom->codepoint = '\n'; 4972 break; 4973 case 'r': 4974 ctxt->atom->codepoint = '\r'; 4975 break; 4976 case 't': 4977 ctxt->atom->codepoint = '\t'; 4978 break; 4979 default: 4980 ctxt->atom->codepoint = cur; 4981 } 4982 } 4983 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4984 switch (cur) { 4985 case 'n': 4986 cur = '\n'; 4987 break; 4988 case 'r': 4989 cur = '\r'; 4990 break; 4991 case 't': 4992 cur = '\t'; 4993 break; 4994 } 4995 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4996 XML_REGEXP_CHARVAL, cur, cur, NULL); 4997 } 4998 NEXT; 4999 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') || 5000 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') || 5001 (cur == 'w') || (cur == 'W')) { 5002 xmlRegAtomType type = XML_REGEXP_ANYSPACE; 5003 5004 switch (cur) { 5005 case 's': 5006 type = XML_REGEXP_ANYSPACE; 5007 break; 5008 case 'S': 5009 type = XML_REGEXP_NOTSPACE; 5010 break; 5011 case 'i': 5012 type = XML_REGEXP_INITNAME; 5013 break; 5014 case 'I': 5015 type = XML_REGEXP_NOTINITNAME; 5016 break; 5017 case 'c': 5018 type = XML_REGEXP_NAMECHAR; 5019 break; 5020 case 'C': 5021 type = XML_REGEXP_NOTNAMECHAR; 5022 break; 5023 case 'd': 5024 type = XML_REGEXP_DECIMAL; 5025 break; 5026 case 'D': 5027 type = XML_REGEXP_NOTDECIMAL; 5028 break; 5029 case 'w': 5030 type = XML_REGEXP_REALCHAR; 5031 break; 5032 case 'W': 5033 type = XML_REGEXP_NOTREALCHAR; 5034 break; 5035 } 5036 NEXT; 5037 if (ctxt->atom == NULL) { 5038 ctxt->atom = xmlRegNewAtom(ctxt, type); 5039 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 5040 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 5041 type, 0, 0, NULL); 5042 } 5043 } else { 5044 ERROR("Wrong escape sequence, misuse of character '\\'"); 5045 } 5046 } 5047 5048 /** 5049 * xmlFAParseCharRange: 5050 * @ctxt: a regexp parser context 5051 * 5052 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash 5053 * [18] seRange ::= charOrEsc '-' charOrEsc 5054 * [20] charOrEsc ::= XmlChar | SingleCharEsc 5055 * [21] XmlChar ::= [^\#x2D#x5B#x5D] 5056 * [22] XmlCharIncDash ::= [^\#x5B#x5D] 5057 */ 5058 static void 5059 xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) { 5060 int cur, len; 5061 int start = -1; 5062 int end = -1; 5063 5064 if (CUR == '\0') { 5065 ERROR("Expecting ']'"); 5066 return; 5067 } 5068 5069 cur = CUR; 5070 if (cur == '\\') { 5071 NEXT; 5072 cur = CUR; 5073 switch (cur) { 5074 case 'n': start = 0xA; break; 5075 case 'r': start = 0xD; break; 5076 case 't': start = 0x9; break; 5077 case '\\': case '|': case '.': case '-': case '^': case '?': 5078 case '*': case '+': case '{': case '}': case '(': case ')': 5079 case '[': case ']': 5080 start = cur; break; 5081 default: 5082 ERROR("Invalid escape value"); 5083 return; 5084 } 5085 end = start; 5086 len = 1; 5087 } else if ((cur != 0x5B) && (cur != 0x5D)) { 5088 end = start = CUR_SCHAR(ctxt->cur, len); 5089 } else { 5090 ERROR("Expecting a char range"); 5091 return; 5092 } 5093 /* 5094 * Since we are "inside" a range, we can assume ctxt->cur is past 5095 * the start of ctxt->string, and PREV should be safe 5096 */ 5097 if ((start == '-') && (NXT(1) != ']') && (PREV != '[') && (PREV != '^')) { 5098 NEXTL(len); 5099 return; 5100 } 5101 NEXTL(len); 5102 cur = CUR; 5103 if ((cur != '-') || (NXT(1) == ']')) { 5104 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 5105 XML_REGEXP_CHARVAL, start, end, NULL); 5106 return; 5107 } 5108 NEXT; 5109 cur = CUR; 5110 if (cur == '\\') { 5111 NEXT; 5112 cur = CUR; 5113 switch (cur) { 5114 case 'n': end = 0xA; break; 5115 case 'r': end = 0xD; break; 5116 case 't': end = 0x9; break; 5117 case '\\': case '|': case '.': case '-': case '^': case '?': 5118 case '*': case '+': case '{': case '}': case '(': case ')': 5119 case '[': case ']': 5120 end = cur; break; 5121 default: 5122 ERROR("Invalid escape value"); 5123 return; 5124 } 5125 len = 1; 5126 } else if ((cur != '\0') && (cur != 0x5B) && (cur != 0x5D)) { 5127 end = CUR_SCHAR(ctxt->cur, len); 5128 } else { 5129 ERROR("Expecting the end of a char range"); 5130 return; 5131 } 5132 5133 /* TODO check that the values are acceptable character ranges for XML */ 5134 if (end < start) { 5135 ERROR("End of range is before start of range"); 5136 } else { 5137 NEXTL(len); 5138 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 5139 XML_REGEXP_CHARVAL, start, end, NULL); 5140 } 5141 return; 5142 } 5143 5144 /** 5145 * xmlFAParsePosCharGroup: 5146 * @ctxt: a regexp parser context 5147 * 5148 * [14] posCharGroup ::= ( charRange | charClassEsc )+ 5149 */ 5150 static void 5151 xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) { 5152 do { 5153 if (CUR == '\\') { 5154 xmlFAParseCharClassEsc(ctxt); 5155 } else { 5156 xmlFAParseCharRange(ctxt); 5157 } 5158 } while ((CUR != ']') && (CUR != '-') && 5159 (CUR != 0) && (ctxt->error == 0)); 5160 } 5161 5162 /** 5163 * xmlFAParseCharGroup: 5164 * @ctxt: a regexp parser context 5165 * 5166 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub 5167 * [15] negCharGroup ::= '^' posCharGroup 5168 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr 5169 * [12] charClassExpr ::= '[' charGroup ']' 5170 */ 5171 static void 5172 xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) { 5173 int neg = ctxt->neg; 5174 5175 if (CUR == '^') { 5176 NEXT; 5177 ctxt->neg = !ctxt->neg; 5178 xmlFAParsePosCharGroup(ctxt); 5179 ctxt->neg = neg; 5180 } 5181 while ((CUR != ']') && (ctxt->error == 0)) { 5182 if ((CUR == '-') && (NXT(1) == '[')) { 5183 NEXT; /* eat the '-' */ 5184 NEXT; /* eat the '[' */ 5185 ctxt->neg = 2; 5186 xmlFAParseCharGroup(ctxt); 5187 ctxt->neg = neg; 5188 if (CUR == ']') { 5189 NEXT; 5190 } else { 5191 ERROR("charClassExpr: ']' expected"); 5192 } 5193 break; 5194 } else { 5195 xmlFAParsePosCharGroup(ctxt); 5196 } 5197 } 5198 } 5199 5200 /** 5201 * xmlFAParseCharClass: 5202 * @ctxt: a regexp parser context 5203 * 5204 * [11] charClass ::= charClassEsc | charClassExpr 5205 * [12] charClassExpr ::= '[' charGroup ']' 5206 */ 5207 static void 5208 xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) { 5209 if (CUR == '[') { 5210 NEXT; 5211 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES); 5212 if (ctxt->atom == NULL) 5213 return; 5214 xmlFAParseCharGroup(ctxt); 5215 if (CUR == ']') { 5216 NEXT; 5217 } else { 5218 ERROR("xmlFAParseCharClass: ']' expected"); 5219 } 5220 } else { 5221 xmlFAParseCharClassEsc(ctxt); 5222 } 5223 } 5224 5225 /** 5226 * xmlFAParseQuantExact: 5227 * @ctxt: a regexp parser context 5228 * 5229 * [8] QuantExact ::= [0-9]+ 5230 * 5231 * Returns 0 if success or -1 in case of error 5232 */ 5233 static int 5234 xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) { 5235 int ret = 0; 5236 int ok = 0; 5237 int overflow = 0; 5238 5239 while ((CUR >= '0') && (CUR <= '9')) { 5240 if (ret > INT_MAX / 10) { 5241 overflow = 1; 5242 } else { 5243 int digit = CUR - '0'; 5244 5245 ret *= 10; 5246 if (ret > INT_MAX - digit) 5247 overflow = 1; 5248 else 5249 ret += digit; 5250 } 5251 ok = 1; 5252 NEXT; 5253 } 5254 if ((ok != 1) || (overflow == 1)) { 5255 return(-1); 5256 } 5257 return(ret); 5258 } 5259 5260 /** 5261 * xmlFAParseQuantifier: 5262 * @ctxt: a regexp parser context 5263 * 5264 * [4] quantifier ::= [?*+] | ( '{' quantity '}' ) 5265 * [5] quantity ::= quantRange | quantMin | QuantExact 5266 * [6] quantRange ::= QuantExact ',' QuantExact 5267 * [7] quantMin ::= QuantExact ',' 5268 * [8] QuantExact ::= [0-9]+ 5269 */ 5270 static int 5271 xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) { 5272 int cur; 5273 5274 cur = CUR; 5275 if ((cur == '?') || (cur == '*') || (cur == '+')) { 5276 if (ctxt->atom != NULL) { 5277 if (cur == '?') 5278 ctxt->atom->quant = XML_REGEXP_QUANT_OPT; 5279 else if (cur == '*') 5280 ctxt->atom->quant = XML_REGEXP_QUANT_MULT; 5281 else if (cur == '+') 5282 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS; 5283 } 5284 NEXT; 5285 return(1); 5286 } 5287 if (cur == '{') { 5288 int min = 0, max = 0; 5289 5290 NEXT; 5291 cur = xmlFAParseQuantExact(ctxt); 5292 if (cur >= 0) 5293 min = cur; 5294 else { 5295 ERROR("Improper quantifier"); 5296 } 5297 if (CUR == ',') { 5298 NEXT; 5299 if (CUR == '}') 5300 max = INT_MAX; 5301 else { 5302 cur = xmlFAParseQuantExact(ctxt); 5303 if (cur >= 0) 5304 max = cur; 5305 else { 5306 ERROR("Improper quantifier"); 5307 } 5308 } 5309 } 5310 if (CUR == '}') { 5311 NEXT; 5312 } else { 5313 ERROR("Unterminated quantifier"); 5314 } 5315 if (max == 0) 5316 max = min; 5317 if (ctxt->atom != NULL) { 5318 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE; 5319 ctxt->atom->min = min; 5320 ctxt->atom->max = max; 5321 } 5322 return(1); 5323 } 5324 return(0); 5325 } 5326 5327 /** 5328 * xmlFAParseAtom: 5329 * @ctxt: a regexp parser context 5330 * 5331 * [9] atom ::= Char | charClass | ( '(' regExp ')' ) 5332 */ 5333 static int 5334 xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) { 5335 int codepoint, len; 5336 5337 codepoint = xmlFAIsChar(ctxt); 5338 if (codepoint > 0) { 5339 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 5340 if (ctxt->atom == NULL) 5341 return(-1); 5342 codepoint = CUR_SCHAR(ctxt->cur, len); 5343 ctxt->atom->codepoint = codepoint; 5344 NEXTL(len); 5345 return(1); 5346 } else if (CUR == '|') { 5347 return(0); 5348 } else if (CUR == 0) { 5349 return(0); 5350 } else if (CUR == ')') { 5351 return(0); 5352 } else if (CUR == '(') { 5353 xmlRegStatePtr start, oldend, start0; 5354 5355 NEXT; 5356 if (ctxt->depth >= 50) { 5357 ERROR("xmlFAParseAtom: maximum nesting depth exceeded"); 5358 return(-1); 5359 } 5360 /* 5361 * this extra Epsilon transition is needed if we count with 0 allowed 5362 * unfortunately this can't be known at that point 5363 */ 5364 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL); 5365 start0 = ctxt->state; 5366 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL); 5367 start = ctxt->state; 5368 oldend = ctxt->end; 5369 ctxt->end = NULL; 5370 ctxt->atom = NULL; 5371 ctxt->depth++; 5372 xmlFAParseRegExp(ctxt, 0); 5373 ctxt->depth--; 5374 if (CUR == ')') { 5375 NEXT; 5376 } else { 5377 ERROR("xmlFAParseAtom: expecting ')'"); 5378 } 5379 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG); 5380 if (ctxt->atom == NULL) 5381 return(-1); 5382 ctxt->atom->start = start; 5383 ctxt->atom->start0 = start0; 5384 ctxt->atom->stop = ctxt->state; 5385 ctxt->end = oldend; 5386 return(1); 5387 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) { 5388 xmlFAParseCharClass(ctxt); 5389 return(1); 5390 } 5391 return(0); 5392 } 5393 5394 /** 5395 * xmlFAParsePiece: 5396 * @ctxt: a regexp parser context 5397 * 5398 * [3] piece ::= atom quantifier? 5399 */ 5400 static int 5401 xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) { 5402 int ret; 5403 5404 ctxt->atom = NULL; 5405 ret = xmlFAParseAtom(ctxt); 5406 if (ret == 0) 5407 return(0); 5408 if (ctxt->atom == NULL) { 5409 ERROR("internal: no atom generated"); 5410 } 5411 xmlFAParseQuantifier(ctxt); 5412 return(1); 5413 } 5414 5415 /** 5416 * xmlFAParseBranch: 5417 * @ctxt: a regexp parser context 5418 * @to: optional target to the end of the branch 5419 * 5420 * @to is used to optimize by removing duplicate path in automata 5421 * in expressions like (a|b)(c|d) 5422 * 5423 * [2] branch ::= piece* 5424 */ 5425 static int 5426 xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) { 5427 xmlRegStatePtr previous; 5428 int ret; 5429 5430 previous = ctxt->state; 5431 ret = xmlFAParsePiece(ctxt); 5432 if (ret == 0) { 5433 /* Empty branch */ 5434 xmlFAGenerateEpsilonTransition(ctxt, previous, to); 5435 } else { 5436 if (xmlFAGenerateTransitions(ctxt, previous, 5437 (CUR=='|' || CUR==')' || CUR==0) ? to : NULL, ctxt->atom) < 0) 5438 return(-1); 5439 previous = ctxt->state; 5440 ctxt->atom = NULL; 5441 } 5442 while ((ret != 0) && (ctxt->error == 0)) { 5443 ret = xmlFAParsePiece(ctxt); 5444 if (ret != 0) { 5445 if (xmlFAGenerateTransitions(ctxt, previous, 5446 (CUR=='|' || CUR==')' || CUR==0) ? to : NULL, 5447 ctxt->atom) < 0) 5448 return(-1); 5449 previous = ctxt->state; 5450 ctxt->atom = NULL; 5451 } 5452 } 5453 return(0); 5454 } 5455 5456 /** 5457 * xmlFAParseRegExp: 5458 * @ctxt: a regexp parser context 5459 * @top: is this the top-level expression ? 5460 * 5461 * [1] regExp ::= branch ( '|' branch )* 5462 */ 5463 static void 5464 xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) { 5465 xmlRegStatePtr start, end; 5466 5467 /* if not top start should have been generated by an epsilon trans */ 5468 start = ctxt->state; 5469 ctxt->end = NULL; 5470 xmlFAParseBranch(ctxt, NULL); 5471 if (top) { 5472 #ifdef DEBUG_REGEXP_GRAPH 5473 printf("State %d is final\n", ctxt->state->no); 5474 #endif 5475 ctxt->state->type = XML_REGEXP_FINAL_STATE; 5476 } 5477 if (CUR != '|') { 5478 ctxt->end = ctxt->state; 5479 return; 5480 } 5481 end = ctxt->state; 5482 while ((CUR == '|') && (ctxt->error == 0)) { 5483 NEXT; 5484 ctxt->state = start; 5485 ctxt->end = NULL; 5486 xmlFAParseBranch(ctxt, end); 5487 } 5488 if (!top) { 5489 ctxt->state = end; 5490 ctxt->end = end; 5491 } 5492 } 5493 5494 /************************************************************************ 5495 * * 5496 * The basic API * 5497 * * 5498 ************************************************************************/ 5499 5500 /** 5501 * xmlRegexpPrint: 5502 * @output: the file for the output debug 5503 * @regexp: the compiled regexp 5504 * 5505 * Print the content of the compiled regular expression 5506 */ 5507 void 5508 xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) { 5509 int i; 5510 5511 if (output == NULL) 5512 return; 5513 fprintf(output, " regexp: "); 5514 if (regexp == NULL) { 5515 fprintf(output, "NULL\n"); 5516 return; 5517 } 5518 fprintf(output, "'%s' ", regexp->string); 5519 fprintf(output, "\n"); 5520 fprintf(output, "%d atoms:\n", regexp->nbAtoms); 5521 for (i = 0;i < regexp->nbAtoms; i++) { 5522 fprintf(output, " %02d ", i); 5523 xmlRegPrintAtom(output, regexp->atoms[i]); 5524 } 5525 fprintf(output, "%d states:", regexp->nbStates); 5526 fprintf(output, "\n"); 5527 for (i = 0;i < regexp->nbStates; i++) { 5528 xmlRegPrintState(output, regexp->states[i]); 5529 } 5530 fprintf(output, "%d counters:\n", regexp->nbCounters); 5531 for (i = 0;i < regexp->nbCounters; i++) { 5532 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min, 5533 regexp->counters[i].max); 5534 } 5535 } 5536 5537 /** 5538 * xmlRegexpCompile: 5539 * @regexp: a regular expression string 5540 * 5541 * Parses a regular expression conforming to XML Schemas Part 2 Datatype 5542 * Appendix F and builds an automata suitable for testing strings against 5543 * that regular expression 5544 * 5545 * Returns the compiled expression or NULL in case of error 5546 */ 5547 xmlRegexpPtr 5548 xmlRegexpCompile(const xmlChar *regexp) { 5549 xmlRegexpPtr ret; 5550 xmlRegParserCtxtPtr ctxt; 5551 5552 ctxt = xmlRegNewParserCtxt(regexp); 5553 if (ctxt == NULL) 5554 return(NULL); 5555 5556 /* initialize the parser */ 5557 ctxt->end = NULL; 5558 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 5559 xmlRegStatePush(ctxt, ctxt->start); 5560 5561 /* parse the expression building an automata */ 5562 xmlFAParseRegExp(ctxt, 1); 5563 if (CUR != 0) { 5564 ERROR("xmlFAParseRegExp: extra characters"); 5565 } 5566 if (ctxt->error != 0) { 5567 xmlRegFreeParserCtxt(ctxt); 5568 return(NULL); 5569 } 5570 ctxt->end = ctxt->state; 5571 ctxt->start->type = XML_REGEXP_START_STATE; 5572 ctxt->end->type = XML_REGEXP_FINAL_STATE; 5573 5574 /* remove the Epsilon except for counted transitions */ 5575 xmlFAEliminateEpsilonTransitions(ctxt); 5576 5577 5578 if (ctxt->error != 0) { 5579 xmlRegFreeParserCtxt(ctxt); 5580 return(NULL); 5581 } 5582 ret = xmlRegEpxFromParse(ctxt); 5583 xmlRegFreeParserCtxt(ctxt); 5584 return(ret); 5585 } 5586 5587 /** 5588 * xmlRegexpExec: 5589 * @comp: the compiled regular expression 5590 * @content: the value to check against the regular expression 5591 * 5592 * Check if the regular expression generates the value 5593 * 5594 * Returns 1 if it matches, 0 if not and a negative value in case of error 5595 */ 5596 int 5597 xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) { 5598 if ((comp == NULL) || (content == NULL)) 5599 return(-1); 5600 return(xmlFARegExec(comp, content)); 5601 } 5602 5603 /** 5604 * xmlRegexpIsDeterminist: 5605 * @comp: the compiled regular expression 5606 * 5607 * Check if the regular expression is determinist 5608 * 5609 * Returns 1 if it yes, 0 if not and a negative value in case of error 5610 */ 5611 int 5612 xmlRegexpIsDeterminist(xmlRegexpPtr comp) { 5613 xmlAutomataPtr am; 5614 int ret; 5615 5616 if (comp == NULL) 5617 return(-1); 5618 if (comp->determinist != -1) 5619 return(comp->determinist); 5620 5621 am = xmlNewAutomata(); 5622 if (am == NULL) 5623 return(-1); 5624 if (am->states != NULL) { 5625 int i; 5626 5627 for (i = 0;i < am->nbStates;i++) 5628 xmlRegFreeState(am->states[i]); 5629 xmlFree(am->states); 5630 } 5631 am->nbAtoms = comp->nbAtoms; 5632 am->atoms = comp->atoms; 5633 am->nbStates = comp->nbStates; 5634 am->states = comp->states; 5635 am->determinist = -1; 5636 am->flags = comp->flags; 5637 ret = xmlFAComputesDeterminism(am); 5638 am->atoms = NULL; 5639 am->states = NULL; 5640 xmlFreeAutomata(am); 5641 comp->determinist = ret; 5642 return(ret); 5643 } 5644 5645 /** 5646 * xmlRegFreeRegexp: 5647 * @regexp: the regexp 5648 * 5649 * Free a regexp 5650 */ 5651 void 5652 xmlRegFreeRegexp(xmlRegexpPtr regexp) { 5653 int i; 5654 if (regexp == NULL) 5655 return; 5656 5657 if (regexp->string != NULL) 5658 xmlFree(regexp->string); 5659 if (regexp->states != NULL) { 5660 for (i = 0;i < regexp->nbStates;i++) 5661 xmlRegFreeState(regexp->states[i]); 5662 xmlFree(regexp->states); 5663 } 5664 if (regexp->atoms != NULL) { 5665 for (i = 0;i < regexp->nbAtoms;i++) 5666 xmlRegFreeAtom(regexp->atoms[i]); 5667 xmlFree(regexp->atoms); 5668 } 5669 if (regexp->counters != NULL) 5670 xmlFree(regexp->counters); 5671 if (regexp->compact != NULL) 5672 xmlFree(regexp->compact); 5673 if (regexp->transdata != NULL) 5674 xmlFree(regexp->transdata); 5675 if (regexp->stringMap != NULL) { 5676 for (i = 0; i < regexp->nbstrings;i++) 5677 xmlFree(regexp->stringMap[i]); 5678 xmlFree(regexp->stringMap); 5679 } 5680 5681 xmlFree(regexp); 5682 } 5683 5684 #ifdef LIBXML_AUTOMATA_ENABLED 5685 /************************************************************************ 5686 * * 5687 * The Automata interface * 5688 * * 5689 ************************************************************************/ 5690 5691 /** 5692 * xmlNewAutomata: 5693 * 5694 * Create a new automata 5695 * 5696 * Returns the new object or NULL in case of failure 5697 */ 5698 xmlAutomataPtr 5699 xmlNewAutomata(void) { 5700 xmlAutomataPtr ctxt; 5701 5702 ctxt = xmlRegNewParserCtxt(NULL); 5703 if (ctxt == NULL) 5704 return(NULL); 5705 5706 /* initialize the parser */ 5707 ctxt->end = NULL; 5708 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 5709 if (ctxt->start == NULL) { 5710 xmlFreeAutomata(ctxt); 5711 return(NULL); 5712 } 5713 ctxt->start->type = XML_REGEXP_START_STATE; 5714 if (xmlRegStatePush(ctxt, ctxt->start) < 0) { 5715 xmlRegFreeState(ctxt->start); 5716 xmlFreeAutomata(ctxt); 5717 return(NULL); 5718 } 5719 ctxt->flags = 0; 5720 5721 return(ctxt); 5722 } 5723 5724 /** 5725 * xmlFreeAutomata: 5726 * @am: an automata 5727 * 5728 * Free an automata 5729 */ 5730 void 5731 xmlFreeAutomata(xmlAutomataPtr am) { 5732 if (am == NULL) 5733 return; 5734 xmlRegFreeParserCtxt(am); 5735 } 5736 5737 /** 5738 * xmlAutomataSetFlags: 5739 * @am: an automata 5740 * @flags: a set of internal flags 5741 * 5742 * Set some flags on the automata 5743 */ 5744 void 5745 xmlAutomataSetFlags(xmlAutomataPtr am, int flags) { 5746 if (am == NULL) 5747 return; 5748 am->flags |= flags; 5749 } 5750 5751 /** 5752 * xmlAutomataGetInitState: 5753 * @am: an automata 5754 * 5755 * Initial state lookup 5756 * 5757 * Returns the initial state of the automata 5758 */ 5759 xmlAutomataStatePtr 5760 xmlAutomataGetInitState(xmlAutomataPtr am) { 5761 if (am == NULL) 5762 return(NULL); 5763 return(am->start); 5764 } 5765 5766 /** 5767 * xmlAutomataSetFinalState: 5768 * @am: an automata 5769 * @state: a state in this automata 5770 * 5771 * Makes that state a final state 5772 * 5773 * Returns 0 or -1 in case of error 5774 */ 5775 int 5776 xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) { 5777 if ((am == NULL) || (state == NULL)) 5778 return(-1); 5779 state->type = XML_REGEXP_FINAL_STATE; 5780 return(0); 5781 } 5782 5783 /** 5784 * xmlAutomataNewTransition: 5785 * @am: an automata 5786 * @from: the starting point of the transition 5787 * @to: the target point of the transition or NULL 5788 * @token: the input string associated to that transition 5789 * @data: data passed to the callback function if the transition is activated 5790 * 5791 * If @to is NULL, this creates first a new target state in the automata 5792 * and then adds a transition from the @from state to the target state 5793 * activated by the value of @token 5794 * 5795 * Returns the target state or NULL in case of error 5796 */ 5797 xmlAutomataStatePtr 5798 xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from, 5799 xmlAutomataStatePtr to, const xmlChar *token, 5800 void *data) { 5801 xmlRegAtomPtr atom; 5802 5803 if ((am == NULL) || (from == NULL) || (token == NULL)) 5804 return(NULL); 5805 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5806 if (atom == NULL) 5807 return(NULL); 5808 atom->data = data; 5809 atom->valuep = xmlStrdup(token); 5810 5811 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5812 xmlRegFreeAtom(atom); 5813 return(NULL); 5814 } 5815 if (to == NULL) 5816 return(am->state); 5817 return(to); 5818 } 5819 5820 /** 5821 * xmlAutomataNewTransition2: 5822 * @am: an automata 5823 * @from: the starting point of the transition 5824 * @to: the target point of the transition or NULL 5825 * @token: the first input string associated to that transition 5826 * @token2: the second input string associated to that transition 5827 * @data: data passed to the callback function if the transition is activated 5828 * 5829 * If @to is NULL, this creates first a new target state in the automata 5830 * and then adds a transition from the @from state to the target state 5831 * activated by the value of @token 5832 * 5833 * Returns the target state or NULL in case of error 5834 */ 5835 xmlAutomataStatePtr 5836 xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from, 5837 xmlAutomataStatePtr to, const xmlChar *token, 5838 const xmlChar *token2, void *data) { 5839 xmlRegAtomPtr atom; 5840 5841 if ((am == NULL) || (from == NULL) || (token == NULL)) 5842 return(NULL); 5843 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5844 if (atom == NULL) 5845 return(NULL); 5846 atom->data = data; 5847 if ((token2 == NULL) || (*token2 == 0)) { 5848 atom->valuep = xmlStrdup(token); 5849 } else { 5850 int lenn, lenp; 5851 xmlChar *str; 5852 5853 lenn = strlen((char *) token2); 5854 lenp = strlen((char *) token); 5855 5856 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5857 if (str == NULL) { 5858 xmlRegFreeAtom(atom); 5859 return(NULL); 5860 } 5861 memcpy(&str[0], token, lenp); 5862 str[lenp] = '|'; 5863 memcpy(&str[lenp + 1], token2, lenn); 5864 str[lenn + lenp + 1] = 0; 5865 5866 atom->valuep = str; 5867 } 5868 5869 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5870 xmlRegFreeAtom(atom); 5871 return(NULL); 5872 } 5873 if (to == NULL) 5874 return(am->state); 5875 return(to); 5876 } 5877 5878 /** 5879 * xmlAutomataNewNegTrans: 5880 * @am: an automata 5881 * @from: the starting point of the transition 5882 * @to: the target point of the transition or NULL 5883 * @token: the first input string associated to that transition 5884 * @token2: the second input string associated to that transition 5885 * @data: data passed to the callback function if the transition is activated 5886 * 5887 * If @to is NULL, this creates first a new target state in the automata 5888 * and then adds a transition from the @from state to the target state 5889 * activated by any value except (@token,@token2) 5890 * Note that if @token2 is not NULL, then (X, NULL) won't match to follow 5891 # the semantic of XSD ##other 5892 * 5893 * Returns the target state or NULL in case of error 5894 */ 5895 xmlAutomataStatePtr 5896 xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5897 xmlAutomataStatePtr to, const xmlChar *token, 5898 const xmlChar *token2, void *data) { 5899 xmlRegAtomPtr atom; 5900 xmlChar err_msg[200]; 5901 5902 if ((am == NULL) || (from == NULL) || (token == NULL)) 5903 return(NULL); 5904 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5905 if (atom == NULL) 5906 return(NULL); 5907 atom->data = data; 5908 atom->neg = 1; 5909 if ((token2 == NULL) || (*token2 == 0)) { 5910 atom->valuep = xmlStrdup(token); 5911 } else { 5912 int lenn, lenp; 5913 xmlChar *str; 5914 5915 lenn = strlen((char *) token2); 5916 lenp = strlen((char *) token); 5917 5918 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5919 if (str == NULL) { 5920 xmlRegFreeAtom(atom); 5921 return(NULL); 5922 } 5923 memcpy(&str[0], token, lenp); 5924 str[lenp] = '|'; 5925 memcpy(&str[lenp + 1], token2, lenn); 5926 str[lenn + lenp + 1] = 0; 5927 5928 atom->valuep = str; 5929 } 5930 snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep); 5931 err_msg[199] = 0; 5932 atom->valuep2 = xmlStrdup(err_msg); 5933 5934 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5935 xmlRegFreeAtom(atom); 5936 return(NULL); 5937 } 5938 am->negs++; 5939 if (to == NULL) 5940 return(am->state); 5941 return(to); 5942 } 5943 5944 /** 5945 * xmlAutomataNewCountTrans2: 5946 * @am: an automata 5947 * @from: the starting point of the transition 5948 * @to: the target point of the transition or NULL 5949 * @token: the input string associated to that transition 5950 * @token2: the second input string associated to that transition 5951 * @min: the minimum successive occurrences of token 5952 * @max: the maximum successive occurrences of token 5953 * @data: data associated to the transition 5954 * 5955 * If @to is NULL, this creates first a new target state in the automata 5956 * and then adds a transition from the @from state to the target state 5957 * activated by a succession of input of value @token and @token2 and 5958 * whose number is between @min and @max 5959 * 5960 * Returns the target state or NULL in case of error 5961 */ 5962 xmlAutomataStatePtr 5963 xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from, 5964 xmlAutomataStatePtr to, const xmlChar *token, 5965 const xmlChar *token2, 5966 int min, int max, void *data) { 5967 xmlRegAtomPtr atom; 5968 int counter; 5969 5970 if ((am == NULL) || (from == NULL) || (token == NULL)) 5971 return(NULL); 5972 if (min < 0) 5973 return(NULL); 5974 if ((max < min) || (max < 1)) 5975 return(NULL); 5976 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5977 if (atom == NULL) 5978 return(NULL); 5979 if ((token2 == NULL) || (*token2 == 0)) { 5980 atom->valuep = xmlStrdup(token); 5981 } else { 5982 int lenn, lenp; 5983 xmlChar *str; 5984 5985 lenn = strlen((char *) token2); 5986 lenp = strlen((char *) token); 5987 5988 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5989 if (str == NULL) { 5990 xmlRegFreeAtom(atom); 5991 return(NULL); 5992 } 5993 memcpy(&str[0], token, lenp); 5994 str[lenp] = '|'; 5995 memcpy(&str[lenp + 1], token2, lenn); 5996 str[lenn + lenp + 1] = 0; 5997 5998 atom->valuep = str; 5999 } 6000 atom->data = data; 6001 if (min == 0) 6002 atom->min = 1; 6003 else 6004 atom->min = min; 6005 atom->max = max; 6006 6007 /* 6008 * associate a counter to the transition. 6009 */ 6010 counter = xmlRegGetCounter(am); 6011 am->counters[counter].min = min; 6012 am->counters[counter].max = max; 6013 6014 /* xmlFAGenerateTransitions(am, from, to, atom); */ 6015 if (to == NULL) { 6016 to = xmlRegNewState(am); 6017 xmlRegStatePush(am, to); 6018 } 6019 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 6020 xmlRegAtomPush(am, atom); 6021 am->state = to; 6022 6023 if (to == NULL) 6024 to = am->state; 6025 if (to == NULL) 6026 return(NULL); 6027 if (min == 0) 6028 xmlFAGenerateEpsilonTransition(am, from, to); 6029 return(to); 6030 } 6031 6032 /** 6033 * xmlAutomataNewCountTrans: 6034 * @am: an automata 6035 * @from: the starting point of the transition 6036 * @to: the target point of the transition or NULL 6037 * @token: the input string associated to that transition 6038 * @min: the minimum successive occurrences of token 6039 * @max: the maximum successive occurrences of token 6040 * @data: data associated to the transition 6041 * 6042 * If @to is NULL, this creates first a new target state in the automata 6043 * and then adds a transition from the @from state to the target state 6044 * activated by a succession of input of value @token and whose number 6045 * is between @min and @max 6046 * 6047 * Returns the target state or NULL in case of error 6048 */ 6049 xmlAutomataStatePtr 6050 xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6051 xmlAutomataStatePtr to, const xmlChar *token, 6052 int min, int max, void *data) { 6053 xmlRegAtomPtr atom; 6054 int counter; 6055 6056 if ((am == NULL) || (from == NULL) || (token == NULL)) 6057 return(NULL); 6058 if (min < 0) 6059 return(NULL); 6060 if ((max < min) || (max < 1)) 6061 return(NULL); 6062 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 6063 if (atom == NULL) 6064 return(NULL); 6065 atom->valuep = xmlStrdup(token); 6066 atom->data = data; 6067 if (min == 0) 6068 atom->min = 1; 6069 else 6070 atom->min = min; 6071 atom->max = max; 6072 6073 /* 6074 * associate a counter to the transition. 6075 */ 6076 counter = xmlRegGetCounter(am); 6077 am->counters[counter].min = min; 6078 am->counters[counter].max = max; 6079 6080 /* xmlFAGenerateTransitions(am, from, to, atom); */ 6081 if (to == NULL) { 6082 to = xmlRegNewState(am); 6083 xmlRegStatePush(am, to); 6084 } 6085 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 6086 xmlRegAtomPush(am, atom); 6087 am->state = to; 6088 6089 if (to == NULL) 6090 to = am->state; 6091 if (to == NULL) 6092 return(NULL); 6093 if (min == 0) 6094 xmlFAGenerateEpsilonTransition(am, from, to); 6095 return(to); 6096 } 6097 6098 /** 6099 * xmlAutomataNewOnceTrans2: 6100 * @am: an automata 6101 * @from: the starting point of the transition 6102 * @to: the target point of the transition or NULL 6103 * @token: the input string associated to that transition 6104 * @token2: the second input string associated to that transition 6105 * @min: the minimum successive occurrences of token 6106 * @max: the maximum successive occurrences of token 6107 * @data: data associated to the transition 6108 * 6109 * If @to is NULL, this creates first a new target state in the automata 6110 * and then adds a transition from the @from state to the target state 6111 * activated by a succession of input of value @token and @token2 and whose 6112 * number is between @min and @max, moreover that transition can only be 6113 * crossed once. 6114 * 6115 * Returns the target state or NULL in case of error 6116 */ 6117 xmlAutomataStatePtr 6118 xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from, 6119 xmlAutomataStatePtr to, const xmlChar *token, 6120 const xmlChar *token2, 6121 int min, int max, void *data) { 6122 xmlRegAtomPtr atom; 6123 int counter; 6124 6125 if ((am == NULL) || (from == NULL) || (token == NULL)) 6126 return(NULL); 6127 if (min < 1) 6128 return(NULL); 6129 if (max < min) 6130 return(NULL); 6131 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 6132 if (atom == NULL) 6133 return(NULL); 6134 if ((token2 == NULL) || (*token2 == 0)) { 6135 atom->valuep = xmlStrdup(token); 6136 } else { 6137 int lenn, lenp; 6138 xmlChar *str; 6139 6140 lenn = strlen((char *) token2); 6141 lenp = strlen((char *) token); 6142 6143 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 6144 if (str == NULL) { 6145 xmlRegFreeAtom(atom); 6146 return(NULL); 6147 } 6148 memcpy(&str[0], token, lenp); 6149 str[lenp] = '|'; 6150 memcpy(&str[lenp + 1], token2, lenn); 6151 str[lenn + lenp + 1] = 0; 6152 6153 atom->valuep = str; 6154 } 6155 atom->data = data; 6156 atom->quant = XML_REGEXP_QUANT_ONCEONLY; 6157 atom->min = min; 6158 atom->max = max; 6159 /* 6160 * associate a counter to the transition. 6161 */ 6162 counter = xmlRegGetCounter(am); 6163 am->counters[counter].min = 1; 6164 am->counters[counter].max = 1; 6165 6166 /* xmlFAGenerateTransitions(am, from, to, atom); */ 6167 if (to == NULL) { 6168 to = xmlRegNewState(am); 6169 xmlRegStatePush(am, to); 6170 } 6171 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 6172 xmlRegAtomPush(am, atom); 6173 am->state = to; 6174 return(to); 6175 } 6176 6177 6178 6179 /** 6180 * xmlAutomataNewOnceTrans: 6181 * @am: an automata 6182 * @from: the starting point of the transition 6183 * @to: the target point of the transition or NULL 6184 * @token: the input string associated to that transition 6185 * @min: the minimum successive occurrences of token 6186 * @max: the maximum successive occurrences of token 6187 * @data: data associated to the transition 6188 * 6189 * If @to is NULL, this creates first a new target state in the automata 6190 * and then adds a transition from the @from state to the target state 6191 * activated by a succession of input of value @token and whose number 6192 * is between @min and @max, moreover that transition can only be crossed 6193 * once. 6194 * 6195 * Returns the target state or NULL in case of error 6196 */ 6197 xmlAutomataStatePtr 6198 xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6199 xmlAutomataStatePtr to, const xmlChar *token, 6200 int min, int max, void *data) { 6201 xmlRegAtomPtr atom; 6202 int counter; 6203 6204 if ((am == NULL) || (from == NULL) || (token == NULL)) 6205 return(NULL); 6206 if (min < 1) 6207 return(NULL); 6208 if (max < min) 6209 return(NULL); 6210 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 6211 if (atom == NULL) 6212 return(NULL); 6213 atom->valuep = xmlStrdup(token); 6214 atom->data = data; 6215 atom->quant = XML_REGEXP_QUANT_ONCEONLY; 6216 atom->min = min; 6217 atom->max = max; 6218 /* 6219 * associate a counter to the transition. 6220 */ 6221 counter = xmlRegGetCounter(am); 6222 am->counters[counter].min = 1; 6223 am->counters[counter].max = 1; 6224 6225 /* xmlFAGenerateTransitions(am, from, to, atom); */ 6226 if (to == NULL) { 6227 to = xmlRegNewState(am); 6228 xmlRegStatePush(am, to); 6229 } 6230 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 6231 xmlRegAtomPush(am, atom); 6232 am->state = to; 6233 return(to); 6234 } 6235 6236 /** 6237 * xmlAutomataNewState: 6238 * @am: an automata 6239 * 6240 * Create a new disconnected state in the automata 6241 * 6242 * Returns the new state or NULL in case of error 6243 */ 6244 xmlAutomataStatePtr 6245 xmlAutomataNewState(xmlAutomataPtr am) { 6246 xmlAutomataStatePtr to; 6247 6248 if (am == NULL) 6249 return(NULL); 6250 to = xmlRegNewState(am); 6251 xmlRegStatePush(am, to); 6252 return(to); 6253 } 6254 6255 /** 6256 * xmlAutomataNewEpsilon: 6257 * @am: an automata 6258 * @from: the starting point of the transition 6259 * @to: the target point of the transition or NULL 6260 * 6261 * If @to is NULL, this creates first a new target state in the automata 6262 * and then adds an epsilon transition from the @from state to the 6263 * target state 6264 * 6265 * Returns the target state or NULL in case of error 6266 */ 6267 xmlAutomataStatePtr 6268 xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from, 6269 xmlAutomataStatePtr to) { 6270 if ((am == NULL) || (from == NULL)) 6271 return(NULL); 6272 xmlFAGenerateEpsilonTransition(am, from, to); 6273 if (to == NULL) 6274 return(am->state); 6275 return(to); 6276 } 6277 6278 /** 6279 * xmlAutomataNewAllTrans: 6280 * @am: an automata 6281 * @from: the starting point of the transition 6282 * @to: the target point of the transition or NULL 6283 * @lax: allow to transition if not all all transitions have been activated 6284 * 6285 * If @to is NULL, this creates first a new target state in the automata 6286 * and then adds a an ALL transition from the @from state to the 6287 * target state. That transition is an epsilon transition allowed only when 6288 * all transitions from the @from node have been activated. 6289 * 6290 * Returns the target state or NULL in case of error 6291 */ 6292 xmlAutomataStatePtr 6293 xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6294 xmlAutomataStatePtr to, int lax) { 6295 if ((am == NULL) || (from == NULL)) 6296 return(NULL); 6297 xmlFAGenerateAllTransition(am, from, to, lax); 6298 if (to == NULL) 6299 return(am->state); 6300 return(to); 6301 } 6302 6303 /** 6304 * xmlAutomataNewCounter: 6305 * @am: an automata 6306 * @min: the minimal value on the counter 6307 * @max: the maximal value on the counter 6308 * 6309 * Create a new counter 6310 * 6311 * Returns the counter number or -1 in case of error 6312 */ 6313 int 6314 xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) { 6315 int ret; 6316 6317 if (am == NULL) 6318 return(-1); 6319 6320 ret = xmlRegGetCounter(am); 6321 if (ret < 0) 6322 return(-1); 6323 am->counters[ret].min = min; 6324 am->counters[ret].max = max; 6325 return(ret); 6326 } 6327 6328 /** 6329 * xmlAutomataNewCountedTrans: 6330 * @am: an automata 6331 * @from: the starting point of the transition 6332 * @to: the target point of the transition or NULL 6333 * @counter: the counter associated to that transition 6334 * 6335 * If @to is NULL, this creates first a new target state in the automata 6336 * and then adds an epsilon transition from the @from state to the target state 6337 * which will increment the counter provided 6338 * 6339 * Returns the target state or NULL in case of error 6340 */ 6341 xmlAutomataStatePtr 6342 xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6343 xmlAutomataStatePtr to, int counter) { 6344 if ((am == NULL) || (from == NULL) || (counter < 0)) 6345 return(NULL); 6346 xmlFAGenerateCountedEpsilonTransition(am, from, to, counter); 6347 if (to == NULL) 6348 return(am->state); 6349 return(to); 6350 } 6351 6352 /** 6353 * xmlAutomataNewCounterTrans: 6354 * @am: an automata 6355 * @from: the starting point of the transition 6356 * @to: the target point of the transition or NULL 6357 * @counter: the counter associated to that transition 6358 * 6359 * If @to is NULL, this creates first a new target state in the automata 6360 * and then adds an epsilon transition from the @from state to the target state 6361 * which will be allowed only if the counter is within the right range. 6362 * 6363 * Returns the target state or NULL in case of error 6364 */ 6365 xmlAutomataStatePtr 6366 xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6367 xmlAutomataStatePtr to, int counter) { 6368 if ((am == NULL) || (from == NULL) || (counter < 0)) 6369 return(NULL); 6370 xmlFAGenerateCountedTransition(am, from, to, counter); 6371 if (to == NULL) 6372 return(am->state); 6373 return(to); 6374 } 6375 6376 /** 6377 * xmlAutomataCompile: 6378 * @am: an automata 6379 * 6380 * Compile the automata into a Reg Exp ready for being executed. 6381 * The automata should be free after this point. 6382 * 6383 * Returns the compiled regexp or NULL in case of error 6384 */ 6385 xmlRegexpPtr 6386 xmlAutomataCompile(xmlAutomataPtr am) { 6387 xmlRegexpPtr ret; 6388 6389 if ((am == NULL) || (am->error != 0)) return(NULL); 6390 xmlFAEliminateEpsilonTransitions(am); 6391 /* xmlFAComputesDeterminism(am); */ 6392 ret = xmlRegEpxFromParse(am); 6393 6394 return(ret); 6395 } 6396 6397 /** 6398 * xmlAutomataIsDeterminist: 6399 * @am: an automata 6400 * 6401 * Checks if an automata is determinist. 6402 * 6403 * Returns 1 if true, 0 if not, and -1 in case of error 6404 */ 6405 int 6406 xmlAutomataIsDeterminist(xmlAutomataPtr am) { 6407 int ret; 6408 6409 if (am == NULL) 6410 return(-1); 6411 6412 ret = xmlFAComputesDeterminism(am); 6413 return(ret); 6414 } 6415 #endif /* LIBXML_AUTOMATA_ENABLED */ 6416 6417 #ifdef LIBXML_EXPR_ENABLED 6418 /************************************************************************ 6419 * * 6420 * Formal Expression handling code * 6421 * * 6422 ************************************************************************/ 6423 /************************************************************************ 6424 * * 6425 * Expression handling context * 6426 * * 6427 ************************************************************************/ 6428 6429 struct _xmlExpCtxt { 6430 xmlDictPtr dict; 6431 xmlExpNodePtr *table; 6432 int size; 6433 int nbElems; 6434 int nb_nodes; 6435 int maxNodes; 6436 const char *expr; 6437 const char *cur; 6438 int nb_cons; 6439 int tabSize; 6440 }; 6441 6442 /** 6443 * xmlExpNewCtxt: 6444 * @maxNodes: the maximum number of nodes 6445 * @dict: optional dictionary to use internally 6446 * 6447 * Creates a new context for manipulating expressions 6448 * 6449 * Returns the context or NULL in case of error 6450 */ 6451 xmlExpCtxtPtr 6452 xmlExpNewCtxt(int maxNodes, xmlDictPtr dict) { 6453 xmlExpCtxtPtr ret; 6454 int size = 256; 6455 6456 if (maxNodes <= 4096) 6457 maxNodes = 4096; 6458 6459 ret = (xmlExpCtxtPtr) xmlMalloc(sizeof(xmlExpCtxt)); 6460 if (ret == NULL) 6461 return(NULL); 6462 memset(ret, 0, sizeof(xmlExpCtxt)); 6463 ret->size = size; 6464 ret->nbElems = 0; 6465 ret->maxNodes = maxNodes; 6466 ret->table = xmlMalloc(size * sizeof(xmlExpNodePtr)); 6467 if (ret->table == NULL) { 6468 xmlFree(ret); 6469 return(NULL); 6470 } 6471 memset(ret->table, 0, size * sizeof(xmlExpNodePtr)); 6472 if (dict == NULL) { 6473 ret->dict = xmlDictCreate(); 6474 if (ret->dict == NULL) { 6475 xmlFree(ret->table); 6476 xmlFree(ret); 6477 return(NULL); 6478 } 6479 } else { 6480 ret->dict = dict; 6481 xmlDictReference(ret->dict); 6482 } 6483 return(ret); 6484 } 6485 6486 /** 6487 * xmlExpFreeCtxt: 6488 * @ctxt: an expression context 6489 * 6490 * Free an expression context 6491 */ 6492 void 6493 xmlExpFreeCtxt(xmlExpCtxtPtr ctxt) { 6494 if (ctxt == NULL) 6495 return; 6496 xmlDictFree(ctxt->dict); 6497 if (ctxt->table != NULL) 6498 xmlFree(ctxt->table); 6499 xmlFree(ctxt); 6500 } 6501 6502 /************************************************************************ 6503 * * 6504 * Structure associated to an expression node * 6505 * * 6506 ************************************************************************/ 6507 #define MAX_NODES 10000 6508 6509 /* #define DEBUG_DERIV */ 6510 6511 /* 6512 * TODO: 6513 * - Wildcards 6514 * - public API for creation 6515 * 6516 * Started 6517 * - regression testing 6518 * 6519 * Done 6520 * - split into module and test tool 6521 * - memleaks 6522 */ 6523 6524 typedef enum { 6525 XML_EXP_NILABLE = (1 << 0) 6526 } xmlExpNodeInfo; 6527 6528 #define IS_NILLABLE(node) ((node)->info & XML_EXP_NILABLE) 6529 6530 struct _xmlExpNode { 6531 unsigned char type;/* xmlExpNodeType */ 6532 unsigned char info;/* OR of xmlExpNodeInfo */ 6533 unsigned short key; /* the hash key */ 6534 unsigned int ref; /* The number of references */ 6535 int c_max; /* the maximum length it can consume */ 6536 xmlExpNodePtr exp_left; 6537 xmlExpNodePtr next;/* the next node in the hash table or free list */ 6538 union { 6539 struct { 6540 int f_min; 6541 int f_max; 6542 } count; 6543 struct { 6544 xmlExpNodePtr f_right; 6545 } children; 6546 const xmlChar *f_str; 6547 } field; 6548 }; 6549 6550 #define exp_min field.count.f_min 6551 #define exp_max field.count.f_max 6552 /* #define exp_left field.children.f_left */ 6553 #define exp_right field.children.f_right 6554 #define exp_str field.f_str 6555 6556 static xmlExpNodePtr xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type); 6557 static xmlExpNode forbiddenExpNode = { 6558 XML_EXP_FORBID, 0, 0, 0, 0, NULL, NULL, {{ 0, 0}} 6559 }; 6560 xmlExpNodePtr forbiddenExp = &forbiddenExpNode; 6561 static xmlExpNode emptyExpNode = { 6562 XML_EXP_EMPTY, 1, 0, 0, 0, NULL, NULL, {{ 0, 0}} 6563 }; 6564 xmlExpNodePtr emptyExp = &emptyExpNode; 6565 6566 /************************************************************************ 6567 * * 6568 * The custom hash table for unicity and canonicalization * 6569 * of sub-expressions pointers * 6570 * * 6571 ************************************************************************/ 6572 /* 6573 * xmlExpHashNameComputeKey: 6574 * Calculate the hash key for a token 6575 */ 6576 static unsigned short 6577 xmlExpHashNameComputeKey(const xmlChar *name) { 6578 unsigned short value = 0L; 6579 char ch; 6580 6581 if (name != NULL) { 6582 value += 30 * (*name); 6583 while ((ch = *name++) != 0) { 6584 value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch); 6585 } 6586 } 6587 return (value); 6588 } 6589 6590 /* 6591 * xmlExpHashComputeKey: 6592 * Calculate the hash key for a compound expression 6593 */ 6594 static unsigned short 6595 xmlExpHashComputeKey(xmlExpNodeType type, xmlExpNodePtr left, 6596 xmlExpNodePtr right) { 6597 unsigned long value; 6598 unsigned short ret; 6599 6600 switch (type) { 6601 case XML_EXP_SEQ: 6602 value = left->key; 6603 value += right->key; 6604 value *= 3; 6605 ret = (unsigned short) value; 6606 break; 6607 case XML_EXP_OR: 6608 value = left->key; 6609 value += right->key; 6610 value *= 7; 6611 ret = (unsigned short) value; 6612 break; 6613 case XML_EXP_COUNT: 6614 value = left->key; 6615 value += right->key; 6616 ret = (unsigned short) value; 6617 break; 6618 default: 6619 ret = 0; 6620 } 6621 return(ret); 6622 } 6623 6624 6625 static xmlExpNodePtr 6626 xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type) { 6627 xmlExpNodePtr ret; 6628 6629 if (ctxt->nb_nodes >= MAX_NODES) 6630 return(NULL); 6631 ret = (xmlExpNodePtr) xmlMalloc(sizeof(xmlExpNode)); 6632 if (ret == NULL) 6633 return(NULL); 6634 memset(ret, 0, sizeof(xmlExpNode)); 6635 ret->type = type; 6636 ret->next = NULL; 6637 ctxt->nb_nodes++; 6638 ctxt->nb_cons++; 6639 return(ret); 6640 } 6641 6642 /** 6643 * xmlExpHashGetEntry: 6644 * @table: the hash table 6645 * 6646 * Get the unique entry from the hash table. The entry is created if 6647 * needed. @left and @right are consumed, i.e. their ref count will 6648 * be decremented by the operation. 6649 * 6650 * Returns the pointer or NULL in case of error 6651 */ 6652 static xmlExpNodePtr 6653 xmlExpHashGetEntry(xmlExpCtxtPtr ctxt, xmlExpNodeType type, 6654 xmlExpNodePtr left, xmlExpNodePtr right, 6655 const xmlChar *name, int min, int max) { 6656 unsigned short kbase, key; 6657 xmlExpNodePtr entry; 6658 xmlExpNodePtr insert; 6659 6660 if (ctxt == NULL) 6661 return(NULL); 6662 6663 /* 6664 * Check for duplicate and insertion location. 6665 */ 6666 if (type == XML_EXP_ATOM) { 6667 kbase = xmlExpHashNameComputeKey(name); 6668 } else if (type == XML_EXP_COUNT) { 6669 /* COUNT reduction rule 1 */ 6670 /* a{1} -> a */ 6671 if (min == max) { 6672 if (min == 1) { 6673 return(left); 6674 } 6675 if (min == 0) { 6676 xmlExpFree(ctxt, left); 6677 return(emptyExp); 6678 } 6679 } 6680 if (min < 0) { 6681 xmlExpFree(ctxt, left); 6682 return(forbiddenExp); 6683 } 6684 if (max == -1) 6685 kbase = min + 79; 6686 else 6687 kbase = max - min; 6688 kbase += left->key; 6689 } else if (type == XML_EXP_OR) { 6690 /* Forbid reduction rules */ 6691 if (left->type == XML_EXP_FORBID) { 6692 xmlExpFree(ctxt, left); 6693 return(right); 6694 } 6695 if (right->type == XML_EXP_FORBID) { 6696 xmlExpFree(ctxt, right); 6697 return(left); 6698 } 6699 6700 /* OR reduction rule 1 */ 6701 /* a | a reduced to a */ 6702 if (left == right) { 6703 xmlExpFree(ctxt, right); 6704 return(left); 6705 } 6706 /* OR canonicalization rule 1 */ 6707 /* linearize (a | b) | c into a | (b | c) */ 6708 if ((left->type == XML_EXP_OR) && (right->type != XML_EXP_OR)) { 6709 xmlExpNodePtr tmp = left; 6710 left = right; 6711 right = tmp; 6712 } 6713 /* OR reduction rule 2 */ 6714 /* a | (a | b) and b | (a | b) are reduced to a | b */ 6715 if (right->type == XML_EXP_OR) { 6716 if ((left == right->exp_left) || 6717 (left == right->exp_right)) { 6718 xmlExpFree(ctxt, left); 6719 return(right); 6720 } 6721 } 6722 /* OR canonicalization rule 2 */ 6723 /* linearize (a | b) | c into a | (b | c) */ 6724 if (left->type == XML_EXP_OR) { 6725 xmlExpNodePtr tmp; 6726 6727 /* OR canonicalization rule 2 */ 6728 if ((left->exp_right->type != XML_EXP_OR) && 6729 (left->exp_right->key < left->exp_left->key)) { 6730 tmp = left->exp_right; 6731 left->exp_right = left->exp_left; 6732 left->exp_left = tmp; 6733 } 6734 left->exp_right->ref++; 6735 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_right, right, 6736 NULL, 0, 0); 6737 left->exp_left->ref++; 6738 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_left, tmp, 6739 NULL, 0, 0); 6740 6741 xmlExpFree(ctxt, left); 6742 return(tmp); 6743 } 6744 if (right->type == XML_EXP_OR) { 6745 /* Ordering in the tree */ 6746 /* C | (A | B) -> A | (B | C) */ 6747 if (left->key > right->exp_right->key) { 6748 xmlExpNodePtr tmp; 6749 right->exp_right->ref++; 6750 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_right, 6751 left, NULL, 0, 0); 6752 right->exp_left->ref++; 6753 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left, 6754 tmp, NULL, 0, 0); 6755 xmlExpFree(ctxt, right); 6756 return(tmp); 6757 } 6758 /* Ordering in the tree */ 6759 /* B | (A | C) -> A | (B | C) */ 6760 if (left->key > right->exp_left->key) { 6761 xmlExpNodePtr tmp; 6762 right->exp_right->ref++; 6763 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, 6764 right->exp_right, NULL, 0, 0); 6765 right->exp_left->ref++; 6766 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left, 6767 tmp, NULL, 0, 0); 6768 xmlExpFree(ctxt, right); 6769 return(tmp); 6770 } 6771 } 6772 /* we know both types are != XML_EXP_OR here */ 6773 else if (left->key > right->key) { 6774 xmlExpNodePtr tmp = left; 6775 left = right; 6776 right = tmp; 6777 } 6778 kbase = xmlExpHashComputeKey(type, left, right); 6779 } else if (type == XML_EXP_SEQ) { 6780 /* Forbid reduction rules */ 6781 if (left->type == XML_EXP_FORBID) { 6782 xmlExpFree(ctxt, right); 6783 return(left); 6784 } 6785 if (right->type == XML_EXP_FORBID) { 6786 xmlExpFree(ctxt, left); 6787 return(right); 6788 } 6789 /* Empty reduction rules */ 6790 if (right->type == XML_EXP_EMPTY) { 6791 return(left); 6792 } 6793 if (left->type == XML_EXP_EMPTY) { 6794 return(right); 6795 } 6796 kbase = xmlExpHashComputeKey(type, left, right); 6797 } else 6798 return(NULL); 6799 6800 key = kbase % ctxt->size; 6801 if (ctxt->table[key] != NULL) { 6802 for (insert = ctxt->table[key]; insert != NULL; 6803 insert = insert->next) { 6804 if ((insert->key == kbase) && 6805 (insert->type == type)) { 6806 if (type == XML_EXP_ATOM) { 6807 if (name == insert->exp_str) { 6808 insert->ref++; 6809 return(insert); 6810 } 6811 } else if (type == XML_EXP_COUNT) { 6812 if ((insert->exp_min == min) && (insert->exp_max == max) && 6813 (insert->exp_left == left)) { 6814 insert->ref++; 6815 left->ref--; 6816 return(insert); 6817 } 6818 } else if ((insert->exp_left == left) && 6819 (insert->exp_right == right)) { 6820 insert->ref++; 6821 left->ref--; 6822 right->ref--; 6823 return(insert); 6824 } 6825 } 6826 } 6827 } 6828 6829 entry = xmlExpNewNode(ctxt, type); 6830 if (entry == NULL) 6831 return(NULL); 6832 entry->key = kbase; 6833 if (type == XML_EXP_ATOM) { 6834 entry->exp_str = name; 6835 entry->c_max = 1; 6836 } else if (type == XML_EXP_COUNT) { 6837 entry->exp_min = min; 6838 entry->exp_max = max; 6839 entry->exp_left = left; 6840 if ((min == 0) || (IS_NILLABLE(left))) 6841 entry->info |= XML_EXP_NILABLE; 6842 if (max < 0) 6843 entry->c_max = -1; 6844 else 6845 entry->c_max = max * entry->exp_left->c_max; 6846 } else { 6847 entry->exp_left = left; 6848 entry->exp_right = right; 6849 if (type == XML_EXP_OR) { 6850 if ((IS_NILLABLE(left)) || (IS_NILLABLE(right))) 6851 entry->info |= XML_EXP_NILABLE; 6852 if ((entry->exp_left->c_max == -1) || 6853 (entry->exp_right->c_max == -1)) 6854 entry->c_max = -1; 6855 else if (entry->exp_left->c_max > entry->exp_right->c_max) 6856 entry->c_max = entry->exp_left->c_max; 6857 else 6858 entry->c_max = entry->exp_right->c_max; 6859 } else { 6860 if ((IS_NILLABLE(left)) && (IS_NILLABLE(right))) 6861 entry->info |= XML_EXP_NILABLE; 6862 if ((entry->exp_left->c_max == -1) || 6863 (entry->exp_right->c_max == -1)) 6864 entry->c_max = -1; 6865 else 6866 entry->c_max = entry->exp_left->c_max + entry->exp_right->c_max; 6867 } 6868 } 6869 entry->ref = 1; 6870 if (ctxt->table[key] != NULL) 6871 entry->next = ctxt->table[key]; 6872 6873 ctxt->table[key] = entry; 6874 ctxt->nbElems++; 6875 6876 return(entry); 6877 } 6878 6879 /** 6880 * xmlExpFree: 6881 * @ctxt: the expression context 6882 * @exp: the expression 6883 * 6884 * Dereference the expression 6885 */ 6886 void 6887 xmlExpFree(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp) { 6888 if ((exp == NULL) || (exp == forbiddenExp) || (exp == emptyExp)) 6889 return; 6890 exp->ref--; 6891 if (exp->ref == 0) { 6892 unsigned short key; 6893 6894 /* Unlink it first from the hash table */ 6895 key = exp->key % ctxt->size; 6896 if (ctxt->table[key] == exp) { 6897 ctxt->table[key] = exp->next; 6898 } else { 6899 xmlExpNodePtr tmp; 6900 6901 tmp = ctxt->table[key]; 6902 while (tmp != NULL) { 6903 if (tmp->next == exp) { 6904 tmp->next = exp->next; 6905 break; 6906 } 6907 tmp = tmp->next; 6908 } 6909 } 6910 6911 if ((exp->type == XML_EXP_SEQ) || (exp->type == XML_EXP_OR)) { 6912 xmlExpFree(ctxt, exp->exp_left); 6913 xmlExpFree(ctxt, exp->exp_right); 6914 } else if (exp->type == XML_EXP_COUNT) { 6915 xmlExpFree(ctxt, exp->exp_left); 6916 } 6917 xmlFree(exp); 6918 ctxt->nb_nodes--; 6919 } 6920 } 6921 6922 /** 6923 * xmlExpRef: 6924 * @exp: the expression 6925 * 6926 * Increase the reference count of the expression 6927 */ 6928 void 6929 xmlExpRef(xmlExpNodePtr exp) { 6930 if (exp != NULL) 6931 exp->ref++; 6932 } 6933 6934 /** 6935 * xmlExpNewAtom: 6936 * @ctxt: the expression context 6937 * @name: the atom name 6938 * @len: the atom name length in byte (or -1); 6939 * 6940 * Get the atom associated to this name from that context 6941 * 6942 * Returns the node or NULL in case of error 6943 */ 6944 xmlExpNodePtr 6945 xmlExpNewAtom(xmlExpCtxtPtr ctxt, const xmlChar *name, int len) { 6946 if ((ctxt == NULL) || (name == NULL)) 6947 return(NULL); 6948 name = xmlDictLookup(ctxt->dict, name, len); 6949 if (name == NULL) 6950 return(NULL); 6951 return(xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, name, 0, 0)); 6952 } 6953 6954 /** 6955 * xmlExpNewOr: 6956 * @ctxt: the expression context 6957 * @left: left expression 6958 * @right: right expression 6959 * 6960 * Get the atom associated to the choice @left | @right 6961 * Note that @left and @right are consumed in the operation, to keep 6962 * an handle on them use xmlExpRef() and use xmlExpFree() to release them, 6963 * this is true even in case of failure (unless ctxt == NULL). 6964 * 6965 * Returns the node or NULL in case of error 6966 */ 6967 xmlExpNodePtr 6968 xmlExpNewOr(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) { 6969 if (ctxt == NULL) 6970 return(NULL); 6971 if ((left == NULL) || (right == NULL)) { 6972 xmlExpFree(ctxt, left); 6973 xmlExpFree(ctxt, right); 6974 return(NULL); 6975 } 6976 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, right, NULL, 0, 0)); 6977 } 6978 6979 /** 6980 * xmlExpNewSeq: 6981 * @ctxt: the expression context 6982 * @left: left expression 6983 * @right: right expression 6984 * 6985 * Get the atom associated to the sequence @left , @right 6986 * Note that @left and @right are consumed in the operation, to keep 6987 * an handle on them use xmlExpRef() and use xmlExpFree() to release them, 6988 * this is true even in case of failure (unless ctxt == NULL). 6989 * 6990 * Returns the node or NULL in case of error 6991 */ 6992 xmlExpNodePtr 6993 xmlExpNewSeq(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) { 6994 if (ctxt == NULL) 6995 return(NULL); 6996 if ((left == NULL) || (right == NULL)) { 6997 xmlExpFree(ctxt, left); 6998 xmlExpFree(ctxt, right); 6999 return(NULL); 7000 } 7001 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, left, right, NULL, 0, 0)); 7002 } 7003 7004 /** 7005 * xmlExpNewRange: 7006 * @ctxt: the expression context 7007 * @subset: the expression to be repeated 7008 * @min: the lower bound for the repetition 7009 * @max: the upper bound for the repetition, -1 means infinite 7010 * 7011 * Get the atom associated to the range (@subset){@min, @max} 7012 * Note that @subset is consumed in the operation, to keep 7013 * an handle on it use xmlExpRef() and use xmlExpFree() to release it, 7014 * this is true even in case of failure (unless ctxt == NULL). 7015 * 7016 * Returns the node or NULL in case of error 7017 */ 7018 xmlExpNodePtr 7019 xmlExpNewRange(xmlExpCtxtPtr ctxt, xmlExpNodePtr subset, int min, int max) { 7020 if (ctxt == NULL) 7021 return(NULL); 7022 if ((subset == NULL) || (min < 0) || (max < -1) || 7023 ((max >= 0) && (min > max))) { 7024 xmlExpFree(ctxt, subset); 7025 return(NULL); 7026 } 7027 return(xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, subset, 7028 NULL, NULL, min, max)); 7029 } 7030 7031 /************************************************************************ 7032 * * 7033 * Public API for operations on expressions * 7034 * * 7035 ************************************************************************/ 7036 7037 static int 7038 xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7039 const xmlChar**list, int len, int nb) { 7040 int tmp, tmp2; 7041 tail: 7042 switch (exp->type) { 7043 case XML_EXP_EMPTY: 7044 return(0); 7045 case XML_EXP_ATOM: 7046 for (tmp = 0;tmp < nb;tmp++) 7047 if (list[tmp] == exp->exp_str) 7048 return(0); 7049 if (nb >= len) 7050 return(-2); 7051 list[nb] = exp->exp_str; 7052 return(1); 7053 case XML_EXP_COUNT: 7054 exp = exp->exp_left; 7055 goto tail; 7056 case XML_EXP_SEQ: 7057 case XML_EXP_OR: 7058 tmp = xmlExpGetLanguageInt(ctxt, exp->exp_left, list, len, nb); 7059 if (tmp < 0) 7060 return(tmp); 7061 tmp2 = xmlExpGetLanguageInt(ctxt, exp->exp_right, list, len, 7062 nb + tmp); 7063 if (tmp2 < 0) 7064 return(tmp2); 7065 return(tmp + tmp2); 7066 } 7067 return(-1); 7068 } 7069 7070 /** 7071 * xmlExpGetLanguage: 7072 * @ctxt: the expression context 7073 * @exp: the expression 7074 * @langList: where to store the tokens 7075 * @len: the allocated length of @list 7076 * 7077 * Find all the strings used in @exp and store them in @list 7078 * 7079 * Returns the number of unique strings found, -1 in case of errors and 7080 * -2 if there is more than @len strings 7081 */ 7082 int 7083 xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7084 const xmlChar**langList, int len) { 7085 if ((ctxt == NULL) || (exp == NULL) || (langList == NULL) || (len <= 0)) 7086 return(-1); 7087 return(xmlExpGetLanguageInt(ctxt, exp, langList, len, 0)); 7088 } 7089 7090 static int 7091 xmlExpGetStartInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7092 const xmlChar**list, int len, int nb) { 7093 int tmp, tmp2; 7094 tail: 7095 switch (exp->type) { 7096 case XML_EXP_FORBID: 7097 return(0); 7098 case XML_EXP_EMPTY: 7099 return(0); 7100 case XML_EXP_ATOM: 7101 for (tmp = 0;tmp < nb;tmp++) 7102 if (list[tmp] == exp->exp_str) 7103 return(0); 7104 if (nb >= len) 7105 return(-2); 7106 list[nb] = exp->exp_str; 7107 return(1); 7108 case XML_EXP_COUNT: 7109 exp = exp->exp_left; 7110 goto tail; 7111 case XML_EXP_SEQ: 7112 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb); 7113 if (tmp < 0) 7114 return(tmp); 7115 if (IS_NILLABLE(exp->exp_left)) { 7116 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len, 7117 nb + tmp); 7118 if (tmp2 < 0) 7119 return(tmp2); 7120 tmp += tmp2; 7121 } 7122 return(tmp); 7123 case XML_EXP_OR: 7124 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb); 7125 if (tmp < 0) 7126 return(tmp); 7127 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len, 7128 nb + tmp); 7129 if (tmp2 < 0) 7130 return(tmp2); 7131 return(tmp + tmp2); 7132 } 7133 return(-1); 7134 } 7135 7136 /** 7137 * xmlExpGetStart: 7138 * @ctxt: the expression context 7139 * @exp: the expression 7140 * @tokList: where to store the tokens 7141 * @len: the allocated length of @list 7142 * 7143 * Find all the strings that appears at the start of the languages 7144 * accepted by @exp and store them in @list. E.g. for (a, b) | c 7145 * it will return the list [a, c] 7146 * 7147 * Returns the number of unique strings found, -1 in case of errors and 7148 * -2 if there is more than @len strings 7149 */ 7150 int 7151 xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7152 const xmlChar**tokList, int len) { 7153 if ((ctxt == NULL) || (exp == NULL) || (tokList == NULL) || (len <= 0)) 7154 return(-1); 7155 return(xmlExpGetStartInt(ctxt, exp, tokList, len, 0)); 7156 } 7157 7158 /** 7159 * xmlExpIsNillable: 7160 * @exp: the expression 7161 * 7162 * Finds if the expression is nillable, i.e. if it accepts the empty sequence 7163 * 7164 * Returns 1 if nillable, 0 if not and -1 in case of error 7165 */ 7166 int 7167 xmlExpIsNillable(xmlExpNodePtr exp) { 7168 if (exp == NULL) 7169 return(-1); 7170 return(IS_NILLABLE(exp) != 0); 7171 } 7172 7173 static xmlExpNodePtr 7174 xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar *str) 7175 { 7176 xmlExpNodePtr ret; 7177 7178 switch (exp->type) { 7179 case XML_EXP_EMPTY: 7180 return(forbiddenExp); 7181 case XML_EXP_FORBID: 7182 return(forbiddenExp); 7183 case XML_EXP_ATOM: 7184 if (exp->exp_str == str) { 7185 #ifdef DEBUG_DERIV 7186 printf("deriv atom: equal => Empty\n"); 7187 #endif 7188 ret = emptyExp; 7189 } else { 7190 #ifdef DEBUG_DERIV 7191 printf("deriv atom: mismatch => forbid\n"); 7192 #endif 7193 /* TODO wildcards here */ 7194 ret = forbiddenExp; 7195 } 7196 return(ret); 7197 case XML_EXP_OR: { 7198 xmlExpNodePtr tmp; 7199 7200 #ifdef DEBUG_DERIV 7201 printf("deriv or: => or(derivs)\n"); 7202 #endif 7203 tmp = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 7204 if (tmp == NULL) { 7205 return(NULL); 7206 } 7207 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str); 7208 if (ret == NULL) { 7209 xmlExpFree(ctxt, tmp); 7210 return(NULL); 7211 } 7212 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, 7213 NULL, 0, 0); 7214 return(ret); 7215 } 7216 case XML_EXP_SEQ: 7217 #ifdef DEBUG_DERIV 7218 printf("deriv seq: starting with left\n"); 7219 #endif 7220 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 7221 if (ret == NULL) { 7222 return(NULL); 7223 } else if (ret == forbiddenExp) { 7224 if (IS_NILLABLE(exp->exp_left)) { 7225 #ifdef DEBUG_DERIV 7226 printf("deriv seq: left failed but nillable\n"); 7227 #endif 7228 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str); 7229 } 7230 } else { 7231 #ifdef DEBUG_DERIV 7232 printf("deriv seq: left match => sequence\n"); 7233 #endif 7234 exp->exp_right->ref++; 7235 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, exp->exp_right, 7236 NULL, 0, 0); 7237 } 7238 return(ret); 7239 case XML_EXP_COUNT: { 7240 int min, max; 7241 xmlExpNodePtr tmp; 7242 7243 if (exp->exp_max == 0) 7244 return(forbiddenExp); 7245 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 7246 if (ret == NULL) 7247 return(NULL); 7248 if (ret == forbiddenExp) { 7249 #ifdef DEBUG_DERIV 7250 printf("deriv count: pattern mismatch => forbid\n"); 7251 #endif 7252 return(ret); 7253 } 7254 if (exp->exp_max == 1) 7255 return(ret); 7256 if (exp->exp_max < 0) /* unbounded */ 7257 max = -1; 7258 else 7259 max = exp->exp_max - 1; 7260 if (exp->exp_min > 0) 7261 min = exp->exp_min - 1; 7262 else 7263 min = 0; 7264 exp->exp_left->ref++; 7265 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, NULL, 7266 NULL, min, max); 7267 if (ret == emptyExp) { 7268 #ifdef DEBUG_DERIV 7269 printf("deriv count: match to empty => new count\n"); 7270 #endif 7271 return(tmp); 7272 } 7273 #ifdef DEBUG_DERIV 7274 printf("deriv count: match => sequence with new count\n"); 7275 #endif 7276 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, tmp, 7277 NULL, 0, 0)); 7278 } 7279 } 7280 return(NULL); 7281 } 7282 7283 /** 7284 * xmlExpStringDerive: 7285 * @ctxt: the expression context 7286 * @exp: the expression 7287 * @str: the string 7288 * @len: the string len in bytes if available 7289 * 7290 * Do one step of Brzozowski derivation of the expression @exp with 7291 * respect to the input string 7292 * 7293 * Returns the resulting expression or NULL in case of internal error 7294 */ 7295 xmlExpNodePtr 7296 xmlExpStringDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7297 const xmlChar *str, int len) { 7298 const xmlChar *input; 7299 7300 if ((exp == NULL) || (ctxt == NULL) || (str == NULL)) { 7301 return(NULL); 7302 } 7303 /* 7304 * check the string is in the dictionary, if yes use an interned 7305 * copy, otherwise we know it's not an acceptable input 7306 */ 7307 input = xmlDictExists(ctxt->dict, str, len); 7308 if (input == NULL) { 7309 return(forbiddenExp); 7310 } 7311 return(xmlExpStringDeriveInt(ctxt, exp, input)); 7312 } 7313 7314 static int 7315 xmlExpCheckCard(xmlExpNodePtr exp, xmlExpNodePtr sub) { 7316 int ret = 1; 7317 7318 if (sub->c_max == -1) { 7319 if (exp->c_max != -1) 7320 ret = 0; 7321 } else if ((exp->c_max >= 0) && (exp->c_max < sub->c_max)) { 7322 ret = 0; 7323 } 7324 #if 0 7325 if ((IS_NILLABLE(sub)) && (!IS_NILLABLE(exp))) 7326 ret = 0; 7327 #endif 7328 return(ret); 7329 } 7330 7331 static xmlExpNodePtr xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7332 xmlExpNodePtr sub); 7333 /** 7334 * xmlExpDivide: 7335 * @ctxt: the expressions context 7336 * @exp: the englobing expression 7337 * @sub: the subexpression 7338 * @mult: the multiple expression 7339 * @remain: the remain from the derivation of the multiple 7340 * 7341 * Check if exp is a multiple of sub, i.e. if there is a finite number n 7342 * so that sub{n} subsume exp 7343 * 7344 * Returns the multiple value if successful, 0 if it is not a multiple 7345 * and -1 in case of internal error. 7346 */ 7347 7348 static int 7349 xmlExpDivide(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub, 7350 xmlExpNodePtr *mult, xmlExpNodePtr *remain) { 7351 int i; 7352 xmlExpNodePtr tmp, tmp2; 7353 7354 if (mult != NULL) *mult = NULL; 7355 if (remain != NULL) *remain = NULL; 7356 if (exp->c_max == -1) return(0); 7357 if (IS_NILLABLE(exp) && (!IS_NILLABLE(sub))) return(0); 7358 7359 for (i = 1;i <= exp->c_max;i++) { 7360 sub->ref++; 7361 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, 7362 sub, NULL, NULL, i, i); 7363 if (tmp == NULL) { 7364 return(-1); 7365 } 7366 if (!xmlExpCheckCard(tmp, exp)) { 7367 xmlExpFree(ctxt, tmp); 7368 continue; 7369 } 7370 tmp2 = xmlExpExpDeriveInt(ctxt, tmp, exp); 7371 if (tmp2 == NULL) { 7372 xmlExpFree(ctxt, tmp); 7373 return(-1); 7374 } 7375 if ((tmp2 != forbiddenExp) && (IS_NILLABLE(tmp2))) { 7376 if (remain != NULL) 7377 *remain = tmp2; 7378 else 7379 xmlExpFree(ctxt, tmp2); 7380 if (mult != NULL) 7381 *mult = tmp; 7382 else 7383 xmlExpFree(ctxt, tmp); 7384 #ifdef DEBUG_DERIV 7385 printf("Divide succeeded %d\n", i); 7386 #endif 7387 return(i); 7388 } 7389 xmlExpFree(ctxt, tmp); 7390 xmlExpFree(ctxt, tmp2); 7391 } 7392 #ifdef DEBUG_DERIV 7393 printf("Divide failed\n"); 7394 #endif 7395 return(0); 7396 } 7397 7398 /** 7399 * xmlExpExpDeriveInt: 7400 * @ctxt: the expressions context 7401 * @exp: the englobing expression 7402 * @sub: the subexpression 7403 * 7404 * Try to do a step of Brzozowski derivation but at a higher level 7405 * the input being a subexpression. 7406 * 7407 * Returns the resulting expression or NULL in case of internal error 7408 */ 7409 static xmlExpNodePtr 7410 xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7411 xmlExpNodePtr ret, tmp, tmp2, tmp3; 7412 const xmlChar **tab; 7413 int len, i; 7414 7415 /* 7416 * In case of equality and if the expression can only consume a finite 7417 * amount, then the derivation is empty 7418 */ 7419 if ((exp == sub) && (exp->c_max >= 0)) { 7420 #ifdef DEBUG_DERIV 7421 printf("Equal(exp, sub) and finite -> Empty\n"); 7422 #endif 7423 return(emptyExp); 7424 } 7425 /* 7426 * decompose sub sequence first 7427 */ 7428 if (sub->type == XML_EXP_EMPTY) { 7429 #ifdef DEBUG_DERIV 7430 printf("Empty(sub) -> Empty\n"); 7431 #endif 7432 exp->ref++; 7433 return(exp); 7434 } 7435 if (sub->type == XML_EXP_SEQ) { 7436 #ifdef DEBUG_DERIV 7437 printf("Seq(sub) -> decompose\n"); 7438 #endif 7439 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left); 7440 if (tmp == NULL) 7441 return(NULL); 7442 if (tmp == forbiddenExp) 7443 return(tmp); 7444 ret = xmlExpExpDeriveInt(ctxt, tmp, sub->exp_right); 7445 xmlExpFree(ctxt, tmp); 7446 return(ret); 7447 } 7448 if (sub->type == XML_EXP_OR) { 7449 #ifdef DEBUG_DERIV 7450 printf("Or(sub) -> decompose\n"); 7451 #endif 7452 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left); 7453 if (tmp == forbiddenExp) 7454 return(tmp); 7455 if (tmp == NULL) 7456 return(NULL); 7457 ret = xmlExpExpDeriveInt(ctxt, exp, sub->exp_right); 7458 if ((ret == NULL) || (ret == forbiddenExp)) { 7459 xmlExpFree(ctxt, tmp); 7460 return(ret); 7461 } 7462 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, NULL, 0, 0)); 7463 } 7464 if (!xmlExpCheckCard(exp, sub)) { 7465 #ifdef DEBUG_DERIV 7466 printf("CheckCard(exp, sub) failed -> Forbid\n"); 7467 #endif 7468 return(forbiddenExp); 7469 } 7470 switch (exp->type) { 7471 case XML_EXP_EMPTY: 7472 if (sub == emptyExp) 7473 return(emptyExp); 7474 #ifdef DEBUG_DERIV 7475 printf("Empty(exp) -> Forbid\n"); 7476 #endif 7477 return(forbiddenExp); 7478 case XML_EXP_FORBID: 7479 #ifdef DEBUG_DERIV 7480 printf("Forbid(exp) -> Forbid\n"); 7481 #endif 7482 return(forbiddenExp); 7483 case XML_EXP_ATOM: 7484 if (sub->type == XML_EXP_ATOM) { 7485 /* TODO: handle wildcards */ 7486 if (exp->exp_str == sub->exp_str) { 7487 #ifdef DEBUG_DERIV 7488 printf("Atom match -> Empty\n"); 7489 #endif 7490 return(emptyExp); 7491 } 7492 #ifdef DEBUG_DERIV 7493 printf("Atom mismatch -> Forbid\n"); 7494 #endif 7495 return(forbiddenExp); 7496 } 7497 if ((sub->type == XML_EXP_COUNT) && 7498 (sub->exp_max == 1) && 7499 (sub->exp_left->type == XML_EXP_ATOM)) { 7500 /* TODO: handle wildcards */ 7501 if (exp->exp_str == sub->exp_left->exp_str) { 7502 #ifdef DEBUG_DERIV 7503 printf("Atom match -> Empty\n"); 7504 #endif 7505 return(emptyExp); 7506 } 7507 #ifdef DEBUG_DERIV 7508 printf("Atom mismatch -> Forbid\n"); 7509 #endif 7510 return(forbiddenExp); 7511 } 7512 #ifdef DEBUG_DERIV 7513 printf("Complex exp vs Atom -> Forbid\n"); 7514 #endif 7515 return(forbiddenExp); 7516 case XML_EXP_SEQ: 7517 /* try to get the sequence consumed only if possible */ 7518 if (xmlExpCheckCard(exp->exp_left, sub)) { 7519 /* See if the sequence can be consumed directly */ 7520 #ifdef DEBUG_DERIV 7521 printf("Seq trying left only\n"); 7522 #endif 7523 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 7524 if ((ret != forbiddenExp) && (ret != NULL)) { 7525 #ifdef DEBUG_DERIV 7526 printf("Seq trying left only worked\n"); 7527 #endif 7528 /* 7529 * TODO: assumption here that we are determinist 7530 * i.e. we won't get to a nillable exp left 7531 * subset which could be matched by the right 7532 * part too. 7533 * e.g.: (a | b)+,(a | c) and 'a+,a' 7534 */ 7535 exp->exp_right->ref++; 7536 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, 7537 exp->exp_right, NULL, 0, 0)); 7538 } 7539 #ifdef DEBUG_DERIV 7540 } else { 7541 printf("Seq: left too short\n"); 7542 #endif 7543 } 7544 /* Try instead to decompose */ 7545 if (sub->type == XML_EXP_COUNT) { 7546 int min, max; 7547 7548 #ifdef DEBUG_DERIV 7549 printf("Seq: sub is a count\n"); 7550 #endif 7551 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left); 7552 if (ret == NULL) 7553 return(NULL); 7554 if (ret != forbiddenExp) { 7555 #ifdef DEBUG_DERIV 7556 printf("Seq , Count match on left\n"); 7557 #endif 7558 if (sub->exp_max < 0) 7559 max = -1; 7560 else 7561 max = sub->exp_max -1; 7562 if (sub->exp_min > 0) 7563 min = sub->exp_min -1; 7564 else 7565 min = 0; 7566 exp->exp_right->ref++; 7567 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, 7568 exp->exp_right, NULL, 0, 0); 7569 if (tmp == NULL) 7570 return(NULL); 7571 7572 sub->exp_left->ref++; 7573 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, 7574 sub->exp_left, NULL, NULL, min, max); 7575 if (tmp2 == NULL) { 7576 xmlExpFree(ctxt, tmp); 7577 return(NULL); 7578 } 7579 ret = xmlExpExpDeriveInt(ctxt, tmp, tmp2); 7580 xmlExpFree(ctxt, tmp); 7581 xmlExpFree(ctxt, tmp2); 7582 return(ret); 7583 } 7584 } 7585 /* we made no progress on structured operations */ 7586 break; 7587 case XML_EXP_OR: 7588 #ifdef DEBUG_DERIV 7589 printf("Or , trying both side\n"); 7590 #endif 7591 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 7592 if (ret == NULL) 7593 return(NULL); 7594 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_right, sub); 7595 if (tmp == NULL) { 7596 xmlExpFree(ctxt, ret); 7597 return(NULL); 7598 } 7599 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp, NULL, 0, 0)); 7600 case XML_EXP_COUNT: { 7601 int min, max; 7602 7603 if (sub->type == XML_EXP_COUNT) { 7604 /* 7605 * Try to see if the loop is completely subsumed 7606 */ 7607 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left); 7608 if (tmp == NULL) 7609 return(NULL); 7610 if (tmp == forbiddenExp) { 7611 int mult; 7612 7613 #ifdef DEBUG_DERIV 7614 printf("Count, Count inner don't subsume\n"); 7615 #endif 7616 mult = xmlExpDivide(ctxt, sub->exp_left, exp->exp_left, 7617 NULL, &tmp); 7618 if (mult <= 0) { 7619 #ifdef DEBUG_DERIV 7620 printf("Count, Count not multiple => forbidden\n"); 7621 #endif 7622 return(forbiddenExp); 7623 } 7624 if (sub->exp_max == -1) { 7625 max = -1; 7626 if (exp->exp_max == -1) { 7627 if (exp->exp_min <= sub->exp_min * mult) 7628 min = 0; 7629 else 7630 min = exp->exp_min - sub->exp_min * mult; 7631 } else { 7632 #ifdef DEBUG_DERIV 7633 printf("Count, Count finite can't subsume infinite\n"); 7634 #endif 7635 xmlExpFree(ctxt, tmp); 7636 return(forbiddenExp); 7637 } 7638 } else { 7639 if (exp->exp_max == -1) { 7640 #ifdef DEBUG_DERIV 7641 printf("Infinite loop consume mult finite loop\n"); 7642 #endif 7643 if (exp->exp_min > sub->exp_min * mult) { 7644 max = -1; 7645 min = exp->exp_min - sub->exp_min * mult; 7646 } else { 7647 max = -1; 7648 min = 0; 7649 } 7650 } else { 7651 if (exp->exp_max < sub->exp_max * mult) { 7652 #ifdef DEBUG_DERIV 7653 printf("loops max mult mismatch => forbidden\n"); 7654 #endif 7655 xmlExpFree(ctxt, tmp); 7656 return(forbiddenExp); 7657 } 7658 if (sub->exp_max * mult > exp->exp_min) 7659 min = 0; 7660 else 7661 min = exp->exp_min - sub->exp_max * mult; 7662 max = exp->exp_max - sub->exp_max * mult; 7663 } 7664 } 7665 } else if (!IS_NILLABLE(tmp)) { 7666 /* 7667 * TODO: loop here to try to grow if working on finite 7668 * blocks. 7669 */ 7670 #ifdef DEBUG_DERIV 7671 printf("Count, Count remain not nillable => forbidden\n"); 7672 #endif 7673 xmlExpFree(ctxt, tmp); 7674 return(forbiddenExp); 7675 } else if (sub->exp_max == -1) { 7676 if (exp->exp_max == -1) { 7677 if (exp->exp_min <= sub->exp_min) { 7678 #ifdef DEBUG_DERIV 7679 printf("Infinite loops Okay => COUNT(0,Inf)\n"); 7680 #endif 7681 max = -1; 7682 min = 0; 7683 } else { 7684 #ifdef DEBUG_DERIV 7685 printf("Infinite loops min => Count(X,Inf)\n"); 7686 #endif 7687 max = -1; 7688 min = exp->exp_min - sub->exp_min; 7689 } 7690 } else if (exp->exp_min > sub->exp_min) { 7691 #ifdef DEBUG_DERIV 7692 printf("loops min mismatch 1 => forbidden ???\n"); 7693 #endif 7694 xmlExpFree(ctxt, tmp); 7695 return(forbiddenExp); 7696 } else { 7697 max = -1; 7698 min = 0; 7699 } 7700 } else { 7701 if (exp->exp_max == -1) { 7702 #ifdef DEBUG_DERIV 7703 printf("Infinite loop consume finite loop\n"); 7704 #endif 7705 if (exp->exp_min > sub->exp_min) { 7706 max = -1; 7707 min = exp->exp_min - sub->exp_min; 7708 } else { 7709 max = -1; 7710 min = 0; 7711 } 7712 } else { 7713 if (exp->exp_max < sub->exp_max) { 7714 #ifdef DEBUG_DERIV 7715 printf("loops max mismatch => forbidden\n"); 7716 #endif 7717 xmlExpFree(ctxt, tmp); 7718 return(forbiddenExp); 7719 } 7720 if (sub->exp_max > exp->exp_min) 7721 min = 0; 7722 else 7723 min = exp->exp_min - sub->exp_max; 7724 max = exp->exp_max - sub->exp_max; 7725 } 7726 } 7727 #ifdef DEBUG_DERIV 7728 printf("loops match => SEQ(COUNT())\n"); 7729 #endif 7730 exp->exp_left->ref++; 7731 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, 7732 NULL, NULL, min, max); 7733 if (tmp2 == NULL) { 7734 return(NULL); 7735 } 7736 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2, 7737 NULL, 0, 0); 7738 return(ret); 7739 } 7740 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 7741 if (tmp == NULL) 7742 return(NULL); 7743 if (tmp == forbiddenExp) { 7744 #ifdef DEBUG_DERIV 7745 printf("loop mismatch => forbidden\n"); 7746 #endif 7747 return(forbiddenExp); 7748 } 7749 if (exp->exp_min > 0) 7750 min = exp->exp_min - 1; 7751 else 7752 min = 0; 7753 if (exp->exp_max < 0) 7754 max = -1; 7755 else 7756 max = exp->exp_max - 1; 7757 7758 #ifdef DEBUG_DERIV 7759 printf("loop match => SEQ(COUNT())\n"); 7760 #endif 7761 exp->exp_left->ref++; 7762 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, 7763 NULL, NULL, min, max); 7764 if (tmp2 == NULL) 7765 return(NULL); 7766 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2, 7767 NULL, 0, 0); 7768 return(ret); 7769 } 7770 } 7771 7772 #ifdef DEBUG_DERIV 7773 printf("Fallback to derivative\n"); 7774 #endif 7775 if (IS_NILLABLE(sub)) { 7776 if (!(IS_NILLABLE(exp))) 7777 return(forbiddenExp); 7778 else 7779 ret = emptyExp; 7780 } else 7781 ret = NULL; 7782 /* 7783 * here the structured derivation made no progress so 7784 * we use the default token based derivation to force one more step 7785 */ 7786 if (ctxt->tabSize == 0) 7787 ctxt->tabSize = 40; 7788 7789 tab = (const xmlChar **) xmlMalloc(ctxt->tabSize * 7790 sizeof(const xmlChar *)); 7791 if (tab == NULL) { 7792 return(NULL); 7793 } 7794 7795 /* 7796 * collect all the strings accepted by the subexpression on input 7797 */ 7798 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0); 7799 while (len < 0) { 7800 const xmlChar **temp; 7801 temp = (const xmlChar **) xmlRealloc((xmlChar **) tab, ctxt->tabSize * 2 * 7802 sizeof(const xmlChar *)); 7803 if (temp == NULL) { 7804 xmlFree((xmlChar **) tab); 7805 return(NULL); 7806 } 7807 tab = temp; 7808 ctxt->tabSize *= 2; 7809 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0); 7810 } 7811 for (i = 0;i < len;i++) { 7812 tmp = xmlExpStringDeriveInt(ctxt, exp, tab[i]); 7813 if ((tmp == NULL) || (tmp == forbiddenExp)) { 7814 xmlExpFree(ctxt, ret); 7815 xmlFree((xmlChar **) tab); 7816 return(tmp); 7817 } 7818 tmp2 = xmlExpStringDeriveInt(ctxt, sub, tab[i]); 7819 if ((tmp2 == NULL) || (tmp2 == forbiddenExp)) { 7820 xmlExpFree(ctxt, tmp); 7821 xmlExpFree(ctxt, ret); 7822 xmlFree((xmlChar **) tab); 7823 return(tmp); 7824 } 7825 tmp3 = xmlExpExpDeriveInt(ctxt, tmp, tmp2); 7826 xmlExpFree(ctxt, tmp); 7827 xmlExpFree(ctxt, tmp2); 7828 7829 if ((tmp3 == NULL) || (tmp3 == forbiddenExp)) { 7830 xmlExpFree(ctxt, ret); 7831 xmlFree((xmlChar **) tab); 7832 return(tmp3); 7833 } 7834 7835 if (ret == NULL) 7836 ret = tmp3; 7837 else { 7838 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp3, NULL, 0, 0); 7839 if (ret == NULL) { 7840 xmlFree((xmlChar **) tab); 7841 return(NULL); 7842 } 7843 } 7844 } 7845 xmlFree((xmlChar **) tab); 7846 return(ret); 7847 } 7848 7849 /** 7850 * xmlExpExpDerive: 7851 * @ctxt: the expressions context 7852 * @exp: the englobing expression 7853 * @sub: the subexpression 7854 * 7855 * Evaluates the expression resulting from @exp consuming a sub expression @sub 7856 * Based on algebraic derivation and sometimes direct Brzozowski derivation 7857 * it usually takes less than linear time and can handle expressions generating 7858 * infinite languages. 7859 * 7860 * Returns the resulting expression or NULL in case of internal error, the 7861 * result must be freed 7862 */ 7863 xmlExpNodePtr 7864 xmlExpExpDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7865 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL)) 7866 return(NULL); 7867 7868 /* 7869 * O(1) speedups 7870 */ 7871 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) { 7872 #ifdef DEBUG_DERIV 7873 printf("Sub nillable and not exp : can't subsume\n"); 7874 #endif 7875 return(forbiddenExp); 7876 } 7877 if (xmlExpCheckCard(exp, sub) == 0) { 7878 #ifdef DEBUG_DERIV 7879 printf("sub generate longer sequences than exp : can't subsume\n"); 7880 #endif 7881 return(forbiddenExp); 7882 } 7883 return(xmlExpExpDeriveInt(ctxt, exp, sub)); 7884 } 7885 7886 /** 7887 * xmlExpSubsume: 7888 * @ctxt: the expressions context 7889 * @exp: the englobing expression 7890 * @sub: the subexpression 7891 * 7892 * Check whether @exp accepts all the languages accepted by @sub 7893 * the input being a subexpression. 7894 * 7895 * Returns 1 if true 0 if false and -1 in case of failure. 7896 */ 7897 int 7898 xmlExpSubsume(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7899 xmlExpNodePtr tmp; 7900 7901 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL)) 7902 return(-1); 7903 7904 /* 7905 * TODO: speedup by checking the language of sub is a subset of the 7906 * language of exp 7907 */ 7908 /* 7909 * O(1) speedups 7910 */ 7911 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) { 7912 #ifdef DEBUG_DERIV 7913 printf("Sub nillable and not exp : can't subsume\n"); 7914 #endif 7915 return(0); 7916 } 7917 if (xmlExpCheckCard(exp, sub) == 0) { 7918 #ifdef DEBUG_DERIV 7919 printf("sub generate longer sequences than exp : can't subsume\n"); 7920 #endif 7921 return(0); 7922 } 7923 tmp = xmlExpExpDeriveInt(ctxt, exp, sub); 7924 #ifdef DEBUG_DERIV 7925 printf("Result derivation :\n"); 7926 PRINT_EXP(tmp); 7927 #endif 7928 if (tmp == NULL) 7929 return(-1); 7930 if (tmp == forbiddenExp) 7931 return(0); 7932 if (tmp == emptyExp) 7933 return(1); 7934 if ((tmp != NULL) && (IS_NILLABLE(tmp))) { 7935 xmlExpFree(ctxt, tmp); 7936 return(1); 7937 } 7938 xmlExpFree(ctxt, tmp); 7939 return(0); 7940 } 7941 7942 /************************************************************************ 7943 * * 7944 * Parsing expression * 7945 * * 7946 ************************************************************************/ 7947 7948 static xmlExpNodePtr xmlExpParseExpr(xmlExpCtxtPtr ctxt); 7949 7950 #undef CUR 7951 #define CUR (*ctxt->cur) 7952 #undef NEXT 7953 #define NEXT ctxt->cur++; 7954 #undef IS_BLANK 7955 #define IS_BLANK(c) ((c == ' ') || (c == '\n') || (c == '\r') || (c == '\t')) 7956 #define SKIP_BLANKS while (IS_BLANK(*ctxt->cur)) ctxt->cur++; 7957 7958 static int 7959 xmlExpParseNumber(xmlExpCtxtPtr ctxt) { 7960 int ret = 0; 7961 7962 SKIP_BLANKS 7963 if (CUR == '*') { 7964 NEXT 7965 return(-1); 7966 } 7967 if ((CUR < '0') || (CUR > '9')) 7968 return(-1); 7969 while ((CUR >= '0') && (CUR <= '9')) { 7970 ret = ret * 10 + (CUR - '0'); 7971 NEXT 7972 } 7973 return(ret); 7974 } 7975 7976 static xmlExpNodePtr 7977 xmlExpParseOr(xmlExpCtxtPtr ctxt) { 7978 const char *base; 7979 xmlExpNodePtr ret; 7980 const xmlChar *val; 7981 7982 SKIP_BLANKS 7983 base = ctxt->cur; 7984 if (*ctxt->cur == '(') { 7985 NEXT 7986 ret = xmlExpParseExpr(ctxt); 7987 SKIP_BLANKS 7988 if (*ctxt->cur != ')') { 7989 fprintf(stderr, "unbalanced '(' : %s\n", base); 7990 xmlExpFree(ctxt, ret); 7991 return(NULL); 7992 } 7993 NEXT; 7994 SKIP_BLANKS 7995 goto parse_quantifier; 7996 } 7997 while ((CUR != 0) && (!(IS_BLANK(CUR))) && (CUR != '(') && 7998 (CUR != ')') && (CUR != '|') && (CUR != ',') && (CUR != '{') && 7999 (CUR != '*') && (CUR != '+') && (CUR != '?') && (CUR != '}')) 8000 NEXT; 8001 val = xmlDictLookup(ctxt->dict, BAD_CAST base, ctxt->cur - base); 8002 if (val == NULL) 8003 return(NULL); 8004 ret = xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, val, 0, 0); 8005 if (ret == NULL) 8006 return(NULL); 8007 SKIP_BLANKS 8008 parse_quantifier: 8009 if (CUR == '{') { 8010 int min, max; 8011 8012 NEXT 8013 min = xmlExpParseNumber(ctxt); 8014 if (min < 0) { 8015 xmlExpFree(ctxt, ret); 8016 return(NULL); 8017 } 8018 SKIP_BLANKS 8019 if (CUR == ',') { 8020 NEXT 8021 max = xmlExpParseNumber(ctxt); 8022 SKIP_BLANKS 8023 } else 8024 max = min; 8025 if (CUR != '}') { 8026 xmlExpFree(ctxt, ret); 8027 return(NULL); 8028 } 8029 NEXT 8030 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 8031 min, max); 8032 SKIP_BLANKS 8033 } else if (CUR == '?') { 8034 NEXT 8035 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 8036 0, 1); 8037 SKIP_BLANKS 8038 } else if (CUR == '+') { 8039 NEXT 8040 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 8041 1, -1); 8042 SKIP_BLANKS 8043 } else if (CUR == '*') { 8044 NEXT 8045 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 8046 0, -1); 8047 SKIP_BLANKS 8048 } 8049 return(ret); 8050 } 8051 8052 8053 static xmlExpNodePtr 8054 xmlExpParseSeq(xmlExpCtxtPtr ctxt) { 8055 xmlExpNodePtr ret, right; 8056 8057 ret = xmlExpParseOr(ctxt); 8058 SKIP_BLANKS 8059 while (CUR == '|') { 8060 NEXT 8061 right = xmlExpParseOr(ctxt); 8062 if (right == NULL) { 8063 xmlExpFree(ctxt, ret); 8064 return(NULL); 8065 } 8066 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, right, NULL, 0, 0); 8067 if (ret == NULL) 8068 return(NULL); 8069 } 8070 return(ret); 8071 } 8072 8073 static xmlExpNodePtr 8074 xmlExpParseExpr(xmlExpCtxtPtr ctxt) { 8075 xmlExpNodePtr ret, right; 8076 8077 ret = xmlExpParseSeq(ctxt); 8078 SKIP_BLANKS 8079 while (CUR == ',') { 8080 NEXT 8081 right = xmlExpParseSeq(ctxt); 8082 if (right == NULL) { 8083 xmlExpFree(ctxt, ret); 8084 return(NULL); 8085 } 8086 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, right, NULL, 0, 0); 8087 if (ret == NULL) 8088 return(NULL); 8089 } 8090 return(ret); 8091 } 8092 8093 /** 8094 * xmlExpParse: 8095 * @ctxt: the expressions context 8096 * @expr: the 0 terminated string 8097 * 8098 * Minimal parser for regexps, it understand the following constructs 8099 * - string terminals 8100 * - choice operator | 8101 * - sequence operator , 8102 * - subexpressions (...) 8103 * - usual cardinality operators + * and ? 8104 * - finite sequences { min, max } 8105 * - infinite sequences { min, * } 8106 * There is minimal checkings made especially no checking on strings values 8107 * 8108 * Returns a new expression or NULL in case of failure 8109 */ 8110 xmlExpNodePtr 8111 xmlExpParse(xmlExpCtxtPtr ctxt, const char *expr) { 8112 xmlExpNodePtr ret; 8113 8114 ctxt->expr = expr; 8115 ctxt->cur = expr; 8116 8117 ret = xmlExpParseExpr(ctxt); 8118 SKIP_BLANKS 8119 if (*ctxt->cur != 0) { 8120 xmlExpFree(ctxt, ret); 8121 return(NULL); 8122 } 8123 return(ret); 8124 } 8125 8126 static void 8127 xmlExpDumpInt(xmlBufferPtr buf, xmlExpNodePtr expr, int glob) { 8128 xmlExpNodePtr c; 8129 8130 if (expr == NULL) return; 8131 if (glob) xmlBufferWriteChar(buf, "("); 8132 switch (expr->type) { 8133 case XML_EXP_EMPTY: 8134 xmlBufferWriteChar(buf, "empty"); 8135 break; 8136 case XML_EXP_FORBID: 8137 xmlBufferWriteChar(buf, "forbidden"); 8138 break; 8139 case XML_EXP_ATOM: 8140 xmlBufferWriteCHAR(buf, expr->exp_str); 8141 break; 8142 case XML_EXP_SEQ: 8143 c = expr->exp_left; 8144 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8145 xmlExpDumpInt(buf, c, 1); 8146 else 8147 xmlExpDumpInt(buf, c, 0); 8148 xmlBufferWriteChar(buf, " , "); 8149 c = expr->exp_right; 8150 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8151 xmlExpDumpInt(buf, c, 1); 8152 else 8153 xmlExpDumpInt(buf, c, 0); 8154 break; 8155 case XML_EXP_OR: 8156 c = expr->exp_left; 8157 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8158 xmlExpDumpInt(buf, c, 1); 8159 else 8160 xmlExpDumpInt(buf, c, 0); 8161 xmlBufferWriteChar(buf, " | "); 8162 c = expr->exp_right; 8163 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8164 xmlExpDumpInt(buf, c, 1); 8165 else 8166 xmlExpDumpInt(buf, c, 0); 8167 break; 8168 case XML_EXP_COUNT: { 8169 char rep[40]; 8170 8171 c = expr->exp_left; 8172 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8173 xmlExpDumpInt(buf, c, 1); 8174 else 8175 xmlExpDumpInt(buf, c, 0); 8176 if ((expr->exp_min == 0) && (expr->exp_max == 1)) { 8177 rep[0] = '?'; 8178 rep[1] = 0; 8179 } else if ((expr->exp_min == 0) && (expr->exp_max == -1)) { 8180 rep[0] = '*'; 8181 rep[1] = 0; 8182 } else if ((expr->exp_min == 1) && (expr->exp_max == -1)) { 8183 rep[0] = '+'; 8184 rep[1] = 0; 8185 } else if (expr->exp_max == expr->exp_min) { 8186 snprintf(rep, 39, "{%d}", expr->exp_min); 8187 } else if (expr->exp_max < 0) { 8188 snprintf(rep, 39, "{%d,inf}", expr->exp_min); 8189 } else { 8190 snprintf(rep, 39, "{%d,%d}", expr->exp_min, expr->exp_max); 8191 } 8192 rep[39] = 0; 8193 xmlBufferWriteChar(buf, rep); 8194 break; 8195 } 8196 default: 8197 fprintf(stderr, "Error in tree\n"); 8198 } 8199 if (glob) 8200 xmlBufferWriteChar(buf, ")"); 8201 } 8202 /** 8203 * xmlExpDump: 8204 * @buf: a buffer to receive the output 8205 * @expr: the compiled expression 8206 * 8207 * Serialize the expression as compiled to the buffer 8208 */ 8209 void 8210 xmlExpDump(xmlBufferPtr buf, xmlExpNodePtr expr) { 8211 if ((buf == NULL) || (expr == NULL)) 8212 return; 8213 xmlExpDumpInt(buf, expr, 0); 8214 } 8215 8216 /** 8217 * xmlExpMaxToken: 8218 * @expr: a compiled expression 8219 * 8220 * Indicate the maximum number of input a expression can accept 8221 * 8222 * Returns the maximum length or -1 in case of error 8223 */ 8224 int 8225 xmlExpMaxToken(xmlExpNodePtr expr) { 8226 if (expr == NULL) 8227 return(-1); 8228 return(expr->c_max); 8229 } 8230 8231 /** 8232 * xmlExpCtxtNbNodes: 8233 * @ctxt: an expression context 8234 * 8235 * Debugging facility provides the number of allocated nodes at a that point 8236 * 8237 * Returns the number of nodes in use or -1 in case of error 8238 */ 8239 int 8240 xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt) { 8241 if (ctxt == NULL) 8242 return(-1); 8243 return(ctxt->nb_nodes); 8244 } 8245 8246 /** 8247 * xmlExpCtxtNbCons: 8248 * @ctxt: an expression context 8249 * 8250 * Debugging facility provides the number of allocated nodes over lifetime 8251 * 8252 * Returns the number of nodes ever allocated or -1 in case of error 8253 */ 8254 int 8255 xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt) { 8256 if (ctxt == NULL) 8257 return(-1); 8258 return(ctxt->nb_cons); 8259 } 8260 8261 #endif /* LIBXML_EXPR_ENABLED */ 8262 #define bottom_xmlregexp 8263 #include "elfgcchack.h" 8264 #endif /* LIBXML_REGEXP_ENABLED */ 8265