1 /* 2 * regexp.c: generic and extensible Regular Expression engine 3 * 4 * Basically designed with the purpose of compiling regexps for 5 * the variety of validation/schemas mechanisms now available in 6 * XML related specifications these include: 7 * - XML-1.0 DTD validation 8 * - XML Schemas structure part 1 9 * - XML Schemas Datatypes part 2 especially Appendix F 10 * - RELAX-NG/TREX i.e. the counter proposal 11 * 12 * See Copyright for the status of this software. 13 * 14 * Daniel Veillard <veillard@redhat.com> 15 */ 16 17 #define IN_LIBXML 18 #include "libxml.h" 19 20 #ifdef LIBXML_REGEXP_ENABLED 21 22 /* #define DEBUG_ERR */ 23 24 #include <stdio.h> 25 #include <string.h> 26 #include <limits.h> 27 28 #include <libxml/tree.h> 29 #include <libxml/parserInternals.h> 30 #include <libxml/xmlregexp.h> 31 #include <libxml/xmlautomata.h> 32 #include <libxml/xmlunicode.h> 33 34 #ifndef SIZE_MAX 35 #define SIZE_MAX ((size_t) -1) 36 #endif 37 38 /* #define DEBUG_REGEXP_GRAPH */ 39 /* #define DEBUG_REGEXP_EXEC */ 40 /* #define DEBUG_PUSH */ 41 /* #define DEBUG_COMPACTION */ 42 43 #define MAX_PUSH 10000000 44 45 #ifdef ERROR 46 #undef ERROR 47 #endif 48 #define ERROR(str) \ 49 ctxt->error = XML_REGEXP_COMPILE_ERROR; \ 50 xmlRegexpErrCompile(ctxt, str); 51 #define NEXT ctxt->cur++ 52 #define CUR (*(ctxt->cur)) 53 #define NXT(index) (ctxt->cur[index]) 54 55 #define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l) 56 #define NEXTL(l) ctxt->cur += l; 57 #define XML_REG_STRING_SEPARATOR '|' 58 /* 59 * Need PREV to check on a '-' within a Character Group. May only be used 60 * when it's guaranteed that cur is not at the beginning of ctxt->string! 61 */ 62 #define PREV (ctxt->cur[-1]) 63 64 /** 65 * TODO: 66 * 67 * macro to flag unimplemented blocks 68 */ 69 #define TODO \ 70 xmlGenericError(xmlGenericErrorContext, \ 71 "Unimplemented block at %s:%d\n", \ 72 __FILE__, __LINE__); 73 74 /************************************************************************ 75 * * 76 * Datatypes and structures * 77 * * 78 ************************************************************************/ 79 80 /* 81 * Note: the order of the enums below is significant, do not shuffle 82 */ 83 typedef enum { 84 XML_REGEXP_EPSILON = 1, 85 XML_REGEXP_CHARVAL, 86 XML_REGEXP_RANGES, 87 XML_REGEXP_SUBREG, /* used for () sub regexps */ 88 XML_REGEXP_STRING, 89 XML_REGEXP_ANYCHAR, /* . */ 90 XML_REGEXP_ANYSPACE, /* \s */ 91 XML_REGEXP_NOTSPACE, /* \S */ 92 XML_REGEXP_INITNAME, /* \l */ 93 XML_REGEXP_NOTINITNAME, /* \L */ 94 XML_REGEXP_NAMECHAR, /* \c */ 95 XML_REGEXP_NOTNAMECHAR, /* \C */ 96 XML_REGEXP_DECIMAL, /* \d */ 97 XML_REGEXP_NOTDECIMAL, /* \D */ 98 XML_REGEXP_REALCHAR, /* \w */ 99 XML_REGEXP_NOTREALCHAR, /* \W */ 100 XML_REGEXP_LETTER = 100, 101 XML_REGEXP_LETTER_UPPERCASE, 102 XML_REGEXP_LETTER_LOWERCASE, 103 XML_REGEXP_LETTER_TITLECASE, 104 XML_REGEXP_LETTER_MODIFIER, 105 XML_REGEXP_LETTER_OTHERS, 106 XML_REGEXP_MARK, 107 XML_REGEXP_MARK_NONSPACING, 108 XML_REGEXP_MARK_SPACECOMBINING, 109 XML_REGEXP_MARK_ENCLOSING, 110 XML_REGEXP_NUMBER, 111 XML_REGEXP_NUMBER_DECIMAL, 112 XML_REGEXP_NUMBER_LETTER, 113 XML_REGEXP_NUMBER_OTHERS, 114 XML_REGEXP_PUNCT, 115 XML_REGEXP_PUNCT_CONNECTOR, 116 XML_REGEXP_PUNCT_DASH, 117 XML_REGEXP_PUNCT_OPEN, 118 XML_REGEXP_PUNCT_CLOSE, 119 XML_REGEXP_PUNCT_INITQUOTE, 120 XML_REGEXP_PUNCT_FINQUOTE, 121 XML_REGEXP_PUNCT_OTHERS, 122 XML_REGEXP_SEPAR, 123 XML_REGEXP_SEPAR_SPACE, 124 XML_REGEXP_SEPAR_LINE, 125 XML_REGEXP_SEPAR_PARA, 126 XML_REGEXP_SYMBOL, 127 XML_REGEXP_SYMBOL_MATH, 128 XML_REGEXP_SYMBOL_CURRENCY, 129 XML_REGEXP_SYMBOL_MODIFIER, 130 XML_REGEXP_SYMBOL_OTHERS, 131 XML_REGEXP_OTHER, 132 XML_REGEXP_OTHER_CONTROL, 133 XML_REGEXP_OTHER_FORMAT, 134 XML_REGEXP_OTHER_PRIVATE, 135 XML_REGEXP_OTHER_NA, 136 XML_REGEXP_BLOCK_NAME 137 } xmlRegAtomType; 138 139 typedef enum { 140 XML_REGEXP_QUANT_EPSILON = 1, 141 XML_REGEXP_QUANT_ONCE, 142 XML_REGEXP_QUANT_OPT, 143 XML_REGEXP_QUANT_MULT, 144 XML_REGEXP_QUANT_PLUS, 145 XML_REGEXP_QUANT_ONCEONLY, 146 XML_REGEXP_QUANT_ALL, 147 XML_REGEXP_QUANT_RANGE 148 } xmlRegQuantType; 149 150 typedef enum { 151 XML_REGEXP_START_STATE = 1, 152 XML_REGEXP_FINAL_STATE, 153 XML_REGEXP_TRANS_STATE, 154 XML_REGEXP_SINK_STATE, 155 XML_REGEXP_UNREACH_STATE 156 } xmlRegStateType; 157 158 typedef enum { 159 XML_REGEXP_MARK_NORMAL = 0, 160 XML_REGEXP_MARK_START, 161 XML_REGEXP_MARK_VISITED 162 } xmlRegMarkedType; 163 164 typedef struct _xmlRegRange xmlRegRange; 165 typedef xmlRegRange *xmlRegRangePtr; 166 167 struct _xmlRegRange { 168 int neg; /* 0 normal, 1 not, 2 exclude */ 169 xmlRegAtomType type; 170 int start; 171 int end; 172 xmlChar *blockName; 173 }; 174 175 typedef struct _xmlRegAtom xmlRegAtom; 176 typedef xmlRegAtom *xmlRegAtomPtr; 177 178 typedef struct _xmlAutomataState xmlRegState; 179 typedef xmlRegState *xmlRegStatePtr; 180 181 struct _xmlRegAtom { 182 int no; 183 xmlRegAtomType type; 184 xmlRegQuantType quant; 185 int min; 186 int max; 187 188 void *valuep; 189 void *valuep2; 190 int neg; 191 int codepoint; 192 xmlRegStatePtr start; 193 xmlRegStatePtr start0; 194 xmlRegStatePtr stop; 195 int maxRanges; 196 int nbRanges; 197 xmlRegRangePtr *ranges; 198 void *data; 199 }; 200 201 typedef struct _xmlRegCounter xmlRegCounter; 202 typedef xmlRegCounter *xmlRegCounterPtr; 203 204 struct _xmlRegCounter { 205 int min; 206 int max; 207 }; 208 209 typedef struct _xmlRegTrans xmlRegTrans; 210 typedef xmlRegTrans *xmlRegTransPtr; 211 212 struct _xmlRegTrans { 213 xmlRegAtomPtr atom; 214 int to; 215 int counter; 216 int count; 217 int nd; 218 }; 219 220 struct _xmlAutomataState { 221 xmlRegStateType type; 222 xmlRegMarkedType mark; 223 xmlRegMarkedType markd; 224 xmlRegMarkedType reached; 225 int no; 226 int maxTrans; 227 int nbTrans; 228 xmlRegTrans *trans; 229 /* knowing states pointing to us can speed things up */ 230 int maxTransTo; 231 int nbTransTo; 232 int *transTo; 233 }; 234 235 typedef struct _xmlAutomata xmlRegParserCtxt; 236 typedef xmlRegParserCtxt *xmlRegParserCtxtPtr; 237 238 #define AM_AUTOMATA_RNG 1 239 240 struct _xmlAutomata { 241 xmlChar *string; 242 xmlChar *cur; 243 244 int error; 245 int neg; 246 247 xmlRegStatePtr start; 248 xmlRegStatePtr end; 249 xmlRegStatePtr state; 250 251 xmlRegAtomPtr atom; 252 253 int maxAtoms; 254 int nbAtoms; 255 xmlRegAtomPtr *atoms; 256 257 int maxStates; 258 int nbStates; 259 xmlRegStatePtr *states; 260 261 int maxCounters; 262 int nbCounters; 263 xmlRegCounter *counters; 264 265 int determinist; 266 int negs; 267 int flags; 268 269 int depth; 270 }; 271 272 struct _xmlRegexp { 273 xmlChar *string; 274 int nbStates; 275 xmlRegStatePtr *states; 276 int nbAtoms; 277 xmlRegAtomPtr *atoms; 278 int nbCounters; 279 xmlRegCounter *counters; 280 int determinist; 281 int flags; 282 /* 283 * That's the compact form for determinists automatas 284 */ 285 int nbstates; 286 int *compact; 287 void **transdata; 288 int nbstrings; 289 xmlChar **stringMap; 290 }; 291 292 typedef struct _xmlRegExecRollback xmlRegExecRollback; 293 typedef xmlRegExecRollback *xmlRegExecRollbackPtr; 294 295 struct _xmlRegExecRollback { 296 xmlRegStatePtr state;/* the current state */ 297 int index; /* the index in the input stack */ 298 int nextbranch; /* the next transition to explore in that state */ 299 int *counts; /* save the automata state if it has some */ 300 }; 301 302 typedef struct _xmlRegInputToken xmlRegInputToken; 303 typedef xmlRegInputToken *xmlRegInputTokenPtr; 304 305 struct _xmlRegInputToken { 306 xmlChar *value; 307 void *data; 308 }; 309 310 struct _xmlRegExecCtxt { 311 int status; /* execution status != 0 indicate an error */ 312 int determinist; /* did we find an indeterministic behaviour */ 313 xmlRegexpPtr comp; /* the compiled regexp */ 314 xmlRegExecCallbacks callback; 315 void *data; 316 317 xmlRegStatePtr state;/* the current state */ 318 int transno; /* the current transition on that state */ 319 int transcount; /* the number of chars in char counted transitions */ 320 321 /* 322 * A stack of rollback states 323 */ 324 int maxRollbacks; 325 int nbRollbacks; 326 xmlRegExecRollback *rollbacks; 327 328 /* 329 * The state of the automata if any 330 */ 331 int *counts; 332 333 /* 334 * The input stack 335 */ 336 int inputStackMax; 337 int inputStackNr; 338 int index; 339 int *charStack; 340 const xmlChar *inputString; /* when operating on characters */ 341 xmlRegInputTokenPtr inputStack;/* when operating on strings */ 342 343 /* 344 * error handling 345 */ 346 int errStateNo; /* the error state number */ 347 xmlRegStatePtr errState; /* the error state */ 348 xmlChar *errString; /* the string raising the error */ 349 int *errCounts; /* counters at the error state */ 350 int nbPush; 351 }; 352 353 #define REGEXP_ALL_COUNTER 0x123456 354 #define REGEXP_ALL_LAX_COUNTER 0x123457 355 356 static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top); 357 static void xmlRegFreeState(xmlRegStatePtr state); 358 static void xmlRegFreeAtom(xmlRegAtomPtr atom); 359 static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr); 360 static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint); 361 static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, 362 int neg, int start, int end, const xmlChar *blockName); 363 364 void xmlAutomataSetFlags(xmlAutomataPtr am, int flags); 365 366 /************************************************************************ 367 * * 368 * Regexp memory error handler * 369 * * 370 ************************************************************************/ 371 /** 372 * xmlRegexpErrMemory: 373 * @extra: extra information 374 * 375 * Handle an out of memory condition 376 */ 377 static void 378 xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra) 379 { 380 const char *regexp = NULL; 381 if (ctxt != NULL) { 382 regexp = (const char *) ctxt->string; 383 ctxt->error = XML_ERR_NO_MEMORY; 384 } 385 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP, 386 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra, 387 regexp, NULL, 0, 0, 388 "Memory allocation failed : %s\n", extra); 389 } 390 391 /** 392 * xmlRegexpErrCompile: 393 * @extra: extra information 394 * 395 * Handle a compilation failure 396 */ 397 static void 398 xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra) 399 { 400 const char *regexp = NULL; 401 int idx = 0; 402 403 if (ctxt != NULL) { 404 regexp = (const char *) ctxt->string; 405 idx = ctxt->cur - ctxt->string; 406 ctxt->error = XML_REGEXP_COMPILE_ERROR; 407 } 408 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP, 409 XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra, 410 regexp, NULL, idx, 0, 411 "failed to compile: %s\n", extra); 412 } 413 414 /************************************************************************ 415 * * 416 * Allocation/Deallocation * 417 * * 418 ************************************************************************/ 419 420 static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt); 421 422 /** 423 * xmlRegCalloc2: 424 * @dim1: size of first dimension 425 * @dim2: size of second dimension 426 * @elemSize: size of element 427 * 428 * Allocate a two-dimensional array and set all elements to zero. 429 * 430 * Returns the new array or NULL in case of error. 431 */ 432 static void* 433 xmlRegCalloc2(size_t dim1, size_t dim2, size_t elemSize) { 434 size_t totalSize; 435 void *ret; 436 437 /* Check for overflow */ 438 if (dim1 > SIZE_MAX / dim2 / elemSize) 439 return (NULL); 440 totalSize = dim1 * dim2 * elemSize; 441 ret = xmlMalloc(totalSize); 442 if (ret != NULL) 443 memset(ret, 0, totalSize); 444 return (ret); 445 } 446 447 /** 448 * xmlRegEpxFromParse: 449 * @ctxt: the parser context used to build it 450 * 451 * Allocate a new regexp and fill it with the result from the parser 452 * 453 * Returns the new regexp or NULL in case of error 454 */ 455 static xmlRegexpPtr 456 xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) { 457 xmlRegexpPtr ret; 458 459 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp)); 460 if (ret == NULL) { 461 xmlRegexpErrMemory(ctxt, "compiling regexp"); 462 return(NULL); 463 } 464 memset(ret, 0, sizeof(xmlRegexp)); 465 ret->string = ctxt->string; 466 ret->nbStates = ctxt->nbStates; 467 ret->states = ctxt->states; 468 ret->nbAtoms = ctxt->nbAtoms; 469 ret->atoms = ctxt->atoms; 470 ret->nbCounters = ctxt->nbCounters; 471 ret->counters = ctxt->counters; 472 ret->determinist = ctxt->determinist; 473 ret->flags = ctxt->flags; 474 if (ret->determinist == -1) { 475 xmlRegexpIsDeterminist(ret); 476 } 477 478 if ((ret->determinist != 0) && 479 (ret->nbCounters == 0) && 480 (ctxt->negs == 0) && 481 (ret->atoms != NULL) && 482 (ret->atoms[0] != NULL) && 483 (ret->atoms[0]->type == XML_REGEXP_STRING)) { 484 int i, j, nbstates = 0, nbatoms = 0; 485 int *stateRemap; 486 int *stringRemap; 487 int *transitions; 488 void **transdata; 489 xmlChar **stringMap; 490 xmlChar *value; 491 492 /* 493 * Switch to a compact representation 494 * 1/ counting the effective number of states left 495 * 2/ counting the unique number of atoms, and check that 496 * they are all of the string type 497 * 3/ build a table state x atom for the transitions 498 */ 499 500 stateRemap = xmlMalloc(ret->nbStates * sizeof(int)); 501 if (stateRemap == NULL) { 502 xmlRegexpErrMemory(ctxt, "compiling regexp"); 503 xmlFree(ret); 504 return(NULL); 505 } 506 for (i = 0;i < ret->nbStates;i++) { 507 if (ret->states[i] != NULL) { 508 stateRemap[i] = nbstates; 509 nbstates++; 510 } else { 511 stateRemap[i] = -1; 512 } 513 } 514 #ifdef DEBUG_COMPACTION 515 printf("Final: %d states\n", nbstates); 516 #endif 517 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *)); 518 if (stringMap == NULL) { 519 xmlRegexpErrMemory(ctxt, "compiling regexp"); 520 xmlFree(stateRemap); 521 xmlFree(ret); 522 return(NULL); 523 } 524 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int)); 525 if (stringRemap == NULL) { 526 xmlRegexpErrMemory(ctxt, "compiling regexp"); 527 xmlFree(stringMap); 528 xmlFree(stateRemap); 529 xmlFree(ret); 530 return(NULL); 531 } 532 for (i = 0;i < ret->nbAtoms;i++) { 533 if ((ret->atoms[i]->type == XML_REGEXP_STRING) && 534 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) { 535 value = ret->atoms[i]->valuep; 536 for (j = 0;j < nbatoms;j++) { 537 if (xmlStrEqual(stringMap[j], value)) { 538 stringRemap[i] = j; 539 break; 540 } 541 } 542 if (j >= nbatoms) { 543 stringRemap[i] = nbatoms; 544 stringMap[nbatoms] = xmlStrdup(value); 545 if (stringMap[nbatoms] == NULL) { 546 for (i = 0;i < nbatoms;i++) 547 xmlFree(stringMap[i]); 548 xmlFree(stringRemap); 549 xmlFree(stringMap); 550 xmlFree(stateRemap); 551 xmlFree(ret); 552 return(NULL); 553 } 554 nbatoms++; 555 } 556 } else { 557 xmlFree(stateRemap); 558 xmlFree(stringRemap); 559 for (i = 0;i < nbatoms;i++) 560 xmlFree(stringMap[i]); 561 xmlFree(stringMap); 562 xmlFree(ret); 563 return(NULL); 564 } 565 } 566 #ifdef DEBUG_COMPACTION 567 printf("Final: %d atoms\n", nbatoms); 568 #endif 569 transitions = (int *) xmlRegCalloc2(nbstates + 1, nbatoms + 1, 570 sizeof(int)); 571 if (transitions == NULL) { 572 xmlFree(stateRemap); 573 xmlFree(stringRemap); 574 for (i = 0;i < nbatoms;i++) 575 xmlFree(stringMap[i]); 576 xmlFree(stringMap); 577 xmlFree(ret); 578 return(NULL); 579 } 580 581 /* 582 * Allocate the transition table. The first entry for each 583 * state corresponds to the state type. 584 */ 585 transdata = NULL; 586 587 for (i = 0;i < ret->nbStates;i++) { 588 int stateno, atomno, targetno, prev; 589 xmlRegStatePtr state; 590 xmlRegTransPtr trans; 591 592 stateno = stateRemap[i]; 593 if (stateno == -1) 594 continue; 595 state = ret->states[i]; 596 597 transitions[stateno * (nbatoms + 1)] = state->type; 598 599 for (j = 0;j < state->nbTrans;j++) { 600 trans = &(state->trans[j]); 601 if ((trans->to == -1) || (trans->atom == NULL)) 602 continue; 603 atomno = stringRemap[trans->atom->no]; 604 if ((trans->atom->data != NULL) && (transdata == NULL)) { 605 transdata = (void **) xmlRegCalloc2(nbstates, nbatoms, 606 sizeof(void *)); 607 if (transdata == NULL) { 608 xmlRegexpErrMemory(ctxt, "compiling regexp"); 609 break; 610 } 611 } 612 targetno = stateRemap[trans->to]; 613 /* 614 * if the same atom can generate transitions to 2 different 615 * states then it means the automata is not deterministic and 616 * the compact form can't be used ! 617 */ 618 prev = transitions[stateno * (nbatoms + 1) + atomno + 1]; 619 if (prev != 0) { 620 if (prev != targetno + 1) { 621 ret->determinist = 0; 622 #ifdef DEBUG_COMPACTION 623 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n", 624 i, j, trans->atom->no, trans->to, atomno, targetno); 625 printf(" previous to is %d\n", prev); 626 #endif 627 if (transdata != NULL) 628 xmlFree(transdata); 629 xmlFree(transitions); 630 xmlFree(stateRemap); 631 xmlFree(stringRemap); 632 for (i = 0;i < nbatoms;i++) 633 xmlFree(stringMap[i]); 634 xmlFree(stringMap); 635 goto not_determ; 636 } 637 } else { 638 #if 0 639 printf("State %d trans %d: atom %d to %d : %d to %d\n", 640 i, j, trans->atom->no, trans->to, atomno, targetno); 641 #endif 642 transitions[stateno * (nbatoms + 1) + atomno + 1] = 643 targetno + 1; /* to avoid 0 */ 644 if (transdata != NULL) 645 transdata[stateno * nbatoms + atomno] = 646 trans->atom->data; 647 } 648 } 649 } 650 ret->determinist = 1; 651 #ifdef DEBUG_COMPACTION 652 /* 653 * Debug 654 */ 655 for (i = 0;i < nbstates;i++) { 656 for (j = 0;j < nbatoms + 1;j++) { 657 printf("%02d ", transitions[i * (nbatoms + 1) + j]); 658 } 659 printf("\n"); 660 } 661 printf("\n"); 662 #endif 663 /* 664 * Cleanup of the old data 665 */ 666 if (ret->states != NULL) { 667 for (i = 0;i < ret->nbStates;i++) 668 xmlRegFreeState(ret->states[i]); 669 xmlFree(ret->states); 670 } 671 ret->states = NULL; 672 ret->nbStates = 0; 673 if (ret->atoms != NULL) { 674 for (i = 0;i < ret->nbAtoms;i++) 675 xmlRegFreeAtom(ret->atoms[i]); 676 xmlFree(ret->atoms); 677 } 678 ret->atoms = NULL; 679 ret->nbAtoms = 0; 680 681 ret->compact = transitions; 682 ret->transdata = transdata; 683 ret->stringMap = stringMap; 684 ret->nbstrings = nbatoms; 685 ret->nbstates = nbstates; 686 xmlFree(stateRemap); 687 xmlFree(stringRemap); 688 } 689 not_determ: 690 ctxt->string = NULL; 691 ctxt->nbStates = 0; 692 ctxt->states = NULL; 693 ctxt->nbAtoms = 0; 694 ctxt->atoms = NULL; 695 ctxt->nbCounters = 0; 696 ctxt->counters = NULL; 697 return(ret); 698 } 699 700 /** 701 * xmlRegNewParserCtxt: 702 * @string: the string to parse 703 * 704 * Allocate a new regexp parser context 705 * 706 * Returns the new context or NULL in case of error 707 */ 708 static xmlRegParserCtxtPtr 709 xmlRegNewParserCtxt(const xmlChar *string) { 710 xmlRegParserCtxtPtr ret; 711 712 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt)); 713 if (ret == NULL) 714 return(NULL); 715 memset(ret, 0, sizeof(xmlRegParserCtxt)); 716 if (string != NULL) 717 ret->string = xmlStrdup(string); 718 ret->cur = ret->string; 719 ret->neg = 0; 720 ret->negs = 0; 721 ret->error = 0; 722 ret->determinist = -1; 723 return(ret); 724 } 725 726 /** 727 * xmlRegNewRange: 728 * @ctxt: the regexp parser context 729 * @neg: is that negative 730 * @type: the type of range 731 * @start: the start codepoint 732 * @end: the end codepoint 733 * 734 * Allocate a new regexp range 735 * 736 * Returns the new range or NULL in case of error 737 */ 738 static xmlRegRangePtr 739 xmlRegNewRange(xmlRegParserCtxtPtr ctxt, 740 int neg, xmlRegAtomType type, int start, int end) { 741 xmlRegRangePtr ret; 742 743 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange)); 744 if (ret == NULL) { 745 xmlRegexpErrMemory(ctxt, "allocating range"); 746 return(NULL); 747 } 748 ret->neg = neg; 749 ret->type = type; 750 ret->start = start; 751 ret->end = end; 752 return(ret); 753 } 754 755 /** 756 * xmlRegFreeRange: 757 * @range: the regexp range 758 * 759 * Free a regexp range 760 */ 761 static void 762 xmlRegFreeRange(xmlRegRangePtr range) { 763 if (range == NULL) 764 return; 765 766 if (range->blockName != NULL) 767 xmlFree(range->blockName); 768 xmlFree(range); 769 } 770 771 /** 772 * xmlRegCopyRange: 773 * @range: the regexp range 774 * 775 * Copy a regexp range 776 * 777 * Returns the new copy or NULL in case of error. 778 */ 779 static xmlRegRangePtr 780 xmlRegCopyRange(xmlRegParserCtxtPtr ctxt, xmlRegRangePtr range) { 781 xmlRegRangePtr ret; 782 783 if (range == NULL) 784 return(NULL); 785 786 ret = xmlRegNewRange(ctxt, range->neg, range->type, range->start, 787 range->end); 788 if (ret == NULL) 789 return(NULL); 790 if (range->blockName != NULL) { 791 ret->blockName = xmlStrdup(range->blockName); 792 if (ret->blockName == NULL) { 793 xmlRegexpErrMemory(ctxt, "allocating range"); 794 xmlRegFreeRange(ret); 795 return(NULL); 796 } 797 } 798 return(ret); 799 } 800 801 /** 802 * xmlRegNewAtom: 803 * @ctxt: the regexp parser context 804 * @type: the type of atom 805 * 806 * Allocate a new atom 807 * 808 * Returns the new atom or NULL in case of error 809 */ 810 static xmlRegAtomPtr 811 xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) { 812 xmlRegAtomPtr ret; 813 814 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom)); 815 if (ret == NULL) { 816 xmlRegexpErrMemory(ctxt, "allocating atom"); 817 return(NULL); 818 } 819 memset(ret, 0, sizeof(xmlRegAtom)); 820 ret->type = type; 821 ret->quant = XML_REGEXP_QUANT_ONCE; 822 ret->min = 0; 823 ret->max = 0; 824 return(ret); 825 } 826 827 /** 828 * xmlRegFreeAtom: 829 * @atom: the regexp atom 830 * 831 * Free a regexp atom 832 */ 833 static void 834 xmlRegFreeAtom(xmlRegAtomPtr atom) { 835 int i; 836 837 if (atom == NULL) 838 return; 839 840 for (i = 0;i < atom->nbRanges;i++) 841 xmlRegFreeRange(atom->ranges[i]); 842 if (atom->ranges != NULL) 843 xmlFree(atom->ranges); 844 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL)) 845 xmlFree(atom->valuep); 846 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL)) 847 xmlFree(atom->valuep2); 848 if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL)) 849 xmlFree(atom->valuep); 850 xmlFree(atom); 851 } 852 853 /** 854 * xmlRegCopyAtom: 855 * @ctxt: the regexp parser context 856 * @atom: the original atom 857 * 858 * Allocate a new regexp range 859 * 860 * Returns the new atom or NULL in case of error 861 */ 862 static xmlRegAtomPtr 863 xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) { 864 xmlRegAtomPtr ret; 865 866 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom)); 867 if (ret == NULL) { 868 xmlRegexpErrMemory(ctxt, "copying atom"); 869 return(NULL); 870 } 871 memset(ret, 0, sizeof(xmlRegAtom)); 872 ret->type = atom->type; 873 ret->quant = atom->quant; 874 ret->min = atom->min; 875 ret->max = atom->max; 876 if (atom->nbRanges > 0) { 877 int i; 878 879 ret->ranges = (xmlRegRangePtr *) xmlMalloc(sizeof(xmlRegRangePtr) * 880 atom->nbRanges); 881 if (ret->ranges == NULL) { 882 xmlRegexpErrMemory(ctxt, "copying atom"); 883 goto error; 884 } 885 for (i = 0;i < atom->nbRanges;i++) { 886 ret->ranges[i] = xmlRegCopyRange(ctxt, atom->ranges[i]); 887 if (ret->ranges[i] == NULL) 888 goto error; 889 ret->nbRanges = i + 1; 890 } 891 } 892 return(ret); 893 894 error: 895 xmlRegFreeAtom(ret); 896 return(NULL); 897 } 898 899 static xmlRegStatePtr 900 xmlRegNewState(xmlRegParserCtxtPtr ctxt) { 901 xmlRegStatePtr ret; 902 903 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState)); 904 if (ret == NULL) { 905 xmlRegexpErrMemory(ctxt, "allocating state"); 906 return(NULL); 907 } 908 memset(ret, 0, sizeof(xmlRegState)); 909 ret->type = XML_REGEXP_TRANS_STATE; 910 ret->mark = XML_REGEXP_MARK_NORMAL; 911 return(ret); 912 } 913 914 /** 915 * xmlRegFreeState: 916 * @state: the regexp state 917 * 918 * Free a regexp state 919 */ 920 static void 921 xmlRegFreeState(xmlRegStatePtr state) { 922 if (state == NULL) 923 return; 924 925 if (state->trans != NULL) 926 xmlFree(state->trans); 927 if (state->transTo != NULL) 928 xmlFree(state->transTo); 929 xmlFree(state); 930 } 931 932 /** 933 * xmlRegFreeParserCtxt: 934 * @ctxt: the regexp parser context 935 * 936 * Free a regexp parser context 937 */ 938 static void 939 xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) { 940 int i; 941 if (ctxt == NULL) 942 return; 943 944 if (ctxt->string != NULL) 945 xmlFree(ctxt->string); 946 if (ctxt->states != NULL) { 947 for (i = 0;i < ctxt->nbStates;i++) 948 xmlRegFreeState(ctxt->states[i]); 949 xmlFree(ctxt->states); 950 } 951 if (ctxt->atoms != NULL) { 952 for (i = 0;i < ctxt->nbAtoms;i++) 953 xmlRegFreeAtom(ctxt->atoms[i]); 954 xmlFree(ctxt->atoms); 955 } 956 if (ctxt->counters != NULL) 957 xmlFree(ctxt->counters); 958 xmlFree(ctxt); 959 } 960 961 /************************************************************************ 962 * * 963 * Display of Data structures * 964 * * 965 ************************************************************************/ 966 967 static void 968 xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) { 969 switch (type) { 970 case XML_REGEXP_EPSILON: 971 fprintf(output, "epsilon "); break; 972 case XML_REGEXP_CHARVAL: 973 fprintf(output, "charval "); break; 974 case XML_REGEXP_RANGES: 975 fprintf(output, "ranges "); break; 976 case XML_REGEXP_SUBREG: 977 fprintf(output, "subexpr "); break; 978 case XML_REGEXP_STRING: 979 fprintf(output, "string "); break; 980 case XML_REGEXP_ANYCHAR: 981 fprintf(output, "anychar "); break; 982 case XML_REGEXP_ANYSPACE: 983 fprintf(output, "anyspace "); break; 984 case XML_REGEXP_NOTSPACE: 985 fprintf(output, "notspace "); break; 986 case XML_REGEXP_INITNAME: 987 fprintf(output, "initname "); break; 988 case XML_REGEXP_NOTINITNAME: 989 fprintf(output, "notinitname "); break; 990 case XML_REGEXP_NAMECHAR: 991 fprintf(output, "namechar "); break; 992 case XML_REGEXP_NOTNAMECHAR: 993 fprintf(output, "notnamechar "); break; 994 case XML_REGEXP_DECIMAL: 995 fprintf(output, "decimal "); break; 996 case XML_REGEXP_NOTDECIMAL: 997 fprintf(output, "notdecimal "); break; 998 case XML_REGEXP_REALCHAR: 999 fprintf(output, "realchar "); break; 1000 case XML_REGEXP_NOTREALCHAR: 1001 fprintf(output, "notrealchar "); break; 1002 case XML_REGEXP_LETTER: 1003 fprintf(output, "LETTER "); break; 1004 case XML_REGEXP_LETTER_UPPERCASE: 1005 fprintf(output, "LETTER_UPPERCASE "); break; 1006 case XML_REGEXP_LETTER_LOWERCASE: 1007 fprintf(output, "LETTER_LOWERCASE "); break; 1008 case XML_REGEXP_LETTER_TITLECASE: 1009 fprintf(output, "LETTER_TITLECASE "); break; 1010 case XML_REGEXP_LETTER_MODIFIER: 1011 fprintf(output, "LETTER_MODIFIER "); break; 1012 case XML_REGEXP_LETTER_OTHERS: 1013 fprintf(output, "LETTER_OTHERS "); break; 1014 case XML_REGEXP_MARK: 1015 fprintf(output, "MARK "); break; 1016 case XML_REGEXP_MARK_NONSPACING: 1017 fprintf(output, "MARK_NONSPACING "); break; 1018 case XML_REGEXP_MARK_SPACECOMBINING: 1019 fprintf(output, "MARK_SPACECOMBINING "); break; 1020 case XML_REGEXP_MARK_ENCLOSING: 1021 fprintf(output, "MARK_ENCLOSING "); break; 1022 case XML_REGEXP_NUMBER: 1023 fprintf(output, "NUMBER "); break; 1024 case XML_REGEXP_NUMBER_DECIMAL: 1025 fprintf(output, "NUMBER_DECIMAL "); break; 1026 case XML_REGEXP_NUMBER_LETTER: 1027 fprintf(output, "NUMBER_LETTER "); break; 1028 case XML_REGEXP_NUMBER_OTHERS: 1029 fprintf(output, "NUMBER_OTHERS "); break; 1030 case XML_REGEXP_PUNCT: 1031 fprintf(output, "PUNCT "); break; 1032 case XML_REGEXP_PUNCT_CONNECTOR: 1033 fprintf(output, "PUNCT_CONNECTOR "); break; 1034 case XML_REGEXP_PUNCT_DASH: 1035 fprintf(output, "PUNCT_DASH "); break; 1036 case XML_REGEXP_PUNCT_OPEN: 1037 fprintf(output, "PUNCT_OPEN "); break; 1038 case XML_REGEXP_PUNCT_CLOSE: 1039 fprintf(output, "PUNCT_CLOSE "); break; 1040 case XML_REGEXP_PUNCT_INITQUOTE: 1041 fprintf(output, "PUNCT_INITQUOTE "); break; 1042 case XML_REGEXP_PUNCT_FINQUOTE: 1043 fprintf(output, "PUNCT_FINQUOTE "); break; 1044 case XML_REGEXP_PUNCT_OTHERS: 1045 fprintf(output, "PUNCT_OTHERS "); break; 1046 case XML_REGEXP_SEPAR: 1047 fprintf(output, "SEPAR "); break; 1048 case XML_REGEXP_SEPAR_SPACE: 1049 fprintf(output, "SEPAR_SPACE "); break; 1050 case XML_REGEXP_SEPAR_LINE: 1051 fprintf(output, "SEPAR_LINE "); break; 1052 case XML_REGEXP_SEPAR_PARA: 1053 fprintf(output, "SEPAR_PARA "); break; 1054 case XML_REGEXP_SYMBOL: 1055 fprintf(output, "SYMBOL "); break; 1056 case XML_REGEXP_SYMBOL_MATH: 1057 fprintf(output, "SYMBOL_MATH "); break; 1058 case XML_REGEXP_SYMBOL_CURRENCY: 1059 fprintf(output, "SYMBOL_CURRENCY "); break; 1060 case XML_REGEXP_SYMBOL_MODIFIER: 1061 fprintf(output, "SYMBOL_MODIFIER "); break; 1062 case XML_REGEXP_SYMBOL_OTHERS: 1063 fprintf(output, "SYMBOL_OTHERS "); break; 1064 case XML_REGEXP_OTHER: 1065 fprintf(output, "OTHER "); break; 1066 case XML_REGEXP_OTHER_CONTROL: 1067 fprintf(output, "OTHER_CONTROL "); break; 1068 case XML_REGEXP_OTHER_FORMAT: 1069 fprintf(output, "OTHER_FORMAT "); break; 1070 case XML_REGEXP_OTHER_PRIVATE: 1071 fprintf(output, "OTHER_PRIVATE "); break; 1072 case XML_REGEXP_OTHER_NA: 1073 fprintf(output, "OTHER_NA "); break; 1074 case XML_REGEXP_BLOCK_NAME: 1075 fprintf(output, "BLOCK "); break; 1076 } 1077 } 1078 1079 static void 1080 xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) { 1081 switch (type) { 1082 case XML_REGEXP_QUANT_EPSILON: 1083 fprintf(output, "epsilon "); break; 1084 case XML_REGEXP_QUANT_ONCE: 1085 fprintf(output, "once "); break; 1086 case XML_REGEXP_QUANT_OPT: 1087 fprintf(output, "? "); break; 1088 case XML_REGEXP_QUANT_MULT: 1089 fprintf(output, "* "); break; 1090 case XML_REGEXP_QUANT_PLUS: 1091 fprintf(output, "+ "); break; 1092 case XML_REGEXP_QUANT_RANGE: 1093 fprintf(output, "range "); break; 1094 case XML_REGEXP_QUANT_ONCEONLY: 1095 fprintf(output, "onceonly "); break; 1096 case XML_REGEXP_QUANT_ALL: 1097 fprintf(output, "all "); break; 1098 } 1099 } 1100 static void 1101 xmlRegPrintRange(FILE *output, xmlRegRangePtr range) { 1102 fprintf(output, " range: "); 1103 if (range->neg) 1104 fprintf(output, "negative "); 1105 xmlRegPrintAtomType(output, range->type); 1106 fprintf(output, "%c - %c\n", range->start, range->end); 1107 } 1108 1109 static void 1110 xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) { 1111 fprintf(output, " atom: "); 1112 if (atom == NULL) { 1113 fprintf(output, "NULL\n"); 1114 return; 1115 } 1116 if (atom->neg) 1117 fprintf(output, "not "); 1118 xmlRegPrintAtomType(output, atom->type); 1119 xmlRegPrintQuantType(output, atom->quant); 1120 if (atom->quant == XML_REGEXP_QUANT_RANGE) 1121 fprintf(output, "%d-%d ", atom->min, atom->max); 1122 if (atom->type == XML_REGEXP_STRING) 1123 fprintf(output, "'%s' ", (char *) atom->valuep); 1124 if (atom->type == XML_REGEXP_CHARVAL) 1125 fprintf(output, "char %c\n", atom->codepoint); 1126 else if (atom->type == XML_REGEXP_RANGES) { 1127 int i; 1128 fprintf(output, "%d entries\n", atom->nbRanges); 1129 for (i = 0; i < atom->nbRanges;i++) 1130 xmlRegPrintRange(output, atom->ranges[i]); 1131 } else if (atom->type == XML_REGEXP_SUBREG) { 1132 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no); 1133 } else { 1134 fprintf(output, "\n"); 1135 } 1136 } 1137 1138 static void 1139 xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) { 1140 fprintf(output, " trans: "); 1141 if (trans == NULL) { 1142 fprintf(output, "NULL\n"); 1143 return; 1144 } 1145 if (trans->to < 0) { 1146 fprintf(output, "removed\n"); 1147 return; 1148 } 1149 if (trans->nd != 0) { 1150 if (trans->nd == 2) 1151 fprintf(output, "last not determinist, "); 1152 else 1153 fprintf(output, "not determinist, "); 1154 } 1155 if (trans->counter >= 0) { 1156 fprintf(output, "counted %d, ", trans->counter); 1157 } 1158 if (trans->count == REGEXP_ALL_COUNTER) { 1159 fprintf(output, "all transition, "); 1160 } else if (trans->count >= 0) { 1161 fprintf(output, "count based %d, ", trans->count); 1162 } 1163 if (trans->atom == NULL) { 1164 fprintf(output, "epsilon to %d\n", trans->to); 1165 return; 1166 } 1167 if (trans->atom->type == XML_REGEXP_CHARVAL) 1168 fprintf(output, "char %c ", trans->atom->codepoint); 1169 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to); 1170 } 1171 1172 static void 1173 xmlRegPrintState(FILE *output, xmlRegStatePtr state) { 1174 int i; 1175 1176 fprintf(output, " state: "); 1177 if (state == NULL) { 1178 fprintf(output, "NULL\n"); 1179 return; 1180 } 1181 if (state->type == XML_REGEXP_START_STATE) 1182 fprintf(output, "START "); 1183 if (state->type == XML_REGEXP_FINAL_STATE) 1184 fprintf(output, "FINAL "); 1185 1186 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans); 1187 for (i = 0;i < state->nbTrans; i++) { 1188 xmlRegPrintTrans(output, &(state->trans[i])); 1189 } 1190 } 1191 1192 #ifdef DEBUG_REGEXP_GRAPH 1193 static void 1194 xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) { 1195 int i; 1196 1197 fprintf(output, " ctxt: "); 1198 if (ctxt == NULL) { 1199 fprintf(output, "NULL\n"); 1200 return; 1201 } 1202 fprintf(output, "'%s' ", ctxt->string); 1203 if (ctxt->error) 1204 fprintf(output, "error "); 1205 if (ctxt->neg) 1206 fprintf(output, "neg "); 1207 fprintf(output, "\n"); 1208 fprintf(output, "%d atoms:\n", ctxt->nbAtoms); 1209 for (i = 0;i < ctxt->nbAtoms; i++) { 1210 fprintf(output, " %02d ", i); 1211 xmlRegPrintAtom(output, ctxt->atoms[i]); 1212 } 1213 if (ctxt->atom != NULL) { 1214 fprintf(output, "current atom:\n"); 1215 xmlRegPrintAtom(output, ctxt->atom); 1216 } 1217 fprintf(output, "%d states:", ctxt->nbStates); 1218 if (ctxt->start != NULL) 1219 fprintf(output, " start: %d", ctxt->start->no); 1220 if (ctxt->end != NULL) 1221 fprintf(output, " end: %d", ctxt->end->no); 1222 fprintf(output, "\n"); 1223 for (i = 0;i < ctxt->nbStates; i++) { 1224 xmlRegPrintState(output, ctxt->states[i]); 1225 } 1226 fprintf(output, "%d counters:\n", ctxt->nbCounters); 1227 for (i = 0;i < ctxt->nbCounters; i++) { 1228 fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min, 1229 ctxt->counters[i].max); 1230 } 1231 } 1232 #endif 1233 1234 /************************************************************************ 1235 * * 1236 * Finite Automata structures manipulations * 1237 * * 1238 ************************************************************************/ 1239 1240 static void 1241 xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom, 1242 int neg, xmlRegAtomType type, int start, int end, 1243 xmlChar *blockName) { 1244 xmlRegRangePtr range; 1245 1246 if (atom == NULL) { 1247 ERROR("add range: atom is NULL"); 1248 return; 1249 } 1250 if (atom->type != XML_REGEXP_RANGES) { 1251 ERROR("add range: atom is not ranges"); 1252 return; 1253 } 1254 if (atom->maxRanges == 0) { 1255 atom->maxRanges = 4; 1256 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges * 1257 sizeof(xmlRegRangePtr)); 1258 if (atom->ranges == NULL) { 1259 xmlRegexpErrMemory(ctxt, "adding ranges"); 1260 atom->maxRanges = 0; 1261 return; 1262 } 1263 } else if (atom->nbRanges >= atom->maxRanges) { 1264 xmlRegRangePtr *tmp; 1265 atom->maxRanges *= 2; 1266 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges * 1267 sizeof(xmlRegRangePtr)); 1268 if (tmp == NULL) { 1269 xmlRegexpErrMemory(ctxt, "adding ranges"); 1270 atom->maxRanges /= 2; 1271 return; 1272 } 1273 atom->ranges = tmp; 1274 } 1275 range = xmlRegNewRange(ctxt, neg, type, start, end); 1276 if (range == NULL) 1277 return; 1278 range->blockName = blockName; 1279 atom->ranges[atom->nbRanges++] = range; 1280 1281 } 1282 1283 static int 1284 xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) { 1285 if (ctxt->maxCounters == 0) { 1286 ctxt->maxCounters = 4; 1287 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters * 1288 sizeof(xmlRegCounter)); 1289 if (ctxt->counters == NULL) { 1290 xmlRegexpErrMemory(ctxt, "allocating counter"); 1291 ctxt->maxCounters = 0; 1292 return(-1); 1293 } 1294 } else if (ctxt->nbCounters >= ctxt->maxCounters) { 1295 xmlRegCounter *tmp; 1296 ctxt->maxCounters *= 2; 1297 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters * 1298 sizeof(xmlRegCounter)); 1299 if (tmp == NULL) { 1300 xmlRegexpErrMemory(ctxt, "allocating counter"); 1301 ctxt->maxCounters /= 2; 1302 return(-1); 1303 } 1304 ctxt->counters = tmp; 1305 } 1306 ctxt->counters[ctxt->nbCounters].min = -1; 1307 ctxt->counters[ctxt->nbCounters].max = -1; 1308 return(ctxt->nbCounters++); 1309 } 1310 1311 static int 1312 xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) { 1313 if (atom == NULL) { 1314 ERROR("atom push: atom is NULL"); 1315 return(-1); 1316 } 1317 if (ctxt->maxAtoms == 0) { 1318 ctxt->maxAtoms = 4; 1319 ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms * 1320 sizeof(xmlRegAtomPtr)); 1321 if (ctxt->atoms == NULL) { 1322 xmlRegexpErrMemory(ctxt, "pushing atom"); 1323 ctxt->maxAtoms = 0; 1324 return(-1); 1325 } 1326 } else if (ctxt->nbAtoms >= ctxt->maxAtoms) { 1327 xmlRegAtomPtr *tmp; 1328 ctxt->maxAtoms *= 2; 1329 tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms * 1330 sizeof(xmlRegAtomPtr)); 1331 if (tmp == NULL) { 1332 xmlRegexpErrMemory(ctxt, "allocating counter"); 1333 ctxt->maxAtoms /= 2; 1334 return(-1); 1335 } 1336 ctxt->atoms = tmp; 1337 } 1338 atom->no = ctxt->nbAtoms; 1339 ctxt->atoms[ctxt->nbAtoms++] = atom; 1340 return(0); 1341 } 1342 1343 static void 1344 xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target, 1345 int from) { 1346 if (target->maxTransTo == 0) { 1347 target->maxTransTo = 8; 1348 target->transTo = (int *) xmlMalloc(target->maxTransTo * 1349 sizeof(int)); 1350 if (target->transTo == NULL) { 1351 xmlRegexpErrMemory(ctxt, "adding transition"); 1352 target->maxTransTo = 0; 1353 return; 1354 } 1355 } else if (target->nbTransTo >= target->maxTransTo) { 1356 int *tmp; 1357 target->maxTransTo *= 2; 1358 tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo * 1359 sizeof(int)); 1360 if (tmp == NULL) { 1361 xmlRegexpErrMemory(ctxt, "adding transition"); 1362 target->maxTransTo /= 2; 1363 return; 1364 } 1365 target->transTo = tmp; 1366 } 1367 target->transTo[target->nbTransTo] = from; 1368 target->nbTransTo++; 1369 } 1370 1371 static void 1372 xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 1373 xmlRegAtomPtr atom, xmlRegStatePtr target, 1374 int counter, int count) { 1375 1376 int nrtrans; 1377 1378 if (state == NULL) { 1379 ERROR("add state: state is NULL"); 1380 return; 1381 } 1382 if (target == NULL) { 1383 ERROR("add state: target is NULL"); 1384 return; 1385 } 1386 /* 1387 * Other routines follow the philosophy 'When in doubt, add a transition' 1388 * so we check here whether such a transition is already present and, if 1389 * so, silently ignore this request. 1390 */ 1391 1392 for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) { 1393 xmlRegTransPtr trans = &(state->trans[nrtrans]); 1394 if ((trans->atom == atom) && 1395 (trans->to == target->no) && 1396 (trans->counter == counter) && 1397 (trans->count == count)) { 1398 #ifdef DEBUG_REGEXP_GRAPH 1399 printf("Ignoring duplicate transition from %d to %d\n", 1400 state->no, target->no); 1401 #endif 1402 return; 1403 } 1404 } 1405 1406 if (state->maxTrans == 0) { 1407 state->maxTrans = 8; 1408 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans * 1409 sizeof(xmlRegTrans)); 1410 if (state->trans == NULL) { 1411 xmlRegexpErrMemory(ctxt, "adding transition"); 1412 state->maxTrans = 0; 1413 return; 1414 } 1415 } else if (state->nbTrans >= state->maxTrans) { 1416 xmlRegTrans *tmp; 1417 state->maxTrans *= 2; 1418 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans * 1419 sizeof(xmlRegTrans)); 1420 if (tmp == NULL) { 1421 xmlRegexpErrMemory(ctxt, "adding transition"); 1422 state->maxTrans /= 2; 1423 return; 1424 } 1425 state->trans = tmp; 1426 } 1427 #ifdef DEBUG_REGEXP_GRAPH 1428 printf("Add trans from %d to %d ", state->no, target->no); 1429 if (count == REGEXP_ALL_COUNTER) 1430 printf("all transition\n"); 1431 else if (count >= 0) 1432 printf("count based %d\n", count); 1433 else if (counter >= 0) 1434 printf("counted %d\n", counter); 1435 else if (atom == NULL) 1436 printf("epsilon transition\n"); 1437 else if (atom != NULL) 1438 xmlRegPrintAtom(stdout, atom); 1439 #endif 1440 1441 state->trans[state->nbTrans].atom = atom; 1442 state->trans[state->nbTrans].to = target->no; 1443 state->trans[state->nbTrans].counter = counter; 1444 state->trans[state->nbTrans].count = count; 1445 state->trans[state->nbTrans].nd = 0; 1446 state->nbTrans++; 1447 xmlRegStateAddTransTo(ctxt, target, state->no); 1448 } 1449 1450 static int 1451 xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) { 1452 if (state == NULL) return(-1); 1453 if (ctxt->maxStates == 0) { 1454 ctxt->maxStates = 4; 1455 ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates * 1456 sizeof(xmlRegStatePtr)); 1457 if (ctxt->states == NULL) { 1458 xmlRegexpErrMemory(ctxt, "adding state"); 1459 ctxt->maxStates = 0; 1460 return(-1); 1461 } 1462 } else if (ctxt->nbStates >= ctxt->maxStates) { 1463 xmlRegStatePtr *tmp; 1464 ctxt->maxStates *= 2; 1465 tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates * 1466 sizeof(xmlRegStatePtr)); 1467 if (tmp == NULL) { 1468 xmlRegexpErrMemory(ctxt, "adding state"); 1469 ctxt->maxStates /= 2; 1470 return(-1); 1471 } 1472 ctxt->states = tmp; 1473 } 1474 state->no = ctxt->nbStates; 1475 ctxt->states[ctxt->nbStates++] = state; 1476 return(0); 1477 } 1478 1479 /** 1480 * xmlFAGenerateAllTransition: 1481 * @ctxt: a regexp parser context 1482 * @from: the from state 1483 * @to: the target state or NULL for building a new one 1484 * @lax: 1485 * 1486 */ 1487 static void 1488 xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt, 1489 xmlRegStatePtr from, xmlRegStatePtr to, 1490 int lax) { 1491 if (to == NULL) { 1492 to = xmlRegNewState(ctxt); 1493 xmlRegStatePush(ctxt, to); 1494 ctxt->state = to; 1495 } 1496 if (lax) 1497 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER); 1498 else 1499 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER); 1500 } 1501 1502 /** 1503 * xmlFAGenerateEpsilonTransition: 1504 * @ctxt: a regexp parser context 1505 * @from: the from state 1506 * @to: the target state or NULL for building a new one 1507 * 1508 */ 1509 static void 1510 xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt, 1511 xmlRegStatePtr from, xmlRegStatePtr to) { 1512 if (to == NULL) { 1513 to = xmlRegNewState(ctxt); 1514 xmlRegStatePush(ctxt, to); 1515 ctxt->state = to; 1516 } 1517 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1); 1518 } 1519 1520 /** 1521 * xmlFAGenerateCountedEpsilonTransition: 1522 * @ctxt: a regexp parser context 1523 * @from: the from state 1524 * @to: the target state or NULL for building a new one 1525 * counter: the counter for that transition 1526 * 1527 */ 1528 static void 1529 xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt, 1530 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1531 if (to == NULL) { 1532 to = xmlRegNewState(ctxt); 1533 xmlRegStatePush(ctxt, to); 1534 ctxt->state = to; 1535 } 1536 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1); 1537 } 1538 1539 /** 1540 * xmlFAGenerateCountedTransition: 1541 * @ctxt: a regexp parser context 1542 * @from: the from state 1543 * @to: the target state or NULL for building a new one 1544 * counter: the counter for that transition 1545 * 1546 */ 1547 static void 1548 xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt, 1549 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1550 if (to == NULL) { 1551 to = xmlRegNewState(ctxt); 1552 xmlRegStatePush(ctxt, to); 1553 ctxt->state = to; 1554 } 1555 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter); 1556 } 1557 1558 /** 1559 * xmlFAGenerateTransitions: 1560 * @ctxt: a regexp parser context 1561 * @from: the from state 1562 * @to: the target state or NULL for building a new one 1563 * @atom: the atom generating the transition 1564 * 1565 * Returns 0 if success and -1 in case of error. 1566 */ 1567 static int 1568 xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, 1569 xmlRegStatePtr to, xmlRegAtomPtr atom) { 1570 xmlRegStatePtr end; 1571 int nullable = 0; 1572 1573 if (atom == NULL) { 1574 ERROR("generate transition: atom == NULL"); 1575 return(-1); 1576 } 1577 if (atom->type == XML_REGEXP_SUBREG) { 1578 /* 1579 * this is a subexpression handling one should not need to 1580 * create a new node except for XML_REGEXP_QUANT_RANGE. 1581 */ 1582 if (xmlRegAtomPush(ctxt, atom) < 0) { 1583 return(-1); 1584 } 1585 if ((to != NULL) && (atom->stop != to) && 1586 (atom->quant != XML_REGEXP_QUANT_RANGE)) { 1587 /* 1588 * Generate an epsilon transition to link to the target 1589 */ 1590 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); 1591 #ifdef DV 1592 } else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) && 1593 (atom->quant != XML_REGEXP_QUANT_ONCE)) { 1594 to = xmlRegNewState(ctxt); 1595 xmlRegStatePush(ctxt, to); 1596 ctxt->state = to; 1597 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); 1598 #endif 1599 } 1600 switch (atom->quant) { 1601 case XML_REGEXP_QUANT_OPT: 1602 atom->quant = XML_REGEXP_QUANT_ONCE; 1603 /* 1604 * transition done to the state after end of atom. 1605 * 1. set transition from atom start to new state 1606 * 2. set transition from atom end to this state. 1607 */ 1608 if (to == NULL) { 1609 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0); 1610 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, 1611 ctxt->state); 1612 } else { 1613 xmlFAGenerateEpsilonTransition(ctxt, atom->start, to); 1614 } 1615 break; 1616 case XML_REGEXP_QUANT_MULT: 1617 atom->quant = XML_REGEXP_QUANT_ONCE; 1618 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop); 1619 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1620 break; 1621 case XML_REGEXP_QUANT_PLUS: 1622 atom->quant = XML_REGEXP_QUANT_ONCE; 1623 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1624 break; 1625 case XML_REGEXP_QUANT_RANGE: { 1626 int counter; 1627 xmlRegStatePtr inter, newstate; 1628 1629 /* 1630 * create the final state now if needed 1631 */ 1632 if (to != NULL) { 1633 newstate = to; 1634 } else { 1635 newstate = xmlRegNewState(ctxt); 1636 xmlRegStatePush(ctxt, newstate); 1637 } 1638 1639 /* 1640 * The principle here is to use counted transition 1641 * to avoid explosion in the number of states in the 1642 * graph. This is clearly more complex but should not 1643 * be exploitable at runtime. 1644 */ 1645 if ((atom->min == 0) && (atom->start0 == NULL)) { 1646 xmlRegAtomPtr copy; 1647 /* 1648 * duplicate a transition based on atom to count next 1649 * occurrences after 1. We cannot loop to atom->start 1650 * directly because we need an epsilon transition to 1651 * newstate. 1652 */ 1653 /* ???? For some reason it seems we never reach that 1654 case, I suppose this got optimized out before when 1655 building the automata */ 1656 copy = xmlRegCopyAtom(ctxt, atom); 1657 if (copy == NULL) 1658 return(-1); 1659 copy->quant = XML_REGEXP_QUANT_ONCE; 1660 copy->min = 0; 1661 copy->max = 0; 1662 1663 if (xmlFAGenerateTransitions(ctxt, atom->start, NULL, copy) 1664 < 0) 1665 return(-1); 1666 inter = ctxt->state; 1667 counter = xmlRegGetCounter(ctxt); 1668 ctxt->counters[counter].min = atom->min - 1; 1669 ctxt->counters[counter].max = atom->max - 1; 1670 /* count the number of times we see it again */ 1671 xmlFAGenerateCountedEpsilonTransition(ctxt, inter, 1672 atom->stop, counter); 1673 /* allow a way out based on the count */ 1674 xmlFAGenerateCountedTransition(ctxt, inter, 1675 newstate, counter); 1676 /* and also allow a direct exit for 0 */ 1677 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 1678 newstate); 1679 } else { 1680 /* 1681 * either we need the atom at least once or there 1682 * is an atom->start0 allowing to easily plug the 1683 * epsilon transition. 1684 */ 1685 counter = xmlRegGetCounter(ctxt); 1686 ctxt->counters[counter].min = atom->min - 1; 1687 ctxt->counters[counter].max = atom->max - 1; 1688 /* allow a way out based on the count */ 1689 xmlFAGenerateCountedTransition(ctxt, atom->stop, 1690 newstate, counter); 1691 /* count the number of times we see it again */ 1692 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop, 1693 atom->start, counter); 1694 /* and if needed allow a direct exit for 0 */ 1695 if (atom->min == 0) 1696 xmlFAGenerateEpsilonTransition(ctxt, atom->start0, 1697 newstate); 1698 1699 } 1700 atom->min = 0; 1701 atom->max = 0; 1702 atom->quant = XML_REGEXP_QUANT_ONCE; 1703 ctxt->state = newstate; 1704 } 1705 default: 1706 break; 1707 } 1708 return(0); 1709 } 1710 if ((atom->min == 0) && (atom->max == 0) && 1711 (atom->quant == XML_REGEXP_QUANT_RANGE)) { 1712 /* 1713 * we can discard the atom and generate an epsilon transition instead 1714 */ 1715 if (to == NULL) { 1716 to = xmlRegNewState(ctxt); 1717 if (to != NULL) 1718 xmlRegStatePush(ctxt, to); 1719 else { 1720 return(-1); 1721 } 1722 } 1723 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1724 ctxt->state = to; 1725 xmlRegFreeAtom(atom); 1726 return(0); 1727 } 1728 if (to == NULL) { 1729 to = xmlRegNewState(ctxt); 1730 if (to != NULL) 1731 xmlRegStatePush(ctxt, to); 1732 else { 1733 return(-1); 1734 } 1735 } 1736 end = to; 1737 if ((atom->quant == XML_REGEXP_QUANT_MULT) || 1738 (atom->quant == XML_REGEXP_QUANT_PLUS)) { 1739 /* 1740 * Do not pollute the target state by adding transitions from 1741 * it as it is likely to be the shared target of multiple branches. 1742 * So isolate with an epsilon transition. 1743 */ 1744 xmlRegStatePtr tmp; 1745 1746 tmp = xmlRegNewState(ctxt); 1747 if (tmp != NULL) 1748 xmlRegStatePush(ctxt, tmp); 1749 else { 1750 return(-1); 1751 } 1752 xmlFAGenerateEpsilonTransition(ctxt, tmp, to); 1753 to = tmp; 1754 } 1755 if (xmlRegAtomPush(ctxt, atom) < 0) { 1756 return(-1); 1757 } 1758 if ((atom->quant == XML_REGEXP_QUANT_RANGE) && 1759 (atom->min == 0) && (atom->max > 0)) { 1760 nullable = 1; 1761 atom->min = 1; 1762 if (atom->max == 1) 1763 atom->quant = XML_REGEXP_QUANT_OPT; 1764 } 1765 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1); 1766 ctxt->state = end; 1767 switch (atom->quant) { 1768 case XML_REGEXP_QUANT_OPT: 1769 atom->quant = XML_REGEXP_QUANT_ONCE; 1770 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1771 break; 1772 case XML_REGEXP_QUANT_MULT: 1773 atom->quant = XML_REGEXP_QUANT_ONCE; 1774 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1775 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); 1776 break; 1777 case XML_REGEXP_QUANT_PLUS: 1778 atom->quant = XML_REGEXP_QUANT_ONCE; 1779 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); 1780 break; 1781 case XML_REGEXP_QUANT_RANGE: 1782 if (nullable) 1783 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1784 break; 1785 default: 1786 break; 1787 } 1788 return(0); 1789 } 1790 1791 /** 1792 * xmlFAReduceEpsilonTransitions: 1793 * @ctxt: a regexp parser context 1794 * @fromnr: the from state 1795 * @tonr: the to state 1796 * @counter: should that transition be associated to a counted 1797 * 1798 */ 1799 static void 1800 xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr, 1801 int tonr, int counter) { 1802 int transnr; 1803 xmlRegStatePtr from; 1804 xmlRegStatePtr to; 1805 1806 #ifdef DEBUG_REGEXP_GRAPH 1807 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr); 1808 #endif 1809 from = ctxt->states[fromnr]; 1810 if (from == NULL) 1811 return; 1812 to = ctxt->states[tonr]; 1813 if (to == NULL) 1814 return; 1815 if ((to->mark == XML_REGEXP_MARK_START) || 1816 (to->mark == XML_REGEXP_MARK_VISITED)) 1817 return; 1818 1819 to->mark = XML_REGEXP_MARK_VISITED; 1820 if (to->type == XML_REGEXP_FINAL_STATE) { 1821 #ifdef DEBUG_REGEXP_GRAPH 1822 printf("State %d is final, so %d becomes final\n", tonr, fromnr); 1823 #endif 1824 from->type = XML_REGEXP_FINAL_STATE; 1825 } 1826 for (transnr = 0;transnr < to->nbTrans;transnr++) { 1827 if (to->trans[transnr].to < 0) 1828 continue; 1829 if (to->trans[transnr].atom == NULL) { 1830 /* 1831 * Don't remove counted transitions 1832 * Don't loop either 1833 */ 1834 if (to->trans[transnr].to != fromnr) { 1835 if (to->trans[transnr].count >= 0) { 1836 int newto = to->trans[transnr].to; 1837 1838 xmlRegStateAddTrans(ctxt, from, NULL, 1839 ctxt->states[newto], 1840 -1, to->trans[transnr].count); 1841 } else { 1842 #ifdef DEBUG_REGEXP_GRAPH 1843 printf("Found epsilon trans %d from %d to %d\n", 1844 transnr, tonr, to->trans[transnr].to); 1845 #endif 1846 if (to->trans[transnr].counter >= 0) { 1847 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1848 to->trans[transnr].to, 1849 to->trans[transnr].counter); 1850 } else { 1851 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1852 to->trans[transnr].to, 1853 counter); 1854 } 1855 } 1856 } 1857 } else { 1858 int newto = to->trans[transnr].to; 1859 1860 if (to->trans[transnr].counter >= 0) { 1861 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1862 ctxt->states[newto], 1863 to->trans[transnr].counter, -1); 1864 } else { 1865 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1866 ctxt->states[newto], counter, -1); 1867 } 1868 } 1869 } 1870 to->mark = XML_REGEXP_MARK_NORMAL; 1871 } 1872 1873 /** 1874 * xmlFAEliminateSimpleEpsilonTransitions: 1875 * @ctxt: a regexp parser context 1876 * 1877 * Eliminating general epsilon transitions can get costly in the general 1878 * algorithm due to the large amount of generated new transitions and 1879 * associated comparisons. However for simple epsilon transition used just 1880 * to separate building blocks when generating the automata this can be 1881 * reduced to state elimination: 1882 * - if there exists an epsilon from X to Y 1883 * - if there is no other transition from X 1884 * then X and Y are semantically equivalent and X can be eliminated 1885 * If X is the start state then make Y the start state, else replace the 1886 * target of all transitions to X by transitions to Y. 1887 * 1888 * If X is a final state, skip it. 1889 * Otherwise it would be necessary to manipulate counters for this case when 1890 * eliminating state 2: 1891 * State 1 has a transition with an atom to state 2. 1892 * State 2 is final and has an epsilon transition to state 1. 1893 */ 1894 static void 1895 xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { 1896 int statenr, i, j, newto; 1897 xmlRegStatePtr state, tmp; 1898 1899 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1900 state = ctxt->states[statenr]; 1901 if (state == NULL) 1902 continue; 1903 if (state->nbTrans != 1) 1904 continue; 1905 if (state->type == XML_REGEXP_UNREACH_STATE || 1906 state->type == XML_REGEXP_FINAL_STATE) 1907 continue; 1908 /* is the only transition out a basic transition */ 1909 if ((state->trans[0].atom == NULL) && 1910 (state->trans[0].to >= 0) && 1911 (state->trans[0].to != statenr) && 1912 (state->trans[0].counter < 0) && 1913 (state->trans[0].count < 0)) { 1914 newto = state->trans[0].to; 1915 1916 if (state->type == XML_REGEXP_START_STATE) { 1917 #ifdef DEBUG_REGEXP_GRAPH 1918 printf("Found simple epsilon trans from start %d to %d\n", 1919 statenr, newto); 1920 #endif 1921 } else { 1922 #ifdef DEBUG_REGEXP_GRAPH 1923 printf("Found simple epsilon trans from %d to %d\n", 1924 statenr, newto); 1925 #endif 1926 for (i = 0;i < state->nbTransTo;i++) { 1927 tmp = ctxt->states[state->transTo[i]]; 1928 for (j = 0;j < tmp->nbTrans;j++) { 1929 if (tmp->trans[j].to == statenr) { 1930 #ifdef DEBUG_REGEXP_GRAPH 1931 printf("Changed transition %d on %d to go to %d\n", 1932 j, tmp->no, newto); 1933 #endif 1934 tmp->trans[j].to = -1; 1935 xmlRegStateAddTrans(ctxt, tmp, tmp->trans[j].atom, 1936 ctxt->states[newto], 1937 tmp->trans[j].counter, 1938 tmp->trans[j].count); 1939 } 1940 } 1941 } 1942 if (state->type == XML_REGEXP_FINAL_STATE) 1943 ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE; 1944 /* eliminate the transition completely */ 1945 state->nbTrans = 0; 1946 1947 state->type = XML_REGEXP_UNREACH_STATE; 1948 1949 } 1950 1951 } 1952 } 1953 } 1954 /** 1955 * xmlFAEliminateEpsilonTransitions: 1956 * @ctxt: a regexp parser context 1957 * 1958 */ 1959 static void 1960 xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { 1961 int statenr, transnr; 1962 xmlRegStatePtr state; 1963 int has_epsilon; 1964 1965 if (ctxt->states == NULL) return; 1966 1967 /* 1968 * Eliminate simple epsilon transition and the associated unreachable 1969 * states. 1970 */ 1971 xmlFAEliminateSimpleEpsilonTransitions(ctxt); 1972 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1973 state = ctxt->states[statenr]; 1974 if ((state != NULL) && (state->type == XML_REGEXP_UNREACH_STATE)) { 1975 #ifdef DEBUG_REGEXP_GRAPH 1976 printf("Removed unreachable state %d\n", statenr); 1977 #endif 1978 xmlRegFreeState(state); 1979 ctxt->states[statenr] = NULL; 1980 } 1981 } 1982 1983 has_epsilon = 0; 1984 1985 /* 1986 * Build the completed transitions bypassing the epsilons 1987 * Use a marking algorithm to avoid loops 1988 * Mark sink states too. 1989 * Process from the latest states backward to the start when 1990 * there is long cascading epsilon chains this minimize the 1991 * recursions and transition compares when adding the new ones 1992 */ 1993 for (statenr = ctxt->nbStates - 1;statenr >= 0;statenr--) { 1994 state = ctxt->states[statenr]; 1995 if (state == NULL) 1996 continue; 1997 if ((state->nbTrans == 0) && 1998 (state->type != XML_REGEXP_FINAL_STATE)) { 1999 state->type = XML_REGEXP_SINK_STATE; 2000 } 2001 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2002 if ((state->trans[transnr].atom == NULL) && 2003 (state->trans[transnr].to >= 0)) { 2004 if (state->trans[transnr].to == statenr) { 2005 state->trans[transnr].to = -1; 2006 #ifdef DEBUG_REGEXP_GRAPH 2007 printf("Removed loopback epsilon trans %d on %d\n", 2008 transnr, statenr); 2009 #endif 2010 } else if (state->trans[transnr].count < 0) { 2011 int newto = state->trans[transnr].to; 2012 2013 #ifdef DEBUG_REGEXP_GRAPH 2014 printf("Found epsilon trans %d from %d to %d\n", 2015 transnr, statenr, newto); 2016 #endif 2017 has_epsilon = 1; 2018 state->trans[transnr].to = -2; 2019 state->mark = XML_REGEXP_MARK_START; 2020 xmlFAReduceEpsilonTransitions(ctxt, statenr, 2021 newto, state->trans[transnr].counter); 2022 state->mark = XML_REGEXP_MARK_NORMAL; 2023 #ifdef DEBUG_REGEXP_GRAPH 2024 } else { 2025 printf("Found counted transition %d on %d\n", 2026 transnr, statenr); 2027 #endif 2028 } 2029 } 2030 } 2031 } 2032 /* 2033 * Eliminate the epsilon transitions 2034 */ 2035 if (has_epsilon) { 2036 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2037 state = ctxt->states[statenr]; 2038 if (state == NULL) 2039 continue; 2040 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2041 xmlRegTransPtr trans = &(state->trans[transnr]); 2042 if ((trans->atom == NULL) && 2043 (trans->count < 0) && 2044 (trans->to >= 0)) { 2045 trans->to = -1; 2046 } 2047 } 2048 } 2049 } 2050 2051 /* 2052 * Use this pass to detect unreachable states too 2053 */ 2054 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2055 state = ctxt->states[statenr]; 2056 if (state != NULL) 2057 state->reached = XML_REGEXP_MARK_NORMAL; 2058 } 2059 state = ctxt->states[0]; 2060 if (state != NULL) 2061 state->reached = XML_REGEXP_MARK_START; 2062 while (state != NULL) { 2063 xmlRegStatePtr target = NULL; 2064 state->reached = XML_REGEXP_MARK_VISITED; 2065 /* 2066 * Mark all states reachable from the current reachable state 2067 */ 2068 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2069 if ((state->trans[transnr].to >= 0) && 2070 ((state->trans[transnr].atom != NULL) || 2071 (state->trans[transnr].count >= 0))) { 2072 int newto = state->trans[transnr].to; 2073 2074 if (ctxt->states[newto] == NULL) 2075 continue; 2076 if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) { 2077 ctxt->states[newto]->reached = XML_REGEXP_MARK_START; 2078 target = ctxt->states[newto]; 2079 } 2080 } 2081 } 2082 2083 /* 2084 * find the next accessible state not explored 2085 */ 2086 if (target == NULL) { 2087 for (statenr = 1;statenr < ctxt->nbStates;statenr++) { 2088 state = ctxt->states[statenr]; 2089 if ((state != NULL) && (state->reached == 2090 XML_REGEXP_MARK_START)) { 2091 target = state; 2092 break; 2093 } 2094 } 2095 } 2096 state = target; 2097 } 2098 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2099 state = ctxt->states[statenr]; 2100 if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) { 2101 #ifdef DEBUG_REGEXP_GRAPH 2102 printf("Removed unreachable state %d\n", statenr); 2103 #endif 2104 xmlRegFreeState(state); 2105 ctxt->states[statenr] = NULL; 2106 } 2107 } 2108 2109 } 2110 2111 static int 2112 xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) { 2113 int ret = 0; 2114 2115 if ((range1->type == XML_REGEXP_RANGES) || 2116 (range2->type == XML_REGEXP_RANGES) || 2117 (range2->type == XML_REGEXP_SUBREG) || 2118 (range1->type == XML_REGEXP_SUBREG) || 2119 (range1->type == XML_REGEXP_STRING) || 2120 (range2->type == XML_REGEXP_STRING)) 2121 return(-1); 2122 2123 /* put them in order */ 2124 if (range1->type > range2->type) { 2125 xmlRegRangePtr tmp; 2126 2127 tmp = range1; 2128 range1 = range2; 2129 range2 = tmp; 2130 } 2131 if ((range1->type == XML_REGEXP_ANYCHAR) || 2132 (range2->type == XML_REGEXP_ANYCHAR)) { 2133 ret = 1; 2134 } else if ((range1->type == XML_REGEXP_EPSILON) || 2135 (range2->type == XML_REGEXP_EPSILON)) { 2136 return(0); 2137 } else if (range1->type == range2->type) { 2138 if (range1->type != XML_REGEXP_CHARVAL) 2139 ret = 1; 2140 else if ((range1->end < range2->start) || 2141 (range2->end < range1->start)) 2142 ret = 0; 2143 else 2144 ret = 1; 2145 } else if (range1->type == XML_REGEXP_CHARVAL) { 2146 int codepoint; 2147 int neg = 0; 2148 2149 /* 2150 * just check all codepoints in the range for acceptance, 2151 * this is usually way cheaper since done only once at 2152 * compilation than testing over and over at runtime or 2153 * pushing too many states when evaluating. 2154 */ 2155 if (((range1->neg == 0) && (range2->neg != 0)) || 2156 ((range1->neg != 0) && (range2->neg == 0))) 2157 neg = 1; 2158 2159 for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) { 2160 ret = xmlRegCheckCharacterRange(range2->type, codepoint, 2161 0, range2->start, range2->end, 2162 range2->blockName); 2163 if (ret < 0) 2164 return(-1); 2165 if (((neg == 1) && (ret == 0)) || 2166 ((neg == 0) && (ret == 1))) 2167 return(1); 2168 } 2169 return(0); 2170 } else if ((range1->type == XML_REGEXP_BLOCK_NAME) || 2171 (range2->type == XML_REGEXP_BLOCK_NAME)) { 2172 if (range1->type == range2->type) { 2173 ret = xmlStrEqual(range1->blockName, range2->blockName); 2174 } else { 2175 /* 2176 * comparing a block range with anything else is way 2177 * too costly, and maintaining the table is like too much 2178 * memory too, so let's force the automata to save state 2179 * here. 2180 */ 2181 return(1); 2182 } 2183 } else if ((range1->type < XML_REGEXP_LETTER) || 2184 (range2->type < XML_REGEXP_LETTER)) { 2185 if ((range1->type == XML_REGEXP_ANYSPACE) && 2186 (range2->type == XML_REGEXP_NOTSPACE)) 2187 ret = 0; 2188 else if ((range1->type == XML_REGEXP_INITNAME) && 2189 (range2->type == XML_REGEXP_NOTINITNAME)) 2190 ret = 0; 2191 else if ((range1->type == XML_REGEXP_NAMECHAR) && 2192 (range2->type == XML_REGEXP_NOTNAMECHAR)) 2193 ret = 0; 2194 else if ((range1->type == XML_REGEXP_DECIMAL) && 2195 (range2->type == XML_REGEXP_NOTDECIMAL)) 2196 ret = 0; 2197 else if ((range1->type == XML_REGEXP_REALCHAR) && 2198 (range2->type == XML_REGEXP_NOTREALCHAR)) 2199 ret = 0; 2200 else { 2201 /* same thing to limit complexity */ 2202 return(1); 2203 } 2204 } else { 2205 ret = 0; 2206 /* range1->type < range2->type here */ 2207 switch (range1->type) { 2208 case XML_REGEXP_LETTER: 2209 /* all disjoint except in the subgroups */ 2210 if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) || 2211 (range2->type == XML_REGEXP_LETTER_LOWERCASE) || 2212 (range2->type == XML_REGEXP_LETTER_TITLECASE) || 2213 (range2->type == XML_REGEXP_LETTER_MODIFIER) || 2214 (range2->type == XML_REGEXP_LETTER_OTHERS)) 2215 ret = 1; 2216 break; 2217 case XML_REGEXP_MARK: 2218 if ((range2->type == XML_REGEXP_MARK_NONSPACING) || 2219 (range2->type == XML_REGEXP_MARK_SPACECOMBINING) || 2220 (range2->type == XML_REGEXP_MARK_ENCLOSING)) 2221 ret = 1; 2222 break; 2223 case XML_REGEXP_NUMBER: 2224 if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) || 2225 (range2->type == XML_REGEXP_NUMBER_LETTER) || 2226 (range2->type == XML_REGEXP_NUMBER_OTHERS)) 2227 ret = 1; 2228 break; 2229 case XML_REGEXP_PUNCT: 2230 if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) || 2231 (range2->type == XML_REGEXP_PUNCT_DASH) || 2232 (range2->type == XML_REGEXP_PUNCT_OPEN) || 2233 (range2->type == XML_REGEXP_PUNCT_CLOSE) || 2234 (range2->type == XML_REGEXP_PUNCT_INITQUOTE) || 2235 (range2->type == XML_REGEXP_PUNCT_FINQUOTE) || 2236 (range2->type == XML_REGEXP_PUNCT_OTHERS)) 2237 ret = 1; 2238 break; 2239 case XML_REGEXP_SEPAR: 2240 if ((range2->type == XML_REGEXP_SEPAR_SPACE) || 2241 (range2->type == XML_REGEXP_SEPAR_LINE) || 2242 (range2->type == XML_REGEXP_SEPAR_PARA)) 2243 ret = 1; 2244 break; 2245 case XML_REGEXP_SYMBOL: 2246 if ((range2->type == XML_REGEXP_SYMBOL_MATH) || 2247 (range2->type == XML_REGEXP_SYMBOL_CURRENCY) || 2248 (range2->type == XML_REGEXP_SYMBOL_MODIFIER) || 2249 (range2->type == XML_REGEXP_SYMBOL_OTHERS)) 2250 ret = 1; 2251 break; 2252 case XML_REGEXP_OTHER: 2253 if ((range2->type == XML_REGEXP_OTHER_CONTROL) || 2254 (range2->type == XML_REGEXP_OTHER_FORMAT) || 2255 (range2->type == XML_REGEXP_OTHER_PRIVATE)) 2256 ret = 1; 2257 break; 2258 default: 2259 if ((range2->type >= XML_REGEXP_LETTER) && 2260 (range2->type < XML_REGEXP_BLOCK_NAME)) 2261 ret = 0; 2262 else { 2263 /* safety net ! */ 2264 return(1); 2265 } 2266 } 2267 } 2268 if (((range1->neg == 0) && (range2->neg != 0)) || 2269 ((range1->neg != 0) && (range2->neg == 0))) 2270 ret = !ret; 2271 return(ret); 2272 } 2273 2274 /** 2275 * xmlFACompareAtomTypes: 2276 * @type1: an atom type 2277 * @type2: an atom type 2278 * 2279 * Compares two atoms type to check whether they intersect in some ways, 2280 * this is used by xmlFACompareAtoms only 2281 * 2282 * Returns 1 if they may intersect and 0 otherwise 2283 */ 2284 static int 2285 xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) { 2286 if ((type1 == XML_REGEXP_EPSILON) || 2287 (type1 == XML_REGEXP_CHARVAL) || 2288 (type1 == XML_REGEXP_RANGES) || 2289 (type1 == XML_REGEXP_SUBREG) || 2290 (type1 == XML_REGEXP_STRING) || 2291 (type1 == XML_REGEXP_ANYCHAR)) 2292 return(1); 2293 if ((type2 == XML_REGEXP_EPSILON) || 2294 (type2 == XML_REGEXP_CHARVAL) || 2295 (type2 == XML_REGEXP_RANGES) || 2296 (type2 == XML_REGEXP_SUBREG) || 2297 (type2 == XML_REGEXP_STRING) || 2298 (type2 == XML_REGEXP_ANYCHAR)) 2299 return(1); 2300 2301 if (type1 == type2) return(1); 2302 2303 /* simplify subsequent compares by making sure type1 < type2 */ 2304 if (type1 > type2) { 2305 xmlRegAtomType tmp = type1; 2306 type1 = type2; 2307 type2 = tmp; 2308 } 2309 switch (type1) { 2310 case XML_REGEXP_ANYSPACE: /* \s */ 2311 /* can't be a letter, number, mark, punctuation, symbol */ 2312 if ((type2 == XML_REGEXP_NOTSPACE) || 2313 ((type2 >= XML_REGEXP_LETTER) && 2314 (type2 <= XML_REGEXP_LETTER_OTHERS)) || 2315 ((type2 >= XML_REGEXP_NUMBER) && 2316 (type2 <= XML_REGEXP_NUMBER_OTHERS)) || 2317 ((type2 >= XML_REGEXP_MARK) && 2318 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2319 ((type2 >= XML_REGEXP_PUNCT) && 2320 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2321 ((type2 >= XML_REGEXP_SYMBOL) && 2322 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) 2323 ) return(0); 2324 break; 2325 case XML_REGEXP_NOTSPACE: /* \S */ 2326 break; 2327 case XML_REGEXP_INITNAME: /* \l */ 2328 /* can't be a number, mark, separator, punctuation, symbol or other */ 2329 if ((type2 == XML_REGEXP_NOTINITNAME) || 2330 ((type2 >= XML_REGEXP_NUMBER) && 2331 (type2 <= XML_REGEXP_NUMBER_OTHERS)) || 2332 ((type2 >= XML_REGEXP_MARK) && 2333 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2334 ((type2 >= XML_REGEXP_SEPAR) && 2335 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2336 ((type2 >= XML_REGEXP_PUNCT) && 2337 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2338 ((type2 >= XML_REGEXP_SYMBOL) && 2339 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2340 ((type2 >= XML_REGEXP_OTHER) && 2341 (type2 <= XML_REGEXP_OTHER_NA)) 2342 ) return(0); 2343 break; 2344 case XML_REGEXP_NOTINITNAME: /* \L */ 2345 break; 2346 case XML_REGEXP_NAMECHAR: /* \c */ 2347 /* can't be a mark, separator, punctuation, symbol or other */ 2348 if ((type2 == XML_REGEXP_NOTNAMECHAR) || 2349 ((type2 >= XML_REGEXP_MARK) && 2350 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2351 ((type2 >= XML_REGEXP_PUNCT) && 2352 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2353 ((type2 >= XML_REGEXP_SEPAR) && 2354 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2355 ((type2 >= XML_REGEXP_SYMBOL) && 2356 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2357 ((type2 >= XML_REGEXP_OTHER) && 2358 (type2 <= XML_REGEXP_OTHER_NA)) 2359 ) return(0); 2360 break; 2361 case XML_REGEXP_NOTNAMECHAR: /* \C */ 2362 break; 2363 case XML_REGEXP_DECIMAL: /* \d */ 2364 /* can't be a letter, mark, separator, punctuation, symbol or other */ 2365 if ((type2 == XML_REGEXP_NOTDECIMAL) || 2366 (type2 == XML_REGEXP_REALCHAR) || 2367 ((type2 >= XML_REGEXP_LETTER) && 2368 (type2 <= XML_REGEXP_LETTER_OTHERS)) || 2369 ((type2 >= XML_REGEXP_MARK) && 2370 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2371 ((type2 >= XML_REGEXP_PUNCT) && 2372 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2373 ((type2 >= XML_REGEXP_SEPAR) && 2374 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2375 ((type2 >= XML_REGEXP_SYMBOL) && 2376 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2377 ((type2 >= XML_REGEXP_OTHER) && 2378 (type2 <= XML_REGEXP_OTHER_NA)) 2379 )return(0); 2380 break; 2381 case XML_REGEXP_NOTDECIMAL: /* \D */ 2382 break; 2383 case XML_REGEXP_REALCHAR: /* \w */ 2384 /* can't be a mark, separator, punctuation, symbol or other */ 2385 if ((type2 == XML_REGEXP_NOTDECIMAL) || 2386 ((type2 >= XML_REGEXP_MARK) && 2387 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2388 ((type2 >= XML_REGEXP_PUNCT) && 2389 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2390 ((type2 >= XML_REGEXP_SEPAR) && 2391 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2392 ((type2 >= XML_REGEXP_SYMBOL) && 2393 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2394 ((type2 >= XML_REGEXP_OTHER) && 2395 (type2 <= XML_REGEXP_OTHER_NA)) 2396 )return(0); 2397 break; 2398 case XML_REGEXP_NOTREALCHAR: /* \W */ 2399 break; 2400 /* 2401 * at that point we know both type 1 and type2 are from 2402 * character categories are ordered and are different, 2403 * it becomes simple because this is a partition 2404 */ 2405 case XML_REGEXP_LETTER: 2406 if (type2 <= XML_REGEXP_LETTER_OTHERS) 2407 return(1); 2408 return(0); 2409 case XML_REGEXP_LETTER_UPPERCASE: 2410 case XML_REGEXP_LETTER_LOWERCASE: 2411 case XML_REGEXP_LETTER_TITLECASE: 2412 case XML_REGEXP_LETTER_MODIFIER: 2413 case XML_REGEXP_LETTER_OTHERS: 2414 return(0); 2415 case XML_REGEXP_MARK: 2416 if (type2 <= XML_REGEXP_MARK_ENCLOSING) 2417 return(1); 2418 return(0); 2419 case XML_REGEXP_MARK_NONSPACING: 2420 case XML_REGEXP_MARK_SPACECOMBINING: 2421 case XML_REGEXP_MARK_ENCLOSING: 2422 return(0); 2423 case XML_REGEXP_NUMBER: 2424 if (type2 <= XML_REGEXP_NUMBER_OTHERS) 2425 return(1); 2426 return(0); 2427 case XML_REGEXP_NUMBER_DECIMAL: 2428 case XML_REGEXP_NUMBER_LETTER: 2429 case XML_REGEXP_NUMBER_OTHERS: 2430 return(0); 2431 case XML_REGEXP_PUNCT: 2432 if (type2 <= XML_REGEXP_PUNCT_OTHERS) 2433 return(1); 2434 return(0); 2435 case XML_REGEXP_PUNCT_CONNECTOR: 2436 case XML_REGEXP_PUNCT_DASH: 2437 case XML_REGEXP_PUNCT_OPEN: 2438 case XML_REGEXP_PUNCT_CLOSE: 2439 case XML_REGEXP_PUNCT_INITQUOTE: 2440 case XML_REGEXP_PUNCT_FINQUOTE: 2441 case XML_REGEXP_PUNCT_OTHERS: 2442 return(0); 2443 case XML_REGEXP_SEPAR: 2444 if (type2 <= XML_REGEXP_SEPAR_PARA) 2445 return(1); 2446 return(0); 2447 case XML_REGEXP_SEPAR_SPACE: 2448 case XML_REGEXP_SEPAR_LINE: 2449 case XML_REGEXP_SEPAR_PARA: 2450 return(0); 2451 case XML_REGEXP_SYMBOL: 2452 if (type2 <= XML_REGEXP_SYMBOL_OTHERS) 2453 return(1); 2454 return(0); 2455 case XML_REGEXP_SYMBOL_MATH: 2456 case XML_REGEXP_SYMBOL_CURRENCY: 2457 case XML_REGEXP_SYMBOL_MODIFIER: 2458 case XML_REGEXP_SYMBOL_OTHERS: 2459 return(0); 2460 case XML_REGEXP_OTHER: 2461 if (type2 <= XML_REGEXP_OTHER_NA) 2462 return(1); 2463 return(0); 2464 case XML_REGEXP_OTHER_CONTROL: 2465 case XML_REGEXP_OTHER_FORMAT: 2466 case XML_REGEXP_OTHER_PRIVATE: 2467 case XML_REGEXP_OTHER_NA: 2468 return(0); 2469 default: 2470 break; 2471 } 2472 return(1); 2473 } 2474 2475 /** 2476 * xmlFAEqualAtoms: 2477 * @atom1: an atom 2478 * @atom2: an atom 2479 * @deep: if not set only compare string pointers 2480 * 2481 * Compares two atoms to check whether they are the same exactly 2482 * this is used to remove equivalent transitions 2483 * 2484 * Returns 1 if same and 0 otherwise 2485 */ 2486 static int 2487 xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) { 2488 int ret = 0; 2489 2490 if (atom1 == atom2) 2491 return(1); 2492 if ((atom1 == NULL) || (atom2 == NULL)) 2493 return(0); 2494 2495 if (atom1->type != atom2->type) 2496 return(0); 2497 switch (atom1->type) { 2498 case XML_REGEXP_EPSILON: 2499 ret = 0; 2500 break; 2501 case XML_REGEXP_STRING: 2502 if (!deep) 2503 ret = (atom1->valuep == atom2->valuep); 2504 else 2505 ret = xmlStrEqual((xmlChar *)atom1->valuep, 2506 (xmlChar *)atom2->valuep); 2507 break; 2508 case XML_REGEXP_CHARVAL: 2509 ret = (atom1->codepoint == atom2->codepoint); 2510 break; 2511 case XML_REGEXP_RANGES: 2512 /* too hard to do in the general case */ 2513 ret = 0; 2514 default: 2515 break; 2516 } 2517 return(ret); 2518 } 2519 2520 /** 2521 * xmlFACompareAtoms: 2522 * @atom1: an atom 2523 * @atom2: an atom 2524 * @deep: if not set only compare string pointers 2525 * 2526 * Compares two atoms to check whether they intersect in some ways, 2527 * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only 2528 * 2529 * Returns 1 if yes and 0 otherwise 2530 */ 2531 static int 2532 xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) { 2533 int ret = 1; 2534 2535 if (atom1 == atom2) 2536 return(1); 2537 if ((atom1 == NULL) || (atom2 == NULL)) 2538 return(0); 2539 2540 if ((atom1->type == XML_REGEXP_ANYCHAR) || 2541 (atom2->type == XML_REGEXP_ANYCHAR)) 2542 return(1); 2543 2544 if (atom1->type > atom2->type) { 2545 xmlRegAtomPtr tmp; 2546 tmp = atom1; 2547 atom1 = atom2; 2548 atom2 = tmp; 2549 } 2550 if (atom1->type != atom2->type) { 2551 ret = xmlFACompareAtomTypes(atom1->type, atom2->type); 2552 /* if they can't intersect at the type level break now */ 2553 if (ret == 0) 2554 return(0); 2555 } 2556 switch (atom1->type) { 2557 case XML_REGEXP_STRING: 2558 if (!deep) 2559 ret = (atom1->valuep != atom2->valuep); 2560 else { 2561 xmlChar *val1 = (xmlChar *)atom1->valuep; 2562 xmlChar *val2 = (xmlChar *)atom2->valuep; 2563 int compound1 = (xmlStrchr(val1, '|') != NULL); 2564 int compound2 = (xmlStrchr(val2, '|') != NULL); 2565 2566 /* Ignore negative match flag for ##other namespaces */ 2567 if (compound1 != compound2) 2568 return(0); 2569 2570 ret = xmlRegStrEqualWildcard(val1, val2); 2571 } 2572 break; 2573 case XML_REGEXP_EPSILON: 2574 goto not_determinist; 2575 case XML_REGEXP_CHARVAL: 2576 if (atom2->type == XML_REGEXP_CHARVAL) { 2577 ret = (atom1->codepoint == atom2->codepoint); 2578 } else { 2579 ret = xmlRegCheckCharacter(atom2, atom1->codepoint); 2580 if (ret < 0) 2581 ret = 1; 2582 } 2583 break; 2584 case XML_REGEXP_RANGES: 2585 if (atom2->type == XML_REGEXP_RANGES) { 2586 int i, j, res; 2587 xmlRegRangePtr r1, r2; 2588 2589 /* 2590 * need to check that none of the ranges eventually matches 2591 */ 2592 for (i = 0;i < atom1->nbRanges;i++) { 2593 for (j = 0;j < atom2->nbRanges;j++) { 2594 r1 = atom1->ranges[i]; 2595 r2 = atom2->ranges[j]; 2596 res = xmlFACompareRanges(r1, r2); 2597 if (res == 1) { 2598 ret = 1; 2599 goto done; 2600 } 2601 } 2602 } 2603 ret = 0; 2604 } 2605 break; 2606 default: 2607 goto not_determinist; 2608 } 2609 done: 2610 if (atom1->neg != atom2->neg) { 2611 ret = !ret; 2612 } 2613 if (ret == 0) 2614 return(0); 2615 not_determinist: 2616 return(1); 2617 } 2618 2619 /** 2620 * xmlFARecurseDeterminism: 2621 * @ctxt: a regexp parser context 2622 * 2623 * Check whether the associated regexp is determinist, 2624 * should be called after xmlFAEliminateEpsilonTransitions() 2625 * 2626 */ 2627 static int 2628 xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 2629 int to, xmlRegAtomPtr atom) { 2630 int ret = 1; 2631 int res; 2632 int transnr, nbTrans; 2633 xmlRegTransPtr t1; 2634 int deep = 1; 2635 2636 if (state == NULL) 2637 return(ret); 2638 if (state->markd == XML_REGEXP_MARK_VISITED) 2639 return(ret); 2640 2641 if (ctxt->flags & AM_AUTOMATA_RNG) 2642 deep = 0; 2643 2644 /* 2645 * don't recurse on transitions potentially added in the course of 2646 * the elimination. 2647 */ 2648 nbTrans = state->nbTrans; 2649 for (transnr = 0;transnr < nbTrans;transnr++) { 2650 t1 = &(state->trans[transnr]); 2651 /* 2652 * check transitions conflicting with the one looked at 2653 */ 2654 if (t1->atom == NULL) { 2655 if (t1->to < 0) 2656 continue; 2657 state->markd = XML_REGEXP_MARK_VISITED; 2658 res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], 2659 to, atom); 2660 if (res == 0) { 2661 ret = 0; 2662 /* t1->nd = 1; */ 2663 } 2664 continue; 2665 } 2666 if (t1->to != to) 2667 continue; 2668 if (xmlFACompareAtoms(t1->atom, atom, deep)) { 2669 ret = 0; 2670 /* mark the transition as non-deterministic */ 2671 t1->nd = 1; 2672 } 2673 } 2674 return(ret); 2675 } 2676 2677 /** 2678 * xmlFAFinishRecurseDeterminism: 2679 * @ctxt: a regexp parser context 2680 * 2681 * Reset flags after checking determinism. 2682 */ 2683 static void 2684 xmlFAFinishRecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) { 2685 int transnr, nbTrans; 2686 2687 if (state == NULL) 2688 return; 2689 if (state->markd != XML_REGEXP_MARK_VISITED) 2690 return; 2691 state->markd = 0; 2692 2693 nbTrans = state->nbTrans; 2694 for (transnr = 0; transnr < nbTrans; transnr++) { 2695 xmlRegTransPtr t1 = &state->trans[transnr]; 2696 if ((t1->atom == NULL) && (t1->to >= 0)) 2697 xmlFAFinishRecurseDeterminism(ctxt, ctxt->states[t1->to]); 2698 } 2699 } 2700 2701 /** 2702 * xmlFAComputesDeterminism: 2703 * @ctxt: a regexp parser context 2704 * 2705 * Check whether the associated regexp is determinist, 2706 * should be called after xmlFAEliminateEpsilonTransitions() 2707 * 2708 */ 2709 static int 2710 xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) { 2711 int statenr, transnr; 2712 xmlRegStatePtr state; 2713 xmlRegTransPtr t1, t2, last; 2714 int i; 2715 int ret = 1; 2716 int deep = 1; 2717 2718 #ifdef DEBUG_REGEXP_GRAPH 2719 printf("xmlFAComputesDeterminism\n"); 2720 xmlRegPrintCtxt(stdout, ctxt); 2721 #endif 2722 if (ctxt->determinist != -1) 2723 return(ctxt->determinist); 2724 2725 if (ctxt->flags & AM_AUTOMATA_RNG) 2726 deep = 0; 2727 2728 /* 2729 * First cleanup the automata removing cancelled transitions 2730 */ 2731 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2732 state = ctxt->states[statenr]; 2733 if (state == NULL) 2734 continue; 2735 if (state->nbTrans < 2) 2736 continue; 2737 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2738 t1 = &(state->trans[transnr]); 2739 /* 2740 * Determinism checks in case of counted or all transitions 2741 * will have to be handled separately 2742 */ 2743 if (t1->atom == NULL) { 2744 /* t1->nd = 1; */ 2745 continue; 2746 } 2747 if (t1->to == -1) /* eliminated */ 2748 continue; 2749 for (i = 0;i < transnr;i++) { 2750 t2 = &(state->trans[i]); 2751 if (t2->to == -1) /* eliminated */ 2752 continue; 2753 if (t2->atom != NULL) { 2754 if (t1->to == t2->to) { 2755 /* 2756 * Here we use deep because we want to keep the 2757 * transitions which indicate a conflict 2758 */ 2759 if (xmlFAEqualAtoms(t1->atom, t2->atom, deep) && 2760 (t1->counter == t2->counter) && 2761 (t1->count == t2->count)) 2762 t2->to = -1; /* eliminated */ 2763 } 2764 } 2765 } 2766 } 2767 } 2768 2769 /* 2770 * Check for all states that there aren't 2 transitions 2771 * with the same atom and a different target. 2772 */ 2773 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2774 state = ctxt->states[statenr]; 2775 if (state == NULL) 2776 continue; 2777 if (state->nbTrans < 2) 2778 continue; 2779 last = NULL; 2780 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2781 t1 = &(state->trans[transnr]); 2782 /* 2783 * Determinism checks in case of counted or all transitions 2784 * will have to be handled separately 2785 */ 2786 if (t1->atom == NULL) { 2787 continue; 2788 } 2789 if (t1->to == -1) /* eliminated */ 2790 continue; 2791 for (i = 0;i < transnr;i++) { 2792 t2 = &(state->trans[i]); 2793 if (t2->to == -1) /* eliminated */ 2794 continue; 2795 if (t2->atom != NULL) { 2796 /* 2797 * But here we don't use deep because we want to 2798 * find transitions which indicate a conflict 2799 */ 2800 if (xmlFACompareAtoms(t1->atom, t2->atom, 1)) { 2801 ret = 0; 2802 /* mark the transitions as non-deterministic ones */ 2803 t1->nd = 1; 2804 t2->nd = 1; 2805 last = t1; 2806 } 2807 } else if (t1->to != -1) { 2808 /* 2809 * do the closure in case of remaining specific 2810 * epsilon transitions like choices or all 2811 */ 2812 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], 2813 t2->to, t2->atom); 2814 xmlFAFinishRecurseDeterminism(ctxt, ctxt->states[t1->to]); 2815 /* don't shortcut the computation so all non deterministic 2816 transition get marked down 2817 if (ret == 0) 2818 return(0); 2819 */ 2820 if (ret == 0) { 2821 t1->nd = 1; 2822 /* t2->nd = 1; */ 2823 last = t1; 2824 } 2825 } 2826 } 2827 /* don't shortcut the computation so all non deterministic 2828 transition get marked down 2829 if (ret == 0) 2830 break; */ 2831 } 2832 2833 /* 2834 * mark specifically the last non-deterministic transition 2835 * from a state since there is no need to set-up rollback 2836 * from it 2837 */ 2838 if (last != NULL) { 2839 last->nd = 2; 2840 } 2841 2842 /* don't shortcut the computation so all non deterministic 2843 transition get marked down 2844 if (ret == 0) 2845 break; */ 2846 } 2847 2848 ctxt->determinist = ret; 2849 return(ret); 2850 } 2851 2852 /************************************************************************ 2853 * * 2854 * Routines to check input against transition atoms * 2855 * * 2856 ************************************************************************/ 2857 2858 static int 2859 xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg, 2860 int start, int end, const xmlChar *blockName) { 2861 int ret = 0; 2862 2863 switch (type) { 2864 case XML_REGEXP_STRING: 2865 case XML_REGEXP_SUBREG: 2866 case XML_REGEXP_RANGES: 2867 case XML_REGEXP_EPSILON: 2868 return(-1); 2869 case XML_REGEXP_ANYCHAR: 2870 ret = ((codepoint != '\n') && (codepoint != '\r')); 2871 break; 2872 case XML_REGEXP_CHARVAL: 2873 ret = ((codepoint >= start) && (codepoint <= end)); 2874 break; 2875 case XML_REGEXP_NOTSPACE: 2876 neg = !neg; 2877 /* Falls through. */ 2878 case XML_REGEXP_ANYSPACE: 2879 ret = ((codepoint == '\n') || (codepoint == '\r') || 2880 (codepoint == '\t') || (codepoint == ' ')); 2881 break; 2882 case XML_REGEXP_NOTINITNAME: 2883 neg = !neg; 2884 /* Falls through. */ 2885 case XML_REGEXP_INITNAME: 2886 ret = (IS_LETTER(codepoint) || 2887 (codepoint == '_') || (codepoint == ':')); 2888 break; 2889 case XML_REGEXP_NOTNAMECHAR: 2890 neg = !neg; 2891 /* Falls through. */ 2892 case XML_REGEXP_NAMECHAR: 2893 ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) || 2894 (codepoint == '.') || (codepoint == '-') || 2895 (codepoint == '_') || (codepoint == ':') || 2896 IS_COMBINING(codepoint) || IS_EXTENDER(codepoint)); 2897 break; 2898 case XML_REGEXP_NOTDECIMAL: 2899 neg = !neg; 2900 /* Falls through. */ 2901 case XML_REGEXP_DECIMAL: 2902 ret = xmlUCSIsCatNd(codepoint); 2903 break; 2904 case XML_REGEXP_REALCHAR: 2905 neg = !neg; 2906 /* Falls through. */ 2907 case XML_REGEXP_NOTREALCHAR: 2908 ret = xmlUCSIsCatP(codepoint); 2909 if (ret == 0) 2910 ret = xmlUCSIsCatZ(codepoint); 2911 if (ret == 0) 2912 ret = xmlUCSIsCatC(codepoint); 2913 break; 2914 case XML_REGEXP_LETTER: 2915 ret = xmlUCSIsCatL(codepoint); 2916 break; 2917 case XML_REGEXP_LETTER_UPPERCASE: 2918 ret = xmlUCSIsCatLu(codepoint); 2919 break; 2920 case XML_REGEXP_LETTER_LOWERCASE: 2921 ret = xmlUCSIsCatLl(codepoint); 2922 break; 2923 case XML_REGEXP_LETTER_TITLECASE: 2924 ret = xmlUCSIsCatLt(codepoint); 2925 break; 2926 case XML_REGEXP_LETTER_MODIFIER: 2927 ret = xmlUCSIsCatLm(codepoint); 2928 break; 2929 case XML_REGEXP_LETTER_OTHERS: 2930 ret = xmlUCSIsCatLo(codepoint); 2931 break; 2932 case XML_REGEXP_MARK: 2933 ret = xmlUCSIsCatM(codepoint); 2934 break; 2935 case XML_REGEXP_MARK_NONSPACING: 2936 ret = xmlUCSIsCatMn(codepoint); 2937 break; 2938 case XML_REGEXP_MARK_SPACECOMBINING: 2939 ret = xmlUCSIsCatMc(codepoint); 2940 break; 2941 case XML_REGEXP_MARK_ENCLOSING: 2942 ret = xmlUCSIsCatMe(codepoint); 2943 break; 2944 case XML_REGEXP_NUMBER: 2945 ret = xmlUCSIsCatN(codepoint); 2946 break; 2947 case XML_REGEXP_NUMBER_DECIMAL: 2948 ret = xmlUCSIsCatNd(codepoint); 2949 break; 2950 case XML_REGEXP_NUMBER_LETTER: 2951 ret = xmlUCSIsCatNl(codepoint); 2952 break; 2953 case XML_REGEXP_NUMBER_OTHERS: 2954 ret = xmlUCSIsCatNo(codepoint); 2955 break; 2956 case XML_REGEXP_PUNCT: 2957 ret = xmlUCSIsCatP(codepoint); 2958 break; 2959 case XML_REGEXP_PUNCT_CONNECTOR: 2960 ret = xmlUCSIsCatPc(codepoint); 2961 break; 2962 case XML_REGEXP_PUNCT_DASH: 2963 ret = xmlUCSIsCatPd(codepoint); 2964 break; 2965 case XML_REGEXP_PUNCT_OPEN: 2966 ret = xmlUCSIsCatPs(codepoint); 2967 break; 2968 case XML_REGEXP_PUNCT_CLOSE: 2969 ret = xmlUCSIsCatPe(codepoint); 2970 break; 2971 case XML_REGEXP_PUNCT_INITQUOTE: 2972 ret = xmlUCSIsCatPi(codepoint); 2973 break; 2974 case XML_REGEXP_PUNCT_FINQUOTE: 2975 ret = xmlUCSIsCatPf(codepoint); 2976 break; 2977 case XML_REGEXP_PUNCT_OTHERS: 2978 ret = xmlUCSIsCatPo(codepoint); 2979 break; 2980 case XML_REGEXP_SEPAR: 2981 ret = xmlUCSIsCatZ(codepoint); 2982 break; 2983 case XML_REGEXP_SEPAR_SPACE: 2984 ret = xmlUCSIsCatZs(codepoint); 2985 break; 2986 case XML_REGEXP_SEPAR_LINE: 2987 ret = xmlUCSIsCatZl(codepoint); 2988 break; 2989 case XML_REGEXP_SEPAR_PARA: 2990 ret = xmlUCSIsCatZp(codepoint); 2991 break; 2992 case XML_REGEXP_SYMBOL: 2993 ret = xmlUCSIsCatS(codepoint); 2994 break; 2995 case XML_REGEXP_SYMBOL_MATH: 2996 ret = xmlUCSIsCatSm(codepoint); 2997 break; 2998 case XML_REGEXP_SYMBOL_CURRENCY: 2999 ret = xmlUCSIsCatSc(codepoint); 3000 break; 3001 case XML_REGEXP_SYMBOL_MODIFIER: 3002 ret = xmlUCSIsCatSk(codepoint); 3003 break; 3004 case XML_REGEXP_SYMBOL_OTHERS: 3005 ret = xmlUCSIsCatSo(codepoint); 3006 break; 3007 case XML_REGEXP_OTHER: 3008 ret = xmlUCSIsCatC(codepoint); 3009 break; 3010 case XML_REGEXP_OTHER_CONTROL: 3011 ret = xmlUCSIsCatCc(codepoint); 3012 break; 3013 case XML_REGEXP_OTHER_FORMAT: 3014 ret = xmlUCSIsCatCf(codepoint); 3015 break; 3016 case XML_REGEXP_OTHER_PRIVATE: 3017 ret = xmlUCSIsCatCo(codepoint); 3018 break; 3019 case XML_REGEXP_OTHER_NA: 3020 /* ret = xmlUCSIsCatCn(codepoint); */ 3021 /* Seems it doesn't exist anymore in recent Unicode releases */ 3022 ret = 0; 3023 break; 3024 case XML_REGEXP_BLOCK_NAME: 3025 ret = xmlUCSIsBlock(codepoint, (const char *) blockName); 3026 break; 3027 } 3028 if (neg) 3029 return(!ret); 3030 return(ret); 3031 } 3032 3033 static int 3034 xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) { 3035 int i, ret = 0; 3036 xmlRegRangePtr range; 3037 3038 if ((atom == NULL) || (!IS_CHAR(codepoint))) 3039 return(-1); 3040 3041 switch (atom->type) { 3042 case XML_REGEXP_SUBREG: 3043 case XML_REGEXP_EPSILON: 3044 return(-1); 3045 case XML_REGEXP_CHARVAL: 3046 return(codepoint == atom->codepoint); 3047 case XML_REGEXP_RANGES: { 3048 int accept = 0; 3049 3050 for (i = 0;i < atom->nbRanges;i++) { 3051 range = atom->ranges[i]; 3052 if (range->neg == 2) { 3053 ret = xmlRegCheckCharacterRange(range->type, codepoint, 3054 0, range->start, range->end, 3055 range->blockName); 3056 if (ret != 0) 3057 return(0); /* excluded char */ 3058 } else if (range->neg) { 3059 ret = xmlRegCheckCharacterRange(range->type, codepoint, 3060 0, range->start, range->end, 3061 range->blockName); 3062 if (ret == 0) 3063 accept = 1; 3064 else 3065 return(0); 3066 } else { 3067 ret = xmlRegCheckCharacterRange(range->type, codepoint, 3068 0, range->start, range->end, 3069 range->blockName); 3070 if (ret != 0) 3071 accept = 1; /* might still be excluded */ 3072 } 3073 } 3074 return(accept); 3075 } 3076 case XML_REGEXP_STRING: 3077 printf("TODO: XML_REGEXP_STRING\n"); 3078 return(-1); 3079 case XML_REGEXP_ANYCHAR: 3080 case XML_REGEXP_ANYSPACE: 3081 case XML_REGEXP_NOTSPACE: 3082 case XML_REGEXP_INITNAME: 3083 case XML_REGEXP_NOTINITNAME: 3084 case XML_REGEXP_NAMECHAR: 3085 case XML_REGEXP_NOTNAMECHAR: 3086 case XML_REGEXP_DECIMAL: 3087 case XML_REGEXP_NOTDECIMAL: 3088 case XML_REGEXP_REALCHAR: 3089 case XML_REGEXP_NOTREALCHAR: 3090 case XML_REGEXP_LETTER: 3091 case XML_REGEXP_LETTER_UPPERCASE: 3092 case XML_REGEXP_LETTER_LOWERCASE: 3093 case XML_REGEXP_LETTER_TITLECASE: 3094 case XML_REGEXP_LETTER_MODIFIER: 3095 case XML_REGEXP_LETTER_OTHERS: 3096 case XML_REGEXP_MARK: 3097 case XML_REGEXP_MARK_NONSPACING: 3098 case XML_REGEXP_MARK_SPACECOMBINING: 3099 case XML_REGEXP_MARK_ENCLOSING: 3100 case XML_REGEXP_NUMBER: 3101 case XML_REGEXP_NUMBER_DECIMAL: 3102 case XML_REGEXP_NUMBER_LETTER: 3103 case XML_REGEXP_NUMBER_OTHERS: 3104 case XML_REGEXP_PUNCT: 3105 case XML_REGEXP_PUNCT_CONNECTOR: 3106 case XML_REGEXP_PUNCT_DASH: 3107 case XML_REGEXP_PUNCT_OPEN: 3108 case XML_REGEXP_PUNCT_CLOSE: 3109 case XML_REGEXP_PUNCT_INITQUOTE: 3110 case XML_REGEXP_PUNCT_FINQUOTE: 3111 case XML_REGEXP_PUNCT_OTHERS: 3112 case XML_REGEXP_SEPAR: 3113 case XML_REGEXP_SEPAR_SPACE: 3114 case XML_REGEXP_SEPAR_LINE: 3115 case XML_REGEXP_SEPAR_PARA: 3116 case XML_REGEXP_SYMBOL: 3117 case XML_REGEXP_SYMBOL_MATH: 3118 case XML_REGEXP_SYMBOL_CURRENCY: 3119 case XML_REGEXP_SYMBOL_MODIFIER: 3120 case XML_REGEXP_SYMBOL_OTHERS: 3121 case XML_REGEXP_OTHER: 3122 case XML_REGEXP_OTHER_CONTROL: 3123 case XML_REGEXP_OTHER_FORMAT: 3124 case XML_REGEXP_OTHER_PRIVATE: 3125 case XML_REGEXP_OTHER_NA: 3126 case XML_REGEXP_BLOCK_NAME: 3127 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0, 3128 (const xmlChar *)atom->valuep); 3129 if (atom->neg) 3130 ret = !ret; 3131 break; 3132 } 3133 return(ret); 3134 } 3135 3136 /************************************************************************ 3137 * * 3138 * Saving and restoring state of an execution context * 3139 * * 3140 ************************************************************************/ 3141 3142 #ifdef DEBUG_REGEXP_EXEC 3143 static void 3144 xmlFARegDebugExec(xmlRegExecCtxtPtr exec) { 3145 printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index); 3146 if (exec->inputStack != NULL) { 3147 int i; 3148 printf(": "); 3149 for (i = 0;(i < 3) && (i < exec->inputStackNr);i++) 3150 printf("%s ", (const char *) 3151 exec->inputStack[exec->inputStackNr - (i + 1)].value); 3152 } else { 3153 printf(": %s", &(exec->inputString[exec->index])); 3154 } 3155 printf("\n"); 3156 } 3157 #endif 3158 3159 static void 3160 xmlFARegExecSave(xmlRegExecCtxtPtr exec) { 3161 #ifdef DEBUG_REGEXP_EXEC 3162 printf("saving "); 3163 exec->transno++; 3164 xmlFARegDebugExec(exec); 3165 exec->transno--; 3166 #endif 3167 #ifdef MAX_PUSH 3168 if (exec->nbPush > MAX_PUSH) { 3169 return; 3170 } 3171 exec->nbPush++; 3172 #endif 3173 3174 if (exec->maxRollbacks == 0) { 3175 exec->maxRollbacks = 4; 3176 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks * 3177 sizeof(xmlRegExecRollback)); 3178 if (exec->rollbacks == NULL) { 3179 xmlRegexpErrMemory(NULL, "saving regexp"); 3180 exec->maxRollbacks = 0; 3181 return; 3182 } 3183 memset(exec->rollbacks, 0, 3184 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 3185 } else if (exec->nbRollbacks >= exec->maxRollbacks) { 3186 xmlRegExecRollback *tmp; 3187 int len = exec->maxRollbacks; 3188 3189 exec->maxRollbacks *= 2; 3190 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks, 3191 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 3192 if (tmp == NULL) { 3193 xmlRegexpErrMemory(NULL, "saving regexp"); 3194 exec->maxRollbacks /= 2; 3195 return; 3196 } 3197 exec->rollbacks = tmp; 3198 tmp = &exec->rollbacks[len]; 3199 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback)); 3200 } 3201 exec->rollbacks[exec->nbRollbacks].state = exec->state; 3202 exec->rollbacks[exec->nbRollbacks].index = exec->index; 3203 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1; 3204 if (exec->comp->nbCounters > 0) { 3205 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 3206 exec->rollbacks[exec->nbRollbacks].counts = (int *) 3207 xmlMalloc(exec->comp->nbCounters * sizeof(int)); 3208 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 3209 xmlRegexpErrMemory(NULL, "saving regexp"); 3210 exec->status = -5; 3211 return; 3212 } 3213 } 3214 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts, 3215 exec->comp->nbCounters * sizeof(int)); 3216 } 3217 exec->nbRollbacks++; 3218 } 3219 3220 static void 3221 xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) { 3222 if (exec->nbRollbacks <= 0) { 3223 exec->status = -1; 3224 #ifdef DEBUG_REGEXP_EXEC 3225 printf("rollback failed on empty stack\n"); 3226 #endif 3227 return; 3228 } 3229 exec->nbRollbacks--; 3230 exec->state = exec->rollbacks[exec->nbRollbacks].state; 3231 exec->index = exec->rollbacks[exec->nbRollbacks].index; 3232 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch; 3233 if (exec->comp->nbCounters > 0) { 3234 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 3235 fprintf(stderr, "exec save: allocation failed"); 3236 exec->status = -6; 3237 return; 3238 } 3239 if (exec->counts) { 3240 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts, 3241 exec->comp->nbCounters * sizeof(int)); 3242 } 3243 } 3244 3245 #ifdef DEBUG_REGEXP_EXEC 3246 printf("restored "); 3247 xmlFARegDebugExec(exec); 3248 #endif 3249 } 3250 3251 /************************************************************************ 3252 * * 3253 * Verifier, running an input against a compiled regexp * 3254 * * 3255 ************************************************************************/ 3256 3257 static int 3258 xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) { 3259 xmlRegExecCtxt execval; 3260 xmlRegExecCtxtPtr exec = &execval; 3261 int ret, codepoint = 0, len, deter; 3262 3263 exec->inputString = content; 3264 exec->index = 0; 3265 exec->nbPush = 0; 3266 exec->determinist = 1; 3267 exec->maxRollbacks = 0; 3268 exec->nbRollbacks = 0; 3269 exec->rollbacks = NULL; 3270 exec->status = 0; 3271 exec->comp = comp; 3272 exec->state = comp->states[0]; 3273 exec->transno = 0; 3274 exec->transcount = 0; 3275 exec->inputStack = NULL; 3276 exec->inputStackMax = 0; 3277 if (comp->nbCounters > 0) { 3278 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)); 3279 if (exec->counts == NULL) { 3280 xmlRegexpErrMemory(NULL, "running regexp"); 3281 return(-1); 3282 } 3283 memset(exec->counts, 0, comp->nbCounters * sizeof(int)); 3284 } else 3285 exec->counts = NULL; 3286 while ((exec->status == 0) && (exec->state != NULL) && 3287 ((exec->inputString[exec->index] != 0) || 3288 ((exec->state != NULL) && 3289 (exec->state->type != XML_REGEXP_FINAL_STATE)))) { 3290 xmlRegTransPtr trans; 3291 xmlRegAtomPtr atom; 3292 3293 /* 3294 * If end of input on non-terminal state, rollback, however we may 3295 * still have epsilon like transition for counted transitions 3296 * on counters, in that case don't break too early. Additionally, 3297 * if we are working on a range like "AB{0,2}", where B is not present, 3298 * we don't want to break. 3299 */ 3300 len = 1; 3301 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) { 3302 /* 3303 * if there is a transition, we must check if 3304 * atom allows minOccurs of 0 3305 */ 3306 if (exec->transno < exec->state->nbTrans) { 3307 trans = &exec->state->trans[exec->transno]; 3308 if (trans->to >=0) { 3309 atom = trans->atom; 3310 if (!((atom->min == 0) && (atom->max > 0))) 3311 goto rollback; 3312 } 3313 } else 3314 goto rollback; 3315 } 3316 3317 exec->transcount = 0; 3318 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 3319 trans = &exec->state->trans[exec->transno]; 3320 if (trans->to < 0) 3321 continue; 3322 atom = trans->atom; 3323 ret = 0; 3324 deter = 1; 3325 if (trans->count >= 0) { 3326 int count; 3327 xmlRegCounterPtr counter; 3328 3329 if (exec->counts == NULL) { 3330 exec->status = -1; 3331 goto error; 3332 } 3333 /* 3334 * A counted transition. 3335 */ 3336 3337 count = exec->counts[trans->count]; 3338 counter = &exec->comp->counters[trans->count]; 3339 #ifdef DEBUG_REGEXP_EXEC 3340 printf("testing count %d: val %d, min %d, max %d\n", 3341 trans->count, count, counter->min, counter->max); 3342 #endif 3343 ret = ((count >= counter->min) && (count <= counter->max)); 3344 if ((ret) && (counter->min != counter->max)) 3345 deter = 0; 3346 } else if (atom == NULL) { 3347 fprintf(stderr, "epsilon transition left at runtime\n"); 3348 exec->status = -2; 3349 break; 3350 } else if (exec->inputString[exec->index] != 0) { 3351 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 3352 ret = xmlRegCheckCharacter(atom, codepoint); 3353 if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) { 3354 xmlRegStatePtr to = comp->states[trans->to]; 3355 3356 /* 3357 * this is a multiple input sequence 3358 * If there is a counter associated increment it now. 3359 * do not increment if the counter is already over the 3360 * maximum limit in which case get to next transition 3361 */ 3362 if (trans->counter >= 0) { 3363 xmlRegCounterPtr counter; 3364 3365 if ((exec->counts == NULL) || 3366 (exec->comp == NULL) || 3367 (exec->comp->counters == NULL)) { 3368 exec->status = -1; 3369 goto error; 3370 } 3371 counter = &exec->comp->counters[trans->counter]; 3372 if (exec->counts[trans->counter] >= counter->max) 3373 continue; /* for loop on transitions */ 3374 } 3375 /* Save before incrementing */ 3376 if (exec->state->nbTrans > exec->transno + 1) { 3377 xmlFARegExecSave(exec); 3378 } 3379 if (trans->counter >= 0) { 3380 #ifdef DEBUG_REGEXP_EXEC 3381 printf("Increasing count %d\n", trans->counter); 3382 #endif 3383 exec->counts[trans->counter]++; 3384 } 3385 exec->transcount = 1; 3386 do { 3387 /* 3388 * Try to progress as much as possible on the input 3389 */ 3390 if (exec->transcount == atom->max) { 3391 break; 3392 } 3393 exec->index += len; 3394 /* 3395 * End of input: stop here 3396 */ 3397 if (exec->inputString[exec->index] == 0) { 3398 exec->index -= len; 3399 break; 3400 } 3401 if (exec->transcount >= atom->min) { 3402 int transno = exec->transno; 3403 xmlRegStatePtr state = exec->state; 3404 3405 /* 3406 * The transition is acceptable save it 3407 */ 3408 exec->transno = -1; /* trick */ 3409 exec->state = to; 3410 xmlFARegExecSave(exec); 3411 exec->transno = transno; 3412 exec->state = state; 3413 } 3414 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 3415 len); 3416 ret = xmlRegCheckCharacter(atom, codepoint); 3417 exec->transcount++; 3418 } while (ret == 1); 3419 if (exec->transcount < atom->min) 3420 ret = 0; 3421 3422 /* 3423 * If the last check failed but one transition was found 3424 * possible, rollback 3425 */ 3426 if (ret < 0) 3427 ret = 0; 3428 if (ret == 0) { 3429 goto rollback; 3430 } 3431 if (trans->counter >= 0) { 3432 if (exec->counts == NULL) { 3433 exec->status = -1; 3434 goto error; 3435 } 3436 #ifdef DEBUG_REGEXP_EXEC 3437 printf("Decreasing count %d\n", trans->counter); 3438 #endif 3439 exec->counts[trans->counter]--; 3440 } 3441 } else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) { 3442 /* 3443 * we don't match on the codepoint, but minOccurs of 0 3444 * says that's ok. Setting len to 0 inhibits stepping 3445 * over the codepoint. 3446 */ 3447 exec->transcount = 1; 3448 len = 0; 3449 ret = 1; 3450 } 3451 } else if ((atom->min == 0) && (atom->max > 0)) { 3452 /* another spot to match when minOccurs is 0 */ 3453 exec->transcount = 1; 3454 len = 0; 3455 ret = 1; 3456 } 3457 if (ret == 1) { 3458 if ((trans->nd == 1) || 3459 ((trans->count >= 0) && (deter == 0) && 3460 (exec->state->nbTrans > exec->transno + 1))) { 3461 #ifdef DEBUG_REGEXP_EXEC 3462 if (trans->nd == 1) 3463 printf("Saving on nd transition atom %d for %c at %d\n", 3464 trans->atom->no, codepoint, exec->index); 3465 else 3466 printf("Saving on counted transition count %d for %c at %d\n", 3467 trans->count, codepoint, exec->index); 3468 #endif 3469 xmlFARegExecSave(exec); 3470 } 3471 if (trans->counter >= 0) { 3472 xmlRegCounterPtr counter; 3473 3474 /* make sure we don't go over the counter maximum value */ 3475 if ((exec->counts == NULL) || 3476 (exec->comp == NULL) || 3477 (exec->comp->counters == NULL)) { 3478 exec->status = -1; 3479 goto error; 3480 } 3481 counter = &exec->comp->counters[trans->counter]; 3482 if (exec->counts[trans->counter] >= counter->max) 3483 continue; /* for loop on transitions */ 3484 #ifdef DEBUG_REGEXP_EXEC 3485 printf("Increasing count %d\n", trans->counter); 3486 #endif 3487 exec->counts[trans->counter]++; 3488 } 3489 if ((trans->count >= 0) && 3490 (trans->count < REGEXP_ALL_COUNTER)) { 3491 if (exec->counts == NULL) { 3492 exec->status = -1; 3493 goto error; 3494 } 3495 #ifdef DEBUG_REGEXP_EXEC 3496 printf("resetting count %d on transition\n", 3497 trans->count); 3498 #endif 3499 exec->counts[trans->count] = 0; 3500 } 3501 #ifdef DEBUG_REGEXP_EXEC 3502 printf("entering state %d\n", trans->to); 3503 #endif 3504 exec->state = comp->states[trans->to]; 3505 exec->transno = 0; 3506 if (trans->atom != NULL) { 3507 exec->index += len; 3508 } 3509 goto progress; 3510 } else if (ret < 0) { 3511 exec->status = -4; 3512 break; 3513 } 3514 } 3515 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 3516 rollback: 3517 /* 3518 * Failed to find a way out 3519 */ 3520 exec->determinist = 0; 3521 #ifdef DEBUG_REGEXP_EXEC 3522 printf("rollback from state %d on %d:%c\n", exec->state->no, 3523 codepoint,codepoint); 3524 #endif 3525 xmlFARegExecRollBack(exec); 3526 } 3527 progress: 3528 continue; 3529 } 3530 error: 3531 if (exec->rollbacks != NULL) { 3532 if (exec->counts != NULL) { 3533 int i; 3534 3535 for (i = 0;i < exec->maxRollbacks;i++) 3536 if (exec->rollbacks[i].counts != NULL) 3537 xmlFree(exec->rollbacks[i].counts); 3538 } 3539 xmlFree(exec->rollbacks); 3540 } 3541 if (exec->state == NULL) 3542 return(-1); 3543 if (exec->counts != NULL) 3544 xmlFree(exec->counts); 3545 if (exec->status == 0) 3546 return(1); 3547 if (exec->status == -1) { 3548 if (exec->nbPush > MAX_PUSH) 3549 return(-1); 3550 return(0); 3551 } 3552 return(exec->status); 3553 } 3554 3555 /************************************************************************ 3556 * * 3557 * Progressive interface to the verifier one atom at a time * 3558 * * 3559 ************************************************************************/ 3560 #ifdef DEBUG_ERR 3561 static void testerr(xmlRegExecCtxtPtr exec); 3562 #endif 3563 3564 /** 3565 * xmlRegNewExecCtxt: 3566 * @comp: a precompiled regular expression 3567 * @callback: a callback function used for handling progresses in the 3568 * automata matching phase 3569 * @data: the context data associated to the callback in this context 3570 * 3571 * Build a context used for progressive evaluation of a regexp. 3572 * 3573 * Returns the new context 3574 */ 3575 xmlRegExecCtxtPtr 3576 xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) { 3577 xmlRegExecCtxtPtr exec; 3578 3579 if (comp == NULL) 3580 return(NULL); 3581 if ((comp->compact == NULL) && (comp->states == NULL)) 3582 return(NULL); 3583 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt)); 3584 if (exec == NULL) { 3585 xmlRegexpErrMemory(NULL, "creating execution context"); 3586 return(NULL); 3587 } 3588 memset(exec, 0, sizeof(xmlRegExecCtxt)); 3589 exec->inputString = NULL; 3590 exec->index = 0; 3591 exec->determinist = 1; 3592 exec->maxRollbacks = 0; 3593 exec->nbRollbacks = 0; 3594 exec->rollbacks = NULL; 3595 exec->status = 0; 3596 exec->comp = comp; 3597 if (comp->compact == NULL) 3598 exec->state = comp->states[0]; 3599 exec->transno = 0; 3600 exec->transcount = 0; 3601 exec->callback = callback; 3602 exec->data = data; 3603 if (comp->nbCounters > 0) { 3604 /* 3605 * For error handling, exec->counts is allocated twice the size 3606 * the second half is used to store the data in case of rollback 3607 */ 3608 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int) 3609 * 2); 3610 if (exec->counts == NULL) { 3611 xmlRegexpErrMemory(NULL, "creating execution context"); 3612 xmlFree(exec); 3613 return(NULL); 3614 } 3615 memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2); 3616 exec->errCounts = &exec->counts[comp->nbCounters]; 3617 } else { 3618 exec->counts = NULL; 3619 exec->errCounts = NULL; 3620 } 3621 exec->inputStackMax = 0; 3622 exec->inputStackNr = 0; 3623 exec->inputStack = NULL; 3624 exec->errStateNo = -1; 3625 exec->errString = NULL; 3626 exec->nbPush = 0; 3627 return(exec); 3628 } 3629 3630 /** 3631 * xmlRegFreeExecCtxt: 3632 * @exec: a regular expression evaluation context 3633 * 3634 * Free the structures associated to a regular expression evaluation context. 3635 */ 3636 void 3637 xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) { 3638 if (exec == NULL) 3639 return; 3640 3641 if (exec->rollbacks != NULL) { 3642 if (exec->counts != NULL) { 3643 int i; 3644 3645 for (i = 0;i < exec->maxRollbacks;i++) 3646 if (exec->rollbacks[i].counts != NULL) 3647 xmlFree(exec->rollbacks[i].counts); 3648 } 3649 xmlFree(exec->rollbacks); 3650 } 3651 if (exec->counts != NULL) 3652 xmlFree(exec->counts); 3653 if (exec->inputStack != NULL) { 3654 int i; 3655 3656 for (i = 0;i < exec->inputStackNr;i++) { 3657 if (exec->inputStack[i].value != NULL) 3658 xmlFree(exec->inputStack[i].value); 3659 } 3660 xmlFree(exec->inputStack); 3661 } 3662 if (exec->errString != NULL) 3663 xmlFree(exec->errString); 3664 xmlFree(exec); 3665 } 3666 3667 static void 3668 xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value, 3669 void *data) { 3670 #ifdef DEBUG_PUSH 3671 printf("saving value: %d:%s\n", exec->inputStackNr, value); 3672 #endif 3673 if (exec->inputStackMax == 0) { 3674 exec->inputStackMax = 4; 3675 exec->inputStack = (xmlRegInputTokenPtr) 3676 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken)); 3677 if (exec->inputStack == NULL) { 3678 xmlRegexpErrMemory(NULL, "pushing input string"); 3679 exec->inputStackMax = 0; 3680 return; 3681 } 3682 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) { 3683 xmlRegInputTokenPtr tmp; 3684 3685 exec->inputStackMax *= 2; 3686 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack, 3687 exec->inputStackMax * sizeof(xmlRegInputToken)); 3688 if (tmp == NULL) { 3689 xmlRegexpErrMemory(NULL, "pushing input string"); 3690 exec->inputStackMax /= 2; 3691 return; 3692 } 3693 exec->inputStack = tmp; 3694 } 3695 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value); 3696 exec->inputStack[exec->inputStackNr].data = data; 3697 exec->inputStackNr++; 3698 exec->inputStack[exec->inputStackNr].value = NULL; 3699 exec->inputStack[exec->inputStackNr].data = NULL; 3700 } 3701 3702 /** 3703 * xmlRegStrEqualWildcard: 3704 * @expStr: the string to be evaluated 3705 * @valStr: the validation string 3706 * 3707 * Checks if both strings are equal or have the same content. "*" 3708 * can be used as a wildcard in @valStr; "|" is used as a separator of 3709 * substrings in both @expStr and @valStr. 3710 * 3711 * Returns 1 if the comparison is satisfied and the number of substrings 3712 * is equal, 0 otherwise. 3713 */ 3714 3715 static int 3716 xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) { 3717 if (expStr == valStr) return(1); 3718 if (expStr == NULL) return(0); 3719 if (valStr == NULL) return(0); 3720 do { 3721 /* 3722 * Eval if we have a wildcard for the current item. 3723 */ 3724 if (*expStr != *valStr) { 3725 /* if one of them starts with a wildcard make valStr be it */ 3726 if (*valStr == '*') { 3727 const xmlChar *tmp; 3728 3729 tmp = valStr; 3730 valStr = expStr; 3731 expStr = tmp; 3732 } 3733 if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) { 3734 do { 3735 if (*valStr == XML_REG_STRING_SEPARATOR) 3736 break; 3737 valStr++; 3738 } while (*valStr != 0); 3739 continue; 3740 } else 3741 return(0); 3742 } 3743 expStr++; 3744 valStr++; 3745 } while (*valStr != 0); 3746 if (*expStr != 0) 3747 return (0); 3748 else 3749 return (1); 3750 } 3751 3752 /** 3753 * xmlRegCompactPushString: 3754 * @exec: a regexp execution context 3755 * @comp: the precompiled exec with a compact table 3756 * @value: a string token input 3757 * @data: data associated to the token to reuse in callbacks 3758 * 3759 * Push one input token in the execution context 3760 * 3761 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 3762 * a negative value in case of error. 3763 */ 3764 static int 3765 xmlRegCompactPushString(xmlRegExecCtxtPtr exec, 3766 xmlRegexpPtr comp, 3767 const xmlChar *value, 3768 void *data) { 3769 int state = exec->index; 3770 int i, target; 3771 3772 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL)) 3773 return(-1); 3774 3775 if (value == NULL) { 3776 /* 3777 * are we at a final state ? 3778 */ 3779 if (comp->compact[state * (comp->nbstrings + 1)] == 3780 XML_REGEXP_FINAL_STATE) 3781 return(1); 3782 return(0); 3783 } 3784 3785 #ifdef DEBUG_PUSH 3786 printf("value pushed: %s\n", value); 3787 #endif 3788 3789 /* 3790 * Examine all outside transitions from current state 3791 */ 3792 for (i = 0;i < comp->nbstrings;i++) { 3793 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 3794 if ((target > 0) && (target <= comp->nbstates)) { 3795 target--; /* to avoid 0 */ 3796 if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) { 3797 exec->index = target; 3798 if ((exec->callback != NULL) && (comp->transdata != NULL)) { 3799 exec->callback(exec->data, value, 3800 comp->transdata[state * comp->nbstrings + i], data); 3801 } 3802 #ifdef DEBUG_PUSH 3803 printf("entering state %d\n", target); 3804 #endif 3805 if (comp->compact[target * (comp->nbstrings + 1)] == 3806 XML_REGEXP_SINK_STATE) 3807 goto error; 3808 3809 if (comp->compact[target * (comp->nbstrings + 1)] == 3810 XML_REGEXP_FINAL_STATE) 3811 return(1); 3812 return(0); 3813 } 3814 } 3815 } 3816 /* 3817 * Failed to find an exit transition out from current state for the 3818 * current token 3819 */ 3820 #ifdef DEBUG_PUSH 3821 printf("failed to find a transition for %s on state %d\n", value, state); 3822 #endif 3823 error: 3824 if (exec->errString != NULL) 3825 xmlFree(exec->errString); 3826 exec->errString = xmlStrdup(value); 3827 exec->errStateNo = state; 3828 exec->status = -1; 3829 #ifdef DEBUG_ERR 3830 testerr(exec); 3831 #endif 3832 return(-1); 3833 } 3834 3835 /** 3836 * xmlRegExecPushStringInternal: 3837 * @exec: a regexp execution context or NULL to indicate the end 3838 * @value: a string token input 3839 * @data: data associated to the token to reuse in callbacks 3840 * @compound: value was assembled from 2 strings 3841 * 3842 * Push one input token in the execution context 3843 * 3844 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 3845 * a negative value in case of error. 3846 */ 3847 static int 3848 xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value, 3849 void *data, int compound) { 3850 xmlRegTransPtr trans; 3851 xmlRegAtomPtr atom; 3852 int ret; 3853 int final = 0; 3854 int progress = 1; 3855 3856 if (exec == NULL) 3857 return(-1); 3858 if (exec->comp == NULL) 3859 return(-1); 3860 if (exec->status != 0) 3861 return(exec->status); 3862 3863 if (exec->comp->compact != NULL) 3864 return(xmlRegCompactPushString(exec, exec->comp, value, data)); 3865 3866 if (value == NULL) { 3867 if (exec->state->type == XML_REGEXP_FINAL_STATE) 3868 return(1); 3869 final = 1; 3870 } 3871 3872 #ifdef DEBUG_PUSH 3873 printf("value pushed: %s\n", value); 3874 #endif 3875 /* 3876 * If we have an active rollback stack push the new value there 3877 * and get back to where we were left 3878 */ 3879 if ((value != NULL) && (exec->inputStackNr > 0)) { 3880 xmlFARegExecSaveInputString(exec, value, data); 3881 value = exec->inputStack[exec->index].value; 3882 data = exec->inputStack[exec->index].data; 3883 #ifdef DEBUG_PUSH 3884 printf("value loaded: %s\n", value); 3885 #endif 3886 } 3887 3888 while ((exec->status == 0) && 3889 ((value != NULL) || 3890 ((final == 1) && 3891 (exec->state->type != XML_REGEXP_FINAL_STATE)))) { 3892 3893 /* 3894 * End of input on non-terminal state, rollback, however we may 3895 * still have epsilon like transition for counted transitions 3896 * on counters, in that case don't break too early. 3897 */ 3898 if ((value == NULL) && (exec->counts == NULL)) 3899 goto rollback; 3900 3901 exec->transcount = 0; 3902 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 3903 trans = &exec->state->trans[exec->transno]; 3904 if (trans->to < 0) 3905 continue; 3906 atom = trans->atom; 3907 ret = 0; 3908 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 3909 int i; 3910 int count; 3911 xmlRegTransPtr t; 3912 xmlRegCounterPtr counter; 3913 3914 ret = 0; 3915 3916 #ifdef DEBUG_PUSH 3917 printf("testing all lax %d\n", trans->count); 3918 #endif 3919 /* 3920 * Check all counted transitions from the current state 3921 */ 3922 if ((value == NULL) && (final)) { 3923 ret = 1; 3924 } else if (value != NULL) { 3925 for (i = 0;i < exec->state->nbTrans;i++) { 3926 t = &exec->state->trans[i]; 3927 if ((t->counter < 0) || (t == trans)) 3928 continue; 3929 counter = &exec->comp->counters[t->counter]; 3930 count = exec->counts[t->counter]; 3931 if ((count < counter->max) && 3932 (t->atom != NULL) && 3933 (xmlStrEqual(value, t->atom->valuep))) { 3934 ret = 0; 3935 break; 3936 } 3937 if ((count >= counter->min) && 3938 (count < counter->max) && 3939 (t->atom != NULL) && 3940 (xmlStrEqual(value, t->atom->valuep))) { 3941 ret = 1; 3942 break; 3943 } 3944 } 3945 } 3946 } else if (trans->count == REGEXP_ALL_COUNTER) { 3947 int i; 3948 int count; 3949 xmlRegTransPtr t; 3950 xmlRegCounterPtr counter; 3951 3952 ret = 1; 3953 3954 #ifdef DEBUG_PUSH 3955 printf("testing all %d\n", trans->count); 3956 #endif 3957 /* 3958 * Check all counted transitions from the current state 3959 */ 3960 for (i = 0;i < exec->state->nbTrans;i++) { 3961 t = &exec->state->trans[i]; 3962 if ((t->counter < 0) || (t == trans)) 3963 continue; 3964 counter = &exec->comp->counters[t->counter]; 3965 count = exec->counts[t->counter]; 3966 if ((count < counter->min) || (count > counter->max)) { 3967 ret = 0; 3968 break; 3969 } 3970 } 3971 } else if (trans->count >= 0) { 3972 int count; 3973 xmlRegCounterPtr counter; 3974 3975 /* 3976 * A counted transition. 3977 */ 3978 3979 count = exec->counts[trans->count]; 3980 counter = &exec->comp->counters[trans->count]; 3981 #ifdef DEBUG_PUSH 3982 printf("testing count %d: val %d, min %d, max %d\n", 3983 trans->count, count, counter->min, counter->max); 3984 #endif 3985 ret = ((count >= counter->min) && (count <= counter->max)); 3986 } else if (atom == NULL) { 3987 fprintf(stderr, "epsilon transition left at runtime\n"); 3988 exec->status = -2; 3989 break; 3990 } else if (value != NULL) { 3991 ret = xmlRegStrEqualWildcard(atom->valuep, value); 3992 if (atom->neg) { 3993 ret = !ret; 3994 if (!compound) 3995 ret = 0; 3996 } 3997 if ((ret == 1) && (trans->counter >= 0)) { 3998 xmlRegCounterPtr counter; 3999 int count; 4000 4001 count = exec->counts[trans->counter]; 4002 counter = &exec->comp->counters[trans->counter]; 4003 if (count >= counter->max) 4004 ret = 0; 4005 } 4006 4007 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 4008 xmlRegStatePtr to = exec->comp->states[trans->to]; 4009 4010 /* 4011 * this is a multiple input sequence 4012 */ 4013 if (exec->state->nbTrans > exec->transno + 1) { 4014 if (exec->inputStackNr <= 0) { 4015 xmlFARegExecSaveInputString(exec, value, data); 4016 } 4017 xmlFARegExecSave(exec); 4018 } 4019 exec->transcount = 1; 4020 do { 4021 /* 4022 * Try to progress as much as possible on the input 4023 */ 4024 if (exec->transcount == atom->max) { 4025 break; 4026 } 4027 exec->index++; 4028 value = exec->inputStack[exec->index].value; 4029 data = exec->inputStack[exec->index].data; 4030 #ifdef DEBUG_PUSH 4031 printf("value loaded: %s\n", value); 4032 #endif 4033 4034 /* 4035 * End of input: stop here 4036 */ 4037 if (value == NULL) { 4038 exec->index --; 4039 break; 4040 } 4041 if (exec->transcount >= atom->min) { 4042 int transno = exec->transno; 4043 xmlRegStatePtr state = exec->state; 4044 4045 /* 4046 * The transition is acceptable save it 4047 */ 4048 exec->transno = -1; /* trick */ 4049 exec->state = to; 4050 if (exec->inputStackNr <= 0) { 4051 xmlFARegExecSaveInputString(exec, value, data); 4052 } 4053 xmlFARegExecSave(exec); 4054 exec->transno = transno; 4055 exec->state = state; 4056 } 4057 ret = xmlStrEqual(value, atom->valuep); 4058 exec->transcount++; 4059 } while (ret == 1); 4060 if (exec->transcount < atom->min) 4061 ret = 0; 4062 4063 /* 4064 * If the last check failed but one transition was found 4065 * possible, rollback 4066 */ 4067 if (ret < 0) 4068 ret = 0; 4069 if (ret == 0) { 4070 goto rollback; 4071 } 4072 } 4073 } 4074 if (ret == 1) { 4075 if ((exec->callback != NULL) && (atom != NULL) && 4076 (data != NULL)) { 4077 exec->callback(exec->data, atom->valuep, 4078 atom->data, data); 4079 } 4080 if (exec->state->nbTrans > exec->transno + 1) { 4081 if (exec->inputStackNr <= 0) { 4082 xmlFARegExecSaveInputString(exec, value, data); 4083 } 4084 xmlFARegExecSave(exec); 4085 } 4086 if (trans->counter >= 0) { 4087 #ifdef DEBUG_PUSH 4088 printf("Increasing count %d\n", trans->counter); 4089 #endif 4090 exec->counts[trans->counter]++; 4091 } 4092 if ((trans->count >= 0) && 4093 (trans->count < REGEXP_ALL_COUNTER)) { 4094 #ifdef DEBUG_REGEXP_EXEC 4095 printf("resetting count %d on transition\n", 4096 trans->count); 4097 #endif 4098 exec->counts[trans->count] = 0; 4099 } 4100 #ifdef DEBUG_PUSH 4101 printf("entering state %d\n", trans->to); 4102 #endif 4103 if ((exec->comp->states[trans->to] != NULL) && 4104 (exec->comp->states[trans->to]->type == 4105 XML_REGEXP_SINK_STATE)) { 4106 /* 4107 * entering a sink state, save the current state as error 4108 * state. 4109 */ 4110 if (exec->errString != NULL) 4111 xmlFree(exec->errString); 4112 exec->errString = xmlStrdup(value); 4113 exec->errState = exec->state; 4114 memcpy(exec->errCounts, exec->counts, 4115 exec->comp->nbCounters * sizeof(int)); 4116 } 4117 exec->state = exec->comp->states[trans->to]; 4118 exec->transno = 0; 4119 if (trans->atom != NULL) { 4120 if (exec->inputStack != NULL) { 4121 exec->index++; 4122 if (exec->index < exec->inputStackNr) { 4123 value = exec->inputStack[exec->index].value; 4124 data = exec->inputStack[exec->index].data; 4125 #ifdef DEBUG_PUSH 4126 printf("value loaded: %s\n", value); 4127 #endif 4128 } else { 4129 value = NULL; 4130 data = NULL; 4131 #ifdef DEBUG_PUSH 4132 printf("end of input\n"); 4133 #endif 4134 } 4135 } else { 4136 value = NULL; 4137 data = NULL; 4138 #ifdef DEBUG_PUSH 4139 printf("end of input\n"); 4140 #endif 4141 } 4142 } 4143 goto progress; 4144 } else if (ret < 0) { 4145 exec->status = -4; 4146 break; 4147 } 4148 } 4149 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 4150 rollback: 4151 /* 4152 * if we didn't yet rollback on the current input 4153 * store the current state as the error state. 4154 */ 4155 if ((progress) && (exec->state != NULL) && 4156 (exec->state->type != XML_REGEXP_SINK_STATE)) { 4157 progress = 0; 4158 if (exec->errString != NULL) 4159 xmlFree(exec->errString); 4160 exec->errString = xmlStrdup(value); 4161 exec->errState = exec->state; 4162 if (exec->comp->nbCounters) 4163 memcpy(exec->errCounts, exec->counts, 4164 exec->comp->nbCounters * sizeof(int)); 4165 } 4166 4167 /* 4168 * Failed to find a way out 4169 */ 4170 exec->determinist = 0; 4171 xmlFARegExecRollBack(exec); 4172 if ((exec->inputStack != NULL ) && (exec->status == 0)) { 4173 value = exec->inputStack[exec->index].value; 4174 data = exec->inputStack[exec->index].data; 4175 #ifdef DEBUG_PUSH 4176 printf("value loaded: %s\n", value); 4177 #endif 4178 } 4179 } 4180 continue; 4181 progress: 4182 progress = 1; 4183 continue; 4184 } 4185 if (exec->status == 0) { 4186 return(exec->state->type == XML_REGEXP_FINAL_STATE); 4187 } 4188 #ifdef DEBUG_ERR 4189 if (exec->status < 0) { 4190 testerr(exec); 4191 } 4192 #endif 4193 return(exec->status); 4194 } 4195 4196 /** 4197 * xmlRegExecPushString: 4198 * @exec: a regexp execution context or NULL to indicate the end 4199 * @value: a string token input 4200 * @data: data associated to the token to reuse in callbacks 4201 * 4202 * Push one input token in the execution context 4203 * 4204 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 4205 * a negative value in case of error. 4206 */ 4207 int 4208 xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value, 4209 void *data) { 4210 return(xmlRegExecPushStringInternal(exec, value, data, 0)); 4211 } 4212 4213 /** 4214 * xmlRegExecPushString2: 4215 * @exec: a regexp execution context or NULL to indicate the end 4216 * @value: the first string token input 4217 * @value2: the second string token input 4218 * @data: data associated to the token to reuse in callbacks 4219 * 4220 * Push one input token in the execution context 4221 * 4222 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 4223 * a negative value in case of error. 4224 */ 4225 int 4226 xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value, 4227 const xmlChar *value2, void *data) { 4228 xmlChar buf[150]; 4229 int lenn, lenp, ret; 4230 xmlChar *str; 4231 4232 if (exec == NULL) 4233 return(-1); 4234 if (exec->comp == NULL) 4235 return(-1); 4236 if (exec->status != 0) 4237 return(exec->status); 4238 4239 if (value2 == NULL) 4240 return(xmlRegExecPushString(exec, value, data)); 4241 4242 lenn = strlen((char *) value2); 4243 lenp = strlen((char *) value); 4244 4245 if (150 < lenn + lenp + 2) { 4246 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 4247 if (str == NULL) { 4248 exec->status = -1; 4249 return(-1); 4250 } 4251 } else { 4252 str = buf; 4253 } 4254 memcpy(&str[0], value, lenp); 4255 str[lenp] = XML_REG_STRING_SEPARATOR; 4256 memcpy(&str[lenp + 1], value2, lenn); 4257 str[lenn + lenp + 1] = 0; 4258 4259 if (exec->comp->compact != NULL) 4260 ret = xmlRegCompactPushString(exec, exec->comp, str, data); 4261 else 4262 ret = xmlRegExecPushStringInternal(exec, str, data, 1); 4263 4264 if (str != buf) 4265 xmlFree(str); 4266 return(ret); 4267 } 4268 4269 /** 4270 * xmlRegExecGetValues: 4271 * @exec: a regexp execution context 4272 * @err: error extraction or normal one 4273 * @nbval: pointer to the number of accepted values IN/OUT 4274 * @nbneg: return number of negative transitions 4275 * @values: pointer to the array of acceptable values 4276 * @terminal: return value if this was a terminal state 4277 * 4278 * Extract information from the regexp execution, internal routine to 4279 * implement xmlRegExecNextValues() and xmlRegExecErrInfo() 4280 * 4281 * Returns: 0 in case of success or -1 in case of error. 4282 */ 4283 static int 4284 xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err, 4285 int *nbval, int *nbneg, 4286 xmlChar **values, int *terminal) { 4287 int maxval; 4288 int nb = 0; 4289 4290 if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) || 4291 (values == NULL) || (*nbval <= 0)) 4292 return(-1); 4293 4294 maxval = *nbval; 4295 *nbval = 0; 4296 *nbneg = 0; 4297 if ((exec->comp != NULL) && (exec->comp->compact != NULL)) { 4298 xmlRegexpPtr comp; 4299 int target, i, state; 4300 4301 comp = exec->comp; 4302 4303 if (err) { 4304 if (exec->errStateNo == -1) return(-1); 4305 state = exec->errStateNo; 4306 } else { 4307 state = exec->index; 4308 } 4309 if (terminal != NULL) { 4310 if (comp->compact[state * (comp->nbstrings + 1)] == 4311 XML_REGEXP_FINAL_STATE) 4312 *terminal = 1; 4313 else 4314 *terminal = 0; 4315 } 4316 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) { 4317 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 4318 if ((target > 0) && (target <= comp->nbstates) && 4319 (comp->compact[(target - 1) * (comp->nbstrings + 1)] != 4320 XML_REGEXP_SINK_STATE)) { 4321 values[nb++] = comp->stringMap[i]; 4322 (*nbval)++; 4323 } 4324 } 4325 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) { 4326 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 4327 if ((target > 0) && (target <= comp->nbstates) && 4328 (comp->compact[(target - 1) * (comp->nbstrings + 1)] == 4329 XML_REGEXP_SINK_STATE)) { 4330 values[nb++] = comp->stringMap[i]; 4331 (*nbneg)++; 4332 } 4333 } 4334 } else { 4335 int transno; 4336 xmlRegTransPtr trans; 4337 xmlRegAtomPtr atom; 4338 xmlRegStatePtr state; 4339 4340 if (terminal != NULL) { 4341 if (exec->state->type == XML_REGEXP_FINAL_STATE) 4342 *terminal = 1; 4343 else 4344 *terminal = 0; 4345 } 4346 4347 if (err) { 4348 if (exec->errState == NULL) return(-1); 4349 state = exec->errState; 4350 } else { 4351 if (exec->state == NULL) return(-1); 4352 state = exec->state; 4353 } 4354 for (transno = 0; 4355 (transno < state->nbTrans) && (nb < maxval); 4356 transno++) { 4357 trans = &state->trans[transno]; 4358 if (trans->to < 0) 4359 continue; 4360 atom = trans->atom; 4361 if ((atom == NULL) || (atom->valuep == NULL)) 4362 continue; 4363 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 4364 /* this should not be reached but ... */ 4365 TODO; 4366 } else if (trans->count == REGEXP_ALL_COUNTER) { 4367 /* this should not be reached but ... */ 4368 TODO; 4369 } else if (trans->counter >= 0) { 4370 xmlRegCounterPtr counter = NULL; 4371 int count; 4372 4373 if (err) 4374 count = exec->errCounts[trans->counter]; 4375 else 4376 count = exec->counts[trans->counter]; 4377 if (exec->comp != NULL) 4378 counter = &exec->comp->counters[trans->counter]; 4379 if ((counter == NULL) || (count < counter->max)) { 4380 if (atom->neg) 4381 values[nb++] = (xmlChar *) atom->valuep2; 4382 else 4383 values[nb++] = (xmlChar *) atom->valuep; 4384 (*nbval)++; 4385 } 4386 } else { 4387 if ((exec->comp != NULL) && (exec->comp->states[trans->to] != NULL) && 4388 (exec->comp->states[trans->to]->type != 4389 XML_REGEXP_SINK_STATE)) { 4390 if (atom->neg) 4391 values[nb++] = (xmlChar *) atom->valuep2; 4392 else 4393 values[nb++] = (xmlChar *) atom->valuep; 4394 (*nbval)++; 4395 } 4396 } 4397 } 4398 for (transno = 0; 4399 (transno < state->nbTrans) && (nb < maxval); 4400 transno++) { 4401 trans = &state->trans[transno]; 4402 if (trans->to < 0) 4403 continue; 4404 atom = trans->atom; 4405 if ((atom == NULL) || (atom->valuep == NULL)) 4406 continue; 4407 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 4408 continue; 4409 } else if (trans->count == REGEXP_ALL_COUNTER) { 4410 continue; 4411 } else if (trans->counter >= 0) { 4412 continue; 4413 } else { 4414 if ((exec->comp->states[trans->to] != NULL) && 4415 (exec->comp->states[trans->to]->type == 4416 XML_REGEXP_SINK_STATE)) { 4417 if (atom->neg) 4418 values[nb++] = (xmlChar *) atom->valuep2; 4419 else 4420 values[nb++] = (xmlChar *) atom->valuep; 4421 (*nbneg)++; 4422 } 4423 } 4424 } 4425 } 4426 return(0); 4427 } 4428 4429 /** 4430 * xmlRegExecNextValues: 4431 * @exec: a regexp execution context 4432 * @nbval: pointer to the number of accepted values IN/OUT 4433 * @nbneg: return number of negative transitions 4434 * @values: pointer to the array of acceptable values 4435 * @terminal: return value if this was a terminal state 4436 * 4437 * Extract information from the regexp execution, 4438 * the parameter @values must point to an array of @nbval string pointers 4439 * on return nbval will contain the number of possible strings in that 4440 * state and the @values array will be updated with them. The string values 4441 * returned will be freed with the @exec context and don't need to be 4442 * deallocated. 4443 * 4444 * Returns: 0 in case of success or -1 in case of error. 4445 */ 4446 int 4447 xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg, 4448 xmlChar **values, int *terminal) { 4449 return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal)); 4450 } 4451 4452 /** 4453 * xmlRegExecErrInfo: 4454 * @exec: a regexp execution context generating an error 4455 * @string: return value for the error string 4456 * @nbval: pointer to the number of accepted values IN/OUT 4457 * @nbneg: return number of negative transitions 4458 * @values: pointer to the array of acceptable values 4459 * @terminal: return value if this was a terminal state 4460 * 4461 * Extract error information from the regexp execution, the parameter 4462 * @string will be updated with the value pushed and not accepted, 4463 * the parameter @values must point to an array of @nbval string pointers 4464 * on return nbval will contain the number of possible strings in that 4465 * state and the @values array will be updated with them. The string values 4466 * returned will be freed with the @exec context and don't need to be 4467 * deallocated. 4468 * 4469 * Returns: 0 in case of success or -1 in case of error. 4470 */ 4471 int 4472 xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string, 4473 int *nbval, int *nbneg, xmlChar **values, int *terminal) { 4474 if (exec == NULL) 4475 return(-1); 4476 if (string != NULL) { 4477 if (exec->status != 0) 4478 *string = exec->errString; 4479 else 4480 *string = NULL; 4481 } 4482 return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal)); 4483 } 4484 4485 #ifdef DEBUG_ERR 4486 static void testerr(xmlRegExecCtxtPtr exec) { 4487 const xmlChar *string; 4488 xmlChar *values[5]; 4489 int nb = 5; 4490 int nbneg; 4491 int terminal; 4492 xmlRegExecErrInfo(exec, &string, &nb, &nbneg, &values[0], &terminal); 4493 } 4494 #endif 4495 4496 #if 0 4497 static int 4498 xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) { 4499 xmlRegTransPtr trans; 4500 xmlRegAtomPtr atom; 4501 int ret; 4502 int codepoint, len; 4503 4504 if (exec == NULL) 4505 return(-1); 4506 if (exec->status != 0) 4507 return(exec->status); 4508 4509 while ((exec->status == 0) && 4510 ((exec->inputString[exec->index] != 0) || 4511 (exec->state->type != XML_REGEXP_FINAL_STATE))) { 4512 4513 /* 4514 * End of input on non-terminal state, rollback, however we may 4515 * still have epsilon like transition for counted transitions 4516 * on counters, in that case don't break too early. 4517 */ 4518 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) 4519 goto rollback; 4520 4521 exec->transcount = 0; 4522 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 4523 trans = &exec->state->trans[exec->transno]; 4524 if (trans->to < 0) 4525 continue; 4526 atom = trans->atom; 4527 ret = 0; 4528 if (trans->count >= 0) { 4529 int count; 4530 xmlRegCounterPtr counter; 4531 4532 /* 4533 * A counted transition. 4534 */ 4535 4536 count = exec->counts[trans->count]; 4537 counter = &exec->comp->counters[trans->count]; 4538 #ifdef DEBUG_REGEXP_EXEC 4539 printf("testing count %d: val %d, min %d, max %d\n", 4540 trans->count, count, counter->min, counter->max); 4541 #endif 4542 ret = ((count >= counter->min) && (count <= counter->max)); 4543 } else if (atom == NULL) { 4544 fprintf(stderr, "epsilon transition left at runtime\n"); 4545 exec->status = -2; 4546 break; 4547 } else if (exec->inputString[exec->index] != 0) { 4548 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 4549 ret = xmlRegCheckCharacter(atom, codepoint); 4550 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 4551 xmlRegStatePtr to = exec->comp->states[trans->to]; 4552 4553 /* 4554 * this is a multiple input sequence 4555 */ 4556 if (exec->state->nbTrans > exec->transno + 1) { 4557 xmlFARegExecSave(exec); 4558 } 4559 exec->transcount = 1; 4560 do { 4561 /* 4562 * Try to progress as much as possible on the input 4563 */ 4564 if (exec->transcount == atom->max) { 4565 break; 4566 } 4567 exec->index += len; 4568 /* 4569 * End of input: stop here 4570 */ 4571 if (exec->inputString[exec->index] == 0) { 4572 exec->index -= len; 4573 break; 4574 } 4575 if (exec->transcount >= atom->min) { 4576 int transno = exec->transno; 4577 xmlRegStatePtr state = exec->state; 4578 4579 /* 4580 * The transition is acceptable save it 4581 */ 4582 exec->transno = -1; /* trick */ 4583 exec->state = to; 4584 xmlFARegExecSave(exec); 4585 exec->transno = transno; 4586 exec->state = state; 4587 } 4588 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 4589 len); 4590 ret = xmlRegCheckCharacter(atom, codepoint); 4591 exec->transcount++; 4592 } while (ret == 1); 4593 if (exec->transcount < atom->min) 4594 ret = 0; 4595 4596 /* 4597 * If the last check failed but one transition was found 4598 * possible, rollback 4599 */ 4600 if (ret < 0) 4601 ret = 0; 4602 if (ret == 0) { 4603 goto rollback; 4604 } 4605 } 4606 } 4607 if (ret == 1) { 4608 if (exec->state->nbTrans > exec->transno + 1) { 4609 xmlFARegExecSave(exec); 4610 } 4611 /* 4612 * restart count for expressions like this ((abc){2})* 4613 */ 4614 if (trans->count >= 0) { 4615 #ifdef DEBUG_REGEXP_EXEC 4616 printf("Reset count %d\n", trans->count); 4617 #endif 4618 exec->counts[trans->count] = 0; 4619 } 4620 if (trans->counter >= 0) { 4621 #ifdef DEBUG_REGEXP_EXEC 4622 printf("Increasing count %d\n", trans->counter); 4623 #endif 4624 exec->counts[trans->counter]++; 4625 } 4626 #ifdef DEBUG_REGEXP_EXEC 4627 printf("entering state %d\n", trans->to); 4628 #endif 4629 exec->state = exec->comp->states[trans->to]; 4630 exec->transno = 0; 4631 if (trans->atom != NULL) { 4632 exec->index += len; 4633 } 4634 goto progress; 4635 } else if (ret < 0) { 4636 exec->status = -4; 4637 break; 4638 } 4639 } 4640 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 4641 rollback: 4642 /* 4643 * Failed to find a way out 4644 */ 4645 exec->determinist = 0; 4646 xmlFARegExecRollBack(exec); 4647 } 4648 progress: 4649 continue; 4650 } 4651 } 4652 #endif 4653 /************************************************************************ 4654 * * 4655 * Parser for the Schemas Datatype Regular Expressions * 4656 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs * 4657 * * 4658 ************************************************************************/ 4659 4660 /** 4661 * xmlFAIsChar: 4662 * @ctxt: a regexp parser context 4663 * 4664 * [10] Char ::= [^.\?*+()|#x5B#x5D] 4665 */ 4666 static int 4667 xmlFAIsChar(xmlRegParserCtxtPtr ctxt) { 4668 int cur; 4669 int len; 4670 4671 cur = CUR_SCHAR(ctxt->cur, len); 4672 if ((cur == '.') || (cur == '\\') || (cur == '?') || 4673 (cur == '*') || (cur == '+') || (cur == '(') || 4674 (cur == ')') || (cur == '|') || (cur == 0x5B) || 4675 (cur == 0x5D) || (cur == 0)) 4676 return(-1); 4677 return(cur); 4678 } 4679 4680 /** 4681 * xmlFAParseCharProp: 4682 * @ctxt: a regexp parser context 4683 * 4684 * [27] charProp ::= IsCategory | IsBlock 4685 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation | 4686 * Separators | Symbols | Others 4687 * [29] Letters ::= 'L' [ultmo]? 4688 * [30] Marks ::= 'M' [nce]? 4689 * [31] Numbers ::= 'N' [dlo]? 4690 * [32] Punctuation ::= 'P' [cdseifo]? 4691 * [33] Separators ::= 'Z' [slp]? 4692 * [34] Symbols ::= 'S' [mcko]? 4693 * [35] Others ::= 'C' [cfon]? 4694 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+ 4695 */ 4696 static void 4697 xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) { 4698 int cur; 4699 xmlRegAtomType type = (xmlRegAtomType) 0; 4700 xmlChar *blockName = NULL; 4701 4702 cur = CUR; 4703 if (cur == 'L') { 4704 NEXT; 4705 cur = CUR; 4706 if (cur == 'u') { 4707 NEXT; 4708 type = XML_REGEXP_LETTER_UPPERCASE; 4709 } else if (cur == 'l') { 4710 NEXT; 4711 type = XML_REGEXP_LETTER_LOWERCASE; 4712 } else if (cur == 't') { 4713 NEXT; 4714 type = XML_REGEXP_LETTER_TITLECASE; 4715 } else if (cur == 'm') { 4716 NEXT; 4717 type = XML_REGEXP_LETTER_MODIFIER; 4718 } else if (cur == 'o') { 4719 NEXT; 4720 type = XML_REGEXP_LETTER_OTHERS; 4721 } else { 4722 type = XML_REGEXP_LETTER; 4723 } 4724 } else if (cur == 'M') { 4725 NEXT; 4726 cur = CUR; 4727 if (cur == 'n') { 4728 NEXT; 4729 /* nonspacing */ 4730 type = XML_REGEXP_MARK_NONSPACING; 4731 } else if (cur == 'c') { 4732 NEXT; 4733 /* spacing combining */ 4734 type = XML_REGEXP_MARK_SPACECOMBINING; 4735 } else if (cur == 'e') { 4736 NEXT; 4737 /* enclosing */ 4738 type = XML_REGEXP_MARK_ENCLOSING; 4739 } else { 4740 /* all marks */ 4741 type = XML_REGEXP_MARK; 4742 } 4743 } else if (cur == 'N') { 4744 NEXT; 4745 cur = CUR; 4746 if (cur == 'd') { 4747 NEXT; 4748 /* digital */ 4749 type = XML_REGEXP_NUMBER_DECIMAL; 4750 } else if (cur == 'l') { 4751 NEXT; 4752 /* letter */ 4753 type = XML_REGEXP_NUMBER_LETTER; 4754 } else if (cur == 'o') { 4755 NEXT; 4756 /* other */ 4757 type = XML_REGEXP_NUMBER_OTHERS; 4758 } else { 4759 /* all numbers */ 4760 type = XML_REGEXP_NUMBER; 4761 } 4762 } else if (cur == 'P') { 4763 NEXT; 4764 cur = CUR; 4765 if (cur == 'c') { 4766 NEXT; 4767 /* connector */ 4768 type = XML_REGEXP_PUNCT_CONNECTOR; 4769 } else if (cur == 'd') { 4770 NEXT; 4771 /* dash */ 4772 type = XML_REGEXP_PUNCT_DASH; 4773 } else if (cur == 's') { 4774 NEXT; 4775 /* open */ 4776 type = XML_REGEXP_PUNCT_OPEN; 4777 } else if (cur == 'e') { 4778 NEXT; 4779 /* close */ 4780 type = XML_REGEXP_PUNCT_CLOSE; 4781 } else if (cur == 'i') { 4782 NEXT; 4783 /* initial quote */ 4784 type = XML_REGEXP_PUNCT_INITQUOTE; 4785 } else if (cur == 'f') { 4786 NEXT; 4787 /* final quote */ 4788 type = XML_REGEXP_PUNCT_FINQUOTE; 4789 } else if (cur == 'o') { 4790 NEXT; 4791 /* other */ 4792 type = XML_REGEXP_PUNCT_OTHERS; 4793 } else { 4794 /* all punctuation */ 4795 type = XML_REGEXP_PUNCT; 4796 } 4797 } else if (cur == 'Z') { 4798 NEXT; 4799 cur = CUR; 4800 if (cur == 's') { 4801 NEXT; 4802 /* space */ 4803 type = XML_REGEXP_SEPAR_SPACE; 4804 } else if (cur == 'l') { 4805 NEXT; 4806 /* line */ 4807 type = XML_REGEXP_SEPAR_LINE; 4808 } else if (cur == 'p') { 4809 NEXT; 4810 /* paragraph */ 4811 type = XML_REGEXP_SEPAR_PARA; 4812 } else { 4813 /* all separators */ 4814 type = XML_REGEXP_SEPAR; 4815 } 4816 } else if (cur == 'S') { 4817 NEXT; 4818 cur = CUR; 4819 if (cur == 'm') { 4820 NEXT; 4821 type = XML_REGEXP_SYMBOL_MATH; 4822 /* math */ 4823 } else if (cur == 'c') { 4824 NEXT; 4825 type = XML_REGEXP_SYMBOL_CURRENCY; 4826 /* currency */ 4827 } else if (cur == 'k') { 4828 NEXT; 4829 type = XML_REGEXP_SYMBOL_MODIFIER; 4830 /* modifiers */ 4831 } else if (cur == 'o') { 4832 NEXT; 4833 type = XML_REGEXP_SYMBOL_OTHERS; 4834 /* other */ 4835 } else { 4836 /* all symbols */ 4837 type = XML_REGEXP_SYMBOL; 4838 } 4839 } else if (cur == 'C') { 4840 NEXT; 4841 cur = CUR; 4842 if (cur == 'c') { 4843 NEXT; 4844 /* control */ 4845 type = XML_REGEXP_OTHER_CONTROL; 4846 } else if (cur == 'f') { 4847 NEXT; 4848 /* format */ 4849 type = XML_REGEXP_OTHER_FORMAT; 4850 } else if (cur == 'o') { 4851 NEXT; 4852 /* private use */ 4853 type = XML_REGEXP_OTHER_PRIVATE; 4854 } else if (cur == 'n') { 4855 NEXT; 4856 /* not assigned */ 4857 type = XML_REGEXP_OTHER_NA; 4858 } else { 4859 /* all others */ 4860 type = XML_REGEXP_OTHER; 4861 } 4862 } else if (cur == 'I') { 4863 const xmlChar *start; 4864 NEXT; 4865 cur = CUR; 4866 if (cur != 's') { 4867 ERROR("IsXXXX expected"); 4868 return; 4869 } 4870 NEXT; 4871 start = ctxt->cur; 4872 cur = CUR; 4873 if (((cur >= 'a') && (cur <= 'z')) || 4874 ((cur >= 'A') && (cur <= 'Z')) || 4875 ((cur >= '0') && (cur <= '9')) || 4876 (cur == 0x2D)) { 4877 NEXT; 4878 cur = CUR; 4879 while (((cur >= 'a') && (cur <= 'z')) || 4880 ((cur >= 'A') && (cur <= 'Z')) || 4881 ((cur >= '0') && (cur <= '9')) || 4882 (cur == 0x2D)) { 4883 NEXT; 4884 cur = CUR; 4885 } 4886 } 4887 type = XML_REGEXP_BLOCK_NAME; 4888 blockName = xmlStrndup(start, ctxt->cur - start); 4889 } else { 4890 ERROR("Unknown char property"); 4891 return; 4892 } 4893 if (ctxt->atom == NULL) { 4894 ctxt->atom = xmlRegNewAtom(ctxt, type); 4895 if (ctxt->atom != NULL) 4896 ctxt->atom->valuep = blockName; 4897 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4898 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4899 type, 0, 0, blockName); 4900 } 4901 } 4902 4903 static int parse_escaped_codeunit(xmlRegParserCtxtPtr ctxt) 4904 { 4905 int val = 0, i, cur; 4906 for (i = 0; i < 4; i++) { 4907 NEXT; 4908 val *= 16; 4909 cur = CUR; 4910 if (cur >= '0' && cur <= '9') { 4911 val += cur - '0'; 4912 } else if (cur >= 'A' && cur <= 'F') { 4913 val += cur - 'A' + 10; 4914 } else if (cur >= 'a' && cur <= 'f') { 4915 val += cur - 'a' + 10; 4916 } else { 4917 ERROR("Expecting hex digit"); 4918 return -1; 4919 } 4920 } 4921 return val; 4922 } 4923 4924 static int parse_escaped_codepoint(xmlRegParserCtxtPtr ctxt) 4925 { 4926 int val = parse_escaped_codeunit(ctxt); 4927 if (0xD800 <= val && val <= 0xDBFF) { 4928 NEXT; 4929 if (CUR == '\\') { 4930 NEXT; 4931 if (CUR == 'u') { 4932 int low = parse_escaped_codeunit(ctxt); 4933 if (0xDC00 <= low && low <= 0xDFFF) { 4934 return (val - 0xD800) * 0x400 + (low - 0xDC00) + 0x10000; 4935 } 4936 } 4937 } 4938 ERROR("Invalid low surrogate pair code unit"); 4939 val = -1; 4940 } 4941 return val; 4942 } 4943 4944 /** 4945 * xmlFAParseCharClassEsc: 4946 * @ctxt: a regexp parser context 4947 * 4948 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc ) 4949 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E] 4950 * [25] catEsc ::= '\p{' charProp '}' 4951 * [26] complEsc ::= '\P{' charProp '}' 4952 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW]) 4953 */ 4954 static void 4955 xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) { 4956 int cur; 4957 4958 if (CUR == '.') { 4959 if (ctxt->atom == NULL) { 4960 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR); 4961 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4962 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4963 XML_REGEXP_ANYCHAR, 0, 0, NULL); 4964 } 4965 NEXT; 4966 return; 4967 } 4968 if (CUR != '\\') { 4969 ERROR("Escaped sequence: expecting \\"); 4970 return; 4971 } 4972 NEXT; 4973 cur = CUR; 4974 if (cur == 'p') { 4975 NEXT; 4976 if (CUR != '{') { 4977 ERROR("Expecting '{'"); 4978 return; 4979 } 4980 NEXT; 4981 xmlFAParseCharProp(ctxt); 4982 if (CUR != '}') { 4983 ERROR("Expecting '}'"); 4984 return; 4985 } 4986 NEXT; 4987 } else if (cur == 'P') { 4988 NEXT; 4989 if (CUR != '{') { 4990 ERROR("Expecting '{'"); 4991 return; 4992 } 4993 NEXT; 4994 xmlFAParseCharProp(ctxt); 4995 if (ctxt->atom != NULL) 4996 ctxt->atom->neg = 1; 4997 if (CUR != '}') { 4998 ERROR("Expecting '}'"); 4999 return; 5000 } 5001 NEXT; 5002 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') || 5003 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') || 5004 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') || 5005 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) || 5006 (cur == 0x5E) || 5007 5008 /* Non-standard escape sequences: 5009 * Java 1.8|.NET Core 3.1|MSXML 6 */ 5010 (cur == '!') || /* + | + | + */ 5011 (cur == '"') || /* + | + | + */ 5012 (cur == '#') || /* + | + | + */ 5013 (cur == '$') || /* + | + | + */ 5014 (cur == '%') || /* + | + | + */ 5015 (cur == ',') || /* + | + | + */ 5016 (cur == '/') || /* + | + | + */ 5017 (cur == ':') || /* + | + | + */ 5018 (cur == ';') || /* + | + | + */ 5019 (cur == '=') || /* + | + | + */ 5020 (cur == '>') || /* | + | + */ 5021 (cur == '@') || /* + | + | + */ 5022 (cur == '`') || /* + | + | + */ 5023 (cur == '~') || /* + | + | + */ 5024 (cur == 'u')) { /* | + | + */ 5025 if (ctxt->atom == NULL) { 5026 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 5027 if (ctxt->atom != NULL) { 5028 switch (cur) { 5029 case 'n': 5030 ctxt->atom->codepoint = '\n'; 5031 break; 5032 case 'r': 5033 ctxt->atom->codepoint = '\r'; 5034 break; 5035 case 't': 5036 ctxt->atom->codepoint = '\t'; 5037 break; 5038 case 'u': 5039 cur = parse_escaped_codepoint(ctxt); 5040 if (cur < 0) { 5041 return; 5042 } 5043 ctxt->atom->codepoint = cur; 5044 break; 5045 default: 5046 ctxt->atom->codepoint = cur; 5047 } 5048 } 5049 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 5050 switch (cur) { 5051 case 'n': 5052 cur = '\n'; 5053 break; 5054 case 'r': 5055 cur = '\r'; 5056 break; 5057 case 't': 5058 cur = '\t'; 5059 break; 5060 } 5061 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 5062 XML_REGEXP_CHARVAL, cur, cur, NULL); 5063 } 5064 NEXT; 5065 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') || 5066 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') || 5067 (cur == 'w') || (cur == 'W')) { 5068 xmlRegAtomType type = XML_REGEXP_ANYSPACE; 5069 5070 switch (cur) { 5071 case 's': 5072 type = XML_REGEXP_ANYSPACE; 5073 break; 5074 case 'S': 5075 type = XML_REGEXP_NOTSPACE; 5076 break; 5077 case 'i': 5078 type = XML_REGEXP_INITNAME; 5079 break; 5080 case 'I': 5081 type = XML_REGEXP_NOTINITNAME; 5082 break; 5083 case 'c': 5084 type = XML_REGEXP_NAMECHAR; 5085 break; 5086 case 'C': 5087 type = XML_REGEXP_NOTNAMECHAR; 5088 break; 5089 case 'd': 5090 type = XML_REGEXP_DECIMAL; 5091 break; 5092 case 'D': 5093 type = XML_REGEXP_NOTDECIMAL; 5094 break; 5095 case 'w': 5096 type = XML_REGEXP_REALCHAR; 5097 break; 5098 case 'W': 5099 type = XML_REGEXP_NOTREALCHAR; 5100 break; 5101 } 5102 NEXT; 5103 if (ctxt->atom == NULL) { 5104 ctxt->atom = xmlRegNewAtom(ctxt, type); 5105 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 5106 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 5107 type, 0, 0, NULL); 5108 } 5109 } else { 5110 ERROR("Wrong escape sequence, misuse of character '\\'"); 5111 } 5112 } 5113 5114 /** 5115 * xmlFAParseCharRange: 5116 * @ctxt: a regexp parser context 5117 * 5118 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash 5119 * [18] seRange ::= charOrEsc '-' charOrEsc 5120 * [20] charOrEsc ::= XmlChar | SingleCharEsc 5121 * [21] XmlChar ::= [^\#x2D#x5B#x5D] 5122 * [22] XmlCharIncDash ::= [^\#x5B#x5D] 5123 */ 5124 static void 5125 xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) { 5126 int cur, len; 5127 int start = -1; 5128 int end = -1; 5129 5130 if (CUR == '\0') { 5131 ERROR("Expecting ']'"); 5132 return; 5133 } 5134 5135 cur = CUR; 5136 if (cur == '\\') { 5137 NEXT; 5138 cur = CUR; 5139 switch (cur) { 5140 case 'n': start = 0xA; break; 5141 case 'r': start = 0xD; break; 5142 case 't': start = 0x9; break; 5143 case '\\': case '|': case '.': case '-': case '^': case '?': 5144 case '*': case '+': case '{': case '}': case '(': case ')': 5145 case '[': case ']': 5146 start = cur; break; 5147 default: 5148 ERROR("Invalid escape value"); 5149 return; 5150 } 5151 end = start; 5152 len = 1; 5153 } else if ((cur != 0x5B) && (cur != 0x5D)) { 5154 end = start = CUR_SCHAR(ctxt->cur, len); 5155 } else { 5156 ERROR("Expecting a char range"); 5157 return; 5158 } 5159 /* 5160 * Since we are "inside" a range, we can assume ctxt->cur is past 5161 * the start of ctxt->string, and PREV should be safe 5162 */ 5163 if ((start == '-') && (NXT(1) != ']') && (PREV != '[') && (PREV != '^')) { 5164 NEXTL(len); 5165 return; 5166 } 5167 NEXTL(len); 5168 cur = CUR; 5169 if ((cur != '-') || (NXT(1) == '[') || (NXT(1) == ']')) { 5170 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 5171 XML_REGEXP_CHARVAL, start, end, NULL); 5172 return; 5173 } 5174 NEXT; 5175 cur = CUR; 5176 if (cur == '\\') { 5177 NEXT; 5178 cur = CUR; 5179 switch (cur) { 5180 case 'n': end = 0xA; break; 5181 case 'r': end = 0xD; break; 5182 case 't': end = 0x9; break; 5183 case '\\': case '|': case '.': case '-': case '^': case '?': 5184 case '*': case '+': case '{': case '}': case '(': case ')': 5185 case '[': case ']': 5186 end = cur; break; 5187 default: 5188 ERROR("Invalid escape value"); 5189 return; 5190 } 5191 len = 1; 5192 } else if ((cur != '\0') && (cur != 0x5B) && (cur != 0x5D)) { 5193 end = CUR_SCHAR(ctxt->cur, len); 5194 } else { 5195 ERROR("Expecting the end of a char range"); 5196 return; 5197 } 5198 5199 /* TODO check that the values are acceptable character ranges for XML */ 5200 if (end < start) { 5201 ERROR("End of range is before start of range"); 5202 } else { 5203 NEXTL(len); 5204 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 5205 XML_REGEXP_CHARVAL, start, end, NULL); 5206 } 5207 return; 5208 } 5209 5210 /** 5211 * xmlFAParsePosCharGroup: 5212 * @ctxt: a regexp parser context 5213 * 5214 * [14] posCharGroup ::= ( charRange | charClassEsc )+ 5215 */ 5216 static void 5217 xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) { 5218 do { 5219 if (CUR == '\\') { 5220 xmlFAParseCharClassEsc(ctxt); 5221 } else { 5222 xmlFAParseCharRange(ctxt); 5223 } 5224 } while ((CUR != ']') && (CUR != '-') && 5225 (CUR != 0) && (ctxt->error == 0)); 5226 } 5227 5228 /** 5229 * xmlFAParseCharGroup: 5230 * @ctxt: a regexp parser context 5231 * 5232 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub 5233 * [15] negCharGroup ::= '^' posCharGroup 5234 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr 5235 * [12] charClassExpr ::= '[' charGroup ']' 5236 */ 5237 static void 5238 xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) { 5239 int neg = ctxt->neg; 5240 5241 if (CUR == '^') { 5242 NEXT; 5243 ctxt->neg = !ctxt->neg; 5244 xmlFAParsePosCharGroup(ctxt); 5245 ctxt->neg = neg; 5246 } 5247 while ((CUR != ']') && (ctxt->error == 0)) { 5248 if ((CUR == '-') && (NXT(1) == '[')) { 5249 NEXT; /* eat the '-' */ 5250 NEXT; /* eat the '[' */ 5251 ctxt->neg = 2; 5252 xmlFAParseCharGroup(ctxt); 5253 ctxt->neg = neg; 5254 if (CUR == ']') { 5255 NEXT; 5256 } else { 5257 ERROR("charClassExpr: ']' expected"); 5258 } 5259 break; 5260 } else { 5261 xmlFAParsePosCharGroup(ctxt); 5262 } 5263 } 5264 } 5265 5266 /** 5267 * xmlFAParseCharClass: 5268 * @ctxt: a regexp parser context 5269 * 5270 * [11] charClass ::= charClassEsc | charClassExpr 5271 * [12] charClassExpr ::= '[' charGroup ']' 5272 */ 5273 static void 5274 xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) { 5275 if (CUR == '[') { 5276 NEXT; 5277 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES); 5278 if (ctxt->atom == NULL) 5279 return; 5280 xmlFAParseCharGroup(ctxt); 5281 if (CUR == ']') { 5282 NEXT; 5283 } else { 5284 ERROR("xmlFAParseCharClass: ']' expected"); 5285 } 5286 } else { 5287 xmlFAParseCharClassEsc(ctxt); 5288 } 5289 } 5290 5291 /** 5292 * xmlFAParseQuantExact: 5293 * @ctxt: a regexp parser context 5294 * 5295 * [8] QuantExact ::= [0-9]+ 5296 * 5297 * Returns 0 if success or -1 in case of error 5298 */ 5299 static int 5300 xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) { 5301 int ret = 0; 5302 int ok = 0; 5303 int overflow = 0; 5304 5305 while ((CUR >= '0') && (CUR <= '9')) { 5306 if (ret > INT_MAX / 10) { 5307 overflow = 1; 5308 } else { 5309 int digit = CUR - '0'; 5310 5311 ret *= 10; 5312 if (ret > INT_MAX - digit) 5313 overflow = 1; 5314 else 5315 ret += digit; 5316 } 5317 ok = 1; 5318 NEXT; 5319 } 5320 if ((ok != 1) || (overflow == 1)) { 5321 return(-1); 5322 } 5323 return(ret); 5324 } 5325 5326 /** 5327 * xmlFAParseQuantifier: 5328 * @ctxt: a regexp parser context 5329 * 5330 * [4] quantifier ::= [?*+] | ( '{' quantity '}' ) 5331 * [5] quantity ::= quantRange | quantMin | QuantExact 5332 * [6] quantRange ::= QuantExact ',' QuantExact 5333 * [7] quantMin ::= QuantExact ',' 5334 * [8] QuantExact ::= [0-9]+ 5335 */ 5336 static int 5337 xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) { 5338 int cur; 5339 5340 cur = CUR; 5341 if ((cur == '?') || (cur == '*') || (cur == '+')) { 5342 if (ctxt->atom != NULL) { 5343 if (cur == '?') 5344 ctxt->atom->quant = XML_REGEXP_QUANT_OPT; 5345 else if (cur == '*') 5346 ctxt->atom->quant = XML_REGEXP_QUANT_MULT; 5347 else if (cur == '+') 5348 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS; 5349 } 5350 NEXT; 5351 return(1); 5352 } 5353 if (cur == '{') { 5354 int min = 0, max = 0; 5355 5356 NEXT; 5357 cur = xmlFAParseQuantExact(ctxt); 5358 if (cur >= 0) 5359 min = cur; 5360 else { 5361 ERROR("Improper quantifier"); 5362 } 5363 if (CUR == ',') { 5364 NEXT; 5365 if (CUR == '}') 5366 max = INT_MAX; 5367 else { 5368 cur = xmlFAParseQuantExact(ctxt); 5369 if (cur >= 0) 5370 max = cur; 5371 else { 5372 ERROR("Improper quantifier"); 5373 } 5374 } 5375 } 5376 if (CUR == '}') { 5377 NEXT; 5378 } else { 5379 ERROR("Unterminated quantifier"); 5380 } 5381 if (max == 0) 5382 max = min; 5383 if (ctxt->atom != NULL) { 5384 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE; 5385 ctxt->atom->min = min; 5386 ctxt->atom->max = max; 5387 } 5388 return(1); 5389 } 5390 return(0); 5391 } 5392 5393 /** 5394 * xmlFAParseAtom: 5395 * @ctxt: a regexp parser context 5396 * 5397 * [9] atom ::= Char | charClass | ( '(' regExp ')' ) 5398 */ 5399 static int 5400 xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) { 5401 int codepoint, len; 5402 5403 codepoint = xmlFAIsChar(ctxt); 5404 if (codepoint > 0) { 5405 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 5406 if (ctxt->atom == NULL) 5407 return(-1); 5408 codepoint = CUR_SCHAR(ctxt->cur, len); 5409 ctxt->atom->codepoint = codepoint; 5410 NEXTL(len); 5411 return(1); 5412 } else if (CUR == '|') { 5413 return(0); 5414 } else if (CUR == 0) { 5415 return(0); 5416 } else if (CUR == ')') { 5417 return(0); 5418 } else if (CUR == '(') { 5419 xmlRegStatePtr start, oldend, start0; 5420 5421 NEXT; 5422 if (ctxt->depth >= 50) { 5423 ERROR("xmlFAParseAtom: maximum nesting depth exceeded"); 5424 return(-1); 5425 } 5426 /* 5427 * this extra Epsilon transition is needed if we count with 0 allowed 5428 * unfortunately this can't be known at that point 5429 */ 5430 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL); 5431 start0 = ctxt->state; 5432 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL); 5433 start = ctxt->state; 5434 oldend = ctxt->end; 5435 ctxt->end = NULL; 5436 ctxt->atom = NULL; 5437 ctxt->depth++; 5438 xmlFAParseRegExp(ctxt, 0); 5439 ctxt->depth--; 5440 if (CUR == ')') { 5441 NEXT; 5442 } else { 5443 ERROR("xmlFAParseAtom: expecting ')'"); 5444 } 5445 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG); 5446 if (ctxt->atom == NULL) 5447 return(-1); 5448 ctxt->atom->start = start; 5449 ctxt->atom->start0 = start0; 5450 ctxt->atom->stop = ctxt->state; 5451 ctxt->end = oldend; 5452 return(1); 5453 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) { 5454 xmlFAParseCharClass(ctxt); 5455 return(1); 5456 } 5457 return(0); 5458 } 5459 5460 /** 5461 * xmlFAParsePiece: 5462 * @ctxt: a regexp parser context 5463 * 5464 * [3] piece ::= atom quantifier? 5465 */ 5466 static int 5467 xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) { 5468 int ret; 5469 5470 ctxt->atom = NULL; 5471 ret = xmlFAParseAtom(ctxt); 5472 if (ret == 0) 5473 return(0); 5474 if (ctxt->atom == NULL) { 5475 ERROR("internal: no atom generated"); 5476 } 5477 xmlFAParseQuantifier(ctxt); 5478 return(1); 5479 } 5480 5481 /** 5482 * xmlFAParseBranch: 5483 * @ctxt: a regexp parser context 5484 * @to: optional target to the end of the branch 5485 * 5486 * @to is used to optimize by removing duplicate path in automata 5487 * in expressions like (a|b)(c|d) 5488 * 5489 * [2] branch ::= piece* 5490 */ 5491 static int 5492 xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) { 5493 xmlRegStatePtr previous; 5494 int ret; 5495 5496 previous = ctxt->state; 5497 ret = xmlFAParsePiece(ctxt); 5498 if (ret == 0) { 5499 /* Empty branch */ 5500 xmlFAGenerateEpsilonTransition(ctxt, previous, to); 5501 } else { 5502 if (xmlFAGenerateTransitions(ctxt, previous, 5503 (CUR=='|' || CUR==')' || CUR==0) ? to : NULL, ctxt->atom) < 0) 5504 return(-1); 5505 previous = ctxt->state; 5506 ctxt->atom = NULL; 5507 } 5508 while ((ret != 0) && (ctxt->error == 0)) { 5509 ret = xmlFAParsePiece(ctxt); 5510 if (ret != 0) { 5511 if (xmlFAGenerateTransitions(ctxt, previous, 5512 (CUR=='|' || CUR==')' || CUR==0) ? to : NULL, 5513 ctxt->atom) < 0) 5514 return(-1); 5515 previous = ctxt->state; 5516 ctxt->atom = NULL; 5517 } 5518 } 5519 return(0); 5520 } 5521 5522 /** 5523 * xmlFAParseRegExp: 5524 * @ctxt: a regexp parser context 5525 * @top: is this the top-level expression ? 5526 * 5527 * [1] regExp ::= branch ( '|' branch )* 5528 */ 5529 static void 5530 xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) { 5531 xmlRegStatePtr start, end; 5532 5533 /* if not top start should have been generated by an epsilon trans */ 5534 start = ctxt->state; 5535 ctxt->end = NULL; 5536 xmlFAParseBranch(ctxt, NULL); 5537 if (top) { 5538 #ifdef DEBUG_REGEXP_GRAPH 5539 printf("State %d is final\n", ctxt->state->no); 5540 #endif 5541 ctxt->state->type = XML_REGEXP_FINAL_STATE; 5542 } 5543 if (CUR != '|') { 5544 ctxt->end = ctxt->state; 5545 return; 5546 } 5547 end = ctxt->state; 5548 while ((CUR == '|') && (ctxt->error == 0)) { 5549 NEXT; 5550 ctxt->state = start; 5551 ctxt->end = NULL; 5552 xmlFAParseBranch(ctxt, end); 5553 } 5554 if (!top) { 5555 ctxt->state = end; 5556 ctxt->end = end; 5557 } 5558 } 5559 5560 /************************************************************************ 5561 * * 5562 * The basic API * 5563 * * 5564 ************************************************************************/ 5565 5566 /** 5567 * xmlRegexpPrint: 5568 * @output: the file for the output debug 5569 * @regexp: the compiled regexp 5570 * 5571 * Print the content of the compiled regular expression 5572 */ 5573 void 5574 xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) { 5575 int i; 5576 5577 if (output == NULL) 5578 return; 5579 fprintf(output, " regexp: "); 5580 if (regexp == NULL) { 5581 fprintf(output, "NULL\n"); 5582 return; 5583 } 5584 fprintf(output, "'%s' ", regexp->string); 5585 fprintf(output, "\n"); 5586 fprintf(output, "%d atoms:\n", regexp->nbAtoms); 5587 for (i = 0;i < regexp->nbAtoms; i++) { 5588 fprintf(output, " %02d ", i); 5589 xmlRegPrintAtom(output, regexp->atoms[i]); 5590 } 5591 fprintf(output, "%d states:", regexp->nbStates); 5592 fprintf(output, "\n"); 5593 for (i = 0;i < regexp->nbStates; i++) { 5594 xmlRegPrintState(output, regexp->states[i]); 5595 } 5596 fprintf(output, "%d counters:\n", regexp->nbCounters); 5597 for (i = 0;i < regexp->nbCounters; i++) { 5598 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min, 5599 regexp->counters[i].max); 5600 } 5601 } 5602 5603 /** 5604 * xmlRegexpCompile: 5605 * @regexp: a regular expression string 5606 * 5607 * Parses a regular expression conforming to XML Schemas Part 2 Datatype 5608 * Appendix F and builds an automata suitable for testing strings against 5609 * that regular expression 5610 * 5611 * Returns the compiled expression or NULL in case of error 5612 */ 5613 xmlRegexpPtr 5614 xmlRegexpCompile(const xmlChar *regexp) { 5615 xmlRegexpPtr ret; 5616 xmlRegParserCtxtPtr ctxt; 5617 5618 ctxt = xmlRegNewParserCtxt(regexp); 5619 if (ctxt == NULL) 5620 return(NULL); 5621 5622 /* initialize the parser */ 5623 ctxt->end = NULL; 5624 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 5625 xmlRegStatePush(ctxt, ctxt->start); 5626 5627 /* parse the expression building an automata */ 5628 xmlFAParseRegExp(ctxt, 1); 5629 if (CUR != 0) { 5630 ERROR("xmlFAParseRegExp: extra characters"); 5631 } 5632 if (ctxt->error != 0) { 5633 xmlRegFreeParserCtxt(ctxt); 5634 return(NULL); 5635 } 5636 ctxt->end = ctxt->state; 5637 ctxt->start->type = XML_REGEXP_START_STATE; 5638 ctxt->end->type = XML_REGEXP_FINAL_STATE; 5639 5640 /* remove the Epsilon except for counted transitions */ 5641 xmlFAEliminateEpsilonTransitions(ctxt); 5642 5643 5644 if (ctxt->error != 0) { 5645 xmlRegFreeParserCtxt(ctxt); 5646 return(NULL); 5647 } 5648 ret = xmlRegEpxFromParse(ctxt); 5649 xmlRegFreeParserCtxt(ctxt); 5650 return(ret); 5651 } 5652 5653 /** 5654 * xmlRegexpExec: 5655 * @comp: the compiled regular expression 5656 * @content: the value to check against the regular expression 5657 * 5658 * Check if the regular expression generates the value 5659 * 5660 * Returns 1 if it matches, 0 if not and a negative value in case of error 5661 */ 5662 int 5663 xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) { 5664 if ((comp == NULL) || (content == NULL)) 5665 return(-1); 5666 return(xmlFARegExec(comp, content)); 5667 } 5668 5669 /** 5670 * xmlRegexpIsDeterminist: 5671 * @comp: the compiled regular expression 5672 * 5673 * Check if the regular expression is determinist 5674 * 5675 * Returns 1 if it yes, 0 if not and a negative value in case of error 5676 */ 5677 int 5678 xmlRegexpIsDeterminist(xmlRegexpPtr comp) { 5679 xmlAutomataPtr am; 5680 int ret; 5681 5682 if (comp == NULL) 5683 return(-1); 5684 if (comp->determinist != -1) 5685 return(comp->determinist); 5686 5687 am = xmlNewAutomata(); 5688 if (am == NULL) 5689 return(-1); 5690 if (am->states != NULL) { 5691 int i; 5692 5693 for (i = 0;i < am->nbStates;i++) 5694 xmlRegFreeState(am->states[i]); 5695 xmlFree(am->states); 5696 } 5697 am->nbAtoms = comp->nbAtoms; 5698 am->atoms = comp->atoms; 5699 am->nbStates = comp->nbStates; 5700 am->states = comp->states; 5701 am->determinist = -1; 5702 am->flags = comp->flags; 5703 ret = xmlFAComputesDeterminism(am); 5704 am->atoms = NULL; 5705 am->states = NULL; 5706 xmlFreeAutomata(am); 5707 comp->determinist = ret; 5708 return(ret); 5709 } 5710 5711 /** 5712 * xmlRegFreeRegexp: 5713 * @regexp: the regexp 5714 * 5715 * Free a regexp 5716 */ 5717 void 5718 xmlRegFreeRegexp(xmlRegexpPtr regexp) { 5719 int i; 5720 if (regexp == NULL) 5721 return; 5722 5723 if (regexp->string != NULL) 5724 xmlFree(regexp->string); 5725 if (regexp->states != NULL) { 5726 for (i = 0;i < regexp->nbStates;i++) 5727 xmlRegFreeState(regexp->states[i]); 5728 xmlFree(regexp->states); 5729 } 5730 if (regexp->atoms != NULL) { 5731 for (i = 0;i < regexp->nbAtoms;i++) 5732 xmlRegFreeAtom(regexp->atoms[i]); 5733 xmlFree(regexp->atoms); 5734 } 5735 if (regexp->counters != NULL) 5736 xmlFree(regexp->counters); 5737 if (regexp->compact != NULL) 5738 xmlFree(regexp->compact); 5739 if (regexp->transdata != NULL) 5740 xmlFree(regexp->transdata); 5741 if (regexp->stringMap != NULL) { 5742 for (i = 0; i < regexp->nbstrings;i++) 5743 xmlFree(regexp->stringMap[i]); 5744 xmlFree(regexp->stringMap); 5745 } 5746 5747 xmlFree(regexp); 5748 } 5749 5750 #ifdef LIBXML_AUTOMATA_ENABLED 5751 /************************************************************************ 5752 * * 5753 * The Automata interface * 5754 * * 5755 ************************************************************************/ 5756 5757 /** 5758 * xmlNewAutomata: 5759 * 5760 * Create a new automata 5761 * 5762 * Returns the new object or NULL in case of failure 5763 */ 5764 xmlAutomataPtr 5765 xmlNewAutomata(void) { 5766 xmlAutomataPtr ctxt; 5767 5768 ctxt = xmlRegNewParserCtxt(NULL); 5769 if (ctxt == NULL) 5770 return(NULL); 5771 5772 /* initialize the parser */ 5773 ctxt->end = NULL; 5774 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 5775 if (ctxt->start == NULL) { 5776 xmlFreeAutomata(ctxt); 5777 return(NULL); 5778 } 5779 ctxt->start->type = XML_REGEXP_START_STATE; 5780 if (xmlRegStatePush(ctxt, ctxt->start) < 0) { 5781 xmlRegFreeState(ctxt->start); 5782 xmlFreeAutomata(ctxt); 5783 return(NULL); 5784 } 5785 ctxt->flags = 0; 5786 5787 return(ctxt); 5788 } 5789 5790 /** 5791 * xmlFreeAutomata: 5792 * @am: an automata 5793 * 5794 * Free an automata 5795 */ 5796 void 5797 xmlFreeAutomata(xmlAutomataPtr am) { 5798 if (am == NULL) 5799 return; 5800 xmlRegFreeParserCtxt(am); 5801 } 5802 5803 /** 5804 * xmlAutomataSetFlags: 5805 * @am: an automata 5806 * @flags: a set of internal flags 5807 * 5808 * Set some flags on the automata 5809 */ 5810 void 5811 xmlAutomataSetFlags(xmlAutomataPtr am, int flags) { 5812 if (am == NULL) 5813 return; 5814 am->flags |= flags; 5815 } 5816 5817 /** 5818 * xmlAutomataGetInitState: 5819 * @am: an automata 5820 * 5821 * Initial state lookup 5822 * 5823 * Returns the initial state of the automata 5824 */ 5825 xmlAutomataStatePtr 5826 xmlAutomataGetInitState(xmlAutomataPtr am) { 5827 if (am == NULL) 5828 return(NULL); 5829 return(am->start); 5830 } 5831 5832 /** 5833 * xmlAutomataSetFinalState: 5834 * @am: an automata 5835 * @state: a state in this automata 5836 * 5837 * Makes that state a final state 5838 * 5839 * Returns 0 or -1 in case of error 5840 */ 5841 int 5842 xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) { 5843 if ((am == NULL) || (state == NULL)) 5844 return(-1); 5845 state->type = XML_REGEXP_FINAL_STATE; 5846 return(0); 5847 } 5848 5849 /** 5850 * xmlAutomataNewTransition: 5851 * @am: an automata 5852 * @from: the starting point of the transition 5853 * @to: the target point of the transition or NULL 5854 * @token: the input string associated to that transition 5855 * @data: data passed to the callback function if the transition is activated 5856 * 5857 * If @to is NULL, this creates first a new target state in the automata 5858 * and then adds a transition from the @from state to the target state 5859 * activated by the value of @token 5860 * 5861 * Returns the target state or NULL in case of error 5862 */ 5863 xmlAutomataStatePtr 5864 xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from, 5865 xmlAutomataStatePtr to, const xmlChar *token, 5866 void *data) { 5867 xmlRegAtomPtr atom; 5868 5869 if ((am == NULL) || (from == NULL) || (token == NULL)) 5870 return(NULL); 5871 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5872 if (atom == NULL) 5873 return(NULL); 5874 atom->data = data; 5875 atom->valuep = xmlStrdup(token); 5876 5877 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5878 xmlRegFreeAtom(atom); 5879 return(NULL); 5880 } 5881 if (to == NULL) 5882 return(am->state); 5883 return(to); 5884 } 5885 5886 /** 5887 * xmlAutomataNewTransition2: 5888 * @am: an automata 5889 * @from: the starting point of the transition 5890 * @to: the target point of the transition or NULL 5891 * @token: the first input string associated to that transition 5892 * @token2: the second input string associated to that transition 5893 * @data: data passed to the callback function if the transition is activated 5894 * 5895 * If @to is NULL, this creates first a new target state in the automata 5896 * and then adds a transition from the @from state to the target state 5897 * activated by the value of @token 5898 * 5899 * Returns the target state or NULL in case of error 5900 */ 5901 xmlAutomataStatePtr 5902 xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from, 5903 xmlAutomataStatePtr to, const xmlChar *token, 5904 const xmlChar *token2, void *data) { 5905 xmlRegAtomPtr atom; 5906 5907 if ((am == NULL) || (from == NULL) || (token == NULL)) 5908 return(NULL); 5909 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5910 if (atom == NULL) 5911 return(NULL); 5912 atom->data = data; 5913 if ((token2 == NULL) || (*token2 == 0)) { 5914 atom->valuep = xmlStrdup(token); 5915 } else { 5916 int lenn, lenp; 5917 xmlChar *str; 5918 5919 lenn = strlen((char *) token2); 5920 lenp = strlen((char *) token); 5921 5922 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5923 if (str == NULL) { 5924 xmlRegFreeAtom(atom); 5925 return(NULL); 5926 } 5927 memcpy(&str[0], token, lenp); 5928 str[lenp] = '|'; 5929 memcpy(&str[lenp + 1], token2, lenn); 5930 str[lenn + lenp + 1] = 0; 5931 5932 atom->valuep = str; 5933 } 5934 5935 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5936 xmlRegFreeAtom(atom); 5937 return(NULL); 5938 } 5939 if (to == NULL) 5940 return(am->state); 5941 return(to); 5942 } 5943 5944 /** 5945 * xmlAutomataNewNegTrans: 5946 * @am: an automata 5947 * @from: the starting point of the transition 5948 * @to: the target point of the transition or NULL 5949 * @token: the first input string associated to that transition 5950 * @token2: the second input string associated to that transition 5951 * @data: data passed to the callback function if the transition is activated 5952 * 5953 * If @to is NULL, this creates first a new target state in the automata 5954 * and then adds a transition from the @from state to the target state 5955 * activated by any value except (@token,@token2) 5956 * Note that if @token2 is not NULL, then (X, NULL) won't match to follow 5957 # the semantic of XSD ##other 5958 * 5959 * Returns the target state or NULL in case of error 5960 */ 5961 xmlAutomataStatePtr 5962 xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5963 xmlAutomataStatePtr to, const xmlChar *token, 5964 const xmlChar *token2, void *data) { 5965 xmlRegAtomPtr atom; 5966 xmlChar err_msg[200]; 5967 5968 if ((am == NULL) || (from == NULL) || (token == NULL)) 5969 return(NULL); 5970 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5971 if (atom == NULL) 5972 return(NULL); 5973 atom->data = data; 5974 atom->neg = 1; 5975 if ((token2 == NULL) || (*token2 == 0)) { 5976 atom->valuep = xmlStrdup(token); 5977 } else { 5978 int lenn, lenp; 5979 xmlChar *str; 5980 5981 lenn = strlen((char *) token2); 5982 lenp = strlen((char *) token); 5983 5984 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5985 if (str == NULL) { 5986 xmlRegFreeAtom(atom); 5987 return(NULL); 5988 } 5989 memcpy(&str[0], token, lenp); 5990 str[lenp] = '|'; 5991 memcpy(&str[lenp + 1], token2, lenn); 5992 str[lenn + lenp + 1] = 0; 5993 5994 atom->valuep = str; 5995 } 5996 snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep); 5997 err_msg[199] = 0; 5998 atom->valuep2 = xmlStrdup(err_msg); 5999 6000 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 6001 xmlRegFreeAtom(atom); 6002 return(NULL); 6003 } 6004 am->negs++; 6005 if (to == NULL) 6006 return(am->state); 6007 return(to); 6008 } 6009 6010 /** 6011 * xmlAutomataNewCountTrans2: 6012 * @am: an automata 6013 * @from: the starting point of the transition 6014 * @to: the target point of the transition or NULL 6015 * @token: the input string associated to that transition 6016 * @token2: the second input string associated to that transition 6017 * @min: the minimum successive occurrences of token 6018 * @max: the maximum successive occurrences of token 6019 * @data: data associated to the transition 6020 * 6021 * If @to is NULL, this creates first a new target state in the automata 6022 * and then adds a transition from the @from state to the target state 6023 * activated by a succession of input of value @token and @token2 and 6024 * whose number is between @min and @max 6025 * 6026 * Returns the target state or NULL in case of error 6027 */ 6028 xmlAutomataStatePtr 6029 xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from, 6030 xmlAutomataStatePtr to, const xmlChar *token, 6031 const xmlChar *token2, 6032 int min, int max, void *data) { 6033 xmlRegAtomPtr atom; 6034 int counter; 6035 6036 if ((am == NULL) || (from == NULL) || (token == NULL)) 6037 return(NULL); 6038 if (min < 0) 6039 return(NULL); 6040 if ((max < min) || (max < 1)) 6041 return(NULL); 6042 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 6043 if (atom == NULL) 6044 return(NULL); 6045 if ((token2 == NULL) || (*token2 == 0)) { 6046 atom->valuep = xmlStrdup(token); 6047 } else { 6048 int lenn, lenp; 6049 xmlChar *str; 6050 6051 lenn = strlen((char *) token2); 6052 lenp = strlen((char *) token); 6053 6054 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 6055 if (str == NULL) { 6056 xmlRegFreeAtom(atom); 6057 return(NULL); 6058 } 6059 memcpy(&str[0], token, lenp); 6060 str[lenp] = '|'; 6061 memcpy(&str[lenp + 1], token2, lenn); 6062 str[lenn + lenp + 1] = 0; 6063 6064 atom->valuep = str; 6065 } 6066 atom->data = data; 6067 if (min == 0) 6068 atom->min = 1; 6069 else 6070 atom->min = min; 6071 atom->max = max; 6072 6073 /* 6074 * associate a counter to the transition. 6075 */ 6076 counter = xmlRegGetCounter(am); 6077 am->counters[counter].min = min; 6078 am->counters[counter].max = max; 6079 6080 /* xmlFAGenerateTransitions(am, from, to, atom); */ 6081 if (to == NULL) { 6082 to = xmlRegNewState(am); 6083 xmlRegStatePush(am, to); 6084 } 6085 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 6086 xmlRegAtomPush(am, atom); 6087 am->state = to; 6088 6089 if (to == NULL) 6090 to = am->state; 6091 if (to == NULL) 6092 return(NULL); 6093 if (min == 0) 6094 xmlFAGenerateEpsilonTransition(am, from, to); 6095 return(to); 6096 } 6097 6098 /** 6099 * xmlAutomataNewCountTrans: 6100 * @am: an automata 6101 * @from: the starting point of the transition 6102 * @to: the target point of the transition or NULL 6103 * @token: the input string associated to that transition 6104 * @min: the minimum successive occurrences of token 6105 * @max: the maximum successive occurrences of token 6106 * @data: data associated to the transition 6107 * 6108 * If @to is NULL, this creates first a new target state in the automata 6109 * and then adds a transition from the @from state to the target state 6110 * activated by a succession of input of value @token and whose number 6111 * is between @min and @max 6112 * 6113 * Returns the target state or NULL in case of error 6114 */ 6115 xmlAutomataStatePtr 6116 xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6117 xmlAutomataStatePtr to, const xmlChar *token, 6118 int min, int max, void *data) { 6119 xmlRegAtomPtr atom; 6120 int counter; 6121 6122 if ((am == NULL) || (from == NULL) || (token == NULL)) 6123 return(NULL); 6124 if (min < 0) 6125 return(NULL); 6126 if ((max < min) || (max < 1)) 6127 return(NULL); 6128 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 6129 if (atom == NULL) 6130 return(NULL); 6131 atom->valuep = xmlStrdup(token); 6132 atom->data = data; 6133 if (min == 0) 6134 atom->min = 1; 6135 else 6136 atom->min = min; 6137 atom->max = max; 6138 6139 /* 6140 * associate a counter to the transition. 6141 */ 6142 counter = xmlRegGetCounter(am); 6143 am->counters[counter].min = min; 6144 am->counters[counter].max = max; 6145 6146 /* xmlFAGenerateTransitions(am, from, to, atom); */ 6147 if (to == NULL) { 6148 to = xmlRegNewState(am); 6149 xmlRegStatePush(am, to); 6150 } 6151 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 6152 xmlRegAtomPush(am, atom); 6153 am->state = to; 6154 6155 if (to == NULL) 6156 to = am->state; 6157 if (to == NULL) 6158 return(NULL); 6159 if (min == 0) 6160 xmlFAGenerateEpsilonTransition(am, from, to); 6161 return(to); 6162 } 6163 6164 /** 6165 * xmlAutomataNewOnceTrans2: 6166 * @am: an automata 6167 * @from: the starting point of the transition 6168 * @to: the target point of the transition or NULL 6169 * @token: the input string associated to that transition 6170 * @token2: the second input string associated to that transition 6171 * @min: the minimum successive occurrences of token 6172 * @max: the maximum successive occurrences of token 6173 * @data: data associated to the transition 6174 * 6175 * If @to is NULL, this creates first a new target state in the automata 6176 * and then adds a transition from the @from state to the target state 6177 * activated by a succession of input of value @token and @token2 and whose 6178 * number is between @min and @max, moreover that transition can only be 6179 * crossed once. 6180 * 6181 * Returns the target state or NULL in case of error 6182 */ 6183 xmlAutomataStatePtr 6184 xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from, 6185 xmlAutomataStatePtr to, const xmlChar *token, 6186 const xmlChar *token2, 6187 int min, int max, void *data) { 6188 xmlRegAtomPtr atom; 6189 int counter; 6190 6191 if ((am == NULL) || (from == NULL) || (token == NULL)) 6192 return(NULL); 6193 if (min < 1) 6194 return(NULL); 6195 if (max < min) 6196 return(NULL); 6197 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 6198 if (atom == NULL) 6199 return(NULL); 6200 if ((token2 == NULL) || (*token2 == 0)) { 6201 atom->valuep = xmlStrdup(token); 6202 } else { 6203 int lenn, lenp; 6204 xmlChar *str; 6205 6206 lenn = strlen((char *) token2); 6207 lenp = strlen((char *) token); 6208 6209 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 6210 if (str == NULL) { 6211 xmlRegFreeAtom(atom); 6212 return(NULL); 6213 } 6214 memcpy(&str[0], token, lenp); 6215 str[lenp] = '|'; 6216 memcpy(&str[lenp + 1], token2, lenn); 6217 str[lenn + lenp + 1] = 0; 6218 6219 atom->valuep = str; 6220 } 6221 atom->data = data; 6222 atom->quant = XML_REGEXP_QUANT_ONCEONLY; 6223 atom->min = min; 6224 atom->max = max; 6225 /* 6226 * associate a counter to the transition. 6227 */ 6228 counter = xmlRegGetCounter(am); 6229 am->counters[counter].min = 1; 6230 am->counters[counter].max = 1; 6231 6232 /* xmlFAGenerateTransitions(am, from, to, atom); */ 6233 if (to == NULL) { 6234 to = xmlRegNewState(am); 6235 xmlRegStatePush(am, to); 6236 } 6237 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 6238 xmlRegAtomPush(am, atom); 6239 am->state = to; 6240 return(to); 6241 } 6242 6243 6244 6245 /** 6246 * xmlAutomataNewOnceTrans: 6247 * @am: an automata 6248 * @from: the starting point of the transition 6249 * @to: the target point of the transition or NULL 6250 * @token: the input string associated to that transition 6251 * @min: the minimum successive occurrences of token 6252 * @max: the maximum successive occurrences of token 6253 * @data: data associated to the transition 6254 * 6255 * If @to is NULL, this creates first a new target state in the automata 6256 * and then adds a transition from the @from state to the target state 6257 * activated by a succession of input of value @token and whose number 6258 * is between @min and @max, moreover that transition can only be crossed 6259 * once. 6260 * 6261 * Returns the target state or NULL in case of error 6262 */ 6263 xmlAutomataStatePtr 6264 xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6265 xmlAutomataStatePtr to, const xmlChar *token, 6266 int min, int max, void *data) { 6267 xmlRegAtomPtr atom; 6268 int counter; 6269 6270 if ((am == NULL) || (from == NULL) || (token == NULL)) 6271 return(NULL); 6272 if (min < 1) 6273 return(NULL); 6274 if (max < min) 6275 return(NULL); 6276 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 6277 if (atom == NULL) 6278 return(NULL); 6279 atom->valuep = xmlStrdup(token); 6280 atom->data = data; 6281 atom->quant = XML_REGEXP_QUANT_ONCEONLY; 6282 atom->min = min; 6283 atom->max = max; 6284 /* 6285 * associate a counter to the transition. 6286 */ 6287 counter = xmlRegGetCounter(am); 6288 am->counters[counter].min = 1; 6289 am->counters[counter].max = 1; 6290 6291 /* xmlFAGenerateTransitions(am, from, to, atom); */ 6292 if (to == NULL) { 6293 to = xmlRegNewState(am); 6294 xmlRegStatePush(am, to); 6295 } 6296 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 6297 xmlRegAtomPush(am, atom); 6298 am->state = to; 6299 return(to); 6300 } 6301 6302 /** 6303 * xmlAutomataNewState: 6304 * @am: an automata 6305 * 6306 * Create a new disconnected state in the automata 6307 * 6308 * Returns the new state or NULL in case of error 6309 */ 6310 xmlAutomataStatePtr 6311 xmlAutomataNewState(xmlAutomataPtr am) { 6312 xmlAutomataStatePtr to; 6313 6314 if (am == NULL) 6315 return(NULL); 6316 to = xmlRegNewState(am); 6317 xmlRegStatePush(am, to); 6318 return(to); 6319 } 6320 6321 /** 6322 * xmlAutomataNewEpsilon: 6323 * @am: an automata 6324 * @from: the starting point of the transition 6325 * @to: the target point of the transition or NULL 6326 * 6327 * If @to is NULL, this creates first a new target state in the automata 6328 * and then adds an epsilon transition from the @from state to the 6329 * target state 6330 * 6331 * Returns the target state or NULL in case of error 6332 */ 6333 xmlAutomataStatePtr 6334 xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from, 6335 xmlAutomataStatePtr to) { 6336 if ((am == NULL) || (from == NULL)) 6337 return(NULL); 6338 xmlFAGenerateEpsilonTransition(am, from, to); 6339 if (to == NULL) 6340 return(am->state); 6341 return(to); 6342 } 6343 6344 /** 6345 * xmlAutomataNewAllTrans: 6346 * @am: an automata 6347 * @from: the starting point of the transition 6348 * @to: the target point of the transition or NULL 6349 * @lax: allow to transition if not all all transitions have been activated 6350 * 6351 * If @to is NULL, this creates first a new target state in the automata 6352 * and then adds a an ALL transition from the @from state to the 6353 * target state. That transition is an epsilon transition allowed only when 6354 * all transitions from the @from node have been activated. 6355 * 6356 * Returns the target state or NULL in case of error 6357 */ 6358 xmlAutomataStatePtr 6359 xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6360 xmlAutomataStatePtr to, int lax) { 6361 if ((am == NULL) || (from == NULL)) 6362 return(NULL); 6363 xmlFAGenerateAllTransition(am, from, to, lax); 6364 if (to == NULL) 6365 return(am->state); 6366 return(to); 6367 } 6368 6369 /** 6370 * xmlAutomataNewCounter: 6371 * @am: an automata 6372 * @min: the minimal value on the counter 6373 * @max: the maximal value on the counter 6374 * 6375 * Create a new counter 6376 * 6377 * Returns the counter number or -1 in case of error 6378 */ 6379 int 6380 xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) { 6381 int ret; 6382 6383 if (am == NULL) 6384 return(-1); 6385 6386 ret = xmlRegGetCounter(am); 6387 if (ret < 0) 6388 return(-1); 6389 am->counters[ret].min = min; 6390 am->counters[ret].max = max; 6391 return(ret); 6392 } 6393 6394 /** 6395 * xmlAutomataNewCountedTrans: 6396 * @am: an automata 6397 * @from: the starting point of the transition 6398 * @to: the target point of the transition or NULL 6399 * @counter: the counter associated to that transition 6400 * 6401 * If @to is NULL, this creates first a new target state in the automata 6402 * and then adds an epsilon transition from the @from state to the target state 6403 * which will increment the counter provided 6404 * 6405 * Returns the target state or NULL in case of error 6406 */ 6407 xmlAutomataStatePtr 6408 xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6409 xmlAutomataStatePtr to, int counter) { 6410 if ((am == NULL) || (from == NULL) || (counter < 0)) 6411 return(NULL); 6412 xmlFAGenerateCountedEpsilonTransition(am, from, to, counter); 6413 if (to == NULL) 6414 return(am->state); 6415 return(to); 6416 } 6417 6418 /** 6419 * xmlAutomataNewCounterTrans: 6420 * @am: an automata 6421 * @from: the starting point of the transition 6422 * @to: the target point of the transition or NULL 6423 * @counter: the counter associated to that transition 6424 * 6425 * If @to is NULL, this creates first a new target state in the automata 6426 * and then adds an epsilon transition from the @from state to the target state 6427 * which will be allowed only if the counter is within the right range. 6428 * 6429 * Returns the target state or NULL in case of error 6430 */ 6431 xmlAutomataStatePtr 6432 xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6433 xmlAutomataStatePtr to, int counter) { 6434 if ((am == NULL) || (from == NULL) || (counter < 0)) 6435 return(NULL); 6436 xmlFAGenerateCountedTransition(am, from, to, counter); 6437 if (to == NULL) 6438 return(am->state); 6439 return(to); 6440 } 6441 6442 /** 6443 * xmlAutomataCompile: 6444 * @am: an automata 6445 * 6446 * Compile the automata into a Reg Exp ready for being executed. 6447 * The automata should be free after this point. 6448 * 6449 * Returns the compiled regexp or NULL in case of error 6450 */ 6451 xmlRegexpPtr 6452 xmlAutomataCompile(xmlAutomataPtr am) { 6453 xmlRegexpPtr ret; 6454 6455 if ((am == NULL) || (am->error != 0)) return(NULL); 6456 xmlFAEliminateEpsilonTransitions(am); 6457 /* xmlFAComputesDeterminism(am); */ 6458 ret = xmlRegEpxFromParse(am); 6459 6460 return(ret); 6461 } 6462 6463 /** 6464 * xmlAutomataIsDeterminist: 6465 * @am: an automata 6466 * 6467 * Checks if an automata is determinist. 6468 * 6469 * Returns 1 if true, 0 if not, and -1 in case of error 6470 */ 6471 int 6472 xmlAutomataIsDeterminist(xmlAutomataPtr am) { 6473 int ret; 6474 6475 if (am == NULL) 6476 return(-1); 6477 6478 ret = xmlFAComputesDeterminism(am); 6479 return(ret); 6480 } 6481 #endif /* LIBXML_AUTOMATA_ENABLED */ 6482 6483 #ifdef LIBXML_EXPR_ENABLED 6484 /************************************************************************ 6485 * * 6486 * Formal Expression handling code * 6487 * * 6488 ************************************************************************/ 6489 /************************************************************************ 6490 * * 6491 * Expression handling context * 6492 * * 6493 ************************************************************************/ 6494 6495 struct _xmlExpCtxt { 6496 xmlDictPtr dict; 6497 xmlExpNodePtr *table; 6498 int size; 6499 int nbElems; 6500 int nb_nodes; 6501 int maxNodes; 6502 const char *expr; 6503 const char *cur; 6504 int nb_cons; 6505 int tabSize; 6506 }; 6507 6508 /** 6509 * xmlExpNewCtxt: 6510 * @maxNodes: the maximum number of nodes 6511 * @dict: optional dictionary to use internally 6512 * 6513 * Creates a new context for manipulating expressions 6514 * 6515 * Returns the context or NULL in case of error 6516 */ 6517 xmlExpCtxtPtr 6518 xmlExpNewCtxt(int maxNodes, xmlDictPtr dict) { 6519 xmlExpCtxtPtr ret; 6520 int size = 256; 6521 6522 if (maxNodes <= 4096) 6523 maxNodes = 4096; 6524 6525 ret = (xmlExpCtxtPtr) xmlMalloc(sizeof(xmlExpCtxt)); 6526 if (ret == NULL) 6527 return(NULL); 6528 memset(ret, 0, sizeof(xmlExpCtxt)); 6529 ret->size = size; 6530 ret->nbElems = 0; 6531 ret->maxNodes = maxNodes; 6532 ret->table = xmlMalloc(size * sizeof(xmlExpNodePtr)); 6533 if (ret->table == NULL) { 6534 xmlFree(ret); 6535 return(NULL); 6536 } 6537 memset(ret->table, 0, size * sizeof(xmlExpNodePtr)); 6538 if (dict == NULL) { 6539 ret->dict = xmlDictCreate(); 6540 if (ret->dict == NULL) { 6541 xmlFree(ret->table); 6542 xmlFree(ret); 6543 return(NULL); 6544 } 6545 } else { 6546 ret->dict = dict; 6547 xmlDictReference(ret->dict); 6548 } 6549 return(ret); 6550 } 6551 6552 /** 6553 * xmlExpFreeCtxt: 6554 * @ctxt: an expression context 6555 * 6556 * Free an expression context 6557 */ 6558 void 6559 xmlExpFreeCtxt(xmlExpCtxtPtr ctxt) { 6560 if (ctxt == NULL) 6561 return; 6562 xmlDictFree(ctxt->dict); 6563 if (ctxt->table != NULL) 6564 xmlFree(ctxt->table); 6565 xmlFree(ctxt); 6566 } 6567 6568 /************************************************************************ 6569 * * 6570 * Structure associated to an expression node * 6571 * * 6572 ************************************************************************/ 6573 #define MAX_NODES 10000 6574 6575 /* #define DEBUG_DERIV */ 6576 6577 /* 6578 * TODO: 6579 * - Wildcards 6580 * - public API for creation 6581 * 6582 * Started 6583 * - regression testing 6584 * 6585 * Done 6586 * - split into module and test tool 6587 * - memleaks 6588 */ 6589 6590 typedef enum { 6591 XML_EXP_NILABLE = (1 << 0) 6592 } xmlExpNodeInfo; 6593 6594 #define IS_NILLABLE(node) ((node)->info & XML_EXP_NILABLE) 6595 6596 struct _xmlExpNode { 6597 unsigned char type;/* xmlExpNodeType */ 6598 unsigned char info;/* OR of xmlExpNodeInfo */ 6599 unsigned short key; /* the hash key */ 6600 unsigned int ref; /* The number of references */ 6601 int c_max; /* the maximum length it can consume */ 6602 xmlExpNodePtr exp_left; 6603 xmlExpNodePtr next;/* the next node in the hash table or free list */ 6604 union { 6605 struct { 6606 int f_min; 6607 int f_max; 6608 } count; 6609 struct { 6610 xmlExpNodePtr f_right; 6611 } children; 6612 const xmlChar *f_str; 6613 } field; 6614 }; 6615 6616 #define exp_min field.count.f_min 6617 #define exp_max field.count.f_max 6618 /* #define exp_left field.children.f_left */ 6619 #define exp_right field.children.f_right 6620 #define exp_str field.f_str 6621 6622 static xmlExpNodePtr xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type); 6623 static xmlExpNode forbiddenExpNode = { 6624 XML_EXP_FORBID, 0, 0, 0, 0, NULL, NULL, {{ 0, 0}} 6625 }; 6626 xmlExpNodePtr forbiddenExp = &forbiddenExpNode; 6627 static xmlExpNode emptyExpNode = { 6628 XML_EXP_EMPTY, 1, 0, 0, 0, NULL, NULL, {{ 0, 0}} 6629 }; 6630 xmlExpNodePtr emptyExp = &emptyExpNode; 6631 6632 /************************************************************************ 6633 * * 6634 * The custom hash table for unicity and canonicalization * 6635 * of sub-expressions pointers * 6636 * * 6637 ************************************************************************/ 6638 /* 6639 * xmlExpHashNameComputeKey: 6640 * Calculate the hash key for a token 6641 */ 6642 static unsigned short 6643 xmlExpHashNameComputeKey(const xmlChar *name) { 6644 unsigned short value = 0L; 6645 char ch; 6646 6647 if (name != NULL) { 6648 value += 30 * (*name); 6649 while ((ch = *name++) != 0) { 6650 value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch); 6651 } 6652 } 6653 return (value); 6654 } 6655 6656 /* 6657 * xmlExpHashComputeKey: 6658 * Calculate the hash key for a compound expression 6659 */ 6660 static unsigned short 6661 xmlExpHashComputeKey(xmlExpNodeType type, xmlExpNodePtr left, 6662 xmlExpNodePtr right) { 6663 unsigned long value; 6664 unsigned short ret; 6665 6666 switch (type) { 6667 case XML_EXP_SEQ: 6668 value = left->key; 6669 value += right->key; 6670 value *= 3; 6671 ret = (unsigned short) value; 6672 break; 6673 case XML_EXP_OR: 6674 value = left->key; 6675 value += right->key; 6676 value *= 7; 6677 ret = (unsigned short) value; 6678 break; 6679 case XML_EXP_COUNT: 6680 value = left->key; 6681 value += right->key; 6682 ret = (unsigned short) value; 6683 break; 6684 default: 6685 ret = 0; 6686 } 6687 return(ret); 6688 } 6689 6690 6691 static xmlExpNodePtr 6692 xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type) { 6693 xmlExpNodePtr ret; 6694 6695 if (ctxt->nb_nodes >= MAX_NODES) 6696 return(NULL); 6697 ret = (xmlExpNodePtr) xmlMalloc(sizeof(xmlExpNode)); 6698 if (ret == NULL) 6699 return(NULL); 6700 memset(ret, 0, sizeof(xmlExpNode)); 6701 ret->type = type; 6702 ret->next = NULL; 6703 ctxt->nb_nodes++; 6704 ctxt->nb_cons++; 6705 return(ret); 6706 } 6707 6708 /** 6709 * xmlExpHashGetEntry: 6710 * @table: the hash table 6711 * 6712 * Get the unique entry from the hash table. The entry is created if 6713 * needed. @left and @right are consumed, i.e. their ref count will 6714 * be decremented by the operation. 6715 * 6716 * Returns the pointer or NULL in case of error 6717 */ 6718 static xmlExpNodePtr 6719 xmlExpHashGetEntry(xmlExpCtxtPtr ctxt, xmlExpNodeType type, 6720 xmlExpNodePtr left, xmlExpNodePtr right, 6721 const xmlChar *name, int min, int max) { 6722 unsigned short kbase, key; 6723 xmlExpNodePtr entry; 6724 xmlExpNodePtr insert; 6725 6726 if (ctxt == NULL) 6727 return(NULL); 6728 6729 /* 6730 * Check for duplicate and insertion location. 6731 */ 6732 if (type == XML_EXP_ATOM) { 6733 kbase = xmlExpHashNameComputeKey(name); 6734 } else if (type == XML_EXP_COUNT) { 6735 /* COUNT reduction rule 1 */ 6736 /* a{1} -> a */ 6737 if (min == max) { 6738 if (min == 1) { 6739 return(left); 6740 } 6741 if (min == 0) { 6742 xmlExpFree(ctxt, left); 6743 return(emptyExp); 6744 } 6745 } 6746 if (min < 0) { 6747 xmlExpFree(ctxt, left); 6748 return(forbiddenExp); 6749 } 6750 if (max == -1) 6751 kbase = min + 79; 6752 else 6753 kbase = max - min; 6754 kbase += left->key; 6755 } else if (type == XML_EXP_OR) { 6756 /* Forbid reduction rules */ 6757 if (left->type == XML_EXP_FORBID) { 6758 xmlExpFree(ctxt, left); 6759 return(right); 6760 } 6761 if (right->type == XML_EXP_FORBID) { 6762 xmlExpFree(ctxt, right); 6763 return(left); 6764 } 6765 6766 /* OR reduction rule 1 */ 6767 /* a | a reduced to a */ 6768 if (left == right) { 6769 xmlExpFree(ctxt, right); 6770 return(left); 6771 } 6772 /* OR canonicalization rule 1 */ 6773 /* linearize (a | b) | c into a | (b | c) */ 6774 if ((left->type == XML_EXP_OR) && (right->type != XML_EXP_OR)) { 6775 xmlExpNodePtr tmp = left; 6776 left = right; 6777 right = tmp; 6778 } 6779 /* OR reduction rule 2 */ 6780 /* a | (a | b) and b | (a | b) are reduced to a | b */ 6781 if (right->type == XML_EXP_OR) { 6782 if ((left == right->exp_left) || 6783 (left == right->exp_right)) { 6784 xmlExpFree(ctxt, left); 6785 return(right); 6786 } 6787 } 6788 /* OR canonicalization rule 2 */ 6789 /* linearize (a | b) | c into a | (b | c) */ 6790 if (left->type == XML_EXP_OR) { 6791 xmlExpNodePtr tmp; 6792 6793 /* OR canonicalization rule 2 */ 6794 if ((left->exp_right->type != XML_EXP_OR) && 6795 (left->exp_right->key < left->exp_left->key)) { 6796 tmp = left->exp_right; 6797 left->exp_right = left->exp_left; 6798 left->exp_left = tmp; 6799 } 6800 left->exp_right->ref++; 6801 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_right, right, 6802 NULL, 0, 0); 6803 left->exp_left->ref++; 6804 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_left, tmp, 6805 NULL, 0, 0); 6806 6807 xmlExpFree(ctxt, left); 6808 return(tmp); 6809 } 6810 if (right->type == XML_EXP_OR) { 6811 /* Ordering in the tree */ 6812 /* C | (A | B) -> A | (B | C) */ 6813 if (left->key > right->exp_right->key) { 6814 xmlExpNodePtr tmp; 6815 right->exp_right->ref++; 6816 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_right, 6817 left, NULL, 0, 0); 6818 right->exp_left->ref++; 6819 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left, 6820 tmp, NULL, 0, 0); 6821 xmlExpFree(ctxt, right); 6822 return(tmp); 6823 } 6824 /* Ordering in the tree */ 6825 /* B | (A | C) -> A | (B | C) */ 6826 if (left->key > right->exp_left->key) { 6827 xmlExpNodePtr tmp; 6828 right->exp_right->ref++; 6829 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, 6830 right->exp_right, NULL, 0, 0); 6831 right->exp_left->ref++; 6832 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left, 6833 tmp, NULL, 0, 0); 6834 xmlExpFree(ctxt, right); 6835 return(tmp); 6836 } 6837 } 6838 /* we know both types are != XML_EXP_OR here */ 6839 else if (left->key > right->key) { 6840 xmlExpNodePtr tmp = left; 6841 left = right; 6842 right = tmp; 6843 } 6844 kbase = xmlExpHashComputeKey(type, left, right); 6845 } else if (type == XML_EXP_SEQ) { 6846 /* Forbid reduction rules */ 6847 if (left->type == XML_EXP_FORBID) { 6848 xmlExpFree(ctxt, right); 6849 return(left); 6850 } 6851 if (right->type == XML_EXP_FORBID) { 6852 xmlExpFree(ctxt, left); 6853 return(right); 6854 } 6855 /* Empty reduction rules */ 6856 if (right->type == XML_EXP_EMPTY) { 6857 return(left); 6858 } 6859 if (left->type == XML_EXP_EMPTY) { 6860 return(right); 6861 } 6862 kbase = xmlExpHashComputeKey(type, left, right); 6863 } else 6864 return(NULL); 6865 6866 key = kbase % ctxt->size; 6867 if (ctxt->table[key] != NULL) { 6868 for (insert = ctxt->table[key]; insert != NULL; 6869 insert = insert->next) { 6870 if ((insert->key == kbase) && 6871 (insert->type == type)) { 6872 if (type == XML_EXP_ATOM) { 6873 if (name == insert->exp_str) { 6874 insert->ref++; 6875 return(insert); 6876 } 6877 } else if (type == XML_EXP_COUNT) { 6878 if ((insert->exp_min == min) && (insert->exp_max == max) && 6879 (insert->exp_left == left)) { 6880 insert->ref++; 6881 left->ref--; 6882 return(insert); 6883 } 6884 } else if ((insert->exp_left == left) && 6885 (insert->exp_right == right)) { 6886 insert->ref++; 6887 left->ref--; 6888 right->ref--; 6889 return(insert); 6890 } 6891 } 6892 } 6893 } 6894 6895 entry = xmlExpNewNode(ctxt, type); 6896 if (entry == NULL) 6897 return(NULL); 6898 entry->key = kbase; 6899 if (type == XML_EXP_ATOM) { 6900 entry->exp_str = name; 6901 entry->c_max = 1; 6902 } else if (type == XML_EXP_COUNT) { 6903 entry->exp_min = min; 6904 entry->exp_max = max; 6905 entry->exp_left = left; 6906 if ((min == 0) || (IS_NILLABLE(left))) 6907 entry->info |= XML_EXP_NILABLE; 6908 if (max < 0) 6909 entry->c_max = -1; 6910 else 6911 entry->c_max = max * entry->exp_left->c_max; 6912 } else { 6913 entry->exp_left = left; 6914 entry->exp_right = right; 6915 if (type == XML_EXP_OR) { 6916 if ((IS_NILLABLE(left)) || (IS_NILLABLE(right))) 6917 entry->info |= XML_EXP_NILABLE; 6918 if ((entry->exp_left->c_max == -1) || 6919 (entry->exp_right->c_max == -1)) 6920 entry->c_max = -1; 6921 else if (entry->exp_left->c_max > entry->exp_right->c_max) 6922 entry->c_max = entry->exp_left->c_max; 6923 else 6924 entry->c_max = entry->exp_right->c_max; 6925 } else { 6926 if ((IS_NILLABLE(left)) && (IS_NILLABLE(right))) 6927 entry->info |= XML_EXP_NILABLE; 6928 if ((entry->exp_left->c_max == -1) || 6929 (entry->exp_right->c_max == -1)) 6930 entry->c_max = -1; 6931 else 6932 entry->c_max = entry->exp_left->c_max + entry->exp_right->c_max; 6933 } 6934 } 6935 entry->ref = 1; 6936 if (ctxt->table[key] != NULL) 6937 entry->next = ctxt->table[key]; 6938 6939 ctxt->table[key] = entry; 6940 ctxt->nbElems++; 6941 6942 return(entry); 6943 } 6944 6945 /** 6946 * xmlExpFree: 6947 * @ctxt: the expression context 6948 * @exp: the expression 6949 * 6950 * Dereference the expression 6951 */ 6952 void 6953 xmlExpFree(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp) { 6954 if ((exp == NULL) || (exp == forbiddenExp) || (exp == emptyExp)) 6955 return; 6956 exp->ref--; 6957 if (exp->ref == 0) { 6958 unsigned short key; 6959 6960 /* Unlink it first from the hash table */ 6961 key = exp->key % ctxt->size; 6962 if (ctxt->table[key] == exp) { 6963 ctxt->table[key] = exp->next; 6964 } else { 6965 xmlExpNodePtr tmp; 6966 6967 tmp = ctxt->table[key]; 6968 while (tmp != NULL) { 6969 if (tmp->next == exp) { 6970 tmp->next = exp->next; 6971 break; 6972 } 6973 tmp = tmp->next; 6974 } 6975 } 6976 6977 if ((exp->type == XML_EXP_SEQ) || (exp->type == XML_EXP_OR)) { 6978 xmlExpFree(ctxt, exp->exp_left); 6979 xmlExpFree(ctxt, exp->exp_right); 6980 } else if (exp->type == XML_EXP_COUNT) { 6981 xmlExpFree(ctxt, exp->exp_left); 6982 } 6983 xmlFree(exp); 6984 ctxt->nb_nodes--; 6985 } 6986 } 6987 6988 /** 6989 * xmlExpRef: 6990 * @exp: the expression 6991 * 6992 * Increase the reference count of the expression 6993 */ 6994 void 6995 xmlExpRef(xmlExpNodePtr exp) { 6996 if (exp != NULL) 6997 exp->ref++; 6998 } 6999 7000 /** 7001 * xmlExpNewAtom: 7002 * @ctxt: the expression context 7003 * @name: the atom name 7004 * @len: the atom name length in byte (or -1); 7005 * 7006 * Get the atom associated to this name from that context 7007 * 7008 * Returns the node or NULL in case of error 7009 */ 7010 xmlExpNodePtr 7011 xmlExpNewAtom(xmlExpCtxtPtr ctxt, const xmlChar *name, int len) { 7012 if ((ctxt == NULL) || (name == NULL)) 7013 return(NULL); 7014 name = xmlDictLookup(ctxt->dict, name, len); 7015 if (name == NULL) 7016 return(NULL); 7017 return(xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, name, 0, 0)); 7018 } 7019 7020 /** 7021 * xmlExpNewOr: 7022 * @ctxt: the expression context 7023 * @left: left expression 7024 * @right: right expression 7025 * 7026 * Get the atom associated to the choice @left | @right 7027 * Note that @left and @right are consumed in the operation, to keep 7028 * an handle on them use xmlExpRef() and use xmlExpFree() to release them, 7029 * this is true even in case of failure (unless ctxt == NULL). 7030 * 7031 * Returns the node or NULL in case of error 7032 */ 7033 xmlExpNodePtr 7034 xmlExpNewOr(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) { 7035 if (ctxt == NULL) 7036 return(NULL); 7037 if ((left == NULL) || (right == NULL)) { 7038 xmlExpFree(ctxt, left); 7039 xmlExpFree(ctxt, right); 7040 return(NULL); 7041 } 7042 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, right, NULL, 0, 0)); 7043 } 7044 7045 /** 7046 * xmlExpNewSeq: 7047 * @ctxt: the expression context 7048 * @left: left expression 7049 * @right: right expression 7050 * 7051 * Get the atom associated to the sequence @left , @right 7052 * Note that @left and @right are consumed in the operation, to keep 7053 * an handle on them use xmlExpRef() and use xmlExpFree() to release them, 7054 * this is true even in case of failure (unless ctxt == NULL). 7055 * 7056 * Returns the node or NULL in case of error 7057 */ 7058 xmlExpNodePtr 7059 xmlExpNewSeq(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) { 7060 if (ctxt == NULL) 7061 return(NULL); 7062 if ((left == NULL) || (right == NULL)) { 7063 xmlExpFree(ctxt, left); 7064 xmlExpFree(ctxt, right); 7065 return(NULL); 7066 } 7067 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, left, right, NULL, 0, 0)); 7068 } 7069 7070 /** 7071 * xmlExpNewRange: 7072 * @ctxt: the expression context 7073 * @subset: the expression to be repeated 7074 * @min: the lower bound for the repetition 7075 * @max: the upper bound for the repetition, -1 means infinite 7076 * 7077 * Get the atom associated to the range (@subset){@min, @max} 7078 * Note that @subset is consumed in the operation, to keep 7079 * an handle on it use xmlExpRef() and use xmlExpFree() to release it, 7080 * this is true even in case of failure (unless ctxt == NULL). 7081 * 7082 * Returns the node or NULL in case of error 7083 */ 7084 xmlExpNodePtr 7085 xmlExpNewRange(xmlExpCtxtPtr ctxt, xmlExpNodePtr subset, int min, int max) { 7086 if (ctxt == NULL) 7087 return(NULL); 7088 if ((subset == NULL) || (min < 0) || (max < -1) || 7089 ((max >= 0) && (min > max))) { 7090 xmlExpFree(ctxt, subset); 7091 return(NULL); 7092 } 7093 return(xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, subset, 7094 NULL, NULL, min, max)); 7095 } 7096 7097 /************************************************************************ 7098 * * 7099 * Public API for operations on expressions * 7100 * * 7101 ************************************************************************/ 7102 7103 static int 7104 xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7105 const xmlChar**list, int len, int nb) { 7106 int tmp, tmp2; 7107 tail: 7108 switch (exp->type) { 7109 case XML_EXP_EMPTY: 7110 return(0); 7111 case XML_EXP_ATOM: 7112 for (tmp = 0;tmp < nb;tmp++) 7113 if (list[tmp] == exp->exp_str) 7114 return(0); 7115 if (nb >= len) 7116 return(-2); 7117 list[nb] = exp->exp_str; 7118 return(1); 7119 case XML_EXP_COUNT: 7120 exp = exp->exp_left; 7121 goto tail; 7122 case XML_EXP_SEQ: 7123 case XML_EXP_OR: 7124 tmp = xmlExpGetLanguageInt(ctxt, exp->exp_left, list, len, nb); 7125 if (tmp < 0) 7126 return(tmp); 7127 tmp2 = xmlExpGetLanguageInt(ctxt, exp->exp_right, list, len, 7128 nb + tmp); 7129 if (tmp2 < 0) 7130 return(tmp2); 7131 return(tmp + tmp2); 7132 } 7133 return(-1); 7134 } 7135 7136 /** 7137 * xmlExpGetLanguage: 7138 * @ctxt: the expression context 7139 * @exp: the expression 7140 * @langList: where to store the tokens 7141 * @len: the allocated length of @list 7142 * 7143 * Find all the strings used in @exp and store them in @list 7144 * 7145 * Returns the number of unique strings found, -1 in case of errors and 7146 * -2 if there is more than @len strings 7147 */ 7148 int 7149 xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7150 const xmlChar**langList, int len) { 7151 if ((ctxt == NULL) || (exp == NULL) || (langList == NULL) || (len <= 0)) 7152 return(-1); 7153 return(xmlExpGetLanguageInt(ctxt, exp, langList, len, 0)); 7154 } 7155 7156 static int 7157 xmlExpGetStartInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7158 const xmlChar**list, int len, int nb) { 7159 int tmp, tmp2; 7160 tail: 7161 switch (exp->type) { 7162 case XML_EXP_FORBID: 7163 return(0); 7164 case XML_EXP_EMPTY: 7165 return(0); 7166 case XML_EXP_ATOM: 7167 for (tmp = 0;tmp < nb;tmp++) 7168 if (list[tmp] == exp->exp_str) 7169 return(0); 7170 if (nb >= len) 7171 return(-2); 7172 list[nb] = exp->exp_str; 7173 return(1); 7174 case XML_EXP_COUNT: 7175 exp = exp->exp_left; 7176 goto tail; 7177 case XML_EXP_SEQ: 7178 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb); 7179 if (tmp < 0) 7180 return(tmp); 7181 if (IS_NILLABLE(exp->exp_left)) { 7182 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len, 7183 nb + tmp); 7184 if (tmp2 < 0) 7185 return(tmp2); 7186 tmp += tmp2; 7187 } 7188 return(tmp); 7189 case XML_EXP_OR: 7190 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb); 7191 if (tmp < 0) 7192 return(tmp); 7193 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len, 7194 nb + tmp); 7195 if (tmp2 < 0) 7196 return(tmp2); 7197 return(tmp + tmp2); 7198 } 7199 return(-1); 7200 } 7201 7202 /** 7203 * xmlExpGetStart: 7204 * @ctxt: the expression context 7205 * @exp: the expression 7206 * @tokList: where to store the tokens 7207 * @len: the allocated length of @list 7208 * 7209 * Find all the strings that appears at the start of the languages 7210 * accepted by @exp and store them in @list. E.g. for (a, b) | c 7211 * it will return the list [a, c] 7212 * 7213 * Returns the number of unique strings found, -1 in case of errors and 7214 * -2 if there is more than @len strings 7215 */ 7216 int 7217 xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7218 const xmlChar**tokList, int len) { 7219 if ((ctxt == NULL) || (exp == NULL) || (tokList == NULL) || (len <= 0)) 7220 return(-1); 7221 return(xmlExpGetStartInt(ctxt, exp, tokList, len, 0)); 7222 } 7223 7224 /** 7225 * xmlExpIsNillable: 7226 * @exp: the expression 7227 * 7228 * Finds if the expression is nillable, i.e. if it accepts the empty sequence 7229 * 7230 * Returns 1 if nillable, 0 if not and -1 in case of error 7231 */ 7232 int 7233 xmlExpIsNillable(xmlExpNodePtr exp) { 7234 if (exp == NULL) 7235 return(-1); 7236 return(IS_NILLABLE(exp) != 0); 7237 } 7238 7239 static xmlExpNodePtr 7240 xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar *str) 7241 { 7242 xmlExpNodePtr ret; 7243 7244 switch (exp->type) { 7245 case XML_EXP_EMPTY: 7246 return(forbiddenExp); 7247 case XML_EXP_FORBID: 7248 return(forbiddenExp); 7249 case XML_EXP_ATOM: 7250 if (exp->exp_str == str) { 7251 #ifdef DEBUG_DERIV 7252 printf("deriv atom: equal => Empty\n"); 7253 #endif 7254 ret = emptyExp; 7255 } else { 7256 #ifdef DEBUG_DERIV 7257 printf("deriv atom: mismatch => forbid\n"); 7258 #endif 7259 /* TODO wildcards here */ 7260 ret = forbiddenExp; 7261 } 7262 return(ret); 7263 case XML_EXP_OR: { 7264 xmlExpNodePtr tmp; 7265 7266 #ifdef DEBUG_DERIV 7267 printf("deriv or: => or(derivs)\n"); 7268 #endif 7269 tmp = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 7270 if (tmp == NULL) { 7271 return(NULL); 7272 } 7273 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str); 7274 if (ret == NULL) { 7275 xmlExpFree(ctxt, tmp); 7276 return(NULL); 7277 } 7278 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, 7279 NULL, 0, 0); 7280 return(ret); 7281 } 7282 case XML_EXP_SEQ: 7283 #ifdef DEBUG_DERIV 7284 printf("deriv seq: starting with left\n"); 7285 #endif 7286 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 7287 if (ret == NULL) { 7288 return(NULL); 7289 } else if (ret == forbiddenExp) { 7290 if (IS_NILLABLE(exp->exp_left)) { 7291 #ifdef DEBUG_DERIV 7292 printf("deriv seq: left failed but nillable\n"); 7293 #endif 7294 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str); 7295 } 7296 } else { 7297 #ifdef DEBUG_DERIV 7298 printf("deriv seq: left match => sequence\n"); 7299 #endif 7300 exp->exp_right->ref++; 7301 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, exp->exp_right, 7302 NULL, 0, 0); 7303 } 7304 return(ret); 7305 case XML_EXP_COUNT: { 7306 int min, max; 7307 xmlExpNodePtr tmp; 7308 7309 if (exp->exp_max == 0) 7310 return(forbiddenExp); 7311 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 7312 if (ret == NULL) 7313 return(NULL); 7314 if (ret == forbiddenExp) { 7315 #ifdef DEBUG_DERIV 7316 printf("deriv count: pattern mismatch => forbid\n"); 7317 #endif 7318 return(ret); 7319 } 7320 if (exp->exp_max == 1) 7321 return(ret); 7322 if (exp->exp_max < 0) /* unbounded */ 7323 max = -1; 7324 else 7325 max = exp->exp_max - 1; 7326 if (exp->exp_min > 0) 7327 min = exp->exp_min - 1; 7328 else 7329 min = 0; 7330 exp->exp_left->ref++; 7331 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, NULL, 7332 NULL, min, max); 7333 if (ret == emptyExp) { 7334 #ifdef DEBUG_DERIV 7335 printf("deriv count: match to empty => new count\n"); 7336 #endif 7337 return(tmp); 7338 } 7339 #ifdef DEBUG_DERIV 7340 printf("deriv count: match => sequence with new count\n"); 7341 #endif 7342 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, tmp, 7343 NULL, 0, 0)); 7344 } 7345 } 7346 return(NULL); 7347 } 7348 7349 /** 7350 * xmlExpStringDerive: 7351 * @ctxt: the expression context 7352 * @exp: the expression 7353 * @str: the string 7354 * @len: the string len in bytes if available 7355 * 7356 * Do one step of Brzozowski derivation of the expression @exp with 7357 * respect to the input string 7358 * 7359 * Returns the resulting expression or NULL in case of internal error 7360 */ 7361 xmlExpNodePtr 7362 xmlExpStringDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7363 const xmlChar *str, int len) { 7364 const xmlChar *input; 7365 7366 if ((exp == NULL) || (ctxt == NULL) || (str == NULL)) { 7367 return(NULL); 7368 } 7369 /* 7370 * check the string is in the dictionary, if yes use an interned 7371 * copy, otherwise we know it's not an acceptable input 7372 */ 7373 input = xmlDictExists(ctxt->dict, str, len); 7374 if (input == NULL) { 7375 return(forbiddenExp); 7376 } 7377 return(xmlExpStringDeriveInt(ctxt, exp, input)); 7378 } 7379 7380 static int 7381 xmlExpCheckCard(xmlExpNodePtr exp, xmlExpNodePtr sub) { 7382 int ret = 1; 7383 7384 if (sub->c_max == -1) { 7385 if (exp->c_max != -1) 7386 ret = 0; 7387 } else if ((exp->c_max >= 0) && (exp->c_max < sub->c_max)) { 7388 ret = 0; 7389 } 7390 #if 0 7391 if ((IS_NILLABLE(sub)) && (!IS_NILLABLE(exp))) 7392 ret = 0; 7393 #endif 7394 return(ret); 7395 } 7396 7397 static xmlExpNodePtr xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7398 xmlExpNodePtr sub); 7399 /** 7400 * xmlExpDivide: 7401 * @ctxt: the expressions context 7402 * @exp: the englobing expression 7403 * @sub: the subexpression 7404 * @mult: the multiple expression 7405 * @remain: the remain from the derivation of the multiple 7406 * 7407 * Check if exp is a multiple of sub, i.e. if there is a finite number n 7408 * so that sub{n} subsume exp 7409 * 7410 * Returns the multiple value if successful, 0 if it is not a multiple 7411 * and -1 in case of internal error. 7412 */ 7413 7414 static int 7415 xmlExpDivide(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub, 7416 xmlExpNodePtr *mult, xmlExpNodePtr *remain) { 7417 int i; 7418 xmlExpNodePtr tmp, tmp2; 7419 7420 if (mult != NULL) *mult = NULL; 7421 if (remain != NULL) *remain = NULL; 7422 if (exp->c_max == -1) return(0); 7423 if (IS_NILLABLE(exp) && (!IS_NILLABLE(sub))) return(0); 7424 7425 for (i = 1;i <= exp->c_max;i++) { 7426 sub->ref++; 7427 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, 7428 sub, NULL, NULL, i, i); 7429 if (tmp == NULL) { 7430 return(-1); 7431 } 7432 if (!xmlExpCheckCard(tmp, exp)) { 7433 xmlExpFree(ctxt, tmp); 7434 continue; 7435 } 7436 tmp2 = xmlExpExpDeriveInt(ctxt, tmp, exp); 7437 if (tmp2 == NULL) { 7438 xmlExpFree(ctxt, tmp); 7439 return(-1); 7440 } 7441 if ((tmp2 != forbiddenExp) && (IS_NILLABLE(tmp2))) { 7442 if (remain != NULL) 7443 *remain = tmp2; 7444 else 7445 xmlExpFree(ctxt, tmp2); 7446 if (mult != NULL) 7447 *mult = tmp; 7448 else 7449 xmlExpFree(ctxt, tmp); 7450 #ifdef DEBUG_DERIV 7451 printf("Divide succeeded %d\n", i); 7452 #endif 7453 return(i); 7454 } 7455 xmlExpFree(ctxt, tmp); 7456 xmlExpFree(ctxt, tmp2); 7457 } 7458 #ifdef DEBUG_DERIV 7459 printf("Divide failed\n"); 7460 #endif 7461 return(0); 7462 } 7463 7464 /** 7465 * xmlExpExpDeriveInt: 7466 * @ctxt: the expressions context 7467 * @exp: the englobing expression 7468 * @sub: the subexpression 7469 * 7470 * Try to do a step of Brzozowski derivation but at a higher level 7471 * the input being a subexpression. 7472 * 7473 * Returns the resulting expression or NULL in case of internal error 7474 */ 7475 static xmlExpNodePtr 7476 xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7477 xmlExpNodePtr ret, tmp, tmp2, tmp3; 7478 const xmlChar **tab; 7479 int len, i; 7480 7481 /* 7482 * In case of equality and if the expression can only consume a finite 7483 * amount, then the derivation is empty 7484 */ 7485 if ((exp == sub) && (exp->c_max >= 0)) { 7486 #ifdef DEBUG_DERIV 7487 printf("Equal(exp, sub) and finite -> Empty\n"); 7488 #endif 7489 return(emptyExp); 7490 } 7491 /* 7492 * decompose sub sequence first 7493 */ 7494 if (sub->type == XML_EXP_EMPTY) { 7495 #ifdef DEBUG_DERIV 7496 printf("Empty(sub) -> Empty\n"); 7497 #endif 7498 exp->ref++; 7499 return(exp); 7500 } 7501 if (sub->type == XML_EXP_SEQ) { 7502 #ifdef DEBUG_DERIV 7503 printf("Seq(sub) -> decompose\n"); 7504 #endif 7505 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left); 7506 if (tmp == NULL) 7507 return(NULL); 7508 if (tmp == forbiddenExp) 7509 return(tmp); 7510 ret = xmlExpExpDeriveInt(ctxt, tmp, sub->exp_right); 7511 xmlExpFree(ctxt, tmp); 7512 return(ret); 7513 } 7514 if (sub->type == XML_EXP_OR) { 7515 #ifdef DEBUG_DERIV 7516 printf("Or(sub) -> decompose\n"); 7517 #endif 7518 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left); 7519 if (tmp == forbiddenExp) 7520 return(tmp); 7521 if (tmp == NULL) 7522 return(NULL); 7523 ret = xmlExpExpDeriveInt(ctxt, exp, sub->exp_right); 7524 if ((ret == NULL) || (ret == forbiddenExp)) { 7525 xmlExpFree(ctxt, tmp); 7526 return(ret); 7527 } 7528 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, NULL, 0, 0)); 7529 } 7530 if (!xmlExpCheckCard(exp, sub)) { 7531 #ifdef DEBUG_DERIV 7532 printf("CheckCard(exp, sub) failed -> Forbid\n"); 7533 #endif 7534 return(forbiddenExp); 7535 } 7536 switch (exp->type) { 7537 case XML_EXP_EMPTY: 7538 if (sub == emptyExp) 7539 return(emptyExp); 7540 #ifdef DEBUG_DERIV 7541 printf("Empty(exp) -> Forbid\n"); 7542 #endif 7543 return(forbiddenExp); 7544 case XML_EXP_FORBID: 7545 #ifdef DEBUG_DERIV 7546 printf("Forbid(exp) -> Forbid\n"); 7547 #endif 7548 return(forbiddenExp); 7549 case XML_EXP_ATOM: 7550 if (sub->type == XML_EXP_ATOM) { 7551 /* TODO: handle wildcards */ 7552 if (exp->exp_str == sub->exp_str) { 7553 #ifdef DEBUG_DERIV 7554 printf("Atom match -> Empty\n"); 7555 #endif 7556 return(emptyExp); 7557 } 7558 #ifdef DEBUG_DERIV 7559 printf("Atom mismatch -> Forbid\n"); 7560 #endif 7561 return(forbiddenExp); 7562 } 7563 if ((sub->type == XML_EXP_COUNT) && 7564 (sub->exp_max == 1) && 7565 (sub->exp_left->type == XML_EXP_ATOM)) { 7566 /* TODO: handle wildcards */ 7567 if (exp->exp_str == sub->exp_left->exp_str) { 7568 #ifdef DEBUG_DERIV 7569 printf("Atom match -> Empty\n"); 7570 #endif 7571 return(emptyExp); 7572 } 7573 #ifdef DEBUG_DERIV 7574 printf("Atom mismatch -> Forbid\n"); 7575 #endif 7576 return(forbiddenExp); 7577 } 7578 #ifdef DEBUG_DERIV 7579 printf("Complex exp vs Atom -> Forbid\n"); 7580 #endif 7581 return(forbiddenExp); 7582 case XML_EXP_SEQ: 7583 /* try to get the sequence consumed only if possible */ 7584 if (xmlExpCheckCard(exp->exp_left, sub)) { 7585 /* See if the sequence can be consumed directly */ 7586 #ifdef DEBUG_DERIV 7587 printf("Seq trying left only\n"); 7588 #endif 7589 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 7590 if ((ret != forbiddenExp) && (ret != NULL)) { 7591 #ifdef DEBUG_DERIV 7592 printf("Seq trying left only worked\n"); 7593 #endif 7594 /* 7595 * TODO: assumption here that we are determinist 7596 * i.e. we won't get to a nillable exp left 7597 * subset which could be matched by the right 7598 * part too. 7599 * e.g.: (a | b)+,(a | c) and 'a+,a' 7600 */ 7601 exp->exp_right->ref++; 7602 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, 7603 exp->exp_right, NULL, 0, 0)); 7604 } 7605 #ifdef DEBUG_DERIV 7606 } else { 7607 printf("Seq: left too short\n"); 7608 #endif 7609 } 7610 /* Try instead to decompose */ 7611 if (sub->type == XML_EXP_COUNT) { 7612 int min, max; 7613 7614 #ifdef DEBUG_DERIV 7615 printf("Seq: sub is a count\n"); 7616 #endif 7617 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left); 7618 if (ret == NULL) 7619 return(NULL); 7620 if (ret != forbiddenExp) { 7621 #ifdef DEBUG_DERIV 7622 printf("Seq , Count match on left\n"); 7623 #endif 7624 if (sub->exp_max < 0) 7625 max = -1; 7626 else 7627 max = sub->exp_max -1; 7628 if (sub->exp_min > 0) 7629 min = sub->exp_min -1; 7630 else 7631 min = 0; 7632 exp->exp_right->ref++; 7633 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, 7634 exp->exp_right, NULL, 0, 0); 7635 if (tmp == NULL) 7636 return(NULL); 7637 7638 sub->exp_left->ref++; 7639 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, 7640 sub->exp_left, NULL, NULL, min, max); 7641 if (tmp2 == NULL) { 7642 xmlExpFree(ctxt, tmp); 7643 return(NULL); 7644 } 7645 ret = xmlExpExpDeriveInt(ctxt, tmp, tmp2); 7646 xmlExpFree(ctxt, tmp); 7647 xmlExpFree(ctxt, tmp2); 7648 return(ret); 7649 } 7650 } 7651 /* we made no progress on structured operations */ 7652 break; 7653 case XML_EXP_OR: 7654 #ifdef DEBUG_DERIV 7655 printf("Or , trying both side\n"); 7656 #endif 7657 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 7658 if (ret == NULL) 7659 return(NULL); 7660 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_right, sub); 7661 if (tmp == NULL) { 7662 xmlExpFree(ctxt, ret); 7663 return(NULL); 7664 } 7665 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp, NULL, 0, 0)); 7666 case XML_EXP_COUNT: { 7667 int min, max; 7668 7669 if (sub->type == XML_EXP_COUNT) { 7670 /* 7671 * Try to see if the loop is completely subsumed 7672 */ 7673 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left); 7674 if (tmp == NULL) 7675 return(NULL); 7676 if (tmp == forbiddenExp) { 7677 int mult; 7678 7679 #ifdef DEBUG_DERIV 7680 printf("Count, Count inner don't subsume\n"); 7681 #endif 7682 mult = xmlExpDivide(ctxt, sub->exp_left, exp->exp_left, 7683 NULL, &tmp); 7684 if (mult <= 0) { 7685 #ifdef DEBUG_DERIV 7686 printf("Count, Count not multiple => forbidden\n"); 7687 #endif 7688 return(forbiddenExp); 7689 } 7690 if (sub->exp_max == -1) { 7691 max = -1; 7692 if (exp->exp_max == -1) { 7693 if (exp->exp_min <= sub->exp_min * mult) 7694 min = 0; 7695 else 7696 min = exp->exp_min - sub->exp_min * mult; 7697 } else { 7698 #ifdef DEBUG_DERIV 7699 printf("Count, Count finite can't subsume infinite\n"); 7700 #endif 7701 xmlExpFree(ctxt, tmp); 7702 return(forbiddenExp); 7703 } 7704 } else { 7705 if (exp->exp_max == -1) { 7706 #ifdef DEBUG_DERIV 7707 printf("Infinite loop consume mult finite loop\n"); 7708 #endif 7709 if (exp->exp_min > sub->exp_min * mult) { 7710 max = -1; 7711 min = exp->exp_min - sub->exp_min * mult; 7712 } else { 7713 max = -1; 7714 min = 0; 7715 } 7716 } else { 7717 if (exp->exp_max < sub->exp_max * mult) { 7718 #ifdef DEBUG_DERIV 7719 printf("loops max mult mismatch => forbidden\n"); 7720 #endif 7721 xmlExpFree(ctxt, tmp); 7722 return(forbiddenExp); 7723 } 7724 if (sub->exp_max * mult > exp->exp_min) 7725 min = 0; 7726 else 7727 min = exp->exp_min - sub->exp_max * mult; 7728 max = exp->exp_max - sub->exp_max * mult; 7729 } 7730 } 7731 } else if (!IS_NILLABLE(tmp)) { 7732 /* 7733 * TODO: loop here to try to grow if working on finite 7734 * blocks. 7735 */ 7736 #ifdef DEBUG_DERIV 7737 printf("Count, Count remain not nillable => forbidden\n"); 7738 #endif 7739 xmlExpFree(ctxt, tmp); 7740 return(forbiddenExp); 7741 } else if (sub->exp_max == -1) { 7742 if (exp->exp_max == -1) { 7743 if (exp->exp_min <= sub->exp_min) { 7744 #ifdef DEBUG_DERIV 7745 printf("Infinite loops Okay => COUNT(0,Inf)\n"); 7746 #endif 7747 max = -1; 7748 min = 0; 7749 } else { 7750 #ifdef DEBUG_DERIV 7751 printf("Infinite loops min => Count(X,Inf)\n"); 7752 #endif 7753 max = -1; 7754 min = exp->exp_min - sub->exp_min; 7755 } 7756 } else if (exp->exp_min > sub->exp_min) { 7757 #ifdef DEBUG_DERIV 7758 printf("loops min mismatch 1 => forbidden ???\n"); 7759 #endif 7760 xmlExpFree(ctxt, tmp); 7761 return(forbiddenExp); 7762 } else { 7763 max = -1; 7764 min = 0; 7765 } 7766 } else { 7767 if (exp->exp_max == -1) { 7768 #ifdef DEBUG_DERIV 7769 printf("Infinite loop consume finite loop\n"); 7770 #endif 7771 if (exp->exp_min > sub->exp_min) { 7772 max = -1; 7773 min = exp->exp_min - sub->exp_min; 7774 } else { 7775 max = -1; 7776 min = 0; 7777 } 7778 } else { 7779 if (exp->exp_max < sub->exp_max) { 7780 #ifdef DEBUG_DERIV 7781 printf("loops max mismatch => forbidden\n"); 7782 #endif 7783 xmlExpFree(ctxt, tmp); 7784 return(forbiddenExp); 7785 } 7786 if (sub->exp_max > exp->exp_min) 7787 min = 0; 7788 else 7789 min = exp->exp_min - sub->exp_max; 7790 max = exp->exp_max - sub->exp_max; 7791 } 7792 } 7793 #ifdef DEBUG_DERIV 7794 printf("loops match => SEQ(COUNT())\n"); 7795 #endif 7796 exp->exp_left->ref++; 7797 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, 7798 NULL, NULL, min, max); 7799 if (tmp2 == NULL) { 7800 return(NULL); 7801 } 7802 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2, 7803 NULL, 0, 0); 7804 return(ret); 7805 } 7806 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 7807 if (tmp == NULL) 7808 return(NULL); 7809 if (tmp == forbiddenExp) { 7810 #ifdef DEBUG_DERIV 7811 printf("loop mismatch => forbidden\n"); 7812 #endif 7813 return(forbiddenExp); 7814 } 7815 if (exp->exp_min > 0) 7816 min = exp->exp_min - 1; 7817 else 7818 min = 0; 7819 if (exp->exp_max < 0) 7820 max = -1; 7821 else 7822 max = exp->exp_max - 1; 7823 7824 #ifdef DEBUG_DERIV 7825 printf("loop match => SEQ(COUNT())\n"); 7826 #endif 7827 exp->exp_left->ref++; 7828 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, 7829 NULL, NULL, min, max); 7830 if (tmp2 == NULL) 7831 return(NULL); 7832 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2, 7833 NULL, 0, 0); 7834 return(ret); 7835 } 7836 } 7837 7838 #ifdef DEBUG_DERIV 7839 printf("Fallback to derivative\n"); 7840 #endif 7841 if (IS_NILLABLE(sub)) { 7842 if (!(IS_NILLABLE(exp))) 7843 return(forbiddenExp); 7844 else 7845 ret = emptyExp; 7846 } else 7847 ret = NULL; 7848 /* 7849 * here the structured derivation made no progress so 7850 * we use the default token based derivation to force one more step 7851 */ 7852 if (ctxt->tabSize == 0) 7853 ctxt->tabSize = 40; 7854 7855 tab = (const xmlChar **) xmlMalloc(ctxt->tabSize * 7856 sizeof(const xmlChar *)); 7857 if (tab == NULL) { 7858 return(NULL); 7859 } 7860 7861 /* 7862 * collect all the strings accepted by the subexpression on input 7863 */ 7864 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0); 7865 while (len < 0) { 7866 const xmlChar **temp; 7867 temp = (const xmlChar **) xmlRealloc((xmlChar **) tab, ctxt->tabSize * 2 * 7868 sizeof(const xmlChar *)); 7869 if (temp == NULL) { 7870 xmlFree((xmlChar **) tab); 7871 return(NULL); 7872 } 7873 tab = temp; 7874 ctxt->tabSize *= 2; 7875 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0); 7876 } 7877 for (i = 0;i < len;i++) { 7878 tmp = xmlExpStringDeriveInt(ctxt, exp, tab[i]); 7879 if ((tmp == NULL) || (tmp == forbiddenExp)) { 7880 xmlExpFree(ctxt, ret); 7881 xmlFree((xmlChar **) tab); 7882 return(tmp); 7883 } 7884 tmp2 = xmlExpStringDeriveInt(ctxt, sub, tab[i]); 7885 if ((tmp2 == NULL) || (tmp2 == forbiddenExp)) { 7886 xmlExpFree(ctxt, tmp); 7887 xmlExpFree(ctxt, ret); 7888 xmlFree((xmlChar **) tab); 7889 return(tmp); 7890 } 7891 tmp3 = xmlExpExpDeriveInt(ctxt, tmp, tmp2); 7892 xmlExpFree(ctxt, tmp); 7893 xmlExpFree(ctxt, tmp2); 7894 7895 if ((tmp3 == NULL) || (tmp3 == forbiddenExp)) { 7896 xmlExpFree(ctxt, ret); 7897 xmlFree((xmlChar **) tab); 7898 return(tmp3); 7899 } 7900 7901 if (ret == NULL) 7902 ret = tmp3; 7903 else { 7904 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp3, NULL, 0, 0); 7905 if (ret == NULL) { 7906 xmlFree((xmlChar **) tab); 7907 return(NULL); 7908 } 7909 } 7910 } 7911 xmlFree((xmlChar **) tab); 7912 return(ret); 7913 } 7914 7915 /** 7916 * xmlExpExpDerive: 7917 * @ctxt: the expressions context 7918 * @exp: the englobing expression 7919 * @sub: the subexpression 7920 * 7921 * Evaluates the expression resulting from @exp consuming a sub expression @sub 7922 * Based on algebraic derivation and sometimes direct Brzozowski derivation 7923 * it usually takes less than linear time and can handle expressions generating 7924 * infinite languages. 7925 * 7926 * Returns the resulting expression or NULL in case of internal error, the 7927 * result must be freed 7928 */ 7929 xmlExpNodePtr 7930 xmlExpExpDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7931 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL)) 7932 return(NULL); 7933 7934 /* 7935 * O(1) speedups 7936 */ 7937 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) { 7938 #ifdef DEBUG_DERIV 7939 printf("Sub nillable and not exp : can't subsume\n"); 7940 #endif 7941 return(forbiddenExp); 7942 } 7943 if (xmlExpCheckCard(exp, sub) == 0) { 7944 #ifdef DEBUG_DERIV 7945 printf("sub generate longer sequences than exp : can't subsume\n"); 7946 #endif 7947 return(forbiddenExp); 7948 } 7949 return(xmlExpExpDeriveInt(ctxt, exp, sub)); 7950 } 7951 7952 /** 7953 * xmlExpSubsume: 7954 * @ctxt: the expressions context 7955 * @exp: the englobing expression 7956 * @sub: the subexpression 7957 * 7958 * Check whether @exp accepts all the languages accepted by @sub 7959 * the input being a subexpression. 7960 * 7961 * Returns 1 if true 0 if false and -1 in case of failure. 7962 */ 7963 int 7964 xmlExpSubsume(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7965 xmlExpNodePtr tmp; 7966 7967 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL)) 7968 return(-1); 7969 7970 /* 7971 * TODO: speedup by checking the language of sub is a subset of the 7972 * language of exp 7973 */ 7974 /* 7975 * O(1) speedups 7976 */ 7977 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) { 7978 #ifdef DEBUG_DERIV 7979 printf("Sub nillable and not exp : can't subsume\n"); 7980 #endif 7981 return(0); 7982 } 7983 if (xmlExpCheckCard(exp, sub) == 0) { 7984 #ifdef DEBUG_DERIV 7985 printf("sub generate longer sequences than exp : can't subsume\n"); 7986 #endif 7987 return(0); 7988 } 7989 tmp = xmlExpExpDeriveInt(ctxt, exp, sub); 7990 #ifdef DEBUG_DERIV 7991 printf("Result derivation :\n"); 7992 PRINT_EXP(tmp); 7993 #endif 7994 if (tmp == NULL) 7995 return(-1); 7996 if (tmp == forbiddenExp) 7997 return(0); 7998 if (tmp == emptyExp) 7999 return(1); 8000 if ((tmp != NULL) && (IS_NILLABLE(tmp))) { 8001 xmlExpFree(ctxt, tmp); 8002 return(1); 8003 } 8004 xmlExpFree(ctxt, tmp); 8005 return(0); 8006 } 8007 8008 /************************************************************************ 8009 * * 8010 * Parsing expression * 8011 * * 8012 ************************************************************************/ 8013 8014 static xmlExpNodePtr xmlExpParseExpr(xmlExpCtxtPtr ctxt); 8015 8016 #undef CUR 8017 #define CUR (*ctxt->cur) 8018 #undef NEXT 8019 #define NEXT ctxt->cur++; 8020 #undef IS_BLANK 8021 #define IS_BLANK(c) ((c == ' ') || (c == '\n') || (c == '\r') || (c == '\t')) 8022 #define SKIP_BLANKS while (IS_BLANK(*ctxt->cur)) ctxt->cur++; 8023 8024 static int 8025 xmlExpParseNumber(xmlExpCtxtPtr ctxt) { 8026 int ret = 0; 8027 8028 SKIP_BLANKS 8029 if (CUR == '*') { 8030 NEXT 8031 return(-1); 8032 } 8033 if ((CUR < '0') || (CUR > '9')) 8034 return(-1); 8035 while ((CUR >= '0') && (CUR <= '9')) { 8036 ret = ret * 10 + (CUR - '0'); 8037 NEXT 8038 } 8039 return(ret); 8040 } 8041 8042 static xmlExpNodePtr 8043 xmlExpParseOr(xmlExpCtxtPtr ctxt) { 8044 const char *base; 8045 xmlExpNodePtr ret; 8046 const xmlChar *val; 8047 8048 SKIP_BLANKS 8049 base = ctxt->cur; 8050 if (*ctxt->cur == '(') { 8051 NEXT 8052 ret = xmlExpParseExpr(ctxt); 8053 SKIP_BLANKS 8054 if (*ctxt->cur != ')') { 8055 fprintf(stderr, "unbalanced '(' : %s\n", base); 8056 xmlExpFree(ctxt, ret); 8057 return(NULL); 8058 } 8059 NEXT; 8060 SKIP_BLANKS 8061 goto parse_quantifier; 8062 } 8063 while ((CUR != 0) && (!(IS_BLANK(CUR))) && (CUR != '(') && 8064 (CUR != ')') && (CUR != '|') && (CUR != ',') && (CUR != '{') && 8065 (CUR != '*') && (CUR != '+') && (CUR != '?') && (CUR != '}')) 8066 NEXT; 8067 val = xmlDictLookup(ctxt->dict, BAD_CAST base, ctxt->cur - base); 8068 if (val == NULL) 8069 return(NULL); 8070 ret = xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, val, 0, 0); 8071 if (ret == NULL) 8072 return(NULL); 8073 SKIP_BLANKS 8074 parse_quantifier: 8075 if (CUR == '{') { 8076 int min, max; 8077 8078 NEXT 8079 min = xmlExpParseNumber(ctxt); 8080 if (min < 0) { 8081 xmlExpFree(ctxt, ret); 8082 return(NULL); 8083 } 8084 SKIP_BLANKS 8085 if (CUR == ',') { 8086 NEXT 8087 max = xmlExpParseNumber(ctxt); 8088 SKIP_BLANKS 8089 } else 8090 max = min; 8091 if (CUR != '}') { 8092 xmlExpFree(ctxt, ret); 8093 return(NULL); 8094 } 8095 NEXT 8096 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 8097 min, max); 8098 SKIP_BLANKS 8099 } else if (CUR == '?') { 8100 NEXT 8101 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 8102 0, 1); 8103 SKIP_BLANKS 8104 } else if (CUR == '+') { 8105 NEXT 8106 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 8107 1, -1); 8108 SKIP_BLANKS 8109 } else if (CUR == '*') { 8110 NEXT 8111 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 8112 0, -1); 8113 SKIP_BLANKS 8114 } 8115 return(ret); 8116 } 8117 8118 8119 static xmlExpNodePtr 8120 xmlExpParseSeq(xmlExpCtxtPtr ctxt) { 8121 xmlExpNodePtr ret, right; 8122 8123 ret = xmlExpParseOr(ctxt); 8124 SKIP_BLANKS 8125 while (CUR == '|') { 8126 NEXT 8127 right = xmlExpParseOr(ctxt); 8128 if (right == NULL) { 8129 xmlExpFree(ctxt, ret); 8130 return(NULL); 8131 } 8132 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, right, NULL, 0, 0); 8133 if (ret == NULL) 8134 return(NULL); 8135 } 8136 return(ret); 8137 } 8138 8139 static xmlExpNodePtr 8140 xmlExpParseExpr(xmlExpCtxtPtr ctxt) { 8141 xmlExpNodePtr ret, right; 8142 8143 ret = xmlExpParseSeq(ctxt); 8144 SKIP_BLANKS 8145 while (CUR == ',') { 8146 NEXT 8147 right = xmlExpParseSeq(ctxt); 8148 if (right == NULL) { 8149 xmlExpFree(ctxt, ret); 8150 return(NULL); 8151 } 8152 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, right, NULL, 0, 0); 8153 if (ret == NULL) 8154 return(NULL); 8155 } 8156 return(ret); 8157 } 8158 8159 /** 8160 * xmlExpParse: 8161 * @ctxt: the expressions context 8162 * @expr: the 0 terminated string 8163 * 8164 * Minimal parser for regexps, it understand the following constructs 8165 * - string terminals 8166 * - choice operator | 8167 * - sequence operator , 8168 * - subexpressions (...) 8169 * - usual cardinality operators + * and ? 8170 * - finite sequences { min, max } 8171 * - infinite sequences { min, * } 8172 * There is minimal checkings made especially no checking on strings values 8173 * 8174 * Returns a new expression or NULL in case of failure 8175 */ 8176 xmlExpNodePtr 8177 xmlExpParse(xmlExpCtxtPtr ctxt, const char *expr) { 8178 xmlExpNodePtr ret; 8179 8180 ctxt->expr = expr; 8181 ctxt->cur = expr; 8182 8183 ret = xmlExpParseExpr(ctxt); 8184 SKIP_BLANKS 8185 if (*ctxt->cur != 0) { 8186 xmlExpFree(ctxt, ret); 8187 return(NULL); 8188 } 8189 return(ret); 8190 } 8191 8192 static void 8193 xmlExpDumpInt(xmlBufferPtr buf, xmlExpNodePtr expr, int glob) { 8194 xmlExpNodePtr c; 8195 8196 if (expr == NULL) return; 8197 if (glob) xmlBufferWriteChar(buf, "("); 8198 switch (expr->type) { 8199 case XML_EXP_EMPTY: 8200 xmlBufferWriteChar(buf, "empty"); 8201 break; 8202 case XML_EXP_FORBID: 8203 xmlBufferWriteChar(buf, "forbidden"); 8204 break; 8205 case XML_EXP_ATOM: 8206 xmlBufferWriteCHAR(buf, expr->exp_str); 8207 break; 8208 case XML_EXP_SEQ: 8209 c = expr->exp_left; 8210 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8211 xmlExpDumpInt(buf, c, 1); 8212 else 8213 xmlExpDumpInt(buf, c, 0); 8214 xmlBufferWriteChar(buf, " , "); 8215 c = expr->exp_right; 8216 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8217 xmlExpDumpInt(buf, c, 1); 8218 else 8219 xmlExpDumpInt(buf, c, 0); 8220 break; 8221 case XML_EXP_OR: 8222 c = expr->exp_left; 8223 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8224 xmlExpDumpInt(buf, c, 1); 8225 else 8226 xmlExpDumpInt(buf, c, 0); 8227 xmlBufferWriteChar(buf, " | "); 8228 c = expr->exp_right; 8229 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8230 xmlExpDumpInt(buf, c, 1); 8231 else 8232 xmlExpDumpInt(buf, c, 0); 8233 break; 8234 case XML_EXP_COUNT: { 8235 char rep[40]; 8236 8237 c = expr->exp_left; 8238 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8239 xmlExpDumpInt(buf, c, 1); 8240 else 8241 xmlExpDumpInt(buf, c, 0); 8242 if ((expr->exp_min == 0) && (expr->exp_max == 1)) { 8243 rep[0] = '?'; 8244 rep[1] = 0; 8245 } else if ((expr->exp_min == 0) && (expr->exp_max == -1)) { 8246 rep[0] = '*'; 8247 rep[1] = 0; 8248 } else if ((expr->exp_min == 1) && (expr->exp_max == -1)) { 8249 rep[0] = '+'; 8250 rep[1] = 0; 8251 } else if (expr->exp_max == expr->exp_min) { 8252 snprintf(rep, 39, "{%d}", expr->exp_min); 8253 } else if (expr->exp_max < 0) { 8254 snprintf(rep, 39, "{%d,inf}", expr->exp_min); 8255 } else { 8256 snprintf(rep, 39, "{%d,%d}", expr->exp_min, expr->exp_max); 8257 } 8258 rep[39] = 0; 8259 xmlBufferWriteChar(buf, rep); 8260 break; 8261 } 8262 default: 8263 fprintf(stderr, "Error in tree\n"); 8264 } 8265 if (glob) 8266 xmlBufferWriteChar(buf, ")"); 8267 } 8268 /** 8269 * xmlExpDump: 8270 * @buf: a buffer to receive the output 8271 * @expr: the compiled expression 8272 * 8273 * Serialize the expression as compiled to the buffer 8274 */ 8275 void 8276 xmlExpDump(xmlBufferPtr buf, xmlExpNodePtr expr) { 8277 if ((buf == NULL) || (expr == NULL)) 8278 return; 8279 xmlExpDumpInt(buf, expr, 0); 8280 } 8281 8282 /** 8283 * xmlExpMaxToken: 8284 * @expr: a compiled expression 8285 * 8286 * Indicate the maximum number of input a expression can accept 8287 * 8288 * Returns the maximum length or -1 in case of error 8289 */ 8290 int 8291 xmlExpMaxToken(xmlExpNodePtr expr) { 8292 if (expr == NULL) 8293 return(-1); 8294 return(expr->c_max); 8295 } 8296 8297 /** 8298 * xmlExpCtxtNbNodes: 8299 * @ctxt: an expression context 8300 * 8301 * Debugging facility provides the number of allocated nodes at a that point 8302 * 8303 * Returns the number of nodes in use or -1 in case of error 8304 */ 8305 int 8306 xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt) { 8307 if (ctxt == NULL) 8308 return(-1); 8309 return(ctxt->nb_nodes); 8310 } 8311 8312 /** 8313 * xmlExpCtxtNbCons: 8314 * @ctxt: an expression context 8315 * 8316 * Debugging facility provides the number of allocated nodes over lifetime 8317 * 8318 * Returns the number of nodes ever allocated or -1 in case of error 8319 */ 8320 int 8321 xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt) { 8322 if (ctxt == NULL) 8323 return(-1); 8324 return(ctxt->nb_cons); 8325 } 8326 8327 #endif /* LIBXML_EXPR_ENABLED */ 8328 8329 #endif /* LIBXML_REGEXP_ENABLED */ 8330