1 /* 2 * regexp.c: generic and extensible Regular Expression engine 3 * 4 * Basically designed with the purpose of compiling regexps for 5 * the variety of validation/shemas mechanisms now available in 6 * XML related specifications these include: 7 * - XML-1.0 DTD validation 8 * - XML Schemas structure part 1 9 * - XML Schemas Datatypes part 2 especially Appendix F 10 * - RELAX-NG/TREX i.e. the counter proposal 11 * 12 * See Copyright for the status of this software. 13 * 14 * Daniel Veillard <veillard@redhat.com> 15 */ 16 17 #define IN_LIBXML 18 #include "libxml.h" 19 20 #ifdef LIBXML_REGEXP_ENABLED 21 22 /* #define DEBUG_ERR */ 23 24 #include <stdio.h> 25 #include <string.h> 26 #ifdef HAVE_LIMITS_H 27 #include <limits.h> 28 #endif 29 30 #include <libxml/tree.h> 31 #include <libxml/parserInternals.h> 32 #include <libxml/xmlregexp.h> 33 #include <libxml/xmlautomata.h> 34 #include <libxml/xmlunicode.h> 35 36 #ifndef INT_MAX 37 #define INT_MAX 123456789 /* easy to flag and big enough for our needs */ 38 #endif 39 40 /* #define DEBUG_REGEXP_GRAPH */ 41 /* #define DEBUG_REGEXP_EXEC */ 42 /* #define DEBUG_PUSH */ 43 /* #define DEBUG_COMPACTION */ 44 45 #define MAX_PUSH 10000000 46 47 #ifdef ERROR 48 #undef ERROR 49 #endif 50 #define ERROR(str) \ 51 ctxt->error = XML_REGEXP_COMPILE_ERROR; \ 52 xmlRegexpErrCompile(ctxt, str); 53 #define NEXT ctxt->cur++ 54 #define CUR (*(ctxt->cur)) 55 #define NXT(index) (ctxt->cur[index]) 56 57 #define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l) 58 #define NEXTL(l) ctxt->cur += l; 59 #define XML_REG_STRING_SEPARATOR '|' 60 /* 61 * Need PREV to check on a '-' within a Character Group. May only be used 62 * when it's guaranteed that cur is not at the beginning of ctxt->string! 63 */ 64 #define PREV (ctxt->cur[-1]) 65 66 /** 67 * TODO: 68 * 69 * macro to flag unimplemented blocks 70 */ 71 #define TODO \ 72 xmlGenericError(xmlGenericErrorContext, \ 73 "Unimplemented block at %s:%d\n", \ 74 __FILE__, __LINE__); 75 76 /************************************************************************ 77 * * 78 * Datatypes and structures * 79 * * 80 ************************************************************************/ 81 82 /* 83 * Note: the order of the enums below is significant, do not shuffle 84 */ 85 typedef enum { 86 XML_REGEXP_EPSILON = 1, 87 XML_REGEXP_CHARVAL, 88 XML_REGEXP_RANGES, 89 XML_REGEXP_SUBREG, /* used for () sub regexps */ 90 XML_REGEXP_STRING, 91 XML_REGEXP_ANYCHAR, /* . */ 92 XML_REGEXP_ANYSPACE, /* \s */ 93 XML_REGEXP_NOTSPACE, /* \S */ 94 XML_REGEXP_INITNAME, /* \l */ 95 XML_REGEXP_NOTINITNAME, /* \L */ 96 XML_REGEXP_NAMECHAR, /* \c */ 97 XML_REGEXP_NOTNAMECHAR, /* \C */ 98 XML_REGEXP_DECIMAL, /* \d */ 99 XML_REGEXP_NOTDECIMAL, /* \D */ 100 XML_REGEXP_REALCHAR, /* \w */ 101 XML_REGEXP_NOTREALCHAR, /* \W */ 102 XML_REGEXP_LETTER = 100, 103 XML_REGEXP_LETTER_UPPERCASE, 104 XML_REGEXP_LETTER_LOWERCASE, 105 XML_REGEXP_LETTER_TITLECASE, 106 XML_REGEXP_LETTER_MODIFIER, 107 XML_REGEXP_LETTER_OTHERS, 108 XML_REGEXP_MARK, 109 XML_REGEXP_MARK_NONSPACING, 110 XML_REGEXP_MARK_SPACECOMBINING, 111 XML_REGEXP_MARK_ENCLOSING, 112 XML_REGEXP_NUMBER, 113 XML_REGEXP_NUMBER_DECIMAL, 114 XML_REGEXP_NUMBER_LETTER, 115 XML_REGEXP_NUMBER_OTHERS, 116 XML_REGEXP_PUNCT, 117 XML_REGEXP_PUNCT_CONNECTOR, 118 XML_REGEXP_PUNCT_DASH, 119 XML_REGEXP_PUNCT_OPEN, 120 XML_REGEXP_PUNCT_CLOSE, 121 XML_REGEXP_PUNCT_INITQUOTE, 122 XML_REGEXP_PUNCT_FINQUOTE, 123 XML_REGEXP_PUNCT_OTHERS, 124 XML_REGEXP_SEPAR, 125 XML_REGEXP_SEPAR_SPACE, 126 XML_REGEXP_SEPAR_LINE, 127 XML_REGEXP_SEPAR_PARA, 128 XML_REGEXP_SYMBOL, 129 XML_REGEXP_SYMBOL_MATH, 130 XML_REGEXP_SYMBOL_CURRENCY, 131 XML_REGEXP_SYMBOL_MODIFIER, 132 XML_REGEXP_SYMBOL_OTHERS, 133 XML_REGEXP_OTHER, 134 XML_REGEXP_OTHER_CONTROL, 135 XML_REGEXP_OTHER_FORMAT, 136 XML_REGEXP_OTHER_PRIVATE, 137 XML_REGEXP_OTHER_NA, 138 XML_REGEXP_BLOCK_NAME 139 } xmlRegAtomType; 140 141 typedef enum { 142 XML_REGEXP_QUANT_EPSILON = 1, 143 XML_REGEXP_QUANT_ONCE, 144 XML_REGEXP_QUANT_OPT, 145 XML_REGEXP_QUANT_MULT, 146 XML_REGEXP_QUANT_PLUS, 147 XML_REGEXP_QUANT_ONCEONLY, 148 XML_REGEXP_QUANT_ALL, 149 XML_REGEXP_QUANT_RANGE 150 } xmlRegQuantType; 151 152 typedef enum { 153 XML_REGEXP_START_STATE = 1, 154 XML_REGEXP_FINAL_STATE, 155 XML_REGEXP_TRANS_STATE, 156 XML_REGEXP_SINK_STATE, 157 XML_REGEXP_UNREACH_STATE 158 } xmlRegStateType; 159 160 typedef enum { 161 XML_REGEXP_MARK_NORMAL = 0, 162 XML_REGEXP_MARK_START, 163 XML_REGEXP_MARK_VISITED 164 } xmlRegMarkedType; 165 166 typedef struct _xmlRegRange xmlRegRange; 167 typedef xmlRegRange *xmlRegRangePtr; 168 169 struct _xmlRegRange { 170 int neg; /* 0 normal, 1 not, 2 exclude */ 171 xmlRegAtomType type; 172 int start; 173 int end; 174 xmlChar *blockName; 175 }; 176 177 typedef struct _xmlRegAtom xmlRegAtom; 178 typedef xmlRegAtom *xmlRegAtomPtr; 179 180 typedef struct _xmlAutomataState xmlRegState; 181 typedef xmlRegState *xmlRegStatePtr; 182 183 struct _xmlRegAtom { 184 int no; 185 xmlRegAtomType type; 186 xmlRegQuantType quant; 187 int min; 188 int max; 189 190 void *valuep; 191 void *valuep2; 192 int neg; 193 int codepoint; 194 xmlRegStatePtr start; 195 xmlRegStatePtr start0; 196 xmlRegStatePtr stop; 197 int maxRanges; 198 int nbRanges; 199 xmlRegRangePtr *ranges; 200 void *data; 201 }; 202 203 typedef struct _xmlRegCounter xmlRegCounter; 204 typedef xmlRegCounter *xmlRegCounterPtr; 205 206 struct _xmlRegCounter { 207 int min; 208 int max; 209 }; 210 211 typedef struct _xmlRegTrans xmlRegTrans; 212 typedef xmlRegTrans *xmlRegTransPtr; 213 214 struct _xmlRegTrans { 215 xmlRegAtomPtr atom; 216 int to; 217 int counter; 218 int count; 219 int nd; 220 }; 221 222 struct _xmlAutomataState { 223 xmlRegStateType type; 224 xmlRegMarkedType mark; 225 xmlRegMarkedType markd; 226 xmlRegMarkedType reached; 227 int no; 228 int maxTrans; 229 int nbTrans; 230 xmlRegTrans *trans; 231 /* knowing states ponting to us can speed things up */ 232 int maxTransTo; 233 int nbTransTo; 234 int *transTo; 235 }; 236 237 typedef struct _xmlAutomata xmlRegParserCtxt; 238 typedef xmlRegParserCtxt *xmlRegParserCtxtPtr; 239 240 #define AM_AUTOMATA_RNG 1 241 242 struct _xmlAutomata { 243 xmlChar *string; 244 xmlChar *cur; 245 246 int error; 247 int neg; 248 249 xmlRegStatePtr start; 250 xmlRegStatePtr end; 251 xmlRegStatePtr state; 252 253 xmlRegAtomPtr atom; 254 255 int maxAtoms; 256 int nbAtoms; 257 xmlRegAtomPtr *atoms; 258 259 int maxStates; 260 int nbStates; 261 xmlRegStatePtr *states; 262 263 int maxCounters; 264 int nbCounters; 265 xmlRegCounter *counters; 266 267 int determinist; 268 int negs; 269 int flags; 270 }; 271 272 struct _xmlRegexp { 273 xmlChar *string; 274 int nbStates; 275 xmlRegStatePtr *states; 276 int nbAtoms; 277 xmlRegAtomPtr *atoms; 278 int nbCounters; 279 xmlRegCounter *counters; 280 int determinist; 281 int flags; 282 /* 283 * That's the compact form for determinists automatas 284 */ 285 int nbstates; 286 int *compact; 287 void **transdata; 288 int nbstrings; 289 xmlChar **stringMap; 290 }; 291 292 typedef struct _xmlRegExecRollback xmlRegExecRollback; 293 typedef xmlRegExecRollback *xmlRegExecRollbackPtr; 294 295 struct _xmlRegExecRollback { 296 xmlRegStatePtr state;/* the current state */ 297 int index; /* the index in the input stack */ 298 int nextbranch; /* the next transition to explore in that state */ 299 int *counts; /* save the automata state if it has some */ 300 }; 301 302 typedef struct _xmlRegInputToken xmlRegInputToken; 303 typedef xmlRegInputToken *xmlRegInputTokenPtr; 304 305 struct _xmlRegInputToken { 306 xmlChar *value; 307 void *data; 308 }; 309 310 struct _xmlRegExecCtxt { 311 int status; /* execution status != 0 indicate an error */ 312 int determinist; /* did we find an indeterministic behaviour */ 313 xmlRegexpPtr comp; /* the compiled regexp */ 314 xmlRegExecCallbacks callback; 315 void *data; 316 317 xmlRegStatePtr state;/* the current state */ 318 int transno; /* the current transition on that state */ 319 int transcount; /* the number of chars in char counted transitions */ 320 321 /* 322 * A stack of rollback states 323 */ 324 int maxRollbacks; 325 int nbRollbacks; 326 xmlRegExecRollback *rollbacks; 327 328 /* 329 * The state of the automata if any 330 */ 331 int *counts; 332 333 /* 334 * The input stack 335 */ 336 int inputStackMax; 337 int inputStackNr; 338 int index; 339 int *charStack; 340 const xmlChar *inputString; /* when operating on characters */ 341 xmlRegInputTokenPtr inputStack;/* when operating on strings */ 342 343 /* 344 * error handling 345 */ 346 int errStateNo; /* the error state number */ 347 xmlRegStatePtr errState; /* the error state */ 348 xmlChar *errString; /* the string raising the error */ 349 int *errCounts; /* counters at the error state */ 350 int nbPush; 351 }; 352 353 #define REGEXP_ALL_COUNTER 0x123456 354 #define REGEXP_ALL_LAX_COUNTER 0x123457 355 356 static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top); 357 static void xmlRegFreeState(xmlRegStatePtr state); 358 static void xmlRegFreeAtom(xmlRegAtomPtr atom); 359 static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr); 360 static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint); 361 static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, 362 int neg, int start, int end, const xmlChar *blockName); 363 364 void xmlAutomataSetFlags(xmlAutomataPtr am, int flags); 365 366 /************************************************************************ 367 * * 368 * Regexp memory error handler * 369 * * 370 ************************************************************************/ 371 /** 372 * xmlRegexpErrMemory: 373 * @extra: extra information 374 * 375 * Handle an out of memory condition 376 */ 377 static void 378 xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra) 379 { 380 const char *regexp = NULL; 381 if (ctxt != NULL) { 382 regexp = (const char *) ctxt->string; 383 ctxt->error = XML_ERR_NO_MEMORY; 384 } 385 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP, 386 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra, 387 regexp, NULL, 0, 0, 388 "Memory allocation failed : %s\n", extra); 389 } 390 391 /** 392 * xmlRegexpErrCompile: 393 * @extra: extra information 394 * 395 * Handle a compilation failure 396 */ 397 static void 398 xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra) 399 { 400 const char *regexp = NULL; 401 int idx = 0; 402 403 if (ctxt != NULL) { 404 regexp = (const char *) ctxt->string; 405 idx = ctxt->cur - ctxt->string; 406 ctxt->error = XML_REGEXP_COMPILE_ERROR; 407 } 408 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP, 409 XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra, 410 regexp, NULL, idx, 0, 411 "failed to compile: %s\n", extra); 412 } 413 414 /************************************************************************ 415 * * 416 * Allocation/Deallocation * 417 * * 418 ************************************************************************/ 419 420 static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt); 421 /** 422 * xmlRegEpxFromParse: 423 * @ctxt: the parser context used to build it 424 * 425 * Allocate a new regexp and fill it with the result from the parser 426 * 427 * Returns the new regexp or NULL in case of error 428 */ 429 static xmlRegexpPtr 430 xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) { 431 xmlRegexpPtr ret; 432 433 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp)); 434 if (ret == NULL) { 435 xmlRegexpErrMemory(ctxt, "compiling regexp"); 436 return(NULL); 437 } 438 memset(ret, 0, sizeof(xmlRegexp)); 439 ret->string = ctxt->string; 440 ret->nbStates = ctxt->nbStates; 441 ret->states = ctxt->states; 442 ret->nbAtoms = ctxt->nbAtoms; 443 ret->atoms = ctxt->atoms; 444 ret->nbCounters = ctxt->nbCounters; 445 ret->counters = ctxt->counters; 446 ret->determinist = ctxt->determinist; 447 ret->flags = ctxt->flags; 448 if (ret->determinist == -1) { 449 xmlRegexpIsDeterminist(ret); 450 } 451 452 if ((ret->determinist != 0) && 453 (ret->nbCounters == 0) && 454 (ctxt->negs == 0) && 455 (ret->atoms != NULL) && 456 (ret->atoms[0] != NULL) && 457 (ret->atoms[0]->type == XML_REGEXP_STRING)) { 458 int i, j, nbstates = 0, nbatoms = 0; 459 int *stateRemap; 460 int *stringRemap; 461 int *transitions; 462 void **transdata; 463 xmlChar **stringMap; 464 xmlChar *value; 465 466 /* 467 * Switch to a compact representation 468 * 1/ counting the effective number of states left 469 * 2/ counting the unique number of atoms, and check that 470 * they are all of the string type 471 * 3/ build a table state x atom for the transitions 472 */ 473 474 stateRemap = xmlMalloc(ret->nbStates * sizeof(int)); 475 if (stateRemap == NULL) { 476 xmlRegexpErrMemory(ctxt, "compiling regexp"); 477 xmlFree(ret); 478 return(NULL); 479 } 480 for (i = 0;i < ret->nbStates;i++) { 481 if (ret->states[i] != NULL) { 482 stateRemap[i] = nbstates; 483 nbstates++; 484 } else { 485 stateRemap[i] = -1; 486 } 487 } 488 #ifdef DEBUG_COMPACTION 489 printf("Final: %d states\n", nbstates); 490 #endif 491 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *)); 492 if (stringMap == NULL) { 493 xmlRegexpErrMemory(ctxt, "compiling regexp"); 494 xmlFree(stateRemap); 495 xmlFree(ret); 496 return(NULL); 497 } 498 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int)); 499 if (stringRemap == NULL) { 500 xmlRegexpErrMemory(ctxt, "compiling regexp"); 501 xmlFree(stringMap); 502 xmlFree(stateRemap); 503 xmlFree(ret); 504 return(NULL); 505 } 506 for (i = 0;i < ret->nbAtoms;i++) { 507 if ((ret->atoms[i]->type == XML_REGEXP_STRING) && 508 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) { 509 value = ret->atoms[i]->valuep; 510 for (j = 0;j < nbatoms;j++) { 511 if (xmlStrEqual(stringMap[j], value)) { 512 stringRemap[i] = j; 513 break; 514 } 515 } 516 if (j >= nbatoms) { 517 stringRemap[i] = nbatoms; 518 stringMap[nbatoms] = xmlStrdup(value); 519 if (stringMap[nbatoms] == NULL) { 520 for (i = 0;i < nbatoms;i++) 521 xmlFree(stringMap[i]); 522 xmlFree(stringRemap); 523 xmlFree(stringMap); 524 xmlFree(stateRemap); 525 xmlFree(ret); 526 return(NULL); 527 } 528 nbatoms++; 529 } 530 } else { 531 xmlFree(stateRemap); 532 xmlFree(stringRemap); 533 for (i = 0;i < nbatoms;i++) 534 xmlFree(stringMap[i]); 535 xmlFree(stringMap); 536 xmlFree(ret); 537 return(NULL); 538 } 539 } 540 #ifdef DEBUG_COMPACTION 541 printf("Final: %d atoms\n", nbatoms); 542 #endif 543 transitions = (int *) xmlMalloc((nbstates + 1) * 544 (nbatoms + 1) * sizeof(int)); 545 if (transitions == NULL) { 546 xmlFree(stateRemap); 547 xmlFree(stringRemap); 548 xmlFree(stringMap); 549 xmlFree(ret); 550 return(NULL); 551 } 552 memset(transitions, 0, (nbstates + 1) * (nbatoms + 1) * sizeof(int)); 553 554 /* 555 * Allocate the transition table. The first entry for each 556 * state corresponds to the state type. 557 */ 558 transdata = NULL; 559 560 for (i = 0;i < ret->nbStates;i++) { 561 int stateno, atomno, targetno, prev; 562 xmlRegStatePtr state; 563 xmlRegTransPtr trans; 564 565 stateno = stateRemap[i]; 566 if (stateno == -1) 567 continue; 568 state = ret->states[i]; 569 570 transitions[stateno * (nbatoms + 1)] = state->type; 571 572 for (j = 0;j < state->nbTrans;j++) { 573 trans = &(state->trans[j]); 574 if ((trans->to == -1) || (trans->atom == NULL)) 575 continue; 576 atomno = stringRemap[trans->atom->no]; 577 if ((trans->atom->data != NULL) && (transdata == NULL)) { 578 transdata = (void **) xmlMalloc(nbstates * nbatoms * 579 sizeof(void *)); 580 if (transdata != NULL) 581 memset(transdata, 0, 582 nbstates * nbatoms * sizeof(void *)); 583 else { 584 xmlRegexpErrMemory(ctxt, "compiling regexp"); 585 break; 586 } 587 } 588 targetno = stateRemap[trans->to]; 589 /* 590 * if the same atom can generate transitions to 2 different 591 * states then it means the automata is not determinist and 592 * the compact form can't be used ! 593 */ 594 prev = transitions[stateno * (nbatoms + 1) + atomno + 1]; 595 if (prev != 0) { 596 if (prev != targetno + 1) { 597 ret->determinist = 0; 598 #ifdef DEBUG_COMPACTION 599 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n", 600 i, j, trans->atom->no, trans->to, atomno, targetno); 601 printf(" previous to is %d\n", prev); 602 #endif 603 if (transdata != NULL) 604 xmlFree(transdata); 605 xmlFree(transitions); 606 xmlFree(stateRemap); 607 xmlFree(stringRemap); 608 for (i = 0;i < nbatoms;i++) 609 xmlFree(stringMap[i]); 610 xmlFree(stringMap); 611 goto not_determ; 612 } 613 } else { 614 #if 0 615 printf("State %d trans %d: atom %d to %d : %d to %d\n", 616 i, j, trans->atom->no, trans->to, atomno, targetno); 617 #endif 618 transitions[stateno * (nbatoms + 1) + atomno + 1] = 619 targetno + 1; /* to avoid 0 */ 620 if (transdata != NULL) 621 transdata[stateno * nbatoms + atomno] = 622 trans->atom->data; 623 } 624 } 625 } 626 ret->determinist = 1; 627 #ifdef DEBUG_COMPACTION 628 /* 629 * Debug 630 */ 631 for (i = 0;i < nbstates;i++) { 632 for (j = 0;j < nbatoms + 1;j++) { 633 printf("%02d ", transitions[i * (nbatoms + 1) + j]); 634 } 635 printf("\n"); 636 } 637 printf("\n"); 638 #endif 639 /* 640 * Cleanup of the old data 641 */ 642 if (ret->states != NULL) { 643 for (i = 0;i < ret->nbStates;i++) 644 xmlRegFreeState(ret->states[i]); 645 xmlFree(ret->states); 646 } 647 ret->states = NULL; 648 ret->nbStates = 0; 649 if (ret->atoms != NULL) { 650 for (i = 0;i < ret->nbAtoms;i++) 651 xmlRegFreeAtom(ret->atoms[i]); 652 xmlFree(ret->atoms); 653 } 654 ret->atoms = NULL; 655 ret->nbAtoms = 0; 656 657 ret->compact = transitions; 658 ret->transdata = transdata; 659 ret->stringMap = stringMap; 660 ret->nbstrings = nbatoms; 661 ret->nbstates = nbstates; 662 xmlFree(stateRemap); 663 xmlFree(stringRemap); 664 } 665 not_determ: 666 ctxt->string = NULL; 667 ctxt->nbStates = 0; 668 ctxt->states = NULL; 669 ctxt->nbAtoms = 0; 670 ctxt->atoms = NULL; 671 ctxt->nbCounters = 0; 672 ctxt->counters = NULL; 673 return(ret); 674 } 675 676 /** 677 * xmlRegNewParserCtxt: 678 * @string: the string to parse 679 * 680 * Allocate a new regexp parser context 681 * 682 * Returns the new context or NULL in case of error 683 */ 684 static xmlRegParserCtxtPtr 685 xmlRegNewParserCtxt(const xmlChar *string) { 686 xmlRegParserCtxtPtr ret; 687 688 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt)); 689 if (ret == NULL) 690 return(NULL); 691 memset(ret, 0, sizeof(xmlRegParserCtxt)); 692 if (string != NULL) 693 ret->string = xmlStrdup(string); 694 ret->cur = ret->string; 695 ret->neg = 0; 696 ret->negs = 0; 697 ret->error = 0; 698 ret->determinist = -1; 699 return(ret); 700 } 701 702 /** 703 * xmlRegNewRange: 704 * @ctxt: the regexp parser context 705 * @neg: is that negative 706 * @type: the type of range 707 * @start: the start codepoint 708 * @end: the end codepoint 709 * 710 * Allocate a new regexp range 711 * 712 * Returns the new range or NULL in case of error 713 */ 714 static xmlRegRangePtr 715 xmlRegNewRange(xmlRegParserCtxtPtr ctxt, 716 int neg, xmlRegAtomType type, int start, int end) { 717 xmlRegRangePtr ret; 718 719 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange)); 720 if (ret == NULL) { 721 xmlRegexpErrMemory(ctxt, "allocating range"); 722 return(NULL); 723 } 724 ret->neg = neg; 725 ret->type = type; 726 ret->start = start; 727 ret->end = end; 728 return(ret); 729 } 730 731 /** 732 * xmlRegFreeRange: 733 * @range: the regexp range 734 * 735 * Free a regexp range 736 */ 737 static void 738 xmlRegFreeRange(xmlRegRangePtr range) { 739 if (range == NULL) 740 return; 741 742 if (range->blockName != NULL) 743 xmlFree(range->blockName); 744 xmlFree(range); 745 } 746 747 /** 748 * xmlRegCopyRange: 749 * @range: the regexp range 750 * 751 * Copy a regexp range 752 * 753 * Returns the new copy or NULL in case of error. 754 */ 755 static xmlRegRangePtr 756 xmlRegCopyRange(xmlRegParserCtxtPtr ctxt, xmlRegRangePtr range) { 757 xmlRegRangePtr ret; 758 759 if (range == NULL) 760 return(NULL); 761 762 ret = xmlRegNewRange(ctxt, range->neg, range->type, range->start, 763 range->end); 764 if (ret == NULL) 765 return(NULL); 766 if (range->blockName != NULL) { 767 ret->blockName = xmlStrdup(range->blockName); 768 if (ret->blockName == NULL) { 769 xmlRegexpErrMemory(ctxt, "allocating range"); 770 xmlRegFreeRange(ret); 771 return(NULL); 772 } 773 } 774 return(ret); 775 } 776 777 /** 778 * xmlRegNewAtom: 779 * @ctxt: the regexp parser context 780 * @type: the type of atom 781 * 782 * Allocate a new atom 783 * 784 * Returns the new atom or NULL in case of error 785 */ 786 static xmlRegAtomPtr 787 xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) { 788 xmlRegAtomPtr ret; 789 790 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom)); 791 if (ret == NULL) { 792 xmlRegexpErrMemory(ctxt, "allocating atom"); 793 return(NULL); 794 } 795 memset(ret, 0, sizeof(xmlRegAtom)); 796 ret->type = type; 797 ret->quant = XML_REGEXP_QUANT_ONCE; 798 ret->min = 0; 799 ret->max = 0; 800 return(ret); 801 } 802 803 /** 804 * xmlRegFreeAtom: 805 * @atom: the regexp atom 806 * 807 * Free a regexp atom 808 */ 809 static void 810 xmlRegFreeAtom(xmlRegAtomPtr atom) { 811 int i; 812 813 if (atom == NULL) 814 return; 815 816 for (i = 0;i < atom->nbRanges;i++) 817 xmlRegFreeRange(atom->ranges[i]); 818 if (atom->ranges != NULL) 819 xmlFree(atom->ranges); 820 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL)) 821 xmlFree(atom->valuep); 822 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL)) 823 xmlFree(atom->valuep2); 824 if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL)) 825 xmlFree(atom->valuep); 826 xmlFree(atom); 827 } 828 829 /** 830 * xmlRegCopyAtom: 831 * @ctxt: the regexp parser context 832 * @atom: the oiginal atom 833 * 834 * Allocate a new regexp range 835 * 836 * Returns the new atom or NULL in case of error 837 */ 838 static xmlRegAtomPtr 839 xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) { 840 xmlRegAtomPtr ret; 841 842 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom)); 843 if (ret == NULL) { 844 xmlRegexpErrMemory(ctxt, "copying atom"); 845 return(NULL); 846 } 847 memset(ret, 0, sizeof(xmlRegAtom)); 848 ret->type = atom->type; 849 ret->quant = atom->quant; 850 ret->min = atom->min; 851 ret->max = atom->max; 852 if (atom->nbRanges > 0) { 853 int i; 854 855 ret->ranges = (xmlRegRangePtr *) xmlMalloc(sizeof(xmlRegRangePtr) * 856 atom->nbRanges); 857 if (ret->ranges == NULL) { 858 xmlRegexpErrMemory(ctxt, "copying atom"); 859 goto error; 860 } 861 for (i = 0;i < atom->nbRanges;i++) { 862 ret->ranges[i] = xmlRegCopyRange(ctxt, atom->ranges[i]); 863 if (ret->ranges[i] == NULL) 864 goto error; 865 ret->nbRanges = i + 1; 866 } 867 } 868 return(ret); 869 870 error: 871 xmlRegFreeAtom(ret); 872 return(NULL); 873 } 874 875 static xmlRegStatePtr 876 xmlRegNewState(xmlRegParserCtxtPtr ctxt) { 877 xmlRegStatePtr ret; 878 879 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState)); 880 if (ret == NULL) { 881 xmlRegexpErrMemory(ctxt, "allocating state"); 882 return(NULL); 883 } 884 memset(ret, 0, sizeof(xmlRegState)); 885 ret->type = XML_REGEXP_TRANS_STATE; 886 ret->mark = XML_REGEXP_MARK_NORMAL; 887 return(ret); 888 } 889 890 /** 891 * xmlRegFreeState: 892 * @state: the regexp state 893 * 894 * Free a regexp state 895 */ 896 static void 897 xmlRegFreeState(xmlRegStatePtr state) { 898 if (state == NULL) 899 return; 900 901 if (state->trans != NULL) 902 xmlFree(state->trans); 903 if (state->transTo != NULL) 904 xmlFree(state->transTo); 905 xmlFree(state); 906 } 907 908 /** 909 * xmlRegFreeParserCtxt: 910 * @ctxt: the regexp parser context 911 * 912 * Free a regexp parser context 913 */ 914 static void 915 xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) { 916 int i; 917 if (ctxt == NULL) 918 return; 919 920 if (ctxt->string != NULL) 921 xmlFree(ctxt->string); 922 if (ctxt->states != NULL) { 923 for (i = 0;i < ctxt->nbStates;i++) 924 xmlRegFreeState(ctxt->states[i]); 925 xmlFree(ctxt->states); 926 } 927 if (ctxt->atoms != NULL) { 928 for (i = 0;i < ctxt->nbAtoms;i++) 929 xmlRegFreeAtom(ctxt->atoms[i]); 930 xmlFree(ctxt->atoms); 931 } 932 if (ctxt->counters != NULL) 933 xmlFree(ctxt->counters); 934 xmlFree(ctxt); 935 } 936 937 /************************************************************************ 938 * * 939 * Display of Data structures * 940 * * 941 ************************************************************************/ 942 943 static void 944 xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) { 945 switch (type) { 946 case XML_REGEXP_EPSILON: 947 fprintf(output, "epsilon "); break; 948 case XML_REGEXP_CHARVAL: 949 fprintf(output, "charval "); break; 950 case XML_REGEXP_RANGES: 951 fprintf(output, "ranges "); break; 952 case XML_REGEXP_SUBREG: 953 fprintf(output, "subexpr "); break; 954 case XML_REGEXP_STRING: 955 fprintf(output, "string "); break; 956 case XML_REGEXP_ANYCHAR: 957 fprintf(output, "anychar "); break; 958 case XML_REGEXP_ANYSPACE: 959 fprintf(output, "anyspace "); break; 960 case XML_REGEXP_NOTSPACE: 961 fprintf(output, "notspace "); break; 962 case XML_REGEXP_INITNAME: 963 fprintf(output, "initname "); break; 964 case XML_REGEXP_NOTINITNAME: 965 fprintf(output, "notinitname "); break; 966 case XML_REGEXP_NAMECHAR: 967 fprintf(output, "namechar "); break; 968 case XML_REGEXP_NOTNAMECHAR: 969 fprintf(output, "notnamechar "); break; 970 case XML_REGEXP_DECIMAL: 971 fprintf(output, "decimal "); break; 972 case XML_REGEXP_NOTDECIMAL: 973 fprintf(output, "notdecimal "); break; 974 case XML_REGEXP_REALCHAR: 975 fprintf(output, "realchar "); break; 976 case XML_REGEXP_NOTREALCHAR: 977 fprintf(output, "notrealchar "); break; 978 case XML_REGEXP_LETTER: 979 fprintf(output, "LETTER "); break; 980 case XML_REGEXP_LETTER_UPPERCASE: 981 fprintf(output, "LETTER_UPPERCASE "); break; 982 case XML_REGEXP_LETTER_LOWERCASE: 983 fprintf(output, "LETTER_LOWERCASE "); break; 984 case XML_REGEXP_LETTER_TITLECASE: 985 fprintf(output, "LETTER_TITLECASE "); break; 986 case XML_REGEXP_LETTER_MODIFIER: 987 fprintf(output, "LETTER_MODIFIER "); break; 988 case XML_REGEXP_LETTER_OTHERS: 989 fprintf(output, "LETTER_OTHERS "); break; 990 case XML_REGEXP_MARK: 991 fprintf(output, "MARK "); break; 992 case XML_REGEXP_MARK_NONSPACING: 993 fprintf(output, "MARK_NONSPACING "); break; 994 case XML_REGEXP_MARK_SPACECOMBINING: 995 fprintf(output, "MARK_SPACECOMBINING "); break; 996 case XML_REGEXP_MARK_ENCLOSING: 997 fprintf(output, "MARK_ENCLOSING "); break; 998 case XML_REGEXP_NUMBER: 999 fprintf(output, "NUMBER "); break; 1000 case XML_REGEXP_NUMBER_DECIMAL: 1001 fprintf(output, "NUMBER_DECIMAL "); break; 1002 case XML_REGEXP_NUMBER_LETTER: 1003 fprintf(output, "NUMBER_LETTER "); break; 1004 case XML_REGEXP_NUMBER_OTHERS: 1005 fprintf(output, "NUMBER_OTHERS "); break; 1006 case XML_REGEXP_PUNCT: 1007 fprintf(output, "PUNCT "); break; 1008 case XML_REGEXP_PUNCT_CONNECTOR: 1009 fprintf(output, "PUNCT_CONNECTOR "); break; 1010 case XML_REGEXP_PUNCT_DASH: 1011 fprintf(output, "PUNCT_DASH "); break; 1012 case XML_REGEXP_PUNCT_OPEN: 1013 fprintf(output, "PUNCT_OPEN "); break; 1014 case XML_REGEXP_PUNCT_CLOSE: 1015 fprintf(output, "PUNCT_CLOSE "); break; 1016 case XML_REGEXP_PUNCT_INITQUOTE: 1017 fprintf(output, "PUNCT_INITQUOTE "); break; 1018 case XML_REGEXP_PUNCT_FINQUOTE: 1019 fprintf(output, "PUNCT_FINQUOTE "); break; 1020 case XML_REGEXP_PUNCT_OTHERS: 1021 fprintf(output, "PUNCT_OTHERS "); break; 1022 case XML_REGEXP_SEPAR: 1023 fprintf(output, "SEPAR "); break; 1024 case XML_REGEXP_SEPAR_SPACE: 1025 fprintf(output, "SEPAR_SPACE "); break; 1026 case XML_REGEXP_SEPAR_LINE: 1027 fprintf(output, "SEPAR_LINE "); break; 1028 case XML_REGEXP_SEPAR_PARA: 1029 fprintf(output, "SEPAR_PARA "); break; 1030 case XML_REGEXP_SYMBOL: 1031 fprintf(output, "SYMBOL "); break; 1032 case XML_REGEXP_SYMBOL_MATH: 1033 fprintf(output, "SYMBOL_MATH "); break; 1034 case XML_REGEXP_SYMBOL_CURRENCY: 1035 fprintf(output, "SYMBOL_CURRENCY "); break; 1036 case XML_REGEXP_SYMBOL_MODIFIER: 1037 fprintf(output, "SYMBOL_MODIFIER "); break; 1038 case XML_REGEXP_SYMBOL_OTHERS: 1039 fprintf(output, "SYMBOL_OTHERS "); break; 1040 case XML_REGEXP_OTHER: 1041 fprintf(output, "OTHER "); break; 1042 case XML_REGEXP_OTHER_CONTROL: 1043 fprintf(output, "OTHER_CONTROL "); break; 1044 case XML_REGEXP_OTHER_FORMAT: 1045 fprintf(output, "OTHER_FORMAT "); break; 1046 case XML_REGEXP_OTHER_PRIVATE: 1047 fprintf(output, "OTHER_PRIVATE "); break; 1048 case XML_REGEXP_OTHER_NA: 1049 fprintf(output, "OTHER_NA "); break; 1050 case XML_REGEXP_BLOCK_NAME: 1051 fprintf(output, "BLOCK "); break; 1052 } 1053 } 1054 1055 static void 1056 xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) { 1057 switch (type) { 1058 case XML_REGEXP_QUANT_EPSILON: 1059 fprintf(output, "epsilon "); break; 1060 case XML_REGEXP_QUANT_ONCE: 1061 fprintf(output, "once "); break; 1062 case XML_REGEXP_QUANT_OPT: 1063 fprintf(output, "? "); break; 1064 case XML_REGEXP_QUANT_MULT: 1065 fprintf(output, "* "); break; 1066 case XML_REGEXP_QUANT_PLUS: 1067 fprintf(output, "+ "); break; 1068 case XML_REGEXP_QUANT_RANGE: 1069 fprintf(output, "range "); break; 1070 case XML_REGEXP_QUANT_ONCEONLY: 1071 fprintf(output, "onceonly "); break; 1072 case XML_REGEXP_QUANT_ALL: 1073 fprintf(output, "all "); break; 1074 } 1075 } 1076 static void 1077 xmlRegPrintRange(FILE *output, xmlRegRangePtr range) { 1078 fprintf(output, " range: "); 1079 if (range->neg) 1080 fprintf(output, "negative "); 1081 xmlRegPrintAtomType(output, range->type); 1082 fprintf(output, "%c - %c\n", range->start, range->end); 1083 } 1084 1085 static void 1086 xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) { 1087 fprintf(output, " atom: "); 1088 if (atom == NULL) { 1089 fprintf(output, "NULL\n"); 1090 return; 1091 } 1092 if (atom->neg) 1093 fprintf(output, "not "); 1094 xmlRegPrintAtomType(output, atom->type); 1095 xmlRegPrintQuantType(output, atom->quant); 1096 if (atom->quant == XML_REGEXP_QUANT_RANGE) 1097 fprintf(output, "%d-%d ", atom->min, atom->max); 1098 if (atom->type == XML_REGEXP_STRING) 1099 fprintf(output, "'%s' ", (char *) atom->valuep); 1100 if (atom->type == XML_REGEXP_CHARVAL) 1101 fprintf(output, "char %c\n", atom->codepoint); 1102 else if (atom->type == XML_REGEXP_RANGES) { 1103 int i; 1104 fprintf(output, "%d entries\n", atom->nbRanges); 1105 for (i = 0; i < atom->nbRanges;i++) 1106 xmlRegPrintRange(output, atom->ranges[i]); 1107 } else if (atom->type == XML_REGEXP_SUBREG) { 1108 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no); 1109 } else { 1110 fprintf(output, "\n"); 1111 } 1112 } 1113 1114 static void 1115 xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) { 1116 fprintf(output, " trans: "); 1117 if (trans == NULL) { 1118 fprintf(output, "NULL\n"); 1119 return; 1120 } 1121 if (trans->to < 0) { 1122 fprintf(output, "removed\n"); 1123 return; 1124 } 1125 if (trans->nd != 0) { 1126 if (trans->nd == 2) 1127 fprintf(output, "last not determinist, "); 1128 else 1129 fprintf(output, "not determinist, "); 1130 } 1131 if (trans->counter >= 0) { 1132 fprintf(output, "counted %d, ", trans->counter); 1133 } 1134 if (trans->count == REGEXP_ALL_COUNTER) { 1135 fprintf(output, "all transition, "); 1136 } else if (trans->count >= 0) { 1137 fprintf(output, "count based %d, ", trans->count); 1138 } 1139 if (trans->atom == NULL) { 1140 fprintf(output, "epsilon to %d\n", trans->to); 1141 return; 1142 } 1143 if (trans->atom->type == XML_REGEXP_CHARVAL) 1144 fprintf(output, "char %c ", trans->atom->codepoint); 1145 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to); 1146 } 1147 1148 static void 1149 xmlRegPrintState(FILE *output, xmlRegStatePtr state) { 1150 int i; 1151 1152 fprintf(output, " state: "); 1153 if (state == NULL) { 1154 fprintf(output, "NULL\n"); 1155 return; 1156 } 1157 if (state->type == XML_REGEXP_START_STATE) 1158 fprintf(output, "START "); 1159 if (state->type == XML_REGEXP_FINAL_STATE) 1160 fprintf(output, "FINAL "); 1161 1162 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans); 1163 for (i = 0;i < state->nbTrans; i++) { 1164 xmlRegPrintTrans(output, &(state->trans[i])); 1165 } 1166 } 1167 1168 #ifdef DEBUG_REGEXP_GRAPH 1169 static void 1170 xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) { 1171 int i; 1172 1173 fprintf(output, " ctxt: "); 1174 if (ctxt == NULL) { 1175 fprintf(output, "NULL\n"); 1176 return; 1177 } 1178 fprintf(output, "'%s' ", ctxt->string); 1179 if (ctxt->error) 1180 fprintf(output, "error "); 1181 if (ctxt->neg) 1182 fprintf(output, "neg "); 1183 fprintf(output, "\n"); 1184 fprintf(output, "%d atoms:\n", ctxt->nbAtoms); 1185 for (i = 0;i < ctxt->nbAtoms; i++) { 1186 fprintf(output, " %02d ", i); 1187 xmlRegPrintAtom(output, ctxt->atoms[i]); 1188 } 1189 if (ctxt->atom != NULL) { 1190 fprintf(output, "current atom:\n"); 1191 xmlRegPrintAtom(output, ctxt->atom); 1192 } 1193 fprintf(output, "%d states:", ctxt->nbStates); 1194 if (ctxt->start != NULL) 1195 fprintf(output, " start: %d", ctxt->start->no); 1196 if (ctxt->end != NULL) 1197 fprintf(output, " end: %d", ctxt->end->no); 1198 fprintf(output, "\n"); 1199 for (i = 0;i < ctxt->nbStates; i++) { 1200 xmlRegPrintState(output, ctxt->states[i]); 1201 } 1202 fprintf(output, "%d counters:\n", ctxt->nbCounters); 1203 for (i = 0;i < ctxt->nbCounters; i++) { 1204 fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min, 1205 ctxt->counters[i].max); 1206 } 1207 } 1208 #endif 1209 1210 /************************************************************************ 1211 * * 1212 * Finite Automata structures manipulations * 1213 * * 1214 ************************************************************************/ 1215 1216 static void 1217 xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom, 1218 int neg, xmlRegAtomType type, int start, int end, 1219 xmlChar *blockName) { 1220 xmlRegRangePtr range; 1221 1222 if (atom == NULL) { 1223 ERROR("add range: atom is NULL"); 1224 return; 1225 } 1226 if (atom->type != XML_REGEXP_RANGES) { 1227 ERROR("add range: atom is not ranges"); 1228 return; 1229 } 1230 if (atom->maxRanges == 0) { 1231 atom->maxRanges = 4; 1232 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges * 1233 sizeof(xmlRegRangePtr)); 1234 if (atom->ranges == NULL) { 1235 xmlRegexpErrMemory(ctxt, "adding ranges"); 1236 atom->maxRanges = 0; 1237 return; 1238 } 1239 } else if (atom->nbRanges >= atom->maxRanges) { 1240 xmlRegRangePtr *tmp; 1241 atom->maxRanges *= 2; 1242 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges * 1243 sizeof(xmlRegRangePtr)); 1244 if (tmp == NULL) { 1245 xmlRegexpErrMemory(ctxt, "adding ranges"); 1246 atom->maxRanges /= 2; 1247 return; 1248 } 1249 atom->ranges = tmp; 1250 } 1251 range = xmlRegNewRange(ctxt, neg, type, start, end); 1252 if (range == NULL) 1253 return; 1254 range->blockName = blockName; 1255 atom->ranges[atom->nbRanges++] = range; 1256 1257 } 1258 1259 static int 1260 xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) { 1261 if (ctxt->maxCounters == 0) { 1262 ctxt->maxCounters = 4; 1263 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters * 1264 sizeof(xmlRegCounter)); 1265 if (ctxt->counters == NULL) { 1266 xmlRegexpErrMemory(ctxt, "allocating counter"); 1267 ctxt->maxCounters = 0; 1268 return(-1); 1269 } 1270 } else if (ctxt->nbCounters >= ctxt->maxCounters) { 1271 xmlRegCounter *tmp; 1272 ctxt->maxCounters *= 2; 1273 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters * 1274 sizeof(xmlRegCounter)); 1275 if (tmp == NULL) { 1276 xmlRegexpErrMemory(ctxt, "allocating counter"); 1277 ctxt->maxCounters /= 2; 1278 return(-1); 1279 } 1280 ctxt->counters = tmp; 1281 } 1282 ctxt->counters[ctxt->nbCounters].min = -1; 1283 ctxt->counters[ctxt->nbCounters].max = -1; 1284 return(ctxt->nbCounters++); 1285 } 1286 1287 static int 1288 xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) { 1289 if (atom == NULL) { 1290 ERROR("atom push: atom is NULL"); 1291 return(-1); 1292 } 1293 if (ctxt->maxAtoms == 0) { 1294 ctxt->maxAtoms = 4; 1295 ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms * 1296 sizeof(xmlRegAtomPtr)); 1297 if (ctxt->atoms == NULL) { 1298 xmlRegexpErrMemory(ctxt, "pushing atom"); 1299 ctxt->maxAtoms = 0; 1300 return(-1); 1301 } 1302 } else if (ctxt->nbAtoms >= ctxt->maxAtoms) { 1303 xmlRegAtomPtr *tmp; 1304 ctxt->maxAtoms *= 2; 1305 tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms * 1306 sizeof(xmlRegAtomPtr)); 1307 if (tmp == NULL) { 1308 xmlRegexpErrMemory(ctxt, "allocating counter"); 1309 ctxt->maxAtoms /= 2; 1310 return(-1); 1311 } 1312 ctxt->atoms = tmp; 1313 } 1314 atom->no = ctxt->nbAtoms; 1315 ctxt->atoms[ctxt->nbAtoms++] = atom; 1316 return(0); 1317 } 1318 1319 static void 1320 xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target, 1321 int from) { 1322 if (target->maxTransTo == 0) { 1323 target->maxTransTo = 8; 1324 target->transTo = (int *) xmlMalloc(target->maxTransTo * 1325 sizeof(int)); 1326 if (target->transTo == NULL) { 1327 xmlRegexpErrMemory(ctxt, "adding transition"); 1328 target->maxTransTo = 0; 1329 return; 1330 } 1331 } else if (target->nbTransTo >= target->maxTransTo) { 1332 int *tmp; 1333 target->maxTransTo *= 2; 1334 tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo * 1335 sizeof(int)); 1336 if (tmp == NULL) { 1337 xmlRegexpErrMemory(ctxt, "adding transition"); 1338 target->maxTransTo /= 2; 1339 return; 1340 } 1341 target->transTo = tmp; 1342 } 1343 target->transTo[target->nbTransTo] = from; 1344 target->nbTransTo++; 1345 } 1346 1347 static void 1348 xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 1349 xmlRegAtomPtr atom, xmlRegStatePtr target, 1350 int counter, int count) { 1351 1352 int nrtrans; 1353 1354 if (state == NULL) { 1355 ERROR("add state: state is NULL"); 1356 return; 1357 } 1358 if (target == NULL) { 1359 ERROR("add state: target is NULL"); 1360 return; 1361 } 1362 /* 1363 * Other routines follow the philosophy 'When in doubt, add a transition' 1364 * so we check here whether such a transition is already present and, if 1365 * so, silently ignore this request. 1366 */ 1367 1368 for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) { 1369 xmlRegTransPtr trans = &(state->trans[nrtrans]); 1370 if ((trans->atom == atom) && 1371 (trans->to == target->no) && 1372 (trans->counter == counter) && 1373 (trans->count == count)) { 1374 #ifdef DEBUG_REGEXP_GRAPH 1375 printf("Ignoring duplicate transition from %d to %d\n", 1376 state->no, target->no); 1377 #endif 1378 return; 1379 } 1380 } 1381 1382 if (state->maxTrans == 0) { 1383 state->maxTrans = 8; 1384 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans * 1385 sizeof(xmlRegTrans)); 1386 if (state->trans == NULL) { 1387 xmlRegexpErrMemory(ctxt, "adding transition"); 1388 state->maxTrans = 0; 1389 return; 1390 } 1391 } else if (state->nbTrans >= state->maxTrans) { 1392 xmlRegTrans *tmp; 1393 state->maxTrans *= 2; 1394 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans * 1395 sizeof(xmlRegTrans)); 1396 if (tmp == NULL) { 1397 xmlRegexpErrMemory(ctxt, "adding transition"); 1398 state->maxTrans /= 2; 1399 return; 1400 } 1401 state->trans = tmp; 1402 } 1403 #ifdef DEBUG_REGEXP_GRAPH 1404 printf("Add trans from %d to %d ", state->no, target->no); 1405 if (count == REGEXP_ALL_COUNTER) 1406 printf("all transition\n"); 1407 else if (count >= 0) 1408 printf("count based %d\n", count); 1409 else if (counter >= 0) 1410 printf("counted %d\n", counter); 1411 else if (atom == NULL) 1412 printf("epsilon transition\n"); 1413 else if (atom != NULL) 1414 xmlRegPrintAtom(stdout, atom); 1415 #endif 1416 1417 state->trans[state->nbTrans].atom = atom; 1418 state->trans[state->nbTrans].to = target->no; 1419 state->trans[state->nbTrans].counter = counter; 1420 state->trans[state->nbTrans].count = count; 1421 state->trans[state->nbTrans].nd = 0; 1422 state->nbTrans++; 1423 xmlRegStateAddTransTo(ctxt, target, state->no); 1424 } 1425 1426 static int 1427 xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) { 1428 if (state == NULL) return(-1); 1429 if (ctxt->maxStates == 0) { 1430 ctxt->maxStates = 4; 1431 ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates * 1432 sizeof(xmlRegStatePtr)); 1433 if (ctxt->states == NULL) { 1434 xmlRegexpErrMemory(ctxt, "adding state"); 1435 ctxt->maxStates = 0; 1436 return(-1); 1437 } 1438 } else if (ctxt->nbStates >= ctxt->maxStates) { 1439 xmlRegStatePtr *tmp; 1440 ctxt->maxStates *= 2; 1441 tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates * 1442 sizeof(xmlRegStatePtr)); 1443 if (tmp == NULL) { 1444 xmlRegexpErrMemory(ctxt, "adding state"); 1445 ctxt->maxStates /= 2; 1446 return(-1); 1447 } 1448 ctxt->states = tmp; 1449 } 1450 state->no = ctxt->nbStates; 1451 ctxt->states[ctxt->nbStates++] = state; 1452 return(0); 1453 } 1454 1455 /** 1456 * xmlFAGenerateAllTransition: 1457 * @ctxt: a regexp parser context 1458 * @from: the from state 1459 * @to: the target state or NULL for building a new one 1460 * @lax: 1461 * 1462 */ 1463 static void 1464 xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt, 1465 xmlRegStatePtr from, xmlRegStatePtr to, 1466 int lax) { 1467 if (to == NULL) { 1468 to = xmlRegNewState(ctxt); 1469 xmlRegStatePush(ctxt, to); 1470 ctxt->state = to; 1471 } 1472 if (lax) 1473 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER); 1474 else 1475 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER); 1476 } 1477 1478 /** 1479 * xmlFAGenerateEpsilonTransition: 1480 * @ctxt: a regexp parser context 1481 * @from: the from state 1482 * @to: the target state or NULL for building a new one 1483 * 1484 */ 1485 static void 1486 xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt, 1487 xmlRegStatePtr from, xmlRegStatePtr to) { 1488 if (to == NULL) { 1489 to = xmlRegNewState(ctxt); 1490 xmlRegStatePush(ctxt, to); 1491 ctxt->state = to; 1492 } 1493 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1); 1494 } 1495 1496 /** 1497 * xmlFAGenerateCountedEpsilonTransition: 1498 * @ctxt: a regexp parser context 1499 * @from: the from state 1500 * @to: the target state or NULL for building a new one 1501 * counter: the counter for that transition 1502 * 1503 */ 1504 static void 1505 xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt, 1506 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1507 if (to == NULL) { 1508 to = xmlRegNewState(ctxt); 1509 xmlRegStatePush(ctxt, to); 1510 ctxt->state = to; 1511 } 1512 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1); 1513 } 1514 1515 /** 1516 * xmlFAGenerateCountedTransition: 1517 * @ctxt: a regexp parser context 1518 * @from: the from state 1519 * @to: the target state or NULL for building a new one 1520 * counter: the counter for that transition 1521 * 1522 */ 1523 static void 1524 xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt, 1525 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1526 if (to == NULL) { 1527 to = xmlRegNewState(ctxt); 1528 xmlRegStatePush(ctxt, to); 1529 ctxt->state = to; 1530 } 1531 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter); 1532 } 1533 1534 /** 1535 * xmlFAGenerateTransitions: 1536 * @ctxt: a regexp parser context 1537 * @from: the from state 1538 * @to: the target state or NULL for building a new one 1539 * @atom: the atom generating the transition 1540 * 1541 * Returns 0 if success and -1 in case of error. 1542 */ 1543 static int 1544 xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, 1545 xmlRegStatePtr to, xmlRegAtomPtr atom) { 1546 xmlRegStatePtr end; 1547 int nullable = 0; 1548 1549 if (atom == NULL) { 1550 ERROR("genrate transition: atom == NULL"); 1551 return(-1); 1552 } 1553 if (atom->type == XML_REGEXP_SUBREG) { 1554 /* 1555 * this is a subexpression handling one should not need to 1556 * create a new node except for XML_REGEXP_QUANT_RANGE. 1557 */ 1558 if (xmlRegAtomPush(ctxt, atom) < 0) { 1559 return(-1); 1560 } 1561 if ((to != NULL) && (atom->stop != to) && 1562 (atom->quant != XML_REGEXP_QUANT_RANGE)) { 1563 /* 1564 * Generate an epsilon transition to link to the target 1565 */ 1566 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); 1567 #ifdef DV 1568 } else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) && 1569 (atom->quant != XML_REGEXP_QUANT_ONCE)) { 1570 to = xmlRegNewState(ctxt); 1571 xmlRegStatePush(ctxt, to); 1572 ctxt->state = to; 1573 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); 1574 #endif 1575 } 1576 switch (atom->quant) { 1577 case XML_REGEXP_QUANT_OPT: 1578 atom->quant = XML_REGEXP_QUANT_ONCE; 1579 /* 1580 * transition done to the state after end of atom. 1581 * 1. set transition from atom start to new state 1582 * 2. set transition from atom end to this state. 1583 */ 1584 if (to == NULL) { 1585 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0); 1586 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, 1587 ctxt->state); 1588 } else { 1589 xmlFAGenerateEpsilonTransition(ctxt, atom->start, to); 1590 } 1591 break; 1592 case XML_REGEXP_QUANT_MULT: 1593 atom->quant = XML_REGEXP_QUANT_ONCE; 1594 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop); 1595 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1596 break; 1597 case XML_REGEXP_QUANT_PLUS: 1598 atom->quant = XML_REGEXP_QUANT_ONCE; 1599 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1600 break; 1601 case XML_REGEXP_QUANT_RANGE: { 1602 int counter; 1603 xmlRegStatePtr inter, newstate; 1604 1605 /* 1606 * create the final state now if needed 1607 */ 1608 if (to != NULL) { 1609 newstate = to; 1610 } else { 1611 newstate = xmlRegNewState(ctxt); 1612 xmlRegStatePush(ctxt, newstate); 1613 } 1614 1615 /* 1616 * The principle here is to use counted transition 1617 * to avoid explosion in the number of states in the 1618 * graph. This is clearly more complex but should not 1619 * be exploitable at runtime. 1620 */ 1621 if ((atom->min == 0) && (atom->start0 == NULL)) { 1622 xmlRegAtomPtr copy; 1623 /* 1624 * duplicate a transition based on atom to count next 1625 * occurences after 1. We cannot loop to atom->start 1626 * directly because we need an epsilon transition to 1627 * newstate. 1628 */ 1629 /* ???? For some reason it seems we never reach that 1630 case, I suppose this got optimized out before when 1631 building the automata */ 1632 copy = xmlRegCopyAtom(ctxt, atom); 1633 if (copy == NULL) 1634 return(-1); 1635 copy->quant = XML_REGEXP_QUANT_ONCE; 1636 copy->min = 0; 1637 copy->max = 0; 1638 1639 if (xmlFAGenerateTransitions(ctxt, atom->start, NULL, copy) 1640 < 0) 1641 return(-1); 1642 inter = ctxt->state; 1643 counter = xmlRegGetCounter(ctxt); 1644 ctxt->counters[counter].min = atom->min - 1; 1645 ctxt->counters[counter].max = atom->max - 1; 1646 /* count the number of times we see it again */ 1647 xmlFAGenerateCountedEpsilonTransition(ctxt, inter, 1648 atom->stop, counter); 1649 /* allow a way out based on the count */ 1650 xmlFAGenerateCountedTransition(ctxt, inter, 1651 newstate, counter); 1652 /* and also allow a direct exit for 0 */ 1653 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 1654 newstate); 1655 } else { 1656 /* 1657 * either we need the atom at least once or there 1658 * is an atom->start0 allowing to easilly plug the 1659 * epsilon transition. 1660 */ 1661 counter = xmlRegGetCounter(ctxt); 1662 ctxt->counters[counter].min = atom->min - 1; 1663 ctxt->counters[counter].max = atom->max - 1; 1664 /* count the number of times we see it again */ 1665 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop, 1666 atom->start, counter); 1667 /* allow a way out based on the count */ 1668 xmlFAGenerateCountedTransition(ctxt, atom->stop, 1669 newstate, counter); 1670 /* and if needed allow a direct exit for 0 */ 1671 if (atom->min == 0) 1672 xmlFAGenerateEpsilonTransition(ctxt, atom->start0, 1673 newstate); 1674 1675 } 1676 atom->min = 0; 1677 atom->max = 0; 1678 atom->quant = XML_REGEXP_QUANT_ONCE; 1679 ctxt->state = newstate; 1680 } 1681 default: 1682 break; 1683 } 1684 return(0); 1685 } 1686 if ((atom->min == 0) && (atom->max == 0) && 1687 (atom->quant == XML_REGEXP_QUANT_RANGE)) { 1688 /* 1689 * we can discard the atom and generate an epsilon transition instead 1690 */ 1691 if (to == NULL) { 1692 to = xmlRegNewState(ctxt); 1693 if (to != NULL) 1694 xmlRegStatePush(ctxt, to); 1695 else { 1696 return(-1); 1697 } 1698 } 1699 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1700 ctxt->state = to; 1701 xmlRegFreeAtom(atom); 1702 return(0); 1703 } 1704 if (to == NULL) { 1705 to = xmlRegNewState(ctxt); 1706 if (to != NULL) 1707 xmlRegStatePush(ctxt, to); 1708 else { 1709 return(-1); 1710 } 1711 } 1712 end = to; 1713 if ((atom->quant == XML_REGEXP_QUANT_MULT) || 1714 (atom->quant == XML_REGEXP_QUANT_PLUS)) { 1715 /* 1716 * Do not pollute the target state by adding transitions from 1717 * it as it is likely to be the shared target of multiple branches. 1718 * So isolate with an epsilon transition. 1719 */ 1720 xmlRegStatePtr tmp; 1721 1722 tmp = xmlRegNewState(ctxt); 1723 if (tmp != NULL) 1724 xmlRegStatePush(ctxt, tmp); 1725 else { 1726 return(-1); 1727 } 1728 xmlFAGenerateEpsilonTransition(ctxt, tmp, to); 1729 to = tmp; 1730 } 1731 if (xmlRegAtomPush(ctxt, atom) < 0) { 1732 return(-1); 1733 } 1734 if ((atom->quant == XML_REGEXP_QUANT_RANGE) && 1735 (atom->min == 0) && (atom->max > 0)) { 1736 nullable = 1; 1737 atom->min = 1; 1738 if (atom->max == 1) 1739 atom->quant = XML_REGEXP_QUANT_OPT; 1740 } 1741 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1); 1742 ctxt->state = end; 1743 switch (atom->quant) { 1744 case XML_REGEXP_QUANT_OPT: 1745 atom->quant = XML_REGEXP_QUANT_ONCE; 1746 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1747 break; 1748 case XML_REGEXP_QUANT_MULT: 1749 atom->quant = XML_REGEXP_QUANT_ONCE; 1750 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1751 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); 1752 break; 1753 case XML_REGEXP_QUANT_PLUS: 1754 atom->quant = XML_REGEXP_QUANT_ONCE; 1755 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); 1756 break; 1757 case XML_REGEXP_QUANT_RANGE: 1758 if (nullable) 1759 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1760 break; 1761 default: 1762 break; 1763 } 1764 return(0); 1765 } 1766 1767 /** 1768 * xmlFAReduceEpsilonTransitions: 1769 * @ctxt: a regexp parser context 1770 * @fromnr: the from state 1771 * @tonr: the to state 1772 * @counter: should that transition be associated to a counted 1773 * 1774 */ 1775 static void 1776 xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr, 1777 int tonr, int counter) { 1778 int transnr; 1779 xmlRegStatePtr from; 1780 xmlRegStatePtr to; 1781 1782 #ifdef DEBUG_REGEXP_GRAPH 1783 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr); 1784 #endif 1785 from = ctxt->states[fromnr]; 1786 if (from == NULL) 1787 return; 1788 to = ctxt->states[tonr]; 1789 if (to == NULL) 1790 return; 1791 if ((to->mark == XML_REGEXP_MARK_START) || 1792 (to->mark == XML_REGEXP_MARK_VISITED)) 1793 return; 1794 1795 to->mark = XML_REGEXP_MARK_VISITED; 1796 if (to->type == XML_REGEXP_FINAL_STATE) { 1797 #ifdef DEBUG_REGEXP_GRAPH 1798 printf("State %d is final, so %d becomes final\n", tonr, fromnr); 1799 #endif 1800 from->type = XML_REGEXP_FINAL_STATE; 1801 } 1802 for (transnr = 0;transnr < to->nbTrans;transnr++) { 1803 if (to->trans[transnr].to < 0) 1804 continue; 1805 if (to->trans[transnr].atom == NULL) { 1806 /* 1807 * Don't remove counted transitions 1808 * Don't loop either 1809 */ 1810 if (to->trans[transnr].to != fromnr) { 1811 if (to->trans[transnr].count >= 0) { 1812 int newto = to->trans[transnr].to; 1813 1814 xmlRegStateAddTrans(ctxt, from, NULL, 1815 ctxt->states[newto], 1816 -1, to->trans[transnr].count); 1817 } else { 1818 #ifdef DEBUG_REGEXP_GRAPH 1819 printf("Found epsilon trans %d from %d to %d\n", 1820 transnr, tonr, to->trans[transnr].to); 1821 #endif 1822 if (to->trans[transnr].counter >= 0) { 1823 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1824 to->trans[transnr].to, 1825 to->trans[transnr].counter); 1826 } else { 1827 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1828 to->trans[transnr].to, 1829 counter); 1830 } 1831 } 1832 } 1833 } else { 1834 int newto = to->trans[transnr].to; 1835 1836 if (to->trans[transnr].counter >= 0) { 1837 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1838 ctxt->states[newto], 1839 to->trans[transnr].counter, -1); 1840 } else { 1841 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1842 ctxt->states[newto], counter, -1); 1843 } 1844 } 1845 } 1846 to->mark = XML_REGEXP_MARK_NORMAL; 1847 } 1848 1849 /** 1850 * xmlFAEliminateSimpleEpsilonTransitions: 1851 * @ctxt: a regexp parser context 1852 * 1853 * Eliminating general epsilon transitions can get costly in the general 1854 * algorithm due to the large amount of generated new transitions and 1855 * associated comparisons. However for simple epsilon transition used just 1856 * to separate building blocks when generating the automata this can be 1857 * reduced to state elimination: 1858 * - if there exists an epsilon from X to Y 1859 * - if there is no other transition from X 1860 * then X and Y are semantically equivalent and X can be eliminated 1861 * If X is the start state then make Y the start state, else replace the 1862 * target of all transitions to X by transitions to Y. 1863 */ 1864 static void 1865 xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { 1866 int statenr, i, j, newto; 1867 xmlRegStatePtr state, tmp; 1868 1869 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1870 state = ctxt->states[statenr]; 1871 if (state == NULL) 1872 continue; 1873 if (state->nbTrans != 1) 1874 continue; 1875 if (state->type == XML_REGEXP_UNREACH_STATE) 1876 continue; 1877 /* is the only transition out a basic transition */ 1878 if ((state->trans[0].atom == NULL) && 1879 (state->trans[0].to >= 0) && 1880 (state->trans[0].to != statenr) && 1881 (state->trans[0].counter < 0) && 1882 (state->trans[0].count < 0)) { 1883 newto = state->trans[0].to; 1884 1885 if (state->type == XML_REGEXP_START_STATE) { 1886 #ifdef DEBUG_REGEXP_GRAPH 1887 printf("Found simple epsilon trans from start %d to %d\n", 1888 statenr, newto); 1889 #endif 1890 } else { 1891 #ifdef DEBUG_REGEXP_GRAPH 1892 printf("Found simple epsilon trans from %d to %d\n", 1893 statenr, newto); 1894 #endif 1895 for (i = 0;i < state->nbTransTo;i++) { 1896 tmp = ctxt->states[state->transTo[i]]; 1897 for (j = 0;j < tmp->nbTrans;j++) { 1898 if (tmp->trans[j].to == statenr) { 1899 #ifdef DEBUG_REGEXP_GRAPH 1900 printf("Changed transition %d on %d to go to %d\n", 1901 j, tmp->no, newto); 1902 #endif 1903 tmp->trans[j].to = -1; 1904 xmlRegStateAddTrans(ctxt, tmp, tmp->trans[j].atom, 1905 ctxt->states[newto], 1906 tmp->trans[j].counter, 1907 tmp->trans[j].count); 1908 } 1909 } 1910 } 1911 if (state->type == XML_REGEXP_FINAL_STATE) 1912 ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE; 1913 /* eliminate the transition completely */ 1914 state->nbTrans = 0; 1915 1916 state->type = XML_REGEXP_UNREACH_STATE; 1917 1918 } 1919 1920 } 1921 } 1922 } 1923 /** 1924 * xmlFAEliminateEpsilonTransitions: 1925 * @ctxt: a regexp parser context 1926 * 1927 */ 1928 static void 1929 xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { 1930 int statenr, transnr; 1931 xmlRegStatePtr state; 1932 int has_epsilon; 1933 1934 if (ctxt->states == NULL) return; 1935 1936 /* 1937 * Eliminate simple epsilon transition and the associated unreachable 1938 * states. 1939 */ 1940 xmlFAEliminateSimpleEpsilonTransitions(ctxt); 1941 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1942 state = ctxt->states[statenr]; 1943 if ((state != NULL) && (state->type == XML_REGEXP_UNREACH_STATE)) { 1944 #ifdef DEBUG_REGEXP_GRAPH 1945 printf("Removed unreachable state %d\n", statenr); 1946 #endif 1947 xmlRegFreeState(state); 1948 ctxt->states[statenr] = NULL; 1949 } 1950 } 1951 1952 has_epsilon = 0; 1953 1954 /* 1955 * Build the completed transitions bypassing the epsilons 1956 * Use a marking algorithm to avoid loops 1957 * Mark sink states too. 1958 * Process from the latests states backward to the start when 1959 * there is long cascading epsilon chains this minimize the 1960 * recursions and transition compares when adding the new ones 1961 */ 1962 for (statenr = ctxt->nbStates - 1;statenr >= 0;statenr--) { 1963 state = ctxt->states[statenr]; 1964 if (state == NULL) 1965 continue; 1966 if ((state->nbTrans == 0) && 1967 (state->type != XML_REGEXP_FINAL_STATE)) { 1968 state->type = XML_REGEXP_SINK_STATE; 1969 } 1970 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1971 if ((state->trans[transnr].atom == NULL) && 1972 (state->trans[transnr].to >= 0)) { 1973 if (state->trans[transnr].to == statenr) { 1974 state->trans[transnr].to = -1; 1975 #ifdef DEBUG_REGEXP_GRAPH 1976 printf("Removed loopback epsilon trans %d on %d\n", 1977 transnr, statenr); 1978 #endif 1979 } else if (state->trans[transnr].count < 0) { 1980 int newto = state->trans[transnr].to; 1981 1982 #ifdef DEBUG_REGEXP_GRAPH 1983 printf("Found epsilon trans %d from %d to %d\n", 1984 transnr, statenr, newto); 1985 #endif 1986 has_epsilon = 1; 1987 state->trans[transnr].to = -2; 1988 state->mark = XML_REGEXP_MARK_START; 1989 xmlFAReduceEpsilonTransitions(ctxt, statenr, 1990 newto, state->trans[transnr].counter); 1991 state->mark = XML_REGEXP_MARK_NORMAL; 1992 #ifdef DEBUG_REGEXP_GRAPH 1993 } else { 1994 printf("Found counted transition %d on %d\n", 1995 transnr, statenr); 1996 #endif 1997 } 1998 } 1999 } 2000 } 2001 /* 2002 * Eliminate the epsilon transitions 2003 */ 2004 if (has_epsilon) { 2005 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2006 state = ctxt->states[statenr]; 2007 if (state == NULL) 2008 continue; 2009 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2010 xmlRegTransPtr trans = &(state->trans[transnr]); 2011 if ((trans->atom == NULL) && 2012 (trans->count < 0) && 2013 (trans->to >= 0)) { 2014 trans->to = -1; 2015 } 2016 } 2017 } 2018 } 2019 2020 /* 2021 * Use this pass to detect unreachable states too 2022 */ 2023 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2024 state = ctxt->states[statenr]; 2025 if (state != NULL) 2026 state->reached = XML_REGEXP_MARK_NORMAL; 2027 } 2028 state = ctxt->states[0]; 2029 if (state != NULL) 2030 state->reached = XML_REGEXP_MARK_START; 2031 while (state != NULL) { 2032 xmlRegStatePtr target = NULL; 2033 state->reached = XML_REGEXP_MARK_VISITED; 2034 /* 2035 * Mark all states reachable from the current reachable state 2036 */ 2037 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2038 if ((state->trans[transnr].to >= 0) && 2039 ((state->trans[transnr].atom != NULL) || 2040 (state->trans[transnr].count >= 0))) { 2041 int newto = state->trans[transnr].to; 2042 2043 if (ctxt->states[newto] == NULL) 2044 continue; 2045 if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) { 2046 ctxt->states[newto]->reached = XML_REGEXP_MARK_START; 2047 target = ctxt->states[newto]; 2048 } 2049 } 2050 } 2051 2052 /* 2053 * find the next accessible state not explored 2054 */ 2055 if (target == NULL) { 2056 for (statenr = 1;statenr < ctxt->nbStates;statenr++) { 2057 state = ctxt->states[statenr]; 2058 if ((state != NULL) && (state->reached == 2059 XML_REGEXP_MARK_START)) { 2060 target = state; 2061 break; 2062 } 2063 } 2064 } 2065 state = target; 2066 } 2067 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2068 state = ctxt->states[statenr]; 2069 if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) { 2070 #ifdef DEBUG_REGEXP_GRAPH 2071 printf("Removed unreachable state %d\n", statenr); 2072 #endif 2073 xmlRegFreeState(state); 2074 ctxt->states[statenr] = NULL; 2075 } 2076 } 2077 2078 } 2079 2080 static int 2081 xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) { 2082 int ret = 0; 2083 2084 if ((range1->type == XML_REGEXP_RANGES) || 2085 (range2->type == XML_REGEXP_RANGES) || 2086 (range2->type == XML_REGEXP_SUBREG) || 2087 (range1->type == XML_REGEXP_SUBREG) || 2088 (range1->type == XML_REGEXP_STRING) || 2089 (range2->type == XML_REGEXP_STRING)) 2090 return(-1); 2091 2092 /* put them in order */ 2093 if (range1->type > range2->type) { 2094 xmlRegRangePtr tmp; 2095 2096 tmp = range1; 2097 range1 = range2; 2098 range2 = tmp; 2099 } 2100 if ((range1->type == XML_REGEXP_ANYCHAR) || 2101 (range2->type == XML_REGEXP_ANYCHAR)) { 2102 ret = 1; 2103 } else if ((range1->type == XML_REGEXP_EPSILON) || 2104 (range2->type == XML_REGEXP_EPSILON)) { 2105 return(0); 2106 } else if (range1->type == range2->type) { 2107 if (range1->type != XML_REGEXP_CHARVAL) 2108 ret = 1; 2109 else if ((range1->end < range2->start) || 2110 (range2->end < range1->start)) 2111 ret = 0; 2112 else 2113 ret = 1; 2114 } else if (range1->type == XML_REGEXP_CHARVAL) { 2115 int codepoint; 2116 int neg = 0; 2117 2118 /* 2119 * just check all codepoints in the range for acceptance, 2120 * this is usually way cheaper since done only once at 2121 * compilation than testing over and over at runtime or 2122 * pushing too many states when evaluating. 2123 */ 2124 if (((range1->neg == 0) && (range2->neg != 0)) || 2125 ((range1->neg != 0) && (range2->neg == 0))) 2126 neg = 1; 2127 2128 for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) { 2129 ret = xmlRegCheckCharacterRange(range2->type, codepoint, 2130 0, range2->start, range2->end, 2131 range2->blockName); 2132 if (ret < 0) 2133 return(-1); 2134 if (((neg == 1) && (ret == 0)) || 2135 ((neg == 0) && (ret == 1))) 2136 return(1); 2137 } 2138 return(0); 2139 } else if ((range1->type == XML_REGEXP_BLOCK_NAME) || 2140 (range2->type == XML_REGEXP_BLOCK_NAME)) { 2141 if (range1->type == range2->type) { 2142 ret = xmlStrEqual(range1->blockName, range2->blockName); 2143 } else { 2144 /* 2145 * comparing a block range with anything else is way 2146 * too costly, and maintining the table is like too much 2147 * memory too, so let's force the automata to save state 2148 * here. 2149 */ 2150 return(1); 2151 } 2152 } else if ((range1->type < XML_REGEXP_LETTER) || 2153 (range2->type < XML_REGEXP_LETTER)) { 2154 if ((range1->type == XML_REGEXP_ANYSPACE) && 2155 (range2->type == XML_REGEXP_NOTSPACE)) 2156 ret = 0; 2157 else if ((range1->type == XML_REGEXP_INITNAME) && 2158 (range2->type == XML_REGEXP_NOTINITNAME)) 2159 ret = 0; 2160 else if ((range1->type == XML_REGEXP_NAMECHAR) && 2161 (range2->type == XML_REGEXP_NOTNAMECHAR)) 2162 ret = 0; 2163 else if ((range1->type == XML_REGEXP_DECIMAL) && 2164 (range2->type == XML_REGEXP_NOTDECIMAL)) 2165 ret = 0; 2166 else if ((range1->type == XML_REGEXP_REALCHAR) && 2167 (range2->type == XML_REGEXP_NOTREALCHAR)) 2168 ret = 0; 2169 else { 2170 /* same thing to limit complexity */ 2171 return(1); 2172 } 2173 } else { 2174 ret = 0; 2175 /* range1->type < range2->type here */ 2176 switch (range1->type) { 2177 case XML_REGEXP_LETTER: 2178 /* all disjoint except in the subgroups */ 2179 if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) || 2180 (range2->type == XML_REGEXP_LETTER_LOWERCASE) || 2181 (range2->type == XML_REGEXP_LETTER_TITLECASE) || 2182 (range2->type == XML_REGEXP_LETTER_MODIFIER) || 2183 (range2->type == XML_REGEXP_LETTER_OTHERS)) 2184 ret = 1; 2185 break; 2186 case XML_REGEXP_MARK: 2187 if ((range2->type == XML_REGEXP_MARK_NONSPACING) || 2188 (range2->type == XML_REGEXP_MARK_SPACECOMBINING) || 2189 (range2->type == XML_REGEXP_MARK_ENCLOSING)) 2190 ret = 1; 2191 break; 2192 case XML_REGEXP_NUMBER: 2193 if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) || 2194 (range2->type == XML_REGEXP_NUMBER_LETTER) || 2195 (range2->type == XML_REGEXP_NUMBER_OTHERS)) 2196 ret = 1; 2197 break; 2198 case XML_REGEXP_PUNCT: 2199 if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) || 2200 (range2->type == XML_REGEXP_PUNCT_DASH) || 2201 (range2->type == XML_REGEXP_PUNCT_OPEN) || 2202 (range2->type == XML_REGEXP_PUNCT_CLOSE) || 2203 (range2->type == XML_REGEXP_PUNCT_INITQUOTE) || 2204 (range2->type == XML_REGEXP_PUNCT_FINQUOTE) || 2205 (range2->type == XML_REGEXP_PUNCT_OTHERS)) 2206 ret = 1; 2207 break; 2208 case XML_REGEXP_SEPAR: 2209 if ((range2->type == XML_REGEXP_SEPAR_SPACE) || 2210 (range2->type == XML_REGEXP_SEPAR_LINE) || 2211 (range2->type == XML_REGEXP_SEPAR_PARA)) 2212 ret = 1; 2213 break; 2214 case XML_REGEXP_SYMBOL: 2215 if ((range2->type == XML_REGEXP_SYMBOL_MATH) || 2216 (range2->type == XML_REGEXP_SYMBOL_CURRENCY) || 2217 (range2->type == XML_REGEXP_SYMBOL_MODIFIER) || 2218 (range2->type == XML_REGEXP_SYMBOL_OTHERS)) 2219 ret = 1; 2220 break; 2221 case XML_REGEXP_OTHER: 2222 if ((range2->type == XML_REGEXP_OTHER_CONTROL) || 2223 (range2->type == XML_REGEXP_OTHER_FORMAT) || 2224 (range2->type == XML_REGEXP_OTHER_PRIVATE)) 2225 ret = 1; 2226 break; 2227 default: 2228 if ((range2->type >= XML_REGEXP_LETTER) && 2229 (range2->type < XML_REGEXP_BLOCK_NAME)) 2230 ret = 0; 2231 else { 2232 /* safety net ! */ 2233 return(1); 2234 } 2235 } 2236 } 2237 if (((range1->neg == 0) && (range2->neg != 0)) || 2238 ((range1->neg != 0) && (range2->neg == 0))) 2239 ret = !ret; 2240 return(ret); 2241 } 2242 2243 /** 2244 * xmlFACompareAtomTypes: 2245 * @type1: an atom type 2246 * @type2: an atom type 2247 * 2248 * Compares two atoms type to check whether they intersect in some ways, 2249 * this is used by xmlFACompareAtoms only 2250 * 2251 * Returns 1 if they may intersect and 0 otherwise 2252 */ 2253 static int 2254 xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) { 2255 if ((type1 == XML_REGEXP_EPSILON) || 2256 (type1 == XML_REGEXP_CHARVAL) || 2257 (type1 == XML_REGEXP_RANGES) || 2258 (type1 == XML_REGEXP_SUBREG) || 2259 (type1 == XML_REGEXP_STRING) || 2260 (type1 == XML_REGEXP_ANYCHAR)) 2261 return(1); 2262 if ((type2 == XML_REGEXP_EPSILON) || 2263 (type2 == XML_REGEXP_CHARVAL) || 2264 (type2 == XML_REGEXP_RANGES) || 2265 (type2 == XML_REGEXP_SUBREG) || 2266 (type2 == XML_REGEXP_STRING) || 2267 (type2 == XML_REGEXP_ANYCHAR)) 2268 return(1); 2269 2270 if (type1 == type2) return(1); 2271 2272 /* simplify subsequent compares by making sure type1 < type2 */ 2273 if (type1 > type2) { 2274 xmlRegAtomType tmp = type1; 2275 type1 = type2; 2276 type2 = tmp; 2277 } 2278 switch (type1) { 2279 case XML_REGEXP_ANYSPACE: /* \s */ 2280 /* can't be a letter, number, mark, pontuation, symbol */ 2281 if ((type2 == XML_REGEXP_NOTSPACE) || 2282 ((type2 >= XML_REGEXP_LETTER) && 2283 (type2 <= XML_REGEXP_LETTER_OTHERS)) || 2284 ((type2 >= XML_REGEXP_NUMBER) && 2285 (type2 <= XML_REGEXP_NUMBER_OTHERS)) || 2286 ((type2 >= XML_REGEXP_MARK) && 2287 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2288 ((type2 >= XML_REGEXP_PUNCT) && 2289 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2290 ((type2 >= XML_REGEXP_SYMBOL) && 2291 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) 2292 ) return(0); 2293 break; 2294 case XML_REGEXP_NOTSPACE: /* \S */ 2295 break; 2296 case XML_REGEXP_INITNAME: /* \l */ 2297 /* can't be a number, mark, separator, pontuation, symbol or other */ 2298 if ((type2 == XML_REGEXP_NOTINITNAME) || 2299 ((type2 >= XML_REGEXP_NUMBER) && 2300 (type2 <= XML_REGEXP_NUMBER_OTHERS)) || 2301 ((type2 >= XML_REGEXP_MARK) && 2302 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2303 ((type2 >= XML_REGEXP_SEPAR) && 2304 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2305 ((type2 >= XML_REGEXP_PUNCT) && 2306 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2307 ((type2 >= XML_REGEXP_SYMBOL) && 2308 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2309 ((type2 >= XML_REGEXP_OTHER) && 2310 (type2 <= XML_REGEXP_OTHER_NA)) 2311 ) return(0); 2312 break; 2313 case XML_REGEXP_NOTINITNAME: /* \L */ 2314 break; 2315 case XML_REGEXP_NAMECHAR: /* \c */ 2316 /* can't be a mark, separator, pontuation, symbol or other */ 2317 if ((type2 == XML_REGEXP_NOTNAMECHAR) || 2318 ((type2 >= XML_REGEXP_MARK) && 2319 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2320 ((type2 >= XML_REGEXP_PUNCT) && 2321 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2322 ((type2 >= XML_REGEXP_SEPAR) && 2323 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2324 ((type2 >= XML_REGEXP_SYMBOL) && 2325 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2326 ((type2 >= XML_REGEXP_OTHER) && 2327 (type2 <= XML_REGEXP_OTHER_NA)) 2328 ) return(0); 2329 break; 2330 case XML_REGEXP_NOTNAMECHAR: /* \C */ 2331 break; 2332 case XML_REGEXP_DECIMAL: /* \d */ 2333 /* can't be a letter, mark, separator, pontuation, symbol or other */ 2334 if ((type2 == XML_REGEXP_NOTDECIMAL) || 2335 (type2 == XML_REGEXP_REALCHAR) || 2336 ((type2 >= XML_REGEXP_LETTER) && 2337 (type2 <= XML_REGEXP_LETTER_OTHERS)) || 2338 ((type2 >= XML_REGEXP_MARK) && 2339 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2340 ((type2 >= XML_REGEXP_PUNCT) && 2341 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2342 ((type2 >= XML_REGEXP_SEPAR) && 2343 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2344 ((type2 >= XML_REGEXP_SYMBOL) && 2345 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2346 ((type2 >= XML_REGEXP_OTHER) && 2347 (type2 <= XML_REGEXP_OTHER_NA)) 2348 )return(0); 2349 break; 2350 case XML_REGEXP_NOTDECIMAL: /* \D */ 2351 break; 2352 case XML_REGEXP_REALCHAR: /* \w */ 2353 /* can't be a mark, separator, pontuation, symbol or other */ 2354 if ((type2 == XML_REGEXP_NOTDECIMAL) || 2355 ((type2 >= XML_REGEXP_MARK) && 2356 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2357 ((type2 >= XML_REGEXP_PUNCT) && 2358 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2359 ((type2 >= XML_REGEXP_SEPAR) && 2360 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2361 ((type2 >= XML_REGEXP_SYMBOL) && 2362 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2363 ((type2 >= XML_REGEXP_OTHER) && 2364 (type2 <= XML_REGEXP_OTHER_NA)) 2365 )return(0); 2366 break; 2367 case XML_REGEXP_NOTREALCHAR: /* \W */ 2368 break; 2369 /* 2370 * at that point we know both type 1 and type2 are from 2371 * character categories are ordered and are different, 2372 * it becomes simple because this is a partition 2373 */ 2374 case XML_REGEXP_LETTER: 2375 if (type2 <= XML_REGEXP_LETTER_OTHERS) 2376 return(1); 2377 return(0); 2378 case XML_REGEXP_LETTER_UPPERCASE: 2379 case XML_REGEXP_LETTER_LOWERCASE: 2380 case XML_REGEXP_LETTER_TITLECASE: 2381 case XML_REGEXP_LETTER_MODIFIER: 2382 case XML_REGEXP_LETTER_OTHERS: 2383 return(0); 2384 case XML_REGEXP_MARK: 2385 if (type2 <= XML_REGEXP_MARK_ENCLOSING) 2386 return(1); 2387 return(0); 2388 case XML_REGEXP_MARK_NONSPACING: 2389 case XML_REGEXP_MARK_SPACECOMBINING: 2390 case XML_REGEXP_MARK_ENCLOSING: 2391 return(0); 2392 case XML_REGEXP_NUMBER: 2393 if (type2 <= XML_REGEXP_NUMBER_OTHERS) 2394 return(1); 2395 return(0); 2396 case XML_REGEXP_NUMBER_DECIMAL: 2397 case XML_REGEXP_NUMBER_LETTER: 2398 case XML_REGEXP_NUMBER_OTHERS: 2399 return(0); 2400 case XML_REGEXP_PUNCT: 2401 if (type2 <= XML_REGEXP_PUNCT_OTHERS) 2402 return(1); 2403 return(0); 2404 case XML_REGEXP_PUNCT_CONNECTOR: 2405 case XML_REGEXP_PUNCT_DASH: 2406 case XML_REGEXP_PUNCT_OPEN: 2407 case XML_REGEXP_PUNCT_CLOSE: 2408 case XML_REGEXP_PUNCT_INITQUOTE: 2409 case XML_REGEXP_PUNCT_FINQUOTE: 2410 case XML_REGEXP_PUNCT_OTHERS: 2411 return(0); 2412 case XML_REGEXP_SEPAR: 2413 if (type2 <= XML_REGEXP_SEPAR_PARA) 2414 return(1); 2415 return(0); 2416 case XML_REGEXP_SEPAR_SPACE: 2417 case XML_REGEXP_SEPAR_LINE: 2418 case XML_REGEXP_SEPAR_PARA: 2419 return(0); 2420 case XML_REGEXP_SYMBOL: 2421 if (type2 <= XML_REGEXP_SYMBOL_OTHERS) 2422 return(1); 2423 return(0); 2424 case XML_REGEXP_SYMBOL_MATH: 2425 case XML_REGEXP_SYMBOL_CURRENCY: 2426 case XML_REGEXP_SYMBOL_MODIFIER: 2427 case XML_REGEXP_SYMBOL_OTHERS: 2428 return(0); 2429 case XML_REGEXP_OTHER: 2430 if (type2 <= XML_REGEXP_OTHER_NA) 2431 return(1); 2432 return(0); 2433 case XML_REGEXP_OTHER_CONTROL: 2434 case XML_REGEXP_OTHER_FORMAT: 2435 case XML_REGEXP_OTHER_PRIVATE: 2436 case XML_REGEXP_OTHER_NA: 2437 return(0); 2438 default: 2439 break; 2440 } 2441 return(1); 2442 } 2443 2444 /** 2445 * xmlFAEqualAtoms: 2446 * @atom1: an atom 2447 * @atom2: an atom 2448 * @deep: if not set only compare string pointers 2449 * 2450 * Compares two atoms to check whether they are the same exactly 2451 * this is used to remove equivalent transitions 2452 * 2453 * Returns 1 if same and 0 otherwise 2454 */ 2455 static int 2456 xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) { 2457 int ret = 0; 2458 2459 if (atom1 == atom2) 2460 return(1); 2461 if ((atom1 == NULL) || (atom2 == NULL)) 2462 return(0); 2463 2464 if (atom1->type != atom2->type) 2465 return(0); 2466 switch (atom1->type) { 2467 case XML_REGEXP_EPSILON: 2468 ret = 0; 2469 break; 2470 case XML_REGEXP_STRING: 2471 if (!deep) 2472 ret = (atom1->valuep == atom2->valuep); 2473 else 2474 ret = xmlStrEqual((xmlChar *)atom1->valuep, 2475 (xmlChar *)atom2->valuep); 2476 break; 2477 case XML_REGEXP_CHARVAL: 2478 ret = (atom1->codepoint == atom2->codepoint); 2479 break; 2480 case XML_REGEXP_RANGES: 2481 /* too hard to do in the general case */ 2482 ret = 0; 2483 default: 2484 break; 2485 } 2486 return(ret); 2487 } 2488 2489 /** 2490 * xmlFACompareAtoms: 2491 * @atom1: an atom 2492 * @atom2: an atom 2493 * @deep: if not set only compare string pointers 2494 * 2495 * Compares two atoms to check whether they intersect in some ways, 2496 * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only 2497 * 2498 * Returns 1 if yes and 0 otherwise 2499 */ 2500 static int 2501 xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) { 2502 int ret = 1; 2503 2504 if (atom1 == atom2) 2505 return(1); 2506 if ((atom1 == NULL) || (atom2 == NULL)) 2507 return(0); 2508 2509 if ((atom1->type == XML_REGEXP_ANYCHAR) || 2510 (atom2->type == XML_REGEXP_ANYCHAR)) 2511 return(1); 2512 2513 if (atom1->type > atom2->type) { 2514 xmlRegAtomPtr tmp; 2515 tmp = atom1; 2516 atom1 = atom2; 2517 atom2 = tmp; 2518 } 2519 if (atom1->type != atom2->type) { 2520 ret = xmlFACompareAtomTypes(atom1->type, atom2->type); 2521 /* if they can't intersect at the type level break now */ 2522 if (ret == 0) 2523 return(0); 2524 } 2525 switch (atom1->type) { 2526 case XML_REGEXP_STRING: 2527 if (!deep) 2528 ret = (atom1->valuep != atom2->valuep); 2529 else 2530 ret = xmlRegStrEqualWildcard((xmlChar *)atom1->valuep, 2531 (xmlChar *)atom2->valuep); 2532 break; 2533 case XML_REGEXP_EPSILON: 2534 goto not_determinist; 2535 case XML_REGEXP_CHARVAL: 2536 if (atom2->type == XML_REGEXP_CHARVAL) { 2537 ret = (atom1->codepoint == atom2->codepoint); 2538 } else { 2539 ret = xmlRegCheckCharacter(atom2, atom1->codepoint); 2540 if (ret < 0) 2541 ret = 1; 2542 } 2543 break; 2544 case XML_REGEXP_RANGES: 2545 if (atom2->type == XML_REGEXP_RANGES) { 2546 int i, j, res; 2547 xmlRegRangePtr r1, r2; 2548 2549 /* 2550 * need to check that none of the ranges eventually matches 2551 */ 2552 for (i = 0;i < atom1->nbRanges;i++) { 2553 for (j = 0;j < atom2->nbRanges;j++) { 2554 r1 = atom1->ranges[i]; 2555 r2 = atom2->ranges[j]; 2556 res = xmlFACompareRanges(r1, r2); 2557 if (res == 1) { 2558 ret = 1; 2559 goto done; 2560 } 2561 } 2562 } 2563 ret = 0; 2564 } 2565 break; 2566 default: 2567 goto not_determinist; 2568 } 2569 done: 2570 if (atom1->neg != atom2->neg) { 2571 ret = !ret; 2572 } 2573 if (ret == 0) 2574 return(0); 2575 not_determinist: 2576 return(1); 2577 } 2578 2579 /** 2580 * xmlFARecurseDeterminism: 2581 * @ctxt: a regexp parser context 2582 * 2583 * Check whether the associated regexp is determinist, 2584 * should be called after xmlFAEliminateEpsilonTransitions() 2585 * 2586 */ 2587 static int 2588 xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 2589 int to, xmlRegAtomPtr atom) { 2590 int ret = 1; 2591 int res; 2592 int transnr, nbTrans; 2593 xmlRegTransPtr t1; 2594 int deep = 1; 2595 2596 if (state == NULL) 2597 return(ret); 2598 if (state->markd == XML_REGEXP_MARK_VISITED) 2599 return(ret); 2600 2601 if (ctxt->flags & AM_AUTOMATA_RNG) 2602 deep = 0; 2603 2604 /* 2605 * don't recurse on transitions potentially added in the course of 2606 * the elimination. 2607 */ 2608 nbTrans = state->nbTrans; 2609 for (transnr = 0;transnr < nbTrans;transnr++) { 2610 t1 = &(state->trans[transnr]); 2611 /* 2612 * check transitions conflicting with the one looked at 2613 */ 2614 if (t1->atom == NULL) { 2615 if (t1->to < 0) 2616 continue; 2617 state->markd = XML_REGEXP_MARK_VISITED; 2618 res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], 2619 to, atom); 2620 state->markd = 0; 2621 if (res == 0) { 2622 ret = 0; 2623 /* t1->nd = 1; */ 2624 } 2625 continue; 2626 } 2627 if (t1->to != to) 2628 continue; 2629 if (xmlFACompareAtoms(t1->atom, atom, deep)) { 2630 ret = 0; 2631 /* mark the transition as non-deterministic */ 2632 t1->nd = 1; 2633 } 2634 } 2635 return(ret); 2636 } 2637 2638 /** 2639 * xmlFAComputesDeterminism: 2640 * @ctxt: a regexp parser context 2641 * 2642 * Check whether the associated regexp is determinist, 2643 * should be called after xmlFAEliminateEpsilonTransitions() 2644 * 2645 */ 2646 static int 2647 xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) { 2648 int statenr, transnr; 2649 xmlRegStatePtr state; 2650 xmlRegTransPtr t1, t2, last; 2651 int i; 2652 int ret = 1; 2653 int deep = 1; 2654 2655 #ifdef DEBUG_REGEXP_GRAPH 2656 printf("xmlFAComputesDeterminism\n"); 2657 xmlRegPrintCtxt(stdout, ctxt); 2658 #endif 2659 if (ctxt->determinist != -1) 2660 return(ctxt->determinist); 2661 2662 if (ctxt->flags & AM_AUTOMATA_RNG) 2663 deep = 0; 2664 2665 /* 2666 * First cleanup the automata removing cancelled transitions 2667 */ 2668 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2669 state = ctxt->states[statenr]; 2670 if (state == NULL) 2671 continue; 2672 if (state->nbTrans < 2) 2673 continue; 2674 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2675 t1 = &(state->trans[transnr]); 2676 /* 2677 * Determinism checks in case of counted or all transitions 2678 * will have to be handled separately 2679 */ 2680 if (t1->atom == NULL) { 2681 /* t1->nd = 1; */ 2682 continue; 2683 } 2684 if (t1->to == -1) /* eliminated */ 2685 continue; 2686 for (i = 0;i < transnr;i++) { 2687 t2 = &(state->trans[i]); 2688 if (t2->to == -1) /* eliminated */ 2689 continue; 2690 if (t2->atom != NULL) { 2691 if (t1->to == t2->to) { 2692 /* 2693 * Here we use deep because we want to keep the 2694 * transitions which indicate a conflict 2695 */ 2696 if (xmlFAEqualAtoms(t1->atom, t2->atom, deep) && 2697 (t1->counter == t2->counter) && 2698 (t1->count == t2->count)) 2699 t2->to = -1; /* eliminated */ 2700 } 2701 } 2702 } 2703 } 2704 } 2705 2706 /* 2707 * Check for all states that there aren't 2 transitions 2708 * with the same atom and a different target. 2709 */ 2710 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2711 state = ctxt->states[statenr]; 2712 if (state == NULL) 2713 continue; 2714 if (state->nbTrans < 2) 2715 continue; 2716 last = NULL; 2717 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2718 t1 = &(state->trans[transnr]); 2719 /* 2720 * Determinism checks in case of counted or all transitions 2721 * will have to be handled separately 2722 */ 2723 if (t1->atom == NULL) { 2724 continue; 2725 } 2726 if (t1->to == -1) /* eliminated */ 2727 continue; 2728 for (i = 0;i < transnr;i++) { 2729 t2 = &(state->trans[i]); 2730 if (t2->to == -1) /* eliminated */ 2731 continue; 2732 if (t2->atom != NULL) { 2733 /* 2734 * But here we don't use deep because we want to 2735 * find transitions which indicate a conflict 2736 */ 2737 if (xmlFACompareAtoms(t1->atom, t2->atom, 1)) { 2738 ret = 0; 2739 /* mark the transitions as non-deterministic ones */ 2740 t1->nd = 1; 2741 t2->nd = 1; 2742 last = t1; 2743 } 2744 } else if (t1->to != -1) { 2745 /* 2746 * do the closure in case of remaining specific 2747 * epsilon transitions like choices or all 2748 */ 2749 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], 2750 t2->to, t2->atom); 2751 /* don't shortcut the computation so all non deterministic 2752 transition get marked down 2753 if (ret == 0) 2754 return(0); 2755 */ 2756 if (ret == 0) { 2757 t1->nd = 1; 2758 /* t2->nd = 1; */ 2759 last = t1; 2760 } 2761 } 2762 } 2763 /* don't shortcut the computation so all non deterministic 2764 transition get marked down 2765 if (ret == 0) 2766 break; */ 2767 } 2768 2769 /* 2770 * mark specifically the last non-deterministic transition 2771 * from a state since there is no need to set-up rollback 2772 * from it 2773 */ 2774 if (last != NULL) { 2775 last->nd = 2; 2776 } 2777 2778 /* don't shortcut the computation so all non deterministic 2779 transition get marked down 2780 if (ret == 0) 2781 break; */ 2782 } 2783 2784 ctxt->determinist = ret; 2785 return(ret); 2786 } 2787 2788 /************************************************************************ 2789 * * 2790 * Routines to check input against transition atoms * 2791 * * 2792 ************************************************************************/ 2793 2794 static int 2795 xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg, 2796 int start, int end, const xmlChar *blockName) { 2797 int ret = 0; 2798 2799 switch (type) { 2800 case XML_REGEXP_STRING: 2801 case XML_REGEXP_SUBREG: 2802 case XML_REGEXP_RANGES: 2803 case XML_REGEXP_EPSILON: 2804 return(-1); 2805 case XML_REGEXP_ANYCHAR: 2806 ret = ((codepoint != '\n') && (codepoint != '\r')); 2807 break; 2808 case XML_REGEXP_CHARVAL: 2809 ret = ((codepoint >= start) && (codepoint <= end)); 2810 break; 2811 case XML_REGEXP_NOTSPACE: 2812 neg = !neg; 2813 /* Falls through. */ 2814 case XML_REGEXP_ANYSPACE: 2815 ret = ((codepoint == '\n') || (codepoint == '\r') || 2816 (codepoint == '\t') || (codepoint == ' ')); 2817 break; 2818 case XML_REGEXP_NOTINITNAME: 2819 neg = !neg; 2820 /* Falls through. */ 2821 case XML_REGEXP_INITNAME: 2822 ret = (IS_LETTER(codepoint) || 2823 (codepoint == '_') || (codepoint == ':')); 2824 break; 2825 case XML_REGEXP_NOTNAMECHAR: 2826 neg = !neg; 2827 /* Falls through. */ 2828 case XML_REGEXP_NAMECHAR: 2829 ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) || 2830 (codepoint == '.') || (codepoint == '-') || 2831 (codepoint == '_') || (codepoint == ':') || 2832 IS_COMBINING(codepoint) || IS_EXTENDER(codepoint)); 2833 break; 2834 case XML_REGEXP_NOTDECIMAL: 2835 neg = !neg; 2836 /* Falls through. */ 2837 case XML_REGEXP_DECIMAL: 2838 ret = xmlUCSIsCatNd(codepoint); 2839 break; 2840 case XML_REGEXP_REALCHAR: 2841 neg = !neg; 2842 /* Falls through. */ 2843 case XML_REGEXP_NOTREALCHAR: 2844 ret = xmlUCSIsCatP(codepoint); 2845 if (ret == 0) 2846 ret = xmlUCSIsCatZ(codepoint); 2847 if (ret == 0) 2848 ret = xmlUCSIsCatC(codepoint); 2849 break; 2850 case XML_REGEXP_LETTER: 2851 ret = xmlUCSIsCatL(codepoint); 2852 break; 2853 case XML_REGEXP_LETTER_UPPERCASE: 2854 ret = xmlUCSIsCatLu(codepoint); 2855 break; 2856 case XML_REGEXP_LETTER_LOWERCASE: 2857 ret = xmlUCSIsCatLl(codepoint); 2858 break; 2859 case XML_REGEXP_LETTER_TITLECASE: 2860 ret = xmlUCSIsCatLt(codepoint); 2861 break; 2862 case XML_REGEXP_LETTER_MODIFIER: 2863 ret = xmlUCSIsCatLm(codepoint); 2864 break; 2865 case XML_REGEXP_LETTER_OTHERS: 2866 ret = xmlUCSIsCatLo(codepoint); 2867 break; 2868 case XML_REGEXP_MARK: 2869 ret = xmlUCSIsCatM(codepoint); 2870 break; 2871 case XML_REGEXP_MARK_NONSPACING: 2872 ret = xmlUCSIsCatMn(codepoint); 2873 break; 2874 case XML_REGEXP_MARK_SPACECOMBINING: 2875 ret = xmlUCSIsCatMc(codepoint); 2876 break; 2877 case XML_REGEXP_MARK_ENCLOSING: 2878 ret = xmlUCSIsCatMe(codepoint); 2879 break; 2880 case XML_REGEXP_NUMBER: 2881 ret = xmlUCSIsCatN(codepoint); 2882 break; 2883 case XML_REGEXP_NUMBER_DECIMAL: 2884 ret = xmlUCSIsCatNd(codepoint); 2885 break; 2886 case XML_REGEXP_NUMBER_LETTER: 2887 ret = xmlUCSIsCatNl(codepoint); 2888 break; 2889 case XML_REGEXP_NUMBER_OTHERS: 2890 ret = xmlUCSIsCatNo(codepoint); 2891 break; 2892 case XML_REGEXP_PUNCT: 2893 ret = xmlUCSIsCatP(codepoint); 2894 break; 2895 case XML_REGEXP_PUNCT_CONNECTOR: 2896 ret = xmlUCSIsCatPc(codepoint); 2897 break; 2898 case XML_REGEXP_PUNCT_DASH: 2899 ret = xmlUCSIsCatPd(codepoint); 2900 break; 2901 case XML_REGEXP_PUNCT_OPEN: 2902 ret = xmlUCSIsCatPs(codepoint); 2903 break; 2904 case XML_REGEXP_PUNCT_CLOSE: 2905 ret = xmlUCSIsCatPe(codepoint); 2906 break; 2907 case XML_REGEXP_PUNCT_INITQUOTE: 2908 ret = xmlUCSIsCatPi(codepoint); 2909 break; 2910 case XML_REGEXP_PUNCT_FINQUOTE: 2911 ret = xmlUCSIsCatPf(codepoint); 2912 break; 2913 case XML_REGEXP_PUNCT_OTHERS: 2914 ret = xmlUCSIsCatPo(codepoint); 2915 break; 2916 case XML_REGEXP_SEPAR: 2917 ret = xmlUCSIsCatZ(codepoint); 2918 break; 2919 case XML_REGEXP_SEPAR_SPACE: 2920 ret = xmlUCSIsCatZs(codepoint); 2921 break; 2922 case XML_REGEXP_SEPAR_LINE: 2923 ret = xmlUCSIsCatZl(codepoint); 2924 break; 2925 case XML_REGEXP_SEPAR_PARA: 2926 ret = xmlUCSIsCatZp(codepoint); 2927 break; 2928 case XML_REGEXP_SYMBOL: 2929 ret = xmlUCSIsCatS(codepoint); 2930 break; 2931 case XML_REGEXP_SYMBOL_MATH: 2932 ret = xmlUCSIsCatSm(codepoint); 2933 break; 2934 case XML_REGEXP_SYMBOL_CURRENCY: 2935 ret = xmlUCSIsCatSc(codepoint); 2936 break; 2937 case XML_REGEXP_SYMBOL_MODIFIER: 2938 ret = xmlUCSIsCatSk(codepoint); 2939 break; 2940 case XML_REGEXP_SYMBOL_OTHERS: 2941 ret = xmlUCSIsCatSo(codepoint); 2942 break; 2943 case XML_REGEXP_OTHER: 2944 ret = xmlUCSIsCatC(codepoint); 2945 break; 2946 case XML_REGEXP_OTHER_CONTROL: 2947 ret = xmlUCSIsCatCc(codepoint); 2948 break; 2949 case XML_REGEXP_OTHER_FORMAT: 2950 ret = xmlUCSIsCatCf(codepoint); 2951 break; 2952 case XML_REGEXP_OTHER_PRIVATE: 2953 ret = xmlUCSIsCatCo(codepoint); 2954 break; 2955 case XML_REGEXP_OTHER_NA: 2956 /* ret = xmlUCSIsCatCn(codepoint); */ 2957 /* Seems it doesn't exist anymore in recent Unicode releases */ 2958 ret = 0; 2959 break; 2960 case XML_REGEXP_BLOCK_NAME: 2961 ret = xmlUCSIsBlock(codepoint, (const char *) blockName); 2962 break; 2963 } 2964 if (neg) 2965 return(!ret); 2966 return(ret); 2967 } 2968 2969 static int 2970 xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) { 2971 int i, ret = 0; 2972 xmlRegRangePtr range; 2973 2974 if ((atom == NULL) || (!IS_CHAR(codepoint))) 2975 return(-1); 2976 2977 switch (atom->type) { 2978 case XML_REGEXP_SUBREG: 2979 case XML_REGEXP_EPSILON: 2980 return(-1); 2981 case XML_REGEXP_CHARVAL: 2982 return(codepoint == atom->codepoint); 2983 case XML_REGEXP_RANGES: { 2984 int accept = 0; 2985 2986 for (i = 0;i < atom->nbRanges;i++) { 2987 range = atom->ranges[i]; 2988 if (range->neg == 2) { 2989 ret = xmlRegCheckCharacterRange(range->type, codepoint, 2990 0, range->start, range->end, 2991 range->blockName); 2992 if (ret != 0) 2993 return(0); /* excluded char */ 2994 } else if (range->neg) { 2995 ret = xmlRegCheckCharacterRange(range->type, codepoint, 2996 0, range->start, range->end, 2997 range->blockName); 2998 if (ret == 0) 2999 accept = 1; 3000 else 3001 return(0); 3002 } else { 3003 ret = xmlRegCheckCharacterRange(range->type, codepoint, 3004 0, range->start, range->end, 3005 range->blockName); 3006 if (ret != 0) 3007 accept = 1; /* might still be excluded */ 3008 } 3009 } 3010 return(accept); 3011 } 3012 case XML_REGEXP_STRING: 3013 printf("TODO: XML_REGEXP_STRING\n"); 3014 return(-1); 3015 case XML_REGEXP_ANYCHAR: 3016 case XML_REGEXP_ANYSPACE: 3017 case XML_REGEXP_NOTSPACE: 3018 case XML_REGEXP_INITNAME: 3019 case XML_REGEXP_NOTINITNAME: 3020 case XML_REGEXP_NAMECHAR: 3021 case XML_REGEXP_NOTNAMECHAR: 3022 case XML_REGEXP_DECIMAL: 3023 case XML_REGEXP_NOTDECIMAL: 3024 case XML_REGEXP_REALCHAR: 3025 case XML_REGEXP_NOTREALCHAR: 3026 case XML_REGEXP_LETTER: 3027 case XML_REGEXP_LETTER_UPPERCASE: 3028 case XML_REGEXP_LETTER_LOWERCASE: 3029 case XML_REGEXP_LETTER_TITLECASE: 3030 case XML_REGEXP_LETTER_MODIFIER: 3031 case XML_REGEXP_LETTER_OTHERS: 3032 case XML_REGEXP_MARK: 3033 case XML_REGEXP_MARK_NONSPACING: 3034 case XML_REGEXP_MARK_SPACECOMBINING: 3035 case XML_REGEXP_MARK_ENCLOSING: 3036 case XML_REGEXP_NUMBER: 3037 case XML_REGEXP_NUMBER_DECIMAL: 3038 case XML_REGEXP_NUMBER_LETTER: 3039 case XML_REGEXP_NUMBER_OTHERS: 3040 case XML_REGEXP_PUNCT: 3041 case XML_REGEXP_PUNCT_CONNECTOR: 3042 case XML_REGEXP_PUNCT_DASH: 3043 case XML_REGEXP_PUNCT_OPEN: 3044 case XML_REGEXP_PUNCT_CLOSE: 3045 case XML_REGEXP_PUNCT_INITQUOTE: 3046 case XML_REGEXP_PUNCT_FINQUOTE: 3047 case XML_REGEXP_PUNCT_OTHERS: 3048 case XML_REGEXP_SEPAR: 3049 case XML_REGEXP_SEPAR_SPACE: 3050 case XML_REGEXP_SEPAR_LINE: 3051 case XML_REGEXP_SEPAR_PARA: 3052 case XML_REGEXP_SYMBOL: 3053 case XML_REGEXP_SYMBOL_MATH: 3054 case XML_REGEXP_SYMBOL_CURRENCY: 3055 case XML_REGEXP_SYMBOL_MODIFIER: 3056 case XML_REGEXP_SYMBOL_OTHERS: 3057 case XML_REGEXP_OTHER: 3058 case XML_REGEXP_OTHER_CONTROL: 3059 case XML_REGEXP_OTHER_FORMAT: 3060 case XML_REGEXP_OTHER_PRIVATE: 3061 case XML_REGEXP_OTHER_NA: 3062 case XML_REGEXP_BLOCK_NAME: 3063 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0, 3064 (const xmlChar *)atom->valuep); 3065 if (atom->neg) 3066 ret = !ret; 3067 break; 3068 } 3069 return(ret); 3070 } 3071 3072 /************************************************************************ 3073 * * 3074 * Saving and restoring state of an execution context * 3075 * * 3076 ************************************************************************/ 3077 3078 #ifdef DEBUG_REGEXP_EXEC 3079 static void 3080 xmlFARegDebugExec(xmlRegExecCtxtPtr exec) { 3081 printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index); 3082 if (exec->inputStack != NULL) { 3083 int i; 3084 printf(": "); 3085 for (i = 0;(i < 3) && (i < exec->inputStackNr);i++) 3086 printf("%s ", (const char *) 3087 exec->inputStack[exec->inputStackNr - (i + 1)].value); 3088 } else { 3089 printf(": %s", &(exec->inputString[exec->index])); 3090 } 3091 printf("\n"); 3092 } 3093 #endif 3094 3095 static void 3096 xmlFARegExecSave(xmlRegExecCtxtPtr exec) { 3097 #ifdef DEBUG_REGEXP_EXEC 3098 printf("saving "); 3099 exec->transno++; 3100 xmlFARegDebugExec(exec); 3101 exec->transno--; 3102 #endif 3103 #ifdef MAX_PUSH 3104 if (exec->nbPush > MAX_PUSH) { 3105 return; 3106 } 3107 exec->nbPush++; 3108 #endif 3109 3110 if (exec->maxRollbacks == 0) { 3111 exec->maxRollbacks = 4; 3112 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks * 3113 sizeof(xmlRegExecRollback)); 3114 if (exec->rollbacks == NULL) { 3115 xmlRegexpErrMemory(NULL, "saving regexp"); 3116 exec->maxRollbacks = 0; 3117 return; 3118 } 3119 memset(exec->rollbacks, 0, 3120 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 3121 } else if (exec->nbRollbacks >= exec->maxRollbacks) { 3122 xmlRegExecRollback *tmp; 3123 int len = exec->maxRollbacks; 3124 3125 exec->maxRollbacks *= 2; 3126 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks, 3127 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 3128 if (tmp == NULL) { 3129 xmlRegexpErrMemory(NULL, "saving regexp"); 3130 exec->maxRollbacks /= 2; 3131 return; 3132 } 3133 exec->rollbacks = tmp; 3134 tmp = &exec->rollbacks[len]; 3135 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback)); 3136 } 3137 exec->rollbacks[exec->nbRollbacks].state = exec->state; 3138 exec->rollbacks[exec->nbRollbacks].index = exec->index; 3139 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1; 3140 if (exec->comp->nbCounters > 0) { 3141 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 3142 exec->rollbacks[exec->nbRollbacks].counts = (int *) 3143 xmlMalloc(exec->comp->nbCounters * sizeof(int)); 3144 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 3145 xmlRegexpErrMemory(NULL, "saving regexp"); 3146 exec->status = -5; 3147 return; 3148 } 3149 } 3150 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts, 3151 exec->comp->nbCounters * sizeof(int)); 3152 } 3153 exec->nbRollbacks++; 3154 } 3155 3156 static void 3157 xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) { 3158 if (exec->nbRollbacks <= 0) { 3159 exec->status = -1; 3160 #ifdef DEBUG_REGEXP_EXEC 3161 printf("rollback failed on empty stack\n"); 3162 #endif 3163 return; 3164 } 3165 exec->nbRollbacks--; 3166 exec->state = exec->rollbacks[exec->nbRollbacks].state; 3167 exec->index = exec->rollbacks[exec->nbRollbacks].index; 3168 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch; 3169 if (exec->comp->nbCounters > 0) { 3170 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 3171 fprintf(stderr, "exec save: allocation failed"); 3172 exec->status = -6; 3173 return; 3174 } 3175 if (exec->counts) { 3176 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts, 3177 exec->comp->nbCounters * sizeof(int)); 3178 } 3179 } 3180 3181 #ifdef DEBUG_REGEXP_EXEC 3182 printf("restored "); 3183 xmlFARegDebugExec(exec); 3184 #endif 3185 } 3186 3187 /************************************************************************ 3188 * * 3189 * Verifier, running an input against a compiled regexp * 3190 * * 3191 ************************************************************************/ 3192 3193 static int 3194 xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) { 3195 xmlRegExecCtxt execval; 3196 xmlRegExecCtxtPtr exec = &execval; 3197 int ret, codepoint = 0, len, deter; 3198 3199 exec->inputString = content; 3200 exec->index = 0; 3201 exec->nbPush = 0; 3202 exec->determinist = 1; 3203 exec->maxRollbacks = 0; 3204 exec->nbRollbacks = 0; 3205 exec->rollbacks = NULL; 3206 exec->status = 0; 3207 exec->comp = comp; 3208 exec->state = comp->states[0]; 3209 exec->transno = 0; 3210 exec->transcount = 0; 3211 exec->inputStack = NULL; 3212 exec->inputStackMax = 0; 3213 if (comp->nbCounters > 0) { 3214 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)); 3215 if (exec->counts == NULL) { 3216 xmlRegexpErrMemory(NULL, "running regexp"); 3217 return(-1); 3218 } 3219 memset(exec->counts, 0, comp->nbCounters * sizeof(int)); 3220 } else 3221 exec->counts = NULL; 3222 while ((exec->status == 0) && (exec->state != NULL) && 3223 ((exec->inputString[exec->index] != 0) || 3224 ((exec->state != NULL) && 3225 (exec->state->type != XML_REGEXP_FINAL_STATE)))) { 3226 xmlRegTransPtr trans; 3227 xmlRegAtomPtr atom; 3228 3229 /* 3230 * If end of input on non-terminal state, rollback, however we may 3231 * still have epsilon like transition for counted transitions 3232 * on counters, in that case don't break too early. Additionally, 3233 * if we are working on a range like "AB{0,2}", where B is not present, 3234 * we don't want to break. 3235 */ 3236 len = 1; 3237 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) { 3238 /* 3239 * if there is a transition, we must check if 3240 * atom allows minOccurs of 0 3241 */ 3242 if (exec->transno < exec->state->nbTrans) { 3243 trans = &exec->state->trans[exec->transno]; 3244 if (trans->to >=0) { 3245 atom = trans->atom; 3246 if (!((atom->min == 0) && (atom->max > 0))) 3247 goto rollback; 3248 } 3249 } else 3250 goto rollback; 3251 } 3252 3253 exec->transcount = 0; 3254 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 3255 trans = &exec->state->trans[exec->transno]; 3256 if (trans->to < 0) 3257 continue; 3258 atom = trans->atom; 3259 ret = 0; 3260 deter = 1; 3261 if (trans->count >= 0) { 3262 int count; 3263 xmlRegCounterPtr counter; 3264 3265 if (exec->counts == NULL) { 3266 exec->status = -1; 3267 goto error; 3268 } 3269 /* 3270 * A counted transition. 3271 */ 3272 3273 count = exec->counts[trans->count]; 3274 counter = &exec->comp->counters[trans->count]; 3275 #ifdef DEBUG_REGEXP_EXEC 3276 printf("testing count %d: val %d, min %d, max %d\n", 3277 trans->count, count, counter->min, counter->max); 3278 #endif 3279 ret = ((count >= counter->min) && (count <= counter->max)); 3280 if ((ret) && (counter->min != counter->max)) 3281 deter = 0; 3282 } else if (atom == NULL) { 3283 fprintf(stderr, "epsilon transition left at runtime\n"); 3284 exec->status = -2; 3285 break; 3286 } else if (exec->inputString[exec->index] != 0) { 3287 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 3288 ret = xmlRegCheckCharacter(atom, codepoint); 3289 if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) { 3290 xmlRegStatePtr to = comp->states[trans->to]; 3291 3292 /* 3293 * this is a multiple input sequence 3294 * If there is a counter associated increment it now. 3295 * before potentially saving and rollback 3296 * do not increment if the counter is already over the 3297 * maximum limit in which case get to next transition 3298 */ 3299 if (trans->counter >= 0) { 3300 xmlRegCounterPtr counter; 3301 3302 if ((exec->counts == NULL) || 3303 (exec->comp == NULL) || 3304 (exec->comp->counters == NULL)) { 3305 exec->status = -1; 3306 goto error; 3307 } 3308 counter = &exec->comp->counters[trans->counter]; 3309 if (exec->counts[trans->counter] >= counter->max) 3310 continue; /* for loop on transitions */ 3311 3312 #ifdef DEBUG_REGEXP_EXEC 3313 printf("Increasing count %d\n", trans->counter); 3314 #endif 3315 exec->counts[trans->counter]++; 3316 } 3317 if (exec->state->nbTrans > exec->transno + 1) { 3318 xmlFARegExecSave(exec); 3319 } 3320 exec->transcount = 1; 3321 do { 3322 /* 3323 * Try to progress as much as possible on the input 3324 */ 3325 if (exec->transcount == atom->max) { 3326 break; 3327 } 3328 exec->index += len; 3329 /* 3330 * End of input: stop here 3331 */ 3332 if (exec->inputString[exec->index] == 0) { 3333 exec->index -= len; 3334 break; 3335 } 3336 if (exec->transcount >= atom->min) { 3337 int transno = exec->transno; 3338 xmlRegStatePtr state = exec->state; 3339 3340 /* 3341 * The transition is acceptable save it 3342 */ 3343 exec->transno = -1; /* trick */ 3344 exec->state = to; 3345 xmlFARegExecSave(exec); 3346 exec->transno = transno; 3347 exec->state = state; 3348 } 3349 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 3350 len); 3351 ret = xmlRegCheckCharacter(atom, codepoint); 3352 exec->transcount++; 3353 } while (ret == 1); 3354 if (exec->transcount < atom->min) 3355 ret = 0; 3356 3357 /* 3358 * If the last check failed but one transition was found 3359 * possible, rollback 3360 */ 3361 if (ret < 0) 3362 ret = 0; 3363 if (ret == 0) { 3364 goto rollback; 3365 } 3366 if (trans->counter >= 0) { 3367 if (exec->counts == NULL) { 3368 exec->status = -1; 3369 goto error; 3370 } 3371 #ifdef DEBUG_REGEXP_EXEC 3372 printf("Decreasing count %d\n", trans->counter); 3373 #endif 3374 exec->counts[trans->counter]--; 3375 } 3376 } else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) { 3377 /* 3378 * we don't match on the codepoint, but minOccurs of 0 3379 * says that's ok. Setting len to 0 inhibits stepping 3380 * over the codepoint. 3381 */ 3382 exec->transcount = 1; 3383 len = 0; 3384 ret = 1; 3385 } 3386 } else if ((atom->min == 0) && (atom->max > 0)) { 3387 /* another spot to match when minOccurs is 0 */ 3388 exec->transcount = 1; 3389 len = 0; 3390 ret = 1; 3391 } 3392 if (ret == 1) { 3393 if ((trans->nd == 1) || 3394 ((trans->count >= 0) && (deter == 0) && 3395 (exec->state->nbTrans > exec->transno + 1))) { 3396 #ifdef DEBUG_REGEXP_EXEC 3397 if (trans->nd == 1) 3398 printf("Saving on nd transition atom %d for %c at %d\n", 3399 trans->atom->no, codepoint, exec->index); 3400 else 3401 printf("Saving on counted transition count %d for %c at %d\n", 3402 trans->count, codepoint, exec->index); 3403 #endif 3404 xmlFARegExecSave(exec); 3405 } 3406 if (trans->counter >= 0) { 3407 xmlRegCounterPtr counter; 3408 3409 /* make sure we don't go over the counter maximum value */ 3410 if ((exec->counts == NULL) || 3411 (exec->comp == NULL) || 3412 (exec->comp->counters == NULL)) { 3413 exec->status = -1; 3414 goto error; 3415 } 3416 counter = &exec->comp->counters[trans->counter]; 3417 if (exec->counts[trans->counter] >= counter->max) 3418 continue; /* for loop on transitions */ 3419 #ifdef DEBUG_REGEXP_EXEC 3420 printf("Increasing count %d\n", trans->counter); 3421 #endif 3422 exec->counts[trans->counter]++; 3423 } 3424 if ((trans->count >= 0) && 3425 (trans->count < REGEXP_ALL_COUNTER)) { 3426 if (exec->counts == NULL) { 3427 exec->status = -1; 3428 goto error; 3429 } 3430 #ifdef DEBUG_REGEXP_EXEC 3431 printf("resetting count %d on transition\n", 3432 trans->count); 3433 #endif 3434 exec->counts[trans->count] = 0; 3435 } 3436 #ifdef DEBUG_REGEXP_EXEC 3437 printf("entering state %d\n", trans->to); 3438 #endif 3439 exec->state = comp->states[trans->to]; 3440 exec->transno = 0; 3441 if (trans->atom != NULL) { 3442 exec->index += len; 3443 } 3444 goto progress; 3445 } else if (ret < 0) { 3446 exec->status = -4; 3447 break; 3448 } 3449 } 3450 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 3451 rollback: 3452 /* 3453 * Failed to find a way out 3454 */ 3455 exec->determinist = 0; 3456 #ifdef DEBUG_REGEXP_EXEC 3457 printf("rollback from state %d on %d:%c\n", exec->state->no, 3458 codepoint,codepoint); 3459 #endif 3460 xmlFARegExecRollBack(exec); 3461 } 3462 progress: 3463 continue; 3464 } 3465 error: 3466 if (exec->rollbacks != NULL) { 3467 if (exec->counts != NULL) { 3468 int i; 3469 3470 for (i = 0;i < exec->maxRollbacks;i++) 3471 if (exec->rollbacks[i].counts != NULL) 3472 xmlFree(exec->rollbacks[i].counts); 3473 } 3474 xmlFree(exec->rollbacks); 3475 } 3476 if (exec->state == NULL) 3477 return(-1); 3478 if (exec->counts != NULL) 3479 xmlFree(exec->counts); 3480 if (exec->status == 0) 3481 return(1); 3482 if (exec->status == -1) { 3483 if (exec->nbPush > MAX_PUSH) 3484 return(-1); 3485 return(0); 3486 } 3487 return(exec->status); 3488 } 3489 3490 /************************************************************************ 3491 * * 3492 * Progressive interface to the verifier one atom at a time * 3493 * * 3494 ************************************************************************/ 3495 #ifdef DEBUG_ERR 3496 static void testerr(xmlRegExecCtxtPtr exec); 3497 #endif 3498 3499 /** 3500 * xmlRegNewExecCtxt: 3501 * @comp: a precompiled regular expression 3502 * @callback: a callback function used for handling progresses in the 3503 * automata matching phase 3504 * @data: the context data associated to the callback in this context 3505 * 3506 * Build a context used for progressive evaluation of a regexp. 3507 * 3508 * Returns the new context 3509 */ 3510 xmlRegExecCtxtPtr 3511 xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) { 3512 xmlRegExecCtxtPtr exec; 3513 3514 if (comp == NULL) 3515 return(NULL); 3516 if ((comp->compact == NULL) && (comp->states == NULL)) 3517 return(NULL); 3518 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt)); 3519 if (exec == NULL) { 3520 xmlRegexpErrMemory(NULL, "creating execution context"); 3521 return(NULL); 3522 } 3523 memset(exec, 0, sizeof(xmlRegExecCtxt)); 3524 exec->inputString = NULL; 3525 exec->index = 0; 3526 exec->determinist = 1; 3527 exec->maxRollbacks = 0; 3528 exec->nbRollbacks = 0; 3529 exec->rollbacks = NULL; 3530 exec->status = 0; 3531 exec->comp = comp; 3532 if (comp->compact == NULL) 3533 exec->state = comp->states[0]; 3534 exec->transno = 0; 3535 exec->transcount = 0; 3536 exec->callback = callback; 3537 exec->data = data; 3538 if (comp->nbCounters > 0) { 3539 /* 3540 * For error handling, exec->counts is allocated twice the size 3541 * the second half is used to store the data in case of rollback 3542 */ 3543 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int) 3544 * 2); 3545 if (exec->counts == NULL) { 3546 xmlRegexpErrMemory(NULL, "creating execution context"); 3547 xmlFree(exec); 3548 return(NULL); 3549 } 3550 memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2); 3551 exec->errCounts = &exec->counts[comp->nbCounters]; 3552 } else { 3553 exec->counts = NULL; 3554 exec->errCounts = NULL; 3555 } 3556 exec->inputStackMax = 0; 3557 exec->inputStackNr = 0; 3558 exec->inputStack = NULL; 3559 exec->errStateNo = -1; 3560 exec->errString = NULL; 3561 exec->nbPush = 0; 3562 return(exec); 3563 } 3564 3565 /** 3566 * xmlRegFreeExecCtxt: 3567 * @exec: a regular expression evaulation context 3568 * 3569 * Free the structures associated to a regular expression evaulation context. 3570 */ 3571 void 3572 xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) { 3573 if (exec == NULL) 3574 return; 3575 3576 if (exec->rollbacks != NULL) { 3577 if (exec->counts != NULL) { 3578 int i; 3579 3580 for (i = 0;i < exec->maxRollbacks;i++) 3581 if (exec->rollbacks[i].counts != NULL) 3582 xmlFree(exec->rollbacks[i].counts); 3583 } 3584 xmlFree(exec->rollbacks); 3585 } 3586 if (exec->counts != NULL) 3587 xmlFree(exec->counts); 3588 if (exec->inputStack != NULL) { 3589 int i; 3590 3591 for (i = 0;i < exec->inputStackNr;i++) { 3592 if (exec->inputStack[i].value != NULL) 3593 xmlFree(exec->inputStack[i].value); 3594 } 3595 xmlFree(exec->inputStack); 3596 } 3597 if (exec->errString != NULL) 3598 xmlFree(exec->errString); 3599 xmlFree(exec); 3600 } 3601 3602 static void 3603 xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value, 3604 void *data) { 3605 #ifdef DEBUG_PUSH 3606 printf("saving value: %d:%s\n", exec->inputStackNr, value); 3607 #endif 3608 if (exec->inputStackMax == 0) { 3609 exec->inputStackMax = 4; 3610 exec->inputStack = (xmlRegInputTokenPtr) 3611 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken)); 3612 if (exec->inputStack == NULL) { 3613 xmlRegexpErrMemory(NULL, "pushing input string"); 3614 exec->inputStackMax = 0; 3615 return; 3616 } 3617 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) { 3618 xmlRegInputTokenPtr tmp; 3619 3620 exec->inputStackMax *= 2; 3621 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack, 3622 exec->inputStackMax * sizeof(xmlRegInputToken)); 3623 if (tmp == NULL) { 3624 xmlRegexpErrMemory(NULL, "pushing input string"); 3625 exec->inputStackMax /= 2; 3626 return; 3627 } 3628 exec->inputStack = tmp; 3629 } 3630 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value); 3631 exec->inputStack[exec->inputStackNr].data = data; 3632 exec->inputStackNr++; 3633 exec->inputStack[exec->inputStackNr].value = NULL; 3634 exec->inputStack[exec->inputStackNr].data = NULL; 3635 } 3636 3637 /** 3638 * xmlRegStrEqualWildcard: 3639 * @expStr: the string to be evaluated 3640 * @valStr: the validation string 3641 * 3642 * Checks if both strings are equal or have the same content. "*" 3643 * can be used as a wildcard in @valStr; "|" is used as a seperator of 3644 * substrings in both @expStr and @valStr. 3645 * 3646 * Returns 1 if the comparison is satisfied and the number of substrings 3647 * is equal, 0 otherwise. 3648 */ 3649 3650 static int 3651 xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) { 3652 if (expStr == valStr) return(1); 3653 if (expStr == NULL) return(0); 3654 if (valStr == NULL) return(0); 3655 do { 3656 /* 3657 * Eval if we have a wildcard for the current item. 3658 */ 3659 if (*expStr != *valStr) { 3660 /* if one of them starts with a wildcard make valStr be it */ 3661 if (*valStr == '*') { 3662 const xmlChar *tmp; 3663 3664 tmp = valStr; 3665 valStr = expStr; 3666 expStr = tmp; 3667 } 3668 if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) { 3669 do { 3670 if (*valStr == XML_REG_STRING_SEPARATOR) 3671 break; 3672 valStr++; 3673 } while (*valStr != 0); 3674 continue; 3675 } else 3676 return(0); 3677 } 3678 expStr++; 3679 valStr++; 3680 } while (*valStr != 0); 3681 if (*expStr != 0) 3682 return (0); 3683 else 3684 return (1); 3685 } 3686 3687 /** 3688 * xmlRegCompactPushString: 3689 * @exec: a regexp execution context 3690 * @comp: the precompiled exec with a compact table 3691 * @value: a string token input 3692 * @data: data associated to the token to reuse in callbacks 3693 * 3694 * Push one input token in the execution context 3695 * 3696 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 3697 * a negative value in case of error. 3698 */ 3699 static int 3700 xmlRegCompactPushString(xmlRegExecCtxtPtr exec, 3701 xmlRegexpPtr comp, 3702 const xmlChar *value, 3703 void *data) { 3704 int state = exec->index; 3705 int i, target; 3706 3707 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL)) 3708 return(-1); 3709 3710 if (value == NULL) { 3711 /* 3712 * are we at a final state ? 3713 */ 3714 if (comp->compact[state * (comp->nbstrings + 1)] == 3715 XML_REGEXP_FINAL_STATE) 3716 return(1); 3717 return(0); 3718 } 3719 3720 #ifdef DEBUG_PUSH 3721 printf("value pushed: %s\n", value); 3722 #endif 3723 3724 /* 3725 * Examine all outside transitions from current state 3726 */ 3727 for (i = 0;i < comp->nbstrings;i++) { 3728 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 3729 if ((target > 0) && (target <= comp->nbstates)) { 3730 target--; /* to avoid 0 */ 3731 if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) { 3732 exec->index = target; 3733 if ((exec->callback != NULL) && (comp->transdata != NULL)) { 3734 exec->callback(exec->data, value, 3735 comp->transdata[state * comp->nbstrings + i], data); 3736 } 3737 #ifdef DEBUG_PUSH 3738 printf("entering state %d\n", target); 3739 #endif 3740 if (comp->compact[target * (comp->nbstrings + 1)] == 3741 XML_REGEXP_SINK_STATE) 3742 goto error; 3743 3744 if (comp->compact[target * (comp->nbstrings + 1)] == 3745 XML_REGEXP_FINAL_STATE) 3746 return(1); 3747 return(0); 3748 } 3749 } 3750 } 3751 /* 3752 * Failed to find an exit transition out from current state for the 3753 * current token 3754 */ 3755 #ifdef DEBUG_PUSH 3756 printf("failed to find a transition for %s on state %d\n", value, state); 3757 #endif 3758 error: 3759 if (exec->errString != NULL) 3760 xmlFree(exec->errString); 3761 exec->errString = xmlStrdup(value); 3762 exec->errStateNo = state; 3763 exec->status = -1; 3764 #ifdef DEBUG_ERR 3765 testerr(exec); 3766 #endif 3767 return(-1); 3768 } 3769 3770 /** 3771 * xmlRegExecPushStringInternal: 3772 * @exec: a regexp execution context or NULL to indicate the end 3773 * @value: a string token input 3774 * @data: data associated to the token to reuse in callbacks 3775 * @compound: value was assembled from 2 strings 3776 * 3777 * Push one input token in the execution context 3778 * 3779 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 3780 * a negative value in case of error. 3781 */ 3782 static int 3783 xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value, 3784 void *data, int compound) { 3785 xmlRegTransPtr trans; 3786 xmlRegAtomPtr atom; 3787 int ret; 3788 int final = 0; 3789 int progress = 1; 3790 3791 if (exec == NULL) 3792 return(-1); 3793 if (exec->comp == NULL) 3794 return(-1); 3795 if (exec->status != 0) 3796 return(exec->status); 3797 3798 if (exec->comp->compact != NULL) 3799 return(xmlRegCompactPushString(exec, exec->comp, value, data)); 3800 3801 if (value == NULL) { 3802 if (exec->state->type == XML_REGEXP_FINAL_STATE) 3803 return(1); 3804 final = 1; 3805 } 3806 3807 #ifdef DEBUG_PUSH 3808 printf("value pushed: %s\n", value); 3809 #endif 3810 /* 3811 * If we have an active rollback stack push the new value there 3812 * and get back to where we were left 3813 */ 3814 if ((value != NULL) && (exec->inputStackNr > 0)) { 3815 xmlFARegExecSaveInputString(exec, value, data); 3816 value = exec->inputStack[exec->index].value; 3817 data = exec->inputStack[exec->index].data; 3818 #ifdef DEBUG_PUSH 3819 printf("value loaded: %s\n", value); 3820 #endif 3821 } 3822 3823 while ((exec->status == 0) && 3824 ((value != NULL) || 3825 ((final == 1) && 3826 (exec->state->type != XML_REGEXP_FINAL_STATE)))) { 3827 3828 /* 3829 * End of input on non-terminal state, rollback, however we may 3830 * still have epsilon like transition for counted transitions 3831 * on counters, in that case don't break too early. 3832 */ 3833 if ((value == NULL) && (exec->counts == NULL)) 3834 goto rollback; 3835 3836 exec->transcount = 0; 3837 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 3838 trans = &exec->state->trans[exec->transno]; 3839 if (trans->to < 0) 3840 continue; 3841 atom = trans->atom; 3842 ret = 0; 3843 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 3844 int i; 3845 int count; 3846 xmlRegTransPtr t; 3847 xmlRegCounterPtr counter; 3848 3849 ret = 0; 3850 3851 #ifdef DEBUG_PUSH 3852 printf("testing all lax %d\n", trans->count); 3853 #endif 3854 /* 3855 * Check all counted transitions from the current state 3856 */ 3857 if ((value == NULL) && (final)) { 3858 ret = 1; 3859 } else if (value != NULL) { 3860 for (i = 0;i < exec->state->nbTrans;i++) { 3861 t = &exec->state->trans[i]; 3862 if ((t->counter < 0) || (t == trans)) 3863 continue; 3864 counter = &exec->comp->counters[t->counter]; 3865 count = exec->counts[t->counter]; 3866 if ((count < counter->max) && 3867 (t->atom != NULL) && 3868 (xmlStrEqual(value, t->atom->valuep))) { 3869 ret = 0; 3870 break; 3871 } 3872 if ((count >= counter->min) && 3873 (count < counter->max) && 3874 (t->atom != NULL) && 3875 (xmlStrEqual(value, t->atom->valuep))) { 3876 ret = 1; 3877 break; 3878 } 3879 } 3880 } 3881 } else if (trans->count == REGEXP_ALL_COUNTER) { 3882 int i; 3883 int count; 3884 xmlRegTransPtr t; 3885 xmlRegCounterPtr counter; 3886 3887 ret = 1; 3888 3889 #ifdef DEBUG_PUSH 3890 printf("testing all %d\n", trans->count); 3891 #endif 3892 /* 3893 * Check all counted transitions from the current state 3894 */ 3895 for (i = 0;i < exec->state->nbTrans;i++) { 3896 t = &exec->state->trans[i]; 3897 if ((t->counter < 0) || (t == trans)) 3898 continue; 3899 counter = &exec->comp->counters[t->counter]; 3900 count = exec->counts[t->counter]; 3901 if ((count < counter->min) || (count > counter->max)) { 3902 ret = 0; 3903 break; 3904 } 3905 } 3906 } else if (trans->count >= 0) { 3907 int count; 3908 xmlRegCounterPtr counter; 3909 3910 /* 3911 * A counted transition. 3912 */ 3913 3914 count = exec->counts[trans->count]; 3915 counter = &exec->comp->counters[trans->count]; 3916 #ifdef DEBUG_PUSH 3917 printf("testing count %d: val %d, min %d, max %d\n", 3918 trans->count, count, counter->min, counter->max); 3919 #endif 3920 ret = ((count >= counter->min) && (count <= counter->max)); 3921 } else if (atom == NULL) { 3922 fprintf(stderr, "epsilon transition left at runtime\n"); 3923 exec->status = -2; 3924 break; 3925 } else if (value != NULL) { 3926 ret = xmlRegStrEqualWildcard(atom->valuep, value); 3927 if (atom->neg) { 3928 ret = !ret; 3929 if (!compound) 3930 ret = 0; 3931 } 3932 if ((ret == 1) && (trans->counter >= 0)) { 3933 xmlRegCounterPtr counter; 3934 int count; 3935 3936 count = exec->counts[trans->counter]; 3937 counter = &exec->comp->counters[trans->counter]; 3938 if (count >= counter->max) 3939 ret = 0; 3940 } 3941 3942 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 3943 xmlRegStatePtr to = exec->comp->states[trans->to]; 3944 3945 /* 3946 * this is a multiple input sequence 3947 */ 3948 if (exec->state->nbTrans > exec->transno + 1) { 3949 if (exec->inputStackNr <= 0) { 3950 xmlFARegExecSaveInputString(exec, value, data); 3951 } 3952 xmlFARegExecSave(exec); 3953 } 3954 exec->transcount = 1; 3955 do { 3956 /* 3957 * Try to progress as much as possible on the input 3958 */ 3959 if (exec->transcount == atom->max) { 3960 break; 3961 } 3962 exec->index++; 3963 value = exec->inputStack[exec->index].value; 3964 data = exec->inputStack[exec->index].data; 3965 #ifdef DEBUG_PUSH 3966 printf("value loaded: %s\n", value); 3967 #endif 3968 3969 /* 3970 * End of input: stop here 3971 */ 3972 if (value == NULL) { 3973 exec->index --; 3974 break; 3975 } 3976 if (exec->transcount >= atom->min) { 3977 int transno = exec->transno; 3978 xmlRegStatePtr state = exec->state; 3979 3980 /* 3981 * The transition is acceptable save it 3982 */ 3983 exec->transno = -1; /* trick */ 3984 exec->state = to; 3985 if (exec->inputStackNr <= 0) { 3986 xmlFARegExecSaveInputString(exec, value, data); 3987 } 3988 xmlFARegExecSave(exec); 3989 exec->transno = transno; 3990 exec->state = state; 3991 } 3992 ret = xmlStrEqual(value, atom->valuep); 3993 exec->transcount++; 3994 } while (ret == 1); 3995 if (exec->transcount < atom->min) 3996 ret = 0; 3997 3998 /* 3999 * If the last check failed but one transition was found 4000 * possible, rollback 4001 */ 4002 if (ret < 0) 4003 ret = 0; 4004 if (ret == 0) { 4005 goto rollback; 4006 } 4007 } 4008 } 4009 if (ret == 1) { 4010 if ((exec->callback != NULL) && (atom != NULL) && 4011 (data != NULL)) { 4012 exec->callback(exec->data, atom->valuep, 4013 atom->data, data); 4014 } 4015 if (exec->state->nbTrans > exec->transno + 1) { 4016 if (exec->inputStackNr <= 0) { 4017 xmlFARegExecSaveInputString(exec, value, data); 4018 } 4019 xmlFARegExecSave(exec); 4020 } 4021 if (trans->counter >= 0) { 4022 #ifdef DEBUG_PUSH 4023 printf("Increasing count %d\n", trans->counter); 4024 #endif 4025 exec->counts[trans->counter]++; 4026 } 4027 if ((trans->count >= 0) && 4028 (trans->count < REGEXP_ALL_COUNTER)) { 4029 #ifdef DEBUG_REGEXP_EXEC 4030 printf("resetting count %d on transition\n", 4031 trans->count); 4032 #endif 4033 exec->counts[trans->count] = 0; 4034 } 4035 #ifdef DEBUG_PUSH 4036 printf("entering state %d\n", trans->to); 4037 #endif 4038 if ((exec->comp->states[trans->to] != NULL) && 4039 (exec->comp->states[trans->to]->type == 4040 XML_REGEXP_SINK_STATE)) { 4041 /* 4042 * entering a sink state, save the current state as error 4043 * state. 4044 */ 4045 if (exec->errString != NULL) 4046 xmlFree(exec->errString); 4047 exec->errString = xmlStrdup(value); 4048 exec->errState = exec->state; 4049 memcpy(exec->errCounts, exec->counts, 4050 exec->comp->nbCounters * sizeof(int)); 4051 } 4052 exec->state = exec->comp->states[trans->to]; 4053 exec->transno = 0; 4054 if (trans->atom != NULL) { 4055 if (exec->inputStack != NULL) { 4056 exec->index++; 4057 if (exec->index < exec->inputStackNr) { 4058 value = exec->inputStack[exec->index].value; 4059 data = exec->inputStack[exec->index].data; 4060 #ifdef DEBUG_PUSH 4061 printf("value loaded: %s\n", value); 4062 #endif 4063 } else { 4064 value = NULL; 4065 data = NULL; 4066 #ifdef DEBUG_PUSH 4067 printf("end of input\n"); 4068 #endif 4069 } 4070 } else { 4071 value = NULL; 4072 data = NULL; 4073 #ifdef DEBUG_PUSH 4074 printf("end of input\n"); 4075 #endif 4076 } 4077 } 4078 goto progress; 4079 } else if (ret < 0) { 4080 exec->status = -4; 4081 break; 4082 } 4083 } 4084 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 4085 rollback: 4086 /* 4087 * if we didn't yet rollback on the current input 4088 * store the current state as the error state. 4089 */ 4090 if ((progress) && (exec->state != NULL) && 4091 (exec->state->type != XML_REGEXP_SINK_STATE)) { 4092 progress = 0; 4093 if (exec->errString != NULL) 4094 xmlFree(exec->errString); 4095 exec->errString = xmlStrdup(value); 4096 exec->errState = exec->state; 4097 if (exec->comp->nbCounters) 4098 memcpy(exec->errCounts, exec->counts, 4099 exec->comp->nbCounters * sizeof(int)); 4100 } 4101 4102 /* 4103 * Failed to find a way out 4104 */ 4105 exec->determinist = 0; 4106 xmlFARegExecRollBack(exec); 4107 if ((exec->inputStack != NULL ) && (exec->status == 0)) { 4108 value = exec->inputStack[exec->index].value; 4109 data = exec->inputStack[exec->index].data; 4110 #ifdef DEBUG_PUSH 4111 printf("value loaded: %s\n", value); 4112 #endif 4113 } 4114 } 4115 continue; 4116 progress: 4117 progress = 1; 4118 continue; 4119 } 4120 if (exec->status == 0) { 4121 return(exec->state->type == XML_REGEXP_FINAL_STATE); 4122 } 4123 #ifdef DEBUG_ERR 4124 if (exec->status < 0) { 4125 testerr(exec); 4126 } 4127 #endif 4128 return(exec->status); 4129 } 4130 4131 /** 4132 * xmlRegExecPushString: 4133 * @exec: a regexp execution context or NULL to indicate the end 4134 * @value: a string token input 4135 * @data: data associated to the token to reuse in callbacks 4136 * 4137 * Push one input token in the execution context 4138 * 4139 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 4140 * a negative value in case of error. 4141 */ 4142 int 4143 xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value, 4144 void *data) { 4145 return(xmlRegExecPushStringInternal(exec, value, data, 0)); 4146 } 4147 4148 /** 4149 * xmlRegExecPushString2: 4150 * @exec: a regexp execution context or NULL to indicate the end 4151 * @value: the first string token input 4152 * @value2: the second string token input 4153 * @data: data associated to the token to reuse in callbacks 4154 * 4155 * Push one input token in the execution context 4156 * 4157 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 4158 * a negative value in case of error. 4159 */ 4160 int 4161 xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value, 4162 const xmlChar *value2, void *data) { 4163 xmlChar buf[150]; 4164 int lenn, lenp, ret; 4165 xmlChar *str; 4166 4167 if (exec == NULL) 4168 return(-1); 4169 if (exec->comp == NULL) 4170 return(-1); 4171 if (exec->status != 0) 4172 return(exec->status); 4173 4174 if (value2 == NULL) 4175 return(xmlRegExecPushString(exec, value, data)); 4176 4177 lenn = strlen((char *) value2); 4178 lenp = strlen((char *) value); 4179 4180 if (150 < lenn + lenp + 2) { 4181 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 4182 if (str == NULL) { 4183 exec->status = -1; 4184 return(-1); 4185 } 4186 } else { 4187 str = buf; 4188 } 4189 memcpy(&str[0], value, lenp); 4190 str[lenp] = XML_REG_STRING_SEPARATOR; 4191 memcpy(&str[lenp + 1], value2, lenn); 4192 str[lenn + lenp + 1] = 0; 4193 4194 if (exec->comp->compact != NULL) 4195 ret = xmlRegCompactPushString(exec, exec->comp, str, data); 4196 else 4197 ret = xmlRegExecPushStringInternal(exec, str, data, 1); 4198 4199 if (str != buf) 4200 xmlFree(str); 4201 return(ret); 4202 } 4203 4204 /** 4205 * xmlRegExecGetValues: 4206 * @exec: a regexp execution context 4207 * @err: error extraction or normal one 4208 * @nbval: pointer to the number of accepted values IN/OUT 4209 * @nbneg: return number of negative transitions 4210 * @values: pointer to the array of acceptable values 4211 * @terminal: return value if this was a terminal state 4212 * 4213 * Extract informations from the regexp execution, internal routine to 4214 * implement xmlRegExecNextValues() and xmlRegExecErrInfo() 4215 * 4216 * Returns: 0 in case of success or -1 in case of error. 4217 */ 4218 static int 4219 xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err, 4220 int *nbval, int *nbneg, 4221 xmlChar **values, int *terminal) { 4222 int maxval; 4223 int nb = 0; 4224 4225 if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) || 4226 (values == NULL) || (*nbval <= 0)) 4227 return(-1); 4228 4229 maxval = *nbval; 4230 *nbval = 0; 4231 *nbneg = 0; 4232 if ((exec->comp != NULL) && (exec->comp->compact != NULL)) { 4233 xmlRegexpPtr comp; 4234 int target, i, state; 4235 4236 comp = exec->comp; 4237 4238 if (err) { 4239 if (exec->errStateNo == -1) return(-1); 4240 state = exec->errStateNo; 4241 } else { 4242 state = exec->index; 4243 } 4244 if (terminal != NULL) { 4245 if (comp->compact[state * (comp->nbstrings + 1)] == 4246 XML_REGEXP_FINAL_STATE) 4247 *terminal = 1; 4248 else 4249 *terminal = 0; 4250 } 4251 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) { 4252 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 4253 if ((target > 0) && (target <= comp->nbstates) && 4254 (comp->compact[(target - 1) * (comp->nbstrings + 1)] != 4255 XML_REGEXP_SINK_STATE)) { 4256 values[nb++] = comp->stringMap[i]; 4257 (*nbval)++; 4258 } 4259 } 4260 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) { 4261 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 4262 if ((target > 0) && (target <= comp->nbstates) && 4263 (comp->compact[(target - 1) * (comp->nbstrings + 1)] == 4264 XML_REGEXP_SINK_STATE)) { 4265 values[nb++] = comp->stringMap[i]; 4266 (*nbneg)++; 4267 } 4268 } 4269 } else { 4270 int transno; 4271 xmlRegTransPtr trans; 4272 xmlRegAtomPtr atom; 4273 xmlRegStatePtr state; 4274 4275 if (terminal != NULL) { 4276 if (exec->state->type == XML_REGEXP_FINAL_STATE) 4277 *terminal = 1; 4278 else 4279 *terminal = 0; 4280 } 4281 4282 if (err) { 4283 if (exec->errState == NULL) return(-1); 4284 state = exec->errState; 4285 } else { 4286 if (exec->state == NULL) return(-1); 4287 state = exec->state; 4288 } 4289 for (transno = 0; 4290 (transno < state->nbTrans) && (nb < maxval); 4291 transno++) { 4292 trans = &state->trans[transno]; 4293 if (trans->to < 0) 4294 continue; 4295 atom = trans->atom; 4296 if ((atom == NULL) || (atom->valuep == NULL)) 4297 continue; 4298 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 4299 /* this should not be reached but ... */ 4300 TODO; 4301 } else if (trans->count == REGEXP_ALL_COUNTER) { 4302 /* this should not be reached but ... */ 4303 TODO; 4304 } else if (trans->counter >= 0) { 4305 xmlRegCounterPtr counter = NULL; 4306 int count; 4307 4308 if (err) 4309 count = exec->errCounts[trans->counter]; 4310 else 4311 count = exec->counts[trans->counter]; 4312 if (exec->comp != NULL) 4313 counter = &exec->comp->counters[trans->counter]; 4314 if ((counter == NULL) || (count < counter->max)) { 4315 if (atom->neg) 4316 values[nb++] = (xmlChar *) atom->valuep2; 4317 else 4318 values[nb++] = (xmlChar *) atom->valuep; 4319 (*nbval)++; 4320 } 4321 } else { 4322 if ((exec->comp != NULL) && (exec->comp->states[trans->to] != NULL) && 4323 (exec->comp->states[trans->to]->type != 4324 XML_REGEXP_SINK_STATE)) { 4325 if (atom->neg) 4326 values[nb++] = (xmlChar *) atom->valuep2; 4327 else 4328 values[nb++] = (xmlChar *) atom->valuep; 4329 (*nbval)++; 4330 } 4331 } 4332 } 4333 for (transno = 0; 4334 (transno < state->nbTrans) && (nb < maxval); 4335 transno++) { 4336 trans = &state->trans[transno]; 4337 if (trans->to < 0) 4338 continue; 4339 atom = trans->atom; 4340 if ((atom == NULL) || (atom->valuep == NULL)) 4341 continue; 4342 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 4343 continue; 4344 } else if (trans->count == REGEXP_ALL_COUNTER) { 4345 continue; 4346 } else if (trans->counter >= 0) { 4347 continue; 4348 } else { 4349 if ((exec->comp->states[trans->to] != NULL) && 4350 (exec->comp->states[trans->to]->type == 4351 XML_REGEXP_SINK_STATE)) { 4352 if (atom->neg) 4353 values[nb++] = (xmlChar *) atom->valuep2; 4354 else 4355 values[nb++] = (xmlChar *) atom->valuep; 4356 (*nbneg)++; 4357 } 4358 } 4359 } 4360 } 4361 return(0); 4362 } 4363 4364 /** 4365 * xmlRegExecNextValues: 4366 * @exec: a regexp execution context 4367 * @nbval: pointer to the number of accepted values IN/OUT 4368 * @nbneg: return number of negative transitions 4369 * @values: pointer to the array of acceptable values 4370 * @terminal: return value if this was a terminal state 4371 * 4372 * Extract informations from the regexp execution, 4373 * the parameter @values must point to an array of @nbval string pointers 4374 * on return nbval will contain the number of possible strings in that 4375 * state and the @values array will be updated with them. The string values 4376 * returned will be freed with the @exec context and don't need to be 4377 * deallocated. 4378 * 4379 * Returns: 0 in case of success or -1 in case of error. 4380 */ 4381 int 4382 xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg, 4383 xmlChar **values, int *terminal) { 4384 return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal)); 4385 } 4386 4387 /** 4388 * xmlRegExecErrInfo: 4389 * @exec: a regexp execution context generating an error 4390 * @string: return value for the error string 4391 * @nbval: pointer to the number of accepted values IN/OUT 4392 * @nbneg: return number of negative transitions 4393 * @values: pointer to the array of acceptable values 4394 * @terminal: return value if this was a terminal state 4395 * 4396 * Extract error informations from the regexp execution, the parameter 4397 * @string will be updated with the value pushed and not accepted, 4398 * the parameter @values must point to an array of @nbval string pointers 4399 * on return nbval will contain the number of possible strings in that 4400 * state and the @values array will be updated with them. The string values 4401 * returned will be freed with the @exec context and don't need to be 4402 * deallocated. 4403 * 4404 * Returns: 0 in case of success or -1 in case of error. 4405 */ 4406 int 4407 xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string, 4408 int *nbval, int *nbneg, xmlChar **values, int *terminal) { 4409 if (exec == NULL) 4410 return(-1); 4411 if (string != NULL) { 4412 if (exec->status != 0) 4413 *string = exec->errString; 4414 else 4415 *string = NULL; 4416 } 4417 return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal)); 4418 } 4419 4420 #ifdef DEBUG_ERR 4421 static void testerr(xmlRegExecCtxtPtr exec) { 4422 const xmlChar *string; 4423 xmlChar *values[5]; 4424 int nb = 5; 4425 int nbneg; 4426 int terminal; 4427 xmlRegExecErrInfo(exec, &string, &nb, &nbneg, &values[0], &terminal); 4428 } 4429 #endif 4430 4431 #if 0 4432 static int 4433 xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) { 4434 xmlRegTransPtr trans; 4435 xmlRegAtomPtr atom; 4436 int ret; 4437 int codepoint, len; 4438 4439 if (exec == NULL) 4440 return(-1); 4441 if (exec->status != 0) 4442 return(exec->status); 4443 4444 while ((exec->status == 0) && 4445 ((exec->inputString[exec->index] != 0) || 4446 (exec->state->type != XML_REGEXP_FINAL_STATE))) { 4447 4448 /* 4449 * End of input on non-terminal state, rollback, however we may 4450 * still have epsilon like transition for counted transitions 4451 * on counters, in that case don't break too early. 4452 */ 4453 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) 4454 goto rollback; 4455 4456 exec->transcount = 0; 4457 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 4458 trans = &exec->state->trans[exec->transno]; 4459 if (trans->to < 0) 4460 continue; 4461 atom = trans->atom; 4462 ret = 0; 4463 if (trans->count >= 0) { 4464 int count; 4465 xmlRegCounterPtr counter; 4466 4467 /* 4468 * A counted transition. 4469 */ 4470 4471 count = exec->counts[trans->count]; 4472 counter = &exec->comp->counters[trans->count]; 4473 #ifdef DEBUG_REGEXP_EXEC 4474 printf("testing count %d: val %d, min %d, max %d\n", 4475 trans->count, count, counter->min, counter->max); 4476 #endif 4477 ret = ((count >= counter->min) && (count <= counter->max)); 4478 } else if (atom == NULL) { 4479 fprintf(stderr, "epsilon transition left at runtime\n"); 4480 exec->status = -2; 4481 break; 4482 } else if (exec->inputString[exec->index] != 0) { 4483 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 4484 ret = xmlRegCheckCharacter(atom, codepoint); 4485 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 4486 xmlRegStatePtr to = exec->comp->states[trans->to]; 4487 4488 /* 4489 * this is a multiple input sequence 4490 */ 4491 if (exec->state->nbTrans > exec->transno + 1) { 4492 xmlFARegExecSave(exec); 4493 } 4494 exec->transcount = 1; 4495 do { 4496 /* 4497 * Try to progress as much as possible on the input 4498 */ 4499 if (exec->transcount == atom->max) { 4500 break; 4501 } 4502 exec->index += len; 4503 /* 4504 * End of input: stop here 4505 */ 4506 if (exec->inputString[exec->index] == 0) { 4507 exec->index -= len; 4508 break; 4509 } 4510 if (exec->transcount >= atom->min) { 4511 int transno = exec->transno; 4512 xmlRegStatePtr state = exec->state; 4513 4514 /* 4515 * The transition is acceptable save it 4516 */ 4517 exec->transno = -1; /* trick */ 4518 exec->state = to; 4519 xmlFARegExecSave(exec); 4520 exec->transno = transno; 4521 exec->state = state; 4522 } 4523 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 4524 len); 4525 ret = xmlRegCheckCharacter(atom, codepoint); 4526 exec->transcount++; 4527 } while (ret == 1); 4528 if (exec->transcount < atom->min) 4529 ret = 0; 4530 4531 /* 4532 * If the last check failed but one transition was found 4533 * possible, rollback 4534 */ 4535 if (ret < 0) 4536 ret = 0; 4537 if (ret == 0) { 4538 goto rollback; 4539 } 4540 } 4541 } 4542 if (ret == 1) { 4543 if (exec->state->nbTrans > exec->transno + 1) { 4544 xmlFARegExecSave(exec); 4545 } 4546 /* 4547 * restart count for expressions like this ((abc){2})* 4548 */ 4549 if (trans->count >= 0) { 4550 #ifdef DEBUG_REGEXP_EXEC 4551 printf("Reset count %d\n", trans->count); 4552 #endif 4553 exec->counts[trans->count] = 0; 4554 } 4555 if (trans->counter >= 0) { 4556 #ifdef DEBUG_REGEXP_EXEC 4557 printf("Increasing count %d\n", trans->counter); 4558 #endif 4559 exec->counts[trans->counter]++; 4560 } 4561 #ifdef DEBUG_REGEXP_EXEC 4562 printf("entering state %d\n", trans->to); 4563 #endif 4564 exec->state = exec->comp->states[trans->to]; 4565 exec->transno = 0; 4566 if (trans->atom != NULL) { 4567 exec->index += len; 4568 } 4569 goto progress; 4570 } else if (ret < 0) { 4571 exec->status = -4; 4572 break; 4573 } 4574 } 4575 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 4576 rollback: 4577 /* 4578 * Failed to find a way out 4579 */ 4580 exec->determinist = 0; 4581 xmlFARegExecRollBack(exec); 4582 } 4583 progress: 4584 continue; 4585 } 4586 } 4587 #endif 4588 /************************************************************************ 4589 * * 4590 * Parser for the Schemas Datatype Regular Expressions * 4591 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs * 4592 * * 4593 ************************************************************************/ 4594 4595 /** 4596 * xmlFAIsChar: 4597 * @ctxt: a regexp parser context 4598 * 4599 * [10] Char ::= [^.\?*+()|#x5B#x5D] 4600 */ 4601 static int 4602 xmlFAIsChar(xmlRegParserCtxtPtr ctxt) { 4603 int cur; 4604 int len; 4605 4606 cur = CUR_SCHAR(ctxt->cur, len); 4607 if ((cur == '.') || (cur == '\\') || (cur == '?') || 4608 (cur == '*') || (cur == '+') || (cur == '(') || 4609 (cur == ')') || (cur == '|') || (cur == 0x5B) || 4610 (cur == 0x5D) || (cur == 0)) 4611 return(-1); 4612 return(cur); 4613 } 4614 4615 /** 4616 * xmlFAParseCharProp: 4617 * @ctxt: a regexp parser context 4618 * 4619 * [27] charProp ::= IsCategory | IsBlock 4620 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation | 4621 * Separators | Symbols | Others 4622 * [29] Letters ::= 'L' [ultmo]? 4623 * [30] Marks ::= 'M' [nce]? 4624 * [31] Numbers ::= 'N' [dlo]? 4625 * [32] Punctuation ::= 'P' [cdseifo]? 4626 * [33] Separators ::= 'Z' [slp]? 4627 * [34] Symbols ::= 'S' [mcko]? 4628 * [35] Others ::= 'C' [cfon]? 4629 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+ 4630 */ 4631 static void 4632 xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) { 4633 int cur; 4634 xmlRegAtomType type = (xmlRegAtomType) 0; 4635 xmlChar *blockName = NULL; 4636 4637 cur = CUR; 4638 if (cur == 'L') { 4639 NEXT; 4640 cur = CUR; 4641 if (cur == 'u') { 4642 NEXT; 4643 type = XML_REGEXP_LETTER_UPPERCASE; 4644 } else if (cur == 'l') { 4645 NEXT; 4646 type = XML_REGEXP_LETTER_LOWERCASE; 4647 } else if (cur == 't') { 4648 NEXT; 4649 type = XML_REGEXP_LETTER_TITLECASE; 4650 } else if (cur == 'm') { 4651 NEXT; 4652 type = XML_REGEXP_LETTER_MODIFIER; 4653 } else if (cur == 'o') { 4654 NEXT; 4655 type = XML_REGEXP_LETTER_OTHERS; 4656 } else { 4657 type = XML_REGEXP_LETTER; 4658 } 4659 } else if (cur == 'M') { 4660 NEXT; 4661 cur = CUR; 4662 if (cur == 'n') { 4663 NEXT; 4664 /* nonspacing */ 4665 type = XML_REGEXP_MARK_NONSPACING; 4666 } else if (cur == 'c') { 4667 NEXT; 4668 /* spacing combining */ 4669 type = XML_REGEXP_MARK_SPACECOMBINING; 4670 } else if (cur == 'e') { 4671 NEXT; 4672 /* enclosing */ 4673 type = XML_REGEXP_MARK_ENCLOSING; 4674 } else { 4675 /* all marks */ 4676 type = XML_REGEXP_MARK; 4677 } 4678 } else if (cur == 'N') { 4679 NEXT; 4680 cur = CUR; 4681 if (cur == 'd') { 4682 NEXT; 4683 /* digital */ 4684 type = XML_REGEXP_NUMBER_DECIMAL; 4685 } else if (cur == 'l') { 4686 NEXT; 4687 /* letter */ 4688 type = XML_REGEXP_NUMBER_LETTER; 4689 } else if (cur == 'o') { 4690 NEXT; 4691 /* other */ 4692 type = XML_REGEXP_NUMBER_OTHERS; 4693 } else { 4694 /* all numbers */ 4695 type = XML_REGEXP_NUMBER; 4696 } 4697 } else if (cur == 'P') { 4698 NEXT; 4699 cur = CUR; 4700 if (cur == 'c') { 4701 NEXT; 4702 /* connector */ 4703 type = XML_REGEXP_PUNCT_CONNECTOR; 4704 } else if (cur == 'd') { 4705 NEXT; 4706 /* dash */ 4707 type = XML_REGEXP_PUNCT_DASH; 4708 } else if (cur == 's') { 4709 NEXT; 4710 /* open */ 4711 type = XML_REGEXP_PUNCT_OPEN; 4712 } else if (cur == 'e') { 4713 NEXT; 4714 /* close */ 4715 type = XML_REGEXP_PUNCT_CLOSE; 4716 } else if (cur == 'i') { 4717 NEXT; 4718 /* initial quote */ 4719 type = XML_REGEXP_PUNCT_INITQUOTE; 4720 } else if (cur == 'f') { 4721 NEXT; 4722 /* final quote */ 4723 type = XML_REGEXP_PUNCT_FINQUOTE; 4724 } else if (cur == 'o') { 4725 NEXT; 4726 /* other */ 4727 type = XML_REGEXP_PUNCT_OTHERS; 4728 } else { 4729 /* all punctuation */ 4730 type = XML_REGEXP_PUNCT; 4731 } 4732 } else if (cur == 'Z') { 4733 NEXT; 4734 cur = CUR; 4735 if (cur == 's') { 4736 NEXT; 4737 /* space */ 4738 type = XML_REGEXP_SEPAR_SPACE; 4739 } else if (cur == 'l') { 4740 NEXT; 4741 /* line */ 4742 type = XML_REGEXP_SEPAR_LINE; 4743 } else if (cur == 'p') { 4744 NEXT; 4745 /* paragraph */ 4746 type = XML_REGEXP_SEPAR_PARA; 4747 } else { 4748 /* all separators */ 4749 type = XML_REGEXP_SEPAR; 4750 } 4751 } else if (cur == 'S') { 4752 NEXT; 4753 cur = CUR; 4754 if (cur == 'm') { 4755 NEXT; 4756 type = XML_REGEXP_SYMBOL_MATH; 4757 /* math */ 4758 } else if (cur == 'c') { 4759 NEXT; 4760 type = XML_REGEXP_SYMBOL_CURRENCY; 4761 /* currency */ 4762 } else if (cur == 'k') { 4763 NEXT; 4764 type = XML_REGEXP_SYMBOL_MODIFIER; 4765 /* modifiers */ 4766 } else if (cur == 'o') { 4767 NEXT; 4768 type = XML_REGEXP_SYMBOL_OTHERS; 4769 /* other */ 4770 } else { 4771 /* all symbols */ 4772 type = XML_REGEXP_SYMBOL; 4773 } 4774 } else if (cur == 'C') { 4775 NEXT; 4776 cur = CUR; 4777 if (cur == 'c') { 4778 NEXT; 4779 /* control */ 4780 type = XML_REGEXP_OTHER_CONTROL; 4781 } else if (cur == 'f') { 4782 NEXT; 4783 /* format */ 4784 type = XML_REGEXP_OTHER_FORMAT; 4785 } else if (cur == 'o') { 4786 NEXT; 4787 /* private use */ 4788 type = XML_REGEXP_OTHER_PRIVATE; 4789 } else if (cur == 'n') { 4790 NEXT; 4791 /* not assigned */ 4792 type = XML_REGEXP_OTHER_NA; 4793 } else { 4794 /* all others */ 4795 type = XML_REGEXP_OTHER; 4796 } 4797 } else if (cur == 'I') { 4798 const xmlChar *start; 4799 NEXT; 4800 cur = CUR; 4801 if (cur != 's') { 4802 ERROR("IsXXXX expected"); 4803 return; 4804 } 4805 NEXT; 4806 start = ctxt->cur; 4807 cur = CUR; 4808 if (((cur >= 'a') && (cur <= 'z')) || 4809 ((cur >= 'A') && (cur <= 'Z')) || 4810 ((cur >= '0') && (cur <= '9')) || 4811 (cur == 0x2D)) { 4812 NEXT; 4813 cur = CUR; 4814 while (((cur >= 'a') && (cur <= 'z')) || 4815 ((cur >= 'A') && (cur <= 'Z')) || 4816 ((cur >= '0') && (cur <= '9')) || 4817 (cur == 0x2D)) { 4818 NEXT; 4819 cur = CUR; 4820 } 4821 } 4822 type = XML_REGEXP_BLOCK_NAME; 4823 blockName = xmlStrndup(start, ctxt->cur - start); 4824 } else { 4825 ERROR("Unknown char property"); 4826 return; 4827 } 4828 if (ctxt->atom == NULL) { 4829 ctxt->atom = xmlRegNewAtom(ctxt, type); 4830 if (ctxt->atom != NULL) 4831 ctxt->atom->valuep = blockName; 4832 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4833 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4834 type, 0, 0, blockName); 4835 } 4836 } 4837 4838 /** 4839 * xmlFAParseCharClassEsc: 4840 * @ctxt: a regexp parser context 4841 * 4842 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc ) 4843 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E] 4844 * [25] catEsc ::= '\p{' charProp '}' 4845 * [26] complEsc ::= '\P{' charProp '}' 4846 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW]) 4847 */ 4848 static void 4849 xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) { 4850 int cur; 4851 4852 if (CUR == '.') { 4853 if (ctxt->atom == NULL) { 4854 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR); 4855 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4856 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4857 XML_REGEXP_ANYCHAR, 0, 0, NULL); 4858 } 4859 NEXT; 4860 return; 4861 } 4862 if (CUR != '\\') { 4863 ERROR("Escaped sequence: expecting \\"); 4864 return; 4865 } 4866 NEXT; 4867 cur = CUR; 4868 if (cur == 'p') { 4869 NEXT; 4870 if (CUR != '{') { 4871 ERROR("Expecting '{'"); 4872 return; 4873 } 4874 NEXT; 4875 xmlFAParseCharProp(ctxt); 4876 if (CUR != '}') { 4877 ERROR("Expecting '}'"); 4878 return; 4879 } 4880 NEXT; 4881 } else if (cur == 'P') { 4882 NEXT; 4883 if (CUR != '{') { 4884 ERROR("Expecting '{'"); 4885 return; 4886 } 4887 NEXT; 4888 xmlFAParseCharProp(ctxt); 4889 if (ctxt->atom != NULL) 4890 ctxt->atom->neg = 1; 4891 if (CUR != '}') { 4892 ERROR("Expecting '}'"); 4893 return; 4894 } 4895 NEXT; 4896 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') || 4897 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') || 4898 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') || 4899 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) || 4900 (cur == 0x5E)) { 4901 if (ctxt->atom == NULL) { 4902 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 4903 if (ctxt->atom != NULL) { 4904 switch (cur) { 4905 case 'n': 4906 ctxt->atom->codepoint = '\n'; 4907 break; 4908 case 'r': 4909 ctxt->atom->codepoint = '\r'; 4910 break; 4911 case 't': 4912 ctxt->atom->codepoint = '\t'; 4913 break; 4914 default: 4915 ctxt->atom->codepoint = cur; 4916 } 4917 } 4918 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4919 switch (cur) { 4920 case 'n': 4921 cur = '\n'; 4922 break; 4923 case 'r': 4924 cur = '\r'; 4925 break; 4926 case 't': 4927 cur = '\t'; 4928 break; 4929 } 4930 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4931 XML_REGEXP_CHARVAL, cur, cur, NULL); 4932 } 4933 NEXT; 4934 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') || 4935 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') || 4936 (cur == 'w') || (cur == 'W')) { 4937 xmlRegAtomType type = XML_REGEXP_ANYSPACE; 4938 4939 switch (cur) { 4940 case 's': 4941 type = XML_REGEXP_ANYSPACE; 4942 break; 4943 case 'S': 4944 type = XML_REGEXP_NOTSPACE; 4945 break; 4946 case 'i': 4947 type = XML_REGEXP_INITNAME; 4948 break; 4949 case 'I': 4950 type = XML_REGEXP_NOTINITNAME; 4951 break; 4952 case 'c': 4953 type = XML_REGEXP_NAMECHAR; 4954 break; 4955 case 'C': 4956 type = XML_REGEXP_NOTNAMECHAR; 4957 break; 4958 case 'd': 4959 type = XML_REGEXP_DECIMAL; 4960 break; 4961 case 'D': 4962 type = XML_REGEXP_NOTDECIMAL; 4963 break; 4964 case 'w': 4965 type = XML_REGEXP_REALCHAR; 4966 break; 4967 case 'W': 4968 type = XML_REGEXP_NOTREALCHAR; 4969 break; 4970 } 4971 NEXT; 4972 if (ctxt->atom == NULL) { 4973 ctxt->atom = xmlRegNewAtom(ctxt, type); 4974 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4975 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4976 type, 0, 0, NULL); 4977 } 4978 } else { 4979 ERROR("Wrong escape sequence, misuse of character '\\'"); 4980 } 4981 } 4982 4983 /** 4984 * xmlFAParseCharRange: 4985 * @ctxt: a regexp parser context 4986 * 4987 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash 4988 * [18] seRange ::= charOrEsc '-' charOrEsc 4989 * [20] charOrEsc ::= XmlChar | SingleCharEsc 4990 * [21] XmlChar ::= [^\#x2D#x5B#x5D] 4991 * [22] XmlCharIncDash ::= [^\#x5B#x5D] 4992 */ 4993 static void 4994 xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) { 4995 int cur, len; 4996 int start = -1; 4997 int end = -1; 4998 4999 if (CUR == '\0') { 5000 ERROR("Expecting ']'"); 5001 return; 5002 } 5003 5004 cur = CUR; 5005 if (cur == '\\') { 5006 NEXT; 5007 cur = CUR; 5008 switch (cur) { 5009 case 'n': start = 0xA; break; 5010 case 'r': start = 0xD; break; 5011 case 't': start = 0x9; break; 5012 case '\\': case '|': case '.': case '-': case '^': case '?': 5013 case '*': case '+': case '{': case '}': case '(': case ')': 5014 case '[': case ']': 5015 start = cur; break; 5016 default: 5017 ERROR("Invalid escape value"); 5018 return; 5019 } 5020 end = start; 5021 len = 1; 5022 } else if ((cur != 0x5B) && (cur != 0x5D)) { 5023 end = start = CUR_SCHAR(ctxt->cur, len); 5024 } else { 5025 ERROR("Expecting a char range"); 5026 return; 5027 } 5028 /* 5029 * Since we are "inside" a range, we can assume ctxt->cur is past 5030 * the start of ctxt->string, and PREV should be safe 5031 */ 5032 if ((start == '-') && (NXT(1) != ']') && (PREV != '[') && (PREV != '^')) { 5033 NEXTL(len); 5034 return; 5035 } 5036 NEXTL(len); 5037 cur = CUR; 5038 if ((cur != '-') || (NXT(1) == ']')) { 5039 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 5040 XML_REGEXP_CHARVAL, start, end, NULL); 5041 return; 5042 } 5043 NEXT; 5044 cur = CUR; 5045 if (cur == '\\') { 5046 NEXT; 5047 cur = CUR; 5048 switch (cur) { 5049 case 'n': end = 0xA; break; 5050 case 'r': end = 0xD; break; 5051 case 't': end = 0x9; break; 5052 case '\\': case '|': case '.': case '-': case '^': case '?': 5053 case '*': case '+': case '{': case '}': case '(': case ')': 5054 case '[': case ']': 5055 end = cur; break; 5056 default: 5057 ERROR("Invalid escape value"); 5058 return; 5059 } 5060 len = 1; 5061 } else if ((cur != '\0') && (cur != 0x5B) && (cur != 0x5D)) { 5062 end = CUR_SCHAR(ctxt->cur, len); 5063 } else { 5064 ERROR("Expecting the end of a char range"); 5065 return; 5066 } 5067 5068 /* TODO check that the values are acceptable character ranges for XML */ 5069 if (end < start) { 5070 ERROR("End of range is before start of range"); 5071 } else { 5072 NEXTL(len); 5073 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 5074 XML_REGEXP_CHARVAL, start, end, NULL); 5075 } 5076 return; 5077 } 5078 5079 /** 5080 * xmlFAParsePosCharGroup: 5081 * @ctxt: a regexp parser context 5082 * 5083 * [14] posCharGroup ::= ( charRange | charClassEsc )+ 5084 */ 5085 static void 5086 xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) { 5087 do { 5088 if (CUR == '\\') { 5089 xmlFAParseCharClassEsc(ctxt); 5090 } else { 5091 xmlFAParseCharRange(ctxt); 5092 } 5093 } while ((CUR != ']') && (CUR != '^') && (CUR != '-') && 5094 (CUR != 0) && (ctxt->error == 0)); 5095 } 5096 5097 /** 5098 * xmlFAParseCharGroup: 5099 * @ctxt: a regexp parser context 5100 * 5101 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub 5102 * [15] negCharGroup ::= '^' posCharGroup 5103 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr 5104 * [12] charClassExpr ::= '[' charGroup ']' 5105 */ 5106 static void 5107 xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) { 5108 int n = ctxt->neg; 5109 while ((CUR != ']') && (ctxt->error == 0)) { 5110 if (CUR == '^') { 5111 int neg = ctxt->neg; 5112 5113 NEXT; 5114 ctxt->neg = !ctxt->neg; 5115 xmlFAParsePosCharGroup(ctxt); 5116 ctxt->neg = neg; 5117 } else if ((CUR == '-') && (NXT(1) == '[')) { 5118 int neg = ctxt->neg; 5119 ctxt->neg = 2; 5120 NEXT; /* eat the '-' */ 5121 NEXT; /* eat the '[' */ 5122 xmlFAParseCharGroup(ctxt); 5123 if (CUR == ']') { 5124 NEXT; 5125 } else { 5126 ERROR("charClassExpr: ']' expected"); 5127 break; 5128 } 5129 ctxt->neg = neg; 5130 break; 5131 } else if (CUR != ']') { 5132 xmlFAParsePosCharGroup(ctxt); 5133 } 5134 } 5135 ctxt->neg = n; 5136 } 5137 5138 /** 5139 * xmlFAParseCharClass: 5140 * @ctxt: a regexp parser context 5141 * 5142 * [11] charClass ::= charClassEsc | charClassExpr 5143 * [12] charClassExpr ::= '[' charGroup ']' 5144 */ 5145 static void 5146 xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) { 5147 if (CUR == '[') { 5148 NEXT; 5149 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES); 5150 if (ctxt->atom == NULL) 5151 return; 5152 xmlFAParseCharGroup(ctxt); 5153 if (CUR == ']') { 5154 NEXT; 5155 } else { 5156 ERROR("xmlFAParseCharClass: ']' expected"); 5157 } 5158 } else { 5159 xmlFAParseCharClassEsc(ctxt); 5160 } 5161 } 5162 5163 /** 5164 * xmlFAParseQuantExact: 5165 * @ctxt: a regexp parser context 5166 * 5167 * [8] QuantExact ::= [0-9]+ 5168 * 5169 * Returns 0 if success or -1 in case of error 5170 */ 5171 static int 5172 xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) { 5173 int ret = 0; 5174 int ok = 0; 5175 5176 while ((CUR >= '0') && (CUR <= '9')) { 5177 ret = ret * 10 + (CUR - '0'); 5178 ok = 1; 5179 NEXT; 5180 } 5181 if (ok != 1) { 5182 return(-1); 5183 } 5184 return(ret); 5185 } 5186 5187 /** 5188 * xmlFAParseQuantifier: 5189 * @ctxt: a regexp parser context 5190 * 5191 * [4] quantifier ::= [?*+] | ( '{' quantity '}' ) 5192 * [5] quantity ::= quantRange | quantMin | QuantExact 5193 * [6] quantRange ::= QuantExact ',' QuantExact 5194 * [7] quantMin ::= QuantExact ',' 5195 * [8] QuantExact ::= [0-9]+ 5196 */ 5197 static int 5198 xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) { 5199 int cur; 5200 5201 cur = CUR; 5202 if ((cur == '?') || (cur == '*') || (cur == '+')) { 5203 if (ctxt->atom != NULL) { 5204 if (cur == '?') 5205 ctxt->atom->quant = XML_REGEXP_QUANT_OPT; 5206 else if (cur == '*') 5207 ctxt->atom->quant = XML_REGEXP_QUANT_MULT; 5208 else if (cur == '+') 5209 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS; 5210 } 5211 NEXT; 5212 return(1); 5213 } 5214 if (cur == '{') { 5215 int min = 0, max = 0; 5216 5217 NEXT; 5218 cur = xmlFAParseQuantExact(ctxt); 5219 if (cur >= 0) 5220 min = cur; 5221 if (CUR == ',') { 5222 NEXT; 5223 if (CUR == '}') 5224 max = INT_MAX; 5225 else { 5226 cur = xmlFAParseQuantExact(ctxt); 5227 if (cur >= 0) 5228 max = cur; 5229 else { 5230 ERROR("Improper quantifier"); 5231 } 5232 } 5233 } 5234 if (CUR == '}') { 5235 NEXT; 5236 } else { 5237 ERROR("Unterminated quantifier"); 5238 } 5239 if (max == 0) 5240 max = min; 5241 if (ctxt->atom != NULL) { 5242 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE; 5243 ctxt->atom->min = min; 5244 ctxt->atom->max = max; 5245 } 5246 return(1); 5247 } 5248 return(0); 5249 } 5250 5251 /** 5252 * xmlFAParseAtom: 5253 * @ctxt: a regexp parser context 5254 * 5255 * [9] atom ::= Char | charClass | ( '(' regExp ')' ) 5256 */ 5257 static int 5258 xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) { 5259 int codepoint, len; 5260 5261 codepoint = xmlFAIsChar(ctxt); 5262 if (codepoint > 0) { 5263 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 5264 if (ctxt->atom == NULL) 5265 return(-1); 5266 codepoint = CUR_SCHAR(ctxt->cur, len); 5267 ctxt->atom->codepoint = codepoint; 5268 NEXTL(len); 5269 return(1); 5270 } else if (CUR == '|') { 5271 return(0); 5272 } else if (CUR == 0) { 5273 return(0); 5274 } else if (CUR == ')') { 5275 return(0); 5276 } else if (CUR == '(') { 5277 xmlRegStatePtr start, oldend, start0; 5278 5279 NEXT; 5280 /* 5281 * this extra Epsilon transition is needed if we count with 0 allowed 5282 * unfortunately this can't be known at that point 5283 */ 5284 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL); 5285 start0 = ctxt->state; 5286 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL); 5287 start = ctxt->state; 5288 oldend = ctxt->end; 5289 ctxt->end = NULL; 5290 ctxt->atom = NULL; 5291 xmlFAParseRegExp(ctxt, 0); 5292 if (CUR == ')') { 5293 NEXT; 5294 } else { 5295 ERROR("xmlFAParseAtom: expecting ')'"); 5296 } 5297 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG); 5298 if (ctxt->atom == NULL) 5299 return(-1); 5300 ctxt->atom->start = start; 5301 ctxt->atom->start0 = start0; 5302 ctxt->atom->stop = ctxt->state; 5303 ctxt->end = oldend; 5304 return(1); 5305 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) { 5306 xmlFAParseCharClass(ctxt); 5307 return(1); 5308 } 5309 return(0); 5310 } 5311 5312 /** 5313 * xmlFAParsePiece: 5314 * @ctxt: a regexp parser context 5315 * 5316 * [3] piece ::= atom quantifier? 5317 */ 5318 static int 5319 xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) { 5320 int ret; 5321 5322 ctxt->atom = NULL; 5323 ret = xmlFAParseAtom(ctxt); 5324 if (ret == 0) 5325 return(0); 5326 if (ctxt->atom == NULL) { 5327 ERROR("internal: no atom generated"); 5328 } 5329 xmlFAParseQuantifier(ctxt); 5330 return(1); 5331 } 5332 5333 /** 5334 * xmlFAParseBranch: 5335 * @ctxt: a regexp parser context 5336 * @to: optional target to the end of the branch 5337 * 5338 * @to is used to optimize by removing duplicate path in automata 5339 * in expressions like (a|b)(c|d) 5340 * 5341 * [2] branch ::= piece* 5342 */ 5343 static int 5344 xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) { 5345 xmlRegStatePtr previous; 5346 int ret; 5347 5348 previous = ctxt->state; 5349 ret = xmlFAParsePiece(ctxt); 5350 if (ret != 0) { 5351 if (xmlFAGenerateTransitions(ctxt, previous, 5352 (CUR=='|' || CUR==')') ? to : NULL, ctxt->atom) < 0) 5353 return(-1); 5354 previous = ctxt->state; 5355 ctxt->atom = NULL; 5356 } 5357 while ((ret != 0) && (ctxt->error == 0)) { 5358 ret = xmlFAParsePiece(ctxt); 5359 if (ret != 0) { 5360 if (xmlFAGenerateTransitions(ctxt, previous, 5361 (CUR=='|' || CUR==')') ? to : NULL, ctxt->atom) < 0) 5362 return(-1); 5363 previous = ctxt->state; 5364 ctxt->atom = NULL; 5365 } 5366 } 5367 return(0); 5368 } 5369 5370 /** 5371 * xmlFAParseRegExp: 5372 * @ctxt: a regexp parser context 5373 * @top: is this the top-level expression ? 5374 * 5375 * [1] regExp ::= branch ( '|' branch )* 5376 */ 5377 static void 5378 xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) { 5379 xmlRegStatePtr start, end; 5380 5381 /* if not top start should have been generated by an epsilon trans */ 5382 start = ctxt->state; 5383 ctxt->end = NULL; 5384 xmlFAParseBranch(ctxt, NULL); 5385 if (top) { 5386 #ifdef DEBUG_REGEXP_GRAPH 5387 printf("State %d is final\n", ctxt->state->no); 5388 #endif 5389 ctxt->state->type = XML_REGEXP_FINAL_STATE; 5390 } 5391 if (CUR != '|') { 5392 ctxt->end = ctxt->state; 5393 return; 5394 } 5395 end = ctxt->state; 5396 while ((CUR == '|') && (ctxt->error == 0)) { 5397 NEXT; 5398 if (CUR == 0) { 5399 ERROR("expecting a branch after |") 5400 return; 5401 } 5402 ctxt->state = start; 5403 ctxt->end = NULL; 5404 xmlFAParseBranch(ctxt, end); 5405 } 5406 if (!top) { 5407 ctxt->state = end; 5408 ctxt->end = end; 5409 } 5410 } 5411 5412 /************************************************************************ 5413 * * 5414 * The basic API * 5415 * * 5416 ************************************************************************/ 5417 5418 /** 5419 * xmlRegexpPrint: 5420 * @output: the file for the output debug 5421 * @regexp: the compiled regexp 5422 * 5423 * Print the content of the compiled regular expression 5424 */ 5425 void 5426 xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) { 5427 int i; 5428 5429 if (output == NULL) 5430 return; 5431 fprintf(output, " regexp: "); 5432 if (regexp == NULL) { 5433 fprintf(output, "NULL\n"); 5434 return; 5435 } 5436 fprintf(output, "'%s' ", regexp->string); 5437 fprintf(output, "\n"); 5438 fprintf(output, "%d atoms:\n", regexp->nbAtoms); 5439 for (i = 0;i < regexp->nbAtoms; i++) { 5440 fprintf(output, " %02d ", i); 5441 xmlRegPrintAtom(output, regexp->atoms[i]); 5442 } 5443 fprintf(output, "%d states:", regexp->nbStates); 5444 fprintf(output, "\n"); 5445 for (i = 0;i < regexp->nbStates; i++) { 5446 xmlRegPrintState(output, regexp->states[i]); 5447 } 5448 fprintf(output, "%d counters:\n", regexp->nbCounters); 5449 for (i = 0;i < regexp->nbCounters; i++) { 5450 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min, 5451 regexp->counters[i].max); 5452 } 5453 } 5454 5455 /** 5456 * xmlRegexpCompile: 5457 * @regexp: a regular expression string 5458 * 5459 * Parses a regular expression conforming to XML Schemas Part 2 Datatype 5460 * Appendix F and builds an automata suitable for testing strings against 5461 * that regular expression 5462 * 5463 * Returns the compiled expression or NULL in case of error 5464 */ 5465 xmlRegexpPtr 5466 xmlRegexpCompile(const xmlChar *regexp) { 5467 xmlRegexpPtr ret; 5468 xmlRegParserCtxtPtr ctxt; 5469 5470 ctxt = xmlRegNewParserCtxt(regexp); 5471 if (ctxt == NULL) 5472 return(NULL); 5473 5474 /* initialize the parser */ 5475 ctxt->end = NULL; 5476 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 5477 xmlRegStatePush(ctxt, ctxt->start); 5478 5479 /* parse the expression building an automata */ 5480 xmlFAParseRegExp(ctxt, 1); 5481 if (CUR != 0) { 5482 ERROR("xmlFAParseRegExp: extra characters"); 5483 } 5484 if (ctxt->error != 0) { 5485 xmlRegFreeParserCtxt(ctxt); 5486 return(NULL); 5487 } 5488 ctxt->end = ctxt->state; 5489 ctxt->start->type = XML_REGEXP_START_STATE; 5490 ctxt->end->type = XML_REGEXP_FINAL_STATE; 5491 5492 /* remove the Epsilon except for counted transitions */ 5493 xmlFAEliminateEpsilonTransitions(ctxt); 5494 5495 5496 if (ctxt->error != 0) { 5497 xmlRegFreeParserCtxt(ctxt); 5498 return(NULL); 5499 } 5500 ret = xmlRegEpxFromParse(ctxt); 5501 xmlRegFreeParserCtxt(ctxt); 5502 return(ret); 5503 } 5504 5505 /** 5506 * xmlRegexpExec: 5507 * @comp: the compiled regular expression 5508 * @content: the value to check against the regular expression 5509 * 5510 * Check if the regular expression generates the value 5511 * 5512 * Returns 1 if it matches, 0 if not and a negative value in case of error 5513 */ 5514 int 5515 xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) { 5516 if ((comp == NULL) || (content == NULL)) 5517 return(-1); 5518 return(xmlFARegExec(comp, content)); 5519 } 5520 5521 /** 5522 * xmlRegexpIsDeterminist: 5523 * @comp: the compiled regular expression 5524 * 5525 * Check if the regular expression is determinist 5526 * 5527 * Returns 1 if it yes, 0 if not and a negative value in case of error 5528 */ 5529 int 5530 xmlRegexpIsDeterminist(xmlRegexpPtr comp) { 5531 xmlAutomataPtr am; 5532 int ret; 5533 5534 if (comp == NULL) 5535 return(-1); 5536 if (comp->determinist != -1) 5537 return(comp->determinist); 5538 5539 am = xmlNewAutomata(); 5540 if (am->states != NULL) { 5541 int i; 5542 5543 for (i = 0;i < am->nbStates;i++) 5544 xmlRegFreeState(am->states[i]); 5545 xmlFree(am->states); 5546 } 5547 am->nbAtoms = comp->nbAtoms; 5548 am->atoms = comp->atoms; 5549 am->nbStates = comp->nbStates; 5550 am->states = comp->states; 5551 am->determinist = -1; 5552 am->flags = comp->flags; 5553 ret = xmlFAComputesDeterminism(am); 5554 am->atoms = NULL; 5555 am->states = NULL; 5556 xmlFreeAutomata(am); 5557 comp->determinist = ret; 5558 return(ret); 5559 } 5560 5561 /** 5562 * xmlRegFreeRegexp: 5563 * @regexp: the regexp 5564 * 5565 * Free a regexp 5566 */ 5567 void 5568 xmlRegFreeRegexp(xmlRegexpPtr regexp) { 5569 int i; 5570 if (regexp == NULL) 5571 return; 5572 5573 if (regexp->string != NULL) 5574 xmlFree(regexp->string); 5575 if (regexp->states != NULL) { 5576 for (i = 0;i < regexp->nbStates;i++) 5577 xmlRegFreeState(regexp->states[i]); 5578 xmlFree(regexp->states); 5579 } 5580 if (regexp->atoms != NULL) { 5581 for (i = 0;i < regexp->nbAtoms;i++) 5582 xmlRegFreeAtom(regexp->atoms[i]); 5583 xmlFree(regexp->atoms); 5584 } 5585 if (regexp->counters != NULL) 5586 xmlFree(regexp->counters); 5587 if (regexp->compact != NULL) 5588 xmlFree(regexp->compact); 5589 if (regexp->transdata != NULL) 5590 xmlFree(regexp->transdata); 5591 if (regexp->stringMap != NULL) { 5592 for (i = 0; i < regexp->nbstrings;i++) 5593 xmlFree(regexp->stringMap[i]); 5594 xmlFree(regexp->stringMap); 5595 } 5596 5597 xmlFree(regexp); 5598 } 5599 5600 #ifdef LIBXML_AUTOMATA_ENABLED 5601 /************************************************************************ 5602 * * 5603 * The Automata interface * 5604 * * 5605 ************************************************************************/ 5606 5607 /** 5608 * xmlNewAutomata: 5609 * 5610 * Create a new automata 5611 * 5612 * Returns the new object or NULL in case of failure 5613 */ 5614 xmlAutomataPtr 5615 xmlNewAutomata(void) { 5616 xmlAutomataPtr ctxt; 5617 5618 ctxt = xmlRegNewParserCtxt(NULL); 5619 if (ctxt == NULL) 5620 return(NULL); 5621 5622 /* initialize the parser */ 5623 ctxt->end = NULL; 5624 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 5625 if (ctxt->start == NULL) { 5626 xmlFreeAutomata(ctxt); 5627 return(NULL); 5628 } 5629 ctxt->start->type = XML_REGEXP_START_STATE; 5630 if (xmlRegStatePush(ctxt, ctxt->start) < 0) { 5631 xmlRegFreeState(ctxt->start); 5632 xmlFreeAutomata(ctxt); 5633 return(NULL); 5634 } 5635 ctxt->flags = 0; 5636 5637 return(ctxt); 5638 } 5639 5640 /** 5641 * xmlFreeAutomata: 5642 * @am: an automata 5643 * 5644 * Free an automata 5645 */ 5646 void 5647 xmlFreeAutomata(xmlAutomataPtr am) { 5648 if (am == NULL) 5649 return; 5650 xmlRegFreeParserCtxt(am); 5651 } 5652 5653 /** 5654 * xmlAutomataSetFlags: 5655 * @am: an automata 5656 * @flags: a set of internal flags 5657 * 5658 * Set some flags on the automata 5659 */ 5660 void 5661 xmlAutomataSetFlags(xmlAutomataPtr am, int flags) { 5662 if (am == NULL) 5663 return; 5664 am->flags |= flags; 5665 } 5666 5667 /** 5668 * xmlAutomataGetInitState: 5669 * @am: an automata 5670 * 5671 * Initial state lookup 5672 * 5673 * Returns the initial state of the automata 5674 */ 5675 xmlAutomataStatePtr 5676 xmlAutomataGetInitState(xmlAutomataPtr am) { 5677 if (am == NULL) 5678 return(NULL); 5679 return(am->start); 5680 } 5681 5682 /** 5683 * xmlAutomataSetFinalState: 5684 * @am: an automata 5685 * @state: a state in this automata 5686 * 5687 * Makes that state a final state 5688 * 5689 * Returns 0 or -1 in case of error 5690 */ 5691 int 5692 xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) { 5693 if ((am == NULL) || (state == NULL)) 5694 return(-1); 5695 state->type = XML_REGEXP_FINAL_STATE; 5696 return(0); 5697 } 5698 5699 /** 5700 * xmlAutomataNewTransition: 5701 * @am: an automata 5702 * @from: the starting point of the transition 5703 * @to: the target point of the transition or NULL 5704 * @token: the input string associated to that transition 5705 * @data: data passed to the callback function if the transition is activated 5706 * 5707 * If @to is NULL, this creates first a new target state in the automata 5708 * and then adds a transition from the @from state to the target state 5709 * activated by the value of @token 5710 * 5711 * Returns the target state or NULL in case of error 5712 */ 5713 xmlAutomataStatePtr 5714 xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from, 5715 xmlAutomataStatePtr to, const xmlChar *token, 5716 void *data) { 5717 xmlRegAtomPtr atom; 5718 5719 if ((am == NULL) || (from == NULL) || (token == NULL)) 5720 return(NULL); 5721 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5722 if (atom == NULL) 5723 return(NULL); 5724 atom->data = data; 5725 atom->valuep = xmlStrdup(token); 5726 5727 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5728 xmlRegFreeAtom(atom); 5729 return(NULL); 5730 } 5731 if (to == NULL) 5732 return(am->state); 5733 return(to); 5734 } 5735 5736 /** 5737 * xmlAutomataNewTransition2: 5738 * @am: an automata 5739 * @from: the starting point of the transition 5740 * @to: the target point of the transition or NULL 5741 * @token: the first input string associated to that transition 5742 * @token2: the second input string associated to that transition 5743 * @data: data passed to the callback function if the transition is activated 5744 * 5745 * If @to is NULL, this creates first a new target state in the automata 5746 * and then adds a transition from the @from state to the target state 5747 * activated by the value of @token 5748 * 5749 * Returns the target state or NULL in case of error 5750 */ 5751 xmlAutomataStatePtr 5752 xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from, 5753 xmlAutomataStatePtr to, const xmlChar *token, 5754 const xmlChar *token2, void *data) { 5755 xmlRegAtomPtr atom; 5756 5757 if ((am == NULL) || (from == NULL) || (token == NULL)) 5758 return(NULL); 5759 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5760 if (atom == NULL) 5761 return(NULL); 5762 atom->data = data; 5763 if ((token2 == NULL) || (*token2 == 0)) { 5764 atom->valuep = xmlStrdup(token); 5765 } else { 5766 int lenn, lenp; 5767 xmlChar *str; 5768 5769 lenn = strlen((char *) token2); 5770 lenp = strlen((char *) token); 5771 5772 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5773 if (str == NULL) { 5774 xmlRegFreeAtom(atom); 5775 return(NULL); 5776 } 5777 memcpy(&str[0], token, lenp); 5778 str[lenp] = '|'; 5779 memcpy(&str[lenp + 1], token2, lenn); 5780 str[lenn + lenp + 1] = 0; 5781 5782 atom->valuep = str; 5783 } 5784 5785 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5786 xmlRegFreeAtom(atom); 5787 return(NULL); 5788 } 5789 if (to == NULL) 5790 return(am->state); 5791 return(to); 5792 } 5793 5794 /** 5795 * xmlAutomataNewNegTrans: 5796 * @am: an automata 5797 * @from: the starting point of the transition 5798 * @to: the target point of the transition or NULL 5799 * @token: the first input string associated to that transition 5800 * @token2: the second input string associated to that transition 5801 * @data: data passed to the callback function if the transition is activated 5802 * 5803 * If @to is NULL, this creates first a new target state in the automata 5804 * and then adds a transition from the @from state to the target state 5805 * activated by any value except (@token,@token2) 5806 * Note that if @token2 is not NULL, then (X, NULL) won't match to follow 5807 # the semantic of XSD ##other 5808 * 5809 * Returns the target state or NULL in case of error 5810 */ 5811 xmlAutomataStatePtr 5812 xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5813 xmlAutomataStatePtr to, const xmlChar *token, 5814 const xmlChar *token2, void *data) { 5815 xmlRegAtomPtr atom; 5816 xmlChar err_msg[200]; 5817 5818 if ((am == NULL) || (from == NULL) || (token == NULL)) 5819 return(NULL); 5820 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5821 if (atom == NULL) 5822 return(NULL); 5823 atom->data = data; 5824 atom->neg = 1; 5825 if ((token2 == NULL) || (*token2 == 0)) { 5826 atom->valuep = xmlStrdup(token); 5827 } else { 5828 int lenn, lenp; 5829 xmlChar *str; 5830 5831 lenn = strlen((char *) token2); 5832 lenp = strlen((char *) token); 5833 5834 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5835 if (str == NULL) { 5836 xmlRegFreeAtom(atom); 5837 return(NULL); 5838 } 5839 memcpy(&str[0], token, lenp); 5840 str[lenp] = '|'; 5841 memcpy(&str[lenp + 1], token2, lenn); 5842 str[lenn + lenp + 1] = 0; 5843 5844 atom->valuep = str; 5845 } 5846 snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep); 5847 err_msg[199] = 0; 5848 atom->valuep2 = xmlStrdup(err_msg); 5849 5850 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5851 xmlRegFreeAtom(atom); 5852 return(NULL); 5853 } 5854 am->negs++; 5855 if (to == NULL) 5856 return(am->state); 5857 return(to); 5858 } 5859 5860 /** 5861 * xmlAutomataNewCountTrans2: 5862 * @am: an automata 5863 * @from: the starting point of the transition 5864 * @to: the target point of the transition or NULL 5865 * @token: the input string associated to that transition 5866 * @token2: the second input string associated to that transition 5867 * @min: the minimum successive occurences of token 5868 * @max: the maximum successive occurences of token 5869 * @data: data associated to the transition 5870 * 5871 * If @to is NULL, this creates first a new target state in the automata 5872 * and then adds a transition from the @from state to the target state 5873 * activated by a succession of input of value @token and @token2 and 5874 * whose number is between @min and @max 5875 * 5876 * Returns the target state or NULL in case of error 5877 */ 5878 xmlAutomataStatePtr 5879 xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from, 5880 xmlAutomataStatePtr to, const xmlChar *token, 5881 const xmlChar *token2, 5882 int min, int max, void *data) { 5883 xmlRegAtomPtr atom; 5884 int counter; 5885 5886 if ((am == NULL) || (from == NULL) || (token == NULL)) 5887 return(NULL); 5888 if (min < 0) 5889 return(NULL); 5890 if ((max < min) || (max < 1)) 5891 return(NULL); 5892 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5893 if (atom == NULL) 5894 return(NULL); 5895 if ((token2 == NULL) || (*token2 == 0)) { 5896 atom->valuep = xmlStrdup(token); 5897 } else { 5898 int lenn, lenp; 5899 xmlChar *str; 5900 5901 lenn = strlen((char *) token2); 5902 lenp = strlen((char *) token); 5903 5904 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5905 if (str == NULL) { 5906 xmlRegFreeAtom(atom); 5907 return(NULL); 5908 } 5909 memcpy(&str[0], token, lenp); 5910 str[lenp] = '|'; 5911 memcpy(&str[lenp + 1], token2, lenn); 5912 str[lenn + lenp + 1] = 0; 5913 5914 atom->valuep = str; 5915 } 5916 atom->data = data; 5917 if (min == 0) 5918 atom->min = 1; 5919 else 5920 atom->min = min; 5921 atom->max = max; 5922 5923 /* 5924 * associate a counter to the transition. 5925 */ 5926 counter = xmlRegGetCounter(am); 5927 am->counters[counter].min = min; 5928 am->counters[counter].max = max; 5929 5930 /* xmlFAGenerateTransitions(am, from, to, atom); */ 5931 if (to == NULL) { 5932 to = xmlRegNewState(am); 5933 xmlRegStatePush(am, to); 5934 } 5935 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 5936 xmlRegAtomPush(am, atom); 5937 am->state = to; 5938 5939 if (to == NULL) 5940 to = am->state; 5941 if (to == NULL) 5942 return(NULL); 5943 if (min == 0) 5944 xmlFAGenerateEpsilonTransition(am, from, to); 5945 return(to); 5946 } 5947 5948 /** 5949 * xmlAutomataNewCountTrans: 5950 * @am: an automata 5951 * @from: the starting point of the transition 5952 * @to: the target point of the transition or NULL 5953 * @token: the input string associated to that transition 5954 * @min: the minimum successive occurences of token 5955 * @max: the maximum successive occurences of token 5956 * @data: data associated to the transition 5957 * 5958 * If @to is NULL, this creates first a new target state in the automata 5959 * and then adds a transition from the @from state to the target state 5960 * activated by a succession of input of value @token and whose number 5961 * is between @min and @max 5962 * 5963 * Returns the target state or NULL in case of error 5964 */ 5965 xmlAutomataStatePtr 5966 xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5967 xmlAutomataStatePtr to, const xmlChar *token, 5968 int min, int max, void *data) { 5969 xmlRegAtomPtr atom; 5970 int counter; 5971 5972 if ((am == NULL) || (from == NULL) || (token == NULL)) 5973 return(NULL); 5974 if (min < 0) 5975 return(NULL); 5976 if ((max < min) || (max < 1)) 5977 return(NULL); 5978 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5979 if (atom == NULL) 5980 return(NULL); 5981 atom->valuep = xmlStrdup(token); 5982 atom->data = data; 5983 if (min == 0) 5984 atom->min = 1; 5985 else 5986 atom->min = min; 5987 atom->max = max; 5988 5989 /* 5990 * associate a counter to the transition. 5991 */ 5992 counter = xmlRegGetCounter(am); 5993 am->counters[counter].min = min; 5994 am->counters[counter].max = max; 5995 5996 /* xmlFAGenerateTransitions(am, from, to, atom); */ 5997 if (to == NULL) { 5998 to = xmlRegNewState(am); 5999 xmlRegStatePush(am, to); 6000 } 6001 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 6002 xmlRegAtomPush(am, atom); 6003 am->state = to; 6004 6005 if (to == NULL) 6006 to = am->state; 6007 if (to == NULL) 6008 return(NULL); 6009 if (min == 0) 6010 xmlFAGenerateEpsilonTransition(am, from, to); 6011 return(to); 6012 } 6013 6014 /** 6015 * xmlAutomataNewOnceTrans2: 6016 * @am: an automata 6017 * @from: the starting point of the transition 6018 * @to: the target point of the transition or NULL 6019 * @token: the input string associated to that transition 6020 * @token2: the second input string associated to that transition 6021 * @min: the minimum successive occurences of token 6022 * @max: the maximum successive occurences of token 6023 * @data: data associated to the transition 6024 * 6025 * If @to is NULL, this creates first a new target state in the automata 6026 * and then adds a transition from the @from state to the target state 6027 * activated by a succession of input of value @token and @token2 and whose 6028 * number is between @min and @max, moreover that transition can only be 6029 * crossed once. 6030 * 6031 * Returns the target state or NULL in case of error 6032 */ 6033 xmlAutomataStatePtr 6034 xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from, 6035 xmlAutomataStatePtr to, const xmlChar *token, 6036 const xmlChar *token2, 6037 int min, int max, void *data) { 6038 xmlRegAtomPtr atom; 6039 int counter; 6040 6041 if ((am == NULL) || (from == NULL) || (token == NULL)) 6042 return(NULL); 6043 if (min < 1) 6044 return(NULL); 6045 if ((max < min) || (max < 1)) 6046 return(NULL); 6047 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 6048 if (atom == NULL) 6049 return(NULL); 6050 if ((token2 == NULL) || (*token2 == 0)) { 6051 atom->valuep = xmlStrdup(token); 6052 } else { 6053 int lenn, lenp; 6054 xmlChar *str; 6055 6056 lenn = strlen((char *) token2); 6057 lenp = strlen((char *) token); 6058 6059 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 6060 if (str == NULL) { 6061 xmlRegFreeAtom(atom); 6062 return(NULL); 6063 } 6064 memcpy(&str[0], token, lenp); 6065 str[lenp] = '|'; 6066 memcpy(&str[lenp + 1], token2, lenn); 6067 str[lenn + lenp + 1] = 0; 6068 6069 atom->valuep = str; 6070 } 6071 atom->data = data; 6072 atom->quant = XML_REGEXP_QUANT_ONCEONLY; 6073 atom->min = min; 6074 atom->max = max; 6075 /* 6076 * associate a counter to the transition. 6077 */ 6078 counter = xmlRegGetCounter(am); 6079 am->counters[counter].min = 1; 6080 am->counters[counter].max = 1; 6081 6082 /* xmlFAGenerateTransitions(am, from, to, atom); */ 6083 if (to == NULL) { 6084 to = xmlRegNewState(am); 6085 xmlRegStatePush(am, to); 6086 } 6087 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 6088 xmlRegAtomPush(am, atom); 6089 am->state = to; 6090 return(to); 6091 } 6092 6093 6094 6095 /** 6096 * xmlAutomataNewOnceTrans: 6097 * @am: an automata 6098 * @from: the starting point of the transition 6099 * @to: the target point of the transition or NULL 6100 * @token: the input string associated to that transition 6101 * @min: the minimum successive occurences of token 6102 * @max: the maximum successive occurences of token 6103 * @data: data associated to the transition 6104 * 6105 * If @to is NULL, this creates first a new target state in the automata 6106 * and then adds a transition from the @from state to the target state 6107 * activated by a succession of input of value @token and whose number 6108 * is between @min and @max, moreover that transition can only be crossed 6109 * once. 6110 * 6111 * Returns the target state or NULL in case of error 6112 */ 6113 xmlAutomataStatePtr 6114 xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6115 xmlAutomataStatePtr to, const xmlChar *token, 6116 int min, int max, void *data) { 6117 xmlRegAtomPtr atom; 6118 int counter; 6119 6120 if ((am == NULL) || (from == NULL) || (token == NULL)) 6121 return(NULL); 6122 if (min < 1) 6123 return(NULL); 6124 if ((max < min) || (max < 1)) 6125 return(NULL); 6126 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 6127 if (atom == NULL) 6128 return(NULL); 6129 atom->valuep = xmlStrdup(token); 6130 atom->data = data; 6131 atom->quant = XML_REGEXP_QUANT_ONCEONLY; 6132 atom->min = min; 6133 atom->max = max; 6134 /* 6135 * associate a counter to the transition. 6136 */ 6137 counter = xmlRegGetCounter(am); 6138 am->counters[counter].min = 1; 6139 am->counters[counter].max = 1; 6140 6141 /* xmlFAGenerateTransitions(am, from, to, atom); */ 6142 if (to == NULL) { 6143 to = xmlRegNewState(am); 6144 xmlRegStatePush(am, to); 6145 } 6146 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 6147 xmlRegAtomPush(am, atom); 6148 am->state = to; 6149 return(to); 6150 } 6151 6152 /** 6153 * xmlAutomataNewState: 6154 * @am: an automata 6155 * 6156 * Create a new disconnected state in the automata 6157 * 6158 * Returns the new state or NULL in case of error 6159 */ 6160 xmlAutomataStatePtr 6161 xmlAutomataNewState(xmlAutomataPtr am) { 6162 xmlAutomataStatePtr to; 6163 6164 if (am == NULL) 6165 return(NULL); 6166 to = xmlRegNewState(am); 6167 xmlRegStatePush(am, to); 6168 return(to); 6169 } 6170 6171 /** 6172 * xmlAutomataNewEpsilon: 6173 * @am: an automata 6174 * @from: the starting point of the transition 6175 * @to: the target point of the transition or NULL 6176 * 6177 * If @to is NULL, this creates first a new target state in the automata 6178 * and then adds an epsilon transition from the @from state to the 6179 * target state 6180 * 6181 * Returns the target state or NULL in case of error 6182 */ 6183 xmlAutomataStatePtr 6184 xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from, 6185 xmlAutomataStatePtr to) { 6186 if ((am == NULL) || (from == NULL)) 6187 return(NULL); 6188 xmlFAGenerateEpsilonTransition(am, from, to); 6189 if (to == NULL) 6190 return(am->state); 6191 return(to); 6192 } 6193 6194 /** 6195 * xmlAutomataNewAllTrans: 6196 * @am: an automata 6197 * @from: the starting point of the transition 6198 * @to: the target point of the transition or NULL 6199 * @lax: allow to transition if not all all transitions have been activated 6200 * 6201 * If @to is NULL, this creates first a new target state in the automata 6202 * and then adds a an ALL transition from the @from state to the 6203 * target state. That transition is an epsilon transition allowed only when 6204 * all transitions from the @from node have been activated. 6205 * 6206 * Returns the target state or NULL in case of error 6207 */ 6208 xmlAutomataStatePtr 6209 xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6210 xmlAutomataStatePtr to, int lax) { 6211 if ((am == NULL) || (from == NULL)) 6212 return(NULL); 6213 xmlFAGenerateAllTransition(am, from, to, lax); 6214 if (to == NULL) 6215 return(am->state); 6216 return(to); 6217 } 6218 6219 /** 6220 * xmlAutomataNewCounter: 6221 * @am: an automata 6222 * @min: the minimal value on the counter 6223 * @max: the maximal value on the counter 6224 * 6225 * Create a new counter 6226 * 6227 * Returns the counter number or -1 in case of error 6228 */ 6229 int 6230 xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) { 6231 int ret; 6232 6233 if (am == NULL) 6234 return(-1); 6235 6236 ret = xmlRegGetCounter(am); 6237 if (ret < 0) 6238 return(-1); 6239 am->counters[ret].min = min; 6240 am->counters[ret].max = max; 6241 return(ret); 6242 } 6243 6244 /** 6245 * xmlAutomataNewCountedTrans: 6246 * @am: an automata 6247 * @from: the starting point of the transition 6248 * @to: the target point of the transition or NULL 6249 * @counter: the counter associated to that transition 6250 * 6251 * If @to is NULL, this creates first a new target state in the automata 6252 * and then adds an epsilon transition from the @from state to the target state 6253 * which will increment the counter provided 6254 * 6255 * Returns the target state or NULL in case of error 6256 */ 6257 xmlAutomataStatePtr 6258 xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6259 xmlAutomataStatePtr to, int counter) { 6260 if ((am == NULL) || (from == NULL) || (counter < 0)) 6261 return(NULL); 6262 xmlFAGenerateCountedEpsilonTransition(am, from, to, counter); 6263 if (to == NULL) 6264 return(am->state); 6265 return(to); 6266 } 6267 6268 /** 6269 * xmlAutomataNewCounterTrans: 6270 * @am: an automata 6271 * @from: the starting point of the transition 6272 * @to: the target point of the transition or NULL 6273 * @counter: the counter associated to that transition 6274 * 6275 * If @to is NULL, this creates first a new target state in the automata 6276 * and then adds an epsilon transition from the @from state to the target state 6277 * which will be allowed only if the counter is within the right range. 6278 * 6279 * Returns the target state or NULL in case of error 6280 */ 6281 xmlAutomataStatePtr 6282 xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6283 xmlAutomataStatePtr to, int counter) { 6284 if ((am == NULL) || (from == NULL) || (counter < 0)) 6285 return(NULL); 6286 xmlFAGenerateCountedTransition(am, from, to, counter); 6287 if (to == NULL) 6288 return(am->state); 6289 return(to); 6290 } 6291 6292 /** 6293 * xmlAutomataCompile: 6294 * @am: an automata 6295 * 6296 * Compile the automata into a Reg Exp ready for being executed. 6297 * The automata should be free after this point. 6298 * 6299 * Returns the compiled regexp or NULL in case of error 6300 */ 6301 xmlRegexpPtr 6302 xmlAutomataCompile(xmlAutomataPtr am) { 6303 xmlRegexpPtr ret; 6304 6305 if ((am == NULL) || (am->error != 0)) return(NULL); 6306 xmlFAEliminateEpsilonTransitions(am); 6307 /* xmlFAComputesDeterminism(am); */ 6308 ret = xmlRegEpxFromParse(am); 6309 6310 return(ret); 6311 } 6312 6313 /** 6314 * xmlAutomataIsDeterminist: 6315 * @am: an automata 6316 * 6317 * Checks if an automata is determinist. 6318 * 6319 * Returns 1 if true, 0 if not, and -1 in case of error 6320 */ 6321 int 6322 xmlAutomataIsDeterminist(xmlAutomataPtr am) { 6323 int ret; 6324 6325 if (am == NULL) 6326 return(-1); 6327 6328 ret = xmlFAComputesDeterminism(am); 6329 return(ret); 6330 } 6331 #endif /* LIBXML_AUTOMATA_ENABLED */ 6332 6333 #ifdef LIBXML_EXPR_ENABLED 6334 /************************************************************************ 6335 * * 6336 * Formal Expression handling code * 6337 * * 6338 ************************************************************************/ 6339 /************************************************************************ 6340 * * 6341 * Expression handling context * 6342 * * 6343 ************************************************************************/ 6344 6345 struct _xmlExpCtxt { 6346 xmlDictPtr dict; 6347 xmlExpNodePtr *table; 6348 int size; 6349 int nbElems; 6350 int nb_nodes; 6351 int maxNodes; 6352 const char *expr; 6353 const char *cur; 6354 int nb_cons; 6355 int tabSize; 6356 }; 6357 6358 /** 6359 * xmlExpNewCtxt: 6360 * @maxNodes: the maximum number of nodes 6361 * @dict: optional dictionary to use internally 6362 * 6363 * Creates a new context for manipulating expressions 6364 * 6365 * Returns the context or NULL in case of error 6366 */ 6367 xmlExpCtxtPtr 6368 xmlExpNewCtxt(int maxNodes, xmlDictPtr dict) { 6369 xmlExpCtxtPtr ret; 6370 int size = 256; 6371 6372 if (maxNodes <= 4096) 6373 maxNodes = 4096; 6374 6375 ret = (xmlExpCtxtPtr) xmlMalloc(sizeof(xmlExpCtxt)); 6376 if (ret == NULL) 6377 return(NULL); 6378 memset(ret, 0, sizeof(xmlExpCtxt)); 6379 ret->size = size; 6380 ret->nbElems = 0; 6381 ret->maxNodes = maxNodes; 6382 ret->table = xmlMalloc(size * sizeof(xmlExpNodePtr)); 6383 if (ret->table == NULL) { 6384 xmlFree(ret); 6385 return(NULL); 6386 } 6387 memset(ret->table, 0, size * sizeof(xmlExpNodePtr)); 6388 if (dict == NULL) { 6389 ret->dict = xmlDictCreate(); 6390 if (ret->dict == NULL) { 6391 xmlFree(ret->table); 6392 xmlFree(ret); 6393 return(NULL); 6394 } 6395 } else { 6396 ret->dict = dict; 6397 xmlDictReference(ret->dict); 6398 } 6399 return(ret); 6400 } 6401 6402 /** 6403 * xmlExpFreeCtxt: 6404 * @ctxt: an expression context 6405 * 6406 * Free an expression context 6407 */ 6408 void 6409 xmlExpFreeCtxt(xmlExpCtxtPtr ctxt) { 6410 if (ctxt == NULL) 6411 return; 6412 xmlDictFree(ctxt->dict); 6413 if (ctxt->table != NULL) 6414 xmlFree(ctxt->table); 6415 xmlFree(ctxt); 6416 } 6417 6418 /************************************************************************ 6419 * * 6420 * Structure associated to an expression node * 6421 * * 6422 ************************************************************************/ 6423 #define MAX_NODES 10000 6424 6425 /* #define DEBUG_DERIV */ 6426 6427 /* 6428 * TODO: 6429 * - Wildcards 6430 * - public API for creation 6431 * 6432 * Started 6433 * - regression testing 6434 * 6435 * Done 6436 * - split into module and test tool 6437 * - memleaks 6438 */ 6439 6440 typedef enum { 6441 XML_EXP_NILABLE = (1 << 0) 6442 } xmlExpNodeInfo; 6443 6444 #define IS_NILLABLE(node) ((node)->info & XML_EXP_NILABLE) 6445 6446 struct _xmlExpNode { 6447 unsigned char type;/* xmlExpNodeType */ 6448 unsigned char info;/* OR of xmlExpNodeInfo */ 6449 unsigned short key; /* the hash key */ 6450 unsigned int ref; /* The number of references */ 6451 int c_max; /* the maximum length it can consume */ 6452 xmlExpNodePtr exp_left; 6453 xmlExpNodePtr next;/* the next node in the hash table or free list */ 6454 union { 6455 struct { 6456 int f_min; 6457 int f_max; 6458 } count; 6459 struct { 6460 xmlExpNodePtr f_right; 6461 } children; 6462 const xmlChar *f_str; 6463 } field; 6464 }; 6465 6466 #define exp_min field.count.f_min 6467 #define exp_max field.count.f_max 6468 /* #define exp_left field.children.f_left */ 6469 #define exp_right field.children.f_right 6470 #define exp_str field.f_str 6471 6472 static xmlExpNodePtr xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type); 6473 static xmlExpNode forbiddenExpNode = { 6474 XML_EXP_FORBID, 0, 0, 0, 0, NULL, NULL, {{ 0, 0}} 6475 }; 6476 xmlExpNodePtr forbiddenExp = &forbiddenExpNode; 6477 static xmlExpNode emptyExpNode = { 6478 XML_EXP_EMPTY, 1, 0, 0, 0, NULL, NULL, {{ 0, 0}} 6479 }; 6480 xmlExpNodePtr emptyExp = &emptyExpNode; 6481 6482 /************************************************************************ 6483 * * 6484 * The custom hash table for unicity and canonicalization * 6485 * of sub-expressions pointers * 6486 * * 6487 ************************************************************************/ 6488 /* 6489 * xmlExpHashNameComputeKey: 6490 * Calculate the hash key for a token 6491 */ 6492 static unsigned short 6493 xmlExpHashNameComputeKey(const xmlChar *name) { 6494 unsigned short value = 0L; 6495 char ch; 6496 6497 if (name != NULL) { 6498 value += 30 * (*name); 6499 while ((ch = *name++) != 0) { 6500 value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch); 6501 } 6502 } 6503 return (value); 6504 } 6505 6506 /* 6507 * xmlExpHashComputeKey: 6508 * Calculate the hash key for a compound expression 6509 */ 6510 static unsigned short 6511 xmlExpHashComputeKey(xmlExpNodeType type, xmlExpNodePtr left, 6512 xmlExpNodePtr right) { 6513 unsigned long value; 6514 unsigned short ret; 6515 6516 switch (type) { 6517 case XML_EXP_SEQ: 6518 value = left->key; 6519 value += right->key; 6520 value *= 3; 6521 ret = (unsigned short) value; 6522 break; 6523 case XML_EXP_OR: 6524 value = left->key; 6525 value += right->key; 6526 value *= 7; 6527 ret = (unsigned short) value; 6528 break; 6529 case XML_EXP_COUNT: 6530 value = left->key; 6531 value += right->key; 6532 ret = (unsigned short) value; 6533 break; 6534 default: 6535 ret = 0; 6536 } 6537 return(ret); 6538 } 6539 6540 6541 static xmlExpNodePtr 6542 xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type) { 6543 xmlExpNodePtr ret; 6544 6545 if (ctxt->nb_nodes >= MAX_NODES) 6546 return(NULL); 6547 ret = (xmlExpNodePtr) xmlMalloc(sizeof(xmlExpNode)); 6548 if (ret == NULL) 6549 return(NULL); 6550 memset(ret, 0, sizeof(xmlExpNode)); 6551 ret->type = type; 6552 ret->next = NULL; 6553 ctxt->nb_nodes++; 6554 ctxt->nb_cons++; 6555 return(ret); 6556 } 6557 6558 /** 6559 * xmlExpHashGetEntry: 6560 * @table: the hash table 6561 * 6562 * Get the unique entry from the hash table. The entry is created if 6563 * needed. @left and @right are consumed, i.e. their ref count will 6564 * be decremented by the operation. 6565 * 6566 * Returns the pointer or NULL in case of error 6567 */ 6568 static xmlExpNodePtr 6569 xmlExpHashGetEntry(xmlExpCtxtPtr ctxt, xmlExpNodeType type, 6570 xmlExpNodePtr left, xmlExpNodePtr right, 6571 const xmlChar *name, int min, int max) { 6572 unsigned short kbase, key; 6573 xmlExpNodePtr entry; 6574 xmlExpNodePtr insert; 6575 6576 if (ctxt == NULL) 6577 return(NULL); 6578 6579 /* 6580 * Check for duplicate and insertion location. 6581 */ 6582 if (type == XML_EXP_ATOM) { 6583 kbase = xmlExpHashNameComputeKey(name); 6584 } else if (type == XML_EXP_COUNT) { 6585 /* COUNT reduction rule 1 */ 6586 /* a{1} -> a */ 6587 if (min == max) { 6588 if (min == 1) { 6589 return(left); 6590 } 6591 if (min == 0) { 6592 xmlExpFree(ctxt, left); 6593 return(emptyExp); 6594 } 6595 } 6596 if (min < 0) { 6597 xmlExpFree(ctxt, left); 6598 return(forbiddenExp); 6599 } 6600 if (max == -1) 6601 kbase = min + 79; 6602 else 6603 kbase = max - min; 6604 kbase += left->key; 6605 } else if (type == XML_EXP_OR) { 6606 /* Forbid reduction rules */ 6607 if (left->type == XML_EXP_FORBID) { 6608 xmlExpFree(ctxt, left); 6609 return(right); 6610 } 6611 if (right->type == XML_EXP_FORBID) { 6612 xmlExpFree(ctxt, right); 6613 return(left); 6614 } 6615 6616 /* OR reduction rule 1 */ 6617 /* a | a reduced to a */ 6618 if (left == right) { 6619 left->ref--; 6620 return(left); 6621 } 6622 /* OR canonicalization rule 1 */ 6623 /* linearize (a | b) | c into a | (b | c) */ 6624 if ((left->type == XML_EXP_OR) && (right->type != XML_EXP_OR)) { 6625 xmlExpNodePtr tmp = left; 6626 left = right; 6627 right = tmp; 6628 } 6629 /* OR reduction rule 2 */ 6630 /* a | (a | b) and b | (a | b) are reduced to a | b */ 6631 if (right->type == XML_EXP_OR) { 6632 if ((left == right->exp_left) || 6633 (left == right->exp_right)) { 6634 xmlExpFree(ctxt, left); 6635 return(right); 6636 } 6637 } 6638 /* OR canonicalization rule 2 */ 6639 /* linearize (a | b) | c into a | (b | c) */ 6640 if (left->type == XML_EXP_OR) { 6641 xmlExpNodePtr tmp; 6642 6643 /* OR canonicalization rule 2 */ 6644 if ((left->exp_right->type != XML_EXP_OR) && 6645 (left->exp_right->key < left->exp_left->key)) { 6646 tmp = left->exp_right; 6647 left->exp_right = left->exp_left; 6648 left->exp_left = tmp; 6649 } 6650 left->exp_right->ref++; 6651 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_right, right, 6652 NULL, 0, 0); 6653 left->exp_left->ref++; 6654 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_left, tmp, 6655 NULL, 0, 0); 6656 6657 xmlExpFree(ctxt, left); 6658 return(tmp); 6659 } 6660 if (right->type == XML_EXP_OR) { 6661 /* Ordering in the tree */ 6662 /* C | (A | B) -> A | (B | C) */ 6663 if (left->key > right->exp_right->key) { 6664 xmlExpNodePtr tmp; 6665 right->exp_right->ref++; 6666 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_right, 6667 left, NULL, 0, 0); 6668 right->exp_left->ref++; 6669 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left, 6670 tmp, NULL, 0, 0); 6671 xmlExpFree(ctxt, right); 6672 return(tmp); 6673 } 6674 /* Ordering in the tree */ 6675 /* B | (A | C) -> A | (B | C) */ 6676 if (left->key > right->exp_left->key) { 6677 xmlExpNodePtr tmp; 6678 right->exp_right->ref++; 6679 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, 6680 right->exp_right, NULL, 0, 0); 6681 right->exp_left->ref++; 6682 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left, 6683 tmp, NULL, 0, 0); 6684 xmlExpFree(ctxt, right); 6685 return(tmp); 6686 } 6687 } 6688 /* we know both types are != XML_EXP_OR here */ 6689 else if (left->key > right->key) { 6690 xmlExpNodePtr tmp = left; 6691 left = right; 6692 right = tmp; 6693 } 6694 kbase = xmlExpHashComputeKey(type, left, right); 6695 } else if (type == XML_EXP_SEQ) { 6696 /* Forbid reduction rules */ 6697 if (left->type == XML_EXP_FORBID) { 6698 xmlExpFree(ctxt, right); 6699 return(left); 6700 } 6701 if (right->type == XML_EXP_FORBID) { 6702 xmlExpFree(ctxt, left); 6703 return(right); 6704 } 6705 /* Empty reduction rules */ 6706 if (right->type == XML_EXP_EMPTY) { 6707 return(left); 6708 } 6709 if (left->type == XML_EXP_EMPTY) { 6710 return(right); 6711 } 6712 kbase = xmlExpHashComputeKey(type, left, right); 6713 } else 6714 return(NULL); 6715 6716 key = kbase % ctxt->size; 6717 if (ctxt->table[key] != NULL) { 6718 for (insert = ctxt->table[key]; insert != NULL; 6719 insert = insert->next) { 6720 if ((insert->key == kbase) && 6721 (insert->type == type)) { 6722 if (type == XML_EXP_ATOM) { 6723 if (name == insert->exp_str) { 6724 insert->ref++; 6725 return(insert); 6726 } 6727 } else if (type == XML_EXP_COUNT) { 6728 if ((insert->exp_min == min) && (insert->exp_max == max) && 6729 (insert->exp_left == left)) { 6730 insert->ref++; 6731 left->ref--; 6732 return(insert); 6733 } 6734 } else if ((insert->exp_left == left) && 6735 (insert->exp_right == right)) { 6736 insert->ref++; 6737 left->ref--; 6738 right->ref--; 6739 return(insert); 6740 } 6741 } 6742 } 6743 } 6744 6745 entry = xmlExpNewNode(ctxt, type); 6746 if (entry == NULL) 6747 return(NULL); 6748 entry->key = kbase; 6749 if (type == XML_EXP_ATOM) { 6750 entry->exp_str = name; 6751 entry->c_max = 1; 6752 } else if (type == XML_EXP_COUNT) { 6753 entry->exp_min = min; 6754 entry->exp_max = max; 6755 entry->exp_left = left; 6756 if ((min == 0) || (IS_NILLABLE(left))) 6757 entry->info |= XML_EXP_NILABLE; 6758 if (max < 0) 6759 entry->c_max = -1; 6760 else 6761 entry->c_max = max * entry->exp_left->c_max; 6762 } else { 6763 entry->exp_left = left; 6764 entry->exp_right = right; 6765 if (type == XML_EXP_OR) { 6766 if ((IS_NILLABLE(left)) || (IS_NILLABLE(right))) 6767 entry->info |= XML_EXP_NILABLE; 6768 if ((entry->exp_left->c_max == -1) || 6769 (entry->exp_right->c_max == -1)) 6770 entry->c_max = -1; 6771 else if (entry->exp_left->c_max > entry->exp_right->c_max) 6772 entry->c_max = entry->exp_left->c_max; 6773 else 6774 entry->c_max = entry->exp_right->c_max; 6775 } else { 6776 if ((IS_NILLABLE(left)) && (IS_NILLABLE(right))) 6777 entry->info |= XML_EXP_NILABLE; 6778 if ((entry->exp_left->c_max == -1) || 6779 (entry->exp_right->c_max == -1)) 6780 entry->c_max = -1; 6781 else 6782 entry->c_max = entry->exp_left->c_max + entry->exp_right->c_max; 6783 } 6784 } 6785 entry->ref = 1; 6786 if (ctxt->table[key] != NULL) 6787 entry->next = ctxt->table[key]; 6788 6789 ctxt->table[key] = entry; 6790 ctxt->nbElems++; 6791 6792 return(entry); 6793 } 6794 6795 /** 6796 * xmlExpFree: 6797 * @ctxt: the expression context 6798 * @exp: the expression 6799 * 6800 * Dereference the expression 6801 */ 6802 void 6803 xmlExpFree(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp) { 6804 if ((exp == NULL) || (exp == forbiddenExp) || (exp == emptyExp)) 6805 return; 6806 exp->ref--; 6807 if (exp->ref == 0) { 6808 unsigned short key; 6809 6810 /* Unlink it first from the hash table */ 6811 key = exp->key % ctxt->size; 6812 if (ctxt->table[key] == exp) { 6813 ctxt->table[key] = exp->next; 6814 } else { 6815 xmlExpNodePtr tmp; 6816 6817 tmp = ctxt->table[key]; 6818 while (tmp != NULL) { 6819 if (tmp->next == exp) { 6820 tmp->next = exp->next; 6821 break; 6822 } 6823 tmp = tmp->next; 6824 } 6825 } 6826 6827 if ((exp->type == XML_EXP_SEQ) || (exp->type == XML_EXP_OR)) { 6828 xmlExpFree(ctxt, exp->exp_left); 6829 xmlExpFree(ctxt, exp->exp_right); 6830 } else if (exp->type == XML_EXP_COUNT) { 6831 xmlExpFree(ctxt, exp->exp_left); 6832 } 6833 xmlFree(exp); 6834 ctxt->nb_nodes--; 6835 } 6836 } 6837 6838 /** 6839 * xmlExpRef: 6840 * @exp: the expression 6841 * 6842 * Increase the reference count of the expression 6843 */ 6844 void 6845 xmlExpRef(xmlExpNodePtr exp) { 6846 if (exp != NULL) 6847 exp->ref++; 6848 } 6849 6850 /** 6851 * xmlExpNewAtom: 6852 * @ctxt: the expression context 6853 * @name: the atom name 6854 * @len: the atom name length in byte (or -1); 6855 * 6856 * Get the atom associated to this name from that context 6857 * 6858 * Returns the node or NULL in case of error 6859 */ 6860 xmlExpNodePtr 6861 xmlExpNewAtom(xmlExpCtxtPtr ctxt, const xmlChar *name, int len) { 6862 if ((ctxt == NULL) || (name == NULL)) 6863 return(NULL); 6864 name = xmlDictLookup(ctxt->dict, name, len); 6865 if (name == NULL) 6866 return(NULL); 6867 return(xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, name, 0, 0)); 6868 } 6869 6870 /** 6871 * xmlExpNewOr: 6872 * @ctxt: the expression context 6873 * @left: left expression 6874 * @right: right expression 6875 * 6876 * Get the atom associated to the choice @left | @right 6877 * Note that @left and @right are consumed in the operation, to keep 6878 * an handle on them use xmlExpRef() and use xmlExpFree() to release them, 6879 * this is true even in case of failure (unless ctxt == NULL). 6880 * 6881 * Returns the node or NULL in case of error 6882 */ 6883 xmlExpNodePtr 6884 xmlExpNewOr(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) { 6885 if (ctxt == NULL) 6886 return(NULL); 6887 if ((left == NULL) || (right == NULL)) { 6888 xmlExpFree(ctxt, left); 6889 xmlExpFree(ctxt, right); 6890 return(NULL); 6891 } 6892 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, right, NULL, 0, 0)); 6893 } 6894 6895 /** 6896 * xmlExpNewSeq: 6897 * @ctxt: the expression context 6898 * @left: left expression 6899 * @right: right expression 6900 * 6901 * Get the atom associated to the sequence @left , @right 6902 * Note that @left and @right are consumed in the operation, to keep 6903 * an handle on them use xmlExpRef() and use xmlExpFree() to release them, 6904 * this is true even in case of failure (unless ctxt == NULL). 6905 * 6906 * Returns the node or NULL in case of error 6907 */ 6908 xmlExpNodePtr 6909 xmlExpNewSeq(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) { 6910 if (ctxt == NULL) 6911 return(NULL); 6912 if ((left == NULL) || (right == NULL)) { 6913 xmlExpFree(ctxt, left); 6914 xmlExpFree(ctxt, right); 6915 return(NULL); 6916 } 6917 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, left, right, NULL, 0, 0)); 6918 } 6919 6920 /** 6921 * xmlExpNewRange: 6922 * @ctxt: the expression context 6923 * @subset: the expression to be repeated 6924 * @min: the lower bound for the repetition 6925 * @max: the upper bound for the repetition, -1 means infinite 6926 * 6927 * Get the atom associated to the range (@subset){@min, @max} 6928 * Note that @subset is consumed in the operation, to keep 6929 * an handle on it use xmlExpRef() and use xmlExpFree() to release it, 6930 * this is true even in case of failure (unless ctxt == NULL). 6931 * 6932 * Returns the node or NULL in case of error 6933 */ 6934 xmlExpNodePtr 6935 xmlExpNewRange(xmlExpCtxtPtr ctxt, xmlExpNodePtr subset, int min, int max) { 6936 if (ctxt == NULL) 6937 return(NULL); 6938 if ((subset == NULL) || (min < 0) || (max < -1) || 6939 ((max >= 0) && (min > max))) { 6940 xmlExpFree(ctxt, subset); 6941 return(NULL); 6942 } 6943 return(xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, subset, 6944 NULL, NULL, min, max)); 6945 } 6946 6947 /************************************************************************ 6948 * * 6949 * Public API for operations on expressions * 6950 * * 6951 ************************************************************************/ 6952 6953 static int 6954 xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 6955 const xmlChar**list, int len, int nb) { 6956 int tmp, tmp2; 6957 tail: 6958 switch (exp->type) { 6959 case XML_EXP_EMPTY: 6960 return(0); 6961 case XML_EXP_ATOM: 6962 for (tmp = 0;tmp < nb;tmp++) 6963 if (list[tmp] == exp->exp_str) 6964 return(0); 6965 if (nb >= len) 6966 return(-2); 6967 list[nb] = exp->exp_str; 6968 return(1); 6969 case XML_EXP_COUNT: 6970 exp = exp->exp_left; 6971 goto tail; 6972 case XML_EXP_SEQ: 6973 case XML_EXP_OR: 6974 tmp = xmlExpGetLanguageInt(ctxt, exp->exp_left, list, len, nb); 6975 if (tmp < 0) 6976 return(tmp); 6977 tmp2 = xmlExpGetLanguageInt(ctxt, exp->exp_right, list, len, 6978 nb + tmp); 6979 if (tmp2 < 0) 6980 return(tmp2); 6981 return(tmp + tmp2); 6982 } 6983 return(-1); 6984 } 6985 6986 /** 6987 * xmlExpGetLanguage: 6988 * @ctxt: the expression context 6989 * @exp: the expression 6990 * @langList: where to store the tokens 6991 * @len: the allocated length of @list 6992 * 6993 * Find all the strings used in @exp and store them in @list 6994 * 6995 * Returns the number of unique strings found, -1 in case of errors and 6996 * -2 if there is more than @len strings 6997 */ 6998 int 6999 xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7000 const xmlChar**langList, int len) { 7001 if ((ctxt == NULL) || (exp == NULL) || (langList == NULL) || (len <= 0)) 7002 return(-1); 7003 return(xmlExpGetLanguageInt(ctxt, exp, langList, len, 0)); 7004 } 7005 7006 static int 7007 xmlExpGetStartInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7008 const xmlChar**list, int len, int nb) { 7009 int tmp, tmp2; 7010 tail: 7011 switch (exp->type) { 7012 case XML_EXP_FORBID: 7013 return(0); 7014 case XML_EXP_EMPTY: 7015 return(0); 7016 case XML_EXP_ATOM: 7017 for (tmp = 0;tmp < nb;tmp++) 7018 if (list[tmp] == exp->exp_str) 7019 return(0); 7020 if (nb >= len) 7021 return(-2); 7022 list[nb] = exp->exp_str; 7023 return(1); 7024 case XML_EXP_COUNT: 7025 exp = exp->exp_left; 7026 goto tail; 7027 case XML_EXP_SEQ: 7028 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb); 7029 if (tmp < 0) 7030 return(tmp); 7031 if (IS_NILLABLE(exp->exp_left)) { 7032 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len, 7033 nb + tmp); 7034 if (tmp2 < 0) 7035 return(tmp2); 7036 tmp += tmp2; 7037 } 7038 return(tmp); 7039 case XML_EXP_OR: 7040 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb); 7041 if (tmp < 0) 7042 return(tmp); 7043 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len, 7044 nb + tmp); 7045 if (tmp2 < 0) 7046 return(tmp2); 7047 return(tmp + tmp2); 7048 } 7049 return(-1); 7050 } 7051 7052 /** 7053 * xmlExpGetStart: 7054 * @ctxt: the expression context 7055 * @exp: the expression 7056 * @tokList: where to store the tokens 7057 * @len: the allocated length of @list 7058 * 7059 * Find all the strings that appears at the start of the languages 7060 * accepted by @exp and store them in @list. E.g. for (a, b) | c 7061 * it will return the list [a, c] 7062 * 7063 * Returns the number of unique strings found, -1 in case of errors and 7064 * -2 if there is more than @len strings 7065 */ 7066 int 7067 xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7068 const xmlChar**tokList, int len) { 7069 if ((ctxt == NULL) || (exp == NULL) || (tokList == NULL) || (len <= 0)) 7070 return(-1); 7071 return(xmlExpGetStartInt(ctxt, exp, tokList, len, 0)); 7072 } 7073 7074 /** 7075 * xmlExpIsNillable: 7076 * @exp: the expression 7077 * 7078 * Finds if the expression is nillable, i.e. if it accepts the empty sequqnce 7079 * 7080 * Returns 1 if nillable, 0 if not and -1 in case of error 7081 */ 7082 int 7083 xmlExpIsNillable(xmlExpNodePtr exp) { 7084 if (exp == NULL) 7085 return(-1); 7086 return(IS_NILLABLE(exp) != 0); 7087 } 7088 7089 static xmlExpNodePtr 7090 xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar *str) 7091 { 7092 xmlExpNodePtr ret; 7093 7094 switch (exp->type) { 7095 case XML_EXP_EMPTY: 7096 return(forbiddenExp); 7097 case XML_EXP_FORBID: 7098 return(forbiddenExp); 7099 case XML_EXP_ATOM: 7100 if (exp->exp_str == str) { 7101 #ifdef DEBUG_DERIV 7102 printf("deriv atom: equal => Empty\n"); 7103 #endif 7104 ret = emptyExp; 7105 } else { 7106 #ifdef DEBUG_DERIV 7107 printf("deriv atom: mismatch => forbid\n"); 7108 #endif 7109 /* TODO wildcards here */ 7110 ret = forbiddenExp; 7111 } 7112 return(ret); 7113 case XML_EXP_OR: { 7114 xmlExpNodePtr tmp; 7115 7116 #ifdef DEBUG_DERIV 7117 printf("deriv or: => or(derivs)\n"); 7118 #endif 7119 tmp = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 7120 if (tmp == NULL) { 7121 return(NULL); 7122 } 7123 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str); 7124 if (ret == NULL) { 7125 xmlExpFree(ctxt, tmp); 7126 return(NULL); 7127 } 7128 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, 7129 NULL, 0, 0); 7130 return(ret); 7131 } 7132 case XML_EXP_SEQ: 7133 #ifdef DEBUG_DERIV 7134 printf("deriv seq: starting with left\n"); 7135 #endif 7136 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 7137 if (ret == NULL) { 7138 return(NULL); 7139 } else if (ret == forbiddenExp) { 7140 if (IS_NILLABLE(exp->exp_left)) { 7141 #ifdef DEBUG_DERIV 7142 printf("deriv seq: left failed but nillable\n"); 7143 #endif 7144 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str); 7145 } 7146 } else { 7147 #ifdef DEBUG_DERIV 7148 printf("deriv seq: left match => sequence\n"); 7149 #endif 7150 exp->exp_right->ref++; 7151 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, exp->exp_right, 7152 NULL, 0, 0); 7153 } 7154 return(ret); 7155 case XML_EXP_COUNT: { 7156 int min, max; 7157 xmlExpNodePtr tmp; 7158 7159 if (exp->exp_max == 0) 7160 return(forbiddenExp); 7161 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 7162 if (ret == NULL) 7163 return(NULL); 7164 if (ret == forbiddenExp) { 7165 #ifdef DEBUG_DERIV 7166 printf("deriv count: pattern mismatch => forbid\n"); 7167 #endif 7168 return(ret); 7169 } 7170 if (exp->exp_max == 1) 7171 return(ret); 7172 if (exp->exp_max < 0) /* unbounded */ 7173 max = -1; 7174 else 7175 max = exp->exp_max - 1; 7176 if (exp->exp_min > 0) 7177 min = exp->exp_min - 1; 7178 else 7179 min = 0; 7180 exp->exp_left->ref++; 7181 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, NULL, 7182 NULL, min, max); 7183 if (ret == emptyExp) { 7184 #ifdef DEBUG_DERIV 7185 printf("deriv count: match to empty => new count\n"); 7186 #endif 7187 return(tmp); 7188 } 7189 #ifdef DEBUG_DERIV 7190 printf("deriv count: match => sequence with new count\n"); 7191 #endif 7192 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, tmp, 7193 NULL, 0, 0)); 7194 } 7195 } 7196 return(NULL); 7197 } 7198 7199 /** 7200 * xmlExpStringDerive: 7201 * @ctxt: the expression context 7202 * @exp: the expression 7203 * @str: the string 7204 * @len: the string len in bytes if available 7205 * 7206 * Do one step of Brzozowski derivation of the expression @exp with 7207 * respect to the input string 7208 * 7209 * Returns the resulting expression or NULL in case of internal error 7210 */ 7211 xmlExpNodePtr 7212 xmlExpStringDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7213 const xmlChar *str, int len) { 7214 const xmlChar *input; 7215 7216 if ((exp == NULL) || (ctxt == NULL) || (str == NULL)) { 7217 return(NULL); 7218 } 7219 /* 7220 * check the string is in the dictionary, if yes use an interned 7221 * copy, otherwise we know it's not an acceptable input 7222 */ 7223 input = xmlDictExists(ctxt->dict, str, len); 7224 if (input == NULL) { 7225 return(forbiddenExp); 7226 } 7227 return(xmlExpStringDeriveInt(ctxt, exp, input)); 7228 } 7229 7230 static int 7231 xmlExpCheckCard(xmlExpNodePtr exp, xmlExpNodePtr sub) { 7232 int ret = 1; 7233 7234 if (sub->c_max == -1) { 7235 if (exp->c_max != -1) 7236 ret = 0; 7237 } else if ((exp->c_max >= 0) && (exp->c_max < sub->c_max)) { 7238 ret = 0; 7239 } 7240 #if 0 7241 if ((IS_NILLABLE(sub)) && (!IS_NILLABLE(exp))) 7242 ret = 0; 7243 #endif 7244 return(ret); 7245 } 7246 7247 static xmlExpNodePtr xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7248 xmlExpNodePtr sub); 7249 /** 7250 * xmlExpDivide: 7251 * @ctxt: the expressions context 7252 * @exp: the englobing expression 7253 * @sub: the subexpression 7254 * @mult: the multiple expression 7255 * @remain: the remain from the derivation of the multiple 7256 * 7257 * Check if exp is a multiple of sub, i.e. if there is a finite number n 7258 * so that sub{n} subsume exp 7259 * 7260 * Returns the multiple value if successful, 0 if it is not a multiple 7261 * and -1 in case of internel error. 7262 */ 7263 7264 static int 7265 xmlExpDivide(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub, 7266 xmlExpNodePtr *mult, xmlExpNodePtr *remain) { 7267 int i; 7268 xmlExpNodePtr tmp, tmp2; 7269 7270 if (mult != NULL) *mult = NULL; 7271 if (remain != NULL) *remain = NULL; 7272 if (exp->c_max == -1) return(0); 7273 if (IS_NILLABLE(exp) && (!IS_NILLABLE(sub))) return(0); 7274 7275 for (i = 1;i <= exp->c_max;i++) { 7276 sub->ref++; 7277 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, 7278 sub, NULL, NULL, i, i); 7279 if (tmp == NULL) { 7280 return(-1); 7281 } 7282 if (!xmlExpCheckCard(tmp, exp)) { 7283 xmlExpFree(ctxt, tmp); 7284 continue; 7285 } 7286 tmp2 = xmlExpExpDeriveInt(ctxt, tmp, exp); 7287 if (tmp2 == NULL) { 7288 xmlExpFree(ctxt, tmp); 7289 return(-1); 7290 } 7291 if ((tmp2 != forbiddenExp) && (IS_NILLABLE(tmp2))) { 7292 if (remain != NULL) 7293 *remain = tmp2; 7294 else 7295 xmlExpFree(ctxt, tmp2); 7296 if (mult != NULL) 7297 *mult = tmp; 7298 else 7299 xmlExpFree(ctxt, tmp); 7300 #ifdef DEBUG_DERIV 7301 printf("Divide succeeded %d\n", i); 7302 #endif 7303 return(i); 7304 } 7305 xmlExpFree(ctxt, tmp); 7306 xmlExpFree(ctxt, tmp2); 7307 } 7308 #ifdef DEBUG_DERIV 7309 printf("Divide failed\n"); 7310 #endif 7311 return(0); 7312 } 7313 7314 /** 7315 * xmlExpExpDeriveInt: 7316 * @ctxt: the expressions context 7317 * @exp: the englobing expression 7318 * @sub: the subexpression 7319 * 7320 * Try to do a step of Brzozowski derivation but at a higher level 7321 * the input being a subexpression. 7322 * 7323 * Returns the resulting expression or NULL in case of internal error 7324 */ 7325 static xmlExpNodePtr 7326 xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7327 xmlExpNodePtr ret, tmp, tmp2, tmp3; 7328 const xmlChar **tab; 7329 int len, i; 7330 7331 /* 7332 * In case of equality and if the expression can only consume a finite 7333 * amount, then the derivation is empty 7334 */ 7335 if ((exp == sub) && (exp->c_max >= 0)) { 7336 #ifdef DEBUG_DERIV 7337 printf("Equal(exp, sub) and finite -> Empty\n"); 7338 #endif 7339 return(emptyExp); 7340 } 7341 /* 7342 * decompose sub sequence first 7343 */ 7344 if (sub->type == XML_EXP_EMPTY) { 7345 #ifdef DEBUG_DERIV 7346 printf("Empty(sub) -> Empty\n"); 7347 #endif 7348 exp->ref++; 7349 return(exp); 7350 } 7351 if (sub->type == XML_EXP_SEQ) { 7352 #ifdef DEBUG_DERIV 7353 printf("Seq(sub) -> decompose\n"); 7354 #endif 7355 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left); 7356 if (tmp == NULL) 7357 return(NULL); 7358 if (tmp == forbiddenExp) 7359 return(tmp); 7360 ret = xmlExpExpDeriveInt(ctxt, tmp, sub->exp_right); 7361 xmlExpFree(ctxt, tmp); 7362 return(ret); 7363 } 7364 if (sub->type == XML_EXP_OR) { 7365 #ifdef DEBUG_DERIV 7366 printf("Or(sub) -> decompose\n"); 7367 #endif 7368 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left); 7369 if (tmp == forbiddenExp) 7370 return(tmp); 7371 if (tmp == NULL) 7372 return(NULL); 7373 ret = xmlExpExpDeriveInt(ctxt, exp, sub->exp_right); 7374 if ((ret == NULL) || (ret == forbiddenExp)) { 7375 xmlExpFree(ctxt, tmp); 7376 return(ret); 7377 } 7378 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, NULL, 0, 0)); 7379 } 7380 if (!xmlExpCheckCard(exp, sub)) { 7381 #ifdef DEBUG_DERIV 7382 printf("CheckCard(exp, sub) failed -> Forbid\n"); 7383 #endif 7384 return(forbiddenExp); 7385 } 7386 switch (exp->type) { 7387 case XML_EXP_EMPTY: 7388 if (sub == emptyExp) 7389 return(emptyExp); 7390 #ifdef DEBUG_DERIV 7391 printf("Empty(exp) -> Forbid\n"); 7392 #endif 7393 return(forbiddenExp); 7394 case XML_EXP_FORBID: 7395 #ifdef DEBUG_DERIV 7396 printf("Forbid(exp) -> Forbid\n"); 7397 #endif 7398 return(forbiddenExp); 7399 case XML_EXP_ATOM: 7400 if (sub->type == XML_EXP_ATOM) { 7401 /* TODO: handle wildcards */ 7402 if (exp->exp_str == sub->exp_str) { 7403 #ifdef DEBUG_DERIV 7404 printf("Atom match -> Empty\n"); 7405 #endif 7406 return(emptyExp); 7407 } 7408 #ifdef DEBUG_DERIV 7409 printf("Atom mismatch -> Forbid\n"); 7410 #endif 7411 return(forbiddenExp); 7412 } 7413 if ((sub->type == XML_EXP_COUNT) && 7414 (sub->exp_max == 1) && 7415 (sub->exp_left->type == XML_EXP_ATOM)) { 7416 /* TODO: handle wildcards */ 7417 if (exp->exp_str == sub->exp_left->exp_str) { 7418 #ifdef DEBUG_DERIV 7419 printf("Atom match -> Empty\n"); 7420 #endif 7421 return(emptyExp); 7422 } 7423 #ifdef DEBUG_DERIV 7424 printf("Atom mismatch -> Forbid\n"); 7425 #endif 7426 return(forbiddenExp); 7427 } 7428 #ifdef DEBUG_DERIV 7429 printf("Compex exp vs Atom -> Forbid\n"); 7430 #endif 7431 return(forbiddenExp); 7432 case XML_EXP_SEQ: 7433 /* try to get the sequence consumed only if possible */ 7434 if (xmlExpCheckCard(exp->exp_left, sub)) { 7435 /* See if the sequence can be consumed directly */ 7436 #ifdef DEBUG_DERIV 7437 printf("Seq trying left only\n"); 7438 #endif 7439 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 7440 if ((ret != forbiddenExp) && (ret != NULL)) { 7441 #ifdef DEBUG_DERIV 7442 printf("Seq trying left only worked\n"); 7443 #endif 7444 /* 7445 * TODO: assumption here that we are determinist 7446 * i.e. we won't get to a nillable exp left 7447 * subset which could be matched by the right 7448 * part too. 7449 * e.g.: (a | b)+,(a | c) and 'a+,a' 7450 */ 7451 exp->exp_right->ref++; 7452 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, 7453 exp->exp_right, NULL, 0, 0)); 7454 } 7455 #ifdef DEBUG_DERIV 7456 } else { 7457 printf("Seq: left too short\n"); 7458 #endif 7459 } 7460 /* Try instead to decompose */ 7461 if (sub->type == XML_EXP_COUNT) { 7462 int min, max; 7463 7464 #ifdef DEBUG_DERIV 7465 printf("Seq: sub is a count\n"); 7466 #endif 7467 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left); 7468 if (ret == NULL) 7469 return(NULL); 7470 if (ret != forbiddenExp) { 7471 #ifdef DEBUG_DERIV 7472 printf("Seq , Count match on left\n"); 7473 #endif 7474 if (sub->exp_max < 0) 7475 max = -1; 7476 else 7477 max = sub->exp_max -1; 7478 if (sub->exp_min > 0) 7479 min = sub->exp_min -1; 7480 else 7481 min = 0; 7482 exp->exp_right->ref++; 7483 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, 7484 exp->exp_right, NULL, 0, 0); 7485 if (tmp == NULL) 7486 return(NULL); 7487 7488 sub->exp_left->ref++; 7489 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, 7490 sub->exp_left, NULL, NULL, min, max); 7491 if (tmp2 == NULL) { 7492 xmlExpFree(ctxt, tmp); 7493 return(NULL); 7494 } 7495 ret = xmlExpExpDeriveInt(ctxt, tmp, tmp2); 7496 xmlExpFree(ctxt, tmp); 7497 xmlExpFree(ctxt, tmp2); 7498 return(ret); 7499 } 7500 } 7501 /* we made no progress on structured operations */ 7502 break; 7503 case XML_EXP_OR: 7504 #ifdef DEBUG_DERIV 7505 printf("Or , trying both side\n"); 7506 #endif 7507 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 7508 if (ret == NULL) 7509 return(NULL); 7510 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_right, sub); 7511 if (tmp == NULL) { 7512 xmlExpFree(ctxt, ret); 7513 return(NULL); 7514 } 7515 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp, NULL, 0, 0)); 7516 case XML_EXP_COUNT: { 7517 int min, max; 7518 7519 if (sub->type == XML_EXP_COUNT) { 7520 /* 7521 * Try to see if the loop is completely subsumed 7522 */ 7523 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left); 7524 if (tmp == NULL) 7525 return(NULL); 7526 if (tmp == forbiddenExp) { 7527 int mult; 7528 7529 #ifdef DEBUG_DERIV 7530 printf("Count, Count inner don't subsume\n"); 7531 #endif 7532 mult = xmlExpDivide(ctxt, sub->exp_left, exp->exp_left, 7533 NULL, &tmp); 7534 if (mult <= 0) { 7535 #ifdef DEBUG_DERIV 7536 printf("Count, Count not multiple => forbidden\n"); 7537 #endif 7538 return(forbiddenExp); 7539 } 7540 if (sub->exp_max == -1) { 7541 max = -1; 7542 if (exp->exp_max == -1) { 7543 if (exp->exp_min <= sub->exp_min * mult) 7544 min = 0; 7545 else 7546 min = exp->exp_min - sub->exp_min * mult; 7547 } else { 7548 #ifdef DEBUG_DERIV 7549 printf("Count, Count finite can't subsume infinite\n"); 7550 #endif 7551 xmlExpFree(ctxt, tmp); 7552 return(forbiddenExp); 7553 } 7554 } else { 7555 if (exp->exp_max == -1) { 7556 #ifdef DEBUG_DERIV 7557 printf("Infinite loop consume mult finite loop\n"); 7558 #endif 7559 if (exp->exp_min > sub->exp_min * mult) { 7560 max = -1; 7561 min = exp->exp_min - sub->exp_min * mult; 7562 } else { 7563 max = -1; 7564 min = 0; 7565 } 7566 } else { 7567 if (exp->exp_max < sub->exp_max * mult) { 7568 #ifdef DEBUG_DERIV 7569 printf("loops max mult mismatch => forbidden\n"); 7570 #endif 7571 xmlExpFree(ctxt, tmp); 7572 return(forbiddenExp); 7573 } 7574 if (sub->exp_max * mult > exp->exp_min) 7575 min = 0; 7576 else 7577 min = exp->exp_min - sub->exp_max * mult; 7578 max = exp->exp_max - sub->exp_max * mult; 7579 } 7580 } 7581 } else if (!IS_NILLABLE(tmp)) { 7582 /* 7583 * TODO: loop here to try to grow if working on finite 7584 * blocks. 7585 */ 7586 #ifdef DEBUG_DERIV 7587 printf("Count, Count remain not nillable => forbidden\n"); 7588 #endif 7589 xmlExpFree(ctxt, tmp); 7590 return(forbiddenExp); 7591 } else if (sub->exp_max == -1) { 7592 if (exp->exp_max == -1) { 7593 if (exp->exp_min <= sub->exp_min) { 7594 #ifdef DEBUG_DERIV 7595 printf("Infinite loops Okay => COUNT(0,Inf)\n"); 7596 #endif 7597 max = -1; 7598 min = 0; 7599 } else { 7600 #ifdef DEBUG_DERIV 7601 printf("Infinite loops min => Count(X,Inf)\n"); 7602 #endif 7603 max = -1; 7604 min = exp->exp_min - sub->exp_min; 7605 } 7606 } else if (exp->exp_min > sub->exp_min) { 7607 #ifdef DEBUG_DERIV 7608 printf("loops min mismatch 1 => forbidden ???\n"); 7609 #endif 7610 xmlExpFree(ctxt, tmp); 7611 return(forbiddenExp); 7612 } else { 7613 max = -1; 7614 min = 0; 7615 } 7616 } else { 7617 if (exp->exp_max == -1) { 7618 #ifdef DEBUG_DERIV 7619 printf("Infinite loop consume finite loop\n"); 7620 #endif 7621 if (exp->exp_min > sub->exp_min) { 7622 max = -1; 7623 min = exp->exp_min - sub->exp_min; 7624 } else { 7625 max = -1; 7626 min = 0; 7627 } 7628 } else { 7629 if (exp->exp_max < sub->exp_max) { 7630 #ifdef DEBUG_DERIV 7631 printf("loops max mismatch => forbidden\n"); 7632 #endif 7633 xmlExpFree(ctxt, tmp); 7634 return(forbiddenExp); 7635 } 7636 if (sub->exp_max > exp->exp_min) 7637 min = 0; 7638 else 7639 min = exp->exp_min - sub->exp_max; 7640 max = exp->exp_max - sub->exp_max; 7641 } 7642 } 7643 #ifdef DEBUG_DERIV 7644 printf("loops match => SEQ(COUNT())\n"); 7645 #endif 7646 exp->exp_left->ref++; 7647 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, 7648 NULL, NULL, min, max); 7649 if (tmp2 == NULL) { 7650 return(NULL); 7651 } 7652 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2, 7653 NULL, 0, 0); 7654 return(ret); 7655 } 7656 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 7657 if (tmp == NULL) 7658 return(NULL); 7659 if (tmp == forbiddenExp) { 7660 #ifdef DEBUG_DERIV 7661 printf("loop mismatch => forbidden\n"); 7662 #endif 7663 return(forbiddenExp); 7664 } 7665 if (exp->exp_min > 0) 7666 min = exp->exp_min - 1; 7667 else 7668 min = 0; 7669 if (exp->exp_max < 0) 7670 max = -1; 7671 else 7672 max = exp->exp_max - 1; 7673 7674 #ifdef DEBUG_DERIV 7675 printf("loop match => SEQ(COUNT())\n"); 7676 #endif 7677 exp->exp_left->ref++; 7678 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, 7679 NULL, NULL, min, max); 7680 if (tmp2 == NULL) 7681 return(NULL); 7682 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2, 7683 NULL, 0, 0); 7684 return(ret); 7685 } 7686 } 7687 7688 #ifdef DEBUG_DERIV 7689 printf("Fallback to derivative\n"); 7690 #endif 7691 if (IS_NILLABLE(sub)) { 7692 if (!(IS_NILLABLE(exp))) 7693 return(forbiddenExp); 7694 else 7695 ret = emptyExp; 7696 } else 7697 ret = NULL; 7698 /* 7699 * here the structured derivation made no progress so 7700 * we use the default token based derivation to force one more step 7701 */ 7702 if (ctxt->tabSize == 0) 7703 ctxt->tabSize = 40; 7704 7705 tab = (const xmlChar **) xmlMalloc(ctxt->tabSize * 7706 sizeof(const xmlChar *)); 7707 if (tab == NULL) { 7708 return(NULL); 7709 } 7710 7711 /* 7712 * collect all the strings accepted by the subexpression on input 7713 */ 7714 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0); 7715 while (len < 0) { 7716 const xmlChar **temp; 7717 temp = (const xmlChar **) xmlRealloc((xmlChar **) tab, ctxt->tabSize * 2 * 7718 sizeof(const xmlChar *)); 7719 if (temp == NULL) { 7720 xmlFree((xmlChar **) tab); 7721 return(NULL); 7722 } 7723 tab = temp; 7724 ctxt->tabSize *= 2; 7725 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0); 7726 } 7727 for (i = 0;i < len;i++) { 7728 tmp = xmlExpStringDeriveInt(ctxt, exp, tab[i]); 7729 if ((tmp == NULL) || (tmp == forbiddenExp)) { 7730 xmlExpFree(ctxt, ret); 7731 xmlFree((xmlChar **) tab); 7732 return(tmp); 7733 } 7734 tmp2 = xmlExpStringDeriveInt(ctxt, sub, tab[i]); 7735 if ((tmp2 == NULL) || (tmp2 == forbiddenExp)) { 7736 xmlExpFree(ctxt, tmp); 7737 xmlExpFree(ctxt, ret); 7738 xmlFree((xmlChar **) tab); 7739 return(tmp); 7740 } 7741 tmp3 = xmlExpExpDeriveInt(ctxt, tmp, tmp2); 7742 xmlExpFree(ctxt, tmp); 7743 xmlExpFree(ctxt, tmp2); 7744 7745 if ((tmp3 == NULL) || (tmp3 == forbiddenExp)) { 7746 xmlExpFree(ctxt, ret); 7747 xmlFree((xmlChar **) tab); 7748 return(tmp3); 7749 } 7750 7751 if (ret == NULL) 7752 ret = tmp3; 7753 else { 7754 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp3, NULL, 0, 0); 7755 if (ret == NULL) { 7756 xmlFree((xmlChar **) tab); 7757 return(NULL); 7758 } 7759 } 7760 } 7761 xmlFree((xmlChar **) tab); 7762 return(ret); 7763 } 7764 7765 /** 7766 * xmlExpExpDerive: 7767 * @ctxt: the expressions context 7768 * @exp: the englobing expression 7769 * @sub: the subexpression 7770 * 7771 * Evaluates the expression resulting from @exp consuming a sub expression @sub 7772 * Based on algebraic derivation and sometimes direct Brzozowski derivation 7773 * it usually tatkes less than linear time and can handle expressions generating 7774 * infinite languages. 7775 * 7776 * Returns the resulting expression or NULL in case of internal error, the 7777 * result must be freed 7778 */ 7779 xmlExpNodePtr 7780 xmlExpExpDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7781 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL)) 7782 return(NULL); 7783 7784 /* 7785 * O(1) speedups 7786 */ 7787 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) { 7788 #ifdef DEBUG_DERIV 7789 printf("Sub nillable and not exp : can't subsume\n"); 7790 #endif 7791 return(forbiddenExp); 7792 } 7793 if (xmlExpCheckCard(exp, sub) == 0) { 7794 #ifdef DEBUG_DERIV 7795 printf("sub generate longuer sequances than exp : can't subsume\n"); 7796 #endif 7797 return(forbiddenExp); 7798 } 7799 return(xmlExpExpDeriveInt(ctxt, exp, sub)); 7800 } 7801 7802 /** 7803 * xmlExpSubsume: 7804 * @ctxt: the expressions context 7805 * @exp: the englobing expression 7806 * @sub: the subexpression 7807 * 7808 * Check whether @exp accepts all the languages accexpted by @sub 7809 * the input being a subexpression. 7810 * 7811 * Returns 1 if true 0 if false and -1 in case of failure. 7812 */ 7813 int 7814 xmlExpSubsume(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7815 xmlExpNodePtr tmp; 7816 7817 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL)) 7818 return(-1); 7819 7820 /* 7821 * TODO: speedup by checking the language of sub is a subset of the 7822 * language of exp 7823 */ 7824 /* 7825 * O(1) speedups 7826 */ 7827 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) { 7828 #ifdef DEBUG_DERIV 7829 printf("Sub nillable and not exp : can't subsume\n"); 7830 #endif 7831 return(0); 7832 } 7833 if (xmlExpCheckCard(exp, sub) == 0) { 7834 #ifdef DEBUG_DERIV 7835 printf("sub generate longuer sequances than exp : can't subsume\n"); 7836 #endif 7837 return(0); 7838 } 7839 tmp = xmlExpExpDeriveInt(ctxt, exp, sub); 7840 #ifdef DEBUG_DERIV 7841 printf("Result derivation :\n"); 7842 PRINT_EXP(tmp); 7843 #endif 7844 if (tmp == NULL) 7845 return(-1); 7846 if (tmp == forbiddenExp) 7847 return(0); 7848 if (tmp == emptyExp) 7849 return(1); 7850 if ((tmp != NULL) && (IS_NILLABLE(tmp))) { 7851 xmlExpFree(ctxt, tmp); 7852 return(1); 7853 } 7854 xmlExpFree(ctxt, tmp); 7855 return(0); 7856 } 7857 7858 /************************************************************************ 7859 * * 7860 * Parsing expression * 7861 * * 7862 ************************************************************************/ 7863 7864 static xmlExpNodePtr xmlExpParseExpr(xmlExpCtxtPtr ctxt); 7865 7866 #undef CUR 7867 #define CUR (*ctxt->cur) 7868 #undef NEXT 7869 #define NEXT ctxt->cur++; 7870 #undef IS_BLANK 7871 #define IS_BLANK(c) ((c == ' ') || (c == '\n') || (c == '\r') || (c == '\t')) 7872 #define SKIP_BLANKS while (IS_BLANK(*ctxt->cur)) ctxt->cur++; 7873 7874 static int 7875 xmlExpParseNumber(xmlExpCtxtPtr ctxt) { 7876 int ret = 0; 7877 7878 SKIP_BLANKS 7879 if (CUR == '*') { 7880 NEXT 7881 return(-1); 7882 } 7883 if ((CUR < '0') || (CUR > '9')) 7884 return(-1); 7885 while ((CUR >= '0') && (CUR <= '9')) { 7886 ret = ret * 10 + (CUR - '0'); 7887 NEXT 7888 } 7889 return(ret); 7890 } 7891 7892 static xmlExpNodePtr 7893 xmlExpParseOr(xmlExpCtxtPtr ctxt) { 7894 const char *base; 7895 xmlExpNodePtr ret; 7896 const xmlChar *val; 7897 7898 SKIP_BLANKS 7899 base = ctxt->cur; 7900 if (*ctxt->cur == '(') { 7901 NEXT 7902 ret = xmlExpParseExpr(ctxt); 7903 SKIP_BLANKS 7904 if (*ctxt->cur != ')') { 7905 fprintf(stderr, "unbalanced '(' : %s\n", base); 7906 xmlExpFree(ctxt, ret); 7907 return(NULL); 7908 } 7909 NEXT; 7910 SKIP_BLANKS 7911 goto parse_quantifier; 7912 } 7913 while ((CUR != 0) && (!(IS_BLANK(CUR))) && (CUR != '(') && 7914 (CUR != ')') && (CUR != '|') && (CUR != ',') && (CUR != '{') && 7915 (CUR != '*') && (CUR != '+') && (CUR != '?') && (CUR != '}')) 7916 NEXT; 7917 val = xmlDictLookup(ctxt->dict, BAD_CAST base, ctxt->cur - base); 7918 if (val == NULL) 7919 return(NULL); 7920 ret = xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, val, 0, 0); 7921 if (ret == NULL) 7922 return(NULL); 7923 SKIP_BLANKS 7924 parse_quantifier: 7925 if (CUR == '{') { 7926 int min, max; 7927 7928 NEXT 7929 min = xmlExpParseNumber(ctxt); 7930 if (min < 0) { 7931 xmlExpFree(ctxt, ret); 7932 return(NULL); 7933 } 7934 SKIP_BLANKS 7935 if (CUR == ',') { 7936 NEXT 7937 max = xmlExpParseNumber(ctxt); 7938 SKIP_BLANKS 7939 } else 7940 max = min; 7941 if (CUR != '}') { 7942 xmlExpFree(ctxt, ret); 7943 return(NULL); 7944 } 7945 NEXT 7946 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7947 min, max); 7948 SKIP_BLANKS 7949 } else if (CUR == '?') { 7950 NEXT 7951 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7952 0, 1); 7953 SKIP_BLANKS 7954 } else if (CUR == '+') { 7955 NEXT 7956 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7957 1, -1); 7958 SKIP_BLANKS 7959 } else if (CUR == '*') { 7960 NEXT 7961 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7962 0, -1); 7963 SKIP_BLANKS 7964 } 7965 return(ret); 7966 } 7967 7968 7969 static xmlExpNodePtr 7970 xmlExpParseSeq(xmlExpCtxtPtr ctxt) { 7971 xmlExpNodePtr ret, right; 7972 7973 ret = xmlExpParseOr(ctxt); 7974 SKIP_BLANKS 7975 while (CUR == '|') { 7976 NEXT 7977 right = xmlExpParseOr(ctxt); 7978 if (right == NULL) { 7979 xmlExpFree(ctxt, ret); 7980 return(NULL); 7981 } 7982 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, right, NULL, 0, 0); 7983 if (ret == NULL) 7984 return(NULL); 7985 } 7986 return(ret); 7987 } 7988 7989 static xmlExpNodePtr 7990 xmlExpParseExpr(xmlExpCtxtPtr ctxt) { 7991 xmlExpNodePtr ret, right; 7992 7993 ret = xmlExpParseSeq(ctxt); 7994 SKIP_BLANKS 7995 while (CUR == ',') { 7996 NEXT 7997 right = xmlExpParseSeq(ctxt); 7998 if (right == NULL) { 7999 xmlExpFree(ctxt, ret); 8000 return(NULL); 8001 } 8002 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, right, NULL, 0, 0); 8003 if (ret == NULL) 8004 return(NULL); 8005 } 8006 return(ret); 8007 } 8008 8009 /** 8010 * xmlExpParse: 8011 * @ctxt: the expressions context 8012 * @expr: the 0 terminated string 8013 * 8014 * Minimal parser for regexps, it understand the following constructs 8015 * - string terminals 8016 * - choice operator | 8017 * - sequence operator , 8018 * - subexpressions (...) 8019 * - usual cardinality operators + * and ? 8020 * - finite sequences { min, max } 8021 * - infinite sequences { min, * } 8022 * There is minimal checkings made especially no checking on strings values 8023 * 8024 * Returns a new expression or NULL in case of failure 8025 */ 8026 xmlExpNodePtr 8027 xmlExpParse(xmlExpCtxtPtr ctxt, const char *expr) { 8028 xmlExpNodePtr ret; 8029 8030 ctxt->expr = expr; 8031 ctxt->cur = expr; 8032 8033 ret = xmlExpParseExpr(ctxt); 8034 SKIP_BLANKS 8035 if (*ctxt->cur != 0) { 8036 xmlExpFree(ctxt, ret); 8037 return(NULL); 8038 } 8039 return(ret); 8040 } 8041 8042 static void 8043 xmlExpDumpInt(xmlBufferPtr buf, xmlExpNodePtr expr, int glob) { 8044 xmlExpNodePtr c; 8045 8046 if (expr == NULL) return; 8047 if (glob) xmlBufferWriteChar(buf, "("); 8048 switch (expr->type) { 8049 case XML_EXP_EMPTY: 8050 xmlBufferWriteChar(buf, "empty"); 8051 break; 8052 case XML_EXP_FORBID: 8053 xmlBufferWriteChar(buf, "forbidden"); 8054 break; 8055 case XML_EXP_ATOM: 8056 xmlBufferWriteCHAR(buf, expr->exp_str); 8057 break; 8058 case XML_EXP_SEQ: 8059 c = expr->exp_left; 8060 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8061 xmlExpDumpInt(buf, c, 1); 8062 else 8063 xmlExpDumpInt(buf, c, 0); 8064 xmlBufferWriteChar(buf, " , "); 8065 c = expr->exp_right; 8066 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8067 xmlExpDumpInt(buf, c, 1); 8068 else 8069 xmlExpDumpInt(buf, c, 0); 8070 break; 8071 case XML_EXP_OR: 8072 c = expr->exp_left; 8073 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8074 xmlExpDumpInt(buf, c, 1); 8075 else 8076 xmlExpDumpInt(buf, c, 0); 8077 xmlBufferWriteChar(buf, " | "); 8078 c = expr->exp_right; 8079 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8080 xmlExpDumpInt(buf, c, 1); 8081 else 8082 xmlExpDumpInt(buf, c, 0); 8083 break; 8084 case XML_EXP_COUNT: { 8085 char rep[40]; 8086 8087 c = expr->exp_left; 8088 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8089 xmlExpDumpInt(buf, c, 1); 8090 else 8091 xmlExpDumpInt(buf, c, 0); 8092 if ((expr->exp_min == 0) && (expr->exp_max == 1)) { 8093 rep[0] = '?'; 8094 rep[1] = 0; 8095 } else if ((expr->exp_min == 0) && (expr->exp_max == -1)) { 8096 rep[0] = '*'; 8097 rep[1] = 0; 8098 } else if ((expr->exp_min == 1) && (expr->exp_max == -1)) { 8099 rep[0] = '+'; 8100 rep[1] = 0; 8101 } else if (expr->exp_max == expr->exp_min) { 8102 snprintf(rep, 39, "{%d}", expr->exp_min); 8103 } else if (expr->exp_max < 0) { 8104 snprintf(rep, 39, "{%d,inf}", expr->exp_min); 8105 } else { 8106 snprintf(rep, 39, "{%d,%d}", expr->exp_min, expr->exp_max); 8107 } 8108 rep[39] = 0; 8109 xmlBufferWriteChar(buf, rep); 8110 break; 8111 } 8112 default: 8113 fprintf(stderr, "Error in tree\n"); 8114 } 8115 if (glob) 8116 xmlBufferWriteChar(buf, ")"); 8117 } 8118 /** 8119 * xmlExpDump: 8120 * @buf: a buffer to receive the output 8121 * @expr: the compiled expression 8122 * 8123 * Serialize the expression as compiled to the buffer 8124 */ 8125 void 8126 xmlExpDump(xmlBufferPtr buf, xmlExpNodePtr expr) { 8127 if ((buf == NULL) || (expr == NULL)) 8128 return; 8129 xmlExpDumpInt(buf, expr, 0); 8130 } 8131 8132 /** 8133 * xmlExpMaxToken: 8134 * @expr: a compiled expression 8135 * 8136 * Indicate the maximum number of input a expression can accept 8137 * 8138 * Returns the maximum length or -1 in case of error 8139 */ 8140 int 8141 xmlExpMaxToken(xmlExpNodePtr expr) { 8142 if (expr == NULL) 8143 return(-1); 8144 return(expr->c_max); 8145 } 8146 8147 /** 8148 * xmlExpCtxtNbNodes: 8149 * @ctxt: an expression context 8150 * 8151 * Debugging facility provides the number of allocated nodes at a that point 8152 * 8153 * Returns the number of nodes in use or -1 in case of error 8154 */ 8155 int 8156 xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt) { 8157 if (ctxt == NULL) 8158 return(-1); 8159 return(ctxt->nb_nodes); 8160 } 8161 8162 /** 8163 * xmlExpCtxtNbCons: 8164 * @ctxt: an expression context 8165 * 8166 * Debugging facility provides the number of allocated nodes over lifetime 8167 * 8168 * Returns the number of nodes ever allocated or -1 in case of error 8169 */ 8170 int 8171 xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt) { 8172 if (ctxt == NULL) 8173 return(-1); 8174 return(ctxt->nb_cons); 8175 } 8176 8177 #endif /* LIBXML_EXPR_ENABLED */ 8178 #define bottom_xmlregexp 8179 #include "elfgcchack.h" 8180 #endif /* LIBXML_REGEXP_ENABLED */ 8181