1 /* 2 * regexp.c: generic and extensible Regular Expression engine 3 * 4 * Basically designed with the purpose of compiling regexps for 5 * the variety of validation/schemas mechanisms now available in 6 * XML related specifications these include: 7 * - XML-1.0 DTD validation 8 * - XML Schemas structure part 1 9 * - XML Schemas Datatypes part 2 especially Appendix F 10 * - RELAX-NG/TREX i.e. the counter proposal 11 * 12 * See Copyright for the status of this software. 13 * 14 * Daniel Veillard <veillard@redhat.com> 15 */ 16 17 #define IN_LIBXML 18 #include "libxml.h" 19 20 #ifdef LIBXML_REGEXP_ENABLED 21 22 /* #define DEBUG_ERR */ 23 24 #include <stdio.h> 25 #include <string.h> 26 #ifdef HAVE_LIMITS_H 27 #include <limits.h> 28 #endif 29 30 #include <libxml/tree.h> 31 #include <libxml/parserInternals.h> 32 #include <libxml/xmlregexp.h> 33 #include <libxml/xmlautomata.h> 34 #include <libxml/xmlunicode.h> 35 36 #ifndef INT_MAX 37 #define INT_MAX 123456789 /* easy to flag and big enough for our needs */ 38 #endif 39 40 /* #define DEBUG_REGEXP_GRAPH */ 41 /* #define DEBUG_REGEXP_EXEC */ 42 /* #define DEBUG_PUSH */ 43 /* #define DEBUG_COMPACTION */ 44 45 #define MAX_PUSH 10000000 46 47 #ifdef ERROR 48 #undef ERROR 49 #endif 50 #define ERROR(str) \ 51 ctxt->error = XML_REGEXP_COMPILE_ERROR; \ 52 xmlRegexpErrCompile(ctxt, str); 53 #define NEXT ctxt->cur++ 54 #define CUR (*(ctxt->cur)) 55 #define NXT(index) (ctxt->cur[index]) 56 57 #define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l) 58 #define NEXTL(l) ctxt->cur += l; 59 #define XML_REG_STRING_SEPARATOR '|' 60 /* 61 * Need PREV to check on a '-' within a Character Group. May only be used 62 * when it's guaranteed that cur is not at the beginning of ctxt->string! 63 */ 64 #define PREV (ctxt->cur[-1]) 65 66 /** 67 * TODO: 68 * 69 * macro to flag unimplemented blocks 70 */ 71 #define TODO \ 72 xmlGenericError(xmlGenericErrorContext, \ 73 "Unimplemented block at %s:%d\n", \ 74 __FILE__, __LINE__); 75 76 /************************************************************************ 77 * * 78 * Datatypes and structures * 79 * * 80 ************************************************************************/ 81 82 /* 83 * Note: the order of the enums below is significant, do not shuffle 84 */ 85 typedef enum { 86 XML_REGEXP_EPSILON = 1, 87 XML_REGEXP_CHARVAL, 88 XML_REGEXP_RANGES, 89 XML_REGEXP_SUBREG, /* used for () sub regexps */ 90 XML_REGEXP_STRING, 91 XML_REGEXP_ANYCHAR, /* . */ 92 XML_REGEXP_ANYSPACE, /* \s */ 93 XML_REGEXP_NOTSPACE, /* \S */ 94 XML_REGEXP_INITNAME, /* \l */ 95 XML_REGEXP_NOTINITNAME, /* \L */ 96 XML_REGEXP_NAMECHAR, /* \c */ 97 XML_REGEXP_NOTNAMECHAR, /* \C */ 98 XML_REGEXP_DECIMAL, /* \d */ 99 XML_REGEXP_NOTDECIMAL, /* \D */ 100 XML_REGEXP_REALCHAR, /* \w */ 101 XML_REGEXP_NOTREALCHAR, /* \W */ 102 XML_REGEXP_LETTER = 100, 103 XML_REGEXP_LETTER_UPPERCASE, 104 XML_REGEXP_LETTER_LOWERCASE, 105 XML_REGEXP_LETTER_TITLECASE, 106 XML_REGEXP_LETTER_MODIFIER, 107 XML_REGEXP_LETTER_OTHERS, 108 XML_REGEXP_MARK, 109 XML_REGEXP_MARK_NONSPACING, 110 XML_REGEXP_MARK_SPACECOMBINING, 111 XML_REGEXP_MARK_ENCLOSING, 112 XML_REGEXP_NUMBER, 113 XML_REGEXP_NUMBER_DECIMAL, 114 XML_REGEXP_NUMBER_LETTER, 115 XML_REGEXP_NUMBER_OTHERS, 116 XML_REGEXP_PUNCT, 117 XML_REGEXP_PUNCT_CONNECTOR, 118 XML_REGEXP_PUNCT_DASH, 119 XML_REGEXP_PUNCT_OPEN, 120 XML_REGEXP_PUNCT_CLOSE, 121 XML_REGEXP_PUNCT_INITQUOTE, 122 XML_REGEXP_PUNCT_FINQUOTE, 123 XML_REGEXP_PUNCT_OTHERS, 124 XML_REGEXP_SEPAR, 125 XML_REGEXP_SEPAR_SPACE, 126 XML_REGEXP_SEPAR_LINE, 127 XML_REGEXP_SEPAR_PARA, 128 XML_REGEXP_SYMBOL, 129 XML_REGEXP_SYMBOL_MATH, 130 XML_REGEXP_SYMBOL_CURRENCY, 131 XML_REGEXP_SYMBOL_MODIFIER, 132 XML_REGEXP_SYMBOL_OTHERS, 133 XML_REGEXP_OTHER, 134 XML_REGEXP_OTHER_CONTROL, 135 XML_REGEXP_OTHER_FORMAT, 136 XML_REGEXP_OTHER_PRIVATE, 137 XML_REGEXP_OTHER_NA, 138 XML_REGEXP_BLOCK_NAME 139 } xmlRegAtomType; 140 141 typedef enum { 142 XML_REGEXP_QUANT_EPSILON = 1, 143 XML_REGEXP_QUANT_ONCE, 144 XML_REGEXP_QUANT_OPT, 145 XML_REGEXP_QUANT_MULT, 146 XML_REGEXP_QUANT_PLUS, 147 XML_REGEXP_QUANT_ONCEONLY, 148 XML_REGEXP_QUANT_ALL, 149 XML_REGEXP_QUANT_RANGE 150 } xmlRegQuantType; 151 152 typedef enum { 153 XML_REGEXP_START_STATE = 1, 154 XML_REGEXP_FINAL_STATE, 155 XML_REGEXP_TRANS_STATE, 156 XML_REGEXP_SINK_STATE, 157 XML_REGEXP_UNREACH_STATE 158 } xmlRegStateType; 159 160 typedef enum { 161 XML_REGEXP_MARK_NORMAL = 0, 162 XML_REGEXP_MARK_START, 163 XML_REGEXP_MARK_VISITED 164 } xmlRegMarkedType; 165 166 typedef struct _xmlRegRange xmlRegRange; 167 typedef xmlRegRange *xmlRegRangePtr; 168 169 struct _xmlRegRange { 170 int neg; /* 0 normal, 1 not, 2 exclude */ 171 xmlRegAtomType type; 172 int start; 173 int end; 174 xmlChar *blockName; 175 }; 176 177 typedef struct _xmlRegAtom xmlRegAtom; 178 typedef xmlRegAtom *xmlRegAtomPtr; 179 180 typedef struct _xmlAutomataState xmlRegState; 181 typedef xmlRegState *xmlRegStatePtr; 182 183 struct _xmlRegAtom { 184 int no; 185 xmlRegAtomType type; 186 xmlRegQuantType quant; 187 int min; 188 int max; 189 190 void *valuep; 191 void *valuep2; 192 int neg; 193 int codepoint; 194 xmlRegStatePtr start; 195 xmlRegStatePtr start0; 196 xmlRegStatePtr stop; 197 int maxRanges; 198 int nbRanges; 199 xmlRegRangePtr *ranges; 200 void *data; 201 }; 202 203 typedef struct _xmlRegCounter xmlRegCounter; 204 typedef xmlRegCounter *xmlRegCounterPtr; 205 206 struct _xmlRegCounter { 207 int min; 208 int max; 209 }; 210 211 typedef struct _xmlRegTrans xmlRegTrans; 212 typedef xmlRegTrans *xmlRegTransPtr; 213 214 struct _xmlRegTrans { 215 xmlRegAtomPtr atom; 216 int to; 217 int counter; 218 int count; 219 int nd; 220 }; 221 222 struct _xmlAutomataState { 223 xmlRegStateType type; 224 xmlRegMarkedType mark; 225 xmlRegMarkedType markd; 226 xmlRegMarkedType reached; 227 int no; 228 int maxTrans; 229 int nbTrans; 230 xmlRegTrans *trans; 231 /* knowing states pointing to us can speed things up */ 232 int maxTransTo; 233 int nbTransTo; 234 int *transTo; 235 }; 236 237 typedef struct _xmlAutomata xmlRegParserCtxt; 238 typedef xmlRegParserCtxt *xmlRegParserCtxtPtr; 239 240 #define AM_AUTOMATA_RNG 1 241 242 struct _xmlAutomata { 243 xmlChar *string; 244 xmlChar *cur; 245 246 int error; 247 int neg; 248 249 xmlRegStatePtr start; 250 xmlRegStatePtr end; 251 xmlRegStatePtr state; 252 253 xmlRegAtomPtr atom; 254 255 int maxAtoms; 256 int nbAtoms; 257 xmlRegAtomPtr *atoms; 258 259 int maxStates; 260 int nbStates; 261 xmlRegStatePtr *states; 262 263 int maxCounters; 264 int nbCounters; 265 xmlRegCounter *counters; 266 267 int determinist; 268 int negs; 269 int flags; 270 }; 271 272 struct _xmlRegexp { 273 xmlChar *string; 274 int nbStates; 275 xmlRegStatePtr *states; 276 int nbAtoms; 277 xmlRegAtomPtr *atoms; 278 int nbCounters; 279 xmlRegCounter *counters; 280 int determinist; 281 int flags; 282 /* 283 * That's the compact form for determinists automatas 284 */ 285 int nbstates; 286 int *compact; 287 void **transdata; 288 int nbstrings; 289 xmlChar **stringMap; 290 }; 291 292 typedef struct _xmlRegExecRollback xmlRegExecRollback; 293 typedef xmlRegExecRollback *xmlRegExecRollbackPtr; 294 295 struct _xmlRegExecRollback { 296 xmlRegStatePtr state;/* the current state */ 297 int index; /* the index in the input stack */ 298 int nextbranch; /* the next transition to explore in that state */ 299 int *counts; /* save the automata state if it has some */ 300 }; 301 302 typedef struct _xmlRegInputToken xmlRegInputToken; 303 typedef xmlRegInputToken *xmlRegInputTokenPtr; 304 305 struct _xmlRegInputToken { 306 xmlChar *value; 307 void *data; 308 }; 309 310 struct _xmlRegExecCtxt { 311 int status; /* execution status != 0 indicate an error */ 312 int determinist; /* did we find an indeterministic behaviour */ 313 xmlRegexpPtr comp; /* the compiled regexp */ 314 xmlRegExecCallbacks callback; 315 void *data; 316 317 xmlRegStatePtr state;/* the current state */ 318 int transno; /* the current transition on that state */ 319 int transcount; /* the number of chars in char counted transitions */ 320 321 /* 322 * A stack of rollback states 323 */ 324 int maxRollbacks; 325 int nbRollbacks; 326 xmlRegExecRollback *rollbacks; 327 328 /* 329 * The state of the automata if any 330 */ 331 int *counts; 332 333 /* 334 * The input stack 335 */ 336 int inputStackMax; 337 int inputStackNr; 338 int index; 339 int *charStack; 340 const xmlChar *inputString; /* when operating on characters */ 341 xmlRegInputTokenPtr inputStack;/* when operating on strings */ 342 343 /* 344 * error handling 345 */ 346 int errStateNo; /* the error state number */ 347 xmlRegStatePtr errState; /* the error state */ 348 xmlChar *errString; /* the string raising the error */ 349 int *errCounts; /* counters at the error state */ 350 int nbPush; 351 }; 352 353 #define REGEXP_ALL_COUNTER 0x123456 354 #define REGEXP_ALL_LAX_COUNTER 0x123457 355 356 static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top); 357 static void xmlRegFreeState(xmlRegStatePtr state); 358 static void xmlRegFreeAtom(xmlRegAtomPtr atom); 359 static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr); 360 static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint); 361 static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, 362 int neg, int start, int end, const xmlChar *blockName); 363 364 void xmlAutomataSetFlags(xmlAutomataPtr am, int flags); 365 366 /************************************************************************ 367 * * 368 * Regexp memory error handler * 369 * * 370 ************************************************************************/ 371 /** 372 * xmlRegexpErrMemory: 373 * @extra: extra information 374 * 375 * Handle an out of memory condition 376 */ 377 static void 378 xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra) 379 { 380 const char *regexp = NULL; 381 if (ctxt != NULL) { 382 regexp = (const char *) ctxt->string; 383 ctxt->error = XML_ERR_NO_MEMORY; 384 } 385 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP, 386 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra, 387 regexp, NULL, 0, 0, 388 "Memory allocation failed : %s\n", extra); 389 } 390 391 /** 392 * xmlRegexpErrCompile: 393 * @extra: extra information 394 * 395 * Handle a compilation failure 396 */ 397 static void 398 xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra) 399 { 400 const char *regexp = NULL; 401 int idx = 0; 402 403 if (ctxt != NULL) { 404 regexp = (const char *) ctxt->string; 405 idx = ctxt->cur - ctxt->string; 406 ctxt->error = XML_REGEXP_COMPILE_ERROR; 407 } 408 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP, 409 XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra, 410 regexp, NULL, idx, 0, 411 "failed to compile: %s\n", extra); 412 } 413 414 /************************************************************************ 415 * * 416 * Allocation/Deallocation * 417 * * 418 ************************************************************************/ 419 420 static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt); 421 /** 422 * xmlRegEpxFromParse: 423 * @ctxt: the parser context used to build it 424 * 425 * Allocate a new regexp and fill it with the result from the parser 426 * 427 * Returns the new regexp or NULL in case of error 428 */ 429 static xmlRegexpPtr 430 xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) { 431 xmlRegexpPtr ret; 432 433 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp)); 434 if (ret == NULL) { 435 xmlRegexpErrMemory(ctxt, "compiling regexp"); 436 return(NULL); 437 } 438 memset(ret, 0, sizeof(xmlRegexp)); 439 ret->string = ctxt->string; 440 ret->nbStates = ctxt->nbStates; 441 ret->states = ctxt->states; 442 ret->nbAtoms = ctxt->nbAtoms; 443 ret->atoms = ctxt->atoms; 444 ret->nbCounters = ctxt->nbCounters; 445 ret->counters = ctxt->counters; 446 ret->determinist = ctxt->determinist; 447 ret->flags = ctxt->flags; 448 if (ret->determinist == -1) { 449 xmlRegexpIsDeterminist(ret); 450 } 451 452 if ((ret->determinist != 0) && 453 (ret->nbCounters == 0) && 454 (ctxt->negs == 0) && 455 (ret->atoms != NULL) && 456 (ret->atoms[0] != NULL) && 457 (ret->atoms[0]->type == XML_REGEXP_STRING)) { 458 int i, j, nbstates = 0, nbatoms = 0; 459 int *stateRemap; 460 int *stringRemap; 461 int *transitions; 462 void **transdata; 463 xmlChar **stringMap; 464 xmlChar *value; 465 466 /* 467 * Switch to a compact representation 468 * 1/ counting the effective number of states left 469 * 2/ counting the unique number of atoms, and check that 470 * they are all of the string type 471 * 3/ build a table state x atom for the transitions 472 */ 473 474 stateRemap = xmlMalloc(ret->nbStates * sizeof(int)); 475 if (stateRemap == NULL) { 476 xmlRegexpErrMemory(ctxt, "compiling regexp"); 477 xmlFree(ret); 478 return(NULL); 479 } 480 for (i = 0;i < ret->nbStates;i++) { 481 if (ret->states[i] != NULL) { 482 stateRemap[i] = nbstates; 483 nbstates++; 484 } else { 485 stateRemap[i] = -1; 486 } 487 } 488 #ifdef DEBUG_COMPACTION 489 printf("Final: %d states\n", nbstates); 490 #endif 491 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *)); 492 if (stringMap == NULL) { 493 xmlRegexpErrMemory(ctxt, "compiling regexp"); 494 xmlFree(stateRemap); 495 xmlFree(ret); 496 return(NULL); 497 } 498 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int)); 499 if (stringRemap == NULL) { 500 xmlRegexpErrMemory(ctxt, "compiling regexp"); 501 xmlFree(stringMap); 502 xmlFree(stateRemap); 503 xmlFree(ret); 504 return(NULL); 505 } 506 for (i = 0;i < ret->nbAtoms;i++) { 507 if ((ret->atoms[i]->type == XML_REGEXP_STRING) && 508 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) { 509 value = ret->atoms[i]->valuep; 510 for (j = 0;j < nbatoms;j++) { 511 if (xmlStrEqual(stringMap[j], value)) { 512 stringRemap[i] = j; 513 break; 514 } 515 } 516 if (j >= nbatoms) { 517 stringRemap[i] = nbatoms; 518 stringMap[nbatoms] = xmlStrdup(value); 519 if (stringMap[nbatoms] == NULL) { 520 for (i = 0;i < nbatoms;i++) 521 xmlFree(stringMap[i]); 522 xmlFree(stringRemap); 523 xmlFree(stringMap); 524 xmlFree(stateRemap); 525 xmlFree(ret); 526 return(NULL); 527 } 528 nbatoms++; 529 } 530 } else { 531 xmlFree(stateRemap); 532 xmlFree(stringRemap); 533 for (i = 0;i < nbatoms;i++) 534 xmlFree(stringMap[i]); 535 xmlFree(stringMap); 536 xmlFree(ret); 537 return(NULL); 538 } 539 } 540 #ifdef DEBUG_COMPACTION 541 printf("Final: %d atoms\n", nbatoms); 542 #endif 543 transitions = (int *) xmlMalloc((nbstates + 1) * 544 (nbatoms + 1) * sizeof(int)); 545 if (transitions == NULL) { 546 xmlFree(stateRemap); 547 xmlFree(stringRemap); 548 for (i = 0;i < nbatoms;i++) 549 xmlFree(stringMap[i]); 550 xmlFree(stringMap); 551 xmlFree(ret); 552 return(NULL); 553 } 554 memset(transitions, 0, (nbstates + 1) * (nbatoms + 1) * sizeof(int)); 555 556 /* 557 * Allocate the transition table. The first entry for each 558 * state corresponds to the state type. 559 */ 560 transdata = NULL; 561 562 for (i = 0;i < ret->nbStates;i++) { 563 int stateno, atomno, targetno, prev; 564 xmlRegStatePtr state; 565 xmlRegTransPtr trans; 566 567 stateno = stateRemap[i]; 568 if (stateno == -1) 569 continue; 570 state = ret->states[i]; 571 572 transitions[stateno * (nbatoms + 1)] = state->type; 573 574 for (j = 0;j < state->nbTrans;j++) { 575 trans = &(state->trans[j]); 576 if ((trans->to == -1) || (trans->atom == NULL)) 577 continue; 578 atomno = stringRemap[trans->atom->no]; 579 if ((trans->atom->data != NULL) && (transdata == NULL)) { 580 transdata = (void **) xmlMalloc(nbstates * nbatoms * 581 sizeof(void *)); 582 if (transdata != NULL) 583 memset(transdata, 0, 584 nbstates * nbatoms * sizeof(void *)); 585 else { 586 xmlRegexpErrMemory(ctxt, "compiling regexp"); 587 break; 588 } 589 } 590 targetno = stateRemap[trans->to]; 591 /* 592 * if the same atom can generate transitions to 2 different 593 * states then it means the automata is not deterministic and 594 * the compact form can't be used ! 595 */ 596 prev = transitions[stateno * (nbatoms + 1) + atomno + 1]; 597 if (prev != 0) { 598 if (prev != targetno + 1) { 599 ret->determinist = 0; 600 #ifdef DEBUG_COMPACTION 601 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n", 602 i, j, trans->atom->no, trans->to, atomno, targetno); 603 printf(" previous to is %d\n", prev); 604 #endif 605 if (transdata != NULL) 606 xmlFree(transdata); 607 xmlFree(transitions); 608 xmlFree(stateRemap); 609 xmlFree(stringRemap); 610 for (i = 0;i < nbatoms;i++) 611 xmlFree(stringMap[i]); 612 xmlFree(stringMap); 613 goto not_determ; 614 } 615 } else { 616 #if 0 617 printf("State %d trans %d: atom %d to %d : %d to %d\n", 618 i, j, trans->atom->no, trans->to, atomno, targetno); 619 #endif 620 transitions[stateno * (nbatoms + 1) + atomno + 1] = 621 targetno + 1; /* to avoid 0 */ 622 if (transdata != NULL) 623 transdata[stateno * nbatoms + atomno] = 624 trans->atom->data; 625 } 626 } 627 } 628 ret->determinist = 1; 629 #ifdef DEBUG_COMPACTION 630 /* 631 * Debug 632 */ 633 for (i = 0;i < nbstates;i++) { 634 for (j = 0;j < nbatoms + 1;j++) { 635 printf("%02d ", transitions[i * (nbatoms + 1) + j]); 636 } 637 printf("\n"); 638 } 639 printf("\n"); 640 #endif 641 /* 642 * Cleanup of the old data 643 */ 644 if (ret->states != NULL) { 645 for (i = 0;i < ret->nbStates;i++) 646 xmlRegFreeState(ret->states[i]); 647 xmlFree(ret->states); 648 } 649 ret->states = NULL; 650 ret->nbStates = 0; 651 if (ret->atoms != NULL) { 652 for (i = 0;i < ret->nbAtoms;i++) 653 xmlRegFreeAtom(ret->atoms[i]); 654 xmlFree(ret->atoms); 655 } 656 ret->atoms = NULL; 657 ret->nbAtoms = 0; 658 659 ret->compact = transitions; 660 ret->transdata = transdata; 661 ret->stringMap = stringMap; 662 ret->nbstrings = nbatoms; 663 ret->nbstates = nbstates; 664 xmlFree(stateRemap); 665 xmlFree(stringRemap); 666 } 667 not_determ: 668 ctxt->string = NULL; 669 ctxt->nbStates = 0; 670 ctxt->states = NULL; 671 ctxt->nbAtoms = 0; 672 ctxt->atoms = NULL; 673 ctxt->nbCounters = 0; 674 ctxt->counters = NULL; 675 return(ret); 676 } 677 678 /** 679 * xmlRegNewParserCtxt: 680 * @string: the string to parse 681 * 682 * Allocate a new regexp parser context 683 * 684 * Returns the new context or NULL in case of error 685 */ 686 static xmlRegParserCtxtPtr 687 xmlRegNewParserCtxt(const xmlChar *string) { 688 xmlRegParserCtxtPtr ret; 689 690 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt)); 691 if (ret == NULL) 692 return(NULL); 693 memset(ret, 0, sizeof(xmlRegParserCtxt)); 694 if (string != NULL) 695 ret->string = xmlStrdup(string); 696 ret->cur = ret->string; 697 ret->neg = 0; 698 ret->negs = 0; 699 ret->error = 0; 700 ret->determinist = -1; 701 return(ret); 702 } 703 704 /** 705 * xmlRegNewRange: 706 * @ctxt: the regexp parser context 707 * @neg: is that negative 708 * @type: the type of range 709 * @start: the start codepoint 710 * @end: the end codepoint 711 * 712 * Allocate a new regexp range 713 * 714 * Returns the new range or NULL in case of error 715 */ 716 static xmlRegRangePtr 717 xmlRegNewRange(xmlRegParserCtxtPtr ctxt, 718 int neg, xmlRegAtomType type, int start, int end) { 719 xmlRegRangePtr ret; 720 721 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange)); 722 if (ret == NULL) { 723 xmlRegexpErrMemory(ctxt, "allocating range"); 724 return(NULL); 725 } 726 ret->neg = neg; 727 ret->type = type; 728 ret->start = start; 729 ret->end = end; 730 return(ret); 731 } 732 733 /** 734 * xmlRegFreeRange: 735 * @range: the regexp range 736 * 737 * Free a regexp range 738 */ 739 static void 740 xmlRegFreeRange(xmlRegRangePtr range) { 741 if (range == NULL) 742 return; 743 744 if (range->blockName != NULL) 745 xmlFree(range->blockName); 746 xmlFree(range); 747 } 748 749 /** 750 * xmlRegCopyRange: 751 * @range: the regexp range 752 * 753 * Copy a regexp range 754 * 755 * Returns the new copy or NULL in case of error. 756 */ 757 static xmlRegRangePtr 758 xmlRegCopyRange(xmlRegParserCtxtPtr ctxt, xmlRegRangePtr range) { 759 xmlRegRangePtr ret; 760 761 if (range == NULL) 762 return(NULL); 763 764 ret = xmlRegNewRange(ctxt, range->neg, range->type, range->start, 765 range->end); 766 if (ret == NULL) 767 return(NULL); 768 if (range->blockName != NULL) { 769 ret->blockName = xmlStrdup(range->blockName); 770 if (ret->blockName == NULL) { 771 xmlRegexpErrMemory(ctxt, "allocating range"); 772 xmlRegFreeRange(ret); 773 return(NULL); 774 } 775 } 776 return(ret); 777 } 778 779 /** 780 * xmlRegNewAtom: 781 * @ctxt: the regexp parser context 782 * @type: the type of atom 783 * 784 * Allocate a new atom 785 * 786 * Returns the new atom or NULL in case of error 787 */ 788 static xmlRegAtomPtr 789 xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) { 790 xmlRegAtomPtr ret; 791 792 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom)); 793 if (ret == NULL) { 794 xmlRegexpErrMemory(ctxt, "allocating atom"); 795 return(NULL); 796 } 797 memset(ret, 0, sizeof(xmlRegAtom)); 798 ret->type = type; 799 ret->quant = XML_REGEXP_QUANT_ONCE; 800 ret->min = 0; 801 ret->max = 0; 802 return(ret); 803 } 804 805 /** 806 * xmlRegFreeAtom: 807 * @atom: the regexp atom 808 * 809 * Free a regexp atom 810 */ 811 static void 812 xmlRegFreeAtom(xmlRegAtomPtr atom) { 813 int i; 814 815 if (atom == NULL) 816 return; 817 818 for (i = 0;i < atom->nbRanges;i++) 819 xmlRegFreeRange(atom->ranges[i]); 820 if (atom->ranges != NULL) 821 xmlFree(atom->ranges); 822 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL)) 823 xmlFree(atom->valuep); 824 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL)) 825 xmlFree(atom->valuep2); 826 if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL)) 827 xmlFree(atom->valuep); 828 xmlFree(atom); 829 } 830 831 /** 832 * xmlRegCopyAtom: 833 * @ctxt: the regexp parser context 834 * @atom: the original atom 835 * 836 * Allocate a new regexp range 837 * 838 * Returns the new atom or NULL in case of error 839 */ 840 static xmlRegAtomPtr 841 xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) { 842 xmlRegAtomPtr ret; 843 844 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom)); 845 if (ret == NULL) { 846 xmlRegexpErrMemory(ctxt, "copying atom"); 847 return(NULL); 848 } 849 memset(ret, 0, sizeof(xmlRegAtom)); 850 ret->type = atom->type; 851 ret->quant = atom->quant; 852 ret->min = atom->min; 853 ret->max = atom->max; 854 if (atom->nbRanges > 0) { 855 int i; 856 857 ret->ranges = (xmlRegRangePtr *) xmlMalloc(sizeof(xmlRegRangePtr) * 858 atom->nbRanges); 859 if (ret->ranges == NULL) { 860 xmlRegexpErrMemory(ctxt, "copying atom"); 861 goto error; 862 } 863 for (i = 0;i < atom->nbRanges;i++) { 864 ret->ranges[i] = xmlRegCopyRange(ctxt, atom->ranges[i]); 865 if (ret->ranges[i] == NULL) 866 goto error; 867 ret->nbRanges = i + 1; 868 } 869 } 870 return(ret); 871 872 error: 873 xmlRegFreeAtom(ret); 874 return(NULL); 875 } 876 877 static xmlRegStatePtr 878 xmlRegNewState(xmlRegParserCtxtPtr ctxt) { 879 xmlRegStatePtr ret; 880 881 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState)); 882 if (ret == NULL) { 883 xmlRegexpErrMemory(ctxt, "allocating state"); 884 return(NULL); 885 } 886 memset(ret, 0, sizeof(xmlRegState)); 887 ret->type = XML_REGEXP_TRANS_STATE; 888 ret->mark = XML_REGEXP_MARK_NORMAL; 889 return(ret); 890 } 891 892 /** 893 * xmlRegFreeState: 894 * @state: the regexp state 895 * 896 * Free a regexp state 897 */ 898 static void 899 xmlRegFreeState(xmlRegStatePtr state) { 900 if (state == NULL) 901 return; 902 903 if (state->trans != NULL) 904 xmlFree(state->trans); 905 if (state->transTo != NULL) 906 xmlFree(state->transTo); 907 xmlFree(state); 908 } 909 910 /** 911 * xmlRegFreeParserCtxt: 912 * @ctxt: the regexp parser context 913 * 914 * Free a regexp parser context 915 */ 916 static void 917 xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) { 918 int i; 919 if (ctxt == NULL) 920 return; 921 922 if (ctxt->string != NULL) 923 xmlFree(ctxt->string); 924 if (ctxt->states != NULL) { 925 for (i = 0;i < ctxt->nbStates;i++) 926 xmlRegFreeState(ctxt->states[i]); 927 xmlFree(ctxt->states); 928 } 929 if (ctxt->atoms != NULL) { 930 for (i = 0;i < ctxt->nbAtoms;i++) 931 xmlRegFreeAtom(ctxt->atoms[i]); 932 xmlFree(ctxt->atoms); 933 } 934 if (ctxt->counters != NULL) 935 xmlFree(ctxt->counters); 936 xmlFree(ctxt); 937 } 938 939 /************************************************************************ 940 * * 941 * Display of Data structures * 942 * * 943 ************************************************************************/ 944 945 static void 946 xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) { 947 switch (type) { 948 case XML_REGEXP_EPSILON: 949 fprintf(output, "epsilon "); break; 950 case XML_REGEXP_CHARVAL: 951 fprintf(output, "charval "); break; 952 case XML_REGEXP_RANGES: 953 fprintf(output, "ranges "); break; 954 case XML_REGEXP_SUBREG: 955 fprintf(output, "subexpr "); break; 956 case XML_REGEXP_STRING: 957 fprintf(output, "string "); break; 958 case XML_REGEXP_ANYCHAR: 959 fprintf(output, "anychar "); break; 960 case XML_REGEXP_ANYSPACE: 961 fprintf(output, "anyspace "); break; 962 case XML_REGEXP_NOTSPACE: 963 fprintf(output, "notspace "); break; 964 case XML_REGEXP_INITNAME: 965 fprintf(output, "initname "); break; 966 case XML_REGEXP_NOTINITNAME: 967 fprintf(output, "notinitname "); break; 968 case XML_REGEXP_NAMECHAR: 969 fprintf(output, "namechar "); break; 970 case XML_REGEXP_NOTNAMECHAR: 971 fprintf(output, "notnamechar "); break; 972 case XML_REGEXP_DECIMAL: 973 fprintf(output, "decimal "); break; 974 case XML_REGEXP_NOTDECIMAL: 975 fprintf(output, "notdecimal "); break; 976 case XML_REGEXP_REALCHAR: 977 fprintf(output, "realchar "); break; 978 case XML_REGEXP_NOTREALCHAR: 979 fprintf(output, "notrealchar "); break; 980 case XML_REGEXP_LETTER: 981 fprintf(output, "LETTER "); break; 982 case XML_REGEXP_LETTER_UPPERCASE: 983 fprintf(output, "LETTER_UPPERCASE "); break; 984 case XML_REGEXP_LETTER_LOWERCASE: 985 fprintf(output, "LETTER_LOWERCASE "); break; 986 case XML_REGEXP_LETTER_TITLECASE: 987 fprintf(output, "LETTER_TITLECASE "); break; 988 case XML_REGEXP_LETTER_MODIFIER: 989 fprintf(output, "LETTER_MODIFIER "); break; 990 case XML_REGEXP_LETTER_OTHERS: 991 fprintf(output, "LETTER_OTHERS "); break; 992 case XML_REGEXP_MARK: 993 fprintf(output, "MARK "); break; 994 case XML_REGEXP_MARK_NONSPACING: 995 fprintf(output, "MARK_NONSPACING "); break; 996 case XML_REGEXP_MARK_SPACECOMBINING: 997 fprintf(output, "MARK_SPACECOMBINING "); break; 998 case XML_REGEXP_MARK_ENCLOSING: 999 fprintf(output, "MARK_ENCLOSING "); break; 1000 case XML_REGEXP_NUMBER: 1001 fprintf(output, "NUMBER "); break; 1002 case XML_REGEXP_NUMBER_DECIMAL: 1003 fprintf(output, "NUMBER_DECIMAL "); break; 1004 case XML_REGEXP_NUMBER_LETTER: 1005 fprintf(output, "NUMBER_LETTER "); break; 1006 case XML_REGEXP_NUMBER_OTHERS: 1007 fprintf(output, "NUMBER_OTHERS "); break; 1008 case XML_REGEXP_PUNCT: 1009 fprintf(output, "PUNCT "); break; 1010 case XML_REGEXP_PUNCT_CONNECTOR: 1011 fprintf(output, "PUNCT_CONNECTOR "); break; 1012 case XML_REGEXP_PUNCT_DASH: 1013 fprintf(output, "PUNCT_DASH "); break; 1014 case XML_REGEXP_PUNCT_OPEN: 1015 fprintf(output, "PUNCT_OPEN "); break; 1016 case XML_REGEXP_PUNCT_CLOSE: 1017 fprintf(output, "PUNCT_CLOSE "); break; 1018 case XML_REGEXP_PUNCT_INITQUOTE: 1019 fprintf(output, "PUNCT_INITQUOTE "); break; 1020 case XML_REGEXP_PUNCT_FINQUOTE: 1021 fprintf(output, "PUNCT_FINQUOTE "); break; 1022 case XML_REGEXP_PUNCT_OTHERS: 1023 fprintf(output, "PUNCT_OTHERS "); break; 1024 case XML_REGEXP_SEPAR: 1025 fprintf(output, "SEPAR "); break; 1026 case XML_REGEXP_SEPAR_SPACE: 1027 fprintf(output, "SEPAR_SPACE "); break; 1028 case XML_REGEXP_SEPAR_LINE: 1029 fprintf(output, "SEPAR_LINE "); break; 1030 case XML_REGEXP_SEPAR_PARA: 1031 fprintf(output, "SEPAR_PARA "); break; 1032 case XML_REGEXP_SYMBOL: 1033 fprintf(output, "SYMBOL "); break; 1034 case XML_REGEXP_SYMBOL_MATH: 1035 fprintf(output, "SYMBOL_MATH "); break; 1036 case XML_REGEXP_SYMBOL_CURRENCY: 1037 fprintf(output, "SYMBOL_CURRENCY "); break; 1038 case XML_REGEXP_SYMBOL_MODIFIER: 1039 fprintf(output, "SYMBOL_MODIFIER "); break; 1040 case XML_REGEXP_SYMBOL_OTHERS: 1041 fprintf(output, "SYMBOL_OTHERS "); break; 1042 case XML_REGEXP_OTHER: 1043 fprintf(output, "OTHER "); break; 1044 case XML_REGEXP_OTHER_CONTROL: 1045 fprintf(output, "OTHER_CONTROL "); break; 1046 case XML_REGEXP_OTHER_FORMAT: 1047 fprintf(output, "OTHER_FORMAT "); break; 1048 case XML_REGEXP_OTHER_PRIVATE: 1049 fprintf(output, "OTHER_PRIVATE "); break; 1050 case XML_REGEXP_OTHER_NA: 1051 fprintf(output, "OTHER_NA "); break; 1052 case XML_REGEXP_BLOCK_NAME: 1053 fprintf(output, "BLOCK "); break; 1054 } 1055 } 1056 1057 static void 1058 xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) { 1059 switch (type) { 1060 case XML_REGEXP_QUANT_EPSILON: 1061 fprintf(output, "epsilon "); break; 1062 case XML_REGEXP_QUANT_ONCE: 1063 fprintf(output, "once "); break; 1064 case XML_REGEXP_QUANT_OPT: 1065 fprintf(output, "? "); break; 1066 case XML_REGEXP_QUANT_MULT: 1067 fprintf(output, "* "); break; 1068 case XML_REGEXP_QUANT_PLUS: 1069 fprintf(output, "+ "); break; 1070 case XML_REGEXP_QUANT_RANGE: 1071 fprintf(output, "range "); break; 1072 case XML_REGEXP_QUANT_ONCEONLY: 1073 fprintf(output, "onceonly "); break; 1074 case XML_REGEXP_QUANT_ALL: 1075 fprintf(output, "all "); break; 1076 } 1077 } 1078 static void 1079 xmlRegPrintRange(FILE *output, xmlRegRangePtr range) { 1080 fprintf(output, " range: "); 1081 if (range->neg) 1082 fprintf(output, "negative "); 1083 xmlRegPrintAtomType(output, range->type); 1084 fprintf(output, "%c - %c\n", range->start, range->end); 1085 } 1086 1087 static void 1088 xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) { 1089 fprintf(output, " atom: "); 1090 if (atom == NULL) { 1091 fprintf(output, "NULL\n"); 1092 return; 1093 } 1094 if (atom->neg) 1095 fprintf(output, "not "); 1096 xmlRegPrintAtomType(output, atom->type); 1097 xmlRegPrintQuantType(output, atom->quant); 1098 if (atom->quant == XML_REGEXP_QUANT_RANGE) 1099 fprintf(output, "%d-%d ", atom->min, atom->max); 1100 if (atom->type == XML_REGEXP_STRING) 1101 fprintf(output, "'%s' ", (char *) atom->valuep); 1102 if (atom->type == XML_REGEXP_CHARVAL) 1103 fprintf(output, "char %c\n", atom->codepoint); 1104 else if (atom->type == XML_REGEXP_RANGES) { 1105 int i; 1106 fprintf(output, "%d entries\n", atom->nbRanges); 1107 for (i = 0; i < atom->nbRanges;i++) 1108 xmlRegPrintRange(output, atom->ranges[i]); 1109 } else if (atom->type == XML_REGEXP_SUBREG) { 1110 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no); 1111 } else { 1112 fprintf(output, "\n"); 1113 } 1114 } 1115 1116 static void 1117 xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) { 1118 fprintf(output, " trans: "); 1119 if (trans == NULL) { 1120 fprintf(output, "NULL\n"); 1121 return; 1122 } 1123 if (trans->to < 0) { 1124 fprintf(output, "removed\n"); 1125 return; 1126 } 1127 if (trans->nd != 0) { 1128 if (trans->nd == 2) 1129 fprintf(output, "last not determinist, "); 1130 else 1131 fprintf(output, "not determinist, "); 1132 } 1133 if (trans->counter >= 0) { 1134 fprintf(output, "counted %d, ", trans->counter); 1135 } 1136 if (trans->count == REGEXP_ALL_COUNTER) { 1137 fprintf(output, "all transition, "); 1138 } else if (trans->count >= 0) { 1139 fprintf(output, "count based %d, ", trans->count); 1140 } 1141 if (trans->atom == NULL) { 1142 fprintf(output, "epsilon to %d\n", trans->to); 1143 return; 1144 } 1145 if (trans->atom->type == XML_REGEXP_CHARVAL) 1146 fprintf(output, "char %c ", trans->atom->codepoint); 1147 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to); 1148 } 1149 1150 static void 1151 xmlRegPrintState(FILE *output, xmlRegStatePtr state) { 1152 int i; 1153 1154 fprintf(output, " state: "); 1155 if (state == NULL) { 1156 fprintf(output, "NULL\n"); 1157 return; 1158 } 1159 if (state->type == XML_REGEXP_START_STATE) 1160 fprintf(output, "START "); 1161 if (state->type == XML_REGEXP_FINAL_STATE) 1162 fprintf(output, "FINAL "); 1163 1164 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans); 1165 for (i = 0;i < state->nbTrans; i++) { 1166 xmlRegPrintTrans(output, &(state->trans[i])); 1167 } 1168 } 1169 1170 #ifdef DEBUG_REGEXP_GRAPH 1171 static void 1172 xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) { 1173 int i; 1174 1175 fprintf(output, " ctxt: "); 1176 if (ctxt == NULL) { 1177 fprintf(output, "NULL\n"); 1178 return; 1179 } 1180 fprintf(output, "'%s' ", ctxt->string); 1181 if (ctxt->error) 1182 fprintf(output, "error "); 1183 if (ctxt->neg) 1184 fprintf(output, "neg "); 1185 fprintf(output, "\n"); 1186 fprintf(output, "%d atoms:\n", ctxt->nbAtoms); 1187 for (i = 0;i < ctxt->nbAtoms; i++) { 1188 fprintf(output, " %02d ", i); 1189 xmlRegPrintAtom(output, ctxt->atoms[i]); 1190 } 1191 if (ctxt->atom != NULL) { 1192 fprintf(output, "current atom:\n"); 1193 xmlRegPrintAtom(output, ctxt->atom); 1194 } 1195 fprintf(output, "%d states:", ctxt->nbStates); 1196 if (ctxt->start != NULL) 1197 fprintf(output, " start: %d", ctxt->start->no); 1198 if (ctxt->end != NULL) 1199 fprintf(output, " end: %d", ctxt->end->no); 1200 fprintf(output, "\n"); 1201 for (i = 0;i < ctxt->nbStates; i++) { 1202 xmlRegPrintState(output, ctxt->states[i]); 1203 } 1204 fprintf(output, "%d counters:\n", ctxt->nbCounters); 1205 for (i = 0;i < ctxt->nbCounters; i++) { 1206 fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min, 1207 ctxt->counters[i].max); 1208 } 1209 } 1210 #endif 1211 1212 /************************************************************************ 1213 * * 1214 * Finite Automata structures manipulations * 1215 * * 1216 ************************************************************************/ 1217 1218 static void 1219 xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom, 1220 int neg, xmlRegAtomType type, int start, int end, 1221 xmlChar *blockName) { 1222 xmlRegRangePtr range; 1223 1224 if (atom == NULL) { 1225 ERROR("add range: atom is NULL"); 1226 return; 1227 } 1228 if (atom->type != XML_REGEXP_RANGES) { 1229 ERROR("add range: atom is not ranges"); 1230 return; 1231 } 1232 if (atom->maxRanges == 0) { 1233 atom->maxRanges = 4; 1234 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges * 1235 sizeof(xmlRegRangePtr)); 1236 if (atom->ranges == NULL) { 1237 xmlRegexpErrMemory(ctxt, "adding ranges"); 1238 atom->maxRanges = 0; 1239 return; 1240 } 1241 } else if (atom->nbRanges >= atom->maxRanges) { 1242 xmlRegRangePtr *tmp; 1243 atom->maxRanges *= 2; 1244 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges * 1245 sizeof(xmlRegRangePtr)); 1246 if (tmp == NULL) { 1247 xmlRegexpErrMemory(ctxt, "adding ranges"); 1248 atom->maxRanges /= 2; 1249 return; 1250 } 1251 atom->ranges = tmp; 1252 } 1253 range = xmlRegNewRange(ctxt, neg, type, start, end); 1254 if (range == NULL) 1255 return; 1256 range->blockName = blockName; 1257 atom->ranges[atom->nbRanges++] = range; 1258 1259 } 1260 1261 static int 1262 xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) { 1263 if (ctxt->maxCounters == 0) { 1264 ctxt->maxCounters = 4; 1265 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters * 1266 sizeof(xmlRegCounter)); 1267 if (ctxt->counters == NULL) { 1268 xmlRegexpErrMemory(ctxt, "allocating counter"); 1269 ctxt->maxCounters = 0; 1270 return(-1); 1271 } 1272 } else if (ctxt->nbCounters >= ctxt->maxCounters) { 1273 xmlRegCounter *tmp; 1274 ctxt->maxCounters *= 2; 1275 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters * 1276 sizeof(xmlRegCounter)); 1277 if (tmp == NULL) { 1278 xmlRegexpErrMemory(ctxt, "allocating counter"); 1279 ctxt->maxCounters /= 2; 1280 return(-1); 1281 } 1282 ctxt->counters = tmp; 1283 } 1284 ctxt->counters[ctxt->nbCounters].min = -1; 1285 ctxt->counters[ctxt->nbCounters].max = -1; 1286 return(ctxt->nbCounters++); 1287 } 1288 1289 static int 1290 xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) { 1291 if (atom == NULL) { 1292 ERROR("atom push: atom is NULL"); 1293 return(-1); 1294 } 1295 if (ctxt->maxAtoms == 0) { 1296 ctxt->maxAtoms = 4; 1297 ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms * 1298 sizeof(xmlRegAtomPtr)); 1299 if (ctxt->atoms == NULL) { 1300 xmlRegexpErrMemory(ctxt, "pushing atom"); 1301 ctxt->maxAtoms = 0; 1302 return(-1); 1303 } 1304 } else if (ctxt->nbAtoms >= ctxt->maxAtoms) { 1305 xmlRegAtomPtr *tmp; 1306 ctxt->maxAtoms *= 2; 1307 tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms * 1308 sizeof(xmlRegAtomPtr)); 1309 if (tmp == NULL) { 1310 xmlRegexpErrMemory(ctxt, "allocating counter"); 1311 ctxt->maxAtoms /= 2; 1312 return(-1); 1313 } 1314 ctxt->atoms = tmp; 1315 } 1316 atom->no = ctxt->nbAtoms; 1317 ctxt->atoms[ctxt->nbAtoms++] = atom; 1318 return(0); 1319 } 1320 1321 static void 1322 xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target, 1323 int from) { 1324 if (target->maxTransTo == 0) { 1325 target->maxTransTo = 8; 1326 target->transTo = (int *) xmlMalloc(target->maxTransTo * 1327 sizeof(int)); 1328 if (target->transTo == NULL) { 1329 xmlRegexpErrMemory(ctxt, "adding transition"); 1330 target->maxTransTo = 0; 1331 return; 1332 } 1333 } else if (target->nbTransTo >= target->maxTransTo) { 1334 int *tmp; 1335 target->maxTransTo *= 2; 1336 tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo * 1337 sizeof(int)); 1338 if (tmp == NULL) { 1339 xmlRegexpErrMemory(ctxt, "adding transition"); 1340 target->maxTransTo /= 2; 1341 return; 1342 } 1343 target->transTo = tmp; 1344 } 1345 target->transTo[target->nbTransTo] = from; 1346 target->nbTransTo++; 1347 } 1348 1349 static void 1350 xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 1351 xmlRegAtomPtr atom, xmlRegStatePtr target, 1352 int counter, int count) { 1353 1354 int nrtrans; 1355 1356 if (state == NULL) { 1357 ERROR("add state: state is NULL"); 1358 return; 1359 } 1360 if (target == NULL) { 1361 ERROR("add state: target is NULL"); 1362 return; 1363 } 1364 /* 1365 * Other routines follow the philosophy 'When in doubt, add a transition' 1366 * so we check here whether such a transition is already present and, if 1367 * so, silently ignore this request. 1368 */ 1369 1370 for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) { 1371 xmlRegTransPtr trans = &(state->trans[nrtrans]); 1372 if ((trans->atom == atom) && 1373 (trans->to == target->no) && 1374 (trans->counter == counter) && 1375 (trans->count == count)) { 1376 #ifdef DEBUG_REGEXP_GRAPH 1377 printf("Ignoring duplicate transition from %d to %d\n", 1378 state->no, target->no); 1379 #endif 1380 return; 1381 } 1382 } 1383 1384 if (state->maxTrans == 0) { 1385 state->maxTrans = 8; 1386 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans * 1387 sizeof(xmlRegTrans)); 1388 if (state->trans == NULL) { 1389 xmlRegexpErrMemory(ctxt, "adding transition"); 1390 state->maxTrans = 0; 1391 return; 1392 } 1393 } else if (state->nbTrans >= state->maxTrans) { 1394 xmlRegTrans *tmp; 1395 state->maxTrans *= 2; 1396 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans * 1397 sizeof(xmlRegTrans)); 1398 if (tmp == NULL) { 1399 xmlRegexpErrMemory(ctxt, "adding transition"); 1400 state->maxTrans /= 2; 1401 return; 1402 } 1403 state->trans = tmp; 1404 } 1405 #ifdef DEBUG_REGEXP_GRAPH 1406 printf("Add trans from %d to %d ", state->no, target->no); 1407 if (count == REGEXP_ALL_COUNTER) 1408 printf("all transition\n"); 1409 else if (count >= 0) 1410 printf("count based %d\n", count); 1411 else if (counter >= 0) 1412 printf("counted %d\n", counter); 1413 else if (atom == NULL) 1414 printf("epsilon transition\n"); 1415 else if (atom != NULL) 1416 xmlRegPrintAtom(stdout, atom); 1417 #endif 1418 1419 state->trans[state->nbTrans].atom = atom; 1420 state->trans[state->nbTrans].to = target->no; 1421 state->trans[state->nbTrans].counter = counter; 1422 state->trans[state->nbTrans].count = count; 1423 state->trans[state->nbTrans].nd = 0; 1424 state->nbTrans++; 1425 xmlRegStateAddTransTo(ctxt, target, state->no); 1426 } 1427 1428 static int 1429 xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) { 1430 if (state == NULL) return(-1); 1431 if (ctxt->maxStates == 0) { 1432 ctxt->maxStates = 4; 1433 ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates * 1434 sizeof(xmlRegStatePtr)); 1435 if (ctxt->states == NULL) { 1436 xmlRegexpErrMemory(ctxt, "adding state"); 1437 ctxt->maxStates = 0; 1438 return(-1); 1439 } 1440 } else if (ctxt->nbStates >= ctxt->maxStates) { 1441 xmlRegStatePtr *tmp; 1442 ctxt->maxStates *= 2; 1443 tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates * 1444 sizeof(xmlRegStatePtr)); 1445 if (tmp == NULL) { 1446 xmlRegexpErrMemory(ctxt, "adding state"); 1447 ctxt->maxStates /= 2; 1448 return(-1); 1449 } 1450 ctxt->states = tmp; 1451 } 1452 state->no = ctxt->nbStates; 1453 ctxt->states[ctxt->nbStates++] = state; 1454 return(0); 1455 } 1456 1457 /** 1458 * xmlFAGenerateAllTransition: 1459 * @ctxt: a regexp parser context 1460 * @from: the from state 1461 * @to: the target state or NULL for building a new one 1462 * @lax: 1463 * 1464 */ 1465 static void 1466 xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt, 1467 xmlRegStatePtr from, xmlRegStatePtr to, 1468 int lax) { 1469 if (to == NULL) { 1470 to = xmlRegNewState(ctxt); 1471 xmlRegStatePush(ctxt, to); 1472 ctxt->state = to; 1473 } 1474 if (lax) 1475 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER); 1476 else 1477 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER); 1478 } 1479 1480 /** 1481 * xmlFAGenerateEpsilonTransition: 1482 * @ctxt: a regexp parser context 1483 * @from: the from state 1484 * @to: the target state or NULL for building a new one 1485 * 1486 */ 1487 static void 1488 xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt, 1489 xmlRegStatePtr from, xmlRegStatePtr to) { 1490 if (to == NULL) { 1491 to = xmlRegNewState(ctxt); 1492 xmlRegStatePush(ctxt, to); 1493 ctxt->state = to; 1494 } 1495 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1); 1496 } 1497 1498 /** 1499 * xmlFAGenerateCountedEpsilonTransition: 1500 * @ctxt: a regexp parser context 1501 * @from: the from state 1502 * @to: the target state or NULL for building a new one 1503 * counter: the counter for that transition 1504 * 1505 */ 1506 static void 1507 xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt, 1508 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1509 if (to == NULL) { 1510 to = xmlRegNewState(ctxt); 1511 xmlRegStatePush(ctxt, to); 1512 ctxt->state = to; 1513 } 1514 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1); 1515 } 1516 1517 /** 1518 * xmlFAGenerateCountedTransition: 1519 * @ctxt: a regexp parser context 1520 * @from: the from state 1521 * @to: the target state or NULL for building a new one 1522 * counter: the counter for that transition 1523 * 1524 */ 1525 static void 1526 xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt, 1527 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1528 if (to == NULL) { 1529 to = xmlRegNewState(ctxt); 1530 xmlRegStatePush(ctxt, to); 1531 ctxt->state = to; 1532 } 1533 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter); 1534 } 1535 1536 /** 1537 * xmlFAGenerateTransitions: 1538 * @ctxt: a regexp parser context 1539 * @from: the from state 1540 * @to: the target state or NULL for building a new one 1541 * @atom: the atom generating the transition 1542 * 1543 * Returns 0 if success and -1 in case of error. 1544 */ 1545 static int 1546 xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, 1547 xmlRegStatePtr to, xmlRegAtomPtr atom) { 1548 xmlRegStatePtr end; 1549 int nullable = 0; 1550 1551 if (atom == NULL) { 1552 ERROR("generate transition: atom == NULL"); 1553 return(-1); 1554 } 1555 if (atom->type == XML_REGEXP_SUBREG) { 1556 /* 1557 * this is a subexpression handling one should not need to 1558 * create a new node except for XML_REGEXP_QUANT_RANGE. 1559 */ 1560 if (xmlRegAtomPush(ctxt, atom) < 0) { 1561 return(-1); 1562 } 1563 if ((to != NULL) && (atom->stop != to) && 1564 (atom->quant != XML_REGEXP_QUANT_RANGE)) { 1565 /* 1566 * Generate an epsilon transition to link to the target 1567 */ 1568 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); 1569 #ifdef DV 1570 } else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) && 1571 (atom->quant != XML_REGEXP_QUANT_ONCE)) { 1572 to = xmlRegNewState(ctxt); 1573 xmlRegStatePush(ctxt, to); 1574 ctxt->state = to; 1575 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); 1576 #endif 1577 } 1578 switch (atom->quant) { 1579 case XML_REGEXP_QUANT_OPT: 1580 atom->quant = XML_REGEXP_QUANT_ONCE; 1581 /* 1582 * transition done to the state after end of atom. 1583 * 1. set transition from atom start to new state 1584 * 2. set transition from atom end to this state. 1585 */ 1586 if (to == NULL) { 1587 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0); 1588 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, 1589 ctxt->state); 1590 } else { 1591 xmlFAGenerateEpsilonTransition(ctxt, atom->start, to); 1592 } 1593 break; 1594 case XML_REGEXP_QUANT_MULT: 1595 atom->quant = XML_REGEXP_QUANT_ONCE; 1596 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop); 1597 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1598 break; 1599 case XML_REGEXP_QUANT_PLUS: 1600 atom->quant = XML_REGEXP_QUANT_ONCE; 1601 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1602 break; 1603 case XML_REGEXP_QUANT_RANGE: { 1604 int counter; 1605 xmlRegStatePtr inter, newstate; 1606 1607 /* 1608 * create the final state now if needed 1609 */ 1610 if (to != NULL) { 1611 newstate = to; 1612 } else { 1613 newstate = xmlRegNewState(ctxt); 1614 xmlRegStatePush(ctxt, newstate); 1615 } 1616 1617 /* 1618 * The principle here is to use counted transition 1619 * to avoid explosion in the number of states in the 1620 * graph. This is clearly more complex but should not 1621 * be exploitable at runtime. 1622 */ 1623 if ((atom->min == 0) && (atom->start0 == NULL)) { 1624 xmlRegAtomPtr copy; 1625 /* 1626 * duplicate a transition based on atom to count next 1627 * occurrences after 1. We cannot loop to atom->start 1628 * directly because we need an epsilon transition to 1629 * newstate. 1630 */ 1631 /* ???? For some reason it seems we never reach that 1632 case, I suppose this got optimized out before when 1633 building the automata */ 1634 copy = xmlRegCopyAtom(ctxt, atom); 1635 if (copy == NULL) 1636 return(-1); 1637 copy->quant = XML_REGEXP_QUANT_ONCE; 1638 copy->min = 0; 1639 copy->max = 0; 1640 1641 if (xmlFAGenerateTransitions(ctxt, atom->start, NULL, copy) 1642 < 0) 1643 return(-1); 1644 inter = ctxt->state; 1645 counter = xmlRegGetCounter(ctxt); 1646 ctxt->counters[counter].min = atom->min - 1; 1647 ctxt->counters[counter].max = atom->max - 1; 1648 /* count the number of times we see it again */ 1649 xmlFAGenerateCountedEpsilonTransition(ctxt, inter, 1650 atom->stop, counter); 1651 /* allow a way out based on the count */ 1652 xmlFAGenerateCountedTransition(ctxt, inter, 1653 newstate, counter); 1654 /* and also allow a direct exit for 0 */ 1655 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 1656 newstate); 1657 } else { 1658 /* 1659 * either we need the atom at least once or there 1660 * is an atom->start0 allowing to easily plug the 1661 * epsilon transition. 1662 */ 1663 counter = xmlRegGetCounter(ctxt); 1664 ctxt->counters[counter].min = atom->min - 1; 1665 ctxt->counters[counter].max = atom->max - 1; 1666 /* count the number of times we see it again */ 1667 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop, 1668 atom->start, counter); 1669 /* allow a way out based on the count */ 1670 xmlFAGenerateCountedTransition(ctxt, atom->stop, 1671 newstate, counter); 1672 /* and if needed allow a direct exit for 0 */ 1673 if (atom->min == 0) 1674 xmlFAGenerateEpsilonTransition(ctxt, atom->start0, 1675 newstate); 1676 1677 } 1678 atom->min = 0; 1679 atom->max = 0; 1680 atom->quant = XML_REGEXP_QUANT_ONCE; 1681 ctxt->state = newstate; 1682 } 1683 default: 1684 break; 1685 } 1686 return(0); 1687 } 1688 if ((atom->min == 0) && (atom->max == 0) && 1689 (atom->quant == XML_REGEXP_QUANT_RANGE)) { 1690 /* 1691 * we can discard the atom and generate an epsilon transition instead 1692 */ 1693 if (to == NULL) { 1694 to = xmlRegNewState(ctxt); 1695 if (to != NULL) 1696 xmlRegStatePush(ctxt, to); 1697 else { 1698 return(-1); 1699 } 1700 } 1701 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1702 ctxt->state = to; 1703 xmlRegFreeAtom(atom); 1704 return(0); 1705 } 1706 if (to == NULL) { 1707 to = xmlRegNewState(ctxt); 1708 if (to != NULL) 1709 xmlRegStatePush(ctxt, to); 1710 else { 1711 return(-1); 1712 } 1713 } 1714 end = to; 1715 if ((atom->quant == XML_REGEXP_QUANT_MULT) || 1716 (atom->quant == XML_REGEXP_QUANT_PLUS)) { 1717 /* 1718 * Do not pollute the target state by adding transitions from 1719 * it as it is likely to be the shared target of multiple branches. 1720 * So isolate with an epsilon transition. 1721 */ 1722 xmlRegStatePtr tmp; 1723 1724 tmp = xmlRegNewState(ctxt); 1725 if (tmp != NULL) 1726 xmlRegStatePush(ctxt, tmp); 1727 else { 1728 return(-1); 1729 } 1730 xmlFAGenerateEpsilonTransition(ctxt, tmp, to); 1731 to = tmp; 1732 } 1733 if (xmlRegAtomPush(ctxt, atom) < 0) { 1734 return(-1); 1735 } 1736 if ((atom->quant == XML_REGEXP_QUANT_RANGE) && 1737 (atom->min == 0) && (atom->max > 0)) { 1738 nullable = 1; 1739 atom->min = 1; 1740 if (atom->max == 1) 1741 atom->quant = XML_REGEXP_QUANT_OPT; 1742 } 1743 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1); 1744 ctxt->state = end; 1745 switch (atom->quant) { 1746 case XML_REGEXP_QUANT_OPT: 1747 atom->quant = XML_REGEXP_QUANT_ONCE; 1748 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1749 break; 1750 case XML_REGEXP_QUANT_MULT: 1751 atom->quant = XML_REGEXP_QUANT_ONCE; 1752 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1753 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); 1754 break; 1755 case XML_REGEXP_QUANT_PLUS: 1756 atom->quant = XML_REGEXP_QUANT_ONCE; 1757 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); 1758 break; 1759 case XML_REGEXP_QUANT_RANGE: 1760 if (nullable) 1761 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1762 break; 1763 default: 1764 break; 1765 } 1766 return(0); 1767 } 1768 1769 /** 1770 * xmlFAReduceEpsilonTransitions: 1771 * @ctxt: a regexp parser context 1772 * @fromnr: the from state 1773 * @tonr: the to state 1774 * @counter: should that transition be associated to a counted 1775 * 1776 */ 1777 static void 1778 xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr, 1779 int tonr, int counter) { 1780 int transnr; 1781 xmlRegStatePtr from; 1782 xmlRegStatePtr to; 1783 1784 #ifdef DEBUG_REGEXP_GRAPH 1785 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr); 1786 #endif 1787 from = ctxt->states[fromnr]; 1788 if (from == NULL) 1789 return; 1790 to = ctxt->states[tonr]; 1791 if (to == NULL) 1792 return; 1793 if ((to->mark == XML_REGEXP_MARK_START) || 1794 (to->mark == XML_REGEXP_MARK_VISITED)) 1795 return; 1796 1797 to->mark = XML_REGEXP_MARK_VISITED; 1798 if (to->type == XML_REGEXP_FINAL_STATE) { 1799 #ifdef DEBUG_REGEXP_GRAPH 1800 printf("State %d is final, so %d becomes final\n", tonr, fromnr); 1801 #endif 1802 from->type = XML_REGEXP_FINAL_STATE; 1803 } 1804 for (transnr = 0;transnr < to->nbTrans;transnr++) { 1805 if (to->trans[transnr].to < 0) 1806 continue; 1807 if (to->trans[transnr].atom == NULL) { 1808 /* 1809 * Don't remove counted transitions 1810 * Don't loop either 1811 */ 1812 if (to->trans[transnr].to != fromnr) { 1813 if (to->trans[transnr].count >= 0) { 1814 int newto = to->trans[transnr].to; 1815 1816 xmlRegStateAddTrans(ctxt, from, NULL, 1817 ctxt->states[newto], 1818 -1, to->trans[transnr].count); 1819 } else { 1820 #ifdef DEBUG_REGEXP_GRAPH 1821 printf("Found epsilon trans %d from %d to %d\n", 1822 transnr, tonr, to->trans[transnr].to); 1823 #endif 1824 if (to->trans[transnr].counter >= 0) { 1825 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1826 to->trans[transnr].to, 1827 to->trans[transnr].counter); 1828 } else { 1829 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1830 to->trans[transnr].to, 1831 counter); 1832 } 1833 } 1834 } 1835 } else { 1836 int newto = to->trans[transnr].to; 1837 1838 if (to->trans[transnr].counter >= 0) { 1839 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1840 ctxt->states[newto], 1841 to->trans[transnr].counter, -1); 1842 } else { 1843 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1844 ctxt->states[newto], counter, -1); 1845 } 1846 } 1847 } 1848 to->mark = XML_REGEXP_MARK_NORMAL; 1849 } 1850 1851 /** 1852 * xmlFAEliminateSimpleEpsilonTransitions: 1853 * @ctxt: a regexp parser context 1854 * 1855 * Eliminating general epsilon transitions can get costly in the general 1856 * algorithm due to the large amount of generated new transitions and 1857 * associated comparisons. However for simple epsilon transition used just 1858 * to separate building blocks when generating the automata this can be 1859 * reduced to state elimination: 1860 * - if there exists an epsilon from X to Y 1861 * - if there is no other transition from X 1862 * then X and Y are semantically equivalent and X can be eliminated 1863 * If X is the start state then make Y the start state, else replace the 1864 * target of all transitions to X by transitions to Y. 1865 */ 1866 static void 1867 xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { 1868 int statenr, i, j, newto; 1869 xmlRegStatePtr state, tmp; 1870 1871 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1872 state = ctxt->states[statenr]; 1873 if (state == NULL) 1874 continue; 1875 if (state->nbTrans != 1) 1876 continue; 1877 if (state->type == XML_REGEXP_UNREACH_STATE) 1878 continue; 1879 /* is the only transition out a basic transition */ 1880 if ((state->trans[0].atom == NULL) && 1881 (state->trans[0].to >= 0) && 1882 (state->trans[0].to != statenr) && 1883 (state->trans[0].counter < 0) && 1884 (state->trans[0].count < 0)) { 1885 newto = state->trans[0].to; 1886 1887 if (state->type == XML_REGEXP_START_STATE) { 1888 #ifdef DEBUG_REGEXP_GRAPH 1889 printf("Found simple epsilon trans from start %d to %d\n", 1890 statenr, newto); 1891 #endif 1892 } else { 1893 #ifdef DEBUG_REGEXP_GRAPH 1894 printf("Found simple epsilon trans from %d to %d\n", 1895 statenr, newto); 1896 #endif 1897 for (i = 0;i < state->nbTransTo;i++) { 1898 tmp = ctxt->states[state->transTo[i]]; 1899 for (j = 0;j < tmp->nbTrans;j++) { 1900 if (tmp->trans[j].to == statenr) { 1901 #ifdef DEBUG_REGEXP_GRAPH 1902 printf("Changed transition %d on %d to go to %d\n", 1903 j, tmp->no, newto); 1904 #endif 1905 tmp->trans[j].to = -1; 1906 xmlRegStateAddTrans(ctxt, tmp, tmp->trans[j].atom, 1907 ctxt->states[newto], 1908 tmp->trans[j].counter, 1909 tmp->trans[j].count); 1910 } 1911 } 1912 } 1913 if (state->type == XML_REGEXP_FINAL_STATE) 1914 ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE; 1915 /* eliminate the transition completely */ 1916 state->nbTrans = 0; 1917 1918 state->type = XML_REGEXP_UNREACH_STATE; 1919 1920 } 1921 1922 } 1923 } 1924 } 1925 /** 1926 * xmlFAEliminateEpsilonTransitions: 1927 * @ctxt: a regexp parser context 1928 * 1929 */ 1930 static void 1931 xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { 1932 int statenr, transnr; 1933 xmlRegStatePtr state; 1934 int has_epsilon; 1935 1936 if (ctxt->states == NULL) return; 1937 1938 /* 1939 * Eliminate simple epsilon transition and the associated unreachable 1940 * states. 1941 */ 1942 xmlFAEliminateSimpleEpsilonTransitions(ctxt); 1943 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1944 state = ctxt->states[statenr]; 1945 if ((state != NULL) && (state->type == XML_REGEXP_UNREACH_STATE)) { 1946 #ifdef DEBUG_REGEXP_GRAPH 1947 printf("Removed unreachable state %d\n", statenr); 1948 #endif 1949 xmlRegFreeState(state); 1950 ctxt->states[statenr] = NULL; 1951 } 1952 } 1953 1954 has_epsilon = 0; 1955 1956 /* 1957 * Build the completed transitions bypassing the epsilons 1958 * Use a marking algorithm to avoid loops 1959 * Mark sink states too. 1960 * Process from the latest states backward to the start when 1961 * there is long cascading epsilon chains this minimize the 1962 * recursions and transition compares when adding the new ones 1963 */ 1964 for (statenr = ctxt->nbStates - 1;statenr >= 0;statenr--) { 1965 state = ctxt->states[statenr]; 1966 if (state == NULL) 1967 continue; 1968 if ((state->nbTrans == 0) && 1969 (state->type != XML_REGEXP_FINAL_STATE)) { 1970 state->type = XML_REGEXP_SINK_STATE; 1971 } 1972 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1973 if ((state->trans[transnr].atom == NULL) && 1974 (state->trans[transnr].to >= 0)) { 1975 if (state->trans[transnr].to == statenr) { 1976 state->trans[transnr].to = -1; 1977 #ifdef DEBUG_REGEXP_GRAPH 1978 printf("Removed loopback epsilon trans %d on %d\n", 1979 transnr, statenr); 1980 #endif 1981 } else if (state->trans[transnr].count < 0) { 1982 int newto = state->trans[transnr].to; 1983 1984 #ifdef DEBUG_REGEXP_GRAPH 1985 printf("Found epsilon trans %d from %d to %d\n", 1986 transnr, statenr, newto); 1987 #endif 1988 has_epsilon = 1; 1989 state->trans[transnr].to = -2; 1990 state->mark = XML_REGEXP_MARK_START; 1991 xmlFAReduceEpsilonTransitions(ctxt, statenr, 1992 newto, state->trans[transnr].counter); 1993 state->mark = XML_REGEXP_MARK_NORMAL; 1994 #ifdef DEBUG_REGEXP_GRAPH 1995 } else { 1996 printf("Found counted transition %d on %d\n", 1997 transnr, statenr); 1998 #endif 1999 } 2000 } 2001 } 2002 } 2003 /* 2004 * Eliminate the epsilon transitions 2005 */ 2006 if (has_epsilon) { 2007 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2008 state = ctxt->states[statenr]; 2009 if (state == NULL) 2010 continue; 2011 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2012 xmlRegTransPtr trans = &(state->trans[transnr]); 2013 if ((trans->atom == NULL) && 2014 (trans->count < 0) && 2015 (trans->to >= 0)) { 2016 trans->to = -1; 2017 } 2018 } 2019 } 2020 } 2021 2022 /* 2023 * Use this pass to detect unreachable states too 2024 */ 2025 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2026 state = ctxt->states[statenr]; 2027 if (state != NULL) 2028 state->reached = XML_REGEXP_MARK_NORMAL; 2029 } 2030 state = ctxt->states[0]; 2031 if (state != NULL) 2032 state->reached = XML_REGEXP_MARK_START; 2033 while (state != NULL) { 2034 xmlRegStatePtr target = NULL; 2035 state->reached = XML_REGEXP_MARK_VISITED; 2036 /* 2037 * Mark all states reachable from the current reachable state 2038 */ 2039 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2040 if ((state->trans[transnr].to >= 0) && 2041 ((state->trans[transnr].atom != NULL) || 2042 (state->trans[transnr].count >= 0))) { 2043 int newto = state->trans[transnr].to; 2044 2045 if (ctxt->states[newto] == NULL) 2046 continue; 2047 if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) { 2048 ctxt->states[newto]->reached = XML_REGEXP_MARK_START; 2049 target = ctxt->states[newto]; 2050 } 2051 } 2052 } 2053 2054 /* 2055 * find the next accessible state not explored 2056 */ 2057 if (target == NULL) { 2058 for (statenr = 1;statenr < ctxt->nbStates;statenr++) { 2059 state = ctxt->states[statenr]; 2060 if ((state != NULL) && (state->reached == 2061 XML_REGEXP_MARK_START)) { 2062 target = state; 2063 break; 2064 } 2065 } 2066 } 2067 state = target; 2068 } 2069 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2070 state = ctxt->states[statenr]; 2071 if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) { 2072 #ifdef DEBUG_REGEXP_GRAPH 2073 printf("Removed unreachable state %d\n", statenr); 2074 #endif 2075 xmlRegFreeState(state); 2076 ctxt->states[statenr] = NULL; 2077 } 2078 } 2079 2080 } 2081 2082 static int 2083 xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) { 2084 int ret = 0; 2085 2086 if ((range1->type == XML_REGEXP_RANGES) || 2087 (range2->type == XML_REGEXP_RANGES) || 2088 (range2->type == XML_REGEXP_SUBREG) || 2089 (range1->type == XML_REGEXP_SUBREG) || 2090 (range1->type == XML_REGEXP_STRING) || 2091 (range2->type == XML_REGEXP_STRING)) 2092 return(-1); 2093 2094 /* put them in order */ 2095 if (range1->type > range2->type) { 2096 xmlRegRangePtr tmp; 2097 2098 tmp = range1; 2099 range1 = range2; 2100 range2 = tmp; 2101 } 2102 if ((range1->type == XML_REGEXP_ANYCHAR) || 2103 (range2->type == XML_REGEXP_ANYCHAR)) { 2104 ret = 1; 2105 } else if ((range1->type == XML_REGEXP_EPSILON) || 2106 (range2->type == XML_REGEXP_EPSILON)) { 2107 return(0); 2108 } else if (range1->type == range2->type) { 2109 if (range1->type != XML_REGEXP_CHARVAL) 2110 ret = 1; 2111 else if ((range1->end < range2->start) || 2112 (range2->end < range1->start)) 2113 ret = 0; 2114 else 2115 ret = 1; 2116 } else if (range1->type == XML_REGEXP_CHARVAL) { 2117 int codepoint; 2118 int neg = 0; 2119 2120 /* 2121 * just check all codepoints in the range for acceptance, 2122 * this is usually way cheaper since done only once at 2123 * compilation than testing over and over at runtime or 2124 * pushing too many states when evaluating. 2125 */ 2126 if (((range1->neg == 0) && (range2->neg != 0)) || 2127 ((range1->neg != 0) && (range2->neg == 0))) 2128 neg = 1; 2129 2130 for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) { 2131 ret = xmlRegCheckCharacterRange(range2->type, codepoint, 2132 0, range2->start, range2->end, 2133 range2->blockName); 2134 if (ret < 0) 2135 return(-1); 2136 if (((neg == 1) && (ret == 0)) || 2137 ((neg == 0) && (ret == 1))) 2138 return(1); 2139 } 2140 return(0); 2141 } else if ((range1->type == XML_REGEXP_BLOCK_NAME) || 2142 (range2->type == XML_REGEXP_BLOCK_NAME)) { 2143 if (range1->type == range2->type) { 2144 ret = xmlStrEqual(range1->blockName, range2->blockName); 2145 } else { 2146 /* 2147 * comparing a block range with anything else is way 2148 * too costly, and maintaining the table is like too much 2149 * memory too, so let's force the automata to save state 2150 * here. 2151 */ 2152 return(1); 2153 } 2154 } else if ((range1->type < XML_REGEXP_LETTER) || 2155 (range2->type < XML_REGEXP_LETTER)) { 2156 if ((range1->type == XML_REGEXP_ANYSPACE) && 2157 (range2->type == XML_REGEXP_NOTSPACE)) 2158 ret = 0; 2159 else if ((range1->type == XML_REGEXP_INITNAME) && 2160 (range2->type == XML_REGEXP_NOTINITNAME)) 2161 ret = 0; 2162 else if ((range1->type == XML_REGEXP_NAMECHAR) && 2163 (range2->type == XML_REGEXP_NOTNAMECHAR)) 2164 ret = 0; 2165 else if ((range1->type == XML_REGEXP_DECIMAL) && 2166 (range2->type == XML_REGEXP_NOTDECIMAL)) 2167 ret = 0; 2168 else if ((range1->type == XML_REGEXP_REALCHAR) && 2169 (range2->type == XML_REGEXP_NOTREALCHAR)) 2170 ret = 0; 2171 else { 2172 /* same thing to limit complexity */ 2173 return(1); 2174 } 2175 } else { 2176 ret = 0; 2177 /* range1->type < range2->type here */ 2178 switch (range1->type) { 2179 case XML_REGEXP_LETTER: 2180 /* all disjoint except in the subgroups */ 2181 if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) || 2182 (range2->type == XML_REGEXP_LETTER_LOWERCASE) || 2183 (range2->type == XML_REGEXP_LETTER_TITLECASE) || 2184 (range2->type == XML_REGEXP_LETTER_MODIFIER) || 2185 (range2->type == XML_REGEXP_LETTER_OTHERS)) 2186 ret = 1; 2187 break; 2188 case XML_REGEXP_MARK: 2189 if ((range2->type == XML_REGEXP_MARK_NONSPACING) || 2190 (range2->type == XML_REGEXP_MARK_SPACECOMBINING) || 2191 (range2->type == XML_REGEXP_MARK_ENCLOSING)) 2192 ret = 1; 2193 break; 2194 case XML_REGEXP_NUMBER: 2195 if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) || 2196 (range2->type == XML_REGEXP_NUMBER_LETTER) || 2197 (range2->type == XML_REGEXP_NUMBER_OTHERS)) 2198 ret = 1; 2199 break; 2200 case XML_REGEXP_PUNCT: 2201 if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) || 2202 (range2->type == XML_REGEXP_PUNCT_DASH) || 2203 (range2->type == XML_REGEXP_PUNCT_OPEN) || 2204 (range2->type == XML_REGEXP_PUNCT_CLOSE) || 2205 (range2->type == XML_REGEXP_PUNCT_INITQUOTE) || 2206 (range2->type == XML_REGEXP_PUNCT_FINQUOTE) || 2207 (range2->type == XML_REGEXP_PUNCT_OTHERS)) 2208 ret = 1; 2209 break; 2210 case XML_REGEXP_SEPAR: 2211 if ((range2->type == XML_REGEXP_SEPAR_SPACE) || 2212 (range2->type == XML_REGEXP_SEPAR_LINE) || 2213 (range2->type == XML_REGEXP_SEPAR_PARA)) 2214 ret = 1; 2215 break; 2216 case XML_REGEXP_SYMBOL: 2217 if ((range2->type == XML_REGEXP_SYMBOL_MATH) || 2218 (range2->type == XML_REGEXP_SYMBOL_CURRENCY) || 2219 (range2->type == XML_REGEXP_SYMBOL_MODIFIER) || 2220 (range2->type == XML_REGEXP_SYMBOL_OTHERS)) 2221 ret = 1; 2222 break; 2223 case XML_REGEXP_OTHER: 2224 if ((range2->type == XML_REGEXP_OTHER_CONTROL) || 2225 (range2->type == XML_REGEXP_OTHER_FORMAT) || 2226 (range2->type == XML_REGEXP_OTHER_PRIVATE)) 2227 ret = 1; 2228 break; 2229 default: 2230 if ((range2->type >= XML_REGEXP_LETTER) && 2231 (range2->type < XML_REGEXP_BLOCK_NAME)) 2232 ret = 0; 2233 else { 2234 /* safety net ! */ 2235 return(1); 2236 } 2237 } 2238 } 2239 if (((range1->neg == 0) && (range2->neg != 0)) || 2240 ((range1->neg != 0) && (range2->neg == 0))) 2241 ret = !ret; 2242 return(ret); 2243 } 2244 2245 /** 2246 * xmlFACompareAtomTypes: 2247 * @type1: an atom type 2248 * @type2: an atom type 2249 * 2250 * Compares two atoms type to check whether they intersect in some ways, 2251 * this is used by xmlFACompareAtoms only 2252 * 2253 * Returns 1 if they may intersect and 0 otherwise 2254 */ 2255 static int 2256 xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) { 2257 if ((type1 == XML_REGEXP_EPSILON) || 2258 (type1 == XML_REGEXP_CHARVAL) || 2259 (type1 == XML_REGEXP_RANGES) || 2260 (type1 == XML_REGEXP_SUBREG) || 2261 (type1 == XML_REGEXP_STRING) || 2262 (type1 == XML_REGEXP_ANYCHAR)) 2263 return(1); 2264 if ((type2 == XML_REGEXP_EPSILON) || 2265 (type2 == XML_REGEXP_CHARVAL) || 2266 (type2 == XML_REGEXP_RANGES) || 2267 (type2 == XML_REGEXP_SUBREG) || 2268 (type2 == XML_REGEXP_STRING) || 2269 (type2 == XML_REGEXP_ANYCHAR)) 2270 return(1); 2271 2272 if (type1 == type2) return(1); 2273 2274 /* simplify subsequent compares by making sure type1 < type2 */ 2275 if (type1 > type2) { 2276 xmlRegAtomType tmp = type1; 2277 type1 = type2; 2278 type2 = tmp; 2279 } 2280 switch (type1) { 2281 case XML_REGEXP_ANYSPACE: /* \s */ 2282 /* can't be a letter, number, mark, punctuation, symbol */ 2283 if ((type2 == XML_REGEXP_NOTSPACE) || 2284 ((type2 >= XML_REGEXP_LETTER) && 2285 (type2 <= XML_REGEXP_LETTER_OTHERS)) || 2286 ((type2 >= XML_REGEXP_NUMBER) && 2287 (type2 <= XML_REGEXP_NUMBER_OTHERS)) || 2288 ((type2 >= XML_REGEXP_MARK) && 2289 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2290 ((type2 >= XML_REGEXP_PUNCT) && 2291 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2292 ((type2 >= XML_REGEXP_SYMBOL) && 2293 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) 2294 ) return(0); 2295 break; 2296 case XML_REGEXP_NOTSPACE: /* \S */ 2297 break; 2298 case XML_REGEXP_INITNAME: /* \l */ 2299 /* can't be a number, mark, separator, punctuation, symbol or other */ 2300 if ((type2 == XML_REGEXP_NOTINITNAME) || 2301 ((type2 >= XML_REGEXP_NUMBER) && 2302 (type2 <= XML_REGEXP_NUMBER_OTHERS)) || 2303 ((type2 >= XML_REGEXP_MARK) && 2304 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2305 ((type2 >= XML_REGEXP_SEPAR) && 2306 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2307 ((type2 >= XML_REGEXP_PUNCT) && 2308 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2309 ((type2 >= XML_REGEXP_SYMBOL) && 2310 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2311 ((type2 >= XML_REGEXP_OTHER) && 2312 (type2 <= XML_REGEXP_OTHER_NA)) 2313 ) return(0); 2314 break; 2315 case XML_REGEXP_NOTINITNAME: /* \L */ 2316 break; 2317 case XML_REGEXP_NAMECHAR: /* \c */ 2318 /* can't be a mark, separator, punctuation, symbol or other */ 2319 if ((type2 == XML_REGEXP_NOTNAMECHAR) || 2320 ((type2 >= XML_REGEXP_MARK) && 2321 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2322 ((type2 >= XML_REGEXP_PUNCT) && 2323 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2324 ((type2 >= XML_REGEXP_SEPAR) && 2325 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2326 ((type2 >= XML_REGEXP_SYMBOL) && 2327 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2328 ((type2 >= XML_REGEXP_OTHER) && 2329 (type2 <= XML_REGEXP_OTHER_NA)) 2330 ) return(0); 2331 break; 2332 case XML_REGEXP_NOTNAMECHAR: /* \C */ 2333 break; 2334 case XML_REGEXP_DECIMAL: /* \d */ 2335 /* can't be a letter, mark, separator, punctuation, symbol or other */ 2336 if ((type2 == XML_REGEXP_NOTDECIMAL) || 2337 (type2 == XML_REGEXP_REALCHAR) || 2338 ((type2 >= XML_REGEXP_LETTER) && 2339 (type2 <= XML_REGEXP_LETTER_OTHERS)) || 2340 ((type2 >= XML_REGEXP_MARK) && 2341 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2342 ((type2 >= XML_REGEXP_PUNCT) && 2343 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2344 ((type2 >= XML_REGEXP_SEPAR) && 2345 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2346 ((type2 >= XML_REGEXP_SYMBOL) && 2347 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2348 ((type2 >= XML_REGEXP_OTHER) && 2349 (type2 <= XML_REGEXP_OTHER_NA)) 2350 )return(0); 2351 break; 2352 case XML_REGEXP_NOTDECIMAL: /* \D */ 2353 break; 2354 case XML_REGEXP_REALCHAR: /* \w */ 2355 /* can't be a mark, separator, punctuation, symbol or other */ 2356 if ((type2 == XML_REGEXP_NOTDECIMAL) || 2357 ((type2 >= XML_REGEXP_MARK) && 2358 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2359 ((type2 >= XML_REGEXP_PUNCT) && 2360 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2361 ((type2 >= XML_REGEXP_SEPAR) && 2362 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2363 ((type2 >= XML_REGEXP_SYMBOL) && 2364 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2365 ((type2 >= XML_REGEXP_OTHER) && 2366 (type2 <= XML_REGEXP_OTHER_NA)) 2367 )return(0); 2368 break; 2369 case XML_REGEXP_NOTREALCHAR: /* \W */ 2370 break; 2371 /* 2372 * at that point we know both type 1 and type2 are from 2373 * character categories are ordered and are different, 2374 * it becomes simple because this is a partition 2375 */ 2376 case XML_REGEXP_LETTER: 2377 if (type2 <= XML_REGEXP_LETTER_OTHERS) 2378 return(1); 2379 return(0); 2380 case XML_REGEXP_LETTER_UPPERCASE: 2381 case XML_REGEXP_LETTER_LOWERCASE: 2382 case XML_REGEXP_LETTER_TITLECASE: 2383 case XML_REGEXP_LETTER_MODIFIER: 2384 case XML_REGEXP_LETTER_OTHERS: 2385 return(0); 2386 case XML_REGEXP_MARK: 2387 if (type2 <= XML_REGEXP_MARK_ENCLOSING) 2388 return(1); 2389 return(0); 2390 case XML_REGEXP_MARK_NONSPACING: 2391 case XML_REGEXP_MARK_SPACECOMBINING: 2392 case XML_REGEXP_MARK_ENCLOSING: 2393 return(0); 2394 case XML_REGEXP_NUMBER: 2395 if (type2 <= XML_REGEXP_NUMBER_OTHERS) 2396 return(1); 2397 return(0); 2398 case XML_REGEXP_NUMBER_DECIMAL: 2399 case XML_REGEXP_NUMBER_LETTER: 2400 case XML_REGEXP_NUMBER_OTHERS: 2401 return(0); 2402 case XML_REGEXP_PUNCT: 2403 if (type2 <= XML_REGEXP_PUNCT_OTHERS) 2404 return(1); 2405 return(0); 2406 case XML_REGEXP_PUNCT_CONNECTOR: 2407 case XML_REGEXP_PUNCT_DASH: 2408 case XML_REGEXP_PUNCT_OPEN: 2409 case XML_REGEXP_PUNCT_CLOSE: 2410 case XML_REGEXP_PUNCT_INITQUOTE: 2411 case XML_REGEXP_PUNCT_FINQUOTE: 2412 case XML_REGEXP_PUNCT_OTHERS: 2413 return(0); 2414 case XML_REGEXP_SEPAR: 2415 if (type2 <= XML_REGEXP_SEPAR_PARA) 2416 return(1); 2417 return(0); 2418 case XML_REGEXP_SEPAR_SPACE: 2419 case XML_REGEXP_SEPAR_LINE: 2420 case XML_REGEXP_SEPAR_PARA: 2421 return(0); 2422 case XML_REGEXP_SYMBOL: 2423 if (type2 <= XML_REGEXP_SYMBOL_OTHERS) 2424 return(1); 2425 return(0); 2426 case XML_REGEXP_SYMBOL_MATH: 2427 case XML_REGEXP_SYMBOL_CURRENCY: 2428 case XML_REGEXP_SYMBOL_MODIFIER: 2429 case XML_REGEXP_SYMBOL_OTHERS: 2430 return(0); 2431 case XML_REGEXP_OTHER: 2432 if (type2 <= XML_REGEXP_OTHER_NA) 2433 return(1); 2434 return(0); 2435 case XML_REGEXP_OTHER_CONTROL: 2436 case XML_REGEXP_OTHER_FORMAT: 2437 case XML_REGEXP_OTHER_PRIVATE: 2438 case XML_REGEXP_OTHER_NA: 2439 return(0); 2440 default: 2441 break; 2442 } 2443 return(1); 2444 } 2445 2446 /** 2447 * xmlFAEqualAtoms: 2448 * @atom1: an atom 2449 * @atom2: an atom 2450 * @deep: if not set only compare string pointers 2451 * 2452 * Compares two atoms to check whether they are the same exactly 2453 * this is used to remove equivalent transitions 2454 * 2455 * Returns 1 if same and 0 otherwise 2456 */ 2457 static int 2458 xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) { 2459 int ret = 0; 2460 2461 if (atom1 == atom2) 2462 return(1); 2463 if ((atom1 == NULL) || (atom2 == NULL)) 2464 return(0); 2465 2466 if (atom1->type != atom2->type) 2467 return(0); 2468 switch (atom1->type) { 2469 case XML_REGEXP_EPSILON: 2470 ret = 0; 2471 break; 2472 case XML_REGEXP_STRING: 2473 if (!deep) 2474 ret = (atom1->valuep == atom2->valuep); 2475 else 2476 ret = xmlStrEqual((xmlChar *)atom1->valuep, 2477 (xmlChar *)atom2->valuep); 2478 break; 2479 case XML_REGEXP_CHARVAL: 2480 ret = (atom1->codepoint == atom2->codepoint); 2481 break; 2482 case XML_REGEXP_RANGES: 2483 /* too hard to do in the general case */ 2484 ret = 0; 2485 default: 2486 break; 2487 } 2488 return(ret); 2489 } 2490 2491 /** 2492 * xmlFACompareAtoms: 2493 * @atom1: an atom 2494 * @atom2: an atom 2495 * @deep: if not set only compare string pointers 2496 * 2497 * Compares two atoms to check whether they intersect in some ways, 2498 * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only 2499 * 2500 * Returns 1 if yes and 0 otherwise 2501 */ 2502 static int 2503 xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) { 2504 int ret = 1; 2505 2506 if (atom1 == atom2) 2507 return(1); 2508 if ((atom1 == NULL) || (atom2 == NULL)) 2509 return(0); 2510 2511 if ((atom1->type == XML_REGEXP_ANYCHAR) || 2512 (atom2->type == XML_REGEXP_ANYCHAR)) 2513 return(1); 2514 2515 if (atom1->type > atom2->type) { 2516 xmlRegAtomPtr tmp; 2517 tmp = atom1; 2518 atom1 = atom2; 2519 atom2 = tmp; 2520 } 2521 if (atom1->type != atom2->type) { 2522 ret = xmlFACompareAtomTypes(atom1->type, atom2->type); 2523 /* if they can't intersect at the type level break now */ 2524 if (ret == 0) 2525 return(0); 2526 } 2527 switch (atom1->type) { 2528 case XML_REGEXP_STRING: 2529 if (!deep) 2530 ret = (atom1->valuep != atom2->valuep); 2531 else { 2532 xmlChar *val1 = (xmlChar *)atom1->valuep; 2533 xmlChar *val2 = (xmlChar *)atom2->valuep; 2534 int compound1 = (xmlStrchr(val1, '|') != NULL); 2535 int compound2 = (xmlStrchr(val2, '|') != NULL); 2536 2537 /* Ignore negative match flag for ##other namespaces */ 2538 if (compound1 != compound2) 2539 return(0); 2540 2541 ret = xmlRegStrEqualWildcard(val1, val2); 2542 } 2543 break; 2544 case XML_REGEXP_EPSILON: 2545 goto not_determinist; 2546 case XML_REGEXP_CHARVAL: 2547 if (atom2->type == XML_REGEXP_CHARVAL) { 2548 ret = (atom1->codepoint == atom2->codepoint); 2549 } else { 2550 ret = xmlRegCheckCharacter(atom2, atom1->codepoint); 2551 if (ret < 0) 2552 ret = 1; 2553 } 2554 break; 2555 case XML_REGEXP_RANGES: 2556 if (atom2->type == XML_REGEXP_RANGES) { 2557 int i, j, res; 2558 xmlRegRangePtr r1, r2; 2559 2560 /* 2561 * need to check that none of the ranges eventually matches 2562 */ 2563 for (i = 0;i < atom1->nbRanges;i++) { 2564 for (j = 0;j < atom2->nbRanges;j++) { 2565 r1 = atom1->ranges[i]; 2566 r2 = atom2->ranges[j]; 2567 res = xmlFACompareRanges(r1, r2); 2568 if (res == 1) { 2569 ret = 1; 2570 goto done; 2571 } 2572 } 2573 } 2574 ret = 0; 2575 } 2576 break; 2577 default: 2578 goto not_determinist; 2579 } 2580 done: 2581 if (atom1->neg != atom2->neg) { 2582 ret = !ret; 2583 } 2584 if (ret == 0) 2585 return(0); 2586 not_determinist: 2587 return(1); 2588 } 2589 2590 /** 2591 * xmlFARecurseDeterminism: 2592 * @ctxt: a regexp parser context 2593 * 2594 * Check whether the associated regexp is determinist, 2595 * should be called after xmlFAEliminateEpsilonTransitions() 2596 * 2597 */ 2598 static int 2599 xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 2600 int to, xmlRegAtomPtr atom) { 2601 int ret = 1; 2602 int res; 2603 int transnr, nbTrans; 2604 xmlRegTransPtr t1; 2605 int deep = 1; 2606 2607 if (state == NULL) 2608 return(ret); 2609 if (state->markd == XML_REGEXP_MARK_VISITED) 2610 return(ret); 2611 2612 if (ctxt->flags & AM_AUTOMATA_RNG) 2613 deep = 0; 2614 2615 /* 2616 * don't recurse on transitions potentially added in the course of 2617 * the elimination. 2618 */ 2619 nbTrans = state->nbTrans; 2620 for (transnr = 0;transnr < nbTrans;transnr++) { 2621 t1 = &(state->trans[transnr]); 2622 /* 2623 * check transitions conflicting with the one looked at 2624 */ 2625 if (t1->atom == NULL) { 2626 if (t1->to < 0) 2627 continue; 2628 state->markd = XML_REGEXP_MARK_VISITED; 2629 res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], 2630 to, atom); 2631 state->markd = 0; 2632 if (res == 0) { 2633 ret = 0; 2634 /* t1->nd = 1; */ 2635 } 2636 continue; 2637 } 2638 if (t1->to != to) 2639 continue; 2640 if (xmlFACompareAtoms(t1->atom, atom, deep)) { 2641 ret = 0; 2642 /* mark the transition as non-deterministic */ 2643 t1->nd = 1; 2644 } 2645 } 2646 return(ret); 2647 } 2648 2649 /** 2650 * xmlFAComputesDeterminism: 2651 * @ctxt: a regexp parser context 2652 * 2653 * Check whether the associated regexp is determinist, 2654 * should be called after xmlFAEliminateEpsilonTransitions() 2655 * 2656 */ 2657 static int 2658 xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) { 2659 int statenr, transnr; 2660 xmlRegStatePtr state; 2661 xmlRegTransPtr t1, t2, last; 2662 int i; 2663 int ret = 1; 2664 int deep = 1; 2665 2666 #ifdef DEBUG_REGEXP_GRAPH 2667 printf("xmlFAComputesDeterminism\n"); 2668 xmlRegPrintCtxt(stdout, ctxt); 2669 #endif 2670 if (ctxt->determinist != -1) 2671 return(ctxt->determinist); 2672 2673 if (ctxt->flags & AM_AUTOMATA_RNG) 2674 deep = 0; 2675 2676 /* 2677 * First cleanup the automata removing cancelled transitions 2678 */ 2679 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2680 state = ctxt->states[statenr]; 2681 if (state == NULL) 2682 continue; 2683 if (state->nbTrans < 2) 2684 continue; 2685 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2686 t1 = &(state->trans[transnr]); 2687 /* 2688 * Determinism checks in case of counted or all transitions 2689 * will have to be handled separately 2690 */ 2691 if (t1->atom == NULL) { 2692 /* t1->nd = 1; */ 2693 continue; 2694 } 2695 if (t1->to == -1) /* eliminated */ 2696 continue; 2697 for (i = 0;i < transnr;i++) { 2698 t2 = &(state->trans[i]); 2699 if (t2->to == -1) /* eliminated */ 2700 continue; 2701 if (t2->atom != NULL) { 2702 if (t1->to == t2->to) { 2703 /* 2704 * Here we use deep because we want to keep the 2705 * transitions which indicate a conflict 2706 */ 2707 if (xmlFAEqualAtoms(t1->atom, t2->atom, deep) && 2708 (t1->counter == t2->counter) && 2709 (t1->count == t2->count)) 2710 t2->to = -1; /* eliminated */ 2711 } 2712 } 2713 } 2714 } 2715 } 2716 2717 /* 2718 * Check for all states that there aren't 2 transitions 2719 * with the same atom and a different target. 2720 */ 2721 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2722 state = ctxt->states[statenr]; 2723 if (state == NULL) 2724 continue; 2725 if (state->nbTrans < 2) 2726 continue; 2727 last = NULL; 2728 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2729 t1 = &(state->trans[transnr]); 2730 /* 2731 * Determinism checks in case of counted or all transitions 2732 * will have to be handled separately 2733 */ 2734 if (t1->atom == NULL) { 2735 continue; 2736 } 2737 if (t1->to == -1) /* eliminated */ 2738 continue; 2739 for (i = 0;i < transnr;i++) { 2740 t2 = &(state->trans[i]); 2741 if (t2->to == -1) /* eliminated */ 2742 continue; 2743 if (t2->atom != NULL) { 2744 /* 2745 * But here we don't use deep because we want to 2746 * find transitions which indicate a conflict 2747 */ 2748 if (xmlFACompareAtoms(t1->atom, t2->atom, 1)) { 2749 ret = 0; 2750 /* mark the transitions as non-deterministic ones */ 2751 t1->nd = 1; 2752 t2->nd = 1; 2753 last = t1; 2754 } 2755 } else if (t1->to != -1) { 2756 /* 2757 * do the closure in case of remaining specific 2758 * epsilon transitions like choices or all 2759 */ 2760 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], 2761 t2->to, t2->atom); 2762 /* don't shortcut the computation so all non deterministic 2763 transition get marked down 2764 if (ret == 0) 2765 return(0); 2766 */ 2767 if (ret == 0) { 2768 t1->nd = 1; 2769 /* t2->nd = 1; */ 2770 last = t1; 2771 } 2772 } 2773 } 2774 /* don't shortcut the computation so all non deterministic 2775 transition get marked down 2776 if (ret == 0) 2777 break; */ 2778 } 2779 2780 /* 2781 * mark specifically the last non-deterministic transition 2782 * from a state since there is no need to set-up rollback 2783 * from it 2784 */ 2785 if (last != NULL) { 2786 last->nd = 2; 2787 } 2788 2789 /* don't shortcut the computation so all non deterministic 2790 transition get marked down 2791 if (ret == 0) 2792 break; */ 2793 } 2794 2795 ctxt->determinist = ret; 2796 return(ret); 2797 } 2798 2799 /************************************************************************ 2800 * * 2801 * Routines to check input against transition atoms * 2802 * * 2803 ************************************************************************/ 2804 2805 static int 2806 xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg, 2807 int start, int end, const xmlChar *blockName) { 2808 int ret = 0; 2809 2810 switch (type) { 2811 case XML_REGEXP_STRING: 2812 case XML_REGEXP_SUBREG: 2813 case XML_REGEXP_RANGES: 2814 case XML_REGEXP_EPSILON: 2815 return(-1); 2816 case XML_REGEXP_ANYCHAR: 2817 ret = ((codepoint != '\n') && (codepoint != '\r')); 2818 break; 2819 case XML_REGEXP_CHARVAL: 2820 ret = ((codepoint >= start) && (codepoint <= end)); 2821 break; 2822 case XML_REGEXP_NOTSPACE: 2823 neg = !neg; 2824 /* Falls through. */ 2825 case XML_REGEXP_ANYSPACE: 2826 ret = ((codepoint == '\n') || (codepoint == '\r') || 2827 (codepoint == '\t') || (codepoint == ' ')); 2828 break; 2829 case XML_REGEXP_NOTINITNAME: 2830 neg = !neg; 2831 /* Falls through. */ 2832 case XML_REGEXP_INITNAME: 2833 ret = (IS_LETTER(codepoint) || 2834 (codepoint == '_') || (codepoint == ':')); 2835 break; 2836 case XML_REGEXP_NOTNAMECHAR: 2837 neg = !neg; 2838 /* Falls through. */ 2839 case XML_REGEXP_NAMECHAR: 2840 ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) || 2841 (codepoint == '.') || (codepoint == '-') || 2842 (codepoint == '_') || (codepoint == ':') || 2843 IS_COMBINING(codepoint) || IS_EXTENDER(codepoint)); 2844 break; 2845 case XML_REGEXP_NOTDECIMAL: 2846 neg = !neg; 2847 /* Falls through. */ 2848 case XML_REGEXP_DECIMAL: 2849 ret = xmlUCSIsCatNd(codepoint); 2850 break; 2851 case XML_REGEXP_REALCHAR: 2852 neg = !neg; 2853 /* Falls through. */ 2854 case XML_REGEXP_NOTREALCHAR: 2855 ret = xmlUCSIsCatP(codepoint); 2856 if (ret == 0) 2857 ret = xmlUCSIsCatZ(codepoint); 2858 if (ret == 0) 2859 ret = xmlUCSIsCatC(codepoint); 2860 break; 2861 case XML_REGEXP_LETTER: 2862 ret = xmlUCSIsCatL(codepoint); 2863 break; 2864 case XML_REGEXP_LETTER_UPPERCASE: 2865 ret = xmlUCSIsCatLu(codepoint); 2866 break; 2867 case XML_REGEXP_LETTER_LOWERCASE: 2868 ret = xmlUCSIsCatLl(codepoint); 2869 break; 2870 case XML_REGEXP_LETTER_TITLECASE: 2871 ret = xmlUCSIsCatLt(codepoint); 2872 break; 2873 case XML_REGEXP_LETTER_MODIFIER: 2874 ret = xmlUCSIsCatLm(codepoint); 2875 break; 2876 case XML_REGEXP_LETTER_OTHERS: 2877 ret = xmlUCSIsCatLo(codepoint); 2878 break; 2879 case XML_REGEXP_MARK: 2880 ret = xmlUCSIsCatM(codepoint); 2881 break; 2882 case XML_REGEXP_MARK_NONSPACING: 2883 ret = xmlUCSIsCatMn(codepoint); 2884 break; 2885 case XML_REGEXP_MARK_SPACECOMBINING: 2886 ret = xmlUCSIsCatMc(codepoint); 2887 break; 2888 case XML_REGEXP_MARK_ENCLOSING: 2889 ret = xmlUCSIsCatMe(codepoint); 2890 break; 2891 case XML_REGEXP_NUMBER: 2892 ret = xmlUCSIsCatN(codepoint); 2893 break; 2894 case XML_REGEXP_NUMBER_DECIMAL: 2895 ret = xmlUCSIsCatNd(codepoint); 2896 break; 2897 case XML_REGEXP_NUMBER_LETTER: 2898 ret = xmlUCSIsCatNl(codepoint); 2899 break; 2900 case XML_REGEXP_NUMBER_OTHERS: 2901 ret = xmlUCSIsCatNo(codepoint); 2902 break; 2903 case XML_REGEXP_PUNCT: 2904 ret = xmlUCSIsCatP(codepoint); 2905 break; 2906 case XML_REGEXP_PUNCT_CONNECTOR: 2907 ret = xmlUCSIsCatPc(codepoint); 2908 break; 2909 case XML_REGEXP_PUNCT_DASH: 2910 ret = xmlUCSIsCatPd(codepoint); 2911 break; 2912 case XML_REGEXP_PUNCT_OPEN: 2913 ret = xmlUCSIsCatPs(codepoint); 2914 break; 2915 case XML_REGEXP_PUNCT_CLOSE: 2916 ret = xmlUCSIsCatPe(codepoint); 2917 break; 2918 case XML_REGEXP_PUNCT_INITQUOTE: 2919 ret = xmlUCSIsCatPi(codepoint); 2920 break; 2921 case XML_REGEXP_PUNCT_FINQUOTE: 2922 ret = xmlUCSIsCatPf(codepoint); 2923 break; 2924 case XML_REGEXP_PUNCT_OTHERS: 2925 ret = xmlUCSIsCatPo(codepoint); 2926 break; 2927 case XML_REGEXP_SEPAR: 2928 ret = xmlUCSIsCatZ(codepoint); 2929 break; 2930 case XML_REGEXP_SEPAR_SPACE: 2931 ret = xmlUCSIsCatZs(codepoint); 2932 break; 2933 case XML_REGEXP_SEPAR_LINE: 2934 ret = xmlUCSIsCatZl(codepoint); 2935 break; 2936 case XML_REGEXP_SEPAR_PARA: 2937 ret = xmlUCSIsCatZp(codepoint); 2938 break; 2939 case XML_REGEXP_SYMBOL: 2940 ret = xmlUCSIsCatS(codepoint); 2941 break; 2942 case XML_REGEXP_SYMBOL_MATH: 2943 ret = xmlUCSIsCatSm(codepoint); 2944 break; 2945 case XML_REGEXP_SYMBOL_CURRENCY: 2946 ret = xmlUCSIsCatSc(codepoint); 2947 break; 2948 case XML_REGEXP_SYMBOL_MODIFIER: 2949 ret = xmlUCSIsCatSk(codepoint); 2950 break; 2951 case XML_REGEXP_SYMBOL_OTHERS: 2952 ret = xmlUCSIsCatSo(codepoint); 2953 break; 2954 case XML_REGEXP_OTHER: 2955 ret = xmlUCSIsCatC(codepoint); 2956 break; 2957 case XML_REGEXP_OTHER_CONTROL: 2958 ret = xmlUCSIsCatCc(codepoint); 2959 break; 2960 case XML_REGEXP_OTHER_FORMAT: 2961 ret = xmlUCSIsCatCf(codepoint); 2962 break; 2963 case XML_REGEXP_OTHER_PRIVATE: 2964 ret = xmlUCSIsCatCo(codepoint); 2965 break; 2966 case XML_REGEXP_OTHER_NA: 2967 /* ret = xmlUCSIsCatCn(codepoint); */ 2968 /* Seems it doesn't exist anymore in recent Unicode releases */ 2969 ret = 0; 2970 break; 2971 case XML_REGEXP_BLOCK_NAME: 2972 ret = xmlUCSIsBlock(codepoint, (const char *) blockName); 2973 break; 2974 } 2975 if (neg) 2976 return(!ret); 2977 return(ret); 2978 } 2979 2980 static int 2981 xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) { 2982 int i, ret = 0; 2983 xmlRegRangePtr range; 2984 2985 if ((atom == NULL) || (!IS_CHAR(codepoint))) 2986 return(-1); 2987 2988 switch (atom->type) { 2989 case XML_REGEXP_SUBREG: 2990 case XML_REGEXP_EPSILON: 2991 return(-1); 2992 case XML_REGEXP_CHARVAL: 2993 return(codepoint == atom->codepoint); 2994 case XML_REGEXP_RANGES: { 2995 int accept = 0; 2996 2997 for (i = 0;i < atom->nbRanges;i++) { 2998 range = atom->ranges[i]; 2999 if (range->neg == 2) { 3000 ret = xmlRegCheckCharacterRange(range->type, codepoint, 3001 0, range->start, range->end, 3002 range->blockName); 3003 if (ret != 0) 3004 return(0); /* excluded char */ 3005 } else if (range->neg) { 3006 ret = xmlRegCheckCharacterRange(range->type, codepoint, 3007 0, range->start, range->end, 3008 range->blockName); 3009 if (ret == 0) 3010 accept = 1; 3011 else 3012 return(0); 3013 } else { 3014 ret = xmlRegCheckCharacterRange(range->type, codepoint, 3015 0, range->start, range->end, 3016 range->blockName); 3017 if (ret != 0) 3018 accept = 1; /* might still be excluded */ 3019 } 3020 } 3021 return(accept); 3022 } 3023 case XML_REGEXP_STRING: 3024 printf("TODO: XML_REGEXP_STRING\n"); 3025 return(-1); 3026 case XML_REGEXP_ANYCHAR: 3027 case XML_REGEXP_ANYSPACE: 3028 case XML_REGEXP_NOTSPACE: 3029 case XML_REGEXP_INITNAME: 3030 case XML_REGEXP_NOTINITNAME: 3031 case XML_REGEXP_NAMECHAR: 3032 case XML_REGEXP_NOTNAMECHAR: 3033 case XML_REGEXP_DECIMAL: 3034 case XML_REGEXP_NOTDECIMAL: 3035 case XML_REGEXP_REALCHAR: 3036 case XML_REGEXP_NOTREALCHAR: 3037 case XML_REGEXP_LETTER: 3038 case XML_REGEXP_LETTER_UPPERCASE: 3039 case XML_REGEXP_LETTER_LOWERCASE: 3040 case XML_REGEXP_LETTER_TITLECASE: 3041 case XML_REGEXP_LETTER_MODIFIER: 3042 case XML_REGEXP_LETTER_OTHERS: 3043 case XML_REGEXP_MARK: 3044 case XML_REGEXP_MARK_NONSPACING: 3045 case XML_REGEXP_MARK_SPACECOMBINING: 3046 case XML_REGEXP_MARK_ENCLOSING: 3047 case XML_REGEXP_NUMBER: 3048 case XML_REGEXP_NUMBER_DECIMAL: 3049 case XML_REGEXP_NUMBER_LETTER: 3050 case XML_REGEXP_NUMBER_OTHERS: 3051 case XML_REGEXP_PUNCT: 3052 case XML_REGEXP_PUNCT_CONNECTOR: 3053 case XML_REGEXP_PUNCT_DASH: 3054 case XML_REGEXP_PUNCT_OPEN: 3055 case XML_REGEXP_PUNCT_CLOSE: 3056 case XML_REGEXP_PUNCT_INITQUOTE: 3057 case XML_REGEXP_PUNCT_FINQUOTE: 3058 case XML_REGEXP_PUNCT_OTHERS: 3059 case XML_REGEXP_SEPAR: 3060 case XML_REGEXP_SEPAR_SPACE: 3061 case XML_REGEXP_SEPAR_LINE: 3062 case XML_REGEXP_SEPAR_PARA: 3063 case XML_REGEXP_SYMBOL: 3064 case XML_REGEXP_SYMBOL_MATH: 3065 case XML_REGEXP_SYMBOL_CURRENCY: 3066 case XML_REGEXP_SYMBOL_MODIFIER: 3067 case XML_REGEXP_SYMBOL_OTHERS: 3068 case XML_REGEXP_OTHER: 3069 case XML_REGEXP_OTHER_CONTROL: 3070 case XML_REGEXP_OTHER_FORMAT: 3071 case XML_REGEXP_OTHER_PRIVATE: 3072 case XML_REGEXP_OTHER_NA: 3073 case XML_REGEXP_BLOCK_NAME: 3074 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0, 3075 (const xmlChar *)atom->valuep); 3076 if (atom->neg) 3077 ret = !ret; 3078 break; 3079 } 3080 return(ret); 3081 } 3082 3083 /************************************************************************ 3084 * * 3085 * Saving and restoring state of an execution context * 3086 * * 3087 ************************************************************************/ 3088 3089 #ifdef DEBUG_REGEXP_EXEC 3090 static void 3091 xmlFARegDebugExec(xmlRegExecCtxtPtr exec) { 3092 printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index); 3093 if (exec->inputStack != NULL) { 3094 int i; 3095 printf(": "); 3096 for (i = 0;(i < 3) && (i < exec->inputStackNr);i++) 3097 printf("%s ", (const char *) 3098 exec->inputStack[exec->inputStackNr - (i + 1)].value); 3099 } else { 3100 printf(": %s", &(exec->inputString[exec->index])); 3101 } 3102 printf("\n"); 3103 } 3104 #endif 3105 3106 static void 3107 xmlFARegExecSave(xmlRegExecCtxtPtr exec) { 3108 #ifdef DEBUG_REGEXP_EXEC 3109 printf("saving "); 3110 exec->transno++; 3111 xmlFARegDebugExec(exec); 3112 exec->transno--; 3113 #endif 3114 #ifdef MAX_PUSH 3115 if (exec->nbPush > MAX_PUSH) { 3116 return; 3117 } 3118 exec->nbPush++; 3119 #endif 3120 3121 if (exec->maxRollbacks == 0) { 3122 exec->maxRollbacks = 4; 3123 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks * 3124 sizeof(xmlRegExecRollback)); 3125 if (exec->rollbacks == NULL) { 3126 xmlRegexpErrMemory(NULL, "saving regexp"); 3127 exec->maxRollbacks = 0; 3128 return; 3129 } 3130 memset(exec->rollbacks, 0, 3131 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 3132 } else if (exec->nbRollbacks >= exec->maxRollbacks) { 3133 xmlRegExecRollback *tmp; 3134 int len = exec->maxRollbacks; 3135 3136 exec->maxRollbacks *= 2; 3137 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks, 3138 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 3139 if (tmp == NULL) { 3140 xmlRegexpErrMemory(NULL, "saving regexp"); 3141 exec->maxRollbacks /= 2; 3142 return; 3143 } 3144 exec->rollbacks = tmp; 3145 tmp = &exec->rollbacks[len]; 3146 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback)); 3147 } 3148 exec->rollbacks[exec->nbRollbacks].state = exec->state; 3149 exec->rollbacks[exec->nbRollbacks].index = exec->index; 3150 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1; 3151 if (exec->comp->nbCounters > 0) { 3152 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 3153 exec->rollbacks[exec->nbRollbacks].counts = (int *) 3154 xmlMalloc(exec->comp->nbCounters * sizeof(int)); 3155 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 3156 xmlRegexpErrMemory(NULL, "saving regexp"); 3157 exec->status = -5; 3158 return; 3159 } 3160 } 3161 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts, 3162 exec->comp->nbCounters * sizeof(int)); 3163 } 3164 exec->nbRollbacks++; 3165 } 3166 3167 static void 3168 xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) { 3169 if (exec->nbRollbacks <= 0) { 3170 exec->status = -1; 3171 #ifdef DEBUG_REGEXP_EXEC 3172 printf("rollback failed on empty stack\n"); 3173 #endif 3174 return; 3175 } 3176 exec->nbRollbacks--; 3177 exec->state = exec->rollbacks[exec->nbRollbacks].state; 3178 exec->index = exec->rollbacks[exec->nbRollbacks].index; 3179 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch; 3180 if (exec->comp->nbCounters > 0) { 3181 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 3182 fprintf(stderr, "exec save: allocation failed"); 3183 exec->status = -6; 3184 return; 3185 } 3186 if (exec->counts) { 3187 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts, 3188 exec->comp->nbCounters * sizeof(int)); 3189 } 3190 } 3191 3192 #ifdef DEBUG_REGEXP_EXEC 3193 printf("restored "); 3194 xmlFARegDebugExec(exec); 3195 #endif 3196 } 3197 3198 /************************************************************************ 3199 * * 3200 * Verifier, running an input against a compiled regexp * 3201 * * 3202 ************************************************************************/ 3203 3204 static int 3205 xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) { 3206 xmlRegExecCtxt execval; 3207 xmlRegExecCtxtPtr exec = &execval; 3208 int ret, codepoint = 0, len, deter; 3209 3210 exec->inputString = content; 3211 exec->index = 0; 3212 exec->nbPush = 0; 3213 exec->determinist = 1; 3214 exec->maxRollbacks = 0; 3215 exec->nbRollbacks = 0; 3216 exec->rollbacks = NULL; 3217 exec->status = 0; 3218 exec->comp = comp; 3219 exec->state = comp->states[0]; 3220 exec->transno = 0; 3221 exec->transcount = 0; 3222 exec->inputStack = NULL; 3223 exec->inputStackMax = 0; 3224 if (comp->nbCounters > 0) { 3225 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)); 3226 if (exec->counts == NULL) { 3227 xmlRegexpErrMemory(NULL, "running regexp"); 3228 return(-1); 3229 } 3230 memset(exec->counts, 0, comp->nbCounters * sizeof(int)); 3231 } else 3232 exec->counts = NULL; 3233 while ((exec->status == 0) && (exec->state != NULL) && 3234 ((exec->inputString[exec->index] != 0) || 3235 ((exec->state != NULL) && 3236 (exec->state->type != XML_REGEXP_FINAL_STATE)))) { 3237 xmlRegTransPtr trans; 3238 xmlRegAtomPtr atom; 3239 3240 /* 3241 * If end of input on non-terminal state, rollback, however we may 3242 * still have epsilon like transition for counted transitions 3243 * on counters, in that case don't break too early. Additionally, 3244 * if we are working on a range like "AB{0,2}", where B is not present, 3245 * we don't want to break. 3246 */ 3247 len = 1; 3248 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) { 3249 /* 3250 * if there is a transition, we must check if 3251 * atom allows minOccurs of 0 3252 */ 3253 if (exec->transno < exec->state->nbTrans) { 3254 trans = &exec->state->trans[exec->transno]; 3255 if (trans->to >=0) { 3256 atom = trans->atom; 3257 if (!((atom->min == 0) && (atom->max > 0))) 3258 goto rollback; 3259 } 3260 } else 3261 goto rollback; 3262 } 3263 3264 exec->transcount = 0; 3265 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 3266 trans = &exec->state->trans[exec->transno]; 3267 if (trans->to < 0) 3268 continue; 3269 atom = trans->atom; 3270 ret = 0; 3271 deter = 1; 3272 if (trans->count >= 0) { 3273 int count; 3274 xmlRegCounterPtr counter; 3275 3276 if (exec->counts == NULL) { 3277 exec->status = -1; 3278 goto error; 3279 } 3280 /* 3281 * A counted transition. 3282 */ 3283 3284 count = exec->counts[trans->count]; 3285 counter = &exec->comp->counters[trans->count]; 3286 #ifdef DEBUG_REGEXP_EXEC 3287 printf("testing count %d: val %d, min %d, max %d\n", 3288 trans->count, count, counter->min, counter->max); 3289 #endif 3290 ret = ((count >= counter->min) && (count <= counter->max)); 3291 if ((ret) && (counter->min != counter->max)) 3292 deter = 0; 3293 } else if (atom == NULL) { 3294 fprintf(stderr, "epsilon transition left at runtime\n"); 3295 exec->status = -2; 3296 break; 3297 } else if (exec->inputString[exec->index] != 0) { 3298 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 3299 ret = xmlRegCheckCharacter(atom, codepoint); 3300 if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) { 3301 xmlRegStatePtr to = comp->states[trans->to]; 3302 3303 /* 3304 * this is a multiple input sequence 3305 * If there is a counter associated increment it now. 3306 * before potentially saving and rollback 3307 * do not increment if the counter is already over the 3308 * maximum limit in which case get to next transition 3309 */ 3310 if (trans->counter >= 0) { 3311 xmlRegCounterPtr counter; 3312 3313 if ((exec->counts == NULL) || 3314 (exec->comp == NULL) || 3315 (exec->comp->counters == NULL)) { 3316 exec->status = -1; 3317 goto error; 3318 } 3319 counter = &exec->comp->counters[trans->counter]; 3320 if (exec->counts[trans->counter] >= counter->max) 3321 continue; /* for loop on transitions */ 3322 3323 #ifdef DEBUG_REGEXP_EXEC 3324 printf("Increasing count %d\n", trans->counter); 3325 #endif 3326 exec->counts[trans->counter]++; 3327 } 3328 if (exec->state->nbTrans > exec->transno + 1) { 3329 xmlFARegExecSave(exec); 3330 } 3331 exec->transcount = 1; 3332 do { 3333 /* 3334 * Try to progress as much as possible on the input 3335 */ 3336 if (exec->transcount == atom->max) { 3337 break; 3338 } 3339 exec->index += len; 3340 /* 3341 * End of input: stop here 3342 */ 3343 if (exec->inputString[exec->index] == 0) { 3344 exec->index -= len; 3345 break; 3346 } 3347 if (exec->transcount >= atom->min) { 3348 int transno = exec->transno; 3349 xmlRegStatePtr state = exec->state; 3350 3351 /* 3352 * The transition is acceptable save it 3353 */ 3354 exec->transno = -1; /* trick */ 3355 exec->state = to; 3356 xmlFARegExecSave(exec); 3357 exec->transno = transno; 3358 exec->state = state; 3359 } 3360 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 3361 len); 3362 ret = xmlRegCheckCharacter(atom, codepoint); 3363 exec->transcount++; 3364 } while (ret == 1); 3365 if (exec->transcount < atom->min) 3366 ret = 0; 3367 3368 /* 3369 * If the last check failed but one transition was found 3370 * possible, rollback 3371 */ 3372 if (ret < 0) 3373 ret = 0; 3374 if (ret == 0) { 3375 goto rollback; 3376 } 3377 if (trans->counter >= 0) { 3378 if (exec->counts == NULL) { 3379 exec->status = -1; 3380 goto error; 3381 } 3382 #ifdef DEBUG_REGEXP_EXEC 3383 printf("Decreasing count %d\n", trans->counter); 3384 #endif 3385 exec->counts[trans->counter]--; 3386 } 3387 } else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) { 3388 /* 3389 * we don't match on the codepoint, but minOccurs of 0 3390 * says that's ok. Setting len to 0 inhibits stepping 3391 * over the codepoint. 3392 */ 3393 exec->transcount = 1; 3394 len = 0; 3395 ret = 1; 3396 } 3397 } else if ((atom->min == 0) && (atom->max > 0)) { 3398 /* another spot to match when minOccurs is 0 */ 3399 exec->transcount = 1; 3400 len = 0; 3401 ret = 1; 3402 } 3403 if (ret == 1) { 3404 if ((trans->nd == 1) || 3405 ((trans->count >= 0) && (deter == 0) && 3406 (exec->state->nbTrans > exec->transno + 1))) { 3407 #ifdef DEBUG_REGEXP_EXEC 3408 if (trans->nd == 1) 3409 printf("Saving on nd transition atom %d for %c at %d\n", 3410 trans->atom->no, codepoint, exec->index); 3411 else 3412 printf("Saving on counted transition count %d for %c at %d\n", 3413 trans->count, codepoint, exec->index); 3414 #endif 3415 xmlFARegExecSave(exec); 3416 } 3417 if (trans->counter >= 0) { 3418 xmlRegCounterPtr counter; 3419 3420 /* make sure we don't go over the counter maximum value */ 3421 if ((exec->counts == NULL) || 3422 (exec->comp == NULL) || 3423 (exec->comp->counters == NULL)) { 3424 exec->status = -1; 3425 goto error; 3426 } 3427 counter = &exec->comp->counters[trans->counter]; 3428 if (exec->counts[trans->counter] >= counter->max) 3429 continue; /* for loop on transitions */ 3430 #ifdef DEBUG_REGEXP_EXEC 3431 printf("Increasing count %d\n", trans->counter); 3432 #endif 3433 exec->counts[trans->counter]++; 3434 } 3435 if ((trans->count >= 0) && 3436 (trans->count < REGEXP_ALL_COUNTER)) { 3437 if (exec->counts == NULL) { 3438 exec->status = -1; 3439 goto error; 3440 } 3441 #ifdef DEBUG_REGEXP_EXEC 3442 printf("resetting count %d on transition\n", 3443 trans->count); 3444 #endif 3445 exec->counts[trans->count] = 0; 3446 } 3447 #ifdef DEBUG_REGEXP_EXEC 3448 printf("entering state %d\n", trans->to); 3449 #endif 3450 exec->state = comp->states[trans->to]; 3451 exec->transno = 0; 3452 if (trans->atom != NULL) { 3453 exec->index += len; 3454 } 3455 goto progress; 3456 } else if (ret < 0) { 3457 exec->status = -4; 3458 break; 3459 } 3460 } 3461 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 3462 rollback: 3463 /* 3464 * Failed to find a way out 3465 */ 3466 exec->determinist = 0; 3467 #ifdef DEBUG_REGEXP_EXEC 3468 printf("rollback from state %d on %d:%c\n", exec->state->no, 3469 codepoint,codepoint); 3470 #endif 3471 xmlFARegExecRollBack(exec); 3472 } 3473 progress: 3474 continue; 3475 } 3476 error: 3477 if (exec->rollbacks != NULL) { 3478 if (exec->counts != NULL) { 3479 int i; 3480 3481 for (i = 0;i < exec->maxRollbacks;i++) 3482 if (exec->rollbacks[i].counts != NULL) 3483 xmlFree(exec->rollbacks[i].counts); 3484 } 3485 xmlFree(exec->rollbacks); 3486 } 3487 if (exec->state == NULL) 3488 return(-1); 3489 if (exec->counts != NULL) 3490 xmlFree(exec->counts); 3491 if (exec->status == 0) 3492 return(1); 3493 if (exec->status == -1) { 3494 if (exec->nbPush > MAX_PUSH) 3495 return(-1); 3496 return(0); 3497 } 3498 return(exec->status); 3499 } 3500 3501 /************************************************************************ 3502 * * 3503 * Progressive interface to the verifier one atom at a time * 3504 * * 3505 ************************************************************************/ 3506 #ifdef DEBUG_ERR 3507 static void testerr(xmlRegExecCtxtPtr exec); 3508 #endif 3509 3510 /** 3511 * xmlRegNewExecCtxt: 3512 * @comp: a precompiled regular expression 3513 * @callback: a callback function used for handling progresses in the 3514 * automata matching phase 3515 * @data: the context data associated to the callback in this context 3516 * 3517 * Build a context used for progressive evaluation of a regexp. 3518 * 3519 * Returns the new context 3520 */ 3521 xmlRegExecCtxtPtr 3522 xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) { 3523 xmlRegExecCtxtPtr exec; 3524 3525 if (comp == NULL) 3526 return(NULL); 3527 if ((comp->compact == NULL) && (comp->states == NULL)) 3528 return(NULL); 3529 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt)); 3530 if (exec == NULL) { 3531 xmlRegexpErrMemory(NULL, "creating execution context"); 3532 return(NULL); 3533 } 3534 memset(exec, 0, sizeof(xmlRegExecCtxt)); 3535 exec->inputString = NULL; 3536 exec->index = 0; 3537 exec->determinist = 1; 3538 exec->maxRollbacks = 0; 3539 exec->nbRollbacks = 0; 3540 exec->rollbacks = NULL; 3541 exec->status = 0; 3542 exec->comp = comp; 3543 if (comp->compact == NULL) 3544 exec->state = comp->states[0]; 3545 exec->transno = 0; 3546 exec->transcount = 0; 3547 exec->callback = callback; 3548 exec->data = data; 3549 if (comp->nbCounters > 0) { 3550 /* 3551 * For error handling, exec->counts is allocated twice the size 3552 * the second half is used to store the data in case of rollback 3553 */ 3554 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int) 3555 * 2); 3556 if (exec->counts == NULL) { 3557 xmlRegexpErrMemory(NULL, "creating execution context"); 3558 xmlFree(exec); 3559 return(NULL); 3560 } 3561 memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2); 3562 exec->errCounts = &exec->counts[comp->nbCounters]; 3563 } else { 3564 exec->counts = NULL; 3565 exec->errCounts = NULL; 3566 } 3567 exec->inputStackMax = 0; 3568 exec->inputStackNr = 0; 3569 exec->inputStack = NULL; 3570 exec->errStateNo = -1; 3571 exec->errString = NULL; 3572 exec->nbPush = 0; 3573 return(exec); 3574 } 3575 3576 /** 3577 * xmlRegFreeExecCtxt: 3578 * @exec: a regular expression evaluation context 3579 * 3580 * Free the structures associated to a regular expression evaluation context. 3581 */ 3582 void 3583 xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) { 3584 if (exec == NULL) 3585 return; 3586 3587 if (exec->rollbacks != NULL) { 3588 if (exec->counts != NULL) { 3589 int i; 3590 3591 for (i = 0;i < exec->maxRollbacks;i++) 3592 if (exec->rollbacks[i].counts != NULL) 3593 xmlFree(exec->rollbacks[i].counts); 3594 } 3595 xmlFree(exec->rollbacks); 3596 } 3597 if (exec->counts != NULL) 3598 xmlFree(exec->counts); 3599 if (exec->inputStack != NULL) { 3600 int i; 3601 3602 for (i = 0;i < exec->inputStackNr;i++) { 3603 if (exec->inputStack[i].value != NULL) 3604 xmlFree(exec->inputStack[i].value); 3605 } 3606 xmlFree(exec->inputStack); 3607 } 3608 if (exec->errString != NULL) 3609 xmlFree(exec->errString); 3610 xmlFree(exec); 3611 } 3612 3613 static void 3614 xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value, 3615 void *data) { 3616 #ifdef DEBUG_PUSH 3617 printf("saving value: %d:%s\n", exec->inputStackNr, value); 3618 #endif 3619 if (exec->inputStackMax == 0) { 3620 exec->inputStackMax = 4; 3621 exec->inputStack = (xmlRegInputTokenPtr) 3622 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken)); 3623 if (exec->inputStack == NULL) { 3624 xmlRegexpErrMemory(NULL, "pushing input string"); 3625 exec->inputStackMax = 0; 3626 return; 3627 } 3628 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) { 3629 xmlRegInputTokenPtr tmp; 3630 3631 exec->inputStackMax *= 2; 3632 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack, 3633 exec->inputStackMax * sizeof(xmlRegInputToken)); 3634 if (tmp == NULL) { 3635 xmlRegexpErrMemory(NULL, "pushing input string"); 3636 exec->inputStackMax /= 2; 3637 return; 3638 } 3639 exec->inputStack = tmp; 3640 } 3641 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value); 3642 exec->inputStack[exec->inputStackNr].data = data; 3643 exec->inputStackNr++; 3644 exec->inputStack[exec->inputStackNr].value = NULL; 3645 exec->inputStack[exec->inputStackNr].data = NULL; 3646 } 3647 3648 /** 3649 * xmlRegStrEqualWildcard: 3650 * @expStr: the string to be evaluated 3651 * @valStr: the validation string 3652 * 3653 * Checks if both strings are equal or have the same content. "*" 3654 * can be used as a wildcard in @valStr; "|" is used as a separator of 3655 * substrings in both @expStr and @valStr. 3656 * 3657 * Returns 1 if the comparison is satisfied and the number of substrings 3658 * is equal, 0 otherwise. 3659 */ 3660 3661 static int 3662 xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) { 3663 if (expStr == valStr) return(1); 3664 if (expStr == NULL) return(0); 3665 if (valStr == NULL) return(0); 3666 do { 3667 /* 3668 * Eval if we have a wildcard for the current item. 3669 */ 3670 if (*expStr != *valStr) { 3671 /* if one of them starts with a wildcard make valStr be it */ 3672 if (*valStr == '*') { 3673 const xmlChar *tmp; 3674 3675 tmp = valStr; 3676 valStr = expStr; 3677 expStr = tmp; 3678 } 3679 if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) { 3680 do { 3681 if (*valStr == XML_REG_STRING_SEPARATOR) 3682 break; 3683 valStr++; 3684 } while (*valStr != 0); 3685 continue; 3686 } else 3687 return(0); 3688 } 3689 expStr++; 3690 valStr++; 3691 } while (*valStr != 0); 3692 if (*expStr != 0) 3693 return (0); 3694 else 3695 return (1); 3696 } 3697 3698 /** 3699 * xmlRegCompactPushString: 3700 * @exec: a regexp execution context 3701 * @comp: the precompiled exec with a compact table 3702 * @value: a string token input 3703 * @data: data associated to the token to reuse in callbacks 3704 * 3705 * Push one input token in the execution context 3706 * 3707 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 3708 * a negative value in case of error. 3709 */ 3710 static int 3711 xmlRegCompactPushString(xmlRegExecCtxtPtr exec, 3712 xmlRegexpPtr comp, 3713 const xmlChar *value, 3714 void *data) { 3715 int state = exec->index; 3716 int i, target; 3717 3718 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL)) 3719 return(-1); 3720 3721 if (value == NULL) { 3722 /* 3723 * are we at a final state ? 3724 */ 3725 if (comp->compact[state * (comp->nbstrings + 1)] == 3726 XML_REGEXP_FINAL_STATE) 3727 return(1); 3728 return(0); 3729 } 3730 3731 #ifdef DEBUG_PUSH 3732 printf("value pushed: %s\n", value); 3733 #endif 3734 3735 /* 3736 * Examine all outside transitions from current state 3737 */ 3738 for (i = 0;i < comp->nbstrings;i++) { 3739 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 3740 if ((target > 0) && (target <= comp->nbstates)) { 3741 target--; /* to avoid 0 */ 3742 if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) { 3743 exec->index = target; 3744 if ((exec->callback != NULL) && (comp->transdata != NULL)) { 3745 exec->callback(exec->data, value, 3746 comp->transdata[state * comp->nbstrings + i], data); 3747 } 3748 #ifdef DEBUG_PUSH 3749 printf("entering state %d\n", target); 3750 #endif 3751 if (comp->compact[target * (comp->nbstrings + 1)] == 3752 XML_REGEXP_SINK_STATE) 3753 goto error; 3754 3755 if (comp->compact[target * (comp->nbstrings + 1)] == 3756 XML_REGEXP_FINAL_STATE) 3757 return(1); 3758 return(0); 3759 } 3760 } 3761 } 3762 /* 3763 * Failed to find an exit transition out from current state for the 3764 * current token 3765 */ 3766 #ifdef DEBUG_PUSH 3767 printf("failed to find a transition for %s on state %d\n", value, state); 3768 #endif 3769 error: 3770 if (exec->errString != NULL) 3771 xmlFree(exec->errString); 3772 exec->errString = xmlStrdup(value); 3773 exec->errStateNo = state; 3774 exec->status = -1; 3775 #ifdef DEBUG_ERR 3776 testerr(exec); 3777 #endif 3778 return(-1); 3779 } 3780 3781 /** 3782 * xmlRegExecPushStringInternal: 3783 * @exec: a regexp execution context or NULL to indicate the end 3784 * @value: a string token input 3785 * @data: data associated to the token to reuse in callbacks 3786 * @compound: value was assembled from 2 strings 3787 * 3788 * Push one input token in the execution context 3789 * 3790 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 3791 * a negative value in case of error. 3792 */ 3793 static int 3794 xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value, 3795 void *data, int compound) { 3796 xmlRegTransPtr trans; 3797 xmlRegAtomPtr atom; 3798 int ret; 3799 int final = 0; 3800 int progress = 1; 3801 3802 if (exec == NULL) 3803 return(-1); 3804 if (exec->comp == NULL) 3805 return(-1); 3806 if (exec->status != 0) 3807 return(exec->status); 3808 3809 if (exec->comp->compact != NULL) 3810 return(xmlRegCompactPushString(exec, exec->comp, value, data)); 3811 3812 if (value == NULL) { 3813 if (exec->state->type == XML_REGEXP_FINAL_STATE) 3814 return(1); 3815 final = 1; 3816 } 3817 3818 #ifdef DEBUG_PUSH 3819 printf("value pushed: %s\n", value); 3820 #endif 3821 /* 3822 * If we have an active rollback stack push the new value there 3823 * and get back to where we were left 3824 */ 3825 if ((value != NULL) && (exec->inputStackNr > 0)) { 3826 xmlFARegExecSaveInputString(exec, value, data); 3827 value = exec->inputStack[exec->index].value; 3828 data = exec->inputStack[exec->index].data; 3829 #ifdef DEBUG_PUSH 3830 printf("value loaded: %s\n", value); 3831 #endif 3832 } 3833 3834 while ((exec->status == 0) && 3835 ((value != NULL) || 3836 ((final == 1) && 3837 (exec->state->type != XML_REGEXP_FINAL_STATE)))) { 3838 3839 /* 3840 * End of input on non-terminal state, rollback, however we may 3841 * still have epsilon like transition for counted transitions 3842 * on counters, in that case don't break too early. 3843 */ 3844 if ((value == NULL) && (exec->counts == NULL)) 3845 goto rollback; 3846 3847 exec->transcount = 0; 3848 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 3849 trans = &exec->state->trans[exec->transno]; 3850 if (trans->to < 0) 3851 continue; 3852 atom = trans->atom; 3853 ret = 0; 3854 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 3855 int i; 3856 int count; 3857 xmlRegTransPtr t; 3858 xmlRegCounterPtr counter; 3859 3860 ret = 0; 3861 3862 #ifdef DEBUG_PUSH 3863 printf("testing all lax %d\n", trans->count); 3864 #endif 3865 /* 3866 * Check all counted transitions from the current state 3867 */ 3868 if ((value == NULL) && (final)) { 3869 ret = 1; 3870 } else if (value != NULL) { 3871 for (i = 0;i < exec->state->nbTrans;i++) { 3872 t = &exec->state->trans[i]; 3873 if ((t->counter < 0) || (t == trans)) 3874 continue; 3875 counter = &exec->comp->counters[t->counter]; 3876 count = exec->counts[t->counter]; 3877 if ((count < counter->max) && 3878 (t->atom != NULL) && 3879 (xmlStrEqual(value, t->atom->valuep))) { 3880 ret = 0; 3881 break; 3882 } 3883 if ((count >= counter->min) && 3884 (count < counter->max) && 3885 (t->atom != NULL) && 3886 (xmlStrEqual(value, t->atom->valuep))) { 3887 ret = 1; 3888 break; 3889 } 3890 } 3891 } 3892 } else if (trans->count == REGEXP_ALL_COUNTER) { 3893 int i; 3894 int count; 3895 xmlRegTransPtr t; 3896 xmlRegCounterPtr counter; 3897 3898 ret = 1; 3899 3900 #ifdef DEBUG_PUSH 3901 printf("testing all %d\n", trans->count); 3902 #endif 3903 /* 3904 * Check all counted transitions from the current state 3905 */ 3906 for (i = 0;i < exec->state->nbTrans;i++) { 3907 t = &exec->state->trans[i]; 3908 if ((t->counter < 0) || (t == trans)) 3909 continue; 3910 counter = &exec->comp->counters[t->counter]; 3911 count = exec->counts[t->counter]; 3912 if ((count < counter->min) || (count > counter->max)) { 3913 ret = 0; 3914 break; 3915 } 3916 } 3917 } else if (trans->count >= 0) { 3918 int count; 3919 xmlRegCounterPtr counter; 3920 3921 /* 3922 * A counted transition. 3923 */ 3924 3925 count = exec->counts[trans->count]; 3926 counter = &exec->comp->counters[trans->count]; 3927 #ifdef DEBUG_PUSH 3928 printf("testing count %d: val %d, min %d, max %d\n", 3929 trans->count, count, counter->min, counter->max); 3930 #endif 3931 ret = ((count >= counter->min) && (count <= counter->max)); 3932 } else if (atom == NULL) { 3933 fprintf(stderr, "epsilon transition left at runtime\n"); 3934 exec->status = -2; 3935 break; 3936 } else if (value != NULL) { 3937 ret = xmlRegStrEqualWildcard(atom->valuep, value); 3938 if (atom->neg) { 3939 ret = !ret; 3940 if (!compound) 3941 ret = 0; 3942 } 3943 if ((ret == 1) && (trans->counter >= 0)) { 3944 xmlRegCounterPtr counter; 3945 int count; 3946 3947 count = exec->counts[trans->counter]; 3948 counter = &exec->comp->counters[trans->counter]; 3949 if (count >= counter->max) 3950 ret = 0; 3951 } 3952 3953 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 3954 xmlRegStatePtr to = exec->comp->states[trans->to]; 3955 3956 /* 3957 * this is a multiple input sequence 3958 */ 3959 if (exec->state->nbTrans > exec->transno + 1) { 3960 if (exec->inputStackNr <= 0) { 3961 xmlFARegExecSaveInputString(exec, value, data); 3962 } 3963 xmlFARegExecSave(exec); 3964 } 3965 exec->transcount = 1; 3966 do { 3967 /* 3968 * Try to progress as much as possible on the input 3969 */ 3970 if (exec->transcount == atom->max) { 3971 break; 3972 } 3973 exec->index++; 3974 value = exec->inputStack[exec->index].value; 3975 data = exec->inputStack[exec->index].data; 3976 #ifdef DEBUG_PUSH 3977 printf("value loaded: %s\n", value); 3978 #endif 3979 3980 /* 3981 * End of input: stop here 3982 */ 3983 if (value == NULL) { 3984 exec->index --; 3985 break; 3986 } 3987 if (exec->transcount >= atom->min) { 3988 int transno = exec->transno; 3989 xmlRegStatePtr state = exec->state; 3990 3991 /* 3992 * The transition is acceptable save it 3993 */ 3994 exec->transno = -1; /* trick */ 3995 exec->state = to; 3996 if (exec->inputStackNr <= 0) { 3997 xmlFARegExecSaveInputString(exec, value, data); 3998 } 3999 xmlFARegExecSave(exec); 4000 exec->transno = transno; 4001 exec->state = state; 4002 } 4003 ret = xmlStrEqual(value, atom->valuep); 4004 exec->transcount++; 4005 } while (ret == 1); 4006 if (exec->transcount < atom->min) 4007 ret = 0; 4008 4009 /* 4010 * If the last check failed but one transition was found 4011 * possible, rollback 4012 */ 4013 if (ret < 0) 4014 ret = 0; 4015 if (ret == 0) { 4016 goto rollback; 4017 } 4018 } 4019 } 4020 if (ret == 1) { 4021 if ((exec->callback != NULL) && (atom != NULL) && 4022 (data != NULL)) { 4023 exec->callback(exec->data, atom->valuep, 4024 atom->data, data); 4025 } 4026 if (exec->state->nbTrans > exec->transno + 1) { 4027 if (exec->inputStackNr <= 0) { 4028 xmlFARegExecSaveInputString(exec, value, data); 4029 } 4030 xmlFARegExecSave(exec); 4031 } 4032 if (trans->counter >= 0) { 4033 #ifdef DEBUG_PUSH 4034 printf("Increasing count %d\n", trans->counter); 4035 #endif 4036 exec->counts[trans->counter]++; 4037 } 4038 if ((trans->count >= 0) && 4039 (trans->count < REGEXP_ALL_COUNTER)) { 4040 #ifdef DEBUG_REGEXP_EXEC 4041 printf("resetting count %d on transition\n", 4042 trans->count); 4043 #endif 4044 exec->counts[trans->count] = 0; 4045 } 4046 #ifdef DEBUG_PUSH 4047 printf("entering state %d\n", trans->to); 4048 #endif 4049 if ((exec->comp->states[trans->to] != NULL) && 4050 (exec->comp->states[trans->to]->type == 4051 XML_REGEXP_SINK_STATE)) { 4052 /* 4053 * entering a sink state, save the current state as error 4054 * state. 4055 */ 4056 if (exec->errString != NULL) 4057 xmlFree(exec->errString); 4058 exec->errString = xmlStrdup(value); 4059 exec->errState = exec->state; 4060 memcpy(exec->errCounts, exec->counts, 4061 exec->comp->nbCounters * sizeof(int)); 4062 } 4063 exec->state = exec->comp->states[trans->to]; 4064 exec->transno = 0; 4065 if (trans->atom != NULL) { 4066 if (exec->inputStack != NULL) { 4067 exec->index++; 4068 if (exec->index < exec->inputStackNr) { 4069 value = exec->inputStack[exec->index].value; 4070 data = exec->inputStack[exec->index].data; 4071 #ifdef DEBUG_PUSH 4072 printf("value loaded: %s\n", value); 4073 #endif 4074 } else { 4075 value = NULL; 4076 data = NULL; 4077 #ifdef DEBUG_PUSH 4078 printf("end of input\n"); 4079 #endif 4080 } 4081 } else { 4082 value = NULL; 4083 data = NULL; 4084 #ifdef DEBUG_PUSH 4085 printf("end of input\n"); 4086 #endif 4087 } 4088 } 4089 goto progress; 4090 } else if (ret < 0) { 4091 exec->status = -4; 4092 break; 4093 } 4094 } 4095 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 4096 rollback: 4097 /* 4098 * if we didn't yet rollback on the current input 4099 * store the current state as the error state. 4100 */ 4101 if ((progress) && (exec->state != NULL) && 4102 (exec->state->type != XML_REGEXP_SINK_STATE)) { 4103 progress = 0; 4104 if (exec->errString != NULL) 4105 xmlFree(exec->errString); 4106 exec->errString = xmlStrdup(value); 4107 exec->errState = exec->state; 4108 if (exec->comp->nbCounters) 4109 memcpy(exec->errCounts, exec->counts, 4110 exec->comp->nbCounters * sizeof(int)); 4111 } 4112 4113 /* 4114 * Failed to find a way out 4115 */ 4116 exec->determinist = 0; 4117 xmlFARegExecRollBack(exec); 4118 if ((exec->inputStack != NULL ) && (exec->status == 0)) { 4119 value = exec->inputStack[exec->index].value; 4120 data = exec->inputStack[exec->index].data; 4121 #ifdef DEBUG_PUSH 4122 printf("value loaded: %s\n", value); 4123 #endif 4124 } 4125 } 4126 continue; 4127 progress: 4128 progress = 1; 4129 continue; 4130 } 4131 if (exec->status == 0) { 4132 return(exec->state->type == XML_REGEXP_FINAL_STATE); 4133 } 4134 #ifdef DEBUG_ERR 4135 if (exec->status < 0) { 4136 testerr(exec); 4137 } 4138 #endif 4139 return(exec->status); 4140 } 4141 4142 /** 4143 * xmlRegExecPushString: 4144 * @exec: a regexp execution context or NULL to indicate the end 4145 * @value: a string token input 4146 * @data: data associated to the token to reuse in callbacks 4147 * 4148 * Push one input token in the execution context 4149 * 4150 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 4151 * a negative value in case of error. 4152 */ 4153 int 4154 xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value, 4155 void *data) { 4156 return(xmlRegExecPushStringInternal(exec, value, data, 0)); 4157 } 4158 4159 /** 4160 * xmlRegExecPushString2: 4161 * @exec: a regexp execution context or NULL to indicate the end 4162 * @value: the first string token input 4163 * @value2: the second string token input 4164 * @data: data associated to the token to reuse in callbacks 4165 * 4166 * Push one input token in the execution context 4167 * 4168 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 4169 * a negative value in case of error. 4170 */ 4171 int 4172 xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value, 4173 const xmlChar *value2, void *data) { 4174 xmlChar buf[150]; 4175 int lenn, lenp, ret; 4176 xmlChar *str; 4177 4178 if (exec == NULL) 4179 return(-1); 4180 if (exec->comp == NULL) 4181 return(-1); 4182 if (exec->status != 0) 4183 return(exec->status); 4184 4185 if (value2 == NULL) 4186 return(xmlRegExecPushString(exec, value, data)); 4187 4188 lenn = strlen((char *) value2); 4189 lenp = strlen((char *) value); 4190 4191 if (150 < lenn + lenp + 2) { 4192 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 4193 if (str == NULL) { 4194 exec->status = -1; 4195 return(-1); 4196 } 4197 } else { 4198 str = buf; 4199 } 4200 memcpy(&str[0], value, lenp); 4201 str[lenp] = XML_REG_STRING_SEPARATOR; 4202 memcpy(&str[lenp + 1], value2, lenn); 4203 str[lenn + lenp + 1] = 0; 4204 4205 if (exec->comp->compact != NULL) 4206 ret = xmlRegCompactPushString(exec, exec->comp, str, data); 4207 else 4208 ret = xmlRegExecPushStringInternal(exec, str, data, 1); 4209 4210 if (str != buf) 4211 xmlFree(str); 4212 return(ret); 4213 } 4214 4215 /** 4216 * xmlRegExecGetValues: 4217 * @exec: a regexp execution context 4218 * @err: error extraction or normal one 4219 * @nbval: pointer to the number of accepted values IN/OUT 4220 * @nbneg: return number of negative transitions 4221 * @values: pointer to the array of acceptable values 4222 * @terminal: return value if this was a terminal state 4223 * 4224 * Extract informations from the regexp execution, internal routine to 4225 * implement xmlRegExecNextValues() and xmlRegExecErrInfo() 4226 * 4227 * Returns: 0 in case of success or -1 in case of error. 4228 */ 4229 static int 4230 xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err, 4231 int *nbval, int *nbneg, 4232 xmlChar **values, int *terminal) { 4233 int maxval; 4234 int nb = 0; 4235 4236 if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) || 4237 (values == NULL) || (*nbval <= 0)) 4238 return(-1); 4239 4240 maxval = *nbval; 4241 *nbval = 0; 4242 *nbneg = 0; 4243 if ((exec->comp != NULL) && (exec->comp->compact != NULL)) { 4244 xmlRegexpPtr comp; 4245 int target, i, state; 4246 4247 comp = exec->comp; 4248 4249 if (err) { 4250 if (exec->errStateNo == -1) return(-1); 4251 state = exec->errStateNo; 4252 } else { 4253 state = exec->index; 4254 } 4255 if (terminal != NULL) { 4256 if (comp->compact[state * (comp->nbstrings + 1)] == 4257 XML_REGEXP_FINAL_STATE) 4258 *terminal = 1; 4259 else 4260 *terminal = 0; 4261 } 4262 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) { 4263 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 4264 if ((target > 0) && (target <= comp->nbstates) && 4265 (comp->compact[(target - 1) * (comp->nbstrings + 1)] != 4266 XML_REGEXP_SINK_STATE)) { 4267 values[nb++] = comp->stringMap[i]; 4268 (*nbval)++; 4269 } 4270 } 4271 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) { 4272 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 4273 if ((target > 0) && (target <= comp->nbstates) && 4274 (comp->compact[(target - 1) * (comp->nbstrings + 1)] == 4275 XML_REGEXP_SINK_STATE)) { 4276 values[nb++] = comp->stringMap[i]; 4277 (*nbneg)++; 4278 } 4279 } 4280 } else { 4281 int transno; 4282 xmlRegTransPtr trans; 4283 xmlRegAtomPtr atom; 4284 xmlRegStatePtr state; 4285 4286 if (terminal != NULL) { 4287 if (exec->state->type == XML_REGEXP_FINAL_STATE) 4288 *terminal = 1; 4289 else 4290 *terminal = 0; 4291 } 4292 4293 if (err) { 4294 if (exec->errState == NULL) return(-1); 4295 state = exec->errState; 4296 } else { 4297 if (exec->state == NULL) return(-1); 4298 state = exec->state; 4299 } 4300 for (transno = 0; 4301 (transno < state->nbTrans) && (nb < maxval); 4302 transno++) { 4303 trans = &state->trans[transno]; 4304 if (trans->to < 0) 4305 continue; 4306 atom = trans->atom; 4307 if ((atom == NULL) || (atom->valuep == NULL)) 4308 continue; 4309 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 4310 /* this should not be reached but ... */ 4311 TODO; 4312 } else if (trans->count == REGEXP_ALL_COUNTER) { 4313 /* this should not be reached but ... */ 4314 TODO; 4315 } else if (trans->counter >= 0) { 4316 xmlRegCounterPtr counter = NULL; 4317 int count; 4318 4319 if (err) 4320 count = exec->errCounts[trans->counter]; 4321 else 4322 count = exec->counts[trans->counter]; 4323 if (exec->comp != NULL) 4324 counter = &exec->comp->counters[trans->counter]; 4325 if ((counter == NULL) || (count < counter->max)) { 4326 if (atom->neg) 4327 values[nb++] = (xmlChar *) atom->valuep2; 4328 else 4329 values[nb++] = (xmlChar *) atom->valuep; 4330 (*nbval)++; 4331 } 4332 } else { 4333 if ((exec->comp != NULL) && (exec->comp->states[trans->to] != NULL) && 4334 (exec->comp->states[trans->to]->type != 4335 XML_REGEXP_SINK_STATE)) { 4336 if (atom->neg) 4337 values[nb++] = (xmlChar *) atom->valuep2; 4338 else 4339 values[nb++] = (xmlChar *) atom->valuep; 4340 (*nbval)++; 4341 } 4342 } 4343 } 4344 for (transno = 0; 4345 (transno < state->nbTrans) && (nb < maxval); 4346 transno++) { 4347 trans = &state->trans[transno]; 4348 if (trans->to < 0) 4349 continue; 4350 atom = trans->atom; 4351 if ((atom == NULL) || (atom->valuep == NULL)) 4352 continue; 4353 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 4354 continue; 4355 } else if (trans->count == REGEXP_ALL_COUNTER) { 4356 continue; 4357 } else if (trans->counter >= 0) { 4358 continue; 4359 } else { 4360 if ((exec->comp->states[trans->to] != NULL) && 4361 (exec->comp->states[trans->to]->type == 4362 XML_REGEXP_SINK_STATE)) { 4363 if (atom->neg) 4364 values[nb++] = (xmlChar *) atom->valuep2; 4365 else 4366 values[nb++] = (xmlChar *) atom->valuep; 4367 (*nbneg)++; 4368 } 4369 } 4370 } 4371 } 4372 return(0); 4373 } 4374 4375 /** 4376 * xmlRegExecNextValues: 4377 * @exec: a regexp execution context 4378 * @nbval: pointer to the number of accepted values IN/OUT 4379 * @nbneg: return number of negative transitions 4380 * @values: pointer to the array of acceptable values 4381 * @terminal: return value if this was a terminal state 4382 * 4383 * Extract informations from the regexp execution, 4384 * the parameter @values must point to an array of @nbval string pointers 4385 * on return nbval will contain the number of possible strings in that 4386 * state and the @values array will be updated with them. The string values 4387 * returned will be freed with the @exec context and don't need to be 4388 * deallocated. 4389 * 4390 * Returns: 0 in case of success or -1 in case of error. 4391 */ 4392 int 4393 xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg, 4394 xmlChar **values, int *terminal) { 4395 return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal)); 4396 } 4397 4398 /** 4399 * xmlRegExecErrInfo: 4400 * @exec: a regexp execution context generating an error 4401 * @string: return value for the error string 4402 * @nbval: pointer to the number of accepted values IN/OUT 4403 * @nbneg: return number of negative transitions 4404 * @values: pointer to the array of acceptable values 4405 * @terminal: return value if this was a terminal state 4406 * 4407 * Extract error informations from the regexp execution, the parameter 4408 * @string will be updated with the value pushed and not accepted, 4409 * the parameter @values must point to an array of @nbval string pointers 4410 * on return nbval will contain the number of possible strings in that 4411 * state and the @values array will be updated with them. The string values 4412 * returned will be freed with the @exec context and don't need to be 4413 * deallocated. 4414 * 4415 * Returns: 0 in case of success or -1 in case of error. 4416 */ 4417 int 4418 xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string, 4419 int *nbval, int *nbneg, xmlChar **values, int *terminal) { 4420 if (exec == NULL) 4421 return(-1); 4422 if (string != NULL) { 4423 if (exec->status != 0) 4424 *string = exec->errString; 4425 else 4426 *string = NULL; 4427 } 4428 return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal)); 4429 } 4430 4431 #ifdef DEBUG_ERR 4432 static void testerr(xmlRegExecCtxtPtr exec) { 4433 const xmlChar *string; 4434 xmlChar *values[5]; 4435 int nb = 5; 4436 int nbneg; 4437 int terminal; 4438 xmlRegExecErrInfo(exec, &string, &nb, &nbneg, &values[0], &terminal); 4439 } 4440 #endif 4441 4442 #if 0 4443 static int 4444 xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) { 4445 xmlRegTransPtr trans; 4446 xmlRegAtomPtr atom; 4447 int ret; 4448 int codepoint, len; 4449 4450 if (exec == NULL) 4451 return(-1); 4452 if (exec->status != 0) 4453 return(exec->status); 4454 4455 while ((exec->status == 0) && 4456 ((exec->inputString[exec->index] != 0) || 4457 (exec->state->type != XML_REGEXP_FINAL_STATE))) { 4458 4459 /* 4460 * End of input on non-terminal state, rollback, however we may 4461 * still have epsilon like transition for counted transitions 4462 * on counters, in that case don't break too early. 4463 */ 4464 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) 4465 goto rollback; 4466 4467 exec->transcount = 0; 4468 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 4469 trans = &exec->state->trans[exec->transno]; 4470 if (trans->to < 0) 4471 continue; 4472 atom = trans->atom; 4473 ret = 0; 4474 if (trans->count >= 0) { 4475 int count; 4476 xmlRegCounterPtr counter; 4477 4478 /* 4479 * A counted transition. 4480 */ 4481 4482 count = exec->counts[trans->count]; 4483 counter = &exec->comp->counters[trans->count]; 4484 #ifdef DEBUG_REGEXP_EXEC 4485 printf("testing count %d: val %d, min %d, max %d\n", 4486 trans->count, count, counter->min, counter->max); 4487 #endif 4488 ret = ((count >= counter->min) && (count <= counter->max)); 4489 } else if (atom == NULL) { 4490 fprintf(stderr, "epsilon transition left at runtime\n"); 4491 exec->status = -2; 4492 break; 4493 } else if (exec->inputString[exec->index] != 0) { 4494 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 4495 ret = xmlRegCheckCharacter(atom, codepoint); 4496 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 4497 xmlRegStatePtr to = exec->comp->states[trans->to]; 4498 4499 /* 4500 * this is a multiple input sequence 4501 */ 4502 if (exec->state->nbTrans > exec->transno + 1) { 4503 xmlFARegExecSave(exec); 4504 } 4505 exec->transcount = 1; 4506 do { 4507 /* 4508 * Try to progress as much as possible on the input 4509 */ 4510 if (exec->transcount == atom->max) { 4511 break; 4512 } 4513 exec->index += len; 4514 /* 4515 * End of input: stop here 4516 */ 4517 if (exec->inputString[exec->index] == 0) { 4518 exec->index -= len; 4519 break; 4520 } 4521 if (exec->transcount >= atom->min) { 4522 int transno = exec->transno; 4523 xmlRegStatePtr state = exec->state; 4524 4525 /* 4526 * The transition is acceptable save it 4527 */ 4528 exec->transno = -1; /* trick */ 4529 exec->state = to; 4530 xmlFARegExecSave(exec); 4531 exec->transno = transno; 4532 exec->state = state; 4533 } 4534 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 4535 len); 4536 ret = xmlRegCheckCharacter(atom, codepoint); 4537 exec->transcount++; 4538 } while (ret == 1); 4539 if (exec->transcount < atom->min) 4540 ret = 0; 4541 4542 /* 4543 * If the last check failed but one transition was found 4544 * possible, rollback 4545 */ 4546 if (ret < 0) 4547 ret = 0; 4548 if (ret == 0) { 4549 goto rollback; 4550 } 4551 } 4552 } 4553 if (ret == 1) { 4554 if (exec->state->nbTrans > exec->transno + 1) { 4555 xmlFARegExecSave(exec); 4556 } 4557 /* 4558 * restart count for expressions like this ((abc){2})* 4559 */ 4560 if (trans->count >= 0) { 4561 #ifdef DEBUG_REGEXP_EXEC 4562 printf("Reset count %d\n", trans->count); 4563 #endif 4564 exec->counts[trans->count] = 0; 4565 } 4566 if (trans->counter >= 0) { 4567 #ifdef DEBUG_REGEXP_EXEC 4568 printf("Increasing count %d\n", trans->counter); 4569 #endif 4570 exec->counts[trans->counter]++; 4571 } 4572 #ifdef DEBUG_REGEXP_EXEC 4573 printf("entering state %d\n", trans->to); 4574 #endif 4575 exec->state = exec->comp->states[trans->to]; 4576 exec->transno = 0; 4577 if (trans->atom != NULL) { 4578 exec->index += len; 4579 } 4580 goto progress; 4581 } else if (ret < 0) { 4582 exec->status = -4; 4583 break; 4584 } 4585 } 4586 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 4587 rollback: 4588 /* 4589 * Failed to find a way out 4590 */ 4591 exec->determinist = 0; 4592 xmlFARegExecRollBack(exec); 4593 } 4594 progress: 4595 continue; 4596 } 4597 } 4598 #endif 4599 /************************************************************************ 4600 * * 4601 * Parser for the Schemas Datatype Regular Expressions * 4602 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs * 4603 * * 4604 ************************************************************************/ 4605 4606 /** 4607 * xmlFAIsChar: 4608 * @ctxt: a regexp parser context 4609 * 4610 * [10] Char ::= [^.\?*+()|#x5B#x5D] 4611 */ 4612 static int 4613 xmlFAIsChar(xmlRegParserCtxtPtr ctxt) { 4614 int cur; 4615 int len; 4616 4617 cur = CUR_SCHAR(ctxt->cur, len); 4618 if ((cur == '.') || (cur == '\\') || (cur == '?') || 4619 (cur == '*') || (cur == '+') || (cur == '(') || 4620 (cur == ')') || (cur == '|') || (cur == 0x5B) || 4621 (cur == 0x5D) || (cur == 0)) 4622 return(-1); 4623 return(cur); 4624 } 4625 4626 /** 4627 * xmlFAParseCharProp: 4628 * @ctxt: a regexp parser context 4629 * 4630 * [27] charProp ::= IsCategory | IsBlock 4631 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation | 4632 * Separators | Symbols | Others 4633 * [29] Letters ::= 'L' [ultmo]? 4634 * [30] Marks ::= 'M' [nce]? 4635 * [31] Numbers ::= 'N' [dlo]? 4636 * [32] Punctuation ::= 'P' [cdseifo]? 4637 * [33] Separators ::= 'Z' [slp]? 4638 * [34] Symbols ::= 'S' [mcko]? 4639 * [35] Others ::= 'C' [cfon]? 4640 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+ 4641 */ 4642 static void 4643 xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) { 4644 int cur; 4645 xmlRegAtomType type = (xmlRegAtomType) 0; 4646 xmlChar *blockName = NULL; 4647 4648 cur = CUR; 4649 if (cur == 'L') { 4650 NEXT; 4651 cur = CUR; 4652 if (cur == 'u') { 4653 NEXT; 4654 type = XML_REGEXP_LETTER_UPPERCASE; 4655 } else if (cur == 'l') { 4656 NEXT; 4657 type = XML_REGEXP_LETTER_LOWERCASE; 4658 } else if (cur == 't') { 4659 NEXT; 4660 type = XML_REGEXP_LETTER_TITLECASE; 4661 } else if (cur == 'm') { 4662 NEXT; 4663 type = XML_REGEXP_LETTER_MODIFIER; 4664 } else if (cur == 'o') { 4665 NEXT; 4666 type = XML_REGEXP_LETTER_OTHERS; 4667 } else { 4668 type = XML_REGEXP_LETTER; 4669 } 4670 } else if (cur == 'M') { 4671 NEXT; 4672 cur = CUR; 4673 if (cur == 'n') { 4674 NEXT; 4675 /* nonspacing */ 4676 type = XML_REGEXP_MARK_NONSPACING; 4677 } else if (cur == 'c') { 4678 NEXT; 4679 /* spacing combining */ 4680 type = XML_REGEXP_MARK_SPACECOMBINING; 4681 } else if (cur == 'e') { 4682 NEXT; 4683 /* enclosing */ 4684 type = XML_REGEXP_MARK_ENCLOSING; 4685 } else { 4686 /* all marks */ 4687 type = XML_REGEXP_MARK; 4688 } 4689 } else if (cur == 'N') { 4690 NEXT; 4691 cur = CUR; 4692 if (cur == 'd') { 4693 NEXT; 4694 /* digital */ 4695 type = XML_REGEXP_NUMBER_DECIMAL; 4696 } else if (cur == 'l') { 4697 NEXT; 4698 /* letter */ 4699 type = XML_REGEXP_NUMBER_LETTER; 4700 } else if (cur == 'o') { 4701 NEXT; 4702 /* other */ 4703 type = XML_REGEXP_NUMBER_OTHERS; 4704 } else { 4705 /* all numbers */ 4706 type = XML_REGEXP_NUMBER; 4707 } 4708 } else if (cur == 'P') { 4709 NEXT; 4710 cur = CUR; 4711 if (cur == 'c') { 4712 NEXT; 4713 /* connector */ 4714 type = XML_REGEXP_PUNCT_CONNECTOR; 4715 } else if (cur == 'd') { 4716 NEXT; 4717 /* dash */ 4718 type = XML_REGEXP_PUNCT_DASH; 4719 } else if (cur == 's') { 4720 NEXT; 4721 /* open */ 4722 type = XML_REGEXP_PUNCT_OPEN; 4723 } else if (cur == 'e') { 4724 NEXT; 4725 /* close */ 4726 type = XML_REGEXP_PUNCT_CLOSE; 4727 } else if (cur == 'i') { 4728 NEXT; 4729 /* initial quote */ 4730 type = XML_REGEXP_PUNCT_INITQUOTE; 4731 } else if (cur == 'f') { 4732 NEXT; 4733 /* final quote */ 4734 type = XML_REGEXP_PUNCT_FINQUOTE; 4735 } else if (cur == 'o') { 4736 NEXT; 4737 /* other */ 4738 type = XML_REGEXP_PUNCT_OTHERS; 4739 } else { 4740 /* all punctuation */ 4741 type = XML_REGEXP_PUNCT; 4742 } 4743 } else if (cur == 'Z') { 4744 NEXT; 4745 cur = CUR; 4746 if (cur == 's') { 4747 NEXT; 4748 /* space */ 4749 type = XML_REGEXP_SEPAR_SPACE; 4750 } else if (cur == 'l') { 4751 NEXT; 4752 /* line */ 4753 type = XML_REGEXP_SEPAR_LINE; 4754 } else if (cur == 'p') { 4755 NEXT; 4756 /* paragraph */ 4757 type = XML_REGEXP_SEPAR_PARA; 4758 } else { 4759 /* all separators */ 4760 type = XML_REGEXP_SEPAR; 4761 } 4762 } else if (cur == 'S') { 4763 NEXT; 4764 cur = CUR; 4765 if (cur == 'm') { 4766 NEXT; 4767 type = XML_REGEXP_SYMBOL_MATH; 4768 /* math */ 4769 } else if (cur == 'c') { 4770 NEXT; 4771 type = XML_REGEXP_SYMBOL_CURRENCY; 4772 /* currency */ 4773 } else if (cur == 'k') { 4774 NEXT; 4775 type = XML_REGEXP_SYMBOL_MODIFIER; 4776 /* modifiers */ 4777 } else if (cur == 'o') { 4778 NEXT; 4779 type = XML_REGEXP_SYMBOL_OTHERS; 4780 /* other */ 4781 } else { 4782 /* all symbols */ 4783 type = XML_REGEXP_SYMBOL; 4784 } 4785 } else if (cur == 'C') { 4786 NEXT; 4787 cur = CUR; 4788 if (cur == 'c') { 4789 NEXT; 4790 /* control */ 4791 type = XML_REGEXP_OTHER_CONTROL; 4792 } else if (cur == 'f') { 4793 NEXT; 4794 /* format */ 4795 type = XML_REGEXP_OTHER_FORMAT; 4796 } else if (cur == 'o') { 4797 NEXT; 4798 /* private use */ 4799 type = XML_REGEXP_OTHER_PRIVATE; 4800 } else if (cur == 'n') { 4801 NEXT; 4802 /* not assigned */ 4803 type = XML_REGEXP_OTHER_NA; 4804 } else { 4805 /* all others */ 4806 type = XML_REGEXP_OTHER; 4807 } 4808 } else if (cur == 'I') { 4809 const xmlChar *start; 4810 NEXT; 4811 cur = CUR; 4812 if (cur != 's') { 4813 ERROR("IsXXXX expected"); 4814 return; 4815 } 4816 NEXT; 4817 start = ctxt->cur; 4818 cur = CUR; 4819 if (((cur >= 'a') && (cur <= 'z')) || 4820 ((cur >= 'A') && (cur <= 'Z')) || 4821 ((cur >= '0') && (cur <= '9')) || 4822 (cur == 0x2D)) { 4823 NEXT; 4824 cur = CUR; 4825 while (((cur >= 'a') && (cur <= 'z')) || 4826 ((cur >= 'A') && (cur <= 'Z')) || 4827 ((cur >= '0') && (cur <= '9')) || 4828 (cur == 0x2D)) { 4829 NEXT; 4830 cur = CUR; 4831 } 4832 } 4833 type = XML_REGEXP_BLOCK_NAME; 4834 blockName = xmlStrndup(start, ctxt->cur - start); 4835 } else { 4836 ERROR("Unknown char property"); 4837 return; 4838 } 4839 if (ctxt->atom == NULL) { 4840 ctxt->atom = xmlRegNewAtom(ctxt, type); 4841 if (ctxt->atom != NULL) 4842 ctxt->atom->valuep = blockName; 4843 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4844 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4845 type, 0, 0, blockName); 4846 } 4847 } 4848 4849 /** 4850 * xmlFAParseCharClassEsc: 4851 * @ctxt: a regexp parser context 4852 * 4853 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc ) 4854 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E] 4855 * [25] catEsc ::= '\p{' charProp '}' 4856 * [26] complEsc ::= '\P{' charProp '}' 4857 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW]) 4858 */ 4859 static void 4860 xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) { 4861 int cur; 4862 4863 if (CUR == '.') { 4864 if (ctxt->atom == NULL) { 4865 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR); 4866 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4867 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4868 XML_REGEXP_ANYCHAR, 0, 0, NULL); 4869 } 4870 NEXT; 4871 return; 4872 } 4873 if (CUR != '\\') { 4874 ERROR("Escaped sequence: expecting \\"); 4875 return; 4876 } 4877 NEXT; 4878 cur = CUR; 4879 if (cur == 'p') { 4880 NEXT; 4881 if (CUR != '{') { 4882 ERROR("Expecting '{'"); 4883 return; 4884 } 4885 NEXT; 4886 xmlFAParseCharProp(ctxt); 4887 if (CUR != '}') { 4888 ERROR("Expecting '}'"); 4889 return; 4890 } 4891 NEXT; 4892 } else if (cur == 'P') { 4893 NEXT; 4894 if (CUR != '{') { 4895 ERROR("Expecting '{'"); 4896 return; 4897 } 4898 NEXT; 4899 xmlFAParseCharProp(ctxt); 4900 if (ctxt->atom != NULL) 4901 ctxt->atom->neg = 1; 4902 if (CUR != '}') { 4903 ERROR("Expecting '}'"); 4904 return; 4905 } 4906 NEXT; 4907 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') || 4908 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') || 4909 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') || 4910 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) || 4911 (cur == 0x5E)) { 4912 if (ctxt->atom == NULL) { 4913 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 4914 if (ctxt->atom != NULL) { 4915 switch (cur) { 4916 case 'n': 4917 ctxt->atom->codepoint = '\n'; 4918 break; 4919 case 'r': 4920 ctxt->atom->codepoint = '\r'; 4921 break; 4922 case 't': 4923 ctxt->atom->codepoint = '\t'; 4924 break; 4925 default: 4926 ctxt->atom->codepoint = cur; 4927 } 4928 } 4929 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4930 switch (cur) { 4931 case 'n': 4932 cur = '\n'; 4933 break; 4934 case 'r': 4935 cur = '\r'; 4936 break; 4937 case 't': 4938 cur = '\t'; 4939 break; 4940 } 4941 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4942 XML_REGEXP_CHARVAL, cur, cur, NULL); 4943 } 4944 NEXT; 4945 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') || 4946 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') || 4947 (cur == 'w') || (cur == 'W')) { 4948 xmlRegAtomType type = XML_REGEXP_ANYSPACE; 4949 4950 switch (cur) { 4951 case 's': 4952 type = XML_REGEXP_ANYSPACE; 4953 break; 4954 case 'S': 4955 type = XML_REGEXP_NOTSPACE; 4956 break; 4957 case 'i': 4958 type = XML_REGEXP_INITNAME; 4959 break; 4960 case 'I': 4961 type = XML_REGEXP_NOTINITNAME; 4962 break; 4963 case 'c': 4964 type = XML_REGEXP_NAMECHAR; 4965 break; 4966 case 'C': 4967 type = XML_REGEXP_NOTNAMECHAR; 4968 break; 4969 case 'd': 4970 type = XML_REGEXP_DECIMAL; 4971 break; 4972 case 'D': 4973 type = XML_REGEXP_NOTDECIMAL; 4974 break; 4975 case 'w': 4976 type = XML_REGEXP_REALCHAR; 4977 break; 4978 case 'W': 4979 type = XML_REGEXP_NOTREALCHAR; 4980 break; 4981 } 4982 NEXT; 4983 if (ctxt->atom == NULL) { 4984 ctxt->atom = xmlRegNewAtom(ctxt, type); 4985 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4986 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4987 type, 0, 0, NULL); 4988 } 4989 } else { 4990 ERROR("Wrong escape sequence, misuse of character '\\'"); 4991 } 4992 } 4993 4994 /** 4995 * xmlFAParseCharRange: 4996 * @ctxt: a regexp parser context 4997 * 4998 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash 4999 * [18] seRange ::= charOrEsc '-' charOrEsc 5000 * [20] charOrEsc ::= XmlChar | SingleCharEsc 5001 * [21] XmlChar ::= [^\#x2D#x5B#x5D] 5002 * [22] XmlCharIncDash ::= [^\#x5B#x5D] 5003 */ 5004 static void 5005 xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) { 5006 int cur, len; 5007 int start = -1; 5008 int end = -1; 5009 5010 if (CUR == '\0') { 5011 ERROR("Expecting ']'"); 5012 return; 5013 } 5014 5015 cur = CUR; 5016 if (cur == '\\') { 5017 NEXT; 5018 cur = CUR; 5019 switch (cur) { 5020 case 'n': start = 0xA; break; 5021 case 'r': start = 0xD; break; 5022 case 't': start = 0x9; break; 5023 case '\\': case '|': case '.': case '-': case '^': case '?': 5024 case '*': case '+': case '{': case '}': case '(': case ')': 5025 case '[': case ']': 5026 start = cur; break; 5027 default: 5028 ERROR("Invalid escape value"); 5029 return; 5030 } 5031 end = start; 5032 len = 1; 5033 } else if ((cur != 0x5B) && (cur != 0x5D)) { 5034 end = start = CUR_SCHAR(ctxt->cur, len); 5035 } else { 5036 ERROR("Expecting a char range"); 5037 return; 5038 } 5039 /* 5040 * Since we are "inside" a range, we can assume ctxt->cur is past 5041 * the start of ctxt->string, and PREV should be safe 5042 */ 5043 if ((start == '-') && (NXT(1) != ']') && (PREV != '[') && (PREV != '^')) { 5044 NEXTL(len); 5045 return; 5046 } 5047 NEXTL(len); 5048 cur = CUR; 5049 if ((cur != '-') || (NXT(1) == ']')) { 5050 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 5051 XML_REGEXP_CHARVAL, start, end, NULL); 5052 return; 5053 } 5054 NEXT; 5055 cur = CUR; 5056 if (cur == '\\') { 5057 NEXT; 5058 cur = CUR; 5059 switch (cur) { 5060 case 'n': end = 0xA; break; 5061 case 'r': end = 0xD; break; 5062 case 't': end = 0x9; break; 5063 case '\\': case '|': case '.': case '-': case '^': case '?': 5064 case '*': case '+': case '{': case '}': case '(': case ')': 5065 case '[': case ']': 5066 end = cur; break; 5067 default: 5068 ERROR("Invalid escape value"); 5069 return; 5070 } 5071 len = 1; 5072 } else if ((cur != '\0') && (cur != 0x5B) && (cur != 0x5D)) { 5073 end = CUR_SCHAR(ctxt->cur, len); 5074 } else { 5075 ERROR("Expecting the end of a char range"); 5076 return; 5077 } 5078 5079 /* TODO check that the values are acceptable character ranges for XML */ 5080 if (end < start) { 5081 ERROR("End of range is before start of range"); 5082 } else { 5083 NEXTL(len); 5084 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 5085 XML_REGEXP_CHARVAL, start, end, NULL); 5086 } 5087 return; 5088 } 5089 5090 /** 5091 * xmlFAParsePosCharGroup: 5092 * @ctxt: a regexp parser context 5093 * 5094 * [14] posCharGroup ::= ( charRange | charClassEsc )+ 5095 */ 5096 static void 5097 xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) { 5098 do { 5099 if (CUR == '\\') { 5100 xmlFAParseCharClassEsc(ctxt); 5101 } else { 5102 xmlFAParseCharRange(ctxt); 5103 } 5104 } while ((CUR != ']') && (CUR != '^') && (CUR != '-') && 5105 (CUR != 0) && (ctxt->error == 0)); 5106 } 5107 5108 /** 5109 * xmlFAParseCharGroup: 5110 * @ctxt: a regexp parser context 5111 * 5112 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub 5113 * [15] negCharGroup ::= '^' posCharGroup 5114 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr 5115 * [12] charClassExpr ::= '[' charGroup ']' 5116 */ 5117 static void 5118 xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) { 5119 int n = ctxt->neg; 5120 while ((CUR != ']') && (ctxt->error == 0)) { 5121 if (CUR == '^') { 5122 int neg = ctxt->neg; 5123 5124 NEXT; 5125 ctxt->neg = !ctxt->neg; 5126 xmlFAParsePosCharGroup(ctxt); 5127 ctxt->neg = neg; 5128 } else if ((CUR == '-') && (NXT(1) == '[')) { 5129 int neg = ctxt->neg; 5130 ctxt->neg = 2; 5131 NEXT; /* eat the '-' */ 5132 NEXT; /* eat the '[' */ 5133 xmlFAParseCharGroup(ctxt); 5134 if (CUR == ']') { 5135 NEXT; 5136 } else { 5137 ERROR("charClassExpr: ']' expected"); 5138 break; 5139 } 5140 ctxt->neg = neg; 5141 break; 5142 } else if (CUR != ']') { 5143 xmlFAParsePosCharGroup(ctxt); 5144 } 5145 } 5146 ctxt->neg = n; 5147 } 5148 5149 /** 5150 * xmlFAParseCharClass: 5151 * @ctxt: a regexp parser context 5152 * 5153 * [11] charClass ::= charClassEsc | charClassExpr 5154 * [12] charClassExpr ::= '[' charGroup ']' 5155 */ 5156 static void 5157 xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) { 5158 if (CUR == '[') { 5159 NEXT; 5160 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES); 5161 if (ctxt->atom == NULL) 5162 return; 5163 xmlFAParseCharGroup(ctxt); 5164 if (CUR == ']') { 5165 NEXT; 5166 } else { 5167 ERROR("xmlFAParseCharClass: ']' expected"); 5168 } 5169 } else { 5170 xmlFAParseCharClassEsc(ctxt); 5171 } 5172 } 5173 5174 /** 5175 * xmlFAParseQuantExact: 5176 * @ctxt: a regexp parser context 5177 * 5178 * [8] QuantExact ::= [0-9]+ 5179 * 5180 * Returns 0 if success or -1 in case of error 5181 */ 5182 static int 5183 xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) { 5184 int ret = 0; 5185 int ok = 0; 5186 5187 while ((CUR >= '0') && (CUR <= '9')) { 5188 ret = ret * 10 + (CUR - '0'); 5189 ok = 1; 5190 NEXT; 5191 } 5192 if (ok != 1) { 5193 return(-1); 5194 } 5195 return(ret); 5196 } 5197 5198 /** 5199 * xmlFAParseQuantifier: 5200 * @ctxt: a regexp parser context 5201 * 5202 * [4] quantifier ::= [?*+] | ( '{' quantity '}' ) 5203 * [5] quantity ::= quantRange | quantMin | QuantExact 5204 * [6] quantRange ::= QuantExact ',' QuantExact 5205 * [7] quantMin ::= QuantExact ',' 5206 * [8] QuantExact ::= [0-9]+ 5207 */ 5208 static int 5209 xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) { 5210 int cur; 5211 5212 cur = CUR; 5213 if ((cur == '?') || (cur == '*') || (cur == '+')) { 5214 if (ctxt->atom != NULL) { 5215 if (cur == '?') 5216 ctxt->atom->quant = XML_REGEXP_QUANT_OPT; 5217 else if (cur == '*') 5218 ctxt->atom->quant = XML_REGEXP_QUANT_MULT; 5219 else if (cur == '+') 5220 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS; 5221 } 5222 NEXT; 5223 return(1); 5224 } 5225 if (cur == '{') { 5226 int min = 0, max = 0; 5227 5228 NEXT; 5229 cur = xmlFAParseQuantExact(ctxt); 5230 if (cur >= 0) 5231 min = cur; 5232 if (CUR == ',') { 5233 NEXT; 5234 if (CUR == '}') 5235 max = INT_MAX; 5236 else { 5237 cur = xmlFAParseQuantExact(ctxt); 5238 if (cur >= 0) 5239 max = cur; 5240 else { 5241 ERROR("Improper quantifier"); 5242 } 5243 } 5244 } 5245 if (CUR == '}') { 5246 NEXT; 5247 } else { 5248 ERROR("Unterminated quantifier"); 5249 } 5250 if (max == 0) 5251 max = min; 5252 if (ctxt->atom != NULL) { 5253 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE; 5254 ctxt->atom->min = min; 5255 ctxt->atom->max = max; 5256 } 5257 return(1); 5258 } 5259 return(0); 5260 } 5261 5262 /** 5263 * xmlFAParseAtom: 5264 * @ctxt: a regexp parser context 5265 * 5266 * [9] atom ::= Char | charClass | ( '(' regExp ')' ) 5267 */ 5268 static int 5269 xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) { 5270 int codepoint, len; 5271 5272 codepoint = xmlFAIsChar(ctxt); 5273 if (codepoint > 0) { 5274 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 5275 if (ctxt->atom == NULL) 5276 return(-1); 5277 codepoint = CUR_SCHAR(ctxt->cur, len); 5278 ctxt->atom->codepoint = codepoint; 5279 NEXTL(len); 5280 return(1); 5281 } else if (CUR == '|') { 5282 return(0); 5283 } else if (CUR == 0) { 5284 return(0); 5285 } else if (CUR == ')') { 5286 return(0); 5287 } else if (CUR == '(') { 5288 xmlRegStatePtr start, oldend, start0; 5289 5290 NEXT; 5291 /* 5292 * this extra Epsilon transition is needed if we count with 0 allowed 5293 * unfortunately this can't be known at that point 5294 */ 5295 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL); 5296 start0 = ctxt->state; 5297 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL); 5298 start = ctxt->state; 5299 oldend = ctxt->end; 5300 ctxt->end = NULL; 5301 ctxt->atom = NULL; 5302 xmlFAParseRegExp(ctxt, 0); 5303 if (CUR == ')') { 5304 NEXT; 5305 } else { 5306 ERROR("xmlFAParseAtom: expecting ')'"); 5307 } 5308 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG); 5309 if (ctxt->atom == NULL) 5310 return(-1); 5311 ctxt->atom->start = start; 5312 ctxt->atom->start0 = start0; 5313 ctxt->atom->stop = ctxt->state; 5314 ctxt->end = oldend; 5315 return(1); 5316 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) { 5317 xmlFAParseCharClass(ctxt); 5318 return(1); 5319 } 5320 return(0); 5321 } 5322 5323 /** 5324 * xmlFAParsePiece: 5325 * @ctxt: a regexp parser context 5326 * 5327 * [3] piece ::= atom quantifier? 5328 */ 5329 static int 5330 xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) { 5331 int ret; 5332 5333 ctxt->atom = NULL; 5334 ret = xmlFAParseAtom(ctxt); 5335 if (ret == 0) 5336 return(0); 5337 if (ctxt->atom == NULL) { 5338 ERROR("internal: no atom generated"); 5339 } 5340 xmlFAParseQuantifier(ctxt); 5341 return(1); 5342 } 5343 5344 /** 5345 * xmlFAParseBranch: 5346 * @ctxt: a regexp parser context 5347 * @to: optional target to the end of the branch 5348 * 5349 * @to is used to optimize by removing duplicate path in automata 5350 * in expressions like (a|b)(c|d) 5351 * 5352 * [2] branch ::= piece* 5353 */ 5354 static int 5355 xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) { 5356 xmlRegStatePtr previous; 5357 int ret; 5358 5359 previous = ctxt->state; 5360 ret = xmlFAParsePiece(ctxt); 5361 if (ret == 0) { 5362 /* Empty branch */ 5363 xmlFAGenerateEpsilonTransition(ctxt, previous, to); 5364 } else { 5365 if (xmlFAGenerateTransitions(ctxt, previous, 5366 (CUR=='|' || CUR==')' || CUR==0) ? to : NULL, ctxt->atom) < 0) 5367 return(-1); 5368 previous = ctxt->state; 5369 ctxt->atom = NULL; 5370 } 5371 while ((ret != 0) && (ctxt->error == 0)) { 5372 ret = xmlFAParsePiece(ctxt); 5373 if (ret != 0) { 5374 if (xmlFAGenerateTransitions(ctxt, previous, 5375 (CUR=='|' || CUR==')' || CUR==0) ? to : NULL, 5376 ctxt->atom) < 0) 5377 return(-1); 5378 previous = ctxt->state; 5379 ctxt->atom = NULL; 5380 } 5381 } 5382 return(0); 5383 } 5384 5385 /** 5386 * xmlFAParseRegExp: 5387 * @ctxt: a regexp parser context 5388 * @top: is this the top-level expression ? 5389 * 5390 * [1] regExp ::= branch ( '|' branch )* 5391 */ 5392 static void 5393 xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) { 5394 xmlRegStatePtr start, end; 5395 5396 /* if not top start should have been generated by an epsilon trans */ 5397 start = ctxt->state; 5398 ctxt->end = NULL; 5399 xmlFAParseBranch(ctxt, NULL); 5400 if (top) { 5401 #ifdef DEBUG_REGEXP_GRAPH 5402 printf("State %d is final\n", ctxt->state->no); 5403 #endif 5404 ctxt->state->type = XML_REGEXP_FINAL_STATE; 5405 } 5406 if (CUR != '|') { 5407 ctxt->end = ctxt->state; 5408 return; 5409 } 5410 end = ctxt->state; 5411 while ((CUR == '|') && (ctxt->error == 0)) { 5412 NEXT; 5413 ctxt->state = start; 5414 ctxt->end = NULL; 5415 xmlFAParseBranch(ctxt, end); 5416 } 5417 if (!top) { 5418 ctxt->state = end; 5419 ctxt->end = end; 5420 } 5421 } 5422 5423 /************************************************************************ 5424 * * 5425 * The basic API * 5426 * * 5427 ************************************************************************/ 5428 5429 /** 5430 * xmlRegexpPrint: 5431 * @output: the file for the output debug 5432 * @regexp: the compiled regexp 5433 * 5434 * Print the content of the compiled regular expression 5435 */ 5436 void 5437 xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) { 5438 int i; 5439 5440 if (output == NULL) 5441 return; 5442 fprintf(output, " regexp: "); 5443 if (regexp == NULL) { 5444 fprintf(output, "NULL\n"); 5445 return; 5446 } 5447 fprintf(output, "'%s' ", regexp->string); 5448 fprintf(output, "\n"); 5449 fprintf(output, "%d atoms:\n", regexp->nbAtoms); 5450 for (i = 0;i < regexp->nbAtoms; i++) { 5451 fprintf(output, " %02d ", i); 5452 xmlRegPrintAtom(output, regexp->atoms[i]); 5453 } 5454 fprintf(output, "%d states:", regexp->nbStates); 5455 fprintf(output, "\n"); 5456 for (i = 0;i < regexp->nbStates; i++) { 5457 xmlRegPrintState(output, regexp->states[i]); 5458 } 5459 fprintf(output, "%d counters:\n", regexp->nbCounters); 5460 for (i = 0;i < regexp->nbCounters; i++) { 5461 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min, 5462 regexp->counters[i].max); 5463 } 5464 } 5465 5466 /** 5467 * xmlRegexpCompile: 5468 * @regexp: a regular expression string 5469 * 5470 * Parses a regular expression conforming to XML Schemas Part 2 Datatype 5471 * Appendix F and builds an automata suitable for testing strings against 5472 * that regular expression 5473 * 5474 * Returns the compiled expression or NULL in case of error 5475 */ 5476 xmlRegexpPtr 5477 xmlRegexpCompile(const xmlChar *regexp) { 5478 xmlRegexpPtr ret; 5479 xmlRegParserCtxtPtr ctxt; 5480 5481 ctxt = xmlRegNewParserCtxt(regexp); 5482 if (ctxt == NULL) 5483 return(NULL); 5484 5485 /* initialize the parser */ 5486 ctxt->end = NULL; 5487 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 5488 xmlRegStatePush(ctxt, ctxt->start); 5489 5490 /* parse the expression building an automata */ 5491 xmlFAParseRegExp(ctxt, 1); 5492 if (CUR != 0) { 5493 ERROR("xmlFAParseRegExp: extra characters"); 5494 } 5495 if (ctxt->error != 0) { 5496 xmlRegFreeParserCtxt(ctxt); 5497 return(NULL); 5498 } 5499 ctxt->end = ctxt->state; 5500 ctxt->start->type = XML_REGEXP_START_STATE; 5501 ctxt->end->type = XML_REGEXP_FINAL_STATE; 5502 5503 /* remove the Epsilon except for counted transitions */ 5504 xmlFAEliminateEpsilonTransitions(ctxt); 5505 5506 5507 if (ctxt->error != 0) { 5508 xmlRegFreeParserCtxt(ctxt); 5509 return(NULL); 5510 } 5511 ret = xmlRegEpxFromParse(ctxt); 5512 xmlRegFreeParserCtxt(ctxt); 5513 return(ret); 5514 } 5515 5516 /** 5517 * xmlRegexpExec: 5518 * @comp: the compiled regular expression 5519 * @content: the value to check against the regular expression 5520 * 5521 * Check if the regular expression generates the value 5522 * 5523 * Returns 1 if it matches, 0 if not and a negative value in case of error 5524 */ 5525 int 5526 xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) { 5527 if ((comp == NULL) || (content == NULL)) 5528 return(-1); 5529 return(xmlFARegExec(comp, content)); 5530 } 5531 5532 /** 5533 * xmlRegexpIsDeterminist: 5534 * @comp: the compiled regular expression 5535 * 5536 * Check if the regular expression is determinist 5537 * 5538 * Returns 1 if it yes, 0 if not and a negative value in case of error 5539 */ 5540 int 5541 xmlRegexpIsDeterminist(xmlRegexpPtr comp) { 5542 xmlAutomataPtr am; 5543 int ret; 5544 5545 if (comp == NULL) 5546 return(-1); 5547 if (comp->determinist != -1) 5548 return(comp->determinist); 5549 5550 am = xmlNewAutomata(); 5551 if (am == NULL) 5552 return(-1); 5553 if (am->states != NULL) { 5554 int i; 5555 5556 for (i = 0;i < am->nbStates;i++) 5557 xmlRegFreeState(am->states[i]); 5558 xmlFree(am->states); 5559 } 5560 am->nbAtoms = comp->nbAtoms; 5561 am->atoms = comp->atoms; 5562 am->nbStates = comp->nbStates; 5563 am->states = comp->states; 5564 am->determinist = -1; 5565 am->flags = comp->flags; 5566 ret = xmlFAComputesDeterminism(am); 5567 am->atoms = NULL; 5568 am->states = NULL; 5569 xmlFreeAutomata(am); 5570 comp->determinist = ret; 5571 return(ret); 5572 } 5573 5574 /** 5575 * xmlRegFreeRegexp: 5576 * @regexp: the regexp 5577 * 5578 * Free a regexp 5579 */ 5580 void 5581 xmlRegFreeRegexp(xmlRegexpPtr regexp) { 5582 int i; 5583 if (regexp == NULL) 5584 return; 5585 5586 if (regexp->string != NULL) 5587 xmlFree(regexp->string); 5588 if (regexp->states != NULL) { 5589 for (i = 0;i < regexp->nbStates;i++) 5590 xmlRegFreeState(regexp->states[i]); 5591 xmlFree(regexp->states); 5592 } 5593 if (regexp->atoms != NULL) { 5594 for (i = 0;i < regexp->nbAtoms;i++) 5595 xmlRegFreeAtom(regexp->atoms[i]); 5596 xmlFree(regexp->atoms); 5597 } 5598 if (regexp->counters != NULL) 5599 xmlFree(regexp->counters); 5600 if (regexp->compact != NULL) 5601 xmlFree(regexp->compact); 5602 if (regexp->transdata != NULL) 5603 xmlFree(regexp->transdata); 5604 if (regexp->stringMap != NULL) { 5605 for (i = 0; i < regexp->nbstrings;i++) 5606 xmlFree(regexp->stringMap[i]); 5607 xmlFree(regexp->stringMap); 5608 } 5609 5610 xmlFree(regexp); 5611 } 5612 5613 #ifdef LIBXML_AUTOMATA_ENABLED 5614 /************************************************************************ 5615 * * 5616 * The Automata interface * 5617 * * 5618 ************************************************************************/ 5619 5620 /** 5621 * xmlNewAutomata: 5622 * 5623 * Create a new automata 5624 * 5625 * Returns the new object or NULL in case of failure 5626 */ 5627 xmlAutomataPtr 5628 xmlNewAutomata(void) { 5629 xmlAutomataPtr ctxt; 5630 5631 ctxt = xmlRegNewParserCtxt(NULL); 5632 if (ctxt == NULL) 5633 return(NULL); 5634 5635 /* initialize the parser */ 5636 ctxt->end = NULL; 5637 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 5638 if (ctxt->start == NULL) { 5639 xmlFreeAutomata(ctxt); 5640 return(NULL); 5641 } 5642 ctxt->start->type = XML_REGEXP_START_STATE; 5643 if (xmlRegStatePush(ctxt, ctxt->start) < 0) { 5644 xmlRegFreeState(ctxt->start); 5645 xmlFreeAutomata(ctxt); 5646 return(NULL); 5647 } 5648 ctxt->flags = 0; 5649 5650 return(ctxt); 5651 } 5652 5653 /** 5654 * xmlFreeAutomata: 5655 * @am: an automata 5656 * 5657 * Free an automata 5658 */ 5659 void 5660 xmlFreeAutomata(xmlAutomataPtr am) { 5661 if (am == NULL) 5662 return; 5663 xmlRegFreeParserCtxt(am); 5664 } 5665 5666 /** 5667 * xmlAutomataSetFlags: 5668 * @am: an automata 5669 * @flags: a set of internal flags 5670 * 5671 * Set some flags on the automata 5672 */ 5673 void 5674 xmlAutomataSetFlags(xmlAutomataPtr am, int flags) { 5675 if (am == NULL) 5676 return; 5677 am->flags |= flags; 5678 } 5679 5680 /** 5681 * xmlAutomataGetInitState: 5682 * @am: an automata 5683 * 5684 * Initial state lookup 5685 * 5686 * Returns the initial state of the automata 5687 */ 5688 xmlAutomataStatePtr 5689 xmlAutomataGetInitState(xmlAutomataPtr am) { 5690 if (am == NULL) 5691 return(NULL); 5692 return(am->start); 5693 } 5694 5695 /** 5696 * xmlAutomataSetFinalState: 5697 * @am: an automata 5698 * @state: a state in this automata 5699 * 5700 * Makes that state a final state 5701 * 5702 * Returns 0 or -1 in case of error 5703 */ 5704 int 5705 xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) { 5706 if ((am == NULL) || (state == NULL)) 5707 return(-1); 5708 state->type = XML_REGEXP_FINAL_STATE; 5709 return(0); 5710 } 5711 5712 /** 5713 * xmlAutomataNewTransition: 5714 * @am: an automata 5715 * @from: the starting point of the transition 5716 * @to: the target point of the transition or NULL 5717 * @token: the input string associated to that transition 5718 * @data: data passed to the callback function if the transition is activated 5719 * 5720 * If @to is NULL, this creates first a new target state in the automata 5721 * and then adds a transition from the @from state to the target state 5722 * activated by the value of @token 5723 * 5724 * Returns the target state or NULL in case of error 5725 */ 5726 xmlAutomataStatePtr 5727 xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from, 5728 xmlAutomataStatePtr to, const xmlChar *token, 5729 void *data) { 5730 xmlRegAtomPtr atom; 5731 5732 if ((am == NULL) || (from == NULL) || (token == NULL)) 5733 return(NULL); 5734 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5735 if (atom == NULL) 5736 return(NULL); 5737 atom->data = data; 5738 atom->valuep = xmlStrdup(token); 5739 5740 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5741 xmlRegFreeAtom(atom); 5742 return(NULL); 5743 } 5744 if (to == NULL) 5745 return(am->state); 5746 return(to); 5747 } 5748 5749 /** 5750 * xmlAutomataNewTransition2: 5751 * @am: an automata 5752 * @from: the starting point of the transition 5753 * @to: the target point of the transition or NULL 5754 * @token: the first input string associated to that transition 5755 * @token2: the second input string associated to that transition 5756 * @data: data passed to the callback function if the transition is activated 5757 * 5758 * If @to is NULL, this creates first a new target state in the automata 5759 * and then adds a transition from the @from state to the target state 5760 * activated by the value of @token 5761 * 5762 * Returns the target state or NULL in case of error 5763 */ 5764 xmlAutomataStatePtr 5765 xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from, 5766 xmlAutomataStatePtr to, const xmlChar *token, 5767 const xmlChar *token2, void *data) { 5768 xmlRegAtomPtr atom; 5769 5770 if ((am == NULL) || (from == NULL) || (token == NULL)) 5771 return(NULL); 5772 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5773 if (atom == NULL) 5774 return(NULL); 5775 atom->data = data; 5776 if ((token2 == NULL) || (*token2 == 0)) { 5777 atom->valuep = xmlStrdup(token); 5778 } else { 5779 int lenn, lenp; 5780 xmlChar *str; 5781 5782 lenn = strlen((char *) token2); 5783 lenp = strlen((char *) token); 5784 5785 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5786 if (str == NULL) { 5787 xmlRegFreeAtom(atom); 5788 return(NULL); 5789 } 5790 memcpy(&str[0], token, lenp); 5791 str[lenp] = '|'; 5792 memcpy(&str[lenp + 1], token2, lenn); 5793 str[lenn + lenp + 1] = 0; 5794 5795 atom->valuep = str; 5796 } 5797 5798 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5799 xmlRegFreeAtom(atom); 5800 return(NULL); 5801 } 5802 if (to == NULL) 5803 return(am->state); 5804 return(to); 5805 } 5806 5807 /** 5808 * xmlAutomataNewNegTrans: 5809 * @am: an automata 5810 * @from: the starting point of the transition 5811 * @to: the target point of the transition or NULL 5812 * @token: the first input string associated to that transition 5813 * @token2: the second input string associated to that transition 5814 * @data: data passed to the callback function if the transition is activated 5815 * 5816 * If @to is NULL, this creates first a new target state in the automata 5817 * and then adds a transition from the @from state to the target state 5818 * activated by any value except (@token,@token2) 5819 * Note that if @token2 is not NULL, then (X, NULL) won't match to follow 5820 # the semantic of XSD ##other 5821 * 5822 * Returns the target state or NULL in case of error 5823 */ 5824 xmlAutomataStatePtr 5825 xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5826 xmlAutomataStatePtr to, const xmlChar *token, 5827 const xmlChar *token2, void *data) { 5828 xmlRegAtomPtr atom; 5829 xmlChar err_msg[200]; 5830 5831 if ((am == NULL) || (from == NULL) || (token == NULL)) 5832 return(NULL); 5833 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5834 if (atom == NULL) 5835 return(NULL); 5836 atom->data = data; 5837 atom->neg = 1; 5838 if ((token2 == NULL) || (*token2 == 0)) { 5839 atom->valuep = xmlStrdup(token); 5840 } else { 5841 int lenn, lenp; 5842 xmlChar *str; 5843 5844 lenn = strlen((char *) token2); 5845 lenp = strlen((char *) token); 5846 5847 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5848 if (str == NULL) { 5849 xmlRegFreeAtom(atom); 5850 return(NULL); 5851 } 5852 memcpy(&str[0], token, lenp); 5853 str[lenp] = '|'; 5854 memcpy(&str[lenp + 1], token2, lenn); 5855 str[lenn + lenp + 1] = 0; 5856 5857 atom->valuep = str; 5858 } 5859 snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep); 5860 err_msg[199] = 0; 5861 atom->valuep2 = xmlStrdup(err_msg); 5862 5863 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5864 xmlRegFreeAtom(atom); 5865 return(NULL); 5866 } 5867 am->negs++; 5868 if (to == NULL) 5869 return(am->state); 5870 return(to); 5871 } 5872 5873 /** 5874 * xmlAutomataNewCountTrans2: 5875 * @am: an automata 5876 * @from: the starting point of the transition 5877 * @to: the target point of the transition or NULL 5878 * @token: the input string associated to that transition 5879 * @token2: the second input string associated to that transition 5880 * @min: the minimum successive occurrences of token 5881 * @max: the maximum successive occurrences of token 5882 * @data: data associated to the transition 5883 * 5884 * If @to is NULL, this creates first a new target state in the automata 5885 * and then adds a transition from the @from state to the target state 5886 * activated by a succession of input of value @token and @token2 and 5887 * whose number is between @min and @max 5888 * 5889 * Returns the target state or NULL in case of error 5890 */ 5891 xmlAutomataStatePtr 5892 xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from, 5893 xmlAutomataStatePtr to, const xmlChar *token, 5894 const xmlChar *token2, 5895 int min, int max, void *data) { 5896 xmlRegAtomPtr atom; 5897 int counter; 5898 5899 if ((am == NULL) || (from == NULL) || (token == NULL)) 5900 return(NULL); 5901 if (min < 0) 5902 return(NULL); 5903 if ((max < min) || (max < 1)) 5904 return(NULL); 5905 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5906 if (atom == NULL) 5907 return(NULL); 5908 if ((token2 == NULL) || (*token2 == 0)) { 5909 atom->valuep = xmlStrdup(token); 5910 } else { 5911 int lenn, lenp; 5912 xmlChar *str; 5913 5914 lenn = strlen((char *) token2); 5915 lenp = strlen((char *) token); 5916 5917 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5918 if (str == NULL) { 5919 xmlRegFreeAtom(atom); 5920 return(NULL); 5921 } 5922 memcpy(&str[0], token, lenp); 5923 str[lenp] = '|'; 5924 memcpy(&str[lenp + 1], token2, lenn); 5925 str[lenn + lenp + 1] = 0; 5926 5927 atom->valuep = str; 5928 } 5929 atom->data = data; 5930 if (min == 0) 5931 atom->min = 1; 5932 else 5933 atom->min = min; 5934 atom->max = max; 5935 5936 /* 5937 * associate a counter to the transition. 5938 */ 5939 counter = xmlRegGetCounter(am); 5940 am->counters[counter].min = min; 5941 am->counters[counter].max = max; 5942 5943 /* xmlFAGenerateTransitions(am, from, to, atom); */ 5944 if (to == NULL) { 5945 to = xmlRegNewState(am); 5946 xmlRegStatePush(am, to); 5947 } 5948 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 5949 xmlRegAtomPush(am, atom); 5950 am->state = to; 5951 5952 if (to == NULL) 5953 to = am->state; 5954 if (to == NULL) 5955 return(NULL); 5956 if (min == 0) 5957 xmlFAGenerateEpsilonTransition(am, from, to); 5958 return(to); 5959 } 5960 5961 /** 5962 * xmlAutomataNewCountTrans: 5963 * @am: an automata 5964 * @from: the starting point of the transition 5965 * @to: the target point of the transition or NULL 5966 * @token: the input string associated to that transition 5967 * @min: the minimum successive occurrences of token 5968 * @max: the maximum successive occurrences of token 5969 * @data: data associated to the transition 5970 * 5971 * If @to is NULL, this creates first a new target state in the automata 5972 * and then adds a transition from the @from state to the target state 5973 * activated by a succession of input of value @token and whose number 5974 * is between @min and @max 5975 * 5976 * Returns the target state or NULL in case of error 5977 */ 5978 xmlAutomataStatePtr 5979 xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5980 xmlAutomataStatePtr to, const xmlChar *token, 5981 int min, int max, void *data) { 5982 xmlRegAtomPtr atom; 5983 int counter; 5984 5985 if ((am == NULL) || (from == NULL) || (token == NULL)) 5986 return(NULL); 5987 if (min < 0) 5988 return(NULL); 5989 if ((max < min) || (max < 1)) 5990 return(NULL); 5991 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5992 if (atom == NULL) 5993 return(NULL); 5994 atom->valuep = xmlStrdup(token); 5995 atom->data = data; 5996 if (min == 0) 5997 atom->min = 1; 5998 else 5999 atom->min = min; 6000 atom->max = max; 6001 6002 /* 6003 * associate a counter to the transition. 6004 */ 6005 counter = xmlRegGetCounter(am); 6006 am->counters[counter].min = min; 6007 am->counters[counter].max = max; 6008 6009 /* xmlFAGenerateTransitions(am, from, to, atom); */ 6010 if (to == NULL) { 6011 to = xmlRegNewState(am); 6012 xmlRegStatePush(am, to); 6013 } 6014 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 6015 xmlRegAtomPush(am, atom); 6016 am->state = to; 6017 6018 if (to == NULL) 6019 to = am->state; 6020 if (to == NULL) 6021 return(NULL); 6022 if (min == 0) 6023 xmlFAGenerateEpsilonTransition(am, from, to); 6024 return(to); 6025 } 6026 6027 /** 6028 * xmlAutomataNewOnceTrans2: 6029 * @am: an automata 6030 * @from: the starting point of the transition 6031 * @to: the target point of the transition or NULL 6032 * @token: the input string associated to that transition 6033 * @token2: the second input string associated to that transition 6034 * @min: the minimum successive occurrences of token 6035 * @max: the maximum successive occurrences of token 6036 * @data: data associated to the transition 6037 * 6038 * If @to is NULL, this creates first a new target state in the automata 6039 * and then adds a transition from the @from state to the target state 6040 * activated by a succession of input of value @token and @token2 and whose 6041 * number is between @min and @max, moreover that transition can only be 6042 * crossed once. 6043 * 6044 * Returns the target state or NULL in case of error 6045 */ 6046 xmlAutomataStatePtr 6047 xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from, 6048 xmlAutomataStatePtr to, const xmlChar *token, 6049 const xmlChar *token2, 6050 int min, int max, void *data) { 6051 xmlRegAtomPtr atom; 6052 int counter; 6053 6054 if ((am == NULL) || (from == NULL) || (token == NULL)) 6055 return(NULL); 6056 if (min < 1) 6057 return(NULL); 6058 if ((max < min) || (max < 1)) 6059 return(NULL); 6060 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 6061 if (atom == NULL) 6062 return(NULL); 6063 if ((token2 == NULL) || (*token2 == 0)) { 6064 atom->valuep = xmlStrdup(token); 6065 } else { 6066 int lenn, lenp; 6067 xmlChar *str; 6068 6069 lenn = strlen((char *) token2); 6070 lenp = strlen((char *) token); 6071 6072 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 6073 if (str == NULL) { 6074 xmlRegFreeAtom(atom); 6075 return(NULL); 6076 } 6077 memcpy(&str[0], token, lenp); 6078 str[lenp] = '|'; 6079 memcpy(&str[lenp + 1], token2, lenn); 6080 str[lenn + lenp + 1] = 0; 6081 6082 atom->valuep = str; 6083 } 6084 atom->data = data; 6085 atom->quant = XML_REGEXP_QUANT_ONCEONLY; 6086 atom->min = min; 6087 atom->max = max; 6088 /* 6089 * associate a counter to the transition. 6090 */ 6091 counter = xmlRegGetCounter(am); 6092 am->counters[counter].min = 1; 6093 am->counters[counter].max = 1; 6094 6095 /* xmlFAGenerateTransitions(am, from, to, atom); */ 6096 if (to == NULL) { 6097 to = xmlRegNewState(am); 6098 xmlRegStatePush(am, to); 6099 } 6100 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 6101 xmlRegAtomPush(am, atom); 6102 am->state = to; 6103 return(to); 6104 } 6105 6106 6107 6108 /** 6109 * xmlAutomataNewOnceTrans: 6110 * @am: an automata 6111 * @from: the starting point of the transition 6112 * @to: the target point of the transition or NULL 6113 * @token: the input string associated to that transition 6114 * @min: the minimum successive occurrences of token 6115 * @max: the maximum successive occurrences of token 6116 * @data: data associated to the transition 6117 * 6118 * If @to is NULL, this creates first a new target state in the automata 6119 * and then adds a transition from the @from state to the target state 6120 * activated by a succession of input of value @token and whose number 6121 * is between @min and @max, moreover that transition can only be crossed 6122 * once. 6123 * 6124 * Returns the target state or NULL in case of error 6125 */ 6126 xmlAutomataStatePtr 6127 xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6128 xmlAutomataStatePtr to, const xmlChar *token, 6129 int min, int max, void *data) { 6130 xmlRegAtomPtr atom; 6131 int counter; 6132 6133 if ((am == NULL) || (from == NULL) || (token == NULL)) 6134 return(NULL); 6135 if (min < 1) 6136 return(NULL); 6137 if ((max < min) || (max < 1)) 6138 return(NULL); 6139 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 6140 if (atom == NULL) 6141 return(NULL); 6142 atom->valuep = xmlStrdup(token); 6143 atom->data = data; 6144 atom->quant = XML_REGEXP_QUANT_ONCEONLY; 6145 atom->min = min; 6146 atom->max = max; 6147 /* 6148 * associate a counter to the transition. 6149 */ 6150 counter = xmlRegGetCounter(am); 6151 am->counters[counter].min = 1; 6152 am->counters[counter].max = 1; 6153 6154 /* xmlFAGenerateTransitions(am, from, to, atom); */ 6155 if (to == NULL) { 6156 to = xmlRegNewState(am); 6157 xmlRegStatePush(am, to); 6158 } 6159 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 6160 xmlRegAtomPush(am, atom); 6161 am->state = to; 6162 return(to); 6163 } 6164 6165 /** 6166 * xmlAutomataNewState: 6167 * @am: an automata 6168 * 6169 * Create a new disconnected state in the automata 6170 * 6171 * Returns the new state or NULL in case of error 6172 */ 6173 xmlAutomataStatePtr 6174 xmlAutomataNewState(xmlAutomataPtr am) { 6175 xmlAutomataStatePtr to; 6176 6177 if (am == NULL) 6178 return(NULL); 6179 to = xmlRegNewState(am); 6180 xmlRegStatePush(am, to); 6181 return(to); 6182 } 6183 6184 /** 6185 * xmlAutomataNewEpsilon: 6186 * @am: an automata 6187 * @from: the starting point of the transition 6188 * @to: the target point of the transition or NULL 6189 * 6190 * If @to is NULL, this creates first a new target state in the automata 6191 * and then adds an epsilon transition from the @from state to the 6192 * target state 6193 * 6194 * Returns the target state or NULL in case of error 6195 */ 6196 xmlAutomataStatePtr 6197 xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from, 6198 xmlAutomataStatePtr to) { 6199 if ((am == NULL) || (from == NULL)) 6200 return(NULL); 6201 xmlFAGenerateEpsilonTransition(am, from, to); 6202 if (to == NULL) 6203 return(am->state); 6204 return(to); 6205 } 6206 6207 /** 6208 * xmlAutomataNewAllTrans: 6209 * @am: an automata 6210 * @from: the starting point of the transition 6211 * @to: the target point of the transition or NULL 6212 * @lax: allow to transition if not all all transitions have been activated 6213 * 6214 * If @to is NULL, this creates first a new target state in the automata 6215 * and then adds a an ALL transition from the @from state to the 6216 * target state. That transition is an epsilon transition allowed only when 6217 * all transitions from the @from node have been activated. 6218 * 6219 * Returns the target state or NULL in case of error 6220 */ 6221 xmlAutomataStatePtr 6222 xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6223 xmlAutomataStatePtr to, int lax) { 6224 if ((am == NULL) || (from == NULL)) 6225 return(NULL); 6226 xmlFAGenerateAllTransition(am, from, to, lax); 6227 if (to == NULL) 6228 return(am->state); 6229 return(to); 6230 } 6231 6232 /** 6233 * xmlAutomataNewCounter: 6234 * @am: an automata 6235 * @min: the minimal value on the counter 6236 * @max: the maximal value on the counter 6237 * 6238 * Create a new counter 6239 * 6240 * Returns the counter number or -1 in case of error 6241 */ 6242 int 6243 xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) { 6244 int ret; 6245 6246 if (am == NULL) 6247 return(-1); 6248 6249 ret = xmlRegGetCounter(am); 6250 if (ret < 0) 6251 return(-1); 6252 am->counters[ret].min = min; 6253 am->counters[ret].max = max; 6254 return(ret); 6255 } 6256 6257 /** 6258 * xmlAutomataNewCountedTrans: 6259 * @am: an automata 6260 * @from: the starting point of the transition 6261 * @to: the target point of the transition or NULL 6262 * @counter: the counter associated to that transition 6263 * 6264 * If @to is NULL, this creates first a new target state in the automata 6265 * and then adds an epsilon transition from the @from state to the target state 6266 * which will increment the counter provided 6267 * 6268 * Returns the target state or NULL in case of error 6269 */ 6270 xmlAutomataStatePtr 6271 xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6272 xmlAutomataStatePtr to, int counter) { 6273 if ((am == NULL) || (from == NULL) || (counter < 0)) 6274 return(NULL); 6275 xmlFAGenerateCountedEpsilonTransition(am, from, to, counter); 6276 if (to == NULL) 6277 return(am->state); 6278 return(to); 6279 } 6280 6281 /** 6282 * xmlAutomataNewCounterTrans: 6283 * @am: an automata 6284 * @from: the starting point of the transition 6285 * @to: the target point of the transition or NULL 6286 * @counter: the counter associated to that transition 6287 * 6288 * If @to is NULL, this creates first a new target state in the automata 6289 * and then adds an epsilon transition from the @from state to the target state 6290 * which will be allowed only if the counter is within the right range. 6291 * 6292 * Returns the target state or NULL in case of error 6293 */ 6294 xmlAutomataStatePtr 6295 xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6296 xmlAutomataStatePtr to, int counter) { 6297 if ((am == NULL) || (from == NULL) || (counter < 0)) 6298 return(NULL); 6299 xmlFAGenerateCountedTransition(am, from, to, counter); 6300 if (to == NULL) 6301 return(am->state); 6302 return(to); 6303 } 6304 6305 /** 6306 * xmlAutomataCompile: 6307 * @am: an automata 6308 * 6309 * Compile the automata into a Reg Exp ready for being executed. 6310 * The automata should be free after this point. 6311 * 6312 * Returns the compiled regexp or NULL in case of error 6313 */ 6314 xmlRegexpPtr 6315 xmlAutomataCompile(xmlAutomataPtr am) { 6316 xmlRegexpPtr ret; 6317 6318 if ((am == NULL) || (am->error != 0)) return(NULL); 6319 xmlFAEliminateEpsilonTransitions(am); 6320 /* xmlFAComputesDeterminism(am); */ 6321 ret = xmlRegEpxFromParse(am); 6322 6323 return(ret); 6324 } 6325 6326 /** 6327 * xmlAutomataIsDeterminist: 6328 * @am: an automata 6329 * 6330 * Checks if an automata is determinist. 6331 * 6332 * Returns 1 if true, 0 if not, and -1 in case of error 6333 */ 6334 int 6335 xmlAutomataIsDeterminist(xmlAutomataPtr am) { 6336 int ret; 6337 6338 if (am == NULL) 6339 return(-1); 6340 6341 ret = xmlFAComputesDeterminism(am); 6342 return(ret); 6343 } 6344 #endif /* LIBXML_AUTOMATA_ENABLED */ 6345 6346 #ifdef LIBXML_EXPR_ENABLED 6347 /************************************************************************ 6348 * * 6349 * Formal Expression handling code * 6350 * * 6351 ************************************************************************/ 6352 /************************************************************************ 6353 * * 6354 * Expression handling context * 6355 * * 6356 ************************************************************************/ 6357 6358 struct _xmlExpCtxt { 6359 xmlDictPtr dict; 6360 xmlExpNodePtr *table; 6361 int size; 6362 int nbElems; 6363 int nb_nodes; 6364 int maxNodes; 6365 const char *expr; 6366 const char *cur; 6367 int nb_cons; 6368 int tabSize; 6369 }; 6370 6371 /** 6372 * xmlExpNewCtxt: 6373 * @maxNodes: the maximum number of nodes 6374 * @dict: optional dictionary to use internally 6375 * 6376 * Creates a new context for manipulating expressions 6377 * 6378 * Returns the context or NULL in case of error 6379 */ 6380 xmlExpCtxtPtr 6381 xmlExpNewCtxt(int maxNodes, xmlDictPtr dict) { 6382 xmlExpCtxtPtr ret; 6383 int size = 256; 6384 6385 if (maxNodes <= 4096) 6386 maxNodes = 4096; 6387 6388 ret = (xmlExpCtxtPtr) xmlMalloc(sizeof(xmlExpCtxt)); 6389 if (ret == NULL) 6390 return(NULL); 6391 memset(ret, 0, sizeof(xmlExpCtxt)); 6392 ret->size = size; 6393 ret->nbElems = 0; 6394 ret->maxNodes = maxNodes; 6395 ret->table = xmlMalloc(size * sizeof(xmlExpNodePtr)); 6396 if (ret->table == NULL) { 6397 xmlFree(ret); 6398 return(NULL); 6399 } 6400 memset(ret->table, 0, size * sizeof(xmlExpNodePtr)); 6401 if (dict == NULL) { 6402 ret->dict = xmlDictCreate(); 6403 if (ret->dict == NULL) { 6404 xmlFree(ret->table); 6405 xmlFree(ret); 6406 return(NULL); 6407 } 6408 } else { 6409 ret->dict = dict; 6410 xmlDictReference(ret->dict); 6411 } 6412 return(ret); 6413 } 6414 6415 /** 6416 * xmlExpFreeCtxt: 6417 * @ctxt: an expression context 6418 * 6419 * Free an expression context 6420 */ 6421 void 6422 xmlExpFreeCtxt(xmlExpCtxtPtr ctxt) { 6423 if (ctxt == NULL) 6424 return; 6425 xmlDictFree(ctxt->dict); 6426 if (ctxt->table != NULL) 6427 xmlFree(ctxt->table); 6428 xmlFree(ctxt); 6429 } 6430 6431 /************************************************************************ 6432 * * 6433 * Structure associated to an expression node * 6434 * * 6435 ************************************************************************/ 6436 #define MAX_NODES 10000 6437 6438 /* #define DEBUG_DERIV */ 6439 6440 /* 6441 * TODO: 6442 * - Wildcards 6443 * - public API for creation 6444 * 6445 * Started 6446 * - regression testing 6447 * 6448 * Done 6449 * - split into module and test tool 6450 * - memleaks 6451 */ 6452 6453 typedef enum { 6454 XML_EXP_NILABLE = (1 << 0) 6455 } xmlExpNodeInfo; 6456 6457 #define IS_NILLABLE(node) ((node)->info & XML_EXP_NILABLE) 6458 6459 struct _xmlExpNode { 6460 unsigned char type;/* xmlExpNodeType */ 6461 unsigned char info;/* OR of xmlExpNodeInfo */ 6462 unsigned short key; /* the hash key */ 6463 unsigned int ref; /* The number of references */ 6464 int c_max; /* the maximum length it can consume */ 6465 xmlExpNodePtr exp_left; 6466 xmlExpNodePtr next;/* the next node in the hash table or free list */ 6467 union { 6468 struct { 6469 int f_min; 6470 int f_max; 6471 } count; 6472 struct { 6473 xmlExpNodePtr f_right; 6474 } children; 6475 const xmlChar *f_str; 6476 } field; 6477 }; 6478 6479 #define exp_min field.count.f_min 6480 #define exp_max field.count.f_max 6481 /* #define exp_left field.children.f_left */ 6482 #define exp_right field.children.f_right 6483 #define exp_str field.f_str 6484 6485 static xmlExpNodePtr xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type); 6486 static xmlExpNode forbiddenExpNode = { 6487 XML_EXP_FORBID, 0, 0, 0, 0, NULL, NULL, {{ 0, 0}} 6488 }; 6489 xmlExpNodePtr forbiddenExp = &forbiddenExpNode; 6490 static xmlExpNode emptyExpNode = { 6491 XML_EXP_EMPTY, 1, 0, 0, 0, NULL, NULL, {{ 0, 0}} 6492 }; 6493 xmlExpNodePtr emptyExp = &emptyExpNode; 6494 6495 /************************************************************************ 6496 * * 6497 * The custom hash table for unicity and canonicalization * 6498 * of sub-expressions pointers * 6499 * * 6500 ************************************************************************/ 6501 /* 6502 * xmlExpHashNameComputeKey: 6503 * Calculate the hash key for a token 6504 */ 6505 static unsigned short 6506 xmlExpHashNameComputeKey(const xmlChar *name) { 6507 unsigned short value = 0L; 6508 char ch; 6509 6510 if (name != NULL) { 6511 value += 30 * (*name); 6512 while ((ch = *name++) != 0) { 6513 value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch); 6514 } 6515 } 6516 return (value); 6517 } 6518 6519 /* 6520 * xmlExpHashComputeKey: 6521 * Calculate the hash key for a compound expression 6522 */ 6523 static unsigned short 6524 xmlExpHashComputeKey(xmlExpNodeType type, xmlExpNodePtr left, 6525 xmlExpNodePtr right) { 6526 unsigned long value; 6527 unsigned short ret; 6528 6529 switch (type) { 6530 case XML_EXP_SEQ: 6531 value = left->key; 6532 value += right->key; 6533 value *= 3; 6534 ret = (unsigned short) value; 6535 break; 6536 case XML_EXP_OR: 6537 value = left->key; 6538 value += right->key; 6539 value *= 7; 6540 ret = (unsigned short) value; 6541 break; 6542 case XML_EXP_COUNT: 6543 value = left->key; 6544 value += right->key; 6545 ret = (unsigned short) value; 6546 break; 6547 default: 6548 ret = 0; 6549 } 6550 return(ret); 6551 } 6552 6553 6554 static xmlExpNodePtr 6555 xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type) { 6556 xmlExpNodePtr ret; 6557 6558 if (ctxt->nb_nodes >= MAX_NODES) 6559 return(NULL); 6560 ret = (xmlExpNodePtr) xmlMalloc(sizeof(xmlExpNode)); 6561 if (ret == NULL) 6562 return(NULL); 6563 memset(ret, 0, sizeof(xmlExpNode)); 6564 ret->type = type; 6565 ret->next = NULL; 6566 ctxt->nb_nodes++; 6567 ctxt->nb_cons++; 6568 return(ret); 6569 } 6570 6571 /** 6572 * xmlExpHashGetEntry: 6573 * @table: the hash table 6574 * 6575 * Get the unique entry from the hash table. The entry is created if 6576 * needed. @left and @right are consumed, i.e. their ref count will 6577 * be decremented by the operation. 6578 * 6579 * Returns the pointer or NULL in case of error 6580 */ 6581 static xmlExpNodePtr 6582 xmlExpHashGetEntry(xmlExpCtxtPtr ctxt, xmlExpNodeType type, 6583 xmlExpNodePtr left, xmlExpNodePtr right, 6584 const xmlChar *name, int min, int max) { 6585 unsigned short kbase, key; 6586 xmlExpNodePtr entry; 6587 xmlExpNodePtr insert; 6588 6589 if (ctxt == NULL) 6590 return(NULL); 6591 6592 /* 6593 * Check for duplicate and insertion location. 6594 */ 6595 if (type == XML_EXP_ATOM) { 6596 kbase = xmlExpHashNameComputeKey(name); 6597 } else if (type == XML_EXP_COUNT) { 6598 /* COUNT reduction rule 1 */ 6599 /* a{1} -> a */ 6600 if (min == max) { 6601 if (min == 1) { 6602 return(left); 6603 } 6604 if (min == 0) { 6605 xmlExpFree(ctxt, left); 6606 return(emptyExp); 6607 } 6608 } 6609 if (min < 0) { 6610 xmlExpFree(ctxt, left); 6611 return(forbiddenExp); 6612 } 6613 if (max == -1) 6614 kbase = min + 79; 6615 else 6616 kbase = max - min; 6617 kbase += left->key; 6618 } else if (type == XML_EXP_OR) { 6619 /* Forbid reduction rules */ 6620 if (left->type == XML_EXP_FORBID) { 6621 xmlExpFree(ctxt, left); 6622 return(right); 6623 } 6624 if (right->type == XML_EXP_FORBID) { 6625 xmlExpFree(ctxt, right); 6626 return(left); 6627 } 6628 6629 /* OR reduction rule 1 */ 6630 /* a | a reduced to a */ 6631 if (left == right) { 6632 xmlExpFree(ctxt, right); 6633 return(left); 6634 } 6635 /* OR canonicalization rule 1 */ 6636 /* linearize (a | b) | c into a | (b | c) */ 6637 if ((left->type == XML_EXP_OR) && (right->type != XML_EXP_OR)) { 6638 xmlExpNodePtr tmp = left; 6639 left = right; 6640 right = tmp; 6641 } 6642 /* OR reduction rule 2 */ 6643 /* a | (a | b) and b | (a | b) are reduced to a | b */ 6644 if (right->type == XML_EXP_OR) { 6645 if ((left == right->exp_left) || 6646 (left == right->exp_right)) { 6647 xmlExpFree(ctxt, left); 6648 return(right); 6649 } 6650 } 6651 /* OR canonicalization rule 2 */ 6652 /* linearize (a | b) | c into a | (b | c) */ 6653 if (left->type == XML_EXP_OR) { 6654 xmlExpNodePtr tmp; 6655 6656 /* OR canonicalization rule 2 */ 6657 if ((left->exp_right->type != XML_EXP_OR) && 6658 (left->exp_right->key < left->exp_left->key)) { 6659 tmp = left->exp_right; 6660 left->exp_right = left->exp_left; 6661 left->exp_left = tmp; 6662 } 6663 left->exp_right->ref++; 6664 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_right, right, 6665 NULL, 0, 0); 6666 left->exp_left->ref++; 6667 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_left, tmp, 6668 NULL, 0, 0); 6669 6670 xmlExpFree(ctxt, left); 6671 return(tmp); 6672 } 6673 if (right->type == XML_EXP_OR) { 6674 /* Ordering in the tree */ 6675 /* C | (A | B) -> A | (B | C) */ 6676 if (left->key > right->exp_right->key) { 6677 xmlExpNodePtr tmp; 6678 right->exp_right->ref++; 6679 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_right, 6680 left, NULL, 0, 0); 6681 right->exp_left->ref++; 6682 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left, 6683 tmp, NULL, 0, 0); 6684 xmlExpFree(ctxt, right); 6685 return(tmp); 6686 } 6687 /* Ordering in the tree */ 6688 /* B | (A | C) -> A | (B | C) */ 6689 if (left->key > right->exp_left->key) { 6690 xmlExpNodePtr tmp; 6691 right->exp_right->ref++; 6692 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, 6693 right->exp_right, NULL, 0, 0); 6694 right->exp_left->ref++; 6695 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left, 6696 tmp, NULL, 0, 0); 6697 xmlExpFree(ctxt, right); 6698 return(tmp); 6699 } 6700 } 6701 /* we know both types are != XML_EXP_OR here */ 6702 else if (left->key > right->key) { 6703 xmlExpNodePtr tmp = left; 6704 left = right; 6705 right = tmp; 6706 } 6707 kbase = xmlExpHashComputeKey(type, left, right); 6708 } else if (type == XML_EXP_SEQ) { 6709 /* Forbid reduction rules */ 6710 if (left->type == XML_EXP_FORBID) { 6711 xmlExpFree(ctxt, right); 6712 return(left); 6713 } 6714 if (right->type == XML_EXP_FORBID) { 6715 xmlExpFree(ctxt, left); 6716 return(right); 6717 } 6718 /* Empty reduction rules */ 6719 if (right->type == XML_EXP_EMPTY) { 6720 return(left); 6721 } 6722 if (left->type == XML_EXP_EMPTY) { 6723 return(right); 6724 } 6725 kbase = xmlExpHashComputeKey(type, left, right); 6726 } else 6727 return(NULL); 6728 6729 key = kbase % ctxt->size; 6730 if (ctxt->table[key] != NULL) { 6731 for (insert = ctxt->table[key]; insert != NULL; 6732 insert = insert->next) { 6733 if ((insert->key == kbase) && 6734 (insert->type == type)) { 6735 if (type == XML_EXP_ATOM) { 6736 if (name == insert->exp_str) { 6737 insert->ref++; 6738 return(insert); 6739 } 6740 } else if (type == XML_EXP_COUNT) { 6741 if ((insert->exp_min == min) && (insert->exp_max == max) && 6742 (insert->exp_left == left)) { 6743 insert->ref++; 6744 left->ref--; 6745 return(insert); 6746 } 6747 } else if ((insert->exp_left == left) && 6748 (insert->exp_right == right)) { 6749 insert->ref++; 6750 left->ref--; 6751 right->ref--; 6752 return(insert); 6753 } 6754 } 6755 } 6756 } 6757 6758 entry = xmlExpNewNode(ctxt, type); 6759 if (entry == NULL) 6760 return(NULL); 6761 entry->key = kbase; 6762 if (type == XML_EXP_ATOM) { 6763 entry->exp_str = name; 6764 entry->c_max = 1; 6765 } else if (type == XML_EXP_COUNT) { 6766 entry->exp_min = min; 6767 entry->exp_max = max; 6768 entry->exp_left = left; 6769 if ((min == 0) || (IS_NILLABLE(left))) 6770 entry->info |= XML_EXP_NILABLE; 6771 if (max < 0) 6772 entry->c_max = -1; 6773 else 6774 entry->c_max = max * entry->exp_left->c_max; 6775 } else { 6776 entry->exp_left = left; 6777 entry->exp_right = right; 6778 if (type == XML_EXP_OR) { 6779 if ((IS_NILLABLE(left)) || (IS_NILLABLE(right))) 6780 entry->info |= XML_EXP_NILABLE; 6781 if ((entry->exp_left->c_max == -1) || 6782 (entry->exp_right->c_max == -1)) 6783 entry->c_max = -1; 6784 else if (entry->exp_left->c_max > entry->exp_right->c_max) 6785 entry->c_max = entry->exp_left->c_max; 6786 else 6787 entry->c_max = entry->exp_right->c_max; 6788 } else { 6789 if ((IS_NILLABLE(left)) && (IS_NILLABLE(right))) 6790 entry->info |= XML_EXP_NILABLE; 6791 if ((entry->exp_left->c_max == -1) || 6792 (entry->exp_right->c_max == -1)) 6793 entry->c_max = -1; 6794 else 6795 entry->c_max = entry->exp_left->c_max + entry->exp_right->c_max; 6796 } 6797 } 6798 entry->ref = 1; 6799 if (ctxt->table[key] != NULL) 6800 entry->next = ctxt->table[key]; 6801 6802 ctxt->table[key] = entry; 6803 ctxt->nbElems++; 6804 6805 return(entry); 6806 } 6807 6808 /** 6809 * xmlExpFree: 6810 * @ctxt: the expression context 6811 * @exp: the expression 6812 * 6813 * Dereference the expression 6814 */ 6815 void 6816 xmlExpFree(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp) { 6817 if ((exp == NULL) || (exp == forbiddenExp) || (exp == emptyExp)) 6818 return; 6819 exp->ref--; 6820 if (exp->ref == 0) { 6821 unsigned short key; 6822 6823 /* Unlink it first from the hash table */ 6824 key = exp->key % ctxt->size; 6825 if (ctxt->table[key] == exp) { 6826 ctxt->table[key] = exp->next; 6827 } else { 6828 xmlExpNodePtr tmp; 6829 6830 tmp = ctxt->table[key]; 6831 while (tmp != NULL) { 6832 if (tmp->next == exp) { 6833 tmp->next = exp->next; 6834 break; 6835 } 6836 tmp = tmp->next; 6837 } 6838 } 6839 6840 if ((exp->type == XML_EXP_SEQ) || (exp->type == XML_EXP_OR)) { 6841 xmlExpFree(ctxt, exp->exp_left); 6842 xmlExpFree(ctxt, exp->exp_right); 6843 } else if (exp->type == XML_EXP_COUNT) { 6844 xmlExpFree(ctxt, exp->exp_left); 6845 } 6846 xmlFree(exp); 6847 ctxt->nb_nodes--; 6848 } 6849 } 6850 6851 /** 6852 * xmlExpRef: 6853 * @exp: the expression 6854 * 6855 * Increase the reference count of the expression 6856 */ 6857 void 6858 xmlExpRef(xmlExpNodePtr exp) { 6859 if (exp != NULL) 6860 exp->ref++; 6861 } 6862 6863 /** 6864 * xmlExpNewAtom: 6865 * @ctxt: the expression context 6866 * @name: the atom name 6867 * @len: the atom name length in byte (or -1); 6868 * 6869 * Get the atom associated to this name from that context 6870 * 6871 * Returns the node or NULL in case of error 6872 */ 6873 xmlExpNodePtr 6874 xmlExpNewAtom(xmlExpCtxtPtr ctxt, const xmlChar *name, int len) { 6875 if ((ctxt == NULL) || (name == NULL)) 6876 return(NULL); 6877 name = xmlDictLookup(ctxt->dict, name, len); 6878 if (name == NULL) 6879 return(NULL); 6880 return(xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, name, 0, 0)); 6881 } 6882 6883 /** 6884 * xmlExpNewOr: 6885 * @ctxt: the expression context 6886 * @left: left expression 6887 * @right: right expression 6888 * 6889 * Get the atom associated to the choice @left | @right 6890 * Note that @left and @right are consumed in the operation, to keep 6891 * an handle on them use xmlExpRef() and use xmlExpFree() to release them, 6892 * this is true even in case of failure (unless ctxt == NULL). 6893 * 6894 * Returns the node or NULL in case of error 6895 */ 6896 xmlExpNodePtr 6897 xmlExpNewOr(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) { 6898 if (ctxt == NULL) 6899 return(NULL); 6900 if ((left == NULL) || (right == NULL)) { 6901 xmlExpFree(ctxt, left); 6902 xmlExpFree(ctxt, right); 6903 return(NULL); 6904 } 6905 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, right, NULL, 0, 0)); 6906 } 6907 6908 /** 6909 * xmlExpNewSeq: 6910 * @ctxt: the expression context 6911 * @left: left expression 6912 * @right: right expression 6913 * 6914 * Get the atom associated to the sequence @left , @right 6915 * Note that @left and @right are consumed in the operation, to keep 6916 * an handle on them use xmlExpRef() and use xmlExpFree() to release them, 6917 * this is true even in case of failure (unless ctxt == NULL). 6918 * 6919 * Returns the node or NULL in case of error 6920 */ 6921 xmlExpNodePtr 6922 xmlExpNewSeq(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) { 6923 if (ctxt == NULL) 6924 return(NULL); 6925 if ((left == NULL) || (right == NULL)) { 6926 xmlExpFree(ctxt, left); 6927 xmlExpFree(ctxt, right); 6928 return(NULL); 6929 } 6930 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, left, right, NULL, 0, 0)); 6931 } 6932 6933 /** 6934 * xmlExpNewRange: 6935 * @ctxt: the expression context 6936 * @subset: the expression to be repeated 6937 * @min: the lower bound for the repetition 6938 * @max: the upper bound for the repetition, -1 means infinite 6939 * 6940 * Get the atom associated to the range (@subset){@min, @max} 6941 * Note that @subset is consumed in the operation, to keep 6942 * an handle on it use xmlExpRef() and use xmlExpFree() to release it, 6943 * this is true even in case of failure (unless ctxt == NULL). 6944 * 6945 * Returns the node or NULL in case of error 6946 */ 6947 xmlExpNodePtr 6948 xmlExpNewRange(xmlExpCtxtPtr ctxt, xmlExpNodePtr subset, int min, int max) { 6949 if (ctxt == NULL) 6950 return(NULL); 6951 if ((subset == NULL) || (min < 0) || (max < -1) || 6952 ((max >= 0) && (min > max))) { 6953 xmlExpFree(ctxt, subset); 6954 return(NULL); 6955 } 6956 return(xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, subset, 6957 NULL, NULL, min, max)); 6958 } 6959 6960 /************************************************************************ 6961 * * 6962 * Public API for operations on expressions * 6963 * * 6964 ************************************************************************/ 6965 6966 static int 6967 xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 6968 const xmlChar**list, int len, int nb) { 6969 int tmp, tmp2; 6970 tail: 6971 switch (exp->type) { 6972 case XML_EXP_EMPTY: 6973 return(0); 6974 case XML_EXP_ATOM: 6975 for (tmp = 0;tmp < nb;tmp++) 6976 if (list[tmp] == exp->exp_str) 6977 return(0); 6978 if (nb >= len) 6979 return(-2); 6980 list[nb] = exp->exp_str; 6981 return(1); 6982 case XML_EXP_COUNT: 6983 exp = exp->exp_left; 6984 goto tail; 6985 case XML_EXP_SEQ: 6986 case XML_EXP_OR: 6987 tmp = xmlExpGetLanguageInt(ctxt, exp->exp_left, list, len, nb); 6988 if (tmp < 0) 6989 return(tmp); 6990 tmp2 = xmlExpGetLanguageInt(ctxt, exp->exp_right, list, len, 6991 nb + tmp); 6992 if (tmp2 < 0) 6993 return(tmp2); 6994 return(tmp + tmp2); 6995 } 6996 return(-1); 6997 } 6998 6999 /** 7000 * xmlExpGetLanguage: 7001 * @ctxt: the expression context 7002 * @exp: the expression 7003 * @langList: where to store the tokens 7004 * @len: the allocated length of @list 7005 * 7006 * Find all the strings used in @exp and store them in @list 7007 * 7008 * Returns the number of unique strings found, -1 in case of errors and 7009 * -2 if there is more than @len strings 7010 */ 7011 int 7012 xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7013 const xmlChar**langList, int len) { 7014 if ((ctxt == NULL) || (exp == NULL) || (langList == NULL) || (len <= 0)) 7015 return(-1); 7016 return(xmlExpGetLanguageInt(ctxt, exp, langList, len, 0)); 7017 } 7018 7019 static int 7020 xmlExpGetStartInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7021 const xmlChar**list, int len, int nb) { 7022 int tmp, tmp2; 7023 tail: 7024 switch (exp->type) { 7025 case XML_EXP_FORBID: 7026 return(0); 7027 case XML_EXP_EMPTY: 7028 return(0); 7029 case XML_EXP_ATOM: 7030 for (tmp = 0;tmp < nb;tmp++) 7031 if (list[tmp] == exp->exp_str) 7032 return(0); 7033 if (nb >= len) 7034 return(-2); 7035 list[nb] = exp->exp_str; 7036 return(1); 7037 case XML_EXP_COUNT: 7038 exp = exp->exp_left; 7039 goto tail; 7040 case XML_EXP_SEQ: 7041 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb); 7042 if (tmp < 0) 7043 return(tmp); 7044 if (IS_NILLABLE(exp->exp_left)) { 7045 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len, 7046 nb + tmp); 7047 if (tmp2 < 0) 7048 return(tmp2); 7049 tmp += tmp2; 7050 } 7051 return(tmp); 7052 case XML_EXP_OR: 7053 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb); 7054 if (tmp < 0) 7055 return(tmp); 7056 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len, 7057 nb + tmp); 7058 if (tmp2 < 0) 7059 return(tmp2); 7060 return(tmp + tmp2); 7061 } 7062 return(-1); 7063 } 7064 7065 /** 7066 * xmlExpGetStart: 7067 * @ctxt: the expression context 7068 * @exp: the expression 7069 * @tokList: where to store the tokens 7070 * @len: the allocated length of @list 7071 * 7072 * Find all the strings that appears at the start of the languages 7073 * accepted by @exp and store them in @list. E.g. for (a, b) | c 7074 * it will return the list [a, c] 7075 * 7076 * Returns the number of unique strings found, -1 in case of errors and 7077 * -2 if there is more than @len strings 7078 */ 7079 int 7080 xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7081 const xmlChar**tokList, int len) { 7082 if ((ctxt == NULL) || (exp == NULL) || (tokList == NULL) || (len <= 0)) 7083 return(-1); 7084 return(xmlExpGetStartInt(ctxt, exp, tokList, len, 0)); 7085 } 7086 7087 /** 7088 * xmlExpIsNillable: 7089 * @exp: the expression 7090 * 7091 * Finds if the expression is nillable, i.e. if it accepts the empty sequence 7092 * 7093 * Returns 1 if nillable, 0 if not and -1 in case of error 7094 */ 7095 int 7096 xmlExpIsNillable(xmlExpNodePtr exp) { 7097 if (exp == NULL) 7098 return(-1); 7099 return(IS_NILLABLE(exp) != 0); 7100 } 7101 7102 static xmlExpNodePtr 7103 xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar *str) 7104 { 7105 xmlExpNodePtr ret; 7106 7107 switch (exp->type) { 7108 case XML_EXP_EMPTY: 7109 return(forbiddenExp); 7110 case XML_EXP_FORBID: 7111 return(forbiddenExp); 7112 case XML_EXP_ATOM: 7113 if (exp->exp_str == str) { 7114 #ifdef DEBUG_DERIV 7115 printf("deriv atom: equal => Empty\n"); 7116 #endif 7117 ret = emptyExp; 7118 } else { 7119 #ifdef DEBUG_DERIV 7120 printf("deriv atom: mismatch => forbid\n"); 7121 #endif 7122 /* TODO wildcards here */ 7123 ret = forbiddenExp; 7124 } 7125 return(ret); 7126 case XML_EXP_OR: { 7127 xmlExpNodePtr tmp; 7128 7129 #ifdef DEBUG_DERIV 7130 printf("deriv or: => or(derivs)\n"); 7131 #endif 7132 tmp = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 7133 if (tmp == NULL) { 7134 return(NULL); 7135 } 7136 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str); 7137 if (ret == NULL) { 7138 xmlExpFree(ctxt, tmp); 7139 return(NULL); 7140 } 7141 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, 7142 NULL, 0, 0); 7143 return(ret); 7144 } 7145 case XML_EXP_SEQ: 7146 #ifdef DEBUG_DERIV 7147 printf("deriv seq: starting with left\n"); 7148 #endif 7149 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 7150 if (ret == NULL) { 7151 return(NULL); 7152 } else if (ret == forbiddenExp) { 7153 if (IS_NILLABLE(exp->exp_left)) { 7154 #ifdef DEBUG_DERIV 7155 printf("deriv seq: left failed but nillable\n"); 7156 #endif 7157 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str); 7158 } 7159 } else { 7160 #ifdef DEBUG_DERIV 7161 printf("deriv seq: left match => sequence\n"); 7162 #endif 7163 exp->exp_right->ref++; 7164 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, exp->exp_right, 7165 NULL, 0, 0); 7166 } 7167 return(ret); 7168 case XML_EXP_COUNT: { 7169 int min, max; 7170 xmlExpNodePtr tmp; 7171 7172 if (exp->exp_max == 0) 7173 return(forbiddenExp); 7174 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 7175 if (ret == NULL) 7176 return(NULL); 7177 if (ret == forbiddenExp) { 7178 #ifdef DEBUG_DERIV 7179 printf("deriv count: pattern mismatch => forbid\n"); 7180 #endif 7181 return(ret); 7182 } 7183 if (exp->exp_max == 1) 7184 return(ret); 7185 if (exp->exp_max < 0) /* unbounded */ 7186 max = -1; 7187 else 7188 max = exp->exp_max - 1; 7189 if (exp->exp_min > 0) 7190 min = exp->exp_min - 1; 7191 else 7192 min = 0; 7193 exp->exp_left->ref++; 7194 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, NULL, 7195 NULL, min, max); 7196 if (ret == emptyExp) { 7197 #ifdef DEBUG_DERIV 7198 printf("deriv count: match to empty => new count\n"); 7199 #endif 7200 return(tmp); 7201 } 7202 #ifdef DEBUG_DERIV 7203 printf("deriv count: match => sequence with new count\n"); 7204 #endif 7205 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, tmp, 7206 NULL, 0, 0)); 7207 } 7208 } 7209 return(NULL); 7210 } 7211 7212 /** 7213 * xmlExpStringDerive: 7214 * @ctxt: the expression context 7215 * @exp: the expression 7216 * @str: the string 7217 * @len: the string len in bytes if available 7218 * 7219 * Do one step of Brzozowski derivation of the expression @exp with 7220 * respect to the input string 7221 * 7222 * Returns the resulting expression or NULL in case of internal error 7223 */ 7224 xmlExpNodePtr 7225 xmlExpStringDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7226 const xmlChar *str, int len) { 7227 const xmlChar *input; 7228 7229 if ((exp == NULL) || (ctxt == NULL) || (str == NULL)) { 7230 return(NULL); 7231 } 7232 /* 7233 * check the string is in the dictionary, if yes use an interned 7234 * copy, otherwise we know it's not an acceptable input 7235 */ 7236 input = xmlDictExists(ctxt->dict, str, len); 7237 if (input == NULL) { 7238 return(forbiddenExp); 7239 } 7240 return(xmlExpStringDeriveInt(ctxt, exp, input)); 7241 } 7242 7243 static int 7244 xmlExpCheckCard(xmlExpNodePtr exp, xmlExpNodePtr sub) { 7245 int ret = 1; 7246 7247 if (sub->c_max == -1) { 7248 if (exp->c_max != -1) 7249 ret = 0; 7250 } else if ((exp->c_max >= 0) && (exp->c_max < sub->c_max)) { 7251 ret = 0; 7252 } 7253 #if 0 7254 if ((IS_NILLABLE(sub)) && (!IS_NILLABLE(exp))) 7255 ret = 0; 7256 #endif 7257 return(ret); 7258 } 7259 7260 static xmlExpNodePtr xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7261 xmlExpNodePtr sub); 7262 /** 7263 * xmlExpDivide: 7264 * @ctxt: the expressions context 7265 * @exp: the englobing expression 7266 * @sub: the subexpression 7267 * @mult: the multiple expression 7268 * @remain: the remain from the derivation of the multiple 7269 * 7270 * Check if exp is a multiple of sub, i.e. if there is a finite number n 7271 * so that sub{n} subsume exp 7272 * 7273 * Returns the multiple value if successful, 0 if it is not a multiple 7274 * and -1 in case of internal error. 7275 */ 7276 7277 static int 7278 xmlExpDivide(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub, 7279 xmlExpNodePtr *mult, xmlExpNodePtr *remain) { 7280 int i; 7281 xmlExpNodePtr tmp, tmp2; 7282 7283 if (mult != NULL) *mult = NULL; 7284 if (remain != NULL) *remain = NULL; 7285 if (exp->c_max == -1) return(0); 7286 if (IS_NILLABLE(exp) && (!IS_NILLABLE(sub))) return(0); 7287 7288 for (i = 1;i <= exp->c_max;i++) { 7289 sub->ref++; 7290 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, 7291 sub, NULL, NULL, i, i); 7292 if (tmp == NULL) { 7293 return(-1); 7294 } 7295 if (!xmlExpCheckCard(tmp, exp)) { 7296 xmlExpFree(ctxt, tmp); 7297 continue; 7298 } 7299 tmp2 = xmlExpExpDeriveInt(ctxt, tmp, exp); 7300 if (tmp2 == NULL) { 7301 xmlExpFree(ctxt, tmp); 7302 return(-1); 7303 } 7304 if ((tmp2 != forbiddenExp) && (IS_NILLABLE(tmp2))) { 7305 if (remain != NULL) 7306 *remain = tmp2; 7307 else 7308 xmlExpFree(ctxt, tmp2); 7309 if (mult != NULL) 7310 *mult = tmp; 7311 else 7312 xmlExpFree(ctxt, tmp); 7313 #ifdef DEBUG_DERIV 7314 printf("Divide succeeded %d\n", i); 7315 #endif 7316 return(i); 7317 } 7318 xmlExpFree(ctxt, tmp); 7319 xmlExpFree(ctxt, tmp2); 7320 } 7321 #ifdef DEBUG_DERIV 7322 printf("Divide failed\n"); 7323 #endif 7324 return(0); 7325 } 7326 7327 /** 7328 * xmlExpExpDeriveInt: 7329 * @ctxt: the expressions context 7330 * @exp: the englobing expression 7331 * @sub: the subexpression 7332 * 7333 * Try to do a step of Brzozowski derivation but at a higher level 7334 * the input being a subexpression. 7335 * 7336 * Returns the resulting expression or NULL in case of internal error 7337 */ 7338 static xmlExpNodePtr 7339 xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7340 xmlExpNodePtr ret, tmp, tmp2, tmp3; 7341 const xmlChar **tab; 7342 int len, i; 7343 7344 /* 7345 * In case of equality and if the expression can only consume a finite 7346 * amount, then the derivation is empty 7347 */ 7348 if ((exp == sub) && (exp->c_max >= 0)) { 7349 #ifdef DEBUG_DERIV 7350 printf("Equal(exp, sub) and finite -> Empty\n"); 7351 #endif 7352 return(emptyExp); 7353 } 7354 /* 7355 * decompose sub sequence first 7356 */ 7357 if (sub->type == XML_EXP_EMPTY) { 7358 #ifdef DEBUG_DERIV 7359 printf("Empty(sub) -> Empty\n"); 7360 #endif 7361 exp->ref++; 7362 return(exp); 7363 } 7364 if (sub->type == XML_EXP_SEQ) { 7365 #ifdef DEBUG_DERIV 7366 printf("Seq(sub) -> decompose\n"); 7367 #endif 7368 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left); 7369 if (tmp == NULL) 7370 return(NULL); 7371 if (tmp == forbiddenExp) 7372 return(tmp); 7373 ret = xmlExpExpDeriveInt(ctxt, tmp, sub->exp_right); 7374 xmlExpFree(ctxt, tmp); 7375 return(ret); 7376 } 7377 if (sub->type == XML_EXP_OR) { 7378 #ifdef DEBUG_DERIV 7379 printf("Or(sub) -> decompose\n"); 7380 #endif 7381 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left); 7382 if (tmp == forbiddenExp) 7383 return(tmp); 7384 if (tmp == NULL) 7385 return(NULL); 7386 ret = xmlExpExpDeriveInt(ctxt, exp, sub->exp_right); 7387 if ((ret == NULL) || (ret == forbiddenExp)) { 7388 xmlExpFree(ctxt, tmp); 7389 return(ret); 7390 } 7391 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, NULL, 0, 0)); 7392 } 7393 if (!xmlExpCheckCard(exp, sub)) { 7394 #ifdef DEBUG_DERIV 7395 printf("CheckCard(exp, sub) failed -> Forbid\n"); 7396 #endif 7397 return(forbiddenExp); 7398 } 7399 switch (exp->type) { 7400 case XML_EXP_EMPTY: 7401 if (sub == emptyExp) 7402 return(emptyExp); 7403 #ifdef DEBUG_DERIV 7404 printf("Empty(exp) -> Forbid\n"); 7405 #endif 7406 return(forbiddenExp); 7407 case XML_EXP_FORBID: 7408 #ifdef DEBUG_DERIV 7409 printf("Forbid(exp) -> Forbid\n"); 7410 #endif 7411 return(forbiddenExp); 7412 case XML_EXP_ATOM: 7413 if (sub->type == XML_EXP_ATOM) { 7414 /* TODO: handle wildcards */ 7415 if (exp->exp_str == sub->exp_str) { 7416 #ifdef DEBUG_DERIV 7417 printf("Atom match -> Empty\n"); 7418 #endif 7419 return(emptyExp); 7420 } 7421 #ifdef DEBUG_DERIV 7422 printf("Atom mismatch -> Forbid\n"); 7423 #endif 7424 return(forbiddenExp); 7425 } 7426 if ((sub->type == XML_EXP_COUNT) && 7427 (sub->exp_max == 1) && 7428 (sub->exp_left->type == XML_EXP_ATOM)) { 7429 /* TODO: handle wildcards */ 7430 if (exp->exp_str == sub->exp_left->exp_str) { 7431 #ifdef DEBUG_DERIV 7432 printf("Atom match -> Empty\n"); 7433 #endif 7434 return(emptyExp); 7435 } 7436 #ifdef DEBUG_DERIV 7437 printf("Atom mismatch -> Forbid\n"); 7438 #endif 7439 return(forbiddenExp); 7440 } 7441 #ifdef DEBUG_DERIV 7442 printf("Complex exp vs Atom -> Forbid\n"); 7443 #endif 7444 return(forbiddenExp); 7445 case XML_EXP_SEQ: 7446 /* try to get the sequence consumed only if possible */ 7447 if (xmlExpCheckCard(exp->exp_left, sub)) { 7448 /* See if the sequence can be consumed directly */ 7449 #ifdef DEBUG_DERIV 7450 printf("Seq trying left only\n"); 7451 #endif 7452 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 7453 if ((ret != forbiddenExp) && (ret != NULL)) { 7454 #ifdef DEBUG_DERIV 7455 printf("Seq trying left only worked\n"); 7456 #endif 7457 /* 7458 * TODO: assumption here that we are determinist 7459 * i.e. we won't get to a nillable exp left 7460 * subset which could be matched by the right 7461 * part too. 7462 * e.g.: (a | b)+,(a | c) and 'a+,a' 7463 */ 7464 exp->exp_right->ref++; 7465 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, 7466 exp->exp_right, NULL, 0, 0)); 7467 } 7468 #ifdef DEBUG_DERIV 7469 } else { 7470 printf("Seq: left too short\n"); 7471 #endif 7472 } 7473 /* Try instead to decompose */ 7474 if (sub->type == XML_EXP_COUNT) { 7475 int min, max; 7476 7477 #ifdef DEBUG_DERIV 7478 printf("Seq: sub is a count\n"); 7479 #endif 7480 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left); 7481 if (ret == NULL) 7482 return(NULL); 7483 if (ret != forbiddenExp) { 7484 #ifdef DEBUG_DERIV 7485 printf("Seq , Count match on left\n"); 7486 #endif 7487 if (sub->exp_max < 0) 7488 max = -1; 7489 else 7490 max = sub->exp_max -1; 7491 if (sub->exp_min > 0) 7492 min = sub->exp_min -1; 7493 else 7494 min = 0; 7495 exp->exp_right->ref++; 7496 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, 7497 exp->exp_right, NULL, 0, 0); 7498 if (tmp == NULL) 7499 return(NULL); 7500 7501 sub->exp_left->ref++; 7502 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, 7503 sub->exp_left, NULL, NULL, min, max); 7504 if (tmp2 == NULL) { 7505 xmlExpFree(ctxt, tmp); 7506 return(NULL); 7507 } 7508 ret = xmlExpExpDeriveInt(ctxt, tmp, tmp2); 7509 xmlExpFree(ctxt, tmp); 7510 xmlExpFree(ctxt, tmp2); 7511 return(ret); 7512 } 7513 } 7514 /* we made no progress on structured operations */ 7515 break; 7516 case XML_EXP_OR: 7517 #ifdef DEBUG_DERIV 7518 printf("Or , trying both side\n"); 7519 #endif 7520 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 7521 if (ret == NULL) 7522 return(NULL); 7523 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_right, sub); 7524 if (tmp == NULL) { 7525 xmlExpFree(ctxt, ret); 7526 return(NULL); 7527 } 7528 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp, NULL, 0, 0)); 7529 case XML_EXP_COUNT: { 7530 int min, max; 7531 7532 if (sub->type == XML_EXP_COUNT) { 7533 /* 7534 * Try to see if the loop is completely subsumed 7535 */ 7536 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left); 7537 if (tmp == NULL) 7538 return(NULL); 7539 if (tmp == forbiddenExp) { 7540 int mult; 7541 7542 #ifdef DEBUG_DERIV 7543 printf("Count, Count inner don't subsume\n"); 7544 #endif 7545 mult = xmlExpDivide(ctxt, sub->exp_left, exp->exp_left, 7546 NULL, &tmp); 7547 if (mult <= 0) { 7548 #ifdef DEBUG_DERIV 7549 printf("Count, Count not multiple => forbidden\n"); 7550 #endif 7551 return(forbiddenExp); 7552 } 7553 if (sub->exp_max == -1) { 7554 max = -1; 7555 if (exp->exp_max == -1) { 7556 if (exp->exp_min <= sub->exp_min * mult) 7557 min = 0; 7558 else 7559 min = exp->exp_min - sub->exp_min * mult; 7560 } else { 7561 #ifdef DEBUG_DERIV 7562 printf("Count, Count finite can't subsume infinite\n"); 7563 #endif 7564 xmlExpFree(ctxt, tmp); 7565 return(forbiddenExp); 7566 } 7567 } else { 7568 if (exp->exp_max == -1) { 7569 #ifdef DEBUG_DERIV 7570 printf("Infinite loop consume mult finite loop\n"); 7571 #endif 7572 if (exp->exp_min > sub->exp_min * mult) { 7573 max = -1; 7574 min = exp->exp_min - sub->exp_min * mult; 7575 } else { 7576 max = -1; 7577 min = 0; 7578 } 7579 } else { 7580 if (exp->exp_max < sub->exp_max * mult) { 7581 #ifdef DEBUG_DERIV 7582 printf("loops max mult mismatch => forbidden\n"); 7583 #endif 7584 xmlExpFree(ctxt, tmp); 7585 return(forbiddenExp); 7586 } 7587 if (sub->exp_max * mult > exp->exp_min) 7588 min = 0; 7589 else 7590 min = exp->exp_min - sub->exp_max * mult; 7591 max = exp->exp_max - sub->exp_max * mult; 7592 } 7593 } 7594 } else if (!IS_NILLABLE(tmp)) { 7595 /* 7596 * TODO: loop here to try to grow if working on finite 7597 * blocks. 7598 */ 7599 #ifdef DEBUG_DERIV 7600 printf("Count, Count remain not nillable => forbidden\n"); 7601 #endif 7602 xmlExpFree(ctxt, tmp); 7603 return(forbiddenExp); 7604 } else if (sub->exp_max == -1) { 7605 if (exp->exp_max == -1) { 7606 if (exp->exp_min <= sub->exp_min) { 7607 #ifdef DEBUG_DERIV 7608 printf("Infinite loops Okay => COUNT(0,Inf)\n"); 7609 #endif 7610 max = -1; 7611 min = 0; 7612 } else { 7613 #ifdef DEBUG_DERIV 7614 printf("Infinite loops min => Count(X,Inf)\n"); 7615 #endif 7616 max = -1; 7617 min = exp->exp_min - sub->exp_min; 7618 } 7619 } else if (exp->exp_min > sub->exp_min) { 7620 #ifdef DEBUG_DERIV 7621 printf("loops min mismatch 1 => forbidden ???\n"); 7622 #endif 7623 xmlExpFree(ctxt, tmp); 7624 return(forbiddenExp); 7625 } else { 7626 max = -1; 7627 min = 0; 7628 } 7629 } else { 7630 if (exp->exp_max == -1) { 7631 #ifdef DEBUG_DERIV 7632 printf("Infinite loop consume finite loop\n"); 7633 #endif 7634 if (exp->exp_min > sub->exp_min) { 7635 max = -1; 7636 min = exp->exp_min - sub->exp_min; 7637 } else { 7638 max = -1; 7639 min = 0; 7640 } 7641 } else { 7642 if (exp->exp_max < sub->exp_max) { 7643 #ifdef DEBUG_DERIV 7644 printf("loops max mismatch => forbidden\n"); 7645 #endif 7646 xmlExpFree(ctxt, tmp); 7647 return(forbiddenExp); 7648 } 7649 if (sub->exp_max > exp->exp_min) 7650 min = 0; 7651 else 7652 min = exp->exp_min - sub->exp_max; 7653 max = exp->exp_max - sub->exp_max; 7654 } 7655 } 7656 #ifdef DEBUG_DERIV 7657 printf("loops match => SEQ(COUNT())\n"); 7658 #endif 7659 exp->exp_left->ref++; 7660 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, 7661 NULL, NULL, min, max); 7662 if (tmp2 == NULL) { 7663 return(NULL); 7664 } 7665 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2, 7666 NULL, 0, 0); 7667 return(ret); 7668 } 7669 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 7670 if (tmp == NULL) 7671 return(NULL); 7672 if (tmp == forbiddenExp) { 7673 #ifdef DEBUG_DERIV 7674 printf("loop mismatch => forbidden\n"); 7675 #endif 7676 return(forbiddenExp); 7677 } 7678 if (exp->exp_min > 0) 7679 min = exp->exp_min - 1; 7680 else 7681 min = 0; 7682 if (exp->exp_max < 0) 7683 max = -1; 7684 else 7685 max = exp->exp_max - 1; 7686 7687 #ifdef DEBUG_DERIV 7688 printf("loop match => SEQ(COUNT())\n"); 7689 #endif 7690 exp->exp_left->ref++; 7691 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, 7692 NULL, NULL, min, max); 7693 if (tmp2 == NULL) 7694 return(NULL); 7695 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2, 7696 NULL, 0, 0); 7697 return(ret); 7698 } 7699 } 7700 7701 #ifdef DEBUG_DERIV 7702 printf("Fallback to derivative\n"); 7703 #endif 7704 if (IS_NILLABLE(sub)) { 7705 if (!(IS_NILLABLE(exp))) 7706 return(forbiddenExp); 7707 else 7708 ret = emptyExp; 7709 } else 7710 ret = NULL; 7711 /* 7712 * here the structured derivation made no progress so 7713 * we use the default token based derivation to force one more step 7714 */ 7715 if (ctxt->tabSize == 0) 7716 ctxt->tabSize = 40; 7717 7718 tab = (const xmlChar **) xmlMalloc(ctxt->tabSize * 7719 sizeof(const xmlChar *)); 7720 if (tab == NULL) { 7721 return(NULL); 7722 } 7723 7724 /* 7725 * collect all the strings accepted by the subexpression on input 7726 */ 7727 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0); 7728 while (len < 0) { 7729 const xmlChar **temp; 7730 temp = (const xmlChar **) xmlRealloc((xmlChar **) tab, ctxt->tabSize * 2 * 7731 sizeof(const xmlChar *)); 7732 if (temp == NULL) { 7733 xmlFree((xmlChar **) tab); 7734 return(NULL); 7735 } 7736 tab = temp; 7737 ctxt->tabSize *= 2; 7738 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0); 7739 } 7740 for (i = 0;i < len;i++) { 7741 tmp = xmlExpStringDeriveInt(ctxt, exp, tab[i]); 7742 if ((tmp == NULL) || (tmp == forbiddenExp)) { 7743 xmlExpFree(ctxt, ret); 7744 xmlFree((xmlChar **) tab); 7745 return(tmp); 7746 } 7747 tmp2 = xmlExpStringDeriveInt(ctxt, sub, tab[i]); 7748 if ((tmp2 == NULL) || (tmp2 == forbiddenExp)) { 7749 xmlExpFree(ctxt, tmp); 7750 xmlExpFree(ctxt, ret); 7751 xmlFree((xmlChar **) tab); 7752 return(tmp); 7753 } 7754 tmp3 = xmlExpExpDeriveInt(ctxt, tmp, tmp2); 7755 xmlExpFree(ctxt, tmp); 7756 xmlExpFree(ctxt, tmp2); 7757 7758 if ((tmp3 == NULL) || (tmp3 == forbiddenExp)) { 7759 xmlExpFree(ctxt, ret); 7760 xmlFree((xmlChar **) tab); 7761 return(tmp3); 7762 } 7763 7764 if (ret == NULL) 7765 ret = tmp3; 7766 else { 7767 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp3, NULL, 0, 0); 7768 if (ret == NULL) { 7769 xmlFree((xmlChar **) tab); 7770 return(NULL); 7771 } 7772 } 7773 } 7774 xmlFree((xmlChar **) tab); 7775 return(ret); 7776 } 7777 7778 /** 7779 * xmlExpExpDerive: 7780 * @ctxt: the expressions context 7781 * @exp: the englobing expression 7782 * @sub: the subexpression 7783 * 7784 * Evaluates the expression resulting from @exp consuming a sub expression @sub 7785 * Based on algebraic derivation and sometimes direct Brzozowski derivation 7786 * it usually takes less than linear time and can handle expressions generating 7787 * infinite languages. 7788 * 7789 * Returns the resulting expression or NULL in case of internal error, the 7790 * result must be freed 7791 */ 7792 xmlExpNodePtr 7793 xmlExpExpDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7794 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL)) 7795 return(NULL); 7796 7797 /* 7798 * O(1) speedups 7799 */ 7800 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) { 7801 #ifdef DEBUG_DERIV 7802 printf("Sub nillable and not exp : can't subsume\n"); 7803 #endif 7804 return(forbiddenExp); 7805 } 7806 if (xmlExpCheckCard(exp, sub) == 0) { 7807 #ifdef DEBUG_DERIV 7808 printf("sub generate longer sequences than exp : can't subsume\n"); 7809 #endif 7810 return(forbiddenExp); 7811 } 7812 return(xmlExpExpDeriveInt(ctxt, exp, sub)); 7813 } 7814 7815 /** 7816 * xmlExpSubsume: 7817 * @ctxt: the expressions context 7818 * @exp: the englobing expression 7819 * @sub: the subexpression 7820 * 7821 * Check whether @exp accepts all the languages accepted by @sub 7822 * the input being a subexpression. 7823 * 7824 * Returns 1 if true 0 if false and -1 in case of failure. 7825 */ 7826 int 7827 xmlExpSubsume(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7828 xmlExpNodePtr tmp; 7829 7830 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL)) 7831 return(-1); 7832 7833 /* 7834 * TODO: speedup by checking the language of sub is a subset of the 7835 * language of exp 7836 */ 7837 /* 7838 * O(1) speedups 7839 */ 7840 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) { 7841 #ifdef DEBUG_DERIV 7842 printf("Sub nillable and not exp : can't subsume\n"); 7843 #endif 7844 return(0); 7845 } 7846 if (xmlExpCheckCard(exp, sub) == 0) { 7847 #ifdef DEBUG_DERIV 7848 printf("sub generate longer sequences than exp : can't subsume\n"); 7849 #endif 7850 return(0); 7851 } 7852 tmp = xmlExpExpDeriveInt(ctxt, exp, sub); 7853 #ifdef DEBUG_DERIV 7854 printf("Result derivation :\n"); 7855 PRINT_EXP(tmp); 7856 #endif 7857 if (tmp == NULL) 7858 return(-1); 7859 if (tmp == forbiddenExp) 7860 return(0); 7861 if (tmp == emptyExp) 7862 return(1); 7863 if ((tmp != NULL) && (IS_NILLABLE(tmp))) { 7864 xmlExpFree(ctxt, tmp); 7865 return(1); 7866 } 7867 xmlExpFree(ctxt, tmp); 7868 return(0); 7869 } 7870 7871 /************************************************************************ 7872 * * 7873 * Parsing expression * 7874 * * 7875 ************************************************************************/ 7876 7877 static xmlExpNodePtr xmlExpParseExpr(xmlExpCtxtPtr ctxt); 7878 7879 #undef CUR 7880 #define CUR (*ctxt->cur) 7881 #undef NEXT 7882 #define NEXT ctxt->cur++; 7883 #undef IS_BLANK 7884 #define IS_BLANK(c) ((c == ' ') || (c == '\n') || (c == '\r') || (c == '\t')) 7885 #define SKIP_BLANKS while (IS_BLANK(*ctxt->cur)) ctxt->cur++; 7886 7887 static int 7888 xmlExpParseNumber(xmlExpCtxtPtr ctxt) { 7889 int ret = 0; 7890 7891 SKIP_BLANKS 7892 if (CUR == '*') { 7893 NEXT 7894 return(-1); 7895 } 7896 if ((CUR < '0') || (CUR > '9')) 7897 return(-1); 7898 while ((CUR >= '0') && (CUR <= '9')) { 7899 ret = ret * 10 + (CUR - '0'); 7900 NEXT 7901 } 7902 return(ret); 7903 } 7904 7905 static xmlExpNodePtr 7906 xmlExpParseOr(xmlExpCtxtPtr ctxt) { 7907 const char *base; 7908 xmlExpNodePtr ret; 7909 const xmlChar *val; 7910 7911 SKIP_BLANKS 7912 base = ctxt->cur; 7913 if (*ctxt->cur == '(') { 7914 NEXT 7915 ret = xmlExpParseExpr(ctxt); 7916 SKIP_BLANKS 7917 if (*ctxt->cur != ')') { 7918 fprintf(stderr, "unbalanced '(' : %s\n", base); 7919 xmlExpFree(ctxt, ret); 7920 return(NULL); 7921 } 7922 NEXT; 7923 SKIP_BLANKS 7924 goto parse_quantifier; 7925 } 7926 while ((CUR != 0) && (!(IS_BLANK(CUR))) && (CUR != '(') && 7927 (CUR != ')') && (CUR != '|') && (CUR != ',') && (CUR != '{') && 7928 (CUR != '*') && (CUR != '+') && (CUR != '?') && (CUR != '}')) 7929 NEXT; 7930 val = xmlDictLookup(ctxt->dict, BAD_CAST base, ctxt->cur - base); 7931 if (val == NULL) 7932 return(NULL); 7933 ret = xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, val, 0, 0); 7934 if (ret == NULL) 7935 return(NULL); 7936 SKIP_BLANKS 7937 parse_quantifier: 7938 if (CUR == '{') { 7939 int min, max; 7940 7941 NEXT 7942 min = xmlExpParseNumber(ctxt); 7943 if (min < 0) { 7944 xmlExpFree(ctxt, ret); 7945 return(NULL); 7946 } 7947 SKIP_BLANKS 7948 if (CUR == ',') { 7949 NEXT 7950 max = xmlExpParseNumber(ctxt); 7951 SKIP_BLANKS 7952 } else 7953 max = min; 7954 if (CUR != '}') { 7955 xmlExpFree(ctxt, ret); 7956 return(NULL); 7957 } 7958 NEXT 7959 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7960 min, max); 7961 SKIP_BLANKS 7962 } else if (CUR == '?') { 7963 NEXT 7964 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7965 0, 1); 7966 SKIP_BLANKS 7967 } else if (CUR == '+') { 7968 NEXT 7969 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7970 1, -1); 7971 SKIP_BLANKS 7972 } else if (CUR == '*') { 7973 NEXT 7974 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7975 0, -1); 7976 SKIP_BLANKS 7977 } 7978 return(ret); 7979 } 7980 7981 7982 static xmlExpNodePtr 7983 xmlExpParseSeq(xmlExpCtxtPtr ctxt) { 7984 xmlExpNodePtr ret, right; 7985 7986 ret = xmlExpParseOr(ctxt); 7987 SKIP_BLANKS 7988 while (CUR == '|') { 7989 NEXT 7990 right = xmlExpParseOr(ctxt); 7991 if (right == NULL) { 7992 xmlExpFree(ctxt, ret); 7993 return(NULL); 7994 } 7995 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, right, NULL, 0, 0); 7996 if (ret == NULL) 7997 return(NULL); 7998 } 7999 return(ret); 8000 } 8001 8002 static xmlExpNodePtr 8003 xmlExpParseExpr(xmlExpCtxtPtr ctxt) { 8004 xmlExpNodePtr ret, right; 8005 8006 ret = xmlExpParseSeq(ctxt); 8007 SKIP_BLANKS 8008 while (CUR == ',') { 8009 NEXT 8010 right = xmlExpParseSeq(ctxt); 8011 if (right == NULL) { 8012 xmlExpFree(ctxt, ret); 8013 return(NULL); 8014 } 8015 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, right, NULL, 0, 0); 8016 if (ret == NULL) 8017 return(NULL); 8018 } 8019 return(ret); 8020 } 8021 8022 /** 8023 * xmlExpParse: 8024 * @ctxt: the expressions context 8025 * @expr: the 0 terminated string 8026 * 8027 * Minimal parser for regexps, it understand the following constructs 8028 * - string terminals 8029 * - choice operator | 8030 * - sequence operator , 8031 * - subexpressions (...) 8032 * - usual cardinality operators + * and ? 8033 * - finite sequences { min, max } 8034 * - infinite sequences { min, * } 8035 * There is minimal checkings made especially no checking on strings values 8036 * 8037 * Returns a new expression or NULL in case of failure 8038 */ 8039 xmlExpNodePtr 8040 xmlExpParse(xmlExpCtxtPtr ctxt, const char *expr) { 8041 xmlExpNodePtr ret; 8042 8043 ctxt->expr = expr; 8044 ctxt->cur = expr; 8045 8046 ret = xmlExpParseExpr(ctxt); 8047 SKIP_BLANKS 8048 if (*ctxt->cur != 0) { 8049 xmlExpFree(ctxt, ret); 8050 return(NULL); 8051 } 8052 return(ret); 8053 } 8054 8055 static void 8056 xmlExpDumpInt(xmlBufferPtr buf, xmlExpNodePtr expr, int glob) { 8057 xmlExpNodePtr c; 8058 8059 if (expr == NULL) return; 8060 if (glob) xmlBufferWriteChar(buf, "("); 8061 switch (expr->type) { 8062 case XML_EXP_EMPTY: 8063 xmlBufferWriteChar(buf, "empty"); 8064 break; 8065 case XML_EXP_FORBID: 8066 xmlBufferWriteChar(buf, "forbidden"); 8067 break; 8068 case XML_EXP_ATOM: 8069 xmlBufferWriteCHAR(buf, expr->exp_str); 8070 break; 8071 case XML_EXP_SEQ: 8072 c = expr->exp_left; 8073 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8074 xmlExpDumpInt(buf, c, 1); 8075 else 8076 xmlExpDumpInt(buf, c, 0); 8077 xmlBufferWriteChar(buf, " , "); 8078 c = expr->exp_right; 8079 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8080 xmlExpDumpInt(buf, c, 1); 8081 else 8082 xmlExpDumpInt(buf, c, 0); 8083 break; 8084 case XML_EXP_OR: 8085 c = expr->exp_left; 8086 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8087 xmlExpDumpInt(buf, c, 1); 8088 else 8089 xmlExpDumpInt(buf, c, 0); 8090 xmlBufferWriteChar(buf, " | "); 8091 c = expr->exp_right; 8092 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8093 xmlExpDumpInt(buf, c, 1); 8094 else 8095 xmlExpDumpInt(buf, c, 0); 8096 break; 8097 case XML_EXP_COUNT: { 8098 char rep[40]; 8099 8100 c = expr->exp_left; 8101 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8102 xmlExpDumpInt(buf, c, 1); 8103 else 8104 xmlExpDumpInt(buf, c, 0); 8105 if ((expr->exp_min == 0) && (expr->exp_max == 1)) { 8106 rep[0] = '?'; 8107 rep[1] = 0; 8108 } else if ((expr->exp_min == 0) && (expr->exp_max == -1)) { 8109 rep[0] = '*'; 8110 rep[1] = 0; 8111 } else if ((expr->exp_min == 1) && (expr->exp_max == -1)) { 8112 rep[0] = '+'; 8113 rep[1] = 0; 8114 } else if (expr->exp_max == expr->exp_min) { 8115 snprintf(rep, 39, "{%d}", expr->exp_min); 8116 } else if (expr->exp_max < 0) { 8117 snprintf(rep, 39, "{%d,inf}", expr->exp_min); 8118 } else { 8119 snprintf(rep, 39, "{%d,%d}", expr->exp_min, expr->exp_max); 8120 } 8121 rep[39] = 0; 8122 xmlBufferWriteChar(buf, rep); 8123 break; 8124 } 8125 default: 8126 fprintf(stderr, "Error in tree\n"); 8127 } 8128 if (glob) 8129 xmlBufferWriteChar(buf, ")"); 8130 } 8131 /** 8132 * xmlExpDump: 8133 * @buf: a buffer to receive the output 8134 * @expr: the compiled expression 8135 * 8136 * Serialize the expression as compiled to the buffer 8137 */ 8138 void 8139 xmlExpDump(xmlBufferPtr buf, xmlExpNodePtr expr) { 8140 if ((buf == NULL) || (expr == NULL)) 8141 return; 8142 xmlExpDumpInt(buf, expr, 0); 8143 } 8144 8145 /** 8146 * xmlExpMaxToken: 8147 * @expr: a compiled expression 8148 * 8149 * Indicate the maximum number of input a expression can accept 8150 * 8151 * Returns the maximum length or -1 in case of error 8152 */ 8153 int 8154 xmlExpMaxToken(xmlExpNodePtr expr) { 8155 if (expr == NULL) 8156 return(-1); 8157 return(expr->c_max); 8158 } 8159 8160 /** 8161 * xmlExpCtxtNbNodes: 8162 * @ctxt: an expression context 8163 * 8164 * Debugging facility provides the number of allocated nodes at a that point 8165 * 8166 * Returns the number of nodes in use or -1 in case of error 8167 */ 8168 int 8169 xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt) { 8170 if (ctxt == NULL) 8171 return(-1); 8172 return(ctxt->nb_nodes); 8173 } 8174 8175 /** 8176 * xmlExpCtxtNbCons: 8177 * @ctxt: an expression context 8178 * 8179 * Debugging facility provides the number of allocated nodes over lifetime 8180 * 8181 * Returns the number of nodes ever allocated or -1 in case of error 8182 */ 8183 int 8184 xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt) { 8185 if (ctxt == NULL) 8186 return(-1); 8187 return(ctxt->nb_cons); 8188 } 8189 8190 #endif /* LIBXML_EXPR_ENABLED */ 8191 #define bottom_xmlregexp 8192 #include "elfgcchack.h" 8193 #endif /* LIBXML_REGEXP_ENABLED */ 8194