xref: /reactos/sdk/lib/3rdparty/libxml2/xmlregexp.c (revision 911153da)
1 /*
2  * regexp.c: generic and extensible Regular Expression engine
3  *
4  * Basically designed with the purpose of compiling regexps for
5  * the variety of validation/schemas mechanisms now available in
6  * XML related specifications these include:
7  *    - XML-1.0 DTD validation
8  *    - XML Schemas structure part 1
9  *    - XML Schemas Datatypes part 2 especially Appendix F
10  *    - RELAX-NG/TREX i.e. the counter proposal
11  *
12  * See Copyright for the status of this software.
13  *
14  * Daniel Veillard <veillard@redhat.com>
15  */
16 
17 #define IN_LIBXML
18 #include "libxml.h"
19 
20 #ifdef LIBXML_REGEXP_ENABLED
21 
22 /* #define DEBUG_ERR */
23 
24 #include <stdio.h>
25 #include <string.h>
26 #include <limits.h>
27 
28 #include <libxml/tree.h>
29 #include <libxml/parserInternals.h>
30 #include <libxml/xmlregexp.h>
31 #include <libxml/xmlautomata.h>
32 #include <libxml/xmlunicode.h>
33 
34 #ifndef SIZE_MAX
35 #define SIZE_MAX ((size_t) -1)
36 #endif
37 
38 /* #define DEBUG_REGEXP_GRAPH */
39 /* #define DEBUG_REGEXP_EXEC */
40 /* #define DEBUG_PUSH */
41 /* #define DEBUG_COMPACTION */
42 
43 #define MAX_PUSH 10000000
44 
45 #ifdef ERROR
46 #undef ERROR
47 #endif
48 #define ERROR(str)							\
49     ctxt->error = XML_REGEXP_COMPILE_ERROR;				\
50     xmlRegexpErrCompile(ctxt, str);
51 #define NEXT ctxt->cur++
52 #define CUR (*(ctxt->cur))
53 #define NXT(index) (ctxt->cur[index])
54 
55 #define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l)
56 #define NEXTL(l) ctxt->cur += l;
57 #define XML_REG_STRING_SEPARATOR '|'
58 /*
59  * Need PREV to check on a '-' within a Character Group. May only be used
60  * when it's guaranteed that cur is not at the beginning of ctxt->string!
61  */
62 #define PREV (ctxt->cur[-1])
63 
64 /**
65  * TODO:
66  *
67  * macro to flag unimplemented blocks
68  */
69 #define TODO								\
70     xmlGenericError(xmlGenericErrorContext,				\
71 	    "Unimplemented block at %s:%d\n",				\
72             __FILE__, __LINE__);
73 
74 /************************************************************************
75  *									*
76  *			Datatypes and structures			*
77  *									*
78  ************************************************************************/
79 
80 /*
81  * Note: the order of the enums below is significant, do not shuffle
82  */
83 typedef enum {
84     XML_REGEXP_EPSILON = 1,
85     XML_REGEXP_CHARVAL,
86     XML_REGEXP_RANGES,
87     XML_REGEXP_SUBREG,  /* used for () sub regexps */
88     XML_REGEXP_STRING,
89     XML_REGEXP_ANYCHAR, /* . */
90     XML_REGEXP_ANYSPACE, /* \s */
91     XML_REGEXP_NOTSPACE, /* \S */
92     XML_REGEXP_INITNAME, /* \l */
93     XML_REGEXP_NOTINITNAME, /* \L */
94     XML_REGEXP_NAMECHAR, /* \c */
95     XML_REGEXP_NOTNAMECHAR, /* \C */
96     XML_REGEXP_DECIMAL, /* \d */
97     XML_REGEXP_NOTDECIMAL, /* \D */
98     XML_REGEXP_REALCHAR, /* \w */
99     XML_REGEXP_NOTREALCHAR, /* \W */
100     XML_REGEXP_LETTER = 100,
101     XML_REGEXP_LETTER_UPPERCASE,
102     XML_REGEXP_LETTER_LOWERCASE,
103     XML_REGEXP_LETTER_TITLECASE,
104     XML_REGEXP_LETTER_MODIFIER,
105     XML_REGEXP_LETTER_OTHERS,
106     XML_REGEXP_MARK,
107     XML_REGEXP_MARK_NONSPACING,
108     XML_REGEXP_MARK_SPACECOMBINING,
109     XML_REGEXP_MARK_ENCLOSING,
110     XML_REGEXP_NUMBER,
111     XML_REGEXP_NUMBER_DECIMAL,
112     XML_REGEXP_NUMBER_LETTER,
113     XML_REGEXP_NUMBER_OTHERS,
114     XML_REGEXP_PUNCT,
115     XML_REGEXP_PUNCT_CONNECTOR,
116     XML_REGEXP_PUNCT_DASH,
117     XML_REGEXP_PUNCT_OPEN,
118     XML_REGEXP_PUNCT_CLOSE,
119     XML_REGEXP_PUNCT_INITQUOTE,
120     XML_REGEXP_PUNCT_FINQUOTE,
121     XML_REGEXP_PUNCT_OTHERS,
122     XML_REGEXP_SEPAR,
123     XML_REGEXP_SEPAR_SPACE,
124     XML_REGEXP_SEPAR_LINE,
125     XML_REGEXP_SEPAR_PARA,
126     XML_REGEXP_SYMBOL,
127     XML_REGEXP_SYMBOL_MATH,
128     XML_REGEXP_SYMBOL_CURRENCY,
129     XML_REGEXP_SYMBOL_MODIFIER,
130     XML_REGEXP_SYMBOL_OTHERS,
131     XML_REGEXP_OTHER,
132     XML_REGEXP_OTHER_CONTROL,
133     XML_REGEXP_OTHER_FORMAT,
134     XML_REGEXP_OTHER_PRIVATE,
135     XML_REGEXP_OTHER_NA,
136     XML_REGEXP_BLOCK_NAME
137 } xmlRegAtomType;
138 
139 typedef enum {
140     XML_REGEXP_QUANT_EPSILON = 1,
141     XML_REGEXP_QUANT_ONCE,
142     XML_REGEXP_QUANT_OPT,
143     XML_REGEXP_QUANT_MULT,
144     XML_REGEXP_QUANT_PLUS,
145     XML_REGEXP_QUANT_ONCEONLY,
146     XML_REGEXP_QUANT_ALL,
147     XML_REGEXP_QUANT_RANGE
148 } xmlRegQuantType;
149 
150 typedef enum {
151     XML_REGEXP_START_STATE = 1,
152     XML_REGEXP_FINAL_STATE,
153     XML_REGEXP_TRANS_STATE,
154     XML_REGEXP_SINK_STATE,
155     XML_REGEXP_UNREACH_STATE
156 } xmlRegStateType;
157 
158 typedef enum {
159     XML_REGEXP_MARK_NORMAL = 0,
160     XML_REGEXP_MARK_START,
161     XML_REGEXP_MARK_VISITED
162 } xmlRegMarkedType;
163 
164 typedef struct _xmlRegRange xmlRegRange;
165 typedef xmlRegRange *xmlRegRangePtr;
166 
167 struct _xmlRegRange {
168     int neg;		/* 0 normal, 1 not, 2 exclude */
169     xmlRegAtomType type;
170     int start;
171     int end;
172     xmlChar *blockName;
173 };
174 
175 typedef struct _xmlRegAtom xmlRegAtom;
176 typedef xmlRegAtom *xmlRegAtomPtr;
177 
178 typedef struct _xmlAutomataState xmlRegState;
179 typedef xmlRegState *xmlRegStatePtr;
180 
181 struct _xmlRegAtom {
182     int no;
183     xmlRegAtomType type;
184     xmlRegQuantType quant;
185     int min;
186     int max;
187 
188     void *valuep;
189     void *valuep2;
190     int neg;
191     int codepoint;
192     xmlRegStatePtr start;
193     xmlRegStatePtr start0;
194     xmlRegStatePtr stop;
195     int maxRanges;
196     int nbRanges;
197     xmlRegRangePtr *ranges;
198     void *data;
199 };
200 
201 typedef struct _xmlRegCounter xmlRegCounter;
202 typedef xmlRegCounter *xmlRegCounterPtr;
203 
204 struct _xmlRegCounter {
205     int min;
206     int max;
207 };
208 
209 typedef struct _xmlRegTrans xmlRegTrans;
210 typedef xmlRegTrans *xmlRegTransPtr;
211 
212 struct _xmlRegTrans {
213     xmlRegAtomPtr atom;
214     int to;
215     int counter;
216     int count;
217     int nd;
218 };
219 
220 struct _xmlAutomataState {
221     xmlRegStateType type;
222     xmlRegMarkedType mark;
223     xmlRegMarkedType markd;
224     xmlRegMarkedType reached;
225     int no;
226     int maxTrans;
227     int nbTrans;
228     xmlRegTrans *trans;
229     /*  knowing states pointing to us can speed things up */
230     int maxTransTo;
231     int nbTransTo;
232     int *transTo;
233 };
234 
235 typedef struct _xmlAutomata xmlRegParserCtxt;
236 typedef xmlRegParserCtxt *xmlRegParserCtxtPtr;
237 
238 #define AM_AUTOMATA_RNG 1
239 
240 struct _xmlAutomata {
241     xmlChar *string;
242     xmlChar *cur;
243 
244     int error;
245     int neg;
246 
247     xmlRegStatePtr start;
248     xmlRegStatePtr end;
249     xmlRegStatePtr state;
250 
251     xmlRegAtomPtr atom;
252 
253     int maxAtoms;
254     int nbAtoms;
255     xmlRegAtomPtr *atoms;
256 
257     int maxStates;
258     int nbStates;
259     xmlRegStatePtr *states;
260 
261     int maxCounters;
262     int nbCounters;
263     xmlRegCounter *counters;
264 
265     int determinist;
266     int negs;
267     int flags;
268 
269     int depth;
270 };
271 
272 struct _xmlRegexp {
273     xmlChar *string;
274     int nbStates;
275     xmlRegStatePtr *states;
276     int nbAtoms;
277     xmlRegAtomPtr *atoms;
278     int nbCounters;
279     xmlRegCounter *counters;
280     int determinist;
281     int flags;
282     /*
283      * That's the compact form for determinists automatas
284      */
285     int nbstates;
286     int *compact;
287     void **transdata;
288     int nbstrings;
289     xmlChar **stringMap;
290 };
291 
292 typedef struct _xmlRegExecRollback xmlRegExecRollback;
293 typedef xmlRegExecRollback *xmlRegExecRollbackPtr;
294 
295 struct _xmlRegExecRollback {
296     xmlRegStatePtr state;/* the current state */
297     int index;		/* the index in the input stack */
298     int nextbranch;	/* the next transition to explore in that state */
299     int *counts;	/* save the automata state if it has some */
300 };
301 
302 typedef struct _xmlRegInputToken xmlRegInputToken;
303 typedef xmlRegInputToken *xmlRegInputTokenPtr;
304 
305 struct _xmlRegInputToken {
306     xmlChar *value;
307     void *data;
308 };
309 
310 struct _xmlRegExecCtxt {
311     int status;		/* execution status != 0 indicate an error */
312     int determinist;	/* did we find an indeterministic behaviour */
313     xmlRegexpPtr comp;	/* the compiled regexp */
314     xmlRegExecCallbacks callback;
315     void *data;
316 
317     xmlRegStatePtr state;/* the current state */
318     int transno;	/* the current transition on that state */
319     int transcount;	/* the number of chars in char counted transitions */
320 
321     /*
322      * A stack of rollback states
323      */
324     int maxRollbacks;
325     int nbRollbacks;
326     xmlRegExecRollback *rollbacks;
327 
328     /*
329      * The state of the automata if any
330      */
331     int *counts;
332 
333     /*
334      * The input stack
335      */
336     int inputStackMax;
337     int inputStackNr;
338     int index;
339     int *charStack;
340     const xmlChar *inputString; /* when operating on characters */
341     xmlRegInputTokenPtr inputStack;/* when operating on strings */
342 
343     /*
344      * error handling
345      */
346     int errStateNo;		/* the error state number */
347     xmlRegStatePtr errState;    /* the error state */
348     xmlChar *errString;		/* the string raising the error */
349     int *errCounts;		/* counters at the error state */
350     int nbPush;
351 };
352 
353 #define REGEXP_ALL_COUNTER	0x123456
354 #define REGEXP_ALL_LAX_COUNTER	0x123457
355 
356 static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
357 static void xmlRegFreeState(xmlRegStatePtr state);
358 static void xmlRegFreeAtom(xmlRegAtomPtr atom);
359 static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr);
360 static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint);
361 static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint,
362                   int neg, int start, int end, const xmlChar *blockName);
363 
364 void xmlAutomataSetFlags(xmlAutomataPtr am, int flags);
365 
366 /************************************************************************
367  *									*
368  *		Regexp memory error handler				*
369  *									*
370  ************************************************************************/
371 /**
372  * xmlRegexpErrMemory:
373  * @extra:  extra information
374  *
375  * Handle an out of memory condition
376  */
377 static void
xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt,const char * extra)378 xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra)
379 {
380     const char *regexp = NULL;
381     if (ctxt != NULL) {
382         regexp = (const char *) ctxt->string;
383 	ctxt->error = XML_ERR_NO_MEMORY;
384     }
385     __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
386 		    XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
387 		    regexp, NULL, 0, 0,
388 		    "Memory allocation failed : %s\n", extra);
389 }
390 
391 /**
392  * xmlRegexpErrCompile:
393  * @extra:  extra information
394  *
395  * Handle a compilation failure
396  */
397 static void
xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt,const char * extra)398 xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra)
399 {
400     const char *regexp = NULL;
401     int idx = 0;
402 
403     if (ctxt != NULL) {
404         regexp = (const char *) ctxt->string;
405 	idx = ctxt->cur - ctxt->string;
406 	ctxt->error = XML_REGEXP_COMPILE_ERROR;
407     }
408     __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
409 		    XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra,
410 		    regexp, NULL, idx, 0,
411 		    "failed to compile: %s\n", extra);
412 }
413 
414 /************************************************************************
415  *									*
416  *			Allocation/Deallocation				*
417  *									*
418  ************************************************************************/
419 
420 static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
421 
422 /**
423  * xmlRegCalloc2:
424  * @dim1:  size of first dimension
425  * @dim2:  size of second dimension
426  * @elemSize:  size of element
427  *
428  * Allocate a two-dimensional array and set all elements to zero.
429  *
430  * Returns the new array or NULL in case of error.
431  */
432 static void*
xmlRegCalloc2(size_t dim1,size_t dim2,size_t elemSize)433 xmlRegCalloc2(size_t dim1, size_t dim2, size_t elemSize) {
434     size_t totalSize;
435     void *ret;
436 
437     /* Check for overflow */
438     if (dim1 > SIZE_MAX / dim2 / elemSize)
439         return (NULL);
440     totalSize = dim1 * dim2 * elemSize;
441     ret = xmlMalloc(totalSize);
442     if (ret != NULL)
443         memset(ret, 0, totalSize);
444     return (ret);
445 }
446 
447 /**
448  * xmlRegEpxFromParse:
449  * @ctxt:  the parser context used to build it
450  *
451  * Allocate a new regexp and fill it with the result from the parser
452  *
453  * Returns the new regexp or NULL in case of error
454  */
455 static xmlRegexpPtr
xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt)456 xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
457     xmlRegexpPtr ret;
458 
459     ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp));
460     if (ret == NULL) {
461 	xmlRegexpErrMemory(ctxt, "compiling regexp");
462 	return(NULL);
463     }
464     memset(ret, 0, sizeof(xmlRegexp));
465     ret->string = ctxt->string;
466     ret->nbStates = ctxt->nbStates;
467     ret->states = ctxt->states;
468     ret->nbAtoms = ctxt->nbAtoms;
469     ret->atoms = ctxt->atoms;
470     ret->nbCounters = ctxt->nbCounters;
471     ret->counters = ctxt->counters;
472     ret->determinist = ctxt->determinist;
473     ret->flags = ctxt->flags;
474     if (ret->determinist == -1) {
475         xmlRegexpIsDeterminist(ret);
476     }
477 
478     if ((ret->determinist != 0) &&
479 	(ret->nbCounters == 0) &&
480 	(ctxt->negs == 0) &&
481 	(ret->atoms != NULL) &&
482 	(ret->atoms[0] != NULL) &&
483 	(ret->atoms[0]->type == XML_REGEXP_STRING)) {
484 	int i, j, nbstates = 0, nbatoms = 0;
485 	int *stateRemap;
486 	int *stringRemap;
487 	int *transitions;
488 	void **transdata;
489 	xmlChar **stringMap;
490         xmlChar *value;
491 
492 	/*
493 	 * Switch to a compact representation
494 	 * 1/ counting the effective number of states left
495 	 * 2/ counting the unique number of atoms, and check that
496 	 *    they are all of the string type
497 	 * 3/ build a table state x atom for the transitions
498 	 */
499 
500 	stateRemap = xmlMalloc(ret->nbStates * sizeof(int));
501 	if (stateRemap == NULL) {
502 	    xmlRegexpErrMemory(ctxt, "compiling regexp");
503 	    xmlFree(ret);
504 	    return(NULL);
505 	}
506 	for (i = 0;i < ret->nbStates;i++) {
507 	    if (ret->states[i] != NULL) {
508 		stateRemap[i] = nbstates;
509 		nbstates++;
510 	    } else {
511 		stateRemap[i] = -1;
512 	    }
513 	}
514 #ifdef DEBUG_COMPACTION
515 	printf("Final: %d states\n", nbstates);
516 #endif
517 	stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *));
518 	if (stringMap == NULL) {
519 	    xmlRegexpErrMemory(ctxt, "compiling regexp");
520 	    xmlFree(stateRemap);
521 	    xmlFree(ret);
522 	    return(NULL);
523 	}
524 	stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int));
525 	if (stringRemap == NULL) {
526 	    xmlRegexpErrMemory(ctxt, "compiling regexp");
527 	    xmlFree(stringMap);
528 	    xmlFree(stateRemap);
529 	    xmlFree(ret);
530 	    return(NULL);
531 	}
532 	for (i = 0;i < ret->nbAtoms;i++) {
533 	    if ((ret->atoms[i]->type == XML_REGEXP_STRING) &&
534 		(ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) {
535 		value = ret->atoms[i]->valuep;
536                 for (j = 0;j < nbatoms;j++) {
537 		    if (xmlStrEqual(stringMap[j], value)) {
538 			stringRemap[i] = j;
539 			break;
540 		    }
541 		}
542 		if (j >= nbatoms) {
543 		    stringRemap[i] = nbatoms;
544 		    stringMap[nbatoms] = xmlStrdup(value);
545 		    if (stringMap[nbatoms] == NULL) {
546 			for (i = 0;i < nbatoms;i++)
547 			    xmlFree(stringMap[i]);
548 			xmlFree(stringRemap);
549 			xmlFree(stringMap);
550 			xmlFree(stateRemap);
551 			xmlFree(ret);
552 			return(NULL);
553 		    }
554 		    nbatoms++;
555 		}
556 	    } else {
557 		xmlFree(stateRemap);
558 		xmlFree(stringRemap);
559 		for (i = 0;i < nbatoms;i++)
560 		    xmlFree(stringMap[i]);
561 		xmlFree(stringMap);
562 		xmlFree(ret);
563 		return(NULL);
564 	    }
565 	}
566 #ifdef DEBUG_COMPACTION
567 	printf("Final: %d atoms\n", nbatoms);
568 #endif
569 	transitions = (int *) xmlRegCalloc2(nbstates + 1, nbatoms + 1,
570                                             sizeof(int));
571 	if (transitions == NULL) {
572 	    xmlFree(stateRemap);
573 	    xmlFree(stringRemap);
574             for (i = 0;i < nbatoms;i++)
575 		xmlFree(stringMap[i]);
576 	    xmlFree(stringMap);
577 	    xmlFree(ret);
578 	    return(NULL);
579 	}
580 
581 	/*
582 	 * Allocate the transition table. The first entry for each
583 	 * state corresponds to the state type.
584 	 */
585 	transdata = NULL;
586 
587 	for (i = 0;i < ret->nbStates;i++) {
588 	    int stateno, atomno, targetno, prev;
589 	    xmlRegStatePtr state;
590 	    xmlRegTransPtr trans;
591 
592 	    stateno = stateRemap[i];
593 	    if (stateno == -1)
594 		continue;
595 	    state = ret->states[i];
596 
597 	    transitions[stateno * (nbatoms + 1)] = state->type;
598 
599 	    for (j = 0;j < state->nbTrans;j++) {
600 		trans = &(state->trans[j]);
601 		if ((trans->to == -1) || (trans->atom == NULL))
602 		    continue;
603                 atomno = stringRemap[trans->atom->no];
604 		if ((trans->atom->data != NULL) && (transdata == NULL)) {
605 		    transdata = (void **) xmlRegCalloc2(nbstates, nbatoms,
606 			                                sizeof(void *));
607 		    if (transdata == NULL) {
608 			xmlRegexpErrMemory(ctxt, "compiling regexp");
609 			break;
610 		    }
611 		}
612 		targetno = stateRemap[trans->to];
613 		/*
614 		 * if the same atom can generate transitions to 2 different
615 		 * states then it means the automata is not deterministic and
616 		 * the compact form can't be used !
617 		 */
618 		prev = transitions[stateno * (nbatoms + 1) + atomno + 1];
619 		if (prev != 0) {
620 		    if (prev != targetno + 1) {
621 			ret->determinist = 0;
622 #ifdef DEBUG_COMPACTION
623 			printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n",
624 			       i, j, trans->atom->no, trans->to, atomno, targetno);
625 			printf("       previous to is %d\n", prev);
626 #endif
627 			if (transdata != NULL)
628 			    xmlFree(transdata);
629 			xmlFree(transitions);
630 			xmlFree(stateRemap);
631 			xmlFree(stringRemap);
632 			for (i = 0;i < nbatoms;i++)
633 			    xmlFree(stringMap[i]);
634 			xmlFree(stringMap);
635 			goto not_determ;
636 		    }
637 		} else {
638 #if 0
639 		    printf("State %d trans %d: atom %d to %d : %d to %d\n",
640 			   i, j, trans->atom->no, trans->to, atomno, targetno);
641 #endif
642 		    transitions[stateno * (nbatoms + 1) + atomno + 1] =
643 			targetno + 1; /* to avoid 0 */
644 		    if (transdata != NULL)
645 			transdata[stateno * nbatoms + atomno] =
646 			    trans->atom->data;
647 		}
648 	    }
649 	}
650 	ret->determinist = 1;
651 #ifdef DEBUG_COMPACTION
652 	/*
653 	 * Debug
654 	 */
655 	for (i = 0;i < nbstates;i++) {
656 	    for (j = 0;j < nbatoms + 1;j++) {
657                 printf("%02d ", transitions[i * (nbatoms + 1) + j]);
658 	    }
659 	    printf("\n");
660 	}
661 	printf("\n");
662 #endif
663 	/*
664 	 * Cleanup of the old data
665 	 */
666 	if (ret->states != NULL) {
667 	    for (i = 0;i < ret->nbStates;i++)
668 		xmlRegFreeState(ret->states[i]);
669 	    xmlFree(ret->states);
670 	}
671 	ret->states = NULL;
672 	ret->nbStates = 0;
673 	if (ret->atoms != NULL) {
674 	    for (i = 0;i < ret->nbAtoms;i++)
675 		xmlRegFreeAtom(ret->atoms[i]);
676 	    xmlFree(ret->atoms);
677 	}
678 	ret->atoms = NULL;
679 	ret->nbAtoms = 0;
680 
681 	ret->compact = transitions;
682 	ret->transdata = transdata;
683 	ret->stringMap = stringMap;
684 	ret->nbstrings = nbatoms;
685 	ret->nbstates = nbstates;
686 	xmlFree(stateRemap);
687 	xmlFree(stringRemap);
688     }
689 not_determ:
690     ctxt->string = NULL;
691     ctxt->nbStates = 0;
692     ctxt->states = NULL;
693     ctxt->nbAtoms = 0;
694     ctxt->atoms = NULL;
695     ctxt->nbCounters = 0;
696     ctxt->counters = NULL;
697     return(ret);
698 }
699 
700 /**
701  * xmlRegNewParserCtxt:
702  * @string:  the string to parse
703  *
704  * Allocate a new regexp parser context
705  *
706  * Returns the new context or NULL in case of error
707  */
708 static xmlRegParserCtxtPtr
xmlRegNewParserCtxt(const xmlChar * string)709 xmlRegNewParserCtxt(const xmlChar *string) {
710     xmlRegParserCtxtPtr ret;
711 
712     ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt));
713     if (ret == NULL)
714 	return(NULL);
715     memset(ret, 0, sizeof(xmlRegParserCtxt));
716     if (string != NULL)
717 	ret->string = xmlStrdup(string);
718     ret->cur = ret->string;
719     ret->neg = 0;
720     ret->negs = 0;
721     ret->error = 0;
722     ret->determinist = -1;
723     return(ret);
724 }
725 
726 /**
727  * xmlRegNewRange:
728  * @ctxt:  the regexp parser context
729  * @neg:  is that negative
730  * @type:  the type of range
731  * @start:  the start codepoint
732  * @end:  the end codepoint
733  *
734  * Allocate a new regexp range
735  *
736  * Returns the new range or NULL in case of error
737  */
738 static xmlRegRangePtr
xmlRegNewRange(xmlRegParserCtxtPtr ctxt,int neg,xmlRegAtomType type,int start,int end)739 xmlRegNewRange(xmlRegParserCtxtPtr ctxt,
740 	       int neg, xmlRegAtomType type, int start, int end) {
741     xmlRegRangePtr ret;
742 
743     ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange));
744     if (ret == NULL) {
745 	xmlRegexpErrMemory(ctxt, "allocating range");
746 	return(NULL);
747     }
748     ret->neg = neg;
749     ret->type = type;
750     ret->start = start;
751     ret->end = end;
752     return(ret);
753 }
754 
755 /**
756  * xmlRegFreeRange:
757  * @range:  the regexp range
758  *
759  * Free a regexp range
760  */
761 static void
xmlRegFreeRange(xmlRegRangePtr range)762 xmlRegFreeRange(xmlRegRangePtr range) {
763     if (range == NULL)
764 	return;
765 
766     if (range->blockName != NULL)
767 	xmlFree(range->blockName);
768     xmlFree(range);
769 }
770 
771 /**
772  * xmlRegCopyRange:
773  * @range:  the regexp range
774  *
775  * Copy a regexp range
776  *
777  * Returns the new copy or NULL in case of error.
778  */
779 static xmlRegRangePtr
xmlRegCopyRange(xmlRegParserCtxtPtr ctxt,xmlRegRangePtr range)780 xmlRegCopyRange(xmlRegParserCtxtPtr ctxt, xmlRegRangePtr range) {
781     xmlRegRangePtr ret;
782 
783     if (range == NULL)
784 	return(NULL);
785 
786     ret = xmlRegNewRange(ctxt, range->neg, range->type, range->start,
787                          range->end);
788     if (ret == NULL)
789         return(NULL);
790     if (range->blockName != NULL) {
791 	ret->blockName = xmlStrdup(range->blockName);
792 	if (ret->blockName == NULL) {
793 	    xmlRegexpErrMemory(ctxt, "allocating range");
794 	    xmlRegFreeRange(ret);
795 	    return(NULL);
796 	}
797     }
798     return(ret);
799 }
800 
801 /**
802  * xmlRegNewAtom:
803  * @ctxt:  the regexp parser context
804  * @type:  the type of atom
805  *
806  * Allocate a new atom
807  *
808  * Returns the new atom or NULL in case of error
809  */
810 static xmlRegAtomPtr
xmlRegNewAtom(xmlRegParserCtxtPtr ctxt,xmlRegAtomType type)811 xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) {
812     xmlRegAtomPtr ret;
813 
814     ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
815     if (ret == NULL) {
816 	xmlRegexpErrMemory(ctxt, "allocating atom");
817 	return(NULL);
818     }
819     memset(ret, 0, sizeof(xmlRegAtom));
820     ret->type = type;
821     ret->quant = XML_REGEXP_QUANT_ONCE;
822     ret->min = 0;
823     ret->max = 0;
824     return(ret);
825 }
826 
827 /**
828  * xmlRegFreeAtom:
829  * @atom:  the regexp atom
830  *
831  * Free a regexp atom
832  */
833 static void
xmlRegFreeAtom(xmlRegAtomPtr atom)834 xmlRegFreeAtom(xmlRegAtomPtr atom) {
835     int i;
836 
837     if (atom == NULL)
838 	return;
839 
840     for (i = 0;i < atom->nbRanges;i++)
841 	xmlRegFreeRange(atom->ranges[i]);
842     if (atom->ranges != NULL)
843 	xmlFree(atom->ranges);
844     if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL))
845 	xmlFree(atom->valuep);
846     if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL))
847 	xmlFree(atom->valuep2);
848     if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL))
849 	xmlFree(atom->valuep);
850     xmlFree(atom);
851 }
852 
853 /**
854  * xmlRegCopyAtom:
855  * @ctxt:  the regexp parser context
856  * @atom:  the original atom
857  *
858  * Allocate a new regexp range
859  *
860  * Returns the new atom or NULL in case of error
861  */
862 static xmlRegAtomPtr
xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt,xmlRegAtomPtr atom)863 xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
864     xmlRegAtomPtr ret;
865 
866     ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
867     if (ret == NULL) {
868 	xmlRegexpErrMemory(ctxt, "copying atom");
869 	return(NULL);
870     }
871     memset(ret, 0, sizeof(xmlRegAtom));
872     ret->type = atom->type;
873     ret->quant = atom->quant;
874     ret->min = atom->min;
875     ret->max = atom->max;
876     if (atom->nbRanges > 0) {
877         int i;
878 
879         ret->ranges = (xmlRegRangePtr *) xmlMalloc(sizeof(xmlRegRangePtr) *
880 	                                           atom->nbRanges);
881 	if (ret->ranges == NULL) {
882 	    xmlRegexpErrMemory(ctxt, "copying atom");
883 	    goto error;
884 	}
885 	for (i = 0;i < atom->nbRanges;i++) {
886 	    ret->ranges[i] = xmlRegCopyRange(ctxt, atom->ranges[i]);
887 	    if (ret->ranges[i] == NULL)
888 	        goto error;
889 	    ret->nbRanges = i + 1;
890 	}
891     }
892     return(ret);
893 
894 error:
895     xmlRegFreeAtom(ret);
896     return(NULL);
897 }
898 
899 static xmlRegStatePtr
xmlRegNewState(xmlRegParserCtxtPtr ctxt)900 xmlRegNewState(xmlRegParserCtxtPtr ctxt) {
901     xmlRegStatePtr ret;
902 
903     ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState));
904     if (ret == NULL) {
905 	xmlRegexpErrMemory(ctxt, "allocating state");
906 	return(NULL);
907     }
908     memset(ret, 0, sizeof(xmlRegState));
909     ret->type = XML_REGEXP_TRANS_STATE;
910     ret->mark = XML_REGEXP_MARK_NORMAL;
911     return(ret);
912 }
913 
914 /**
915  * xmlRegFreeState:
916  * @state:  the regexp state
917  *
918  * Free a regexp state
919  */
920 static void
xmlRegFreeState(xmlRegStatePtr state)921 xmlRegFreeState(xmlRegStatePtr state) {
922     if (state == NULL)
923 	return;
924 
925     if (state->trans != NULL)
926 	xmlFree(state->trans);
927     if (state->transTo != NULL)
928 	xmlFree(state->transTo);
929     xmlFree(state);
930 }
931 
932 /**
933  * xmlRegFreeParserCtxt:
934  * @ctxt:  the regexp parser context
935  *
936  * Free a regexp parser context
937  */
938 static void
xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt)939 xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) {
940     int i;
941     if (ctxt == NULL)
942 	return;
943 
944     if (ctxt->string != NULL)
945 	xmlFree(ctxt->string);
946     if (ctxt->states != NULL) {
947 	for (i = 0;i < ctxt->nbStates;i++)
948 	    xmlRegFreeState(ctxt->states[i]);
949 	xmlFree(ctxt->states);
950     }
951     if (ctxt->atoms != NULL) {
952 	for (i = 0;i < ctxt->nbAtoms;i++)
953 	    xmlRegFreeAtom(ctxt->atoms[i]);
954 	xmlFree(ctxt->atoms);
955     }
956     if (ctxt->counters != NULL)
957 	xmlFree(ctxt->counters);
958     xmlFree(ctxt);
959 }
960 
961 /************************************************************************
962  *									*
963  *			Display of Data structures			*
964  *									*
965  ************************************************************************/
966 
967 static void
xmlRegPrintAtomType(FILE * output,xmlRegAtomType type)968 xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) {
969     switch (type) {
970         case XML_REGEXP_EPSILON:
971 	    fprintf(output, "epsilon "); break;
972         case XML_REGEXP_CHARVAL:
973 	    fprintf(output, "charval "); break;
974         case XML_REGEXP_RANGES:
975 	    fprintf(output, "ranges "); break;
976         case XML_REGEXP_SUBREG:
977 	    fprintf(output, "subexpr "); break;
978         case XML_REGEXP_STRING:
979 	    fprintf(output, "string "); break;
980         case XML_REGEXP_ANYCHAR:
981 	    fprintf(output, "anychar "); break;
982         case XML_REGEXP_ANYSPACE:
983 	    fprintf(output, "anyspace "); break;
984         case XML_REGEXP_NOTSPACE:
985 	    fprintf(output, "notspace "); break;
986         case XML_REGEXP_INITNAME:
987 	    fprintf(output, "initname "); break;
988         case XML_REGEXP_NOTINITNAME:
989 	    fprintf(output, "notinitname "); break;
990         case XML_REGEXP_NAMECHAR:
991 	    fprintf(output, "namechar "); break;
992         case XML_REGEXP_NOTNAMECHAR:
993 	    fprintf(output, "notnamechar "); break;
994         case XML_REGEXP_DECIMAL:
995 	    fprintf(output, "decimal "); break;
996         case XML_REGEXP_NOTDECIMAL:
997 	    fprintf(output, "notdecimal "); break;
998         case XML_REGEXP_REALCHAR:
999 	    fprintf(output, "realchar "); break;
1000         case XML_REGEXP_NOTREALCHAR:
1001 	    fprintf(output, "notrealchar "); break;
1002         case XML_REGEXP_LETTER:
1003             fprintf(output, "LETTER "); break;
1004         case XML_REGEXP_LETTER_UPPERCASE:
1005             fprintf(output, "LETTER_UPPERCASE "); break;
1006         case XML_REGEXP_LETTER_LOWERCASE:
1007             fprintf(output, "LETTER_LOWERCASE "); break;
1008         case XML_REGEXP_LETTER_TITLECASE:
1009             fprintf(output, "LETTER_TITLECASE "); break;
1010         case XML_REGEXP_LETTER_MODIFIER:
1011             fprintf(output, "LETTER_MODIFIER "); break;
1012         case XML_REGEXP_LETTER_OTHERS:
1013             fprintf(output, "LETTER_OTHERS "); break;
1014         case XML_REGEXP_MARK:
1015             fprintf(output, "MARK "); break;
1016         case XML_REGEXP_MARK_NONSPACING:
1017             fprintf(output, "MARK_NONSPACING "); break;
1018         case XML_REGEXP_MARK_SPACECOMBINING:
1019             fprintf(output, "MARK_SPACECOMBINING "); break;
1020         case XML_REGEXP_MARK_ENCLOSING:
1021             fprintf(output, "MARK_ENCLOSING "); break;
1022         case XML_REGEXP_NUMBER:
1023             fprintf(output, "NUMBER "); break;
1024         case XML_REGEXP_NUMBER_DECIMAL:
1025             fprintf(output, "NUMBER_DECIMAL "); break;
1026         case XML_REGEXP_NUMBER_LETTER:
1027             fprintf(output, "NUMBER_LETTER "); break;
1028         case XML_REGEXP_NUMBER_OTHERS:
1029             fprintf(output, "NUMBER_OTHERS "); break;
1030         case XML_REGEXP_PUNCT:
1031             fprintf(output, "PUNCT "); break;
1032         case XML_REGEXP_PUNCT_CONNECTOR:
1033             fprintf(output, "PUNCT_CONNECTOR "); break;
1034         case XML_REGEXP_PUNCT_DASH:
1035             fprintf(output, "PUNCT_DASH "); break;
1036         case XML_REGEXP_PUNCT_OPEN:
1037             fprintf(output, "PUNCT_OPEN "); break;
1038         case XML_REGEXP_PUNCT_CLOSE:
1039             fprintf(output, "PUNCT_CLOSE "); break;
1040         case XML_REGEXP_PUNCT_INITQUOTE:
1041             fprintf(output, "PUNCT_INITQUOTE "); break;
1042         case XML_REGEXP_PUNCT_FINQUOTE:
1043             fprintf(output, "PUNCT_FINQUOTE "); break;
1044         case XML_REGEXP_PUNCT_OTHERS:
1045             fprintf(output, "PUNCT_OTHERS "); break;
1046         case XML_REGEXP_SEPAR:
1047             fprintf(output, "SEPAR "); break;
1048         case XML_REGEXP_SEPAR_SPACE:
1049             fprintf(output, "SEPAR_SPACE "); break;
1050         case XML_REGEXP_SEPAR_LINE:
1051             fprintf(output, "SEPAR_LINE "); break;
1052         case XML_REGEXP_SEPAR_PARA:
1053             fprintf(output, "SEPAR_PARA "); break;
1054         case XML_REGEXP_SYMBOL:
1055             fprintf(output, "SYMBOL "); break;
1056         case XML_REGEXP_SYMBOL_MATH:
1057             fprintf(output, "SYMBOL_MATH "); break;
1058         case XML_REGEXP_SYMBOL_CURRENCY:
1059             fprintf(output, "SYMBOL_CURRENCY "); break;
1060         case XML_REGEXP_SYMBOL_MODIFIER:
1061             fprintf(output, "SYMBOL_MODIFIER "); break;
1062         case XML_REGEXP_SYMBOL_OTHERS:
1063             fprintf(output, "SYMBOL_OTHERS "); break;
1064         case XML_REGEXP_OTHER:
1065             fprintf(output, "OTHER "); break;
1066         case XML_REGEXP_OTHER_CONTROL:
1067             fprintf(output, "OTHER_CONTROL "); break;
1068         case XML_REGEXP_OTHER_FORMAT:
1069             fprintf(output, "OTHER_FORMAT "); break;
1070         case XML_REGEXP_OTHER_PRIVATE:
1071             fprintf(output, "OTHER_PRIVATE "); break;
1072         case XML_REGEXP_OTHER_NA:
1073             fprintf(output, "OTHER_NA "); break;
1074         case XML_REGEXP_BLOCK_NAME:
1075 	    fprintf(output, "BLOCK "); break;
1076     }
1077 }
1078 
1079 static void
xmlRegPrintQuantType(FILE * output,xmlRegQuantType type)1080 xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) {
1081     switch (type) {
1082         case XML_REGEXP_QUANT_EPSILON:
1083 	    fprintf(output, "epsilon "); break;
1084         case XML_REGEXP_QUANT_ONCE:
1085 	    fprintf(output, "once "); break;
1086         case XML_REGEXP_QUANT_OPT:
1087 	    fprintf(output, "? "); break;
1088         case XML_REGEXP_QUANT_MULT:
1089 	    fprintf(output, "* "); break;
1090         case XML_REGEXP_QUANT_PLUS:
1091 	    fprintf(output, "+ "); break;
1092 	case XML_REGEXP_QUANT_RANGE:
1093 	    fprintf(output, "range "); break;
1094 	case XML_REGEXP_QUANT_ONCEONLY:
1095 	    fprintf(output, "onceonly "); break;
1096 	case XML_REGEXP_QUANT_ALL:
1097 	    fprintf(output, "all "); break;
1098     }
1099 }
1100 static void
xmlRegPrintRange(FILE * output,xmlRegRangePtr range)1101 xmlRegPrintRange(FILE *output, xmlRegRangePtr range) {
1102     fprintf(output, "  range: ");
1103     if (range->neg)
1104 	fprintf(output, "negative ");
1105     xmlRegPrintAtomType(output, range->type);
1106     fprintf(output, "%c - %c\n", range->start, range->end);
1107 }
1108 
1109 static void
xmlRegPrintAtom(FILE * output,xmlRegAtomPtr atom)1110 xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) {
1111     fprintf(output, " atom: ");
1112     if (atom == NULL) {
1113 	fprintf(output, "NULL\n");
1114 	return;
1115     }
1116     if (atom->neg)
1117         fprintf(output, "not ");
1118     xmlRegPrintAtomType(output, atom->type);
1119     xmlRegPrintQuantType(output, atom->quant);
1120     if (atom->quant == XML_REGEXP_QUANT_RANGE)
1121 	fprintf(output, "%d-%d ", atom->min, atom->max);
1122     if (atom->type == XML_REGEXP_STRING)
1123 	fprintf(output, "'%s' ", (char *) atom->valuep);
1124     if (atom->type == XML_REGEXP_CHARVAL)
1125 	fprintf(output, "char %c\n", atom->codepoint);
1126     else if (atom->type == XML_REGEXP_RANGES) {
1127 	int i;
1128 	fprintf(output, "%d entries\n", atom->nbRanges);
1129 	for (i = 0; i < atom->nbRanges;i++)
1130 	    xmlRegPrintRange(output, atom->ranges[i]);
1131     } else if (atom->type == XML_REGEXP_SUBREG) {
1132 	fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no);
1133     } else {
1134 	fprintf(output, "\n");
1135     }
1136 }
1137 
1138 static void
xmlRegPrintTrans(FILE * output,xmlRegTransPtr trans)1139 xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) {
1140     fprintf(output, "  trans: ");
1141     if (trans == NULL) {
1142 	fprintf(output, "NULL\n");
1143 	return;
1144     }
1145     if (trans->to < 0) {
1146 	fprintf(output, "removed\n");
1147 	return;
1148     }
1149     if (trans->nd != 0) {
1150 	if (trans->nd == 2)
1151 	    fprintf(output, "last not determinist, ");
1152 	else
1153 	    fprintf(output, "not determinist, ");
1154     }
1155     if (trans->counter >= 0) {
1156 	fprintf(output, "counted %d, ", trans->counter);
1157     }
1158     if (trans->count == REGEXP_ALL_COUNTER) {
1159 	fprintf(output, "all transition, ");
1160     } else if (trans->count >= 0) {
1161 	fprintf(output, "count based %d, ", trans->count);
1162     }
1163     if (trans->atom == NULL) {
1164 	fprintf(output, "epsilon to %d\n", trans->to);
1165 	return;
1166     }
1167     if (trans->atom->type == XML_REGEXP_CHARVAL)
1168 	fprintf(output, "char %c ", trans->atom->codepoint);
1169     fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to);
1170 }
1171 
1172 static void
xmlRegPrintState(FILE * output,xmlRegStatePtr state)1173 xmlRegPrintState(FILE *output, xmlRegStatePtr state) {
1174     int i;
1175 
1176     fprintf(output, " state: ");
1177     if (state == NULL) {
1178 	fprintf(output, "NULL\n");
1179 	return;
1180     }
1181     if (state->type == XML_REGEXP_START_STATE)
1182 	fprintf(output, "START ");
1183     if (state->type == XML_REGEXP_FINAL_STATE)
1184 	fprintf(output, "FINAL ");
1185 
1186     fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans);
1187     for (i = 0;i < state->nbTrans; i++) {
1188 	xmlRegPrintTrans(output, &(state->trans[i]));
1189     }
1190 }
1191 
1192 #ifdef DEBUG_REGEXP_GRAPH
1193 static void
xmlRegPrintCtxt(FILE * output,xmlRegParserCtxtPtr ctxt)1194 xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) {
1195     int i;
1196 
1197     fprintf(output, " ctxt: ");
1198     if (ctxt == NULL) {
1199 	fprintf(output, "NULL\n");
1200 	return;
1201     }
1202     fprintf(output, "'%s' ", ctxt->string);
1203     if (ctxt->error)
1204 	fprintf(output, "error ");
1205     if (ctxt->neg)
1206 	fprintf(output, "neg ");
1207     fprintf(output, "\n");
1208     fprintf(output, "%d atoms:\n", ctxt->nbAtoms);
1209     for (i = 0;i < ctxt->nbAtoms; i++) {
1210 	fprintf(output, " %02d ", i);
1211 	xmlRegPrintAtom(output, ctxt->atoms[i]);
1212     }
1213     if (ctxt->atom != NULL) {
1214 	fprintf(output, "current atom:\n");
1215 	xmlRegPrintAtom(output, ctxt->atom);
1216     }
1217     fprintf(output, "%d states:", ctxt->nbStates);
1218     if (ctxt->start != NULL)
1219 	fprintf(output, " start: %d", ctxt->start->no);
1220     if (ctxt->end != NULL)
1221 	fprintf(output, " end: %d", ctxt->end->no);
1222     fprintf(output, "\n");
1223     for (i = 0;i < ctxt->nbStates; i++) {
1224 	xmlRegPrintState(output, ctxt->states[i]);
1225     }
1226     fprintf(output, "%d counters:\n", ctxt->nbCounters);
1227     for (i = 0;i < ctxt->nbCounters; i++) {
1228 	fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min,
1229 		                                ctxt->counters[i].max);
1230     }
1231 }
1232 #endif
1233 
1234 /************************************************************************
1235  *									*
1236  *		 Finite Automata structures manipulations		*
1237  *									*
1238  ************************************************************************/
1239 
1240 static void
xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt,xmlRegAtomPtr atom,int neg,xmlRegAtomType type,int start,int end,xmlChar * blockName)1241 xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom,
1242 	           int neg, xmlRegAtomType type, int start, int end,
1243 		   xmlChar *blockName) {
1244     xmlRegRangePtr range;
1245 
1246     if (atom == NULL) {
1247 	ERROR("add range: atom is NULL");
1248 	return;
1249     }
1250     if (atom->type != XML_REGEXP_RANGES) {
1251 	ERROR("add range: atom is not ranges");
1252 	return;
1253     }
1254     if (atom->maxRanges == 0) {
1255 	atom->maxRanges = 4;
1256 	atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges *
1257 		                             sizeof(xmlRegRangePtr));
1258 	if (atom->ranges == NULL) {
1259 	    xmlRegexpErrMemory(ctxt, "adding ranges");
1260 	    atom->maxRanges = 0;
1261 	    return;
1262 	}
1263     } else if (atom->nbRanges >= atom->maxRanges) {
1264 	xmlRegRangePtr *tmp;
1265 	atom->maxRanges *= 2;
1266 	tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges *
1267 		                             sizeof(xmlRegRangePtr));
1268 	if (tmp == NULL) {
1269 	    xmlRegexpErrMemory(ctxt, "adding ranges");
1270 	    atom->maxRanges /= 2;
1271 	    return;
1272 	}
1273 	atom->ranges = tmp;
1274     }
1275     range = xmlRegNewRange(ctxt, neg, type, start, end);
1276     if (range == NULL)
1277 	return;
1278     range->blockName = blockName;
1279     atom->ranges[atom->nbRanges++] = range;
1280 
1281 }
1282 
1283 static int
xmlRegGetCounter(xmlRegParserCtxtPtr ctxt)1284 xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) {
1285     if (ctxt->maxCounters == 0) {
1286 	ctxt->maxCounters = 4;
1287 	ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters *
1288 		                             sizeof(xmlRegCounter));
1289 	if (ctxt->counters == NULL) {
1290 	    xmlRegexpErrMemory(ctxt, "allocating counter");
1291 	    ctxt->maxCounters = 0;
1292 	    return(-1);
1293 	}
1294     } else if (ctxt->nbCounters >= ctxt->maxCounters) {
1295 	xmlRegCounter *tmp;
1296 	ctxt->maxCounters *= 2;
1297 	tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters *
1298 		                           sizeof(xmlRegCounter));
1299 	if (tmp == NULL) {
1300 	    xmlRegexpErrMemory(ctxt, "allocating counter");
1301 	    ctxt->maxCounters /= 2;
1302 	    return(-1);
1303 	}
1304 	ctxt->counters = tmp;
1305     }
1306     ctxt->counters[ctxt->nbCounters].min = -1;
1307     ctxt->counters[ctxt->nbCounters].max = -1;
1308     return(ctxt->nbCounters++);
1309 }
1310 
1311 static int
xmlRegAtomPush(xmlRegParserCtxtPtr ctxt,xmlRegAtomPtr atom)1312 xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
1313     if (atom == NULL) {
1314 	ERROR("atom push: atom is NULL");
1315 	return(-1);
1316     }
1317     if (ctxt->maxAtoms == 0) {
1318 	ctxt->maxAtoms = 4;
1319 	ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms *
1320 		                             sizeof(xmlRegAtomPtr));
1321 	if (ctxt->atoms == NULL) {
1322 	    xmlRegexpErrMemory(ctxt, "pushing atom");
1323 	    ctxt->maxAtoms = 0;
1324 	    return(-1);
1325 	}
1326     } else if (ctxt->nbAtoms >= ctxt->maxAtoms) {
1327 	xmlRegAtomPtr *tmp;
1328 	ctxt->maxAtoms *= 2;
1329 	tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms *
1330 		                             sizeof(xmlRegAtomPtr));
1331 	if (tmp == NULL) {
1332 	    xmlRegexpErrMemory(ctxt, "allocating counter");
1333 	    ctxt->maxAtoms /= 2;
1334 	    return(-1);
1335 	}
1336 	ctxt->atoms = tmp;
1337     }
1338     atom->no = ctxt->nbAtoms;
1339     ctxt->atoms[ctxt->nbAtoms++] = atom;
1340     return(0);
1341 }
1342 
1343 static void
xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr target,int from)1344 xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target,
1345                       int from) {
1346     if (target->maxTransTo == 0) {
1347 	target->maxTransTo = 8;
1348 	target->transTo = (int *) xmlMalloc(target->maxTransTo *
1349 		                             sizeof(int));
1350 	if (target->transTo == NULL) {
1351 	    xmlRegexpErrMemory(ctxt, "adding transition");
1352 	    target->maxTransTo = 0;
1353 	    return;
1354 	}
1355     } else if (target->nbTransTo >= target->maxTransTo) {
1356 	int *tmp;
1357 	target->maxTransTo *= 2;
1358 	tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo *
1359 		                             sizeof(int));
1360 	if (tmp == NULL) {
1361 	    xmlRegexpErrMemory(ctxt, "adding transition");
1362 	    target->maxTransTo /= 2;
1363 	    return;
1364 	}
1365 	target->transTo = tmp;
1366     }
1367     target->transTo[target->nbTransTo] = from;
1368     target->nbTransTo++;
1369 }
1370 
1371 static void
xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr state,xmlRegAtomPtr atom,xmlRegStatePtr target,int counter,int count)1372 xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1373 	            xmlRegAtomPtr atom, xmlRegStatePtr target,
1374 		    int counter, int count) {
1375 
1376     int nrtrans;
1377 
1378     if (state == NULL) {
1379 	ERROR("add state: state is NULL");
1380 	return;
1381     }
1382     if (target == NULL) {
1383 	ERROR("add state: target is NULL");
1384 	return;
1385     }
1386     /*
1387      * Other routines follow the philosophy 'When in doubt, add a transition'
1388      * so we check here whether such a transition is already present and, if
1389      * so, silently ignore this request.
1390      */
1391 
1392     for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) {
1393 	xmlRegTransPtr trans = &(state->trans[nrtrans]);
1394 	if ((trans->atom == atom) &&
1395 	    (trans->to == target->no) &&
1396 	    (trans->counter == counter) &&
1397 	    (trans->count == count)) {
1398 #ifdef DEBUG_REGEXP_GRAPH
1399 	    printf("Ignoring duplicate transition from %d to %d\n",
1400 		    state->no, target->no);
1401 #endif
1402 	    return;
1403 	}
1404     }
1405 
1406     if (state->maxTrans == 0) {
1407 	state->maxTrans = 8;
1408 	state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans *
1409 		                             sizeof(xmlRegTrans));
1410 	if (state->trans == NULL) {
1411 	    xmlRegexpErrMemory(ctxt, "adding transition");
1412 	    state->maxTrans = 0;
1413 	    return;
1414 	}
1415     } else if (state->nbTrans >= state->maxTrans) {
1416 	xmlRegTrans *tmp;
1417 	state->maxTrans *= 2;
1418 	tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans *
1419 		                             sizeof(xmlRegTrans));
1420 	if (tmp == NULL) {
1421 	    xmlRegexpErrMemory(ctxt, "adding transition");
1422 	    state->maxTrans /= 2;
1423 	    return;
1424 	}
1425 	state->trans = tmp;
1426     }
1427 #ifdef DEBUG_REGEXP_GRAPH
1428     printf("Add trans from %d to %d ", state->no, target->no);
1429     if (count == REGEXP_ALL_COUNTER)
1430 	printf("all transition\n");
1431     else if (count >= 0)
1432 	printf("count based %d\n", count);
1433     else if (counter >= 0)
1434 	printf("counted %d\n", counter);
1435     else if (atom == NULL)
1436 	printf("epsilon transition\n");
1437     else if (atom != NULL)
1438         xmlRegPrintAtom(stdout, atom);
1439 #endif
1440 
1441     state->trans[state->nbTrans].atom = atom;
1442     state->trans[state->nbTrans].to = target->no;
1443     state->trans[state->nbTrans].counter = counter;
1444     state->trans[state->nbTrans].count = count;
1445     state->trans[state->nbTrans].nd = 0;
1446     state->nbTrans++;
1447     xmlRegStateAddTransTo(ctxt, target, state->no);
1448 }
1449 
1450 static int
xmlRegStatePush(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr state)1451 xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
1452     if (state == NULL) return(-1);
1453     if (ctxt->maxStates == 0) {
1454 	ctxt->maxStates = 4;
1455 	ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates *
1456 		                             sizeof(xmlRegStatePtr));
1457 	if (ctxt->states == NULL) {
1458 	    xmlRegexpErrMemory(ctxt, "adding state");
1459 	    ctxt->maxStates = 0;
1460 	    return(-1);
1461 	}
1462     } else if (ctxt->nbStates >= ctxt->maxStates) {
1463 	xmlRegStatePtr *tmp;
1464 	ctxt->maxStates *= 2;
1465 	tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates *
1466 		                             sizeof(xmlRegStatePtr));
1467 	if (tmp == NULL) {
1468 	    xmlRegexpErrMemory(ctxt, "adding state");
1469 	    ctxt->maxStates /= 2;
1470 	    return(-1);
1471 	}
1472 	ctxt->states = tmp;
1473     }
1474     state->no = ctxt->nbStates;
1475     ctxt->states[ctxt->nbStates++] = state;
1476     return(0);
1477 }
1478 
1479 /**
1480  * xmlFAGenerateAllTransition:
1481  * @ctxt:  a regexp parser context
1482  * @from:  the from state
1483  * @to:  the target state or NULL for building a new one
1484  * @lax:
1485  *
1486  */
1487 static void
xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to,int lax)1488 xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,
1489 			   xmlRegStatePtr from, xmlRegStatePtr to,
1490 			   int lax) {
1491     if (to == NULL) {
1492 	to = xmlRegNewState(ctxt);
1493 	xmlRegStatePush(ctxt, to);
1494 	ctxt->state = to;
1495     }
1496     if (lax)
1497 	xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER);
1498     else
1499 	xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER);
1500 }
1501 
1502 /**
1503  * xmlFAGenerateEpsilonTransition:
1504  * @ctxt:  a regexp parser context
1505  * @from:  the from state
1506  * @to:  the target state or NULL for building a new one
1507  *
1508  */
1509 static void
xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to)1510 xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1511 			       xmlRegStatePtr from, xmlRegStatePtr to) {
1512     if (to == NULL) {
1513 	to = xmlRegNewState(ctxt);
1514 	xmlRegStatePush(ctxt, to);
1515 	ctxt->state = to;
1516     }
1517     xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1);
1518 }
1519 
1520 /**
1521  * xmlFAGenerateCountedEpsilonTransition:
1522  * @ctxt:  a regexp parser context
1523  * @from:  the from state
1524  * @to:  the target state or NULL for building a new one
1525  * counter:  the counter for that transition
1526  *
1527  */
1528 static void
xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to,int counter)1529 xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1530 	    xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1531     if (to == NULL) {
1532 	to = xmlRegNewState(ctxt);
1533 	xmlRegStatePush(ctxt, to);
1534 	ctxt->state = to;
1535     }
1536     xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1);
1537 }
1538 
1539 /**
1540  * xmlFAGenerateCountedTransition:
1541  * @ctxt:  a regexp parser context
1542  * @from:  the from state
1543  * @to:  the target state or NULL for building a new one
1544  * counter:  the counter for that transition
1545  *
1546  */
1547 static void
xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to,int counter)1548 xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,
1549 	    xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1550     if (to == NULL) {
1551 	to = xmlRegNewState(ctxt);
1552 	xmlRegStatePush(ctxt, to);
1553 	ctxt->state = to;
1554     }
1555     xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter);
1556 }
1557 
1558 /**
1559  * xmlFAGenerateTransitions:
1560  * @ctxt:  a regexp parser context
1561  * @from:  the from state
1562  * @to:  the target state or NULL for building a new one
1563  * @atom:  the atom generating the transition
1564  *
1565  * Returns 0 if success and -1 in case of error.
1566  */
1567 static int
xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to,xmlRegAtomPtr atom)1568 xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
1569 	                 xmlRegStatePtr to, xmlRegAtomPtr atom) {
1570     xmlRegStatePtr end;
1571     int nullable = 0;
1572 
1573     if (atom == NULL) {
1574 	ERROR("generate transition: atom == NULL");
1575 	return(-1);
1576     }
1577     if (atom->type == XML_REGEXP_SUBREG) {
1578 	/*
1579 	 * this is a subexpression handling one should not need to
1580 	 * create a new node except for XML_REGEXP_QUANT_RANGE.
1581 	 */
1582 	if (xmlRegAtomPush(ctxt, atom) < 0) {
1583 	    return(-1);
1584 	}
1585 	if ((to != NULL) && (atom->stop != to) &&
1586 	    (atom->quant != XML_REGEXP_QUANT_RANGE)) {
1587 	    /*
1588 	     * Generate an epsilon transition to link to the target
1589 	     */
1590 	    xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1591 #ifdef DV
1592 	} else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) &&
1593 		   (atom->quant != XML_REGEXP_QUANT_ONCE)) {
1594 	    to = xmlRegNewState(ctxt);
1595 	    xmlRegStatePush(ctxt, to);
1596 	    ctxt->state = to;
1597 	    xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1598 #endif
1599 	}
1600 	switch (atom->quant) {
1601 	    case XML_REGEXP_QUANT_OPT:
1602 		atom->quant = XML_REGEXP_QUANT_ONCE;
1603 		/*
1604 		 * transition done to the state after end of atom.
1605 		 *      1. set transition from atom start to new state
1606 		 *      2. set transition from atom end to this state.
1607 		 */
1608                 if (to == NULL) {
1609                     xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0);
1610                     xmlFAGenerateEpsilonTransition(ctxt, atom->stop,
1611                                                    ctxt->state);
1612                 } else {
1613                     xmlFAGenerateEpsilonTransition(ctxt, atom->start, to);
1614                 }
1615 		break;
1616 	    case XML_REGEXP_QUANT_MULT:
1617 		atom->quant = XML_REGEXP_QUANT_ONCE;
1618 		xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1619 		xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1620 		break;
1621 	    case XML_REGEXP_QUANT_PLUS:
1622 		atom->quant = XML_REGEXP_QUANT_ONCE;
1623 		xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1624 		break;
1625 	    case XML_REGEXP_QUANT_RANGE: {
1626 		int counter;
1627 		xmlRegStatePtr inter, newstate;
1628 
1629 		/*
1630 		 * create the final state now if needed
1631 		 */
1632 		if (to != NULL) {
1633 		    newstate = to;
1634 		} else {
1635 		    newstate = xmlRegNewState(ctxt);
1636 		    xmlRegStatePush(ctxt, newstate);
1637 		}
1638 
1639 		/*
1640 		 * The principle here is to use counted transition
1641 		 * to avoid explosion in the number of states in the
1642 		 * graph. This is clearly more complex but should not
1643 		 * be exploitable at runtime.
1644 		 */
1645 		if ((atom->min == 0) && (atom->start0 == NULL)) {
1646 		    xmlRegAtomPtr copy;
1647 		    /*
1648 		     * duplicate a transition based on atom to count next
1649 		     * occurrences after 1. We cannot loop to atom->start
1650 		     * directly because we need an epsilon transition to
1651 		     * newstate.
1652 		     */
1653 		     /* ???? For some reason it seems we never reach that
1654 		        case, I suppose this got optimized out before when
1655 			building the automata */
1656 		    copy = xmlRegCopyAtom(ctxt, atom);
1657 		    if (copy == NULL)
1658 		        return(-1);
1659 		    copy->quant = XML_REGEXP_QUANT_ONCE;
1660 		    copy->min = 0;
1661 		    copy->max = 0;
1662 
1663 		    if (xmlFAGenerateTransitions(ctxt, atom->start, NULL, copy)
1664 		        < 0)
1665 			return(-1);
1666 		    inter = ctxt->state;
1667 		    counter = xmlRegGetCounter(ctxt);
1668 		    ctxt->counters[counter].min = atom->min - 1;
1669 		    ctxt->counters[counter].max = atom->max - 1;
1670 		    /* count the number of times we see it again */
1671 		    xmlFAGenerateCountedEpsilonTransition(ctxt, inter,
1672 						   atom->stop, counter);
1673 		    /* allow a way out based on the count */
1674 		    xmlFAGenerateCountedTransition(ctxt, inter,
1675 			                           newstate, counter);
1676 		    /* and also allow a direct exit for 0 */
1677 		    xmlFAGenerateEpsilonTransition(ctxt, atom->start,
1678 		                                   newstate);
1679 		} else {
1680 		    /*
1681 		     * either we need the atom at least once or there
1682 		     * is an atom->start0 allowing to easily plug the
1683 		     * epsilon transition.
1684 		     */
1685 		    counter = xmlRegGetCounter(ctxt);
1686 		    ctxt->counters[counter].min = atom->min - 1;
1687 		    ctxt->counters[counter].max = atom->max - 1;
1688 		    /* allow a way out based on the count */
1689 		    xmlFAGenerateCountedTransition(ctxt, atom->stop,
1690 			                           newstate, counter);
1691 		    /* count the number of times we see it again */
1692 		    xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop,
1693 						   atom->start, counter);
1694 		    /* and if needed allow a direct exit for 0 */
1695 		    if (atom->min == 0)
1696 			xmlFAGenerateEpsilonTransition(ctxt, atom->start0,
1697 						       newstate);
1698 
1699 		}
1700 		atom->min = 0;
1701 		atom->max = 0;
1702 		atom->quant = XML_REGEXP_QUANT_ONCE;
1703 		ctxt->state = newstate;
1704 	    }
1705 	    default:
1706 		break;
1707 	}
1708 	return(0);
1709     }
1710     if ((atom->min == 0) && (atom->max == 0) &&
1711                (atom->quant == XML_REGEXP_QUANT_RANGE)) {
1712         /*
1713 	 * we can discard the atom and generate an epsilon transition instead
1714 	 */
1715 	if (to == NULL) {
1716 	    to = xmlRegNewState(ctxt);
1717 	    if (to != NULL)
1718 		xmlRegStatePush(ctxt, to);
1719 	    else {
1720 		return(-1);
1721 	    }
1722 	}
1723 	xmlFAGenerateEpsilonTransition(ctxt, from, to);
1724 	ctxt->state = to;
1725 	xmlRegFreeAtom(atom);
1726 	return(0);
1727     }
1728     if (to == NULL) {
1729 	to = xmlRegNewState(ctxt);
1730 	if (to != NULL)
1731 	    xmlRegStatePush(ctxt, to);
1732 	else {
1733 	    return(-1);
1734 	}
1735     }
1736     end = to;
1737     if ((atom->quant == XML_REGEXP_QUANT_MULT) ||
1738         (atom->quant == XML_REGEXP_QUANT_PLUS)) {
1739 	/*
1740 	 * Do not pollute the target state by adding transitions from
1741 	 * it as it is likely to be the shared target of multiple branches.
1742 	 * So isolate with an epsilon transition.
1743 	 */
1744         xmlRegStatePtr tmp;
1745 
1746 	tmp = xmlRegNewState(ctxt);
1747 	if (tmp != NULL)
1748 	    xmlRegStatePush(ctxt, tmp);
1749 	else {
1750 	    return(-1);
1751 	}
1752 	xmlFAGenerateEpsilonTransition(ctxt, tmp, to);
1753 	to = tmp;
1754     }
1755     if (xmlRegAtomPush(ctxt, atom) < 0) {
1756 	return(-1);
1757     }
1758     if ((atom->quant == XML_REGEXP_QUANT_RANGE) &&
1759         (atom->min == 0) && (atom->max > 0)) {
1760 	nullable = 1;
1761 	atom->min = 1;
1762         if (atom->max == 1)
1763 	    atom->quant = XML_REGEXP_QUANT_OPT;
1764     }
1765     xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1);
1766     ctxt->state = end;
1767     switch (atom->quant) {
1768 	case XML_REGEXP_QUANT_OPT:
1769 	    atom->quant = XML_REGEXP_QUANT_ONCE;
1770 	    xmlFAGenerateEpsilonTransition(ctxt, from, to);
1771 	    break;
1772 	case XML_REGEXP_QUANT_MULT:
1773 	    atom->quant = XML_REGEXP_QUANT_ONCE;
1774 	    xmlFAGenerateEpsilonTransition(ctxt, from, to);
1775 	    xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1776 	    break;
1777 	case XML_REGEXP_QUANT_PLUS:
1778 	    atom->quant = XML_REGEXP_QUANT_ONCE;
1779 	    xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1780 	    break;
1781 	case XML_REGEXP_QUANT_RANGE:
1782 	    if (nullable)
1783 		xmlFAGenerateEpsilonTransition(ctxt, from, to);
1784 	    break;
1785 	default:
1786 	    break;
1787     }
1788     return(0);
1789 }
1790 
1791 /**
1792  * xmlFAReduceEpsilonTransitions:
1793  * @ctxt:  a regexp parser context
1794  * @fromnr:  the from state
1795  * @tonr:  the to state
1796  * @counter:  should that transition be associated to a counted
1797  *
1798  */
1799 static void
xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt,int fromnr,int tonr,int counter)1800 xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr,
1801 	                      int tonr, int counter) {
1802     int transnr;
1803     xmlRegStatePtr from;
1804     xmlRegStatePtr to;
1805 
1806 #ifdef DEBUG_REGEXP_GRAPH
1807     printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr);
1808 #endif
1809     from = ctxt->states[fromnr];
1810     if (from == NULL)
1811 	return;
1812     to = ctxt->states[tonr];
1813     if (to == NULL)
1814 	return;
1815     if ((to->mark == XML_REGEXP_MARK_START) ||
1816 	(to->mark == XML_REGEXP_MARK_VISITED))
1817 	return;
1818 
1819     to->mark = XML_REGEXP_MARK_VISITED;
1820     if (to->type == XML_REGEXP_FINAL_STATE) {
1821 #ifdef DEBUG_REGEXP_GRAPH
1822 	printf("State %d is final, so %d becomes final\n", tonr, fromnr);
1823 #endif
1824 	from->type = XML_REGEXP_FINAL_STATE;
1825     }
1826     for (transnr = 0;transnr < to->nbTrans;transnr++) {
1827         if (to->trans[transnr].to < 0)
1828 	    continue;
1829 	if (to->trans[transnr].atom == NULL) {
1830 	    /*
1831 	     * Don't remove counted transitions
1832 	     * Don't loop either
1833 	     */
1834 	    if (to->trans[transnr].to != fromnr) {
1835 		if (to->trans[transnr].count >= 0) {
1836 		    int newto = to->trans[transnr].to;
1837 
1838 		    xmlRegStateAddTrans(ctxt, from, NULL,
1839 					ctxt->states[newto],
1840 					-1, to->trans[transnr].count);
1841 		} else {
1842 #ifdef DEBUG_REGEXP_GRAPH
1843 		    printf("Found epsilon trans %d from %d to %d\n",
1844 			   transnr, tonr, to->trans[transnr].to);
1845 #endif
1846 		    if (to->trans[transnr].counter >= 0) {
1847 			xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1848 					      to->trans[transnr].to,
1849 					      to->trans[transnr].counter);
1850 		    } else {
1851 			xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1852 					      to->trans[transnr].to,
1853 					      counter);
1854 		    }
1855 		}
1856 	    }
1857 	} else {
1858 	    int newto = to->trans[transnr].to;
1859 
1860 	    if (to->trans[transnr].counter >= 0) {
1861 		xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1862 				    ctxt->states[newto],
1863 				    to->trans[transnr].counter, -1);
1864 	    } else {
1865 		xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1866 				    ctxt->states[newto], counter, -1);
1867 	    }
1868 	}
1869     }
1870     to->mark = XML_REGEXP_MARK_NORMAL;
1871 }
1872 
1873 /**
1874  * xmlFAEliminateSimpleEpsilonTransitions:
1875  * @ctxt:  a regexp parser context
1876  *
1877  * Eliminating general epsilon transitions can get costly in the general
1878  * algorithm due to the large amount of generated new transitions and
1879  * associated comparisons. However for simple epsilon transition used just
1880  * to separate building blocks when generating the automata this can be
1881  * reduced to state elimination:
1882  *    - if there exists an epsilon from X to Y
1883  *    - if there is no other transition from X
1884  * then X and Y are semantically equivalent and X can be eliminated
1885  * If X is the start state then make Y the start state, else replace the
1886  * target of all transitions to X by transitions to Y.
1887  *
1888  * If X is a final state, skip it.
1889  * Otherwise it would be necessary to manipulate counters for this case when
1890  * eliminating state 2:
1891  * State 1 has a transition with an atom to state 2.
1892  * State 2 is final and has an epsilon transition to state 1.
1893  */
1894 static void
xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt)1895 xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1896     int statenr, i, j, newto;
1897     xmlRegStatePtr state, tmp;
1898 
1899     for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1900 	state = ctxt->states[statenr];
1901 	if (state == NULL)
1902 	    continue;
1903 	if (state->nbTrans != 1)
1904 	    continue;
1905        if (state->type == XML_REGEXP_UNREACH_STATE ||
1906            state->type == XML_REGEXP_FINAL_STATE)
1907 	    continue;
1908 	/* is the only transition out a basic transition */
1909 	if ((state->trans[0].atom == NULL) &&
1910 	    (state->trans[0].to >= 0) &&
1911 	    (state->trans[0].to != statenr) &&
1912 	    (state->trans[0].counter < 0) &&
1913 	    (state->trans[0].count < 0)) {
1914 	    newto = state->trans[0].to;
1915 
1916             if (state->type == XML_REGEXP_START_STATE) {
1917 #ifdef DEBUG_REGEXP_GRAPH
1918 		printf("Found simple epsilon trans from start %d to %d\n",
1919 		       statenr, newto);
1920 #endif
1921             } else {
1922 #ifdef DEBUG_REGEXP_GRAPH
1923 		printf("Found simple epsilon trans from %d to %d\n",
1924 		       statenr, newto);
1925 #endif
1926 	        for (i = 0;i < state->nbTransTo;i++) {
1927 		    tmp = ctxt->states[state->transTo[i]];
1928 		    for (j = 0;j < tmp->nbTrans;j++) {
1929 			if (tmp->trans[j].to == statenr) {
1930 #ifdef DEBUG_REGEXP_GRAPH
1931 			    printf("Changed transition %d on %d to go to %d\n",
1932 				   j, tmp->no, newto);
1933 #endif
1934 			    tmp->trans[j].to = -1;
1935 			    xmlRegStateAddTrans(ctxt, tmp, tmp->trans[j].atom,
1936 						ctxt->states[newto],
1937 					        tmp->trans[j].counter,
1938 						tmp->trans[j].count);
1939 			}
1940 		    }
1941 		}
1942 		if (state->type == XML_REGEXP_FINAL_STATE)
1943 		    ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE;
1944 		/* eliminate the transition completely */
1945 		state->nbTrans = 0;
1946 
1947                 state->type = XML_REGEXP_UNREACH_STATE;
1948 
1949 	    }
1950 
1951 	}
1952     }
1953 }
1954 /**
1955  * xmlFAEliminateEpsilonTransitions:
1956  * @ctxt:  a regexp parser context
1957  *
1958  */
1959 static void
xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt)1960 xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1961     int statenr, transnr;
1962     xmlRegStatePtr state;
1963     int has_epsilon;
1964 
1965     if (ctxt->states == NULL) return;
1966 
1967     /*
1968      * Eliminate simple epsilon transition and the associated unreachable
1969      * states.
1970      */
1971     xmlFAEliminateSimpleEpsilonTransitions(ctxt);
1972     for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1973 	state = ctxt->states[statenr];
1974 	if ((state != NULL) && (state->type == XML_REGEXP_UNREACH_STATE)) {
1975 #ifdef DEBUG_REGEXP_GRAPH
1976 	    printf("Removed unreachable state %d\n", statenr);
1977 #endif
1978 	    xmlRegFreeState(state);
1979 	    ctxt->states[statenr] = NULL;
1980 	}
1981     }
1982 
1983     has_epsilon = 0;
1984 
1985     /*
1986      * Build the completed transitions bypassing the epsilons
1987      * Use a marking algorithm to avoid loops
1988      * Mark sink states too.
1989      * Process from the latest states backward to the start when
1990      * there is long cascading epsilon chains this minimize the
1991      * recursions and transition compares when adding the new ones
1992      */
1993     for (statenr = ctxt->nbStates - 1;statenr >= 0;statenr--) {
1994 	state = ctxt->states[statenr];
1995 	if (state == NULL)
1996 	    continue;
1997 	if ((state->nbTrans == 0) &&
1998 	    (state->type != XML_REGEXP_FINAL_STATE)) {
1999 	    state->type = XML_REGEXP_SINK_STATE;
2000 	}
2001 	for (transnr = 0;transnr < state->nbTrans;transnr++) {
2002 	    if ((state->trans[transnr].atom == NULL) &&
2003 		(state->trans[transnr].to >= 0)) {
2004 		if (state->trans[transnr].to == statenr) {
2005 		    state->trans[transnr].to = -1;
2006 #ifdef DEBUG_REGEXP_GRAPH
2007 		    printf("Removed loopback epsilon trans %d on %d\n",
2008 			   transnr, statenr);
2009 #endif
2010 		} else if (state->trans[transnr].count < 0) {
2011 		    int newto = state->trans[transnr].to;
2012 
2013 #ifdef DEBUG_REGEXP_GRAPH
2014 		    printf("Found epsilon trans %d from %d to %d\n",
2015 			   transnr, statenr, newto);
2016 #endif
2017 		    has_epsilon = 1;
2018 		    state->trans[transnr].to = -2;
2019 		    state->mark = XML_REGEXP_MARK_START;
2020 		    xmlFAReduceEpsilonTransitions(ctxt, statenr,
2021 				      newto, state->trans[transnr].counter);
2022 		    state->mark = XML_REGEXP_MARK_NORMAL;
2023 #ifdef DEBUG_REGEXP_GRAPH
2024 		} else {
2025 		    printf("Found counted transition %d on %d\n",
2026 			   transnr, statenr);
2027 #endif
2028 	        }
2029 	    }
2030 	}
2031     }
2032     /*
2033      * Eliminate the epsilon transitions
2034      */
2035     if (has_epsilon) {
2036 	for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2037 	    state = ctxt->states[statenr];
2038 	    if (state == NULL)
2039 		continue;
2040 	    for (transnr = 0;transnr < state->nbTrans;transnr++) {
2041 		xmlRegTransPtr trans = &(state->trans[transnr]);
2042 		if ((trans->atom == NULL) &&
2043 		    (trans->count < 0) &&
2044 		    (trans->to >= 0)) {
2045 		    trans->to = -1;
2046 		}
2047 	    }
2048 	}
2049     }
2050 
2051     /*
2052      * Use this pass to detect unreachable states too
2053      */
2054     for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2055 	state = ctxt->states[statenr];
2056 	if (state != NULL)
2057 	    state->reached = XML_REGEXP_MARK_NORMAL;
2058     }
2059     state = ctxt->states[0];
2060     if (state != NULL)
2061 	state->reached = XML_REGEXP_MARK_START;
2062     while (state != NULL) {
2063 	xmlRegStatePtr target = NULL;
2064 	state->reached = XML_REGEXP_MARK_VISITED;
2065 	/*
2066 	 * Mark all states reachable from the current reachable state
2067 	 */
2068 	for (transnr = 0;transnr < state->nbTrans;transnr++) {
2069 	    if ((state->trans[transnr].to >= 0) &&
2070 		((state->trans[transnr].atom != NULL) ||
2071 		 (state->trans[transnr].count >= 0))) {
2072 		int newto = state->trans[transnr].to;
2073 
2074 		if (ctxt->states[newto] == NULL)
2075 		    continue;
2076 		if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) {
2077 		    ctxt->states[newto]->reached = XML_REGEXP_MARK_START;
2078 		    target = ctxt->states[newto];
2079 		}
2080 	    }
2081 	}
2082 
2083 	/*
2084 	 * find the next accessible state not explored
2085 	 */
2086 	if (target == NULL) {
2087 	    for (statenr = 1;statenr < ctxt->nbStates;statenr++) {
2088 		state = ctxt->states[statenr];
2089 		if ((state != NULL) && (state->reached ==
2090 			XML_REGEXP_MARK_START)) {
2091 		    target = state;
2092 		    break;
2093 		}
2094 	    }
2095 	}
2096 	state = target;
2097     }
2098     for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2099 	state = ctxt->states[statenr];
2100 	if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) {
2101 #ifdef DEBUG_REGEXP_GRAPH
2102 	    printf("Removed unreachable state %d\n", statenr);
2103 #endif
2104 	    xmlRegFreeState(state);
2105 	    ctxt->states[statenr] = NULL;
2106 	}
2107     }
2108 
2109 }
2110 
2111 static int
xmlFACompareRanges(xmlRegRangePtr range1,xmlRegRangePtr range2)2112 xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) {
2113     int ret = 0;
2114 
2115     if ((range1->type == XML_REGEXP_RANGES) ||
2116         (range2->type == XML_REGEXP_RANGES) ||
2117         (range2->type == XML_REGEXP_SUBREG) ||
2118         (range1->type == XML_REGEXP_SUBREG) ||
2119         (range1->type == XML_REGEXP_STRING) ||
2120         (range2->type == XML_REGEXP_STRING))
2121 	return(-1);
2122 
2123     /* put them in order */
2124     if (range1->type > range2->type) {
2125         xmlRegRangePtr tmp;
2126 
2127 	tmp = range1;
2128 	range1 = range2;
2129 	range2 = tmp;
2130     }
2131     if ((range1->type == XML_REGEXP_ANYCHAR) ||
2132         (range2->type == XML_REGEXP_ANYCHAR)) {
2133 	ret = 1;
2134     } else if ((range1->type == XML_REGEXP_EPSILON) ||
2135                (range2->type == XML_REGEXP_EPSILON)) {
2136 	return(0);
2137     } else if (range1->type == range2->type) {
2138         if (range1->type != XML_REGEXP_CHARVAL)
2139             ret = 1;
2140         else if ((range1->end < range2->start) ||
2141 	         (range2->end < range1->start))
2142 	    ret = 0;
2143 	else
2144 	    ret = 1;
2145     } else if (range1->type == XML_REGEXP_CHARVAL) {
2146         int codepoint;
2147 	int neg = 0;
2148 
2149 	/*
2150 	 * just check all codepoints in the range for acceptance,
2151 	 * this is usually way cheaper since done only once at
2152 	 * compilation than testing over and over at runtime or
2153 	 * pushing too many states when evaluating.
2154 	 */
2155 	if (((range1->neg == 0) && (range2->neg != 0)) ||
2156 	    ((range1->neg != 0) && (range2->neg == 0)))
2157 	    neg = 1;
2158 
2159 	for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) {
2160 	    ret = xmlRegCheckCharacterRange(range2->type, codepoint,
2161 					    0, range2->start, range2->end,
2162 					    range2->blockName);
2163 	    if (ret < 0)
2164 	        return(-1);
2165 	    if (((neg == 1) && (ret == 0)) ||
2166 	        ((neg == 0) && (ret == 1)))
2167 		return(1);
2168 	}
2169 	return(0);
2170     } else if ((range1->type == XML_REGEXP_BLOCK_NAME) ||
2171                (range2->type == XML_REGEXP_BLOCK_NAME)) {
2172 	if (range1->type == range2->type) {
2173 	    ret = xmlStrEqual(range1->blockName, range2->blockName);
2174 	} else {
2175 	    /*
2176 	     * comparing a block range with anything else is way
2177 	     * too costly, and maintaining the table is like too much
2178 	     * memory too, so let's force the automata to save state
2179 	     * here.
2180 	     */
2181 	    return(1);
2182 	}
2183     } else if ((range1->type < XML_REGEXP_LETTER) ||
2184                (range2->type < XML_REGEXP_LETTER)) {
2185 	if ((range1->type == XML_REGEXP_ANYSPACE) &&
2186 	    (range2->type == XML_REGEXP_NOTSPACE))
2187 	    ret = 0;
2188 	else if ((range1->type == XML_REGEXP_INITNAME) &&
2189 	         (range2->type == XML_REGEXP_NOTINITNAME))
2190 	    ret = 0;
2191 	else if ((range1->type == XML_REGEXP_NAMECHAR) &&
2192 	         (range2->type == XML_REGEXP_NOTNAMECHAR))
2193 	    ret = 0;
2194 	else if ((range1->type == XML_REGEXP_DECIMAL) &&
2195 	         (range2->type == XML_REGEXP_NOTDECIMAL))
2196 	    ret = 0;
2197 	else if ((range1->type == XML_REGEXP_REALCHAR) &&
2198 	         (range2->type == XML_REGEXP_NOTREALCHAR))
2199 	    ret = 0;
2200 	else {
2201 	    /* same thing to limit complexity */
2202 	    return(1);
2203 	}
2204     } else {
2205         ret = 0;
2206         /* range1->type < range2->type here */
2207         switch (range1->type) {
2208 	    case XML_REGEXP_LETTER:
2209 	         /* all disjoint except in the subgroups */
2210 	         if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) ||
2211 		     (range2->type == XML_REGEXP_LETTER_LOWERCASE) ||
2212 		     (range2->type == XML_REGEXP_LETTER_TITLECASE) ||
2213 		     (range2->type == XML_REGEXP_LETTER_MODIFIER) ||
2214 		     (range2->type == XML_REGEXP_LETTER_OTHERS))
2215 		     ret = 1;
2216 		 break;
2217 	    case XML_REGEXP_MARK:
2218 	         if ((range2->type == XML_REGEXP_MARK_NONSPACING) ||
2219 		     (range2->type == XML_REGEXP_MARK_SPACECOMBINING) ||
2220 		     (range2->type == XML_REGEXP_MARK_ENCLOSING))
2221 		     ret = 1;
2222 		 break;
2223 	    case XML_REGEXP_NUMBER:
2224 	         if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) ||
2225 		     (range2->type == XML_REGEXP_NUMBER_LETTER) ||
2226 		     (range2->type == XML_REGEXP_NUMBER_OTHERS))
2227 		     ret = 1;
2228 		 break;
2229 	    case XML_REGEXP_PUNCT:
2230 	         if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) ||
2231 		     (range2->type == XML_REGEXP_PUNCT_DASH) ||
2232 		     (range2->type == XML_REGEXP_PUNCT_OPEN) ||
2233 		     (range2->type == XML_REGEXP_PUNCT_CLOSE) ||
2234 		     (range2->type == XML_REGEXP_PUNCT_INITQUOTE) ||
2235 		     (range2->type == XML_REGEXP_PUNCT_FINQUOTE) ||
2236 		     (range2->type == XML_REGEXP_PUNCT_OTHERS))
2237 		     ret = 1;
2238 		 break;
2239 	    case XML_REGEXP_SEPAR:
2240 	         if ((range2->type == XML_REGEXP_SEPAR_SPACE) ||
2241 		     (range2->type == XML_REGEXP_SEPAR_LINE) ||
2242 		     (range2->type == XML_REGEXP_SEPAR_PARA))
2243 		     ret = 1;
2244 		 break;
2245 	    case XML_REGEXP_SYMBOL:
2246 	         if ((range2->type == XML_REGEXP_SYMBOL_MATH) ||
2247 		     (range2->type == XML_REGEXP_SYMBOL_CURRENCY) ||
2248 		     (range2->type == XML_REGEXP_SYMBOL_MODIFIER) ||
2249 		     (range2->type == XML_REGEXP_SYMBOL_OTHERS))
2250 		     ret = 1;
2251 		 break;
2252 	    case XML_REGEXP_OTHER:
2253 	         if ((range2->type == XML_REGEXP_OTHER_CONTROL) ||
2254 		     (range2->type == XML_REGEXP_OTHER_FORMAT) ||
2255 		     (range2->type == XML_REGEXP_OTHER_PRIVATE))
2256 		     ret = 1;
2257 		 break;
2258             default:
2259 	         if ((range2->type >= XML_REGEXP_LETTER) &&
2260 		     (range2->type < XML_REGEXP_BLOCK_NAME))
2261 		     ret = 0;
2262 		 else {
2263 		     /* safety net ! */
2264 		     return(1);
2265 		 }
2266 	}
2267     }
2268     if (((range1->neg == 0) && (range2->neg != 0)) ||
2269         ((range1->neg != 0) && (range2->neg == 0)))
2270 	ret = !ret;
2271     return(ret);
2272 }
2273 
2274 /**
2275  * xmlFACompareAtomTypes:
2276  * @type1:  an atom type
2277  * @type2:  an atom type
2278  *
2279  * Compares two atoms type to check whether they intersect in some ways,
2280  * this is used by xmlFACompareAtoms only
2281  *
2282  * Returns 1 if they may intersect and 0 otherwise
2283  */
2284 static int
xmlFACompareAtomTypes(xmlRegAtomType type1,xmlRegAtomType type2)2285 xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) {
2286     if ((type1 == XML_REGEXP_EPSILON) ||
2287         (type1 == XML_REGEXP_CHARVAL) ||
2288 	(type1 == XML_REGEXP_RANGES) ||
2289 	(type1 == XML_REGEXP_SUBREG) ||
2290 	(type1 == XML_REGEXP_STRING) ||
2291 	(type1 == XML_REGEXP_ANYCHAR))
2292 	return(1);
2293     if ((type2 == XML_REGEXP_EPSILON) ||
2294         (type2 == XML_REGEXP_CHARVAL) ||
2295 	(type2 == XML_REGEXP_RANGES) ||
2296 	(type2 == XML_REGEXP_SUBREG) ||
2297 	(type2 == XML_REGEXP_STRING) ||
2298 	(type2 == XML_REGEXP_ANYCHAR))
2299 	return(1);
2300 
2301     if (type1 == type2) return(1);
2302 
2303     /* simplify subsequent compares by making sure type1 < type2 */
2304     if (type1 > type2) {
2305         xmlRegAtomType tmp = type1;
2306 	type1 = type2;
2307 	type2 = tmp;
2308     }
2309     switch (type1) {
2310         case XML_REGEXP_ANYSPACE: /* \s */
2311 	    /* can't be a letter, number, mark, punctuation, symbol */
2312 	    if ((type2 == XML_REGEXP_NOTSPACE) ||
2313 		((type2 >= XML_REGEXP_LETTER) &&
2314 		 (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2315 	        ((type2 >= XML_REGEXP_NUMBER) &&
2316 		 (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2317 	        ((type2 >= XML_REGEXP_MARK) &&
2318 		 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2319 	        ((type2 >= XML_REGEXP_PUNCT) &&
2320 		 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2321 	        ((type2 >= XML_REGEXP_SYMBOL) &&
2322 		 (type2 <= XML_REGEXP_SYMBOL_OTHERS))
2323 	        ) return(0);
2324 	    break;
2325         case XML_REGEXP_NOTSPACE: /* \S */
2326 	    break;
2327         case XML_REGEXP_INITNAME: /* \l */
2328 	    /* can't be a number, mark, separator, punctuation, symbol or other */
2329 	    if ((type2 == XML_REGEXP_NOTINITNAME) ||
2330 	        ((type2 >= XML_REGEXP_NUMBER) &&
2331 		 (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2332 	        ((type2 >= XML_REGEXP_MARK) &&
2333 		 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2334 	        ((type2 >= XML_REGEXP_SEPAR) &&
2335 		 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2336 	        ((type2 >= XML_REGEXP_PUNCT) &&
2337 		 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2338 	        ((type2 >= XML_REGEXP_SYMBOL) &&
2339 		 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2340 	        ((type2 >= XML_REGEXP_OTHER) &&
2341 		 (type2 <= XML_REGEXP_OTHER_NA))
2342 		) return(0);
2343 	    break;
2344         case XML_REGEXP_NOTINITNAME: /* \L */
2345 	    break;
2346         case XML_REGEXP_NAMECHAR: /* \c */
2347 	    /* can't be a mark, separator, punctuation, symbol or other */
2348 	    if ((type2 == XML_REGEXP_NOTNAMECHAR) ||
2349 	        ((type2 >= XML_REGEXP_MARK) &&
2350 		 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2351 	        ((type2 >= XML_REGEXP_PUNCT) &&
2352 		 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2353 	        ((type2 >= XML_REGEXP_SEPAR) &&
2354 		 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2355 	        ((type2 >= XML_REGEXP_SYMBOL) &&
2356 		 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2357 	        ((type2 >= XML_REGEXP_OTHER) &&
2358 		 (type2 <= XML_REGEXP_OTHER_NA))
2359 		) return(0);
2360 	    break;
2361         case XML_REGEXP_NOTNAMECHAR: /* \C */
2362 	    break;
2363         case XML_REGEXP_DECIMAL: /* \d */
2364 	    /* can't be a letter, mark, separator, punctuation, symbol or other */
2365 	    if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2366 	        (type2 == XML_REGEXP_REALCHAR) ||
2367 		((type2 >= XML_REGEXP_LETTER) &&
2368 		 (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2369 	        ((type2 >= XML_REGEXP_MARK) &&
2370 		 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2371 	        ((type2 >= XML_REGEXP_PUNCT) &&
2372 		 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2373 	        ((type2 >= XML_REGEXP_SEPAR) &&
2374 		 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2375 	        ((type2 >= XML_REGEXP_SYMBOL) &&
2376 		 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2377 	        ((type2 >= XML_REGEXP_OTHER) &&
2378 		 (type2 <= XML_REGEXP_OTHER_NA))
2379 		)return(0);
2380 	    break;
2381         case XML_REGEXP_NOTDECIMAL: /* \D */
2382 	    break;
2383         case XML_REGEXP_REALCHAR: /* \w */
2384 	    /* can't be a mark, separator, punctuation, symbol or other */
2385 	    if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2386 	        ((type2 >= XML_REGEXP_MARK) &&
2387 		 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2388 	        ((type2 >= XML_REGEXP_PUNCT) &&
2389 		 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2390 	        ((type2 >= XML_REGEXP_SEPAR) &&
2391 		 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2392 	        ((type2 >= XML_REGEXP_SYMBOL) &&
2393 		 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2394 	        ((type2 >= XML_REGEXP_OTHER) &&
2395 		 (type2 <= XML_REGEXP_OTHER_NA))
2396 		)return(0);
2397 	    break;
2398         case XML_REGEXP_NOTREALCHAR: /* \W */
2399 	    break;
2400 	/*
2401 	 * at that point we know both type 1 and type2 are from
2402 	 * character categories are ordered and are different,
2403 	 * it becomes simple because this is a partition
2404 	 */
2405         case XML_REGEXP_LETTER:
2406 	    if (type2 <= XML_REGEXP_LETTER_OTHERS)
2407 	        return(1);
2408 	    return(0);
2409         case XML_REGEXP_LETTER_UPPERCASE:
2410         case XML_REGEXP_LETTER_LOWERCASE:
2411         case XML_REGEXP_LETTER_TITLECASE:
2412         case XML_REGEXP_LETTER_MODIFIER:
2413         case XML_REGEXP_LETTER_OTHERS:
2414 	    return(0);
2415         case XML_REGEXP_MARK:
2416 	    if (type2 <= XML_REGEXP_MARK_ENCLOSING)
2417 	        return(1);
2418 	    return(0);
2419         case XML_REGEXP_MARK_NONSPACING:
2420         case XML_REGEXP_MARK_SPACECOMBINING:
2421         case XML_REGEXP_MARK_ENCLOSING:
2422 	    return(0);
2423         case XML_REGEXP_NUMBER:
2424 	    if (type2 <= XML_REGEXP_NUMBER_OTHERS)
2425 	        return(1);
2426 	    return(0);
2427         case XML_REGEXP_NUMBER_DECIMAL:
2428         case XML_REGEXP_NUMBER_LETTER:
2429         case XML_REGEXP_NUMBER_OTHERS:
2430 	    return(0);
2431         case XML_REGEXP_PUNCT:
2432 	    if (type2 <= XML_REGEXP_PUNCT_OTHERS)
2433 	        return(1);
2434 	    return(0);
2435         case XML_REGEXP_PUNCT_CONNECTOR:
2436         case XML_REGEXP_PUNCT_DASH:
2437         case XML_REGEXP_PUNCT_OPEN:
2438         case XML_REGEXP_PUNCT_CLOSE:
2439         case XML_REGEXP_PUNCT_INITQUOTE:
2440         case XML_REGEXP_PUNCT_FINQUOTE:
2441         case XML_REGEXP_PUNCT_OTHERS:
2442 	    return(0);
2443         case XML_REGEXP_SEPAR:
2444 	    if (type2 <= XML_REGEXP_SEPAR_PARA)
2445 	        return(1);
2446 	    return(0);
2447         case XML_REGEXP_SEPAR_SPACE:
2448         case XML_REGEXP_SEPAR_LINE:
2449         case XML_REGEXP_SEPAR_PARA:
2450 	    return(0);
2451         case XML_REGEXP_SYMBOL:
2452 	    if (type2 <= XML_REGEXP_SYMBOL_OTHERS)
2453 	        return(1);
2454 	    return(0);
2455         case XML_REGEXP_SYMBOL_MATH:
2456         case XML_REGEXP_SYMBOL_CURRENCY:
2457         case XML_REGEXP_SYMBOL_MODIFIER:
2458         case XML_REGEXP_SYMBOL_OTHERS:
2459 	    return(0);
2460         case XML_REGEXP_OTHER:
2461 	    if (type2 <= XML_REGEXP_OTHER_NA)
2462 	        return(1);
2463 	    return(0);
2464         case XML_REGEXP_OTHER_CONTROL:
2465         case XML_REGEXP_OTHER_FORMAT:
2466         case XML_REGEXP_OTHER_PRIVATE:
2467         case XML_REGEXP_OTHER_NA:
2468 	    return(0);
2469 	default:
2470 	    break;
2471     }
2472     return(1);
2473 }
2474 
2475 /**
2476  * xmlFAEqualAtoms:
2477  * @atom1:  an atom
2478  * @atom2:  an atom
2479  * @deep: if not set only compare string pointers
2480  *
2481  * Compares two atoms to check whether they are the same exactly
2482  * this is used to remove equivalent transitions
2483  *
2484  * Returns 1 if same and 0 otherwise
2485  */
2486 static int
xmlFAEqualAtoms(xmlRegAtomPtr atom1,xmlRegAtomPtr atom2,int deep)2487 xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
2488     int ret = 0;
2489 
2490     if (atom1 == atom2)
2491 	return(1);
2492     if ((atom1 == NULL) || (atom2 == NULL))
2493 	return(0);
2494 
2495     if (atom1->type != atom2->type)
2496         return(0);
2497     switch (atom1->type) {
2498         case XML_REGEXP_EPSILON:
2499 	    ret = 0;
2500 	    break;
2501         case XML_REGEXP_STRING:
2502             if (!deep)
2503                 ret = (atom1->valuep == atom2->valuep);
2504             else
2505                 ret = xmlStrEqual((xmlChar *)atom1->valuep,
2506                                   (xmlChar *)atom2->valuep);
2507 	    break;
2508         case XML_REGEXP_CHARVAL:
2509 	    ret = (atom1->codepoint == atom2->codepoint);
2510 	    break;
2511 	case XML_REGEXP_RANGES:
2512 	    /* too hard to do in the general case */
2513 	    ret = 0;
2514 	default:
2515 	    break;
2516     }
2517     return(ret);
2518 }
2519 
2520 /**
2521  * xmlFACompareAtoms:
2522  * @atom1:  an atom
2523  * @atom2:  an atom
2524  * @deep: if not set only compare string pointers
2525  *
2526  * Compares two atoms to check whether they intersect in some ways,
2527  * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only
2528  *
2529  * Returns 1 if yes and 0 otherwise
2530  */
2531 static int
xmlFACompareAtoms(xmlRegAtomPtr atom1,xmlRegAtomPtr atom2,int deep)2532 xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
2533     int ret = 1;
2534 
2535     if (atom1 == atom2)
2536 	return(1);
2537     if ((atom1 == NULL) || (atom2 == NULL))
2538 	return(0);
2539 
2540     if ((atom1->type == XML_REGEXP_ANYCHAR) ||
2541         (atom2->type == XML_REGEXP_ANYCHAR))
2542 	return(1);
2543 
2544     if (atom1->type > atom2->type) {
2545 	xmlRegAtomPtr tmp;
2546 	tmp = atom1;
2547 	atom1 = atom2;
2548 	atom2 = tmp;
2549     }
2550     if (atom1->type != atom2->type) {
2551         ret = xmlFACompareAtomTypes(atom1->type, atom2->type);
2552 	/* if they can't intersect at the type level break now */
2553 	if (ret == 0)
2554 	    return(0);
2555     }
2556     switch (atom1->type) {
2557         case XML_REGEXP_STRING:
2558             if (!deep)
2559                 ret = (atom1->valuep != atom2->valuep);
2560             else {
2561                 xmlChar *val1 = (xmlChar *)atom1->valuep;
2562                 xmlChar *val2 = (xmlChar *)atom2->valuep;
2563                 int compound1 = (xmlStrchr(val1, '|') != NULL);
2564                 int compound2 = (xmlStrchr(val2, '|') != NULL);
2565 
2566                 /* Ignore negative match flag for ##other namespaces */
2567                 if (compound1 != compound2)
2568                     return(0);
2569 
2570                 ret = xmlRegStrEqualWildcard(val1, val2);
2571             }
2572 	    break;
2573         case XML_REGEXP_EPSILON:
2574 	    goto not_determinist;
2575         case XML_REGEXP_CHARVAL:
2576 	    if (atom2->type == XML_REGEXP_CHARVAL) {
2577 		ret = (atom1->codepoint == atom2->codepoint);
2578 	    } else {
2579 	        ret = xmlRegCheckCharacter(atom2, atom1->codepoint);
2580 		if (ret < 0)
2581 		    ret = 1;
2582 	    }
2583 	    break;
2584         case XML_REGEXP_RANGES:
2585 	    if (atom2->type == XML_REGEXP_RANGES) {
2586 	        int i, j, res;
2587 		xmlRegRangePtr r1, r2;
2588 
2589 		/*
2590 		 * need to check that none of the ranges eventually matches
2591 		 */
2592 		for (i = 0;i < atom1->nbRanges;i++) {
2593 		    for (j = 0;j < atom2->nbRanges;j++) {
2594 			r1 = atom1->ranges[i];
2595 			r2 = atom2->ranges[j];
2596 			res = xmlFACompareRanges(r1, r2);
2597 			if (res == 1) {
2598 			    ret = 1;
2599 			    goto done;
2600 			}
2601 		    }
2602 		}
2603 		ret = 0;
2604 	    }
2605 	    break;
2606 	default:
2607 	    goto not_determinist;
2608     }
2609 done:
2610     if (atom1->neg != atom2->neg) {
2611         ret = !ret;
2612     }
2613     if (ret == 0)
2614         return(0);
2615 not_determinist:
2616     return(1);
2617 }
2618 
2619 /**
2620  * xmlFARecurseDeterminism:
2621  * @ctxt:  a regexp parser context
2622  *
2623  * Check whether the associated regexp is determinist,
2624  * should be called after xmlFAEliminateEpsilonTransitions()
2625  *
2626  */
2627 static int
xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr state,int to,xmlRegAtomPtr atom)2628 xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
2629 	                 int to, xmlRegAtomPtr atom) {
2630     int ret = 1;
2631     int res;
2632     int transnr, nbTrans;
2633     xmlRegTransPtr t1;
2634     int deep = 1;
2635 
2636     if (state == NULL)
2637 	return(ret);
2638     if (state->markd == XML_REGEXP_MARK_VISITED)
2639 	return(ret);
2640 
2641     if (ctxt->flags & AM_AUTOMATA_RNG)
2642         deep = 0;
2643 
2644     /*
2645      * don't recurse on transitions potentially added in the course of
2646      * the elimination.
2647      */
2648     nbTrans = state->nbTrans;
2649     for (transnr = 0;transnr < nbTrans;transnr++) {
2650 	t1 = &(state->trans[transnr]);
2651 	/*
2652 	 * check transitions conflicting with the one looked at
2653 	 */
2654 	if (t1->atom == NULL) {
2655 	    if (t1->to < 0)
2656 		continue;
2657 	    state->markd = XML_REGEXP_MARK_VISITED;
2658 	    res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
2659 		                           to, atom);
2660 	    if (res == 0) {
2661 	        ret = 0;
2662 		/* t1->nd = 1; */
2663 	    }
2664 	    continue;
2665 	}
2666 	if (t1->to != to)
2667 	    continue;
2668 	if (xmlFACompareAtoms(t1->atom, atom, deep)) {
2669 	    ret = 0;
2670 	    /* mark the transition as non-deterministic */
2671 	    t1->nd = 1;
2672 	}
2673     }
2674     return(ret);
2675 }
2676 
2677 /**
2678  * xmlFAFinishRecurseDeterminism:
2679  * @ctxt:  a regexp parser context
2680  *
2681  * Reset flags after checking determinism.
2682  */
2683 static void
xmlFAFinishRecurseDeterminism(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr state)2684 xmlFAFinishRecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
2685     int transnr, nbTrans;
2686 
2687     if (state == NULL)
2688 	return;
2689     if (state->markd != XML_REGEXP_MARK_VISITED)
2690 	return;
2691     state->markd = 0;
2692 
2693     nbTrans = state->nbTrans;
2694     for (transnr = 0; transnr < nbTrans; transnr++) {
2695 	xmlRegTransPtr t1 = &state->trans[transnr];
2696 	if ((t1->atom == NULL) && (t1->to >= 0))
2697 	    xmlFAFinishRecurseDeterminism(ctxt, ctxt->states[t1->to]);
2698     }
2699 }
2700 
2701 /**
2702  * xmlFAComputesDeterminism:
2703  * @ctxt:  a regexp parser context
2704  *
2705  * Check whether the associated regexp is determinist,
2706  * should be called after xmlFAEliminateEpsilonTransitions()
2707  *
2708  */
2709 static int
xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt)2710 xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
2711     int statenr, transnr;
2712     xmlRegStatePtr state;
2713     xmlRegTransPtr t1, t2, last;
2714     int i;
2715     int ret = 1;
2716     int deep = 1;
2717 
2718 #ifdef DEBUG_REGEXP_GRAPH
2719     printf("xmlFAComputesDeterminism\n");
2720     xmlRegPrintCtxt(stdout, ctxt);
2721 #endif
2722     if (ctxt->determinist != -1)
2723 	return(ctxt->determinist);
2724 
2725     if (ctxt->flags & AM_AUTOMATA_RNG)
2726         deep = 0;
2727 
2728     /*
2729      * First cleanup the automata removing cancelled transitions
2730      */
2731     for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2732 	state = ctxt->states[statenr];
2733 	if (state == NULL)
2734 	    continue;
2735 	if (state->nbTrans < 2)
2736 	    continue;
2737 	for (transnr = 0;transnr < state->nbTrans;transnr++) {
2738 	    t1 = &(state->trans[transnr]);
2739 	    /*
2740 	     * Determinism checks in case of counted or all transitions
2741 	     * will have to be handled separately
2742 	     */
2743 	    if (t1->atom == NULL) {
2744 		/* t1->nd = 1; */
2745 		continue;
2746 	    }
2747 	    if (t1->to == -1) /* eliminated */
2748 		continue;
2749 	    for (i = 0;i < transnr;i++) {
2750 		t2 = &(state->trans[i]);
2751 		if (t2->to == -1) /* eliminated */
2752 		    continue;
2753 		if (t2->atom != NULL) {
2754 		    if (t1->to == t2->to) {
2755                         /*
2756                          * Here we use deep because we want to keep the
2757                          * transitions which indicate a conflict
2758                          */
2759 			if (xmlFAEqualAtoms(t1->atom, t2->atom, deep) &&
2760                             (t1->counter == t2->counter) &&
2761                             (t1->count == t2->count))
2762 			    t2->to = -1; /* eliminated */
2763 		    }
2764 		}
2765 	    }
2766 	}
2767     }
2768 
2769     /*
2770      * Check for all states that there aren't 2 transitions
2771      * with the same atom and a different target.
2772      */
2773     for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2774 	state = ctxt->states[statenr];
2775 	if (state == NULL)
2776 	    continue;
2777 	if (state->nbTrans < 2)
2778 	    continue;
2779 	last = NULL;
2780 	for (transnr = 0;transnr < state->nbTrans;transnr++) {
2781 	    t1 = &(state->trans[transnr]);
2782 	    /*
2783 	     * Determinism checks in case of counted or all transitions
2784 	     * will have to be handled separately
2785 	     */
2786 	    if (t1->atom == NULL) {
2787 		continue;
2788 	    }
2789 	    if (t1->to == -1) /* eliminated */
2790 		continue;
2791 	    for (i = 0;i < transnr;i++) {
2792 		t2 = &(state->trans[i]);
2793 		if (t2->to == -1) /* eliminated */
2794 		    continue;
2795 		if (t2->atom != NULL) {
2796                     /*
2797                      * But here we don't use deep because we want to
2798                      * find transitions which indicate a conflict
2799                      */
2800 		    if (xmlFACompareAtoms(t1->atom, t2->atom, 1)) {
2801 			ret = 0;
2802 			/* mark the transitions as non-deterministic ones */
2803 			t1->nd = 1;
2804 			t2->nd = 1;
2805 			last = t1;
2806 		    }
2807 		} else if (t1->to != -1) {
2808 		    /*
2809 		     * do the closure in case of remaining specific
2810 		     * epsilon transitions like choices or all
2811 		     */
2812 		    ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
2813 						   t2->to, t2->atom);
2814                     xmlFAFinishRecurseDeterminism(ctxt, ctxt->states[t1->to]);
2815 		    /* don't shortcut the computation so all non deterministic
2816 		       transition get marked down
2817 		    if (ret == 0)
2818 			return(0);
2819 		     */
2820 		    if (ret == 0) {
2821 			t1->nd = 1;
2822 			/* t2->nd = 1; */
2823 			last = t1;
2824 		    }
2825 		}
2826 	    }
2827 	    /* don't shortcut the computation so all non deterministic
2828 	       transition get marked down
2829 	    if (ret == 0)
2830 		break; */
2831 	}
2832 
2833 	/*
2834 	 * mark specifically the last non-deterministic transition
2835 	 * from a state since there is no need to set-up rollback
2836 	 * from it
2837 	 */
2838 	if (last != NULL) {
2839 	    last->nd = 2;
2840 	}
2841 
2842 	/* don't shortcut the computation so all non deterministic
2843 	   transition get marked down
2844 	if (ret == 0)
2845 	    break; */
2846     }
2847 
2848     ctxt->determinist = ret;
2849     return(ret);
2850 }
2851 
2852 /************************************************************************
2853  *									*
2854  *	Routines to check input against transition atoms		*
2855  *									*
2856  ************************************************************************/
2857 
2858 static int
xmlRegCheckCharacterRange(xmlRegAtomType type,int codepoint,int neg,int start,int end,const xmlChar * blockName)2859 xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg,
2860 	                  int start, int end, const xmlChar *blockName) {
2861     int ret = 0;
2862 
2863     switch (type) {
2864         case XML_REGEXP_STRING:
2865         case XML_REGEXP_SUBREG:
2866         case XML_REGEXP_RANGES:
2867         case XML_REGEXP_EPSILON:
2868 	    return(-1);
2869         case XML_REGEXP_ANYCHAR:
2870 	    ret = ((codepoint != '\n') && (codepoint != '\r'));
2871 	    break;
2872         case XML_REGEXP_CHARVAL:
2873 	    ret = ((codepoint >= start) && (codepoint <= end));
2874 	    break;
2875         case XML_REGEXP_NOTSPACE:
2876 	    neg = !neg;
2877             /* Falls through. */
2878         case XML_REGEXP_ANYSPACE:
2879 	    ret = ((codepoint == '\n') || (codepoint == '\r') ||
2880 		   (codepoint == '\t') || (codepoint == ' '));
2881 	    break;
2882         case XML_REGEXP_NOTINITNAME:
2883 	    neg = !neg;
2884             /* Falls through. */
2885         case XML_REGEXP_INITNAME:
2886 	    ret = (IS_LETTER(codepoint) ||
2887 		   (codepoint == '_') || (codepoint == ':'));
2888 	    break;
2889         case XML_REGEXP_NOTNAMECHAR:
2890 	    neg = !neg;
2891             /* Falls through. */
2892         case XML_REGEXP_NAMECHAR:
2893 	    ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) ||
2894 		   (codepoint == '.') || (codepoint == '-') ||
2895 		   (codepoint == '_') || (codepoint == ':') ||
2896 		   IS_COMBINING(codepoint) || IS_EXTENDER(codepoint));
2897 	    break;
2898         case XML_REGEXP_NOTDECIMAL:
2899 	    neg = !neg;
2900             /* Falls through. */
2901         case XML_REGEXP_DECIMAL:
2902 	    ret = xmlUCSIsCatNd(codepoint);
2903 	    break;
2904         case XML_REGEXP_REALCHAR:
2905 	    neg = !neg;
2906             /* Falls through. */
2907         case XML_REGEXP_NOTREALCHAR:
2908 	    ret = xmlUCSIsCatP(codepoint);
2909 	    if (ret == 0)
2910 		ret = xmlUCSIsCatZ(codepoint);
2911 	    if (ret == 0)
2912 		ret = xmlUCSIsCatC(codepoint);
2913 	    break;
2914         case XML_REGEXP_LETTER:
2915 	    ret = xmlUCSIsCatL(codepoint);
2916 	    break;
2917         case XML_REGEXP_LETTER_UPPERCASE:
2918 	    ret = xmlUCSIsCatLu(codepoint);
2919 	    break;
2920         case XML_REGEXP_LETTER_LOWERCASE:
2921 	    ret = xmlUCSIsCatLl(codepoint);
2922 	    break;
2923         case XML_REGEXP_LETTER_TITLECASE:
2924 	    ret = xmlUCSIsCatLt(codepoint);
2925 	    break;
2926         case XML_REGEXP_LETTER_MODIFIER:
2927 	    ret = xmlUCSIsCatLm(codepoint);
2928 	    break;
2929         case XML_REGEXP_LETTER_OTHERS:
2930 	    ret = xmlUCSIsCatLo(codepoint);
2931 	    break;
2932         case XML_REGEXP_MARK:
2933 	    ret = xmlUCSIsCatM(codepoint);
2934 	    break;
2935         case XML_REGEXP_MARK_NONSPACING:
2936 	    ret = xmlUCSIsCatMn(codepoint);
2937 	    break;
2938         case XML_REGEXP_MARK_SPACECOMBINING:
2939 	    ret = xmlUCSIsCatMc(codepoint);
2940 	    break;
2941         case XML_REGEXP_MARK_ENCLOSING:
2942 	    ret = xmlUCSIsCatMe(codepoint);
2943 	    break;
2944         case XML_REGEXP_NUMBER:
2945 	    ret = xmlUCSIsCatN(codepoint);
2946 	    break;
2947         case XML_REGEXP_NUMBER_DECIMAL:
2948 	    ret = xmlUCSIsCatNd(codepoint);
2949 	    break;
2950         case XML_REGEXP_NUMBER_LETTER:
2951 	    ret = xmlUCSIsCatNl(codepoint);
2952 	    break;
2953         case XML_REGEXP_NUMBER_OTHERS:
2954 	    ret = xmlUCSIsCatNo(codepoint);
2955 	    break;
2956         case XML_REGEXP_PUNCT:
2957 	    ret = xmlUCSIsCatP(codepoint);
2958 	    break;
2959         case XML_REGEXP_PUNCT_CONNECTOR:
2960 	    ret = xmlUCSIsCatPc(codepoint);
2961 	    break;
2962         case XML_REGEXP_PUNCT_DASH:
2963 	    ret = xmlUCSIsCatPd(codepoint);
2964 	    break;
2965         case XML_REGEXP_PUNCT_OPEN:
2966 	    ret = xmlUCSIsCatPs(codepoint);
2967 	    break;
2968         case XML_REGEXP_PUNCT_CLOSE:
2969 	    ret = xmlUCSIsCatPe(codepoint);
2970 	    break;
2971         case XML_REGEXP_PUNCT_INITQUOTE:
2972 	    ret = xmlUCSIsCatPi(codepoint);
2973 	    break;
2974         case XML_REGEXP_PUNCT_FINQUOTE:
2975 	    ret = xmlUCSIsCatPf(codepoint);
2976 	    break;
2977         case XML_REGEXP_PUNCT_OTHERS:
2978 	    ret = xmlUCSIsCatPo(codepoint);
2979 	    break;
2980         case XML_REGEXP_SEPAR:
2981 	    ret = xmlUCSIsCatZ(codepoint);
2982 	    break;
2983         case XML_REGEXP_SEPAR_SPACE:
2984 	    ret = xmlUCSIsCatZs(codepoint);
2985 	    break;
2986         case XML_REGEXP_SEPAR_LINE:
2987 	    ret = xmlUCSIsCatZl(codepoint);
2988 	    break;
2989         case XML_REGEXP_SEPAR_PARA:
2990 	    ret = xmlUCSIsCatZp(codepoint);
2991 	    break;
2992         case XML_REGEXP_SYMBOL:
2993 	    ret = xmlUCSIsCatS(codepoint);
2994 	    break;
2995         case XML_REGEXP_SYMBOL_MATH:
2996 	    ret = xmlUCSIsCatSm(codepoint);
2997 	    break;
2998         case XML_REGEXP_SYMBOL_CURRENCY:
2999 	    ret = xmlUCSIsCatSc(codepoint);
3000 	    break;
3001         case XML_REGEXP_SYMBOL_MODIFIER:
3002 	    ret = xmlUCSIsCatSk(codepoint);
3003 	    break;
3004         case XML_REGEXP_SYMBOL_OTHERS:
3005 	    ret = xmlUCSIsCatSo(codepoint);
3006 	    break;
3007         case XML_REGEXP_OTHER:
3008 	    ret = xmlUCSIsCatC(codepoint);
3009 	    break;
3010         case XML_REGEXP_OTHER_CONTROL:
3011 	    ret = xmlUCSIsCatCc(codepoint);
3012 	    break;
3013         case XML_REGEXP_OTHER_FORMAT:
3014 	    ret = xmlUCSIsCatCf(codepoint);
3015 	    break;
3016         case XML_REGEXP_OTHER_PRIVATE:
3017 	    ret = xmlUCSIsCatCo(codepoint);
3018 	    break;
3019         case XML_REGEXP_OTHER_NA:
3020 	    /* ret = xmlUCSIsCatCn(codepoint); */
3021 	    /* Seems it doesn't exist anymore in recent Unicode releases */
3022 	    ret = 0;
3023 	    break;
3024         case XML_REGEXP_BLOCK_NAME:
3025 	    ret = xmlUCSIsBlock(codepoint, (const char *) blockName);
3026 	    break;
3027     }
3028     if (neg)
3029 	return(!ret);
3030     return(ret);
3031 }
3032 
3033 static int
xmlRegCheckCharacter(xmlRegAtomPtr atom,int codepoint)3034 xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) {
3035     int i, ret = 0;
3036     xmlRegRangePtr range;
3037 
3038     if ((atom == NULL) || (!IS_CHAR(codepoint)))
3039 	return(-1);
3040 
3041     switch (atom->type) {
3042         case XML_REGEXP_SUBREG:
3043         case XML_REGEXP_EPSILON:
3044 	    return(-1);
3045         case XML_REGEXP_CHARVAL:
3046             return(codepoint == atom->codepoint);
3047         case XML_REGEXP_RANGES: {
3048 	    int accept = 0;
3049 
3050 	    for (i = 0;i < atom->nbRanges;i++) {
3051 		range = atom->ranges[i];
3052 		if (range->neg == 2) {
3053 		    ret = xmlRegCheckCharacterRange(range->type, codepoint,
3054 						0, range->start, range->end,
3055 						range->blockName);
3056 		    if (ret != 0)
3057 			return(0); /* excluded char */
3058 		} else if (range->neg) {
3059 		    ret = xmlRegCheckCharacterRange(range->type, codepoint,
3060 						0, range->start, range->end,
3061 						range->blockName);
3062 		    if (ret == 0)
3063 		        accept = 1;
3064 		    else
3065 		        return(0);
3066 		} else {
3067 		    ret = xmlRegCheckCharacterRange(range->type, codepoint,
3068 						0, range->start, range->end,
3069 						range->blockName);
3070 		    if (ret != 0)
3071 			accept = 1; /* might still be excluded */
3072 		}
3073 	    }
3074 	    return(accept);
3075 	}
3076         case XML_REGEXP_STRING:
3077 	    printf("TODO: XML_REGEXP_STRING\n");
3078 	    return(-1);
3079         case XML_REGEXP_ANYCHAR:
3080         case XML_REGEXP_ANYSPACE:
3081         case XML_REGEXP_NOTSPACE:
3082         case XML_REGEXP_INITNAME:
3083         case XML_REGEXP_NOTINITNAME:
3084         case XML_REGEXP_NAMECHAR:
3085         case XML_REGEXP_NOTNAMECHAR:
3086         case XML_REGEXP_DECIMAL:
3087         case XML_REGEXP_NOTDECIMAL:
3088         case XML_REGEXP_REALCHAR:
3089         case XML_REGEXP_NOTREALCHAR:
3090         case XML_REGEXP_LETTER:
3091         case XML_REGEXP_LETTER_UPPERCASE:
3092         case XML_REGEXP_LETTER_LOWERCASE:
3093         case XML_REGEXP_LETTER_TITLECASE:
3094         case XML_REGEXP_LETTER_MODIFIER:
3095         case XML_REGEXP_LETTER_OTHERS:
3096         case XML_REGEXP_MARK:
3097         case XML_REGEXP_MARK_NONSPACING:
3098         case XML_REGEXP_MARK_SPACECOMBINING:
3099         case XML_REGEXP_MARK_ENCLOSING:
3100         case XML_REGEXP_NUMBER:
3101         case XML_REGEXP_NUMBER_DECIMAL:
3102         case XML_REGEXP_NUMBER_LETTER:
3103         case XML_REGEXP_NUMBER_OTHERS:
3104         case XML_REGEXP_PUNCT:
3105         case XML_REGEXP_PUNCT_CONNECTOR:
3106         case XML_REGEXP_PUNCT_DASH:
3107         case XML_REGEXP_PUNCT_OPEN:
3108         case XML_REGEXP_PUNCT_CLOSE:
3109         case XML_REGEXP_PUNCT_INITQUOTE:
3110         case XML_REGEXP_PUNCT_FINQUOTE:
3111         case XML_REGEXP_PUNCT_OTHERS:
3112         case XML_REGEXP_SEPAR:
3113         case XML_REGEXP_SEPAR_SPACE:
3114         case XML_REGEXP_SEPAR_LINE:
3115         case XML_REGEXP_SEPAR_PARA:
3116         case XML_REGEXP_SYMBOL:
3117         case XML_REGEXP_SYMBOL_MATH:
3118         case XML_REGEXP_SYMBOL_CURRENCY:
3119         case XML_REGEXP_SYMBOL_MODIFIER:
3120         case XML_REGEXP_SYMBOL_OTHERS:
3121         case XML_REGEXP_OTHER:
3122         case XML_REGEXP_OTHER_CONTROL:
3123         case XML_REGEXP_OTHER_FORMAT:
3124         case XML_REGEXP_OTHER_PRIVATE:
3125         case XML_REGEXP_OTHER_NA:
3126 	case XML_REGEXP_BLOCK_NAME:
3127 	    ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0,
3128 		                            (const xmlChar *)atom->valuep);
3129 	    if (atom->neg)
3130 		ret = !ret;
3131 	    break;
3132     }
3133     return(ret);
3134 }
3135 
3136 /************************************************************************
3137  *									*
3138  *	Saving and restoring state of an execution context		*
3139  *									*
3140  ************************************************************************/
3141 
3142 #ifdef DEBUG_REGEXP_EXEC
3143 static void
xmlFARegDebugExec(xmlRegExecCtxtPtr exec)3144 xmlFARegDebugExec(xmlRegExecCtxtPtr exec) {
3145     printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index);
3146     if (exec->inputStack != NULL) {
3147 	int i;
3148 	printf(": ");
3149 	for (i = 0;(i < 3) && (i < exec->inputStackNr);i++)
3150 	    printf("%s ", (const char *)
3151 	           exec->inputStack[exec->inputStackNr - (i + 1)].value);
3152     } else {
3153 	printf(": %s", &(exec->inputString[exec->index]));
3154     }
3155     printf("\n");
3156 }
3157 #endif
3158 
3159 static void
xmlFARegExecSave(xmlRegExecCtxtPtr exec)3160 xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
3161 #ifdef DEBUG_REGEXP_EXEC
3162     printf("saving ");
3163     exec->transno++;
3164     xmlFARegDebugExec(exec);
3165     exec->transno--;
3166 #endif
3167 #ifdef MAX_PUSH
3168     if (exec->nbPush > MAX_PUSH) {
3169         return;
3170     }
3171     exec->nbPush++;
3172 #endif
3173 
3174     if (exec->maxRollbacks == 0) {
3175 	exec->maxRollbacks = 4;
3176 	exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks *
3177 		                             sizeof(xmlRegExecRollback));
3178 	if (exec->rollbacks == NULL) {
3179 	    xmlRegexpErrMemory(NULL, "saving regexp");
3180 	    exec->maxRollbacks = 0;
3181 	    return;
3182 	}
3183 	memset(exec->rollbacks, 0,
3184 	       exec->maxRollbacks * sizeof(xmlRegExecRollback));
3185     } else if (exec->nbRollbacks >= exec->maxRollbacks) {
3186 	xmlRegExecRollback *tmp;
3187 	int len = exec->maxRollbacks;
3188 
3189 	exec->maxRollbacks *= 2;
3190 	tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks,
3191 			exec->maxRollbacks * sizeof(xmlRegExecRollback));
3192 	if (tmp == NULL) {
3193 	    xmlRegexpErrMemory(NULL, "saving regexp");
3194 	    exec->maxRollbacks /= 2;
3195 	    return;
3196 	}
3197 	exec->rollbacks = tmp;
3198 	tmp = &exec->rollbacks[len];
3199 	memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback));
3200     }
3201     exec->rollbacks[exec->nbRollbacks].state = exec->state;
3202     exec->rollbacks[exec->nbRollbacks].index = exec->index;
3203     exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1;
3204     if (exec->comp->nbCounters > 0) {
3205 	if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3206 	    exec->rollbacks[exec->nbRollbacks].counts = (int *)
3207 		xmlMalloc(exec->comp->nbCounters * sizeof(int));
3208 	    if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3209 		xmlRegexpErrMemory(NULL, "saving regexp");
3210 		exec->status = -5;
3211 		return;
3212 	    }
3213 	}
3214 	memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts,
3215 	       exec->comp->nbCounters * sizeof(int));
3216     }
3217     exec->nbRollbacks++;
3218 }
3219 
3220 static void
xmlFARegExecRollBack(xmlRegExecCtxtPtr exec)3221 xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) {
3222     if (exec->nbRollbacks <= 0) {
3223 	exec->status = -1;
3224 #ifdef DEBUG_REGEXP_EXEC
3225 	printf("rollback failed on empty stack\n");
3226 #endif
3227 	return;
3228     }
3229     exec->nbRollbacks--;
3230     exec->state = exec->rollbacks[exec->nbRollbacks].state;
3231     exec->index = exec->rollbacks[exec->nbRollbacks].index;
3232     exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch;
3233     if (exec->comp->nbCounters > 0) {
3234 	if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3235 	    fprintf(stderr, "exec save: allocation failed");
3236 	    exec->status = -6;
3237 	    return;
3238 	}
3239 	if (exec->counts) {
3240 	    memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts,
3241 	       exec->comp->nbCounters * sizeof(int));
3242 	}
3243     }
3244 
3245 #ifdef DEBUG_REGEXP_EXEC
3246     printf("restored ");
3247     xmlFARegDebugExec(exec);
3248 #endif
3249 }
3250 
3251 /************************************************************************
3252  *									*
3253  *	Verifier, running an input against a compiled regexp		*
3254  *									*
3255  ************************************************************************/
3256 
3257 static int
xmlFARegExec(xmlRegexpPtr comp,const xmlChar * content)3258 xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
3259     xmlRegExecCtxt execval;
3260     xmlRegExecCtxtPtr exec = &execval;
3261     int ret, codepoint = 0, len, deter;
3262 
3263     exec->inputString = content;
3264     exec->index = 0;
3265     exec->nbPush = 0;
3266     exec->determinist = 1;
3267     exec->maxRollbacks = 0;
3268     exec->nbRollbacks = 0;
3269     exec->rollbacks = NULL;
3270     exec->status = 0;
3271     exec->comp = comp;
3272     exec->state = comp->states[0];
3273     exec->transno = 0;
3274     exec->transcount = 0;
3275     exec->inputStack = NULL;
3276     exec->inputStackMax = 0;
3277     if (comp->nbCounters > 0) {
3278 	exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
3279 	if (exec->counts == NULL) {
3280 	    xmlRegexpErrMemory(NULL, "running regexp");
3281 	    return(-1);
3282 	}
3283         memset(exec->counts, 0, comp->nbCounters * sizeof(int));
3284     } else
3285 	exec->counts = NULL;
3286     while ((exec->status == 0) && (exec->state != NULL) &&
3287 	   ((exec->inputString[exec->index] != 0) ||
3288 	    ((exec->state != NULL) &&
3289 	     (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
3290 	xmlRegTransPtr trans;
3291 	xmlRegAtomPtr atom;
3292 
3293 	/*
3294 	 * If end of input on non-terminal state, rollback, however we may
3295 	 * still have epsilon like transition for counted transitions
3296 	 * on counters, in that case don't break too early.  Additionally,
3297 	 * if we are working on a range like "AB{0,2}", where B is not present,
3298 	 * we don't want to break.
3299 	 */
3300 	len = 1;
3301 	if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) {
3302 	    /*
3303 	     * if there is a transition, we must check if
3304 	     *  atom allows minOccurs of 0
3305 	     */
3306 	    if (exec->transno < exec->state->nbTrans) {
3307 	        trans = &exec->state->trans[exec->transno];
3308 		if (trans->to >=0) {
3309 		    atom = trans->atom;
3310 		    if (!((atom->min == 0) && (atom->max > 0)))
3311 		        goto rollback;
3312 		}
3313 	    } else
3314 	        goto rollback;
3315 	}
3316 
3317 	exec->transcount = 0;
3318 	for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3319 	    trans = &exec->state->trans[exec->transno];
3320 	    if (trans->to < 0)
3321 		continue;
3322 	    atom = trans->atom;
3323 	    ret = 0;
3324 	    deter = 1;
3325 	    if (trans->count >= 0) {
3326 		int count;
3327 		xmlRegCounterPtr counter;
3328 
3329 		if (exec->counts == NULL) {
3330 		    exec->status = -1;
3331 		    goto error;
3332 		}
3333 		/*
3334 		 * A counted transition.
3335 		 */
3336 
3337 		count = exec->counts[trans->count];
3338 		counter = &exec->comp->counters[trans->count];
3339 #ifdef DEBUG_REGEXP_EXEC
3340 		printf("testing count %d: val %d, min %d, max %d\n",
3341 		       trans->count, count, counter->min,  counter->max);
3342 #endif
3343 		ret = ((count >= counter->min) && (count <= counter->max));
3344 		if ((ret) && (counter->min != counter->max))
3345 		    deter = 0;
3346 	    } else if (atom == NULL) {
3347 		fprintf(stderr, "epsilon transition left at runtime\n");
3348 		exec->status = -2;
3349 		break;
3350 	    } else if (exec->inputString[exec->index] != 0) {
3351                 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
3352 		ret = xmlRegCheckCharacter(atom, codepoint);
3353 		if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) {
3354 		    xmlRegStatePtr to = comp->states[trans->to];
3355 
3356 		    /*
3357 		     * this is a multiple input sequence
3358 		     * If there is a counter associated increment it now.
3359 		     * do not increment if the counter is already over the
3360 		     * maximum limit in which case get to next transition
3361 		     */
3362 		    if (trans->counter >= 0) {
3363 			xmlRegCounterPtr counter;
3364 
3365 			if ((exec->counts == NULL) ||
3366 			    (exec->comp == NULL) ||
3367 			    (exec->comp->counters == NULL)) {
3368 			    exec->status = -1;
3369 			    goto error;
3370 			}
3371 			counter = &exec->comp->counters[trans->counter];
3372 			if (exec->counts[trans->counter] >= counter->max)
3373 			    continue; /* for loop on transitions */
3374                     }
3375                     /* Save before incrementing */
3376 		    if (exec->state->nbTrans > exec->transno + 1) {
3377 			xmlFARegExecSave(exec);
3378 		    }
3379 		    if (trans->counter >= 0) {
3380 #ifdef DEBUG_REGEXP_EXEC
3381 			printf("Increasing count %d\n", trans->counter);
3382 #endif
3383 			exec->counts[trans->counter]++;
3384 		    }
3385 		    exec->transcount = 1;
3386 		    do {
3387 			/*
3388 			 * Try to progress as much as possible on the input
3389 			 */
3390 			if (exec->transcount == atom->max) {
3391 			    break;
3392 			}
3393 			exec->index += len;
3394 			/*
3395 			 * End of input: stop here
3396 			 */
3397 			if (exec->inputString[exec->index] == 0) {
3398 			    exec->index -= len;
3399 			    break;
3400 			}
3401 			if (exec->transcount >= atom->min) {
3402 			    int transno = exec->transno;
3403 			    xmlRegStatePtr state = exec->state;
3404 
3405 			    /*
3406 			     * The transition is acceptable save it
3407 			     */
3408 			    exec->transno = -1; /* trick */
3409 			    exec->state = to;
3410 			    xmlFARegExecSave(exec);
3411 			    exec->transno = transno;
3412 			    exec->state = state;
3413 			}
3414 			codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
3415 				              len);
3416 			ret = xmlRegCheckCharacter(atom, codepoint);
3417 			exec->transcount++;
3418 		    } while (ret == 1);
3419 		    if (exec->transcount < atom->min)
3420 			ret = 0;
3421 
3422 		    /*
3423 		     * If the last check failed but one transition was found
3424 		     * possible, rollback
3425 		     */
3426 		    if (ret < 0)
3427 			ret = 0;
3428 		    if (ret == 0) {
3429 			goto rollback;
3430 		    }
3431 		    if (trans->counter >= 0) {
3432 			if (exec->counts == NULL) {
3433 			    exec->status = -1;
3434 			    goto error;
3435 			}
3436 #ifdef DEBUG_REGEXP_EXEC
3437 			printf("Decreasing count %d\n", trans->counter);
3438 #endif
3439 			exec->counts[trans->counter]--;
3440 		    }
3441 		} else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) {
3442 		    /*
3443 		     * we don't match on the codepoint, but minOccurs of 0
3444 		     * says that's ok.  Setting len to 0 inhibits stepping
3445 		     * over the codepoint.
3446 		     */
3447 		    exec->transcount = 1;
3448 		    len = 0;
3449 		    ret = 1;
3450 		}
3451 	    } else if ((atom->min == 0) && (atom->max > 0)) {
3452 	        /* another spot to match when minOccurs is 0 */
3453 		exec->transcount = 1;
3454 		len = 0;
3455 		ret = 1;
3456 	    }
3457 	    if (ret == 1) {
3458 		if ((trans->nd == 1) ||
3459 		    ((trans->count >= 0) && (deter == 0) &&
3460 		     (exec->state->nbTrans > exec->transno + 1))) {
3461 #ifdef DEBUG_REGEXP_EXEC
3462 		    if (trans->nd == 1)
3463 		        printf("Saving on nd transition atom %d for %c at %d\n",
3464 			       trans->atom->no, codepoint, exec->index);
3465 		    else
3466 		        printf("Saving on counted transition count %d for %c at %d\n",
3467 			       trans->count, codepoint, exec->index);
3468 #endif
3469 		    xmlFARegExecSave(exec);
3470 		}
3471 		if (trans->counter >= 0) {
3472 		    xmlRegCounterPtr counter;
3473 
3474                     /* make sure we don't go over the counter maximum value */
3475 		    if ((exec->counts == NULL) ||
3476 			(exec->comp == NULL) ||
3477 			(exec->comp->counters == NULL)) {
3478 			exec->status = -1;
3479 			goto error;
3480 		    }
3481 		    counter = &exec->comp->counters[trans->counter];
3482 		    if (exec->counts[trans->counter] >= counter->max)
3483 			continue; /* for loop on transitions */
3484 #ifdef DEBUG_REGEXP_EXEC
3485 		    printf("Increasing count %d\n", trans->counter);
3486 #endif
3487 		    exec->counts[trans->counter]++;
3488 		}
3489 		if ((trans->count >= 0) &&
3490 		    (trans->count < REGEXP_ALL_COUNTER)) {
3491 		    if (exec->counts == NULL) {
3492 		        exec->status = -1;
3493 			goto error;
3494 		    }
3495 #ifdef DEBUG_REGEXP_EXEC
3496 		    printf("resetting count %d on transition\n",
3497 		           trans->count);
3498 #endif
3499 		    exec->counts[trans->count] = 0;
3500 		}
3501 #ifdef DEBUG_REGEXP_EXEC
3502 		printf("entering state %d\n", trans->to);
3503 #endif
3504 		exec->state = comp->states[trans->to];
3505 		exec->transno = 0;
3506 		if (trans->atom != NULL) {
3507 		    exec->index += len;
3508 		}
3509 		goto progress;
3510 	    } else if (ret < 0) {
3511 		exec->status = -4;
3512 		break;
3513 	    }
3514 	}
3515 	if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
3516 rollback:
3517 	    /*
3518 	     * Failed to find a way out
3519 	     */
3520 	    exec->determinist = 0;
3521 #ifdef DEBUG_REGEXP_EXEC
3522 	    printf("rollback from state %d on %d:%c\n", exec->state->no,
3523 	           codepoint,codepoint);
3524 #endif
3525 	    xmlFARegExecRollBack(exec);
3526 	}
3527 progress:
3528 	continue;
3529     }
3530 error:
3531     if (exec->rollbacks != NULL) {
3532 	if (exec->counts != NULL) {
3533 	    int i;
3534 
3535 	    for (i = 0;i < exec->maxRollbacks;i++)
3536 		if (exec->rollbacks[i].counts != NULL)
3537 		    xmlFree(exec->rollbacks[i].counts);
3538 	}
3539 	xmlFree(exec->rollbacks);
3540     }
3541     if (exec->state == NULL)
3542         return(-1);
3543     if (exec->counts != NULL)
3544 	xmlFree(exec->counts);
3545     if (exec->status == 0)
3546 	return(1);
3547     if (exec->status == -1) {
3548 	if (exec->nbPush > MAX_PUSH)
3549 	    return(-1);
3550 	return(0);
3551     }
3552     return(exec->status);
3553 }
3554 
3555 /************************************************************************
3556  *									*
3557  *	Progressive interface to the verifier one atom at a time	*
3558  *									*
3559  ************************************************************************/
3560 #ifdef DEBUG_ERR
3561 static void testerr(xmlRegExecCtxtPtr exec);
3562 #endif
3563 
3564 /**
3565  * xmlRegNewExecCtxt:
3566  * @comp: a precompiled regular expression
3567  * @callback: a callback function used for handling progresses in the
3568  *            automata matching phase
3569  * @data: the context data associated to the callback in this context
3570  *
3571  * Build a context used for progressive evaluation of a regexp.
3572  *
3573  * Returns the new context
3574  */
3575 xmlRegExecCtxtPtr
xmlRegNewExecCtxt(xmlRegexpPtr comp,xmlRegExecCallbacks callback,void * data)3576 xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) {
3577     xmlRegExecCtxtPtr exec;
3578 
3579     if (comp == NULL)
3580 	return(NULL);
3581     if ((comp->compact == NULL) && (comp->states == NULL))
3582         return(NULL);
3583     exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt));
3584     if (exec == NULL) {
3585 	xmlRegexpErrMemory(NULL, "creating execution context");
3586 	return(NULL);
3587     }
3588     memset(exec, 0, sizeof(xmlRegExecCtxt));
3589     exec->inputString = NULL;
3590     exec->index = 0;
3591     exec->determinist = 1;
3592     exec->maxRollbacks = 0;
3593     exec->nbRollbacks = 0;
3594     exec->rollbacks = NULL;
3595     exec->status = 0;
3596     exec->comp = comp;
3597     if (comp->compact == NULL)
3598 	exec->state = comp->states[0];
3599     exec->transno = 0;
3600     exec->transcount = 0;
3601     exec->callback = callback;
3602     exec->data = data;
3603     if (comp->nbCounters > 0) {
3604         /*
3605 	 * For error handling, exec->counts is allocated twice the size
3606 	 * the second half is used to store the data in case of rollback
3607 	 */
3608 	exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)
3609 	                                 * 2);
3610 	if (exec->counts == NULL) {
3611 	    xmlRegexpErrMemory(NULL, "creating execution context");
3612 	    xmlFree(exec);
3613 	    return(NULL);
3614 	}
3615         memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2);
3616 	exec->errCounts = &exec->counts[comp->nbCounters];
3617     } else {
3618 	exec->counts = NULL;
3619 	exec->errCounts = NULL;
3620     }
3621     exec->inputStackMax = 0;
3622     exec->inputStackNr = 0;
3623     exec->inputStack = NULL;
3624     exec->errStateNo = -1;
3625     exec->errString = NULL;
3626     exec->nbPush = 0;
3627     return(exec);
3628 }
3629 
3630 /**
3631  * xmlRegFreeExecCtxt:
3632  * @exec: a regular expression evaluation context
3633  *
3634  * Free the structures associated to a regular expression evaluation context.
3635  */
3636 void
xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec)3637 xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) {
3638     if (exec == NULL)
3639 	return;
3640 
3641     if (exec->rollbacks != NULL) {
3642 	if (exec->counts != NULL) {
3643 	    int i;
3644 
3645 	    for (i = 0;i < exec->maxRollbacks;i++)
3646 		if (exec->rollbacks[i].counts != NULL)
3647 		    xmlFree(exec->rollbacks[i].counts);
3648 	}
3649 	xmlFree(exec->rollbacks);
3650     }
3651     if (exec->counts != NULL)
3652 	xmlFree(exec->counts);
3653     if (exec->inputStack != NULL) {
3654 	int i;
3655 
3656 	for (i = 0;i < exec->inputStackNr;i++) {
3657 	    if (exec->inputStack[i].value != NULL)
3658 		xmlFree(exec->inputStack[i].value);
3659 	}
3660 	xmlFree(exec->inputStack);
3661     }
3662     if (exec->errString != NULL)
3663         xmlFree(exec->errString);
3664     xmlFree(exec);
3665 }
3666 
3667 static void
xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec,const xmlChar * value,void * data)3668 xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value,
3669 	                    void *data) {
3670 #ifdef DEBUG_PUSH
3671     printf("saving value: %d:%s\n", exec->inputStackNr, value);
3672 #endif
3673     if (exec->inputStackMax == 0) {
3674 	exec->inputStackMax = 4;
3675 	exec->inputStack = (xmlRegInputTokenPtr)
3676 	    xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken));
3677 	if (exec->inputStack == NULL) {
3678 	    xmlRegexpErrMemory(NULL, "pushing input string");
3679 	    exec->inputStackMax = 0;
3680 	    return;
3681 	}
3682     } else if (exec->inputStackNr + 1 >= exec->inputStackMax) {
3683 	xmlRegInputTokenPtr tmp;
3684 
3685 	exec->inputStackMax *= 2;
3686 	tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack,
3687 			exec->inputStackMax * sizeof(xmlRegInputToken));
3688 	if (tmp == NULL) {
3689 	    xmlRegexpErrMemory(NULL, "pushing input string");
3690 	    exec->inputStackMax /= 2;
3691 	    return;
3692 	}
3693 	exec->inputStack = tmp;
3694     }
3695     exec->inputStack[exec->inputStackNr].value = xmlStrdup(value);
3696     exec->inputStack[exec->inputStackNr].data = data;
3697     exec->inputStackNr++;
3698     exec->inputStack[exec->inputStackNr].value = NULL;
3699     exec->inputStack[exec->inputStackNr].data = NULL;
3700 }
3701 
3702 /**
3703  * xmlRegStrEqualWildcard:
3704  * @expStr:  the string to be evaluated
3705  * @valStr:  the validation string
3706  *
3707  * Checks if both strings are equal or have the same content. "*"
3708  * can be used as a wildcard in @valStr; "|" is used as a separator of
3709  * substrings in both @expStr and @valStr.
3710  *
3711  * Returns 1 if the comparison is satisfied and the number of substrings
3712  * is equal, 0 otherwise.
3713  */
3714 
3715 static int
xmlRegStrEqualWildcard(const xmlChar * expStr,const xmlChar * valStr)3716 xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) {
3717     if (expStr == valStr) return(1);
3718     if (expStr == NULL) return(0);
3719     if (valStr == NULL) return(0);
3720     do {
3721 	/*
3722 	* Eval if we have a wildcard for the current item.
3723 	*/
3724         if (*expStr != *valStr) {
3725 	    /* if one of them starts with a wildcard make valStr be it */
3726 	    if (*valStr == '*') {
3727 	        const xmlChar *tmp;
3728 
3729 		tmp = valStr;
3730 		valStr = expStr;
3731 		expStr = tmp;
3732 	    }
3733 	    if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) {
3734 		do {
3735 		    if (*valStr == XML_REG_STRING_SEPARATOR)
3736 			break;
3737 		    valStr++;
3738 		} while (*valStr != 0);
3739 		continue;
3740 	    } else
3741 		return(0);
3742 	}
3743 	expStr++;
3744 	valStr++;
3745     } while (*valStr != 0);
3746     if (*expStr != 0)
3747 	return (0);
3748     else
3749 	return (1);
3750 }
3751 
3752 /**
3753  * xmlRegCompactPushString:
3754  * @exec: a regexp execution context
3755  * @comp:  the precompiled exec with a compact table
3756  * @value: a string token input
3757  * @data: data associated to the token to reuse in callbacks
3758  *
3759  * Push one input token in the execution context
3760  *
3761  * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3762  *     a negative value in case of error.
3763  */
3764 static int
xmlRegCompactPushString(xmlRegExecCtxtPtr exec,xmlRegexpPtr comp,const xmlChar * value,void * data)3765 xmlRegCompactPushString(xmlRegExecCtxtPtr exec,
3766 	                xmlRegexpPtr comp,
3767 	                const xmlChar *value,
3768 	                void *data) {
3769     int state = exec->index;
3770     int i, target;
3771 
3772     if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL))
3773 	return(-1);
3774 
3775     if (value == NULL) {
3776 	/*
3777 	 * are we at a final state ?
3778 	 */
3779 	if (comp->compact[state * (comp->nbstrings + 1)] ==
3780             XML_REGEXP_FINAL_STATE)
3781 	    return(1);
3782 	return(0);
3783     }
3784 
3785 #ifdef DEBUG_PUSH
3786     printf("value pushed: %s\n", value);
3787 #endif
3788 
3789     /*
3790      * Examine all outside transitions from current state
3791      */
3792     for (i = 0;i < comp->nbstrings;i++) {
3793 	target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
3794 	if ((target > 0) && (target <= comp->nbstates)) {
3795 	    target--; /* to avoid 0 */
3796 	    if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) {
3797 		exec->index = target;
3798 		if ((exec->callback != NULL) && (comp->transdata != NULL)) {
3799 		    exec->callback(exec->data, value,
3800 			  comp->transdata[state * comp->nbstrings + i], data);
3801 		}
3802 #ifdef DEBUG_PUSH
3803 		printf("entering state %d\n", target);
3804 #endif
3805 		if (comp->compact[target * (comp->nbstrings + 1)] ==
3806 		    XML_REGEXP_SINK_STATE)
3807 		    goto error;
3808 
3809 		if (comp->compact[target * (comp->nbstrings + 1)] ==
3810 		    XML_REGEXP_FINAL_STATE)
3811 		    return(1);
3812 		return(0);
3813 	    }
3814 	}
3815     }
3816     /*
3817      * Failed to find an exit transition out from current state for the
3818      * current token
3819      */
3820 #ifdef DEBUG_PUSH
3821     printf("failed to find a transition for %s on state %d\n", value, state);
3822 #endif
3823 error:
3824     if (exec->errString != NULL)
3825         xmlFree(exec->errString);
3826     exec->errString = xmlStrdup(value);
3827     exec->errStateNo = state;
3828     exec->status = -1;
3829 #ifdef DEBUG_ERR
3830     testerr(exec);
3831 #endif
3832     return(-1);
3833 }
3834 
3835 /**
3836  * xmlRegExecPushStringInternal:
3837  * @exec: a regexp execution context or NULL to indicate the end
3838  * @value: a string token input
3839  * @data: data associated to the token to reuse in callbacks
3840  * @compound: value was assembled from 2 strings
3841  *
3842  * Push one input token in the execution context
3843  *
3844  * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3845  *     a negative value in case of error.
3846  */
3847 static int
xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec,const xmlChar * value,void * data,int compound)3848 xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value,
3849 	                     void *data, int compound) {
3850     xmlRegTransPtr trans;
3851     xmlRegAtomPtr atom;
3852     int ret;
3853     int final = 0;
3854     int progress = 1;
3855 
3856     if (exec == NULL)
3857 	return(-1);
3858     if (exec->comp == NULL)
3859 	return(-1);
3860     if (exec->status != 0)
3861 	return(exec->status);
3862 
3863     if (exec->comp->compact != NULL)
3864 	return(xmlRegCompactPushString(exec, exec->comp, value, data));
3865 
3866     if (value == NULL) {
3867         if (exec->state->type == XML_REGEXP_FINAL_STATE)
3868 	    return(1);
3869 	final = 1;
3870     }
3871 
3872 #ifdef DEBUG_PUSH
3873     printf("value pushed: %s\n", value);
3874 #endif
3875     /*
3876      * If we have an active rollback stack push the new value there
3877      * and get back to where we were left
3878      */
3879     if ((value != NULL) && (exec->inputStackNr > 0)) {
3880 	xmlFARegExecSaveInputString(exec, value, data);
3881 	value = exec->inputStack[exec->index].value;
3882 	data = exec->inputStack[exec->index].data;
3883 #ifdef DEBUG_PUSH
3884 	printf("value loaded: %s\n", value);
3885 #endif
3886     }
3887 
3888     while ((exec->status == 0) &&
3889 	   ((value != NULL) ||
3890 	    ((final == 1) &&
3891 	     (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
3892 
3893 	/*
3894 	 * End of input on non-terminal state, rollback, however we may
3895 	 * still have epsilon like transition for counted transitions
3896 	 * on counters, in that case don't break too early.
3897 	 */
3898 	if ((value == NULL) && (exec->counts == NULL))
3899 	    goto rollback;
3900 
3901 	exec->transcount = 0;
3902 	for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3903 	    trans = &exec->state->trans[exec->transno];
3904 	    if (trans->to < 0)
3905 		continue;
3906 	    atom = trans->atom;
3907 	    ret = 0;
3908 	    if (trans->count == REGEXP_ALL_LAX_COUNTER) {
3909 		int i;
3910 		int count;
3911 		xmlRegTransPtr t;
3912 		xmlRegCounterPtr counter;
3913 
3914 		ret = 0;
3915 
3916 #ifdef DEBUG_PUSH
3917 		printf("testing all lax %d\n", trans->count);
3918 #endif
3919 		/*
3920 		 * Check all counted transitions from the current state
3921 		 */
3922 		if ((value == NULL) && (final)) {
3923 		    ret = 1;
3924 		} else if (value != NULL) {
3925 		    for (i = 0;i < exec->state->nbTrans;i++) {
3926 			t = &exec->state->trans[i];
3927 			if ((t->counter < 0) || (t == trans))
3928 			    continue;
3929 			counter = &exec->comp->counters[t->counter];
3930 			count = exec->counts[t->counter];
3931 			if ((count < counter->max) &&
3932 		            (t->atom != NULL) &&
3933 			    (xmlStrEqual(value, t->atom->valuep))) {
3934 			    ret = 0;
3935 			    break;
3936 			}
3937 			if ((count >= counter->min) &&
3938 			    (count < counter->max) &&
3939 			    (t->atom != NULL) &&
3940 			    (xmlStrEqual(value, t->atom->valuep))) {
3941 			    ret = 1;
3942 			    break;
3943 			}
3944 		    }
3945 		}
3946 	    } else if (trans->count == REGEXP_ALL_COUNTER) {
3947 		int i;
3948 		int count;
3949 		xmlRegTransPtr t;
3950 		xmlRegCounterPtr counter;
3951 
3952 		ret = 1;
3953 
3954 #ifdef DEBUG_PUSH
3955 		printf("testing all %d\n", trans->count);
3956 #endif
3957 		/*
3958 		 * Check all counted transitions from the current state
3959 		 */
3960 		for (i = 0;i < exec->state->nbTrans;i++) {
3961                     t = &exec->state->trans[i];
3962 		    if ((t->counter < 0) || (t == trans))
3963 			continue;
3964                     counter = &exec->comp->counters[t->counter];
3965 		    count = exec->counts[t->counter];
3966 		    if ((count < counter->min) || (count > counter->max)) {
3967 			ret = 0;
3968 			break;
3969 		    }
3970 		}
3971 	    } else if (trans->count >= 0) {
3972 		int count;
3973 		xmlRegCounterPtr counter;
3974 
3975 		/*
3976 		 * A counted transition.
3977 		 */
3978 
3979 		count = exec->counts[trans->count];
3980 		counter = &exec->comp->counters[trans->count];
3981 #ifdef DEBUG_PUSH
3982 		printf("testing count %d: val %d, min %d, max %d\n",
3983 		       trans->count, count, counter->min,  counter->max);
3984 #endif
3985 		ret = ((count >= counter->min) && (count <= counter->max));
3986 	    } else if (atom == NULL) {
3987 		fprintf(stderr, "epsilon transition left at runtime\n");
3988 		exec->status = -2;
3989 		break;
3990 	    } else if (value != NULL) {
3991 		ret = xmlRegStrEqualWildcard(atom->valuep, value);
3992 		if (atom->neg) {
3993 		    ret = !ret;
3994 		    if (!compound)
3995 		        ret = 0;
3996 		}
3997 		if ((ret == 1) && (trans->counter >= 0)) {
3998 		    xmlRegCounterPtr counter;
3999 		    int count;
4000 
4001 		    count = exec->counts[trans->counter];
4002 		    counter = &exec->comp->counters[trans->counter];
4003 		    if (count >= counter->max)
4004 			ret = 0;
4005 		}
4006 
4007 		if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
4008 		    xmlRegStatePtr to = exec->comp->states[trans->to];
4009 
4010 		    /*
4011 		     * this is a multiple input sequence
4012 		     */
4013 		    if (exec->state->nbTrans > exec->transno + 1) {
4014 			if (exec->inputStackNr <= 0) {
4015 			    xmlFARegExecSaveInputString(exec, value, data);
4016 			}
4017 			xmlFARegExecSave(exec);
4018 		    }
4019 		    exec->transcount = 1;
4020 		    do {
4021 			/*
4022 			 * Try to progress as much as possible on the input
4023 			 */
4024 			if (exec->transcount == atom->max) {
4025 			    break;
4026 			}
4027 			exec->index++;
4028 			value = exec->inputStack[exec->index].value;
4029 			data = exec->inputStack[exec->index].data;
4030 #ifdef DEBUG_PUSH
4031 			printf("value loaded: %s\n", value);
4032 #endif
4033 
4034 			/*
4035 			 * End of input: stop here
4036 			 */
4037 			if (value == NULL) {
4038 			    exec->index --;
4039 			    break;
4040 			}
4041 			if (exec->transcount >= atom->min) {
4042 			    int transno = exec->transno;
4043 			    xmlRegStatePtr state = exec->state;
4044 
4045 			    /*
4046 			     * The transition is acceptable save it
4047 			     */
4048 			    exec->transno = -1; /* trick */
4049 			    exec->state = to;
4050 			    if (exec->inputStackNr <= 0) {
4051 				xmlFARegExecSaveInputString(exec, value, data);
4052 			    }
4053 			    xmlFARegExecSave(exec);
4054 			    exec->transno = transno;
4055 			    exec->state = state;
4056 			}
4057 			ret = xmlStrEqual(value, atom->valuep);
4058 			exec->transcount++;
4059 		    } while (ret == 1);
4060 		    if (exec->transcount < atom->min)
4061 			ret = 0;
4062 
4063 		    /*
4064 		     * If the last check failed but one transition was found
4065 		     * possible, rollback
4066 		     */
4067 		    if (ret < 0)
4068 			ret = 0;
4069 		    if (ret == 0) {
4070 			goto rollback;
4071 		    }
4072 		}
4073 	    }
4074 	    if (ret == 1) {
4075 		if ((exec->callback != NULL) && (atom != NULL) &&
4076 			(data != NULL)) {
4077 		    exec->callback(exec->data, atom->valuep,
4078 			           atom->data, data);
4079 		}
4080 		if (exec->state->nbTrans > exec->transno + 1) {
4081 		    if (exec->inputStackNr <= 0) {
4082 			xmlFARegExecSaveInputString(exec, value, data);
4083 		    }
4084 		    xmlFARegExecSave(exec);
4085 		}
4086 		if (trans->counter >= 0) {
4087 #ifdef DEBUG_PUSH
4088 		    printf("Increasing count %d\n", trans->counter);
4089 #endif
4090 		    exec->counts[trans->counter]++;
4091 		}
4092 		if ((trans->count >= 0) &&
4093 		    (trans->count < REGEXP_ALL_COUNTER)) {
4094 #ifdef DEBUG_REGEXP_EXEC
4095 		    printf("resetting count %d on transition\n",
4096 		           trans->count);
4097 #endif
4098 		    exec->counts[trans->count] = 0;
4099 		}
4100 #ifdef DEBUG_PUSH
4101 		printf("entering state %d\n", trans->to);
4102 #endif
4103                 if ((exec->comp->states[trans->to] != NULL) &&
4104 		    (exec->comp->states[trans->to]->type ==
4105 		     XML_REGEXP_SINK_STATE)) {
4106 		    /*
4107 		     * entering a sink state, save the current state as error
4108 		     * state.
4109 		     */
4110 		    if (exec->errString != NULL)
4111 			xmlFree(exec->errString);
4112 		    exec->errString = xmlStrdup(value);
4113 		    exec->errState = exec->state;
4114 		    memcpy(exec->errCounts, exec->counts,
4115 			   exec->comp->nbCounters * sizeof(int));
4116 		}
4117 		exec->state = exec->comp->states[trans->to];
4118 		exec->transno = 0;
4119 		if (trans->atom != NULL) {
4120 		    if (exec->inputStack != NULL) {
4121 			exec->index++;
4122 			if (exec->index < exec->inputStackNr) {
4123 			    value = exec->inputStack[exec->index].value;
4124 			    data = exec->inputStack[exec->index].data;
4125 #ifdef DEBUG_PUSH
4126 			    printf("value loaded: %s\n", value);
4127 #endif
4128 			} else {
4129 			    value = NULL;
4130 			    data = NULL;
4131 #ifdef DEBUG_PUSH
4132 			    printf("end of input\n");
4133 #endif
4134 			}
4135 		    } else {
4136 			value = NULL;
4137 			data = NULL;
4138 #ifdef DEBUG_PUSH
4139 			printf("end of input\n");
4140 #endif
4141 		    }
4142 		}
4143 		goto progress;
4144 	    } else if (ret < 0) {
4145 		exec->status = -4;
4146 		break;
4147 	    }
4148 	}
4149 	if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
4150 rollback:
4151             /*
4152 	     * if we didn't yet rollback on the current input
4153 	     * store the current state as the error state.
4154 	     */
4155 	    if ((progress) && (exec->state != NULL) &&
4156 	        (exec->state->type != XML_REGEXP_SINK_STATE)) {
4157 	        progress = 0;
4158 		if (exec->errString != NULL)
4159 		    xmlFree(exec->errString);
4160 		exec->errString = xmlStrdup(value);
4161 		exec->errState = exec->state;
4162                 if (exec->comp->nbCounters)
4163                     memcpy(exec->errCounts, exec->counts,
4164                            exec->comp->nbCounters * sizeof(int));
4165 	    }
4166 
4167 	    /*
4168 	     * Failed to find a way out
4169 	     */
4170 	    exec->determinist = 0;
4171 	    xmlFARegExecRollBack(exec);
4172 	    if ((exec->inputStack != NULL ) && (exec->status == 0)) {
4173 		value = exec->inputStack[exec->index].value;
4174 		data = exec->inputStack[exec->index].data;
4175 #ifdef DEBUG_PUSH
4176 		printf("value loaded: %s\n", value);
4177 #endif
4178 	    }
4179 	}
4180 	continue;
4181 progress:
4182         progress = 1;
4183 	continue;
4184     }
4185     if (exec->status == 0) {
4186         return(exec->state->type == XML_REGEXP_FINAL_STATE);
4187     }
4188 #ifdef DEBUG_ERR
4189     if (exec->status < 0) {
4190 	testerr(exec);
4191     }
4192 #endif
4193     return(exec->status);
4194 }
4195 
4196 /**
4197  * xmlRegExecPushString:
4198  * @exec: a regexp execution context or NULL to indicate the end
4199  * @value: a string token input
4200  * @data: data associated to the token to reuse in callbacks
4201  *
4202  * Push one input token in the execution context
4203  *
4204  * Returns: 1 if the regexp reached a final state, 0 if non-final, and
4205  *     a negative value in case of error.
4206  */
4207 int
xmlRegExecPushString(xmlRegExecCtxtPtr exec,const xmlChar * value,void * data)4208 xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value,
4209 	             void *data) {
4210     return(xmlRegExecPushStringInternal(exec, value, data, 0));
4211 }
4212 
4213 /**
4214  * xmlRegExecPushString2:
4215  * @exec: a regexp execution context or NULL to indicate the end
4216  * @value: the first string token input
4217  * @value2: the second string token input
4218  * @data: data associated to the token to reuse in callbacks
4219  *
4220  * Push one input token in the execution context
4221  *
4222  * Returns: 1 if the regexp reached a final state, 0 if non-final, and
4223  *     a negative value in case of error.
4224  */
4225 int
xmlRegExecPushString2(xmlRegExecCtxtPtr exec,const xmlChar * value,const xmlChar * value2,void * data)4226 xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value,
4227                       const xmlChar *value2, void *data) {
4228     xmlChar buf[150];
4229     int lenn, lenp, ret;
4230     xmlChar *str;
4231 
4232     if (exec == NULL)
4233 	return(-1);
4234     if (exec->comp == NULL)
4235 	return(-1);
4236     if (exec->status != 0)
4237 	return(exec->status);
4238 
4239     if (value2 == NULL)
4240         return(xmlRegExecPushString(exec, value, data));
4241 
4242     lenn = strlen((char *) value2);
4243     lenp = strlen((char *) value);
4244 
4245     if (150 < lenn + lenp + 2) {
4246 	str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
4247 	if (str == NULL) {
4248 	    exec->status = -1;
4249 	    return(-1);
4250 	}
4251     } else {
4252 	str = buf;
4253     }
4254     memcpy(&str[0], value, lenp);
4255     str[lenp] = XML_REG_STRING_SEPARATOR;
4256     memcpy(&str[lenp + 1], value2, lenn);
4257     str[lenn + lenp + 1] = 0;
4258 
4259     if (exec->comp->compact != NULL)
4260 	ret = xmlRegCompactPushString(exec, exec->comp, str, data);
4261     else
4262         ret = xmlRegExecPushStringInternal(exec, str, data, 1);
4263 
4264     if (str != buf)
4265         xmlFree(str);
4266     return(ret);
4267 }
4268 
4269 /**
4270  * xmlRegExecGetValues:
4271  * @exec: a regexp execution context
4272  * @err: error extraction or normal one
4273  * @nbval: pointer to the number of accepted values IN/OUT
4274  * @nbneg: return number of negative transitions
4275  * @values: pointer to the array of acceptable values
4276  * @terminal: return value if this was a terminal state
4277  *
4278  * Extract information from the regexp execution, internal routine to
4279  * implement xmlRegExecNextValues() and xmlRegExecErrInfo()
4280  *
4281  * Returns: 0 in case of success or -1 in case of error.
4282  */
4283 static int
xmlRegExecGetValues(xmlRegExecCtxtPtr exec,int err,int * nbval,int * nbneg,xmlChar ** values,int * terminal)4284 xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err,
4285                     int *nbval, int *nbneg,
4286 		    xmlChar **values, int *terminal) {
4287     int maxval;
4288     int nb = 0;
4289 
4290     if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) ||
4291         (values == NULL) || (*nbval <= 0))
4292         return(-1);
4293 
4294     maxval = *nbval;
4295     *nbval = 0;
4296     *nbneg = 0;
4297     if ((exec->comp != NULL) && (exec->comp->compact != NULL)) {
4298         xmlRegexpPtr comp;
4299 	int target, i, state;
4300 
4301         comp = exec->comp;
4302 
4303 	if (err) {
4304 	    if (exec->errStateNo == -1) return(-1);
4305 	    state = exec->errStateNo;
4306 	} else {
4307 	    state = exec->index;
4308 	}
4309 	if (terminal != NULL) {
4310 	    if (comp->compact[state * (comp->nbstrings + 1)] ==
4311 	        XML_REGEXP_FINAL_STATE)
4312 		*terminal = 1;
4313 	    else
4314 		*terminal = 0;
4315 	}
4316 	for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
4317 	    target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
4318 	    if ((target > 0) && (target <= comp->nbstates) &&
4319 	        (comp->compact[(target - 1) * (comp->nbstrings + 1)] !=
4320 		 XML_REGEXP_SINK_STATE)) {
4321 	        values[nb++] = comp->stringMap[i];
4322 		(*nbval)++;
4323 	    }
4324 	}
4325 	for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
4326 	    target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
4327 	    if ((target > 0) && (target <= comp->nbstates) &&
4328 	        (comp->compact[(target - 1) * (comp->nbstrings + 1)] ==
4329 		 XML_REGEXP_SINK_STATE)) {
4330 	        values[nb++] = comp->stringMap[i];
4331 		(*nbneg)++;
4332 	    }
4333 	}
4334     } else {
4335         int transno;
4336 	xmlRegTransPtr trans;
4337 	xmlRegAtomPtr atom;
4338 	xmlRegStatePtr state;
4339 
4340 	if (terminal != NULL) {
4341 	    if (exec->state->type == XML_REGEXP_FINAL_STATE)
4342 		*terminal = 1;
4343 	    else
4344 		*terminal = 0;
4345 	}
4346 
4347 	if (err) {
4348 	    if (exec->errState == NULL) return(-1);
4349 	    state = exec->errState;
4350 	} else {
4351 	    if (exec->state == NULL) return(-1);
4352 	    state = exec->state;
4353 	}
4354 	for (transno = 0;
4355 	     (transno < state->nbTrans) && (nb < maxval);
4356 	     transno++) {
4357 	    trans = &state->trans[transno];
4358 	    if (trans->to < 0)
4359 		continue;
4360 	    atom = trans->atom;
4361 	    if ((atom == NULL) || (atom->valuep == NULL))
4362 		continue;
4363 	    if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4364 	        /* this should not be reached but ... */
4365 	        TODO;
4366 	    } else if (trans->count == REGEXP_ALL_COUNTER) {
4367 	        /* this should not be reached but ... */
4368 	        TODO;
4369 	    } else if (trans->counter >= 0) {
4370 		xmlRegCounterPtr counter = NULL;
4371 		int count;
4372 
4373 		if (err)
4374 		    count = exec->errCounts[trans->counter];
4375 		else
4376 		    count = exec->counts[trans->counter];
4377 		if (exec->comp != NULL)
4378 		    counter = &exec->comp->counters[trans->counter];
4379 		if ((counter == NULL) || (count < counter->max)) {
4380 		    if (atom->neg)
4381 			values[nb++] = (xmlChar *) atom->valuep2;
4382 		    else
4383 			values[nb++] = (xmlChar *) atom->valuep;
4384 		    (*nbval)++;
4385 		}
4386 	    } else {
4387                 if ((exec->comp != NULL) && (exec->comp->states[trans->to] != NULL) &&
4388 		    (exec->comp->states[trans->to]->type !=
4389 		     XML_REGEXP_SINK_STATE)) {
4390 		    if (atom->neg)
4391 			values[nb++] = (xmlChar *) atom->valuep2;
4392 		    else
4393 			values[nb++] = (xmlChar *) atom->valuep;
4394 		    (*nbval)++;
4395 		}
4396 	    }
4397 	}
4398 	for (transno = 0;
4399 	     (transno < state->nbTrans) && (nb < maxval);
4400 	     transno++) {
4401 	    trans = &state->trans[transno];
4402 	    if (trans->to < 0)
4403 		continue;
4404 	    atom = trans->atom;
4405 	    if ((atom == NULL) || (atom->valuep == NULL))
4406 		continue;
4407 	    if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4408 	        continue;
4409 	    } else if (trans->count == REGEXP_ALL_COUNTER) {
4410 	        continue;
4411 	    } else if (trans->counter >= 0) {
4412 	        continue;
4413 	    } else {
4414                 if ((exec->comp->states[trans->to] != NULL) &&
4415 		    (exec->comp->states[trans->to]->type ==
4416 		     XML_REGEXP_SINK_STATE)) {
4417 		    if (atom->neg)
4418 			values[nb++] = (xmlChar *) atom->valuep2;
4419 		    else
4420 			values[nb++] = (xmlChar *) atom->valuep;
4421 		    (*nbneg)++;
4422 		}
4423 	    }
4424 	}
4425     }
4426     return(0);
4427 }
4428 
4429 /**
4430  * xmlRegExecNextValues:
4431  * @exec: a regexp execution context
4432  * @nbval: pointer to the number of accepted values IN/OUT
4433  * @nbneg: return number of negative transitions
4434  * @values: pointer to the array of acceptable values
4435  * @terminal: return value if this was a terminal state
4436  *
4437  * Extract information from the regexp execution,
4438  * the parameter @values must point to an array of @nbval string pointers
4439  * on return nbval will contain the number of possible strings in that
4440  * state and the @values array will be updated with them. The string values
4441  * returned will be freed with the @exec context and don't need to be
4442  * deallocated.
4443  *
4444  * Returns: 0 in case of success or -1 in case of error.
4445  */
4446 int
xmlRegExecNextValues(xmlRegExecCtxtPtr exec,int * nbval,int * nbneg,xmlChar ** values,int * terminal)4447 xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg,
4448                      xmlChar **values, int *terminal) {
4449     return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal));
4450 }
4451 
4452 /**
4453  * xmlRegExecErrInfo:
4454  * @exec: a regexp execution context generating an error
4455  * @string: return value for the error string
4456  * @nbval: pointer to the number of accepted values IN/OUT
4457  * @nbneg: return number of negative transitions
4458  * @values: pointer to the array of acceptable values
4459  * @terminal: return value if this was a terminal state
4460  *
4461  * Extract error information from the regexp execution, the parameter
4462  * @string will be updated with the value pushed and not accepted,
4463  * the parameter @values must point to an array of @nbval string pointers
4464  * on return nbval will contain the number of possible strings in that
4465  * state and the @values array will be updated with them. The string values
4466  * returned will be freed with the @exec context and don't need to be
4467  * deallocated.
4468  *
4469  * Returns: 0 in case of success or -1 in case of error.
4470  */
4471 int
xmlRegExecErrInfo(xmlRegExecCtxtPtr exec,const xmlChar ** string,int * nbval,int * nbneg,xmlChar ** values,int * terminal)4472 xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string,
4473                   int *nbval, int *nbneg, xmlChar **values, int *terminal) {
4474     if (exec == NULL)
4475         return(-1);
4476     if (string != NULL) {
4477         if (exec->status != 0)
4478 	    *string = exec->errString;
4479 	else
4480 	    *string = NULL;
4481     }
4482     return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal));
4483 }
4484 
4485 #ifdef DEBUG_ERR
testerr(xmlRegExecCtxtPtr exec)4486 static void testerr(xmlRegExecCtxtPtr exec) {
4487     const xmlChar *string;
4488     xmlChar *values[5];
4489     int nb = 5;
4490     int nbneg;
4491     int terminal;
4492     xmlRegExecErrInfo(exec, &string, &nb, &nbneg, &values[0], &terminal);
4493 }
4494 #endif
4495 
4496 #if 0
4497 static int
4498 xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) {
4499     xmlRegTransPtr trans;
4500     xmlRegAtomPtr atom;
4501     int ret;
4502     int codepoint, len;
4503 
4504     if (exec == NULL)
4505 	return(-1);
4506     if (exec->status != 0)
4507 	return(exec->status);
4508 
4509     while ((exec->status == 0) &&
4510 	   ((exec->inputString[exec->index] != 0) ||
4511 	    (exec->state->type != XML_REGEXP_FINAL_STATE))) {
4512 
4513 	/*
4514 	 * End of input on non-terminal state, rollback, however we may
4515 	 * still have epsilon like transition for counted transitions
4516 	 * on counters, in that case don't break too early.
4517 	 */
4518 	if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
4519 	    goto rollback;
4520 
4521 	exec->transcount = 0;
4522 	for (;exec->transno < exec->state->nbTrans;exec->transno++) {
4523 	    trans = &exec->state->trans[exec->transno];
4524 	    if (trans->to < 0)
4525 		continue;
4526 	    atom = trans->atom;
4527 	    ret = 0;
4528 	    if (trans->count >= 0) {
4529 		int count;
4530 		xmlRegCounterPtr counter;
4531 
4532 		/*
4533 		 * A counted transition.
4534 		 */
4535 
4536 		count = exec->counts[trans->count];
4537 		counter = &exec->comp->counters[trans->count];
4538 #ifdef DEBUG_REGEXP_EXEC
4539 		printf("testing count %d: val %d, min %d, max %d\n",
4540 		       trans->count, count, counter->min,  counter->max);
4541 #endif
4542 		ret = ((count >= counter->min) && (count <= counter->max));
4543 	    } else if (atom == NULL) {
4544 		fprintf(stderr, "epsilon transition left at runtime\n");
4545 		exec->status = -2;
4546 		break;
4547 	    } else if (exec->inputString[exec->index] != 0) {
4548                 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
4549 		ret = xmlRegCheckCharacter(atom, codepoint);
4550 		if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
4551 		    xmlRegStatePtr to = exec->comp->states[trans->to];
4552 
4553 		    /*
4554 		     * this is a multiple input sequence
4555 		     */
4556 		    if (exec->state->nbTrans > exec->transno + 1) {
4557 			xmlFARegExecSave(exec);
4558 		    }
4559 		    exec->transcount = 1;
4560 		    do {
4561 			/*
4562 			 * Try to progress as much as possible on the input
4563 			 */
4564 			if (exec->transcount == atom->max) {
4565 			    break;
4566 			}
4567 			exec->index += len;
4568 			/*
4569 			 * End of input: stop here
4570 			 */
4571 			if (exec->inputString[exec->index] == 0) {
4572 			    exec->index -= len;
4573 			    break;
4574 			}
4575 			if (exec->transcount >= atom->min) {
4576 			    int transno = exec->transno;
4577 			    xmlRegStatePtr state = exec->state;
4578 
4579 			    /*
4580 			     * The transition is acceptable save it
4581 			     */
4582 			    exec->transno = -1; /* trick */
4583 			    exec->state = to;
4584 			    xmlFARegExecSave(exec);
4585 			    exec->transno = transno;
4586 			    exec->state = state;
4587 			}
4588 			codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
4589 				              len);
4590 			ret = xmlRegCheckCharacter(atom, codepoint);
4591 			exec->transcount++;
4592 		    } while (ret == 1);
4593 		    if (exec->transcount < atom->min)
4594 			ret = 0;
4595 
4596 		    /*
4597 		     * If the last check failed but one transition was found
4598 		     * possible, rollback
4599 		     */
4600 		    if (ret < 0)
4601 			ret = 0;
4602 		    if (ret == 0) {
4603 			goto rollback;
4604 		    }
4605 		}
4606 	    }
4607 	    if (ret == 1) {
4608 		if (exec->state->nbTrans > exec->transno + 1) {
4609 		    xmlFARegExecSave(exec);
4610 		}
4611 		/*
4612 		 * restart count for expressions like this ((abc){2})*
4613 		 */
4614 		if (trans->count >= 0) {
4615 #ifdef DEBUG_REGEXP_EXEC
4616 		    printf("Reset count %d\n", trans->count);
4617 #endif
4618 		    exec->counts[trans->count] = 0;
4619 		}
4620 		if (trans->counter >= 0) {
4621 #ifdef DEBUG_REGEXP_EXEC
4622 		    printf("Increasing count %d\n", trans->counter);
4623 #endif
4624 		    exec->counts[trans->counter]++;
4625 		}
4626 #ifdef DEBUG_REGEXP_EXEC
4627 		printf("entering state %d\n", trans->to);
4628 #endif
4629 		exec->state = exec->comp->states[trans->to];
4630 		exec->transno = 0;
4631 		if (trans->atom != NULL) {
4632 		    exec->index += len;
4633 		}
4634 		goto progress;
4635 	    } else if (ret < 0) {
4636 		exec->status = -4;
4637 		break;
4638 	    }
4639 	}
4640 	if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
4641 rollback:
4642 	    /*
4643 	     * Failed to find a way out
4644 	     */
4645 	    exec->determinist = 0;
4646 	    xmlFARegExecRollBack(exec);
4647 	}
4648 progress:
4649 	continue;
4650     }
4651 }
4652 #endif
4653 /************************************************************************
4654  *									*
4655  *	Parser for the Schemas Datatype Regular Expressions		*
4656  *	http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs	*
4657  *									*
4658  ************************************************************************/
4659 
4660 /**
4661  * xmlFAIsChar:
4662  * @ctxt:  a regexp parser context
4663  *
4664  * [10]   Char   ::=   [^.\?*+()|#x5B#x5D]
4665  */
4666 static int
xmlFAIsChar(xmlRegParserCtxtPtr ctxt)4667 xmlFAIsChar(xmlRegParserCtxtPtr ctxt) {
4668     int cur;
4669     int len;
4670 
4671     cur = CUR_SCHAR(ctxt->cur, len);
4672     if ((cur == '.') || (cur == '\\') || (cur == '?') ||
4673 	(cur == '*') || (cur == '+') || (cur == '(') ||
4674 	(cur == ')') || (cur == '|') || (cur == 0x5B) ||
4675 	(cur == 0x5D) || (cur == 0))
4676 	return(-1);
4677     return(cur);
4678 }
4679 
4680 /**
4681  * xmlFAParseCharProp:
4682  * @ctxt:  a regexp parser context
4683  *
4684  * [27]   charProp   ::=   IsCategory | IsBlock
4685  * [28]   IsCategory ::= Letters | Marks | Numbers | Punctuation |
4686  *                       Separators | Symbols | Others
4687  * [29]   Letters   ::=   'L' [ultmo]?
4688  * [30]   Marks   ::=   'M' [nce]?
4689  * [31]   Numbers   ::=   'N' [dlo]?
4690  * [32]   Punctuation   ::=   'P' [cdseifo]?
4691  * [33]   Separators   ::=   'Z' [slp]?
4692  * [34]   Symbols   ::=   'S' [mcko]?
4693  * [35]   Others   ::=   'C' [cfon]?
4694  * [36]   IsBlock   ::=   'Is' [a-zA-Z0-9#x2D]+
4695  */
4696 static void
xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt)4697 xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) {
4698     int cur;
4699     xmlRegAtomType type = (xmlRegAtomType) 0;
4700     xmlChar *blockName = NULL;
4701 
4702     cur = CUR;
4703     if (cur == 'L') {
4704 	NEXT;
4705 	cur = CUR;
4706 	if (cur == 'u') {
4707 	    NEXT;
4708 	    type = XML_REGEXP_LETTER_UPPERCASE;
4709 	} else if (cur == 'l') {
4710 	    NEXT;
4711 	    type = XML_REGEXP_LETTER_LOWERCASE;
4712 	} else if (cur == 't') {
4713 	    NEXT;
4714 	    type = XML_REGEXP_LETTER_TITLECASE;
4715 	} else if (cur == 'm') {
4716 	    NEXT;
4717 	    type = XML_REGEXP_LETTER_MODIFIER;
4718 	} else if (cur == 'o') {
4719 	    NEXT;
4720 	    type = XML_REGEXP_LETTER_OTHERS;
4721 	} else {
4722 	    type = XML_REGEXP_LETTER;
4723 	}
4724     } else if (cur == 'M') {
4725 	NEXT;
4726 	cur = CUR;
4727 	if (cur == 'n') {
4728 	    NEXT;
4729 	    /* nonspacing */
4730 	    type = XML_REGEXP_MARK_NONSPACING;
4731 	} else if (cur == 'c') {
4732 	    NEXT;
4733 	    /* spacing combining */
4734 	    type = XML_REGEXP_MARK_SPACECOMBINING;
4735 	} else if (cur == 'e') {
4736 	    NEXT;
4737 	    /* enclosing */
4738 	    type = XML_REGEXP_MARK_ENCLOSING;
4739 	} else {
4740 	    /* all marks */
4741 	    type = XML_REGEXP_MARK;
4742 	}
4743     } else if (cur == 'N') {
4744 	NEXT;
4745 	cur = CUR;
4746 	if (cur == 'd') {
4747 	    NEXT;
4748 	    /* digital */
4749 	    type = XML_REGEXP_NUMBER_DECIMAL;
4750 	} else if (cur == 'l') {
4751 	    NEXT;
4752 	    /* letter */
4753 	    type = XML_REGEXP_NUMBER_LETTER;
4754 	} else if (cur == 'o') {
4755 	    NEXT;
4756 	    /* other */
4757 	    type = XML_REGEXP_NUMBER_OTHERS;
4758 	} else {
4759 	    /* all numbers */
4760 	    type = XML_REGEXP_NUMBER;
4761 	}
4762     } else if (cur == 'P') {
4763 	NEXT;
4764 	cur = CUR;
4765 	if (cur == 'c') {
4766 	    NEXT;
4767 	    /* connector */
4768 	    type = XML_REGEXP_PUNCT_CONNECTOR;
4769 	} else if (cur == 'd') {
4770 	    NEXT;
4771 	    /* dash */
4772 	    type = XML_REGEXP_PUNCT_DASH;
4773 	} else if (cur == 's') {
4774 	    NEXT;
4775 	    /* open */
4776 	    type = XML_REGEXP_PUNCT_OPEN;
4777 	} else if (cur == 'e') {
4778 	    NEXT;
4779 	    /* close */
4780 	    type = XML_REGEXP_PUNCT_CLOSE;
4781 	} else if (cur == 'i') {
4782 	    NEXT;
4783 	    /* initial quote */
4784 	    type = XML_REGEXP_PUNCT_INITQUOTE;
4785 	} else if (cur == 'f') {
4786 	    NEXT;
4787 	    /* final quote */
4788 	    type = XML_REGEXP_PUNCT_FINQUOTE;
4789 	} else if (cur == 'o') {
4790 	    NEXT;
4791 	    /* other */
4792 	    type = XML_REGEXP_PUNCT_OTHERS;
4793 	} else {
4794 	    /* all punctuation */
4795 	    type = XML_REGEXP_PUNCT;
4796 	}
4797     } else if (cur == 'Z') {
4798 	NEXT;
4799 	cur = CUR;
4800 	if (cur == 's') {
4801 	    NEXT;
4802 	    /* space */
4803 	    type = XML_REGEXP_SEPAR_SPACE;
4804 	} else if (cur == 'l') {
4805 	    NEXT;
4806 	    /* line */
4807 	    type = XML_REGEXP_SEPAR_LINE;
4808 	} else if (cur == 'p') {
4809 	    NEXT;
4810 	    /* paragraph */
4811 	    type = XML_REGEXP_SEPAR_PARA;
4812 	} else {
4813 	    /* all separators */
4814 	    type = XML_REGEXP_SEPAR;
4815 	}
4816     } else if (cur == 'S') {
4817 	NEXT;
4818 	cur = CUR;
4819 	if (cur == 'm') {
4820 	    NEXT;
4821 	    type = XML_REGEXP_SYMBOL_MATH;
4822 	    /* math */
4823 	} else if (cur == 'c') {
4824 	    NEXT;
4825 	    type = XML_REGEXP_SYMBOL_CURRENCY;
4826 	    /* currency */
4827 	} else if (cur == 'k') {
4828 	    NEXT;
4829 	    type = XML_REGEXP_SYMBOL_MODIFIER;
4830 	    /* modifiers */
4831 	} else if (cur == 'o') {
4832 	    NEXT;
4833 	    type = XML_REGEXP_SYMBOL_OTHERS;
4834 	    /* other */
4835 	} else {
4836 	    /* all symbols */
4837 	    type = XML_REGEXP_SYMBOL;
4838 	}
4839     } else if (cur == 'C') {
4840 	NEXT;
4841 	cur = CUR;
4842 	if (cur == 'c') {
4843 	    NEXT;
4844 	    /* control */
4845 	    type = XML_REGEXP_OTHER_CONTROL;
4846 	} else if (cur == 'f') {
4847 	    NEXT;
4848 	    /* format */
4849 	    type = XML_REGEXP_OTHER_FORMAT;
4850 	} else if (cur == 'o') {
4851 	    NEXT;
4852 	    /* private use */
4853 	    type = XML_REGEXP_OTHER_PRIVATE;
4854 	} else if (cur == 'n') {
4855 	    NEXT;
4856 	    /* not assigned */
4857 	    type = XML_REGEXP_OTHER_NA;
4858 	} else {
4859 	    /* all others */
4860 	    type = XML_REGEXP_OTHER;
4861 	}
4862     } else if (cur == 'I') {
4863 	const xmlChar *start;
4864 	NEXT;
4865 	cur = CUR;
4866 	if (cur != 's') {
4867 	    ERROR("IsXXXX expected");
4868 	    return;
4869 	}
4870 	NEXT;
4871 	start = ctxt->cur;
4872 	cur = CUR;
4873 	if (((cur >= 'a') && (cur <= 'z')) ||
4874 	    ((cur >= 'A') && (cur <= 'Z')) ||
4875 	    ((cur >= '0') && (cur <= '9')) ||
4876 	    (cur == 0x2D)) {
4877 	    NEXT;
4878 	    cur = CUR;
4879 	    while (((cur >= 'a') && (cur <= 'z')) ||
4880 		((cur >= 'A') && (cur <= 'Z')) ||
4881 		((cur >= '0') && (cur <= '9')) ||
4882 		(cur == 0x2D)) {
4883 		NEXT;
4884 		cur = CUR;
4885 	    }
4886 	}
4887 	type = XML_REGEXP_BLOCK_NAME;
4888 	blockName = xmlStrndup(start, ctxt->cur - start);
4889     } else {
4890 	ERROR("Unknown char property");
4891 	return;
4892     }
4893     if (ctxt->atom == NULL) {
4894 	ctxt->atom = xmlRegNewAtom(ctxt, type);
4895 	if (ctxt->atom != NULL)
4896 	    ctxt->atom->valuep = blockName;
4897     } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4898         xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4899 		           type, 0, 0, blockName);
4900     }
4901 }
4902 
parse_escaped_codeunit(xmlRegParserCtxtPtr ctxt)4903 static int parse_escaped_codeunit(xmlRegParserCtxtPtr ctxt)
4904 {
4905     int val = 0, i, cur;
4906     for (i = 0; i < 4; i++) {
4907 	NEXT;
4908 	val *= 16;
4909 	cur = CUR;
4910 	if (cur >= '0' && cur <= '9') {
4911 	    val += cur - '0';
4912 	} else if (cur >= 'A' && cur <= 'F') {
4913 	    val += cur - 'A' + 10;
4914 	} else if (cur >= 'a' && cur <= 'f') {
4915 	    val += cur - 'a' + 10;
4916 	} else {
4917 	    ERROR("Expecting hex digit");
4918 	    return -1;
4919 	}
4920     }
4921     return val;
4922 }
4923 
parse_escaped_codepoint(xmlRegParserCtxtPtr ctxt)4924 static int parse_escaped_codepoint(xmlRegParserCtxtPtr ctxt)
4925 {
4926     int val = parse_escaped_codeunit(ctxt);
4927     if (0xD800 <= val && val <= 0xDBFF) {
4928 	NEXT;
4929 	if (CUR == '\\') {
4930 	    NEXT;
4931 	    if (CUR == 'u') {
4932 		int low = parse_escaped_codeunit(ctxt);
4933 		if (0xDC00 <= low && low <= 0xDFFF) {
4934 		    return (val - 0xD800) * 0x400 + (low - 0xDC00) + 0x10000;
4935 		}
4936 	    }
4937 	}
4938 	ERROR("Invalid low surrogate pair code unit");
4939 	val = -1;
4940     }
4941     return val;
4942 }
4943 
4944 /**
4945  * xmlFAParseCharClassEsc:
4946  * @ctxt:  a regexp parser context
4947  *
4948  * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc )
4949  * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E]
4950  * [25] catEsc   ::=   '\p{' charProp '}'
4951  * [26] complEsc ::=   '\P{' charProp '}'
4952  * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW])
4953  */
4954 static void
xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt)4955 xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
4956     int cur;
4957 
4958     if (CUR == '.') {
4959 	if (ctxt->atom == NULL) {
4960 	    ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR);
4961 	} else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4962 	    xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4963 			       XML_REGEXP_ANYCHAR, 0, 0, NULL);
4964 	}
4965 	NEXT;
4966 	return;
4967     }
4968     if (CUR != '\\') {
4969 	ERROR("Escaped sequence: expecting \\");
4970 	return;
4971     }
4972     NEXT;
4973     cur = CUR;
4974     if (cur == 'p') {
4975 	NEXT;
4976 	if (CUR != '{') {
4977 	    ERROR("Expecting '{'");
4978 	    return;
4979 	}
4980 	NEXT;
4981 	xmlFAParseCharProp(ctxt);
4982 	if (CUR != '}') {
4983 	    ERROR("Expecting '}'");
4984 	    return;
4985 	}
4986 	NEXT;
4987     } else if (cur == 'P') {
4988 	NEXT;
4989 	if (CUR != '{') {
4990 	    ERROR("Expecting '{'");
4991 	    return;
4992 	}
4993 	NEXT;
4994 	xmlFAParseCharProp(ctxt);
4995         if (ctxt->atom != NULL)
4996 	    ctxt->atom->neg = 1;
4997 	if (CUR != '}') {
4998 	    ERROR("Expecting '}'");
4999 	    return;
5000 	}
5001 	NEXT;
5002     } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') ||
5003 	(cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') ||
5004 	(cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') ||
5005 	(cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) ||
5006 	(cur == 0x5E) ||
5007 
5008 	/* Non-standard escape sequences:
5009 	 *                  Java 1.8|.NET Core 3.1|MSXML 6 */
5010 	(cur == '!') ||     /*   +  |     +       |    +   */
5011 	(cur == '"') ||     /*   +  |     +       |    +   */
5012 	(cur == '#') ||     /*   +  |     +       |    +   */
5013 	(cur == '$') ||     /*   +  |     +       |    +   */
5014 	(cur == '%') ||     /*   +  |     +       |    +   */
5015 	(cur == ',') ||     /*   +  |     +       |    +   */
5016 	(cur == '/') ||     /*   +  |     +       |    +   */
5017 	(cur == ':') ||     /*   +  |     +       |    +   */
5018 	(cur == ';') ||     /*   +  |     +       |    +   */
5019 	(cur == '=') ||     /*   +  |     +       |    +   */
5020 	(cur == '>') ||     /*      |     +       |    +   */
5021 	(cur == '@') ||     /*   +  |     +       |    +   */
5022 	(cur == '`') ||     /*   +  |     +       |    +   */
5023 	(cur == '~') ||     /*   +  |     +       |    +   */
5024 	(cur == 'u')) {     /*      |     +       |    +   */
5025 	if (ctxt->atom == NULL) {
5026 	    ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
5027 	    if (ctxt->atom != NULL) {
5028 	        switch (cur) {
5029 		    case 'n':
5030 		        ctxt->atom->codepoint = '\n';
5031 			break;
5032 		    case 'r':
5033 		        ctxt->atom->codepoint = '\r';
5034 			break;
5035 		    case 't':
5036 		        ctxt->atom->codepoint = '\t';
5037 			break;
5038 		    case 'u':
5039 			cur = parse_escaped_codepoint(ctxt);
5040 			if (cur < 0) {
5041 			    return;
5042 			}
5043 			ctxt->atom->codepoint = cur;
5044 			break;
5045 		    default:
5046 			ctxt->atom->codepoint = cur;
5047 		}
5048 	    }
5049 	} else if (ctxt->atom->type == XML_REGEXP_RANGES) {
5050             switch (cur) {
5051                 case 'n':
5052                     cur = '\n';
5053                     break;
5054                 case 'r':
5055                     cur = '\r';
5056                     break;
5057                 case 't':
5058                     cur = '\t';
5059                     break;
5060             }
5061 	    xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5062 			       XML_REGEXP_CHARVAL, cur, cur, NULL);
5063 	}
5064 	NEXT;
5065     } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') ||
5066 	(cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') ||
5067 	(cur == 'w') || (cur == 'W')) {
5068 	xmlRegAtomType type = XML_REGEXP_ANYSPACE;
5069 
5070 	switch (cur) {
5071 	    case 's':
5072 		type = XML_REGEXP_ANYSPACE;
5073 		break;
5074 	    case 'S':
5075 		type = XML_REGEXP_NOTSPACE;
5076 		break;
5077 	    case 'i':
5078 		type = XML_REGEXP_INITNAME;
5079 		break;
5080 	    case 'I':
5081 		type = XML_REGEXP_NOTINITNAME;
5082 		break;
5083 	    case 'c':
5084 		type = XML_REGEXP_NAMECHAR;
5085 		break;
5086 	    case 'C':
5087 		type = XML_REGEXP_NOTNAMECHAR;
5088 		break;
5089 	    case 'd':
5090 		type = XML_REGEXP_DECIMAL;
5091 		break;
5092 	    case 'D':
5093 		type = XML_REGEXP_NOTDECIMAL;
5094 		break;
5095 	    case 'w':
5096 		type = XML_REGEXP_REALCHAR;
5097 		break;
5098 	    case 'W':
5099 		type = XML_REGEXP_NOTREALCHAR;
5100 		break;
5101 	}
5102 	NEXT;
5103 	if (ctxt->atom == NULL) {
5104 	    ctxt->atom = xmlRegNewAtom(ctxt, type);
5105 	} else if (ctxt->atom->type == XML_REGEXP_RANGES) {
5106 	    xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5107 			       type, 0, 0, NULL);
5108 	}
5109     } else {
5110 	ERROR("Wrong escape sequence, misuse of character '\\'");
5111     }
5112 }
5113 
5114 /**
5115  * xmlFAParseCharRange:
5116  * @ctxt:  a regexp parser context
5117  *
5118  * [17]   charRange   ::=     seRange | XmlCharRef | XmlCharIncDash
5119  * [18]   seRange   ::=   charOrEsc '-' charOrEsc
5120  * [20]   charOrEsc   ::=   XmlChar | SingleCharEsc
5121  * [21]   XmlChar   ::=   [^\#x2D#x5B#x5D]
5122  * [22]   XmlCharIncDash   ::=   [^\#x5B#x5D]
5123  */
5124 static void
xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt)5125 xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) {
5126     int cur, len;
5127     int start = -1;
5128     int end = -1;
5129 
5130     if (CUR == '\0') {
5131         ERROR("Expecting ']'");
5132 	return;
5133     }
5134 
5135     cur = CUR;
5136     if (cur == '\\') {
5137 	NEXT;
5138 	cur = CUR;
5139 	switch (cur) {
5140 	    case 'n': start = 0xA; break;
5141 	    case 'r': start = 0xD; break;
5142 	    case 't': start = 0x9; break;
5143 	    case '\\': case '|': case '.': case '-': case '^': case '?':
5144 	    case '*': case '+': case '{': case '}': case '(': case ')':
5145 	    case '[': case ']':
5146 		start = cur; break;
5147 	    default:
5148 		ERROR("Invalid escape value");
5149 		return;
5150 	}
5151 	end = start;
5152         len = 1;
5153     } else if ((cur != 0x5B) && (cur != 0x5D)) {
5154         end = start = CUR_SCHAR(ctxt->cur, len);
5155     } else {
5156 	ERROR("Expecting a char range");
5157 	return;
5158     }
5159     /*
5160      * Since we are "inside" a range, we can assume ctxt->cur is past
5161      * the start of ctxt->string, and PREV should be safe
5162      */
5163     if ((start == '-') && (NXT(1) != ']') && (PREV != '[') && (PREV != '^')) {
5164 	NEXTL(len);
5165 	return;
5166     }
5167     NEXTL(len);
5168     cur = CUR;
5169     if ((cur != '-') || (NXT(1) == '[') || (NXT(1) == ']')) {
5170         xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5171 		              XML_REGEXP_CHARVAL, start, end, NULL);
5172 	return;
5173     }
5174     NEXT;
5175     cur = CUR;
5176     if (cur == '\\') {
5177 	NEXT;
5178 	cur = CUR;
5179 	switch (cur) {
5180 	    case 'n': end = 0xA; break;
5181 	    case 'r': end = 0xD; break;
5182 	    case 't': end = 0x9; break;
5183 	    case '\\': case '|': case '.': case '-': case '^': case '?':
5184 	    case '*': case '+': case '{': case '}': case '(': case ')':
5185 	    case '[': case ']':
5186 		end = cur; break;
5187 	    default:
5188 		ERROR("Invalid escape value");
5189 		return;
5190 	}
5191         len = 1;
5192     } else if ((cur != '\0') && (cur != 0x5B) && (cur != 0x5D)) {
5193         end = CUR_SCHAR(ctxt->cur, len);
5194     } else {
5195 	ERROR("Expecting the end of a char range");
5196 	return;
5197     }
5198 
5199     /* TODO check that the values are acceptable character ranges for XML */
5200     if (end < start) {
5201 	ERROR("End of range is before start of range");
5202     } else {
5203         NEXTL(len);
5204         xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5205 		           XML_REGEXP_CHARVAL, start, end, NULL);
5206     }
5207     return;
5208 }
5209 
5210 /**
5211  * xmlFAParsePosCharGroup:
5212  * @ctxt:  a regexp parser context
5213  *
5214  * [14]   posCharGroup ::= ( charRange | charClassEsc  )+
5215  */
5216 static void
xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt)5217 xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) {
5218     do {
5219 	if (CUR == '\\') {
5220 	    xmlFAParseCharClassEsc(ctxt);
5221 	} else {
5222 	    xmlFAParseCharRange(ctxt);
5223 	}
5224     } while ((CUR != ']') && (CUR != '-') &&
5225              (CUR != 0) && (ctxt->error == 0));
5226 }
5227 
5228 /**
5229  * xmlFAParseCharGroup:
5230  * @ctxt:  a regexp parser context
5231  *
5232  * [13]   charGroup    ::= posCharGroup | negCharGroup | charClassSub
5233  * [15]   negCharGroup ::= '^' posCharGroup
5234  * [16]   charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr
5235  * [12]   charClassExpr ::= '[' charGroup ']'
5236  */
5237 static void
xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt)5238 xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) {
5239     int neg = ctxt->neg;
5240 
5241     if (CUR == '^') {
5242 	NEXT;
5243 	ctxt->neg = !ctxt->neg;
5244 	xmlFAParsePosCharGroup(ctxt);
5245 	ctxt->neg = neg;
5246     }
5247     while ((CUR != ']') && (ctxt->error == 0)) {
5248 	if ((CUR == '-') && (NXT(1) == '[')) {
5249 	    NEXT;	/* eat the '-' */
5250 	    NEXT;	/* eat the '[' */
5251 	    ctxt->neg = 2;
5252 	    xmlFAParseCharGroup(ctxt);
5253 	    ctxt->neg = neg;
5254 	    if (CUR == ']') {
5255 		NEXT;
5256 	    } else {
5257 		ERROR("charClassExpr: ']' expected");
5258 	    }
5259 	    break;
5260 	} else {
5261 	    xmlFAParsePosCharGroup(ctxt);
5262 	}
5263     }
5264 }
5265 
5266 /**
5267  * xmlFAParseCharClass:
5268  * @ctxt:  a regexp parser context
5269  *
5270  * [11]   charClass   ::=     charClassEsc | charClassExpr
5271  * [12]   charClassExpr   ::=   '[' charGroup ']'
5272  */
5273 static void
xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt)5274 xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) {
5275     if (CUR == '[') {
5276 	NEXT;
5277 	ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES);
5278 	if (ctxt->atom == NULL)
5279 	    return;
5280 	xmlFAParseCharGroup(ctxt);
5281 	if (CUR == ']') {
5282 	    NEXT;
5283 	} else {
5284 	    ERROR("xmlFAParseCharClass: ']' expected");
5285 	}
5286     } else {
5287 	xmlFAParseCharClassEsc(ctxt);
5288     }
5289 }
5290 
5291 /**
5292  * xmlFAParseQuantExact:
5293  * @ctxt:  a regexp parser context
5294  *
5295  * [8]   QuantExact   ::=   [0-9]+
5296  *
5297  * Returns 0 if success or -1 in case of error
5298  */
5299 static int
xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt)5300 xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) {
5301     int ret = 0;
5302     int ok = 0;
5303     int overflow = 0;
5304 
5305     while ((CUR >= '0') && (CUR <= '9')) {
5306         if (ret > INT_MAX / 10) {
5307             overflow = 1;
5308         } else {
5309             int digit = CUR - '0';
5310 
5311             ret *= 10;
5312             if (ret > INT_MAX - digit)
5313                 overflow = 1;
5314             else
5315                 ret += digit;
5316         }
5317 	ok = 1;
5318 	NEXT;
5319     }
5320     if ((ok != 1) || (overflow == 1)) {
5321 	return(-1);
5322     }
5323     return(ret);
5324 }
5325 
5326 /**
5327  * xmlFAParseQuantifier:
5328  * @ctxt:  a regexp parser context
5329  *
5330  * [4]   quantifier   ::=   [?*+] | ( '{' quantity '}' )
5331  * [5]   quantity   ::=   quantRange | quantMin | QuantExact
5332  * [6]   quantRange   ::=   QuantExact ',' QuantExact
5333  * [7]   quantMin   ::=   QuantExact ','
5334  * [8]   QuantExact   ::=   [0-9]+
5335  */
5336 static int
xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt)5337 xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) {
5338     int cur;
5339 
5340     cur = CUR;
5341     if ((cur == '?') || (cur == '*') || (cur == '+')) {
5342 	if (ctxt->atom != NULL) {
5343 	    if (cur == '?')
5344 		ctxt->atom->quant = XML_REGEXP_QUANT_OPT;
5345 	    else if (cur == '*')
5346 		ctxt->atom->quant = XML_REGEXP_QUANT_MULT;
5347 	    else if (cur == '+')
5348 		ctxt->atom->quant = XML_REGEXP_QUANT_PLUS;
5349 	}
5350 	NEXT;
5351 	return(1);
5352     }
5353     if (cur == '{') {
5354 	int min = 0, max = 0;
5355 
5356 	NEXT;
5357 	cur = xmlFAParseQuantExact(ctxt);
5358 	if (cur >= 0)
5359 	    min = cur;
5360         else {
5361             ERROR("Improper quantifier");
5362         }
5363 	if (CUR == ',') {
5364 	    NEXT;
5365 	    if (CUR == '}')
5366 	        max = INT_MAX;
5367 	    else {
5368 	        cur = xmlFAParseQuantExact(ctxt);
5369 	        if (cur >= 0)
5370 		    max = cur;
5371 		else {
5372 		    ERROR("Improper quantifier");
5373 		}
5374 	    }
5375 	}
5376 	if (CUR == '}') {
5377 	    NEXT;
5378 	} else {
5379 	    ERROR("Unterminated quantifier");
5380 	}
5381 	if (max == 0)
5382 	    max = min;
5383 	if (ctxt->atom != NULL) {
5384 	    ctxt->atom->quant = XML_REGEXP_QUANT_RANGE;
5385 	    ctxt->atom->min = min;
5386 	    ctxt->atom->max = max;
5387 	}
5388 	return(1);
5389     }
5390     return(0);
5391 }
5392 
5393 /**
5394  * xmlFAParseAtom:
5395  * @ctxt:  a regexp parser context
5396  *
5397  * [9]   atom   ::=   Char | charClass | ( '(' regExp ')' )
5398  */
5399 static int
xmlFAParseAtom(xmlRegParserCtxtPtr ctxt)5400 xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) {
5401     int codepoint, len;
5402 
5403     codepoint = xmlFAIsChar(ctxt);
5404     if (codepoint > 0) {
5405 	ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
5406 	if (ctxt->atom == NULL)
5407 	    return(-1);
5408 	codepoint = CUR_SCHAR(ctxt->cur, len);
5409 	ctxt->atom->codepoint = codepoint;
5410 	NEXTL(len);
5411 	return(1);
5412     } else if (CUR == '|') {
5413 	return(0);
5414     } else if (CUR == 0) {
5415 	return(0);
5416     } else if (CUR == ')') {
5417 	return(0);
5418     } else if (CUR == '(') {
5419 	xmlRegStatePtr start, oldend, start0;
5420 
5421 	NEXT;
5422         if (ctxt->depth >= 50) {
5423 	    ERROR("xmlFAParseAtom: maximum nesting depth exceeded");
5424             return(-1);
5425         }
5426 	/*
5427 	 * this extra Epsilon transition is needed if we count with 0 allowed
5428 	 * unfortunately this can't be known at that point
5429 	 */
5430 	xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5431 	start0 = ctxt->state;
5432 	xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5433 	start = ctxt->state;
5434 	oldend = ctxt->end;
5435 	ctxt->end = NULL;
5436 	ctxt->atom = NULL;
5437         ctxt->depth++;
5438 	xmlFAParseRegExp(ctxt, 0);
5439         ctxt->depth--;
5440 	if (CUR == ')') {
5441 	    NEXT;
5442 	} else {
5443 	    ERROR("xmlFAParseAtom: expecting ')'");
5444 	}
5445 	ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG);
5446 	if (ctxt->atom == NULL)
5447 	    return(-1);
5448 	ctxt->atom->start = start;
5449 	ctxt->atom->start0 = start0;
5450 	ctxt->atom->stop = ctxt->state;
5451 	ctxt->end = oldend;
5452 	return(1);
5453     } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) {
5454 	xmlFAParseCharClass(ctxt);
5455 	return(1);
5456     }
5457     return(0);
5458 }
5459 
5460 /**
5461  * xmlFAParsePiece:
5462  * @ctxt:  a regexp parser context
5463  *
5464  * [3]   piece   ::=   atom quantifier?
5465  */
5466 static int
xmlFAParsePiece(xmlRegParserCtxtPtr ctxt)5467 xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) {
5468     int ret;
5469 
5470     ctxt->atom = NULL;
5471     ret = xmlFAParseAtom(ctxt);
5472     if (ret == 0)
5473 	return(0);
5474     if (ctxt->atom == NULL) {
5475 	ERROR("internal: no atom generated");
5476     }
5477     xmlFAParseQuantifier(ctxt);
5478     return(1);
5479 }
5480 
5481 /**
5482  * xmlFAParseBranch:
5483  * @ctxt:  a regexp parser context
5484  * @to: optional target to the end of the branch
5485  *
5486  * @to is used to optimize by removing duplicate path in automata
5487  * in expressions like (a|b)(c|d)
5488  *
5489  * [2]   branch   ::=   piece*
5490  */
5491 static int
xmlFAParseBranch(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr to)5492 xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) {
5493     xmlRegStatePtr previous;
5494     int ret;
5495 
5496     previous = ctxt->state;
5497     ret = xmlFAParsePiece(ctxt);
5498     if (ret == 0) {
5499         /* Empty branch */
5500 	xmlFAGenerateEpsilonTransition(ctxt, previous, to);
5501     } else {
5502 	if (xmlFAGenerateTransitions(ctxt, previous,
5503 	        (CUR=='|' || CUR==')' || CUR==0) ? to : NULL, ctxt->atom) < 0)
5504 	    return(-1);
5505 	previous = ctxt->state;
5506 	ctxt->atom = NULL;
5507     }
5508     while ((ret != 0) && (ctxt->error == 0)) {
5509 	ret = xmlFAParsePiece(ctxt);
5510 	if (ret != 0) {
5511 	    if (xmlFAGenerateTransitions(ctxt, previous,
5512 	            (CUR=='|' || CUR==')' || CUR==0) ? to : NULL,
5513                     ctxt->atom) < 0)
5514 		    return(-1);
5515 	    previous = ctxt->state;
5516 	    ctxt->atom = NULL;
5517 	}
5518     }
5519     return(0);
5520 }
5521 
5522 /**
5523  * xmlFAParseRegExp:
5524  * @ctxt:  a regexp parser context
5525  * @top:  is this the top-level expression ?
5526  *
5527  * [1]   regExp   ::=     branch  ( '|' branch )*
5528  */
5529 static void
xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt,int top)5530 xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) {
5531     xmlRegStatePtr start, end;
5532 
5533     /* if not top start should have been generated by an epsilon trans */
5534     start = ctxt->state;
5535     ctxt->end = NULL;
5536     xmlFAParseBranch(ctxt, NULL);
5537     if (top) {
5538 #ifdef DEBUG_REGEXP_GRAPH
5539 	printf("State %d is final\n", ctxt->state->no);
5540 #endif
5541 	ctxt->state->type = XML_REGEXP_FINAL_STATE;
5542     }
5543     if (CUR != '|') {
5544 	ctxt->end = ctxt->state;
5545 	return;
5546     }
5547     end = ctxt->state;
5548     while ((CUR == '|') && (ctxt->error == 0)) {
5549 	NEXT;
5550 	ctxt->state = start;
5551 	ctxt->end = NULL;
5552 	xmlFAParseBranch(ctxt, end);
5553     }
5554     if (!top) {
5555 	ctxt->state = end;
5556 	ctxt->end = end;
5557     }
5558 }
5559 
5560 /************************************************************************
5561  *									*
5562  *			The basic API					*
5563  *									*
5564  ************************************************************************/
5565 
5566 /**
5567  * xmlRegexpPrint:
5568  * @output: the file for the output debug
5569  * @regexp: the compiled regexp
5570  *
5571  * Print the content of the compiled regular expression
5572  */
5573 void
xmlRegexpPrint(FILE * output,xmlRegexpPtr regexp)5574 xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) {
5575     int i;
5576 
5577     if (output == NULL)
5578         return;
5579     fprintf(output, " regexp: ");
5580     if (regexp == NULL) {
5581 	fprintf(output, "NULL\n");
5582 	return;
5583     }
5584     fprintf(output, "'%s' ", regexp->string);
5585     fprintf(output, "\n");
5586     fprintf(output, "%d atoms:\n", regexp->nbAtoms);
5587     for (i = 0;i < regexp->nbAtoms; i++) {
5588 	fprintf(output, " %02d ", i);
5589 	xmlRegPrintAtom(output, regexp->atoms[i]);
5590     }
5591     fprintf(output, "%d states:", regexp->nbStates);
5592     fprintf(output, "\n");
5593     for (i = 0;i < regexp->nbStates; i++) {
5594 	xmlRegPrintState(output, regexp->states[i]);
5595     }
5596     fprintf(output, "%d counters:\n", regexp->nbCounters);
5597     for (i = 0;i < regexp->nbCounters; i++) {
5598 	fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min,
5599 		                                regexp->counters[i].max);
5600     }
5601 }
5602 
5603 /**
5604  * xmlRegexpCompile:
5605  * @regexp:  a regular expression string
5606  *
5607  * Parses a regular expression conforming to XML Schemas Part 2 Datatype
5608  * Appendix F and builds an automata suitable for testing strings against
5609  * that regular expression
5610  *
5611  * Returns the compiled expression or NULL in case of error
5612  */
5613 xmlRegexpPtr
xmlRegexpCompile(const xmlChar * regexp)5614 xmlRegexpCompile(const xmlChar *regexp) {
5615     xmlRegexpPtr ret;
5616     xmlRegParserCtxtPtr ctxt;
5617 
5618     ctxt = xmlRegNewParserCtxt(regexp);
5619     if (ctxt == NULL)
5620 	return(NULL);
5621 
5622     /* initialize the parser */
5623     ctxt->end = NULL;
5624     ctxt->start = ctxt->state = xmlRegNewState(ctxt);
5625     xmlRegStatePush(ctxt, ctxt->start);
5626 
5627     /* parse the expression building an automata */
5628     xmlFAParseRegExp(ctxt, 1);
5629     if (CUR != 0) {
5630 	ERROR("xmlFAParseRegExp: extra characters");
5631     }
5632     if (ctxt->error != 0) {
5633 	xmlRegFreeParserCtxt(ctxt);
5634 	return(NULL);
5635     }
5636     ctxt->end = ctxt->state;
5637     ctxt->start->type = XML_REGEXP_START_STATE;
5638     ctxt->end->type = XML_REGEXP_FINAL_STATE;
5639 
5640     /* remove the Epsilon except for counted transitions */
5641     xmlFAEliminateEpsilonTransitions(ctxt);
5642 
5643 
5644     if (ctxt->error != 0) {
5645 	xmlRegFreeParserCtxt(ctxt);
5646 	return(NULL);
5647     }
5648     ret = xmlRegEpxFromParse(ctxt);
5649     xmlRegFreeParserCtxt(ctxt);
5650     return(ret);
5651 }
5652 
5653 /**
5654  * xmlRegexpExec:
5655  * @comp:  the compiled regular expression
5656  * @content:  the value to check against the regular expression
5657  *
5658  * Check if the regular expression generates the value
5659  *
5660  * Returns 1 if it matches, 0 if not and a negative value in case of error
5661  */
5662 int
xmlRegexpExec(xmlRegexpPtr comp,const xmlChar * content)5663 xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) {
5664     if ((comp == NULL) || (content == NULL))
5665 	return(-1);
5666     return(xmlFARegExec(comp, content));
5667 }
5668 
5669 /**
5670  * xmlRegexpIsDeterminist:
5671  * @comp:  the compiled regular expression
5672  *
5673  * Check if the regular expression is determinist
5674  *
5675  * Returns 1 if it yes, 0 if not and a negative value in case of error
5676  */
5677 int
xmlRegexpIsDeterminist(xmlRegexpPtr comp)5678 xmlRegexpIsDeterminist(xmlRegexpPtr comp) {
5679     xmlAutomataPtr am;
5680     int ret;
5681 
5682     if (comp == NULL)
5683 	return(-1);
5684     if (comp->determinist != -1)
5685 	return(comp->determinist);
5686 
5687     am = xmlNewAutomata();
5688     if (am == NULL)
5689         return(-1);
5690     if (am->states != NULL) {
5691 	int i;
5692 
5693 	for (i = 0;i < am->nbStates;i++)
5694 	    xmlRegFreeState(am->states[i]);
5695 	xmlFree(am->states);
5696     }
5697     am->nbAtoms = comp->nbAtoms;
5698     am->atoms = comp->atoms;
5699     am->nbStates = comp->nbStates;
5700     am->states = comp->states;
5701     am->determinist = -1;
5702     am->flags = comp->flags;
5703     ret = xmlFAComputesDeterminism(am);
5704     am->atoms = NULL;
5705     am->states = NULL;
5706     xmlFreeAutomata(am);
5707     comp->determinist = ret;
5708     return(ret);
5709 }
5710 
5711 /**
5712  * xmlRegFreeRegexp:
5713  * @regexp:  the regexp
5714  *
5715  * Free a regexp
5716  */
5717 void
xmlRegFreeRegexp(xmlRegexpPtr regexp)5718 xmlRegFreeRegexp(xmlRegexpPtr regexp) {
5719     int i;
5720     if (regexp == NULL)
5721 	return;
5722 
5723     if (regexp->string != NULL)
5724 	xmlFree(regexp->string);
5725     if (regexp->states != NULL) {
5726 	for (i = 0;i < regexp->nbStates;i++)
5727 	    xmlRegFreeState(regexp->states[i]);
5728 	xmlFree(regexp->states);
5729     }
5730     if (regexp->atoms != NULL) {
5731 	for (i = 0;i < regexp->nbAtoms;i++)
5732 	    xmlRegFreeAtom(regexp->atoms[i]);
5733 	xmlFree(regexp->atoms);
5734     }
5735     if (regexp->counters != NULL)
5736 	xmlFree(regexp->counters);
5737     if (regexp->compact != NULL)
5738 	xmlFree(regexp->compact);
5739     if (regexp->transdata != NULL)
5740 	xmlFree(regexp->transdata);
5741     if (regexp->stringMap != NULL) {
5742 	for (i = 0; i < regexp->nbstrings;i++)
5743 	    xmlFree(regexp->stringMap[i]);
5744 	xmlFree(regexp->stringMap);
5745     }
5746 
5747     xmlFree(regexp);
5748 }
5749 
5750 #ifdef LIBXML_AUTOMATA_ENABLED
5751 /************************************************************************
5752  *									*
5753  *			The Automata interface				*
5754  *									*
5755  ************************************************************************/
5756 
5757 /**
5758  * xmlNewAutomata:
5759  *
5760  * Create a new automata
5761  *
5762  * Returns the new object or NULL in case of failure
5763  */
5764 xmlAutomataPtr
xmlNewAutomata(void)5765 xmlNewAutomata(void) {
5766     xmlAutomataPtr ctxt;
5767 
5768     ctxt = xmlRegNewParserCtxt(NULL);
5769     if (ctxt == NULL)
5770 	return(NULL);
5771 
5772     /* initialize the parser */
5773     ctxt->end = NULL;
5774     ctxt->start = ctxt->state = xmlRegNewState(ctxt);
5775     if (ctxt->start == NULL) {
5776 	xmlFreeAutomata(ctxt);
5777 	return(NULL);
5778     }
5779     ctxt->start->type = XML_REGEXP_START_STATE;
5780     if (xmlRegStatePush(ctxt, ctxt->start) < 0) {
5781         xmlRegFreeState(ctxt->start);
5782 	xmlFreeAutomata(ctxt);
5783 	return(NULL);
5784     }
5785     ctxt->flags = 0;
5786 
5787     return(ctxt);
5788 }
5789 
5790 /**
5791  * xmlFreeAutomata:
5792  * @am: an automata
5793  *
5794  * Free an automata
5795  */
5796 void
xmlFreeAutomata(xmlAutomataPtr am)5797 xmlFreeAutomata(xmlAutomataPtr am) {
5798     if (am == NULL)
5799 	return;
5800     xmlRegFreeParserCtxt(am);
5801 }
5802 
5803 /**
5804  * xmlAutomataSetFlags:
5805  * @am: an automata
5806  * @flags:  a set of internal flags
5807  *
5808  * Set some flags on the automata
5809  */
5810 void
xmlAutomataSetFlags(xmlAutomataPtr am,int flags)5811 xmlAutomataSetFlags(xmlAutomataPtr am, int flags) {
5812     if (am == NULL)
5813 	return;
5814     am->flags |= flags;
5815 }
5816 
5817 /**
5818  * xmlAutomataGetInitState:
5819  * @am: an automata
5820  *
5821  * Initial state lookup
5822  *
5823  * Returns the initial state of the automata
5824  */
5825 xmlAutomataStatePtr
xmlAutomataGetInitState(xmlAutomataPtr am)5826 xmlAutomataGetInitState(xmlAutomataPtr am) {
5827     if (am == NULL)
5828 	return(NULL);
5829     return(am->start);
5830 }
5831 
5832 /**
5833  * xmlAutomataSetFinalState:
5834  * @am: an automata
5835  * @state: a state in this automata
5836  *
5837  * Makes that state a final state
5838  *
5839  * Returns 0 or -1 in case of error
5840  */
5841 int
xmlAutomataSetFinalState(xmlAutomataPtr am,xmlAutomataStatePtr state)5842 xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) {
5843     if ((am == NULL) || (state == NULL))
5844 	return(-1);
5845     state->type = XML_REGEXP_FINAL_STATE;
5846     return(0);
5847 }
5848 
5849 /**
5850  * xmlAutomataNewTransition:
5851  * @am: an automata
5852  * @from: the starting point of the transition
5853  * @to: the target point of the transition or NULL
5854  * @token: the input string associated to that transition
5855  * @data: data passed to the callback function if the transition is activated
5856  *
5857  * If @to is NULL, this creates first a new target state in the automata
5858  * and then adds a transition from the @from state to the target state
5859  * activated by the value of @token
5860  *
5861  * Returns the target state or NULL in case of error
5862  */
5863 xmlAutomataStatePtr
xmlAutomataNewTransition(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,void * data)5864 xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from,
5865 			 xmlAutomataStatePtr to, const xmlChar *token,
5866 			 void *data) {
5867     xmlRegAtomPtr atom;
5868 
5869     if ((am == NULL) || (from == NULL) || (token == NULL))
5870 	return(NULL);
5871     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5872     if (atom == NULL)
5873         return(NULL);
5874     atom->data = data;
5875     atom->valuep = xmlStrdup(token);
5876 
5877     if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5878         xmlRegFreeAtom(atom);
5879 	return(NULL);
5880     }
5881     if (to == NULL)
5882 	return(am->state);
5883     return(to);
5884 }
5885 
5886 /**
5887  * xmlAutomataNewTransition2:
5888  * @am: an automata
5889  * @from: the starting point of the transition
5890  * @to: the target point of the transition or NULL
5891  * @token: the first input string associated to that transition
5892  * @token2: the second input string associated to that transition
5893  * @data: data passed to the callback function if the transition is activated
5894  *
5895  * If @to is NULL, this creates first a new target state in the automata
5896  * and then adds a transition from the @from state to the target state
5897  * activated by the value of @token
5898  *
5899  * Returns the target state or NULL in case of error
5900  */
5901 xmlAutomataStatePtr
xmlAutomataNewTransition2(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,const xmlChar * token2,void * data)5902 xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5903 			  xmlAutomataStatePtr to, const xmlChar *token,
5904 			  const xmlChar *token2, void *data) {
5905     xmlRegAtomPtr atom;
5906 
5907     if ((am == NULL) || (from == NULL) || (token == NULL))
5908 	return(NULL);
5909     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5910     if (atom == NULL)
5911 	return(NULL);
5912     atom->data = data;
5913     if ((token2 == NULL) || (*token2 == 0)) {
5914 	atom->valuep = xmlStrdup(token);
5915     } else {
5916 	int lenn, lenp;
5917 	xmlChar *str;
5918 
5919 	lenn = strlen((char *) token2);
5920 	lenp = strlen((char *) token);
5921 
5922 	str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5923 	if (str == NULL) {
5924 	    xmlRegFreeAtom(atom);
5925 	    return(NULL);
5926 	}
5927 	memcpy(&str[0], token, lenp);
5928 	str[lenp] = '|';
5929 	memcpy(&str[lenp + 1], token2, lenn);
5930 	str[lenn + lenp + 1] = 0;
5931 
5932 	atom->valuep = str;
5933     }
5934 
5935     if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5936         xmlRegFreeAtom(atom);
5937 	return(NULL);
5938     }
5939     if (to == NULL)
5940 	return(am->state);
5941     return(to);
5942 }
5943 
5944 /**
5945  * xmlAutomataNewNegTrans:
5946  * @am: an automata
5947  * @from: the starting point of the transition
5948  * @to: the target point of the transition or NULL
5949  * @token: the first input string associated to that transition
5950  * @token2: the second input string associated to that transition
5951  * @data: data passed to the callback function if the transition is activated
5952  *
5953  * If @to is NULL, this creates first a new target state in the automata
5954  * and then adds a transition from the @from state to the target state
5955  * activated by any value except (@token,@token2)
5956  * Note that if @token2 is not NULL, then (X, NULL) won't match to follow
5957  # the semantic of XSD ##other
5958  *
5959  * Returns the target state or NULL in case of error
5960  */
5961 xmlAutomataStatePtr
xmlAutomataNewNegTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,const xmlChar * token2,void * data)5962 xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5963 		       xmlAutomataStatePtr to, const xmlChar *token,
5964 		       const xmlChar *token2, void *data) {
5965     xmlRegAtomPtr atom;
5966     xmlChar err_msg[200];
5967 
5968     if ((am == NULL) || (from == NULL) || (token == NULL))
5969 	return(NULL);
5970     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5971     if (atom == NULL)
5972 	return(NULL);
5973     atom->data = data;
5974     atom->neg = 1;
5975     if ((token2 == NULL) || (*token2 == 0)) {
5976 	atom->valuep = xmlStrdup(token);
5977     } else {
5978 	int lenn, lenp;
5979 	xmlChar *str;
5980 
5981 	lenn = strlen((char *) token2);
5982 	lenp = strlen((char *) token);
5983 
5984 	str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5985 	if (str == NULL) {
5986 	    xmlRegFreeAtom(atom);
5987 	    return(NULL);
5988 	}
5989 	memcpy(&str[0], token, lenp);
5990 	str[lenp] = '|';
5991 	memcpy(&str[lenp + 1], token2, lenn);
5992 	str[lenn + lenp + 1] = 0;
5993 
5994 	atom->valuep = str;
5995     }
5996     snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep);
5997     err_msg[199] = 0;
5998     atom->valuep2 = xmlStrdup(err_msg);
5999 
6000     if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
6001         xmlRegFreeAtom(atom);
6002 	return(NULL);
6003     }
6004     am->negs++;
6005     if (to == NULL)
6006 	return(am->state);
6007     return(to);
6008 }
6009 
6010 /**
6011  * xmlAutomataNewCountTrans2:
6012  * @am: an automata
6013  * @from: the starting point of the transition
6014  * @to: the target point of the transition or NULL
6015  * @token: the input string associated to that transition
6016  * @token2: the second input string associated to that transition
6017  * @min:  the minimum successive occurrences of token
6018  * @max:  the maximum successive occurrences of token
6019  * @data:  data associated to the transition
6020  *
6021  * If @to is NULL, this creates first a new target state in the automata
6022  * and then adds a transition from the @from state to the target state
6023  * activated by a succession of input of value @token and @token2 and
6024  * whose number is between @min and @max
6025  *
6026  * Returns the target state or NULL in case of error
6027  */
6028 xmlAutomataStatePtr
xmlAutomataNewCountTrans2(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,const xmlChar * token2,int min,int max,void * data)6029 xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
6030 			 xmlAutomataStatePtr to, const xmlChar *token,
6031 			 const xmlChar *token2,
6032 			 int min, int max, void *data) {
6033     xmlRegAtomPtr atom;
6034     int counter;
6035 
6036     if ((am == NULL) || (from == NULL) || (token == NULL))
6037 	return(NULL);
6038     if (min < 0)
6039 	return(NULL);
6040     if ((max < min) || (max < 1))
6041 	return(NULL);
6042     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6043     if (atom == NULL)
6044 	return(NULL);
6045     if ((token2 == NULL) || (*token2 == 0)) {
6046 	atom->valuep = xmlStrdup(token);
6047     } else {
6048 	int lenn, lenp;
6049 	xmlChar *str;
6050 
6051 	lenn = strlen((char *) token2);
6052 	lenp = strlen((char *) token);
6053 
6054 	str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
6055 	if (str == NULL) {
6056 	    xmlRegFreeAtom(atom);
6057 	    return(NULL);
6058 	}
6059 	memcpy(&str[0], token, lenp);
6060 	str[lenp] = '|';
6061 	memcpy(&str[lenp + 1], token2, lenn);
6062 	str[lenn + lenp + 1] = 0;
6063 
6064 	atom->valuep = str;
6065     }
6066     atom->data = data;
6067     if (min == 0)
6068 	atom->min = 1;
6069     else
6070 	atom->min = min;
6071     atom->max = max;
6072 
6073     /*
6074      * associate a counter to the transition.
6075      */
6076     counter = xmlRegGetCounter(am);
6077     am->counters[counter].min = min;
6078     am->counters[counter].max = max;
6079 
6080     /* xmlFAGenerateTransitions(am, from, to, atom); */
6081     if (to == NULL) {
6082         to = xmlRegNewState(am);
6083 	xmlRegStatePush(am, to);
6084     }
6085     xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6086     xmlRegAtomPush(am, atom);
6087     am->state = to;
6088 
6089     if (to == NULL)
6090 	to = am->state;
6091     if (to == NULL)
6092 	return(NULL);
6093     if (min == 0)
6094 	xmlFAGenerateEpsilonTransition(am, from, to);
6095     return(to);
6096 }
6097 
6098 /**
6099  * xmlAutomataNewCountTrans:
6100  * @am: an automata
6101  * @from: the starting point of the transition
6102  * @to: the target point of the transition or NULL
6103  * @token: the input string associated to that transition
6104  * @min:  the minimum successive occurrences of token
6105  * @max:  the maximum successive occurrences of token
6106  * @data:  data associated to the transition
6107  *
6108  * If @to is NULL, this creates first a new target state in the automata
6109  * and then adds a transition from the @from state to the target state
6110  * activated by a succession of input of value @token and whose number
6111  * is between @min and @max
6112  *
6113  * Returns the target state or NULL in case of error
6114  */
6115 xmlAutomataStatePtr
xmlAutomataNewCountTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,int min,int max,void * data)6116 xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6117 			 xmlAutomataStatePtr to, const xmlChar *token,
6118 			 int min, int max, void *data) {
6119     xmlRegAtomPtr atom;
6120     int counter;
6121 
6122     if ((am == NULL) || (from == NULL) || (token == NULL))
6123 	return(NULL);
6124     if (min < 0)
6125 	return(NULL);
6126     if ((max < min) || (max < 1))
6127 	return(NULL);
6128     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6129     if (atom == NULL)
6130 	return(NULL);
6131     atom->valuep = xmlStrdup(token);
6132     atom->data = data;
6133     if (min == 0)
6134 	atom->min = 1;
6135     else
6136 	atom->min = min;
6137     atom->max = max;
6138 
6139     /*
6140      * associate a counter to the transition.
6141      */
6142     counter = xmlRegGetCounter(am);
6143     am->counters[counter].min = min;
6144     am->counters[counter].max = max;
6145 
6146     /* xmlFAGenerateTransitions(am, from, to, atom); */
6147     if (to == NULL) {
6148         to = xmlRegNewState(am);
6149 	xmlRegStatePush(am, to);
6150     }
6151     xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6152     xmlRegAtomPush(am, atom);
6153     am->state = to;
6154 
6155     if (to == NULL)
6156 	to = am->state;
6157     if (to == NULL)
6158 	return(NULL);
6159     if (min == 0)
6160 	xmlFAGenerateEpsilonTransition(am, from, to);
6161     return(to);
6162 }
6163 
6164 /**
6165  * xmlAutomataNewOnceTrans2:
6166  * @am: an automata
6167  * @from: the starting point of the transition
6168  * @to: the target point of the transition or NULL
6169  * @token: the input string associated to that transition
6170  * @token2: the second input string associated to that transition
6171  * @min:  the minimum successive occurrences of token
6172  * @max:  the maximum successive occurrences of token
6173  * @data:  data associated to the transition
6174  *
6175  * If @to is NULL, this creates first a new target state in the automata
6176  * and then adds a transition from the @from state to the target state
6177  * activated by a succession of input of value @token and @token2 and whose
6178  * number is between @min and @max, moreover that transition can only be
6179  * crossed once.
6180  *
6181  * Returns the target state or NULL in case of error
6182  */
6183 xmlAutomataStatePtr
xmlAutomataNewOnceTrans2(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,const xmlChar * token2,int min,int max,void * data)6184 xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
6185 			 xmlAutomataStatePtr to, const xmlChar *token,
6186 			 const xmlChar *token2,
6187 			 int min, int max, void *data) {
6188     xmlRegAtomPtr atom;
6189     int counter;
6190 
6191     if ((am == NULL) || (from == NULL) || (token == NULL))
6192 	return(NULL);
6193     if (min < 1)
6194 	return(NULL);
6195     if (max < min)
6196 	return(NULL);
6197     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6198     if (atom == NULL)
6199 	return(NULL);
6200     if ((token2 == NULL) || (*token2 == 0)) {
6201 	atom->valuep = xmlStrdup(token);
6202     } else {
6203 	int lenn, lenp;
6204 	xmlChar *str;
6205 
6206 	lenn = strlen((char *) token2);
6207 	lenp = strlen((char *) token);
6208 
6209 	str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
6210 	if (str == NULL) {
6211 	    xmlRegFreeAtom(atom);
6212 	    return(NULL);
6213 	}
6214 	memcpy(&str[0], token, lenp);
6215 	str[lenp] = '|';
6216 	memcpy(&str[lenp + 1], token2, lenn);
6217 	str[lenn + lenp + 1] = 0;
6218 
6219 	atom->valuep = str;
6220     }
6221     atom->data = data;
6222     atom->quant = XML_REGEXP_QUANT_ONCEONLY;
6223     atom->min = min;
6224     atom->max = max;
6225     /*
6226      * associate a counter to the transition.
6227      */
6228     counter = xmlRegGetCounter(am);
6229     am->counters[counter].min = 1;
6230     am->counters[counter].max = 1;
6231 
6232     /* xmlFAGenerateTransitions(am, from, to, atom); */
6233     if (to == NULL) {
6234 	to = xmlRegNewState(am);
6235 	xmlRegStatePush(am, to);
6236     }
6237     xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6238     xmlRegAtomPush(am, atom);
6239     am->state = to;
6240     return(to);
6241 }
6242 
6243 
6244 
6245 /**
6246  * xmlAutomataNewOnceTrans:
6247  * @am: an automata
6248  * @from: the starting point of the transition
6249  * @to: the target point of the transition or NULL
6250  * @token: the input string associated to that transition
6251  * @min:  the minimum successive occurrences of token
6252  * @max:  the maximum successive occurrences of token
6253  * @data:  data associated to the transition
6254  *
6255  * If @to is NULL, this creates first a new target state in the automata
6256  * and then adds a transition from the @from state to the target state
6257  * activated by a succession of input of value @token and whose number
6258  * is between @min and @max, moreover that transition can only be crossed
6259  * once.
6260  *
6261  * Returns the target state or NULL in case of error
6262  */
6263 xmlAutomataStatePtr
xmlAutomataNewOnceTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,int min,int max,void * data)6264 xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6265 			 xmlAutomataStatePtr to, const xmlChar *token,
6266 			 int min, int max, void *data) {
6267     xmlRegAtomPtr atom;
6268     int counter;
6269 
6270     if ((am == NULL) || (from == NULL) || (token == NULL))
6271 	return(NULL);
6272     if (min < 1)
6273 	return(NULL);
6274     if (max < min)
6275 	return(NULL);
6276     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6277     if (atom == NULL)
6278 	return(NULL);
6279     atom->valuep = xmlStrdup(token);
6280     atom->data = data;
6281     atom->quant = XML_REGEXP_QUANT_ONCEONLY;
6282     atom->min = min;
6283     atom->max = max;
6284     /*
6285      * associate a counter to the transition.
6286      */
6287     counter = xmlRegGetCounter(am);
6288     am->counters[counter].min = 1;
6289     am->counters[counter].max = 1;
6290 
6291     /* xmlFAGenerateTransitions(am, from, to, atom); */
6292     if (to == NULL) {
6293 	to = xmlRegNewState(am);
6294 	xmlRegStatePush(am, to);
6295     }
6296     xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6297     xmlRegAtomPush(am, atom);
6298     am->state = to;
6299     return(to);
6300 }
6301 
6302 /**
6303  * xmlAutomataNewState:
6304  * @am: an automata
6305  *
6306  * Create a new disconnected state in the automata
6307  *
6308  * Returns the new state or NULL in case of error
6309  */
6310 xmlAutomataStatePtr
xmlAutomataNewState(xmlAutomataPtr am)6311 xmlAutomataNewState(xmlAutomataPtr am) {
6312     xmlAutomataStatePtr to;
6313 
6314     if (am == NULL)
6315 	return(NULL);
6316     to = xmlRegNewState(am);
6317     xmlRegStatePush(am, to);
6318     return(to);
6319 }
6320 
6321 /**
6322  * xmlAutomataNewEpsilon:
6323  * @am: an automata
6324  * @from: the starting point of the transition
6325  * @to: the target point of the transition or NULL
6326  *
6327  * If @to is NULL, this creates first a new target state in the automata
6328  * and then adds an epsilon transition from the @from state to the
6329  * target state
6330  *
6331  * Returns the target state or NULL in case of error
6332  */
6333 xmlAutomataStatePtr
xmlAutomataNewEpsilon(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to)6334 xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from,
6335 		      xmlAutomataStatePtr to) {
6336     if ((am == NULL) || (from == NULL))
6337 	return(NULL);
6338     xmlFAGenerateEpsilonTransition(am, from, to);
6339     if (to == NULL)
6340 	return(am->state);
6341     return(to);
6342 }
6343 
6344 /**
6345  * xmlAutomataNewAllTrans:
6346  * @am: an automata
6347  * @from: the starting point of the transition
6348  * @to: the target point of the transition or NULL
6349  * @lax: allow to transition if not all all transitions have been activated
6350  *
6351  * If @to is NULL, this creates first a new target state in the automata
6352  * and then adds a an ALL transition from the @from state to the
6353  * target state. That transition is an epsilon transition allowed only when
6354  * all transitions from the @from node have been activated.
6355  *
6356  * Returns the target state or NULL in case of error
6357  */
6358 xmlAutomataStatePtr
xmlAutomataNewAllTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,int lax)6359 xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6360 		       xmlAutomataStatePtr to, int lax) {
6361     if ((am == NULL) || (from == NULL))
6362 	return(NULL);
6363     xmlFAGenerateAllTransition(am, from, to, lax);
6364     if (to == NULL)
6365 	return(am->state);
6366     return(to);
6367 }
6368 
6369 /**
6370  * xmlAutomataNewCounter:
6371  * @am: an automata
6372  * @min:  the minimal value on the counter
6373  * @max:  the maximal value on the counter
6374  *
6375  * Create a new counter
6376  *
6377  * Returns the counter number or -1 in case of error
6378  */
6379 int
xmlAutomataNewCounter(xmlAutomataPtr am,int min,int max)6380 xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) {
6381     int ret;
6382 
6383     if (am == NULL)
6384 	return(-1);
6385 
6386     ret = xmlRegGetCounter(am);
6387     if (ret < 0)
6388 	return(-1);
6389     am->counters[ret].min = min;
6390     am->counters[ret].max = max;
6391     return(ret);
6392 }
6393 
6394 /**
6395  * xmlAutomataNewCountedTrans:
6396  * @am: an automata
6397  * @from: the starting point of the transition
6398  * @to: the target point of the transition or NULL
6399  * @counter: the counter associated to that transition
6400  *
6401  * If @to is NULL, this creates first a new target state in the automata
6402  * and then adds an epsilon transition from the @from state to the target state
6403  * which will increment the counter provided
6404  *
6405  * Returns the target state or NULL in case of error
6406  */
6407 xmlAutomataStatePtr
xmlAutomataNewCountedTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,int counter)6408 xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6409 		xmlAutomataStatePtr to, int counter) {
6410     if ((am == NULL) || (from == NULL) || (counter < 0))
6411 	return(NULL);
6412     xmlFAGenerateCountedEpsilonTransition(am, from, to, counter);
6413     if (to == NULL)
6414 	return(am->state);
6415     return(to);
6416 }
6417 
6418 /**
6419  * xmlAutomataNewCounterTrans:
6420  * @am: an automata
6421  * @from: the starting point of the transition
6422  * @to: the target point of the transition or NULL
6423  * @counter: the counter associated to that transition
6424  *
6425  * If @to is NULL, this creates first a new target state in the automata
6426  * and then adds an epsilon transition from the @from state to the target state
6427  * which will be allowed only if the counter is within the right range.
6428  *
6429  * Returns the target state or NULL in case of error
6430  */
6431 xmlAutomataStatePtr
xmlAutomataNewCounterTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,int counter)6432 xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6433 		xmlAutomataStatePtr to, int counter) {
6434     if ((am == NULL) || (from == NULL) || (counter < 0))
6435 	return(NULL);
6436     xmlFAGenerateCountedTransition(am, from, to, counter);
6437     if (to == NULL)
6438 	return(am->state);
6439     return(to);
6440 }
6441 
6442 /**
6443  * xmlAutomataCompile:
6444  * @am: an automata
6445  *
6446  * Compile the automata into a Reg Exp ready for being executed.
6447  * The automata should be free after this point.
6448  *
6449  * Returns the compiled regexp or NULL in case of error
6450  */
6451 xmlRegexpPtr
xmlAutomataCompile(xmlAutomataPtr am)6452 xmlAutomataCompile(xmlAutomataPtr am) {
6453     xmlRegexpPtr ret;
6454 
6455     if ((am == NULL) || (am->error != 0)) return(NULL);
6456     xmlFAEliminateEpsilonTransitions(am);
6457     /* xmlFAComputesDeterminism(am); */
6458     ret = xmlRegEpxFromParse(am);
6459 
6460     return(ret);
6461 }
6462 
6463 /**
6464  * xmlAutomataIsDeterminist:
6465  * @am: an automata
6466  *
6467  * Checks if an automata is determinist.
6468  *
6469  * Returns 1 if true, 0 if not, and -1 in case of error
6470  */
6471 int
xmlAutomataIsDeterminist(xmlAutomataPtr am)6472 xmlAutomataIsDeterminist(xmlAutomataPtr am) {
6473     int ret;
6474 
6475     if (am == NULL)
6476 	return(-1);
6477 
6478     ret = xmlFAComputesDeterminism(am);
6479     return(ret);
6480 }
6481 #endif /* LIBXML_AUTOMATA_ENABLED */
6482 
6483 #ifdef LIBXML_EXPR_ENABLED
6484 /************************************************************************
6485  *									*
6486  *		Formal Expression handling code				*
6487  *									*
6488  ************************************************************************/
6489 /************************************************************************
6490  *									*
6491  *		Expression handling context				*
6492  *									*
6493  ************************************************************************/
6494 
6495 struct _xmlExpCtxt {
6496     xmlDictPtr dict;
6497     xmlExpNodePtr *table;
6498     int size;
6499     int nbElems;
6500     int nb_nodes;
6501     int maxNodes;
6502     const char *expr;
6503     const char *cur;
6504     int nb_cons;
6505     int tabSize;
6506 };
6507 
6508 /**
6509  * xmlExpNewCtxt:
6510  * @maxNodes:  the maximum number of nodes
6511  * @dict:  optional dictionary to use internally
6512  *
6513  * Creates a new context for manipulating expressions
6514  *
6515  * Returns the context or NULL in case of error
6516  */
6517 xmlExpCtxtPtr
xmlExpNewCtxt(int maxNodes,xmlDictPtr dict)6518 xmlExpNewCtxt(int maxNodes, xmlDictPtr dict) {
6519     xmlExpCtxtPtr ret;
6520     int size = 256;
6521 
6522     if (maxNodes <= 4096)
6523         maxNodes = 4096;
6524 
6525     ret = (xmlExpCtxtPtr) xmlMalloc(sizeof(xmlExpCtxt));
6526     if (ret == NULL)
6527         return(NULL);
6528     memset(ret, 0, sizeof(xmlExpCtxt));
6529     ret->size = size;
6530     ret->nbElems = 0;
6531     ret->maxNodes = maxNodes;
6532     ret->table = xmlMalloc(size * sizeof(xmlExpNodePtr));
6533     if (ret->table == NULL) {
6534         xmlFree(ret);
6535 	return(NULL);
6536     }
6537     memset(ret->table, 0, size * sizeof(xmlExpNodePtr));
6538     if (dict == NULL) {
6539         ret->dict = xmlDictCreate();
6540 	if (ret->dict == NULL) {
6541 	    xmlFree(ret->table);
6542 	    xmlFree(ret);
6543 	    return(NULL);
6544 	}
6545     } else {
6546         ret->dict = dict;
6547 	xmlDictReference(ret->dict);
6548     }
6549     return(ret);
6550 }
6551 
6552 /**
6553  * xmlExpFreeCtxt:
6554  * @ctxt:  an expression context
6555  *
6556  * Free an expression context
6557  */
6558 void
xmlExpFreeCtxt(xmlExpCtxtPtr ctxt)6559 xmlExpFreeCtxt(xmlExpCtxtPtr ctxt) {
6560     if (ctxt == NULL)
6561         return;
6562     xmlDictFree(ctxt->dict);
6563     if (ctxt->table != NULL)
6564 	xmlFree(ctxt->table);
6565     xmlFree(ctxt);
6566 }
6567 
6568 /************************************************************************
6569  *									*
6570  *		Structure associated to an expression node		*
6571  *									*
6572  ************************************************************************/
6573 #define MAX_NODES 10000
6574 
6575 /* #define DEBUG_DERIV */
6576 
6577 /*
6578  * TODO:
6579  * - Wildcards
6580  * - public API for creation
6581  *
6582  * Started
6583  * - regression testing
6584  *
6585  * Done
6586  * - split into module and test tool
6587  * - memleaks
6588  */
6589 
6590 typedef enum {
6591     XML_EXP_NILABLE = (1 << 0)
6592 } xmlExpNodeInfo;
6593 
6594 #define IS_NILLABLE(node) ((node)->info & XML_EXP_NILABLE)
6595 
6596 struct _xmlExpNode {
6597     unsigned char type;/* xmlExpNodeType */
6598     unsigned char info;/* OR of xmlExpNodeInfo */
6599     unsigned short key;	/* the hash key */
6600     unsigned int ref;	/* The number of references */
6601     int c_max;		/* the maximum length it can consume */
6602     xmlExpNodePtr exp_left;
6603     xmlExpNodePtr next;/* the next node in the hash table or free list */
6604     union {
6605 	struct {
6606 	    int f_min;
6607 	    int f_max;
6608 	} count;
6609 	struct {
6610 	    xmlExpNodePtr f_right;
6611 	} children;
6612         const xmlChar *f_str;
6613     } field;
6614 };
6615 
6616 #define exp_min field.count.f_min
6617 #define exp_max field.count.f_max
6618 /* #define exp_left field.children.f_left */
6619 #define exp_right field.children.f_right
6620 #define exp_str field.f_str
6621 
6622 static xmlExpNodePtr xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type);
6623 static xmlExpNode forbiddenExpNode = {
6624     XML_EXP_FORBID, 0, 0, 0, 0, NULL, NULL, {{ 0, 0}}
6625 };
6626 xmlExpNodePtr forbiddenExp = &forbiddenExpNode;
6627 static xmlExpNode emptyExpNode = {
6628     XML_EXP_EMPTY, 1, 0, 0, 0, NULL, NULL, {{ 0, 0}}
6629 };
6630 xmlExpNodePtr emptyExp = &emptyExpNode;
6631 
6632 /************************************************************************
6633  *									*
6634  *  The custom hash table for unicity and canonicalization		*
6635  *  of sub-expressions pointers						*
6636  *									*
6637  ************************************************************************/
6638 /*
6639  * xmlExpHashNameComputeKey:
6640  * Calculate the hash key for a token
6641  */
6642 static unsigned short
xmlExpHashNameComputeKey(const xmlChar * name)6643 xmlExpHashNameComputeKey(const xmlChar *name) {
6644     unsigned short value = 0L;
6645     char ch;
6646 
6647     if (name != NULL) {
6648 	value += 30 * (*name);
6649 	while ((ch = *name++) != 0) {
6650 	    value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch);
6651 	}
6652     }
6653     return (value);
6654 }
6655 
6656 /*
6657  * xmlExpHashComputeKey:
6658  * Calculate the hash key for a compound expression
6659  */
6660 static unsigned short
xmlExpHashComputeKey(xmlExpNodeType type,xmlExpNodePtr left,xmlExpNodePtr right)6661 xmlExpHashComputeKey(xmlExpNodeType type, xmlExpNodePtr left,
6662                      xmlExpNodePtr right) {
6663     unsigned long value;
6664     unsigned short ret;
6665 
6666     switch (type) {
6667         case XML_EXP_SEQ:
6668 	    value = left->key;
6669 	    value += right->key;
6670 	    value *= 3;
6671 	    ret = (unsigned short) value;
6672 	    break;
6673         case XML_EXP_OR:
6674 	    value = left->key;
6675 	    value += right->key;
6676 	    value *= 7;
6677 	    ret = (unsigned short) value;
6678 	    break;
6679         case XML_EXP_COUNT:
6680 	    value = left->key;
6681 	    value += right->key;
6682 	    ret = (unsigned short) value;
6683 	    break;
6684 	default:
6685 	    ret = 0;
6686     }
6687     return(ret);
6688 }
6689 
6690 
6691 static xmlExpNodePtr
xmlExpNewNode(xmlExpCtxtPtr ctxt,xmlExpNodeType type)6692 xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type) {
6693     xmlExpNodePtr ret;
6694 
6695     if (ctxt->nb_nodes >= MAX_NODES)
6696         return(NULL);
6697     ret = (xmlExpNodePtr) xmlMalloc(sizeof(xmlExpNode));
6698     if (ret == NULL)
6699         return(NULL);
6700     memset(ret, 0, sizeof(xmlExpNode));
6701     ret->type = type;
6702     ret->next = NULL;
6703     ctxt->nb_nodes++;
6704     ctxt->nb_cons++;
6705     return(ret);
6706 }
6707 
6708 /**
6709  * xmlExpHashGetEntry:
6710  * @table: the hash table
6711  *
6712  * Get the unique entry from the hash table. The entry is created if
6713  * needed. @left and @right are consumed, i.e. their ref count will
6714  * be decremented by the operation.
6715  *
6716  * Returns the pointer or NULL in case of error
6717  */
6718 static xmlExpNodePtr
xmlExpHashGetEntry(xmlExpCtxtPtr ctxt,xmlExpNodeType type,xmlExpNodePtr left,xmlExpNodePtr right,const xmlChar * name,int min,int max)6719 xmlExpHashGetEntry(xmlExpCtxtPtr ctxt, xmlExpNodeType type,
6720                    xmlExpNodePtr left, xmlExpNodePtr right,
6721 		   const xmlChar *name, int min, int max) {
6722     unsigned short kbase, key;
6723     xmlExpNodePtr entry;
6724     xmlExpNodePtr insert;
6725 
6726     if (ctxt == NULL)
6727 	return(NULL);
6728 
6729     /*
6730      * Check for duplicate and insertion location.
6731      */
6732     if (type == XML_EXP_ATOM) {
6733 	kbase = xmlExpHashNameComputeKey(name);
6734     } else if (type == XML_EXP_COUNT) {
6735         /* COUNT reduction rule 1 */
6736 	/* a{1} -> a */
6737 	if (min == max) {
6738 	    if (min == 1) {
6739 		return(left);
6740 	    }
6741 	    if (min == 0) {
6742 		xmlExpFree(ctxt, left);
6743 	        return(emptyExp);
6744 	    }
6745 	}
6746 	if (min < 0) {
6747 	    xmlExpFree(ctxt, left);
6748 	    return(forbiddenExp);
6749 	}
6750         if (max == -1)
6751 	    kbase = min + 79;
6752 	else
6753 	    kbase = max - min;
6754 	kbase += left->key;
6755     } else if (type == XML_EXP_OR) {
6756         /* Forbid reduction rules */
6757         if (left->type == XML_EXP_FORBID) {
6758 	    xmlExpFree(ctxt, left);
6759 	    return(right);
6760 	}
6761         if (right->type == XML_EXP_FORBID) {
6762 	    xmlExpFree(ctxt, right);
6763 	    return(left);
6764 	}
6765 
6766         /* OR reduction rule 1 */
6767 	/* a | a reduced to a */
6768         if (left == right) {
6769 	    xmlExpFree(ctxt, right);
6770 	    return(left);
6771 	}
6772         /* OR canonicalization rule 1 */
6773 	/* linearize (a | b) | c into a | (b | c) */
6774         if ((left->type == XML_EXP_OR) && (right->type != XML_EXP_OR)) {
6775 	    xmlExpNodePtr tmp = left;
6776             left = right;
6777 	    right = tmp;
6778 	}
6779         /* OR reduction rule 2 */
6780 	/* a | (a | b) and b | (a | b) are reduced to a | b */
6781         if (right->type == XML_EXP_OR) {
6782 	    if ((left == right->exp_left) ||
6783 	        (left == right->exp_right)) {
6784 		xmlExpFree(ctxt, left);
6785 		return(right);
6786 	    }
6787 	}
6788         /* OR canonicalization rule 2 */
6789 	/* linearize (a | b) | c into a | (b | c) */
6790         if (left->type == XML_EXP_OR) {
6791 	    xmlExpNodePtr tmp;
6792 
6793 	    /* OR canonicalization rule 2 */
6794 	    if ((left->exp_right->type != XML_EXP_OR) &&
6795 	        (left->exp_right->key < left->exp_left->key)) {
6796 	        tmp = left->exp_right;
6797 		left->exp_right = left->exp_left;
6798 		left->exp_left = tmp;
6799 	    }
6800 	    left->exp_right->ref++;
6801 	    tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_right, right,
6802 	                             NULL, 0, 0);
6803 	    left->exp_left->ref++;
6804 	    tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_left, tmp,
6805 	                             NULL, 0, 0);
6806 
6807 	    xmlExpFree(ctxt, left);
6808 	    return(tmp);
6809 	}
6810 	if (right->type == XML_EXP_OR) {
6811 	    /* Ordering in the tree */
6812 	    /* C | (A | B) -> A | (B | C) */
6813 	    if (left->key > right->exp_right->key) {
6814 		xmlExpNodePtr tmp;
6815 		right->exp_right->ref++;
6816 		tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_right,
6817 		                         left, NULL, 0, 0);
6818 		right->exp_left->ref++;
6819 		tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
6820 		                         tmp, NULL, 0, 0);
6821 		xmlExpFree(ctxt, right);
6822 		return(tmp);
6823 	    }
6824 	    /* Ordering in the tree */
6825 	    /* B | (A | C) -> A | (B | C) */
6826 	    if (left->key > right->exp_left->key) {
6827 		xmlExpNodePtr tmp;
6828 		right->exp_right->ref++;
6829 		tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left,
6830 		                         right->exp_right, NULL, 0, 0);
6831 		right->exp_left->ref++;
6832 		tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
6833 		                         tmp, NULL, 0, 0);
6834 		xmlExpFree(ctxt, right);
6835 		return(tmp);
6836 	    }
6837 	}
6838 	/* we know both types are != XML_EXP_OR here */
6839         else if (left->key > right->key) {
6840 	    xmlExpNodePtr tmp = left;
6841             left = right;
6842 	    right = tmp;
6843 	}
6844 	kbase = xmlExpHashComputeKey(type, left, right);
6845     } else if (type == XML_EXP_SEQ) {
6846         /* Forbid reduction rules */
6847         if (left->type == XML_EXP_FORBID) {
6848 	    xmlExpFree(ctxt, right);
6849 	    return(left);
6850 	}
6851         if (right->type == XML_EXP_FORBID) {
6852 	    xmlExpFree(ctxt, left);
6853 	    return(right);
6854 	}
6855         /* Empty reduction rules */
6856         if (right->type == XML_EXP_EMPTY) {
6857 	    return(left);
6858 	}
6859         if (left->type == XML_EXP_EMPTY) {
6860 	    return(right);
6861 	}
6862 	kbase = xmlExpHashComputeKey(type, left, right);
6863     } else
6864         return(NULL);
6865 
6866     key = kbase % ctxt->size;
6867     if (ctxt->table[key] != NULL) {
6868 	for (insert = ctxt->table[key]; insert != NULL;
6869 	     insert = insert->next) {
6870 	    if ((insert->key == kbase) &&
6871 	        (insert->type == type)) {
6872 		if (type == XML_EXP_ATOM) {
6873 		    if (name == insert->exp_str) {
6874 			insert->ref++;
6875 			return(insert);
6876 		    }
6877 		} else if (type == XML_EXP_COUNT) {
6878 		    if ((insert->exp_min == min) && (insert->exp_max == max) &&
6879 		        (insert->exp_left == left)) {
6880 			insert->ref++;
6881 			left->ref--;
6882 			return(insert);
6883 		    }
6884 		} else if ((insert->exp_left == left) &&
6885 			   (insert->exp_right == right)) {
6886 		    insert->ref++;
6887 		    left->ref--;
6888 		    right->ref--;
6889 		    return(insert);
6890 		}
6891 	    }
6892 	}
6893     }
6894 
6895     entry = xmlExpNewNode(ctxt, type);
6896     if (entry == NULL)
6897         return(NULL);
6898     entry->key = kbase;
6899     if (type == XML_EXP_ATOM) {
6900 	entry->exp_str = name;
6901 	entry->c_max = 1;
6902     } else if (type == XML_EXP_COUNT) {
6903         entry->exp_min = min;
6904         entry->exp_max = max;
6905 	entry->exp_left = left;
6906 	if ((min == 0) || (IS_NILLABLE(left)))
6907 	    entry->info |= XML_EXP_NILABLE;
6908 	if (max < 0)
6909 	    entry->c_max = -1;
6910 	else
6911 	    entry->c_max = max * entry->exp_left->c_max;
6912     } else {
6913 	entry->exp_left = left;
6914 	entry->exp_right = right;
6915 	if (type == XML_EXP_OR) {
6916 	    if ((IS_NILLABLE(left)) || (IS_NILLABLE(right)))
6917 		entry->info |= XML_EXP_NILABLE;
6918 	    if ((entry->exp_left->c_max == -1) ||
6919 	        (entry->exp_right->c_max == -1))
6920 		entry->c_max = -1;
6921 	    else if (entry->exp_left->c_max > entry->exp_right->c_max)
6922 	        entry->c_max = entry->exp_left->c_max;
6923 	    else
6924 	        entry->c_max = entry->exp_right->c_max;
6925 	} else {
6926 	    if ((IS_NILLABLE(left)) && (IS_NILLABLE(right)))
6927 		entry->info |= XML_EXP_NILABLE;
6928 	    if ((entry->exp_left->c_max == -1) ||
6929 	        (entry->exp_right->c_max == -1))
6930 		entry->c_max = -1;
6931 	    else
6932 	        entry->c_max = entry->exp_left->c_max + entry->exp_right->c_max;
6933 	}
6934     }
6935     entry->ref = 1;
6936     if (ctxt->table[key] != NULL)
6937         entry->next = ctxt->table[key];
6938 
6939     ctxt->table[key] = entry;
6940     ctxt->nbElems++;
6941 
6942     return(entry);
6943 }
6944 
6945 /**
6946  * xmlExpFree:
6947  * @ctxt: the expression context
6948  * @exp: the expression
6949  *
6950  * Dereference the expression
6951  */
6952 void
xmlExpFree(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp)6953 xmlExpFree(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp) {
6954     if ((exp == NULL) || (exp == forbiddenExp) || (exp == emptyExp))
6955         return;
6956     exp->ref--;
6957     if (exp->ref == 0) {
6958         unsigned short key;
6959 
6960         /* Unlink it first from the hash table */
6961 	key = exp->key % ctxt->size;
6962 	if (ctxt->table[key] == exp) {
6963 	    ctxt->table[key] = exp->next;
6964 	} else {
6965 	    xmlExpNodePtr tmp;
6966 
6967 	    tmp = ctxt->table[key];
6968 	    while (tmp != NULL) {
6969 	        if (tmp->next == exp) {
6970 		    tmp->next = exp->next;
6971 		    break;
6972 		}
6973 	        tmp = tmp->next;
6974 	    }
6975 	}
6976 
6977         if ((exp->type == XML_EXP_SEQ) || (exp->type == XML_EXP_OR)) {
6978 	    xmlExpFree(ctxt, exp->exp_left);
6979 	    xmlExpFree(ctxt, exp->exp_right);
6980 	} else if (exp->type == XML_EXP_COUNT) {
6981 	    xmlExpFree(ctxt, exp->exp_left);
6982 	}
6983         xmlFree(exp);
6984 	ctxt->nb_nodes--;
6985     }
6986 }
6987 
6988 /**
6989  * xmlExpRef:
6990  * @exp: the expression
6991  *
6992  * Increase the reference count of the expression
6993  */
6994 void
xmlExpRef(xmlExpNodePtr exp)6995 xmlExpRef(xmlExpNodePtr exp) {
6996     if (exp != NULL)
6997         exp->ref++;
6998 }
6999 
7000 /**
7001  * xmlExpNewAtom:
7002  * @ctxt: the expression context
7003  * @name: the atom name
7004  * @len: the atom name length in byte (or -1);
7005  *
7006  * Get the atom associated to this name from that context
7007  *
7008  * Returns the node or NULL in case of error
7009  */
7010 xmlExpNodePtr
xmlExpNewAtom(xmlExpCtxtPtr ctxt,const xmlChar * name,int len)7011 xmlExpNewAtom(xmlExpCtxtPtr ctxt, const xmlChar *name, int len) {
7012     if ((ctxt == NULL) || (name == NULL))
7013         return(NULL);
7014     name = xmlDictLookup(ctxt->dict, name, len);
7015     if (name == NULL)
7016         return(NULL);
7017     return(xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, name, 0, 0));
7018 }
7019 
7020 /**
7021  * xmlExpNewOr:
7022  * @ctxt: the expression context
7023  * @left: left expression
7024  * @right: right expression
7025  *
7026  * Get the atom associated to the choice @left | @right
7027  * Note that @left and @right are consumed in the operation, to keep
7028  * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
7029  * this is true even in case of failure (unless ctxt == NULL).
7030  *
7031  * Returns the node or NULL in case of error
7032  */
7033 xmlExpNodePtr
xmlExpNewOr(xmlExpCtxtPtr ctxt,xmlExpNodePtr left,xmlExpNodePtr right)7034 xmlExpNewOr(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
7035     if (ctxt == NULL)
7036         return(NULL);
7037     if ((left == NULL) || (right == NULL)) {
7038         xmlExpFree(ctxt, left);
7039         xmlExpFree(ctxt, right);
7040         return(NULL);
7041     }
7042     return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, right, NULL, 0, 0));
7043 }
7044 
7045 /**
7046  * xmlExpNewSeq:
7047  * @ctxt: the expression context
7048  * @left: left expression
7049  * @right: right expression
7050  *
7051  * Get the atom associated to the sequence @left , @right
7052  * Note that @left and @right are consumed in the operation, to keep
7053  * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
7054  * this is true even in case of failure (unless ctxt == NULL).
7055  *
7056  * Returns the node or NULL in case of error
7057  */
7058 xmlExpNodePtr
xmlExpNewSeq(xmlExpCtxtPtr ctxt,xmlExpNodePtr left,xmlExpNodePtr right)7059 xmlExpNewSeq(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
7060     if (ctxt == NULL)
7061         return(NULL);
7062     if ((left == NULL) || (right == NULL)) {
7063         xmlExpFree(ctxt, left);
7064         xmlExpFree(ctxt, right);
7065         return(NULL);
7066     }
7067     return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, left, right, NULL, 0, 0));
7068 }
7069 
7070 /**
7071  * xmlExpNewRange:
7072  * @ctxt: the expression context
7073  * @subset: the expression to be repeated
7074  * @min: the lower bound for the repetition
7075  * @max: the upper bound for the repetition, -1 means infinite
7076  *
7077  * Get the atom associated to the range (@subset){@min, @max}
7078  * Note that @subset is consumed in the operation, to keep
7079  * an handle on it use xmlExpRef() and use xmlExpFree() to release it,
7080  * this is true even in case of failure (unless ctxt == NULL).
7081  *
7082  * Returns the node or NULL in case of error
7083  */
7084 xmlExpNodePtr
xmlExpNewRange(xmlExpCtxtPtr ctxt,xmlExpNodePtr subset,int min,int max)7085 xmlExpNewRange(xmlExpCtxtPtr ctxt, xmlExpNodePtr subset, int min, int max) {
7086     if (ctxt == NULL)
7087         return(NULL);
7088     if ((subset == NULL) || (min < 0) || (max < -1) ||
7089         ((max >= 0) && (min > max))) {
7090 	xmlExpFree(ctxt, subset);
7091         return(NULL);
7092     }
7093     return(xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, subset,
7094                               NULL, NULL, min, max));
7095 }
7096 
7097 /************************************************************************
7098  *									*
7099  *		Public API for operations on expressions		*
7100  *									*
7101  ************************************************************************/
7102 
7103 static int
xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar ** list,int len,int nb)7104 xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7105                      const xmlChar**list, int len, int nb) {
7106     int tmp, tmp2;
7107 tail:
7108     switch (exp->type) {
7109         case XML_EXP_EMPTY:
7110 	    return(0);
7111         case XML_EXP_ATOM:
7112 	    for (tmp = 0;tmp < nb;tmp++)
7113 	        if (list[tmp] == exp->exp_str)
7114 		    return(0);
7115             if (nb >= len)
7116 	        return(-2);
7117 	    list[nb] = exp->exp_str;
7118 	    return(1);
7119         case XML_EXP_COUNT:
7120 	    exp = exp->exp_left;
7121 	    goto tail;
7122         case XML_EXP_SEQ:
7123         case XML_EXP_OR:
7124 	    tmp = xmlExpGetLanguageInt(ctxt, exp->exp_left, list, len, nb);
7125 	    if (tmp < 0)
7126 	        return(tmp);
7127 	    tmp2 = xmlExpGetLanguageInt(ctxt, exp->exp_right, list, len,
7128 	                                nb + tmp);
7129 	    if (tmp2 < 0)
7130 	        return(tmp2);
7131             return(tmp + tmp2);
7132     }
7133     return(-1);
7134 }
7135 
7136 /**
7137  * xmlExpGetLanguage:
7138  * @ctxt: the expression context
7139  * @exp: the expression
7140  * @langList: where to store the tokens
7141  * @len: the allocated length of @list
7142  *
7143  * Find all the strings used in @exp and store them in @list
7144  *
7145  * Returns the number of unique strings found, -1 in case of errors and
7146  *         -2 if there is more than @len strings
7147  */
7148 int
xmlExpGetLanguage(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar ** langList,int len)7149 xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7150                   const xmlChar**langList, int len) {
7151     if ((ctxt == NULL) || (exp == NULL) || (langList == NULL) || (len <= 0))
7152         return(-1);
7153     return(xmlExpGetLanguageInt(ctxt, exp, langList, len, 0));
7154 }
7155 
7156 static int
xmlExpGetStartInt(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar ** list,int len,int nb)7157 xmlExpGetStartInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7158                   const xmlChar**list, int len, int nb) {
7159     int tmp, tmp2;
7160 tail:
7161     switch (exp->type) {
7162         case XML_EXP_FORBID:
7163 	    return(0);
7164         case XML_EXP_EMPTY:
7165 	    return(0);
7166         case XML_EXP_ATOM:
7167 	    for (tmp = 0;tmp < nb;tmp++)
7168 	        if (list[tmp] == exp->exp_str)
7169 		    return(0);
7170             if (nb >= len)
7171 	        return(-2);
7172 	    list[nb] = exp->exp_str;
7173 	    return(1);
7174         case XML_EXP_COUNT:
7175 	    exp = exp->exp_left;
7176 	    goto tail;
7177         case XML_EXP_SEQ:
7178 	    tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
7179 	    if (tmp < 0)
7180 	        return(tmp);
7181 	    if (IS_NILLABLE(exp->exp_left)) {
7182 		tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
7183 					    nb + tmp);
7184 		if (tmp2 < 0)
7185 		    return(tmp2);
7186 		tmp += tmp2;
7187 	    }
7188             return(tmp);
7189         case XML_EXP_OR:
7190 	    tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
7191 	    if (tmp < 0)
7192 	        return(tmp);
7193 	    tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
7194 	                                nb + tmp);
7195 	    if (tmp2 < 0)
7196 	        return(tmp2);
7197             return(tmp + tmp2);
7198     }
7199     return(-1);
7200 }
7201 
7202 /**
7203  * xmlExpGetStart:
7204  * @ctxt: the expression context
7205  * @exp: the expression
7206  * @tokList: where to store the tokens
7207  * @len: the allocated length of @list
7208  *
7209  * Find all the strings that appears at the start of the languages
7210  * accepted by @exp and store them in @list. E.g. for (a, b) | c
7211  * it will return the list [a, c]
7212  *
7213  * Returns the number of unique strings found, -1 in case of errors and
7214  *         -2 if there is more than @len strings
7215  */
7216 int
xmlExpGetStart(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar ** tokList,int len)7217 xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7218                const xmlChar**tokList, int len) {
7219     if ((ctxt == NULL) || (exp == NULL) || (tokList == NULL) || (len <= 0))
7220         return(-1);
7221     return(xmlExpGetStartInt(ctxt, exp, tokList, len, 0));
7222 }
7223 
7224 /**
7225  * xmlExpIsNillable:
7226  * @exp: the expression
7227  *
7228  * Finds if the expression is nillable, i.e. if it accepts the empty sequence
7229  *
7230  * Returns 1 if nillable, 0 if not and -1 in case of error
7231  */
7232 int
xmlExpIsNillable(xmlExpNodePtr exp)7233 xmlExpIsNillable(xmlExpNodePtr exp) {
7234     if (exp == NULL)
7235         return(-1);
7236     return(IS_NILLABLE(exp) != 0);
7237 }
7238 
7239 static xmlExpNodePtr
xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar * str)7240 xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar *str)
7241 {
7242     xmlExpNodePtr ret;
7243 
7244     switch (exp->type) {
7245 	case XML_EXP_EMPTY:
7246 	    return(forbiddenExp);
7247 	case XML_EXP_FORBID:
7248 	    return(forbiddenExp);
7249 	case XML_EXP_ATOM:
7250 	    if (exp->exp_str == str) {
7251 #ifdef DEBUG_DERIV
7252 		printf("deriv atom: equal => Empty\n");
7253 #endif
7254 	        ret = emptyExp;
7255 	    } else {
7256 #ifdef DEBUG_DERIV
7257 		printf("deriv atom: mismatch => forbid\n");
7258 #endif
7259 	        /* TODO wildcards here */
7260 		ret = forbiddenExp;
7261 	    }
7262 	    return(ret);
7263 	case XML_EXP_OR: {
7264 	    xmlExpNodePtr tmp;
7265 
7266 #ifdef DEBUG_DERIV
7267 	    printf("deriv or: => or(derivs)\n");
7268 #endif
7269 	    tmp = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7270 	    if (tmp == NULL) {
7271 		return(NULL);
7272 	    }
7273 	    ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
7274 	    if (ret == NULL) {
7275 	        xmlExpFree(ctxt, tmp);
7276 		return(NULL);
7277 	    }
7278             ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret,
7279 			     NULL, 0, 0);
7280 	    return(ret);
7281 	}
7282 	case XML_EXP_SEQ:
7283 #ifdef DEBUG_DERIV
7284 	    printf("deriv seq: starting with left\n");
7285 #endif
7286 	    ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7287 	    if (ret == NULL) {
7288 	        return(NULL);
7289 	    } else if (ret == forbiddenExp) {
7290 	        if (IS_NILLABLE(exp->exp_left)) {
7291 #ifdef DEBUG_DERIV
7292 		    printf("deriv seq: left failed but nillable\n");
7293 #endif
7294 		    ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
7295 		}
7296 	    } else {
7297 #ifdef DEBUG_DERIV
7298 		printf("deriv seq: left match => sequence\n");
7299 #endif
7300 	        exp->exp_right->ref++;
7301 	        ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, exp->exp_right,
7302 		                         NULL, 0, 0);
7303 	    }
7304 	    return(ret);
7305 	case XML_EXP_COUNT: {
7306 	    int min, max;
7307 	    xmlExpNodePtr tmp;
7308 
7309 	    if (exp->exp_max == 0)
7310 		return(forbiddenExp);
7311 	    ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7312 	    if (ret == NULL)
7313 	        return(NULL);
7314 	    if (ret == forbiddenExp) {
7315 #ifdef DEBUG_DERIV
7316 		printf("deriv count: pattern mismatch => forbid\n");
7317 #endif
7318 	        return(ret);
7319 	    }
7320 	    if (exp->exp_max == 1)
7321 		return(ret);
7322 	    if (exp->exp_max < 0) /* unbounded */
7323 		max = -1;
7324 	    else
7325 		max = exp->exp_max - 1;
7326 	    if (exp->exp_min > 0)
7327 		min = exp->exp_min - 1;
7328 	    else
7329 		min = 0;
7330 	    exp->exp_left->ref++;
7331 	    tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, NULL,
7332 				     NULL, min, max);
7333 	    if (ret == emptyExp) {
7334 #ifdef DEBUG_DERIV
7335 		printf("deriv count: match to empty => new count\n");
7336 #endif
7337 	        return(tmp);
7338 	    }
7339 #ifdef DEBUG_DERIV
7340 	    printf("deriv count: match => sequence with new count\n");
7341 #endif
7342 	    return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, tmp,
7343 	                              NULL, 0, 0));
7344 	}
7345     }
7346     return(NULL);
7347 }
7348 
7349 /**
7350  * xmlExpStringDerive:
7351  * @ctxt: the expression context
7352  * @exp: the expression
7353  * @str: the string
7354  * @len: the string len in bytes if available
7355  *
7356  * Do one step of Brzozowski derivation of the expression @exp with
7357  * respect to the input string
7358  *
7359  * Returns the resulting expression or NULL in case of internal error
7360  */
7361 xmlExpNodePtr
xmlExpStringDerive(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar * str,int len)7362 xmlExpStringDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7363                    const xmlChar *str, int len) {
7364     const xmlChar *input;
7365 
7366     if ((exp == NULL) || (ctxt == NULL) || (str == NULL)) {
7367         return(NULL);
7368     }
7369     /*
7370      * check the string is in the dictionary, if yes use an interned
7371      * copy, otherwise we know it's not an acceptable input
7372      */
7373     input = xmlDictExists(ctxt->dict, str, len);
7374     if (input == NULL) {
7375         return(forbiddenExp);
7376     }
7377     return(xmlExpStringDeriveInt(ctxt, exp, input));
7378 }
7379 
7380 static int
xmlExpCheckCard(xmlExpNodePtr exp,xmlExpNodePtr sub)7381 xmlExpCheckCard(xmlExpNodePtr exp, xmlExpNodePtr sub) {
7382     int ret = 1;
7383 
7384     if (sub->c_max == -1) {
7385         if (exp->c_max != -1)
7386 	    ret = 0;
7387     } else if ((exp->c_max >= 0) && (exp->c_max < sub->c_max)) {
7388         ret = 0;
7389     }
7390 #if 0
7391     if ((IS_NILLABLE(sub)) && (!IS_NILLABLE(exp)))
7392         ret = 0;
7393 #endif
7394     return(ret);
7395 }
7396 
7397 static xmlExpNodePtr xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7398                                         xmlExpNodePtr sub);
7399 /**
7400  * xmlExpDivide:
7401  * @ctxt: the expressions context
7402  * @exp: the englobing expression
7403  * @sub: the subexpression
7404  * @mult: the multiple expression
7405  * @remain: the remain from the derivation of the multiple
7406  *
7407  * Check if exp is a multiple of sub, i.e. if there is a finite number n
7408  * so that sub{n} subsume exp
7409  *
7410  * Returns the multiple value if successful, 0 if it is not a multiple
7411  *         and -1 in case of internal error.
7412  */
7413 
7414 static int
xmlExpDivide(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,xmlExpNodePtr sub,xmlExpNodePtr * mult,xmlExpNodePtr * remain)7415 xmlExpDivide(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub,
7416              xmlExpNodePtr *mult, xmlExpNodePtr *remain) {
7417     int i;
7418     xmlExpNodePtr tmp, tmp2;
7419 
7420     if (mult != NULL) *mult = NULL;
7421     if (remain != NULL) *remain = NULL;
7422     if (exp->c_max == -1) return(0);
7423     if (IS_NILLABLE(exp) && (!IS_NILLABLE(sub))) return(0);
7424 
7425     for (i = 1;i <= exp->c_max;i++) {
7426         sub->ref++;
7427         tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7428 				 sub, NULL, NULL, i, i);
7429 	if (tmp == NULL) {
7430 	    return(-1);
7431 	}
7432 	if (!xmlExpCheckCard(tmp, exp)) {
7433 	    xmlExpFree(ctxt, tmp);
7434 	    continue;
7435 	}
7436 	tmp2 = xmlExpExpDeriveInt(ctxt, tmp, exp);
7437 	if (tmp2 == NULL) {
7438 	    xmlExpFree(ctxt, tmp);
7439 	    return(-1);
7440 	}
7441 	if ((tmp2 != forbiddenExp) && (IS_NILLABLE(tmp2))) {
7442 	    if (remain != NULL)
7443 	        *remain = tmp2;
7444 	    else
7445 	        xmlExpFree(ctxt, tmp2);
7446 	    if (mult != NULL)
7447 	        *mult = tmp;
7448 	    else
7449 	        xmlExpFree(ctxt, tmp);
7450 #ifdef DEBUG_DERIV
7451 	    printf("Divide succeeded %d\n", i);
7452 #endif
7453 	    return(i);
7454 	}
7455 	xmlExpFree(ctxt, tmp);
7456 	xmlExpFree(ctxt, tmp2);
7457     }
7458 #ifdef DEBUG_DERIV
7459     printf("Divide failed\n");
7460 #endif
7461     return(0);
7462 }
7463 
7464 /**
7465  * xmlExpExpDeriveInt:
7466  * @ctxt: the expressions context
7467  * @exp: the englobing expression
7468  * @sub: the subexpression
7469  *
7470  * Try to do a step of Brzozowski derivation but at a higher level
7471  * the input being a subexpression.
7472  *
7473  * Returns the resulting expression or NULL in case of internal error
7474  */
7475 static xmlExpNodePtr
xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,xmlExpNodePtr sub)7476 xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7477     xmlExpNodePtr ret, tmp, tmp2, tmp3;
7478     const xmlChar **tab;
7479     int len, i;
7480 
7481     /*
7482      * In case of equality and if the expression can only consume a finite
7483      * amount, then the derivation is empty
7484      */
7485     if ((exp == sub) && (exp->c_max >= 0)) {
7486 #ifdef DEBUG_DERIV
7487         printf("Equal(exp, sub) and finite -> Empty\n");
7488 #endif
7489         return(emptyExp);
7490     }
7491     /*
7492      * decompose sub sequence first
7493      */
7494     if (sub->type == XML_EXP_EMPTY) {
7495 #ifdef DEBUG_DERIV
7496         printf("Empty(sub) -> Empty\n");
7497 #endif
7498 	exp->ref++;
7499         return(exp);
7500     }
7501     if (sub->type == XML_EXP_SEQ) {
7502 #ifdef DEBUG_DERIV
7503         printf("Seq(sub) -> decompose\n");
7504 #endif
7505         tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7506 	if (tmp == NULL)
7507 	    return(NULL);
7508 	if (tmp == forbiddenExp)
7509 	    return(tmp);
7510 	ret = xmlExpExpDeriveInt(ctxt, tmp, sub->exp_right);
7511 	xmlExpFree(ctxt, tmp);
7512 	return(ret);
7513     }
7514     if (sub->type == XML_EXP_OR) {
7515 #ifdef DEBUG_DERIV
7516         printf("Or(sub) -> decompose\n");
7517 #endif
7518         tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7519 	if (tmp == forbiddenExp)
7520 	    return(tmp);
7521 	if (tmp == NULL)
7522 	    return(NULL);
7523 	ret = xmlExpExpDeriveInt(ctxt, exp, sub->exp_right);
7524 	if ((ret == NULL) || (ret == forbiddenExp)) {
7525 	    xmlExpFree(ctxt, tmp);
7526 	    return(ret);
7527 	}
7528 	return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, NULL, 0, 0));
7529     }
7530     if (!xmlExpCheckCard(exp, sub)) {
7531 #ifdef DEBUG_DERIV
7532         printf("CheckCard(exp, sub) failed -> Forbid\n");
7533 #endif
7534         return(forbiddenExp);
7535     }
7536     switch (exp->type) {
7537         case XML_EXP_EMPTY:
7538 	    if (sub == emptyExp)
7539 	        return(emptyExp);
7540 #ifdef DEBUG_DERIV
7541 	    printf("Empty(exp) -> Forbid\n");
7542 #endif
7543 	    return(forbiddenExp);
7544         case XML_EXP_FORBID:
7545 #ifdef DEBUG_DERIV
7546 	    printf("Forbid(exp) -> Forbid\n");
7547 #endif
7548 	    return(forbiddenExp);
7549         case XML_EXP_ATOM:
7550 	    if (sub->type == XML_EXP_ATOM) {
7551 	        /* TODO: handle wildcards */
7552 	        if (exp->exp_str == sub->exp_str) {
7553 #ifdef DEBUG_DERIV
7554 		    printf("Atom match -> Empty\n");
7555 #endif
7556 		    return(emptyExp);
7557                 }
7558 #ifdef DEBUG_DERIV
7559 		printf("Atom mismatch -> Forbid\n");
7560 #endif
7561 	        return(forbiddenExp);
7562 	    }
7563 	    if ((sub->type == XML_EXP_COUNT) &&
7564 	        (sub->exp_max == 1) &&
7565 	        (sub->exp_left->type == XML_EXP_ATOM)) {
7566 	        /* TODO: handle wildcards */
7567 	        if (exp->exp_str == sub->exp_left->exp_str) {
7568 #ifdef DEBUG_DERIV
7569 		    printf("Atom match -> Empty\n");
7570 #endif
7571 		    return(emptyExp);
7572 		}
7573 #ifdef DEBUG_DERIV
7574 		printf("Atom mismatch -> Forbid\n");
7575 #endif
7576 	        return(forbiddenExp);
7577 	    }
7578 #ifdef DEBUG_DERIV
7579 	    printf("Complex exp vs Atom -> Forbid\n");
7580 #endif
7581 	    return(forbiddenExp);
7582         case XML_EXP_SEQ:
7583 	    /* try to get the sequence consumed only if possible */
7584 	    if (xmlExpCheckCard(exp->exp_left, sub)) {
7585 		/* See if the sequence can be consumed directly */
7586 #ifdef DEBUG_DERIV
7587 		printf("Seq trying left only\n");
7588 #endif
7589 		ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7590 		if ((ret != forbiddenExp) && (ret != NULL)) {
7591 #ifdef DEBUG_DERIV
7592 		    printf("Seq trying left only worked\n");
7593 #endif
7594 		    /*
7595 		     * TODO: assumption here that we are determinist
7596 		     *       i.e. we won't get to a nillable exp left
7597 		     *       subset which could be matched by the right
7598 		     *       part too.
7599 		     * e.g.: (a | b)+,(a | c) and 'a+,a'
7600 		     */
7601 		    exp->exp_right->ref++;
7602 		    return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7603 					      exp->exp_right, NULL, 0, 0));
7604 		}
7605 #ifdef DEBUG_DERIV
7606 	    } else {
7607 		printf("Seq: left too short\n");
7608 #endif
7609 	    }
7610 	    /* Try instead to decompose */
7611 	    if (sub->type == XML_EXP_COUNT) {
7612 		int min, max;
7613 
7614 #ifdef DEBUG_DERIV
7615 		printf("Seq: sub is a count\n");
7616 #endif
7617 	        ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7618 		if (ret == NULL)
7619 		    return(NULL);
7620 		if (ret != forbiddenExp) {
7621 #ifdef DEBUG_DERIV
7622 		    printf("Seq , Count match on left\n");
7623 #endif
7624 		    if (sub->exp_max < 0)
7625 		        max = -1;
7626 	            else
7627 		        max = sub->exp_max -1;
7628 		    if (sub->exp_min > 0)
7629 		        min = sub->exp_min -1;
7630 		    else
7631 		        min = 0;
7632 		    exp->exp_right->ref++;
7633 		    tmp = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7634 		                             exp->exp_right, NULL, 0, 0);
7635 		    if (tmp == NULL)
7636 		        return(NULL);
7637 
7638 		    sub->exp_left->ref++;
7639 		    tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7640 				      sub->exp_left, NULL, NULL, min, max);
7641 		    if (tmp2 == NULL) {
7642 		        xmlExpFree(ctxt, tmp);
7643 			return(NULL);
7644 		    }
7645 		    ret = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7646 		    xmlExpFree(ctxt, tmp);
7647 		    xmlExpFree(ctxt, tmp2);
7648 		    return(ret);
7649 		}
7650 	    }
7651 	    /* we made no progress on structured operations */
7652 	    break;
7653         case XML_EXP_OR:
7654 #ifdef DEBUG_DERIV
7655 	    printf("Or , trying both side\n");
7656 #endif
7657 	    ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7658 	    if (ret == NULL)
7659 	        return(NULL);
7660 	    tmp = xmlExpExpDeriveInt(ctxt, exp->exp_right, sub);
7661 	    if (tmp == NULL) {
7662 		xmlExpFree(ctxt, ret);
7663 	        return(NULL);
7664 	    }
7665 	    return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp, NULL, 0, 0));
7666         case XML_EXP_COUNT: {
7667 	    int min, max;
7668 
7669 	    if (sub->type == XML_EXP_COUNT) {
7670 	        /*
7671 		 * Try to see if the loop is completely subsumed
7672 		 */
7673 	        tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7674 		if (tmp == NULL)
7675 		    return(NULL);
7676 		if (tmp == forbiddenExp) {
7677 		    int mult;
7678 
7679 #ifdef DEBUG_DERIV
7680 		    printf("Count, Count inner don't subsume\n");
7681 #endif
7682 		    mult = xmlExpDivide(ctxt, sub->exp_left, exp->exp_left,
7683 		                        NULL, &tmp);
7684 		    if (mult <= 0) {
7685 #ifdef DEBUG_DERIV
7686 			printf("Count, Count not multiple => forbidden\n");
7687 #endif
7688                         return(forbiddenExp);
7689 		    }
7690 		    if (sub->exp_max == -1) {
7691 		        max = -1;
7692 			if (exp->exp_max == -1) {
7693 			    if (exp->exp_min <= sub->exp_min * mult)
7694 			        min = 0;
7695 			    else
7696 			        min = exp->exp_min - sub->exp_min * mult;
7697 			} else {
7698 #ifdef DEBUG_DERIV
7699 			    printf("Count, Count finite can't subsume infinite\n");
7700 #endif
7701                             xmlExpFree(ctxt, tmp);
7702 			    return(forbiddenExp);
7703 			}
7704 		    } else {
7705 			if (exp->exp_max == -1) {
7706 #ifdef DEBUG_DERIV
7707 			    printf("Infinite loop consume mult finite loop\n");
7708 #endif
7709 			    if (exp->exp_min > sub->exp_min * mult) {
7710 				max = -1;
7711 				min = exp->exp_min - sub->exp_min * mult;
7712 			    } else {
7713 				max = -1;
7714 				min = 0;
7715 			    }
7716 			} else {
7717 			    if (exp->exp_max < sub->exp_max * mult) {
7718 #ifdef DEBUG_DERIV
7719 				printf("loops max mult mismatch => forbidden\n");
7720 #endif
7721 				xmlExpFree(ctxt, tmp);
7722 				return(forbiddenExp);
7723 			    }
7724 			    if (sub->exp_max * mult > exp->exp_min)
7725 				min = 0;
7726 			    else
7727 				min = exp->exp_min - sub->exp_max * mult;
7728 			    max = exp->exp_max - sub->exp_max * mult;
7729 			}
7730 		    }
7731 		} else if (!IS_NILLABLE(tmp)) {
7732 		    /*
7733 		     * TODO: loop here to try to grow if working on finite
7734 		     *       blocks.
7735 		     */
7736 #ifdef DEBUG_DERIV
7737 		    printf("Count, Count remain not nillable => forbidden\n");
7738 #endif
7739 		    xmlExpFree(ctxt, tmp);
7740 		    return(forbiddenExp);
7741 		} else if (sub->exp_max == -1) {
7742 		    if (exp->exp_max == -1) {
7743 		        if (exp->exp_min <= sub->exp_min) {
7744 #ifdef DEBUG_DERIV
7745 			    printf("Infinite loops Okay => COUNT(0,Inf)\n");
7746 #endif
7747                             max = -1;
7748 			    min = 0;
7749 			} else {
7750 #ifdef DEBUG_DERIV
7751 			    printf("Infinite loops min => Count(X,Inf)\n");
7752 #endif
7753                             max = -1;
7754 			    min = exp->exp_min - sub->exp_min;
7755 			}
7756 		    } else if (exp->exp_min > sub->exp_min) {
7757 #ifdef DEBUG_DERIV
7758 			printf("loops min mismatch 1 => forbidden ???\n");
7759 #endif
7760 		        xmlExpFree(ctxt, tmp);
7761 		        return(forbiddenExp);
7762 		    } else {
7763 			max = -1;
7764 			min = 0;
7765 		    }
7766 		} else {
7767 		    if (exp->exp_max == -1) {
7768 #ifdef DEBUG_DERIV
7769 			printf("Infinite loop consume finite loop\n");
7770 #endif
7771 		        if (exp->exp_min > sub->exp_min) {
7772 			    max = -1;
7773 			    min = exp->exp_min - sub->exp_min;
7774 			} else {
7775 			    max = -1;
7776 			    min = 0;
7777 			}
7778 		    } else {
7779 		        if (exp->exp_max < sub->exp_max) {
7780 #ifdef DEBUG_DERIV
7781 			    printf("loops max mismatch => forbidden\n");
7782 #endif
7783 			    xmlExpFree(ctxt, tmp);
7784 			    return(forbiddenExp);
7785 			}
7786 			if (sub->exp_max > exp->exp_min)
7787 			    min = 0;
7788 			else
7789 			    min = exp->exp_min - sub->exp_max;
7790 			max = exp->exp_max - sub->exp_max;
7791 		    }
7792 		}
7793 #ifdef DEBUG_DERIV
7794 		printf("loops match => SEQ(COUNT())\n");
7795 #endif
7796 		exp->exp_left->ref++;
7797 		tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7798 		                          NULL, NULL, min, max);
7799 		if (tmp2 == NULL) {
7800 		    return(NULL);
7801 		}
7802                 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7803 		                         NULL, 0, 0);
7804 		return(ret);
7805 	    }
7806 	    tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7807 	    if (tmp == NULL)
7808 		return(NULL);
7809 	    if (tmp == forbiddenExp) {
7810 #ifdef DEBUG_DERIV
7811 		printf("loop mismatch => forbidden\n");
7812 #endif
7813 		return(forbiddenExp);
7814 	    }
7815 	    if (exp->exp_min > 0)
7816 		min = exp->exp_min - 1;
7817 	    else
7818 		min = 0;
7819 	    if (exp->exp_max < 0)
7820 		max = -1;
7821 	    else
7822 		max = exp->exp_max - 1;
7823 
7824 #ifdef DEBUG_DERIV
7825 	    printf("loop match => SEQ(COUNT())\n");
7826 #endif
7827 	    exp->exp_left->ref++;
7828 	    tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7829 				      NULL, NULL, min, max);
7830 	    if (tmp2 == NULL)
7831 		return(NULL);
7832 	    ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7833 				     NULL, 0, 0);
7834 	    return(ret);
7835 	}
7836     }
7837 
7838 #ifdef DEBUG_DERIV
7839     printf("Fallback to derivative\n");
7840 #endif
7841     if (IS_NILLABLE(sub)) {
7842         if (!(IS_NILLABLE(exp)))
7843 	    return(forbiddenExp);
7844 	else
7845 	    ret = emptyExp;
7846     } else
7847 	ret = NULL;
7848     /*
7849      * here the structured derivation made no progress so
7850      * we use the default token based derivation to force one more step
7851      */
7852     if (ctxt->tabSize == 0)
7853         ctxt->tabSize = 40;
7854 
7855     tab = (const xmlChar **) xmlMalloc(ctxt->tabSize *
7856 	                               sizeof(const xmlChar *));
7857     if (tab == NULL) {
7858 	return(NULL);
7859     }
7860 
7861     /*
7862      * collect all the strings accepted by the subexpression on input
7863      */
7864     len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7865     while (len < 0) {
7866         const xmlChar **temp;
7867 	temp = (const xmlChar **) xmlRealloc((xmlChar **) tab, ctxt->tabSize * 2 *
7868 	                                     sizeof(const xmlChar *));
7869 	if (temp == NULL) {
7870 	    xmlFree((xmlChar **) tab);
7871 	    return(NULL);
7872 	}
7873 	tab = temp;
7874 	ctxt->tabSize *= 2;
7875 	len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7876     }
7877     for (i = 0;i < len;i++) {
7878         tmp = xmlExpStringDeriveInt(ctxt, exp, tab[i]);
7879 	if ((tmp == NULL) || (tmp == forbiddenExp)) {
7880 	    xmlExpFree(ctxt, ret);
7881 	    xmlFree((xmlChar **) tab);
7882 	    return(tmp);
7883 	}
7884 	tmp2 = xmlExpStringDeriveInt(ctxt, sub, tab[i]);
7885 	if ((tmp2 == NULL) || (tmp2 == forbiddenExp)) {
7886 	    xmlExpFree(ctxt, tmp);
7887 	    xmlExpFree(ctxt, ret);
7888 	    xmlFree((xmlChar **) tab);
7889 	    return(tmp);
7890 	}
7891 	tmp3 = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7892 	xmlExpFree(ctxt, tmp);
7893 	xmlExpFree(ctxt, tmp2);
7894 
7895 	if ((tmp3 == NULL) || (tmp3 == forbiddenExp)) {
7896 	    xmlExpFree(ctxt, ret);
7897 	    xmlFree((xmlChar **) tab);
7898 	    return(tmp3);
7899 	}
7900 
7901 	if (ret == NULL)
7902 	    ret = tmp3;
7903 	else {
7904 	    ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp3, NULL, 0, 0);
7905 	    if (ret == NULL) {
7906 		xmlFree((xmlChar **) tab);
7907 	        return(NULL);
7908 	    }
7909 	}
7910     }
7911     xmlFree((xmlChar **) tab);
7912     return(ret);
7913 }
7914 
7915 /**
7916  * xmlExpExpDerive:
7917  * @ctxt: the expressions context
7918  * @exp: the englobing expression
7919  * @sub: the subexpression
7920  *
7921  * Evaluates the expression resulting from @exp consuming a sub expression @sub
7922  * Based on algebraic derivation and sometimes direct Brzozowski derivation
7923  * it usually takes less than linear time and can handle expressions generating
7924  * infinite languages.
7925  *
7926  * Returns the resulting expression or NULL in case of internal error, the
7927  *         result must be freed
7928  */
7929 xmlExpNodePtr
xmlExpExpDerive(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,xmlExpNodePtr sub)7930 xmlExpExpDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7931     if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7932         return(NULL);
7933 
7934     /*
7935      * O(1) speedups
7936      */
7937     if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7938 #ifdef DEBUG_DERIV
7939 	printf("Sub nillable and not exp : can't subsume\n");
7940 #endif
7941         return(forbiddenExp);
7942     }
7943     if (xmlExpCheckCard(exp, sub) == 0) {
7944 #ifdef DEBUG_DERIV
7945 	printf("sub generate longer sequences than exp : can't subsume\n");
7946 #endif
7947         return(forbiddenExp);
7948     }
7949     return(xmlExpExpDeriveInt(ctxt, exp, sub));
7950 }
7951 
7952 /**
7953  * xmlExpSubsume:
7954  * @ctxt: the expressions context
7955  * @exp: the englobing expression
7956  * @sub: the subexpression
7957  *
7958  * Check whether @exp accepts all the languages accepted by @sub
7959  * the input being a subexpression.
7960  *
7961  * Returns 1 if true 0 if false and -1 in case of failure.
7962  */
7963 int
xmlExpSubsume(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,xmlExpNodePtr sub)7964 xmlExpSubsume(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7965     xmlExpNodePtr tmp;
7966 
7967     if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7968         return(-1);
7969 
7970     /*
7971      * TODO: speedup by checking the language of sub is a subset of the
7972      *       language of exp
7973      */
7974     /*
7975      * O(1) speedups
7976      */
7977     if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7978 #ifdef DEBUG_DERIV
7979 	printf("Sub nillable and not exp : can't subsume\n");
7980 #endif
7981         return(0);
7982     }
7983     if (xmlExpCheckCard(exp, sub) == 0) {
7984 #ifdef DEBUG_DERIV
7985 	printf("sub generate longer sequences than exp : can't subsume\n");
7986 #endif
7987         return(0);
7988     }
7989     tmp = xmlExpExpDeriveInt(ctxt, exp, sub);
7990 #ifdef DEBUG_DERIV
7991     printf("Result derivation :\n");
7992     PRINT_EXP(tmp);
7993 #endif
7994     if (tmp == NULL)
7995         return(-1);
7996     if (tmp == forbiddenExp)
7997 	return(0);
7998     if (tmp == emptyExp)
7999 	return(1);
8000     if ((tmp != NULL) && (IS_NILLABLE(tmp))) {
8001         xmlExpFree(ctxt, tmp);
8002         return(1);
8003     }
8004     xmlExpFree(ctxt, tmp);
8005     return(0);
8006 }
8007 
8008 /************************************************************************
8009  *									*
8010  *			Parsing expression				*
8011  *									*
8012  ************************************************************************/
8013 
8014 static xmlExpNodePtr xmlExpParseExpr(xmlExpCtxtPtr ctxt);
8015 
8016 #undef CUR
8017 #define CUR (*ctxt->cur)
8018 #undef NEXT
8019 #define NEXT ctxt->cur++;
8020 #undef IS_BLANK
8021 #define IS_BLANK(c) ((c == ' ') || (c == '\n') || (c == '\r') || (c == '\t'))
8022 #define SKIP_BLANKS while (IS_BLANK(*ctxt->cur)) ctxt->cur++;
8023 
8024 static int
xmlExpParseNumber(xmlExpCtxtPtr ctxt)8025 xmlExpParseNumber(xmlExpCtxtPtr ctxt) {
8026     int ret = 0;
8027 
8028     SKIP_BLANKS
8029     if (CUR == '*') {
8030 	NEXT
8031 	return(-1);
8032     }
8033     if ((CUR < '0') || (CUR > '9'))
8034         return(-1);
8035     while ((CUR >= '0') && (CUR <= '9')) {
8036         ret = ret * 10 + (CUR - '0');
8037 	NEXT
8038     }
8039     return(ret);
8040 }
8041 
8042 static xmlExpNodePtr
xmlExpParseOr(xmlExpCtxtPtr ctxt)8043 xmlExpParseOr(xmlExpCtxtPtr ctxt) {
8044     const char *base;
8045     xmlExpNodePtr ret;
8046     const xmlChar *val;
8047 
8048     SKIP_BLANKS
8049     base = ctxt->cur;
8050     if (*ctxt->cur == '(') {
8051         NEXT
8052 	ret = xmlExpParseExpr(ctxt);
8053 	SKIP_BLANKS
8054 	if (*ctxt->cur != ')') {
8055 	    fprintf(stderr, "unbalanced '(' : %s\n", base);
8056 	    xmlExpFree(ctxt, ret);
8057 	    return(NULL);
8058 	}
8059 	NEXT;
8060 	SKIP_BLANKS
8061 	goto parse_quantifier;
8062     }
8063     while ((CUR != 0) && (!(IS_BLANK(CUR))) && (CUR != '(') &&
8064            (CUR != ')') && (CUR != '|') && (CUR != ',') && (CUR != '{') &&
8065 	   (CUR != '*') && (CUR != '+') && (CUR != '?') && (CUR != '}'))
8066 	NEXT;
8067     val = xmlDictLookup(ctxt->dict, BAD_CAST base, ctxt->cur - base);
8068     if (val == NULL)
8069         return(NULL);
8070     ret = xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, val, 0, 0);
8071     if (ret == NULL)
8072         return(NULL);
8073     SKIP_BLANKS
8074 parse_quantifier:
8075     if (CUR == '{') {
8076         int min, max;
8077 
8078         NEXT
8079 	min = xmlExpParseNumber(ctxt);
8080 	if (min < 0) {
8081 	    xmlExpFree(ctxt, ret);
8082 	    return(NULL);
8083 	}
8084 	SKIP_BLANKS
8085 	if (CUR == ',') {
8086 	    NEXT
8087 	    max = xmlExpParseNumber(ctxt);
8088 	    SKIP_BLANKS
8089 	} else
8090 	    max = min;
8091 	if (CUR != '}') {
8092 	    xmlExpFree(ctxt, ret);
8093 	    return(NULL);
8094 	}
8095         NEXT
8096 	ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
8097 	                         min, max);
8098 	SKIP_BLANKS
8099     } else if (CUR == '?') {
8100         NEXT
8101 	ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
8102 	                         0, 1);
8103 	SKIP_BLANKS
8104     } else if (CUR == '+') {
8105         NEXT
8106 	ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
8107 	                         1, -1);
8108 	SKIP_BLANKS
8109     } else if (CUR == '*') {
8110         NEXT
8111 	ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
8112 	                         0, -1);
8113 	SKIP_BLANKS
8114     }
8115     return(ret);
8116 }
8117 
8118 
8119 static xmlExpNodePtr
xmlExpParseSeq(xmlExpCtxtPtr ctxt)8120 xmlExpParseSeq(xmlExpCtxtPtr ctxt) {
8121     xmlExpNodePtr ret, right;
8122 
8123     ret = xmlExpParseOr(ctxt);
8124     SKIP_BLANKS
8125     while (CUR == '|') {
8126         NEXT
8127 	right = xmlExpParseOr(ctxt);
8128 	if (right == NULL) {
8129 	    xmlExpFree(ctxt, ret);
8130 	    return(NULL);
8131 	}
8132 	ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, right, NULL, 0, 0);
8133 	if (ret == NULL)
8134 	    return(NULL);
8135     }
8136     return(ret);
8137 }
8138 
8139 static xmlExpNodePtr
xmlExpParseExpr(xmlExpCtxtPtr ctxt)8140 xmlExpParseExpr(xmlExpCtxtPtr ctxt) {
8141     xmlExpNodePtr ret, right;
8142 
8143     ret = xmlExpParseSeq(ctxt);
8144     SKIP_BLANKS
8145     while (CUR == ',') {
8146         NEXT
8147 	right = xmlExpParseSeq(ctxt);
8148 	if (right == NULL) {
8149 	    xmlExpFree(ctxt, ret);
8150 	    return(NULL);
8151 	}
8152 	ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, right, NULL, 0, 0);
8153 	if (ret == NULL)
8154 	    return(NULL);
8155     }
8156     return(ret);
8157 }
8158 
8159 /**
8160  * xmlExpParse:
8161  * @ctxt: the expressions context
8162  * @expr: the 0 terminated string
8163  *
8164  * Minimal parser for regexps, it understand the following constructs
8165  *  - string terminals
8166  *  - choice operator |
8167  *  - sequence operator ,
8168  *  - subexpressions (...)
8169  *  - usual cardinality operators + * and ?
8170  *  - finite sequences  { min, max }
8171  *  - infinite sequences { min, * }
8172  * There is minimal checkings made especially no checking on strings values
8173  *
8174  * Returns a new expression or NULL in case of failure
8175  */
8176 xmlExpNodePtr
xmlExpParse(xmlExpCtxtPtr ctxt,const char * expr)8177 xmlExpParse(xmlExpCtxtPtr ctxt, const char *expr) {
8178     xmlExpNodePtr ret;
8179 
8180     ctxt->expr = expr;
8181     ctxt->cur = expr;
8182 
8183     ret = xmlExpParseExpr(ctxt);
8184     SKIP_BLANKS
8185     if (*ctxt->cur != 0) {
8186         xmlExpFree(ctxt, ret);
8187         return(NULL);
8188     }
8189     return(ret);
8190 }
8191 
8192 static void
xmlExpDumpInt(xmlBufferPtr buf,xmlExpNodePtr expr,int glob)8193 xmlExpDumpInt(xmlBufferPtr buf, xmlExpNodePtr expr, int glob) {
8194     xmlExpNodePtr c;
8195 
8196     if (expr == NULL) return;
8197     if (glob) xmlBufferWriteChar(buf, "(");
8198     switch (expr->type) {
8199         case XML_EXP_EMPTY:
8200 	    xmlBufferWriteChar(buf, "empty");
8201 	    break;
8202         case XML_EXP_FORBID:
8203 	    xmlBufferWriteChar(buf, "forbidden");
8204 	    break;
8205         case XML_EXP_ATOM:
8206 	    xmlBufferWriteCHAR(buf, expr->exp_str);
8207 	    break;
8208         case XML_EXP_SEQ:
8209 	    c = expr->exp_left;
8210 	    if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8211 	        xmlExpDumpInt(buf, c, 1);
8212 	    else
8213 	        xmlExpDumpInt(buf, c, 0);
8214 	    xmlBufferWriteChar(buf, " , ");
8215 	    c = expr->exp_right;
8216 	    if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8217 	        xmlExpDumpInt(buf, c, 1);
8218 	    else
8219 	        xmlExpDumpInt(buf, c, 0);
8220             break;
8221         case XML_EXP_OR:
8222 	    c = expr->exp_left;
8223 	    if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8224 	        xmlExpDumpInt(buf, c, 1);
8225 	    else
8226 	        xmlExpDumpInt(buf, c, 0);
8227 	    xmlBufferWriteChar(buf, " | ");
8228 	    c = expr->exp_right;
8229 	    if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8230 	        xmlExpDumpInt(buf, c, 1);
8231 	    else
8232 	        xmlExpDumpInt(buf, c, 0);
8233             break;
8234         case XML_EXP_COUNT: {
8235 	    char rep[40];
8236 
8237 	    c = expr->exp_left;
8238 	    if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8239 	        xmlExpDumpInt(buf, c, 1);
8240 	    else
8241 	        xmlExpDumpInt(buf, c, 0);
8242 	    if ((expr->exp_min == 0) && (expr->exp_max == 1)) {
8243 		rep[0] = '?';
8244 		rep[1] = 0;
8245 	    } else if ((expr->exp_min == 0) && (expr->exp_max == -1)) {
8246 		rep[0] = '*';
8247 		rep[1] = 0;
8248 	    } else if ((expr->exp_min == 1) && (expr->exp_max == -1)) {
8249 		rep[0] = '+';
8250 		rep[1] = 0;
8251 	    } else if (expr->exp_max == expr->exp_min) {
8252 	        snprintf(rep, 39, "{%d}", expr->exp_min);
8253 	    } else if (expr->exp_max < 0) {
8254 	        snprintf(rep, 39, "{%d,inf}", expr->exp_min);
8255 	    } else {
8256 	        snprintf(rep, 39, "{%d,%d}", expr->exp_min, expr->exp_max);
8257 	    }
8258 	    rep[39] = 0;
8259 	    xmlBufferWriteChar(buf, rep);
8260 	    break;
8261 	}
8262 	default:
8263 	    fprintf(stderr, "Error in tree\n");
8264     }
8265     if (glob)
8266         xmlBufferWriteChar(buf, ")");
8267 }
8268 /**
8269  * xmlExpDump:
8270  * @buf:  a buffer to receive the output
8271  * @expr:  the compiled expression
8272  *
8273  * Serialize the expression as compiled to the buffer
8274  */
8275 void
xmlExpDump(xmlBufferPtr buf,xmlExpNodePtr expr)8276 xmlExpDump(xmlBufferPtr buf, xmlExpNodePtr expr) {
8277     if ((buf == NULL) || (expr == NULL))
8278         return;
8279     xmlExpDumpInt(buf, expr, 0);
8280 }
8281 
8282 /**
8283  * xmlExpMaxToken:
8284  * @expr: a compiled expression
8285  *
8286  * Indicate the maximum number of input a expression can accept
8287  *
8288  * Returns the maximum length or -1 in case of error
8289  */
8290 int
xmlExpMaxToken(xmlExpNodePtr expr)8291 xmlExpMaxToken(xmlExpNodePtr expr) {
8292     if (expr == NULL)
8293         return(-1);
8294     return(expr->c_max);
8295 }
8296 
8297 /**
8298  * xmlExpCtxtNbNodes:
8299  * @ctxt: an expression context
8300  *
8301  * Debugging facility provides the number of allocated nodes at a that point
8302  *
8303  * Returns the number of nodes in use or -1 in case of error
8304  */
8305 int
xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt)8306 xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt) {
8307     if (ctxt == NULL)
8308         return(-1);
8309     return(ctxt->nb_nodes);
8310 }
8311 
8312 /**
8313  * xmlExpCtxtNbCons:
8314  * @ctxt: an expression context
8315  *
8316  * Debugging facility provides the number of allocated nodes over lifetime
8317  *
8318  * Returns the number of nodes ever allocated or -1 in case of error
8319  */
8320 int
xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt)8321 xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt) {
8322     if (ctxt == NULL)
8323         return(-1);
8324     return(ctxt->nb_cons);
8325 }
8326 
8327 #endif /* LIBXML_EXPR_ENABLED */
8328 
8329 #endif /* LIBXML_REGEXP_ENABLED */
8330