1 /* libxml2 - Library for parsing XML documents
2  * Copyright (C) 2006-2019 Free Software Foundation, Inc.
3  *
4  * This file is not part of the GNU gettext program, but is used with
5  * GNU gettext.
6  *
7  * The original copyright notice is as follows:
8  */
9 
10 /*
11  * Copyright (C) 1998-2012 Daniel Veillard.  All Rights Reserved.
12  *
13  * Permission is hereby granted, free of charge, to any person obtaining a copy
14  * of this software and associated documentation files (the "Software"), to deal
15  * in the Software without restriction, including without limitation the rights
16  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17  * copies of the Software, and to permit persons to whom the Software is fur-
18  * nished to do so, subject to the following conditions:
19  *
20  * The above copyright notice and this permission notice shall be included in
21  * all copies or substantial portions of the Software.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FIT-
25  * NESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
26  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
29  * THE SOFTWARE.
30  *
31  * Daniel Veillard <veillard@redhat.com>
32  */
33 
34 /*
35  * regexp.c: generic and extensible Regular Expression engine
36  *
37  * Basically designed with the purpose of compiling regexps for
 * the variety of validation/schemas mechanisms now available in
39  * XML related specifications these include:
40  *    - XML-1.0 DTD validation
41  *    - XML Schemas structure part 1
42  *    - XML Schemas Datatypes part 2 especially Appendix F
43  *    - RELAX-NG/TREX i.e. the counter proposal
44  */
45 
46 #define IN_LIBXML
47 #include "libxml.h"
48 
49 #ifdef LIBXML_REGEXP_ENABLED
50 
51 /* #define DEBUG_ERR */
52 
53 #include <stdio.h>
54 #include <string.h>
55 #ifdef HAVE_LIMITS_H
56 #include <limits.h>
57 #endif
58 
59 #include <libxml/tree.h>
60 #include <libxml/parserInternals.h>
61 #include <libxml/xmlregexp.h>
62 #include <libxml/xmlautomata.h>
63 #include <libxml/xmlunicode.h>
64 
65 #ifndef INT_MAX
66 #define INT_MAX 123456789 /* easy to flag and big enough for our needs */
67 #endif
68 
69 /* #define DEBUG_REGEXP_GRAPH */
70 /* #define DEBUG_REGEXP_EXEC */
71 /* #define DEBUG_PUSH */
72 /* #define DEBUG_COMPACTION */
73 
/*
 * Numeric limit related to push operations.
 * NOTE(review): presumably compared against xmlRegExecCtxt.nbPush; the code
 * using it is outside this part of the file - confirm.
 */
#define MAX_PUSH 10000000

#ifdef ERROR
#undef ERROR
#endif
/*
 * ERROR: record a compilation error on the `ctxt` variable in scope and
 * report @str through xmlRegexpErrCompile(). The expansion is two separate
 * statements, so it must not be used as the unbraced body of an if/else.
 */
#define ERROR(str)							\
    ctxt->error = XML_REGEXP_COMPILE_ERROR;				\
    xmlRegexpErrCompile(ctxt, str);
/*
 * Cursor helpers; each of them relies on a `ctxt` variable being in scope.
 * NEXT advances ctxt->cur by one xmlChar, CUR reads the xmlChar under the
 * cursor and NXT(index) reads the xmlChar at the given offset from it.
 */
#define NEXT ctxt->cur++
#define CUR (*(ctxt->cur))
#define NXT(index) (ctxt->cur[index])

/*
 * CUR_SCHAR calls xmlStringCurrentChar() on @s, passing the address of @l.
 * NEXTL advances ctxt->cur by @l; its expansion already ends with ';'.
 */
#define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l)
#define NEXTL(l) ctxt->cur += l;
#define XML_REG_STRING_SEPARATOR '|'
/*
 * Need PREV to check on a '-' within a Character Group. May only be used
 * when it's guaranteed that cur is not at the beginning of ctxt->string!
 */
#define PREV (ctxt->cur[-1])

/**
 * TODO:
 *
 * macro to flag unimplemented blocks; prints the source file and line
 * through xmlGenericError(). The expansion already ends with ';'.
 */
#define TODO								\
    xmlGenericError(xmlGenericErrorContext,				\
	    "Unimplemented block at %s:%d\n",				\
            __FILE__, __LINE__);
104 
105 /************************************************************************
106  *									*
107  *			Datatypes and structures			*
108  *									*
109  ************************************************************************/
110 
/*
 * Kind of an atom; the same enumeration is used as the type of a range
 * (see the type field of struct _xmlRegRange).
 *
 * Note: the order of the enums below is significant, do not shuffle
 */
typedef enum {
    XML_REGEXP_EPSILON = 1,
    XML_REGEXP_CHARVAL,
    XML_REGEXP_RANGES,
    XML_REGEXP_SUBREG,  /* used for () sub regexps */
    XML_REGEXP_STRING,
    XML_REGEXP_ANYCHAR, /* . */
    XML_REGEXP_ANYSPACE, /* \s */
    XML_REGEXP_NOTSPACE, /* \S */
    XML_REGEXP_INITNAME, /* \l */
    XML_REGEXP_NOTINITNAME, /* \L */
    XML_REGEXP_NAMECHAR, /* \c */
    XML_REGEXP_NOTNAMECHAR, /* \C */
    XML_REGEXP_DECIMAL, /* \d */
    XML_REGEXP_NOTDECIMAL, /* \D */
    XML_REGEXP_REALCHAR, /* \w */
    XML_REGEXP_NOTREALCHAR, /* \W */
    /* character-category types are numbered from 100 onwards */
    XML_REGEXP_LETTER = 100,
    XML_REGEXP_LETTER_UPPERCASE,
    XML_REGEXP_LETTER_LOWERCASE,
    XML_REGEXP_LETTER_TITLECASE,
    XML_REGEXP_LETTER_MODIFIER,
    XML_REGEXP_LETTER_OTHERS,
    XML_REGEXP_MARK,
    XML_REGEXP_MARK_NONSPACING,
    XML_REGEXP_MARK_SPACECOMBINING,
    XML_REGEXP_MARK_ENCLOSING,
    XML_REGEXP_NUMBER,
    XML_REGEXP_NUMBER_DECIMAL,
    XML_REGEXP_NUMBER_LETTER,
    XML_REGEXP_NUMBER_OTHERS,
    XML_REGEXP_PUNCT,
    XML_REGEXP_PUNCT_CONNECTOR,
    XML_REGEXP_PUNCT_DASH,
    XML_REGEXP_PUNCT_OPEN,
    XML_REGEXP_PUNCT_CLOSE,
    XML_REGEXP_PUNCT_INITQUOTE,
    XML_REGEXP_PUNCT_FINQUOTE,
    XML_REGEXP_PUNCT_OTHERS,
    XML_REGEXP_SEPAR,
    XML_REGEXP_SEPAR_SPACE,
    XML_REGEXP_SEPAR_LINE,
    XML_REGEXP_SEPAR_PARA,
    XML_REGEXP_SYMBOL,
    XML_REGEXP_SYMBOL_MATH,
    XML_REGEXP_SYMBOL_CURRENCY,
    XML_REGEXP_SYMBOL_MODIFIER,
    XML_REGEXP_SYMBOL_OTHERS,
    XML_REGEXP_OTHER,
    XML_REGEXP_OTHER_CONTROL,
    XML_REGEXP_OTHER_FORMAT,
    XML_REGEXP_OTHER_PRIVATE,
    XML_REGEXP_OTHER_NA,
    XML_REGEXP_BLOCK_NAME
} xmlRegAtomType;
169 
/*
 * Quantifier attached to an atom. xmlRegNewAtom() initialises new atoms
 * with XML_REGEXP_QUANT_ONCE, and only atoms carrying that quantifier are
 * accepted when xmlRegEpxFromParse() builds the compact form.
 */
typedef enum {
    XML_REGEXP_QUANT_EPSILON = 1,
    XML_REGEXP_QUANT_ONCE,
    XML_REGEXP_QUANT_OPT,
    XML_REGEXP_QUANT_MULT,
    XML_REGEXP_QUANT_PLUS,
    XML_REGEXP_QUANT_ONCEONLY,
    XML_REGEXP_QUANT_ALL,
    XML_REGEXP_QUANT_RANGE
} xmlRegQuantType;
180 
/*
 * Type of an automata state. xmlRegNewState() creates states as
 * XML_REGEXP_TRANS_STATE; the value is also stored in the first column
 * of each row of the compact transition table.
 */
typedef enum {
    XML_REGEXP_START_STATE = 1,
    XML_REGEXP_FINAL_STATE,
    XML_REGEXP_TRANS_STATE,
    XML_REGEXP_SINK_STATE,
    XML_REGEXP_UNREACH_STATE
} xmlRegStateType;
188 
/*
 * Marker values stored in the mark, markd and reached fields of a state.
 * New states start with XML_REGEXP_MARK_NORMAL.
 */
typedef enum {
    XML_REGEXP_MARK_NORMAL = 0,
    XML_REGEXP_MARK_START,
    XML_REGEXP_MARK_VISITED
} xmlRegMarkedType;
194 
typedef struct _xmlRegRange xmlRegRange;
typedef xmlRegRange *xmlRegRangePtr;

/*
 * One range of an atom. The range owns blockName: xmlRegFreeRange()
 * releases it and xmlRegCopyRange() duplicates it.
 */
struct _xmlRegRange {
    int neg;		/* 0 normal, 1 not, 2 exclude */
    xmlRegAtomType type;	/* kind of range */
    int start;		/* the start codepoint */
    int end;		/* the end codepoint */
    xmlChar *blockName;	/* owned string, may be NULL */
};
205 
typedef struct _xmlRegAtom xmlRegAtom;
typedef xmlRegAtom *xmlRegAtomPtr;

typedef struct _xmlAutomataState xmlRegState;
typedef xmlRegState *xmlRegStatePtr;

/*
 * An atom of the regular expression. xmlRegFreeAtom() releases the ranges
 * array with its entries, and valuep/valuep2 depending on the atom type.
 */
struct _xmlRegAtom {
    int no;		/* index used for per-atom lookup tables */
    xmlRegAtomType type;
    xmlRegQuantType quant;
    int min;
    int max;

    void *valuep;	/* freed with the atom for STRING and BLOCK_NAME
			   atoms; the string value of a STRING atom */
    void *valuep2;	/* freed with the atom for STRING atoms */
    int neg;
    int codepoint;
    xmlRegStatePtr start;
    xmlRegStatePtr start0;
    xmlRegStatePtr stop;
    int maxRanges;	/* presumably the allocated size of ranges - confirm */
    int nbRanges;	/* number of entries used in ranges */
    xmlRegRangePtr *ranges;
    void *data;		/* copied into the transdata table of the compact
			   form */
};
231 
typedef struct _xmlRegCounter xmlRegCounter;
typedef xmlRegCounter *xmlRegCounterPtr;

/*
 * A counter with its two bounds.
 * NOTE(review): presumably the minimum and maximum number of occurrences;
 * the code using counters is outside this part of the file - confirm.
 */
struct _xmlRegCounter {
    int min;
    int max;
};
239 
typedef struct _xmlRegTrans xmlRegTrans;
typedef xmlRegTrans *xmlRegTransPtr;

/*
 * A transition leaving a state. When building the compact form,
 * transitions with to == -1 or atom == NULL are skipped.
 */
struct _xmlRegTrans {
    xmlRegAtomPtr atom;	/* atom triggering the transition, may be NULL */
    int to;		/* index of the target state, -1 if none */
    int counter;
    int count;
    int nd;
};
250 
/*
 * A state of the automata. xmlRegFreeState() releases the trans and
 * transTo arrays together with the state.
 */
struct _xmlAutomataState {
    xmlRegStateType type;
    xmlRegMarkedType mark;
    xmlRegMarkedType markd;
    xmlRegMarkedType reached;
    int no;
    int maxTrans;
    int nbTrans;	/* number of entries used in trans */
    xmlRegTrans *trans;
    /*  knowing states pointing to us can speed things up */
    int maxTransTo;
    int nbTransTo;
    int *transTo;
};
265 
typedef struct _xmlAutomata xmlRegParserCtxt;
typedef xmlRegParserCtxt *xmlRegParserCtxtPtr;

#define AM_AUTOMATA_RNG 1

/*
 * The regexp parser context, which is also the automata under
 * construction. The string, states, atoms and counters are owned by the
 * context until xmlRegEpxFromParse() hands them over to the regexp.
 */
struct _xmlAutomata {
    xmlChar *string;	/* private copy of the expression, may be NULL */
    xmlChar *cur;	/* parsing cursor, starts at string */

    int error;		/* last error code, 0 if none */
    int neg;

    xmlRegStatePtr start;
    xmlRegStatePtr end;
    xmlRegStatePtr state;

    xmlRegAtomPtr atom;

    int maxAtoms;
    int nbAtoms;	/* number of entries used in atoms */
    xmlRegAtomPtr *atoms;

    int maxStates;
    int nbStates;	/* number of entries used in states */
    xmlRegStatePtr *states;

    int maxCounters;
    int nbCounters;
    xmlRegCounter *counters;

    int determinist;	/* initialised to -1, which makes
			   xmlRegEpxFromParse() compute it */
    int negs;		/* a non zero value prevents the compact form */
    int flags;
};
300 
/*
 * A compiled regexp. It either keeps the states/atoms built by the parser
 * or, when the compact form could be built, only the tables below (states
 * and atoms are then released and set to NULL).
 */
struct _xmlRegexp {
    xmlChar *string;
    int nbStates;
    xmlRegStatePtr *states;
    int nbAtoms;
    xmlRegAtomPtr *atoms;
    int nbCounters;
    xmlRegCounter *counters;
    int determinist;
    int flags;
    /*
     * That's the compact form for deterministic automata
     */
    int nbstates;	/* number of states left in the compact form */
    int *compact;	/* rows of (nbstrings + 1) ints: the first entry
			   is the state type, then for each string the
			   target state number + 1, 0 meaning none */
    void **transdata;	/* nbstates x nbstrings atom data, may be NULL */
    int nbstrings;	/* number of entries in stringMap */
    xmlChar **stringMap;/* the unique strings labelling transitions */
};
320 
typedef struct _xmlRegExecRollback xmlRegExecRollback;
typedef xmlRegExecRollback *xmlRegExecRollbackPtr;

/*
 * One entry of the stack of rollback states kept by an execution context.
 */
struct _xmlRegExecRollback {
    xmlRegStatePtr state;/* the current state */
    int index;		/* the index in the input stack */
    int nextbranch;	/* the next transition to explore in that state */
    int *counts;	/* save the automata state if it has some */
};
330 
typedef struct _xmlRegInputToken xmlRegInputToken;
typedef xmlRegInputToken *xmlRegInputTokenPtr;

/*
 * One entry of the input stack used when operating on strings
 * (see inputStack in struct _xmlRegExecCtxt).
 */
struct _xmlRegInputToken {
    xmlChar *value;	/* the token string */
    void *data;		/* data associated with the token */
};
338 
/*
 * The context used to run a compiled regexp against some input.
 */
struct _xmlRegExecCtxt {
    int status;		/* execution status != 0 indicate an error */
    int determinist;	/* did we find an indeterministic behaviour */
    xmlRegexpPtr comp;	/* the compiled regexp */
    xmlRegExecCallbacks callback;
    void *data;

    xmlRegStatePtr state;/* the current state */
    int transno;	/* the current transition on that state */
    int transcount;	/* the number of chars in char counted transitions */

    /*
     * A stack of rollback states
     */
    int maxRollbacks;
    int nbRollbacks;
    xmlRegExecRollback *rollbacks;

    /*
     * The state of the automata if any
     */
    int *counts;

    /*
     * The input stack
     */
    int inputStackMax;
    int inputStackNr;
    int index;
    int *charStack;
    const xmlChar *inputString; /* when operating on characters */
    xmlRegInputTokenPtr inputStack;/* when operating on strings */

    /*
     * error handling
     */
    int errStateNo;		/* the error state number */
    xmlRegStatePtr errState;    /* the error state */
    xmlChar *errString;		/* the string raising the error */
    int *errCounts;		/* counters at the error state */
    int nbPush;			/* presumably the number of pushes done so
				   far (see MAX_PUSH) - confirm */
};
381 
/*
 * Special counter values.
 * NOTE(review): their exact use is outside this part of the file; the
 * names suggest they flag "all" style content models - confirm.
 */
#define REGEXP_ALL_COUNTER	0x123456
#define REGEXP_ALL_LAX_COUNTER	0x123457

/*
 * Forward declarations of static routines used before their definition.
 */
static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
static void xmlRegFreeState(xmlRegStatePtr state);
static void xmlRegFreeAtom(xmlRegAtomPtr atom);
static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr);
static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint);
static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint,
                  int neg, int start, int end, const xmlChar *blockName);

/* not static: this one has external linkage */
void xmlAutomataSetFlags(xmlAutomataPtr am, int flags);
394 
395 /************************************************************************
396  *									*
397  *		Regexp memory error handler				*
398  *									*
399  ************************************************************************/
400 /**
401  * xmlRegexpErrMemory:
402  * @extra:  extra information
403  *
404  * Handle an out of memory condition
405  */
406 static void
xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt,const char * extra)407 xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra)
408 {
409     const char *regexp = NULL;
410     if (ctxt != NULL) {
411         regexp = (const char *) ctxt->string;
412 	ctxt->error = XML_ERR_NO_MEMORY;
413     }
414     __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
415 		    XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
416 		    regexp, NULL, 0, 0,
417 		    "Memory allocation failed : %s\n", extra);
418 }
419 
420 /**
421  * xmlRegexpErrCompile:
422  * @extra:  extra information
423  *
424  * Handle a compilation failure
425  */
426 static void
xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt,const char * extra)427 xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra)
428 {
429     const char *regexp = NULL;
430     int idx = 0;
431 
432     if (ctxt != NULL) {
433         regexp = (const char *) ctxt->string;
434 	idx = ctxt->cur - ctxt->string;
435 	ctxt->error = XML_REGEXP_COMPILE_ERROR;
436     }
437     __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
438 		    XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra,
439 		    regexp, NULL, idx, 0,
440 		    "failed to compile: %s\n", extra);
441 }
442 
443 /************************************************************************
444  *									*
445  *			Allocation/Deallocation				*
446  *									*
447  ************************************************************************/
448 
449 static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
450 /**
451  * xmlRegEpxFromParse:
452  * @ctxt:  the parser context used to build it
453  *
454  * Allocate a new regexp and fill it with the result from the parser
455  *
456  * Returns the new regexp or NULL in case of error
457  */
458 static xmlRegexpPtr
xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt)459 xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
460     xmlRegexpPtr ret;
461 
462     ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp));
463     if (ret == NULL) {
464 	xmlRegexpErrMemory(ctxt, "compiling regexp");
465 	return(NULL);
466     }
467     memset(ret, 0, sizeof(xmlRegexp));
468     ret->string = ctxt->string;
469     ret->nbStates = ctxt->nbStates;
470     ret->states = ctxt->states;
471     ret->nbAtoms = ctxt->nbAtoms;
472     ret->atoms = ctxt->atoms;
473     ret->nbCounters = ctxt->nbCounters;
474     ret->counters = ctxt->counters;
475     ret->determinist = ctxt->determinist;
476     ret->flags = ctxt->flags;
477     if (ret->determinist == -1) {
478         xmlRegexpIsDeterminist(ret);
479     }
480 
481     if ((ret->determinist != 0) &&
482 	(ret->nbCounters == 0) &&
483 	(ctxt->negs == 0) &&
484 	(ret->atoms != NULL) &&
485 	(ret->atoms[0] != NULL) &&
486 	(ret->atoms[0]->type == XML_REGEXP_STRING)) {
487 	int i, j, nbstates = 0, nbatoms = 0;
488 	int *stateRemap;
489 	int *stringRemap;
490 	int *transitions;
491 	void **transdata;
492 	xmlChar **stringMap;
493         xmlChar *value;
494 
495 	/*
496 	 * Switch to a compact representation
497 	 * 1/ counting the effective number of states left
498 	 * 2/ counting the unique number of atoms, and check that
499 	 *    they are all of the string type
500 	 * 3/ build a table state x atom for the transitions
501 	 */
502 
503 	stateRemap = xmlMalloc(ret->nbStates * sizeof(int));
504 	if (stateRemap == NULL) {
505 	    xmlRegexpErrMemory(ctxt, "compiling regexp");
506 	    xmlFree(ret);
507 	    return(NULL);
508 	}
509 	for (i = 0;i < ret->nbStates;i++) {
510 	    if (ret->states[i] != NULL) {
511 		stateRemap[i] = nbstates;
512 		nbstates++;
513 	    } else {
514 		stateRemap[i] = -1;
515 	    }
516 	}
517 #ifdef DEBUG_COMPACTION
518 	printf("Final: %d states\n", nbstates);
519 #endif
520 	stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *));
521 	if (stringMap == NULL) {
522 	    xmlRegexpErrMemory(ctxt, "compiling regexp");
523 	    xmlFree(stateRemap);
524 	    xmlFree(ret);
525 	    return(NULL);
526 	}
527 	stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int));
528 	if (stringRemap == NULL) {
529 	    xmlRegexpErrMemory(ctxt, "compiling regexp");
530 	    xmlFree(stringMap);
531 	    xmlFree(stateRemap);
532 	    xmlFree(ret);
533 	    return(NULL);
534 	}
535 	for (i = 0;i < ret->nbAtoms;i++) {
536 	    if ((ret->atoms[i]->type == XML_REGEXP_STRING) &&
537 		(ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) {
538 		value = ret->atoms[i]->valuep;
539                 for (j = 0;j < nbatoms;j++) {
540 		    if (xmlStrEqual(stringMap[j], value)) {
541 			stringRemap[i] = j;
542 			break;
543 		    }
544 		}
545 		if (j >= nbatoms) {
546 		    stringRemap[i] = nbatoms;
547 		    stringMap[nbatoms] = xmlStrdup(value);
548 		    if (stringMap[nbatoms] == NULL) {
549 			for (i = 0;i < nbatoms;i++)
550 			    xmlFree(stringMap[i]);
551 			xmlFree(stringRemap);
552 			xmlFree(stringMap);
553 			xmlFree(stateRemap);
554 			xmlFree(ret);
555 			return(NULL);
556 		    }
557 		    nbatoms++;
558 		}
559 	    } else {
560 		xmlFree(stateRemap);
561 		xmlFree(stringRemap);
562 		for (i = 0;i < nbatoms;i++)
563 		    xmlFree(stringMap[i]);
564 		xmlFree(stringMap);
565 		xmlFree(ret);
566 		return(NULL);
567 	    }
568 	}
569 #ifdef DEBUG_COMPACTION
570 	printf("Final: %d atoms\n", nbatoms);
571 #endif
572 	transitions = (int *) xmlMalloc((nbstates + 1) *
573 	                                (nbatoms + 1) * sizeof(int));
574 	if (transitions == NULL) {
575 	    xmlFree(stateRemap);
576 	    xmlFree(stringRemap);
577 	    xmlFree(stringMap);
578 	    xmlFree(ret);
579 	    return(NULL);
580 	}
581 	memset(transitions, 0, (nbstates + 1) * (nbatoms + 1) * sizeof(int));
582 
583 	/*
584 	 * Allocate the transition table. The first entry for each
585 	 * state corresponds to the state type.
586 	 */
587 	transdata = NULL;
588 
589 	for (i = 0;i < ret->nbStates;i++) {
590 	    int stateno, atomno, targetno, prev;
591 	    xmlRegStatePtr state;
592 	    xmlRegTransPtr trans;
593 
594 	    stateno = stateRemap[i];
595 	    if (stateno == -1)
596 		continue;
597 	    state = ret->states[i];
598 
599 	    transitions[stateno * (nbatoms + 1)] = state->type;
600 
601 	    for (j = 0;j < state->nbTrans;j++) {
602 		trans = &(state->trans[j]);
603 		if ((trans->to == -1) || (trans->atom == NULL))
604 		    continue;
605                 atomno = stringRemap[trans->atom->no];
606 		if ((trans->atom->data != NULL) && (transdata == NULL)) {
607 		    transdata = (void **) xmlMalloc(nbstates * nbatoms *
608 			                            sizeof(void *));
609 		    if (transdata != NULL)
610 			memset(transdata, 0,
611 			       nbstates * nbatoms * sizeof(void *));
612 		    else {
613 			xmlRegexpErrMemory(ctxt, "compiling regexp");
614 			break;
615 		    }
616 		}
617 		targetno = stateRemap[trans->to];
618 		/*
619 		 * if the same atom can generate transitions to 2 different
620 		 * states then it means the automata is not determinist and
621 		 * the compact form can't be used !
622 		 */
623 		prev = transitions[stateno * (nbatoms + 1) + atomno + 1];
624 		if (prev != 0) {
625 		    if (prev != targetno + 1) {
626 			ret->determinist = 0;
627 #ifdef DEBUG_COMPACTION
628 			printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n",
629 			       i, j, trans->atom->no, trans->to, atomno, targetno);
630 			printf("       previous to is %d\n", prev);
631 #endif
632 			if (transdata != NULL)
633 			    xmlFree(transdata);
634 			xmlFree(transitions);
635 			xmlFree(stateRemap);
636 			xmlFree(stringRemap);
637 			for (i = 0;i < nbatoms;i++)
638 			    xmlFree(stringMap[i]);
639 			xmlFree(stringMap);
640 			goto not_determ;
641 		    }
642 		} else {
643 #if 0
644 		    printf("State %d trans %d: atom %d to %d : %d to %d\n",
645 			   i, j, trans->atom->no, trans->to, atomno, targetno);
646 #endif
647 		    transitions[stateno * (nbatoms + 1) + atomno + 1] =
648 			targetno + 1; /* to avoid 0 */
649 		    if (transdata != NULL)
650 			transdata[stateno * nbatoms + atomno] =
651 			    trans->atom->data;
652 		}
653 	    }
654 	}
655 	ret->determinist = 1;
656 #ifdef DEBUG_COMPACTION
657 	/*
658 	 * Debug
659 	 */
660 	for (i = 0;i < nbstates;i++) {
661 	    for (j = 0;j < nbatoms + 1;j++) {
662                 printf("%02d ", transitions[i * (nbatoms + 1) + j]);
663 	    }
664 	    printf("\n");
665 	}
666 	printf("\n");
667 #endif
668 	/*
669 	 * Cleanup of the old data
670 	 */
671 	if (ret->states != NULL) {
672 	    for (i = 0;i < ret->nbStates;i++)
673 		xmlRegFreeState(ret->states[i]);
674 	    xmlFree(ret->states);
675 	}
676 	ret->states = NULL;
677 	ret->nbStates = 0;
678 	if (ret->atoms != NULL) {
679 	    for (i = 0;i < ret->nbAtoms;i++)
680 		xmlRegFreeAtom(ret->atoms[i]);
681 	    xmlFree(ret->atoms);
682 	}
683 	ret->atoms = NULL;
684 	ret->nbAtoms = 0;
685 
686 	ret->compact = transitions;
687 	ret->transdata = transdata;
688 	ret->stringMap = stringMap;
689 	ret->nbstrings = nbatoms;
690 	ret->nbstates = nbstates;
691 	xmlFree(stateRemap);
692 	xmlFree(stringRemap);
693     }
694 not_determ:
695     ctxt->string = NULL;
696     ctxt->nbStates = 0;
697     ctxt->states = NULL;
698     ctxt->nbAtoms = 0;
699     ctxt->atoms = NULL;
700     ctxt->nbCounters = 0;
701     ctxt->counters = NULL;
702     return(ret);
703 }
704 
705 /**
706  * xmlRegNewParserCtxt:
707  * @string:  the string to parse
708  *
709  * Allocate a new regexp parser context
710  *
711  * Returns the new context or NULL in case of error
712  */
713 static xmlRegParserCtxtPtr
xmlRegNewParserCtxt(const xmlChar * string)714 xmlRegNewParserCtxt(const xmlChar *string) {
715     xmlRegParserCtxtPtr ret;
716 
717     ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt));
718     if (ret == NULL)
719 	return(NULL);
720     memset(ret, 0, sizeof(xmlRegParserCtxt));
721     if (string != NULL)
722 	ret->string = xmlStrdup(string);
723     ret->cur = ret->string;
724     ret->neg = 0;
725     ret->negs = 0;
726     ret->error = 0;
727     ret->determinist = -1;
728     return(ret);
729 }
730 
731 /**
732  * xmlRegNewRange:
733  * @ctxt:  the regexp parser context
734  * @neg:  is that negative
735  * @type:  the type of range
736  * @start:  the start codepoint
737  * @end:  the end codepoint
738  *
739  * Allocate a new regexp range
740  *
741  * Returns the new range or NULL in case of error
742  */
743 static xmlRegRangePtr
xmlRegNewRange(xmlRegParserCtxtPtr ctxt,int neg,xmlRegAtomType type,int start,int end)744 xmlRegNewRange(xmlRegParserCtxtPtr ctxt,
745 	       int neg, xmlRegAtomType type, int start, int end) {
746     xmlRegRangePtr ret;
747 
748     ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange));
749     if (ret == NULL) {
750 	xmlRegexpErrMemory(ctxt, "allocating range");
751 	return(NULL);
752     }
753     ret->neg = neg;
754     ret->type = type;
755     ret->start = start;
756     ret->end = end;
757     return(ret);
758 }
759 
760 /**
761  * xmlRegFreeRange:
762  * @range:  the regexp range
763  *
764  * Free a regexp range
765  */
766 static void
xmlRegFreeRange(xmlRegRangePtr range)767 xmlRegFreeRange(xmlRegRangePtr range) {
768     if (range == NULL)
769 	return;
770 
771     if (range->blockName != NULL)
772 	xmlFree(range->blockName);
773     xmlFree(range);
774 }
775 
776 /**
777  * xmlRegCopyRange:
778  * @range:  the regexp range
779  *
780  * Copy a regexp range
781  *
782  * Returns the new copy or NULL in case of error.
783  */
784 static xmlRegRangePtr
xmlRegCopyRange(xmlRegParserCtxtPtr ctxt,xmlRegRangePtr range)785 xmlRegCopyRange(xmlRegParserCtxtPtr ctxt, xmlRegRangePtr range) {
786     xmlRegRangePtr ret;
787 
788     if (range == NULL)
789 	return(NULL);
790 
791     ret = xmlRegNewRange(ctxt, range->neg, range->type, range->start,
792                          range->end);
793     if (ret == NULL)
794         return(NULL);
795     if (range->blockName != NULL) {
796 	ret->blockName = xmlStrdup(range->blockName);
797 	if (ret->blockName == NULL) {
798 	    xmlRegexpErrMemory(ctxt, "allocating range");
799 	    xmlRegFreeRange(ret);
800 	    return(NULL);
801 	}
802     }
803     return(ret);
804 }
805 
806 /**
807  * xmlRegNewAtom:
808  * @ctxt:  the regexp parser context
809  * @type:  the type of atom
810  *
811  * Allocate a new atom
812  *
813  * Returns the new atom or NULL in case of error
814  */
815 static xmlRegAtomPtr
xmlRegNewAtom(xmlRegParserCtxtPtr ctxt,xmlRegAtomType type)816 xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) {
817     xmlRegAtomPtr ret;
818 
819     ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
820     if (ret == NULL) {
821 	xmlRegexpErrMemory(ctxt, "allocating atom");
822 	return(NULL);
823     }
824     memset(ret, 0, sizeof(xmlRegAtom));
825     ret->type = type;
826     ret->quant = XML_REGEXP_QUANT_ONCE;
827     ret->min = 0;
828     ret->max = 0;
829     return(ret);
830 }
831 
832 /**
833  * xmlRegFreeAtom:
834  * @atom:  the regexp atom
835  *
836  * Free a regexp atom
837  */
838 static void
xmlRegFreeAtom(xmlRegAtomPtr atom)839 xmlRegFreeAtom(xmlRegAtomPtr atom) {
840     int i;
841 
842     if (atom == NULL)
843 	return;
844 
845     for (i = 0;i < atom->nbRanges;i++)
846 	xmlRegFreeRange(atom->ranges[i]);
847     if (atom->ranges != NULL)
848 	xmlFree(atom->ranges);
849     if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL))
850 	xmlFree(atom->valuep);
851     if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL))
852 	xmlFree(atom->valuep2);
853     if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL))
854 	xmlFree(atom->valuep);
855     xmlFree(atom);
856 }
857 
858 /**
859  * xmlRegCopyAtom:
860  * @ctxt:  the regexp parser context
861  * @atom:  the oiginal atom
862  *
863  * Allocate a new regexp range
864  *
865  * Returns the new atom or NULL in case of error
866  */
867 static xmlRegAtomPtr
xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt,xmlRegAtomPtr atom)868 xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
869     xmlRegAtomPtr ret;
870 
871     ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
872     if (ret == NULL) {
873 	xmlRegexpErrMemory(ctxt, "copying atom");
874 	return(NULL);
875     }
876     memset(ret, 0, sizeof(xmlRegAtom));
877     ret->type = atom->type;
878     ret->quant = atom->quant;
879     ret->min = atom->min;
880     ret->max = atom->max;
881     if (atom->nbRanges > 0) {
882         int i;
883 
884         ret->ranges = (xmlRegRangePtr *) xmlMalloc(sizeof(xmlRegRangePtr) *
885 	                                           atom->nbRanges);
886 	if (ret->ranges == NULL) {
887 	    xmlRegexpErrMemory(ctxt, "copying atom");
888 	    goto error;
889 	}
890 	for (i = 0;i < atom->nbRanges;i++) {
891 	    ret->ranges[i] = xmlRegCopyRange(ctxt, atom->ranges[i]);
892 	    if (ret->ranges[i] == NULL)
893 	        goto error;
894 	    ret->nbRanges = i + 1;
895 	}
896     }
897     return(ret);
898 
899 error:
900     xmlRegFreeAtom(ret);
901     return(NULL);
902 }
903 
904 static xmlRegStatePtr
xmlRegNewState(xmlRegParserCtxtPtr ctxt)905 xmlRegNewState(xmlRegParserCtxtPtr ctxt) {
906     xmlRegStatePtr ret;
907 
908     ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState));
909     if (ret == NULL) {
910 	xmlRegexpErrMemory(ctxt, "allocating state");
911 	return(NULL);
912     }
913     memset(ret, 0, sizeof(xmlRegState));
914     ret->type = XML_REGEXP_TRANS_STATE;
915     ret->mark = XML_REGEXP_MARK_NORMAL;
916     return(ret);
917 }
918 
919 /**
920  * xmlRegFreeState:
921  * @state:  the regexp state
922  *
923  * Free a regexp state
924  */
925 static void
xmlRegFreeState(xmlRegStatePtr state)926 xmlRegFreeState(xmlRegStatePtr state) {
927     if (state == NULL)
928 	return;
929 
930     if (state->trans != NULL)
931 	xmlFree(state->trans);
932     if (state->transTo != NULL)
933 	xmlFree(state->transTo);
934     xmlFree(state);
935 }
936 
937 /**
938  * xmlRegFreeParserCtxt:
939  * @ctxt:  the regexp parser context
940  *
941  * Free a regexp parser context
942  */
943 static void
xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt)944 xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) {
945     int i;
946     if (ctxt == NULL)
947 	return;
948 
949     if (ctxt->string != NULL)
950 	xmlFree(ctxt->string);
951     if (ctxt->states != NULL) {
952 	for (i = 0;i < ctxt->nbStates;i++)
953 	    xmlRegFreeState(ctxt->states[i]);
954 	xmlFree(ctxt->states);
955     }
956     if (ctxt->atoms != NULL) {
957 	for (i = 0;i < ctxt->nbAtoms;i++)
958 	    xmlRegFreeAtom(ctxt->atoms[i]);
959 	xmlFree(ctxt->atoms);
960     }
961     if (ctxt->counters != NULL)
962 	xmlFree(ctxt->counters);
963     xmlFree(ctxt);
964 }
965 
966 /************************************************************************
967  *									*
968  *			Display of Data structures			*
969  *									*
970  ************************************************************************/
971 
972 static void
xmlRegPrintAtomType(FILE * output,xmlRegAtomType type)973 xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) {
974     switch (type) {
975         case XML_REGEXP_EPSILON:
976 	    fprintf(output, "epsilon "); break;
977         case XML_REGEXP_CHARVAL:
978 	    fprintf(output, "charval "); break;
979         case XML_REGEXP_RANGES:
980 	    fprintf(output, "ranges "); break;
981         case XML_REGEXP_SUBREG:
982 	    fprintf(output, "subexpr "); break;
983         case XML_REGEXP_STRING:
984 	    fprintf(output, "string "); break;
985         case XML_REGEXP_ANYCHAR:
986 	    fprintf(output, "anychar "); break;
987         case XML_REGEXP_ANYSPACE:
988 	    fprintf(output, "anyspace "); break;
989         case XML_REGEXP_NOTSPACE:
990 	    fprintf(output, "notspace "); break;
991         case XML_REGEXP_INITNAME:
992 	    fprintf(output, "initname "); break;
993         case XML_REGEXP_NOTINITNAME:
994 	    fprintf(output, "notinitname "); break;
995         case XML_REGEXP_NAMECHAR:
996 	    fprintf(output, "namechar "); break;
997         case XML_REGEXP_NOTNAMECHAR:
998 	    fprintf(output, "notnamechar "); break;
999         case XML_REGEXP_DECIMAL:
1000 	    fprintf(output, "decimal "); break;
1001         case XML_REGEXP_NOTDECIMAL:
1002 	    fprintf(output, "notdecimal "); break;
1003         case XML_REGEXP_REALCHAR:
1004 	    fprintf(output, "realchar "); break;
1005         case XML_REGEXP_NOTREALCHAR:
1006 	    fprintf(output, "notrealchar "); break;
1007         case XML_REGEXP_LETTER:
1008             fprintf(output, "LETTER "); break;
1009         case XML_REGEXP_LETTER_UPPERCASE:
1010             fprintf(output, "LETTER_UPPERCASE "); break;
1011         case XML_REGEXP_LETTER_LOWERCASE:
1012             fprintf(output, "LETTER_LOWERCASE "); break;
1013         case XML_REGEXP_LETTER_TITLECASE:
1014             fprintf(output, "LETTER_TITLECASE "); break;
1015         case XML_REGEXP_LETTER_MODIFIER:
1016             fprintf(output, "LETTER_MODIFIER "); break;
1017         case XML_REGEXP_LETTER_OTHERS:
1018             fprintf(output, "LETTER_OTHERS "); break;
1019         case XML_REGEXP_MARK:
1020             fprintf(output, "MARK "); break;
1021         case XML_REGEXP_MARK_NONSPACING:
1022             fprintf(output, "MARK_NONSPACING "); break;
1023         case XML_REGEXP_MARK_SPACECOMBINING:
1024             fprintf(output, "MARK_SPACECOMBINING "); break;
1025         case XML_REGEXP_MARK_ENCLOSING:
1026             fprintf(output, "MARK_ENCLOSING "); break;
1027         case XML_REGEXP_NUMBER:
1028             fprintf(output, "NUMBER "); break;
1029         case XML_REGEXP_NUMBER_DECIMAL:
1030             fprintf(output, "NUMBER_DECIMAL "); break;
1031         case XML_REGEXP_NUMBER_LETTER:
1032             fprintf(output, "NUMBER_LETTER "); break;
1033         case XML_REGEXP_NUMBER_OTHERS:
1034             fprintf(output, "NUMBER_OTHERS "); break;
1035         case XML_REGEXP_PUNCT:
1036             fprintf(output, "PUNCT "); break;
1037         case XML_REGEXP_PUNCT_CONNECTOR:
1038             fprintf(output, "PUNCT_CONNECTOR "); break;
1039         case XML_REGEXP_PUNCT_DASH:
1040             fprintf(output, "PUNCT_DASH "); break;
1041         case XML_REGEXP_PUNCT_OPEN:
1042             fprintf(output, "PUNCT_OPEN "); break;
1043         case XML_REGEXP_PUNCT_CLOSE:
1044             fprintf(output, "PUNCT_CLOSE "); break;
1045         case XML_REGEXP_PUNCT_INITQUOTE:
1046             fprintf(output, "PUNCT_INITQUOTE "); break;
1047         case XML_REGEXP_PUNCT_FINQUOTE:
1048             fprintf(output, "PUNCT_FINQUOTE "); break;
1049         case XML_REGEXP_PUNCT_OTHERS:
1050             fprintf(output, "PUNCT_OTHERS "); break;
1051         case XML_REGEXP_SEPAR:
1052             fprintf(output, "SEPAR "); break;
1053         case XML_REGEXP_SEPAR_SPACE:
1054             fprintf(output, "SEPAR_SPACE "); break;
1055         case XML_REGEXP_SEPAR_LINE:
1056             fprintf(output, "SEPAR_LINE "); break;
1057         case XML_REGEXP_SEPAR_PARA:
1058             fprintf(output, "SEPAR_PARA "); break;
1059         case XML_REGEXP_SYMBOL:
1060             fprintf(output, "SYMBOL "); break;
1061         case XML_REGEXP_SYMBOL_MATH:
1062             fprintf(output, "SYMBOL_MATH "); break;
1063         case XML_REGEXP_SYMBOL_CURRENCY:
1064             fprintf(output, "SYMBOL_CURRENCY "); break;
1065         case XML_REGEXP_SYMBOL_MODIFIER:
1066             fprintf(output, "SYMBOL_MODIFIER "); break;
1067         case XML_REGEXP_SYMBOL_OTHERS:
1068             fprintf(output, "SYMBOL_OTHERS "); break;
1069         case XML_REGEXP_OTHER:
1070             fprintf(output, "OTHER "); break;
1071         case XML_REGEXP_OTHER_CONTROL:
1072             fprintf(output, "OTHER_CONTROL "); break;
1073         case XML_REGEXP_OTHER_FORMAT:
1074             fprintf(output, "OTHER_FORMAT "); break;
1075         case XML_REGEXP_OTHER_PRIVATE:
1076             fprintf(output, "OTHER_PRIVATE "); break;
1077         case XML_REGEXP_OTHER_NA:
1078             fprintf(output, "OTHER_NA "); break;
1079         case XML_REGEXP_BLOCK_NAME:
1080 	    fprintf(output, "BLOCK "); break;
1081     }
1082 }
1083 
1084 static void
xmlRegPrintQuantType(FILE * output,xmlRegQuantType type)1085 xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) {
1086     switch (type) {
1087         case XML_REGEXP_QUANT_EPSILON:
1088 	    fprintf(output, "epsilon "); break;
1089         case XML_REGEXP_QUANT_ONCE:
1090 	    fprintf(output, "once "); break;
1091         case XML_REGEXP_QUANT_OPT:
1092 	    fprintf(output, "? "); break;
1093         case XML_REGEXP_QUANT_MULT:
1094 	    fprintf(output, "* "); break;
1095         case XML_REGEXP_QUANT_PLUS:
1096 	    fprintf(output, "+ "); break;
1097 	case XML_REGEXP_QUANT_RANGE:
1098 	    fprintf(output, "range "); break;
1099 	case XML_REGEXP_QUANT_ONCEONLY:
1100 	    fprintf(output, "onceonly "); break;
1101 	case XML_REGEXP_QUANT_ALL:
1102 	    fprintf(output, "all "); break;
1103     }
1104 }
1105 static void
xmlRegPrintRange(FILE * output,xmlRegRangePtr range)1106 xmlRegPrintRange(FILE *output, xmlRegRangePtr range) {
1107     fprintf(output, "  range: ");
1108     if (range->neg)
1109 	fprintf(output, "negative ");
1110     xmlRegPrintAtomType(output, range->type);
1111     fprintf(output, "%c - %c\n", range->start, range->end);
1112 }
1113 
1114 static void
xmlRegPrintAtom(FILE * output,xmlRegAtomPtr atom)1115 xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) {
1116     fprintf(output, " atom: ");
1117     if (atom == NULL) {
1118 	fprintf(output, "NULL\n");
1119 	return;
1120     }
1121     if (atom->neg)
1122         fprintf(output, "not ");
1123     xmlRegPrintAtomType(output, atom->type);
1124     xmlRegPrintQuantType(output, atom->quant);
1125     if (atom->quant == XML_REGEXP_QUANT_RANGE)
1126 	fprintf(output, "%d-%d ", atom->min, atom->max);
1127     if (atom->type == XML_REGEXP_STRING)
1128 	fprintf(output, "'%s' ", (char *) atom->valuep);
1129     if (atom->type == XML_REGEXP_CHARVAL)
1130 	fprintf(output, "char %c\n", atom->codepoint);
1131     else if (atom->type == XML_REGEXP_RANGES) {
1132 	int i;
1133 	fprintf(output, "%d entries\n", atom->nbRanges);
1134 	for (i = 0; i < atom->nbRanges;i++)
1135 	    xmlRegPrintRange(output, atom->ranges[i]);
1136     } else if (atom->type == XML_REGEXP_SUBREG) {
1137 	fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no);
1138     } else {
1139 	fprintf(output, "\n");
1140     }
1141 }
1142 
1143 static void
xmlRegPrintTrans(FILE * output,xmlRegTransPtr trans)1144 xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) {
1145     fprintf(output, "  trans: ");
1146     if (trans == NULL) {
1147 	fprintf(output, "NULL\n");
1148 	return;
1149     }
1150     if (trans->to < 0) {
1151 	fprintf(output, "removed\n");
1152 	return;
1153     }
1154     if (trans->nd != 0) {
1155 	if (trans->nd == 2)
1156 	    fprintf(output, "last not determinist, ");
1157 	else
1158 	    fprintf(output, "not determinist, ");
1159     }
1160     if (trans->counter >= 0) {
1161 	fprintf(output, "counted %d, ", trans->counter);
1162     }
1163     if (trans->count == REGEXP_ALL_COUNTER) {
1164 	fprintf(output, "all transition, ");
1165     } else if (trans->count >= 0) {
1166 	fprintf(output, "count based %d, ", trans->count);
1167     }
1168     if (trans->atom == NULL) {
1169 	fprintf(output, "epsilon to %d\n", trans->to);
1170 	return;
1171     }
1172     if (trans->atom->type == XML_REGEXP_CHARVAL)
1173 	fprintf(output, "char %c ", trans->atom->codepoint);
1174     fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to);
1175 }
1176 
1177 static void
xmlRegPrintState(FILE * output,xmlRegStatePtr state)1178 xmlRegPrintState(FILE *output, xmlRegStatePtr state) {
1179     int i;
1180 
1181     fprintf(output, " state: ");
1182     if (state == NULL) {
1183 	fprintf(output, "NULL\n");
1184 	return;
1185     }
1186     if (state->type == XML_REGEXP_START_STATE)
1187 	fprintf(output, "START ");
1188     if (state->type == XML_REGEXP_FINAL_STATE)
1189 	fprintf(output, "FINAL ");
1190 
1191     fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans);
1192     for (i = 0;i < state->nbTrans; i++) {
1193 	xmlRegPrintTrans(output, &(state->trans[i]));
1194     }
1195 }
1196 
#ifdef DEBUG_REGEXP_GRAPH
/*
 * xmlRegPrintCtxt:
 * @output:  the stream to write to
 * @ctxt:  the parser context to dump, may be NULL
 *
 * Dumps the context: the expression string, the error and neg flags,
 * every registered atom, the current atom if any, every state (with the
 * numbers of the start and end states when they are set) and the
 * min/max of every counter. Only compiled when DEBUG_REGEXP_GRAPH is
 * defined.
 */
static void
xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) {
    int idx;

    fprintf(output, " ctxt: ");
    if (ctxt == NULL) {
        fprintf(output, "NULL\n");
        return;
    }

    /* header line: expression followed by the flags */
    fprintf(output, "'%s' ", ctxt->string);
    if (ctxt->error)
        fputs("error ", output);
    if (ctxt->neg)
        fputs("neg ", output);
    fputs("\n", output);

    /* atoms */
    fprintf(output, "%d atoms:\n", ctxt->nbAtoms);
    idx = 0;
    while (idx < ctxt->nbAtoms) {
        fprintf(output, " %02d ", idx);
        xmlRegPrintAtom(output, ctxt->atoms[idx]);
        idx++;
    }
    if (ctxt->atom != NULL) {
        fputs("current atom:\n", output);
        xmlRegPrintAtom(output, ctxt->atom);
    }

    /* states */
    fprintf(output, "%d states:", ctxt->nbStates);
    if (ctxt->start != NULL)
        fprintf(output, " start: %d", ctxt->start->no);
    if (ctxt->end != NULL)
        fprintf(output, " end: %d", ctxt->end->no);
    fputs("\n", output);
    idx = 0;
    while (idx < ctxt->nbStates) {
        xmlRegPrintState(output, ctxt->states[idx]);
        idx++;
    }

    /* counters */
    fprintf(output, "%d counters:\n", ctxt->nbCounters);
    idx = 0;
    while (idx < ctxt->nbCounters) {
        fprintf(output, " %d: min %d max %d\n", idx,
                ctxt->counters[idx].min, ctxt->counters[idx].max);
        idx++;
    }
}
#endif
1238 
1239 /************************************************************************
1240  *									*
1241  *		 Finite Automata structures manipulations		*
1242  *									*
1243  ************************************************************************/
1244 
1245 static void
xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt,xmlRegAtomPtr atom,int neg,xmlRegAtomType type,int start,int end,xmlChar * blockName)1246 xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom,
1247 	           int neg, xmlRegAtomType type, int start, int end,
1248 		   xmlChar *blockName) {
1249     xmlRegRangePtr range;
1250 
1251     if (atom == NULL) {
1252 	ERROR("add range: atom is NULL");
1253 	return;
1254     }
1255     if (atom->type != XML_REGEXP_RANGES) {
1256 	ERROR("add range: atom is not ranges");
1257 	return;
1258     }
1259     if (atom->maxRanges == 0) {
1260 	atom->maxRanges = 4;
1261 	atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges *
1262 		                             sizeof(xmlRegRangePtr));
1263 	if (atom->ranges == NULL) {
1264 	    xmlRegexpErrMemory(ctxt, "adding ranges");
1265 	    atom->maxRanges = 0;
1266 	    return;
1267 	}
1268     } else if (atom->nbRanges >= atom->maxRanges) {
1269 	xmlRegRangePtr *tmp;
1270 	atom->maxRanges *= 2;
1271 	tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges *
1272 		                             sizeof(xmlRegRangePtr));
1273 	if (tmp == NULL) {
1274 	    xmlRegexpErrMemory(ctxt, "adding ranges");
1275 	    atom->maxRanges /= 2;
1276 	    return;
1277 	}
1278 	atom->ranges = tmp;
1279     }
1280     range = xmlRegNewRange(ctxt, neg, type, start, end);
1281     if (range == NULL)
1282 	return;
1283     range->blockName = blockName;
1284     atom->ranges[atom->nbRanges++] = range;
1285 
1286 }
1287 
1288 static int
xmlRegGetCounter(xmlRegParserCtxtPtr ctxt)1289 xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) {
1290     if (ctxt->maxCounters == 0) {
1291 	ctxt->maxCounters = 4;
1292 	ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters *
1293 		                             sizeof(xmlRegCounter));
1294 	if (ctxt->counters == NULL) {
1295 	    xmlRegexpErrMemory(ctxt, "allocating counter");
1296 	    ctxt->maxCounters = 0;
1297 	    return(-1);
1298 	}
1299     } else if (ctxt->nbCounters >= ctxt->maxCounters) {
1300 	xmlRegCounter *tmp;
1301 	ctxt->maxCounters *= 2;
1302 	tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters *
1303 		                           sizeof(xmlRegCounter));
1304 	if (tmp == NULL) {
1305 	    xmlRegexpErrMemory(ctxt, "allocating counter");
1306 	    ctxt->maxCounters /= 2;
1307 	    return(-1);
1308 	}
1309 	ctxt->counters = tmp;
1310     }
1311     ctxt->counters[ctxt->nbCounters].min = -1;
1312     ctxt->counters[ctxt->nbCounters].max = -1;
1313     return(ctxt->nbCounters++);
1314 }
1315 
1316 static int
xmlRegAtomPush(xmlRegParserCtxtPtr ctxt,xmlRegAtomPtr atom)1317 xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
1318     if (atom == NULL) {
1319 	ERROR("atom push: atom is NULL");
1320 	return(-1);
1321     }
1322     if (ctxt->maxAtoms == 0) {
1323 	ctxt->maxAtoms = 4;
1324 	ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms *
1325 		                             sizeof(xmlRegAtomPtr));
1326 	if (ctxt->atoms == NULL) {
1327 	    xmlRegexpErrMemory(ctxt, "pushing atom");
1328 	    ctxt->maxAtoms = 0;
1329 	    return(-1);
1330 	}
1331     } else if (ctxt->nbAtoms >= ctxt->maxAtoms) {
1332 	xmlRegAtomPtr *tmp;
1333 	ctxt->maxAtoms *= 2;
1334 	tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms *
1335 		                             sizeof(xmlRegAtomPtr));
1336 	if (tmp == NULL) {
1337 	    xmlRegexpErrMemory(ctxt, "allocating counter");
1338 	    ctxt->maxAtoms /= 2;
1339 	    return(-1);
1340 	}
1341 	ctxt->atoms = tmp;
1342     }
1343     atom->no = ctxt->nbAtoms;
1344     ctxt->atoms[ctxt->nbAtoms++] = atom;
1345     return(0);
1346 }
1347 
1348 static void
xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr target,int from)1349 xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target,
1350                       int from) {
1351     if (target->maxTransTo == 0) {
1352 	target->maxTransTo = 8;
1353 	target->transTo = (int *) xmlMalloc(target->maxTransTo *
1354 		                             sizeof(int));
1355 	if (target->transTo == NULL) {
1356 	    xmlRegexpErrMemory(ctxt, "adding transition");
1357 	    target->maxTransTo = 0;
1358 	    return;
1359 	}
1360     } else if (target->nbTransTo >= target->maxTransTo) {
1361 	int *tmp;
1362 	target->maxTransTo *= 2;
1363 	tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo *
1364 		                             sizeof(int));
1365 	if (tmp == NULL) {
1366 	    xmlRegexpErrMemory(ctxt, "adding transition");
1367 	    target->maxTransTo /= 2;
1368 	    return;
1369 	}
1370 	target->transTo = tmp;
1371     }
1372     target->transTo[target->nbTransTo] = from;
1373     target->nbTransTo++;
1374 }
1375 
1376 static void
xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr state,xmlRegAtomPtr atom,xmlRegStatePtr target,int counter,int count)1377 xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1378 	            xmlRegAtomPtr atom, xmlRegStatePtr target,
1379 		    int counter, int count) {
1380 
1381     int nrtrans;
1382 
1383     if (state == NULL) {
1384 	ERROR("add state: state is NULL");
1385 	return;
1386     }
1387     if (target == NULL) {
1388 	ERROR("add state: target is NULL");
1389 	return;
1390     }
1391     /*
1392      * Other routines follow the philosophy 'When in doubt, add a transition'
1393      * so we check here whether such a transition is already present and, if
1394      * so, silently ignore this request.
1395      */
1396 
1397     for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) {
1398 	xmlRegTransPtr trans = &(state->trans[nrtrans]);
1399 	if ((trans->atom == atom) &&
1400 	    (trans->to == target->no) &&
1401 	    (trans->counter == counter) &&
1402 	    (trans->count == count)) {
1403 #ifdef DEBUG_REGEXP_GRAPH
1404 	    printf("Ignoring duplicate transition from %d to %d\n",
1405 		    state->no, target->no);
1406 #endif
1407 	    return;
1408 	}
1409     }
1410 
1411     if (state->maxTrans == 0) {
1412 	state->maxTrans = 8;
1413 	state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans *
1414 		                             sizeof(xmlRegTrans));
1415 	if (state->trans == NULL) {
1416 	    xmlRegexpErrMemory(ctxt, "adding transition");
1417 	    state->maxTrans = 0;
1418 	    return;
1419 	}
1420     } else if (state->nbTrans >= state->maxTrans) {
1421 	xmlRegTrans *tmp;
1422 	state->maxTrans *= 2;
1423 	tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans *
1424 		                             sizeof(xmlRegTrans));
1425 	if (tmp == NULL) {
1426 	    xmlRegexpErrMemory(ctxt, "adding transition");
1427 	    state->maxTrans /= 2;
1428 	    return;
1429 	}
1430 	state->trans = tmp;
1431     }
1432 #ifdef DEBUG_REGEXP_GRAPH
1433     printf("Add trans from %d to %d ", state->no, target->no);
1434     if (count == REGEXP_ALL_COUNTER)
1435 	printf("all transition\n");
1436     else if (count >= 0)
1437 	printf("count based %d\n", count);
1438     else if (counter >= 0)
1439 	printf("counted %d\n", counter);
1440     else if (atom == NULL)
1441 	printf("epsilon transition\n");
1442     else if (atom != NULL)
1443         xmlRegPrintAtom(stdout, atom);
1444 #endif
1445 
1446     state->trans[state->nbTrans].atom = atom;
1447     state->trans[state->nbTrans].to = target->no;
1448     state->trans[state->nbTrans].counter = counter;
1449     state->trans[state->nbTrans].count = count;
1450     state->trans[state->nbTrans].nd = 0;
1451     state->nbTrans++;
1452     xmlRegStateAddTransTo(ctxt, target, state->no);
1453 }
1454 
1455 static int
xmlRegStatePush(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr state)1456 xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
1457     if (state == NULL) return(-1);
1458     if (ctxt->maxStates == 0) {
1459 	ctxt->maxStates = 4;
1460 	ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates *
1461 		                             sizeof(xmlRegStatePtr));
1462 	if (ctxt->states == NULL) {
1463 	    xmlRegexpErrMemory(ctxt, "adding state");
1464 	    ctxt->maxStates = 0;
1465 	    return(-1);
1466 	}
1467     } else if (ctxt->nbStates >= ctxt->maxStates) {
1468 	xmlRegStatePtr *tmp;
1469 	ctxt->maxStates *= 2;
1470 	tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates *
1471 		                             sizeof(xmlRegStatePtr));
1472 	if (tmp == NULL) {
1473 	    xmlRegexpErrMemory(ctxt, "adding state");
1474 	    ctxt->maxStates /= 2;
1475 	    return(-1);
1476 	}
1477 	ctxt->states = tmp;
1478     }
1479     state->no = ctxt->nbStates;
1480     ctxt->states[ctxt->nbStates++] = state;
1481     return(0);
1482 }
1483 
1484 /**
1485  * xmlFAGenerateAllTransition:
1486  * @ctxt:  a regexp parser context
1487  * @from:  the from state
1488  * @to:  the target state or NULL for building a new one
1489  * @lax:
1490  *
1491  */
1492 static void
xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to,int lax)1493 xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,
1494 			   xmlRegStatePtr from, xmlRegStatePtr to,
1495 			   int lax) {
1496     if (to == NULL) {
1497 	to = xmlRegNewState(ctxt);
1498 	xmlRegStatePush(ctxt, to);
1499 	ctxt->state = to;
1500     }
1501     if (lax)
1502 	xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER);
1503     else
1504 	xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER);
1505 }
1506 
1507 /**
1508  * xmlFAGenerateEpsilonTransition:
1509  * @ctxt:  a regexp parser context
1510  * @from:  the from state
1511  * @to:  the target state or NULL for building a new one
1512  *
1513  */
1514 static void
xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to)1515 xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1516 			       xmlRegStatePtr from, xmlRegStatePtr to) {
1517     if (to == NULL) {
1518 	to = xmlRegNewState(ctxt);
1519 	xmlRegStatePush(ctxt, to);
1520 	ctxt->state = to;
1521     }
1522     xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1);
1523 }
1524 
1525 /**
1526  * xmlFAGenerateCountedEpsilonTransition:
1527  * @ctxt:  a regexp parser context
1528  * @from:  the from state
1529  * @to:  the target state or NULL for building a new one
1530  * counter:  the counter for that transition
1531  *
1532  */
1533 static void
xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to,int counter)1534 xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1535 	    xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1536     if (to == NULL) {
1537 	to = xmlRegNewState(ctxt);
1538 	xmlRegStatePush(ctxt, to);
1539 	ctxt->state = to;
1540     }
1541     xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1);
1542 }
1543 
1544 /**
1545  * xmlFAGenerateCountedTransition:
1546  * @ctxt:  a regexp parser context
1547  * @from:  the from state
1548  * @to:  the target state or NULL for building a new one
1549  * counter:  the counter for that transition
1550  *
1551  */
1552 static void
xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to,int counter)1553 xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,
1554 	    xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1555     if (to == NULL) {
1556 	to = xmlRegNewState(ctxt);
1557 	xmlRegStatePush(ctxt, to);
1558 	ctxt->state = to;
1559     }
1560     xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter);
1561 }
1562 
1563 /**
1564  * xmlFAGenerateTransitions:
1565  * @ctxt:  a regexp parser context
1566  * @from:  the from state
1567  * @to:  the target state or NULL for building a new one
1568  * @atom:  the atom generating the transition
1569  *
1570  * Returns 0 if success and -1 in case of error.
1571  */
1572 static int
xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to,xmlRegAtomPtr atom)1573 xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
1574 	                 xmlRegStatePtr to, xmlRegAtomPtr atom) {
1575     xmlRegStatePtr end;
1576     int nullable = 0;
1577 
1578     if (atom == NULL) {
1579 	ERROR("genrate transition: atom == NULL");
1580 	return(-1);
1581     }
1582     if (atom->type == XML_REGEXP_SUBREG) {
1583 	/*
1584 	 * this is a subexpression handling one should not need to
1585 	 * create a new node except for XML_REGEXP_QUANT_RANGE.
1586 	 */
1587 	if (xmlRegAtomPush(ctxt, atom) < 0) {
1588 	    return(-1);
1589 	}
1590 	if ((to != NULL) && (atom->stop != to) &&
1591 	    (atom->quant != XML_REGEXP_QUANT_RANGE)) {
1592 	    /*
1593 	     * Generate an epsilon transition to link to the target
1594 	     */
1595 	    xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1596 #ifdef DV
1597 	} else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) &&
1598 		   (atom->quant != XML_REGEXP_QUANT_ONCE)) {
1599 	    to = xmlRegNewState(ctxt);
1600 	    xmlRegStatePush(ctxt, to);
1601 	    ctxt->state = to;
1602 	    xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1603 #endif
1604 	}
1605 	switch (atom->quant) {
1606 	    case XML_REGEXP_QUANT_OPT:
1607 		atom->quant = XML_REGEXP_QUANT_ONCE;
1608 		/*
1609 		 * transition done to the state after end of atom.
1610 		 *      1. set transition from atom start to new state
1611 		 *      2. set transition from atom end to this state.
1612 		 */
1613                 if (to == NULL) {
1614                     xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0);
1615                     xmlFAGenerateEpsilonTransition(ctxt, atom->stop,
1616                                                    ctxt->state);
1617                 } else {
1618                     xmlFAGenerateEpsilonTransition(ctxt, atom->start, to);
1619                 }
1620 		break;
1621 	    case XML_REGEXP_QUANT_MULT:
1622 		atom->quant = XML_REGEXP_QUANT_ONCE;
1623 		xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1624 		xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1625 		break;
1626 	    case XML_REGEXP_QUANT_PLUS:
1627 		atom->quant = XML_REGEXP_QUANT_ONCE;
1628 		xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1629 		break;
1630 	    case XML_REGEXP_QUANT_RANGE: {
1631 		int counter;
1632 		xmlRegStatePtr inter, newstate;
1633 
1634 		/*
1635 		 * create the final state now if needed
1636 		 */
1637 		if (to != NULL) {
1638 		    newstate = to;
1639 		} else {
1640 		    newstate = xmlRegNewState(ctxt);
1641 		    xmlRegStatePush(ctxt, newstate);
1642 		}
1643 
1644 		/*
1645 		 * The principle here is to use counted transition
1646 		 * to avoid explosion in the number of states in the
1647 		 * graph. This is clearly more complex but should not
1648 		 * be exploitable at runtime.
1649 		 */
1650 		if ((atom->min == 0) && (atom->start0 == NULL)) {
1651 		    xmlRegAtomPtr copy;
1652 		    /*
1653 		     * duplicate a transition based on atom to count next
1654 		     * occurences after 1. We cannot loop to atom->start
1655 		     * directly because we need an epsilon transition to
1656 		     * newstate.
1657 		     */
1658 		     /* ???? For some reason it seems we never reach that
1659 		        case, I suppose this got optimized out before when
1660 			building the automata */
1661 		    copy = xmlRegCopyAtom(ctxt, atom);
1662 		    if (copy == NULL)
1663 		        return(-1);
1664 		    copy->quant = XML_REGEXP_QUANT_ONCE;
1665 		    copy->min = 0;
1666 		    copy->max = 0;
1667 
1668 		    if (xmlFAGenerateTransitions(ctxt, atom->start, NULL, copy)
1669 		        < 0)
1670 			return(-1);
1671 		    inter = ctxt->state;
1672 		    counter = xmlRegGetCounter(ctxt);
1673 		    ctxt->counters[counter].min = atom->min - 1;
1674 		    ctxt->counters[counter].max = atom->max - 1;
1675 		    /* count the number of times we see it again */
1676 		    xmlFAGenerateCountedEpsilonTransition(ctxt, inter,
1677 						   atom->stop, counter);
1678 		    /* allow a way out based on the count */
1679 		    xmlFAGenerateCountedTransition(ctxt, inter,
1680 			                           newstate, counter);
1681 		    /* and also allow a direct exit for 0 */
1682 		    xmlFAGenerateEpsilonTransition(ctxt, atom->start,
1683 		                                   newstate);
1684 		} else {
1685 		    /*
1686 		     * either we need the atom at least once or there
1687 		     * is an atom->start0 allowing to easilly plug the
1688 		     * epsilon transition.
1689 		     */
1690 		    counter = xmlRegGetCounter(ctxt);
1691 		    ctxt->counters[counter].min = atom->min - 1;
1692 		    ctxt->counters[counter].max = atom->max - 1;
1693 		    /* count the number of times we see it again */
1694 		    xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop,
1695 						   atom->start, counter);
1696 		    /* allow a way out based on the count */
1697 		    xmlFAGenerateCountedTransition(ctxt, atom->stop,
1698 			                           newstate, counter);
1699 		    /* and if needed allow a direct exit for 0 */
1700 		    if (atom->min == 0)
1701 			xmlFAGenerateEpsilonTransition(ctxt, atom->start0,
1702 						       newstate);
1703 
1704 		}
1705 		atom->min = 0;
1706 		atom->max = 0;
1707 		atom->quant = XML_REGEXP_QUANT_ONCE;
1708 		ctxt->state = newstate;
1709 	    }
1710 	    default:
1711 		break;
1712 	}
1713 	return(0);
1714     }
1715     if ((atom->min == 0) && (atom->max == 0) &&
1716                (atom->quant == XML_REGEXP_QUANT_RANGE)) {
1717         /*
1718 	 * we can discard the atom and generate an epsilon transition instead
1719 	 */
1720 	if (to == NULL) {
1721 	    to = xmlRegNewState(ctxt);
1722 	    if (to != NULL)
1723 		xmlRegStatePush(ctxt, to);
1724 	    else {
1725 		return(-1);
1726 	    }
1727 	}
1728 	xmlFAGenerateEpsilonTransition(ctxt, from, to);
1729 	ctxt->state = to;
1730 	xmlRegFreeAtom(atom);
1731 	return(0);
1732     }
1733     if (to == NULL) {
1734 	to = xmlRegNewState(ctxt);
1735 	if (to != NULL)
1736 	    xmlRegStatePush(ctxt, to);
1737 	else {
1738 	    return(-1);
1739 	}
1740     }
1741     end = to;
1742     if ((atom->quant == XML_REGEXP_QUANT_MULT) ||
1743         (atom->quant == XML_REGEXP_QUANT_PLUS)) {
1744 	/*
1745 	 * Do not pollute the target state by adding transitions from
1746 	 * it as it is likely to be the shared target of multiple branches.
1747 	 * So isolate with an epsilon transition.
1748 	 */
1749         xmlRegStatePtr tmp;
1750 
1751 	tmp = xmlRegNewState(ctxt);
1752 	if (tmp != NULL)
1753 	    xmlRegStatePush(ctxt, tmp);
1754 	else {
1755 	    return(-1);
1756 	}
1757 	xmlFAGenerateEpsilonTransition(ctxt, tmp, to);
1758 	to = tmp;
1759     }
1760     if (xmlRegAtomPush(ctxt, atom) < 0) {
1761 	return(-1);
1762     }
1763     if ((atom->quant == XML_REGEXP_QUANT_RANGE) &&
1764         (atom->min == 0) && (atom->max > 0)) {
1765 	nullable = 1;
1766 	atom->min = 1;
1767         if (atom->max == 1)
1768 	    atom->quant = XML_REGEXP_QUANT_OPT;
1769     }
1770     xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1);
1771     ctxt->state = end;
1772     switch (atom->quant) {
1773 	case XML_REGEXP_QUANT_OPT:
1774 	    atom->quant = XML_REGEXP_QUANT_ONCE;
1775 	    xmlFAGenerateEpsilonTransition(ctxt, from, to);
1776 	    break;
1777 	case XML_REGEXP_QUANT_MULT:
1778 	    atom->quant = XML_REGEXP_QUANT_ONCE;
1779 	    xmlFAGenerateEpsilonTransition(ctxt, from, to);
1780 	    xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1781 	    break;
1782 	case XML_REGEXP_QUANT_PLUS:
1783 	    atom->quant = XML_REGEXP_QUANT_ONCE;
1784 	    xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1785 	    break;
1786 	case XML_REGEXP_QUANT_RANGE:
1787 	    if (nullable)
1788 		xmlFAGenerateEpsilonTransition(ctxt, from, to);
1789 	    break;
1790 	default:
1791 	    break;
1792     }
1793     return(0);
1794 }
1795 
1796 /**
1797  * xmlFAReduceEpsilonTransitions:
1798  * @ctxt:  a regexp parser context
1799  * @fromnr:  the from state
1800  * @tonr:  the to state
1801  * @counter:  should that transition be associated to a counted
1802  *
1803  */
1804 static void
xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt,int fromnr,int tonr,int counter)1805 xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr,
1806 	                      int tonr, int counter) {
1807     int transnr;
1808     xmlRegStatePtr from;
1809     xmlRegStatePtr to;
1810 
1811 #ifdef DEBUG_REGEXP_GRAPH
1812     printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr);
1813 #endif
1814     from = ctxt->states[fromnr];
1815     if (from == NULL)
1816 	return;
1817     to = ctxt->states[tonr];
1818     if (to == NULL)
1819 	return;
1820     if ((to->mark == XML_REGEXP_MARK_START) ||
1821 	(to->mark == XML_REGEXP_MARK_VISITED))
1822 	return;
1823 
1824     to->mark = XML_REGEXP_MARK_VISITED;
1825     if (to->type == XML_REGEXP_FINAL_STATE) {
1826 #ifdef DEBUG_REGEXP_GRAPH
1827 	printf("State %d is final, so %d becomes final\n", tonr, fromnr);
1828 #endif
1829 	from->type = XML_REGEXP_FINAL_STATE;
1830     }
1831     for (transnr = 0;transnr < to->nbTrans;transnr++) {
1832         if (to->trans[transnr].to < 0)
1833 	    continue;
1834 	if (to->trans[transnr].atom == NULL) {
1835 	    /*
1836 	     * Don't remove counted transitions
1837 	     * Don't loop either
1838 	     */
1839 	    if (to->trans[transnr].to != fromnr) {
1840 		if (to->trans[transnr].count >= 0) {
1841 		    int newto = to->trans[transnr].to;
1842 
1843 		    xmlRegStateAddTrans(ctxt, from, NULL,
1844 					ctxt->states[newto],
1845 					-1, to->trans[transnr].count);
1846 		} else {
1847 #ifdef DEBUG_REGEXP_GRAPH
1848 		    printf("Found epsilon trans %d from %d to %d\n",
1849 			   transnr, tonr, to->trans[transnr].to);
1850 #endif
1851 		    if (to->trans[transnr].counter >= 0) {
1852 			xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1853 					      to->trans[transnr].to,
1854 					      to->trans[transnr].counter);
1855 		    } else {
1856 			xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1857 					      to->trans[transnr].to,
1858 					      counter);
1859 		    }
1860 		}
1861 	    }
1862 	} else {
1863 	    int newto = to->trans[transnr].to;
1864 
1865 	    if (to->trans[transnr].counter >= 0) {
1866 		xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1867 				    ctxt->states[newto],
1868 				    to->trans[transnr].counter, -1);
1869 	    } else {
1870 		xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1871 				    ctxt->states[newto], counter, -1);
1872 	    }
1873 	}
1874     }
1875     to->mark = XML_REGEXP_MARK_NORMAL;
1876 }
1877 
1878 /**
1879  * xmlFAEliminateSimpleEpsilonTransitions:
1880  * @ctxt:  a regexp parser context
1881  *
1882  * Eliminating general epsilon transitions can get costly in the general
1883  * algorithm due to the large amount of generated new transitions and
1884  * associated comparisons. However for simple epsilon transition used just
1885  * to separate building blocks when generating the automata this can be
1886  * reduced to state elimination:
1887  *    - if there exists an epsilon from X to Y
1888  *    - if there is no other transition from X
1889  * then X and Y are semantically equivalent and X can be eliminated
1890  * If X is the start state then make Y the start state, else replace the
1891  * target of all transitions to X by transitions to Y.
1892  */
1893 static void
xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt)1894 xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1895     int statenr, i, j, newto;
1896     xmlRegStatePtr state, tmp;
1897 
1898     for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1899 	state = ctxt->states[statenr];
1900 	if (state == NULL)
1901 	    continue;
1902 	if (state->nbTrans != 1)
1903 	    continue;
1904 	if (state->type == XML_REGEXP_UNREACH_STATE)
1905 	    continue;
1906 	/* is the only transition out a basic transition */
1907 	if ((state->trans[0].atom == NULL) &&
1908 	    (state->trans[0].to >= 0) &&
1909 	    (state->trans[0].to != statenr) &&
1910 	    (state->trans[0].counter < 0) &&
1911 	    (state->trans[0].count < 0)) {
1912 	    newto = state->trans[0].to;
1913 
1914             if (state->type == XML_REGEXP_START_STATE) {
1915 #ifdef DEBUG_REGEXP_GRAPH
1916 		printf("Found simple epsilon trans from start %d to %d\n",
1917 		       statenr, newto);
1918 #endif
1919             } else {
1920 #ifdef DEBUG_REGEXP_GRAPH
1921 		printf("Found simple epsilon trans from %d to %d\n",
1922 		       statenr, newto);
1923 #endif
1924 	        for (i = 0;i < state->nbTransTo;i++) {
1925 		    tmp = ctxt->states[state->transTo[i]];
1926 		    for (j = 0;j < tmp->nbTrans;j++) {
1927 			if (tmp->trans[j].to == statenr) {
1928 #ifdef DEBUG_REGEXP_GRAPH
1929 			    printf("Changed transition %d on %d to go to %d\n",
1930 				   j, tmp->no, newto);
1931 #endif
1932 			    tmp->trans[j].to = -1;
1933 			    xmlRegStateAddTrans(ctxt, tmp, tmp->trans[j].atom,
1934 						ctxt->states[newto],
1935 					        tmp->trans[j].counter,
1936 						tmp->trans[j].count);
1937 			}
1938 		    }
1939 		}
1940 		if (state->type == XML_REGEXP_FINAL_STATE)
1941 		    ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE;
1942 		/* eliminate the transition completely */
1943 		state->nbTrans = 0;
1944 
1945                 state->type = XML_REGEXP_UNREACH_STATE;
1946 
1947 	    }
1948 
1949 	}
1950     }
1951 }
/**
 * xmlFAEliminateEpsilonTransitions:
 * @ctxt:  a regexp parser context
 *
 * Removes the epsilon transitions from the automata held by @ctxt and
 * releases the states which cannot be reached anymore. The work is done
 * in successive passes over ctxt->states:
 *    - simple epsilon elimination, then release of the states it flagged
 *      as XML_REGEXP_UNREACH_STATE
 *    - for each remaining uncounted epsilon, addition of the bypassing
 *      transitions through xmlFAReduceEpsilonTransitions()
 *    - cancellation (target set to -1) of the epsilon transitions
 *    - reachability marking starting from state 0 and release of the
 *      states never reached
 * Does nothing if the context has no state array.
 */
static void
xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
    int statenr, transnr;
    xmlRegStatePtr state;
    int has_epsilon;

    if (ctxt->states == NULL) return;

    /*
     * Eliminate simple epsilon transition and the associated unreachable
     * states.
     */
    xmlFAEliminateSimpleEpsilonTransitions(ctxt);
    for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
	state = ctxt->states[statenr];
	if ((state != NULL) && (state->type == XML_REGEXP_UNREACH_STATE)) {
#ifdef DEBUG_REGEXP_GRAPH
	    printf("Removed unreachable state %d\n", statenr);
#endif
	    xmlRegFreeState(state);
	    ctxt->states[statenr] = NULL;
	}
    }

    /* set as soon as one uncounted epsilon transition has been expanded */
    has_epsilon = 0;

    /*
     * Build the completed transitions bypassing the epsilons
     * Use a marking algorithm to avoid loops
     * Mark sink states too.
     * Process from the latest states backward to the start: when
     * there are long cascading epsilon chains this minimizes the
     * recursions and transition compares when adding the new ones
     */
    for (statenr = ctxt->nbStates - 1;statenr >= 0;statenr--) {
	state = ctxt->states[statenr];
	if (state == NULL)
	    continue;
	/* a non final state without any way out is a sink */
	if ((state->nbTrans == 0) &&
	    (state->type != XML_REGEXP_FINAL_STATE)) {
	    state->type = XML_REGEXP_SINK_STATE;
	}
	for (transnr = 0;transnr < state->nbTrans;transnr++) {
	    if ((state->trans[transnr].atom == NULL) &&
		(state->trans[transnr].to >= 0)) {
		if (state->trans[transnr].to == statenr) {
		    /* an epsilon looping on its own state is just cancelled */
		    state->trans[transnr].to = -1;
#ifdef DEBUG_REGEXP_GRAPH
		    printf("Removed loopback epsilon trans %d on %d\n",
			   transnr, statenr);
#endif
		} else if (state->trans[transnr].count < 0) {
		    int newto = state->trans[transnr].to;

#ifdef DEBUG_REGEXP_GRAPH
		    printf("Found epsilon trans %d from %d to %d\n",
			   transnr, statenr, newto);
#endif
		    has_epsilon = 1;
		    /*
		     * the target is set to -2 before the expansion and the
		     * state is marked as the start of the walk for the
		     * duration of the call
		     */
		    state->trans[transnr].to = -2;
		    state->mark = XML_REGEXP_MARK_START;
		    xmlFAReduceEpsilonTransitions(ctxt, statenr,
				      newto, state->trans[transnr].counter);
		    state->mark = XML_REGEXP_MARK_NORMAL;
#ifdef DEBUG_REGEXP_GRAPH
		} else {
		    printf("Found counted transition %d on %d\n",
			   transnr, statenr);
#endif
	        }
	    }
	}
    }
    /*
     * Eliminate the epsilon transitions: any uncounted atom-less
     * transition still pointing to a state gets cancelled
     */
    if (has_epsilon) {
	for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
	    state = ctxt->states[statenr];
	    if (state == NULL)
		continue;
	    for (transnr = 0;transnr < state->nbTrans;transnr++) {
		xmlRegTransPtr trans = &(state->trans[transnr]);
		if ((trans->atom == NULL) &&
		    (trans->count < 0) &&
		    (trans->to >= 0)) {
		    trans->to = -1;
		}
	    }
	}
    }

    /*
     * Use this pass to detect unreachable states too:
     * reset all the reached flags, then walk from state 0
     */
    for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
	state = ctxt->states[statenr];
	if (state != NULL)
	    state->reached = XML_REGEXP_MARK_NORMAL;
    }
    state = ctxt->states[0];
    if (state != NULL)
	state->reached = XML_REGEXP_MARK_START;
    while (state != NULL) {
	xmlRegStatePtr target = NULL;
	state->reached = XML_REGEXP_MARK_VISITED;
	/*
	 * Mark all states reachable from the current reachable state,
	 * only transitions with an atom or a count are followed
	 */
	for (transnr = 0;transnr < state->nbTrans;transnr++) {
	    if ((state->trans[transnr].to >= 0) &&
		((state->trans[transnr].atom != NULL) ||
		 (state->trans[transnr].count >= 0))) {
		int newto = state->trans[transnr].to;

		if (ctxt->states[newto] == NULL)
		    continue;
		if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) {
		    ctxt->states[newto]->reached = XML_REGEXP_MARK_START;
		    target = ctxt->states[newto];
		}
	    }
	}

	/*
	 * find the next accessible state not explored, i.e. one still
	 * flagged XML_REGEXP_MARK_START; the walk stops when none is left
	 */
	if (target == NULL) {
	    for (statenr = 1;statenr < ctxt->nbStates;statenr++) {
		state = ctxt->states[statenr];
		if ((state != NULL) && (state->reached ==
			XML_REGEXP_MARK_START)) {
		    target = state;
		    break;
		}
	    }
	}
	state = target;
    }
    /* release every state the walk never reached */
    for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
	state = ctxt->states[statenr];
	if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) {
#ifdef DEBUG_REGEXP_GRAPH
	    printf("Removed unreachable state %d\n", statenr);
#endif
	    xmlRegFreeState(state);
	    ctxt->states[statenr] = NULL;
	}
    }

}
2108 
2109 static int
xmlFACompareRanges(xmlRegRangePtr range1,xmlRegRangePtr range2)2110 xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) {
2111     int ret = 0;
2112 
2113     if ((range1->type == XML_REGEXP_RANGES) ||
2114         (range2->type == XML_REGEXP_RANGES) ||
2115         (range2->type == XML_REGEXP_SUBREG) ||
2116         (range1->type == XML_REGEXP_SUBREG) ||
2117         (range1->type == XML_REGEXP_STRING) ||
2118         (range2->type == XML_REGEXP_STRING))
2119 	return(-1);
2120 
2121     /* put them in order */
2122     if (range1->type > range2->type) {
2123         xmlRegRangePtr tmp;
2124 
2125 	tmp = range1;
2126 	range1 = range2;
2127 	range2 = tmp;
2128     }
2129     if ((range1->type == XML_REGEXP_ANYCHAR) ||
2130         (range2->type == XML_REGEXP_ANYCHAR)) {
2131 	ret = 1;
2132     } else if ((range1->type == XML_REGEXP_EPSILON) ||
2133                (range2->type == XML_REGEXP_EPSILON)) {
2134 	return(0);
2135     } else if (range1->type == range2->type) {
2136         if (range1->type != XML_REGEXP_CHARVAL)
2137             ret = 1;
2138         else if ((range1->end < range2->start) ||
2139 	         (range2->end < range1->start))
2140 	    ret = 0;
2141 	else
2142 	    ret = 1;
2143     } else if (range1->type == XML_REGEXP_CHARVAL) {
2144         int codepoint;
2145 	int neg = 0;
2146 
2147 	/*
2148 	 * just check all codepoints in the range for acceptance,
2149 	 * this is usually way cheaper since done only once at
2150 	 * compilation than testing over and over at runtime or
2151 	 * pushing too many states when evaluating.
2152 	 */
2153 	if (((range1->neg == 0) && (range2->neg != 0)) ||
2154 	    ((range1->neg != 0) && (range2->neg == 0)))
2155 	    neg = 1;
2156 
2157 	for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) {
2158 	    ret = xmlRegCheckCharacterRange(range2->type, codepoint,
2159 					    0, range2->start, range2->end,
2160 					    range2->blockName);
2161 	    if (ret < 0)
2162 	        return(-1);
2163 	    if (((neg == 1) && (ret == 0)) ||
2164 	        ((neg == 0) && (ret == 1)))
2165 		return(1);
2166 	}
2167 	return(0);
2168     } else if ((range1->type == XML_REGEXP_BLOCK_NAME) ||
2169                (range2->type == XML_REGEXP_BLOCK_NAME)) {
2170 	if (range1->type == range2->type) {
2171 	    ret = xmlStrEqual(range1->blockName, range2->blockName);
2172 	} else {
2173 	    /*
2174 	     * comparing a block range with anything else is way
2175 	     * too costly, and maintining the table is like too much
2176 	     * memory too, so let's force the automata to save state
2177 	     * here.
2178 	     */
2179 	    return(1);
2180 	}
2181     } else if ((range1->type < XML_REGEXP_LETTER) ||
2182                (range2->type < XML_REGEXP_LETTER)) {
2183 	if ((range1->type == XML_REGEXP_ANYSPACE) &&
2184 	    (range2->type == XML_REGEXP_NOTSPACE))
2185 	    ret = 0;
2186 	else if ((range1->type == XML_REGEXP_INITNAME) &&
2187 	         (range2->type == XML_REGEXP_NOTINITNAME))
2188 	    ret = 0;
2189 	else if ((range1->type == XML_REGEXP_NAMECHAR) &&
2190 	         (range2->type == XML_REGEXP_NOTNAMECHAR))
2191 	    ret = 0;
2192 	else if ((range1->type == XML_REGEXP_DECIMAL) &&
2193 	         (range2->type == XML_REGEXP_NOTDECIMAL))
2194 	    ret = 0;
2195 	else if ((range1->type == XML_REGEXP_REALCHAR) &&
2196 	         (range2->type == XML_REGEXP_NOTREALCHAR))
2197 	    ret = 0;
2198 	else {
2199 	    /* same thing to limit complexity */
2200 	    return(1);
2201 	}
2202     } else {
2203         ret = 0;
2204         /* range1->type < range2->type here */
2205         switch (range1->type) {
2206 	    case XML_REGEXP_LETTER:
2207 	         /* all disjoint except in the subgroups */
2208 	         if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) ||
2209 		     (range2->type == XML_REGEXP_LETTER_LOWERCASE) ||
2210 		     (range2->type == XML_REGEXP_LETTER_TITLECASE) ||
2211 		     (range2->type == XML_REGEXP_LETTER_MODIFIER) ||
2212 		     (range2->type == XML_REGEXP_LETTER_OTHERS))
2213 		     ret = 1;
2214 		 break;
2215 	    case XML_REGEXP_MARK:
2216 	         if ((range2->type == XML_REGEXP_MARK_NONSPACING) ||
2217 		     (range2->type == XML_REGEXP_MARK_SPACECOMBINING) ||
2218 		     (range2->type == XML_REGEXP_MARK_ENCLOSING))
2219 		     ret = 1;
2220 		 break;
2221 	    case XML_REGEXP_NUMBER:
2222 	         if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) ||
2223 		     (range2->type == XML_REGEXP_NUMBER_LETTER) ||
2224 		     (range2->type == XML_REGEXP_NUMBER_OTHERS))
2225 		     ret = 1;
2226 		 break;
2227 	    case XML_REGEXP_PUNCT:
2228 	         if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) ||
2229 		     (range2->type == XML_REGEXP_PUNCT_DASH) ||
2230 		     (range2->type == XML_REGEXP_PUNCT_OPEN) ||
2231 		     (range2->type == XML_REGEXP_PUNCT_CLOSE) ||
2232 		     (range2->type == XML_REGEXP_PUNCT_INITQUOTE) ||
2233 		     (range2->type == XML_REGEXP_PUNCT_FINQUOTE) ||
2234 		     (range2->type == XML_REGEXP_PUNCT_OTHERS))
2235 		     ret = 1;
2236 		 break;
2237 	    case XML_REGEXP_SEPAR:
2238 	         if ((range2->type == XML_REGEXP_SEPAR_SPACE) ||
2239 		     (range2->type == XML_REGEXP_SEPAR_LINE) ||
2240 		     (range2->type == XML_REGEXP_SEPAR_PARA))
2241 		     ret = 1;
2242 		 break;
2243 	    case XML_REGEXP_SYMBOL:
2244 	         if ((range2->type == XML_REGEXP_SYMBOL_MATH) ||
2245 		     (range2->type == XML_REGEXP_SYMBOL_CURRENCY) ||
2246 		     (range2->type == XML_REGEXP_SYMBOL_MODIFIER) ||
2247 		     (range2->type == XML_REGEXP_SYMBOL_OTHERS))
2248 		     ret = 1;
2249 		 break;
2250 	    case XML_REGEXP_OTHER:
2251 	         if ((range2->type == XML_REGEXP_OTHER_CONTROL) ||
2252 		     (range2->type == XML_REGEXP_OTHER_FORMAT) ||
2253 		     (range2->type == XML_REGEXP_OTHER_PRIVATE))
2254 		     ret = 1;
2255 		 break;
2256             default:
2257 	         if ((range2->type >= XML_REGEXP_LETTER) &&
2258 		     (range2->type < XML_REGEXP_BLOCK_NAME))
2259 		     ret = 0;
2260 		 else {
2261 		     /* safety net ! */
2262 		     return(1);
2263 		 }
2264 	}
2265     }
2266     if (((range1->neg == 0) && (range2->neg != 0)) ||
2267         ((range1->neg != 0) && (range2->neg == 0)))
2268 	ret = !ret;
2269     return(ret);
2270 }
2271 
2272 /**
2273  * xmlFACompareAtomTypes:
2274  * @type1:  an atom type
2275  * @type2:  an atom type
2276  *
2277  * Compares two atoms type to check whether they intersect in some ways,
2278  * this is used by xmlFACompareAtoms only
2279  *
2280  * Returns 1 if they may intersect and 0 otherwise
2281  */
2282 static int
xmlFACompareAtomTypes(xmlRegAtomType type1,xmlRegAtomType type2)2283 xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) {
2284     if ((type1 == XML_REGEXP_EPSILON) ||
2285         (type1 == XML_REGEXP_CHARVAL) ||
2286 	(type1 == XML_REGEXP_RANGES) ||
2287 	(type1 == XML_REGEXP_SUBREG) ||
2288 	(type1 == XML_REGEXP_STRING) ||
2289 	(type1 == XML_REGEXP_ANYCHAR))
2290 	return(1);
2291     if ((type2 == XML_REGEXP_EPSILON) ||
2292         (type2 == XML_REGEXP_CHARVAL) ||
2293 	(type2 == XML_REGEXP_RANGES) ||
2294 	(type2 == XML_REGEXP_SUBREG) ||
2295 	(type2 == XML_REGEXP_STRING) ||
2296 	(type2 == XML_REGEXP_ANYCHAR))
2297 	return(1);
2298 
2299     if (type1 == type2) return(1);
2300 
2301     /* simplify subsequent compares by making sure type1 < type2 */
2302     if (type1 > type2) {
2303         xmlRegAtomType tmp = type1;
2304 	type1 = type2;
2305 	type2 = tmp;
2306     }
2307     switch (type1) {
2308         case XML_REGEXP_ANYSPACE: /* \s */
2309 	    /* can't be a letter, number, mark, pontuation, symbol */
2310 	    if ((type2 == XML_REGEXP_NOTSPACE) ||
2311 		((type2 >= XML_REGEXP_LETTER) &&
2312 		 (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2313 	        ((type2 >= XML_REGEXP_NUMBER) &&
2314 		 (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2315 	        ((type2 >= XML_REGEXP_MARK) &&
2316 		 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2317 	        ((type2 >= XML_REGEXP_PUNCT) &&
2318 		 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2319 	        ((type2 >= XML_REGEXP_SYMBOL) &&
2320 		 (type2 <= XML_REGEXP_SYMBOL_OTHERS))
2321 	        ) return(0);
2322 	    break;
2323         case XML_REGEXP_NOTSPACE: /* \S */
2324 	    break;
2325         case XML_REGEXP_INITNAME: /* \l */
2326 	    /* can't be a number, mark, separator, pontuation, symbol or other */
2327 	    if ((type2 == XML_REGEXP_NOTINITNAME) ||
2328 	        ((type2 >= XML_REGEXP_NUMBER) &&
2329 		 (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2330 	        ((type2 >= XML_REGEXP_MARK) &&
2331 		 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2332 	        ((type2 >= XML_REGEXP_SEPAR) &&
2333 		 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2334 	        ((type2 >= XML_REGEXP_PUNCT) &&
2335 		 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2336 	        ((type2 >= XML_REGEXP_SYMBOL) &&
2337 		 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2338 	        ((type2 >= XML_REGEXP_OTHER) &&
2339 		 (type2 <= XML_REGEXP_OTHER_NA))
2340 		) return(0);
2341 	    break;
2342         case XML_REGEXP_NOTINITNAME: /* \L */
2343 	    break;
2344         case XML_REGEXP_NAMECHAR: /* \c */
2345 	    /* can't be a mark, separator, pontuation, symbol or other */
2346 	    if ((type2 == XML_REGEXP_NOTNAMECHAR) ||
2347 	        ((type2 >= XML_REGEXP_MARK) &&
2348 		 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2349 	        ((type2 >= XML_REGEXP_PUNCT) &&
2350 		 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2351 	        ((type2 >= XML_REGEXP_SEPAR) &&
2352 		 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2353 	        ((type2 >= XML_REGEXP_SYMBOL) &&
2354 		 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2355 	        ((type2 >= XML_REGEXP_OTHER) &&
2356 		 (type2 <= XML_REGEXP_OTHER_NA))
2357 		) return(0);
2358 	    break;
2359         case XML_REGEXP_NOTNAMECHAR: /* \C */
2360 	    break;
2361         case XML_REGEXP_DECIMAL: /* \d */
2362 	    /* can't be a letter, mark, separator, pontuation, symbol or other */
2363 	    if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2364 	        (type2 == XML_REGEXP_REALCHAR) ||
2365 		((type2 >= XML_REGEXP_LETTER) &&
2366 		 (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2367 	        ((type2 >= XML_REGEXP_MARK) &&
2368 		 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2369 	        ((type2 >= XML_REGEXP_PUNCT) &&
2370 		 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2371 	        ((type2 >= XML_REGEXP_SEPAR) &&
2372 		 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2373 	        ((type2 >= XML_REGEXP_SYMBOL) &&
2374 		 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2375 	        ((type2 >= XML_REGEXP_OTHER) &&
2376 		 (type2 <= XML_REGEXP_OTHER_NA))
2377 		)return(0);
2378 	    break;
2379         case XML_REGEXP_NOTDECIMAL: /* \D */
2380 	    break;
2381         case XML_REGEXP_REALCHAR: /* \w */
2382 	    /* can't be a mark, separator, pontuation, symbol or other */
2383 	    if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2384 	        ((type2 >= XML_REGEXP_MARK) &&
2385 		 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2386 	        ((type2 >= XML_REGEXP_PUNCT) &&
2387 		 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2388 	        ((type2 >= XML_REGEXP_SEPAR) &&
2389 		 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2390 	        ((type2 >= XML_REGEXP_SYMBOL) &&
2391 		 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2392 	        ((type2 >= XML_REGEXP_OTHER) &&
2393 		 (type2 <= XML_REGEXP_OTHER_NA))
2394 		)return(0);
2395 	    break;
2396         case XML_REGEXP_NOTREALCHAR: /* \W */
2397 	    break;
2398 	/*
2399 	 * at that point we know both type 1 and type2 are from
2400 	 * character categories are ordered and are different,
2401 	 * it becomes simple because this is a partition
2402 	 */
2403         case XML_REGEXP_LETTER:
2404 	    if (type2 <= XML_REGEXP_LETTER_OTHERS)
2405 	        return(1);
2406 	    return(0);
2407         case XML_REGEXP_LETTER_UPPERCASE:
2408         case XML_REGEXP_LETTER_LOWERCASE:
2409         case XML_REGEXP_LETTER_TITLECASE:
2410         case XML_REGEXP_LETTER_MODIFIER:
2411         case XML_REGEXP_LETTER_OTHERS:
2412 	    return(0);
2413         case XML_REGEXP_MARK:
2414 	    if (type2 <= XML_REGEXP_MARK_ENCLOSING)
2415 	        return(1);
2416 	    return(0);
2417         case XML_REGEXP_MARK_NONSPACING:
2418         case XML_REGEXP_MARK_SPACECOMBINING:
2419         case XML_REGEXP_MARK_ENCLOSING:
2420 	    return(0);
2421         case XML_REGEXP_NUMBER:
2422 	    if (type2 <= XML_REGEXP_NUMBER_OTHERS)
2423 	        return(1);
2424 	    return(0);
2425         case XML_REGEXP_NUMBER_DECIMAL:
2426         case XML_REGEXP_NUMBER_LETTER:
2427         case XML_REGEXP_NUMBER_OTHERS:
2428 	    return(0);
2429         case XML_REGEXP_PUNCT:
2430 	    if (type2 <= XML_REGEXP_PUNCT_OTHERS)
2431 	        return(1);
2432 	    return(0);
2433         case XML_REGEXP_PUNCT_CONNECTOR:
2434         case XML_REGEXP_PUNCT_DASH:
2435         case XML_REGEXP_PUNCT_OPEN:
2436         case XML_REGEXP_PUNCT_CLOSE:
2437         case XML_REGEXP_PUNCT_INITQUOTE:
2438         case XML_REGEXP_PUNCT_FINQUOTE:
2439         case XML_REGEXP_PUNCT_OTHERS:
2440 	    return(0);
2441         case XML_REGEXP_SEPAR:
2442 	    if (type2 <= XML_REGEXP_SEPAR_PARA)
2443 	        return(1);
2444 	    return(0);
2445         case XML_REGEXP_SEPAR_SPACE:
2446         case XML_REGEXP_SEPAR_LINE:
2447         case XML_REGEXP_SEPAR_PARA:
2448 	    return(0);
2449         case XML_REGEXP_SYMBOL:
2450 	    if (type2 <= XML_REGEXP_SYMBOL_OTHERS)
2451 	        return(1);
2452 	    return(0);
2453         case XML_REGEXP_SYMBOL_MATH:
2454         case XML_REGEXP_SYMBOL_CURRENCY:
2455         case XML_REGEXP_SYMBOL_MODIFIER:
2456         case XML_REGEXP_SYMBOL_OTHERS:
2457 	    return(0);
2458         case XML_REGEXP_OTHER:
2459 	    if (type2 <= XML_REGEXP_OTHER_NA)
2460 	        return(1);
2461 	    return(0);
2462         case XML_REGEXP_OTHER_CONTROL:
2463         case XML_REGEXP_OTHER_FORMAT:
2464         case XML_REGEXP_OTHER_PRIVATE:
2465         case XML_REGEXP_OTHER_NA:
2466 	    return(0);
2467 	default:
2468 	    break;
2469     }
2470     return(1);
2471 }
2472 
2473 /**
2474  * xmlFAEqualAtoms:
2475  * @atom1:  an atom
2476  * @atom2:  an atom
2477  * @deep: if not set only compare string pointers
2478  *
2479  * Compares two atoms to check whether they are the same exactly
2480  * this is used to remove equivalent transitions
2481  *
2482  * Returns 1 if same and 0 otherwise
2483  */
2484 static int
xmlFAEqualAtoms(xmlRegAtomPtr atom1,xmlRegAtomPtr atom2,int deep)2485 xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
2486     int ret = 0;
2487 
2488     if (atom1 == atom2)
2489 	return(1);
2490     if ((atom1 == NULL) || (atom2 == NULL))
2491 	return(0);
2492 
2493     if (atom1->type != atom2->type)
2494         return(0);
2495     switch (atom1->type) {
2496         case XML_REGEXP_EPSILON:
2497 	    ret = 0;
2498 	    break;
2499         case XML_REGEXP_STRING:
2500             if (!deep)
2501                 ret = (atom1->valuep == atom2->valuep);
2502             else
2503                 ret = xmlStrEqual((xmlChar *)atom1->valuep,
2504                                   (xmlChar *)atom2->valuep);
2505 	    break;
2506         case XML_REGEXP_CHARVAL:
2507 	    ret = (atom1->codepoint == atom2->codepoint);
2508 	    break;
2509 	case XML_REGEXP_RANGES:
2510 	    /* too hard to do in the general case */
2511 	    ret = 0;
2512 	default:
2513 	    break;
2514     }
2515     return(ret);
2516 }
2517 
2518 /**
2519  * xmlFACompareAtoms:
2520  * @atom1:  an atom
2521  * @atom2:  an atom
2522  * @deep: if not set only compare string pointers
2523  *
2524  * Compares two atoms to check whether they intersect in some ways,
2525  * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only
2526  *
2527  * Returns 1 if yes and 0 otherwise
2528  */
2529 static int
xmlFACompareAtoms(xmlRegAtomPtr atom1,xmlRegAtomPtr atom2,int deep)2530 xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
2531     int ret = 1;
2532 
2533     if (atom1 == atom2)
2534 	return(1);
2535     if ((atom1 == NULL) || (atom2 == NULL))
2536 	return(0);
2537 
2538     if ((atom1->type == XML_REGEXP_ANYCHAR) ||
2539         (atom2->type == XML_REGEXP_ANYCHAR))
2540 	return(1);
2541 
2542     if (atom1->type > atom2->type) {
2543 	xmlRegAtomPtr tmp;
2544 	tmp = atom1;
2545 	atom1 = atom2;
2546 	atom2 = tmp;
2547     }
2548     if (atom1->type != atom2->type) {
2549         ret = xmlFACompareAtomTypes(atom1->type, atom2->type);
2550 	/* if they can't intersect at the type level break now */
2551 	if (ret == 0)
2552 	    return(0);
2553     }
2554     switch (atom1->type) {
2555         case XML_REGEXP_STRING:
2556             if (!deep)
2557                 ret = (atom1->valuep != atom2->valuep);
2558             else
2559                 ret = xmlRegStrEqualWildcard((xmlChar *)atom1->valuep,
2560                                              (xmlChar *)atom2->valuep);
2561 	    break;
2562         case XML_REGEXP_EPSILON:
2563 	    goto not_determinist;
2564         case XML_REGEXP_CHARVAL:
2565 	    if (atom2->type == XML_REGEXP_CHARVAL) {
2566 		ret = (atom1->codepoint == atom2->codepoint);
2567 	    } else {
2568 	        ret = xmlRegCheckCharacter(atom2, atom1->codepoint);
2569 		if (ret < 0)
2570 		    ret = 1;
2571 	    }
2572 	    break;
2573         case XML_REGEXP_RANGES:
2574 	    if (atom2->type == XML_REGEXP_RANGES) {
2575 	        int i, j, res;
2576 		xmlRegRangePtr r1, r2;
2577 
2578 		/*
2579 		 * need to check that none of the ranges eventually matches
2580 		 */
2581 		for (i = 0;i < atom1->nbRanges;i++) {
2582 		    for (j = 0;j < atom2->nbRanges;j++) {
2583 			r1 = atom1->ranges[i];
2584 			r2 = atom2->ranges[j];
2585 			res = xmlFACompareRanges(r1, r2);
2586 			if (res == 1) {
2587 			    ret = 1;
2588 			    goto done;
2589 			}
2590 		    }
2591 		}
2592 		ret = 0;
2593 	    }
2594 	    break;
2595 	default:
2596 	    goto not_determinist;
2597     }
2598 done:
2599     if (atom1->neg != atom2->neg) {
2600         ret = !ret;
2601     }
2602     if (ret == 0)
2603         return(0);
2604 not_determinist:
2605     return(1);
2606 }
2607 
2608 /**
2609  * xmlFARecurseDeterminism:
2610  * @ctxt:  a regexp parser context
2611  *
2612  * Check whether the associated regexp is determinist,
2613  * should be called after xmlFAEliminateEpsilonTransitions()
2614  *
2615  */
2616 static int
xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr state,int to,xmlRegAtomPtr atom)2617 xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
2618 	                 int to, xmlRegAtomPtr atom) {
2619     int ret = 1;
2620     int res;
2621     int transnr, nbTrans;
2622     xmlRegTransPtr t1;
2623     int deep = 1;
2624 
2625     if (state == NULL)
2626 	return(ret);
2627     if (state->markd == XML_REGEXP_MARK_VISITED)
2628 	return(ret);
2629 
2630     if (ctxt->flags & AM_AUTOMATA_RNG)
2631         deep = 0;
2632 
2633     /*
2634      * don't recurse on transitions potentially added in the course of
2635      * the elimination.
2636      */
2637     nbTrans = state->nbTrans;
2638     for (transnr = 0;transnr < nbTrans;transnr++) {
2639 	t1 = &(state->trans[transnr]);
2640 	/*
2641 	 * check transitions conflicting with the one looked at
2642 	 */
2643 	if (t1->atom == NULL) {
2644 	    if (t1->to < 0)
2645 		continue;
2646 	    state->markd = XML_REGEXP_MARK_VISITED;
2647 	    res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
2648 		                           to, atom);
2649 	    state->markd = 0;
2650 	    if (res == 0) {
2651 	        ret = 0;
2652 		/* t1->nd = 1; */
2653 	    }
2654 	    continue;
2655 	}
2656 	if (t1->to != to)
2657 	    continue;
2658 	if (xmlFACompareAtoms(t1->atom, atom, deep)) {
2659 	    ret = 0;
2660 	    /* mark the transition as non-deterministic */
2661 	    t1->nd = 1;
2662 	}
2663     }
2664     return(ret);
2665 }
2666 
2667 /**
2668  * xmlFAComputesDeterminism:
2669  * @ctxt:  a regexp parser context
2670  *
2671  * Check whether the associated regexp is determinist,
2672  * should be called after xmlFAEliminateEpsilonTransitions()
2673  *
2674  */
2675 static int
xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt)2676 xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
2677     int statenr, transnr;
2678     xmlRegStatePtr state;
2679     xmlRegTransPtr t1, t2, last;
2680     int i;
2681     int ret = 1;
2682     int deep = 1;
2683 
2684 #ifdef DEBUG_REGEXP_GRAPH
2685     printf("xmlFAComputesDeterminism\n");
2686     xmlRegPrintCtxt(stdout, ctxt);
2687 #endif
2688     if (ctxt->determinist != -1)
2689 	return(ctxt->determinist);
2690 
2691     if (ctxt->flags & AM_AUTOMATA_RNG)
2692         deep = 0;
2693 
2694     /*
2695      * First cleanup the automata removing cancelled transitions
2696      */
2697     for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2698 	state = ctxt->states[statenr];
2699 	if (state == NULL)
2700 	    continue;
2701 	if (state->nbTrans < 2)
2702 	    continue;
2703 	for (transnr = 0;transnr < state->nbTrans;transnr++) {
2704 	    t1 = &(state->trans[transnr]);
2705 	    /*
2706 	     * Determinism checks in case of counted or all transitions
2707 	     * will have to be handled separately
2708 	     */
2709 	    if (t1->atom == NULL) {
2710 		/* t1->nd = 1; */
2711 		continue;
2712 	    }
2713 	    if (t1->to == -1) /* eliminated */
2714 		continue;
2715 	    for (i = 0;i < transnr;i++) {
2716 		t2 = &(state->trans[i]);
2717 		if (t2->to == -1) /* eliminated */
2718 		    continue;
2719 		if (t2->atom != NULL) {
2720 		    if (t1->to == t2->to) {
2721                         /*
2722                          * Here we use deep because we want to keep the
2723                          * transitions which indicate a conflict
2724                          */
2725 			if (xmlFAEqualAtoms(t1->atom, t2->atom, deep) &&
2726                             (t1->counter == t2->counter) &&
2727                             (t1->count == t2->count))
2728 			    t2->to = -1; /* eliminated */
2729 		    }
2730 		}
2731 	    }
2732 	}
2733     }
2734 
2735     /*
2736      * Check for all states that there aren't 2 transitions
2737      * with the same atom and a different target.
2738      */
2739     for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2740 	state = ctxt->states[statenr];
2741 	if (state == NULL)
2742 	    continue;
2743 	if (state->nbTrans < 2)
2744 	    continue;
2745 	last = NULL;
2746 	for (transnr = 0;transnr < state->nbTrans;transnr++) {
2747 	    t1 = &(state->trans[transnr]);
2748 	    /*
2749 	     * Determinism checks in case of counted or all transitions
2750 	     * will have to be handled separately
2751 	     */
2752 	    if (t1->atom == NULL) {
2753 		continue;
2754 	    }
2755 	    if (t1->to == -1) /* eliminated */
2756 		continue;
2757 	    for (i = 0;i < transnr;i++) {
2758 		t2 = &(state->trans[i]);
2759 		if (t2->to == -1) /* eliminated */
2760 		    continue;
2761 		if (t2->atom != NULL) {
2762                     /*
2763                      * But here we don't use deep because we want to
2764                      * find transitions which indicate a conflict
2765                      */
2766 		    if (xmlFACompareAtoms(t1->atom, t2->atom, 1)) {
2767 			ret = 0;
2768 			/* mark the transitions as non-deterministic ones */
2769 			t1->nd = 1;
2770 			t2->nd = 1;
2771 			last = t1;
2772 		    }
2773 		} else if (t1->to != -1) {
2774 		    /*
2775 		     * do the closure in case of remaining specific
2776 		     * epsilon transitions like choices or all
2777 		     */
2778 		    ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
2779 						   t2->to, t2->atom);
2780 		    /* don't shortcut the computation so all non deterministic
2781 		       transition get marked down
2782 		    if (ret == 0)
2783 			return(0);
2784 		     */
2785 		    if (ret == 0) {
2786 			t1->nd = 1;
2787 			/* t2->nd = 1; */
2788 			last = t1;
2789 		    }
2790 		}
2791 	    }
2792 	    /* don't shortcut the computation so all non deterministic
2793 	       transition get marked down
2794 	    if (ret == 0)
2795 		break; */
2796 	}
2797 
2798 	/*
2799 	 * mark specifically the last non-deterministic transition
2800 	 * from a state since there is no need to set-up rollback
2801 	 * from it
2802 	 */
2803 	if (last != NULL) {
2804 	    last->nd = 2;
2805 	}
2806 
2807 	/* don't shortcut the computation so all non deterministic
2808 	   transition get marked down
2809 	if (ret == 0)
2810 	    break; */
2811     }
2812 
2813     ctxt->determinist = ret;
2814     return(ret);
2815 }
2816 
2817 /************************************************************************
2818  *									*
2819  *	Routines to check input against transition atoms		*
2820  *									*
2821  ************************************************************************/
2822 
2823 static int
xmlRegCheckCharacterRange(xmlRegAtomType type,int codepoint,int neg,int start,int end,const xmlChar * blockName)2824 xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg,
2825 	                  int start, int end, const xmlChar *blockName) {
2826     int ret = 0;
2827 
2828     switch (type) {
2829         case XML_REGEXP_STRING:
2830         case XML_REGEXP_SUBREG:
2831         case XML_REGEXP_RANGES:
2832         case XML_REGEXP_EPSILON:
2833 	    return(-1);
2834         case XML_REGEXP_ANYCHAR:
2835 	    ret = ((codepoint != '\n') && (codepoint != '\r'));
2836 	    break;
2837         case XML_REGEXP_CHARVAL:
2838 	    ret = ((codepoint >= start) && (codepoint <= end));
2839 	    break;
2840         case XML_REGEXP_NOTSPACE:
2841 	    neg = !neg;
2842             /* Falls through. */
2843         case XML_REGEXP_ANYSPACE:
2844 	    ret = ((codepoint == '\n') || (codepoint == '\r') ||
2845 		   (codepoint == '\t') || (codepoint == ' '));
2846 	    break;
2847         case XML_REGEXP_NOTINITNAME:
2848 	    neg = !neg;
2849             /* Falls through. */
2850         case XML_REGEXP_INITNAME:
2851 	    ret = (IS_LETTER(codepoint) ||
2852 		   (codepoint == '_') || (codepoint == ':'));
2853 	    break;
2854         case XML_REGEXP_NOTNAMECHAR:
2855 	    neg = !neg;
2856             /* Falls through. */
2857         case XML_REGEXP_NAMECHAR:
2858 	    ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) ||
2859 		   (codepoint == '.') || (codepoint == '-') ||
2860 		   (codepoint == '_') || (codepoint == ':') ||
2861 		   IS_COMBINING(codepoint) || IS_EXTENDER(codepoint));
2862 	    break;
2863         case XML_REGEXP_NOTDECIMAL:
2864 	    neg = !neg;
2865             /* Falls through. */
2866         case XML_REGEXP_DECIMAL:
2867 	    ret = xmlUCSIsCatNd(codepoint);
2868 	    break;
2869         case XML_REGEXP_REALCHAR:
2870 	    neg = !neg;
2871             /* Falls through. */
2872         case XML_REGEXP_NOTREALCHAR:
2873 	    ret = xmlUCSIsCatP(codepoint);
2874 	    if (ret == 0)
2875 		ret = xmlUCSIsCatZ(codepoint);
2876 	    if (ret == 0)
2877 		ret = xmlUCSIsCatC(codepoint);
2878 	    break;
2879         case XML_REGEXP_LETTER:
2880 	    ret = xmlUCSIsCatL(codepoint);
2881 	    break;
2882         case XML_REGEXP_LETTER_UPPERCASE:
2883 	    ret = xmlUCSIsCatLu(codepoint);
2884 	    break;
2885         case XML_REGEXP_LETTER_LOWERCASE:
2886 	    ret = xmlUCSIsCatLl(codepoint);
2887 	    break;
2888         case XML_REGEXP_LETTER_TITLECASE:
2889 	    ret = xmlUCSIsCatLt(codepoint);
2890 	    break;
2891         case XML_REGEXP_LETTER_MODIFIER:
2892 	    ret = xmlUCSIsCatLm(codepoint);
2893 	    break;
2894         case XML_REGEXP_LETTER_OTHERS:
2895 	    ret = xmlUCSIsCatLo(codepoint);
2896 	    break;
2897         case XML_REGEXP_MARK:
2898 	    ret = xmlUCSIsCatM(codepoint);
2899 	    break;
2900         case XML_REGEXP_MARK_NONSPACING:
2901 	    ret = xmlUCSIsCatMn(codepoint);
2902 	    break;
2903         case XML_REGEXP_MARK_SPACECOMBINING:
2904 	    ret = xmlUCSIsCatMc(codepoint);
2905 	    break;
2906         case XML_REGEXP_MARK_ENCLOSING:
2907 	    ret = xmlUCSIsCatMe(codepoint);
2908 	    break;
2909         case XML_REGEXP_NUMBER:
2910 	    ret = xmlUCSIsCatN(codepoint);
2911 	    break;
2912         case XML_REGEXP_NUMBER_DECIMAL:
2913 	    ret = xmlUCSIsCatNd(codepoint);
2914 	    break;
2915         case XML_REGEXP_NUMBER_LETTER:
2916 	    ret = xmlUCSIsCatNl(codepoint);
2917 	    break;
2918         case XML_REGEXP_NUMBER_OTHERS:
2919 	    ret = xmlUCSIsCatNo(codepoint);
2920 	    break;
2921         case XML_REGEXP_PUNCT:
2922 	    ret = xmlUCSIsCatP(codepoint);
2923 	    break;
2924         case XML_REGEXP_PUNCT_CONNECTOR:
2925 	    ret = xmlUCSIsCatPc(codepoint);
2926 	    break;
2927         case XML_REGEXP_PUNCT_DASH:
2928 	    ret = xmlUCSIsCatPd(codepoint);
2929 	    break;
2930         case XML_REGEXP_PUNCT_OPEN:
2931 	    ret = xmlUCSIsCatPs(codepoint);
2932 	    break;
2933         case XML_REGEXP_PUNCT_CLOSE:
2934 	    ret = xmlUCSIsCatPe(codepoint);
2935 	    break;
2936         case XML_REGEXP_PUNCT_INITQUOTE:
2937 	    ret = xmlUCSIsCatPi(codepoint);
2938 	    break;
2939         case XML_REGEXP_PUNCT_FINQUOTE:
2940 	    ret = xmlUCSIsCatPf(codepoint);
2941 	    break;
2942         case XML_REGEXP_PUNCT_OTHERS:
2943 	    ret = xmlUCSIsCatPo(codepoint);
2944 	    break;
2945         case XML_REGEXP_SEPAR:
2946 	    ret = xmlUCSIsCatZ(codepoint);
2947 	    break;
2948         case XML_REGEXP_SEPAR_SPACE:
2949 	    ret = xmlUCSIsCatZs(codepoint);
2950 	    break;
2951         case XML_REGEXP_SEPAR_LINE:
2952 	    ret = xmlUCSIsCatZl(codepoint);
2953 	    break;
2954         case XML_REGEXP_SEPAR_PARA:
2955 	    ret = xmlUCSIsCatZp(codepoint);
2956 	    break;
2957         case XML_REGEXP_SYMBOL:
2958 	    ret = xmlUCSIsCatS(codepoint);
2959 	    break;
2960         case XML_REGEXP_SYMBOL_MATH:
2961 	    ret = xmlUCSIsCatSm(codepoint);
2962 	    break;
2963         case XML_REGEXP_SYMBOL_CURRENCY:
2964 	    ret = xmlUCSIsCatSc(codepoint);
2965 	    break;
2966         case XML_REGEXP_SYMBOL_MODIFIER:
2967 	    ret = xmlUCSIsCatSk(codepoint);
2968 	    break;
2969         case XML_REGEXP_SYMBOL_OTHERS:
2970 	    ret = xmlUCSIsCatSo(codepoint);
2971 	    break;
2972         case XML_REGEXP_OTHER:
2973 	    ret = xmlUCSIsCatC(codepoint);
2974 	    break;
2975         case XML_REGEXP_OTHER_CONTROL:
2976 	    ret = xmlUCSIsCatCc(codepoint);
2977 	    break;
2978         case XML_REGEXP_OTHER_FORMAT:
2979 	    ret = xmlUCSIsCatCf(codepoint);
2980 	    break;
2981         case XML_REGEXP_OTHER_PRIVATE:
2982 	    ret = xmlUCSIsCatCo(codepoint);
2983 	    break;
2984         case XML_REGEXP_OTHER_NA:
2985 	    /* ret = xmlUCSIsCatCn(codepoint); */
2986 	    /* Seems it doesn't exist anymore in recent Unicode releases */
2987 	    ret = 0;
2988 	    break;
2989         case XML_REGEXP_BLOCK_NAME:
2990 	    ret = xmlUCSIsBlock(codepoint, (const char *) blockName);
2991 	    break;
2992     }
2993     if (neg)
2994 	return(!ret);
2995     return(ret);
2996 }
2997 
2998 static int
xmlRegCheckCharacter(xmlRegAtomPtr atom,int codepoint)2999 xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) {
3000     int i, ret = 0;
3001     xmlRegRangePtr range;
3002 
3003     if ((atom == NULL) || (!IS_CHAR(codepoint)))
3004 	return(-1);
3005 
3006     switch (atom->type) {
3007         case XML_REGEXP_SUBREG:
3008         case XML_REGEXP_EPSILON:
3009 	    return(-1);
3010         case XML_REGEXP_CHARVAL:
3011             return(codepoint == atom->codepoint);
3012         case XML_REGEXP_RANGES: {
3013 	    int accept = 0;
3014 
3015 	    for (i = 0;i < atom->nbRanges;i++) {
3016 		range = atom->ranges[i];
3017 		if (range->neg == 2) {
3018 		    ret = xmlRegCheckCharacterRange(range->type, codepoint,
3019 						0, range->start, range->end,
3020 						range->blockName);
3021 		    if (ret != 0)
3022 			return(0); /* excluded char */
3023 		} else if (range->neg) {
3024 		    ret = xmlRegCheckCharacterRange(range->type, codepoint,
3025 						0, range->start, range->end,
3026 						range->blockName);
3027 		    if (ret == 0)
3028 		        accept = 1;
3029 		    else
3030 		        return(0);
3031 		} else {
3032 		    ret = xmlRegCheckCharacterRange(range->type, codepoint,
3033 						0, range->start, range->end,
3034 						range->blockName);
3035 		    if (ret != 0)
3036 			accept = 1; /* might still be excluded */
3037 		}
3038 	    }
3039 	    return(accept);
3040 	}
3041         case XML_REGEXP_STRING:
3042 	    printf("TODO: XML_REGEXP_STRING\n");
3043 	    return(-1);
3044         case XML_REGEXP_ANYCHAR:
3045         case XML_REGEXP_ANYSPACE:
3046         case XML_REGEXP_NOTSPACE:
3047         case XML_REGEXP_INITNAME:
3048         case XML_REGEXP_NOTINITNAME:
3049         case XML_REGEXP_NAMECHAR:
3050         case XML_REGEXP_NOTNAMECHAR:
3051         case XML_REGEXP_DECIMAL:
3052         case XML_REGEXP_NOTDECIMAL:
3053         case XML_REGEXP_REALCHAR:
3054         case XML_REGEXP_NOTREALCHAR:
3055         case XML_REGEXP_LETTER:
3056         case XML_REGEXP_LETTER_UPPERCASE:
3057         case XML_REGEXP_LETTER_LOWERCASE:
3058         case XML_REGEXP_LETTER_TITLECASE:
3059         case XML_REGEXP_LETTER_MODIFIER:
3060         case XML_REGEXP_LETTER_OTHERS:
3061         case XML_REGEXP_MARK:
3062         case XML_REGEXP_MARK_NONSPACING:
3063         case XML_REGEXP_MARK_SPACECOMBINING:
3064         case XML_REGEXP_MARK_ENCLOSING:
3065         case XML_REGEXP_NUMBER:
3066         case XML_REGEXP_NUMBER_DECIMAL:
3067         case XML_REGEXP_NUMBER_LETTER:
3068         case XML_REGEXP_NUMBER_OTHERS:
3069         case XML_REGEXP_PUNCT:
3070         case XML_REGEXP_PUNCT_CONNECTOR:
3071         case XML_REGEXP_PUNCT_DASH:
3072         case XML_REGEXP_PUNCT_OPEN:
3073         case XML_REGEXP_PUNCT_CLOSE:
3074         case XML_REGEXP_PUNCT_INITQUOTE:
3075         case XML_REGEXP_PUNCT_FINQUOTE:
3076         case XML_REGEXP_PUNCT_OTHERS:
3077         case XML_REGEXP_SEPAR:
3078         case XML_REGEXP_SEPAR_SPACE:
3079         case XML_REGEXP_SEPAR_LINE:
3080         case XML_REGEXP_SEPAR_PARA:
3081         case XML_REGEXP_SYMBOL:
3082         case XML_REGEXP_SYMBOL_MATH:
3083         case XML_REGEXP_SYMBOL_CURRENCY:
3084         case XML_REGEXP_SYMBOL_MODIFIER:
3085         case XML_REGEXP_SYMBOL_OTHERS:
3086         case XML_REGEXP_OTHER:
3087         case XML_REGEXP_OTHER_CONTROL:
3088         case XML_REGEXP_OTHER_FORMAT:
3089         case XML_REGEXP_OTHER_PRIVATE:
3090         case XML_REGEXP_OTHER_NA:
3091 	case XML_REGEXP_BLOCK_NAME:
3092 	    ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0,
3093 		                            (const xmlChar *)atom->valuep);
3094 	    if (atom->neg)
3095 		ret = !ret;
3096 	    break;
3097     }
3098     return(ret);
3099 }
3100 
3101 /************************************************************************
3102  *									*
3103  *	Saving and restoring state of an execution context		*
3104  *									*
3105  ************************************************************************/
3106 
#ifdef DEBUG_REGEXP_EXEC
/**
 * xmlFARegDebugExec:
 * @exec:  the execution context to dump
 *
 * Debug helper printing the current state number, transition number and
 * input index, followed either by the values of up to three topmost
 * entries of the input stack or, when there is no input stack, by the
 * remaining part of the input string.
 */
static void
xmlFARegDebugExec(xmlRegExecCtxtPtr exec) {
    printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index);
    if (exec->inputStack == NULL) {
        printf(": %s", &(exec->inputString[exec->index]));
    } else {
        int shown = 0;
        int top = exec->inputStackNr - 1;

        printf(": ");
        /* walk down from the top of the stack, at most three entries */
        while ((shown < 3) && (shown < exec->inputStackNr)) {
            printf("%s ", (const char *)
                   exec->inputStack[top - shown].value);
            shown++;
        }
    }
    printf("\n");
}
#endif
3123 
3124 static void
xmlFARegExecSave(xmlRegExecCtxtPtr exec)3125 xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
3126 #ifdef DEBUG_REGEXP_EXEC
3127     printf("saving ");
3128     exec->transno++;
3129     xmlFARegDebugExec(exec);
3130     exec->transno--;
3131 #endif
3132 #ifdef MAX_PUSH
3133     if (exec->nbPush > MAX_PUSH) {
3134         return;
3135     }
3136     exec->nbPush++;
3137 #endif
3138 
3139     if (exec->maxRollbacks == 0) {
3140 	exec->maxRollbacks = 4;
3141 	exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks *
3142 		                             sizeof(xmlRegExecRollback));
3143 	if (exec->rollbacks == NULL) {
3144 	    xmlRegexpErrMemory(NULL, "saving regexp");
3145 	    exec->maxRollbacks = 0;
3146 	    return;
3147 	}
3148 	memset(exec->rollbacks, 0,
3149 	       exec->maxRollbacks * sizeof(xmlRegExecRollback));
3150     } else if (exec->nbRollbacks >= exec->maxRollbacks) {
3151 	xmlRegExecRollback *tmp;
3152 	int len = exec->maxRollbacks;
3153 
3154 	exec->maxRollbacks *= 2;
3155 	tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks,
3156 			exec->maxRollbacks * sizeof(xmlRegExecRollback));
3157 	if (tmp == NULL) {
3158 	    xmlRegexpErrMemory(NULL, "saving regexp");
3159 	    exec->maxRollbacks /= 2;
3160 	    return;
3161 	}
3162 	exec->rollbacks = tmp;
3163 	tmp = &exec->rollbacks[len];
3164 	memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback));
3165     }
3166     exec->rollbacks[exec->nbRollbacks].state = exec->state;
3167     exec->rollbacks[exec->nbRollbacks].index = exec->index;
3168     exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1;
3169     if (exec->comp->nbCounters > 0) {
3170 	if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3171 	    exec->rollbacks[exec->nbRollbacks].counts = (int *)
3172 		xmlMalloc(exec->comp->nbCounters * sizeof(int));
3173 	    if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3174 		xmlRegexpErrMemory(NULL, "saving regexp");
3175 		exec->status = -5;
3176 		return;
3177 	    }
3178 	}
3179 	memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts,
3180 	       exec->comp->nbCounters * sizeof(int));
3181     }
3182     exec->nbRollbacks++;
3183 }
3184 
3185 static void
xmlFARegExecRollBack(xmlRegExecCtxtPtr exec)3186 xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) {
3187     if (exec->nbRollbacks <= 0) {
3188 	exec->status = -1;
3189 #ifdef DEBUG_REGEXP_EXEC
3190 	printf("rollback failed on empty stack\n");
3191 #endif
3192 	return;
3193     }
3194     exec->nbRollbacks--;
3195     exec->state = exec->rollbacks[exec->nbRollbacks].state;
3196     exec->index = exec->rollbacks[exec->nbRollbacks].index;
3197     exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch;
3198     if (exec->comp->nbCounters > 0) {
3199 	if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3200 	    fprintf(stderr, "exec save: allocation failed");
3201 	    exec->status = -6;
3202 	    return;
3203 	}
3204 	if (exec->counts) {
3205 	    memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts,
3206 	       exec->comp->nbCounters * sizeof(int));
3207 	}
3208     }
3209 
3210 #ifdef DEBUG_REGEXP_EXEC
3211     printf("restored ");
3212     xmlFARegDebugExec(exec);
3213 #endif
3214 }
3215 
3216 /************************************************************************
3217  *									*
3218  *	Verifier, running an input against a compiled regexp		*
3219  *									*
3220  ************************************************************************/
3221 
3222 static int
xmlFARegExec(xmlRegexpPtr comp,const xmlChar * content)3223 xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
3224     xmlRegExecCtxt execval;
3225     xmlRegExecCtxtPtr exec = &execval;
3226     int ret, codepoint = 0, len, deter;
3227 
3228     exec->inputString = content;
3229     exec->index = 0;
3230     exec->nbPush = 0;
3231     exec->determinist = 1;
3232     exec->maxRollbacks = 0;
3233     exec->nbRollbacks = 0;
3234     exec->rollbacks = NULL;
3235     exec->status = 0;
3236     exec->comp = comp;
3237     exec->state = comp->states[0];
3238     exec->transno = 0;
3239     exec->transcount = 0;
3240     exec->inputStack = NULL;
3241     exec->inputStackMax = 0;
3242     if (comp->nbCounters > 0) {
3243 	exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
3244 	if (exec->counts == NULL) {
3245 	    xmlRegexpErrMemory(NULL, "running regexp");
3246 	    return(-1);
3247 	}
3248         memset(exec->counts, 0, comp->nbCounters * sizeof(int));
3249     } else
3250 	exec->counts = NULL;
3251     while ((exec->status == 0) && (exec->state != NULL) &&
3252 	   ((exec->inputString[exec->index] != 0) ||
3253 	    ((exec->state != NULL) &&
3254 	     (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
3255 	xmlRegTransPtr trans;
3256 	xmlRegAtomPtr atom;
3257 
3258 	/*
3259 	 * If end of input on non-terminal state, rollback, however we may
3260 	 * still have epsilon like transition for counted transitions
3261 	 * on counters, in that case don't break too early.  Additionally,
3262 	 * if we are working on a range like "AB{0,2}", where B is not present,
3263 	 * we don't want to break.
3264 	 */
3265 	len = 1;
3266 	if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) {
3267 	    /*
3268 	     * if there is a transition, we must check if
3269 	     *  atom allows minOccurs of 0
3270 	     */
3271 	    if (exec->transno < exec->state->nbTrans) {
3272 	        trans = &exec->state->trans[exec->transno];
3273 		if (trans->to >=0) {
3274 		    atom = trans->atom;
3275 		    if (!((atom->min == 0) && (atom->max > 0)))
3276 		        goto rollback;
3277 		}
3278 	    } else
3279 	        goto rollback;
3280 	}
3281 
3282 	exec->transcount = 0;
3283 	for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3284 	    trans = &exec->state->trans[exec->transno];
3285 	    if (trans->to < 0)
3286 		continue;
3287 	    atom = trans->atom;
3288 	    ret = 0;
3289 	    deter = 1;
3290 	    if (trans->count >= 0) {
3291 		int count;
3292 		xmlRegCounterPtr counter;
3293 
3294 		if (exec->counts == NULL) {
3295 		    exec->status = -1;
3296 		    goto error;
3297 		}
3298 		/*
3299 		 * A counted transition.
3300 		 */
3301 
3302 		count = exec->counts[trans->count];
3303 		counter = &exec->comp->counters[trans->count];
3304 #ifdef DEBUG_REGEXP_EXEC
3305 		printf("testing count %d: val %d, min %d, max %d\n",
3306 		       trans->count, count, counter->min,  counter->max);
3307 #endif
3308 		ret = ((count >= counter->min) && (count <= counter->max));
3309 		if ((ret) && (counter->min != counter->max))
3310 		    deter = 0;
3311 	    } else if (atom == NULL) {
3312 		fprintf(stderr, "epsilon transition left at runtime\n");
3313 		exec->status = -2;
3314 		break;
3315 	    } else if (exec->inputString[exec->index] != 0) {
3316                 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
3317 		ret = xmlRegCheckCharacter(atom, codepoint);
3318 		if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) {
3319 		    xmlRegStatePtr to = comp->states[trans->to];
3320 
3321 		    /*
3322 		     * this is a multiple input sequence
3323 		     * If there is a counter associated increment it now.
3324 		     * before potentially saving and rollback
3325 		     * do not increment if the counter is already over the
3326 		     * maximum limit in which case get to next transition
3327 		     */
3328 		    if (trans->counter >= 0) {
3329 			xmlRegCounterPtr counter;
3330 
3331 			if ((exec->counts == NULL) ||
3332 			    (exec->comp == NULL) ||
3333 			    (exec->comp->counters == NULL)) {
3334 			    exec->status = -1;
3335 			    goto error;
3336 			}
3337 			counter = &exec->comp->counters[trans->counter];
3338 			if (exec->counts[trans->counter] >= counter->max)
3339 			    continue; /* for loop on transitions */
3340 
3341 #ifdef DEBUG_REGEXP_EXEC
3342 			printf("Increasing count %d\n", trans->counter);
3343 #endif
3344 			exec->counts[trans->counter]++;
3345 		    }
3346 		    if (exec->state->nbTrans > exec->transno + 1) {
3347 			xmlFARegExecSave(exec);
3348 		    }
3349 		    exec->transcount = 1;
3350 		    do {
3351 			/*
3352 			 * Try to progress as much as possible on the input
3353 			 */
3354 			if (exec->transcount == atom->max) {
3355 			    break;
3356 			}
3357 			exec->index += len;
3358 			/*
3359 			 * End of input: stop here
3360 			 */
3361 			if (exec->inputString[exec->index] == 0) {
3362 			    exec->index -= len;
3363 			    break;
3364 			}
3365 			if (exec->transcount >= atom->min) {
3366 			    int transno = exec->transno;
3367 			    xmlRegStatePtr state = exec->state;
3368 
3369 			    /*
3370 			     * The transition is acceptable save it
3371 			     */
3372 			    exec->transno = -1; /* trick */
3373 			    exec->state = to;
3374 			    xmlFARegExecSave(exec);
3375 			    exec->transno = transno;
3376 			    exec->state = state;
3377 			}
3378 			codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
3379 				              len);
3380 			ret = xmlRegCheckCharacter(atom, codepoint);
3381 			exec->transcount++;
3382 		    } while (ret == 1);
3383 		    if (exec->transcount < atom->min)
3384 			ret = 0;
3385 
3386 		    /*
3387 		     * If the last check failed but one transition was found
3388 		     * possible, rollback
3389 		     */
3390 		    if (ret < 0)
3391 			ret = 0;
3392 		    if (ret == 0) {
3393 			goto rollback;
3394 		    }
3395 		    if (trans->counter >= 0) {
3396 			if (exec->counts == NULL) {
3397 			    exec->status = -1;
3398 			    goto error;
3399 			}
3400 #ifdef DEBUG_REGEXP_EXEC
3401 			printf("Decreasing count %d\n", trans->counter);
3402 #endif
3403 			exec->counts[trans->counter]--;
3404 		    }
3405 		} else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) {
3406 		    /*
3407 		     * we don't match on the codepoint, but minOccurs of 0
3408 		     * says that's ok.  Setting len to 0 inhibits stepping
3409 		     * over the codepoint.
3410 		     */
3411 		    exec->transcount = 1;
3412 		    len = 0;
3413 		    ret = 1;
3414 		}
3415 	    } else if ((atom->min == 0) && (atom->max > 0)) {
3416 	        /* another spot to match when minOccurs is 0 */
3417 		exec->transcount = 1;
3418 		len = 0;
3419 		ret = 1;
3420 	    }
3421 	    if (ret == 1) {
3422 		if ((trans->nd == 1) ||
3423 		    ((trans->count >= 0) && (deter == 0) &&
3424 		     (exec->state->nbTrans > exec->transno + 1))) {
3425 #ifdef DEBUG_REGEXP_EXEC
3426 		    if (trans->nd == 1)
3427 		        printf("Saving on nd transition atom %d for %c at %d\n",
3428 			       trans->atom->no, codepoint, exec->index);
3429 		    else
3430 		        printf("Saving on counted transition count %d for %c at %d\n",
3431 			       trans->count, codepoint, exec->index);
3432 #endif
3433 		    xmlFARegExecSave(exec);
3434 		}
3435 		if (trans->counter >= 0) {
3436 		    xmlRegCounterPtr counter;
3437 
3438                     /* make sure we don't go over the counter maximum value */
3439 		    if ((exec->counts == NULL) ||
3440 			(exec->comp == NULL) ||
3441 			(exec->comp->counters == NULL)) {
3442 			exec->status = -1;
3443 			goto error;
3444 		    }
3445 		    counter = &exec->comp->counters[trans->counter];
3446 		    if (exec->counts[trans->counter] >= counter->max)
3447 			continue; /* for loop on transitions */
3448 #ifdef DEBUG_REGEXP_EXEC
3449 		    printf("Increasing count %d\n", trans->counter);
3450 #endif
3451 		    exec->counts[trans->counter]++;
3452 		}
3453 		if ((trans->count >= 0) &&
3454 		    (trans->count < REGEXP_ALL_COUNTER)) {
3455 		    if (exec->counts == NULL) {
3456 		        exec->status = -1;
3457 			goto error;
3458 		    }
3459 #ifdef DEBUG_REGEXP_EXEC
3460 		    printf("resetting count %d on transition\n",
3461 		           trans->count);
3462 #endif
3463 		    exec->counts[trans->count] = 0;
3464 		}
3465 #ifdef DEBUG_REGEXP_EXEC
3466 		printf("entering state %d\n", trans->to);
3467 #endif
3468 		exec->state = comp->states[trans->to];
3469 		exec->transno = 0;
3470 		if (trans->atom != NULL) {
3471 		    exec->index += len;
3472 		}
3473 		goto progress;
3474 	    } else if (ret < 0) {
3475 		exec->status = -4;
3476 		break;
3477 	    }
3478 	}
3479 	if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
3480 rollback:
3481 	    /*
3482 	     * Failed to find a way out
3483 	     */
3484 	    exec->determinist = 0;
3485 #ifdef DEBUG_REGEXP_EXEC
3486 	    printf("rollback from state %d on %d:%c\n", exec->state->no,
3487 	           codepoint,codepoint);
3488 #endif
3489 	    xmlFARegExecRollBack(exec);
3490 	}
3491 progress:
3492 	continue;
3493     }
3494 error:
3495     if (exec->rollbacks != NULL) {
3496 	if (exec->counts != NULL) {
3497 	    int i;
3498 
3499 	    for (i = 0;i < exec->maxRollbacks;i++)
3500 		if (exec->rollbacks[i].counts != NULL)
3501 		    xmlFree(exec->rollbacks[i].counts);
3502 	}
3503 	xmlFree(exec->rollbacks);
3504     }
3505     if (exec->state == NULL)
3506         return(-1);
3507     if (exec->counts != NULL)
3508 	xmlFree(exec->counts);
3509     if (exec->status == 0)
3510 	return(1);
3511     if (exec->status == -1) {
3512 	if (exec->nbPush > MAX_PUSH)
3513 	    return(-1);
3514 	return(0);
3515     }
3516     return(exec->status);
3517 }
3518 
3519 /************************************************************************
3520  *									*
3521  *	Progressive interface to the verifier one atom at a time	*
3522  *									*
3523  ************************************************************************/
/*
 * Forward declaration of the DEBUG_ERR only helper; when DEBUG_ERR is
 * defined it is called with the failing context from the error path of
 * xmlRegCompactPushString().
 */
#ifdef DEBUG_ERR
static void testerr(xmlRegExecCtxtPtr exec);
#endif
3527 
3528 /**
3529  * xmlRegNewExecCtxt:
3530  * @comp: a precompiled regular expression
3531  * @callback: a callback function used for handling progresses in the
3532  *            automata matching phase
3533  * @data: the context data associated to the callback in this context
3534  *
3535  * Build a context used for progressive evaluation of a regexp.
3536  *
3537  * Returns the new context
3538  */
3539 xmlRegExecCtxtPtr
xmlRegNewExecCtxt(xmlRegexpPtr comp,xmlRegExecCallbacks callback,void * data)3540 xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) {
3541     xmlRegExecCtxtPtr exec;
3542 
3543     if (comp == NULL)
3544 	return(NULL);
3545     if ((comp->compact == NULL) && (comp->states == NULL))
3546         return(NULL);
3547     exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt));
3548     if (exec == NULL) {
3549 	xmlRegexpErrMemory(NULL, "creating execution context");
3550 	return(NULL);
3551     }
3552     memset(exec, 0, sizeof(xmlRegExecCtxt));
3553     exec->inputString = NULL;
3554     exec->index = 0;
3555     exec->determinist = 1;
3556     exec->maxRollbacks = 0;
3557     exec->nbRollbacks = 0;
3558     exec->rollbacks = NULL;
3559     exec->status = 0;
3560     exec->comp = comp;
3561     if (comp->compact == NULL)
3562 	exec->state = comp->states[0];
3563     exec->transno = 0;
3564     exec->transcount = 0;
3565     exec->callback = callback;
3566     exec->data = data;
3567     if (comp->nbCounters > 0) {
3568         /*
3569 	 * For error handling, exec->counts is allocated twice the size
3570 	 * the second half is used to store the data in case of rollback
3571 	 */
3572 	exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)
3573 	                                 * 2);
3574 	if (exec->counts == NULL) {
3575 	    xmlRegexpErrMemory(NULL, "creating execution context");
3576 	    xmlFree(exec);
3577 	    return(NULL);
3578 	}
3579         memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2);
3580 	exec->errCounts = &exec->counts[comp->nbCounters];
3581     } else {
3582 	exec->counts = NULL;
3583 	exec->errCounts = NULL;
3584     }
3585     exec->inputStackMax = 0;
3586     exec->inputStackNr = 0;
3587     exec->inputStack = NULL;
3588     exec->errStateNo = -1;
3589     exec->errString = NULL;
3590     exec->nbPush = 0;
3591     return(exec);
3592 }
3593 
3594 /**
3595  * xmlRegFreeExecCtxt:
3596  * @exec: a regular expression evaulation context
3597  *
3598  * Free the structures associated to a regular expression evaulation context.
3599  */
3600 void
xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec)3601 xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) {
3602     if (exec == NULL)
3603 	return;
3604 
3605     if (exec->rollbacks != NULL) {
3606 	if (exec->counts != NULL) {
3607 	    int i;
3608 
3609 	    for (i = 0;i < exec->maxRollbacks;i++)
3610 		if (exec->rollbacks[i].counts != NULL)
3611 		    xmlFree(exec->rollbacks[i].counts);
3612 	}
3613 	xmlFree(exec->rollbacks);
3614     }
3615     if (exec->counts != NULL)
3616 	xmlFree(exec->counts);
3617     if (exec->inputStack != NULL) {
3618 	int i;
3619 
3620 	for (i = 0;i < exec->inputStackNr;i++) {
3621 	    if (exec->inputStack[i].value != NULL)
3622 		xmlFree(exec->inputStack[i].value);
3623 	}
3624 	xmlFree(exec->inputStack);
3625     }
3626     if (exec->errString != NULL)
3627         xmlFree(exec->errString);
3628     xmlFree(exec);
3629 }
3630 
3631 static void
xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec,const xmlChar * value,void * data)3632 xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value,
3633 	                    void *data) {
3634 #ifdef DEBUG_PUSH
3635     printf("saving value: %d:%s\n", exec->inputStackNr, value);
3636 #endif
3637     if (exec->inputStackMax == 0) {
3638 	exec->inputStackMax = 4;
3639 	exec->inputStack = (xmlRegInputTokenPtr)
3640 	    xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken));
3641 	if (exec->inputStack == NULL) {
3642 	    xmlRegexpErrMemory(NULL, "pushing input string");
3643 	    exec->inputStackMax = 0;
3644 	    return;
3645 	}
3646     } else if (exec->inputStackNr + 1 >= exec->inputStackMax) {
3647 	xmlRegInputTokenPtr tmp;
3648 
3649 	exec->inputStackMax *= 2;
3650 	tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack,
3651 			exec->inputStackMax * sizeof(xmlRegInputToken));
3652 	if (tmp == NULL) {
3653 	    xmlRegexpErrMemory(NULL, "pushing input string");
3654 	    exec->inputStackMax /= 2;
3655 	    return;
3656 	}
3657 	exec->inputStack = tmp;
3658     }
3659     exec->inputStack[exec->inputStackNr].value = xmlStrdup(value);
3660     exec->inputStack[exec->inputStackNr].data = data;
3661     exec->inputStackNr++;
3662     exec->inputStack[exec->inputStackNr].value = NULL;
3663     exec->inputStack[exec->inputStackNr].data = NULL;
3664 }
3665 
3666 /**
3667  * xmlRegStrEqualWildcard:
3668  * @expStr:  the string to be evaluated
3669  * @valStr:  the validation string
3670  *
3671  * Checks if both strings are equal or have the same content. "*"
3672  * can be used as a wildcard in @valStr; "|" is used as a seperator of
3673  * substrings in both @expStr and @valStr.
3674  *
3675  * Returns 1 if the comparison is satisfied and the number of substrings
3676  * is equal, 0 otherwise.
3677  */
3678 
3679 static int
xmlRegStrEqualWildcard(const xmlChar * expStr,const xmlChar * valStr)3680 xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) {
3681     if (expStr == valStr) return(1);
3682     if (expStr == NULL) return(0);
3683     if (valStr == NULL) return(0);
3684     do {
3685 	/*
3686 	* Eval if we have a wildcard for the current item.
3687 	*/
3688         if (*expStr != *valStr) {
3689 	    /* if one of them starts with a wildcard make valStr be it */
3690 	    if (*valStr == '*') {
3691 	        const xmlChar *tmp;
3692 
3693 		tmp = valStr;
3694 		valStr = expStr;
3695 		expStr = tmp;
3696 	    }
3697 	    if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) {
3698 		do {
3699 		    if (*valStr == XML_REG_STRING_SEPARATOR)
3700 			break;
3701 		    valStr++;
3702 		} while (*valStr != 0);
3703 		continue;
3704 	    } else
3705 		return(0);
3706 	}
3707 	expStr++;
3708 	valStr++;
3709     } while (*valStr != 0);
3710     if (*expStr != 0)
3711 	return (0);
3712     else
3713 	return (1);
3714 }
3715 
3716 /**
3717  * xmlRegCompactPushString:
3718  * @exec: a regexp execution context
3719  * @comp:  the precompiled exec with a compact table
3720  * @value: a string token input
3721  * @data: data associated to the token to reuse in callbacks
3722  *
3723  * Push one input token in the execution context
3724  *
3725  * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3726  *     a negative value in case of error.
3727  */
3728 static int
xmlRegCompactPushString(xmlRegExecCtxtPtr exec,xmlRegexpPtr comp,const xmlChar * value,void * data)3729 xmlRegCompactPushString(xmlRegExecCtxtPtr exec,
3730 	                xmlRegexpPtr comp,
3731 	                const xmlChar *value,
3732 	                void *data) {
3733     int state = exec->index;
3734     int i, target;
3735 
3736     if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL))
3737 	return(-1);
3738 
3739     if (value == NULL) {
3740 	/*
3741 	 * are we at a final state ?
3742 	 */
3743 	if (comp->compact[state * (comp->nbstrings + 1)] ==
3744             XML_REGEXP_FINAL_STATE)
3745 	    return(1);
3746 	return(0);
3747     }
3748 
3749 #ifdef DEBUG_PUSH
3750     printf("value pushed: %s\n", value);
3751 #endif
3752 
3753     /*
3754      * Examine all outside transitions from current state
3755      */
3756     for (i = 0;i < comp->nbstrings;i++) {
3757 	target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
3758 	if ((target > 0) && (target <= comp->nbstates)) {
3759 	    target--; /* to avoid 0 */
3760 	    if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) {
3761 		exec->index = target;
3762 		if ((exec->callback != NULL) && (comp->transdata != NULL)) {
3763 		    exec->callback(exec->data, value,
3764 			  comp->transdata[state * comp->nbstrings + i], data);
3765 		}
3766 #ifdef DEBUG_PUSH
3767 		printf("entering state %d\n", target);
3768 #endif
3769 		if (comp->compact[target * (comp->nbstrings + 1)] ==
3770 		    XML_REGEXP_SINK_STATE)
3771 		    goto error;
3772 
3773 		if (comp->compact[target * (comp->nbstrings + 1)] ==
3774 		    XML_REGEXP_FINAL_STATE)
3775 		    return(1);
3776 		return(0);
3777 	    }
3778 	}
3779     }
3780     /*
3781      * Failed to find an exit transition out from current state for the
3782      * current token
3783      */
3784 #ifdef DEBUG_PUSH
3785     printf("failed to find a transition for %s on state %d\n", value, state);
3786 #endif
3787 error:
3788     if (exec->errString != NULL)
3789         xmlFree(exec->errString);
3790     exec->errString = xmlStrdup(value);
3791     exec->errStateNo = state;
3792     exec->status = -1;
3793 #ifdef DEBUG_ERR
3794     testerr(exec);
3795 #endif
3796     return(-1);
3797 }
3798 
3799 /**
3800  * xmlRegExecPushStringInternal:
3801  * @exec: a regexp execution context or NULL to indicate the end
3802  * @value: a string token input
3803  * @data: data associated to the token to reuse in callbacks
3804  * @compound: value was assembled from 2 strings
3805  *
3806  * Push one input token in the execution context
3807  *
3808  * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3809  *     a negative value in case of error.
3810  */
3811 static int
xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec,const xmlChar * value,void * data,int compound)3812 xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value,
3813 	                     void *data, int compound) {
3814     xmlRegTransPtr trans;
3815     xmlRegAtomPtr atom;
3816     int ret;
3817     int final = 0;
3818     int progress = 1;
3819 
3820     if (exec == NULL)
3821 	return(-1);
3822     if (exec->comp == NULL)
3823 	return(-1);
3824     if (exec->status != 0)
3825 	return(exec->status);
3826 
3827     if (exec->comp->compact != NULL)
3828 	return(xmlRegCompactPushString(exec, exec->comp, value, data));
3829 
3830     if (value == NULL) {
3831         if (exec->state->type == XML_REGEXP_FINAL_STATE)
3832 	    return(1);
3833 	final = 1;
3834     }
3835 
3836 #ifdef DEBUG_PUSH
3837     printf("value pushed: %s\n", value);
3838 #endif
3839     /*
3840      * If we have an active rollback stack push the new value there
3841      * and get back to where we were left
3842      */
3843     if ((value != NULL) && (exec->inputStackNr > 0)) {
3844 	xmlFARegExecSaveInputString(exec, value, data);
3845 	value = exec->inputStack[exec->index].value;
3846 	data = exec->inputStack[exec->index].data;
3847 #ifdef DEBUG_PUSH
3848 	printf("value loaded: %s\n", value);
3849 #endif
3850     }
3851 
3852     while ((exec->status == 0) &&
3853 	   ((value != NULL) ||
3854 	    ((final == 1) &&
3855 	     (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
3856 
3857 	/*
3858 	 * End of input on non-terminal state, rollback, however we may
3859 	 * still have epsilon like transition for counted transitions
3860 	 * on counters, in that case don't break too early.
3861 	 */
3862 	if ((value == NULL) && (exec->counts == NULL))
3863 	    goto rollback;
3864 
3865 	exec->transcount = 0;
3866 	for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3867 	    trans = &exec->state->trans[exec->transno];
3868 	    if (trans->to < 0)
3869 		continue;
3870 	    atom = trans->atom;
3871 	    ret = 0;
3872 	    if (trans->count == REGEXP_ALL_LAX_COUNTER) {
3873 		int i;
3874 		int count;
3875 		xmlRegTransPtr t;
3876 		xmlRegCounterPtr counter;
3877 
3878 		ret = 0;
3879 
3880 #ifdef DEBUG_PUSH
3881 		printf("testing all lax %d\n", trans->count);
3882 #endif
3883 		/*
3884 		 * Check all counted transitions from the current state
3885 		 */
3886 		if ((value == NULL) && (final)) {
3887 		    ret = 1;
3888 		} else if (value != NULL) {
3889 		    for (i = 0;i < exec->state->nbTrans;i++) {
3890 			t = &exec->state->trans[i];
3891 			if ((t->counter < 0) || (t == trans))
3892 			    continue;
3893 			counter = &exec->comp->counters[t->counter];
3894 			count = exec->counts[t->counter];
3895 			if ((count < counter->max) &&
3896 		            (t->atom != NULL) &&
3897 			    (xmlStrEqual(value, t->atom->valuep))) {
3898 			    ret = 0;
3899 			    break;
3900 			}
3901 			if ((count >= counter->min) &&
3902 			    (count < counter->max) &&
3903 			    (t->atom != NULL) &&
3904 			    (xmlStrEqual(value, t->atom->valuep))) {
3905 			    ret = 1;
3906 			    break;
3907 			}
3908 		    }
3909 		}
3910 	    } else if (trans->count == REGEXP_ALL_COUNTER) {
3911 		int i;
3912 		int count;
3913 		xmlRegTransPtr t;
3914 		xmlRegCounterPtr counter;
3915 
3916 		ret = 1;
3917 
3918 #ifdef DEBUG_PUSH
3919 		printf("testing all %d\n", trans->count);
3920 #endif
3921 		/*
3922 		 * Check all counted transitions from the current state
3923 		 */
3924 		for (i = 0;i < exec->state->nbTrans;i++) {
3925                     t = &exec->state->trans[i];
3926 		    if ((t->counter < 0) || (t == trans))
3927 			continue;
3928                     counter = &exec->comp->counters[t->counter];
3929 		    count = exec->counts[t->counter];
3930 		    if ((count < counter->min) || (count > counter->max)) {
3931 			ret = 0;
3932 			break;
3933 		    }
3934 		}
3935 	    } else if (trans->count >= 0) {
3936 		int count;
3937 		xmlRegCounterPtr counter;
3938 
3939 		/*
3940 		 * A counted transition.
3941 		 */
3942 
3943 		count = exec->counts[trans->count];
3944 		counter = &exec->comp->counters[trans->count];
3945 #ifdef DEBUG_PUSH
3946 		printf("testing count %d: val %d, min %d, max %d\n",
3947 		       trans->count, count, counter->min,  counter->max);
3948 #endif
3949 		ret = ((count >= counter->min) && (count <= counter->max));
3950 	    } else if (atom == NULL) {
3951 		fprintf(stderr, "epsilon transition left at runtime\n");
3952 		exec->status = -2;
3953 		break;
3954 	    } else if (value != NULL) {
3955 		ret = xmlRegStrEqualWildcard(atom->valuep, value);
3956 		if (atom->neg) {
3957 		    ret = !ret;
3958 		    if (!compound)
3959 		        ret = 0;
3960 		}
3961 		if ((ret == 1) && (trans->counter >= 0)) {
3962 		    xmlRegCounterPtr counter;
3963 		    int count;
3964 
3965 		    count = exec->counts[trans->counter];
3966 		    counter = &exec->comp->counters[trans->counter];
3967 		    if (count >= counter->max)
3968 			ret = 0;
3969 		}
3970 
3971 		if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
3972 		    xmlRegStatePtr to = exec->comp->states[trans->to];
3973 
3974 		    /*
3975 		     * this is a multiple input sequence
3976 		     */
3977 		    if (exec->state->nbTrans > exec->transno + 1) {
3978 			if (exec->inputStackNr <= 0) {
3979 			    xmlFARegExecSaveInputString(exec, value, data);
3980 			}
3981 			xmlFARegExecSave(exec);
3982 		    }
3983 		    exec->transcount = 1;
3984 		    do {
3985 			/*
3986 			 * Try to progress as much as possible on the input
3987 			 */
3988 			if (exec->transcount == atom->max) {
3989 			    break;
3990 			}
3991 			exec->index++;
3992 			value = exec->inputStack[exec->index].value;
3993 			data = exec->inputStack[exec->index].data;
3994 #ifdef DEBUG_PUSH
3995 			printf("value loaded: %s\n", value);
3996 #endif
3997 
3998 			/*
3999 			 * End of input: stop here
4000 			 */
4001 			if (value == NULL) {
4002 			    exec->index --;
4003 			    break;
4004 			}
4005 			if (exec->transcount >= atom->min) {
4006 			    int transno = exec->transno;
4007 			    xmlRegStatePtr state = exec->state;
4008 
4009 			    /*
4010 			     * The transition is acceptable save it
4011 			     */
4012 			    exec->transno = -1; /* trick */
4013 			    exec->state = to;
4014 			    if (exec->inputStackNr <= 0) {
4015 				xmlFARegExecSaveInputString(exec, value, data);
4016 			    }
4017 			    xmlFARegExecSave(exec);
4018 			    exec->transno = transno;
4019 			    exec->state = state;
4020 			}
4021 			ret = xmlStrEqual(value, atom->valuep);
4022 			exec->transcount++;
4023 		    } while (ret == 1);
4024 		    if (exec->transcount < atom->min)
4025 			ret = 0;
4026 
4027 		    /*
4028 		     * If the last check failed but one transition was found
4029 		     * possible, rollback
4030 		     */
4031 		    if (ret < 0)
4032 			ret = 0;
4033 		    if (ret == 0) {
4034 			goto rollback;
4035 		    }
4036 		}
4037 	    }
4038 	    if (ret == 1) {
4039 		if ((exec->callback != NULL) && (atom != NULL) &&
4040 			(data != NULL)) {
4041 		    exec->callback(exec->data, atom->valuep,
4042 			           atom->data, data);
4043 		}
4044 		if (exec->state->nbTrans > exec->transno + 1) {
4045 		    if (exec->inputStackNr <= 0) {
4046 			xmlFARegExecSaveInputString(exec, value, data);
4047 		    }
4048 		    xmlFARegExecSave(exec);
4049 		}
4050 		if (trans->counter >= 0) {
4051 #ifdef DEBUG_PUSH
4052 		    printf("Increasing count %d\n", trans->counter);
4053 #endif
4054 		    exec->counts[trans->counter]++;
4055 		}
4056 		if ((trans->count >= 0) &&
4057 		    (trans->count < REGEXP_ALL_COUNTER)) {
4058 #ifdef DEBUG_REGEXP_EXEC
4059 		    printf("resetting count %d on transition\n",
4060 		           trans->count);
4061 #endif
4062 		    exec->counts[trans->count] = 0;
4063 		}
4064 #ifdef DEBUG_PUSH
4065 		printf("entering state %d\n", trans->to);
4066 #endif
4067                 if ((exec->comp->states[trans->to] != NULL) &&
4068 		    (exec->comp->states[trans->to]->type ==
4069 		     XML_REGEXP_SINK_STATE)) {
4070 		    /*
4071 		     * entering a sink state, save the current state as error
4072 		     * state.
4073 		     */
4074 		    if (exec->errString != NULL)
4075 			xmlFree(exec->errString);
4076 		    exec->errString = xmlStrdup(value);
4077 		    exec->errState = exec->state;
4078 		    memcpy(exec->errCounts, exec->counts,
4079 			   exec->comp->nbCounters * sizeof(int));
4080 		}
4081 		exec->state = exec->comp->states[trans->to];
4082 		exec->transno = 0;
4083 		if (trans->atom != NULL) {
4084 		    if (exec->inputStack != NULL) {
4085 			exec->index++;
4086 			if (exec->index < exec->inputStackNr) {
4087 			    value = exec->inputStack[exec->index].value;
4088 			    data = exec->inputStack[exec->index].data;
4089 #ifdef DEBUG_PUSH
4090 			    printf("value loaded: %s\n", value);
4091 #endif
4092 			} else {
4093 			    value = NULL;
4094 			    data = NULL;
4095 #ifdef DEBUG_PUSH
4096 			    printf("end of input\n");
4097 #endif
4098 			}
4099 		    } else {
4100 			value = NULL;
4101 			data = NULL;
4102 #ifdef DEBUG_PUSH
4103 			printf("end of input\n");
4104 #endif
4105 		    }
4106 		}
4107 		goto progress;
4108 	    } else if (ret < 0) {
4109 		exec->status = -4;
4110 		break;
4111 	    }
4112 	}
4113 	if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
4114 rollback:
4115             /*
4116 	     * if we didn't yet rollback on the current input
4117 	     * store the current state as the error state.
4118 	     */
4119 	    if ((progress) && (exec->state != NULL) &&
4120 	        (exec->state->type != XML_REGEXP_SINK_STATE)) {
4121 	        progress = 0;
4122 		if (exec->errString != NULL)
4123 		    xmlFree(exec->errString);
4124 		exec->errString = xmlStrdup(value);
4125 		exec->errState = exec->state;
4126                 if (exec->comp->nbCounters)
4127                     memcpy(exec->errCounts, exec->counts,
4128                            exec->comp->nbCounters * sizeof(int));
4129 	    }
4130 
4131 	    /*
4132 	     * Failed to find a way out
4133 	     */
4134 	    exec->determinist = 0;
4135 	    xmlFARegExecRollBack(exec);
4136 	    if ((exec->inputStack != NULL ) && (exec->status == 0)) {
4137 		value = exec->inputStack[exec->index].value;
4138 		data = exec->inputStack[exec->index].data;
4139 #ifdef DEBUG_PUSH
4140 		printf("value loaded: %s\n", value);
4141 #endif
4142 	    }
4143 	}
4144 	continue;
4145 progress:
4146         progress = 1;
4147 	continue;
4148     }
4149     if (exec->status == 0) {
4150         return(exec->state->type == XML_REGEXP_FINAL_STATE);
4151     }
4152 #ifdef DEBUG_ERR
4153     if (exec->status < 0) {
4154 	testerr(exec);
4155     }
4156 #endif
4157     return(exec->status);
4158 }
4159 
4160 /**
4161  * xmlRegExecPushString:
4162  * @exec: a regexp execution context or NULL to indicate the end
4163  * @value: a string token input
4164  * @data: data associated to the token to reuse in callbacks
4165  *
4166  * Push one input token in the execution context
4167  *
4168  * Returns: 1 if the regexp reached a final state, 0 if non-final, and
4169  *     a negative value in case of error.
4170  */
4171 int
xmlRegExecPushString(xmlRegExecCtxtPtr exec,const xmlChar * value,void * data)4172 xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value,
4173 	             void *data) {
4174     return(xmlRegExecPushStringInternal(exec, value, data, 0));
4175 }
4176 
4177 /**
4178  * xmlRegExecPushString2:
4179  * @exec: a regexp execution context or NULL to indicate the end
4180  * @value: the first string token input
4181  * @value2: the second string token input
4182  * @data: data associated to the token to reuse in callbacks
4183  *
4184  * Push one input token in the execution context
4185  *
4186  * Returns: 1 if the regexp reached a final state, 0 if non-final, and
4187  *     a negative value in case of error.
4188  */
4189 int
xmlRegExecPushString2(xmlRegExecCtxtPtr exec,const xmlChar * value,const xmlChar * value2,void * data)4190 xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value,
4191                       const xmlChar *value2, void *data) {
4192     xmlChar buf[150];
4193     int lenn, lenp, ret;
4194     xmlChar *str;
4195 
4196     if (exec == NULL)
4197 	return(-1);
4198     if (exec->comp == NULL)
4199 	return(-1);
4200     if (exec->status != 0)
4201 	return(exec->status);
4202 
4203     if (value2 == NULL)
4204         return(xmlRegExecPushString(exec, value, data));
4205 
4206     lenn = strlen((char *) value2);
4207     lenp = strlen((char *) value);
4208 
4209     if (150 < lenn + lenp + 2) {
4210 	str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
4211 	if (str == NULL) {
4212 	    exec->status = -1;
4213 	    return(-1);
4214 	}
4215     } else {
4216 	str = buf;
4217     }
4218     memcpy(&str[0], value, lenp);
4219     str[lenp] = XML_REG_STRING_SEPARATOR;
4220     memcpy(&str[lenp + 1], value2, lenn);
4221     str[lenn + lenp + 1] = 0;
4222 
4223     if (exec->comp->compact != NULL)
4224 	ret = xmlRegCompactPushString(exec, exec->comp, str, data);
4225     else
4226         ret = xmlRegExecPushStringInternal(exec, str, data, 1);
4227 
4228     if (str != buf)
4229         xmlFree(str);
4230     return(ret);
4231 }
4232 
4233 /**
4234  * xmlRegExecGetValues:
4235  * @exec: a regexp execution context
4236  * @err: error extraction or normal one
4237  * @nbval: pointer to the number of accepted values IN/OUT
4238  * @nbneg: return number of negative transitions
4239  * @values: pointer to the array of acceptable values
4240  * @terminal: return value if this was a terminal state
4241  *
4242  * Extract informations from the regexp execution, internal routine to
4243  * implement xmlRegExecNextValues() and xmlRegExecErrInfo()
4244  *
4245  * Returns: 0 in case of success or -1 in case of error.
4246  */
4247 static int
xmlRegExecGetValues(xmlRegExecCtxtPtr exec,int err,int * nbval,int * nbneg,xmlChar ** values,int * terminal)4248 xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err,
4249                     int *nbval, int *nbneg,
4250 		    xmlChar **values, int *terminal) {
4251     int maxval;
4252     int nb = 0;
4253 
4254     if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) ||
4255         (values == NULL) || (*nbval <= 0))
4256         return(-1);
4257 
4258     maxval = *nbval;
4259     *nbval = 0;
4260     *nbneg = 0;
4261     if ((exec->comp != NULL) && (exec->comp->compact != NULL)) {
4262         xmlRegexpPtr comp;
4263 	int target, i, state;
4264 
4265         comp = exec->comp;
4266 
4267 	if (err) {
4268 	    if (exec->errStateNo == -1) return(-1);
4269 	    state = exec->errStateNo;
4270 	} else {
4271 	    state = exec->index;
4272 	}
4273 	if (terminal != NULL) {
4274 	    if (comp->compact[state * (comp->nbstrings + 1)] ==
4275 	        XML_REGEXP_FINAL_STATE)
4276 		*terminal = 1;
4277 	    else
4278 		*terminal = 0;
4279 	}
4280 	for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
4281 	    target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
4282 	    if ((target > 0) && (target <= comp->nbstates) &&
4283 	        (comp->compact[(target - 1) * (comp->nbstrings + 1)] !=
4284 		 XML_REGEXP_SINK_STATE)) {
4285 	        values[nb++] = comp->stringMap[i];
4286 		(*nbval)++;
4287 	    }
4288 	}
4289 	for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
4290 	    target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
4291 	    if ((target > 0) && (target <= comp->nbstates) &&
4292 	        (comp->compact[(target - 1) * (comp->nbstrings + 1)] ==
4293 		 XML_REGEXP_SINK_STATE)) {
4294 	        values[nb++] = comp->stringMap[i];
4295 		(*nbneg)++;
4296 	    }
4297 	}
4298     } else {
4299         int transno;
4300 	xmlRegTransPtr trans;
4301 	xmlRegAtomPtr atom;
4302 	xmlRegStatePtr state;
4303 
4304 	if (terminal != NULL) {
4305 	    if (exec->state->type == XML_REGEXP_FINAL_STATE)
4306 		*terminal = 1;
4307 	    else
4308 		*terminal = 0;
4309 	}
4310 
4311 	if (err) {
4312 	    if (exec->errState == NULL) return(-1);
4313 	    state = exec->errState;
4314 	} else {
4315 	    if (exec->state == NULL) return(-1);
4316 	    state = exec->state;
4317 	}
4318 	for (transno = 0;
4319 	     (transno < state->nbTrans) && (nb < maxval);
4320 	     transno++) {
4321 	    trans = &state->trans[transno];
4322 	    if (trans->to < 0)
4323 		continue;
4324 	    atom = trans->atom;
4325 	    if ((atom == NULL) || (atom->valuep == NULL))
4326 		continue;
4327 	    if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4328 	        /* this should not be reached but ... */
4329 	        TODO;
4330 	    } else if (trans->count == REGEXP_ALL_COUNTER) {
4331 	        /* this should not be reached but ... */
4332 	        TODO;
4333 	    } else if (trans->counter >= 0) {
4334 		xmlRegCounterPtr counter = NULL;
4335 		int count;
4336 
4337 		if (err)
4338 		    count = exec->errCounts[trans->counter];
4339 		else
4340 		    count = exec->counts[trans->counter];
4341 		if (exec->comp != NULL)
4342 		    counter = &exec->comp->counters[trans->counter];
4343 		if ((counter == NULL) || (count < counter->max)) {
4344 		    if (atom->neg)
4345 			values[nb++] = (xmlChar *) atom->valuep2;
4346 		    else
4347 			values[nb++] = (xmlChar *) atom->valuep;
4348 		    (*nbval)++;
4349 		}
4350 	    } else {
4351                 if ((exec->comp != NULL) && (exec->comp->states[trans->to] != NULL) &&
4352 		    (exec->comp->states[trans->to]->type !=
4353 		     XML_REGEXP_SINK_STATE)) {
4354 		    if (atom->neg)
4355 			values[nb++] = (xmlChar *) atom->valuep2;
4356 		    else
4357 			values[nb++] = (xmlChar *) atom->valuep;
4358 		    (*nbval)++;
4359 		}
4360 	    }
4361 	}
4362 	for (transno = 0;
4363 	     (transno < state->nbTrans) && (nb < maxval);
4364 	     transno++) {
4365 	    trans = &state->trans[transno];
4366 	    if (trans->to < 0)
4367 		continue;
4368 	    atom = trans->atom;
4369 	    if ((atom == NULL) || (atom->valuep == NULL))
4370 		continue;
4371 	    if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4372 	        continue;
4373 	    } else if (trans->count == REGEXP_ALL_COUNTER) {
4374 	        continue;
4375 	    } else if (trans->counter >= 0) {
4376 	        continue;
4377 	    } else {
4378                 if ((exec->comp->states[trans->to] != NULL) &&
4379 		    (exec->comp->states[trans->to]->type ==
4380 		     XML_REGEXP_SINK_STATE)) {
4381 		    if (atom->neg)
4382 			values[nb++] = (xmlChar *) atom->valuep2;
4383 		    else
4384 			values[nb++] = (xmlChar *) atom->valuep;
4385 		    (*nbneg)++;
4386 		}
4387 	    }
4388 	}
4389     }
4390     return(0);
4391 }
4392 
4393 /**
4394  * xmlRegExecNextValues:
4395  * @exec: a regexp execution context
4396  * @nbval: pointer to the number of accepted values IN/OUT
4397  * @nbneg: return number of negative transitions
4398  * @values: pointer to the array of acceptable values
4399  * @terminal: return value if this was a terminal state
4400  *
4401  * Extract informations from the regexp execution,
4402  * the parameter @values must point to an array of @nbval string pointers
4403  * on return nbval will contain the number of possible strings in that
4404  * state and the @values array will be updated with them. The string values
4405  * returned will be freed with the @exec context and don't need to be
4406  * deallocated.
4407  *
4408  * Returns: 0 in case of success or -1 in case of error.
4409  */
4410 int
xmlRegExecNextValues(xmlRegExecCtxtPtr exec,int * nbval,int * nbneg,xmlChar ** values,int * terminal)4411 xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg,
4412                      xmlChar **values, int *terminal) {
4413     return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal));
4414 }
4415 
4416 /**
4417  * xmlRegExecErrInfo:
4418  * @exec: a regexp execution context generating an error
4419  * @string: return value for the error string
4420  * @nbval: pointer to the number of accepted values IN/OUT
4421  * @nbneg: return number of negative transitions
4422  * @values: pointer to the array of acceptable values
4423  * @terminal: return value if this was a terminal state
4424  *
4425  * Extract error informations from the regexp execution, the parameter
4426  * @string will be updated with the value pushed and not accepted,
4427  * the parameter @values must point to an array of @nbval string pointers
4428  * on return nbval will contain the number of possible strings in that
4429  * state and the @values array will be updated with them. The string values
4430  * returned will be freed with the @exec context and don't need to be
4431  * deallocated.
4432  *
4433  * Returns: 0 in case of success or -1 in case of error.
4434  */
4435 int
xmlRegExecErrInfo(xmlRegExecCtxtPtr exec,const xmlChar ** string,int * nbval,int * nbneg,xmlChar ** values,int * terminal)4436 xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string,
4437                   int *nbval, int *nbneg, xmlChar **values, int *terminal) {
4438     if (exec == NULL)
4439         return(-1);
4440     if (string != NULL) {
4441         if (exec->status != 0)
4442 	    *string = exec->errString;
4443 	else
4444 	    *string = NULL;
4445     }
4446     return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal));
4447 }
4448 
#ifdef DEBUG_ERR
/*
 * testerr:
 * @exec: a regexp execution context
 *
 * Debug helper, only built when DEBUG_ERR is defined: runs
 * xmlRegExecErrInfo() on @exec asking for up to 5 values and
 * discards everything it reports.
 */
static void testerr(xmlRegExecCtxtPtr exec) {
    xmlChar *expected[5];
    const xmlChar *rejected;
    int isFinal;
    int negCount;
    int slots = 5;

    xmlRegExecErrInfo(exec, &rejected, &slots, &negCount, expected, &isFinal);
}
#endif
4459 
#if 0
/*
 * xmlRegExecPushChar:
 * @exec: a regexp execution context
 * @UCS: NOTE(review): never referenced by the body, the characters are
 *       read from exec->inputString at exec->index instead
 *
 * Disabled code: the enclosing "#if 0" keeps this function out of the
 * build. It runs the automaton over the characters of exec->inputString,
 * saving alternatives with xmlFARegExecSave() and backtracking with
 * xmlFARegExecRollBack().
 *
 * NOTE(review): the function is declared as returning int but has no
 * return statement after the main loop; this has to be fixed before the
 * code is ever enabled.
 */
static int
xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) {
    xmlRegTransPtr trans;
    xmlRegAtomPtr atom;
    int ret;
    int codepoint, len;

    if (exec == NULL)
	return(-1);
    if (exec->status != 0)
	return(exec->status);

    /* loop until an error is raised or the input ends on a final state */
    while ((exec->status == 0) &&
	   ((exec->inputString[exec->index] != 0) ||
	    (exec->state->type != XML_REGEXP_FINAL_STATE))) {

	/*
	 * End of input on non-terminal state, rollback, however we may
	 * still have epsilon like transition for counted transitions
	 * on counters, in that case don't break too early.
	 */
	if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
	    goto rollback;

	exec->transcount = 0;
	/* try the transitions of the current state, resuming at transno */
	for (;exec->transno < exec->state->nbTrans;exec->transno++) {
	    trans = &exec->state->trans[exec->transno];
	    if (trans->to < 0)
		continue;
	    atom = trans->atom;
	    ret = 0;
	    if (trans->count >= 0) {
		int count;
		xmlRegCounterPtr counter;

		/*
		 * A counted transition.
		 */

		count = exec->counts[trans->count];
		counter = &exec->comp->counters[trans->count];
#ifdef DEBUG_REGEXP_EXEC
		printf("testing count %d: val %d, min %d, max %d\n",
		       trans->count, count, counter->min,  counter->max);
#endif
		ret = ((count >= counter->min) && (count <= counter->max));
	    } else if (atom == NULL) {
		fprintf(stderr, "epsilon transition left at runtime\n");
		exec->status = -2;
		break;
	    } else if (exec->inputString[exec->index] != 0) {
                codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
		ret = xmlRegCheckCharacter(atom, codepoint);
		if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
		    xmlRegStatePtr to = exec->comp->states[trans->to];

		    /*
		     * this is a multiple input sequence
		     */
		    if (exec->state->nbTrans > exec->transno + 1) {
			xmlFARegExecSave(exec);
		    }
		    exec->transcount = 1;
		    do {
			/*
			 * Try to progress as much as possible on the input
			 */
			if (exec->transcount == atom->max) {
			    break;
			}
			exec->index += len;
			/*
			 * End of input: stop here
			 */
			if (exec->inputString[exec->index] == 0) {
			    exec->index -= len;
			    break;
			}
			if (exec->transcount >= atom->min) {
			    int transno = exec->transno;
			    xmlRegStatePtr state = exec->state;

			    /*
			     * The transition is acceptable save it
			     */
			    exec->transno = -1; /* trick */
			    exec->state = to;
			    xmlFARegExecSave(exec);
			    exec->transno = transno;
			    exec->state = state;
			}
			codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
				              len);
			ret = xmlRegCheckCharacter(atom, codepoint);
			exec->transcount++;
		    } while (ret == 1);
		    if (exec->transcount < atom->min)
			ret = 0;

		    /*
		     * If the last check failed but one transition was found
		     * possible, rollback
		     */
		    if (ret < 0)
			ret = 0;
		    if (ret == 0) {
			goto rollback;
		    }
		}
	    }
	    if (ret == 1) {
		/* other transitions remain: save them for a later rollback */
		if (exec->state->nbTrans > exec->transno + 1) {
		    xmlFARegExecSave(exec);
		}
		/*
		 * restart count for expressions like this ((abc){2})*
		 */
		if (trans->count >= 0) {
#ifdef DEBUG_REGEXP_EXEC
		    printf("Reset count %d\n", trans->count);
#endif
		    exec->counts[trans->count] = 0;
		}
		if (trans->counter >= 0) {
#ifdef DEBUG_REGEXP_EXEC
		    printf("Increasing count %d\n", trans->counter);
#endif
		    exec->counts[trans->counter]++;
		}
#ifdef DEBUG_REGEXP_EXEC
		printf("entering state %d\n", trans->to);
#endif
		exec->state = exec->comp->states[trans->to];
		exec->transno = 0;
		/* only transitions carrying an atom consume input */
		if (trans->atom != NULL) {
		    exec->index += len;
		}
		goto progress;
	    } else if (ret < 0) {
		exec->status = -4;
		break;
	    }
	}
	if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
rollback:
	    /*
	     * Failed to find a way out
	     */
	    exec->determinist = 0;
	    xmlFARegExecRollBack(exec);
	}
progress:
	continue;
    }
}
#endif
4617 /************************************************************************
4618  *									*
4619  *	Parser for the Schemas Datatype Regular Expressions		*
4620  *	http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs	*
4621  *									*
4622  ************************************************************************/
4623 
4624 /**
4625  * xmlFAIsChar:
4626  * @ctxt:  a regexp parser context
4627  *
4628  * [10]   Char   ::=   [^.\?*+()|#x5B#x5D]
4629  */
4630 static int
xmlFAIsChar(xmlRegParserCtxtPtr ctxt)4631 xmlFAIsChar(xmlRegParserCtxtPtr ctxt) {
4632     int cur;
4633     int len;
4634 
4635     cur = CUR_SCHAR(ctxt->cur, len);
4636     if ((cur == '.') || (cur == '\\') || (cur == '?') ||
4637 	(cur == '*') || (cur == '+') || (cur == '(') ||
4638 	(cur == ')') || (cur == '|') || (cur == 0x5B) ||
4639 	(cur == 0x5D) || (cur == 0))
4640 	return(-1);
4641     return(cur);
4642 }
4643 
4644 /**
4645  * xmlFAParseCharProp:
4646  * @ctxt:  a regexp parser context
4647  *
4648  * [27]   charProp   ::=   IsCategory | IsBlock
4649  * [28]   IsCategory ::= Letters | Marks | Numbers | Punctuation |
4650  *                       Separators | Symbols | Others
4651  * [29]   Letters   ::=   'L' [ultmo]?
4652  * [30]   Marks   ::=   'M' [nce]?
4653  * [31]   Numbers   ::=   'N' [dlo]?
4654  * [32]   Punctuation   ::=   'P' [cdseifo]?
4655  * [33]   Separators   ::=   'Z' [slp]?
4656  * [34]   Symbols   ::=   'S' [mcko]?
4657  * [35]   Others   ::=   'C' [cfon]?
4658  * [36]   IsBlock   ::=   'Is' [a-zA-Z0-9#x2D]+
4659  */
4660 static void
xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt)4661 xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) {
4662     int cur;
4663     xmlRegAtomType type = (xmlRegAtomType) 0;
4664     xmlChar *blockName = NULL;
4665 
4666     cur = CUR;
4667     if (cur == 'L') {
4668 	NEXT;
4669 	cur = CUR;
4670 	if (cur == 'u') {
4671 	    NEXT;
4672 	    type = XML_REGEXP_LETTER_UPPERCASE;
4673 	} else if (cur == 'l') {
4674 	    NEXT;
4675 	    type = XML_REGEXP_LETTER_LOWERCASE;
4676 	} else if (cur == 't') {
4677 	    NEXT;
4678 	    type = XML_REGEXP_LETTER_TITLECASE;
4679 	} else if (cur == 'm') {
4680 	    NEXT;
4681 	    type = XML_REGEXP_LETTER_MODIFIER;
4682 	} else if (cur == 'o') {
4683 	    NEXT;
4684 	    type = XML_REGEXP_LETTER_OTHERS;
4685 	} else {
4686 	    type = XML_REGEXP_LETTER;
4687 	}
4688     } else if (cur == 'M') {
4689 	NEXT;
4690 	cur = CUR;
4691 	if (cur == 'n') {
4692 	    NEXT;
4693 	    /* nonspacing */
4694 	    type = XML_REGEXP_MARK_NONSPACING;
4695 	} else if (cur == 'c') {
4696 	    NEXT;
4697 	    /* spacing combining */
4698 	    type = XML_REGEXP_MARK_SPACECOMBINING;
4699 	} else if (cur == 'e') {
4700 	    NEXT;
4701 	    /* enclosing */
4702 	    type = XML_REGEXP_MARK_ENCLOSING;
4703 	} else {
4704 	    /* all marks */
4705 	    type = XML_REGEXP_MARK;
4706 	}
4707     } else if (cur == 'N') {
4708 	NEXT;
4709 	cur = CUR;
4710 	if (cur == 'd') {
4711 	    NEXT;
4712 	    /* digital */
4713 	    type = XML_REGEXP_NUMBER_DECIMAL;
4714 	} else if (cur == 'l') {
4715 	    NEXT;
4716 	    /* letter */
4717 	    type = XML_REGEXP_NUMBER_LETTER;
4718 	} else if (cur == 'o') {
4719 	    NEXT;
4720 	    /* other */
4721 	    type = XML_REGEXP_NUMBER_OTHERS;
4722 	} else {
4723 	    /* all numbers */
4724 	    type = XML_REGEXP_NUMBER;
4725 	}
4726     } else if (cur == 'P') {
4727 	NEXT;
4728 	cur = CUR;
4729 	if (cur == 'c') {
4730 	    NEXT;
4731 	    /* connector */
4732 	    type = XML_REGEXP_PUNCT_CONNECTOR;
4733 	} else if (cur == 'd') {
4734 	    NEXT;
4735 	    /* dash */
4736 	    type = XML_REGEXP_PUNCT_DASH;
4737 	} else if (cur == 's') {
4738 	    NEXT;
4739 	    /* open */
4740 	    type = XML_REGEXP_PUNCT_OPEN;
4741 	} else if (cur == 'e') {
4742 	    NEXT;
4743 	    /* close */
4744 	    type = XML_REGEXP_PUNCT_CLOSE;
4745 	} else if (cur == 'i') {
4746 	    NEXT;
4747 	    /* initial quote */
4748 	    type = XML_REGEXP_PUNCT_INITQUOTE;
4749 	} else if (cur == 'f') {
4750 	    NEXT;
4751 	    /* final quote */
4752 	    type = XML_REGEXP_PUNCT_FINQUOTE;
4753 	} else if (cur == 'o') {
4754 	    NEXT;
4755 	    /* other */
4756 	    type = XML_REGEXP_PUNCT_OTHERS;
4757 	} else {
4758 	    /* all punctuation */
4759 	    type = XML_REGEXP_PUNCT;
4760 	}
4761     } else if (cur == 'Z') {
4762 	NEXT;
4763 	cur = CUR;
4764 	if (cur == 's') {
4765 	    NEXT;
4766 	    /* space */
4767 	    type = XML_REGEXP_SEPAR_SPACE;
4768 	} else if (cur == 'l') {
4769 	    NEXT;
4770 	    /* line */
4771 	    type = XML_REGEXP_SEPAR_LINE;
4772 	} else if (cur == 'p') {
4773 	    NEXT;
4774 	    /* paragraph */
4775 	    type = XML_REGEXP_SEPAR_PARA;
4776 	} else {
4777 	    /* all separators */
4778 	    type = XML_REGEXP_SEPAR;
4779 	}
4780     } else if (cur == 'S') {
4781 	NEXT;
4782 	cur = CUR;
4783 	if (cur == 'm') {
4784 	    NEXT;
4785 	    type = XML_REGEXP_SYMBOL_MATH;
4786 	    /* math */
4787 	} else if (cur == 'c') {
4788 	    NEXT;
4789 	    type = XML_REGEXP_SYMBOL_CURRENCY;
4790 	    /* currency */
4791 	} else if (cur == 'k') {
4792 	    NEXT;
4793 	    type = XML_REGEXP_SYMBOL_MODIFIER;
4794 	    /* modifiers */
4795 	} else if (cur == 'o') {
4796 	    NEXT;
4797 	    type = XML_REGEXP_SYMBOL_OTHERS;
4798 	    /* other */
4799 	} else {
4800 	    /* all symbols */
4801 	    type = XML_REGEXP_SYMBOL;
4802 	}
4803     } else if (cur == 'C') {
4804 	NEXT;
4805 	cur = CUR;
4806 	if (cur == 'c') {
4807 	    NEXT;
4808 	    /* control */
4809 	    type = XML_REGEXP_OTHER_CONTROL;
4810 	} else if (cur == 'f') {
4811 	    NEXT;
4812 	    /* format */
4813 	    type = XML_REGEXP_OTHER_FORMAT;
4814 	} else if (cur == 'o') {
4815 	    NEXT;
4816 	    /* private use */
4817 	    type = XML_REGEXP_OTHER_PRIVATE;
4818 	} else if (cur == 'n') {
4819 	    NEXT;
4820 	    /* not assigned */
4821 	    type = XML_REGEXP_OTHER_NA;
4822 	} else {
4823 	    /* all others */
4824 	    type = XML_REGEXP_OTHER;
4825 	}
4826     } else if (cur == 'I') {
4827 	const xmlChar *start;
4828 	NEXT;
4829 	cur = CUR;
4830 	if (cur != 's') {
4831 	    ERROR("IsXXXX expected");
4832 	    return;
4833 	}
4834 	NEXT;
4835 	start = ctxt->cur;
4836 	cur = CUR;
4837 	if (((cur >= 'a') && (cur <= 'z')) ||
4838 	    ((cur >= 'A') && (cur <= 'Z')) ||
4839 	    ((cur >= '0') && (cur <= '9')) ||
4840 	    (cur == 0x2D)) {
4841 	    NEXT;
4842 	    cur = CUR;
4843 	    while (((cur >= 'a') && (cur <= 'z')) ||
4844 		((cur >= 'A') && (cur <= 'Z')) ||
4845 		((cur >= '0') && (cur <= '9')) ||
4846 		(cur == 0x2D)) {
4847 		NEXT;
4848 		cur = CUR;
4849 	    }
4850 	}
4851 	type = XML_REGEXP_BLOCK_NAME;
4852 	blockName = xmlStrndup(start, ctxt->cur - start);
4853     } else {
4854 	ERROR("Unknown char property");
4855 	return;
4856     }
4857     if (ctxt->atom == NULL) {
4858 	ctxt->atom = xmlRegNewAtom(ctxt, type);
4859 	if (ctxt->atom != NULL)
4860 	    ctxt->atom->valuep = blockName;
4861     } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4862         xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4863 		           type, 0, 0, blockName);
4864     }
4865 }
4866 
4867 /**
4868  * xmlFAParseCharClassEsc:
4869  * @ctxt:  a regexp parser context
4870  *
4871  * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc )
4872  * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E]
4873  * [25] catEsc   ::=   '\p{' charProp '}'
4874  * [26] complEsc ::=   '\P{' charProp '}'
4875  * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW])
4876  */
4877 static void
xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt)4878 xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
4879     int cur;
4880 
4881     if (CUR == '.') {
4882 	if (ctxt->atom == NULL) {
4883 	    ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR);
4884 	} else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4885 	    xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4886 			       XML_REGEXP_ANYCHAR, 0, 0, NULL);
4887 	}
4888 	NEXT;
4889 	return;
4890     }
4891     if (CUR != '\\') {
4892 	ERROR("Escaped sequence: expecting \\");
4893 	return;
4894     }
4895     NEXT;
4896     cur = CUR;
4897     if (cur == 'p') {
4898 	NEXT;
4899 	if (CUR != '{') {
4900 	    ERROR("Expecting '{'");
4901 	    return;
4902 	}
4903 	NEXT;
4904 	xmlFAParseCharProp(ctxt);
4905 	if (CUR != '}') {
4906 	    ERROR("Expecting '}'");
4907 	    return;
4908 	}
4909 	NEXT;
4910     } else if (cur == 'P') {
4911 	NEXT;
4912 	if (CUR != '{') {
4913 	    ERROR("Expecting '{'");
4914 	    return;
4915 	}
4916 	NEXT;
4917 	xmlFAParseCharProp(ctxt);
4918         if (ctxt->atom != NULL)
4919 	    ctxt->atom->neg = 1;
4920 	if (CUR != '}') {
4921 	    ERROR("Expecting '}'");
4922 	    return;
4923 	}
4924 	NEXT;
4925     } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') ||
4926 	(cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') ||
4927 	(cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') ||
4928 	(cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) ||
4929 	(cur == 0x5E)) {
4930 	if (ctxt->atom == NULL) {
4931 	    ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
4932 	    if (ctxt->atom != NULL) {
4933 	        switch (cur) {
4934 		    case 'n':
4935 		        ctxt->atom->codepoint = '\n';
4936 			break;
4937 		    case 'r':
4938 		        ctxt->atom->codepoint = '\r';
4939 			break;
4940 		    case 't':
4941 		        ctxt->atom->codepoint = '\t';
4942 			break;
4943 		    default:
4944 			ctxt->atom->codepoint = cur;
4945 		}
4946 	    }
4947 	} else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4948             switch (cur) {
4949                 case 'n':
4950                     cur = '\n';
4951                     break;
4952                 case 'r':
4953                     cur = '\r';
4954                     break;
4955                 case 't':
4956                     cur = '\t';
4957                     break;
4958             }
4959 	    xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4960 			       XML_REGEXP_CHARVAL, cur, cur, NULL);
4961 	}
4962 	NEXT;
4963     } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') ||
4964 	(cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') ||
4965 	(cur == 'w') || (cur == 'W')) {
4966 	xmlRegAtomType type = XML_REGEXP_ANYSPACE;
4967 
4968 	switch (cur) {
4969 	    case 's':
4970 		type = XML_REGEXP_ANYSPACE;
4971 		break;
4972 	    case 'S':
4973 		type = XML_REGEXP_NOTSPACE;
4974 		break;
4975 	    case 'i':
4976 		type = XML_REGEXP_INITNAME;
4977 		break;
4978 	    case 'I':
4979 		type = XML_REGEXP_NOTINITNAME;
4980 		break;
4981 	    case 'c':
4982 		type = XML_REGEXP_NAMECHAR;
4983 		break;
4984 	    case 'C':
4985 		type = XML_REGEXP_NOTNAMECHAR;
4986 		break;
4987 	    case 'd':
4988 		type = XML_REGEXP_DECIMAL;
4989 		break;
4990 	    case 'D':
4991 		type = XML_REGEXP_NOTDECIMAL;
4992 		break;
4993 	    case 'w':
4994 		type = XML_REGEXP_REALCHAR;
4995 		break;
4996 	    case 'W':
4997 		type = XML_REGEXP_NOTREALCHAR;
4998 		break;
4999 	}
5000 	NEXT;
5001 	if (ctxt->atom == NULL) {
5002 	    ctxt->atom = xmlRegNewAtom(ctxt, type);
5003 	} else if (ctxt->atom->type == XML_REGEXP_RANGES) {
5004 	    xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5005 			       type, 0, 0, NULL);
5006 	}
5007     } else {
5008 	ERROR("Wrong escape sequence, misuse of character '\\'");
5009     }
5010 }
5011 
5012 /**
5013  * xmlFAParseCharRange:
5014  * @ctxt:  a regexp parser context
5015  *
5016  * [17]   charRange   ::=     seRange | XmlCharRef | XmlCharIncDash
5017  * [18]   seRange   ::=   charOrEsc '-' charOrEsc
5018  * [20]   charOrEsc   ::=   XmlChar | SingleCharEsc
5019  * [21]   XmlChar   ::=   [^\#x2D#x5B#x5D]
5020  * [22]   XmlCharIncDash   ::=   [^\#x5B#x5D]
5021  */
5022 static void
xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt)5023 xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) {
5024     int cur, len;
5025     int start = -1;
5026     int end = -1;
5027 
5028     if (CUR == '\0') {
5029         ERROR("Expecting ']'");
5030 	return;
5031     }
5032 
5033     cur = CUR;
5034     if (cur == '\\') {
5035 	NEXT;
5036 	cur = CUR;
5037 	switch (cur) {
5038 	    case 'n': start = 0xA; break;
5039 	    case 'r': start = 0xD; break;
5040 	    case 't': start = 0x9; break;
5041 	    case '\\': case '|': case '.': case '-': case '^': case '?':
5042 	    case '*': case '+': case '{': case '}': case '(': case ')':
5043 	    case '[': case ']':
5044 		start = cur; break;
5045 	    default:
5046 		ERROR("Invalid escape value");
5047 		return;
5048 	}
5049 	end = start;
5050         len = 1;
5051     } else if ((cur != 0x5B) && (cur != 0x5D)) {
5052         end = start = CUR_SCHAR(ctxt->cur, len);
5053     } else {
5054 	ERROR("Expecting a char range");
5055 	return;
5056     }
5057     /*
5058      * Since we are "inside" a range, we can assume ctxt->cur is past
5059      * the start of ctxt->string, and PREV should be safe
5060      */
5061     if ((start == '-') && (NXT(1) != ']') && (PREV != '[') && (PREV != '^')) {
5062 	NEXTL(len);
5063 	return;
5064     }
5065     NEXTL(len);
5066     cur = CUR;
5067     if ((cur != '-') || (NXT(1) == ']')) {
5068         xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5069 		              XML_REGEXP_CHARVAL, start, end, NULL);
5070 	return;
5071     }
5072     NEXT;
5073     cur = CUR;
5074     if (cur == '\\') {
5075 	NEXT;
5076 	cur = CUR;
5077 	switch (cur) {
5078 	    case 'n': end = 0xA; break;
5079 	    case 'r': end = 0xD; break;
5080 	    case 't': end = 0x9; break;
5081 	    case '\\': case '|': case '.': case '-': case '^': case '?':
5082 	    case '*': case '+': case '{': case '}': case '(': case ')':
5083 	    case '[': case ']':
5084 		end = cur; break;
5085 	    default:
5086 		ERROR("Invalid escape value");
5087 		return;
5088 	}
5089         len = 1;
5090     } else if ((cur != '\0') && (cur != 0x5B) && (cur != 0x5D)) {
5091         end = CUR_SCHAR(ctxt->cur, len);
5092     } else {
5093 	ERROR("Expecting the end of a char range");
5094 	return;
5095     }
5096 
5097     /* TODO check that the values are acceptable character ranges for XML */
5098     if (end < start) {
5099 	ERROR("End of range is before start of range");
5100     } else {
5101         NEXTL(len);
5102         xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5103 		           XML_REGEXP_CHARVAL, start, end, NULL);
5104     }
5105     return;
5106 }
5107 
5108 /**
5109  * xmlFAParsePosCharGroup:
5110  * @ctxt:  a regexp parser context
5111  *
5112  * [14]   posCharGroup ::= ( charRange | charClassEsc  )+
5113  */
5114 static void
xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt)5115 xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) {
5116     do {
5117 	if (CUR == '\\') {
5118 	    xmlFAParseCharClassEsc(ctxt);
5119 	} else {
5120 	    xmlFAParseCharRange(ctxt);
5121 	}
5122     } while ((CUR != ']') && (CUR != '^') && (CUR != '-') &&
5123              (CUR != 0) && (ctxt->error == 0));
5124 }
5125 
5126 /**
5127  * xmlFAParseCharGroup:
5128  * @ctxt:  a regexp parser context
5129  *
5130  * [13]   charGroup    ::= posCharGroup | negCharGroup | charClassSub
5131  * [15]   negCharGroup ::= '^' posCharGroup
5132  * [16]   charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr
5133  * [12]   charClassExpr ::= '[' charGroup ']'
5134  */
5135 static void
xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt)5136 xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) {
5137     int n = ctxt->neg;
5138     while ((CUR != ']') && (ctxt->error == 0)) {
5139 	if (CUR == '^') {
5140 	    int neg = ctxt->neg;
5141 
5142 	    NEXT;
5143 	    ctxt->neg = !ctxt->neg;
5144 	    xmlFAParsePosCharGroup(ctxt);
5145 	    ctxt->neg = neg;
5146 	} else if ((CUR == '-') && (NXT(1) == '[')) {
5147 	    int neg = ctxt->neg;
5148 	    ctxt->neg = 2;
5149 	    NEXT;	/* eat the '-' */
5150 	    NEXT;	/* eat the '[' */
5151 	    xmlFAParseCharGroup(ctxt);
5152 	    if (CUR == ']') {
5153 		NEXT;
5154 	    } else {
5155 		ERROR("charClassExpr: ']' expected");
5156 		break;
5157 	    }
5158 	    ctxt->neg = neg;
5159 	    break;
5160 	} else if (CUR != ']') {
5161 	    xmlFAParsePosCharGroup(ctxt);
5162 	}
5163     }
5164     ctxt->neg = n;
5165 }
5166 
5167 /**
5168  * xmlFAParseCharClass:
5169  * @ctxt:  a regexp parser context
5170  *
5171  * [11]   charClass   ::=     charClassEsc | charClassExpr
5172  * [12]   charClassExpr   ::=   '[' charGroup ']'
5173  */
5174 static void
xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt)5175 xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) {
5176     if (CUR == '[') {
5177 	NEXT;
5178 	ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES);
5179 	if (ctxt->atom == NULL)
5180 	    return;
5181 	xmlFAParseCharGroup(ctxt);
5182 	if (CUR == ']') {
5183 	    NEXT;
5184 	} else {
5185 	    ERROR("xmlFAParseCharClass: ']' expected");
5186 	}
5187     } else {
5188 	xmlFAParseCharClassEsc(ctxt);
5189     }
5190 }
5191 
5192 /**
5193  * xmlFAParseQuantExact:
5194  * @ctxt:  a regexp parser context
5195  *
5196  * [8]   QuantExact   ::=   [0-9]+
5197  *
5198  * Returns 0 if success or -1 in case of error
5199  */
5200 static int
xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt)5201 xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) {
5202     int ret = 0;
5203     int ok = 0;
5204 
5205     while ((CUR >= '0') && (CUR <= '9')) {
5206 	ret = ret * 10 + (CUR - '0');
5207 	ok = 1;
5208 	NEXT;
5209     }
5210     if (ok != 1) {
5211 	return(-1);
5212     }
5213     return(ret);
5214 }
5215 
5216 /**
5217  * xmlFAParseQuantifier:
5218  * @ctxt:  a regexp parser context
5219  *
5220  * [4]   quantifier   ::=   [?*+] | ( '{' quantity '}' )
5221  * [5]   quantity   ::=   quantRange | quantMin | QuantExact
5222  * [6]   quantRange   ::=   QuantExact ',' QuantExact
5223  * [7]   quantMin   ::=   QuantExact ','
5224  * [8]   QuantExact   ::=   [0-9]+
5225  */
5226 static int
xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt)5227 xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) {
5228     int cur;
5229 
5230     cur = CUR;
5231     if ((cur == '?') || (cur == '*') || (cur == '+')) {
5232 	if (ctxt->atom != NULL) {
5233 	    if (cur == '?')
5234 		ctxt->atom->quant = XML_REGEXP_QUANT_OPT;
5235 	    else if (cur == '*')
5236 		ctxt->atom->quant = XML_REGEXP_QUANT_MULT;
5237 	    else if (cur == '+')
5238 		ctxt->atom->quant = XML_REGEXP_QUANT_PLUS;
5239 	}
5240 	NEXT;
5241 	return(1);
5242     }
5243     if (cur == '{') {
5244 	int min = 0, max = 0;
5245 
5246 	NEXT;
5247 	cur = xmlFAParseQuantExact(ctxt);
5248 	if (cur >= 0)
5249 	    min = cur;
5250 	if (CUR == ',') {
5251 	    NEXT;
5252 	    if (CUR == '}')
5253 	        max = INT_MAX;
5254 	    else {
5255 	        cur = xmlFAParseQuantExact(ctxt);
5256 	        if (cur >= 0)
5257 		    max = cur;
5258 		else {
5259 		    ERROR("Improper quantifier");
5260 		}
5261 	    }
5262 	}
5263 	if (CUR == '}') {
5264 	    NEXT;
5265 	} else {
5266 	    ERROR("Unterminated quantifier");
5267 	}
5268 	if (max == 0)
5269 	    max = min;
5270 	if (ctxt->atom != NULL) {
5271 	    ctxt->atom->quant = XML_REGEXP_QUANT_RANGE;
5272 	    ctxt->atom->min = min;
5273 	    ctxt->atom->max = max;
5274 	}
5275 	return(1);
5276     }
5277     return(0);
5278 }
5279 
5280 /**
5281  * xmlFAParseAtom:
5282  * @ctxt:  a regexp parser context
5283  *
5284  * [9]   atom   ::=   Char | charClass | ( '(' regExp ')' )
5285  */
5286 static int
xmlFAParseAtom(xmlRegParserCtxtPtr ctxt)5287 xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) {
5288     int codepoint, len;
5289 
5290     codepoint = xmlFAIsChar(ctxt);
5291     if (codepoint > 0) {
5292 	ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
5293 	if (ctxt->atom == NULL)
5294 	    return(-1);
5295 	codepoint = CUR_SCHAR(ctxt->cur, len);
5296 	ctxt->atom->codepoint = codepoint;
5297 	NEXTL(len);
5298 	return(1);
5299     } else if (CUR == '|') {
5300 	return(0);
5301     } else if (CUR == 0) {
5302 	return(0);
5303     } else if (CUR == ')') {
5304 	return(0);
5305     } else if (CUR == '(') {
5306 	xmlRegStatePtr start, oldend, start0;
5307 
5308 	NEXT;
5309 	/*
5310 	 * this extra Epsilon transition is needed if we count with 0 allowed
5311 	 * unfortunately this can't be known at that point
5312 	 */
5313 	xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5314 	start0 = ctxt->state;
5315 	xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5316 	start = ctxt->state;
5317 	oldend = ctxt->end;
5318 	ctxt->end = NULL;
5319 	ctxt->atom = NULL;
5320 	xmlFAParseRegExp(ctxt, 0);
5321 	if (CUR == ')') {
5322 	    NEXT;
5323 	} else {
5324 	    ERROR("xmlFAParseAtom: expecting ')'");
5325 	}
5326 	ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG);
5327 	if (ctxt->atom == NULL)
5328 	    return(-1);
5329 	ctxt->atom->start = start;
5330 	ctxt->atom->start0 = start0;
5331 	ctxt->atom->stop = ctxt->state;
5332 	ctxt->end = oldend;
5333 	return(1);
5334     } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) {
5335 	xmlFAParseCharClass(ctxt);
5336 	return(1);
5337     }
5338     return(0);
5339 }
5340 
5341 /**
5342  * xmlFAParsePiece:
5343  * @ctxt:  a regexp parser context
5344  *
5345  * [3]   piece   ::=   atom quantifier?
5346  */
5347 static int
xmlFAParsePiece(xmlRegParserCtxtPtr ctxt)5348 xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) {
5349     int ret;
5350 
5351     ctxt->atom = NULL;
5352     ret = xmlFAParseAtom(ctxt);
5353     if (ret == 0)
5354 	return(0);
5355     if (ctxt->atom == NULL) {
5356 	ERROR("internal: no atom generated");
5357     }
5358     xmlFAParseQuantifier(ctxt);
5359     return(1);
5360 }
5361 
5362 /**
5363  * xmlFAParseBranch:
5364  * @ctxt:  a regexp parser context
5365  * @to: optional target to the end of the branch
5366  *
5367  * @to is used to optimize by removing duplicate path in automata
5368  * in expressions like (a|b)(c|d)
5369  *
5370  * [2]   branch   ::=   piece*
5371  */
5372 static int
xmlFAParseBranch(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr to)5373 xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) {
5374     xmlRegStatePtr previous;
5375     int ret;
5376 
5377     previous = ctxt->state;
5378     ret = xmlFAParsePiece(ctxt);
5379     if (ret != 0) {
5380 	if (xmlFAGenerateTransitions(ctxt, previous,
5381 	        (CUR=='|' || CUR==')') ? to : NULL, ctxt->atom) < 0)
5382 	    return(-1);
5383 	previous = ctxt->state;
5384 	ctxt->atom = NULL;
5385     }
5386     while ((ret != 0) && (ctxt->error == 0)) {
5387 	ret = xmlFAParsePiece(ctxt);
5388 	if (ret != 0) {
5389 	    if (xmlFAGenerateTransitions(ctxt, previous,
5390 	            (CUR=='|' || CUR==')') ? to : NULL, ctxt->atom) < 0)
5391 		    return(-1);
5392 	    previous = ctxt->state;
5393 	    ctxt->atom = NULL;
5394 	}
5395     }
5396     return(0);
5397 }
5398 
5399 /**
5400  * xmlFAParseRegExp:
5401  * @ctxt:  a regexp parser context
5402  * @top:  is this the top-level expression ?
5403  *
5404  * [1]   regExp   ::=     branch  ( '|' branch )*
5405  */
5406 static void
xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt,int top)5407 xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) {
5408     xmlRegStatePtr start, end;
5409 
5410     /* if not top start should have been generated by an epsilon trans */
5411     start = ctxt->state;
5412     ctxt->end = NULL;
5413     xmlFAParseBranch(ctxt, NULL);
5414     if (top) {
5415 #ifdef DEBUG_REGEXP_GRAPH
5416 	printf("State %d is final\n", ctxt->state->no);
5417 #endif
5418 	ctxt->state->type = XML_REGEXP_FINAL_STATE;
5419     }
5420     if (CUR != '|') {
5421 	ctxt->end = ctxt->state;
5422 	return;
5423     }
5424     end = ctxt->state;
5425     while ((CUR == '|') && (ctxt->error == 0)) {
5426 	NEXT;
5427 	if (CUR == 0) {
5428 	    ERROR("expecting a branch after |")
5429 	    return;
5430 	}
5431 	ctxt->state = start;
5432 	ctxt->end = NULL;
5433 	xmlFAParseBranch(ctxt, end);
5434     }
5435     if (!top) {
5436 	ctxt->state = end;
5437 	ctxt->end = end;
5438     }
5439 }
5440 
5441 /************************************************************************
5442  *									*
5443  *			The basic API					*
5444  *									*
5445  ************************************************************************/
5446 
5447 /**
5448  * xmlRegexpPrint:
5449  * @output: the file for the output debug
5450  * @regexp: the compiled regexp
5451  *
5452  * Print the content of the compiled regular expression
5453  */
5454 void
xmlRegexpPrint(FILE * output,xmlRegexpPtr regexp)5455 xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) {
5456     int i;
5457 
5458     if (output == NULL)
5459         return;
5460     fprintf(output, " regexp: ");
5461     if (regexp == NULL) {
5462 	fprintf(output, "NULL\n");
5463 	return;
5464     }
5465     fprintf(output, "'%s' ", regexp->string);
5466     fprintf(output, "\n");
5467     fprintf(output, "%d atoms:\n", regexp->nbAtoms);
5468     for (i = 0;i < regexp->nbAtoms; i++) {
5469 	fprintf(output, " %02d ", i);
5470 	xmlRegPrintAtom(output, regexp->atoms[i]);
5471     }
5472     fprintf(output, "%d states:", regexp->nbStates);
5473     fprintf(output, "\n");
5474     for (i = 0;i < regexp->nbStates; i++) {
5475 	xmlRegPrintState(output, regexp->states[i]);
5476     }
5477     fprintf(output, "%d counters:\n", regexp->nbCounters);
5478     for (i = 0;i < regexp->nbCounters; i++) {
5479 	fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min,
5480 		                                regexp->counters[i].max);
5481     }
5482 }
5483 
5484 /**
5485  * xmlRegexpCompile:
5486  * @regexp:  a regular expression string
5487  *
5488  * Parses a regular expression conforming to XML Schemas Part 2 Datatype
5489  * Appendix F and builds an automata suitable for testing strings against
5490  * that regular expression
5491  *
5492  * Returns the compiled expression or NULL in case of error
5493  */
5494 xmlRegexpPtr
xmlRegexpCompile(const xmlChar * regexp)5495 xmlRegexpCompile(const xmlChar *regexp) {
5496     xmlRegexpPtr ret;
5497     xmlRegParserCtxtPtr ctxt;
5498 
5499     ctxt = xmlRegNewParserCtxt(regexp);
5500     if (ctxt == NULL)
5501 	return(NULL);
5502 
5503     /* initialize the parser */
5504     ctxt->end = NULL;
5505     ctxt->start = ctxt->state = xmlRegNewState(ctxt);
5506     xmlRegStatePush(ctxt, ctxt->start);
5507 
5508     /* parse the expression building an automata */
5509     xmlFAParseRegExp(ctxt, 1);
5510     if (CUR != 0) {
5511 	ERROR("xmlFAParseRegExp: extra characters");
5512     }
5513     if (ctxt->error != 0) {
5514 	xmlRegFreeParserCtxt(ctxt);
5515 	return(NULL);
5516     }
5517     ctxt->end = ctxt->state;
5518     ctxt->start->type = XML_REGEXP_START_STATE;
5519     ctxt->end->type = XML_REGEXP_FINAL_STATE;
5520 
5521     /* remove the Epsilon except for counted transitions */
5522     xmlFAEliminateEpsilonTransitions(ctxt);
5523 
5524 
5525     if (ctxt->error != 0) {
5526 	xmlRegFreeParserCtxt(ctxt);
5527 	return(NULL);
5528     }
5529     ret = xmlRegEpxFromParse(ctxt);
5530     xmlRegFreeParserCtxt(ctxt);
5531     return(ret);
5532 }
5533 
5534 /**
5535  * xmlRegexpExec:
5536  * @comp:  the compiled regular expression
5537  * @content:  the value to check against the regular expression
5538  *
5539  * Check if the regular expression generates the value
5540  *
5541  * Returns 1 if it matches, 0 if not and a negative value in case of error
5542  */
5543 int
xmlRegexpExec(xmlRegexpPtr comp,const xmlChar * content)5544 xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) {
5545     if ((comp == NULL) || (content == NULL))
5546 	return(-1);
5547     return(xmlFARegExec(comp, content));
5548 }
5549 
5550 /**
5551  * xmlRegexpIsDeterminist:
5552  * @comp:  the compiled regular expression
5553  *
5554  * Check if the regular expression is determinist
5555  *
5556  * Returns 1 if it yes, 0 if not and a negative value in case of error
5557  */
5558 int
xmlRegexpIsDeterminist(xmlRegexpPtr comp)5559 xmlRegexpIsDeterminist(xmlRegexpPtr comp) {
5560     xmlAutomataPtr am;
5561     int ret;
5562 
5563     if (comp == NULL)
5564 	return(-1);
5565     if (comp->determinist != -1)
5566 	return(comp->determinist);
5567 
5568     am = xmlNewAutomata();
5569     if (am->states != NULL) {
5570 	int i;
5571 
5572 	for (i = 0;i < am->nbStates;i++)
5573 	    xmlRegFreeState(am->states[i]);
5574 	xmlFree(am->states);
5575     }
5576     am->nbAtoms = comp->nbAtoms;
5577     am->atoms = comp->atoms;
5578     am->nbStates = comp->nbStates;
5579     am->states = comp->states;
5580     am->determinist = -1;
5581     am->flags = comp->flags;
5582     ret = xmlFAComputesDeterminism(am);
5583     am->atoms = NULL;
5584     am->states = NULL;
5585     xmlFreeAutomata(am);
5586     comp->determinist = ret;
5587     return(ret);
5588 }
5589 
5590 /**
5591  * xmlRegFreeRegexp:
5592  * @regexp:  the regexp
5593  *
5594  * Free a regexp
5595  */
5596 void
xmlRegFreeRegexp(xmlRegexpPtr regexp)5597 xmlRegFreeRegexp(xmlRegexpPtr regexp) {
5598     int i;
5599     if (regexp == NULL)
5600 	return;
5601 
5602     if (regexp->string != NULL)
5603 	xmlFree(regexp->string);
5604     if (regexp->states != NULL) {
5605 	for (i = 0;i < regexp->nbStates;i++)
5606 	    xmlRegFreeState(regexp->states[i]);
5607 	xmlFree(regexp->states);
5608     }
5609     if (regexp->atoms != NULL) {
5610 	for (i = 0;i < regexp->nbAtoms;i++)
5611 	    xmlRegFreeAtom(regexp->atoms[i]);
5612 	xmlFree(regexp->atoms);
5613     }
5614     if (regexp->counters != NULL)
5615 	xmlFree(regexp->counters);
5616     if (regexp->compact != NULL)
5617 	xmlFree(regexp->compact);
5618     if (regexp->transdata != NULL)
5619 	xmlFree(regexp->transdata);
5620     if (regexp->stringMap != NULL) {
5621 	for (i = 0; i < regexp->nbstrings;i++)
5622 	    xmlFree(regexp->stringMap[i]);
5623 	xmlFree(regexp->stringMap);
5624     }
5625 
5626     xmlFree(regexp);
5627 }
5628 
5629 #ifdef LIBXML_AUTOMATA_ENABLED
5630 /************************************************************************
5631  *									*
5632  *			The Automata interface				*
5633  *									*
5634  ************************************************************************/
5635 
5636 /**
5637  * xmlNewAutomata:
5638  *
5639  * Create a new automata
5640  *
5641  * Returns the new object or NULL in case of failure
5642  */
5643 xmlAutomataPtr
xmlNewAutomata(void)5644 xmlNewAutomata(void) {
5645     xmlAutomataPtr ctxt;
5646 
5647     ctxt = xmlRegNewParserCtxt(NULL);
5648     if (ctxt == NULL)
5649 	return(NULL);
5650 
5651     /* initialize the parser */
5652     ctxt->end = NULL;
5653     ctxt->start = ctxt->state = xmlRegNewState(ctxt);
5654     if (ctxt->start == NULL) {
5655 	xmlFreeAutomata(ctxt);
5656 	return(NULL);
5657     }
5658     ctxt->start->type = XML_REGEXP_START_STATE;
5659     if (xmlRegStatePush(ctxt, ctxt->start) < 0) {
5660         xmlRegFreeState(ctxt->start);
5661 	xmlFreeAutomata(ctxt);
5662 	return(NULL);
5663     }
5664     ctxt->flags = 0;
5665 
5666     return(ctxt);
5667 }
5668 
5669 /**
5670  * xmlFreeAutomata:
5671  * @am: an automata
5672  *
5673  * Free an automata
5674  */
5675 void
xmlFreeAutomata(xmlAutomataPtr am)5676 xmlFreeAutomata(xmlAutomataPtr am) {
5677     if (am == NULL)
5678 	return;
5679     xmlRegFreeParserCtxt(am);
5680 }
5681 
5682 /**
5683  * xmlAutomataSetFlags:
5684  * @am: an automata
5685  * @flags:  a set of internal flags
5686  *
5687  * Set some flags on the automata
5688  */
5689 void
xmlAutomataSetFlags(xmlAutomataPtr am,int flags)5690 xmlAutomataSetFlags(xmlAutomataPtr am, int flags) {
5691     if (am == NULL)
5692 	return;
5693     am->flags |= flags;
5694 }
5695 
5696 /**
5697  * xmlAutomataGetInitState:
5698  * @am: an automata
5699  *
5700  * Initial state lookup
5701  *
5702  * Returns the initial state of the automata
5703  */
5704 xmlAutomataStatePtr
xmlAutomataGetInitState(xmlAutomataPtr am)5705 xmlAutomataGetInitState(xmlAutomataPtr am) {
5706     if (am == NULL)
5707 	return(NULL);
5708     return(am->start);
5709 }
5710 
5711 /**
5712  * xmlAutomataSetFinalState:
5713  * @am: an automata
5714  * @state: a state in this automata
5715  *
5716  * Makes that state a final state
5717  *
5718  * Returns 0 or -1 in case of error
5719  */
5720 int
xmlAutomataSetFinalState(xmlAutomataPtr am,xmlAutomataStatePtr state)5721 xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) {
5722     if ((am == NULL) || (state == NULL))
5723 	return(-1);
5724     state->type = XML_REGEXP_FINAL_STATE;
5725     return(0);
5726 }
5727 
5728 /**
5729  * xmlAutomataNewTransition:
5730  * @am: an automata
5731  * @from: the starting point of the transition
5732  * @to: the target point of the transition or NULL
5733  * @token: the input string associated to that transition
5734  * @data: data passed to the callback function if the transition is activated
5735  *
5736  * If @to is NULL, this creates first a new target state in the automata
5737  * and then adds a transition from the @from state to the target state
5738  * activated by the value of @token
5739  *
5740  * Returns the target state or NULL in case of error
5741  */
5742 xmlAutomataStatePtr
xmlAutomataNewTransition(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,void * data)5743 xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from,
5744 			 xmlAutomataStatePtr to, const xmlChar *token,
5745 			 void *data) {
5746     xmlRegAtomPtr atom;
5747 
5748     if ((am == NULL) || (from == NULL) || (token == NULL))
5749 	return(NULL);
5750     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5751     if (atom == NULL)
5752         return(NULL);
5753     atom->data = data;
5754     atom->valuep = xmlStrdup(token);
5755 
5756     if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5757         xmlRegFreeAtom(atom);
5758 	return(NULL);
5759     }
5760     if (to == NULL)
5761 	return(am->state);
5762     return(to);
5763 }
5764 
5765 /**
5766  * xmlAutomataNewTransition2:
5767  * @am: an automata
5768  * @from: the starting point of the transition
5769  * @to: the target point of the transition or NULL
5770  * @token: the first input string associated to that transition
5771  * @token2: the second input string associated to that transition
5772  * @data: data passed to the callback function if the transition is activated
5773  *
5774  * If @to is NULL, this creates first a new target state in the automata
5775  * and then adds a transition from the @from state to the target state
5776  * activated by the value of @token
5777  *
5778  * Returns the target state or NULL in case of error
5779  */
5780 xmlAutomataStatePtr
xmlAutomataNewTransition2(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,const xmlChar * token2,void * data)5781 xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5782 			  xmlAutomataStatePtr to, const xmlChar *token,
5783 			  const xmlChar *token2, void *data) {
5784     xmlRegAtomPtr atom;
5785 
5786     if ((am == NULL) || (from == NULL) || (token == NULL))
5787 	return(NULL);
5788     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5789     if (atom == NULL)
5790 	return(NULL);
5791     atom->data = data;
5792     if ((token2 == NULL) || (*token2 == 0)) {
5793 	atom->valuep = xmlStrdup(token);
5794     } else {
5795 	int lenn, lenp;
5796 	xmlChar *str;
5797 
5798 	lenn = strlen((char *) token2);
5799 	lenp = strlen((char *) token);
5800 
5801 	str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5802 	if (str == NULL) {
5803 	    xmlRegFreeAtom(atom);
5804 	    return(NULL);
5805 	}
5806 	memcpy(&str[0], token, lenp);
5807 	str[lenp] = '|';
5808 	memcpy(&str[lenp + 1], token2, lenn);
5809 	str[lenn + lenp + 1] = 0;
5810 
5811 	atom->valuep = str;
5812     }
5813 
5814     if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5815         xmlRegFreeAtom(atom);
5816 	return(NULL);
5817     }
5818     if (to == NULL)
5819 	return(am->state);
5820     return(to);
5821 }
5822 
5823 /**
5824  * xmlAutomataNewNegTrans:
5825  * @am: an automata
5826  * @from: the starting point of the transition
5827  * @to: the target point of the transition or NULL
5828  * @token: the first input string associated to that transition
5829  * @token2: the second input string associated to that transition
5830  * @data: data passed to the callback function if the transition is activated
5831  *
5832  * If @to is NULL, this creates first a new target state in the automata
5833  * and then adds a transition from the @from state to the target state
5834  * activated by any value except (@token,@token2)
5835  * Note that if @token2 is not NULL, then (X, NULL) won't match to follow
5836  # the semantic of XSD ##other
5837  *
5838  * Returns the target state or NULL in case of error
5839  */
5840 xmlAutomataStatePtr
xmlAutomataNewNegTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,const xmlChar * token2,void * data)5841 xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5842 		       xmlAutomataStatePtr to, const xmlChar *token,
5843 		       const xmlChar *token2, void *data) {
5844     xmlRegAtomPtr atom;
5845     xmlChar err_msg[200];
5846 
5847     if ((am == NULL) || (from == NULL) || (token == NULL))
5848 	return(NULL);
5849     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5850     if (atom == NULL)
5851 	return(NULL);
5852     atom->data = data;
5853     atom->neg = 1;
5854     if ((token2 == NULL) || (*token2 == 0)) {
5855 	atom->valuep = xmlStrdup(token);
5856     } else {
5857 	int lenn, lenp;
5858 	xmlChar *str;
5859 
5860 	lenn = strlen((char *) token2);
5861 	lenp = strlen((char *) token);
5862 
5863 	str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5864 	if (str == NULL) {
5865 	    xmlRegFreeAtom(atom);
5866 	    return(NULL);
5867 	}
5868 	memcpy(&str[0], token, lenp);
5869 	str[lenp] = '|';
5870 	memcpy(&str[lenp + 1], token2, lenn);
5871 	str[lenn + lenp + 1] = 0;
5872 
5873 	atom->valuep = str;
5874     }
5875     snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep);
5876     err_msg[199] = 0;
5877     atom->valuep2 = xmlStrdup(err_msg);
5878 
5879     if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5880         xmlRegFreeAtom(atom);
5881 	return(NULL);
5882     }
5883     am->negs++;
5884     if (to == NULL)
5885 	return(am->state);
5886     return(to);
5887 }
5888 
5889 /**
5890  * xmlAutomataNewCountTrans2:
5891  * @am: an automata
5892  * @from: the starting point of the transition
5893  * @to: the target point of the transition or NULL
5894  * @token: the input string associated to that transition
5895  * @token2: the second input string associated to that transition
5896  * @min:  the minimum successive occurences of token
5897  * @max:  the maximum successive occurences of token
5898  * @data:  data associated to the transition
5899  *
5900  * If @to is NULL, this creates first a new target state in the automata
5901  * and then adds a transition from the @from state to the target state
5902  * activated by a succession of input of value @token and @token2 and
5903  * whose number is between @min and @max
5904  *
5905  * Returns the target state or NULL in case of error
5906  */
5907 xmlAutomataStatePtr
xmlAutomataNewCountTrans2(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,const xmlChar * token2,int min,int max,void * data)5908 xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5909 			 xmlAutomataStatePtr to, const xmlChar *token,
5910 			 const xmlChar *token2,
5911 			 int min, int max, void *data) {
5912     xmlRegAtomPtr atom;
5913     int counter;
5914 
5915     if ((am == NULL) || (from == NULL) || (token == NULL))
5916 	return(NULL);
5917     if (min < 0)
5918 	return(NULL);
5919     if ((max < min) || (max < 1))
5920 	return(NULL);
5921     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5922     if (atom == NULL)
5923 	return(NULL);
5924     if ((token2 == NULL) || (*token2 == 0)) {
5925 	atom->valuep = xmlStrdup(token);
5926     } else {
5927 	int lenn, lenp;
5928 	xmlChar *str;
5929 
5930 	lenn = strlen((char *) token2);
5931 	lenp = strlen((char *) token);
5932 
5933 	str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5934 	if (str == NULL) {
5935 	    xmlRegFreeAtom(atom);
5936 	    return(NULL);
5937 	}
5938 	memcpy(&str[0], token, lenp);
5939 	str[lenp] = '|';
5940 	memcpy(&str[lenp + 1], token2, lenn);
5941 	str[lenn + lenp + 1] = 0;
5942 
5943 	atom->valuep = str;
5944     }
5945     atom->data = data;
5946     if (min == 0)
5947 	atom->min = 1;
5948     else
5949 	atom->min = min;
5950     atom->max = max;
5951 
5952     /*
5953      * associate a counter to the transition.
5954      */
5955     counter = xmlRegGetCounter(am);
5956     am->counters[counter].min = min;
5957     am->counters[counter].max = max;
5958 
5959     /* xmlFAGenerateTransitions(am, from, to, atom); */
5960     if (to == NULL) {
5961         to = xmlRegNewState(am);
5962 	xmlRegStatePush(am, to);
5963     }
5964     xmlRegStateAddTrans(am, from, atom, to, counter, -1);
5965     xmlRegAtomPush(am, atom);
5966     am->state = to;
5967 
5968     if (to == NULL)
5969 	to = am->state;
5970     if (to == NULL)
5971 	return(NULL);
5972     if (min == 0)
5973 	xmlFAGenerateEpsilonTransition(am, from, to);
5974     return(to);
5975 }
5976 
5977 /**
5978  * xmlAutomataNewCountTrans:
5979  * @am: an automata
5980  * @from: the starting point of the transition
5981  * @to: the target point of the transition or NULL
5982  * @token: the input string associated to that transition
5983  * @min:  the minimum successive occurences of token
5984  * @max:  the maximum successive occurences of token
5985  * @data:  data associated to the transition
5986  *
5987  * If @to is NULL, this creates first a new target state in the automata
5988  * and then adds a transition from the @from state to the target state
5989  * activated by a succession of input of value @token and whose number
5990  * is between @min and @max
5991  *
5992  * Returns the target state or NULL in case of error
5993  */
5994 xmlAutomataStatePtr
xmlAutomataNewCountTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,int min,int max,void * data)5995 xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5996 			 xmlAutomataStatePtr to, const xmlChar *token,
5997 			 int min, int max, void *data) {
5998     xmlRegAtomPtr atom;
5999     int counter;
6000 
6001     if ((am == NULL) || (from == NULL) || (token == NULL))
6002 	return(NULL);
6003     if (min < 0)
6004 	return(NULL);
6005     if ((max < min) || (max < 1))
6006 	return(NULL);
6007     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6008     if (atom == NULL)
6009 	return(NULL);
6010     atom->valuep = xmlStrdup(token);
6011     atom->data = data;
6012     if (min == 0)
6013 	atom->min = 1;
6014     else
6015 	atom->min = min;
6016     atom->max = max;
6017 
6018     /*
6019      * associate a counter to the transition.
6020      */
6021     counter = xmlRegGetCounter(am);
6022     am->counters[counter].min = min;
6023     am->counters[counter].max = max;
6024 
6025     /* xmlFAGenerateTransitions(am, from, to, atom); */
6026     if (to == NULL) {
6027         to = xmlRegNewState(am);
6028 	xmlRegStatePush(am, to);
6029     }
6030     xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6031     xmlRegAtomPush(am, atom);
6032     am->state = to;
6033 
6034     if (to == NULL)
6035 	to = am->state;
6036     if (to == NULL)
6037 	return(NULL);
6038     if (min == 0)
6039 	xmlFAGenerateEpsilonTransition(am, from, to);
6040     return(to);
6041 }
6042 
6043 /**
6044  * xmlAutomataNewOnceTrans2:
6045  * @am: an automata
6046  * @from: the starting point of the transition
6047  * @to: the target point of the transition or NULL
6048  * @token: the input string associated to that transition
6049  * @token2: the second input string associated to that transition
6050  * @min:  the minimum successive occurences of token
6051  * @max:  the maximum successive occurences of token
6052  * @data:  data associated to the transition
6053  *
6054  * If @to is NULL, this creates first a new target state in the automata
6055  * and then adds a transition from the @from state to the target state
6056  * activated by a succession of input of value @token and @token2 and whose
6057  * number is between @min and @max, moreover that transition can only be
6058  * crossed once.
6059  *
6060  * Returns the target state or NULL in case of error
6061  */
6062 xmlAutomataStatePtr
xmlAutomataNewOnceTrans2(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,const xmlChar * token2,int min,int max,void * data)6063 xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
6064 			 xmlAutomataStatePtr to, const xmlChar *token,
6065 			 const xmlChar *token2,
6066 			 int min, int max, void *data) {
6067     xmlRegAtomPtr atom;
6068     int counter;
6069 
6070     if ((am == NULL) || (from == NULL) || (token == NULL))
6071 	return(NULL);
6072     if (min < 1)
6073 	return(NULL);
6074     if ((max < min) || (max < 1))
6075 	return(NULL);
6076     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6077     if (atom == NULL)
6078 	return(NULL);
6079     if ((token2 == NULL) || (*token2 == 0)) {
6080 	atom->valuep = xmlStrdup(token);
6081     } else {
6082 	int lenn, lenp;
6083 	xmlChar *str;
6084 
6085 	lenn = strlen((char *) token2);
6086 	lenp = strlen((char *) token);
6087 
6088 	str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
6089 	if (str == NULL) {
6090 	    xmlRegFreeAtom(atom);
6091 	    return(NULL);
6092 	}
6093 	memcpy(&str[0], token, lenp);
6094 	str[lenp] = '|';
6095 	memcpy(&str[lenp + 1], token2, lenn);
6096 	str[lenn + lenp + 1] = 0;
6097 
6098 	atom->valuep = str;
6099     }
6100     atom->data = data;
6101     atom->quant = XML_REGEXP_QUANT_ONCEONLY;
6102     atom->min = min;
6103     atom->max = max;
6104     /*
6105      * associate a counter to the transition.
6106      */
6107     counter = xmlRegGetCounter(am);
6108     am->counters[counter].min = 1;
6109     am->counters[counter].max = 1;
6110 
6111     /* xmlFAGenerateTransitions(am, from, to, atom); */
6112     if (to == NULL) {
6113 	to = xmlRegNewState(am);
6114 	xmlRegStatePush(am, to);
6115     }
6116     xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6117     xmlRegAtomPush(am, atom);
6118     am->state = to;
6119     return(to);
6120 }
6121 
6122 
6123 
6124 /**
6125  * xmlAutomataNewOnceTrans:
6126  * @am: an automata
6127  * @from: the starting point of the transition
6128  * @to: the target point of the transition or NULL
6129  * @token: the input string associated to that transition
6130  * @min:  the minimum successive occurences of token
6131  * @max:  the maximum successive occurences of token
6132  * @data:  data associated to the transition
6133  *
6134  * If @to is NULL, this creates first a new target state in the automata
6135  * and then adds a transition from the @from state to the target state
6136  * activated by a succession of input of value @token and whose number
6137  * is between @min and @max, moreover that transition can only be crossed
6138  * once.
6139  *
6140  * Returns the target state or NULL in case of error
6141  */
6142 xmlAutomataStatePtr
xmlAutomataNewOnceTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,int min,int max,void * data)6143 xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6144 			 xmlAutomataStatePtr to, const xmlChar *token,
6145 			 int min, int max, void *data) {
6146     xmlRegAtomPtr atom;
6147     int counter;
6148 
6149     if ((am == NULL) || (from == NULL) || (token == NULL))
6150 	return(NULL);
6151     if (min < 1)
6152 	return(NULL);
6153     if ((max < min) || (max < 1))
6154 	return(NULL);
6155     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6156     if (atom == NULL)
6157 	return(NULL);
6158     atom->valuep = xmlStrdup(token);
6159     atom->data = data;
6160     atom->quant = XML_REGEXP_QUANT_ONCEONLY;
6161     atom->min = min;
6162     atom->max = max;
6163     /*
6164      * associate a counter to the transition.
6165      */
6166     counter = xmlRegGetCounter(am);
6167     am->counters[counter].min = 1;
6168     am->counters[counter].max = 1;
6169 
6170     /* xmlFAGenerateTransitions(am, from, to, atom); */
6171     if (to == NULL) {
6172 	to = xmlRegNewState(am);
6173 	xmlRegStatePush(am, to);
6174     }
6175     xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6176     xmlRegAtomPush(am, atom);
6177     am->state = to;
6178     return(to);
6179 }
6180 
6181 /**
6182  * xmlAutomataNewState:
6183  * @am: an automata
6184  *
6185  * Create a new disconnected state in the automata
6186  *
6187  * Returns the new state or NULL in case of error
6188  */
6189 xmlAutomataStatePtr
xmlAutomataNewState(xmlAutomataPtr am)6190 xmlAutomataNewState(xmlAutomataPtr am) {
6191     xmlAutomataStatePtr to;
6192 
6193     if (am == NULL)
6194 	return(NULL);
6195     to = xmlRegNewState(am);
6196     xmlRegStatePush(am, to);
6197     return(to);
6198 }
6199 
6200 /**
6201  * xmlAutomataNewEpsilon:
6202  * @am: an automata
6203  * @from: the starting point of the transition
6204  * @to: the target point of the transition or NULL
6205  *
6206  * If @to is NULL, this creates first a new target state in the automata
6207  * and then adds an epsilon transition from the @from state to the
6208  * target state
6209  *
6210  * Returns the target state or NULL in case of error
6211  */
6212 xmlAutomataStatePtr
xmlAutomataNewEpsilon(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to)6213 xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from,
6214 		      xmlAutomataStatePtr to) {
6215     if ((am == NULL) || (from == NULL))
6216 	return(NULL);
6217     xmlFAGenerateEpsilonTransition(am, from, to);
6218     if (to == NULL)
6219 	return(am->state);
6220     return(to);
6221 }
6222 
6223 /**
6224  * xmlAutomataNewAllTrans:
6225  * @am: an automata
6226  * @from: the starting point of the transition
6227  * @to: the target point of the transition or NULL
6228  * @lax: allow to transition if not all all transitions have been activated
6229  *
6230  * If @to is NULL, this creates first a new target state in the automata
6231  * and then adds a an ALL transition from the @from state to the
6232  * target state. That transition is an epsilon transition allowed only when
6233  * all transitions from the @from node have been activated.
6234  *
6235  * Returns the target state or NULL in case of error
6236  */
6237 xmlAutomataStatePtr
xmlAutomataNewAllTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,int lax)6238 xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6239 		       xmlAutomataStatePtr to, int lax) {
6240     if ((am == NULL) || (from == NULL))
6241 	return(NULL);
6242     xmlFAGenerateAllTransition(am, from, to, lax);
6243     if (to == NULL)
6244 	return(am->state);
6245     return(to);
6246 }
6247 
6248 /**
6249  * xmlAutomataNewCounter:
6250  * @am: an automata
6251  * @min:  the minimal value on the counter
6252  * @max:  the maximal value on the counter
6253  *
6254  * Create a new counter
6255  *
6256  * Returns the counter number or -1 in case of error
6257  */
6258 int
xmlAutomataNewCounter(xmlAutomataPtr am,int min,int max)6259 xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) {
6260     int ret;
6261 
6262     if (am == NULL)
6263 	return(-1);
6264 
6265     ret = xmlRegGetCounter(am);
6266     if (ret < 0)
6267 	return(-1);
6268     am->counters[ret].min = min;
6269     am->counters[ret].max = max;
6270     return(ret);
6271 }
6272 
6273 /**
6274  * xmlAutomataNewCountedTrans:
6275  * @am: an automata
6276  * @from: the starting point of the transition
6277  * @to: the target point of the transition or NULL
6278  * @counter: the counter associated to that transition
6279  *
6280  * If @to is NULL, this creates first a new target state in the automata
6281  * and then adds an epsilon transition from the @from state to the target state
6282  * which will increment the counter provided
6283  *
6284  * Returns the target state or NULL in case of error
6285  */
6286 xmlAutomataStatePtr
xmlAutomataNewCountedTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,int counter)6287 xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6288 		xmlAutomataStatePtr to, int counter) {
6289     if ((am == NULL) || (from == NULL) || (counter < 0))
6290 	return(NULL);
6291     xmlFAGenerateCountedEpsilonTransition(am, from, to, counter);
6292     if (to == NULL)
6293 	return(am->state);
6294     return(to);
6295 }
6296 
6297 /**
6298  * xmlAutomataNewCounterTrans:
6299  * @am: an automata
6300  * @from: the starting point of the transition
6301  * @to: the target point of the transition or NULL
6302  * @counter: the counter associated to that transition
6303  *
6304  * If @to is NULL, this creates first a new target state in the automata
6305  * and then adds an epsilon transition from the @from state to the target state
6306  * which will be allowed only if the counter is within the right range.
6307  *
6308  * Returns the target state or NULL in case of error
6309  */
6310 xmlAutomataStatePtr
xmlAutomataNewCounterTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,int counter)6311 xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6312 		xmlAutomataStatePtr to, int counter) {
6313     if ((am == NULL) || (from == NULL) || (counter < 0))
6314 	return(NULL);
6315     xmlFAGenerateCountedTransition(am, from, to, counter);
6316     if (to == NULL)
6317 	return(am->state);
6318     return(to);
6319 }
6320 
6321 /**
6322  * xmlAutomataCompile:
6323  * @am: an automata
6324  *
6325  * Compile the automata into a Reg Exp ready for being executed.
6326  * The automata should be free after this point.
6327  *
6328  * Returns the compiled regexp or NULL in case of error
6329  */
6330 xmlRegexpPtr
xmlAutomataCompile(xmlAutomataPtr am)6331 xmlAutomataCompile(xmlAutomataPtr am) {
6332     xmlRegexpPtr ret;
6333 
6334     if ((am == NULL) || (am->error != 0)) return(NULL);
6335     xmlFAEliminateEpsilonTransitions(am);
6336     /* xmlFAComputesDeterminism(am); */
6337     ret = xmlRegEpxFromParse(am);
6338 
6339     return(ret);
6340 }
6341 
6342 /**
6343  * xmlAutomataIsDeterminist:
6344  * @am: an automata
6345  *
6346  * Checks if an automata is determinist.
6347  *
6348  * Returns 1 if true, 0 if not, and -1 in case of error
6349  */
6350 int
xmlAutomataIsDeterminist(xmlAutomataPtr am)6351 xmlAutomataIsDeterminist(xmlAutomataPtr am) {
6352     int ret;
6353 
6354     if (am == NULL)
6355 	return(-1);
6356 
6357     ret = xmlFAComputesDeterminism(am);
6358     return(ret);
6359 }
6360 #endif /* LIBXML_AUTOMATA_ENABLED */
6361 
6362 #ifdef LIBXML_EXPR_ENABLED
6363 /************************************************************************
6364  *									*
6365  *		Formal Expression handling code				*
6366  *									*
6367  ************************************************************************/
6368 /************************************************************************
6369  *									*
6370  *		Expression handling context				*
6371  *									*
6372  ************************************************************************/
6373 
/*
 * Context shared by all expression nodes: it owns the hash table used to
 * keep sub-expressions unique and the bookkeeping counters for nodes.
 */
struct _xmlExpCtxt {
    xmlDictPtr dict;	/* dictionary, created or referenced by xmlExpNewCtxt() */
    xmlExpNodePtr *table;	/* hash buckets, each a list chained through node->next */
    int size;		/* number of buckets in table */
    int nbElems;	/* incremented for each entry inserted in table */
    int nb_nodes;	/* node count checked against MAX_NODES on allocation */
    int maxNodes;	/* value given to xmlExpNewCtxt(), raised to at least 4096 */
    const char *expr;	/* presumably the expression string being parsed - TODO confirm */
    const char *cur;	/* presumably the current position within expr - TODO confirm */
    int nb_cons;	/* incremented for each node allocated by xmlExpNewNode() */
    int tabSize;	/* NOTE(review): not used in the code visible here */
};
6386 
6387 /**
6388  * xmlExpNewCtxt:
6389  * @maxNodes:  the maximum number of nodes
6390  * @dict:  optional dictionary to use internally
6391  *
6392  * Creates a new context for manipulating expressions
6393  *
6394  * Returns the context or NULL in case of error
6395  */
6396 xmlExpCtxtPtr
xmlExpNewCtxt(int maxNodes,xmlDictPtr dict)6397 xmlExpNewCtxt(int maxNodes, xmlDictPtr dict) {
6398     xmlExpCtxtPtr ret;
6399     int size = 256;
6400 
6401     if (maxNodes <= 4096)
6402         maxNodes = 4096;
6403 
6404     ret = (xmlExpCtxtPtr) xmlMalloc(sizeof(xmlExpCtxt));
6405     if (ret == NULL)
6406         return(NULL);
6407     memset(ret, 0, sizeof(xmlExpCtxt));
6408     ret->size = size;
6409     ret->nbElems = 0;
6410     ret->maxNodes = maxNodes;
6411     ret->table = xmlMalloc(size * sizeof(xmlExpNodePtr));
6412     if (ret->table == NULL) {
6413         xmlFree(ret);
6414 	return(NULL);
6415     }
6416     memset(ret->table, 0, size * sizeof(xmlExpNodePtr));
6417     if (dict == NULL) {
6418         ret->dict = xmlDictCreate();
6419 	if (ret->dict == NULL) {
6420 	    xmlFree(ret->table);
6421 	    xmlFree(ret);
6422 	    return(NULL);
6423 	}
6424     } else {
6425         ret->dict = dict;
6426 	xmlDictReference(ret->dict);
6427     }
6428     return(ret);
6429 }
6430 
6431 /**
6432  * xmlExpFreeCtxt:
6433  * @ctxt:  an expression context
6434  *
6435  * Free an expression context
6436  */
6437 void
xmlExpFreeCtxt(xmlExpCtxtPtr ctxt)6438 xmlExpFreeCtxt(xmlExpCtxtPtr ctxt) {
6439     if (ctxt == NULL)
6440         return;
6441     xmlDictFree(ctxt->dict);
6442     if (ctxt->table != NULL)
6443 	xmlFree(ctxt->table);
6444     xmlFree(ctxt);
6445 }
6446 
6447 /************************************************************************
6448  *									*
6449  *		Structure associated to an expression node		*
6450  *									*
6451  ************************************************************************/
/*
 * Upper bound on the node count of a context; xmlExpNewNode() refuses to
 * allocate once ctxt->nb_nodes reaches it.
 */
#define MAX_NODES 10000
6453 
6454 /* #define DEBUG_DERIV */
6455 
6456 /*
6457  * TODO:
6458  * - Wildcards
6459  * - public API for creation
6460  *
6461  * Started
6462  * - regression testing
6463  *
6464  * Done
6465  * - split into module and test tool
6466  * - memleaks
6467  */
6468 
/*
 * Flag bits stored in the info field of an expression node.
 */
typedef enum {
    XML_EXP_NILABLE = (1 << 0)	/* set on nodes flagged as nillable */
} xmlExpNodeInfo;

/* Non-zero when the XML_EXP_NILABLE bit is set on @node. */
#define IS_NILLABLE(node) ((node)->info & XML_EXP_NILABLE)
6474 
/*
 * An expression node. Which member of the field union is meaningful
 * depends on type: count for XML_EXP_COUNT nodes, children for
 * XML_EXP_OR and XML_EXP_SEQ nodes, f_str for XML_EXP_ATOM nodes.
 */
struct _xmlExpNode {
    unsigned char type;/* xmlExpNodeType */
    unsigned char info;/* OR of xmlExpNodeInfo */
    unsigned short key;	/* the hash key */
    unsigned int ref;	/* The number of references */
    int c_max;		/* the maximum length it can consume, -1 if unbounded */
    xmlExpNodePtr exp_left;	/* left (or only) sub-expression */
    xmlExpNodePtr next;/* the next node in the hash table or free list */
    union {
	struct {
	    int f_min;	/* minimum repetition count */
	    int f_max;	/* maximum repetition count, -1 treated as unbounded */
	} count;
	struct {
	    xmlExpNodePtr f_right;	/* right sub-expression */
	} children;
        const xmlChar *f_str;	/* name of an atom */
    } field;
};

/* Shorthand accessors for the members of the field union. */
#define exp_min field.count.f_min
#define exp_max field.count.f_max
/* #define exp_left field.children.f_left */
#define exp_right field.children.f_right
#define exp_str field.f_str
6500 
static xmlExpNodePtr xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type);
/*
 * Statically allocated singleton nodes. Initializers follow the field
 * order of struct _xmlExpNode: type, info, key, ref, c_max, exp_left,
 * next, field. xmlExpFree() ignores both of them.
 */
/* The forbidden expression: type XML_EXP_FORBID, no info bits set. */
static xmlExpNode forbiddenExpNode = {
    XML_EXP_FORBID, 0, 0, 0, 0, NULL, NULL, {{ 0, 0}}
};
xmlExpNodePtr forbiddenExp = &forbiddenExpNode;
/* The empty expression: type XML_EXP_EMPTY, info 1 i.e. XML_EXP_NILABLE. */
static xmlExpNode emptyExpNode = {
    XML_EXP_EMPTY, 1, 0, 0, 0, NULL, NULL, {{ 0, 0}}
};
xmlExpNodePtr emptyExp = &emptyExpNode;
6510 
6511 /************************************************************************
6512  *									*
6513  *  The custom hash table for unicity and canonicalization		*
6514  *  of sub-expressions pointers						*
6515  *									*
6516  ************************************************************************/
6517 /*
6518  * xmlExpHashNameComputeKey:
6519  * Calculate the hash key for a token
6520  */
6521 static unsigned short
xmlExpHashNameComputeKey(const xmlChar * name)6522 xmlExpHashNameComputeKey(const xmlChar *name) {
6523     unsigned short value = 0L;
6524     char ch;
6525 
6526     if (name != NULL) {
6527 	value += 30 * (*name);
6528 	while ((ch = *name++) != 0) {
6529 	    value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch);
6530 	}
6531     }
6532     return (value);
6533 }
6534 
6535 /*
6536  * xmlExpHashComputeKey:
6537  * Calculate the hash key for a compound expression
6538  */
6539 static unsigned short
xmlExpHashComputeKey(xmlExpNodeType type,xmlExpNodePtr left,xmlExpNodePtr right)6540 xmlExpHashComputeKey(xmlExpNodeType type, xmlExpNodePtr left,
6541                      xmlExpNodePtr right) {
6542     unsigned long value;
6543     unsigned short ret;
6544 
6545     switch (type) {
6546         case XML_EXP_SEQ:
6547 	    value = left->key;
6548 	    value += right->key;
6549 	    value *= 3;
6550 	    ret = (unsigned short) value;
6551 	    break;
6552         case XML_EXP_OR:
6553 	    value = left->key;
6554 	    value += right->key;
6555 	    value *= 7;
6556 	    ret = (unsigned short) value;
6557 	    break;
6558         case XML_EXP_COUNT:
6559 	    value = left->key;
6560 	    value += right->key;
6561 	    ret = (unsigned short) value;
6562 	    break;
6563 	default:
6564 	    ret = 0;
6565     }
6566     return(ret);
6567 }
6568 
6569 
6570 static xmlExpNodePtr
xmlExpNewNode(xmlExpCtxtPtr ctxt,xmlExpNodeType type)6571 xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type) {
6572     xmlExpNodePtr ret;
6573 
6574     if (ctxt->nb_nodes >= MAX_NODES)
6575         return(NULL);
6576     ret = (xmlExpNodePtr) xmlMalloc(sizeof(xmlExpNode));
6577     if (ret == NULL)
6578         return(NULL);
6579     memset(ret, 0, sizeof(xmlExpNode));
6580     ret->type = type;
6581     ret->next = NULL;
6582     ctxt->nb_nodes++;
6583     ctxt->nb_cons++;
6584     return(ret);
6585 }
6586 
/**
 * xmlExpHashGetEntry:
 * @ctxt: the expression context owning the hash table
 * @type: the kind of node requested (ATOM, COUNT, OR or SEQ)
 * @left: the left (or only) sub-expression, unused for ATOM
 * @right: the right sub-expression, used for OR and SEQ
 * @name: the atom name, used for ATOM only
 * @min: the minimum repetition count, used for COUNT only
 * @max: the maximum repetition count, used for COUNT only
 *
 * Get the unique entry from the hash table. The entry is created if
 * needed. @left and @right are consumed, i.e. their ref count will
 * be decremented by the operation.
 *
 * Before the lookup, COUNT, OR and SEQ requests go through reduction and
 * canonicalization rules which may return one of the operands, emptyExp,
 * forbiddenExp, or the result of recursive calls instead of a node of
 * the requested type.
 *
 * ATOM names are compared by pointer, so equal names must be passed as
 * the same pointer (presumably strings interned in the context
 * dictionary - TODO confirm at the call sites).
 *
 * Returns the pointer or NULL in case of error
 */
static xmlExpNodePtr
xmlExpHashGetEntry(xmlExpCtxtPtr ctxt, xmlExpNodeType type,
                   xmlExpNodePtr left, xmlExpNodePtr right,
		   const xmlChar *name, int min, int max) {
    unsigned short kbase, key;
    xmlExpNodePtr entry;
    xmlExpNodePtr insert;

    if (ctxt == NULL)
	return(NULL);

    /*
     * Check for duplicate and insertion location.
     */
    if (type == XML_EXP_ATOM) {
	kbase = xmlExpHashNameComputeKey(name);
    } else if (type == XML_EXP_COUNT) {
        /* COUNT reduction rule 1 */
	/* a{1} -> a */
	if (min == max) {
	    if (min == 1) {
		return(left);
	    }
	    /* a{0} -> empty */
	    if (min == 0) {
		xmlExpFree(ctxt, left);
	        return(emptyExp);
	    }
	}
	/* a negative minimum yields the forbidden expression */
	if (min < 0) {
	    xmlExpFree(ctxt, left);
	    return(forbiddenExp);
	}
	/* max == -1 gets its own key derivation */
        if (max == -1)
	    kbase = min + 79;
	else
	    kbase = max - min;
	kbase += left->key;
    } else if (type == XML_EXP_OR) {
        /* Forbid reduction rules */
	/* forbidden | b -> b and a | forbidden -> a */
        if (left->type == XML_EXP_FORBID) {
	    xmlExpFree(ctxt, left);
	    return(right);
	}
        if (right->type == XML_EXP_FORBID) {
	    xmlExpFree(ctxt, right);
	    return(left);
	}

        /* OR reduction rule 1 */
	/* a | a reduced to a */
        if (left == right) {
	    /* both arguments held a reference, give one back */
	    left->ref--;
	    return(left);
	}
        /* OR canonicalization rule 1 */
	/* if only the left operand is an OR, swap so it ends up on the right */
        if ((left->type == XML_EXP_OR) && (right->type != XML_EXP_OR)) {
	    xmlExpNodePtr tmp = left;
            left = right;
	    right = tmp;
	}
        /* OR reduction rule 2 */
	/* a | (a | b) and b | (a | b) are reduced to a | b */
        if (right->type == XML_EXP_OR) {
	    if ((left == right->exp_left) ||
	        (left == right->exp_right)) {
		xmlExpFree(ctxt, left);
		return(right);
	    }
	}
        /* OR canonicalization rule 2 */
	/* linearize (a | b) | c into a | (b | c) */
        if (left->type == XML_EXP_OR) {
	    xmlExpNodePtr tmp;

	    /* OR canonicalization rule 2 */
	    /* order the two children of left by key when right is not an OR */
	    if ((left->exp_right->type != XML_EXP_OR) &&
	        (left->exp_right->key < left->exp_left->key)) {
	        tmp = left->exp_right;
		left->exp_right = left->exp_left;
		left->exp_left = tmp;
	    }
	    /*
	     * Each recursive call consumes its operands, so a reference is
	     * taken on the children of left before they are passed on.
	     * NOTE(review): the intermediate result is not checked for NULL
	     * before being passed on as the right operand.
	     */
	    left->exp_right->ref++;
	    tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_right, right,
	                             NULL, 0, 0);
	    left->exp_left->ref++;
	    tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_left, tmp,
	                             NULL, 0, 0);

	    xmlExpFree(ctxt, left);
	    return(tmp);
	}
	if (right->type == XML_EXP_OR) {
	    /* Ordering in the tree */
	    /* C | (A | B) -> A | (B | C) */
	    if (left->key > right->exp_right->key) {
		xmlExpNodePtr tmp;
		right->exp_right->ref++;
		tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_right,
		                         left, NULL, 0, 0);
		right->exp_left->ref++;
		tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
		                         tmp, NULL, 0, 0);
		xmlExpFree(ctxt, right);
		return(tmp);
	    }
	    /* Ordering in the tree */
	    /* B | (A | C) -> A | (B | C) */
	    if (left->key > right->exp_left->key) {
		xmlExpNodePtr tmp;
		right->exp_right->ref++;
		tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left,
		                         right->exp_right, NULL, 0, 0);
		right->exp_left->ref++;
		tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
		                         tmp, NULL, 0, 0);
		xmlExpFree(ctxt, right);
		return(tmp);
	    }
	}
	/* we know both types are != XML_EXP_OR here */
        else if (left->key > right->key) {
	    /* keep the operand with the smaller key on the left */
	    xmlExpNodePtr tmp = left;
            left = right;
	    right = tmp;
	}
	kbase = xmlExpHashComputeKey(type, left, right);
    } else if (type == XML_EXP_SEQ) {
        /* Forbid reduction rules */
	/* a sequence with a forbidden operand is forbidden */
        if (left->type == XML_EXP_FORBID) {
	    xmlExpFree(ctxt, right);
	    return(left);
	}
        if (right->type == XML_EXP_FORBID) {
	    xmlExpFree(ctxt, left);
	    return(right);
	}
        /* Empty reduction rules */
	/* a , empty -> a and empty , b -> b */
        if (right->type == XML_EXP_EMPTY) {
	    return(left);
	}
        if (left->type == XML_EXP_EMPTY) {
	    return(right);
	}
	kbase = xmlExpHashComputeKey(type, left, right);
    } else
        return(NULL);

    /*
     * Look for an identical node already stored in the bucket. On a hit
     * the existing node gains a reference and the references the caller
     * held on the operands are dropped.
     */
    key = kbase % ctxt->size;
    if (ctxt->table[key] != NULL) {
	for (insert = ctxt->table[key]; insert != NULL;
	     insert = insert->next) {
	    if ((insert->key == kbase) &&
	        (insert->type == type)) {
		if (type == XML_EXP_ATOM) {
		    /* names are compared by pointer, not by content */
		    if (name == insert->exp_str) {
			insert->ref++;
			return(insert);
		    }
		} else if (type == XML_EXP_COUNT) {
		    if ((insert->exp_min == min) && (insert->exp_max == max) &&
		        (insert->exp_left == left)) {
			insert->ref++;
			left->ref--;
			return(insert);
		    }
		} else if ((insert->exp_left == left) &&
			   (insert->exp_right == right)) {
		    insert->ref++;
		    left->ref--;
		    right->ref--;
		    return(insert);
		}
	    }
	}
    }

    /*
     * No existing node: build a new one. If the allocation fails NULL is
     * returned and the references on the operands are not released here.
     */
    entry = xmlExpNewNode(ctxt, type);
    if (entry == NULL)
        return(NULL);
    entry->key = kbase;
    if (type == XML_EXP_ATOM) {
	entry->exp_str = name;
	entry->c_max = 1;
    } else if (type == XML_EXP_COUNT) {
        entry->exp_min = min;
        entry->exp_max = max;
	entry->exp_left = left;
	/* nillable if zero repetitions are allowed or the operand is */
	if ((min == 0) || (IS_NILLABLE(left)))
	    entry->info |= XML_EXP_NILABLE;
	/* a negative max gives an unbounded length */
	if (max < 0)
	    entry->c_max = -1;
	else
	    entry->c_max = max * entry->exp_left->c_max;
    } else {
	entry->exp_left = left;
	entry->exp_right = right;
	if (type == XML_EXP_OR) {
	    /* a choice is nillable if either branch is */
	    if ((IS_NILLABLE(left)) || (IS_NILLABLE(right)))
		entry->info |= XML_EXP_NILABLE;
	    /* length bound: unbounded if either is, else the larger one */
	    if ((entry->exp_left->c_max == -1) ||
	        (entry->exp_right->c_max == -1))
		entry->c_max = -1;
	    else if (entry->exp_left->c_max > entry->exp_right->c_max)
	        entry->c_max = entry->exp_left->c_max;
	    else
	        entry->c_max = entry->exp_right->c_max;
	} else {
	    /* a sequence is nillable only if both parts are */
	    if ((IS_NILLABLE(left)) && (IS_NILLABLE(right)))
		entry->info |= XML_EXP_NILABLE;
	    /* length bound: unbounded if either is, else the sum */
	    if ((entry->exp_left->c_max == -1) ||
	        (entry->exp_right->c_max == -1))
		entry->c_max = -1;
	    else
	        entry->c_max = entry->exp_left->c_max + entry->exp_right->c_max;
	}
    }
    entry->ref = 1;
    /* insert at the head of the bucket list */
    if (ctxt->table[key] != NULL)
        entry->next = ctxt->table[key];

    ctxt->table[key] = entry;
    ctxt->nbElems++;

    return(entry);
}
6823 
6824 /**
6825  * xmlExpFree:
6826  * @ctxt: the expression context
6827  * @exp: the expression
6828  *
6829  * Dereference the expression
6830  */
6831 void
xmlExpFree(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp)6832 xmlExpFree(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp) {
6833     if ((exp == NULL) || (exp == forbiddenExp) || (exp == emptyExp))
6834         return;
6835     exp->ref--;
6836     if (exp->ref == 0) {
6837         unsigned short key;
6838 
6839         /* Unlink it first from the hash table */
6840 	key = exp->key % ctxt->size;
6841 	if (ctxt->table[key] == exp) {
6842 	    ctxt->table[key] = exp->next;
6843 	} else {
6844 	    xmlExpNodePtr tmp;
6845 
6846 	    tmp = ctxt->table[key];
6847 	    while (tmp != NULL) {
6848 	        if (tmp->next == exp) {
6849 		    tmp->next = exp->next;
6850 		    break;
6851 		}
6852 	        tmp = tmp->next;
6853 	    }
6854 	}
6855 
6856         if ((exp->type == XML_EXP_SEQ) || (exp->type == XML_EXP_OR)) {
6857 	    xmlExpFree(ctxt, exp->exp_left);
6858 	    xmlExpFree(ctxt, exp->exp_right);
6859 	} else if (exp->type == XML_EXP_COUNT) {
6860 	    xmlExpFree(ctxt, exp->exp_left);
6861 	}
6862         xmlFree(exp);
6863 	ctxt->nb_nodes--;
6864     }
6865 }
6866 
6867 /**
6868  * xmlExpRef:
6869  * @exp: the expression
6870  *
6871  * Increase the reference count of the expression
6872  */
6873 void
xmlExpRef(xmlExpNodePtr exp)6874 xmlExpRef(xmlExpNodePtr exp) {
6875     if (exp != NULL)
6876         exp->ref++;
6877 }
6878 
6879 /**
6880  * xmlExpNewAtom:
6881  * @ctxt: the expression context
6882  * @name: the atom name
6883  * @len: the atom name length in byte (or -1);
6884  *
6885  * Get the atom associated to this name from that context
6886  *
6887  * Returns the node or NULL in case of error
6888  */
6889 xmlExpNodePtr
xmlExpNewAtom(xmlExpCtxtPtr ctxt,const xmlChar * name,int len)6890 xmlExpNewAtom(xmlExpCtxtPtr ctxt, const xmlChar *name, int len) {
6891     if ((ctxt == NULL) || (name == NULL))
6892         return(NULL);
6893     name = xmlDictLookup(ctxt->dict, name, len);
6894     if (name == NULL)
6895         return(NULL);
6896     return(xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, name, 0, 0));
6897 }
6898 
6899 /**
6900  * xmlExpNewOr:
6901  * @ctxt: the expression context
6902  * @left: left expression
6903  * @right: right expression
6904  *
6905  * Get the atom associated to the choice @left | @right
6906  * Note that @left and @right are consumed in the operation, to keep
6907  * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
6908  * this is true even in case of failure (unless ctxt == NULL).
6909  *
6910  * Returns the node or NULL in case of error
6911  */
6912 xmlExpNodePtr
xmlExpNewOr(xmlExpCtxtPtr ctxt,xmlExpNodePtr left,xmlExpNodePtr right)6913 xmlExpNewOr(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
6914     if (ctxt == NULL)
6915         return(NULL);
6916     if ((left == NULL) || (right == NULL)) {
6917         xmlExpFree(ctxt, left);
6918         xmlExpFree(ctxt, right);
6919         return(NULL);
6920     }
6921     return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, right, NULL, 0, 0));
6922 }
6923 
6924 /**
6925  * xmlExpNewSeq:
6926  * @ctxt: the expression context
6927  * @left: left expression
6928  * @right: right expression
6929  *
6930  * Get the atom associated to the sequence @left , @right
6931  * Note that @left and @right are consumed in the operation, to keep
6932  * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
6933  * this is true even in case of failure (unless ctxt == NULL).
6934  *
6935  * Returns the node or NULL in case of error
6936  */
6937 xmlExpNodePtr
xmlExpNewSeq(xmlExpCtxtPtr ctxt,xmlExpNodePtr left,xmlExpNodePtr right)6938 xmlExpNewSeq(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
6939     if (ctxt == NULL)
6940         return(NULL);
6941     if ((left == NULL) || (right == NULL)) {
6942         xmlExpFree(ctxt, left);
6943         xmlExpFree(ctxt, right);
6944         return(NULL);
6945     }
6946     return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, left, right, NULL, 0, 0));
6947 }
6948 
6949 /**
6950  * xmlExpNewRange:
6951  * @ctxt: the expression context
6952  * @subset: the expression to be repeated
6953  * @min: the lower bound for the repetition
6954  * @max: the upper bound for the repetition, -1 means infinite
6955  *
6956  * Get the atom associated to the range (@subset){@min, @max}
6957  * Note that @subset is consumed in the operation, to keep
6958  * an handle on it use xmlExpRef() and use xmlExpFree() to release it,
6959  * this is true even in case of failure (unless ctxt == NULL).
6960  *
6961  * Returns the node or NULL in case of error
6962  */
6963 xmlExpNodePtr
xmlExpNewRange(xmlExpCtxtPtr ctxt,xmlExpNodePtr subset,int min,int max)6964 xmlExpNewRange(xmlExpCtxtPtr ctxt, xmlExpNodePtr subset, int min, int max) {
6965     if (ctxt == NULL)
6966         return(NULL);
6967     if ((subset == NULL) || (min < 0) || (max < -1) ||
6968         ((max >= 0) && (min > max))) {
6969 	xmlExpFree(ctxt, subset);
6970         return(NULL);
6971     }
6972     return(xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, subset,
6973                               NULL, NULL, min, max));
6974 }
6975 
6976 /************************************************************************
6977  *									*
6978  *		Public API for operations on expressions		*
6979  *									*
6980  ************************************************************************/
6981 
6982 static int
xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar ** list,int len,int nb)6983 xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
6984                      const xmlChar**list, int len, int nb) {
6985     int tmp, tmp2;
6986 tail:
6987     switch (exp->type) {
6988         case XML_EXP_EMPTY:
6989 	    return(0);
6990         case XML_EXP_ATOM:
6991 	    for (tmp = 0;tmp < nb;tmp++)
6992 	        if (list[tmp] == exp->exp_str)
6993 		    return(0);
6994             if (nb >= len)
6995 	        return(-2);
6996 	    list[nb] = exp->exp_str;
6997 	    return(1);
6998         case XML_EXP_COUNT:
6999 	    exp = exp->exp_left;
7000 	    goto tail;
7001         case XML_EXP_SEQ:
7002         case XML_EXP_OR:
7003 	    tmp = xmlExpGetLanguageInt(ctxt, exp->exp_left, list, len, nb);
7004 	    if (tmp < 0)
7005 	        return(tmp);
7006 	    tmp2 = xmlExpGetLanguageInt(ctxt, exp->exp_right, list, len,
7007 	                                nb + tmp);
7008 	    if (tmp2 < 0)
7009 	        return(tmp2);
7010             return(tmp + tmp2);
7011     }
7012     return(-1);
7013 }
7014 
7015 /**
7016  * xmlExpGetLanguage:
7017  * @ctxt: the expression context
7018  * @exp: the expression
7019  * @langList: where to store the tokens
7020  * @len: the allocated length of @list
7021  *
7022  * Find all the strings used in @exp and store them in @list
7023  *
7024  * Returns the number of unique strings found, -1 in case of errors and
7025  *         -2 if there is more than @len strings
7026  */
7027 int
xmlExpGetLanguage(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar ** langList,int len)7028 xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7029                   const xmlChar**langList, int len) {
7030     if ((ctxt == NULL) || (exp == NULL) || (langList == NULL) || (len <= 0))
7031         return(-1);
7032     return(xmlExpGetLanguageInt(ctxt, exp, langList, len, 0));
7033 }
7034 
7035 static int
xmlExpGetStartInt(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar ** list,int len,int nb)7036 xmlExpGetStartInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7037                   const xmlChar**list, int len, int nb) {
7038     int tmp, tmp2;
7039 tail:
7040     switch (exp->type) {
7041         case XML_EXP_FORBID:
7042 	    return(0);
7043         case XML_EXP_EMPTY:
7044 	    return(0);
7045         case XML_EXP_ATOM:
7046 	    for (tmp = 0;tmp < nb;tmp++)
7047 	        if (list[tmp] == exp->exp_str)
7048 		    return(0);
7049             if (nb >= len)
7050 	        return(-2);
7051 	    list[nb] = exp->exp_str;
7052 	    return(1);
7053         case XML_EXP_COUNT:
7054 	    exp = exp->exp_left;
7055 	    goto tail;
7056         case XML_EXP_SEQ:
7057 	    tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
7058 	    if (tmp < 0)
7059 	        return(tmp);
7060 	    if (IS_NILLABLE(exp->exp_left)) {
7061 		tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
7062 					    nb + tmp);
7063 		if (tmp2 < 0)
7064 		    return(tmp2);
7065 		tmp += tmp2;
7066 	    }
7067             return(tmp);
7068         case XML_EXP_OR:
7069 	    tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
7070 	    if (tmp < 0)
7071 	        return(tmp);
7072 	    tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
7073 	                                nb + tmp);
7074 	    if (tmp2 < 0)
7075 	        return(tmp2);
7076             return(tmp + tmp2);
7077     }
7078     return(-1);
7079 }
7080 
7081 /**
7082  * xmlExpGetStart:
7083  * @ctxt: the expression context
7084  * @exp: the expression
7085  * @tokList: where to store the tokens
7086  * @len: the allocated length of @list
7087  *
7088  * Find all the strings that appears at the start of the languages
7089  * accepted by @exp and store them in @list. E.g. for (a, b) | c
7090  * it will return the list [a, c]
7091  *
7092  * Returns the number of unique strings found, -1 in case of errors and
7093  *         -2 if there is more than @len strings
7094  */
7095 int
xmlExpGetStart(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar ** tokList,int len)7096 xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7097                const xmlChar**tokList, int len) {
7098     if ((ctxt == NULL) || (exp == NULL) || (tokList == NULL) || (len <= 0))
7099         return(-1);
7100     return(xmlExpGetStartInt(ctxt, exp, tokList, len, 0));
7101 }
7102 
7103 /**
7104  * xmlExpIsNillable:
7105  * @exp: the expression
7106  *
7107  * Finds if the expression is nillable, i.e. if it accepts the empty sequqnce
7108  *
7109  * Returns 1 if nillable, 0 if not and -1 in case of error
7110  */
7111 int
xmlExpIsNillable(xmlExpNodePtr exp)7112 xmlExpIsNillable(xmlExpNodePtr exp) {
7113     if (exp == NULL)
7114         return(-1);
7115     return(IS_NILLABLE(exp) != 0);
7116 }
7117 
7118 static xmlExpNodePtr
xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar * str)7119 xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar *str)
7120 {
7121     xmlExpNodePtr ret;
7122 
7123     switch (exp->type) {
7124 	case XML_EXP_EMPTY:
7125 	    return(forbiddenExp);
7126 	case XML_EXP_FORBID:
7127 	    return(forbiddenExp);
7128 	case XML_EXP_ATOM:
7129 	    if (exp->exp_str == str) {
7130 #ifdef DEBUG_DERIV
7131 		printf("deriv atom: equal => Empty\n");
7132 #endif
7133 	        ret = emptyExp;
7134 	    } else {
7135 #ifdef DEBUG_DERIV
7136 		printf("deriv atom: mismatch => forbid\n");
7137 #endif
7138 	        /* TODO wildcards here */
7139 		ret = forbiddenExp;
7140 	    }
7141 	    return(ret);
7142 	case XML_EXP_OR: {
7143 	    xmlExpNodePtr tmp;
7144 
7145 #ifdef DEBUG_DERIV
7146 	    printf("deriv or: => or(derivs)\n");
7147 #endif
7148 	    tmp = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7149 	    if (tmp == NULL) {
7150 		return(NULL);
7151 	    }
7152 	    ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
7153 	    if (ret == NULL) {
7154 	        xmlExpFree(ctxt, tmp);
7155 		return(NULL);
7156 	    }
7157             ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret,
7158 			     NULL, 0, 0);
7159 	    return(ret);
7160 	}
7161 	case XML_EXP_SEQ:
7162 #ifdef DEBUG_DERIV
7163 	    printf("deriv seq: starting with left\n");
7164 #endif
7165 	    ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7166 	    if (ret == NULL) {
7167 	        return(NULL);
7168 	    } else if (ret == forbiddenExp) {
7169 	        if (IS_NILLABLE(exp->exp_left)) {
7170 #ifdef DEBUG_DERIV
7171 		    printf("deriv seq: left failed but nillable\n");
7172 #endif
7173 		    ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
7174 		}
7175 	    } else {
7176 #ifdef DEBUG_DERIV
7177 		printf("deriv seq: left match => sequence\n");
7178 #endif
7179 	        exp->exp_right->ref++;
7180 	        ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, exp->exp_right,
7181 		                         NULL, 0, 0);
7182 	    }
7183 	    return(ret);
7184 	case XML_EXP_COUNT: {
7185 	    int min, max;
7186 	    xmlExpNodePtr tmp;
7187 
7188 	    if (exp->exp_max == 0)
7189 		return(forbiddenExp);
7190 	    ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7191 	    if (ret == NULL)
7192 	        return(NULL);
7193 	    if (ret == forbiddenExp) {
7194 #ifdef DEBUG_DERIV
7195 		printf("deriv count: pattern mismatch => forbid\n");
7196 #endif
7197 	        return(ret);
7198 	    }
7199 	    if (exp->exp_max == 1)
7200 		return(ret);
7201 	    if (exp->exp_max < 0) /* unbounded */
7202 		max = -1;
7203 	    else
7204 		max = exp->exp_max - 1;
7205 	    if (exp->exp_min > 0)
7206 		min = exp->exp_min - 1;
7207 	    else
7208 		min = 0;
7209 	    exp->exp_left->ref++;
7210 	    tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, NULL,
7211 				     NULL, min, max);
7212 	    if (ret == emptyExp) {
7213 #ifdef DEBUG_DERIV
7214 		printf("deriv count: match to empty => new count\n");
7215 #endif
7216 	        return(tmp);
7217 	    }
7218 #ifdef DEBUG_DERIV
7219 	    printf("deriv count: match => sequence with new count\n");
7220 #endif
7221 	    return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, tmp,
7222 	                              NULL, 0, 0));
7223 	}
7224     }
7225     return(NULL);
7226 }
7227 
7228 /**
7229  * xmlExpStringDerive:
7230  * @ctxt: the expression context
7231  * @exp: the expression
7232  * @str: the string
7233  * @len: the string len in bytes if available
7234  *
7235  * Do one step of Brzozowski derivation of the expression @exp with
7236  * respect to the input string
7237  *
7238  * Returns the resulting expression or NULL in case of internal error
7239  */
7240 xmlExpNodePtr
xmlExpStringDerive(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar * str,int len)7241 xmlExpStringDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7242                    const xmlChar *str, int len) {
7243     const xmlChar *input;
7244 
7245     if ((exp == NULL) || (ctxt == NULL) || (str == NULL)) {
7246         return(NULL);
7247     }
7248     /*
7249      * check the string is in the dictionary, if yes use an interned
7250      * copy, otherwise we know it's not an acceptable input
7251      */
7252     input = xmlDictExists(ctxt->dict, str, len);
7253     if (input == NULL) {
7254         return(forbiddenExp);
7255     }
7256     return(xmlExpStringDeriveInt(ctxt, exp, input));
7257 }
7258 
7259 static int
xmlExpCheckCard(xmlExpNodePtr exp,xmlExpNodePtr sub)7260 xmlExpCheckCard(xmlExpNodePtr exp, xmlExpNodePtr sub) {
7261     int ret = 1;
7262 
7263     if (sub->c_max == -1) {
7264         if (exp->c_max != -1)
7265 	    ret = 0;
7266     } else if ((exp->c_max >= 0) && (exp->c_max < sub->c_max)) {
7267         ret = 0;
7268     }
7269 #if 0
7270     if ((IS_NILLABLE(sub)) && (!IS_NILLABLE(exp)))
7271         ret = 0;
7272 #endif
7273     return(ret);
7274 }
7275 
7276 static xmlExpNodePtr xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7277                                         xmlExpNodePtr sub);
7278 /**
7279  * xmlExpDivide:
7280  * @ctxt: the expressions context
7281  * @exp: the englobing expression
7282  * @sub: the subexpression
7283  * @mult: the multiple expression
7284  * @remain: the remain from the derivation of the multiple
7285  *
7286  * Check if exp is a multiple of sub, i.e. if there is a finite number n
7287  * so that sub{n} subsume exp
7288  *
7289  * Returns the multiple value if successful, 0 if it is not a multiple
7290  *         and -1 in case of internel error.
7291  */
7292 
7293 static int
xmlExpDivide(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,xmlExpNodePtr sub,xmlExpNodePtr * mult,xmlExpNodePtr * remain)7294 xmlExpDivide(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub,
7295              xmlExpNodePtr *mult, xmlExpNodePtr *remain) {
7296     int i;
7297     xmlExpNodePtr tmp, tmp2;
7298 
7299     if (mult != NULL) *mult = NULL;
7300     if (remain != NULL) *remain = NULL;
7301     if (exp->c_max == -1) return(0);
7302     if (IS_NILLABLE(exp) && (!IS_NILLABLE(sub))) return(0);
7303 
7304     for (i = 1;i <= exp->c_max;i++) {
7305         sub->ref++;
7306         tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7307 				 sub, NULL, NULL, i, i);
7308 	if (tmp == NULL) {
7309 	    return(-1);
7310 	}
7311 	if (!xmlExpCheckCard(tmp, exp)) {
7312 	    xmlExpFree(ctxt, tmp);
7313 	    continue;
7314 	}
7315 	tmp2 = xmlExpExpDeriveInt(ctxt, tmp, exp);
7316 	if (tmp2 == NULL) {
7317 	    xmlExpFree(ctxt, tmp);
7318 	    return(-1);
7319 	}
7320 	if ((tmp2 != forbiddenExp) && (IS_NILLABLE(tmp2))) {
7321 	    if (remain != NULL)
7322 	        *remain = tmp2;
7323 	    else
7324 	        xmlExpFree(ctxt, tmp2);
7325 	    if (mult != NULL)
7326 	        *mult = tmp;
7327 	    else
7328 	        xmlExpFree(ctxt, tmp);
7329 #ifdef DEBUG_DERIV
7330 	    printf("Divide succeeded %d\n", i);
7331 #endif
7332 	    return(i);
7333 	}
7334 	xmlExpFree(ctxt, tmp);
7335 	xmlExpFree(ctxt, tmp2);
7336     }
7337 #ifdef DEBUG_DERIV
7338     printf("Divide failed\n");
7339 #endif
7340     return(0);
7341 }
7342 
7343 /**
7344  * xmlExpExpDeriveInt:
7345  * @ctxt: the expressions context
7346  * @exp: the englobing expression
7347  * @sub: the subexpression
7348  *
7349  * Try to do a step of Brzozowski derivation but at a higher level
7350  * the input being a subexpression.
7351  *
7352  * Returns the resulting expression or NULL in case of internal error
7353  */
7354 static xmlExpNodePtr
xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,xmlExpNodePtr sub)7355 xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7356     xmlExpNodePtr ret, tmp, tmp2, tmp3;
7357     const xmlChar **tab;
7358     int len, i;
7359 
7360     /*
7361      * In case of equality and if the expression can only consume a finite
7362      * amount, then the derivation is empty
7363      */
7364     if ((exp == sub) && (exp->c_max >= 0)) {
7365 #ifdef DEBUG_DERIV
7366         printf("Equal(exp, sub) and finite -> Empty\n");
7367 #endif
7368         return(emptyExp);
7369     }
7370     /*
7371      * decompose sub sequence first
7372      */
7373     if (sub->type == XML_EXP_EMPTY) {
7374 #ifdef DEBUG_DERIV
7375         printf("Empty(sub) -> Empty\n");
7376 #endif
7377 	exp->ref++;
7378         return(exp);
7379     }
7380     if (sub->type == XML_EXP_SEQ) {
7381 #ifdef DEBUG_DERIV
7382         printf("Seq(sub) -> decompose\n");
7383 #endif
7384         tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7385 	if (tmp == NULL)
7386 	    return(NULL);
7387 	if (tmp == forbiddenExp)
7388 	    return(tmp);
7389 	ret = xmlExpExpDeriveInt(ctxt, tmp, sub->exp_right);
7390 	xmlExpFree(ctxt, tmp);
7391 	return(ret);
7392     }
7393     if (sub->type == XML_EXP_OR) {
7394 #ifdef DEBUG_DERIV
7395         printf("Or(sub) -> decompose\n");
7396 #endif
7397         tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7398 	if (tmp == forbiddenExp)
7399 	    return(tmp);
7400 	if (tmp == NULL)
7401 	    return(NULL);
7402 	ret = xmlExpExpDeriveInt(ctxt, exp, sub->exp_right);
7403 	if ((ret == NULL) || (ret == forbiddenExp)) {
7404 	    xmlExpFree(ctxt, tmp);
7405 	    return(ret);
7406 	}
7407 	return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, NULL, 0, 0));
7408     }
7409     if (!xmlExpCheckCard(exp, sub)) {
7410 #ifdef DEBUG_DERIV
7411         printf("CheckCard(exp, sub) failed -> Forbid\n");
7412 #endif
7413         return(forbiddenExp);
7414     }
7415     switch (exp->type) {
7416         case XML_EXP_EMPTY:
7417 	    if (sub == emptyExp)
7418 	        return(emptyExp);
7419 #ifdef DEBUG_DERIV
7420 	    printf("Empty(exp) -> Forbid\n");
7421 #endif
7422 	    return(forbiddenExp);
7423         case XML_EXP_FORBID:
7424 #ifdef DEBUG_DERIV
7425 	    printf("Forbid(exp) -> Forbid\n");
7426 #endif
7427 	    return(forbiddenExp);
7428         case XML_EXP_ATOM:
7429 	    if (sub->type == XML_EXP_ATOM) {
7430 	        /* TODO: handle wildcards */
7431 	        if (exp->exp_str == sub->exp_str) {
7432 #ifdef DEBUG_DERIV
7433 		    printf("Atom match -> Empty\n");
7434 #endif
7435 		    return(emptyExp);
7436                 }
7437 #ifdef DEBUG_DERIV
7438 		printf("Atom mismatch -> Forbid\n");
7439 #endif
7440 	        return(forbiddenExp);
7441 	    }
7442 	    if ((sub->type == XML_EXP_COUNT) &&
7443 	        (sub->exp_max == 1) &&
7444 	        (sub->exp_left->type == XML_EXP_ATOM)) {
7445 	        /* TODO: handle wildcards */
7446 	        if (exp->exp_str == sub->exp_left->exp_str) {
7447 #ifdef DEBUG_DERIV
7448 		    printf("Atom match -> Empty\n");
7449 #endif
7450 		    return(emptyExp);
7451 		}
7452 #ifdef DEBUG_DERIV
7453 		printf("Atom mismatch -> Forbid\n");
7454 #endif
7455 	        return(forbiddenExp);
7456 	    }
7457 #ifdef DEBUG_DERIV
7458 	    printf("Compex exp vs Atom -> Forbid\n");
7459 #endif
7460 	    return(forbiddenExp);
7461         case XML_EXP_SEQ:
7462 	    /* try to get the sequence consumed only if possible */
7463 	    if (xmlExpCheckCard(exp->exp_left, sub)) {
7464 		/* See if the sequence can be consumed directly */
7465 #ifdef DEBUG_DERIV
7466 		printf("Seq trying left only\n");
7467 #endif
7468 		ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7469 		if ((ret != forbiddenExp) && (ret != NULL)) {
7470 #ifdef DEBUG_DERIV
7471 		    printf("Seq trying left only worked\n");
7472 #endif
7473 		    /*
7474 		     * TODO: assumption here that we are determinist
7475 		     *       i.e. we won't get to a nillable exp left
7476 		     *       subset which could be matched by the right
7477 		     *       part too.
7478 		     * e.g.: (a | b)+,(a | c) and 'a+,a'
7479 		     */
7480 		    exp->exp_right->ref++;
7481 		    return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7482 					      exp->exp_right, NULL, 0, 0));
7483 		}
7484 #ifdef DEBUG_DERIV
7485 	    } else {
7486 		printf("Seq: left too short\n");
7487 #endif
7488 	    }
7489 	    /* Try instead to decompose */
7490 	    if (sub->type == XML_EXP_COUNT) {
7491 		int min, max;
7492 
7493 #ifdef DEBUG_DERIV
7494 		printf("Seq: sub is a count\n");
7495 #endif
7496 	        ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7497 		if (ret == NULL)
7498 		    return(NULL);
7499 		if (ret != forbiddenExp) {
7500 #ifdef DEBUG_DERIV
7501 		    printf("Seq , Count match on left\n");
7502 #endif
7503 		    if (sub->exp_max < 0)
7504 		        max = -1;
7505 	            else
7506 		        max = sub->exp_max -1;
7507 		    if (sub->exp_min > 0)
7508 		        min = sub->exp_min -1;
7509 		    else
7510 		        min = 0;
7511 		    exp->exp_right->ref++;
7512 		    tmp = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7513 		                             exp->exp_right, NULL, 0, 0);
7514 		    if (tmp == NULL)
7515 		        return(NULL);
7516 
7517 		    sub->exp_left->ref++;
7518 		    tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7519 				      sub->exp_left, NULL, NULL, min, max);
7520 		    if (tmp2 == NULL) {
7521 		        xmlExpFree(ctxt, tmp);
7522 			return(NULL);
7523 		    }
7524 		    ret = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7525 		    xmlExpFree(ctxt, tmp);
7526 		    xmlExpFree(ctxt, tmp2);
7527 		    return(ret);
7528 		}
7529 	    }
7530 	    /* we made no progress on structured operations */
7531 	    break;
7532         case XML_EXP_OR:
7533 #ifdef DEBUG_DERIV
7534 	    printf("Or , trying both side\n");
7535 #endif
7536 	    ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7537 	    if (ret == NULL)
7538 	        return(NULL);
7539 	    tmp = xmlExpExpDeriveInt(ctxt, exp->exp_right, sub);
7540 	    if (tmp == NULL) {
7541 		xmlExpFree(ctxt, ret);
7542 	        return(NULL);
7543 	    }
7544 	    return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp, NULL, 0, 0));
7545         case XML_EXP_COUNT: {
7546 	    int min, max;
7547 
7548 	    if (sub->type == XML_EXP_COUNT) {
7549 	        /*
7550 		 * Try to see if the loop is completely subsumed
7551 		 */
7552 	        tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7553 		if (tmp == NULL)
7554 		    return(NULL);
7555 		if (tmp == forbiddenExp) {
7556 		    int mult;
7557 
7558 #ifdef DEBUG_DERIV
7559 		    printf("Count, Count inner don't subsume\n");
7560 #endif
7561 		    mult = xmlExpDivide(ctxt, sub->exp_left, exp->exp_left,
7562 		                        NULL, &tmp);
7563 		    if (mult <= 0) {
7564 #ifdef DEBUG_DERIV
7565 			printf("Count, Count not multiple => forbidden\n");
7566 #endif
7567                         return(forbiddenExp);
7568 		    }
7569 		    if (sub->exp_max == -1) {
7570 		        max = -1;
7571 			if (exp->exp_max == -1) {
7572 			    if (exp->exp_min <= sub->exp_min * mult)
7573 			        min = 0;
7574 			    else
7575 			        min = exp->exp_min - sub->exp_min * mult;
7576 			} else {
7577 #ifdef DEBUG_DERIV
7578 			    printf("Count, Count finite can't subsume infinite\n");
7579 #endif
7580                             xmlExpFree(ctxt, tmp);
7581 			    return(forbiddenExp);
7582 			}
7583 		    } else {
7584 			if (exp->exp_max == -1) {
7585 #ifdef DEBUG_DERIV
7586 			    printf("Infinite loop consume mult finite loop\n");
7587 #endif
7588 			    if (exp->exp_min > sub->exp_min * mult) {
7589 				max = -1;
7590 				min = exp->exp_min - sub->exp_min * mult;
7591 			    } else {
7592 				max = -1;
7593 				min = 0;
7594 			    }
7595 			} else {
7596 			    if (exp->exp_max < sub->exp_max * mult) {
7597 #ifdef DEBUG_DERIV
7598 				printf("loops max mult mismatch => forbidden\n");
7599 #endif
7600 				xmlExpFree(ctxt, tmp);
7601 				return(forbiddenExp);
7602 			    }
7603 			    if (sub->exp_max * mult > exp->exp_min)
7604 				min = 0;
7605 			    else
7606 				min = exp->exp_min - sub->exp_max * mult;
7607 			    max = exp->exp_max - sub->exp_max * mult;
7608 			}
7609 		    }
7610 		} else if (!IS_NILLABLE(tmp)) {
7611 		    /*
7612 		     * TODO: loop here to try to grow if working on finite
7613 		     *       blocks.
7614 		     */
7615 #ifdef DEBUG_DERIV
7616 		    printf("Count, Count remain not nillable => forbidden\n");
7617 #endif
7618 		    xmlExpFree(ctxt, tmp);
7619 		    return(forbiddenExp);
7620 		} else if (sub->exp_max == -1) {
7621 		    if (exp->exp_max == -1) {
7622 		        if (exp->exp_min <= sub->exp_min) {
7623 #ifdef DEBUG_DERIV
7624 			    printf("Infinite loops Okay => COUNT(0,Inf)\n");
7625 #endif
7626                             max = -1;
7627 			    min = 0;
7628 			} else {
7629 #ifdef DEBUG_DERIV
7630 			    printf("Infinite loops min => Count(X,Inf)\n");
7631 #endif
7632                             max = -1;
7633 			    min = exp->exp_min - sub->exp_min;
7634 			}
7635 		    } else if (exp->exp_min > sub->exp_min) {
7636 #ifdef DEBUG_DERIV
7637 			printf("loops min mismatch 1 => forbidden ???\n");
7638 #endif
7639 		        xmlExpFree(ctxt, tmp);
7640 		        return(forbiddenExp);
7641 		    } else {
7642 			max = -1;
7643 			min = 0;
7644 		    }
7645 		} else {
7646 		    if (exp->exp_max == -1) {
7647 #ifdef DEBUG_DERIV
7648 			printf("Infinite loop consume finite loop\n");
7649 #endif
7650 		        if (exp->exp_min > sub->exp_min) {
7651 			    max = -1;
7652 			    min = exp->exp_min - sub->exp_min;
7653 			} else {
7654 			    max = -1;
7655 			    min = 0;
7656 			}
7657 		    } else {
7658 		        if (exp->exp_max < sub->exp_max) {
7659 #ifdef DEBUG_DERIV
7660 			    printf("loops max mismatch => forbidden\n");
7661 #endif
7662 			    xmlExpFree(ctxt, tmp);
7663 			    return(forbiddenExp);
7664 			}
7665 			if (sub->exp_max > exp->exp_min)
7666 			    min = 0;
7667 			else
7668 			    min = exp->exp_min - sub->exp_max;
7669 			max = exp->exp_max - sub->exp_max;
7670 		    }
7671 		}
7672 #ifdef DEBUG_DERIV
7673 		printf("loops match => SEQ(COUNT())\n");
7674 #endif
7675 		exp->exp_left->ref++;
7676 		tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7677 		                          NULL, NULL, min, max);
7678 		if (tmp2 == NULL) {
7679 		    return(NULL);
7680 		}
7681                 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7682 		                         NULL, 0, 0);
7683 		return(ret);
7684 	    }
7685 	    tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7686 	    if (tmp == NULL)
7687 		return(NULL);
7688 	    if (tmp == forbiddenExp) {
7689 #ifdef DEBUG_DERIV
7690 		printf("loop mismatch => forbidden\n");
7691 #endif
7692 		return(forbiddenExp);
7693 	    }
7694 	    if (exp->exp_min > 0)
7695 		min = exp->exp_min - 1;
7696 	    else
7697 		min = 0;
7698 	    if (exp->exp_max < 0)
7699 		max = -1;
7700 	    else
7701 		max = exp->exp_max - 1;
7702 
7703 #ifdef DEBUG_DERIV
7704 	    printf("loop match => SEQ(COUNT())\n");
7705 #endif
7706 	    exp->exp_left->ref++;
7707 	    tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7708 				      NULL, NULL, min, max);
7709 	    if (tmp2 == NULL)
7710 		return(NULL);
7711 	    ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7712 				     NULL, 0, 0);
7713 	    return(ret);
7714 	}
7715     }
7716 
7717 #ifdef DEBUG_DERIV
7718     printf("Fallback to derivative\n");
7719 #endif
7720     if (IS_NILLABLE(sub)) {
7721         if (!(IS_NILLABLE(exp)))
7722 	    return(forbiddenExp);
7723 	else
7724 	    ret = emptyExp;
7725     } else
7726 	ret = NULL;
7727     /*
7728      * here the structured derivation made no progress so
7729      * we use the default token based derivation to force one more step
7730      */
7731     if (ctxt->tabSize == 0)
7732         ctxt->tabSize = 40;
7733 
7734     tab = (const xmlChar **) xmlMalloc(ctxt->tabSize *
7735 	                               sizeof(const xmlChar *));
7736     if (tab == NULL) {
7737 	return(NULL);
7738     }
7739 
7740     /*
7741      * collect all the strings accepted by the subexpression on input
7742      */
7743     len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7744     while (len < 0) {
7745         const xmlChar **temp;
7746 	temp = (const xmlChar **) xmlRealloc((xmlChar **) tab, ctxt->tabSize * 2 *
7747 	                                     sizeof(const xmlChar *));
7748 	if (temp == NULL) {
7749 	    xmlFree((xmlChar **) tab);
7750 	    return(NULL);
7751 	}
7752 	tab = temp;
7753 	ctxt->tabSize *= 2;
7754 	len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7755     }
7756     for (i = 0;i < len;i++) {
7757         tmp = xmlExpStringDeriveInt(ctxt, exp, tab[i]);
7758 	if ((tmp == NULL) || (tmp == forbiddenExp)) {
7759 	    xmlExpFree(ctxt, ret);
7760 	    xmlFree((xmlChar **) tab);
7761 	    return(tmp);
7762 	}
7763 	tmp2 = xmlExpStringDeriveInt(ctxt, sub, tab[i]);
7764 	if ((tmp2 == NULL) || (tmp2 == forbiddenExp)) {
7765 	    xmlExpFree(ctxt, tmp);
7766 	    xmlExpFree(ctxt, ret);
7767 	    xmlFree((xmlChar **) tab);
7768 	    return(tmp);
7769 	}
7770 	tmp3 = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7771 	xmlExpFree(ctxt, tmp);
7772 	xmlExpFree(ctxt, tmp2);
7773 
7774 	if ((tmp3 == NULL) || (tmp3 == forbiddenExp)) {
7775 	    xmlExpFree(ctxt, ret);
7776 	    xmlFree((xmlChar **) tab);
7777 	    return(tmp3);
7778 	}
7779 
7780 	if (ret == NULL)
7781 	    ret = tmp3;
7782 	else {
7783 	    ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp3, NULL, 0, 0);
7784 	    if (ret == NULL) {
7785 		xmlFree((xmlChar **) tab);
7786 	        return(NULL);
7787 	    }
7788 	}
7789     }
7790     xmlFree((xmlChar **) tab);
7791     return(ret);
7792 }
7793 
7794 /**
7795  * xmlExpExpDerive:
7796  * @ctxt: the expressions context
7797  * @exp: the englobing expression
7798  * @sub: the subexpression
7799  *
7800  * Evaluates the expression resulting from @exp consuming a sub expression @sub
7801  * Based on algebraic derivation and sometimes direct Brzozowski derivation
7802  * it usually tatkes less than linear time and can handle expressions generating
7803  * infinite languages.
7804  *
7805  * Returns the resulting expression or NULL in case of internal error, the
7806  *         result must be freed
7807  */
7808 xmlExpNodePtr
xmlExpExpDerive(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,xmlExpNodePtr sub)7809 xmlExpExpDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7810     if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7811         return(NULL);
7812 
7813     /*
7814      * O(1) speedups
7815      */
7816     if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7817 #ifdef DEBUG_DERIV
7818 	printf("Sub nillable and not exp : can't subsume\n");
7819 #endif
7820         return(forbiddenExp);
7821     }
7822     if (xmlExpCheckCard(exp, sub) == 0) {
7823 #ifdef DEBUG_DERIV
7824 	printf("sub generate longuer sequances than exp : can't subsume\n");
7825 #endif
7826         return(forbiddenExp);
7827     }
7828     return(xmlExpExpDeriveInt(ctxt, exp, sub));
7829 }
7830 
7831 /**
7832  * xmlExpSubsume:
7833  * @ctxt: the expressions context
7834  * @exp: the englobing expression
7835  * @sub: the subexpression
7836  *
7837  * Check whether @exp accepts all the languages accexpted by @sub
7838  * the input being a subexpression.
7839  *
7840  * Returns 1 if true 0 if false and -1 in case of failure.
7841  */
7842 int
xmlExpSubsume(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,xmlExpNodePtr sub)7843 xmlExpSubsume(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7844     xmlExpNodePtr tmp;
7845 
7846     if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7847         return(-1);
7848 
7849     /*
7850      * TODO: speedup by checking the language of sub is a subset of the
7851      *       language of exp
7852      */
7853     /*
7854      * O(1) speedups
7855      */
7856     if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7857 #ifdef DEBUG_DERIV
7858 	printf("Sub nillable and not exp : can't subsume\n");
7859 #endif
7860         return(0);
7861     }
7862     if (xmlExpCheckCard(exp, sub) == 0) {
7863 #ifdef DEBUG_DERIV
7864 	printf("sub generate longuer sequances than exp : can't subsume\n");
7865 #endif
7866         return(0);
7867     }
7868     tmp = xmlExpExpDeriveInt(ctxt, exp, sub);
7869 #ifdef DEBUG_DERIV
7870     printf("Result derivation :\n");
7871     PRINT_EXP(tmp);
7872 #endif
7873     if (tmp == NULL)
7874         return(-1);
7875     if (tmp == forbiddenExp)
7876 	return(0);
7877     if (tmp == emptyExp)
7878 	return(1);
7879     if ((tmp != NULL) && (IS_NILLABLE(tmp))) {
7880         xmlExpFree(ctxt, tmp);
7881         return(1);
7882     }
7883     xmlExpFree(ctxt, tmp);
7884     return(0);
7885 }
7886 
7887 /************************************************************************
7888  *									*
7889  *			Parsing expression				*
7890  *									*
7891  ************************************************************************/
7892 
7893 static xmlExpNodePtr xmlExpParseExpr(xmlExpCtxtPtr ctxt);
7894 
/*
 * Cursor helpers for the expression parser below. CUR, NEXT and
 * SKIP_BLANKS expect a variable named ctxt to be in scope, whose cur
 * field points at the next character to parse.
 *
 * NEXT and SKIP_BLANKS expand to complete statements, trailing ';'
 * included, because the parser invokes them without a semicolon.
 * IS_BLANK evaluates its argument several times, so the argument must be
 * free of side effects; it is parenthesized so that compound expressions
 * are compared as a whole.
 */
#undef CUR
#define CUR (*ctxt->cur)
#undef NEXT
#define NEXT ctxt->cur++;
#undef IS_BLANK
#define IS_BLANK(c) (((c) == ' ') || ((c) == '\n') || ((c) == '\r') || ((c) == '\t'))
#define SKIP_BLANKS while (IS_BLANK(*ctxt->cur)) ctxt->cur++;
7902 
7903 static int
xmlExpParseNumber(xmlExpCtxtPtr ctxt)7904 xmlExpParseNumber(xmlExpCtxtPtr ctxt) {
7905     int ret = 0;
7906 
7907     SKIP_BLANKS
7908     if (CUR == '*') {
7909 	NEXT
7910 	return(-1);
7911     }
7912     if ((CUR < '0') || (CUR > '9'))
7913         return(-1);
7914     while ((CUR >= '0') && (CUR <= '9')) {
7915         ret = ret * 10 + (CUR - '0');
7916 	NEXT
7917     }
7918     return(ret);
7919 }
7920 
7921 static xmlExpNodePtr
xmlExpParseOr(xmlExpCtxtPtr ctxt)7922 xmlExpParseOr(xmlExpCtxtPtr ctxt) {
7923     const char *base;
7924     xmlExpNodePtr ret;
7925     const xmlChar *val;
7926 
7927     SKIP_BLANKS
7928     base = ctxt->cur;
7929     if (*ctxt->cur == '(') {
7930         NEXT
7931 	ret = xmlExpParseExpr(ctxt);
7932 	SKIP_BLANKS
7933 	if (*ctxt->cur != ')') {
7934 	    fprintf(stderr, "unbalanced '(' : %s\n", base);
7935 	    xmlExpFree(ctxt, ret);
7936 	    return(NULL);
7937 	}
7938 	NEXT;
7939 	SKIP_BLANKS
7940 	goto parse_quantifier;
7941     }
7942     while ((CUR != 0) && (!(IS_BLANK(CUR))) && (CUR != '(') &&
7943            (CUR != ')') && (CUR != '|') && (CUR != ',') && (CUR != '{') &&
7944 	   (CUR != '*') && (CUR != '+') && (CUR != '?') && (CUR != '}'))
7945 	NEXT;
7946     val = xmlDictLookup(ctxt->dict, BAD_CAST base, ctxt->cur - base);
7947     if (val == NULL)
7948         return(NULL);
7949     ret = xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, val, 0, 0);
7950     if (ret == NULL)
7951         return(NULL);
7952     SKIP_BLANKS
7953 parse_quantifier:
7954     if (CUR == '{') {
7955         int min, max;
7956 
7957         NEXT
7958 	min = xmlExpParseNumber(ctxt);
7959 	if (min < 0) {
7960 	    xmlExpFree(ctxt, ret);
7961 	    return(NULL);
7962 	}
7963 	SKIP_BLANKS
7964 	if (CUR == ',') {
7965 	    NEXT
7966 	    max = xmlExpParseNumber(ctxt);
7967 	    SKIP_BLANKS
7968 	} else
7969 	    max = min;
7970 	if (CUR != '}') {
7971 	    xmlExpFree(ctxt, ret);
7972 	    return(NULL);
7973 	}
7974         NEXT
7975 	ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7976 	                         min, max);
7977 	SKIP_BLANKS
7978     } else if (CUR == '?') {
7979         NEXT
7980 	ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7981 	                         0, 1);
7982 	SKIP_BLANKS
7983     } else if (CUR == '+') {
7984         NEXT
7985 	ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7986 	                         1, -1);
7987 	SKIP_BLANKS
7988     } else if (CUR == '*') {
7989         NEXT
7990 	ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7991 	                         0, -1);
7992 	SKIP_BLANKS
7993     }
7994     return(ret);
7995 }
7996 
7997 
7998 static xmlExpNodePtr
xmlExpParseSeq(xmlExpCtxtPtr ctxt)7999 xmlExpParseSeq(xmlExpCtxtPtr ctxt) {
8000     xmlExpNodePtr ret, right;
8001 
8002     ret = xmlExpParseOr(ctxt);
8003     SKIP_BLANKS
8004     while (CUR == '|') {
8005         NEXT
8006 	right = xmlExpParseOr(ctxt);
8007 	if (right == NULL) {
8008 	    xmlExpFree(ctxt, ret);
8009 	    return(NULL);
8010 	}
8011 	ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, right, NULL, 0, 0);
8012 	if (ret == NULL)
8013 	    return(NULL);
8014     }
8015     return(ret);
8016 }
8017 
8018 static xmlExpNodePtr
xmlExpParseExpr(xmlExpCtxtPtr ctxt)8019 xmlExpParseExpr(xmlExpCtxtPtr ctxt) {
8020     xmlExpNodePtr ret, right;
8021 
8022     ret = xmlExpParseSeq(ctxt);
8023     SKIP_BLANKS
8024     while (CUR == ',') {
8025         NEXT
8026 	right = xmlExpParseSeq(ctxt);
8027 	if (right == NULL) {
8028 	    xmlExpFree(ctxt, ret);
8029 	    return(NULL);
8030 	}
8031 	ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, right, NULL, 0, 0);
8032 	if (ret == NULL)
8033 	    return(NULL);
8034     }
8035     return(ret);
8036 }
8037 
8038 /**
8039  * xmlExpParse:
8040  * @ctxt: the expressions context
8041  * @expr: the 0 terminated string
8042  *
8043  * Minimal parser for regexps, it understand the following constructs
8044  *  - string terminals
8045  *  - choice operator |
8046  *  - sequence operator ,
8047  *  - subexpressions (...)
8048  *  - usual cardinality operators + * and ?
8049  *  - finite sequences  { min, max }
8050  *  - infinite sequences { min, * }
8051  * There is minimal checkings made especially no checking on strings values
8052  *
8053  * Returns a new expression or NULL in case of failure
8054  */
8055 xmlExpNodePtr
xmlExpParse(xmlExpCtxtPtr ctxt,const char * expr)8056 xmlExpParse(xmlExpCtxtPtr ctxt, const char *expr) {
8057     xmlExpNodePtr ret;
8058 
8059     ctxt->expr = expr;
8060     ctxt->cur = expr;
8061 
8062     ret = xmlExpParseExpr(ctxt);
8063     SKIP_BLANKS
8064     if (*ctxt->cur != 0) {
8065         xmlExpFree(ctxt, ret);
8066         return(NULL);
8067     }
8068     return(ret);
8069 }
8070 
8071 static void
xmlExpDumpInt(xmlBufferPtr buf,xmlExpNodePtr expr,int glob)8072 xmlExpDumpInt(xmlBufferPtr buf, xmlExpNodePtr expr, int glob) {
8073     xmlExpNodePtr c;
8074 
8075     if (expr == NULL) return;
8076     if (glob) xmlBufferWriteChar(buf, "(");
8077     switch (expr->type) {
8078         case XML_EXP_EMPTY:
8079 	    xmlBufferWriteChar(buf, "empty");
8080 	    break;
8081         case XML_EXP_FORBID:
8082 	    xmlBufferWriteChar(buf, "forbidden");
8083 	    break;
8084         case XML_EXP_ATOM:
8085 	    xmlBufferWriteCHAR(buf, expr->exp_str);
8086 	    break;
8087         case XML_EXP_SEQ:
8088 	    c = expr->exp_left;
8089 	    if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8090 	        xmlExpDumpInt(buf, c, 1);
8091 	    else
8092 	        xmlExpDumpInt(buf, c, 0);
8093 	    xmlBufferWriteChar(buf, " , ");
8094 	    c = expr->exp_right;
8095 	    if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8096 	        xmlExpDumpInt(buf, c, 1);
8097 	    else
8098 	        xmlExpDumpInt(buf, c, 0);
8099             break;
8100         case XML_EXP_OR:
8101 	    c = expr->exp_left;
8102 	    if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8103 	        xmlExpDumpInt(buf, c, 1);
8104 	    else
8105 	        xmlExpDumpInt(buf, c, 0);
8106 	    xmlBufferWriteChar(buf, " | ");
8107 	    c = expr->exp_right;
8108 	    if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8109 	        xmlExpDumpInt(buf, c, 1);
8110 	    else
8111 	        xmlExpDumpInt(buf, c, 0);
8112             break;
8113         case XML_EXP_COUNT: {
8114 	    char rep[40];
8115 
8116 	    c = expr->exp_left;
8117 	    if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8118 	        xmlExpDumpInt(buf, c, 1);
8119 	    else
8120 	        xmlExpDumpInt(buf, c, 0);
8121 	    if ((expr->exp_min == 0) && (expr->exp_max == 1)) {
8122 		rep[0] = '?';
8123 		rep[1] = 0;
8124 	    } else if ((expr->exp_min == 0) && (expr->exp_max == -1)) {
8125 		rep[0] = '*';
8126 		rep[1] = 0;
8127 	    } else if ((expr->exp_min == 1) && (expr->exp_max == -1)) {
8128 		rep[0] = '+';
8129 		rep[1] = 0;
8130 	    } else if (expr->exp_max == expr->exp_min) {
8131 	        snprintf(rep, 39, "{%d}", expr->exp_min);
8132 	    } else if (expr->exp_max < 0) {
8133 	        snprintf(rep, 39, "{%d,inf}", expr->exp_min);
8134 	    } else {
8135 	        snprintf(rep, 39, "{%d,%d}", expr->exp_min, expr->exp_max);
8136 	    }
8137 	    rep[39] = 0;
8138 	    xmlBufferWriteChar(buf, rep);
8139 	    break;
8140 	}
8141 	default:
8142 	    fprintf(stderr, "Error in tree\n");
8143     }
8144     if (glob)
8145         xmlBufferWriteChar(buf, ")");
8146 }
8147 /**
8148  * xmlExpDump:
8149  * @buf:  a buffer to receive the output
8150  * @expr:  the compiled expression
8151  *
8152  * Serialize the expression as compiled to the buffer
8153  */
8154 void
xmlExpDump(xmlBufferPtr buf,xmlExpNodePtr expr)8155 xmlExpDump(xmlBufferPtr buf, xmlExpNodePtr expr) {
8156     if ((buf == NULL) || (expr == NULL))
8157         return;
8158     xmlExpDumpInt(buf, expr, 0);
8159 }
8160 
8161 /**
8162  * xmlExpMaxToken:
8163  * @expr: a compiled expression
8164  *
8165  * Indicate the maximum number of input a expression can accept
8166  *
8167  * Returns the maximum length or -1 in case of error
8168  */
8169 int
xmlExpMaxToken(xmlExpNodePtr expr)8170 xmlExpMaxToken(xmlExpNodePtr expr) {
8171     if (expr == NULL)
8172         return(-1);
8173     return(expr->c_max);
8174 }
8175 
8176 /**
8177  * xmlExpCtxtNbNodes:
8178  * @ctxt: an expression context
8179  *
8180  * Debugging facility provides the number of allocated nodes at a that point
8181  *
8182  * Returns the number of nodes in use or -1 in case of error
8183  */
8184 int
xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt)8185 xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt) {
8186     if (ctxt == NULL)
8187         return(-1);
8188     return(ctxt->nb_nodes);
8189 }
8190 
8191 /**
8192  * xmlExpCtxtNbCons:
8193  * @ctxt: an expression context
8194  *
8195  * Debugging facility provides the number of allocated nodes over lifetime
8196  *
8197  * Returns the number of nodes ever allocated or -1 in case of error
8198  */
8199 int
xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt)8200 xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt) {
8201     if (ctxt == NULL)
8202         return(-1);
8203     return(ctxt->nb_cons);
8204 }
8205 
8206 #endif /* LIBXML_EXPR_ENABLED */
8207 #define bottom_xmlregexp
8208 #include "elfgcchack.h"
8209 #endif /* LIBXML_REGEXP_ENABLED */
8210