1 /*
2  * regexp.c: generic and extensible Regular Expression engine
3  *
4  * Basically designed with the purpose of compiling regexps for
5  * the variety of validation/shemas mechanisms now available in
6  * XML related specifications these include:
7  *    - XML-1.0 DTD validation
8  *    - XML Schemas structure part 1
9  *    - XML Schemas Datatypes part 2 especially Appendix F
10  *    - RELAX-NG/TREX i.e. the counter proposal
11  *
12  * See Copyright for the status of this software.
13  *
14  * Daniel Veillard <veillard@redhat.com>
15  */
16 
17 #define IN_LIBXML
18 #include "libxml.h"
19 
20 #ifdef LIBXML_REGEXP_ENABLED
21 
22 /* #define DEBUG_ERR */
23 
24 #include <stdio.h>
25 #include <string.h>
26 #ifdef HAVE_LIMITS_H
27 #include <limits.h>
28 #endif
29 
30 #include <libxml/tree.h>
31 #include <libxml/parserInternals.h>
32 #include <libxml/xmlregexp.h>
33 #include <libxml/xmlautomata.h>
34 #include <libxml/xmlunicode.h>
35 
36 #ifndef INT_MAX
37 #define INT_MAX 123456789 /* easy to flag and big enough for our needs */
38 #endif
39 
40 /* #define DEBUG_REGEXP_GRAPH */
41 /* #define DEBUG_REGEXP_EXEC */
42 /* #define DEBUG_PUSH */
43 /* #define DEBUG_COMPACTION */
44 
45 #define MAX_PUSH 10000000
46 
47 #define ERROR(str)                                                      \
48     ctxt->error = XML_REGEXP_COMPILE_ERROR;                             \
49     xmlRegexpErrCompile(ctxt, str);
50 #define NEXT ctxt->cur++
51 #define CUR (*(ctxt->cur))
52 #define NXT(index) (ctxt->cur[index])
53 
54 #define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l)
55 #define NEXTL(l) ctxt->cur += l;
56 #define XML_REG_STRING_SEPARATOR '|'
57 
58 /**
59  * TODO:
60  *
61  * macro to flag unimplemented blocks
62  */
63 #define TODO                                                            \
64     xmlGenericError(xmlGenericErrorContext,                             \
65             "Unimplemented block at %s:%d\n",                           \
66             __FILE__, __LINE__);
67 
68 /************************************************************************
69  *                                                                      *
70  *                      Datatypes and structures                        *
71  *                                                                      *
72  ************************************************************************/
73 
74 /*
75  * Note: the order of the enums below is significant, do not shuffle
76  */
77 typedef enum {
78     XML_REGEXP_EPSILON = 1,
79     XML_REGEXP_CHARVAL,
80     XML_REGEXP_RANGES,
81     XML_REGEXP_SUBREG,  /* used for () sub regexps */
82     XML_REGEXP_STRING,
83     XML_REGEXP_ANYCHAR, /* . */
84     XML_REGEXP_ANYSPACE, /* \s */
85     XML_REGEXP_NOTSPACE, /* \S */
86     XML_REGEXP_INITNAME, /* \l */
87     XML_REGEXP_NOTINITNAME, /* \L */
88     XML_REGEXP_NAMECHAR, /* \c */
89     XML_REGEXP_NOTNAMECHAR, /* \C */
90     XML_REGEXP_DECIMAL, /* \d */
91     XML_REGEXP_NOTDECIMAL, /* \D */
92     XML_REGEXP_REALCHAR, /* \w */
93     XML_REGEXP_NOTREALCHAR, /* \W */
94     XML_REGEXP_LETTER = 100,
95     XML_REGEXP_LETTER_UPPERCASE,
96     XML_REGEXP_LETTER_LOWERCASE,
97     XML_REGEXP_LETTER_TITLECASE,
98     XML_REGEXP_LETTER_MODIFIER,
99     XML_REGEXP_LETTER_OTHERS,
100     XML_REGEXP_MARK,
101     XML_REGEXP_MARK_NONSPACING,
102     XML_REGEXP_MARK_SPACECOMBINING,
103     XML_REGEXP_MARK_ENCLOSING,
104     XML_REGEXP_NUMBER,
105     XML_REGEXP_NUMBER_DECIMAL,
106     XML_REGEXP_NUMBER_LETTER,
107     XML_REGEXP_NUMBER_OTHERS,
108     XML_REGEXP_PUNCT,
109     XML_REGEXP_PUNCT_CONNECTOR,
110     XML_REGEXP_PUNCT_DASH,
111     XML_REGEXP_PUNCT_OPEN,
112     XML_REGEXP_PUNCT_CLOSE,
113     XML_REGEXP_PUNCT_INITQUOTE,
114     XML_REGEXP_PUNCT_FINQUOTE,
115     XML_REGEXP_PUNCT_OTHERS,
116     XML_REGEXP_SEPAR,
117     XML_REGEXP_SEPAR_SPACE,
118     XML_REGEXP_SEPAR_LINE,
119     XML_REGEXP_SEPAR_PARA,
120     XML_REGEXP_SYMBOL,
121     XML_REGEXP_SYMBOL_MATH,
122     XML_REGEXP_SYMBOL_CURRENCY,
123     XML_REGEXP_SYMBOL_MODIFIER,
124     XML_REGEXP_SYMBOL_OTHERS,
125     XML_REGEXP_OTHER,
126     XML_REGEXP_OTHER_CONTROL,
127     XML_REGEXP_OTHER_FORMAT,
128     XML_REGEXP_OTHER_PRIVATE,
129     XML_REGEXP_OTHER_NA,
130     XML_REGEXP_BLOCK_NAME
131 } xmlRegAtomType;
132 
133 typedef enum {
134     XML_REGEXP_QUANT_EPSILON = 1,
135     XML_REGEXP_QUANT_ONCE,
136     XML_REGEXP_QUANT_OPT,
137     XML_REGEXP_QUANT_MULT,
138     XML_REGEXP_QUANT_PLUS,
139     XML_REGEXP_QUANT_ONCEONLY,
140     XML_REGEXP_QUANT_ALL,
141     XML_REGEXP_QUANT_RANGE
142 } xmlRegQuantType;
143 
144 typedef enum {
145     XML_REGEXP_START_STATE = 1,
146     XML_REGEXP_FINAL_STATE,
147     XML_REGEXP_TRANS_STATE,
148     XML_REGEXP_SINK_STATE
149 } xmlRegStateType;
150 
151 typedef enum {
152     XML_REGEXP_MARK_NORMAL = 0,
153     XML_REGEXP_MARK_START,
154     XML_REGEXP_MARK_VISITED
155 } xmlRegMarkedType;
156 
157 typedef struct _xmlRegRange xmlRegRange;
158 typedef xmlRegRange *xmlRegRangePtr;
159 
160 struct _xmlRegRange {
161     int neg;            /* 0 normal, 1 not, 2 exclude */
162     xmlRegAtomType type;
163     int start;
164     int end;
165     xmlChar *blockName;
166 };
167 
168 typedef struct _xmlRegAtom xmlRegAtom;
169 typedef xmlRegAtom *xmlRegAtomPtr;
170 
171 typedef struct _xmlAutomataState xmlRegState;
172 typedef xmlRegState *xmlRegStatePtr;
173 
174 struct _xmlRegAtom {
175     int no;
176     xmlRegAtomType type;
177     xmlRegQuantType quant;
178     int min;
179     int max;
180 
181     void *valuep;
182     void *valuep2;
183     int neg;
184     int codepoint;
185     xmlRegStatePtr start;
186     xmlRegStatePtr stop;
187     int maxRanges;
188     int nbRanges;
189     xmlRegRangePtr *ranges;
190     void *data;
191 };
192 
193 typedef struct _xmlRegCounter xmlRegCounter;
194 typedef xmlRegCounter *xmlRegCounterPtr;
195 
196 struct _xmlRegCounter {
197     int min;
198     int max;
199 };
200 
201 typedef struct _xmlRegTrans xmlRegTrans;
202 typedef xmlRegTrans *xmlRegTransPtr;
203 
204 struct _xmlRegTrans {
205     xmlRegAtomPtr atom;
206     int to;
207     int counter;
208     int count;
209     int nd;
210 };
211 
212 struct _xmlAutomataState {
213     xmlRegStateType type;
214     xmlRegMarkedType mark;
215     xmlRegMarkedType reached;
216     int no;
217     int maxTrans;
218     int nbTrans;
219     xmlRegTrans *trans;
220     /*  knowing states ponting to us can speed things up */
221     int maxTransTo;
222     int nbTransTo;
223     int *transTo;
224 };
225 
226 typedef struct _xmlAutomata xmlRegParserCtxt;
227 typedef xmlRegParserCtxt *xmlRegParserCtxtPtr;
228 
229 struct _xmlAutomata {
230     xmlChar *string;
231     xmlChar *cur;
232 
233     int error;
234     int neg;
235 
236     xmlRegStatePtr start;
237     xmlRegStatePtr end;
238     xmlRegStatePtr state;
239 
240     xmlRegAtomPtr atom;
241 
242     int maxAtoms;
243     int nbAtoms;
244     xmlRegAtomPtr *atoms;
245 
246     int maxStates;
247     int nbStates;
248     xmlRegStatePtr *states;
249 
250     int maxCounters;
251     int nbCounters;
252     xmlRegCounter *counters;
253 
254     int determinist;
255     int negs;
256 };
257 
258 struct _xmlRegexp {
259     xmlChar *string;
260     int nbStates;
261     xmlRegStatePtr *states;
262     int nbAtoms;
263     xmlRegAtomPtr *atoms;
264     int nbCounters;
265     xmlRegCounter *counters;
266     int determinist;
267     /*
268      * That's the compact form for determinists automatas
269      */
270     int nbstates;
271     int *compact;
272     void **transdata;
273     int nbstrings;
274     xmlChar **stringMap;
275 };
276 
277 typedef struct _xmlRegExecRollback xmlRegExecRollback;
278 typedef xmlRegExecRollback *xmlRegExecRollbackPtr;
279 
280 struct _xmlRegExecRollback {
281     xmlRegStatePtr state;/* the current state */
282     int index;          /* the index in the input stack */
283     int nextbranch;     /* the next transition to explore in that state */
284     int *counts;        /* save the automata state if it has some */
285 };
286 
287 typedef struct _xmlRegInputToken xmlRegInputToken;
288 typedef xmlRegInputToken *xmlRegInputTokenPtr;
289 
290 struct _xmlRegInputToken {
291     xmlChar *value;
292     void *data;
293 };
294 
295 struct _xmlRegExecCtxt {
296     int status;         /* execution status != 0 indicate an error */
297     int determinist;    /* did we find an indeterministic behaviour */
298     xmlRegexpPtr comp;  /* the compiled regexp */
299     xmlRegExecCallbacks callback;
300     void *data;
301 
302     xmlRegStatePtr state;/* the current state */
303     int transno;        /* the current transition on that state */
304     int transcount;     /* the number of chars in char counted transitions */
305 
306     /*
307      * A stack of rollback states
308      */
309     int maxRollbacks;
310     int nbRollbacks;
311     xmlRegExecRollback *rollbacks;
312 
313     /*
314      * The state of the automata if any
315      */
316     int *counts;
317 
318     /*
319      * The input stack
320      */
321     int inputStackMax;
322     int inputStackNr;
323     int index;
324     int *charStack;
325     const xmlChar *inputString; /* when operating on characters */
326     xmlRegInputTokenPtr inputStack;/* when operating on strings */
327 
328     /*
329      * error handling
330      */
331     int errStateNo;             /* the error state number */
332     xmlRegStatePtr errState;    /* the error state */
333     xmlChar *errString;         /* the string raising the error */
334     int *errCounts;             /* counters at the error state */
335     int nbPush;
336 };
337 
338 #define REGEXP_ALL_COUNTER      0x123456
339 #define REGEXP_ALL_LAX_COUNTER  0x123457
340 
341 static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
342 static void xmlRegFreeState(xmlRegStatePtr state);
343 static void xmlRegFreeAtom(xmlRegAtomPtr atom);
344 static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr);
345 static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint);
346 static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint,
347                   int neg, int start, int end, const xmlChar *blockName);
348 
349 /************************************************************************
350  *                                                                      *
351  *              Regexp memory error handler                             *
352  *                                                                      *
353  ************************************************************************/
354 /**
355  * xmlRegexpErrMemory:
356  * @extra:  extra information
357  *
358  * Handle an out of memory condition
359  */
360 static void
xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt,const char * extra)361 xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra)
362 {
363     const char *regexp = NULL;
364     if (ctxt != NULL) {
365         regexp = (const char *) ctxt->string;
366         ctxt->error = XML_ERR_NO_MEMORY;
367     }
368     __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
369                     XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
370                     regexp, NULL, 0, 0,
371                     "Memory allocation failed : %s\n", extra);
372 }
373 
374 /**
375  * xmlRegexpErrCompile:
376  * @extra:  extra information
377  *
378  * Handle a compilation failure
379  */
380 static void
xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt,const char * extra)381 xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra)
382 {
383     const char *regexp = NULL;
384     int idx = 0;
385 
386     if (ctxt != NULL) {
387         regexp = (const char *) ctxt->string;
388         idx = ctxt->cur - ctxt->string;
389         ctxt->error = XML_REGEXP_COMPILE_ERROR;
390     }
391     __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
392                     XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra,
393                     regexp, NULL, idx, 0,
394                     "failed to compile: %s\n", extra);
395 }
396 
397 /************************************************************************
398  *                                                                      *
399  *                      Allocation/Deallocation                         *
400  *                                                                      *
401  ************************************************************************/
402 
403 static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
404 /**
405  * xmlRegEpxFromParse:
406  * @ctxt:  the parser context used to build it
407  *
408  * Allocate a new regexp and fill it with the result from the parser
409  *
410  * Returns the new regexp or NULL in case of error
411  */
412 static xmlRegexpPtr
xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt)413 xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
414     xmlRegexpPtr ret;
415 
416     ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp));
417     if (ret == NULL) {
418         xmlRegexpErrMemory(ctxt, "compiling regexp");
419         return(NULL);
420     }
421     memset(ret, 0, sizeof(xmlRegexp));
422     ret->string = ctxt->string;
423     ret->nbStates = ctxt->nbStates;
424     ret->states = ctxt->states;
425     ret->nbAtoms = ctxt->nbAtoms;
426     ret->atoms = ctxt->atoms;
427     ret->nbCounters = ctxt->nbCounters;
428     ret->counters = ctxt->counters;
429     ret->determinist = ctxt->determinist;
430     if (ret->determinist == -1) {
431         xmlRegexpIsDeterminist(ret);
432     }
433 
434     if ((ret->determinist != 0) &&
435         (ret->nbCounters == 0) &&
436         (ctxt->negs == 0) &&
437         (ret->atoms != NULL) &&
438         (ret->atoms[0] != NULL) &&
439         (ret->atoms[0]->type == XML_REGEXP_STRING)) {
440         int i, j, nbstates = 0, nbatoms = 0;
441         int *stateRemap;
442         int *stringRemap;
443         int *transitions;
444         void **transdata;
445         xmlChar **stringMap;
446         xmlChar *value;
447 
448         /*
449          * Switch to a compact representation
450          * 1/ counting the effective number of states left
451          * 2/ counting the unique number of atoms, and check that
452          *    they are all of the string type
453          * 3/ build a table state x atom for the transitions
454          */
455 
456         stateRemap = xmlMalloc(ret->nbStates * sizeof(int));
457         if (stateRemap == NULL) {
458             xmlRegexpErrMemory(ctxt, "compiling regexp");
459             xmlFree(ret);
460             return(NULL);
461         }
462         for (i = 0;i < ret->nbStates;i++) {
463             if (ret->states[i] != NULL) {
464                 stateRemap[i] = nbstates;
465                 nbstates++;
466             } else {
467                 stateRemap[i] = -1;
468             }
469         }
470 #ifdef DEBUG_COMPACTION
471         printf("Final: %d states\n", nbstates);
472 #endif
473         stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *));
474         if (stringMap == NULL) {
475             xmlRegexpErrMemory(ctxt, "compiling regexp");
476             xmlFree(stateRemap);
477             xmlFree(ret);
478             return(NULL);
479         }
480         stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int));
481         if (stringRemap == NULL) {
482             xmlRegexpErrMemory(ctxt, "compiling regexp");
483             xmlFree(stringMap);
484             xmlFree(stateRemap);
485             xmlFree(ret);
486             return(NULL);
487         }
488         for (i = 0;i < ret->nbAtoms;i++) {
489             if ((ret->atoms[i]->type == XML_REGEXP_STRING) &&
490                 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) {
491                 value = ret->atoms[i]->valuep;
492                 for (j = 0;j < nbatoms;j++) {
493                     if (xmlStrEqual(stringMap[j], value)) {
494                         stringRemap[i] = j;
495                         break;
496                     }
497                 }
498                 if (j >= nbatoms) {
499                     stringRemap[i] = nbatoms;
500                     stringMap[nbatoms] = xmlStrdup(value);
501                     if (stringMap[nbatoms] == NULL) {
502                         for (i = 0;i < nbatoms;i++)
503                             xmlFree(stringMap[i]);
504                         xmlFree(stringRemap);
505                         xmlFree(stringMap);
506                         xmlFree(stateRemap);
507                         xmlFree(ret);
508                         return(NULL);
509                     }
510                     nbatoms++;
511                 }
512             } else {
513                 xmlFree(stateRemap);
514                 xmlFree(stringRemap);
515                 for (i = 0;i < nbatoms;i++)
516                     xmlFree(stringMap[i]);
517                 xmlFree(stringMap);
518                 xmlFree(ret);
519                 return(NULL);
520             }
521         }
522 #ifdef DEBUG_COMPACTION
523         printf("Final: %d atoms\n", nbatoms);
524 #endif
525         transitions = (int *) xmlMalloc((nbstates + 1) *
526                                         (nbatoms + 1) * sizeof(int));
527         if (transitions == NULL) {
528             xmlFree(stateRemap);
529             xmlFree(stringRemap);
530             xmlFree(stringMap);
531             xmlFree(ret);
532             return(NULL);
533         }
534         memset(transitions, 0, (nbstates + 1) * (nbatoms + 1) * sizeof(int));
535 
536         /*
537          * Allocate the transition table. The first entry for each
538          * state corresponds to the state type.
539          */
540         transdata = NULL;
541 
542         for (i = 0;i < ret->nbStates;i++) {
543             int stateno, atomno, targetno, prev;
544             xmlRegStatePtr state;
545             xmlRegTransPtr trans;
546 
547             stateno = stateRemap[i];
548             if (stateno == -1)
549                 continue;
550             state = ret->states[i];
551 
552             transitions[stateno * (nbatoms + 1)] = state->type;
553 
554             for (j = 0;j < state->nbTrans;j++) {
555                 trans = &(state->trans[j]);
556                 if ((trans->to == -1) || (trans->atom == NULL))
557                     continue;
558                 atomno = stringRemap[trans->atom->no];
559                 if ((trans->atom->data != NULL) && (transdata == NULL)) {
560                     transdata = (void **) xmlMalloc(nbstates * nbatoms *
561                                                     sizeof(void *));
562                     if (transdata != NULL)
563                         memset(transdata, 0,
564                                nbstates * nbatoms * sizeof(void *));
565                     else {
566                         xmlRegexpErrMemory(ctxt, "compiling regexp");
567                         break;
568                     }
569                 }
570                 targetno = stateRemap[trans->to];
571                 /*
572                  * if the same atom can generate transitions to 2 different
573                  * states then it means the automata is not determinist and
574                  * the compact form can't be used !
575                  */
576                 prev = transitions[stateno * (nbatoms + 1) + atomno + 1];
577                 if (prev != 0) {
578                     if (prev != targetno + 1) {
579                         ret->determinist = 0;
580 #ifdef DEBUG_COMPACTION
581                         printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n",
582                                i, j, trans->atom->no, trans->to, atomno, targetno);
583                         printf("       previous to is %d\n", prev);
584 #endif
585                         if (transdata != NULL)
586                             xmlFree(transdata);
587                         xmlFree(transitions);
588                         xmlFree(stateRemap);
589                         xmlFree(stringRemap);
590                         for (i = 0;i < nbatoms;i++)
591                             xmlFree(stringMap[i]);
592                         xmlFree(stringMap);
593                         goto not_determ;
594                     }
595                 } else {
596 #if 0
597                     printf("State %d trans %d: atom %d to %d : %d to %d\n",
598                            i, j, trans->atom->no, trans->to, atomno, targetno);
599 #endif
600                     transitions[stateno * (nbatoms + 1) + atomno + 1] =
601                         targetno + 1; /* to avoid 0 */
602                     if (transdata != NULL)
603                         transdata[stateno * nbatoms + atomno] =
604                             trans->atom->data;
605                 }
606             }
607         }
608         ret->determinist = 1;
609 #ifdef DEBUG_COMPACTION
610         /*
611          * Debug
612          */
613         for (i = 0;i < nbstates;i++) {
614             for (j = 0;j < nbatoms + 1;j++) {
615                 printf("%02d ", transitions[i * (nbatoms + 1) + j]);
616             }
617             printf("\n");
618         }
619         printf("\n");
620 #endif
621         /*
622          * Cleanup of the old data
623          */
624         if (ret->states != NULL) {
625             for (i = 0;i < ret->nbStates;i++)
626                 xmlRegFreeState(ret->states[i]);
627             xmlFree(ret->states);
628         }
629         ret->states = NULL;
630         ret->nbStates = 0;
631         if (ret->atoms != NULL) {
632             for (i = 0;i < ret->nbAtoms;i++)
633                 xmlRegFreeAtom(ret->atoms[i]);
634             xmlFree(ret->atoms);
635         }
636         ret->atoms = NULL;
637         ret->nbAtoms = 0;
638 
639         ret->compact = transitions;
640         ret->transdata = transdata;
641         ret->stringMap = stringMap;
642         ret->nbstrings = nbatoms;
643         ret->nbstates = nbstates;
644         xmlFree(stateRemap);
645         xmlFree(stringRemap);
646     }
647 not_determ:
648     ctxt->string = NULL;
649     ctxt->nbStates = 0;
650     ctxt->states = NULL;
651     ctxt->nbAtoms = 0;
652     ctxt->atoms = NULL;
653     ctxt->nbCounters = 0;
654     ctxt->counters = NULL;
655     return(ret);
656 }
657 
658 /**
659  * xmlRegNewParserCtxt:
660  * @string:  the string to parse
661  *
662  * Allocate a new regexp parser context
663  *
664  * Returns the new context or NULL in case of error
665  */
666 static xmlRegParserCtxtPtr
xmlRegNewParserCtxt(const xmlChar * string)667 xmlRegNewParserCtxt(const xmlChar *string) {
668     xmlRegParserCtxtPtr ret;
669 
670     ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt));
671     if (ret == NULL)
672         return(NULL);
673     memset(ret, 0, sizeof(xmlRegParserCtxt));
674     if (string != NULL)
675         ret->string = xmlStrdup(string);
676     ret->cur = ret->string;
677     ret->neg = 0;
678     ret->negs = 0;
679     ret->error = 0;
680     ret->determinist = -1;
681     return(ret);
682 }
683 
684 /**
685  * xmlRegNewRange:
686  * @ctxt:  the regexp parser context
687  * @neg:  is that negative
688  * @type:  the type of range
689  * @start:  the start codepoint
690  * @end:  the end codepoint
691  *
692  * Allocate a new regexp range
693  *
694  * Returns the new range or NULL in case of error
695  */
696 static xmlRegRangePtr
xmlRegNewRange(xmlRegParserCtxtPtr ctxt,int neg,xmlRegAtomType type,int start,int end)697 xmlRegNewRange(xmlRegParserCtxtPtr ctxt,
698                int neg, xmlRegAtomType type, int start, int end) {
699     xmlRegRangePtr ret;
700 
701     ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange));
702     if (ret == NULL) {
703         xmlRegexpErrMemory(ctxt, "allocating range");
704         return(NULL);
705     }
706     ret->neg = neg;
707     ret->type = type;
708     ret->start = start;
709     ret->end = end;
710     return(ret);
711 }
712 
713 /**
714  * xmlRegFreeRange:
715  * @range:  the regexp range
716  *
717  * Free a regexp range
718  */
719 static void
xmlRegFreeRange(xmlRegRangePtr range)720 xmlRegFreeRange(xmlRegRangePtr range) {
721     if (range == NULL)
722         return;
723 
724     if (range->blockName != NULL)
725         xmlFree(range->blockName);
726     xmlFree(range);
727 }
728 
729 /**
730  * xmlRegNewAtom:
731  * @ctxt:  the regexp parser context
732  * @type:  the type of atom
733  *
734  * Allocate a new regexp range
735  *
736  * Returns the new atom or NULL in case of error
737  */
738 static xmlRegAtomPtr
xmlRegNewAtom(xmlRegParserCtxtPtr ctxt,xmlRegAtomType type)739 xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) {
740     xmlRegAtomPtr ret;
741 
742     ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
743     if (ret == NULL) {
744         xmlRegexpErrMemory(ctxt, "allocating atom");
745         return(NULL);
746     }
747     memset(ret, 0, sizeof(xmlRegAtom));
748     ret->type = type;
749     ret->quant = XML_REGEXP_QUANT_ONCE;
750     ret->min = 0;
751     ret->max = 0;
752     return(ret);
753 }
754 
755 /**
756  * xmlRegFreeAtom:
757  * @atom:  the regexp atom
758  *
759  * Free a regexp atom
760  */
761 static void
xmlRegFreeAtom(xmlRegAtomPtr atom)762 xmlRegFreeAtom(xmlRegAtomPtr atom) {
763     int i;
764 
765     if (atom == NULL)
766         return;
767 
768     for (i = 0;i < atom->nbRanges;i++)
769         xmlRegFreeRange(atom->ranges[i]);
770     if (atom->ranges != NULL)
771         xmlFree(atom->ranges);
772     if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL))
773         xmlFree(atom->valuep);
774     if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL))
775         xmlFree(atom->valuep2);
776     if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL))
777         xmlFree(atom->valuep);
778     xmlFree(atom);
779 }
780 
781 static xmlRegStatePtr
xmlRegNewState(xmlRegParserCtxtPtr ctxt)782 xmlRegNewState(xmlRegParserCtxtPtr ctxt) {
783     xmlRegStatePtr ret;
784 
785     ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState));
786     if (ret == NULL) {
787         xmlRegexpErrMemory(ctxt, "allocating state");
788         return(NULL);
789     }
790     memset(ret, 0, sizeof(xmlRegState));
791     ret->type = XML_REGEXP_TRANS_STATE;
792     ret->mark = XML_REGEXP_MARK_NORMAL;
793     return(ret);
794 }
795 
796 /**
797  * xmlRegFreeState:
798  * @state:  the regexp state
799  *
800  * Free a regexp state
801  */
802 static void
xmlRegFreeState(xmlRegStatePtr state)803 xmlRegFreeState(xmlRegStatePtr state) {
804     if (state == NULL)
805         return;
806 
807     if (state->trans != NULL)
808         xmlFree(state->trans);
809     if (state->transTo != NULL)
810         xmlFree(state->transTo);
811     xmlFree(state);
812 }
813 
814 /**
815  * xmlRegFreeParserCtxt:
816  * @ctxt:  the regexp parser context
817  *
818  * Free a regexp parser context
819  */
820 static void
xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt)821 xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) {
822     int i;
823     if (ctxt == NULL)
824         return;
825 
826     if (ctxt->string != NULL)
827         xmlFree(ctxt->string);
828     if (ctxt->states != NULL) {
829         for (i = 0;i < ctxt->nbStates;i++)
830             xmlRegFreeState(ctxt->states[i]);
831         xmlFree(ctxt->states);
832     }
833     if (ctxt->atoms != NULL) {
834         for (i = 0;i < ctxt->nbAtoms;i++)
835             xmlRegFreeAtom(ctxt->atoms[i]);
836         xmlFree(ctxt->atoms);
837     }
838     if (ctxt->counters != NULL)
839         xmlFree(ctxt->counters);
840     xmlFree(ctxt);
841 }
842 
843 /************************************************************************
844  *                                                                      *
845  *                      Display of Data structures                      *
846  *                                                                      *
847  ************************************************************************/
848 
849 static void
xmlRegPrintAtomType(FILE * output,xmlRegAtomType type)850 xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) {
851     switch (type) {
852         case XML_REGEXP_EPSILON:
853             fprintf(output, "epsilon "); break;
854         case XML_REGEXP_CHARVAL:
855             fprintf(output, "charval "); break;
856         case XML_REGEXP_RANGES:
857             fprintf(output, "ranges "); break;
858         case XML_REGEXP_SUBREG:
859             fprintf(output, "subexpr "); break;
860         case XML_REGEXP_STRING:
861             fprintf(output, "string "); break;
862         case XML_REGEXP_ANYCHAR:
863             fprintf(output, "anychar "); break;
864         case XML_REGEXP_ANYSPACE:
865             fprintf(output, "anyspace "); break;
866         case XML_REGEXP_NOTSPACE:
867             fprintf(output, "notspace "); break;
868         case XML_REGEXP_INITNAME:
869             fprintf(output, "initname "); break;
870         case XML_REGEXP_NOTINITNAME:
871             fprintf(output, "notinitname "); break;
872         case XML_REGEXP_NAMECHAR:
873             fprintf(output, "namechar "); break;
874         case XML_REGEXP_NOTNAMECHAR:
875             fprintf(output, "notnamechar "); break;
876         case XML_REGEXP_DECIMAL:
877             fprintf(output, "decimal "); break;
878         case XML_REGEXP_NOTDECIMAL:
879             fprintf(output, "notdecimal "); break;
880         case XML_REGEXP_REALCHAR:
881             fprintf(output, "realchar "); break;
882         case XML_REGEXP_NOTREALCHAR:
883             fprintf(output, "notrealchar "); break;
884         case XML_REGEXP_LETTER:
885             fprintf(output, "LETTER "); break;
886         case XML_REGEXP_LETTER_UPPERCASE:
887             fprintf(output, "LETTER_UPPERCASE "); break;
888         case XML_REGEXP_LETTER_LOWERCASE:
889             fprintf(output, "LETTER_LOWERCASE "); break;
890         case XML_REGEXP_LETTER_TITLECASE:
891             fprintf(output, "LETTER_TITLECASE "); break;
892         case XML_REGEXP_LETTER_MODIFIER:
893             fprintf(output, "LETTER_MODIFIER "); break;
894         case XML_REGEXP_LETTER_OTHERS:
895             fprintf(output, "LETTER_OTHERS "); break;
896         case XML_REGEXP_MARK:
897             fprintf(output, "MARK "); break;
898         case XML_REGEXP_MARK_NONSPACING:
899             fprintf(output, "MARK_NONSPACING "); break;
900         case XML_REGEXP_MARK_SPACECOMBINING:
901             fprintf(output, "MARK_SPACECOMBINING "); break;
902         case XML_REGEXP_MARK_ENCLOSING:
903             fprintf(output, "MARK_ENCLOSING "); break;
904         case XML_REGEXP_NUMBER:
905             fprintf(output, "NUMBER "); break;
906         case XML_REGEXP_NUMBER_DECIMAL:
907             fprintf(output, "NUMBER_DECIMAL "); break;
908         case XML_REGEXP_NUMBER_LETTER:
909             fprintf(output, "NUMBER_LETTER "); break;
910         case XML_REGEXP_NUMBER_OTHERS:
911             fprintf(output, "NUMBER_OTHERS "); break;
912         case XML_REGEXP_PUNCT:
913             fprintf(output, "PUNCT "); break;
914         case XML_REGEXP_PUNCT_CONNECTOR:
915             fprintf(output, "PUNCT_CONNECTOR "); break;
916         case XML_REGEXP_PUNCT_DASH:
917             fprintf(output, "PUNCT_DASH "); break;
918         case XML_REGEXP_PUNCT_OPEN:
919             fprintf(output, "PUNCT_OPEN "); break;
920         case XML_REGEXP_PUNCT_CLOSE:
921             fprintf(output, "PUNCT_CLOSE "); break;
922         case XML_REGEXP_PUNCT_INITQUOTE:
923             fprintf(output, "PUNCT_INITQUOTE "); break;
924         case XML_REGEXP_PUNCT_FINQUOTE:
925             fprintf(output, "PUNCT_FINQUOTE "); break;
926         case XML_REGEXP_PUNCT_OTHERS:
927             fprintf(output, "PUNCT_OTHERS "); break;
928         case XML_REGEXP_SEPAR:
929             fprintf(output, "SEPAR "); break;
930         case XML_REGEXP_SEPAR_SPACE:
931             fprintf(output, "SEPAR_SPACE "); break;
932         case XML_REGEXP_SEPAR_LINE:
933             fprintf(output, "SEPAR_LINE "); break;
934         case XML_REGEXP_SEPAR_PARA:
935             fprintf(output, "SEPAR_PARA "); break;
936         case XML_REGEXP_SYMBOL:
937             fprintf(output, "SYMBOL "); break;
938         case XML_REGEXP_SYMBOL_MATH:
939             fprintf(output, "SYMBOL_MATH "); break;
940         case XML_REGEXP_SYMBOL_CURRENCY:
941             fprintf(output, "SYMBOL_CURRENCY "); break;
942         case XML_REGEXP_SYMBOL_MODIFIER:
943             fprintf(output, "SYMBOL_MODIFIER "); break;
944         case XML_REGEXP_SYMBOL_OTHERS:
945             fprintf(output, "SYMBOL_OTHERS "); break;
946         case XML_REGEXP_OTHER:
947             fprintf(output, "OTHER "); break;
948         case XML_REGEXP_OTHER_CONTROL:
949             fprintf(output, "OTHER_CONTROL "); break;
950         case XML_REGEXP_OTHER_FORMAT:
951             fprintf(output, "OTHER_FORMAT "); break;
952         case XML_REGEXP_OTHER_PRIVATE:
953             fprintf(output, "OTHER_PRIVATE "); break;
954         case XML_REGEXP_OTHER_NA:
955             fprintf(output, "OTHER_NA "); break;
956         case XML_REGEXP_BLOCK_NAME:
957             fprintf(output, "BLOCK "); break;
958     }
959 }
960 
961 static void
xmlRegPrintQuantType(FILE * output,xmlRegQuantType type)962 xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) {
963     switch (type) {
964         case XML_REGEXP_QUANT_EPSILON:
965             fprintf(output, "epsilon "); break;
966         case XML_REGEXP_QUANT_ONCE:
967             fprintf(output, "once "); break;
968         case XML_REGEXP_QUANT_OPT:
969             fprintf(output, "? "); break;
970         case XML_REGEXP_QUANT_MULT:
971             fprintf(output, "* "); break;
972         case XML_REGEXP_QUANT_PLUS:
973             fprintf(output, "+ "); break;
974         case XML_REGEXP_QUANT_RANGE:
975             fprintf(output, "range "); break;
976         case XML_REGEXP_QUANT_ONCEONLY:
977             fprintf(output, "onceonly "); break;
978         case XML_REGEXP_QUANT_ALL:
979             fprintf(output, "all "); break;
980     }
981 }
982 static void
xmlRegPrintRange(FILE * output,xmlRegRangePtr range)983 xmlRegPrintRange(FILE *output, xmlRegRangePtr range) {
984     fprintf(output, "  range: ");
985     if (range->neg)
986         fprintf(output, "negative ");
987     xmlRegPrintAtomType(output, range->type);
988     fprintf(output, "%c - %c\n", range->start, range->end);
989 }
990 
991 static void
xmlRegPrintAtom(FILE * output,xmlRegAtomPtr atom)992 xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) {
993     fprintf(output, " atom: ");
994     if (atom == NULL) {
995         fprintf(output, "NULL\n");
996         return;
997     }
998     if (atom->neg)
999         fprintf(output, "not ");
1000     xmlRegPrintAtomType(output, atom->type);
1001     xmlRegPrintQuantType(output, atom->quant);
1002     if (atom->quant == XML_REGEXP_QUANT_RANGE)
1003         fprintf(output, "%d-%d ", atom->min, atom->max);
1004     if (atom->type == XML_REGEXP_STRING)
1005         fprintf(output, "'%s' ", (char *) atom->valuep);
1006     if (atom->type == XML_REGEXP_CHARVAL)
1007         fprintf(output, "char %c\n", atom->codepoint);
1008     else if (atom->type == XML_REGEXP_RANGES) {
1009         int i;
1010         fprintf(output, "%d entries\n", atom->nbRanges);
1011         for (i = 0; i < atom->nbRanges;i++)
1012             xmlRegPrintRange(output, atom->ranges[i]);
1013     } else if (atom->type == XML_REGEXP_SUBREG) {
1014         fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no);
1015     } else {
1016         fprintf(output, "\n");
1017     }
1018 }
1019 
1020 static void
xmlRegPrintTrans(FILE * output,xmlRegTransPtr trans)1021 xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) {
1022     fprintf(output, "  trans: ");
1023     if (trans == NULL) {
1024         fprintf(output, "NULL\n");
1025         return;
1026     }
1027     if (trans->to < 0) {
1028         fprintf(output, "removed\n");
1029         return;
1030     }
1031     if (trans->nd != 0) {
1032         if (trans->nd == 2)
1033             fprintf(output, "last not determinist, ");
1034         else
1035             fprintf(output, "not determinist, ");
1036     }
1037     if (trans->counter >= 0) {
1038         fprintf(output, "counted %d, ", trans->counter);
1039     }
1040     if (trans->count == REGEXP_ALL_COUNTER) {
1041         fprintf(output, "all transition, ");
1042     } else if (trans->count >= 0) {
1043         fprintf(output, "count based %d, ", trans->count);
1044     }
1045     if (trans->atom == NULL) {
1046         fprintf(output, "epsilon to %d\n", trans->to);
1047         return;
1048     }
1049     if (trans->atom->type == XML_REGEXP_CHARVAL)
1050         fprintf(output, "char %c ", trans->atom->codepoint);
1051     fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to);
1052 }
1053 
1054 static void
xmlRegPrintState(FILE * output,xmlRegStatePtr state)1055 xmlRegPrintState(FILE *output, xmlRegStatePtr state) {
1056     int i;
1057 
1058     fprintf(output, " state: ");
1059     if (state == NULL) {
1060         fprintf(output, "NULL\n");
1061         return;
1062     }
1063     if (state->type == XML_REGEXP_START_STATE)
1064         fprintf(output, "START ");
1065     if (state->type == XML_REGEXP_FINAL_STATE)
1066         fprintf(output, "FINAL ");
1067 
1068     fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans);
1069     for (i = 0;i < state->nbTrans; i++) {
1070         xmlRegPrintTrans(output, &(state->trans[i]));
1071     }
1072 }
1073 
1074 #ifdef DEBUG_REGEXP_GRAPH
1075 static void
xmlRegPrintCtxt(FILE * output,xmlRegParserCtxtPtr ctxt)1076 xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) {
1077     int i;
1078 
1079     fprintf(output, " ctxt: ");
1080     if (ctxt == NULL) {
1081         fprintf(output, "NULL\n");
1082         return;
1083     }
1084     fprintf(output, "'%s' ", ctxt->string);
1085     if (ctxt->error)
1086         fprintf(output, "error ");
1087     if (ctxt->neg)
1088         fprintf(output, "neg ");
1089     fprintf(output, "\n");
1090     fprintf(output, "%d atoms:\n", ctxt->nbAtoms);
1091     for (i = 0;i < ctxt->nbAtoms; i++) {
1092         fprintf(output, " %02d ", i);
1093         xmlRegPrintAtom(output, ctxt->atoms[i]);
1094     }
1095     if (ctxt->atom != NULL) {
1096         fprintf(output, "current atom:\n");
1097         xmlRegPrintAtom(output, ctxt->atom);
1098     }
1099     fprintf(output, "%d states:", ctxt->nbStates);
1100     if (ctxt->start != NULL)
1101         fprintf(output, " start: %d", ctxt->start->no);
1102     if (ctxt->end != NULL)
1103         fprintf(output, " end: %d", ctxt->end->no);
1104     fprintf(output, "\n");
1105     for (i = 0;i < ctxt->nbStates; i++) {
1106         xmlRegPrintState(output, ctxt->states[i]);
1107     }
1108     fprintf(output, "%d counters:\n", ctxt->nbCounters);
1109     for (i = 0;i < ctxt->nbCounters; i++) {
1110         fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min,
1111                                                 ctxt->counters[i].max);
1112     }
1113 }
1114 #endif
1115 
1116 /************************************************************************
1117  *                                                                      *
1118  *               Finite Automata structures manipulations               *
1119  *                                                                      *
1120  ************************************************************************/
1121 
1122 static void
xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt,xmlRegAtomPtr atom,int neg,xmlRegAtomType type,int start,int end,xmlChar * blockName)1123 xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom,
1124                    int neg, xmlRegAtomType type, int start, int end,
1125                    xmlChar *blockName) {
1126     xmlRegRangePtr range;
1127 
1128     if (atom == NULL) {
1129         ERROR("add range: atom is NULL");
1130         return;
1131     }
1132     if (atom->type != XML_REGEXP_RANGES) {
1133         ERROR("add range: atom is not ranges");
1134         return;
1135     }
1136     if (atom->maxRanges == 0) {
1137         atom->maxRanges = 4;
1138         atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges *
1139                                              sizeof(xmlRegRangePtr));
1140         if (atom->ranges == NULL) {
1141             xmlRegexpErrMemory(ctxt, "adding ranges");
1142             atom->maxRanges = 0;
1143             return;
1144         }
1145     } else if (atom->nbRanges >= atom->maxRanges) {
1146         xmlRegRangePtr *tmp;
1147         atom->maxRanges *= 2;
1148         tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges *
1149                                              sizeof(xmlRegRangePtr));
1150         if (tmp == NULL) {
1151             xmlRegexpErrMemory(ctxt, "adding ranges");
1152             atom->maxRanges /= 2;
1153             return;
1154         }
1155         atom->ranges = tmp;
1156     }
1157     range = xmlRegNewRange(ctxt, neg, type, start, end);
1158     if (range == NULL)
1159         return;
1160     range->blockName = blockName;
1161     atom->ranges[atom->nbRanges++] = range;
1162 
1163 }
1164 
1165 static int
xmlRegGetCounter(xmlRegParserCtxtPtr ctxt)1166 xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) {
1167     if (ctxt->maxCounters == 0) {
1168         ctxt->maxCounters = 4;
1169         ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters *
1170                                              sizeof(xmlRegCounter));
1171         if (ctxt->counters == NULL) {
1172             xmlRegexpErrMemory(ctxt, "allocating counter");
1173             ctxt->maxCounters = 0;
1174             return(-1);
1175         }
1176     } else if (ctxt->nbCounters >= ctxt->maxCounters) {
1177         xmlRegCounter *tmp;
1178         ctxt->maxCounters *= 2;
1179         tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters *
1180                                            sizeof(xmlRegCounter));
1181         if (tmp == NULL) {
1182             xmlRegexpErrMemory(ctxt, "allocating counter");
1183             ctxt->maxCounters /= 2;
1184             return(-1);
1185         }
1186         ctxt->counters = tmp;
1187     }
1188     ctxt->counters[ctxt->nbCounters].min = -1;
1189     ctxt->counters[ctxt->nbCounters].max = -1;
1190     return(ctxt->nbCounters++);
1191 }
1192 
1193 static int
xmlRegAtomPush(xmlRegParserCtxtPtr ctxt,xmlRegAtomPtr atom)1194 xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
1195     if (atom == NULL) {
1196         ERROR("atom push: atom is NULL");
1197         return(-1);
1198     }
1199     if (ctxt->maxAtoms == 0) {
1200         ctxt->maxAtoms = 4;
1201         ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms *
1202                                              sizeof(xmlRegAtomPtr));
1203         if (ctxt->atoms == NULL) {
1204             xmlRegexpErrMemory(ctxt, "pushing atom");
1205             ctxt->maxAtoms = 0;
1206             return(-1);
1207         }
1208     } else if (ctxt->nbAtoms >= ctxt->maxAtoms) {
1209         xmlRegAtomPtr *tmp;
1210         ctxt->maxAtoms *= 2;
1211         tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms *
1212                                              sizeof(xmlRegAtomPtr));
1213         if (tmp == NULL) {
1214             xmlRegexpErrMemory(ctxt, "allocating counter");
1215             ctxt->maxAtoms /= 2;
1216             return(-1);
1217         }
1218         ctxt->atoms = tmp;
1219     }
1220     atom->no = ctxt->nbAtoms;
1221     ctxt->atoms[ctxt->nbAtoms++] = atom;
1222     return(0);
1223 }
1224 
1225 static void
xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr target,int from)1226 xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target,
1227                       int from) {
1228     if (target->maxTransTo == 0) {
1229         target->maxTransTo = 8;
1230         target->transTo = (int *) xmlMalloc(target->maxTransTo *
1231                                              sizeof(int));
1232         if (target->transTo == NULL) {
1233             xmlRegexpErrMemory(ctxt, "adding transition");
1234             target->maxTransTo = 0;
1235             return;
1236         }
1237     } else if (target->nbTransTo >= target->maxTransTo) {
1238         int *tmp;
1239         target->maxTransTo *= 2;
1240         tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo *
1241                                              sizeof(int));
1242         if (tmp == NULL) {
1243             xmlRegexpErrMemory(ctxt, "adding transition");
1244             target->maxTransTo /= 2;
1245             return;
1246         }
1247         target->transTo = tmp;
1248     }
1249     target->transTo[target->nbTransTo] = from;
1250     target->nbTransTo++;
1251 }
1252 
1253 static void
xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr state,xmlRegAtomPtr atom,xmlRegStatePtr target,int counter,int count)1254 xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1255                     xmlRegAtomPtr atom, xmlRegStatePtr target,
1256                     int counter, int count) {
1257 
1258     int nrtrans;
1259 
1260     if (state == NULL) {
1261         ERROR("add state: state is NULL");
1262         return;
1263     }
1264     if (target == NULL) {
1265         ERROR("add state: target is NULL");
1266         return;
1267     }
1268     /*
1269      * Other routines follow the philosophy 'When in doubt, add a transition'
1270      * so we check here whether such a transition is already present and, if
1271      * so, silently ignore this request.
1272      */
1273 
1274     for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) {
1275         xmlRegTransPtr trans = &(state->trans[nrtrans]);
1276         if ((trans->atom == atom) &&
1277             (trans->to == target->no) &&
1278             (trans->counter == counter) &&
1279             (trans->count == count)) {
1280 #ifdef DEBUG_REGEXP_GRAPH
1281             printf("Ignoring duplicate transition from %d to %d\n",
1282                     state->no, target->no);
1283 #endif
1284             return;
1285         }
1286     }
1287 
1288     if (state->maxTrans == 0) {
1289         state->maxTrans = 8;
1290         state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans *
1291                                              sizeof(xmlRegTrans));
1292         if (state->trans == NULL) {
1293             xmlRegexpErrMemory(ctxt, "adding transition");
1294             state->maxTrans = 0;
1295             return;
1296         }
1297     } else if (state->nbTrans >= state->maxTrans) {
1298         xmlRegTrans *tmp;
1299         state->maxTrans *= 2;
1300         tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans *
1301                                              sizeof(xmlRegTrans));
1302         if (tmp == NULL) {
1303             xmlRegexpErrMemory(ctxt, "adding transition");
1304             state->maxTrans /= 2;
1305             return;
1306         }
1307         state->trans = tmp;
1308     }
1309 #ifdef DEBUG_REGEXP_GRAPH
1310     printf("Add trans from %d to %d ", state->no, target->no);
1311     if (count == REGEXP_ALL_COUNTER)
1312         printf("all transition\n");
1313     else if (count >= 0)
1314         printf("count based %d\n", count);
1315     else if (counter >= 0)
1316         printf("counted %d\n", counter);
1317     else if (atom == NULL)
1318         printf("epsilon transition\n");
1319     else if (atom != NULL)
1320         xmlRegPrintAtom(stdout, atom);
1321 #endif
1322 
1323     state->trans[state->nbTrans].atom = atom;
1324     state->trans[state->nbTrans].to = target->no;
1325     state->trans[state->nbTrans].counter = counter;
1326     state->trans[state->nbTrans].count = count;
1327     state->trans[state->nbTrans].nd = 0;
1328     state->nbTrans++;
1329     xmlRegStateAddTransTo(ctxt, target, state->no);
1330 }
1331 
1332 static int
xmlRegStatePush(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr state)1333 xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
1334     if (state == NULL) return(-1);
1335     if (ctxt->maxStates == 0) {
1336         ctxt->maxStates = 4;
1337         ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates *
1338                                              sizeof(xmlRegStatePtr));
1339         if (ctxt->states == NULL) {
1340             xmlRegexpErrMemory(ctxt, "adding state");
1341             ctxt->maxStates = 0;
1342             return(-1);
1343         }
1344     } else if (ctxt->nbStates >= ctxt->maxStates) {
1345         xmlRegStatePtr *tmp;
1346         ctxt->maxStates *= 2;
1347         tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates *
1348                                              sizeof(xmlRegStatePtr));
1349         if (tmp == NULL) {
1350             xmlRegexpErrMemory(ctxt, "adding state");
1351             ctxt->maxStates /= 2;
1352             return(-1);
1353         }
1354         ctxt->states = tmp;
1355     }
1356     state->no = ctxt->nbStates;
1357     ctxt->states[ctxt->nbStates++] = state;
1358     return(0);
1359 }
1360 
1361 /**
1362  * xmlFAGenerateAllTransition:
1363  * @ctxt:  a regexp parser context
1364  * @from:  the from state
1365  * @to:  the target state or NULL for building a new one
1366  * @lax:
1367  *
1368  */
1369 static void
xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to,int lax)1370 xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,
1371                            xmlRegStatePtr from, xmlRegStatePtr to,
1372                            int lax) {
1373     if (to == NULL) {
1374         to = xmlRegNewState(ctxt);
1375         xmlRegStatePush(ctxt, to);
1376         ctxt->state = to;
1377     }
1378     if (lax)
1379         xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER);
1380     else
1381         xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER);
1382 }
1383 
1384 /**
1385  * xmlFAGenerateEpsilonTransition:
1386  * @ctxt:  a regexp parser context
1387  * @from:  the from state
1388  * @to:  the target state or NULL for building a new one
1389  *
1390  */
1391 static void
xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to)1392 xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1393                                xmlRegStatePtr from, xmlRegStatePtr to) {
1394     if (to == NULL) {
1395         to = xmlRegNewState(ctxt);
1396         xmlRegStatePush(ctxt, to);
1397         ctxt->state = to;
1398     }
1399     xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1);
1400 }
1401 
1402 /**
1403  * xmlFAGenerateCountedEpsilonTransition:
1404  * @ctxt:  a regexp parser context
1405  * @from:  the from state
1406  * @to:  the target state or NULL for building a new one
1407  * counter:  the counter for that transition
1408  *
1409  */
1410 static void
xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to,int counter)1411 xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1412             xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1413     if (to == NULL) {
1414         to = xmlRegNewState(ctxt);
1415         xmlRegStatePush(ctxt, to);
1416         ctxt->state = to;
1417     }
1418     xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1);
1419 }
1420 
1421 /**
1422  * xmlFAGenerateCountedTransition:
1423  * @ctxt:  a regexp parser context
1424  * @from:  the from state
1425  * @to:  the target state or NULL for building a new one
1426  * counter:  the counter for that transition
1427  *
1428  */
1429 static void
xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to,int counter)1430 xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,
1431             xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1432     if (to == NULL) {
1433         to = xmlRegNewState(ctxt);
1434         xmlRegStatePush(ctxt, to);
1435         ctxt->state = to;
1436     }
1437     xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter);
1438 }
1439 
1440 /**
1441  * xmlFAGenerateTransitions:
1442  * @ctxt:  a regexp parser context
1443  * @from:  the from state
1444  * @to:  the target state or NULL for building a new one
1445  * @atom:  the atom generating the transition
1446  *
1447  * Returns 0 if success and -1 in case of error.
1448  */
1449 static int
xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to,xmlRegAtomPtr atom)1450 xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
1451                          xmlRegStatePtr to, xmlRegAtomPtr atom) {
1452     if (atom == NULL) {
1453         ERROR("genrate transition: atom == NULL");
1454         return(-1);
1455     }
1456     if (atom->type == XML_REGEXP_SUBREG) {
1457         /*
1458          * this is a subexpression handling one should not need to
1459          * create a new node except for XML_REGEXP_QUANT_RANGE.
1460          */
1461         if (xmlRegAtomPush(ctxt, atom) < 0) {
1462             return(-1);
1463         }
1464         if ((to != NULL) && (atom->stop != to) &&
1465             (atom->quant != XML_REGEXP_QUANT_RANGE)) {
1466             /*
1467              * Generate an epsilon transition to link to the target
1468              */
1469             xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1470 #ifdef DV
1471         } else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) &&
1472                    (atom->quant != XML_REGEXP_QUANT_ONCE)) {
1473             to = xmlRegNewState(ctxt);
1474             xmlRegStatePush(ctxt, to);
1475             ctxt->state = to;
1476             xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1477 #endif
1478         }
1479         switch (atom->quant) {
1480             case XML_REGEXP_QUANT_OPT:
1481                 atom->quant = XML_REGEXP_QUANT_ONCE;
1482                 /*
1483                  * transition done to the state after end of atom.
1484                  *      1. set transition from atom start to new state
1485                  *      2. set transition from atom end to this state.
1486                  */
1487                 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0);
1488                 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, ctxt->state);
1489                 break;
1490             case XML_REGEXP_QUANT_MULT:
1491                 atom->quant = XML_REGEXP_QUANT_ONCE;
1492                 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1493                 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1494                 break;
1495             case XML_REGEXP_QUANT_PLUS:
1496                 atom->quant = XML_REGEXP_QUANT_ONCE;
1497                 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1498                 break;
1499             case XML_REGEXP_QUANT_RANGE: {
1500                 int counter;
1501                 xmlRegStatePtr newstate;
1502 
1503                 /*
1504                  * This one is nasty:
1505                  *   1/ if range has minOccurs == 0, create a new state
1506                  *      and create epsilon transitions from atom->start
1507                  *      to atom->stop, as well as atom->start to the new
1508                  *      state
1509                  *   2/ register a new counter
1510                  *   3/ register an epsilon transition associated to
1511                  *      this counter going from atom->stop to atom->start
1512                  *   4/ create a new state
1513                  *   5/ generate a counted transition from atom->stop to
1514                  *      that state
1515                  */
1516                 if (atom->min == 0) {
1517                     xmlFAGenerateEpsilonTransition(ctxt, atom->start,
1518                         atom->stop);
1519                     newstate = xmlRegNewState(ctxt);
1520                     xmlRegStatePush(ctxt, newstate);
1521                     ctxt->state = newstate;
1522                     xmlFAGenerateEpsilonTransition(ctxt, atom->start,
1523                         newstate);
1524                 }
1525                 counter = xmlRegGetCounter(ctxt);
1526                 ctxt->counters[counter].min = atom->min - 1;
1527                 ctxt->counters[counter].max = atom->max - 1;
1528                 atom->min = 0;
1529                 atom->max = 0;
1530                 atom->quant = XML_REGEXP_QUANT_ONCE;
1531                 if (to != NULL) {
1532                     newstate = to;
1533                 } else {
1534                     newstate = xmlRegNewState(ctxt);
1535                     xmlRegStatePush(ctxt, newstate);
1536                 }
1537                 ctxt->state = newstate;
1538                 xmlFAGenerateCountedTransition(ctxt, atom->stop,
1539                                                newstate, counter);
1540 
1541                 /*
1542                  * first check count and if OK jump forward;
1543                  * if checking fail increment count and jump back
1544                  */
1545                 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop,
1546                                                       atom->start, counter);
1547             }
1548             default:
1549                 break;
1550         }
1551         return(0);
1552     }
1553     if ((atom->min == 0) && (atom->max == 0) &&
1554                (atom->quant == XML_REGEXP_QUANT_RANGE)) {
1555         /*
1556          * we can discard the atom and generate an epsilon transition instead
1557          */
1558         if (to == NULL) {
1559             to = xmlRegNewState(ctxt);
1560             if (to != NULL)
1561                 xmlRegStatePush(ctxt, to);
1562             else {
1563                 return(-1);
1564             }
1565         }
1566         xmlFAGenerateEpsilonTransition(ctxt, from, to);
1567         ctxt->state = to;
1568         xmlRegFreeAtom(atom);
1569         return(0);
1570     }
1571     if (to == NULL) {
1572         to = xmlRegNewState(ctxt);
1573         if (to != NULL)
1574             xmlRegStatePush(ctxt, to);
1575         else {
1576             return(-1);
1577         }
1578     }
1579     if (xmlRegAtomPush(ctxt, atom) < 0) {
1580         return(-1);
1581     }
1582     xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1);
1583     ctxt->state = to;
1584     switch (atom->quant) {
1585         case XML_REGEXP_QUANT_OPT:
1586             atom->quant = XML_REGEXP_QUANT_ONCE;
1587             xmlFAGenerateEpsilonTransition(ctxt, from, to);
1588             break;
1589         case XML_REGEXP_QUANT_MULT:
1590             atom->quant = XML_REGEXP_QUANT_ONCE;
1591             xmlFAGenerateEpsilonTransition(ctxt, from, to);
1592             xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1593             break;
1594         case XML_REGEXP_QUANT_PLUS:
1595             atom->quant = XML_REGEXP_QUANT_ONCE;
1596             xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1597             break;
1598         default:
1599             break;
1600     }
1601     return(0);
1602 }
1603 
1604 /**
1605  * xmlFAReduceEpsilonTransitions:
1606  * @ctxt:  a regexp parser context
1607  * @fromnr:  the from state
1608  * @tonr:  the to state
1609  * @counter:  should that transition be associated to a counted
1610  *
1611  */
1612 static void
xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt,int fromnr,int tonr,int counter)1613 xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr,
1614                               int tonr, int counter) {
1615     int transnr;
1616     xmlRegStatePtr from;
1617     xmlRegStatePtr to;
1618 
1619 #ifdef DEBUG_REGEXP_GRAPH
1620     printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr);
1621 #endif
1622     from = ctxt->states[fromnr];
1623     if (from == NULL)
1624         return;
1625     to = ctxt->states[tonr];
1626     if (to == NULL)
1627         return;
1628     if ((to->mark == XML_REGEXP_MARK_START) ||
1629         (to->mark == XML_REGEXP_MARK_VISITED))
1630         return;
1631 
1632     to->mark = XML_REGEXP_MARK_VISITED;
1633     if (to->type == XML_REGEXP_FINAL_STATE) {
1634 #ifdef DEBUG_REGEXP_GRAPH
1635         printf("State %d is final, so %d becomes final\n", tonr, fromnr);
1636 #endif
1637         from->type = XML_REGEXP_FINAL_STATE;
1638     }
1639     for (transnr = 0;transnr < to->nbTrans;transnr++) {
1640         if (to->trans[transnr].to < 0)
1641             continue;
1642         if (to->trans[transnr].atom == NULL) {
1643             /*
1644              * Don't remove counted transitions
1645              * Don't loop either
1646              */
1647             if (to->trans[transnr].to != fromnr) {
1648                 if (to->trans[transnr].count >= 0) {
1649                     int newto = to->trans[transnr].to;
1650 
1651                     xmlRegStateAddTrans(ctxt, from, NULL,
1652                                         ctxt->states[newto],
1653                                         -1, to->trans[transnr].count);
1654                 } else {
1655 #ifdef DEBUG_REGEXP_GRAPH
1656                     printf("Found epsilon trans %d from %d to %d\n",
1657                            transnr, tonr, to->trans[transnr].to);
1658 #endif
1659                     if (to->trans[transnr].counter >= 0) {
1660                         xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1661                                               to->trans[transnr].to,
1662                                               to->trans[transnr].counter);
1663                     } else {
1664                         xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1665                                               to->trans[transnr].to,
1666                                               counter);
1667                     }
1668                 }
1669             }
1670         } else {
1671             int newto = to->trans[transnr].to;
1672 
1673             if (to->trans[transnr].counter >= 0) {
1674                 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1675                                     ctxt->states[newto],
1676                                     to->trans[transnr].counter, -1);
1677             } else {
1678                 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1679                                     ctxt->states[newto], counter, -1);
1680             }
1681         }
1682     }
1683     to->mark = XML_REGEXP_MARK_NORMAL;
1684 }
1685 
1686 /**
1687  * xmlFAEliminateSimpleEpsilonTransitions:
1688  * @ctxt:  a regexp parser context
1689  *
1690  * Eliminating general epsilon transitions can get costly in the general
1691  * algorithm due to the large amount of generated new transitions and
1692  * associated comparisons. However for simple epsilon transition used just
1693  * to separate building blocks when generating the automata this can be
1694  * reduced to state elimination:
1695  *    - if there exists an epsilon from X to Y
1696  *    - if there is no other transition from X
1697  * then X and Y are semantically equivalent and X can be eliminated
1698  * If X is the start state then make Y the start state, else replace the
1699  * target of all transitions to X by transitions to Y.
1700  */
1701 static void
xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt)1702 xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1703     int statenr, i, j, newto;
1704     xmlRegStatePtr state, tmp;
1705 
1706     for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1707         state = ctxt->states[statenr];
1708         if (state == NULL)
1709             continue;
1710         if (state->nbTrans != 1)
1711             continue;
1712         /* is the only transition out a basic transition */
1713         if ((state->trans[0].atom == NULL) &&
1714             (state->trans[0].to >= 0) &&
1715             (state->trans[0].to != statenr) &&
1716             (state->trans[0].counter < 0) &&
1717             (state->trans[0].count < 0)) {
1718             newto = state->trans[0].to;
1719 
1720             if (state->type == XML_REGEXP_START_STATE) {
1721 #ifdef DEBUG_REGEXP_GRAPH
1722                 printf("Found simple epsilon trans from start %d to %d\n",
1723                        statenr, newto);
1724 #endif
1725             } else {
1726 #ifdef DEBUG_REGEXP_GRAPH
1727                 printf("Found simple epsilon trans from %d to %d\n",
1728                        statenr, newto);
1729 #endif
1730                 for (i = 0;i < state->nbTransTo;i++) {
1731                     tmp = ctxt->states[state->transTo[i]];
1732                     for (j = 0;j < tmp->nbTrans;j++) {
1733                         if (tmp->trans[j].to == statenr) {
1734                             tmp->trans[j].to = newto;
1735 #ifdef DEBUG_REGEXP_GRAPH
1736                             printf("Changed transition %d on %d to go to %d\n",
1737                                    j, tmp->no, newto);
1738 #endif
1739                             xmlRegStateAddTransTo(ctxt, ctxt->states[newto],
1740                                                   tmp->no);
1741                         }
1742                     }
1743                 }
1744 #if 0
1745                 for (i = 0;i < ctxt->nbStates;i++) {
1746                     tmp = ctxt->states[i];
1747                     for (j = 0;j < tmp->nbTrans;j++) {
1748                         if (tmp->trans[j].to == statenr) {
1749                             tmp->trans[j].to = newto;
1750 #ifdef DEBUG_REGEXP_GRAPH
1751                             printf("Changed transition %d on %d to go to %d\n",
1752                                    j, tmp->no, newto);
1753 #endif
1754                         }
1755                     }
1756                 }
1757 #endif
1758                 if (state->type == XML_REGEXP_FINAL_STATE)
1759                     ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE;
1760                 /* eliminate the transition completely */
1761                 state->nbTrans = 0;
1762 
1763 
1764             }
1765 
1766         }
1767     }
1768 }
1769 /**
1770  * xmlFAEliminateEpsilonTransitions:
1771  * @ctxt:  a regexp parser context
1772  *
1773  */
1774 static void
xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt)1775 xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1776     int statenr, transnr;
1777     xmlRegStatePtr state;
1778     int has_epsilon;
1779 
1780     if (ctxt->states == NULL) return;
1781 
1782     xmlFAEliminateSimpleEpsilonTransitions(ctxt);
1783 
1784     has_epsilon = 0;
1785 
1786     /*
1787      * build the completed transitions bypassing the epsilons
1788      * Use a marking algorithm to avoid loops
1789      * mark sink states too.
1790      */
1791     for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1792         state = ctxt->states[statenr];
1793         if (state == NULL)
1794             continue;
1795         if ((state->nbTrans == 0) &&
1796             (state->type != XML_REGEXP_FINAL_STATE)) {
1797             state->type = XML_REGEXP_SINK_STATE;
1798         }
1799         for (transnr = 0;transnr < state->nbTrans;transnr++) {
1800             if ((state->trans[transnr].atom == NULL) &&
1801                 (state->trans[transnr].to >= 0)) {
1802                 if (state->trans[transnr].to == statenr) {
1803                     state->trans[transnr].to = -1;
1804 #ifdef DEBUG_REGEXP_GRAPH
1805                     printf("Removed loopback epsilon trans %d on %d\n",
1806                            transnr, statenr);
1807 #endif
1808                 } else if (state->trans[transnr].count < 0) {
1809                     int newto = state->trans[transnr].to;
1810 
1811 #ifdef DEBUG_REGEXP_GRAPH
1812                     printf("Found epsilon trans %d from %d to %d\n",
1813                            transnr, statenr, newto);
1814 #endif
1815                     state->mark = XML_REGEXP_MARK_START;
1816                     has_epsilon = 1;
1817                     xmlFAReduceEpsilonTransitions(ctxt, statenr,
1818                                       newto, state->trans[transnr].counter);
1819                     state->mark = XML_REGEXP_MARK_NORMAL;
1820 #ifdef DEBUG_REGEXP_GRAPH
1821                 } else {
1822                     printf("Found counted transition %d on %d\n",
1823                            transnr, statenr);
1824 #endif
1825                 }
1826             }
1827         }
1828     }
1829     /*
1830      * Eliminate the epsilon transitions
1831      */
1832     if (has_epsilon) {
1833         for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1834             state = ctxt->states[statenr];
1835             if (state == NULL)
1836                 continue;
1837             for (transnr = 0;transnr < state->nbTrans;transnr++) {
1838                 xmlRegTransPtr trans = &(state->trans[transnr]);
1839                 if ((trans->atom == NULL) &&
1840                     (trans->count < 0) &&
1841                     (trans->to >= 0)) {
1842                     trans->to = -1;
1843                 }
1844             }
1845         }
1846     }
1847 
1848     /*
1849      * Use this pass to detect unreachable states too
1850      */
1851     for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1852         state = ctxt->states[statenr];
1853         if (state != NULL)
1854             state->reached = XML_REGEXP_MARK_NORMAL;
1855     }
1856     state = ctxt->states[0];
1857     if (state != NULL)
1858         state->reached = XML_REGEXP_MARK_START;
1859     while (state != NULL) {
1860         xmlRegStatePtr target = NULL;
1861         state->reached = XML_REGEXP_MARK_VISITED;
1862         /*
1863          * Mark all states reachable from the current reachable state
1864          */
1865         for (transnr = 0;transnr < state->nbTrans;transnr++) {
1866             if ((state->trans[transnr].to >= 0) &&
1867                 ((state->trans[transnr].atom != NULL) ||
1868                  (state->trans[transnr].count >= 0))) {
1869                 int newto = state->trans[transnr].to;
1870 
1871                 if (ctxt->states[newto] == NULL)
1872                     continue;
1873                 if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) {
1874                     ctxt->states[newto]->reached = XML_REGEXP_MARK_START;
1875                     target = ctxt->states[newto];
1876                 }
1877             }
1878         }
1879 
1880         /*
1881          * find the next accessible state not explored
1882          */
1883         if (target == NULL) {
1884             for (statenr = 1;statenr < ctxt->nbStates;statenr++) {
1885                 state = ctxt->states[statenr];
1886                 if ((state != NULL) && (state->reached ==
1887                         XML_REGEXP_MARK_START)) {
1888                     target = state;
1889                     break;
1890                 }
1891             }
1892         }
1893         state = target;
1894     }
1895     for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1896         state = ctxt->states[statenr];
1897         if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) {
1898 #ifdef DEBUG_REGEXP_GRAPH
1899             printf("Removed unreachable state %d\n", statenr);
1900 #endif
1901             xmlRegFreeState(state);
1902             ctxt->states[statenr] = NULL;
1903         }
1904     }
1905 
1906 }
1907 
1908 static int
xmlFACompareRanges(xmlRegRangePtr range1,xmlRegRangePtr range2)1909 xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) {
1910     int ret = 0;
1911 
1912     if ((range1->type == XML_REGEXP_RANGES) ||
1913         (range2->type == XML_REGEXP_RANGES) ||
1914         (range2->type == XML_REGEXP_SUBREG) ||
1915         (range1->type == XML_REGEXP_SUBREG) ||
1916         (range1->type == XML_REGEXP_STRING) ||
1917         (range2->type == XML_REGEXP_STRING))
1918         return(-1);
1919 
1920     /* put them in order */
1921     if (range1->type > range2->type) {
1922         xmlRegRangePtr tmp;
1923 
1924         tmp = range1;
1925         range1 = range2;
1926         range2 = tmp;
1927     }
1928     if ((range1->type == XML_REGEXP_ANYCHAR) ||
1929         (range2->type == XML_REGEXP_ANYCHAR)) {
1930         ret = 1;
1931     } else if ((range1->type == XML_REGEXP_EPSILON) ||
1932                (range2->type == XML_REGEXP_EPSILON)) {
1933         return(0);
1934     } else if (range1->type == range2->type) {
1935         if ((range1->type != XML_REGEXP_CHARVAL) ||
1936             (range1->end < range2->start) ||
1937             (range2->end < range1->start))
1938             ret = 1;
1939         else
1940             ret = 0;
1941     } else if (range1->type == XML_REGEXP_CHARVAL) {
1942         int codepoint;
1943         int neg = 0;
1944 
1945         /*
1946          * just check all codepoints in the range for acceptance,
1947          * this is usually way cheaper since done only once at
1948          * compilation than testing over and over at runtime or
1949          * pushing too many states when evaluating.
1950          */
1951         if (((range1->neg == 0) && (range2->neg != 0)) ||
1952             ((range1->neg != 0) && (range2->neg == 0)))
1953             neg = 1;
1954 
1955         for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) {
1956             ret = xmlRegCheckCharacterRange(range2->type, codepoint,
1957                                             0, range2->start, range2->end,
1958                                             range2->blockName);
1959             if (ret < 0)
1960                 return(-1);
1961             if (((neg == 1) && (ret == 0)) ||
1962                 ((neg == 0) && (ret == 1)))
1963                 return(1);
1964         }
1965         return(0);
1966     } else if ((range1->type == XML_REGEXP_BLOCK_NAME) ||
1967                (range2->type == XML_REGEXP_BLOCK_NAME)) {
1968         if (range1->type == range2->type) {
1969             ret = xmlStrEqual(range1->blockName, range2->blockName);
1970         } else {
1971             /*
1972              * comparing a block range with anything else is way
1973              * too costly, and maintaining the table is like too much
1974              * memory too, so let's force the automata to save state
1975              * here.
1976              */
1977             return(1);
1978         }
1979     } else if ((range1->type < XML_REGEXP_LETTER) ||
1980                (range2->type < XML_REGEXP_LETTER)) {
1981         if ((range1->type == XML_REGEXP_ANYSPACE) &&
1982             (range2->type == XML_REGEXP_NOTSPACE))
1983             ret = 0;
1984         else if ((range1->type == XML_REGEXP_INITNAME) &&
1985                  (range2->type == XML_REGEXP_NOTINITNAME))
1986             ret = 0;
1987         else if ((range1->type == XML_REGEXP_NAMECHAR) &&
1988                  (range2->type == XML_REGEXP_NOTNAMECHAR))
1989             ret = 0;
1990         else if ((range1->type == XML_REGEXP_DECIMAL) &&
1991                  (range2->type == XML_REGEXP_NOTDECIMAL))
1992             ret = 0;
1993         else if ((range1->type == XML_REGEXP_REALCHAR) &&
1994                  (range2->type == XML_REGEXP_NOTREALCHAR))
1995             ret = 0;
1996         else {
1997             /* same thing to limit complexity */
1998             return(1);
1999         }
2000     } else {
2001         ret = 0;
2002         /* range1->type < range2->type here */
2003         switch (range1->type) {
2004             case XML_REGEXP_LETTER:
2005                  /* all disjoint except in the subgroups */
2006                  if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) ||
2007                      (range2->type == XML_REGEXP_LETTER_LOWERCASE) ||
2008                      (range2->type == XML_REGEXP_LETTER_TITLECASE) ||
2009                      (range2->type == XML_REGEXP_LETTER_MODIFIER) ||
2010                      (range2->type == XML_REGEXP_LETTER_OTHERS))
2011                      ret = 1;
2012                  break;
2013             case XML_REGEXP_MARK:
2014                  if ((range2->type == XML_REGEXP_MARK_NONSPACING) ||
2015                      (range2->type == XML_REGEXP_MARK_SPACECOMBINING) ||
2016                      (range2->type == XML_REGEXP_MARK_ENCLOSING))
2017                      ret = 1;
2018                  break;
2019             case XML_REGEXP_NUMBER:
2020                  if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) ||
2021                      (range2->type == XML_REGEXP_NUMBER_LETTER) ||
2022                      (range2->type == XML_REGEXP_NUMBER_OTHERS))
2023                      ret = 1;
2024                  break;
2025             case XML_REGEXP_PUNCT:
2026                  if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) ||
2027                      (range2->type == XML_REGEXP_PUNCT_DASH) ||
2028                      (range2->type == XML_REGEXP_PUNCT_OPEN) ||
2029                      (range2->type == XML_REGEXP_PUNCT_CLOSE) ||
2030                      (range2->type == XML_REGEXP_PUNCT_INITQUOTE) ||
2031                      (range2->type == XML_REGEXP_PUNCT_FINQUOTE) ||
2032                      (range2->type == XML_REGEXP_PUNCT_OTHERS))
2033                      ret = 1;
2034                  break;
2035             case XML_REGEXP_SEPAR:
2036                  if ((range2->type == XML_REGEXP_SEPAR_SPACE) ||
2037                      (range2->type == XML_REGEXP_SEPAR_LINE) ||
2038                      (range2->type == XML_REGEXP_SEPAR_PARA))
2039                      ret = 1;
2040                  break;
2041             case XML_REGEXP_SYMBOL:
2042                  if ((range2->type == XML_REGEXP_SYMBOL_MATH) ||
2043                      (range2->type == XML_REGEXP_SYMBOL_CURRENCY) ||
2044                      (range2->type == XML_REGEXP_SYMBOL_MODIFIER) ||
2045                      (range2->type == XML_REGEXP_SYMBOL_OTHERS))
2046                      ret = 1;
2047                  break;
2048             case XML_REGEXP_OTHER:
2049                  if ((range2->type == XML_REGEXP_OTHER_CONTROL) ||
2050                      (range2->type == XML_REGEXP_OTHER_FORMAT) ||
2051                      (range2->type == XML_REGEXP_OTHER_PRIVATE))
2052                      ret = 1;
2053                  break;
2054             default:
2055                  if ((range2->type >= XML_REGEXP_LETTER) &&
2056                      (range2->type < XML_REGEXP_BLOCK_NAME))
2057                      ret = 0;
2058                  else {
2059                      /* safety net ! */
2060                      return(1);
2061                  }
2062         }
2063     }
2064     if (((range1->neg == 0) && (range2->neg != 0)) ||
2065         ((range1->neg != 0) && (range2->neg == 0)))
2066         ret = !ret;
2067     return(1);
2068 }
2069 
2070 /**
2071  * xmlFACompareAtomTypes:
2072  * @type1:  an atom type
2073  * @type2:  an atom type
2074  *
2075  * Compares two atoms type to check whether they intersect in some ways,
2076  * this is used by xmlFACompareAtoms only
2077  *
2078  * Returns 1 if they may intersect and 0 otherwise
2079  */
2080 static int
xmlFACompareAtomTypes(xmlRegAtomType type1,xmlRegAtomType type2)2081 xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) {
2082     if ((type1 == XML_REGEXP_EPSILON) ||
2083         (type1 == XML_REGEXP_CHARVAL) ||
2084         (type1 == XML_REGEXP_RANGES) ||
2085         (type1 == XML_REGEXP_SUBREG) ||
2086         (type1 == XML_REGEXP_STRING) ||
2087         (type1 == XML_REGEXP_ANYCHAR))
2088         return(1);
2089     if ((type2 == XML_REGEXP_EPSILON) ||
2090         (type2 == XML_REGEXP_CHARVAL) ||
2091         (type2 == XML_REGEXP_RANGES) ||
2092         (type2 == XML_REGEXP_SUBREG) ||
2093         (type2 == XML_REGEXP_STRING) ||
2094         (type2 == XML_REGEXP_ANYCHAR))
2095         return(1);
2096 
2097     if (type1 == type2) return(1);
2098 
2099     /* simplify subsequent compares by making sure type1 < type2 */
2100     if (type1 > type2) {
2101         xmlRegAtomType tmp = type1;
2102         type1 = type2;
2103         type2 = tmp;
2104     }
2105     switch (type1) {
2106         case XML_REGEXP_ANYSPACE: /* \s */
2107             /* can't be a letter, number, mark, pontuation, symbol */
2108             if ((type2 == XML_REGEXP_NOTSPACE) ||
2109                 ((type2 >= XML_REGEXP_LETTER) &&
2110                  (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2111                 ((type2 >= XML_REGEXP_NUMBER) &&
2112                  (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2113                 ((type2 >= XML_REGEXP_MARK) &&
2114                  (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2115                 ((type2 >= XML_REGEXP_PUNCT) &&
2116                  (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2117                 ((type2 >= XML_REGEXP_SYMBOL) &&
2118                  (type2 <= XML_REGEXP_SYMBOL_OTHERS))
2119                 ) return(0);
2120             break;
2121         case XML_REGEXP_NOTSPACE: /* \S */
2122             break;
2123         case XML_REGEXP_INITNAME: /* \l */
2124             /* can't be a number, mark, separator, pontuation, symbol or other */
2125             if ((type2 == XML_REGEXP_NOTINITNAME) ||
2126                 ((type2 >= XML_REGEXP_NUMBER) &&
2127                  (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2128                 ((type2 >= XML_REGEXP_MARK) &&
2129                  (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2130                 ((type2 >= XML_REGEXP_SEPAR) &&
2131                  (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2132                 ((type2 >= XML_REGEXP_PUNCT) &&
2133                  (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2134                 ((type2 >= XML_REGEXP_SYMBOL) &&
2135                  (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2136                 ((type2 >= XML_REGEXP_OTHER) &&
2137                  (type2 <= XML_REGEXP_OTHER_NA))
2138                 ) return(0);
2139             break;
2140         case XML_REGEXP_NOTINITNAME: /* \L */
2141             break;
2142         case XML_REGEXP_NAMECHAR: /* \c */
2143             /* can't be a mark, separator, pontuation, symbol or other */
2144             if ((type2 == XML_REGEXP_NOTNAMECHAR) ||
2145                 ((type2 >= XML_REGEXP_MARK) &&
2146                  (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2147                 ((type2 >= XML_REGEXP_PUNCT) &&
2148                  (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2149                 ((type2 >= XML_REGEXP_SEPAR) &&
2150                  (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2151                 ((type2 >= XML_REGEXP_SYMBOL) &&
2152                  (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2153                 ((type2 >= XML_REGEXP_OTHER) &&
2154                  (type2 <= XML_REGEXP_OTHER_NA))
2155                 ) return(0);
2156             break;
2157         case XML_REGEXP_NOTNAMECHAR: /* \C */
2158             break;
2159         case XML_REGEXP_DECIMAL: /* \d */
2160             /* can't be a letter, mark, separator, pontuation, symbol or other */
2161             if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2162                 (type2 == XML_REGEXP_REALCHAR) ||
2163                 ((type2 >= XML_REGEXP_LETTER) &&
2164                  (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2165                 ((type2 >= XML_REGEXP_MARK) &&
2166                  (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2167                 ((type2 >= XML_REGEXP_PUNCT) &&
2168                  (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2169                 ((type2 >= XML_REGEXP_SEPAR) &&
2170                  (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2171                 ((type2 >= XML_REGEXP_SYMBOL) &&
2172                  (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2173                 ((type2 >= XML_REGEXP_OTHER) &&
2174                  (type2 <= XML_REGEXP_OTHER_NA))
2175                 )return(0);
2176             break;
2177         case XML_REGEXP_NOTDECIMAL: /* \D */
2178             break;
2179         case XML_REGEXP_REALCHAR: /* \w */
2180             /* can't be a mark, separator, pontuation, symbol or other */
2181             if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2182                 ((type2 >= XML_REGEXP_MARK) &&
2183                  (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2184                 ((type2 >= XML_REGEXP_PUNCT) &&
2185                  (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2186                 ((type2 >= XML_REGEXP_SEPAR) &&
2187                  (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2188                 ((type2 >= XML_REGEXP_SYMBOL) &&
2189                  (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2190                 ((type2 >= XML_REGEXP_OTHER) &&
2191                  (type2 <= XML_REGEXP_OTHER_NA))
2192                 )return(0);
2193             break;
2194         case XML_REGEXP_NOTREALCHAR: /* \W */
2195             break;
2196         /*
2197          * at that point we know both type 1 and type2 are from
2198          * character categories are ordered and are different,
2199          * it becomes simple because this is a partition
2200          */
2201         case XML_REGEXP_LETTER:
2202             if (type2 <= XML_REGEXP_LETTER_OTHERS)
2203                 return(1);
2204             return(0);
2205         case XML_REGEXP_LETTER_UPPERCASE:
2206         case XML_REGEXP_LETTER_LOWERCASE:
2207         case XML_REGEXP_LETTER_TITLECASE:
2208         case XML_REGEXP_LETTER_MODIFIER:
2209         case XML_REGEXP_LETTER_OTHERS:
2210             return(0);
2211         case XML_REGEXP_MARK:
2212             if (type2 <= XML_REGEXP_MARK_ENCLOSING)
2213                 return(1);
2214             return(0);
2215         case XML_REGEXP_MARK_NONSPACING:
2216         case XML_REGEXP_MARK_SPACECOMBINING:
2217         case XML_REGEXP_MARK_ENCLOSING:
2218             return(0);
2219         case XML_REGEXP_NUMBER:
2220             if (type2 <= XML_REGEXP_NUMBER_OTHERS)
2221                 return(1);
2222             return(0);
2223         case XML_REGEXP_NUMBER_DECIMAL:
2224         case XML_REGEXP_NUMBER_LETTER:
2225         case XML_REGEXP_NUMBER_OTHERS:
2226             return(0);
2227         case XML_REGEXP_PUNCT:
2228             if (type2 <= XML_REGEXP_PUNCT_OTHERS)
2229                 return(1);
2230             return(0);
2231         case XML_REGEXP_PUNCT_CONNECTOR:
2232         case XML_REGEXP_PUNCT_DASH:
2233         case XML_REGEXP_PUNCT_OPEN:
2234         case XML_REGEXP_PUNCT_CLOSE:
2235         case XML_REGEXP_PUNCT_INITQUOTE:
2236         case XML_REGEXP_PUNCT_FINQUOTE:
2237         case XML_REGEXP_PUNCT_OTHERS:
2238             return(0);
2239         case XML_REGEXP_SEPAR:
2240             if (type2 <= XML_REGEXP_SEPAR_PARA)
2241                 return(1);
2242             return(0);
2243         case XML_REGEXP_SEPAR_SPACE:
2244         case XML_REGEXP_SEPAR_LINE:
2245         case XML_REGEXP_SEPAR_PARA:
2246             return(0);
2247         case XML_REGEXP_SYMBOL:
2248             if (type2 <= XML_REGEXP_SYMBOL_OTHERS)
2249                 return(1);
2250             return(0);
2251         case XML_REGEXP_SYMBOL_MATH:
2252         case XML_REGEXP_SYMBOL_CURRENCY:
2253         case XML_REGEXP_SYMBOL_MODIFIER:
2254         case XML_REGEXP_SYMBOL_OTHERS:
2255             return(0);
2256         case XML_REGEXP_OTHER:
2257             if (type2 <= XML_REGEXP_OTHER_NA)
2258                 return(1);
2259             return(0);
2260         case XML_REGEXP_OTHER_CONTROL:
2261         case XML_REGEXP_OTHER_FORMAT:
2262         case XML_REGEXP_OTHER_PRIVATE:
2263         case XML_REGEXP_OTHER_NA:
2264             return(0);
2265         default:
2266             break;
2267     }
2268     return(1);
2269 }
2270 
2271 /**
2272  * xmlFAEqualAtoms:
2273  * @atom1:  an atom
2274  * @atom2:  an atom
2275  *
2276  * Compares two atoms to check whether they are the same exactly
2277  * this is used to remove equivalent transitions
2278  *
2279  * Returns 1 if same and 0 otherwise
2280  */
2281 static int
xmlFAEqualAtoms(xmlRegAtomPtr atom1,xmlRegAtomPtr atom2)2282 xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) {
2283     int ret = 0;
2284 
2285     if (atom1 == atom2)
2286         return(1);
2287     if ((atom1 == NULL) || (atom2 == NULL))
2288         return(0);
2289 
2290     if (atom1->type != atom2->type)
2291         return(0);
2292     switch (atom1->type) {
2293         case XML_REGEXP_EPSILON:
2294             ret = 0;
2295             break;
2296         case XML_REGEXP_STRING:
2297             ret = xmlStrEqual((xmlChar *)atom1->valuep,
2298                               (xmlChar *)atom2->valuep);
2299             break;
2300         case XML_REGEXP_CHARVAL:
2301             ret = (atom1->codepoint == atom2->codepoint);
2302             break;
2303         case XML_REGEXP_RANGES:
2304             /* too hard to do in the general case */
2305             ret = 0;
2306         default:
2307             break;
2308     }
2309     return(ret);
2310 }
2311 
2312 /**
2313  * xmlFACompareAtoms:
2314  * @atom1:  an atom
2315  * @atom2:  an atom
2316  *
2317  * Compares two atoms to check whether they intersect in some ways,
2318  * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only
2319  *
2320  * Returns 1 if yes and 0 otherwise
2321  */
2322 static int
xmlFACompareAtoms(xmlRegAtomPtr atom1,xmlRegAtomPtr atom2)2323 xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) {
2324     int ret = 1;
2325 
2326     if (atom1 == atom2)
2327         return(1);
2328     if ((atom1 == NULL) || (atom2 == NULL))
2329         return(0);
2330 
2331     if ((atom1->type == XML_REGEXP_ANYCHAR) ||
2332         (atom2->type == XML_REGEXP_ANYCHAR))
2333         return(1);
2334 
2335     if (atom1->type > atom2->type) {
2336         xmlRegAtomPtr tmp;
2337         tmp = atom1;
2338         atom1 = atom2;
2339         atom2 = tmp;
2340     }
2341     if (atom1->type != atom2->type) {
2342         ret = xmlFACompareAtomTypes(atom1->type, atom2->type);
2343         /* if they can't intersect at the type level break now */
2344         if (ret == 0)
2345             return(0);
2346     }
2347     switch (atom1->type) {
2348         case XML_REGEXP_STRING:
2349             ret = xmlRegStrEqualWildcard((xmlChar *)atom1->valuep,
2350                                          (xmlChar *)atom2->valuep);
2351             break;
2352         case XML_REGEXP_EPSILON:
2353             goto not_determinist;
2354         case XML_REGEXP_CHARVAL:
2355             if (atom2->type == XML_REGEXP_CHARVAL) {
2356                 ret = (atom1->codepoint == atom2->codepoint);
2357             } else {
2358                 ret = xmlRegCheckCharacter(atom2, atom1->codepoint);
2359                 if (ret < 0)
2360                     ret = 1;
2361             }
2362             break;
2363         case XML_REGEXP_RANGES:
2364             if (atom2->type == XML_REGEXP_RANGES) {
2365                 int i, j, res;
2366                 xmlRegRangePtr r1, r2;
2367 
2368                 /*
2369                  * need to check that none of the ranges eventually matches
2370                  */
2371                 for (i = 0;i < atom1->nbRanges;i++) {
2372                     for (j = 0;j < atom2->nbRanges;j++) {
2373                         r1 = atom1->ranges[i];
2374                         r2 = atom2->ranges[j];
2375                         res = xmlFACompareRanges(r1, r2);
2376                         if (res == 1) {
2377                             ret = 1;
2378                             goto done;
2379                         }
2380                     }
2381                 }
2382                 ret = 0;
2383             }
2384             break;
2385         default:
2386             goto not_determinist;
2387     }
2388 done:
2389     if (atom1->neg != atom2->neg) {
2390         ret = !ret;
2391     }
2392     if (ret == 0)
2393         return(0);
2394 not_determinist:
2395     return(1);
2396 }
2397 
2398 /**
2399  * xmlFARecurseDeterminism:
2400  * @ctxt:  a regexp parser context
2401  *
2402  * Check whether the associated regexp is determinist,
2403  * should be called after xmlFAEliminateEpsilonTransitions()
2404  *
2405  */
2406 static int
xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr state,int to,xmlRegAtomPtr atom)2407 xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
2408                          int to, xmlRegAtomPtr atom) {
2409     int ret = 1;
2410     int res;
2411     int transnr, nbTrans;
2412     xmlRegTransPtr t1;
2413 
2414     if (state == NULL)
2415         return(ret);
2416     /*
2417      * don't recurse on transitions potentially added in the course of
2418      * the elimination.
2419      */
2420     nbTrans = state->nbTrans;
2421     for (transnr = 0;transnr < nbTrans;transnr++) {
2422         t1 = &(state->trans[transnr]);
2423         /*
2424          * check transitions conflicting with the one looked at
2425          */
2426         if (t1->atom == NULL) {
2427             if (t1->to == -1)
2428                 continue;
2429             res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
2430                                            to, atom);
2431             if (res == 0) {
2432                 ret = 0;
2433                 /* t1->nd = 1; */
2434             }
2435             continue;
2436         }
2437         if (t1->to != to)
2438             continue;
2439         if (xmlFACompareAtoms(t1->atom, atom)) {
2440             ret = 0;
2441             /* mark the transition as non-deterministic */
2442             t1->nd = 1;
2443         }
2444     }
2445     return(ret);
2446 }
2447 
2448 /**
2449  * xmlFAComputesDeterminism:
2450  * @ctxt:  a regexp parser context
2451  *
2452  * Check whether the associated regexp is determinist,
2453  * should be called after xmlFAEliminateEpsilonTransitions()
2454  *
2455  */
2456 static int
xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt)2457 xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
2458     int statenr, transnr;
2459     xmlRegStatePtr state;
2460     xmlRegTransPtr t1, t2, last;
2461     int i;
2462     int ret = 1;
2463 
2464 #ifdef DEBUG_REGEXP_GRAPH
2465     printf("xmlFAComputesDeterminism\n");
2466     xmlRegPrintCtxt(stdout, ctxt);
2467 #endif
2468     if (ctxt->determinist != -1)
2469         return(ctxt->determinist);
2470 
2471     /*
2472      * First cleanup the automata removing cancelled transitions
2473      */
2474     for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2475         state = ctxt->states[statenr];
2476         if (state == NULL)
2477             continue;
2478         if (state->nbTrans < 2)
2479             continue;
2480         for (transnr = 0;transnr < state->nbTrans;transnr++) {
2481             t1 = &(state->trans[transnr]);
2482             /*
2483              * Determinism checks in case of counted or all transitions
2484              * will have to be handled separately
2485              */
2486             if (t1->atom == NULL) {
2487                 /* t1->nd = 1; */
2488                 continue;
2489             }
2490             if (t1->to == -1) /* eliminated */
2491                 continue;
2492             for (i = 0;i < transnr;i++) {
2493                 t2 = &(state->trans[i]);
2494                 if (t2->to == -1) /* eliminated */
2495                     continue;
2496                 if (t2->atom != NULL) {
2497                     if (t1->to == t2->to) {
2498                         if (xmlFAEqualAtoms(t1->atom, t2->atom))
2499                             t2->to = -1; /* eliminated */
2500                     }
2501                 }
2502             }
2503         }
2504     }
2505 
2506     /*
2507      * Check for all states that there aren't 2 transitions
2508      * with the same atom and a different target.
2509      */
2510     for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2511         state = ctxt->states[statenr];
2512         if (state == NULL)
2513             continue;
2514         if (state->nbTrans < 2)
2515             continue;
2516         last = NULL;
2517         for (transnr = 0;transnr < state->nbTrans;transnr++) {
2518             t1 = &(state->trans[transnr]);
2519             /*
2520              * Determinism checks in case of counted or all transitions
2521              * will have to be handled separately
2522              */
2523             if (t1->atom == NULL) {
2524                 continue;
2525             }
2526             if (t1->to == -1) /* eliminated */
2527                 continue;
2528             for (i = 0;i < transnr;i++) {
2529                 t2 = &(state->trans[i]);
2530                 if (t2->to == -1) /* eliminated */
2531                     continue;
2532                 if (t2->atom != NULL) {
2533                     /* not determinist ! */
2534                     if (xmlFACompareAtoms(t1->atom, t2->atom)) {
2535                         ret = 0;
2536                         /* mark the transitions as non-deterministic ones */
2537                         t1->nd = 1;
2538                         t2->nd = 1;
2539                         last = t1;
2540                     }
2541                 } else if (t1->to != -1) {
2542                     /*
2543                      * do the closure in case of remaining specific
2544                      * epsilon transitions like choices or all
2545                      */
2546                     ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
2547                                                    t2->to, t2->atom);
2548                     /* don't shortcut the computation so all non deterministic
2549                        transition get marked down
2550                     if (ret == 0)
2551                         return(0);
2552                      */
2553                     if (ret == 0) {
2554                         t1->nd = 1;
2555                         /* t2->nd = 1; */
2556                         last = t1;
2557                     }
2558                 }
2559             }
2560             /* don't shortcut the computation so all non deterministic
2561                transition get marked down
2562             if (ret == 0)
2563                 break; */
2564         }
2565 
2566         /*
2567          * mark specifically the last non-deterministic transition
2568          * from a state since there is no need to set-up rollback
2569          * from it
2570          */
2571         if (last != NULL) {
2572             last->nd = 2;
2573         }
2574 
2575         /* don't shortcut the computation so all non deterministic
2576            transition get marked down
2577         if (ret == 0)
2578             break; */
2579     }
2580 
2581     ctxt->determinist = ret;
2582     return(ret);
2583 }
2584 
2585 /************************************************************************
2586  *                                                                      *
2587  *      Routines to check input against transition atoms                *
2588  *                                                                      *
2589  ************************************************************************/
2590 
2591 static int
xmlRegCheckCharacterRange(xmlRegAtomType type,int codepoint,int neg,int start,int end,const xmlChar * blockName)2592 xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg,
2593                           int start, int end, const xmlChar *blockName) {
2594     int ret = 0;
2595 
2596     switch (type) {
2597         case XML_REGEXP_STRING:
2598         case XML_REGEXP_SUBREG:
2599         case XML_REGEXP_RANGES:
2600         case XML_REGEXP_EPSILON:
2601             return(-1);
2602         case XML_REGEXP_ANYCHAR:
2603             ret = ((codepoint != '\n') && (codepoint != '\r'));
2604             break;
2605         case XML_REGEXP_CHARVAL:
2606             ret = ((codepoint >= start) && (codepoint <= end));
2607             break;
2608         case XML_REGEXP_NOTSPACE:
2609             neg = !neg;
2610         case XML_REGEXP_ANYSPACE:
2611             ret = ((codepoint == '\n') || (codepoint == '\r') ||
2612                    (codepoint == '\t') || (codepoint == ' '));
2613             break;
2614         case XML_REGEXP_NOTINITNAME:
2615             neg = !neg;
2616         case XML_REGEXP_INITNAME:
2617             ret = (IS_LETTER(codepoint) ||
2618                    (codepoint == '_') || (codepoint == ':'));
2619             break;
2620         case XML_REGEXP_NOTNAMECHAR:
2621             neg = !neg;
2622         case XML_REGEXP_NAMECHAR:
2623             ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) ||
2624                    (codepoint == '.') || (codepoint == '-') ||
2625                    (codepoint == '_') || (codepoint == ':') ||
2626                    IS_COMBINING(codepoint) || IS_EXTENDER(codepoint));
2627             break;
2628         case XML_REGEXP_NOTDECIMAL:
2629             neg = !neg;
2630         case XML_REGEXP_DECIMAL:
2631             ret = xmlUCSIsCatNd(codepoint);
2632             break;
2633         case XML_REGEXP_REALCHAR:
2634             neg = !neg;
2635         case XML_REGEXP_NOTREALCHAR:
2636             ret = xmlUCSIsCatP(codepoint);
2637             if (ret == 0)
2638                 ret = xmlUCSIsCatZ(codepoint);
2639             if (ret == 0)
2640                 ret = xmlUCSIsCatC(codepoint);
2641             break;
2642         case XML_REGEXP_LETTER:
2643             ret = xmlUCSIsCatL(codepoint);
2644             break;
2645         case XML_REGEXP_LETTER_UPPERCASE:
2646             ret = xmlUCSIsCatLu(codepoint);
2647             break;
2648         case XML_REGEXP_LETTER_LOWERCASE:
2649             ret = xmlUCSIsCatLl(codepoint);
2650             break;
2651         case XML_REGEXP_LETTER_TITLECASE:
2652             ret = xmlUCSIsCatLt(codepoint);
2653             break;
2654         case XML_REGEXP_LETTER_MODIFIER:
2655             ret = xmlUCSIsCatLm(codepoint);
2656             break;
2657         case XML_REGEXP_LETTER_OTHERS:
2658             ret = xmlUCSIsCatLo(codepoint);
2659             break;
2660         case XML_REGEXP_MARK:
2661             ret = xmlUCSIsCatM(codepoint);
2662             break;
2663         case XML_REGEXP_MARK_NONSPACING:
2664             ret = xmlUCSIsCatMn(codepoint);
2665             break;
2666         case XML_REGEXP_MARK_SPACECOMBINING:
2667             ret = xmlUCSIsCatMc(codepoint);
2668             break;
2669         case XML_REGEXP_MARK_ENCLOSING:
2670             ret = xmlUCSIsCatMe(codepoint);
2671             break;
2672         case XML_REGEXP_NUMBER:
2673             ret = xmlUCSIsCatN(codepoint);
2674             break;
2675         case XML_REGEXP_NUMBER_DECIMAL:
2676             ret = xmlUCSIsCatNd(codepoint);
2677             break;
2678         case XML_REGEXP_NUMBER_LETTER:
2679             ret = xmlUCSIsCatNl(codepoint);
2680             break;
2681         case XML_REGEXP_NUMBER_OTHERS:
2682             ret = xmlUCSIsCatNo(codepoint);
2683             break;
2684         case XML_REGEXP_PUNCT:
2685             ret = xmlUCSIsCatP(codepoint);
2686             break;
2687         case XML_REGEXP_PUNCT_CONNECTOR:
2688             ret = xmlUCSIsCatPc(codepoint);
2689             break;
2690         case XML_REGEXP_PUNCT_DASH:
2691             ret = xmlUCSIsCatPd(codepoint);
2692             break;
2693         case XML_REGEXP_PUNCT_OPEN:
2694             ret = xmlUCSIsCatPs(codepoint);
2695             break;
2696         case XML_REGEXP_PUNCT_CLOSE:
2697             ret = xmlUCSIsCatPe(codepoint);
2698             break;
2699         case XML_REGEXP_PUNCT_INITQUOTE:
2700             ret = xmlUCSIsCatPi(codepoint);
2701             break;
2702         case XML_REGEXP_PUNCT_FINQUOTE:
2703             ret = xmlUCSIsCatPf(codepoint);
2704             break;
2705         case XML_REGEXP_PUNCT_OTHERS:
2706             ret = xmlUCSIsCatPo(codepoint);
2707             break;
2708         case XML_REGEXP_SEPAR:
2709             ret = xmlUCSIsCatZ(codepoint);
2710             break;
2711         case XML_REGEXP_SEPAR_SPACE:
2712             ret = xmlUCSIsCatZs(codepoint);
2713             break;
2714         case XML_REGEXP_SEPAR_LINE:
2715             ret = xmlUCSIsCatZl(codepoint);
2716             break;
2717         case XML_REGEXP_SEPAR_PARA:
2718             ret = xmlUCSIsCatZp(codepoint);
2719             break;
2720         case XML_REGEXP_SYMBOL:
2721             ret = xmlUCSIsCatS(codepoint);
2722             break;
2723         case XML_REGEXP_SYMBOL_MATH:
2724             ret = xmlUCSIsCatSm(codepoint);
2725             break;
2726         case XML_REGEXP_SYMBOL_CURRENCY:
2727             ret = xmlUCSIsCatSc(codepoint);
2728             break;
2729         case XML_REGEXP_SYMBOL_MODIFIER:
2730             ret = xmlUCSIsCatSk(codepoint);
2731             break;
2732         case XML_REGEXP_SYMBOL_OTHERS:
2733             ret = xmlUCSIsCatSo(codepoint);
2734             break;
2735         case XML_REGEXP_OTHER:
2736             ret = xmlUCSIsCatC(codepoint);
2737             break;
2738         case XML_REGEXP_OTHER_CONTROL:
2739             ret = xmlUCSIsCatCc(codepoint);
2740             break;
2741         case XML_REGEXP_OTHER_FORMAT:
2742             ret = xmlUCSIsCatCf(codepoint);
2743             break;
2744         case XML_REGEXP_OTHER_PRIVATE:
2745             ret = xmlUCSIsCatCo(codepoint);
2746             break;
2747         case XML_REGEXP_OTHER_NA:
2748             /* ret = xmlUCSIsCatCn(codepoint); */
2749             /* Seems it doesn't exist anymore in recent Unicode releases */
2750             ret = 0;
2751             break;
2752         case XML_REGEXP_BLOCK_NAME:
2753             ret = xmlUCSIsBlock(codepoint, (const char *) blockName);
2754             break;
2755     }
2756     if (neg)
2757         return(!ret);
2758     return(ret);
2759 }
2760 
2761 static int
xmlRegCheckCharacter(xmlRegAtomPtr atom,int codepoint)2762 xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) {
2763     int i, ret = 0;
2764     xmlRegRangePtr range;
2765 
2766     if ((atom == NULL) || (!IS_CHAR(codepoint)))
2767         return(-1);
2768 
2769     switch (atom->type) {
2770         case XML_REGEXP_SUBREG:
2771         case XML_REGEXP_EPSILON:
2772             return(-1);
2773         case XML_REGEXP_CHARVAL:
2774             return(codepoint == atom->codepoint);
2775         case XML_REGEXP_RANGES: {
2776             int accept = 0;
2777 
2778             for (i = 0;i < atom->nbRanges;i++) {
2779                 range = atom->ranges[i];
2780                 if (range->neg == 2) {
2781                     ret = xmlRegCheckCharacterRange(range->type, codepoint,
2782                                                 0, range->start, range->end,
2783                                                 range->blockName);
2784                     if (ret != 0)
2785                         return(0); /* excluded char */
2786                 } else if (range->neg) {
2787                     ret = xmlRegCheckCharacterRange(range->type, codepoint,
2788                                                 0, range->start, range->end,
2789                                                 range->blockName);
2790                     if (ret == 0)
2791                         accept = 1;
2792                     else
2793                         return(0);
2794                 } else {
2795                     ret = xmlRegCheckCharacterRange(range->type, codepoint,
2796                                                 0, range->start, range->end,
2797                                                 range->blockName);
2798                     if (ret != 0)
2799                         accept = 1; /* might still be excluded */
2800                 }
2801             }
2802             return(accept);
2803         }
2804         case XML_REGEXP_STRING:
2805             printf("TODO: XML_REGEXP_STRING\n");
2806             return(-1);
2807         case XML_REGEXP_ANYCHAR:
2808         case XML_REGEXP_ANYSPACE:
2809         case XML_REGEXP_NOTSPACE:
2810         case XML_REGEXP_INITNAME:
2811         case XML_REGEXP_NOTINITNAME:
2812         case XML_REGEXP_NAMECHAR:
2813         case XML_REGEXP_NOTNAMECHAR:
2814         case XML_REGEXP_DECIMAL:
2815         case XML_REGEXP_NOTDECIMAL:
2816         case XML_REGEXP_REALCHAR:
2817         case XML_REGEXP_NOTREALCHAR:
2818         case XML_REGEXP_LETTER:
2819         case XML_REGEXP_LETTER_UPPERCASE:
2820         case XML_REGEXP_LETTER_LOWERCASE:
2821         case XML_REGEXP_LETTER_TITLECASE:
2822         case XML_REGEXP_LETTER_MODIFIER:
2823         case XML_REGEXP_LETTER_OTHERS:
2824         case XML_REGEXP_MARK:
2825         case XML_REGEXP_MARK_NONSPACING:
2826         case XML_REGEXP_MARK_SPACECOMBINING:
2827         case XML_REGEXP_MARK_ENCLOSING:
2828         case XML_REGEXP_NUMBER:
2829         case XML_REGEXP_NUMBER_DECIMAL:
2830         case XML_REGEXP_NUMBER_LETTER:
2831         case XML_REGEXP_NUMBER_OTHERS:
2832         case XML_REGEXP_PUNCT:
2833         case XML_REGEXP_PUNCT_CONNECTOR:
2834         case XML_REGEXP_PUNCT_DASH:
2835         case XML_REGEXP_PUNCT_OPEN:
2836         case XML_REGEXP_PUNCT_CLOSE:
2837         case XML_REGEXP_PUNCT_INITQUOTE:
2838         case XML_REGEXP_PUNCT_FINQUOTE:
2839         case XML_REGEXP_PUNCT_OTHERS:
2840         case XML_REGEXP_SEPAR:
2841         case XML_REGEXP_SEPAR_SPACE:
2842         case XML_REGEXP_SEPAR_LINE:
2843         case XML_REGEXP_SEPAR_PARA:
2844         case XML_REGEXP_SYMBOL:
2845         case XML_REGEXP_SYMBOL_MATH:
2846         case XML_REGEXP_SYMBOL_CURRENCY:
2847         case XML_REGEXP_SYMBOL_MODIFIER:
2848         case XML_REGEXP_SYMBOL_OTHERS:
2849         case XML_REGEXP_OTHER:
2850         case XML_REGEXP_OTHER_CONTROL:
2851         case XML_REGEXP_OTHER_FORMAT:
2852         case XML_REGEXP_OTHER_PRIVATE:
2853         case XML_REGEXP_OTHER_NA:
2854         case XML_REGEXP_BLOCK_NAME:
2855             ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0,
2856                                             (const xmlChar *)atom->valuep);
2857             if (atom->neg)
2858                 ret = !ret;
2859             break;
2860     }
2861     return(ret);
2862 }
2863 
2864 /************************************************************************
2865  *                                                                      *
2866  *      Saving and restoring state of an execution context              *
2867  *                                                                      *
2868  ************************************************************************/
2869 
2870 #ifdef DEBUG_REGEXP_EXEC
2871 static void
xmlFARegDebugExec(xmlRegExecCtxtPtr exec)2872 xmlFARegDebugExec(xmlRegExecCtxtPtr exec) {
2873     printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index);
2874     if (exec->inputStack != NULL) {
2875         int i;
2876         printf(": ");
2877         for (i = 0;(i < 3) && (i < exec->inputStackNr);i++)
2878             printf("%s ", exec->inputStack[exec->inputStackNr - (i + 1)]);
2879     } else {
2880         printf(": %s", &(exec->inputString[exec->index]));
2881     }
2882     printf("\n");
2883 }
2884 #endif
2885 
2886 static void
xmlFARegExecSave(xmlRegExecCtxtPtr exec)2887 xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
2888 #ifdef DEBUG_REGEXP_EXEC
2889     printf("saving ");
2890     exec->transno++;
2891     xmlFARegDebugExec(exec);
2892     exec->transno--;
2893 #endif
2894 #ifdef MAX_PUSH
2895     if (exec->nbPush > MAX_PUSH) {
2896         return;
2897     }
2898     exec->nbPush++;
2899 #endif
2900 
2901     if (exec->maxRollbacks == 0) {
2902         exec->maxRollbacks = 4;
2903         exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks *
2904                                              sizeof(xmlRegExecRollback));
2905         if (exec->rollbacks == NULL) {
2906             xmlRegexpErrMemory(NULL, "saving regexp");
2907             exec->maxRollbacks = 0;
2908             return;
2909         }
2910         memset(exec->rollbacks, 0,
2911                exec->maxRollbacks * sizeof(xmlRegExecRollback));
2912     } else if (exec->nbRollbacks >= exec->maxRollbacks) {
2913         xmlRegExecRollback *tmp;
2914         int len = exec->maxRollbacks;
2915 
2916         exec->maxRollbacks *= 2;
2917         tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks,
2918                         exec->maxRollbacks * sizeof(xmlRegExecRollback));
2919         if (tmp == NULL) {
2920             xmlRegexpErrMemory(NULL, "saving regexp");
2921             exec->maxRollbacks /= 2;
2922             return;
2923         }
2924         exec->rollbacks = tmp;
2925         tmp = &exec->rollbacks[len];
2926         memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback));
2927     }
2928     exec->rollbacks[exec->nbRollbacks].state = exec->state;
2929     exec->rollbacks[exec->nbRollbacks].index = exec->index;
2930     exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1;
2931     if (exec->comp->nbCounters > 0) {
2932         if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
2933             exec->rollbacks[exec->nbRollbacks].counts = (int *)
2934                 xmlMalloc(exec->comp->nbCounters * sizeof(int));
2935             if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
2936                 xmlRegexpErrMemory(NULL, "saving regexp");
2937                 exec->status = -5;
2938                 return;
2939             }
2940         }
2941         memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts,
2942                exec->comp->nbCounters * sizeof(int));
2943     }
2944     exec->nbRollbacks++;
2945 }
2946 
2947 static void
xmlFARegExecRollBack(xmlRegExecCtxtPtr exec)2948 xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) {
2949     if (exec->nbRollbacks <= 0) {
2950         exec->status = -1;
2951 #ifdef DEBUG_REGEXP_EXEC
2952         printf("rollback failed on empty stack\n");
2953 #endif
2954         return;
2955     }
2956     exec->nbRollbacks--;
2957     exec->state = exec->rollbacks[exec->nbRollbacks].state;
2958     exec->index = exec->rollbacks[exec->nbRollbacks].index;
2959     exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch;
2960     if (exec->comp->nbCounters > 0) {
2961         if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
2962             fprintf(stderr, "exec save: allocation failed");
2963             exec->status = -6;
2964             return;
2965         }
2966         memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts,
2967                exec->comp->nbCounters * sizeof(int));
2968     }
2969 
2970 #ifdef DEBUG_REGEXP_EXEC
2971     printf("restored ");
2972     xmlFARegDebugExec(exec);
2973 #endif
2974 }
2975 
2976 /************************************************************************
2977  *                                                                      *
2978  *      Verifier, running an input against a compiled regexp            *
2979  *                                                                      *
2980  ************************************************************************/
2981 
2982 static int
xmlFARegExec(xmlRegexpPtr comp,const xmlChar * content)2983 xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
2984     xmlRegExecCtxt execval;
2985     xmlRegExecCtxtPtr exec = &execval;
2986     int ret, codepoint = 0, len, deter;
2987 
2988     exec->inputString = content;
2989     exec->index = 0;
2990     exec->nbPush = 0;
2991     exec->determinist = 1;
2992     exec->maxRollbacks = 0;
2993     exec->nbRollbacks = 0;
2994     exec->rollbacks = NULL;
2995     exec->status = 0;
2996     exec->comp = comp;
2997     exec->state = comp->states[0];
2998     exec->transno = 0;
2999     exec->transcount = 0;
3000     exec->inputStack = NULL;
3001     exec->inputStackMax = 0;
3002     if (comp->nbCounters > 0) {
3003         exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
3004         if (exec->counts == NULL) {
3005             xmlRegexpErrMemory(NULL, "running regexp");
3006             return(-1);
3007         }
3008         memset(exec->counts, 0, comp->nbCounters * sizeof(int));
3009     } else
3010         exec->counts = NULL;
3011     while ((exec->status == 0) &&
3012            ((exec->inputString[exec->index] != 0) ||
3013             (exec->state->type != XML_REGEXP_FINAL_STATE))) {
3014         xmlRegTransPtr trans;
3015         xmlRegAtomPtr atom;
3016 
3017         /*
3018          * If end of input on non-terminal state, rollback, however we may
3019          * still have epsilon like transition for counted transitions
3020          * on counters, in that case don't break too early.  Additionally,
3021          * if we are working on a range like "AB{0,2}", where B is not present,
3022          * we don't want to break.
3023          */
3024         len = 1;
3025         if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) {
3026             /*
3027              * if there is a transition, we must check if
3028              *  atom allows minOccurs of 0
3029              */
3030             if (exec->transno < exec->state->nbTrans) {
3031                 trans = &exec->state->trans[exec->transno];
3032                 if (trans->to >=0) {
3033                     atom = trans->atom;
3034                     if (!((atom->min == 0) && (atom->max > 0)))
3035                         goto rollback;
3036                 }
3037             } else
3038                 goto rollback;
3039         }
3040 
3041         exec->transcount = 0;
3042         for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3043             trans = &exec->state->trans[exec->transno];
3044             if (trans->to < 0)
3045                 continue;
3046             atom = trans->atom;
3047             ret = 0;
3048             deter = 1;
3049             if (trans->count >= 0) {
3050                 int count;
3051                 xmlRegCounterPtr counter;
3052 
3053                 if (exec->counts == NULL) {
3054                     exec->status = -1;
3055                     goto error;
3056                 }
3057                 /*
3058                  * A counted transition.
3059                  */
3060 
3061                 count = exec->counts[trans->count];
3062                 counter = &exec->comp->counters[trans->count];
3063 #ifdef DEBUG_REGEXP_EXEC
3064                 printf("testing count %d: val %d, min %d, max %d\n",
3065                        trans->count, count, counter->min,  counter->max);
3066 #endif
3067                 ret = ((count >= counter->min) && (count <= counter->max));
3068                 if ((ret) && (counter->min != counter->max))
3069                     deter = 0;
3070             } else if (atom == NULL) {
3071                 fprintf(stderr, "epsilon transition left at runtime\n");
3072                 exec->status = -2;
3073                 break;
3074             } else if (exec->inputString[exec->index] != 0) {
3075                 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
3076                 ret = xmlRegCheckCharacter(atom, codepoint);
3077                 if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) {
3078                     xmlRegStatePtr to = comp->states[trans->to];
3079 
3080                     /*
3081                      * this is a multiple input sequence
3082                      * If there is a counter associated increment it now.
3083                      * before potentially saving and rollback
3084                      */
3085                     if (trans->counter >= 0) {
3086                         if (exec->counts == NULL) {
3087                             exec->status = -1;
3088                             goto error;
3089                         }
3090 #ifdef DEBUG_REGEXP_EXEC
3091                         printf("Increasing count %d\n", trans->counter);
3092 #endif
3093                         exec->counts[trans->counter]++;
3094                     }
3095                     if (exec->state->nbTrans > exec->transno + 1) {
3096                         xmlFARegExecSave(exec);
3097                     }
3098                     exec->transcount = 1;
3099                     do {
3100                         /*
3101                          * Try to progress as much as possible on the input
3102                          */
3103                         if (exec->transcount == atom->max) {
3104                             break;
3105                         }
3106                         exec->index += len;
3107                         /*
3108                          * End of input: stop here
3109                          */
3110                         if (exec->inputString[exec->index] == 0) {
3111                             exec->index -= len;
3112                             break;
3113                         }
3114                         if (exec->transcount >= atom->min) {
3115                             int transno = exec->transno;
3116                             xmlRegStatePtr state = exec->state;
3117 
3118                             /*
3119                              * The transition is acceptable save it
3120                              */
3121                             exec->transno = -1; /* trick */
3122                             exec->state = to;
3123                             xmlFARegExecSave(exec);
3124                             exec->transno = transno;
3125                             exec->state = state;
3126                         }
3127                         codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
3128                                               len);
3129                         ret = xmlRegCheckCharacter(atom, codepoint);
3130                         exec->transcount++;
3131                     } while (ret == 1);
3132                     if (exec->transcount < atom->min)
3133                         ret = 0;
3134 
3135                     /*
3136                      * If the last check failed but one transition was found
3137                      * possible, rollback
3138                      */
3139                     if (ret < 0)
3140                         ret = 0;
3141                     if (ret == 0) {
3142                         goto rollback;
3143                     }
3144                     if (trans->counter >= 0) {
3145                         if (exec->counts == NULL) {
3146                             exec->status = -1;
3147                             goto error;
3148                         }
3149 #ifdef DEBUG_REGEXP_EXEC
3150                         printf("Decreasing count %d\n", trans->counter);
3151 #endif
3152                         exec->counts[trans->counter]--;
3153                     }
3154                 } else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) {
3155                     /*
3156                      * we don't match on the codepoint, but minOccurs of 0
3157                      * says that's ok.  Setting len to 0 inhibits stepping
3158                      * over the codepoint.
3159                      */
3160                     exec->transcount = 1;
3161                     len = 0;
3162                     ret = 1;
3163                 }
3164             } else if ((atom->min == 0) && (atom->max > 0)) {
3165                 /* another spot to match when minOccurs is 0 */
3166                 exec->transcount = 1;
3167                 len = 0;
3168                 ret = 1;
3169             }
3170             if (ret == 1) {
3171                 if ((trans->nd == 1) ||
3172                     ((trans->count >= 0) && (deter == 0) &&
3173                      (exec->state->nbTrans > exec->transno + 1))) {
3174 #ifdef DEBUG_REGEXP_EXEC
3175                     if (trans->nd == 1)
3176                         printf("Saving on nd transition atom %d for %c at %d\n",
3177                                trans->atom->no, codepoint, exec->index);
3178                     else
3179                         printf("Saving on counted transition count %d for %c at %d\n",
3180                                trans->count, codepoint, exec->index);
3181 #endif
3182                     xmlFARegExecSave(exec);
3183                 }
3184                 if (trans->counter >= 0) {
3185                     if (exec->counts == NULL) {
3186                         exec->status = -1;
3187                         goto error;
3188                     }
3189 #ifdef DEBUG_REGEXP_EXEC
3190                     printf("Increasing count %d\n", trans->counter);
3191 #endif
3192                     exec->counts[trans->counter]++;
3193                 }
3194                 if ((trans->count >= 0) &&
3195                     (trans->count < REGEXP_ALL_COUNTER)) {
3196                     if (exec->counts == NULL) {
3197                         exec->status = -1;
3198                         goto error;
3199                     }
3200 #ifdef DEBUG_REGEXP_EXEC
3201                     printf("resetting count %d on transition\n",
3202                            trans->count);
3203 #endif
3204                     exec->counts[trans->count] = 0;
3205                 }
3206 #ifdef DEBUG_REGEXP_EXEC
3207                 printf("entering state %d\n", trans->to);
3208 #endif
3209                 exec->state = comp->states[trans->to];
3210                 exec->transno = 0;
3211                 if (trans->atom != NULL) {
3212                     exec->index += len;
3213                 }
3214                 goto progress;
3215             } else if (ret < 0) {
3216                 exec->status = -4;
3217                 break;
3218             }
3219         }
3220         if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
3221 rollback:
3222             /*
3223              * Failed to find a way out
3224              */
3225             exec->determinist = 0;
3226 #ifdef DEBUG_REGEXP_EXEC
3227             printf("rollback from state %d on %d:%c\n", exec->state->no,
3228                    codepoint,codepoint);
3229 #endif
3230             xmlFARegExecRollBack(exec);
3231         }
3232 progress:
3233         continue;
3234     }
3235 error:
3236     if (exec->rollbacks != NULL) {
3237         if (exec->counts != NULL) {
3238             int i;
3239 
3240             for (i = 0;i < exec->maxRollbacks;i++)
3241                 if (exec->rollbacks[i].counts != NULL)
3242                     xmlFree(exec->rollbacks[i].counts);
3243         }
3244         xmlFree(exec->rollbacks);
3245     }
3246     if (exec->counts != NULL)
3247         xmlFree(exec->counts);
3248     if (exec->status == 0)
3249         return(1);
3250     if (exec->status == -1) {
3251         if (exec->nbPush > MAX_PUSH)
3252             return(-1);
3253         return(0);
3254     }
3255     return(exec->status);
3256 }
3257 
3258 /************************************************************************
3259  *                                                                      *
3260  *      Progressive interface to the verifier one atom at a time        *
3261  *                                                                      *
3262  ************************************************************************/
3263 #ifdef DEBUG_ERR
3264 static void testerr(xmlRegExecCtxtPtr exec);
3265 #endif
3266 
3267 /**
3268  * xmlRegNewExecCtxt:
3269  * @comp: a precompiled regular expression
3270  * @callback: a callback function used for handling progresses in the
3271  *            automata matching phase
3272  * @data: the context data associated to the callback in this context
3273  *
3274  * Build a context used for progressive evaluation of a regexp.
3275  *
3276  * Returns the new context
3277  */
3278 xmlRegExecCtxtPtr
xmlRegNewExecCtxt(xmlRegexpPtr comp,xmlRegExecCallbacks callback,void * data)3279 xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) {
3280     xmlRegExecCtxtPtr exec;
3281 
3282     if (comp == NULL)
3283         return(NULL);
3284     if ((comp->compact == NULL) && (comp->states == NULL))
3285         return(NULL);
3286     exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt));
3287     if (exec == NULL) {
3288         xmlRegexpErrMemory(NULL, "creating execution context");
3289         return(NULL);
3290     }
3291     memset(exec, 0, sizeof(xmlRegExecCtxt));
3292     exec->inputString = NULL;
3293     exec->index = 0;
3294     exec->determinist = 1;
3295     exec->maxRollbacks = 0;
3296     exec->nbRollbacks = 0;
3297     exec->rollbacks = NULL;
3298     exec->status = 0;
3299     exec->comp = comp;
3300     if (comp->compact == NULL)
3301         exec->state = comp->states[0];
3302     exec->transno = 0;
3303     exec->transcount = 0;
3304     exec->callback = callback;
3305     exec->data = data;
3306     if (comp->nbCounters > 0) {
3307         /*
3308          * For error handling, exec->counts is allocated twice the size
3309          * the second half is used to store the data in case of rollback
3310          */
3311         exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)
3312                                          * 2);
3313         if (exec->counts == NULL) {
3314             xmlRegexpErrMemory(NULL, "creating execution context");
3315             xmlFree(exec);
3316             return(NULL);
3317         }
3318         memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2);
3319         exec->errCounts = &exec->counts[comp->nbCounters];
3320     } else {
3321         exec->counts = NULL;
3322         exec->errCounts = NULL;
3323     }
3324     exec->inputStackMax = 0;
3325     exec->inputStackNr = 0;
3326     exec->inputStack = NULL;
3327     exec->errStateNo = -1;
3328     exec->errString = NULL;
3329     exec->nbPush = 0;
3330     return(exec);
3331 }
3332 
3333 /**
3334  * xmlRegFreeExecCtxt:
3335  * @exec: a regular expression evaulation context
3336  *
3337  * Free the structures associated to a regular expression evaulation context.
3338  */
3339 void
xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec)3340 xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) {
3341     if (exec == NULL)
3342         return;
3343 
3344     if (exec->rollbacks != NULL) {
3345         if (exec->counts != NULL) {
3346             int i;
3347 
3348             for (i = 0;i < exec->maxRollbacks;i++)
3349                 if (exec->rollbacks[i].counts != NULL)
3350                     xmlFree(exec->rollbacks[i].counts);
3351         }
3352         xmlFree(exec->rollbacks);
3353     }
3354     if (exec->counts != NULL)
3355         xmlFree(exec->counts);
3356     if (exec->inputStack != NULL) {
3357         int i;
3358 
3359         for (i = 0;i < exec->inputStackNr;i++) {
3360             if (exec->inputStack[i].value != NULL)
3361                 xmlFree(exec->inputStack[i].value);
3362         }
3363         xmlFree(exec->inputStack);
3364     }
3365     if (exec->errString != NULL)
3366         xmlFree(exec->errString);
3367     xmlFree(exec);
3368 }
3369 
3370 static void
xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec,const xmlChar * value,void * data)3371 xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value,
3372                             void *data) {
3373 #ifdef DEBUG_PUSH
3374     printf("saving value: %d:%s\n", exec->inputStackNr, value);
3375 #endif
3376     if (exec->inputStackMax == 0) {
3377         exec->inputStackMax = 4;
3378         exec->inputStack = (xmlRegInputTokenPtr)
3379             xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken));
3380         if (exec->inputStack == NULL) {
3381             xmlRegexpErrMemory(NULL, "pushing input string");
3382             exec->inputStackMax = 0;
3383             return;
3384         }
3385     } else if (exec->inputStackNr + 1 >= exec->inputStackMax) {
3386         xmlRegInputTokenPtr tmp;
3387 
3388         exec->inputStackMax *= 2;
3389         tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack,
3390                         exec->inputStackMax * sizeof(xmlRegInputToken));
3391         if (tmp == NULL) {
3392             xmlRegexpErrMemory(NULL, "pushing input string");
3393             exec->inputStackMax /= 2;
3394             return;
3395         }
3396         exec->inputStack = tmp;
3397     }
3398     exec->inputStack[exec->inputStackNr].value = xmlStrdup(value);
3399     exec->inputStack[exec->inputStackNr].data = data;
3400     exec->inputStackNr++;
3401     exec->inputStack[exec->inputStackNr].value = NULL;
3402     exec->inputStack[exec->inputStackNr].data = NULL;
3403 }
3404 
3405 /**
3406  * xmlRegStrEqualWildcard:
3407  * @expStr:  the string to be evaluated
3408  * @valStr:  the validation string
3409  *
3410  * Checks if both strings are equal or have the same content. "*"
3411  * can be used as a wildcard in @valStr; "|" is used as a seperator of
3412  * substrings in both @expStr and @valStr.
3413  *
3414  * Returns 1 if the comparison is satisfied and the number of substrings
3415  * is equal, 0 otherwise.
3416  */
3417 
3418 static int
xmlRegStrEqualWildcard(const xmlChar * expStr,const xmlChar * valStr)3419 xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) {
3420     if (expStr == valStr) return(1);
3421     if (expStr == NULL) return(0);
3422     if (valStr == NULL) return(0);
3423     do {
3424         /*
3425         * Eval if we have a wildcard for the current item.
3426         */
3427         if (*expStr != *valStr) {
3428             /* if one of them starts with a wildcard make valStr be it */
3429             if (*valStr == '*') {
3430                 const xmlChar *tmp;
3431 
3432                 tmp = valStr;
3433                 valStr = expStr;
3434                 expStr = tmp;
3435             }
3436             if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) {
3437                 do {
3438                     if (*valStr == XML_REG_STRING_SEPARATOR)
3439                         break;
3440                     valStr++;
3441                 } while (*valStr != 0);
3442                 continue;
3443             } else
3444                 return(0);
3445         }
3446         expStr++;
3447         valStr++;
3448     } while (*valStr != 0);
3449     if (*expStr != 0)
3450         return (0);
3451     else
3452         return (1);
3453 }
3454 
3455 /**
3456  * xmlRegCompactPushString:
3457  * @exec: a regexp execution context
3458  * @comp:  the precompiled exec with a compact table
3459  * @value: a string token input
3460  * @data: data associated to the token to reuse in callbacks
3461  *
3462  * Push one input token in the execution context
3463  *
3464  * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3465  *     a negative value in case of error.
3466  */
3467 static int
xmlRegCompactPushString(xmlRegExecCtxtPtr exec,xmlRegexpPtr comp,const xmlChar * value,void * data)3468 xmlRegCompactPushString(xmlRegExecCtxtPtr exec,
3469                         xmlRegexpPtr comp,
3470                         const xmlChar *value,
3471                         void *data) {
3472     int state = exec->index;
3473     int i, target;
3474 
3475     if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL))
3476         return(-1);
3477 
3478     if (value == NULL) {
3479         /*
3480          * are we at a final state ?
3481          */
3482         if (comp->compact[state * (comp->nbstrings + 1)] ==
3483             XML_REGEXP_FINAL_STATE)
3484             return(1);
3485         return(0);
3486     }
3487 
3488 #ifdef DEBUG_PUSH
3489     printf("value pushed: %s\n", value);
3490 #endif
3491 
3492     /*
3493      * Examine all outside transitions from current state
3494      */
3495     for (i = 0;i < comp->nbstrings;i++) {
3496         target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
3497         if ((target > 0) && (target <= comp->nbstates)) {
3498             target--; /* to avoid 0 */
3499             if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) {
3500                 exec->index = target;
3501                 if ((exec->callback != NULL) && (comp->transdata != NULL)) {
3502                     exec->callback(exec->data, value,
3503                           comp->transdata[state * comp->nbstrings + i], data);
3504                 }
3505 #ifdef DEBUG_PUSH
3506                 printf("entering state %d\n", target);
3507 #endif
3508                 if (comp->compact[target * (comp->nbstrings + 1)] ==
3509                     XML_REGEXP_SINK_STATE)
3510                     goto error;
3511 
3512                 if (comp->compact[target * (comp->nbstrings + 1)] ==
3513                     XML_REGEXP_FINAL_STATE)
3514                     return(1);
3515                 return(0);
3516             }
3517         }
3518     }
3519     /*
3520      * Failed to find an exit transition out from current state for the
3521      * current token
3522      */
3523 #ifdef DEBUG_PUSH
3524     printf("failed to find a transition for %s on state %d\n", value, state);
3525 #endif
3526 error:
3527     if (exec->errString != NULL)
3528         xmlFree(exec->errString);
3529     exec->errString = xmlStrdup(value);
3530     exec->errStateNo = state;
3531     exec->status = -1;
3532 #ifdef DEBUG_ERR
3533     testerr(exec);
3534 #endif
3535     return(-1);
3536 }
3537 
3538 /**
3539  * xmlRegExecPushStringInternal:
3540  * @exec: a regexp execution context or NULL to indicate the end
3541  * @value: a string token input
3542  * @data: data associated to the token to reuse in callbacks
3543  * @compound: value was assembled from 2 strings
3544  *
3545  * Push one input token in the execution context
3546  *
3547  * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3548  *     a negative value in case of error.
3549  */
3550 static int
xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec,const xmlChar * value,void * data,int compound)3551 xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value,
3552                              void *data, int compound) {
3553     xmlRegTransPtr trans;
3554     xmlRegAtomPtr atom;
3555     int ret;
3556     int final = 0;
3557     int progress = 1;
3558 
3559     if (exec == NULL)
3560         return(-1);
3561     if (exec->comp == NULL)
3562         return(-1);
3563     if (exec->status != 0)
3564         return(exec->status);
3565 
3566     if (exec->comp->compact != NULL)
3567         return(xmlRegCompactPushString(exec, exec->comp, value, data));
3568 
3569     if (value == NULL) {
3570         if (exec->state->type == XML_REGEXP_FINAL_STATE)
3571             return(1);
3572         final = 1;
3573     }
3574 
3575 #ifdef DEBUG_PUSH
3576     printf("value pushed: %s\n", value);
3577 #endif
3578     /*
3579      * If we have an active rollback stack push the new value there
3580      * and get back to where we were left
3581      */
3582     if ((value != NULL) && (exec->inputStackNr > 0)) {
3583         xmlFARegExecSaveInputString(exec, value, data);
3584         value = exec->inputStack[exec->index].value;
3585         data = exec->inputStack[exec->index].data;
3586 #ifdef DEBUG_PUSH
3587         printf("value loaded: %s\n", value);
3588 #endif
3589     }
3590 
3591     while ((exec->status == 0) &&
3592            ((value != NULL) ||
3593             ((final == 1) &&
3594              (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
3595 
3596         /*
3597          * End of input on non-terminal state, rollback, however we may
3598          * still have epsilon like transition for counted transitions
3599          * on counters, in that case don't break too early.
3600          */
3601         if ((value == NULL) && (exec->counts == NULL))
3602             goto rollback;
3603 
3604         exec->transcount = 0;
3605         for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3606             trans = &exec->state->trans[exec->transno];
3607             if (trans->to < 0)
3608                 continue;
3609             atom = trans->atom;
3610             ret = 0;
3611             if (trans->count == REGEXP_ALL_LAX_COUNTER) {
3612                 int i;
3613                 int count;
3614                 xmlRegTransPtr t;
3615                 xmlRegCounterPtr counter;
3616 
3617                 ret = 0;
3618 
3619 #ifdef DEBUG_PUSH
3620                 printf("testing all lax %d\n", trans->count);
3621 #endif
3622                 /*
3623                  * Check all counted transitions from the current state
3624                  */
3625                 if ((value == NULL) && (final)) {
3626                     ret = 1;
3627                 } else if (value != NULL) {
3628                     for (i = 0;i < exec->state->nbTrans;i++) {
3629                         t = &exec->state->trans[i];
3630                         if ((t->counter < 0) || (t == trans))
3631                             continue;
3632                         counter = &exec->comp->counters[t->counter];
3633                         count = exec->counts[t->counter];
3634                         if ((count < counter->max) &&
3635                             (t->atom != NULL) &&
3636                             (xmlStrEqual(value, t->atom->valuep))) {
3637                             ret = 0;
3638                             break;
3639                         }
3640                         if ((count >= counter->min) &&
3641                             (count < counter->max) &&
3642                             (t->atom != NULL) &&
3643                             (xmlStrEqual(value, t->atom->valuep))) {
3644                             ret = 1;
3645                             break;
3646                         }
3647                     }
3648                 }
3649             } else if (trans->count == REGEXP_ALL_COUNTER) {
3650                 int i;
3651                 int count;
3652                 xmlRegTransPtr t;
3653                 xmlRegCounterPtr counter;
3654 
3655                 ret = 1;
3656 
3657 #ifdef DEBUG_PUSH
3658                 printf("testing all %d\n", trans->count);
3659 #endif
3660                 /*
3661                  * Check all counted transitions from the current state
3662                  */
3663                 for (i = 0;i < exec->state->nbTrans;i++) {
3664                     t = &exec->state->trans[i];
3665                     if ((t->counter < 0) || (t == trans))
3666                         continue;
3667                     counter = &exec->comp->counters[t->counter];
3668                     count = exec->counts[t->counter];
3669                     if ((count < counter->min) || (count > counter->max)) {
3670                         ret = 0;
3671                         break;
3672                     }
3673                 }
3674             } else if (trans->count >= 0) {
3675                 int count;
3676                 xmlRegCounterPtr counter;
3677 
3678                 /*
3679                  * A counted transition.
3680                  */
3681 
3682                 count = exec->counts[trans->count];
3683                 counter = &exec->comp->counters[trans->count];
3684 #ifdef DEBUG_PUSH
3685                 printf("testing count %d: val %d, min %d, max %d\n",
3686                        trans->count, count, counter->min,  counter->max);
3687 #endif
3688                 ret = ((count >= counter->min) && (count <= counter->max));
3689             } else if (atom == NULL) {
3690                 fprintf(stderr, "epsilon transition left at runtime\n");
3691                 exec->status = -2;
3692                 break;
3693             } else if (value != NULL) {
3694                 ret = xmlRegStrEqualWildcard(atom->valuep, value);
3695                 if (atom->neg) {
3696                     ret = !ret;
3697                     if (!compound)
3698                         ret = 0;
3699                 }
3700                 if ((ret == 1) && (trans->counter >= 0)) {
3701                     xmlRegCounterPtr counter;
3702                     int count;
3703 
3704                     count = exec->counts[trans->counter];
3705                     counter = &exec->comp->counters[trans->counter];
3706                     if (count >= counter->max)
3707                         ret = 0;
3708                 }
3709 
3710                 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
3711                     xmlRegStatePtr to = exec->comp->states[trans->to];
3712 
3713                     /*
3714                      * this is a multiple input sequence
3715                      */
3716                     if (exec->state->nbTrans > exec->transno + 1) {
3717                         if (exec->inputStackNr <= 0) {
3718                             xmlFARegExecSaveInputString(exec, value, data);
3719                         }
3720                         xmlFARegExecSave(exec);
3721                     }
3722                     exec->transcount = 1;
3723                     do {
3724                         /*
3725                          * Try to progress as much as possible on the input
3726                          */
3727                         if (exec->transcount == atom->max) {
3728                             break;
3729                         }
3730                         exec->index++;
3731                         value = exec->inputStack[exec->index].value;
3732                         data = exec->inputStack[exec->index].data;
3733 #ifdef DEBUG_PUSH
3734                         printf("value loaded: %s\n", value);
3735 #endif
3736 
3737                         /*
3738                          * End of input: stop here
3739                          */
3740                         if (value == NULL) {
3741                             exec->index --;
3742                             break;
3743                         }
3744                         if (exec->transcount >= atom->min) {
3745                             int transno = exec->transno;
3746                             xmlRegStatePtr state = exec->state;
3747 
3748                             /*
3749                              * The transition is acceptable save it
3750                              */
3751                             exec->transno = -1; /* trick */
3752                             exec->state = to;
3753                             if (exec->inputStackNr <= 0) {
3754                                 xmlFARegExecSaveInputString(exec, value, data);
3755                             }
3756                             xmlFARegExecSave(exec);
3757                             exec->transno = transno;
3758                             exec->state = state;
3759                         }
3760                         ret = xmlStrEqual(value, atom->valuep);
3761                         exec->transcount++;
3762                     } while (ret == 1);
3763                     if (exec->transcount < atom->min)
3764                         ret = 0;
3765 
3766                     /*
3767                      * If the last check failed but one transition was found
3768                      * possible, rollback
3769                      */
3770                     if (ret < 0)
3771                         ret = 0;
3772                     if (ret == 0) {
3773                         goto rollback;
3774                     }
3775                 }
3776             }
3777             if (ret == 1) {
3778                 if ((exec->callback != NULL) && (atom != NULL) &&
3779                         (data != NULL)) {
3780                     exec->callback(exec->data, atom->valuep,
3781                                    atom->data, data);
3782                 }
3783                 if (exec->state->nbTrans > exec->transno + 1) {
3784                     if (exec->inputStackNr <= 0) {
3785                         xmlFARegExecSaveInputString(exec, value, data);
3786                     }
3787                     xmlFARegExecSave(exec);
3788                 }
3789                 if (trans->counter >= 0) {
3790 #ifdef DEBUG_PUSH
3791                     printf("Increasing count %d\n", trans->counter);
3792 #endif
3793                     exec->counts[trans->counter]++;
3794                 }
3795                 if ((trans->count >= 0) &&
3796                     (trans->count < REGEXP_ALL_COUNTER)) {
3797 #ifdef DEBUG_REGEXP_EXEC
3798                     printf("resetting count %d on transition\n",
3799                            trans->count);
3800 #endif
3801                     exec->counts[trans->count] = 0;
3802                 }
3803 #ifdef DEBUG_PUSH
3804                 printf("entering state %d\n", trans->to);
3805 #endif
3806                 if ((exec->comp->states[trans->to] != NULL) &&
3807                     (exec->comp->states[trans->to]->type ==
3808                      XML_REGEXP_SINK_STATE)) {
3809                     /*
3810                      * entering a sink state, save the current state as error
3811                      * state.
3812                      */
3813                     if (exec->errString != NULL)
3814                         xmlFree(exec->errString);
3815                     exec->errString = xmlStrdup(value);
3816                     exec->errState = exec->state;
3817                     memcpy(exec->errCounts, exec->counts,
3818                            exec->comp->nbCounters * sizeof(int));
3819                 }
3820                 exec->state = exec->comp->states[trans->to];
3821                 exec->transno = 0;
3822                 if (trans->atom != NULL) {
3823                     if (exec->inputStack != NULL) {
3824                         exec->index++;
3825                         if (exec->index < exec->inputStackNr) {
3826                             value = exec->inputStack[exec->index].value;
3827                             data = exec->inputStack[exec->index].data;
3828 #ifdef DEBUG_PUSH
3829                             printf("value loaded: %s\n", value);
3830 #endif
3831                         } else {
3832                             value = NULL;
3833                             data = NULL;
3834 #ifdef DEBUG_PUSH
3835                             printf("end of input\n");
3836 #endif
3837                         }
3838                     } else {
3839                         value = NULL;
3840                         data = NULL;
3841 #ifdef DEBUG_PUSH
3842                         printf("end of input\n");
3843 #endif
3844                     }
3845                 }
3846                 goto progress;
3847             } else if (ret < 0) {
3848                 exec->status = -4;
3849                 break;
3850             }
3851         }
3852         if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
3853 rollback:
3854             /*
3855              * if we didn't yet rollback on the current input
3856              * store the current state as the error state.
3857              */
3858             if ((progress) && (exec->state != NULL) &&
3859                 (exec->state->type != XML_REGEXP_SINK_STATE)) {
3860                 progress = 0;
3861                 if (exec->errString != NULL)
3862                     xmlFree(exec->errString);
3863                 exec->errString = xmlStrdup(value);
3864                 exec->errState = exec->state;
3865                 memcpy(exec->errCounts, exec->counts,
3866                        exec->comp->nbCounters * sizeof(int));
3867             }
3868 
3869             /*
3870              * Failed to find a way out
3871              */
3872             exec->determinist = 0;
3873             xmlFARegExecRollBack(exec);
3874             if (exec->status == 0) {
3875                 value = exec->inputStack[exec->index].value;
3876                 data = exec->inputStack[exec->index].data;
3877 #ifdef DEBUG_PUSH
3878                 printf("value loaded: %s\n", value);
3879 #endif
3880             }
3881         }
3882         continue;
3883 progress:
3884         progress = 1;
3885         continue;
3886     }
3887     if (exec->status == 0) {
3888         return(exec->state->type == XML_REGEXP_FINAL_STATE);
3889     }
3890 #ifdef DEBUG_ERR
3891     if (exec->status < 0) {
3892         testerr(exec);
3893     }
3894 #endif
3895     return(exec->status);
3896 }
3897 
3898 /**
3899  * xmlRegExecPushString:
3900  * @exec: a regexp execution context or NULL to indicate the end
3901  * @value: a string token input
3902  * @data: data associated to the token to reuse in callbacks
3903  *
3904  * Push one input token in the execution context
3905  *
3906  * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3907  *     a negative value in case of error.
3908  */
3909 int
xmlRegExecPushString(xmlRegExecCtxtPtr exec,const xmlChar * value,void * data)3910 xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value,
3911                      void *data) {
3912     return(xmlRegExecPushStringInternal(exec, value, data, 0));
3913 }
3914 
3915 /**
3916  * xmlRegExecPushString2:
3917  * @exec: a regexp execution context or NULL to indicate the end
3918  * @value: the first string token input
3919  * @value2: the second string token input
3920  * @data: data associated to the token to reuse in callbacks
3921  *
3922  * Push one input token in the execution context
3923  *
3924  * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3925  *     a negative value in case of error.
3926  */
3927 int
xmlRegExecPushString2(xmlRegExecCtxtPtr exec,const xmlChar * value,const xmlChar * value2,void * data)3928 xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value,
3929                       const xmlChar *value2, void *data) {
3930     xmlChar buf[150];
3931     int lenn, lenp, ret;
3932     xmlChar *str;
3933 
3934     if (exec == NULL)
3935         return(-1);
3936     if (exec->comp == NULL)
3937         return(-1);
3938     if (exec->status != 0)
3939         return(exec->status);
3940 
3941     if (value2 == NULL)
3942         return(xmlRegExecPushString(exec, value, data));
3943 
3944     lenn = strlen((char *) value2);
3945     lenp = strlen((char *) value);
3946 
3947     if (150 < lenn + lenp + 2) {
3948         str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
3949         if (str == NULL) {
3950             exec->status = -1;
3951             return(-1);
3952         }
3953     } else {
3954         str = buf;
3955     }
3956     memcpy(&str[0], value, lenp);
3957     str[lenp] = XML_REG_STRING_SEPARATOR;
3958     memcpy(&str[lenp + 1], value2, lenn);
3959     str[lenn + lenp + 1] = 0;
3960 
3961     if (exec->comp->compact != NULL)
3962         ret = xmlRegCompactPushString(exec, exec->comp, str, data);
3963     else
3964         ret = xmlRegExecPushStringInternal(exec, str, data, 1);
3965 
3966     if (str != buf)
3967         xmlFree(str);
3968     return(ret);
3969 }
3970 
3971 /**
3972  * xmlRegExecGetValues:
3973  * @exec: a regexp execution context
3974  * @err: error extraction or normal one
3975  * @nbval: pointer to the number of accepted values IN/OUT
3976  * @nbneg: return number of negative transitions
3977  * @values: pointer to the array of acceptable values
3978  * @terminal: return value if this was a terminal state
3979  *
3980  * Extract informations from the regexp execution, internal routine to
3981  * implement xmlRegExecNextValues() and xmlRegExecErrInfo()
3982  *
3983  * Returns: 0 in case of success or -1 in case of error.
3984  */
3985 static int
xmlRegExecGetValues(xmlRegExecCtxtPtr exec,int err,int * nbval,int * nbneg,xmlChar ** values,int * terminal)3986 xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err,
3987                     int *nbval, int *nbneg,
3988                     xmlChar **values, int *terminal) {
3989     int maxval;
3990     int nb = 0;
3991 
3992     if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) ||
3993         (values == NULL) || (*nbval <= 0))
3994         return(-1);
3995 
3996     maxval = *nbval;
3997     *nbval = 0;
3998     *nbneg = 0;
3999     if ((exec->comp != NULL) && (exec->comp->compact != NULL)) {
4000         xmlRegexpPtr comp;
4001         int target, i, state;
4002 
4003         comp = exec->comp;
4004 
4005         if (err) {
4006             if (exec->errStateNo == -1) return(-1);
4007             state = exec->errStateNo;
4008         } else {
4009             state = exec->index;
4010         }
4011         if (terminal != NULL) {
4012             if (comp->compact[state * (comp->nbstrings + 1)] ==
4013                 XML_REGEXP_FINAL_STATE)
4014                 *terminal = 1;
4015             else
4016                 *terminal = 0;
4017         }
4018         for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
4019             target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
4020             if ((target > 0) && (target <= comp->nbstates) &&
4021                 (comp->compact[(target - 1) * (comp->nbstrings + 1)] !=
4022                  XML_REGEXP_SINK_STATE)) {
4023                 values[nb++] = comp->stringMap[i];
4024                 (*nbval)++;
4025             }
4026         }
4027         for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
4028             target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
4029             if ((target > 0) && (target <= comp->nbstates) &&
4030                 (comp->compact[(target - 1) * (comp->nbstrings + 1)] ==
4031                  XML_REGEXP_SINK_STATE)) {
4032                 values[nb++] = comp->stringMap[i];
4033                 (*nbneg)++;
4034             }
4035         }
4036     } else {
4037         int transno;
4038         xmlRegTransPtr trans;
4039         xmlRegAtomPtr atom;
4040         xmlRegStatePtr state;
4041 
4042         if (terminal != NULL) {
4043             if (exec->state->type == XML_REGEXP_FINAL_STATE)
4044                 *terminal = 1;
4045             else
4046                 *terminal = 0;
4047         }
4048 
4049         if (err) {
4050             if (exec->errState == NULL) return(-1);
4051             state = exec->errState;
4052         } else {
4053             if (exec->state == NULL) return(-1);
4054             state = exec->state;
4055         }
4056         for (transno = 0;
4057              (transno < state->nbTrans) && (nb < maxval);
4058              transno++) {
4059             trans = &state->trans[transno];
4060             if (trans->to < 0)
4061                 continue;
4062             atom = trans->atom;
4063             if ((atom == NULL) || (atom->valuep == NULL))
4064                 continue;
4065             if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4066                 /* this should not be reached but ... */
4067                 TODO;
4068             } else if (trans->count == REGEXP_ALL_COUNTER) {
4069                 /* this should not be reached but ... */
4070                 TODO;
4071             } else if (trans->counter >= 0) {
4072                 xmlRegCounterPtr counter = NULL;
4073                 int count;
4074 
4075                 if (err)
4076                     count = exec->errCounts[trans->counter];
4077                 else
4078                     count = exec->counts[trans->counter];
4079                 if (exec->comp != NULL)
4080                     counter = &exec->comp->counters[trans->counter];
4081                 if ((counter == NULL) || (count < counter->max)) {
4082                     if (atom->neg)
4083                         values[nb++] = (xmlChar *) atom->valuep2;
4084                     else
4085                         values[nb++] = (xmlChar *) atom->valuep;
4086                     (*nbval)++;
4087                 }
4088             } else {
4089                 if ((exec->comp->states[trans->to] != NULL) &&
4090                     (exec->comp->states[trans->to]->type !=
4091                      XML_REGEXP_SINK_STATE)) {
4092                     if (atom->neg)
4093                         values[nb++] = (xmlChar *) atom->valuep2;
4094                     else
4095                         values[nb++] = (xmlChar *) atom->valuep;
4096                     (*nbval)++;
4097                 }
4098             }
4099         }
4100         for (transno = 0;
4101              (transno < state->nbTrans) && (nb < maxval);
4102              transno++) {
4103             trans = &state->trans[transno];
4104             if (trans->to < 0)
4105                 continue;
4106             atom = trans->atom;
4107             if ((atom == NULL) || (atom->valuep == NULL))
4108                 continue;
4109             if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4110                 continue;
4111             } else if (trans->count == REGEXP_ALL_COUNTER) {
4112                 continue;
4113             } else if (trans->counter >= 0) {
4114                 continue;
4115             } else {
4116                 if ((exec->comp->states[trans->to] != NULL) &&
4117                     (exec->comp->states[trans->to]->type ==
4118                      XML_REGEXP_SINK_STATE)) {
4119                     if (atom->neg)
4120                         values[nb++] = (xmlChar *) atom->valuep2;
4121                     else
4122                         values[nb++] = (xmlChar *) atom->valuep;
4123                     (*nbneg)++;
4124                 }
4125             }
4126         }
4127     }
4128     return(0);
4129 }
4130 
4131 /**
4132  * xmlRegExecNextValues:
4133  * @exec: a regexp execution context
4134  * @nbval: pointer to the number of accepted values IN/OUT
4135  * @nbneg: return number of negative transitions
4136  * @values: pointer to the array of acceptable values
4137  * @terminal: return value if this was a terminal state
4138  *
4139  * Extract informations from the regexp execution,
4140  * the parameter @values must point to an array of @nbval string pointers
4141  * on return nbval will contain the number of possible strings in that
4142  * state and the @values array will be updated with them. The string values
4143  * returned will be freed with the @exec context and don't need to be
4144  * deallocated.
4145  *
4146  * Returns: 0 in case of success or -1 in case of error.
4147  */
4148 int
xmlRegExecNextValues(xmlRegExecCtxtPtr exec,int * nbval,int * nbneg,xmlChar ** values,int * terminal)4149 xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg,
4150                      xmlChar **values, int *terminal) {
4151     return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal));
4152 }
4153 
4154 /**
4155  * xmlRegExecErrInfo:
4156  * @exec: a regexp execution context generating an error
4157  * @string: return value for the error string
4158  * @nbval: pointer to the number of accepted values IN/OUT
4159  * @nbneg: return number of negative transitions
4160  * @values: pointer to the array of acceptable values
4161  * @terminal: return value if this was a terminal state
4162  *
4163  * Extract error informations from the regexp execution, the parameter
4164  * @string will be updated with the value pushed and not accepted,
4165  * the parameter @values must point to an array of @nbval string pointers
4166  * on return nbval will contain the number of possible strings in that
4167  * state and the @values array will be updated with them. The string values
4168  * returned will be freed with the @exec context and don't need to be
4169  * deallocated.
4170  *
4171  * Returns: 0 in case of success or -1 in case of error.
4172  */
4173 int
xmlRegExecErrInfo(xmlRegExecCtxtPtr exec,const xmlChar ** string,int * nbval,int * nbneg,xmlChar ** values,int * terminal)4174 xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string,
4175                   int *nbval, int *nbneg, xmlChar **values, int *terminal) {
4176     if (exec == NULL)
4177         return(-1);
4178     if (string != NULL) {
4179         if (exec->status != 0)
4180             *string = exec->errString;
4181         else
4182             *string = NULL;
4183     }
4184     return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal));
4185 }
4186 
4187 #ifdef DEBUG_ERR
testerr(xmlRegExecCtxtPtr exec)4188 static void testerr(xmlRegExecCtxtPtr exec) {
4189     const xmlChar *string;
4190     xmlChar *values[5];
4191     int nb = 5;
4192     int nbneg;
4193     int terminal;
4194     xmlRegExecErrInfo(exec, &string, &nb, &nbneg, &values[0], &terminal);
4195 }
4196 #endif
4197 
4198 #if 0
4199 static int
4200 xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) {
4201     xmlRegTransPtr trans;
4202     xmlRegAtomPtr atom;
4203     int ret;
4204     int codepoint, len;
4205 
4206     if (exec == NULL)
4207         return(-1);
4208     if (exec->status != 0)
4209         return(exec->status);
4210 
4211     while ((exec->status == 0) &&
4212            ((exec->inputString[exec->index] != 0) ||
4213             (exec->state->type != XML_REGEXP_FINAL_STATE))) {
4214 
4215         /*
4216          * End of input on non-terminal state, rollback, however we may
4217          * still have epsilon like transition for counted transitions
4218          * on counters, in that case don't break too early.
4219          */
4220         if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
4221             goto rollback;
4222 
4223         exec->transcount = 0;
4224         for (;exec->transno < exec->state->nbTrans;exec->transno++) {
4225             trans = &exec->state->trans[exec->transno];
4226             if (trans->to < 0)
4227                 continue;
4228             atom = trans->atom;
4229             ret = 0;
4230             if (trans->count >= 0) {
4231                 int count;
4232                 xmlRegCounterPtr counter;
4233 
4234                 /*
4235                  * A counted transition.
4236                  */
4237 
4238                 count = exec->counts[trans->count];
4239                 counter = &exec->comp->counters[trans->count];
4240 #ifdef DEBUG_REGEXP_EXEC
4241                 printf("testing count %d: val %d, min %d, max %d\n",
4242                        trans->count, count, counter->min,  counter->max);
4243 #endif
4244                 ret = ((count >= counter->min) && (count <= counter->max));
4245             } else if (atom == NULL) {
4246                 fprintf(stderr, "epsilon transition left at runtime\n");
4247                 exec->status = -2;
4248                 break;
4249             } else if (exec->inputString[exec->index] != 0) {
4250                 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
4251                 ret = xmlRegCheckCharacter(atom, codepoint);
4252                 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
4253                     xmlRegStatePtr to = exec->comp->states[trans->to];
4254 
4255                     /*
4256                      * this is a multiple input sequence
4257                      */
4258                     if (exec->state->nbTrans > exec->transno + 1) {
4259                         xmlFARegExecSave(exec);
4260                     }
4261                     exec->transcount = 1;
4262                     do {
4263                         /*
4264                          * Try to progress as much as possible on the input
4265                          */
4266                         if (exec->transcount == atom->max) {
4267                             break;
4268                         }
4269                         exec->index += len;
4270                         /*
4271                          * End of input: stop here
4272                          */
4273                         if (exec->inputString[exec->index] == 0) {
4274                             exec->index -= len;
4275                             break;
4276                         }
4277                         if (exec->transcount >= atom->min) {
4278                             int transno = exec->transno;
4279                             xmlRegStatePtr state = exec->state;
4280 
4281                             /*
4282                              * The transition is acceptable save it
4283                              */
4284                             exec->transno = -1; /* trick */
4285                             exec->state = to;
4286                             xmlFARegExecSave(exec);
4287                             exec->transno = transno;
4288                             exec->state = state;
4289                         }
4290                         codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
4291                                               len);
4292                         ret = xmlRegCheckCharacter(atom, codepoint);
4293                         exec->transcount++;
4294                     } while (ret == 1);
4295                     if (exec->transcount < atom->min)
4296                         ret = 0;
4297 
4298                     /*
4299                      * If the last check failed but one transition was found
4300                      * possible, rollback
4301                      */
4302                     if (ret < 0)
4303                         ret = 0;
4304                     if (ret == 0) {
4305                         goto rollback;
4306                     }
4307                 }
4308             }
4309             if (ret == 1) {
4310                 if (exec->state->nbTrans > exec->transno + 1) {
4311                     xmlFARegExecSave(exec);
4312                 }
4313                 /*
4314                  * restart count for expressions like this ((abc){2})*
4315                  */
4316                 if (trans->count >= 0) {
4317 #ifdef DEBUG_REGEXP_EXEC
4318                     printf("Reset count %d\n", trans->count);
4319 #endif
4320                     exec->counts[trans->count] = 0;
4321                 }
4322                 if (trans->counter >= 0) {
4323 #ifdef DEBUG_REGEXP_EXEC
4324                     printf("Increasing count %d\n", trans->counter);
4325 #endif
4326                     exec->counts[trans->counter]++;
4327                 }
4328 #ifdef DEBUG_REGEXP_EXEC
4329                 printf("entering state %d\n", trans->to);
4330 #endif
4331                 exec->state = exec->comp->states[trans->to];
4332                 exec->transno = 0;
4333                 if (trans->atom != NULL) {
4334                     exec->index += len;
4335                 }
4336                 goto progress;
4337             } else if (ret < 0) {
4338                 exec->status = -4;
4339                 break;
4340             }
4341         }
4342         if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
4343 rollback:
4344             /*
4345              * Failed to find a way out
4346              */
4347             exec->determinist = 0;
4348             xmlFARegExecRollBack(exec);
4349         }
4350 progress:
4351         continue;
4352     }
4353 }
4354 #endif
4355 /************************************************************************
4356  *                                                                      *
4357  *      Parser for the Schemas Datatype Regular Expressions             *
4358  *      http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs      *
4359  *                                                                      *
4360  ************************************************************************/
4361 
4362 /**
4363  * xmlFAIsChar:
4364  * @ctxt:  a regexp parser context
4365  *
4366  * [10]   Char   ::=   [^.\?*+()|#x5B#x5D]
4367  */
4368 static int
xmlFAIsChar(xmlRegParserCtxtPtr ctxt)4369 xmlFAIsChar(xmlRegParserCtxtPtr ctxt) {
4370     int cur;
4371     int len;
4372 
4373     cur = CUR_SCHAR(ctxt->cur, len);
4374     if ((cur == '.') || (cur == '\\') || (cur == '?') ||
4375         (cur == '*') || (cur == '+') || (cur == '(') ||
4376         (cur == ')') || (cur == '|') || (cur == 0x5B) ||
4377         (cur == 0x5D) || (cur == 0))
4378         return(-1);
4379     return(cur);
4380 }
4381 
4382 /**
4383  * xmlFAParseCharProp:
4384  * @ctxt:  a regexp parser context
4385  *
4386  * [27]   charProp   ::=   IsCategory | IsBlock
4387  * [28]   IsCategory ::= Letters | Marks | Numbers | Punctuation |
4388  *                       Separators | Symbols | Others
4389  * [29]   Letters   ::=   'L' [ultmo]?
4390  * [30]   Marks   ::=   'M' [nce]?
4391  * [31]   Numbers   ::=   'N' [dlo]?
4392  * [32]   Punctuation   ::=   'P' [cdseifo]?
4393  * [33]   Separators   ::=   'Z' [slp]?
4394  * [34]   Symbols   ::=   'S' [mcko]?
4395  * [35]   Others   ::=   'C' [cfon]?
4396  * [36]   IsBlock   ::=   'Is' [a-zA-Z0-9#x2D]+
4397  */
4398 static void
xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt)4399 xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) {
4400     int cur;
4401     xmlRegAtomType type = (xmlRegAtomType) 0;
4402     xmlChar *blockName = NULL;
4403 
4404     cur = CUR;
4405     if (cur == 'L') {
4406         NEXT;
4407         cur = CUR;
4408         if (cur == 'u') {
4409             NEXT;
4410             type = XML_REGEXP_LETTER_UPPERCASE;
4411         } else if (cur == 'l') {
4412             NEXT;
4413             type = XML_REGEXP_LETTER_LOWERCASE;
4414         } else if (cur == 't') {
4415             NEXT;
4416             type = XML_REGEXP_LETTER_TITLECASE;
4417         } else if (cur == 'm') {
4418             NEXT;
4419             type = XML_REGEXP_LETTER_MODIFIER;
4420         } else if (cur == 'o') {
4421             NEXT;
4422             type = XML_REGEXP_LETTER_OTHERS;
4423         } else {
4424             type = XML_REGEXP_LETTER;
4425         }
4426     } else if (cur == 'M') {
4427         NEXT;
4428         cur = CUR;
4429         if (cur == 'n') {
4430             NEXT;
4431             /* nonspacing */
4432             type = XML_REGEXP_MARK_NONSPACING;
4433         } else if (cur == 'c') {
4434             NEXT;
4435             /* spacing combining */
4436             type = XML_REGEXP_MARK_SPACECOMBINING;
4437         } else if (cur == 'e') {
4438             NEXT;
4439             /* enclosing */
4440             type = XML_REGEXP_MARK_ENCLOSING;
4441         } else {
4442             /* all marks */
4443             type = XML_REGEXP_MARK;
4444         }
4445     } else if (cur == 'N') {
4446         NEXT;
4447         cur = CUR;
4448         if (cur == 'd') {
4449             NEXT;
4450             /* digital */
4451             type = XML_REGEXP_NUMBER_DECIMAL;
4452         } else if (cur == 'l') {
4453             NEXT;
4454             /* letter */
4455             type = XML_REGEXP_NUMBER_LETTER;
4456         } else if (cur == 'o') {
4457             NEXT;
4458             /* other */
4459             type = XML_REGEXP_NUMBER_OTHERS;
4460         } else {
4461             /* all numbers */
4462             type = XML_REGEXP_NUMBER;
4463         }
4464     } else if (cur == 'P') {
4465         NEXT;
4466         cur = CUR;
4467         if (cur == 'c') {
4468             NEXT;
4469             /* connector */
4470             type = XML_REGEXP_PUNCT_CONNECTOR;
4471         } else if (cur == 'd') {
4472             NEXT;
4473             /* dash */
4474             type = XML_REGEXP_PUNCT_DASH;
4475         } else if (cur == 's') {
4476             NEXT;
4477             /* open */
4478             type = XML_REGEXP_PUNCT_OPEN;
4479         } else if (cur == 'e') {
4480             NEXT;
4481             /* close */
4482             type = XML_REGEXP_PUNCT_CLOSE;
4483         } else if (cur == 'i') {
4484             NEXT;
4485             /* initial quote */
4486             type = XML_REGEXP_PUNCT_INITQUOTE;
4487         } else if (cur == 'f') {
4488             NEXT;
4489             /* final quote */
4490             type = XML_REGEXP_PUNCT_FINQUOTE;
4491         } else if (cur == 'o') {
4492             NEXT;
4493             /* other */
4494             type = XML_REGEXP_PUNCT_OTHERS;
4495         } else {
4496             /* all punctuation */
4497             type = XML_REGEXP_PUNCT;
4498         }
4499     } else if (cur == 'Z') {
4500         NEXT;
4501         cur = CUR;
4502         if (cur == 's') {
4503             NEXT;
4504             /* space */
4505             type = XML_REGEXP_SEPAR_SPACE;
4506         } else if (cur == 'l') {
4507             NEXT;
4508             /* line */
4509             type = XML_REGEXP_SEPAR_LINE;
4510         } else if (cur == 'p') {
4511             NEXT;
4512             /* paragraph */
4513             type = XML_REGEXP_SEPAR_PARA;
4514         } else {
4515             /* all separators */
4516             type = XML_REGEXP_SEPAR;
4517         }
4518     } else if (cur == 'S') {
4519         NEXT;
4520         cur = CUR;
4521         if (cur == 'm') {
4522             NEXT;
4523             type = XML_REGEXP_SYMBOL_MATH;
4524             /* math */
4525         } else if (cur == 'c') {
4526             NEXT;
4527             type = XML_REGEXP_SYMBOL_CURRENCY;
4528             /* currency */
4529         } else if (cur == 'k') {
4530             NEXT;
4531             type = XML_REGEXP_SYMBOL_MODIFIER;
4532             /* modifiers */
4533         } else if (cur == 'o') {
4534             NEXT;
4535             type = XML_REGEXP_SYMBOL_OTHERS;
4536             /* other */
4537         } else {
4538             /* all symbols */
4539             type = XML_REGEXP_SYMBOL;
4540         }
4541     } else if (cur == 'C') {
4542         NEXT;
4543         cur = CUR;
4544         if (cur == 'c') {
4545             NEXT;
4546             /* control */
4547             type = XML_REGEXP_OTHER_CONTROL;
4548         } else if (cur == 'f') {
4549             NEXT;
4550             /* format */
4551             type = XML_REGEXP_OTHER_FORMAT;
4552         } else if (cur == 'o') {
4553             NEXT;
4554             /* private use */
4555             type = XML_REGEXP_OTHER_PRIVATE;
4556         } else if (cur == 'n') {
4557             NEXT;
4558             /* not assigned */
4559             type = XML_REGEXP_OTHER_NA;
4560         } else {
4561             /* all others */
4562             type = XML_REGEXP_OTHER;
4563         }
4564     } else if (cur == 'I') {
4565         const xmlChar *start;
4566         NEXT;
4567         cur = CUR;
4568         if (cur != 's') {
4569             ERROR("IsXXXX expected");
4570             return;
4571         }
4572         NEXT;
4573         start = ctxt->cur;
4574         cur = CUR;
4575         if (((cur >= 'a') && (cur <= 'z')) ||
4576             ((cur >= 'A') && (cur <= 'Z')) ||
4577             ((cur >= '0') && (cur <= '9')) ||
4578             (cur == 0x2D)) {
4579             NEXT;
4580             cur = CUR;
4581             while (((cur >= 'a') && (cur <= 'z')) ||
4582                 ((cur >= 'A') && (cur <= 'Z')) ||
4583                 ((cur >= '0') && (cur <= '9')) ||
4584                 (cur == 0x2D)) {
4585                 NEXT;
4586                 cur = CUR;
4587             }
4588         }
4589         type = XML_REGEXP_BLOCK_NAME;
4590         blockName = xmlStrndup(start, ctxt->cur - start);
4591     } else {
4592         ERROR("Unknown char property");
4593         return;
4594     }
4595     if (ctxt->atom == NULL) {
4596         ctxt->atom = xmlRegNewAtom(ctxt, type);
4597         if (ctxt->atom != NULL)
4598             ctxt->atom->valuep = blockName;
4599     } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4600         xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4601                            type, 0, 0, blockName);
4602     }
4603 }
4604 
4605 /**
4606  * xmlFAParseCharClassEsc:
4607  * @ctxt:  a regexp parser context
4608  *
4609  * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc )
4610  * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E]
4611  * [25] catEsc   ::=   '\p{' charProp '}'
4612  * [26] complEsc ::=   '\P{' charProp '}'
4613  * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW])
4614  */
4615 static void
xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt)4616 xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
4617     int cur;
4618 
4619     if (CUR == '.') {
4620         if (ctxt->atom == NULL) {
4621             ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR);
4622         } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4623             xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4624                                XML_REGEXP_ANYCHAR, 0, 0, NULL);
4625         }
4626         NEXT;
4627         return;
4628     }
4629     if (CUR != '\\') {
4630         ERROR("Escaped sequence: expecting \\");
4631         return;
4632     }
4633     NEXT;
4634     cur = CUR;
4635     if (cur == 'p') {
4636         NEXT;
4637         if (CUR != '{') {
4638             ERROR("Expecting '{'");
4639             return;
4640         }
4641         NEXT;
4642         xmlFAParseCharProp(ctxt);
4643         if (CUR != '}') {
4644             ERROR("Expecting '}'");
4645             return;
4646         }
4647         NEXT;
4648     } else if (cur == 'P') {
4649         NEXT;
4650         if (CUR != '{') {
4651             ERROR("Expecting '{'");
4652             return;
4653         }
4654         NEXT;
4655         xmlFAParseCharProp(ctxt);
4656         ctxt->atom->neg = 1;
4657         if (CUR != '}') {
4658             ERROR("Expecting '}'");
4659             return;
4660         }
4661         NEXT;
4662     } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') ||
4663         (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') ||
4664         (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') ||
4665         (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) ||
4666         (cur == 0x5E)) {
4667         if (ctxt->atom == NULL) {
4668             ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
4669             if (ctxt->atom != NULL) {
4670                 switch (cur) {
4671                     case 'n':
4672                         ctxt->atom->codepoint = '\n';
4673                         break;
4674                     case 'r':
4675                         ctxt->atom->codepoint = '\r';
4676                         break;
4677                     case 't':
4678                         ctxt->atom->codepoint = '\t';
4679                         break;
4680                     default:
4681                         ctxt->atom->codepoint = cur;
4682                 }
4683             }
4684         } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4685             xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4686                                XML_REGEXP_CHARVAL, cur, cur, NULL);
4687         }
4688         NEXT;
4689     } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') ||
4690         (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') ||
4691         (cur == 'w') || (cur == 'W')) {
4692         xmlRegAtomType type = XML_REGEXP_ANYSPACE;
4693 
4694         switch (cur) {
4695             case 's':
4696                 type = XML_REGEXP_ANYSPACE;
4697                 break;
4698             case 'S':
4699                 type = XML_REGEXP_NOTSPACE;
4700                 break;
4701             case 'i':
4702                 type = XML_REGEXP_INITNAME;
4703                 break;
4704             case 'I':
4705                 type = XML_REGEXP_NOTINITNAME;
4706                 break;
4707             case 'c':
4708                 type = XML_REGEXP_NAMECHAR;
4709                 break;
4710             case 'C':
4711                 type = XML_REGEXP_NOTNAMECHAR;
4712                 break;
4713             case 'd':
4714                 type = XML_REGEXP_DECIMAL;
4715                 break;
4716             case 'D':
4717                 type = XML_REGEXP_NOTDECIMAL;
4718                 break;
4719             case 'w':
4720                 type = XML_REGEXP_REALCHAR;
4721                 break;
4722             case 'W':
4723                 type = XML_REGEXP_NOTREALCHAR;
4724                 break;
4725         }
4726         NEXT;
4727         if (ctxt->atom == NULL) {
4728             ctxt->atom = xmlRegNewAtom(ctxt, type);
4729         } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4730             xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4731                                type, 0, 0, NULL);
4732         }
4733     }
4734 }
4735 
4736 /**
4737  * xmlFAParseCharRef:
4738  * @ctxt:  a regexp parser context
4739  *
4740  * [19]   XmlCharRef   ::=   ( '&#' [0-9]+ ';' ) | (' &#x' [0-9a-fA-F]+ ';' )
4741  */
4742 static int
xmlFAParseCharRef(xmlRegParserCtxtPtr ctxt)4743 xmlFAParseCharRef(xmlRegParserCtxtPtr ctxt) {
4744     int ret = 0, cur;
4745 
4746     if ((CUR != '&') || (NXT(1) != '#'))
4747         return(-1);
4748     NEXT;
4749     NEXT;
4750     cur = CUR;
4751     if (cur == 'x') {
4752         NEXT;
4753         cur = CUR;
4754         if (((cur >= '0') && (cur <= '9')) ||
4755             ((cur >= 'a') && (cur <= 'f')) ||
4756             ((cur >= 'A') && (cur <= 'F'))) {
4757             while (((cur >= '0') && (cur <= '9')) ||
4758                    ((cur >= 'a') && (cur <= 'f')) ||
4759                    ((cur >= 'A') && (cur <= 'F'))) {
4760                 if ((cur >= '0') && (cur <= '9'))
4761                     ret = ret * 16 + cur - '0';
4762                 else if ((cur >= 'a') && (cur <= 'f'))
4763                     ret = ret * 16 + 10 + (cur - 'a');
4764                 else
4765                     ret = ret * 16 + 10 + (cur - 'A');
4766                 NEXT;
4767                 cur = CUR;
4768             }
4769         } else {
4770             ERROR("Char ref: expecting [0-9A-F]");
4771             return(-1);
4772         }
4773     } else {
4774         if ((cur >= '0') && (cur <= '9')) {
4775             while ((cur >= '0') && (cur <= '9')) {
4776                 ret = ret * 10 + cur - '0';
4777                 NEXT;
4778                 cur = CUR;
4779             }
4780         } else {
4781             ERROR("Char ref: expecting [0-9]");
4782             return(-1);
4783         }
4784     }
4785     if (cur != ';') {
4786         ERROR("Char ref: expecting ';'");
4787         return(-1);
4788     } else {
4789         NEXT;
4790     }
4791     return(ret);
4792 }
4793 
4794 /**
4795  * xmlFAParseCharRange:
4796  * @ctxt:  a regexp parser context
4797  *
4798  * [17]   charRange   ::=     seRange | XmlCharRef | XmlCharIncDash
4799  * [18]   seRange   ::=   charOrEsc '-' charOrEsc
4800  * [20]   charOrEsc   ::=   XmlChar | SingleCharEsc
4801  * [21]   XmlChar   ::=   [^\#x2D#x5B#x5D]
4802  * [22]   XmlCharIncDash   ::=   [^\#x5B#x5D]
4803  */
4804 static void
xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt)4805 xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) {
4806     int cur, len;
4807     int start = -1;
4808     int end = -1;
4809 
4810     if (CUR == '\0') {
4811         ERROR("Expecting ']'");
4812         return;
4813     }
4814 
4815     if ((CUR == '&') && (NXT(1) == '#')) {
4816         end = start = xmlFAParseCharRef(ctxt);
4817         xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4818                            XML_REGEXP_CHARVAL, start, end, NULL);
4819         return;
4820     }
4821     cur = CUR;
4822     if (cur == '\\') {
4823         NEXT;
4824         cur = CUR;
4825         switch (cur) {
4826             case 'n': start = 0xA; break;
4827             case 'r': start = 0xD; break;
4828             case 't': start = 0x9; break;
4829             case '\\': case '|': case '.': case '-': case '^': case '?':
4830             case '*': case '+': case '{': case '}': case '(': case ')':
4831             case '[': case ']':
4832                 start = cur; break;
4833             default:
4834                 ERROR("Invalid escape value");
4835                 return;
4836         }
4837         end = start;
4838         len = 1;
4839     } else if ((cur != 0x5B) && (cur != 0x5D)) {
4840         end = start = CUR_SCHAR(ctxt->cur, len);
4841     } else {
4842         ERROR("Expecting a char range");
4843         return;
4844     }
4845     NEXTL(len);
4846     if (start == '-') {
4847         return;
4848     }
4849     cur = CUR;
4850     if ((cur != '-') || (NXT(1) == ']')) {
4851         xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4852                               XML_REGEXP_CHARVAL, start, end, NULL);
4853         return;
4854     }
4855     NEXT;
4856     cur = CUR;
4857     if (cur == '\\') {
4858         NEXT;
4859         cur = CUR;
4860         switch (cur) {
4861             case 'n': end = 0xA; break;
4862             case 'r': end = 0xD; break;
4863             case 't': end = 0x9; break;
4864             case '\\': case '|': case '.': case '-': case '^': case '?':
4865             case '*': case '+': case '{': case '}': case '(': case ')':
4866             case '[': case ']':
4867                 end = cur; break;
4868             default:
4869                 ERROR("Invalid escape value");
4870                 return;
4871         }
4872         len = 1;
4873     } else if ((cur != 0x5B) && (cur != 0x5D)) {
4874         end = CUR_SCHAR(ctxt->cur, len);
4875     } else {
4876         ERROR("Expecting the end of a char range");
4877         return;
4878     }
4879     NEXTL(len);
4880     /* TODO check that the values are acceptable character ranges for XML */
4881     if (end < start) {
4882         ERROR("End of range is before start of range");
4883     } else {
4884         xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4885                            XML_REGEXP_CHARVAL, start, end, NULL);
4886     }
4887     return;
4888 }
4889 
4890 /**
4891  * xmlFAParsePosCharGroup:
4892  * @ctxt:  a regexp parser context
4893  *
4894  * [14]   posCharGroup ::= ( charRange | charClassEsc  )+
4895  */
4896 static void
xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt)4897 xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) {
4898     do {
4899         if ((CUR == '\\') || (CUR == '.')) {
4900             xmlFAParseCharClassEsc(ctxt);
4901         } else {
4902             xmlFAParseCharRange(ctxt);
4903         }
4904     } while ((CUR != ']') && (CUR != '^') && (CUR != '-') &&
4905              (CUR != 0) && (ctxt->error == 0));
4906 }
4907 
4908 /**
4909  * xmlFAParseCharGroup:
4910  * @ctxt:  a regexp parser context
4911  *
4912  * [13]   charGroup    ::= posCharGroup | negCharGroup | charClassSub
4913  * [15]   negCharGroup ::= '^' posCharGroup
4914  * [16]   charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr
4915  * [12]   charClassExpr ::= '[' charGroup ']'
4916  */
4917 static void
xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt)4918 xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) {
4919     int n = ctxt->neg;
4920     while ((CUR != ']') && (ctxt->error == 0)) {
4921         if (CUR == '^') {
4922             int neg = ctxt->neg;
4923 
4924             NEXT;
4925             ctxt->neg = !ctxt->neg;
4926             xmlFAParsePosCharGroup(ctxt);
4927             ctxt->neg = neg;
4928         } else if ((CUR == '-') && (NXT(1) == '[')) {
4929             int neg = ctxt->neg;
4930             ctxt->neg = 2;
4931             NEXT;       /* eat the '-' */
4932             NEXT;       /* eat the '[' */
4933             xmlFAParseCharGroup(ctxt);
4934             if (CUR == ']') {
4935                 NEXT;
4936             } else {
4937                 ERROR("charClassExpr: ']' expected");
4938                 break;
4939             }
4940             ctxt->neg = neg;
4941             break;
4942         } else if (CUR != ']') {
4943             xmlFAParsePosCharGroup(ctxt);
4944         }
4945     }
4946     ctxt->neg = n;
4947 }
4948 
4949 /**
4950  * xmlFAParseCharClass:
4951  * @ctxt:  a regexp parser context
4952  *
4953  * [11]   charClass   ::=     charClassEsc | charClassExpr
4954  * [12]   charClassExpr   ::=   '[' charGroup ']'
4955  */
4956 static void
xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt)4957 xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) {
4958     if (CUR == '[') {
4959         NEXT;
4960         ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES);
4961         if (ctxt->atom == NULL)
4962             return;
4963         xmlFAParseCharGroup(ctxt);
4964         if (CUR == ']') {
4965             NEXT;
4966         } else {
4967             ERROR("xmlFAParseCharClass: ']' expected");
4968         }
4969     } else {
4970         xmlFAParseCharClassEsc(ctxt);
4971     }
4972 }
4973 
4974 /**
4975  * xmlFAParseQuantExact:
4976  * @ctxt:  a regexp parser context
4977  *
4978  * [8]   QuantExact   ::=   [0-9]+
4979  *
4980  * Returns 0 if success or -1 in case of error
4981  */
4982 static int
xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt)4983 xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) {
4984     int ret = 0;
4985     int ok = 0;
4986 
4987     while ((CUR >= '0') && (CUR <= '9')) {
4988         ret = ret * 10 + (CUR - '0');
4989         ok = 1;
4990         NEXT;
4991     }
4992     if (ok != 1) {
4993         return(-1);
4994     }
4995     return(ret);
4996 }
4997 
4998 /**
4999  * xmlFAParseQuantifier:
5000  * @ctxt:  a regexp parser context
5001  *
5002  * [4]   quantifier   ::=   [?*+] | ( '{' quantity '}' )
5003  * [5]   quantity   ::=   quantRange | quantMin | QuantExact
5004  * [6]   quantRange   ::=   QuantExact ',' QuantExact
5005  * [7]   quantMin   ::=   QuantExact ','
5006  * [8]   QuantExact   ::=   [0-9]+
5007  */
5008 static int
xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt)5009 xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) {
5010     int cur;
5011 
5012     cur = CUR;
5013     if ((cur == '?') || (cur == '*') || (cur == '+')) {
5014         if (ctxt->atom != NULL) {
5015             if (cur == '?')
5016                 ctxt->atom->quant = XML_REGEXP_QUANT_OPT;
5017             else if (cur == '*')
5018                 ctxt->atom->quant = XML_REGEXP_QUANT_MULT;
5019             else if (cur == '+')
5020                 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS;
5021         }
5022         NEXT;
5023         return(1);
5024     }
5025     if (cur == '{') {
5026         int min = 0, max = 0;
5027 
5028         NEXT;
5029         cur = xmlFAParseQuantExact(ctxt);
5030         if (cur >= 0)
5031             min = cur;
5032         if (CUR == ',') {
5033             NEXT;
5034             if (CUR == '}')
5035                 max = INT_MAX;
5036             else {
5037                 cur = xmlFAParseQuantExact(ctxt);
5038                 if (cur >= 0)
5039                     max = cur;
5040                 else {
5041                     ERROR("Improper quantifier");
5042                 }
5043             }
5044         }
5045         if (CUR == '}') {
5046             NEXT;
5047         } else {
5048             ERROR("Unterminated quantifier");
5049         }
5050         if (max == 0)
5051             max = min;
5052         if (ctxt->atom != NULL) {
5053             ctxt->atom->quant = XML_REGEXP_QUANT_RANGE;
5054             ctxt->atom->min = min;
5055             ctxt->atom->max = max;
5056         }
5057         return(1);
5058     }
5059     return(0);
5060 }
5061 
5062 /**
5063  * xmlFAParseAtom:
5064  * @ctxt:  a regexp parser context
5065  *
5066  * [9]   atom   ::=   Char | charClass | ( '(' regExp ')' )
5067  */
5068 static int
xmlFAParseAtom(xmlRegParserCtxtPtr ctxt)5069 xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) {
5070     int codepoint, len;
5071 
5072     codepoint = xmlFAIsChar(ctxt);
5073     if (codepoint > 0) {
5074         ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
5075         if (ctxt->atom == NULL)
5076             return(-1);
5077         codepoint = CUR_SCHAR(ctxt->cur, len);
5078         ctxt->atom->codepoint = codepoint;
5079         NEXTL(len);
5080         return(1);
5081     } else if (CUR == '|') {
5082         return(0);
5083     } else if (CUR == 0) {
5084         return(0);
5085     } else if (CUR == ')') {
5086         return(0);
5087     } else if (CUR == '(') {
5088         xmlRegStatePtr start, oldend;
5089 
5090         NEXT;
5091         xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5092         start = ctxt->state;
5093         oldend = ctxt->end;
5094         ctxt->end = NULL;
5095         ctxt->atom = NULL;
5096         xmlFAParseRegExp(ctxt, 0);
5097         if (CUR == ')') {
5098             NEXT;
5099         } else {
5100             ERROR("xmlFAParseAtom: expecting ')'");
5101         }
5102         ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG);
5103         if (ctxt->atom == NULL)
5104             return(-1);
5105         ctxt->atom->start = start;
5106         ctxt->atom->stop = ctxt->state;
5107         ctxt->end = oldend;
5108         return(1);
5109     } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) {
5110         xmlFAParseCharClass(ctxt);
5111         return(1);
5112     }
5113     return(0);
5114 }
5115 
5116 /**
5117  * xmlFAParsePiece:
5118  * @ctxt:  a regexp parser context
5119  *
5120  * [3]   piece   ::=   atom quantifier?
5121  */
5122 static int
xmlFAParsePiece(xmlRegParserCtxtPtr ctxt)5123 xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) {
5124     int ret;
5125 
5126     ctxt->atom = NULL;
5127     ret = xmlFAParseAtom(ctxt);
5128     if (ret == 0)
5129         return(0);
5130     if (ctxt->atom == NULL) {
5131         ERROR("internal: no atom generated");
5132     }
5133     xmlFAParseQuantifier(ctxt);
5134     return(1);
5135 }
5136 
5137 /**
5138  * xmlFAParseBranch:
5139  * @ctxt:  a regexp parser context
5140  * @to: optional target to the end of the branch
5141  *
5142  * @to is used to optimize by removing duplicate path in automata
5143  * in expressions like (a|b)(c|d)
5144  *
5145  * [2]   branch   ::=   piece*
5146  */
5147 static int
xmlFAParseBranch(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr to)5148 xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) {
5149     xmlRegStatePtr previous;
5150     int ret;
5151 
5152     previous = ctxt->state;
5153     ret = xmlFAParsePiece(ctxt);
5154     if (ret != 0) {
5155         if (xmlFAGenerateTransitions(ctxt, previous,
5156                 (CUR=='|' || CUR==')') ? to : NULL, ctxt->atom) < 0)
5157             return(-1);
5158         previous = ctxt->state;
5159         ctxt->atom = NULL;
5160     }
5161     while ((ret != 0) && (ctxt->error == 0)) {
5162         ret = xmlFAParsePiece(ctxt);
5163         if (ret != 0) {
5164             if (xmlFAGenerateTransitions(ctxt, previous,
5165                     (CUR=='|' || CUR==')') ? to : NULL, ctxt->atom) < 0)
5166                     return(-1);
5167             previous = ctxt->state;
5168             ctxt->atom = NULL;
5169         }
5170     }
5171     return(0);
5172 }
5173 
5174 /**
5175  * xmlFAParseRegExp:
5176  * @ctxt:  a regexp parser context
5177  * @top:  is this the top-level expression ?
5178  *
5179  * [1]   regExp   ::=     branch  ( '|' branch )*
5180  */
5181 static void
xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt,int top)5182 xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) {
5183     xmlRegStatePtr start, end;
5184 
5185     /* if not top start should have been generated by an epsilon trans */
5186     start = ctxt->state;
5187     ctxt->end = NULL;
5188     xmlFAParseBranch(ctxt, NULL);
5189     if (top) {
5190 #ifdef DEBUG_REGEXP_GRAPH
5191         printf("State %d is final\n", ctxt->state->no);
5192 #endif
5193         ctxt->state->type = XML_REGEXP_FINAL_STATE;
5194     }
5195     if (CUR != '|') {
5196         ctxt->end = ctxt->state;
5197         return;
5198     }
5199     end = ctxt->state;
5200     while ((CUR == '|') && (ctxt->error == 0)) {
5201         NEXT;
5202         ctxt->state = start;
5203         ctxt->end = NULL;
5204         xmlFAParseBranch(ctxt, end);
5205     }
5206     if (!top) {
5207         ctxt->state = end;
5208         ctxt->end = end;
5209     }
5210 }
5211 
5212 /************************************************************************
5213  *                                                                      *
5214  *                      The basic API                                   *
5215  *                                                                      *
5216  ************************************************************************/
5217 
5218 /**
5219  * xmlRegexpPrint:
5220  * @output: the file for the output debug
5221  * @regexp: the compiled regexp
5222  *
5223  * Print the content of the compiled regular expression
5224  */
5225 void
xmlRegexpPrint(FILE * output,xmlRegexpPtr regexp)5226 xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) {
5227     int i;
5228 
5229     if (output == NULL)
5230         return;
5231     fprintf(output, " regexp: ");
5232     if (regexp == NULL) {
5233         fprintf(output, "NULL\n");
5234         return;
5235     }
5236     fprintf(output, "'%s' ", regexp->string);
5237     fprintf(output, "\n");
5238     fprintf(output, "%d atoms:\n", regexp->nbAtoms);
5239     for (i = 0;i < regexp->nbAtoms; i++) {
5240         fprintf(output, " %02d ", i);
5241         xmlRegPrintAtom(output, regexp->atoms[i]);
5242     }
5243     fprintf(output, "%d states:", regexp->nbStates);
5244     fprintf(output, "\n");
5245     for (i = 0;i < regexp->nbStates; i++) {
5246         xmlRegPrintState(output, regexp->states[i]);
5247     }
5248     fprintf(output, "%d counters:\n", regexp->nbCounters);
5249     for (i = 0;i < regexp->nbCounters; i++) {
5250         fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min,
5251                                                 regexp->counters[i].max);
5252     }
5253 }
5254 
5255 /**
5256  * xmlRegexpCompile:
5257  * @regexp:  a regular expression string
5258  *
5259  * Parses a regular expression conforming to XML Schemas Part 2 Datatype
5260  * Appendix F and builds an automata suitable for testing strings against
5261  * that regular expression
5262  *
5263  * Returns the compiled expression or NULL in case of error
5264  */
5265 xmlRegexpPtr
xmlRegexpCompile(const xmlChar * regexp)5266 xmlRegexpCompile(const xmlChar *regexp) {
5267     xmlRegexpPtr ret;
5268     xmlRegParserCtxtPtr ctxt;
5269 
5270     ctxt = xmlRegNewParserCtxt(regexp);
5271     if (ctxt == NULL)
5272         return(NULL);
5273 
5274     /* initialize the parser */
5275     ctxt->end = NULL;
5276     ctxt->start = ctxt->state = xmlRegNewState(ctxt);
5277     xmlRegStatePush(ctxt, ctxt->start);
5278 
5279     /* parse the expression building an automata */
5280     xmlFAParseRegExp(ctxt, 1);
5281     if (CUR != 0) {
5282         ERROR("xmlFAParseRegExp: extra characters");
5283     }
5284     ctxt->end = ctxt->state;
5285     ctxt->start->type = XML_REGEXP_START_STATE;
5286     ctxt->end->type = XML_REGEXP_FINAL_STATE;
5287 
5288     /* remove the Epsilon except for counted transitions */
5289     xmlFAEliminateEpsilonTransitions(ctxt);
5290 
5291 
5292     if (ctxt->error != 0) {
5293         xmlRegFreeParserCtxt(ctxt);
5294         return(NULL);
5295     }
5296     ret = xmlRegEpxFromParse(ctxt);
5297     xmlRegFreeParserCtxt(ctxt);
5298     return(ret);
5299 }
5300 
5301 /**
5302  * xmlRegexpExec:
5303  * @comp:  the compiled regular expression
5304  * @content:  the value to check against the regular expression
5305  *
5306  * Check if the regular expression generates the value
5307  *
5308  * Returns 1 if it matches, 0 if not and a negative value in case of error
5309  */
5310 int
xmlRegexpExec(xmlRegexpPtr comp,const xmlChar * content)5311 xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) {
5312     if ((comp == NULL) || (content == NULL))
5313         return(-1);
5314     return(xmlFARegExec(comp, content));
5315 }
5316 
5317 /**
5318  * xmlRegexpIsDeterminist:
5319  * @comp:  the compiled regular expression
5320  *
5321  * Check if the regular expression is determinist
5322  *
5323  * Returns 1 if it yes, 0 if not and a negative value in case of error
5324  */
5325 int
xmlRegexpIsDeterminist(xmlRegexpPtr comp)5326 xmlRegexpIsDeterminist(xmlRegexpPtr comp) {
5327     xmlAutomataPtr am;
5328     int ret;
5329 
5330     if (comp == NULL)
5331         return(-1);
5332     if (comp->determinist != -1)
5333         return(comp->determinist);
5334 
5335     am = xmlNewAutomata();
5336     if (am->states != NULL) {
5337         int i;
5338 
5339         for (i = 0;i < am->nbStates;i++)
5340             xmlRegFreeState(am->states[i]);
5341         xmlFree(am->states);
5342     }
5343     am->nbAtoms = comp->nbAtoms;
5344     am->atoms = comp->atoms;
5345     am->nbStates = comp->nbStates;
5346     am->states = comp->states;
5347     am->determinist = -1;
5348     ret = xmlFAComputesDeterminism(am);
5349     am->atoms = NULL;
5350     am->states = NULL;
5351     xmlFreeAutomata(am);
5352     return(ret);
5353 }
5354 
5355 /**
5356  * xmlRegFreeRegexp:
5357  * @regexp:  the regexp
5358  *
5359  * Free a regexp
5360  */
5361 void
xmlRegFreeRegexp(xmlRegexpPtr regexp)5362 xmlRegFreeRegexp(xmlRegexpPtr regexp) {
5363     int i;
5364     if (regexp == NULL)
5365         return;
5366 
5367     if (regexp->string != NULL)
5368         xmlFree(regexp->string);
5369     if (regexp->states != NULL) {
5370         for (i = 0;i < regexp->nbStates;i++)
5371             xmlRegFreeState(regexp->states[i]);
5372         xmlFree(regexp->states);
5373     }
5374     if (regexp->atoms != NULL) {
5375         for (i = 0;i < regexp->nbAtoms;i++)
5376             xmlRegFreeAtom(regexp->atoms[i]);
5377         xmlFree(regexp->atoms);
5378     }
5379     if (regexp->counters != NULL)
5380         xmlFree(regexp->counters);
5381     if (regexp->compact != NULL)
5382         xmlFree(regexp->compact);
5383     if (regexp->transdata != NULL)
5384         xmlFree(regexp->transdata);
5385     if (regexp->stringMap != NULL) {
5386         for (i = 0; i < regexp->nbstrings;i++)
5387             xmlFree(regexp->stringMap[i]);
5388         xmlFree(regexp->stringMap);
5389     }
5390 
5391     xmlFree(regexp);
5392 }
5393 
5394 #ifdef LIBXML_AUTOMATA_ENABLED
5395 /************************************************************************
5396  *                                                                      *
5397  *                      The Automata interface                          *
5398  *                                                                      *
5399  ************************************************************************/
5400 
5401 /**
5402  * xmlNewAutomata:
5403  *
5404  * Create a new automata
5405  *
5406  * Returns the new object or NULL in case of failure
5407  */
5408 xmlAutomataPtr
xmlNewAutomata(void)5409 xmlNewAutomata(void) {
5410     xmlAutomataPtr ctxt;
5411 
5412     ctxt = xmlRegNewParserCtxt(NULL);
5413     if (ctxt == NULL)
5414         return(NULL);
5415 
5416     /* initialize the parser */
5417     ctxt->end = NULL;
5418     ctxt->start = ctxt->state = xmlRegNewState(ctxt);
5419     if (ctxt->start == NULL) {
5420         xmlFreeAutomata(ctxt);
5421         return(NULL);
5422     }
5423     ctxt->start->type = XML_REGEXP_START_STATE;
5424     if (xmlRegStatePush(ctxt, ctxt->start) < 0) {
5425         xmlRegFreeState(ctxt->start);
5426         xmlFreeAutomata(ctxt);
5427         return(NULL);
5428     }
5429 
5430     return(ctxt);
5431 }
5432 
5433 /**
5434  * xmlFreeAutomata:
5435  * @am: an automata
5436  *
5437  * Free an automata
5438  */
5439 void
xmlFreeAutomata(xmlAutomataPtr am)5440 xmlFreeAutomata(xmlAutomataPtr am) {
5441     if (am == NULL)
5442         return;
5443     xmlRegFreeParserCtxt(am);
5444 }
5445 
5446 /**
5447  * xmlAutomataGetInitState:
5448  * @am: an automata
5449  *
5450  * Initial state lookup
5451  *
5452  * Returns the initial state of the automata
5453  */
5454 xmlAutomataStatePtr
xmlAutomataGetInitState(xmlAutomataPtr am)5455 xmlAutomataGetInitState(xmlAutomataPtr am) {
5456     if (am == NULL)
5457         return(NULL);
5458     return(am->start);
5459 }
5460 
5461 /**
5462  * xmlAutomataSetFinalState:
5463  * @am: an automata
5464  * @state: a state in this automata
5465  *
5466  * Makes that state a final state
5467  *
5468  * Returns 0 or -1 in case of error
5469  */
5470 int
xmlAutomataSetFinalState(xmlAutomataPtr am,xmlAutomataStatePtr state)5471 xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) {
5472     if ((am == NULL) || (state == NULL))
5473         return(-1);
5474     state->type = XML_REGEXP_FINAL_STATE;
5475     return(0);
5476 }
5477 
5478 /**
5479  * xmlAutomataNewTransition:
5480  * @am: an automata
5481  * @from: the starting point of the transition
5482  * @to: the target point of the transition or NULL
5483  * @token: the input string associated to that transition
5484  * @data: data passed to the callback function if the transition is activated
5485  *
5486  * If @to is NULL, this creates first a new target state in the automata
5487  * and then adds a transition from the @from state to the target state
5488  * activated by the value of @token
5489  *
5490  * Returns the target state or NULL in case of error
5491  */
5492 xmlAutomataStatePtr
xmlAutomataNewTransition(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,void * data)5493 xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from,
5494                          xmlAutomataStatePtr to, const xmlChar *token,
5495                          void *data) {
5496     xmlRegAtomPtr atom;
5497 
5498     if ((am == NULL) || (from == NULL) || (token == NULL))
5499         return(NULL);
5500     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5501     if (atom == NULL)
5502         return(NULL);
5503     atom->data = data;
5504     if (atom == NULL)
5505         return(NULL);
5506     atom->valuep = xmlStrdup(token);
5507 
5508     if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5509         xmlRegFreeAtom(atom);
5510         return(NULL);
5511     }
5512     if (to == NULL)
5513         return(am->state);
5514     return(to);
5515 }
5516 
5517 /**
5518  * xmlAutomataNewTransition2:
5519  * @am: an automata
5520  * @from: the starting point of the transition
5521  * @to: the target point of the transition or NULL
5522  * @token: the first input string associated to that transition
5523  * @token2: the second input string associated to that transition
5524  * @data: data passed to the callback function if the transition is activated
5525  *
5526  * If @to is NULL, this creates first a new target state in the automata
5527  * and then adds a transition from the @from state to the target state
5528  * activated by the value of @token
5529  *
5530  * Returns the target state or NULL in case of error
5531  */
5532 xmlAutomataStatePtr
xmlAutomataNewTransition2(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,const xmlChar * token2,void * data)5533 xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5534                           xmlAutomataStatePtr to, const xmlChar *token,
5535                           const xmlChar *token2, void *data) {
5536     xmlRegAtomPtr atom;
5537 
5538     if ((am == NULL) || (from == NULL) || (token == NULL))
5539         return(NULL);
5540     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5541     if (atom == NULL)
5542         return(NULL);
5543     atom->data = data;
5544     if ((token2 == NULL) || (*token2 == 0)) {
5545         atom->valuep = xmlStrdup(token);
5546     } else {
5547         int lenn, lenp;
5548         xmlChar *str;
5549 
5550         lenn = strlen((char *) token2);
5551         lenp = strlen((char *) token);
5552 
5553         str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5554         if (str == NULL) {
5555             xmlRegFreeAtom(atom);
5556             return(NULL);
5557         }
5558         memcpy(&str[0], token, lenp);
5559         str[lenp] = '|';
5560         memcpy(&str[lenp + 1], token2, lenn);
5561         str[lenn + lenp + 1] = 0;
5562 
5563         atom->valuep = str;
5564     }
5565 
5566     if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5567         xmlRegFreeAtom(atom);
5568         return(NULL);
5569     }
5570     if (to == NULL)
5571         return(am->state);
5572     return(to);
5573 }
5574 
5575 /**
5576  * xmlAutomataNewNegTrans:
5577  * @am: an automata
5578  * @from: the starting point of the transition
5579  * @to: the target point of the transition or NULL
5580  * @token: the first input string associated to that transition
5581  * @token2: the second input string associated to that transition
5582  * @data: data passed to the callback function if the transition is activated
5583  *
5584  * If @to is NULL, this creates first a new target state in the automata
5585  * and then adds a transition from the @from state to the target state
5586  * activated by any value except (@token,@token2)
5587  * Note that if @token2 is not NULL, then (X, NULL) won't match to follow
5588  # the semantic of XSD ##other
5589  *
5590  * Returns the target state or NULL in case of error
5591  */
5592 xmlAutomataStatePtr
xmlAutomataNewNegTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,const xmlChar * token2,void * data)5593 xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5594                        xmlAutomataStatePtr to, const xmlChar *token,
5595                        const xmlChar *token2, void *data) {
5596     xmlRegAtomPtr atom;
5597     xmlChar err_msg[200];
5598 
5599     if ((am == NULL) || (from == NULL) || (token == NULL))
5600         return(NULL);
5601     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5602     if (atom == NULL)
5603         return(NULL);
5604     atom->data = data;
5605     atom->neg = 1;
5606     if ((token2 == NULL) || (*token2 == 0)) {
5607         atom->valuep = xmlStrdup(token);
5608     } else {
5609         int lenn, lenp;
5610         xmlChar *str;
5611 
5612         lenn = strlen((char *) token2);
5613         lenp = strlen((char *) token);
5614 
5615         str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5616         if (str == NULL) {
5617             xmlRegFreeAtom(atom);
5618             return(NULL);
5619         }
5620         memcpy(&str[0], token, lenp);
5621         str[lenp] = '|';
5622         memcpy(&str[lenp + 1], token2, lenn);
5623         str[lenn + lenp + 1] = 0;
5624 
5625         atom->valuep = str;
5626     }
5627     snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep);
5628     err_msg[199] = 0;
5629     atom->valuep2 = xmlStrdup(err_msg);
5630 
5631     if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5632         xmlRegFreeAtom(atom);
5633         return(NULL);
5634     }
5635     am->negs++;
5636     if (to == NULL)
5637         return(am->state);
5638     return(to);
5639 }
5640 
5641 /**
5642  * xmlAutomataNewCountTrans2:
5643  * @am: an automata
5644  * @from: the starting point of the transition
5645  * @to: the target point of the transition or NULL
5646  * @token: the input string associated to that transition
5647  * @token2: the second input string associated to that transition
5648  * @min:  the minimum successive occurences of token
5649  * @max:  the maximum successive occurences of token
5650  * @data:  data associated to the transition
5651  *
5652  * If @to is NULL, this creates first a new target state in the automata
5653  * and then adds a transition from the @from state to the target state
5654  * activated by a succession of input of value @token and @token2 and
5655  * whose number is between @min and @max
5656  *
5657  * Returns the target state or NULL in case of error
5658  */
5659 xmlAutomataStatePtr
xmlAutomataNewCountTrans2(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,const xmlChar * token2,int min,int max,void * data)5660 xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5661                          xmlAutomataStatePtr to, const xmlChar *token,
5662                          const xmlChar *token2,
5663                          int min, int max, void *data) {
5664     xmlRegAtomPtr atom;
5665     int counter;
5666 
5667     if ((am == NULL) || (from == NULL) || (token == NULL))
5668         return(NULL);
5669     if (min < 0)
5670         return(NULL);
5671     if ((max < min) || (max < 1))
5672         return(NULL);
5673     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5674     if (atom == NULL)
5675         return(NULL);
5676     if ((token2 == NULL) || (*token2 == 0)) {
5677         atom->valuep = xmlStrdup(token);
5678     } else {
5679         int lenn, lenp;
5680         xmlChar *str;
5681 
5682         lenn = strlen((char *) token2);
5683         lenp = strlen((char *) token);
5684 
5685         str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5686         if (str == NULL) {
5687             xmlRegFreeAtom(atom);
5688             return(NULL);
5689         }
5690         memcpy(&str[0], token, lenp);
5691         str[lenp] = '|';
5692         memcpy(&str[lenp + 1], token2, lenn);
5693         str[lenn + lenp + 1] = 0;
5694 
5695         atom->valuep = str;
5696     }
5697     atom->data = data;
5698     if (min == 0)
5699         atom->min = 1;
5700     else
5701         atom->min = min;
5702     atom->max = max;
5703 
5704     /*
5705      * associate a counter to the transition.
5706      */
5707     counter = xmlRegGetCounter(am);
5708     am->counters[counter].min = min;
5709     am->counters[counter].max = max;
5710 
5711     /* xmlFAGenerateTransitions(am, from, to, atom); */
5712     if (to == NULL) {
5713         to = xmlRegNewState(am);
5714         xmlRegStatePush(am, to);
5715     }
5716     xmlRegStateAddTrans(am, from, atom, to, counter, -1);
5717     xmlRegAtomPush(am, atom);
5718     am->state = to;
5719 
5720     if (to == NULL)
5721         to = am->state;
5722     if (to == NULL)
5723         return(NULL);
5724     if (min == 0)
5725         xmlFAGenerateEpsilonTransition(am, from, to);
5726     return(to);
5727 }
5728 
5729 /**
5730  * xmlAutomataNewCountTrans:
5731  * @am: an automata
5732  * @from: the starting point of the transition
5733  * @to: the target point of the transition or NULL
5734  * @token: the input string associated to that transition
5735  * @min:  the minimum successive occurences of token
5736  * @max:  the maximum successive occurences of token
5737  * @data:  data associated to the transition
5738  *
5739  * If @to is NULL, this creates first a new target state in the automata
5740  * and then adds a transition from the @from state to the target state
5741  * activated by a succession of input of value @token and whose number
5742  * is between @min and @max
5743  *
5744  * Returns the target state or NULL in case of error
5745  */
5746 xmlAutomataStatePtr
xmlAutomataNewCountTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,int min,int max,void * data)5747 xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5748                          xmlAutomataStatePtr to, const xmlChar *token,
5749                          int min, int max, void *data) {
5750     xmlRegAtomPtr atom;
5751     int counter;
5752 
5753     if ((am == NULL) || (from == NULL) || (token == NULL))
5754         return(NULL);
5755     if (min < 0)
5756         return(NULL);
5757     if ((max < min) || (max < 1))
5758         return(NULL);
5759     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5760     if (atom == NULL)
5761         return(NULL);
5762     atom->valuep = xmlStrdup(token);
5763     atom->data = data;
5764     if (min == 0)
5765         atom->min = 1;
5766     else
5767         atom->min = min;
5768     atom->max = max;
5769 
5770     /*
5771      * associate a counter to the transition.
5772      */
5773     counter = xmlRegGetCounter(am);
5774     am->counters[counter].min = min;
5775     am->counters[counter].max = max;
5776 
5777     /* xmlFAGenerateTransitions(am, from, to, atom); */
5778     if (to == NULL) {
5779         to = xmlRegNewState(am);
5780         xmlRegStatePush(am, to);
5781     }
5782     xmlRegStateAddTrans(am, from, atom, to, counter, -1);
5783     xmlRegAtomPush(am, atom);
5784     am->state = to;
5785 
5786     if (to == NULL)
5787         to = am->state;
5788     if (to == NULL)
5789         return(NULL);
5790     if (min == 0)
5791         xmlFAGenerateEpsilonTransition(am, from, to);
5792     return(to);
5793 }
5794 
5795 /**
5796  * xmlAutomataNewOnceTrans2:
5797  * @am: an automata
5798  * @from: the starting point of the transition
5799  * @to: the target point of the transition or NULL
5800  * @token: the input string associated to that transition
5801  * @token2: the second input string associated to that transition
5802  * @min:  the minimum successive occurences of token
5803  * @max:  the maximum successive occurences of token
5804  * @data:  data associated to the transition
5805  *
5806  * If @to is NULL, this creates first a new target state in the automata
5807  * and then adds a transition from the @from state to the target state
5808  * activated by a succession of input of value @token and @token2 and whose
5809  * number is between @min and @max, moreover that transition can only be
5810  * crossed once.
5811  *
5812  * Returns the target state or NULL in case of error
5813  */
5814 xmlAutomataStatePtr
xmlAutomataNewOnceTrans2(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,const xmlChar * token2,int min,int max,void * data)5815 xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5816                          xmlAutomataStatePtr to, const xmlChar *token,
5817                          const xmlChar *token2,
5818                          int min, int max, void *data) {
5819     xmlRegAtomPtr atom;
5820     int counter;
5821 
5822     if ((am == NULL) || (from == NULL) || (token == NULL))
5823         return(NULL);
5824     if (min < 1)
5825         return(NULL);
5826     if ((max < min) || (max < 1))
5827         return(NULL);
5828     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5829     if (atom == NULL)
5830         return(NULL);
5831     if ((token2 == NULL) || (*token2 == 0)) {
5832         atom->valuep = xmlStrdup(token);
5833     } else {
5834         int lenn, lenp;
5835         xmlChar *str;
5836 
5837         lenn = strlen((char *) token2);
5838         lenp = strlen((char *) token);
5839 
5840         str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5841         if (str == NULL) {
5842             xmlRegFreeAtom(atom);
5843             return(NULL);
5844         }
5845         memcpy(&str[0], token, lenp);
5846         str[lenp] = '|';
5847         memcpy(&str[lenp + 1], token2, lenn);
5848         str[lenn + lenp + 1] = 0;
5849 
5850         atom->valuep = str;
5851     }
5852     atom->data = data;
5853     atom->quant = XML_REGEXP_QUANT_ONCEONLY;
5854     atom->min = min;
5855     atom->max = max;
5856     /*
5857      * associate a counter to the transition.
5858      */
5859     counter = xmlRegGetCounter(am);
5860     am->counters[counter].min = 1;
5861     am->counters[counter].max = 1;
5862 
5863     /* xmlFAGenerateTransitions(am, from, to, atom); */
5864     if (to == NULL) {
5865         to = xmlRegNewState(am);
5866         xmlRegStatePush(am, to);
5867     }
5868     xmlRegStateAddTrans(am, from, atom, to, counter, -1);
5869     xmlRegAtomPush(am, atom);
5870     am->state = to;
5871     return(to);
5872 }
5873 
5874 
5875 
5876 /**
5877  * xmlAutomataNewOnceTrans:
5878  * @am: an automata
5879  * @from: the starting point of the transition
5880  * @to: the target point of the transition or NULL
5881  * @token: the input string associated to that transition
5882  * @min:  the minimum successive occurences of token
5883  * @max:  the maximum successive occurences of token
5884  * @data:  data associated to the transition
5885  *
5886  * If @to is NULL, this creates first a new target state in the automata
5887  * and then adds a transition from the @from state to the target state
5888  * activated by a succession of input of value @token and whose number
5889  * is between @min and @max, moreover that transition can only be crossed
5890  * once.
5891  *
5892  * Returns the target state or NULL in case of error
5893  */
5894 xmlAutomataStatePtr
xmlAutomataNewOnceTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,int min,int max,void * data)5895 xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5896                          xmlAutomataStatePtr to, const xmlChar *token,
5897                          int min, int max, void *data) {
5898     xmlRegAtomPtr atom;
5899     int counter;
5900 
5901     if ((am == NULL) || (from == NULL) || (token == NULL))
5902         return(NULL);
5903     if (min < 1)
5904         return(NULL);
5905     if ((max < min) || (max < 1))
5906         return(NULL);
5907     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5908     if (atom == NULL)
5909         return(NULL);
5910     atom->valuep = xmlStrdup(token);
5911     atom->data = data;
5912     atom->quant = XML_REGEXP_QUANT_ONCEONLY;
5913     atom->min = min;
5914     atom->max = max;
5915     /*
5916      * associate a counter to the transition.
5917      */
5918     counter = xmlRegGetCounter(am);
5919     am->counters[counter].min = 1;
5920     am->counters[counter].max = 1;
5921 
5922     /* xmlFAGenerateTransitions(am, from, to, atom); */
5923     if (to == NULL) {
5924         to = xmlRegNewState(am);
5925         xmlRegStatePush(am, to);
5926     }
5927     xmlRegStateAddTrans(am, from, atom, to, counter, -1);
5928     xmlRegAtomPush(am, atom);
5929     am->state = to;
5930     return(to);
5931 }
5932 
5933 /**
5934  * xmlAutomataNewState:
5935  * @am: an automata
5936  *
5937  * Create a new disconnected state in the automata
5938  *
5939  * Returns the new state or NULL in case of error
5940  */
5941 xmlAutomataStatePtr
xmlAutomataNewState(xmlAutomataPtr am)5942 xmlAutomataNewState(xmlAutomataPtr am) {
5943     xmlAutomataStatePtr to;
5944 
5945     if (am == NULL)
5946         return(NULL);
5947     to = xmlRegNewState(am);
5948     xmlRegStatePush(am, to);
5949     return(to);
5950 }
5951 
5952 /**
5953  * xmlAutomataNewEpsilon:
5954  * @am: an automata
5955  * @from: the starting point of the transition
5956  * @to: the target point of the transition or NULL
5957  *
5958  * If @to is NULL, this creates first a new target state in the automata
5959  * and then adds an epsilon transition from the @from state to the
5960  * target state
5961  *
5962  * Returns the target state or NULL in case of error
5963  */
5964 xmlAutomataStatePtr
xmlAutomataNewEpsilon(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to)5965 xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from,
5966                       xmlAutomataStatePtr to) {
5967     if ((am == NULL) || (from == NULL))
5968         return(NULL);
5969     xmlFAGenerateEpsilonTransition(am, from, to);
5970     if (to == NULL)
5971         return(am->state);
5972     return(to);
5973 }
5974 
5975 /**
5976  * xmlAutomataNewAllTrans:
5977  * @am: an automata
5978  * @from: the starting point of the transition
5979  * @to: the target point of the transition or NULL
5980  * @lax: allow to transition if not all all transitions have been activated
5981  *
5982  * If @to is NULL, this creates first a new target state in the automata
5983  * and then adds a an ALL transition from the @from state to the
5984  * target state. That transition is an epsilon transition allowed only when
5985  * all transitions from the @from node have been activated.
5986  *
5987  * Returns the target state or NULL in case of error
5988  */
5989 xmlAutomataStatePtr
xmlAutomataNewAllTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,int lax)5990 xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5991                        xmlAutomataStatePtr to, int lax) {
5992     if ((am == NULL) || (from == NULL))
5993         return(NULL);
5994     xmlFAGenerateAllTransition(am, from, to, lax);
5995     if (to == NULL)
5996         return(am->state);
5997     return(to);
5998 }
5999 
6000 /**
6001  * xmlAutomataNewCounter:
6002  * @am: an automata
6003  * @min:  the minimal value on the counter
6004  * @max:  the maximal value on the counter
6005  *
6006  * Create a new counter
6007  *
6008  * Returns the counter number or -1 in case of error
6009  */
6010 int
xmlAutomataNewCounter(xmlAutomataPtr am,int min,int max)6011 xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) {
6012     int ret;
6013 
6014     if (am == NULL)
6015         return(-1);
6016 
6017     ret = xmlRegGetCounter(am);
6018     if (ret < 0)
6019         return(-1);
6020     am->counters[ret].min = min;
6021     am->counters[ret].max = max;
6022     return(ret);
6023 }
6024 
6025 /**
6026  * xmlAutomataNewCountedTrans:
6027  * @am: an automata
6028  * @from: the starting point of the transition
6029  * @to: the target point of the transition or NULL
6030  * @counter: the counter associated to that transition
6031  *
6032  * If @to is NULL, this creates first a new target state in the automata
6033  * and then adds an epsilon transition from the @from state to the target state
6034  * which will increment the counter provided
6035  *
6036  * Returns the target state or NULL in case of error
6037  */
6038 xmlAutomataStatePtr
xmlAutomataNewCountedTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,int counter)6039 xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6040                 xmlAutomataStatePtr to, int counter) {
6041     if ((am == NULL) || (from == NULL) || (counter < 0))
6042         return(NULL);
6043     xmlFAGenerateCountedEpsilonTransition(am, from, to, counter);
6044     if (to == NULL)
6045         return(am->state);
6046     return(to);
6047 }
6048 
6049 /**
6050  * xmlAutomataNewCounterTrans:
6051  * @am: an automata
6052  * @from: the starting point of the transition
6053  * @to: the target point of the transition or NULL
6054  * @counter: the counter associated to that transition
6055  *
6056  * If @to is NULL, this creates first a new target state in the automata
6057  * and then adds an epsilon transition from the @from state to the target state
6058  * which will be allowed only if the counter is within the right range.
6059  *
6060  * Returns the target state or NULL in case of error
6061  */
6062 xmlAutomataStatePtr
xmlAutomataNewCounterTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,int counter)6063 xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6064                 xmlAutomataStatePtr to, int counter) {
6065     if ((am == NULL) || (from == NULL) || (counter < 0))
6066         return(NULL);
6067     xmlFAGenerateCountedTransition(am, from, to, counter);
6068     if (to == NULL)
6069         return(am->state);
6070     return(to);
6071 }
6072 
6073 /**
6074  * xmlAutomataCompile:
6075  * @am: an automata
6076  *
6077  * Compile the automata into a Reg Exp ready for being executed.
6078  * The automata should be free after this point.
6079  *
6080  * Returns the compiled regexp or NULL in case of error
6081  */
6082 xmlRegexpPtr
xmlAutomataCompile(xmlAutomataPtr am)6083 xmlAutomataCompile(xmlAutomataPtr am) {
6084     xmlRegexpPtr ret;
6085 
6086     if ((am == NULL) || (am->error != 0)) return(NULL);
6087     xmlFAEliminateEpsilonTransitions(am);
6088     /* xmlFAComputesDeterminism(am); */
6089     ret = xmlRegEpxFromParse(am);
6090 
6091     return(ret);
6092 }
6093 
6094 /**
6095  * xmlAutomataIsDeterminist:
6096  * @am: an automata
6097  *
6098  * Checks if an automata is determinist.
6099  *
6100  * Returns 1 if true, 0 if not, and -1 in case of error
6101  */
6102 int
xmlAutomataIsDeterminist(xmlAutomataPtr am)6103 xmlAutomataIsDeterminist(xmlAutomataPtr am) {
6104     int ret;
6105 
6106     if (am == NULL)
6107         return(-1);
6108 
6109     ret = xmlFAComputesDeterminism(am);
6110     return(ret);
6111 }
6112 #endif /* LIBXML_AUTOMATA_ENABLED */
6113 
6114 #ifdef LIBXML_EXPR_ENABLED
6115 /************************************************************************
6116  *                                                                      *
6117  *              Formal Expression handling code                         *
6118  *                                                                      *
6119  ************************************************************************/
6120 /************************************************************************
6121  *                                                                      *
6122  *              Expression handling context                             *
6123  *                                                                      *
6124  ************************************************************************/
6125 
6126 struct _xmlExpCtxt {
6127     xmlDictPtr dict;
6128     xmlExpNodePtr *table;
6129     int size;
6130     int nbElems;
6131     int nb_nodes;
6132     const char *expr;
6133     const char *cur;
6134     int nb_cons;
6135     int tabSize;
6136 };
6137 
6138 /**
6139  * xmlExpNewCtxt:
6140  * @maxNodes:  the maximum number of nodes
6141  * @dict:  optional dictionnary to use internally
6142  *
6143  * Creates a new context for manipulating expressions
6144  *
6145  * Returns the context or NULL in case of error
6146  */
6147 xmlExpCtxtPtr
xmlExpNewCtxt(int maxNodes,xmlDictPtr dict)6148 xmlExpNewCtxt(int maxNodes, xmlDictPtr dict) {
6149     xmlExpCtxtPtr ret;
6150     int size = 256;
6151 
6152     if (maxNodes <= 4096)
6153         maxNodes = 4096;
6154 
6155     ret = (xmlExpCtxtPtr) xmlMalloc(sizeof(xmlExpCtxt));
6156     if (ret == NULL)
6157         return(NULL);
6158     memset(ret, 0, sizeof(xmlExpCtxt));
6159     ret->size = size;
6160     ret->nbElems = 0;
6161     ret->table = xmlMalloc(size * sizeof(xmlExpNodePtr));
6162     if (ret->table == NULL) {
6163         xmlFree(ret);
6164         return(NULL);
6165     }
6166     memset(ret->table, 0, size * sizeof(xmlExpNodePtr));
6167     if (dict == NULL) {
6168         ret->dict = xmlDictCreate();
6169         if (ret->dict == NULL) {
6170             xmlFree(ret->table);
6171             xmlFree(ret);
6172             return(NULL);
6173         }
6174     } else {
6175         ret->dict = dict;
6176         xmlDictReference(ret->dict);
6177     }
6178     return(ret);
6179 }
6180 
6181 /**
6182  * xmlExpFreeCtxt:
6183  * @ctxt:  an expression context
6184  *
6185  * Free an expression context
6186  */
6187 void
xmlExpFreeCtxt(xmlExpCtxtPtr ctxt)6188 xmlExpFreeCtxt(xmlExpCtxtPtr ctxt) {
6189     if (ctxt == NULL)
6190         return;
6191     xmlDictFree(ctxt->dict);
6192     if (ctxt->table != NULL)
6193         xmlFree(ctxt->table);
6194     xmlFree(ctxt);
6195 }
6196 
6197 /************************************************************************
6198  *                                                                      *
6199  *              Structure associated to an expression node              *
6200  *                                                                      *
6201  ************************************************************************/
6202 #define MAX_NODES 10000
6203 
6204 /* #define DEBUG_DERIV */
6205 
6206 /*
6207  * TODO:
6208  * - Wildcards
6209  * - public API for creation
6210  *
6211  * Started
6212  * - regression testing
6213  *
6214  * Done
6215  * - split into module and test tool
6216  * - memleaks
6217  */
6218 
6219 typedef enum {
6220     XML_EXP_NILABLE = (1 << 0)
6221 } xmlExpNodeInfo;
6222 
6223 #define IS_NILLABLE(node) ((node)->info & XML_EXP_NILABLE)
6224 
6225 struct _xmlExpNode {
6226     unsigned char type;/* xmlExpNodeType */
6227     unsigned char info;/* OR of xmlExpNodeInfo */
6228     unsigned short key; /* the hash key */
6229     unsigned int ref;   /* The number of references */
6230     int c_max;          /* the maximum length it can consume */
6231     xmlExpNodePtr exp_left;
6232     xmlExpNodePtr next;/* the next node in the hash table or free list */
6233     union {
6234         struct {
6235             int f_min;
6236             int f_max;
6237         } count;
6238         struct {
6239             xmlExpNodePtr f_right;
6240         } children;
6241         const xmlChar *f_str;
6242     } field;
6243 };
6244 
6245 #define exp_min field.count.f_min
6246 #define exp_max field.count.f_max
6247 /* #define exp_left field.children.f_left */
6248 #define exp_right field.children.f_right
6249 #define exp_str field.f_str
6250 
6251 static xmlExpNodePtr xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type);
6252 static xmlExpNode forbiddenExpNode = {
6253     XML_EXP_FORBID, 0, 0, 0, 0, NULL, NULL, {{ 0, 0}}
6254 };
6255 xmlExpNodePtr forbiddenExp = &forbiddenExpNode;
6256 static xmlExpNode emptyExpNode = {
6257     XML_EXP_EMPTY, 1, 0, 0, 0, NULL, NULL, {{ 0, 0}}
6258 };
6259 xmlExpNodePtr emptyExp = &emptyExpNode;
6260 
6261 /************************************************************************
6262  *                                                                      *
6263  *  The custom hash table for unicity and canonicalization              *
6264  *  of sub-expressions pointers                                         *
6265  *                                                                      *
6266  ************************************************************************/
6267 /*
6268  * xmlExpHashNameComputeKey:
6269  * Calculate the hash key for a token
6270  */
6271 static unsigned short
xmlExpHashNameComputeKey(const xmlChar * name)6272 xmlExpHashNameComputeKey(const xmlChar *name) {
6273     unsigned short value = 0L;
6274     char ch;
6275 
6276     if (name != NULL) {
6277         value += 30 * (*name);
6278         while ((ch = *name++) != 0) {
6279             value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch);
6280         }
6281     }
6282     return (value);
6283 }
6284 
6285 /*
6286  * xmlExpHashComputeKey:
6287  * Calculate the hash key for a compound expression
6288  */
6289 static unsigned short
xmlExpHashComputeKey(xmlExpNodeType type,xmlExpNodePtr left,xmlExpNodePtr right)6290 xmlExpHashComputeKey(xmlExpNodeType type, xmlExpNodePtr left,
6291                      xmlExpNodePtr right) {
6292     unsigned long value;
6293     unsigned short ret;
6294 
6295     switch (type) {
6296         case XML_EXP_SEQ:
6297             value = left->key;
6298             value += right->key;
6299             value *= 3;
6300             ret = (unsigned short) value;
6301             break;
6302         case XML_EXP_OR:
6303             value = left->key;
6304             value += right->key;
6305             value *= 7;
6306             ret = (unsigned short) value;
6307             break;
6308         case XML_EXP_COUNT:
6309             value = left->key;
6310             value += right->key;
6311             ret = (unsigned short) value;
6312             break;
6313         default:
6314             ret = 0;
6315     }
6316     return(ret);
6317 }
6318 
6319 
6320 static xmlExpNodePtr
xmlExpNewNode(xmlExpCtxtPtr ctxt,xmlExpNodeType type)6321 xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type) {
6322     xmlExpNodePtr ret;
6323 
6324     if (ctxt->nb_nodes >= MAX_NODES)
6325         return(NULL);
6326     ret = (xmlExpNodePtr) xmlMalloc(sizeof(xmlExpNode));
6327     if (ret == NULL)
6328         return(NULL);
6329     memset(ret, 0, sizeof(xmlExpNode));
6330     ret->type = type;
6331     ret->next = NULL;
6332     ctxt->nb_nodes++;
6333     ctxt->nb_cons++;
6334     return(ret);
6335 }
6336 
6337 /**
6338  * xmlExpHashGetEntry:
6339  * @table: the hash table
6340  *
6341  * Get the unique entry from the hash table. The entry is created if
6342  * needed. @left and @right are consumed, i.e. their ref count will
6343  * be decremented by the operation.
6344  *
6345  * Returns the pointer or NULL in case of error
6346  */
6347 static xmlExpNodePtr
xmlExpHashGetEntry(xmlExpCtxtPtr ctxt,xmlExpNodeType type,xmlExpNodePtr left,xmlExpNodePtr right,const xmlChar * name,int min,int max)6348 xmlExpHashGetEntry(xmlExpCtxtPtr ctxt, xmlExpNodeType type,
6349                    xmlExpNodePtr left, xmlExpNodePtr right,
6350                    const xmlChar *name, int min, int max) {
6351     unsigned short kbase, key;
6352     xmlExpNodePtr entry;
6353     xmlExpNodePtr insert;
6354 
6355     if (ctxt == NULL)
6356         return(NULL);
6357 
6358     /*
6359      * Check for duplicate and insertion location.
6360      */
6361     if (type == XML_EXP_ATOM) {
6362         kbase = xmlExpHashNameComputeKey(name);
6363     } else if (type == XML_EXP_COUNT) {
6364         /* COUNT reduction rule 1 */
6365         /* a{1} -> a */
6366         if (min == max) {
6367             if (min == 1) {
6368                 return(left);
6369             }
6370             if (min == 0) {
6371                 xmlExpFree(ctxt, left);
6372                 return(emptyExp);
6373             }
6374         }
6375         if (min < 0) {
6376             xmlExpFree(ctxt, left);
6377             return(forbiddenExp);
6378         }
6379         if (max == -1)
6380             kbase = min + 79;
6381         else
6382             kbase = max - min;
6383         kbase += left->key;
6384     } else if (type == XML_EXP_OR) {
6385         /* Forbid reduction rules */
6386         if (left->type == XML_EXP_FORBID) {
6387             xmlExpFree(ctxt, left);
6388             return(right);
6389         }
6390         if (right->type == XML_EXP_FORBID) {
6391             xmlExpFree(ctxt, right);
6392             return(left);
6393         }
6394 
6395         /* OR reduction rule 1 */
6396         /* a | a reduced to a */
6397         if (left == right) {
6398             left->ref--;
6399             return(left);
6400         }
6401         /* OR canonicalization rule 1 */
6402         /* linearize (a | b) | c into a | (b | c) */
6403         if ((left->type == XML_EXP_OR) && (right->type != XML_EXP_OR)) {
6404             xmlExpNodePtr tmp = left;
6405             left = right;
6406             right = tmp;
6407         }
6408         /* OR reduction rule 2 */
6409         /* a | (a | b) and b | (a | b) are reduced to a | b */
6410         if (right->type == XML_EXP_OR) {
6411             if ((left == right->exp_left) ||
6412                 (left == right->exp_right)) {
6413                 xmlExpFree(ctxt, left);
6414                 return(right);
6415             }
6416         }
6417         /* OR canonicalization rule 2 */
6418         /* linearize (a | b) | c into a | (b | c) */
6419         if (left->type == XML_EXP_OR) {
6420             xmlExpNodePtr tmp;
6421 
6422             /* OR canonicalization rule 2 */
6423             if ((left->exp_right->type != XML_EXP_OR) &&
6424                 (left->exp_right->key < left->exp_left->key)) {
6425                 tmp = left->exp_right;
6426                 left->exp_right = left->exp_left;
6427                 left->exp_left = tmp;
6428             }
6429             left->exp_right->ref++;
6430             tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_right, right,
6431                                      NULL, 0, 0);
6432             left->exp_left->ref++;
6433             tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_left, tmp,
6434                                      NULL, 0, 0);
6435 
6436             xmlExpFree(ctxt, left);
6437             return(tmp);
6438         }
6439         if (right->type == XML_EXP_OR) {
6440             /* Ordering in the tree */
6441             /* C | (A | B) -> A | (B | C) */
6442             if (left->key > right->exp_right->key) {
6443                 xmlExpNodePtr tmp;
6444                 right->exp_right->ref++;
6445                 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_right,
6446                                          left, NULL, 0, 0);
6447                 right->exp_left->ref++;
6448                 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
6449                                          tmp, NULL, 0, 0);
6450                 xmlExpFree(ctxt, right);
6451                 return(tmp);
6452             }
6453             /* Ordering in the tree */
6454             /* B | (A | C) -> A | (B | C) */
6455             if (left->key > right->exp_left->key) {
6456                 xmlExpNodePtr tmp;
6457                 right->exp_right->ref++;
6458                 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left,
6459                                          right->exp_right, NULL, 0, 0);
6460                 right->exp_left->ref++;
6461                 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
6462                                          tmp, NULL, 0, 0);
6463                 xmlExpFree(ctxt, right);
6464                 return(tmp);
6465             }
6466         }
6467         /* we know both types are != XML_EXP_OR here */
6468         else if (left->key > right->key) {
6469             xmlExpNodePtr tmp = left;
6470             left = right;
6471             right = tmp;
6472         }
6473         kbase = xmlExpHashComputeKey(type, left, right);
6474     } else if (type == XML_EXP_SEQ) {
6475         /* Forbid reduction rules */
6476         if (left->type == XML_EXP_FORBID) {
6477             xmlExpFree(ctxt, right);
6478             return(left);
6479         }
6480         if (right->type == XML_EXP_FORBID) {
6481             xmlExpFree(ctxt, left);
6482             return(right);
6483         }
6484         /* Empty reduction rules */
6485         if (right->type == XML_EXP_EMPTY) {
6486             return(left);
6487         }
6488         if (left->type == XML_EXP_EMPTY) {
6489             return(right);
6490         }
6491         kbase = xmlExpHashComputeKey(type, left, right);
6492     } else
6493         return(NULL);
6494 
6495     key = kbase % ctxt->size;
6496     if (ctxt->table[key] != NULL) {
6497         for (insert = ctxt->table[key]; insert != NULL;
6498              insert = insert->next) {
6499             if ((insert->key == kbase) &&
6500                 (insert->type == type)) {
6501                 if (type == XML_EXP_ATOM) {
6502                     if (name == insert->exp_str) {
6503                         insert->ref++;
6504                         return(insert);
6505                     }
6506                 } else if (type == XML_EXP_COUNT) {
6507                     if ((insert->exp_min == min) && (insert->exp_max == max) &&
6508                         (insert->exp_left == left)) {
6509                         insert->ref++;
6510                         left->ref--;
6511                         return(insert);
6512                     }
6513                 } else if ((insert->exp_left == left) &&
6514                            (insert->exp_right == right)) {
6515                     insert->ref++;
6516                     left->ref--;
6517                     right->ref--;
6518                     return(insert);
6519                 }
6520             }
6521         }
6522     }
6523 
6524     entry = xmlExpNewNode(ctxt, type);
6525     if (entry == NULL)
6526         return(NULL);
6527     entry->key = kbase;
6528     if (type == XML_EXP_ATOM) {
6529         entry->exp_str = name;
6530         entry->c_max = 1;
6531     } else if (type == XML_EXP_COUNT) {
6532         entry->exp_min = min;
6533         entry->exp_max = max;
6534         entry->exp_left = left;
6535         if ((min == 0) || (IS_NILLABLE(left)))
6536             entry->info |= XML_EXP_NILABLE;
6537         if (max < 0)
6538             entry->c_max = -1;
6539         else
6540             entry->c_max = max * entry->exp_left->c_max;
6541     } else {
6542         entry->exp_left = left;
6543         entry->exp_right = right;
6544         if (type == XML_EXP_OR) {
6545             if ((IS_NILLABLE(left)) || (IS_NILLABLE(right)))
6546                 entry->info |= XML_EXP_NILABLE;
6547             if ((entry->exp_left->c_max == -1) ||
6548                 (entry->exp_right->c_max == -1))
6549                 entry->c_max = -1;
6550             else if (entry->exp_left->c_max > entry->exp_right->c_max)
6551                 entry->c_max = entry->exp_left->c_max;
6552             else
6553                 entry->c_max = entry->exp_right->c_max;
6554         } else {
6555             if ((IS_NILLABLE(left)) && (IS_NILLABLE(right)))
6556                 entry->info |= XML_EXP_NILABLE;
6557             if ((entry->exp_left->c_max == -1) ||
6558                 (entry->exp_right->c_max == -1))
6559                 entry->c_max = -1;
6560             else
6561                 entry->c_max = entry->exp_left->c_max + entry->exp_right->c_max;
6562         }
6563     }
6564     entry->ref = 1;
6565     if (ctxt->table[key] != NULL)
6566         entry->next = ctxt->table[key];
6567 
6568     ctxt->table[key] = entry;
6569     ctxt->nbElems++;
6570 
6571     return(entry);
6572 }
6573 
6574 /**
6575  * xmlExpFree:
6576  * @ctxt: the expression context
6577  * @exp: the expression
6578  *
6579  * Dereference the expression
6580  */
6581 void
xmlExpFree(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp)6582 xmlExpFree(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp) {
6583     if ((exp == NULL) || (exp == forbiddenExp) || (exp == emptyExp))
6584         return;
6585     exp->ref--;
6586     if (exp->ref == 0) {
6587         unsigned short key;
6588 
6589         /* Unlink it first from the hash table */
6590         key = exp->key % ctxt->size;
6591         if (ctxt->table[key] == exp) {
6592             ctxt->table[key] = exp->next;
6593         } else {
6594             xmlExpNodePtr tmp;
6595 
6596             tmp = ctxt->table[key];
6597             while (tmp != NULL) {
6598                 if (tmp->next == exp) {
6599                     tmp->next = exp->next;
6600                     break;
6601                 }
6602                 tmp = tmp->next;
6603             }
6604         }
6605 
6606         if ((exp->type == XML_EXP_SEQ) || (exp->type == XML_EXP_OR)) {
6607             xmlExpFree(ctxt, exp->exp_left);
6608             xmlExpFree(ctxt, exp->exp_right);
6609         } else if (exp->type == XML_EXP_COUNT) {
6610             xmlExpFree(ctxt, exp->exp_left);
6611         }
6612         xmlFree(exp);
6613         ctxt->nb_nodes--;
6614     }
6615 }
6616 
6617 /**
6618  * xmlExpRef:
6619  * @exp: the expression
6620  *
6621  * Increase the reference count of the expression
6622  */
6623 void
xmlExpRef(xmlExpNodePtr exp)6624 xmlExpRef(xmlExpNodePtr exp) {
6625     if (exp != NULL)
6626         exp->ref++;
6627 }
6628 
6629 /**
6630  * xmlExpNewAtom:
6631  * @ctxt: the expression context
6632  * @name: the atom name
6633  * @len: the atom name length in byte (or -1);
6634  *
6635  * Get the atom associated to this name from that context
6636  *
6637  * Returns the node or NULL in case of error
6638  */
6639 xmlExpNodePtr
xmlExpNewAtom(xmlExpCtxtPtr ctxt,const xmlChar * name,int len)6640 xmlExpNewAtom(xmlExpCtxtPtr ctxt, const xmlChar *name, int len) {
6641     if ((ctxt == NULL) || (name == NULL))
6642         return(NULL);
6643     name = xmlDictLookup(ctxt->dict, name, len);
6644     if (name == NULL)
6645         return(NULL);
6646     return(xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, name, 0, 0));
6647 }
6648 
6649 /**
6650  * xmlExpNewOr:
6651  * @ctxt: the expression context
6652  * @left: left expression
6653  * @right: right expression
6654  *
6655  * Get the atom associated to the choice @left | @right
6656  * Note that @left and @right are consumed in the operation, to keep
6657  * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
6658  * this is true even in case of failure (unless ctxt == NULL).
6659  *
6660  * Returns the node or NULL in case of error
6661  */
6662 xmlExpNodePtr
xmlExpNewOr(xmlExpCtxtPtr ctxt,xmlExpNodePtr left,xmlExpNodePtr right)6663 xmlExpNewOr(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
6664     if (ctxt == NULL)
6665         return(NULL);
6666     if ((left == NULL) || (right == NULL)) {
6667         xmlExpFree(ctxt, left);
6668         xmlExpFree(ctxt, right);
6669         return(NULL);
6670     }
6671     return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, right, NULL, 0, 0));
6672 }
6673 
6674 /**
6675  * xmlExpNewSeq:
6676  * @ctxt: the expression context
6677  * @left: left expression
6678  * @right: right expression
6679  *
6680  * Get the atom associated to the sequence @left , @right
6681  * Note that @left and @right are consumed in the operation, to keep
6682  * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
6683  * this is true even in case of failure (unless ctxt == NULL).
6684  *
6685  * Returns the node or NULL in case of error
6686  */
6687 xmlExpNodePtr
xmlExpNewSeq(xmlExpCtxtPtr ctxt,xmlExpNodePtr left,xmlExpNodePtr right)6688 xmlExpNewSeq(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
6689     if (ctxt == NULL)
6690         return(NULL);
6691     if ((left == NULL) || (right == NULL)) {
6692         xmlExpFree(ctxt, left);
6693         xmlExpFree(ctxt, right);
6694         return(NULL);
6695     }
6696     return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, left, right, NULL, 0, 0));
6697 }
6698 
6699 /**
6700  * xmlExpNewRange:
6701  * @ctxt: the expression context
6702  * @subset: the expression to be repeated
6703  * @min: the lower bound for the repetition
6704  * @max: the upper bound for the repetition, -1 means infinite
6705  *
6706  * Get the atom associated to the range (@subset){@min, @max}
6707  * Note that @subset is consumed in the operation, to keep
6708  * an handle on it use xmlExpRef() and use xmlExpFree() to release it,
6709  * this is true even in case of failure (unless ctxt == NULL).
6710  *
6711  * Returns the node or NULL in case of error
6712  */
6713 xmlExpNodePtr
xmlExpNewRange(xmlExpCtxtPtr ctxt,xmlExpNodePtr subset,int min,int max)6714 xmlExpNewRange(xmlExpCtxtPtr ctxt, xmlExpNodePtr subset, int min, int max) {
6715     if (ctxt == NULL)
6716         return(NULL);
6717     if ((subset == NULL) || (min < 0) || (max < -1) ||
6718         ((max >= 0) && (min > max))) {
6719         xmlExpFree(ctxt, subset);
6720         return(NULL);
6721     }
6722     return(xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, subset,
6723                               NULL, NULL, min, max));
6724 }
6725 
6726 /************************************************************************
6727  *                                                                      *
6728  *              Public API for operations on expressions                *
6729  *                                                                      *
6730  ************************************************************************/
6731 
6732 static int
xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar ** list,int len,int nb)6733 xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
6734                      const xmlChar**list, int len, int nb) {
6735     int tmp, tmp2;
6736 tail:
6737     switch (exp->type) {
6738         case XML_EXP_EMPTY:
6739             return(0);
6740         case XML_EXP_ATOM:
6741             for (tmp = 0;tmp < nb;tmp++)
6742                 if (list[tmp] == exp->exp_str)
6743                     return(0);
6744             if (nb >= len)
6745                 return(-2);
6746             list[nb++] = exp->exp_str;
6747             return(1);
6748         case XML_EXP_COUNT:
6749             exp = exp->exp_left;
6750             goto tail;
6751         case XML_EXP_SEQ:
6752         case XML_EXP_OR:
6753             tmp = xmlExpGetLanguageInt(ctxt, exp->exp_left, list, len, nb);
6754             if (tmp < 0)
6755                 return(tmp);
6756             tmp2 = xmlExpGetLanguageInt(ctxt, exp->exp_right, list, len,
6757                                         nb + tmp);
6758             if (tmp2 < 0)
6759                 return(tmp2);
6760             return(tmp + tmp2);
6761     }
6762     return(-1);
6763 }
6764 
6765 /**
6766  * xmlExpGetLanguage:
6767  * @ctxt: the expression context
6768  * @exp: the expression
6769  * @langList: where to store the tokens
6770  * @len: the allocated length of @list
6771  *
6772  * Find all the strings used in @exp and store them in @list
6773  *
6774  * Returns the number of unique strings found, -1 in case of errors and
6775  *         -2 if there is more than @len strings
6776  */
6777 int
xmlExpGetLanguage(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar ** langList,int len)6778 xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
6779                   const xmlChar**langList, int len) {
6780     if ((ctxt == NULL) || (exp == NULL) || (langList == NULL) || (len <= 0))
6781         return(-1);
6782     return(xmlExpGetLanguageInt(ctxt, exp, langList, len, 0));
6783 }
6784 
6785 static int
xmlExpGetStartInt(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar ** list,int len,int nb)6786 xmlExpGetStartInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
6787                   const xmlChar**list, int len, int nb) {
6788     int tmp, tmp2;
6789 tail:
6790     switch (exp->type) {
6791         case XML_EXP_FORBID:
6792             return(0);
6793         case XML_EXP_EMPTY:
6794             return(0);
6795         case XML_EXP_ATOM:
6796             for (tmp = 0;tmp < nb;tmp++)
6797                 if (list[tmp] == exp->exp_str)
6798                     return(0);
6799             if (nb >= len)
6800                 return(-2);
6801             list[nb++] = exp->exp_str;
6802             return(1);
6803         case XML_EXP_COUNT:
6804             exp = exp->exp_left;
6805             goto tail;
6806         case XML_EXP_SEQ:
6807             tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
6808             if (tmp < 0)
6809                 return(tmp);
6810             if (IS_NILLABLE(exp->exp_left)) {
6811                 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
6812                                             nb + tmp);
6813                 if (tmp2 < 0)
6814                     return(tmp2);
6815                 tmp += tmp2;
6816             }
6817             return(tmp);
6818         case XML_EXP_OR:
6819             tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
6820             if (tmp < 0)
6821                 return(tmp);
6822             tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
6823                                         nb + tmp);
6824             if (tmp2 < 0)
6825                 return(tmp2);
6826             return(tmp + tmp2);
6827     }
6828     return(-1);
6829 }
6830 
6831 /**
6832  * xmlExpGetStart:
6833  * @ctxt: the expression context
6834  * @exp: the expression
6835  * @tokList: where to store the tokens
6836  * @len: the allocated length of @list
6837  *
6838  * Find all the strings that appears at the start of the languages
6839  * accepted by @exp and store them in @list. E.g. for (a, b) | c
6840  * it will return the list [a, c]
6841  *
6842  * Returns the number of unique strings found, -1 in case of errors and
6843  *         -2 if there is more than @len strings
6844  */
6845 int
xmlExpGetStart(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar ** tokList,int len)6846 xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
6847                const xmlChar**tokList, int len) {
6848     if ((ctxt == NULL) || (exp == NULL) || (tokList == NULL) || (len <= 0))
6849         return(-1);
6850     return(xmlExpGetStartInt(ctxt, exp, tokList, len, 0));
6851 }
6852 
6853 /**
6854  * xmlExpIsNillable:
6855  * @exp: the expression
6856  *
6857  * Finds if the expression is nillable, i.e. if it accepts the empty sequqnce
6858  *
6859  * Returns 1 if nillable, 0 if not and -1 in case of error
6860  */
6861 int
xmlExpIsNillable(xmlExpNodePtr exp)6862 xmlExpIsNillable(xmlExpNodePtr exp) {
6863     if (exp == NULL)
6864         return(-1);
6865     return(IS_NILLABLE(exp) != 0);
6866 }
6867 
6868 static xmlExpNodePtr
xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar * str)6869 xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar *str)
6870 {
6871     xmlExpNodePtr ret;
6872 
6873     switch (exp->type) {
6874         case XML_EXP_EMPTY:
6875             return(forbiddenExp);
6876         case XML_EXP_FORBID:
6877             return(forbiddenExp);
6878         case XML_EXP_ATOM:
6879             if (exp->exp_str == str) {
6880 #ifdef DEBUG_DERIV
6881                 printf("deriv atom: equal => Empty\n");
6882 #endif
6883                 ret = emptyExp;
6884             } else {
6885 #ifdef DEBUG_DERIV
6886                 printf("deriv atom: mismatch => forbid\n");
6887 #endif
6888                 /* TODO wildcards here */
6889                 ret = forbiddenExp;
6890             }
6891             return(ret);
6892         case XML_EXP_OR: {
6893             xmlExpNodePtr tmp;
6894 
6895 #ifdef DEBUG_DERIV
6896             printf("deriv or: => or(derivs)\n");
6897 #endif
6898             tmp = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
6899             if (tmp == NULL) {
6900                 return(NULL);
6901             }
6902             ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
6903             if (ret == NULL) {
6904                 xmlExpFree(ctxt, tmp);
6905                 return(NULL);
6906             }
6907             ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret,
6908                              NULL, 0, 0);
6909             return(ret);
6910         }
6911         case XML_EXP_SEQ:
6912 #ifdef DEBUG_DERIV
6913             printf("deriv seq: starting with left\n");
6914 #endif
6915             ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
6916             if (ret == NULL) {
6917                 return(NULL);
6918             } else if (ret == forbiddenExp) {
6919                 if (IS_NILLABLE(exp->exp_left)) {
6920 #ifdef DEBUG_DERIV
6921                     printf("deriv seq: left failed but nillable\n");
6922 #endif
6923                     ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
6924                 }
6925             } else {
6926 #ifdef DEBUG_DERIV
6927                 printf("deriv seq: left match => sequence\n");
6928 #endif
6929                 exp->exp_right->ref++;
6930                 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, exp->exp_right,
6931                                          NULL, 0, 0);
6932             }
6933             return(ret);
6934         case XML_EXP_COUNT: {
6935             int min, max;
6936             xmlExpNodePtr tmp;
6937 
6938             if (exp->exp_max == 0)
6939                 return(forbiddenExp);
6940             ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
6941             if (ret == NULL)
6942                 return(NULL);
6943             if (ret == forbiddenExp) {
6944 #ifdef DEBUG_DERIV
6945                 printf("deriv count: pattern mismatch => forbid\n");
6946 #endif
6947                 return(ret);
6948             }
6949             if (exp->exp_max == 1)
6950                 return(ret);
6951             if (exp->exp_max < 0) /* unbounded */
6952                 max = -1;
6953             else
6954                 max = exp->exp_max - 1;
6955             if (exp->exp_min > 0)
6956                 min = exp->exp_min - 1;
6957             else
6958                 min = 0;
6959             exp->exp_left->ref++;
6960             tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, NULL,
6961                                      NULL, min, max);
6962             if (ret == emptyExp) {
6963 #ifdef DEBUG_DERIV
6964                 printf("deriv count: match to empty => new count\n");
6965 #endif
6966                 return(tmp);
6967             }
6968 #ifdef DEBUG_DERIV
6969             printf("deriv count: match => sequence with new count\n");
6970 #endif
6971             return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, tmp,
6972                                       NULL, 0, 0));
6973         }
6974     }
6975     return(NULL);
6976 }
6977 
6978 /**
6979  * xmlExpStringDerive:
6980  * @ctxt: the expression context
6981  * @exp: the expression
6982  * @str: the string
6983  * @len: the string len in bytes if available
6984  *
6985  * Do one step of Brzozowski derivation of the expression @exp with
6986  * respect to the input string
6987  *
6988  * Returns the resulting expression or NULL in case of internal error
6989  */
6990 xmlExpNodePtr
xmlExpStringDerive(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar * str,int len)6991 xmlExpStringDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
6992                    const xmlChar *str, int len) {
6993     const xmlChar *input;
6994 
6995     if ((exp == NULL) || (ctxt == NULL) || (str == NULL)) {
6996         return(NULL);
6997     }
6998     /*
6999      * check the string is in the dictionnary, if yes use an interned
7000      * copy, otherwise we know it's not an acceptable input
7001      */
7002     input = xmlDictExists(ctxt->dict, str, len);
7003     if (input == NULL) {
7004         return(forbiddenExp);
7005     }
7006     return(xmlExpStringDeriveInt(ctxt, exp, input));
7007 }
7008 
7009 static int
xmlExpCheckCard(xmlExpNodePtr exp,xmlExpNodePtr sub)7010 xmlExpCheckCard(xmlExpNodePtr exp, xmlExpNodePtr sub) {
7011     int ret = 1;
7012 
7013     if (sub->c_max == -1) {
7014         if (exp->c_max != -1)
7015             ret = 0;
7016     } else if ((exp->c_max >= 0) && (exp->c_max < sub->c_max)) {
7017         ret = 0;
7018     }
7019 #if 0
7020     if ((IS_NILLABLE(sub)) && (!IS_NILLABLE(exp)))
7021         ret = 0;
7022 #endif
7023     return(ret);
7024 }
7025 
7026 static xmlExpNodePtr xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7027                                         xmlExpNodePtr sub);
7028 /**
7029  * xmlExpDivide:
7030  * @ctxt: the expressions context
7031  * @exp: the englobing expression
7032  * @sub: the subexpression
7033  * @mult: the multiple expression
7034  * @remain: the remain from the derivation of the multiple
7035  *
7036  * Check if exp is a multiple of sub, i.e. if there is a finite number n
7037  * so that sub{n} subsume exp
7038  *
7039  * Returns the multiple value if successful, 0 if it is not a multiple
7040  *         and -1 in case of internel error.
7041  */
7042 
7043 static int
xmlExpDivide(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,xmlExpNodePtr sub,xmlExpNodePtr * mult,xmlExpNodePtr * remain)7044 xmlExpDivide(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub,
7045              xmlExpNodePtr *mult, xmlExpNodePtr *remain) {
7046     int i;
7047     xmlExpNodePtr tmp, tmp2;
7048 
7049     if (mult != NULL) *mult = NULL;
7050     if (remain != NULL) *remain = NULL;
7051     if (exp->c_max == -1) return(0);
7052     if (IS_NILLABLE(exp) && (!IS_NILLABLE(sub))) return(0);
7053 
7054     for (i = 1;i <= exp->c_max;i++) {
7055         sub->ref++;
7056         tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7057                                  sub, NULL, NULL, i, i);
7058         if (tmp == NULL) {
7059             return(-1);
7060         }
7061         if (!xmlExpCheckCard(tmp, exp)) {
7062             xmlExpFree(ctxt, tmp);
7063             continue;
7064         }
7065         tmp2 = xmlExpExpDeriveInt(ctxt, tmp, exp);
7066         if (tmp2 == NULL) {
7067             xmlExpFree(ctxt, tmp);
7068             return(-1);
7069         }
7070         if ((tmp2 != forbiddenExp) && (IS_NILLABLE(tmp2))) {
7071             if (remain != NULL)
7072                 *remain = tmp2;
7073             else
7074                 xmlExpFree(ctxt, tmp2);
7075             if (mult != NULL)
7076                 *mult = tmp;
7077             else
7078                 xmlExpFree(ctxt, tmp);
7079 #ifdef DEBUG_DERIV
7080             printf("Divide succeeded %d\n", i);
7081 #endif
7082             return(i);
7083         }
7084         xmlExpFree(ctxt, tmp);
7085         xmlExpFree(ctxt, tmp2);
7086     }
7087 #ifdef DEBUG_DERIV
7088     printf("Divide failed\n");
7089 #endif
7090     return(0);
7091 }
7092 
7093 /**
7094  * xmlExpExpDeriveInt:
7095  * @ctxt: the expressions context
7096  * @exp: the englobing expression
7097  * @sub: the subexpression
7098  *
7099  * Try to do a step of Brzozowski derivation but at a higher level
7100  * the input being a subexpression.
7101  *
7102  * Returns the resulting expression or NULL in case of internal error
7103  */
7104 static xmlExpNodePtr
xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,xmlExpNodePtr sub)7105 xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7106     xmlExpNodePtr ret, tmp, tmp2, tmp3;
7107     const xmlChar **tab;
7108     int len, i;
7109 
7110     /*
7111      * In case of equality and if the expression can only consume a finite
7112      * amount, then the derivation is empty
7113      */
7114     if ((exp == sub) && (exp->c_max >= 0)) {
7115 #ifdef DEBUG_DERIV
7116         printf("Equal(exp, sub) and finite -> Empty\n");
7117 #endif
7118         return(emptyExp);
7119     }
7120     /*
7121      * decompose sub sequence first
7122      */
7123     if (sub->type == XML_EXP_EMPTY) {
7124 #ifdef DEBUG_DERIV
7125         printf("Empty(sub) -> Empty\n");
7126 #endif
7127         exp->ref++;
7128         return(exp);
7129     }
7130     if (sub->type == XML_EXP_SEQ) {
7131 #ifdef DEBUG_DERIV
7132         printf("Seq(sub) -> decompose\n");
7133 #endif
7134         tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7135         if (tmp == NULL)
7136             return(NULL);
7137         if (tmp == forbiddenExp)
7138             return(tmp);
7139         ret = xmlExpExpDeriveInt(ctxt, tmp, sub->exp_right);
7140         xmlExpFree(ctxt, tmp);
7141         return(ret);
7142     }
7143     if (sub->type == XML_EXP_OR) {
7144 #ifdef DEBUG_DERIV
7145         printf("Or(sub) -> decompose\n");
7146 #endif
7147         tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7148         if (tmp == forbiddenExp)
7149             return(tmp);
7150         if (tmp == NULL)
7151             return(NULL);
7152         ret = xmlExpExpDeriveInt(ctxt, exp, sub->exp_right);
7153         if ((ret == NULL) || (ret == forbiddenExp)) {
7154             xmlExpFree(ctxt, tmp);
7155             return(ret);
7156         }
7157         return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, NULL, 0, 0));
7158     }
7159     if (!xmlExpCheckCard(exp, sub)) {
7160 #ifdef DEBUG_DERIV
7161         printf("CheckCard(exp, sub) failed -> Forbid\n");
7162 #endif
7163         return(forbiddenExp);
7164     }
7165     switch (exp->type) {
7166         case XML_EXP_EMPTY:
7167             if (sub == emptyExp)
7168                 return(emptyExp);
7169 #ifdef DEBUG_DERIV
7170             printf("Empty(exp) -> Forbid\n");
7171 #endif
7172             return(forbiddenExp);
7173         case XML_EXP_FORBID:
7174 #ifdef DEBUG_DERIV
7175             printf("Forbid(exp) -> Forbid\n");
7176 #endif
7177             return(forbiddenExp);
7178         case XML_EXP_ATOM:
7179             if (sub->type == XML_EXP_ATOM) {
7180                 /* TODO: handle wildcards */
7181                 if (exp->exp_str == sub->exp_str) {
7182 #ifdef DEBUG_DERIV
7183                     printf("Atom match -> Empty\n");
7184 #endif
7185                     return(emptyExp);
7186                 }
7187 #ifdef DEBUG_DERIV
7188                 printf("Atom mismatch -> Forbid\n");
7189 #endif
7190                 return(forbiddenExp);
7191             }
7192             if ((sub->type == XML_EXP_COUNT) &&
7193                 (sub->exp_max == 1) &&
7194                 (sub->exp_left->type == XML_EXP_ATOM)) {
7195                 /* TODO: handle wildcards */
7196                 if (exp->exp_str == sub->exp_left->exp_str) {
7197 #ifdef DEBUG_DERIV
7198                     printf("Atom match -> Empty\n");
7199 #endif
7200                     return(emptyExp);
7201                 }
7202 #ifdef DEBUG_DERIV
7203                 printf("Atom mismatch -> Forbid\n");
7204 #endif
7205                 return(forbiddenExp);
7206             }
7207 #ifdef DEBUG_DERIV
7208             printf("Compex exp vs Atom -> Forbid\n");
7209 #endif
7210             return(forbiddenExp);
7211         case XML_EXP_SEQ:
7212             /* try to get the sequence consumed only if possible */
7213             if (xmlExpCheckCard(exp->exp_left, sub)) {
7214                 /* See if the sequence can be consumed directly */
7215 #ifdef DEBUG_DERIV
7216                 printf("Seq trying left only\n");
7217 #endif
7218                 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7219                 if ((ret != forbiddenExp) && (ret != NULL)) {
7220 #ifdef DEBUG_DERIV
7221                     printf("Seq trying left only worked\n");
7222 #endif
7223                     /*
7224                      * TODO: assumption here that we are determinist
7225                      *       i.e. we won't get to a nillable exp left
7226                      *       subset which could be matched by the right
7227                      *       part too.
7228                      * e.g.: (a | b)+,(a | c) and 'a+,a'
7229                      */
7230                     exp->exp_right->ref++;
7231                     return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7232                                               exp->exp_right, NULL, 0, 0));
7233                 }
7234 #ifdef DEBUG_DERIV
7235             } else {
7236                 printf("Seq: left too short\n");
7237 #endif
7238             }
7239             /* Try instead to decompose */
7240             if (sub->type == XML_EXP_COUNT) {
7241                 int min, max;
7242 
7243 #ifdef DEBUG_DERIV
7244                 printf("Seq: sub is a count\n");
7245 #endif
7246                 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7247                 if (ret == NULL)
7248                     return(NULL);
7249                 if (ret != forbiddenExp) {
7250 #ifdef DEBUG_DERIV
7251                     printf("Seq , Count match on left\n");
7252 #endif
7253                     if (sub->exp_max < 0)
7254                         max = -1;
7255                     else
7256                         max = sub->exp_max -1;
7257                     if (sub->exp_min > 0)
7258                         min = sub->exp_min -1;
7259                     else
7260                         min = 0;
7261                     exp->exp_right->ref++;
7262                     tmp = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7263                                              exp->exp_right, NULL, 0, 0);
7264                     if (tmp == NULL)
7265                         return(NULL);
7266 
7267                     sub->exp_left->ref++;
7268                     tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7269                                       sub->exp_left, NULL, NULL, min, max);
7270                     if (tmp2 == NULL) {
7271                         xmlExpFree(ctxt, tmp);
7272                         return(NULL);
7273                     }
7274                     ret = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7275                     xmlExpFree(ctxt, tmp);
7276                     xmlExpFree(ctxt, tmp2);
7277                     return(ret);
7278                 }
7279             }
7280             /* we made no progress on structured operations */
7281             break;
7282         case XML_EXP_OR:
7283 #ifdef DEBUG_DERIV
7284             printf("Or , trying both side\n");
7285 #endif
7286             ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7287             if (ret == NULL)
7288                 return(NULL);
7289             tmp = xmlExpExpDeriveInt(ctxt, exp->exp_right, sub);
7290             if (tmp == NULL) {
7291                 xmlExpFree(ctxt, ret);
7292                 return(NULL);
7293             }
7294             return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp, NULL, 0, 0));
7295         case XML_EXP_COUNT: {
7296             int min, max;
7297 
7298             if (sub->type == XML_EXP_COUNT) {
7299                 /*
7300                  * Try to see if the loop is completely subsumed
7301                  */
7302                 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7303                 if (tmp == NULL)
7304                     return(NULL);
7305                 if (tmp == forbiddenExp) {
7306                     int mult;
7307 
7308 #ifdef DEBUG_DERIV
7309                     printf("Count, Count inner don't subsume\n");
7310 #endif
7311                     mult = xmlExpDivide(ctxt, sub->exp_left, exp->exp_left,
7312                                         NULL, &tmp);
7313                     if (mult <= 0) {
7314 #ifdef DEBUG_DERIV
7315                         printf("Count, Count not multiple => forbidden\n");
7316 #endif
7317                         return(forbiddenExp);
7318                     }
7319                     if (sub->exp_max == -1) {
7320                         max = -1;
7321                         if (exp->exp_max == -1) {
7322                             if (exp->exp_min <= sub->exp_min * mult)
7323                                 min = 0;
7324                             else
7325                                 min = exp->exp_min - sub->exp_min * mult;
7326                         } else {
7327 #ifdef DEBUG_DERIV
7328                             printf("Count, Count finite can't subsume infinite\n");
7329 #endif
7330                             xmlExpFree(ctxt, tmp);
7331                             return(forbiddenExp);
7332                         }
7333                     } else {
7334                         if (exp->exp_max == -1) {
7335 #ifdef DEBUG_DERIV
7336                             printf("Infinite loop consume mult finite loop\n");
7337 #endif
7338                             if (exp->exp_min > sub->exp_min * mult) {
7339                                 max = -1;
7340                                 min = exp->exp_min - sub->exp_min * mult;
7341                             } else {
7342                                 max = -1;
7343                                 min = 0;
7344                             }
7345                         } else {
7346                             if (exp->exp_max < sub->exp_max * mult) {
7347 #ifdef DEBUG_DERIV
7348                                 printf("loops max mult mismatch => forbidden\n");
7349 #endif
7350                                 xmlExpFree(ctxt, tmp);
7351                                 return(forbiddenExp);
7352                             }
7353                             if (sub->exp_max * mult > exp->exp_min)
7354                                 min = 0;
7355                             else
7356                                 min = exp->exp_min - sub->exp_max * mult;
7357                             max = exp->exp_max - sub->exp_max * mult;
7358                         }
7359                     }
7360                 } else if (!IS_NILLABLE(tmp)) {
7361                     /*
7362                      * TODO: loop here to try to grow if working on finite
7363                      *       blocks.
7364                      */
7365 #ifdef DEBUG_DERIV
7366                     printf("Count, Count remain not nillable => forbidden\n");
7367 #endif
7368                     xmlExpFree(ctxt, tmp);
7369                     return(forbiddenExp);
7370                 } else if (sub->exp_max == -1) {
7371                     if (exp->exp_max == -1) {
7372                         if (exp->exp_min <= sub->exp_min) {
7373 #ifdef DEBUG_DERIV
7374                             printf("Infinite loops Okay => COUNT(0,Inf)\n");
7375 #endif
7376                             max = -1;
7377                             min = 0;
7378                         } else {
7379 #ifdef DEBUG_DERIV
7380                             printf("Infinite loops min => Count(X,Inf)\n");
7381 #endif
7382                             max = -1;
7383                             min = exp->exp_min - sub->exp_min;
7384                         }
7385                     } else if (exp->exp_min > sub->exp_min) {
7386 #ifdef DEBUG_DERIV
7387                         printf("loops min mismatch 1 => forbidden ???\n");
7388 #endif
7389                         xmlExpFree(ctxt, tmp);
7390                         return(forbiddenExp);
7391                     } else {
7392                         max = -1;
7393                         min = 0;
7394                     }
7395                 } else {
7396                     if (exp->exp_max == -1) {
7397 #ifdef DEBUG_DERIV
7398                         printf("Infinite loop consume finite loop\n");
7399 #endif
7400                         if (exp->exp_min > sub->exp_min) {
7401                             max = -1;
7402                             min = exp->exp_min - sub->exp_min;
7403                         } else {
7404                             max = -1;
7405                             min = 0;
7406                         }
7407                     } else {
7408                         if (exp->exp_max < sub->exp_max) {
7409 #ifdef DEBUG_DERIV
7410                             printf("loops max mismatch => forbidden\n");
7411 #endif
7412                             xmlExpFree(ctxt, tmp);
7413                             return(forbiddenExp);
7414                         }
7415                         if (sub->exp_max > exp->exp_min)
7416                             min = 0;
7417                         else
7418                             min = exp->exp_min - sub->exp_max;
7419                         max = exp->exp_max - sub->exp_max;
7420                     }
7421                 }
7422 #ifdef DEBUG_DERIV
7423                 printf("loops match => SEQ(COUNT())\n");
7424 #endif
7425                 exp->exp_left->ref++;
7426                 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7427                                           NULL, NULL, min, max);
7428                 if (tmp2 == NULL) {
7429                     return(NULL);
7430                 }
7431                 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7432                                          NULL, 0, 0);
7433                 return(ret);
7434             }
7435             tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7436             if (tmp == NULL)
7437                 return(NULL);
7438             if (tmp == forbiddenExp) {
7439 #ifdef DEBUG_DERIV
7440                 printf("loop mismatch => forbidden\n");
7441 #endif
7442                 return(forbiddenExp);
7443             }
7444             if (exp->exp_min > 0)
7445                 min = exp->exp_min - 1;
7446             else
7447                 min = 0;
7448             if (exp->exp_max < 0)
7449                 max = -1;
7450             else
7451                 max = exp->exp_max - 1;
7452 
7453 #ifdef DEBUG_DERIV
7454             printf("loop match => SEQ(COUNT())\n");
7455 #endif
7456             exp->exp_left->ref++;
7457             tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7458                                       NULL, NULL, min, max);
7459             if (tmp2 == NULL)
7460                 return(NULL);
7461             ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7462                                      NULL, 0, 0);
7463             return(ret);
7464         }
7465     }
7466 
7467 #ifdef DEBUG_DERIV
7468     printf("Fallback to derivative\n");
7469 #endif
7470     if (IS_NILLABLE(sub)) {
7471         if (!(IS_NILLABLE(exp)))
7472             return(forbiddenExp);
7473         else
7474             ret = emptyExp;
7475     } else
7476         ret = NULL;
7477     /*
7478      * here the structured derivation made no progress so
7479      * we use the default token based derivation to force one more step
7480      */
7481     if (ctxt->tabSize == 0)
7482         ctxt->tabSize = 40;
7483 
7484     tab = (const xmlChar **) xmlMalloc(ctxt->tabSize *
7485                                        sizeof(const xmlChar *));
7486     if (tab == NULL) {
7487         return(NULL);
7488     }
7489 
7490     /*
7491      * collect all the strings accepted by the subexpression on input
7492      */
7493     len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7494     while (len < 0) {
7495         const xmlChar **temp;
7496         temp = (const xmlChar **) xmlRealloc((xmlChar **) tab, ctxt->tabSize * 2 *
7497                                              sizeof(const xmlChar *));
7498         if (temp == NULL) {
7499             xmlFree((xmlChar **) tab);
7500             return(NULL);
7501         }
7502         tab = temp;
7503         ctxt->tabSize *= 2;
7504         len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7505     }
7506     for (i = 0;i < len;i++) {
7507         tmp = xmlExpStringDeriveInt(ctxt, exp, tab[i]);
7508         if ((tmp == NULL) || (tmp == forbiddenExp)) {
7509             xmlExpFree(ctxt, ret);
7510             xmlFree((xmlChar **) tab);
7511             return(tmp);
7512         }
7513         tmp2 = xmlExpStringDeriveInt(ctxt, sub, tab[i]);
7514         if ((tmp2 == NULL) || (tmp2 == forbiddenExp)) {
7515             xmlExpFree(ctxt, tmp);
7516             xmlExpFree(ctxt, ret);
7517             xmlFree((xmlChar **) tab);
7518             return(tmp);
7519         }
7520         tmp3 = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7521         xmlExpFree(ctxt, tmp);
7522         xmlExpFree(ctxt, tmp2);
7523 
7524         if ((tmp3 == NULL) || (tmp3 == forbiddenExp)) {
7525             xmlExpFree(ctxt, ret);
7526             xmlFree((xmlChar **) tab);
7527             return(tmp3);
7528         }
7529 
7530         if (ret == NULL)
7531             ret = tmp3;
7532         else {
7533             ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp3, NULL, 0, 0);
7534             if (ret == NULL) {
7535                 xmlFree((xmlChar **) tab);
7536                 return(NULL);
7537             }
7538         }
7539     }
7540     xmlFree((xmlChar **) tab);
7541     return(ret);
7542 }
7543 
7544 /**
7545  * xmlExpExpDerive:
7546  * @ctxt: the expressions context
7547  * @exp: the englobing expression
7548  * @sub: the subexpression
7549  *
7550  * Evaluates the expression resulting from @exp consuming a sub expression @sub
7551  * Based on algebraic derivation and sometimes direct Brzozowski derivation
7552  * it usually tatkes less than linear time and can handle expressions generating
7553  * infinite languages.
7554  *
7555  * Returns the resulting expression or NULL in case of internal error, the
7556  *         result must be freed
7557  */
7558 xmlExpNodePtr
xmlExpExpDerive(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,xmlExpNodePtr sub)7559 xmlExpExpDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7560     if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7561         return(NULL);
7562 
7563     /*
7564      * O(1) speedups
7565      */
7566     if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7567 #ifdef DEBUG_DERIV
7568         printf("Sub nillable and not exp : can't subsume\n");
7569 #endif
7570         return(forbiddenExp);
7571     }
7572     if (xmlExpCheckCard(exp, sub) == 0) {
7573 #ifdef DEBUG_DERIV
7574         printf("sub generate longuer sequances than exp : can't subsume\n");
7575 #endif
7576         return(forbiddenExp);
7577     }
7578     return(xmlExpExpDeriveInt(ctxt, exp, sub));
7579 }
7580 
7581 /**
7582  * xmlExpSubsume:
7583  * @ctxt: the expressions context
7584  * @exp: the englobing expression
7585  * @sub: the subexpression
7586  *
7587  * Check whether @exp accepts all the languages accexpted by @sub
7588  * the input being a subexpression.
7589  *
7590  * Returns 1 if true 0 if false and -1 in case of failure.
7591  */
7592 int
xmlExpSubsume(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,xmlExpNodePtr sub)7593 xmlExpSubsume(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7594     xmlExpNodePtr tmp;
7595 
7596     if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7597         return(-1);
7598 
7599     /*
7600      * TODO: speedup by checking the language of sub is a subset of the
7601      *       language of exp
7602      */
7603     /*
7604      * O(1) speedups
7605      */
7606     if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7607 #ifdef DEBUG_DERIV
7608         printf("Sub nillable and not exp : can't subsume\n");
7609 #endif
7610         return(0);
7611     }
7612     if (xmlExpCheckCard(exp, sub) == 0) {
7613 #ifdef DEBUG_DERIV
7614         printf("sub generate longuer sequances than exp : can't subsume\n");
7615 #endif
7616         return(0);
7617     }
7618     tmp = xmlExpExpDeriveInt(ctxt, exp, sub);
7619 #ifdef DEBUG_DERIV
7620     printf("Result derivation :\n");
7621     PRINT_EXP(tmp);
7622 #endif
7623     if (tmp == NULL)
7624         return(-1);
7625     if (tmp == forbiddenExp)
7626         return(0);
7627     if (tmp == emptyExp)
7628         return(1);
7629     if ((tmp != NULL) && (IS_NILLABLE(tmp))) {
7630         xmlExpFree(ctxt, tmp);
7631         return(1);
7632     }
7633     xmlExpFree(ctxt, tmp);
7634     return(0);
7635 }
7636 
7637 /************************************************************************
7638  *                                                                      *
7639  *                      Parsing expression                              *
7640  *                                                                      *
7641  ************************************************************************/
7642 
7643 static xmlExpNodePtr xmlExpParseExpr(xmlExpCtxtPtr ctxt);
7644 
7645 #undef CUR
7646 #define CUR (*ctxt->cur)
7647 #undef NEXT
7648 #define NEXT ctxt->cur++;
7649 #undef IS_BLANK
7650 #define IS_BLANK(c) ((c == ' ') || (c == '\n') || (c == '\r') || (c == '\t'))
7651 #define SKIP_BLANKS while (IS_BLANK(*ctxt->cur)) ctxt->cur++;
7652 
7653 static int
xmlExpParseNumber(xmlExpCtxtPtr ctxt)7654 xmlExpParseNumber(xmlExpCtxtPtr ctxt) {
7655     int ret = 0;
7656 
7657     SKIP_BLANKS
7658     if (CUR == '*') {
7659         NEXT
7660         return(-1);
7661     }
7662     if ((CUR < '0') || (CUR > '9'))
7663         return(-1);
7664     while ((CUR >= '0') && (CUR <= '9')) {
7665         ret = ret * 10 + (CUR - '0');
7666         NEXT
7667     }
7668     return(ret);
7669 }
7670 
7671 static xmlExpNodePtr
xmlExpParseOr(xmlExpCtxtPtr ctxt)7672 xmlExpParseOr(xmlExpCtxtPtr ctxt) {
7673     const char *base;
7674     xmlExpNodePtr ret;
7675     const xmlChar *val;
7676 
7677     SKIP_BLANKS
7678     base = ctxt->cur;
7679     if (*ctxt->cur == '(') {
7680         NEXT
7681         ret = xmlExpParseExpr(ctxt);
7682         SKIP_BLANKS
7683         if (*ctxt->cur != ')') {
7684             fprintf(stderr, "unbalanced '(' : %s\n", base);
7685             xmlExpFree(ctxt, ret);
7686             return(NULL);
7687         }
7688         NEXT;
7689         SKIP_BLANKS
7690         goto parse_quantifier;
7691     }
7692     while ((CUR != 0) && (!(IS_BLANK(CUR))) && (CUR != '(') &&
7693            (CUR != ')') && (CUR != '|') && (CUR != ',') && (CUR != '{') &&
7694            (CUR != '*') && (CUR != '+') && (CUR != '?') && (CUR != '}'))
7695         NEXT;
7696     val = xmlDictLookup(ctxt->dict, BAD_CAST base, ctxt->cur - base);
7697     if (val == NULL)
7698         return(NULL);
7699     ret = xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, val, 0, 0);
7700     if (ret == NULL)
7701         return(NULL);
7702     SKIP_BLANKS
7703 parse_quantifier:
7704     if (CUR == '{') {
7705         int min, max;
7706 
7707         NEXT
7708         min = xmlExpParseNumber(ctxt);
7709         if (min < 0) {
7710             xmlExpFree(ctxt, ret);
7711             return(NULL);
7712         }
7713         SKIP_BLANKS
7714         if (CUR == ',') {
7715             NEXT
7716             max = xmlExpParseNumber(ctxt);
7717             SKIP_BLANKS
7718         } else
7719             max = min;
7720         if (CUR != '}') {
7721             xmlExpFree(ctxt, ret);
7722             return(NULL);
7723         }
7724         NEXT
7725         ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7726                                  min, max);
7727         SKIP_BLANKS
7728     } else if (CUR == '?') {
7729         NEXT
7730         ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7731                                  0, 1);
7732         SKIP_BLANKS
7733     } else if (CUR == '+') {
7734         NEXT
7735         ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7736                                  1, -1);
7737         SKIP_BLANKS
7738     } else if (CUR == '*') {
7739         NEXT
7740         ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7741                                  0, -1);
7742         SKIP_BLANKS
7743     }
7744     return(ret);
7745 }
7746 
7747 
7748 static xmlExpNodePtr
xmlExpParseSeq(xmlExpCtxtPtr ctxt)7749 xmlExpParseSeq(xmlExpCtxtPtr ctxt) {
7750     xmlExpNodePtr ret, right;
7751 
7752     ret = xmlExpParseOr(ctxt);
7753     SKIP_BLANKS
7754     while (CUR == '|') {
7755         NEXT
7756         right = xmlExpParseOr(ctxt);
7757         if (right == NULL) {
7758             xmlExpFree(ctxt, ret);
7759             return(NULL);
7760         }
7761         ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, right, NULL, 0, 0);
7762         if (ret == NULL)
7763             return(NULL);
7764     }
7765     return(ret);
7766 }
7767 
7768 static xmlExpNodePtr
xmlExpParseExpr(xmlExpCtxtPtr ctxt)7769 xmlExpParseExpr(xmlExpCtxtPtr ctxt) {
7770     xmlExpNodePtr ret, right;
7771 
7772     ret = xmlExpParseSeq(ctxt);
7773     SKIP_BLANKS
7774     while (CUR == ',') {
7775         NEXT
7776         right = xmlExpParseSeq(ctxt);
7777         if (right == NULL) {
7778             xmlExpFree(ctxt, ret);
7779             return(NULL);
7780         }
7781         ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, right, NULL, 0, 0);
7782         if (ret == NULL)
7783             return(NULL);
7784     }
7785     return(ret);
7786 }
7787 
7788 /**
7789  * xmlExpParse:
7790  * @ctxt: the expressions context
7791  * @expr: the 0 terminated string
7792  *
7793  * Minimal parser for regexps, it understand the following constructs
7794  *  - string terminals
7795  *  - choice operator |
7796  *  - sequence operator ,
7797  *  - subexpressions (...)
7798  *  - usual cardinality operators + * and ?
7799  *  - finite sequences  { min, max }
7800  *  - infinite sequences { min, * }
7801  * There is minimal checkings made especially no checking on strings values
7802  *
7803  * Returns a new expression or NULL in case of failure
7804  */
7805 xmlExpNodePtr
xmlExpParse(xmlExpCtxtPtr ctxt,const char * expr)7806 xmlExpParse(xmlExpCtxtPtr ctxt, const char *expr) {
7807     xmlExpNodePtr ret;
7808 
7809     ctxt->expr = expr;
7810     ctxt->cur = expr;
7811 
7812     ret = xmlExpParseExpr(ctxt);
7813     SKIP_BLANKS
7814     if (*ctxt->cur != 0) {
7815         xmlExpFree(ctxt, ret);
7816         return(NULL);
7817     }
7818     return(ret);
7819 }
7820 
7821 static void
xmlExpDumpInt(xmlBufferPtr buf,xmlExpNodePtr expr,int glob)7822 xmlExpDumpInt(xmlBufferPtr buf, xmlExpNodePtr expr, int glob) {
7823     xmlExpNodePtr c;
7824 
7825     if (expr == NULL) return;
7826     if (glob) xmlBufferWriteChar(buf, "(");
7827     switch (expr->type) {
7828         case XML_EXP_EMPTY:
7829             xmlBufferWriteChar(buf, "empty");
7830             break;
7831         case XML_EXP_FORBID:
7832             xmlBufferWriteChar(buf, "forbidden");
7833             break;
7834         case XML_EXP_ATOM:
7835             xmlBufferWriteCHAR(buf, expr->exp_str);
7836             break;
7837         case XML_EXP_SEQ:
7838             c = expr->exp_left;
7839             if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
7840                 xmlExpDumpInt(buf, c, 1);
7841             else
7842                 xmlExpDumpInt(buf, c, 0);
7843             xmlBufferWriteChar(buf, " , ");
7844             c = expr->exp_right;
7845             if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
7846                 xmlExpDumpInt(buf, c, 1);
7847             else
7848                 xmlExpDumpInt(buf, c, 0);
7849             break;
7850         case XML_EXP_OR:
7851             c = expr->exp_left;
7852             if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
7853                 xmlExpDumpInt(buf, c, 1);
7854             else
7855                 xmlExpDumpInt(buf, c, 0);
7856             xmlBufferWriteChar(buf, " | ");
7857             c = expr->exp_right;
7858             if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
7859                 xmlExpDumpInt(buf, c, 1);
7860             else
7861                 xmlExpDumpInt(buf, c, 0);
7862             break;
7863         case XML_EXP_COUNT: {
7864             char rep[40];
7865 
7866             c = expr->exp_left;
7867             if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
7868                 xmlExpDumpInt(buf, c, 1);
7869             else
7870                 xmlExpDumpInt(buf, c, 0);
7871             if ((expr->exp_min == 0) && (expr->exp_max == 1)) {
7872                 rep[0] = '?';
7873                 rep[1] = 0;
7874             } else if ((expr->exp_min == 0) && (expr->exp_max == -1)) {
7875                 rep[0] = '*';
7876                 rep[1] = 0;
7877             } else if ((expr->exp_min == 1) && (expr->exp_max == -1)) {
7878                 rep[0] = '+';
7879                 rep[1] = 0;
7880             } else if (expr->exp_max == expr->exp_min) {
7881                 snprintf(rep, 39, "{%d}", expr->exp_min);
7882             } else if (expr->exp_max < 0) {
7883                 snprintf(rep, 39, "{%d,inf}", expr->exp_min);
7884             } else {
7885                 snprintf(rep, 39, "{%d,%d}", expr->exp_min, expr->exp_max);
7886             }
7887             rep[39] = 0;
7888             xmlBufferWriteChar(buf, rep);
7889             break;
7890         }
7891         default:
7892             fprintf(stderr, "Error in tree\n");
7893     }
7894     if (glob)
7895         xmlBufferWriteChar(buf, ")");
7896 }
7897 /**
7898  * xmlExpDump:
7899  * @buf:  a buffer to receive the output
7900  * @expr:  the compiled expression
7901  *
7902  * Serialize the expression as compiled to the buffer
7903  */
7904 void
xmlExpDump(xmlBufferPtr buf,xmlExpNodePtr expr)7905 xmlExpDump(xmlBufferPtr buf, xmlExpNodePtr expr) {
7906     if ((buf == NULL) || (expr == NULL))
7907         return;
7908     xmlExpDumpInt(buf, expr, 0);
7909 }
7910 
7911 /**
7912  * xmlExpMaxToken:
7913  * @expr: a compiled expression
7914  *
7915  * Indicate the maximum number of input a expression can accept
7916  *
7917  * Returns the maximum length or -1 in case of error
7918  */
7919 int
xmlExpMaxToken(xmlExpNodePtr expr)7920 xmlExpMaxToken(xmlExpNodePtr expr) {
7921     if (expr == NULL)
7922         return(-1);
7923     return(expr->c_max);
7924 }
7925 
7926 /**
7927  * xmlExpCtxtNbNodes:
7928  * @ctxt: an expression context
7929  *
7930  * Debugging facility provides the number of allocated nodes at a that point
7931  *
7932  * Returns the number of nodes in use or -1 in case of error
7933  */
7934 int
xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt)7935 xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt) {
7936     if (ctxt == NULL)
7937         return(-1);
7938     return(ctxt->nb_nodes);
7939 }
7940 
7941 /**
7942  * xmlExpCtxtNbCons:
7943  * @ctxt: an expression context
7944  *
7945  * Debugging facility provides the number of allocated nodes over lifetime
7946  *
7947  * Returns the number of nodes ever allocated or -1 in case of error
7948  */
7949 int
xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt)7950 xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt) {
7951     if (ctxt == NULL)
7952         return(-1);
7953     return(ctxt->nb_cons);
7954 }
7955 
7956 #endif /* LIBXML_EXPR_ENABLED */
7957 #define bottom_xmlregexp
7958 #include "elfgcchack.h"
7959 #endif /* LIBXML_REGEXP_ENABLED */
7960