1 /*
2  * Copyright (c) 2002 by The XFree86 Project, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE XFREE86 PROJECT BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19  * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20  * SOFTWARE.
21  *
22  * Except as contained in this notice, the name of the XFree86 Project shall
23  * not be used in advertising or otherwise to promote the sale, use or other
24  * dealings in this Software without prior written authorization from the
25  * XFree86 Project.
26  *
27  * Author: Paulo César Pereira de Andrade
28  */
29 
30 /* $XFree86: xc/programs/xedit/lisp/re/rep.h,v 1.2 2002/11/15 07:01:33 paulo Exp $ */
31 
32 #include "re.h"
33 
34 #ifndef _rep_h
35 #define _rep_h
36 
37 /*
38  * Local defines
39  */
40 
41 #ifdef MIN
42 #undef MIN
43 #endif
44 #define MIN(a, b)	((a) < (b) ? (a) : (b))
45 
46 #ifdef MAX
47 #undef MAX
48 #endif
49 #define MAX(a, b)	((a) > (b) ? (a) : (b))
50 
51 /*  This value can not be larger than 255, a depth value is the nesting of
52  * repetition operations and alternatives. The number of nested parenthesis
53  * does not matter, but a repetition on the pattern inside the parenthesis
54  * does. Note also that you cannot have more than 9 parenthesis pairs in
55  * an expression.
56  *  Depth is always at least 1. So for MAX_DEPTH 8, it is only allowed
57  * 7 complex repetitions. A complex repetition is a dot followed by an
58  * repetition operator. It is called a complex repetition because dot
59  * matches anything but the empty string, so the engine needs to test
60  * all possible combinations until the end of the string is found.
61  *  Repetitions like .* use one depth until the end of the string is found,
62  * for example a.*b.*c.*d has depth 4, while a*b*c*d has depth 2.
63  */
64 #define MAX_DEPTH	8
65 
66 /*  Minimum number of strings to generate a "large" string list, that is,
67  * sort the strings and allocate 512 extra bytes to map the first string
68  * with a given initial byte. */
69 #define LARGE_STL_COUNT	16
70 
71 /*
72  * Local types
73  */
74 /* Intermediate compilation types declaration */
75 	/* (r)egular (e)xpression (c)ompile (c)a(se) */
76 typedef struct _rec_cse rec_cse;
77 
78 	/* (r)egular (e)xpression (c)ompile (r)a(ng)e */
79 typedef struct _rec_rng rec_rng;
80 
81 	/* (r)egular (e)xpression (c)ompile (pat)tern */
82 typedef struct _rec_pat rec_pat;
83 
84 	/* (r)egular (e)xpression (c)ompile (rep)etition */
85 typedef struct _rec_rep rec_rep;
86 
87 	/* (r)egular (e)xpression (c)ompile (gr)ou(p) */
88 typedef struct _rec_grp rec_grp;
89 
90 	/* (r)egular (e)xpression (c)ompile (alt)ernatives */
91 typedef struct _rec_alt rec_alt;
92 
93 
94 /* Optimization types */
95 	/* (r)egular (e)xpression (c)ompile (st)ring (l)ist */
96 typedef struct _rec_stl rec_stl;
97 
98 /* Final compilation and execution types */
99 	/* (re)gular expression (inf)ormation */
100 typedef struct _re_inf re_inf;
101 
102 	/* (re)gular expression (eng)ine */
103 typedef struct _re_eng re_eng;
104 
105 
106 /* Codes used by the engine */
107 typedef enum {
108     /* Grouping */
109     Re_Open,			/* ( */
110     Re_Close,			/* ) */
111     Re_Update,			/* Like Re_Close, but is inside a loop */
112 
113     /* Alternatives */
114     Re_Alt,			/* Start alternative list, + next offset */
115     Re_AltNext,			/* Next alternative, + next offset */
116     Re_AltDone,			/* Finish alternative list */
117 
118     /* Repetition */
119     Re_AnyTimes,		/* * */
120     Re_Maybe,			/* ? */
121     Re_AtLeast,			/* +, at least one */
122 
123     /* Repetition like */
124     Re_AnyAnyTimes,		/* .*<re> */
125     Re_AnyMaybe,		/* .?<re> */
126     Re_AnyAtLeast,		/* .+<re> */
127 
128     Re_AnyEatAnyTimes,		/* Expression ends with .* */
129     Re_AnyEatMaybe,		/* Expression ends with .? */
130     Re_AnyEatAtLeast,		/* Expression ends with .+ */
131 
132     /* Repetition with arguments */
133     Re_Exact,			/* {e} */
134     Re_Min,			/* {n,} */
135     Re_Max,			/* {,m} */
136     Re_MinMax,			/* {n,m} */
137 
138     /* Repetition helper instruction */
139     Re_RepJump,			/* Special code, go back to repetition */
140     Re_RepLongJump,		/* Jump needs two bytes */
141 	/*  After the repetition data, all repetitions have an offset
142 	 * to the code after the repetition */
143 
144     /* Matching */
145     Re_Any,			/* . */
146     Re_Odigit,			/* \o */
147     Re_OdigitNot,		/* \O */
148     Re_Digit,			/* \d */
149     Re_DigitNot,		/* \D */
150     Re_Xdigit,			/* \x */
151     Re_XdigitNot,		/* \x */
152     Re_Space,			/* \s */
153     Re_SpaceNot,		/* \S */
154     Re_Tab,			/* \t */
155     Re_Newline,			/* \n */
156     Re_Lower,			/* \l */
157     Re_Upper,			/* \u */
158     Re_Alnum,			/* \w */
159     Re_AlnumNot,		/* \W */
160     Re_Control,			/* \c */
161     Re_ControlNot,		/* \C */
162     Re_Bol,			/* ^ */
163     Re_Eol,			/* $ */
164     Re_Bow,			/* \< */
165     Re_Eow,			/* \> */
166 
167     /* Range matching information */
168     Re_Range,			/* + 256 bytes */
169     Re_RangeNot,		/* + 256 bytes */
170 
171     /* Matching with arguments */
172     Re_Literal,			/* + character */
173     Re_CaseLiteral,		/* + lower + upper */
174     Re_LiteralNot,		/* + character */
175     Re_CaseLiteralNot,		/* + lower + upper */
176     Re_String,			/* + length + string */
177     Re_CaseString,		/* + length + string in format lower-upper */
178 
179     /* These are useful to start matching, or when RE_NOSPEC is used. */
180     Re_SearchLiteral,
181     Re_SearchCaseLiteral,
182     Re_SearchString,
183     Re_SearchCaseString,
184 
185     Re_StringList,		/* + total-length + lengths + strings */
186     Re_CaseStringList,		/* + total-length + lengths + strings */
187 
188     Re_LargeStringList,		/* + total-length + lengths + map + strings */
189     Re_LargeCaseStringList,	/* + total-length + lengths + map + strings */
190 
191     /* Backreference */
192     Re_Backref,			/* + reference number */
193 
194     /* The last codes */
195     Re_DoneIf,			/* Done if at end of input */
196     Re_MaybeDone,		/* Done */
197     Re_Done			/* If this code found, finished execution */
198 } ReCode;
199 
200 
201 /* (r)egular (e)xpresssion (pat)rern (t)ype */
202 typedef enum _rec_pat_t {
203     Rep_Literal			= Re_Literal,
204     Rep_CaseLiteral		= Re_CaseLiteral,
205     Rep_LiteralNot		= Re_LiteralNot,
206     Rep_CaseLiteralNot		= Re_CaseLiteralNot,
207     Rep_Range			= Re_Range,
208     Rep_RangeNot		= Re_RangeNot,
209     Rep_String			= Re_String,
210     Rep_CaseString		= Re_CaseString,
211     Rep_SearchLiteral		= Re_SearchLiteral,
212     Rep_SearchCaseLiteral	= Re_SearchCaseLiteral,
213     Rep_SearchString		= Re_SearchString,
214     Rep_SearchCaseString	= Re_SearchCaseString,
215     Rep_Any			= Re_Any,
216     Rep_AnyAnyTimes		= Re_AnyAnyTimes,
217     Rep_AnyEatAnyTimes		= Re_AnyEatAnyTimes,
218     Rep_AnyMaybe		= Re_AnyMaybe,
219     Rep_AnyEatMaybe		= Re_AnyEatMaybe,
220     Rep_AnyAtLeast		= Re_AnyAtLeast,
221     Rep_AnyEatAtLeast		= Re_AnyEatAtLeast,
222     Rep_Odigit			= Re_Odigit,
223     Rep_OdigitNot		= Re_OdigitNot,
224     Rep_Digit			= Re_Digit,
225     Rep_DigitNot		= Re_DigitNot,
226     Rep_Xdigit			= Re_Xdigit,
227     Rep_XdigitNot		= Re_XdigitNot,
228     Rep_Space			= Re_Space,
229     Rep_SpaceNot		= Re_SpaceNot,
230     Rep_Tab			= Re_Tab,
231     Rep_Newline			= Re_Newline,
232     Rep_Lower			= Re_Lower,
233     Rep_Upper			= Re_Upper,
234     Rep_Alnum			= Re_Alnum,
235     Rep_AlnumNot		= Re_AlnumNot,
236     Rep_Control			= Re_Control,
237     Rep_ControlNot		= Re_ControlNot,
238     Rep_Bol			= Re_Bol,
239     Rep_Eol			= Re_Eol,
240     Rep_Bow			= Re_Bow,
241     Rep_Eow			= Re_Eow,
242     Rep_Backref			= Re_Backref,
243     Rep_StringList		= Re_StringList,
244     Rep_Group			= Re_Open
245 } rec_pat_t;
246 
247 
248 /* (r)egular (e)xpression (rep)etition (t)ype */
249 typedef enum _rec_rep_t {
250     Rer_AnyTimes		= Re_AnyTimes,
251     Rer_AtLeast			= Re_AtLeast,
252     Rer_Maybe			= Re_Maybe,
253     Rer_Exact			= Re_Exact,
254     Rer_Min			= Re_Min,
255     Rer_Max			= Re_Max,
256     Rer_MinMax			= Re_MinMax
257 } rec_rep_t;
258 
259 
260 /*  Decide at re compilation time what is lowercase and what is uppercase */
261 struct _rec_cse {
262     unsigned char lower;
263     unsigned char upper;
264 };
265 
266 
267 /*  A rec_rng is used only during compilation, just a character map */
268 struct _rec_rng {
269     unsigned char range[256];
270 };
271 
272 
273 /*  A rec_pat is used only during compilation, and can be viewed as
274  * a regular expression element like a match to any character, a match
275  * to the beginning or end of the line, etc.
276  *  It is implemented as a linked list, and does not have nesting.
277  *  The data field can contain:
278  *	chr:	the value of a single character to match.
279  *	cse:	the upper and lower case value of a character to match.
280  *	rng:	a character map to match or not match.
281  *	str:	a simple string or a string where every two bytes
282  *		represents the character to match, in lower/upper
283  *		case sequence.
284  *  The rep field is not used for strings, strings are broken in the
285  * last character in this case. That is, strings are just a concatenation
286  * of several character matches.
287  */
288 struct _rec_pat {
289     rec_pat_t type;
290     rec_pat *next, *prev;	/* Linked list information */
291     union {
292 	unsigned char chr;
293 	rec_cse cse;
294 	rec_rng *rng;
295 	rec_grp *grp;
296 	unsigned char *str;
297 	rec_stl *stl;
298     } data;
299     rec_rep *rep;		/* Pattern repetition information */
300 };
301 
302 
303 /*  A rec_rep is used only during compilation, and can be viewed as:
304  *
305  *	? or * or + or {<e>} or {<m>,} or {,<M>} or {<m>,<M>}
306  *
307  * where <e> is "exact", <m> is "minimum" and <M> is "maximum".
308  *  In the compiled step it can also be just a NULL pointer, that
309  * is actually equivalent to {1}.
310  */
311 struct _rec_rep {
312     rec_rep_t type;
313     short mine;			/* minimum or exact number of matches */
314     short maxc;			/* maximum number of matches */
315 };
316 
317 
318 /*  A rec_alt is used only during compilation, and can be viewed as:
319  *
320  *	<re>|<re>
321  *
322  * where <re> is any regular expression. The expressions are nested
323  * using the grp field of the rec_pat structure.
324  */
325 struct _rec_alt {
326     rec_alt *next, *prev;	/* Linked list information */
327     rec_pat *pat;
328 };
329 
330 
331 /*  A rec_grp is a place holder for expressions enclosed in parenthesis
332  * and is linked to the compilation data by an rec_pat structure. */
333 struct _rec_grp {
334     rec_pat *parent;		/* Reference to parent pattern */
335     rec_alt *alt;		/* The pattern information */
336     rec_alt *palt;		/* Parent alternative */
337     rec_grp *pgrp;		/* Nested groups */
338     int comp;			/* (comp)lex repetition pattern inside group */
339 };
340 
341 
342 /* Optimization compilation types definition */
343 	/* (r)egular (e)xpression (c)ompile (st)ring (l)ist (t)ype */
344 typedef enum {
345     Resl_StringList		= Re_StringList,
346     Resl_CaseStringList		= Re_CaseStringList
347 } rec_stl_t;
348 
349 struct _rec_stl {
350     rec_stl_t type;
351     int nstrs;			/* Number of strings in list */
352     int tlen;			/* Total length of all strings */
353     unsigned char *lens;	/* Vector of string lengths */
354     unsigned char **strs;	/* The strings */
355 };
356 
357 
358 /*
359  * Prototypes
360  */
361 	/* rep.c */
362 rec_alt *irec_comp(const char*, const char*, int, int*);
363 void irec_free_alt(rec_alt*);
364 
365 	/* reo.c */
366 int orec_comp(rec_alt*, int);
367 void orec_free_stl(rec_stl*);
368 
369 #endif /* _rep_h */
370