1 /*
2  * Internal interface definitions, etc., for the reg package
3  *
4  * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
5  *
6  * Development of this software was funded, in part, by Cray Research Inc.,
7  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
8  * Corporation, none of whom are responsible for the results.  The author
9  * thanks all of them.
10  *
11  * Redistribution and use in source and binary forms -- with or without
12  * modification -- are permitted for any purpose, provided that
13  * redistributions in source form retain this entire copyright notice and
14  * indicate the origin and nature of any modifications.
15  *
16  * I'd appreciate being given credit for this package in the documentation
17  * of software which uses it, but that is not a requirement.
18  *
19  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
20  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
21  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
22  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  *
30  * src/include/regex/regguts.h
31  */
32 
33 
34 
35 /*
36  * Environmental customization.  It should not (I hope) be necessary to
37  * alter the file you are now reading -- regcustom.h should handle it all,
38  * given care here and elsewhere.
39  */
40 #include "regcustom.h"
41 
42 
43 
44 /*
45  * Things that regcustom.h might override.
46  */
47 
48 /* assertions */
49 #ifndef assert
50 #ifndef REG_DEBUG
51 #define  NDEBUG					/* no assertions */
52 #endif
53 #include <assert.h>
54 #endif
55 
56 /* voids */
57 #ifndef DISCARD
58 #define DISCARD void			/* for throwing values away */
59 #endif
60 #ifndef VS
61 #define VS(x)	((void *)(x))	/* cast something to generic ptr */
62 #endif
63 
64 /* function-pointer declarator */
65 #ifndef FUNCPTR
66 #define FUNCPTR(name, args) (*(name)) args
67 #endif
68 
69 /* memory allocation */
70 #ifndef MALLOC
71 #define MALLOC(n)	malloc(n)
72 #endif
73 #ifndef REALLOC
74 #define REALLOC(p, n)	realloc(VS(p), n)
75 #endif
76 #ifndef FREE
77 #define FREE(p)		free(VS(p))
78 #endif
79 
80 /* want size of a char in bits, and max value in bounded quantifiers */
81 #ifndef _POSIX2_RE_DUP_MAX
82 #define _POSIX2_RE_DUP_MAX	255 /* normally from <limits.h> */
83 #endif
84 
85 
86 
87 /*
88  * misc
89  */
90 
91 #define NOTREACHED	0
92 
93 #define DUPMAX	_POSIX2_RE_DUP_MAX
94 #define DUPINF	(DUPMAX+1)
95 
96 #define REMAGIC 0xfed7			/* magic number for main struct */
97 
98 /* Type codes for lookaround constraints */
99 #define LATYPE_AHEAD_POS	03	/* positive lookahead */
100 #define LATYPE_AHEAD_NEG	02	/* negative lookahead */
101 #define LATYPE_BEHIND_POS	01	/* positive lookbehind */
102 #define LATYPE_BEHIND_NEG	00	/* negative lookbehind */
103 #define LATYPE_IS_POS(la)	((la) & 01)
104 #define LATYPE_IS_AHEAD(la) ((la) & 02)
105 
106 
107 /*
108  * debugging facilities
109  */
110 #ifdef REG_DEBUG
111 /* FDEBUG does finite-state tracing */
112 #define FDEBUG(arglist) { if (v->eflags&REG_FTRACE) printf arglist; }
113 /* MDEBUG does higher-level tracing */
114 #define MDEBUG(arglist) { if (v->eflags&REG_MTRACE) printf arglist; }
115 #else
116 #define FDEBUG(arglist) {}
117 #define MDEBUG(arglist) {}
118 #endif
119 
120 
121 
122 /*
123  * bitmap manipulation
124  */
125 #define UBITS	(CHAR_BIT * sizeof(unsigned))
126 #define BSET(uv, sn)	((uv)[(sn)/UBITS] |= (unsigned)1 << ((sn)%UBITS))
127 #define ISBSET(uv, sn)	((uv)[(sn)/UBITS] & ((unsigned)1 << ((sn)%UBITS)))
128 
129 
130 /*
131  * known character classes
132  */
133 enum char_classes
134 {
135 	CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
136 	CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT, CC_WORD
137 };
138 
139 #define NUM_CCLASSES 14
140 
141 
142 /*
143  * As soon as possible, we map chrs into equivalence classes -- "colors" --
144  * which are of much more manageable number.
145  *
146  * To further reduce the number of arcs in NFAs and DFAs, we also have a
147  * special RAINBOW "color" that can be assigned to an arc.  This is not a
148  * real color, in that it has no entry in color maps.
149  */
150 typedef short color;			/* colors of characters */
151 
152 #define MAX_COLOR	32767		/* max color (must fit in 'color' datatype) */
153 #define COLORLESS	(-1)		/* impossible color */
154 #define RAINBOW		(-2)		/* represents all colors except pseudocolors */
155 #define WHITE		0			/* default color, parent of all others */
156 /* Note: various places in the code know that WHITE is zero */
157 
158 
159 /*
160  * Per-color data structure for the compile-time color machinery
161  *
162  * If "sub" is not NOSUB then it is the number of the color's current
163  * subcolor, i.e. we are in process of dividing this color (character
164  * equivalence class) into two colors.  See src/backend/regex/README for
165  * discussion of subcolors.
166  *
167  * Currently-unused colors have the FREECOL bit set and are linked into a
168  * freelist using their "sub" fields, but only if their color numbers are
169  * less than colormap.max.  Any array entries beyond "max" are just garbage.
170  */
171 struct colordesc
172 {
173 	int			nschrs;			/* number of simple chars of this color */
174 	int			nuchrs;			/* number of upper map entries of this color */
175 	color		sub;			/* open subcolor, if any; or free-chain ptr */
176 #define  NOSUB	 COLORLESS		/* value of "sub" when no open subcolor */
177 	struct arc *arcs;			/* chain of all arcs of this color */
178 	chr			firstchr;		/* simple char first assigned to this color */
179 	int			flags;			/* bitmask of the following flags: */
180 #define  FREECOL 01				/* currently free */
181 #define  PSEUDO  02				/* pseudocolor, no real chars */
182 #define  COLMARK 04				/* temporary marker used in some functions */
183 };
184 
185 #define  UNUSEDCOLOR(cd) ((cd)->flags & FREECOL)
186 
187 /*
188  * The color map itself
189  *
190  * This struct holds both data used only at compile time, and the chr to
191  * color mapping information, used at both compile and run time.  The latter
192  * is the bulk of the space, so it's not really worth separating out the
193  * compile-only portion.
194  *
195  * Ideally, the mapping data would just be an array of colors indexed by
196  * chr codes; but for large character sets that's impractical.  Fortunately,
197  * common characters have smaller codes, so we can use a simple array for chr
198  * codes up to MAX_SIMPLE_CHR, and do something more complex for codes above
199  * that, without much loss of performance.  The "something more complex" is a
200  * 2-D array of color entries, where row indexes correspond to individual chrs
201  * or chr ranges that have been mentioned in the regex (with row zero
202  * representing all other chrs), and column indexes correspond to different
203  * sets of locale-dependent character classes such as "isalpha".  The
204  * classbits[k] entry is zero if we do not care about the k'th character class
205  * in this regex, and otherwise it is the bit to be OR'd into the column index
206  * if the character in question is a member of that class.  We find the color
207  * of a high-valued chr by identifying which colormaprange it is in to get
208  * the row index (use row zero if it's in none of them), identifying which of
209  * the interesting cclasses it's in to get the column index, and then indexing
210  * into the 2-D hicolormap array.
211  *
212  * The colormapranges are required to be nonempty, nonoverlapping, and to
213  * appear in increasing chr-value order.
214  */
215 
216 typedef struct colormaprange
217 {
218 	chr			cmin;			/* range represents cmin..cmax inclusive */
219 	chr			cmax;
220 	int			rownum;			/* row index in hicolormap array (>= 1) */
221 } colormaprange;
222 
223 struct colormap
224 {
225 	int			magic;
226 #define  CMMAGIC 0x876
227 	struct vars *v;				/* for compile error reporting */
228 	size_t		ncds;			/* allocated length of colordescs array */
229 	size_t		max;			/* highest color number currently in use */
230 	color		free;			/* beginning of free chain (if non-0) */
231 	struct colordesc *cd;		/* pointer to array of colordescs */
232 #define  CDEND(cm)	 (&(cm)->cd[(cm)->max + 1])
233 
234 	/* mapping data for chrs <= MAX_SIMPLE_CHR: */
235 	color	   *locolormap;		/* simple array indexed by chr code */
236 
237 	/* mapping data for chrs > MAX_SIMPLE_CHR: */
238 	int			classbits[NUM_CCLASSES];	/* see comment above */
239 	int			numcmranges;	/* number of colormapranges */
240 	colormaprange *cmranges;	/* ranges of high chrs */
241 	color	   *hicolormap;		/* 2-D array of color entries */
242 	int			maxarrayrows;	/* number of array rows allocated */
243 	int			hiarrayrows;	/* number of array rows in use */
244 	int			hiarraycols;	/* number of array columns (2^N) */
245 
246 	/* If we need up to NINLINECDS, we store them here to save a malloc */
247 #define  NINLINECDS  ((size_t) 10)
248 	struct colordesc cdspace[NINLINECDS];
249 };
250 
251 /* fetch color for chr; beware of multiple evaluation of c argument */
252 #define GETCOLOR(cm, c) \
253 	((c) <= MAX_SIMPLE_CHR ? (cm)->locolormap[(c) - CHR_MIN] : pg_reg_getcolor(cm, c))
254 
255 
256 /*
257  * Interface definitions for locale-interface functions in regc_locale.c.
258  */
259 
260 /*
261  * Representation of a set of characters.  chrs[] represents individual
262  * code points, ranges[] represents ranges in the form min..max inclusive.
263  *
264  * If the cvec represents a locale-specific character class, eg [[:alpha:]],
265  * then the chrs[] and ranges[] arrays contain only members of that class
266  * up to MAX_SIMPLE_CHR (inclusive).  cclasscode is set to regc_locale.c's
267  * code for the class, rather than being -1 as it is in an ordinary cvec.
268  *
269  * Note that in cvecs gotten from newcvec() and intended to be freed by
270  * freecvec(), both arrays of chrs are after the end of the struct, not
271  * separately malloc'd; so chrspace and rangespace are effectively immutable.
272  */
273 struct cvec
274 {
275 	int			nchrs;			/* number of chrs */
276 	int			chrspace;		/* number of chrs allocated in chrs[] */
277 	chr		   *chrs;			/* pointer to vector of chrs */
278 	int			nranges;		/* number of ranges (chr pairs) */
279 	int			rangespace;		/* number of ranges allocated in ranges[] */
280 	chr		   *ranges;			/* pointer to vector of chr pairs */
281 	int			cclasscode;		/* value of "enum classes", or -1 */
282 };
283 
284 
285 /*
286  * definitions for NFA internal representation
287  */
288 struct state;
289 
290 struct arc
291 {
292 	int			type;			/* 0 if free, else an NFA arc type code */
293 	color		co;				/* color the arc matches (possibly RAINBOW) */
294 	struct state *from;			/* where it's from */
295 	struct state *to;			/* where it's to */
296 	struct arc *outchain;		/* link in *from's outs chain or free chain */
297 	struct arc *outchainRev;	/* back-link in *from's outs chain */
298 #define  freechain	outchain	/* we do not maintain "freechainRev" */
299 	struct arc *inchain;		/* link in *to's ins chain */
300 	struct arc *inchainRev;		/* back-link in *to's ins chain */
301 	/* these fields are not used when co == RAINBOW: */
302 	struct arc *colorchain;		/* link in color's arc chain */
303 	struct arc *colorchainRev;	/* back-link in color's arc chain */
304 };
305 
306 struct arcbatch
307 {								/* for bulk allocation of arcs */
308 	struct arcbatch *next;		/* chain link */
309 	size_t		narcs;			/* number of arcs allocated in this arcbatch */
310 	struct arc	a[FLEXIBLE_ARRAY_MEMBER];
311 };
312 #define  ARCBATCHSIZE(n)  ((n) * sizeof(struct arc) + offsetof(struct arcbatch, a))
313 /* first batch will have FIRSTABSIZE arcs; then double it until MAXABSIZE */
314 #define  FIRSTABSIZE	64
315 #define  MAXABSIZE		1024
316 
317 struct state
318 {
319 	int			no;				/* state number, zero and up; or FREESTATE */
320 #define  FREESTATE	 (-1)
321 	char		flag;			/* marks special states */
322 	int			nins;			/* number of inarcs */
323 	int			nouts;			/* number of outarcs */
324 	struct arc *ins;			/* chain of inarcs */
325 	struct arc *outs;			/* chain of outarcs */
326 	struct state *tmp;			/* temporary for traversal algorithms */
327 	struct state *next;			/* chain for traversing all live states */
328 	/* the "next" field is also used to chain free states together */
329 	struct state *prev;			/* back-link in chain of all live states */
330 };
331 
332 struct statebatch
333 {								/* for bulk allocation of states */
334 	struct statebatch *next;	/* chain link */
335 	size_t		nstates;		/* number of states allocated in this batch */
336 	struct state s[FLEXIBLE_ARRAY_MEMBER];
337 };
338 #define  STATEBATCHSIZE(n)  ((n) * sizeof(struct state) + offsetof(struct statebatch, s))
339 /* first batch will have FIRSTSBSIZE states; then double it until MAXSBSIZE */
340 #define  FIRSTSBSIZE	32
341 #define  MAXSBSIZE		1024
342 
343 struct nfa
344 {
345 	struct state *pre;			/* pre-initial state */
346 	struct state *init;			/* initial state */
347 	struct state *final;		/* final state */
348 	struct state *post;			/* post-final state */
349 	int			nstates;		/* for numbering states */
350 	struct state *states;		/* chain of live states */
351 	struct state *slast;		/* tail of the chain */
352 	struct state *freestates;	/* chain of free states */
353 	struct arc *freearcs;		/* chain of free arcs */
354 	struct statebatch *lastsb;	/* chain of statebatches */
355 	struct arcbatch *lastab;	/* chain of arcbatches */
356 	size_t		lastsbused;		/* number of states consumed from *lastsb */
357 	size_t		lastabused;		/* number of arcs consumed from *lastab */
358 	struct colormap *cm;		/* the color map */
359 	color		bos[2];			/* colors, if any, assigned to BOS and BOL */
360 	color		eos[2];			/* colors, if any, assigned to EOS and EOL */
361 	int			flags;			/* flags to pass forward to cNFA */
362 	int			minmatchall;	/* min number of chrs to match, if matchall */
363 	int			maxmatchall;	/* max number of chrs to match, or DUPINF */
364 	struct vars *v;				/* simplifies compile error reporting */
365 	struct nfa *parent;			/* parent NFA, if any */
366 };
367 
368 
369 
370 /*
371  * definitions for compacted NFA
372  *
373  * The main space savings in a compacted NFA is from making the arcs as small
374  * as possible.  We store only the transition color and next-state number for
375  * each arc.  The list of out arcs for each state is an array beginning at
376  * cnfa.states[statenumber], and terminated by a dummy carc struct with
377  * co == COLORLESS.
378  *
379  * The non-dummy carc structs are of two types: plain arcs and LACON arcs.
380  * Plain arcs just store the transition color number as "co".  LACON arcs
381  * store the lookaround constraint number plus cnfa.ncolors as "co".  LACON
382  * arcs can be distinguished from plain by testing for co >= cnfa.ncolors.
383  *
384  * Note that in a plain arc, "co" can be RAINBOW; since that's negative,
385  * it doesn't break the rule about how to recognize LACON arcs.
386  *
387  * We have special markings for "trivial" NFAs that can match any string
388  * (possibly with limits on the number of characters therein).  In such a
389  * case, flags & MATCHALL is set (and HASLACONS can't be set).  Then the
390  * fields minmatchall and maxmatchall give the minimum and maximum numbers
391  * of characters to match.  For example, ".*" produces minmatchall = 0
392  * and maxmatchall = DUPINF, while ".+" produces minmatchall = 1 and
393  * maxmatchall = DUPINF.
394  */
395 struct carc
396 {
397 	color		co;				/* COLORLESS is list terminator */
398 	int			to;				/* next-state number */
399 };
400 
401 struct cnfa
402 {
403 	int			nstates;		/* number of states */
404 	int			ncolors;		/* number of colors (max color in use + 1) */
405 	int			flags;			/* bitmask of the following flags: */
406 #define  HASLACONS	01			/* uses lookaround constraints */
407 #define  MATCHALL	02			/* matches all strings of a range of lengths */
408 	int			pre;			/* setup state number */
409 	int			post;			/* teardown state number */
410 	color		bos[2];			/* colors, if any, assigned to BOS and BOL */
411 	color		eos[2];			/* colors, if any, assigned to EOS and EOL */
412 	char	   *stflags;		/* vector of per-state flags bytes */
413 #define  CNFA_NOPROGRESS	01	/* flag bit for a no-progress state */
414 	struct carc **states;		/* vector of pointers to outarc lists */
415 	/* states[n] are pointers into a single malloc'd array of arcs */
416 	struct carc *arcs;			/* the area for the lists */
417 	/* these fields are used only in a MATCHALL NFA (else they're -1): */
418 	int			minmatchall;	/* min number of chrs to match */
419 	int			maxmatchall;	/* max number of chrs to match, or DUPINF */
420 };
421 
422 /*
423  * When debugging, it's helpful if an un-filled CNFA is all-zeroes.
424  * In production, though, we only require nstates to be zero.
425  */
426 #ifdef REG_DEBUG
427 #define ZAPCNFA(cnfa)	memset(&(cnfa), 0, sizeof(cnfa))
428 #else
429 #define ZAPCNFA(cnfa)	((cnfa).nstates = 0)
430 #endif
431 #define NULLCNFA(cnfa)	((cnfa).nstates == 0)
432 
433 /*
434  * This symbol limits the transient heap space used by the regex compiler,
435  * and thereby also the maximum complexity of NFAs that we'll deal with.
436  * Currently we only count NFA states and arcs against this; the other
437  * transient data is generally not large enough to notice compared to those.
438  * Note that we do not charge anything for the final output data structures
439  * (the compacted NFA and the colormap).
440  * The scaling here is based on an empirical measurement that very large
441  * NFAs tend to have about 4 arcs/state.
442  */
443 #ifndef REG_MAX_COMPILE_SPACE
444 #define REG_MAX_COMPILE_SPACE  \
445 	(500000 * (sizeof(struct state) + 4 * sizeof(struct arc)))
446 #endif
447 
448 /*
449  * subexpression tree
450  *
451  * "op" is one of:
452  *		'='  plain regex without interesting substructure (implemented as DFA)
453  *		'b'  back-reference (has no substructure either)
454  *		'('  no-op capture node: captures the match of its single child
455  *		'.'  concatenation: matches a match for first child, then second child
456  *		'|'  alternation: matches a match for any of its children
457  *		'*'  iteration: matches some number of matches of its single child
458  *
459  * An alternation node can have any number of children (but at least two),
460  * linked through their sibling fields.
461  *
462  * A concatenation node must have exactly two children.  It might be useful
463  * to support more, but that would complicate the executor.  Note that it is
464  * the first child's greediness that determines the node's preference for
465  * where to split a match.
466  *
467  * Note: when a backref is directly quantified, we stick the min/max counts
468  * into the backref rather than plastering an iteration node on top.  This is
469  * for efficiency: there is no need to search for possible division points.
470  */
471 struct subre
472 {
473 	char		op;				/* see type codes above */
474 	char		flags;
475 #define  LONGER  01				/* prefers longer match */
476 #define  SHORTER 02				/* prefers shorter match */
477 #define  MIXED	 04				/* mixed preference below */
478 #define  CAP	 010			/* capturing parens here or below */
479 #define  BACKR	 020			/* back reference here or below */
480 #define  INUSE	 0100			/* in use in final tree */
481 #define  NOPROP  03				/* bits which may not propagate up */
482 #define  LMIX(f) ((f)<<2)		/* LONGER -> MIXED */
483 #define  SMIX(f) ((f)<<1)		/* SHORTER -> MIXED */
484 #define  UP(f)	 (((f)&~NOPROP) | (LMIX(f) & SMIX(f) & MIXED))
485 #define  MESSY(f)	 ((f)&(MIXED|CAP|BACKR))
486 #define  PREF(f) ((f)&NOPROP)
487 #define  PREF2(f1, f2)	 ((PREF(f1) != 0) ? PREF(f1) : PREF(f2))
488 #define  COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2))
489 	char		latype;			/* LATYPE code, if lookaround constraint */
490 	int			id;				/* ID of subre (1..ntree-1) */
491 	int			capno;			/* if capture node, subno to capture into */
492 	int			backno;			/* if backref node, subno it refers to */
493 	short		min;			/* min repetitions for iteration or backref */
494 	short		max;			/* max repetitions for iteration or backref */
495 	struct subre *child;		/* first child, if any (also freelist chain) */
496 	struct subre *sibling;		/* next child of same parent, if any */
497 	struct state *begin;		/* outarcs from here... */
498 	struct state *end;			/* ...ending in inarcs here */
499 	struct cnfa cnfa;			/* compacted NFA, if any */
500 	struct subre *chain;		/* for bookkeeping and error cleanup */
501 };
502 
503 
504 
505 /*
506  * table of function pointers for generic manipulation functions
507  * A regex_t's re_fns points to one of these.
508  */
509 struct fns
510 {
511 	void		FUNCPTR(free, (regex_t *));
512 	int			FUNCPTR(cancel_requested, (void));
513 	int			FUNCPTR(stack_too_deep, (void));
514 };
515 
516 #define CANCEL_REQUESTED(re)  \
517 	((*((struct fns *) (re)->re_fns)->cancel_requested) ())
518 
519 #define STACK_TOO_DEEP(re)	\
520 	((*((struct fns *) (re)->re_fns)->stack_too_deep) ())
521 
522 
523 /*
524  * the insides of a regex_t, hidden behind a void *
525  */
526 struct guts
527 {
528 	int			magic;
529 #define  GUTSMAGIC	 0xfed9
530 	int			cflags;			/* copy of compile flags */
531 	long		info;			/* copy of re_info */
532 	size_t		nsub;			/* copy of re_nsub */
533 	struct subre *tree;
534 	struct cnfa search;			/* for fast preliminary search */
535 	int			ntree;			/* number of subre's, plus one */
536 	struct colormap cmap;
537 	int			FUNCPTR(compare, (const chr *, const chr *, size_t));
538 	struct subre *lacons;		/* lookaround-constraint vector */
539 	int			nlacons;		/* size of lacons[]; note that only slots
540 								 * numbered 1 .. nlacons-1 are used */
541 };
542 
543 
544 /* prototypes for functions that are exported from regcomp.c to regexec.c */
545 extern void pg_set_regex_collation(Oid collation);
546 extern color pg_reg_getcolor(struct colormap *cm, chr c);
547