1 /*
2  * Definitions etc. for regexp(3) routines.
3  *
4  * Caveat:  this is V8 regexp(3) [actually, a reimplementation thereof],
5  * not the System V one.
6  */
7 
8 #ifndef REP_REGEXP_H
9 #define REP_REGEXP_H
10 
11 #define rep_NSUBEXP  10
12 
13 typedef enum rep_regtype {
14     rep_reg_string = 0,
15     rep_reg_obj
16 } rep_regtype;
17 
18 typedef union rep_regsubs {
19     struct {
20 	char *startp[rep_NSUBEXP];
21 	char *endp[rep_NSUBEXP];
22     } string;
23     struct {
24 	repv startp[rep_NSUBEXP];
25 	repv endp[rep_NSUBEXP];
26     } obj;
27 } rep_regsubs;
28 
29 typedef struct rep_regexp {
30 	rep_regtype lasttype;
31 	rep_regsubs matches;
32 
33 	char regstart;	/* Internal use only. */
34 	char reganch;		/* Internal use only. */
35 	char *regmust;		/* Internal use only. */
36 	int regmlen;		/* Internal use only. */
37 	int regsize;		/* actual size of regexp structure */
38 	char program[1];	/* Unwarranted chumminess with compiler. */
39 } rep_regexp;
40 
41 /* Data structure used to save and restore regexp data internally */
42 struct rep_saved_regexp_data {
43     struct rep_saved_regexp_data *next;
44     rep_regtype type;
45     repv data;
46     rep_regsubs matches;
47 };
48 
49 /* eflags for regexec2() */
50 #define rep_REG_NOTBOL 1	/* start of input isn't start of line */
51 #define rep_REG_NOCASE 2	/* fold upper and lower case */
52 #define rep_REG_1LINE  4	/* for regexec_tx: only search to the
53 				   end of the line for the start of the
54 				   match. */
55 
56 #define rep_regexec(p,s) rep_regexec2(p,s,0)
57 
58 extern rep_regexp *rep_regcomp(char *);
59 extern int rep_regexec2(rep_regexp *, char *, int);
60 extern int rep_regmatch_string(rep_regexp *, char *, int);
61 
62 extern int rep_regexp_max_depth;
63 
64 
65 /* Only include the internal stuff if it's explicitly requested, since
66    it comtaminates the namespace.. */
67 
68 #ifdef rep_NEED_REGEXP_INTERNALS
69 
70 /*
71  * Structure for regexp "program".  This is essentially a linear encoding of
72  * a nondeterministic finite-state machine (aka syntax charts or "railroad
73  * normal form" in parsing technology).  Each node is an opcode plus a "next"
74  * pointer, possibly plus an operand.  "Next" pointers of all nodes except
75  * BRANCH implement concatenation; a "next" pointer with a BRANCH on both
76  * ends of it is connecting two alternatives.	(Here we have one of the
77  * subtle syntax dependencies:	an individual BRANCH (as opposed to a
78  * collection of them) is never concatenated with anything because of
79  * operator precedence.)  The operand of some types of node is a literal
80  * string; for others, it is a node leading into a sub-FSM.  In particular,
81  * the operand of a BRANCH node is the first node of the branch. (NB this is
82  * *not* a tree structure:  the tail of the branch connects to the thing
83  * following the set of BRANCHes.)  The opcodes are:
84  */
85 
86 /* definition	number	opnd?	meaning */
87 #define END	0		/* no	End of program. */
88 #define BOL	1		/* no	Match "" at beginning of line. */
89 #define EOL	2		/* no	Match "" at end of line. */
90 #define ANY	3		/* no	Match any one character. */
91 #define ANYOF	4		/* str	Match any character in this string. */
92 #define ANYBUT	5		/* str	Match any character not in this
93 				 * string. */
94 #define BRANCH	6		/* node Match this alternative, or the
95 				 * next... */
96 #define BACK	7		/* no	Match "", "next" ptr points backward. */
97 #define EXACTLY 8		/* str	Match this string. */
98 #define NOTHING 9		/* no	Match empty string. */
99 #define STAR	10		/* node Match this (simple) thing 0 or more
100 				 * times. */
101 #define PLUS	11		/* node Match this (simple) thing 1 or more
102 				 * times. */
103 #define WORD	12		/* no   Match alphanumeric or _ char */
104 #define NWORD	13		/* no   Match non-(alphanumeric or _) char */
105 #define WSPC	14		/* no   Match whitespace char */
106 #define NWSPC	15		/* no   Match non-whitespace char */
107 #define DIGI	16		/* no   Match digit char */
108 #define NDIGI	17		/* no   Match non-digit char */
109 #define WEDGE	18		/* no	Match "" at word boundary */
110 #define NWEDGE	19		/* no	Match "" not at word boundary */
111 #define OPEN	20		/* no	Mark this point in input as start of
112 				 * #n. */
113 /* OPEN+1 is number 1, etc. */
114 #define CLOSE	30		/* no	Analogous to OPEN. */
115 #define NGSTAR	40		/* node Match this (simple) thing 0 or more
116 				   times (non-greedily) */
117 #define NGPLUS	41		/* node	Match this (simple) thing 1 or more
118 				   times (non-greedily) */
119 
120 /*
121  * Opcode notes:
122  *
123  * BRANCH	The set of branches constituting a single choice are hooked together
124  * with their "next" pointers, since precedence prevents anything being
125  * concatenated to any individual branch.  The "next" pointer of the last
126  * BRANCH in a choice points to the thing following the whole choice.  This
127  * is also where the final "next" pointer of each individual branch points;
128  * each branch starts with the operand node of a BRANCH node.
129  *
130  * BACK		Normal "next" pointers all implicitly point forward; BACK exists to
131  * make loop structures possible.
132  *
133  * STAR,PLUS	'?', and complex '*' and '+', are implemented as circular
134  * BRANCH structures using BACK.  Simple cases (one character per match) are
135  * implemented with STAR and PLUS for speed and to minimize recursive
136  * plunges.
137  *
138  * OPEN,CLOSE	...are numbered at compile time.
139  */
140 
141 /*
142  * A node is one char of opcode followed by two chars of "next" pointer.
143  * "Next" pointers are stored as two 8-bit pieces, high order first.  The
144  * value is a positive offset from the opcode of the node containing it. An
145  * operand, if any, simply follows the node.  (Note that much of the code
146  * generation knows about this implicit relationship.)
147  *
148  * Using two bytes for the "next" pointer is vast overkill for most things, but
149  * allows patterns to get big without disasters.
150  */
151 #define OP(p)	(*(p))
152 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
153 #define OPERAND(p)	((p) + 3)
154 
155 
156 /*
157  * The first byte of the regexp internal "program" is actually this magic
158  * number; the start node begins in the second byte.
159  */
160 #define	MAGIC	0234
161 
162 #endif /* rep_NEED_REGEXP_INTERNALS */
163 
164 #endif /* REP_REGEXP_H */
165