1433d6423SLionel Sambuc /*
2433d6423SLionel Sambuc * regcomp and regexec -- regsub and regerror are elsewhere
3433d6423SLionel Sambuc *
4433d6423SLionel Sambuc * Copyright (c) 1986 by University of Toronto.
5433d6423SLionel Sambuc * Written by Henry Spencer. Not derived from licensed software.
6433d6423SLionel Sambuc *
7433d6423SLionel Sambuc * Permission is granted to anyone to use this software for any
8433d6423SLionel Sambuc * purpose on any computer system, and to redistribute it freely,
9433d6423SLionel Sambuc * subject to the following restrictions:
10433d6423SLionel Sambuc *
11433d6423SLionel Sambuc * 1. The author is not responsible for the consequences of use of
12433d6423SLionel Sambuc * this software, no matter how awful, even if they arise
13433d6423SLionel Sambuc * from defects in it.
14433d6423SLionel Sambuc *
15433d6423SLionel Sambuc * 2. The origin of this software must not be misrepresented, either
16433d6423SLionel Sambuc * by explicit claim or by omission.
17433d6423SLionel Sambuc *
18433d6423SLionel Sambuc * 3. Altered versions must be plainly marked as such, and must not
19433d6423SLionel Sambuc * be misrepresented as being the original software.
20433d6423SLionel Sambuc *
21433d6423SLionel Sambuc * Beware that some of this code is subtly aware of the way operator
22433d6423SLionel Sambuc * precedence is structured in regular expressions. Serious changes in
23433d6423SLionel Sambuc * regular-expression syntax might require a total rethink.
24433d6423SLionel Sambuc */
25433d6423SLionel Sambuc #include <stdio.h>
26433d6423SLionel Sambuc #ifdef UNIX
27433d6423SLionel Sambuc #ifdef USG
28433d6423SLionel Sambuc #include <string.h>
29433d6423SLionel Sambuc #else
30433d6423SLionel Sambuc #include <strings.h>
31433d6423SLionel Sambuc #endif
32433d6423SLionel Sambuc #else
33433d6423SLionel Sambuc #include <string.h>
34433d6423SLionel Sambuc #endif
35433d6423SLionel Sambuc #include "regexp.h"
36433d6423SLionel Sambuc #include "regmagic.h"
37433d6423SLionel Sambuc #include "proto.h"
38433d6423SLionel Sambuc
39433d6423SLionel Sambuc /*
40433d6423SLionel Sambuc * The "internal use only" fields in regexp.h are present to pass info from
41433d6423SLionel Sambuc * compile to execute that permits the execute phase to run lots faster on
42433d6423SLionel Sambuc * simple cases. They are:
43433d6423SLionel Sambuc *
44433d6423SLionel Sambuc * regstart char that must begin a match; '\0' if none obvious
45433d6423SLionel Sambuc * reganch is the match anchored (at beginning-of-line only)?
46433d6423SLionel Sambuc * regmust string (pointer into program) that match must include, or NULL
47433d6423SLionel Sambuc * regmlen length of regmust string
48433d6423SLionel Sambuc *
49433d6423SLionel Sambuc * Regstart and reganch permit very fast decisions on suitable starting points
50433d6423SLionel Sambuc * for a match, cutting down the work a lot. Regmust permits fast rejection
51433d6423SLionel Sambuc * of lines that cannot possibly match. The regmust tests are costly enough
52433d6423SLionel Sambuc * that regcomp() supplies a regmust only if the r.e. contains something
53433d6423SLionel Sambuc * potentially expensive (at present, the only such thing detected is * or +
54433d6423SLionel Sambuc * at the start of the r.e., which can involve a lot of backup). Regmlen is
55433d6423SLionel Sambuc * supplied because the test in regexec() needs it and regcomp() is computing
56433d6423SLionel Sambuc * it anyway.
57433d6423SLionel Sambuc */
58433d6423SLionel Sambuc
59433d6423SLionel Sambuc /*
60433d6423SLionel Sambuc * Structure for regexp "program". This is essentially a linear encoding
61433d6423SLionel Sambuc * of a nondeterministic finite-state machine (aka syntax charts or
62433d6423SLionel Sambuc * "railroad normal form" in parsing technology). Each node is an opcode
63433d6423SLionel Sambuc * plus a "next" pointer, possibly plus an operand. "Next" pointers of
64433d6423SLionel Sambuc * all nodes except BRANCH implement concatenation; a "next" pointer with
65433d6423SLionel Sambuc * a BRANCH on both ends of it is connecting two alternatives. (Here we
66433d6423SLionel Sambuc * have one of the subtle syntax dependencies: an individual BRANCH (as
67433d6423SLionel Sambuc * opposed to a collection of them) is never concatenated with anything
68433d6423SLionel Sambuc * because of operator precedence.) The operand of some types of node is
69433d6423SLionel Sambuc * a literal string; for others, it is a node leading into a sub-FSM. In
70433d6423SLionel Sambuc * particular, the operand of a BRANCH node is the first node of the branch.
71433d6423SLionel Sambuc * (NB this is *not* a tree structure: the tail of the branch connects
72433d6423SLionel Sambuc * to the thing following the set of BRANCHes.) The opcodes are:
73433d6423SLionel Sambuc */
74433d6423SLionel Sambuc
75433d6423SLionel Sambuc /* definition number opnd? meaning */
76433d6423SLionel Sambuc #define END 0 /* no End of program. */
77433d6423SLionel Sambuc #define BOL 1 /* no Match "" at beginning of line. */
78433d6423SLionel Sambuc #define EOL 2 /* no Match "" at end of line. */
79433d6423SLionel Sambuc #define ANY 3 /* no Match any one character. */
80433d6423SLionel Sambuc #define ANYOF 4 /* str Match any character in this string. */
81433d6423SLionel Sambuc #define ANYBUT 5 /* str Match any character not in this string. */
82433d6423SLionel Sambuc #define BRANCH 6 /* node Match this alternative, or the next... */
83433d6423SLionel Sambuc #define BACK 7 /* no Match "", "next" ptr points backward. */
84433d6423SLionel Sambuc #define EXACTLY 8 /* str Match this string. */
85433d6423SLionel Sambuc #define NOTHING 9 /* no Match empty string. */
86433d6423SLionel Sambuc #define STAR 10 /* node Match this (simple) thing 0 or more times. */
87433d6423SLionel Sambuc #define PLUS 11 /* node Match this (simple) thing 1 or more times. */
88433d6423SLionel Sambuc #define OPEN 20 /* no Mark this point in input as start of #n. */
89433d6423SLionel Sambuc /* OPEN+1 is number 1, etc. */
90433d6423SLionel Sambuc #define CLOSE 30 /* no Analogous to OPEN. */
91433d6423SLionel Sambuc
92433d6423SLionel Sambuc /*
93433d6423SLionel Sambuc * Opcode notes:
94433d6423SLionel Sambuc *
95433d6423SLionel Sambuc * BRANCH The set of branches constituting a single choice are hooked
96433d6423SLionel Sambuc * together with their "next" pointers, since precedence prevents
97433d6423SLionel Sambuc * anything being concatenated to any individual branch. The
98433d6423SLionel Sambuc * "next" pointer of the last BRANCH in a choice points to the
99433d6423SLionel Sambuc * thing following the whole choice. This is also where the
100433d6423SLionel Sambuc * final "next" pointer of each individual branch points; each
101433d6423SLionel Sambuc * branch starts with the operand node of a BRANCH node.
102433d6423SLionel Sambuc *
103433d6423SLionel Sambuc * BACK Normal "next" pointers all implicitly point forward; BACK
104433d6423SLionel Sambuc * exists to make loop structures possible.
105433d6423SLionel Sambuc *
106433d6423SLionel Sambuc * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
107433d6423SLionel Sambuc * BRANCH structures using BACK. Simple cases (one character
108433d6423SLionel Sambuc * per match) are implemented with STAR and PLUS for speed
109433d6423SLionel Sambuc * and to minimize recursive plunges.
110433d6423SLionel Sambuc *
111433d6423SLionel Sambuc * OPEN,CLOSE ...are numbered at compile time.
112433d6423SLionel Sambuc */
113433d6423SLionel Sambuc
114433d6423SLionel Sambuc /*
115433d6423SLionel Sambuc * A node is one char of opcode followed by two chars of "next" pointer.
116433d6423SLionel Sambuc * "Next" pointers are stored as two 8-bit pieces, high order first. The
117433d6423SLionel Sambuc * value is a positive offset from the opcode of the node containing it.
118433d6423SLionel Sambuc * An operand, if any, simply follows the node. (Note that much of the
119433d6423SLionel Sambuc * code generation knows about this implicit relationship.)
120433d6423SLionel Sambuc *
121433d6423SLionel Sambuc * Using two bytes for the "next" pointer is vast overkill for most things,
122433d6423SLionel Sambuc * but allows patterns to get big without disasters.
123433d6423SLionel Sambuc */
124433d6423SLionel Sambuc #define OP(p) (*(p))
125433d6423SLionel Sambuc #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
126433d6423SLionel Sambuc #define OPERAND(p) ((p) + 3)
127433d6423SLionel Sambuc
128433d6423SLionel Sambuc /*
129433d6423SLionel Sambuc * See regmagic.h for one further detail of program structure.
130433d6423SLionel Sambuc */
131433d6423SLionel Sambuc
132433d6423SLionel Sambuc
133433d6423SLionel Sambuc /*
134433d6423SLionel Sambuc * Utility definitions.
135433d6423SLionel Sambuc */
136433d6423SLionel Sambuc #ifndef CHARBITS
137433d6423SLionel Sambuc #define UCHARAT(p) ((int)*(unsigned char *)(p))
138433d6423SLionel Sambuc #else
139433d6423SLionel Sambuc #define UCHARAT(p) ((int)*(p)&CHARBITS)
140433d6423SLionel Sambuc #endif
141433d6423SLionel Sambuc
142433d6423SLionel Sambuc #define FAIL(m) { regerror(m); return(NULL); }
143433d6423SLionel Sambuc #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
144433d6423SLionel Sambuc #define META "^$.[()|?+*\\"
145433d6423SLionel Sambuc
146433d6423SLionel Sambuc /*
147433d6423SLionel Sambuc * Flags to be passed up and down.
148433d6423SLionel Sambuc */
149433d6423SLionel Sambuc #define HASWIDTH 01 /* Known never to match null string. */
150433d6423SLionel Sambuc #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
151433d6423SLionel Sambuc #define SPSTART 04 /* Starts with * or +. */
152433d6423SLionel Sambuc #define WORST 0 /* Worst case. */
153433d6423SLionel Sambuc #define STATIC
154433d6423SLionel Sambuc /*
155433d6423SLionel Sambuc * Global work variables for regcomp().
156433d6423SLionel Sambuc */
157433d6423SLionel Sambuc
158433d6423SLionel Sambuc STATIC char *regparse; /* Input-scan pointer. */
159433d6423SLionel Sambuc STATIC int regnpar; /* () count. */
160433d6423SLionel Sambuc STATIC unsigned char regdummy;
161433d6423SLionel Sambuc STATIC unsigned char *regcode; /* Code-emit pointer; ®dummy = don't. */
162433d6423SLionel Sambuc STATIC long regsize; /* Code size. */
163433d6423SLionel Sambuc
164433d6423SLionel Sambuc /*
165433d6423SLionel Sambuc * Forward declarations for regcomp()'s friends.
166433d6423SLionel Sambuc */
167433d6423SLionel Sambuc STATIC unsigned char *reg(int paren, int *flagp );
168433d6423SLionel Sambuc STATIC unsigned char *regbranch(int *flagp );
169433d6423SLionel Sambuc STATIC unsigned char *regpiece(int *flagp );
170433d6423SLionel Sambuc STATIC unsigned char *regatom(int *flagp );
171433d6423SLionel Sambuc STATIC unsigned char *regnode(int op );
172433d6423SLionel Sambuc STATIC void regc(int b );
173433d6423SLionel Sambuc STATIC void reginsert(int op, unsigned char *opnd );
174433d6423SLionel Sambuc STATIC void regtail(unsigned char *p, unsigned char *val );
175433d6423SLionel Sambuc STATIC void regoptail(unsigned char *p, unsigned char *val );
176433d6423SLionel Sambuc STATIC int regtry(regexp *prog, unsigned char *string );
177433d6423SLionel Sambuc STATIC int regmatch(unsigned char *prog );
178433d6423SLionel Sambuc STATIC int regrepeat(unsigned char *p );
179433d6423SLionel Sambuc STATIC unsigned char *regnext(unsigned char *p );
180433d6423SLionel Sambuc STATIC unsigned char *regprop(unsigned char *op );
181433d6423SLionel Sambuc
182433d6423SLionel Sambuc #ifdef STRCSPN
183433d6423SLionel Sambuc STATIC int strcspn(unsigned char *s1, unsigned char *s2 );
184433d6423SLionel Sambuc #endif
185433d6423SLionel Sambuc
186433d6423SLionel Sambuc /*
187433d6423SLionel Sambuc - regcomp - compile a regular expression into internal code
188433d6423SLionel Sambuc *
189433d6423SLionel Sambuc * We can't allocate space until we know how big the compiled form will be,
190433d6423SLionel Sambuc * but we can't compile it (and thus know how big it is) until we've got a
191433d6423SLionel Sambuc * place to put the code. So we cheat: we compile it twice, once with code
192433d6423SLionel Sambuc * generation turned off and size counting turned on, and once "for real".
193433d6423SLionel Sambuc * This also means that we don't allocate space until we are sure that the
194433d6423SLionel Sambuc * thing really will compile successfully, and we never have to move the
195433d6423SLionel Sambuc * code and thus invalidate pointers into it. (Note that it has to be in
196433d6423SLionel Sambuc * one piece because free() must be able to free it all.)
197433d6423SLionel Sambuc *
198433d6423SLionel Sambuc * Beware that the optimization-preparation code in here knows about some
199433d6423SLionel Sambuc * of the structure of the compiled regexp.
200433d6423SLionel Sambuc */
regcomp(char * exp)201d9494baaSJacob Adams regexp *regcomp(char *exp) {
202433d6423SLionel Sambuc register regexp *r;
203433d6423SLionel Sambuc register unsigned char *scan;
204433d6423SLionel Sambuc register unsigned char *longest;
205433d6423SLionel Sambuc register int len;
206433d6423SLionel Sambuc int flags;
207433d6423SLionel Sambuc
208433d6423SLionel Sambuc if (exp == NULL)
209433d6423SLionel Sambuc FAIL("NULL argument");
210433d6423SLionel Sambuc
211433d6423SLionel Sambuc /* First pass: determine size, legality. */
212433d6423SLionel Sambuc regparse = exp;
213433d6423SLionel Sambuc regnpar = 1;
214433d6423SLionel Sambuc regsize = 0L;
215433d6423SLionel Sambuc regcode = ®dummy;
216433d6423SLionel Sambuc regc(MAGIC);
217433d6423SLionel Sambuc if (reg(0, &flags) == NULL)
218433d6423SLionel Sambuc return(NULL);
219433d6423SLionel Sambuc
220433d6423SLionel Sambuc /* Small enough for pointer-storage convention? */
221433d6423SLionel Sambuc if (regsize >= 32767L) /* Probably could be 65535L. */
222433d6423SLionel Sambuc FAIL("regexp too big");
223433d6423SLionel Sambuc
224433d6423SLionel Sambuc /* Allocate space. */
225433d6423SLionel Sambuc r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
226433d6423SLionel Sambuc if (r == NULL)
227433d6423SLionel Sambuc FAIL("out of space");
228433d6423SLionel Sambuc
229433d6423SLionel Sambuc /* Second pass: emit code. */
230433d6423SLionel Sambuc regparse = exp;
231433d6423SLionel Sambuc regnpar = 1;
232433d6423SLionel Sambuc regcode = r->program;
233433d6423SLionel Sambuc regc(MAGIC);
234*ab712d19SDavid van Moolenbroek if (reg(0, &flags) == NULL) {
235*ab712d19SDavid van Moolenbroek free(r);
236433d6423SLionel Sambuc return(NULL);
237*ab712d19SDavid van Moolenbroek }
238433d6423SLionel Sambuc
239433d6423SLionel Sambuc /* Dig out information for optimizations. */
240433d6423SLionel Sambuc r->regstart = '\0'; /* Worst-case defaults. */
241433d6423SLionel Sambuc r->reganch = 0;
242433d6423SLionel Sambuc r->regmust = NULL;
243433d6423SLionel Sambuc r->regmlen = 0;
244433d6423SLionel Sambuc scan = r->program+1; /* First BRANCH. */
245433d6423SLionel Sambuc if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
246433d6423SLionel Sambuc scan = OPERAND(scan);
247433d6423SLionel Sambuc
248433d6423SLionel Sambuc /* Starting-point info. */
249433d6423SLionel Sambuc if (OP(scan) == EXACTLY)
250433d6423SLionel Sambuc r->regstart = *OPERAND(scan);
251433d6423SLionel Sambuc else if (OP(scan) == BOL)
252433d6423SLionel Sambuc r->reganch++;
253433d6423SLionel Sambuc
254433d6423SLionel Sambuc /*
255433d6423SLionel Sambuc * If there's something expensive in the r.e., find the
256433d6423SLionel Sambuc * longest literal string that must appear and make it the
257433d6423SLionel Sambuc * regmust. Resolve ties in favor of later strings, since
258433d6423SLionel Sambuc * the regstart check works with the beginning of the r.e.
259433d6423SLionel Sambuc * and avoiding duplication strengthens checking. Not a
260433d6423SLionel Sambuc * strong reason, but sufficient in the absence of others.
261433d6423SLionel Sambuc */
262433d6423SLionel Sambuc if (flags&SPSTART) {
263433d6423SLionel Sambuc longest = NULL;
264433d6423SLionel Sambuc len = 0;
265433d6423SLionel Sambuc for (; scan != NULL; scan = regnext(scan))
266433d6423SLionel Sambuc if (OP(scan) == EXACTLY
267433d6423SLionel Sambuc && strlen((char *)OPERAND(scan)) >= len) {
268433d6423SLionel Sambuc longest = OPERAND(scan);
269433d6423SLionel Sambuc len = strlen((char *)OPERAND(scan));
270433d6423SLionel Sambuc }
271433d6423SLionel Sambuc r->regmust = longest;
272433d6423SLionel Sambuc r->regmlen = len;
273433d6423SLionel Sambuc }
274433d6423SLionel Sambuc }
275433d6423SLionel Sambuc
276433d6423SLionel Sambuc return(r);
277433d6423SLionel Sambuc }
278433d6423SLionel Sambuc
279433d6423SLionel Sambuc /*
280433d6423SLionel Sambuc - reg - regular expression, i.e. main body or parenthesized thing
281433d6423SLionel Sambuc *
282433d6423SLionel Sambuc * Caller must absorb opening parenthesis.
283433d6423SLionel Sambuc *
284433d6423SLionel Sambuc * Combining parenthesis handling with the base level of regular expression
285433d6423SLionel Sambuc * is a trifle forced, but the need to tie the tails of the branches to what
286433d6423SLionel Sambuc * follows makes it hard to avoid.
287433d6423SLionel Sambuc */
reg(int paren,int * flagp)288d9494baaSJacob Adams STATIC unsigned char *reg(int paren, int *flagp) {
289d9494baaSJacob Adams /* Parenthesized? paren */
290433d6423SLionel Sambuc register unsigned char *ret;
291433d6423SLionel Sambuc register unsigned char *br;
292433d6423SLionel Sambuc register unsigned char *ender;
293433d6423SLionel Sambuc register int parno;
294433d6423SLionel Sambuc int flags;
295433d6423SLionel Sambuc
296433d6423SLionel Sambuc *flagp = HASWIDTH; /* Tentatively. */
297433d6423SLionel Sambuc
298433d6423SLionel Sambuc /* Make an OPEN node, if parenthesized. */
299433d6423SLionel Sambuc if (paren) {
300433d6423SLionel Sambuc if (regnpar >= NSUBEXP)
301433d6423SLionel Sambuc FAIL("too many ()");
302433d6423SLionel Sambuc parno = regnpar;
303433d6423SLionel Sambuc regnpar++;
304433d6423SLionel Sambuc ret = regnode(OPEN+parno);
305433d6423SLionel Sambuc } else
306433d6423SLionel Sambuc ret = NULL;
307433d6423SLionel Sambuc
308433d6423SLionel Sambuc /* Pick up the branches, linking them together. */
309433d6423SLionel Sambuc br = regbranch(&flags);
310433d6423SLionel Sambuc if (br == NULL)
311433d6423SLionel Sambuc return(NULL);
312433d6423SLionel Sambuc if (ret != NULL)
313433d6423SLionel Sambuc regtail(ret, br); /* OPEN -> first. */
314433d6423SLionel Sambuc else
315433d6423SLionel Sambuc ret = br;
316433d6423SLionel Sambuc if (!(flags&HASWIDTH))
317433d6423SLionel Sambuc *flagp &= ~HASWIDTH;
318433d6423SLionel Sambuc *flagp |= flags&SPSTART;
319433d6423SLionel Sambuc while (*regparse == '|') {
320433d6423SLionel Sambuc regparse++;
321433d6423SLionel Sambuc br = regbranch(&flags);
322433d6423SLionel Sambuc if (br == NULL)
323433d6423SLionel Sambuc return(NULL);
324433d6423SLionel Sambuc regtail(ret, br); /* BRANCH -> BRANCH. */
325433d6423SLionel Sambuc if (!(flags&HASWIDTH))
326433d6423SLionel Sambuc *flagp &= ~HASWIDTH;
327433d6423SLionel Sambuc *flagp |= flags&SPSTART;
328433d6423SLionel Sambuc }
329433d6423SLionel Sambuc
330433d6423SLionel Sambuc /* Make a closing node, and hook it on the end. */
331433d6423SLionel Sambuc ender = regnode((paren) ? CLOSE+parno : END);
332433d6423SLionel Sambuc regtail(ret, ender);
333433d6423SLionel Sambuc
334433d6423SLionel Sambuc /* Hook the tails of the branches to the closing node. */
335433d6423SLionel Sambuc for (br = ret; br != NULL; br = regnext(br))
336433d6423SLionel Sambuc regoptail(br, ender);
337433d6423SLionel Sambuc
338433d6423SLionel Sambuc /* Check for proper termination. */
339433d6423SLionel Sambuc if (paren && *regparse++ != ')') {
340433d6423SLionel Sambuc FAIL("unmatched ()");
341433d6423SLionel Sambuc } else if (!paren && *regparse != '\0') {
342433d6423SLionel Sambuc if (*regparse == ')') {
343433d6423SLionel Sambuc FAIL("unmatched ()");
344433d6423SLionel Sambuc } else
345433d6423SLionel Sambuc FAIL("junk on end"); /* "Can't happen". */
346433d6423SLionel Sambuc /* NOTREACHED */
347433d6423SLionel Sambuc }
348433d6423SLionel Sambuc
349433d6423SLionel Sambuc return(ret);
350433d6423SLionel Sambuc }
351433d6423SLionel Sambuc
352433d6423SLionel Sambuc /*
353433d6423SLionel Sambuc - regbranch - one alternative of an | operator
354433d6423SLionel Sambuc *
355433d6423SLionel Sambuc * Implements the concatenation operator.
356433d6423SLionel Sambuc */
regbranch(int * flagp)357d9494baaSJacob Adams STATIC unsigned char *regbranch(int *flagp) {
358433d6423SLionel Sambuc register unsigned char *ret;
359433d6423SLionel Sambuc register unsigned char *chain;
360433d6423SLionel Sambuc register unsigned char *latest;
361433d6423SLionel Sambuc int flags;
362433d6423SLionel Sambuc
363433d6423SLionel Sambuc *flagp = WORST; /* Tentatively. */
364433d6423SLionel Sambuc
365433d6423SLionel Sambuc ret = regnode(BRANCH);
366433d6423SLionel Sambuc chain = NULL;
367433d6423SLionel Sambuc while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
368433d6423SLionel Sambuc latest = regpiece(&flags);
369433d6423SLionel Sambuc if (latest == NULL)
370433d6423SLionel Sambuc return(NULL);
371433d6423SLionel Sambuc *flagp |= flags&HASWIDTH;
372433d6423SLionel Sambuc if (chain == NULL) /* First piece. */
373433d6423SLionel Sambuc *flagp |= flags&SPSTART;
374433d6423SLionel Sambuc else
375433d6423SLionel Sambuc regtail(chain, latest);
376433d6423SLionel Sambuc chain = latest;
377433d6423SLionel Sambuc }
378433d6423SLionel Sambuc if (chain == NULL) /* Loop ran zero times. */
379433d6423SLionel Sambuc (void) regnode(NOTHING);
380433d6423SLionel Sambuc
381433d6423SLionel Sambuc return(ret);
382433d6423SLionel Sambuc }
383433d6423SLionel Sambuc
384433d6423SLionel Sambuc /*
385433d6423SLionel Sambuc - regpiece - something followed by possible [*+?]
386433d6423SLionel Sambuc *
387433d6423SLionel Sambuc * Note that the branching code sequences used for ? and the general cases
388433d6423SLionel Sambuc * of * and + are somewhat optimized: they use the same NOTHING node as
389433d6423SLionel Sambuc * both the endmarker for their branch list and the body of the last branch.
390433d6423SLionel Sambuc * It might seem that this node could be dispensed with entirely, but the
391433d6423SLionel Sambuc * endmarker role is not redundant.
392433d6423SLionel Sambuc */
regpiece(int * flagp)393d9494baaSJacob Adams STATIC unsigned char *regpiece(int *flagp) {
394433d6423SLionel Sambuc register unsigned char *ret;
395433d6423SLionel Sambuc register unsigned char op;
396433d6423SLionel Sambuc register unsigned char *next;
397433d6423SLionel Sambuc int flags;
398433d6423SLionel Sambuc
399433d6423SLionel Sambuc ret = regatom(&flags);
400433d6423SLionel Sambuc if (ret == NULL)
401433d6423SLionel Sambuc return(NULL);
402433d6423SLionel Sambuc
403433d6423SLionel Sambuc op = *regparse;
404433d6423SLionel Sambuc if (!ISMULT(op)) {
405433d6423SLionel Sambuc *flagp = flags;
406433d6423SLionel Sambuc return(ret);
407433d6423SLionel Sambuc }
408433d6423SLionel Sambuc
409433d6423SLionel Sambuc if (!(flags&HASWIDTH) && op != '?')
410433d6423SLionel Sambuc FAIL("*+ operand could be empty");
411433d6423SLionel Sambuc *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
412433d6423SLionel Sambuc
413433d6423SLionel Sambuc if (op == '*' && (flags&SIMPLE))
414433d6423SLionel Sambuc reginsert(STAR, ret);
415433d6423SLionel Sambuc else if (op == '*') {
416433d6423SLionel Sambuc /* Emit x* as (x&|), where & means "self". */
417433d6423SLionel Sambuc reginsert(BRANCH, ret); /* Either x */
418433d6423SLionel Sambuc regoptail(ret, regnode(BACK)); /* and loop */
419433d6423SLionel Sambuc regoptail(ret, ret); /* back */
420433d6423SLionel Sambuc regtail(ret, regnode(BRANCH)); /* or */
421433d6423SLionel Sambuc regtail(ret, regnode(NOTHING)); /* null. */
422433d6423SLionel Sambuc } else if (op == '+' && (flags&SIMPLE))
423433d6423SLionel Sambuc reginsert(PLUS, ret);
424433d6423SLionel Sambuc else if (op == '+') {
425433d6423SLionel Sambuc /* Emit x+ as x(&|), where & means "self". */
426433d6423SLionel Sambuc next = regnode(BRANCH); /* Either */
427433d6423SLionel Sambuc regtail(ret, next);
428433d6423SLionel Sambuc regtail(regnode(BACK), ret); /* loop back */
429433d6423SLionel Sambuc regtail(next, regnode(BRANCH)); /* or */
430433d6423SLionel Sambuc regtail(ret, regnode(NOTHING)); /* null. */
431433d6423SLionel Sambuc } else if (op == '?') {
432433d6423SLionel Sambuc /* Emit x? as (x|) */
433433d6423SLionel Sambuc reginsert(BRANCH, ret); /* Either x */
434433d6423SLionel Sambuc regtail(ret, regnode(BRANCH)); /* or */
435433d6423SLionel Sambuc next = regnode(NOTHING); /* null. */
436433d6423SLionel Sambuc regtail(ret, next);
437433d6423SLionel Sambuc regoptail(ret, next);
438433d6423SLionel Sambuc }
439433d6423SLionel Sambuc regparse++;
440433d6423SLionel Sambuc if (ISMULT(*regparse))
441433d6423SLionel Sambuc FAIL("nested *?+");
442433d6423SLionel Sambuc
443433d6423SLionel Sambuc return(ret);
444433d6423SLionel Sambuc }
445433d6423SLionel Sambuc
446433d6423SLionel Sambuc /*
447433d6423SLionel Sambuc - regatom - the lowest level
448433d6423SLionel Sambuc *
449433d6423SLionel Sambuc * Optimization: gobbles an entire sequence of ordinary characters so that
450433d6423SLionel Sambuc * it can turn them into a single node, which is smaller to store and
451433d6423SLionel Sambuc * faster to run. Backslashed characters are exceptions, each becoming a
452433d6423SLionel Sambuc * separate node; the code is simpler that way and it's not worth fixing.
453433d6423SLionel Sambuc */
regatom(int * flagp)454d9494baaSJacob Adams STATIC unsigned char *regatom(int *flagp) {
455433d6423SLionel Sambuc register unsigned char *ret;
456433d6423SLionel Sambuc int flags;
457433d6423SLionel Sambuc
458433d6423SLionel Sambuc *flagp = WORST; /* Tentatively. */
459433d6423SLionel Sambuc
460433d6423SLionel Sambuc switch (*regparse++) {
461433d6423SLionel Sambuc case '^':
462433d6423SLionel Sambuc ret = regnode(BOL);
463433d6423SLionel Sambuc break;
464433d6423SLionel Sambuc case '$':
465433d6423SLionel Sambuc ret = regnode(EOL);
466433d6423SLionel Sambuc break;
467433d6423SLionel Sambuc case '.':
468433d6423SLionel Sambuc ret = regnode(ANY);
469433d6423SLionel Sambuc *flagp |= HASWIDTH|SIMPLE;
470433d6423SLionel Sambuc break;
471433d6423SLionel Sambuc case '[': {
472433d6423SLionel Sambuc register int class;
473433d6423SLionel Sambuc register int classend;
474433d6423SLionel Sambuc
475433d6423SLionel Sambuc if (*regparse == '^') { /* Complement of range. */
476433d6423SLionel Sambuc ret = regnode(ANYBUT);
477433d6423SLionel Sambuc regparse++;
478433d6423SLionel Sambuc } else
479433d6423SLionel Sambuc ret = regnode(ANYOF);
480433d6423SLionel Sambuc if (*regparse == ']' || *regparse == '-')
481433d6423SLionel Sambuc regc(*regparse++);
482433d6423SLionel Sambuc while (*regparse != '\0' && *regparse != ']') {
483433d6423SLionel Sambuc if (*regparse == '-') {
484433d6423SLionel Sambuc regparse++;
485433d6423SLionel Sambuc if (*regparse == ']' || *regparse == '\0')
486433d6423SLionel Sambuc regc('-');
487433d6423SLionel Sambuc else {
488433d6423SLionel Sambuc class = UCHARAT(regparse-2)+1;
489433d6423SLionel Sambuc classend = UCHARAT(regparse);
490433d6423SLionel Sambuc if (class > classend+1)
491433d6423SLionel Sambuc FAIL("invalid [] range");
492433d6423SLionel Sambuc for (; class <= classend; class++)
493433d6423SLionel Sambuc regc(class);
494433d6423SLionel Sambuc regparse++;
495433d6423SLionel Sambuc }
496433d6423SLionel Sambuc } else
497433d6423SLionel Sambuc regc(*regparse++);
498433d6423SLionel Sambuc }
499433d6423SLionel Sambuc regc('\0');
500433d6423SLionel Sambuc if (*regparse != ']')
501433d6423SLionel Sambuc FAIL("unmatched []");
502433d6423SLionel Sambuc regparse++;
503433d6423SLionel Sambuc *flagp |= HASWIDTH|SIMPLE;
504433d6423SLionel Sambuc }
505433d6423SLionel Sambuc break;
506433d6423SLionel Sambuc case '(':
507433d6423SLionel Sambuc ret = reg(1, &flags);
508433d6423SLionel Sambuc if (ret == NULL)
509433d6423SLionel Sambuc return(NULL);
510433d6423SLionel Sambuc *flagp |= flags&(HASWIDTH|SPSTART);
511433d6423SLionel Sambuc break;
512433d6423SLionel Sambuc case '\0':
513433d6423SLionel Sambuc case '|':
514433d6423SLionel Sambuc case ')':
515433d6423SLionel Sambuc FAIL("internal urp"); /* Supposed to be caught earlier. */
516433d6423SLionel Sambuc break;
517433d6423SLionel Sambuc case '?':
518433d6423SLionel Sambuc case '+':
519433d6423SLionel Sambuc case '*':
520433d6423SLionel Sambuc FAIL("?+* follows nothing");
521433d6423SLionel Sambuc break;
522433d6423SLionel Sambuc case '\\':
523433d6423SLionel Sambuc if (*regparse == '\0')
524433d6423SLionel Sambuc FAIL("trailing \\");
525433d6423SLionel Sambuc ret = regnode(EXACTLY);
526433d6423SLionel Sambuc regc(*regparse++);
527433d6423SLionel Sambuc regc('\0');
528433d6423SLionel Sambuc *flagp |= HASWIDTH|SIMPLE;
529433d6423SLionel Sambuc break;
530433d6423SLionel Sambuc default: {
531433d6423SLionel Sambuc register int len;
532433d6423SLionel Sambuc register unsigned char ender;
533433d6423SLionel Sambuc
534433d6423SLionel Sambuc regparse--;
535433d6423SLionel Sambuc #ifdef STRCSPN
536433d6423SLionel Sambuc len = strcspn(regparse, (unsigned char *)META);
537433d6423SLionel Sambuc #else
538433d6423SLionel Sambuc len = strcspn((char *)regparse, META);
539433d6423SLionel Sambuc #endif
540433d6423SLionel Sambuc if (len <= 0)
541433d6423SLionel Sambuc FAIL("internal disaster");
542433d6423SLionel Sambuc ender = *(regparse+len);
543433d6423SLionel Sambuc if (len > 1 && ISMULT(ender))
544433d6423SLionel Sambuc len--; /* Back off clear of ?+* operand. */
545433d6423SLionel Sambuc *flagp |= HASWIDTH;
546433d6423SLionel Sambuc if (len == 1)
547433d6423SLionel Sambuc *flagp |= SIMPLE;
548433d6423SLionel Sambuc ret = regnode(EXACTLY);
549433d6423SLionel Sambuc while (len > 0) {
550433d6423SLionel Sambuc regc(*regparse++);
551433d6423SLionel Sambuc len--;
552433d6423SLionel Sambuc }
553433d6423SLionel Sambuc regc('\0');
554433d6423SLionel Sambuc }
555433d6423SLionel Sambuc break;
556433d6423SLionel Sambuc }
557433d6423SLionel Sambuc
558433d6423SLionel Sambuc return(ret);
559433d6423SLionel Sambuc }
560433d6423SLionel Sambuc
561433d6423SLionel Sambuc /*
562433d6423SLionel Sambuc - regnode - emit a node
563433d6423SLionel Sambuc */
regnode(int iop)564d9494baaSJacob Adams STATIC unsigned char *regnode(int iop) {
565d9494baaSJacob Adams /* Return value is location */
566433d6423SLionel Sambuc register unsigned char *ret;
567433d6423SLionel Sambuc register unsigned char *ptr;
568433d6423SLionel Sambuc unsigned char op;
569433d6423SLionel Sambuc
570433d6423SLionel Sambuc op = (unsigned char) iop;
571433d6423SLionel Sambuc ret = regcode;
572433d6423SLionel Sambuc if (ret == ®dummy) {
573433d6423SLionel Sambuc regsize += 3;
574433d6423SLionel Sambuc return(ret);
575433d6423SLionel Sambuc }
576433d6423SLionel Sambuc
577433d6423SLionel Sambuc ptr = ret;
578433d6423SLionel Sambuc *ptr++ = op;
579433d6423SLionel Sambuc *ptr++ = '\0'; /* Null "next" pointer. */
580433d6423SLionel Sambuc *ptr++ = '\0';
581433d6423SLionel Sambuc regcode = ptr;
582433d6423SLionel Sambuc
583433d6423SLionel Sambuc return(ret);
584433d6423SLionel Sambuc }
585433d6423SLionel Sambuc
586433d6423SLionel Sambuc /*
587433d6423SLionel Sambuc - regc - emit (if appropriate) a byte of code
588433d6423SLionel Sambuc */
regc(int ib)589d9494baaSJacob Adams STATIC void regc(int ib) {
590433d6423SLionel Sambuc unsigned char b;
591433d6423SLionel Sambuc
592433d6423SLionel Sambuc b = (unsigned char) ib;
593433d6423SLionel Sambuc if (regcode != ®dummy)
594433d6423SLionel Sambuc *regcode++ = b;
595433d6423SLionel Sambuc else
596433d6423SLionel Sambuc regsize++;
597433d6423SLionel Sambuc }
598433d6423SLionel Sambuc
599433d6423SLionel Sambuc /*
600433d6423SLionel Sambuc - reginsert - insert an operator in front of already-emitted operand
601433d6423SLionel Sambuc *
602433d6423SLionel Sambuc * Means relocating the operand.
603433d6423SLionel Sambuc */
reginsert(int iop,unsigned char * opnd)604d9494baaSJacob Adams STATIC void reginsert(int iop, unsigned char *opnd) {
605433d6423SLionel Sambuc register unsigned char *src;
606433d6423SLionel Sambuc register unsigned char *dst;
607433d6423SLionel Sambuc register unsigned char *place;
608433d6423SLionel Sambuc unsigned char op;
609433d6423SLionel Sambuc
610433d6423SLionel Sambuc op = (unsigned char) iop;
611433d6423SLionel Sambuc if (regcode == ®dummy) {
612433d6423SLionel Sambuc regsize += 3;
613433d6423SLionel Sambuc return;
614433d6423SLionel Sambuc }
615433d6423SLionel Sambuc
616433d6423SLionel Sambuc src = regcode;
617433d6423SLionel Sambuc regcode += 3;
618433d6423SLionel Sambuc dst = regcode;
619433d6423SLionel Sambuc while (src > opnd)
620433d6423SLionel Sambuc *--dst = *--src;
621433d6423SLionel Sambuc
622433d6423SLionel Sambuc place = opnd; /* Op node, where operand used to be. */
623433d6423SLionel Sambuc *place++ = op;
624433d6423SLionel Sambuc *place++ = '\0';
625433d6423SLionel Sambuc *place++ = '\0';
626433d6423SLionel Sambuc }
627433d6423SLionel Sambuc
628433d6423SLionel Sambuc /*
629433d6423SLionel Sambuc - regtail - set the next-pointer at the end of a node chain
630433d6423SLionel Sambuc */
regtail(unsigned char * p,unsigned char * val)631d9494baaSJacob Adams STATIC void regtail(unsigned char *p, unsigned char *val) {
632433d6423SLionel Sambuc register unsigned char *scan;
633433d6423SLionel Sambuc register unsigned char *temp;
634433d6423SLionel Sambuc register int offset;
635433d6423SLionel Sambuc
636433d6423SLionel Sambuc if (p == ®dummy)
637433d6423SLionel Sambuc return;
638433d6423SLionel Sambuc
639433d6423SLionel Sambuc /* Find last node. */
640433d6423SLionel Sambuc scan = p;
641433d6423SLionel Sambuc for (;;) {
642433d6423SLionel Sambuc temp = regnext(scan);
643433d6423SLionel Sambuc if (temp == NULL)
644433d6423SLionel Sambuc break;
645433d6423SLionel Sambuc scan = temp;
646433d6423SLionel Sambuc }
647433d6423SLionel Sambuc
648433d6423SLionel Sambuc if (OP(scan) == BACK)
649433d6423SLionel Sambuc offset = scan - val;
650433d6423SLionel Sambuc else
651433d6423SLionel Sambuc offset = val - scan;
652433d6423SLionel Sambuc *(scan+1) = (offset>>8)&0377;
653433d6423SLionel Sambuc *(scan+2) = offset&0377;
654433d6423SLionel Sambuc }
655433d6423SLionel Sambuc
656433d6423SLionel Sambuc /*
657433d6423SLionel Sambuc - regoptail - regtail on operand of first argument; nop if operandless
658433d6423SLionel Sambuc */
regoptail(unsigned char * p,unsigned char * val)659d9494baaSJacob Adams STATIC void regoptail(unsigned char *p, unsigned char *val) {
660433d6423SLionel Sambuc /* "Operandless" and "op != BRANCH" are synonymous in practice. */
661433d6423SLionel Sambuc if (p == NULL || p == ®dummy || OP(p) != BRANCH)
662433d6423SLionel Sambuc return;
663433d6423SLionel Sambuc regtail(OPERAND(p), val);
664433d6423SLionel Sambuc }
665433d6423SLionel Sambuc
666433d6423SLionel Sambuc /*
667433d6423SLionel Sambuc * regexec and friends
668433d6423SLionel Sambuc */
669433d6423SLionel Sambuc
670433d6423SLionel Sambuc /*
671433d6423SLionel Sambuc * Global work variables for regexec().
672433d6423SLionel Sambuc */
673433d6423SLionel Sambuc STATIC unsigned char *reginput; /* String-input pointer. */
674433d6423SLionel Sambuc STATIC unsigned char *regbol; /* Beginning of input, for ^ check. */
675433d6423SLionel Sambuc STATIC unsigned char **regstartp; /* Pointer to startp array. */
676433d6423SLionel Sambuc STATIC unsigned char **regendp; /* Ditto for endp. */
677433d6423SLionel Sambuc
678433d6423SLionel Sambuc #ifdef DEBUG
679433d6423SLionel Sambuc int regnarrate = 0;
680d9494baaSJacob Adams void regdump(void);
681d9494baaSJacob Adams STATIC unsigned char *regprop(void);
682433d6423SLionel Sambuc #endif
683433d6423SLionel Sambuc
684433d6423SLionel Sambuc /*
685433d6423SLionel Sambuc - regexec - match a regexp against a string
686433d6423SLionel Sambuc */
687433d6423SLionel Sambuc int
regexec(register regexp * prog,register unsigned char * string)688d9494baaSJacob Adams regexec(register regexp *prog, register unsigned char *string) {
689433d6423SLionel Sambuc register unsigned char *s;
690433d6423SLionel Sambuc #ifndef STDLIB
691433d6423SLionel Sambuc extern char *strchr();
692433d6423SLionel Sambuc #endif
693433d6423SLionel Sambuc
694433d6423SLionel Sambuc /* Be paranoid... */
695433d6423SLionel Sambuc if (prog == NULL || string == NULL) {
696433d6423SLionel Sambuc regerror("NULL parameter");
697433d6423SLionel Sambuc return(0);
698433d6423SLionel Sambuc }
699433d6423SLionel Sambuc
700433d6423SLionel Sambuc /* Check validity of program. */
701433d6423SLionel Sambuc if (UCHARAT(prog->program) != MAGIC) {
702433d6423SLionel Sambuc regerror("corrupted program");
703433d6423SLionel Sambuc return(0);
704433d6423SLionel Sambuc }
705433d6423SLionel Sambuc
706433d6423SLionel Sambuc /* If there is a "must appear" string, look for it. */
707433d6423SLionel Sambuc if (prog->regmust != NULL) {
708433d6423SLionel Sambuc s = string;
709433d6423SLionel Sambuc while ((s = (unsigned char *)strchr((char *)s,prog->regmust[0]))
710433d6423SLionel Sambuc != NULL) {
711433d6423SLionel Sambuc if (strncmp((char *)s, (char *)prog->regmust,
712433d6423SLionel Sambuc prog->regmlen)
713433d6423SLionel Sambuc == 0)
714433d6423SLionel Sambuc break; /* Found it. */
715433d6423SLionel Sambuc s++;
716433d6423SLionel Sambuc }
717433d6423SLionel Sambuc if (s == NULL) /* Not present. */
718433d6423SLionel Sambuc return(0);
719433d6423SLionel Sambuc }
720433d6423SLionel Sambuc
721433d6423SLionel Sambuc /* Mark beginning of line for ^ . */
722433d6423SLionel Sambuc regbol = string;
723433d6423SLionel Sambuc
724433d6423SLionel Sambuc /* Simplest case: anchored match need be tried only once. */
725433d6423SLionel Sambuc if (prog->reganch)
726433d6423SLionel Sambuc return(regtry(prog, string));
727433d6423SLionel Sambuc
728433d6423SLionel Sambuc /* Messy cases: unanchored match. */
729433d6423SLionel Sambuc s = string;
730433d6423SLionel Sambuc if (prog->regstart != '\0')
731433d6423SLionel Sambuc /* We know what char it must start with. */
732433d6423SLionel Sambuc while ((s = (unsigned char *)strchr((char *)s, prog->regstart))
733433d6423SLionel Sambuc != NULL) {
734433d6423SLionel Sambuc if (regtry(prog, s))
735433d6423SLionel Sambuc return(1);
736433d6423SLionel Sambuc s++;
737433d6423SLionel Sambuc }
738433d6423SLionel Sambuc else
739433d6423SLionel Sambuc /* We don't -- general case. */
740433d6423SLionel Sambuc do {
741433d6423SLionel Sambuc if (regtry(prog, s))
742433d6423SLionel Sambuc return(1);
743433d6423SLionel Sambuc } while (*s++ != '\0');
744433d6423SLionel Sambuc
745433d6423SLionel Sambuc /* Failure. */
746433d6423SLionel Sambuc return(0);
747433d6423SLionel Sambuc }
748433d6423SLionel Sambuc
749433d6423SLionel Sambuc /*
750433d6423SLionel Sambuc - regtry - try match at specific point
751433d6423SLionel Sambuc */
regtry(regexp * prog,unsigned char * string)752d9494baaSJacob Adams STATIC int regtry(regexp *prog, unsigned char *string) {
753d9494baaSJacob Adams /* Return value 0 failure, 1 success */
754433d6423SLionel Sambuc register int i;
755433d6423SLionel Sambuc register unsigned char **sp;
756433d6423SLionel Sambuc register unsigned char **ep;
757433d6423SLionel Sambuc
758433d6423SLionel Sambuc reginput = string;
759433d6423SLionel Sambuc regstartp = prog->startp;
760433d6423SLionel Sambuc regendp = prog->endp;
761433d6423SLionel Sambuc
762433d6423SLionel Sambuc sp = prog->startp;
763433d6423SLionel Sambuc ep = prog->endp;
764433d6423SLionel Sambuc for (i = NSUBEXP; i > 0; i--) {
765433d6423SLionel Sambuc *sp++ = NULL;
766433d6423SLionel Sambuc *ep++ = NULL;
767433d6423SLionel Sambuc }
768433d6423SLionel Sambuc if (regmatch(prog->program + 1)) {
769433d6423SLionel Sambuc prog->startp[0] = string;
770433d6423SLionel Sambuc prog->endp[0] = reginput;
771433d6423SLionel Sambuc return(1);
772433d6423SLionel Sambuc } else
773433d6423SLionel Sambuc return(0);
774433d6423SLionel Sambuc }
775433d6423SLionel Sambuc
776433d6423SLionel Sambuc /*
777433d6423SLionel Sambuc - regmatch - main matching routine
778433d6423SLionel Sambuc *
779433d6423SLionel Sambuc * Conceptually the strategy is simple: check to see whether the current
780433d6423SLionel Sambuc * node matches, call self recursively to see whether the rest matches,
781433d6423SLionel Sambuc * and then act accordingly. In practice we make some effort to avoid
782433d6423SLionel Sambuc * recursion, in particular by going through "ordinary" nodes (that don't
783433d6423SLionel Sambuc * need to know whether the rest of the match failed) by a loop instead of
784433d6423SLionel Sambuc * by recursion.
785433d6423SLionel Sambuc */
regmatch(unsigned char * prog)786d9494baaSJacob Adams STATIC int regmatch( unsigned char *prog) {
787d9494baaSJacob Adams /* Return value 0 failure 1 success */
788433d6423SLionel Sambuc register unsigned char *scan; /* Current node. */
789433d6423SLionel Sambuc unsigned char *next; /* Next node. */
790433d6423SLionel Sambuc #ifndef STDLIB
791433d6423SLionel Sambuc extern char *strchr();
792433d6423SLionel Sambuc #endif
793433d6423SLionel Sambuc
794433d6423SLionel Sambuc scan = prog;
795433d6423SLionel Sambuc #ifdef DEBUG
796433d6423SLionel Sambuc if (scan != NULL && regnarrate)
797433d6423SLionel Sambuc fprintf(stderr, "%s(\n", regprop(scan));
798433d6423SLionel Sambuc #endif
799433d6423SLionel Sambuc while (scan != NULL) {
800433d6423SLionel Sambuc #ifdef DEBUG
801433d6423SLionel Sambuc if (regnarrate)
802433d6423SLionel Sambuc fprintf(stderr, "%s...\n", regprop(scan));
803433d6423SLionel Sambuc #endif
804433d6423SLionel Sambuc next = regnext(scan);
805433d6423SLionel Sambuc
806433d6423SLionel Sambuc switch (OP(scan)) {
807433d6423SLionel Sambuc case BOL:
808433d6423SLionel Sambuc if (reginput != regbol)
809433d6423SLionel Sambuc return(0);
810433d6423SLionel Sambuc break;
811433d6423SLionel Sambuc case EOL:
812433d6423SLionel Sambuc if (*reginput != '\0')
813433d6423SLionel Sambuc return(0);
814433d6423SLionel Sambuc break;
815433d6423SLionel Sambuc case ANY:
816433d6423SLionel Sambuc if (*reginput == '\0')
817433d6423SLionel Sambuc return(0);
818433d6423SLionel Sambuc reginput++;
819433d6423SLionel Sambuc break;
820433d6423SLionel Sambuc case EXACTLY: {
821433d6423SLionel Sambuc register int len;
822433d6423SLionel Sambuc register unsigned char *opnd;
823433d6423SLionel Sambuc
824433d6423SLionel Sambuc opnd = OPERAND(scan);
825433d6423SLionel Sambuc /* Inline the first character, for speed. */
826433d6423SLionel Sambuc if (*opnd != *reginput)
827433d6423SLionel Sambuc return(0);
828433d6423SLionel Sambuc len = strlen((char *)opnd);
829433d6423SLionel Sambuc if (len > 1 && strncmp((char *)opnd,
830433d6423SLionel Sambuc (char *)reginput, len)
831433d6423SLionel Sambuc != 0)
832433d6423SLionel Sambuc return(0);
833433d6423SLionel Sambuc reginput += len;
834433d6423SLionel Sambuc }
835433d6423SLionel Sambuc break;
836433d6423SLionel Sambuc case ANYOF:
837433d6423SLionel Sambuc if (*reginput == '\0'
838433d6423SLionel Sambuc || strchr((char *)OPERAND(scan), *reginput) == NULL)
839433d6423SLionel Sambuc return(0);
840433d6423SLionel Sambuc reginput++;
841433d6423SLionel Sambuc break;
842433d6423SLionel Sambuc case ANYBUT:
843433d6423SLionel Sambuc if (*reginput == '\0'
844433d6423SLionel Sambuc || strchr((char *)OPERAND(scan), *reginput) != NULL)
845433d6423SLionel Sambuc return(0);
846433d6423SLionel Sambuc reginput++;
847433d6423SLionel Sambuc break;
848433d6423SLionel Sambuc case NOTHING:
849433d6423SLionel Sambuc break;
850433d6423SLionel Sambuc case BACK:
851433d6423SLionel Sambuc break;
852433d6423SLionel Sambuc case OPEN+1:
853433d6423SLionel Sambuc case OPEN+2:
854433d6423SLionel Sambuc case OPEN+3:
855433d6423SLionel Sambuc case OPEN+4:
856433d6423SLionel Sambuc case OPEN+5:
857433d6423SLionel Sambuc case OPEN+6:
858433d6423SLionel Sambuc case OPEN+7:
859433d6423SLionel Sambuc case OPEN+8:
860433d6423SLionel Sambuc case OPEN+9: {
861433d6423SLionel Sambuc register int no;
862433d6423SLionel Sambuc register unsigned char *save;
863433d6423SLionel Sambuc
864433d6423SLionel Sambuc no = OP(scan) - OPEN;
865433d6423SLionel Sambuc save = reginput;
866433d6423SLionel Sambuc
867433d6423SLionel Sambuc if (regmatch(next)) {
868433d6423SLionel Sambuc /*
869433d6423SLionel Sambuc * Don't set startp if some later
870433d6423SLionel Sambuc * invocation of the same parentheses
871433d6423SLionel Sambuc * already has.
872433d6423SLionel Sambuc */
873433d6423SLionel Sambuc if (regstartp[no] == NULL)
874433d6423SLionel Sambuc regstartp[no] = save;
875433d6423SLionel Sambuc return(1);
876433d6423SLionel Sambuc } else
877433d6423SLionel Sambuc return(0);
878433d6423SLionel Sambuc }
879433d6423SLionel Sambuc break;
880433d6423SLionel Sambuc case CLOSE+1:
881433d6423SLionel Sambuc case CLOSE+2:
882433d6423SLionel Sambuc case CLOSE+3:
883433d6423SLionel Sambuc case CLOSE+4:
884433d6423SLionel Sambuc case CLOSE+5:
885433d6423SLionel Sambuc case CLOSE+6:
886433d6423SLionel Sambuc case CLOSE+7:
887433d6423SLionel Sambuc case CLOSE+8:
888433d6423SLionel Sambuc case CLOSE+9: {
889433d6423SLionel Sambuc register int no;
890433d6423SLionel Sambuc register unsigned char *save;
891433d6423SLionel Sambuc
892433d6423SLionel Sambuc no = OP(scan) - CLOSE;
893433d6423SLionel Sambuc save = reginput;
894433d6423SLionel Sambuc
895433d6423SLionel Sambuc if (regmatch(next)) {
896433d6423SLionel Sambuc /*
897433d6423SLionel Sambuc * Don't set endp if some later
898433d6423SLionel Sambuc * invocation of the same parentheses
899433d6423SLionel Sambuc * already has.
900433d6423SLionel Sambuc */
901433d6423SLionel Sambuc if (regendp[no] == NULL)
902433d6423SLionel Sambuc regendp[no] = save;
903433d6423SLionel Sambuc return(1);
904433d6423SLionel Sambuc } else
905433d6423SLionel Sambuc return(0);
906433d6423SLionel Sambuc }
907433d6423SLionel Sambuc break;
908433d6423SLionel Sambuc case BRANCH: {
909433d6423SLionel Sambuc register unsigned char *save;
910433d6423SLionel Sambuc
911433d6423SLionel Sambuc if (OP(next) != BRANCH) /* No choice. */
912433d6423SLionel Sambuc next = OPERAND(scan); /* Avoid recursion. */
913433d6423SLionel Sambuc else {
914433d6423SLionel Sambuc do {
915433d6423SLionel Sambuc save = reginput;
916433d6423SLionel Sambuc if (regmatch(OPERAND(scan)))
917433d6423SLionel Sambuc return(1);
918433d6423SLionel Sambuc reginput = save;
919433d6423SLionel Sambuc scan = regnext(scan);
920433d6423SLionel Sambuc } while (scan != NULL && OP(scan) == BRANCH);
921433d6423SLionel Sambuc return(0);
922433d6423SLionel Sambuc /* NOTREACHED */
923433d6423SLionel Sambuc }
924433d6423SLionel Sambuc }
925433d6423SLionel Sambuc break;
926433d6423SLionel Sambuc case STAR:
927433d6423SLionel Sambuc case PLUS: {
928433d6423SLionel Sambuc register unsigned char nextch;
929433d6423SLionel Sambuc register int no;
930433d6423SLionel Sambuc register unsigned char *save;
931433d6423SLionel Sambuc register int min;
932433d6423SLionel Sambuc
933433d6423SLionel Sambuc /*
934433d6423SLionel Sambuc * Lookahead to avoid useless match attempts
935433d6423SLionel Sambuc * when we know what character comes next.
936433d6423SLionel Sambuc */
937433d6423SLionel Sambuc nextch = '\0';
938433d6423SLionel Sambuc if (OP(next) == EXACTLY)
939433d6423SLionel Sambuc nextch = *OPERAND(next);
940433d6423SLionel Sambuc min = (OP(scan) == STAR) ? 0 : 1;
941433d6423SLionel Sambuc save = reginput;
942433d6423SLionel Sambuc no = regrepeat(OPERAND(scan));
943433d6423SLionel Sambuc while (no >= min) {
944433d6423SLionel Sambuc /* If it could work, try it. */
945433d6423SLionel Sambuc if (nextch == '\0' || *reginput == nextch)
946433d6423SLionel Sambuc if (regmatch(next))
947433d6423SLionel Sambuc return(1);
948433d6423SLionel Sambuc /* Couldn't or didn't -- back up. */
949433d6423SLionel Sambuc no--;
950433d6423SLionel Sambuc reginput = save + no;
951433d6423SLionel Sambuc }
952433d6423SLionel Sambuc return(0);
953433d6423SLionel Sambuc }
954433d6423SLionel Sambuc break;
955433d6423SLionel Sambuc case END:
956433d6423SLionel Sambuc return(1); /* Success! */
957433d6423SLionel Sambuc break;
958433d6423SLionel Sambuc default:
959433d6423SLionel Sambuc regerror("memory corruption");
960433d6423SLionel Sambuc return(0);
961433d6423SLionel Sambuc break;
962433d6423SLionel Sambuc }
963433d6423SLionel Sambuc
964433d6423SLionel Sambuc scan = next;
965433d6423SLionel Sambuc }
966433d6423SLionel Sambuc
967433d6423SLionel Sambuc /*
968433d6423SLionel Sambuc * We get here only if there's trouble -- normally "case END" is
969433d6423SLionel Sambuc * the terminating point.
970433d6423SLionel Sambuc */
971433d6423SLionel Sambuc regerror("corrupted pointers");
972433d6423SLionel Sambuc return(0);
973433d6423SLionel Sambuc }
974433d6423SLionel Sambuc
975433d6423SLionel Sambuc /*
976433d6423SLionel Sambuc - regrepeat - repeatedly match something simple, report how many
977433d6423SLionel Sambuc */
regrepeat(unsigned char * p)978d9494baaSJacob Adams STATIC int regrepeat(unsigned char *p) {
979433d6423SLionel Sambuc register int count = 0;
980433d6423SLionel Sambuc register unsigned char *scan;
981433d6423SLionel Sambuc register unsigned char *opnd;
982433d6423SLionel Sambuc
983433d6423SLionel Sambuc scan = reginput;
984433d6423SLionel Sambuc opnd = OPERAND(p);
985433d6423SLionel Sambuc switch (OP(p)) {
986433d6423SLionel Sambuc case ANY:
987433d6423SLionel Sambuc count = strlen((char *)scan);
988433d6423SLionel Sambuc scan += count;
989433d6423SLionel Sambuc break;
990433d6423SLionel Sambuc case EXACTLY:
991433d6423SLionel Sambuc while (*opnd == *scan) {
992433d6423SLionel Sambuc count++;
993433d6423SLionel Sambuc scan++;
994433d6423SLionel Sambuc }
995433d6423SLionel Sambuc break;
996433d6423SLionel Sambuc case ANYOF:
997433d6423SLionel Sambuc while (*scan != '\0' && strchr((char *)opnd, *scan) != NULL) {
998433d6423SLionel Sambuc count++;
999433d6423SLionel Sambuc scan++;
1000433d6423SLionel Sambuc }
1001433d6423SLionel Sambuc break;
1002433d6423SLionel Sambuc case ANYBUT:
1003433d6423SLionel Sambuc while (*scan != '\0' && strchr((char *)opnd, *scan) == NULL) {
1004433d6423SLionel Sambuc count++;
1005433d6423SLionel Sambuc scan++;
1006433d6423SLionel Sambuc }
1007433d6423SLionel Sambuc break;
1008433d6423SLionel Sambuc default: /* Oh dear. Called inappropriately. */
1009433d6423SLionel Sambuc regerror("internal foulup");
1010433d6423SLionel Sambuc count = 0; /* Best compromise. */
1011433d6423SLionel Sambuc break;
1012433d6423SLionel Sambuc }
1013433d6423SLionel Sambuc reginput = scan;
1014433d6423SLionel Sambuc
1015433d6423SLionel Sambuc return(count);
1016433d6423SLionel Sambuc }
1017433d6423SLionel Sambuc
1018433d6423SLionel Sambuc /*
1019433d6423SLionel Sambuc - regnext - dig the "next" pointer out of a node
1020433d6423SLionel Sambuc */
regnext(register unsigned char * p)1021d9494baaSJacob Adams STATIC unsigned char *regnext(register unsigned char *p) {
1022433d6423SLionel Sambuc register int offset;
1023433d6423SLionel Sambuc
1024433d6423SLionel Sambuc if (p == ®dummy)
1025433d6423SLionel Sambuc return(NULL);
1026433d6423SLionel Sambuc
1027433d6423SLionel Sambuc offset = NEXT(p);
1028433d6423SLionel Sambuc if (offset == 0)
1029433d6423SLionel Sambuc return(NULL);
1030433d6423SLionel Sambuc
1031433d6423SLionel Sambuc if (OP(p) == BACK)
1032433d6423SLionel Sambuc return(p-offset);
1033433d6423SLionel Sambuc else
1034433d6423SLionel Sambuc return(p+offset);
1035433d6423SLionel Sambuc }
1036433d6423SLionel Sambuc
1037433d6423SLionel Sambuc #ifdef DEBUG
1038433d6423SLionel Sambuc
1039d9494baaSJacob Adams STATIC unsigned char *regprop(void);
1040433d6423SLionel Sambuc
1041433d6423SLionel Sambuc /*
1042433d6423SLionel Sambuc - regdump - dump a regexp onto stdout in vaguely comprehensible form
1043433d6423SLionel Sambuc */
regdump(regexp r)1044d9494baaSJacob Adams void regdump(regexp r) {
1045433d6423SLionel Sambuc register unsigned char *s;
1046433d6423SLionel Sambuc register unsigned char op = EXACTLY; /* Arbitrary non-END op. */
1047433d6423SLionel Sambuc register unsigned char *next;
1048433d6423SLionel Sambuc extern char *strchr();
1049433d6423SLionel Sambuc
1050433d6423SLionel Sambuc
1051433d6423SLionel Sambuc s = r->program + 1;
1052433d6423SLionel Sambuc while (op != END) { /* While that wasn't END last time... */
1053433d6423SLionel Sambuc op = OP(s);
1054433d6423SLionel Sambuc printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */
1055433d6423SLionel Sambuc next = regnext(s);
1056433d6423SLionel Sambuc if (next == NULL) /* Next ptr. */
1057433d6423SLionel Sambuc printf("(0)");
1058433d6423SLionel Sambuc else
1059433d6423SLionel Sambuc printf("(%d)", (s-r->program)+(next-s));
1060433d6423SLionel Sambuc s += 3;
1061433d6423SLionel Sambuc if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1062433d6423SLionel Sambuc /* Literal string, where present. */
1063433d6423SLionel Sambuc while (*s != '\0') {
1064433d6423SLionel Sambuc putchar(*s);
1065433d6423SLionel Sambuc s++;
1066433d6423SLionel Sambuc }
1067433d6423SLionel Sambuc s++;
1068433d6423SLionel Sambuc }
1069433d6423SLionel Sambuc putchar('\n');
1070433d6423SLionel Sambuc }
1071433d6423SLionel Sambuc
1072433d6423SLionel Sambuc /* Header fields of interest. */
1073433d6423SLionel Sambuc if (r->regstart != '\0')
1074433d6423SLionel Sambuc printf("start `%c' ", r->regstart);
1075433d6423SLionel Sambuc if (r->reganch)
1076433d6423SLionel Sambuc printf("anchored ");
1077433d6423SLionel Sambuc if (r->regmust != NULL)
1078433d6423SLionel Sambuc printf("must have \"%s\"", r->regmust);
1079433d6423SLionel Sambuc printf("\n");
1080433d6423SLionel Sambuc }
1081433d6423SLionel Sambuc
1082433d6423SLionel Sambuc /*
1083433d6423SLionel Sambuc - regprop - printable representation of opcode
1084433d6423SLionel Sambuc */
1085c6748a4aSJacob Adams STATIC unsigned char regprop_buf[50];
1086d9494baaSJacob Adams
regprop(unsigned char * op)1087d9494baaSJacob Adams STATIC unsigned char *regprop(unsigned char *op) {
1088433d6423SLionel Sambuc register unsigned char *p;
1089433d6423SLionel Sambuc
1090c6748a4aSJacob Adams (void) strcpy(regprop_buf, ":");
1091433d6423SLionel Sambuc
1092433d6423SLionel Sambuc switch (OP(op)) {
1093433d6423SLionel Sambuc case BOL:
1094433d6423SLionel Sambuc p = "BOL";
1095433d6423SLionel Sambuc break;
1096433d6423SLionel Sambuc case EOL:
1097433d6423SLionel Sambuc p = "EOL";
1098433d6423SLionel Sambuc break;
1099433d6423SLionel Sambuc case ANY:
1100433d6423SLionel Sambuc p = "ANY";
1101433d6423SLionel Sambuc break;
1102433d6423SLionel Sambuc case ANYOF:
1103433d6423SLionel Sambuc p = "ANYOF";
1104433d6423SLionel Sambuc break;
1105433d6423SLionel Sambuc case ANYBUT:
1106433d6423SLionel Sambuc p = "ANYBUT";
1107433d6423SLionel Sambuc break;
1108433d6423SLionel Sambuc case BRANCH:
1109433d6423SLionel Sambuc p = "BRANCH";
1110433d6423SLionel Sambuc break;
1111433d6423SLionel Sambuc case EXACTLY:
1112433d6423SLionel Sambuc p = "EXACTLY";
1113433d6423SLionel Sambuc break;
1114433d6423SLionel Sambuc case NOTHING:
1115433d6423SLionel Sambuc p = "NOTHING";
1116433d6423SLionel Sambuc break;
1117433d6423SLionel Sambuc case BACK:
1118433d6423SLionel Sambuc p = "BACK";
1119433d6423SLionel Sambuc break;
1120433d6423SLionel Sambuc case END:
1121433d6423SLionel Sambuc p = "END";
1122433d6423SLionel Sambuc break;
1123433d6423SLionel Sambuc case OPEN+1:
1124433d6423SLionel Sambuc case OPEN+2:
1125433d6423SLionel Sambuc case OPEN+3:
1126433d6423SLionel Sambuc case OPEN+4:
1127433d6423SLionel Sambuc case OPEN+5:
1128433d6423SLionel Sambuc case OPEN+6:
1129433d6423SLionel Sambuc case OPEN+7:
1130433d6423SLionel Sambuc case OPEN+8:
1131433d6423SLionel Sambuc case OPEN+9:
1132c6748a4aSJacob Adams sprintf(regprop_buf+strlen(regprop_buf), "OPEN%d", OP(op)-OPEN);
1133433d6423SLionel Sambuc p = NULL;
1134433d6423SLionel Sambuc break;
1135433d6423SLionel Sambuc case CLOSE+1:
1136433d6423SLionel Sambuc case CLOSE+2:
1137433d6423SLionel Sambuc case CLOSE+3:
1138433d6423SLionel Sambuc case CLOSE+4:
1139433d6423SLionel Sambuc case CLOSE+5:
1140433d6423SLionel Sambuc case CLOSE+6:
1141433d6423SLionel Sambuc case CLOSE+7:
1142433d6423SLionel Sambuc case CLOSE+8:
1143433d6423SLionel Sambuc case CLOSE+9:
1144c6748a4aSJacob Adams sprintf(regprop_buf+strlen(regprop_buf), "CLOSE%d", OP(op)-CLOSE);
1145433d6423SLionel Sambuc p = NULL;
1146433d6423SLionel Sambuc break;
1147433d6423SLionel Sambuc case STAR:
1148433d6423SLionel Sambuc p = "STAR";
1149433d6423SLionel Sambuc break;
1150433d6423SLionel Sambuc case PLUS:
1151433d6423SLionel Sambuc p = "PLUS";
1152433d6423SLionel Sambuc break;
1153433d6423SLionel Sambuc default:
1154433d6423SLionel Sambuc regerror("corrupted opcode");
1155433d6423SLionel Sambuc break;
1156433d6423SLionel Sambuc }
1157433d6423SLionel Sambuc if (p != NULL)
1158c6748a4aSJacob Adams (void) strcat(regprop_buf, p);
1159c6748a4aSJacob Adams return(regprop_buf);
1160433d6423SLionel Sambuc }
1161433d6423SLionel Sambuc #endif
1162433d6423SLionel Sambuc
1163433d6423SLionel Sambuc /*
1164433d6423SLionel Sambuc * The following is provided for those people who do not have strcspn() in
1165433d6423SLionel Sambuc * their C libraries. They should get off their butts and do something
1166433d6423SLionel Sambuc * about it; at least one public-domain implementation of those (highly
1167433d6423SLionel Sambuc * useful) string routines has been published on Usenet.
1168433d6423SLionel Sambuc */
1169433d6423SLionel Sambuc #ifdef STRCSPN
1170433d6423SLionel Sambuc /*
1171433d6423SLionel Sambuc * strcspn - find length of initial segment of s1 consisting entirely
1172433d6423SLionel Sambuc * of characters not from s2
1173433d6423SLionel Sambuc */
1174433d6423SLionel Sambuc
strcspn(unsigned char * s1,unsigned char * s2)1175d9494baaSJacob Adams STATIC int strcspn(unsigned char *s1, unsigned char *s2) {
1176433d6423SLionel Sambuc register unsigned char *scan1;
1177433d6423SLionel Sambuc register unsigned char *scan2;
1178433d6423SLionel Sambuc register int count;
1179433d6423SLionel Sambuc
1180433d6423SLionel Sambuc count = 0;
1181433d6423SLionel Sambuc for (scan1 = s1; *scan1 != '\0'; scan1++) {
1182433d6423SLionel Sambuc for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
1183433d6423SLionel Sambuc if (*scan1 == *scan2++)
1184433d6423SLionel Sambuc return(count);
1185433d6423SLionel Sambuc count++;
1186433d6423SLionel Sambuc }
1187433d6423SLionel Sambuc return(count);
1188433d6423SLionel Sambuc }
1189433d6423SLionel Sambuc #endif
1190