1 /*
2  * pattern.c - pattern matching
3  *
4  * This file is part of zsh, the Z shell.
5  *
6  * Copyright (c) 1999 Peter Stephenson
7  * All rights reserved.
8  *
9  * Permission is hereby granted, without written agreement and without
10  * license or royalty fees, to use, copy, modify, and distribute this
11  * software and to distribute modified versions of this software for any
12  * purpose, provided that the above copyright notice and the following
13  * two paragraphs appear in all copies of this software.
14  *
15  * In no event shall Peter Stephenson or the Zsh Development Group be liable
16  * to any party for direct, indirect, special, incidental, or consequential
17  * damages arising out of the use of this software and its documentation,
18  * even if Peter Stephenson and the Zsh Development Group have been advised of
19  * the possibility of such damage.
20  *
21  * Peter Stephenson and the Zsh Development Group specifically disclaim any
22  * warranties, including, but not limited to, the implied warranties of
23  * merchantability and fitness for a particular purpose.  The software
24  * provided hereunder is on an "as is" basis, and Peter Stephenson and the
25  * Zsh Development Group have no obligation to provide maintenance,
26  * support, updates, enhancements, or modifications.
27  *
28  * Pattern matching code derived from the regexp library by Henry
29  * Spencer, which has the following copyright.
30  *
31  *	Copyright (c) 1986 by University of Toronto.
32  *	Written by Henry Spencer.  Not derived from licensed software.
33  *
34  *	Permission is granted to anyone to use this software for any
35  *	purpose on any computer system, and to redistribute it freely,
36  *	subject to the following restrictions:
37  *
38  *	1. The author is not responsible for the consequences of use of
39  *		this software, no matter how awful, even if they arise
40  *		from defects in it.
41  *
42  *	2. The origin of this software must not be misrepresented, either
43  *		by explicit claim or by omission.
44  *
45  *	3. Altered versions must be plainly marked as such, and must not
46  *		be misrepresented as being the original software.
47  *
48  * Eagle-eyed readers will notice this is an altered version.  Incredibly
49  * sharp-eyed readers might even find bits that weren't altered.
50  *
51  *
52  *      And I experienced a sense that, like certain regular
53  *      expressions, seemed to match the day from beginning to end, so
54  *      that I did not need to identify the parenthesised subexpression
55  *      that told of dawn, nor the group of characters that indicated
56  *      the moment when my grandfather returned home with news of
57  *      Swann's departure for Paris; and the whole length of the month
58  *      of May, as if matched by a closure, fitted into the buffer of my
59  *      life with no sign of overflowing, turning the days, like a
60  *      procession of insects that could consist of this or that
61  *      species, into a random and unstructured repetition of different
62  *      sequences, anchored from the first day of the month to the last
63  *      in the same fashion as the weeks when I knew I would not see
64  *      Gilberte and would search in vain for any occurrences of the
65  *      string in the avenue of hawthorns by Tansonville, without my
66  *      having to delimit explicitly the start or finish of the pattern.
67  *
68  *                                 M. Proust, "In Search of Lost Files",
69  *                                 bk I, "The Walk by Bourne's Place".
70  */
71 
72 #include "zsh.mdh"
73 
74 /*
75  * The following union is used mostly for alignment purposes.
76  * Normal nodes are longs, while certain nodes take a char * as an argument;
77  * here we make sure that they both work out to the same length.
78  * The compiled regexp we construct consists of upats stuck together;
79  * anything else to be added (strings, numbers) is stuck after and
80  * then aligned to a whole number of upat units.
81  *
82  * Note also that offsets are in terms of the sizes of these things.
83  */
84 union upat {
85     long l;
86     unsigned char *p;
87 };
88 
89 typedef union upat *Upat;
90 
91 #include "pattern.pro"
92 
93 /* Number of active parenthesized expressions allowed in backreferencing */
94 #define NSUBEXP  9
95 
96 /* definition	number	opnd?	meaning */
97 #define	P_END	  0x00	/* no	End of program. */
98 #define P_EXCSYNC 0x01	/* no   Test if following exclude already failed */
99 #define P_EXCEND  0x02	/* no   Test if exclude matched orig branch */
100 #define	P_BACK	  0x03	/* no	Match "", "next" ptr points backward. */
101 #define	P_EXACTLY 0x04	/* lstr	Match this string. */
102 #define	P_NOTHING 0x05	/* no	Match empty string. */
103 #define	P_ONEHASH 0x06	/* node	Match this (simple) thing 0 or more times. */
104 #define	P_TWOHASH 0x07	/* node	Match this (simple) thing 1 or more times. */
105 #define P_GFLAGS  0x08	/* long Match nothing and set globbing flags */
106 #define P_ISSTART 0x09  /* no   Match start of string. */
107 #define P_ISEND   0x0a  /* no   Match end of string. */
108 #define P_COUNTSTART 0x0b /* no Initialise P_COUNT */
109 #define P_COUNT   0x0c  /* 3*long uc* node Match a number of repetitions */
110 /* numbered so we can test bit 5 for a branch */
111 #define	P_BRANCH  0x20	/* node	Match this alternative, or the next... */
112 #define	P_WBRANCH 0x21	/* uc* node P_BRANCH, but match at least 1 char */
113 /* excludes are also branches, but have bit 4 set, too */
114 #define P_EXCLUDE 0x30	/* uc* node Exclude this from previous branch */
115 #define P_EXCLUDP 0x31	/* uc* node Exclude, using full file path so far */
116 /* numbered so we can test bit 6 so as not to match initial '.' */
117 #define	P_ANY	  0x40	/* no	Match any one character. */
118 #define	P_ANYOF	  0x41	/* str  Match any character in this string. */
119 #define	P_ANYBUT  0x42	/* str  Match any character not in this string. */
120 #define P_STAR    0x43	/* no   Match any set of characters. */
121 #define P_NUMRNG  0x44	/* zr, zr Match a numeric range. */
122 #define P_NUMFROM 0x45	/* zr   Match a number >= X */
123 #define P_NUMTO   0x46	/* zr   Match a number <= X */
124 #define P_NUMANY  0x47	/* no   Match any set of decimal digits */
125 /* spaces left for P_OPEN+n,... for backreferences */
126 #define	P_OPEN	  0x80	/* no	Mark this point in input as start of n. */
127 #define	P_CLOSE	  0x90	/* no	Analogous to OPEN. */
128 /*
129  * no    no argument
130  * zr    the range type zrange_t:  may be zlong or unsigned long
131  * char  a single char
132  * uc*   a pointer to unsigned char, used at run time and initialised
133  *       to NULL.
134  * str   null-terminated, metafied string
135  * lstr  length as long then string, not null-terminated, unmetafied.
136  */
137 
138 /*
139  * Notes on usage:
140  * P_WBRANCH:  This works like a branch and is used in complex closures,
141  *    to ensure we don't succeed on a zero-length match of the pattern,
142  *    since that would cause an infinite loop.  We do this by recording
143  *    the positions where we have already tried to match.   See the
144  *    P_WBRANCH test in patmatch().
145  *
146  *  P_ANY, P_ANYOF:  the operand is a null terminated
147  *    string.  Normal characters match as expected.  Characters
148  *    in the range Meta+PP_ALPHA..Meta+PP_UNKWN do the appropriate
149  *    Posix range tests.  This relies on imeta returning true for these
150  *    characters.  We treat unknown POSIX ranges as never matching.
151  *    PP_RANGE means the next two (possibly metafied) characters form
152  *    the limits of a range to test; it's too much like hard work to
153  *    expand the range.
154  *
155  *  P_EXCLUDE, P_EXCSYNC, PEXCEND:  P_EXCLUDE appears in the pattern like
156  *    P_BRANCH, but applies to the immediately preceding branch.  The code in
157  *    the corresponding branch is followed by a P_EXCSYNC, which simply
158  *    acts as a marker that a P_EXCLUDE comes next.  The P_EXCLUDE
159  *    has a pointer to char embedded in it, which works
160  *    like P_WBRANCH:  if we get to the P_EXCSYNC, and we already matched
161  *    up to the same position, fail.  Thus we are forced to backtrack
162  *    on closures in the P_BRANCH if the first attempt was excluded.
163  *    Corresponding to P_EXCSYNC in the original branch, there is a
164  *    P_EXCEND in the exclusion.  If we get to this point, and we did
165  *    *not* match in the original branch, the exclusion itself fails,
166  *    otherwise it succeeds since we know the tail already matches,
167  *    so P_EXCEND is the end of the exclusion test.
168  *    The whole sorry mess looks like this, where the upper lines
169  *    show the linkage of the branches, and the lower shows the linkage
170  *    of their pattern arguments.
171  *
172  *     	        ---------------------      ----------------------
173  *              ^      	       	     v    ^      	         v
174  *      ( <BRANCH>:apat-><EXCSYNC> <EXCLUDE>:excpat-><EXCEND> ) tail
175  *                               	                         ^
176  *		       	  |                                      |
177  *			   --------------------------------------
178  *
179  * P_EXCLUDP: this behaves exactly like P_EXCLUDE, with the sole exception
180  *   that we prepend the path so far to the exclude pattern.   This is
181  *   for top level file globs, e.g. ** / *.c~*foo.c
182  *                                    ^ I had to leave this space
183  * P_NUM*: zl is a zlong if that is 64-bit, else an unsigned long.
184  *
185  * P_COUNTSTART, P_COUNT: a P_COUNTSTART flags the start of a quantified
186  * closure (#cN,M) and is used to initialise the count.  Executing
187  * the pattern leads back to the P_COUNT, while the next links of the
188  * P_COUNTSTART and P_COUNT lead to the tail of the pattern:
189  *
190  *	       	        ----------------
191  *     	       	       v       	        ^
192  *        <COUNTSTART><COUNT>pattern<BACK> tail
193  *	     	    v      v  	  	    ^
194  *	            ------------------------
195  */
196 
197 #define	P_OP(p)		((p)->l & 0xff)
198 #define	P_NEXT(p)	((p)->l >> 8)
199 #define	P_OPERAND(p)	((p) + 1)
200 #define P_ISBRANCH(p)   ((p)->l & 0x20)
201 #define P_ISEXCLUDE(p)	(((p)->l & 0x30) == 0x30)
202 #define P_NOTDOT(p)	((p)->l & 0x40)
203 
204 /* Specific to lstr type, i.e. P_EXACTLY. */
205 #define P_LS_LEN(p)	((p)[1].l) /* can be used as lvalue */
206 #define P_LS_STR(p)	((char *)((p) + 2))
207 
208 /* Specific to P_COUNT: arguments as offset in nodes from operator */
209 #define P_CT_CURRENT	(1)	/* Current count */
210 #define P_CT_MIN	(2)     /* Minimum count */
211 #define P_CT_MAX	(3)	/* Maximum count, -1 for none */
212 #define P_CT_PTR	(4)	/* Pointer to last match start */
213 #define P_CT_OPERAND	(5)	/* Operand of P_COUNT */
214 
215 /* Flags needed when pattern is executed */
216 #define P_SIMPLE        0x01	/* Simple enough to be #/## operand. */
217 #define P_HSTART        0x02	/* Starts with # or ##'d pattern. */
218 #define P_PURESTR	0x04	/* Can be matched with a strcmp */
219 
220 #if defined(ZSH_64_BIT_TYPE) || defined(LONG_IS_64_BIT)
221 typedef zlong zrange_t;
222 #define ZRANGE_T_IS_SIGNED	(1)
223 #define ZRANGE_MAX ZLONG_MAX
224 #else
225 typedef unsigned long zrange_t;
226 #define ZRANGE_MAX ULONG_MAX
227 #endif
228 
229 #ifdef MULTIBYTE_SUPPORT
230 /*
231  * Handle a byte that's not part of a valid character.
232  *
233  * This range in Unicode is recommended for purposes of this
234  * kind as it corresponds to invalid characters.
235  *
236  * Note that this strictly only works if wchar_t represents
237  * Unicode code points, which isn't necessarily true; however,
238  * converting an invalid character into an unknown format is
239  * a bit tricky...
240  */
241 #define WCHAR_INVALID(ch)			\
242     ((wchar_t) (0xDC00 + STOUC(ch)))
243 #endif /* MULTIBYTE_SUPPORT */
244 
245 /*
246  * Array of characters corresponding to zpc_chars enum, which it must match.
247  */
248 static const char zpc_chars[ZPC_COUNT] = {
249     '/', '\0', Bar, Outpar, Tilde, Inpar, Quest, Star, Inbrack, Inang,
250     Hat, Pound, Bnullkeep, Quest, Star, '+', Bang, '!', '@'
251 };
252 
253 /*
254  * Corresponding strings used in enable/disable -p.
255  * NULL means no way of turning this on or off.
256  */
257 /**/
258 mod_export const char *zpc_strings[ZPC_COUNT] = {
259    NULL, NULL, "|", NULL, "~", "(", "?", "*", "[", "<",
260    "^", "#", NULL, "?(", "*(", "+(", "!(", "\\!(", "@("
261 };
262 
263 /*
264  * Corresponding array of pattern disables as set by the user
265  * using "disable -p".
266  */
267 /**/
268 mod_export char zpc_disables[ZPC_COUNT];
269 
270 /*
271  * Stack of saved (compressed) zpc_disables for function scope.
272  */
273 
274 static struct zpc_disables_save *zpc_disables_stack;
275 
276 /*
277  * Characters which terminate a simple string (ZPC_COUNT) or
278  * an entire pattern segment (the first ZPC_SEG_COUNT).
279  * Each entry is either the corresponding character in zpc_chars
280  * or Marker which is guaranteed not to match a character in a
281  * pattern we are compiling.
282  *
283  * The complete list indicates characters that are special, so e.g.
284  * (testchar == special[ZPC_TILDE]) succeeds only if testchar is a Tilde
285  * *and* Tilde is currently special.
286  */
287 
288 /**/
289 char zpc_special[ZPC_COUNT];
290 
291 /* Default size for pattern buffer */
292 #define P_DEF_ALLOC 256
293 
294 /* Flags used in compilation */
295 static char *patstart, *patparse;	/* input pointers */
296 static int patnpar;		/* () count */
297 static char *patcode;		/* point of code emission */
298 static long patsize;		/* size of code */
299 static char *patout;		/* start of code emission string */
300 static long patalloc;		/* size allocated for same */
301 
302 /* Flags used in both compilation and execution */
303 static int patflags;		    /* flags passed down to patcompile */
304 static int patglobflags;  /* globbing flags & approx */
305 
306 /*
307  * Increment pointer to metafied multibyte string.
308  */
309 #ifdef MULTIBYTE_SUPPORT
310 typedef wint_t patint_t;
311 
312 #define PEOF WEOF
313 
314 #define METACHARINC(x) ((void)metacharinc(&x))
315 
316 /*
317  * TODO: the shiftstate isn't well handled; we don't guarantee
318  * to maintain it properly between characters.  If we don't
319  * need it we should use mbtowc() instead.
320  */
321 static mbstate_t shiftstate;
322 
323 /*
324  * Multibyte version: it's (almost) as easy to return the
325  * value as not, so do so since we sometimes need it..
326  */
327 static wchar_t
metacharinc(char ** x)328 metacharinc(char **x)
329 {
330     char *inptr = *x;
331     char inchar;
332     size_t ret = MB_INVALID;
333     wchar_t wc;
334 
335     /*
336      * Cheat if the top bit isn't set.  This is second-guessing
337      * the library, but we know for sure that if the character
338      * set doesn't have the property that all bytes with the 8th
339      * bit clear are single characters then we are stuffed.
340      */
341     if (!(patglobflags & GF_MULTIBYTE) || !(STOUC(*inptr) & 0x80))
342     {
343 	if (itok(*inptr))
344 	    inchar = ztokens[*inptr++ - Pound];
345 	else if (*inptr == Meta) {
346 	    inptr++;
347 	    inchar = *inptr++ ^ 32;
348 	} else {
349 	    inchar = *inptr++;
350 	}
351 	*x = inptr;
352 	return (wchar_t)STOUC(inchar);
353     }
354 
355     while (*inptr) {
356 	if (itok(*inptr))
357 	    inchar = ztokens[*inptr++ - Pound];
358 	else if (*inptr == Meta) {
359 	    inptr++;
360 	    inchar = *inptr++ ^ 32;
361 	} else {
362 	    inchar = *inptr++;
363 	}
364 	ret = mbrtowc(&wc, &inchar, 1, &shiftstate);
365 
366 	if (ret == MB_INVALID)
367 	    break;
368 	if (ret == MB_INCOMPLETE)
369 	    continue;
370 	*x = inptr;
371 	return wc;
372     }
373 
374     /* Error. */
375     /* Reset the shift state for next time. */
376     memset(&shiftstate, 0, sizeof(shiftstate));
377     return WCHAR_INVALID(*(*x)++);
378 }
379 
380 #else
381 typedef int patint_t;
382 
383 #define PEOF EOF
384 
385 #define METACHARINC(x)	((void)((x) += (*(x) == Meta) ? 2 : 1))
386 #endif
387 
388 /*
389  * Return unmetafied char from string (x is any char *).
390  * Used with MULTIBYTE_SUPPORT if the GF_MULTIBYTE is not
391  * in effect.
392  */
393 #define UNMETA(x)	(*(x) == Meta ? (x)[1] ^ 32 : *(x))
394 
395 /* Add n more characters, ensuring there is enough space. */
396 
397 enum {
398     PA_NOALIGN = 1,
399     PA_UNMETA  = 2
400 };
401 
402 /**/
403 static void
patadd(char * add,int ch,long n,int paflags)404 patadd(char *add, int ch, long n, int paflags)
405 {
406     /* Make sure everything gets aligned unless we get PA_NOALIGN. */
407     long newpatsize = patsize + n;
408     if (!(paflags & PA_NOALIGN))
409 	newpatsize = (newpatsize + sizeof(union upat) - 1) &
410 		      ~(sizeof(union upat) - 1);
411     if (patalloc < newpatsize) {
412 	long newpatalloc =
413 	    2*(newpatsize > patalloc ? newpatsize : patalloc);
414 	patout = (char *)zrealloc((char *)patout, newpatalloc);
415 	patcode = patout + patsize;
416 	patalloc = newpatalloc;
417     }
418     patsize = newpatsize;
419     if (add) {
420 	if (paflags & PA_UNMETA) {
421 	    /*
422 	     * Unmetafy and untokenize the string as we go.
423 	     * The Meta characters in add aren't counted in n.
424 	     */
425 	    while (n--) {
426 		if (itok(*add))
427 		    *patcode++ = ztokens[*add++ - Pound];
428 		else if (*add == Meta) {
429 		    add++;
430 		    *patcode++ = *add++ ^ 32;
431 		} else {
432 		    *patcode++ = *add++;
433 		}
434 	    }
435 	} else {
436 	    while (n--)
437 		*patcode++ = *add++;
438 	}
439     } else
440 	*patcode++ = ch;
441     patcode = patout + patsize;
442 }
443 
444 static long rn_offs;
445 /* operates on pointers to union upat, returns a pointer */
446 #define PATNEXT(p) ((rn_offs = P_NEXT(p)) ? \
447 		    (P_OP(p) == P_BACK) ? \
448 		    ((p)-rn_offs) : ((p)+rn_offs) : NULL)
449 
450 /*
451  * Set up zpc_special with characters that end a string segment.
452  * "Marker" cannot occur in the pattern we are compiling so
453  * is used to mark "invalid".
454  */
455 static void
patcompcharsset(void)456 patcompcharsset(void)
457 {
458     char *spp, *disp;
459     int i;
460 
461     /* Initialise enabled special characters */
462     memcpy(zpc_special, zpc_chars, ZPC_COUNT);
463     /* Apply user disables from disable -p */
464     for (i = 0, spp = zpc_special, disp = zpc_disables;
465 	 i < ZPC_COUNT;
466 	 i++, spp++, disp++) {
467 	if (*disp)
468 	    *spp = Marker;
469     }
470 
471     if (!isset(EXTENDEDGLOB)) {
472 	/* Extended glob characters are not active */
473 	zpc_special[ZPC_TILDE] = zpc_special[ZPC_HAT] =
474 	    zpc_special[ZPC_HASH] = Marker;
475     }
476     if (!isset(KSHGLOB)) {
477 	/*
478 	 * Ksh glob characters are not active.
479 	 * * and ? are shared with normal globbing, but for their
480 	 * use here we are looking for a following Inpar.
481 	 */
482 	zpc_special[ZPC_KSH_QUEST] = zpc_special[ZPC_KSH_STAR] =
483 	    zpc_special[ZPC_KSH_PLUS] = zpc_special[ZPC_KSH_BANG] =
484 	    zpc_special[ZPC_KSH_BANG2] = zpc_special[ZPC_KSH_AT] = Marker;
485     }
486     /*
487      * Note that if we are using KSHGLOB, then we test for a following
488      * Inpar, not zpc_special[ZPC_INPAR]:  the latter makes an Inpar on
489      * its own active.  The zpc_special[ZPC_KSH_*] followed by any old Inpar
490      * discriminate ksh globbing.
491      */
492     if (isset(SHGLOB)) {
493 	/*
494 	 * Grouping and numeric ranges are not valid.
495 	 * We do allow alternation, however; it's needed for
496 	 * "case".  This may not be entirely consistent.
497 	 *
498 	 * Don't disable Outpar: we may need to match the end of KSHGLOB
499 	 * parentheses and it would be difficult to tell them apart.
500 	 */
501 	zpc_special[ZPC_INPAR] = zpc_special[ZPC_INANG] = Marker;
502     }
503 }
504 
505 /* Called before parsing a set of file matches to initialize flags */
506 
507 /**/
508 void
patcompstart(void)509 patcompstart(void)
510 {
511     patcompcharsset();
512     if (isset(CASEGLOB))
513 	patglobflags = 0;
514     else
515 	patglobflags = GF_IGNCASE;
516     if (isset(MULTIBYTE))
517 	patglobflags |= GF_MULTIBYTE;
518 }
519 
520 /*
521  * Top level pattern compilation subroutine
522  * exp is a null-terminated, metafied string.
523  * inflags is an or of some PAT_* flags.
524  * endexp, if non-null, is set to a pointer to the end of the
525  *   part of exp which was compiled.  This is used when
526  *   compiling patterns for directories which must be
527  *   matched recursively.
528  */
529 
530 /**/
531 mod_export Patprog
patcompile(char * exp,int inflags,char ** endexp)532 patcompile(char *exp, int inflags, char **endexp)
533 {
534     int flags = 0;
535     long len = 0;
536     long startoff;
537     Upat pscan;
538     char *lng, *strp = NULL;
539     Patprog p;
540 
541     queue_signals();
542 
543     startoff = sizeof(struct patprog);
544     /* Ensure alignment of start of program string */
545     startoff = (startoff + sizeof(union upat) - 1) & ~(sizeof(union upat) - 1);
546 
547     /* Allocate reasonable sized chunk if none, reduce size if too big */
548     if (patalloc != P_DEF_ALLOC)
549 	patout = (char *)zrealloc(patout, patalloc = P_DEF_ALLOC);
550     patcode = patout + startoff;
551     patsize = patcode - patout;
552     patstart = patparse = exp;
553     /*
554      * Note global patnpar numbers parentheses 1..9, while patnpar
555      * in struct is actual count of parentheses.
556      */
557     patnpar = 1;
558     patflags = inflags & ~(PAT_PURES|PAT_HAS_EXCLUDP);
559 
560     if (!(patflags & PAT_FILE)) {
561 	patcompcharsset();
562 	zpc_special[ZPC_SLASH] = Marker;
563 	remnulargs(patparse);
564 	if (isset(MULTIBYTE))
565 	    patglobflags = GF_MULTIBYTE;
566 	else
567 	    patglobflags = 0;
568     }
569     if (patflags & PAT_LCMATCHUC)
570 	patglobflags |= GF_LCMATCHUC;
571     /*
572      * Have to be set now, since they get updated during compilation.
573      */
574     ((Patprog)patout)->globflags = patglobflags;
575 
576     if (!(patflags & PAT_ANY)) {
577 	/* Look for a really pure string, with no tokens at all. */
578 	if (!(patglobflags & ~GF_MULTIBYTE)
579 #ifdef __CYGWIN__
580 	    /*
581 	     * If the OS treats files case-insensitively and we
582 	     * are looking at files, we don't need to use pattern
583 	     * matching to find the file.
584 	     */
585 	    || (!(patglobflags & ~GF_IGNCASE) && (patflags & PAT_FILE))
586 #endif
587 	    )
588 	{
589 	    /*
590 	     * Waah!  I wish I understood this.
591 	     * Empty metafied strings have an initial Nularg.
592 	     * This never corresponds to a real character in
593 	     * a glob pattern or string, so skip it.
594 	     */
595 	    if (*exp == Nularg)
596 		exp++;
597 	    for (strp = exp; *strp &&
598 		     (!(patflags & PAT_FILE) || *strp != '/') && !itok(*strp);
599 		 strp++)
600 		;
601 	}
602 	if (!strp || (*strp && *strp != '/')) {
603 	    /* No, do normal compilation. */
604 	    strp = NULL;
605 	    if (patcompswitch(0, &flags) == 0) {
606 		unqueue_signals();
607 		return NULL;
608 	    }
609 	} else {
610 	    /*
611 	     * Yes, copy the string, and skip compilation altogether.
612 	     * Null terminate for the benefit of globbing.
613 	     * Leave metafied both for globbing and for our own
614 	     * efficiency.
615 	     */
616 	    patparse = strp;
617 	    len = strp - exp;
618 	    patadd(exp, 0, len + 1, 0);
619 	    patout[startoff + len] = '\0';
620 	    patflags |= PAT_PURES;
621 	}
622     }
623 
624     /* end of compilation: safe to use pointers */
625     p = (Patprog)patout;
626     p->startoff = startoff;
627     p->patstartch = '\0';
628     p->globend = patglobflags;
629     p->flags = patflags;
630     p->mustoff = 0;
631     p->size = patsize;
632     p->patmlen = len;
633     p->patnpar = patnpar-1;
634 
635     if (!strp) {
636 	pscan = (Upat)(patout + startoff);
637 
638 	if (!(patflags & PAT_ANY) && P_OP(PATNEXT(pscan)) == P_END) {
639 	    /* only one top level choice */
640 	    pscan = P_OPERAND(pscan);
641 
642 	    if (flags & P_PURESTR) {
643 		/*
644 		 * The pattern can be matched with a simple strncmp/strcmp.
645 		 * Careful in case we've overwritten the node for the next ptr.
646 		 */
647 		char *dst = patout + startoff;
648 		Upat next;
649 		p->flags |= PAT_PURES;
650 		for (; pscan; pscan = next) {
651 		    next = PATNEXT(pscan);
652 		    if (P_OP(pscan) == P_EXACTLY) {
653 			char *opnd = P_LS_STR(pscan), *mtest;
654 			long oplen = P_LS_LEN(pscan), ilen;
655 			int nmeta = 0;
656 			/*
657 			 * Unfortunately we unmetafied the string
658 			 * and we need to put any metacharacters
659 			 * back now we know it's a pure string.
660 			 * This shouldn't happen too often, it's
661 			 * just that there are some cases such
662 			 * as . and .. in files where we really
663 			 * need a pure string even if there are
664 			 * pattern characters flying around.
665 			 */
666 			for (mtest = opnd, ilen = oplen; ilen;
667 			     mtest++, ilen--)
668 			    if (imeta(*mtest))
669 				nmeta++;
670 			if (nmeta) {
671 			    patadd(NULL, 0, nmeta, 0);
672 			    p = (Patprog)patout;
673 			    opnd = dupstring_wlen(opnd, oplen);
674 			    dst = patout + startoff;
675 			}
676 
677 			while (oplen--) {
678 			    if (imeta(*opnd)) {
679 				*dst++ = Meta;
680 				*dst++ = *opnd++ ^ 32;
681 			    } else {
682 				*dst++ = *opnd++;
683 			    }
684 			}
685 			/* Only one string in a PAT_PURES, so now done. */
686 			break;
687 		    }
688 		}
689 		p->size = dst - patout;
690 		/* patmlen is really strlen.  We don't need a null. */
691 		p->patmlen = p->size - startoff;
692 	    } else {
693 		/* starting point info */
694 		if (P_OP(pscan) == P_EXACTLY && !p->globflags &&
695 		    P_LS_LEN(pscan))
696 		    p->patstartch = *P_LS_STR(pscan);
697 		/*
698 		 * Find the longest literal string in something expensive.
699 		 * This is itself not all that cheap if we have
700 		 * case-insensitive matching or approximation, so don't.
701 		 */
702 		if ((flags & P_HSTART) && !p->globflags) {
703 		    lng = NULL;
704 		    len = 0;
705 		    for (; pscan; pscan = PATNEXT(pscan))
706 			if (P_OP(pscan) == P_EXACTLY &&
707 			    P_LS_LEN(pscan) >= len) {
708 			    lng = P_LS_STR(pscan);
709 			    len = P_LS_LEN(pscan);
710 			}
711 		    if (lng) {
712 			p->mustoff = lng - patout;
713 			p->patmlen = len;
714 		    }
715 		}
716 	    }
717 	}
718     }
719 
720     /*
721      * The pattern was compiled in a fixed buffer:  unless told otherwise,
722      * we stick the compiled pattern on the heap.  This is necessary
723      * for files where we will often be compiling multiple segments at once.
724      * But if we get the ZDUP flag we always put it in zalloc()ed memory.
725      */
726     if (patflags & PAT_ZDUP) {
727 	Patprog newp = (Patprog)zalloc(patsize);
728 	memcpy((char *)newp, (char *)p, patsize);
729 	p = newp;
730     } else if (!(patflags & PAT_STATIC)) {
731 	Patprog newp = (Patprog)zhalloc(patsize);
732 	memcpy((char *)newp, (char *)p, patsize);
733 	p = newp;
734     }
735 
736     if (endexp)
737 	*endexp = patparse;
738 
739     unqueue_signals();
740     return p;
741 }
742 
743 /*
744  * Main body or parenthesized subexpression in pattern
745  * Parenthesis (and any ksh_glob gubbins) will have been removed.
746  */
747 
748 /**/
749 static long
patcompswitch(int paren,int * flagp)750 patcompswitch(int paren, int *flagp)
751 {
752     long starter, br, ender, excsync = 0;
753     int parno = 0;
754     int flags, gfchanged = 0;
755     long savglobflags = (long)patglobflags;
756     Upat ptr;
757 
758     *flagp = 0;
759 
760     if (paren && (patglobflags & GF_BACKREF) && patnpar <= NSUBEXP) {
761 	/*
762 	 * parenthesized:  make an open node.
763 	 * We can only refer to the first nine parentheses.
764 	 * For any others, we just use P_OPEN on its own; there's
765 	 * no gain in arbitrarily limiting the number of parentheses.
766 	 */
767 	parno = patnpar++;
768 	starter = patnode(P_OPEN + parno);
769     } else
770 	starter = 0;
771 
772     br = patnode(P_BRANCH);
773     if (!patcompbranch(&flags, paren))
774 	return 0;
775     if (patglobflags != (int)savglobflags)
776 	gfchanged++;
777     if (starter)
778 	pattail(starter, br);
779     else
780 	starter = br;
781 
782     *flagp |= flags & (P_HSTART|P_PURESTR);
783 
784     while (*patparse == zpc_chars[ZPC_BAR] ||
785 	   (*patparse == zpc_special[ZPC_TILDE] &&
786 	    (patparse[1] == '/' ||
787 	     !memchr(zpc_special, patparse[1], ZPC_SEG_COUNT)))) {
788 	int tilde = *patparse++ == zpc_special[ZPC_TILDE];
789 	long gfnode = 0, newbr;
790 
791 	*flagp &= ~P_PURESTR;
792 
793 	if (tilde) {
794 	    union upat up;
795 	    /* excsync remembers the P_EXCSYNC node before a chain of
796 	     * exclusions:  all point back to this.  only the
797 	     * original (non-excluded) branch gets a trailing P_EXCSYNC.
798 	     */
799 	    if (!excsync) {
800 		excsync = patnode(P_EXCSYNC);
801 		patoptail(br, excsync);
802 	    }
803 	    /*
804 	     * By default, approximations are turned off in exclusions:
805 	     * we need to do this here as otherwise the code compiling
806 	     * the exclusion doesn't know if the flags have really
807 	     * changed if the error count gets restored.
808 	     */
809 	    patglobflags &= ~0xff;
810 	    if (!(patflags & PAT_FILET) || paren) {
811 		br = patnode(P_EXCLUDE);
812 	    } else {
813 		/*
814 		 * At top level (paren == 0) in a file glob !(patflags
815 		 * &PAT_FILET) do the exclusion prepending the file path
816 		 * so far.  We need to flag this to avoid unnecessarily
817 		 * copying the path.
818 		 */
819 		br = patnode(P_EXCLUDP);
820 		patflags |= PAT_HAS_EXCLUDP;
821 	    }
822 	    up.p = NULL;
823 	    patadd((char *)&up, 0, sizeof(up), 0);
824 	    /* / is not treated as special if we are at top level */
825 	    if (!paren && zpc_special[ZPC_SLASH] == '/') {
826 		tilde++;
827 		zpc_special[ZPC_SLASH] = Marker;
828 	    }
829 	} else {
830 	    excsync = 0;
831 	    br = patnode(P_BRANCH);
832 	    /*
833 	     * The position of the following statements means globflags
834 	     * set in the main branch carry over to the exclusion.
835 	     */
836 	    if (!paren) {
837 		patglobflags = 0;
838 		if (((Patprog)patout)->globflags) {
839 		    /*
840 		     * If at top level, we need to reinitialize flags to zero,
841 		     * since (#i)foo|bar only applies to foo and we stuck
842 		     * the #i into the global flags.
843 		     * We could have done it so that they only got set in the
844 		     * first branch, but it's quite convenient having any
845 		     * global flags set in the header and not buried in the
846 		     * pattern.  (Or maybe it isn't and we should
847 		     * forget this bit and always stick in an explicit GFLAGS
848 		     * statement instead of using the header.)
849 		     * Also, this can't happen for file globs where there are
850 		     * no top-level |'s.
851 		     *
852 		     * No gfchanged, as nothing to follow branch at top
853 		     * level.
854 		     */
855 		    union upat up;
856 		    gfnode = patnode(P_GFLAGS);
857 		    up.l = patglobflags;
858 		    patadd((char *)&up, 0, sizeof(union upat), 0);
859 		}
860 	    } else {
861 		patglobflags = (int)savglobflags;
862 	    }
863 	}
864 	newbr = patcompbranch(&flags, paren);
865 	if (tilde == 2) {
866 	    /* restore special treatment of / */
867 	    zpc_special[ZPC_SLASH] = '/';
868 	}
869 	if (!newbr)
870 	    return 0;
871 	if (gfnode)
872 	    pattail(gfnode, newbr);
873 	if (!tilde && patglobflags != (int)savglobflags)
874 	    gfchanged++;
875 	pattail(starter, br);
876 	if (excsync)
877 	    patoptail(br, patnode(P_EXCEND));
878 	*flagp |= flags & P_HSTART;
879     }
880 
881     /*
882      * Make a closing node, hooking it to the end.
883      * Note that we can't optimize P_NOTHING out here, since another
884      * branch at that point would indicate the current choices continue,
885      * which they don't.
886      */
887     ender = patnode(paren ? parno ? P_CLOSE+parno : P_NOTHING : P_END);
888     pattail(starter, ender);
889 
890     /*
891      * Hook the tails of the branches to the closing node,
892      * except for exclusions which terminate where they are.
893      */
894     for (ptr = (Upat)patout + starter; ptr; ptr = PATNEXT(ptr))
895 	if (!P_ISEXCLUDE(ptr))
896 	    patoptail(ptr-(Upat)patout, ender);
897 
898     /* check for proper termination */
899     if ((paren && *patparse++ != Outpar) ||
900 	(!paren && *patparse &&
901 	 !((patflags & PAT_FILE) && *patparse == '/')))
902 	return 0;
903 
904     if (paren && gfchanged) {
905 	/*
906 	 * Restore old values of flags when leaving parentheses.
907 	 * gfchanged detects a change in any branch (except exclusions
908 	 * which are separate), since we need to emit this even if
909 	 * a later branch happened to put the flags back.
910 	 */
911 	pattail(ender, patnode(P_GFLAGS));
912 	patglobflags = (int)savglobflags;
913 	patadd((char *)&savglobflags, 0, sizeof(long), 0);
914     }
915 
916     return starter;
917 }
918 
919 /*
920  * Compile something ended by Bar, Outpar, Tilde, or end of string.
921  * Note the BRANCH or EXCLUDE tag must already have been omitted:
922  * this returns the position of the operand of that.
923  */
924 
925 /**/
926 static long
patcompbranch(int * flagp,int paren)927 patcompbranch(int *flagp, int paren)
928 {
929     long chain, latest = 0, starter;
930     int flags = 0;
931 
932     *flagp = P_PURESTR;
933 
934     starter = chain = 0;
935     while (!memchr(zpc_special, *patparse, ZPC_SEG_COUNT) ||
936 	   (*patparse == zpc_special[ZPC_TILDE] && patparse[1] != '/' &&
937 	    memchr(zpc_special, patparse[1], ZPC_SEG_COUNT))) {
938 	if ((*patparse == zpc_special[ZPC_INPAR] &&
939 	     patparse[1] == zpc_special[ZPC_HASH]) ||
940 	    (*patparse == zpc_special[ZPC_KSH_AT] && patparse[1] == Inpar &&
941 	     patparse[2] == zpc_special[ZPC_HASH])) {
942 	    /* Globbing flags. */
943 	    char *pp1 = patparse;
944 	    int oldglobflags = patglobflags, ignore;
945 	    long assert;
946 	    patparse += (*patparse == '@') ? 3 : 2;
947 	    if (!patgetglobflags(&patparse, &assert, &ignore))
948 		return 0;
949 	    if (!ignore) {
950 		if (assert) {
951 		    /*
952 		     * Start/end assertion looking like flags, but
953 		     * actually handled as a normal node
954 		     */
955 		    latest = patnode(assert);
956 		    flags = 0;
957 		} else {
958 		    if (pp1 == patstart) {
959 			/* Right at start of pattern, the simplest case.
960 			 * Put them into the flags and don't emit anything.
961 			 */
962 			((Patprog)patout)->globflags = patglobflags;
963 			continue;
964 		    } else if (!*patparse) {
965 			/* Right at the end, so just leave the flags for
966 			 * the next Patprog in the chain to pick up.
967 			 */
968 			break;
969 		    }
970 		    /*
971 		     * Otherwise, we have to stick them in as a pattern
972 		     * matching nothing.
973 		     */
974 		    if (oldglobflags != patglobflags) {
975 			/* Flags changed */
976 			union upat up;
977 			latest = patnode(P_GFLAGS);
978 			up.l = patglobflags;
979 			patadd((char *)&up, 0, sizeof(union upat), 0);
980 		    } else {
981 			/* No effect. */
982 			continue;
983 		    }
984 		}
985 	    } else if (!*patparse)
986 		break;
987 	    else
988 		continue;
989 	} else if (*patparse == zpc_special[ZPC_HAT]) {
990 	    /*
991 	     * ^pat:  anything but pat.  For proper backtracking,
992 	     * etc., we turn this into (*~pat), except without the
993 	     * parentheses.
994 	     */
995 	    patparse++;
996 	    latest = patcompnot(0, &flags);
997 	} else
998 	    latest = patcomppiece(&flags, paren);
999 	if (!latest)
1000 	    return 0;
1001 	if (!starter)
1002 	    starter = latest;
1003 	if (!(flags & P_PURESTR))
1004 	    *flagp &= ~P_PURESTR;
1005 	if (!chain)
1006 	    *flagp |= flags & P_HSTART;
1007 	else
1008 	    pattail(chain, latest);
1009 	chain = latest;
1010     }
1011     /* check if there was nothing in the loop, i.e. () */
1012     if (!chain)
1013 	starter = patnode(P_NOTHING);
1014 
1015     return starter;
1016 }
1017 
1018 /* get glob flags, return 1 for success, 0 for failure */
1019 
1020 /**/
1021 int
patgetglobflags(char ** strp,long * assertp,int * ignore)1022 patgetglobflags(char **strp, long *assertp, int *ignore)
1023 {
1024     char *nptr, *ptr = *strp;
1025     zlong ret;
1026 
1027     *assertp = 0;
1028     *ignore = 1;
1029     /* (#X): assumes we are still positioned on the first X */
1030     for (; *ptr && *ptr != Outpar; ptr++) {
1031 	if (*ptr == 'q') {
1032 	    /* Glob qualifiers, ignored in pattern code */
1033 	    while (*ptr && *ptr != Outpar)
1034 		ptr++;
1035 	    break;
1036 	} else {
1037 	    *ignore = 0;
1038 	    switch (*ptr) {
1039 	    case 'a':
1040 		/* Approximate matching, max no. of errors follows */
1041 		ret = zstrtol(++ptr, &nptr, 10);
1042 		/*
1043 		 * We can't have more than 254, because we need 255 to
1044 		 * mark 254 errors in wbranch and exclude sync strings
1045 		 * (hypothetically --- hope no-one tries it).
1046 		 */
1047 		if (ret < 0 || ret > 254 || ptr == nptr)
1048 		    return 0;
1049 		patglobflags = (patglobflags & ~0xff) | (ret & 0xff);
1050 		ptr = nptr-1;
1051 		break;
1052 
1053 	    case 'l':
1054 		/* Lowercase in pattern matches lower or upper in target */
1055 		patglobflags = (patglobflags & ~GF_IGNCASE) | GF_LCMATCHUC;
1056 		break;
1057 
1058 	    case 'i':
1059 		/* Fully case insensitive */
1060 		patglobflags = (patglobflags & ~GF_LCMATCHUC) | GF_IGNCASE;
1061 		break;
1062 
1063 	    case 'I':
1064 		/* Restore case sensitivity */
1065 		patglobflags &= ~(GF_LCMATCHUC|GF_IGNCASE);
1066 		break;
1067 
1068 	    case 'b':
1069 		/* Make backreferences */
1070 		patglobflags |= GF_BACKREF;
1071 		break;
1072 
1073 	    case 'B':
1074 		/* Don't make backreferences */
1075 		patglobflags &= ~GF_BACKREF;
1076 		break;
1077 
1078 	    case 'm':
1079 		/* Make references to complete match */
1080 		patglobflags |= GF_MATCHREF;
1081 		break;
1082 
1083 	    case 'M':
1084 		/* Don't */
1085 		patglobflags &= ~GF_MATCHREF;
1086 		break;
1087 
1088 	    case 's':
1089 		*assertp = P_ISSTART;
1090 		break;
1091 
1092 	    case 'e':
1093 		*assertp = P_ISEND;
1094 		break;
1095 
1096 	    case 'u':
1097 		patglobflags |= GF_MULTIBYTE;
1098 		break;
1099 
1100 	    case 'U':
1101 		patglobflags &= ~GF_MULTIBYTE;
1102 		break;
1103 
1104 	    default:
1105 		return 0;
1106 	    }
1107 	}
1108     }
1109     if (*ptr != Outpar)
1110 	return 0;
1111     /* Start/end assertions must appear on their own. */
1112     if (*assertp && (*strp)[1] != Outpar)
1113 	return 0;
1114     *strp = ptr + 1;
1115     return 1;
1116 }
1117 
1118 
1119 static const char *colon_stuffs[]  = {
1120     "alpha", "alnum", "ascii", "blank", "cntrl", "digit", "graph",
1121     "lower", "print", "punct", "space", "upper", "xdigit", "IDENT",
1122     "IFS", "IFSSPACE", "WORD", "INCOMPLETE", "INVALID", NULL
1123 };
1124 
1125 /*
1126  * Handle the guts of a [:stuff:] character class element.
1127  * start is the beginning of "stuff" and len is its length.
1128  * This code is exported for the benefit of completion matching.
1129  */
1130 
1131 /**/
1132 mod_export int
range_type(char * start,int len)1133 range_type(char *start, int len)
1134 {
1135     const char **csp;
1136 
1137     for (csp = colon_stuffs; *csp; csp++) {
1138 	if (strlen(*csp) == len && !strncmp(start, *csp, len))
1139 		return (csp - colon_stuffs) + PP_FIRST;
1140     }
1141 
1142     return PP_UNKWN;
1143 }
1144 
1145 
1146 /*
1147  * Convert the contents of a [...] or [^...] expression (just the
1148  * ... part) back into a string.  This is used by compfiles -p/-P
1149  * for some reason.  The compiled form (a metafied string) is
1150  * passed in rangestr.
1151  *
1152  * If outstr is non-NULL the compiled form is placed there.  It
1153  * must be sufficiently long.  A terminating NULL is appended.
1154  *
1155  * Return the length required, not including the terminating NULL.
1156  *
1157  * TODO: this is non-multibyte for now.  It will need to be defined
1158  * appropriately with MULTIBYTE_SUPPORT when the completion matching
1159  * code catches up.
1160  */
1161 
1162 /**/
1163 mod_export int
pattern_range_to_string(char * rangestr,char * outstr)1164 pattern_range_to_string(char *rangestr, char *outstr)
1165 {
1166     int len = 0;
1167 
1168     while (*rangestr) {
1169 	if (imeta(STOUC(*rangestr))) {
1170 	    int swtype = STOUC(*rangestr) - STOUC(Meta);
1171 
1172 	    if (swtype == 0) {
1173 		/* Ordindary metafied character */
1174 		if (outstr)
1175 		{
1176 		    *outstr++ = Meta;
1177 		    *outstr++ = rangestr[1] ^ 32;
1178 		}
1179 		len += 2;
1180 		rangestr += 2;
1181 	    } else if (swtype == PP_RANGE) {
1182 		/* X-Y range */
1183 		int i;
1184 
1185 		for (i = 0; i < 2; i++) {
1186 		    if (*rangestr == Meta) {
1187 			if (outstr) {
1188 			    *outstr++ = Meta;
1189 			    *outstr++ = rangestr[1];
1190 			}
1191 			len += 2;
1192 			rangestr += 2;
1193 		    } else {
1194 			if (outstr)
1195 			    *outstr++ = *rangestr;
1196 			len++;
1197 			rangestr++;
1198 		    }
1199 
1200 		    if (i == 0) {
1201 			if (outstr)
1202 			    *outstr++ = '-';
1203 			len++;
1204 		    }
1205 		}
1206 	    } else if (swtype >= PP_FIRST && swtype <= PP_LAST) {
1207 		/* [:stuff:]; we need to output [: and :] */
1208 		const char *found = colon_stuffs[swtype - PP_FIRST];
1209 		int newlen = strlen(found);
1210 		if (outstr) {
1211 		    strcpy(outstr, "[:");
1212 		    outstr += 2;
1213 		    memcpy(outstr, found, newlen);
1214 		    outstr += newlen;
1215 		    strcpy(outstr, ":]");
1216 		    outstr += 2;
1217 		}
1218 		len += newlen + 4;
1219 		rangestr++;
1220 	    } else {
1221 		/* shouldn't happen */
1222 		DPUTS(1, "BUG: unknown PP_ code in pattern range");
1223 		rangestr++;
1224 	    }
1225 	} else {
1226 	    /* ordinary character, guaranteed no Meta handling needed */
1227 	    if (outstr)
1228 		*outstr++ = *rangestr;
1229 	    len++;
1230 	    rangestr++;
1231 	}
1232     }
1233 
1234     if (outstr)
1235 	*outstr = '\0';
1236     return len;
1237 }
1238 
1239 /*
1240  * compile a chunk such as a literal string or a [...] followed
1241  * by a possible hash operator
1242  */
1243 
1244 /**/
1245 static long
patcomppiece(int * flagp,int paren)1246 patcomppiece(int *flagp, int paren)
1247 {
1248     long starter = 0, next, op, opnd;
1249     int flags, flags2, kshchar, len, ch, patch, nmeta;
1250     int hash, count;
1251     union upat up;
1252     char *nptr, *str0, *ptr, *patprev;
1253     zrange_t from, to;
1254     char *charstart;
1255 
1256     flags = 0;
1257     str0 = patprev = patparse;
1258     for (;;) {
1259 	/*
1260 	 * Check if we have a string. First, we need to make sure
1261 	 * the string doesn't introduce a ksh-like parenthesized expression.
1262 	 */
1263 	kshchar = '\0';
1264 	if (*patparse && patparse[1] == Inpar) {
1265 	    if (*patparse == zpc_special[ZPC_KSH_PLUS])
1266 		kshchar = STOUC('+');
1267 	    else if (*patparse == zpc_special[ZPC_KSH_BANG])
1268 		kshchar = STOUC('!');
1269 	    else if (*patparse == zpc_special[ZPC_KSH_BANG2])
1270 		kshchar = STOUC('!');
1271 	    else if (*patparse == zpc_special[ZPC_KSH_AT])
1272 		kshchar = STOUC('@');
1273 	    else if (*patparse == zpc_special[ZPC_KSH_STAR])
1274 		kshchar = STOUC('*');
1275 	    else if (*patparse == zpc_special[ZPC_KSH_QUEST])
1276 		kshchar = STOUC('?');
1277 	}
1278 
1279 	/*
1280 	 * If '(' is disabled as a pattern char, allow ')' as
1281 	 * an ordinary string character if there are no parentheses to
1282 	 * close.  Don't allow it otherwise, it changes the syntax.
1283 	 */
1284 	if (zpc_special[ZPC_INPAR] != Marker || *patparse != Outpar ||
1285 	    paren) {
1286 	    /*
1287 	     * End of string (or no string at all) if ksh-type parentheses,
1288 	     * or special character, unless that character is a tilde and
1289 	     * the character following is an end-of-segment character.  Thus
1290 	     * tildes are not special if there is nothing following to
1291 	     * be excluded.
1292 	     *
1293 	     * Don't look for X()-style kshglobs at this point; we've
1294 	     * checked above for the case with parentheses and we don't
1295 	     * want to match without parentheses.
1296 	     */
1297 	    if (kshchar ||
1298 		(memchr(zpc_special, *patparse, ZPC_NO_KSH_GLOB) &&
1299 		 (*patparse != zpc_special[ZPC_TILDE] ||
1300 		  patparse[1] == '/' ||
1301 		  !memchr(zpc_special, patparse[1], ZPC_SEG_COUNT)))) {
1302 		break;
1303 	    }
1304     	}
1305 
1306 	/* Remember the previous character for backtracking */
1307 	patprev = patparse;
1308 	METACHARINC(patparse);
1309     }
1310 
1311     if (patparse > str0) {
1312 	long slen = patparse - str0;
1313 	int morelen;
1314 
1315 	/* Ordinary string: cancel kshchar lookahead */
1316 	kshchar = '\0';
1317 	/*
1318 	 * Assume it matches a simple string until we find otherwise.
1319 	 */
1320 	flags |= P_PURESTR;
1321 	DPUTS(patparse == str0, "BUG: matched nothing in patcomppiece.");
1322 	/* more than one character matched? */
1323 	morelen = (patprev > str0);
1324 	/*
1325 	 * If we have more than one character, a following hash
1326 	 * or (#c...) only applies to the last, so backtrack one character.
1327 	 */
1328 	if ((*patparse == zpc_special[ZPC_HASH] ||
1329 	     (*patparse == zpc_special[ZPC_INPAR] &&
1330 	      patparse[1] == zpc_special[ZPC_HASH] &&
1331 	      patparse[2] == 'c') ||
1332 	     (*patparse == zpc_special[ZPC_KSH_AT] &&
1333 	      patparse[1] == Inpar &&
1334 	      patparse[2] == zpc_special[ZPC_HASH] &&
1335 	      patparse[3] == 'c')) && morelen)
1336 	    patparse = patprev;
1337 	/*
1338 	 * If len is 1, we can't have an active # following, so doesn't
1339 	 * matter that we don't make X in `XX#' simple.
1340 	 */
1341 	if (!morelen)
1342 	    flags |= P_SIMPLE;
1343 	starter = patnode(P_EXACTLY);
1344 
1345 	/* Get length of string without metafication. */
1346 	nmeta = 0;
1347 	/* inherited from domatch, but why, exactly? */
1348 	if (*str0 == Nularg)
1349 	    str0++;
1350 	for (ptr = str0; ptr < patparse; ptr++) {
1351 	    if (*ptr == Meta) {
1352 		nmeta++;
1353 		ptr++;
1354 	    }
1355 	}
1356 	slen = (patparse - str0) - nmeta;
1357 	/* First add length, which is a long */
1358 	patadd((char *)&slen, 0, sizeof(long), 0);
1359 	/*
1360 	 * Then the string, not null terminated.
1361 	 * Unmetafy and untokenize; pass the final length,
1362 	 * which is what we need to allocate, i.e. not including
1363 	 * a count for each Meta in the string.
1364 	 */
1365 	patadd(str0, 0, slen, PA_UNMETA);
1366 	nptr = P_LS_STR((Upat)patout + starter);
1367 	/*
1368 	 * It's much simpler to turn off pure string mode for
1369 	 * any case-insensitive or approximate matching; usually,
1370 	 * that is correct, or they wouldn't have been turned on.
1371 	 * However, we need to make sure we match a "." or ".."
1372 	 * in a file name as a pure string.  There's a minor bug
1373 	 * that this will also apply to something like
1374 	 * ..(#a1).. (i.e. the (#a1) has no effect), but if you're
1375 	 * going to write funny patterns, you get no sympathy from me.
1376 	 */
1377 	if (patglobflags &
1378 #ifdef __CYGWIN__
1379 	    /*
1380 	     * As above: don't use pattern matching for files
1381 	     * just because of case insensitivity if file system
1382 	     * is known to be case insensitive.
1383 	     *
1384 	     * This is known to be necessary in at least one case:
1385 	     * if "mount -c /" is in effect, so that drives appear
1386 	     * directly under / instead of the usual /cygdrive, they
1387 	     * aren't shown by readdir().  So it's vital we don't use
1388 	     * globbing to find "/c", since that'll fail.
1389 	     */
1390 	    ((patflags & PAT_FILE) ?
1391 	    (0xFF|GF_LCMATCHUC) :
1392 	    (0xFF|GF_LCMATCHUC|GF_IGNCASE))
1393 #else
1394 	    (0xFF|GF_LCMATCHUC|GF_IGNCASE)
1395 #endif
1396 	    ) {
1397 	    if (!(patflags & PAT_FILE))
1398 		flags &= ~P_PURESTR;
1399 	    else if (!(nptr[0] == '.' &&
1400 		       (slen == 1 || (nptr[1] == '.' && slen == 2))))
1401 		flags &= ~P_PURESTR;
1402 	}
1403     } else {
1404 	if (kshchar)
1405 	    patparse++;
1406 
1407 	patch = *patparse;
1408 	METACHARINC(patparse);
1409 	switch(patch) {
1410 	case Quest:
1411 	    DPUTS(zpc_special[ZPC_QUEST] == Marker,
1412 		  "Treating '?' as pattern character although disabled");
1413 	    flags |= P_SIMPLE;
1414 	    starter = patnode(P_ANY);
1415 	    break;
1416 	case Star:
1417 	    DPUTS(zpc_special[ZPC_STAR] == Marker,
1418 		  "Treating '*' as pattern character although disabled");
1419 	    /* kshchar is used as a sign that we can't have #'s. */
1420 	    kshchar = -1;
1421 	    starter = patnode(P_STAR);
1422 	    break;
1423 	case Inbrack:
1424 	    DPUTS(zpc_special[ZPC_INBRACK] == Marker,
1425 		  "Treating '[' as pattern character although disabled");
1426 	    flags |= P_SIMPLE;
1427 	    if (*patparse == Hat || *patparse == Bang) {
1428 		patparse++;
1429 		starter = patnode(P_ANYBUT);
1430 	    } else
1431 		starter = patnode(P_ANYOF);
1432 	    /*
1433 	     * []...] means match a "]" or other included characters.
1434 	     * However, to be a bit helpful and for compatibility
1435 	     * with other shells, don't take in that sense if
1436 	     * there's no further "]".  That's still imperfect,
1437 	     * but it's all we can do --- we're required to
1438 	     * treat [$var]*[$var]with empty var as [ ... ]
1439 	     * containing "]*[".
1440 	     */
1441 	    if (*patparse == Outbrack && strchr(patparse+1, Outbrack)) {
1442 		patparse++;
1443 		patadd(NULL, ']', 1, PA_NOALIGN);
1444 	    }
1445 	    while (*patparse && *patparse != Outbrack) {
1446 		/* Meta is not a token */
1447 		if (*patparse == Inbrack && patparse[1] == ':' &&
1448 			(nptr = strchr(patparse+2, ':')) &&
1449 			nptr[1] == Outbrack) {
1450 			/* Posix range. */
1451 			patparse += 2;
1452 			len = nptr - patparse;
1453 			ch = range_type(patparse, len);
1454 			patparse = nptr + 2;
1455 			if (ch != PP_UNKWN)
1456 			    patadd(NULL, STOUC(Meta) + ch, 1, PA_NOALIGN);
1457 			continue;
1458 		}
1459 		charstart = patparse;
1460 		METACHARINC(patparse);
1461 
1462 		if (*patparse == Dash && patparse[1] &&
1463 		    patparse[1] != Outbrack) {
1464 		    patadd(NULL, STOUC(Meta)+PP_RANGE, 1, PA_NOALIGN);
1465 		    if (itok(*charstart)) {
1466 			patadd(0, STOUC(ztokens[*charstart - Pound]), 1,
1467 			       PA_NOALIGN);
1468 		    } else {
1469 			patadd(charstart, 0, patparse-charstart, PA_NOALIGN);
1470 		    }
1471 		    charstart = ++patparse;	/* skip Dash token */
1472 		    METACHARINC(patparse);
1473 		}
1474 		if (itok(*charstart)) {
1475 		    patadd(0, STOUC(ztokens[*charstart - Pound]), 1,
1476 			   PA_NOALIGN);
1477 		} else {
1478 		    patadd(charstart, 0, patparse-charstart, PA_NOALIGN);
1479 		}
1480 	    }
1481 	    if (*patparse != Outbrack)
1482 		return 0;
1483 	    patparse++;
1484 	    /* terminate null string and fix alignment */
1485 	    patadd(NULL, 0, 1, 0);
1486 	    break;
1487 	case Inpar:
1488 	    DPUTS(!kshchar && zpc_special[ZPC_INPAR] == Marker,
1489 		  "Treating '(' as pattern character although disabled");
1490 	    DPUTS(isset(SHGLOB) && !kshchar,
1491 		  "Treating bare '(' as pattern character with SHGLOB");
1492 	    if (kshchar == '!') {
1493 		/* This is nasty, we should really either handle all
1494 		 * kshglobbing below or here.  But most of the
1495 		 * others look like non-ksh patterns, while this one
1496 		 * doesn't, so we handle it here and leave the rest.
1497 		 * We treat it like an extendedglob ^, except that
1498 		 * it goes into parentheses.
1499 		 *
1500 		 * If we did do kshglob here, we could support
1501 		 * the old behaviour that things like !(foo)##
1502 		 * work, but it makes the code more complicated at
1503 		 * the expense of allowing the user to do things
1504 		 * they shouldn't.
1505 		 */
1506 		if (!(starter = patcompnot(1, &flags2)))
1507 		    return 0;
1508 	    } else if (!(starter = patcompswitch(1, &flags2)))
1509 		return 0;
1510 	    flags |= flags2 & P_HSTART;
1511 	    break;
1512 	case Inang:
1513 	    /* Numeric glob */
1514 	    DPUTS(zpc_special[ZPC_INANG] == Marker,
1515 		  "Treating '<' as pattern character although disabled");
1516 	    DPUTS(isset(SHGLOB), "Treating <..> as numeric range with SHGLOB");
1517 	    len = 0;		/* beginning present 1, end present 2 */
1518 	    if (idigit(*patparse)) {
1519 		from = (zrange_t) zstrtol((char *)patparse,
1520 					 (char **)&nptr, 10);
1521 		patparse = nptr;
1522 		len |= 1;
1523 	    }
1524 	    DPUTS(!IS_DASH(*patparse), "BUG: - missing from numeric glob");
1525 	    patparse++;
1526 	    if (idigit(*patparse)) {
1527 		to = (zrange_t) zstrtol((char *)patparse,
1528 					  (char **)&nptr, 10);
1529 		patparse = nptr;
1530 		len |= 2;
1531 	    }
1532 	    if (*patparse != Outang)
1533 		return 0;
1534 	    patparse++;
1535 	    switch(len) {
1536 	    case 3:
1537 		starter = patnode(P_NUMRNG);
1538 		patadd((char *)&from, 0, sizeof(from), 0);
1539 		patadd((char *)&to, 0, sizeof(to), 0);
1540 		break;
1541 	    case 2:
1542 		starter = patnode(P_NUMTO);
1543 		patadd((char *)&to, 0, sizeof(to), 0);
1544 		break;
1545 	    case 1:
1546 		starter = patnode(P_NUMFROM);
1547 		patadd((char *)&from, 0, sizeof(from), 0);
1548 		break;
1549 	    case 0:
1550 		starter = patnode(P_NUMANY);
1551 		break;
1552 	    }
1553 	    /* This can't be simple, because it isn't.
1554 	     * Mention in manual that matching digits with [...]
1555 	     * is more efficient.
1556 	     */
1557 	    break;
1558 	case Pound:
1559 	    DPUTS(zpc_special[ZPC_HASH] == Marker,
1560 		  "Treating '#' as pattern character although disabled");
1561 	    DPUTS(!isset(EXTENDEDGLOB), "BUG: # not treated as string");
1562 	    /*
1563 	     * A hash here is an error; it should follow something
1564 	     * repeatable.
1565 	     */
1566 	    return 0;
1567 	    break;
1568 	case Bnullkeep:
1569 	    /*
1570 	     * Marker for restoring a backslash in output:
1571 	     * does not match a character.
1572 	     */
1573 	    next = patcomppiece(flagp, paren);
1574 	    /*
1575 	     * Can't match a pure string since we need to do this
1576 	     * as multiple chunks.
1577 	     */
1578 	    *flagp &= ~P_PURESTR;
1579 	    return next;
1580 	    break;
1581 #ifdef DEBUG
1582 	default:
1583 	    dputs("BUG: character not handled in patcomppiece");
1584 	    return 0;
1585 	    break;
1586 #endif
1587 	}
1588     }
1589 
1590     count = 0;
1591     if (!(hash = (*patparse == zpc_special[ZPC_HASH])) &&
1592 	!(count = ((*patparse == zpc_special[ZPC_INPAR] &&
1593 		    patparse[1] == zpc_special[ZPC_HASH] &&
1594 		    patparse[2] == 'c') ||
1595 		   (*patparse == zpc_special[ZPC_KSH_AT] &&
1596 		    patparse[1] == Inpar &&
1597 		    patparse[2] == zpc_special[ZPC_HASH] &&
1598 		    patparse[3] == 'c'))) &&
1599 	(kshchar <= 0 || kshchar == '@' || kshchar == '!')) {
1600 	*flagp = flags;
1601 	return starter;
1602     }
1603 
1604     /* too much at once doesn't currently work */
1605     if (kshchar && (hash || count))
1606 	return 0;
1607 
1608     if (kshchar == '*') {
1609 	op = P_ONEHASH;
1610 	*flagp = P_HSTART;
1611     } else if (kshchar == '+') {
1612 	op = P_TWOHASH;
1613 	*flagp = P_HSTART;
1614     } else if (kshchar == '?') {
1615 	op = 0;
1616 	*flagp = 0;
1617     } else if (count) {
1618 	op = P_COUNT;
1619 	patparse += 3;
1620 	*flagp = P_HSTART;
1621     } else if (*++patparse == zpc_special[ZPC_HASH]) {
1622 	op = P_TWOHASH;
1623 	patparse++;
1624 	*flagp = P_HSTART;
1625     } else {
1626 	op = P_ONEHASH;
1627 	*flagp = P_HSTART;
1628     }
1629 
1630     /*
1631      * Note optimizations with pointers into P_NOTHING branches:  some
1632      * should logically point to next node after current piece.
1633      *
1634      * Backtracking is also encoded in a slightly obscure way:  the
1635      * code emitted ensures we test the non-empty branch of complex
1636      * patterns before the empty branch on each repetition.  Hence
1637      * each time we fail on a non-empty branch, we try the empty branch,
1638      * which is equivalent to backtracking.
1639      */
1640     if (op == P_COUNT) {
1641 	/* (#cN,M) */
1642 	union upat countargs[P_CT_OPERAND];
1643 	char *opp = patparse;
1644 
1645 	countargs[0].l = P_COUNT;
1646 	countargs[P_CT_CURRENT].l = 0L;
1647 	countargs[P_CT_MIN].l = (long)zstrtol(patparse, &patparse, 10);
1648 	if (patparse == opp) {
1649 	    /* missing number treated as zero */
1650 	    countargs[P_CT_MIN].l = 0L;
1651 	}
1652 	if (*patparse != ',' && *patparse != Comma) {
1653 	    /* either max = min or error */
1654 	    if (*patparse != Outpar)
1655 		return 0;
1656 	    countargs[P_CT_MAX].l = countargs[P_CT_MIN].l;
1657 	} else {
1658 	    opp = ++patparse;
1659 	    countargs[P_CT_MAX].l = (long)zstrtol(patparse, &patparse, 10);
1660 	    if (*patparse != Outpar)
1661 		return 0;
1662 	    if (patparse == opp) {
1663 		/* missing number treated as infinity: record as -1 */
1664 		countargs[P_CT_MAX].l = -1L;
1665 	    }
1666 	}
1667 	patparse++;
1668 	countargs[P_CT_PTR].p = NULL;
1669 	/* Mark this chain as a min/max count... */
1670 	patinsert(P_COUNTSTART, starter, (char *)countargs, sizeof(countargs));
1671 	/*
1672 	 * The next of the operand is a loop back to the P_COUNT.  This is
1673 	 * how we get recursion for the count.  We don't loop back to
1674 	 * the P_COUNTSTART; that's used for initialising the count
1675 	 * and saving and restoring the count for any enclosing use
1676 	 * of the match.
1677 	 */
1678 	opnd = P_OPERAND(starter) + P_CT_OPERAND;
1679 	pattail(opnd, patnode(P_BACK));
1680 	pattail(opnd, P_OPERAND(starter));
1681 	/*
1682 	 * The next of the counter operators is what follows the
1683 	 * closure.
1684 	 * This handles matching of the tail.
1685 	 */
1686 	next = patnode(P_NOTHING);
1687 	pattail(starter, next);
1688 	pattail(P_OPERAND(starter), next);
1689     } else if ((flags & P_SIMPLE) && (op == P_ONEHASH || op == P_TWOHASH) &&
1690 	P_OP((Upat)patout+starter) == P_ANY) {
1691 	/* Optimize ?# to *.  Silly thing to do, since who would use
1692 	 * use ?# ? But it makes the later code shorter.
1693 	 */
1694 	Upat uptr = (Upat)patout + starter;
1695 	if (op == P_TWOHASH) {
1696 	    /* ?## becomes ?* */
1697 	    uptr->l = (uptr->l & ~0xff) | P_ANY;
1698 	    pattail(starter, patnode(P_STAR));
1699 	} else {
1700 	    uptr->l = (uptr->l & ~0xff) | P_STAR;
1701 	}
1702     } else if ((flags & P_SIMPLE) && op && !(patglobflags & 0xff)) {
1703 	/* Simplify, but not if we need to look for approximations. */
1704 	patinsert(op, starter, NULL, 0);
1705     } else if (op == P_ONEHASH) {
1706 	/* Emit x# as (x&|), where & means "self". */
1707 	up.p = NULL;
1708 	patinsert(P_WBRANCH, starter, (char *)&up, sizeof(up));
1709 	                                      /* Either x */
1710 	patoptail(starter, patnode(P_BACK));  /* and loop */
1711 	patoptail(starter, starter);	      /* back */
1712 	pattail(starter, patnode(P_BRANCH));  /* or */
1713 	pattail(starter, patnode(P_NOTHING)); /* null. */
1714     } else if (op == P_TWOHASH) {
1715 	/* Emit x## as x(&|) where & means "self". */
1716 	next = patnode(P_WBRANCH);	      /* Either */
1717 	up.p = NULL;
1718 	patadd((char *)&up, 0, sizeof(up), 0);
1719 	pattail(starter, next);
1720 	pattail(patnode(P_BACK), starter);    /* loop back */
1721 	pattail(next, patnode(P_BRANCH));     /* or */
1722 	pattail(starter, patnode(P_NOTHING)); /* null. */
1723     } else if (kshchar == '?') {
1724 	/* Emit ?(x) as (x|) */
1725 	patinsert(P_BRANCH, starter, NULL, 0); /* Either x */
1726 	pattail(starter, patnode(P_BRANCH));   /* or */
1727 	next = patnode(P_NOTHING);	       /* null */
1728 	pattail(starter, next);
1729 	patoptail(starter, next);
1730     }
1731     if (*patparse == zpc_special[ZPC_HASH])
1732 	return 0;
1733 
1734     return starter;
1735 }
1736 
1737 /*
1738  * Turn a ^foo (paren = 0) or !(foo) (paren = 1) into *~foo with
1739  * parentheses if necessary.   As you see, that's really quite easy.
1740  */
1741 
1742 /**/
1743 static long
patcompnot(int paren,int * flagsp)1744 patcompnot(int paren, int *flagsp)
1745 {
1746     union upat up;
1747     long excsync, br, excl, n, starter;
1748     int dummy;
1749 
1750     /* Here, we're matching a star at the start. */
1751     *flagsp = P_HSTART;
1752 
1753     starter = patnode(P_BRANCH);
1754     br = patnode(P_STAR);
1755     excsync = patnode(P_EXCSYNC);
1756     pattail(br, excsync);
1757     pattail(starter, excl = patnode(P_EXCLUDE));
1758     up.p = NULL;
1759     patadd((char *)&up, 0, sizeof(up), 0);
1760     if (!(br = (paren ? patcompswitch(1, &dummy) : patcompbranch(&dummy, 0))))
1761 	return 0;
1762     pattail(br, patnode(P_EXCEND));
1763     n = patnode(P_NOTHING); /* just so much easier */
1764     pattail(excsync, n);
1765     pattail(excl, n);
1766 
1767     return starter;
1768 }
1769 
1770 /* Emit a node */
1771 
1772 /**/
1773 static long
patnode(long op)1774 patnode(long op)
1775 {
1776     long starter = (Upat)patcode - (Upat)patout;
1777     union upat up;
1778 
1779     up.l = op;
1780     patadd((char *)&up, 0, sizeof(union upat), 0);
1781     return starter;
1782 }
1783 
1784 /*
1785  * insert an operator in front of an already emitted operand:
1786  * we relocate the operand.  there had better be nothing else after.
1787  */
1788 
1789 /**/
1790 static void
patinsert(long op,int opnd,char * xtra,int sz)1791 patinsert(long op, int opnd, char *xtra, int sz)
1792 {
1793     char *src, *dst, *opdst;
1794     union upat buf, *lptr;
1795 
1796     buf.l = 0;
1797     patadd((char *)&buf, 0, sizeof(buf), 0);
1798     if (sz)
1799 	patadd(xtra, 0, sz, 0);
1800     src = patcode - sizeof(union upat) - sz;
1801     dst = patcode;
1802     opdst = patout + opnd * sizeof(union upat);
1803     while (src > opdst)
1804 	*--dst = *--src;
1805 
1806     /* A cast can't be an lvalue */
1807     lptr = (Upat)opdst;
1808     lptr->l = op;
1809     opdst += sizeof(union upat);
1810     while (sz--)
1811 	*opdst++ = *xtra++;
1812 }
1813 
1814 /* set the 'next' pointer at the end of a node chain */
1815 
1816 /**/
1817 static void
pattail(long p,long val)1818 pattail(long p, long val)
1819 {
1820     Upat scan, temp;
1821     long offset;
1822 
1823     scan = (Upat)patout + p;
1824     for (;;) {
1825 	if (!(temp = PATNEXT(scan)))
1826 	    break;
1827 	scan = temp;
1828     }
1829 
1830     offset = (P_OP(scan) == P_BACK)
1831 	? (scan - (Upat)patout) - val : val - (scan - (Upat)patout);
1832 
1833     scan->l |= offset << 8;
1834 }
1835 
1836 /* do pattail, but on operand of first argument; nop if operandless */
1837 
1838 /**/
1839 static void
patoptail(long p,long val)1840 patoptail(long p, long val)
1841 {
1842     Upat ptr = (Upat)patout + p;
1843     int op = P_OP(ptr);
1844     if (!p || !P_ISBRANCH(ptr))
1845 	return;
1846     if (op == P_BRANCH)
1847 	pattail(P_OPERAND(p), val);
1848     else
1849 	pattail(P_OPERAND(p) + 1, val);
1850 }
1851 
1852 
1853 /*
1854  * Run a pattern.
1855  */
1856 struct rpat {
1857     char *patinstart;		/* Start of input string */
1858     char *patinend;		/* End of input string */
1859     char *patinput;		/* String input pointer */
1860     char *patinpath;		/* Full path for use with ~ exclusions */
1861     int   patinlen;		/* Length of last successful match.
1862 				 * Includes count of Meta characters.
1863 				 */
1864 
1865     char *patbeginp[NSUBEXP];	/* Pointer to backref beginnings */
1866     char *patendp[NSUBEXP];	/* Pointer to backref ends */
1867     int parsfound;		/* parentheses (with backrefs) found */
1868 
1869     int globdots;		/* Glob initial dots? */
1870 };
1871 
1872 static struct rpat pattrystate;
1873 
1874 #define patinstart	(pattrystate.patinstart)
1875 #define patinend	(pattrystate.patinend)
1876 #define patinput	(pattrystate.patinput)
1877 #define patinpath	(pattrystate.patinpath)
1878 #define patinlen	(pattrystate.patinlen)
1879 #define patbeginp	(pattrystate.patbeginp)
1880 #define patendp		(pattrystate.patendp)
1881 #define parsfound	(pattrystate.parsfound)
1882 #define globdots	(pattrystate.globdots)
1883 
1884 
1885 /*
1886  * Character functions operating on unmetafied strings.
1887  */
1888 #ifdef MULTIBYTE_SUPPORT
1889 
1890 /* Get a character from the start point in a string */
1891 #define CHARREF(x, y)	charref((x), (y), (int *)NULL)
1892 static wchar_t
charref(char * x,char * y,int * zmb_ind)1893 charref(char *x, char *y, int *zmb_ind)
1894 {
1895     wchar_t wc;
1896     size_t ret;
1897 
1898     if (!(patglobflags & GF_MULTIBYTE) || !(STOUC(*x) & 0x80))
1899 	return (wchar_t) STOUC(*x);
1900 
1901     ret = mbrtowc(&wc, x, y-x, &shiftstate);
1902 
1903     if (ret == MB_INVALID || ret == MB_INCOMPLETE) {
1904 	/* Error. */
1905 	/* Reset the shift state for next time. */
1906 	memset(&shiftstate, 0, sizeof(shiftstate));
1907 	if (zmb_ind)
1908 	    *zmb_ind = (ret == MB_INVALID) ? ZMB_INVALID : ZMB_INCOMPLETE;
1909 	return WCHAR_INVALID(*x);
1910     }
1911 
1912     if (zmb_ind)
1913 	*zmb_ind = ZMB_VALID;
1914     return wc;
1915 }
1916 
1917 /* Get  a pointer to the next character */
1918 #define CHARNEXT(x, y)	charnext((x), (y))
1919 static char *
charnext(char * x,char * y)1920 charnext(char *x, char *y)
1921 {
1922     wchar_t wc;
1923     size_t ret;
1924 
1925     if (!(patglobflags & GF_MULTIBYTE) || !(STOUC(*x) & 0x80))
1926 	return x + 1;
1927 
1928     ret = mbrtowc(&wc, x, y-x, &shiftstate);
1929 
1930     if (ret == MB_INVALID || ret == MB_INCOMPLETE) {
1931 	/* Error.  Treat as single byte. */
1932 	/* Reset the shift state for next time. */
1933 	memset(&shiftstate, 0, sizeof(shiftstate));
1934 	return x + 1;
1935     }
1936 
1937     /* Nulls here are normal characters */
1938     return x + (ret ? ret : 1);
1939 }
1940 
1941 /* Increment a pointer past the current character. */
1942 #define CHARINC(x, y)	((x) = charnext((x), (y)))
1943 
1944 
1945 /* Get a character and increment */
1946 #define CHARREFINC(x, y, z)	charrefinc(&(x), (y), (z))
1947 static wchar_t
charrefinc(char ** x,char * y,int * z)1948 charrefinc(char **x, char *y, int *z)
1949 {
1950     wchar_t wc;
1951     size_t ret;
1952 
1953     if (!(patglobflags & GF_MULTIBYTE) || !(STOUC(**x) & 0x80))
1954 	return (wchar_t) STOUC(*(*x)++);
1955 
1956     ret = mbrtowc(&wc, *x, y-*x, &shiftstate);
1957 
1958     if (ret == MB_INVALID || ret == MB_INCOMPLETE) {
1959 	/* Error.  Treat as single byte, but flag. */
1960 	*z = 1;
1961 	/* Reset the shift state for next time. */
1962 	memset(&shiftstate, 0, sizeof(shiftstate));
1963 	return WCHAR_INVALID(*(*x)++);
1964     }
1965 
1966     /* Nulls here are normal characters */
1967     *x += ret ? ret : 1;
1968 
1969     return wc;
1970 }
1971 
1972 
1973 /*
1974  * Counter the number of characters between two pointers, smaller first
1975  *
1976  * This is used when setting values in parameters, so we obey
1977  * the MULTIBYTE option (even if it's been overridden locally).
1978  */
1979 #define CHARSUB(x,y)	charsub(x, y)
1980 static ptrdiff_t
charsub(char * x,char * y)1981 charsub(char *x, char *y)
1982 {
1983     ptrdiff_t res = 0;
1984     size_t ret;
1985     wchar_t wc;
1986 
1987     if (!isset(MULTIBYTE))
1988 	return y - x;
1989 
1990     while (x < y) {
1991 	ret = mbrtowc(&wc, x, y-x, &shiftstate);
1992 
1993 	if (ret == MB_INVALID || ret == MB_INCOMPLETE) {
1994 	    /* Error.  Treat remainder as single characters */
1995 	    return res + (y - x);
1996 	}
1997 
1998 	/* Treat nulls as normal characters */
1999 	if (!ret)
2000 	    ret = 1;
2001 	res++;
2002 	x += ret;
2003     }
2004 
2005     return res;
2006 }
2007 
2008 #else /* no MULTIBYTE_SUPPORT */
2009 
2010 /* Get a character from the start point in a string */
2011 #define CHARREF(x, y)	(STOUC(*(x)))
2012 /* Get  a pointer to the next character */
2013 #define CHARNEXT(x, y)	((x)+1)
2014 /* Increment a pointer past the current character. */
2015 #define CHARINC(x, y)	((x)++)
2016 /* Get a character and increment */
2017 #define CHARREFINC(x, y, z)	(STOUC(*(x)++))
2018 /* Counter the number of characters between two pointers, smaller first */
2019 #define CHARSUB(x,y)	((y) - (x))
2020 
2021 #endif /* MULTIBYTE_SUPPORT */
2022 
2023 /*
2024  * The following need to be accessed in the globbing scanner for
2025  * a multi-component file path.  See horror story in glob.c.
2026  */
2027 /**/
2028 int errsfound;				/* Total error count so far */
2029 
2030 /**/
2031 int forceerrs;				/* Forced maximum error count */
2032 
2033 /*
2034  * exactpos is used to remember how far down an exact string we have
2035  * matched, if we are doing approximation and can therefore redo from
2036  * the same point; we never need to otherwise.
2037  *
2038  * exactend is a pointer to the end of the string, which isn't
2039  * null-terminated.
2040  */
2041 static char *exactpos, *exactend;
2042 
2043 /**/
2044 void
pattrystart(void)2045 pattrystart(void)
2046 {
2047     forceerrs = -1;
2048     errsfound = 0;
2049 }
2050 
2051 /*
2052  * Fix up string length stuff.
2053  *
2054  * If we call patallocstr() with "force" to set things up early, it's
2055  * done there, else it's done in pattryrefs().  The reason for the
2056  * difference is in the latter case we may not be relying on
2057  * patallocstr() having an effect.
2058  */
2059 
2060 /**/
2061 static void
patmungestring(char ** string,int * stringlen,int * unmetalenin)2062 patmungestring(char **string, int *stringlen, int *unmetalenin)
2063 {
2064     /*
2065      * Special signalling of empty tokenised string.
2066      */
2067     if (*stringlen > 0 && **string == Nularg) {
2068 	(*string)++;
2069 	/*
2070 	 * If we don't have an unmetafied length
2071 	 * and need it (we may not) we'll get it later.
2072 	 */
2073 	if (*unmetalenin > 0)
2074 	    (*unmetalenin)--;
2075 	if (*stringlen > 0)
2076 	    (*stringlen)--;
2077     }
2078 
2079     /* Ensure we have a metafied length */
2080     if (*stringlen < 0)
2081 	*stringlen = strlen(*string);
2082 }
2083 
2084 /*
2085  * Allocate memory for pattern match.  Note this is specific to use
2086  * of pattern *and* trial string.
2087  *
2088  * Unmetafy a trial string for use in pattern matching, if needed.
2089  *
2090  * If it is needed, returns a heap allocated string; if not needed,
2091  * returns NULL.
2092  *
2093  * prog is the pattern to be executed.
2094  * string is the metafied trial string.
2095  * stringlen is it's length; it will be calculated if it's negative
2096  *   (this is a simple strlen()).
2097  * unmetalen is the unmetafied length of the string, may be -1.
2098  * force is 1 if we always unmetafy: this is useful if we are going
2099  *   to try again with different versions of the string.  If this is
2100  *   called from pattryrefs() we don't force unmetafication as it won't
2101  *   be optimal.  This option should be used if the resulting
2102  *   patstralloc is going to be passed to pattrylen() / pattryrefs().
2103  * In patstralloc (supplied by caller, must last until last pattry is done)
2104  *  unmetalen is the unmetafied length of the string; it will be
2105  *    calculated if the input value is negative.
2106  *  unmetalenp is the umetafied length of a path segment preceding
2107  *    the trial string needed for file mananagement; it is calculated as
2108  *    needed so does not need to be initialised.
2109  *  alloced is the memory allocated on the heap --- same as return value from
2110  *    function.
2111  */
2112 /**/
2113 mod_export
patallocstr(Patprog prog,char * string,int stringlen,int unmetalen,int force,Patstralloc patstralloc)2114 char *patallocstr(Patprog prog, char *string, int stringlen, int unmetalen,
2115 		  int force, Patstralloc patstralloc)
2116 {
2117     int needfullpath;
2118 
2119     if (force)
2120 	patmungestring(&string, &stringlen, &unmetalen);
2121 
2122     /*
2123      * For a top-level ~-exclusion, we will need the full
2124      * path to exclude, so copy the path so far and append the
2125      * current test string.
2126      */
2127     needfullpath = (prog->flags & PAT_HAS_EXCLUDP) && pathpos;
2128 
2129     /* Get the length of the full string when unmetafied. */
2130     if (unmetalen < 0)
2131 	patstralloc->unmetalen = ztrsub(string + stringlen, string);
2132     else
2133 	patstralloc->unmetalen = unmetalen;
2134     if (needfullpath) {
2135 	patstralloc->unmetalenp = ztrsub(pathbuf + pathpos, pathbuf);
2136 	if (!patstralloc->unmetalenp)
2137 	    needfullpath = 0;
2138     } else
2139 	patstralloc->unmetalenp = 0;
2140     /* Initialise cache area */
2141     patstralloc->progstrunmeta = NULL;
2142     patstralloc->progstrunmetalen = 0;
2143 
2144     DPUTS(needfullpath && (prog->flags & (PAT_PURES|PAT_ANY)),
2145 	  "rum sort of file exclusion");
2146     /*
2147      * Partly for efficiency, and partly for the convenience of
2148      * globbing, we don't unmetafy pure string patterns, and
2149      * there's no reason to if the pattern is just a *.
2150      */
2151     if (force ||
2152 	(!(prog->flags & (PAT_PURES|PAT_ANY))
2153 	 && (needfullpath || patstralloc->unmetalen != stringlen))) {
2154 	/*
2155 	 * We need to copy if we need to prepend the path so far
2156 	 * (in which case we copy both chunks), or if we have
2157 	 * Meta characters.
2158 	 */
2159 	char *dst, *ptr;
2160 	int i, icopy, ncopy;
2161 
2162 	dst = patstralloc->alloced =
2163 	    zhalloc(patstralloc->unmetalen + patstralloc->unmetalenp);
2164 
2165 	if (needfullpath) {
2166 	    /* loop twice, copy path buffer first time */
2167 	    ptr = pathbuf;
2168 	    ncopy = patstralloc->unmetalenp;
2169 	} else {
2170 	    /* just loop once, copy string with unmetafication */
2171 	    ptr = string;
2172 	    ncopy = patstralloc->unmetalen;
2173 	}
2174 	for (icopy = 0; icopy < 2; icopy++) {
2175 	    for (i = 0; i < ncopy; i++) {
2176 		if (*ptr == Meta) {
2177 		    ptr++;
2178 		    *dst++ = *ptr++ ^ 32;
2179 		} else {
2180 		    *dst++ = *ptr++;
2181 		}
2182 	    }
2183 	    if (!needfullpath)
2184 		break;
2185 	    /* next time append test string to path so far */
2186 	    ptr = string;
2187 	    ncopy = patstralloc->unmetalen;
2188 	}
2189     }
2190     else
2191     {
2192 	patstralloc->alloced = NULL;
2193     }
2194 
2195     return patstralloc->alloced;
2196 }
2197 
2198 
2199 /*
2200  * Test prog against null-terminated, metafied string.
2201  */
2202 
2203 /**/
2204 mod_export int
pattry(Patprog prog,char * string)2205 pattry(Patprog prog, char *string)
2206 {
2207     return pattryrefs(prog, string, -1, -1, NULL, 0, NULL, NULL, NULL);
2208 }
2209 
2210 /*
2211  * Test prog against string of given length, no null termination
2212  * but still metafied at this point.  offset gives an offset
2213  * to include in reported match indices
2214  */
2215 
2216 /**/
2217 mod_export int
pattrylen(Patprog prog,char * string,int len,int unmetalen,Patstralloc patstralloc,int offset)2218 pattrylen(Patprog prog, char *string, int len, int unmetalen,
2219 	  Patstralloc patstralloc, int offset)
2220 {
2221     return pattryrefs(prog, string, len, unmetalen, patstralloc, offset,
2222 		      NULL, NULL, NULL);
2223 }
2224 
2225 /*
2226  * Test prog against string with given lengths.  The input
2227  * string is metafied; stringlen is the raw string length, and
2228  * unmetalen the number of characters in the original string (some
2229  * of which may now be metafied).  Either value may be -1
2230  * to indicate a null-terminated string which will be counted.  Note
2231  * there may be a severe penalty for this if a lot of matching is done
2232  * on one string.
2233  *
2234  * If patstralloc is not NULL it is used to optimise unmetafication
2235  * of a trial string that may be passed (or any substring may be passed) to
2236  * pattryrefs multiple times or the same pattern (N.B. so patstralloc
2237  * depends on both prog *and* the trial string).  This should only be
2238  * done if there is no path prefix (pathpos == 0) as otherwise the path
2239  * buffer and unmetafied string may not match.  To do this,
2240  * patallocstr() is called (use force = 1 to ensure it is always
2241  * unmetafied); paststralloc points to existing storage. Memory is
2242  * on the heap.
2243  *
2244  * patstralloc->alloced and patstralloc->unmetalen contain the
2245  * unmetafied string and its length.  In that case, the rules for the
2246  * earlier arguments change:
2247  * - string is an unmetafied string
2248  * - stringlen is its unmetafied (i.e. actual) length
2249  * - unmetalenin is not used.
2250  * string and stringlen may refer to arbitrary substrings of
2251  * patstralloc->alloced without any internal modification to patstralloc.
2252  *
2253  * patoffset is the position in the original string (not seen by
2254  * the pattern module) at which we are trying to match.
2255  * This is added in to the positions recorded in patbeginp and patendp
2256  * when we are looking for substrings.  Currently this only happens
2257  * in the parameter substitution code.  It refers to a real character
2258  * offset, i.e. is already in the form ready for presentation to the
2259  * general public --- this is necessary as we don't have the
2260  * information to convert it down here.
2261  *
2262  * Note this is a character offset, i.e. a single possibly metafied and
2263  * possibly multibyte character counts as 1.
2264  *
2265  * The last three arguments are used to report the positions for the
2266  * backreferences. On entry, *nump should contain the maximum number
2267  * of positions to report.  In this case the match, mbegin, mend
2268  * arrays are not altered.
2269  *
2270  * If nump is NULL but endp is not NULL, then *endp is set to the
2271  * end position of the match, taking into account patinstart.
2272  */
2273 
2274 /**/
2275 mod_export int
pattryrefs(Patprog prog,char * string,int stringlen,int unmetalenin,Patstralloc patstralloc,int patoffset,int * nump,int * begp,int * endp)2276 pattryrefs(Patprog prog, char *string, int stringlen, int unmetalenin,
2277 	   Patstralloc patstralloc, int patoffset,
2278 	   int *nump, int *begp, int *endp)
2279 {
2280     int i, maxnpos = 0, ret;
2281     int origlen;
2282     char **sp, **ep, *ptr;
2283     char *progstr = (char *)prog + prog->startoff;
2284     struct patstralloc patstralloc_struct;
2285 
2286     if (nump) {
2287 	maxnpos = *nump;
2288 	*nump = 0;
2289     }
2290 
2291     if (!patstralloc)
2292 	patmungestring(&string, &stringlen, &unmetalenin);
2293     origlen = stringlen;
2294 
2295     if (patstralloc) {
2296 	DPUTS(!patstralloc->alloced,
2297 	      "External unmetafy didn't actually unmetafy.");
2298 	DPUTS(patstralloc->unmetalenp,
2299 	      "Ooh-err: pathpos with external unmetafy. I have bad vibes.");
2300 	patinpath = NULL;
2301 	patinstart = string;
2302 	/* stringlen is unmetafied length; unmetalenin is ignored */
2303     } else {
2304 	patstralloc = &patstralloc_struct;
2305 	if (patallocstr(prog, string, stringlen, unmetalenin, 0, patstralloc)) {
2306 	    patinstart = patstralloc->alloced + patstralloc->unmetalenp;
2307 	    stringlen = patstralloc->unmetalen;
2308 	} else
2309 	    patinstart = string;
2310 	if (patstralloc->unmetalenp)
2311 	    patinpath = patstralloc->alloced;
2312 	else
2313 	    patinpath = NULL;
2314     }
2315 
2316     patflags = prog->flags;
2317     patinend = patinstart + stringlen;
2318     /*
2319      * From now on we do not require NULL termination of
2320      * the test string.  There should also be no more references
2321      * to the variable string.
2322      */
2323 
2324     if (prog->flags & (PAT_PURES|PAT_ANY)) {
2325 	/*
2326 	 * Either we are testing against a pure string,
2327 	 * or we can match anything at all.
2328 	 */
2329 	int pstrlen;
2330 	char *pstr;
2331 	if (patstralloc->alloced)
2332 	{
2333 	    /*
2334 	     * Unmetafied; we need pattern string that's also unmetafied.
2335 	     * We'll cache it in the patstralloc structure.
2336 	     * Note it's on the heap.
2337 	     */
2338 	    if (!patstralloc->progstrunmeta)
2339 	    {
2340 		patstralloc->progstrunmeta =
2341 		    dupstrpfx(progstr, (int)prog->patmlen);
2342 		unmetafy(patstralloc->progstrunmeta,
2343 			 &patstralloc->progstrunmetalen);
2344 	    }
2345 	    pstr = patstralloc->progstrunmeta;
2346 	    pstrlen = patstralloc->progstrunmetalen;
2347 	}
2348 	else
2349 	{
2350 	    /* Metafied. */
2351 	    pstr = progstr;
2352 	    pstrlen = (int)prog->patmlen;
2353 	}
2354 	if (prog->flags & PAT_ANY) {
2355 	    /*
2356 	     * Optimisation for a single "*": always matches
2357 	     * (except for no_glob_dots, see below).
2358 	     */
2359 	    ret = 1;
2360 	} else {
2361 	    /*
2362 	     * Testing a pure string.  See if initial
2363 	     * components match.
2364 	     */
2365 	    int lendiff = stringlen - pstrlen;
2366 	    if (lendiff < 0) {
2367 		/* No, the pattern string is too long. */
2368 		ret = 0;
2369 	    } else if (!memcmp(pstr, patinstart, pstrlen)) {
2370 		/*
2371 		 * Initial component matches.  Matches either
2372 		 * if lengths are the same or we are not anchored
2373 		 * to the end of the string.
2374 		 */
2375 		ret = !lendiff || (prog->flags & PAT_NOANCH);
2376 	    } else {
2377 		/* No match. */
2378 		ret = 0;
2379 	    }
2380 	}
2381 	if (ret) {
2382 	    /*
2383 	     * For files, we won't match initial "."s unless
2384 	     * glob_dots is set.
2385 	     */
2386 	    if ((prog->flags & PAT_NOGLD) && *patinstart == '.') {
2387 		ret = 0;
2388 	    } else {
2389 		/*
2390 		 * Remember the length in case used for ${..#..} etc.
2391 		 * In this case, we didn't unmetafy the pattern string
2392 		 * in the original structure, but it might be unmetafied
2393 		 * for use with an unmetafied test string.
2394 		 */
2395 		patinlen = pstrlen;
2396 		/* if matching files, must update globbing flags */
2397 		patglobflags = prog->globend;
2398 
2399 		if ((patglobflags & GF_MATCHREF) &&
2400 		    !(patflags & PAT_FILE)) {
2401 		    char *str;
2402 		    int mlen;
2403 
2404 		    if (patstralloc->alloced) {
2405 			/*
2406 			 * Unmetafied: pstrlen contains unmetafied
2407 			 * length in bytes.
2408 			 */
2409 			str = metafy(patinstart, pstrlen, META_DUP);
2410 			mlen = CHARSUB(patinstart, patinstart + pstrlen);
2411 		    } else {
2412 			str = ztrduppfx(patinstart, patinlen);
2413 			/*
2414 			 * Count the characters.  We're not using CHARSUB()
2415 			 * because the string is still metafied.
2416 			 */
2417 			MB_METACHARINIT();
2418 			mlen = MB_METASTRLEN2END(patinstart, 0,
2419 						 patinstart + patinlen);
2420 		    }
2421 
2422 		    setsparam("MATCH", str);
2423 		    setiparam("MBEGIN",
2424 			      (zlong)(patoffset + !isset(KSHARRAYS)));
2425 		    setiparam("MEND",
2426 			      (zlong)(mlen + patoffset +
2427 				      !isset(KSHARRAYS) - 1));
2428 		}
2429 	    }
2430 	}
2431     } else {
2432 	/*
2433 	 * Test for a `must match' string, unless we're scanning for a match
2434 	 * in which case we don't need to do this each time.
2435 	 */
2436 	ret = 1;
2437 	if (!(prog->flags & PAT_SCAN) && prog->mustoff)
2438 	{
2439 	    char *testptr;	/* start pointer into test string */
2440 	    char *teststop;	/* last point from which we can match */
2441 	    char *patptr = (char *)prog + prog->mustoff;
2442 	    int patlen = prog->patmlen;
2443 	    int found = 0;
2444 
2445 	    if (patlen > stringlen) {
2446 		/* Too long, can't match. */
2447 		ret = 0;
2448 	    } else {
2449 		teststop = patinend - patlen;
2450 
2451 		for (testptr = patinstart; testptr <= teststop; testptr++)
2452 		{
2453 		    if (!memcmp(testptr, patptr, patlen)) {
2454 			found = 1;
2455 			break;
2456 		    }
2457 		}
2458 
2459 		if (!found)
2460 		    ret = 0;
2461 	    }
2462 	}
2463 	if (!ret)
2464 	    return 0;
2465 
2466 	patglobflags = prog->globflags;
2467 	if (!(patflags & PAT_FILE)) {
2468 	    forceerrs = -1;
2469 	    errsfound = 0;
2470 	}
2471 	globdots = !(patflags & PAT_NOGLD);
2472 	parsfound = 0;
2473 
2474 	patinput = patinstart;
2475 
2476 	exactpos = exactend = NULL;
2477 	/* The only external call to patmatch --- all others are recursive */
2478 	if (patmatch((Upat)progstr)) {
2479 	    /*
2480 	     * we were lazy and didn't save the globflags if an exclusion
2481 	     * failed, so set it now
2482 	     */
2483 	    patglobflags = prog->globend;
2484 
2485 	    /*
2486 	     * Record length of successful match, including Meta
2487 	     * characters.  Do it here so that patmatchlen() can return
2488 	     * it even if we delete the pattern strings.
2489 	     */
2490 	    patinlen = patinput - patinstart;
2491 	    /*
2492 	     * Optimization: if we didn't find any Meta characters
2493 	     * to begin with, we don't need to look for them now.
2494 	     *
2495 	     * For patstralloc passed in, we want the unmetafied length.
2496 	     */
2497 	    if (patstralloc == &patstralloc_struct &&
2498 		patstralloc->unmetalen != origlen) {
2499 		for (ptr = patinstart; ptr < patinput; ptr++)
2500 		    if (imeta(*ptr))
2501 			patinlen++;
2502 	    }
2503 
2504 	    /*
2505 	     * Should we clear backreferences and matches on a failed
2506 	     * match?
2507 	     */
2508 	    if ((patglobflags & GF_MATCHREF) && !(patflags & PAT_FILE)) {
2509 		/*
2510 		 * m flag: for global match.  This carries no overhead
2511 		 * in the pattern matching part.
2512 		 *
2513 		 * Remember the test pattern is already unmetafied.
2514 		 */
2515 		char *str;
2516 		int mlen = CHARSUB(patinstart, patinput);
2517 
2518 		str = metafy(patinstart, patinput - patinstart, META_DUP);
2519 		setsparam("MATCH", str);
2520 		setiparam("MBEGIN", (zlong)(patoffset + !isset(KSHARRAYS)));
2521 		setiparam("MEND",
2522 			  (zlong)(mlen + patoffset +
2523 				  !isset(KSHARRAYS) - 1));
2524 	    }
2525 	    if (prog->patnpar && nump) {
2526 		/*
2527 		 * b flag: for backreferences using parentheses. Reported
2528 		 * directly.
2529 		 */
2530 		*nump = prog->patnpar;
2531 
2532 		sp = patbeginp;
2533 		ep = patendp;
2534 
2535 		for (i = 0; i < prog->patnpar && i < maxnpos; i++) {
2536 		    if (parsfound & (1 << i)) {
2537 			if (begp)
2538 			    *begp++ = CHARSUB(patinstart, *sp) + patoffset;
2539 			if (endp)
2540 			    *endp++ = CHARSUB(patinstart, *ep) + patoffset
2541 				- 1;
2542 		    } else {
2543 			if (begp)
2544 			    *begp++ = -1;
2545 			if (endp)
2546 			    *endp++ = -1;
2547 		    }
2548 
2549 		    sp++;
2550 		    ep++;
2551 		}
2552 	    } else if (prog->patnpar && !(patflags & PAT_FILE)) {
2553 		/*
2554 		 * b flag: for backreferences using parentheses.
2555 		 */
2556 		int palen = prog->patnpar+1;
2557 		char **matcharr, **mbeginarr, **mendarr;
2558 		char numbuf[DIGBUFSIZE];
2559 
2560 		matcharr = zshcalloc(palen*sizeof(char *));
2561 		mbeginarr = zshcalloc(palen*sizeof(char *));
2562 		mendarr = zshcalloc(palen*sizeof(char *));
2563 
2564 		sp = patbeginp;
2565 		ep = patendp;
2566 
2567 		for (i = 0; i < prog->patnpar; i++) {
2568 		    if (parsfound & (1 << i)) {
2569 			matcharr[i] = metafy(*sp, *ep - *sp, META_DUP);
2570 			/*
2571 			 * mbegin and mend give indexes into the string
2572 			 * in the standard notation, i.e. respecting
2573 			 * KSHARRAYS, and with the end index giving
2574 			 * the last character, not one beyond.
2575 			 * For example, foo=foo; [[ $foo = (f)oo ]] gives
2576 			 * (without KSHARRAYS) indexes 1 and 1, which
2577 			 * corresponds to indexing as ${foo[1,1]}.
2578 			 */
2579 			sprintf(numbuf, "%ld",
2580 				(long)(CHARSUB(patinstart, *sp) +
2581 				       patoffset +
2582 				       !isset(KSHARRAYS)));
2583 			mbeginarr[i] = ztrdup(numbuf);
2584 			sprintf(numbuf, "%ld",
2585 				(long)(CHARSUB(patinstart, *ep) +
2586 				       patoffset +
2587 				       !isset(KSHARRAYS) - 1));
2588 			mendarr[i] = ztrdup(numbuf);
2589 		    } else {
2590 			/* Pattern wasn't set: either it was in an
2591 			 * unmatched branch, or a hashed parenthesis
2592 			 * that didn't match at all.
2593 			 */
2594 			matcharr[i] = ztrdup("");
2595 			mbeginarr[i] = ztrdup("-1");
2596 			mendarr[i] = ztrdup("-1");
2597 		    }
2598 		    sp++;
2599 		    ep++;
2600 		}
2601 		setaparam("match", matcharr);
2602 		setaparam("mbegin", mbeginarr);
2603 		setaparam("mend", mendarr);
2604 	    }
2605 
2606 	    if (!nump && endp) {
2607 		/*
2608 		 * We just need the overall end position.
2609 		 */
2610 		*endp = CHARSUB(patinstart, patinput) + patoffset;
2611 	    }
2612 
2613 	    ret = 1;
2614 	} else
2615 	    ret = 0;
2616     }
2617 
2618     return ret;
2619 }
2620 
2621 /*
2622  * Return length of previous successful match.  This is
2623  * in metafied bytes, i.e. includes a count of Meta characters,
2624  * unless the match was done on an unmetafied string using
2625  * a patstralloc struct, in which case it too is unmetafied.
2626  * Unusual and futile attempt at modular encapsulation.
2627  */
2628 
2629 /**/
2630 int
patmatchlen(void)2631 patmatchlen(void)
2632 {
2633     return patinlen;
2634 }
2635 
2636 /*
2637  * Match literal characters with case insensitivity test:  the first
2638  * comes from the input string, the second the current pattern.
2639  */
2640 #ifdef MULTIBYTE_SUPPORT
2641 #define ISUPPER(x)	iswupper(x)
2642 #define ISLOWER(x)	iswlower(x)
2643 #define TOUPPER(x)	towupper(x)
2644 #define TOLOWER(x)	towlower(x)
2645 #define ISDIGIT(x)	iswdigit(x)
2646 #else
2647 #define ISUPPER(x)	isupper(x)
2648 #define ISLOWER(x)	islower(x)
2649 #define TOUPPER(x)	toupper(x)
2650 #define TOLOWER(x)	tolower(x)
2651 #define ISDIGIT(x)	idigit(x)
2652 #endif
2653 #define CHARMATCH(chin, chpa) (chin == chpa || \
2654         ((patglobflags & GF_IGNCASE) ? \
2655 	 ((ISUPPER(chin) ? TOLOWER(chin) : chin) == \
2656 	  (ISUPPER(chpa) ? TOLOWER(chpa) : chpa)) : \
2657 	 (patglobflags & GF_LCMATCHUC) ? \
2658 	 (ISLOWER(chpa) && TOUPPER(chpa) == chin) : 0))
2659 
2660 /*
2661  * The same but caching an expression from the first argument,
2662  * Requires local charmatch_cache definition.
2663  */
2664 #define CHARMATCH_EXPR(expr, chpa) \
2665 	(charmatch_cache = (expr), CHARMATCH(charmatch_cache, chpa))
2666 
2667 /*
2668  * Main matching routine.
2669  *
2670  * Testing the tail end of a match is usually done by recursion, but
2671  * we try to eliminate that in favour of looping for simple cases.
2672  */
2673 
2674 /**/
2675 static int
patmatch(Upat prog)2676 patmatch(Upat prog)
2677 {
2678     /* Current and next nodes */
2679     Upat scan = prog, next, opnd;
2680     char *start, *save, *chrop, *chrend, *compend;
2681     int savglobflags, op, no, min, fail = 0, saverrsfound;
2682     zrange_t from, to, comp;
2683     patint_t nextch;
2684     int q = queue_signal_level();
2685 
2686     /*
2687      * To avoid overhead of saving state if there are no queued signals
2688      * waiting, we pierce the signals.h veil and examine queue state.
2689      */
2690 #define check_for_signals() do if (queue_front != queue_rear) { \
2691 	    int savpatflags = patflags, savpatglobflags = patglobflags; \
2692             char *savexactpos = exactpos, *savexactend = exactend; \
2693 	    struct rpat savpattrystate = pattrystate; \
2694 	    dont_queue_signals(); \
2695 	    restore_queue_signals(q); \
2696 	    exactpos = savexactpos; \
2697 	    exactend = savexactend; \
2698 	    patflags = savpatflags; \
2699 	    patglobflags = savpatglobflags; \
2700 	    pattrystate = savpattrystate; \
2701 	} while (0)
2702 
2703     check_for_signals();
2704 
2705     while  (scan && !errflag) {
2706 	next = PATNEXT(scan);
2707 
2708 	if (!globdots && P_NOTDOT(scan) && patinput == patinstart &&
2709 	    patinput < patinend && *patinput == '.')
2710 	    return 0;
2711 
2712 	switch (P_OP(scan)) {
2713 	case P_ANY:
2714 	    if (patinput == patinend)
2715 		fail = 1;
2716 	    else
2717 		CHARINC(patinput, patinend);
2718 	    break;
2719 	case P_EXACTLY:
2720 	    /*
2721 	     * acts as nothing if *chrop is null:  this is used by
2722 	     * approx code.
2723 	     */
2724 	    if (exactpos) {
2725 		chrop = exactpos;
2726 		chrend = exactend;
2727 	    } else {
2728 		chrop = P_LS_STR(scan);
2729 		chrend = chrop + P_LS_LEN(scan);
2730 	    }
2731 	    exactpos = NULL;
2732 	    while (chrop < chrend && patinput < patinend) {
2733 		char *savpatinput = patinput;
2734 		char *savchrop = chrop;
2735 		int badin = 0, badpa = 0;
2736 		/*
2737 		 * Care with character matching:
2738 		 * We do need to convert the character to wide
2739 		 * representation if possible, because we may need
2740 		 * to do case transformation.  However, we should
2741 		 * be careful in case one, but not the other, wasn't
2742 		 * representable in the current locale---in that
2743 		 * case they don't match even if the returned
2744 		 * values (one properly converted, one raw) are
2745 		 * the same.
2746 		 */
2747 		patint_t chin = CHARREFINC(patinput, patinend, &badin);
2748 		patint_t chpa = CHARREFINC(chrop, chrend, &badpa);
2749 		if (!CHARMATCH(chin, chpa) || badin != badpa) {
2750 		    fail = 1;
2751 		    patinput = savpatinput;
2752 		    chrop = savchrop;
2753 		    break;
2754 		}
2755 	    }
2756 	    if (chrop < chrend) {
2757 		exactpos = chrop;
2758 		exactend = chrend;
2759 		fail = 1;
2760 	    }
2761 	    break;
2762 	case P_ANYOF:
2763 	case P_ANYBUT:
2764 	    if (patinput == patinend)
2765 		fail = 1;
2766 	    else {
2767 #ifdef MULTIBYTE_SUPPORT
2768 		int zmb_ind;
2769 		wchar_t cr = charref(patinput, patinend, &zmb_ind);
2770 		char *scanop = (char *)P_OPERAND(scan);
2771 		if (patglobflags & GF_MULTIBYTE) {
2772 		    if (mb_patmatchrange(scanop, cr, zmb_ind, NULL, NULL) ^
2773 			(P_OP(scan) == P_ANYOF))
2774 			fail = 1;
2775 		    else
2776 			CHARINC(patinput, patinend);
2777 		} else if (patmatchrange(scanop, (int)cr, NULL, NULL) ^
2778 			   (P_OP(scan) == P_ANYOF))
2779 		    fail = 1;
2780 		else
2781 		    CHARINC(patinput, patinend);
2782 #else
2783 		if (patmatchrange((char *)P_OPERAND(scan),
2784 				  CHARREF(patinput, patinend), NULL, NULL) ^
2785 		    (P_OP(scan) == P_ANYOF))
2786 		    fail = 1;
2787 		else
2788 		    CHARINC(patinput, patinend);
2789 #endif
2790 	    }
2791 	    break;
2792 	case P_NUMRNG:
2793 	case P_NUMFROM:
2794 	case P_NUMTO:
2795 	    /*
2796 	     * To do this properly, we really have to treat numbers as
2797 	     * closures:  that's so things like <1-1000>33 will
2798 	     * match 633 (they didn't up to 3.1.6).  To avoid making this
2799 	     * too inefficient, we see if there's an exact match next:
2800 	     * if there is, and it's not a digit, we return 1 after
2801 	     * the first attempt.
2802 	     */
2803 	    op = P_OP(scan);
2804 	    start = (char *)P_OPERAND(scan);
2805 	    from = to = 0;
2806 	    if (op != P_NUMTO) {
2807 #ifdef ZSH_64_BIT_TYPE
2808 		/* We can't rely on pointer alignment being good enough. */
2809 		memcpy((char *)&from, start, sizeof(zrange_t));
2810 #else
2811 		from = *((zrange_t *) start);
2812 #endif
2813 		start += sizeof(zrange_t);
2814 	    }
2815 	    if (op != P_NUMFROM) {
2816 #ifdef ZSH_64_BIT_TYPE
2817 		memcpy((char *)&to, start, sizeof(zrange_t));
2818 #else
2819 		to = *((zrange_t *) start);
2820 #endif
2821 	    }
2822 	    start = compend = patinput;
2823 	    comp = 0;
2824 	    while (patinput < patinend && idigit(*patinput)) {
2825 		int out_of_range = 0;
2826 		int digit = *patinput - '0';
2827 		if (comp > ZRANGE_MAX / (zlong)10) {
2828 		    out_of_range = 1;
2829 		} else {
2830 		    zrange_t c10 = comp ? comp * 10 : 0;
2831 		    if (ZRANGE_MAX - c10 < digit) {
2832 			out_of_range = 1;
2833 		    } else {
2834 			comp = c10;
2835 			comp += digit;
2836 		    }
2837 		}
2838 		patinput++;
2839 		compend++;
2840 
2841 		if (out_of_range ||
2842 		    (comp & ((zrange_t)1 << (sizeof(comp)*8 -
2843 #ifdef ZRANGE_T_IS_SIGNED
2844 					    2
2845 #else
2846 					    1
2847 #endif
2848 				)))) {
2849 		    /*
2850 		     * Out of range (allowing for signedness, which
2851 		     * we need if we are using zlongs).
2852 		     * This is as far as we can go.
2853 		     * If we're doing a range "from", skip all the
2854 		     * remaining numbers.  Otherwise, we can't
2855 		     * match beyond the previous point anyway.
2856 		     * Leave the pointer to the last calculated
2857 		     * position (compend) where it was before.
2858 		     */
2859 		    if (op == P_NUMFROM) {
2860 			while (patinput < patinend && idigit(*patinput))
2861 			    patinput++;
2862 		    }
2863 		}
2864 	    }
2865 	    save = patinput;
2866 	    no = 0;
2867 	    while (patinput > start) {
2868 		/* if already too small, no power on earth can save it */
2869 		if (comp < from && patinput <= compend)
2870 		    break;
2871 		if ((op == P_NUMFROM || comp <= to) && patmatch(next)) {
2872 		    return 1;
2873 		}
2874 		if (!no && P_OP(next) == P_EXACTLY &&
2875 		    (!P_LS_LEN(next) ||
2876 		     !idigit(STOUC(*P_LS_STR(next)))) &&
2877 		    !(patglobflags & 0xff))
2878 		    return 0;
2879 		patinput = --save;
2880 		no++;
2881 		/*
2882 		 * With a range start and an unrepresentable test
2883 		 * number, we just back down the test string without
2884 		 * changing the number until we get to a representable
2885 		 * one.
2886 		 */
2887 		if (patinput < compend)
2888 		    comp /= 10;
2889 	    }
2890 	    patinput = start;
2891 	    fail = 1;
2892 	    break;
2893 	case P_NUMANY:
2894 	    /* This is <->: any old set of digits, don't bother comparing */
2895 	    start = patinput;
2896 	    while (patinput < patinend && idigit(*patinput))
2897 		patinput++;
2898 	    save = patinput;
2899 	    no = 0;
2900 	    while (patinput > start) {
2901 		if (patmatch(next))
2902 		    return 1;
2903 		if (!no && P_OP(next) == P_EXACTLY &&
2904 		    (!P_LS_LEN(next) ||
2905 		     !idigit(*P_LS_STR(next))) &&
2906 		    !(patglobflags & 0xff))
2907 		    return 0;
2908 		patinput = --save;
2909 		no++;
2910 	    }
2911 	    patinput = start;
2912 	    fail = 1;
2913 	    break;
2914 	case P_NOTHING:
2915 	    break;
2916 	case P_BACK:
2917 	    break;
2918 	case P_GFLAGS:
2919 	    patglobflags = P_OPERAND(scan)->l;
2920 	    break;
2921 	case P_OPEN:
2922 	case P_OPEN+1:
2923 	case P_OPEN+2:
2924 	case P_OPEN+3:
2925 	case P_OPEN+4:
2926 	case P_OPEN+5:
2927 	case P_OPEN+6:
2928 	case P_OPEN+7:
2929 	case P_OPEN+8:
2930 	case P_OPEN+9:
2931 	    no = P_OP(scan) - P_OPEN;
2932 	    save = patinput;
2933 
2934 	    if (patmatch(next)) {
2935 		/*
2936 		 * Don't set patbeginp if some later invocation of
2937 		 * the same parentheses already has.
2938 		 */
2939 		if (no && !(parsfound & (1 << (no - 1)))) {
2940 		    patbeginp[no-1] = save;
2941 		    parsfound |= 1 << (no - 1);
2942 		}
2943 		return 1;
2944 	    } else
2945 		return 0;
2946 	    break;
2947 	case P_CLOSE:
2948 	case P_CLOSE+1:
2949 	case P_CLOSE+2:
2950 	case P_CLOSE+3:
2951 	case P_CLOSE+4:
2952 	case P_CLOSE+5:
2953 	case P_CLOSE+6:
2954 	case P_CLOSE+7:
2955 	case P_CLOSE+8:
2956 	case P_CLOSE+9:
2957 	    no = P_OP(scan) - P_CLOSE;
2958 	    save = patinput;
2959 
2960 	    if (patmatch(next)) {
2961 		if (no && !(parsfound & (1 << (no + 15)))) {
2962 		    patendp[no-1] = save;
2963 		    parsfound |= 1 << (no + 15);
2964 		}
2965 		return 1;
2966 	    } else
2967 		return 0;
2968 	    break;
2969 	case P_EXCSYNC:
2970 	    /* See the P_EXCLUDE code below for where syncptr comes from */
2971 	    {
2972 		unsigned char *syncptr;
2973 		Upat after;
2974 		after = P_OPERAND(scan);
2975 		DPUTS(!P_ISEXCLUDE(after),
2976 		      "BUG: EXCSYNC not followed by EXCLUDE.");
2977 		DPUTS(!P_OPERAND(after)->p,
2978 		      "BUG: EXCSYNC not handled by EXCLUDE");
2979 		syncptr = P_OPERAND(after)->p + (patinput - patinstart);
2980 		/*
2981 		 * If we already matched from here, this time we fail.
2982 		 * See WBRANCH code for story about error count.
2983 		 */
2984 		if (*syncptr && errsfound + 1 >= *syncptr)
2985 		    return 0;
2986 		/*
2987 		 * Else record that we (possibly) matched this time.
2988 		 * No harm if we don't:  then the previous test will just
2989 		 * short cut the attempted match that is bound to fail.
2990 		 * We never try to exclude something that has already
2991 		 * failed anyway.
2992 		 */
2993 		*syncptr = errsfound + 1;
2994 	    }
2995 	    break;
2996 	case P_EXCEND:
2997 	    /*
2998 	     * This is followed by a P_EXCSYNC, but only in the P_EXCLUDE
2999 	     * branch.  Actually, we don't bother following it:  all we
3000 	     * need to know is that we successfully matched so far up
3001 	     * to the end of the asserted pattern; the endpoint
3002 	     * in the target string is nulled out.
3003 	     */
3004 	    if (!(fail = (patinput < patinend)))
3005 		return 1;
3006 	    break;
3007 	case P_BRANCH:
3008 	case P_WBRANCH:
3009 	    /* P_EXCLUDE shouldn't occur without a P_BRANCH */
3010 	    if (!P_ISBRANCH(next)) {
3011 		/* no choice, avoid recursion */
3012 		DPUTS(P_OP(scan) == P_WBRANCH,
3013 		      "BUG: WBRANCH with no alternative.");
3014 		next = P_OPERAND(scan);
3015 	    } else {
3016 		do {
3017 		    save = patinput;
3018 		    savglobflags = patglobflags;
3019 		    saverrsfound = errsfound;
3020 		    if (P_ISEXCLUDE(next)) {
3021 			/*
3022 			 * The strategy is to test the asserted pattern,
3023 			 * recording via P_EXCSYNC how far the part to
3024 			 * be excluded matched.  We then set the
3025 			 * length of the test string to that
3026 			 * point and see if the exclusion as far as
3027 			 * P_EXCEND also matches that string.
3028 			 * We need to keep testing the asserted pattern
3029 			 * by backtracking, since the first attempt
3030 			 * may be excluded while a later attempt may not.
3031 			 * For this we keep a pointer just after
3032 			 * the P_EXCLUDE which is tested by the P_EXCSYNC
3033 			 * to see if we matched there last time, in which
3034 			 * case we fail.  If there is nothing to backtrack
3035 			 * over, that doesn't matter:  we should fail anyway.
3036 			 * The pointer also tells us where the asserted
3037 			 * pattern matched for use by the exclusion.
3038 			 *
3039 			 * It's hard to allocate space for this
3040 			 * beforehand since we may need to do it
3041 			 * recursively.
3042 			 *
3043 			 * P.S. in case you were wondering, this code
3044 			 * is horrible.
3045 			 */
3046 			Upat syncstrp;
3047 			char *origpatinend;
3048 			unsigned char *oldsyncstr;
3049 			char *matchpt = NULL;
3050 			int ret, savglobdots, matchederrs = 0;
3051 			int savparsfound = parsfound;
3052 			DPUTS(P_OP(scan) == P_WBRANCH,
3053 			      "BUG: excluded WBRANCH");
3054 			syncstrp = P_OPERAND(next);
3055 			/*
3056 			 * Unlike WBRANCH, each test at the same exclude
3057 			 * sync point (due to an external loop) is separate,
3058 			 * i.e testing (foo~bar)# is no different from
3059 			 * (foo~bar)(foo~bar)... from the exclusion point
3060 			 * of view, so we use a different sync string.
3061 			 */
3062 			oldsyncstr = syncstrp->p;
3063 			syncstrp->p = (unsigned char *)
3064 			    zshcalloc((patinend - patinstart) + 1);
3065 			origpatinend = patinend;
3066 			while ((ret = patmatch(P_OPERAND(scan)))) {
3067 			    unsigned char *syncpt;
3068 			    char *savpatinstart;
3069 			    int savforce = forceerrs;
3070 			    int savpatflags = patflags, synclen;
3071 			    forceerrs = -1;
3072 			    savglobdots = globdots;
3073 			    matchederrs = errsfound;
3074 			    matchpt = patinput;    /* may not be end */
3075 			    globdots = 1;	   /* OK to match . first */
3076 			    /* Find the point where the scan
3077 			     * matched the part to be excluded: because
3078 			     * of backtracking, the one
3079 			     * most recently matched will be the first.
3080 			     * (Luckily, backtracking is done after all
3081 			     * possibilities for approximation have been
3082 			     * checked.)
3083 			     */
3084 			    for (syncpt = syncstrp->p; !*syncpt; syncpt++)
3085 				;
3086 			    synclen = syncpt - syncstrp->p;
3087 			    if (patinstart + synclen != patinend) {
3088 				/*
3089 				 * Temporarily mark the string as
3090 				 * ending at this point.
3091 				 */
3092 				DPUTS(patinstart + synclen > matchpt,
3093 				      "BUG: EXCSYNC failed");
3094 
3095 				patinend = patinstart + synclen;
3096 				/*
3097 				 * If this isn't really the end of the string,
3098 				 * remember this for the (#e) assertion.
3099 				 */
3100 				patflags |= PAT_NOTEND;
3101 			    }
3102 			    savpatinstart = patinstart;
3103 			    next = PATNEXT(scan);
3104 			    while (next && P_ISEXCLUDE(next)) {
3105 				patinput = save;
3106 				/*
3107 				 * turn off approximations in exclusions:
3108 				 * note we keep remaining patglobflags
3109 				 * set by asserted branch (or previous
3110 				 * excluded branches, for consistency).
3111 				 */
3112 				patglobflags &= ~0xff;
3113 				errsfound = 0;
3114 				opnd = P_OPERAND(next) + 1;
3115 				if (P_OP(next) == P_EXCLUDP && patinpath) {
3116 				    /*
3117 				     * Top level exclusion with a file,
3118 				     * applies to whole path so add the
3119 				     * segments already matched.
3120 				     * We copied these in front of the
3121 				     * test pattern, so patinend doesn't
3122 				     * need moving.
3123 				     */
3124 				    DPUTS(patinput != patinstart,
3125 					  "BUG: not at start excluding path");
3126 				    patinput = patinstart = patinpath;
3127 				}
3128 				if (patmatch(opnd)) {
3129 				    ret = 0;
3130 				    /*
3131 				     * Another subtlety: if we exclude the
3132 				     * match, any parentheses just found
3133 				     * become invalidated.
3134 				     */
3135 				    parsfound = savparsfound;
3136 				}
3137 				if (patinpath) {
3138 				    patinput = savpatinstart +
3139 					(patinput - patinstart);
3140 				    patinstart = savpatinstart;
3141 				}
3142 				if (!ret)
3143 				    break;
3144 				next = PATNEXT(next);
3145 			    }
3146 			    /*
3147 			     * Restore original end position.
3148 			     */
3149 			    patinend = origpatinend;
3150 			    patflags = savpatflags;
3151 			    globdots = savglobdots;
3152 			    forceerrs = savforce;
3153 			    if (ret)
3154 				break;
3155 			    patinput = save;
3156 			    patglobflags = savglobflags;
3157 			    errsfound = saverrsfound;
3158 			}
3159 			zfree((char *)syncstrp->p,
3160 			      (patinend - patinstart) + 1);
3161 			syncstrp->p = oldsyncstr;
3162 			if (ret) {
3163 			    patinput = matchpt;
3164 			    errsfound = matchederrs;
3165 			    return 1;
3166 			}
3167 			while ((scan = PATNEXT(scan)) &&
3168 			       P_ISEXCLUDE(scan))
3169 			    ;
3170 		    } else {
3171 			int ret = 1, pfree = 0;
3172 			Upat ptrp = NULL;
3173 			unsigned char *ptr;
3174 			if (P_OP(scan) == P_WBRANCH) {
3175 			    /*
3176 			     * This is where we make sure that we are not
3177 			     * repeatedly matching zero-length strings in
3178 			     * a closure, which would cause an infinite loop,
3179 			     * and also remove exponential behaviour in
3180 			     * backtracking nested closures.
3181 			     * The P_WBRANCH operator leaves a space for a
3182 			     * uchar *, initialized to NULL, which is
3183 			     * turned into a string the same length as the
3184 			     * target string.  Every time we match from a
3185 			     * particular point in the target string, we
3186 			     * stick a 1 at the corresponding point here.
3187 			     * If we come round to the same branch again, and
3188 			     * there is already a 1, then the test fails.
3189 			     */
3190 			    opnd = P_OPERAND(scan);
3191 			    ptrp = opnd++;
3192 			    if (!ptrp->p) {
3193 				ptrp->p = (unsigned char *)
3194 				    zshcalloc((patinend - patinstart) + 1);
3195 				pfree = 1;
3196 			    }
3197 			    ptr = ptrp->p + (patinput - patinstart);
3198 
3199 			    /*
3200 			     * Without approximation, this is just a
3201 			     * single bit test.  With approximation, we
3202 			     * need to know how many errors there were
3203 			     * last time we made the test.  If errsfound
3204 			     * is now smaller than it was, hence we can
3205 			     * make more approximations in the remaining
3206 			     * code, we continue with the test.
3207 			     * (This is why the max number of errors is
3208 			     * 254, not 255.)
3209 			     */
3210 			    if (*ptr && errsfound + 1 >= *ptr)
3211 				ret = 0;
3212 			    *ptr = errsfound + 1;
3213 			} else
3214 			    opnd = P_OPERAND(scan);
3215 			if (ret)
3216 			    ret = patmatch(opnd);
3217 			if (pfree) {
3218 			    zfree((char *)ptrp->p,
3219 				  (patinend - patinstart) + 1);
3220 			    ptrp->p = NULL;
3221 			}
3222 			if (ret)
3223 			    return 1;
3224 			scan = PATNEXT(scan);
3225 		    }
3226 		    patinput = save;
3227 		    patglobflags = savglobflags;
3228 		    errsfound = saverrsfound;
3229 		    DPUTS(P_OP(scan) == P_WBRANCH,
3230 			  "BUG: WBRANCH not first choice.");
3231 		    next = PATNEXT(scan);
3232 		} while (scan && P_ISBRANCH(scan));
3233 		return 0;
3234 	    }
3235 	    break;
3236 	case P_STAR:
3237 	    /* Handle specially for speed, although really P_ONEHASH+P_ANY */
3238 	    while (P_OP(next) == P_STAR) {
3239 		/*
3240 		 * If there's another * following we can optimise it
3241 		 * out.  Chains of *'s can give pathologically bad
3242 		 * performance.
3243 		 */
3244 		scan = next;
3245 		next = PATNEXT(scan);
3246 	    }
3247 	    /*FALLTHROUGH*/
3248 	case P_ONEHASH:
3249 	case P_TWOHASH:
3250 	    /*
3251 	     * This is just simple cases, matching one character.
3252 	     * With approximations, we still handle * this way, since
3253 	     * no approximation is ever necessary, but other closures
3254 	     * are handled by the more complicated branching method
3255 	     */
3256 	    op = P_OP(scan);
3257 	    /* Note that no counts possibly metafied characters */
3258 	    start = patinput;
3259 	    {
3260 		char *lastcharstart;
3261 		/*
3262 		 * Array to record the start of characters for
3263 		 * backtracking.
3264 		 */
3265 		VARARR(char, charstart, patinend-patinput);
3266 		memset(charstart, 0, patinend-patinput);
3267 
3268 		if (op == P_STAR) {
3269 		    for (no = 0; patinput < patinend;
3270 			 CHARINC(patinput, patinend))
3271 		    {
3272 			charstart[patinput-start] = 1;
3273 			no++;
3274 		    }
3275 		    /* simple optimization for reasonably common case */
3276 		    if (P_OP(next) == P_END)
3277 			return 1;
3278 		} else {
3279 		    DPUTS(patglobflags & 0xff,
3280 			  "BUG: wrong backtracking with approximation.");
3281 		    if (!globdots && P_NOTDOT(P_OPERAND(scan)) &&
3282 			patinput == patinstart && patinput < patinend &&
3283 			CHARREF(patinput, patinend) == ZWC('.'))
3284 			return 0;
3285 		    no = patrepeat(P_OPERAND(scan), charstart);
3286 		}
3287 		min = (op == P_TWOHASH) ? 1 : 0;
3288 		/*
3289 		 * Lookahead to avoid useless matches. This is not possible
3290 		 * with approximation.
3291 		 */
3292 		if (P_OP(next) == P_EXACTLY && P_LS_LEN(next) &&
3293 		    !(patglobflags & 0xff)) {
3294 		    char *nextop = P_LS_STR(next);
3295 #ifdef MULTIBYTE_SUPPORT
3296 		    /* else second argument of CHARREF isn't used */
3297 		    int nextlen = P_LS_LEN(next);
3298 #endif
3299 		    /*
3300 		     * If that P_EXACTLY is last (common in simple patterns,
3301 		     * such as *.c), then it can be only be matched at one
3302 		     * point in the test string, so record that.
3303 		     */
3304 		    if (P_OP(PATNEXT(next)) == P_END &&
3305 			!(patflags & PAT_NOANCH)) {
3306 			int ptlen = patinend - patinput;
3307 			int lenmatch = patinend -
3308 			    (min ? CHARNEXT(start, patinend) : start);
3309 			/* Are we in the right range? */
3310 			if (P_LS_LEN(next) > lenmatch ||
3311 			    P_LS_LEN(next) < ptlen)
3312 			    return 0;
3313 			/* Yes, just position appropriately and test. */
3314 			patinput += ptlen - P_LS_LEN(next);
3315 			/*
3316 			 * Here we will need to be careful that patinput is not
3317 			 * in the middle of a multibyte character.
3318 			 */
3319 			/* Continue loop with P_EXACTLY test. */
3320 			break;
3321 		    }
3322 		    nextch = CHARREF(nextop, nextop + nextlen);
3323 		} else
3324 		    nextch = PEOF;
3325 		savglobflags = patglobflags;
3326 		saverrsfound = errsfound;
3327 		lastcharstart = charstart + (patinput - start);
3328 		if (no >= min) {
3329 		    for (;;) {
3330 			patint_t charmatch_cache;
3331 			if (nextch == PEOF ||
3332 			    (patinput < patinend &&
3333 			     CHARMATCH_EXPR(CHARREF(patinput, patinend),
3334 					    nextch))) {
3335 			    if (patmatch(next))
3336 				return 1;
3337 			}
3338 			if (--no < min)
3339 			    break;
3340 			/* find start of previous full character */
3341 			while (!*--lastcharstart)
3342 			    DPUTS(lastcharstart < charstart,
3343 				  "lastcharstart invalid");
3344 			patinput = start + (lastcharstart-charstart);
3345 			patglobflags = savglobflags;
3346 			errsfound = saverrsfound;
3347 		    }
3348 		}
3349 	    }
3350 	    /*
3351 	     * As with branches, the patmatch(next) stuff for *
3352 	     * handles approximation, so we don't need to try
3353 	     * anything here.
3354 	     */
3355 	    return 0;
3356 	case P_ISSTART:
3357 	    if (patinput != patinstart || (patflags & PAT_NOTSTART))
3358 		fail = 1;
3359 	    break;
3360 	case P_ISEND:
3361 	    if (patinput < patinend || (patflags & PAT_NOTEND))
3362 		fail = 1;
3363 	    break;
3364 	case P_COUNTSTART:
3365 	    {
3366 		/*
3367 		 * Save and restore the current count and the
3368 		 * start pointer in case the pattern has been
3369 		 * executed by a previous repetition of a
3370 		 * closure.
3371 		 */
3372 		long *curptr = &P_OPERAND(scan)[P_CT_CURRENT].l;
3373 		long savecount = *curptr;
3374 		unsigned char *saveptr = scan[P_CT_PTR].p;
3375 		int ret;
3376 
3377 		*curptr = 0L;
3378 		ret = patmatch(P_OPERAND(scan));
3379 		*curptr = savecount;
3380 		scan[P_CT_PTR].p = saveptr;
3381 		return ret;
3382 	    }
3383 	case P_COUNT:
3384 	    {
3385 		/* (#cN,M): execution is relatively straightforward */
3386 		long cur = scan[P_CT_CURRENT].l;
3387 		long min = scan[P_CT_MIN].l;
3388 		long max = scan[P_CT_MAX].l;
3389 
3390 		if (cur && cur >= min &&
3391 		    (unsigned char *)patinput == scan[P_CT_PTR].p) {
3392 		    /*
3393 		     * Not at the first attempt to match so
3394 		     * the previous attempt managed zero length.
3395 		     * We can do this indefinitely so there's
3396 		     * no point in going on.  Simply try to
3397 		     * match the remainder of the pattern.
3398 		     */
3399 		    return patmatch(next);
3400 		}
3401 		scan[P_CT_PTR].p = (unsigned char *)patinput;
3402 
3403 		if (max < 0 || cur < max) {
3404 		    char *patinput_thistime = patinput;
3405 		    scan[P_CT_CURRENT].l = cur + 1;
3406 		    if (patmatch(scan + P_CT_OPERAND))
3407 			return 1;
3408 		    scan[P_CT_CURRENT].l = cur;
3409 		    patinput = patinput_thistime;
3410 		}
3411 		if (cur < min)
3412 		    return 0;
3413 		return patmatch(next);
3414 	    }
3415 	case P_END:
3416 	    if (!(fail = (patinput < patinend && !(patflags & PAT_NOANCH))))
3417 		return 1;
3418 	    break;
3419 #ifdef DEBUG
3420 	default:
3421 	    dputs("BUG: bad operand in patmatch.");
3422 	    return 0;
3423 	    break;
3424 #endif
3425 	}
3426 
3427 	if (fail) {
3428 	    if (errsfound < (patglobflags & 0xff) &&
3429 		(forceerrs == -1 || errsfound < forceerrs)) {
3430 		/*
3431 		 * Approximation code.  There are four possibilities
3432 		 *
3433 		 * 1. omit character from input string
3434 		 * 2. transpose characters in input and pattern strings
3435 		 * 3. omit character in both input and pattern strings
3436 		 * 4. omit character from pattern string.
3437 		 *
3438 		 * which we try in that order.
3439 		 *
3440 		 * Of these, 2, 3 and 4 require an exact match string
3441 		 * (P_EXACTLY) while 1, 2 and 3 require that we not
3442 		 * have reached the end of the input string.
3443 		 *
3444 		 * Note in each case after making the approximation we
3445 		 * need to retry the *same* pattern; this is what
3446 		 * requires exactpos, a slightly doleful way of
3447 		 * communicating with the exact character matcher.
3448 		 */
3449 		char *savexact = exactpos;
3450 		save = patinput;
3451 		savglobflags = patglobflags;
3452 		saverrsfound = ++errsfound;
3453 		fail = 0;
3454 
3455 		DPUTS(P_OP(scan) != P_EXACTLY && exactpos,
3456 		      "BUG: non-exact match has set exactpos");
3457 
3458 		/* Try omitting a character from the input string */
3459 		if (patinput < patinend) {
3460 		    CHARINC(patinput, patinend);
3461 		    /* If we are not on an exact match, then this is
3462 		     * our last gasp effort, so we can optimize out
3463 		     * the recursive call.
3464 		     */
3465 		    if (P_OP(scan) != P_EXACTLY)
3466 			continue;
3467 		    if (patmatch(scan))
3468 			return 1;
3469 		}
3470 
3471 		if (P_OP(scan) == P_EXACTLY) {
3472 		    char *nextexact = savexact;
3473 		    DPUTS(!savexact,
3474 			  "BUG: exact match has not set exactpos");
3475 		    CHARINC(nextexact, exactend);
3476 
3477 		    if (save < patinend) {
3478 			char *nextin = save;
3479 			CHARINC(nextin, patinend);
3480 			patglobflags = savglobflags;
3481 			errsfound = saverrsfound;
3482 			exactpos = savexact;
3483 
3484 			/*
3485 			 * Try swapping two characters in patinput and
3486 			 * exactpos
3487 			 */
3488 			if (save < patinend && nextin < patinend &&
3489 			    nextexact < exactend) {
3490 			    patint_t cin0 = CHARREF(save, patinend);
3491 			    patint_t cpa0 = CHARREF(exactpos, exactend);
3492 			    patint_t cin1 = CHARREF(nextin, patinend);
3493 			    patint_t cpa1 = CHARREF(nextexact, exactend);
3494 
3495 			    if (CHARMATCH(cin0, cpa1) &&
3496 				CHARMATCH(cin1, cpa0)) {
3497 				patinput = nextin;
3498 				CHARINC(patinput, patinend);
3499 				exactpos = nextexact;
3500 				CHARINC(exactpos, exactend);
3501 				if (patmatch(scan))
3502 				    return 1;
3503 
3504 				patglobflags = savglobflags;
3505 				errsfound = saverrsfound;
3506 			    }
3507 			}
3508 
3509 			/*
3510 			 * Try moving up both strings.
3511 			 */
3512 			patinput = nextin;
3513 			exactpos = nextexact;
3514 			if (patmatch(scan))
3515 			    return 1;
3516 
3517 			patinput = save;
3518 			patglobflags = savglobflags;
3519 			errsfound = saverrsfound;
3520 			exactpos = savexact;
3521 		    }
3522 
3523 		    DPUTS(exactpos == exactend, "approximating too far");
3524 		    /*
3525 		     * Try moving up the exact match pattern.
3526 		     * This must be the last attempt, so just loop
3527 		     * instead of calling recursively.
3528 		     */
3529 		    CHARINC(exactpos, exactend);
3530 		    continue;
3531 		}
3532 	    }
3533 	    exactpos = NULL;
3534 	    return 0;
3535 	}
3536 
3537 	scan = next;
3538 
3539 	/* Allow handlers to run once per loop */
3540 	check_for_signals();
3541     }
3542 
3543     return 0;
3544 }
3545 
3546 
3547 /**/
3548 #ifdef MULTIBYTE_SUPPORT
3549 
3550 /*
3551  * See if character ch matches a pattern range specification.
3552  * The null-terminated specification is in range; the test
3553  * character is in ch.
3554  *
3555  * zmb is one of the enum defined above charref(), for indicating
3556  * incomplete or invalid multibyte characters.
3557  *
3558  * indptr is used by completion matching, which is why this
3559  * function is exported.  If indptr is not NULL we set *indptr
3560  * to the index of the character in the range string, adjusted
3561  * in the case of "A-B" ranges such that A would count as its
3562  * normal index (say IA), B would count as IA + (B-A), and any
3563  * character within the range as appropriate.  We're not strictly
3564  * guaranteed this fits within a wint_t, but if this is Unicode
3565  * in 32 bits we have a fair amount of distance left over.
3566  *
3567  * mtp is used in the same circumstances.  *mtp returns the match type:
3568  * 0 for a standard character, else the PP_ index.  It's not
3569  * useful if the match failed.
3570  */
3571 
3572 /**/
3573 mod_export int
mb_patmatchrange(char * range,wchar_t ch,int zmb_ind,wint_t * indptr,int * mtp)3574 mb_patmatchrange(char *range, wchar_t ch, int zmb_ind, wint_t *indptr, int *mtp)
3575 {
3576     wchar_t r1, r2;
3577 
3578     if (indptr)
3579 	*indptr = 0;
3580     /*
3581      * Careful here: unlike other strings, range is a NULL-terminated,
3582      * metafied string, because we need to treat the Posix and hyphenated
3583      * ranges specially.
3584      */
3585     while (*range) {
3586 	if (imeta(STOUC(*range))) {
3587 	    int swtype = STOUC(*range++) - STOUC(Meta);
3588 	    if (mtp)
3589 		*mtp = swtype;
3590 	    switch (swtype) {
3591 	    case 0:
3592 		/* ordinary metafied character */
3593 		range--;
3594 		if (metacharinc(&range) == ch)
3595 		    return 1;
3596 		break;
3597 	    case PP_ALPHA:
3598 		if (iswalpha(ch))
3599 		    return 1;
3600 		break;
3601 	    case PP_ALNUM:
3602 		if (iswalnum(ch))
3603 		    return 1;
3604 		break;
3605 	    case PP_ASCII:
3606 		if ((ch & ~0x7f) == 0)
3607 		    return 1;
3608 		break;
3609 	    case PP_BLANK:
3610 #if !defined(HAVE_ISWBLANK) && !defined(iswblank)
3611 /*
3612  * iswblank() is GNU and C99. There's a remote chance that some
3613  * systems still don't support it (but would support the other ones
3614  * if MULTIBYTE_SUPPORT is enabled).
3615  */
3616 #define iswblank(c) (c == L' ' || c == L'\t')
3617 #endif
3618 		if (iswblank(ch))
3619 		    return 1;
3620 		break;
3621 	    case PP_CNTRL:
3622 		if (iswcntrl(ch))
3623 		    return 1;
3624 		break;
3625 	    case PP_DIGIT:
3626 		if (iswdigit(ch))
3627 		    return 1;
3628 		break;
3629 	    case PP_GRAPH:
3630 		if (iswgraph(ch))
3631 		    return 1;
3632 		break;
3633 	    case PP_LOWER:
3634 		if (iswlower(ch))
3635 		    return 1;
3636 		break;
3637 	    case PP_PRINT:
3638 		if (WC_ISPRINT(ch))
3639 		    return 1;
3640 		break;
3641 	    case PP_PUNCT:
3642 		if (iswpunct(ch))
3643 		    return 1;
3644 		break;
3645 	    case PP_SPACE:
3646 		if (iswspace(ch))
3647 		    return 1;
3648 		break;
3649 	    case PP_UPPER:
3650 		if (iswupper(ch))
3651 		    return 1;
3652 		break;
3653 	    case PP_XDIGIT:
3654 		if (iswxdigit(ch))
3655 		    return 1;
3656 		break;
3657 	    case PP_IDENT:
3658 		if (wcsitype(ch, IIDENT))
3659 		    return 1;
3660 		break;
3661 	    case PP_IFS:
3662 		if (wcsitype(ch, ISEP))
3663 		    return 1;
3664 		break;
3665 	    case PP_IFSSPACE:
3666 		/* must be ASCII space character */
3667 		if (ch < 128 && iwsep((int)ch))
3668 		    return 1;
3669 		break;
3670 	    case PP_WORD:
3671 		if (wcsitype(ch, IWORD))
3672 		    return 1;
3673 		break;
3674 	    case PP_RANGE:
3675 		r1 = metacharinc(&range);
3676 		r2 = metacharinc(&range);
3677 		if (r1 <= ch && ch <= r2) {
3678 		    if (indptr)
3679 			*indptr += ch - r1;
3680 		    return 1;
3681 		}
3682 		/* Careful not to screw up counting with bogus range */
3683 		if (indptr && r1 < r2) {
3684 		    /*
3685 		     * This gets incremented again below to get
3686 		     * us past the range end.  This is correct.
3687 		     */
3688 		    *indptr += r2 - r1;
3689 		}
3690 		break;
3691 	    case PP_INCOMPLETE:
3692 		if (zmb_ind == ZMB_INCOMPLETE)
3693 		    return 1;
3694 		break;
3695 	    case PP_INVALID:
3696 		if (zmb_ind == ZMB_INVALID)
3697 		    return 1;
3698 		break;
3699 	    case PP_UNKWN:
3700 		DPUTS(1, "BUG: unknown posix range passed through.\n");
3701 		break;
3702 	    default:
3703 		DPUTS(1, "BUG: unknown metacharacter in range.");
3704 		break;
3705 	    }
3706 	} else if (metacharinc(&range) == ch) {
3707 	    if (mtp)
3708 		*mtp = 0;
3709 	    return 1;
3710 	}
3711 	if (indptr)
3712 	    (*indptr)++;
3713     }
3714     return 0;
3715 }
3716 
3717 
3718 /*
3719  * This is effectively the reverse of mb_patmatchrange().
3720  * Given a range descriptor of the same form, and an index into it,
3721  * try to determine the character that is matched.  If the index
3722  * points to a [:...:] generic style match, set chr to WEOF and
3723  * return the type in mtp instead.  Return 1 if successful, 0 if
3724  * there was no corresponding index.  Note all pointer arguments
3725  * must be non-null.
3726  */
3727 
3728 /**/
3729 mod_export int
mb_patmatchindex(char * range,wint_t ind,wint_t * chr,int * mtp)3730 mb_patmatchindex(char *range, wint_t ind, wint_t *chr, int *mtp)
3731 {
3732     wchar_t r1, r2, rchr;
3733     wint_t rdiff;
3734 
3735     *chr = WEOF;
3736     *mtp = 0;
3737 
3738     while (*range) {
3739 	if (imeta(STOUC(*range))) {
3740 	    int swtype = STOUC(*range++) - STOUC(Meta);
3741 	    switch (swtype) {
3742 	    case 0:
3743 		range--;
3744 		rchr = metacharinc(&range);
3745 		if (!ind) {
3746 		    *chr = (wint_t) rchr;
3747 		    return 1;
3748 		}
3749 		break;
3750 
3751 	    case PP_ALPHA:
3752 	    case PP_ALNUM:
3753 	    case PP_ASCII:
3754 	    case PP_BLANK:
3755 	    case PP_CNTRL:
3756 	    case PP_DIGIT:
3757 	    case PP_GRAPH:
3758 	    case PP_LOWER:
3759 	    case PP_PRINT:
3760 	    case PP_PUNCT:
3761 	    case PP_SPACE:
3762 	    case PP_UPPER:
3763 	    case PP_XDIGIT:
3764 	    case PP_IDENT:
3765 	    case PP_IFS:
3766 	    case PP_IFSSPACE:
3767 	    case PP_WORD:
3768 	    case PP_INCOMPLETE:
3769 	    case PP_INVALID:
3770 		if (!ind) {
3771 		    *mtp = swtype;
3772 		    return 1;
3773 		}
3774 		break;
3775 
3776 	    case PP_RANGE:
3777 		r1 = metacharinc(&range);
3778 		r2 = metacharinc(&range);
3779 		rdiff = (wint_t)r2 - (wint_t)r1;
3780 		if (rdiff >= ind) {
3781 		    *chr = (wint_t)r1 + ind;
3782 		    return 1;
3783 		}
3784 		/* note the extra decrement to ind below */
3785 		ind -= rdiff;
3786 		break;
3787 	    case PP_UNKWN:
3788 		DPUTS(1, "BUG: unknown posix range passed through.\n");
3789 		break;
3790 	    default:
3791 		DPUTS(1, "BUG: unknown metacharacter in range.");
3792 		break;
3793 	    }
3794 	} else {
3795 	    rchr = metacharinc(&range);
3796 	    if (!ind) {
3797 		*chr = (wint_t)rchr;
3798 		return 1;
3799 	    }
3800 	}
3801 	if (!ind--)
3802 	    break;
3803     }
3804 
3805     /* No corresponding index. */
3806     return 0;
3807 }
3808 
3809 /**/
3810 #endif /* MULTIBYTE_SUPPORT */
3811 
3812 /*
3813  * Identical function to mb_patmatchrange() above for single-byte
3814  * characters.
3815  */
3816 
3817 /**/
3818 mod_export int
patmatchrange(char * range,int ch,int * indptr,int * mtp)3819 patmatchrange(char *range, int ch, int *indptr, int *mtp)
3820 {
3821     int r1, r2;
3822 
3823     if (indptr)
3824 	*indptr = 0;
3825     /*
3826      * Careful here: unlike other strings, range is a NULL-terminated,
3827      * metafied string, because we need to treat the Posix and hyphenated
3828      * ranges specially.
3829      */
3830     for (; *range; range++) {
3831 	if (imeta(STOUC(*range))) {
3832 	    int swtype = STOUC(*range) - STOUC(Meta);
3833 	    if (mtp)
3834 		*mtp = swtype;
3835 	    switch (swtype) {
3836 	    case 0:
3837 		if (STOUC(*++range ^ 32) == ch)
3838 		    return 1;
3839 		break;
3840 	    case PP_ALPHA:
3841 		if (isalpha(ch))
3842 		    return 1;
3843 		break;
3844 	    case PP_ALNUM:
3845 		if (isalnum(ch))
3846 		    return 1;
3847 		break;
3848 	    case PP_ASCII:
3849 		if ((ch & ~0x7f) == 0)
3850 		    return 1;
3851 		break;
3852 	    case PP_BLANK:
3853 #if !defined(HAVE_ISBLANK) && !defined(isblank)
3854 /*
3855  * isblank() is GNU and C99. There's a remote chance that some
3856  * systems still don't support it.
3857  */
3858 #define isblank(c) (c == ' ' || c == '\t')
3859 #endif
3860 		if (isblank(ch))
3861 		    return 1;
3862 		break;
3863 	    case PP_CNTRL:
3864 		if (iscntrl(ch))
3865 		    return 1;
3866 		break;
3867 	    case PP_DIGIT:
3868 		if (isdigit(ch))
3869 		    return 1;
3870 		break;
3871 	    case PP_GRAPH:
3872 		if (isgraph(ch))
3873 		    return 1;
3874 		break;
3875 	    case PP_LOWER:
3876 		if (islower(ch))
3877 		    return 1;
3878 		break;
3879 	    case PP_PRINT:
3880 		if (ZISPRINT(ch))
3881 		    return 1;
3882 		break;
3883 	    case PP_PUNCT:
3884 		if (ispunct(ch))
3885 		    return 1;
3886 		break;
3887 	    case PP_SPACE:
3888 		if (isspace(ch))
3889 		    return 1;
3890 		break;
3891 	    case PP_UPPER:
3892 		if (isupper(ch))
3893 		    return 1;
3894 		break;
3895 	    case PP_XDIGIT:
3896 		if (isxdigit(ch))
3897 		    return 1;
3898 		break;
3899 	    case PP_IDENT:
3900 		if (iident(ch))
3901 		    return 1;
3902 		break;
3903 	    case PP_IFS:
3904 		if (isep(ch))
3905 		    return 1;
3906 		break;
3907 	    case PP_IFSSPACE:
3908 		if (iwsep(ch))
3909 		    return 1;
3910 		break;
3911 	    case PP_WORD:
3912 		if (iword(ch))
3913 		    return 1;
3914 		break;
3915 	    case PP_RANGE:
3916 		range++;
3917 		r1 = STOUC(UNMETA(range));
3918 		METACHARINC(range);
3919 		r2 = STOUC(UNMETA(range));
3920 		if (*range == Meta)
3921 		    range++;
3922 		if (r1 <= ch && ch <= r2) {
3923 		    if (indptr)
3924 			*indptr += ch - r1;
3925 		    return 1;
3926 		}
3927 		if (indptr && r1 < r2)
3928 		    *indptr += r2 - r1;
3929 		break;
3930 	    case PP_INCOMPLETE:
3931 	    case PP_INVALID:
3932 		/* Never true if not in multibyte mode */
3933 		break;
3934 	    case PP_UNKWN:
3935 		DPUTS(1, "BUG: unknown posix range passed through.\n");
3936 		break;
3937 	    default:
3938 		DPUTS(1, "BUG: unknown metacharacter in range.");
3939 		break;
3940 	    }
3941 	} else if (STOUC(*range) == ch) {
3942 	    if (mtp)
3943 		*mtp = 0;
3944 	    return 1;
3945 	}
3946 	if (indptr)
3947 	    (*indptr)++;
3948     }
3949     return 0;
3950 }
3951 
3952 
3953 /**/
3954 #ifndef MULTIBYTE_SUPPORT
3955 
3956 /*
3957  * Identical function to mb_patmatchindex() above for single-byte
3958  * characters.  Here -1 represents a character that needs a special type.
3959  *
3960  * Unlike patmatchrange, we only need this in ZLE, which always
3961  * uses MULTIBYTE_SUPPORT if compiled in; hence we don't use
3962  * this function in that case.
3963  */
3964 
3965 /**/
3966 mod_export int
patmatchindex(char * range,int ind,int * chr,int * mtp)3967 patmatchindex(char *range, int ind, int *chr, int *mtp)
3968 {
3969     int r1, r2, rdiff, rchr;
3970 
3971     *chr = -1;
3972     *mtp = 0;
3973 
3974     for (; *range; range++) {
3975 	if (imeta(STOUC(*range))) {
3976 	    int swtype = STOUC(*range) - STOUC(Meta);
3977 	    switch (swtype) {
3978 	    case 0:
3979 		/* ordinary metafied character */
3980 		rchr = STOUC(*++range) ^ 32;
3981 		if (!ind) {
3982 		    *chr = rchr;
3983 		    return 1;
3984 		}
3985 		break;
3986 
3987 	    case PP_ALPHA:
3988 	    case PP_ALNUM:
3989 	    case PP_ASCII:
3990 	    case PP_BLANK:
3991 	    case PP_CNTRL:
3992 	    case PP_DIGIT:
3993 	    case PP_GRAPH:
3994 	    case PP_LOWER:
3995 	    case PP_PRINT:
3996 	    case PP_PUNCT:
3997 	    case PP_SPACE:
3998 	    case PP_UPPER:
3999 	    case PP_XDIGIT:
4000 	    case PP_IDENT:
4001 	    case PP_IFS:
4002 	    case PP_IFSSPACE:
4003 	    case PP_WORD:
4004 	    case PP_INCOMPLETE:
4005 	    case PP_INVALID:
4006 		if (!ind) {
4007 		    *mtp = swtype;
4008 		    return 1;
4009 		}
4010 		break;
4011 
4012 	    case PP_RANGE:
4013 		range++;
4014 		r1 = STOUC(UNMETA(range));
4015 		METACHARINC(range);
4016 		r2 = STOUC(UNMETA(range));
4017 		if (*range == Meta)
4018 		    range++;
4019 		rdiff = r2 - r1;
4020 		if (rdiff >= ind) {
4021 		    *chr = r1 + ind;
4022 		    return 1;
4023 		}
4024 		/* note the extra decrement to ind below */
4025 		ind -= rdiff;
4026 		break;
4027 	    case PP_UNKWN:
4028 		DPUTS(1, "BUG: unknown posix range passed through.\n");
4029 		break;
4030 	    default:
4031 		DPUTS(1, "BUG: unknown metacharacter in range.");
4032 		break;
4033 	    }
4034 	} else {
4035 	    if (!ind) {
4036 		*chr = STOUC(*range);
4037 		return 1;
4038 	    }
4039 	}
4040 	if (!ind--)
4041 	    break;
4042     }
4043 
4044     /* No corresponding index. */
4045     return 0;
4046 }
4047 
4048 /**/
4049 #endif /* MULTIBYTE_SUPPORT */
4050 
4051 /*
4052  * Repeatedly match something simple and say how many times.
4053  * charstart is an array parallel to that starting at patinput
4054  * and records the start of (possibly multibyte) characters
4055  * to aid in later backtracking.
4056  */
4057 
4058 /**/
patrepeat(Upat p,char * charstart)4059 static int patrepeat(Upat p, char *charstart)
4060 {
4061     int count = 0;
4062     patint_t tch, charmatch_cache;
4063     char *scan, *opnd;
4064 
4065     scan = patinput;
4066     opnd = (char *)P_OPERAND(p);
4067 
4068     switch(P_OP(p)) {
4069 #ifdef DEBUG
4070     case P_ANY:
4071 	dputs("BUG: ?# did not get optimized to *");
4072 	return 0;
4073 	break;
4074 #endif
4075     case P_EXACTLY:
4076 	DPUTS(P_LS_LEN(p) != 1, "closure following more than one character");
4077 	tch = CHARREF(P_LS_STR(p), P_LS_STR(p) + P_LS_LEN(p));
4078 	while (scan < patinend &&
4079 	       CHARMATCH_EXPR(CHARREF(scan, patinend), tch)) {
4080 	    charstart[scan-patinput] = 1;
4081 	    count++;
4082 	    CHARINC(scan, patinend);
4083 	}
4084 	break;
4085     case P_ANYOF:
4086     case P_ANYBUT:
4087 	while (scan < patinend) {
4088 #ifdef MULTIBYTE_SUPPORT
4089 	    int zmb_ind;
4090 	    wchar_t cr = charref(scan, patinend, &zmb_ind);
4091 	    if (patglobflags & GF_MULTIBYTE) {
4092 		if (mb_patmatchrange(opnd, cr, zmb_ind, NULL, NULL) ^
4093 		    (P_OP(p) == P_ANYOF))
4094 		    break;
4095 	    } else if (patmatchrange(opnd, (int)cr, NULL, NULL) ^
4096 		       (P_OP(p) == P_ANYOF))
4097 		break;
4098 #else
4099 	    if (patmatchrange(opnd, CHARREF(scan, patinend), NULL, NULL) ^
4100 		(P_OP(p) == P_ANYOF))
4101 		break;
4102 #endif
4103 	    charstart[scan-patinput] = 1;
4104 	    count++;
4105 	    CHARINC(scan, patinend);
4106 	}
4107 	break;
4108 #ifdef DEBUG
4109     default:
4110 	dputs("BUG: something very strange is happening in patrepeat");
4111 	return 0;
4112 	break;
4113 #endif
4114     }
4115 
4116     patinput = scan;
4117     return count;
4118 }
4119 
4120 /* Free a patprog. */
4121 
4122 /**/
4123 mod_export void
freepatprog(Patprog prog)4124 freepatprog(Patprog prog)
4125 {
4126     if (prog && prog != dummy_patprog1 && prog != dummy_patprog2)
4127 	zfree(prog, prog->size);
4128 }
4129 
4130 /* Disable or reenable a pattern character */
4131 
4132 /**/
4133 int
pat_enables(const char * cmd,char ** patp,int enable)4134 pat_enables(const char *cmd, char **patp, int enable)
4135 {
4136     int ret = 0;
4137     const char **stringp;
4138     char *disp;
4139 
4140     if (!*patp) {
4141 	int done = 0;
4142 	for (stringp = zpc_strings, disp = zpc_disables;
4143 	     stringp < zpc_strings + ZPC_COUNT;
4144 	     stringp++, disp++) {
4145 	    if (!*stringp)
4146 		continue;
4147 	    if (enable ? *disp : !*disp)
4148 		continue;
4149 	    if (done)
4150 		putc(' ', stdout);
4151 	    printf("'%s'", *stringp);
4152 	    done = 1;
4153 	}
4154 	if (done)
4155 	    putc('\n', stdout);
4156 	return 0;
4157     }
4158 
4159     for (; *patp; patp++) {
4160 	for (stringp = zpc_strings, disp = zpc_disables;
4161 	     stringp < zpc_strings + ZPC_COUNT;
4162 	     stringp++, disp++) {
4163 	    if (*stringp && !strcmp(*stringp, *patp)) {
4164 		*disp = (char)!enable;
4165 		break;
4166 	    }
4167 	}
4168 	if (stringp == zpc_strings + ZPC_COUNT) {
4169 	    zerrnam(cmd, "invalid pattern: %s", *patp);
4170 	    ret = 1;
4171 	}
4172     }
4173 
4174     return ret;
4175 }
4176 
4177 /*
4178  * Save the current state of pattern disables, returning the saved value.
4179  */
4180 
4181 /**/
4182 unsigned int
savepatterndisables(void)4183 savepatterndisables(void)
4184 {
4185     unsigned int disables, bit;
4186     char *disp;
4187 
4188     disables = 0;
4189     for (bit = 1, disp = zpc_disables;
4190 	 disp < zpc_disables + ZPC_COUNT;
4191 	 bit <<= 1, disp++) {
4192 	if (*disp)
4193 	    disables |= bit;
4194     }
4195     return disables;
4196 }
4197 
4198 /*
4199  * Function scope saving pattern enables.
4200  */
4201 
4202 /**/
4203 void
startpatternscope(void)4204 startpatternscope(void)
4205 {
4206     Zpc_disables_save newdis;
4207 
4208     newdis = (Zpc_disables_save)zalloc(sizeof(*newdis));
4209     newdis->next = zpc_disables_stack;
4210     newdis->disables = savepatterndisables();
4211 
4212     zpc_disables_stack = newdis;
4213 }
4214 
4215 /*
4216  * Restore completely the state of pattern disables.
4217  */
4218 
4219 /**/
4220 void
restorepatterndisables(unsigned int disables)4221 restorepatterndisables(unsigned int disables)
4222 {
4223     char *disp;
4224     unsigned int bit;
4225 
4226     for (bit = 1, disp = zpc_disables;
4227 	 disp < zpc_disables + ZPC_COUNT;
4228 	 bit <<= 1, disp++) {
4229 	if (disables & bit)
4230 	    *disp = 1;
4231 	else
4232 	    *disp = 0;
4233     }
4234 }
4235 
4236 /*
4237  * Function scope to restore pattern enables if localpatterns is turned on.
4238  */
4239 
4240 /**/
4241 void
endpatternscope(void)4242 endpatternscope(void)
4243 {
4244     Zpc_disables_save olddis;
4245 
4246     olddis = zpc_disables_stack;
4247     zpc_disables_stack = olddis->next;
4248 
4249     if (isset(LOCALPATTERNS))
4250 	restorepatterndisables(olddis->disables);
4251 
4252     zfree(olddis, sizeof(*olddis));
4253 }
4254 
4255 /* Reinitialise pattern disables */
4256 
4257 /**/
4258 void
clearpatterndisables(void)4259 clearpatterndisables(void)
4260 {
4261     memset(zpc_disables, 0, ZPC_COUNT);
4262 }
4263 
4264 
4265 /* Check to see if str is eligible for filename generation. */
4266 
4267 /**/
4268 mod_export int
haswilds(char * str)4269 haswilds(char *str)
4270 {
4271     char *start;
4272 
4273     /* `[' and `]' are legal even if bad patterns are usually not. */
4274     if ((*str == Inbrack || *str == Outbrack) && !str[1])
4275 	return 0;
4276 
4277     /* If % is immediately followed by ?, then that ? is     *
4278      * not treated as a wildcard.  This is so you don't have *
4279      * to escape job references such as %?foo.               */
4280     if (str[0] == '%' && str[1] == Quest)
4281 	str[1] = '?';
4282 
4283     /*
4284      * Note that at this point zpc_special has not been set up.
4285      */
4286     start = str;
4287     for (; *str; str++) {
4288 	switch (*str) {
4289 	    case Inpar:
4290 		if ((!isset(SHGLOB) && !zpc_disables[ZPC_INPAR]) ||
4291 		    (str > start && isset(KSHGLOB) &&
4292 		     ((str[-1] == Quest && !zpc_disables[ZPC_KSH_QUEST]) ||
4293 		      (str[-1] == Star && !zpc_disables[ZPC_KSH_STAR]) ||
4294 		      (str[-1] == '+' && !zpc_disables[ZPC_KSH_PLUS]) ||
4295 		      (str[-1] == Bang && !zpc_disables[ZPC_KSH_BANG]) ||
4296 		      (str[-1] == '!' && !zpc_disables[ZPC_KSH_BANG2]) ||
4297 		      (str[-1] == '@' && !zpc_disables[ZPC_KSH_AT]))))
4298 		    return 1;
4299 		break;
4300 
4301 	    case Bar:
4302 		if (!zpc_disables[ZPC_BAR])
4303 		    return 1;
4304 		break;
4305 
4306 	    case Star:
4307 		if (!zpc_disables[ZPC_STAR])
4308 		    return 1;
4309 		break;
4310 
4311 	    case Inbrack:
4312 		if (!zpc_disables[ZPC_INBRACK])
4313 		    return 1;
4314 		break;
4315 
4316 	    case Inang:
4317 		if (!zpc_disables[ZPC_INANG])
4318 		    return 1;
4319 		break;
4320 
4321 	    case Quest:
4322 		if (!zpc_disables[ZPC_QUEST])
4323 		    return 1;
4324 		break;
4325 
4326 	    case Pound:
4327 		if (isset(EXTENDEDGLOB) && !zpc_disables[ZPC_HASH])
4328 		    return 1;
4329 		break;
4330 
4331 	    case Hat:
4332 		if (isset(EXTENDEDGLOB) && !zpc_disables[ZPC_HAT])
4333 		    return 1;
4334 		break;
4335 	}
4336     }
4337     return 0;
4338 }
4339