1 /* @(#)match.c	1.36 19/09/21 Copyright 1985, 1995-2019 J. Schilling */
2 #include <schily/utypes.h>	/* For Uchar			*/
3 #include <schily/standard.h>
4 #include <schily/patmatch.h>
5 #define	POSIX_CLASS		/* Support [[:alpha:]] by default */
6 #ifdef	NO_POSIX_CLASS		/* Allow to disable [[:alpha:]]	  */
7 #undef	POSIX_CLASS
8 #endif
9 #ifdef	POSIX_CLASS
10 #include <schily/wchar.h>	/* With [[:alpha:]], we need wctype()	*/
11 #include <schily/wctype.h>	/* and thus wchar.h and wctype.h	*/
12 #endif
13 /*
14  *	Pattern matching functions
15  *
16  *	Copyright (c) 1985, 1995-2019 J. Schilling
17  */
18 /*
19  * The contents of this file are subject to the terms of the
20  * Common Development and Distribution License, Version 1.0 only
21  * (the "License").  You may not use this file except in compliance
22  * with the License.
23  *
24  * See the file CDDL.Schily.txt in this distribution for details.
25  * A copy of the CDDL is also available via the Internet at
26  * http://www.opensource.org/licenses/cddl1.txt
27  *
28  * When distributing Covered Code, include this CDDL HEADER in each
29  * file and include the License file CDDL.Schily.txt from this distribution.
30  */
31 /*
32  *	The pattern matching functions below are based on the algorithm
33  *	presented by Martin Richards in:
34  *
35  *	"A Compact Function for Regular Expression Pattern Matching",
36  *	Software-Practice and Experience, Vol. 9, 527-534 (1979)
37  *
38  *	Several changes have been made to the original source which has been
39  *	written in BCPL:
40  *
41  *	'/'	is replaced by	'!'		(to allow UNIX filenames)
42  *	'(',')' are replaced by	'{', '}'
43  *	'\''	is replaced by	'\\'		(UNIX compatible quote)
44  *
45  *	Character classes have been added to allow "[<character list>]"
46  *	to be used.
47  *	POSIX features like [[:alpha:]] have been added.
48  *	Start of line '^' and end of line '$' have been added.
49  */
50 
51 #undef	CHAR
52 
53 #ifdef	__LINE_MATCH
54 #ifdef	__WIDE_CHAR
55 #define	patmatch	patwlmatch
56 #else
57 #define	opatmatch	opatlmatch
58 #define	patmatch	patlmatch
59 #endif
60 #endif
61 
62 #ifdef	__WIDE_CHAR
63 #ifndef	__LINE_MATCH
64 #define	patcompile	patwcompile
65 #define	patmatch	patwmatch
66 #endif
67 #define	CHAR		wchar_t
68 #define	PCHAR		wchar_t
69 #endif
70 
71 #ifdef	__MB_CHAR
72 #undef	patmatch
73 #ifdef	__LINE_MATCH
74 #define	patmatch	patmblmatch
75 #else
76 #define	patmatch	patmbmatch
77 #endif
78 #define	PCHAR		wchar_t
79 #endif
80 
81 #ifndef	CHAR
82 #define	CHAR		Uchar
83 #endif
84 
85 #ifndef	PCHAR
86 #define	PCHAR		Uchar
87 #endif
88 
89 #define	ENDSTATE	(-1)
90 
91 #define	CL_SIZE		32	/* Max size for '[: :]'			*/
92 
93 /*
94  *	The Interpreter
95  */
96 
97 
98 /*
99  *	put adds a new state to the active list
100  */
101 #define	 put(ret, state, sp, n)	{		\
102 	register int *lstate	= state;	\
103 	register int *lsp	= sp;		\
104 	register int ln		= n;		\
105 						\
106 	while (lstate < lsp) {			\
107 		if (*lstate++ == ln) {		\
108 			ret = lsp;		\
109 			lsp = 0;		\
110 			break;			\
111 		}				\
112 	}					\
113 	if (lsp) {				\
114 		*lstate++ = ln;			\
115 		ret = lstate;			\
116 	}					\
117 }
118 
119 
120 /*
121  *	match a character in class
122  *
123  *	Syntax errors do not appear here, they are handled by the compiler,
124  *	so in theory we could remove the "return (0)" statements from the
125  *	the POSIX class code.
126  */
127 #ifdef	POSIX_CLASS
128 #define	CHK_POSIX_CLASS					\
129 	if (*lpat == LCLASS && lpat[1] == ':') {	\
130 		char	class[CL_SIZE+1];		\
131 		char	*pc = class;			\
132 							\
133 		lpat += 2;	/* Eat ':' */		\
134 		for (;;) {				\
135 			if (*lpat == '\0') {		\
136 				ok = FALSE;		\
137 				goto out;		\
138 			}				\
139 			if (*lpat == ':' && lpat[1] == RCLASS) \
140 				break;			\
141 			if (pc >= &class[CL_SIZE]) {	\
142 				ok = FALSE;		\
143 				goto out;		\
144 			}				\
145 			*pc++ = *lpat++;		\
146 		}					\
147 		if (pc == class) {			\
148 			ok = FALSE;			\
149 			goto out;			\
150 		}					\
151 		*pc = '\0';				\
152 		lpat += 2;	/* Skip ":]" */		\
153 		if (iswctype(lc, wctype(class))) {	\
154 			ok = !ok;			\
155 			goto out;			\
156 		}					\
157 		continue;				\
158 	} else
159 #else
160 #define	CHK_POSIX_CLASS
161 #endif
162 #define	in_class(found, pat, c)	{			\
163 	register const PCHAR	*lpat	= pat;		\
164 	register int		lc	= c;		\
165 	int	lo_bound;				\
166 	int	hi_bound;				\
167 	BOOL	ok			= FALSE;	\
168 							\
169 	if (*lpat == NOT) {				\
170 		lpat++;					\
171 		ok = TRUE;				\
172 	}						\
173 	do {						\
174 		if (*lpat == '\0') {			\
175 			ok = FALSE;			\
176 			goto out;			\
177 		}					\
178 		CHK_POSIX_CLASS				\
179 		if (*lpat == QUOTE)			\
180 			lpat++;				\
181 		lo_bound = *lpat++;			\
182 		if (*lpat == RANGE && lpat[1] != RCLASS) { \
183 			lpat++;				\
184 			if (*lpat == QUOTE)		\
185 				lpat++;			\
186 			hi_bound = *lpat++;		\
187 		} else {				\
188 			hi_bound = lo_bound;		\
189 		}					\
190 		if (lo_bound <= lc && lc <= hi_bound) {	\
191 			ok = !ok;			\
192 			goto out;			\
193 		}					\
194 	} while (*lpat != RCLASS);			\
195 out:							\
196 	found = ok;					\
197 }
198 
199 /*
200  *	opatmatch - the old external interpreter interface.
201  *
202  *	Trys to match a string beginning at offset
203  *	against the compiled pattern.
204  */
205 #if !defined(__WIDE_CHAR) && !defined(__MB_CHAR)
206 EXPORT CHAR
opatmatch(pat,aux,str,soff,slen,alt)207 *opatmatch(pat, aux, str, soff, slen, alt)
208 	const PCHAR	*pat;
209 	const int	*aux;
210 	const CHAR	*str;
211 	int		soff;
212 	int		slen;
213 	int		alt;
214 {
215 	int		state[MAXPAT];
216 
217 	return (patmatch(pat, aux, str, soff, slen, alt, state));
218 }
219 #endif
220 
221 /*
222  *	patmatch - the external interpreter interface.
223  *
224  *	Trys to match a string beginning at offset
225  *	against the compiled pattern.
226  */
227 EXPORT CHAR *
patmatch(pat,aux,str,soff,slen,alt,state)228 patmatch(pat, aux, str, soff, slen, alt, state)
229 	const PCHAR	*pat;
230 	const int	*aux;
231 	const CHAR	*str;
232 	int		soff;
233 	int		slen;
234 	int		alt;
235 	int		state[];
236 {
237 	register int	*sp;
238 	register int	*n;
239 	register int	*i;
240 	register int	p;
241 	register int	q, s, k;
242 #ifdef	__MB_CHAR
243 	wchar_t		c;
244 	int		mlen = 1;
245 #else
246 	int		c;
247 #endif
248 	const CHAR	*lastp = (CHAR *)NULL;
249 
250 #ifdef	__LINE_MATCH
251 for (; soff <= slen; soff++) {
252 #endif
253 
254 	sp = state;
255 	put(sp, state, state, 0);
256 	if (alt != ENDSTATE)
257 		put(sp, state, sp, alt);
258 
259 #ifdef	__MB_CHAR
260 	mbtowc(NULL, NULL, 0);
261 	for (s = soff; ; s += mlen) {
262 #else
263 	for (s = soff; ; s++) {
264 #endif
265 		/*
266 		 * next char from input string
267 		 */
268 		if (s >= slen) {
269 			c = 0;
270 		} else {
271 #ifdef	__MB_CHAR
272 			mlen = mbtowc(&c, (char *)&str[s], slen - s);
273 			if (mlen < 0) {
274 				mbtowc(NULL, NULL, 0);
275 				c = str[s];
276 				mlen = 1;
277 			}
278 #else
279 			c = str[s];
280 #endif
281 		}
282 		/*
283 		 * first complete the closure
284 		 */
285 		for (n = state; n < sp; ) {
286 			p = *n++;		/* next state number */
287 			if (p == ENDSTATE)
288 				continue;
289 			q = aux[p];		/* get next state for pat */
290 			k = pat[p];		/* get next char from pat */
291 			switch (k) {
292 
293 			case REP:
294 				put(sp, state, sp, p+1);
295 				/* FALLTHRU */
296 			case NIL:		/* NIL matches always */
297 			case STAR:
298 				put(sp, state, sp, q);
299 				break;
300 			case LBRACK:		/* alternations */
301 			case ALT:
302 				put(sp, state, sp, p+1);
303 				if (q != ENDSTATE)
304 					put(sp, state, sp, q);
305 				break;
306 			case START:
307 				if (s == 0)
308 					put(sp, state, sp, q);
309 				break;
310 			case END:
311 				if (c == '\0')
312 					put(sp, state, sp, q);
313 				break;
314 			}
315 		}
316 
317 		for (i = state; i < sp; ) {
318 			if (*i++ == ENDSTATE) {
319 				lastp = &str[s];
320 				break;
321 			}
322 		}
323 		if (c == 0)
324 			return ((CHAR *)lastp);
325 
326 		/*
327 		 * now try to match next character
328 		 */
329 		n = sp;
330 		sp = state;
331 		for (i = sp; i < n; ) {
332 			p = *i++;		/* next active state number */
333 			if (p == ENDSTATE)
334 				continue;
335 			k = pat[p];
336 			switch (k) {
337 
338 			case ALT:
339 			case REP:
340 			case NIL:
341 			case LBRACK:
342 			case START:
343 			case END:
344 				continue;
345 			case LCLASS:
346 				in_class(q, &pat[p+1], c);
347 				if (!q)
348 					continue;
349 				break;
350 			case STAR:
351 				put(sp, state, sp, p);
352 				continue;
353 			case QUOTE:
354 				k = pat[p+1];
355 				/* FALLTHRU */
356 			default:
357 				if (k != c)
358 					continue;
359 				/* FALLTHRU */
360 			case ANY:
361 				break;
362 			}
363 			put(sp, state, sp, aux[p]);
364 		}
365 		if (sp == state) {		/* if no new states return */
366 #ifdef	__LINE_MATCH
367 
368 			if (lastp || (soff == slen - 1))
369 				return ((CHAR *)lastp);
370 			else
371 				break;
372 #else
373 			return ((CHAR *)lastp);
374 #endif
375 		}
376 	}
377 #ifdef	__LINE_MATCH
378 }
379 return ((CHAR *)lastp);
380 #endif
381 }
382 
383 
384 #if !defined(__LINE_MATCH) && !defined(__MB_CHAR)
385 /*
386  *	The Compiler
387  */
388 
389 typedef	struct args {
390 	const PCHAR	*pattern;
391 	int		*aux;
392 	int		patp;
393 	int		length;
394 	PCHAR		Ch;
395 } arg_t;
396 
397 LOCAL	void	nextitem __PR((arg_t *));
398 LOCAL	int	prim	 __PR((arg_t *));
399 LOCAL	int	expr	 __PR((arg_t *, int *));
400 LOCAL	void	setexits __PR((int *, int, int));
401 LOCAL	int	join	 __PR((int *, int, int));
402 
403 /*
404  *	'read' the next character from pattern
405  */
406 #define	rch(ap)						\
407 {							\
408 	if (++(ap)->patp >= (ap)->length)		\
409 		(ap)->Ch = 0;				\
410 	else						\
411 		(ap)->Ch = (ap)->pattern[(ap)->patp];	\
412 }
413 
414 /*
415  *	'peek' the next character from pattern
416  */
417 #define	pch(ap)						\
418 	((((ap)->patp + 1) >= (ap)->length) ?		\
419 		0					\
420 	:						\
421 		(ap)->pattern[(ap)->patp+1])		\
422 
423 /*
424  *	get the next item from pattern
425  */
426 LOCAL void
nextitem(ap)427 nextitem(ap)
428 	arg_t	*ap;
429 {
430 	if (ap->Ch == QUOTE)
431 		rch(ap);
432 	rch(ap);
433 }
434 
435 /*
436  *	parse a primary
437  */
438 LOCAL int
prim(ap)439 prim(ap)
440 	arg_t	*ap;
441 {
442 	int	a  = ap->patp;
443 	int	op = ap->Ch;
444 	int	t;
445 
446 	nextitem(ap);			/* Eat '[' */
447 	switch (op) {
448 
449 	case '\0':
450 	case ALT:
451 	case RBRACK:
452 		return (ENDSTATE);
453 	case LCLASS:
454 		if (ap->Ch == NOT)
455 			nextitem(ap);	/* Eat '^' at first position */
456 
457 		t = TRUE;		/* Allow ] as first character */
458 		while ((t || ap->Ch != RCLASS) && ap->Ch != '\0') {
459 			t = FALSE;
460 #ifdef	POSIX_CLASS
461 			if (ap->Ch == LCLASS) {
462 				if (pch(ap) == ':') {	/* [:alpha:] */
463 					char	class[CL_SIZE+1];
464 					char	*pc = class;
465 
466 					nextitem(ap);		/* eat '[' */
467 					nextitem(ap);		/* eat ':' */
468 					while (ap->Ch != ':' &&
469 					    ap->Ch != '\0') {
470 						if (pc >= &class[CL_SIZE])
471 							return (ENDSTATE);
472 						*pc = ap->Ch;
473 						if (*pc++ != ap->Ch)
474 							return (ENDSTATE);
475 						nextitem(ap);
476 					}
477 					if (pc == class)
478 						return (ENDSTATE);
479 					*pc = '\0';
480 					if (ap->Ch == '\0')
481 						return (ENDSTATE);
482 					if (wctype(class) == 0)
483 						return (ENDSTATE);
484 					nextitem(ap);
485 					if (ap->Ch != RCLASS)
486 						return (ENDSTATE);
487 				}
488 			}
489 #endif
490 			/*
491 			 * A '-' before the ending ']' does not have the
492 			 * special range meaning.
493 			 */
494 			if (ap->Ch == RANGE &&
495 			    pch(ap) != RCLASS)	/* One more char required */
496 				nextitem(ap);	/* so get char past '-'	  */
497 			nextitem(ap);
498 		}
499 		if (ap->Ch == '\0')
500 			return (ENDSTATE);
501 		nextitem(ap);
502 		break;
503 	case REP:
504 		t = prim(ap);
505 		if (t == ENDSTATE)
506 			return (ENDSTATE);
507 		setexits(ap->aux, t, a);
508 		break;
509 	case LBRACK:
510 		a = expr(ap, &ap->aux[a]);
511 		if (a == ENDSTATE || ap->Ch != RBRACK)
512 			return (ENDSTATE);
513 		nextitem(ap);
514 		break;
515 	}
516 	return (a);
517 }
518 
519 /*
520  *	parse an expression (a sequence of primaries)
521  */
522 LOCAL int
expr(ap,altp)523 expr(ap, altp)
524 	arg_t	*ap;
525 	int	*altp;
526 {
527 	int	exits = ENDSTATE;
528 	int	a;
529 	int	*aux = ap->aux;
530 	PCHAR	Ch;
531 
532 	for (;;) {
533 		a = prim(ap);
534 		if (a == ENDSTATE)
535 			return (ENDSTATE);
536 		Ch = ap->Ch;
537 		if (Ch == ALT || Ch == RBRACK || Ch == '\0') {
538 			exits = join(aux, exits, a);
539 			if (Ch != ALT)
540 				return (exits);
541 			*altp = ap->patp;
542 			altp = &aux[ap->patp];
543 			nextitem(ap);
544 		} else
545 			setexits(aux, a, ap->patp);
546 	}
547 }
548 
549 /*
550  *	set all exits in a list to a specified value
551  */
552 LOCAL void
setexits(aux,list,val)553 setexits(aux, list, val)
554 	int	*aux;
555 	int	list;
556 	int	val;
557 {
558 	int	a;
559 
560 	while (list != ENDSTATE) {
561 		a = aux[list];
562 		aux[list] = val;
563 		list = a;
564 	}
565 }
566 
567 /*
568  *	concatenate two lists
569  */
570 LOCAL int
join(aux,a,b)571 join(aux, a, b)
572 	int	*aux;
573 	int	a;
574 	int	b;
575 {
576 	int	t;
577 
578 	if (a == ENDSTATE)
579 		return (b);
580 	t = a;
581 	while (aux[t] != ENDSTATE)
582 		t = aux[t];
583 	aux[t] = b;
584 	return (a);
585 }
586 
587 /*
588  *	patcompile - the external compiler interface.
589  *
590  *	The pattern is compiled into the aux array.
591  *	Return value on success, is the outermost alternate which is != 0.
592  *	Error is indicated by return of 0.
593  */
594 EXPORT int
patcompile(pat,len,aux)595 patcompile(pat, len, aux)
596 	const PCHAR	*pat;
597 	int		len;
598 	int		*aux;
599 {
600 	arg_t	a;
601 	int	alt = ENDSTATE;
602 	int	i;
603 
604 	a.pattern = pat;
605 	a.length  = len;
606 	a.aux	  = aux;
607 	a.patp    = -1;
608 
609 	for (i = 0; i < len; i++)
610 		aux[i] = ENDSTATE;
611 	rch(&a);
612 	i = expr(&a, &alt);
613 	if (i == ENDSTATE)
614 		return (0);
615 	setexits(aux, i, ENDSTATE);
616 	return (alt);
617 }
618 #endif /* !defined(__LINE_MATCH) && !defined(__MB_CHAR) */
619