1 /* @(#)match.c	1.27 17/08/13 Copyright 1985, 1995-2017 J. Schilling */
2 #include <schily/standard.h>
3 #include <schily/patmatch.h>
4 #define	POSIX_CLASS		/* Support [[:alpha:]] by default */
5 #ifdef	NO_POSIX_CLASS		/* Allow to disable [[:alpha:]]	  */
6 #undef	POSIX_CLASS
7 #endif
8 #ifdef	POSIX_CLASS
9 #include <schily/wchar.h>	/* With [[:alpha:]], we need wctype()	*/
10 #include <schily/wctype.h>	/* and thus wchar.h and wctype.h	*/
11 #endif
12 /*
13  *	Pattern matching functions
14  *
15  *	Copyright (c) 1985, 1995-2017 J. Schilling
16  */
17 /*
18  * The contents of this file are subject to the terms of the
19  * Common Development and Distribution License, Version 1.0 only
20  * (the "License").  You may not use this file except in compliance
21  * with the License.
22  *
23  * See the file CDDL.Schily.txt in this distribution for details.
24  * A copy of the CDDL is also available via the Internet at
25  * http://www.opensource.org/licenses/cddl1.txt
26  *
27  * When distributing Covered Code, include this CDDL HEADER in each
28  * file and include the License file CDDL.Schily.txt from this distribution.
29  */
30 /*
31  *	The pattern matching functions below are based on the algorithm
32  *	presented by Martin Richards in:
33  *
34  *	"A Compact Function for Regular Expression Pattern Matching",
35  *	Software-Practice and Experience, Vol. 9, 527-534 (1979)
36  *
37  *	Several changes have been made to the original source which has been
38  *	written in BCPL:
39  *
40  *	'/'	is replaced by	'!'		(to allow UNIX filenames)
41  *	'(',')' are replaced by	'{', '}'
42  *	'\''	is replaced by	'\\'		(UNIX compatible quote)
43  *
44  *	Character classes have been added to allow "[<character list>]"
45  *	to be used.
46  *	POSIX features like [[:alpha:]] have been added.
47  *	Start of line '^' and end of line '$' have been added.
48  */
49 
50 #undef	CHAR
51 
52 #ifdef	__LINE_MATCH
53 #ifdef	__WIDE_CHAR
54 #define	patmatch	patwlmatch
55 #else
56 #define	opatmatch	opatlmatch
57 #define	patmatch	patlmatch
58 #endif
59 #endif
60 
61 #ifdef	__WIDE_CHAR
62 #ifndef	__LINE_MATCH
63 #define	patcompile	patwcompile
64 #define	patmatch	patwmatch
65 #endif
66 #define	CHAR		wchar_t
67 #define	PCHAR		wchar_t
68 #endif
69 
70 #ifdef	__MB_CHAR
71 #undef	patmatch
72 #ifdef	__LINE_MATCH
73 #define	patmatch	patmblmatch
74 #else
75 #define	patmatch	patmbmatch
76 #endif
77 #define	PCHAR		wchar_t
78 #endif
79 
80 #ifndef	CHAR
81 typedef	unsigned char	Uchar;
82 #define	DID_UCHAR_TYPE
83 #define	CHAR		Uchar
84 #endif
85 
86 #ifndef	PCHAR
87 #ifndef	DID_UCHAR_TYPE
88 typedef	unsigned char	Uchar;
89 #endif
90 #define	PCHAR		Uchar
91 #endif
92 
93 #define	ENDSTATE	(-1)
94 
95 #define	CL_SIZE		32	/* Max size for '[: :]'			*/
96 
97 /*
98  *	The Interpreter
99  */
100 
101 
102 /*
103  *	put adds a new state to the active list
104  */
105 #define	 put(ret, state, sp, n)	{		\
106 	register int *lstate	= state;	\
107 	register int *lsp	= sp;		\
108 	register int ln		= n;		\
109 						\
110 	while (lstate < lsp) {			\
111 		if (*lstate++ == ln) {		\
112 			ret = lsp;		\
113 			lsp = 0;		\
114 			break;			\
115 		}				\
116 	}					\
117 	if (lsp) {				\
118 		*lstate++ = ln;			\
119 		ret = lstate;			\
120 	}					\
121 }
122 
123 
124 /*
125  *	match a character in class
126  *
127  *	Syntax errors do not appear here, they are handled by the compiler,
128  *	so in theory we could remove the "return (0)" statements from the
129  *	the POSIX class code.
130  */
131 #ifdef	POSIX_CLASS
132 #define	CHK_POSIX_CLASS						\
133 	else if (*lpat == LCLASS) {				\
134 		if (lpat[1] == ':') {				\
135 			char	class[CL_SIZE+1];		\
136 			char	*pc = class;			\
137 								\
138 			lpat += 2;	/* Eat ':' */		\
139 			for (;;) {				\
140 				if (*lpat == '\0') {		\
141 					ok = FALSE;		\
142 					goto out;		\
143 				}				\
144 				if (*lpat == ':' && lpat[1] == RCLASS) \
145 					break;			\
146 				if (pc >= &class[CL_SIZE]) {	\
147 					ok = FALSE;		\
148 					goto out;		\
149 				}				\
150 				*pc++ = *lpat++;		\
151 			}					\
152 			if (pc == class) {			\
153 				ok = FALSE;			\
154 				goto out;			\
155 			}					\
156 			*pc = '\0';				\
157 			lpat += 2;	/* Skip ":]" */		\
158 			if (iswctype(lc, wctype(class))) {	\
159 				ok = !ok;			\
160 				goto out;			\
161 			}					\
162 			continue;				\
163 		}						\
164 	}
165 #else
166 #define	CHK_POSIX_CLASS
167 #endif
168 #define	in_class(found, pat, c)	{			\
169 	register const PCHAR	*lpat	= pat;		\
170 	register int		lc	= c;		\
171 	int	lo_bound;				\
172 	int	hi_bound;				\
173 	BOOL	ok			= FALSE;	\
174 							\
175 	if (*lpat == NOT) {				\
176 		lpat++;					\
177 		ok = TRUE;				\
178 	}						\
179 	while (*lpat != RCLASS) {			\
180 		if (*lpat == QUOTE)			\
181 			lpat++;				\
182 		CHK_POSIX_CLASS				\
183 		lo_bound = *lpat++;			\
184 		if (*lpat == RANGE) {			\
185 			lpat++;				\
186 			if (*lpat == QUOTE)		\
187 				lpat++;			\
188 			hi_bound = *lpat++;		\
189 		} else {				\
190 			hi_bound = lo_bound;		\
191 		}					\
192 		if (lo_bound <= lc && lc <= hi_bound) {	\
193 			ok = !ok;			\
194 			goto out;			\
195 		}					\
196 	}						\
197 out:							\
198 	found = ok;					\
199 }
200 
201 /*
202  *	opatmatch - the old external interpreter interface.
203  *
204  *	Trys to match a string beginning at offset
205  *	against the compiled pattern.
206  */
207 #if !defined(__WIDE_CHAR) && !defined(__MB_CHAR)
208 EXPORT CHAR
209 *opatmatch(pat, aux, str, soff, slen, alt)
210 	const PCHAR	*pat;
211 	const int	*aux;
212 	const CHAR	*str;
213 	int		soff;
214 	int		slen;
215 	int		alt;
216 {
217 	int		state[MAXPAT];
218 
219 	return (patmatch(pat, aux, str, soff, slen, alt, state));
220 }
221 #endif
222 
223 /*
224  *	patmatch - the external interpreter interface.
225  *
226  *	Trys to match a string beginning at offset
227  *	against the compiled pattern.
228  */
229 EXPORT CHAR *
230 patmatch(pat, aux, str, soff, slen, alt, state)
231 	const PCHAR	*pat;
232 	const int	*aux;
233 	const CHAR	*str;
234 	int		soff;
235 	int		slen;
236 	int		alt;
237 	int		state[];
238 {
239 	register int	*sp;
240 	register int	*n;
241 	register int	*i;
242 	register int	p;
243 	register int	q, s, k;
244 #ifdef	__MB_CHAR
245 	wchar_t		c;
246 	int		mlen = 1;
247 #else
248 	int		c;
249 #endif
250 	const CHAR	*lastp = (CHAR *)NULL;
251 
252 #ifdef	__LINE_MATCH
253 for (; soff <= slen; soff++) {
254 #endif
255 
256 	sp = state;
257 	put(sp, state, state, 0);
258 	if (alt != ENDSTATE)
259 		put(sp, state, sp, alt);
260 
261 #ifdef	__MB_CHAR
262 	mbtowc(NULL, NULL, 0);
263 	for (s = soff; ; s += mlen) {
264 #else
265 	for (s = soff; ; s++) {
266 #endif
267 		/*
268 		 * next char from input string
269 		 */
270 		if (s >= slen) {
271 			c = 0;
272 		} else {
273 #ifdef	__MB_CHAR
274 			mlen = mbtowc(&c, (char *)str, slen - s);
275 			if (mlen < 0) {
276 				mbtowc(NULL, NULL, 0);
277 				c = str[s];
278 				mlen = 1;
279 			}
280 #else
281 			c = str[s];
282 #endif
283 		}
284 		/*
285 		 * first complete the closure
286 		 */
287 		for (n = state; n < sp; ) {
288 			p = *n++;		/* next state number */
289 			if (p == ENDSTATE)
290 				continue;
291 			q = aux[p];		/* get next state for pat */
292 			k = pat[p];		/* get next char from pat */
293 			switch (k) {
294 
295 			case REP:
296 				put(sp, state, sp, p+1);
297 				/* FALLTHRU */
298 			case NIL:		/* NIL matches always */
299 			case STAR:
300 				put(sp, state, sp, q);
301 				break;
302 			case LBRACK:		/* alternations */
303 			case ALT:
304 				put(sp, state, sp, p+1);
305 				if (q != ENDSTATE)
306 					put(sp, state, sp, q);
307 				break;
308 			case START:
309 				if (s == 0)
310 					put(sp, state, sp, q);
311 				break;
312 			case END:
313 				if (c == '\0')
314 					put(sp, state, sp, q);
315 				break;
316 			}
317 		}
318 
319 		for (i = state; i < sp; ) {
320 			if (*i++ == ENDSTATE) {
321 				lastp = &str[s];
322 				break;
323 			}
324 		}
325 		if (c == 0)
326 			return ((CHAR *)lastp);
327 
328 		/*
329 		 * now try to match next character
330 		 */
331 		n = sp;
332 		sp = state;
333 		for (i = sp; i < n; ) {
334 			p = *i++;		/* next active state number */
335 			if (p == ENDSTATE)
336 				continue;
337 			k = pat[p];
338 			switch (k) {
339 
340 			case ALT:
341 			case REP:
342 			case NIL:
343 			case LBRACK:
344 			case START:
345 			case END:
346 				continue;
347 			case LCLASS:
348 				in_class(q, &pat[p+1], c);
349 				if (!q)
350 					continue;
351 				break;
352 			case STAR:
353 				put(sp, state, sp, p);
354 				continue;
355 			case QUOTE:
356 				k = pat[p+1];
357 			default:
358 				if (k != c)
359 					continue;
360 				/* FALLTHRU */
361 			case ANY:
362 				break;
363 			}
364 			put(sp, state, sp, aux[p]);
365 		}
366 		if (sp == state) {		/* if no new states return */
367 #ifdef	__LINE_MATCH
368 
369 			if (lastp || (soff == slen - 1))
370 				return ((CHAR *)lastp);
371 			else
372 				break;
373 #else
374 			return ((CHAR *)lastp);
375 #endif
376 		}
377 	}
378 #ifdef	__LINE_MATCH
379 }
380 return ((CHAR *)lastp);
381 #endif
382 }
383 
384 
385 #if !defined(__LINE_MATCH) && !defined(__MB_CHAR)
386 /*
387  *	The Compiler
388  */
389 
390 typedef	struct args {
391 	const PCHAR	*pattern;
392 	int		*aux;
393 	int		patp;
394 	int		length;
395 	PCHAR		Ch;
396 } arg_t;
397 
398 LOCAL	void	nextitem __PR((arg_t *));
399 LOCAL	int	prim	 __PR((arg_t *));
400 LOCAL	int	expr	 __PR((arg_t *, int *));
401 LOCAL	void	setexits __PR((int *, int, int));
402 LOCAL	int	join	 __PR((int *, int, int));
403 
404 /*
405  *	'read' the next character from pattern
406  */
407 #define	rch(ap)						\
408 {							\
409 	if (++(ap)->patp >= (ap)->length)		\
410 		(ap)->Ch = 0;				\
411 	else						\
412 		(ap)->Ch = (ap)->pattern[(ap)->patp];	\
413 }
414 
415 /*
416  *	'peek' the next character from pattern
417  */
418 #define	pch(ap)						\
419 	((((ap)->patp + 1) >= (ap)->length) ?		\
420 		0					\
421 	:						\
422 		(ap)->pattern[(ap)->patp+1])		\
423 
424 /*
425  *	get the next item from pattern
426  */
427 LOCAL void
428 nextitem(ap)
429 	arg_t	*ap;
430 {
431 	if (ap->Ch == QUOTE)
432 		rch(ap);
433 	rch(ap);
434 }
435 
436 /*
437  *	parse a primary
438  */
439 LOCAL int
440 prim(ap)
441 	arg_t	*ap;
442 {
443 	int	a  = ap->patp;
444 	int	op = ap->Ch;
445 	int	t;
446 
447 	nextitem(ap);
448 	switch (op) {
449 
450 	case '\0':
451 	case ALT:
452 	case RBRACK:
453 		return (ENDSTATE);
454 	case LCLASS:
455 		while (ap->Ch != RCLASS && ap->Ch != '\0') {
456 #ifdef	POSIX_CLASS
457 			if (ap->Ch == LCLASS) {
458 				if (pch(ap) == ':') {	/* [:alpha:] */
459 					char	class[CL_SIZE+1];
460 					char	*pc = class;
461 
462 					nextitem(ap);
463 					nextitem(ap);
464 					while (ap->Ch != ':' &&
465 					    ap->Ch != '\0') {
466 						if (pc > &class[CL_SIZE])
467 							return (ENDSTATE);
468 						*pc = ap->Ch;
469 						if (*pc++ != ap->Ch)
470 							return (ENDSTATE);
471 						nextitem(ap);
472 					}
473 					if (pc == class)
474 						return (ENDSTATE);
475 					*pc = '\0';
476 					if (ap->Ch == '\0')
477 						return (ENDSTATE);
478 					if (wctype(class) == 0)
479 						return (ENDSTATE);
480 					nextitem(ap);
481 				}
482 				if (ap->Ch != RCLASS)
483 					return (ENDSTATE);
484 			}
485 #endif
486 			nextitem(ap);
487 		}
488 		if (ap->Ch == '\0')
489 			return (ENDSTATE);
490 		nextitem(ap);
491 		break;
492 	case REP:
493 		t = prim(ap);
494 		if (t == ENDSTATE)
495 			return (ENDSTATE);
496 		setexits(ap->aux, t, a);
497 		break;
498 	case LBRACK:
499 		a = expr(ap, &ap->aux[a]);
500 		if (a == ENDSTATE || ap->Ch != RBRACK)
501 			return (ENDSTATE);
502 		nextitem(ap);
503 		break;
504 	}
505 	return (a);
506 }
507 
508 /*
509  *	parse an expression (a sequence of primaries)
510  */
511 LOCAL int
512 expr(ap, altp)
513 	arg_t	*ap;
514 	int	*altp;
515 {
516 	int	exits = ENDSTATE;
517 	int	a;
518 	int	*aux = ap->aux;
519 	PCHAR	Ch;
520 
521 	for (;;) {
522 		a = prim(ap);
523 		if (a == ENDSTATE)
524 			return (ENDSTATE);
525 		Ch = ap->Ch;
526 		if (Ch == ALT || Ch == RBRACK || Ch == '\0') {
527 			exits = join(aux, exits, a);
528 			if (Ch != ALT)
529 				return (exits);
530 			*altp = ap->patp;
531 			altp = &aux[ap->patp];
532 			nextitem(ap);
533 		} else
534 			setexits(aux, a, ap->patp);
535 	}
536 }
537 
538 /*
539  *	set all exits in a list to a specified value
540  */
541 LOCAL void
542 setexits(aux, list, val)
543 	int	*aux;
544 	int	list;
545 	int	val;
546 {
547 	int	a;
548 
549 	while (list != ENDSTATE) {
550 		a = aux[list];
551 		aux[list] = val;
552 		list = a;
553 	}
554 }
555 
556 /*
557  *	concatenate two lists
558  */
559 LOCAL int
560 join(aux, a, b)
561 	int	*aux;
562 	int	a;
563 	int	b;
564 {
565 	int	t;
566 
567 	if (a == ENDSTATE)
568 		return (b);
569 	t = a;
570 	while (aux[t] != ENDSTATE)
571 		t = aux[t];
572 	aux[t] = b;
573 	return (a);
574 }
575 
576 /*
577  *	patcompile - the external compiler interface.
578  *
579  *	The pattern is compiled into the aux array.
580  *	Return value on success, is the outermost alternate which is != 0.
581  *	Error is indicated by return of 0.
582  */
583 EXPORT int
584 patcompile(pat, len, aux)
585 	const PCHAR	*pat;
586 	int		len;
587 	int		*aux;
588 {
589 	arg_t	a;
590 	int	alt = ENDSTATE;
591 	int	i;
592 
593 	a.pattern = pat;
594 	a.length  = len;
595 	a.aux	  = aux;
596 	a.patp    = -1;
597 
598 	for (i = 0; i < len; i++)
599 		aux[i] = ENDSTATE;
600 	rch(&a);
601 	i = expr(&a, &alt);
602 	if (i == ENDSTATE)
603 		return (0);
604 	setexits(aux, i, ENDSTATE);
605 	return (alt);
606 }
607 #endif /* !defined(__LINE_MATCH) && !defined(__MB_CHAR) */
608