1 /* @(#)match.c 1.36 19/09/21 Copyright 1985, 1995-2019 J. Schilling */
2 #include <schily/utypes.h> /* For Uchar */
3 #include <schily/standard.h>
4 #include <schily/patmatch.h>
5 #define POSIX_CLASS /* Support [[:alpha:]] by default */
6 #ifdef NO_POSIX_CLASS /* Allow to disable [[:alpha:]] */
7 #undef POSIX_CLASS
8 #endif
9 #ifdef POSIX_CLASS
10 #include <schily/wchar.h> /* With [[:alpha:]], we need wctype() */
11 #include <schily/wctype.h> /* and thus wchar.h and wctype.h */
12 #endif
13 /*
14 * Pattern matching functions
15 *
16 * Copyright (c) 1985, 1995-2019 J. Schilling
17 */
18 /*
19 * The contents of this file are subject to the terms of the
20 * Common Development and Distribution License, Version 1.0 only
21 * (the "License"). You may not use this file except in compliance
22 * with the License.
23 *
24 * See the file CDDL.Schily.txt in this distribution for details.
25 * A copy of the CDDL is also available via the Internet at
26 * http://www.opensource.org/licenses/cddl1.txt
27 *
28 * When distributing Covered Code, include this CDDL HEADER in each
29 * file and include the License file CDDL.Schily.txt from this distribution.
30 */
31 /*
32 * The pattern matching functions below are based on the algorithm
33 * presented by Martin Richards in:
34 *
35 * "A Compact Function for Regular Expression Pattern Matching",
36 * Software-Practice and Experience, Vol. 9, 527-534 (1979)
37 *
38 * Several changes have been made to the original source which has been
39 * written in BCPL:
40 *
41 * '/' is replaced by '!' (to allow UNIX filenames)
42 * '(',')' are replaced by '{', '}'
43 * '\'' is replaced by '\\' (UNIX compatible quote)
44 *
45 * Character classes have been added to allow "[<character list>]"
46 * to be used.
47 * POSIX features like [[:alpha:]] have been added.
48 * Start of line '^' and end of line '$' have been added.
49 */
50
51 #undef CHAR
52
53 #ifdef __LINE_MATCH
54 #ifdef __WIDE_CHAR
55 #define patmatch patwlmatch
56 #else
57 #define opatmatch opatlmatch
58 #define patmatch patlmatch
59 #endif
60 #endif
61
62 #ifdef __WIDE_CHAR
63 #ifndef __LINE_MATCH
64 #define patcompile patwcompile
65 #define patmatch patwmatch
66 #endif
67 #define CHAR wchar_t
68 #define PCHAR wchar_t
69 #endif
70
71 #ifdef __MB_CHAR
72 #undef patmatch
73 #ifdef __LINE_MATCH
74 #define patmatch patmblmatch
75 #else
76 #define patmatch patmbmatch
77 #endif
78 #define PCHAR wchar_t
79 #endif
80
81 #ifndef CHAR
82 #define CHAR Uchar
83 #endif
84
85 #ifndef PCHAR
86 #define PCHAR Uchar
87 #endif
88
89 #define ENDSTATE (-1)
90
91 #define CL_SIZE 32 /* Max size for '[: :]' */
92
93 /*
94 * The Interpreter
95 */
96
97
98 /*
99 * put adds a new state to the active list
100 */
101 #define put(ret, state, sp, n) { \
102 register int *lstate = state; \
103 register int *lsp = sp; \
104 register int ln = n; \
105 \
106 while (lstate < lsp) { \
107 if (*lstate++ == ln) { \
108 ret = lsp; \
109 lsp = 0; \
110 break; \
111 } \
112 } \
113 if (lsp) { \
114 *lstate++ = ln; \
115 ret = lstate; \
116 } \
117 }
118
119
120 /*
121 * match a character in class
122 *
123 * Syntax errors do not appear here, they are handled by the compiler,
124 * so in theory we could remove the "return (0)" statements from the
125 * the POSIX class code.
126 */
127 #ifdef POSIX_CLASS
128 #define CHK_POSIX_CLASS \
129 if (*lpat == LCLASS && lpat[1] == ':') { \
130 char class[CL_SIZE+1]; \
131 char *pc = class; \
132 \
133 lpat += 2; /* Eat ':' */ \
134 for (;;) { \
135 if (*lpat == '\0') { \
136 ok = FALSE; \
137 goto out; \
138 } \
139 if (*lpat == ':' && lpat[1] == RCLASS) \
140 break; \
141 if (pc >= &class[CL_SIZE]) { \
142 ok = FALSE; \
143 goto out; \
144 } \
145 *pc++ = *lpat++; \
146 } \
147 if (pc == class) { \
148 ok = FALSE; \
149 goto out; \
150 } \
151 *pc = '\0'; \
152 lpat += 2; /* Skip ":]" */ \
153 if (iswctype(lc, wctype(class))) { \
154 ok = !ok; \
155 goto out; \
156 } \
157 continue; \
158 } else
159 #else
160 #define CHK_POSIX_CLASS
161 #endif
162 #define in_class(found, pat, c) { \
163 register const PCHAR *lpat = pat; \
164 register int lc = c; \
165 int lo_bound; \
166 int hi_bound; \
167 BOOL ok = FALSE; \
168 \
169 if (*lpat == NOT) { \
170 lpat++; \
171 ok = TRUE; \
172 } \
173 do { \
174 if (*lpat == '\0') { \
175 ok = FALSE; \
176 goto out; \
177 } \
178 CHK_POSIX_CLASS \
179 if (*lpat == QUOTE) \
180 lpat++; \
181 lo_bound = *lpat++; \
182 if (*lpat == RANGE && lpat[1] != RCLASS) { \
183 lpat++; \
184 if (*lpat == QUOTE) \
185 lpat++; \
186 hi_bound = *lpat++; \
187 } else { \
188 hi_bound = lo_bound; \
189 } \
190 if (lo_bound <= lc && lc <= hi_bound) { \
191 ok = !ok; \
192 goto out; \
193 } \
194 } while (*lpat != RCLASS); \
195 out: \
196 found = ok; \
197 }
198
199 /*
200 * opatmatch - the old external interpreter interface.
201 *
202 * Trys to match a string beginning at offset
203 * against the compiled pattern.
204 */
205 #if !defined(__WIDE_CHAR) && !defined(__MB_CHAR)
206 EXPORT CHAR
opatmatch(pat,aux,str,soff,slen,alt)207 *opatmatch(pat, aux, str, soff, slen, alt)
208 const PCHAR *pat;
209 const int *aux;
210 const CHAR *str;
211 int soff;
212 int slen;
213 int alt;
214 {
215 int state[MAXPAT];
216
217 return (patmatch(pat, aux, str, soff, slen, alt, state));
218 }
219 #endif
220
221 /*
222 * patmatch - the external interpreter interface.
223 *
224 * Trys to match a string beginning at offset
225 * against the compiled pattern.
226 */
227 EXPORT CHAR *
patmatch(pat,aux,str,soff,slen,alt,state)228 patmatch(pat, aux, str, soff, slen, alt, state)
229 const PCHAR *pat;
230 const int *aux;
231 const CHAR *str;
232 int soff;
233 int slen;
234 int alt;
235 int state[];
236 {
237 register int *sp;
238 register int *n;
239 register int *i;
240 register int p;
241 register int q, s, k;
242 #ifdef __MB_CHAR
243 wchar_t c;
244 int mlen = 1;
245 #else
246 int c;
247 #endif
248 const CHAR *lastp = (CHAR *)NULL;
249
250 #ifdef __LINE_MATCH
251 for (; soff <= slen; soff++) {
252 #endif
253
254 sp = state;
255 put(sp, state, state, 0);
256 if (alt != ENDSTATE)
257 put(sp, state, sp, alt);
258
259 #ifdef __MB_CHAR
260 mbtowc(NULL, NULL, 0);
261 for (s = soff; ; s += mlen) {
262 #else
263 for (s = soff; ; s++) {
264 #endif
265 /*
266 * next char from input string
267 */
268 if (s >= slen) {
269 c = 0;
270 } else {
271 #ifdef __MB_CHAR
272 mlen = mbtowc(&c, (char *)&str[s], slen - s);
273 if (mlen < 0) {
274 mbtowc(NULL, NULL, 0);
275 c = str[s];
276 mlen = 1;
277 }
278 #else
279 c = str[s];
280 #endif
281 }
282 /*
283 * first complete the closure
284 */
285 for (n = state; n < sp; ) {
286 p = *n++; /* next state number */
287 if (p == ENDSTATE)
288 continue;
289 q = aux[p]; /* get next state for pat */
290 k = pat[p]; /* get next char from pat */
291 switch (k) {
292
293 case REP:
294 put(sp, state, sp, p+1);
295 /* FALLTHRU */
296 case NIL: /* NIL matches always */
297 case STAR:
298 put(sp, state, sp, q);
299 break;
300 case LBRACK: /* alternations */
301 case ALT:
302 put(sp, state, sp, p+1);
303 if (q != ENDSTATE)
304 put(sp, state, sp, q);
305 break;
306 case START:
307 if (s == 0)
308 put(sp, state, sp, q);
309 break;
310 case END:
311 if (c == '\0')
312 put(sp, state, sp, q);
313 break;
314 }
315 }
316
317 for (i = state; i < sp; ) {
318 if (*i++ == ENDSTATE) {
319 lastp = &str[s];
320 break;
321 }
322 }
323 if (c == 0)
324 return ((CHAR *)lastp);
325
326 /*
327 * now try to match next character
328 */
329 n = sp;
330 sp = state;
331 for (i = sp; i < n; ) {
332 p = *i++; /* next active state number */
333 if (p == ENDSTATE)
334 continue;
335 k = pat[p];
336 switch (k) {
337
338 case ALT:
339 case REP:
340 case NIL:
341 case LBRACK:
342 case START:
343 case END:
344 continue;
345 case LCLASS:
346 in_class(q, &pat[p+1], c);
347 if (!q)
348 continue;
349 break;
350 case STAR:
351 put(sp, state, sp, p);
352 continue;
353 case QUOTE:
354 k = pat[p+1];
355 /* FALLTHRU */
356 default:
357 if (k != c)
358 continue;
359 /* FALLTHRU */
360 case ANY:
361 break;
362 }
363 put(sp, state, sp, aux[p]);
364 }
365 if (sp == state) { /* if no new states return */
366 #ifdef __LINE_MATCH
367
368 if (lastp || (soff == slen - 1))
369 return ((CHAR *)lastp);
370 else
371 break;
372 #else
373 return ((CHAR *)lastp);
374 #endif
375 }
376 }
377 #ifdef __LINE_MATCH
378 }
379 return ((CHAR *)lastp);
380 #endif
381 }
382
383
384 #if !defined(__LINE_MATCH) && !defined(__MB_CHAR)
385 /*
386 * The Compiler
387 */
388
389 typedef struct args {
390 const PCHAR *pattern;
391 int *aux;
392 int patp;
393 int length;
394 PCHAR Ch;
395 } arg_t;
396
397 LOCAL void nextitem __PR((arg_t *));
398 LOCAL int prim __PR((arg_t *));
399 LOCAL int expr __PR((arg_t *, int *));
400 LOCAL void setexits __PR((int *, int, int));
401 LOCAL int join __PR((int *, int, int));
402
403 /*
404 * 'read' the next character from pattern
405 */
406 #define rch(ap) \
407 { \
408 if (++(ap)->patp >= (ap)->length) \
409 (ap)->Ch = 0; \
410 else \
411 (ap)->Ch = (ap)->pattern[(ap)->patp]; \
412 }
413
414 /*
415 * 'peek' the next character from pattern
416 */
417 #define pch(ap) \
418 ((((ap)->patp + 1) >= (ap)->length) ? \
419 0 \
420 : \
421 (ap)->pattern[(ap)->patp+1]) \
422
423 /*
424 * get the next item from pattern
425 */
426 LOCAL void
nextitem(ap)427 nextitem(ap)
428 arg_t *ap;
429 {
430 if (ap->Ch == QUOTE)
431 rch(ap);
432 rch(ap);
433 }
434
435 /*
436 * parse a primary
437 */
438 LOCAL int
prim(ap)439 prim(ap)
440 arg_t *ap;
441 {
442 int a = ap->patp;
443 int op = ap->Ch;
444 int t;
445
446 nextitem(ap); /* Eat '[' */
447 switch (op) {
448
449 case '\0':
450 case ALT:
451 case RBRACK:
452 return (ENDSTATE);
453 case LCLASS:
454 if (ap->Ch == NOT)
455 nextitem(ap); /* Eat '^' at first position */
456
457 t = TRUE; /* Allow ] as first character */
458 while ((t || ap->Ch != RCLASS) && ap->Ch != '\0') {
459 t = FALSE;
460 #ifdef POSIX_CLASS
461 if (ap->Ch == LCLASS) {
462 if (pch(ap) == ':') { /* [:alpha:] */
463 char class[CL_SIZE+1];
464 char *pc = class;
465
466 nextitem(ap); /* eat '[' */
467 nextitem(ap); /* eat ':' */
468 while (ap->Ch != ':' &&
469 ap->Ch != '\0') {
470 if (pc >= &class[CL_SIZE])
471 return (ENDSTATE);
472 *pc = ap->Ch;
473 if (*pc++ != ap->Ch)
474 return (ENDSTATE);
475 nextitem(ap);
476 }
477 if (pc == class)
478 return (ENDSTATE);
479 *pc = '\0';
480 if (ap->Ch == '\0')
481 return (ENDSTATE);
482 if (wctype(class) == 0)
483 return (ENDSTATE);
484 nextitem(ap);
485 if (ap->Ch != RCLASS)
486 return (ENDSTATE);
487 }
488 }
489 #endif
490 /*
491 * A '-' before the ending ']' does not have the
492 * special range meaning.
493 */
494 if (ap->Ch == RANGE &&
495 pch(ap) != RCLASS) /* One more char required */
496 nextitem(ap); /* so get char past '-' */
497 nextitem(ap);
498 }
499 if (ap->Ch == '\0')
500 return (ENDSTATE);
501 nextitem(ap);
502 break;
503 case REP:
504 t = prim(ap);
505 if (t == ENDSTATE)
506 return (ENDSTATE);
507 setexits(ap->aux, t, a);
508 break;
509 case LBRACK:
510 a = expr(ap, &ap->aux[a]);
511 if (a == ENDSTATE || ap->Ch != RBRACK)
512 return (ENDSTATE);
513 nextitem(ap);
514 break;
515 }
516 return (a);
517 }
518
519 /*
520 * parse an expression (a sequence of primaries)
521 */
522 LOCAL int
expr(ap,altp)523 expr(ap, altp)
524 arg_t *ap;
525 int *altp;
526 {
527 int exits = ENDSTATE;
528 int a;
529 int *aux = ap->aux;
530 PCHAR Ch;
531
532 for (;;) {
533 a = prim(ap);
534 if (a == ENDSTATE)
535 return (ENDSTATE);
536 Ch = ap->Ch;
537 if (Ch == ALT || Ch == RBRACK || Ch == '\0') {
538 exits = join(aux, exits, a);
539 if (Ch != ALT)
540 return (exits);
541 *altp = ap->patp;
542 altp = &aux[ap->patp];
543 nextitem(ap);
544 } else
545 setexits(aux, a, ap->patp);
546 }
547 }
548
549 /*
550 * set all exits in a list to a specified value
551 */
552 LOCAL void
setexits(aux,list,val)553 setexits(aux, list, val)
554 int *aux;
555 int list;
556 int val;
557 {
558 int a;
559
560 while (list != ENDSTATE) {
561 a = aux[list];
562 aux[list] = val;
563 list = a;
564 }
565 }
566
567 /*
568 * concatenate two lists
569 */
570 LOCAL int
join(aux,a,b)571 join(aux, a, b)
572 int *aux;
573 int a;
574 int b;
575 {
576 int t;
577
578 if (a == ENDSTATE)
579 return (b);
580 t = a;
581 while (aux[t] != ENDSTATE)
582 t = aux[t];
583 aux[t] = b;
584 return (a);
585 }
586
587 /*
588 * patcompile - the external compiler interface.
589 *
590 * The pattern is compiled into the aux array.
591 * Return value on success, is the outermost alternate which is != 0.
592 * Error is indicated by return of 0.
593 */
594 EXPORT int
patcompile(pat,len,aux)595 patcompile(pat, len, aux)
596 const PCHAR *pat;
597 int len;
598 int *aux;
599 {
600 arg_t a;
601 int alt = ENDSTATE;
602 int i;
603
604 a.pattern = pat;
605 a.length = len;
606 a.aux = aux;
607 a.patp = -1;
608
609 for (i = 0; i < len; i++)
610 aux[i] = ENDSTATE;
611 rch(&a);
612 i = expr(&a, &alt);
613 if (i == ENDSTATE)
614 return (0);
615 setexits(aux, i, ENDSTATE);
616 return (alt);
617 }
618 #endif /* !defined(__LINE_MATCH) && !defined(__MB_CHAR) */
619