1 /* @(#)match.c 1.27 17/08/13 Copyright 1985, 1995-2017 J. Schilling */
2 #include <schily/standard.h>
3 #include <schily/patmatch.h>
4 #define POSIX_CLASS /* Support [[:alpha:]] by default */
5 #ifdef NO_POSIX_CLASS /* Allow to disable [[:alpha:]] */
6 #undef POSIX_CLASS
7 #endif
8 #ifdef POSIX_CLASS
9 #include <schily/wchar.h> /* With [[:alpha:]], we need wctype() */
10 #include <schily/wctype.h> /* and thus wchar.h and wctype.h */
11 #endif
12 /*
13 * Pattern matching functions
14 *
15 * Copyright (c) 1985, 1995-2017 J. Schilling
16 */
17 /*
18 * The contents of this file are subject to the terms of the
19 * Common Development and Distribution License, Version 1.0 only
20 * (the "License"). You may not use this file except in compliance
21 * with the License.
22 *
23 * See the file CDDL.Schily.txt in this distribution for details.
24 * A copy of the CDDL is also available via the Internet at
25 * http://www.opensource.org/licenses/cddl1.txt
26 *
27 * When distributing Covered Code, include this CDDL HEADER in each
28 * file and include the License file CDDL.Schily.txt from this distribution.
29 */
30 /*
31 * The pattern matching functions below are based on the algorithm
32 * presented by Martin Richards in:
33 *
34 * "A Compact Function for Regular Expression Pattern Matching",
35 * Software-Practice and Experience, Vol. 9, 527-534 (1979)
36 *
37 * Several changes have been made to the original source which has been
38 * written in BCPL:
39 *
40 * '/' is replaced by '!' (to allow UNIX filenames)
41 * '(',')' are replaced by '{', '}'
42 * '\'' is replaced by '\\' (UNIX compatible quote)
43 *
44 * Character classes have been added to allow "[<character list>]"
45 * to be used.
46 * POSIX features like [[:alpha:]] have been added.
47 * Start of line '^' and end of line '$' have been added.
48 */
49
50 #undef CHAR
51
52 #ifdef __LINE_MATCH
53 #ifdef __WIDE_CHAR
54 #define patmatch patwlmatch
55 #else
56 #define opatmatch opatlmatch
57 #define patmatch patlmatch
58 #endif
59 #endif
60
61 #ifdef __WIDE_CHAR
62 #ifndef __LINE_MATCH
63 #define patcompile patwcompile
64 #define patmatch patwmatch
65 #endif
66 #define CHAR wchar_t
67 #define PCHAR wchar_t
68 #endif
69
70 #ifdef __MB_CHAR
71 #undef patmatch
72 #ifdef __LINE_MATCH
73 #define patmatch patmblmatch
74 #else
75 #define patmatch patmbmatch
76 #endif
77 #define PCHAR wchar_t
78 #endif
79
80 #ifndef CHAR
81 typedef unsigned char Uchar;
82 #define DID_UCHAR_TYPE
83 #define CHAR Uchar
84 #endif
85
86 #ifndef PCHAR
87 #ifndef DID_UCHAR_TYPE
88 typedef unsigned char Uchar;
89 #endif
90 #define PCHAR Uchar
91 #endif
92
93 #define ENDSTATE (-1)
94
95 #define CL_SIZE 32 /* Max size for '[: :]' */
96
97 /*
98 * The Interpreter
99 */
100
101
102 /*
103 * put adds a new state to the active list
104 */
105 #define put(ret, state, sp, n) { \
106 register int *lstate = state; \
107 register int *lsp = sp; \
108 register int ln = n; \
109 \
110 while (lstate < lsp) { \
111 if (*lstate++ == ln) { \
112 ret = lsp; \
113 lsp = 0; \
114 break; \
115 } \
116 } \
117 if (lsp) { \
118 *lstate++ = ln; \
119 ret = lstate; \
120 } \
121 }
122
123
124 /*
125 * match a character in class
126 *
127 * Syntax errors do not appear here, they are handled by the compiler,
128 * so in theory we could remove the "return (0)" statements from the
129 * the POSIX class code.
130 */
131 #ifdef POSIX_CLASS
132 #define CHK_POSIX_CLASS \
133 else if (*lpat == LCLASS) { \
134 if (lpat[1] == ':') { \
135 char class[CL_SIZE+1]; \
136 char *pc = class; \
137 \
138 lpat += 2; /* Eat ':' */ \
139 for (;;) { \
140 if (*lpat == '\0') { \
141 ok = FALSE; \
142 goto out; \
143 } \
144 if (*lpat == ':' && lpat[1] == RCLASS) \
145 break; \
146 if (pc >= &class[CL_SIZE]) { \
147 ok = FALSE; \
148 goto out; \
149 } \
150 *pc++ = *lpat++; \
151 } \
152 if (pc == class) { \
153 ok = FALSE; \
154 goto out; \
155 } \
156 *pc = '\0'; \
157 lpat += 2; /* Skip ":]" */ \
158 if (iswctype(lc, wctype(class))) { \
159 ok = !ok; \
160 goto out; \
161 } \
162 continue; \
163 } \
164 }
165 #else
166 #define CHK_POSIX_CLASS
167 #endif
168 #define in_class(found, pat, c) { \
169 register const PCHAR *lpat = pat; \
170 register int lc = c; \
171 int lo_bound; \
172 int hi_bound; \
173 BOOL ok = FALSE; \
174 \
175 if (*lpat == NOT) { \
176 lpat++; \
177 ok = TRUE; \
178 } \
179 while (*lpat != RCLASS) { \
180 if (*lpat == QUOTE) \
181 lpat++; \
182 CHK_POSIX_CLASS \
183 lo_bound = *lpat++; \
184 if (*lpat == RANGE) { \
185 lpat++; \
186 if (*lpat == QUOTE) \
187 lpat++; \
188 hi_bound = *lpat++; \
189 } else { \
190 hi_bound = lo_bound; \
191 } \
192 if (lo_bound <= lc && lc <= hi_bound) { \
193 ok = !ok; \
194 goto out; \
195 } \
196 } \
197 out: \
198 found = ok; \
199 }
200
201 /*
202 * opatmatch - the old external interpreter interface.
203 *
204 * Trys to match a string beginning at offset
205 * against the compiled pattern.
206 */
207 #if !defined(__WIDE_CHAR) && !defined(__MB_CHAR)
208 EXPORT CHAR
opatmatch(pat,aux,str,soff,slen,alt)209 *opatmatch(pat, aux, str, soff, slen, alt)
210 const PCHAR *pat;
211 const int *aux;
212 const CHAR *str;
213 int soff;
214 int slen;
215 int alt;
216 {
217 int state[MAXPAT];
218
219 return (patmatch(pat, aux, str, soff, slen, alt, state));
220 }
221 #endif
222
223 /*
224 * patmatch - the external interpreter interface.
225 *
226 * Trys to match a string beginning at offset
227 * against the compiled pattern.
228 */
229 EXPORT CHAR *
patmatch(pat,aux,str,soff,slen,alt,state)230 patmatch(pat, aux, str, soff, slen, alt, state)
231 const PCHAR *pat;
232 const int *aux;
233 const CHAR *str;
234 int soff;
235 int slen;
236 int alt;
237 int state[];
238 {
239 register int *sp;
240 register int *n;
241 register int *i;
242 register int p;
243 register int q, s, k;
244 #ifdef __MB_CHAR
245 wchar_t c;
246 int mlen = 1;
247 #else
248 int c;
249 #endif
250 const CHAR *lastp = (CHAR *)NULL;
251
252 #ifdef __LINE_MATCH
253 for (; soff <= slen; soff++) {
254 #endif
255
256 sp = state;
257 put(sp, state, state, 0);
258 if (alt != ENDSTATE)
259 put(sp, state, sp, alt);
260
261 #ifdef __MB_CHAR
262 mbtowc(NULL, NULL, 0);
263 for (s = soff; ; s += mlen) {
264 #else
265 for (s = soff; ; s++) {
266 #endif
267 /*
268 * next char from input string
269 */
270 if (s >= slen) {
271 c = 0;
272 } else {
273 #ifdef __MB_CHAR
274 mlen = mbtowc(&c, (char *)str, slen - s);
275 if (mlen < 0) {
276 mbtowc(NULL, NULL, 0);
277 c = str[s];
278 mlen = 1;
279 }
280 #else
281 c = str[s];
282 #endif
283 }
284 /*
285 * first complete the closure
286 */
287 for (n = state; n < sp; ) {
288 p = *n++; /* next state number */
289 if (p == ENDSTATE)
290 continue;
291 q = aux[p]; /* get next state for pat */
292 k = pat[p]; /* get next char from pat */
293 switch (k) {
294
295 case REP:
296 put(sp, state, sp, p+1);
297 /* FALLTHRU */
298 case NIL: /* NIL matches always */
299 case STAR:
300 put(sp, state, sp, q);
301 break;
302 case LBRACK: /* alternations */
303 case ALT:
304 put(sp, state, sp, p+1);
305 if (q != ENDSTATE)
306 put(sp, state, sp, q);
307 break;
308 case START:
309 if (s == 0)
310 put(sp, state, sp, q);
311 break;
312 case END:
313 if (c == '\0')
314 put(sp, state, sp, q);
315 break;
316 }
317 }
318
319 for (i = state; i < sp; ) {
320 if (*i++ == ENDSTATE) {
321 lastp = &str[s];
322 break;
323 }
324 }
325 if (c == 0)
326 return ((CHAR *)lastp);
327
328 /*
329 * now try to match next character
330 */
331 n = sp;
332 sp = state;
333 for (i = sp; i < n; ) {
334 p = *i++; /* next active state number */
335 if (p == ENDSTATE)
336 continue;
337 k = pat[p];
338 switch (k) {
339
340 case ALT:
341 case REP:
342 case NIL:
343 case LBRACK:
344 case START:
345 case END:
346 continue;
347 case LCLASS:
348 in_class(q, &pat[p+1], c);
349 if (!q)
350 continue;
351 break;
352 case STAR:
353 put(sp, state, sp, p);
354 continue;
355 case QUOTE:
356 k = pat[p+1];
357 default:
358 if (k != c)
359 continue;
360 /* FALLTHRU */
361 case ANY:
362 break;
363 }
364 put(sp, state, sp, aux[p]);
365 }
366 if (sp == state) { /* if no new states return */
367 #ifdef __LINE_MATCH
368
369 if (lastp || (soff == slen - 1))
370 return ((CHAR *)lastp);
371 else
372 break;
373 #else
374 return ((CHAR *)lastp);
375 #endif
376 }
377 }
378 #ifdef __LINE_MATCH
379 }
380 return ((CHAR *)lastp);
381 #endif
382 }
383
384
385 #if !defined(__LINE_MATCH) && !defined(__MB_CHAR)
386 /*
387 * The Compiler
388 */
389
390 typedef struct args {
391 const PCHAR *pattern;
392 int *aux;
393 int patp;
394 int length;
395 PCHAR Ch;
396 } arg_t;
397
398 LOCAL void nextitem __PR((arg_t *));
399 LOCAL int prim __PR((arg_t *));
400 LOCAL int expr __PR((arg_t *, int *));
401 LOCAL void setexits __PR((int *, int, int));
402 LOCAL int join __PR((int *, int, int));
403
404 /*
405 * 'read' the next character from pattern
406 */
407 #define rch(ap) \
408 { \
409 if (++(ap)->patp >= (ap)->length) \
410 (ap)->Ch = 0; \
411 else \
412 (ap)->Ch = (ap)->pattern[(ap)->patp]; \
413 }
414
415 /*
416 * 'peek' the next character from pattern
417 */
418 #define pch(ap) \
419 ((((ap)->patp + 1) >= (ap)->length) ? \
420 0 \
421 : \
422 (ap)->pattern[(ap)->patp+1]) \
423
424 /*
425 * get the next item from pattern
426 */
427 LOCAL void
nextitem(ap)428 nextitem(ap)
429 arg_t *ap;
430 {
431 if (ap->Ch == QUOTE)
432 rch(ap);
433 rch(ap);
434 }
435
436 /*
437 * parse a primary
438 */
439 LOCAL int
prim(ap)440 prim(ap)
441 arg_t *ap;
442 {
443 int a = ap->patp;
444 int op = ap->Ch;
445 int t;
446
447 nextitem(ap);
448 switch (op) {
449
450 case '\0':
451 case ALT:
452 case RBRACK:
453 return (ENDSTATE);
454 case LCLASS:
455 while (ap->Ch != RCLASS && ap->Ch != '\0') {
456 #ifdef POSIX_CLASS
457 if (ap->Ch == LCLASS) {
458 if (pch(ap) == ':') { /* [:alpha:] */
459 char class[CL_SIZE+1];
460 char *pc = class;
461
462 nextitem(ap);
463 nextitem(ap);
464 while (ap->Ch != ':' &&
465 ap->Ch != '\0') {
466 if (pc > &class[CL_SIZE])
467 return (ENDSTATE);
468 *pc = ap->Ch;
469 if (*pc++ != ap->Ch)
470 return (ENDSTATE);
471 nextitem(ap);
472 }
473 if (pc == class)
474 return (ENDSTATE);
475 *pc = '\0';
476 if (ap->Ch == '\0')
477 return (ENDSTATE);
478 if (wctype(class) == 0)
479 return (ENDSTATE);
480 nextitem(ap);
481 }
482 if (ap->Ch != RCLASS)
483 return (ENDSTATE);
484 }
485 #endif
486 nextitem(ap);
487 }
488 if (ap->Ch == '\0')
489 return (ENDSTATE);
490 nextitem(ap);
491 break;
492 case REP:
493 t = prim(ap);
494 if (t == ENDSTATE)
495 return (ENDSTATE);
496 setexits(ap->aux, t, a);
497 break;
498 case LBRACK:
499 a = expr(ap, &ap->aux[a]);
500 if (a == ENDSTATE || ap->Ch != RBRACK)
501 return (ENDSTATE);
502 nextitem(ap);
503 break;
504 }
505 return (a);
506 }
507
508 /*
509 * parse an expression (a sequence of primaries)
510 */
511 LOCAL int
expr(ap,altp)512 expr(ap, altp)
513 arg_t *ap;
514 int *altp;
515 {
516 int exits = ENDSTATE;
517 int a;
518 int *aux = ap->aux;
519 PCHAR Ch;
520
521 for (;;) {
522 a = prim(ap);
523 if (a == ENDSTATE)
524 return (ENDSTATE);
525 Ch = ap->Ch;
526 if (Ch == ALT || Ch == RBRACK || Ch == '\0') {
527 exits = join(aux, exits, a);
528 if (Ch != ALT)
529 return (exits);
530 *altp = ap->patp;
531 altp = &aux[ap->patp];
532 nextitem(ap);
533 } else
534 setexits(aux, a, ap->patp);
535 }
536 }
537
538 /*
539 * set all exits in a list to a specified value
540 */
541 LOCAL void
setexits(aux,list,val)542 setexits(aux, list, val)
543 int *aux;
544 int list;
545 int val;
546 {
547 int a;
548
549 while (list != ENDSTATE) {
550 a = aux[list];
551 aux[list] = val;
552 list = a;
553 }
554 }
555
556 /*
557 * concatenate two lists
558 */
559 LOCAL int
join(aux,a,b)560 join(aux, a, b)
561 int *aux;
562 int a;
563 int b;
564 {
565 int t;
566
567 if (a == ENDSTATE)
568 return (b);
569 t = a;
570 while (aux[t] != ENDSTATE)
571 t = aux[t];
572 aux[t] = b;
573 return (a);
574 }
575
576 /*
577 * patcompile - the external compiler interface.
578 *
579 * The pattern is compiled into the aux array.
580 * Return value on success, is the outermost alternate which is != 0.
581 * Error is indicated by return of 0.
582 */
583 EXPORT int
patcompile(pat,len,aux)584 patcompile(pat, len, aux)
585 const PCHAR *pat;
586 int len;
587 int *aux;
588 {
589 arg_t a;
590 int alt = ENDSTATE;
591 int i;
592
593 a.pattern = pat;
594 a.length = len;
595 a.aux = aux;
596 a.patp = -1;
597
598 for (i = 0; i < len; i++)
599 aux[i] = ENDSTATE;
600 rch(&a);
601 i = expr(&a, &alt);
602 if (i == ENDSTATE)
603 return (0);
604 setexits(aux, i, ENDSTATE);
605 return (alt);
606 }
607 #endif /* !defined(__LINE_MATCH) && !defined(__MB_CHAR) */
608