1 /* @(#)match.c 1.27 17/08/13 Copyright 1985, 1995-2017 J. Schilling */ 2 #include <schily/standard.h> 3 #include <schily/patmatch.h> 4 #define POSIX_CLASS /* Support [[:alpha:]] by default */ 5 #ifdef NO_POSIX_CLASS /* Allow to disable [[:alpha:]] */ 6 #undef POSIX_CLASS 7 #endif 8 #ifdef POSIX_CLASS 9 #include <schily/wchar.h> /* With [[:alpha:]], we need wctype() */ 10 #include <schily/wctype.h> /* and thus wchar.h and wctype.h */ 11 #endif 12 /* 13 * Pattern matching functions 14 * 15 * Copyright (c) 1985, 1995-2017 J. Schilling 16 */ 17 /* 18 * The contents of this file are subject to the terms of the 19 * Common Development and Distribution License, Version 1.0 only 20 * (the "License"). You may not use this file except in compliance 21 * with the License. 22 * 23 * See the file CDDL.Schily.txt in this distribution for details. 24 * A copy of the CDDL is also available via the Internet at 25 * http://www.opensource.org/licenses/cddl1.txt 26 * 27 * When distributing Covered Code, include this CDDL HEADER in each 28 * file and include the License file CDDL.Schily.txt from this distribution. 29 */ 30 /* 31 * The pattern matching functions below are based on the algorithm 32 * presented by Martin Richards in: 33 * 34 * "A Compact Function for Regular Expression Pattern Matching", 35 * Software-Practice and Experience, Vol. 9, 527-534 (1979) 36 * 37 * Several changes have been made to the original source which has been 38 * written in BCPL: 39 * 40 * '/' is replaced by '!' (to allow UNIX filenames) 41 * '(',')' are replaced by '{', '}' 42 * '\'' is replaced by '\\' (UNIX compatible quote) 43 * 44 * Character classes have been added to allow "[<character list>]" 45 * to be used. 46 * POSIX features like [[:alpha:]] have been added. 47 * Start of line '^' and end of line '$' have been added. 48 */ 49 50 #undef CHAR 51 52 #ifdef __LINE_MATCH 53 #ifdef __WIDE_CHAR 54 #define patmatch patwlmatch 55 #else 56 #define opatmatch opatlmatch 57 #define patmatch patlmatch 58 #endif 59 #endif 60 61 #ifdef __WIDE_CHAR 62 #ifndef __LINE_MATCH 63 #define patcompile patwcompile 64 #define patmatch patwmatch 65 #endif 66 #define CHAR wchar_t 67 #define PCHAR wchar_t 68 #endif 69 70 #ifdef __MB_CHAR 71 #undef patmatch 72 #ifdef __LINE_MATCH 73 #define patmatch patmblmatch 74 #else 75 #define patmatch patmbmatch 76 #endif 77 #define PCHAR wchar_t 78 #endif 79 80 #ifndef CHAR 81 typedef unsigned char Uchar; 82 #define DID_UCHAR_TYPE 83 #define CHAR Uchar 84 #endif 85 86 #ifndef PCHAR 87 #ifndef DID_UCHAR_TYPE 88 typedef unsigned char Uchar; 89 #endif 90 #define PCHAR Uchar 91 #endif 92 93 #define ENDSTATE (-1) 94 95 #define CL_SIZE 32 /* Max size for '[: :]' */ 96 97 /* 98 * The Interpreter 99 */ 100 101 102 /* 103 * put adds a new state to the active list 104 */ 105 #define put(ret, state, sp, n) { \ 106 register int *lstate = state; \ 107 register int *lsp = sp; \ 108 register int ln = n; \ 109 \ 110 while (lstate < lsp) { \ 111 if (*lstate++ == ln) { \ 112 ret = lsp; \ 113 lsp = 0; \ 114 break; \ 115 } \ 116 } \ 117 if (lsp) { \ 118 *lstate++ = ln; \ 119 ret = lstate; \ 120 } \ 121 } 122 123 124 /* 125 * match a character in class 126 * 127 * Syntax errors do not appear here, they are handled by the compiler, 128 * so in theory we could remove the "return (0)" statements from the 129 * the POSIX class code. 130 */ 131 #ifdef POSIX_CLASS 132 #define CHK_POSIX_CLASS \ 133 else if (*lpat == LCLASS) { \ 134 if (lpat[1] == ':') { \ 135 char class[CL_SIZE+1]; \ 136 char *pc = class; \ 137 \ 138 lpat += 2; /* Eat ':' */ \ 139 for (;;) { \ 140 if (*lpat == '\0') { \ 141 ok = FALSE; \ 142 goto out; \ 143 } \ 144 if (*lpat == ':' && lpat[1] == RCLASS) \ 145 break; \ 146 if (pc >= &class[CL_SIZE]) { \ 147 ok = FALSE; \ 148 goto out; \ 149 } \ 150 *pc++ = *lpat++; \ 151 } \ 152 if (pc == class) { \ 153 ok = FALSE; \ 154 goto out; \ 155 } \ 156 *pc = '\0'; \ 157 lpat += 2; /* Skip ":]" */ \ 158 if (iswctype(lc, wctype(class))) { \ 159 ok = !ok; \ 160 goto out; \ 161 } \ 162 continue; \ 163 } \ 164 } 165 #else 166 #define CHK_POSIX_CLASS 167 #endif 168 #define in_class(found, pat, c) { \ 169 register const PCHAR *lpat = pat; \ 170 register int lc = c; \ 171 int lo_bound; \ 172 int hi_bound; \ 173 BOOL ok = FALSE; \ 174 \ 175 if (*lpat == NOT) { \ 176 lpat++; \ 177 ok = TRUE; \ 178 } \ 179 while (*lpat != RCLASS) { \ 180 if (*lpat == QUOTE) \ 181 lpat++; \ 182 CHK_POSIX_CLASS \ 183 lo_bound = *lpat++; \ 184 if (*lpat == RANGE) { \ 185 lpat++; \ 186 if (*lpat == QUOTE) \ 187 lpat++; \ 188 hi_bound = *lpat++; \ 189 } else { \ 190 hi_bound = lo_bound; \ 191 } \ 192 if (lo_bound <= lc && lc <= hi_bound) { \ 193 ok = !ok; \ 194 goto out; \ 195 } \ 196 } \ 197 out: \ 198 found = ok; \ 199 } 200 201 /* 202 * opatmatch - the old external interpreter interface. 203 * 204 * Trys to match a string beginning at offset 205 * against the compiled pattern. 206 */ 207 #if !defined(__WIDE_CHAR) && !defined(__MB_CHAR) 208 EXPORT CHAR 209 *opatmatch(pat, aux, str, soff, slen, alt) 210 const PCHAR *pat; 211 const int *aux; 212 const CHAR *str; 213 int soff; 214 int slen; 215 int alt; 216 { 217 int state[MAXPAT]; 218 219 return (patmatch(pat, aux, str, soff, slen, alt, state)); 220 } 221 #endif 222 223 /* 224 * patmatch - the external interpreter interface. 225 * 226 * Trys to match a string beginning at offset 227 * against the compiled pattern. 228 */ 229 EXPORT CHAR * 230 patmatch(pat, aux, str, soff, slen, alt, state) 231 const PCHAR *pat; 232 const int *aux; 233 const CHAR *str; 234 int soff; 235 int slen; 236 int alt; 237 int state[]; 238 { 239 register int *sp; 240 register int *n; 241 register int *i; 242 register int p; 243 register int q, s, k; 244 #ifdef __MB_CHAR 245 wchar_t c; 246 int mlen = 1; 247 #else 248 int c; 249 #endif 250 const CHAR *lastp = (CHAR *)NULL; 251 252 #ifdef __LINE_MATCH 253 for (; soff <= slen; soff++) { 254 #endif 255 256 sp = state; 257 put(sp, state, state, 0); 258 if (alt != ENDSTATE) 259 put(sp, state, sp, alt); 260 261 #ifdef __MB_CHAR 262 mbtowc(NULL, NULL, 0); 263 for (s = soff; ; s += mlen) { 264 #else 265 for (s = soff; ; s++) { 266 #endif 267 /* 268 * next char from input string 269 */ 270 if (s >= slen) { 271 c = 0; 272 } else { 273 #ifdef __MB_CHAR 274 mlen = mbtowc(&c, (char *)str, slen - s); 275 if (mlen < 0) { 276 mbtowc(NULL, NULL, 0); 277 c = str[s]; 278 mlen = 1; 279 } 280 #else 281 c = str[s]; 282 #endif 283 } 284 /* 285 * first complete the closure 286 */ 287 for (n = state; n < sp; ) { 288 p = *n++; /* next state number */ 289 if (p == ENDSTATE) 290 continue; 291 q = aux[p]; /* get next state for pat */ 292 k = pat[p]; /* get next char from pat */ 293 switch (k) { 294 295 case REP: 296 put(sp, state, sp, p+1); 297 /* FALLTHRU */ 298 case NIL: /* NIL matches always */ 299 case STAR: 300 put(sp, state, sp, q); 301 break; 302 case LBRACK: /* alternations */ 303 case ALT: 304 put(sp, state, sp, p+1); 305 if (q != ENDSTATE) 306 put(sp, state, sp, q); 307 break; 308 case START: 309 if (s == 0) 310 put(sp, state, sp, q); 311 break; 312 case END: 313 if (c == '\0') 314 put(sp, state, sp, q); 315 break; 316 } 317 } 318 319 for (i = state; i < sp; ) { 320 if (*i++ == ENDSTATE) { 321 lastp = &str[s]; 322 break; 323 } 324 } 325 if (c == 0) 326 return ((CHAR *)lastp); 327 328 /* 329 * now try to match next character 330 */ 331 n = sp; 332 sp = state; 333 for (i = sp; i < n; ) { 334 p = *i++; /* next active state number */ 335 if (p == ENDSTATE) 336 continue; 337 k = pat[p]; 338 switch (k) { 339 340 case ALT: 341 case REP: 342 case NIL: 343 case LBRACK: 344 case START: 345 case END: 346 continue; 347 case LCLASS: 348 in_class(q, &pat[p+1], c); 349 if (!q) 350 continue; 351 break; 352 case STAR: 353 put(sp, state, sp, p); 354 continue; 355 case QUOTE: 356 k = pat[p+1]; 357 default: 358 if (k != c) 359 continue; 360 /* FALLTHRU */ 361 case ANY: 362 break; 363 } 364 put(sp, state, sp, aux[p]); 365 } 366 if (sp == state) { /* if no new states return */ 367 #ifdef __LINE_MATCH 368 369 if (lastp || (soff == slen - 1)) 370 return ((CHAR *)lastp); 371 else 372 break; 373 #else 374 return ((CHAR *)lastp); 375 #endif 376 } 377 } 378 #ifdef __LINE_MATCH 379 } 380 return ((CHAR *)lastp); 381 #endif 382 } 383 384 385 #if !defined(__LINE_MATCH) && !defined(__MB_CHAR) 386 /* 387 * The Compiler 388 */ 389 390 typedef struct args { 391 const PCHAR *pattern; 392 int *aux; 393 int patp; 394 int length; 395 PCHAR Ch; 396 } arg_t; 397 398 LOCAL void nextitem __PR((arg_t *)); 399 LOCAL int prim __PR((arg_t *)); 400 LOCAL int expr __PR((arg_t *, int *)); 401 LOCAL void setexits __PR((int *, int, int)); 402 LOCAL int join __PR((int *, int, int)); 403 404 /* 405 * 'read' the next character from pattern 406 */ 407 #define rch(ap) \ 408 { \ 409 if (++(ap)->patp >= (ap)->length) \ 410 (ap)->Ch = 0; \ 411 else \ 412 (ap)->Ch = (ap)->pattern[(ap)->patp]; \ 413 } 414 415 /* 416 * 'peek' the next character from pattern 417 */ 418 #define pch(ap) \ 419 ((((ap)->patp + 1) >= (ap)->length) ? \ 420 0 \ 421 : \ 422 (ap)->pattern[(ap)->patp+1]) \ 423 424 /* 425 * get the next item from pattern 426 */ 427 LOCAL void 428 nextitem(ap) 429 arg_t *ap; 430 { 431 if (ap->Ch == QUOTE) 432 rch(ap); 433 rch(ap); 434 } 435 436 /* 437 * parse a primary 438 */ 439 LOCAL int 440 prim(ap) 441 arg_t *ap; 442 { 443 int a = ap->patp; 444 int op = ap->Ch; 445 int t; 446 447 nextitem(ap); 448 switch (op) { 449 450 case '\0': 451 case ALT: 452 case RBRACK: 453 return (ENDSTATE); 454 case LCLASS: 455 while (ap->Ch != RCLASS && ap->Ch != '\0') { 456 #ifdef POSIX_CLASS 457 if (ap->Ch == LCLASS) { 458 if (pch(ap) == ':') { /* [:alpha:] */ 459 char class[CL_SIZE+1]; 460 char *pc = class; 461 462 nextitem(ap); 463 nextitem(ap); 464 while (ap->Ch != ':' && 465 ap->Ch != '\0') { 466 if (pc > &class[CL_SIZE]) 467 return (ENDSTATE); 468 *pc = ap->Ch; 469 if (*pc++ != ap->Ch) 470 return (ENDSTATE); 471 nextitem(ap); 472 } 473 if (pc == class) 474 return (ENDSTATE); 475 *pc = '\0'; 476 if (ap->Ch == '\0') 477 return (ENDSTATE); 478 if (wctype(class) == 0) 479 return (ENDSTATE); 480 nextitem(ap); 481 } 482 if (ap->Ch != RCLASS) 483 return (ENDSTATE); 484 } 485 #endif 486 nextitem(ap); 487 } 488 if (ap->Ch == '\0') 489 return (ENDSTATE); 490 nextitem(ap); 491 break; 492 case REP: 493 t = prim(ap); 494 if (t == ENDSTATE) 495 return (ENDSTATE); 496 setexits(ap->aux, t, a); 497 break; 498 case LBRACK: 499 a = expr(ap, &ap->aux[a]); 500 if (a == ENDSTATE || ap->Ch != RBRACK) 501 return (ENDSTATE); 502 nextitem(ap); 503 break; 504 } 505 return (a); 506 } 507 508 /* 509 * parse an expression (a sequence of primaries) 510 */ 511 LOCAL int 512 expr(ap, altp) 513 arg_t *ap; 514 int *altp; 515 { 516 int exits = ENDSTATE; 517 int a; 518 int *aux = ap->aux; 519 PCHAR Ch; 520 521 for (;;) { 522 a = prim(ap); 523 if (a == ENDSTATE) 524 return (ENDSTATE); 525 Ch = ap->Ch; 526 if (Ch == ALT || Ch == RBRACK || Ch == '\0') { 527 exits = join(aux, exits, a); 528 if (Ch != ALT) 529 return (exits); 530 *altp = ap->patp; 531 altp = &aux[ap->patp]; 532 nextitem(ap); 533 } else 534 setexits(aux, a, ap->patp); 535 } 536 } 537 538 /* 539 * set all exits in a list to a specified value 540 */ 541 LOCAL void 542 setexits(aux, list, val) 543 int *aux; 544 int list; 545 int val; 546 { 547 int a; 548 549 while (list != ENDSTATE) { 550 a = aux[list]; 551 aux[list] = val; 552 list = a; 553 } 554 } 555 556 /* 557 * concatenate two lists 558 */ 559 LOCAL int 560 join(aux, a, b) 561 int *aux; 562 int a; 563 int b; 564 { 565 int t; 566 567 if (a == ENDSTATE) 568 return (b); 569 t = a; 570 while (aux[t] != ENDSTATE) 571 t = aux[t]; 572 aux[t] = b; 573 return (a); 574 } 575 576 /* 577 * patcompile - the external compiler interface. 578 * 579 * The pattern is compiled into the aux array. 580 * Return value on success, is the outermost alternate which is != 0. 581 * Error is indicated by return of 0. 582 */ 583 EXPORT int 584 patcompile(pat, len, aux) 585 const PCHAR *pat; 586 int len; 587 int *aux; 588 { 589 arg_t a; 590 int alt = ENDSTATE; 591 int i; 592 593 a.pattern = pat; 594 a.length = len; 595 a.aux = aux; 596 a.patp = -1; 597 598 for (i = 0; i < len; i++) 599 aux[i] = ENDSTATE; 600 rch(&a); 601 i = expr(&a, &alt); 602 if (i == ENDSTATE) 603 return (0); 604 setexits(aux, i, ENDSTATE); 605 return (alt); 606 } 607 #endif /* !defined(__LINE_MATCH) && !defined(__MB_CHAR) */ 608