1 /*- 2 * Copyright (c) 1992 Diomidis Spinellis. 3 * Copyright (c) 1992, 1993 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Diomidis Spinellis of Imperial College, University of London. 8 * 9 * %sccs.include.redist.c% 10 */ 11 12 #ifndef lint 13 static char sccsid[] = "@(#)compile.c 8.1 (Berkeley) 06/06/93"; 14 #endif /* not lint */ 15 16 #include <sys/types.h> 17 #include <sys/stat.h> 18 19 #include <ctype.h> 20 #include <errno.h> 21 #include <fcntl.h> 22 #include <limits.h> 23 #include <regex.h> 24 #include <stdio.h> 25 #include <stdlib.h> 26 #include <string.h> 27 28 #include "defs.h" 29 #include "extern.h" 30 31 #define LHSZ 128 32 #define LHMASK (LHSZ - 1) 33 static struct labhash { 34 struct labhash *lh_next; 35 u_int lh_hash; 36 struct s_command *lh_cmd; 37 int lh_ref; 38 } *labels[LHSZ]; 39 40 static char *compile_addr __P((char *, struct s_addr *)); 41 static char *compile_delimited __P((char *, char *)); 42 static char *compile_flags __P((char *, struct s_subst *)); 43 static char *compile_re __P((char *, regex_t **)); 44 static char *compile_subst __P((char *, struct s_subst *)); 45 static char *compile_text __P((void)); 46 static char *compile_tr __P((char *, char **)); 47 static struct s_command 48 **compile_stream __P((char *, struct s_command **, char *)); 49 static char *duptoeol __P((char *, char *)); 50 static void enterlabel __P((struct s_command *)); 51 static struct s_command 52 *findlabel __P((char *)); 53 static void fixuplabel __P((struct s_command *, struct s_command *)); 54 static void uselabel __P((void)); 55 56 /* 57 * Command specification. This is used to drive the command parser. 58 */ 59 struct s_format { 60 char code; /* Command code */ 61 int naddr; /* Number of address args */ 62 enum e_args args; /* Argument type */ 63 }; 64 65 static struct s_format cmd_fmts[] = { 66 {'{', 2, GROUP}, 67 {'a', 1, TEXT}, 68 {'b', 2, BRANCH}, 69 {'c', 2, TEXT}, 70 {'d', 2, EMPTY}, 71 {'D', 2, EMPTY}, 72 {'g', 2, EMPTY}, 73 {'G', 2, EMPTY}, 74 {'h', 2, EMPTY}, 75 {'H', 2, EMPTY}, 76 {'i', 1, TEXT}, 77 {'l', 2, EMPTY}, 78 {'n', 2, EMPTY}, 79 {'N', 2, EMPTY}, 80 {'p', 2, EMPTY}, 81 {'P', 2, EMPTY}, 82 {'q', 1, EMPTY}, 83 {'r', 1, RFILE}, 84 {'s', 2, SUBST}, 85 {'t', 2, BRANCH}, 86 {'w', 2, WFILE}, 87 {'x', 2, EMPTY}, 88 {'y', 2, TR}, 89 {'!', 2, NONSEL}, 90 {':', 0, LABEL}, 91 {'#', 0, COMMENT}, 92 {'=', 1, EMPTY}, 93 {'\0', 0, COMMENT}, 94 }; 95 96 /* The compiled program. */ 97 struct s_command *prog; 98 99 /* 100 * Compile the program into prog. 101 * Initialise appends. 102 */ 103 void 104 compile() 105 { 106 *compile_stream(NULL, &prog, NULL) = NULL; 107 fixuplabel(prog, NULL); 108 uselabel(); 109 appends = xmalloc(sizeof(struct s_appends) * appendnum); 110 match = xmalloc((maxnsub + 1) * sizeof(regmatch_t)); 111 } 112 113 #define EATSPACE() do { \ 114 if (p) \ 115 while (*p && isascii(*p) && isspace(*p)) \ 116 p++; \ 117 } while (0) 118 119 static struct s_command ** 120 compile_stream(terminator, link, p) 121 char *terminator; 122 struct s_command **link; 123 register char *p; 124 { 125 static char lbuf[_POSIX2_LINE_MAX + 1]; /* To save stack */ 126 struct s_command *cmd, *cmd2; 127 struct s_format *fp; 128 int naddr; /* Number of addresses */ 129 130 if (p != NULL) 131 goto semicolon; 132 for (;;) { 133 if ((p = cu_fgets(lbuf, sizeof(lbuf))) == NULL) { 134 if (terminator != NULL) 135 err(COMPILE, "unexpected EOF (pending }'s)"); 136 return (link); 137 } 138 139 semicolon: EATSPACE(); 140 if (p && (*p == '#' || *p == '\0')) 141 continue; 142 if (*p == '}') { 143 if (terminator == NULL) 144 err(COMPILE, "unexpected }"); 145 return (link); 146 } 147 *link = cmd = xmalloc(sizeof(struct s_command)); 148 link = &cmd->next; 149 cmd->nonsel = cmd->inrange = 0; 150 /* First parse the addresses */ 151 naddr = 0; 152 cmd->a1 = cmd->a2 = NULL; 153 154 /* Valid characters to start an address */ 155 #define addrchar(c) (strchr("0123456789/\\$", (c))) 156 if (addrchar(*p)) { 157 naddr++; 158 cmd->a1 = xmalloc(sizeof(struct s_addr)); 159 p = compile_addr(p, cmd->a1); 160 EATSPACE(); /* EXTENSION */ 161 if (*p == ',') { 162 naddr++; 163 p++; 164 EATSPACE(); /* EXTENSION */ 165 cmd->a2 = xmalloc(sizeof(struct s_addr)); 166 p = compile_addr(p, cmd->a2); 167 } 168 } 169 170 nonsel: /* Now parse the command */ 171 EATSPACE(); 172 if (!*p) 173 err(COMPILE, "command expected"); 174 cmd->code = *p; 175 for (fp = cmd_fmts; fp->code; fp++) 176 if (fp->code == *p) 177 break; 178 if (!fp->code) 179 err(COMPILE, "invalid command code %c", *p); 180 if (naddr > fp->naddr) 181 err(COMPILE, 182 "command %c expects up to %d address(es), found %d", *p, fp->naddr, naddr); 183 switch (fp->args) { 184 case NONSEL: /* ! */ 185 cmd->nonsel = ! cmd->nonsel; 186 p++; 187 goto nonsel; 188 case GROUP: /* { */ 189 p++; 190 EATSPACE(); 191 if (!*p) 192 p = NULL; 193 cmd2 = xmalloc(sizeof(struct s_command)); 194 cmd2->code = '}'; 195 *compile_stream("}", &cmd->u.c, p) = cmd2; 196 cmd->next = cmd2; 197 link = &cmd2->next; 198 break; 199 case EMPTY: /* d D g G h H l n N p P q x = \0 */ 200 p++; 201 EATSPACE(); 202 if (*p == ';') { 203 p++; 204 link = &cmd->next; 205 goto semicolon; 206 } 207 if (*p) 208 err(COMPILE, 209 "extra characters at the end of %c command", cmd->code); 210 break; 211 case TEXT: /* a c i */ 212 p++; 213 EATSPACE(); 214 if (*p != '\\') 215 err(COMPILE, 216 "command %c expects \\ followed by text", cmd->code); 217 p++; 218 EATSPACE(); 219 if (*p) 220 err(COMPILE, 221 "extra characters after \\ at the end of %c command", cmd->code); 222 cmd->t = compile_text(); 223 break; 224 case COMMENT: /* \0 # */ 225 break; 226 case WFILE: /* w */ 227 p++; 228 EATSPACE(); 229 if (*p == '\0') 230 err(COMPILE, "filename expected"); 231 cmd->t = duptoeol(p, "w command"); 232 if (aflag) 233 cmd->u.fd = -1; 234 else if ((cmd->u.fd = open(p, 235 O_WRONLY|O_APPEND|O_CREAT|O_TRUNC, 236 DEFFILEMODE)) == -1) 237 err(FATAL, "%s: %s\n", p, strerror(errno)); 238 break; 239 case RFILE: /* r */ 240 p++; 241 EATSPACE(); 242 if (*p == '\0') 243 err(COMPILE, "filename expected"); 244 else 245 cmd->t = duptoeol(p, "read command"); 246 break; 247 case BRANCH: /* b t */ 248 p++; 249 EATSPACE(); 250 if (*p == '\0') 251 cmd->t = NULL; 252 else 253 cmd->t = duptoeol(p, "branch"); 254 break; 255 case LABEL: /* : */ 256 p++; 257 EATSPACE(); 258 cmd->t = duptoeol(p, "label"); 259 if (strlen(p) == 0) 260 err(COMPILE, "empty label"); 261 enterlabel(cmd); 262 break; 263 case SUBST: /* s */ 264 p++; 265 if (*p == '\0' || *p == '\\') 266 err(COMPILE, 267 "substitute pattern can not be delimited by newline or backslash"); 268 cmd->u.s = xmalloc(sizeof(struct s_subst)); 269 p = compile_re(p, &cmd->u.s->re); 270 if (p == NULL) 271 err(COMPILE, "unterminated substitute pattern"); 272 --p; 273 p = compile_subst(p, cmd->u.s); 274 p = compile_flags(p, cmd->u.s); 275 EATSPACE(); 276 if (*p == ';') { 277 p++; 278 link = &cmd->next; 279 goto semicolon; 280 } 281 break; 282 case TR: /* y */ 283 p++; 284 p = compile_tr(p, (char **)&cmd->u.y); 285 EATSPACE(); 286 if (*p == ';') { 287 p++; 288 link = &cmd->next; 289 goto semicolon; 290 } 291 if (*p) 292 err(COMPILE, 293 "extra text at the end of a transform command"); 294 break; 295 } 296 } 297 } 298 299 /* 300 * Get a delimited string. P points to the delimeter of the string; d points 301 * to a buffer area. Newline and delimiter escapes are processed; other 302 * escapes are ignored. 303 * 304 * Returns a pointer to the first character after the final delimiter or NULL 305 * in the case of a non-terminated string. The character array d is filled 306 * with the processed string. 307 */ 308 static char * 309 compile_delimited(p, d) 310 char *p, *d; 311 { 312 char c; 313 314 c = *p++; 315 if (c == '\0') 316 return (NULL); 317 else if (c == '\\') 318 err(COMPILE, "\\ can not be used as a string delimiter"); 319 else if (c == '\n') 320 err(COMPILE, "newline can not be used as a string delimiter"); 321 while (*p) { 322 if (*p == '\\' && p[1] == c) 323 p++; 324 else if (*p == '\\' && p[1] == 'n') { 325 *d++ = '\n'; 326 p += 2; 327 continue; 328 } else if (*p == '\\' && p[1] == '\\') 329 *d++ = *p++; 330 else if (*p == c) { 331 *d = '\0'; 332 return (p + 1); 333 } 334 *d++ = *p++; 335 } 336 return (NULL); 337 } 338 339 /* 340 * Get a regular expression. P points to the delimiter of the regular 341 * expression; repp points to the address of a regexp pointer. Newline 342 * and delimiter escapes are processed; other escapes are ignored. 343 * Returns a pointer to the first character after the final delimiter 344 * or NULL in the case of a non terminated regular expression. The regexp 345 * pointer is set to the compiled regular expression. 346 * Cflags are passed to regcomp. 347 */ 348 static char * 349 compile_re(p, repp) 350 char *p; 351 regex_t **repp; 352 { 353 int eval; 354 char re[_POSIX2_LINE_MAX + 1]; 355 356 p = compile_delimited(p, re); 357 if (p && strlen(re) == 0) { 358 *repp = NULL; 359 return (p); 360 } 361 *repp = xmalloc(sizeof(regex_t)); 362 if (p && (eval = regcomp(*repp, re, 0)) != 0) 363 err(COMPILE, "RE error: %s", strregerror(eval, *repp)); 364 if (maxnsub < (*repp)->re_nsub) 365 maxnsub = (*repp)->re_nsub; 366 return (p); 367 } 368 369 /* 370 * Compile the substitution string of a regular expression and set res to 371 * point to a saved copy of it. Nsub is the number of parenthesized regular 372 * expressions. 373 */ 374 static char * 375 compile_subst(p, s) 376 char *p; 377 struct s_subst *s; 378 { 379 static char lbuf[_POSIX2_LINE_MAX + 1]; 380 int asize, ref, size; 381 char c, *text, *op, *sp; 382 383 c = *p++; /* Terminator character */ 384 if (c == '\0') 385 return (NULL); 386 387 s->maxbref = 0; 388 s->linenum = linenum; 389 asize = 2 * _POSIX2_LINE_MAX + 1; 390 text = xmalloc(asize); 391 size = 0; 392 do { 393 op = sp = text + size; 394 for (; *p; p++) { 395 if (*p == '\\') { 396 p++; 397 if (strchr("123456789", *p) != NULL) { 398 *sp++ = '\\'; 399 ref = *p - '0'; 400 if (s->re != NULL && 401 ref > s->re->re_nsub) 402 err(COMPILE, 403 "\\%c not defined in the RE", *p); 404 if (s->maxbref < ref) 405 s->maxbref = ref; 406 } else if (*p == '&' || *p == '\\') 407 *sp++ = '\\'; 408 } else if (*p == c) { 409 p++; 410 *sp++ = '\0'; 411 size += sp - op; 412 s->new = xrealloc(text, size); 413 return (p); 414 } else if (*p == '\n') { 415 err(COMPILE, 416 "unescaped newline inside substitute pattern"); 417 /* NOTREACHED */ 418 } 419 *sp++ = *p; 420 } 421 size += sp - op; 422 if (asize - size < _POSIX2_LINE_MAX + 1) { 423 asize *= 2; 424 text = xmalloc(asize); 425 } 426 } while (cu_fgets(p = lbuf, sizeof(lbuf))); 427 err(COMPILE, "unterminated substitute in regular expression"); 428 /* NOTREACHED */ 429 } 430 431 /* 432 * Compile the flags of the s command 433 */ 434 static char * 435 compile_flags(p, s) 436 char *p; 437 struct s_subst *s; 438 { 439 int gn; /* True if we have seen g or n */ 440 char wfile[_POSIX2_LINE_MAX + 1], *q; 441 442 s->n = 1; /* Default */ 443 s->p = 0; 444 s->wfile = NULL; 445 s->wfd = -1; 446 for (gn = 0;;) { 447 EATSPACE(); /* EXTENSION */ 448 switch (*p) { 449 case 'g': 450 if (gn) 451 err(COMPILE, 452 "more than one number or 'g' in substitute flags"); 453 gn = 1; 454 s->n = 0; 455 break; 456 case '\0': 457 case '\n': 458 case ';': 459 return (p); 460 case 'p': 461 s->p = 1; 462 break; 463 case '1': case '2': case '3': 464 case '4': case '5': case '6': 465 case '7': case '8': case '9': 466 if (gn) 467 err(COMPILE, 468 "more than one number or 'g' in substitute flags"); 469 gn = 1; 470 /* XXX Check for overflow */ 471 s->n = (int)strtol(p, &p, 10); 472 break; 473 case 'w': 474 p++; 475 #ifdef HISTORIC_PRACTICE 476 if (*p != ' ') { 477 err(WARNING, "space missing before w wfile"); 478 return (p); 479 } 480 #endif 481 EATSPACE(); 482 q = wfile; 483 while (*p) { 484 if (*p == '\n') 485 break; 486 *q++ = *p++; 487 } 488 *q = '\0'; 489 if (q == wfile) 490 err(COMPILE, "no wfile specified"); 491 s->wfile = strdup(wfile); 492 if (!aflag && (s->wfd = open(wfile, 493 O_WRONLY|O_APPEND|O_CREAT|O_TRUNC, 494 DEFFILEMODE)) == -1) 495 err(FATAL, "%s: %s\n", wfile, strerror(errno)); 496 return (p); 497 default: 498 err(COMPILE, 499 "bad flag in substitute command: '%c'", *p); 500 break; 501 } 502 p++; 503 } 504 } 505 506 /* 507 * Compile a translation set of strings into a lookup table. 508 */ 509 static char * 510 compile_tr(p, transtab) 511 char *p; 512 char **transtab; 513 { 514 int i; 515 char *lt, *op, *np; 516 char old[_POSIX2_LINE_MAX + 1]; 517 char new[_POSIX2_LINE_MAX + 1]; 518 519 if (*p == '\0' || *p == '\\') 520 err(COMPILE, 521 "transform pattern can not be delimited by newline or backslash"); 522 p = compile_delimited(p, old); 523 if (p == NULL) { 524 err(COMPILE, "unterminated transform source string"); 525 return (NULL); 526 } 527 p = compile_delimited(--p, new); 528 if (p == NULL) { 529 err(COMPILE, "unterminated transform target string"); 530 return (NULL); 531 } 532 EATSPACE(); 533 if (strlen(new) != strlen(old)) { 534 err(COMPILE, "transform strings are not the same length"); 535 return (NULL); 536 } 537 /* We assume characters are 8 bits */ 538 lt = xmalloc(UCHAR_MAX); 539 for (i = 0; i <= UCHAR_MAX; i++) 540 lt[i] = (char)i; 541 for (op = old, np = new; *op; op++, np++) 542 lt[(u_char)*op] = *np; 543 *transtab = lt; 544 return (p); 545 } 546 547 /* 548 * Compile the text following an a or i command. 549 */ 550 static char * 551 compile_text() 552 { 553 int asize, size; 554 char *text, *p, *op, *s; 555 char lbuf[_POSIX2_LINE_MAX + 1]; 556 557 asize = 2 * _POSIX2_LINE_MAX + 1; 558 text = xmalloc(asize); 559 size = 0; 560 while (cu_fgets(lbuf, sizeof(lbuf))) { 561 op = s = text + size; 562 p = lbuf; 563 EATSPACE(); 564 for (; *p; p++) { 565 if (*p == '\\') 566 p++; 567 *s++ = *p; 568 } 569 size += s - op; 570 if (p[-2] != '\\') { 571 *s = '\0'; 572 break; 573 } 574 if (asize - size < _POSIX2_LINE_MAX + 1) { 575 asize *= 2; 576 text = xmalloc(asize); 577 } 578 } 579 return (xrealloc(text, size + 1)); 580 } 581 582 /* 583 * Get an address and return a pointer to the first character after 584 * it. Fill the structure pointed to according to the address. 585 */ 586 static char * 587 compile_addr(p, a) 588 char *p; 589 struct s_addr *a; 590 { 591 char *end; 592 593 switch (*p) { 594 case '\\': /* Context address */ 595 ++p; 596 /* FALLTHROUGH */ 597 case '/': /* Context address */ 598 p = compile_re(p, &a->u.r); 599 if (p == NULL) 600 err(COMPILE, "unterminated regular expression"); 601 a->type = AT_RE; 602 return (p); 603 604 case '$': /* Last line */ 605 a->type = AT_LAST; 606 return (p + 1); 607 /* Line number */ 608 case '0': case '1': case '2': case '3': case '4': 609 case '5': case '6': case '7': case '8': case '9': 610 a->type = AT_LINE; 611 a->u.l = strtol(p, &end, 10); 612 return (end); 613 default: 614 err(COMPILE, "expected context address"); 615 return (NULL); 616 } 617 } 618 619 /* 620 * duptoeol -- 621 * Return a copy of all the characters up to \n or \0. 622 */ 623 static char * 624 duptoeol(s, ctype) 625 register char *s; 626 char *ctype; 627 { 628 size_t len; 629 int ws; 630 char *start; 631 632 ws = 0; 633 for (start = s; *s != '\0' && *s != '\n'; ++s) 634 ws = isspace(*s); 635 *s = '\0'; 636 if (ws) 637 err(WARNING, "whitespace after %s", ctype); 638 len = s - start + 1; 639 return (memmove(xmalloc(len), start, len)); 640 } 641 642 /* 643 * Convert goto label names to addresses, and count a and r commands, in 644 * the given subset of the script. Free the memory used by labels in b 645 * and t commands (but not by :). 646 * 647 * TODO: Remove } nodes 648 */ 649 static void 650 fixuplabel(cp, end) 651 struct s_command *cp, *end; 652 { 653 654 for (; cp != end; cp = cp->next) 655 switch (cp->code) { 656 case 'a': 657 case 'r': 658 appendnum++; 659 break; 660 case 'b': 661 case 't': 662 /* Resolve branch target. */ 663 if (cp->t == NULL) { 664 cp->u.c = NULL; 665 break; 666 } 667 if ((cp->u.c = findlabel(cp->t)) == NULL) 668 err(COMPILE2, "undefined label '%s'", cp->t); 669 free(cp->t); 670 break; 671 case '{': 672 /* Do interior commands. */ 673 fixuplabel(cp->u.c, cp->next); 674 break; 675 } 676 } 677 678 /* 679 * Associate the given command label for later lookup. 680 */ 681 static void 682 enterlabel(cp) 683 struct s_command *cp; 684 { 685 register struct labhash **lhp, *lh; 686 register u_char *p; 687 register u_int h, c; 688 689 for (h = 0, p = (u_char *)cp->t; (c = *p) != 0; p++) 690 h = (h << 5) + h + c; 691 lhp = &labels[h & LHMASK]; 692 for (lh = *lhp; lh != NULL; lh = lh->lh_next) 693 if (lh->lh_hash == h && strcmp(cp->t, lh->lh_cmd->t) == 0) 694 err(COMPILE2, "duplicate label '%s'", cp->t); 695 lh = xmalloc(sizeof *lh); 696 lh->lh_next = *lhp; 697 lh->lh_hash = h; 698 lh->lh_cmd = cp; 699 lh->lh_ref = 0; 700 *lhp = lh; 701 } 702 703 /* 704 * Find the label contained in the command l in the command linked 705 * list cp. L is excluded from the search. Return NULL if not found. 706 */ 707 static struct s_command * 708 findlabel(name) 709 char *name; 710 { 711 register struct labhash *lh; 712 register u_char *p; 713 register u_int h, c; 714 715 for (h = 0, p = (u_char *)name; (c = *p) != 0; p++) 716 h = (h << 5) + h + c; 717 for (lh = labels[h & LHMASK]; lh != NULL; lh = lh->lh_next) { 718 if (lh->lh_hash == h && strcmp(name, lh->lh_cmd->t) == 0) { 719 lh->lh_ref = 1; 720 return (lh->lh_cmd); 721 } 722 } 723 return (NULL); 724 } 725 726 /* 727 * Warn about any unused labels. As a side effect, release the label hash 728 * table space. 729 */ 730 static void 731 uselabel() 732 { 733 register struct labhash *lh, *next; 734 register int i; 735 736 for (i = 0; i < LHSZ; i++) { 737 for (lh = labels[i]; lh != NULL; lh = next) { 738 next = lh->lh_next; 739 if (!lh->lh_ref) 740 err(WARNING, "unused label '%s'", 741 lh->lh_cmd->t); 742 free(lh); 743 } 744 } 745 } 746