1 /*- 2 * Copyright (c) 1992 Diomidis Spinellis. 3 * Copyright (c) 1992 The Regents of the University of California. 4 * All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Diomidis Spinellis of Imperial College, University of London. 8 * 9 * %sccs.include.redist.c% 10 */ 11 12 #ifndef lint 13 static char sccsid[] = "@(#)compile.c 5.7 (Berkeley) 03/07/93"; 14 #endif /* not lint */ 15 16 #include <sys/types.h> 17 #include <sys/stat.h> 18 19 #include <ctype.h> 20 #include <errno.h> 21 #include <fcntl.h> 22 #include <limits.h> 23 #include <regex.h> 24 #include <stdio.h> 25 #include <stdlib.h> 26 #include <string.h> 27 28 #include "defs.h" 29 #include "extern.h" 30 31 static char *compile_addr __P((char *, struct s_addr *)); 32 static char *compile_delimited __P((char *, char *)); 33 static char *compile_flags __P((char *, struct s_subst *)); 34 static char *compile_re __P((char *, regex_t **)); 35 static char *compile_subst __P((char *, struct s_subst *)); 36 static char *compile_text __P((void)); 37 static char *compile_tr __P((char *, char **)); 38 static struct s_command 39 **compile_stream __P((char *, struct s_command **, char *)); 40 static char *duptoeol __P((char *, char *)); 41 static struct s_command 42 *findlabel __P((struct s_command *, struct s_command *)); 43 static void fixuplabel __P((struct s_command *, struct s_command *, 44 struct s_command *)); 45 static void uselabel __P((struct s_command *)); 46 47 /* 48 * Command specification. This is used to drive the command parser. 49 */ 50 struct s_format { 51 char code; /* Command code */ 52 int naddr; /* Number of address args */ 53 enum e_args args; /* Argument type */ 54 }; 55 56 static struct s_format cmd_fmts[] = { 57 {'{', 2, GROUP}, 58 {'a', 1, TEXT}, 59 {'b', 2, BRANCH}, 60 {'c', 2, TEXT}, 61 {'d', 2, EMPTY}, 62 {'D', 2, EMPTY}, 63 {'g', 2, EMPTY}, 64 {'G', 2, EMPTY}, 65 {'h', 2, EMPTY}, 66 {'H', 2, EMPTY}, 67 {'i', 1, TEXT}, 68 {'l', 2, EMPTY}, 69 {'n', 2, EMPTY}, 70 {'N', 2, EMPTY}, 71 {'p', 2, EMPTY}, 72 {'P', 2, EMPTY}, 73 {'q', 1, EMPTY}, 74 {'r', 1, RFILE}, 75 {'s', 2, SUBST}, 76 {'t', 2, BRANCH}, 77 {'w', 2, WFILE}, 78 {'x', 2, EMPTY}, 79 {'y', 2, TR}, 80 {'!', 2, NONSEL}, 81 {':', 0, LABEL}, 82 {'#', 0, COMMENT}, 83 {'=', 1, EMPTY}, 84 {'\0', 0, COMMENT}, 85 }; 86 87 /* The compiled program. */ 88 struct s_command *prog; 89 90 /* 91 * Compile the program into prog. 92 * Initialise appends. 93 */ 94 void 95 compile() 96 { 97 *compile_stream(NULL, &prog, NULL) = NULL; 98 fixuplabel(prog, prog, NULL); 99 appends = xmalloc(sizeof(struct s_appends) * appendnum); 100 match = xmalloc((maxnsub + 1) * sizeof(regmatch_t)); 101 } 102 103 #define EATSPACE() do { \ 104 if (p) \ 105 while (*p && isascii(*p) && isspace(*p)) \ 106 p++; \ 107 } while (0) 108 109 static struct s_command ** 110 compile_stream(terminator, link, p) 111 char *terminator; 112 struct s_command **link; 113 register char *p; 114 { 115 static char lbuf[_POSIX2_LINE_MAX + 1]; /* To save stack */ 116 struct s_command *cmd, *cmd2; 117 struct s_format *fp; 118 int naddr; /* Number of addresses */ 119 120 if (p != NULL) 121 goto semicolon; 122 for (;;) { 123 if ((p = cu_fgets(lbuf, sizeof(lbuf))) == NULL) { 124 if (terminator != NULL) 125 err(COMPILE, "unexpected EOF (pending }'s)"); 126 return (link); 127 } 128 129 semicolon: EATSPACE(); 130 if (p && (*p == '#' || *p == '\0')) 131 continue; 132 if (*p == '}') { 133 if (terminator == NULL) 134 err(COMPILE, "unexpected }"); 135 return (link); 136 } 137 *link = cmd = xmalloc(sizeof(struct s_command)); 138 link = &cmd->next; 139 cmd->lused = cmd->nonsel = cmd->inrange = 0; 140 /* First parse the addresses */ 141 naddr = 0; 142 cmd->a1 = cmd->a2 = NULL; 143 144 /* Valid characters to start an address */ 145 #define addrchar(c) (strchr("0123456789/\\$", (c))) 146 if (addrchar(*p)) { 147 naddr++; 148 cmd->a1 = xmalloc(sizeof(struct s_addr)); 149 p = compile_addr(p, cmd->a1); 150 EATSPACE(); /* EXTENSION */ 151 if (*p == ',') { 152 naddr++; 153 p++; 154 EATSPACE(); /* EXTENSION */ 155 cmd->a2 = xmalloc(sizeof(struct s_addr)); 156 p = compile_addr(p, cmd->a2); 157 } 158 } 159 160 nonsel: /* Now parse the command */ 161 EATSPACE(); 162 if (!*p) 163 err(COMPILE, "command expected"); 164 cmd->code = *p; 165 for (fp = cmd_fmts; fp->code; fp++) 166 if (fp->code == *p) 167 break; 168 if (!fp->code) 169 err(COMPILE, "invalid command code %c", *p); 170 if (naddr > fp->naddr) 171 err(COMPILE, 172 "command %c expects up to %d address(es), found %d", *p, fp->naddr, naddr); 173 switch (fp->args) { 174 case NONSEL: /* ! */ 175 cmd->nonsel = ! cmd->nonsel; 176 p++; 177 goto nonsel; 178 case GROUP: /* { */ 179 p++; 180 EATSPACE(); 181 if (!*p) 182 p = NULL; 183 cmd2 = xmalloc(sizeof(struct s_command)); 184 cmd2->code = '}'; 185 *compile_stream("}", &cmd->u.c, p) = cmd2; 186 cmd->next = cmd2; 187 link = &cmd2->next; 188 break; 189 case EMPTY: /* d D g G h H l n N p P q x = \0 */ 190 p++; 191 EATSPACE(); 192 if (*p == ';') { 193 p++; 194 link = &cmd->next; 195 goto semicolon; 196 } 197 if (*p) 198 err(COMPILE, 199 "extra characters at the end of %c command", cmd->code); 200 break; 201 case TEXT: /* a c i */ 202 p++; 203 EATSPACE(); 204 if (*p != '\\') 205 err(COMPILE, 206 "command %c expects \\ followed by text", cmd->code); 207 p++; 208 EATSPACE(); 209 if (*p) 210 err(COMPILE, 211 "extra characters after \\ at the end of %c command", cmd->code); 212 cmd->t = compile_text(); 213 break; 214 case COMMENT: /* \0 # */ 215 break; 216 case WFILE: /* w */ 217 p++; 218 EATSPACE(); 219 if (*p == '\0') 220 err(COMPILE, "filename expected"); 221 cmd->t = duptoeol(p, "w command"); 222 if (aflag) 223 cmd->u.fd = -1; 224 else if ((cmd->u.fd = open(p, 225 O_WRONLY|O_APPEND|O_CREAT|O_TRUNC, 226 DEFFILEMODE)) == -1) 227 err(FATAL, "%s: %s\n", p, strerror(errno)); 228 break; 229 case RFILE: /* r */ 230 p++; 231 EATSPACE(); 232 if (*p == '\0') 233 err(COMPILE, "filename expected"); 234 else 235 cmd->t = duptoeol(p, "read command"); 236 break; 237 case BRANCH: /* b t */ 238 p++; 239 EATSPACE(); 240 if (*p == '\0') 241 cmd->t = NULL; 242 else 243 cmd->t = duptoeol(p, "branch"); 244 break; 245 case LABEL: /* : */ 246 p++; 247 EATSPACE(); 248 cmd->t = duptoeol(p, "label"); 249 if (strlen(p) == 0) 250 err(COMPILE, "empty label"); 251 break; 252 case SUBST: /* s */ 253 p++; 254 if (*p == '\0' || *p == '\\') 255 err(COMPILE, 256 "substitute pattern can not be delimited by newline or backslash"); 257 cmd->u.s = xmalloc(sizeof(struct s_subst)); 258 p = compile_re(p, &cmd->u.s->re); 259 if (p == NULL) 260 err(COMPILE, "unterminated substitute pattern"); 261 --p; 262 p = compile_subst(p, cmd->u.s); 263 p = compile_flags(p, cmd->u.s); 264 EATSPACE(); 265 if (*p == ';') { 266 p++; 267 link = &cmd->next; 268 goto semicolon; 269 } 270 break; 271 case TR: /* y */ 272 p++; 273 p = compile_tr(p, (char **)&cmd->u.y); 274 EATSPACE(); 275 if (*p == ';') { 276 p++; 277 link = &cmd->next; 278 goto semicolon; 279 } 280 if (*p) 281 err(COMPILE, 282 "extra text at the end of a transform command"); 283 break; 284 } 285 } 286 } 287 288 /* 289 * Get a delimited string. P points to the delimeter of the string; d points 290 * to a buffer area. Newline and delimiter escapes are processed; other 291 * escapes are ignored. 292 * 293 * Returns a pointer to the first character after the final delimiter or NULL 294 * in the case of a non-terminated string. The character array d is filled 295 * with the processed string. 296 */ 297 static char * 298 compile_delimited(p, d) 299 char *p, *d; 300 { 301 char c; 302 303 c = *p++; 304 if (c == '\0') 305 return (NULL); 306 else if (c == '\\') 307 err(COMPILE, "\\ can not be used as a string delimiter"); 308 else if (c == '\n') 309 err(COMPILE, "newline can not be used as a string delimiter"); 310 while (*p) { 311 if (*p == '\\' && p[1] == c) 312 p++; 313 else if (*p == '\\' && p[1] == 'n') { 314 *d++ = '\n'; 315 p += 2; 316 continue; 317 } else if (*p == '\\' && p[1] == '\\') 318 *d++ = *p++; 319 else if (*p == c) { 320 *d = '\0'; 321 return (p + 1); 322 } 323 *d++ = *p++; 324 } 325 return (NULL); 326 } 327 328 /* 329 * Get a regular expression. P points to the delimiter of the regular 330 * expression; repp points to the address of a regexp pointer. Newline 331 * and delimiter escapes are processed; other escapes are ignored. 332 * Returns a pointer to the first character after the final delimiter 333 * or NULL in the case of a non terminated regular expression. The regexp 334 * pointer is set to the compiled regular expression. 335 * Cflags are passed to regcomp. 336 */ 337 static char * 338 compile_re(p, repp) 339 char *p; 340 regex_t **repp; 341 { 342 int eval; 343 char re[_POSIX2_LINE_MAX + 1]; 344 345 p = compile_delimited(p, re); 346 if (p && strlen(re) == 0) { 347 *repp = NULL; 348 return (p); 349 } 350 *repp = xmalloc(sizeof(regex_t)); 351 if (p && (eval = regcomp(*repp, re, 0)) != 0) 352 err(COMPILE, "RE error: %s", strregerror(eval, *repp)); 353 if (maxnsub < (*repp)->re_nsub) 354 maxnsub = (*repp)->re_nsub; 355 return (p); 356 } 357 358 /* 359 * Compile the substitution string of a regular expression and set res to 360 * point to a saved copy of it. Nsub is the number of parenthesized regular 361 * expressions. 362 */ 363 static char * 364 compile_subst(p, s) 365 char *p; 366 struct s_subst *s; 367 { 368 static char lbuf[_POSIX2_LINE_MAX + 1]; 369 int asize, ref, size; 370 char c, *text, *op, *sp; 371 372 c = *p++; /* Terminator character */ 373 if (c == '\0') 374 return (NULL); 375 376 s->maxbref = 0; 377 s->linenum = linenum; 378 asize = 2 * _POSIX2_LINE_MAX + 1; 379 text = xmalloc(asize); 380 size = 0; 381 do { 382 op = sp = text + size; 383 for (; *p; p++) { 384 if (*p == '\\') { 385 p++; 386 if (strchr("123456789", *p) != NULL) { 387 *sp++ = '\\'; 388 ref = *p - '0'; 389 if (s->re != NULL && 390 ref > s->re->re_nsub) 391 err(COMPILE, 392 "\\%c not defined in the RE", *p); 393 if (s->maxbref < ref) 394 s->maxbref = ref; 395 } else if (*p == '&' || *p == '\\') 396 *sp++ = '\\'; 397 } else if (*p == c) { 398 p++; 399 *sp++ = '\0'; 400 size += sp - op; 401 s->new = xrealloc(text, size); 402 return (p); 403 } else if (*p == '\n') { 404 err(COMPILE, 405 "unescaped newline inside substitute pattern"); 406 /* NOTREACHED */ 407 } 408 *sp++ = *p; 409 } 410 size += sp - op; 411 if (asize - size < _POSIX2_LINE_MAX + 1) { 412 asize *= 2; 413 text = xmalloc(asize); 414 } 415 } while (cu_fgets(p = lbuf, sizeof(lbuf))); 416 err(COMPILE, "unterminated substitute in regular expression"); 417 /* NOTREACHED */ 418 } 419 420 /* 421 * Compile the flags of the s command 422 */ 423 static char * 424 compile_flags(p, s) 425 char *p; 426 struct s_subst *s; 427 { 428 int gn; /* True if we have seen g or n */ 429 char wfile[_POSIX2_LINE_MAX + 1], *q; 430 431 s->n = 1; /* Default */ 432 s->p = 0; 433 s->wfile = NULL; 434 s->wfd = -1; 435 for (gn = 0;;) { 436 EATSPACE(); /* EXTENSION */ 437 switch (*p) { 438 case 'g': 439 if (gn) 440 err(COMPILE, 441 "more than one number or 'g' in substitute flags"); 442 gn = 1; 443 s->n = 0; 444 break; 445 case '\0': 446 case '\n': 447 case ';': 448 return (p); 449 case 'p': 450 s->p = 1; 451 break; 452 case '1': case '2': case '3': 453 case '4': case '5': case '6': 454 case '7': case '8': case '9': 455 if (gn) 456 err(COMPILE, 457 "more than one number or 'g' in substitute flags"); 458 gn = 1; 459 /* XXX Check for overflow */ 460 s->n = (int)strtol(p, &p, 10); 461 break; 462 case 'w': 463 p++; 464 #ifdef HISTORIC_PRACTICE 465 if (*p != ' ') { 466 err(WARNING, "space missing before w wfile"); 467 return (p); 468 } 469 #endif 470 EATSPACE(); 471 q = wfile; 472 while (*p) { 473 if (*p == '\n') 474 break; 475 *q++ = *p++; 476 } 477 *q = '\0'; 478 if (q == wfile) 479 err(COMPILE, "no wfile specified"); 480 s->wfile = strdup(wfile); 481 if (!aflag && (s->wfd = open(wfile, 482 O_WRONLY|O_APPEND|O_CREAT|O_TRUNC, 483 DEFFILEMODE)) == -1) 484 err(FATAL, "%s: %s\n", wfile, strerror(errno)); 485 return (p); 486 default: 487 err(COMPILE, 488 "bad flag in substitute command: '%c'", *p); 489 break; 490 } 491 p++; 492 } 493 } 494 495 /* 496 * Compile a translation set of strings into a lookup table. 497 */ 498 static char * 499 compile_tr(p, transtab) 500 char *p; 501 char **transtab; 502 { 503 int i; 504 char *lt, *op, *np; 505 char old[_POSIX2_LINE_MAX + 1]; 506 char new[_POSIX2_LINE_MAX + 1]; 507 508 if (*p == '\0' || *p == '\\') 509 err(COMPILE, 510 "transform pattern can not be delimited by newline or backslash"); 511 p = compile_delimited(p, old); 512 if (p == NULL) { 513 err(COMPILE, "unterminated transform source string"); 514 return (NULL); 515 } 516 p = compile_delimited(--p, new); 517 if (p == NULL) { 518 err(COMPILE, "unterminated transform target string"); 519 return (NULL); 520 } 521 EATSPACE(); 522 if (strlen(new) != strlen(old)) { 523 err(COMPILE, "transform strings are not the same length"); 524 return (NULL); 525 } 526 /* We assume characters are 8 bits */ 527 lt = xmalloc(UCHAR_MAX); 528 for (i = 0; i <= UCHAR_MAX; i++) 529 lt[i] = (char)i; 530 for (op = old, np = new; *op; op++, np++) 531 lt[(u_char)*op] = *np; 532 *transtab = lt; 533 return (p); 534 } 535 536 /* 537 * Compile the text following an a or i command. 538 */ 539 static char * 540 compile_text() 541 { 542 int asize, size; 543 char *text, *p, *op, *s; 544 char lbuf[_POSIX2_LINE_MAX + 1]; 545 546 asize = 2 * _POSIX2_LINE_MAX + 1; 547 text = xmalloc(asize); 548 size = 0; 549 while (cu_fgets(lbuf, sizeof(lbuf))) { 550 op = s = text + size; 551 p = lbuf; 552 EATSPACE(); 553 for (; *p; p++) { 554 if (*p == '\\') 555 p++; 556 *s++ = *p; 557 } 558 size += s - op; 559 if (p[-2] != '\\') { 560 *s = '\0'; 561 break; 562 } 563 if (asize - size < _POSIX2_LINE_MAX + 1) { 564 asize *= 2; 565 text = xmalloc(asize); 566 } 567 } 568 return (xrealloc(text, size + 1)); 569 } 570 571 /* 572 * Get an address and return a pointer to the first character after 573 * it. Fill the structure pointed to according to the address. 574 */ 575 static char * 576 compile_addr(p, a) 577 char *p; 578 struct s_addr *a; 579 { 580 char *end; 581 582 switch (*p) { 583 case '\\': /* Context address */ 584 ++p; 585 /* FALLTHROUGH */ 586 case '/': /* Context address */ 587 p = compile_re(p, &a->u.r); 588 if (p == NULL) 589 err(COMPILE, "unterminated regular expression"); 590 a->type = AT_RE; 591 return (p); 592 593 case '$': /* Last line */ 594 a->type = AT_LAST; 595 return (p + 1); 596 /* Line number */ 597 case '0': case '1': case '2': case '3': case '4': 598 case '5': case '6': case '7': case '8': case '9': 599 a->type = AT_LINE; 600 a->u.l = strtol(p, &end, 10); 601 return (end); 602 default: 603 err(COMPILE, "expected context address"); 604 return (NULL); 605 } 606 } 607 608 /* 609 * duptoeol -- 610 * Return a copy of all the characters up to \n or \0. 611 */ 612 static char * 613 duptoeol(s, ctype) 614 register char *s; 615 char *ctype; 616 { 617 size_t len; 618 int ws; 619 char *start; 620 621 ws = 0; 622 for (start = s; *s != '\0' && *s != '\n'; ++s) 623 ws = isspace(*s); 624 *s = '\0'; 625 if (ws) 626 err(WARNING, "whitespace after %s", ctype); 627 len = s - start + 1; 628 return (memmove(xmalloc(len), start, len)); 629 } 630 631 /* 632 * Convert goto label names to addresses. Detect unused and duplicate labels. 633 * Set appendnum to the number of a and r commands in the script. Free the 634 * memory used by labels in b and t commands (but not by :). Root is a pointer 635 * to the script linked list; cp points to the search start. 636 * 637 * TODO: Remove } nodes 638 */ 639 static void 640 fixuplabel(root, cp, end) 641 struct s_command *root, *cp, *end; 642 { 643 644 for (; cp != end; cp = cp->next) 645 switch (cp->code) { 646 case ':': 647 if (findlabel(cp, root)) 648 err(COMPILE2, "duplicate label %s", cp->t); 649 break; 650 case 'a': 651 case 'r': 652 appendnum++; 653 break; 654 case 'b': 655 case 't': 656 if (cp->t == NULL) { 657 cp->u.c = NULL; 658 break; 659 } 660 if ((cp->u.c = findlabel(cp, root)) == NULL) 661 err(COMPILE2, "undefined label '%s'", cp->t); 662 cp->u.c->lused = 1; 663 free(cp->t); 664 break; 665 case '{': 666 fixuplabel(root, cp->u.c, cp->next); 667 break; 668 } 669 uselabel(root); 670 } 671 672 /* 673 * Find the label contained in the command l in the command linked 674 * list cp. L is excluded from the search. Return NULL if not found. 675 */ 676 static struct s_command * 677 findlabel(l, cp) 678 struct s_command *l, *cp; 679 { 680 struct s_command *r; 681 682 for (; cp; cp = cp->next) { 683 if (cp->code == ':' && cp != l && strcmp(l->t, cp->t) == 0) 684 return (cp); 685 if (cp->code == '{' && (r = findlabel(l, cp->u.c))) 686 return (r); 687 } 688 return (NULL); 689 } 690 691 /* 692 * Find any unused labels. This is because we want to warn the user if they 693 * accidentally put whitespace on a label name causing it be a different label 694 * than they intended. 695 */ 696 static void 697 uselabel(cp) 698 struct s_command *cp; 699 { 700 for (; cp; cp = cp->next) { 701 if (cp->code == ':' && cp->lused == 0) 702 err(WARNING, "unused label '%s'", cp->t); 703 if (cp->code == '{') 704 uselabel(cp->u.c); 705 } 706 } 707