1 /*- 2 * Copyright (c) 1992 Diomidis Spinellis. 3 * Copyright (c) 1992 The Regents of the University of California. 4 * All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Diomidis Spinellis of Imperial College, University of London. 8 * 9 * %sccs.include.redist.c% 10 */ 11 12 #ifndef lint 13 static char sccsid[] = "@(#)compile.c 5.6 (Berkeley) 11/02/92"; 14 #endif /* not lint */ 15 16 #include <sys/types.h> 17 #include <sys/stat.h> 18 19 #include <ctype.h> 20 #include <errno.h> 21 #include <fcntl.h> 22 #include <limits.h> 23 #include <regex.h> 24 #include <stdio.h> 25 #include <stdlib.h> 26 #include <string.h> 27 28 #include "defs.h" 29 #include "extern.h" 30 31 static char *compile_addr __P((char *, struct s_addr *)); 32 static char *compile_delimited __P((char *, char *)); 33 static char *compile_flags __P((char *, struct s_subst *)); 34 static char *compile_re __P((char *, regex_t **)); 35 static char *compile_subst __P((char *, struct s_subst *)); 36 static char *compile_text __P((void)); 37 static char *compile_tr __P((char *, char **)); 38 static struct s_command 39 **compile_stream __P((char *, struct s_command **, char *)); 40 static char *duptoeol __P((char *)); 41 static struct s_command 42 *findlabel __P((struct s_command *, struct s_command *)); 43 static void fixuplabel __P((struct s_command *, struct s_command *, 44 struct s_command *)); 45 46 /* 47 * Command specification. This is used to drive the command parser. 48 */ 49 struct s_format { 50 char code; /* Command code */ 51 int naddr; /* Number of address args */ 52 enum e_args args; /* Argument type */ 53 }; 54 55 static struct s_format cmd_fmts[] = { 56 {'{', 2, GROUP}, 57 {'a', 1, TEXT}, 58 {'b', 2, BRANCH}, 59 {'c', 2, TEXT}, 60 {'d', 2, EMPTY}, 61 {'D', 2, EMPTY}, 62 {'g', 2, EMPTY}, 63 {'G', 2, EMPTY}, 64 {'h', 2, EMPTY}, 65 {'H', 2, EMPTY}, 66 {'i', 1, TEXT}, 67 {'l', 2, EMPTY}, 68 {'n', 2, EMPTY}, 69 {'N', 2, EMPTY}, 70 {'p', 2, EMPTY}, 71 {'P', 2, EMPTY}, 72 {'q', 1, EMPTY}, 73 {'r', 1, RFILE}, 74 {'s', 2, SUBST}, 75 {'t', 2, BRANCH}, 76 {'w', 2, WFILE}, 77 {'x', 2, EMPTY}, 78 {'y', 2, TR}, 79 {'!', 2, NONSEL}, 80 {':', 0, LABEL}, 81 {'#', 0, COMMENT}, 82 {'=', 1, EMPTY}, 83 {'\0', 0, COMMENT}, 84 }; 85 86 /* The compiled program. */ 87 struct s_command *prog; 88 89 /* 90 * Compile the program into prog. 91 * Initialise appends. 92 */ 93 void 94 compile() 95 { 96 *compile_stream(NULL, &prog, NULL) = NULL; 97 fixuplabel(prog, prog, NULL); 98 appends = xmalloc(sizeof(struct s_appends) * appendnum); 99 match = xmalloc((maxnsub + 1) * sizeof(regmatch_t)); 100 } 101 102 #define EATSPACE() do { \ 103 if (p) \ 104 while (*p && isascii(*p) && isspace(*p)) \ 105 p++; \ 106 } while (0) 107 108 static struct s_command ** 109 compile_stream(terminator, link, p) 110 char *terminator; 111 struct s_command **link; 112 register char *p; 113 { 114 static char lbuf[_POSIX2_LINE_MAX + 1]; /* To save stack */ 115 struct s_command *cmd, *cmd2; 116 struct s_format *fp; 117 int naddr; /* Number of addresses */ 118 119 if (p != NULL) 120 goto semicolon; 121 for (;;) { 122 if ((p = cu_fgets(lbuf, sizeof(lbuf))) == NULL) { 123 if (terminator != NULL) 124 err(COMPILE, "unexpected EOF (pending }'s)"); 125 return (link); 126 } 127 128 semicolon: EATSPACE(); 129 if (p && (*p == '#' || *p == '\0')) 130 continue; 131 if (*p == '}') { 132 if (terminator == NULL) 133 err(COMPILE, "unexpected }"); 134 return (link); 135 } 136 *link = cmd = xmalloc(sizeof(struct s_command)); 137 link = &cmd->next; 138 cmd->nonsel = cmd->inrange = 0; 139 /* First parse the addresses */ 140 naddr = 0; 141 cmd->a1 = cmd->a2 = NULL; 142 143 /* Valid characters to start an address */ 144 #define addrchar(c) (strchr("0123456789/\\$", (c))) 145 if (addrchar(*p)) { 146 naddr++; 147 cmd->a1 = xmalloc(sizeof(struct s_addr)); 148 p = compile_addr(p, cmd->a1); 149 EATSPACE(); /* EXTENSION */ 150 if (*p == ',') { 151 naddr++; 152 p++; 153 EATSPACE(); /* EXTENSION */ 154 cmd->a2 = xmalloc(sizeof(struct s_addr)); 155 p = compile_addr(p, cmd->a2); 156 } 157 } 158 159 nonsel: /* Now parse the command */ 160 EATSPACE(); 161 if (!*p) 162 err(COMPILE, "command expected"); 163 cmd->code = *p; 164 for (fp = cmd_fmts; fp->code; fp++) 165 if (fp->code == *p) 166 break; 167 if (!fp->code) 168 err(COMPILE, "invalid command code %c", *p); 169 if (naddr > fp->naddr) 170 err(COMPILE, 171 "command %c expects up to %d address(es), found %d", *p, fp->naddr, naddr); 172 switch (fp->args) { 173 case NONSEL: /* ! */ 174 cmd->nonsel = ! cmd->nonsel; 175 p++; 176 goto nonsel; 177 case GROUP: /* { */ 178 p++; 179 EATSPACE(); 180 if (!*p) 181 p = NULL; 182 cmd2 = xmalloc(sizeof(struct s_command)); 183 cmd2->code = '}'; 184 *compile_stream("}", &cmd->u.c, p) = cmd2; 185 cmd->next = cmd2; 186 link = &cmd2->next; 187 break; 188 case EMPTY: /* d D g G h H l n N p P q x = \0 */ 189 p++; 190 EATSPACE(); 191 if (*p == ';') { 192 p++; 193 link = &cmd->next; 194 goto semicolon; 195 } 196 if (*p) 197 err(COMPILE, 198 "extra characters at the end of %c command", cmd->code); 199 break; 200 case TEXT: /* a c i */ 201 p++; 202 EATSPACE(); 203 if (*p != '\\') 204 err(COMPILE, 205 "command %c expects \\ followed by text", cmd->code); 206 p++; 207 EATSPACE(); 208 if (*p) 209 err(COMPILE, 210 "extra characters after \\ at the end of %c command", cmd->code); 211 cmd->t = compile_text(); 212 break; 213 case COMMENT: /* \0 # */ 214 break; 215 case WFILE: /* w */ 216 p++; 217 EATSPACE(); 218 if (*p == '\0') 219 err(COMPILE, "filename expected"); 220 cmd->t = duptoeol(p); 221 if (aflag) 222 cmd->u.fd = -1; 223 else if ((cmd->u.fd = open(p, 224 O_WRONLY|O_APPEND|O_CREAT|O_TRUNC, 225 DEFFILEMODE)) == -1) 226 err(FATAL, "%s: %s\n", p, strerror(errno)); 227 break; 228 case RFILE: /* r */ 229 p++; 230 EATSPACE(); 231 if (*p == '\0') 232 err(COMPILE, "filename expected"); 233 else 234 cmd->t = duptoeol(p); 235 break; 236 case BRANCH: /* b t */ 237 p++; 238 EATSPACE(); 239 if (*p == '\0') 240 cmd->t = NULL; 241 else 242 cmd->t = duptoeol(p); 243 break; 244 case LABEL: /* : */ 245 p++; 246 EATSPACE(); 247 cmd->t = duptoeol(p); 248 if (strlen(p) == 0) 249 err(COMPILE, "empty label"); 250 break; 251 case SUBST: /* s */ 252 p++; 253 if (*p == '\0' || *p == '\\') 254 err(COMPILE, 255 "substitute pattern can not be delimited by newline or backslash"); 256 cmd->u.s = xmalloc(sizeof(struct s_subst)); 257 p = compile_re(p, &cmd->u.s->re); 258 if (p == NULL) 259 err(COMPILE, "unterminated substitute pattern"); 260 --p; 261 p = compile_subst(p, cmd->u.s); 262 p = compile_flags(p, cmd->u.s); 263 EATSPACE(); 264 if (*p == ';') { 265 p++; 266 link = &cmd->next; 267 goto semicolon; 268 } 269 break; 270 case TR: /* y */ 271 p++; 272 p = compile_tr(p, (char **)&cmd->u.y); 273 EATSPACE(); 274 if (*p == ';') { 275 p++; 276 link = &cmd->next; 277 goto semicolon; 278 } 279 if (*p) 280 err(COMPILE, 281 "extra text at the end of a transform command"); 282 break; 283 } 284 } 285 } 286 287 /* 288 * Get a delimited string. P points to the delimeter of the string; d points 289 * to a buffer area. Newline and delimiter escapes are processed; other 290 * escapes are ignored. 291 * 292 * Returns a pointer to the first character after the final delimiter or NULL 293 * in the case of a non-terminated string. The character array d is filled 294 * with the processed string. 295 */ 296 static char * 297 compile_delimited(p, d) 298 char *p, *d; 299 { 300 char c; 301 302 c = *p++; 303 if (c == '\0') 304 return (NULL); 305 else if (c == '\\') 306 err(COMPILE, "\\ can not be used as a string delimiter"); 307 else if (c == '\n') 308 err(COMPILE, "newline can not be used as a string delimiter"); 309 while (*p) { 310 if (*p == '\\' && p[1] == c) 311 p++; 312 else if (*p == '\\' && p[1] == 'n') { 313 *d++ = '\n'; 314 p += 2; 315 continue; 316 } else if (*p == '\\' && p[1] == '\\') 317 *d++ = *p++; 318 else if (*p == c) { 319 *d = '\0'; 320 return (p + 1); 321 } 322 *d++ = *p++; 323 } 324 return (NULL); 325 } 326 327 /* 328 * Get a regular expression. P points to the delimiter of the regular 329 * expression; repp points to the address of a regexp pointer. Newline 330 * and delimiter escapes are processed; other escapes are ignored. 331 * Returns a pointer to the first character after the final delimiter 332 * or NULL in the case of a non terminated regular expression. The regexp 333 * pointer is set to the compiled regular expression. 334 * Cflags are passed to regcomp. 335 */ 336 static char * 337 compile_re(p, repp) 338 char *p; 339 regex_t **repp; 340 { 341 int eval; 342 char re[_POSIX2_LINE_MAX + 1]; 343 344 p = compile_delimited(p, re); 345 if (p && strlen(re) == 0) { 346 *repp = NULL; 347 return (p); 348 } 349 *repp = xmalloc(sizeof(regex_t)); 350 if (p && (eval = regcomp(*repp, re, 0)) != 0) 351 err(COMPILE, "RE error: %s", strregerror(eval, *repp)); 352 if (maxnsub < (*repp)->re_nsub) 353 maxnsub = (*repp)->re_nsub; 354 return (p); 355 } 356 357 /* 358 * Compile the substitution string of a regular expression and set res to 359 * point to a saved copy of it. Nsub is the number of parenthesized regular 360 * expressions. 361 */ 362 static char * 363 compile_subst(p, s) 364 char *p; 365 struct s_subst *s; 366 { 367 static char lbuf[_POSIX2_LINE_MAX + 1]; 368 int asize, ref, size; 369 char c, *text, *op, *sp; 370 371 c = *p++; /* Terminator character */ 372 if (c == '\0') 373 return (NULL); 374 375 s->maxbref = 0; 376 s->linenum = linenum; 377 asize = 2 * _POSIX2_LINE_MAX + 1; 378 text = xmalloc(asize); 379 size = 0; 380 do { 381 op = sp = text + size; 382 for (; *p; p++) { 383 if (*p == '\\') { 384 p++; 385 if (strchr("123456789", *p) != NULL) { 386 *sp++ = '\\'; 387 ref = *p - '0'; 388 if (s->re != NULL && 389 ref > s->re->re_nsub) 390 err(COMPILE, 391 "\\%c not defined in the RE", *p); 392 if (s->maxbref < ref) 393 s->maxbref = ref; 394 } else if (*p == '&' || *p == '\\') 395 *sp++ = '\\'; 396 } else if (*p == c) { 397 p++; 398 *sp++ = '\0'; 399 size += sp - op; 400 s->new = xrealloc(text, size); 401 return (p); 402 } else if (*p == '\n') { 403 err(COMPILE, 404 "unescaped newline inside substitute pattern"); 405 /* NOTREACHED */ 406 } 407 *sp++ = *p; 408 } 409 size += sp - op; 410 if (asize - size < _POSIX2_LINE_MAX + 1) { 411 asize *= 2; 412 text = xmalloc(asize); 413 } 414 } while (cu_fgets(p = lbuf, sizeof(lbuf))); 415 err(COMPILE, "unterminated substitute in regular expression"); 416 /* NOTREACHED */ 417 } 418 419 /* 420 * Compile the flags of the s command 421 */ 422 static char * 423 compile_flags(p, s) 424 char *p; 425 struct s_subst *s; 426 { 427 int gn; /* True if we have seen g or n */ 428 char wfile[_POSIX2_LINE_MAX + 1], *q; 429 430 s->n = 1; /* Default */ 431 s->p = 0; 432 s->wfile = NULL; 433 s->wfd = -1; 434 for (gn = 0;;) { 435 EATSPACE(); /* EXTENSION */ 436 switch (*p) { 437 case 'g': 438 if (gn) 439 err(COMPILE, 440 "more than one number or 'g' in substitute flags"); 441 gn = 1; 442 s->n = 0; 443 break; 444 case '\0': 445 case '\n': 446 case ';': 447 return (p); 448 case 'p': 449 s->p = 1; 450 break; 451 case '1': case '2': case '3': 452 case '4': case '5': case '6': 453 case '7': case '8': case '9': 454 if (gn) 455 err(COMPILE, 456 "more than one number or 'g' in substitute flags"); 457 gn = 1; 458 /* XXX Check for overflow */ 459 s->n = (int)strtol(p, &p, 10); 460 break; 461 case 'w': 462 p++; 463 #ifdef HISTORIC_PRACTICE 464 if (*p != ' ') { 465 err(WARNING, "space missing before w wfile"); 466 return (p); 467 } 468 #endif 469 EATSPACE(); 470 q = wfile; 471 while (*p) { 472 if (*p == '\n') 473 break; 474 *q++ = *p++; 475 } 476 *q = '\0'; 477 if (q == wfile) 478 err(COMPILE, "no wfile specified"); 479 s->wfile = strdup(wfile); 480 if (!aflag && (s->wfd = open(wfile, 481 O_WRONLY|O_APPEND|O_CREAT|O_TRUNC, 482 DEFFILEMODE)) == -1) 483 err(FATAL, "%s: %s\n", wfile, strerror(errno)); 484 return (p); 485 default: 486 err(COMPILE, 487 "bad flag in substitute command: '%c'", *p); 488 break; 489 } 490 p++; 491 } 492 } 493 494 /* 495 * Compile a translation set of strings into a lookup table. 496 */ 497 static char * 498 compile_tr(p, transtab) 499 char *p; 500 char **transtab; 501 { 502 int i; 503 char *lt, *op, *np; 504 char old[_POSIX2_LINE_MAX + 1]; 505 char new[_POSIX2_LINE_MAX + 1]; 506 507 if (*p == '\0' || *p == '\\') 508 err(COMPILE, 509 "transform pattern can not be delimited by newline or backslash"); 510 p = compile_delimited(p, old); 511 if (p == NULL) { 512 err(COMPILE, "unterminated transform source string"); 513 return (NULL); 514 } 515 p = compile_delimited(--p, new); 516 if (p == NULL) { 517 err(COMPILE, "unterminated transform target string"); 518 return (NULL); 519 } 520 EATSPACE(); 521 if (strlen(new) != strlen(old)) { 522 err(COMPILE, "transform strings are not the same length"); 523 return (NULL); 524 } 525 /* We assume characters are 8 bits */ 526 lt = xmalloc(UCHAR_MAX); 527 for (i = 0; i <= UCHAR_MAX; i++) 528 lt[i] = (char)i; 529 for (op = old, np = new; *op; op++, np++) 530 lt[(u_char)*op] = *np; 531 *transtab = lt; 532 return (p); 533 } 534 535 /* 536 * Compile the text following an a or i command. 537 */ 538 static char * 539 compile_text() 540 { 541 int asize, size; 542 char *text, *p, *op, *s; 543 char lbuf[_POSIX2_LINE_MAX + 1]; 544 545 asize = 2 * _POSIX2_LINE_MAX + 1; 546 text = xmalloc(asize); 547 size = 0; 548 while (cu_fgets(lbuf, sizeof(lbuf))) { 549 op = s = text + size; 550 p = lbuf; 551 EATSPACE(); 552 for (; *p; p++) { 553 if (*p == '\\') 554 p++; 555 *s++ = *p; 556 } 557 size += s - op; 558 if (p[-2] != '\\') { 559 *s = '\0'; 560 break; 561 } 562 if (asize - size < _POSIX2_LINE_MAX + 1) { 563 asize *= 2; 564 text = xmalloc(asize); 565 } 566 } 567 return (xrealloc(text, size + 1)); 568 } 569 570 /* 571 * Get an address and return a pointer to the first character after 572 * it. Fill the structure pointed to according to the address. 573 */ 574 static char * 575 compile_addr(p, a) 576 char *p; 577 struct s_addr *a; 578 { 579 char *end; 580 581 switch (*p) { 582 case '\\': /* Context address */ 583 ++p; 584 /* FALLTHROUGH */ 585 case '/': /* Context address */ 586 p = compile_re(p, &a->u.r); 587 if (p == NULL) 588 err(COMPILE, "unterminated regular expression"); 589 a->type = AT_RE; 590 return (p); 591 592 case '$': /* Last line */ 593 a->type = AT_LAST; 594 return (p + 1); 595 /* Line number */ 596 case '0': case '1': case '2': case '3': case '4': 597 case '5': case '6': case '7': case '8': case '9': 598 a->type = AT_LINE; 599 a->u.l = strtol(p, &end, 10); 600 return (end); 601 default: 602 err(COMPILE, "expected context address"); 603 return (NULL); 604 } 605 } 606 607 /* 608 * Return a copy of all the characters up to \n or \0 609 */ 610 static char * 611 duptoeol(s) 612 register char *s; 613 { 614 size_t len; 615 char *start; 616 617 for (start = s; *s != '\0' && *s != '\n'; ++s); 618 *s = '\0'; 619 len = s - start + 1; 620 return (memmove(xmalloc(len), start, len)); 621 } 622 623 /* 624 * Find the label contained in the command l in the command linked list cp. 625 * L is excluded from the search. Return NULL if not found. 626 */ 627 static struct s_command * 628 findlabel(l, cp) 629 struct s_command *l, *cp; 630 { 631 struct s_command *r; 632 633 for (; cp; cp = cp->next) 634 if (cp->code == ':' && cp != l && strcmp(l->t, cp->t) == 0) 635 return (cp); 636 else if (cp->code == '{' && (r = findlabel(l, cp->u.c))) 637 return (r); 638 return (NULL); 639 } 640 641 /* 642 * Convert goto label names to addresses. 643 * Detect duplicate labels. 644 * Set appendnum to the number of a and r commands in the script. 645 * Free the memory used by labels in b and t commands (but not by :) 646 * Root is a pointer to the script linked list; cp points to the 647 * search start. 648 * TODO: Remove } nodes 649 */ 650 static void 651 fixuplabel(root, cp, end) 652 struct s_command *root, *cp, *end; 653 { 654 struct s_command *cp2; 655 656 for (; cp != end; cp = cp->next) 657 switch (cp->code) { 658 case ':': 659 if (findlabel(cp, root)) 660 err(COMPILE2, "duplicate label %s", cp->t); 661 break; 662 case 'a': 663 case 'r': 664 appendnum++; 665 break; 666 case 'b': 667 case 't': 668 if (cp->t == NULL) { 669 cp->u.c = NULL; 670 break; 671 } 672 if ((cp2 = findlabel(cp, root)) == NULL) 673 err(COMPILE2, "undefined label '%s'", cp->t); 674 free(cp->t); 675 cp->u.c = cp2; 676 break; 677 case '{': 678 fixuplabel(root, cp->u.c, cp->next); 679 break; 680 } 681 } 682