1 /***********************************************************************
2 * *
3 * This software is part of the ast package *
4 * Copyright (c) 1995-2013 AT&T Intellectual Property *
5 * and is licensed under the *
6 * Eclipse Public License, Version 1.0 *
7 * by AT&T Intellectual Property *
8 * *
9 * A copy of the License is available at *
10 * http://www.eclipse.org/org/documents/epl-v10.html *
11 * (with md5 checksum b35adb5213ca9657e911e9befb180842) *
12 * *
13 * Information and Software Systems Research *
14 * AT&T Research *
15 * Florham Park NJ *
16 * *
17 * Glenn Fowler <glenn.s.fowler@gmail.com> *
18 * *
19 ***********************************************************************/
20 #pragma prototyped
21
22 #include "sed.h"
23
24 #include <ctype.h>
25
/* <string.h> routines applied to unsigned char strings */
#define ustrlen(s)		strlen((char*)(s))
#define ustrcmp(a, b)		strcmp((char*)(a), (char*)(b))
#define ustrcpy(dst, src)	(unsigned char*)strcpy((char*)(dst), (char*)(src))
#define ustrchr(s, ch)		(unsigned char*)strchr((char*)(s), ch)
30
31 int blank(Text*);
32 void fixlabels(Text*);
33 void fixbrack(Text*);
34 void ckludge(Text*, int, int, int, Text*);
35 int addr(Text*, Text*);
36 word* instr(unsigned char*);
37 unsigned char *succi(unsigned char*);
38
39 #if DEBUG
40 extern void regdump(regex_t*); /* secret entry into regex pkg */
41 #endif
42
43 static Text rebuf;
44
/*
 * adrs[c] is the maximum number of addresses command character c
 * accepts (0, 1 or 2); 3 marks a character that is not a command.
 * One row per 16 character codes.
 */
static const unsigned char adrs[UCHAR_MAX+1] = {
	/* 0x00 */ 0,3,3,3, 3,3,3,3, 3,3,0,3, 3,3,3,3,	/* <nl> */
	/* 0x10 */ 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3,
	/* 0x20 */ 3,2,3,0, 3,3,3,3, 3,3,3,3, 3,3,3,3,	/* !# */
	/* 0x30 */ 3,3,3,3, 3,3,3,3, 3,3,0,0, 3,1,3,3,	/* :;= */
	/* 0x40 */ 3,3,3,3, 2,3,3,2, 2,3,3,3, 3,3,2,3,	/* DGHN */
	/* 0x50 */ 2,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3,	/* P */
	/* 0x60 */ 3,1,2,2, 2,3,3,2, 2,1,3,3, 2,3,2,3,	/* a-n */
	/* 0x70 */ 2,1,2,2, 2,3,3,2, 2,2,3,2, 3,0,3,3,	/* p-y{} */
	/* 0x80 */ 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3,
	/* 0x90 */ 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3,
	/* 0xa0 */ 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3,
	/* 0xb0 */ 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3,
	/* 0xc0 */ 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3,
	/* 0xd0 */ 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3,
	/* 0xe0 */ 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3,
	/* 0xf0 */ 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3
};
63
/*
 * Commands that have the same compilation method share one routine;
 * each alias is the name used for that command in the docom[] table.
 */

/* compiled by Tc() */
#define Dc	Tc
#define Ec	Tc
#define Gc	Tc
#define Hc	Tc
#define Nc	Tc
#define Pc	Tc
#define dc	Tc
#define gc	Tc
#define hc	Tc
#define lc	Tc
#define nc	Tc
#define pc	Tc
#define xc	Tc

/* compiled by bc() */
#define tc	bc

/* compiled by ac() */
#define cc	ac
#define ic	ac

/* current line pointer for syntax errors */
unsigned char	*synl;
82
83 /* COMMAND LAYOUT */
84
85 int
blank(Text * t)86 blank(Text *t)
87 {
88 if(*t->w==' ' || *t->w=='\t' || *t->w=='\r') {
89 t->w++;
90 return 1;
91 } else
92 return 0;
93 }
94
95 word *
instr(unsigned char * p)96 instr(unsigned char *p) /* get address of command word */
97 {
98 word *q = (word*)p;
99 while((*q & IMASK) != IMASK)
100 q++;
101 return q;
102 }
103
104 unsigned char *
succi(unsigned char * p)105 succi(unsigned char *p)
106 {
107 word *q = instr(p);
108 if(code(*q) == '{')
109 return (unsigned char*)(q+1);
110 else
111 return p + (*q & LMASK);
112 }
113
114 word
pack(int neg,int cmd,word length)115 pack(int neg, int cmd, word length)
116 {
117 int l = length & LMASK;
118 if(length != l)
119 syntax("<command-list> or <text> too long");
120 return IMASK | neg | cmd << 2*BYTE | l;
121 }
122
123 void
putword(Text * s,word n)124 putword(Text *s, word n)
125 {
126 assure(s, sizeof(word));
127 *(word*)s->w = n;
128 s->w += sizeof(word);
129 }
130
131 int
number(Text * t)132 number(Text *t)
133 {
134 unsigned n = 0;
135 while(isdigit(*t->w)) {
136 if(n > (INT_MAX-9)/10)
137 syntax("number too big");
138 n = n*10 + *t->w++ - '0';
139 }
140 return n;
141 }
142
143 int
addr(Text * script,Text * t)144 addr(Text *script, Text *t)
145 {
146 word n;
147 if(reflags & REG_LENIENT)
148 while(*t->w == ' ' || *t->w == '\t' || *t->w == '\r')
149 t->w++;
150 switch(*t->w) {
151 default:
152 return 0;
153 case '$':
154 t->w++;
155 n = DOLLAR;
156 break;
157 case '\\':
158 t->w++;
159 case '/':
160 n = recomp(&rebuf, t, 0) | REGADR;
161 break;
162 case '0': case '1': case '2': case '3': case '4':
163 case '5': case '6': case '7': case '8': case '9':
164 n = number(t);
165 if(n == 0)
166 syntax("address is zero");
167 }
168 putword(script, n);
169 if(reflags & REG_LENIENT)
170 while(*t->w == ' ' || *t->w == '\t' || *t->w == '\r')
171 t->w++;
172 return 1;
173 }
174
175 regex_t *
readdr(word x)176 readdr(word x)
177 {
178 return (regex_t*)(rebuf.s + (x&AMASK));
179 }
180
181 /* LABEL HANDLING */
182
183 /* the labels array consists of int values followed by strings.
184 value -1 means unassigned; other values are relative to the
185 beginning of the script
186
187 on the first pass, every script ref to a label becomes the
188 integer offset of that label in the labels array, or -1 if
189 it is a branch to the end of script
190
191 on the second pass (fixlabels), the script ref is replaced
192 by the value from the labels array. */
193
194 Text labels;
195
196 word *
lablook(unsigned char * l,Text * labels)197 lablook(unsigned char *l, Text *labels)
198 {
199 unsigned char *p, *q;
200 word n, m;
201 assure(labels, 1);
202 for(p = labels->s; p < labels->w; ) {
203 q = p + sizeof(word);
204 if(ustrcmp(q, l) == 0)
205 return (word*)p;
206 q += ustrlen(q) + 1;
207 p = (unsigned char*)wordp(q);
208 }
209 n = ustrlen(l);
210 m = (p - labels->s);
211 assure(labels, sizeof(word)+n+1+sizeof(word));
212 p = labels->s + m;
213 *(word*)p = -1;
214 q = p + sizeof(word);
215 ustrcpy(q, l);
216 q += ustrlen(q) + 1;
217 labels->w = (unsigned char*)wordp(q);
218 return (word*)p;
219 }
220
221 /* find pos in label list; assign value i to label if i>=0 */
222
223 word
getlab(Text * t,word i)224 getlab(Text *t, word i)
225 {
226 word *p;
227 unsigned char *u;
228 while(blank(t)); /* not exactly posix */
229 for(u=t->w; *t->w!='\n'; t->w++)
230 if(!isprint(*t->w) || *t->w==' ' || *t->w=='\t' || *t->w=='\r')
231 synwarn("invisible character in name");
232 if(u == t->w)
233 return -1;
234 *t->w = 0;
235 p = lablook(u, &labels);
236 if(*p == -1)
237 *p = i;
238 else if(i != -1)
239 syntax("duplicate label");
240 *t->w = '\n';
241 return (unsigned char*)p - labels.s;
242 }
243
244 void
Cc(Text * script,Text * t)245 Cc(Text *script, Text *t) /* colon */
246 {
247 if(getlab(t, script->w - sizeof(word) - script->s) == -1)
248 syntax("missing label");
249 }
250
251 void
bc(Text * script,Text * t)252 bc(Text *script, Text *t)
253 {
254 word g;
255 g = getlab(t, -1); /* relative pointer to label list */
256 putword(script, g);
257 }
258
259 void
fixlabels(Text * script)260 fixlabels(Text *script)
261 {
262 unsigned char *p;
263 word *q;
264 for(p=script->s; p<script->w; p=succi(p)) {
265 q = instr(p);
266 switch(code(*q)) {
267 case 't':
268 case 'b':
269 if(q[1] == -1)
270 q[1] = script->w - script->s;
271 else if(*(word*)(labels.s+q[1]) != -1)
272 q[1] = *(word*)(labels.s+q[1]);
273 else
274 error(3, "undefined label: %s",
275 labels.s+q[1]+sizeof(word));
276 }
277 }
278 free(labels.s);
279 }
280
281 /* FILES */
282
283 Text files;
284
285 void
rc(Text * script,Text * t)286 rc(Text *script, Text *t)
287 {
288 unsigned char *u;
289 if(!blank(t))
290 synwarn("no space before file name");
291 while(blank(t)) ;
292 for(u=t->w; *t->w!='\n'; t->w++) ;
293 if(u == t->w)
294 syntax("missing file name");
295 *t->w = 0;
296 putword(script, (unsigned char*)lablook(u, &files) - files.s);
297 *t->w = '\n';
298 }
299
300 void
wc(Text * script,Text * t)301 wc(Text *script, Text *t)
302 {
303 word *p;
304 rc(script, t);
305 p = (word*)(files.s + ((word*)script->w)[-1]);
306 if(*p != -1)
307 return;
308 *(Sfio_t**)p = sfopen(NiL, (char*)(p+1), "w");
309 if(*(Sfio_t**)p == 0)
310 syntax("can't open file for writing");
311 }
312
313 /* BRACKETS */
314
315 Text brack;
316
317 /* Lc() stacks (in brack) the location of the { command word.
318 Rc() stuffs into that word the offset of the } sequel
319 relative to the command word.
320 fixbrack() modifies the offset to be relative to the
321 beginning of the instruction, including addresses. */
322
323 void /* { */
Lc(Text * script,Text * t)324 Lc(Text *script, Text *t)
325 {
326 while(blank(t));
327 putword(&brack, script->w - sizeof(word) - script->s);
328 }
329
330 void /* } */
Rc(Text * script,Text * t)331 Rc(Text *script, Text *t)
332 {
333 word l;
334 word *p;
335 t = t;
336 if(brack.w == 0 || (brack.w-=sizeof(word)) < brack.s)
337 syntax("unmatched }");
338 l = *(word*)brack.w;
339 p = (word*)(script->s + l);
340 l = script->w - script->s - l;
341 if(l >= LMASK - 3*sizeof(word)) /* fixbrack could add 3 */
342 syntax("{command-list} too long)");
343 *p = (*p&~LMASK) | l;
344 }
345
346 void
fixbrack(Text * script)347 fixbrack(Text *script)
348 {
349 unsigned char *p;
350 word *q;
351 if(brack.w == 0)
352 return;
353 if(brack.w > brack.s)
354 syntax("unmatched {");
355 for(p=script->s; p<script->w; p=succi(p)) {
356 q = instr(p);
357 if(code(*q) == '{')
358 *q += (unsigned char*)q - p;
359 }
360 free(brack.s);
361 }
362
363 /* EASY COMMANDS */
364
365 void
Xc(Text * script,Text * t)366 Xc(Text *script, Text *t) /* # */
367 {
368 script = script; /* avoid use/set diagnostics */
369 if(t->s[1]=='n')
370 nflag = 1;
371 while(*t->w != '\n')
372 t->w++;
373 }
374
375 void
Ic(Text * script,Text * t)376 Ic(Text *script, Text *t) /* ignore */
377 {
378 script = script;
379 t->w--;
380 }
381
382 void
Tc(Text * script,Text * t)383 Tc(Text *script, Text *t) /* trivial to compile */
384 {
385 script = script;
386 t = t;
387 }
388
389 void
xx(Text * script,Text * t)390 xx(Text *script, Text *t)
391 {
392 script = script;
393 t = t;
394 syntax("unknown command");
395 }
396
397 /* MISCELLANY */
398
399 void
ac(Text * script,Text * t)400 ac(Text *script, Text *t)
401 {
402 if(*t->w++ != '\\' || *t->w++ != '\n')
403 syntax("\\<newline> missing after command");
404 for(;;) {
405 while(bflag && blank(t)) ;
406 assure(script, 2 + sizeof(word));
407 switch(*t->w) {
408 case 0:
409 error(ERROR_PANIC|4, "bug: missed end of <text>");
410 case '\n':
411 *script->w++ = *t->w;
412 *script->w++ = 0;
413 script->w = (unsigned char*)wordp(script->w);
414 return;
415 case '\\':
416 t->w++;
417 default:
418 *script->w++ = *t->w++;
419 }
420 }
421 }
422
423 void
qc(Text * script,Text * t)424 qc(Text *script, Text *t)
425 {
426 sfset(sfstdin, SF_SHARE, 1);
427 script = script;
428 t = t;
429 }
430
431 void
sc(Text * script,Text * t)432 sc(Text *script, Text *t)
433 {
434 regex_t* re;
435 word n;
436 int c;
437 n = recomp(&rebuf, t, 1);
438 putword(script, n);
439 re = readdr(n);
440 if(c = regsubcomp(re, (char*)t->w, NiL, 0, 0))
441 badre(re, c);
442 t->w += re->re_npat;
443 script->w = (unsigned char*)wordp(script->w);
444 if(re->re_sub->re_flags & REG_SUB_WRITE)
445 wc(script, t);
446 }
447
448 void
yc(Text * script,Text * t)449 yc(Text *script, Text *t)
450 {
451 word i, m, x;
452 int delim;
453 unsigned char *s, *pb, *qb;
454 unsigned char *p, *q, *o, *v, **w;
455 int pc, qc;
456 wchar_t wc;
457 Mbstate_t oq, pq, qq;
458 m = 0;
459 if(mbwide()) {
460 mbinit(&pq);
461 pb = t->w;
462 if((delim = mbchar(&wc, pb, t->e - pb, &pq)) == '\n' || delim=='\\')
463 syntax("missing delimiter");
464 mbinit(&pq);
465 p = pb;
466 while((o=p),(pc = mbchar(&wc, p, t->e - p, &pq))!=delim) {
467 if(pc=='\n')
468 syntax("missing delimiter");
469 if(pc=='\\') {
470 o = p;
471 pc = mbchar(&wc, p, t->e - p, &pq);
472 }
473 if((p-o)>1 && pc>m)
474 m = pc;
475 }
476 }
477 if(m) {
478 x = 0;
479 qb = p;
480 while((o=p), (pc = mbchar(&wc, p, t->e - p, &pq))!=delim) {
481 if(pc=='\\') {
482 o = p;
483 pc = mbchar(&wc, p, t->e - p, &pq);
484 }
485 x += (p-o)+1;
486 }
487 x = roundof(x, sizeof(word));
488 m++;
489 assure(script, (m+1)*sizeof(unsigned char*)+x);
490 w = (unsigned char**)script->w;
491 *w++ = (unsigned char*)0 + m;
492 script->w += (m+1)*sizeof(unsigned char*);
493 v = (unsigned char*)script->w;
494 script->w += x;
495 for(i=0; i<m; i++)
496 w[i] = 0;
497 mbinit(&pq);
498 p = pb;
499 mbinit(&qq);
500 q = qb;
501 while((pb=p), (oq = pq), (pc = mbchar(&wc, p, t->e - p, &pq))!=delim) {
502 if(pc=='\\') {
503 if((qc = mbchar(&wc, p, t->e - p, &pq))=='n')
504 pc = '\n';
505 else if(qc==delim || qc=='\\')
506 pc = qc;
507 else {
508 p = pb;
509 pq = oq;
510 }
511 }
512 oq = qq;
513 qb = q;
514 if((qc = mbchar(&wc, q, t->e - q, &qq)) == '\n')
515 syntax("missing delimiter");
516 if(qc==delim)
517 syntax("string lengths differ");
518 if(qc=='\\') {
519 qq = oq;
520 if((qc = mbchar(&wc, q, t->e - q, &qq))=='n')
521 *qb = '\n';
522 else if(qc!=delim && qc!='\\') {
523 q = qb;
524 qq = oq;
525 }
526 }
527 i = (q-qb);
528 if(w[pc]) {
529 if(w[pc][0]!=i || memcmp(&w[pc][1], qb, i))
530 syntax("ambiguous map");
531 synwarn("redundant map");
532 }
533 else {
534 w[pc] = v;
535 *v++ = (unsigned char)i;
536 memcpy(v, qb, i);
537 v += i;
538 }
539 }
540 if(mbchar(&wc, q, t->e - q, &qq) != delim)
541 syntax("string lengths differ");
542 }
543 else {
544 if((delim = *t->w++) == '\n' || delim=='\\')
545 syntax("missing delimiter");
546 assure(script, sizeof(unsigned char*)+UCHAR_MAX+1);
547 w = (unsigned char**)script->w;
548 *w++ = 0;
549 s = (unsigned char*)w;
550 script->w += sizeof(unsigned char*)+UCHAR_MAX+1;
551 for(i=0; i<UCHAR_MAX+1; i++)
552 s[i] = 0;
553 for(q=t->w; (qc = *q++)!=delim; ) {
554 if(qc == '\n')
555 syntax("missing delimiter");
556 if(qc=='\\' && *q==delim)
557 q++;
558 }
559 for(p=t->w; (pc = *p++) != delim; ) {
560 if(pc=='\\') {
561 if(*p==delim || *p=='\\')
562 pc = *p++;
563 else if(*p=='n') {
564 p++;
565 pc = '\n';
566 }
567 }
568 if((qc = *q++) == '\n')
569 syntax("missing delimiter");
570 if(qc==delim)
571 syntax("string lengths differ");
572 if(qc=='\\') {
573 if(*q==delim || *q=='\\')
574 qc = *q++;
575 else if(*q=='n') {
576 q++;
577 qc = '\n';
578 }
579 }
580 if(s[pc]) {
581 if(s[pc]!=qc)
582 syntax("ambiguous map");
583 synwarn("redundant map");
584 }
585 s[pc] = qc;
586 }
587 if(*q++ != delim)
588 syntax("string lengths differ");
589 for(i=0; i<UCHAR_MAX+1; i++)
590 if(s[i] == 0)
591 s[i] = (unsigned char)i;
592 }
593 t->w = q;
594 }
595
596 void
synwarn(char * s)597 synwarn(char *s)
598 {
599 unsigned char *t = ustrchr(synl, '\n');
600 error(1, "%s: %.*s", s, t-synl, synl);
601 }
602
603 void
syntax(char * s)604 syntax(char *s)
605 {
606 unsigned char *t = ustrchr(synl, '\n');
607 error(3, "%s: %.*s", s, t-synl, synl);
608 }
609
610 void
badre(regex_t * re,int code)611 badre(regex_t* re, int code)
612 {
613 unsigned char *t = ustrchr(synl, '\n');
614 if(code && code!= REG_NOMATCH) {
615 char buf[UCHAR_MAX+1];
616 regerror(code, re, buf, sizeof(buf));
617 error(3, "%s: %.*s", buf, t-synl, synl);
618 }
619 else
620 error(3, "invalid regular expression: %.*s", t-synl, synl);
621 }
622
623 #if DEBUG
624
625 void
printscript(Text * script)626 printscript(Text *script)
627 {
628 unsigned char *s;
629 word *q;
630 for(s=script->s; s<script->w; s = succi(s)) {
631 q = (word*)s;
632 if((*q&IMASK) != IMASK) {
633 if((*q®ADR) == 0)
634 printf("%d", *q);
635 else
636 regdump((regex_t*)(*q & AMASK));
637 q++;
638 }
639 if((*q&IMASK) != IMASK) {
640 if((*q®ADR) == 0)
641 printf(",%d", *q);
642 else
643 regdump((regex_t*)(*q & AMASK));
644 q += 2;
645 }
646 if(code(*q) == '\n')
647 continue;
648 printf("%s%c\n", *q&NEG?"!":"", code(*q));
649 }
650 }
651
652 #endif
653
654 #if DEBUG & 2
655
656 /* debugging code 2; execute stub.
657 prints the compiled script (without arguments)
658 then each input line with line numbers */
659
660 void
execute(Text * script,Text * y)661 execute(Text *script, Text *y)
662 {
663 if(recno == 1)
664 printscript(script);
665 printf("%d:%s",recno,y->s);
666 }
667
668 #endif
669
670 typedef void (*cmdf)(Text*, Text*);
671
672 static const cmdf docom[128] = {
673 xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,Ic,xx,xx,xx,xx,xx, /* <nl> */
674 xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,
675 xx,Ic,xx,Xc,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx, /* !# */
676 xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,Cc,Ic,xx,Ec,xx,xx, /* :;= */
677 xx,xx,xx,xx,Dc,xx,xx,Gc,Hc,xx,xx,xx,xx,xx,Nc,xx, /* DGHN */
678 Pc,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx, /* P */
679 xx,ac,bc,cc,dc,xx,xx,gc,hc,ic,xx,xx,lc,xx,nc,xx, /* a-n */
680 pc,qc,rc,sc,tc,xx,xx,wc,xc,yc,xx,Lc,xx,Rc,xx,xx /* p-y{} */
681 };
682
683 void
compile(Text * script,Text * t)684 compile(Text *script, Text *t)
685 {
686 word loc; /* progam counter */
687 int neg; /* ! in effect */
688 int cmd;
689 int naddr;
690 word *q; /* address of instruction word */
691 t->w = t->s; /* here w is a read pointer */
692 while(*t->w) {
693 assure(script, 4*sizeof(word));
694 loc = script->w - script->s;
695 synl = t->w;
696 naddr = 0;
697 while(blank(t)) ;
698 naddr += addr(script, t);
699 if(naddr && *t->w ==',') {
700 t->w++;
701 naddr += addr(script, t);
702 if(naddr < 2)
703 syntax("missing address");
704 }
705 q = (word*)script->w;
706 if(naddr == 2)
707 *q++ = INACT;
708 script->w = (unsigned char*)(q+1);
709 neg = 0;
710 for(;;) {
711 while(blank(t));
712 cmd = *t->w++;
713 if(neg && docom[ccmapchr(map,cmd)&0x7f]==Ic)
714 syntax("improper !");
715 if(cmd != '!')
716 break;
717 neg = NEG;
718 }
719 if(!neg) {
720 switch(adrs[ccmapchr(map,cmd)]) {
721 case 1:
722 if(naddr <= 1)
723 break;
724 case 0:
725 if(naddr == 0)
726 break;
727 syntax("too many addresses");
728 }
729 }
730 (*docom[ccmapchr(map,cmd)&0x7f])(script, t);
731 while(*t->w == ' ' || *t->w == '\t' || *t->w == '\r')
732 t->w++;
733 switch(*t->w) {
734 case 0:
735 script->w = script->s + loc;
736 break;
737 case ';':
738 case '\n':
739 t->w++;
740 break;
741 default:
742 if(cmd == '{')
743 break;
744 syntax("junk after command");
745 }
746 *q = pack(neg,cmd,script->w-script->s-loc);
747 }
748 fixbrack(script);
749 fixlabels(script);
750 }
751