xref: /original-bsd/usr.bin/ex/ex_re.c (revision d25e1985)
1 /* Copyright (c) 1980 Regents of the University of California */
2 static char *sccsid = "@(#)ex_re.c	5.1 08/20/80";
3 #include "ex.h"
4 #include "ex_re.h"
5 
6 /*
7  * Global, substitute and regular expressions.
8  * Very similar to ed, with some re extensions and
9  * confirmed substitute.
10  */
11 global(k)
12 	bool k;
13 {
14 	register char *gp;
15 	register int c;
16 	register line *a1;
17 	char globuf[GBSIZE], *Cwas;
18 	int lines = lineDOL();
19 	int oinglobal = inglobal;
20 	char *oglobp = globp;
21 
22 	Cwas = Command;
23 	/*
24 	 * States of inglobal:
25 	 *  0: ordinary - not in a global command.
26 	 *  1: text coming from some buffer, not tty.
27 	 *  2: like 1, but the source of the buffer is a global command.
28 	 * Hence you're only in a global command if inglobal==2. This
29 	 * strange sounding convention is historically derived from
30 	 * everybody simulating a global command.
31 	 */
32 	if (inglobal==2)
33 		error("Global within global@not allowed");
34 	markDOT();
35 	setall();
36 	nonzero();
37 	if (skipend())
38 		error("Global needs re|Missing regular expression for global");
39 	c = getchar();
40 	ignore(compile(c, 1));
41 	savere(scanre);
42 	gp = globuf;
43 	while ((c = getchar()) != '\n') {
44 		switch (c) {
45 
46 		case EOF:
47 			c = '\n';
48 			goto brkwh;
49 
50 		case '\\':
51 			c = getchar();
52 			switch (c) {
53 
54 			case '\\':
55 				ungetchar(c);
56 				break;
57 
58 			case '\n':
59 				break;
60 
61 			default:
62 				*gp++ = '\\';
63 				break;
64 			}
65 			break;
66 		}
67 		*gp++ = c;
68 		if (gp >= &globuf[GBSIZE - 2])
69 			error("Global command too long");
70 	}
71 brkwh:
72 	ungetchar(c);
73 out:
74 	newline();
75 	*gp++ = c;
76 	*gp++ = 0;
77 	saveall();
78 	inglobal = 2;
79 	for (a1 = one; a1 <= dol; a1++) {
80 		*a1 &= ~01;
81 		if (a1 >= addr1 && a1 <= addr2 && execute(0, a1) == k)
82 			*a1 |= 01;
83 	}
84 	/* should use gdelete from ed to avoid n**2 here on g/.../d */
85 	if (inopen)
86 		inopen = -1;
87 	for (a1 = one; a1 <= dol; a1++) {
88 		if (*a1 & 01) {
89 			*a1 &= ~01;
90 			dot = a1;
91 			globp = globuf;
92 			commands(1, 1);
93 			a1 = zero;
94 		}
95 	}
96 	globp = oglobp;
97 	inglobal = oinglobal;
98 	endline = 1;
99 	Command = Cwas;
100 	netchHAD(lines);
101 	setlastchar(EOF);
102 	if (inopen) {
103 		ungetchar(EOF);
104 		inopen = 1;
105 	}
106 }
107 
108 bool	cflag;
109 int	scount, slines, stotal;
110 
111 substitute(c)
112 	int c;
113 {
114 	register line *addr;
115 	register int n;
116 	int gsubf;
117 
118 	gsubf = compsub(c);
119 	if(FIXUNDO)
120 		save12(), undkind = UNDCHANGE;
121 	stotal = 0;
122 	slines = 0;
123 	for (addr = addr1; addr <= addr2; addr++) {
124 		scount = 0;
125 		if (dosubcon(0, addr) == 0)
126 			continue;
127 		if (gsubf) {
128 #ifdef notdef
129 			/*
130 			 * should check but loc2 is already munged.
131 			 * This needs a fancier check later.
132 			 */
133 			if (loc1 == loc2)
134 				error("substitution loop");
135 #endif
136 			while (*loc2)
137 				if (dosubcon(1, addr) == 0)
138 					break;
139 		}
140 		if (scount) {
141 			stotal += scount;
142 			slines++;
143 			putmark(addr);
144 			n = append(getsub, addr);
145 			addr += n;
146 			addr2 += n;
147 		}
148 	}
149 	if (stotal == 0 && !inglobal && !cflag)
150 		error("Fail|Substitute pattern match failed");
151 	snote(stotal, slines);
152 	return (stotal);
153 }
154 
155 compsub(ch)
156 {
157 	register int seof, c, uselastre;
158 	static int gsubf;
159 
160 	if (!value(EDCOMPATIBLE))
161 		gsubf = cflag = 0;
162 	uselastre = 0;
163 	switch (ch) {
164 
165 	case 's':
166 		ignore(skipwh());
167 		seof = getchar();
168 		if (endcmd(seof) || any(seof, "gcr")) {
169 			ungetchar(seof);
170 			goto redo;
171 		}
172 		if (isalpha(seof) || isdigit(seof))
173 			error("Substitute needs re|Missing regular expression for substitute");
174 		seof = compile(seof, 1);
175 		uselastre = 1;
176 		comprhs(seof);
177 		gsubf = 0;
178 		cflag = 0;
179 		break;
180 
181 	case '~':
182 		uselastre = 1;
183 		/* fall into ... */
184 	case '&':
185 	redo:
186 		if (re.Expbuf[0] == 0)
187 			error("No previous re|No previous regular expression");
188 		if (subre.Expbuf[0] == 0)
189 			error("No previous substitute re|No previous substitute to repeat");
190 		break;
191 	}
192 	for (;;) {
193 		c = getchar();
194 		switch (c) {
195 
196 		case 'g':
197 			gsubf = !gsubf;
198 			continue;
199 
200 		case 'c':
201 			cflag = !cflag;
202 			continue;
203 
204 		case 'r':
205 			uselastre = 1;
206 			continue;
207 
208 		default:
209 			ungetchar(c);
210 			setcount();
211 			newline();
212 			if (uselastre)
213 				savere(subre);
214 			else
215 				resre(subre);
216 			return (gsubf);
217 		}
218 	}
219 }
220 
221 comprhs(seof)
222 	int seof;
223 {
224 	register char *rp, *orp;
225 	register int c;
226 	char orhsbuf[LBSIZE / 2];
227 
228 	rp = rhsbuf;
229 	CP(orhsbuf, rp);
230 	for (;;) {
231 		c = getchar();
232 		if (c == seof)
233 			break;
234 		switch (c) {
235 
236 		case '\\':
237 			c = getchar();
238 			if (c == EOF) {
239 				ungetchar(c);
240 				break;
241 			}
242 			if (value(MAGIC)) {
243 				/*
244 				 * When "magic", \& turns into a plain &,
245 				 * and all other chars work fine quoted.
246 				 */
247 				if (c != '&')
248 					c |= QUOTE;
249 				break;
250 			}
251 magic:
252 			if (c == '~') {
253 				for (orp = orhsbuf; *orp; *rp++ = *orp++)
254 					if (rp >= &rhsbuf[LBSIZE / 2 + 1])
255 						goto toobig;
256 				continue;
257 			}
258 			c |= QUOTE;
259 			break;
260 
261 		case '\n':
262 		case EOF:
263 			if (!(globp && globp[0])) {
264 				ungetchar(c);
265 				goto endrhs;
266 			}
267 
268 		case '~':
269 		case '&':
270 			if (value(MAGIC))
271 				goto magic;
272 			break;
273 		}
274 		if (rp >= &rhsbuf[LBSIZE / 2 - 1])
275 toobig:
276 			error("Replacement pattern too long@- limit 256 characters");
277 		*rp++ = c;
278 	}
279 endrhs:
280 	*rp++ = 0;
281 }
282 
283 getsub()
284 {
285 	register char *p;
286 
287 	if ((p = linebp) == 0)
288 		return (EOF);
289 	strcLIN(p);
290 	linebp = 0;
291 	return (0);
292 }
293 
294 dosubcon(f, a)
295 	bool f;
296 	line *a;
297 {
298 
299 	if (execute(f, a) == 0)
300 		return (0);
301 	if (confirmed(a)) {
302 		dosub();
303 		scount++;
304 	}
305 	return (1);
306 }
307 
308 confirmed(a)
309 	line *a;
310 {
311 	register int c, ch;
312 
313 	if (cflag == 0)
314 		return (1);
315 	pofix();
316 	pline(lineno(a));
317 	if (inopen)
318 		putchar('\n' | QUOTE);
319 	c = column(loc1 - 1);
320 	ugo(c - 1 + (inopen ? 1 : 0), ' ');
321 	ugo(column(loc2 - 1) - c, '^');
322 	flush();
323 	ch = c = getkey();
324 again:
325 	if (c == '\r')
326 		c = '\n';
327 	if (inopen)
328 		putchar(c), flush();
329 	if (c != '\n' && c != EOF) {
330 		c = getkey();
331 		goto again;
332 	}
333 	noteinp();
334 	return (ch == 'y');
335 }
336 
337 getch()
338 {
339 	char c;
340 
341 	if (read(2, &c, 1) != 1)
342 		return (EOF);
343 	return (c & TRIM);
344 }
345 
/*
 * Output the character with cnt times; nothing is printed when cnt is
 * zero or negative.
 */
ugo(cnt, with)
	int cnt;
	int with;
{

	while (cnt-- > 0)
		putchar(with);
}
356 
357 int	casecnt;
358 bool	destuc;
359 
360 dosub()
361 {
362 	register char *lp, *sp, *rp;
363 	int c;
364 
365 	lp = linebuf;
366 	sp = genbuf;
367 	rp = rhsbuf;
368 	while (lp < loc1)
369 		*sp++ = *lp++;
370 	casecnt = 0;
371 	while (c = *rp++) {
372 		if (c & QUOTE)
373 			switch (c & TRIM) {
374 
375 			case '&':
376 				sp = place(sp, loc1, loc2);
377 				if (sp == 0)
378 					goto ovflo;
379 				continue;
380 
381 			case 'l':
382 				casecnt = 1;
383 				destuc = 0;
384 				continue;
385 
386 			case 'L':
387 				casecnt = LBSIZE;
388 				destuc = 0;
389 				continue;
390 
391 			case 'u':
392 				casecnt = 1;
393 				destuc = 1;
394 				continue;
395 
396 			case 'U':
397 				casecnt = LBSIZE;
398 				destuc = 1;
399 				continue;
400 
401 			case 'E':
402 			case 'e':
403 				casecnt = 0;
404 				continue;
405 			}
406 		if (c < 0 && (c &= TRIM) >= '1' && c < nbra + '1') {
407 			sp = place(sp, braslist[c - '1'], braelist[c - '1']);
408 			if (sp == 0)
409 				goto ovflo;
410 			continue;
411 		}
412 		if (casecnt)
413 			*sp++ = fixcase(c & TRIM);
414 		else
415 			*sp++ = c & TRIM;
416 		if (sp >= &genbuf[LBSIZE])
417 ovflo:
418 			error("Line overflow@in substitute");
419 	}
420 	lp = loc2;
421 	loc2 = sp + (linebuf - genbuf);
422 	while (*sp++ = *lp++)
423 		if (sp >= &genbuf[LBSIZE])
424 			goto ovflo;
425 	strcLIN(genbuf);
426 }
427 
428 fixcase(c)
429 	register int c;
430 {
431 
432 	if (casecnt == 0)
433 		return (c);
434 	casecnt--;
435 	if (destuc) {
436 		if (islower(c))
437 			c = toupper(c);
438 	} else
439 		if (isupper(c))
440 			c = tolower(c);
441 	return (c);
442 }
443 
444 char *
445 place(sp, l1, l2)
446 	register char *sp, *l1, *l2;
447 {
448 
449 	while (l1 < l2) {
450 		*sp++ = fixcase(*l1++);
451 		if (sp >= &genbuf[LBSIZE])
452 			return (0);
453 	}
454 	return (sp);
455 }
456 
/*
 * Report the outcome of a substitute: total substitutions and, unless
 * that would add nothing (one line, or one substitution per line), the
 * number of lines affected.  Nothing is printed when notable(total) is
 * false.
 */
snote(total, lines)
	register int total, lines;
{

	if (notable(total)) {
		printf(mesg("%d subs|%d substitutions"), total);
		if (!(lines == 1 || lines == total))
			printf(" on %d lines", lines);
		noonl();
		flush();
	}
}
469 
470 compile(eof, oknl)
471 	int eof;
472 	int oknl;
473 {
474 	register int c;
475 	register char *ep;
476 	char *lastep;
477 	char bracket[NBRA], *bracketp, *rhsp;
478 	int cclcnt;
479 
480 	if (isalpha(eof) || isdigit(eof))
481 		error("Regular expressions cannot be delimited by letters or digits");
482 	ep = expbuf;
483 	c = getchar();
484 	if (eof == '\\')
485 		switch (c) {
486 
487 		case '/':
488 		case '?':
489 			if (scanre.Expbuf[0] == 0)
490 error("No previous scan re|No previous scanning regular expression");
491 			resre(scanre);
492 			return (c);
493 
494 		case '&':
495 			if (subre.Expbuf[0] == 0)
496 error("No previous substitute re|No previous substitute regular expression");
497 			resre(subre);
498 			return (c);
499 
500 		default:
501 			error("Badly formed re|Regular expression \\ must be followed by / or ?");
502 		}
503 	if (c == eof || c == '\n' || c == EOF) {
504 		if (*ep == 0)
505 			error("No previous re|No previous regular expression");
506 		if (c == '\n' && oknl == 0)
507 			error("Missing closing delimiter@for regular expression");
508 		if (c != eof)
509 			ungetchar(c);
510 		return (eof);
511 	}
512 	bracketp = bracket;
513 	nbra = 0;
514 	circfl = 0;
515 	if (c == '^') {
516 		c = getchar();
517 		circfl++;
518 	}
519 	ungetchar(c);
520 	for (;;) {
521 		if (ep >= &expbuf[ESIZE - 2])
522 complex:
523 			cerror("Re too complex|Regular expression too complicated");
524 		c = getchar();
525 		if (c == eof || c == EOF) {
526 			if (bracketp != bracket)
527 cerror("Unmatched \\(|More \\('s than \\)'s in regular expression");
528 			*ep++ = CEOFC;
529 			if (c == EOF)
530 				ungetchar(c);
531 			return (eof);
532 		}
533 		if (value(MAGIC)) {
534 			if (c != '*' || ep == expbuf)
535 				lastep = ep;
536 		} else
537 			if (c != '\\' || peekchar() != '*' || ep == expbuf)
538 				lastep = ep;
539 		switch (c) {
540 
541 		case '\\':
542 			c = getchar();
543 			switch (c) {
544 
545 			case '(':
546 				if (nbra >= NBRA)
547 cerror("Awash in \\('s!|Too many \\('d subexressions in a regular expression");
548 				*bracketp++ = nbra;
549 				*ep++ = CBRA;
550 				*ep++ = nbra++;
551 				continue;
552 
553 			case ')':
554 				if (bracketp <= bracket)
555 cerror("Extra \\)|More \\)'s than \\('s in regular expression");
556 				*ep++ = CKET;
557 				*ep++ = *--bracketp;
558 				continue;
559 
560 			case '<':
561 				*ep++ = CBRC;
562 				continue;
563 
564 			case '>':
565 				*ep++ = CLET;
566 				continue;
567 			}
568 			if (value(MAGIC) == 0)
569 magic:
570 			switch (c) {
571 
572 			case '.':
573 				*ep++ = CDOT;
574 				continue;
575 
576 			case '~':
577 				rhsp = rhsbuf;
578 				while (*rhsp) {
579 					if (*rhsp & QUOTE) {
580 						c = *rhsp & TRIM;
581 						if (c == '&')
582 error("Replacement pattern contains &@- cannot use in re");
583 						if (c >= '1' && c <= '9')
584 error("Replacement pattern contains \\d@- cannot use in re");
585 					}
586 					if (ep >= &expbuf[ESIZE-2])
587 						goto complex;
588 					*ep++ = CCHR;
589 					*ep++ = *rhsp++ & TRIM;
590 				}
591 				continue;
592 
593 			case '*':
594 				if (ep == expbuf)
595 					break;
596 				if (*lastep == CBRA || *lastep == CKET)
597 cerror("Illegal *|Can't * a \\( ... \\) in regular expression");
598 				if (*lastep == CCHR && (lastep[1] & QUOTE))
599 cerror("Illegal *|Can't * a \\n in regular expression");
600 				*lastep |= STAR;
601 				continue;
602 
603 			case '[':
604 				*ep++ = CCL;
605 				*ep++ = 0;
606 				cclcnt = 1;
607 				c = getchar();
608 				if (c == '^') {
609 					c = getchar();
610 					ep[-2] = NCCL;
611 				}
612 				if (c == ']')
613 cerror("Bad character class|Empty character class '[]' or '[^]' cannot match");
614 				while (c != ']') {
615 					if (c == '\\' && any(peekchar(), "]-^\\"))
616 						c = getchar() | QUOTE;
617 					if (c == '\n' || c == EOF)
618 						cerror("Missing ]");
619 					*ep++ = c;
620 					cclcnt++;
621 					if (ep >= &expbuf[ESIZE])
622 						goto complex;
623 					c = getchar();
624 				}
625 				lastep[1] = cclcnt;
626 				continue;
627 			}
628 			if (c == EOF) {
629 				ungetchar(EOF);
630 				c = '\\';
631 				goto defchar;
632 			}
633 			*ep++ = CCHR;
634 			if (c == '\n')
635 cerror("No newlines in re's|Can't escape newlines into regular expressions");
636 /*
637 			if (c < '1' || c > NBRA + '1') {
638 */
639 				*ep++ = c;
640 				continue;
641 /*
642 			}
643 			c -= '1';
644 			if (c >= nbra)
645 cerror("Bad \\n|\\n in regular expression with n greater than the number of \\('s");
646 			*ep++ = c | QUOTE;
647 			continue;
648 */
649 
650 		case '\n':
651 			if (oknl) {
652 				ungetchar(c);
653 				*ep++ = CEOFC;
654 				return (eof);
655 			}
656 cerror("Badly formed re|Missing closing delimiter for regular expression");
657 
658 		case '$':
659 			if (peekchar() == eof || peekchar() == EOF || oknl && peekchar() == '\n') {
660 				*ep++ = CDOL;
661 				continue;
662 			}
663 			goto defchar;
664 
665 		case '.':
666 		case '~':
667 		case '*':
668 		case '[':
669 			if (value(MAGIC))
670 				goto magic;
671 defchar:
672 		default:
673 			*ep++ = CCHR;
674 			*ep++ = c;
675 			continue;
676 		}
677 	}
678 }
679 
680 cerror(s)
681 	char *s;
682 {
683 
684 	expbuf[0] = 0;
685 	error(s);
686 }
687 
688 same(a, b)
689 	register int a, b;
690 {
691 
692 	return (a == b || value(IGNORECASE) &&
693 	   ((islower(a) && toupper(a) == b) || (islower(b) && toupper(b) == a)));
694 }
695 
696 char	*locs;
697 
698 execute(gf, addr)
699 	line *addr;
700 {
701 	register char *p1, *p2;
702 	register int c;
703 
704 	if (gf) {
705 		if (circfl)
706 			return (0);
707 		locs = p1 = loc2;
708 	} else {
709 		if (addr == zero)
710 			return (0);
711 		p1 = linebuf;
712 		getline(*addr);
713 		locs = 0;
714 	}
715 	p2 = expbuf;
716 	if (circfl) {
717 		loc1 = p1;
718 		return (advance(p1, p2));
719 	}
720 	/* fast check for first character */
721 	if (*p2 == CCHR) {
722 		c = p2[1];
723 		do {
724 			if (c != *p1 && (!value(IGNORECASE) ||
725 			   !((islower(c) && toupper(c) == *p1) ||
726 			   (islower(*p1) && toupper(*p1) == c))))
727 				continue;
728 			if (advance(p1, p2)) {
729 				loc1 = p1;
730 				return (1);
731 			}
732 		} while (*p1++);
733 		return (0);
734 	}
735 	/* regular algorithm */
736 	do {
737 		if (advance(p1, p2)) {
738 			loc1 = p1;
739 			return (1);
740 		}
741 	} while (*p1++);
742 	return (0);
743 }
744 
745 #define	uletter(c)	(isalpha(c) || c == '_')
746 
747 advance(lp, ep)
748 	register char *lp, *ep;
749 {
750 	register char *curlp;
751 	char *sp, *sp1;
752 	int c;
753 
754 	for (;;) switch (*ep++) {
755 
756 	case CCHR:
757 /* useless
758 		if (*ep & QUOTE) {
759 			c = *ep++ & TRIM;
760 			sp = braslist[c];
761 			sp1 = braelist[c];
762 			while (sp < sp1) {
763 				if (!same(*sp, *lp))
764 					return (0);
765 				sp++, lp++;
766 			}
767 			continue;
768 		}
769 */
770 		if (!same(*ep, *lp))
771 			return (0);
772 		ep++, lp++;
773 		continue;
774 
775 	case CDOT:
776 		if (*lp++)
777 			continue;
778 		return (0);
779 
780 	case CDOL:
781 		if (*lp == 0)
782 			continue;
783 		return (0);
784 
785 	case CEOFC:
786 		loc2 = lp;
787 		return (1);
788 
789 	case CCL:
790 		if (cclass(ep, *lp++, 1)) {
791 			ep += *ep;
792 			continue;
793 		}
794 		return (0);
795 
796 	case NCCL:
797 		if (cclass(ep, *lp++, 0)) {
798 			ep += *ep;
799 			continue;
800 		}
801 		return (0);
802 
803 	case CBRA:
804 		braslist[*ep++] = lp;
805 		continue;
806 
807 	case CKET:
808 		braelist[*ep++] = lp;
809 		continue;
810 
811 	case CDOT|STAR:
812 		curlp = lp;
813 		while (*lp++)
814 			continue;
815 		goto star;
816 
817 	case CCHR|STAR:
818 		curlp = lp;
819 		while (same(*lp, *ep))
820 			lp++;
821 		lp++;
822 		ep++;
823 		goto star;
824 
825 	case CCL|STAR:
826 	case NCCL|STAR:
827 		curlp = lp;
828 		while (cclass(ep, *lp++, ep[-1] == (CCL|STAR)))
829 			continue;
830 		ep += *ep;
831 		goto star;
832 star:
833 		do {
834 			lp--;
835 			if (lp == locs)
836 				break;
837 			if (advance(lp, ep))
838 				return (1);
839 		} while (lp > curlp);
840 		return (0);
841 
842 	case CBRC:
843 		if (lp == expbuf)
844 			continue;
845 		if ((isdigit(*lp) || uletter(*lp)) && !uletter(lp[-1]) && !isdigit(lp[-1]))
846 			continue;
847 		return (0);
848 
849 	case CLET:
850 		if (!uletter(*lp) && !isdigit(*lp))
851 			continue;
852 		return (0);
853 
854 	default:
855 		error("Re internal error");
856 	}
857 }
858 
859 cclass(set, c, af)
860 	register char *set;
861 	register int c;
862 	int af;
863 {
864 	register int n;
865 
866 	if (c == 0)
867 		return (0);
868 	if (value(IGNORECASE) && isupper(c))
869 		c = tolower(c);
870 	n = *set++;
871 	while (--n)
872 		if (n > 2 && set[1] == '-') {
873 			if (c >= (set[0] & TRIM) && c <= (set[2] & TRIM))
874 				return (af);
875 			set += 3;
876 			n -= 2;
877 		} else
878 			if ((*set++ & TRIM) == c)
879 				return (af);
880 	return (!af);
881 }
882