xref: /openbsd/usr.bin/vi/ex/ex_subst.c (revision 80ddc267)
1 /*	$OpenBSD: ex_subst.c,v 1.31 2023/06/23 15:06:45 millert Exp $	*/
2 
3 /*-
4  * Copyright (c) 1992, 1993, 1994
5  *	The Regents of the University of California.  All rights reserved.
6  * Copyright (c) 1992, 1993, 1994, 1995, 1996
7  *	Keith Bostic.  All rights reserved.
8  *
9  * See the LICENSE file for redistribution information.
10  */
11 
12 #include "config.h"
13 
14 #include <sys/queue.h>
15 #include <sys/time.h>
16 
17 #include <bitstring.h>
18 #include <ctype.h>
19 #include <errno.h>
20 #include <limits.h>
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <unistd.h>
25 
26 #include "../common/common.h"
27 #include "../vi/vi.h"
28 
/*
 * Larger of two values.  Function-like macro: each argument may be
 * evaluated twice, so do not pass expressions with side effects.
 */
#define MAXIMUM(a, b)	(((a) > (b)) ? (a) : (b))

/* Flag bits passed by the ex_s() family to s() in its `flags' argument. */
#define	SUB_FIRST	0x01		/* The 'r' flag isn't reasonable. */
#define	SUB_MUSTSETR	0x02		/* The 'r' flag is required. */

/* File-local helpers. */
static int re_conv(SCR *, char **, size_t *, int *);
static int re_sub(SCR *, char *, char **, size_t *, size_t *, regmatch_t [10]);
static int re_tag_conv(SCR *, char **, size_t *, int *);
static int s(SCR *, EXCMD *, char *, regex_t *, u_int);
38 
39 /*
40  * ex_s --
41  *	[line [,line]] s[ubstitute] [[/;]pat[/;]/repl[/;] [cgr] [count] [#lp]]
42  *
43  *	Substitute on lines matching a pattern.
44  *
45  * PUBLIC: int ex_s(SCR *, EXCMD *);
46  */
47 int
ex_s(SCR * sp,EXCMD * cmdp)48 ex_s(SCR *sp, EXCMD *cmdp)
49 {
50 	regex_t *re;
51 	size_t blen, len;
52 	u_int flags;
53 	int delim;
54 	char *bp, *ptrn, *rep, *p, *t;
55 
56 	/*
57 	 * Skip leading white space.
58 	 *
59 	 * !!!
60 	 * Historic vi allowed any non-alphanumeric to serve as the
61 	 * substitution command delimiter.
62 	 *
63 	 * !!!
64 	 * If the arguments are empty, it's the same as &, i.e. we
65 	 * repeat the last substitution.
66 	 */
67 	if (cmdp->argc == 0)
68 		goto subagain;
69 	for (p = cmdp->argv[0]->bp,
70 	    len = cmdp->argv[0]->len; len > 0; --len, ++p) {
71 		if (!isblank(*p))
72 			break;
73 	}
74 	if (len == 0)
75 subagain:	return (ex_subagain(sp, cmdp));
76 
77 	delim = *p++;
78 	if (isalnum(delim) || delim == '\\')
79 		return (s(sp, cmdp, p, &sp->subre_c, SUB_MUSTSETR));
80 
81 	/*
82 	 * !!!
83 	 * The full-blown substitute command reset the remembered
84 	 * state of the 'c' and 'g' suffices.
85 	 */
86 	sp->c_suffix = sp->g_suffix = 0;
87 
88 	/*
89 	 * Get the pattern string, toss escaping characters.
90 	 *
91 	 * !!!
92 	 * Historic vi accepted any of the following forms:
93 	 *
94 	 *	:s/abc/def/		change "abc" to "def"
95 	 *	:s/abc/def		change "abc" to "def"
96 	 *	:s/abc/			delete "abc"
97 	 *	:s/abc			delete "abc"
98 	 *
99 	 * QUOTING NOTE:
100 	 *
101 	 * Only toss an escaping character if it escapes a delimiter.
102 	 * This means that "s/A/\\\\f" replaces "A" with "\\f".  It
103 	 * would be nice to be more regular, i.e. for each layer of
104 	 * escaping a single escaping character is removed, but that's
105 	 * not how the historic vi worked.
106 	 */
107 	for (ptrn = t = p;;) {
108 		if (p[0] == '\0' || p[0] == delim) {
109 			if (p[0] == delim)
110 				++p;
111 			/*
112 			 * !!!
113 			 * Nul terminate the pattern string -- it's passed
114 			 * to regcomp which doesn't understand anything else.
115 			 */
116 			*t = '\0';
117 			break;
118 		}
119 		if (p[0] == '\\') {
120 			if (p[1] == delim)
121 				++p;
122 			else if (p[1] == '\\')
123 				*t++ = *p++;
124 		}
125 		*t++ = *p++;
126 	}
127 
128 	/*
129 	 * If the pattern string is empty, use the last RE (not just the
130 	 * last substitution RE).
131 	 */
132 	if (*ptrn == '\0') {
133 		if (sp->re == NULL) {
134 			ex_emsg(sp, NULL, EXM_NOPREVRE);
135 			return (1);
136 		}
137 
138 		/* Re-compile the RE if necessary. */
139 		if (!F_ISSET(sp, SC_RE_SEARCH) && re_compile(sp,
140 		    sp->re, sp->re_len, NULL, NULL, &sp->re_c, RE_C_SEARCH))
141 			return (1);
142 		flags = 0;
143 	} else {
144 		/*
145 		 * !!!
146 		 * Compile the RE.  Historic practice is that substitutes set
147 		 * the search direction as well as both substitute and search
148 		 * RE's.  We compile the RE twice, as we don't want to bother
149 		 * ref counting the pattern string and (opaque) structure.
150 		 */
151 		if (re_compile(sp, ptrn, t - ptrn,
152 		    &sp->re, &sp->re_len, &sp->re_c, RE_C_SEARCH))
153 			return (1);
154 		if (re_compile(sp, ptrn, t - ptrn,
155 		    &sp->subre, &sp->subre_len, &sp->subre_c, RE_C_SUBST))
156 			return (1);
157 
158 		flags = SUB_FIRST;
159 		sp->searchdir = FORWARD;
160 	}
161 	re = &sp->re_c;
162 
163 	/*
164 	 * Get the replacement string.
165 	 *
166 	 * The special character & (\& if O_MAGIC not set) matches the
167 	 * entire RE.  No handling of & is required here, it's done by
168 	 * re_sub().
169 	 *
170 	 * The special character ~ (\~ if O_MAGIC not set) inserts the
171 	 * previous replacement string into this replacement string.
172 	 * Count ~'s to figure out how much space we need.  We could
173 	 * special case nonexistent last patterns or whether or not
174 	 * O_MAGIC is set, but it's probably not worth the effort.
175 	 *
176 	 * QUOTING NOTE:
177 	 *
178 	 * Only toss an escaping character if it escapes a delimiter or
179 	 * if O_MAGIC is set and it escapes a tilde.
180 	 *
181 	 * !!!
182 	 * If the entire replacement pattern is "%", then use the last
183 	 * replacement pattern.  This semantic was added to vi in System
184 	 * V and then percolated elsewhere, presumably around the time
185 	 * that it was added to their version of ed(1).
186 	 */
187 	if (p[0] == '\0' || p[0] == delim) {
188 		if (p[0] == delim)
189 			++p;
190 		free(sp->repl);
191 		sp->repl = NULL;
192 		sp->repl_len = 0;
193 	} else if (p[0] == '%' && (p[1] == '\0' || p[1] == delim))
194 		p += p[1] == delim ? 2 : 1;
195 	else {
196 		for (rep = p, len = 0;
197 		    p[0] != '\0' && p[0] != delim; ++p, ++len)
198 			if (p[0] == '~')
199 				len += sp->repl_len;
200 		GET_SPACE_RET(sp, bp, blen, len);
201 		for (t = bp, len = 0, p = rep;;) {
202 			if (p[0] == '\0' || p[0] == delim) {
203 				if (p[0] == delim)
204 					++p;
205 				break;
206 			}
207 			if (p[0] == '\\') {
208 				if (p[1] == delim)
209 					++p;
210 				else if (p[1] == '\\') {
211 					*t++ = *p++;
212 					++len;
213 				} else if (p[1] == '~') {
214 					++p;
215 					if (!O_ISSET(sp, O_MAGIC))
216 						goto tilde;
217 				}
218 			} else if (p[0] == '~' && O_ISSET(sp, O_MAGIC)) {
219 tilde:				++p;
220 				memcpy(t, sp->repl, sp->repl_len);
221 				t += sp->repl_len;
222 				len += sp->repl_len;
223 				continue;
224 			}
225 			*t++ = *p++;
226 			++len;
227 		}
228 		if ((sp->repl_len = len) != 0) {
229 			free(sp->repl);
230 			if ((sp->repl = malloc(len)) == NULL) {
231 				msgq(sp, M_SYSERR, NULL);
232 				FREE_SPACE(sp, bp, blen);
233 				return (1);
234 			}
235 			memcpy(sp->repl, bp, len);
236 		}
237 		FREE_SPACE(sp, bp, blen);
238 	}
239 	return (s(sp, cmdp, p, re, flags));
240 }
241 
242 /*
243  * ex_subagain --
244  *	[line [,line]] & [cgr] [count] [#lp]]
245  *
246  *	Substitute using the last substitute RE and replacement pattern.
247  *
248  * PUBLIC: int ex_subagain(SCR *, EXCMD *);
249  */
250 int
ex_subagain(SCR * sp,EXCMD * cmdp)251 ex_subagain(SCR *sp, EXCMD *cmdp)
252 {
253 	if (sp->subre == NULL) {
254 		ex_emsg(sp, NULL, EXM_NOPREVRE);
255 		return (1);
256 	}
257 	if (!F_ISSET(sp, SC_RE_SUBST) && re_compile(sp,
258 	    sp->subre, sp->subre_len, NULL, NULL, &sp->subre_c, RE_C_SUBST))
259 		return (1);
260 	return (s(sp,
261 	    cmdp, cmdp->argc ? cmdp->argv[0]->bp : NULL, &sp->subre_c, 0));
262 }
263 
264 /*
265  * ex_subtilde --
266  *	[line [,line]] ~ [cgr] [count] [#lp]]
267  *
268  *	Substitute using the last RE and last substitute replacement pattern.
269  *
270  * PUBLIC: int ex_subtilde(SCR *, EXCMD *);
271  */
272 int
ex_subtilde(SCR * sp,EXCMD * cmdp)273 ex_subtilde(SCR *sp, EXCMD *cmdp)
274 {
275 	if (sp->re == NULL) {
276 		ex_emsg(sp, NULL, EXM_NOPREVRE);
277 		return (1);
278 	}
279 	if (!F_ISSET(sp, SC_RE_SEARCH) && re_compile(sp,
280 	    sp->re, sp->re_len, NULL, NULL, &sp->re_c, RE_C_SEARCH))
281 		return (1);
282 	return (s(sp,
283 	    cmdp, cmdp->argc ? cmdp->argv[0]->bp : NULL, &sp->re_c, 0));
284 }
285 
286 /*
287  * s --
288  * Do the substitution.  This stuff is *really* tricky.  There are lots of
289  * special cases, and general nastiness.  Don't mess with it unless you're
290  * pretty confident.
291  *
292  * The nasty part of the substitution is what happens when the replacement
293  * string contains newlines.  It's a bit tricky -- consider the information
294  * that has to be retained for "s/f\(o\)o/^M\1^M\1/".  The solution here is
295  * to build a set of newline offsets which we use to break the line up later,
296  * when the replacement is done.  Don't change it unless you're *damned*
297  * confident.
298  */
/*
 * The three helpers below expand to a bare brace block rather than
 * do { } while (0): several call sites omit the trailing semicolon, so
 * the shape cannot be changed without touching them.  On allocation
 * failure each one executes `return (1)' in the *calling* function.
 *
 * NEEDNEWLINE --
 *	Make room for one more entry in (sp)->newl: when newl_cnt has
 *	caught up with newl_len, the array is grown by 25 entries of
 *	size_t.  If the array pointer is NULL afterwards, newl_len is
 *	reset to 0 before returning.
 */
#define	NEEDNEWLINE(sp) {						\
	if ((sp)->newl_len == (sp)->newl_cnt) {				\
		(sp)->newl_len += 25;					\
		REALLOCARRAY((sp), (sp)->newl,				\
		    (sp)->newl_len, sizeof(size_t));			\
		if ((sp)->newl == NULL) {				\
			(sp)->newl_len = 0;				\
			return (1);					\
		}							\
	}								\
}

/*
 * BUILD --
 *	Append `len' bytes starting at `l' to the build buffer.  Relies on
 *	the caller's locals lb (buffer), lbclen (bytes used) and lblen
 *	(bytes allocated).  When the buffer is too small, lblen grows by
 *	MAXIMUM(lbclen + len, 256) and the buffer is reallocated.  `len' is
 *	evaluated more than once.
 */
#define	BUILD(sp, l, len) {						\
	if (lbclen + (len) > lblen) {					\
		lblen += MAXIMUM(lbclen + (len), 256);			\
		REALLOC((sp), lb, lblen);				\
		if (lb == NULL) {					\
			lbclen = 0;					\
			return (1);					\
		}							\
	}								\
	memcpy(lb + lbclen, (l), (len));				\
	lbclen += (len);						\
}

/*
 * NEEDSP --
 *	Same growth rule as BUILD, but nothing is copied: it only ensures
 *	that `len' more bytes fit.  If the buffer was reallocated, `pnt' is
 *	re-pointed at lb + lbclen, since the old address may be stale.
 */
#define	NEEDSP(sp, len, pnt) {						\
	if (lbclen + (len) > lblen) {					\
		lblen += MAXIMUM(lbclen + (len), 256);			\
		REALLOC((sp), lb, lblen);				\
		if (lb == NULL) {					\
			lbclen = 0;					\
			return (1);					\
		}							\
		(pnt) = lb + lbclen;					\
	}								\
}
335 
/*
 * s --
 *	Worker shared by ex_s(), ex_subagain() and ex_subtilde().
 *
 *	sp:	screen
 *	cmdp:	command; addr1/addr2 give the line range, and both the
 *		range and flagoff may be modified by the count and the
 *		+/- flags
 *	s:	flag/count text to parse, or NULL for none; the variable is
 *		later reused as the pointer to the current line
 *	re:	compiled RE to match with (replaced by &sp->re_c if the
 *		'r' flag is given)
 *	flags:	SUB_FIRST and/or SUB_MUSTSETR
 *
 *	Returns 0 on success, 1 on error.
 */
static int
s(SCR *sp, EXCMD *cmdp, char *s, regex_t *re, u_int flags)
{
	EVENT ev;
	MARK from, to;
	TEXTH tiq;
	recno_t elno, lno, slno;
	regmatch_t match[10];
	size_t blen, cnt, last, lbclen, lblen, len, llen;
	size_t offset, saved_offset, scno;
	int lflag, nflag, pflag, rflag;
	int didsub, do_eol_match, eflags, nempty, eval;
	int linechanged, matched, quit, rval;
	unsigned long ul;
	char *bp, *lb;

	NEEDFILE(sp, cmdp);

	/* Remember where the cursor started; used for final placement. */
	slno = sp->lno;
	scno = sp->cno;

	/*
	 * !!!
	 * Historically, the 'g' and 'c' suffices were always toggled as flags,
	 * so ":s/A/B/" was the same as ":s/A/B/ccgg".  If O_EDCOMPATIBLE was
	 * not set, they were initialized to 0 for all substitute commands.  If
	 * O_EDCOMPATIBLE was set, they were initialized to 0 only if the user
	 * specified substitute/replacement patterns (see ex_s()).
	 */
	if (!O_ISSET(sp, O_EDCOMPATIBLE))
		sp->c_suffix = sp->g_suffix = 0;

	/*
	 * Historic vi permitted the '#', 'l' and 'p' options in vi mode, but
	 * it only displayed the last change.  I'd disallow them, but they are
	 * useful in combination with the [v]global commands.  In the current
	 * model the problem is combining them with the 'c' flag -- the screen
	 * would have to flip back and forth between the confirm screen and the
	 * ex print screen, which would be pretty awful.  We do display all
	 * changes, though, for what that's worth.
	 *
	 * !!!
	 * Historic vi was fairly strict about the order of "options", the
	 * count, and "flags".  I'm somewhat fuzzy on the difference between
	 * options and flags, anyway, so this is a simpler approach, and we
	 * just take them in whatever order the user gives them.  (The ex
	 * usage statement doesn't reflect this.)
	 */
	lflag = nflag = pflag = rflag = 0;
	if (s == NULL)
		goto noargs;
	/* lno doubles as "count already seen" marker while parsing. */
	for (lno = OOBLNO; *s != '\0'; ++s)
		switch (*s) {
		case ' ':
		case '\t':
			continue;
		case '+':
			++cmdp->flagoff;
			break;
		case '-':
			--cmdp->flagoff;
			break;
		case '0': case '1': case '2': case '3': case '4':
		case '5': case '6': case '7': case '8': case '9':
			/* Only one count is allowed. */
			if (lno != OOBLNO)
				goto usage;
			errno = 0;
			if ((ul = strtoul(s, &s, 10)) >= UINT_MAX)
				errno = ERANGE;
			if (*s == '\0')		/* Loop increment correction. */
				--s;
			if (errno == ERANGE) {
				if (ul >= UINT_MAX)
					msgq(sp, M_ERR, "Count overflow");
				else
					msgq(sp, M_SYSERR, NULL);
				return (1);
			}
			lno = (recno_t)ul;
			/*
			 * In historic vi, the count was inclusive from the
			 * second address.
			 */
			cmdp->addr1.lno = cmdp->addr2.lno;
			cmdp->addr2.lno += lno - 1;
			/* Clamp the end of the range to the last line. */
			if (!db_exist(sp, cmdp->addr2.lno) &&
			    db_last(sp, &cmdp->addr2.lno))
				return (1);
			break;
		case '#':
			nflag = 1;
			break;
		case 'c':
			sp->c_suffix = !sp->c_suffix;

			/*
			 * Ex text structure initialization.
			 *
			 * NOTE(review): tiq is only initialized here, i.e.
			 * when a 'c' flag appears in this command.  With
			 * O_EDCOMPATIBLE set, c_suffix is not cleared above,
			 * so it looks possible to reach ex_txt() below with
			 * tiq uninitialized -- confirm.
			 */
			if (F_ISSET(sp, SC_EX)) {
				memset(&tiq, 0, sizeof(TEXTH));
				TAILQ_INIT(&tiq);
			}
			break;
		case 'g':
			sp->g_suffix = !sp->g_suffix;
			break;
		case 'l':
			lflag = 1;
			break;
		case 'p':
			pflag = 1;
			break;
		case 'r':
			if (LF_ISSET(SUB_FIRST)) {
				msgq(sp, M_ERR,
		    "Regular expression specified; r flag meaningless");
				return (1);
			}
			if (!F_ISSET(sp, SC_RE_SEARCH)) {
				ex_emsg(sp, NULL, EXM_NOPREVRE);
				return (1);
			}
			/* Use the last search RE instead of the one passed in. */
			rflag = 1;
			re = &sp->re_c;
			break;
		default:
			goto usage;
		}

	if (*s != '\0' || (!rflag && LF_ISSET(SUB_MUSTSETR))) {
usage:		ex_emsg(sp, cmdp->cmd->usage, EXM_USAGE);
		return (1);
	}

noargs:	if (F_ISSET(sp, SC_VI) && sp->c_suffix && (lflag || nflag || pflag)) {
		msgq(sp, M_ERR,
"The #, l and p flags may not be combined with the c flag in vi mode");
		return (1);
	}

	/*
	 * bp:		if interactive, line cache
	 * blen:	if interactive, line cache length
	 * lb:		build buffer pointer.
	 * lbclen:	current length of built buffer.
	 * lblen;	length of build buffer.
	 *
	 * Note that the BUILD macro used below returns from this function
	 * directly on allocation failure, without going through the
	 * cleanup code at the end.
	 */
	bp = lb = NULL;
	blen = lbclen = lblen = 0;

	/* For each line... */
	for (matched = quit = 0, lno = cmdp->addr1.lno,
	    elno = cmdp->addr2.lno; !quit && lno <= elno; ++lno) {

		/* Someone's unhappy, time to stop. */
		if (INTERRUPTED(sp))
			break;

		/* Get the line. */
		if (db_get(sp, lno, DBG_FATAL, &s, &llen))
			goto err;

		/*
		 * Make a local copy if doing confirmation -- when calling
		 * the confirm routine we're likely to lose the cached copy.
		 */
		if (sp->c_suffix) {
			if (bp == NULL) {
				GET_SPACE_RET(sp, bp, blen, llen);
			} else
				ADD_SPACE_RET(sp, bp, blen, llen);
			memcpy(bp, s, llen);
			s = bp;
		}

		/* Start searching from the beginning. */
		offset = 0;
		len = llen;

		/* Reset the build buffer offset. */
		lbclen = 0;

		/* Reset empty match test variable. */
		nempty = -1;

		/*
		 * We don't want to have to do a setline if the line didn't
		 * change -- keep track of whether or not this line changed.
		 * If doing confirmations, don't want to keep setting the
		 * line if change is refused -- keep track of substitutions.
		 */
		didsub = linechanged = 0;

		/* New line, do an EOL match. */
		do_eol_match = 1;

		/* It's not nul terminated, but we pretend it is. */
		eflags = REG_STARTEND;

		/* The search area is from s + offset to the EOL.  */
nextmatch:	match[0].rm_so = offset;
		match[0].rm_eo = llen;

		/* Get the next match. */
		eval = regexec(re, (char *)s, 10, match, eflags);

		/*
		 * There wasn't a match or if there was an error, deal with
		 * it.  If there was a previous match in this line, resolve
		 * the changes into the database.  Otherwise, just move on.
		 */
		if (eval == REG_NOMATCH)
			goto endmatch;
		if (eval != 0) {
			re_error(sp, eval, re);
			goto err;
		}
		matched = 1;

		/* Only the first search can match an anchored expression. */
		eflags |= REG_NOTBOL;

		/*
		 * !!!
		 * It's possible to match 0-length strings -- for example, the
		 * command s;a*;X;, when matched against the string "aabb" will
		 * result in "XbXbX", i.e. the matches are "aa", the space
		 * between the b's and the space between the b's and the end of
		 * the string.  There is a similar space between the beginning
		 * of the string and the a's.  The rule that we use (because vi
		 * historically used it) is that any 0-length match, occurring
		 * immediately after a match, is ignored.  Otherwise, the above
		 * example would have resulted in "XXbXbX".  Another example is
		 * incorrectly using " *" to replace groups of spaces with one
		 * space.
		 *
		 * If the match is empty and at the same place as the end of the
		 * previous match, ignore the match and move forward.  If
		 * there's no more characters in the string, we were
		 * attempting to match after the last character, so quit.
		 */
		if (match[0].rm_so == nempty && match[0].rm_eo == nempty) {
			nempty = -1;
			if (len == 0)
				goto endmatch;
			BUILD(sp, s + offset, 1)
			++offset;
			--len;
			goto nextmatch;
		}

		/* Confirm change. */
		if (sp->c_suffix) {
			/*
			 * Set the cursor position for confirmation.  Note,
			 * if we matched on a '$', the cursor may be past
			 * the end of line.
			 */
			from.lno = to.lno = lno;
			from.cno = match[0].rm_so;
			to.cno = match[0].rm_eo;
			/*
			 * Both ex and vi have to correct for a change before
			 * the first character in the line.
			 */
			if (llen == 0)
				from.cno = to.cno = 0;
			if (F_ISSET(sp, SC_VI)) {
				/*
				 * Only vi has to correct for a change after
				 * the last character in the line.
				 *
				 * XXX
				 * It would be nice to change the vi code so
				 * that we could display a cursor past EOL.
				 *
				 * NOTE(review): when llen is 0 both tests
				 * below are true and `llen - 1' wraps around
				 * (llen is a size_t) -- confirm that the
				 * refresh code copes with that column.
				 */
				if (to.cno >= llen)
					to.cno = llen - 1;
				if (from.cno >= llen)
					from.cno = llen - 1;

				sp->lno = from.lno;
				sp->cno = from.cno;
				if (vs_refresh(sp, 1))
					goto err;

				vs_update(sp, "Confirm change? [n]", NULL);

				if (v_event_get(sp, &ev, 0, 0))
					goto err;
				switch (ev.e_event) {
				case E_CHARACTER:
					break;
				case E_EOF:
				case E_ERR:
				case E_INTERRUPT:
					goto lquit;
				default:
					v_event_err(sp, &ev);
					goto lquit;
				}
			} else {
				/* Shadows the `flags' parameter in this block. */
				const int flags =
				    O_ISSET(sp, O_NUMBER) ? E_C_HASH : 0;
				if (ex_print(sp, cmdp, &from, &to, flags) ||
				    ex_scprint(sp, &from, &to))
					goto lquit;
				if (ex_txt(sp, &tiq, 0, TXT_CR))
					goto err;
				/* The first character of the reply decides. */
				ev.e_c = TAILQ_FIRST(&tiq)->lb[0];
			}

			switch (ev.e_c) {
			case CH_YES:
				break;
			default:
			case CH_NO:
				/* Refused: copy the text through unchanged. */
				didsub = 0;
				BUILD(sp, s + offset, match[0].rm_eo - offset);
				goto skip;
			case CH_QUIT:
				/* Set the quit/interrupted flags. */
lquit:				quit = 1;
				F_SET(sp->gp, G_INTERRUPTED);

				/*
				 * Resolve any changes, then return to (and
				 * exit from) the main loop.
				 */
				goto endmatch;
			}
		}

		/*
		 * Set the cursor to the last position changed, converting
		 * from 1-based to 0-based.
		 */
		sp->lno = lno;
		sp->cno = match[0].rm_so;

		/* Copy the bytes before the match into the build buffer. */
		BUILD(sp, s + offset, match[0].rm_so - offset);

		/* Substitute the matching bytes. */
		didsub = 1;
		if (re_sub(sp, s, &lb, &lbclen, &lblen, match))
			goto err;

		/* Set the change flag so we know this line was modified. */
		linechanged = 1;

		/* Move past the matched bytes. */
skip:		offset = match[0].rm_eo;
		len = llen - match[0].rm_eo;

		/* A match cannot be followed by an empty pattern. */
		nempty = match[0].rm_eo;

		/*
		 * If doing a global change with confirmation, we have to
		 * update the screen.  The basic idea is to store the line
		 * so the screen update routines can find it, and restart.
		 */
		if (didsub && sp->c_suffix && sp->g_suffix) {
			/*
			 * The new search offset will be the end of the
			 * modified line.
			 */
			saved_offset = lbclen;

			/* Copy the rest of the line. */
			if (len)
				BUILD(sp, s + offset, len)

			/* Set the new offset. */
			offset = saved_offset;

			/* Store inserted lines, adjusting the build buffer. */
			last = 0;
			if (sp->newl_cnt) {
				for (cnt = 0;
				    cnt < sp->newl_cnt; ++cnt, ++lno, ++elno) {
					if (db_insert(sp, lno,
					    lb + last, sp->newl[cnt] - last))
						goto err;
					last = sp->newl[cnt] + 1;
					++sp->rptlines[L_ADDED];
				}
				lbclen -= last;
				offset -= last;
				sp->newl_cnt = 0;
			}

			/* Store and retrieve the line. */
			if (db_set(sp, lno, lb + last, lbclen))
				goto err;
			if (db_get(sp, lno, DBG_FATAL, &s, &llen))
				goto err;
			ADD_SPACE_RET(sp, bp, blen, llen)
			memcpy(bp, s, llen);
			s = bp;
			len = llen - offset;

			/* Restart the build. */
			lbclen = 0;
			BUILD(sp, s, offset);

			/*
			 * If we haven't already done the after-the-string
			 * match, do one.  Set REG_NOTEOL so the '$' pattern
			 * only matches once.
			 *
			 * NOTE(review): this tests `offset == len', where
			 * len is the number of bytes remaining; the
			 * non-confirm path below tests `len == 0' for the
			 * same purpose -- confirm which is intended.
			 */
			if (!do_eol_match)
				goto endmatch;
			if (offset == len) {
				do_eol_match = 0;
				eflags |= REG_NOTEOL;
			}
			goto nextmatch;
		}

		/*
		 * If it's a global:
		 *
		 * If at the end of the string, do a test for the after
		 * the string match.  Set REG_NOTEOL so the '$' pattern
		 * only matches once.
		 */
		if (sp->g_suffix && do_eol_match) {
			if (len == 0) {
				do_eol_match = 0;
				eflags |= REG_NOTEOL;
			}
			goto nextmatch;
		}

endmatch:	if (!linechanged)
			continue;

		/* Copy any remaining bytes into the build buffer. */
		if (len)
			BUILD(sp, s + offset, len)

		/* Store inserted lines, adjusting the build buffer. */
		last = 0;
		if (sp->newl_cnt) {
			for (cnt = 0;
			    cnt < sp->newl_cnt; ++cnt, ++lno, ++elno) {
				if (db_insert(sp,
				    lno, lb + last, sp->newl[cnt] - last))
					goto err;
				last = sp->newl[cnt] + 1;
				++sp->rptlines[L_ADDED];
			}
			lbclen -= last;
			sp->newl_cnt = 0;
		}

		/* Store the changed line. */
		if (db_set(sp, lno, lb + last, lbclen))
			goto err;

		/* Update changed line counter. */
		if (sp->rptlchange != lno) {
			sp->rptlchange = lno;
			++sp->rptlines[L_CHANGED];
		}

		/*
		 * !!!
		 * Display as necessary.  Historic practice is to only
		 * display the last line of a line split into multiple
		 * lines.
		 */
		if (lflag || nflag || pflag) {
			from.lno = to.lno = lno;
			from.cno = to.cno = 0;
			if (lflag)
				(void)ex_print(sp, cmdp, &from, &to, E_C_LIST);
			if (nflag)
				(void)ex_print(sp, cmdp, &from, &to, E_C_HASH);
			if (pflag)
				(void)ex_print(sp, cmdp, &from, &to, E_C_PRINT);
		}
	}

	/*
	 * !!!
	 * Historically, vi attempted to leave the cursor at the same place if
	 * the substitution was done at the current cursor position.  Otherwise
	 * it moved it to the first non-blank of the last line changed.  There
	 * were some problems: for example, :s/$/foo/ with the cursor on the
	 * last character of the line left the cursor on the last character, or
	 * the & command with multiple occurrences of the matching string in the
	 * line usually left the cursor in a fairly random position.
	 *
	 * We try to do the same thing, with the exception that if the user is
	 * doing substitution with confirmation, we move to the last line about
	 * which the user was consulted, as opposed to the last line that they
	 * actually changed.  This prevents a screen flash if the user doesn't
	 * change many of the possible lines.
	 */
	if (!sp->c_suffix && (sp->lno != slno || sp->cno != scno)) {
		sp->cno = 0;
		(void)nonblank(sp, sp->lno, &sp->cno);
	}

	/*
	 * If not in a global command, and nothing matched, say so.
	 * Else, if none of the lines displayed, put something up.
	 */
	rval = 0;
	if (!matched) {
		if (!F_ISSET(sp, SC_EX_GLOBAL)) {
			msgq(sp, M_ERR, "No match found");
			goto err;
		}
	} else if (!lflag && !nflag && !pflag)
		F_SET(cmdp, E_AUTOPRINT);

	/* The err label is only reachable by goto; success skips it. */
	if (0) {
err:		rval = 1;
	}

	/* Common cleanup for the success and error paths. */
	if (bp != NULL)
		FREE_SPACE(sp, bp, blen);
	free(lb);
	return (rval);
}
863 
864 /*
865  * re_compile --
866  *	Compile the RE.
867  *
868  * PUBLIC: int re_compile(SCR *,
869  * PUBLIC:     char *, size_t, char **, size_t *, regex_t *, u_int);
870  */
871 int
re_compile(SCR * sp,char * ptrn,size_t plen,char ** ptrnp,size_t * lenp,regex_t * rep,u_int flags)872 re_compile(SCR *sp, char *ptrn, size_t plen, char **ptrnp, size_t *lenp,
873     regex_t *rep, u_int flags)
874 {
875 	size_t len;
876 	int reflags, replaced, rval;
877 	char *p;
878 
879 	/* Set RE flags. */
880 	reflags = 0;
881 	if (!LF_ISSET(RE_C_TAG)) {
882 		if (O_ISSET(sp, O_EXTENDED))
883 			reflags |= REG_EXTENDED;
884 		if (O_ISSET(sp, O_IGNORECASE))
885 			reflags |= REG_ICASE;
886 		if (O_ISSET(sp, O_ICLOWER)) {
887 			for (p = ptrn, len = plen; len > 0; ++p, --len)
888 				if (isupper(*p))
889 					break;
890 			if (len == 0)
891 				reflags |= REG_ICASE;
892 		}
893 	}
894 
895 	/* If we're replacing a saved value, clear the old one. */
896 	if (LF_ISSET(RE_C_SEARCH) && F_ISSET(sp, SC_RE_SEARCH)) {
897 		regfree(&sp->re_c);
898 		F_CLR(sp, SC_RE_SEARCH);
899 	}
900 	if (LF_ISSET(RE_C_SUBST) && F_ISSET(sp, SC_RE_SUBST)) {
901 		regfree(&sp->subre_c);
902 		F_CLR(sp, SC_RE_SUBST);
903 	}
904 
905 	/*
906 	 * If we're saving the string, it's a pattern we haven't seen before,
907 	 * so convert the vi-style RE's to POSIX 1003.2 RE's.  Save a copy for
908 	 * later recompilation.   Free any previously saved value.
909 	 */
910 	if (ptrnp != NULL) {
911 		if (LF_ISSET(RE_C_TAG)) {
912 			if (re_tag_conv(sp, &ptrn, &plen, &replaced))
913 				return (1);
914 		} else
915 			if (re_conv(sp, &ptrn, &plen, &replaced))
916 				return (1);
917 
918 		/* Discard previous pattern. */
919 		free(*ptrnp);
920 		*ptrnp = NULL;
921 		if (lenp != NULL)
922 			*lenp = plen;
923 
924 		/*
925 		 * Copy the string into allocated memory.
926 		 *
927 		 * XXX
928 		 * Regcomp isn't 8-bit clean, so the pattern is nul-terminated
929 		 * for now.  There's just no other solution.
930 		 */
931 		MALLOC(sp, *ptrnp, plen + 1);
932 		if (*ptrnp != NULL) {
933 			memcpy(*ptrnp, ptrn, plen);
934 			(*ptrnp)[plen] = '\0';
935 		}
936 
937 		/* Free up conversion-routine-allocated memory. */
938 		if (replaced)
939 			FREE_SPACE(sp, ptrn, 0);
940 
941 		if (*ptrnp == NULL)
942 			return (1);
943 
944 		ptrn = *ptrnp;
945 	}
946 
947 	/*
948 	 * XXX
949 	 * Regcomp isn't 8-bit clean, so we just lost if the pattern
950 	 * contained a nul.  Bummer!
951 	 */
952 	if ((rval = regcomp(rep, ptrn, /* plen, */ reflags)) != 0) {
953 		if (!LF_ISSET(RE_C_SILENT))
954 			re_error(sp, rval, rep);
955 		return (1);
956 	}
957 
958 	if (LF_ISSET(RE_C_SEARCH))
959 		F_SET(sp, SC_RE_SEARCH);
960 	if (LF_ISSET(RE_C_SUBST))
961 		F_SET(sp, SC_RE_SUBST);
962 
963 	return (0);
964 }
965 
966 /*
967  * re_conv --
968  *	Convert vi's regular expressions into something that the
969  *	the POSIX 1003.2 RE functions can handle.
970  *
971  * There are two conversions we make to make vi's RE's (specifically
972  * the global, search, and substitute patterns) work with POSIX RE's.
973  * We assume that \<ptrn\> does "word" searches, which is non-standard
974  * but supported by most regexp libraries..
975  *
976  * 1: If O_MAGIC is not set, strip backslashes from the magic character
977  *    set (.[*~) that have them, and add them to the ones that don't.
978  * 2: If O_MAGIC is not set, the string "\~" is replaced with the text
979  *    from the last substitute command's replacement string.  If O_MAGIC
980  *    is set, it's the string "~".
981  *
982  * !!!/XXX
983  * This doesn't exactly match the historic behavior of vi because we do
984  * the ~ substitution before calling the RE engine, so magic characters
985  * in the replacement string will be expanded by the RE engine, and they
986  * weren't historically.  It's a bug.
987  */
988 static int
re_conv(SCR * sp,char ** ptrnp,size_t * plenp,int * replacedp)989 re_conv(SCR *sp, char **ptrnp, size_t *plenp, int *replacedp)
990 {
991 	size_t blen, len, needlen;
992 	int magic;
993 	char *bp, *p, *t;
994 
995 	/*
996 	 * First pass through, we figure out how much space we'll need.
997 	 * We do it in two passes, on the grounds that most of the time
998 	 * the user is doing a search and won't have magic characters.
999 	 * That way we can skip most of the memory allocation and copies.
1000 	 */
1001 	magic = 0;
1002 	for (p = *ptrnp, len = *plenp, needlen = 0; len > 0; ++p, --len)
1003 		switch (*p) {
1004 		case '\\':
1005 			if (len > 1) {
1006 				--len;
1007 				switch (*++p) {
1008 				case '~':
1009 					if (!O_ISSET(sp, O_MAGIC)) {
1010 						magic = 1;
1011 						needlen += sp->repl_len;
1012 					}
1013 					break;
1014 				case '.':
1015 				case '[':
1016 				case '*':
1017 					if (!O_ISSET(sp, O_MAGIC)) {
1018 						magic = 1;
1019 						needlen += 1;
1020 					}
1021 					break;
1022 				default:
1023 					needlen += 2;
1024 				}
1025 			} else
1026 				needlen += 1;
1027 			break;
1028 		case '~':
1029 			if (O_ISSET(sp, O_MAGIC)) {
1030 				magic = 1;
1031 				needlen += sp->repl_len;
1032 			}
1033 			break;
1034 		case '.':
1035 		case '[':
1036 		case '*':
1037 			if (!O_ISSET(sp, O_MAGIC)) {
1038 				magic = 1;
1039 				needlen += 2;
1040 			}
1041 			break;
1042 		default:
1043 			needlen += 1;
1044 			break;
1045 		}
1046 
1047 	if (!magic) {
1048 		*replacedp = 0;
1049 		return (0);
1050 	}
1051 
1052 	/* Get enough memory to hold the final pattern. */
1053 	*replacedp = 1;
1054 	GET_SPACE_RET(sp, bp, blen, needlen);
1055 
1056 	for (p = *ptrnp, len = *plenp, t = bp; len > 0; ++p, --len)
1057 		switch (*p) {
1058 		case '\\':
1059 			if (len > 1) {
1060 				--len;
1061 				switch (*++p) {
1062 				case '~':
1063 					if (O_ISSET(sp, O_MAGIC))
1064 						*t++ = '~';
1065 					else {
1066 						memcpy(t,
1067 						    sp->repl, sp->repl_len);
1068 						t += sp->repl_len;
1069 					}
1070 					break;
1071 				case '.':
1072 				case '[':
1073 				case '*':
1074 					if (O_ISSET(sp, O_MAGIC))
1075 						*t++ = '\\';
1076 					*t++ = *p;
1077 					break;
1078 				default:
1079 					*t++ = '\\';
1080 					*t++ = *p;
1081 				}
1082 			} else
1083 				*t++ = '\\';
1084 			break;
1085 		case '~':
1086 			if (O_ISSET(sp, O_MAGIC)) {
1087 				memcpy(t, sp->repl, sp->repl_len);
1088 				t += sp->repl_len;
1089 			} else
1090 				*t++ = '~';
1091 			break;
1092 		case '.':
1093 		case '[':
1094 		case '*':
1095 			if (!O_ISSET(sp, O_MAGIC))
1096 				*t++ = '\\';
1097 			*t++ = *p;
1098 			break;
1099 		default:
1100 			*t++ = *p;
1101 			break;
1102 		}
1103 
1104 	*ptrnp = bp;
1105 	*plenp = t - bp;
1106 	return (0);
1107 }
1108 
1109 /*
1110  * re_tag_conv --
1111  *	Convert a tags search path into something that the POSIX
1112  *	1003.2 RE functions can handle.
1113  */
1114 static int
re_tag_conv(SCR * sp,char ** ptrnp,size_t * plenp,int * replacedp)1115 re_tag_conv(SCR *sp, char **ptrnp, size_t *plenp, int *replacedp)
1116 {
1117 	size_t blen, len;
1118 	int lastdollar;
1119 	char *bp, *p, *t;
1120 
1121 	len = *plenp;
1122 
1123 	/* Max memory usage is 2 times the length of the string. */
1124 	*replacedp = 1;
1125 	GET_SPACE_RET(sp, bp, blen, len * 2);
1126 
1127 	p = *ptrnp;
1128 	t = bp;
1129 
1130 	/* If the last character is a '/' or '?', we just strip it. */
1131 	if (len > 0 && (p[len - 1] == '/' || p[len - 1] == '?'))
1132 		--len;
1133 
1134 	/* If the next-to-last or last character is a '$', it's magic. */
1135 	if (len > 0 && p[len - 1] == '$') {
1136 		--len;
1137 		lastdollar = 1;
1138 	} else
1139 		lastdollar = 0;
1140 
1141 	/* If the first character is a '/' or '?', we just strip it. */
1142 	if (len > 0 && (p[0] == '/' || p[0] == '?')) {
1143 		++p;
1144 		--len;
1145 	}
1146 
1147 	/* If the first or second character is a '^', it's magic. */
1148 	if (p[0] == '^') {
1149 		*t++ = *p++;
1150 		--len;
1151 	}
1152 
1153 	/*
1154 	 * Escape every other magic character we can find, meanwhile stripping
1155 	 * the backslashes ctags inserts when escaping the search delimiter
1156 	 * characters.
1157 	 */
1158 	for (; len > 0; --len) {
1159 		if (p[0] == '\\' && (p[1] == '/' || p[1] == '?')) {
1160 			++p;
1161 			--len;
1162 		} else if (strchr("^.[]$*", p[0]))
1163 			*t++ = '\\';
1164 		*t++ = *p++;
1165 		if (len == 0)
1166 			break;
1167 	}
1168 	if (lastdollar)
1169 		*t++ = '$';
1170 
1171 	*ptrnp = bp;
1172 	*plenp = t - bp;
1173 	return (0);
1174 }
1175 
1176 /*
1177  * re_error --
1178  *	Report a regular expression error.
1179  *
1180  * PUBLIC: void re_error(SCR *, int, regex_t *);
1181  */
1182 void
re_error(SCR * sp,int errcode,regex_t * preg)1183 re_error(SCR *sp, int errcode, regex_t *preg)
1184 {
1185 	size_t s;
1186 	char *oe;
1187 
1188 	s = regerror(errcode, preg, "", 0);
1189 	if ((oe = malloc(s)) == NULL)
1190 		msgq(sp, M_SYSERR, NULL);
1191 	else {
1192 		(void)regerror(errcode, preg, oe, s);
1193 		msgq(sp, M_ERR, "RE error: %s", oe);
1194 		free(oe);
1195 	}
1196 }
1197 
1198 /*
1199  * re_sub --
1200  * 	Do the substitution for a regular expression.
1201  */
1202 static int
re_sub(SCR * sp,char * ip,char ** lbp,size_t * lbclenp,size_t * lblenp,regmatch_t match[10])1203 re_sub(SCR *sp, char *ip, char **lbp, size_t *lbclenp, size_t *lblenp,
1204     regmatch_t match[10])
1205 {
1206 	enum { C_NOTSET, C_LOWER, C_ONELOWER, C_ONEUPPER, C_UPPER } conv;
1207 	size_t lbclen, lblen;		/* Local copies. */
1208 	size_t mlen;			/* Match length. */
1209 	size_t rpl;			/* Remaining replacement length. */
1210 	char *rp;			/* Replacement pointer. */
1211 	int ch;
1212 	int no;				/* Match replacement offset. */
1213 	char *p, *t;			/* Buffer pointers. */
1214 	char *lb;			/* Local copies. */
1215 
1216 	lb = *lbp;			/* Get local copies. */
1217 	lbclen = *lbclenp;
1218 	lblen = *lblenp;
1219 
1220 	/*
1221 	 * QUOTING NOTE:
1222 	 *
1223 	 * There are some special sequences that vi provides in the
1224 	 * replacement patterns.
1225 	 *	 & string the RE matched (\& if nomagic set)
1226 	 *	\# n-th regular subexpression
1227 	 *	\E end \U, \L conversion
1228 	 *	\e end \U, \L conversion
1229 	 *	\l convert the next character to lower-case
1230 	 *	\L convert to lower-case, until \E, \e, or end of replacement
1231 	 *	\u convert the next character to upper-case
1232 	 *	\U convert to upper-case, until \E, \e, or end of replacement
1233 	 *
1234 	 * Otherwise, since this is the lowest level of replacement, discard
1235 	 * all escaping characters.  This (hopefully) matches historic practice.
1236 	 */
1237 #define	OUTCH(ch, nltrans) {						\
1238 	CHAR_T __ch = (ch);						\
1239 	u_int __value = KEY_VAL(sp, __ch);				\
1240 	if ((nltrans) && (__value == K_CR || __value == K_NL)) {	\
1241 		NEEDNEWLINE(sp);					\
1242 		sp->newl[sp->newl_cnt++] = lbclen;			\
1243 	} else if (conv != C_NOTSET) {					\
1244 		switch (conv) {						\
1245 		case C_ONELOWER:					\
1246 			conv = C_NOTSET;				\
1247 			/* FALLTHROUGH */				\
1248 		case C_LOWER:						\
1249 			if (isupper(__ch))				\
1250 				__ch = tolower(__ch);			\
1251 			break;						\
1252 		case C_ONEUPPER:					\
1253 			conv = C_NOTSET;				\
1254 			/* FALLTHROUGH */				\
1255 		case C_UPPER:						\
1256 			if (islower(__ch))				\
1257 				__ch = toupper(__ch);			\
1258 			break;						\
1259 		default:						\
1260 			abort();					\
1261 		}							\
1262 	}								\
1263 	NEEDSP(sp, 1, p);						\
1264 	*p++ = __ch;							\
1265 	++lbclen;							\
1266 }
1267 	conv = C_NOTSET;
1268 	for (rp = sp->repl, rpl = sp->repl_len, p = lb + lbclen; rpl--;) {
1269 		switch (ch = *rp++) {
1270 		case '&':
1271 			if (O_ISSET(sp, O_MAGIC)) {
1272 				no = 0;
1273 				goto subzero;
1274 			}
1275 			break;
1276 		case '\\':
1277 			if (rpl == 0)
1278 				break;
1279 			--rpl;
1280 			switch (ch = *rp) {
1281 			case '&':
1282 				++rp;
1283 				if (!O_ISSET(sp, O_MAGIC)) {
1284 					no = 0;
1285 					goto subzero;
1286 				}
1287 				break;
1288 			case '0': case '1': case '2': case '3': case '4':
1289 			case '5': case '6': case '7': case '8': case '9':
1290 				no = *rp++ - '0';
1291 subzero:			if (match[no].rm_so == -1 ||
1292 			    	    match[no].rm_eo == -1)
1293 					break;
1294 				mlen = match[no].rm_eo - match[no].rm_so;
1295 				for (t = ip + match[no].rm_so; mlen--; ++t)
1296 					OUTCH(*t, 0);
1297 				continue;
1298 			case 'e':
1299 			case 'E':
1300 				++rp;
1301 				conv = C_NOTSET;
1302 				continue;
1303 			case 'l':
1304 				++rp;
1305 				conv = C_ONELOWER;
1306 				continue;
1307 			case 'L':
1308 				++rp;
1309 				conv = C_LOWER;
1310 				continue;
1311 			case 'u':
1312 				++rp;
1313 				conv = C_ONEUPPER;
1314 				continue;
1315 			case 'U':
1316 				++rp;
1317 				conv = C_UPPER;
1318 				continue;
1319 			default:
1320 				++rp;
1321 				break;
1322 			}
1323 		}
1324 		OUTCH(ch, 1);
1325 	}
1326 
1327 	*lbp = lb;			/* Update caller's information. */
1328 	*lbclenp = lbclen;
1329 	*lblenp = lblen;
1330 	return (0);
1331 }
1332