xref: /openbsd/usr.bin/mg/re_search.c (revision 79a91dcb)
1 /*	$OpenBSD: re_search.c,v 1.36 2021/04/22 19:50:55 lum Exp $	*/
2 
3 /* This file is in the public domain. */
4 
5 /*
6  *	regular expression search commands for Mg
7  *
8  * This file contains functions to implement several of gnuemacs's regular
9  * expression functions for Mg.  Several of the routines below are just minor
10  * re-arrangements of Mg's non-regular expression search functions.  Some of
11  * them are similar in structure to the original MicroEMACS, others are
12  * modifications of Rich Ellison's code.  Peter Newton re-wrote about half of
13  * them from scratch.
14  */
15 
16 #ifdef REGEX
17 #include <sys/queue.h>
18 #include <sys/types.h>
19 #include <regex.h>
20 #include <signal.h>
21 #include <stdio.h>
22 #include <string.h>
23 
24 #include "def.h"
25 #include "macro.h"
26 
/*
 * Search sub-codes.  re_srch_lastdir holds one of these values; the code
 * in this file only ever stores or tests SRCH_NOPR, SRCH_FORW and
 * SRCH_BACK.
 */
#define SRCH_BEGIN	(0)		/* search sub-codes		    */
#define SRCH_FORW	(-1)
#define SRCH_BACK	(-2)
#define SRCH_NOPR	(-3)
#define SRCH_ACCM	(-4)
#define SRCH_MARK	(-5)

/*
 * RE_NMATCH is the size of the regex_match[] array handed to regexec();
 * REPLEN is the size of the expansion buffer used by re_doreplace().
 */
#define RE_NMATCH	10		/* max number of matches	    */
#define REPLEN		256		/* max length of replacement string */

/*
 * re_pat is the text of the current pattern; an empty string means there
 * is no usable pattern.  re_srch_lastdir records the direction of the
 * last successful re_forwsearch()/re_backsearch() for re_searchagain().
 * casefoldsearch selects REG_ICASE when a pattern is compiled.
 */
char	re_pat[NPAT];			/* regex pattern		    */
int	re_srch_lastdir = SRCH_NOPR;	/* last search flags		    */
int	casefoldsearch = TRUE;		/* does search ignore case?	    */

/* Helpers private to this file. */
static int	 re_doreplace(RSIZE, char *);
static int	 re_forwsrch(void);
static int	 re_backsrch(void);
static int	 re_readpattern(char *);
static int	 killmatches(int);
static int	 countmatches(int);
47 
48 /*
49  * Search forward.
50  * Get a search string from the user and search for it starting at ".".  If
51  * found, move "." to just after the matched characters.  display does all
52  * the hard stuff.  If not found, it just prints a message.
53  */
54 /* ARGSUSED */
55 int
56 re_forwsearch(int f, int n)
57 {
58 	int	s;
59 
60 	if ((s = re_readpattern("RE Search")) != TRUE)
61 		return (s);
62 	if (re_forwsrch() == FALSE) {
63 		dobeep();
64 		ewprintf("Search failed: \"%s\"", re_pat);
65 		return (FALSE);
66 	}
67 	re_srch_lastdir = SRCH_FORW;
68 	return (TRUE);
69 }
70 
71 /*
72  * Reverse search.
73  * Get a search string from the user, and search, starting at "."
74  * and proceeding toward the front of the buffer. If found "." is left
75  * pointing at the first character of the pattern [the last character that
76  * was matched].
77  */
78 /* ARGSUSED */
79 int
80 re_backsearch(int f, int n)
81 {
82 	int	s;
83 
84 	if ((s = re_readpattern("RE Search backward")) != TRUE)
85 		return (s);
86 	if (re_backsrch() == FALSE) {
87 		dobeep();
88 		ewprintf("Search failed: \"%s\"", re_pat);
89 		return (FALSE);
90 	}
91 	re_srch_lastdir = SRCH_BACK;
92 	return (TRUE);
93 }
94 
95 /*
96  * Search again, using the same search string and direction as the last search
97  * command.  The direction has been saved in "srch_lastdir", so you know which
98  * way to go.
99  *
100  * XXX: This code has problems -- some incompatibility(?) with extend.c causes
101  * match to fail when it should not.
102  */
103 /* ARGSUSED */
104 int
105 re_searchagain(int f, int n)
106 {
107 	if (re_srch_lastdir == SRCH_NOPR) {
108 		dobeep();
109 		ewprintf("No last search");
110 		return (FALSE);
111 	}
112 	if (re_srch_lastdir == SRCH_FORW) {
113 		if (re_forwsrch() == FALSE) {
114 			dobeep();
115 			ewprintf("Search failed: \"%s\"", re_pat);
116 			return (FALSE);
117 		}
118 		return (TRUE);
119 	}
120 	if (re_srch_lastdir == SRCH_BACK)
121 		if (re_backsrch() == FALSE) {
122 			dobeep();
123 			ewprintf("Search failed: \"%s\"", re_pat);
124 			return (FALSE);
125 		}
126 
127 	return (TRUE);
128 }
129 
/*
 * Compiled regex goes here-- changed only when new pattern read
 * (by re_readpattern(), which compiles re_pat into it).
 */
static regex_t		regex_buff;
/*
 * Match offsets written by regexec().  Entry 0 describes the whole match
 * and is also loaded with the search range before each REG_STARTEND call.
 */
static regmatch_t	regex_match[RE_NMATCH];
133 
134 /*
135  * Re-Query Replace.
136  *	Replace strings selectively.  Does a search and replace operation.
137  */
138 /* ARGSUSED */
139 int
140 re_queryrepl(int f, int n)
141 {
142 	int	rcnt = 0;		/* replacements made so far	*/
143 	int	plen, s;		/* length of found string	*/
144 	char	news[NPAT];		/* replacement string		*/
145 
146 	if ((s = re_readpattern("RE Query replace")) != TRUE)
147 		return (s);
148 	if (eread("Query replace %s with: ", news, NPAT,
149 	    EFNUL | EFNEW | EFCR, re_pat) == NULL)
150 		return (ABORT);
151 	ewprintf("Query replacing %s with %s:", re_pat, news);
152 
153 	/*
154 	 * Search forward repeatedly, checking each time whether to insert
155 	 * or not.  The "!" case makes the check always true, so it gets put
156 	 * into a tighter loop for efficiency.
157 	 */
158 	while (re_forwsrch() == TRUE) {
159 retry:
160 		update(CMODE);
161 		switch (getkey(FALSE)) {
162 		case ' ':
163 			plen = regex_match[0].rm_eo - regex_match[0].rm_so;
164 			if (re_doreplace((RSIZE)plen, news) == FALSE)
165 				return (FALSE);
166 			rcnt++;
167 			break;
168 
169 		case '.':
170 			plen = regex_match[0].rm_eo - regex_match[0].rm_so;
171 			if (re_doreplace((RSIZE)plen, news) == FALSE)
172 				return (FALSE);
173 			rcnt++;
174 			goto stopsearch;
175 
176 		case CCHR('G'):				/* ^G */
177 			(void)ctrlg(FFRAND, 0);
178 			goto stopsearch;
179 		case CCHR('['):				/* ESC */
180 		case '`':
181 			goto stopsearch;
182 		case '!':
183 			do {
184 				plen = regex_match[0].rm_eo - regex_match[0].rm_so;
185 				if (re_doreplace((RSIZE)plen, news) == FALSE)
186 					return (FALSE);
187 				rcnt++;
188 			} while (re_forwsrch() == TRUE);
189 			goto stopsearch;
190 
191 		case CCHR('?'):				/* To not replace */
192 			break;
193 
194 		default:
195 			ewprintf("<SP> replace, [.] rep-end, <DEL> don't, [!] repl rest <ESC> quit");
196 			goto retry;
197 		}
198 	}
199 
200 stopsearch:
201 	curwp->w_rflag |= WFFULL;
202 	update(CMODE);
203 	if (!inmacro) {
204 		if (rcnt == 0)
205 			ewprintf("(No replacements done)");
206 		else if (rcnt == 1)
207 			ewprintf("(1 replacement done)");
208 		else
209 			ewprintf("(%d replacements done)", rcnt);
210 	}
211 	return (TRUE);
212 }
213 
214 int
215 re_repl(int f, int n)
216 {
217 	int     rcnt = 0;		/* replacements made so far     */
218 	int     plen, s;		/* length of found string       */
219 	char    news[NPAT];		/* replacement string           */
220 
221 	if ((s = re_readpattern("RE Replace")) != TRUE)
222 		return (s);
223 	if (eread("Replace %s with: ", news, NPAT,
224 	    EFNUL | EFNEW | EFCR, re_pat) == NULL)
225                 return (ABORT);
226 
227 	while (re_forwsrch() == TRUE) {
228 		plen = regex_match[0].rm_eo - regex_match[0].rm_so;
229 		if (re_doreplace((RSIZE)plen, news) == FALSE)
230 			return (FALSE);
231 		rcnt++;
232 	}
233 
234 	curwp->w_rflag |= WFFULL;
235 	update(CMODE);
236 	if (!inmacro)
237 		ewprintf("(%d replacement(s) done)", rcnt);
238 
239 	return(TRUE);
240 }
241 
242 /*
243  * Routine re_doreplace calls lreplace to make replacements needed by
244  * re_query replace.  Its reason for existence is to deal with \1, \2. etc.
245  *  plen: length to remove
246  *  st:   replacement string
247  */
248 static int
249 re_doreplace(RSIZE plen, char *st)
250 {
251 	int	 j, k, s, more, num, state;
252 	struct line	*clp;
253 	char	 repstr[REPLEN];
254 
255 	clp = curwp->w_dotp;
256 	more = TRUE;
257 	j = 0;
258 	state = 0;
259 	num = 0;
260 
261 	/* The following FSA parses the replacement string */
262 	while (more) {
263 		switch (state) {
264 		case 0:
265 			if (*st == '\\') {
266 				st++;
267 				state = 1;
268 			} else if (*st == '\0')
269 				more = FALSE;
270 			else {
271 				repstr[j] = *st;
272 				j++;
273 				if (j >= REPLEN)
274 					return (FALSE);
275 				st++;
276 			}
277 			break;
278 		case 1:
279 			if (*st >= '0' && *st <= '9') {
280 				num = *st - '0';
281 				st++;
282 				state = 2;
283 			} else if (*st == '\0')
284 				more = FALSE;
285 			else {
286 				repstr[j] = *st;
287 				j++;
288 				if (j >= REPLEN)
289 					return (FALSE);
290 				st++;
291 				state = 0;
292 			}
293 			break;
294 		case 2:
295 			if (*st >= '0' && *st <= '9') {
296 				num = 10 * num + *st - '0';
297 				st++;
298 			} else {
299 				if (num >= RE_NMATCH)
300 					return (FALSE);
301 				k = regex_match[num].rm_eo - regex_match[num].rm_so;
302 				if (j + k >= REPLEN)
303 					return (FALSE);
304 				bcopy(&(clp->l_text[regex_match[num].rm_so]),
305 				    &repstr[j], k);
306 				j += k;
307 				if (*st == '\0')
308 					more = FALSE;
309 				if (*st == '\\') {
310 					st++;
311 					state = 1;
312 				} else {
313 					repstr[j] = *st;
314 					j++;
315 					if (j >= REPLEN)
316 						return (FALSE);
317 					st++;
318 					state = 0;
319 				}
320 			}
321 			break;
322 		}		/* switch (state) */
323 	}			/* while (more)   */
324 
325 	repstr[j] = '\0';
326 	s = lreplace(plen, repstr);
327 	return (s);
328 }
329 
330 /*
331  * This routine does the real work of a forward search.  The pattern is
332  * sitting in the external variable "pat".  If found, dot is updated, the
333  * window system is notified of the change, and TRUE is returned.  If the
334  * string isn't found, FALSE is returned.
335  */
336 static int
337 re_forwsrch(void)
338 {
339 	int	 	 re_flags, tbo, tdotline, error;
340 	struct line	*clp;
341 
342 	clp = curwp->w_dotp;
343 	tbo = curwp->w_doto;
344 	tdotline = curwp->w_dotline;
345 
346 	if (tbo == clp->l_used)
347 		/*
348 		 * Don't start matching past end of line -- must move to
349 		 * beginning of next line, unless line is empty or at
350 		 * end of file.
351 		 */
352 		if (clp != curbp->b_headp && llength(clp) != 0) {
353 			clp = lforw(clp);
354 			tdotline++;
355 			tbo = 0;
356 		}
357 	/*
358 	 * Note this loop does not process the last line, but this editor
359 	 * always makes the last line empty so this is good.
360 	 */
361 	while (clp != (curbp->b_headp)) {
362 		re_flags = REG_STARTEND;
363 		if (tbo != 0)
364 			re_flags |= REG_NOTBOL;
365 		regex_match[0].rm_so = tbo;
366 		regex_match[0].rm_eo = llength(clp);
367 		error = regexec(&regex_buff, ltext(clp) ? ltext(clp) : "",
368 		    RE_NMATCH, regex_match, re_flags);
369 		if (error != 0) {
370 			clp = lforw(clp);
371 			tdotline++;
372 			tbo = 0;
373 		} else {
374 			curwp->w_doto = regex_match[0].rm_eo;
375 			curwp->w_dotp = clp;
376 			curwp->w_dotline = tdotline;
377 			curwp->w_rflag |= WFMOVE;
378 			return (TRUE);
379 		}
380 	}
381 	return (FALSE);
382 }
383 
384 /*
385  * This routine does the real work of a backward search.  The pattern is sitting
386  * in the external variable "re_pat".  If found, dot is updated, the window
387  * system is notified of the change, and TRUE is returned.  If the string isn't
388  * found, FALSE is returned.
389  */
390 static int
391 re_backsrch(void)
392 {
393 	struct line		*clp;
394 	int		 tbo, tdotline;
395 	regmatch_t	 lastmatch;
396 
397 	clp = curwp->w_dotp;
398 	tbo = curwp->w_doto;
399 	tdotline = curwp->w_dotline;
400 
401 	/* Start search one position to the left of dot */
402 	tbo = tbo - 1;
403 	if (tbo < 0) {
404 		/* must move up one line */
405 		clp = lback(clp);
406 		tdotline--;
407 		tbo = llength(clp);
408 	}
409 
410 	/*
411 	 * Note this loop does not process the last line, but this editor
412 	 * always makes the last line empty so this is good.
413 	 */
414 	while (clp != (curbp->b_headp)) {
415 		regex_match[0].rm_so = 0;
416 		regex_match[0].rm_eo = llength(clp);
417 		lastmatch.rm_so = -1;
418 		/*
419 		 * Keep searching until we don't match any longer.  Assumes a
420 		 * non-match does not modify the regex_match array.  We have to
421 		 * do this character-by-character after the first match since
422 		 * POSIX regexps don't give you a way to do reverse matches.
423 		 */
424 		while (!regexec(&regex_buff, ltext(clp) ? ltext(clp) : "",
425 		    RE_NMATCH, regex_match, REG_STARTEND) &&
426 		    regex_match[0].rm_so <= tbo) {
427 			memcpy(&lastmatch, &regex_match[0], sizeof(regmatch_t));
428 			regex_match[0].rm_so++;
429 			regex_match[0].rm_eo = llength(clp);
430 		}
431 		if (lastmatch.rm_so == -1) {
432 			clp = lback(clp);
433 			tdotline--;
434 			tbo = llength(clp);
435 		} else {
436 			memcpy(&regex_match[0], &lastmatch, sizeof(regmatch_t));
437 			curwp->w_doto = regex_match[0].rm_so;
438 			curwp->w_dotp = clp;
439 			curwp->w_dotline = tdotline;
440 			curwp->w_rflag |= WFMOVE;
441 			return (TRUE);
442 		}
443 	}
444 	return (FALSE);
445 }
446 
447 /*
448  * Read a pattern.
449  * Stash it in the external variable "re_pat". The "pat" is
450  * not updated if the user types in an empty line. If the user typed
451  * an empty line, and there is no old pattern, it is an error.
452  * Display the old pattern, in the style of Jeff Lomicka. There is
453  * some do-it-yourself control expansion.
454  */
455 static int
456 re_readpattern(char *re_prompt)
457 {
458 	static int	dofree = 0;
459 	int		flags, error, s;
460 	char		tpat[NPAT], *rep;
461 
462 	if (re_pat[0] == '\0')
463 		rep = eread("%s: ", tpat, NPAT, EFNEW | EFCR, re_prompt);
464 	else
465 		rep = eread("%s (default %s): ", tpat, NPAT,
466 		    EFNUL | EFNEW | EFCR, re_prompt, re_pat);
467 	if (rep == NULL)
468 		return (ABORT);
469 	if (rep[0] != '\0') {
470 		/* New pattern given */
471 		(void)strlcpy(re_pat, tpat, sizeof(re_pat));
472 		if (casefoldsearch)
473 			flags = REG_EXTENDED | REG_ICASE;
474 		else
475 			flags = REG_EXTENDED;
476 		if (dofree)
477 			regfree(&regex_buff);
478 		error = regcomp(&regex_buff, re_pat, flags);
479 		if (error != 0) {
480 			char	message[256];
481 			regerror(error, &regex_buff, message, sizeof(message));
482 			dobeep();
483 			ewprintf("Regex Error: %s", message);
484 			re_pat[0] = '\0';
485 			return (FALSE);
486 		}
487 		dofree = 1;
488 		s = TRUE;
489 	} else if (rep[0] == '\0' && re_pat[0] != '\0')
490 		/* Just using old pattern */
491 		s = TRUE;
492 	else
493 		s = FALSE;
494 	return (s);
495 }
496 
497 /*
498  * Cause case to not matter in searches.  This is the default.	If called
499  * with argument cause case to matter.
500  */
501 /* ARGSUSED*/
502 int
503 setcasefold(int f, int n)
504 {
505 	if (f & FFARG) {
506 		casefoldsearch = FALSE;
507 		ewprintf("Case-fold-search unset");
508 	} else {
509 		casefoldsearch = TRUE;
510 		ewprintf("Case-fold-search set");
511 	}
512 
513 	/*
514 	 * Invalidate the regular expression pattern since I'm too lazy to
515 	 * recompile it.
516 	 */
517 	re_pat[0] = '\0';
518 	return (TRUE);
519 }
520 
521 /*
522  * Delete all lines after dot that contain a string matching regex.
523  */
524 /* ARGSUSED */
525 int
526 delmatchlines(int f, int n)
527 {
528 	int	s;
529 
530 	if ((s = re_readpattern("Flush lines (containing match for regexp)"))
531 	    != TRUE)
532 		return (s);
533 
534 	s = killmatches(TRUE);
535 	return (s);
536 }
537 
538 /*
539  * Delete all lines after dot that don't contain a string matching regex.
540  */
541 /* ARGSUSED */
542 int
543 delnonmatchlines(int f, int n)
544 {
545 	int	s;
546 
547 	if ((s = re_readpattern("Keep lines (containing match for regexp)"))
548 	    != TRUE)
549 		return (s);
550 
551 	s = killmatches(FALSE);
552 	return (s);
553 }
554 
555 /*
556  * This function does the work of deleting matching lines.
557  */
558 static int
559 killmatches(int cond)
560 {
561 	int	 s, error;
562 	int	 count = 0;
563 	struct line	*clp;
564 
565 	clp = curwp->w_dotp;
566 	if (curwp->w_doto == llength(clp))
567 		/* Consider dot on next line */
568 		clp = lforw(clp);
569 
570 	while (clp != (curbp->b_headp)) {
571 		/* see if line matches */
572 		regex_match[0].rm_so = 0;
573 		regex_match[0].rm_eo = llength(clp);
574 		error = regexec(&regex_buff, ltext(clp) ? ltext(clp) : "",
575 		    RE_NMATCH, regex_match, REG_STARTEND);
576 
577 		/* Delete line when appropriate */
578 		if ((cond == FALSE && error) || (cond == TRUE && !error)) {
579 			curwp->w_doto = 0;
580 			curwp->w_dotp = clp;
581 			count++;
582 			s = ldelete(llength(clp) + 1, KNONE);
583 			clp = curwp->w_dotp;
584 			curwp->w_rflag |= WFMOVE;
585 			if (s == FALSE)
586 				return (FALSE);
587 		} else
588 			clp = lforw(clp);
589 	}
590 
591 	ewprintf("%d line(s) deleted", count);
592 	if (count > 0)
593 		curwp->w_rflag |= WFMOVE;
594 
595 	return (TRUE);
596 }
597 
598 /*
599  * Count lines matching regex.
600  */
601 /* ARGSUSED */
602 int
603 cntmatchlines(int f, int n)
604 {
605 	int	s;
606 
607 	if ((s = re_readpattern("Count lines (matching regexp)")) != TRUE)
608 		return (s);
609 	s = countmatches(TRUE);
610 
611 	return (s);
612 }
613 
614 /*
615  * Count lines that fail to match regex.
616  */
617 /* ARGSUSED */
618 int
619 cntnonmatchlines(int f, int n)
620 {
621 	int	s;
622 
623 	if ((s = re_readpattern("Count lines (not matching regexp)")) != TRUE)
624 		return (s);
625 	s = countmatches(FALSE);
626 
627 	return (s);
628 }
629 
630 /*
631  * This function does the work of counting matching lines.
632  */
633 int
634 countmatches(int cond)
635 {
636 	int	 error;
637 	int	 count = 0;
638 	struct line	*clp;
639 
640 	clp = curwp->w_dotp;
641 	if (curwp->w_doto == llength(clp))
642 		/* Consider dot on next line */
643 		clp = lforw(clp);
644 
645 	while (clp != (curbp->b_headp)) {
646 		/* see if line matches */
647 		regex_match[0].rm_so = 0;
648 		regex_match[0].rm_eo = llength(clp);
649 		error = regexec(&regex_buff, ltext(clp) ? ltext(clp) : "",
650 		    RE_NMATCH, regex_match, REG_STARTEND);
651 
652 		/* Count line when appropriate */
653 		if ((cond == FALSE && error) || (cond == TRUE && !error))
654 			count++;
655 		clp = lforw(clp);
656 	}
657 
658 	if (cond)
659 		ewprintf("Number of lines matching: %d", count);
660 	else
661 		ewprintf("Number of lines not matching: %d", count);
662 
663 	return (TRUE);
664 }
665 #endif	/* REGEX */
666