xref: /openbsd/usr.bin/mg/re_search.c (revision 8a6c6275)
1 /*	$OpenBSD: re_search.c,v 1.35 2020/07/22 13:29:05 tb Exp $	*/
2 
3 /* This file is in the public domain. */
4 
5 /*
6  *	regular expression search commands for Mg
7  *
8  * This file contains functions to implement several of gnuemacs's regular
9  * expression functions for Mg.  Several of the routines below are just minor
10  * re-arrangements of Mg's non-regular expression search functions.  Some of
11  * them are similar in structure to the original MicroEMACS, others are
12  * modifications of Rich Ellison's code.  Peter Newton re-wrote about half of
13  * them from scratch.
14  */
15 
16 #ifdef REGEX
17 #include <sys/queue.h>
18 #include <sys/types.h>
19 #include <regex.h>
20 #include <signal.h>
21 #include <stdio.h>
22 #include <string.h>
23 
24 #include "def.h"
25 #include "macro.h"
26 
/*
 * Search sub-codes.  Within this file only SRCH_FORW, SRCH_BACK and
 * SRCH_NOPR are referenced: they are the values stored in re_srch_lastdir.
 */
#define SRCH_BEGIN	(0)		/* search sub-codes		    */
#define SRCH_FORW	(-1)
#define SRCH_BACK	(-2)
#define SRCH_NOPR	(-3)
#define SRCH_ACCM	(-4)
#define SRCH_MARK	(-5)

/*
 * RE_NMATCH sizes the regex_match[] array handed to regexec(); REPLEN sizes
 * the buffer in which re_doreplace() builds the expanded replacement text.
 */
#define RE_NMATCH	10		/* max number of matches	    */
#define REPLEN		256		/* max length of replacement string */

/*
 * re_pat holds the text of the current pattern; an empty string means
 * "no pattern" (it is cleared on a compile error and by setcasefold()).
 * re_srch_lastdir starts as SRCH_NOPR and is set to SRCH_FORW / SRCH_BACK by
 * a successful search so that re_searchagain() knows which way to go.
 * casefoldsearch selects REG_ICASE when the pattern is compiled.
 */
char	re_pat[NPAT];			/* regex pattern		    */
int	re_srch_lastdir = SRCH_NOPR;	/* last search flags		    */
int	casefoldsearch = TRUE;		/* does search ignore case?	    */

/* Helpers private to this file. */
static int	 re_doreplace(RSIZE, char *);
static int	 re_forwsrch(void);
static int	 re_backsrch(void);
static int	 re_readpattern(char *);
static int	 killmatches(int);
static int	 countmatches(int);
47 
48 /*
49  * Search forward.
50  * Get a search string from the user and search for it starting at ".".  If
51  * found, move "." to just after the matched characters.  display does all
52  * the hard stuff.  If not found, it just prints a message.
53  */
54 /* ARGSUSED */
55 int
56 re_forwsearch(int f, int n)
57 {
58 	int	s;
59 
60 	if ((s = re_readpattern("RE Search")) != TRUE)
61 		return (s);
62 	if (re_forwsrch() == FALSE) {
63 		dobeep();
64 		ewprintf("Search failed: \"%s\"", re_pat);
65 		return (FALSE);
66 	}
67 	re_srch_lastdir = SRCH_FORW;
68 	return (TRUE);
69 }
70 
71 /*
72  * Reverse search.
73  * Get a search string from the user, and search, starting at "."
74  * and proceeding toward the front of the buffer. If found "." is left
75  * pointing at the first character of the pattern [the last character that
76  * was matched].
77  */
78 /* ARGSUSED */
79 int
80 re_backsearch(int f, int n)
81 {
82 	int	s;
83 
84 	if ((s = re_readpattern("RE Search backward")) != TRUE)
85 		return (s);
86 	if (re_backsrch() == FALSE) {
87 		dobeep();
88 		ewprintf("Search failed: \"%s\"", re_pat);
89 		return (FALSE);
90 	}
91 	re_srch_lastdir = SRCH_BACK;
92 	return (TRUE);
93 }
94 
95 /*
96  * Search again, using the same search string and direction as the last search
97  * command.  The direction has been saved in "srch_lastdir", so you know which
98  * way to go.
99  *
100  * XXX: This code has problems -- some incompatibility(?) with extend.c causes
101  * match to fail when it should not.
102  */
103 /* ARGSUSED */
104 int
105 re_searchagain(int f, int n)
106 {
107 	if (re_srch_lastdir == SRCH_NOPR) {
108 		dobeep();
109 		ewprintf("No last search");
110 		return (FALSE);
111 	}
112 	if (re_srch_lastdir == SRCH_FORW) {
113 		if (re_forwsrch() == FALSE) {
114 			dobeep();
115 			ewprintf("Search failed: \"%s\"", re_pat);
116 			return (FALSE);
117 		}
118 		return (TRUE);
119 	}
120 	if (re_srch_lastdir == SRCH_BACK)
121 		if (re_backsrch() == FALSE) {
122 			dobeep();
123 			ewprintf("Search failed: \"%s\"", re_pat);
124 			return (FALSE);
125 		}
126 
127 	return (TRUE);
128 }
129 
/*
 * Compiled regex goes here -- changed only when a new pattern is read
 * (see re_readpattern()).
 */
static regex_t		regex_buff;
/*
 * Match offsets filled in by the most recent regexec() call.  Entry 0 is
 * also used to hand the search window to regexec() under REG_STARTEND;
 * the remaining entries are read by re_doreplace() for \N expansion.
 */
static regmatch_t	regex_match[RE_NMATCH];
133 
134 /*
135  * Re-Query Replace.
136  *	Replace strings selectively.  Does a search and replace operation.
137  */
138 /* ARGSUSED */
139 int
140 re_queryrepl(int f, int n)
141 {
142 	int	rcnt = 0;		/* replacements made so far	*/
143 	int	plen, s;		/* length of found string	*/
144 	char	news[NPAT];		/* replacement string		*/
145 
146 	if ((s = re_readpattern("RE Query replace")) != TRUE)
147 		return (s);
148 	if (eread("Query replace %s with: ", news, NPAT,
149 	    EFNUL | EFNEW | EFCR, re_pat) == NULL)
150 		return (ABORT);
151 	ewprintf("Query replacing %s with %s:", re_pat, news);
152 
153 	/*
154 	 * Search forward repeatedly, checking each time whether to insert
155 	 * or not.  The "!" case makes the check always true, so it gets put
156 	 * into a tighter loop for efficiency.
157 	 */
158 	while (re_forwsrch() == TRUE) {
159 retry:
160 		update(CMODE);
161 		switch (getkey(FALSE)) {
162 		case ' ':
163 			plen = regex_match[0].rm_eo - regex_match[0].rm_so;
164 			if (re_doreplace((RSIZE)plen, news) == FALSE)
165 				return (FALSE);
166 			rcnt++;
167 			break;
168 
169 		case '.':
170 			plen = regex_match[0].rm_eo - regex_match[0].rm_so;
171 			if (re_doreplace((RSIZE)plen, news) == FALSE)
172 				return (FALSE);
173 			rcnt++;
174 			goto stopsearch;
175 
176 		case CCHR('G'):				/* ^G */
177 			(void)ctrlg(FFRAND, 0);
178 			goto stopsearch;
179 		case CCHR('['):				/* ESC */
180 		case '`':
181 			goto stopsearch;
182 		case '!':
183 			do {
184 				plen = regex_match[0].rm_eo - regex_match[0].rm_so;
185 				if (re_doreplace((RSIZE)plen, news) == FALSE)
186 					return (FALSE);
187 				rcnt++;
188 			} while (re_forwsrch() == TRUE);
189 			goto stopsearch;
190 
191 		case CCHR('?'):				/* To not replace */
192 			break;
193 
194 		default:
195 			ewprintf("<SP> replace, [.] rep-end, <DEL> don't, [!] repl rest <ESC> quit");
196 			goto retry;
197 		}
198 	}
199 
200 stopsearch:
201 	curwp->w_rflag |= WFFULL;
202 	update(CMODE);
203 	if (!inmacro) {
204 		if (rcnt == 0)
205 			ewprintf("(No replacements done)");
206 		else if (rcnt == 1)
207 			ewprintf("(1 replacement done)");
208 		else
209 			ewprintf("(%d replacements done)", rcnt);
210 	}
211 	return (TRUE);
212 }
213 
214 /*
215  * Routine re_doreplace calls lreplace to make replacements needed by
216  * re_query replace.  Its reason for existence is to deal with \1, \2. etc.
217  *  plen: length to remove
218  *  st:   replacement string
219  */
220 static int
221 re_doreplace(RSIZE plen, char *st)
222 {
223 	int	 j, k, s, more, num, state;
224 	struct line	*clp;
225 	char	 repstr[REPLEN];
226 
227 	clp = curwp->w_dotp;
228 	more = TRUE;
229 	j = 0;
230 	state = 0;
231 	num = 0;
232 
233 	/* The following FSA parses the replacement string */
234 	while (more) {
235 		switch (state) {
236 		case 0:
237 			if (*st == '\\') {
238 				st++;
239 				state = 1;
240 			} else if (*st == '\0')
241 				more = FALSE;
242 			else {
243 				repstr[j] = *st;
244 				j++;
245 				if (j >= REPLEN)
246 					return (FALSE);
247 				st++;
248 			}
249 			break;
250 		case 1:
251 			if (*st >= '0' && *st <= '9') {
252 				num = *st - '0';
253 				st++;
254 				state = 2;
255 			} else if (*st == '\0')
256 				more = FALSE;
257 			else {
258 				repstr[j] = *st;
259 				j++;
260 				if (j >= REPLEN)
261 					return (FALSE);
262 				st++;
263 				state = 0;
264 			}
265 			break;
266 		case 2:
267 			if (*st >= '0' && *st <= '9') {
268 				num = 10 * num + *st - '0';
269 				st++;
270 			} else {
271 				if (num >= RE_NMATCH)
272 					return (FALSE);
273 				k = regex_match[num].rm_eo - regex_match[num].rm_so;
274 				if (j + k >= REPLEN)
275 					return (FALSE);
276 				bcopy(&(clp->l_text[regex_match[num].rm_so]),
277 				    &repstr[j], k);
278 				j += k;
279 				if (*st == '\0')
280 					more = FALSE;
281 				if (*st == '\\') {
282 					st++;
283 					state = 1;
284 				} else {
285 					repstr[j] = *st;
286 					j++;
287 					if (j >= REPLEN)
288 						return (FALSE);
289 					st++;
290 					state = 0;
291 				}
292 			}
293 			break;
294 		}		/* switch (state) */
295 	}			/* while (more)   */
296 
297 	repstr[j] = '\0';
298 	s = lreplace(plen, repstr);
299 	return (s);
300 }
301 
302 /*
303  * This routine does the real work of a forward search.  The pattern is
304  * sitting in the external variable "pat".  If found, dot is updated, the
305  * window system is notified of the change, and TRUE is returned.  If the
306  * string isn't found, FALSE is returned.
307  */
308 static int
309 re_forwsrch(void)
310 {
311 	int	 	 re_flags, tbo, tdotline, error;
312 	struct line	*clp;
313 
314 	clp = curwp->w_dotp;
315 	tbo = curwp->w_doto;
316 	tdotline = curwp->w_dotline;
317 
318 	if (tbo == clp->l_used)
319 		/*
320 		 * Don't start matching past end of line -- must move to
321 		 * beginning of next line, unless line is empty or at
322 		 * end of file.
323 		 */
324 		if (clp != curbp->b_headp && llength(clp) != 0) {
325 			clp = lforw(clp);
326 			tdotline++;
327 			tbo = 0;
328 		}
329 	/*
330 	 * Note this loop does not process the last line, but this editor
331 	 * always makes the last line empty so this is good.
332 	 */
333 	while (clp != (curbp->b_headp)) {
334 		re_flags = REG_STARTEND;
335 		if (tbo != 0)
336 			re_flags |= REG_NOTBOL;
337 		regex_match[0].rm_so = tbo;
338 		regex_match[0].rm_eo = llength(clp);
339 		error = regexec(&regex_buff, ltext(clp) ? ltext(clp) : "",
340 		    RE_NMATCH, regex_match, re_flags);
341 		if (error != 0) {
342 			clp = lforw(clp);
343 			tdotline++;
344 			tbo = 0;
345 		} else {
346 			curwp->w_doto = regex_match[0].rm_eo;
347 			curwp->w_dotp = clp;
348 			curwp->w_dotline = tdotline;
349 			curwp->w_rflag |= WFMOVE;
350 			return (TRUE);
351 		}
352 	}
353 	return (FALSE);
354 }
355 
356 /*
357  * This routine does the real work of a backward search.  The pattern is sitting
358  * in the external variable "re_pat".  If found, dot is updated, the window
359  * system is notified of the change, and TRUE is returned.  If the string isn't
360  * found, FALSE is returned.
361  */
362 static int
363 re_backsrch(void)
364 {
365 	struct line		*clp;
366 	int		 tbo, tdotline;
367 	regmatch_t	 lastmatch;
368 
369 	clp = curwp->w_dotp;
370 	tbo = curwp->w_doto;
371 	tdotline = curwp->w_dotline;
372 
373 	/* Start search one position to the left of dot */
374 	tbo = tbo - 1;
375 	if (tbo < 0) {
376 		/* must move up one line */
377 		clp = lback(clp);
378 		tdotline--;
379 		tbo = llength(clp);
380 	}
381 
382 	/*
383 	 * Note this loop does not process the last line, but this editor
384 	 * always makes the last line empty so this is good.
385 	 */
386 	while (clp != (curbp->b_headp)) {
387 		regex_match[0].rm_so = 0;
388 		regex_match[0].rm_eo = llength(clp);
389 		lastmatch.rm_so = -1;
390 		/*
391 		 * Keep searching until we don't match any longer.  Assumes a
392 		 * non-match does not modify the regex_match array.  We have to
393 		 * do this character-by-character after the first match since
394 		 * POSIX regexps don't give you a way to do reverse matches.
395 		 */
396 		while (!regexec(&regex_buff, ltext(clp) ? ltext(clp) : "",
397 		    RE_NMATCH, regex_match, REG_STARTEND) &&
398 		    regex_match[0].rm_so <= tbo) {
399 			memcpy(&lastmatch, &regex_match[0], sizeof(regmatch_t));
400 			regex_match[0].rm_so++;
401 			regex_match[0].rm_eo = llength(clp);
402 		}
403 		if (lastmatch.rm_so == -1) {
404 			clp = lback(clp);
405 			tdotline--;
406 			tbo = llength(clp);
407 		} else {
408 			memcpy(&regex_match[0], &lastmatch, sizeof(regmatch_t));
409 			curwp->w_doto = regex_match[0].rm_so;
410 			curwp->w_dotp = clp;
411 			curwp->w_dotline = tdotline;
412 			curwp->w_rflag |= WFMOVE;
413 			return (TRUE);
414 		}
415 	}
416 	return (FALSE);
417 }
418 
419 /*
420  * Read a pattern.
421  * Stash it in the external variable "re_pat". The "pat" is
422  * not updated if the user types in an empty line. If the user typed
423  * an empty line, and there is no old pattern, it is an error.
424  * Display the old pattern, in the style of Jeff Lomicka. There is
425  * some do-it-yourself control expansion.
426  */
427 static int
428 re_readpattern(char *re_prompt)
429 {
430 	static int	dofree = 0;
431 	int		flags, error, s;
432 	char		tpat[NPAT], *rep;
433 
434 	if (re_pat[0] == '\0')
435 		rep = eread("%s: ", tpat, NPAT, EFNEW | EFCR, re_prompt);
436 	else
437 		rep = eread("%s (default %s): ", tpat, NPAT,
438 		    EFNUL | EFNEW | EFCR, re_prompt, re_pat);
439 	if (rep == NULL)
440 		return (ABORT);
441 	if (rep[0] != '\0') {
442 		/* New pattern given */
443 		(void)strlcpy(re_pat, tpat, sizeof(re_pat));
444 		if (casefoldsearch)
445 			flags = REG_EXTENDED | REG_ICASE;
446 		else
447 			flags = REG_EXTENDED;
448 		if (dofree)
449 			regfree(&regex_buff);
450 		error = regcomp(&regex_buff, re_pat, flags);
451 		if (error != 0) {
452 			char	message[256];
453 			regerror(error, &regex_buff, message, sizeof(message));
454 			dobeep();
455 			ewprintf("Regex Error: %s", message);
456 			re_pat[0] = '\0';
457 			return (FALSE);
458 		}
459 		dofree = 1;
460 		s = TRUE;
461 	} else if (rep[0] == '\0' && re_pat[0] != '\0')
462 		/* Just using old pattern */
463 		s = TRUE;
464 	else
465 		s = FALSE;
466 	return (s);
467 }
468 
469 /*
470  * Cause case to not matter in searches.  This is the default.	If called
471  * with argument cause case to matter.
472  */
473 /* ARGSUSED*/
474 int
475 setcasefold(int f, int n)
476 {
477 	if (f & FFARG) {
478 		casefoldsearch = FALSE;
479 		ewprintf("Case-fold-search unset");
480 	} else {
481 		casefoldsearch = TRUE;
482 		ewprintf("Case-fold-search set");
483 	}
484 
485 	/*
486 	 * Invalidate the regular expression pattern since I'm too lazy to
487 	 * recompile it.
488 	 */
489 	re_pat[0] = '\0';
490 	return (TRUE);
491 }
492 
493 /*
494  * Delete all lines after dot that contain a string matching regex.
495  */
496 /* ARGSUSED */
497 int
498 delmatchlines(int f, int n)
499 {
500 	int	s;
501 
502 	if ((s = re_readpattern("Flush lines (containing match for regexp)"))
503 	    != TRUE)
504 		return (s);
505 
506 	s = killmatches(TRUE);
507 	return (s);
508 }
509 
510 /*
511  * Delete all lines after dot that don't contain a string matching regex.
512  */
513 /* ARGSUSED */
514 int
515 delnonmatchlines(int f, int n)
516 {
517 	int	s;
518 
519 	if ((s = re_readpattern("Keep lines (containing match for regexp)"))
520 	    != TRUE)
521 		return (s);
522 
523 	s = killmatches(FALSE);
524 	return (s);
525 }
526 
527 /*
528  * This function does the work of deleting matching lines.
529  */
530 static int
531 killmatches(int cond)
532 {
533 	int	 s, error;
534 	int	 count = 0;
535 	struct line	*clp;
536 
537 	clp = curwp->w_dotp;
538 	if (curwp->w_doto == llength(clp))
539 		/* Consider dot on next line */
540 		clp = lforw(clp);
541 
542 	while (clp != (curbp->b_headp)) {
543 		/* see if line matches */
544 		regex_match[0].rm_so = 0;
545 		regex_match[0].rm_eo = llength(clp);
546 		error = regexec(&regex_buff, ltext(clp) ? ltext(clp) : "",
547 		    RE_NMATCH, regex_match, REG_STARTEND);
548 
549 		/* Delete line when appropriate */
550 		if ((cond == FALSE && error) || (cond == TRUE && !error)) {
551 			curwp->w_doto = 0;
552 			curwp->w_dotp = clp;
553 			count++;
554 			s = ldelete(llength(clp) + 1, KNONE);
555 			clp = curwp->w_dotp;
556 			curwp->w_rflag |= WFMOVE;
557 			if (s == FALSE)
558 				return (FALSE);
559 		} else
560 			clp = lforw(clp);
561 	}
562 
563 	ewprintf("%d line(s) deleted", count);
564 	if (count > 0)
565 		curwp->w_rflag |= WFMOVE;
566 
567 	return (TRUE);
568 }
569 
570 /*
571  * Count lines matching regex.
572  */
573 /* ARGSUSED */
574 int
575 cntmatchlines(int f, int n)
576 {
577 	int	s;
578 
579 	if ((s = re_readpattern("Count lines (matching regexp)")) != TRUE)
580 		return (s);
581 	s = countmatches(TRUE);
582 
583 	return (s);
584 }
585 
586 /*
587  * Count lines that fail to match regex.
588  */
589 /* ARGSUSED */
590 int
591 cntnonmatchlines(int f, int n)
592 {
593 	int	s;
594 
595 	if ((s = re_readpattern("Count lines (not matching regexp)")) != TRUE)
596 		return (s);
597 	s = countmatches(FALSE);
598 
599 	return (s);
600 }
601 
602 /*
603  * This function does the work of counting matching lines.
604  */
605 int
606 countmatches(int cond)
607 {
608 	int	 error;
609 	int	 count = 0;
610 	struct line	*clp;
611 
612 	clp = curwp->w_dotp;
613 	if (curwp->w_doto == llength(clp))
614 		/* Consider dot on next line */
615 		clp = lforw(clp);
616 
617 	while (clp != (curbp->b_headp)) {
618 		/* see if line matches */
619 		regex_match[0].rm_so = 0;
620 		regex_match[0].rm_eo = llength(clp);
621 		error = regexec(&regex_buff, ltext(clp) ? ltext(clp) : "",
622 		    RE_NMATCH, regex_match, REG_STARTEND);
623 
624 		/* Count line when appropriate */
625 		if ((cond == FALSE && error) || (cond == TRUE && !error))
626 			count++;
627 		clp = lforw(clp);
628 	}
629 
630 	if (cond)
631 		ewprintf("Number of lines matching: %d", count);
632 	else
633 		ewprintf("Number of lines not matching: %d", count);
634 
635 	return (TRUE);
636 }
637 #endif	/* REGEX */
638