xref: /openbsd/usr.bin/mg/re_search.c (revision 26083131)
1 /*	$OpenBSD: re_search.c,v 1.29 2013/12/20 18:44:13 florian Exp $	*/
2 
3 /* This file is in the public domain. */
4 
5 /*
6  *	regular expression search commands for Mg
7  *
8  * This file contains functions to implement several of gnuemacs's regular
9  * expression functions for Mg.  Several of the routines below are just minor
10  * re-arrangements of Mg's non-regular expression search functions.  Some of
11  * them are similar in structure to the original MicroEMACS, others are
12  * modifications of Rich Ellison's code.  Peter Newton re-wrote about half of
13  * them from scratch.
14  */
15 
16 #ifdef REGEX
17 #include "def.h"
18 
19 #include <sys/types.h>
20 #include <regex.h>
21 
22 #include "macro.h"
23 
24 #define SRCH_BEGIN	(0)		/* search sub-codes		    */
25 #define SRCH_FORW	(-1)
26 #define SRCH_BACK	(-2)
27 #define SRCH_NOPR	(-3)
28 #define SRCH_ACCM	(-4)
29 #define SRCH_MARK	(-5)
30 
31 #define RE_NMATCH	10		/* max number of matches	    */
32 #define REPLEN		256		/* max length of replacement string */
33 
34 char	re_pat[NPAT];			/* regex pattern		    */
35 int	re_srch_lastdir = SRCH_NOPR;	/* last search flags		    */
36 int	casefoldsearch = TRUE;		/* does search ignore case?	    */
37 
38 static int	 re_doreplace(RSIZE, char *);
39 static int	 re_forwsrch(void);
40 static int	 re_backsrch(void);
41 static int	 re_readpattern(char *);
42 static int	 killmatches(int);
43 static int	 countmatches(int);
44 
45 /*
46  * Search forward.
47  * Get a search string from the user and search for it starting at ".".  If
48  * found, move "." to just after the matched characters.  display does all
49  * the hard stuff.  If not found, it just prints a message.
50  */
51 /* ARGSUSED */
52 int
53 re_forwsearch(int f, int n)
54 {
55 	int	s;
56 
57 	if ((s = re_readpattern("RE Search")) != TRUE)
58 		return (s);
59 	if (re_forwsrch() == FALSE) {
60 		ewprintf("Search failed: \"%s\"", re_pat);
61 		return (FALSE);
62 	}
63 	re_srch_lastdir = SRCH_FORW;
64 	return (TRUE);
65 }
66 
67 /*
68  * Reverse search.
69  * Get a search string from the user, and search, starting at "."
70  * and proceeding toward the front of the buffer. If found "." is left
71  * pointing at the first character of the pattern [the last character that
72  * was matched].
73  */
74 /* ARGSUSED */
75 int
76 re_backsearch(int f, int n)
77 {
78 	int	s;
79 
80 	if ((s = re_readpattern("RE Search backward")) != TRUE)
81 		return (s);
82 	if (re_backsrch() == FALSE) {
83 		ewprintf("Search failed: \"%s\"", re_pat);
84 		return (FALSE);
85 	}
86 	re_srch_lastdir = SRCH_BACK;
87 	return (TRUE);
88 }
89 
90 /*
91  * Search again, using the same search string and direction as the last search
92  * command.  The direction has been saved in "srch_lastdir", so you know which
93  * way to go.
94  *
95  * XXX: This code has problems -- some incompatibility(?) with extend.c causes
96  * match to fail when it should not.
97  */
98 /* ARGSUSED */
99 int
100 re_searchagain(int f, int n)
101 {
102 	if (re_srch_lastdir == SRCH_NOPR) {
103 		ewprintf("No last search");
104 		return (FALSE);
105 	}
106 	if (re_srch_lastdir == SRCH_FORW) {
107 		if (re_forwsrch() == FALSE) {
108 			ewprintf("Search failed: \"%s\"", re_pat);
109 			return (FALSE);
110 		}
111 		return (TRUE);
112 	}
113 	if (re_srch_lastdir == SRCH_BACK)
114 		if (re_backsrch() == FALSE) {
115 			ewprintf("Search failed: \"%s\"", re_pat);
116 			return (FALSE);
117 		}
118 
119 	return (TRUE);
120 }
121 
122 /* Compiled regex goes here-- changed only when new pattern read */
123 static regex_t		regex_buff;
124 static regmatch_t	regex_match[RE_NMATCH];
125 
126 /*
127  * Re-Query Replace.
128  *	Replace strings selectively.  Does a search and replace operation.
129  */
130 /* ARGSUSED */
131 int
132 re_queryrepl(int f, int n)
133 {
134 	int	rcnt = 0;		/* replacements made so far	*/
135 	int	plen, s;		/* length of found string	*/
136 	char	news[NPAT];		/* replacement string		*/
137 
138 	if ((s = re_readpattern("RE Query replace")) != TRUE)
139 		return (s);
140 	if (eread("Query replace %s with: ", news, NPAT,
141 	    EFNUL | EFNEW | EFCR, re_pat) == NULL)
142 		return (ABORT);
143 	ewprintf("Query replacing %s with %s:", re_pat, news);
144 
145 	/*
146 	 * Search forward repeatedly, checking each time whether to insert
147 	 * or not.  The "!" case makes the check always true, so it gets put
148 	 * into a tighter loop for efficiency.
149 	 */
150 	while (re_forwsrch() == TRUE) {
151 retry:
152 		update(CMODE);
153 		switch (getkey(FALSE)) {
154 		case ' ':
155 			plen = regex_match[0].rm_eo - regex_match[0].rm_so;
156 			if (re_doreplace((RSIZE)plen, news) == FALSE)
157 				return (FALSE);
158 			rcnt++;
159 			break;
160 
161 		case '.':
162 			plen = regex_match[0].rm_eo - regex_match[0].rm_so;
163 			if (re_doreplace((RSIZE)plen, news) == FALSE)
164 				return (FALSE);
165 			rcnt++;
166 			goto stopsearch;
167 
168 		case CCHR('G'):				/* ^G */
169 			(void)ctrlg(FFRAND, 0);
170 			goto stopsearch;
171 		case CCHR('['):				/* ESC */
172 		case '`':
173 			goto stopsearch;
174 		case '!':
175 			do {
176 				plen = regex_match[0].rm_eo - regex_match[0].rm_so;
177 				if (re_doreplace((RSIZE)plen, news) == FALSE)
178 					return (FALSE);
179 				rcnt++;
180 			} while (re_forwsrch() == TRUE);
181 			goto stopsearch;
182 
183 		case CCHR('?'):				/* To not replace */
184 			break;
185 
186 		default:
187 			ewprintf("<SP> replace, [.] rep-end, <DEL> don't, [!] repl rest <ESC> quit");
188 			goto retry;
189 		}
190 	}
191 
192 stopsearch:
193 	curwp->w_rflag |= WFFULL;
194 	update(CMODE);
195 	if (!inmacro) {
196 		if (rcnt == 0)
197 			ewprintf("(No replacements done)");
198 		else if (rcnt == 1)
199 			ewprintf("(1 replacement done)");
200 		else
201 			ewprintf("(%d replacements done)", rcnt);
202 	}
203 	return (TRUE);
204 }
205 
206 /*
207  * Routine re_doreplace calls lreplace to make replacements needed by
208  * re_query replace.  Its reason for existence is to deal with \1, \2. etc.
209  *  plen: length to remove
210  *  st:   replacement string
211  */
212 static int
213 re_doreplace(RSIZE plen, char *st)
214 {
215 	int	 j, k, s, more, num, state;
216 	struct line	*clp;
217 	char	 repstr[REPLEN];
218 
219 	clp = curwp->w_dotp;
220 	more = TRUE;
221 	j = 0;
222 	state = 0;
223 	num = 0;
224 
225 	/* The following FSA parses the replacement string */
226 	while (more) {
227 		switch (state) {
228 		case 0:
229 			if (*st == '\\') {
230 				st++;
231 				state = 1;
232 			} else if (*st == '\0')
233 				more = FALSE;
234 			else {
235 				repstr[j] = *st;
236 				j++;
237 				if (j >= REPLEN)
238 					return (FALSE);
239 				st++;
240 			}
241 			break;
242 		case 1:
243 			if (*st >= '0' && *st <= '9') {
244 				num = *st - '0';
245 				st++;
246 				state = 2;
247 			} else if (*st == '\0')
248 				more = FALSE;
249 			else {
250 				repstr[j] = *st;
251 				j++;
252 				if (j >= REPLEN)
253 					return (FALSE);
254 				st++;
255 				state = 0;
256 			}
257 			break;
258 		case 2:
259 			if (*st >= '0' && *st <= '9') {
260 				num = 10 * num + *st - '0';
261 				st++;
262 			} else {
263 				if (num >= RE_NMATCH)
264 					return (FALSE);
265 				k = regex_match[num].rm_eo - regex_match[num].rm_so;
266 				if (j + k >= REPLEN)
267 					return (FALSE);
268 				bcopy(&(clp->l_text[regex_match[num].rm_so]),
269 				    &repstr[j], k);
270 				j += k;
271 				if (*st == '\0')
272 					more = FALSE;
273 				if (*st == '\\') {
274 					st++;
275 					state = 1;
276 				} else {
277 					repstr[j] = *st;
278 					j++;
279 					if (j >= REPLEN)
280 						return (FALSE);
281 					st++;
282 					state = 0;
283 				}
284 			}
285 			break;
286 		}		/* switch (state) */
287 	}			/* while (more)   */
288 
289 	repstr[j] = '\0';
290 	s = lreplace(plen, repstr);
291 	return (s);
292 }
293 
294 /*
295  * This routine does the real work of a forward search.  The pattern is
296  * sitting in the external variable "pat".  If found, dot is updated, the
297  * window system is notified of the change, and TRUE is returned.  If the
298  * string isn't found, FALSE is returned.
299  */
300 static int
301 re_forwsrch(void)
302 {
303 	int	 tbo, tdotline, error;
304 	struct line	*clp;
305 
306 	clp = curwp->w_dotp;
307 	tbo = curwp->w_doto;
308 	tdotline = curwp->w_dotline;
309 
310 	if (tbo == clp->l_used)
311 		/*
312 		 * Don't start matching past end of line -- must move to
313 		 * beginning of next line, unless at end of file.
314 		 */
315 		if (clp != curbp->b_headp) {
316 			clp = lforw(clp);
317 			tdotline++;
318 			tbo = 0;
319 		}
320 	/*
321 	 * Note this loop does not process the last line, but this editor
322 	 * always makes the last line empty so this is good.
323 	 */
324 	while (clp != (curbp->b_headp)) {
325 		regex_match[0].rm_so = tbo;
326 		regex_match[0].rm_eo = llength(clp);
327 		error = regexec(&regex_buff, ltext(clp), RE_NMATCH, regex_match,
328 		    REG_STARTEND);
329 		if (error != 0) {
330 			clp = lforw(clp);
331 			tdotline++;
332 			tbo = 0;
333 		} else {
334 			curwp->w_doto = regex_match[0].rm_eo;
335 			curwp->w_dotp = clp;
336 			curwp->w_dotline = tdotline;
337 			curwp->w_rflag |= WFMOVE;
338 			return (TRUE);
339 		}
340 	}
341 	return (FALSE);
342 }
343 
344 /*
345  * This routine does the real work of a backward search.  The pattern is sitting
346  * in the external variable "re_pat".  If found, dot is updated, the window
347  * system is notified of the change, and TRUE is returned.  If the string isn't
348  * found, FALSE is returned.
349  */
350 static int
351 re_backsrch(void)
352 {
353 	struct line		*clp;
354 	int		 tbo, tdotline;
355 	regmatch_t	 lastmatch;
356 
357 	clp = curwp->w_dotp;
358 	tbo = curwp->w_doto;
359 	tdotline = curwp->w_dotline;
360 
361 	/* Start search one position to the left of dot */
362 	tbo = tbo - 1;
363 	if (tbo < 0) {
364 		/* must move up one line */
365 		clp = lback(clp);
366 		tdotline--;
367 		tbo = llength(clp);
368 	}
369 
370 	/*
371 	 * Note this loop does not process the last line, but this editor
372 	 * always makes the last line empty so this is good.
373 	 */
374 	while (clp != (curbp->b_headp)) {
375 		regex_match[0].rm_so = 0;
376 		regex_match[0].rm_eo = llength(clp);
377 		lastmatch.rm_so = -1;
378 		/*
379 		 * Keep searching until we don't match any longer.  Assumes a
380 		 * non-match does not modify the regex_match array.  We have to
381 		 * do this character-by-character after the first match since
382 		 * POSIX regexps don't give you a way to do reverse matches.
383 		 */
384 		while (!regexec(&regex_buff, ltext(clp), RE_NMATCH, regex_match,
385 		    REG_STARTEND) && regex_match[0].rm_so < tbo) {
386 			memcpy(&lastmatch, &regex_match[0], sizeof(regmatch_t));
387 			regex_match[0].rm_so++;
388 			regex_match[0].rm_eo = llength(clp);
389 		}
390 		if (lastmatch.rm_so == -1) {
391 			clp = lback(clp);
392 			tdotline--;
393 			tbo = llength(clp);
394 		} else {
395 			memcpy(&regex_match[0], &lastmatch, sizeof(regmatch_t));
396 			curwp->w_doto = regex_match[0].rm_so;
397 			curwp->w_dotp = clp;
398 			curwp->w_dotline = tdotline;
399 			curwp->w_rflag |= WFMOVE;
400 			return (TRUE);
401 		}
402 	}
403 	return (FALSE);
404 }
405 
406 /*
407  * Read a pattern.
408  * Stash it in the external variable "re_pat". The "pat" is
409  * not updated if the user types in an empty line. If the user typed
410  * an empty line, and there is no old pattern, it is an error.
411  * Display the old pattern, in the style of Jeff Lomicka. There is
412  * some do-it-yourself control expansion.
413  */
414 static int
415 re_readpattern(char *prompt)
416 {
417 	static int	dofree = 0;
418 	int		flags, error, s;
419 	char		tpat[NPAT], *rep;
420 
421 	if (re_pat[0] == '\0')
422 		rep = eread("%s: ", tpat, NPAT, EFNEW | EFCR, prompt);
423 	else
424 		rep = eread("%s: (default %s) ", tpat, NPAT,
425 		    EFNUL | EFNEW | EFCR, prompt, re_pat);
426 	if (rep == NULL)
427 		return (ABORT);
428 	if (rep[0] != '\0') {
429 		/* New pattern given */
430 		(void)strlcpy(re_pat, tpat, sizeof(re_pat));
431 		if (casefoldsearch)
432 			flags = REG_EXTENDED | REG_ICASE;
433 		else
434 			flags = REG_EXTENDED;
435 		if (dofree)
436 			regfree(&regex_buff);
437 		error = regcomp(&regex_buff, re_pat, flags);
438 		if (error != 0) {
439 			char	message[256];
440 			regerror(error, &regex_buff, message, sizeof(message));
441 			ewprintf("Regex Error: %s", message);
442 			re_pat[0] = '\0';
443 			return (FALSE);
444 		}
445 		dofree = 1;
446 		s = TRUE;
447 	} else if (rep[0] == '\0' && re_pat[0] != '\0')
448 		/* Just using old pattern */
449 		s = TRUE;
450 	else
451 		s = FALSE;
452 	return (s);
453 }
454 
455 /*
456  * Cause case to not matter in searches.  This is the default.	If called
457  * with argument cause case to matter.
458  */
459 /* ARGSUSED*/
460 int
461 setcasefold(int f, int n)
462 {
463 	if (f & FFARG) {
464 		casefoldsearch = FALSE;
465 		ewprintf("Case-fold-search unset");
466 	} else {
467 		casefoldsearch = TRUE;
468 		ewprintf("Case-fold-search set");
469 	}
470 
471 	/*
472 	 * Invalidate the regular expression pattern since I'm too lazy to
473 	 * recompile it.
474 	 */
475 	re_pat[0] = '\0';
476 	return (TRUE);
477 }
478 
479 /*
480  * Delete all lines after dot that contain a string matching regex.
481  */
482 /* ARGSUSED */
483 int
484 delmatchlines(int f, int n)
485 {
486 	int	s;
487 
488 	if ((s = re_readpattern("Flush lines (containing match for regexp)"))
489 	    != TRUE)
490 		return (s);
491 
492 	s = killmatches(TRUE);
493 	return (s);
494 }
495 
496 /*
497  * Delete all lines after dot that don't contain a string matching regex.
498  */
499 /* ARGSUSED */
500 int
501 delnonmatchlines(int f, int n)
502 {
503 	int	s;
504 
505 	if ((s = re_readpattern("Keep lines (containing match for regexp)"))
506 	    != TRUE)
507 		return (s);
508 
509 	s = killmatches(FALSE);
510 	return (s);
511 }
512 
513 /*
514  * This function does the work of deleting matching lines.
515  */
516 static int
517 killmatches(int cond)
518 {
519 	int	 s, error;
520 	int	 count = 0;
521 	struct line	*clp;
522 
523 	clp = curwp->w_dotp;
524 	if (curwp->w_doto == llength(clp))
525 		/* Consider dot on next line */
526 		clp = lforw(clp);
527 
528 	while (clp != (curbp->b_headp)) {
529 		/* see if line matches */
530 		regex_match[0].rm_so = 0;
531 		regex_match[0].rm_eo = llength(clp);
532 		error = regexec(&regex_buff, ltext(clp), RE_NMATCH, regex_match,
533 		    REG_STARTEND);
534 
535 		/* Delete line when appropriate */
536 		if ((cond == FALSE && error) || (cond == TRUE && !error)) {
537 			curwp->w_doto = 0;
538 			curwp->w_dotp = clp;
539 			count++;
540 			s = ldelete(llength(clp) + 1, KNONE);
541 			clp = curwp->w_dotp;
542 			curwp->w_rflag |= WFMOVE;
543 			if (s == FALSE)
544 				return (FALSE);
545 		} else
546 			clp = lforw(clp);
547 	}
548 
549 	ewprintf("%d line(s) deleted", count);
550 	if (count > 0)
551 		curwp->w_rflag |= WFMOVE;
552 
553 	return (TRUE);
554 }
555 
556 /*
557  * Count lines matching regex.
558  */
559 /* ARGSUSED */
560 int
561 cntmatchlines(int f, int n)
562 {
563 	int	s;
564 
565 	if ((s = re_readpattern("Count lines (matching regexp)")) != TRUE)
566 		return (s);
567 	s = countmatches(TRUE);
568 
569 	return (s);
570 }
571 
572 /*
573  * Count lines that fail to match regex.
574  */
575 /* ARGSUSED */
576 int
577 cntnonmatchlines(int f, int n)
578 {
579 	int	s;
580 
581 	if ((s = re_readpattern("Count lines (not matching regexp)")) != TRUE)
582 		return (s);
583 	s = countmatches(FALSE);
584 
585 	return (s);
586 }
587 
588 /*
589  * This function does the work of counting matching lines.
590  */
591 int
592 countmatches(int cond)
593 {
594 	int	 error;
595 	int	 count = 0;
596 	struct line	*clp;
597 
598 	clp = curwp->w_dotp;
599 	if (curwp->w_doto == llength(clp))
600 		/* Consider dot on next line */
601 		clp = lforw(clp);
602 
603 	while (clp != (curbp->b_headp)) {
604 		/* see if line matches */
605 		regex_match[0].rm_so = 0;
606 		regex_match[0].rm_eo = llength(clp);
607 		error = regexec(&regex_buff, ltext(clp), RE_NMATCH, regex_match,
608 		    REG_STARTEND);
609 
610 		/* Count line when appropriate */
611 		if ((cond == FALSE && error) || (cond == TRUE && !error))
612 			count++;
613 		clp = lforw(clp);
614 	}
615 
616 	if (cond)
617 		ewprintf("Number of lines matching: %d", count);
618 	else
619 		ewprintf("Number of lines not matching: %d", count);
620 
621 	return (TRUE);
622 }
623 #endif	/* REGEX */
624