xref: /openbsd/usr.bin/mg/re_search.c (revision b59d45bc)
1 /*	$OpenBSD: re_search.c,v 1.28 2013/09/24 13:29:51 jasper Exp $	*/
2 
3 /* This file is in the public domain. */
4 
5 /*
6  *	regular expression search commands for Mg
7  *
8  * This file contains functions to implement several of gnuemacs's regular
9  * expression functions for Mg.  Several of the routines below are just minor
10  * re-arrangements of Mg's non-regular expression search functions.  Some of
11  * them are similar in structure to the original MicroEMACS, others are
12  * modifications of Rich Ellison's code.  Peter Newton re-wrote about half of
13  * them from scratch.
14  */
15 
16 #ifdef REGEX
17 #include "def.h"
18 
19 #include <sys/types.h>
20 #include <regex.h>
21 
22 #include "macro.h"
23 
/* Search sub-codes; re_srch_lastdir holds one of these. */
#define SRCH_BEGIN	(0)
#define SRCH_FORW	(-1)		/* last search ran forward	    */
#define SRCH_BACK	(-2)		/* last search ran backward	    */
#define SRCH_NOPR	(-3)		/* no search performed yet	    */
#define SRCH_ACCM	(-4)
#define SRCH_MARK	(-5)

/* Fixed sizes used by the regex code. */
#define RE_NMATCH	10		/* slots in the regmatch_t array    */
#define REPLEN		256		/* bytes in an expanded replacement */
33 
34 char	re_pat[NPAT];			/* regex pattern		    */
35 int	re_srch_lastdir = SRCH_NOPR;	/* last search flags		    */
36 int	casefoldsearch = TRUE;		/* does search ignore case?	    */
37 
38 static int	 re_doreplace(RSIZE, char *);
39 static int	 re_forwsrch(void);
40 static int	 re_backsrch(void);
41 static int	 re_readpattern(char *);
42 static int	 killmatches(int);
43 static int	 countmatches(int);
44 
45 /*
46  * Search forward.
47  * Get a search string from the user and search for it starting at ".".  If
48  * found, move "." to just after the matched characters.  display does all
49  * the hard stuff.  If not found, it just prints a message.
50  */
51 /* ARGSUSED */
52 int
53 re_forwsearch(int f, int n)
54 {
55 	int	s;
56 
57 	if ((s = re_readpattern("RE Search")) != TRUE)
58 		return (s);
59 	if (re_forwsrch() == FALSE) {
60 		ewprintf("Search failed: \"%s\"", re_pat);
61 		return (FALSE);
62 	}
63 	re_srch_lastdir = SRCH_FORW;
64 	return (TRUE);
65 }
66 
67 /*
68  * Reverse search.
69  * Get a search string from the user, and search, starting at "."
70  * and proceeding toward the front of the buffer. If found "." is left
71  * pointing at the first character of the pattern [the last character that
72  * was matched].
73  */
74 /* ARGSUSED */
75 int
76 re_backsearch(int f, int n)
77 {
78 	int	s;
79 
80 	if ((s = re_readpattern("RE Search backward")) != TRUE)
81 		return (s);
82 	if (re_backsrch() == FALSE) {
83 		ewprintf("Search failed: \"%s\"", re_pat);
84 		return (FALSE);
85 	}
86 	re_srch_lastdir = SRCH_BACK;
87 	return (TRUE);
88 }
89 
90 /*
91  * Search again, using the same search string and direction as the last search
92  * command.  The direction has been saved in "srch_lastdir", so you know which
93  * way to go.
94  *
95  * XXX: This code has problems -- some incompatibility(?) with extend.c causes
96  * match to fail when it should not.
97  */
98 /* ARGSUSED */
99 int
100 re_searchagain(int f, int n)
101 {
102 	if (re_srch_lastdir == SRCH_NOPR) {
103 		ewprintf("No last search");
104 		return (FALSE);
105 	}
106 	if (re_srch_lastdir == SRCH_FORW) {
107 		if (re_forwsrch() == FALSE) {
108 			ewprintf("Search failed: \"%s\"", re_pat);
109 			return (FALSE);
110 		}
111 		return (TRUE);
112 	}
113 	if (re_srch_lastdir == SRCH_BACK)
114 		if (re_backsrch() == FALSE) {
115 			ewprintf("Search failed: \"%s\"", re_pat);
116 			return (FALSE);
117 		}
118 
119 	return (TRUE);
120 }
121 
122 /* Compiled regex goes here-- changed only when new pattern read */
123 static regex_t		regex_buff;
124 static regmatch_t	regex_match[RE_NMATCH];
125 
126 /*
127  * Re-Query Replace.
128  *	Replace strings selectively.  Does a search and replace operation.
129  */
130 /* ARGSUSED */
131 int
132 re_queryrepl(int f, int n)
133 {
134 	int	rcnt = 0;		/* replacements made so far	*/
135 	int	plen, s;		/* length of found string	*/
136 	char	news[NPAT];		/* replacement string		*/
137 
138 	if ((s = re_readpattern("RE Query replace")) != TRUE)
139 		return (s);
140 	if (eread("Query replace %s with: ", news, NPAT,
141 	    EFNUL | EFNEW | EFCR, re_pat) == NULL)
142 		return (ABORT);
143 	ewprintf("Query replacing %s with %s:", re_pat, news);
144 
145 	/*
146 	 * Search forward repeatedly, checking each time whether to insert
147 	 * or not.  The "!" case makes the check always true, so it gets put
148 	 * into a tighter loop for efficiency.
149 	 */
150 	while (re_forwsrch() == TRUE) {
151 retry:
152 		update(CMODE);
153 		switch (getkey(FALSE)) {
154 		case ' ':
155 			plen = regex_match[0].rm_eo - regex_match[0].rm_so;
156 			if (re_doreplace((RSIZE)plen, news) == FALSE)
157 				return (FALSE);
158 			rcnt++;
159 			break;
160 
161 		case '.':
162 			plen = regex_match[0].rm_eo - regex_match[0].rm_so;
163 			if (re_doreplace((RSIZE)plen, news) == FALSE)
164 				return (FALSE);
165 			rcnt++;
166 			goto stopsearch;
167 
168 		case CCHR('G'):				/* ^G */
169 			(void)ctrlg(FFRAND, 0);
170 			goto stopsearch;
171 		case CCHR('['):				/* ESC */
172 		case '`':
173 			goto stopsearch;
174 		case '!':
175 			do {
176 				plen = regex_match[0].rm_eo - regex_match[0].rm_so;
177 				if (re_doreplace((RSIZE)plen, news) == FALSE)
178 					return (FALSE);
179 				rcnt++;
180 			} while (re_forwsrch() == TRUE);
181 			goto stopsearch;
182 
183 		case CCHR('?'):				/* To not replace */
184 			break;
185 
186 		default:
187 			ewprintf("<SP> replace, [.] rep-end, <DEL> don't, [!] repl rest <ESC> quit");
188 			goto retry;
189 		}
190 	}
191 
192 stopsearch:
193 	curwp->w_rflag |= WFFULL;
194 	update(CMODE);
195 	if (!inmacro) {
196 		if (rcnt == 0)
197 			ewprintf("(No replacements done)");
198 		else if (rcnt == 1)
199 			ewprintf("(1 replacement done)");
200 		else
201 			ewprintf("(%d replacements done)", rcnt);
202 	}
203 	return (TRUE);
204 }
205 
206 /*
207  * Routine re_doreplace calls lreplace to make replacements needed by
208  * re_query replace.  Its reason for existence is to deal with \1, \2. etc.
209  *  plen: length to remove
210  *  st:   replacement string
211  */
212 static int
213 re_doreplace(RSIZE plen, char *st)
214 {
215 	int	 j, k, s, more, num, state;
216 	struct line	*clp;
217 	char	 repstr[REPLEN];
218 
219 	clp = curwp->w_dotp;
220 	more = TRUE;
221 	j = 0;
222 	state = 0;
223 	num = 0;
224 
225 	/* The following FSA parses the replacement string */
226 	while (more) {
227 		switch (state) {
228 		case 0:
229 			if (*st == '\\') {
230 				st++;
231 				state = 1;
232 			} else if (*st == '\0')
233 				more = FALSE;
234 			else {
235 				repstr[j] = *st;
236 				j++;
237 				if (j >= REPLEN)
238 					return (FALSE);
239 				st++;
240 			}
241 			break;
242 		case 1:
243 			if (*st >= '0' && *st <= '9') {
244 				num = *st - '0';
245 				st++;
246 				state = 2;
247 			} else if (*st == '\0')
248 				more = FALSE;
249 			else {
250 				repstr[j] = *st;
251 				j++;
252 				if (j >= REPLEN)
253 					return (FALSE);
254 				st++;
255 				state = 0;
256 			}
257 			break;
258 		case 2:
259 			if (*st >= '0' && *st <= '9') {
260 				num = 10 * num + *st - '0';
261 				st++;
262 			} else {
263 				if (num >= RE_NMATCH)
264 					return (FALSE);
265 				k = regex_match[num].rm_eo - regex_match[num].rm_so;
266 				if (j + k >= REPLEN)
267 					return (FALSE);
268 				bcopy(&(clp->l_text[regex_match[num].rm_so]),
269 				    &repstr[j], k);
270 				j += k;
271 				if (*st == '\0')
272 					more = FALSE;
273 				if (*st == '\\') {
274 					st++;
275 					state = 1;
276 				} else {
277 					repstr[j] = *st;
278 					j++;
279 					if (j >= REPLEN)
280 						return (FALSE);
281 					st++;
282 					state = 0;
283 				}
284 			}
285 			break;
286 		}		/* switch (state) */
287 	}			/* while (more)   */
288 
289 	repstr[j] = '\0';
290 	s = lreplace(plen, repstr);
291 	return (s);
292 }
293 
294 /*
295  * This routine does the real work of a forward search.  The pattern is
296  * sitting in the external variable "pat".  If found, dot is updated, the
297  * window system is notified of the change, and TRUE is returned.  If the
298  * string isn't found, FALSE is returned.
299  */
300 static int
301 re_forwsrch(void)
302 {
303 	int	 tbo, error;
304 	struct line	*clp;
305 
306 	clp = curwp->w_dotp;
307 	tbo = curwp->w_doto;
308 
309 	if (tbo == clp->l_used)
310 		/*
311 		 * Don't start matching past end of line -- must move to
312 		 * beginning of next line, unless at end of file.
313 		 */
314 		if (clp != curbp->b_headp) {
315 			clp = lforw(clp);
316 			tbo = 0;
317 		}
318 	/*
319 	 * Note this loop does not process the last line, but this editor
320 	 * always makes the last line empty so this is good.
321 	 */
322 	while (clp != (curbp->b_headp)) {
323 		regex_match[0].rm_so = tbo;
324 		regex_match[0].rm_eo = llength(clp);
325 		error = regexec(&regex_buff, ltext(clp), RE_NMATCH, regex_match,
326 		    REG_STARTEND);
327 		if (error != 0) {
328 			clp = lforw(clp);
329 			tbo = 0;
330 		} else {
331 			curwp->w_doto = regex_match[0].rm_eo;
332 			curwp->w_dotp = clp;
333 			curwp->w_rflag |= WFMOVE;
334 			return (TRUE);
335 		}
336 	}
337 	return (FALSE);
338 }
339 
340 /*
341  * This routine does the real work of a backward search.  The pattern is sitting
342  * in the external variable "re_pat".  If found, dot is updated, the window
343  * system is notified of the change, and TRUE is returned.  If the string isn't
344  * found, FALSE is returned.
345  */
346 static int
347 re_backsrch(void)
348 {
349 	struct line		*clp;
350 	int		 tbo;
351 	regmatch_t	 lastmatch;
352 
353 	clp = curwp->w_dotp;
354 	tbo = curwp->w_doto;
355 
356 	/* Start search one position to the left of dot */
357 	tbo = tbo - 1;
358 	if (tbo < 0) {
359 		/* must move up one line */
360 		clp = lback(clp);
361 		tbo = llength(clp);
362 	}
363 
364 	/*
365 	 * Note this loop does not process the last line, but this editor
366 	 * always makes the last line empty so this is good.
367 	 */
368 	while (clp != (curbp->b_headp)) {
369 		regex_match[0].rm_so = 0;
370 		regex_match[0].rm_eo = llength(clp);
371 		lastmatch.rm_so = -1;
372 		/*
373 		 * Keep searching until we don't match any longer.  Assumes a
374 		 * non-match does not modify the regex_match array.  We have to
375 		 * do this character-by-character after the first match since
376 		 * POSIX regexps don't give you a way to do reverse matches.
377 		 */
378 		while (!regexec(&regex_buff, ltext(clp), RE_NMATCH, regex_match,
379 		    REG_STARTEND) && regex_match[0].rm_so < tbo) {
380 			memcpy(&lastmatch, &regex_match[0], sizeof(regmatch_t));
381 			regex_match[0].rm_so++;
382 			regex_match[0].rm_eo = llength(clp);
383 		}
384 		if (lastmatch.rm_so == -1) {
385 			clp = lback(clp);
386 			tbo = llength(clp);
387 		} else {
388 			memcpy(&regex_match[0], &lastmatch, sizeof(regmatch_t));
389 			curwp->w_doto = regex_match[0].rm_so;
390 			curwp->w_dotp = clp;
391 			curwp->w_rflag |= WFMOVE;
392 			return (TRUE);
393 		}
394 	}
395 	return (FALSE);
396 }
397 
398 /*
399  * Read a pattern.
400  * Stash it in the external variable "re_pat". The "pat" is
401  * not updated if the user types in an empty line. If the user typed
402  * an empty line, and there is no old pattern, it is an error.
403  * Display the old pattern, in the style of Jeff Lomicka. There is
404  * some do-it-yourself control expansion.
405  */
406 static int
407 re_readpattern(char *prompt)
408 {
409 	static int	dofree = 0;
410 	int		flags, error, s;
411 	char		tpat[NPAT], *rep;
412 
413 	if (re_pat[0] == '\0')
414 		rep = eread("%s: ", tpat, NPAT, EFNEW | EFCR, prompt);
415 	else
416 		rep = eread("%s: (default %s) ", tpat, NPAT,
417 		    EFNUL | EFNEW | EFCR, prompt, re_pat);
418 	if (rep == NULL)
419 		return (ABORT);
420 	if (rep[0] != '\0') {
421 		/* New pattern given */
422 		(void)strlcpy(re_pat, tpat, sizeof(re_pat));
423 		if (casefoldsearch)
424 			flags = REG_EXTENDED | REG_ICASE;
425 		else
426 			flags = REG_EXTENDED;
427 		if (dofree)
428 			regfree(&regex_buff);
429 		error = regcomp(&regex_buff, re_pat, flags);
430 		if (error != 0) {
431 			char	message[256];
432 			regerror(error, &regex_buff, message, sizeof(message));
433 			ewprintf("Regex Error: %s", message);
434 			re_pat[0] = '\0';
435 			return (FALSE);
436 		}
437 		dofree = 1;
438 		s = TRUE;
439 	} else if (rep[0] == '\0' && re_pat[0] != '\0')
440 		/* Just using old pattern */
441 		s = TRUE;
442 	else
443 		s = FALSE;
444 	return (s);
445 }
446 
447 /*
448  * Cause case to not matter in searches.  This is the default.	If called
449  * with argument cause case to matter.
450  */
451 /* ARGSUSED*/
452 int
453 setcasefold(int f, int n)
454 {
455 	if (f & FFARG) {
456 		casefoldsearch = FALSE;
457 		ewprintf("Case-fold-search unset");
458 	} else {
459 		casefoldsearch = TRUE;
460 		ewprintf("Case-fold-search set");
461 	}
462 
463 	/*
464 	 * Invalidate the regular expression pattern since I'm too lazy to
465 	 * recompile it.
466 	 */
467 	re_pat[0] = '\0';
468 	return (TRUE);
469 }
470 
471 /*
472  * Delete all lines after dot that contain a string matching regex.
473  */
474 /* ARGSUSED */
475 int
476 delmatchlines(int f, int n)
477 {
478 	int	s;
479 
480 	if ((s = re_readpattern("Flush lines (containing match for regexp)"))
481 	    != TRUE)
482 		return (s);
483 
484 	s = killmatches(TRUE);
485 	return (s);
486 }
487 
488 /*
489  * Delete all lines after dot that don't contain a string matching regex.
490  */
491 /* ARGSUSED */
492 int
493 delnonmatchlines(int f, int n)
494 {
495 	int	s;
496 
497 	if ((s = re_readpattern("Keep lines (containing match for regexp)"))
498 	    != TRUE)
499 		return (s);
500 
501 	s = killmatches(FALSE);
502 	return (s);
503 }
504 
505 /*
506  * This function does the work of deleting matching lines.
507  */
508 static int
509 killmatches(int cond)
510 {
511 	int	 s, error;
512 	int	 count = 0;
513 	struct line	*clp;
514 
515 	clp = curwp->w_dotp;
516 	if (curwp->w_doto == llength(clp))
517 		/* Consider dot on next line */
518 		clp = lforw(clp);
519 
520 	while (clp != (curbp->b_headp)) {
521 		/* see if line matches */
522 		regex_match[0].rm_so = 0;
523 		regex_match[0].rm_eo = llength(clp);
524 		error = regexec(&regex_buff, ltext(clp), RE_NMATCH, regex_match,
525 		    REG_STARTEND);
526 
527 		/* Delete line when appropriate */
528 		if ((cond == FALSE && error) || (cond == TRUE && !error)) {
529 			curwp->w_doto = 0;
530 			curwp->w_dotp = clp;
531 			count++;
532 			s = ldelete(llength(clp) + 1, KNONE);
533 			clp = curwp->w_dotp;
534 			curwp->w_rflag |= WFMOVE;
535 			if (s == FALSE)
536 				return (FALSE);
537 		} else
538 			clp = lforw(clp);
539 	}
540 
541 	ewprintf("%d line(s) deleted", count);
542 	if (count > 0)
543 		curwp->w_rflag |= WFMOVE;
544 
545 	return (TRUE);
546 }
547 
548 /*
549  * Count lines matching regex.
550  */
551 /* ARGSUSED */
552 int
553 cntmatchlines(int f, int n)
554 {
555 	int	s;
556 
557 	if ((s = re_readpattern("Count lines (matching regexp)")) != TRUE)
558 		return (s);
559 	s = countmatches(TRUE);
560 
561 	return (s);
562 }
563 
564 /*
565  * Count lines that fail to match regex.
566  */
567 /* ARGSUSED */
568 int
569 cntnonmatchlines(int f, int n)
570 {
571 	int	s;
572 
573 	if ((s = re_readpattern("Count lines (not matching regexp)")) != TRUE)
574 		return (s);
575 	s = countmatches(FALSE);
576 
577 	return (s);
578 }
579 
580 /*
581  * This function does the work of counting matching lines.
582  */
583 int
584 countmatches(int cond)
585 {
586 	int	 error;
587 	int	 count = 0;
588 	struct line	*clp;
589 
590 	clp = curwp->w_dotp;
591 	if (curwp->w_doto == llength(clp))
592 		/* Consider dot on next line */
593 		clp = lforw(clp);
594 
595 	while (clp != (curbp->b_headp)) {
596 		/* see if line matches */
597 		regex_match[0].rm_so = 0;
598 		regex_match[0].rm_eo = llength(clp);
599 		error = regexec(&regex_buff, ltext(clp), RE_NMATCH, regex_match,
600 		    REG_STARTEND);
601 
602 		/* Count line when appropriate */
603 		if ((cond == FALSE && error) || (cond == TRUE && !error))
604 			count++;
605 		clp = lforw(clp);
606 	}
607 
608 	if (cond)
609 		ewprintf("Number of lines matching: %d", count);
610 	else
611 		ewprintf("Number of lines not matching: %d", count);
612 
613 	return (TRUE);
614 }
615 #endif	/* REGEX */
616