1 /* @(#)search.c	1.30 09/07/09 Copyright 1984-2009 J. Schilling */
2 #include <schily/mconfig.h>
3 #ifndef lint
4 static	UConst char sccsid[] =
5 	"@(#)search.c	1.30 09/07/09 Copyright 1984-2009 J. Schilling";
6 #endif
7 /*
8  *	Search functions for VED
9  *
10  *	Copyright (c) 1984-2009 J. Schilling
11  */
12 /*
13  * The contents of this file are subject to the terms of the
14  * Common Development and Distribution License, Version 1.0 only
15  * (the "License").  You may not use this file except in compliance
16  * with the License.
17  *
18  * See the file CDDL.Schily.txt in this distribution for details.
19  * A copy of the CDDL is also available via the Internet at
20  * http://www.opensource.org/licenses/cddl1.txt
21  *
22  * When distributing Covered Code, include this CDDL HEADER in each
23  * file and include the License file CDDL.Schily.txt from this distribution.
24  */
25 
26 /*
27  * This are the low level support routines to search forwards or backwards
28  * through the buffer. High level interfaces are found in searchcmds.c and
29  * movedot.c
30  *
31  * The package implements two levels of of optimazation:
32  * If the pattern does not contain special characters of the pattern matcher
33  * simplesearch() and simplereverse() are used. If the pattern in addition
34  * is a one byte pattern, searchchar() and srevchar() are used.
35  * All other search operations are handled directly inside search()
36  * and reverse() by using the pattern matcher.
37  *
38  * All routines return a value > eof to indicate that the requested object
39  * could not be found. Some routines decrement/incremet the value 'begin' so
40  * we must return eof+2 to be different from all possible 'correct' values.
41  * If the pattern is bad, eof+3 is returned to indicate that no 'not found'
42  * message should be displayed.
43  *
44  *	Return values:
45  *		eof+2	Pattern not found
46  *		eof+3	Bad Pattern
47  *		other	Pattern found
48  */
49 #include "ved.h"
50 #include "buffer.h"
51 #include <schily/patmatch.h>
52 
53 #define	SEARCHSIZE 4096
54 
55 EXPORT	BOOL	issimple	__PR((Uchar* pattern, int length));
56 LOCAL	epos_t	searchchar	__PR((ewin_t *wp, epos_t begin, int ch));
57 EXPORT	epos_t	srevchar	__PR((ewin_t *wp, epos_t begin, int ch));
58 EXPORT	epos_t	search		__PR((ewin_t *wp, epos_t begin, Uchar* pattern, int patsize, BOOL domark));
59 EXPORT	epos_t	reverse		__PR((ewin_t *wp, epos_t begin, Uchar* pattern, int patsize, BOOL domark));
60 LOCAL	BOOL	simplesearch	__PR((Uchar* searchstr, int searchlen, Uchar* pattern, int patlen, Uchar** beginp, Uchar** endp));
61 LOCAL	BOOL	simplereverse	__PR((Uchar* searchstr, int searchlen, Uchar* pattern, int patlen, Uchar** beginp, Uchar** endp));
62 LOCAL	epos_t	searchdone	__PR((ewin_t *wp, epos_t start, epos_t ret, BOOL domark));
63 
64 /*
65  * Check if the pattern contains special characters of the pattern matcher.
66  * Do not look for characters that may be used only inside of a character set.
67  */
68 EXPORT BOOL
issimple(pattern,length)69 issimple(pattern, length)
70 	register Uchar	*pattern;
71 	register int	length;
72 {
73 	while (length-- > 0) {
74 		switch (*pattern++) {
75 
76 		case ALT:	case REP:	case NIL:
77 		case STAR:	case LBRACK:	case RBRACK:
78 		case LCLASS:	case RCLASS:	case QUOTE:
79 		case ANY:	case START:	case END:
80 								return (FALSE);
81 		}
82 	}
83 	return (TRUE);
84 }
85 
86 /*
87  * Search forwards for a one character 'pattern'.
88  */
89 LOCAL epos_t
searchchar(wp,begin,ch)90 searchchar(wp, begin, ch)
91 	ewin_t	*wp;
92 	epos_t	begin;
93 	Uchar	ch;
94 {
95 	register headr_t	*linkp;
96 	register Uchar		*stuff;
97 	register int		linksize;
98 		headr_t		*passlinkp;
99 		int		pos;
100 
101 	findpos(wp, begin, &passlinkp, &pos);
102 	linkp = passlinkp;
103 	linksize = linkp->size - pos;
104 	stuff = linkp->cont + pos;
105 
106 	for (;;) {
107 		while (linksize--) {
108 			if (*stuff++ == ch)
109 				return (begin);
110 			begin++;
111 		}
112 		if ((linkp = linkp->next) == NULL)
113 			return (wp->eof+2);
114 		readybuffer(wp, linkp);
115 		linksize = linkp->size;
116 		stuff = linkp->cont;
117 	}
118 }
119 
120 /*
121  * Search backwards for a one character 'pattern'.
122  */
123 EXPORT epos_t
srevchar(wp,begin,ch)124 srevchar(wp, begin, ch)
125 	ewin_t	*wp;
126 	epos_t	begin;
127 	Uchar	ch;
128 {
129 	register headr_t	*linkp;
130 	register Uchar		*stuff;
131 	register int		linksize;
132 		headr_t		*passlinkp;
133 		int		pos;
134 
135 	findpos(wp, begin, &passlinkp, &pos);
136 	linkp = passlinkp;
137 	linksize = pos + 1;
138 	stuff = linkp->cont + pos;
139 
140 	for (;;) {
141 		while (linksize--) {
142 			if (*stuff-- == ch)
143 				return (begin);
144 			begin--;
145 		}
146 		if ((linkp = linkp->prev) == NULL)
147 			return (wp->eof+2);
148 		readybuffer(wp, linkp);
149 		linksize = linkp->size;
150 		stuff = linkp->cont + linkp->size - 1;
151 	}
152 }
153 
154 /*
155  * Search forwards for 'pattern' doing the appropriate optimizations.
156  */
157 EXPORT epos_t
search(wp,begin,pattern,patsize,domark)158 search(wp, begin, pattern, patsize, domark)
159 	ewin_t	*wp;
160 	epos_t	begin;
161 	Uchar	*pattern;
162 	int	patsize;
163 	BOOL	domark;
164 {
165 	Uchar	searchstr[SEARCHSIZE + 1];
166 	int	aux[SEARCHSIZE];	/* Nobody should be able to type such*/
167 	int	state[SEARCHSIZE+1];	/* a long pattern - avoid malloc()   */
168 	int	alt;
169 	int	start;
170 	int	readsize;
171 	int	firsttime;
172 	int	midline;
173 	int	nl;
174 	Uchar	*strstart;
175 	Uchar	*place;
176 	epos_t	pos;
177 
178 	if (begin >= wp->eof)
179 		return (wp->eof+2);
180 	if (patsize == 0)
181 		return (wp->eof+2);
182 
183 	if (wp->magic == FALSE || issimple(pattern, patsize)) {
184 		/*
185 		 * Search without the pattern matcher.
186 		 */
187 		if (patsize == 1) {		/* single character pattern */
188 			if ((pos = searchchar(wp, begin, *pattern)) < wp->eof)
189 				return (searchdone(wp, pos, pos+1, domark));
190 			else
191 				return (wp->eof+2);
192 		}
193 
194 /*		while (readsize = extr_line(begin, searchstr, SEARCHSIZE)) {*/
195 		while ((readsize = extract(wp, begin, searchstr, SEARCHSIZE)) != 0) {
196 			if (simplesearch(searchstr, readsize,
197 					pattern, patsize, &strstart, &place)) {
198 
199 				return (searchdone(wp, begin+
200 					(int)(strstart-searchstr),
201 					begin+(int)(place-searchstr),
202 					domark));
203 			}
204 			begin += readsize;
205 #ifdef	_use_extr_line_
206 			/*extr_line*/
207 			if (begin < wp->eof && searchstr[readsize-1] != '\n') {
208 #else
209 			if (begin < wp->eof) {
210 #endif
211 				begin -= patsize - 1;
212 			}
213 		}
214 	} else {
215 		/*
216 		 * First compile the pattern.
217 		 */
218 		if (patsize > sizeof (aux)/sizeof (aux[0])) {
219 			writeerr(wp, "PATTERN TOO LONG");
220 			return (wp->eof+3);
221 		}
222 		alt = patcompile(pattern, patsize, aux);
223 		if (alt == 0) {
224 			writeerr(wp, "BAD PATTERN");
225 			return (wp->eof+3);
226 		}
227 		/*
228 		 * Check if we start searching at the beginning of a line.
229 		 */
230 		midline = 0;
231 		nl = 1;
232 		if (begin != 0L &&
233 		    extract(wp, begin-1, searchstr, 1) && searchstr[0] != '\n') {
234 			nl = 0;  /* No newline found one left to the cursor */
235 		}
236 
237 		/*
238 		 * Then start searching...
239 		 */
240 		for (firsttime = 1;
241 		    (readsize = extr_line(wp, begin, C searchstr, SEARCHSIZE)) != 0;
242 						firsttime = 0, midline = 0) {
243 			/*
244 			 * If the current line does not begin with a new-line
245 			 * we are in touble. In theory, we need to disable '^'
246 			 * (begin of line) but this is only a silly approach.
247 			 */
248 			if (!nl)
249 				midline++;
250 			nl = 0;
251 			if (searchstr[readsize-1] == '\n') {
252 				nl++;
253 				/* XXX Ist das richtig ??? */
254 /*				searchstr[readsize-1] = '\0';*/
255 			}
256 			for (start = 0; start < readsize; start++)
257 				if ((place = UC patmatch(pattern, aux,
258 						UC (searchstr - midline),
259 						start + midline,
260 						readsize + midline, alt, state)) != NULL)
261 					if (!firsttime || place > searchstr) {
262 						/*
263 						 * Only found if the cursor moved.
264 						 */
265 						return (searchdone(wp, begin+start,
266 							begin+(int)(place-searchstr),
267 									domark));
268 					}
269 			begin += readsize;
270 			if (begin < wp->eof && !nl)
271 				begin -= patsize - 1;
272 		}
273 	}
274 	return (wp->eof+2);
275 }
276 
277 /*
278  * Search backwards for 'pattern' doing the appropriate optimizations.
279  * Decrement 'begin' first to make sure we will not find the same place twice.
280  */
281 EXPORT epos_t
reverse(wp,begin,pattern,patsize,domark)282 reverse(wp, begin, pattern, patsize, domark)
283 	ewin_t	*wp;
284 	epos_t	begin;
285 	Uchar	*pattern;
286 	int	patsize;
287 	BOOL	domark;
288 {
289 	Uchar	searchstr[SEARCHSIZE + 1];
290 	int	aux[SEARCHSIZE];	/* Nobody should be able to type such*/
291 	int	state[SEARCHSIZE+1];	/* a long pattern - avoid malloc()   */
292 	int	alt;
293 	int	start;
294 	int	readsize;
295 	int	firsttime;
296 	Uchar	*strstart;
297 	Uchar	*place;
298 	Uchar	*bestplace;
299 	epos_t	pos = wp->dot;
300 
301 	begin--;
302 	if (begin <= 0)
303 		return (wp->eof+2);
304 	if (patsize == 0)
305 		return (wp->eof+2);
306 	if (begin > wp->eof)			/* OK JS */
307 		return (wp->eof+2);
308 
309 	if (wp->magic == FALSE || issimple(pattern, patsize)) {
310 		/*
311 		 * Search without the pattern matcher.
312 		 */
313 		if (patsize == 1) {		/* single character pattern */
314 			if ((pos = srevchar(wp, begin-1, *pattern)) < wp->eof)
315 /*			if ((pos = srevchar(wp, begin, *pattern)) < wp->eof)*/
316 				return (searchdone(wp, pos, pos+1, domark));
317 			else
318 				return (wp->eof+2);
319 		}
320 
321 /*		while (readsize = retractline(begin, searchstr, SEARCHSIZE)) {*/
322 		while ((readsize = extract(wp, max(0, begin-SEARCHSIZE), searchstr, (int)min(begin, SEARCHSIZE))) != 0) {
323 			begin -= readsize;
324 			if (begin < 0)
325 				begin = 0;
326 			if (simplereverse(searchstr, readsize,
327 					pattern, patsize, &strstart, &place))
328 				return (searchdone(wp, begin+(int)(strstart-searchstr),
329 					begin+(int)(place-searchstr), domark));
330 			if (begin > 0)
331 				begin += patsize - 1;
332 		}
333 	} else {
334 		/*
335 		 * First compile the pattern.
336 		 */
337 		if (patsize > sizeof (aux)/sizeof (aux[0])) {
338 			writeerr(wp, "PATTERN TOO LONG");
339 			return (wp->eof+3);
340 		}
341 		alt = patcompile(pattern, patsize, aux);
342 		if (alt == 0) {
343 			writeerr(wp, "BAD PATTERN");
344 			return (wp->eof+3);
345 		}
346 
347 		/*
348 		 * Then start searching...
349 		 */
350 		++begin;
351 		for (firsttime = 1;
352 			(readsize = retractline(wp, begin, C searchstr, SEARCHSIZE)) != 0;
353 								firsttime = 0) {
354 			begin -= readsize;
355 			/* XXX Ist das richtig ??? */
356 /*			if (searchstr[readsize-1] == '\n')*/
357 /*				searchstr[readsize-1] = '\0';*/
358 			for (start = 0, bestplace = 0; start < readsize; ++start)
359 				if ((place = UC patmatch(pattern, aux, searchstr,
360 							start, readsize, alt, state)) != NULL)
361 					if (!firsttime ||
362 						place - searchstr < readsize)
363 						/*
364 						 * Only found if the cursor moved.
365 						 */
366 						if (place > bestplace) {
367 							/*
368 							 * Use the longest
369 							 * possible match.
370 							 */
371 							bestplace = place;
372 							pos = start;
373 						}
374 			if (bestplace)
375 				return (searchdone(wp, begin+pos,
376 					begin+(int)(bestplace-searchstr), domark));
377 		}
378 	}
379 	return (wp->eof+2);
380 }
381 
382 /*
383  * Search forwards for a simple pattern.
384  * If the pattern could be found, *beginp amd *endp are set.
385  * beginp points to the first character that matches,
386  * endp points to character the after the last matching character.
387  */
388 LOCAL BOOL
simplesearch(searchstr,searchlen,pattern,patlen,beginp,endp)389 simplesearch(searchstr, searchlen, pattern, patlen, beginp, endp)
390 	Uchar	*searchstr;	/* buffer to be searched	   */
391 	int	searchlen;	/* length of buffer to be searched */
392 	Uchar	*pattern;	/* search pattern		   */
393 	int	patlen;		/* length of search pattern	   */
394 	Uchar	**beginp;	/* start 'return' value		   */
395 	Uchar	**endp;		/* end 'return' value		   */
396 {
397 	register Uchar	*str;
398 	register Uchar	*pat;
399 	register Uchar	*strstart;
400 	register Uchar	*patstart;
401 	register int	n;
402 	register int	len;
403 	register int	plen;
404 
405 	len = searchlen;
406 	plen = patlen;
407 	strstart = searchstr;
408 	patstart = pattern;
409 	while (len-- >= plen) {
410 		for (str = strstart, pat = patstart, n = 0;
411 		    *str == *pat && n < plen;
412 		    str++, pat++, n++) {
413 			;
414 		}
415 		if (n == plen) {		/* Found */
416 			*beginp = strstart;
417 			*endp = str;
418 			return (TRUE);
419 		}
420 		strstart++;
421 	}
422 	return (FALSE);
423 }
424 
425 /*
426  * Search backwards for a simple pattern.
427  * If the pattern could be found, *beginp amd *endp are set.
428  * beginp points to the first character that matches,
429  * endp points to character the after the last matching character.
430  */
431 LOCAL BOOL
simplereverse(searchstr,searchlen,pattern,patlen,beginp,endp)432 simplereverse(searchstr, searchlen, pattern, patlen, beginp, endp)
433 		Uchar	*searchstr;	/* buffer to be searched	   */
434 	register int	searchlen;	/* length of buffer to be searched */
435 		Uchar	*pattern;	/* search pattern		   */
436 		int	patlen;		/* length of search pattern	   */
437 		Uchar	**beginp;	/* start 'return' value		   */
438 		Uchar	**endp;		/* end 'return' value		   */
439 {
440 	register Uchar	*str;
441 	register Uchar	*pat;
442 	register Uchar	*searchpos;
443 	register Uchar	*strstart;
444 	register Uchar	*patstart;
445 	register int	n;
446 	register int	plen;
447 
448 	plen = patlen;
449 	strstart = searchstr;
450 	searchpos = strstart + searchlen - plen + 1;
451 	patstart = pattern;
452 	while (--searchpos >= strstart) {
453 		for (str = searchpos, pat = patstart, n = 0;
454 		    *str == *pat && n < plen;
455 		    str++, pat++, n++)
456 			;
457 		if (n == plen) {		/* Found */
458 			*beginp = searchpos;
459 			*endp = str;
460 			return (TRUE);
461 		}
462 	}
463 	return (FALSE);
464 }
465 
466 /*
467  * Return the end of the found pattern and set a mark at the start of the
468  * found pattern if wanted.
469  */
470 LOCAL epos_t
searchdone(wp,start,ret,domark)471 searchdone(wp, start, ret, domark)
472 	ewin_t	*wp;
473 	epos_t	start;
474 	epos_t	ret;
475 	BOOL	domark;
476 {
477 	if (domark)
478 		setmark(wp, start);
479 	return (ret);
480 }
481