xref: /openbsd/gnu/usr.bin/texinfo/info/search.c (revision a1acfa9b)
1 /* search.c -- searching large bodies of text.
2    $Id: search.c,v 1.1.1.5 2006/07/17 16:03:43 espie Exp $
3 
4    Copyright (C) 1993, 1997, 1998, 2002, 2004 Free Software Foundation, Inc.
5 
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 2, or (at your option)
9    any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License
17    along with this program; if not, write to the Free Software
18    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 
20    Written by Brian Fox (bfox@ai.mit.edu). */
21 
22 #include "info.h"
23 
24 #include "search.h"
25 #include "nodes.h"
26 
27 /* The search functions take two arguments:
28 
29      1) a string to search for, and
30 
31      2) a pointer to a SEARCH_BINDING which contains the buffer, start,
32         and end of the search.
33 
34    They return a long, which is the offset from the start of the buffer
35    at which the match was found.  An offset of -1 indicates failure. */
36 
37 /* A function which makes a binding with buffer and bounds. */
38 SEARCH_BINDING *
make_binding(char * buffer,long int start,long int end)39 make_binding (char *buffer, long int start, long int end)
40 {
41   SEARCH_BINDING *binding;
42 
43   binding = (SEARCH_BINDING *)xmalloc (sizeof (SEARCH_BINDING));
44   binding->buffer = buffer;
45   binding->start = start;
46   binding->end = end;
47   binding->flags = 0;
48 
49   return (binding);
50 }
51 
52 /* Make a copy of BINDING without duplicating the data. */
53 SEARCH_BINDING *
copy_binding(SEARCH_BINDING * binding)54 copy_binding (SEARCH_BINDING *binding)
55 {
56   SEARCH_BINDING *copy;
57 
58   copy = make_binding (binding->buffer, binding->start, binding->end);
59   copy->flags = binding->flags;
60   return (copy);
61 }
62 
63 
64 /* **************************************************************** */
65 /*                                                                  */
66 /*                 The Actual Searching Functions                   */
67 /*                                                                  */
68 /* **************************************************************** */
69 
70 /* Search forwards or backwards for the text delimited by BINDING.
71    The search is forwards if BINDING->start is greater than BINDING->end. */
72 long
search(char * string,SEARCH_BINDING * binding)73 search (char *string, SEARCH_BINDING *binding)
74 {
75   long result;
76 
77   /* If the search is backwards, then search backwards, otherwise forwards. */
78   if (binding->start > binding->end)
79     result = search_backward (string, binding);
80   else
81     result = search_forward (string, binding);
82 
83   return (result);
84 }
85 
86 /* Search forwards for STRING through the text delimited in BINDING. */
87 long
search_forward(char * string,SEARCH_BINDING * binding)88 search_forward (char *string, SEARCH_BINDING *binding)
89 {
90   register int c, i, len;
91   register char *buff, *end;
92   char *alternate = (char *)NULL;
93 
94   len = strlen (string);
95 
96   /* We match characters in the search buffer against STRING and ALTERNATE.
97      ALTERNATE is a case reversed version of STRING; this is cheaper than
98      case folding each character before comparison.   Alternate is only
99      used if the case folding bit is turned on in the passed BINDING. */
100 
101   if (binding->flags & S_FoldCase)
102     {
103       alternate = xstrdup (string);
104 
105       for (i = 0; i < len; i++)
106         {
107           if (islower (alternate[i]))
108             alternate[i] = toupper (alternate[i]);
109           else if (isupper (alternate[i]))
110             alternate[i] = tolower (alternate[i]);
111         }
112     }
113 
114   buff = binding->buffer + binding->start;
115   end = binding->buffer + binding->end + 1;
116 
117   while (buff < (end - len))
118     {
119       for (i = 0; i < len; i++)
120         {
121           c = buff[i];
122 
123           if ((c != string[i]) && (!alternate || c != alternate[i]))
124             break;
125         }
126 
127       if (!string[i])
128         {
129           if (alternate)
130             free (alternate);
131           if (binding->flags & S_SkipDest)
132             buff += len;
133           return ((long) (buff - binding->buffer));
134         }
135 
136       buff++;
137     }
138 
139   if (alternate)
140     free (alternate);
141 
142   return ((long) -1);
143 }
144 
145 /* Search for STRING backwards through the text delimited in BINDING. */
146 long
search_backward(char * input_string,SEARCH_BINDING * binding)147 search_backward (char *input_string, SEARCH_BINDING *binding)
148 {
149   register int c, i, len;
150   register char *buff, *end;
151   char *string;
152   char *alternate = (char *)NULL;
153 
154   len = strlen (input_string);
155 
156   /* Reverse the characters in the search string. */
157   string = (char *)xmalloc (1 + len);
158   for (c = 0, i = len - 1; input_string[c]; c++, i--)
159     string[i] = input_string[c];
160 
161   string[c] = '\0';
162 
163   /* We match characters in the search buffer against STRING and ALTERNATE.
164      ALTERNATE is a case reversed version of STRING; this is cheaper than
165      case folding each character before comparison.   ALTERNATE is only
166      used if the case folding bit is turned on in the passed BINDING. */
167 
168   if (binding->flags & S_FoldCase)
169     {
170       alternate = xstrdup (string);
171 
172       for (i = 0; i < len; i++)
173         {
174           if (islower (alternate[i]))
175             alternate[i] = toupper (alternate[i]);
176           else if (isupper (alternate[i]))
177             alternate[i] = tolower (alternate[i]);
178         }
179     }
180 
181   buff = binding->buffer + binding->start - 1;
182   end = binding->buffer + binding->end;
183 
184   while (buff > (end + len))
185     {
186       for (i = 0; i < len; i++)
187         {
188           c = *(buff - i);
189 
190           if (c != string[i] && (!alternate || c != alternate[i]))
191             break;
192         }
193 
194       if (!string[i])
195         {
196           free (string);
197           if (alternate)
198             free (alternate);
199 
200           if (binding->flags & S_SkipDest)
201             buff -= len;
202           return ((long) (1 + (buff - binding->buffer)));
203         }
204 
205       buff--;
206     }
207 
208   free (string);
209   if (alternate)
210     free (alternate);
211 
212   return ((long) -1);
213 }
214 
215 /* Find STRING in LINE, returning the offset of the end of the string.
216    Return an offset of -1 if STRING does not appear in LINE.  The search
217    is bound by the end of the line (i.e., either NEWLINE or 0). */
218 int
string_in_line(char * string,char * line)219 string_in_line (char *string, char *line)
220 {
221   register int end;
222   SEARCH_BINDING binding;
223 
224   /* Find the end of the line. */
225   for (end = 0; line[end] && line[end] != '\n'; end++);
226 
227   /* Search for STRING within these confines. */
228   binding.buffer = line;
229   binding.start = 0;
230   binding.end = end;
231   binding.flags = S_FoldCase | S_SkipDest;
232 
233   return (search_forward (string, &binding));
234 }
235 
236 /* Return non-zero if STRING is the first text to appear at BINDING. */
237 int
looking_at(char * string,SEARCH_BINDING * binding)238 looking_at (char *string, SEARCH_BINDING *binding)
239 {
240   long search_end;
241 
242   search_end = search (string, binding);
243 
244   /* If the string was not found, SEARCH_END is -1.  If the string was found,
245      but not right away, SEARCH_END is != binding->start.  Otherwise, the
246      string was found at binding->start. */
247   return (search_end == binding->start);
248 }
249 
250 /* **************************************************************** */
251 /*                                                                  */
252 /*                    Small String Searches                         */
253 /*                                                                  */
254 /* **************************************************************** */
255 
256 /* Function names that start with "skip" are passed a string, and return
257    an offset from the start of that string.  Function names that start
258    with "find" are passed a SEARCH_BINDING, and return an absolute position
259    marker of the item being searched for.  "Find" functions return a value
260    of -1 if the item being looked for couldn't be found. */
261 
262 /* Return the index of the first non-whitespace character in STRING. */
263 int
skip_whitespace(char * string)264 skip_whitespace (char *string)
265 {
266   register int i;
267 
268   for (i = 0; string && whitespace (string[i]); i++);
269   return (i);
270 }
271 
272 /* Return the index of the first non-whitespace or newline character in
273    STRING. */
274 int
skip_whitespace_and_newlines(char * string)275 skip_whitespace_and_newlines (char *string)
276 {
277   register int i;
278 
279   for (i = 0; string && whitespace_or_newline (string[i]); i++);
280   return (i);
281 }
282 
283 /* Return the index of the first whitespace character in STRING. */
284 int
skip_non_whitespace(char * string)285 skip_non_whitespace (char *string)
286 {
287   register int i;
288 
289   for (i = 0; string && string[i] && !whitespace (string[i]); i++);
290   return (i);
291 }
292 
293 /* Return the index of the first non-node character in STRING.  Note that
294    this function contains quite a bit of hair to ignore periods in some
295    special cases.  This is because we here at GNU ship some info files which
296    contain nodenames that contain periods.  No such nodename can start with
297    a period, or continue with whitespace, newline, or ')' immediately following
298    the period.  If second argument NEWLINES_OKAY is non-zero, newlines should
299    be skipped while parsing out the nodename specification. */
300 int
skip_node_characters(char * string,int newlines_okay)301 skip_node_characters (char *string, int newlines_okay)
302 {
303   register int c, i = 0;
304   int paren_seen = 0;
305   int paren = 0;
306 
307   /* Handle special case.  This is when another function has parsed out the
308      filename component of the node name, and we just want to parse out the
309      nodename proper.  In that case, a period at the start of the nodename
310      indicates an empty nodename. */
311   if (string && *string == '.')
312     return (0);
313 
314   if (string && *string == '(')
315     {
316       paren++;
317       paren_seen++;
318       i++;
319     }
320 
321   for (; string && (c = string[i]); i++)
322     {
323       if (paren)
324         {
325           if (c == '(')
326             paren++;
327           else if (c == ')')
328             paren--;
329 
330           continue;
331         }
332 
333       /* If the character following the close paren is a space or period,
334          then this node name has no more characters associated with it. */
335       if (c == '\t' ||
336           c == ','  ||
337           c == INFO_TAGSEP ||
338           ((!newlines_okay) && (c == '\n')) ||
339           ((paren_seen && string[i - 1] == ')') &&
340            (c == ' ' || c == '.')) ||
341           (c == '.' &&
342            (
343 #if 0
344 /* This test causes a node name ending in a period, like `This.', not to
345    be found.  The trailing . is stripped.  This occurs in the jargon
346    file (`I see no X here.' is a node name).  */
347            (!string[i + 1]) ||
348 #endif
349             (whitespace_or_newline (string[i + 1])) ||
350             (string[i + 1] == ')'))))
351         break;
352     }
353   return (i);
354 }
355 
356 
357 /* **************************************************************** */
358 /*                                                                  */
359 /*                   Searching FILE_BUFFER's                        */
360 /*                                                                  */
361 /* **************************************************************** */
362 
363 /* Return the absolute position of the first occurence of a node separator in
364    BINDING-buffer.  The search starts at BINDING->start.  Return -1 if no node
365    separator was found. */
366 long
find_node_separator(SEARCH_BINDING * binding)367 find_node_separator (SEARCH_BINDING *binding)
368 {
369   register long i;
370   char *body;
371 
372   body = binding->buffer;
373 
374   /* A node is started by [^L]^_[^L]\n.  That is to say, the C-l's are
375      optional, but the DELETE and NEWLINE are not.  This separator holds
376      true for all separated elements in an Info file, including the tags
377      table (if present) and the indirect tags table (if present). */
378   for (i = binding->start; i < binding->end - 1; i++)
379     if (((body[i] == INFO_FF && body[i + 1] == INFO_COOKIE) &&
380          (body[i + 2] == '\n' ||
381           (body[i + 2] == INFO_FF && body[i + 3] == '\n'))) ||
382         ((body[i] == INFO_COOKIE) &&
383          (body[i + 1] == '\n' ||
384           (body[i + 1] == INFO_FF && body[i + 2] == '\n'))))
385       return (i);
386   return (-1);
387 }
388 
389 /* Return the length of the node separator characters that BODY is
390    currently pointing at. */
391 int
skip_node_separator(char * body)392 skip_node_separator (char *body)
393 {
394   register int i;
395 
396   i = 0;
397 
398   if (body[i] == INFO_FF)
399     i++;
400 
401   if (body[i++] != INFO_COOKIE)
402     return (0);
403 
404   if (body[i] == INFO_FF)
405     i++;
406 
407   if (body[i++] != '\n')
408     return (0);
409 
410   return (i);
411 }
412 
413 /* Return the number of characters from STRING to the start of
414    the next line. */
415 int
skip_line(char * string)416 skip_line (char *string)
417 {
418   register int i;
419 
420   for (i = 0; string && string[i] && string[i] != '\n'; i++);
421 
422   if (string[i] == '\n')
423     i++;
424 
425   return (i);
426 }
427 
428 /* Return the absolute position of the beginning of a tags table in this
429    binding starting the search at binding->start. */
430 long
find_tags_table(SEARCH_BINDING * binding)431 find_tags_table (SEARCH_BINDING *binding)
432 {
433   SEARCH_BINDING tmp_search;
434   long position;
435 
436   tmp_search.buffer = binding->buffer;
437   tmp_search.start = binding->start;
438   tmp_search.end = binding->end;
439   tmp_search.flags = S_FoldCase;
440 
441   while ((position = find_node_separator (&tmp_search)) != -1 )
442     {
443       tmp_search.start = position;
444       tmp_search.start += skip_node_separator (tmp_search.buffer
445           + tmp_search.start);
446 
447       if (looking_at (TAGS_TABLE_BEG_LABEL, &tmp_search))
448         return (position);
449     }
450   return (-1);
451 }
452 
453 /* Return the absolute position of the node named NODENAME in BINDING.
454    This is a brute force search, and we wish to avoid it when possible.
455    This function is called when a tag (indirect or otherwise) doesn't
456    really point to the right node.  It returns the absolute position of
457    the separator preceding the node. */
458 long
find_node_in_binding(char * nodename,SEARCH_BINDING * binding)459 find_node_in_binding (char *nodename, SEARCH_BINDING *binding)
460 {
461   long position;
462   int offset, namelen;
463   SEARCH_BINDING tmp_search;
464 
465   namelen = strlen (nodename);
466 
467   tmp_search.buffer = binding->buffer;
468   tmp_search.start = binding->start;
469   tmp_search.end = binding->end;
470   tmp_search.flags = 0;
471 
472   while ((position = find_node_separator (&tmp_search)) != -1)
473     {
474       tmp_search.start = position;
475       tmp_search.start += skip_node_separator
476         (tmp_search.buffer + tmp_search.start);
477 
478       offset = string_in_line
479         (INFO_NODE_LABEL, tmp_search.buffer + tmp_search.start);
480 
481       if (offset == -1)
482         continue;
483 
484       tmp_search.start += offset;
485       tmp_search.start += skip_whitespace (tmp_search.buffer + tmp_search.start);
486       offset = skip_node_characters
487         (tmp_search.buffer + tmp_search.start, DONT_SKIP_NEWLINES);
488 
489       /* Notice that this is an exact match.  You cannot grovel through
490          the buffer with this function looking for random nodes. */
491        if ((offset == namelen) &&
492            (tmp_search.buffer[tmp_search.start] == nodename[0]) &&
493            (strncmp (tmp_search.buffer + tmp_search.start, nodename, offset) == 0))
494          return (position);
495     }
496   return (-1);
497 }
498