1 /* search.c -- searching large bodies of text. 2 $Id: search.c,v 1.1.1.5 2006/07/17 16:03:43 espie Exp $ 3 4 Copyright (C) 1993, 1997, 1998, 2002, 2004 Free Software Foundation, Inc. 5 6 This program is free software; you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 2, or (at your option) 9 any later version. 10 11 This program is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with this program; if not, write to the Free Software 18 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 19 20 Written by Brian Fox (bfox@ai.mit.edu). */ 21 22 #include "info.h" 23 24 #include "search.h" 25 #include "nodes.h" 26 27 /* The search functions take two arguments: 28 29 1) a string to search for, and 30 31 2) a pointer to a SEARCH_BINDING which contains the buffer, start, 32 and end of the search. 33 34 They return a long, which is the offset from the start of the buffer 35 at which the match was found. An offset of -1 indicates failure. */ 36 37 /* A function which makes a binding with buffer and bounds. */ 38 SEARCH_BINDING * 39 make_binding (char *buffer, long int start, long int end) 40 { 41 SEARCH_BINDING *binding; 42 43 binding = (SEARCH_BINDING *)xmalloc (sizeof (SEARCH_BINDING)); 44 binding->buffer = buffer; 45 binding->start = start; 46 binding->end = end; 47 binding->flags = 0; 48 49 return (binding); 50 } 51 52 /* Make a copy of BINDING without duplicating the data. */ 53 SEARCH_BINDING * 54 copy_binding (SEARCH_BINDING *binding) 55 { 56 SEARCH_BINDING *copy; 57 58 copy = make_binding (binding->buffer, binding->start, binding->end); 59 copy->flags = binding->flags; 60 return (copy); 61 } 62 63 64 /* **************************************************************** */ 65 /* */ 66 /* The Actual Searching Functions */ 67 /* */ 68 /* **************************************************************** */ 69 70 /* Search forwards or backwards for the text delimited by BINDING. 71 The search is forwards if BINDING->start is greater than BINDING->end. */ 72 long 73 search (char *string, SEARCH_BINDING *binding) 74 { 75 long result; 76 77 /* If the search is backwards, then search backwards, otherwise forwards. */ 78 if (binding->start > binding->end) 79 result = search_backward (string, binding); 80 else 81 result = search_forward (string, binding); 82 83 return (result); 84 } 85 86 /* Search forwards for STRING through the text delimited in BINDING. */ 87 long 88 search_forward (char *string, SEARCH_BINDING *binding) 89 { 90 register int c, i, len; 91 register char *buff, *end; 92 char *alternate = (char *)NULL; 93 94 len = strlen (string); 95 96 /* We match characters in the search buffer against STRING and ALTERNATE. 97 ALTERNATE is a case reversed version of STRING; this is cheaper than 98 case folding each character before comparison. Alternate is only 99 used if the case folding bit is turned on in the passed BINDING. */ 100 101 if (binding->flags & S_FoldCase) 102 { 103 alternate = xstrdup (string); 104 105 for (i = 0; i < len; i++) 106 { 107 if (islower (alternate[i])) 108 alternate[i] = toupper (alternate[i]); 109 else if (isupper (alternate[i])) 110 alternate[i] = tolower (alternate[i]); 111 } 112 } 113 114 buff = binding->buffer + binding->start; 115 end = binding->buffer + binding->end + 1; 116 117 while (buff < (end - len)) 118 { 119 for (i = 0; i < len; i++) 120 { 121 c = buff[i]; 122 123 if ((c != string[i]) && (!alternate || c != alternate[i])) 124 break; 125 } 126 127 if (!string[i]) 128 { 129 if (alternate) 130 free (alternate); 131 if (binding->flags & S_SkipDest) 132 buff += len; 133 return ((long) (buff - binding->buffer)); 134 } 135 136 buff++; 137 } 138 139 if (alternate) 140 free (alternate); 141 142 return ((long) -1); 143 } 144 145 /* Search for STRING backwards through the text delimited in BINDING. */ 146 long 147 search_backward (char *input_string, SEARCH_BINDING *binding) 148 { 149 register int c, i, len; 150 register char *buff, *end; 151 char *string; 152 char *alternate = (char *)NULL; 153 154 len = strlen (input_string); 155 156 /* Reverse the characters in the search string. */ 157 string = (char *)xmalloc (1 + len); 158 for (c = 0, i = len - 1; input_string[c]; c++, i--) 159 string[i] = input_string[c]; 160 161 string[c] = '\0'; 162 163 /* We match characters in the search buffer against STRING and ALTERNATE. 164 ALTERNATE is a case reversed version of STRING; this is cheaper than 165 case folding each character before comparison. ALTERNATE is only 166 used if the case folding bit is turned on in the passed BINDING. */ 167 168 if (binding->flags & S_FoldCase) 169 { 170 alternate = xstrdup (string); 171 172 for (i = 0; i < len; i++) 173 { 174 if (islower (alternate[i])) 175 alternate[i] = toupper (alternate[i]); 176 else if (isupper (alternate[i])) 177 alternate[i] = tolower (alternate[i]); 178 } 179 } 180 181 buff = binding->buffer + binding->start - 1; 182 end = binding->buffer + binding->end; 183 184 while (buff > (end + len)) 185 { 186 for (i = 0; i < len; i++) 187 { 188 c = *(buff - i); 189 190 if (c != string[i] && (!alternate || c != alternate[i])) 191 break; 192 } 193 194 if (!string[i]) 195 { 196 free (string); 197 if (alternate) 198 free (alternate); 199 200 if (binding->flags & S_SkipDest) 201 buff -= len; 202 return ((long) (1 + (buff - binding->buffer))); 203 } 204 205 buff--; 206 } 207 208 free (string); 209 if (alternate) 210 free (alternate); 211 212 return ((long) -1); 213 } 214 215 /* Find STRING in LINE, returning the offset of the end of the string. 216 Return an offset of -1 if STRING does not appear in LINE. The search 217 is bound by the end of the line (i.e., either NEWLINE or 0). */ 218 int 219 string_in_line (char *string, char *line) 220 { 221 register int end; 222 SEARCH_BINDING binding; 223 224 /* Find the end of the line. */ 225 for (end = 0; line[end] && line[end] != '\n'; end++); 226 227 /* Search for STRING within these confines. */ 228 binding.buffer = line; 229 binding.start = 0; 230 binding.end = end; 231 binding.flags = S_FoldCase | S_SkipDest; 232 233 return (search_forward (string, &binding)); 234 } 235 236 /* Return non-zero if STRING is the first text to appear at BINDING. */ 237 int 238 looking_at (char *string, SEARCH_BINDING *binding) 239 { 240 long search_end; 241 242 search_end = search (string, binding); 243 244 /* If the string was not found, SEARCH_END is -1. If the string was found, 245 but not right away, SEARCH_END is != binding->start. Otherwise, the 246 string was found at binding->start. */ 247 return (search_end == binding->start); 248 } 249 250 /* **************************************************************** */ 251 /* */ 252 /* Small String Searches */ 253 /* */ 254 /* **************************************************************** */ 255 256 /* Function names that start with "skip" are passed a string, and return 257 an offset from the start of that string. Function names that start 258 with "find" are passed a SEARCH_BINDING, and return an absolute position 259 marker of the item being searched for. "Find" functions return a value 260 of -1 if the item being looked for couldn't be found. */ 261 262 /* Return the index of the first non-whitespace character in STRING. */ 263 int 264 skip_whitespace (char *string) 265 { 266 register int i; 267 268 for (i = 0; string && whitespace (string[i]); i++); 269 return (i); 270 } 271 272 /* Return the index of the first non-whitespace or newline character in 273 STRING. */ 274 int 275 skip_whitespace_and_newlines (char *string) 276 { 277 register int i; 278 279 for (i = 0; string && whitespace_or_newline (string[i]); i++); 280 return (i); 281 } 282 283 /* Return the index of the first whitespace character in STRING. */ 284 int 285 skip_non_whitespace (char *string) 286 { 287 register int i; 288 289 for (i = 0; string && string[i] && !whitespace (string[i]); i++); 290 return (i); 291 } 292 293 /* Return the index of the first non-node character in STRING. Note that 294 this function contains quite a bit of hair to ignore periods in some 295 special cases. This is because we here at GNU ship some info files which 296 contain nodenames that contain periods. No such nodename can start with 297 a period, or continue with whitespace, newline, or ')' immediately following 298 the period. If second argument NEWLINES_OKAY is non-zero, newlines should 299 be skipped while parsing out the nodename specification. */ 300 int 301 skip_node_characters (char *string, int newlines_okay) 302 { 303 register int c, i = 0; 304 int paren_seen = 0; 305 int paren = 0; 306 307 /* Handle special case. This is when another function has parsed out the 308 filename component of the node name, and we just want to parse out the 309 nodename proper. In that case, a period at the start of the nodename 310 indicates an empty nodename. */ 311 if (string && *string == '.') 312 return (0); 313 314 if (string && *string == '(') 315 { 316 paren++; 317 paren_seen++; 318 i++; 319 } 320 321 for (; string && (c = string[i]); i++) 322 { 323 if (paren) 324 { 325 if (c == '(') 326 paren++; 327 else if (c == ')') 328 paren--; 329 330 continue; 331 } 332 333 /* If the character following the close paren is a space or period, 334 then this node name has no more characters associated with it. */ 335 if (c == '\t' || 336 c == ',' || 337 c == INFO_TAGSEP || 338 ((!newlines_okay) && (c == '\n')) || 339 ((paren_seen && string[i - 1] == ')') && 340 (c == ' ' || c == '.')) || 341 (c == '.' && 342 ( 343 #if 0 344 /* This test causes a node name ending in a period, like `This.', not to 345 be found. The trailing . is stripped. This occurs in the jargon 346 file (`I see no X here.' is a node name). */ 347 (!string[i + 1]) || 348 #endif 349 (whitespace_or_newline (string[i + 1])) || 350 (string[i + 1] == ')')))) 351 break; 352 } 353 return (i); 354 } 355 356 357 /* **************************************************************** */ 358 /* */ 359 /* Searching FILE_BUFFER's */ 360 /* */ 361 /* **************************************************************** */ 362 363 /* Return the absolute position of the first occurence of a node separator in 364 BINDING-buffer. The search starts at BINDING->start. Return -1 if no node 365 separator was found. */ 366 long 367 find_node_separator (SEARCH_BINDING *binding) 368 { 369 register long i; 370 char *body; 371 372 body = binding->buffer; 373 374 /* A node is started by [^L]^_[^L]\n. That is to say, the C-l's are 375 optional, but the DELETE and NEWLINE are not. This separator holds 376 true for all separated elements in an Info file, including the tags 377 table (if present) and the indirect tags table (if present). */ 378 for (i = binding->start; i < binding->end - 1; i++) 379 if (((body[i] == INFO_FF && body[i + 1] == INFO_COOKIE) && 380 (body[i + 2] == '\n' || 381 (body[i + 2] == INFO_FF && body[i + 3] == '\n'))) || 382 ((body[i] == INFO_COOKIE) && 383 (body[i + 1] == '\n' || 384 (body[i + 1] == INFO_FF && body[i + 2] == '\n')))) 385 return (i); 386 return (-1); 387 } 388 389 /* Return the length of the node separator characters that BODY is 390 currently pointing at. */ 391 int 392 skip_node_separator (char *body) 393 { 394 register int i; 395 396 i = 0; 397 398 if (body[i] == INFO_FF) 399 i++; 400 401 if (body[i++] != INFO_COOKIE) 402 return (0); 403 404 if (body[i] == INFO_FF) 405 i++; 406 407 if (body[i++] != '\n') 408 return (0); 409 410 return (i); 411 } 412 413 /* Return the number of characters from STRING to the start of 414 the next line. */ 415 int 416 skip_line (char *string) 417 { 418 register int i; 419 420 for (i = 0; string && string[i] && string[i] != '\n'; i++); 421 422 if (string[i] == '\n') 423 i++; 424 425 return (i); 426 } 427 428 /* Return the absolute position of the beginning of a tags table in this 429 binding starting the search at binding->start. */ 430 long 431 find_tags_table (SEARCH_BINDING *binding) 432 { 433 SEARCH_BINDING tmp_search; 434 long position; 435 436 tmp_search.buffer = binding->buffer; 437 tmp_search.start = binding->start; 438 tmp_search.end = binding->end; 439 tmp_search.flags = S_FoldCase; 440 441 while ((position = find_node_separator (&tmp_search)) != -1 ) 442 { 443 tmp_search.start = position; 444 tmp_search.start += skip_node_separator (tmp_search.buffer 445 + tmp_search.start); 446 447 if (looking_at (TAGS_TABLE_BEG_LABEL, &tmp_search)) 448 return (position); 449 } 450 return (-1); 451 } 452 453 /* Return the absolute position of the node named NODENAME in BINDING. 454 This is a brute force search, and we wish to avoid it when possible. 455 This function is called when a tag (indirect or otherwise) doesn't 456 really point to the right node. It returns the absolute position of 457 the separator preceding the node. */ 458 long 459 find_node_in_binding (char *nodename, SEARCH_BINDING *binding) 460 { 461 long position; 462 int offset, namelen; 463 SEARCH_BINDING tmp_search; 464 465 namelen = strlen (nodename); 466 467 tmp_search.buffer = binding->buffer; 468 tmp_search.start = binding->start; 469 tmp_search.end = binding->end; 470 tmp_search.flags = 0; 471 472 while ((position = find_node_separator (&tmp_search)) != -1) 473 { 474 tmp_search.start = position; 475 tmp_search.start += skip_node_separator 476 (tmp_search.buffer + tmp_search.start); 477 478 offset = string_in_line 479 (INFO_NODE_LABEL, tmp_search.buffer + tmp_search.start); 480 481 if (offset == -1) 482 continue; 483 484 tmp_search.start += offset; 485 tmp_search.start += skip_whitespace (tmp_search.buffer + tmp_search.start); 486 offset = skip_node_characters 487 (tmp_search.buffer + tmp_search.start, DONT_SKIP_NEWLINES); 488 489 /* Notice that this is an exact match. You cannot grovel through 490 the buffer with this function looking for random nodes. */ 491 if ((offset == namelen) && 492 (tmp_search.buffer[tmp_search.start] == nodename[0]) && 493 (strncmp (tmp_search.buffer + tmp_search.start, nodename, offset) == 0)) 494 return (position); 495 } 496 return (-1); 497 } 498