1 /*
2 * Copyright 2000
3 * Traakan, Inc., Los Altos, CA
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice unmodified, this list of conditions, and the following
11 * disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 /*
30 * Project: NDMJOB
31 * Ident: $Id: $
32 *
33 * Description:
34 * Binary Search Text File (BSTF)
35 *
36 * Use conventional binary search method on a sorted text
37 * file. The file MUST be sorted in ascending, lexicographic
38 * order. This is the default order of sort(1).
39 */
40
41
42 #include "ndmlib.h"
43
44
45 #ifdef SELF_TEST
46 int n_seek, n_align, n_line, n_getc;
47 #endif
48
49
50 #define MIN_DELTA 2048
51
52
53 /*
54 * ndmbstf_first()
55 * ndmbstf_first_with_bounds()
56 *
57 * Returns:
58 * <0 Error
59 * -1 EOF
60 * -2 Malformed line/file
61 * -3 fseek() to determine file size failed
62 * -4 fseek() to hop failed
63 * =0 First line found in buf[], does not match key
64 * >0 Length of first matching line in buf[]
65 */
66
ndmbstf_first(FILE * fp,char * key,char * buf,unsigned max_buf)67 int ndmbstf_first(FILE* fp, /* the file to search */
68 char* key, /* what we're looking for */
69 char* buf, /* returned line */
70 unsigned max_buf) /* maximum lenght of buf (sizeof (buf)) */
71 {
72 return ndmbstf_first_with_bounds(fp, key, buf, max_buf, 0, 0);
73 }
74
ndmbstf_first_with_bounds(FILE * fp,char * key,char * buf,unsigned max_buf,off_t lower_bound,off_t upper_bound)75 int ndmbstf_first_with_bounds(
76 FILE* fp, /* the file to search */
77 char* key, /* what we're looking for */
78 char* buf, /* returned line */
79 unsigned max_buf, /* maximum lenght of buf (sizeof (buf)) */
80 off_t lower_bound, /* offset, to skip headers, usually 0 */
81 off_t upper_bound) /* 0->don't know, >0 limit */
82 {
83 off_t off;
84 off_t lower, upper; /* bounds */
85 off_t delta;
86 int rc, buf_len;
87
88 if (upper_bound == 0) {
89 off_t end_off;
90
91 /*
92 * Determine the file size using fseek()/ftell()
93 */
94
95 fseeko(fp, 0, SEEK_END);
96 end_off = ftello(fp);
97 if (end_off == -1) return -3;
98 upper_bound = end_off;
99 }
100
101 /*
102 * Set lower and upper bounds of the binary search
103 */
104 lower = lower_bound;
105 upper = upper_bound;
106 for (;;) {
107 /*
108 * Determine the delta (distance) between the current
109 * lower and upper bounds. If delta is small, it is more
110 * efficient to do a linear search than to continue
111 * seeking. This is because stdio will pre-read
112 * portions no matter what and fseek()ing will discard
113 * the pre-read portions. MIN_DELTA is the approximation
114 * of the stdio pre-read size. Better to just
115 * linearly process the pre-read portion in the
116 * hopes that our answer is already sitting in the
117 * stdio buffer.
118 */
119 delta = upper - lower;
120 if (delta <= MIN_DELTA) break;
121
122 /*
123 * Seek to the first line after the midpoint
124 * between the lower and upper bounds.
125 */
126
127 off = lower + delta / 2;
128 rc = ndmbstf_seek_and_align(fp, &off);
129 if (rc < 0) {
130 if (rc == EOF) {
131 /*
132 * Alignment found a very long line without
133 * a \n at the end of the file. All we
134 * can do now is try a linear search
135 * from the current lower bound.
136 */
137 }
138 return -4; /* fseek() for hop failed */
139 }
140
141 /*
142 * Read the next line up into buf[].
143 */
144
145 buf_len = ndmbstf_getline(fp, buf, max_buf);
146 if (buf_len <= 0) {
147 /*
148 * EOF, or malformed line. All we can do now
149 * is try a linear search from the current
150 * lower bound.
151 */
152 break;
153 }
154
155 /*
156 * This is the crucial point.
157 *
158 * buf[] contains a line just read.
159 * off points the the line we just read.
160 * key[] is what we're looking for.
161 *
162 * Is the objective line (what we're trying to find)
163 * somewhere between lower..off or between off..upper.
164 */
165 rc = ndmbstf_compare(key, buf);
166 if (rc > 0) {
167 /* key>buf. Objective somewhere in off..upper */
168 lower = off;
169 } else {
170 /*
171 * key<=buf. Objective somewhere in lower..off
172 * Notice that if key==buf, there still might
173 * be a line ==key before the one we just
174 * read. There might be hundreds of such
175 * lines. The objective is the FIRST line
176 * greater than or equal to the key.
177 * This might be it. It might not.
178 * So, keep searching.
179 */
180 upper = off;
181 }
182 }
183
184 /*
185 * Do an unbounded linear search begining at the
186 * lower bound.
187 */
188
189 off = lower;
190 rc = ndmbstf_seek_and_align(fp, &off);
191 if (rc < 0) {
192 if (rc == EOF) {
193 /*
194 * Alignment found a very long line without
195 * a \n at the end of the file. All we
196 * can do is give up.
197 */
198 return -2;
199 }
200 return -4; /* fseek() for hop failed */
201 }
202
203 /*
204 * Read the next line up into buf[].
205 */
206 for (;;) {
207 buf_len = ndmbstf_getline(fp, buf, max_buf);
208
209 if (buf_len <= 0) {
210 if (buf_len == EOF) break; /* at end of file */
211 return -2;
212 }
213
214 rc = ndmbstf_compare(key, buf);
215 if (rc == 0) {
216 /* match */
217 return buf_len;
218 }
219 if (rc < 0) return 0; /* have line but it doesn't match */
220 }
221
222 return EOF;
223 }
224
ndmbstf_next(FILE * fp,char * key,char * buf,unsigned max_buf)225 int ndmbstf_next(FILE* fp, /* the file to search */
226 char* key, /* what we're looking for */
227 char* buf, /* returned line */
228 unsigned max_buf) /* maximum lenght of buf (sizeof (buf)) */
229 {
230 int rc, buf_len;
231
232 /*
233 * Read the next line up into buf[].
234 */
235 buf_len = ndmbstf_getline(fp, buf, max_buf);
236
237 if (buf_len <= 0) {
238 if (buf_len == EOF) return EOF; /* at end of file */
239 return -2; /* malformed line */
240 }
241
242 rc = ndmbstf_compare(key, buf);
243 if (rc == 0) {
244 /* match */
245 return buf_len;
246 } else {
247 return 0; /* have line but it doesn't match */
248 }
249 }
250
251
252 /*
253 * ndmbstr_getline()
254 *
255 * Returns:
256 * <0 Error
257 * -1 EOF
258 * -2 Malformed line
259 * >=0 Length of buf
260 */
261
ndmbstf_getline(FILE * fp,char * buf,unsigned max_buf)262 int ndmbstf_getline(FILE* fp, char* buf, unsigned max_buf)
263 {
264 char* p = buf;
265 char* p_end = buf + max_buf - 2;
266 int c;
267
268 while ((c = getc(fp)) != EOF) {
269 #ifdef SELF_TEST
270 n_getc++;
271 #endif
272 if (c == '\n') break;
273 if (p < p_end) *p++ = c;
274 }
275 *p = 0;
276
277 if (c == EOF) {
278 if (p > buf) return -2; /* malformed line */
279 return EOF;
280 }
281
282 #ifdef SELF_TEST
283 n_line++;
284 #endif
285 return p - buf;
286 }
287
ndmbstf_seek_and_align(FILE * fp,off_t * off)288 int ndmbstf_seek_and_align(FILE* fp, off_t* off)
289 {
290 int c;
291
292 if (fseeko(fp, *off, SEEK_SET) == -1) { return -2; }
293 #ifdef SELF_TEST
294 n_seek++;
295 #endif
296
297 /*
298 * There is a slim chance that we're at the
299 * true begining of a line. Too slim.
300 * Scan forward discarding the trailing
301 * portion of the line we just fseek()ed
302 * to, and leave the stdio stream positioned
303 * for the subsequent line. Notice
304 * we keep off upated so that it reflects
305 * the seek position of the stdio stream.
306 */
307
308 while ((c = getc(fp)) != EOF) {
309 *off += 1;
310 #ifdef SELF_TEST
311 n_align++;
312 #endif
313 if (c == '\n') break;
314 }
315 if (c == EOF) {
316 /* at end of file */
317 return EOF;
318 }
319
320 return 0;
321 }
322
323
324 /*
325 * ndmbstf_compare()
326 *
327 * Given a key[] and a buf[], return whether or not they match.
328 * This effectively is strncmp (key, buf, strlen(key)).
329 * Because of the cost of the call to strlen(), we implement
330 * it ndmbstf_compare() here with the exact semantics.
331 *
332 * Return values are:
333 * <0 No match, key[] less than buf[]
334 * =0 Match, the key[] is a prefix for buf[]
335 * >0 No match, key[] greater than buf[]
336 */
337
ndmbstf_compare(char * key,char * buf)338 int ndmbstf_compare(char* key, char* buf)
339 {
340 char* p = key;
341 char* q = buf;
342
343 while (*p != 0 && *p == *q) {
344 p++;
345 q++;
346 }
347
348 if (*p == 0)
349 return 0; /* entire key matched */
350 else
351 return *p - *q;
352 }
353
354
355 /*
356 * ndmbstf_match()
357 *
358 * A simple wrapper around ndmbstf_compare() with an
359 * easy-to-use return value..
360 *
361 * Returns:
362 * !0 match
363 * =0 no match
364 */
365
ndmbstf_match(char * key,char * buf)366 int ndmbstf_match(char* key, char* buf)
367 {
368 return ndmbstf_compare(key, buf) == 0;
369 }
370
371
372 #ifdef SELF_TEST
main(int ac,char * av[])373 int main(int ac, char* av[])
374 {
375 int i;
376 FILE* fp;
377 int verbose = 1;
378 int total_n_match = 0;
379 int total_n_no_match = 0;
380 int total_n_error = 0;
381
382 i = 1;
383 if (i < ac && strcmp(av[i], "-q") == 0) {
384 i++;
385 verbose = 0;
386 }
387
388 if (ac < i + 2) {
389 printf("usage: %s [-q] FILE KEY ...\n", av[0]);
390 return 1;
391 }
392
393 fp = fopen(av[i], "r");
394 if (!fp) {
395 perror(av[i]);
396 return 2;
397 }
398
399 for (i++; i < ac; i++) {
400 char buf[512];
401 char* key = av[i];
402 int n_match = 0;
403 int rc;
404
405 rc = ndmbstf_first(fp, key, buf, sizeof buf);
406 if (rc < 0) {
407 printf("Key '%s' err=%d\n", key, rc);
408 total_n_error++;
409 continue;
410 }
411
412 if (rc == 0) {
413 printf("Key '%s' no matches\n", key);
414 total_n_no_match++;
415 continue;
416 }
417
418 if (verbose) printf("Key '%s'\n", key);
419 for (; rc > 0; rc = ndmbstf_next(fp, key, buf, sizeof buf)) {
420 n_match++;
421 if (verbose) printf(" %2d: '%s'\n", n_match, buf);
422 }
423 total_n_match += n_match;
424 }
425
426 fclose(fp);
427
428 printf("n_match=%d n_miss=%d n_error=%d\n", total_n_match, total_n_no_match,
429 total_n_error);
430 printf("n_seek=%d n_align=%d n_line=%d\n", n_seek, n_align, n_line);
431
432 return 0;
433 }
434 #endif /* SELF_TEST */
435