1 /*
2  * Copyright 2000
3  *      Traakan, Inc., Los Altos, CA
4  *      All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice unmodified, this list of conditions, and the following
11  *    disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 /*
30  * Project:  NDMJOB
31  * Ident:    $Id: $
32  *
33  * Description:
34  *      Binary Search Text File (BSTF)
35  *
36  *      Use conventional binary search method on a sorted text
37  *      file. The file MUST be sorted in ascending, lexicographic
38  *      order. This is the default order of sort(1).
39  */
40 
41 
42 #include "ndmlib.h"
43 
44 
45 #ifdef SELF_TEST
46 int n_seek, n_align, n_line, n_getc;
47 #endif
48 
49 
50 #define MIN_DELTA 2048
51 
52 
53 /*
54  * ndmbstf_first()
55  * ndmbstf_first_with_bounds()
56  *
57  * Returns:
58  *      <0      Error
59  *         -1   EOF
60  *         -2   Malformed line/file
61  *         -3   fseek() to determine file size failed
62  *         -4   fseek() to hop failed
63  *      =0      First line found in buf[], does not match key
64  *      >0      Length of first matching line in buf[]
65  */
66 
ndmbstf_first(FILE * fp,char * key,char * buf,unsigned max_buf)67 int ndmbstf_first(FILE* fp,         /* the file to search */
68                   char* key,        /* what we're looking for */
69                   char* buf,        /* returned line */
70                   unsigned max_buf) /* maximum lenght of buf (sizeof (buf)) */
71 {
72   return ndmbstf_first_with_bounds(fp, key, buf, max_buf, 0, 0);
73 }
74 
ndmbstf_first_with_bounds(FILE * fp,char * key,char * buf,unsigned max_buf,off_t lower_bound,off_t upper_bound)75 int ndmbstf_first_with_bounds(
76     FILE* fp,          /* the file to search */
77     char* key,         /* what we're looking for */
78     char* buf,         /* returned line */
79     unsigned max_buf,  /* maximum lenght of buf (sizeof (buf)) */
80     off_t lower_bound, /* offset, to skip headers, usually 0 */
81     off_t upper_bound) /* 0->don't know, >0 limit */
82 {
83   off_t off;
84   off_t lower, upper; /* bounds */
85   off_t delta;
86   int rc, buf_len;
87 
88   if (upper_bound == 0) {
89     off_t end_off;
90 
91     /*
92      * Determine the file size using fseek()/ftell()
93      */
94 
95     fseeko(fp, 0, SEEK_END);
96     end_off = ftello(fp);
97     if (end_off == -1) return -3;
98     upper_bound = end_off;
99   }
100 
101   /*
102    * Set lower and upper bounds of the binary search
103    */
104   lower = lower_bound;
105   upper = upper_bound;
106   for (;;) {
107     /*
108      * Determine the delta (distance) between the current
109      * lower and upper bounds. If delta is small, it is more
110      * efficient to do a linear search than to continue
111      * seeking. This is because stdio will pre-read
112      * portions no matter what and fseek()ing will discard
113      * the pre-read portions. MIN_DELTA is the approximation
114      * of the stdio pre-read size. Better to just
115      * linearly process the pre-read portion in the
116      * hopes that our answer is already sitting in the
117      * stdio buffer.
118      */
119     delta = upper - lower;
120     if (delta <= MIN_DELTA) break;
121 
122     /*
123      * Seek to the first line after the midpoint
124      * between the lower and upper bounds.
125      */
126 
127     off = lower + delta / 2;
128     rc = ndmbstf_seek_and_align(fp, &off);
129     if (rc < 0) {
130       if (rc == EOF) {
131         /*
132          * Alignment found a very long line without
133          * a \n at the end of the file. All we
134          * can do now is try a linear search
135          * from the current lower bound.
136          */
137       }
138       return -4; /* fseek() for hop failed */
139     }
140 
141     /*
142      * Read the next line up into buf[].
143      */
144 
145     buf_len = ndmbstf_getline(fp, buf, max_buf);
146     if (buf_len <= 0) {
147       /*
148        * EOF, or malformed line. All we can do now
149        * is try a linear search from the current
150        * lower bound.
151        */
152       break;
153     }
154 
155     /*
156      * This is the crucial point.
157      *
158      *   buf[]  contains a line just read.
159      *   off    points the the line we just read.
160      *   key[]  is what we're looking for.
161      *
162      * Is the objective line (what we're trying to find)
163      * somewhere between lower..off or between off..upper.
164      */
165     rc = ndmbstf_compare(key, buf);
166     if (rc > 0) {
167       /* key>buf. Objective somewhere in off..upper */
168       lower = off;
169     } else {
170       /*
171        * key<=buf. Objective somewhere in lower..off
172        * Notice that if key==buf, there still might
173        * be a line ==key before the one we just
174        * read. There might be hundreds of such
175        * lines. The objective is the FIRST line
176        * greater than or equal to the key.
177        * This might be it. It might not.
178        * So, keep searching.
179        */
180       upper = off;
181     }
182   }
183 
184   /*
185    * Do an unbounded linear search begining at the
186    * lower bound.
187    */
188 
189   off = lower;
190   rc = ndmbstf_seek_and_align(fp, &off);
191   if (rc < 0) {
192     if (rc == EOF) {
193       /*
194        * Alignment found a very long line without
195        * a \n at the end of the file. All we
196        * can do is give up.
197        */
198       return -2;
199     }
200     return -4; /* fseek() for hop failed */
201   }
202 
203   /*
204    * Read the next line up into buf[].
205    */
206   for (;;) {
207     buf_len = ndmbstf_getline(fp, buf, max_buf);
208 
209     if (buf_len <= 0) {
210       if (buf_len == EOF) break; /* at end of file */
211       return -2;
212     }
213 
214     rc = ndmbstf_compare(key, buf);
215     if (rc == 0) {
216       /* match */
217       return buf_len;
218     }
219     if (rc < 0) return 0; /* have line but it doesn't match */
220   }
221 
222   return EOF;
223 }
224 
ndmbstf_next(FILE * fp,char * key,char * buf,unsigned max_buf)225 int ndmbstf_next(FILE* fp,         /* the file to search */
226                  char* key,        /* what we're looking for */
227                  char* buf,        /* returned line */
228                  unsigned max_buf) /* maximum lenght of buf (sizeof (buf)) */
229 {
230   int rc, buf_len;
231 
232   /*
233    * Read the next line up into buf[].
234    */
235   buf_len = ndmbstf_getline(fp, buf, max_buf);
236 
237   if (buf_len <= 0) {
238     if (buf_len == EOF) return EOF; /* at end of file */
239     return -2;                      /* malformed line */
240   }
241 
242   rc = ndmbstf_compare(key, buf);
243   if (rc == 0) {
244     /* match */
245     return buf_len;
246   } else {
247     return 0; /* have line but it doesn't match */
248   }
249 }
250 
251 
252 /*
253  * ndmbstr_getline()
254  *
255  * Returns:
256  *      <0      Error
257  *         -1   EOF
258  *         -2   Malformed line
259  *      >=0     Length of buf
260  */
261 
ndmbstf_getline(FILE * fp,char * buf,unsigned max_buf)262 int ndmbstf_getline(FILE* fp, char* buf, unsigned max_buf)
263 {
264   char* p = buf;
265   char* p_end = buf + max_buf - 2;
266   int c;
267 
268   while ((c = getc(fp)) != EOF) {
269 #ifdef SELF_TEST
270     n_getc++;
271 #endif
272     if (c == '\n') break;
273     if (p < p_end) *p++ = c;
274   }
275   *p = 0;
276 
277   if (c == EOF) {
278     if (p > buf) return -2; /* malformed line */
279     return EOF;
280   }
281 
282 #ifdef SELF_TEST
283   n_line++;
284 #endif
285   return p - buf;
286 }
287 
ndmbstf_seek_and_align(FILE * fp,off_t * off)288 int ndmbstf_seek_and_align(FILE* fp, off_t* off)
289 {
290   int c;
291 
292   if (fseeko(fp, *off, SEEK_SET) == -1) { return -2; }
293 #ifdef SELF_TEST
294   n_seek++;
295 #endif
296 
297   /*
298    * There is a slim chance that we're at the
299    * true begining of a line. Too slim.
300    * Scan forward discarding the trailing
301    * portion of the line we just fseek()ed
302    * to, and leave the stdio stream positioned
303    * for the subsequent line. Notice
304    * we keep off upated so that it reflects
305    * the seek position of the stdio stream.
306    */
307 
308   while ((c = getc(fp)) != EOF) {
309     *off += 1;
310 #ifdef SELF_TEST
311     n_align++;
312 #endif
313     if (c == '\n') break;
314   }
315   if (c == EOF) {
316     /* at end of file */
317     return EOF;
318   }
319 
320   return 0;
321 }
322 
323 
324 /*
325  * ndmbstf_compare()
326  *
327  * Given a key[] and a buf[], return whether or not they match.
328  * This effectively is strncmp (key, buf, strlen(key)).
329  * Because of the cost of the call to strlen(), we implement
330  * it ndmbstf_compare() here with the exact semantics.
331  *
332  * Return values are:
333  *      <0      No match, key[] less than buf[]
334  *      =0      Match, the key[] is a prefix for buf[]
335  *      >0      No match, key[] greater than buf[]
336  */
337 
ndmbstf_compare(char * key,char * buf)338 int ndmbstf_compare(char* key, char* buf)
339 {
340   char* p = key;
341   char* q = buf;
342 
343   while (*p != 0 && *p == *q) {
344     p++;
345     q++;
346   }
347 
348   if (*p == 0)
349     return 0; /* entire key matched */
350   else
351     return *p - *q;
352 }
353 
354 
355 /*
356  * ndmbstf_match()
357  *
358  * A simple wrapper around ndmbstf_compare() with an
359  * easy-to-use return value..
360  *
361  * Returns:
362  *      !0      match
363  *      =0      no match
364  */
365 
ndmbstf_match(char * key,char * buf)366 int ndmbstf_match(char* key, char* buf)
367 {
368   return ndmbstf_compare(key, buf) == 0;
369 }
370 
371 
372 #ifdef SELF_TEST
main(int ac,char * av[])373 int main(int ac, char* av[])
374 {
375   int i;
376   FILE* fp;
377   int verbose = 1;
378   int total_n_match = 0;
379   int total_n_no_match = 0;
380   int total_n_error = 0;
381 
382   i = 1;
383   if (i < ac && strcmp(av[i], "-q") == 0) {
384     i++;
385     verbose = 0;
386   }
387 
388   if (ac < i + 2) {
389     printf("usage: %s [-q] FILE KEY ...\n", av[0]);
390     return 1;
391   }
392 
393   fp = fopen(av[i], "r");
394   if (!fp) {
395     perror(av[i]);
396     return 2;
397   }
398 
399   for (i++; i < ac; i++) {
400     char buf[512];
401     char* key = av[i];
402     int n_match = 0;
403     int rc;
404 
405     rc = ndmbstf_first(fp, key, buf, sizeof buf);
406     if (rc < 0) {
407       printf("Key '%s' err=%d\n", key, rc);
408       total_n_error++;
409       continue;
410     }
411 
412     if (rc == 0) {
413       printf("Key '%s' no matches\n", key);
414       total_n_no_match++;
415       continue;
416     }
417 
418     if (verbose) printf("Key '%s'\n", key);
419     for (; rc > 0; rc = ndmbstf_next(fp, key, buf, sizeof buf)) {
420       n_match++;
421       if (verbose) printf("    %2d: '%s'\n", n_match, buf);
422     }
423     total_n_match += n_match;
424   }
425 
426   fclose(fp);
427 
428   printf("n_match=%d n_miss=%d n_error=%d\n", total_n_match, total_n_no_match,
429          total_n_error);
430   printf("n_seek=%d n_align=%d n_line=%d\n", n_seek, n_align, n_line);
431 
432   return 0;
433 }
434 #endif /* SELF_TEST */
435