/* * Copyright 2000 * Traakan, Inc., Los Altos, CA * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Project: NDMJOB * Ident: $Id: $ * * Description: * Binary Search Text File (BSTF) * * Use conventional binary search method on a sorted text * file. The file MUST be sorted in ascending, lexicographic * order. This is the default order of sort(1). */ #include "ndmlib.h" #ifdef SELF_TEST int n_seek, n_align, n_line, n_getc; #endif #define MIN_DELTA 2048 /* * ndmbstf_first() * ndmbstf_first_with_bounds() * * Returns: * <0 Error * -1 EOF * -2 Malformed line/file * -3 fseek() to determine file size failed * -4 fseek() to hop failed * =0 First line found in buf[], does not match key * >0 Length of first matching line in buf[] */ int ndmbstf_first(FILE* fp, /* the file to search */ char* key, /* what we're looking for */ char* buf, /* returned line */ unsigned max_buf) /* maximum lenght of buf (sizeof (buf)) */ { return ndmbstf_first_with_bounds(fp, key, buf, max_buf, 0, 0); } int ndmbstf_first_with_bounds( FILE* fp, /* the file to search */ char* key, /* what we're looking for */ char* buf, /* returned line */ unsigned max_buf, /* maximum lenght of buf (sizeof (buf)) */ off_t lower_bound, /* offset, to skip headers, usually 0 */ off_t upper_bound) /* 0->don't know, >0 limit */ { off_t off; off_t lower, upper; /* bounds */ off_t delta; int rc, buf_len; if (upper_bound == 0) { off_t end_off; /* * Determine the file size using fseek()/ftell() */ fseeko(fp, 0, SEEK_END); end_off = ftello(fp); if (end_off == -1) return -3; upper_bound = end_off; } /* * Set lower and upper bounds of the binary search */ lower = lower_bound; upper = upper_bound; for (;;) { /* * Determine the delta (distance) between the current * lower and upper bounds. If delta is small, it is more * efficient to do a linear search than to continue * seeking. This is because stdio will pre-read * portions no matter what and fseek()ing will discard * the pre-read portions. MIN_DELTA is the approximation * of the stdio pre-read size. Better to just * linearly process the pre-read portion in the * hopes that our answer is already sitting in the * stdio buffer. */ delta = upper - lower; if (delta <= MIN_DELTA) break; /* * Seek to the first line after the midpoint * between the lower and upper bounds. */ off = lower + delta / 2; rc = ndmbstf_seek_and_align(fp, &off); if (rc < 0) { if (rc == EOF) { /* * Alignment found a very long line without * a \n at the end of the file. All we * can do now is try a linear search * from the current lower bound. */ } return -4; /* fseek() for hop failed */ } /* * Read the next line up into buf[]. */ buf_len = ndmbstf_getline(fp, buf, max_buf); if (buf_len <= 0) { /* * EOF, or malformed line. All we can do now * is try a linear search from the current * lower bound. */ break; } /* * This is the crucial point. * * buf[] contains a line just read. * off points the the line we just read. * key[] is what we're looking for. * * Is the objective line (what we're trying to find) * somewhere between lower..off or between off..upper. */ rc = ndmbstf_compare(key, buf); if (rc > 0) { /* key>buf. Objective somewhere in off..upper */ lower = off; } else { /* * key<=buf. Objective somewhere in lower..off * Notice that if key==buf, there still might * be a line ==key before the one we just * read. There might be hundreds of such * lines. The objective is the FIRST line * greater than or equal to the key. * This might be it. It might not. * So, keep searching. */ upper = off; } } /* * Do an unbounded linear search begining at the * lower bound. */ off = lower; rc = ndmbstf_seek_and_align(fp, &off); if (rc < 0) { if (rc == EOF) { /* * Alignment found a very long line without * a \n at the end of the file. All we * can do is give up. */ return -2; } return -4; /* fseek() for hop failed */ } /* * Read the next line up into buf[]. */ for (;;) { buf_len = ndmbstf_getline(fp, buf, max_buf); if (buf_len <= 0) { if (buf_len == EOF) break; /* at end of file */ return -2; } rc = ndmbstf_compare(key, buf); if (rc == 0) { /* match */ return buf_len; } if (rc < 0) return 0; /* have line but it doesn't match */ } return EOF; } int ndmbstf_next(FILE* fp, /* the file to search */ char* key, /* what we're looking for */ char* buf, /* returned line */ unsigned max_buf) /* maximum lenght of buf (sizeof (buf)) */ { int rc, buf_len; /* * Read the next line up into buf[]. */ buf_len = ndmbstf_getline(fp, buf, max_buf); if (buf_len <= 0) { if (buf_len == EOF) return EOF; /* at end of file */ return -2; /* malformed line */ } rc = ndmbstf_compare(key, buf); if (rc == 0) { /* match */ return buf_len; } else { return 0; /* have line but it doesn't match */ } } /* * ndmbstr_getline() * * Returns: * <0 Error * -1 EOF * -2 Malformed line * >=0 Length of buf */ int ndmbstf_getline(FILE* fp, char* buf, unsigned max_buf) { char* p = buf; char* p_end = buf + max_buf - 2; int c; while ((c = getc(fp)) != EOF) { #ifdef SELF_TEST n_getc++; #endif if (c == '\n') break; if (p < p_end) *p++ = c; } *p = 0; if (c == EOF) { if (p > buf) return -2; /* malformed line */ return EOF; } #ifdef SELF_TEST n_line++; #endif return p - buf; } int ndmbstf_seek_and_align(FILE* fp, off_t* off) { int c; if (fseeko(fp, *off, SEEK_SET) == -1) { return -2; } #ifdef SELF_TEST n_seek++; #endif /* * There is a slim chance that we're at the * true begining of a line. Too slim. * Scan forward discarding the trailing * portion of the line we just fseek()ed * to, and leave the stdio stream positioned * for the subsequent line. Notice * we keep off upated so that it reflects * the seek position of the stdio stream. */ while ((c = getc(fp)) != EOF) { *off += 1; #ifdef SELF_TEST n_align++; #endif if (c == '\n') break; } if (c == EOF) { /* at end of file */ return EOF; } return 0; } /* * ndmbstf_compare() * * Given a key[] and a buf[], return whether or not they match. * This effectively is strncmp (key, buf, strlen(key)). * Because of the cost of the call to strlen(), we implement * it ndmbstf_compare() here with the exact semantics. * * Return values are: * <0 No match, key[] less than buf[] * =0 Match, the key[] is a prefix for buf[] * >0 No match, key[] greater than buf[] */ int ndmbstf_compare(char* key, char* buf) { char* p = key; char* q = buf; while (*p != 0 && *p == *q) { p++; q++; } if (*p == 0) return 0; /* entire key matched */ else return *p - *q; } /* * ndmbstf_match() * * A simple wrapper around ndmbstf_compare() with an * easy-to-use return value.. * * Returns: * !0 match * =0 no match */ int ndmbstf_match(char* key, char* buf) { return ndmbstf_compare(key, buf) == 0; } #ifdef SELF_TEST int main(int ac, char* av[]) { int i; FILE* fp; int verbose = 1; int total_n_match = 0; int total_n_no_match = 0; int total_n_error = 0; i = 1; if (i < ac && strcmp(av[i], "-q") == 0) { i++; verbose = 0; } if (ac < i + 2) { printf("usage: %s [-q] FILE KEY ...\n", av[0]); return 1; } fp = fopen(av[i], "r"); if (!fp) { perror(av[i]); return 2; } for (i++; i < ac; i++) { char buf[512]; char* key = av[i]; int n_match = 0; int rc; rc = ndmbstf_first(fp, key, buf, sizeof buf); if (rc < 0) { printf("Key '%s' err=%d\n", key, rc); total_n_error++; continue; } if (rc == 0) { printf("Key '%s' no matches\n", key); total_n_no_match++; continue; } if (verbose) printf("Key '%s'\n", key); for (; rc > 0; rc = ndmbstf_next(fp, key, buf, sizeof buf)) { n_match++; if (verbose) printf(" %2d: '%s'\n", n_match, buf); } total_n_match += n_match; } fclose(fp); printf("n_match=%d n_miss=%d n_error=%d\n", total_n_match, total_n_no_match, total_n_error); printf("n_seek=%d n_align=%d n_line=%d\n", n_seek, n_align, n_line); return 0; } #endif /* SELF_TEST */