1 /* 2 * Copyright (c) 1989 The Regents of the University of California. 3 * All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * James A. Woods. 7 * 8 * %sccs.include.redist.c% 9 */ 10 11 #ifndef lint 12 char copyright[] = 13 "@(#) Copyright (c) 1989 The Regents of the University of California.\n\ 14 All rights reserved.\n"; 15 #endif /* not lint */ 16 17 #ifndef lint 18 static char sccsid[] = "@(#)locate.code.c 5.1 (Berkeley) 02/06/92"; 19 #endif /* not lint */ 20 21 /* 22 * PURPOSE: sorted list compressor (works with a modified 'find' 23 * to encode/decode a filename database) 24 * 25 * USAGE: bigram < list > bigrams 26 * process bigrams (see updatedb) > common_bigrams 27 * code common_bigrams < list > squozen_list 28 * 29 * METHOD: Uses 'front compression' (see ";login:", Volume 8, Number 1 30 * February/March 1983, p. 8). Output format is, per line, an 31 * offset differential count byte followed by a partially bigram- 32 * encoded ascii residue. A bigram is a two-character sequence, 33 * the first 128 most common of which are encoded in one byte. 34 * 35 * EXAMPLE: For simple front compression with no bigram encoding, 36 * if the input is... then the output is... 37 * 38 * /usr/src 0 /usr/src 39 * /usr/src/cmd/aardvark.c 8 /cmd/aardvark.c 40 * /usr/src/cmd/armadillo.c 14 armadillo.c 41 * /usr/tmp/zoo 5 tmp/zoo 42 * 43 * The codes are: 44 * 45 * 0-28 likeliest differential counts + offset to make nonnegative 46 * 30 switch code for out-of-range count to follow in next word 47 * 128-255 bigram codes (128 most common, as determined by 'updatedb') 48 * 32-127 single character (printable) ascii residue (ie, literal) 49 * 50 * SEE ALSO: updatedb.csh, bigram.c 51 * 52 * AUTHOR: James A. Woods, Informatics General Corp., 53 * NASA Ames Research Center, 10/82 54 */ 55 56 #include <sys/param.h> 57 #include <errno.h> 58 #include <stdlib.h> 59 #include <string.h> 60 #include <stdio.h> 61 #include "locate.h" 62 63 #define BGBUFSIZE (NBG * 2) /* size of bigram buffer */ 64 #define OERR err("stdout: %s", strerror(errno)) 65 66 char buf1[MAXPATHLEN] = " "; 67 char buf2[MAXPATHLEN]; 68 char bigrams[BGBUFSIZE + 1] = { 0 }; 69 70 int bgindex __P((char *)); 71 void err __P((const char *, ...)); 72 void usage __P((void)); 73 74 int 75 main(argc, argv) 76 int argc; 77 char *argv[]; 78 { 79 register char *cp, *oldpath, *path; 80 int ch, code, count, diffcount, oldcount; 81 FILE *fp; 82 83 while ((ch = getopt(argc, argv, "")) != EOF) 84 switch(ch) { 85 case '?': 86 default: 87 usage(); 88 } 89 argc -= optind; 90 argv += optind; 91 92 if (argc != 1) 93 usage(); 94 95 if ((fp = fopen(argv[1], "r")) == NULL) 96 err("%s: %s", argv[1], strerror(errno)); 97 98 /* First copy bigram array to stdout. */ 99 (void)fgets(bigrams, BGBUFSIZE + 1, fp); 100 if (fwrite(bigrams, 1, BGBUFSIZE, stdout) != BGBUFSIZE) 101 err("stdout: %s", strerror(errno)); 102 (void)fclose(fp); 103 104 oldpath = buf1; 105 path = buf2; 106 oldcount = 0; 107 while (fgets(path, sizeof(buf2), stdin) != NULL) { 108 /* Truncate newline. */ 109 cp = path + strlen(path) - 1; 110 if (cp > path && *cp == '\n') 111 *cp = '\0'; 112 113 /* Squelch characters that would botch the decoding. */ 114 for (cp = path; *cp != NULL; cp++) { 115 if ((u_char)*cp >= PARITY) 116 *cp &= PARITY-1; 117 else if (*cp <= SWITCH) 118 *cp = '?'; 119 } 120 121 /* Skip longest common prefix. */ 122 for (cp = path; *cp == *oldpath; cp++, oldpath++) 123 if (*oldpath == NULL) 124 break; 125 count = cp - path; 126 diffcount = count - oldcount + OFFSET; 127 oldcount = count; 128 if (diffcount < 0 || diffcount > 2 * OFFSET) 129 if (putchar(SWITCH) == EOF || 130 putw(diffcount, stdout) == EOF) 131 OERR; 132 else 133 if (putchar(diffcount) == EOF) 134 OERR; 135 136 while (*cp != NULL) { 137 if (*(cp + 1) == NULL) { 138 if (putchar(*cp) == EOF) 139 OERR; 140 break; 141 } 142 if ((code = bgindex(cp)) < 0) { 143 if (putchar(*cp++) == EOF || 144 putchar(*cp++) == EOF) 145 OERR; 146 } else { 147 /* Found, so mark byte with parity bit. */ 148 if (putchar((code / 2) | PARITY) == EOF) 149 OERR; 150 cp += 2; 151 } 152 } 153 if (path == buf1) { /* swap pointers */ 154 path = buf2; 155 oldpath = buf1; 156 } else { 157 path = buf1; 158 oldpath = buf2; 159 } 160 } 161 exit(0); 162 } 163 164 int 165 bgindex(bg) /* Return location of bg in bigrams or -1. */ 166 char *bg; 167 { 168 register char bg0, bg1, *p; 169 170 bg0 = bg[0]; 171 bg1 = bg[1]; 172 for (p = bigrams; *p != NULL; p++) 173 if (*p++ == bg0 && *p == bg1) 174 break; 175 return (*p == NULL ? -1 : --p - bigrams); 176 } 177 178 void 179 usage() 180 { 181 (void)fprintf(stderr, 182 "usage: locate.code common_bigrams < list > squozen_list\n"); 183 exit(1); 184 } 185 186 #if __STDC__ 187 #include <stdarg.h> 188 #else 189 #include <varargs.h> 190 #endif 191 192 void 193 #if __STDC__ 194 err(const char *fmt, ...) 195 #else 196 err(fmt, va_alist) 197 char *fmt; 198 va_dcl 199 #endif 200 { 201 va_list ap; 202 #if __STDC__ 203 va_start(ap, fmt); 204 #else 205 va_start(ap); 206 #endif 207 (void)fprintf(stderr, "locate.code: "); 208 (void)vfprintf(stderr, fmt, ap); 209 va_end(ap); 210 (void)fprintf(stderr, "\n"); 211 exit(1); 212 /* NOTREACHED */ 213 } 214