1 /* 2 * Copyright (c) 1989 The Regents of the University of California. 3 * All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * James A. Woods. 7 * 8 * %sccs.include.redist.c% 9 */ 10 11 #ifndef lint 12 char copyright[] = 13 "@(#) Copyright (c) 1989 The Regents of the University of California.\n\ 14 All rights reserved.\n"; 15 #endif /* not lint */ 16 17 #ifndef lint 18 static char sccsid[] = "@(#)locate.code.c 4.8 (Berkeley) 06/01/90"; 19 #endif /* not lint */ 20 21 /* 22 * PURPOSE: sorted list compressor (works with a modified 'find' 23 * to encode/decode a filename database) 24 * 25 * USAGE: bigram < list > bigrams 26 * process bigrams (see updatedb) > common_bigrams 27 * code common_bigrams < list > squozen_list 28 * 29 * METHOD: Uses 'front compression' (see ";login:", Volume 8, Number 1 30 * February/March 1983, p. 8 ). Output format is, per line, an 31 * offset differential count byte followed by a partially bigram- 32 * encoded ascii residue. A bigram is a two-character sequence, 33 * the first 128 most common of which are encoded in one byte. 34 * 35 * EXAMPLE: For simple front compression with no bigram encoding, 36 * if the input is... then the output is... 37 * 38 * /usr/src 0 /usr/src 39 * /usr/src/cmd/aardvark.c 8 /cmd/aardvark.c 40 * /usr/src/cmd/armadillo.c 14 armadillo.c 41 * /usr/tmp/zoo 5 tmp/zoo 42 * 43 * The codes are: 44 * 45 * 0-28 likeliest differential counts + offset to make nonnegative 46 * 30 switch code for out-of-range count to follow in next word 47 * 128-255 bigram codes (128 most common, as determined by 'updatedb') 48 * 32-127 single character (printable) ascii residue (ie, literal) 49 * 50 * SEE ALSO: updatedb.csh, bigram.c, find.c 51 * 52 * AUTHOR: James A. Woods, Informatics General Corp., 53 * NASA Ames Research Center, 10/82 54 */ 55 56 #include <sys/param.h> 57 #include <stdio.h> 58 #include "locate.h" 59 60 #define BGBUFSIZE (NBG * 2) /* size of bigram buffer */ 61 62 char buf1[MAXPATHLEN] = " "; 63 char buf2[MAXPATHLEN]; 64 char bigrams[BGBUFSIZE + 1] = { 0 }; 65 66 main ( argc, argv ) 67 int argc; char *argv[]; 68 { 69 register char *cp, *oldpath = buf1, *path = buf2; 70 int code, count, diffcount, oldcount = 0; 71 FILE *fp; 72 73 if ((fp = fopen(argv[1], "r")) == NULL) { 74 printf("Usage: code common_bigrams < list > squozen_list\n"); 75 exit(1); 76 } 77 /* first copy bigram array to stdout */ 78 fgets ( bigrams, BGBUFSIZE + 1, fp ); 79 fwrite ( bigrams, 1, BGBUFSIZE, stdout ); 80 fclose( fp ); 81 82 while ( fgets ( path, sizeof(buf2), stdin ) != NULL ) { 83 /* truncate newline */ 84 cp = path + strlen(path) - 1; 85 if (cp > path && *cp == '\n') 86 *cp = '\0'; 87 /* squelch characters that would botch the decoding */ 88 for ( cp = path; *cp != NULL; cp++ ) { 89 if ( (unsigned char)*cp >= PARITY ) 90 *cp &= PARITY-1; 91 else if ( *cp <= SWITCH ) 92 *cp = '?'; 93 } 94 /* skip longest common prefix */ 95 for ( cp = path; *cp == *oldpath; cp++, oldpath++ ) 96 if ( *oldpath == NULL ) 97 break; 98 count = cp - path; 99 diffcount = count - oldcount + OFFSET; 100 oldcount = count; 101 if ( diffcount < 0 || diffcount > 2*OFFSET ) { 102 putc ( SWITCH, stdout ); 103 putw ( diffcount, stdout ); 104 } 105 else 106 putc ( diffcount, stdout ); 107 108 while ( *cp != NULL ) { 109 if ( *(cp + 1) == NULL ) { 110 putchar ( *cp ); 111 break; 112 } 113 if ( (code = bgindex ( cp )) < 0 ) { 114 putchar ( *cp++ ); 115 putchar ( *cp++ ); 116 } 117 else { /* found, so mark byte with parity bit */ 118 putchar ( (code / 2) | PARITY ); 119 cp += 2; 120 } 121 } 122 if ( path == buf1 ) /* swap pointers */ 123 path = buf2, oldpath = buf1; 124 else 125 path = buf1, oldpath = buf2; 126 } 127 } 128 129 bgindex ( bg ) /* return location of bg in bigrams or -1 */ 130 char *bg; 131 { 132 register char *p; 133 register char bg0 = bg[0], bg1 = bg[1]; 134 135 for ( p = bigrams; *p != NULL; p++ ) 136 if ( *p++ == bg0 && *p == bg1 ) 137 break; 138 return ( *p == NULL ? -1 : --p - bigrams ); 139 } 140