1 /* 2 * Copyright (c) 1989 The Regents of the University of California. 3 * All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * James A. Woods. 7 * 8 * Redistribution and use in source and binary forms are permitted 9 * provided that the above copyright notice and this paragraph are 10 * duplicated in all such forms and that any documentation, 11 * advertising materials, and other materials related to such 12 * distribution and use acknowledge that the software was developed 13 * by the University of California, Berkeley. The name of the 14 * University may not be used to endorse or promote products derived 15 * from this software without specific prior written permission. 16 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR 17 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED 18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 19 */ 20 21 #ifndef lint 22 char copyright[] = 23 "@(#) Copyright (c) 1989 The Regents of the University of California.\n\ 24 All rights reserved.\n"; 25 #endif /* not lint */ 26 27 #ifndef lint 28 static char sccsid[] = "@(#)locate.code.c 4.6 (Berkeley) 10/13/89"; 29 #endif /* not lint */ 30 31 /* 32 * PURPOSE: sorted list compressor (works with a modified 'find' 33 * to encode/decode a filename database) 34 * 35 * USAGE: bigram < list > bigrams 36 * process bigrams (see updatedb) > common_bigrams 37 * code common_bigrams < list > squozen_list 38 * 39 * METHOD: Uses 'front compression' (see ";login:", Volume 8, Number 1 40 * February/March 1983, p. 8 ). Output format is, per line, an 41 * offset differential count byte followed by a partially bigram- 42 * encoded ascii residue. A bigram is a two-character sequence, 43 * the first 128 most common of which are encoded in one byte. 44 * 45 * EXAMPLE: For simple front compression with no bigram encoding, 46 * if the input is... then the output is... 47 * 48 * /usr/src 0 /usr/src 49 * /usr/src/cmd/aardvark.c 8 /cmd/aardvark.c 50 * /usr/src/cmd/armadillo.c 14 armadillo.c 51 * /usr/tmp/zoo 5 tmp/zoo 52 * 53 * The codes are: 54 * 55 * 0-28 likeliest differential counts + offset to make nonnegative 56 * 30 switch code for out-of-range count to follow in next word 57 * 128-255 bigram codes (128 most common, as determined by 'updatedb') 58 * 32-127 single character (printable) ascii residue (ie, literal) 59 * 60 * SEE ALSO: updatedb.csh, bigram.c, find.c 61 * 62 * AUTHOR: James A. Woods, Informatics General Corp., 63 * NASA Ames Research Center, 10/82 64 */ 65 66 #include <stdio.h> 67 #include <sys/param.h> 68 #include "find.h" 69 70 #define BGBUFSIZE (NBG * 2) /* size of bigram buffer */ 71 72 char buf1[MAXPATHLEN] = " "; 73 char buf2[MAXPATHLEN]; 74 char bigrams[BGBUFSIZE + 1] = { 0 }; 75 76 main ( argc, argv ) 77 int argc; char *argv[]; 78 { 79 register char *cp, *oldpath = buf1, *path = buf2; 80 int code, count, diffcount, oldcount = 0; 81 FILE *fp; 82 83 if ((fp = fopen(argv[1], "r")) == NULL) { 84 printf("Usage: code common_bigrams < list > squozen_list\n"); 85 exit(1); 86 } 87 /* first copy bigram array to stdout */ 88 fgets ( bigrams, BGBUFSIZE + 1, fp ); 89 fwrite ( bigrams, 1, BGBUFSIZE, stdout ); 90 fclose( fp ); 91 92 while ( fgets ( path, sizeof(buf2), stdin ) != NULL ) { 93 /* truncate newline */ 94 cp = path + strlen(path) - 1; 95 if (cp > path && *cp == '\n') 96 *cp = '\0'; 97 /* squelch characters that would botch the decoding */ 98 for ( cp = path; *cp != NULL; cp++ ) { 99 if ( (unsigned char)*cp >= PARITY ) 100 *cp &= PARITY-1; 101 else if ( *cp <= SWITCH ) 102 *cp = '?'; 103 } 104 /* skip longest common prefix */ 105 for ( cp = path; *cp == *oldpath; cp++, oldpath++ ) 106 if ( *oldpath == NULL ) 107 break; 108 count = cp - path; 109 diffcount = count - oldcount + OFFSET; 110 oldcount = count; 111 if ( diffcount < 0 || diffcount > 2*OFFSET ) { 112 putc ( SWITCH, stdout ); 113 putw ( diffcount, stdout ); 114 } 115 else 116 putc ( diffcount, stdout ); 117 118 while ( *cp != NULL ) { 119 if ( *(cp + 1) == NULL ) { 120 putchar ( *cp ); 121 break; 122 } 123 if ( (code = bgindex ( cp )) < 0 ) { 124 putchar ( *cp++ ); 125 putchar ( *cp++ ); 126 } 127 else { /* found, so mark byte with parity bit */ 128 putchar ( (code / 2) | PARITY ); 129 cp += 2; 130 } 131 } 132 if ( path == buf1 ) /* swap pointers */ 133 path = buf2, oldpath = buf1; 134 else 135 path = buf1, oldpath = buf2; 136 } 137 } 138 139 bgindex ( bg ) /* return location of bg in bigrams or -1 */ 140 char *bg; 141 { 142 register char *p; 143 register char bg0 = bg[0], bg1 = bg[1]; 144 145 for ( p = bigrams; *p != NULL; p++ ) 146 if ( *p++ == bg0 && *p == bg1 ) 147 break; 148 return ( *p == NULL ? -1 : --p - bigrams ); 149 } 150