1df930be7Sderaadt /* 2*d33b10b9Sderaadt * $OpenBSD: locate.code.c,v 1.11 2003/07/07 21:36:16 deraadt Exp $ 30f3d8120Smichaels * 4df930be7Sderaadt * Copyright (c) 1989, 1993 5df930be7Sderaadt * The Regents of the University of California. All rights reserved. 6df930be7Sderaadt * 7df930be7Sderaadt * This code is derived from software contributed to Berkeley by 8df930be7Sderaadt * James A. Woods. 9df930be7Sderaadt * 10df930be7Sderaadt * Redistribution and use in source and binary forms, with or without 11df930be7Sderaadt * modification, are permitted provided that the following conditions 12df930be7Sderaadt * are met: 13df930be7Sderaadt * 1. Redistributions of source code must retain the above copyright 14df930be7Sderaadt * notice, this list of conditions and the following disclaimer. 15df930be7Sderaadt * 2. Redistributions in binary form must reproduce the above copyright 16df930be7Sderaadt * notice, this list of conditions and the following disclaimer in the 17df930be7Sderaadt * documentation and/or other materials provided with the distribution. 18f75387cbSmillert * 3. Neither the name of the University nor the names of its contributors 19df930be7Sderaadt * may be used to endorse or promote products derived from this software 20df930be7Sderaadt * without specific prior written permission. 21df930be7Sderaadt * 22df930be7Sderaadt * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23df930be7Sderaadt * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24df930be7Sderaadt * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25df930be7Sderaadt * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26df930be7Sderaadt * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27df930be7Sderaadt * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28df930be7Sderaadt * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29df930be7Sderaadt * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30df930be7Sderaadt * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31df930be7Sderaadt * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32df930be7Sderaadt * SUCH DAMAGE. 335dc1102eSmichaels * 34*d33b10b9Sderaadt * $Id: locate.code.c,v 1.11 2003/07/07 21:36:16 deraadt Exp $ 35df930be7Sderaadt */ 36df930be7Sderaadt 37df930be7Sderaadt #ifndef lint 38df930be7Sderaadt static char copyright[] = 39df930be7Sderaadt "@(#) Copyright (c) 1989, 1993\n\ 40df930be7Sderaadt The Regents of the University of California. All rights reserved.\n"; 41df930be7Sderaadt #endif /* not lint */ 42df930be7Sderaadt 43df930be7Sderaadt #ifndef lint 44df930be7Sderaadt #if 0 45c41e3a31Smichaels static char sccsid[] = "@(#)locate.code.c 8.1 (Berkeley) 6/6/93"; 46c41e3a31Smichaels #else 47*d33b10b9Sderaadt static char rcsid[] = "$OpenBSD: locate.code.c,v 1.11 2003/07/07 21:36:16 deraadt Exp $"; 48df930be7Sderaadt #endif 49df930be7Sderaadt #endif /* not lint */ 50df930be7Sderaadt 51df930be7Sderaadt /* 52df930be7Sderaadt * PURPOSE: sorted list compressor (works with a modified 'find' 53df930be7Sderaadt * to encode/decode a filename database) 54df930be7Sderaadt * 55df930be7Sderaadt * USAGE: bigram < list > bigrams 56df930be7Sderaadt * process bigrams (see updatedb) > common_bigrams 57df930be7Sderaadt * code common_bigrams < list > squozen_list 58df930be7Sderaadt * 59df930be7Sderaadt * METHOD: Uses 'front compression' (see ";login:", Volume 8, Number 1 60df930be7Sderaadt * February/March 1983, p. 8). Output format is, per line, an 61df930be7Sderaadt * offset differential count byte followed by a partially bigram- 62df930be7Sderaadt * encoded ascii residue. A bigram is a two-character sequence, 63df930be7Sderaadt * the first 128 most common of which are encoded in one byte. 64df930be7Sderaadt * 65df930be7Sderaadt * EXAMPLE: For simple front compression with no bigram encoding, 66df930be7Sderaadt * if the input is... then the output is... 67df930be7Sderaadt * 68df930be7Sderaadt * /usr/src 0 /usr/src 69df930be7Sderaadt * /usr/src/cmd/aardvark.c 8 /cmd/aardvark.c 70df930be7Sderaadt * /usr/src/cmd/armadillo.c 14 armadillo.c 71df930be7Sderaadt * /usr/tmp/zoo 5 tmp/zoo 72df930be7Sderaadt * 73df930be7Sderaadt * The codes are: 74df930be7Sderaadt * 75df930be7Sderaadt * 0-28 likeliest differential counts + offset to make nonnegative 76df930be7Sderaadt * 30 switch code for out-of-range count to follow in next word 770f3d8120Smichaels * 31 an 8 bit char followed 78df930be7Sderaadt * 128-255 bigram codes (128 most common, as determined by 'updatedb') 79df930be7Sderaadt * 32-127 single character (printable) ascii residue (ie, literal) 80df930be7Sderaadt * 810f3d8120Smichaels * The locate database store any character except newline ('\n') 820f3d8120Smichaels * and NUL ('\0'). The 8-bit character support don't wast extra 830f3d8120Smichaels * space until you have characters in file names less than 32 840f3d8120Smichaels * or greather than 127. 850f3d8120Smichaels * 860f3d8120Smichaels * 870f3d8120Smichaels * SEE ALSO: updatedb.sh, ../bigram/locate.bigram.c 88df930be7Sderaadt * 89df930be7Sderaadt * AUTHOR: James A. Woods, Informatics General Corp., 90df930be7Sderaadt * NASA Ames Research Center, 10/82 910f3d8120Smichaels * 8-bit file names characters: 920f3d8120Smichaels * Wolfram Schneider, Berlin September 1996 93df930be7Sderaadt */ 94df930be7Sderaadt 95df930be7Sderaadt #include <sys/param.h> 96df930be7Sderaadt #include <err.h> 97df930be7Sderaadt #include <errno.h> 98df930be7Sderaadt #include <stdlib.h> 99df930be7Sderaadt #include <string.h> 100c41e3a31Smichaels #include <stdio.h> 101df930be7Sderaadt #include "locate.h" 102df930be7Sderaadt 103df930be7Sderaadt #define BGBUFSIZE (NBG * 2) /* size of bigram buffer */ 104df930be7Sderaadt 105c41e3a31Smichaels u_char buf1[MAXPATHLEN] = " "; 106c41e3a31Smichaels u_char buf2[MAXPATHLEN]; 1070f3d8120Smichaels u_char bigrams[BGBUFSIZE + 1] = { 0 }; 108df930be7Sderaadt 1095dc1102eSmichaels #define LOOKUP 1 /* use a lookup array instead a function, 3x faster */ 1105dc1102eSmichaels 111c41e3a31Smichaels #ifdef LOOKUP 1120f3d8120Smichaels #define BGINDEX(x) (big[(u_char)*x][(u_char)*(x + 1)]) 1130f3d8120Smichaels typedef short bg_t; 1140f3d8120Smichaels bg_t big[UCHAR_MAX + 1][UCHAR_MAX + 1]; 115c41e3a31Smichaels #else 116c41e3a31Smichaels #define BGINDEX(x) bgindex(x) 117c41e3a31Smichaels typedef int bg_t; 118c72b5b24Smillert int bgindex(char *); 1195dc1102eSmichaels #endif /* LOOKUP */ 1205dc1102eSmichaels 1215dc1102eSmichaels 122c72b5b24Smillert void usage(void); 123c41e3a31Smichaels extern int optind; 124c41e3a31Smichaels extern int optopt; 125df930be7Sderaadt 126df930be7Sderaadt int 127*d33b10b9Sderaadt main(int argc, char *argv[]) 128df930be7Sderaadt { 129c0932ef1Smpech u_char *cp, *oldpath, *path; 130df930be7Sderaadt int ch, code, count, diffcount, oldcount; 131df930be7Sderaadt FILE *fp; 132c0932ef1Smpech int i, j; 133df930be7Sderaadt 13472799b18Smillert while ((ch = getopt(argc, argv, "")) != -1) 135df930be7Sderaadt switch(ch) { 136df930be7Sderaadt default: 137df930be7Sderaadt usage(); 138df930be7Sderaadt } 139df930be7Sderaadt argc -= optind; 140df930be7Sderaadt argv += optind; 141df930be7Sderaadt 142df930be7Sderaadt if (argc != 1) 143df930be7Sderaadt usage(); 144df930be7Sderaadt 145df930be7Sderaadt if ((fp = fopen(argv[0], "r")) == NULL) 146df930be7Sderaadt err(1, "%s", argv[0]); 147df930be7Sderaadt 148df930be7Sderaadt /* First copy bigram array to stdout. */ 149df930be7Sderaadt (void)fgets(bigrams, BGBUFSIZE + 1, fp); 1505dc1102eSmichaels 151df930be7Sderaadt if (fwrite(bigrams, 1, BGBUFSIZE, stdout) != BGBUFSIZE) 152df930be7Sderaadt err(1, "stdout"); 153df930be7Sderaadt (void)fclose(fp); 154df930be7Sderaadt 155c41e3a31Smichaels #ifdef LOOKUP 156c41e3a31Smichaels /* init lookup table */ 1570f3d8120Smichaels for (i = 0; i < UCHAR_MAX + 1; i++) 1580f3d8120Smichaels for (j = 0; j < UCHAR_MAX + 1; j++) 159c41e3a31Smichaels big[i][j] = (bg_t)-1; 160c41e3a31Smichaels 1615dc1102eSmichaels for (cp = bigrams, i = 0; *cp != '\0'; i += 2, cp += 2) 1620f3d8120Smichaels big[(u_char)*cp][(u_char)*(cp + 1)] = (bg_t)i; 1630f3d8120Smichaels 1645dc1102eSmichaels #endif /* LOOKUP */ 165c41e3a31Smichaels 166df930be7Sderaadt oldpath = buf1; 167df930be7Sderaadt path = buf2; 168df930be7Sderaadt oldcount = 0; 1695dc1102eSmichaels 170df930be7Sderaadt while (fgets(path, sizeof(buf2), stdin) != NULL) { 171c41e3a31Smichaels 172c41e3a31Smichaels /* skip empty lines */ 173c41e3a31Smichaels if (*path == '\n') 174c41e3a31Smichaels continue; 175df930be7Sderaadt 1760f3d8120Smichaels /* remove newline */ 1775dc1102eSmichaels for (cp = path; *cp != '\0'; cp++) { 178c41e3a31Smichaels /* chop newline */ 179c41e3a31Smichaels if (*cp == '\n') 1805dc1102eSmichaels *cp = '\0'; 181df930be7Sderaadt } 182df930be7Sderaadt 183df930be7Sderaadt /* Skip longest common prefix. */ 1840f3d8120Smichaels for (cp = path; *cp == *oldpath; cp++, oldpath++) 1850f3d8120Smichaels if (*cp == '\0') 1860f3d8120Smichaels break; 187c41e3a31Smichaels 188df930be7Sderaadt count = cp - path; 189df930be7Sderaadt diffcount = count - oldcount + OFFSET; 190df930be7Sderaadt oldcount = count; 191df930be7Sderaadt if (diffcount < 0 || diffcount > 2 * OFFSET) { 192df930be7Sderaadt if (putchar(SWITCH) == EOF || 193df930be7Sderaadt putw(diffcount, stdout) == EOF) 194df930be7Sderaadt err(1, "stdout"); 195df930be7Sderaadt } else 196df930be7Sderaadt if (putchar(diffcount) == EOF) 197df930be7Sderaadt err(1, "stdout"); 198df930be7Sderaadt 1995dc1102eSmichaels while (*cp != '\0') { 2000f3d8120Smichaels /* print *two* characters */ 2010f3d8120Smichaels 2020f3d8120Smichaels if ((code = BGINDEX(cp)) != (bg_t)-1) { 2030f3d8120Smichaels /* 2040f3d8120Smichaels * print *one* as bigram 2050f3d8120Smichaels * Found, so mark byte with 2060f3d8120Smichaels * parity bit. 2070f3d8120Smichaels */ 208df930be7Sderaadt if (putchar((code / 2) | PARITY) == EOF) 209df930be7Sderaadt err(1, "stdout"); 210df930be7Sderaadt cp += 2; 211df930be7Sderaadt } 2120f3d8120Smichaels 2130f3d8120Smichaels else { 2140f3d8120Smichaels for (i = 0; i < 2; i++) { 2150f3d8120Smichaels if (*cp == '\0') 2160f3d8120Smichaels break; 2170f3d8120Smichaels 2180f3d8120Smichaels /* print umlauts in file names */ 2190f3d8120Smichaels if (*cp < ASCII_MIN || 2200f3d8120Smichaels *cp > ASCII_MAX) { 2210f3d8120Smichaels if (putchar(UMLAUT) == EOF || 2220f3d8120Smichaels putchar(*cp++) == EOF) 2230f3d8120Smichaels err(1, "stdout"); 224df930be7Sderaadt } 2250f3d8120Smichaels 2260f3d8120Smichaels else { 2270f3d8120Smichaels /* normal character */ 2280f3d8120Smichaels if(putchar(*cp++) == EOF) 2290f3d8120Smichaels err(1, "stdout"); 2300f3d8120Smichaels } 2310f3d8120Smichaels } 2320f3d8120Smichaels 2330f3d8120Smichaels } 2340f3d8120Smichaels } 2350f3d8120Smichaels 236df930be7Sderaadt if (path == buf1) { /* swap pointers */ 237df930be7Sderaadt path = buf2; 238df930be7Sderaadt oldpath = buf1; 239df930be7Sderaadt } else { 240df930be7Sderaadt path = buf1; 241df930be7Sderaadt oldpath = buf2; 242df930be7Sderaadt } 243df930be7Sderaadt } 244df930be7Sderaadt /* Non-zero status if there were errors */ 245df930be7Sderaadt if (fflush(stdout) != 0 || ferror(stdout)) 246df930be7Sderaadt exit(1); 2475dc1102eSmichaels exit(0); 248df930be7Sderaadt } 249df930be7Sderaadt 250c41e3a31Smichaels #ifndef LOOKUP 251df930be7Sderaadt int 252*d33b10b9Sderaadt bgindex(char *bg) /* Return location of bg in bigrams or -1. */ 253df930be7Sderaadt { 254c0932ef1Smpech char bg0, bg1, *p; 255df930be7Sderaadt 256df930be7Sderaadt bg0 = bg[0]; 257df930be7Sderaadt bg1 = bg[1]; 2585dc1102eSmichaels for (p = bigrams; *p != NULL; p++) 259df930be7Sderaadt if (*p++ == bg0 && *p == bg1) 260df930be7Sderaadt break; 2610f3d8120Smichaels return (*p == NULL ? -1 : (--p - bigrams)); 262df930be7Sderaadt } 263c41e3a31Smichaels #endif /* !LOOKUP */ 264df930be7Sderaadt 265df930be7Sderaadt void 266*d33b10b9Sderaadt usage(void) 267df930be7Sderaadt { 268df930be7Sderaadt (void)fprintf(stderr, 269df930be7Sderaadt "usage: locate.code common_bigrams < list > squozen_list\n"); 270df930be7Sderaadt exit(1); 271df930be7Sderaadt } 272