1 /*-
2  * Copyright (c) 1989 The Regents of the University of California.
3  * All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * Ken Arnold.
7  *
8  * %sccs.include.redist.c%
9  */
10 
11 #ifndef lint
12 char copyright[] =
13 "@(#) Copyright (c) 1989 The Regents of the University of California.\n\
14  All rights reserved.\n";
15 #endif /* not lint */
16 
17 #ifndef lint
18 static char sccsid[] = "@(#)strfile.c	5.12 (Berkeley) 04/08/91";
19 #endif /* not lint */
20 
# include	<machine/endian.h>
# include	<sys/param.h>
# include	<stdio.h>
# include	<ctype.h>
# include	<stdlib.h>
# include	<string.h>
# include	"strfile.h"
26 
27 # ifndef MAXPATHLEN
28 # define	MAXPATHLEN	1024
29 # endif	/* MAXPATHLEN */
30 
31 /*
 *	This program takes a file composed of strings separated by
 * lines consisting of a single delimiting character (default
 * character is '%') and creates another file which consists of a table
 * describing the file (structure from "strfile.h") followed by a table
 * of seek pointers to the start of each string in the source file.
 * Usage:
38  *
39  *	% strfile [-iorsx] [ -cC ] sourcefile [ datafile ]
40  *
41  *	c - Change delimiting character from '%' to 'C'
42  *	s - Silent.  Give no summary of data processed at the end of
43  *	    the run.
44  *	o - order the strings in alphabetic order
45  *	i - if ordering, ignore case
46  *	r - randomize the order of the strings
47  *	x - set rotated bit
48  *
49  *		Ken Arnold	Sept. 7, 1978 --
50  *
51  *	Added ordering options.
52  */
53 
/* Boolean values used for the option flags. */
# define	TRUE	1
# define	FALSE	0

/* True when the seek pointers are kept in memory (ordering or randomizing). */
# define	STORING_PTRS	(Oflag || Rflag)
/* Number of elements by which ALLOC grows an array. */
# define	CHUNKSIZE	512

/*
 * ALWAYS is a constant-true condition; under lint it is spelled as a
 * function call instead of a literal.
 */
#ifdef lint
# define	ALWAYS	atoi("1")
#else
# define	ALWAYS	1
#endif
/*
 * ALLOC(ptr, sz):
 *	Make room in the array "ptr".  A NULL "ptr" gets a fresh block of
 *	CHUNKSIZE elements; otherwise the array is grown to sz + CHUNKSIZE
 *	elements, but only on a call where sz + 1 is a multiple of
 *	CHUNKSIZE.  Prints "out of space" and exits if the allocation
 *	fails.  The expansion ends in a dangling "else" so that the
 *	semicolon written after the macro call completes the statement.
 */
# define	ALLOC(ptr,sz)	if (ALWAYS) { \
			if (ptr == NULL) \
				ptr = malloc((unsigned int) (CHUNKSIZE * sizeof *ptr)); \
			else if (((sz) + 1) % CHUNKSIZE == 0) \
				ptr = realloc((void *) ptr, ((unsigned int) ((sz) + CHUNKSIZE) * sizeof *ptr)); \
			if (ptr == NULL) { \
				fprintf(stderr, "out of space\n"); \
				exit(1); \
			} \
		} else

/* For compilers without "void": substitute "char" (so "void *" becomes "char *"). */
#ifdef NO_VOID
# define	void	char
#endif
79 
/*
 * One entry per string, used only when ordering: the sort key character
 * and the string's offset in the input file.
 */
typedef struct {
	char	first;		/* first alphanumeric char (lower-cased if Iflag) */
	off_t	pos;		/* offset of the string in the input file */
} STR;
84 
/* File names and delimiter, filled in by getargs(). */
char	*Infile		= NULL,		/* input file name */
	Outfile[MAXPATHLEN] = "",	/* output file name */
	Delimch		= '%';		/* delimiting character */

/* Option flags, set by getargs(). */
int	Sflag		= FALSE;	/* silent run flag */
int	Oflag		= FALSE;	/* ordering flag */
int	Iflag		= FALSE;	/* ignore case flag */
int	Rflag		= FALSE;	/* randomize order flag */
int	Xflag		= FALSE;	/* set rotated bit */
long	Num_pts		= 0;		/* number of pointers/strings */

/* In-memory table of string offsets; filled by add_offset() when STORING_PTRS. */
off_t	*Seekpts;

/* Two independent read streams on the input file, used by cmp_str(). */
FILE	*Sort_1, *Sort_2;		/* pointers for sorting */

STRFILE	Tbl;				/* statistics table */

STR	*Firstch;			/* first chars of each string */

/* Old-style declarations of the library routines used below. */
char	*fgets(), *strcpy(), *strcat();

void	*malloc(), *realloc();
107 
108 /*
109  * main:
110  *	Drive the sucker.  There are two main modes -- either we store
111  *	the seek pointers, if the table is to be sorted or randomized,
112  *	or we write the pointer directly to the file, if we are to stay
113  *	in file order.  If the former, we allocate and re-allocate in
114  *	CHUNKSIZE blocks; if the latter, we just write each pointer,
115  *	and then seek back to the beginning to write in the table.
116  */
117 main(ac, av)
118 int	ac;
119 char	**av;
120 {
121 	register char		*sp, dc;
122 	register FILE		*inf, *outf;
123 	register off_t		last_off, length, pos, *p;
124 	register int		first, cnt;
125 	register char		*nsp;
126 	register STR		*fp;
127 	static char		string[257];
128 
129 	getargs(ac, av);		/* evalute arguments */
130 	dc = Delimch;
131 	if ((inf = fopen(Infile, "r")) == NULL) {
132 		perror(Infile);
133 		exit(1);
134 	}
135 
136 	if ((outf = fopen(Outfile, "w")) == NULL) {
137 		perror(Outfile);
138 		exit(1);
139 	}
140 	if (!STORING_PTRS)
141 		(void) fseek(outf, sizeof Tbl, 0);
142 
143 	/*
144 	 * Write the strings onto the file
145 	 */
146 
147 	Tbl.str_longlen = 0;
148 	Tbl.str_shortlen = (unsigned int) 0xffffffff;
149 	Tbl.str_delim = dc;
150 	Tbl.str_version = VERSION;
151 	first = Oflag;
152 	add_offset(outf, ftell(inf));
153 	last_off = 0;
154 	do {
155 		sp = fgets(string, 256, inf);
156 		if (sp == NULL || sp[0] == dc && sp[1] == '\n') {
157 			pos = ftell(inf);
158 			length = pos - last_off - (sp ? strlen(sp) : 0);
159 			last_off = pos;
160 			if (!length)
161 				continue;
162 			add_offset(outf, pos);
163 			if (Tbl.str_longlen < length)
164 				Tbl.str_longlen = length;
165 			if (Tbl.str_shortlen > length)
166 				Tbl.str_shortlen = length;
167 			first = Oflag;
168 		}
169 		else if (first) {
170 			for (nsp = sp; !isalnum(*nsp); nsp++)
171 				continue;
172 			ALLOC(Firstch, Num_pts);
173 			fp = &Firstch[Num_pts - 1];
174 			if (Iflag && isupper(*nsp))
175 				fp->first = tolower(*nsp);
176 			else
177 				fp->first = *nsp;
178 			fp->pos = Seekpts[Num_pts - 1];
179 			first = FALSE;
180 		}
181 	} while (sp != NULL);
182 
183 	/*
184 	 * write the tables in
185 	 */
186 
187 	(void) fclose(inf);
188 
189 	if (Oflag)
190 		do_order();
191 	else if (Rflag)
192 		randomize();
193 
194 	if (Xflag)
195 		Tbl.str_flags |= STR_ROTATED;
196 
197 	if (!Sflag) {
198 		printf("\"%s\" created\n", Outfile);
199 		if (Num_pts == 2)
200 			puts("There was 1 string");
201 		else
202 			printf("There were %d strings\n", Num_pts - 1);
203 		printf("Longest string: %lu byte%s\n", Tbl.str_longlen,
204 		       Tbl.str_longlen == 1 ? "" : "s");
205 		printf("Shortest string: %lu byte%s\n", Tbl.str_shortlen,
206 		       Tbl.str_shortlen == 1 ? "" : "s");
207 	}
208 
209 	(void) fseek(outf, (off_t) 0, 0);
210 	Tbl.str_version = htonl(Tbl.str_version);
211 	Tbl.str_numstr = htonl(Num_pts - 1);
212 	Tbl.str_longlen = htonl(Tbl.str_longlen);
213 	Tbl.str_shortlen = htonl(Tbl.str_shortlen);
214 	Tbl.str_flags = htonl(Tbl.str_flags);
215 	(void) fwrite((char *) &Tbl, sizeof Tbl, 1, outf);
216 	if (STORING_PTRS) {
217 		for (p = Seekpts, cnt = Num_pts; cnt--; ++p)
218 			*p = htonl(*p);
219 		(void) fwrite((char *) Seekpts, sizeof *Seekpts, (int) Num_pts, outf);
220 	}
221 	(void) fclose(outf);
222 	exit(0);
223 }
224 
225 /*
226  *	This routine evaluates arguments from the command line
227  */
228 getargs(argc, argv)
229 int	argc;
230 char	**argv;
231 {
232 	extern char	*optarg;
233 	extern int	optind;
234 	int	ch;
235 
236 	while ((ch = getopt(argc, argv, "c:iorsx")) != EOF)
237 		switch(ch) {
238 		case 'c':			/* new delimiting char */
239 			Delimch = *optarg;
240 			if (!isascii(Delimch)) {
241 				printf("bad delimiting character: '\\%o\n'",
242 				       Delimch);
243 			}
244 			break;
245 		case 'i':			/* ignore case in ordering */
246 			Iflag++;
247 			break;
248 		case 'o':			/* order strings */
249 			Oflag++;
250 			break;
251 		case 'r':			/* randomize pointers */
252 			Rflag++;
253 			break;
254 		case 's':			/* silent */
255 			Sflag++;
256 			break;
257 		case 'x':			/* set the rotated bit */
258 			Xflag++;
259 			break;
260 		case '?':
261 		default:
262 			usage();
263 		}
264 	argv += optind;
265 
266 	if (*argv) {
267 		Infile = *argv;
268 		if (*++argv)
269 			(void) strcpy(Outfile, *argv);
270 	}
271 	if (!Infile) {
272 		puts("No input file name");
273 		usage();
274 	}
275 	if (*Outfile == '\0') {
276 		(void) strcpy(Outfile, Infile);
277 		(void) strcat(Outfile, ".dat");
278 	}
279 }
280 
/*
 * usage:
 *	Print the command synopsis on standard error and exit with
 *	status 1.  Does not return.
 */
int
usage()
{
	static char	synopsis[] =
	    "strfile [-iorsx] [-c char] sourcefile [datafile]\n";

	(void) fputs(synopsis, stderr);
	exit(1);
}
287 
288 /*
289  * add_offset:
290  *	Add an offset to the list, or write it out, as appropriate.
291  */
292 add_offset(fp, off)
293 FILE	*fp;
294 off_t	off;
295 {
296 	off_t net;
297 
298 	if (!STORING_PTRS) {
299 		net = htonl(off);
300 		fwrite(&net, 1, sizeof net, fp);
301 	} else {
302 		ALLOC(Seekpts, Num_pts + 1);
303 		Seekpts[Num_pts] = off;
304 	}
305 	Num_pts++;
306 }
307 
308 /*
309  * do_order:
310  *	Order the strings alphabetically (possibly ignoring case).
311  */
312 do_order()
313 {
314 	register int	i;
315 	register off_t	*lp;
316 	register STR	*fp;
317 	extern int	cmp_str();
318 
319 	Sort_1 = fopen(Infile, "r");
320 	Sort_2 = fopen(Infile, "r");
321 	qsort((char *) Firstch, (int) Tbl.str_numstr, sizeof *Firstch, cmp_str);
322 	i = Tbl.str_numstr;
323 	lp = Seekpts;
324 	fp = Firstch;
325 	while (i--)
326 		*lp++ = fp++->pos;
327 	(void) fclose(Sort_1);
328 	(void) fclose(Sort_2);
329 	Tbl.str_flags |= STR_ORDERED;
330 }
331 
/*
 * unctrl:
 *	Return a printable form of the character "c": the character
 *	itself if it is printable, "^?" for DEL (0177), and otherwise
 *	'^' followed by c + 'A' - 1.  The result lives in a static
 *	buffer that is overwritten by the next call.
 */
char *
unctrl(c)
char c;
{
	static char	buf[3];

	/* <ctype.h> routines need an unsigned char value, not a plain char */
	if (isprint((unsigned char) c)) {
		buf[0] = c;
		buf[1] = '\0';
	}
	else if (c == 0177) {
		buf[0] = '^';
		buf[1] = '?';
		buf[2] = '\0';
	}
	else {
		buf[0] = '^';
		buf[1] = c + 'A' - 1;
		buf[2] = '\0';
	}
	return buf;
}
356 
357 cmp_str(p1, p2)
358 STR	*p1, *p2;
359 {
360 	register int	c1, c2;
361 	register int	n1, n2;
362 
363 # define	SET_N(nf,ch)	(nf = (ch == '\n'))
364 # define	IS_END(ch,nf)	(ch == Delimch && nf)
365 
366 	c1 = p1->first;
367 	c2 = p2->first;
368 	if (c1 != c2)
369 		return c1 - c2;
370 
371 	(void) fseek(Sort_1, p1->pos, 0);
372 	(void) fseek(Sort_2, p2->pos, 0);
373 
374 	n1 = FALSE;
375 	n2 = FALSE;
376 	while (!isalnum(c1 = getc(Sort_1)) && c1 != '\0')
377 		SET_N(n1, c1);
378 	while (!isalnum(c2 = getc(Sort_2)) && c2 != '\0')
379 		SET_N(n2, c2);
380 
381 	while (!IS_END(c1, n1) && !IS_END(c2, n2)) {
382 		if (Iflag) {
383 			if (isupper(c1))
384 				c1 = tolower(c1);
385 			if (isupper(c2))
386 				c2 = tolower(c2);
387 		}
388 		if (c1 != c2)
389 			return c1 - c2;
390 		SET_N(n1, c1);
391 		SET_N(n2, c2);
392 		c1 = getc(Sort_1);
393 		c2 = getc(Sort_2);
394 	}
395 	if (IS_END(c1, n1))
396 		c1 = 0;
397 	if (IS_END(c2, n2))
398 		c2 = 0;
399 	return c1 - c2;
400 }
401 
402 /*
403  * randomize:
404  *	Randomize the order of the string table.  We must be careful
405  *	not to randomize across delimiter boundaries.  All
406  *	randomization is done within each block.
407  */
408 randomize()
409 {
410 	register int	cnt, i;
411 	register off_t	tmp;
412 	register off_t	*sp;
413 	extern time_t	time();
414 
415 	srandom((int)(time((time_t *) NULL) + getpid()));
416 
417 	Tbl.str_flags |= STR_RANDOM;
418 	cnt = Tbl.str_numstr;
419 
420 	/*
421 	 * move things around randomly
422 	 */
423 
424 	for (sp = Seekpts; cnt > 0; cnt--, sp++) {
425 		i = random() % cnt;
426 		tmp = sp[0];
427 		sp[0] = sp[i];
428 		sp[i] = tmp;
429 	}
430 }
431