xref: /freebsd/usr.bin/fortune/strfile/strfile.c (revision 1d386b48)
1 /*-
2  * Copyright (c) 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * Ken Arnold.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 #if 0
34 #ifndef lint
35 static const char copyright[] =
36 "@(#) Copyright (c) 1989, 1993\n\
37 	The Regents of the University of California.  All rights reserved.\n";
38 #endif /* not lint */
39 
40 #ifndef lint
41 static const char sccsid[] = "@(#)strfile.c   8.1 (Berkeley) 5/31/93";
42 #endif /* not lint */
43 #endif
44 #include <sys/cdefs.h>
45 #include <sys/param.h>
46 #include <sys/endian.h>
47 #include <ctype.h>
48 #include <locale.h>
49 #include <stdbool.h>
50 #include <stdio.h>
51 #include <stdlib.h>
52 #include <string.h>
53 #include <time.h>
54 #include <unistd.h>
55 
56 #include "strfile.h"
57 
58 /*
59  *	This program takes a file composed of strings separated by
 * lines consisting of a single delimiting character (default
61  * character is '%') and creates another file which consists of a table
62  * describing the file (structure from "strfile.h"), a table of seek
63  * pointers to the start of the strings, and the strings, each terminated
64  * by a null byte.  Usage:
65  *
 *	% strfile [-Ciorsx] [ -c char ] sourcefile [ datafile ]
 *
 *	C - Allow comments marked by a double delimiter at line's beginning
 *	c - Change delimiting character from '%' to 'char'
70  *	s - Silent.  Give no summary of data processed at the end of
71  *	    the run.
72  *	o - order the strings in alphabetic order
73  *	i - if ordering, ignore case
74  *	r - randomize the order of the strings
75  *	x - set rotated bit
76  *
77  *		Ken Arnold	Sept. 7, 1978 --
78  *
79  *	Added ordering options.
80  */
81 
/* True when seek pointers are collected in memory instead of written out. */
#define	STORING_PTRS	(Oflag || Rflag)
/* Number of elements by which ALLOC() sizes and grows its arrays. */
#define	CHUNKSIZE	512

/*
 * ALLOC:
 *	Make sure the array `ptr' has room for the caller's next store.
 *	A NULL `ptr' is given an initial CHUNKSIZE elements.  Otherwise,
 *	whenever (sz + 1) is a multiple of CHUNKSIZE, the array is
 *	resized to (sz + CHUNKSIZE) elements.  On allocation failure a
 *	message is printed on stderr and the program exits with status 1.
 *	Both arguments are evaluated more than once, so they must be free
 *	of side effects.
 */
#define	ALLOC(ptr, sz)	do { \
	if ((ptr) != NULL) { \
		if (((sz) + 1) % CHUNKSIZE == 0) \
			(ptr) = realloc((ptr), \
			    ((sz) + CHUNKSIZE) * sizeof(*(ptr))); \
	} else \
		(ptr) = malloc(CHUNKSIZE * sizeof(*(ptr))); \
	if ((ptr) == NULL) { \
		fputs("out of space\n", stderr); \
		exit(1); \
	} \
} while (0)
95 
96 typedef struct {
97 	int	first;
98 	off_t	pos;
99 } STR;
100 
101 static char	*Infile		= NULL,		/* input file name */
102 		Outfile[MAXPATHLEN] = "",	/* output file name */
103 		Delimch		= '%';		/* delimiting character */
104 
105 static int	Cflag		= false;	/* embedded comments */
106 static int	Sflag		= false;	/* silent run flag */
107 static int	Oflag		= false;	/* ordering flag */
108 static int	Iflag		= false;	/* ignore case flag */
109 static int	Rflag		= false;	/* randomize order flag */
110 static int	Xflag		= false;	/* set rotated bit */
111 static uint32_t	Num_pts		= 0;		/* number of pointers/strings */
112 
113 static off_t	*Seekpts;
114 
115 static FILE	*Sort_1, *Sort_2;		/* pointers for sorting */
116 
117 static STRFILE	Tbl;				/* statistics table */
118 
119 static STR	*Firstch;			/* first chars of each string */
120 
121 static void add_offset(FILE *, off_t);
122 static int cmp_str(const void *, const void *);
123 static int stable_collate_range_cmp(int, int);
124 static void do_order(void);
125 static void getargs(int, char **);
126 static void randomize(void);
127 static void usage(void) __dead2;
128 
129 /*
130  * main:
131  *	Drive the sucker.  There are two main modes -- either we store
132  *	the seek pointers, if the table is to be sorted or randomized,
133  *	or we write the pointer directly to the file, if we are to stay
134  *	in file order.  If the former, we allocate and re-allocate in
135  *	CHUNKSIZE blocks; if the latter, we just write each pointer,
136  *	and then seek back to the beginning to write in the table.
137  */
138 int
139 main(int ac, char *av[])
140 {
141 	char *sp, *nsp, dc;
142 	FILE *inf, *outf;
143 	off_t last_off, pos, *p;
144 	size_t length;
145 	int first;
146 	uint32_t cnt;
147 	STR *fp;
148 	static char string[257];
149 
150 	setlocale(LC_ALL, "");
151 
152 	getargs(ac, av);		/* evalute arguments */
153 	dc = Delimch;
154 	if ((inf = fopen(Infile, "r")) == NULL) {
155 		perror(Infile);
156 		exit(1);
157 	}
158 
159 	if ((outf = fopen(Outfile, "w")) == NULL) {
160 		perror(Outfile);
161 		exit(1);
162 	}
163 	if (!STORING_PTRS)
164 		fseek(outf, (long)sizeof(Tbl), SEEK_SET);
165 
166 	/*
167 	 * Write the strings onto the file
168 	 */
169 
170 	Tbl.str_longlen = 0;
171 	Tbl.str_shortlen = 0xffffffff;
172 	Tbl.str_delim = dc;
173 	Tbl.str_version = VERSION;
174 	first = Oflag;
175 	add_offset(outf, ftello(inf));
176 	last_off = 0;
177 	do {
178 		sp = fgets(string, 256, inf);
179 		if (sp == NULL || (sp[0] == dc && sp[1] == '\n')) {
180 			pos = ftello(inf);
181 			length = (size_t)(pos - last_off) -
182 			    (sp != NULL ? strlen(sp) : 0);
183 			last_off = pos;
184 			if (length == 0)
185 				continue;
186 			add_offset(outf, pos);
187 			if ((size_t)Tbl.str_longlen < length)
188 				Tbl.str_longlen = length;
189 			if ((size_t)Tbl.str_shortlen > length)
190 				Tbl.str_shortlen = length;
191 			first = Oflag;
192 		}
193 		else if (first) {
194 			for (nsp = sp; !isalnum((unsigned char)*nsp); nsp++)
195 				continue;
196 			ALLOC(Firstch, Num_pts);
197 			fp = &Firstch[Num_pts - 1];
198 			if (Iflag && isupper((unsigned char)*nsp))
199 				fp->first = tolower((unsigned char)*nsp);
200 			else
201 				fp->first = *nsp;
202 			fp->pos = Seekpts[Num_pts - 1];
203 			first = false;
204 		}
205 	} while (sp != NULL);
206 
207 	/*
208 	 * write the tables in
209 	 */
210 
211 	fclose(inf);
212 	Tbl.str_numstr = Num_pts - 1;
213 
214 	if (Cflag)
215 		Tbl.str_flags |= STR_COMMENTS;
216 
217 	if (Oflag)
218 		do_order();
219 	else if (Rflag)
220 		randomize();
221 
222 	if (Xflag)
223 		Tbl.str_flags |= STR_ROTATED;
224 
225 	if (!Sflag) {
226 		printf("\"%s\" created\n", Outfile);
227 		if (Num_pts == 2)
228 			puts("There was 1 string");
229 		else
230 			printf("There were %u strings\n", Num_pts - 1);
231 		printf("Longest string: %u byte%s\n", Tbl.str_longlen,
232 		       Tbl.str_longlen == 1 ? "" : "s");
233 		printf("Shortest string: %u byte%s\n", Tbl.str_shortlen,
234 		       Tbl.str_shortlen == 1 ? "" : "s");
235 	}
236 
237 	rewind(outf);
238 	Tbl.str_version = htobe32(Tbl.str_version);
239 	Tbl.str_numstr = htobe32(Tbl.str_numstr);
240 	Tbl.str_longlen = htobe32(Tbl.str_longlen);
241 	Tbl.str_shortlen = htobe32(Tbl.str_shortlen);
242 	Tbl.str_flags = htobe32(Tbl.str_flags);
243 	fwrite((char *)&Tbl, sizeof(Tbl), 1, outf);
244 	if (STORING_PTRS) {
245 		for (p = Seekpts, cnt = Num_pts; cnt--; ++p)
246 			*p = htobe64(*p);
247 		fwrite(Seekpts, sizeof(*Seekpts), (size_t)Num_pts, outf);
248 	}
249 	fclose(outf);
250 	exit(0);
251 }
252 
253 /*
254  *	This routine evaluates arguments from the command line
255  */
256 void
257 getargs(int argc, char **argv)
258 {
259 	int ch;
260 
261 	while ((ch = getopt(argc, argv, "Cc:iorsx")) != -1)
262 		switch(ch) {
263 		case 'C':			/* embedded comments */
264 			Cflag++;
265 			break;
266 		case 'c':			/* new delimiting char */
267 			Delimch = *optarg;
268 			if (!isascii(Delimch)) {
269 				printf("bad delimiting character: '\\%o\n'",
270 				       (unsigned char)Delimch);
271 			}
272 			break;
273 		case 'i':			/* ignore case in ordering */
274 			Iflag++;
275 			break;
276 		case 'o':			/* order strings */
277 			Oflag++;
278 			break;
279 		case 'r':			/* randomize pointers */
280 			Rflag++;
281 			break;
282 		case 's':			/* silent */
283 			Sflag++;
284 			break;
285 		case 'x':			/* set the rotated bit */
286 			Xflag++;
287 			break;
288 		case '?':
289 		default:
290 			usage();
291 		}
292 	argv += optind;
293 
294 	if (*argv) {
295 		Infile = *argv;
296 		if (*++argv) {
297 			if (strlcpy(Outfile, *argv, sizeof(Outfile)) >=
298 			    sizeof(Outfile)) {
299 				fprintf(stderr,
300 				    "output_file path is too long\n");
301 				exit(1);
302 			}
303 		}
304 	}
305 	if (!Infile) {
306 		puts("No input file name");
307 		usage();
308 	}
309 	if (*Outfile == '\0') {
310 		if ((size_t)snprintf(Outfile, sizeof(Outfile), "%s.dat",
311 		    Infile) >= sizeof(Outfile)) {
312 			fprintf(stderr,
313 			    "generated output_file path is too long\n");
314 			exit(1);
315 		}
316 	}
317 }
318 
/*
 * usage:
 *	Print the command synopsis on stderr and exit with status 1.
 */
void
usage(void)
{
	fputs("strfile [-Ciorsx] [-c char] source_file [output_file]\n",
	    stderr);
	exit(1);
}
326 
327 /*
328  * add_offset:
329  *	Add an offset to the list, or write it out, as appropriate.
330  */
331 void
332 add_offset(FILE *fp, off_t off)
333 {
334 	off_t beoff;
335 
336 	if (!STORING_PTRS) {
337 		beoff = htobe64(off);
338 		fwrite(&beoff, 1, sizeof(beoff), fp);
339 	} else {
340 		ALLOC(Seekpts, Num_pts + 1);
341 		Seekpts[Num_pts] = off;
342 	}
343 	Num_pts++;
344 }
345 
346 /*
347  * do_order:
348  *	Order the strings alphabetically (possibly ignoring case).
349  */
350 void
351 do_order(void)
352 {
353 	uint32_t i;
354 	off_t *lp;
355 	STR *fp;
356 
357 	Sort_1 = fopen(Infile, "r");
358 	Sort_2 = fopen(Infile, "r");
359 	qsort(Firstch, (size_t)Tbl.str_numstr, sizeof(*Firstch), cmp_str);
360 	i = Tbl.str_numstr;
361 	lp = Seekpts;
362 	fp = Firstch;
363 	while (i--)
364 		*lp++ = fp++->pos;
365 	fclose(Sort_1);
366 	fclose(Sort_2);
367 	Tbl.str_flags |= STR_ORDERED;
368 }
369 
/*
 * stable_collate_range_cmp:
 *	Compare two characters according to the current locale's
 *	collation order by handing them to strcoll() as one-character
 *	strings.  If strcoll() reports them equal, the difference of the
 *	two values breaks the tie.  Returns a negative, zero or positive
 *	value.
 */
static int
stable_collate_range_cmp(int c1, int c2)
{
	char lhs[2] = { (char)c1, '\0' };
	char rhs[2] = { (char)c2, '\0' };
	int order;

	order = strcoll(lhs, rhs);
	return (order != 0 ? order : c1 - c2);
}
382 
383 /*
384  * cmp_str:
385  *	Compare two strings in the file
386  */
387 int
388 cmp_str(const void *s1, const void *s2)
389 {
390 	const STR *p1, *p2;
391 	int c1, c2, n1, n2, r;
392 
393 #define	SET_N(nf,ch)	(nf = (ch == '\n'))
394 #define	IS_END(ch,nf)	(ch == EOF || (ch == (unsigned char)Delimch && nf))
395 
396 	p1 = (const STR *)s1;
397 	p2 = (const STR *)s2;
398 
399 	c1 = (unsigned char)p1->first;
400 	c2 = (unsigned char)p2->first;
401 	if ((r = stable_collate_range_cmp(c1, c2)) != 0)
402 		return (r);
403 
404 	fseeko(Sort_1, p1->pos, SEEK_SET);
405 	fseeko(Sort_2, p2->pos, SEEK_SET);
406 
407 	n1 = false;
408 	n2 = false;
409 	while (!isalnum(c1 = getc(Sort_1)) && c1 != '\0' && c1 != EOF)
410 		SET_N(n1, c1);
411 	while (!isalnum(c2 = getc(Sort_2)) && c2 != '\0' && c2 != EOF)
412 		SET_N(n2, c2);
413 
414 	while (!IS_END(c1, n1) && !IS_END(c2, n2)) {
415 		if (Iflag) {
416 			if (isupper(c1))
417 				c1 = tolower(c1);
418 			if (isupper(c2))
419 				c2 = tolower(c2);
420 		}
421 		if ((r = stable_collate_range_cmp(c1, c2)) != 0)
422 			return (r);
423 		SET_N(n1, c1);
424 		SET_N(n2, c2);
425 		c1 = getc(Sort_1);
426 		c2 = getc(Sort_2);
427 	}
428 	if (IS_END(c1, n1))
429 		c1 = 0;
430 	if (IS_END(c2, n2))
431 		c2 = 0;
432 
433 	return (stable_collate_range_cmp(c1, c2));
434 }
435 
436 /*
437  * randomize:
438  *	Randomize the order of the string table.  We must be careful
439  *	not to randomize across delimiter boundaries.  All
440  *	randomization is done within each block.
441  */
442 void
443 randomize(void)
444 {
445 	uint32_t cnt, i;
446 	off_t tmp;
447 	off_t *sp;
448 
449 	Tbl.str_flags |= STR_RANDOM;
450 	cnt = Tbl.str_numstr;
451 
452 	/*
453 	 * move things around randomly
454 	 */
455 
456 	for (sp = Seekpts; cnt > 0; cnt--, sp++) {
457 		i = arc4random_uniform(cnt);
458 		tmp = sp[0];
459 		sp[0] = sp[i];
460 		sp[i] = tmp;
461 	}
462 }
463