107917fe8Szrj /*-
207917fe8Szrj * Copyright (c) 1989, 1993
307917fe8Szrj * The Regents of the University of California. All rights reserved.
407917fe8Szrj *
507917fe8Szrj * This code is derived from software contributed to Berkeley by
607917fe8Szrj * Ken Arnold.
707917fe8Szrj *
807917fe8Szrj * Redistribution and use in source and binary forms, with or without
907917fe8Szrj * modification, are permitted provided that the following conditions
1007917fe8Szrj * are met:
1107917fe8Szrj * 1. Redistributions of source code must retain the above copyright
1207917fe8Szrj * notice, this list of conditions and the following disclaimer.
1307917fe8Szrj * 2. Redistributions in binary form must reproduce the above copyright
1407917fe8Szrj * notice, this list of conditions and the following disclaimer in the
1507917fe8Szrj * documentation and/or other materials provided with the distribution.
1607917fe8Szrj * 3. Neither the name of the University nor the names of its contributors
1707917fe8Szrj * may be used to endorse or promote products derived from this software
1807917fe8Szrj * without specific prior written permission.
1907917fe8Szrj *
2007917fe8Szrj * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
2107917fe8Szrj * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
2207917fe8Szrj * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
2307917fe8Szrj * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
2407917fe8Szrj * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
2507917fe8Szrj * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
2607917fe8Szrj * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
2707917fe8Szrj * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
2807917fe8Szrj * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
2907917fe8Szrj * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
3007917fe8Szrj * SUCH DAMAGE.
3107917fe8Szrj *
3207917fe8Szrj * @(#) Copyright (c) 1989, 1993 The Regents of the University of California. All rights reserved.
3307917fe8Szrj * @(#)strfile.c 8.1 (Berkeley) 5/31/93
34*7b5acc11Szrj * $FreeBSD: head/usr.bin/fortune/strfile/strfile.c 316500 2017-04-04 19:46:23Z asomers $
3507917fe8Szrj */
3607917fe8Szrj
3707917fe8Szrj #include <sys/param.h>
38*7b5acc11Szrj #include <sys/endian.h>
3907917fe8Szrj #include <ctype.h>
4007917fe8Szrj #include <locale.h>
4107917fe8Szrj #include <stdbool.h>
4207917fe8Szrj #include <stdio.h>
4307917fe8Szrj #include <stdlib.h>
4407917fe8Szrj #include <string.h>
4507917fe8Szrj #include <time.h>
4607917fe8Szrj #include <unistd.h>
4707917fe8Szrj
4807917fe8Szrj #include "strfile.h"
4907917fe8Szrj
/*
 * This program takes a file composed of strings separated by lines
 * containing only the delimiting character (default character is '%')
 * and creates another file which consists of a table describing the
 * file (structure from "strfile.h") followed by a table of seek
 * pointers to the start of each string in the source file.  Usage:
 *
 *	% strfile [-Ciorsx] [-c char] sourcefile [datafile]
 *
 *	C - Allow comments marked by a double delimiter at line's beginning
 *	c - Change delimiting character from '%' to the given character
 *	s - Silent.  Give no summary of data processed at the end of
 *	    the run.
 *	o - Order the strings in alphabetic order
 *	i - If ordering, ignore case
 *	r - Randomize the order of the strings
 *	x - Set rotated bit
 *
 *		Ken Arnold	Sept. 7, 1978 --
 *
 *	Added ordering options.
 */
7307917fe8Szrj
/* True when the seek pointers are kept in memory (-o or -r was given). */
#define	STORING_PTRS	(Oflag || Rflag)

/* Number of elements by which the in-memory tables grow. */
#define	CHUNKSIZE	512

/*
 * ALLOC:
 *	Grow the table `ptr' on demand.  An unallocated (NULL) table gets
 *	CHUNKSIZE elements; an existing one is resized to `sz' + CHUNKSIZE
 *	elements whenever `sz' + 1 reaches a multiple of CHUNKSIZE.
 *	Prints "out of space" and exits with status 1 if memory runs out.
 */
#define	ALLOC(ptr, sz)	do {						\
	if ((ptr) != NULL) {						\
		if (((sz) + 1) % CHUNKSIZE == 0)			\
			(ptr) = realloc((ptr),				\
			    ((sz) + CHUNKSIZE) * sizeof(*(ptr)));	\
	} else								\
		(ptr) = malloc(CHUNKSIZE * sizeof(*(ptr)));		\
	if ((ptr) == NULL) {						\
		fprintf(stderr, "out of space\n");			\
		exit(1);						\
	}								\
} while (0)
8707917fe8Szrj
/*
 * Sort record for one string: the character used as the primary sort
 * key and the offset of the string in the source file.
 */
typedef struct {
	int	first;		/* first alphanumeric character of the string */
	off_t	pos;		/* seek pointer to the start of the string */
} STR;
9207917fe8Szrj
9307917fe8Szrj static char *Infile = NULL; /* input file name */
9407917fe8Szrj static char Outfile[MAXPATHLEN] = ""; /* output file name */
9507917fe8Szrj static char Delimch = '%'; /* delimiting character */
9607917fe8Szrj
9707917fe8Szrj static int Cflag = false; /* embedded comments */
9807917fe8Szrj static int Sflag = false; /* silent run flag */
9907917fe8Szrj static int Oflag = false; /* ordering flag */
10007917fe8Szrj static int Iflag = false; /* ignore case flag */
10107917fe8Szrj static int Rflag = false; /* randomize order flag */
10207917fe8Szrj static int Xflag = false; /* set rotated bit */
103*7b5acc11Szrj static uint32_t Num_pts = 0; /* number of pointers/strings */
10407917fe8Szrj
105*7b5acc11Szrj static off_t *Seekpts;
10607917fe8Szrj
10707917fe8Szrj static FILE *Sort_1, *Sort_2; /* pointers for sorting */
10807917fe8Szrj
10907917fe8Szrj static STRFILE Tbl; /* statistics table */
11007917fe8Szrj
11107917fe8Szrj static STR *Firstch; /* first chars of each string */
11207917fe8Szrj
113*7b5acc11Szrj static void add_offset(FILE *, off_t);
11407917fe8Szrj static int cmp_str(const void *, const void *);
11507917fe8Szrj static int collate_range_cmp(int, int);
11607917fe8Szrj static void do_order(void);
11707917fe8Szrj static void getargs(int, char **);
11807917fe8Szrj static void randomize(void);
11907917fe8Szrj static void usage(void);
12007917fe8Szrj
12107917fe8Szrj /*
12207917fe8Szrj * main:
12307917fe8Szrj * Drive the sucker. There are two main modes -- either we store
12407917fe8Szrj * the seek pointers, if the table is to be sorted or randomized,
12507917fe8Szrj * or we write the pointer directly to the file, if we are to stay
12607917fe8Szrj * in file order. If the former, we allocate and re-allocate in
12707917fe8Szrj * CHUNKSIZE blocks; if the latter, we just write each pointer,
12807917fe8Szrj * and then seek back to the beginning to write in the table.
12907917fe8Szrj */
13007917fe8Szrj int
main(int argc,char * argv[])13107917fe8Szrj main(int argc, char *argv[])
13207917fe8Szrj {
13307917fe8Szrj char *sp, *nsp, dc;
13407917fe8Szrj FILE *inf, *outf;
135*7b5acc11Szrj off_t last_off, pos, *p;
136*7b5acc11Szrj size_t length;
137*7b5acc11Szrj int first;
138*7b5acc11Szrj uint32_t cnt;
13907917fe8Szrj STR *fp;
14007917fe8Szrj static char string[257];
14107917fe8Szrj
14207917fe8Szrj setlocale(LC_ALL, "");
14307917fe8Szrj
14407917fe8Szrj getargs(argc, argv); /* evalute arguments */
14507917fe8Szrj dc = Delimch;
14607917fe8Szrj if ((inf = fopen(Infile, "r")) == NULL) {
14707917fe8Szrj perror(Infile);
14807917fe8Szrj exit(1);
14907917fe8Szrj }
15007917fe8Szrj
15107917fe8Szrj if ((outf = fopen(Outfile, "w")) == NULL) {
15207917fe8Szrj perror(Outfile);
15307917fe8Szrj exit(1);
15407917fe8Szrj }
15507917fe8Szrj if (!STORING_PTRS)
156*7b5acc11Szrj fseek(outf, (long)sizeof(Tbl), SEEK_SET);
15707917fe8Szrj
15807917fe8Szrj /*
15907917fe8Szrj * Write the strings onto the file
16007917fe8Szrj */
16107917fe8Szrj
16207917fe8Szrj Tbl.str_longlen = 0;
163*7b5acc11Szrj Tbl.str_shortlen = 0xffffffff;
16407917fe8Szrj Tbl.str_delim = dc;
16507917fe8Szrj Tbl.str_version = VERSION;
16607917fe8Szrj first = Oflag;
167*7b5acc11Szrj add_offset(outf, ftello(inf));
16807917fe8Szrj last_off = 0;
16907917fe8Szrj do {
17007917fe8Szrj sp = fgets(string, 256, inf);
17107917fe8Szrj if (sp == NULL || (sp[0] == dc && sp[1] == '\n')) {
172*7b5acc11Szrj pos = ftello(inf);
173*7b5acc11Szrj length = (size_t)(pos - last_off) -
174*7b5acc11Szrj (sp != NULL ? strlen(sp) : 0);
17507917fe8Szrj last_off = pos;
176*7b5acc11Szrj if (length == 0)
17707917fe8Szrj continue;
17807917fe8Szrj add_offset(outf, pos);
179*7b5acc11Szrj if ((size_t)Tbl.str_longlen < length)
18007917fe8Szrj Tbl.str_longlen = length;
181*7b5acc11Szrj if ((size_t)Tbl.str_shortlen > length)
18207917fe8Szrj Tbl.str_shortlen = length;
18307917fe8Szrj first = Oflag;
18407917fe8Szrj }
18507917fe8Szrj else if (first) {
18607917fe8Szrj for (nsp = sp; !isalnum((unsigned char)*nsp); nsp++)
18707917fe8Szrj continue;
18807917fe8Szrj ALLOC(Firstch, Num_pts);
18907917fe8Szrj fp = &Firstch[Num_pts - 1];
19007917fe8Szrj if (Iflag && isupper((unsigned char)*nsp))
19107917fe8Szrj fp->first = tolower((unsigned char)*nsp);
19207917fe8Szrj else
19307917fe8Szrj fp->first = *nsp;
19407917fe8Szrj fp->pos = Seekpts[Num_pts - 1];
19507917fe8Szrj first = false;
19607917fe8Szrj }
19707917fe8Szrj } while (sp != NULL);
19807917fe8Szrj
19907917fe8Szrj /*
20007917fe8Szrj * write the tables in
20107917fe8Szrj */
20207917fe8Szrj
20307917fe8Szrj fclose(inf);
20407917fe8Szrj Tbl.str_numstr = Num_pts - 1;
20507917fe8Szrj
20607917fe8Szrj if (Cflag)
20707917fe8Szrj Tbl.str_flags |= STR_COMMENTS;
20807917fe8Szrj
20907917fe8Szrj if (Oflag)
21007917fe8Szrj do_order();
21107917fe8Szrj else if (Rflag)
21207917fe8Szrj randomize();
21307917fe8Szrj
21407917fe8Szrj if (Xflag)
21507917fe8Szrj Tbl.str_flags |= STR_ROTATED;
21607917fe8Szrj
21707917fe8Szrj if (!Sflag) {
21807917fe8Szrj printf("\"%s\" created\n", Outfile);
21907917fe8Szrj if (Num_pts == 2)
22007917fe8Szrj puts("There was 1 string");
22107917fe8Szrj else
222*7b5acc11Szrj printf("There were %u strings\n", Num_pts - 1);
223*7b5acc11Szrj printf("Longest string: %u byte%s\n", Tbl.str_longlen,
22407917fe8Szrj Tbl.str_longlen == 1 ? "" : "s");
225*7b5acc11Szrj printf("Shortest string: %u byte%s\n", Tbl.str_shortlen,
22607917fe8Szrj Tbl.str_shortlen == 1 ? "" : "s");
22707917fe8Szrj }
22807917fe8Szrj
22907917fe8Szrj rewind(outf);
230*7b5acc11Szrj Tbl.str_version = htobe32(Tbl.str_version);
231*7b5acc11Szrj Tbl.str_numstr = htobe32(Tbl.str_numstr);
232*7b5acc11Szrj Tbl.str_longlen = htobe32(Tbl.str_longlen);
233*7b5acc11Szrj Tbl.str_shortlen = htobe32(Tbl.str_shortlen);
234*7b5acc11Szrj Tbl.str_flags = htobe32(Tbl.str_flags);
23507917fe8Szrj fwrite((char *)&Tbl, sizeof(Tbl), 1, outf);
23607917fe8Szrj if (STORING_PTRS) {
23707917fe8Szrj for (p = Seekpts, cnt = Num_pts; cnt--; ++p)
238*7b5acc11Szrj *p = htobe64(*p);
239*7b5acc11Szrj fwrite(Seekpts, sizeof(*Seekpts), (size_t)Num_pts, outf);
24007917fe8Szrj }
24107917fe8Szrj fclose(outf);
24207917fe8Szrj exit(0);
24307917fe8Szrj }
24407917fe8Szrj
24507917fe8Szrj /*
24607917fe8Szrj * This routine evaluates arguments from the command line
24707917fe8Szrj */
24807917fe8Szrj void
getargs(int argc,char ** argv)24907917fe8Szrj getargs(int argc, char **argv)
25007917fe8Szrj {
25107917fe8Szrj int ch;
25207917fe8Szrj
25307917fe8Szrj while ((ch = getopt(argc, argv, "Cc:iorsx")) != -1)
25407917fe8Szrj switch(ch) {
25507917fe8Szrj case 'C': /* embedded comments */
25607917fe8Szrj Cflag++;
25707917fe8Szrj break;
25807917fe8Szrj case 'c': /* new delimiting char */
25907917fe8Szrj Delimch = *optarg;
26007917fe8Szrj if (!isascii(Delimch)) {
26107917fe8Szrj printf("bad delimiting character: '\\%o\n'",
26207917fe8Szrj (unsigned char)Delimch);
26307917fe8Szrj }
26407917fe8Szrj break;
26507917fe8Szrj case 'i': /* ignore case in ordering */
26607917fe8Szrj Iflag++;
26707917fe8Szrj break;
26807917fe8Szrj case 'o': /* order strings */
26907917fe8Szrj Oflag++;
27007917fe8Szrj break;
27107917fe8Szrj case 'r': /* randomize pointers */
27207917fe8Szrj Rflag++;
27307917fe8Szrj break;
27407917fe8Szrj case 's': /* silent */
27507917fe8Szrj Sflag++;
27607917fe8Szrj break;
27707917fe8Szrj case 'x': /* set the rotated bit */
27807917fe8Szrj Xflag++;
27907917fe8Szrj break;
28007917fe8Szrj case '?':
28107917fe8Szrj default:
28207917fe8Szrj usage();
28307917fe8Szrj }
28407917fe8Szrj argv += optind;
28507917fe8Szrj
28607917fe8Szrj if (*argv) {
28707917fe8Szrj Infile = *argv;
28807917fe8Szrj if (*++argv)
28907917fe8Szrj strcpy(Outfile, *argv);
29007917fe8Szrj }
29107917fe8Szrj if (!Infile) {
29207917fe8Szrj puts("No input file name");
29307917fe8Szrj usage();
29407917fe8Szrj }
29507917fe8Szrj if (*Outfile == '\0') {
29607917fe8Szrj strcpy(Outfile, Infile);
29707917fe8Szrj strcat(Outfile, ".dat");
29807917fe8Szrj }
29907917fe8Szrj }
30007917fe8Szrj
/*
 * usage:
 *	Print the command synopsis on stderr and exit with status 1.
 */
void
usage(void)
{
	static const char synopsis[] =
	    "strfile [-Ciorsx] [-c char] source_file [output_file]\n";

	fputs(synopsis, stderr);
	exit(1);
}
30807917fe8Szrj
30907917fe8Szrj /*
31007917fe8Szrj * add_offset:
31107917fe8Szrj * Add an offset to the list, or write it out, as appropriate.
31207917fe8Szrj */
31307917fe8Szrj void
add_offset(FILE * fp,off_t off)314*7b5acc11Szrj add_offset(FILE *fp, off_t off)
31507917fe8Szrj {
316*7b5acc11Szrj off_t beoff;
31707917fe8Szrj
31807917fe8Szrj if (!STORING_PTRS) {
319*7b5acc11Szrj beoff = htobe64(off);
320*7b5acc11Szrj fwrite(&beoff, 1, sizeof(beoff), fp);
32107917fe8Szrj } else {
32207917fe8Szrj ALLOC(Seekpts, Num_pts + 1);
32307917fe8Szrj Seekpts[Num_pts] = off;
32407917fe8Szrj }
32507917fe8Szrj Num_pts++;
32607917fe8Szrj }
32707917fe8Szrj
32807917fe8Szrj /*
32907917fe8Szrj * do_order:
33007917fe8Szrj * Order the strings alphabetically (possibly ignoring case).
33107917fe8Szrj */
33207917fe8Szrj void
do_order(void)33307917fe8Szrj do_order(void)
33407917fe8Szrj {
335*7b5acc11Szrj uint32_t i;
336*7b5acc11Szrj off_t *lp;
33707917fe8Szrj STR *fp;
33807917fe8Szrj
33907917fe8Szrj Sort_1 = fopen(Infile, "r");
34007917fe8Szrj Sort_2 = fopen(Infile, "r");
341*7b5acc11Szrj qsort(Firstch, (size_t)Tbl.str_numstr, sizeof(*Firstch), cmp_str);
34207917fe8Szrj i = Tbl.str_numstr;
34307917fe8Szrj lp = Seekpts;
34407917fe8Szrj fp = Firstch;
34507917fe8Szrj while (i--)
34607917fe8Szrj *lp++ = fp++->pos;
34707917fe8Szrj fclose(Sort_1);
34807917fe8Szrj fclose(Sort_2);
34907917fe8Szrj Tbl.str_flags |= STR_ORDERED;
35007917fe8Szrj }
35107917fe8Szrj
/*
 * collate_range_cmp:
 *	Compare two characters by the current locale's collation order,
 *	each taken as a one-character string.  Characters that collate
 *	equal are ordered by their numeric difference (c1 - c2).
 */
static int
collate_range_cmp(int c1, int c2)
{
	const char lhs[2] = { (char)c1, '\0' };
	const char rhs[2] = { (char)c2, '\0' };
	const int order = strcoll(lhs, rhs);

	return (order != 0 ? order : c1 - c2);
}
36407917fe8Szrj
36507917fe8Szrj /*
36607917fe8Szrj * cmp_str:
36707917fe8Szrj * Compare two strings in the file
36807917fe8Szrj */
36907917fe8Szrj int
cmp_str(const void * s1,const void * s2)37007917fe8Szrj cmp_str(const void *s1, const void *s2)
37107917fe8Szrj {
37207917fe8Szrj const STR *p1, *p2;
37307917fe8Szrj int c1, c2, n1, n2, r;
37407917fe8Szrj
37507917fe8Szrj #define SET_N(nf,ch) (nf = (ch == '\n'))
37607917fe8Szrj #define IS_END(ch,nf) (ch == EOF || (ch == (unsigned char)Delimch && nf))
37707917fe8Szrj
37807917fe8Szrj p1 = (const STR *)s1;
37907917fe8Szrj p2 = (const STR *)s2;
38007917fe8Szrj
38107917fe8Szrj c1 = (unsigned char)p1->first;
38207917fe8Szrj c2 = (unsigned char)p2->first;
38307917fe8Szrj if ((r = collate_range_cmp(c1, c2)) != 0)
38407917fe8Szrj return r;
38507917fe8Szrj
386*7b5acc11Szrj fseeko(Sort_1, p1->pos, SEEK_SET);
387*7b5acc11Szrj fseeko(Sort_2, p2->pos, SEEK_SET);
38807917fe8Szrj
38907917fe8Szrj n1 = false;
39007917fe8Szrj n2 = false;
39107917fe8Szrj while (!isalnum(c1 = getc(Sort_1)) && c1 != '\0' && c1 != EOF)
39207917fe8Szrj SET_N(n1, c1);
39307917fe8Szrj while (!isalnum(c2 = getc(Sort_2)) && c2 != '\0' && c2 != EOF)
39407917fe8Szrj SET_N(n2, c2);
39507917fe8Szrj
39607917fe8Szrj while (!IS_END(c1, n1) && !IS_END(c2, n2)) {
39707917fe8Szrj if (Iflag) {
39807917fe8Szrj if (isupper(c1))
39907917fe8Szrj c1 = tolower(c1);
40007917fe8Szrj if (isupper(c2))
40107917fe8Szrj c2 = tolower(c2);
40207917fe8Szrj }
40307917fe8Szrj if ((r = collate_range_cmp(c1, c2)) != 0)
40407917fe8Szrj return r;
40507917fe8Szrj SET_N(n1, c1);
40607917fe8Szrj SET_N(n2, c2);
40707917fe8Szrj c1 = getc(Sort_1);
40807917fe8Szrj c2 = getc(Sort_2);
40907917fe8Szrj }
41007917fe8Szrj if (IS_END(c1, n1))
41107917fe8Szrj c1 = 0;
41207917fe8Szrj if (IS_END(c2, n2))
41307917fe8Szrj c2 = 0;
41407917fe8Szrj
41507917fe8Szrj return (collate_range_cmp(c1, c2));
41607917fe8Szrj }
41707917fe8Szrj
41807917fe8Szrj /*
41907917fe8Szrj * randomize:
42007917fe8Szrj * Randomize the order of the string table. We must be careful
42107917fe8Szrj * not to randomize across delimiter boundaries. All
42207917fe8Szrj * randomization is done within each block.
42307917fe8Szrj */
42407917fe8Szrj void
randomize(void)42507917fe8Szrj randomize(void)
42607917fe8Szrj {
42707917fe8Szrj uint32_t cnt, i;
428*7b5acc11Szrj off_t tmp;
429*7b5acc11Szrj off_t *sp;
43007917fe8Szrj
43107917fe8Szrj Tbl.str_flags |= STR_RANDOM;
43207917fe8Szrj cnt = Tbl.str_numstr;
43307917fe8Szrj
43407917fe8Szrj /*
43507917fe8Szrj * move things around randomly
43607917fe8Szrj */
43707917fe8Szrj
43807917fe8Szrj for (sp = Seekpts; cnt > 0; cnt--, sp++) {
43907917fe8Szrj i = arc4random_uniform(cnt);
44007917fe8Szrj tmp = sp[0];
44107917fe8Szrj sp[0] = sp[i];
44207917fe8Szrj sp[i] = tmp;
44307917fe8Szrj }
44407917fe8Szrj }
445