1 /* $OpenBSD: strfile.c,v 1.30 2020/02/14 19:17:33 schwarze Exp $ */ 2 /* $NetBSD: strfile.c,v 1.4 1995/04/24 12:23:09 cgd Exp $ */ 3 4 /*- 5 * Copyright (c) 1989, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Ken Arnold. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 36 #include <ctype.h> 37 #include <err.h> 38 #include <limits.h> 39 #include <stdbool.h> 40 #include <stdio.h> 41 #include <stdlib.h> 42 #include <string.h> 43 #include <unistd.h> 44 45 #include "strfile.h" 46 47 /* 48 * This program takes a file composed of strings separated by 49 * lines starting with two consecutive delimiting character (default 50 * character is '%') and creates another file which consists of a table 51 * describing the file (structure from "strfile.h"), a table of seek 52 * pointers to the start of the strings, and the strings, each terminated 53 * by a null byte. Usage: 54 * 55 * % strfile [-iorsx] [ -cC ] sourcefile [ datafile ] 56 * 57 * c - Change delimiting character from '%' to 'C' 58 * s - Silent. Give no summary of data processed at the end of 59 * the run. 60 * o - order the strings in alphabetic order 61 * i - if ordering, ignore case 62 * r - randomize the order of the strings 63 * x - set rotated bit 64 * 65 * Ken Arnold Sept. 7, 1978 -- 66 * 67 * Added ordering options. 68 */ 69 70 #define STORING_PTRS (Oflag || Rflag) 71 #define CHUNKSIZE 512 72 73 # define ALLOC(ptr,sz) do { \ 74 if (ptr == NULL) \ 75 ptr = calloc(CHUNKSIZE, sizeof *ptr); \ 76 else if (((sz) + 1) % CHUNKSIZE == 0) \ 77 ptr = reallocarray(ptr, \ 78 (sz) + CHUNKSIZE, \ 79 sizeof(*ptr)); \ 80 if (ptr == NULL) \ 81 err(1, NULL); \ 82 } while (0) 83 84 typedef struct { 85 char first; 86 int32_t pos; 87 } STR; 88 89 char *Infile = NULL, /* input file name */ 90 Outfile[PATH_MAX] = "", /* output file name */ 91 Delimch = '%'; /* delimiting character */ 92 93 bool Sflag = false; /* silent run flag */ 94 bool Oflag = false; /* ordering flag */ 95 bool Iflag = false; /* ignore case flag */ 96 bool Rflag = false; /* randomize order flag */ 97 bool Xflag = false; /* set rotated bit */ 98 long Num_pts = 0; /* number of pointers/strings */ 99 100 int32_t *Seekpts; 101 102 FILE *Sort_1, *Sort_2; /* pointers for sorting */ 103 104 STRFILE Tbl; /* statistics table */ 105 106 STR *Firstch; /* first chars of each string */ 107 108 109 void add_offset(FILE *, int32_t); 110 int cmp_str(const void *, const void *); 111 void do_order(void); 112 void getargs(int, char **); 113 void randomize(void); 114 char *unctrl(char); 115 __dead void usage(void); 116 117 /* 118 * main: 119 * Drive the sucker. There are two main modes -- either we store 120 * the seek pointers, if the table is to be sorted or randomized, 121 * or we write the pointer directly to the file, if we are to stay 122 * in file order. If the former, we allocate and re-allocate in 123 * CHUNKSIZE blocks; if the latter, we just write each pointer, 124 * and then seek back to the beginning to write in the table. 125 */ 126 int 127 main(int ac, char *av[]) 128 { 129 bool first; 130 char *sp, dc; 131 FILE *inf, *outf; 132 int32_t last_off, length, pos; 133 int32_t *p; 134 int cnt; 135 char *nsp; 136 STR *fp; 137 static char string[257]; 138 139 if (pledge("stdio rpath wpath cpath", NULL) == -1) 140 err(1, "pledge"); 141 142 getargs(ac, av); /* evalute arguments */ 143 dc = Delimch; 144 if ((inf = fopen(Infile, "r")) == NULL) 145 err(1, "%s", Infile); 146 147 if ((outf = fopen(Outfile, "w")) == NULL) 148 err(1, "%s", Outfile); 149 150 if (pledge("stdio", NULL) == -1) 151 err(1, "pledge"); 152 153 if (!STORING_PTRS) 154 (void) fseek(outf, sizeof Tbl, SEEK_SET); 155 156 /* 157 * Write the strings onto the file 158 */ 159 160 Tbl.str_longlen = 0; 161 Tbl.str_shortlen = (unsigned int) 0xffffffff; 162 Tbl.str_delim = dc; 163 Tbl.str_version = VERSION; 164 first = Oflag; 165 add_offset(outf, ftell(inf)); 166 last_off = 0; 167 do { 168 sp = fgets(string, sizeof(string), inf); 169 if (sp == NULL || (sp[0] == dc && sp[1] == '\n')) { 170 pos = ftell(inf); 171 length = pos - last_off - (sp ? strlen(sp) : 0); 172 last_off = pos; 173 if (!length) 174 continue; 175 add_offset(outf, pos); 176 if (Tbl.str_longlen < (u_int32_t)length) 177 Tbl.str_longlen = length; 178 if (Tbl.str_shortlen > (u_int32_t)length) 179 Tbl.str_shortlen = length; 180 first = Oflag; 181 } else if (first) { 182 for (nsp = sp; !isalnum((unsigned char)*nsp); nsp++) 183 continue; 184 ALLOC(Firstch, Num_pts); 185 fp = &Firstch[Num_pts - 1]; 186 if (Iflag && isupper((unsigned char)*nsp)) 187 fp->first = tolower((unsigned char)*nsp); 188 else 189 fp->first = *nsp; 190 fp->pos = Seekpts[Num_pts - 1]; 191 first = false; 192 } 193 } while (sp != NULL); 194 195 /* 196 * write the tables in 197 */ 198 199 (void) fclose(inf); 200 Tbl.str_numstr = Num_pts - 1; 201 if (Tbl.str_numstr == 0) 202 Tbl.str_shortlen = 0; 203 204 if (Oflag) 205 do_order(); 206 else if (Rflag) 207 randomize(); 208 209 if (Xflag) 210 Tbl.str_flags |= STR_ROTATED; 211 212 if (!Sflag) { 213 printf("\"%s\" created\n", Outfile); 214 if (Tbl.str_numstr == 1) 215 puts("There was 1 string"); 216 else 217 printf("There were %u strings\n", Tbl.str_numstr); 218 printf("Longest string: %lu byte%s\n", 219 (unsigned long) Tbl.str_longlen, 220 Tbl.str_longlen == 1 ? "" : "s"); 221 printf("Shortest string: %lu byte%s\n", 222 (unsigned long) Tbl.str_shortlen, 223 Tbl.str_shortlen == 1 ? "" : "s"); 224 } 225 226 (void) fseek(outf, 0, SEEK_SET); 227 Tbl.str_version = htonl(Tbl.str_version); 228 Tbl.str_numstr = htonl(Tbl.str_numstr); 229 Tbl.str_longlen = htonl(Tbl.str_longlen); 230 Tbl.str_shortlen = htonl(Tbl.str_shortlen); 231 Tbl.str_flags = htonl(Tbl.str_flags); 232 (void) fwrite(&Tbl.str_version, sizeof(Tbl.str_version), 1, outf); 233 (void) fwrite(&Tbl.str_numstr, sizeof(Tbl.str_numstr), 1, outf); 234 (void) fwrite(&Tbl.str_longlen, sizeof(Tbl.str_longlen), 1, outf); 235 (void) fwrite(&Tbl.str_shortlen, sizeof(Tbl.str_shortlen), 1, outf); 236 (void) fwrite(&Tbl.str_flags, sizeof(Tbl.str_flags), 1, outf); 237 (void) fwrite( Tbl.stuff, sizeof(Tbl.stuff), 1, outf); 238 if (STORING_PTRS) { 239 for (p = Seekpts, cnt = Num_pts; cnt--; ++p) { 240 *p = htonl(*p); 241 (void) fwrite(p, sizeof(*p), 1, outf); 242 } 243 } 244 if (fclose(outf)) 245 err(1, "fclose `%s'", Outfile); 246 return 0; 247 } 248 249 /* 250 * This routine evaluates arguments from the command line 251 */ 252 void 253 getargs(int argc, char *argv[]) 254 { 255 int ch; 256 257 while ((ch = getopt(argc, argv, "c:hiorsx")) != -1) { 258 switch(ch) { 259 case 'c': /* new delimiting char */ 260 Delimch = *optarg; 261 if (!isascii((unsigned char)Delimch)) { 262 printf("bad delimiting character: '\\%o\n'", 263 Delimch); 264 } 265 break; 266 case 'i': /* ignore case in ordering */ 267 Iflag = true; 268 break; 269 case 'o': /* order strings */ 270 Oflag = true; 271 break; 272 case 'r': /* randomize pointers */ 273 Rflag = true; 274 break; 275 case 's': /* silent */ 276 Sflag = true; 277 break; 278 case 'x': /* set the rotated bit */ 279 Xflag = true; 280 break; 281 case 'h': 282 default: 283 usage(); 284 } 285 } 286 argv += optind; 287 288 if (*argv) { 289 Infile = *argv; 290 if (*++argv) 291 (void) strlcpy(Outfile, *argv, sizeof Outfile); 292 } 293 if (!Infile) { 294 puts("No input file name"); 295 usage(); 296 } 297 if (*Outfile == '\0') { 298 (void) strlcpy(Outfile, Infile, sizeof(Outfile)); 299 if (strlcat(Outfile, ".dat", sizeof(Outfile)) >= sizeof(Outfile)) 300 errx(1, "`%s': name too long", Infile); 301 } 302 } 303 304 void 305 usage(void) 306 { 307 (void) fprintf(stderr, 308 "%s [-iorsx] [-c char] sourcefile [datafile]\n", getprogname()); 309 exit(1); 310 } 311 312 /* 313 * add_offset: 314 * Add an offset to the list, or write it out, as appropriate. 315 */ 316 void 317 add_offset(FILE *fp, int32_t off) 318 { 319 int32_t net; 320 321 if (!STORING_PTRS) { 322 net = htonl(off); 323 fwrite(&net, 1, sizeof net, fp); 324 } else { 325 ALLOC(Seekpts, Num_pts + 1); 326 Seekpts[Num_pts] = off; 327 } 328 Num_pts++; 329 } 330 331 /* 332 * do_order: 333 * Order the strings alphabetically (possibly ignoring case). 334 */ 335 void 336 do_order(void) 337 { 338 int i; 339 int32_t *lp; 340 STR *fp; 341 342 Sort_1 = fopen(Infile, "r"); 343 Sort_2 = fopen(Infile, "r"); 344 qsort((char *) Firstch, (int) Tbl.str_numstr, sizeof *Firstch, cmp_str); 345 i = Tbl.str_numstr; 346 lp = Seekpts; 347 fp = Firstch; 348 while (i--) 349 *lp++ = fp++->pos; 350 (void) fclose(Sort_1); 351 (void) fclose(Sort_2); 352 Tbl.str_flags |= STR_ORDERED; 353 } 354 355 /* 356 * cmp_str: 357 * Compare two strings in the file 358 */ 359 char * 360 unctrl(char c) 361 { 362 static char buf[3]; 363 364 if (isprint((unsigned char)c)) { 365 buf[0] = c; 366 buf[1] = '\0'; 367 } else if (c == 0177) { 368 buf[0] = '^'; 369 buf[1] = '?'; 370 } else { 371 buf[0] = '^'; 372 buf[1] = c + 'A' - 1; 373 } 374 return buf; 375 } 376 377 int 378 cmp_str(const void *p1, const void *p2) 379 { 380 bool n1, n2; 381 int c1, c2; 382 383 # define SET_N(nf,ch) (nf = (ch == '\n')) 384 # define IS_END(ch,nf) (ch == Delimch && nf) 385 386 c1 = ((STR *)p1)->first; 387 c2 = ((STR *)p2)->first; 388 if (c1 != c2) 389 return c1 - c2; 390 391 (void) fseek(Sort_1, ((STR *)p1)->pos, SEEK_SET); 392 (void) fseek(Sort_2, ((STR *)p2)->pos, SEEK_SET); 393 394 n1 = false; 395 n2 = false; 396 while (!isalnum(c1 = getc(Sort_1)) && c1 != '\0') 397 SET_N(n1, c1); 398 while (!isalnum(c2 = getc(Sort_2)) && c2 != '\0') 399 SET_N(n2, c2); 400 401 while (!IS_END(c1, n1) && !IS_END(c2, n2)) { 402 if (Iflag) { 403 if (isupper(c1)) 404 c1 = tolower(c1); 405 if (isupper(c2)) 406 c2 = tolower(c2); 407 } 408 if (c1 != c2) 409 return c1 - c2; 410 SET_N(n1, c1); 411 SET_N(n2, c2); 412 c1 = getc(Sort_1); 413 c2 = getc(Sort_2); 414 } 415 if (IS_END(c1, n1)) 416 c1 = 0; 417 if (IS_END(c2, n2)) 418 c2 = 0; 419 return c1 - c2; 420 } 421 422 /* 423 * randomize: 424 * Randomize the order of the string table. We must be careful 425 * not to randomize across delimiter boundaries. All 426 * randomization is done within each block. 427 */ 428 void 429 randomize(void) 430 { 431 int cnt, i; 432 int32_t tmp; 433 int32_t *sp; 434 435 Tbl.str_flags |= STR_RANDOM; 436 cnt = Tbl.str_numstr; 437 438 /* 439 * move things around randomly 440 */ 441 442 for (sp = Seekpts; cnt > 0; cnt--, sp++) { 443 i = arc4random_uniform(cnt); 444 tmp = sp[0]; 445 sp[0] = sp[i]; 446 sp[i] = tmp; 447 } 448 } 449