1 /* $OpenBSD: strfile.c,v 1.29 2017/06/04 13:39:25 fcambus Exp $ */ 2 /* $NetBSD: strfile.c,v 1.4 1995/04/24 12:23:09 cgd Exp $ */ 3 4 /*- 5 * Copyright (c) 1989, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Ken Arnold. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 36 #include <ctype.h> 37 #include <err.h> 38 #include <limits.h> 39 #include <stdbool.h> 40 #include <stdio.h> 41 #include <stdlib.h> 42 #include <string.h> 43 #include <unistd.h> 44 45 #include "strfile.h" 46 47 /* 48 * This program takes a file composed of strings separated by 49 * lines starting with two consecutive delimiting character (default 50 * character is '%') and creates another file which consists of a table 51 * describing the file (structure from "strfile.h"), a table of seek 52 * pointers to the start of the strings, and the strings, each terminated 53 * by a null byte. Usage: 54 * 55 * % strfile [-iorsx] [ -cC ] sourcefile [ datafile ] 56 * 57 * c - Change delimiting character from '%' to 'C' 58 * s - Silent. Give no summary of data processed at the end of 59 * the run. 60 * o - order the strings in alphabetic order 61 * i - if ordering, ignore case 62 * r - randomize the order of the strings 63 * x - set rotated bit 64 * 65 * Ken Arnold Sept. 7, 1978 -- 66 * 67 * Added ordering options. 68 */ 69 70 #define STORING_PTRS (Oflag || Rflag) 71 #define CHUNKSIZE 512 72 73 # define ALLOC(ptr,sz) do { \ 74 if (ptr == NULL) \ 75 ptr = calloc(CHUNKSIZE, sizeof *ptr); \ 76 else if (((sz) + 1) % CHUNKSIZE == 0) \ 77 ptr = reallocarray(ptr, \ 78 (sz) + CHUNKSIZE, \ 79 sizeof(*ptr)); \ 80 if (ptr == NULL) \ 81 err(1, NULL); \ 82 } while (0) 83 84 typedef struct { 85 char first; 86 int32_t pos; 87 } STR; 88 89 char *Infile = NULL, /* input file name */ 90 Outfile[PATH_MAX] = "", /* output file name */ 91 Delimch = '%'; /* delimiting character */ 92 93 bool Sflag = false; /* silent run flag */ 94 bool Oflag = false; /* ordering flag */ 95 bool Iflag = false; /* ignore case flag */ 96 bool Rflag = false; /* randomize order flag */ 97 bool Xflag = false; /* set rotated bit */ 98 long Num_pts = 0; /* number of pointers/strings */ 99 100 int32_t *Seekpts; 101 102 FILE *Sort_1, *Sort_2; /* pointers for sorting */ 103 104 STRFILE Tbl; /* statistics table */ 105 106 STR *Firstch; /* first chars of each string */ 107 108 109 void add_offset(FILE *, int32_t); 110 int cmp_str(const void *, const void *); 111 void do_order(void); 112 void getargs(int, char **); 113 void randomize(void); 114 char *unctrl(char); 115 __dead void usage(void); 116 117 /* 118 * main: 119 * Drive the sucker. There are two main modes -- either we store 120 * the seek pointers, if the table is to be sorted or randomized, 121 * or we write the pointer directly to the file, if we are to stay 122 * in file order. If the former, we allocate and re-allocate in 123 * CHUNKSIZE blocks; if the latter, we just write each pointer, 124 * and then seek back to the beginning to write in the table. 125 */ 126 int 127 main(int ac, char *av[]) 128 { 129 bool first; 130 char *sp, dc; 131 FILE *inf, *outf; 132 int32_t last_off, length, pos; 133 int32_t *p; 134 int cnt; 135 char *nsp; 136 STR *fp; 137 static char string[257]; 138 139 if (pledge("stdio rpath wpath cpath", NULL) == -1) 140 err(1, "pledge"); 141 142 getargs(ac, av); /* evalute arguments */ 143 dc = Delimch; 144 if ((inf = fopen(Infile, "r")) == NULL) 145 err(1, "%s", Infile); 146 147 if ((outf = fopen(Outfile, "w")) == NULL) 148 err(1, "%s", Outfile); 149 150 if (pledge("stdio", NULL) == -1) 151 err(1, "pledge"); 152 153 if (!STORING_PTRS) 154 (void) fseek(outf, sizeof Tbl, SEEK_SET); 155 156 /* 157 * Write the strings onto the file 158 */ 159 160 Tbl.str_longlen = 0; 161 Tbl.str_shortlen = (unsigned int) 0xffffffff; 162 Tbl.str_delim = dc; 163 Tbl.str_version = VERSION; 164 first = Oflag; 165 add_offset(outf, ftell(inf)); 166 last_off = 0; 167 do { 168 sp = fgets(string, sizeof(string), inf); 169 if (sp == NULL || (sp[0] == dc && sp[1] == '\n')) { 170 pos = ftell(inf); 171 length = pos - last_off - (sp ? strlen(sp) : 0); 172 last_off = pos; 173 if (!length) 174 continue; 175 add_offset(outf, pos); 176 if (Tbl.str_longlen < (u_int32_t)length) 177 Tbl.str_longlen = length; 178 if (Tbl.str_shortlen > (u_int32_t)length) 179 Tbl.str_shortlen = length; 180 first = Oflag; 181 } else if (first) { 182 for (nsp = sp; !isalnum((unsigned char)*nsp); nsp++) 183 continue; 184 ALLOC(Firstch, Num_pts); 185 fp = &Firstch[Num_pts - 1]; 186 if (Iflag && isupper((unsigned char)*nsp)) 187 fp->first = tolower((unsigned char)*nsp); 188 else 189 fp->first = *nsp; 190 fp->pos = Seekpts[Num_pts - 1]; 191 first = false; 192 } 193 } while (sp != NULL); 194 195 /* 196 * write the tables in 197 */ 198 199 (void) fclose(inf); 200 Tbl.str_numstr = Num_pts - 1; 201 if (Tbl.str_numstr == 0) 202 Tbl.str_shortlen = 0; 203 204 if (Oflag) 205 do_order(); 206 else if (Rflag) 207 randomize(); 208 209 if (Xflag) 210 Tbl.str_flags |= STR_ROTATED; 211 212 if (!Sflag) { 213 printf("\"%s\" created\n", Outfile); 214 if (Tbl.str_numstr == 1) 215 puts("There was 1 string"); 216 else 217 printf("There were %u strings\n", Tbl.str_numstr); 218 printf("Longest string: %lu byte%s\n", 219 (unsigned long) Tbl.str_longlen, 220 Tbl.str_longlen == 1 ? "" : "s"); 221 printf("Shortest string: %lu byte%s\n", 222 (unsigned long) Tbl.str_shortlen, 223 Tbl.str_shortlen == 1 ? "" : "s"); 224 } 225 226 (void) fseek(outf, 0, SEEK_SET); 227 Tbl.str_version = htonl(Tbl.str_version); 228 Tbl.str_numstr = htonl(Tbl.str_numstr); 229 Tbl.str_longlen = htonl(Tbl.str_longlen); 230 Tbl.str_shortlen = htonl(Tbl.str_shortlen); 231 Tbl.str_flags = htonl(Tbl.str_flags); 232 (void) fwrite(&Tbl.str_version, sizeof(Tbl.str_version), 1, outf); 233 (void) fwrite(&Tbl.str_numstr, sizeof(Tbl.str_numstr), 1, outf); 234 (void) fwrite(&Tbl.str_longlen, sizeof(Tbl.str_longlen), 1, outf); 235 (void) fwrite(&Tbl.str_shortlen, sizeof(Tbl.str_shortlen), 1, outf); 236 (void) fwrite(&Tbl.str_flags, sizeof(Tbl.str_flags), 1, outf); 237 (void) fwrite( Tbl.stuff, sizeof(Tbl.stuff), 1, outf); 238 if (STORING_PTRS) { 239 for (p = Seekpts, cnt = Num_pts; cnt--; ++p) { 240 *p = htonl(*p); 241 (void) fwrite(p, sizeof(*p), 1, outf); 242 } 243 } 244 if (fclose(outf)) 245 err(1, "fclose `%s'", Outfile); 246 return 0; 247 } 248 249 /* 250 * This routine evaluates arguments from the command line 251 */ 252 void 253 getargs(int argc, char *argv[]) 254 { 255 extern char *optarg; 256 extern int optind; 257 int ch; 258 259 while ((ch = getopt(argc, argv, "c:hiorsx")) != -1) { 260 switch(ch) { 261 case 'c': /* new delimiting char */ 262 Delimch = *optarg; 263 if (!isascii((unsigned char)Delimch)) { 264 printf("bad delimiting character: '\\%o\n'", 265 Delimch); 266 } 267 break; 268 case 'i': /* ignore case in ordering */ 269 Iflag = true; 270 break; 271 case 'o': /* order strings */ 272 Oflag = true; 273 break; 274 case 'r': /* randomize pointers */ 275 Rflag = true; 276 break; 277 case 's': /* silent */ 278 Sflag = true; 279 break; 280 case 'x': /* set the rotated bit */ 281 Xflag = true; 282 break; 283 case 'h': 284 default: 285 usage(); 286 } 287 } 288 argv += optind; 289 290 if (*argv) { 291 Infile = *argv; 292 if (*++argv) 293 (void) strlcpy(Outfile, *argv, sizeof Outfile); 294 } 295 if (!Infile) { 296 puts("No input file name"); 297 usage(); 298 } 299 if (*Outfile == '\0') { 300 (void) strlcpy(Outfile, Infile, sizeof(Outfile)); 301 if (strlcat(Outfile, ".dat", sizeof(Outfile)) >= sizeof(Outfile)) 302 errx(1, "`%s': name too long", Infile); 303 } 304 } 305 306 void 307 usage(void) 308 { 309 (void) fprintf(stderr, 310 "%s [-iorsx] [-c char] sourcefile [datafile]\n", getprogname()); 311 exit(1); 312 } 313 314 /* 315 * add_offset: 316 * Add an offset to the list, or write it out, as appropriate. 317 */ 318 void 319 add_offset(FILE *fp, int32_t off) 320 { 321 int32_t net; 322 323 if (!STORING_PTRS) { 324 net = htonl(off); 325 fwrite(&net, 1, sizeof net, fp); 326 } else { 327 ALLOC(Seekpts, Num_pts + 1); 328 Seekpts[Num_pts] = off; 329 } 330 Num_pts++; 331 } 332 333 /* 334 * do_order: 335 * Order the strings alphabetically (possibly ignoring case). 336 */ 337 void 338 do_order(void) 339 { 340 int i; 341 int32_t *lp; 342 STR *fp; 343 344 Sort_1 = fopen(Infile, "r"); 345 Sort_2 = fopen(Infile, "r"); 346 qsort((char *) Firstch, (int) Tbl.str_numstr, sizeof *Firstch, cmp_str); 347 i = Tbl.str_numstr; 348 lp = Seekpts; 349 fp = Firstch; 350 while (i--) 351 *lp++ = fp++->pos; 352 (void) fclose(Sort_1); 353 (void) fclose(Sort_2); 354 Tbl.str_flags |= STR_ORDERED; 355 } 356 357 /* 358 * cmp_str: 359 * Compare two strings in the file 360 */ 361 char * 362 unctrl(char c) 363 { 364 static char buf[3]; 365 366 if (isprint((unsigned char)c)) { 367 buf[0] = c; 368 buf[1] = '\0'; 369 } else if (c == 0177) { 370 buf[0] = '^'; 371 buf[1] = '?'; 372 } else { 373 buf[0] = '^'; 374 buf[1] = c + 'A' - 1; 375 } 376 return buf; 377 } 378 379 int 380 cmp_str(const void *p1, const void *p2) 381 { 382 bool n1, n2; 383 int c1, c2; 384 385 # define SET_N(nf,ch) (nf = (ch == '\n')) 386 # define IS_END(ch,nf) (ch == Delimch && nf) 387 388 c1 = ((STR *)p1)->first; 389 c2 = ((STR *)p2)->first; 390 if (c1 != c2) 391 return c1 - c2; 392 393 (void) fseek(Sort_1, ((STR *)p1)->pos, SEEK_SET); 394 (void) fseek(Sort_2, ((STR *)p2)->pos, SEEK_SET); 395 396 n1 = false; 397 n2 = false; 398 while (!isalnum(c1 = getc(Sort_1)) && c1 != '\0') 399 SET_N(n1, c1); 400 while (!isalnum(c2 = getc(Sort_2)) && c2 != '\0') 401 SET_N(n2, c2); 402 403 while (!IS_END(c1, n1) && !IS_END(c2, n2)) { 404 if (Iflag) { 405 if (isupper(c1)) 406 c1 = tolower(c1); 407 if (isupper(c2)) 408 c2 = tolower(c2); 409 } 410 if (c1 != c2) 411 return c1 - c2; 412 SET_N(n1, c1); 413 SET_N(n2, c2); 414 c1 = getc(Sort_1); 415 c2 = getc(Sort_2); 416 } 417 if (IS_END(c1, n1)) 418 c1 = 0; 419 if (IS_END(c2, n2)) 420 c2 = 0; 421 return c1 - c2; 422 } 423 424 /* 425 * randomize: 426 * Randomize the order of the string table. We must be careful 427 * not to randomize across delimiter boundaries. All 428 * randomization is done within each block. 429 */ 430 void 431 randomize(void) 432 { 433 int cnt, i; 434 int32_t tmp; 435 int32_t *sp; 436 437 Tbl.str_flags |= STR_RANDOM; 438 cnt = Tbl.str_numstr; 439 440 /* 441 * move things around randomly 442 */ 443 444 for (sp = Seekpts; cnt > 0; cnt--, sp++) { 445 i = arc4random_uniform(cnt); 446 tmp = sp[0]; 447 sp[0] = sp[i]; 448 sp[i] = tmp; 449 } 450 } 451