1 /*- 2 * Copyright (c) 1991, 1993, 1994 3 * The Regents of the University of California. All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * Steve Hayman of the Computer Science Department, Indiana University, 7 * Michiro Hikida and David Goodenough. 8 * 9 * %sccs.include.redist.c% 10 */ 11 12 #ifndef lint 13 static char copyright[] = 14 "@(#) Copyright (c) 1991, 1993, 1994\n\ 15 The Regents of the University of California. All rights reserved.\n"; 16 #endif /* not lint */ 17 18 #ifndef lint 19 static char sccsid[] = "@(#)join.c 8.6 (Berkeley) 05/04/95"; 20 #endif /* not lint */ 21 22 #include <sys/param.h> 23 24 #include <ctype.h> 25 #include <err.h> 26 #include <errno.h> 27 #include <stdio.h> 28 #include <stdlib.h> 29 #include <string.h> 30 #include <unistd.h> 31 32 /* 33 * There's a structure per input file which encapsulates the state of the 34 * file. We repeatedly read lines from each file until we've read in all 35 * the consecutive lines from the file with a common join field. Then we 36 * compare the set of lines with an equivalent set from the other file. 37 */ 38 typedef struct { 39 char *line; /* line */ 40 u_long linealloc; /* line allocated count */ 41 char **fields; /* line field(s) */ 42 u_long fieldcnt; /* line field(s) count */ 43 u_long fieldalloc; /* line field(s) allocated count */ 44 } LINE; 45 46 typedef struct { 47 FILE *fp; /* file descriptor */ 48 u_long joinf; /* join field (-1, -2, -j) */ 49 int unpair; /* output unpairable lines (-a) */ 50 int number; /* 1 for file 1, 2 for file 2 */ 51 52 LINE *set; /* set of lines with same field */ 53 int pushbool; /* if pushback is set */ 54 u_long pushback; /* line on the stack */ 55 u_long setcnt; /* set count */ 56 u_long setalloc; /* set allocated count */ 57 } INPUT; 58 INPUT input1 = { NULL, 0, 0, 1, NULL, 0, 0, 0, }, 59 input2 = { NULL, 0, 0, 2, NULL, 0, 0, 0, }; 60 61 typedef struct { 62 u_long filenum; /* file number */ 63 u_long fieldno; /* field number */ 64 } OLIST; 65 OLIST *olist; /* output field list */ 66 u_long olistcnt; /* output field list count */ 67 u_long olistalloc; /* output field allocated count */ 68 69 int joinout = 1; /* show lines with matched join fields (-v) */ 70 int needsep; /* need separator character */ 71 int spans = 1; /* span multiple delimiters (-t) */ 72 char *empty; /* empty field replacement string (-e) */ 73 char *tabchar = " \t"; /* delimiter characters (-t) */ 74 75 int cmp __P((LINE *, u_long, LINE *, u_long)); 76 void fieldarg __P((char *)); 77 void joinlines __P((INPUT *, INPUT *)); 78 void obsolete __P((char **)); 79 void outfield __P((LINE *, u_long, int)); 80 void outoneline __P((INPUT *, LINE *)); 81 void outtwoline __P((INPUT *, LINE *, INPUT *, LINE *)); 82 void slurp __P((INPUT *)); 83 void usage __P((void)); 84 85 int 86 main(argc, argv) 87 int argc; 88 char *argv[]; 89 { 90 INPUT *F1, *F2; 91 int aflag, ch, cval, vflag; 92 char *end; 93 94 F1 = &input1; 95 F2 = &input2; 96 97 aflag = vflag = 0; 98 obsolete(argv); 99 while ((ch = getopt(argc, argv, "\01a:e:j:1:2:o:t:v:")) != EOF) { 100 switch (ch) { 101 case '\01': /* See comment in obsolete(). */ 102 aflag = 1; 103 F1->unpair = F2->unpair = 1; 104 break; 105 case '1': 106 if ((F1->joinf = strtol(optarg, &end, 10)) < 1) 107 errx(1, "-1 option field number less than 1"); 108 if (*end) 109 errx(1, "illegal field number -- %s", optarg); 110 --F1->joinf; 111 break; 112 case '2': 113 if ((F2->joinf = strtol(optarg, &end, 10)) < 1) 114 errx(1, "-2 option field number less than 1"); 115 if (*end) 116 errx(1, "illegal field number -- %s", optarg); 117 --F2->joinf; 118 break; 119 case 'a': 120 aflag = 1; 121 switch(strtol(optarg, &end, 10)) { 122 case 1: 123 F1->unpair = 1; 124 break; 125 case 2: 126 F2->unpair = 1; 127 break; 128 default: 129 errx(1, "-a option file number not 1 or 2"); 130 break; 131 } 132 if (*end) 133 errx(1, "illegal file number -- %s", optarg); 134 break; 135 case 'e': 136 empty = optarg; 137 break; 138 case 'j': 139 if ((F1->joinf = F2->joinf = 140 strtol(optarg, &end, 10)) < 1) 141 errx(1, "-j option field number less than 1"); 142 if (*end) 143 errx(1, "illegal field number -- %s", optarg); 144 --F1->joinf; 145 --F2->joinf; 146 break; 147 case 'o': 148 fieldarg(optarg); 149 break; 150 case 't': 151 spans = 0; 152 if (strlen(tabchar = optarg) != 1) 153 errx(1, "illegal tab character specification"); 154 break; 155 case 'v': 156 vflag = 1; 157 joinout = 0; 158 switch (strtol(optarg, &end, 10)) { 159 case 1: 160 F1->unpair = 1; 161 break; 162 case 2: 163 F2->unpair = 1; 164 break; 165 default: 166 errx(1, "-v option file number not 1 or 2"); 167 break; 168 } 169 if (*end) 170 errx(1, "illegal file number -- %s", optarg); 171 break; 172 case '?': 173 default: 174 usage(); 175 } 176 } 177 argc -= optind; 178 argv += optind; 179 180 if (aflag && vflag) 181 errx(1, "the -a and -v options are mutually exclusive"); 182 183 if (argc != 2) 184 usage(); 185 186 /* Open the files; "-" means stdin. */ 187 if (!strcmp(*argv, "-")) 188 F1->fp = stdin; 189 else if ((F1->fp = fopen(*argv, "r")) == NULL) 190 err(1, "%s", *argv); 191 ++argv; 192 if (!strcmp(*argv, "-")) 193 F2->fp = stdin; 194 else if ((F2->fp = fopen(*argv, "r")) == NULL) 195 err(1, "%s", *argv); 196 if (F1->fp == stdin && F2->fp == stdin) 197 errx(1, "only one input file may be stdin"); 198 199 slurp(F1); 200 slurp(F2); 201 while (F1->setcnt && F2->setcnt) { 202 cval = cmp(F1->set, F1->joinf, F2->set, F2->joinf); 203 if (cval == 0) { 204 /* Oh joy, oh rapture, oh beauty divine! */ 205 if (joinout) 206 joinlines(F1, F2); 207 slurp(F1); 208 slurp(F2); 209 } else if (cval < 0) { 210 /* File 1 takes the lead... */ 211 if (F1->unpair) 212 joinlines(F1, NULL); 213 slurp(F1); 214 } else { 215 /* File 2 takes the lead... */ 216 if (F2->unpair) 217 joinlines(F2, NULL); 218 slurp(F2); 219 } 220 } 221 222 /* 223 * Now that one of the files is used up, optionally output any 224 * remaining lines from the other file. 225 */ 226 if (F1->unpair) 227 while (F1->setcnt) { 228 joinlines(F1, NULL); 229 slurp(F1); 230 } 231 if (F2->unpair) 232 while (F2->setcnt) { 233 joinlines(F2, NULL); 234 slurp(F2); 235 } 236 exit(0); 237 } 238 239 void 240 slurp(F) 241 INPUT *F; 242 { 243 LINE *lp, *lastlp, tmp; 244 size_t len; 245 int cnt; 246 char *bp, *fieldp; 247 248 /* 249 * Read all of the lines from an input file that have the same 250 * join field. 251 */ 252 F->setcnt = 0; 253 for (lastlp = NULL;; ++F->setcnt, lastlp = lp) { 254 /* 255 * If we're out of space to hold line structures, allocate 256 * more. Initialize the structure so that we know that this 257 * is new space. 258 */ 259 if (F->setcnt == F->setalloc) { 260 cnt = F->setalloc; 261 F->setalloc += 50; 262 if ((F->set = realloc(F->set, 263 F->setalloc * sizeof(LINE))) == NULL) 264 err(1, NULL); 265 memset(F->set + cnt, 0, 50 * sizeof(LINE)); 266 267 /* re-set lastlp in case it moved */ 268 if (lastlp != NULL) 269 lastlp = &F->set[F->setcnt - 1]; 270 } 271 272 /* 273 * Get any pushed back line, else get the next line. Allocate 274 * space as necessary. If taking the line from the stack swap 275 * the two structures so that we don't lose space allocated to 276 * either structure. This could be avoided by doing another 277 * level of indirection, but it's probably okay as is. 278 * but it's probably okay as is. 279 */ 280 lp = &F->set[F->setcnt]; 281 if (F->pushbool) { 282 tmp = F->set[F->setcnt]; 283 F->set[F->setcnt] = F->set[F->pushback]; 284 F->set[F->pushback] = tmp; 285 F->pushbool = 0; 286 continue; 287 } 288 if ((bp = fgetln(F->fp, &len)) == NULL) 289 return; 290 if (lp->linealloc <= len + 1) { 291 lp->linealloc += MAX(100, len + 1 - lp->linealloc); 292 if ((lp->line = 293 realloc(lp->line, lp->linealloc)) == NULL) 294 err(1, NULL); 295 } 296 memmove(lp->line, bp, len); 297 298 /* Replace trailing newline, if it exists. */ 299 if (bp[len - 1] == '\n') 300 lp->line[len - 1] = '\0'; 301 else 302 lp->line[len] = '\0'; 303 bp = lp->line; 304 305 /* Split the line into fields, allocate space as necessary. */ 306 lp->fieldcnt = 0; 307 while ((fieldp = strsep(&bp, tabchar)) != NULL) { 308 if (spans && *fieldp == '\0') 309 continue; 310 if (lp->fieldcnt == lp->fieldalloc) { 311 lp->fieldalloc += 50; 312 if ((lp->fields = realloc(lp->fields, 313 lp->fieldalloc * sizeof(char *))) == NULL) 314 err(1, NULL); 315 } 316 lp->fields[lp->fieldcnt++] = fieldp; 317 } 318 319 /* See if the join field value has changed. */ 320 if (lastlp != NULL && cmp(lp, F->joinf, lastlp, F->joinf)) { 321 F->pushbool = 1; 322 F->pushback = F->setcnt; 323 break; 324 } 325 } 326 } 327 328 int 329 cmp(lp1, fieldno1, lp2, fieldno2) 330 LINE *lp1, *lp2; 331 u_long fieldno1, fieldno2; 332 { 333 if (lp1->fieldcnt <= fieldno1) 334 return (lp2->fieldcnt <= fieldno2 ? 0 : 1); 335 if (lp2->fieldcnt <= fieldno2) 336 return (-1); 337 return (strcmp(lp1->fields[fieldno1], lp2->fields[fieldno2])); 338 } 339 340 void 341 joinlines(F1, F2) 342 INPUT *F1, *F2; 343 { 344 int cnt1, cnt2; 345 346 /* 347 * Output the results of a join comparison. The output may be from 348 * either file 1 or file 2 (in which case the first argument is the 349 * file from which to output) or from both. 350 */ 351 if (F2 == NULL) { 352 for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1) 353 outoneline(F1, &F1->set[cnt1]); 354 return; 355 } 356 for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1) 357 for (cnt2 = 0; cnt2 < F2->setcnt; ++cnt2) 358 outtwoline(F1, &F1->set[cnt1], F2, &F2->set[cnt2]); 359 } 360 361 void 362 outoneline(F, lp) 363 INPUT *F; 364 LINE *lp; 365 { 366 int cnt; 367 368 /* 369 * Output a single line from one of the files, according to the 370 * join rules. This happens when we are writing unmatched single 371 * lines. Output empty fields in the right places. 372 */ 373 if (olist) 374 for (cnt = 0; cnt < olistcnt; ++cnt) { 375 if (olist[cnt].filenum == F->number) 376 outfield(lp, olist[cnt].fieldno, 0); 377 else 378 outfield(lp, 0, 1); 379 } 380 else 381 for (cnt = 0; cnt < lp->fieldcnt; ++cnt) 382 outfield(lp, cnt, 0); 383 (void)printf("\n"); 384 if (ferror(stdout)) 385 err(1, "stdout"); 386 needsep = 0; 387 } 388 389 void 390 outtwoline(F1, lp1, F2, lp2) 391 INPUT *F1, *F2; 392 LINE *lp1, *lp2; 393 { 394 int cnt; 395 396 /* Output a pair of lines according to the join list (if any). */ 397 if (olist) 398 for (cnt = 0; cnt < olistcnt; ++cnt) 399 if (olist[cnt].filenum == 1) 400 outfield(lp1, olist[cnt].fieldno, 0); 401 else /* if (olist[cnt].filenum == 2) */ 402 outfield(lp2, olist[cnt].fieldno, 0); 403 else { 404 /* 405 * Output the join field, then the remaining fields from F1 406 * and F2. 407 */ 408 outfield(lp1, F1->joinf, 0); 409 for (cnt = 0; cnt < lp1->fieldcnt; ++cnt) 410 if (F1->joinf != cnt) 411 outfield(lp1, cnt, 0); 412 for (cnt = 0; cnt < lp2->fieldcnt; ++cnt) 413 if (F2->joinf != cnt) 414 outfield(lp2, cnt, 0); 415 } 416 (void)printf("\n"); 417 if (ferror(stdout)) 418 err(1, "stdout"); 419 needsep = 0; 420 } 421 422 void 423 outfield(lp, fieldno, out_empty) 424 LINE *lp; 425 u_long fieldno; 426 int out_empty; 427 { 428 if (needsep++) 429 (void)printf("%c", *tabchar); 430 if (!ferror(stdout)) 431 if (lp->fieldcnt < fieldno || out_empty) { 432 if (empty != NULL) 433 (void)printf("%s", empty); 434 } else { 435 if (*lp->fields[fieldno] == '\0') 436 return; 437 (void)printf("%s", lp->fields[fieldno]); 438 } 439 if (ferror(stdout)) 440 err(1, "stdout"); 441 } 442 443 /* 444 * Convert an output list argument "2.1, 1.3, 2.4" into an array of output 445 * fields. 446 */ 447 void 448 fieldarg(option) 449 char *option; 450 { 451 u_long fieldno; 452 char *end, *token; 453 454 while ((token = strsep(&option, ", \t")) != NULL) { 455 if (*token == '\0') 456 continue; 457 if (token[0] != '1' && token[0] != '2' || token[1] != '.') 458 errx(1, "malformed -o option field"); 459 fieldno = strtol(token + 2, &end, 10); 460 if (*end) 461 errx(1, "malformed -o option field"); 462 if (fieldno == 0) 463 errx(1, "field numbers are 1 based"); 464 if (olistcnt == olistalloc) { 465 olistalloc += 50; 466 if ((olist = realloc(olist, 467 olistalloc * sizeof(OLIST))) == NULL) 468 err(1, NULL); 469 } 470 olist[olistcnt].filenum = token[0] - '0'; 471 olist[olistcnt].fieldno = fieldno - 1; 472 ++olistcnt; 473 } 474 } 475 476 void 477 obsolete(argv) 478 char **argv; 479 { 480 int len; 481 char **p, *ap, *t; 482 483 while ((ap = *++argv) != NULL) { 484 /* Return if "--". */ 485 if (ap[0] == '-' && ap[1] == '-') 486 return; 487 switch (ap[1]) { 488 case 'a': 489 /* 490 * The original join allowed "-a", which meant the 491 * same as -a1 plus -a2. POSIX 1003.2, Draft 11.2 492 * only specifies this as "-a 1" and "a -2", so we 493 * have to use another option flag, one that is 494 * unlikely to ever be used or accidentally entered 495 * on the command line. (Well, we could reallocate 496 * the argv array, but that hardly seems worthwhile.) 497 */ 498 if (ap[2] == '\0') 499 ap[1] = '\01'; 500 break; 501 case 'j': 502 /* 503 * The original join allowed "-j[12] arg" and "-j arg". 504 * Convert the former to "-[12] arg". Don't convert 505 * the latter since getopt(3) can handle it. 506 */ 507 switch(ap[2]) { 508 case '1': 509 if (ap[3] != '\0') 510 goto jbad; 511 ap[1] = '1'; 512 ap[2] = '\0'; 513 break; 514 case '2': 515 if (ap[3] != '\0') 516 goto jbad; 517 ap[1] = '2'; 518 ap[2] = '\0'; 519 break; 520 case '\0': 521 break; 522 default: 523 jbad: errx(1, "illegal option -- %s", ap); 524 usage(); 525 } 526 break; 527 case 'o': 528 /* 529 * The original join allowed "-o arg arg". 530 * Convert to "-o arg -o arg". 531 */ 532 if (ap[2] != '\0') 533 break; 534 for (p = argv + 2; *p; ++p) { 535 if (p[0][0] != '1' && 536 p[0][0] != '2' || p[0][1] != '.') 537 break; 538 len = strlen(*p); 539 if (len - 2 != strspn(*p + 2, "0123456789")) 540 break; 541 if ((t = malloc(len + 3)) == NULL) 542 err(1, NULL); 543 t[0] = '-'; 544 t[1] = 'o'; 545 memmove(t + 2, *p, len + 1); 546 *p = t; 547 } 548 argv = p - 1; 549 break; 550 } 551 } 552 } 553 554 void 555 usage() 556 { 557 (void)fprintf(stderr, "%s%s\n", 558 "usage: join [-a fileno | -v fileno ] [-e string] [-1 field] ", 559 "[-2 field]\n [-o list] [-t char] file1 file2"); 560 exit(1); 561 } 562