1 /*- 2 * Copyright (c) 1991, 1993, 1994 3 * The Regents of the University of California. All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * Steve Hayman of the Computer Science Department, Indiana University, 7 * Michiro Hikida and David Goodenough. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed by the University of 20 * California, Berkeley and its contributors. 21 * 4. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 */ 37 38 #ifndef lint 39 static const char copyright[] = 40 "@(#) Copyright (c) 1991, 1993, 1994\n\ 41 The Regents of the University of California. All rights reserved.\n"; 42 #endif /* not lint */ 43 44 #ifndef lint 45 #if 0 46 static char sccsid[] = "@(#)join.c 8.6 (Berkeley) 5/4/95"; 47 #endif 48 static const char rcsid[] = 49 "$FreeBSD: src/usr.bin/join/join.c,v 1.10.2.1 2002/06/18 05:14:49 jmallett Exp $"; 50 #endif /* not lint */ 51 52 #include <sys/param.h> 53 54 #include <err.h> 55 #include <errno.h> 56 #include <locale.h> 57 #include <stdio.h> 58 #include <stdlib.h> 59 #include <string.h> 60 #include <unistd.h> 61 62 /* 63 * There's a structure per input file which encapsulates the state of the 64 * file. We repeatedly read lines from each file until we've read in all 65 * the consecutive lines from the file with a common join field. Then we 66 * compare the set of lines with an equivalent set from the other file. 67 */ 68 typedef struct { 69 char *line; /* line */ 70 u_long linealloc; /* line allocated count */ 71 char **fields; /* line field(s) */ 72 u_long fieldcnt; /* line field(s) count */ 73 u_long fieldalloc; /* line field(s) allocated count */ 74 } LINE; 75 76 typedef struct { 77 FILE *fp; /* file descriptor */ 78 u_long joinf; /* join field (-1, -2, -j) */ 79 int unpair; /* output unpairable lines (-a) */ 80 int number; /* 1 for file 1, 2 for file 2 */ 81 82 LINE *set; /* set of lines with same field */ 83 int pushbool; /* if pushback is set */ 84 u_long pushback; /* line on the stack */ 85 u_long setcnt; /* set count */ 86 u_long setalloc; /* set allocated count */ 87 } INPUT; 88 INPUT input1 = { NULL, 0, 0, 1, NULL, 0, 0, 0, 0 }, 89 input2 = { NULL, 0, 0, 2, NULL, 0, 0, 0, 0 }; 90 91 typedef struct { 92 u_long filenum; /* file number */ 93 u_long fieldno; /* field number */ 94 } OLIST; 95 OLIST *olist; /* output field list */ 96 u_long olistcnt; /* output field list count */ 97 u_long olistalloc; /* output field allocated count */ 98 99 int joinout = 1; /* show lines with matched join fields (-v) */ 100 int needsep; /* need separator character */ 101 int spans = 1; /* span multiple delimiters (-t) */ 102 char *empty; /* empty field replacement string (-e) */ 103 static char default_tabchar[] = " \t"; 104 char *tabchar = default_tabchar;/* delimiter characters (-t) */ 105 106 int cmp(LINE *, u_long, LINE *, u_long); 107 void fieldarg(char *); 108 void joinlines(INPUT *, INPUT *); 109 void obsolete(char **); 110 void outfield(LINE *, u_long, int); 111 void outoneline(INPUT *, LINE *); 112 void outtwoline(INPUT *, LINE *, INPUT *, LINE *); 113 void slurp(INPUT *); 114 void usage(void); 115 116 int 117 main(argc, argv) 118 int argc; 119 char *argv[]; 120 { 121 INPUT *F1, *F2; 122 int aflag, ch, cval, vflag; 123 char *end; 124 125 setlocale(LC_ALL, ""); 126 127 F1 = &input1; 128 F2 = &input2; 129 130 aflag = vflag = 0; 131 obsolete(argv); 132 while ((ch = getopt(argc, argv, "\01a:e:j:1:2:o:t:v:")) != -1) { 133 switch (ch) { 134 case '\01': /* See comment in obsolete(). */ 135 aflag = 1; 136 F1->unpair = F2->unpair = 1; 137 break; 138 case '1': 139 if ((F1->joinf = strtol(optarg, &end, 10)) < 1) 140 errx(1, "-1 option field number less than 1"); 141 if (*end) 142 errx(1, "illegal field number -- %s", optarg); 143 --F1->joinf; 144 break; 145 case '2': 146 if ((F2->joinf = strtol(optarg, &end, 10)) < 1) 147 errx(1, "-2 option field number less than 1"); 148 if (*end) 149 errx(1, "illegal field number -- %s", optarg); 150 --F2->joinf; 151 break; 152 case 'a': 153 aflag = 1; 154 switch(strtol(optarg, &end, 10)) { 155 case 1: 156 F1->unpair = 1; 157 break; 158 case 2: 159 F2->unpair = 1; 160 break; 161 default: 162 errx(1, "-a option file number not 1 or 2"); 163 break; 164 } 165 if (*end) 166 errx(1, "illegal file number -- %s", optarg); 167 break; 168 case 'e': 169 empty = optarg; 170 break; 171 case 'j': 172 if ((F1->joinf = F2->joinf = 173 strtol(optarg, &end, 10)) < 1) 174 errx(1, "-j option field number less than 1"); 175 if (*end) 176 errx(1, "illegal field number -- %s", optarg); 177 --F1->joinf; 178 --F2->joinf; 179 break; 180 case 'o': 181 fieldarg(optarg); 182 break; 183 case 't': 184 spans = 0; 185 if (strlen(tabchar = optarg) != 1) 186 errx(1, "illegal tab character specification"); 187 break; 188 case 'v': 189 vflag = 1; 190 joinout = 0; 191 switch (strtol(optarg, &end, 10)) { 192 case 1: 193 F1->unpair = 1; 194 break; 195 case 2: 196 F2->unpair = 1; 197 break; 198 default: 199 errx(1, "-v option file number not 1 or 2"); 200 break; 201 } 202 if (*end) 203 errx(1, "illegal file number -- %s", optarg); 204 break; 205 case '?': 206 default: 207 usage(); 208 } 209 } 210 argc -= optind; 211 argv += optind; 212 213 if (aflag && vflag) 214 errx(1, "the -a and -v options are mutually exclusive"); 215 216 if (argc != 2) 217 usage(); 218 219 /* Open the files; "-" means stdin. */ 220 if (!strcmp(*argv, "-")) 221 F1->fp = stdin; 222 else if ((F1->fp = fopen(*argv, "r")) == NULL) 223 err(1, "%s", *argv); 224 ++argv; 225 if (!strcmp(*argv, "-")) 226 F2->fp = stdin; 227 else if ((F2->fp = fopen(*argv, "r")) == NULL) 228 err(1, "%s", *argv); 229 if (F1->fp == stdin && F2->fp == stdin) 230 errx(1, "only one input file may be stdin"); 231 232 slurp(F1); 233 slurp(F2); 234 while (F1->setcnt && F2->setcnt) { 235 cval = cmp(F1->set, F1->joinf, F2->set, F2->joinf); 236 if (cval == 0) { 237 /* Oh joy, oh rapture, oh beauty divine! */ 238 if (joinout) 239 joinlines(F1, F2); 240 slurp(F1); 241 slurp(F2); 242 } else if (cval < 0) { 243 /* File 1 takes the lead... */ 244 if (F1->unpair) 245 joinlines(F1, NULL); 246 slurp(F1); 247 } else { 248 /* File 2 takes the lead... */ 249 if (F2->unpair) 250 joinlines(F2, NULL); 251 slurp(F2); 252 } 253 } 254 255 /* 256 * Now that one of the files is used up, optionally output any 257 * remaining lines from the other file. 258 */ 259 if (F1->unpair) 260 while (F1->setcnt) { 261 joinlines(F1, NULL); 262 slurp(F1); 263 } 264 if (F2->unpair) 265 while (F2->setcnt) { 266 joinlines(F2, NULL); 267 slurp(F2); 268 } 269 exit(0); 270 } 271 272 void 273 slurp(F) 274 INPUT *F; 275 { 276 LINE *lp, *lastlp, tmp; 277 size_t len; 278 int cnt; 279 char *bp, *fieldp; 280 281 /* 282 * Read all of the lines from an input file that have the same 283 * join field. 284 */ 285 F->setcnt = 0; 286 for (lastlp = NULL;; ++F->setcnt) { 287 /* 288 * If we're out of space to hold line structures, allocate 289 * more. Initialize the structure so that we know that this 290 * is new space. 291 */ 292 if (F->setcnt == F->setalloc) { 293 cnt = F->setalloc; 294 F->setalloc += 50; 295 if ((F->set = realloc(F->set, 296 F->setalloc * sizeof(LINE))) == NULL) 297 err(1, NULL); 298 memset(F->set + cnt, 0, 50 * sizeof(LINE)); 299 300 /* re-set lastlp in case it moved */ 301 if (lastlp != NULL) 302 lastlp = &F->set[F->setcnt - 1]; 303 } 304 305 /* 306 * Get any pushed back line, else get the next line. Allocate 307 * space as necessary. If taking the line from the stack swap 308 * the two structures so that we don't lose space allocated to 309 * either structure. This could be avoided by doing another 310 * level of indirection, but it's probably okay as is. 311 */ 312 lp = &F->set[F->setcnt]; 313 if (F->setcnt) 314 lastlp = &F->set[F->setcnt - 1]; 315 if (F->pushbool) { 316 tmp = F->set[F->setcnt]; 317 F->set[F->setcnt] = F->set[F->pushback]; 318 F->set[F->pushback] = tmp; 319 F->pushbool = 0; 320 continue; 321 } 322 if ((bp = fgetln(F->fp, &len)) == NULL) 323 return; 324 if (lp->linealloc <= len + 1) { 325 lp->linealloc += MAX(100, len + 1 - lp->linealloc); 326 if ((lp->line = 327 realloc(lp->line, lp->linealloc)) == NULL) 328 err(1, NULL); 329 } 330 memmove(lp->line, bp, len); 331 332 /* Replace trailing newline, if it exists. */ 333 if (bp[len - 1] == '\n') 334 lp->line[len - 1] = '\0'; 335 else 336 lp->line[len] = '\0'; 337 bp = lp->line; 338 339 /* Split the line into fields, allocate space as necessary. */ 340 lp->fieldcnt = 0; 341 while ((fieldp = strsep(&bp, tabchar)) != NULL) { 342 if (spans && *fieldp == '\0') 343 continue; 344 if (lp->fieldcnt == lp->fieldalloc) { 345 lp->fieldalloc += 50; 346 if ((lp->fields = realloc(lp->fields, 347 lp->fieldalloc * sizeof(char *))) == NULL) 348 err(1, NULL); 349 } 350 lp->fields[lp->fieldcnt++] = fieldp; 351 } 352 353 /* See if the join field value has changed. */ 354 if (lastlp != NULL && cmp(lp, F->joinf, lastlp, F->joinf)) { 355 F->pushbool = 1; 356 F->pushback = F->setcnt; 357 break; 358 } 359 } 360 } 361 362 int 363 cmp(lp1, fieldno1, lp2, fieldno2) 364 LINE *lp1, *lp2; 365 u_long fieldno1, fieldno2; 366 { 367 if (lp1->fieldcnt <= fieldno1) 368 return (lp2->fieldcnt <= fieldno2 ? 0 : 1); 369 if (lp2->fieldcnt <= fieldno2) 370 return (-1); 371 return (strcoll(lp1->fields[fieldno1], lp2->fields[fieldno2])); 372 } 373 374 void 375 joinlines(F1, F2) 376 INPUT *F1, *F2; 377 { 378 unsigned int cnt1, cnt2; 379 380 /* 381 * Output the results of a join comparison. The output may be from 382 * either file 1 or file 2 (in which case the first argument is the 383 * file from which to output) or from both. 384 */ 385 if (F2 == NULL) { 386 for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1) 387 outoneline(F1, &F1->set[cnt1]); 388 return; 389 } 390 for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1) 391 for (cnt2 = 0; cnt2 < F2->setcnt; ++cnt2) 392 outtwoline(F1, &F1->set[cnt1], F2, &F2->set[cnt2]); 393 } 394 395 void 396 outoneline(F, lp) 397 INPUT *F; 398 LINE *lp; 399 { 400 unsigned int cnt; 401 402 /* 403 * Output a single line from one of the files, according to the 404 * join rules. This happens when we are writing unmatched single 405 * lines. Output empty fields in the right places. 406 */ 407 if (olist) 408 for (cnt = 0; cnt < olistcnt; ++cnt) { 409 if (olist[cnt].filenum == (unsigned)F->number) 410 outfield(lp, olist[cnt].fieldno, 0); 411 else if (olist[cnt].filenum == 0) 412 outfield(lp, F->joinf, 0); 413 else 414 outfield(lp, 0, 1); 415 } 416 else 417 for (cnt = 0; cnt < lp->fieldcnt; ++cnt) 418 outfield(lp, cnt, 0); 419 (void)printf("\n"); 420 if (ferror(stdout)) 421 err(1, "stdout"); 422 needsep = 0; 423 } 424 425 void 426 outtwoline(F1, lp1, F2, lp2) 427 INPUT *F1, *F2; 428 LINE *lp1, *lp2; 429 { 430 unsigned int cnt; 431 432 /* Output a pair of lines according to the join list (if any). */ 433 if (olist) 434 for (cnt = 0; cnt < olistcnt; ++cnt) 435 if (olist[cnt].filenum == 0) { 436 if (lp1->fieldcnt >= F1->joinf) 437 outfield(lp1, F1->joinf, 0); 438 else 439 outfield(lp2, F2->joinf, 0); 440 } else if (olist[cnt].filenum == 1) 441 outfield(lp1, olist[cnt].fieldno, 0); 442 else /* if (olist[cnt].filenum == 2) */ 443 outfield(lp2, olist[cnt].fieldno, 0); 444 else { 445 /* 446 * Output the join field, then the remaining fields from F1 447 * and F2. 448 */ 449 outfield(lp1, F1->joinf, 0); 450 for (cnt = 0; cnt < lp1->fieldcnt; ++cnt) 451 if (F1->joinf != cnt) 452 outfield(lp1, cnt, 0); 453 for (cnt = 0; cnt < lp2->fieldcnt; ++cnt) 454 if (F2->joinf != cnt) 455 outfield(lp2, cnt, 0); 456 } 457 (void)printf("\n"); 458 if (ferror(stdout)) 459 err(1, "stdout"); 460 needsep = 0; 461 } 462 463 void 464 outfield(lp, fieldno, out_empty) 465 LINE *lp; 466 u_long fieldno; 467 int out_empty; 468 { 469 if (needsep++) 470 (void)printf("%c", *tabchar); 471 if (!ferror(stdout)) { 472 if (lp->fieldcnt <= fieldno || out_empty) { 473 if (empty != NULL) 474 (void)printf("%s", empty); 475 } else { 476 if (*lp->fields[fieldno] == '\0') 477 return; 478 (void)printf("%s", lp->fields[fieldno]); 479 } 480 } 481 if (ferror(stdout)) 482 err(1, "stdout"); 483 } 484 485 /* 486 * Convert an output list argument "2.1, 1.3, 2.4" into an array of output 487 * fields. 488 */ 489 void 490 fieldarg(option) 491 char *option; 492 { 493 u_long fieldno, filenum; 494 char *end, *token; 495 496 while ((token = strsep(&option, ", \t")) != NULL) { 497 if (*token == '\0') 498 continue; 499 if (token[0] == '0') 500 filenum = fieldno = 0; 501 else if ((token[0] == '1' || token[0] == '2') && 502 token[1] == '.') { 503 filenum = token[0] - '0'; 504 fieldno = strtol(token + 2, &end, 10); 505 if (*end) 506 errx(1, "malformed -o option field"); 507 if (fieldno == 0) 508 errx(1, "field numbers are 1 based"); 509 --fieldno; 510 } else 511 errx(1, "malformed -o option field"); 512 if (olistcnt == olistalloc) { 513 olistalloc += 50; 514 if ((olist = realloc(olist, 515 olistalloc * sizeof(OLIST))) == NULL) 516 err(1, NULL); 517 } 518 olist[olistcnt].filenum = filenum; 519 olist[olistcnt].fieldno = fieldno; 520 ++olistcnt; 521 } 522 } 523 524 void 525 obsolete(argv) 526 char **argv; 527 { 528 unsigned int len; 529 char **p, *ap, *t; 530 531 while ((ap = *++argv) != NULL) { 532 /* Return if "--". */ 533 if (ap[0] == '-' && ap[1] == '-') 534 return; 535 /* skip if not an option */ 536 if (ap[0] != '-') 537 continue; 538 switch (ap[1]) { 539 case 'a': 540 /* 541 * The original join allowed "-a", which meant the 542 * same as -a1 plus -a2. POSIX 1003.2, Draft 11.2 543 * only specifies this as "-a 1" and "a -2", so we 544 * have to use another option flag, one that is 545 * unlikely to ever be used or accidentally entered 546 * on the command line. (Well, we could reallocate 547 * the argv array, but that hardly seems worthwhile.) 548 */ 549 if (ap[2] == '\0' && (argv[1] == NULL || 550 (strcmp(argv[1], "1") != 0 && 551 strcmp(argv[1], "2") != 0))) { 552 ap[1] = '\01'; 553 warnx("-a option used without an argument; " 554 "reverting to historical behavior"); 555 } 556 break; 557 case 'j': 558 /* 559 * The original join allowed "-j[12] arg" and "-j arg". 560 * Convert the former to "-[12] arg". Don't convert 561 * the latter since getopt(3) can handle it. 562 */ 563 switch(ap[2]) { 564 case '1': 565 if (ap[3] != '\0') 566 goto jbad; 567 ap[1] = '1'; 568 ap[2] = '\0'; 569 break; 570 case '2': 571 if (ap[3] != '\0') 572 goto jbad; 573 ap[1] = '2'; 574 ap[2] = '\0'; 575 break; 576 case '\0': 577 break; 578 default: 579 jbad: errx(1, "illegal option -- %s", ap); 580 usage(); 581 } 582 break; 583 case 'o': 584 /* 585 * The original join allowed "-o arg arg". 586 * Convert to "-o arg -o arg". 587 */ 588 if (ap[2] != '\0') 589 break; 590 for (p = argv + 2; *p; ++p) { 591 if (p[0][0] == '0' || (p[0][0] != '1' && 592 p[0][0] != '2' || p[0][1] != '.')) 593 break; 594 len = strlen(*p); 595 if (len - 2 != strspn(*p + 2, "0123456789")) 596 break; 597 if ((t = malloc(len + 3)) == NULL) 598 err(1, NULL); 599 t[0] = '-'; 600 t[1] = 'o'; 601 memmove(t + 2, *p, len + 1); 602 *p = t; 603 } 604 argv = p - 1; 605 break; 606 } 607 } 608 } 609 610 void 611 usage() 612 { 613 (void)fprintf(stderr, "%s %s\n%s\n", 614 "usage: join [-a fileno | -v fileno ] [-e string] [-1 field]", 615 "[-2 field]", 616 " [-o list] [-t char] file1 file2"); 617 exit(1); 618 } 619