1 /* $NetBSD: join.c,v 1.21 2002/07/24 17:03:00 jonb Exp $ */ 2 3 /*- 4 * Copyright (c) 1991 The Regents of the University of California. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Steve Hayman of Indiana University, Michiro Hikida and David 9 * Goodenough. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. All advertising materials mentioning features or use of this software 20 * must display the following acknowledgement: 21 * This product includes software developed by the University of 22 * California, Berkeley and its contributors. 23 * 4. Neither the name of the University nor the names of its contributors 24 * may be used to endorse or promote products derived from this software 25 * without specific prior written permission. 26 * 27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 37 * SUCH DAMAGE. 38 */ 39 40 #include <sys/cdefs.h> 41 #ifndef lint 42 __COPYRIGHT( 43 "@(#) Copyright (c) 1991 The Regents of the University of California.\n\ 44 All rights reserved.\n"); 45 #endif /* not lint */ 46 47 #ifndef lint 48 #if 0 49 static char sccsid[] = "from: @(#)join.c 5.1 (Berkeley) 11/18/91"; 50 #else 51 __RCSID("$NetBSD: join.c,v 1.21 2002/07/24 17:03:00 jonb Exp $"); 52 #endif 53 #endif /* not lint */ 54 55 #include <sys/types.h> 56 #include <ctype.h> 57 #include <err.h> 58 #include <errno.h> 59 #include <stdio.h> 60 #include <stdlib.h> 61 #include <unistd.h> 62 #include <string.h> 63 #include <unistd.h> 64 65 /* 66 * There's a structure per input file which encapsulates the state of the 67 * file. We repeatedly read lines from each file until we've read in all 68 * the consecutive lines from the file with a common join field. Then we 69 * compare the set of lines with an equivalent set from the other file. 70 */ 71 typedef struct { 72 char *line; /* line */ 73 u_long linealloc; /* line allocated count */ 74 char **fields; /* line field(s) */ 75 u_long fieldcnt; /* line field(s) count */ 76 u_long fieldalloc; /* line field(s) allocated count */ 77 } LINE; 78 79 LINE noline = {"", 0, 0, 0, 0}; /* arg for outfield if no line to output */ 80 81 typedef struct { 82 FILE *fp; /* file descriptor */ 83 u_long joinf; /* join field (-1, -2, -j) */ 84 int unpair; /* output unpairable lines (-a) */ 85 int number; /* 1 for file 1, 2 for file 2 */ 86 87 LINE *set; /* set of lines with same field */ 88 u_long pushback; /* line on the stack */ 89 u_long setcnt; /* set count */ 90 u_long setalloc; /* set allocated count */ 91 } INPUT; 92 INPUT input1 = { NULL, 0, 0, 1, NULL, -1, 0, 0, }, 93 input2 = { NULL, 0, 0, 2, NULL, -1, 0, 0, }; 94 95 typedef struct { 96 u_long fileno; /* file number */ 97 u_long fieldno; /* field number */ 98 } OLIST; 99 OLIST *olist; /* output field list */ 100 u_long olistcnt; /* output field list count */ 101 u_long olistalloc; /* output field allocated count */ 102 103 int joinout = 1; /* show lines with matched join fields (-v) */ 104 int needsep; /* need separator character */ 105 int spans = 1; /* span multiple delimiters (-t) */ 106 char *empty; /* empty field replacement string (-e) */ 107 char *tabchar = " \t"; /* delimiter characters (-t) */ 108 109 int cmp __P((LINE *, u_long, LINE *, u_long)); 110 void enomem __P((void)); 111 void fieldarg __P((char *)); 112 void joinlines __P((INPUT *, INPUT *)); 113 int main __P((int, char **)); 114 void obsolete __P((char **)); 115 void outfield __P((LINE *, u_long)); 116 void outoneline __P((INPUT *, LINE *)); 117 void outtwoline __P((INPUT *, LINE *, INPUT *, LINE *)); 118 void slurp __P((INPUT *)); 119 void usage __P((void)); 120 121 int 122 main(argc, argv) 123 int argc; 124 char *argv[]; 125 { 126 INPUT *F1, *F2; 127 int aflag, ch, cval, vflag; 128 char *end; 129 130 F1 = &input1; 131 F2 = &input2; 132 133 aflag = vflag = 0; 134 obsolete(argv); 135 while ((ch = getopt(argc, argv, "\01a:e:j:1:2:o:t:v:")) != -1) { 136 switch (ch) { 137 case '\01': 138 aflag = 1; 139 F1->unpair = F2->unpair = 1; 140 break; 141 case '1': 142 if ((F1->joinf = strtol(optarg, &end, 10)) < 1) { 143 warnx("-1 option field number less than 1"); 144 usage(); 145 } 146 if (*end) { 147 warnx("illegal field number -- %s", optarg); 148 usage(); 149 } 150 --F1->joinf; 151 break; 152 case '2': 153 if ((F2->joinf = strtol(optarg, &end, 10)) < 1) { 154 warnx("-2 option field number less than 1"); 155 usage(); 156 } 157 if (*end) { 158 warnx("illegal field number -- %s", optarg); 159 usage(); 160 } 161 --F2->joinf; 162 break; 163 case 'a': 164 aflag = 1; 165 switch(strtol(optarg, &end, 10)) { 166 case 1: 167 F1->unpair = 1; 168 break; 169 case 2: 170 F2->unpair = 1; 171 break; 172 default: 173 warnx("-a option file number not 1 or 2"); 174 usage(); 175 break; 176 } 177 if (*end) { 178 warnx("illegal file number -- %s", optarg); 179 usage(); 180 } 181 break; 182 case 'e': 183 empty = optarg; 184 break; 185 case 'j': 186 if ((F1->joinf = F2->joinf = 187 strtol(optarg, &end, 10)) < 1) { 188 warnx("-j option field number less than 1"); 189 usage(); 190 } 191 if (*end) { 192 warnx("illegal field number -- %s", optarg); 193 usage(); 194 } 195 --F1->joinf; 196 --F2->joinf; 197 break; 198 case 'o': 199 fieldarg(optarg); 200 break; 201 case 't': 202 spans = 0; 203 if (strlen(tabchar = optarg) != 1) { 204 warnx("illegal tab character specification"); 205 usage(); 206 } 207 break; 208 case 'v': 209 vflag = 1; 210 joinout = 0; 211 switch(strtol(optarg, &end, 10)) { 212 case 1: 213 F1->unpair = 1; 214 break; 215 case 2: 216 F2->unpair = 1; 217 break; 218 default: 219 warnx("-v option file number not 1 or 2"); 220 usage(); 221 break; 222 } 223 if (*end) { 224 warnx("illegal file number -- %s", optarg); 225 usage(); 226 } 227 break; 228 case '?': 229 default: 230 usage(); 231 } 232 } 233 argc -= optind; 234 argv += optind; 235 236 if (aflag && vflag) 237 errx(1, "-a and -v options mutually exclusive"); 238 239 if (argc != 2) 240 usage(); 241 242 /* Open the files; "-" means stdin. */ 243 if (!strcmp(*argv, "-")) 244 F1->fp = stdin; 245 else if ((F1->fp = fopen(*argv, "r")) == NULL) 246 err(1, "%s", *argv); 247 ++argv; 248 if (!strcmp(*argv, "-")) 249 F2->fp = stdin; 250 else if ((F2->fp = fopen(*argv, "r")) == NULL) 251 err(1, "%s", *argv); 252 if (F1->fp == stdin && F2->fp == stdin) 253 errx(1, "only one input file may be stdin"); 254 255 slurp(F1); 256 slurp(F2); 257 while (F1->setcnt && F2->setcnt) { 258 cval = cmp(F1->set, F1->joinf, F2->set, F2->joinf); 259 if (cval == 0) { 260 /* Oh joy, oh rapture, oh beauty divine! */ 261 if (joinout) 262 joinlines(F1, F2); 263 slurp(F1); 264 slurp(F2); 265 } else if (cval < 0) { 266 /* File 1 takes the lead... */ 267 if (F1->unpair) 268 joinlines(F1, NULL); 269 slurp(F1); 270 } else { 271 /* File 2 takes the lead... */ 272 if (F2->unpair) 273 joinlines(F2, NULL); 274 slurp(F2); 275 } 276 } 277 278 /* 279 * Now that one of the files is used up, optionally output any 280 * remaining lines from the other file. 281 */ 282 if (F1->unpair) 283 while (F1->setcnt) { 284 joinlines(F1, NULL); 285 slurp(F1); 286 } 287 if (F2->unpair) 288 while (F2->setcnt) { 289 joinlines(F2, NULL); 290 slurp(F2); 291 } 292 exit(0); 293 } 294 295 void 296 slurp(F) 297 INPUT *F; 298 { 299 LINE *lp; 300 LINE tmp; 301 size_t len; 302 int cnt; 303 char *bp, *fieldp; 304 305 /* 306 * Read all of the lines from an input file that have the same 307 * join field. 308 */ 309 for (F->setcnt = 0;; ++F->setcnt) { 310 /* 311 * If we're out of space to hold line structures, allocate 312 * more. Initialize the structure so that we know that this 313 * is new space. 314 */ 315 if (F->setcnt == F->setalloc) { 316 cnt = F->setalloc; 317 if (F->setalloc == 0) 318 F->setalloc = 64; 319 else 320 F->setalloc <<= 1; 321 if ((F->set = realloc(F->set, 322 F->setalloc * sizeof(LINE))) == NULL) 323 enomem(); 324 memset(F->set + cnt, 0, 325 (F->setalloc - cnt) * sizeof(LINE)); 326 } 327 328 /* 329 * Get any pushed back line, else get the next line. Allocate 330 * space as necessary. If taking the line from the stack swap 331 * the two structures so that we don't lose the allocated space. 332 * This could be avoided by doing another level of indirection, 333 * but it's probably okay as is. 334 */ 335 lp = &F->set[F->setcnt]; 336 if (F->pushback != -1) { 337 tmp = F->set[F->setcnt]; 338 F->set[F->setcnt] = F->set[F->pushback]; 339 F->set[F->pushback] = tmp; 340 F->pushback = -1; 341 continue; 342 } 343 if ((bp = fgetln(F->fp, &len)) == NULL) 344 return; 345 if (lp->linealloc <= len + 1) { 346 if (lp->linealloc == 0) 347 lp->linealloc = 128; 348 while (lp->linealloc <= len + 1) 349 lp->linealloc <<= 1; 350 if ((lp->line = realloc(lp->line, 351 lp->linealloc * sizeof(char))) == NULL) 352 enomem(); 353 } 354 memmove(lp->line, bp, len); 355 356 /* Replace trailing newline, if it exists. */ 357 if (bp[len - 1] == '\n') 358 lp->line[len - 1] = '\0'; 359 else 360 lp->line[len] = '\0'; 361 bp = lp->line; 362 363 /* Split the line into fields, allocate space as necessary. */ 364 lp->fieldcnt = 0; 365 while ((fieldp = strsep(&bp, tabchar)) != NULL) { 366 if (spans && *fieldp == '\0') 367 continue; 368 if (lp->fieldcnt == lp->fieldalloc) { 369 if (lp->fieldalloc == 0) 370 lp->fieldalloc = 16; 371 else 372 lp->fieldalloc <<= 1; 373 if ((lp->fields = realloc(lp->fields, 374 lp->fieldalloc * sizeof(char *))) == NULL) 375 enomem(); 376 } 377 lp->fields[lp->fieldcnt++] = fieldp; 378 } 379 380 /* See if the join field value has changed. */ 381 if (F->setcnt && cmp(lp, F->joinf, lp - 1, F->joinf)) { 382 F->pushback = F->setcnt; 383 break; 384 } 385 } 386 } 387 388 int 389 cmp(lp1, fieldno1, lp2, fieldno2) 390 LINE *lp1, *lp2; 391 u_long fieldno1, fieldno2; 392 { 393 394 if (lp1->fieldcnt <= fieldno1) 395 return (lp2->fieldcnt <= fieldno2 ? 0 : 1); 396 if (lp2->fieldcnt <= fieldno2) 397 return (-1); 398 return (strcmp(lp1->fields[fieldno1], lp2->fields[fieldno2])); 399 } 400 401 void 402 joinlines(F1, F2) 403 INPUT *F1, *F2; 404 { 405 int cnt1, cnt2; 406 407 /* 408 * Output the results of a join comparison. The output may be from 409 * either file 1 or file 2 (in which case the first argument is the 410 * file from which to output) or from both. 411 */ 412 if (F2 == NULL) { 413 for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1) 414 outoneline(F1, &F1->set[cnt1]); 415 return; 416 } 417 for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1) 418 for (cnt2 = 0; cnt2 < F2->setcnt; ++cnt2) 419 outtwoline(F1, &F1->set[cnt1], F2, &F2->set[cnt2]); 420 } 421 422 void 423 outoneline(F, lp) 424 INPUT *F; 425 LINE *lp; 426 { 427 int cnt; 428 429 /* 430 * Output a single line from one of the files, according to the 431 * join rules. This happens when we are writing unmatched single 432 * lines. Output empty fields in the right places. 433 */ 434 if (olist) 435 for (cnt = 0; cnt < olistcnt; ++cnt) { 436 if (olist[cnt].fileno == F->number) 437 outfield(lp, olist[cnt].fieldno); 438 else 439 outfield(&noline, 1); 440 } 441 else 442 for (cnt = 0; cnt < lp->fieldcnt; ++cnt) 443 outfield(lp, cnt); 444 (void)printf("\n"); 445 if (ferror(stdout)) 446 err(1, "stdout"); 447 needsep = 0; 448 } 449 450 void 451 outtwoline(F1, lp1, F2, lp2) 452 INPUT *F1, *F2; 453 LINE *lp1, *lp2; 454 { 455 int cnt; 456 457 /* Output a pair of lines according to the join list (if any). */ 458 if (olist) { 459 for (cnt = 0; cnt < olistcnt; ++cnt) 460 if (olist[cnt].fileno == 1) 461 outfield(lp1, olist[cnt].fieldno); 462 else /* if (olist[cnt].fileno == 2) */ 463 outfield(lp2, olist[cnt].fieldno); 464 } else { 465 /* 466 * Output the join field, then the remaining fields from F1 467 * and F2. 468 */ 469 outfield(lp1, F1->joinf); 470 for (cnt = 0; cnt < lp1->fieldcnt; ++cnt) 471 if (F1->joinf != cnt) 472 outfield(lp1, cnt); 473 for (cnt = 0; cnt < lp2->fieldcnt; ++cnt) 474 if (F2->joinf != cnt) 475 outfield(lp2, cnt); 476 } 477 (void)printf("\n"); 478 if (ferror(stdout)) 479 err(1, "stdout"); 480 needsep = 0; 481 } 482 483 void 484 outfield(lp, fieldno) 485 LINE *lp; 486 u_long fieldno; 487 { 488 if (needsep++) 489 (void)printf("%c", *tabchar); 490 if (!ferror(stdout)) { 491 if (lp->fieldcnt <= fieldno) { 492 if (empty != NULL) 493 (void)printf("%s", empty); 494 } else { 495 if (*lp->fields[fieldno] == '\0') 496 return; 497 (void)printf("%s", lp->fields[fieldno]); 498 } 499 } 500 if (ferror(stdout)) 501 err(1, "stdout"); 502 } 503 504 /* 505 * Convert an output list argument "2.1, 1.3, 2.4" into an array of output 506 * fields. 507 */ 508 void 509 fieldarg(option) 510 char *option; 511 { 512 u_long fieldno; 513 char *end, *token; 514 515 while ((token = strsep(&option, ", \t")) != NULL) { 516 if (*token == '\0') 517 continue; 518 if ((token[0] != '1' && token[0] != '2') || token[1] != '.') 519 errx(1, "malformed -o option field"); 520 fieldno = strtol(token + 2, &end, 10); 521 if (*end) 522 errx(1, "malformed -o option field"); 523 if (fieldno == 0) 524 errx(1, "field numbers are 1 based"); 525 if (olistcnt == olistalloc) { 526 olistalloc += 50; 527 if ((olist = realloc(olist, 528 olistalloc * sizeof(OLIST))) == NULL) 529 enomem(); 530 } 531 olist[olistcnt].fileno = token[0] - '0'; 532 olist[olistcnt].fieldno = fieldno - 1; 533 ++olistcnt; 534 } 535 } 536 537 void 538 obsolete(argv) 539 char **argv; 540 { 541 int len; 542 char **p, *ap, *t; 543 544 while ((ap = *++argv) != NULL) { 545 /* Return if "--". */ 546 if (ap[0] == '-' && ap[1] == '-') 547 return; 548 switch (ap[1]) { 549 case 'a': 550 /* 551 * The original join allowed "-a", which meant the 552 * same as -a1 plus -a2. POSIX 1003.2, Draft 11.2 553 * only specifies this as "-a 1" and "a -2", so we 554 * have to use another option flag, one that is 555 * unlikely to ever be used or accidentally entered 556 * on the command line. (Well, we could reallocate 557 * the argv array, but that hardly seems worthwhile.) 558 */ 559 if (ap[2] == '\0') 560 ap[1] = '\01'; 561 break; 562 case 'j': 563 /* 564 * The original join allowed "-j[12] arg" and "-j arg". 565 * Convert the former to "-[12] arg". Don't convert 566 * the latter since getopt(3) can handle it. 567 */ 568 switch(ap[2]) { 569 case '1': 570 if (ap[3] != '\0') 571 goto jbad; 572 ap[1] = '1'; 573 ap[2] = '\0'; 574 break; 575 case '2': 576 if (ap[3] != '\0') 577 goto jbad; 578 ap[1] = '2'; 579 ap[2] = '\0'; 580 break; 581 case '\0': 582 break; 583 default: 584 jbad: errx(1, "illegal option -- %s", ap); 585 usage(); 586 } 587 break; 588 case 'o': 589 /* 590 * The original join allowed "-o arg arg". Convert to 591 * "-o arg -o arg". 592 */ 593 if (ap[2] != '\0') 594 break; 595 for (p = argv + 2; *p; ++p) { 596 if ((p[0][0] != '1' && p[0][0] != '2') || 597 p[0][1] != '.') 598 break; 599 len = strlen(*p); 600 if (len - 2 != strspn(*p + 2, "0123456789")) 601 break; 602 if ((t = malloc(len + 3)) == NULL) 603 enomem(); 604 t[0] = '-'; 605 t[1] = 'o'; 606 memmove(t + 2, *p, len + 1); 607 *p = t; 608 } 609 argv = p - 1; 610 break; 611 } 612 } 613 } 614 615 void 616 enomem() 617 { 618 errx(1, "no memory"); 619 } 620 621 void 622 usage() 623 { 624 (void)fprintf(stderr, "%s%s\n", 625 "usage: join [-a fileno | -v fileno ] [-e string] [-1 field] ", 626 "[-2 field]\n [-o list] [-t char] file1 file2"); 627 exit(1); 628 } 629