1 /*- 2 * Copyright (c) 1991, 1993, 1994 3 * The Regents of the University of California. All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * Steve Hayman of the Computer Science Department, Indiana University, 7 * Michiro Hikida and David Goodenough. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 * 33 * @(#) Copyright (c) 1991, 1993, 1994 The Regents of the University of California. All rights reserved. 34 * @(#)join.c 8.6 (Berkeley) 5/4/95 35 * $FreeBSD: src/usr.bin/join/join.c,v 1.10.2.1 2002/06/18 05:14:49 jmallett Exp $ 36 * $DragonFly: src/usr.bin/join/join.c,v 1.6 2005/01/16 21:40:42 cpressey Exp $ 37 */ 38 39 #include <sys/param.h> 40 41 #include <err.h> 42 #include <errno.h> 43 #include <locale.h> 44 #include <stdio.h> 45 #include <stdlib.h> 46 #include <string.h> 47 #include <unistd.h> 48 49 /* 50 * There's a structure per input file which encapsulates the state of the 51 * file. We repeatedly read lines from each file until we've read in all 52 * the consecutive lines from the file with a common join field. Then we 53 * compare the set of lines with an equivalent set from the other file. 54 */ 55 typedef struct { 56 char *line; /* line */ 57 u_long linealloc; /* line allocated count */ 58 char **fields; /* line field(s) */ 59 u_long fieldcnt; /* line field(s) count */ 60 u_long fieldalloc; /* line field(s) allocated count */ 61 } LINE; 62 63 typedef struct { 64 FILE *fp; /* file descriptor */ 65 u_long joinf; /* join field (-1, -2, -j) */ 66 int unpair; /* output unpairable lines (-a) */ 67 int number; /* 1 for file 1, 2 for file 2 */ 68 69 LINE *set; /* set of lines with same field */ 70 int pushbool; /* if pushback is set */ 71 u_long pushback; /* line on the stack */ 72 u_long setcnt; /* set count */ 73 u_long setalloc; /* set allocated count */ 74 } INPUT; 75 INPUT input1 = { NULL, 0, 0, 1, NULL, 0, 0, 0, 0 }, 76 input2 = { NULL, 0, 0, 2, NULL, 0, 0, 0, 0 }; 77 78 typedef struct { 79 u_long filenum; /* file number */ 80 u_long fieldno; /* field number */ 81 } OLIST; 82 OLIST *olist; /* output field list */ 83 u_long olistcnt; /* output field list count */ 84 u_long olistalloc; /* output field allocated count */ 85 86 int joinout = 1; /* show lines with matched join fields (-v) */ 87 int needsep; /* need separator character */ 88 int spans = 1; /* span multiple delimiters (-t) */ 89 char *empty; /* empty field replacement string (-e) */ 90 static char default_tabchar[] = " \t"; 91 char *tabchar = default_tabchar;/* delimiter characters (-t) */ 92 93 int cmp(LINE *, u_long, LINE *, u_long); 94 void fieldarg(char *); 95 void joinlines(INPUT *, INPUT *); 96 void obsolete(char **); 97 void outfield(LINE *, u_long, int); 98 void outoneline(INPUT *, LINE *); 99 void outtwoline(INPUT *, LINE *, INPUT *, LINE *); 100 void slurp(INPUT *); 101 void usage(void); 102 103 int 104 main(int argc, char **argv) 105 { 106 INPUT *F1, *F2; 107 int aflag, ch, cval, vflag; 108 char *end; 109 110 setlocale(LC_ALL, ""); 111 112 F1 = &input1; 113 F2 = &input2; 114 115 aflag = vflag = 0; 116 obsolete(argv); 117 while ((ch = getopt(argc, argv, "\01a:e:j:1:2:o:t:v:")) != -1) { 118 switch (ch) { 119 case '\01': /* See comment in obsolete(). */ 120 aflag = 1; 121 F1->unpair = F2->unpair = 1; 122 break; 123 case '1': 124 if ((F1->joinf = strtol(optarg, &end, 10)) < 1) 125 errx(1, "-1 option field number less than 1"); 126 if (*end) 127 errx(1, "illegal field number -- %s", optarg); 128 --F1->joinf; 129 break; 130 case '2': 131 if ((F2->joinf = strtol(optarg, &end, 10)) < 1) 132 errx(1, "-2 option field number less than 1"); 133 if (*end) 134 errx(1, "illegal field number -- %s", optarg); 135 --F2->joinf; 136 break; 137 case 'a': 138 aflag = 1; 139 switch(strtol(optarg, &end, 10)) { 140 case 1: 141 F1->unpair = 1; 142 break; 143 case 2: 144 F2->unpair = 1; 145 break; 146 default: 147 errx(1, "-a option file number not 1 or 2"); 148 break; 149 } 150 if (*end) 151 errx(1, "illegal file number -- %s", optarg); 152 break; 153 case 'e': 154 empty = optarg; 155 break; 156 case 'j': 157 if ((F1->joinf = F2->joinf = 158 strtol(optarg, &end, 10)) < 1) 159 errx(1, "-j option field number less than 1"); 160 if (*end) 161 errx(1, "illegal field number -- %s", optarg); 162 --F1->joinf; 163 --F2->joinf; 164 break; 165 case 'o': 166 fieldarg(optarg); 167 break; 168 case 't': 169 spans = 0; 170 if (strlen(tabchar = optarg) != 1) 171 errx(1, "illegal tab character specification"); 172 break; 173 case 'v': 174 vflag = 1; 175 joinout = 0; 176 switch (strtol(optarg, &end, 10)) { 177 case 1: 178 F1->unpair = 1; 179 break; 180 case 2: 181 F2->unpair = 1; 182 break; 183 default: 184 errx(1, "-v option file number not 1 or 2"); 185 break; 186 } 187 if (*end) 188 errx(1, "illegal file number -- %s", optarg); 189 break; 190 case '?': 191 default: 192 usage(); 193 } 194 } 195 argc -= optind; 196 argv += optind; 197 198 if (aflag && vflag) 199 errx(1, "the -a and -v options are mutually exclusive"); 200 201 if (argc != 2) 202 usage(); 203 204 /* Open the files; "-" means stdin. */ 205 if (!strcmp(*argv, "-")) 206 F1->fp = stdin; 207 else if ((F1->fp = fopen(*argv, "r")) == NULL) 208 err(1, "%s", *argv); 209 ++argv; 210 if (!strcmp(*argv, "-")) 211 F2->fp = stdin; 212 else if ((F2->fp = fopen(*argv, "r")) == NULL) 213 err(1, "%s", *argv); 214 if (F1->fp == stdin && F2->fp == stdin) 215 errx(1, "only one input file may be stdin"); 216 217 slurp(F1); 218 slurp(F2); 219 while (F1->setcnt && F2->setcnt) { 220 cval = cmp(F1->set, F1->joinf, F2->set, F2->joinf); 221 if (cval == 0) { 222 /* Oh joy, oh rapture, oh beauty divine! */ 223 if (joinout) 224 joinlines(F1, F2); 225 slurp(F1); 226 slurp(F2); 227 } else if (cval < 0) { 228 /* File 1 takes the lead... */ 229 if (F1->unpair) 230 joinlines(F1, NULL); 231 slurp(F1); 232 } else { 233 /* File 2 takes the lead... */ 234 if (F2->unpair) 235 joinlines(F2, NULL); 236 slurp(F2); 237 } 238 } 239 240 /* 241 * Now that one of the files is used up, optionally output any 242 * remaining lines from the other file. 243 */ 244 if (F1->unpair) 245 while (F1->setcnt) { 246 joinlines(F1, NULL); 247 slurp(F1); 248 } 249 if (F2->unpair) 250 while (F2->setcnt) { 251 joinlines(F2, NULL); 252 slurp(F2); 253 } 254 exit(0); 255 } 256 257 void 258 slurp(INPUT *F) 259 { 260 LINE *lp, *lastlp, tmp; 261 size_t len; 262 int cnt; 263 char *bp, *fieldp; 264 265 /* 266 * Read all of the lines from an input file that have the same 267 * join field. 268 */ 269 F->setcnt = 0; 270 for (lastlp = NULL;; ++F->setcnt) { 271 /* 272 * If we're out of space to hold line structures, allocate 273 * more. Initialize the structure so that we know that this 274 * is new space. 275 */ 276 if (F->setcnt == F->setalloc) { 277 cnt = F->setalloc; 278 F->setalloc += 50; 279 if ((F->set = realloc(F->set, 280 F->setalloc * sizeof(LINE))) == NULL) 281 err(1, NULL); 282 memset(F->set + cnt, 0, 50 * sizeof(LINE)); 283 284 /* re-set lastlp in case it moved */ 285 if (lastlp != NULL) 286 lastlp = &F->set[F->setcnt - 1]; 287 } 288 289 /* 290 * Get any pushed back line, else get the next line. Allocate 291 * space as necessary. If taking the line from the stack swap 292 * the two structures so that we don't lose space allocated to 293 * either structure. This could be avoided by doing another 294 * level of indirection, but it's probably okay as is. 295 */ 296 lp = &F->set[F->setcnt]; 297 if (F->setcnt) 298 lastlp = &F->set[F->setcnt - 1]; 299 if (F->pushbool) { 300 tmp = F->set[F->setcnt]; 301 F->set[F->setcnt] = F->set[F->pushback]; 302 F->set[F->pushback] = tmp; 303 F->pushbool = 0; 304 continue; 305 } 306 if ((bp = fgetln(F->fp, &len)) == NULL) 307 return; 308 if (lp->linealloc <= len + 1) { 309 lp->linealloc += MAX(100, len + 1 - lp->linealloc); 310 if ((lp->line = 311 realloc(lp->line, lp->linealloc)) == NULL) 312 err(1, NULL); 313 } 314 memmove(lp->line, bp, len); 315 316 /* Replace trailing newline, if it exists. */ 317 if (bp[len - 1] == '\n') 318 lp->line[len - 1] = '\0'; 319 else 320 lp->line[len] = '\0'; 321 bp = lp->line; 322 323 /* Split the line into fields, allocate space as necessary. */ 324 lp->fieldcnt = 0; 325 while ((fieldp = strsep(&bp, tabchar)) != NULL) { 326 if (spans && *fieldp == '\0') 327 continue; 328 if (lp->fieldcnt == lp->fieldalloc) { 329 lp->fieldalloc += 50; 330 if ((lp->fields = realloc(lp->fields, 331 lp->fieldalloc * sizeof(char *))) == NULL) 332 err(1, NULL); 333 } 334 lp->fields[lp->fieldcnt++] = fieldp; 335 } 336 337 /* See if the join field value has changed. */ 338 if (lastlp != NULL && cmp(lp, F->joinf, lastlp, F->joinf)) { 339 F->pushbool = 1; 340 F->pushback = F->setcnt; 341 break; 342 } 343 } 344 } 345 346 int 347 cmp(LINE *lp1, u_long fieldno1, LINE *lp2, u_long fieldno2) 348 { 349 if (lp1->fieldcnt <= fieldno1) 350 return (lp2->fieldcnt <= fieldno2 ? 0 : 1); 351 if (lp2->fieldcnt <= fieldno2) 352 return (-1); 353 return (strcoll(lp1->fields[fieldno1], lp2->fields[fieldno2])); 354 } 355 356 void 357 joinlines(INPUT *F1, INPUT *F2) 358 { 359 unsigned int cnt1, cnt2; 360 361 /* 362 * Output the results of a join comparison. The output may be from 363 * either file 1 or file 2 (in which case the first argument is the 364 * file from which to output) or from both. 365 */ 366 if (F2 == NULL) { 367 for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1) 368 outoneline(F1, &F1->set[cnt1]); 369 return; 370 } 371 for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1) 372 for (cnt2 = 0; cnt2 < F2->setcnt; ++cnt2) 373 outtwoline(F1, &F1->set[cnt1], F2, &F2->set[cnt2]); 374 } 375 376 void 377 outoneline(INPUT *F, LINE *lp) 378 { 379 unsigned int cnt; 380 381 /* 382 * Output a single line from one of the files, according to the 383 * join rules. This happens when we are writing unmatched single 384 * lines. Output empty fields in the right places. 385 */ 386 if (olist) 387 for (cnt = 0; cnt < olistcnt; ++cnt) { 388 if (olist[cnt].filenum == (unsigned)F->number) 389 outfield(lp, olist[cnt].fieldno, 0); 390 else if (olist[cnt].filenum == 0) 391 outfield(lp, F->joinf, 0); 392 else 393 outfield(lp, 0, 1); 394 } 395 else 396 for (cnt = 0; cnt < lp->fieldcnt; ++cnt) 397 outfield(lp, cnt, 0); 398 (void)printf("\n"); 399 if (ferror(stdout)) 400 err(1, "stdout"); 401 needsep = 0; 402 } 403 404 void 405 outtwoline(INPUT *F1, LINE *lp1, INPUT *F2, LINE *lp2) 406 { 407 unsigned int cnt; 408 409 /* Output a pair of lines according to the join list (if any). */ 410 if (olist) 411 for (cnt = 0; cnt < olistcnt; ++cnt) 412 if (olist[cnt].filenum == 0) { 413 if (lp1->fieldcnt >= F1->joinf) 414 outfield(lp1, F1->joinf, 0); 415 else 416 outfield(lp2, F2->joinf, 0); 417 } else if (olist[cnt].filenum == 1) 418 outfield(lp1, olist[cnt].fieldno, 0); 419 else /* if (olist[cnt].filenum == 2) */ 420 outfield(lp2, olist[cnt].fieldno, 0); 421 else { 422 /* 423 * Output the join field, then the remaining fields from F1 424 * and F2. 425 */ 426 outfield(lp1, F1->joinf, 0); 427 for (cnt = 0; cnt < lp1->fieldcnt; ++cnt) 428 if (F1->joinf != cnt) 429 outfield(lp1, cnt, 0); 430 for (cnt = 0; cnt < lp2->fieldcnt; ++cnt) 431 if (F2->joinf != cnt) 432 outfield(lp2, cnt, 0); 433 } 434 (void)printf("\n"); 435 if (ferror(stdout)) 436 err(1, "stdout"); 437 needsep = 0; 438 } 439 440 void 441 outfield(LINE *lp, u_long fieldno, int out_empty) 442 { 443 if (needsep++) 444 (void)printf("%c", *tabchar); 445 if (!ferror(stdout)) { 446 if (lp->fieldcnt <= fieldno || out_empty) { 447 if (empty != NULL) 448 (void)printf("%s", empty); 449 } else { 450 if (*lp->fields[fieldno] == '\0') 451 return; 452 (void)printf("%s", lp->fields[fieldno]); 453 } 454 } 455 if (ferror(stdout)) 456 err(1, "stdout"); 457 } 458 459 /* 460 * Convert an output list argument "2.1, 1.3, 2.4" into an array of output 461 * fields. 462 */ 463 void 464 fieldarg(char *option) 465 { 466 u_long fieldno, filenum; 467 char *end, *token; 468 469 while ((token = strsep(&option, ", \t")) != NULL) { 470 if (*token == '\0') 471 continue; 472 if (token[0] == '0') 473 filenum = fieldno = 0; 474 else if ((token[0] == '1' || token[0] == '2') && 475 token[1] == '.') { 476 filenum = token[0] - '0'; 477 fieldno = strtol(token + 2, &end, 10); 478 if (*end) 479 errx(1, "malformed -o option field"); 480 if (fieldno == 0) 481 errx(1, "field numbers are 1 based"); 482 --fieldno; 483 } else 484 errx(1, "malformed -o option field"); 485 if (olistcnt == olistalloc) { 486 olistalloc += 50; 487 if ((olist = realloc(olist, 488 olistalloc * sizeof(OLIST))) == NULL) 489 err(1, NULL); 490 } 491 olist[olistcnt].filenum = filenum; 492 olist[olistcnt].fieldno = fieldno; 493 ++olistcnt; 494 } 495 } 496 497 void 498 obsolete(char **argv) 499 { 500 unsigned int len; 501 char **p, *ap, *t; 502 503 while ((ap = *++argv) != NULL) { 504 /* Return if "--". */ 505 if (ap[0] == '-' && ap[1] == '-') 506 return; 507 /* skip if not an option */ 508 if (ap[0] != '-') 509 continue; 510 switch (ap[1]) { 511 case 'a': 512 /* 513 * The original join allowed "-a", which meant the 514 * same as -a1 plus -a2. POSIX 1003.2, Draft 11.2 515 * only specifies this as "-a 1" and "a -2", so we 516 * have to use another option flag, one that is 517 * unlikely to ever be used or accidentally entered 518 * on the command line. (Well, we could reallocate 519 * the argv array, but that hardly seems worthwhile.) 520 */ 521 if (ap[2] == '\0' && (argv[1] == NULL || 522 (strcmp(argv[1], "1") != 0 && 523 strcmp(argv[1], "2") != 0))) { 524 ap[1] = '\01'; 525 warnx("-a option used without an argument; " 526 "reverting to historical behavior"); 527 } 528 break; 529 case 'j': 530 /* 531 * The original join allowed "-j[12] arg" and "-j arg". 532 * Convert the former to "-[12] arg". Don't convert 533 * the latter since getopt(3) can handle it. 534 */ 535 switch(ap[2]) { 536 case '1': 537 if (ap[3] != '\0') 538 goto jbad; 539 ap[1] = '1'; 540 ap[2] = '\0'; 541 break; 542 case '2': 543 if (ap[3] != '\0') 544 goto jbad; 545 ap[1] = '2'; 546 ap[2] = '\0'; 547 break; 548 case '\0': 549 break; 550 default: 551 jbad: errx(1, "illegal option -- %s", ap); 552 usage(); 553 } 554 break; 555 case 'o': 556 /* 557 * The original join allowed "-o arg arg". 558 * Convert to "-o arg -o arg". 559 */ 560 if (ap[2] != '\0') 561 break; 562 for (p = argv + 2; *p; ++p) { 563 if ((p[0][0] != '1' && p[0][0] != '2') || 564 p[0][1] != '.') 565 break; 566 len = strlen(*p); 567 if (len - 2 != strspn(*p + 2, "0123456789")) 568 break; 569 if ((t = malloc(len + 3)) == NULL) 570 err(1, NULL); 571 t[0] = '-'; 572 t[1] = 'o'; 573 memmove(t + 2, *p, len + 1); 574 *p = t; 575 } 576 argv = p - 1; 577 break; 578 } 579 } 580 } 581 582 void 583 usage(void) 584 { 585 fprintf(stderr, "%s", 586 "usage: join [-a fileno | -v fileno ] [-e string] [-j fileno field]\n" 587 " [-1 field] [-2 field] [-o list] [-t char] file1 file2\n"); 588 exit(1); 589 } 590