1 /*- 2 * Copyright (c) 1991, 1993, 1994 3 * The Regents of the University of California. All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * Steve Hayman of the Computer Science Department, Indiana University, 7 * Michiro Hikida and David Goodenough. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 * 33 * @(#) Copyright (c) 1991, 1993, 1994 The Regents of the University of California. All rights reserved. 34 * @(#)join.c 8.6 (Berkeley) 5/4/95 35 * $FreeBSD: head/usr.bin/join/join.c 246319 2013-02-04 10:05:55Z andrew $ 36 */ 37 38 #include <sys/param.h> 39 40 #include <err.h> 41 #include <errno.h> 42 #include <limits.h> 43 #include <locale.h> 44 #include <stdio.h> 45 #include <stdlib.h> 46 #include <string.h> 47 #include <unistd.h> 48 #include <wchar.h> 49 50 /* 51 * There's a structure per input file which encapsulates the state of the 52 * file. We repeatedly read lines from each file until we've read in all 53 * the consecutive lines from the file with a common join field. Then we 54 * compare the set of lines with an equivalent set from the other file. 55 */ 56 typedef struct { 57 char *line; /* line */ 58 u_long linealloc; /* line allocated count */ 59 char **fields; /* line field(s) */ 60 u_long fieldcnt; /* line field(s) count */ 61 u_long fieldalloc; /* line field(s) allocated count */ 62 } LINE; 63 64 typedef struct { 65 FILE *fp; /* file descriptor */ 66 u_long joinf; /* join field (-1, -2, -j) */ 67 int unpair; /* output unpairable lines (-a) */ 68 u_long number; /* 1 for file 1, 2 for file 2 */ 69 70 LINE *set; /* set of lines with same field */ 71 int pushbool; /* if pushback is set */ 72 u_long pushback; /* line on the stack */ 73 u_long setcnt; /* set count */ 74 u_long setalloc; /* set allocated count */ 75 } INPUT; 76 static INPUT input1 = { NULL, 0, 0, 1, NULL, 0, 0, 0, 0 }, 77 input2 = { NULL, 0, 0, 2, NULL, 0, 0, 0, 0 }; 78 79 typedef struct { 80 u_long filenum; /* file number */ 81 u_long fieldno; /* field number */ 82 } OLIST; 83 static OLIST *olist; /* output field list */ 84 static u_long olistcnt; /* output field list count */ 85 static u_long olistalloc; /* output field allocated count */ 86 87 static int joinout = 1; /* show lines with matched join fields (-v) */ 88 static int needsep; /* need separator character */ 89 static int spans = 1; /* span multiple delimiters (-t) */ 90 static char *empty; /* empty field replacement string (-e) */ 91 static wchar_t default_tabchar[] = L" \t"; 92 static wchar_t *tabchar = default_tabchar; /* delimiter characters (-t) */ 93 94 static int cmp(LINE *, u_long, LINE *, u_long); 95 static void fieldarg(char *); 96 static void joinlines(INPUT *, INPUT *); 97 static int mbscoll(const char *, const char *); 98 static char *mbssep(char **, const wchar_t *); 99 static void obsolete(char **); 100 static void outfield(LINE *, u_long, int); 101 static void outoneline(INPUT *, LINE *); 102 static void outtwoline(INPUT *, LINE *, INPUT *, LINE *); 103 static void slurp(INPUT *); 104 static wchar_t *towcs(const char *); 105 static void usage(void); 106 107 int 108 main(int argc, char *argv[]) 109 { 110 INPUT *F1, *F2; 111 int aflag, ch, cval, vflag; 112 char *end; 113 114 setlocale(LC_ALL, ""); 115 116 F1 = &input1; 117 F2 = &input2; 118 119 aflag = vflag = 0; 120 obsolete(argv); 121 while ((ch = getopt(argc, argv, "\01a:e:j:1:2:o:t:v:")) != -1) { 122 switch (ch) { 123 case '\01': /* See comment in obsolete(). */ 124 aflag = 1; 125 F1->unpair = F2->unpair = 1; 126 break; 127 case '1': 128 if ((F1->joinf = strtol(optarg, &end, 10)) < 1) 129 errx(1, "-1 option field number less than 1"); 130 if (*end) 131 errx(1, "illegal field number -- %s", optarg); 132 --F1->joinf; 133 break; 134 case '2': 135 if ((F2->joinf = strtol(optarg, &end, 10)) < 1) 136 errx(1, "-2 option field number less than 1"); 137 if (*end) 138 errx(1, "illegal field number -- %s", optarg); 139 --F2->joinf; 140 break; 141 case 'a': 142 aflag = 1; 143 switch(strtol(optarg, &end, 10)) { 144 case 1: 145 F1->unpair = 1; 146 break; 147 case 2: 148 F2->unpair = 1; 149 break; 150 default: 151 errx(1, "-a option file number not 1 or 2"); 152 break; 153 } 154 if (*end) 155 errx(1, "illegal file number -- %s", optarg); 156 break; 157 case 'e': 158 empty = optarg; 159 break; 160 case 'j': 161 if ((F1->joinf = F2->joinf = 162 strtol(optarg, &end, 10)) < 1) 163 errx(1, "-j option field number less than 1"); 164 if (*end) 165 errx(1, "illegal field number -- %s", optarg); 166 --F1->joinf; 167 --F2->joinf; 168 break; 169 case 'o': 170 fieldarg(optarg); 171 break; 172 case 't': 173 spans = 0; 174 if (mbrtowc(&tabchar[0], optarg, MB_LEN_MAX, NULL) != 175 strlen(optarg)) 176 errx(1, "illegal tab character specification"); 177 tabchar[1] = L'\0'; 178 break; 179 case 'v': 180 vflag = 1; 181 joinout = 0; 182 switch (strtol(optarg, &end, 10)) { 183 case 1: 184 F1->unpair = 1; 185 break; 186 case 2: 187 F2->unpair = 1; 188 break; 189 default: 190 errx(1, "-v option file number not 1 or 2"); 191 break; 192 } 193 if (*end) 194 errx(1, "illegal file number -- %s", optarg); 195 break; 196 case '?': 197 default: 198 usage(); 199 } 200 } 201 argc -= optind; 202 argv += optind; 203 204 if (aflag && vflag) 205 errx(1, "the -a and -v options are mutually exclusive"); 206 207 if (argc != 2) 208 usage(); 209 210 /* Open the files; "-" means stdin. */ 211 if (!strcmp(*argv, "-")) 212 F1->fp = stdin; 213 else if ((F1->fp = fopen(*argv, "r")) == NULL) 214 err(1, "%s", *argv); 215 ++argv; 216 if (!strcmp(*argv, "-")) 217 F2->fp = stdin; 218 else if ((F2->fp = fopen(*argv, "r")) == NULL) 219 err(1, "%s", *argv); 220 if (F1->fp == stdin && F2->fp == stdin) 221 errx(1, "only one input file may be stdin"); 222 223 slurp(F1); 224 slurp(F2); 225 while (F1->setcnt && F2->setcnt) { 226 cval = cmp(F1->set, F1->joinf, F2->set, F2->joinf); 227 if (cval == 0) { 228 /* Oh joy, oh rapture, oh beauty divine! */ 229 if (joinout) 230 joinlines(F1, F2); 231 slurp(F1); 232 slurp(F2); 233 } else if (cval < 0) { 234 /* File 1 takes the lead... */ 235 if (F1->unpair) 236 joinlines(F1, NULL); 237 slurp(F1); 238 } else { 239 /* File 2 takes the lead... */ 240 if (F2->unpair) 241 joinlines(F2, NULL); 242 slurp(F2); 243 } 244 } 245 246 /* 247 * Now that one of the files is used up, optionally output any 248 * remaining lines from the other file. 249 */ 250 if (F1->unpair) 251 while (F1->setcnt) { 252 joinlines(F1, NULL); 253 slurp(F1); 254 } 255 if (F2->unpair) 256 while (F2->setcnt) { 257 joinlines(F2, NULL); 258 slurp(F2); 259 } 260 exit(0); 261 } 262 263 static void 264 slurp(INPUT *F) 265 { 266 LINE *lp, *lastlp, tmp; 267 size_t len; 268 int cnt; 269 char *bp, *fieldp; 270 271 /* 272 * Read all of the lines from an input file that have the same 273 * join field. 274 */ 275 F->setcnt = 0; 276 for (lastlp = NULL;; ++F->setcnt) { 277 /* 278 * If we're out of space to hold line structures, allocate 279 * more. Initialize the structure so that we know that this 280 * is new space. 281 */ 282 if (F->setcnt == F->setalloc) { 283 cnt = F->setalloc; 284 F->setalloc += 50; 285 if ((F->set = realloc(F->set, 286 F->setalloc * sizeof(LINE))) == NULL) 287 err(1, NULL); 288 memset(F->set + cnt, 0, 50 * sizeof(LINE)); 289 290 /* re-set lastlp in case it moved */ 291 if (lastlp != NULL) 292 lastlp = &F->set[F->setcnt - 1]; 293 } 294 295 /* 296 * Get any pushed back line, else get the next line. Allocate 297 * space as necessary. If taking the line from the stack swap 298 * the two structures so that we don't lose space allocated to 299 * either structure. This could be avoided by doing another 300 * level of indirection, but it's probably okay as is. 301 */ 302 lp = &F->set[F->setcnt]; 303 if (F->setcnt) 304 lastlp = &F->set[F->setcnt - 1]; 305 if (F->pushbool) { 306 tmp = F->set[F->setcnt]; 307 F->set[F->setcnt] = F->set[F->pushback]; 308 F->set[F->pushback] = tmp; 309 F->pushbool = 0; 310 continue; 311 } 312 if ((bp = fgetln(F->fp, &len)) == NULL) 313 return; 314 if (lp->linealloc <= len + 1) { 315 lp->linealloc += MAX(100, len + 1 - lp->linealloc); 316 if ((lp->line = 317 realloc(lp->line, lp->linealloc)) == NULL) 318 err(1, NULL); 319 } 320 memmove(lp->line, bp, len); 321 322 /* Replace trailing newline, if it exists. */ 323 if (bp[len - 1] == '\n') 324 lp->line[len - 1] = '\0'; 325 else 326 lp->line[len] = '\0'; 327 bp = lp->line; 328 329 /* Split the line into fields, allocate space as necessary. */ 330 lp->fieldcnt = 0; 331 while ((fieldp = mbssep(&bp, tabchar)) != NULL) { 332 if (spans && *fieldp == '\0') 333 continue; 334 if (lp->fieldcnt == lp->fieldalloc) { 335 lp->fieldalloc += 50; 336 if ((lp->fields = realloc(lp->fields, 337 lp->fieldalloc * sizeof(char *))) == NULL) 338 err(1, NULL); 339 } 340 lp->fields[lp->fieldcnt++] = fieldp; 341 } 342 343 /* See if the join field value has changed. */ 344 if (lastlp != NULL && cmp(lp, F->joinf, lastlp, F->joinf)) { 345 F->pushbool = 1; 346 F->pushback = F->setcnt; 347 break; 348 } 349 } 350 } 351 352 static char * 353 mbssep(char **stringp, const wchar_t *delim) 354 { 355 char *s, *tok; 356 const wchar_t *spanp; 357 wchar_t c, sc; 358 size_t n; 359 360 if ((s = *stringp) == NULL) 361 return (NULL); 362 for (tok = s;;) { 363 n = mbrtowc(&c, s, MB_LEN_MAX, NULL); 364 if (n == (size_t)-1 || n == (size_t)-2) 365 errc(1, EILSEQ, NULL); /* XXX */ 366 s += n; 367 spanp = delim; 368 do { 369 if ((sc = *spanp++) == c) { 370 if (c == 0) 371 s = NULL; 372 else 373 s[-n] = '\0'; 374 *stringp = s; 375 return (tok); 376 } 377 } while (sc != 0); 378 } 379 } 380 381 static int 382 cmp(LINE *lp1, u_long fieldno1, LINE *lp2, u_long fieldno2) 383 { 384 if (lp1->fieldcnt <= fieldno1) 385 return (lp2->fieldcnt <= fieldno2 ? 0 : 1); 386 if (lp2->fieldcnt <= fieldno2) 387 return (-1); 388 return (mbscoll(lp1->fields[fieldno1], lp2->fields[fieldno2])); 389 } 390 391 static int 392 mbscoll(const char *s1, const char *s2) 393 { 394 wchar_t *w1, *w2; 395 int ret; 396 397 if (MB_CUR_MAX == 1) 398 return (strcoll(s1, s2)); 399 if ((w1 = towcs(s1)) == NULL || (w2 = towcs(s2)) == NULL) 400 err(1, NULL); /* XXX */ 401 ret = wcscoll(w1, w2); 402 free(w1); 403 free(w2); 404 return (ret); 405 } 406 407 static wchar_t * 408 towcs(const char *s) 409 { 410 wchar_t *wcs; 411 size_t n; 412 413 if ((n = mbsrtowcs(NULL, &s, 0, NULL)) == (size_t)-1) 414 return (NULL); 415 if ((wcs = malloc((n + 1) * sizeof(*wcs))) == NULL) 416 return (NULL); 417 mbsrtowcs(wcs, &s, n + 1, NULL); 418 return (wcs); 419 } 420 421 static void 422 joinlines(INPUT *F1, INPUT *F2) 423 { 424 u_long cnt1, cnt2; 425 426 /* 427 * Output the results of a join comparison. The output may be from 428 * either file 1 or file 2 (in which case the first argument is the 429 * file from which to output) or from both. 430 */ 431 if (F2 == NULL) { 432 for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1) 433 outoneline(F1, &F1->set[cnt1]); 434 return; 435 } 436 for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1) 437 for (cnt2 = 0; cnt2 < F2->setcnt; ++cnt2) 438 outtwoline(F1, &F1->set[cnt1], F2, &F2->set[cnt2]); 439 } 440 441 static void 442 outoneline(INPUT *F, LINE *lp) 443 { 444 u_long cnt; 445 446 /* 447 * Output a single line from one of the files, according to the 448 * join rules. This happens when we are writing unmatched single 449 * lines. Output empty fields in the right places. 450 */ 451 if (olist) 452 for (cnt = 0; cnt < olistcnt; ++cnt) { 453 if (olist[cnt].filenum == (unsigned)F->number) 454 outfield(lp, olist[cnt].fieldno, 0); 455 else if (olist[cnt].filenum == 0) 456 outfield(lp, F->joinf, 0); 457 else 458 outfield(lp, 0, 1); 459 } 460 else 461 for (cnt = 0; cnt < lp->fieldcnt; ++cnt) 462 outfield(lp, cnt, 0); 463 printf("\n"); 464 if (ferror(stdout)) 465 err(1, "stdout"); 466 needsep = 0; 467 } 468 469 static void 470 outtwoline(INPUT *F1, LINE *lp1, INPUT *F2, LINE *lp2) 471 { 472 u_long cnt; 473 474 /* Output a pair of lines according to the join list (if any). */ 475 if (olist) 476 for (cnt = 0; cnt < olistcnt; ++cnt) 477 if (olist[cnt].filenum == 0) { 478 if (lp1->fieldcnt >= F1->joinf) 479 outfield(lp1, F1->joinf, 0); 480 else 481 outfield(lp2, F2->joinf, 0); 482 } else if (olist[cnt].filenum == 1) 483 outfield(lp1, olist[cnt].fieldno, 0); 484 else /* if (olist[cnt].filenum == 2) */ 485 outfield(lp2, olist[cnt].fieldno, 0); 486 else { 487 /* 488 * Output the join field, then the remaining fields from F1 489 * and F2. 490 */ 491 outfield(lp1, F1->joinf, 0); 492 for (cnt = 0; cnt < lp1->fieldcnt; ++cnt) 493 if (F1->joinf != cnt) 494 outfield(lp1, cnt, 0); 495 for (cnt = 0; cnt < lp2->fieldcnt; ++cnt) 496 if (F2->joinf != cnt) 497 outfield(lp2, cnt, 0); 498 } 499 printf("\n"); 500 if (ferror(stdout)) 501 err(1, "stdout"); 502 needsep = 0; 503 } 504 505 static void 506 outfield(LINE *lp, u_long fieldno, int out_empty) 507 { 508 if (needsep++) 509 printf("%lc", (wint_t)*tabchar); 510 if (!ferror(stdout)) { 511 if (lp->fieldcnt <= fieldno || out_empty) { 512 if (empty != NULL) 513 printf("%s", empty); 514 } else { 515 if (*lp->fields[fieldno] == '\0') 516 return; 517 printf("%s", lp->fields[fieldno]); 518 } 519 } 520 if (ferror(stdout)) 521 err(1, "stdout"); 522 } 523 524 /* 525 * Convert an output list argument "2.1, 1.3, 2.4" into an array of output 526 * fields. 527 */ 528 static void 529 fieldarg(char *option) 530 { 531 u_long fieldno, filenum; 532 char *end, *token; 533 534 while ((token = strsep(&option, ", \t")) != NULL) { 535 if (*token == '\0') 536 continue; 537 if (token[0] == '0') 538 filenum = fieldno = 0; 539 else if ((token[0] == '1' || token[0] == '2') && 540 token[1] == '.') { 541 filenum = token[0] - '0'; 542 fieldno = strtol(token + 2, &end, 10); 543 if (*end) 544 errx(1, "malformed -o option field"); 545 if (fieldno == 0) 546 errx(1, "field numbers are 1 based"); 547 --fieldno; 548 } else 549 errx(1, "malformed -o option field"); 550 if (olistcnt == olistalloc) { 551 olistalloc += 50; 552 if ((olist = realloc(olist, 553 olistalloc * sizeof(OLIST))) == NULL) 554 err(1, NULL); 555 } 556 olist[olistcnt].filenum = filenum; 557 olist[olistcnt].fieldno = fieldno; 558 ++olistcnt; 559 } 560 } 561 562 static void 563 obsolete(char **argv) 564 { 565 size_t len; 566 char **p, *ap, *t; 567 568 while ((ap = *++argv) != NULL) { 569 /* Return if "--". */ 570 if (ap[0] == '-' && ap[1] == '-') 571 return; 572 /* skip if not an option */ 573 if (ap[0] != '-') 574 continue; 575 switch (ap[1]) { 576 case 'a': 577 /* 578 * The original join allowed "-a", which meant the 579 * same as -a1 plus -a2. POSIX 1003.2, Draft 11.2 580 * only specifies this as "-a 1" and "a -2", so we 581 * have to use another option flag, one that is 582 * unlikely to ever be used or accidentally entered 583 * on the command line. (Well, we could reallocate 584 * the argv array, but that hardly seems worthwhile.) 585 */ 586 if (ap[2] == '\0' && (argv[1] == NULL || 587 (strcmp(argv[1], "1") != 0 && 588 strcmp(argv[1], "2") != 0))) { 589 ap[1] = '\01'; 590 warnx("-a option used without an argument; " 591 "reverting to historical behavior"); 592 } 593 break; 594 case 'j': 595 /* 596 * The original join allowed "-j[12] arg" and "-j arg". 597 * Convert the former to "-[12] arg". Don't convert 598 * the latter since getopt(3) can handle it. 599 */ 600 switch(ap[2]) { 601 case '1': 602 if (ap[3] != '\0') 603 goto jbad; 604 ap[1] = '1'; 605 ap[2] = '\0'; 606 break; 607 case '2': 608 if (ap[3] != '\0') 609 goto jbad; 610 ap[1] = '2'; 611 ap[2] = '\0'; 612 break; 613 case '\0': 614 break; 615 default: 616 jbad: errx(1, "illegal option -- %s", ap); 617 usage(); 618 } 619 break; 620 case 'o': 621 /* 622 * The original join allowed "-o arg arg". 623 * Convert to "-o arg -o arg". 624 */ 625 if (ap[2] != '\0') 626 break; 627 for (p = argv + 2; *p; ++p) { 628 if (p[0][0] == '0' || ((p[0][0] != '1' && 629 p[0][0] != '2') || p[0][1] != '.')) 630 break; 631 len = strlen(*p); 632 if (len - 2 != strspn(*p + 2, "0123456789")) 633 break; 634 if ((t = malloc(len + 3)) == NULL) 635 err(1, NULL); 636 t[0] = '-'; 637 t[1] = 'o'; 638 memmove(t + 2, *p, len + 1); 639 *p = t; 640 } 641 argv = p - 1; 642 break; 643 } 644 } 645 } 646 647 static void 648 usage(void) 649 { 650 fprintf(stderr, "%s %s\n%s\n", 651 "usage: join [-a fileno | -v fileno ] [-e string] [-1 field]", 652 "[-2 field]", 653 " [-o list] [-t char] file1 file2"); 654 exit(1); 655 } 656