1 /*-
2 * Copyright (c) 1991, 1993, 1994
3 * The Regents of the University of California. All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Steve Hayman of the Computer Science Department, Indiana University,
7 * Michiro Hikida and David Goodenough.
8 *
9 * %sccs.include.redist.c%
10 */
11
12 #ifndef lint
13 static char copyright[] =
14 "@(#) Copyright (c) 1991, 1993, 1994\n\
15 The Regents of the University of California. All rights reserved.\n";
16 #endif /* not lint */
17
18 #ifndef lint
19 static char sccsid[] = "@(#)join.c 8.6 (Berkeley) 05/04/95";
20 #endif /* not lint */
21
22 #include <sys/param.h>
23
24 #include <ctype.h>
25 #include <err.h>
26 #include <errno.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <unistd.h>
31
32 /*
33 * There's a structure per input file which encapsulates the state of the
34 * file. We repeatedly read lines from each file until we've read in all
35 * the consecutive lines from the file with a common join field. Then we
36 * compare the set of lines with an equivalent set from the other file.
37 */
38 typedef struct {
39 char *line; /* line */
40 u_long linealloc; /* line allocated count */
41 char **fields; /* line field(s) */
42 u_long fieldcnt; /* line field(s) count */
43 u_long fieldalloc; /* line field(s) allocated count */
44 } LINE;
45
46 typedef struct {
47 FILE *fp; /* file descriptor */
48 u_long joinf; /* join field (-1, -2, -j) */
49 int unpair; /* output unpairable lines (-a) */
50 int number; /* 1 for file 1, 2 for file 2 */
51
52 LINE *set; /* set of lines with same field */
53 int pushbool; /* if pushback is set */
54 u_long pushback; /* line on the stack */
55 u_long setcnt; /* set count */
56 u_long setalloc; /* set allocated count */
57 } INPUT;
58 INPUT input1 = { NULL, 0, 0, 1, NULL, 0, 0, 0, },
59 input2 = { NULL, 0, 0, 2, NULL, 0, 0, 0, };
60
61 typedef struct {
62 u_long filenum; /* file number */
63 u_long fieldno; /* field number */
64 } OLIST;
65 OLIST *olist; /* output field list */
66 u_long olistcnt; /* output field list count */
67 u_long olistalloc; /* output field allocated count */
68
int	 joinout = 1;		/* output paired lines; cleared by -v */
int	 needsep;		/* a field was already written on this line */
int	 spans = 1;		/* skip empty fields when splitting; cleared by -t */
char	*empty;			/* replacement for missing fields (-e), or NULL */
char	*tabchar = " \t";	/* field delimiters; one character after -t */
74
75 int cmp __P((LINE *, u_long, LINE *, u_long));
76 void fieldarg __P((char *));
77 void joinlines __P((INPUT *, INPUT *));
78 void obsolete __P((char **));
79 void outfield __P((LINE *, u_long, int));
80 void outoneline __P((INPUT *, LINE *));
81 void outtwoline __P((INPUT *, LINE *, INPUT *, LINE *));
82 void slurp __P((INPUT *));
83 void usage __P((void));
84
85 int
main(argc,argv)86 main(argc, argv)
87 int argc;
88 char *argv[];
89 {
90 INPUT *F1, *F2;
91 int aflag, ch, cval, vflag;
92 char *end;
93
94 F1 = &input1;
95 F2 = &input2;
96
97 aflag = vflag = 0;
98 obsolete(argv);
99 while ((ch = getopt(argc, argv, "\01a:e:j:1:2:o:t:v:")) != EOF) {
100 switch (ch) {
101 case '\01': /* See comment in obsolete(). */
102 aflag = 1;
103 F1->unpair = F2->unpair = 1;
104 break;
105 case '1':
106 if ((F1->joinf = strtol(optarg, &end, 10)) < 1)
107 errx(1, "-1 option field number less than 1");
108 if (*end)
109 errx(1, "illegal field number -- %s", optarg);
110 --F1->joinf;
111 break;
112 case '2':
113 if ((F2->joinf = strtol(optarg, &end, 10)) < 1)
114 errx(1, "-2 option field number less than 1");
115 if (*end)
116 errx(1, "illegal field number -- %s", optarg);
117 --F2->joinf;
118 break;
119 case 'a':
120 aflag = 1;
121 switch(strtol(optarg, &end, 10)) {
122 case 1:
123 F1->unpair = 1;
124 break;
125 case 2:
126 F2->unpair = 1;
127 break;
128 default:
129 errx(1, "-a option file number not 1 or 2");
130 break;
131 }
132 if (*end)
133 errx(1, "illegal file number -- %s", optarg);
134 break;
135 case 'e':
136 empty = optarg;
137 break;
138 case 'j':
139 if ((F1->joinf = F2->joinf =
140 strtol(optarg, &end, 10)) < 1)
141 errx(1, "-j option field number less than 1");
142 if (*end)
143 errx(1, "illegal field number -- %s", optarg);
144 --F1->joinf;
145 --F2->joinf;
146 break;
147 case 'o':
148 fieldarg(optarg);
149 break;
150 case 't':
151 spans = 0;
152 if (strlen(tabchar = optarg) != 1)
153 errx(1, "illegal tab character specification");
154 break;
155 case 'v':
156 vflag = 1;
157 joinout = 0;
158 switch (strtol(optarg, &end, 10)) {
159 case 1:
160 F1->unpair = 1;
161 break;
162 case 2:
163 F2->unpair = 1;
164 break;
165 default:
166 errx(1, "-v option file number not 1 or 2");
167 break;
168 }
169 if (*end)
170 errx(1, "illegal file number -- %s", optarg);
171 break;
172 case '?':
173 default:
174 usage();
175 }
176 }
177 argc -= optind;
178 argv += optind;
179
180 if (aflag && vflag)
181 errx(1, "the -a and -v options are mutually exclusive");
182
183 if (argc != 2)
184 usage();
185
186 /* Open the files; "-" means stdin. */
187 if (!strcmp(*argv, "-"))
188 F1->fp = stdin;
189 else if ((F1->fp = fopen(*argv, "r")) == NULL)
190 err(1, "%s", *argv);
191 ++argv;
192 if (!strcmp(*argv, "-"))
193 F2->fp = stdin;
194 else if ((F2->fp = fopen(*argv, "r")) == NULL)
195 err(1, "%s", *argv);
196 if (F1->fp == stdin && F2->fp == stdin)
197 errx(1, "only one input file may be stdin");
198
199 slurp(F1);
200 slurp(F2);
201 while (F1->setcnt && F2->setcnt) {
202 cval = cmp(F1->set, F1->joinf, F2->set, F2->joinf);
203 if (cval == 0) {
204 /* Oh joy, oh rapture, oh beauty divine! */
205 if (joinout)
206 joinlines(F1, F2);
207 slurp(F1);
208 slurp(F2);
209 } else if (cval < 0) {
210 /* File 1 takes the lead... */
211 if (F1->unpair)
212 joinlines(F1, NULL);
213 slurp(F1);
214 } else {
215 /* File 2 takes the lead... */
216 if (F2->unpair)
217 joinlines(F2, NULL);
218 slurp(F2);
219 }
220 }
221
222 /*
223 * Now that one of the files is used up, optionally output any
224 * remaining lines from the other file.
225 */
226 if (F1->unpair)
227 while (F1->setcnt) {
228 joinlines(F1, NULL);
229 slurp(F1);
230 }
231 if (F2->unpair)
232 while (F2->setcnt) {
233 joinlines(F2, NULL);
234 slurp(F2);
235 }
236 exit(0);
237 }
238
239 void
slurp(F)240 slurp(F)
241 INPUT *F;
242 {
243 LINE *lp, *lastlp, tmp;
244 size_t len;
245 int cnt;
246 char *bp, *fieldp;
247
248 /*
249 * Read all of the lines from an input file that have the same
250 * join field.
251 */
252 F->setcnt = 0;
253 for (lastlp = NULL;; ++F->setcnt, lastlp = lp) {
254 /*
255 * If we're out of space to hold line structures, allocate
256 * more. Initialize the structure so that we know that this
257 * is new space.
258 */
259 if (F->setcnt == F->setalloc) {
260 cnt = F->setalloc;
261 F->setalloc += 50;
262 if ((F->set = realloc(F->set,
263 F->setalloc * sizeof(LINE))) == NULL)
264 err(1, NULL);
265 memset(F->set + cnt, 0, 50 * sizeof(LINE));
266
267 /* re-set lastlp in case it moved */
268 if (lastlp != NULL)
269 lastlp = &F->set[F->setcnt - 1];
270 }
271
272 /*
273 * Get any pushed back line, else get the next line. Allocate
274 * space as necessary. If taking the line from the stack swap
275 * the two structures so that we don't lose space allocated to
276 * either structure. This could be avoided by doing another
277 * level of indirection, but it's probably okay as is.
278 * but it's probably okay as is.
279 */
280 lp = &F->set[F->setcnt];
281 if (F->pushbool) {
282 tmp = F->set[F->setcnt];
283 F->set[F->setcnt] = F->set[F->pushback];
284 F->set[F->pushback] = tmp;
285 F->pushbool = 0;
286 continue;
287 }
288 if ((bp = fgetln(F->fp, &len)) == NULL)
289 return;
290 if (lp->linealloc <= len + 1) {
291 lp->linealloc += MAX(100, len + 1 - lp->linealloc);
292 if ((lp->line =
293 realloc(lp->line, lp->linealloc)) == NULL)
294 err(1, NULL);
295 }
296 memmove(lp->line, bp, len);
297
298 /* Replace trailing newline, if it exists. */
299 if (bp[len - 1] == '\n')
300 lp->line[len - 1] = '\0';
301 else
302 lp->line[len] = '\0';
303 bp = lp->line;
304
305 /* Split the line into fields, allocate space as necessary. */
306 lp->fieldcnt = 0;
307 while ((fieldp = strsep(&bp, tabchar)) != NULL) {
308 if (spans && *fieldp == '\0')
309 continue;
310 if (lp->fieldcnt == lp->fieldalloc) {
311 lp->fieldalloc += 50;
312 if ((lp->fields = realloc(lp->fields,
313 lp->fieldalloc * sizeof(char *))) == NULL)
314 err(1, NULL);
315 }
316 lp->fields[lp->fieldcnt++] = fieldp;
317 }
318
319 /* See if the join field value has changed. */
320 if (lastlp != NULL && cmp(lp, F->joinf, lastlp, F->joinf)) {
321 F->pushbool = 1;
322 F->pushback = F->setcnt;
323 break;
324 }
325 }
326 }
327
328 int
cmp(lp1,fieldno1,lp2,fieldno2)329 cmp(lp1, fieldno1, lp2, fieldno2)
330 LINE *lp1, *lp2;
331 u_long fieldno1, fieldno2;
332 {
333 if (lp1->fieldcnt <= fieldno1)
334 return (lp2->fieldcnt <= fieldno2 ? 0 : 1);
335 if (lp2->fieldcnt <= fieldno2)
336 return (-1);
337 return (strcmp(lp1->fields[fieldno1], lp2->fields[fieldno2]));
338 }
339
340 void
joinlines(F1,F2)341 joinlines(F1, F2)
342 INPUT *F1, *F2;
343 {
344 int cnt1, cnt2;
345
346 /*
347 * Output the results of a join comparison. The output may be from
348 * either file 1 or file 2 (in which case the first argument is the
349 * file from which to output) or from both.
350 */
351 if (F2 == NULL) {
352 for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1)
353 outoneline(F1, &F1->set[cnt1]);
354 return;
355 }
356 for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1)
357 for (cnt2 = 0; cnt2 < F2->setcnt; ++cnt2)
358 outtwoline(F1, &F1->set[cnt1], F2, &F2->set[cnt2]);
359 }
360
361 void
outoneline(F,lp)362 outoneline(F, lp)
363 INPUT *F;
364 LINE *lp;
365 {
366 int cnt;
367
368 /*
369 * Output a single line from one of the files, according to the
370 * join rules. This happens when we are writing unmatched single
371 * lines. Output empty fields in the right places.
372 */
373 if (olist)
374 for (cnt = 0; cnt < olistcnt; ++cnt) {
375 if (olist[cnt].filenum == F->number)
376 outfield(lp, olist[cnt].fieldno, 0);
377 else
378 outfield(lp, 0, 1);
379 }
380 else
381 for (cnt = 0; cnt < lp->fieldcnt; ++cnt)
382 outfield(lp, cnt, 0);
383 (void)printf("\n");
384 if (ferror(stdout))
385 err(1, "stdout");
386 needsep = 0;
387 }
388
389 void
outtwoline(F1,lp1,F2,lp2)390 outtwoline(F1, lp1, F2, lp2)
391 INPUT *F1, *F2;
392 LINE *lp1, *lp2;
393 {
394 int cnt;
395
396 /* Output a pair of lines according to the join list (if any). */
397 if (olist)
398 for (cnt = 0; cnt < olistcnt; ++cnt)
399 if (olist[cnt].filenum == 1)
400 outfield(lp1, olist[cnt].fieldno, 0);
401 else /* if (olist[cnt].filenum == 2) */
402 outfield(lp2, olist[cnt].fieldno, 0);
403 else {
404 /*
405 * Output the join field, then the remaining fields from F1
406 * and F2.
407 */
408 outfield(lp1, F1->joinf, 0);
409 for (cnt = 0; cnt < lp1->fieldcnt; ++cnt)
410 if (F1->joinf != cnt)
411 outfield(lp1, cnt, 0);
412 for (cnt = 0; cnt < lp2->fieldcnt; ++cnt)
413 if (F2->joinf != cnt)
414 outfield(lp2, cnt, 0);
415 }
416 (void)printf("\n");
417 if (ferror(stdout))
418 err(1, "stdout");
419 needsep = 0;
420 }
421
422 void
outfield(lp,fieldno,out_empty)423 outfield(lp, fieldno, out_empty)
424 LINE *lp;
425 u_long fieldno;
426 int out_empty;
427 {
428 if (needsep++)
429 (void)printf("%c", *tabchar);
430 if (!ferror(stdout))
431 if (lp->fieldcnt < fieldno || out_empty) {
432 if (empty != NULL)
433 (void)printf("%s", empty);
434 } else {
435 if (*lp->fields[fieldno] == '\0')
436 return;
437 (void)printf("%s", lp->fields[fieldno]);
438 }
439 if (ferror(stdout))
440 err(1, "stdout");
441 }
442
443 /*
444 * Convert an output list argument "2.1, 1.3, 2.4" into an array of output
445 * fields.
446 */
447 void
fieldarg(option)448 fieldarg(option)
449 char *option;
450 {
451 u_long fieldno;
452 char *end, *token;
453
454 while ((token = strsep(&option, ", \t")) != NULL) {
455 if (*token == '\0')
456 continue;
457 if (token[0] != '1' && token[0] != '2' || token[1] != '.')
458 errx(1, "malformed -o option field");
459 fieldno = strtol(token + 2, &end, 10);
460 if (*end)
461 errx(1, "malformed -o option field");
462 if (fieldno == 0)
463 errx(1, "field numbers are 1 based");
464 if (olistcnt == olistalloc) {
465 olistalloc += 50;
466 if ((olist = realloc(olist,
467 olistalloc * sizeof(OLIST))) == NULL)
468 err(1, NULL);
469 }
470 olist[olistcnt].filenum = token[0] - '0';
471 olist[olistcnt].fieldno = fieldno - 1;
472 ++olistcnt;
473 }
474 }
475
/*
 * Rewrite historic option syntax in argv, in place, into a form that
 * getopt(3) can parse:
 *
 *	-a (no file number)	becomes the private option "-\01"
 *	-j1 arg, -j2 arg	become "-1 arg", "-2 arg"
 *	-o arg arg ...		becomes "-o arg -oarg ..."
 *
 * Scanning stops at the first argument beginning with "--".  Arguments
 * that do not begin with '-' are never modified.  Exits through errx()
 * on a malformed -j option and through err() if memory runs out.
 */
void
obsolete(argv)
	char **argv;
{
	size_t len;
	char **p, *ap, *t;

	while ((ap = *++argv) != NULL) {
		/* Return if "--". */
		if (ap[0] == '-' && ap[1] == '-')
			return;
		/*
		 * Skip anything that is not an option: an operand such as
		 * "xa" must not be rewritten, and ap[1] must not be read
		 * for an empty string.
		 */
		if (ap[0] != '-')
			continue;
		switch (ap[1]) {
		case 'a':
			/*
			 * The original join allowed "-a", which meant the
			 * same as -a1 plus -a2.  POSIX 1003.2, Draft 11.2
			 * only specifies this as "-a 1" and "-a 2", so we
			 * have to use another option flag, one that is
			 * unlikely to ever be used or accidentally entered
			 * on the command line.  (Well, we could reallocate
			 * the argv array, but that hardly seems worthwhile.)
			 *
			 * Only a "-a" that is not followed by a file number
			 * is historic; "-a 1" and "-a 2" are left alone for
			 * getopt(3).
			 */
			if (ap[2] == '\0' && (argv[1] == NULL ||
			    (strcmp(argv[1], "1") != 0 &&
			    strcmp(argv[1], "2") != 0)))
				ap[1] = '\01';
			break;
		case 'j':
			/*
			 * The original join allowed "-j[12] arg" and "-j arg".
			 * Convert the former to "-[12] arg".  Don't convert
			 * the latter since getopt(3) can handle it.
			 */
			switch (ap[2]) {
			case '1':
			case '2':
				if (ap[3] != '\0')
					errx(1, "illegal option -- %s", ap);
				ap[1] = ap[2];
				ap[2] = '\0';
				break;
			case '\0':
				break;
			default:
				errx(1, "illegal option -- %s", ap);
			}
			break;
		case 'o':
			/*
			 * The original join allowed "-o arg arg".
			 * Convert to "-o arg -o arg".
			 *
			 * Nothing to do if the list is attached ("-oarg")
			 * or if "-o" is the last argument; in the latter
			 * case argv + 2 would lie beyond the terminating
			 * NULL.
			 */
			if (ap[2] != '\0' || argv[1] == NULL)
				break;
			for (p = argv + 2; *p != NULL; ++p) {
				if ((p[0][0] != '1' && p[0][0] != '2') ||
				    p[0][1] != '.')
					break;
				len = strlen(*p);
				if (len - 2 != strspn(*p + 2, "0123456789"))
					break;
				/* "-o" + argument + NUL */
				if ((t = malloc(len + 3)) == NULL)
					err(1, NULL);
				t[0] = '-';
				t[1] = 'o';
				memmove(t + 2, *p, len + 1);
				*p = t;
			}
			/* Resume with the argument that ended the list. */
			argv = p - 1;
			break;
		default:
			break;
		}
	}
}
553
/* Print the usage synopsis on stderr and exit with status 1. */
void
usage()
{
	static char *synopsis[2] = {
		"usage: join [-a fileno | -v fileno ] [-e string] [-1 field] ",
		"[-2 field]\n [-o list] [-t char] file1 file2"
	};

	(void)fprintf(stderr, "%s%s\n", synopsis[0], synopsis[1]);
	exit(1);
}
562