1 /* $OpenBSD: join.c,v 1.34 2022/12/04 23:50:48 cheloha Exp $ */
2
3 /*-
4 * Copyright (c) 1991, 1993, 1994
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Steve Hayman of the Computer Science Department, Indiana University,
9 * Michiro Hikida and David Goodenough.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. Neither the name of the University nor the names of its contributors
20 * may be used to endorse or promote products derived from this software
21 * without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 */
35
36 #include <err.h>
37 #include <errno.h>
38 #include <limits.h>
39 #include <locale.h>
40 #include <stdio.h>
41 #include <stdlib.h>
42 #include <string.h>
43 #include <unistd.h>
44 #include <wchar.h>
45
46 /*
47 * There's a structure per input file which encapsulates the state of the
48 * file. We repeatedly read lines from each file until we've read in all
49 * the consecutive lines from the file with a common join field. Then we
50 * compare the set of lines with an equivalent set from the other file.
51 */
52 typedef struct {
53 char *line; /* line */
54 size_t linealloc; /* bytes allocated for line */
55 char **fields; /* line field(s) */
56 u_long fieldcnt; /* line field(s) count */
57 u_long fieldalloc; /* line field(s) allocated count */
58 } LINE;
59
60 typedef struct {
61 FILE *fp; /* file descriptor */
62 u_long joinf; /* join field (-1, -2, -j) */
63 int unpair; /* output unpairable lines (-a) */
64 u_long number; /* 1 for file 1, 2 for file 2 */
65 LINE *set; /* set of lines with same field */
66 int pushbool; /* if pushback is set */
67 u_long pushback; /* line on the stack */
68 u_long setcnt; /* set count */
69 u_long setalloc; /* set allocated count */
70 } INPUT;
71 INPUT input1 = { NULL, 0, 0, 1, NULL, 0, 0, 0, 0 },
72 input2 = { NULL, 0, 0, 2, NULL, 0, 0, 0, 0 };
73
74 typedef struct {
75 u_long filenum; /* file number */
76 u_long fieldno; /* field number */
77 } OLIST;
78 OLIST *olist; /* output field list */
79 u_long olistcnt; /* output field list count */
80 u_long olistalloc; /* output field allocated count */
81
82 int joinout = 1; /* show lines with matched join fields (-v) */
83 int needsep; /* need separator character */
84 int spans = 1; /* span multiple delimiters (-t) */
85 char *empty; /* empty field replacement string (-e) */
86 wchar_t tabchar[] = L" \t"; /* delimiter characters (-t) */
87
88 int cmp(LINE *, u_long, LINE *, u_long);
89 void fieldarg(char *);
90 void joinlines(INPUT *, INPUT *);
91 char *mbssep(char **, const wchar_t *);
92 void obsolete(char **);
93 void outfield(LINE *, u_long, int);
94 void outoneline(INPUT *, LINE *);
95 void outtwoline(INPUT *, LINE *, INPUT *, LINE *);
96 void slurp(INPUT *);
97 void usage(void);
98
99 int
main(int argc,char * argv[])100 main(int argc, char *argv[])
101 {
102 INPUT *F1, *F2;
103 int aflag, ch, cval, vflag;
104 char *end;
105
106 setlocale(LC_CTYPE, "");
107
108 if (pledge("stdio rpath", NULL) == -1)
109 err(1, "pledge");
110
111 F1 = &input1;
112 F2 = &input2;
113
114 aflag = vflag = 0;
115 obsolete(argv);
116 while ((ch = getopt(argc, argv, "\01a:e:j:1:2:o:t:v:")) != -1) {
117 switch (ch) {
118 case '\01': /* See comment in obsolete(). */
119 aflag = 1;
120 F1->unpair = F2->unpair = 1;
121 break;
122 case '1':
123 if ((F1->joinf = strtol(optarg, &end, 10)) < 1)
124 errx(1, "-1 option field number less than 1");
125 if (*end)
126 errx(1, "illegal field number -- %s", optarg);
127 --F1->joinf;
128 break;
129 case '2':
130 if ((F2->joinf = strtol(optarg, &end, 10)) < 1)
131 errx(1, "-2 option field number less than 1");
132 if (*end)
133 errx(1, "illegal field number -- %s", optarg);
134 --F2->joinf;
135 break;
136 case 'a':
137 aflag = 1;
138 switch(strtol(optarg, &end, 10)) {
139 case 1:
140 F1->unpair = 1;
141 break;
142 case 2:
143 F2->unpair = 1;
144 break;
145 default:
146 errx(1, "-a option file number not 1 or 2");
147 break;
148 }
149 if (*end)
150 errx(1, "illegal file number -- %s", optarg);
151 break;
152 case 'e':
153 empty = optarg;
154 break;
155 case 'j':
156 if ((F1->joinf = F2->joinf = strtol(optarg, &end, 10)) < 1)
157 errx(1, "-j option field number less than 1");
158 if (*end)
159 errx(1, "illegal field number -- %s", optarg);
160 --F1->joinf;
161 --F2->joinf;
162 break;
163 case 'o':
164 fieldarg(optarg);
165 break;
166 case 't':
167 spans = 0;
168 if (mbtowc(tabchar, optarg, MB_CUR_MAX) !=
169 strlen(optarg))
170 errx(1, "illegal tab character specification");
171 tabchar[1] = L'\0';
172 break;
173 case 'v':
174 vflag = 1;
175 joinout = 0;
176 switch (strtol(optarg, &end, 10)) {
177 case 1:
178 F1->unpair = 1;
179 break;
180 case 2:
181 F2->unpair = 1;
182 break;
183 default:
184 errx(1, "-v option file number not 1 or 2");
185 break;
186 }
187 if (*end)
188 errx(1, "illegal file number -- %s", optarg);
189 break;
190 default:
191 usage();
192 }
193 }
194 argc -= optind;
195 argv += optind;
196
197 if (aflag && vflag)
198 errx(1, "the -a and -v options are mutually exclusive");
199
200 if (argc != 2)
201 usage();
202
203 /* Open the files; "-" means stdin. */
204 if (!strcmp(*argv, "-"))
205 F1->fp = stdin;
206 else if ((F1->fp = fopen(*argv, "r")) == NULL)
207 err(1, "%s", *argv);
208 ++argv;
209 if (!strcmp(*argv, "-"))
210 F2->fp = stdin;
211 else if ((F2->fp = fopen(*argv, "r")) == NULL)
212 err(1, "%s", *argv);
213 if (F1->fp == stdin && F2->fp == stdin)
214 errx(1, "only one input file may be stdin");
215
216 if (pledge("stdio", NULL) == -1)
217 err(1, "pledge");
218
219 slurp(F1);
220 slurp(F2);
221
222 /*
223 * We try to let the files have the same field value, advancing
224 * whoever falls behind and always advancing the file(s) we output
225 * from.
226 */
227 while (F1->setcnt && F2->setcnt) {
228 cval = cmp(F1->set, F1->joinf, F2->set, F2->joinf);
229 if (cval == 0) {
230 /* Oh joy, oh rapture, oh beauty divine! */
231 if (joinout)
232 joinlines(F1, F2);
233 slurp(F1);
234 slurp(F2);
235 } else if (cval < 0) {
236 /* File 1 takes the lead... */
237 if (F1->unpair)
238 joinlines(F1, NULL);
239 slurp(F1);
240 } else {
241 /* File 2 takes the lead... */
242 if (F2->unpair)
243 joinlines(F2, NULL);
244 slurp(F2);
245 }
246 }
247
248 /*
249 * Now that one of the files is used up, optionally output any
250 * remaining lines from the other file.
251 */
252 if (F1->unpair)
253 while (F1->setcnt) {
254 joinlines(F1, NULL);
255 slurp(F1);
256 }
257 if (F2->unpair)
258 while (F2->setcnt) {
259 joinlines(F2, NULL);
260 slurp(F2);
261 }
262
263 return 0;
264 }
265
266 void
slurp(INPUT * F)267 slurp(INPUT *F)
268 {
269 LINE *lp, *lastlp, tmp;
270 ssize_t len;
271 u_long cnt;
272 char *bp, *fieldp;
273
274 /*
275 * Read all of the lines from an input file that have the same
276 * join field.
277 */
278
279 F->setcnt = 0;
280 for (lastlp = NULL; ; ++F->setcnt) {
281 /*
282 * If we're out of space to hold line structures, allocate
283 * more. Initialize the structure so that we know that this
284 * is new space.
285 */
286 if (F->setcnt == F->setalloc) {
287 LINE *p;
288 u_long newsize = F->setalloc + 50;
289 cnt = F->setalloc;
290 if ((p = reallocarray(F->set, newsize, sizeof(LINE)))
291 == NULL)
292 err(1, NULL);
293 F->set = p;
294 F->setalloc = newsize;
295 memset(F->set + cnt, 0, 50 * sizeof(LINE));
296 /* re-set lastlp in case it moved */
297 if (lastlp != NULL)
298 lastlp = &F->set[F->setcnt - 1];
299 }
300 /*
301 * Get any pushed back line, else get the next line. Allocate
302 * space as necessary. If taking the line from the stack swap
303 * the two structures so that we don't lose space allocated to
304 * either structure. This could be avoided by doing another
305 * level of indirection, but it's probably okay as is.
306 */
307 lp = &F->set[F->setcnt];
308 if (F->setcnt)
309 lastlp = &F->set[F->setcnt - 1];
310 if (F->pushbool) {
311 tmp = F->set[F->setcnt];
312 F->set[F->setcnt] = F->set[F->pushback];
313 F->set[F->pushback] = tmp;
314 F->pushbool = 0;
315 continue;
316 }
317 if ((len = getline(&(lp->line), &(lp->linealloc), F->fp)) == -1)
318 break;
319
320 /* Remove the trailing newline, if any. */
321 if (lp->line[len - 1] == '\n')
322 lp->line[--len] = '\0';
323
324 /* Split the line into fields, allocate space as necessary. */
325 lp->fieldcnt = 0;
326 bp = lp->line;
327 while ((fieldp = mbssep(&bp, tabchar)) != NULL) {
328 if (spans && *fieldp == '\0')
329 continue;
330 if (lp->fieldcnt == lp->fieldalloc) {
331 char **p;
332 u_long newsize = lp->fieldalloc + 50;
333 if ((p = reallocarray(lp->fields, newsize,
334 sizeof(char *))) == NULL)
335 err(1, NULL);
336 lp->fields = p;
337 lp->fieldalloc = newsize;
338 }
339 lp->fields[lp->fieldcnt++] = fieldp;
340 }
341
342 /* See if the join field value has changed. */
343 if (lastlp != NULL && cmp(lp, F->joinf, lastlp, F->joinf)) {
344 F->pushbool = 1;
345 F->pushback = F->setcnt;
346 break;
347 }
348 }
349 }
350
351 char *
mbssep(char ** stringp,const wchar_t * wcdelim)352 mbssep(char **stringp, const wchar_t *wcdelim)
353 {
354 char *s, *p;
355 size_t ndelim;
356 int i;
357 /* tabchar is never more than 2 */
358 char mbdelim[2][MB_LEN_MAX + 1];
359 size_t mblen[2];
360
361 if ((s = *stringp) == NULL)
362 return NULL;
363 ndelim = wcslen(wcdelim);
364 for (i = 0; i < ndelim; i++) {
365 /* wcdelim generated via mbtowc */
366 mblen[i] = wctomb(mbdelim[i], wcdelim[i]);
367 }
368 for (p = s; *p != '\0'; p++) {
369 for (i = 0; i < ndelim; i++) {
370 if (strncmp(p, mbdelim[i], mblen[i]) == 0) {
371 *p = '\0';
372 *stringp = p + mblen[i];
373 return s;
374 }
375 }
376 }
377 *stringp = NULL;
378 return s;
379 }
380
381 int
cmp(LINE * lp1,u_long fieldno1,LINE * lp2,u_long fieldno2)382 cmp(LINE *lp1, u_long fieldno1, LINE *lp2, u_long fieldno2)
383 {
384 if (lp1->fieldcnt <= fieldno1)
385 return lp2->fieldcnt <= fieldno2 ? 0 : -1;
386 if (lp2->fieldcnt <= fieldno2)
387 return 1;
388 return strcmp(lp1->fields[fieldno1], lp2->fields[fieldno2]);
389 }
390
391 void
joinlines(INPUT * F1,INPUT * F2)392 joinlines(INPUT *F1, INPUT *F2)
393 {
394 u_long cnt1, cnt2;
395
396 /*
397 * Output the results of a join comparison. The output may be from
398 * either file 1 or file 2 (in which case the first argument is the
399 * file from which to output) or from both.
400 */
401 if (F2 == NULL) {
402 for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1)
403 outoneline(F1, &F1->set[cnt1]);
404 return;
405 }
406 for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1)
407 for (cnt2 = 0; cnt2 < F2->setcnt; ++cnt2)
408 outtwoline(F1, &F1->set[cnt1], F2, &F2->set[cnt2]);
409 }
410
411 void
outoneline(INPUT * F,LINE * lp)412 outoneline(INPUT *F, LINE *lp)
413 {
414 u_long cnt;
415
416 /*
417 * Output a single line from one of the files, according to the
418 * join rules. This happens when we are writing unmatched single
419 * lines. Output empty fields in the right places.
420 */
421 if (olist)
422 for (cnt = 0; cnt < olistcnt; ++cnt) {
423 if (olist[cnt].filenum == F->number)
424 outfield(lp, olist[cnt].fieldno, 0);
425 else if (olist[cnt].filenum == 0)
426 outfield(lp, F->joinf, 0);
427 else
428 outfield(lp, 0, 1);
429 }
430 else {
431 /*
432 * Output the join field, then the remaining fields from F
433 */
434 outfield(lp, F->joinf, 0);
435 for (cnt = 0; cnt < lp->fieldcnt; ++cnt)
436 if (F->joinf != cnt)
437 outfield(lp, cnt, 0);
438 }
439
440 putchar('\n');
441 if (ferror(stdout))
442 err(1, "stdout");
443 needsep = 0;
444 }
445
446 void
outtwoline(INPUT * F1,LINE * lp1,INPUT * F2,LINE * lp2)447 outtwoline(INPUT *F1, LINE *lp1, INPUT *F2, LINE *lp2)
448 {
449 u_long cnt;
450
451 /* Output a pair of lines according to the join list (if any). */
452 if (olist) {
453 for (cnt = 0; cnt < olistcnt; ++cnt)
454 if (olist[cnt].filenum == 0) {
455 if (lp1->fieldcnt >= F1->joinf)
456 outfield(lp1, F1->joinf, 0);
457 else
458 outfield(lp2, F2->joinf, 0);
459 } else if (olist[cnt].filenum == 1)
460 outfield(lp1, olist[cnt].fieldno, 0);
461 else /* if (olist[cnt].filenum == 2) */
462 outfield(lp2, olist[cnt].fieldno, 0);
463 } else {
464 /*
465 * Output the join field, then the remaining fields from F1
466 * and F2.
467 */
468 outfield(lp1, F1->joinf, 0);
469 for (cnt = 0; cnt < lp1->fieldcnt; ++cnt)
470 if (F1->joinf != cnt)
471 outfield(lp1, cnt, 0);
472 for (cnt = 0; cnt < lp2->fieldcnt; ++cnt)
473 if (F2->joinf != cnt)
474 outfield(lp2, cnt, 0);
475 }
476 putchar('\n');
477 if (ferror(stdout))
478 err(1, "stdout");
479 needsep = 0;
480 }
481
482 void
outfield(LINE * lp,u_long fieldno,int out_empty)483 outfield(LINE *lp, u_long fieldno, int out_empty)
484 {
485 if (needsep++)
486 putwchar(*tabchar);
487 if (!ferror(stdout)) {
488 if (lp->fieldcnt <= fieldno || out_empty) {
489 if (empty != NULL)
490 fputs(empty, stdout);
491 } else {
492 if (*lp->fields[fieldno] == '\0')
493 return;
494 fputs(lp->fields[fieldno], stdout);
495 }
496 }
497 if (ferror(stdout))
498 err(1, "stdout");
499 }
500
501 /*
502 * Convert an output list argument "2.1, 1.3, 2.4" into an array of output
503 * fields.
504 */
505 void
fieldarg(char * option)506 fieldarg(char *option)
507 {
508 u_long fieldno, filenum;
509 char *end, *token;
510
511 while ((token = strsep(&option, ", \t")) != NULL) {
512 if (*token == '\0')
513 continue;
514 if (token[0] == '0')
515 filenum = fieldno = 0;
516 else if ((token[0] == '1' || token[0] == '2') &&
517 token[1] == '.') {
518 filenum = token[0] - '0';
519 fieldno = strtol(token + 2, &end, 10);
520 if (*end)
521 errx(1, "malformed -o option field");
522 if (fieldno == 0)
523 errx(1, "field numbers are 1 based");
524 --fieldno;
525 } else
526 errx(1, "malformed -o option field");
527 if (olistcnt == olistalloc) {
528 OLIST *p;
529 u_long newsize = olistalloc + 50;
530 if ((p = reallocarray(olist, newsize, sizeof(OLIST)))
531 == NULL)
532 err(1, NULL);
533 olist = p;
534 olistalloc = newsize;
535 }
536 olist[olistcnt].filenum = filenum;
537 olist[olistcnt].fieldno = fieldno;
538 ++olistcnt;
539 }
540 }
541
542 void
obsolete(char ** argv)543 obsolete(char **argv)
544 {
545 size_t len;
546 char **p, *ap, *t;
547
548 while ((ap = *++argv) != NULL) {
549 /* Return if "--". */
550 if (ap[0] == '-' && ap[1] == '-')
551 return;
552 /* skip if not an option */
553 if (ap[0] != '-')
554 continue;
555 switch (ap[1]) {
556 case 'a':
557 /*
558 * The original join allowed "-a", which meant the
559 * same as -a1 plus -a2. POSIX 1003.2, Draft 11.2
560 * only specifies this as "-a 1" and "a -2", so we
561 * have to use another option flag, one that is
562 * unlikely to ever be used or accidentally entered
563 * on the command line. (Well, we could reallocate
564 * the argv array, but that hardly seems worthwhile.)
565 */
566 if (ap[2] == '\0' && (argv[1] == NULL ||
567 (strcmp(argv[1], "1") != 0 &&
568 strcmp(argv[1], "2") != 0))) {
569 ap[1] = '\01';
570 warnx("-a option used without an argument; "
571 "reverting to historical behavior");
572 }
573 break;
574 case 'j':
575 /*
576 * The original join allowed "-j[12] arg" and "-j arg".
577 * Convert the former to "-[12] arg". Don't convert
578 * the latter since getopt(3) can handle it.
579 */
580 switch(ap[2]) {
581 case '1':
582 case '2':
583 if (ap[3] != '\0')
584 goto jbad;
585 ap[1] = ap[2];
586 ap[2] = '\0';
587 break;
588 case '\0':
589 break;
590 default:
591 jbad: warnx("unknown option -- %s", ap + 1);
592 usage();
593 }
594 break;
595 case 'o':
596 /*
597 * The original join allowed "-o arg arg".
598 * Convert to "-o arg -o arg".
599 */
600 if (ap[2] != '\0' || argv[1] == NULL)
601 break;
602 for (p = argv + 2; *p != NULL; ++p) {
603 if (p[0][0] == '0' || ((p[0][0] != '1' &&
604 p[0][0] != '2') || p[0][1] != '.'))
605 break;
606 len = strlen(*p);
607 if (len - 2 != strspn(*p + 2, "0123456789"))
608 break;
609 if ((t = malloc(len + 3)) == NULL)
610 err(1, NULL);
611 t[0] = '-';
612 t[1] = 'o';
613 memmove(t + 2, *p, len + 1);
614 *p = t;
615 }
616 argv = p - 1;
617 break;
618 }
619 }
620 }
621
622 void
usage(void)623 usage(void)
624 {
625 int len;
626 extern char *__progname;
627
628 len = strlen(__progname) + sizeof("usage: ");
629 (void)fprintf(stderr, "usage: %s [-1 field] [-2 field] "
630 "[-a file_number | -v file_number] [-e string]\n"
631 "%*s[-o list] [-t char] file1 file2\n",
632 __progname, len, "");
633 exit(1);
634 }
635