1 /* join - join lines of two files on a common field
2 Copyright (C) 1991-2018 Free Software Foundation, Inc.
3
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>.
16
17 Written by Mike Haertel, mike@gnu.ai.mit.edu. */
18
19 #include <config.h>
20
21 #include <assert.h>
22 #include <sys/types.h>
23 #include <getopt.h>
24
25 #include "system.h"
26 #include "die.h"
27 #include "error.h"
28 #include "fadvise.h"
29 #include "hard-locale.h"
30 #include "linebuffer.h"
31 #include "memcasecmp.h"
32 #include "quote.h"
33 #include "stdio--.h"
34 #include "xmemcoll.h"
35 #include "xstrtol.h"
36 #include "argmatch.h"
37
/* The official name of this program (e.g., no 'g' prefix).  */
#define PROGRAM_NAME "join"

/* Author list; passed to the version-option handler in main.  */
#define AUTHORS proper_name ("Mike Haertel")

/* Every later use of the identifier 'join' in this file (including the
   static function defined below) is rewritten to 'system_join'.
   NOTE(review): presumably this avoids a clash with another 'join'
   symbol visible through the included headers -- confirm.  */
#define join system_join
44
/* Exchange the two 'struct line *' lvalues A and B.

   The expansion is a single statement *without* a trailing semicolon,
   so that 'SWAPLINES (x, y);' behaves like an ordinary function call
   statement, including as the unbraced body of an 'if' that has an
   'else'.  Each argument is evaluated twice and must therefore be free
   of side effects.  */
#define SWAPLINES(a, b) do { \
  struct line *swaplines_tmp_ = (a); \
  (a) = (b); \
  (b) = swaplines_tmp_; \
} while (0)
50
/* An element of the singly linked list identifying which fields to
   print for each output line.  Elements are appended by add_field and
   walked in order by prjoin.  */
struct outlist
  {
    /* File number: 0, 1, or 2.  0 means use the join field.
       1 means use the first file argument, 2 the second.  */
    int file;

    /* Field index (zero-based), specified only when FILE is 1 or 2.  */
    size_t field;

    /* Next element of the list, or NULL at the end.  */
    struct outlist *next;
  };
64
/* A field of a line.  BEG points into the owning line's buffer, so the
   field text is not NUL-terminated; always use LEN.  */
struct field
  {
    char *beg;			/* First character in field.  */
    size_t len;			/* The length of the field.  */
  };
71
/* A line read from an input file, together with its split-out fields.  */
struct line
  {
    struct linebuffer buf;	/* The line itself.  */
    size_t nfields;		/* Number of elements in 'fields'.  */
    size_t nfields_allocated;	/* Number of elements allocated for 'fields'.  */
    struct field *fields;	/* Field array, grown by extract_field.  */
  };
80
/* One or more consecutive lines read from a file that all have the
   same join field value.  Slots in 'lines' at index COUNT and beyond
   are either NULL or hold line objects kept around for reuse.  */
struct seq
  {
    size_t count;			/* Elements used in 'lines'.  */
    size_t alloc;			/* Elements allocated in 'lines'.  */
    struct line **lines;		/* Array of ALLOC line pointers.  */
  };
89
/* The previous line read from each file (index 0 is file 1, index 1 is
   file 2), or NULL if there is none.  get_line compares each newly read
   line against it via check_order.  */
static struct line *prevline[2] = {NULL, NULL};

/* The number of lines read from each file; reported in the
   "is not sorted" diagnostic.  */
static uintmax_t line_no[2] = {0, 0};

/* The input file names.  */
static char *g_names[2];

/* This provides an extra line buffer for each file.  We need these if we
   try to read two consecutive lines into the same buffer, since we don't
   want to overwrite the previous buffer before we check order.
   Released at exit by free_spareline.  */
static struct line *spareline[2] = {NULL, NULL};
103
/* True if the LC_COLLATE locale is hard.  When set (and case is not
   being ignored), keycmp compares join fields with xmemcoll instead of
   memcmp.  */
static bool hard_LC_COLLATE;

/* If true, print unpairable lines in file 1 or 2.  */
static bool print_unpairables_1, print_unpairables_2;

/* If true, print pairable lines.  */
static bool print_pairables;

/* If true, we have seen at least one unpairable line.  In the default
   order-checking mode, disorder is only diagnosed once this is set.  */
static bool seen_unpairable;

/* If true, we have warned about disorder in that file.  */
static bool issued_disorder_warning[2];

/* Empty output field filler, or NULL if none was given.  */
static char const *empty_filler;
121
/* Whether to ensure the same number of fields are output from each line.  */
static bool autoformat;
/* The number of fields to output for each line.
   Only significant when autoformat is true; set by join from the
   first line read from each file.  */
static size_t autocount_1;
static size_t autocount_2;

/* Field to join on (zero-based); SIZE_MAX means they haven't been
   determined yet.  */
static size_t join_field_1 = SIZE_MAX;
static size_t join_field_2 = SIZE_MAX;

/* List of fields to print.  This is a dummy head node: the first real
   element, if any, is outlist_head.next.  */
static struct outlist outlist_head;

/* Last element in 'outlist', where a new element can be added.  */
static struct outlist *outlist_end = &outlist_head;

/* Tab character separating fields.  If negative, fields are separated
   by any nonempty string of blanks, otherwise by exactly one
   tab character whose value (when cast to unsigned char) equals TAB.
   The value '\n' makes xfields treat the whole line as one field.  */
static int tab = -1;
143
/* How strictly to check that the input is correctly ordered:
   CHECK_ORDER_DEFAULT  - check only once an unpairable line has been
                          seen; disorder is reported as a warning.
   CHECK_ORDER_ENABLED  - always check; disorder is a fatal error.
   CHECK_ORDER_DISABLED - never check.  */
static enum
  {
    CHECK_ORDER_DEFAULT,
    CHECK_ORDER_ENABLED,
    CHECK_ORDER_DISABLED
  } check_input_order;
151
/* getopt_long return values for long options that have no single-letter
   equivalent.  They start above CHAR_MAX so they cannot collide with
   any option character.  */
enum
{
  CHECK_ORDER_OPTION = CHAR_MAX + 1,
  NOCHECK_ORDER_OPTION,
  HEADER_LINE_OPTION
};
158
159
/* Long options accepted by main's getopt_long loop.  None of them
   takes an argument.  */
static struct option const longopts[] =
{
  {"ignore-case", no_argument, NULL, 'i'},
  {"check-order", no_argument, NULL, CHECK_ORDER_OPTION},
  {"nocheck-order", no_argument, NULL, NOCHECK_ORDER_OPTION},
  {"zero-terminated", no_argument, NULL, 'z'},
  {"header", no_argument, NULL, HEADER_LINE_OPTION},
  {GETOPT_HELP_OPTION_DECL},
  {GETOPT_VERSION_OPTION_DECL},
  {NULL, 0, NULL, 0}
};
171
/* Used to print non-joining lines: a line with no fields that stands in
   for the missing side when an unpairable line is printed.  prjoin
   recognizes it by address.  */
static struct line uni_blank;

/* If true, ignore case when comparing join fields.  */
static bool ignore_case;

/* If true, treat the first line of each file as column headers --
   join them without checking for ordering.  */
static bool join_header_lines;

/* The character marking end of line.  Default to \n; -z sets it to NUL.  */
static char eolchar = '\n';
184
185 void
usage(int status)186 usage (int status)
187 {
188 if (status != EXIT_SUCCESS)
189 emit_try_help ();
190 else
191 {
192 printf (_("\
193 Usage: %s [OPTION]... FILE1 FILE2\n\
194 "),
195 program_name);
196 fputs (_("\
197 For each pair of input lines with identical join fields, write a line to\n\
198 standard output. The default join field is the first, delimited by blanks.\
199 \n\
200 "), stdout);
201 fputs (_("\
202 \n\
203 When FILE1 or FILE2 (not both) is -, read standard input.\n\
204 "), stdout);
205 fputs (_("\
206 \n\
207 -a FILENUM also print unpairable lines from file FILENUM, where\n\
208 FILENUM is 1 or 2, corresponding to FILE1 or FILE2\n\
209 -e EMPTY replace missing input fields with EMPTY\n\
210 "), stdout);
211 fputs (_("\
212 -i, --ignore-case ignore differences in case when comparing fields\n\
213 -j FIELD equivalent to '-1 FIELD -2 FIELD'\n\
214 -o FORMAT obey FORMAT while constructing output line\n\
215 -t CHAR use CHAR as input and output field separator\n\
216 "), stdout);
217 fputs (_("\
218 -v FILENUM like -a FILENUM, but suppress joined output lines\n\
219 -1 FIELD join on this FIELD of file 1\n\
220 -2 FIELD join on this FIELD of file 2\n\
221 --check-order check that the input is correctly sorted, even\n\
222 if all input lines are pairable\n\
223 --nocheck-order do not check that the input is correctly sorted\n\
224 --header treat the first line in each file as field headers,\n\
225 print them without trying to pair them\n\
226 "), stdout);
227 fputs (_("\
228 -z, --zero-terminated line delimiter is NUL, not newline\n\
229 "), stdout);
230 fputs (HELP_OPTION_DESCRIPTION, stdout);
231 fputs (VERSION_OPTION_DESCRIPTION, stdout);
232 fputs (_("\
233 \n\
234 Unless -t CHAR is given, leading blanks separate fields and are ignored,\n\
235 else fields are separated by CHAR. Any FIELD is a field number counted\n\
236 from 1. FORMAT is one or more comma or blank separated specifications,\n\
237 each being 'FILENUM.FIELD' or '0'. Default FORMAT outputs the join field,\n\
238 the remaining fields from FILE1, the remaining fields from FILE2, all\n\
239 separated by CHAR. If FORMAT is the keyword 'auto', then the first\n\
240 line of each file determines the number of fields output for each line.\n\
241 \n\
242 Important: FILE1 and FILE2 must be sorted on the join fields.\n\
243 E.g., use \"sort -k 1b,1\" if 'join' has no options,\n\
244 or use \"join -t ''\" if 'sort' has no options.\n\
245 Note, comparisons honor the rules specified by 'LC_COLLATE'.\n\
246 If the input is not sorted and some lines cannot be joined, a\n\
247 warning message will be given.\n\
248 "), stdout);
249 emit_ancillary_info (PROGRAM_NAME);
250 }
251 exit (status);
252 }
253
254 /* Record a field in LINE, with location FIELD and size LEN. */
255
256 static void
extract_field(struct line * line,char * field,size_t len)257 extract_field (struct line *line, char *field, size_t len)
258 {
259 if (line->nfields >= line->nfields_allocated)
260 {
261 line->fields = X2NREALLOC (line->fields, &line->nfields_allocated);
262 }
263 line->fields[line->nfields].beg = field;
264 line->fields[line->nfields].len = len;
265 ++(line->nfields);
266 }
267
268 /* Fill in the 'fields' structure in LINE. */
269
270 static void
xfields(struct line * line)271 xfields (struct line *line)
272 {
273 char *ptr = line->buf.buffer;
274 char const *lim = ptr + line->buf.length - 1;
275
276 if (ptr == lim)
277 return;
278
279 if (0 <= tab && tab != '\n')
280 {
281 char *sep;
282 for (; (sep = memchr (ptr, tab, lim - ptr)) != NULL; ptr = sep + 1)
283 extract_field (line, ptr, sep - ptr);
284 }
285 else if (tab < 0)
286 {
287 /* Skip leading blanks before the first field. */
288 while (field_sep (*ptr))
289 if (++ptr == lim)
290 return;
291
292 do
293 {
294 char *sep;
295 for (sep = ptr + 1; sep != lim && ! field_sep (*sep); sep++)
296 continue;
297 extract_field (line, ptr, sep - ptr);
298 if (sep == lim)
299 return;
300 for (ptr = sep + 1; ptr != lim && field_sep (*ptr); ptr++)
301 continue;
302 }
303 while (ptr != lim);
304 }
305
306 extract_field (line, ptr, lim - ptr);
307 }
308
309 static void
freeline(struct line * line)310 freeline (struct line *line)
311 {
312 if (line == NULL)
313 return;
314 free (line->fields);
315 line->fields = NULL;
316 free (line->buf.buffer);
317 line->buf.buffer = NULL;
318 }
319
320 /* Return <0 if the join field in LINE1 compares less than the one in LINE2;
321 >0 if it compares greater; 0 if it compares equal.
322 Report an error and exit if the comparison fails.
323 Use join fields JF_1 and JF_2 respectively. */
324
325 static int
keycmp(struct line const * line1,struct line const * line2,size_t jf_1,size_t jf_2)326 keycmp (struct line const *line1, struct line const *line2,
327 size_t jf_1, size_t jf_2)
328 {
329 /* Start of field to compare in each file. */
330 char *beg1;
331 char *beg2;
332
333 size_t len1;
334 size_t len2; /* Length of fields to compare. */
335 int diff;
336
337 if (jf_1 < line1->nfields)
338 {
339 beg1 = line1->fields[jf_1].beg;
340 len1 = line1->fields[jf_1].len;
341 }
342 else
343 {
344 beg1 = NULL;
345 len1 = 0;
346 }
347
348 if (jf_2 < line2->nfields)
349 {
350 beg2 = line2->fields[jf_2].beg;
351 len2 = line2->fields[jf_2].len;
352 }
353 else
354 {
355 beg2 = NULL;
356 len2 = 0;
357 }
358
359 if (len1 == 0)
360 return len2 == 0 ? 0 : -1;
361 if (len2 == 0)
362 return 1;
363
364 if (ignore_case)
365 {
366 /* FIXME: ignore_case does not work with NLS (in particular,
367 with multibyte chars). */
368 diff = memcasecmp (beg1, beg2, MIN (len1, len2));
369 }
370 else
371 {
372 if (hard_LC_COLLATE)
373 return xmemcoll (beg1, len1, beg2, len2);
374 diff = memcmp (beg1, beg2, MIN (len1, len2));
375 }
376
377 if (diff)
378 return diff;
379 return len1 < len2 ? -1 : len1 != len2;
380 }
381
382 /* Check that successive input lines PREV and CURRENT from input file
383 WHATFILE are presented in order, unless the user may be relying on
384 the GNU extension that input lines may be out of order if no input
385 lines are unpairable.
386
387 If the user specified --nocheck-order, the check is not made.
388 If the user specified --check-order, the problem is fatal.
389 Otherwise (the default), the message is simply a warning.
390
391 A message is printed at most once per input file. */
392
393 static void
check_order(const struct line * prev,const struct line * current,int whatfile)394 check_order (const struct line *prev,
395 const struct line *current,
396 int whatfile)
397 {
398 if (check_input_order != CHECK_ORDER_DISABLED
399 && ((check_input_order == CHECK_ORDER_ENABLED) || seen_unpairable))
400 {
401 if (!issued_disorder_warning[whatfile-1])
402 {
403 size_t join_field = whatfile == 1 ? join_field_1 : join_field_2;
404 if (keycmp (prev, current, join_field, join_field) > 0)
405 {
406 /* Exclude any trailing newline. */
407 size_t len = current->buf.length;
408 if (0 < len && current->buf.buffer[len - 1] == '\n')
409 --len;
410
411 /* If the offending line is longer than INT_MAX, output
412 only the first INT_MAX bytes in this diagnostic. */
413 len = MIN (INT_MAX, len);
414
415 error ((check_input_order == CHECK_ORDER_ENABLED
416 ? EXIT_FAILURE : 0),
417 0, _("%s:%"PRIuMAX": is not sorted: %.*s"),
418 g_names[whatfile - 1], line_no[whatfile - 1],
419 (int) len, current->buf.buffer);
420
421 /* If we get to here, the message was merely a warning.
422 Arrange to issue it only once per file. */
423 issued_disorder_warning[whatfile-1] = true;
424 }
425 }
426 }
427 }
428
/* Mark LINE as having no fields so that it can be refilled.  Only the
   field count is cleared; the field array and text buffer stay
   allocated for reuse.  */
static inline void
reset_line (struct line *line)
{
  line->nfields = 0;
}
434
435 static struct line *
init_linep(struct line ** linep)436 init_linep (struct line **linep)
437 {
438 struct line *line = xcalloc (1, sizeof *line);
439 *linep = line;
440 return line;
441 }
442
443 /* Read a line from FP into LINE and split it into fields.
444 Return true if successful. */
445
446 static bool
get_line(FILE * fp,struct line ** linep,int which)447 get_line (FILE *fp, struct line **linep, int which)
448 {
449 struct line *line = *linep;
450
451 if (line == prevline[which - 1])
452 {
453 SWAPLINES (line, spareline[which - 1]);
454 *linep = line;
455 }
456
457 if (line)
458 reset_line (line);
459 else
460 line = init_linep (linep);
461
462 if (! readlinebuffer_delim (&line->buf, fp, eolchar))
463 {
464 if (ferror (fp))
465 die (EXIT_FAILURE, errno, _("read error"));
466 freeline (line);
467 return false;
468 }
469 ++line_no[which - 1];
470
471 xfields (line);
472
473 if (prevline[which - 1])
474 check_order (prevline[which - 1], line, which);
475
476 prevline[which - 1] = line;
477 return true;
478 }
479
480 static void
free_spareline(void)481 free_spareline (void)
482 {
483 for (size_t i = 0; i < ARRAY_CARDINALITY (spareline); i++)
484 {
485 if (spareline[i])
486 {
487 freeline (spareline[i]);
488 free (spareline[i]);
489 }
490 }
491 }
492
493 static void
initseq(struct seq * seq)494 initseq (struct seq *seq)
495 {
496 seq->count = 0;
497 seq->alloc = 0;
498 seq->lines = NULL;
499 }
500
501 /* Read a line from FP and add it to SEQ. Return true if successful. */
502
503 static bool
getseq(FILE * fp,struct seq * seq,int whichfile)504 getseq (FILE *fp, struct seq *seq, int whichfile)
505 {
506 if (seq->count == seq->alloc)
507 {
508 seq->lines = X2NREALLOC (seq->lines, &seq->alloc);
509 for (size_t i = seq->count; i < seq->alloc; i++)
510 seq->lines[i] = NULL;
511 }
512
513 if (get_line (fp, &seq->lines[seq->count], whichfile))
514 {
515 ++seq->count;
516 return true;
517 }
518 return false;
519 }
520
/* Read a line from FP and add it to SEQ, as the first item if FIRST is
   true, else as the next.  WHICHFILE (1 or 2) identifies the input
   file.  Return true if a line was read.  When FIRST is true and no
   line could be read, SEQ is left with a count of zero.  */
static bool
advance_seq (FILE *fp, struct seq *seq, bool first, int whichfile)
{
  /* Restarting only resets the count; existing line objects remain in
     the array and are reused by later reads.  */
  if (first)
    seq->count = 0;

  return getseq (fp, seq, whichfile);
}
531
532 static void
delseq(struct seq * seq)533 delseq (struct seq *seq)
534 {
535 for (size_t i = 0; i < seq->alloc; i++)
536 {
537 freeline (seq->lines[i]);
538 free (seq->lines[i]);
539 }
540 free (seq->lines);
541 }
542
543
544 /* Print field N of LINE if it exists and is nonempty, otherwise
545 'empty_filler' if it is nonempty. */
546
547 static void
prfield(size_t n,struct line const * line)548 prfield (size_t n, struct line const *line)
549 {
550 size_t len;
551
552 if (n < line->nfields)
553 {
554 len = line->fields[n].len;
555 if (len)
556 fwrite (line->fields[n].beg, 1, len, stdout);
557 else if (empty_filler)
558 fputs (empty_filler, stdout);
559 }
560 else if (empty_filler)
561 fputs (empty_filler, stdout);
562 }
563
564 /* Output all the fields in line, other than the join field. */
565
566 static void
prfields(struct line const * line,size_t join_field,size_t autocount)567 prfields (struct line const *line, size_t join_field, size_t autocount)
568 {
569 size_t i;
570 size_t nfields = autoformat ? autocount : line->nfields;
571 char output_separator = tab < 0 ? ' ' : tab;
572
573 for (i = 0; i < join_field && i < nfields; ++i)
574 {
575 putchar (output_separator);
576 prfield (i, line);
577 }
578 for (i = join_field + 1; i < nfields; ++i)
579 {
580 putchar (output_separator);
581 prfield (i, line);
582 }
583 }
584
585 /* Print the join of LINE1 and LINE2. */
586
587 static void
prjoin(struct line const * line1,struct line const * line2)588 prjoin (struct line const *line1, struct line const *line2)
589 {
590 const struct outlist *outlist;
591 char output_separator = tab < 0 ? ' ' : tab;
592 size_t field;
593 struct line const *line;
594
595 outlist = outlist_head.next;
596 if (outlist)
597 {
598 const struct outlist *o;
599
600 o = outlist;
601 while (1)
602 {
603 if (o->file == 0)
604 {
605 if (line1 == &uni_blank)
606 {
607 line = line2;
608 field = join_field_2;
609 }
610 else
611 {
612 line = line1;
613 field = join_field_1;
614 }
615 }
616 else
617 {
618 line = (o->file == 1 ? line1 : line2);
619 field = o->field;
620 }
621 prfield (field, line);
622 o = o->next;
623 if (o == NULL)
624 break;
625 putchar (output_separator);
626 }
627 putchar (eolchar);
628 }
629 else
630 {
631 if (line1 == &uni_blank)
632 {
633 line = line2;
634 field = join_field_2;
635 }
636 else
637 {
638 line = line1;
639 field = join_field_1;
640 }
641
642 /* Output the join field. */
643 prfield (field, line);
644
645 /* Output other fields. */
646 prfields (line1, join_field_1, autocount_1);
647 prfields (line2, join_field_2, autocount_2);
648
649 putchar (eolchar);
650 }
651 }
652
653 /* Print the join of the files in FP1 and FP2. */
654
655 static void
join(FILE * fp1,FILE * fp2)656 join (FILE *fp1, FILE *fp2)
657 {
658 struct seq seq1, seq2;
659 int diff;
660 bool eof1, eof2;
661
662 fadvise (fp1, FADVISE_SEQUENTIAL);
663 fadvise (fp2, FADVISE_SEQUENTIAL);
664
665 /* Read the first line of each file. */
666 initseq (&seq1);
667 getseq (fp1, &seq1, 1);
668 initseq (&seq2);
669 getseq (fp2, &seq2, 2);
670
671 if (autoformat)
672 {
673 autocount_1 = seq1.count ? seq1.lines[0]->nfields : 0;
674 autocount_2 = seq2.count ? seq2.lines[0]->nfields : 0;
675 }
676
677 if (join_header_lines && (seq1.count || seq2.count))
678 {
679 struct line const *hline1 = seq1.count ? seq1.lines[0] : &uni_blank;
680 struct line const *hline2 = seq2.count ? seq2.lines[0] : &uni_blank;
681 prjoin (hline1, hline2);
682 prevline[0] = NULL;
683 prevline[1] = NULL;
684 if (seq1.count)
685 advance_seq (fp1, &seq1, true, 1);
686 if (seq2.count)
687 advance_seq (fp2, &seq2, true, 2);
688 }
689
690 while (seq1.count && seq2.count)
691 {
692 diff = keycmp (seq1.lines[0], seq2.lines[0],
693 join_field_1, join_field_2);
694 if (diff < 0)
695 {
696 if (print_unpairables_1)
697 prjoin (seq1.lines[0], &uni_blank);
698 advance_seq (fp1, &seq1, true, 1);
699 seen_unpairable = true;
700 continue;
701 }
702 if (diff > 0)
703 {
704 if (print_unpairables_2)
705 prjoin (&uni_blank, seq2.lines[0]);
706 advance_seq (fp2, &seq2, true, 2);
707 seen_unpairable = true;
708 continue;
709 }
710
711 /* Keep reading lines from file1 as long as they continue to
712 match the current line from file2. */
713 eof1 = false;
714 do
715 if (!advance_seq (fp1, &seq1, false, 1))
716 {
717 eof1 = true;
718 ++seq1.count;
719 break;
720 }
721 while (!keycmp (seq1.lines[seq1.count - 1], seq2.lines[0],
722 join_field_1, join_field_2));
723
724 /* Keep reading lines from file2 as long as they continue to
725 match the current line from file1. */
726 eof2 = false;
727 do
728 if (!advance_seq (fp2, &seq2, false, 2))
729 {
730 eof2 = true;
731 ++seq2.count;
732 break;
733 }
734 while (!keycmp (seq1.lines[0], seq2.lines[seq2.count - 1],
735 join_field_1, join_field_2));
736
737 if (print_pairables)
738 {
739 for (size_t i = 0; i < seq1.count - 1; ++i)
740 {
741 size_t j;
742 for (j = 0; j < seq2.count - 1; ++j)
743 prjoin (seq1.lines[i], seq2.lines[j]);
744 }
745 }
746
747 if (!eof1)
748 {
749 SWAPLINES (seq1.lines[0], seq1.lines[seq1.count - 1]);
750 seq1.count = 1;
751 }
752 else
753 seq1.count = 0;
754
755 if (!eof2)
756 {
757 SWAPLINES (seq2.lines[0], seq2.lines[seq2.count - 1]);
758 seq2.count = 1;
759 }
760 else
761 seq2.count = 0;
762 }
763
764 /* If the user did not specify --nocheck-order, then we read the
765 tail ends of both inputs to verify that they are in order. We
766 skip the rest of the tail once we have issued a warning for that
767 file, unless we actually need to print the unpairable lines. */
768 struct line *line = NULL;
769 bool checktail = false;
770
771 if (check_input_order != CHECK_ORDER_DISABLED
772 && !(issued_disorder_warning[0] && issued_disorder_warning[1]))
773 checktail = true;
774
775 if ((print_unpairables_1 || checktail) && seq1.count)
776 {
777 if (print_unpairables_1)
778 prjoin (seq1.lines[0], &uni_blank);
779 if (seq2.count)
780 seen_unpairable = true;
781 while (get_line (fp1, &line, 1))
782 {
783 if (print_unpairables_1)
784 prjoin (line, &uni_blank);
785 if (issued_disorder_warning[0] && !print_unpairables_1)
786 break;
787 }
788 }
789
790 if ((print_unpairables_2 || checktail) && seq2.count)
791 {
792 if (print_unpairables_2)
793 prjoin (&uni_blank, seq2.lines[0]);
794 if (seq1.count)
795 seen_unpairable = true;
796 while (get_line (fp2, &line, 2))
797 {
798 if (print_unpairables_2)
799 prjoin (&uni_blank, line);
800 if (issued_disorder_warning[1] && !print_unpairables_2)
801 break;
802 }
803 }
804
805 freeline (line);
806 free (line);
807
808 delseq (&seq1);
809 delseq (&seq2);
810 }
811
812 /* Add a field spec for field FIELD of file FILE to 'outlist'. */
813
814 static void
add_field(int file,size_t field)815 add_field (int file, size_t field)
816 {
817 struct outlist *o;
818
819 assert (file == 0 || file == 1 || file == 2);
820 assert (file != 0 || field == 0);
821
822 o = xmalloc (sizeof *o);
823 o->file = file;
824 o->field = field;
825 o->next = NULL;
826
827 /* Add to the end of the list so the fields are in the right order. */
828 outlist_end->next = o;
829 outlist_end = o;
830 }
831
832 /* Convert a string of decimal digits, STR (the 1-based join field number),
833 to an integral value. Upon successful conversion, return one less
834 (the zero-based field number). Silently convert too-large values
835 to SIZE_MAX - 1. Otherwise, if a value cannot be converted, give a
836 diagnostic and exit. */
837
838 static size_t
string_to_join_field(char const * str)839 string_to_join_field (char const *str)
840 {
841 size_t result;
842 unsigned long int val;
843 verify (SIZE_MAX <= ULONG_MAX);
844
845 strtol_error s_err = xstrtoul (str, NULL, 10, &val, "");
846 if (s_err == LONGINT_OVERFLOW || (s_err == LONGINT_OK && SIZE_MAX < val))
847 val = SIZE_MAX;
848 else if (s_err != LONGINT_OK || val == 0)
849 die (EXIT_FAILURE, 0, _("invalid field number: %s"), quote (str));
850
851 result = val - 1;
852
853 return result;
854 }
855
/* Decode a single field specifier string S into *FILE_INDEX and
   *FIELD_INDEX.  S is either "0" (the join field; both outputs are set
   to 0) or "FILENUM.FIELD" with FILENUM 1 or 2 and FIELD a 1-based
   field number; *FIELD_INDEX receives the zero-based index.
   If S is invalid, give a diagnostic and exit; nothing is returned.  */

static void
decode_field_spec (const char *s, int *file_index, size_t *field_index)
{
  char which = s[0];

  if (which == '0')
    {
      /* '0' must be all alone -- no '.FIELD'.  */
      if (s[1] != '\0')
        die (EXIT_FAILURE, 0, _("invalid field specifier: %s"), quote (s));
      *file_index = 0;
      *field_index = 0;
    }
  else if (which == '1' || which == '2')
    {
      if (s[1] != '.')
        die (EXIT_FAILURE, 0, _("invalid field specifier: %s"), quote (s));
      *file_index = which - '0';
      *field_index = string_to_join_field (s + 2);
    }
  else
    {
      die (EXIT_FAILURE, 0,
           _("invalid file number in field spec: %s"), quote (s));

      /* Tell gcc -W -Wall that we can't get beyond this point.
         This avoids a warning (otherwise legit) that the caller's copies
         of *file_index and *field_index might be used uninitialized.  */
      abort ();
    }
}
896
/* Add each field spec in STR to 'outlist'.  Specs are separated by a
   single comma, space or tab.  STR is modified in place: every
   separator found is overwritten with a NUL byte.  */

static void
add_field_list (char *str)
{
  char *item = str;

  for (;;)
    {
      int file_index;
      size_t field_index;

      /* Terminate the current item at the next separator, if any.  */
      char *delim = strpbrk (item, ", \t");
      if (delim)
        *delim = '\0';

      decode_field_spec (item, &file_index, &field_index);
      add_field (file_index, field_index);

      if (!delim)
        break;
      item = delim + 1;
    }
}
918
/* Store VAL in the join field variable *VAR.  If *VAR already holds a
   different value (anything other than the "unset" marker SIZE_MAX),
   report the two conflicting 1-based field numbers and exit.  */

static void
set_join_field (size_t *var, size_t val)
{
  size_t old = *var;

  if (old != SIZE_MAX && old != val)
    die (EXIT_FAILURE, 0,
         _("incompatible join fields %lu, %lu"),
         (unsigned long int) (old + 1), (unsigned long int) (val + 1));

  *var = val;
}
934
/* Status of command-line arguments: how sure we are that a non-option
   argument really is a file operand.  */

enum operand_status
  {
    /* This argument must be an operand, i.e., one of the files to be
       joined.  */
    MUST_BE_OPERAND,

    /* This might be the argument of the preceding -j1 or -j2 option,
       or it might be an operand.  MIGHT_BE_J2_ARG must immediately
       follow MIGHT_BE_J1_ARG: main computes the status as
       MIGHT_BE_J1_ARG plus a 0/1 flag.  */
    MIGHT_BE_J1_ARG,
    MIGHT_BE_J2_ARG,

    /* This might be the argument of the preceding -o option, or it might be
       an operand.  */
    MIGHT_BE_O_ARG
  };
952
953 /* Add NAME to the array of input file NAMES with operand statuses
954 OPERAND_STATUS; currently there are NFILES names in the list. */
955
956 static void
add_file_name(char * name,char * names[2],int operand_status[2],int joption_count[2],int * nfiles,int * prev_optc_status,int * optc_status)957 add_file_name (char *name, char *names[2],
958 int operand_status[2], int joption_count[2], int *nfiles,
959 int *prev_optc_status, int *optc_status)
960 {
961 int n = *nfiles;
962
963 if (n == 2)
964 {
965 bool op0 = (operand_status[0] == MUST_BE_OPERAND);
966 char *arg = names[op0];
967 switch (operand_status[op0])
968 {
969 case MUST_BE_OPERAND:
970 error (0, 0, _("extra operand %s"), quoteaf (name));
971 usage (EXIT_FAILURE);
972
973 case MIGHT_BE_J1_ARG:
974 joption_count[0]--;
975 set_join_field (&join_field_1, string_to_join_field (arg));
976 break;
977
978 case MIGHT_BE_J2_ARG:
979 joption_count[1]--;
980 set_join_field (&join_field_2, string_to_join_field (arg));
981 break;
982
983 case MIGHT_BE_O_ARG:
984 add_field_list (arg);
985 break;
986 }
987 if (!op0)
988 {
989 operand_status[0] = operand_status[1];
990 names[0] = names[1];
991 }
992 n = 1;
993 }
994
995 operand_status[n] = *prev_optc_status;
996 names[n] = name;
997 *nfiles = n + 1;
998 if (*prev_optc_status == MIGHT_BE_O_ARG)
999 *optc_status = MIGHT_BE_O_ARG;
1000 }
1001
1002 int
main(int argc,char ** argv)1003 main (int argc, char **argv)
1004 {
1005 int optc_status;
1006 int prev_optc_status = MUST_BE_OPERAND;
1007 int operand_status[2];
1008 int joption_count[2] = { 0, 0 };
1009 FILE *fp1, *fp2;
1010 int optc;
1011 int nfiles = 0;
1012 int i;
1013
1014 initialize_main (&argc, &argv);
1015 set_program_name (argv[0]);
1016 setlocale (LC_ALL, "");
1017 bindtextdomain (PACKAGE, LOCALEDIR);
1018 textdomain (PACKAGE);
1019 hard_LC_COLLATE = hard_locale (LC_COLLATE);
1020
1021 atexit (close_stdout);
1022 atexit (free_spareline);
1023
1024 print_pairables = true;
1025 seen_unpairable = false;
1026 issued_disorder_warning[0] = issued_disorder_warning[1] = false;
1027 check_input_order = CHECK_ORDER_DEFAULT;
1028
1029 while ((optc = getopt_long (argc, argv, "-a:e:i1:2:j:o:t:v:z",
1030 longopts, NULL))
1031 != -1)
1032 {
1033 optc_status = MUST_BE_OPERAND;
1034
1035 switch (optc)
1036 {
1037 case 'v':
1038 print_pairables = false;
1039 FALLTHROUGH;
1040
1041 case 'a':
1042 {
1043 unsigned long int val;
1044 if (xstrtoul (optarg, NULL, 10, &val, "") != LONGINT_OK
1045 || (val != 1 && val != 2))
1046 die (EXIT_FAILURE, 0,
1047 _("invalid field number: %s"), quote (optarg));
1048 if (val == 1)
1049 print_unpairables_1 = true;
1050 else
1051 print_unpairables_2 = true;
1052 }
1053 break;
1054
1055 case 'e':
1056 if (empty_filler && ! STREQ (empty_filler, optarg))
1057 die (EXIT_FAILURE, 0,
1058 _("conflicting empty-field replacement strings"));
1059 empty_filler = optarg;
1060 break;
1061
1062 case 'i':
1063 ignore_case = true;
1064 break;
1065
1066 case '1':
1067 set_join_field (&join_field_1, string_to_join_field (optarg));
1068 break;
1069
1070 case '2':
1071 set_join_field (&join_field_2, string_to_join_field (optarg));
1072 break;
1073
1074 case 'j':
1075 if ((optarg[0] == '1' || optarg[0] == '2') && !optarg[1]
1076 && optarg == argv[optind - 1] + 2)
1077 {
1078 /* The argument was either "-j1" or "-j2". */
1079 bool is_j2 = (optarg[0] == '2');
1080 joption_count[is_j2]++;
1081 optc_status = MIGHT_BE_J1_ARG + is_j2;
1082 }
1083 else
1084 {
1085 set_join_field (&join_field_1, string_to_join_field (optarg));
1086 set_join_field (&join_field_2, join_field_1);
1087 }
1088 break;
1089
1090 case 'o':
1091 if (STREQ (optarg, "auto"))
1092 autoformat = true;
1093 else
1094 {
1095 add_field_list (optarg);
1096 optc_status = MIGHT_BE_O_ARG;
1097 }
1098 break;
1099
1100 case 't':
1101 {
1102 unsigned char newtab = optarg[0];
1103 if (! newtab)
1104 newtab = '\n'; /* '' => process the whole line. */
1105 else if (optarg[1])
1106 {
1107 if (STREQ (optarg, "\\0"))
1108 newtab = '\0';
1109 else
1110 die (EXIT_FAILURE, 0, _("multi-character tab %s"),
1111 quote (optarg));
1112 }
1113 if (0 <= tab && tab != newtab)
1114 die (EXIT_FAILURE, 0, _("incompatible tabs"));
1115 tab = newtab;
1116 }
1117 break;
1118
1119 case 'z':
1120 eolchar = 0;
1121 break;
1122
1123 case NOCHECK_ORDER_OPTION:
1124 check_input_order = CHECK_ORDER_DISABLED;
1125 break;
1126
1127 case CHECK_ORDER_OPTION:
1128 check_input_order = CHECK_ORDER_ENABLED;
1129 break;
1130
1131 case 1: /* Non-option argument. */
1132 add_file_name (optarg, g_names, operand_status, joption_count,
1133 &nfiles, &prev_optc_status, &optc_status);
1134 break;
1135
1136 case HEADER_LINE_OPTION:
1137 join_header_lines = true;
1138 break;
1139
1140 case_GETOPT_HELP_CHAR;
1141
1142 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
1143
1144 default:
1145 usage (EXIT_FAILURE);
1146 }
1147
1148 prev_optc_status = optc_status;
1149 }
1150
1151 /* Process any operands after "--". */
1152 prev_optc_status = MUST_BE_OPERAND;
1153 while (optind < argc)
1154 add_file_name (argv[optind++], g_names, operand_status, joption_count,
1155 &nfiles, &prev_optc_status, &optc_status);
1156
1157 if (nfiles != 2)
1158 {
1159 if (nfiles == 0)
1160 error (0, 0, _("missing operand"));
1161 else
1162 error (0, 0, _("missing operand after %s"), quote (argv[argc - 1]));
1163 usage (EXIT_FAILURE);
1164 }
1165
1166 /* If "-j1" was specified and it turns out not to have had an argument,
1167 treat it as "-j 1". Likewise for -j2. */
1168 for (i = 0; i < 2; i++)
1169 if (joption_count[i] != 0)
1170 {
1171 set_join_field (&join_field_1, i);
1172 set_join_field (&join_field_2, i);
1173 }
1174
1175 if (join_field_1 == SIZE_MAX)
1176 join_field_1 = 0;
1177 if (join_field_2 == SIZE_MAX)
1178 join_field_2 = 0;
1179
1180 fp1 = STREQ (g_names[0], "-") ? stdin : fopen (g_names[0], "r");
1181 if (!fp1)
1182 die (EXIT_FAILURE, errno, "%s", quotef (g_names[0]));
1183 fp2 = STREQ (g_names[1], "-") ? stdin : fopen (g_names[1], "r");
1184 if (!fp2)
1185 die (EXIT_FAILURE, errno, "%s", quotef (g_names[1]));
1186 if (fp1 == fp2)
1187 die (EXIT_FAILURE, errno, _("both files cannot be standard input"));
1188 join (fp1, fp2);
1189
1190 if (fclose (fp1) != 0)
1191 die (EXIT_FAILURE, errno, "%s", quotef (g_names[0]));
1192 if (fclose (fp2) != 0)
1193 die (EXIT_FAILURE, errno, "%s", quotef (g_names[1]));
1194
1195 if (issued_disorder_warning[0] || issued_disorder_warning[1])
1196 return EXIT_FAILURE;
1197 else
1198 return EXIT_SUCCESS;
1199 }
1200