1 /* join - join lines of two files on a common field
2    Copyright (C) 1991-2018 Free Software Foundation, Inc.
3 
4    This program is free software: you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation, either version 3 of the License, or
7    (at your option) any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program.  If not, see <https://www.gnu.org/licenses/>.
16 
17    Written by Mike Haertel, mike@gnu.ai.mit.edu.  */
18 
19 #include <config.h>
20 
21 #include <assert.h>
22 #include <sys/types.h>
23 #include <getopt.h>
24 
25 #include "system.h"
26 #include "die.h"
27 #include "error.h"
28 #include "fadvise.h"
29 #include "hard-locale.h"
30 #include "linebuffer.h"
31 #include "memcasecmp.h"
32 #include "quote.h"
33 #include "stdio--.h"
34 #include "xmemcoll.h"
35 #include "xstrtol.h"
36 #include "argmatch.h"
37 
38 /* The official name of this program (e.g., no 'g' prefix).  */
39 #define PROGRAM_NAME "join"
40 
41 #define AUTHORS proper_name ("Mike Haertel")
42 
43 #define join system_join
44 
/* Exchange the two 'struct line *' lvalues A and B.
   The expansion is one statement that still needs its terminating
   semicolon at the call site, so the macro can be used as the body of
   an unbraced 'if' that has an 'else'.  The temporary has a
   macro-specific name so that an argument called 'tmp' is not captured.  */
#define SWAPLINES(a, b) do { \
  struct line *swaplines_tmp = (a); \
  (a) = (b); \
  (b) = swaplines_tmp; \
} while (0)
50 
/* One entry in the list of output field specifications.  Entries are
   chained through NEXT in the order in which they are printed.  */
struct outlist
{
  /* Which input the field comes from: 1 for the first file argument,
     2 for the second, or 0 to select the join field.  */
  int file;

  /* Zero-based field index; meaningful only when FILE is 1 or 2.  */
  size_t field;

  /* The following entry, or NULL for the last entry.  */
  struct outlist *next;
};
64 
/* Location of one field inside a line's buffer.  */
struct field
{
  /* Address of the first character of the field.  */
  char *beg;

  /* Number of bytes in the field.  */
  size_t len;
};
71 
72 /* A line read from an input file.  */
73 struct line
74   {
75     struct linebuffer buf;	/* The line itself.  */
76     size_t nfields;		/* Number of elements in 'fields'.  */
77     size_t nfields_allocated;	/* Number of elements allocated for 'fields'. */
78     struct field *fields;
79   };
80 
/* A run of consecutive lines from one file that all share the same
   join field value.  */
struct seq
{
  /* How many entries of LINES are in use.  */
  size_t count;

  /* How many entries LINES has room for.  */
  size_t alloc;

  /* The table of line pointers.  */
  struct line **lines;
};
89 
90 /* The previous line read from each file.  */
91 static struct line *prevline[2] = {NULL, NULL};
92 
93 /* The number of lines read from each file.  */
94 static uintmax_t line_no[2] = {0, 0};
95 
96 /* The input file names.  */
97 static char *g_names[2];
98 
99 /* This provides an extra line buffer for each file.  We need these if we
100    try to read two consecutive lines into the same buffer, since we don't
101    want to overwrite the previous buffer before we check order. */
102 static struct line *spareline[2] = {NULL, NULL};
103 
104 /* True if the LC_COLLATE locale is hard.  */
105 static bool hard_LC_COLLATE;
106 
107 /* If nonzero, print unpairable lines in file 1 or 2.  */
108 static bool print_unpairables_1, print_unpairables_2;
109 
110 /* If nonzero, print pairable lines.  */
111 static bool print_pairables;
112 
113 /* If nonzero, we have seen at least one unpairable line. */
114 static bool seen_unpairable;
115 
116 /* If nonzero, we have warned about disorder in that file. */
117 static bool issued_disorder_warning[2];
118 
119 /* Empty output field filler.  */
120 static char const *empty_filler;
121 
122 /* Whether to ensure the same number of fields are output from each line.  */
123 static bool autoformat;
124 /* The number of fields to output for each line.
125    Only significant when autoformat is true.  */
126 static size_t autocount_1;
127 static size_t autocount_2;
128 
129 /* Field to join on; SIZE_MAX means they haven't been determined yet.  */
130 static size_t join_field_1 = SIZE_MAX;
131 static size_t join_field_2 = SIZE_MAX;
132 
133 /* List of fields to print.  */
134 static struct outlist outlist_head;
135 
136 /* Last element in 'outlist', where a new element can be added.  */
137 static struct outlist *outlist_end = &outlist_head;
138 
139 /* Tab character separating fields.  If negative, fields are separated
140    by any nonempty string of blanks, otherwise by exactly one
141    tab character whose value (when cast to unsigned char) equals TAB.  */
142 static int tab = -1;
143 
144 /* If nonzero, check that the input is correctly ordered. */
145 static enum
146   {
147     CHECK_ORDER_DEFAULT,
148     CHECK_ORDER_ENABLED,
149     CHECK_ORDER_DISABLED
150   } check_input_order;
151 
152 enum
153 {
154   CHECK_ORDER_OPTION = CHAR_MAX + 1,
155   NOCHECK_ORDER_OPTION,
156   HEADER_LINE_OPTION
157 };
158 
159 
160 static struct option const longopts[] =
161 {
162   {"ignore-case", no_argument, NULL, 'i'},
163   {"check-order", no_argument, NULL, CHECK_ORDER_OPTION},
164   {"nocheck-order", no_argument, NULL, NOCHECK_ORDER_OPTION},
165   {"zero-terminated", no_argument, NULL, 'z'},
166   {"header", no_argument, NULL, HEADER_LINE_OPTION},
167   {GETOPT_HELP_OPTION_DECL},
168   {GETOPT_VERSION_OPTION_DECL},
169   {NULL, 0, NULL, 0}
170 };
171 
172 /* Used to print non-joining lines */
173 static struct line uni_blank;
174 
175 /* If nonzero, ignore case when comparing join fields.  */
176 static bool ignore_case;
177 
178 /* If nonzero, treat the first line of each file as column headers --
179    join them without checking for ordering */
180 static bool join_header_lines;
181 
182 /* The character marking end of line. Default to \n. */
183 static char eolchar = '\n';
184 
185 void
usage(int status)186 usage (int status)
187 {
188   if (status != EXIT_SUCCESS)
189     emit_try_help ();
190   else
191     {
192       printf (_("\
193 Usage: %s [OPTION]... FILE1 FILE2\n\
194 "),
195               program_name);
196       fputs (_("\
197 For each pair of input lines with identical join fields, write a line to\n\
198 standard output.  The default join field is the first, delimited by blanks.\
199 \n\
200 "), stdout);
201       fputs (_("\
202 \n\
203 When FILE1 or FILE2 (not both) is -, read standard input.\n\
204 "), stdout);
205       fputs (_("\
206 \n\
207   -a FILENUM        also print unpairable lines from file FILENUM, where\n\
208                       FILENUM is 1 or 2, corresponding to FILE1 or FILE2\n\
209   -e EMPTY          replace missing input fields with EMPTY\n\
210 "), stdout);
211       fputs (_("\
212   -i, --ignore-case  ignore differences in case when comparing fields\n\
213   -j FIELD          equivalent to '-1 FIELD -2 FIELD'\n\
214   -o FORMAT         obey FORMAT while constructing output line\n\
215   -t CHAR           use CHAR as input and output field separator\n\
216 "), stdout);
217       fputs (_("\
218   -v FILENUM        like -a FILENUM, but suppress joined output lines\n\
219   -1 FIELD          join on this FIELD of file 1\n\
220   -2 FIELD          join on this FIELD of file 2\n\
221   --check-order     check that the input is correctly sorted, even\n\
222                       if all input lines are pairable\n\
223   --nocheck-order   do not check that the input is correctly sorted\n\
224   --header          treat the first line in each file as field headers,\n\
225                       print them without trying to pair them\n\
226 "), stdout);
227       fputs (_("\
228   -z, --zero-terminated     line delimiter is NUL, not newline\n\
229 "), stdout);
230       fputs (HELP_OPTION_DESCRIPTION, stdout);
231       fputs (VERSION_OPTION_DESCRIPTION, stdout);
232       fputs (_("\
233 \n\
234 Unless -t CHAR is given, leading blanks separate fields and are ignored,\n\
235 else fields are separated by CHAR.  Any FIELD is a field number counted\n\
236 from 1.  FORMAT is one or more comma or blank separated specifications,\n\
237 each being 'FILENUM.FIELD' or '0'.  Default FORMAT outputs the join field,\n\
238 the remaining fields from FILE1, the remaining fields from FILE2, all\n\
239 separated by CHAR.  If FORMAT is the keyword 'auto', then the first\n\
240 line of each file determines the number of fields output for each line.\n\
241 \n\
242 Important: FILE1 and FILE2 must be sorted on the join fields.\n\
243 E.g., use \"sort -k 1b,1\" if 'join' has no options,\n\
244 or use \"join -t ''\" if 'sort' has no options.\n\
245 Note, comparisons honor the rules specified by 'LC_COLLATE'.\n\
246 If the input is not sorted and some lines cannot be joined, a\n\
247 warning message will be given.\n\
248 "), stdout);
249       emit_ancillary_info (PROGRAM_NAME);
250     }
251   exit (status);
252 }
253 
254 /* Record a field in LINE, with location FIELD and size LEN.  */
255 
256 static void
extract_field(struct line * line,char * field,size_t len)257 extract_field (struct line *line, char *field, size_t len)
258 {
259   if (line->nfields >= line->nfields_allocated)
260     {
261       line->fields = X2NREALLOC (line->fields, &line->nfields_allocated);
262     }
263   line->fields[line->nfields].beg = field;
264   line->fields[line->nfields].len = len;
265   ++(line->nfields);
266 }
267 
268 /* Fill in the 'fields' structure in LINE.  */
269 
270 static void
xfields(struct line * line)271 xfields (struct line *line)
272 {
273   char *ptr = line->buf.buffer;
274   char const *lim = ptr + line->buf.length - 1;
275 
276   if (ptr == lim)
277     return;
278 
279   if (0 <= tab && tab != '\n')
280     {
281       char *sep;
282       for (; (sep = memchr (ptr, tab, lim - ptr)) != NULL; ptr = sep + 1)
283         extract_field (line, ptr, sep - ptr);
284     }
285   else if (tab < 0)
286     {
287       /* Skip leading blanks before the first field.  */
288       while (field_sep (*ptr))
289         if (++ptr == lim)
290           return;
291 
292       do
293         {
294           char *sep;
295           for (sep = ptr + 1; sep != lim && ! field_sep (*sep); sep++)
296             continue;
297           extract_field (line, ptr, sep - ptr);
298           if (sep == lim)
299             return;
300           for (ptr = sep + 1; ptr != lim && field_sep (*ptr); ptr++)
301             continue;
302         }
303       while (ptr != lim);
304     }
305 
306   extract_field (line, ptr, lim - ptr);
307 }
308 
309 static void
freeline(struct line * line)310 freeline (struct line *line)
311 {
312   if (line == NULL)
313     return;
314   free (line->fields);
315   line->fields = NULL;
316   free (line->buf.buffer);
317   line->buf.buffer = NULL;
318 }
319 
320 /* Return <0 if the join field in LINE1 compares less than the one in LINE2;
321    >0 if it compares greater; 0 if it compares equal.
322    Report an error and exit if the comparison fails.
323    Use join fields JF_1 and JF_2 respectively.  */
324 
325 static int
keycmp(struct line const * line1,struct line const * line2,size_t jf_1,size_t jf_2)326 keycmp (struct line const *line1, struct line const *line2,
327         size_t jf_1, size_t jf_2)
328 {
329   /* Start of field to compare in each file.  */
330   char *beg1;
331   char *beg2;
332 
333   size_t len1;
334   size_t len2;		/* Length of fields to compare.  */
335   int diff;
336 
337   if (jf_1 < line1->nfields)
338     {
339       beg1 = line1->fields[jf_1].beg;
340       len1 = line1->fields[jf_1].len;
341     }
342   else
343     {
344       beg1 = NULL;
345       len1 = 0;
346     }
347 
348   if (jf_2 < line2->nfields)
349     {
350       beg2 = line2->fields[jf_2].beg;
351       len2 = line2->fields[jf_2].len;
352     }
353   else
354     {
355       beg2 = NULL;
356       len2 = 0;
357     }
358 
359   if (len1 == 0)
360     return len2 == 0 ? 0 : -1;
361   if (len2 == 0)
362     return 1;
363 
364   if (ignore_case)
365     {
366       /* FIXME: ignore_case does not work with NLS (in particular,
367          with multibyte chars).  */
368       diff = memcasecmp (beg1, beg2, MIN (len1, len2));
369     }
370   else
371     {
372       if (hard_LC_COLLATE)
373         return xmemcoll (beg1, len1, beg2, len2);
374       diff = memcmp (beg1, beg2, MIN (len1, len2));
375     }
376 
377   if (diff)
378     return diff;
379   return len1 < len2 ? -1 : len1 != len2;
380 }
381 
382 /* Check that successive input lines PREV and CURRENT from input file
383    WHATFILE are presented in order, unless the user may be relying on
384    the GNU extension that input lines may be out of order if no input
385    lines are unpairable.
386 
387    If the user specified --nocheck-order, the check is not made.
388    If the user specified --check-order, the problem is fatal.
389    Otherwise (the default), the message is simply a warning.
390 
391    A message is printed at most once per input file. */
392 
393 static void
check_order(const struct line * prev,const struct line * current,int whatfile)394 check_order (const struct line *prev,
395              const struct line *current,
396              int whatfile)
397 {
398   if (check_input_order != CHECK_ORDER_DISABLED
399       && ((check_input_order == CHECK_ORDER_ENABLED) || seen_unpairable))
400     {
401       if (!issued_disorder_warning[whatfile-1])
402         {
403           size_t join_field = whatfile == 1 ? join_field_1 : join_field_2;
404           if (keycmp (prev, current, join_field, join_field) > 0)
405             {
406               /* Exclude any trailing newline. */
407               size_t len = current->buf.length;
408               if (0 < len && current->buf.buffer[len - 1] == '\n')
409                 --len;
410 
411               /* If the offending line is longer than INT_MAX, output
412                  only the first INT_MAX bytes in this diagnostic.  */
413               len = MIN (INT_MAX, len);
414 
415               error ((check_input_order == CHECK_ORDER_ENABLED
416                       ? EXIT_FAILURE : 0),
417                      0, _("%s:%"PRIuMAX": is not sorted: %.*s"),
418                      g_names[whatfile - 1], line_no[whatfile - 1],
419                      (int) len, current->buf.buffer);
420 
421               /* If we get to here, the message was merely a warning.
422                  Arrange to issue it only once per file.  */
423               issued_disorder_warning[whatfile-1] = true;
424             }
425         }
426     }
427 }
428 
429 static inline void
reset_line(struct line * line)430 reset_line (struct line *line)
431 {
432   line->nfields = 0;
433 }
434 
435 static struct line *
init_linep(struct line ** linep)436 init_linep (struct line **linep)
437 {
438   struct line *line = xcalloc (1, sizeof *line);
439   *linep = line;
440   return line;
441 }
442 
443 /* Read a line from FP into LINE and split it into fields.
444    Return true if successful.  */
445 
446 static bool
get_line(FILE * fp,struct line ** linep,int which)447 get_line (FILE *fp, struct line **linep, int which)
448 {
449   struct line *line = *linep;
450 
451   if (line == prevline[which - 1])
452     {
453       SWAPLINES (line, spareline[which - 1]);
454       *linep = line;
455     }
456 
457   if (line)
458     reset_line (line);
459   else
460     line = init_linep (linep);
461 
462   if (! readlinebuffer_delim (&line->buf, fp, eolchar))
463     {
464       if (ferror (fp))
465         die (EXIT_FAILURE, errno, _("read error"));
466       freeline (line);
467       return false;
468     }
469   ++line_no[which - 1];
470 
471   xfields (line);
472 
473   if (prevline[which - 1])
474     check_order (prevline[which - 1], line, which);
475 
476   prevline[which - 1] = line;
477   return true;
478 }
479 
480 static void
free_spareline(void)481 free_spareline (void)
482 {
483   for (size_t i = 0; i < ARRAY_CARDINALITY (spareline); i++)
484     {
485       if (spareline[i])
486         {
487           freeline (spareline[i]);
488           free (spareline[i]);
489         }
490     }
491 }
492 
493 static void
initseq(struct seq * seq)494 initseq (struct seq *seq)
495 {
496   seq->count = 0;
497   seq->alloc = 0;
498   seq->lines = NULL;
499 }
500 
501 /* Read a line from FP and add it to SEQ.  Return true if successful.  */
502 
503 static bool
getseq(FILE * fp,struct seq * seq,int whichfile)504 getseq (FILE *fp, struct seq *seq, int whichfile)
505 {
506   if (seq->count == seq->alloc)
507     {
508       seq->lines = X2NREALLOC (seq->lines, &seq->alloc);
509       for (size_t i = seq->count; i < seq->alloc; i++)
510         seq->lines[i] = NULL;
511     }
512 
513   if (get_line (fp, &seq->lines[seq->count], whichfile))
514     {
515       ++seq->count;
516       return true;
517     }
518   return false;
519 }
520 
521 /* Read a line from FP and add it to SEQ, as the first item if FIRST is
522    true, else as the next.  */
523 static bool
advance_seq(FILE * fp,struct seq * seq,bool first,int whichfile)524 advance_seq (FILE *fp, struct seq *seq, bool first, int whichfile)
525 {
526   if (first)
527     seq->count = 0;
528 
529   return getseq (fp, seq, whichfile);
530 }
531 
532 static void
delseq(struct seq * seq)533 delseq (struct seq *seq)
534 {
535   for (size_t i = 0; i < seq->alloc; i++)
536     {
537       freeline (seq->lines[i]);
538       free (seq->lines[i]);
539     }
540   free (seq->lines);
541 }
542 
543 
544 /* Print field N of LINE if it exists and is nonempty, otherwise
545    'empty_filler' if it is nonempty.  */
546 
547 static void
prfield(size_t n,struct line const * line)548 prfield (size_t n, struct line const *line)
549 {
550   size_t len;
551 
552   if (n < line->nfields)
553     {
554       len = line->fields[n].len;
555       if (len)
556         fwrite (line->fields[n].beg, 1, len, stdout);
557       else if (empty_filler)
558         fputs (empty_filler, stdout);
559     }
560   else if (empty_filler)
561     fputs (empty_filler, stdout);
562 }
563 
564 /* Output all the fields in line, other than the join field.  */
565 
566 static void
prfields(struct line const * line,size_t join_field,size_t autocount)567 prfields (struct line const *line, size_t join_field, size_t autocount)
568 {
569   size_t i;
570   size_t nfields = autoformat ? autocount : line->nfields;
571   char output_separator = tab < 0 ? ' ' : tab;
572 
573   for (i = 0; i < join_field && i < nfields; ++i)
574     {
575       putchar (output_separator);
576       prfield (i, line);
577     }
578   for (i = join_field + 1; i < nfields; ++i)
579     {
580       putchar (output_separator);
581       prfield (i, line);
582     }
583 }
584 
585 /* Print the join of LINE1 and LINE2.  */
586 
587 static void
prjoin(struct line const * line1,struct line const * line2)588 prjoin (struct line const *line1, struct line const *line2)
589 {
590   const struct outlist *outlist;
591   char output_separator = tab < 0 ? ' ' : tab;
592   size_t field;
593   struct line const *line;
594 
595   outlist = outlist_head.next;
596   if (outlist)
597     {
598       const struct outlist *o;
599 
600       o = outlist;
601       while (1)
602         {
603           if (o->file == 0)
604             {
605               if (line1 == &uni_blank)
606                 {
607                   line = line2;
608                   field = join_field_2;
609                 }
610               else
611                 {
612                   line = line1;
613                   field = join_field_1;
614                 }
615             }
616           else
617             {
618               line = (o->file == 1 ? line1 : line2);
619               field = o->field;
620             }
621           prfield (field, line);
622           o = o->next;
623           if (o == NULL)
624             break;
625           putchar (output_separator);
626         }
627       putchar (eolchar);
628     }
629   else
630     {
631       if (line1 == &uni_blank)
632         {
633           line = line2;
634           field = join_field_2;
635         }
636       else
637         {
638           line = line1;
639           field = join_field_1;
640         }
641 
642       /* Output the join field.  */
643       prfield (field, line);
644 
645       /* Output other fields.  */
646       prfields (line1, join_field_1, autocount_1);
647       prfields (line2, join_field_2, autocount_2);
648 
649       putchar (eolchar);
650     }
651 }
652 
653 /* Print the join of the files in FP1 and FP2.  */
654 
655 static void
join(FILE * fp1,FILE * fp2)656 join (FILE *fp1, FILE *fp2)
657 {
658   struct seq seq1, seq2;
659   int diff;
660   bool eof1, eof2;
661 
662   fadvise (fp1, FADVISE_SEQUENTIAL);
663   fadvise (fp2, FADVISE_SEQUENTIAL);
664 
665   /* Read the first line of each file.  */
666   initseq (&seq1);
667   getseq (fp1, &seq1, 1);
668   initseq (&seq2);
669   getseq (fp2, &seq2, 2);
670 
671   if (autoformat)
672     {
673       autocount_1 = seq1.count ? seq1.lines[0]->nfields : 0;
674       autocount_2 = seq2.count ? seq2.lines[0]->nfields : 0;
675     }
676 
677   if (join_header_lines && (seq1.count || seq2.count))
678     {
679       struct line const *hline1 = seq1.count ? seq1.lines[0] : &uni_blank;
680       struct line const *hline2 = seq2.count ? seq2.lines[0] : &uni_blank;
681       prjoin (hline1, hline2);
682       prevline[0] = NULL;
683       prevline[1] = NULL;
684       if (seq1.count)
685         advance_seq (fp1, &seq1, true, 1);
686       if (seq2.count)
687         advance_seq (fp2, &seq2, true, 2);
688     }
689 
690   while (seq1.count && seq2.count)
691     {
692       diff = keycmp (seq1.lines[0], seq2.lines[0],
693                      join_field_1, join_field_2);
694       if (diff < 0)
695         {
696           if (print_unpairables_1)
697             prjoin (seq1.lines[0], &uni_blank);
698           advance_seq (fp1, &seq1, true, 1);
699           seen_unpairable = true;
700           continue;
701         }
702       if (diff > 0)
703         {
704           if (print_unpairables_2)
705             prjoin (&uni_blank, seq2.lines[0]);
706           advance_seq (fp2, &seq2, true, 2);
707           seen_unpairable = true;
708           continue;
709         }
710 
711       /* Keep reading lines from file1 as long as they continue to
712          match the current line from file2.  */
713       eof1 = false;
714       do
715         if (!advance_seq (fp1, &seq1, false, 1))
716           {
717             eof1 = true;
718             ++seq1.count;
719             break;
720           }
721       while (!keycmp (seq1.lines[seq1.count - 1], seq2.lines[0],
722                       join_field_1, join_field_2));
723 
724       /* Keep reading lines from file2 as long as they continue to
725          match the current line from file1.  */
726       eof2 = false;
727       do
728         if (!advance_seq (fp2, &seq2, false, 2))
729           {
730             eof2 = true;
731             ++seq2.count;
732             break;
733           }
734       while (!keycmp (seq1.lines[0], seq2.lines[seq2.count - 1],
735                       join_field_1, join_field_2));
736 
737       if (print_pairables)
738         {
739           for (size_t i = 0; i < seq1.count - 1; ++i)
740             {
741               size_t j;
742               for (j = 0; j < seq2.count - 1; ++j)
743                 prjoin (seq1.lines[i], seq2.lines[j]);
744             }
745         }
746 
747       if (!eof1)
748         {
749           SWAPLINES (seq1.lines[0], seq1.lines[seq1.count - 1]);
750           seq1.count = 1;
751         }
752       else
753         seq1.count = 0;
754 
755       if (!eof2)
756         {
757           SWAPLINES (seq2.lines[0], seq2.lines[seq2.count - 1]);
758           seq2.count = 1;
759         }
760       else
761         seq2.count = 0;
762     }
763 
764   /* If the user did not specify --nocheck-order, then we read the
765      tail ends of both inputs to verify that they are in order.  We
766      skip the rest of the tail once we have issued a warning for that
767      file, unless we actually need to print the unpairable lines.  */
768   struct line *line = NULL;
769   bool checktail = false;
770 
771   if (check_input_order != CHECK_ORDER_DISABLED
772       && !(issued_disorder_warning[0] && issued_disorder_warning[1]))
773     checktail = true;
774 
775   if ((print_unpairables_1 || checktail) && seq1.count)
776     {
777       if (print_unpairables_1)
778         prjoin (seq1.lines[0], &uni_blank);
779       if (seq2.count)
780         seen_unpairable = true;
781       while (get_line (fp1, &line, 1))
782         {
783           if (print_unpairables_1)
784             prjoin (line, &uni_blank);
785           if (issued_disorder_warning[0] && !print_unpairables_1)
786             break;
787         }
788     }
789 
790   if ((print_unpairables_2 || checktail) && seq2.count)
791     {
792       if (print_unpairables_2)
793         prjoin (&uni_blank, seq2.lines[0]);
794       if (seq1.count)
795         seen_unpairable = true;
796       while (get_line (fp2, &line, 2))
797         {
798           if (print_unpairables_2)
799             prjoin (&uni_blank, line);
800           if (issued_disorder_warning[1] && !print_unpairables_2)
801             break;
802         }
803     }
804 
805   freeline (line);
806   free (line);
807 
808   delseq (&seq1);
809   delseq (&seq2);
810 }
811 
812 /* Add a field spec for field FIELD of file FILE to 'outlist'.  */
813 
814 static void
add_field(int file,size_t field)815 add_field (int file, size_t field)
816 {
817   struct outlist *o;
818 
819   assert (file == 0 || file == 1 || file == 2);
820   assert (file != 0 || field == 0);
821 
822   o = xmalloc (sizeof *o);
823   o->file = file;
824   o->field = field;
825   o->next = NULL;
826 
827   /* Add to the end of the list so the fields are in the right order.  */
828   outlist_end->next = o;
829   outlist_end = o;
830 }
831 
832 /* Convert a string of decimal digits, STR (the 1-based join field number),
833    to an integral value.  Upon successful conversion, return one less
834    (the zero-based field number).  Silently convert too-large values
835    to SIZE_MAX - 1.  Otherwise, if a value cannot be converted, give a
836    diagnostic and exit.  */
837 
838 static size_t
string_to_join_field(char const * str)839 string_to_join_field (char const *str)
840 {
841   size_t result;
842   unsigned long int val;
843   verify (SIZE_MAX <= ULONG_MAX);
844 
845   strtol_error s_err = xstrtoul (str, NULL, 10, &val, "");
846   if (s_err == LONGINT_OVERFLOW || (s_err == LONGINT_OK && SIZE_MAX < val))
847     val = SIZE_MAX;
848   else if (s_err != LONGINT_OK || val == 0)
849     die (EXIT_FAILURE, 0, _("invalid field number: %s"), quote (str));
850 
851   result = val - 1;
852 
853   return result;
854 }
855 
/* Decode the single field specifier S, either '0' or 'FILENUM.FIELD',
   into *FILE_INDEX and *FIELD_INDEX.  The field number in S is
   1-based whereas *FIELD_INDEX is zero-based.  This function returns
   only when S is valid; otherwise it gives a diagnostic and exits.  */

static void
decode_field_spec (const char *s, int *file_index, size_t *field_index)
{
  /* The first character must be 0, 1, or 2.  */
  char const which = s[0];

  if (which == '0')
    {
      /* '0' must be all alone -- no '.FIELD'.  */
      if (s[1] != '\0')
        die (EXIT_FAILURE, 0, _("invalid field specifier: %s"), quote (s));

      *file_index = 0;
      *field_index = 0;
    }
  else if (which == '1' || which == '2')
    {
      if (s[1] != '.')
        die (EXIT_FAILURE, 0, _("invalid field specifier: %s"), quote (s));

      *file_index = which - '0';
      *field_index = string_to_join_field (s + 2);
    }
  else
    {
      die (EXIT_FAILURE, 0,
           _("invalid file number in field spec: %s"), quote (s));

      /* Tell gcc -W -Wall that we can't get beyond this point.
         This avoids a warning (otherwise legit) that the caller's copies
         of *file_index and *field_index might be used uninitialized.  */
      abort ();
    }
}
896 
/* Append to the output list every field specifier found in STR, where
   specifiers are separated by a comma, a space or a tab.  STR is
   modified: each separator is overwritten with a null byte.  */

static void
add_field_list (char *str)
{
  char *rest = str;

  for (;;)
    {
      char const *item = rest;
      int file_index;
      size_t field_index;

      /* Terminate the current item at the next separator, if any.  */
      rest = strpbrk (rest, ", \t");
      if (rest != NULL)
        *rest++ = '\0';

      decode_field_spec (item, &file_index, &field_index);
      add_field (file_index, field_index);

      if (rest == NULL)
        break;
    }
}
918 
/* Store VAL in the join field *VAR.  It is a fatal error if *VAR
   already holds a different value, SIZE_MAX meaning "not set yet".  */

static void
set_join_field (size_t *var, size_t val)
{
  size_t const old = *var;
  bool const unset = old == SIZE_MAX;

  if (!unset && old != val)
    {
      /* Show the numbers 1-based, the way the user wrote them.  */
      unsigned long int shown_old = old + 1;
      unsigned long int shown_new = val + 1;
      die (EXIT_FAILURE, 0,
           _("incompatible join fields %lu, %lu"), shown_old, shown_new);
    }

  *var = val;
}
934 
/* How a non-option command-line argument is to be interpreted.  */

enum operand_status
{
  /* The argument can only be an operand, i.e., one of the files to
     be joined.  */
  MUST_BE_OPERAND,

  /* The argument is either an operand or the argument of the
     preceding -j1 option.  */
  MIGHT_BE_J1_ARG,

  /* The argument is either an operand or the argument of the
     preceding -j2 option.  */
  MIGHT_BE_J2_ARG,

  /* The argument is either an operand or the argument of the
     preceding -o option.  */
  MIGHT_BE_O_ARG
};
952 
953 /* Add NAME to the array of input file NAMES with operand statuses
954    OPERAND_STATUS; currently there are NFILES names in the list.  */
955 
956 static void
add_file_name(char * name,char * names[2],int operand_status[2],int joption_count[2],int * nfiles,int * prev_optc_status,int * optc_status)957 add_file_name (char *name, char *names[2],
958                int operand_status[2], int joption_count[2], int *nfiles,
959                int *prev_optc_status, int *optc_status)
960 {
961   int n = *nfiles;
962 
963   if (n == 2)
964     {
965       bool op0 = (operand_status[0] == MUST_BE_OPERAND);
966       char *arg = names[op0];
967       switch (operand_status[op0])
968         {
969         case MUST_BE_OPERAND:
970           error (0, 0, _("extra operand %s"), quoteaf (name));
971           usage (EXIT_FAILURE);
972 
973         case MIGHT_BE_J1_ARG:
974           joption_count[0]--;
975           set_join_field (&join_field_1, string_to_join_field (arg));
976           break;
977 
978         case MIGHT_BE_J2_ARG:
979           joption_count[1]--;
980           set_join_field (&join_field_2, string_to_join_field (arg));
981           break;
982 
983         case MIGHT_BE_O_ARG:
984           add_field_list (arg);
985           break;
986         }
987       if (!op0)
988         {
989           operand_status[0] = operand_status[1];
990           names[0] = names[1];
991         }
992       n = 1;
993     }
994 
995   operand_status[n] = *prev_optc_status;
996   names[n] = name;
997   *nfiles = n + 1;
998   if (*prev_optc_status == MIGHT_BE_O_ARG)
999     *optc_status = MIGHT_BE_O_ARG;
1000 }
1001 
1002 int
main(int argc,char ** argv)1003 main (int argc, char **argv)
1004 {
1005   int optc_status;
1006   int prev_optc_status = MUST_BE_OPERAND;
1007   int operand_status[2];
1008   int joption_count[2] = { 0, 0 };
1009   FILE *fp1, *fp2;
1010   int optc;
1011   int nfiles = 0;
1012   int i;
1013 
1014   initialize_main (&argc, &argv);
1015   set_program_name (argv[0]);
1016   setlocale (LC_ALL, "");
1017   bindtextdomain (PACKAGE, LOCALEDIR);
1018   textdomain (PACKAGE);
1019   hard_LC_COLLATE = hard_locale (LC_COLLATE);
1020 
1021   atexit (close_stdout);
1022   atexit (free_spareline);
1023 
1024   print_pairables = true;
1025   seen_unpairable = false;
1026   issued_disorder_warning[0] = issued_disorder_warning[1] = false;
1027   check_input_order = CHECK_ORDER_DEFAULT;
1028 
1029   while ((optc = getopt_long (argc, argv, "-a:e:i1:2:j:o:t:v:z",
1030                               longopts, NULL))
1031          != -1)
1032     {
1033       optc_status = MUST_BE_OPERAND;
1034 
1035       switch (optc)
1036         {
1037         case 'v':
1038             print_pairables = false;
1039             FALLTHROUGH;
1040 
1041         case 'a':
1042           {
1043             unsigned long int val;
1044             if (xstrtoul (optarg, NULL, 10, &val, "") != LONGINT_OK
1045                 || (val != 1 && val != 2))
1046               die (EXIT_FAILURE, 0,
1047                    _("invalid field number: %s"), quote (optarg));
1048             if (val == 1)
1049               print_unpairables_1 = true;
1050             else
1051               print_unpairables_2 = true;
1052           }
1053           break;
1054 
1055         case 'e':
1056           if (empty_filler && ! STREQ (empty_filler, optarg))
1057             die (EXIT_FAILURE, 0,
1058                  _("conflicting empty-field replacement strings"));
1059           empty_filler = optarg;
1060           break;
1061 
1062         case 'i':
1063           ignore_case = true;
1064           break;
1065 
1066         case '1':
1067           set_join_field (&join_field_1, string_to_join_field (optarg));
1068           break;
1069 
1070         case '2':
1071           set_join_field (&join_field_2, string_to_join_field (optarg));
1072           break;
1073 
1074         case 'j':
1075           if ((optarg[0] == '1' || optarg[0] == '2') && !optarg[1]
1076               && optarg == argv[optind - 1] + 2)
1077             {
1078               /* The argument was either "-j1" or "-j2".  */
1079               bool is_j2 = (optarg[0] == '2');
1080               joption_count[is_j2]++;
1081               optc_status = MIGHT_BE_J1_ARG + is_j2;
1082             }
1083           else
1084             {
1085               set_join_field (&join_field_1, string_to_join_field (optarg));
1086               set_join_field (&join_field_2, join_field_1);
1087             }
1088           break;
1089 
1090         case 'o':
1091           if (STREQ (optarg, "auto"))
1092             autoformat = true;
1093           else
1094             {
1095               add_field_list (optarg);
1096               optc_status = MIGHT_BE_O_ARG;
1097             }
1098           break;
1099 
1100         case 't':
1101           {
1102             unsigned char newtab = optarg[0];
1103             if (! newtab)
1104               newtab = '\n'; /* '' => process the whole line.  */
1105             else if (optarg[1])
1106               {
1107                 if (STREQ (optarg, "\\0"))
1108                   newtab = '\0';
1109                 else
1110                   die (EXIT_FAILURE, 0, _("multi-character tab %s"),
1111                        quote (optarg));
1112               }
1113             if (0 <= tab && tab != newtab)
1114               die (EXIT_FAILURE, 0, _("incompatible tabs"));
1115             tab = newtab;
1116           }
1117           break;
1118 
1119         case 'z':
1120           eolchar = 0;
1121           break;
1122 
1123         case NOCHECK_ORDER_OPTION:
1124           check_input_order = CHECK_ORDER_DISABLED;
1125           break;
1126 
1127         case CHECK_ORDER_OPTION:
1128           check_input_order = CHECK_ORDER_ENABLED;
1129           break;
1130 
1131         case 1:		/* Non-option argument.  */
1132           add_file_name (optarg, g_names, operand_status, joption_count,
1133                          &nfiles, &prev_optc_status, &optc_status);
1134           break;
1135 
1136         case HEADER_LINE_OPTION:
1137           join_header_lines = true;
1138           break;
1139 
1140         case_GETOPT_HELP_CHAR;
1141 
1142         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
1143 
1144         default:
1145           usage (EXIT_FAILURE);
1146         }
1147 
1148       prev_optc_status = optc_status;
1149     }
1150 
1151   /* Process any operands after "--".  */
1152   prev_optc_status = MUST_BE_OPERAND;
1153   while (optind < argc)
1154     add_file_name (argv[optind++], g_names, operand_status, joption_count,
1155                    &nfiles, &prev_optc_status, &optc_status);
1156 
1157   if (nfiles != 2)
1158     {
1159       if (nfiles == 0)
1160         error (0, 0, _("missing operand"));
1161       else
1162         error (0, 0, _("missing operand after %s"), quote (argv[argc - 1]));
1163       usage (EXIT_FAILURE);
1164     }
1165 
1166   /* If "-j1" was specified and it turns out not to have had an argument,
1167      treat it as "-j 1".  Likewise for -j2.  */
1168   for (i = 0; i < 2; i++)
1169     if (joption_count[i] != 0)
1170       {
1171         set_join_field (&join_field_1, i);
1172         set_join_field (&join_field_2, i);
1173       }
1174 
1175   if (join_field_1 == SIZE_MAX)
1176     join_field_1 = 0;
1177   if (join_field_2 == SIZE_MAX)
1178     join_field_2 = 0;
1179 
1180   fp1 = STREQ (g_names[0], "-") ? stdin : fopen (g_names[0], "r");
1181   if (!fp1)
1182     die (EXIT_FAILURE, errno, "%s", quotef (g_names[0]));
1183   fp2 = STREQ (g_names[1], "-") ? stdin : fopen (g_names[1], "r");
1184   if (!fp2)
1185     die (EXIT_FAILURE, errno, "%s", quotef (g_names[1]));
1186   if (fp1 == fp2)
1187     die (EXIT_FAILURE, errno, _("both files cannot be standard input"));
1188   join (fp1, fp2);
1189 
1190   if (fclose (fp1) != 0)
1191     die (EXIT_FAILURE, errno, "%s", quotef (g_names[0]));
1192   if (fclose (fp2) != 0)
1193     die (EXIT_FAILURE, errno, "%s", quotef (g_names[1]));
1194 
1195   if (issued_disorder_warning[0] || issued_disorder_warning[1])
1196     return EXIT_FAILURE;
1197   else
1198     return EXIT_SUCCESS;
1199 }
1200