1 /* csplit - split a file into sections determined by context lines
2    Copyright (C) 1991-2020 Free Software Foundation, Inc.
3 
4    This program is free software: you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation, either version 3 of the License, or
7    (at your option) any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
16 
17 /* Written by Stuart Kemp, cpsrk@groper.jcu.edu.au.
18    Modified by David MacKenzie, djm@gnu.ai.mit.edu. */
19 
20 #include <config.h>
21 
22 #include <assert.h>
23 #include <getopt.h>
24 #include <sys/types.h>
25 #include <signal.h>
26 
27 #include "system.h"
28 
29 #include <regex.h>
30 
31 #include "die.h"
32 #include "error.h"
33 #include "fd-reopen.h"
34 #include "quote.h"
35 #include "safe-read.h"
36 #include "stdio--.h"
37 #include "xdectoint.h"
38 #include "xstrtol.h"
39 
40 /* The official name of this program (e.g., no 'g' prefix).  */
41 #define PROGRAM_NAME "csplit"
42 
/* The program's authors, written as a comma-separated list of
   proper_name () calls.  */
43 #define AUTHORS \
44   proper_name ("Stuart Kemp"), \
45   proper_name ("David MacKenzie")
46 
47 /* The default prefix for output file names. */
48 #define DEFAULT_PREFIX	"xx"
49 
50 /* A compiled pattern arg.  Each record describes either a regular
   expression (REGEXPR is true) or a line count (REGEXPR is false);
   split_file dispatches on that flag.  */
51 struct control
52 {
53   intmax_t offset;		/* Offset from regexp to split at. */
54   uintmax_t lines_required;	/* Line count: process_line_count splits
				   before line LINES_REQUIRED times
				   (repetition + 1). */
55   uintmax_t repeat;		/* Repeat count; split_file applies the
				   pattern REPEAT + 1 times. */
56   int argnum;			/* Index of this pattern in global_argv
				   (used in diagnostics). */
57   bool repeat_forever;		/* True if '*' used as a repeat count. */
58   bool ignore;			/* If true, produce no output (for regexp). */
59   bool regexpr;			/* True if regular expression was used. */
60   struct re_pattern_buffer re_compiled;	/* Compiled regular expression. */
61 };
62 
/* Buffer sizing parameters.  Exactly one set of definitions is
   selected, so that building with -DDEBUG does not redefine a macro
   that already has a different value.  */
#ifdef DEBUG

/* Some small values to test the algorithms. */
# define START_SIZE	200
# define INCR_SIZE	10
# define CTRL_SIZE	1

#else

/* Initial size of data area in buffers. */
# define START_SIZE	8191

/* Increment size for data area. */
# define INCR_SIZE	2048

/* Number of lines kept in each node in line list. */
# define CTRL_SIZE	80

#endif
78 
79 /* A string with a length count.  The lines recorded by keep_new_line
   point into a buffer's data area, so STR is delimited by LEN rather
   than by a terminating NUL.  */
80 struct cstring
81 {
82   size_t len;			/* Number of bytes in STR. */
83   char *str;			/* First byte of the string. */
84 };
85 
86 /* Pointers to the beginnings of lines in the buffer area.
   Each node records up to CTRL_SIZE lines; keep_new_line links on a
   further node when the current one is full.  */
88 struct line
89 {
90   size_t used;			/* Number of offsets used in this struct. */
91   size_t insert_index;		/* Next offset to use when inserting line. */
92   size_t retrieve_index;	/* Next index to use when retrieving line. */
93   struct cstring starts[CTRL_SIZE]; /* Lines in the data area. */
94   struct line *next;		/* Next in linked list. */
95 };
96 
97 /* The structure to hold the input lines.
   Contains a pointer to the data area and a list containing
   pointers to the individual lines.  Records are chained through
   NEXT, starting at the file-scope variable 'head'.  */
100 struct buffer_record
101 {
102   size_t bytes_alloc;		/* Size of the buffer area. */
103   size_t bytes_used;		/* Bytes used in the buffer area. */
104   uintmax_t start_line;		/* First line number in this buffer. */
105   uintmax_t first_available;	/* First line that can be retrieved. */
106   size_t num_lines;		/* Number of complete lines in this buffer. */
107   char *buffer;			/* Data area. */
108   struct line *line_start;	/* Head of list of pointers to lines. */
109   struct line *curr_line;	/* The line start record currently in use. */
110   struct buffer_record *next;	/* Next buffer in the list, or NULL. */
111 };
112 
/* Forward declarations of functions that are used before they are
   defined later in this file.  */
113 static void close_output_file (void);
114 static void create_output_file (void);
115 static void delete_all_files (bool);
116 static void save_line_to_file (const struct cstring *line);
117 
118 /* Start of buffer list; NULL when no input buffer is currently held. */
119 static struct buffer_record *head = NULL;
120 
121 /* Partially read line.  save_to_hold_area frees the previous value
   each time it stores a new one.  */
122 static char *hold_area = NULL;
123 
124 /* Number of bytes in 'hold_area'. */
125 static size_t hold_count = 0;
126 
127 /* Number of the last line in the buffers. */
128 static uintmax_t last_line_number = 0;
129 
130 /* Number of the line currently being examined. */
131 static uintmax_t current_line = 0;
132 
133 /* If true, we have read EOF. */
134 static bool have_read_eof = false;
135 
/* The 'volatile' objects below are read by make_filename and
   delete_all_files, both of which can run from interrupt_handler.  */

136 /* Space in which make_filename builds the name of an output file. */
137 static char *volatile filename_space = NULL;
138 
139 /* Prefix part of output file names. */
140 static char const *volatile prefix = NULL;
141 
142 /* Suffix part of output file names; when non-NULL, make_filename uses
   it as the sprintf format for the file number.  */
143 static char *volatile suffix = NULL;
144 
145 /* Number of digits to use in output file names. */
146 static int volatile digits = 2;
147 
148 /* Number of files created so far. */
149 static unsigned int volatile files_created = 0;
150 
151 /* Number of bytes written to current file. */
152 static uintmax_t bytes_written;
153 
154 /* Output file pointer; NULL when no output file is open. */
155 static FILE *output_stream = NULL;
156 
157 /* Output file name. */
158 static char *output_filename = NULL;
159 
160 /* Perhaps it would be cleaner to pass arg values instead of indexes. */
161 static char **global_argv;
162 
163 /* If true, do not print the count of bytes in each output file. */
164 static bool suppress_count;
165 
166 /* If true, remove output files on error. */
167 static bool volatile remove_files;
168 
169 /* If true, remove all output files which have a zero length. */
170 static bool elide_empty_files;
171 
172 /* If true, suppress the lines that match the PATTERN.  */
173 static bool suppress_matched;
174 
175 /* The compiled pattern arguments, which determine how to split
   the input file. */
177 static struct control *controls;
178 
179 /* Number of elements in 'controls'. */
180 static size_t control_used;
181 
182 /* The set of signals that are caught.  */
183 static sigset_t caught_signals;
184 
185 /* For long options that have no equivalent short option, use a
   non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
187 enum
188 {
189   SUPPRESS_MATCHED_OPTION = CHAR_MAX + 1
190 };
191 
/* The long options.  Each entry maps a long name to the short option
   character (or pseudo short option) given in its last field; the
   table ends with an all-zero entry.  */
192 static struct option const longopts[] =
193 {
194   {"digits", required_argument, NULL, 'n'},
195   {"quiet", no_argument, NULL, 'q'},
196   {"silent", no_argument, NULL, 's'},
197   {"keep-files", no_argument, NULL, 'k'},
198   {"elide-empty-files", no_argument, NULL, 'z'},
199   {"prefix", required_argument, NULL, 'f'},
200   {"suffix-format", required_argument, NULL, 'b'},
201   {"suppress-matched", no_argument, NULL, SUPPRESS_MATCHED_OPTION},
202   {GETOPT_HELP_OPTION_DECL},
203   {GETOPT_VERSION_OPTION_DECL},
204   {NULL, 0, NULL, 0}
205 };
206 
207 /* Optionally remove files created so far; then exit.
208    Called when an error detected. */
209 
210 static void
cleanup(void)211 cleanup (void)
212 {
213   sigset_t oldset;
214 
215   close_output_file ();
216 
217   sigprocmask (SIG_BLOCK, &caught_signals, &oldset);
218   delete_all_files (false);
219   sigprocmask (SIG_SETMASK, &oldset, NULL);
220 }
221 
222 static void cleanup_fatal (void) ATTRIBUTE_NORETURN;
223 static void
cleanup_fatal(void)224 cleanup_fatal (void)
225 {
226   cleanup ();
227   exit (EXIT_FAILURE);
228 }
229 
/* Report that memory is exhausted, then clean up and exit with a
   failure status.  This function has external linkage.  */

extern void
xalloc_die (void)
{
  /* Print the diagnostic without exiting (status 0), so that
     cleanup_fatal can do the cleanup before the exit.  */
  error (0, 0, "%s", _("memory exhausted"));

  cleanup_fatal ();
}
236 
/* Signal handler for signal SIG: delete the output files (subject to
   'remove_files'), then let SIG take its default action.  */

static void
interrupt_handler (int sig)
{
  /* Pass true so that unlink failures are not reported from inside
     the handler.  */
  delete_all_files (true);

  /* The signal is reset to SIG_DFL here, but stays blocked during this
     handler.  Raising it again forces its default action once the
     handler returns and the block is removed.  */
  signal (sig, SIG_DFL);
  raise (sig);
}
247 
248 /* Keep track of NUM bytes of a partial line in buffer START.
249    These bytes will be retrieved later when another large buffer is read.  */
250 
251 static void
save_to_hold_area(char * start,size_t num)252 save_to_hold_area (char *start, size_t num)
253 {
254   free (hold_area);
255   hold_area = start;
256   hold_count = num;
257 }
258 
259 /* Read up to MAX_N_BYTES bytes from the input stream into DEST.
260    Return the number of bytes read. */
261 
262 static size_t
read_input(char * dest,size_t max_n_bytes)263 read_input (char *dest, size_t max_n_bytes)
264 {
265   size_t bytes_read;
266 
267   if (max_n_bytes == 0)
268     return 0;
269 
270   bytes_read = safe_read (STDIN_FILENO, dest, max_n_bytes);
271 
272   if (bytes_read == 0)
273     have_read_eof = true;
274 
275   if (bytes_read == SAFE_READ_ERROR)
276     {
277       error (0, errno, _("read error"));
278       cleanup_fatal ();
279     }
280 
281   return bytes_read;
282 }
283 
284 /* Initialize existing line record P. */
285 
286 static void
clear_line_control(struct line * p)287 clear_line_control (struct line *p)
288 {
289   p->used = 0;
290   p->insert_index = 0;
291   p->retrieve_index = 0;
292 }
293 
294 /* Return a new, initialized line record. */
295 
296 static struct line *
new_line_control(void)297 new_line_control (void)
298 {
299   struct line *p = xmalloc (sizeof *p);
300 
301   p->next = NULL;
302   clear_line_control (p);
303 
304   return p;
305 }
306 
307 /* Record LINE_START, which is the address of the start of a line
308    of length LINE_LEN in the large buffer, in the lines buffer of B. */
309 
310 static void
keep_new_line(struct buffer_record * b,char * line_start,size_t line_len)311 keep_new_line (struct buffer_record *b, char *line_start, size_t line_len)
312 {
313   struct line *l;
314 
315   /* If there is no existing area to keep line info, get some. */
316   if (b->line_start == NULL)
317     b->line_start = b->curr_line = new_line_control ();
318 
319   /* If existing area for lines is full, get more. */
320   if (b->curr_line->used == CTRL_SIZE)
321     {
322       b->curr_line->next = new_line_control ();
323       b->curr_line = b->curr_line->next;
324     }
325 
326   l = b->curr_line;
327 
328   /* Record the start of the line, and update counters. */
329   l->starts[l->insert_index].str = line_start;
330   l->starts[l->insert_index].len = line_len;
331   l->used++;
332   l->insert_index++;
333 }
334 
335 /* Scan the buffer in B for newline characters
336    and record the line start locations and lengths in B.
337    Return the number of lines found in this buffer.
338 
339    There may be an incomplete line at the end of the buffer;
340    a pointer is kept to this area, which will be used when
341    the next buffer is filled. */
342 
343 static size_t
record_line_starts(struct buffer_record * b)344 record_line_starts (struct buffer_record *b)
345 {
346   char *line_start;		/* Start of current line. */
347   char *line_end;		/* End of each line found. */
348   size_t bytes_left;		/* Length of incomplete last line. */
349   size_t lines;			/* Number of lines found. */
350   size_t line_length;		/* Length of each line found. */
351 
352   if (b->bytes_used == 0)
353     return 0;
354 
355   lines = 0;
356   line_start = b->buffer;
357   bytes_left = b->bytes_used;
358 
359   while (true)
360     {
361       line_end = memchr (line_start, '\n', bytes_left);
362       if (line_end == NULL)
363         break;
364       line_length = line_end - line_start + 1;
365       keep_new_line (b, line_start, line_length);
366       bytes_left -= line_length;
367       line_start = line_end + 1;
368       lines++;
369     }
370 
371   /* Check for an incomplete last line. */
372   if (bytes_left)
373     {
374       if (have_read_eof)
375         {
376           keep_new_line (b, line_start, bytes_left);
377           lines++;
378         }
379       else
380         save_to_hold_area (xmemdup (line_start, bytes_left), bytes_left);
381     }
382 
383   b->num_lines = lines;
384   b->first_available = b->start_line = last_line_number + 1;
385   last_line_number += lines;
386 
387   return lines;
388 }
389 
390 /* Return a new buffer with room to store SIZE bytes, plus
391    an extra byte for safety. */
392 
393 static struct buffer_record *
create_new_buffer(size_t size)394 create_new_buffer (size_t size)
395 {
396   struct buffer_record *new_buffer = xmalloc (sizeof *new_buffer);
397 
398   new_buffer->buffer = xmalloc (size + 1);
399 
400   new_buffer->bytes_alloc = size;
401   new_buffer->line_start = new_buffer->curr_line = NULL;
402 
403   return new_buffer;
404 }
405 
406 /* Return a new buffer of at least MINSIZE bytes.  If a buffer of at
407    least that size is currently free, use it, otherwise create a new one. */
408 
409 static struct buffer_record *
get_new_buffer(size_t min_size)410 get_new_buffer (size_t min_size)
411 {
412   struct buffer_record *new_buffer; /* Buffer to return. */
413   size_t alloc_size;	/* Actual size that will be requested. */
414 
415   alloc_size = START_SIZE;
416   if (alloc_size < min_size)
417     {
418       size_t s = min_size - alloc_size + INCR_SIZE - 1;
419       alloc_size += s - s % INCR_SIZE;
420     }
421 
422   new_buffer = create_new_buffer (alloc_size);
423 
424   new_buffer->num_lines = 0;
425   new_buffer->bytes_used = 0;
426   new_buffer->start_line = new_buffer->first_available = last_line_number + 1;
427   new_buffer->next = NULL;
428 
429   return new_buffer;
430 }
431 
432 static void
free_buffer(struct buffer_record * buf)433 free_buffer (struct buffer_record *buf)
434 {
435   struct line *l;
436   for (l = buf->line_start; l;)
437     {
438       struct line *n = l->next;
439       free (l);
440       l = n;
441     }
442   buf->line_start = NULL;
443   free (buf->buffer);
444   buf->buffer = NULL;
445 }
446 
447 /* Append buffer BUF to the linked list of buffers that contain
448    some data yet to be processed. */
449 
450 static void
save_buffer(struct buffer_record * buf)451 save_buffer (struct buffer_record *buf)
452 {
453   struct buffer_record *p;
454 
455   buf->next = NULL;
456   buf->curr_line = buf->line_start;
457 
458   if (head == NULL)
459     head = buf;
460   else
461     {
462       for (p = head; p->next; p = p->next)
463         /* Do nothing. */ ;
464       p->next = buf;
465     }
466 }
467 
468 /* Fill a buffer of input.
469 
470    Set the initial size of the buffer to a default.
471    Fill the buffer (from the hold area and input stream)
472    and find the individual lines.
473    If no lines are found (the buffer is too small to hold the next line),
474    release the current buffer (whose contents would have been put in the
475    hold area) and repeat the process with another large buffer until at least
476    one entire line has been read.
477 
478    Return true if a new buffer was obtained, otherwise false
479    (in which case end-of-file must have been encountered). */
480 
481 static bool
load_buffer(void)482 load_buffer (void)
483 {
484   struct buffer_record *b;
485   size_t bytes_wanted = START_SIZE; /* Minimum buffer size. */
486   size_t bytes_avail;		/* Size of new buffer created. */
487   size_t lines_found;		/* Number of lines in this new buffer. */
488   char *p;			/* Place to load into buffer. */
489 
490   if (have_read_eof)
491     return false;
492 
493   /* We must make the buffer at least as large as the amount of data
494      in the partial line left over from the last call. */
495   if (bytes_wanted < hold_count)
496     bytes_wanted = hold_count;
497 
498   while (1)
499     {
500       b = get_new_buffer (bytes_wanted);
501       bytes_avail = b->bytes_alloc; /* Size of buffer returned. */
502       p = b->buffer;
503 
504       /* First check the 'holding' area for a partial line. */
505       if (hold_count)
506         {
507           memcpy (p, hold_area, hold_count);
508           p += hold_count;
509           b->bytes_used += hold_count;
510           bytes_avail -= hold_count;
511           hold_count = 0;
512         }
513 
514       b->bytes_used += read_input (p, bytes_avail);
515 
516       lines_found = record_line_starts (b);
517 
518       if (lines_found || have_read_eof)
519         break;
520 
521       if (xalloc_oversized (2, b->bytes_alloc))
522         xalloc_die ();
523       bytes_wanted = 2 * b->bytes_alloc;
524       free_buffer (b);
525       free (b);
526     }
527 
528   if (lines_found)
529     save_buffer (b);
530   else
531     {
532       free_buffer (b);
533       free (b);
534     }
535 
536   return lines_found != 0;
537 }
538 
539 /* Return the line number of the first line that has not yet been retrieved. */
540 
541 static uintmax_t
get_first_line_in_buffer(void)542 get_first_line_in_buffer (void)
543 {
544   if (head == NULL && !load_buffer ())
545     die (EXIT_FAILURE, errno, _("input disappeared"));
546 
547   return head->first_available;
548 }
549 
550 /* Return a pointer to the logical first line in the buffer and make the
551    next line the logical first line.
552    Return NULL if there is no more input. */
553 
554 static struct cstring *
remove_line(void)555 remove_line (void)
556 {
557   /* If non-NULL, this is the buffer for which the previous call
558      returned the final line.  So now, presuming that line has been
559      processed, we can free the buffer and reset this pointer.  */
560   static struct buffer_record *prev_buf = NULL;
561 
562   struct cstring *line;		/* Return value. */
563   struct line *l;		/* For convenience. */
564 
565   if (prev_buf)
566     {
567       free_buffer (prev_buf);
568       free (prev_buf);
569       prev_buf = NULL;
570     }
571 
572   if (head == NULL && !load_buffer ())
573     return NULL;
574 
575   if (current_line < head->first_available)
576     current_line = head->first_available;
577 
578   ++(head->first_available);
579 
580   l = head->curr_line;
581 
582   line = &l->starts[l->retrieve_index];
583 
584   /* Advance index to next line. */
585   if (++l->retrieve_index == l->used)
586     {
587       /* Go on to the next line record. */
588       head->curr_line = l->next;
589       if (head->curr_line == NULL || head->curr_line->used == 0)
590         {
591           /* Go on to the next data block.
592              but first record the current one so we can free it
593              once the line we're returning has been processed.  */
594           prev_buf = head;
595           head = head->next;
596         }
597     }
598 
599   return line;
600 }
601 
602 /* Search the buffers for line LINENUM, reading more input if necessary.
603    Return a pointer to the line, or NULL if it is not found in the file. */
604 
605 static struct cstring *
find_line(uintmax_t linenum)606 find_line (uintmax_t linenum)
607 {
608   struct buffer_record *b;
609 
610   if (head == NULL && !load_buffer ())
611     return NULL;
612 
613   if (linenum < head->start_line)
614     return NULL;
615 
616   for (b = head;;)
617     {
618       assert (b);
619       if (linenum < b->start_line + b->num_lines)
620         {
621           /* The line is in this buffer. */
622           struct line *l;
623           size_t offset;	/* How far into the buffer the line is. */
624 
625           l = b->line_start;
626           offset = linenum - b->start_line;
627           /* Find the control record. */
628           while (offset >= CTRL_SIZE)
629             {
630               l = l->next;
631               offset -= CTRL_SIZE;
632             }
633           return &l->starts[offset];
634         }
635       if (b->next == NULL && !load_buffer ())
636         return NULL;
637       b = b->next;		/* Try the next data block. */
638     }
639 }
640 
641 /* Return true if at least one more line is available for input. */
642 
643 static bool
no_more_lines(void)644 no_more_lines (void)
645 {
646   return find_line (current_line + 1) == NULL;
647 }
648 
/* Open NAME as standard input.  The name "-" leaves standard input
   as it is.  Exit with an error if NAME cannot be opened.  */

static void
set_input_file (const char *name)
{
  if (STREQ (name, "-"))
    return;

  if (fd_reopen (STDIN_FILENO, name, O_RDONLY, 0) < 0)
    die (EXIT_FAILURE, errno, _("cannot open %s for reading"),
         quoteaf (name));
}
658 
659 /* Write all lines from the beginning of the buffer up to, but
660    not including, line LAST_LINE, to the current output file.
661    If IGNORE is true, do not output lines selected here.
662    ARGNUM is the index in ARGV of the current pattern. */
663 
664 static void
write_to_file(uintmax_t last_line,bool ignore,int argnum)665 write_to_file (uintmax_t last_line, bool ignore, int argnum)
666 {
667   struct cstring *line;
668   uintmax_t first_line;		/* First available input line. */
669   uintmax_t lines;		/* Number of lines to output. */
670   uintmax_t i;
671 
672   first_line = get_first_line_in_buffer ();
673 
674   if (first_line > last_line)
675     {
676       error (0, 0, _("%s: line number out of range"),
677              quote (global_argv[argnum]));
678       cleanup_fatal ();
679     }
680 
681   lines = last_line - first_line;
682 
683   for (i = 0; i < lines; i++)
684     {
685       line = remove_line ();
686       if (line == NULL)
687         {
688           error (0, 0, _("%s: line number out of range"),
689                  quote (global_argv[argnum]));
690           cleanup_fatal ();
691         }
692       if (!ignore)
693         save_line_to_file (line);
694     }
695 }
696 
/* Write every line that remains in the input to the current output
   file.  Used once all regexps have been processed.  */

static void
dump_rest_of_file (void)
{
  for (;;)
    {
      struct cstring *line = remove_line ();

      if (line == NULL)
        break;

      save_line_to_file (line);
    }
}
707 
708 /* Handle an attempt to read beyond EOF under the control of record P,
709    on iteration REPETITION if nonzero. */
710 
711 static void handle_line_error (const struct control *, uintmax_t)
712      ATTRIBUTE_NORETURN;
713 static void
handle_line_error(const struct control * p,uintmax_t repetition)714 handle_line_error (const struct control *p, uintmax_t repetition)
715 {
716   char buf[INT_BUFSIZE_BOUND (uintmax_t)];
717 
718   fprintf (stderr, _("%s: %s: line number out of range"),
719            program_name, quote (umaxtostr (p->lines_required, buf)));
720   if (repetition)
721     fprintf (stderr, _(" on repetition %s\n"), umaxtostr (repetition, buf));
722   else
723     fprintf (stderr, "\n");
724 
725   cleanup_fatal ();
726 }
727 
728 /* Determine the line number that marks the end of this file,
729    then get those lines and save them to the output file.
730    P is the control record.
731    REPETITION is the repetition number. */
732 
733 static void
process_line_count(const struct control * p,uintmax_t repetition)734 process_line_count (const struct control *p, uintmax_t repetition)
735 {
736   uintmax_t linenum;
737   uintmax_t last_line_to_save = p->lines_required * (repetition + 1);
738 
739   create_output_file ();
740 
741   /* Ensure that the line number specified is not 1 greater than
742      the number of lines in the file.
743      When suppressing matched lines, check before the loop. */
744   if (no_more_lines () && suppress_matched)
745     handle_line_error (p, repetition);
746 
747   linenum = get_first_line_in_buffer ();
748   while (linenum++ < last_line_to_save)
749     {
750       struct cstring *line = remove_line ();
751       if (line == NULL)
752         handle_line_error (p, repetition);
753       save_line_to_file (line);
754     }
755 
756   close_output_file ();
757 
758   if (suppress_matched)
759     remove_line ();
760 
761   /* Ensure that the line number specified is not 1 greater than
762      the number of lines in the file. */
763   if (no_more_lines () && !suppress_matched)
764     handle_line_error (p, repetition);
765 }
766 
767 static void regexp_error (struct control *, uintmax_t, bool) ATTRIBUTE_NORETURN;
768 static void
regexp_error(struct control * p,uintmax_t repetition,bool ignore)769 regexp_error (struct control *p, uintmax_t repetition, bool ignore)
770 {
771   fprintf (stderr, _("%s: %s: match not found"),
772            program_name, quote (global_argv[p->argnum]));
773 
774   if (repetition)
775     {
776       char buf[INT_BUFSIZE_BOUND (uintmax_t)];
777       fprintf (stderr, _(" on repetition %s\n"), umaxtostr (repetition, buf));
778     }
779   else
780     fprintf (stderr, "\n");
781 
782   if (!ignore)
783     {
784       dump_rest_of_file ();
785       close_output_file ();
786     }
787   cleanup_fatal ();
788 }
789 
790 /* Read the input until a line matches the regexp in P, outputting
791    it unless P->IGNORE is true.
792    REPETITION is this repeat-count; 0 means the first time. */
793 
794 static void
process_regexp(struct control * p,uintmax_t repetition)795 process_regexp (struct control *p, uintmax_t repetition)
796 {
797   struct cstring *line;		/* From input file. */
798   size_t line_len;		/* To make "$" in regexps work. */
799   uintmax_t break_line;		/* First line number of next file. */
800   bool ignore = p->ignore;	/* If true, skip this section. */
801   regoff_t ret;
802 
803   if (!ignore)
804     create_output_file ();
805 
806   if (suppress_matched && current_line > 0)
807     remove_line ();
808 
809   /* If there is no offset for the regular expression, or
810      it is positive, then it is not necessary to buffer the lines. */
811 
812   if (p->offset >= 0)
813     {
814       while (true)
815         {
816           line = find_line (++current_line);
817           if (line == NULL)
818             {
819               if (p->repeat_forever)
820                 {
821                   if (!ignore)
822                     {
823                       dump_rest_of_file ();
824                       close_output_file ();
825                     }
826                   exit (EXIT_SUCCESS);
827                 }
828               else
829                 regexp_error (p, repetition, ignore);
830             }
831           line_len = line->len;
832           if (line->str[line_len - 1] == '\n')
833             line_len--;
834           ret = re_search (&p->re_compiled, line->str, line_len,
835                            0, line_len, NULL);
836           if (ret == -2)
837             {
838               error (0, 0, _("error in regular expression search"));
839               cleanup_fatal ();
840             }
841           if (ret == -1)
842             {
843               line = remove_line ();
844               if (!ignore)
845                 save_line_to_file (line);
846             }
847           else
848             break;
849         }
850     }
851   else
852     {
853       /* Buffer the lines. */
854       while (true)
855         {
856           line = find_line (++current_line);
857           if (line == NULL)
858             {
859               if (p->repeat_forever)
860                 {
861                   if (!ignore)
862                     {
863                       dump_rest_of_file ();
864                       close_output_file ();
865                     }
866                   exit (EXIT_SUCCESS);
867                 }
868               else
869                 regexp_error (p, repetition, ignore);
870             }
871           line_len = line->len;
872           if (line->str[line_len - 1] == '\n')
873             line_len--;
874           ret = re_search (&p->re_compiled, line->str, line_len,
875                            0, line_len, NULL);
876           if (ret == -2)
877             {
878               error (0, 0, _("error in regular expression search"));
879               cleanup_fatal ();
880             }
881           if (ret != -1)
882             break;
883         }
884     }
885 
886   /* Account for any offset from this regexp. */
887   break_line = current_line + p->offset;
888 
889   write_to_file (break_line, ignore, p->argnum);
890 
891   if (!ignore)
892     close_output_file ();
893 
894   if (p->offset > 0)
895     current_line = break_line;
896 }
897 
898 /* Split the input file according to the control records we have built. */
899 
900 static void
split_file(void)901 split_file (void)
902 {
903   for (size_t i = 0; i < control_used; i++)
904     {
905       uintmax_t j;
906       if (controls[i].regexpr)
907         {
908           for (j = 0; (controls[i].repeat_forever
909                        || j <= controls[i].repeat); j++)
910             process_regexp (&controls[i], j);
911         }
912       else
913         {
914           for (j = 0; (controls[i].repeat_forever
915                        || j <= controls[i].repeat); j++)
916             process_line_count (&controls[i], j);
917         }
918     }
919 
920   create_output_file ();
921   dump_rest_of_file ();
922   close_output_file ();
923 }
924 
925 /* Return the name of output file number NUM.
926 
927    This function is called from a signal handler, so it should invoke
928    only reentrant functions that are async-signal-safe.  POSIX does
929    not guarantee this for the functions called below, but we don't
930    know of any hosts where this implementation isn't safe.  */
931 
932 static char *
make_filename(unsigned int num)933 make_filename (unsigned int num)
934 {
935   strcpy (filename_space, prefix);
936   if (suffix)
937     sprintf (filename_space + strlen (prefix), suffix, num);
938   else
939     sprintf (filename_space + strlen (prefix), "%0*u", digits, num);
940   return filename_space;
941 }
942 
943 /* Create the next output file. */
944 
945 static void
create_output_file(void)946 create_output_file (void)
947 {
948   bool fopen_ok;
949   int fopen_errno;
950 
951   output_filename = make_filename (files_created);
952 
953   if (files_created == UINT_MAX)
954     {
955       fopen_ok = false;
956       fopen_errno = EOVERFLOW;
957     }
958   else
959     {
960       /* Create the output file in a critical section, to avoid races.  */
961       sigset_t oldset;
962       sigprocmask (SIG_BLOCK, &caught_signals, &oldset);
963       output_stream = fopen (output_filename, "w");
964       fopen_ok = (output_stream != NULL);
965       fopen_errno = errno;
966       files_created += fopen_ok;
967       sigprocmask (SIG_SETMASK, &oldset, NULL);
968     }
969 
970   if (! fopen_ok)
971     {
972       error (0, fopen_errno, "%s", quotef (output_filename));
973       cleanup_fatal ();
974     }
975   bytes_written = 0;
976 }
977 
978 /* If requested, delete all the files we have created.  This function
979    must be called only from critical sections.  */
980 
981 static void
delete_all_files(bool in_signal_handler)982 delete_all_files (bool in_signal_handler)
983 {
984   if (! remove_files)
985     return;
986 
987   for (unsigned int i = 0; i < files_created; i++)
988     {
989       const char *name = make_filename (i);
990       if (unlink (name) != 0 && !in_signal_handler)
991         error (0, errno, "%s", quotef (name));
992     }
993 
994   files_created = 0;
995 }
996 
997 /* Close the current output file and print the count
998    of characters in this file. */
999 
1000 static void
close_output_file(void)1001 close_output_file (void)
1002 {
1003   if (output_stream)
1004     {
1005       if (ferror (output_stream))
1006         {
1007           error (0, 0, _("write error for %s"), quoteaf (output_filename));
1008           output_stream = NULL;
1009           cleanup_fatal ();
1010         }
1011       if (fclose (output_stream) != 0)
1012         {
1013           error (0, errno, "%s", quotef (output_filename));
1014           output_stream = NULL;
1015           cleanup_fatal ();
1016         }
1017       if (bytes_written == 0 && elide_empty_files)
1018         {
1019           sigset_t oldset;
1020           bool unlink_ok;
1021           int unlink_errno;
1022 
1023           /* Remove the output file in a critical section, to avoid races.  */
1024           sigprocmask (SIG_BLOCK, &caught_signals, &oldset);
1025           unlink_ok = (unlink (output_filename) == 0);
1026           unlink_errno = errno;
1027           files_created -= unlink_ok;
1028           sigprocmask (SIG_SETMASK, &oldset, NULL);
1029 
1030           if (! unlink_ok)
1031             error (0, unlink_errno, "%s", quotef (output_filename));
1032         }
1033       else
1034         {
1035           if (!suppress_count)
1036             {
1037               char buf[INT_BUFSIZE_BOUND (uintmax_t)];
1038               fprintf (stdout, "%s\n", umaxtostr (bytes_written, buf));
1039             }
1040         }
1041       output_stream = NULL;
1042     }
1043 }
1044 
1045 /* Save line LINE to the output file and
1046    increment the character count for the current file. */
1047 
1048 static void
save_line_to_file(const struct cstring * line)1049 save_line_to_file (const struct cstring *line)
1050 {
1051   size_t l = fwrite (line->str, sizeof (char), line->len, output_stream);
1052   if (l != line->len)
1053     {
1054       error (0, errno, _("write error for %s"), quoteaf (output_filename));
1055       output_stream = NULL;
1056       cleanup_fatal ();
1057     }
1058   bytes_written += line->len;
1059 }
1060 
1061 /* Return a new, initialized control record. */
1062 
1063 static struct control *
new_control_record(void)1064 new_control_record (void)
1065 {
1066   static size_t control_allocated = 0; /* Total space allocated. */
1067   struct control *p;
1068 
1069   if (control_used == control_allocated)
1070     controls = X2NREALLOC (controls, &control_allocated);
1071   p = &controls[control_used++];
1072   p->regexpr = false;
1073   p->repeat = 0;
1074   p->repeat_forever = false;
1075   p->lines_required = 0;
1076   p->offset = 0;
1077   return p;
1078 }
1079 
1080 /* Check if there is a numeric offset after a regular expression.
1081    STR is the entire command line argument.
1082    P is the control record for this regular expression.
1083    NUM is the numeric part of STR. */
1084 
1085 static void
check_for_offset(struct control * p,const char * str,const char * num)1086 check_for_offset (struct control *p, const char *str, const char *num)
1087 {
1088   if (xstrtoimax (num, NULL, 10, &p->offset, "") != LONGINT_OK)
1089     die (EXIT_FAILURE, 0, _("%s: integer expected after delimiter"),
1090          quote (str));
1091 }
1092 
1093 /* Given that the first character of command line arg STR is '{',
1094    make sure that the rest of the string is a valid repeat count
1095    and store its value in P.
1096    ARGNUM is the ARGV index of STR. */
1097 
1098 static void
parse_repeat_count(int argnum,struct control * p,char * str)1099 parse_repeat_count (int argnum, struct control *p, char *str)
1100 {
1101   uintmax_t val;
1102   char *end;
1103 
1104   end = str + strlen (str) - 1;
1105   if (*end != '}')
1106     die (EXIT_FAILURE, 0, _("%s: '}' is required in repeat count"),
1107          quote (str));
1108   *end = '\0';
1109 
1110   if (str+1 == end-1 && *(str+1) == '*')
1111     p->repeat_forever = true;
1112   else
1113     {
1114       if (xstrtoumax (str + 1, NULL, 10, &val, "") != LONGINT_OK)
1115         {
1116           die (EXIT_FAILURE, 0,
1117                _("%s}: integer required between '{' and '}'"),
1118                quote (global_argv[argnum]));
1119         }
1120       p->repeat = val;
1121     }
1122 
1123   *end = '}';
1124 }
1125 
1126 /* Extract the regular expression from STR and check for a numeric offset.
1127    STR should start with the regexp delimiter character.
1128    Return a new control record for the regular expression.
1129    ARGNUM is the ARGV index of STR.
1130    Unless IGNORE is true, mark these lines for output. */
1131 
1132 static struct control *
extract_regexp(int argnum,bool ignore,char const * str)1133 extract_regexp (int argnum, bool ignore, char const *str)
1134 {
1135   size_t len;			/* Number of bytes in this regexp. */
1136   char delim = *str;
1137   char const *closing_delim;
1138   struct control *p;
1139   const char *err;
1140 
1141   closing_delim = strrchr (str + 1, delim);
1142   if (closing_delim == NULL)
1143     die (EXIT_FAILURE, 0,
1144          _("%s: closing delimiter '%c' missing"), str, delim);
1145 
1146   len = closing_delim - str - 1;
1147   p = new_control_record ();
1148   p->argnum = argnum;
1149   p->ignore = ignore;
1150 
1151   p->regexpr = true;
1152   p->re_compiled.buffer = NULL;
1153   p->re_compiled.allocated = 0;
1154   p->re_compiled.fastmap = xmalloc (UCHAR_MAX + 1);
1155   p->re_compiled.translate = NULL;
1156   re_syntax_options =
1157     RE_SYNTAX_POSIX_BASIC & ~RE_CONTEXT_INVALID_DUP & ~RE_NO_EMPTY_RANGES;
1158   err = re_compile_pattern (str + 1, len, &p->re_compiled);
1159   if (err)
1160     {
1161       error (0, 0, _("%s: invalid regular expression: %s"), quote (str), err);
1162       cleanup_fatal ();
1163     }
1164 
1165   if (closing_delim[1])
1166     check_for_offset (p, str, closing_delim + 1);
1167 
1168   return p;
1169 }
1170 
1171 /* Extract the break patterns from args START through ARGC - 1 of ARGV.
1172    After each pattern, check if the next argument is a repeat count. */
1173 
1174 static void
parse_patterns(int argc,int start,char ** argv)1175 parse_patterns (int argc, int start, char **argv)
1176 {
1177   struct control *p;		/* New control record created. */
1178   uintmax_t val;
1179   static uintmax_t last_val = 0;
1180 
1181   for (int i = start; i < argc; i++)
1182     {
1183       if (*argv[i] == '/' || *argv[i] == '%')
1184         {
1185           p = extract_regexp (i, *argv[i] == '%', argv[i]);
1186         }
1187       else
1188         {
1189           p = new_control_record ();
1190           p->argnum = i;
1191 
1192           if (xstrtoumax (argv[i], NULL, 10, &val, "") != LONGINT_OK)
1193             die (EXIT_FAILURE, 0, _("%s: invalid pattern"), quote (argv[i]));
1194           if (val == 0)
1195             die (EXIT_FAILURE, 0,
1196                  _("%s: line number must be greater than zero"), argv[i]);
1197           if (val < last_val)
1198             {
1199               char buf[INT_BUFSIZE_BOUND (uintmax_t)];
1200               die (EXIT_FAILURE, 0,
1201                _("line number %s is smaller than preceding line number, %s"),
1202                    quote (argv[i]), umaxtostr (last_val, buf));
1203             }
1204 
1205           if (val == last_val)
1206             error (0, 0,
1207            _("warning: line number %s is the same as preceding line number"),
1208                    quote (argv[i]));
1209 
1210           last_val = val;
1211 
1212           p->lines_required = val;
1213         }
1214 
1215       if (i + 1 < argc && *argv[i + 1] == '{')
1216         {
1217           /* We have a repeat count. */
1218           i++;
1219           parse_repeat_count (i, p, argv[i]);
1220         }
1221     }
1222 }
1223 
1224 
1225 
/* Bit values recording that the printf format flags ' and # were seen.
   These can be ORed together.  */
enum
{
  FLAG_THOUSANDS = 1,
  FLAG_ALTERNATIVE = 2
};

/* Skip over the leading run of printf flag characters (-, 0, ' and #)
   in FORMAT.  Store into *FLAGS_PTR the OR of FLAG_THOUSANDS (if '
   was seen) and FLAG_ALTERNATIVE (if # was seen), and return the
   number of flag characters skipped.  */
static size_t
get_format_flags (char const *format, int *flags_ptr)
{
  char const *cursor = format;
  int seen = 0;

  for (;; cursor++)
    {
      if (*cursor == '\'')
        seen |= FLAG_THOUSANDS;
      else if (*cursor == '#')
        seen |= FLAG_ALTERNATIVE;
      else if (*cursor != '-' && *cursor != '0')
        break;
    }

  *flags_ptr = seen;
  return cursor - format;
}
1258 
1259 /* Check that the printf format conversion specifier *FORMAT is valid
1260    and compatible with FLAGS.  Change it to 'u' if it is 'd' or 'i',
1261    since the format will be used with an unsigned value.  */
1262 static void
check_format_conv_type(char * format,int flags)1263 check_format_conv_type (char *format, int flags)
1264 {
1265   unsigned char ch = *format;
1266   int compatible_flags = FLAG_THOUSANDS;
1267 
1268   switch (ch)
1269     {
1270     case 'd':
1271     case 'i':
1272       *format = 'u';
1273       break;
1274 
1275     case 'u':
1276       break;
1277 
1278     case 'o':
1279     case 'x':
1280     case 'X':
1281       compatible_flags = FLAG_ALTERNATIVE;
1282       break;
1283 
1284     case 0:
1285       die (EXIT_FAILURE, 0, _("missing conversion specifier in suffix"));
1286 
1287     default:
1288       if (isprint (ch))
1289         die (EXIT_FAILURE, 0,
1290              _("invalid conversion specifier in suffix: %c"), ch);
1291       else
1292         die (EXIT_FAILURE, 0,
1293              _("invalid conversion specifier in suffix: \\%.3o"), ch);
1294     }
1295 
1296   if (flags & ~ compatible_flags)
1297     die (EXIT_FAILURE, 0,
1298          _("invalid flags in conversion specification: %%%c%c"),
1299          (flags & ~ compatible_flags & FLAG_ALTERNATIVE ? '#' : '\''), ch);
1300 }
1301 
/* Validate the suffix FORMAT and return the number of bytes that
   formatting UINT_MAX with it produces, as measured by snprintf.
   FORMAT must contain exactly one conversion specification other
   than "%%"; a 'd' or 'i' specifier in it is rewritten to 'u'.
   If the format is invalid, diagnose the problem and exit.  */
static size_t
max_out (char *format)
{
  bool seen_conversion = false;

  for (char *f = format; *f != '\0'; f++)
    {
      if (*f != '%')
        continue;

      /* Step past the '%'; a second '%' is a literal percent sign.  */
      f++;
      if (*f == '%')
        continue;

      if (seen_conversion)
        die (EXIT_FAILURE, 0,
             _("too many %% conversion specifications in suffix"));
      seen_conversion = true;

      /* Skip the flags, the field width and the precision; F is then
         left on the conversion specifier character.  */
      int flags;
      f += get_format_flags (f, &flags);
      while (ISDIGIT (*f))
        f++;
      if (*f == '.')
        {
          do
            f++;
          while (ISDIGIT (*f));
        }

      check_format_conv_type (f, flags);
    }

  if (! seen_conversion)
    die (EXIT_FAILURE, 0,
         _("missing %% conversion specification in suffix"));

  int maxlen = snprintf (NULL, 0, format, UINT_MAX);
  if (maxlen < 0 || SIZE_MAX < maxlen)
    xalloc_die ();
  return maxlen;
}
1336 
/* Program entry point.  Parse the options, size the buffer used for
   output file names, open the input operand, compile the pattern
   operands, install the signal handlers and finally split the input.
   Return EXIT_SUCCESS if control gets back here after the split.  */
int
main (int argc, char **argv)
{
  int optc;

  initialize_main (&argc, &argv);
  set_program_name (argv[0]);
  setlocale (LC_ALL, "");
  bindtextdomain (PACKAGE, LOCALEDIR);
  textdomain (PACKAGE);

  atexit (close_stdout);

  /* Establish the defaults before looking at the options.  */
  global_argv = argv;
  controls = NULL;
  control_used = 0;
  suppress_count = false;
  remove_files = true;
  suppress_matched = false;
  prefix = DEFAULT_PREFIX;

  while ((optc = getopt_long (argc, argv, "f:b:kn:sqz", longopts, NULL)) != -1)
    switch (optc)
      {
      case 'f':
        prefix = optarg;
        break;

      case 'b':
        suffix = optarg;
        break;

      case 'k':
        remove_files = false;
        break;

      case 'n':
        /* Accept a decimal value in the range 0 .. MIN (INT_MAX, SIZE_MAX).  */
        digits = xdectoimax (optarg, 0, MIN (INT_MAX, SIZE_MAX), "",
                             _("invalid number"), 0);
        break;

      /* -s and -q are synonyms.  */
      case 's':
      case 'q':
        suppress_count = true;
        break;

      case 'z':
        elide_empty_files = true;
        break;

      case SUPPRESS_MATCHED_OPTION:
        suppress_matched = true;
        break;

      case_GETOPT_HELP_CHAR;

      case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);

      default:
        usage (EXIT_FAILURE);
      }

  /* At least two operands are required: the input file and one pattern.  */
  if (argc - optind < 2)
    {
      if (argc <= optind)
        error (0, 0, _("missing operand"));
      else
        error (0, 0, _("missing operand after %s"), quote (argv[argc - 1]));
      usage (EXIT_FAILURE);
    }

  /* Allocate the file name buffer: room for the prefix, for the longest
     number the suffix format (or the -n digit count) can produce, and
     for one more byte (presumably the terminating NUL).  Fail rather
     than let the size computation wrap around.  */
  size_t prefix_len = strlen (prefix);
  size_t max_digit_string_len
    = (suffix
       ? max_out (suffix)
       : MAX (INT_STRLEN_BOUND (unsigned int), digits));
  if (SIZE_MAX - 1 - prefix_len < max_digit_string_len)
    xalloc_die ();
  filename_space = xmalloc (prefix_len + max_digit_string_len + 1);

  /* The first operand is the input file; the rest are patterns.  */
  set_input_file (argv[optind++]);

  parse_patterns (argc, optind, argv);

  {
    int i;
    static int const sig[] =
      {
        /* The usual suspects.  */
        SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGTERM,
#ifdef SIGPOLL
        SIGPOLL,
#endif
#ifdef SIGPROF
        SIGPROF,
#endif
#ifdef SIGVTALRM
        SIGVTALRM,
#endif
#ifdef SIGXCPU
        SIGXCPU,
#endif
#ifdef SIGXFSZ
        SIGXFSZ,
#endif
      };
    enum { nsigs = ARRAY_CARDINALITY (sig) };

    struct sigaction act;

    /* Collect into CAUGHT_SIGNALS every listed signal whose current
       disposition is not SIG_IGN; signals that are being ignored are
       left alone.  */
    sigemptyset (&caught_signals);
    for (i = 0; i < nsigs; i++)
      {
        sigaction (sig[i], NULL, &act);
        if (act.sa_handler != SIG_IGN)
          sigaddset (&caught_signals, sig[i]);
      }

    /* Install interrupt_handler for each collected signal, with the
       whole set as the handler's signal mask.  */
    act.sa_handler = interrupt_handler;
    act.sa_mask = caught_signals;
    act.sa_flags = 0;

    for (i = 0; i < nsigs; i++)
      if (sigismember (&caught_signals, sig[i]))
        sigaction (sig[i], &act, NULL);
  }

  split_file ();

  /* A failure to close standard input is reported as a read error.
     NOTE(review): set_input_file presumably arranges for the input
     file to be read through standard input -- confirm.  */
  if (close (STDIN_FILENO) != 0)
    {
      error (0, errno, _("read error"));
      cleanup_fatal ();
    }

  return EXIT_SUCCESS;
}
1474 
1475 void
usage(int status)1476 usage (int status)
1477 {
1478   if (status != EXIT_SUCCESS)
1479     emit_try_help ();
1480   else
1481     {
1482       printf (_("\
1483 Usage: %s [OPTION]... FILE PATTERN...\n\
1484 "),
1485               program_name);
1486       fputs (_("\
1487 Output pieces of FILE separated by PATTERN(s) to files 'xx00', 'xx01', ...,\n\
1488 and output byte counts of each piece to standard output.\n\
1489 "), stdout);
1490        fputs (_("\
1491 \n\
1492 Read standard input if FILE is -\n\
1493 "), stdout);
1494 
1495       emit_mandatory_arg_note ();
1496 
1497       fputs (_("\
1498   -b, --suffix-format=FORMAT  use sprintf FORMAT instead of %02d\n\
1499   -f, --prefix=PREFIX        use PREFIX instead of 'xx'\n\
1500   -k, --keep-files           do not remove output files on errors\n\
1501 "), stdout);
1502       fputs (_("\
1503       --suppress-matched     suppress the lines matching PATTERN\n\
1504 "), stdout);
1505       fputs (_("\
1506   -n, --digits=DIGITS        use specified number of digits instead of 2\n\
1507   -s, --quiet, --silent      do not print counts of output file sizes\n\
1508   -z, --elide-empty-files    remove empty output files\n\
1509 "), stdout);
1510       fputs (HELP_OPTION_DESCRIPTION, stdout);
1511       fputs (VERSION_OPTION_DESCRIPTION, stdout);
1512       fputs (_("\
1513 \n\
1514 Each PATTERN may be:\n\
1515   INTEGER            copy up to but not including specified line number\n\
1516   /REGEXP/[OFFSET]   copy up to but not including a matching line\n\
1517   %REGEXP%[OFFSET]   skip to, but not including a matching line\n\
1518   {INTEGER}          repeat the previous pattern specified number of times\n\
1519   {*}                repeat the previous pattern as many times as possible\n\
1520 \n\
1521 A line OFFSET is a required '+' or '-' followed by a positive integer.\n\
1522 "), stdout);
1523       emit_ancillary_info (PROGRAM_NAME);
1524     }
1525   exit (status);
1526 }
1527