1 /* csplit - split a file into sections determined by context lines
2 Copyright (C) 1991-2020 Free Software Foundation, Inc.
3
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
16
17 /* Written by Stuart Kemp, cpsrk@groper.jcu.edu.au.
18 Modified by David MacKenzie, djm@gnu.ai.mit.edu. */
19
20 #include <config.h>
21
22 #include <assert.h>
23 #include <getopt.h>
24 #include <sys/types.h>
25 #include <signal.h>
26
27 #include "system.h"
28
29 #include <regex.h>
30
31 #include "die.h"
32 #include "error.h"
33 #include "fd-reopen.h"
34 #include "quote.h"
35 #include "safe-read.h"
36 #include "stdio--.h"
37 #include "xdectoint.h"
38 #include "xstrtol.h"
39
/* Official name of this program; it carries no prefix such as 'g'.  */
#define PROGRAM_NAME "csplit"

/* The people credited as authors of this program.  */
#define AUTHORS \
  proper_name ("Stuart Kemp"), \
  proper_name ("David MacKenzie")

/* Default prefix for the names of output files.  */
#define DEFAULT_PREFIX "xx"
49
/* One pattern argument after parsing: either a line count or a
   compiled regular expression, plus its repeat information.  */
struct control
{
  /* Signed number of lines, relative to the matching line, at which
     the split takes place.  */
  intmax_t offset;

  /* Number of lines required (used by non-regexp patterns).  */
  uintmax_t lines_required;

  /* How many extra times the pattern is applied after the first.  */
  uintmax_t repeat;

  /* Index in ARGV of the argument this record describes.  */
  int argnum;

  /* True if the repeat count was written as '*'.  */
  bool repeat_forever;

  /* True if the selected section produces no output (regexp only).  */
  bool ignore;

  /* True if this record holds a regular expression.  */
  bool regexpr;

  /* The compiled regular expression.  */
  struct re_pattern_buffer re_compiled;
};
62
/* Sizing parameters for the input buffers and line lists.

   Exactly one definition of each macro is emitted.  Defining an
   object-like macro a second time with a different replacement list
   is a constraint violation, so the DEBUG values must replace the
   normal ones rather than be stacked on top of them.  */
#ifdef DEBUG
/* Some small values to test the algorithms.  */
# define START_SIZE 200
# define INCR_SIZE 10
# define CTRL_SIZE 1
#else
/* Initial size of data area in buffers.  */
# define START_SIZE 8191

/* Increment size for data area.  */
# define INCR_SIZE 2048

/* Number of lines kept in each node in line list.  */
# define CTRL_SIZE 80
#endif
78
/* A counted string: a pointer to the bytes together with their number.  */
struct cstring
{
  size_t len;                   /* Number of bytes at STR.  */
  char *str;                    /* First byte of the string.  */
};
85
86 /* Pointers to the beginnings of lines in the buffer area.
87 These structures are linked together if needed. */
88 struct line
89 {
90 size_t used; /* Number of offsets used in this struct. */
91 size_t insert_index; /* Next offset to use when inserting line. */
92 size_t retrieve_index; /* Next index to use when retrieving line. */
93 struct cstring starts[CTRL_SIZE]; /* Lines in the data area. */
94 struct line *next; /* Next in linked list. */
95 };
96
/* A chunk of input.  It owns a data area holding the raw bytes and a
   list of line records pointing at the individual lines inside it.  */
struct buffer_record
{
  /* Capacity of the data area, in bytes.  */
  size_t bytes_alloc;

  /* Number of bytes of the data area that hold input.  */
  size_t bytes_used;

  /* Line number of the first line stored in this buffer.  */
  uintmax_t start_line;

  /* Line number of the first line that can still be retrieved.  */
  uintmax_t first_available;

  /* Count of complete lines stored in this buffer.  */
  size_t num_lines;

  /* The data area.  */
  char *buffer;

  /* First node of the list of line records.  */
  struct line *line_start;

  /* The line record node currently in use.  */
  struct line *curr_line;

  /* Next buffer in the list.  */
  struct buffer_record *next;
};
112
/* Functions that are used before the point where they are defined.  */
static void create_output_file (void);
static void save_line_to_file (const struct cstring *line);
static void close_output_file (void);
static void delete_all_files (bool in_signal_handler);
117
118 /* Start of buffer list. */
119 static struct buffer_record *head = NULL;
120
121 /* Partially read line. */
122 static char *hold_area = NULL;
123
124 /* Number of bytes in 'hold_area'. */
125 static size_t hold_count = 0;
126
127 /* Number of the last line in the buffers. */
128 static uintmax_t last_line_number = 0;
129
130 /* Number of the line currently being examined. */
131 static uintmax_t current_line = 0;
132
133 /* If true, we have read EOF. */
134 static bool have_read_eof = false;
135
136 /* Name of output files. */
137 static char *volatile filename_space = NULL;
138
139 /* Prefix part of output file names. */
140 static char const *volatile prefix = NULL;
141
142 /* Suffix part of output file names. */
143 static char *volatile suffix = NULL;
144
145 /* Number of digits to use in output file names. */
146 static int volatile digits = 2;
147
148 /* Number of files created so far. */
149 static unsigned int volatile files_created = 0;
150
151 /* Number of bytes written to current file. */
152 static uintmax_t bytes_written;
153
154 /* Output file pointer. */
155 static FILE *output_stream = NULL;
156
157 /* Output file name. */
158 static char *output_filename = NULL;
159
160 /* Perhaps it would be cleaner to pass arg values instead of indexes. */
161 static char **global_argv;
162
163 /* If true, do not print the count of bytes in each output file. */
164 static bool suppress_count;
165
166 /* If true, remove output files on error. */
167 static bool volatile remove_files;
168
169 /* If true, remove all output files which have a zero length. */
170 static bool elide_empty_files;
171
172 /* If true, suppress the lines that match the PATTERN */
173 static bool suppress_matched;
174
175 /* The compiled pattern arguments, which determine how to split
176 the input file. */
177 static struct control *controls;
178
179 /* Number of elements in 'controls'. */
180 static size_t control_used;
181
182 /* The set of signals that are caught. */
183 static sigset_t caught_signals;
184
/* Long options without a short equivalent are identified by values
   that cannot be characters; these start at CHAR_MAX + 1.  */
enum
{
  SUPPRESS_MATCHED_OPTION = CHAR_MAX + 1
};
191
192 static struct option const longopts[] =
193 {
194 {"digits", required_argument, NULL, 'n'},
195 {"quiet", no_argument, NULL, 'q'},
196 {"silent", no_argument, NULL, 's'},
197 {"keep-files", no_argument, NULL, 'k'},
198 {"elide-empty-files", no_argument, NULL, 'z'},
199 {"prefix", required_argument, NULL, 'f'},
200 {"suffix-format", required_argument, NULL, 'b'},
201 {"suppress-matched", no_argument, NULL, SUPPRESS_MATCHED_OPTION},
202 {GETOPT_HELP_OPTION_DECL},
203 {GETOPT_VERSION_OPTION_DECL},
204 {NULL, 0, NULL, 0}
205 };
206
207 /* Optionally remove files created so far; then exit.
208 Called when an error detected. */
209
210 static void
cleanup(void)211 cleanup (void)
212 {
213 sigset_t oldset;
214
215 close_output_file ();
216
217 sigprocmask (SIG_BLOCK, &caught_signals, &oldset);
218 delete_all_files (false);
219 sigprocmask (SIG_SETMASK, &oldset, NULL);
220 }
221
222 static void cleanup_fatal (void) ATTRIBUTE_NORETURN;
223 static void
cleanup_fatal(void)224 cleanup_fatal (void)
225 {
226 cleanup ();
227 exit (EXIT_FAILURE);
228 }
229
/* Report that memory is exhausted, then clean up and exit with a
   failure status.  */
extern void
xalloc_die (void)
{
  error (0, 0, "%s", _("memory exhausted"));
  cleanup_fatal ();
}
236
/* Handler for signal SIG: delete the output files if their removal
   has been requested, then let the signal take its default action.  */
static void
interrupt_handler (int sig)
{
  delete_all_files (true);

  /* Restore the default disposition.  SIG stays blocked for as long
     as this handler runs, so the signal raised below is delivered,
     with its default action, only once the handler has returned and
     the block has been lifted.  */
  signal (sig, SIG_DFL);
  raise (sig);
}
247
248 /* Keep track of NUM bytes of a partial line in buffer START.
249 These bytes will be retrieved later when another large buffer is read. */
250
251 static void
save_to_hold_area(char * start,size_t num)252 save_to_hold_area (char *start, size_t num)
253 {
254 free (hold_area);
255 hold_area = start;
256 hold_count = num;
257 }
258
259 /* Read up to MAX_N_BYTES bytes from the input stream into DEST.
260 Return the number of bytes read. */
261
262 static size_t
read_input(char * dest,size_t max_n_bytes)263 read_input (char *dest, size_t max_n_bytes)
264 {
265 size_t bytes_read;
266
267 if (max_n_bytes == 0)
268 return 0;
269
270 bytes_read = safe_read (STDIN_FILENO, dest, max_n_bytes);
271
272 if (bytes_read == 0)
273 have_read_eof = true;
274
275 if (bytes_read == SAFE_READ_ERROR)
276 {
277 error (0, errno, _("read error"));
278 cleanup_fatal ();
279 }
280
281 return bytes_read;
282 }
283
284 /* Initialize existing line record P. */
285
286 static void
clear_line_control(struct line * p)287 clear_line_control (struct line *p)
288 {
289 p->used = 0;
290 p->insert_index = 0;
291 p->retrieve_index = 0;
292 }
293
294 /* Return a new, initialized line record. */
295
296 static struct line *
new_line_control(void)297 new_line_control (void)
298 {
299 struct line *p = xmalloc (sizeof *p);
300
301 p->next = NULL;
302 clear_line_control (p);
303
304 return p;
305 }
306
307 /* Record LINE_START, which is the address of the start of a line
308 of length LINE_LEN in the large buffer, in the lines buffer of B. */
309
310 static void
keep_new_line(struct buffer_record * b,char * line_start,size_t line_len)311 keep_new_line (struct buffer_record *b, char *line_start, size_t line_len)
312 {
313 struct line *l;
314
315 /* If there is no existing area to keep line info, get some. */
316 if (b->line_start == NULL)
317 b->line_start = b->curr_line = new_line_control ();
318
319 /* If existing area for lines is full, get more. */
320 if (b->curr_line->used == CTRL_SIZE)
321 {
322 b->curr_line->next = new_line_control ();
323 b->curr_line = b->curr_line->next;
324 }
325
326 l = b->curr_line;
327
328 /* Record the start of the line, and update counters. */
329 l->starts[l->insert_index].str = line_start;
330 l->starts[l->insert_index].len = line_len;
331 l->used++;
332 l->insert_index++;
333 }
334
335 /* Scan the buffer in B for newline characters
336 and record the line start locations and lengths in B.
337 Return the number of lines found in this buffer.
338
339 There may be an incomplete line at the end of the buffer;
340 a pointer is kept to this area, which will be used when
341 the next buffer is filled. */
342
343 static size_t
record_line_starts(struct buffer_record * b)344 record_line_starts (struct buffer_record *b)
345 {
346 char *line_start; /* Start of current line. */
347 char *line_end; /* End of each line found. */
348 size_t bytes_left; /* Length of incomplete last line. */
349 size_t lines; /* Number of lines found. */
350 size_t line_length; /* Length of each line found. */
351
352 if (b->bytes_used == 0)
353 return 0;
354
355 lines = 0;
356 line_start = b->buffer;
357 bytes_left = b->bytes_used;
358
359 while (true)
360 {
361 line_end = memchr (line_start, '\n', bytes_left);
362 if (line_end == NULL)
363 break;
364 line_length = line_end - line_start + 1;
365 keep_new_line (b, line_start, line_length);
366 bytes_left -= line_length;
367 line_start = line_end + 1;
368 lines++;
369 }
370
371 /* Check for an incomplete last line. */
372 if (bytes_left)
373 {
374 if (have_read_eof)
375 {
376 keep_new_line (b, line_start, bytes_left);
377 lines++;
378 }
379 else
380 save_to_hold_area (xmemdup (line_start, bytes_left), bytes_left);
381 }
382
383 b->num_lines = lines;
384 b->first_available = b->start_line = last_line_number + 1;
385 last_line_number += lines;
386
387 return lines;
388 }
389
390 /* Return a new buffer with room to store SIZE bytes, plus
391 an extra byte for safety. */
392
393 static struct buffer_record *
create_new_buffer(size_t size)394 create_new_buffer (size_t size)
395 {
396 struct buffer_record *new_buffer = xmalloc (sizeof *new_buffer);
397
398 new_buffer->buffer = xmalloc (size + 1);
399
400 new_buffer->bytes_alloc = size;
401 new_buffer->line_start = new_buffer->curr_line = NULL;
402
403 return new_buffer;
404 }
405
406 /* Return a new buffer of at least MINSIZE bytes. If a buffer of at
407 least that size is currently free, use it, otherwise create a new one. */
408
409 static struct buffer_record *
get_new_buffer(size_t min_size)410 get_new_buffer (size_t min_size)
411 {
412 struct buffer_record *new_buffer; /* Buffer to return. */
413 size_t alloc_size; /* Actual size that will be requested. */
414
415 alloc_size = START_SIZE;
416 if (alloc_size < min_size)
417 {
418 size_t s = min_size - alloc_size + INCR_SIZE - 1;
419 alloc_size += s - s % INCR_SIZE;
420 }
421
422 new_buffer = create_new_buffer (alloc_size);
423
424 new_buffer->num_lines = 0;
425 new_buffer->bytes_used = 0;
426 new_buffer->start_line = new_buffer->first_available = last_line_number + 1;
427 new_buffer->next = NULL;
428
429 return new_buffer;
430 }
431
432 static void
free_buffer(struct buffer_record * buf)433 free_buffer (struct buffer_record *buf)
434 {
435 struct line *l;
436 for (l = buf->line_start; l;)
437 {
438 struct line *n = l->next;
439 free (l);
440 l = n;
441 }
442 buf->line_start = NULL;
443 free (buf->buffer);
444 buf->buffer = NULL;
445 }
446
447 /* Append buffer BUF to the linked list of buffers that contain
448 some data yet to be processed. */
449
450 static void
save_buffer(struct buffer_record * buf)451 save_buffer (struct buffer_record *buf)
452 {
453 struct buffer_record *p;
454
455 buf->next = NULL;
456 buf->curr_line = buf->line_start;
457
458 if (head == NULL)
459 head = buf;
460 else
461 {
462 for (p = head; p->next; p = p->next)
463 /* Do nothing. */ ;
464 p->next = buf;
465 }
466 }
467
468 /* Fill a buffer of input.
469
470 Set the initial size of the buffer to a default.
471 Fill the buffer (from the hold area and input stream)
472 and find the individual lines.
473 If no lines are found (the buffer is too small to hold the next line),
474 release the current buffer (whose contents would have been put in the
475 hold area) and repeat the process with another large buffer until at least
476 one entire line has been read.
477
478 Return true if a new buffer was obtained, otherwise false
479 (in which case end-of-file must have been encountered). */
480
481 static bool
load_buffer(void)482 load_buffer (void)
483 {
484 struct buffer_record *b;
485 size_t bytes_wanted = START_SIZE; /* Minimum buffer size. */
486 size_t bytes_avail; /* Size of new buffer created. */
487 size_t lines_found; /* Number of lines in this new buffer. */
488 char *p; /* Place to load into buffer. */
489
490 if (have_read_eof)
491 return false;
492
493 /* We must make the buffer at least as large as the amount of data
494 in the partial line left over from the last call. */
495 if (bytes_wanted < hold_count)
496 bytes_wanted = hold_count;
497
498 while (1)
499 {
500 b = get_new_buffer (bytes_wanted);
501 bytes_avail = b->bytes_alloc; /* Size of buffer returned. */
502 p = b->buffer;
503
504 /* First check the 'holding' area for a partial line. */
505 if (hold_count)
506 {
507 memcpy (p, hold_area, hold_count);
508 p += hold_count;
509 b->bytes_used += hold_count;
510 bytes_avail -= hold_count;
511 hold_count = 0;
512 }
513
514 b->bytes_used += read_input (p, bytes_avail);
515
516 lines_found = record_line_starts (b);
517
518 if (lines_found || have_read_eof)
519 break;
520
521 if (xalloc_oversized (2, b->bytes_alloc))
522 xalloc_die ();
523 bytes_wanted = 2 * b->bytes_alloc;
524 free_buffer (b);
525 free (b);
526 }
527
528 if (lines_found)
529 save_buffer (b);
530 else
531 {
532 free_buffer (b);
533 free (b);
534 }
535
536 return lines_found != 0;
537 }
538
539 /* Return the line number of the first line that has not yet been retrieved. */
540
541 static uintmax_t
get_first_line_in_buffer(void)542 get_first_line_in_buffer (void)
543 {
544 if (head == NULL && !load_buffer ())
545 die (EXIT_FAILURE, errno, _("input disappeared"));
546
547 return head->first_available;
548 }
549
550 /* Return a pointer to the logical first line in the buffer and make the
551 next line the logical first line.
552 Return NULL if there is no more input. */
553
554 static struct cstring *
remove_line(void)555 remove_line (void)
556 {
557 /* If non-NULL, this is the buffer for which the previous call
558 returned the final line. So now, presuming that line has been
559 processed, we can free the buffer and reset this pointer. */
560 static struct buffer_record *prev_buf = NULL;
561
562 struct cstring *line; /* Return value. */
563 struct line *l; /* For convenience. */
564
565 if (prev_buf)
566 {
567 free_buffer (prev_buf);
568 free (prev_buf);
569 prev_buf = NULL;
570 }
571
572 if (head == NULL && !load_buffer ())
573 return NULL;
574
575 if (current_line < head->first_available)
576 current_line = head->first_available;
577
578 ++(head->first_available);
579
580 l = head->curr_line;
581
582 line = &l->starts[l->retrieve_index];
583
584 /* Advance index to next line. */
585 if (++l->retrieve_index == l->used)
586 {
587 /* Go on to the next line record. */
588 head->curr_line = l->next;
589 if (head->curr_line == NULL || head->curr_line->used == 0)
590 {
591 /* Go on to the next data block.
592 but first record the current one so we can free it
593 once the line we're returning has been processed. */
594 prev_buf = head;
595 head = head->next;
596 }
597 }
598
599 return line;
600 }
601
602 /* Search the buffers for line LINENUM, reading more input if necessary.
603 Return a pointer to the line, or NULL if it is not found in the file. */
604
605 static struct cstring *
find_line(uintmax_t linenum)606 find_line (uintmax_t linenum)
607 {
608 struct buffer_record *b;
609
610 if (head == NULL && !load_buffer ())
611 return NULL;
612
613 if (linenum < head->start_line)
614 return NULL;
615
616 for (b = head;;)
617 {
618 assert (b);
619 if (linenum < b->start_line + b->num_lines)
620 {
621 /* The line is in this buffer. */
622 struct line *l;
623 size_t offset; /* How far into the buffer the line is. */
624
625 l = b->line_start;
626 offset = linenum - b->start_line;
627 /* Find the control record. */
628 while (offset >= CTRL_SIZE)
629 {
630 l = l->next;
631 offset -= CTRL_SIZE;
632 }
633 return &l->starts[offset];
634 }
635 if (b->next == NULL && !load_buffer ())
636 return NULL;
637 b = b->next; /* Try the next data block. */
638 }
639 }
640
641 /* Return true if at least one more line is available for input. */
642
643 static bool
no_more_lines(void)644 no_more_lines (void)
645 {
646 return find_line (current_line + 1) == NULL;
647 }
648
/* Make the file NAME the standard input.  The name "-" leaves
   standard input as it is.  Exit with a failure status if the file
   cannot be opened.  */

static void
set_input_file (const char *name)
{
  if (STREQ (name, "-"))
    return;

  if (fd_reopen (STDIN_FILENO, name, O_RDONLY, 0) < 0)
    die (EXIT_FAILURE, errno, _("cannot open %s for reading"),
         quoteaf (name));
}
658
659 /* Write all lines from the beginning of the buffer up to, but
660 not including, line LAST_LINE, to the current output file.
661 If IGNORE is true, do not output lines selected here.
662 ARGNUM is the index in ARGV of the current pattern. */
663
664 static void
write_to_file(uintmax_t last_line,bool ignore,int argnum)665 write_to_file (uintmax_t last_line, bool ignore, int argnum)
666 {
667 struct cstring *line;
668 uintmax_t first_line; /* First available input line. */
669 uintmax_t lines; /* Number of lines to output. */
670 uintmax_t i;
671
672 first_line = get_first_line_in_buffer ();
673
674 if (first_line > last_line)
675 {
676 error (0, 0, _("%s: line number out of range"),
677 quote (global_argv[argnum]));
678 cleanup_fatal ();
679 }
680
681 lines = last_line - first_line;
682
683 for (i = 0; i < lines; i++)
684 {
685 line = remove_line ();
686 if (line == NULL)
687 {
688 error (0, 0, _("%s: line number out of range"),
689 quote (global_argv[argnum]));
690 cleanup_fatal ();
691 }
692 if (!ignore)
693 save_line_to_file (line);
694 }
695 }
696
697 /* Output any lines left after all regexps have been processed. */
698
699 static void
dump_rest_of_file(void)700 dump_rest_of_file (void)
701 {
702 struct cstring *line;
703
704 while ((line = remove_line ()) != NULL)
705 save_line_to_file (line);
706 }
707
708 /* Handle an attempt to read beyond EOF under the control of record P,
709 on iteration REPETITION if nonzero. */
710
711 static void handle_line_error (const struct control *, uintmax_t)
712 ATTRIBUTE_NORETURN;
713 static void
handle_line_error(const struct control * p,uintmax_t repetition)714 handle_line_error (const struct control *p, uintmax_t repetition)
715 {
716 char buf[INT_BUFSIZE_BOUND (uintmax_t)];
717
718 fprintf (stderr, _("%s: %s: line number out of range"),
719 program_name, quote (umaxtostr (p->lines_required, buf)));
720 if (repetition)
721 fprintf (stderr, _(" on repetition %s\n"), umaxtostr (repetition, buf));
722 else
723 fprintf (stderr, "\n");
724
725 cleanup_fatal ();
726 }
727
728 /* Determine the line number that marks the end of this file,
729 then get those lines and save them to the output file.
730 P is the control record.
731 REPETITION is the repetition number. */
732
733 static void
process_line_count(const struct control * p,uintmax_t repetition)734 process_line_count (const struct control *p, uintmax_t repetition)
735 {
736 uintmax_t linenum;
737 uintmax_t last_line_to_save = p->lines_required * (repetition + 1);
738
739 create_output_file ();
740
741 /* Ensure that the line number specified is not 1 greater than
742 the number of lines in the file.
743 When suppressing matched lines, check before the loop. */
744 if (no_more_lines () && suppress_matched)
745 handle_line_error (p, repetition);
746
747 linenum = get_first_line_in_buffer ();
748 while (linenum++ < last_line_to_save)
749 {
750 struct cstring *line = remove_line ();
751 if (line == NULL)
752 handle_line_error (p, repetition);
753 save_line_to_file (line);
754 }
755
756 close_output_file ();
757
758 if (suppress_matched)
759 remove_line ();
760
761 /* Ensure that the line number specified is not 1 greater than
762 the number of lines in the file. */
763 if (no_more_lines () && !suppress_matched)
764 handle_line_error (p, repetition);
765 }
766
767 static void regexp_error (struct control *, uintmax_t, bool) ATTRIBUTE_NORETURN;
768 static void
regexp_error(struct control * p,uintmax_t repetition,bool ignore)769 regexp_error (struct control *p, uintmax_t repetition, bool ignore)
770 {
771 fprintf (stderr, _("%s: %s: match not found"),
772 program_name, quote (global_argv[p->argnum]));
773
774 if (repetition)
775 {
776 char buf[INT_BUFSIZE_BOUND (uintmax_t)];
777 fprintf (stderr, _(" on repetition %s\n"), umaxtostr (repetition, buf));
778 }
779 else
780 fprintf (stderr, "\n");
781
782 if (!ignore)
783 {
784 dump_rest_of_file ();
785 close_output_file ();
786 }
787 cleanup_fatal ();
788 }
789
790 /* Read the input until a line matches the regexp in P, outputting
791 it unless P->IGNORE is true.
792 REPETITION is this repeat-count; 0 means the first time. */
793
794 static void
process_regexp(struct control * p,uintmax_t repetition)795 process_regexp (struct control *p, uintmax_t repetition)
796 {
797 struct cstring *line; /* From input file. */
798 size_t line_len; /* To make "$" in regexps work. */
799 uintmax_t break_line; /* First line number of next file. */
800 bool ignore = p->ignore; /* If true, skip this section. */
801 regoff_t ret;
802
803 if (!ignore)
804 create_output_file ();
805
806 if (suppress_matched && current_line > 0)
807 remove_line ();
808
809 /* If there is no offset for the regular expression, or
810 it is positive, then it is not necessary to buffer the lines. */
811
812 if (p->offset >= 0)
813 {
814 while (true)
815 {
816 line = find_line (++current_line);
817 if (line == NULL)
818 {
819 if (p->repeat_forever)
820 {
821 if (!ignore)
822 {
823 dump_rest_of_file ();
824 close_output_file ();
825 }
826 exit (EXIT_SUCCESS);
827 }
828 else
829 regexp_error (p, repetition, ignore);
830 }
831 line_len = line->len;
832 if (line->str[line_len - 1] == '\n')
833 line_len--;
834 ret = re_search (&p->re_compiled, line->str, line_len,
835 0, line_len, NULL);
836 if (ret == -2)
837 {
838 error (0, 0, _("error in regular expression search"));
839 cleanup_fatal ();
840 }
841 if (ret == -1)
842 {
843 line = remove_line ();
844 if (!ignore)
845 save_line_to_file (line);
846 }
847 else
848 break;
849 }
850 }
851 else
852 {
853 /* Buffer the lines. */
854 while (true)
855 {
856 line = find_line (++current_line);
857 if (line == NULL)
858 {
859 if (p->repeat_forever)
860 {
861 if (!ignore)
862 {
863 dump_rest_of_file ();
864 close_output_file ();
865 }
866 exit (EXIT_SUCCESS);
867 }
868 else
869 regexp_error (p, repetition, ignore);
870 }
871 line_len = line->len;
872 if (line->str[line_len - 1] == '\n')
873 line_len--;
874 ret = re_search (&p->re_compiled, line->str, line_len,
875 0, line_len, NULL);
876 if (ret == -2)
877 {
878 error (0, 0, _("error in regular expression search"));
879 cleanup_fatal ();
880 }
881 if (ret != -1)
882 break;
883 }
884 }
885
886 /* Account for any offset from this regexp. */
887 break_line = current_line + p->offset;
888
889 write_to_file (break_line, ignore, p->argnum);
890
891 if (!ignore)
892 close_output_file ();
893
894 if (p->offset > 0)
895 current_line = break_line;
896 }
897
898 /* Split the input file according to the control records we have built. */
899
900 static void
split_file(void)901 split_file (void)
902 {
903 for (size_t i = 0; i < control_used; i++)
904 {
905 uintmax_t j;
906 if (controls[i].regexpr)
907 {
908 for (j = 0; (controls[i].repeat_forever
909 || j <= controls[i].repeat); j++)
910 process_regexp (&controls[i], j);
911 }
912 else
913 {
914 for (j = 0; (controls[i].repeat_forever
915 || j <= controls[i].repeat); j++)
916 process_line_count (&controls[i], j);
917 }
918 }
919
920 create_output_file ();
921 dump_rest_of_file ();
922 close_output_file ();
923 }
924
925 /* Return the name of output file number NUM.
926
927 This function is called from a signal handler, so it should invoke
928 only reentrant functions that are async-signal-safe. POSIX does
929 not guarantee this for the functions called below, but we don't
930 know of any hosts where this implementation isn't safe. */
931
932 static char *
make_filename(unsigned int num)933 make_filename (unsigned int num)
934 {
935 strcpy (filename_space, prefix);
936 if (suffix)
937 sprintf (filename_space + strlen (prefix), suffix, num);
938 else
939 sprintf (filename_space + strlen (prefix), "%0*u", digits, num);
940 return filename_space;
941 }
942
943 /* Create the next output file. */
944
945 static void
create_output_file(void)946 create_output_file (void)
947 {
948 bool fopen_ok;
949 int fopen_errno;
950
951 output_filename = make_filename (files_created);
952
953 if (files_created == UINT_MAX)
954 {
955 fopen_ok = false;
956 fopen_errno = EOVERFLOW;
957 }
958 else
959 {
960 /* Create the output file in a critical section, to avoid races. */
961 sigset_t oldset;
962 sigprocmask (SIG_BLOCK, &caught_signals, &oldset);
963 output_stream = fopen (output_filename, "w");
964 fopen_ok = (output_stream != NULL);
965 fopen_errno = errno;
966 files_created += fopen_ok;
967 sigprocmask (SIG_SETMASK, &oldset, NULL);
968 }
969
970 if (! fopen_ok)
971 {
972 error (0, fopen_errno, "%s", quotef (output_filename));
973 cleanup_fatal ();
974 }
975 bytes_written = 0;
976 }
977
978 /* If requested, delete all the files we have created. This function
979 must be called only from critical sections. */
980
981 static void
delete_all_files(bool in_signal_handler)982 delete_all_files (bool in_signal_handler)
983 {
984 if (! remove_files)
985 return;
986
987 for (unsigned int i = 0; i < files_created; i++)
988 {
989 const char *name = make_filename (i);
990 if (unlink (name) != 0 && !in_signal_handler)
991 error (0, errno, "%s", quotef (name));
992 }
993
994 files_created = 0;
995 }
996
997 /* Close the current output file and print the count
998 of characters in this file. */
999
1000 static void
close_output_file(void)1001 close_output_file (void)
1002 {
1003 if (output_stream)
1004 {
1005 if (ferror (output_stream))
1006 {
1007 error (0, 0, _("write error for %s"), quoteaf (output_filename));
1008 output_stream = NULL;
1009 cleanup_fatal ();
1010 }
1011 if (fclose (output_stream) != 0)
1012 {
1013 error (0, errno, "%s", quotef (output_filename));
1014 output_stream = NULL;
1015 cleanup_fatal ();
1016 }
1017 if (bytes_written == 0 && elide_empty_files)
1018 {
1019 sigset_t oldset;
1020 bool unlink_ok;
1021 int unlink_errno;
1022
1023 /* Remove the output file in a critical section, to avoid races. */
1024 sigprocmask (SIG_BLOCK, &caught_signals, &oldset);
1025 unlink_ok = (unlink (output_filename) == 0);
1026 unlink_errno = errno;
1027 files_created -= unlink_ok;
1028 sigprocmask (SIG_SETMASK, &oldset, NULL);
1029
1030 if (! unlink_ok)
1031 error (0, unlink_errno, "%s", quotef (output_filename));
1032 }
1033 else
1034 {
1035 if (!suppress_count)
1036 {
1037 char buf[INT_BUFSIZE_BOUND (uintmax_t)];
1038 fprintf (stdout, "%s\n", umaxtostr (bytes_written, buf));
1039 }
1040 }
1041 output_stream = NULL;
1042 }
1043 }
1044
1045 /* Save line LINE to the output file and
1046 increment the character count for the current file. */
1047
1048 static void
save_line_to_file(const struct cstring * line)1049 save_line_to_file (const struct cstring *line)
1050 {
1051 size_t l = fwrite (line->str, sizeof (char), line->len, output_stream);
1052 if (l != line->len)
1053 {
1054 error (0, errno, _("write error for %s"), quoteaf (output_filename));
1055 output_stream = NULL;
1056 cleanup_fatal ();
1057 }
1058 bytes_written += line->len;
1059 }
1060
1061 /* Return a new, initialized control record. */
1062
1063 static struct control *
new_control_record(void)1064 new_control_record (void)
1065 {
1066 static size_t control_allocated = 0; /* Total space allocated. */
1067 struct control *p;
1068
1069 if (control_used == control_allocated)
1070 controls = X2NREALLOC (controls, &control_allocated);
1071 p = &controls[control_used++];
1072 p->regexpr = false;
1073 p->repeat = 0;
1074 p->repeat_forever = false;
1075 p->lines_required = 0;
1076 p->offset = 0;
1077 return p;
1078 }
1079
1080 /* Check if there is a numeric offset after a regular expression.
1081 STR is the entire command line argument.
1082 P is the control record for this regular expression.
1083 NUM is the numeric part of STR. */
1084
1085 static void
check_for_offset(struct control * p,const char * str,const char * num)1086 check_for_offset (struct control *p, const char *str, const char *num)
1087 {
1088 if (xstrtoimax (num, NULL, 10, &p->offset, "") != LONGINT_OK)
1089 die (EXIT_FAILURE, 0, _("%s: integer expected after delimiter"),
1090 quote (str));
1091 }
1092
1093 /* Given that the first character of command line arg STR is '{',
1094 make sure that the rest of the string is a valid repeat count
1095 and store its value in P.
1096 ARGNUM is the ARGV index of STR. */
1097
1098 static void
parse_repeat_count(int argnum,struct control * p,char * str)1099 parse_repeat_count (int argnum, struct control *p, char *str)
1100 {
1101 uintmax_t val;
1102 char *end;
1103
1104 end = str + strlen (str) - 1;
1105 if (*end != '}')
1106 die (EXIT_FAILURE, 0, _("%s: '}' is required in repeat count"),
1107 quote (str));
1108 *end = '\0';
1109
1110 if (str+1 == end-1 && *(str+1) == '*')
1111 p->repeat_forever = true;
1112 else
1113 {
1114 if (xstrtoumax (str + 1, NULL, 10, &val, "") != LONGINT_OK)
1115 {
1116 die (EXIT_FAILURE, 0,
1117 _("%s}: integer required between '{' and '}'"),
1118 quote (global_argv[argnum]));
1119 }
1120 p->repeat = val;
1121 }
1122
1123 *end = '}';
1124 }
1125
1126 /* Extract the regular expression from STR and check for a numeric offset.
1127 STR should start with the regexp delimiter character.
1128 Return a new control record for the regular expression.
1129 ARGNUM is the ARGV index of STR.
1130 Unless IGNORE is true, mark these lines for output. */
1131
1132 static struct control *
extract_regexp(int argnum,bool ignore,char const * str)1133 extract_regexp (int argnum, bool ignore, char const *str)
1134 {
1135 size_t len; /* Number of bytes in this regexp. */
1136 char delim = *str;
1137 char const *closing_delim;
1138 struct control *p;
1139 const char *err;
1140
1141 closing_delim = strrchr (str + 1, delim);
1142 if (closing_delim == NULL)
1143 die (EXIT_FAILURE, 0,
1144 _("%s: closing delimiter '%c' missing"), str, delim);
1145
1146 len = closing_delim - str - 1;
1147 p = new_control_record ();
1148 p->argnum = argnum;
1149 p->ignore = ignore;
1150
1151 p->regexpr = true;
1152 p->re_compiled.buffer = NULL;
1153 p->re_compiled.allocated = 0;
1154 p->re_compiled.fastmap = xmalloc (UCHAR_MAX + 1);
1155 p->re_compiled.translate = NULL;
1156 re_syntax_options =
1157 RE_SYNTAX_POSIX_BASIC & ~RE_CONTEXT_INVALID_DUP & ~RE_NO_EMPTY_RANGES;
1158 err = re_compile_pattern (str + 1, len, &p->re_compiled);
1159 if (err)
1160 {
1161 error (0, 0, _("%s: invalid regular expression: %s"), quote (str), err);
1162 cleanup_fatal ();
1163 }
1164
1165 if (closing_delim[1])
1166 check_for_offset (p, str, closing_delim + 1);
1167
1168 return p;
1169 }
1170
1171 /* Extract the break patterns from args START through ARGC - 1 of ARGV.
1172 After each pattern, check if the next argument is a repeat count. */
1173
1174 static void
parse_patterns(int argc,int start,char ** argv)1175 parse_patterns (int argc, int start, char **argv)
1176 {
1177 struct control *p; /* New control record created. */
1178 uintmax_t val;
1179 static uintmax_t last_val = 0;
1180
1181 for (int i = start; i < argc; i++)
1182 {
1183 if (*argv[i] == '/' || *argv[i] == '%')
1184 {
1185 p = extract_regexp (i, *argv[i] == '%', argv[i]);
1186 }
1187 else
1188 {
1189 p = new_control_record ();
1190 p->argnum = i;
1191
1192 if (xstrtoumax (argv[i], NULL, 10, &val, "") != LONGINT_OK)
1193 die (EXIT_FAILURE, 0, _("%s: invalid pattern"), quote (argv[i]));
1194 if (val == 0)
1195 die (EXIT_FAILURE, 0,
1196 _("%s: line number must be greater than zero"), argv[i]);
1197 if (val < last_val)
1198 {
1199 char buf[INT_BUFSIZE_BOUND (uintmax_t)];
1200 die (EXIT_FAILURE, 0,
1201 _("line number %s is smaller than preceding line number, %s"),
1202 quote (argv[i]), umaxtostr (last_val, buf));
1203 }
1204
1205 if (val == last_val)
1206 error (0, 0,
1207 _("warning: line number %s is the same as preceding line number"),
1208 quote (argv[i]));
1209
1210 last_val = val;
1211
1212 p->lines_required = val;
1213 }
1214
1215 if (i + 1 < argc && *argv[i + 1] == '{')
1216 {
1217 /* We have a repeat count. */
1218 i++;
1219 parse_repeat_count (i, p, argv[i]);
1220 }
1221 }
1222 }
1223
1224
1225
/* Bit values recording which of the printf format flags ' and # were
   seen.  They can be combined with bitwise OR.  */
enum
{
  FLAG_THOUSANDS = 1,	/* The ' flag.  */
  FLAG_ALTERNATIVE = 2	/* The # flag.  */
};

/* Scan the leading run of printf flag characters ('-', '0', '\'' and
   '#') in FORMAT.  Store in *FLAGS_PTR the OR of the FLAG_* values
   for the ' and # flags that were seen; '-' and '0' are accepted but
   not recorded.  Return the number of flag characters scanned, which
   is the index of the first byte of FORMAT that is not a flag.  */
static size_t
get_format_flags (char const *format, int *flags_ptr)
{
  size_t nflags = 0;
  int seen = 0;

  for (;;)
    {
      char c = format[nflags];

      if (c == '\'')
        seen |= FLAG_THOUSANDS;
      else if (c == '#')
        seen |= FLAG_ALTERNATIVE;
      else if (c != '-' && c != '0')
        break;			/* Not a flag (possibly the terminator).  */

      nflags++;
    }

  *flags_ptr = seen;
  return nflags;
}
1258
1259 /* Check that the printf format conversion specifier *FORMAT is valid
1260 and compatible with FLAGS. Change it to 'u' if it is 'd' or 'i',
1261 since the format will be used with an unsigned value. */
1262 static void
check_format_conv_type(char * format,int flags)1263 check_format_conv_type (char *format, int flags)
1264 {
1265 unsigned char ch = *format;
1266 int compatible_flags = FLAG_THOUSANDS;
1267
1268 switch (ch)
1269 {
1270 case 'd':
1271 case 'i':
1272 *format = 'u';
1273 break;
1274
1275 case 'u':
1276 break;
1277
1278 case 'o':
1279 case 'x':
1280 case 'X':
1281 compatible_flags = FLAG_ALTERNATIVE;
1282 break;
1283
1284 case 0:
1285 die (EXIT_FAILURE, 0, _("missing conversion specifier in suffix"));
1286
1287 default:
1288 if (isprint (ch))
1289 die (EXIT_FAILURE, 0,
1290 _("invalid conversion specifier in suffix: %c"), ch);
1291 else
1292 die (EXIT_FAILURE, 0,
1293 _("invalid conversion specifier in suffix: \\%.3o"), ch);
1294 }
1295
1296 if (flags & ~ compatible_flags)
1297 die (EXIT_FAILURE, 0,
1298 _("invalid flags in conversion specification: %%%c%c"),
1299 (flags & ~ compatible_flags & FLAG_ALTERNATIVE ? '#' : '\''), ch);
1300 }
1301
/* Validate the suffix format FORMAT and return the number of bytes
   produced by applying it to UINT_MAX.
   FORMAT must contain exactly one conversion specification other
   than "%%".  A 'd' or 'i' specifier in FORMAT is changed to 'u' by
   check_format_conv_type.  If FORMAT is invalid, diagnose the problem
   and exit.  */
static size_t
max_out (char *format)
{
  bool seen_conversion = false;
  char *f = format;

  while (*f)
    {
      /* A '%' starts a conversion unless the next byte is also '%'.
         Either way F has moved on to that next byte.  */
      if (*f == '%' && *++f != '%')
        {
          int flags;

          if (seen_conversion)
            die (EXIT_FAILURE, 0,
                 _("too many %% conversion specifications in suffix"));
          seen_conversion = true;

          /* Skip the flags, then the field width.  */
          f += get_format_flags (f, &flags);
          for (; ISDIGIT (*f); f++)
            continue;

          /* Skip an optional precision: '.' followed by digits.  */
          if (*f == '.')
            do
              f++;
            while (ISDIGIT (*f));

          check_format_conv_type (f, flags);
        }

      f++;
    }

  if (! seen_conversion)
    die (EXIT_FAILURE, 0,
         _("missing %% conversion specification in suffix"));

  /* With a null buffer of size zero, snprintf only reports the length
     it would have written.  */
  int maxlen = snprintf (NULL, 0, format, UINT_MAX);
  if (maxlen < 0 || SIZE_MAX < maxlen)
    xalloc_die ();
  return maxlen;
}
1336
1337 int
main(int argc,char ** argv)1338 main (int argc, char **argv)
1339 {
1340 int optc;
1341
1342 initialize_main (&argc, &argv);
1343 set_program_name (argv[0]);
1344 setlocale (LC_ALL, "");
1345 bindtextdomain (PACKAGE, LOCALEDIR);
1346 textdomain (PACKAGE);
1347
1348 atexit (close_stdout);
1349
1350 global_argv = argv;
1351 controls = NULL;
1352 control_used = 0;
1353 suppress_count = false;
1354 remove_files = true;
1355 suppress_matched = false;
1356 prefix = DEFAULT_PREFIX;
1357
1358 while ((optc = getopt_long (argc, argv, "f:b:kn:sqz", longopts, NULL)) != -1)
1359 switch (optc)
1360 {
1361 case 'f':
1362 prefix = optarg;
1363 break;
1364
1365 case 'b':
1366 suffix = optarg;
1367 break;
1368
1369 case 'k':
1370 remove_files = false;
1371 break;
1372
1373 case 'n':
1374 digits = xdectoimax (optarg, 0, MIN (INT_MAX, SIZE_MAX), "",
1375 _("invalid number"), 0);
1376 break;
1377
1378 case 's':
1379 case 'q':
1380 suppress_count = true;
1381 break;
1382
1383 case 'z':
1384 elide_empty_files = true;
1385 break;
1386
1387 case SUPPRESS_MATCHED_OPTION:
1388 suppress_matched = true;
1389 break;
1390
1391 case_GETOPT_HELP_CHAR;
1392
1393 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
1394
1395 default:
1396 usage (EXIT_FAILURE);
1397 }
1398
1399 if (argc - optind < 2)
1400 {
1401 if (argc <= optind)
1402 error (0, 0, _("missing operand"));
1403 else
1404 error (0, 0, _("missing operand after %s"), quote (argv[argc - 1]));
1405 usage (EXIT_FAILURE);
1406 }
1407
1408 size_t prefix_len = strlen (prefix);
1409 size_t max_digit_string_len
1410 = (suffix
1411 ? max_out (suffix)
1412 : MAX (INT_STRLEN_BOUND (unsigned int), digits));
1413 if (SIZE_MAX - 1 - prefix_len < max_digit_string_len)
1414 xalloc_die ();
1415 filename_space = xmalloc (prefix_len + max_digit_string_len + 1);
1416
1417 set_input_file (argv[optind++]);
1418
1419 parse_patterns (argc, optind, argv);
1420
1421 {
1422 int i;
1423 static int const sig[] =
1424 {
1425 /* The usual suspects. */
1426 SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGTERM,
1427 #ifdef SIGPOLL
1428 SIGPOLL,
1429 #endif
1430 #ifdef SIGPROF
1431 SIGPROF,
1432 #endif
1433 #ifdef SIGVTALRM
1434 SIGVTALRM,
1435 #endif
1436 #ifdef SIGXCPU
1437 SIGXCPU,
1438 #endif
1439 #ifdef SIGXFSZ
1440 SIGXFSZ,
1441 #endif
1442 };
1443 enum { nsigs = ARRAY_CARDINALITY (sig) };
1444
1445 struct sigaction act;
1446
1447 sigemptyset (&caught_signals);
1448 for (i = 0; i < nsigs; i++)
1449 {
1450 sigaction (sig[i], NULL, &act);
1451 if (act.sa_handler != SIG_IGN)
1452 sigaddset (&caught_signals, sig[i]);
1453 }
1454
1455 act.sa_handler = interrupt_handler;
1456 act.sa_mask = caught_signals;
1457 act.sa_flags = 0;
1458
1459 for (i = 0; i < nsigs; i++)
1460 if (sigismember (&caught_signals, sig[i]))
1461 sigaction (sig[i], &act, NULL);
1462 }
1463
1464 split_file ();
1465
1466 if (close (STDIN_FILENO) != 0)
1467 {
1468 error (0, errno, _("read error"));
1469 cleanup_fatal ();
1470 }
1471
1472 return EXIT_SUCCESS;
1473 }
1474
/* Print usage information and terminate the program with STATUS.
   If STATUS is not EXIT_SUCCESS, only emit_try_help is called.
   Otherwise the full help text is written to standard output: the
   synopsis, a summary, the option list and the description of the
   PATTERN forms.  This function does not return; it always ends by
   calling exit (STATUS).  */
1475 void
usage(int status)1476 usage (int status)
1477 {
1478 if (status != EXIT_SUCCESS)
1479 emit_try_help ();
1480 else
1481 {
1482 printf (_("\
1483 Usage: %s [OPTION]... FILE PATTERN...\n\
1484 "),
1485 program_name);
1486 fputs (_("\
1487 Output pieces of FILE separated by PATTERN(s) to files 'xx00', 'xx01', ...,\n\
1488 and output byte counts of each piece to standard output.\n\
1489 "), stdout);
1490 fputs (_("\
1491 \n\
1492 Read standard input if FILE is -\n\
1493 "), stdout);
1494
1495 emit_mandatory_arg_note ();
1496
1497 fputs (_("\
1498 -b, --suffix-format=FORMAT use sprintf FORMAT instead of %02d\n\
1499 -f, --prefix=PREFIX use PREFIX instead of 'xx'\n\
1500 -k, --keep-files do not remove output files on errors\n\
1501 "), stdout);
1502 fputs (_("\
1503 --suppress-matched suppress the lines matching PATTERN\n\
1504 "), stdout);
1505 fputs (_("\
1506 -n, --digits=DIGITS use specified number of digits instead of 2\n\
1507 -s, --quiet, --silent do not print counts of output file sizes\n\
1508 -z, --elide-empty-files remove empty output files\n\
1509 "), stdout);
1510 fputs (HELP_OPTION_DESCRIPTION, stdout);
1511 fputs (VERSION_OPTION_DESCRIPTION, stdout);
1512 fputs (_("\
1513 \n\
1514 Each PATTERN may be:\n\
1515 INTEGER copy up to but not including specified line number\n\
1516 /REGEXP/[OFFSET] copy up to but not including a matching line\n\
1517 %REGEXP%[OFFSET] skip to, but not including a matching line\n\
1518 {INTEGER} repeat the previous pattern specified number of times\n\
1519 {*} repeat the previous pattern as many times as possible\n\
1520 \n\
1521 A line OFFSET is a required '+' or '-' followed by a positive integer.\n\
1522 "), stdout);
1523 emit_ancillary_info (PROGRAM_NAME);
1524 }
/* Reached on both paths: terminate with the caller's status.  */
1525 exit (status);
1526 }
1527