/* xref: /freebsd/usr.bin/csplit/csplit.c (revision 780fb4a2) */
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2002 Tim J. Robbins.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
/*
 * csplit -- split files based on context
 *
 * This utility splits its input into numbered output files by line number
 * or by a regular expression. Regular expression matches have an optional
 * offset with them, allowing the split to occur a specified number of
 * lines before or after the match.
 *
 * To handle negative offsets, we stop reading when the match occurs and
 * store the offset that the file should have been split at, then use
 * this output file as input until all the "overflowed" lines have been read.
 * The file is then truncated to the correct length and closed.
 *
 * We assume that the output files are seekable (i.e. they cannot be
 * symlinks to named pipes or character devices), but make no such
 * assumption about the input.
 */
46 
47 #include <sys/cdefs.h>
48 __FBSDID("$FreeBSD$");
49 
50 #include <sys/types.h>
51 
52 #include <ctype.h>
53 #include <err.h>
54 #include <errno.h>
55 #include <limits.h>
56 #include <locale.h>
57 #include <regex.h>
58 #include <signal.h>
59 #include <stdint.h>
60 #include <stdio.h>
61 #include <stdlib.h>
62 #include <string.h>
63 #include <unistd.h>
64 
/* Forward declarations of the file-local functions. */
static void	 cleanup(void);
static void	 do_lineno(const char *expr);
static void	 do_rexp(const char *expr);
static char	*get_line(void);
static void	 handlesig(int sig);
static FILE	*newfile(void);
static void	 toomuch(FILE *ofp, long n);
static void	 usage(void);
73 
/*
 * Settings taken from the command line
 */
static int	 kflag;		/* -k: keep output files if an error occurs */
static int	 sflag;		/* -s: do not print the size of each file */
static const char *prefix;	/* -f: output file name prefix */
static long	 sufflen;	/* -n: decimal digits in the file name suffix */
81 
/*
 * Run-time state shared between the functions below (XXX too many)
 */

/* Input side */
static const char *infn;	/* Input file name, for diagnostics */
static FILE	*infile;	/* Stream the input is read from */
static long	 lineno;	/* Lines handed out by get_line() so far */
static long	 reps;		/* Repetitions left for the current pattern */

/* Output side */
static char	 currfile[PATH_MAX]; /* Name of the newest output file */
static long	 nfiles;	/* Output files created so far */
static long	 maxfiles;	/* 10^sufflen, limit on the file count */
static int	 doclean;	/* Nonzero: cleanup() removes the output */

/* Lines given back by toomuch() */
static FILE	*overfile;	/* Output file being re-read, or NULL */
static off_t	 truncofs;	/* Length overfile is truncated to later */
95 
96 int
97 main(int argc, char *argv[])
98 {
99 	struct sigaction sa;
100 	long i;
101 	int ch;
102 	const char *expr;
103 	char *ep, *p;
104 	FILE *ofp;
105 
106 	setlocale(LC_ALL, "");
107 
108 	kflag = sflag = 0;
109 	prefix = "xx";
110 	sufflen = 2;
111 	while ((ch = getopt(argc, argv, "ksf:n:")) > 0) {
112 		switch (ch) {
113 		case 'f':
114 			prefix = optarg;
115 			break;
116 		case 'k':
117 			kflag = 1;
118 			break;
119 		case 'n':
120 			errno = 0;
121 			sufflen = strtol(optarg, &ep, 10);
122 			if (sufflen <= 0 || *ep != '\0' || errno != 0)
123 				errx(1, "%s: bad suffix length", optarg);
124 			break;
125 		case 's':
126 			sflag = 1;
127 			break;
128 		default:
129 			usage();
130 			/*NOTREACHED*/
131 		}
132 	}
133 
134 	if (sufflen + strlen(prefix) >= PATH_MAX)
135 		errx(1, "name too long");
136 
137 	argc -= optind;
138 	argv += optind;
139 
140 	if ((infn = *argv++) == NULL)
141 		usage();
142 	if (strcmp(infn, "-") == 0) {
143 		infile = stdin;
144 		infn = "stdin";
145 	} else if ((infile = fopen(infn, "r")) == NULL)
146 		err(1, "%s", infn);
147 
148 	if (!kflag) {
149 		doclean = 1;
150 		atexit(cleanup);
151 		sa.sa_flags = 0;
152 		sa.sa_handler = handlesig;
153 		sigemptyset(&sa.sa_mask);
154 		sigaddset(&sa.sa_mask, SIGHUP);
155 		sigaddset(&sa.sa_mask, SIGINT);
156 		sigaddset(&sa.sa_mask, SIGTERM);
157 		sigaction(SIGHUP, &sa, NULL);
158 		sigaction(SIGINT, &sa, NULL);
159 		sigaction(SIGTERM, &sa, NULL);
160 	}
161 
162 	lineno = 0;
163 	nfiles = 0;
164 	truncofs = 0;
165 	overfile = NULL;
166 
167 	/* Ensure 10^sufflen < LONG_MAX. */
168 	for (maxfiles = 1, i = 0; i < sufflen; i++) {
169 		if (maxfiles > LONG_MAX / 10)
170 			errx(1, "%ld: suffix too long (limit %ld)",
171 			    sufflen, i);
172 		maxfiles *= 10;
173 	}
174 
175 	/* Create files based on supplied patterns. */
176 	while (nfiles < maxfiles - 1 && (expr = *argv++) != NULL) {
177 		/* Look ahead & see if this pattern has any repetitions. */
178 		if (*argv != NULL && **argv == '{') {
179 			errno = 0;
180 			reps = strtol(*argv + 1, &ep, 10);
181 			if (reps < 0 || *ep != '}' || errno != 0)
182 				errx(1, "%s: bad repetition count", *argv + 1);
183 			argv++;
184 		} else
185 			reps = 0;
186 
187 		if (*expr == '/' || *expr == '%') {
188 			do
189 				do_rexp(expr);
190 			while (reps-- != 0 && nfiles < maxfiles - 1);
191 		} else if (isdigit((unsigned char)*expr))
192 			do_lineno(expr);
193 		else
194 			errx(1, "%s: unrecognised pattern", expr);
195 	}
196 
197 	/* Copy the rest into a new file. */
198 	if (!feof(infile)) {
199 		ofp = newfile();
200 		while ((p = get_line()) != NULL && fputs(p, ofp) != EOF)
201 			;
202 		if (!sflag)
203 			printf("%jd\n", (intmax_t)ftello(ofp));
204 		if (fclose(ofp) != 0)
205 			err(1, "%s", currfile);
206 	}
207 
208 	toomuch(NULL, 0);
209 	doclean = 0;
210 
211 	return (0);
212 }
213 
/* Print the command synopsis on standard error and exit with status 1. */
static void
usage(void)
{

	fputs("usage: csplit [-ks] [-f prefix] [-n number] file args ...\n",
	    stderr);
	exit(1);
}
222 
223 static void
224 handlesig(int sig __unused)
225 {
226 	const char msg[] = "csplit: caught signal, cleaning up\n";
227 
228 	write(STDERR_FILENO, msg, sizeof(msg) - 1);
229 	cleanup();
230 	_exit(2);
231 }
232 
233 /* Create a new output file. */
234 static FILE *
235 newfile(void)
236 {
237 	FILE *fp;
238 
239 	if ((size_t)snprintf(currfile, sizeof(currfile), "%s%0*ld", prefix,
240 	    (int)sufflen, nfiles) >= sizeof(currfile))
241 		errc(1, ENAMETOOLONG, NULL);
242 	if ((fp = fopen(currfile, "w+")) == NULL)
243 		err(1, "%s", currfile);
244 	nfiles++;
245 
246 	return (fp);
247 }
248 
249 /* Remove partial output, called before exiting. */
250 static void
251 cleanup(void)
252 {
253 	char fnbuf[PATH_MAX];
254 	long i;
255 
256 	if (!doclean)
257 		return;
258 
259 	/*
260 	 * NOTE: One cannot portably assume to be able to call snprintf()
261 	 * from inside a signal handler. It does, however, appear to be safe
262 	 * to do on FreeBSD. The solution to this problem is worse than the
263 	 * problem itself.
264 	 */
265 
266 	for (i = 0; i < nfiles; i++) {
267 		snprintf(fnbuf, sizeof(fnbuf), "%s%0*ld", prefix,
268 		    (int)sufflen, i);
269 		unlink(fnbuf);
270 	}
271 }
272 
273 /* Read a line from the input into a static buffer. */
274 static char *
275 get_line(void)
276 {
277 	static char lbuf[LINE_MAX];
278 	FILE *src;
279 
280 	src = overfile != NULL ? overfile : infile;
281 
282 again: if (fgets(lbuf, sizeof(lbuf), src) == NULL) {
283 		if (src == overfile) {
284 			src = infile;
285 			goto again;
286 		}
287 		return (NULL);
288 	}
289 	if (ferror(src))
290 		err(1, "%s", infn);
291 	lineno++;
292 
293 	return (lbuf);
294 }
295 
296 /* Conceptually rewind the input (as obtained by get_line()) back `n' lines. */
297 static void
298 toomuch(FILE *ofp, long n)
299 {
300 	char buf[BUFSIZ];
301 	size_t i, nread;
302 
303 	if (overfile != NULL) {
304 		/*
305 		 * Truncate the previous file we overflowed into back to
306 		 * the correct length, close it.
307 		 */
308 		if (fflush(overfile) != 0)
309 			err(1, "overflow");
310 		if (ftruncate(fileno(overfile), truncofs) != 0)
311 			err(1, "overflow");
312 		if (fclose(overfile) != 0)
313 			err(1, "overflow");
314 		overfile = NULL;
315 	}
316 
317 	if (n == 0)
318 		/* Just tidying up */
319 		return;
320 
321 	lineno -= n;
322 
323 	/*
324 	 * Wind the overflow file backwards to `n' lines before the
325 	 * current one.
326 	 */
327 	do {
328 		if (ftello(ofp) < (off_t)sizeof(buf))
329 			rewind(ofp);
330 		else
331 			fseeko(ofp, -(off_t)sizeof(buf), SEEK_CUR);
332 		if (ferror(ofp))
333 			errx(1, "%s: can't seek", currfile);
334 		if ((nread = fread(buf, 1, sizeof(buf), ofp)) == 0)
335 			errx(1, "can't read overflowed output");
336 		if (fseeko(ofp, -(off_t)nread, SEEK_CUR) != 0)
337 			err(1, "%s", currfile);
338 		for (i = 1; i <= nread; i++)
339 			if (buf[nread - i] == '\n' && n-- == 0)
340 				break;
341 		if (ftello(ofp) == 0)
342 			break;
343 	} while (n > 0);
344 	if (fseeko(ofp, nread - i + 1, SEEK_CUR) != 0)
345 		err(1, "%s", currfile);
346 
347 	/*
348 	 * get_line() will read from here. Next call will truncate to
349 	 * truncofs in this file.
350 	 */
351 	overfile = ofp;
352 	truncofs = ftello(overfile);
353 }
354 
355 /* Handle splits for /regexp/ and %regexp% patterns. */
356 static void
357 do_rexp(const char *expr)
358 {
359 	regex_t cre;
360 	intmax_t nwritten;
361 	long ofs;
362 	int first;
363 	char *ecopy, *ep, *p, *pofs, *re;
364 	FILE *ofp;
365 
366 	if ((ecopy = strdup(expr)) == NULL)
367 		err(1, "strdup");
368 
369 	re = ecopy + 1;
370 	if ((pofs = strrchr(ecopy, *expr)) == NULL || pofs[-1] == '\\')
371 		errx(1, "%s: missing trailing %c", expr, *expr);
372 	*pofs++ = '\0';
373 
374 	if (*pofs != '\0') {
375 		errno = 0;
376 		ofs = strtol(pofs, &ep, 10);
377 		if (*ep != '\0' || errno != 0)
378 			errx(1, "%s: bad offset", pofs);
379 	} else
380 		ofs = 0;
381 
382 	if (regcomp(&cre, re, REG_BASIC|REG_NOSUB) != 0)
383 		errx(1, "%s: bad regular expression", re);
384 
385 	if (*expr == '/')
386 		/* /regexp/: Save results to a file. */
387 		ofp = newfile();
388 	else {
389 		/* %regexp%: Make a temporary file for overflow. */
390 		if ((ofp = tmpfile()) == NULL)
391 			err(1, "tmpfile");
392 	}
393 
394 	/* Read and output lines until we get a match. */
395 	first = 1;
396 	while ((p = get_line()) != NULL) {
397 		if (fputs(p, ofp) == EOF)
398 			break;
399 		if (!first && regexec(&cre, p, 0, NULL, 0) == 0)
400 			break;
401 		first = 0;
402 	}
403 
404 	if (p == NULL) {
405 		toomuch(NULL, 0);
406 		errx(1, "%s: no match", re);
407 	}
408 
409 	if (ofs <= 0) {
410 		/*
411 		 * Negative (or zero) offset: throw back any lines we should
412 		 * not have read yet.
413 		  */
414 		if (p != NULL) {
415 			toomuch(ofp, -ofs + 1);
416 			nwritten = (intmax_t)truncofs;
417 		} else
418 			nwritten = (intmax_t)ftello(ofp);
419 	} else {
420 		/*
421 		 * Positive offset: copy the requested number of lines
422 		 * after the match.
423 		 */
424 		while (--ofs > 0 && (p = get_line()) != NULL)
425 			fputs(p, ofp);
426 		toomuch(NULL, 0);
427 		nwritten = (intmax_t)ftello(ofp);
428 		if (fclose(ofp) != 0)
429 			err(1, "%s", currfile);
430 	}
431 
432 	if (!sflag && *expr == '/')
433 		printf("%jd\n", nwritten);
434 
435 	regfree(&cre);
436 	free(ecopy);
437 }
438 
439 /* Handle splits based on line number. */
440 static void
441 do_lineno(const char *expr)
442 {
443 	long lastline, tgtline;
444 	char *ep, *p;
445 	FILE *ofp;
446 
447 	errno = 0;
448 	tgtline = strtol(expr, &ep, 10);
449 	if (tgtline <= 0 || errno != 0 || *ep != '\0')
450 		errx(1, "%s: bad line number", expr);
451 	lastline = tgtline;
452 	if (lastline <= lineno)
453 		errx(1, "%s: can't go backwards", expr);
454 
455 	while (nfiles < maxfiles - 1) {
456 		ofp = newfile();
457 		while (lineno + 1 != lastline) {
458 			if ((p = get_line()) == NULL)
459 				errx(1, "%ld: out of range", lastline);
460 			if (fputs(p, ofp) == EOF)
461 				break;
462 		}
463 		if (!sflag)
464 			printf("%jd\n", (intmax_t)ftello(ofp));
465 		if (fclose(ofp) != 0)
466 			err(1, "%s", currfile);
467 		if (reps-- == 0)
468 			break;
469 		lastline += tgtline;
470 	}
471 }
472