1 /***********************************************************************
2 *                                                                      *
3 *               This software is part of the ast package               *
4 *          Copyright (c) 1989-2013 AT&T Intellectual Property          *
5 *                      and is licensed under the                       *
6 *                 Eclipse Public License, Version 1.0                  *
7 *                    by AT&T Intellectual Property                     *
8 *                                                                      *
9 *                A copy of the License is available at                 *
10 *          http://www.eclipse.org/org/documents/epl-v10.html           *
11 *         (with md5 checksum b35adb5213ca9657e911e9befb180842)         *
12 *                                                                      *
13 *              Information and Software Systems Research               *
14 *                            AT&T Research                             *
15 *                           Florham Park NJ                            *
16 *                                                                      *
17 *               Glenn Fowler <glenn.s.fowler@gmail.com>                *
18 *                                                                      *
19 ***********************************************************************/
20 #pragma prototyped
21 /*
22  * split.c
23  * David Korn
24  * AT&T Research
25  */
26 
27 static const char split_usage[] =
28 "[-?\n@(#)$Id: split (AT&T Research) 2006-09-19 $\n]"
29 USAGE_LICENSE
30 "[+NAME?split - split files into pieces]"
31 "[+DESCRIPTION?\bsplit\b reads an input file and writes one or more "
32     "output files so that \bcat\b(1) on these files will produce the input "
33     "file. The default size for each piece is 1000 lines. The suffix "
34     "consists of \asuffix_len\a lower case characters from the POSIX "
35     "locale.]"
36 "[+?If \aname\a is specified then it will be used as a prefix for each "
37     "of the resulting files from the split operation; otherwise the prefix "
38     "\bx\b will be used.]"
39 "[+?If no \afile\a is given, or if the \afile\a is \b-\b, \bsplit\b "
40     "copies from standard input starting at the current location.]"
41 "[+?The option arguments for \b-b\b and \b-C\b can optionally be "
42     "followed by one of the following characters to specify a different unit "
43     "other than a single byte:]"
44     "{"
45         "[+b?512 bytes.]"
46         "[+k?1-killobytes.]"
47         "[+m?1-megabyte.]"
48         "[+g?1-gigabyte.]"
49         "[+t?1-terabyte.]"
50     "}"
51 "[+?For backwards compatibility, \b-\b\aline_count\a is equivalent to "
52     "\b-l\b \aline_count\a.]"
53 "[l:lines]#[line_count:=1000?\aline_count\a specified the number of "
54     "lines for each piece except the last. If the input does not end in a "
55     "newline, the partial line is included in the last piece.]"
56 "[a|n:suffix-length]#[suffix_len:=2?\asuffix_len\a defines the number of "
57     "letters that form the suffix portion of the file names for each of the "
58     "pieces that the file is split into.]"
59 "[b:bytes]#[n?Splits the file into byte size pieces defined by \an\a "
60     "rather than lines.]"
61 "[C:line-bytes]#[n?Splits the file into lines totaling at most \an\a "
62     "bytes.]"
63 "\n"
64 "\n[ file [ name ] ]\n"
65 "\n"
66 "[+EXIT STATUS]"
67     "{"
68         "[+0?Successful completion.]"
69         "[+>0?An error occurred.]"
70     "}"
71 "[+SEE ALSO? \bcsplit\b(1), \bcat\b(1)]"
72 ;
73 
74 static const char csplit_usage[] =
75 "[-?\n@(#)$Id: csplit (AT&T Research) 2003-08-21 $\n]"
76 USAGE_LICENSE
77 "[+NAME?csplit - split a file into sections determined by context lines]"
78 "[+DESCRIPTION?\bcsplit\b creates zero or more output files containing"
79 "	sections of the given input \afile\a, or the standard input if the"
80 "	name \b-\b is given. By default, \bcsplit\b prints the number of"
81 "	bytes written to each output file after it has been created.]"
82 "[+?The contents of the output files are determined by the \apattern\a"
83 "	arguments. An error occurs if a pattern argument refers to a"
84 "	nonexistent line of the input file, such as if no remaining line"
85 "	matches a given regular expression.  After all the given patterns have"
86 "	been matched, any remaining output is copied into one last output"
87 "	file. The types of pattern arguments are:]{"
88 "		[+line?Create an output file containing the current line up"
89 "			to (but not including) line \aline\a (a positive"
90 "			integer) of the input file. If followed by a repeat"
91 "			count, also create an output file containing the"
92 "			next \aline\a lines of the input file once for each"
93 "			repeat.]"
94 "		[+/regexp/[offset]]?Create an output file containing the"
95 "			current line up to (but not including) the next line"
96 "			of the input file that contains a match for"
97 "			\aregexp\a. The optional \aoffset\a is a \b+\b or"
98 "			\b-\b followed by a positive integer. If it is given,"
99 "			the input up to the matching line plus or minus"
100 "			\aoffset\a is put into the output file, and the line"
101 "			after that begins the next section of input.]"
102 "		[+%regexp%[offset]]?Like the previous type, except that it"
103 "			does not create an output file, so that section of"
104 "			the input file is effectively ignored.]"
105 "		[+{repeat-count}?Repeat the previous pattern \arepeat-count\a"
106 "			(a positive integer) additional times. An asterisk"
107 "			may be given in place of the (integer) repeat count,"
108 "			in which case the preceeding pattern is repeated as"
109 "			many times as necessary until the input is exausted.]"
110 "	}"
111 "[+?The output file names consist of a prefix followed by a suffix. By"
112 "	default, the suffix is merely an ascending linear sequence of two-digit"
113 "	decimal numbers starting with 00 and ranging up to 99, however this"
114 "	default may be overridden by either the \b--digits\b option or by the"
115 "	\b--suffix-format\b option (see below.) In any case, concatenating"
116 "	the output files in sorted order by file name produces the original"
117 "	input file, in order. The default output file name prefix is \bxx\b.]"
118 "[+?By default, if \bcsplit\b encounters an error or receives a hangup,"
119 "	interrupt, quit, or terminate signal, it removes any output files"
120 "	that it has created so far before it exits.]"
121 "[b:suffix-format?Use the \bprintf\b(3) \aformat\a to generate the file"
122 "	name suffix.]:[format:=\b%02d\b]"
123 "[f:prefix?Use \aprefix\a to generate the file name prefix.]:[prefix:=\bxx\b]"
124 "[k:keep-files?Do not remove output files on errors.]"
125 "[a|n:digits?Use \adigits\a in the generated file name suffixes.]#[digits:=2]"
126 "[s:silent|quiet?Do not print output file counts and sizes.]"
127 "[z:elide-empty-files?Remove empty output files.]"
128 "\n"
129 "\nfile arg ...\n"
130 "\n"
131 "[+EXIT STATUS?]{"
132 "	[+0?Successful completion.]"
133 "	[+>0?An error occurred.]"
134 "}"
135 "[+SEE ALSO? \bsplit\b(1), \bcat\b(1)]"
136 ;
137 
138 #include <cmd.h>
139 #include <regex.h>
140 
/*
 * mode bits carried in the `flags' word shared by main() and split()
 */
#define	S_FLAG		(1<<0)	/* do not print piece sizes		*/
#define	K_FLAG		(1<<1)	/* keep output files on error		*/
#define	C_FLAG		(1<<2)	/* running as csplit			*/
#define	B_FLAG		(1<<3)	/* sizes count bytes rather than lines	*/
#define	Z_FLAG		(1<<4)	/* remove empty output files		*/
#define	M_FLAG		(1<<5)	/* -C: size each piece with msize()	*/

/*
 * operation kinds stored in struct op.flags
 */
#define OP_LINES	0	/* copy op->size lines (or bytes)	*/
#define OP_SEARCH	1	/* copy up to the next regex match	*/
#define OP_SKIP		2	/* like OP_SEARCH, no output file	*/
#define OP_ABSOLUTE	3	/* copy up to line number op->size	*/

/*
 * number of bytes msize() steps back per scan
 */
#define BLK_SIZE	2048
154 
/*
 * output file name generator state
 * built by setfname(), advanced by getfname()
 */
struct fname
{
	char*		fname;	/* <prefix><suffix> name buffer		*/
	char*		format;	/* <prefix><printf format>, or 0	*/
	char*		suffix;	/* first suffix character in fname	*/
	char*		last;	/* last suffix character in fname	*/
	char		low;	/* lowest suffix character		*/
	char		high;	/* highest suffix character		*/
	int		count;	/* number of names handed out so far	*/
};
165 
166 struct op
167 {
168 	struct op*	next;
169 	Sfoff_t		size;
170 	size_t		repeat;
171 	int		flags;
172 	regex_t*	re;
173 };
174 
175 /*
176  * create an operation structure
177  */
178 static struct op*
getop(struct op ** prev,Sfoff_t size,size_t repeat,int flags,int re)179 getop(struct op** prev, Sfoff_t size, size_t repeat, int flags, int re)
180 {
181 	struct op*	op;
182 
183 	if (op = newof(0, struct op, 1, re ? sizeof(regex_t) : 0))
184 	{
185 		op->repeat = repeat;
186 		op->flags = flags;
187 		op->size = size;
188 		op->next = 0;
189 		if (re)
190 			op->re = (regex_t*)(op + 1);
191 		*prev = op;
192 	}
193 	return op;
194 }
195 
196 /*
197  * process /expr/offset arguments
198  * returns new operation structure which is added to linked list
199  */
200 
201 static struct op*
getexpr(struct op ** prev,const char * arg)202 getexpr(struct op** prev, const char* arg)
203 {
204 	char*		cp = (char*)arg;
205 	char*		ep;
206 	int		n;
207 	struct op*	op;
208 
209 	if (op = getop(prev, 0, 1, *cp == '/' ? OP_SEARCH : OP_SKIP, 1))
210 	{
211 		if (n = regcomp(op->re, cp, REG_DELIMITED|REG_NOSUB))
212 		{
213 			regfatal(op->re, 2, n);
214 			return 0;
215 		}
216 		cp += op->re->re_npat;
217 		if (*cp)
218 		{
219 			op->size = strtoll(cp, &ep, 10);
220 			if (*ep)
221 				error(ERROR_exit(1), "%s: invalid offset", cp);
222 		}
223 	}
224 	return op;
225 }
226 
227 /*
228  * set up file name generator whose form is <prefix>... where ... is
229  * suflen characters from low..high
230  * returns a pointer to a structure that can be used to create
231  * file names
232  */
233 
234 static struct fname*
setfname(const char * prefix,char * format,int suflen,int low,int high)235 setfname(const char* prefix, char* format, int suflen, int low, int high)
236 {
237 	struct fname*	fp;
238 	int		flen;
239 	int		slen;
240 	int		len;
241 	char*		cp;
242 
243 	flen = strlen(prefix);
244 	len = flen + suflen + 1;
245 	if (format)
246 	{
247 		slen = strlen(format);
248 		len += flen + slen + 1;
249 	}
250 	else
251 		slen = 0;
252 	if (fp = newof(0, struct fname, 1, len))
253 	{
254 		cp = (char*)(fp + 1);
255 		if (format)
256 		{
257 			strcpy(fp->format = cp, prefix);
258 			cp += flen;
259 			strcpy(cp, format);
260 			cp += slen + 1;
261 		}
262 		fp->low = low;
263 		fp->high = high;
264 		fp->count = 0;
265 		strcpy(fp->fname = cp, prefix);
266 		cp += flen;
267 		fp->suffix = cp;
268 		while (suflen-- > 0)
269 			*cp++ = low;
270 		*cp-- = 0;
271 		fp->last = cp;
272 		(*cp)--;
273 		flen = _POSIX_NAME_MAX;
274 		if (cp = strrchr(fp->fname, '/'))
275 		{
276 			cp++;
277 			len = strlen(cp);
278 			if (len > flen)
279 			{
280 				*(cp - 1) = 0;
281 				flen = (int)strtol(astconf("NAME_MAX", fp->fname, NiL), NiL, 0);
282 				*(cp - 1) = '/';
283 			}
284 		}
285 		else
286 		{
287 			cp = fp->fname;
288 			if (len > flen)
289 				flen = (int)strtol(astconf("NAME_MAX", ".", NiL), NiL, 0);
290 		}
291 		if (len > flen)
292 			error(ERROR_exit(1), "%s: filename too long", prefix);
293 	}
294 	return fp;
295 }
296 
297 /*
298  * return next sequential file name
299  */
300 
301 static char*
getfname(struct fname * fp)302 getfname(struct fname* fp)
303 {
304 	register char*	cp = fp->last;
305 
306 	if (fp->format)
307 		return sfprints(fp->format, fp->count++);
308 	while (++(*cp) > fp->high)
309 	{
310 		*cp-- = fp->low;
311 		if (cp < fp->suffix)
312 		{
313 			error(0, "file limit reached");
314 			return 0;
315 		}
316 	}
317 	fp->count++;
318 	return fp->fname;
319 }
320 
321 /*
322  * remove all generated files
323  */
324 
325 static void
removeall(struct fname * fp)326 removeall(struct fname* fp)
327 {
328 	register char*	cp = fp->suffix;
329 
330 	while (*cp)
331 		*cp++ = fp->low;
332 	*(cp - 1) -= 1;
333 	while (fp->count-- > 0)
334 	{
335 		remove(getfname(fp));
336 		fp->count--;
337 	}
338 	fp->count = 0;
339 }
340 
341 static int
msize(Sfio_t * in,long len)342 msize(Sfio_t* in, long len)
343 {
344 	Sfoff_t		off = sftell(in);
345 	register char*	cp;
346 	register char*	dp;
347 	register long	m;
348 	register long	n = len;
349 	register long	nlen = 0;
350 
351 	if (sfsize(in) - off <= len)
352 		return len;
353 	while (nlen == 0 && n > 0)
354 	{
355 		n -= BLK_SIZE;
356 		if (n < 0)
357 			n = 0;
358 		sfseek(in, off + n, SEEK_SET);
359 		if (!(dp = cp = sfreserve(in, BLK_SIZE, 0)))
360 			return len;
361 		m = BLK_SIZE;
362 		while (m-- > 0)
363 		{
364 			if (*cp++ == '\n')
365 				nlen = n + (cp - dp);
366 		}
367 	}
368 	if (n > 0)
369 		sfseek(in, off, SEEK_SET);
370 	return nlen ? nlen : len;
371 }
372 
373 static int
split(Sfio_t * in,struct fname * fp,struct op * op,int flags)374 split(Sfio_t* in, struct fname* fp, struct op* op, int flags)
375 {
376 	register char*		cp;
377 	register char*		s;
378 	Sfoff_t			len;
379 	Sfoff_t			z;
380 	Sfoff_t			size;
381 	size_t			repeat;
382 	int			c;
383 
384 	register Sfio_t*	out = 0;
385 	register char*		peek = 0;
386 	register long		n = 0;
387 	int			delim = (flags & B_FLAG) ? -1 : '\n';
388 	size_t			lineno = 1;
389 
390 	while (op)
391 	{
392 		if (op->flags == OP_LINES)
393 			len = op->size;
394 		repeat = op->repeat;
395 		do
396 		{
397 			if (op->flags != OP_SKIP)
398 			{
399 				if (!(cp = getfname(fp)))
400 					goto err;
401 				if (!(out = sfopen(NiL, cp, "w")))
402 				{
403 					fp->count--;
404 					error(ERROR_SYSTEM|2, "%s: cannot create", cp);
405 					goto err;
406 				}
407 			}
408 			if (op->flags == OP_ABSOLUTE || op->flags == OP_LINES)
409 			{
410 				if (op->flags == OP_ABSOLUTE)
411 					len = op->size - lineno;
412 				if (peek)
413 				{
414 					if ((n = sfputr(out, peek, delim)) <= 0)
415 						goto done;
416 					peek = 0;
417 					if (len > 0)
418 						len--;
419 					lineno++;
420 				}
421 				if (len)
422 				{
423 					z = (flags & M_FLAG) ? msize(in, len) : len;
424 					if ((n = sfmove(in, out, z, delim)) < z || n < 0)
425 						goto done;
426 					lineno += n;
427 				}
428 			}
429 			else
430 			{
431 				if (peek)
432 				{
433 					if (out && (n = sfputr(out, peek, delim)) <= 0)
434 						goto done;
435 					lineno++;
436 					peek = 0;
437 				}
438 				while (s = sfgetr(in, delim, 1))
439 				{
440 					if (!(c = regexec(op->re, s, 0, NiL, 0)))
441 						break;
442 					lineno++;
443 					if (c != REG_NOMATCH)
444 					{
445 						regfatal(op->re, 2, c);
446 						goto err;
447 					}
448 					if (out && (n = sfputr(out, s, delim)) <= 0)
449 						goto done;
450 				}
451 				if (!(peek = s))
452 				{
453 					while (op->next)
454 						op = op->next;
455 					repeat = 1;
456 				}
457 			}
458 			if (out)
459 			{
460 				size = sfseek(out, (Sfoff_t)0, SEEK_END);
461 				if (!(flags & S_FLAG))
462 					sfprintf(sfstdout, "%I*d\n", sizeof(size), size);
463 				sfclose(out);
464 				out = 0;
465 				if ((flags & Z_FLAG) && size <= 0)
466 					remove(cp);
467 			}
468 		} while (!repeat || --repeat);
469 		op = op->next;
470 	}
471  done:
472 	if (out)
473 	{
474 		sfclose(out);
475 		if (n <= 0)
476 			remove(cp);
477 	}
478 	if (n >= 0)
479 		return 0;
480  err:
481 	if (!(flags & K_FLAG))
482 		removeall(fp);
483 	return 1;
484 }
485 
486 int
main(int argc,char ** argv)487 main(int argc, char** argv)
488 {
489 	struct fname*	fp;
490 	struct op*	top;
491 	char*		cp;
492 	char*		prefix;
493 	const char*	usage;
494 	Sfio_t*		in;
495 	int		flags;
496 	ssize_t		n;
497 
498 	char*		format = 0;
499 	Sfoff_t		size = 10000;
500 	int		suflen = 2;
501 
502 	if (cp = strrchr(*argv, '/'))
503 		cp++;
504 	else
505 		cp = *argv;
506 	error_info.id = cp;
507 	if (streq(cp, "split"))
508 	{
509 		usage = split_usage;
510 		flags = S_FLAG|K_FLAG;
511 		prefix = "x";
512 	}
513 	else
514 	{
515 		usage = csplit_usage;
516 		flags = C_FLAG;
517 		prefix = "xx";
518 	}
519 	for (;;)
520 	{
521 		switch (optget(argv, usage))
522 		{
523 		case 0:
524 			break;
525 		case 'l':
526 			flags &= ~(B_FLAG|M_FLAG);
527 			if ((size = opt_info.number) <= 0)
528 				error(1, "%s: invalid size", opt_info.arg);
529 			continue;
530 		case 'k':
531 			flags |= K_FLAG;
532 			continue;
533 		case 's':
534 			flags |= S_FLAG;
535 			continue;
536 		case 'z':
537 			flags |= Z_FLAG;
538 			continue;
539 		case 'f':
540 			prefix = opt_info.arg;
541 			continue;
542 		case 'a':
543 		case 'n':
544 			suflen = opt_info.num;
545 			continue;
546 		case 'C':
547 			flags |= M_FLAG;
548 		case 'b':
549 			if (flags & S_FLAG)
550 			{
551 				if ((size = opt_info.number) <= 0)
552 					error(1, "%s: invalid size", opt_info.arg);
553 				flags |= B_FLAG;
554 			}
555 			else
556 				format = opt_info.arg;
557 			continue;
558 		case ':':
559 			error(2, "%s", opt_info.arg);
560 			break;
561 		case '?':
562 			error(ERROR_usage(2), "%s", opt_info.arg);
563 			break;
564 		}
565 		break;
566 	}
567 	argv += opt_info.index;
568 	argc -= opt_info.index;
569 	if (error_info.errors || !(flags & C_FLAG) && argc > 2 || (flags & C_FLAG) && argc < 2)
570 		error(ERROR_usage(2), "%s", optusage(NiL));
571 	cp = *argv++;
572 	if (flags & C_FLAG)
573 	{
574 		struct op*	op = 0;
575 		char*		sp;
576 
577 		while (sp = *argv++)
578 		{
579 			switch (*sp)
580 			{
581 			case '/':
582 			case '?':
583 			case '%':
584 				op = getexpr(op ? &op->next : &top, sp);
585 				break;
586 			case '{':
587 				if (!op)
588 					error(ERROR_exit(1), "%s: pattern expected for repeat count", *(argv - 1));
589 				if (*++sp == '*' && *(sp + 1) == '}' && !*(sp + 2))
590 					op->repeat = 0;
591 				else
592 				{
593 					if ((n = strtol(sp, &sp, 10)) <= 0 || *sp != '}' || *(sp + 1))
594 						error(ERROR_exit(1), "%s: invalid repeat count", *(argv - 1));
595 					op->repeat = n + 1;
596 				}
597 				if (op->flags == OP_ABSOLUTE)
598 					op->flags = OP_LINES;
599 				break;
600 			default:
601 				if ((size = strtoll(sp, &sp, 10)) <= 0 || *sp)
602 					error(ERROR_exit(1), "%s: invalid line number", *(argv - 1));
603 				op = getop(op ? &op->next : &top, size, 1, OP_ABSOLUTE, 0);
604 				break;
605 			}
606 		}
607 		op = getop(op ? &op->next : &top, SF_UNBOUND, 1, OP_LINES, 0);
608 		fp = setfname(prefix, format, suflen, '0', '9');
609 	}
610 	else
611 	{
612 		if (cp && *argv)
613 			prefix = *argv;
614 		getop(&top, size, SF_UNBOUND, OP_LINES, 0);
615 		fp = setfname(prefix, format, suflen, 'a', 'z');
616 	}
617 	if (!cp || streq(cp, "-"))
618 		in = sfstdin;
619 	else if (!(in = sfopen(NiL, cp, "r")))
620 		error(ERROR_system(1), "%s: cannot open", cp);
621 	n = split(in, fp, top, flags);
622 	if (in != sfstdin)
623 		sfclose(in);
624 	return n;
625 }
626