1 /***********************************************************************
2 * *
3 * This software is part of the ast package *
4 * Copyright (c) 1989-2013 AT&T Intellectual Property *
5 * and is licensed under the *
6 * Eclipse Public License, Version 1.0 *
7 * by AT&T Intellectual Property *
8 * *
9 * A copy of the License is available at *
10 * http://www.eclipse.org/org/documents/epl-v10.html *
11 * (with md5 checksum b35adb5213ca9657e911e9befb180842) *
12 * *
13 * Information and Software Systems Research *
14 * AT&T Research *
15 * Florham Park NJ *
16 * *
17 * Glenn Fowler <glenn.s.fowler@gmail.com> *
18 * *
19 ***********************************************************************/
20 #pragma prototyped
21 /*
22 * split.c
23 * David Korn
24 * AT&T Research
25 */
26
27 static const char split_usage[] =
28 "[-?\n@(#)$Id: split (AT&T Research) 2006-09-19 $\n]"
29 USAGE_LICENSE
30 "[+NAME?split - split files into pieces]"
31 "[+DESCRIPTION?\bsplit\b reads an input file and writes one or more "
32 "output files so that \bcat\b(1) on these files will produce the input "
33 "file. The default size for each piece is 1000 lines. The suffix "
34 "consists of \asuffix_len\a lower case characters from the POSIX "
35 "locale.]"
36 "[+?If \aname\a is specified then it will be used as a prefix for each "
37 "of the resulting files from the split operation; otherwise the prefix "
38 "\bx\b will be used.]"
39 "[+?If no \afile\a is given, or if the \afile\a is \b-\b, \bsplit\b "
40 "copies from standard input starting at the current location.]"
41 "[+?The option arguments for \b-b\b and \b-C\b can optionally be "
42 "followed by one of the following characters to specify a different unit "
43 "other than a single byte:]"
44 "{"
45 "[+b?512 bytes.]"
46 "[+k?1-killobytes.]"
47 "[+m?1-megabyte.]"
48 "[+g?1-gigabyte.]"
49 "[+t?1-terabyte.]"
50 "}"
51 "[+?For backwards compatibility, \b-\b\aline_count\a is equivalent to "
52 "\b-l\b \aline_count\a.]"
53 "[l:lines]#[line_count:=1000?\aline_count\a specified the number of "
54 "lines for each piece except the last. If the input does not end in a "
55 "newline, the partial line is included in the last piece.]"
56 "[a|n:suffix-length]#[suffix_len:=2?\asuffix_len\a defines the number of "
57 "letters that form the suffix portion of the file names for each of the "
58 "pieces that the file is split into.]"
59 "[b:bytes]#[n?Splits the file into byte size pieces defined by \an\a "
60 "rather than lines.]"
61 "[C:line-bytes]#[n?Splits the file into lines totaling at most \an\a "
62 "bytes.]"
63 "\n"
64 "\n[ file [ name ] ]\n"
65 "\n"
66 "[+EXIT STATUS]"
67 "{"
68 "[+0?Successful completion.]"
69 "[+>0?An error occurred.]"
70 "}"
71 "[+SEE ALSO? \bcsplit\b(1), \bcat\b(1)]"
72 ;
73
74 static const char csplit_usage[] =
75 "[-?\n@(#)$Id: csplit (AT&T Research) 2003-08-21 $\n]"
76 USAGE_LICENSE
77 "[+NAME?csplit - split a file into sections determined by context lines]"
78 "[+DESCRIPTION?\bcsplit\b creates zero or more output files containing"
79 " sections of the given input \afile\a, or the standard input if the"
80 " name \b-\b is given. By default, \bcsplit\b prints the number of"
81 " bytes written to each output file after it has been created.]"
82 "[+?The contents of the output files are determined by the \apattern\a"
83 " arguments. An error occurs if a pattern argument refers to a"
84 " nonexistent line of the input file, such as if no remaining line"
85 " matches a given regular expression. After all the given patterns have"
86 " been matched, any remaining output is copied into one last output"
87 " file. The types of pattern arguments are:]{"
88 " [+line?Create an output file containing the current line up"
89 " to (but not including) line \aline\a (a positive"
90 " integer) of the input file. If followed by a repeat"
91 " count, also create an output file containing the"
92 " next \aline\a lines of the input file once for each"
93 " repeat.]"
94 " [+/regexp/[offset]]?Create an output file containing the"
95 " current line up to (but not including) the next line"
96 " of the input file that contains a match for"
97 " \aregexp\a. The optional \aoffset\a is a \b+\b or"
98 " \b-\b followed by a positive integer. If it is given,"
99 " the input up to the matching line plus or minus"
100 " \aoffset\a is put into the output file, and the line"
101 " after that begins the next section of input.]"
102 " [+%regexp%[offset]]?Like the previous type, except that it"
103 " does not create an output file, so that section of"
104 " the input file is effectively ignored.]"
105 " [+{repeat-count}?Repeat the previous pattern \arepeat-count\a"
106 " (a positive integer) additional times. An asterisk"
107 " may be given in place of the (integer) repeat count,"
108 " in which case the preceeding pattern is repeated as"
109 " many times as necessary until the input is exausted.]"
110 " }"
111 "[+?The output file names consist of a prefix followed by a suffix. By"
112 " default, the suffix is merely an ascending linear sequence of two-digit"
113 " decimal numbers starting with 00 and ranging up to 99, however this"
114 " default may be overridden by either the \b--digits\b option or by the"
115 " \b--suffix-format\b option (see below.) In any case, concatenating"
116 " the output files in sorted order by file name produces the original"
117 " input file, in order. The default output file name prefix is \bxx\b.]"
118 "[+?By default, if \bcsplit\b encounters an error or receives a hangup,"
119 " interrupt, quit, or terminate signal, it removes any output files"
120 " that it has created so far before it exits.]"
121 "[b:suffix-format?Use the \bprintf\b(3) \aformat\a to generate the file"
122 " name suffix.]:[format:=\b%02d\b]"
123 "[f:prefix?Use \aprefix\a to generate the file name prefix.]:[prefix:=\bxx\b]"
124 "[k:keep-files?Do not remove output files on errors.]"
125 "[a|n:digits?Use \adigits\a in the generated file name suffixes.]#[digits:=2]"
126 "[s:silent|quiet?Do not print output file counts and sizes.]"
127 "[z:elide-empty-files?Remove empty output files.]"
128 "\n"
129 "\nfile arg ...\n"
130 "\n"
131 "[+EXIT STATUS?]{"
132 " [+0?Successful completion.]"
133 " [+>0?An error occurred.]"
134 "}"
135 "[+SEE ALSO? \bsplit\b(1), \bcat\b(1)]"
136 ;
137
138 #include <cmd.h>
139 #include <regex.h>
140
/* bits kept in the `flags' word that main() hands to split() */
#define S_FLAG		0x01	/* do not print the size of each piece */
#define K_FLAG		0x02	/* keep the generated files on error */
#define C_FLAG		0x04	/* running as csplit */
#define B_FLAG		0x08	/* sizes count bytes, not lines */
#define Z_FLAG		0x10	/* remove empty output files */
#define M_FLAG		0x20	/* -C: shrink each piece to a line boundary */

/* operation kinds stored in struct op.flags */
#define OP_LINES	0	/* copy `size' lines (or bytes) */
#define OP_SEARCH	1	/* copy up to the next line matching re */
#define OP_SKIP		2	/* like OP_SEARCH but no output file */
#define OP_ABSOLUTE	3	/* copy up to input line number `size' */

/* number of bytes msize() examines per step */
#define BLK_SIZE	2048
154
/*
 * output file name generator state, built by setfname() and
 * advanced by getfname()
 */
struct fname
{
	char*		fname;	/* name buffer: prefix followed by the suffix */
	char*		format;	/* prefix + printf format, or 0 for letter/digit suffixes */
	char*		suffix;	/* first suffix character inside fname */
	char*		last;	/* last suffix character inside fname */
	char		low;	/* lowest suffix character */
	char		high;	/* highest suffix character */
	int		count;	/* number of names handed out so far */
};
165
/*
 * one step of the split program; steps form a singly linked list
 * that split() walks in order
 */
struct op
{
	struct op*	next;	/* next operation, 0 at the end of the list */
	Sfoff_t		size;	/* line/byte count, line number, or regex offset */
	size_t		repeat;	/* times to run the operation; 0 repeats until input ends */
	int		flags;	/* one of the OP_* operation kinds */
	regex_t*	re;	/* compiled pattern stored right after the struct, if any */
};
174
175 /*
176 * create an operation structure
177 */
178 static struct op*
getop(struct op ** prev,Sfoff_t size,size_t repeat,int flags,int re)179 getop(struct op** prev, Sfoff_t size, size_t repeat, int flags, int re)
180 {
181 struct op* op;
182
183 if (op = newof(0, struct op, 1, re ? sizeof(regex_t) : 0))
184 {
185 op->repeat = repeat;
186 op->flags = flags;
187 op->size = size;
188 op->next = 0;
189 if (re)
190 op->re = (regex_t*)(op + 1);
191 *prev = op;
192 }
193 return op;
194 }
195
196 /*
197 * process /expr/offset arguments
198 * returns new operation structure which is added to linked list
199 */
200
201 static struct op*
getexpr(struct op ** prev,const char * arg)202 getexpr(struct op** prev, const char* arg)
203 {
204 char* cp = (char*)arg;
205 char* ep;
206 int n;
207 struct op* op;
208
209 if (op = getop(prev, 0, 1, *cp == '/' ? OP_SEARCH : OP_SKIP, 1))
210 {
211 if (n = regcomp(op->re, cp, REG_DELIMITED|REG_NOSUB))
212 {
213 regfatal(op->re, 2, n);
214 return 0;
215 }
216 cp += op->re->re_npat;
217 if (*cp)
218 {
219 op->size = strtoll(cp, &ep, 10);
220 if (*ep)
221 error(ERROR_exit(1), "%s: invalid offset", cp);
222 }
223 }
224 return op;
225 }
226
227 /*
228 * set up file name generator whose form is <prefix>... where ... is
229 * suflen characters from low..high
230 * returns a pointer to a structure that can be used to create
231 * file names
232 */
233
234 static struct fname*
setfname(const char * prefix,char * format,int suflen,int low,int high)235 setfname(const char* prefix, char* format, int suflen, int low, int high)
236 {
237 struct fname* fp;
238 int flen;
239 int slen;
240 int len;
241 char* cp;
242
243 flen = strlen(prefix);
244 len = flen + suflen + 1;
245 if (format)
246 {
247 slen = strlen(format);
248 len += flen + slen + 1;
249 }
250 else
251 slen = 0;
252 if (fp = newof(0, struct fname, 1, len))
253 {
254 cp = (char*)(fp + 1);
255 if (format)
256 {
257 strcpy(fp->format = cp, prefix);
258 cp += flen;
259 strcpy(cp, format);
260 cp += slen + 1;
261 }
262 fp->low = low;
263 fp->high = high;
264 fp->count = 0;
265 strcpy(fp->fname = cp, prefix);
266 cp += flen;
267 fp->suffix = cp;
268 while (suflen-- > 0)
269 *cp++ = low;
270 *cp-- = 0;
271 fp->last = cp;
272 (*cp)--;
273 flen = _POSIX_NAME_MAX;
274 if (cp = strrchr(fp->fname, '/'))
275 {
276 cp++;
277 len = strlen(cp);
278 if (len > flen)
279 {
280 *(cp - 1) = 0;
281 flen = (int)strtol(astconf("NAME_MAX", fp->fname, NiL), NiL, 0);
282 *(cp - 1) = '/';
283 }
284 }
285 else
286 {
287 cp = fp->fname;
288 if (len > flen)
289 flen = (int)strtol(astconf("NAME_MAX", ".", NiL), NiL, 0);
290 }
291 if (len > flen)
292 error(ERROR_exit(1), "%s: filename too long", prefix);
293 }
294 return fp;
295 }
296
297 /*
298 * return next sequential file name
299 */
300
301 static char*
getfname(struct fname * fp)302 getfname(struct fname* fp)
303 {
304 register char* cp = fp->last;
305
306 if (fp->format)
307 return sfprints(fp->format, fp->count++);
308 while (++(*cp) > fp->high)
309 {
310 *cp-- = fp->low;
311 if (cp < fp->suffix)
312 {
313 error(0, "file limit reached");
314 return 0;
315 }
316 }
317 fp->count++;
318 return fp->fname;
319 }
320
321 /*
322 * remove all generated files
323 */
324
325 static void
removeall(struct fname * fp)326 removeall(struct fname* fp)
327 {
328 register char* cp = fp->suffix;
329
330 while (*cp)
331 *cp++ = fp->low;
332 *(cp - 1) -= 1;
333 while (fp->count-- > 0)
334 {
335 remove(getfname(fp));
336 fp->count--;
337 }
338 fp->count = 0;
339 }
340
341 static int
msize(Sfio_t * in,long len)342 msize(Sfio_t* in, long len)
343 {
344 Sfoff_t off = sftell(in);
345 register char* cp;
346 register char* dp;
347 register long m;
348 register long n = len;
349 register long nlen = 0;
350
351 if (sfsize(in) - off <= len)
352 return len;
353 while (nlen == 0 && n > 0)
354 {
355 n -= BLK_SIZE;
356 if (n < 0)
357 n = 0;
358 sfseek(in, off + n, SEEK_SET);
359 if (!(dp = cp = sfreserve(in, BLK_SIZE, 0)))
360 return len;
361 m = BLK_SIZE;
362 while (m-- > 0)
363 {
364 if (*cp++ == '\n')
365 nlen = n + (cp - dp);
366 }
367 }
368 if (n > 0)
369 sfseek(in, off, SEEK_SET);
370 return nlen ? nlen : len;
371 }
372
373 static int
split(Sfio_t * in,struct fname * fp,struct op * op,int flags)374 split(Sfio_t* in, struct fname* fp, struct op* op, int flags)
375 {
376 register char* cp;
377 register char* s;
378 Sfoff_t len;
379 Sfoff_t z;
380 Sfoff_t size;
381 size_t repeat;
382 int c;
383
384 register Sfio_t* out = 0;
385 register char* peek = 0;
386 register long n = 0;
387 int delim = (flags & B_FLAG) ? -1 : '\n';
388 size_t lineno = 1;
389
390 while (op)
391 {
392 if (op->flags == OP_LINES)
393 len = op->size;
394 repeat = op->repeat;
395 do
396 {
397 if (op->flags != OP_SKIP)
398 {
399 if (!(cp = getfname(fp)))
400 goto err;
401 if (!(out = sfopen(NiL, cp, "w")))
402 {
403 fp->count--;
404 error(ERROR_SYSTEM|2, "%s: cannot create", cp);
405 goto err;
406 }
407 }
408 if (op->flags == OP_ABSOLUTE || op->flags == OP_LINES)
409 {
410 if (op->flags == OP_ABSOLUTE)
411 len = op->size - lineno;
412 if (peek)
413 {
414 if ((n = sfputr(out, peek, delim)) <= 0)
415 goto done;
416 peek = 0;
417 if (len > 0)
418 len--;
419 lineno++;
420 }
421 if (len)
422 {
423 z = (flags & M_FLAG) ? msize(in, len) : len;
424 if ((n = sfmove(in, out, z, delim)) < z || n < 0)
425 goto done;
426 lineno += n;
427 }
428 }
429 else
430 {
431 if (peek)
432 {
433 if (out && (n = sfputr(out, peek, delim)) <= 0)
434 goto done;
435 lineno++;
436 peek = 0;
437 }
438 while (s = sfgetr(in, delim, 1))
439 {
440 if (!(c = regexec(op->re, s, 0, NiL, 0)))
441 break;
442 lineno++;
443 if (c != REG_NOMATCH)
444 {
445 regfatal(op->re, 2, c);
446 goto err;
447 }
448 if (out && (n = sfputr(out, s, delim)) <= 0)
449 goto done;
450 }
451 if (!(peek = s))
452 {
453 while (op->next)
454 op = op->next;
455 repeat = 1;
456 }
457 }
458 if (out)
459 {
460 size = sfseek(out, (Sfoff_t)0, SEEK_END);
461 if (!(flags & S_FLAG))
462 sfprintf(sfstdout, "%I*d\n", sizeof(size), size);
463 sfclose(out);
464 out = 0;
465 if ((flags & Z_FLAG) && size <= 0)
466 remove(cp);
467 }
468 } while (!repeat || --repeat);
469 op = op->next;
470 }
471 done:
472 if (out)
473 {
474 sfclose(out);
475 if (n <= 0)
476 remove(cp);
477 }
478 if (n >= 0)
479 return 0;
480 err:
481 if (!(flags & K_FLAG))
482 removeall(fp);
483 return 1;
484 }
485
486 int
main(int argc,char ** argv)487 main(int argc, char** argv)
488 {
489 struct fname* fp;
490 struct op* top;
491 char* cp;
492 char* prefix;
493 const char* usage;
494 Sfio_t* in;
495 int flags;
496 ssize_t n;
497
498 char* format = 0;
499 Sfoff_t size = 10000;
500 int suflen = 2;
501
502 if (cp = strrchr(*argv, '/'))
503 cp++;
504 else
505 cp = *argv;
506 error_info.id = cp;
507 if (streq(cp, "split"))
508 {
509 usage = split_usage;
510 flags = S_FLAG|K_FLAG;
511 prefix = "x";
512 }
513 else
514 {
515 usage = csplit_usage;
516 flags = C_FLAG;
517 prefix = "xx";
518 }
519 for (;;)
520 {
521 switch (optget(argv, usage))
522 {
523 case 0:
524 break;
525 case 'l':
526 flags &= ~(B_FLAG|M_FLAG);
527 if ((size = opt_info.number) <= 0)
528 error(1, "%s: invalid size", opt_info.arg);
529 continue;
530 case 'k':
531 flags |= K_FLAG;
532 continue;
533 case 's':
534 flags |= S_FLAG;
535 continue;
536 case 'z':
537 flags |= Z_FLAG;
538 continue;
539 case 'f':
540 prefix = opt_info.arg;
541 continue;
542 case 'a':
543 case 'n':
544 suflen = opt_info.num;
545 continue;
546 case 'C':
547 flags |= M_FLAG;
548 case 'b':
549 if (flags & S_FLAG)
550 {
551 if ((size = opt_info.number) <= 0)
552 error(1, "%s: invalid size", opt_info.arg);
553 flags |= B_FLAG;
554 }
555 else
556 format = opt_info.arg;
557 continue;
558 case ':':
559 error(2, "%s", opt_info.arg);
560 break;
561 case '?':
562 error(ERROR_usage(2), "%s", opt_info.arg);
563 break;
564 }
565 break;
566 }
567 argv += opt_info.index;
568 argc -= opt_info.index;
569 if (error_info.errors || !(flags & C_FLAG) && argc > 2 || (flags & C_FLAG) && argc < 2)
570 error(ERROR_usage(2), "%s", optusage(NiL));
571 cp = *argv++;
572 if (flags & C_FLAG)
573 {
574 struct op* op = 0;
575 char* sp;
576
577 while (sp = *argv++)
578 {
579 switch (*sp)
580 {
581 case '/':
582 case '?':
583 case '%':
584 op = getexpr(op ? &op->next : &top, sp);
585 break;
586 case '{':
587 if (!op)
588 error(ERROR_exit(1), "%s: pattern expected for repeat count", *(argv - 1));
589 if (*++sp == '*' && *(sp + 1) == '}' && !*(sp + 2))
590 op->repeat = 0;
591 else
592 {
593 if ((n = strtol(sp, &sp, 10)) <= 0 || *sp != '}' || *(sp + 1))
594 error(ERROR_exit(1), "%s: invalid repeat count", *(argv - 1));
595 op->repeat = n + 1;
596 }
597 if (op->flags == OP_ABSOLUTE)
598 op->flags = OP_LINES;
599 break;
600 default:
601 if ((size = strtoll(sp, &sp, 10)) <= 0 || *sp)
602 error(ERROR_exit(1), "%s: invalid line number", *(argv - 1));
603 op = getop(op ? &op->next : &top, size, 1, OP_ABSOLUTE, 0);
604 break;
605 }
606 }
607 op = getop(op ? &op->next : &top, SF_UNBOUND, 1, OP_LINES, 0);
608 fp = setfname(prefix, format, suflen, '0', '9');
609 }
610 else
611 {
612 if (cp && *argv)
613 prefix = *argv;
614 getop(&top, size, SF_UNBOUND, OP_LINES, 0);
615 fp = setfname(prefix, format, suflen, 'a', 'z');
616 }
617 if (!cp || streq(cp, "-"))
618 in = sfstdin;
619 else if (!(in = sfopen(NiL, cp, "r")))
620 error(ERROR_system(1), "%s: cannot open", cp);
621 n = split(in, fp, top, flags);
622 if (in != sfstdin)
623 sfclose(in);
624 return n;
625 }
626