1 /***********************************************************************
2 * *
3 * This software is part of the ast package *
4 * Copyright (c) 1992-2013 AT&T Intellectual Property *
5 * and is licensed under the *
6 * Eclipse Public License, Version 1.0 *
7 * by AT&T Intellectual Property *
8 * *
9 * A copy of the License is available at *
10 * http://www.eclipse.org/org/documents/epl-v10.html *
11 * (with md5 checksum b35adb5213ca9657e911e9befb180842) *
12 * *
13 * Information and Software Systems Research *
14 * AT&T Research *
15 * Florham Park NJ *
16 * *
17 * Glenn Fowler <glenn.s.fowler@gmail.com> *
18 * David Korn <dgkorn@gmail.com> *
19 * *
20 ***********************************************************************/
21 #pragma prototyped
22 /*
23 * David Korn
24 * AT&T Bell Laboratories
25 *
 * cut selected fields or columns from each line of a file
27 */
28
29 static const char usage[] =
30 "[-?\n@(#)$Id: cut (AT&T Research) 2013-09-13 $\n]"
31 USAGE_LICENSE
32 "[+NAME?cut - cut out selected columns or fields of each line of a file]"
33 "[+DESCRIPTION?\bcut\b bytes, characters, or character-delimited fields "
34 "from one or more files, contatenating them on standard output.]"
35 "[+?The option argument \alist\a is a comma-separated or blank-separated "
36 "list of positive numbers and ranges. Ranges can be of three "
37 "forms. The first is two positive integers separated by a hyphen "
38 "(\alow\a\b-\b\ahigh\a), which represents all fields from \alow\a to "
39 "\ahigh\a. The second is a positive number preceded by a hyphen "
40 "(\b-\b\ahigh\a), which represents all fields from field \b1\b to "
41 "\ahigh\a. The last is a positive number followed by a hyphen "
42 "(\alow\a\b-\b), which represents all fields from \alow\a to the "
43 "last field, inclusive. Elements in the \alist\a can be repeated, "
44 "can overlap, and can appear in any order. The order of the "
45 "output is that of the input.]"
46 "[+?One and only one of \b-b\b, \b-c\b, or \b-f\b must be specified.]"
47 "[+?If no \afile\a is given, or if the \afile\a is \b-\b, \bcut\b "
48 "cuts from standard input. The start of the file is defined "
49 "as the current offset.]"
50 "[b:bytes]:[list?\bcut\b based on a list of byte counts.]"
51 "[c:characters]:[list?\bcut\b based on a list of character counts.]"
52 "[d:delimiter]:[delim?The field character for the \b-f\b option is set "
53 "to \adelim\a. The default is the \btab\b character.]"
54 "[f:fields]:[list?\bcut\b based on fields separated by the delimiter "
55 "character specified with the \b-d\b optiion.]"
56 "[n!:split?Split multibyte characters selected by the \b-b\b option.]"
57 "[R|r:reclen]#[reclen?If \areclen\a > 0, the input will be read as fixed length "
58 "records of length \areclen\a when used with the \b-b\b or \b-c\b "
59 "option.]"
60 "[s:suppress|only-delimited?Suppress lines with no delimiter characters, "
61 "when used with the \b-f\b option. By default, lines with no "
62 "delimiters will be passsed in untouched.]"
63 "[D:line-delimeter|output-delimiter]:[ldelim?The line delimiter character for "
64 "the \b-f\b option is set to \aldelim\a. The default is the "
65 "\bnewline\b character.]"
66 "[N!:newline?Output new-lines at end of each record when used "
67 "with the \b-b\b or \b-c\b option.]"
68 "\n"
69 "\n[file ...]\n"
70 "\n"
71 "[+EXIT STATUS?]{"
72 "[+0?All files processed successfully.]"
73 "[+>0?One or more files failed to open or could not be read.]"
74 "}"
75 "[+SEE ALSO?\bpaste\b(1), \bgrep\b(1)]"
76 ;
77
78 #include <cmd.h>
79 #include <ctype.h>
80
/*
 * a field or line delimiter as set up by b_cut()
 */
typedef struct Delim_s
{
	char*		str;	/* option argument holding the delimiter bytes; only used when len > 1 */
	int		len;	/* number of bytes in the delimiter, 1 for the single byte case */
	int		chr;	/* delimiter character: mbchar() result or the first byte of str */
} Delim_t;
87
/*
 * per-invocation state built by cutinit() and consumed by
 * cutcols() and cutfields()
 */
typedef struct Cut_s
{
	int		mb;		/* value of mbwide() when the state was built */
	int		eob;		/* line delimiter byte if it is one byte long, otherwise 0 */
	int		cflag;		/* C_CHARS requested and mb is set: count characters */
	int		nosplit;	/* C_BYTES and C_NOSPLIT requested and mb is set */
	int		sflag;		/* C_SUPRESS requested */
	int		nlflag;		/* C_NONEWLINE requested */
	int		reclen;		/* fixed record length, 0 to read delimited lines */
	Delim_t		wdelim;		/* field delimiter */
	Delim_t		ldelim;		/* line delimiter */
	Mbstate_t	q;		/* state handed to mbtsize() and mbchar() */
	unsigned char	space[UCHAR_MAX+1];	/* byte class table: 0, SP_LINE, SP_WORD or SP_WIDE */
	int		list[2];	/* NOTE: must be last member */
					/* cutinit() allocates extra ints past list[] and fills it */
					/* with alternating skip/copy counts ending in HUGE */
} Cut_t;
103
/* list terminator and upper bound for list values */
#define HUGE		INT_MAX
/* size passed to sftmp() by cutfields(); parenthesized so it is safe in any expression */
#define BLOCK		(8*1024)

/* mode bits collected by b_cut() from the command line options */
#define C_BYTES		1	/* -b */
#define C_CHARS		2	/* -c */
#define C_FIELDS	4	/* -f */
#define C_SUPRESS	8	/* -s */
#define C_NOSPLIT	16	/* -n */
#define C_NONEWLINE	32	/* -N */

/* non-zero byte classes stored in Cut_t.space[] */
#define SP_LINE		1	/* the end of line byte (Cut_t.eob) */
#define SP_WORD		2	/* a single byte field delimiter */
#define SP_WIDE		3	/* a byte in the upper half of the table when mbwide() is set */
116
/*
 * qsort() comparison function: order two int arrays by their
 * leading element only
 */

static int
mycomp(register const void* a, register const void* b)
{
	const int	left = *(const int*)a;
	const int	right = *(const int*)b;

	/* yields -1, 0 or 1 without relying on subtraction */
	return (left > right) - (left < right);
}
130
131 static Cut_t*
cutinit(int mode,char * str,Delim_t * wdelim,Delim_t * ldelim,size_t reclen)132 cutinit(int mode, char* str, Delim_t* wdelim, Delim_t* ldelim, size_t reclen)
133 {
134 register int* lp;
135 register int c;
136 register int n = 0;
137 register int range = 0;
138 register char* cp = str;
139 Cut_t* cut;
140
141 if (!(cut = (Cut_t*)stakalloc(sizeof(Cut_t) + strlen(cp) * sizeof(int))))
142 error(ERROR_exit(1), "out of space");
143 if (cut->mb = mbwide())
144 {
145 memset(cut->space, 0, sizeof(cut->space) / 2);
146 memset(cut->space + sizeof(cut->space) / 2, SP_WIDE, sizeof(cut->space) / 2);
147 }
148 else
149 memset(cut->space, 0, sizeof(cut->space));
150 cut->wdelim = *wdelim;
151 if (wdelim->len == 1)
152 cut->space[wdelim->chr] = SP_WORD;
153 cut->ldelim = *ldelim;
154 cut->eob = (ldelim->len == 1) ? ldelim->chr : 0;
155 cut->space[cut->eob] = SP_LINE;
156 cut->cflag = (mode&C_CHARS) && cut->mb;
157 cut->nosplit = (mode&(C_BYTES|C_NOSPLIT)) == (C_BYTES|C_NOSPLIT) && cut->mb;
158 cut->sflag = (mode&C_SUPRESS) != 0;
159 cut->nlflag = (mode&C_NONEWLINE) != 0;
160 cut->reclen = reclen;
161 lp = cut->list;
162 for (;;)
163 switch(c = *cp++)
164 {
165 case ' ':
166 case '\t':
167 while(*cp==' ' || *cp=='\t')
168 cp++;
169 /*FALLTHROUGH*/
170 case 0:
171 case ',':
172 if(range)
173 {
174 --range;
175 if((n = (n ? (n-range) : (HUGE-1))) < 0)
176 error(ERROR_exit(1),"invalid range for c/f option");
177 *lp++ = range;
178 *lp++ = n;
179 }
180 else
181 {
182 *lp++ = --n;
183 *lp++ = 1;
184 }
185 if(c==0)
186 {
187 register int *dp;
188 *lp = HUGE;
189 n = 1 + (lp-cut->list)/2;
190 qsort(lp=cut->list,n,2*sizeof(*lp),mycomp);
191 /* eliminate overlapping regions */
192 for(n=0,range= -2,dp=lp; *lp!=HUGE; lp+=2)
193 {
194 if(lp[0] <= range)
195 {
196 if(lp[1]==HUGE)
197 {
198 dp[-1] = HUGE;
199 break;
200 }
201 if((c = lp[0]+lp[1]-range)>0)
202 {
203 range += c;
204 dp[-1] += c;
205 }
206 }
207 else
208 {
209 range = *dp++ = lp[0];
210 if(lp[1]==HUGE)
211 {
212 *dp++ = HUGE;
213 break;
214 }
215 range += (*dp++ = lp[1]);
216 }
217 }
218 *dp = HUGE;
219 lp = cut->list;
220 /* convert ranges into gaps */
221 for(n=0; *lp!=HUGE; lp+=2)
222 {
223 c = *lp;
224 *lp -= n;
225 n = c+lp[1];
226 }
227 return cut;
228 }
229 n = range = 0;
230 break;
231
232 case '-':
233 if(range)
234 error(ERROR_exit(1),"bad list for c/f option");
235 range = n?n:1;
236 n = 0;
237 break;
238
239 default:
240 if(!isdigit(c))
241 error(ERROR_exit(1),"bad list for c/f option");
242 n = 10*n + (c-'0');
243 break;
244 }
245 /* NOTREACHED */
246 }
247
/*
 * cut each record of <fdin> by byte or character position and put
 * the results to <fdout> using the skip/copy counts in cut->list
 *
 * a record is cut->reclen bytes when that is non-zero, otherwise a
 * newline terminated line; a final unterminated record is also
 * processed
 * returns early, without any diagnostic, if a write to <fdout> fails
 */

static void
cutcols(Cut_t* cut, Sfio_t* fdin, Sfio_t* fdout)
{
	register int		c;
	register int		len;
	register int		ncol = 0;
	register const int*	lp = cut->list;
	register char*		bp;
	register int		skip; /* non-zero for don't copy */
	int			must;
	const char*		xx;

	for (;;)
	{
		/* get the next record */
		if (len = cut->reclen)
			bp = sfreserve(fdin, len, -1);
		else
			bp = sfgetr(fdin, '\n', 0);
		/* nothing complete: pick up a trailing partial record, else done */
		if (!bp && !(bp = sfgetr(fdin, 0, SF_LASTR)))
			break;
		len = sfvalue(fdin);
		xx = 0;
		/* a zero first count means the list starts with a copy span */
		if (!(ncol = skip = *(lp = cut->list)))
			ncol = *++lp;
		/* must stays set until something has been written for this record */
		must = 1;
		do
		{
			if (cut->nosplit)
			{
				/* byte counts, but whole multibyte characters only */
				register const char*	s = bp;
				register int		w = len < ncol ? len : ncol;
				register int		z;

				while (w > 0)
				{
					if (!(*s & 0x80))
						z = 1;
					else if ((z = mbtsize(s, w, &cut->q)) <= 0)
					{
						/*
						 * NOTE(review): xx appears to remember where a
						 * sequence that did not fit in the previous span
						 * began so that this span can back up to it --
						 * confirm
						 */
						if (s == bp && xx)
						{
							w += s - xx;
							bp = (char*)(s = xx);
							xx = 0;
							continue;
						}
						xx = s;
						if (skip)
							s += w;
						w = 0;
						break;
					}
					s += z;
					w -= z;
				}
				c = s - bp;
				ncol = !w && ncol >= len;
			}
			else if (cut->cflag)
			{
				/* character counts: step over one character per count */
				register const char*	s = bp;
				register int		w = len;
				register int		z;

				while (w > 0 && ncol > 0)
				{
					ncol--;
					/* a byte that mbtsize() rejects counts as one character */
					if (!(*s & 0x80) || (z = mbtsize(s, w, &cut->q)) <= 0)
						z = 1;
					s += z;
					w -= z;

				}
				c = s - bp;
				ncol = !w && (ncol || !skip);
			}
			else
			{
				/* plain byte counts */
				if ((c = ncol) > len)
					c = len;
				else if (c == len && !skip)
					/* the copy span ends exactly at the record end: leave ncol non-zero to stop */
					ncol++;
				ncol -= c;
			}
			if (!skip && c)
			{
				if (sfwrite(fdout, (char*)bp, c) < 0)
					return;
				must = 0;
			}
			bp += c;
			/* a non-zero ncol here means the record ran out inside this span */
			if (ncol)
				break;
			len -= c;
			/* move to the next count and flip between skipping and copying */
			ncol = *++lp;
			skip = !skip;
		} while (ncol != HUGE);
		/*
		 * unless nlflag is set, add the line delimiter when the record
		 * ended while skipping, when nothing was written, or when
		 * fixed length records are in use
		 */
		if (!cut->nlflag && (skip || must || cut->reclen))
		{
			if (cut->ldelim.len > 1)
				sfwrite(fdout, cut->ldelim.str, cut->ldelim.len);
			else
				sfputc(fdout, cut->ldelim.chr);
		}
	}
}
358
/*
 * cut each line of file <fdin> and put results to <fdout> using list <list>
 * stream <fdin> must be line buffered
 *
 * the input is scanned one sfreserve() buffer at a time using the
 * byte class table cut->space; cut->list supplies alternating counts
 * of fields to skip and fields to copy
 * a line that continues into the next buffer before any delimiter was
 * seen is saved in a temporary stream so that it can still be passed
 * through whole if it turns out to have no delimiter
 * on a write or read failure the function stops and returns without a
 * diagnostic
 */

static void
cutfields(Cut_t* cut, Sfio_t* fdin, Sfio_t* fdout)
{
	register unsigned char	*sp = cut->space;	/* byte class table */
	register unsigned char	*cp;			/* scan position */
	register unsigned char	*wp;			/* the delimiter that ended the current field */
	register int		c, nfields;		/* nfields: fields left in the current span */
	register const int	*lp = cut->list;
	register unsigned char	*copy;			/* start of bytes pending output, 0 while skipping */
	register int		nodelim, empty, inword=0; /* inword: the line continues in the next buffer */
	register unsigned char	*ep;			/* last byte of the buffer */
	unsigned char		*bp, *first;		/* first: start of the current line in this buffer */
	char			*tp;
	int			lastchar;		/* original value of the last buffer byte */
	wchar_t			w;
	Sfio_t			*fdtmp = 0;		/* holds the start of a line with no delimiter so far */
	long			offset = 0;		/* bytes currently saved in fdtmp */
	unsigned char		mb[8];			/* multibyte character split across two buffers */
	/* process each buffer */
	while ((bp = (unsigned char*)sfreserve(fdin, SF_UNBOUND, -1)) && (c = sfvalue(fdin)) > 0)
	{
		cp = bp;
		ep = cp + --c;
		/* plant the end of line byte as a sentinel so the scan loops always stop */
		if((lastchar = cp[c]) != cut->eob)
			*ep = cut->eob;
		/* process each line in the buffer */
		while (cp <= ep)
		{
			first = cp;
			if (!inword)
			{
				/* new line: restart the list; a zero first count means copy from field 1 */
				nodelim = empty = 1;
				copy = cp;
				if (nfields = *(lp = cut->list))
					copy = 0;
				else
					nfields = *++lp;
			}
			else if (copy)
				copy = cp;
			inword = 0;
			do
			{
				/* skip over non-delimiter characters */
				if (cut->mb)
					for (;;)
					{
						switch (c = sp[*(unsigned char*)cp++])
						{
						case 0:
							continue;
						case SP_WIDE:
							wp = --cp;
							while ((c = mbchar(&w, cp, ep - cp, &cut->q)), mberrno(&cut->q) == E2BIG)
							{
								int	i;
								int	j;
								int	k;

								/* mb char spanning buffer boundary -- fun stuff */

								cp = wp;
								if (lastchar != cut->eob)
								{
									/* retry with the real last byte in place of the sentinel */
									*ep = lastchar;
									if ((c = mbchar(&w, cp, ep - cp + 1, &cut->q)), mberrno(&cut->q) != E2BIG)
										break;
								}
								if (copy)
								{
									/* flush what is pending before the buffer is replaced */
									empty = 0;
									if ((c = cp - copy) > 0 && sfwrite(fdout, (char*)copy, c) < 0)
										goto failed;
								}
								/* save the partial character, then read the next buffer */
								for (i = 0; i <= (ep - cp); i++)
									mb[i] = cp[i];
								if (!(bp = (unsigned char*)sfreserve(fdin, SF_UNBOUND, -1)) || (c = sfvalue(fdin)) <= 0)
									goto failed;
								cp = bp;
								ep = cp + --c;
								if ((lastchar = cp[c]) != cut->eob)
									*ep = cut->eob;
								/* complete the character from the head of the new buffer */
								/* NOTE(review): assumes mbmax() <= sizeof(mb) -- confirm */
								j = i;
								k = 0;
								while (j < mbmax())
									mb[j++] = cp[k++];
								tp = (char*)mb;
								w = mbchar(&w, tp, j, &cut->q);
								if (mberrno(&cut->q))
								{
									w = 0;
									c = i;
								}
								else
									c = tp - (char*)mb;
								/* resume just past the bytes that completed the character */
								first = bp = cp += c - i;
								if (copy)
								{
									copy = bp;
									if (w == cut->ldelim.chr)
										lastchar = cut->ldelim.chr;
									else if (w != cut->wdelim.chr)
									{
										empty = 0;
										if (sfwrite(fdout, (char*)mb, c) < 0)
											goto failed;
									}
								}
							}
							if (c == cut->wdelim.chr)
							{
								c = SP_WORD;
								break;
							}
							if (c == cut->ldelim.chr)
							{
								c = SP_LINE;
								break;
							}
							continue;
						default:
							wp = cp - 1;
							break;
						}
						break;
					}
				else
				{
					while (!(c = sp[*cp++]));
					wp = cp - 1;
				}
				/* check for end-of-line */
				if (c == SP_LINE)
				{
					/* a real end of line inside the buffer */
					if (cp <= ep)
						break;
					/* the buffer really ended with the line delimiter */
					if (lastchar == cut->ldelim.chr)
						break;
					/* restore cut->last character */
					if (lastchar != cut->eob)
						*ep = lastchar;
					/* only the sentinel was hit: the line continues in the next buffer */
					inword++;
					if (!sp[lastchar])
						break;
				}
				nodelim = 0;
				if (--nfields > 0)
					continue;
				/* span finished: switch between skipping and copying */
				nfields = *++lp;
				if (copy)
				{
					empty = 0;
					if ((c = wp - copy) > 0 && sfwrite(fdout, (char*)copy, c) < 0)
						goto failed;
					copy = 0;
				}
				else
					/* set to delimiter unless the first field */
					copy = empty ? cp : wp;
			} while (!inword);
			if (!inword)
			{
				/* the line is complete */
				if (!copy)
				{
					if (nodelim)
					{
						if (!cut->sflag)
						{
							/* no delimiter and not suppressed: pass the whole line, saved part first */
							if (offset)
							{
								sfseek(fdtmp,(Sfoff_t)0,SEEK_SET);
								sfmove(fdtmp,fdout,offset,-1);
							}
							copy = first;
						}
					}
					else
						/* NOTE(review): writes a literal newline rather than cut->ldelim -- confirm intended */
						sfputc(fdout,'\n');
				}
				/* discard anything saved for this line */
				if (offset)
					sfseek(fdtmp,offset=0,SEEK_SET);
			}
			if (copy && (c=cp-copy)>0 && (!nodelim || !cut->sflag) && sfwrite(fdout,(char*)copy,c)< 0)
				goto failed;
		}
		/* see whether to save in tmp file */
		if(inword && nodelim && !cut->sflag && (c=cp-first)>0)
		{
			/* copy line to tmpfile in case no fields */
			if(!fdtmp)
				fdtmp = sftmp(BLOCK);
			sfwrite(fdtmp,(char*)first,c);
			offset +=c;
		}
	}
failed:
	if(fdtmp)
		sfclose(fdtmp);
}
563
564 int
b_cut(int argc,char ** argv,Shbltin_t * context)565 b_cut(int argc, char** argv, Shbltin_t* context)
566 {
567 register char* cp = 0;
568 register Sfio_t* fp;
569 char* s;
570 int n;
571 Cut_t* cut;
572 int mode = 0;
573 Delim_t wdelim;
574 Delim_t ldelim;
575 size_t reclen = 0;
576 wchar_t w;
577 Mbstate_t q;
578
579 cmdinit(argc, argv, context, ERROR_CATALOG, 0);
580 wdelim.chr = '\t';
581 ldelim.chr = '\n';
582 wdelim.len = ldelim.len = 1;
583 for (;;)
584 {
585 switch (optget(argv, usage))
586 {
587 case 0:
588 break;
589 case 'b':
590 case 'c':
591 if(mode&C_FIELDS)
592 {
593 error(2, "f option already specified");
594 continue;
595 }
596 cp = opt_info.arg;
597 if(opt_info.option[1]=='b')
598 mode |= C_BYTES;
599 else
600 mode |= C_CHARS;
601 continue;
602 case 'D':
603 ldelim.str = opt_info.arg;
604 if (mbwide())
605 {
606 s = opt_info.arg;
607 mbinit(&q);
608 ldelim.chr = mbchar(&w, s, MB_LEN_MAX, &q);
609 if ((n = s - opt_info.arg) > 1)
610 {
611 ldelim.len = n;
612 continue;
613 }
614 }
615 ldelim.chr = *(unsigned char*)opt_info.arg;
616 ldelim.len = 1;
617 continue;
618 case 'd':
619 wdelim.str = opt_info.arg;
620 if (mbwide())
621 {
622 s = opt_info.arg;
623 mbinit(&q);
624 wdelim.chr = mbchar(&w, s, MB_LEN_MAX, &q);
625 if ((n = s - opt_info.arg) > 1)
626 {
627 wdelim.len = n;
628 continue;
629 }
630 }
631 wdelim.chr = *(unsigned char*)opt_info.arg;
632 wdelim.len = 1;
633 continue;
634 case 'f':
635 if(mode&(C_CHARS|C_BYTES))
636 {
637 error(2, "c option already specified");
638 continue;
639 }
640 cp = opt_info.arg;
641 mode |= C_FIELDS;
642 continue;
643 case 'n':
644 mode |= C_NOSPLIT;
645 continue;
646 case 'N':
647 mode |= C_NONEWLINE;
648 continue;
649 case 'R':
650 case 'r':
651 if(opt_info.num>0)
652 reclen = opt_info.num;
653 continue;
654 case 's':
655 mode |= C_SUPRESS;
656 continue;
657 case ':':
658 error(2, "%s", opt_info.arg);
659 break;
660 case '?':
661 error(ERROR_usage(2), "%s", opt_info.arg);
662 break;
663 }
664 break;
665 }
666 argv += opt_info.index;
667 if (error_info.errors)
668 error(ERROR_usage(2), "%s",optusage(NiL));
669 if(!cp)
670 {
671 error(2, "b, c or f option must be specified");
672 error(ERROR_usage(2), "%s", optusage(NiL));
673 }
674 if(!*cp)
675 error(3, "non-empty b, c or f option must be specified");
676 if((mode & (C_FIELDS|C_SUPRESS)) == C_SUPRESS)
677 error(3, "s option requires f option");
678 cut = cutinit(mode, cp, &wdelim, &ldelim, reclen);
679 if(cp = *argv)
680 argv++;
681 do
682 {
683 if(!cp || streq(cp,"-"))
684 fp = sfstdin;
685 else if(!(fp = sfopen(NiL,cp,"r")))
686 {
687 error(ERROR_system(0),"%s: cannot open",cp);
688 continue;
689 }
690 if(mode&C_FIELDS)
691 cutfields(cut,fp,sfstdout);
692 else
693 cutcols(cut,fp,sfstdout);
694 if(fp!=sfstdin)
695 sfclose(fp);
696 } while(cp = *argv++);
697 if (sfsync(sfstdout))
698 error(ERROR_system(0), "write error");
699 return error_info.errors != 0;
700 }
701