1 /***********************************************************************
2 *                                                                      *
3 *               This software is part of the ast package               *
4 *          Copyright (c) 1992-2013 AT&T Intellectual Property          *
5 *                      and is licensed under the                       *
6 *                 Eclipse Public License, Version 1.0                  *
7 *                    by AT&T Intellectual Property                     *
8 *                                                                      *
9 *                A copy of the License is available at                 *
10 *          http://www.eclipse.org/org/documents/epl-v10.html           *
11 *         (with md5 checksum b35adb5213ca9657e911e9befb180842)         *
12 *                                                                      *
13 *              Information and Software Systems Research               *
14 *                            AT&T Research                             *
15 *                           Florham Park NJ                            *
16 *                                                                      *
17 *               Glenn Fowler <glenn.s.fowler@gmail.com>                *
18 *                    David Korn <dgkorn@gmail.com>                     *
19 *                                                                      *
20 ***********************************************************************/
21 #pragma prototyped
22 /*
23  * David Korn
24  * AT&T Bell Laboratories
25  *
 * cut selected fields or columns from each line of a file
27  */
28 
29 static const char usage[] =
30 "[-?\n@(#)$Id: cut (AT&T Research) 2013-09-13 $\n]"
31 USAGE_LICENSE
32 "[+NAME?cut - cut out selected columns or fields of each line of a file]"
33 "[+DESCRIPTION?\bcut\b bytes, characters, or character-delimited fields "
34 	"from one or more files, contatenating them on standard output.]"
35 "[+?The option argument \alist\a is a comma-separated or blank-separated "
36 	"list of positive numbers and ranges.  Ranges can be of three "
37 	"forms.  The first is two positive integers separated by a hyphen "
38 	"(\alow\a\b-\b\ahigh\a), which represents all fields from \alow\a to "
39 	"\ahigh\a.  The second is a positive number preceded by a hyphen "
40 	"(\b-\b\ahigh\a), which represents all fields from field \b1\b to "
41 	"\ahigh\a.  The last is a positive number followed by a hyphen "
42 	"(\alow\a\b-\b), which represents all fields from \alow\a to the "
43 	"last field, inclusive.  Elements in the \alist\a can be repeated, "
44 	"can overlap, and can appear in any order.  The order of the "
45 	"output is that of the input.]"
46 "[+?One and only one of \b-b\b, \b-c\b, or \b-f\b must be specified.]"
47 "[+?If no \afile\a is given, or if the \afile\a is \b-\b, \bcut\b "
48         "cuts from standard input.   The start of the file is defined "
49         "as the current offset.]"
50 "[b:bytes]:[list?\bcut\b based on a list of byte counts.]"
51 "[c:characters]:[list?\bcut\b based on a list of character counts.]"
52 "[d:delimiter]:[delim?The field character for the \b-f\b option is set "
53 	"to \adelim\a.  The default is the \btab\b character.]"
54 "[f:fields]:[list?\bcut\b based on fields separated by the delimiter "
55 	"character specified with the \b-d\b optiion.]"
56 "[n!:split?Split multibyte characters selected by the \b-b\b option.]"
57 "[R|r:reclen]#[reclen?If \areclen\a > 0, the input will be read as fixed length "
58 	"records of length \areclen\a when used with the \b-b\b or \b-c\b "
59 	"option.]"
60 "[s:suppress|only-delimited?Suppress lines with no delimiter characters, "
61 	"when used with the \b-f\b option.  By default, lines with no "
62 	"delimiters will be passsed in untouched.]"
63 "[D:line-delimeter|output-delimiter]:[ldelim?The line delimiter character for "
64 	"the \b-f\b option is set to \aldelim\a.  The default is the "
65 	"\bnewline\b character.]"
66 "[N!:newline?Output new-lines at end of each record when used "
67 	"with the \b-b\b or \b-c\b option.]"
68 "\n"
69 "\n[file ...]\n"
70 "\n"
71 "[+EXIT STATUS?]{"
72 	"[+0?All files processed successfully.]"
73 	"[+>0?One or more files failed to open or could not be read.]"
74 "}"
75 "[+SEE ALSO?\bpaste\b(1), \bgrep\b(1)]"
76 ;
77 
78 #include <cmd.h>
79 #include <ctype.h>
80 
/*
 * one delimiter (field or line) as selected on the command line
 */
typedef struct Delim_s
{
	char	*str;	/* delimiter text; written out when len > 1 */
	int	len;	/* number of bytes in the delimiter */
	int	chr;	/* delimiter as a single character value */
} Delim_t;
87 
88 typedef struct Cut_s
89 {
90 	int		mb;
91 	int		eob;
92 	int		cflag;
93 	int		nosplit;
94 	int		sflag;
95 	int		nlflag;
96 	int		reclen;
97 	Delim_t		wdelim;
98 	Delim_t		ldelim;
99 	Mbstate_t	q;
100 	unsigned char	space[UCHAR_MAX+1];
101 	int		list[2];	/* NOTE: must be last member */
102 } Cut_t;
103 
/* list terminator; HUGE-1 is stored as the length of an open ended range */
#define HUGE		INT_MAX
/* buffer size handed to sftmp(); parenthesized so it is safe in any expression */
#define BLOCK		(8*1024)

/* mode bits collected by b_cut() and decoded by cutinit() */
#define C_BYTES		1
#define C_CHARS		2
#define C_FIELDS	4
#define C_SUPRESS	8
#define C_NOSPLIT	16
#define C_NONEWLINE	32

/* byte classes stored in Cut_t.space[]; 0 means an ordinary byte */
#define SP_LINE		1
#define SP_WORD		2
#define SP_WIDE		3
116 
117 /*
118  * compare the first of an array of integers
119  */
120 
/*
 * qsort() callback: order two int arrays by their first element only
 * returns -1, 0 or 1
 */
static int
mycomp(register const void* a, register const void* b)
{
	const int	lhs = *((int*)a);
	const int	rhs = *((int*)b);

	return (lhs > rhs) - (lhs < rhs);
}
130 
131 static Cut_t*
cutinit(int mode,char * str,Delim_t * wdelim,Delim_t * ldelim,size_t reclen)132 cutinit(int mode, char* str, Delim_t* wdelim, Delim_t* ldelim, size_t reclen)
133 {
134 	register int*	lp;
135 	register int	c;
136 	register int	n = 0;
137 	register int	range = 0;
138 	register char*	cp = str;
139 	Cut_t*		cut;
140 
141 	if (!(cut = (Cut_t*)stakalloc(sizeof(Cut_t) + strlen(cp) * sizeof(int))))
142 		error(ERROR_exit(1), "out of space");
143 	if (cut->mb = mbwide())
144 	{
145 		memset(cut->space, 0, sizeof(cut->space) / 2);
146 		memset(cut->space + sizeof(cut->space) / 2, SP_WIDE, sizeof(cut->space) / 2);
147 	}
148 	else
149 		memset(cut->space, 0, sizeof(cut->space));
150 	cut->wdelim = *wdelim;
151 	if (wdelim->len == 1)
152 		cut->space[wdelim->chr] = SP_WORD;
153 	cut->ldelim = *ldelim;
154 	cut->eob = (ldelim->len == 1) ? ldelim->chr : 0;
155 	cut->space[cut->eob] = SP_LINE;
156 	cut->cflag = (mode&C_CHARS) && cut->mb;
157 	cut->nosplit = (mode&(C_BYTES|C_NOSPLIT)) == (C_BYTES|C_NOSPLIT) && cut->mb;
158 	cut->sflag = (mode&C_SUPRESS) != 0;
159 	cut->nlflag = (mode&C_NONEWLINE) != 0;
160 	cut->reclen = reclen;
161 	lp = cut->list;
162 	for (;;)
163 		switch(c = *cp++)
164 		{
165 		case ' ':
166 		case '\t':
167 			while(*cp==' ' || *cp=='\t')
168 				cp++;
169 			/*FALLTHROUGH*/
170 		case 0:
171 		case ',':
172 			if(range)
173 			{
174 				--range;
175 				if((n = (n ? (n-range) : (HUGE-1))) < 0)
176 					error(ERROR_exit(1),"invalid range for c/f option");
177 				*lp++ = range;
178 				*lp++ = n;
179 			}
180 			else
181 			{
182 				*lp++ = --n;
183 				*lp++ = 1;
184 			}
185 			if(c==0)
186 			{
187 				register int *dp;
188 				*lp = HUGE;
189 				n = 1 + (lp-cut->list)/2;
190 				qsort(lp=cut->list,n,2*sizeof(*lp),mycomp);
191 				/* eliminate overlapping regions */
192 				for(n=0,range= -2,dp=lp; *lp!=HUGE; lp+=2)
193 				{
194 					if(lp[0] <= range)
195 					{
196 						if(lp[1]==HUGE)
197 						{
198 							dp[-1] = HUGE;
199 							break;
200 						}
201 						if((c = lp[0]+lp[1]-range)>0)
202 						{
203 							range += c;
204 							dp[-1] += c;
205 						}
206 					}
207 					else
208 					{
209 						range = *dp++ = lp[0];
210 						if(lp[1]==HUGE)
211 						{
212 							*dp++ = HUGE;
213 							break;
214 						}
215 						range += (*dp++ = lp[1]);
216 					}
217 				}
218 				*dp = HUGE;
219 				lp = cut->list;
220 				/* convert ranges into gaps */
221 				for(n=0; *lp!=HUGE; lp+=2)
222 				{
223 					c = *lp;
224 					*lp -= n;
225 					n = c+lp[1];
226 				}
227 				return cut;
228 			}
229 			n = range = 0;
230 			break;
231 
232 		case '-':
233 			if(range)
234 				error(ERROR_exit(1),"bad list for c/f option");
235 			range = n?n:1;
236 			n = 0;
237 			break;
238 
239 		default:
240 			if(!isdigit(c))
241 				error(ERROR_exit(1),"bad list for c/f option");
242 			n = 10*n + (c-'0');
243 			break;
244 		}
245 	/* NOTREACHED */
246 }
247 
248 /*
 * cut bytes or characters from each record of <fdin> and put the results to <fdout> using the list in <cut>
250  */
251 
252 static void
cutcols(Cut_t * cut,Sfio_t * fdin,Sfio_t * fdout)253 cutcols(Cut_t* cut, Sfio_t* fdin, Sfio_t* fdout)
254 {
255 	register int		c;
256 	register int		len;
257 	register int		ncol = 0;
258 	register const int*	lp = cut->list;
259 	register char*		bp;
260 	register int		skip; /* non-zero for don't copy */
261 	int			must;
262 	const char*		xx;
263 
264 	for (;;)
265 	{
266 		if (len = cut->reclen)
267 			bp = sfreserve(fdin, len, -1);
268 		else
269 			bp = sfgetr(fdin, '\n', 0);
270 		if (!bp && !(bp = sfgetr(fdin, 0, SF_LASTR)))
271 			break;
272 		len = sfvalue(fdin);
273 		xx = 0;
274 		if (!(ncol = skip  = *(lp = cut->list)))
275 			ncol = *++lp;
276 		must = 1;
277 		do
278 		{
279 			if (cut->nosplit)
280 			{
281 				register const char*	s = bp;
282 				register int		w = len < ncol ? len : ncol;
283 				register int		z;
284 
285 				while (w > 0)
286 				{
287 					if (!(*s & 0x80))
288 						z = 1;
289 					else if ((z = mbtsize(s, w, &cut->q)) <= 0)
290 					{
291 						if (s == bp && xx)
292 						{
293 							w += s - xx;
294 							bp = (char*)(s = xx);
295 							xx = 0;
296 							continue;
297 						}
298 						xx = s;
299 						if (skip)
300 							s += w;
301 						w = 0;
302 						break;
303 					}
304 					s += z;
305 					w -= z;
306 				}
307 				c = s - bp;
308 				ncol = !w && ncol >= len;
309 			}
310 			else if (cut->cflag)
311 			{
312 				register const char*	s = bp;
313 				register int		w = len;
314 				register int		z;
315 
316 				while (w > 0 && ncol > 0)
317 				{
318 					ncol--;
319 					if (!(*s & 0x80) || (z = mbtsize(s, w, &cut->q)) <= 0)
320 						z = 1;
321 					s += z;
322 					w -= z;
323 
324 				}
325 				c = s - bp;
326 				ncol = !w && (ncol || !skip);
327 			}
328 			else
329 			{
330 				if ((c = ncol) > len)
331 					c = len;
332 				else if (c == len && !skip)
333 					ncol++;
334 				ncol -= c;
335 			}
336 			if (!skip && c)
337 			{
338 				if (sfwrite(fdout, (char*)bp, c) < 0)
339 					return;
340 				must = 0;
341 			}
342 			bp += c;
343 			if (ncol)
344 				break;
345 			len -= c;
346 			ncol = *++lp;
347 			skip = !skip;
348 		} while (ncol != HUGE);
349 		if (!cut->nlflag && (skip || must || cut->reclen))
350 		{
351 			if (cut->ldelim.len > 1)
352 				sfwrite(fdout, cut->ldelim.str, cut->ldelim.len);
353 			else
354 				sfputc(fdout, cut->ldelim.chr);
355 		}
356 	}
357 }
358 
359 /*
 * cut delimited fields from each line of <fdin> and put the results to <fdout> using the list in <cut>
361  * stream <fdin> must be line buffered
362  */
363 
364 static void
cutfields(Cut_t * cut,Sfio_t * fdin,Sfio_t * fdout)365 cutfields(Cut_t* cut, Sfio_t* fdin, Sfio_t* fdout)
366 {
367 	register unsigned char *sp = cut->space;
368 	register unsigned char *cp;
369 	register unsigned char *wp;
370 	register int c, nfields;
371 	register const int *lp = cut->list;
372 	register unsigned char *copy;
373 	register int nodelim, empty, inword=0;
374 	register unsigned char *ep;
375 	unsigned char *bp, *first;
376 	char *tp;
377 	int lastchar;
378 	wchar_t w;
379 	Sfio_t *fdtmp = 0;
380 	long offset = 0;
381 	unsigned char mb[8];
382 	/* process each buffer */
383 	while ((bp = (unsigned char*)sfreserve(fdin, SF_UNBOUND, -1)) && (c = sfvalue(fdin)) > 0)
384 	{
385 		cp = bp;
386 		ep = cp + --c;
387 		if((lastchar = cp[c]) != cut->eob)
388 			*ep = cut->eob;
389 		/* process each line in the buffer */
390 		while (cp <= ep)
391 		{
392 			first = cp;
393 			if (!inword)
394 			{
395 				nodelim = empty = 1;
396 				copy = cp;
397 				if (nfields = *(lp = cut->list))
398 					copy = 0;
399 				else
400 					nfields = *++lp;
401 			}
402 			else if (copy)
403 				copy = cp;
404 			inword = 0;
405 			do
406 			{
407 				/* skip over non-delimiter characters */
408 				if (cut->mb)
409 					for (;;)
410 					{
411 						switch (c = sp[*(unsigned char*)cp++])
412 						{
413 						case 0:
414 							continue;
415 						case SP_WIDE:
416 							wp = --cp;
417 							while ((c = mbchar(&w, cp, ep - cp, &cut->q)), mberrno(&cut->q) == E2BIG)
418 							{
419 								int	i;
420 								int	j;
421 								int	k;
422 
423 								/* mb char spanning buffer boundary -- fun stuff */
424 
425 								cp = wp;
426 								if (lastchar != cut->eob)
427 								{
428 									*ep = lastchar;
429 									if ((c = mbchar(&w, cp, ep - cp + 1, &cut->q)), mberrno(&cut->q) != E2BIG)
430 										break;
431 								}
432 								if (copy)
433 								{
434 									empty = 0;
435 									if ((c = cp - copy) > 0 && sfwrite(fdout, (char*)copy, c) < 0)
436 										goto failed;
437 								}
438 								for (i = 0; i <= (ep - cp); i++)
439 									mb[i] = cp[i];
440 								if (!(bp = (unsigned char*)sfreserve(fdin, SF_UNBOUND, -1)) || (c = sfvalue(fdin)) <= 0)
441 									goto failed;
442 								cp = bp;
443 								ep = cp + --c;
444 								if ((lastchar = cp[c]) != cut->eob)
445 									*ep = cut->eob;
446 								j = i;
447 								k = 0;
448 								while (j < mbmax())
449 									mb[j++] = cp[k++];
450 								tp = (char*)mb;
451 								w = mbchar(&w, tp, j, &cut->q);
452 								if (mberrno(&cut->q))
453 								{
454 									w = 0;
455 									c = i;
456 								}
457 								else
458 									c = tp - (char*)mb;
459 								first = bp = cp += c - i;
460 								if (copy)
461 								{
462 									copy = bp;
463 									if (w == cut->ldelim.chr)
464 										lastchar = cut->ldelim.chr;
465 									else if (w != cut->wdelim.chr)
466 									{
467 										empty = 0;
468 										if (sfwrite(fdout, (char*)mb, c) < 0)
469 											goto failed;
470 									}
471 								}
472 							}
473 							if (c == cut->wdelim.chr)
474 							{
475 								c = SP_WORD;
476 								break;
477 							}
478 							if (c == cut->ldelim.chr)
479 							{
480 								c = SP_LINE;
481 								break;
482 							}
483 							continue;
484 						default:
485 							wp = cp - 1;
486 							break;
487 						}
488 						break;
489 					}
490 				else
491 				{
492 					while (!(c = sp[*cp++]));
493 					wp = cp - 1;
494 				}
495 				/* check for end-of-line */
496 				if (c == SP_LINE)
497 				{
498 					if (cp <= ep)
499 						break;
500 					if (lastchar == cut->ldelim.chr)
501 						break;
502 					/* restore cut->last character */
503 					if (lastchar != cut->eob)
504 						*ep = lastchar;
505 					inword++;
506 					if (!sp[lastchar])
507 						break;
508 				}
509 				nodelim = 0;
510 				if (--nfields > 0)
511 					continue;
512 				nfields = *++lp;
513 				if (copy)
514 				{
515 					empty = 0;
516 					if ((c = wp - copy) > 0 && sfwrite(fdout, (char*)copy, c) < 0)
517 						goto failed;
518 					copy = 0;
519 				}
520 				else
521 					/* set to delimiter unless the first field */
522 					copy = empty ? cp : wp;
523 			} while (!inword);
524 			if (!inword)
525 			{
526 				if (!copy)
527 				{
528 					if (nodelim)
529 					{
530 						if (!cut->sflag)
531 						{
532 							if (offset)
533 							{
534 								sfseek(fdtmp,(Sfoff_t)0,SEEK_SET);
535 								sfmove(fdtmp,fdout,offset,-1);
536 							}
537 							copy = first;
538 						}
539 					}
540 					else
541 						sfputc(fdout,'\n');
542 				}
543 				if (offset)
544 					sfseek(fdtmp,offset=0,SEEK_SET);
545 			}
546 			if (copy && (c=cp-copy)>0 && (!nodelim || !cut->sflag) && sfwrite(fdout,(char*)copy,c)< 0)
547 				goto failed;
548 		}
549 		/* see whether to save in tmp file */
550 		if(inword && nodelim && !cut->sflag && (c=cp-first)>0)
551 		{
552 			/* copy line to tmpfile in case no fields */
553 			if(!fdtmp)
554 				fdtmp = sftmp(BLOCK);
555 			sfwrite(fdtmp,(char*)first,c);
556 			offset +=c;
557 		}
558 	}
559  failed:
560 	if(fdtmp)
561 		sfclose(fdtmp);
562 }
563 
564 int
b_cut(int argc,char ** argv,Shbltin_t * context)565 b_cut(int argc, char** argv, Shbltin_t* context)
566 {
567 	register char*		cp = 0;
568 	register Sfio_t*	fp;
569 	char*			s;
570 	int			n;
571 	Cut_t*			cut;
572 	int			mode = 0;
573 	Delim_t			wdelim;
574 	Delim_t			ldelim;
575 	size_t			reclen = 0;
576 	wchar_t			w;
577 	Mbstate_t		q;
578 
579 	cmdinit(argc, argv, context, ERROR_CATALOG, 0);
580 	wdelim.chr = '\t';
581 	ldelim.chr = '\n';
582 	wdelim.len = ldelim.len = 1;
583 	for (;;)
584 	{
585 		switch (optget(argv, usage))
586 		{
587 		case 0:
588 			break;
589 		case 'b':
590 		case 'c':
591 			if(mode&C_FIELDS)
592 			{
593 				error(2, "f option already specified");
594 				continue;
595 			}
596 			cp = opt_info.arg;
597 			if(opt_info.option[1]=='b')
598 				mode |= C_BYTES;
599 			else
600 				mode |= C_CHARS;
601 			continue;
602 		case 'D':
603 			ldelim.str = opt_info.arg;
604 			if (mbwide())
605 			{
606 				s = opt_info.arg;
607 				mbinit(&q);
608 				ldelim.chr = mbchar(&w, s, MB_LEN_MAX, &q);
609 				if ((n = s - opt_info.arg) > 1)
610 				{
611 					ldelim.len = n;
612 					continue;
613 				}
614 			}
615 			ldelim.chr = *(unsigned char*)opt_info.arg;
616 			ldelim.len = 1;
617 			continue;
618 		case 'd':
619 			wdelim.str = opt_info.arg;
620 			if (mbwide())
621 			{
622 				s = opt_info.arg;
623 				mbinit(&q);
624 				wdelim.chr = mbchar(&w, s, MB_LEN_MAX, &q);
625 				if ((n = s - opt_info.arg) > 1)
626 				{
627 					wdelim.len = n;
628 					continue;
629 				}
630 			}
631 			wdelim.chr = *(unsigned char*)opt_info.arg;
632 			wdelim.len = 1;
633 			continue;
634 		case 'f':
635 			if(mode&(C_CHARS|C_BYTES))
636 			{
637 				error(2, "c option already specified");
638 				continue;
639 			}
640 			cp = opt_info.arg;
641 			mode |= C_FIELDS;
642 			continue;
643 		case 'n':
644 			mode |= C_NOSPLIT;
645 			continue;
646 		case 'N':
647 			mode |= C_NONEWLINE;
648 			continue;
649 		case 'R':
650 		case 'r':
651 			if(opt_info.num>0)
652 				reclen = opt_info.num;
653 			continue;
654 		case 's':
655 			mode |= C_SUPRESS;
656 			continue;
657 		case ':':
658 			error(2, "%s", opt_info.arg);
659 			break;
660 		case '?':
661 			error(ERROR_usage(2), "%s", opt_info.arg);
662 			break;
663 		}
664 		break;
665 	}
666 	argv += opt_info.index;
667 	if (error_info.errors)
668 		error(ERROR_usage(2), "%s",optusage(NiL));
669 	if(!cp)
670 	{
671 		error(2, "b, c or f option must be specified");
672 		error(ERROR_usage(2), "%s", optusage(NiL));
673 	}
674 	if(!*cp)
675 		error(3, "non-empty b, c or f option must be specified");
676 	if((mode & (C_FIELDS|C_SUPRESS)) == C_SUPRESS)
677 		error(3, "s option requires f option");
678 	cut = cutinit(mode, cp, &wdelim, &ldelim, reclen);
679 	if(cp = *argv)
680 		argv++;
681 	do
682 	{
683 		if(!cp || streq(cp,"-"))
684 			fp = sfstdin;
685 		else if(!(fp = sfopen(NiL,cp,"r")))
686 		{
687 			error(ERROR_system(0),"%s: cannot open",cp);
688 			continue;
689 		}
690 		if(mode&C_FIELDS)
691 			cutfields(cut,fp,sfstdout);
692 		else
693 			cutcols(cut,fp,sfstdout);
694 		if(fp!=sfstdin)
695 			sfclose(fp);
696 	} while(cp = *argv++);
697 	if (sfsync(sfstdout))
698 		error(ERROR_system(0), "write error");
699 	return error_info.errors != 0;
700 }
701