1 /*
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1989, 1993
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Adam S. Moskowitz of Menlo Consulting and Marciano Pitargue.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34
35 #include <ctype.h>
36 #include <err.h>
37 #include <errno.h>
38 #include <limits.h>
39 #include <locale.h>
40 #include <stdio.h>
41 #include <stdlib.h>
42 #include <string.h>
43 #include <unistd.h>
44 #include <wchar.h>
45
/*
 * Option flags.  Each one is set to 1 by main() when the corresponding
 * command-line option (-b, -c, -d, -f, -n, -s, -w) is seen.
 */
static int	bflag, cflag, dflag, fflag, nflag, sflag, wflag;

/* Field delimiter as a wide character and as its multibyte encoding. */
static wchar_t	dchar;
static char	dcharmb[MB_LEN_MAX + 1];

/*
 * Selection state built by get_list():
 *   autostart - largest N seen in a leading "-N" range (0 if none)
 *   autostop  - smallest N seen in an open-ended "N-" range (0 if none)
 *   maxval    - highest position that has to be examined individually
 *   positions - 1-based array of flags, non-zero when selected
 */
static size_t	autostart, autostop, maxval;
static char	*positions;

static int	b_cut(FILE *fp, const char *fname);
static int	b_n_cut(FILE *fp, const char *fname);
static int	c_cut(FILE *fp, const char *fname);
static int	f_cut(FILE *fp, const char *fname);
static void	get_list(char *list);
static int	is_delim(wchar_t ch);
static void	needpos(size_t n);
static void	usage(void);
67
68 int
main(int argc,char * argv[])69 main(int argc, char *argv[])
70 {
71 FILE *fp;
72 int (*fcn)(FILE *, const char *);
73 int ch, rval;
74 size_t n;
75
76 setlocale(LC_ALL, "");
77
78 fcn = NULL;
79 dchar = '\t'; /* default delimiter is \t */
80 strcpy(dcharmb, "\t");
81
82 while ((ch = getopt(argc, argv, "b:c:d:f:snw")) != -1)
83 switch(ch) {
84 case 'b':
85 get_list(optarg);
86 bflag = 1;
87 break;
88 case 'c':
89 get_list(optarg);
90 cflag = 1;
91 break;
92 case 'd':
93 n = mbrtowc(&dchar, optarg, MB_LEN_MAX, NULL);
94 if (dchar == '\0' || n != strlen(optarg))
95 errx(1, "bad delimiter");
96 strcpy(dcharmb, optarg);
97 dflag = 1;
98 break;
99 case 'f':
100 get_list(optarg);
101 fflag = 1;
102 break;
103 case 's':
104 sflag = 1;
105 break;
106 case 'n':
107 nflag = 1;
108 break;
109 case 'w':
110 wflag = 1;
111 break;
112 case '?':
113 default:
114 usage();
115 }
116 argc -= optind;
117 argv += optind;
118
119 if (fflag) {
120 if (bflag || cflag || nflag || (wflag && dflag))
121 usage();
122 } else if (!(bflag || cflag) || dflag || sflag || wflag)
123 usage();
124 else if (!bflag && nflag)
125 usage();
126
127 if (fflag)
128 fcn = f_cut;
129 else if (cflag)
130 fcn = MB_CUR_MAX > 1 ? c_cut : b_cut;
131 else if (bflag)
132 fcn = nflag && MB_CUR_MAX > 1 ? b_n_cut : b_cut;
133
134 rval = 0;
135 if (*argv)
136 for (; *argv; ++argv) {
137 if (strcmp(*argv, "-") == 0)
138 rval |= fcn(stdin, "stdin");
139 else {
140 if (!(fp = fopen(*argv, "r"))) {
141 warn("%s", *argv);
142 rval = 1;
143 continue;
144 }
145 fcn(fp, *argv);
146 (void)fclose(fp);
147 }
148 }
149 else
150 rval = fcn(stdin, "stdin");
151 exit(rval);
152 }
153
154 static void
get_list(char * list)155 get_list(char *list)
156 {
157 size_t setautostart, start, stop;
158 char *pos;
159 char *p;
160
161 /*
162 * set a byte in the positions array to indicate if a field or
163 * column is to be selected; use +1, it's 1-based, not 0-based.
164 * Numbers and number ranges may be overlapping, repeated, and in
165 * any order. We handle "-3-5" although there's no real reason to.
166 */
167 for (; (p = strsep(&list, ", \t")) != NULL;) {
168 setautostart = start = stop = 0;
169 if (*p == '-') {
170 ++p;
171 setautostart = 1;
172 }
173 if (isdigit((unsigned char)*p)) {
174 start = stop = strtol(p, &p, 10);
175 if (setautostart && start > autostart)
176 autostart = start;
177 }
178 if (*p == '-') {
179 if (isdigit((unsigned char)p[1]))
180 stop = strtol(p + 1, &p, 10);
181 if (*p == '-') {
182 ++p;
183 if (!autostop || autostop > stop)
184 autostop = stop;
185 }
186 }
187 if (*p)
188 errx(1, "[-bcf] list: illegal list value");
189 if (!stop || !start)
190 errx(1, "[-bcf] list: values may not include zero");
191 if (maxval < stop) {
192 maxval = stop;
193 needpos(maxval + 1);
194 }
195 for (pos = positions + start; start++ <= stop; *pos++ = 1);
196 }
197
198 /* overlapping ranges */
199 if (autostop && maxval > autostop) {
200 maxval = autostop;
201 needpos(maxval + 1);
202 }
203
204 /* reversed range with autostart */
205 if (maxval < autostart) {
206 maxval = autostart;
207 needpos(maxval + 1);
208 }
209
210 /* set autostart */
211 if (autostart)
212 memset(positions + 1, '1', autostart);
213 }
214
215 static void
needpos(size_t n)216 needpos(size_t n)
217 {
218 static size_t npos;
219 size_t oldnpos;
220
221 /* Grow the positions array to at least the specified size. */
222 if (n > npos) {
223 oldnpos = npos;
224 if (npos == 0)
225 npos = n;
226 while (n > npos)
227 npos *= 2;
228 if ((positions = realloc(positions, npos)) == NULL)
229 err(1, "realloc");
230 memset((char *)positions + oldnpos, 0, npos - oldnpos);
231 }
232 }
233
234 static int
b_cut(FILE * fp,const char * fname __unused)235 b_cut(FILE *fp, const char *fname __unused)
236 {
237 int ch, col;
238 char *pos;
239
240 ch = 0;
241 for (;;) {
242 pos = positions + 1;
243 for (col = maxval; col; --col) {
244 if ((ch = getc(fp)) == EOF)
245 return (0);
246 if (ch == '\n')
247 break;
248 if (*pos++)
249 (void)putchar(ch);
250 }
251 if (ch != '\n') {
252 if (autostop)
253 while ((ch = getc(fp)) != EOF && ch != '\n')
254 (void)putchar(ch);
255 else
256 while ((ch = getc(fp)) != EOF && ch != '\n');
257 }
258 (void)putchar('\n');
259 }
260 return (0);
261 }
262
263 /*
264 * Cut based on byte positions, taking care not to split multibyte characters.
265 * Although this function also handles the case where -n is not specified,
266 * b_cut() ought to be much faster.
267 */
268 static int
b_n_cut(FILE * fp,const char * fname)269 b_n_cut(FILE *fp, const char *fname)
270 {
271 size_t col, i, bufsize = 0;
272 ssize_t lbuflen;
273 char *lbuf = NULL;
274 int canwrite, clen, warned;
275 mbstate_t mbs;
276
277 memset(&mbs, 0, sizeof(mbs));
278 warned = 0;
279 while ((lbuflen = getline(&lbuf, &bufsize, fp)) >= 0) {
280 for (col = 0; lbuflen > 0; col += clen) {
281 if ((clen = mbrlen(lbuf, lbuflen, &mbs)) < 0) {
282 if (!warned) {
283 warn("%s", fname);
284 warned = 1;
285 }
286 memset(&mbs, 0, sizeof(mbs));
287 clen = 1;
288 }
289 if (clen == 0 || *lbuf == '\n')
290 break;
291 if (col < maxval && !positions[1 + col]) {
292 /*
293 * Print the character if (1) after an initial
294 * segment of un-selected bytes, the rest of
295 * it is selected, and (2) the last byte is
296 * selected.
297 */
298 i = col;
299 while (i < col + clen && i < maxval &&
300 !positions[1 + i])
301 i++;
302 canwrite = i < col + clen;
303 for (; i < col + clen && i < maxval; i++)
304 canwrite &= positions[1 + i];
305 if (canwrite)
306 fwrite(lbuf, 1, clen, stdout);
307 } else {
308 /*
309 * Print the character if all of it has
310 * been selected.
311 */
312 canwrite = 1;
313 for (i = col; i < col + clen; i++)
314 if ((i >= maxval && !autostop) ||
315 (i < maxval && !positions[1 + i])) {
316 canwrite = 0;
317 break;
318 }
319 if (canwrite)
320 fwrite(lbuf, 1, clen, stdout);
321 }
322 lbuf += clen;
323 lbuflen -= clen;
324 }
325 if (lbuflen > 0)
326 putchar('\n');
327 }
328 free(lbuf);
329 return (warned);
330 }
331
332 static int
c_cut(FILE * fp,const char * fname)333 c_cut(FILE *fp, const char *fname)
334 {
335 wint_t ch;
336 int col;
337 char *pos;
338
339 ch = 0;
340 for (;;) {
341 pos = positions + 1;
342 for (col = maxval; col; --col) {
343 if ((ch = getwc(fp)) == WEOF)
344 goto out;
345 if (ch == '\n')
346 break;
347 if (*pos++)
348 (void)putwchar(ch);
349 }
350 if (ch != '\n') {
351 if (autostop)
352 while ((ch = getwc(fp)) != WEOF && ch != '\n')
353 (void)putwchar(ch);
354 else
355 while ((ch = getwc(fp)) != WEOF && ch != '\n');
356 }
357 (void)putwchar('\n');
358 }
359 out:
360 if (ferror(fp)) {
361 warn("%s", fname);
362 return (1);
363 }
364 return (0);
365 }
366
367 static int
is_delim(wchar_t ch)368 is_delim(wchar_t ch)
369 {
370 if (wflag) {
371 if (ch == ' ' || ch == '\t')
372 return 1;
373 } else {
374 if (ch == dchar)
375 return 1;
376 }
377 return 0;
378 }
379
380 static int
f_cut(FILE * fp,const char * fname)381 f_cut(FILE *fp, const char *fname)
382 {
383 wchar_t ch;
384 int field, i, isdelim;
385 char *pos, *p;
386 int output;
387 char *lbuf = NULL;
388 size_t clen, bufsize = 0, reallen;
389 ssize_t lbuflen;
390
391 while ((lbuflen = getline(&lbuf, &bufsize, fp)) >= 0) {
392 reallen = lbuflen;
393 /* Assert EOL has a newline. */
394 if (lbuflen > 0 && *(lbuf + lbuflen - 1) != '\n') {
395 /* Can't have > 1 line with no trailing newline. */
396 if ((ssize_t)bufsize < (lbuflen + 1)) {
397 bufsize = lbuflen + 1;
398 lbuf = realloc(lbuf, bufsize);
399 }
400 if (lbuf == NULL)
401 err(1, "realloc");
402 lbuf[lbuflen] = '\n';
403 reallen++;
404 }
405 output = 0;
406 for (isdelim = 0, p = lbuf;; p += clen) {
407 clen = mbrtowc(&ch, p, lbuf + reallen - p, NULL);
408 if (clen == (size_t)-1 || clen == (size_t)-2) {
409 warnc(EILSEQ, "%s", fname);
410 free(lbuf);
411 return (1);
412 }
413 if (clen == 0)
414 clen = 1;
415 /* this should work if newline is delimiter */
416 if (is_delim(ch))
417 isdelim = 1;
418 if (ch == '\n') {
419 if (!isdelim && !sflag)
420 (void)fwrite(lbuf, lbuflen, 1, stdout);
421 break;
422 }
423 }
424 if (!isdelim)
425 continue;
426
427 pos = positions + 1;
428 for (field = maxval, p = lbuf; field; --field, ++pos) {
429 if (*pos && output++)
430 for (i = 0; dcharmb[i] != '\0'; i++)
431 putchar(dcharmb[i]);
432 for (;;) {
433 clen = mbrtowc(&ch, p, lbuf + reallen - p,
434 NULL);
435 if (clen == (size_t)-1 || clen == (size_t)-2) {
436 warnc(EILSEQ, "%s", fname);
437 free(lbuf);
438 return (1);
439 }
440 if (clen == 0)
441 clen = 1;
442 p += clen;
443 if (ch == '\n' || is_delim(ch)) {
444 /* compress whitespace */
445 if (wflag && ch != '\n')
446 while (is_delim(*p))
447 p++;
448 break;
449 }
450 if (*pos)
451 for (i = 0; i < (int)clen; i++)
452 putchar(p[i - clen]);
453 }
454 if (ch == '\n')
455 break;
456 }
457 if (ch != '\n') {
458 if (autostop) {
459 if (output)
460 for (i = 0; dcharmb[i] != '\0'; i++)
461 putchar(dcharmb[i]);
462 for (; (ch = *p) != '\n'; ++p)
463 (void)putchar(ch);
464 } else
465 for (; (ch = *p) != '\n'; ++p);
466 }
467 (void)putchar('\n');
468 }
469 free(lbuf);
470 return (0);
471 }
472
/*
 * Print the command synopsis to standard error and exit with status 1.
 */
static void
usage(void)
{
	const char *by_bytes = "usage: cut -b list [-n] [file ...]";
	const char *by_chars = " cut -c list [file ...]";
	const char *by_fields = " cut -f list [-s] [-w | -d delim] [file ...]";

	(void)fprintf(stderr, "%s\n%s\n%s\n", by_bytes, by_chars, by_fields);
	exit(1);
}
482