xref: /openbsd/usr.bin/uniq/uniq.c (revision 73471bf0)
1 /*	$OpenBSD: uniq.c,v 1.29 2021/11/17 23:09:38 cheloha Exp $	*/
2 /*	$NetBSD: uniq.c,v 1.7 1995/08/31 22:03:48 jtc Exp $	*/
3 
4 /*
5  * Copyright (c) 1989, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * This code is derived from software contributed to Berkeley by
9  * Case Larsen.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 3. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 #include <ctype.h>
37 #include <err.h>
38 #include <errno.h>
39 #include <limits.h>
40 #include <locale.h>
41 #include <stdio.h>
42 #include <stdlib.h>
43 #include <string.h>
44 #include <unistd.h>
45 #include <wchar.h>
46 #include <wctype.h>
47 
/* Option flags, set to 1 by -c, -d, -i and -u respectively. */
int cflag, dflag, iflag, uflag;
/*
 * numchars and numfields hold the -s and -f skip counts.  repeats is the
 * number of times the previous line has been seen again since it was
 * first read; main() resets it to 0 whenever a different line arrives.
 */
int numchars, numfields, repeats;

/* Open a named file, or map "-" to stdin/stdout depending on the mode. */
FILE	*file(char *, char *);
/* Print a line if the -d/-u selection allows it, with a count under -c. */
void	 show(FILE *, char *);
/* Return a pointer past the fields and characters excluded from compare. */
char	*skip(char *);
/* Rewrite historic -N and +N arguments as -fN and -sN before getopt(). */
void	 obsolete(char *[]);
/* Print the usage message and exit with status 1. */
__dead void	usage(void);
56 
57 int
58 main(int argc, char *argv[])
59 {
60 	char *prevline, *t1, *t2, *thisline;
61 	FILE *ifp = NULL, *ofp = NULL;
62 	size_t prevsize, thissize, tmpsize;
63 	ssize_t len;
64 	int ch;
65 
66 	setlocale(LC_CTYPE, "");
67 
68 	if (pledge("stdio rpath wpath cpath", NULL) == -1)
69 		err(1, "pledge");
70 
71 	obsolete(argv);
72 	while ((ch = getopt(argc, argv, "cdf:is:u")) != -1) {
73 		const char *errstr;
74 
75 		switch (ch) {
76 		case 'c':
77 			cflag = 1;
78 			break;
79 		case 'd':
80 			dflag = 1;
81 			break;
82 		case 'f':
83 			numfields = (int)strtonum(optarg, 0, INT_MAX,
84 			    &errstr);
85 			if (errstr)
86 				errx(1, "field skip value is %s: %s",
87 				    errstr, optarg);
88 			break;
89 		case 'i':
90 			iflag = 1;
91 			break;
92 		case 's':
93 			numchars = (int)strtonum(optarg, 0, INT_MAX,
94 			    &errstr);
95 			if (errstr)
96 				errx(1,
97 				    "character skip value is %s: %s",
98 				    errstr, optarg);
99 			break;
100 		case 'u':
101 			uflag = 1;
102 			break;
103 		default:
104 			usage();
105 		}
106 	}
107 
108 	argc -= optind;
109 	argv += optind;
110 
111 	/* If neither -d nor -u are set, default is -d -u. */
112 	if (!dflag && !uflag)
113 		dflag = uflag = 1;
114 
115 	switch (argc) {
116 	case 0:
117 		ifp = stdin;
118 		ofp = stdout;
119 		break;
120 	case 1:
121 		ifp = file(argv[0], "r");
122 		ofp = stdout;
123 		break;
124 	case 2:
125 		ifp = file(argv[0], "r");
126 		ofp = file(argv[1], "w");
127 		break;
128 	default:
129 		usage();
130 	}
131 
132 	if (pledge("stdio", NULL) == -1)
133 		err(1, "pledge");
134 
135 	prevsize = 0;
136 	prevline = NULL;
137 	if ((len = getline(&prevline, &prevsize, ifp)) == -1) {
138 		free(prevline);
139 		if (ferror(ifp))
140 			err(1, "getline");
141 		exit(0);
142 	}
143 	if (prevline[len - 1] == '\n')
144 		prevline[len - 1] = '\0';
145 
146 	thissize = 0;
147 	thisline = NULL;
148 	while ((len = getline(&thisline, &thissize, ifp)) != -1) {
149 		if (thisline[len - 1] == '\n')
150 			thisline[len - 1] = '\0';
151 
152 		/* If requested get the chosen fields + character offsets. */
153 		if (numfields || numchars) {
154 			t1 = skip(thisline);
155 			t2 = skip(prevline);
156 		} else {
157 			t1 = thisline;
158 			t2 = prevline;
159 		}
160 
161 		/* If different, print; set previous to new value. */
162 		if ((iflag ? strcasecmp : strcmp)(t1, t2)) {
163 			show(ofp, prevline);
164 			t1 = prevline;
165 			prevline = thisline;
166 			thisline = t1;
167 			tmpsize = prevsize;
168 			prevsize = thissize;
169 			thissize = tmpsize;
170 			repeats = 0;
171 		} else
172 			++repeats;
173 	}
174 	free(thisline);
175 	if (ferror(ifp))
176 		err(1, "getline");
177 
178 	show(ofp, prevline);
179 	free(prevline);
180 
181 	exit(0);
182 }
183 
184 /*
185  * show --
186  *	Output a line depending on the flags and number of repetitions
187  *	of the line.
188  */
189 void
190 show(FILE *ofp, char *str)
191 {
192 	if ((dflag && repeats) || (uflag && !repeats)) {
193 		if (cflag)
194 			fprintf(ofp, "%4d %s\n", repeats + 1, str);
195 		else
196 			fprintf(ofp, "%s\n", str);
197 	}
198 }
199 
200 char *
201 skip(char *str)
202 {
203 	wchar_t wc;
204 	int nchars, nfields;
205 	int len;
206 	int field_started;
207 
208 	for (nfields = numfields; nfields && *str; nfields--) {
209 		/* Skip one field, including preceding blanks. */
210 		for (field_started = 0; *str != '\0'; str += len) {
211 			if ((len = mbtowc(&wc, str, MB_CUR_MAX)) == -1) {
212 				(void)mbtowc(NULL, NULL, MB_CUR_MAX);
213 				wc = L'?';
214 				len = 1;
215 			}
216 			if (iswblank(wc)) {
217 				if (field_started)
218 					break;
219 			} else
220 				field_started = 1;
221 		}
222 	}
223 
224 	/* Skip some additional characters. */
225 	for (nchars = numchars; nchars-- && *str != '\0'; str += len)
226 		if ((len = mblen(str, MB_CUR_MAX)) == -1)
227 			len = 1;
228 
229 	return (str);
230 }
231 
/*
 * file --
 *	Return a stream for name opened with the given fopen() mode.  The
 *	name "-" selects stdin when the mode starts with 'r' and stdout
 *	otherwise.  Does not return if the file cannot be opened: err()
 *	prints a diagnostic and exits with status 1.
 */
FILE *
file(char *name, char *mode)
{
	FILE *stream;

	if (strcmp(name, "-") == 0) {
		if (*mode == 'r')
			return (stdin);
		return (stdout);
	}

	stream = fopen(name, mode);
	if (stream == NULL)
		err(1, "%s", name);
	return (stream);
}
243 
/*
 * obsolete --
 *	Rewrite historic option syntax in place so getopt() can parse it:
 *	"-N" becomes "-fN" and "+N" becomes "-sN", where N starts with a
 *	digit.  argv[0] is skipped.  Scanning stops at "--" and at the
 *	first argument that is an operand rather than an option, so that
 *	file names which merely look like "-N" are left untouched.
 *	Replacement strings are allocated with malloc() and never freed.
 */
void
obsolete(char *argv[])
{
	size_t len;
	char *ap, *start;

	while ((ap = *++argv)) {
		/* Return if "--" or not an option of any form. */
		if (ap[0] != '-' && ap[0] != '+')
			return;
		if (ap[0] == '-' && ap[1] == '-')
			return;
		if (!isdigit((unsigned char)ap[1])) {
			/*
			 * A lone "-" and anything starting with '+' that
			 * is not "+N" are operands.  getopt() stops there,
			 * so everything after them must be left alone.
			 */
			if (ap[0] == '+' || ap[1] == '\0')
				return;
			/* A regular new-style option; keep scanning. */
			continue;
		}
		/*
		 * Digit signifies an old-style option.  The replacement is
		 * a dash, the option letter, the original text after its
		 * sign character and a NUL: strlen(ap) + 2 bytes.
		 */
		len = strlen(ap) + 2;
		if ((start = malloc(len)) == NULL)
			err(1, "malloc");
		(void)snprintf(start, len, "-%c%s",
		    ap[0] == '+' ? 's' : 'f', ap + 1);
		*argv = start;
	}
}
272 
273 __dead void
274 usage(void)
275 {
276 	extern char *__progname;
277 
278 	(void)fprintf(stderr,
279 	    "usage: %s [-ci] [-d | -u] [-f fields] [-s chars] [input_file [output_file]]\n",
280 	    __progname);
281 	exit(1);
282 }
283