/* xref: /openbsd/usr.bin/uniq/uniq.c (revision 17a80e6e) */
/*	$OpenBSD: uniq.c,v 1.33 2022/01/01 18:20:52 cheloha Exp $	*/
/*	$NetBSD: uniq.c,v 1.7 1995/08/31 22:03:48 jtc Exp $	*/

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Case Larsen.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

36 #include <ctype.h>
37 #include <err.h>
38 #include <limits.h>
39 #include <locale.h>
40 #include <stdio.h>
41 #include <stdlib.h>
42 #include <string.h>
43 #include <strings.h>
44 #include <unistd.h>
45 #include <wchar.h>
46 #include <wctype.h>
47 
48 long long numchars, numfields;
49 unsigned long long repeats;
50 int cflag, dflag, iflag, uflag;
51 
52 void	 show(const char *);
53 char	*skip(char *);
54 void	 obsolete(char *[]);
55 __dead void	usage(void);
56 
57 int
main(int argc,char * argv[])58 main(int argc, char *argv[])
59 {
60 	const char *errstr;
61 	char *p, *prevline, *t, *thisline, *tmp;
62 	size_t prevsize, thissize, tmpsize;
63 	ssize_t len;
64 	int ch;
65 
66 	setlocale(LC_CTYPE, "");
67 
68 	if (pledge("stdio rpath wpath cpath", NULL) == -1)
69 		err(1, "pledge");
70 
71 	obsolete(argv);
72 	while ((ch = getopt(argc, argv, "cdf:is:u")) != -1) {
73 		switch (ch) {
74 		case 'c':
75 			cflag = 1;
76 			break;
77 		case 'd':
78 			dflag = 1;
79 			break;
80 		case 'f':
81 			numfields = strtonum(optarg, 0, LLONG_MAX, &errstr);
82 			if (errstr)
83 				errx(1, "fields is %s: %s", errstr, optarg);
84 			break;
85 		case 'i':
86 			iflag = 1;
87 			break;
88 		case 's':
89 			numchars = strtonum(optarg, 0, LLONG_MAX, &errstr);
90 			if (errstr)
91 				errx(1, "chars is %s: %s", errstr, optarg);
92 			break;
93 		case 'u':
94 			uflag = 1;
95 			break;
96 		default:
97 			usage();
98 		}
99 	}
100 	argc -= optind;
101 	argv += optind;
102 
103 	/* If neither -d nor -u are set, default is -d -u. */
104 	if (!dflag && !uflag)
105 		dflag = uflag = 1;
106 
107 	if (argc > 2)
108 		usage();
109 	if (argc >= 1 && strcmp(argv[0], "-") != 0) {
110 		if (freopen(argv[0], "r", stdin) == NULL)
111 			err(1, "%s", argv[0]);
112 	}
113 	if (argc == 2 && strcmp(argv[1], "-") != 0) {
114 		if (freopen(argv[1], "w", stdout) == NULL)
115 			err(1, "%s", argv[1]);
116 	}
117 
118 	if (pledge("stdio", NULL) == -1)
119 		err(1, "pledge");
120 
121 	prevsize = 0;
122 	prevline = NULL;
123 	if ((len = getline(&prevline, &prevsize, stdin)) == -1) {
124 		free(prevline);
125 		if (ferror(stdin))
126 			err(1, "getline");
127 		return 0;
128 	}
129 	if (prevline[len - 1] == '\n')
130 		prevline[len - 1] = '\0';
131 	if (numfields || numchars)
132 		p = skip(prevline);
133 	else
134 		p = prevline;
135 
136 	thissize = 0;
137 	thisline = NULL;
138 	while ((len = getline(&thisline, &thissize, stdin)) != -1) {
139 		if (thisline[len - 1] == '\n')
140 			thisline[len - 1] = '\0';
141 
142 		/* If requested get the chosen fields + character offsets. */
143 		if (numfields || numchars)
144 			t = skip(thisline);
145 		else
146 			t = thisline;
147 
148 		/* If different, print; set previous to new value. */
149 		if ((iflag ? strcasecmp : strcmp)(p, t)) {
150 			show(prevline);
151 			tmp = prevline;
152 			prevline = thisline;
153 			thisline = tmp;
154 			tmp = p;
155 			p = t;
156 			t = tmp;
157 			tmpsize = prevsize;
158 			prevsize = thissize;
159 			thissize = tmpsize;
160 			repeats = 0;
161 		} else
162 			++repeats;
163 	}
164 	free(thisline);
165 	if (ferror(stdin))
166 		err(1, "getline");
167 
168 	show(prevline);
169 	free(prevline);
170 
171 	return 0;
172 }
173 
174 /*
175  * show --
176  *	Output a line depending on the flags and number of repetitions
177  *	of the line.
178  */
179 void
show(const char * str)180 show(const char *str)
181 {
182 	if ((dflag && repeats) || (uflag && !repeats)) {
183 		if (cflag)
184 			printf("%4llu %s\n", repeats + 1, str);
185 		else
186 			printf("%s\n", str);
187 	}
188 }
189 
190 char *
skip(char * str)191 skip(char *str)
192 {
193 	long long nchars, nfields;
194 	wchar_t wc;
195 	int len;
196 	int field_started;
197 
198 	for (nfields = numfields; nfields && *str; nfields--) {
199 		/* Skip one field, including preceding blanks. */
200 		for (field_started = 0; *str != '\0'; str += len) {
201 			if ((len = mbtowc(&wc, str, MB_CUR_MAX)) == -1) {
202 				(void)mbtowc(NULL, NULL, MB_CUR_MAX);
203 				wc = L'?';
204 				len = 1;
205 			}
206 			if (iswblank(wc)) {
207 				if (field_started)
208 					break;
209 			} else
210 				field_started = 1;
211 		}
212 	}
213 
214 	/* Skip some additional characters. */
215 	for (nchars = numchars; nchars-- && *str != '\0'; str += len)
216 		if ((len = mblen(str, MB_CUR_MAX)) == -1)
217 			len = 1;
218 
219 	return (str);
220 }
221 
/*
 * obsolete --
 *	Rewrite old-style "-N" and "+N" arguments in argv, in place, as
 *	"-fN" and "-sN".  argv[0] is skipped.  Scanning stops at the end of
 *	the vector, at an argument starting with "--", or at the first
 *	argument that begins with neither '-' nor '+'.  Each replacement
 *	string is allocated with malloc(3) and is not freed here; on
 *	allocation failure the program exits through err(3).
 */
void
obsolete(char *argv[])
{
	size_t len;
	char *ap, *p;

	while ((ap = *++argv)) {
		/* Return if "--" or not an option of any form. */
		if (ap[0] != '-') {
			if (ap[0] != '+')
				return;
		} else if (ap[1] == '-')
			return;
		if (!isdigit((unsigned char)ap[1]))
			continue;
		/*
		 * Digit signifies an old-style option.  Allocate space for
		 * the dash, the new option letter, the rest of the argument
		 * and the terminating NUL: strlen(ap) + 2 bytes in total.
		 */
		len = strlen(ap);
		if ((p = malloc(len + 2)) == NULL)
			err(1, "malloc");
		p[0] = '-';
		p[1] = ap[0] == '+' ? 's' : 'f';
		/* ap + 1 holds len - 1 characters plus the NUL: len bytes. */
		memcpy(p + 2, ap + 1, len);
		*argv = p;
	}
}

251 __dead void
usage(void)252 usage(void)
253 {
254 	fprintf(stderr,
255 	    "usage: %s [-ci] [-d | -u] [-f fields] [-s chars] [input_file [output_file]]\n",
256 	    getprogname());
257 	exit(1);
258 }
259