xref: /openbsd/usr.bin/wc/wc.c (revision e291b8af)
1 /*	$OpenBSD: wc.c,v 1.32 2024/09/11 03:57:14 guenther Exp $	*/
2 
3 /*
4  * Copyright (c) 1980, 1987, 1991, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. Neither the name of the University nor the names of its contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 #include <sys/stat.h>
33 
34 #include <fcntl.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <locale.h>
38 #include <ctype.h>
39 #include <err.h>
40 #include <unistd.h>
41 #include <util.h>
42 #include <wchar.h>
43 #include <wctype.h>
44 
/* Size in bytes of the read buffer used when counting in byte mode. */
#define	_MAXBSIZE (64 * 1024)

/* Running totals over every input processed; printed on the "total" line. */
int64_t	tlinect;
int64_t	twordct;
int64_t	tcharct;

/* Option flags set while parsing the command line in main(). */
int	doline;		/* -l: report line counts */
int	doword;		/* -w: report word counts */
int	dochar;		/* -c or -m: report byte/character counts */
int	humanchar;	/* -h: print counts through fmt_scaled() */
int	multibyte;	/* -m given and the locale has MB_CUR_MAX > 1 */

/* Process exit status; set to 1 whenever an input reports an error. */
int	rval;

extern char *__progname;

static void print_counts(int64_t lines, int64_t words, int64_t chars,
    const char *name);
static void format_and_print(int64_t v);
static void cnt(const char *path);
55 
56 int
main(int argc,char * argv[])57 main(int argc, char *argv[])
58 {
59 	int ch;
60 
61 	setlocale(LC_CTYPE, "");
62 
63 	if (pledge("stdio rpath", NULL) == -1)
64 		err(1, "pledge");
65 
66 	while ((ch = getopt(argc, argv, "lwchm")) != -1)
67 		switch(ch) {
68 		case 'l':
69 			doline = 1;
70 			break;
71 		case 'w':
72 			doword = 1;
73 			break;
74 		case 'm':
75 			if (MB_CUR_MAX > 1)
76 				multibyte = 1;
77 			/* FALLTHROUGH */
78 		case 'c':
79 			dochar = 1;
80 			break;
81 		case 'h':
82 			humanchar = 1;
83 			break;
84 		default:
85 			fprintf(stderr,
86 			    "usage: %s [-c | -m] [-hlw] [file ...]\n",
87 			    __progname);
88 			return 1;
89 		}
90 	argv += optind;
91 	argc -= optind;
92 
93 	/*
94 	 * wc is unusual in that its flags are on by default, so,
95 	 * if you don't get any arguments, you have to turn them
96 	 * all on.
97 	 */
98 	if (!doline && !doword && !dochar)
99 		doline = doword = dochar = 1;
100 
101 	if (!*argv) {
102 		cnt(NULL);
103 	} else {
104 		int dototal = (argc > 1);
105 
106 		do {
107 			cnt(*argv);
108 		} while(*++argv);
109 
110 		if (dototal)
111 			print_counts(tlinect, twordct, tcharct, "total");
112 	}
113 
114 	return rval;
115 }
116 
117 static void
cnt(const char * path)118 cnt(const char *path)
119 {
120 	static char *buf;
121 	static size_t bufsz;
122 
123 	FILE *stream;
124 	const char *file;
125 	char *C;
126 	wchar_t wc;
127 	short gotsp;
128 	ssize_t len;
129 	int64_t linect, wordct, charct;
130 	struct stat sbuf;
131 	int fd;
132 
133 	linect = wordct = charct = 0;
134 	stream = NULL;
135 	if (path != NULL) {
136 		file = path;
137 		if ((fd = open(file, O_RDONLY)) == -1) {
138 			warn("%s", file);
139 			rval = 1;
140 			return;
141 		}
142 	} else  {
143 		file = "(stdin)";
144 		fd = STDIN_FILENO;
145 	}
146 
147 	if (!multibyte) {
148 		if (bufsz < _MAXBSIZE &&
149 		    (buf = realloc(buf, _MAXBSIZE)) == NULL)
150 			err(1, NULL);
151 
152 		/*
153 		 * According to POSIX, a word is a "maximal string of
154 		 * characters delimited by whitespace."  Nothing is said
155 		 * about a character being printing or non-printing.
156 		 */
157 		if (doword) {
158 			gotsp = 1;
159 			while ((len = read(fd, buf, _MAXBSIZE)) > 0) {
160 				charct += len;
161 				for (C = buf; len--; ++C) {
162 					if (isspace((unsigned char)*C)) {
163 						gotsp = 1;
164 						if (*C == '\n')
165 							++linect;
166 					} else if (gotsp) {
167 						gotsp = 0;
168 						++wordct;
169 					}
170 				}
171 			}
172 			if (len == -1) {
173 				warn("%s", file);
174 				rval = 1;
175 			}
176 		}
177 		/*
178 		 * Line counting is split out because it's a lot
179 		 * faster to get lines than to get words, since
180 		 * the word count requires some logic.
181 		 */
182 		else if (doline) {
183 			while ((len = read(fd, buf, _MAXBSIZE)) > 0) {
184 				charct += len;
185 				for (C = buf; len--; ++C)
186 					if (*C == '\n')
187 						++linect;
188 			}
189 			if (len == -1) {
190 				warn("%s", file);
191 				rval = 1;
192 			}
193 		}
194 		/*
195 		 * If all we need is the number of characters and
196 		 * it's a directory or a regular file, just stat
197 		 * our handle.  We avoid testing for it not being
198 		 * a special device in case someone adds a new type
199 		 * of inode.
200 		 */
201 		else if (dochar) {
202 			if (fstat(fd, &sbuf)) {
203 				warn("%s", file);
204 				rval = 1;
205 			} else {
206 				if (S_ISREG(sbuf.st_mode) || S_ISDIR(sbuf.st_mode))
207 					charct = sbuf.st_size;
208 				else {
209 					while ((len = read(fd, buf, _MAXBSIZE)) > 0)
210 						charct += len;
211 					if (len == -1) {
212 						warn("%s", file);
213 						rval = 1;
214 					}
215 				}
216 			}
217 		}
218 	} else {
219 		if (path == NULL)
220 			stream = stdin;
221 		else if ((stream = fdopen(fd, "r")) == NULL) {
222 			warn("%s", file);
223 			close(fd);
224 			rval = 1;
225 			return;
226 		}
227 
228 		gotsp = 1;
229 		while ((len = getline(&buf, &bufsz, stream)) > 0) {
230 			const char *end = buf + len;
231 			for (C = buf; C < end; C += len) {
232 				++charct;
233 				len = mbtowc(&wc, C, MB_CUR_MAX);
234 				if (len == -1) {
235 					mbtowc(NULL, NULL,
236 					    MB_CUR_MAX);
237 					len = 1;
238 					wc = L'?';
239 				} else if (len == 0)
240 					len = 1;
241 				if (iswspace(wc)) {
242 					gotsp = 1;
243 					if (wc == L'\n')
244 						++linect;
245 				} else if (gotsp) {
246 					gotsp = 0;
247 					++wordct;
248 				}
249 			}
250 		}
251 		if (ferror(stream)) {
252 			warn("%s", file);
253 			rval = 1;
254 		}
255 	}
256 
257 	print_counts(linect, wordct, charct, path);
258 
259 	/*
260 	 * Don't bother checking doline, doword, or dochar -- speeds
261 	 * up the common case
262 	 */
263 	tlinect += linect;
264 	twordct += wordct;
265 	tcharct += charct;
266 
267 	if ((stream == NULL ? close(fd) : fclose(stream)) != 0) {
268 		warn("%s", file);
269 		rval = 1;
270 	}
271 }
272 
273 static void
format_and_print(int64_t v)274 format_and_print(int64_t v)
275 {
276 	if (humanchar) {
277 		char result[FMT_SCALED_STRSIZE];
278 
279 		fmt_scaled((long long)v, result);
280 		printf("%7s", result);
281 	} else {
282 		printf(" %7lld", v);
283 	}
284 }
285 
286 static void
print_counts(int64_t lines,int64_t words,int64_t chars,const char * name)287 print_counts(int64_t lines, int64_t words, int64_t chars, const char *name)
288 {
289 	if (doline)
290 		format_and_print(lines);
291 	if (doword)
292 		format_and_print(words);
293 	if (dochar)
294 		format_and_print(chars);
295 
296 	if (name)
297 		printf(" %s\n", name);
298 	else
299 		printf("\n");
300 }
301