1 /*
2  * wc - word count
3  *
4  * Gunnar Ritter, Freiburg i. Br., Germany, December 2000.
5  */
6 /*
7  * Copyright (c) 2003 Gunnar Ritter
8  *
9  * This software is provided 'as-is', without any express or implied
10  * warranty. In no event will the authors be held liable for any damages
11  * arising from the use of this software.
12  *
13  * Permission is granted to anyone to use this software for any purpose,
14  * including commercial applications, and to alter it and redistribute
15  * it freely, subject to the following restrictions:
16  *
17  * 1. The origin of this software must not be misrepresented; you must not
18  *    claim that you wrote the original software. If you use this software
19  *    in a product, an acknowledgment in the product documentation would be
20  *    appreciated but is not required.
21  *
22  * 2. Altered source versions must be plainly marked as such, and must not be
23  *    misrepresented as being the original software.
24  *
25  * 3. This notice may not be removed or altered from any source distribution.
26  */
27 
/*
 * USED keeps the otherwise unreferenced version string below from
 * drawing an "unused" diagnostic: GCC 3.4 and later get the `used'
 * attribute, older GCC releases get `unused', and other compilers
 * get nothing.
 */
#if __GNUC__ >= 4 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)
#define	USED	__attribute__ ((used))
#elif defined (__GNUC__)
#define	USED	__attribute__ ((unused))
#else
#define	USED
#endif

/*
 * SCCS identification string; the text depends on which personality
 * (S42, SUS or the default) this file is compiled for. S42 takes
 * precedence if both S42 and SUS are defined.
 */
#if !defined (S42) && !defined (SUS)
static const char sccsid[] USED = "@(#)wc.sl	1.42 (gritter) 5/29/05";
#elif defined (S42)
static const char sccsid[] USED = "@(#)wc_s42.sl	1.42 (gritter) 5/29/05";
#else	/* SUS only */
static const char sccsid[] USED = "@(#)wc_sus.sl	1.42 (gritter) 5/29/05";
#endif
42 
43 #include	<sys/types.h>
44 #include	<sys/stat.h>
45 #include	<fcntl.h>
46 #include	<stdio.h>
47 #include	<string.h>
48 #include	<stdlib.h>
49 #include	<wchar.h>
50 #include	<locale.h>
51 #include	<ctype.h>
52 #include	<wctype.h>
53 #include	<unistd.h>
54 #include	<libgen.h>
55 #include	<limits.h>
56 
57 #include	<iblok.h>
58 #include	<blank.h>
59 #include	<mbtowi.h>
60 
/*
 * Program-wide state.
 */
static char		*progname;	/* basename of argv[0], for messages */
static int		mb_cur_max;	/* MB_CUR_MAX, sampled once in main() */
static unsigned		errcnt;		/* error indicator; exit status */

/*
 * Output selection. A nonzero flag requests the corresponding count.
 */
static int		lflag;		/* report lines */
static int		wflag;		/* report words */
static int		cflag;		/* report bytes */
static int		mflag;		/* report characters */
static int		illflag;	/* an unknown option was given */

/*
 * Counts for the file currently being processed.
 */
static long long	linec;		/* lines */
static long long	wordc;		/* words */
static long long	bytec;		/* bytes */
static long long	charc;		/* characters */

/*
 * Sums of the per-file counts over all files processed so far.
 */
static long long	linet;		/* lines */
static long long	wordt;		/* words */
static long long	bytet;		/* bytes */
static long long	chart;		/* characters */

#if defined (S42)
static int		putspace;	/* nonzero once a count was printed */
#endif

static void		usage(void);
82 
/*
 * Print a single count on standard output.
 *
 * With S42 the numbers on a line are separated by one space and are
 * not padded; putspace records whether a number has already been
 * written. Otherwise each number is right-aligned in a field of at
 * least seven characters and followed by a space.
 */
static void
report(unsigned long long count)
{
#if !defined (S42)
	printf("%7llu ", count);
#else	/* S42 */
	if (putspace != 0)
		printf(" ");
	putspace++;
	printf("%llu", count);
#endif	/* S42 */
}
97 
/*
 * COUNT(c) accounts for one character c in the line and word counters.
 *
 * The macro expands in place and uses hadspace, linec and wordc from
 * the expanding scope; unless SUS is defined it also indexes the
 * lookup tables blank[] and graph[] with c. A newline increments
 * linec. A word is counted at the first word character that follows
 * white space, which is why callers start with hadspace set. With SUS
 * every non-space character is a word character; otherwise only
 * characters marked in graph[] are.
 *
 * The expansion is a complete if/else statement and is written without
 * a trailing semicolon. c is evaluated more than once and must be free
 * of side effects.
 */
#ifndef	SUS
#define	COUNT(c) \
			if ((c) == '\n' || blank[c]) { \
				if ((c) == '\n') \
					linec++; \
				hadspace++; \
			} else if (hadspace && graph[c]) { \
				hadspace = 0; \
				wordc++; \
			}
#else	/* SUS */
#define	COUNT(c) \
			if (!isspace(c)) { \
				if (hadspace) { \
					hadspace = 0; \
					wordc++; \
				} \
			} else { \
				if ((c) == '\n') \
					linec++; \
				hadspace++; \
			}
#endif	/* SUS */
124 
125 static int
sbwc(struct iblok * ip)126 sbwc(struct iblok *ip)
127 {
128 	register long long hadspace = 1;
129 	register int c, i;
130 	size_t sz;
131 #ifndef	SUS
132 	/*
133 	 * If SUS is defined, this optimization brings no measurable
134 	 * performance gain.
135 	 */
136 	int	blank[256], graph[256];
137 
138 	for (c = 0; c < 256; c++) {
139 		blank[c] = isblank(c);
140 		graph[c] = isgraph(c);
141 	}
142 #endif	/* !SUS */
143 
144 	while (ib_read(ip) != EOF) {
145 		ip->ib_cur--;
146 		sz = ip->ib_end - ip->ib_cur;
147 		charc += sz;
148 		for (i = 0; i < sz; i++) {
149 			c = ip->ib_cur[i] & 0377;
150 			COUNT(c)
151 		}
152 	}
153 	bytec = charc;
154 	return 0;
155 }
156 
157 static int
mbwc(struct iblok * ip)158 mbwc(struct iblok *ip)
159 {
160 	register long long hadspace = 1;
161 	char	mb[MB_LEN_MAX];
162 	wint_t	c;
163 	int	i, k, n;
164 	size_t sz;
165 	char	*curp;
166 	int	eof = 0;
167 #ifndef	SUS
168 	/*
169 	 * If SUS is defined, this optimization brings no measurable
170 	 * performance gain.
171 	 */
172 	int	blank[128], graph[128];
173 
174 	for (c = 0; c < 128; c++) {
175 		blank[c] = isblank(c);
176 		graph[c] = isgraph(c);
177 	}
178 #endif	/* !SUS */
179 
180 	while (eof == 0) {
181 		if (ip->ib_cur == ip->ib_end) {
182 			if (ib_read(ip) == EOF)
183 				break;
184 			curp = ip->ib_cur - 1;
185 		} else
186 			curp = ip->ib_cur;
187 		ip->ib_cur = ip->ib_end;
188 		sz = ip->ib_end - curp;
189 		bytec += sz;
190 		/*
191 		 * Incrementing charc here en bloc and decrementing it
192 		 * later for multibyte characters if necessary is
193 		 * considerably faster with ASCII files.
194 		 */
195 		charc += sz;
196 		for (i = 0; i < sz; i++) {
197 			if (curp[i] & 0200) {
198 				if (sz - i < mb_cur_max) {
199 					for (k = 0; k < sz - i; k++)
200 						mb[k] = curp[i+k];
201 					while (eof == 0 && k < mb_cur_max) {
202 						if ((c = ib_get(ip)) != EOF) {
203 							mb[k++] = c;
204 							bytec++;
205 							charc++;
206 						} else
207 							eof = 1;
208 					}
209 					curp = mb;
210 					sz = k;
211 					i = 0;
212 				}
213 				n = mbtowi(&c, &curp[i], sz - i);
214 				if (n < 0) {
215 					charc--;
216 					continue;
217 				}
218 				charc -= n - 1;
219 				i += n - 1;
220 #ifdef	SUS
221 				if (iswspace(c)) {
222 					if (c == '\n')
223 						linec++;
224 					hadspace++;
225 				} else {
226 					if (hadspace) {
227 						hadspace = 0;
228 						wordc++;
229 					}
230 				}
231 #else	/* !SUS */
232 				if (c == '\n') {
233 					linec++;
234 					hadspace++;
235 				} else if (iswblank(c)) {
236 					hadspace++;
237 				} else {
238 					if (hadspace && iswgraph(c)) {
239 						hadspace = 0;
240 						wordc++;
241 					}
242 				}
243 #endif	/* !SUS */
244 			} else /* isascii(c) */ {
245 				c = curp[i];
246 				COUNT(c)
247 			}
248 		}
249 	}
250 	return 0;
251 }
252 
253 /*
254  * Count a file.
255  */
256 static int
fpwc(int fd)257 fpwc(int fd)
258 {
259 	struct iblok	*ip;
260 
261 	bytec = charc = wordc = linec = 0;
262 	ip = ib_alloc(fd, 0);
263 	if ((mb_cur_max > 1 && (wflag || mflag) ? mbwc : sbwc)(ip)) {
264 		ib_free(ip);
265 		return -1;
266 	}
267 	ib_free(ip);
268 	if (illflag)
269 		usage();
270 #if defined (S42)
271 	putspace = 0;
272 #endif
273 	if (lflag)
274 		report(linec);
275 	if (wflag)
276 		report(wordc);
277 	if (cflag)
278 		report(bytec);
279 	if (mflag)
280 		report(charc);
281 	linet += linec;
282 	wordt += wordc;
283 	chart += charc;
284 	bytet += bytec;
285 	return 0;
286 }
287 
288 /*
289  * Count a file given by its name.
290  */
291 static int
filewc(const char * fn)292 filewc(const char *fn)
293 {
294 	int fd;
295 	int r;
296 
297 	if ((fd = open(fn, O_RDONLY)) < 0) {
298 		fprintf(stderr, "%s: cannot open %s\n", progname, fn);
299 		errcnt |= 1;
300 		return 0;
301 	}
302 	if ((r = fpwc(fd)) >= 0)
303 		printf(" %s\n", fn);
304 	if (fd > 0)
305 		close(fd);
306 	return r >= 0 ? 1 : 0;
307 }
308 
309 static void
usage(void)310 usage(void)
311 {
312 	fprintf(stderr, "usage: %s [-clw] [name ...]\n", progname);
313 	exit(2);
314 }
315 
316 int
main(int argc,char ** argv)317 main(int argc, char **argv)
318 {
319 	int c;
320 	unsigned ac;
321 
322 #ifdef	__GLIBC__
323 	putenv("POSIXLY_CORRECT=1");
324 #endif
325 	progname = basename(argv[0]);
326 	setlocale(LC_CTYPE, "");
327 	mb_cur_max = MB_CUR_MAX;
328 	ac = 0;
329 	while ((c = getopt(argc, argv, ":clmw")) != EOF) {
330 		ac++;
331 		switch (c) {
332 		case 'c':
333 			cflag = ac;
334 			break;
335 		case 'l':
336 			lflag = ac;
337 			break;
338 		case 'm':
339 			mflag = ac;
340 			break;
341 		case 'w':
342 			wflag = ac;
343 			break;
344 		default:
345 			illflag = 1;
346 		}
347 	}
348 	if (ac == 0) {
349 #if !defined (SUS) && !defined(S42)
350 		if (argv[1] && argv[1][0] == '-' && argv[1][1] == '\0')
351 			optind++;
352 		else
353 #endif	/* !SUS, !S42 */
354 			cflag = lflag = wflag = 1;
355 	}
356 	if (mflag)
357 		cflag = 0;
358 	if (optind < argc) {
359 		ac = 0;
360 		while (optind < argc)
361 			ac += filewc(argv[optind++]);
362 		if (ac > 1) {
363 #if defined (S42)
364 			putspace = 0;
365 #endif
366 			if (lflag)
367 				report(linet);
368 			if (wflag)
369 				report(wordt);
370 			if (cflag)
371 				report(bytet);
372 			if (mflag)
373 				report(chart);
374 			printf(" total\n");
375 		}
376 	} else {
377 		fpwc(0);
378 		printf("\n");
379 	}
380 	return errcnt;
381 }
382