1 /* $NetBSD: wc.c,v 1.28 2002/05/02 13:07:13 wiz Exp $ */ 2 3 /* 4 * Copyright (c) 1980, 1987, 1991, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. All advertising materials mentioning features or use of this software 16 * must display the following acknowledgement: 17 * This product includes software developed by the University of 18 * California, Berkeley and its contributors. 19 * 4. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 36 #include <sys/cdefs.h> 37 #ifndef lint 38 __COPYRIGHT("@(#) Copyright (c) 1980, 1987, 1991, 1993\n\ 39 The Regents of the University of California. All rights reserved.\n"); 40 #endif /* not lint */ 41 42 #ifndef lint 43 #if 0 44 static char sccsid[] = "@(#)wc.c 8.2 (Berkeley) 5/2/95"; 45 #else 46 __RCSID("$NetBSD: wc.c,v 1.28 2002/05/02 13:07:13 wiz Exp $"); 47 #endif 48 #endif /* not lint */ 49 50 /* wc line, word and char count */ 51 52 #include <sys/param.h> 53 #include <sys/file.h> 54 #include <sys/stat.h> 55 56 #include <ctype.h> 57 #include <fcntl.h> 58 #include <err.h> 59 #include <errno.h> 60 #include <locale.h> 61 #include <stdio.h> 62 #include <stdlib.h> 63 #include <string.h> 64 #include <unistd.h> 65 #include <wchar.h> 66 #include <wctype.h> 67 68 #ifdef NO_QUAD 69 typedef u_long wc_count_t; 70 # define WCFMT " %7lu" 71 # define WCCAST unsigned long 72 #else 73 typedef u_quad_t wc_count_t; 74 # define WCFMT " %7llu" 75 # define WCCAST unsigned long long 76 #endif 77 78 static wc_count_t tlinect, twordct, tcharct; 79 static int doline, doword, dobyte, dochar; 80 static int rval = 0; 81 82 static void cnt __P((char *)); 83 static void print_counts __P((wc_count_t, wc_count_t, wc_count_t, char *)); 84 static void usage __P((void)); 85 static size_t do_mb __P((wchar_t *, const char *, size_t, mbstate_t *, 86 size_t *, const char *)); 87 int main __P((int, char *[])); 88 89 int 90 main(argc, argv) 91 int argc; 92 char *argv[]; 93 { 94 int ch; 95 96 setlocale(LC_ALL, ""); 97 98 while ((ch = getopt(argc, argv, "lwcm")) != -1) 99 switch (ch) { 100 case 'l': 101 doline = 1; 102 break; 103 case 'w': 104 doword = 1; 105 break; 106 case 'm': 107 dochar = 1; 108 dobyte = 0; 109 break; 110 case 'c': 111 dochar = 0; 112 dobyte = 1; 113 break; 114 case '?': 115 default: 116 usage(); 117 } 118 argv += optind; 119 argc -= optind; 120 121 /* Wc's flags are on by default. */ 122 if (doline + doword + dobyte + dochar == 0) 123 doline = doword = dobyte = 1; 124 125 if (!*argv) { 126 cnt(NULL); 127 } else { 128 int dototal = (argc > 1); 129 130 do { 131 cnt(*argv); 132 } while(*++argv); 133 134 if (dototal) 135 print_counts(tlinect, twordct, tcharct, "total"); 136 } 137 138 exit(rval); 139 } 140 141 static size_t 142 do_mb(wc, p, mblen, st, cnt, file) 143 wchar_t *wc; 144 const char *p; 145 size_t mblen; 146 mbstate_t *st; 147 size_t *cnt; 148 const char *file; 149 { 150 size_t r; 151 size_t c = 0; 152 153 do { 154 r = mbrtowc(wc, p, mblen, st); 155 if (r == (size_t)-1) { 156 warnx("%s: invalid byte sequence", file); 157 rval = 1; 158 159 /* XXX skip 1 byte */ 160 mblen--; 161 p++; 162 memset(st, 0, sizeof(*st)); 163 continue; 164 } else if (r == (size_t)-2) 165 break; 166 else if (r == 0) 167 r = 1; 168 c++; 169 if (wc) 170 wc++; 171 mblen -= r; 172 p += r; 173 } while (mblen > 0); 174 175 *cnt = c; 176 177 return (r); 178 } 179 180 static void 181 cnt(file) 182 char *file; 183 { 184 u_char buf[MAXBSIZE]; 185 wchar_t wbuf[MAXBSIZE]; 186 struct stat sb; 187 wc_count_t charct, linect, wordct; 188 mbstate_t st; 189 u_char *C; 190 wchar_t *WC; 191 char *name; /* filename or <stdin> */ 192 size_t r = 0; 193 int fd, gotsp, len = 0; 194 195 linect = wordct = charct = 0; 196 if (file) { 197 if ((fd = open(file, O_RDONLY, 0)) < 0) { 198 warn("%s", file); 199 rval = 1; 200 return; 201 } 202 name = file; 203 } else { 204 fd = STDIN_FILENO; 205 name = "<stdin>"; 206 } 207 208 if (dochar || doword) 209 memset(&st, 0, sizeof(st)); 210 211 if (!doword) { 212 /* 213 * line counting is split out because it's a lot 214 * faster to get lines than to get words, since 215 * the word count requires some logic. 216 */ 217 if (doline || dochar) { 218 while ((len = read(fd, buf, MAXBSIZE)) > 0) { 219 if (dochar) { 220 size_t wlen; 221 222 r = do_mb(0, (char *)buf, (size_t)len, 223 &st, &wlen, name); 224 charct += wlen; 225 } else if (dobyte) 226 charct += len; 227 if (doline) 228 for (C = buf; len--; ++C) 229 if (*C == '\n') 230 ++linect; 231 } 232 } 233 234 /* 235 * if all we need is the number of characters and 236 * it's a directory or a regular or linked file, just 237 * stat the puppy. We avoid testing for it not being 238 * a special device in case someone adds a new type 239 * of inode. 240 */ 241 else if (dobyte) { 242 if (fstat(fd, &sb)) { 243 warn("%s", name); 244 rval = 1; 245 } else { 246 if (S_ISREG(sb.st_mode) || 247 S_ISLNK(sb.st_mode) || 248 S_ISDIR(sb.st_mode)) { 249 charct = sb.st_size; 250 } else { 251 while ((len = 252 read(fd, buf, MAXBSIZE)) > 0) 253 charct += len; 254 } 255 } 256 } 257 } else { 258 /* do it the hard way... */ 259 gotsp = 1; 260 while ((len = read(fd, buf, MAXBSIZE)) > 0) { 261 size_t wlen; 262 263 r = do_mb(wbuf, (char *)buf, (size_t)len, &st, &wlen, 264 name); 265 if (dochar) { 266 charct += wlen; 267 } else if (dobyte) 268 charct += len; 269 for (WC = wbuf; wlen--; ++WC) { 270 if (iswspace(*WC)) { 271 gotsp = 1; 272 if (*WC == L'\n') { 273 ++linect; 274 } 275 } else { 276 /* 277 * This line implements the POSIX 278 * spec, i.e. a word is a "maximal 279 * string of characters delimited by 280 * whitespace." Notice nothing was 281 * said about a character being 282 * printing or non-printing. 283 */ 284 if (gotsp) { 285 gotsp = 0; 286 ++wordct; 287 } 288 } 289 } 290 } 291 } 292 293 if (len == -1) { 294 warn("%s", name); 295 rval = 1; 296 } 297 if (dochar && r == (size_t)-2) { 298 warnx("%s: incomplete multibyte character", name); 299 rval = 1; 300 } 301 302 print_counts(linect, wordct, charct, file); 303 304 /* 305 * don't bother checkint doline, doword, or dobyte --- speeds 306 * up the common case 307 */ 308 tlinect += linect; 309 twordct += wordct; 310 tcharct += charct; 311 312 if (close(fd)) { 313 warn("%s", name); 314 rval = 1; 315 } 316 } 317 318 static void 319 print_counts(lines, words, chars, name) 320 wc_count_t lines; 321 wc_count_t words; 322 wc_count_t chars; 323 char *name; 324 { 325 326 if (doline) 327 printf(WCFMT, (WCCAST)lines); 328 if (doword) 329 printf(WCFMT, (WCCAST)words); 330 if (dobyte || dochar) 331 printf(WCFMT, (WCCAST)chars); 332 333 if (name) 334 printf(" %s\n", name); 335 else 336 printf("\n"); 337 } 338 339 static void 340 usage() 341 { 342 343 (void)fprintf(stderr, "usage: wc [-c | -m] [-lw] [file ...]\n"); 344 exit(1); 345 } 346