/* $Id: preconv.c,v 1.4 2011/05/26 21:13:07 kristaps Exp $ */
/*
 * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <sys/stat.h>
#include <sys/mman.h>

#include <assert.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>

/*
 * The read_whole_file() and resize_buf() functions are copied from
 * read.c, including all dependency code (MAP_FILE, etc.).
 */

#ifndef MAP_FILE
#define	MAP_FILE	0
#endif

/* Input encodings; the values index the encs[] table below. */
enum	enc {
	ENC_UTF_8, /* UTF-8 */
	ENC_US_ASCII, /* US-ASCII */
	ENC_LATIN_1, /* Latin-1 */
	ENC__MAX
};

struct	buf {
	char		 *buf; /* binary input buffer */
	size_t		  sz; /* size of binary buffer */
	size_t		  offs; /* starting buffer offset */
};

/* Pairs an encoding name with the routine converting from it. */
struct	encode {
	const char	 *name;
	int		(*conv)(const struct buf *);
};

static	int	 cue_enc(const struct buf *, size_t *, enum enc *);
static	int	 conv_latin_1(const struct buf *);
static	int	 conv_us_ascii(const struct buf *);
static	int	 conv_utf_8(const struct buf *);
static	int	 read_whole_file(const char *, int,
			struct buf *, int *);
static	void	 resize_buf(struct buf *, size_t);
static	void	 usage(void);

static	const struct encode encs[ENC__MAX] = {
	{ "utf-8", conv_utf_8 }, /* ENC_UTF_8 */
	{ "us-ascii", conv_us_ascii }, /* ENC_US_ASCII */
	{ "latin-1", conv_latin_1 }, /* ENC_LATIN_1 */
};

static	const char	 *progname;

/*
 * Print the command-line synopsis to stderr.
 */
static void
usage(void)
{

	fprintf(stderr, "usage: %s "
			"[-D enc] "
			"[-e ENC] "
			"[file]\n", progname);
}

/*
 * Write the Latin-1 input in "b", starting at b->offs, to stdout.
 * Always returns 1.
 */
static int
conv_latin_1(const struct buf *b)
{
	size_t		 i;
	unsigned char	 cu;

	/*
	 * Latin-1 falls into the first 256 code-points of Unicode, so
	 * there's no need for any sort of translation.  Just make the
	 * 8-bit characters use the Unicode escape.
	 * Note that binary values 128 <= v < 160 are passed through
	 * unmodified to mandoc.
	 */

	for (i = b->offs; i < b->sz; i++) {
		cu = (unsigned char)b->buf[i];
		if (cu < 160U)
			putchar(cu);
		else
			printf("\\[u%.4X]", (unsigned int)cu);
	}

	return(1);
}

/*
 * Write the US-ASCII input in "b", starting at b->offs, to stdout.
 * Always returns 1.
 */
static int
conv_us_ascii(const struct buf *b)
{

	/*
	 * US-ASCII has no conversion since it falls into the first 128
	 * bytes of Unicode.
	 * Honour the starting offset like the other converters do.
	 */

	fwrite(b->buf + b->offs, 1, b->sz - b->offs, stdout);
	return(1);
}

/*
 * Decode the UTF-8 input in "b", starting at b->offs, and write it to
 * stdout: single-byte characters are copied verbatim, multi-byte
 * sequences are emitted as \[uXXXX] escapes.
 * Returns 1 on success and 0 on the first malformed sequence; output
 * produced before the malformed sequence has already been written.
 */
static int
conv_utf_8(const struct buf *b)
{
	int		 state;	/* continuation bytes still expected */
	unsigned int	 accum;	/* code point being assembled */
	unsigned int	 minval; /* smallest value legal for this length */
	size_t		 i;
	unsigned char	 cu;

	state = 0;
	accum = 0U;
	minval = 0U;

	for (i = b->offs; i < b->sz; i++) {
		cu = (unsigned char)b->buf[i];

		if (state > 0) {
			/* Continuation bytes must look like 10xxxxxx. */

			if ((cu & 0xC0) != 0x80)
				return(0);

			/*
			 * The code point is assembled arithmetically,
			 * so its value does not depend on the byte
			 * order of the host: no swapping is needed.
			 */

			state--;
			accum |= (unsigned int)(cu & 0x3F) << (state * 6);
			if (state > 0)
				continue;

			/*
			 * Reject over-long encodings, UTF-16
			 * surrogates and values beyond U+10FFFF.
			 */

			if (accum < minval || accum > 0x10FFFFU ||
			    (accum >= 0xD800U && accum <= 0xDFFFU))
				return(0);

			printf("\\[u%.4X]", accum);
			accum = 0U;
			continue;
		}

		if (cu < 0x80) {
			putchar(cu);
			continue;
		}

		/*
		 * Entering a UTF-8 state: derive the sequence length
		 * from the leading byte, accepting only legitimate bit
		 * patterns.
		 */

		if (cu >= 0xC2 && cu <= 0xDF) {
			state = 1;
			accum = (unsigned int)(cu & 0x1F) << 6;
			minval = 0x80U;
		} else if (cu >= 0xE0 && cu <= 0xEF) {
			state = 2;
			accum = (unsigned int)(cu & 0x0F) << 12;
			minval = 0x800U;
		} else if (cu >= 0xF0 && cu <= 0xF4) {
			state = 3;
			accum = (unsigned int)(cu & 0x07) << 18;
			minval = 0x10000U;
		} else {
			/* Stray continuation byte or bad leading byte. */
			return(0);
		}
	}

	/* A sequence cut short by the end of input is malformed. */

	return(0 == state);
}

/*
 * Grow buf->buf: double buf->sz once it exceeds half of "initial",
 * otherwise set it to "initial".
 * Exits the program if the reallocation fails.
 */
static void
resize_buf(struct buf *buf, size_t initial)
{

	buf->sz = buf->sz > initial / 2 ?
		2 * buf->sz : initial;

	buf->buf = realloc(buf->buf, buf->sz);
	if (NULL == buf->buf) {
		perror(NULL);
		exit(EXIT_FAILURE);
	}
}

/*
 * Read all of descriptor "fd" into "fb"; "f" is only used as the name
 * in diagnostics.
 * On success, returns 1 and sets *with_mmap to 1 if fb->buf is a
 * mapping (to be released with munmap()) or to 0 if it was allocated
 * (to be released with free()).
 * On failure, prints a diagnostic and returns 0.
 */
static int
read_whole_file(const char *f, int fd,
		struct buf *fb, int *with_mmap)
{
	struct stat	 st;
	size_t		 off;
	ssize_t		 ssz;

	if (-1 == fstat(fd, &st)) {
		perror(f);
		return(0);
	}

	/*
	 * If we're a regular file, try just reading in the whole entry
	 * via mmap().  This is faster than reading it into blocks, and
	 * since each file is only a few bytes to begin with, I'm not
	 * concerned that this is going to tank any machines.
	 */

	if (S_ISREG(st.st_mode) && st.st_size >= (1U << 31)) {
		fprintf(stderr, "%s: input too large\n", f);
		return(0);
	}

	if (S_ISREG(st.st_mode)) {
		*with_mmap = 1;
		fb->sz = (size_t)st.st_size;
		fb->buf = mmap(NULL, fb->sz, PROT_READ,
				MAP_FILE|MAP_SHARED, fd, 0);
		if (fb->buf != MAP_FAILED)
			return(1);
	}

	/*
	 * If this isn't a regular file (like, say, stdin), or if the
	 * mapping failed, then we must go the old way and just read
	 * things in bit by bit.
	 */

	*with_mmap = 0;
	off = 0;
	fb->sz = 0;
	fb->buf = NULL;
	for (;;) {
		if (off == fb->sz && fb->sz == (1U << 31)) {
			fprintf(stderr, "%s: input too large\n", f);
			break;
		}

		if (off == fb->sz)
			resize_buf(fb, 65536);

		ssz = read(fd, fb->buf + off, fb->sz - off);
		if (ssz == 0) {
			/* End of input: trim to the bytes read. */
			fb->sz = off;
			return(1);
		}
		if (ssz == -1) {
			perror(f);
			break;
		}
		off += (size_t)ssz;
	}

	free(fb->buf);
	fb->buf = NULL;
	return(0);
}

/*
 * Examine the line of "b" starting at *offs for an encoding cue of the
 * form
 *	.\" -*- [phrase;]... coding: NAME[;] ... -*-
 * Returns -1 if no complete (newline-terminated) line is left.
 * Otherwise *offs is advanced to the start of the next line and the
 * return value is 0 if the line carries no "coding:" phrase, or 1 if
 * it does, in which case *enc is set to the named encoding, or to
 * ENC__MAX if the name is not a known one.
 */
static int
cue_enc(const struct buf *b, size_t *offs, enum enc *enc)
{
	const char	*ln, *eoln, *eoph;
	size_t		 sz, phsz, nsz;
	int		 i;

	if (*offs >= b->sz)
		return(-1);

	ln = b->buf + *offs;
	sz = b->sz - *offs;

	/* Look for the end-of-line. */

	if (NULL == (eoln = memchr(ln, '\n', sz)))
		return(-1);

	/* Set next-line marker. */

	*offs = (size_t)((eoln + 1) - b->buf);

	/* Check if we have the correct header/trailer. */

	if ((sz = (size_t)(eoln - ln)) < 10 ||
			memcmp(ln, ".\\\" -*-", 7) ||
			memcmp(eoln - 3, "-*-", 3))
		return(0);

	/* Move after the header and adjust for the trailer. */

	ln += 7;
	sz -= 10;

	while (sz > 0) {
		while (sz > 0 && ' ' == *ln) {
			ln++;
			sz--;
		}
		if (0 == sz)
			break;

		/* Find the end-of-phrase marker (or eoln). */

		if (NULL == (eoph = memchr(ln, ';', sz)))
			eoph = eoln - 3;
		else
			eoph++;

		/* Only account for the "coding" phrase. */

		if ((phsz = (size_t)(eoph - ln)) < 7 ||
				strncasecmp(ln, "coding:", 7)) {
			sz -= phsz;
			ln += phsz;
			continue;
		}

		/*
		 * Step over "coding:" and any padding, keeping phsz
		 * equal to what remains of this phrase so that the
		 * name comparison below stays inside the phrase.
		 */

		sz -= 7;
		ln += 7;
		phsz -= 7;

		while (phsz > 0 && ' ' == *ln) {
			ln++;
			sz--;
			phsz--;
		}
		if (0 == phsz)
			break;

		/*
		 * Check us against known encodings.
		 * Only a prefix needs to match, so suffixed names
		 * such as "utf-8-unix" are recognised as well.
		 */

		for (i = 0; i < (int)ENC__MAX; i++) {
			nsz = strlen(encs[i].name);
			if (phsz < nsz)
				continue;
			if (strncasecmp(ln, encs[i].name, nsz))
				continue;

			*enc = (enum enc)i;
			return(1);
		}

		/* Unknown encoding. */

		*enc = ENC__MAX;
		return(1);
	}

	return(0);
}

/*
 * Read the file named on the command line (or stdin), determine its
 * encoding and write the converted text to stdout.
 * The encoding is taken, in order of preference, from -e, a UTF-8
 * byte-order mark, a "-*-" cue on one of the first two lines, -D, and
 * finally defaults to Latin-1.
 */
int
main(int argc, char *argv[])
{
	static const unsigned char bom[3] = { 0xEF, 0xBB, 0xBF };
	int		 i, ch, map, fd, rc;
	struct buf	 b;
	const char	*fn;
	enum enc	 enc, def;
	size_t		 offs;
	extern int	 optind;
	extern char	*optarg;

	progname = strrchr(argv[0], '/');
	if (progname == NULL)
		progname = argv[0];
	else
		++progname;

	fn = "<stdin>";
	fd = STDIN_FILENO;
	rc = EXIT_FAILURE;
	enc = def = ENC__MAX;
	map = 0;

	memset(&b, 0, sizeof(struct buf));

	while (-1 != (ch = getopt(argc, argv, "D:e:rdvh")))
		switch (ch) {
		case 'D':
			/* FALLTHROUGH */
		case 'e':
			for (i = 0; i < (int)ENC__MAX; i++)
				if (0 == strcasecmp(optarg, encs[i].name))
					break;

			if (i < (int)ENC__MAX) {
				if ('D' == ch)
					def = (enum enc)i;
				else
					enc = (enum enc)i;
				break;
			}

			fprintf(stderr, "%s: Bad encoding\n", optarg);
			return(EXIT_FAILURE);
		case 'r':
			/* FALLTHROUGH */
		case 'd':
			/* FALLTHROUGH */
		case 'v':
			/* Compatibility with GNU preconv. */
			break;
		case 'h':
			/* Compatibility with GNU preconv. */
			/* FALLTHROUGH */
		default:
			usage();
			return(EXIT_FAILURE);
		}

	argc -= optind;
	argv += optind;

	/*
	 * Open and read the first argument on the command-line.
	 * If we don't have one, we default to stdin.
	 */

	if (argc > 0) {
		fn = *argv;
		fd = open(fn, O_RDONLY, 0);
		if (-1 == fd) {
			perror(fn);
			return(EXIT_FAILURE);
		}
	}

	if ( ! read_whole_file(fn, fd, &b, &map))
		goto out;

	/*
	 * Try to read the UTF-8 BOM.
	 * A file consisting of nothing but the BOM qualifies, too.
	 */

	if (ENC__MAX == enc)
		if (b.sz >= 3 && 0 == memcmp(b.buf, bom, 3)) {
			b.offs = 3;
			enc = ENC_UTF_8;
		}

	/* Try reading from the "-*-" cue on the first two lines. */

	if (ENC__MAX == enc) {
		offs = b.offs;
		if (0 == cue_enc(&b, &offs, &enc))
			(void)cue_enc(&b, &offs, &enc);
	}

	/*
	 * No encoding has been detected.
	 * Thus, we either fall into our default encoder, if specified,
	 * or use Latin-1 if all else fails.
	 */

	if (ENC__MAX == enc)
		enc = ENC__MAX == def ? ENC_LATIN_1 : def;

	if ( ! (*encs[(int)enc].conv)(&b)) {
		fprintf(stderr, "%s: Bad encoding\n", fn);
		goto out;
	}

	rc = EXIT_SUCCESS;
out:
	/* Release the input the same way read_whole_file() obtained it. */
	if (map)
		munmap(b.buf, b.sz);
	else
		free(b.buf);

	if (fd > STDIN_FILENO)
		close(fd);

	return(rc);
}