xref: /dragonfly/contrib/mdocml/preconv.c (revision 3170ffd7)
1 /*	$Id: preconv.c,v 1.5 2011/07/24 18:15:14 kristaps Exp $ */
2 /*
3  * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16  */
17 #ifdef HAVE_CONFIG_H
18 #include "config.h"
19 #endif
20 
21 #ifdef HAVE_MMAP
22 #include <sys/stat.h>
23 #include <sys/mman.h>
24 #endif
25 
26 #include <assert.h>
27 #include <fcntl.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <unistd.h>
32 
33 /*
34  * The read_whole_file() and resize_buf() functions are copied from
35  * read.c, including all dependency code (MAP_FILE, etc.).
36  */
37 
38 #ifndef MAP_FILE
39 #define	MAP_FILE	0
40 #endif
41 
/*
 * Input encodings understood by this utility.  The enumerators double
 * as indices into the encs[] table; ENC__MAX is the number of real
 * encodings and is also used as the "none/unknown" marker.
 */
enum	enc {
	ENC_UTF_8,	/* UTF-8 */
	ENC_US_ASCII,	/* US-ASCII */
	ENC_LATIN_1,	/* Latin-1 */
	ENC__MAX	/* count of encodings; "not set" sentinel */
};
48 
/*
 * The whole input, held in memory (either read in or mapped).
 */
struct	buf {
	char		*buf;	/* raw input bytes */
	size_t		 sz;	/* number of bytes in buf */
	size_t		 offs;	/* offset at which conversion starts */
};
54 
/*
 * One supported encoding: its name as matched against command-line
 * arguments and file cues, and the routine that converts a buffer in
 * that encoding.  The routine returns non-zero on success.
 */
struct	encode {
	const char	*name;
	int		(*conv)(const struct buf *b);
};
59 
60 static	int	 cue_enc(const struct buf *, size_t *, enum enc *);
61 static	int	 conv_latin_1(const struct buf *);
62 static	int	 conv_us_ascii(const struct buf *);
63 static	int	 conv_utf_8(const struct buf *);
64 static	int	 read_whole_file(const char *, int,
65 			struct buf *, int *);
66 static	void	 resize_buf(struct buf *, size_t);
67 static	void	 usage(void);
68 
69 static	const struct encode encs[ENC__MAX] = {
70 	{ "utf-8", conv_utf_8 }, /* ENC_UTF_8 */
71 	{ "us-ascii", conv_us_ascii }, /* ENC_US_ASCII */
72 	{ "latin-1", conv_latin_1 }, /* ENC_LATIN_1 */
73 };
74 
75 static	const char	 *progname;
76 
77 static void
78 usage(void)
79 {
80 
81 	fprintf(stderr, "usage: %s "
82 			"[-D enc] "
83 			"[-e ENC] "
84 			"[file]\n", progname);
85 }
86 
87 static int
88 conv_latin_1(const struct buf *b)
89 {
90 	size_t		 i;
91 	unsigned char	 cu;
92 	const char	*cp;
93 
94 	cp = b->buf + (int)b->offs;
95 
96 	/*
97 	 * Latin-1 falls into the first 256 code-points of Unicode, so
98 	 * there's no need for any sort of translation.  Just make the
99 	 * 8-bit characters use the Unicode escape.
100 	 * Note that binary values 128 < v < 160 are passed through
101 	 * unmodified to mandoc.
102 	 */
103 
104 	for (i = b->offs; i < b->sz; i++) {
105 		cu = (unsigned char)*cp++;
106 		cu < 160U ? putchar(cu) : printf("\\[u%.4X]", cu);
107 	}
108 
109 	return(1);
110 }
111 
112 static int
113 conv_us_ascii(const struct buf *b)
114 {
115 
116 	/*
117 	 * US-ASCII has no conversion since it falls into the first 128
118 	 * bytes of Unicode.
119 	 */
120 
121 	fwrite(b->buf, 1, b->sz, stdout);
122 	return(1);
123 }
124 
125 static int
126 conv_utf_8(const struct buf *b)
127 {
128 	int		 state, be;
129 	unsigned int	 accum;
130 	size_t		 i;
131 	unsigned char	 cu;
132 	const char	*cp;
133 	const long	 one = 1L;
134 
135 	cp = b->buf + (int)b->offs;
136 	state = 0;
137 	accum = 0U;
138 	be = 0;
139 
140 	/* Quick test for big-endian value. */
141 
142 	if ( ! (*((const char *)(&one))))
143 		be = 1;
144 
145 	for (i = b->offs; i < b->sz; i++) {
146 		cu = (unsigned char)*cp++;
147 		if (state) {
148 			if ( ! (cu & 128) || (cu & 64)) {
149 				/* Bad sequence header. */
150 				return(0);
151 			}
152 
153 			/* Accept only legitimate bit patterns. */
154 
155 			if (cu > 191 || cu < 128) {
156 				/* Bad in-sequence bits. */
157 				return(0);
158 			}
159 
160 			accum |= (cu & 63) << --state * 6;
161 
162 			/*
163 			 * Accum is held in little-endian order as
164 			 * stipulated by the UTF-8 sequence coding.  We
165 			 * need to convert to a native big-endian if our
166 			 * architecture requires it.
167 			 */
168 
169 			if (0 == state && be)
170 				accum = (accum >> 24) |
171 					((accum << 8) & 0x00FF0000) |
172 					((accum >> 8) & 0x0000FF00) |
173 					(accum << 24);
174 
175 			if (0 == state) {
176 				accum < 128U ? putchar(accum) :
177 					printf("\\[u%.4X]", accum);
178 				accum = 0U;
179 			}
180 		} else if (cu & (1 << 7)) {
181 			/*
182 			 * Entering a UTF-8 state:  if we encounter a
183 			 * UTF-8 bitmask, calculate the expected UTF-8
184 			 * state from it.
185 			 */
186 			for (state = 0; state < 7; state++)
187 				if ( ! (cu & (1 << (7 - state))))
188 					break;
189 
190 			/* Accept only legitimate bit patterns. */
191 
192 			switch (state) {
193 			case (4):
194 				if (cu <= 244 && cu >= 240) {
195 					accum = (cu & 7) << 18;
196 					break;
197 				}
198 				/* Bad 4-sequence start bits. */
199 				return(0);
200 			case (3):
201 				if (cu <= 239 && cu >= 224) {
202 					accum = (cu & 15) << 12;
203 					break;
204 				}
205 				/* Bad 3-sequence start bits. */
206 				return(0);
207 			case (2):
208 				if (cu <= 223 && cu >= 194) {
209 					accum = (cu & 31) << 6;
210 					break;
211 				}
212 				/* Bad 2-sequence start bits. */
213 				return(0);
214 			default:
215 				/* Bad sequence bit mask. */
216 				return(0);
217 			}
218 			state--;
219 		} else
220 			putchar(cu);
221 	}
222 
223 	if (0 != state) {
224 		/* Bad trailing bits. */
225 		return(0);
226 	}
227 
228 	return(1);
229 }
230 
231 static void
232 resize_buf(struct buf *buf, size_t initial)
233 {
234 
235 	buf->sz = buf->sz > initial / 2 ?
236 		2 * buf->sz : initial;
237 
238 	buf->buf = realloc(buf->buf, buf->sz);
239 	if (NULL == buf->buf) {
240 		perror(NULL);
241 		exit(EXIT_FAILURE);
242 	}
243 }
244 
245 static int
246 read_whole_file(const char *f, int fd,
247 		struct buf *fb, int *with_mmap)
248 {
249 	size_t		 off;
250 	ssize_t		 ssz;
251 
252 #ifdef	HAVE_MMAP
253 	struct stat	 st;
254 	if (-1 == fstat(fd, &st)) {
255 		perror(f);
256 		return(0);
257 	}
258 
259 	/*
260 	 * If we're a regular file, try just reading in the whole entry
261 	 * via mmap().  This is faster than reading it into blocks, and
262 	 * since each file is only a few bytes to begin with, I'm not
263 	 * concerned that this is going to tank any machines.
264 	 */
265 
266 	if (S_ISREG(st.st_mode) && st.st_size >= (1U << 31)) {
267 		fprintf(stderr, "%s: input too large\n", f);
268 		return(0);
269 	}
270 
271 	if (S_ISREG(st.st_mode)) {
272 		*with_mmap = 1;
273 		fb->sz = (size_t)st.st_size;
274 		fb->buf = mmap(NULL, fb->sz, PROT_READ,
275 				MAP_FILE|MAP_SHARED, fd, 0);
276 		if (fb->buf != MAP_FAILED)
277 			return(1);
278 	}
279 #endif
280 
281 	/*
282 	 * If this isn't a regular file (like, say, stdin), then we must
283 	 * go the old way and just read things in bit by bit.
284 	 */
285 
286 	*with_mmap = 0;
287 	off = 0;
288 	fb->sz = 0;
289 	fb->buf = NULL;
290 	for (;;) {
291 		if (off == fb->sz && fb->sz == (1U << 31)) {
292 			fprintf(stderr, "%s: input too large\n", f);
293 			break;
294 		}
295 
296 		if (off == fb->sz)
297 			resize_buf(fb, 65536);
298 
299 		ssz = read(fd, fb->buf + (int)off, fb->sz - off);
300 		if (ssz == 0) {
301 			fb->sz = off;
302 			return(1);
303 		}
304 		if (ssz == -1) {
305 			perror(f);
306 			break;
307 		}
308 		off += (size_t)ssz;
309 	}
310 
311 	free(fb->buf);
312 	fb->buf = NULL;
313 	return(0);
314 }
315 
316 static int
317 cue_enc(const struct buf *b, size_t *offs, enum enc *enc)
318 {
319 	const char	*ln, *eoln, *eoph;
320 	size_t		 sz, phsz, nsz;
321 	int		 i;
322 
323 	ln = b->buf + (int)*offs;
324 	sz = b->sz - *offs;
325 
326 	/* Look for the end-of-line. */
327 
328 	if (NULL == (eoln = memchr(ln, '\n', sz)))
329 		return(-1);
330 
331 	/* Set next-line marker. */
332 
333 	*offs = (size_t)((eoln + 1) - b->buf);
334 
335 	/* Check if we have the correct header/trailer. */
336 
337 	if ((sz = (size_t)(eoln - ln)) < 10 ||
338 			memcmp(ln, ".\\\" -*-", 7) ||
339 			memcmp(eoln - 3, "-*-", 3))
340 		return(0);
341 
342 	/* Move after the header and adjust for the trailer. */
343 
344 	ln += 7;
345 	sz -= 10;
346 
347 	while (sz > 0) {
348 		while (sz > 0 && ' ' == *ln) {
349 			ln++;
350 			sz--;
351 		}
352 		if (0 == sz)
353 			break;
354 
355 		/* Find the end-of-phrase marker (or eoln). */
356 
357 		if (NULL == (eoph = memchr(ln, ';', sz)))
358 			eoph = eoln - 3;
359 		else
360 			eoph++;
361 
362 		/* Only account for the "coding" phrase. */
363 
364 		if ((phsz = (size_t)(eoph - ln)) < 7 ||
365 				strncasecmp(ln, "coding:", 7)) {
366 			sz -= phsz;
367 			ln += phsz;
368 			continue;
369 		}
370 
371 		sz -= 7;
372 		ln += 7;
373 
374 		while (sz > 0 && ' ' == *ln) {
375 			ln++;
376 			sz--;
377 		}
378 		if (0 == sz)
379 			break;
380 
381 		/* Check us against known encodings. */
382 
383 		for (i = 0; i < (int)ENC__MAX; i++) {
384 			nsz = strlen(encs[i].name);
385 			if (phsz < nsz)
386 				continue;
387 			if (strncasecmp(ln, encs[i].name, nsz))
388 				continue;
389 
390 			*enc = (enum enc)i;
391 			return(1);
392 		}
393 
394 		/* Unknown encoding. */
395 
396 		*enc = ENC__MAX;
397 		return(1);
398 	}
399 
400 	return(0);
401 }
402 
403 int
404 main(int argc, char *argv[])
405 {
406 	int	 	 i, ch, map, fd, rc;
407 	struct buf	 b;
408 	const char	*fn;
409 	enum enc	 enc, def;
410 	unsigned char 	 bom[3] = { 0xEF, 0xBB, 0xBF };
411 	size_t		 offs;
412 	extern int	 optind;
413 	extern char	*optarg;
414 
415 	progname = strrchr(argv[0], '/');
416 	if (progname == NULL)
417 		progname = argv[0];
418 	else
419 		++progname;
420 
421 	fn = "<stdin>";
422 	fd = STDIN_FILENO;
423 	rc = EXIT_FAILURE;
424 	enc = def = ENC__MAX;
425 	map = 0;
426 
427 	memset(&b, 0, sizeof(struct buf));
428 
429 	while (-1 != (ch = getopt(argc, argv, "D:e:rdvh")))
430 		switch (ch) {
431 		case ('D'):
432 			/* FALLTHROUGH */
433 		case ('e'):
434 			for (i = 0; i < (int)ENC__MAX; i++) {
435 				if (strcasecmp(optarg, encs[i].name))
436 					continue;
437 				break;
438 			}
439 			if (i < (int)ENC__MAX) {
440 				if ('D' == ch)
441 					def = (enum enc)i;
442 				else
443 					enc = (enum enc)i;
444 				break;
445 			}
446 
447 			fprintf(stderr, "%s: Bad encoding\n", optarg);
448 			return(EXIT_FAILURE);
449 		case ('r'):
450 			/* FALLTHROUGH */
451 		case ('d'):
452 			/* FALLTHROUGH */
453 		case ('v'):
454 			/* Compatibility with GNU preconv. */
455 			break;
456 		case ('h'):
457 			/* Compatibility with GNU preconv. */
458 			/* FALLTHROUGH */
459 		default:
460 			usage();
461 			return(EXIT_FAILURE);
462 		}
463 
464 	argc -= optind;
465 	argv += optind;
466 
467 	/*
468 	 * Open and read the first argument on the command-line.
469 	 * If we don't have one, we default to stdin.
470 	 */
471 
472 	if (argc > 0) {
473 		fn = *argv;
474 		fd = open(fn, O_RDONLY, 0);
475 		if (-1 == fd) {
476 			perror(fn);
477 			return(EXIT_FAILURE);
478 		}
479 	}
480 
481 	if ( ! read_whole_file(fn, fd, &b, &map))
482 		goto out;
483 
484 	/* Try to read the UTF-8 BOM. */
485 
486 	if (ENC__MAX == enc)
487 		if (b.sz > 3 && 0 == memcmp(b.buf, bom, 3)) {
488 			b.offs = 3;
489 			enc = ENC_UTF_8;
490 		}
491 
492 	/* Try reading from the "-*-" cue. */
493 
494 	if (ENC__MAX == enc) {
495 		offs = b.offs;
496 		ch = cue_enc(&b, &offs, &enc);
497 		if (0 == ch)
498 			ch = cue_enc(&b, &offs, &enc);
499 	}
500 
501 	/*
502 	 * No encoding has been detected.
503 	 * Thus, we either fall into our default encoder, if specified,
504 	 * or use Latin-1 if all else fails.
505 	 */
506 
507 	if (ENC__MAX == enc)
508 		enc = ENC__MAX == def ? ENC_LATIN_1 : def;
509 
510 	if ( ! (*encs[(int)enc].conv)(&b)) {
511 		fprintf(stderr, "%s: Bad encoding\n", fn);
512 		goto out;
513 	}
514 
515 	rc = EXIT_SUCCESS;
516 out:
517 #ifdef	HAVE_MMAP
518 	if (map)
519 		munmap(b.buf, b.sz);
520 	else
521 #endif
522 		free(b.buf);
523 
524 	if (fd > STDIN_FILENO)
525 		close(fd);
526 
527 	return(rc);
528 }
529