xref: /openbsd/usr.bin/mandoc/preconv.c (revision d415bd75)
1 /*	$OpenBSD: preconv.c,v 1.9 2018/12/13 11:55:14 schwarze Exp $ */
2 /*
3  * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4  * Copyright (c) 2014 Ingo Schwarze <schwarze@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 #include <sys/types.h>
19 
20 #include <assert.h>
21 #include <stdio.h>
22 #include <string.h>
23 
24 #include "mandoc.h"
25 #include "roff.h"
26 #include "mandoc_parse.h"
27 #include "libmandoc.h"
28 
29 int
30 preconv_encode(const struct buf *ib, size_t *ii, struct buf *ob, size_t *oi,
31     int *filenc)
32 {
33 	const unsigned char	*cu;
34 	int			 nby;
35 	unsigned int		 accum;
36 
37 	cu = (const unsigned char *)ib->buf + *ii;
38 	assert(*cu & 0x80);
39 
40 	if ( ! (*filenc & MPARSE_UTF8))
41 		goto latin;
42 
43 	nby = 1;
44 	while (nby < 5 && *cu & (1 << (7 - nby)))
45 		nby++;
46 
47 	switch (nby) {
48 	case 2:
49 		accum = *cu & 0x1f;
50 		if (accum < 0x02)  /* Obfuscated ASCII. */
51 			goto latin;
52 		break;
53 	case 3:
54 		accum = *cu & 0x0f;
55 		break;
56 	case 4:
57 		accum = *cu & 0x07;
58 		if (accum > 0x04) /* Beyond Unicode. */
59 			goto latin;
60 		break;
61 	default:  /* Bad sequence header. */
62 		goto latin;
63 	}
64 
65 	cu++;
66 	switch (nby) {
67 	case 3:
68 		if ((accum == 0x00 && ! (*cu & 0x20)) ||  /* Use 2-byte. */
69 		    (accum == 0x0d && *cu & 0x20))  /* Surrogates. */
70 			goto latin;
71 		break;
72 	case 4:
73 		if ((accum == 0x00 && ! (*cu & 0x30)) ||  /* Use 3-byte. */
74 		    (accum == 0x04 && *cu & 0x30))  /* Beyond Unicode. */
75 			goto latin;
76 		break;
77 	default:
78 		break;
79 	}
80 
81 	while (--nby) {
82 		if ((*cu & 0xc0) != 0x80)  /* Invalid continuation. */
83 			goto latin;
84 		accum <<= 6;
85 		accum += *cu & 0x3f;
86 		cu++;
87 	}
88 
89 	assert(accum > 0x7f);
90 	assert(accum < 0x110000);
91 	assert(accum < 0xd800 || accum > 0xdfff);
92 
93 	*oi += snprintf(ob->buf + *oi, 11, "\\[u%.4X]", accum);
94 	*ii = (const char *)cu - ib->buf;
95 	*filenc &= ~MPARSE_LATIN1;
96 	return 1;
97 
98 latin:
99 	if ( ! (*filenc & MPARSE_LATIN1))
100 		return 0;
101 
102 	*oi += snprintf(ob->buf + *oi, 11,
103 	    "\\[u%.4X]", (unsigned char)ib->buf[(*ii)++]);
104 
105 	*filenc &= ~MPARSE_UTF8;
106 	return 1;
107 }
108 
109 int
110 preconv_cue(const struct buf *b, size_t offset)
111 {
112 	const char	*ln, *eoln, *eoph;
113 	size_t		 sz, phsz;
114 
115 	ln = b->buf + offset;
116 	sz = b->sz - offset;
117 
118 	/* Look for the end-of-line. */
119 
120 	if (NULL == (eoln = memchr(ln, '\n', sz)))
121 		eoln = ln + sz;
122 
123 	/* Check if we have the correct header/trailer. */
124 
125 	if ((sz = (size_t)(eoln - ln)) < 10 ||
126 	    memcmp(ln, ".\\\" -*-", 7) || memcmp(eoln - 3, "-*-", 3))
127 		return MPARSE_UTF8 | MPARSE_LATIN1;
128 
129 	/* Move after the header and adjust for the trailer. */
130 
131 	ln += 7;
132 	sz -= 10;
133 
134 	while (sz > 0) {
135 		while (sz > 0 && ' ' == *ln) {
136 			ln++;
137 			sz--;
138 		}
139 		if (0 == sz)
140 			break;
141 
142 		/* Find the end-of-phrase marker (or eoln). */
143 
144 		if (NULL == (eoph = memchr(ln, ';', sz)))
145 			eoph = eoln - 3;
146 		else
147 			eoph++;
148 
149 		/* Only account for the "coding" phrase. */
150 
151 		if ((phsz = eoph - ln) < 7 ||
152 		    strncasecmp(ln, "coding:", 7)) {
153 			sz -= phsz;
154 			ln += phsz;
155 			continue;
156 		}
157 
158 		sz -= 7;
159 		ln += 7;
160 
161 		while (sz > 0 && ' ' == *ln) {
162 			ln++;
163 			sz--;
164 		}
165 		if (0 == sz)
166 			return 0;
167 
168 		/* Check us against known encodings. */
169 
170 		if (phsz > 4 && !strncasecmp(ln, "utf-8", 5))
171 			return MPARSE_UTF8;
172 		if (phsz > 10 && !strncasecmp(ln, "iso-latin-1", 11))
173 			return MPARSE_LATIN1;
174 		return 0;
175 	}
176 	return MPARSE_UTF8 | MPARSE_LATIN1;
177 }
178