1 /*
2 * Copyright (c) 2011 Bryan O'Sullivan <bos@serpentine.com>.
3 *
4 * Portions copyright (c) 2008-2010 Björn Höhrmann <bjoern@hoehrmann.de>.
5 *
6 * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
7 */
8
9 #include <string.h>
10 #include <stdint.h>
11 #include <stdio.h>
12 #include "text_cbits.h"
13
_hs_text_memcpy(void * dest,size_t doff,const void * src,size_t soff,size_t n)14 void _hs_text_memcpy(void *dest, size_t doff, const void *src, size_t soff,
15 size_t n)
16 {
17 memcpy(dest + (doff<<1), src + (soff<<1), n<<1);
18 }
19
_hs_text_memcmp(const void * a,size_t aoff,const void * b,size_t boff,size_t n)20 int _hs_text_memcmp(const void *a, size_t aoff, const void *b, size_t boff,
21 size_t n)
22 {
23 return memcmp(a + (aoff<<1), b + (boff<<1), n<<1);
24 }
25
26 #define UTF8_ACCEPT 0
27 #define UTF8_REJECT 12
28
29 static const uint8_t utf8d[] = {
30 /*
31 * The first part of the table maps bytes to character classes that
32 * to reduce the size of the transition table and create bitmasks.
33 */
34 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
35 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
36 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
37 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
38 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
39 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
40 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
41 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
42
43 /*
44 * The second part is a transition table that maps a combination of
45 * a state of the automaton and a character class to a state.
46 */
47 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
48 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
49 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
50 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
51 12,36,12,12,12,12,12,12,12,12,12,12,
52 };
53
54 static inline uint32_t
decode(uint32_t * state,uint32_t * codep,uint32_t byte)55 decode(uint32_t *state, uint32_t* codep, uint32_t byte) {
56 uint32_t type = utf8d[byte];
57
58 *codep = (*state != UTF8_ACCEPT) ?
59 (byte & 0x3fu) | (*codep << 6) :
60 (0xff >> type) & (byte);
61
62 return *state = utf8d[256 + *state + type];
63 }
64
65 /*
66 * The ISO 8859-1 (aka latin-1) code points correspond exactly to the first 256 unicode
67 * code-points, therefore we can trivially convert from a latin-1 encoded bytestring to
68 * an UTF16 array
69 */
70 void
_hs_text_decode_latin1(uint16_t * dest,const uint8_t * src,const uint8_t * srcend)71 _hs_text_decode_latin1(uint16_t *dest, const uint8_t *src,
72 const uint8_t *srcend)
73 {
74 const uint8_t *p = src;
75
76 #if defined(__i386__) || defined(__x86_64__)
77 /* This optimization works on a little-endian systems by using
78 (aligned) 32-bit loads instead of 8-bit loads
79 */
80
81 /* consume unaligned prefix */
82 while (p != srcend && (uintptr_t)p & 0x3)
83 *dest++ = *p++;
84
85 /* iterate over 32-bit aligned loads */
86 while (p < srcend - 3) {
87 const uint32_t w = *((const uint32_t *)p);
88
89 *dest++ = w & 0xff;
90 *dest++ = (w >> 8) & 0xff;
91 *dest++ = (w >> 16) & 0xff;
92 *dest++ = (w >> 24) & 0xff;
93
94 p += 4;
95 }
96 #endif
97
98 /* handle unaligned suffix */
99 while (p != srcend)
100 *dest++ = *p++;
101 }
102
103 /*
104 * A best-effort decoder. Runs until it hits either end of input or
105 * the start of an invalid byte sequence.
106 *
107 * At exit, we update *destoff with the next offset to write to, *src
108 * with the next source location past the last one successfully
109 * decoded, and return the next source location to read from.
110 *
111 * Moreover, we expose the internal decoder state (state0 and
112 * codepoint0), allowing one to restart the decoder after it
113 * terminates (say, due to a partial codepoint).
114 *
115 * In particular, there are a few possible outcomes,
116 *
117 * 1) We decoded the buffer entirely:
118 * In this case we return srcend
119 * state0 == UTF8_ACCEPT
120 *
121 * 2) We met an invalid encoding
122 * In this case we return the address of the first invalid byte
123 * state0 == UTF8_REJECT
124 *
125 * 3) We reached the end of the buffer while decoding a codepoint
126 * In this case we return a pointer to the first byte of the partial codepoint
127 * state0 != UTF8_ACCEPT, UTF8_REJECT
128 *
129 */
130 #if defined(__GNUC__) || defined(__clang__)
131 static inline uint8_t const *
132 _hs_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
133 const uint8_t **src, const uint8_t *srcend,
134 uint32_t *codepoint0, uint32_t *state0)
135 __attribute((always_inline));
136 #endif
137
138 static inline uint8_t const *
_hs_text_decode_utf8_int(uint16_t * const dest,size_t * destoff,const uint8_t ** src,const uint8_t * srcend,uint32_t * codepoint0,uint32_t * state0)139 _hs_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
140 const uint8_t **src, const uint8_t *srcend,
141 uint32_t *codepoint0, uint32_t *state0)
142 {
143 uint16_t *d = dest + *destoff;
144 const uint8_t *s = *src, *last = *src;
145 uint32_t state = *state0;
146 uint32_t codepoint = *codepoint0;
147
148 while (s < srcend) {
149 #if defined(__i386__) || defined(__x86_64__)
150 /*
151 * This code will only work on a little-endian system that
152 * supports unaligned loads.
153 *
154 * It gives a substantial speed win on data that is purely or
155 * partly ASCII (e.g. HTML), at only a slight cost on purely
156 * non-ASCII text.
157 */
158
159 if (state == UTF8_ACCEPT) {
160 while (s < srcend - 4) {
161 codepoint = *((uint32_t *) s);
162 if ((codepoint & 0x80808080) != 0)
163 break;
164 s += 4;
165
166 /*
167 * Tried 32-bit stores here, but the extra bit-twiddling
168 * slowed the code down.
169 */
170
171 *d++ = (uint16_t) (codepoint & 0xff);
172 *d++ = (uint16_t) ((codepoint >> 8) & 0xff);
173 *d++ = (uint16_t) ((codepoint >> 16) & 0xff);
174 *d++ = (uint16_t) ((codepoint >> 24) & 0xff);
175 }
176 last = s;
177 }
178 #endif
179
180 if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
181 if (state != UTF8_REJECT)
182 continue;
183 break;
184 }
185
186 if (codepoint <= 0xffff)
187 *d++ = (uint16_t) codepoint;
188 else {
189 *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
190 *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
191 }
192 last = s;
193 }
194
195 *destoff = d - dest;
196 *codepoint0 = codepoint;
197 *state0 = state;
198 *src = last;
199
200 return s;
201 }
202
203 uint8_t const *
_hs_text_decode_utf8_state(uint16_t * const dest,size_t * destoff,const uint8_t ** src,const uint8_t * srcend,uint32_t * codepoint0,uint32_t * state0)204 _hs_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
205 const uint8_t **src,
206 const uint8_t *srcend,
207 uint32_t *codepoint0, uint32_t *state0)
208 {
209 _hs_text_decode_utf8_int(dest, destoff, src, srcend, codepoint0, state0);
210
211 return *src;
212 }
213
214 /*
215 * Helper to decode buffer and discard final decoder state
216 */
217 const uint8_t *
_hs_text_decode_utf8(uint16_t * const dest,size_t * destoff,const uint8_t * src,const uint8_t * const srcend)218 _hs_text_decode_utf8(uint16_t *const dest, size_t *destoff,
219 const uint8_t *src, const uint8_t *const srcend)
220 {
221 uint32_t codepoint;
222 uint32_t state = UTF8_ACCEPT;
223 _hs_text_decode_utf8_int(dest, destoff, &src, srcend,
224 &codepoint, &state);
225 return src;
226 }
227
228 void
_hs_text_encode_utf8(uint8_t ** destp,const uint16_t * src,size_t srcoff,size_t srclen)229 _hs_text_encode_utf8(uint8_t **destp, const uint16_t *src, size_t srcoff,
230 size_t srclen)
231 {
232 const uint16_t *srcend;
233 uint8_t *dest = *destp;
234
235 src += srcoff;
236 srcend = src + srclen;
237
238 ascii:
239 #if defined(__x86_64__)
240 while (srcend - src >= 4) {
241 uint64_t w = *((uint64_t *) src);
242
243 if (w & 0xFF80FF80FF80FF80ULL) {
244 if (!(w & 0x000000000000FF80ULL)) {
245 *dest++ = w & 0xFFFF;
246 src++;
247 if (!(w & 0x00000000FF800000ULL)) {
248 *dest++ = (w >> 16) & 0xFFFF;
249 src++;
250 if (!(w & 0x0000FF8000000000ULL)) {
251 *dest++ = (w >> 32) & 0xFFFF;
252 src++;
253 }
254 }
255 }
256 break;
257 }
258 *dest++ = w & 0xFFFF;
259 *dest++ = (w >> 16) & 0xFFFF;
260 *dest++ = (w >> 32) & 0xFFFF;
261 *dest++ = w >> 48;
262 src += 4;
263 }
264 #endif
265
266 #if defined(__i386__)
267 while (srcend - src >= 2) {
268 uint32_t w = *((uint32_t *) src);
269
270 if (w & 0xFF80FF80)
271 break;
272 *dest++ = w & 0xFFFF;
273 *dest++ = w >> 16;
274 src += 2;
275 }
276 #endif
277
278 while (src < srcend) {
279 uint16_t w = *src++;
280
281 if (w <= 0x7F) {
282 *dest++ = w;
283 /* An ASCII byte is likely to begin a run of ASCII bytes.
284 Falling back into the fast path really helps performance. */
285 goto ascii;
286 }
287 else if (w <= 0x7FF) {
288 *dest++ = (w >> 6) | 0xC0;
289 *dest++ = (w & 0x3f) | 0x80;
290 }
291 else if (w < 0xD800 || w > 0xDBFF) {
292 *dest++ = (w >> 12) | 0xE0;
293 *dest++ = ((w >> 6) & 0x3F) | 0x80;
294 *dest++ = (w & 0x3F) | 0x80;
295 } else {
296 uint32_t c = ((((uint32_t) w) - 0xD800) << 10) +
297 (((uint32_t) *src++) - 0xDC00) + 0x10000;
298 *dest++ = (c >> 18) | 0xF0;
299 *dest++ = ((c >> 12) & 0x3F) | 0x80;
300 *dest++ = ((c >> 6) & 0x3F) | 0x80;
301 *dest++ = (c & 0x3F) | 0x80;
302 }
303 }
304
305 *destp = dest;
306 }
307