1 /*
2  * Copyright (c) 2011 Bryan O'Sullivan <bos@serpentine.com>.
3  *
4  * Portions copyright (c) 2008-2010 Björn Höhrmann <bjoern@hoehrmann.de>.
5  *
6  * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
7  */
8 
9 #include <string.h>
10 #include <stdint.h>
11 #include <stdio.h>
12 #include "text_cbits.h"
13 
_hs_text_memcpy(void * dest,size_t doff,const void * src,size_t soff,size_t n)14 void _hs_text_memcpy(void *dest, size_t doff, const void *src, size_t soff,
15 		     size_t n)
16 {
17   memcpy(dest + (doff<<1), src + (soff<<1), n<<1);
18 }
19 
_hs_text_memcmp(const void * a,size_t aoff,const void * b,size_t boff,size_t n)20 int _hs_text_memcmp(const void *a, size_t aoff, const void *b, size_t boff,
21 		    size_t n)
22 {
23   return memcmp(a + (aoff<<1), b + (boff<<1), n<<1);
24 }
25 
26 #define UTF8_ACCEPT 0
27 #define UTF8_REJECT 12
28 
29 static const uint8_t utf8d[] = {
30   /*
31    * The first part of the table maps bytes to character classes that
32    * to reduce the size of the transition table and create bitmasks.
33    */
34    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
35    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
36    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
37    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
38    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
39    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
40    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
41   10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
42 
43   /*
44    * The second part is a transition table that maps a combination of
45    * a state of the automaton and a character class to a state.
46    */
47    0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
48   12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
49   12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
50   12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
51   12,36,12,12,12,12,12,12,12,12,12,12,
52 };
53 
54 static inline uint32_t
decode(uint32_t * state,uint32_t * codep,uint32_t byte)55 decode(uint32_t *state, uint32_t* codep, uint32_t byte) {
56   uint32_t type = utf8d[byte];
57 
58   *codep = (*state != UTF8_ACCEPT) ?
59     (byte & 0x3fu) | (*codep << 6) :
60     (0xff >> type) & (byte);
61 
62   return *state = utf8d[256 + *state + type];
63 }
64 
65 /*
66  * The ISO 8859-1 (aka latin-1) code points correspond exactly to the first 256 unicode
67  * code-points, therefore we can trivially convert from a latin-1 encoded bytestring to
68  * an UTF16 array
69  */
70 void
_hs_text_decode_latin1(uint16_t * dest,const uint8_t * src,const uint8_t * srcend)71 _hs_text_decode_latin1(uint16_t *dest, const uint8_t *src,
72                        const uint8_t *srcend)
73 {
74   const uint8_t *p = src;
75 
76 #if defined(__i386__) || defined(__x86_64__)
77   /* This optimization works on a little-endian systems by using
78      (aligned) 32-bit loads instead of 8-bit loads
79    */
80 
81   /* consume unaligned prefix */
82   while (p != srcend && (uintptr_t)p & 0x3)
83     *dest++ = *p++;
84 
85   /* iterate over 32-bit aligned loads */
86   while (p < srcend - 3) {
87     const uint32_t w = *((const uint32_t *)p);
88 
89     *dest++ =  w        & 0xff;
90     *dest++ = (w >> 8)  & 0xff;
91     *dest++ = (w >> 16) & 0xff;
92     *dest++ = (w >> 24) & 0xff;
93 
94     p += 4;
95   }
96 #endif
97 
98   /* handle unaligned suffix */
99   while (p != srcend)
100     *dest++ = *p++;
101 }
102 
103 /*
104  * A best-effort decoder. Runs until it hits either end of input or
105  * the start of an invalid byte sequence.
106  *
107  * At exit, we update *destoff with the next offset to write to, *src
108  * with the next source location past the last one successfully
109  * decoded, and return the next source location to read from.
110  *
111  * Moreover, we expose the internal decoder state (state0 and
112  * codepoint0), allowing one to restart the decoder after it
113  * terminates (say, due to a partial codepoint).
114  *
115  * In particular, there are a few possible outcomes,
116  *
117  *   1) We decoded the buffer entirely:
118  *      In this case we return srcend
119  *      state0 == UTF8_ACCEPT
120  *
121  *   2) We met an invalid encoding
122  *      In this case we return the address of the first invalid byte
123  *      state0 == UTF8_REJECT
124  *
125  *   3) We reached the end of the buffer while decoding a codepoint
126  *      In this case we return a pointer to the first byte of the partial codepoint
127  *      state0 != UTF8_ACCEPT, UTF8_REJECT
128  *
129  */
130 #if defined(__GNUC__) || defined(__clang__)
131 static inline uint8_t const *
132 _hs_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
133 			 const uint8_t **src, const uint8_t *srcend,
134 			 uint32_t *codepoint0, uint32_t *state0)
135   __attribute((always_inline));
136 #endif
137 
138 static inline uint8_t const *
_hs_text_decode_utf8_int(uint16_t * const dest,size_t * destoff,const uint8_t ** src,const uint8_t * srcend,uint32_t * codepoint0,uint32_t * state0)139 _hs_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
140 			 const uint8_t **src, const uint8_t *srcend,
141 			 uint32_t *codepoint0, uint32_t *state0)
142 {
143   uint16_t *d = dest + *destoff;
144   const uint8_t *s = *src, *last = *src;
145   uint32_t state = *state0;
146   uint32_t codepoint = *codepoint0;
147 
148   while (s < srcend) {
149 #if defined(__i386__) || defined(__x86_64__)
150     /*
151      * This code will only work on a little-endian system that
152      * supports unaligned loads.
153      *
154      * It gives a substantial speed win on data that is purely or
155      * partly ASCII (e.g. HTML), at only a slight cost on purely
156      * non-ASCII text.
157      */
158 
159     if (state == UTF8_ACCEPT) {
160       while (s < srcend - 4) {
161 	codepoint = *((uint32_t *) s);
162 	if ((codepoint & 0x80808080) != 0)
163 	  break;
164 	s += 4;
165 
166 	/*
167 	 * Tried 32-bit stores here, but the extra bit-twiddling
168 	 * slowed the code down.
169 	 */
170 
171 	*d++ = (uint16_t) (codepoint & 0xff);
172 	*d++ = (uint16_t) ((codepoint >> 8) & 0xff);
173 	*d++ = (uint16_t) ((codepoint >> 16) & 0xff);
174 	*d++ = (uint16_t) ((codepoint >> 24) & 0xff);
175       }
176       last = s;
177     }
178 #endif
179 
180     if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
181       if (state != UTF8_REJECT)
182 	continue;
183       break;
184     }
185 
186     if (codepoint <= 0xffff)
187       *d++ = (uint16_t) codepoint;
188     else {
189       *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
190       *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
191     }
192     last = s;
193   }
194 
195   *destoff = d - dest;
196   *codepoint0 = codepoint;
197   *state0 = state;
198   *src = last;
199 
200   return s;
201 }
202 
203 uint8_t const *
_hs_text_decode_utf8_state(uint16_t * const dest,size_t * destoff,const uint8_t ** src,const uint8_t * srcend,uint32_t * codepoint0,uint32_t * state0)204 _hs_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
205                            const uint8_t **src,
206                            const uint8_t *srcend,
207                            uint32_t *codepoint0, uint32_t *state0)
208 {
209   _hs_text_decode_utf8_int(dest, destoff, src, srcend, codepoint0, state0);
210 
211   return *src;
212 }
213 
214 /*
215  * Helper to decode buffer and discard final decoder state
216  */
217 const uint8_t *
_hs_text_decode_utf8(uint16_t * const dest,size_t * destoff,const uint8_t * src,const uint8_t * const srcend)218 _hs_text_decode_utf8(uint16_t *const dest, size_t *destoff,
219                      const uint8_t *src, const uint8_t *const srcend)
220 {
221   uint32_t codepoint;
222   uint32_t state = UTF8_ACCEPT;
223   _hs_text_decode_utf8_int(dest, destoff, &src, srcend,
224                           &codepoint, &state);
225   return src;
226 }
227 
228 void
_hs_text_encode_utf8(uint8_t ** destp,const uint16_t * src,size_t srcoff,size_t srclen)229 _hs_text_encode_utf8(uint8_t **destp, const uint16_t *src, size_t srcoff,
230 		     size_t srclen)
231 {
232   const uint16_t *srcend;
233   uint8_t *dest = *destp;
234 
235   src += srcoff;
236   srcend = src + srclen;
237 
238  ascii:
239 #if defined(__x86_64__)
240   while (srcend - src >= 4) {
241     uint64_t w = *((uint64_t *) src);
242 
243     if (w & 0xFF80FF80FF80FF80ULL) {
244       if (!(w & 0x000000000000FF80ULL)) {
245 	*dest++ = w & 0xFFFF;
246 	src++;
247 	if (!(w & 0x00000000FF800000ULL)) {
248 	  *dest++ = (w >> 16) & 0xFFFF;
249 	  src++;
250 	  if (!(w & 0x0000FF8000000000ULL)) {
251 	    *dest++ = (w >> 32) & 0xFFFF;
252 	    src++;
253 	  }
254 	}
255       }
256       break;
257     }
258     *dest++ = w & 0xFFFF;
259     *dest++ = (w >> 16) & 0xFFFF;
260     *dest++ = (w >> 32) & 0xFFFF;
261     *dest++ = w >> 48;
262     src += 4;
263   }
264 #endif
265 
266 #if defined(__i386__)
267   while (srcend - src >= 2) {
268     uint32_t w = *((uint32_t *) src);
269 
270     if (w & 0xFF80FF80)
271       break;
272     *dest++ = w & 0xFFFF;
273     *dest++ = w >> 16;
274     src += 2;
275   }
276 #endif
277 
278   while (src < srcend) {
279     uint16_t w = *src++;
280 
281     if (w <= 0x7F) {
282       *dest++ = w;
283       /* An ASCII byte is likely to begin a run of ASCII bytes.
284 	 Falling back into the fast path really helps performance. */
285       goto ascii;
286     }
287     else if (w <= 0x7FF) {
288       *dest++ = (w >> 6) | 0xC0;
289       *dest++ = (w & 0x3f) | 0x80;
290     }
291     else if (w < 0xD800 || w > 0xDBFF) {
292       *dest++ = (w >> 12) | 0xE0;
293       *dest++ = ((w >> 6) & 0x3F) | 0x80;
294       *dest++ = (w & 0x3F) | 0x80;
295     } else {
296       uint32_t c = ((((uint32_t) w) - 0xD800) << 10) +
297 	(((uint32_t) *src++) - 0xDC00) + 0x10000;
298       *dest++ = (c >> 18) | 0xF0;
299       *dest++ = ((c >> 12) & 0x3F) | 0x80;
300       *dest++ = ((c >> 6) & 0x3F) | 0x80;
301       *dest++ = (c & 0x3F) | 0x80;
302     }
303   }
304 
305   *destp = dest;
306 }
307