1 // Copyright (c) 2008-2009 Bjoern Hoehrmann
2 // Copyright (c) 2015, Ondrej Palkovsky
3 // Copyright (c) 2016, Winterland
4
5 #include <string.h>
6 #include <stdio.h>
7 #include <stdint.h>
8
9
// Automaton state in which no multi-byte sequence is in progress: it is the
// initial state and the state reached each time a code point is complete.
#define UTF8_ACCEPT 0
// Automaton state entered on an invalid byte; every transition out of this
// state leads back to it, so once reached the input can never be accepted.
#define UTF8_REJECT 12
12
// Lookup table driving the UTF-8 decoding automaton. It consists of two
// consecutive parts: 256 byte-class entries followed by the state transitions.
static const uint8_t utf8d[] = {
// The first part of the table (entries 0..255, indexed by the byte value)
// maps bytes to character classes 0..11. The classes reduce the size of the
// transition table and are also used as shift counts to build the bitmask
// that extracts the payload bits of a leading byte.
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

// The second part is a transition table that maps a combination
// of a state of the automaton and a character class to a state.
// It is indexed by 256 + state + class; state values are stored
// pre-multiplied by 12 (the number of classes): 0, 12, 24, ... 96.
0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
12,36,12,12,12,12,12,12,12,12,12,12,
};
33
decode(uint32_t * state,uint32_t * codep,uint32_t byte)34 static inline uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
35 uint32_t type = utf8d[byte];
36
37 *codep = (*state != UTF8_ACCEPT) ?
38 (byte & 0x3fu) | (*codep << 6) :
39 (0xff >> type) & (byte);
40
41 *state = utf8d[256 + *state + type];
42 return *state;
43 }
44
// Convert one ASCII hexadecimal digit ('0'-'9', 'a'-'f', 'A'-'F') to its
// numeric value 0..15. Any other input yields the sentinel 0xFFFF.
static inline uint16_t decode_hex(uint32_t c)
{
  // Each subtraction is unsigned, so a character below the range start wraps
  // to a huge value and fails the single upper-bound comparison.
  const uint32_t from_zero = c - (uint32_t) '0';
  const uint32_t from_lower = c - (uint32_t) 'a';
  const uint32_t from_upper = c - (uint32_t) 'A';

  if (from_zero < 10u) {
    return (uint16_t) from_zero;
  }
  if (from_lower < 6u) {
    return (uint16_t) (from_lower + 10u);
  }
  if (from_upper < 6u) {
    return (uint16_t) (from_upper + 10u);
  }
  return 0xFFFF; // not a hexadecimal digit
}
52
53 // Decode, return non-zero value on error
_js_decode_string(uint16_t * const dest,size_t * destoff,const uint8_t * s,const uint8_t * const srcend)54 int _js_decode_string(uint16_t *const dest, size_t *destoff,
55 const uint8_t *s, const uint8_t *const srcend)
56 {
57 uint16_t *d = dest + *destoff;
58 uint32_t state = 0;
59 uint32_t codepoint;
60
61 uint8_t surrogate = 0;
62 uint16_t temp_hex = 0;
63 uint16_t unidata;
64
65 // Optimized version of dispatch when just an ASCII char is expected
66 #define DISPATCH_ASCII(label) {\
67 if (s >= srcend) {\
68 return -1;\
69 }\
70 codepoint = *s++;\
71 goto label;\
72 }
73
74 standard:
75 // Test end of stream
76 while (s < srcend) {
77 if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
78 if (state == UTF8_REJECT) { return -1; }
79 continue;
80 }
81
82 if (codepoint == '\\')
83 DISPATCH_ASCII(backslash)
84 else if (codepoint <= 0xffff)
85 *d++ = (uint16_t) codepoint;
86 else {
87 *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
88 *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
89 }
90 }
91 *destoff = d - dest;
92 // Exit point
93 return (state != UTF8_ACCEPT);
94 backslash:
95 switch (codepoint) {
96 case '"':
97 case '\\':
98 case '/':
99 *d++ = (uint16_t) codepoint;
100 goto standard;
101 break;
102 case 'b': *d++ = '\b';goto standard;
103 case 'f': *d++ = '\f';goto standard;
104 case 'n': *d++ = '\n';goto standard;
105 case 'r': *d++ = '\r';goto standard;
106 case 't': *d++ = '\t';goto standard;
107 case 'u': DISPATCH_ASCII(unicode1);;break;
108 default:
109 return -1;
110 }
111 unicode1:
112 temp_hex = decode_hex(codepoint);
113 if (temp_hex == 0xFFFF) { return -1; }
114 else unidata = temp_hex << 12;
115 DISPATCH_ASCII(unicode2);
116 unicode2:
117 temp_hex = decode_hex(codepoint);
118 if (temp_hex == 0xFFFF) { return -1; }
119 else unidata |= temp_hex << 8;
120 DISPATCH_ASCII(unicode3);
121 unicode3:
122 temp_hex = decode_hex(codepoint);
123 if (temp_hex == 0xFFFF) { return -1; }
124 else unidata |= temp_hex << 4;
125 DISPATCH_ASCII(unicode4);
126 unicode4:
127 temp_hex = decode_hex(codepoint);
128 if (temp_hex == 0xFFFF) { return -1; }
129 else unidata |= temp_hex;
130 *d++ = (uint16_t) unidata;
131
132 if (surrogate) {
133 if (unidata < 0xDC00 || unidata > 0xDFFF) // is not low surrogate
134 return -1;
135 surrogate = 0;
136 } else if (unidata >= 0xD800 && unidata <= 0xDBFF ) { // is high surrogate
137 surrogate = 1;
138 DISPATCH_ASCII(surrogate1);
139 } else if (unidata >= 0xDC00 && unidata <= 0xDFFF) { // is low surrogate
140 return -1;
141 }
142 goto standard;
143 surrogate1:
144 if (codepoint != '\\') { return -1; }
145 DISPATCH_ASCII(surrogate2)
146 surrogate2:
147 if (codepoint != 'u') { return -1; }
148 DISPATCH_ASCII(unicode1)
149 }
150