1 // Copyright (c) 2008-2009 Bjoern Hoehrmann
2 // Copyright (c) 2015, Ondrej Palkovsky
3 // Copyright (c) 2016, Winterland
4 
5 #include <string.h>
6 #include <stdio.h>
7 #include <stdint.h>
8 
9 
// Distinguished states of the UTF-8 decoding automaton. The values are
// offsets into the transition part of the utf8d table, so they must stay
// in sync with it.
enum {
  UTF8_ACCEPT = 0,  // a complete, valid code point has just been decoded
  UTF8_REJECT = 12  // the input is not valid UTF-8
};
12 
static const uint8_t utf8d[] = {
  // Part 1, indices 0..255: maps every possible input byte to a character
  // class. Working with classes instead of raw bytes keeps the transition
  // table small; decode() also uses the class of a lead byte as the shift
  // count that masks off its length-marker bits.
   0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,      // 0x00..0x0F
   0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,      // 0x10..0x1F
   0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,      // 0x20..0x2F
   0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,      // 0x30..0x3F
   0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,      // 0x40..0x4F
   0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,      // 0x50..0x5F
   0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,      // 0x60..0x6F
   0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,      // 0x70..0x7F
   1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,      // 0x80..0x8F
   9,9,9,9, 9,9,9,9, 9,9,9,9, 9,9,9,9,      // 0x90..0x9F
   7,7,7,7, 7,7,7,7, 7,7,7,7, 7,7,7,7,      // 0xA0..0xAF
   7,7,7,7, 7,7,7,7, 7,7,7,7, 7,7,7,7,      // 0xB0..0xBF
   8,8,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,      // 0xC0..0xCF
   2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,      // 0xD0..0xDF
  10,3,3,3, 3,3,3,3, 3,3,3,3, 3,4,3,3,      // 0xE0..0xEF
  11,6,6,6, 5,8,8,8, 8,8,8,8, 8,8,8,8,      // 0xF0..0xFF

  // Part 2, indices 256..363: the transition table. Each row belongs to one
  // automaton state (the state value is the row's offset from index 256) and
  // has one column per character class 0..11; the entry is the next state.
   0,12,24,36,60,96,84,12,12,12,48,72,      // state  0: accept / start
  12,12,12,12,12,12,12,12,12,12,12,12,      // state 12: reject (absorbing)
  12, 0,12,12,12,12,12, 0,12, 0,12,12,      // state 24: one continuation byte left
  12,24,12,12,12,12,12,24,12,24,12,12,      // state 36: two continuation bytes left
  12,12,12,12,12,12,12,24,12,12,12,12,      // state 48: after 0xE0, next must be 0xA0..0xBF
  12,24,12,12,12,12,12,12,12,24,12,12,      // state 60: after 0xED, next must be 0x80..0x9F
  12,12,12,12,12,12,12,36,12,36,12,12,      // state 72: after 0xF0, next must be 0x90..0xBF
  12,36,12,12,12,12,12,36,12,36,12,12,      // state 84: after 0xF1..0xF3, any continuation byte
  12,36,12,12,12,12,12,12,12,12,12,12,      // state 96: after 0xF4, next must be 0x80..0x8F
};
33 
decode(uint32_t * state,uint32_t * codep,uint32_t byte)34 static inline uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
35   uint32_t type = utf8d[byte];
36 
37   *codep = (*state != UTF8_ACCEPT) ?
38     (byte & 0x3fu) | (*codep << 6) :
39     (0xff >> type) & (byte);
40 
41   *state = utf8d[256 + *state + type];
42   return *state;
43 }
44 
// Converts one ASCII hexadecimal digit ('0'-'9', 'a'-'f', 'A'-'F') to its
// numeric value 0..15.
//
// Returns 0xFFFF for any other input. That value can never be a valid digit,
// so callers compare against it to detect malformed input.
static inline uint16_t decode_hex(uint32_t c)
{
  if ('0' <= c && c <= '9') {
    return (uint16_t) (c - '0');
  }
  if ('a' <= c && c <= 'f') {
    return (uint16_t) (10 + (c - 'a'));
  }
  if ('A' <= c && c <= 'F') {
    return (uint16_t) (10 + (c - 'A'));
  }
  return 0xFFFF;
}
52 
53 // Decode, return non-zero value on error
_js_decode_string(uint16_t * const dest,size_t * destoff,const uint8_t * s,const uint8_t * const srcend)54 int _js_decode_string(uint16_t *const dest, size_t *destoff,
55                   const uint8_t *s, const uint8_t *const srcend)
56 {
57   uint16_t *d = dest + *destoff;
58   uint32_t state = 0;
59   uint32_t codepoint;
60 
61   uint8_t surrogate = 0;
62   uint16_t temp_hex = 0;
63   uint16_t unidata;
64 
65   // Optimized version of dispatch when just an ASCII char is expected
66   #define DISPATCH_ASCII(label) {\
67     if (s >= srcend) {\
68       return -1;\
69     }\
70     codepoint = *s++;\
71     goto label;\
72   }
73 
74   standard:
75     // Test end of stream
76     while (s < srcend) {
77         if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
78           if (state == UTF8_REJECT) { return -1; }
79           continue;
80         }
81 
82         if (codepoint == '\\')
83           DISPATCH_ASCII(backslash)
84         else if (codepoint <= 0xffff)
85           *d++ = (uint16_t) codepoint;
86         else {
87           *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
88           *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
89         }
90     }
91     *destoff = d - dest;
92     // Exit point
93     return (state != UTF8_ACCEPT);
94   backslash:
95     switch (codepoint) {
96       case '"':
97       case '\\':
98       case '/':
99         *d++ = (uint16_t) codepoint;
100         goto standard;
101         break;
102       case 'b': *d++ = '\b';goto standard;
103       case 'f': *d++ = '\f';goto standard;
104       case 'n': *d++ = '\n';goto standard;
105       case 'r': *d++ = '\r';goto standard;
106       case 't': *d++ = '\t';goto standard;
107       case 'u': DISPATCH_ASCII(unicode1);;break;
108       default:
109         return -1;
110     }
111   unicode1:
112     temp_hex = decode_hex(codepoint);
113     if (temp_hex == 0xFFFF) { return -1; }
114     else unidata = temp_hex << 12;
115     DISPATCH_ASCII(unicode2);
116   unicode2:
117     temp_hex = decode_hex(codepoint);
118     if (temp_hex == 0xFFFF) { return -1; }
119     else unidata |= temp_hex << 8;
120     DISPATCH_ASCII(unicode3);
121   unicode3:
122     temp_hex = decode_hex(codepoint);
123     if (temp_hex == 0xFFFF) { return -1; }
124     else unidata |= temp_hex << 4;
125     DISPATCH_ASCII(unicode4);
126   unicode4:
127     temp_hex = decode_hex(codepoint);
128     if (temp_hex == 0xFFFF) { return -1; }
129     else unidata |= temp_hex;
130     *d++ = (uint16_t) unidata;
131 
132     if (surrogate) {
133       if (unidata < 0xDC00 || unidata > 0xDFFF) // is not low surrogate
134         return -1;
135       surrogate = 0;
136     } else if (unidata >= 0xD800 && unidata <= 0xDBFF ) { // is high surrogate
137         surrogate = 1;
138         DISPATCH_ASCII(surrogate1);
139     } else if (unidata >= 0xDC00 && unidata <= 0xDFFF) { // is low surrogate
140         return -1;
141     }
142     goto standard;
143   surrogate1:
144     if (codepoint != '\\') { return -1; }
145     DISPATCH_ASCII(surrogate2)
146   surrogate2:
147     if (codepoint != 'u') { return -1; }
148     DISPATCH_ASCII(unicode1)
149 }
150