xref: /qemu/qobject/json-lexer.c (revision 6402cbbb)
/*
 * JSON lexer
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "qemu-common.h"
#include "qapi/qmp/json-lexer.h"

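/* Cap on the size of any single token; 64ULL << 20 is 64 MiB.  Enforced at
 * the end of json_lexer_feed_char() as a defence against unbounded growth
 * of the token buffer.
 */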
#define MAX_TOKEN_SIZE (64ULL << 20)

/*
 * Required by JSON (RFC 7159):
 *
 * \"([^\\\"]|\\[\"'\\/bfnrt]|\\u[0-9a-fA-F]{4})*\"
 * -?(0|[1-9][0-9]*)(\.[0-9]+)?([eE][-+]?[0-9]+)?
 * [{}\[\],:]
 * [a-z]+   # covers null, true, false
 *
 * Extension of '' strings:
 *
 * '([^\\']|\\[\"'\\/bfnrt]|\\u[0-9a-fA-F]{4})*'
 *
 * Extension for vararg handling in JSON construction:
 *
 * %((l|ll|I64)?[du]|[ipsf])
 *
 */
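
/* Example: with the vararg extension, feeding "{ 'speed': %lld }" lexes
 * "%lld" as a single JSON_ESCAPE token, so the JSON constructor above this
 * layer can splice a caller-supplied value in at that position.
 */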

enum json_lexer_state {
    IN_ERROR = 0,               /* must really be 0, see json_lexer[] */
    IN_DQ_UCODE3,
    IN_DQ_UCODE2,
    IN_DQ_UCODE1,
    IN_DQ_UCODE0,
    IN_DQ_STRING_ESCAPE,
    IN_DQ_STRING,
    IN_SQ_UCODE3,
    IN_SQ_UCODE2,
    IN_SQ_UCODE1,
    IN_SQ_UCODE0,
    IN_SQ_STRING_ESCAPE,
    IN_SQ_STRING,
    IN_ZERO,
    IN_DIGITS,
    IN_DIGIT,
    IN_EXP_E,
    IN_MANTISSA,
    IN_MANTISSA_DIGITS,
    IN_NONZERO_NUMBER,
    IN_NEG_NONZERO_NUMBER,
    IN_KEYWORD,
    IN_ESCAPE,
    IN_ESCAPE_L,
    IN_ESCAPE_LL,
    IN_ESCAPE_I,
    IN_ESCAPE_I6,
    IN_ESCAPE_I64,
    IN_WHITESPACE,
    IN_START,
};

QEMU_BUILD_BUG_ON((int)JSON_MIN <= (int)IN_START);

#define TERMINAL(state) [0 ... 0x7F] = (state)

/* Return whether TERMINAL is a terminal state and the transition to it
   from OLD_STATE required lookahead.  This happens whenever the table
   below uses the TERMINAL macro.  */
#define TERMINAL_NEEDED_LOOKAHEAD(old_state, terminal) \
            (json_lexer[(old_state)][0] == (terminal))
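
/* Entries in the table below hold either another IN_* lexer state or a
 * JSON_* token value; the QEMU_BUILD_BUG_ON above guarantees the two ranges
 * cannot collide.
 *
 * Lookahead example: while lexing "true ", the lexer sits in IN_KEYWORD when
 * the space arrives.  IN_KEYWORD's TERMINAL(JSON_KEYWORD) entry maps the
 * space to JSON_KEYWORD, the keyword token is emitted, and because that
 * transition needed lookahead the space is not consumed;
 * json_lexer_feed_char() then re-runs the same character from IN_START,
 * where it begins IN_WHITESPACE.
 */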

static const uint8_t json_lexer[][256] = {
    /* Relies on default initialization to IN_ERROR! */

    /* double quote string */
    [IN_DQ_UCODE3] = {
        ['0' ... '9'] = IN_DQ_STRING,
        ['a' ... 'f'] = IN_DQ_STRING,
        ['A' ... 'F'] = IN_DQ_STRING,
    },
    [IN_DQ_UCODE2] = {
        ['0' ... '9'] = IN_DQ_UCODE3,
        ['a' ... 'f'] = IN_DQ_UCODE3,
        ['A' ... 'F'] = IN_DQ_UCODE3,
    },
    [IN_DQ_UCODE1] = {
        ['0' ... '9'] = IN_DQ_UCODE2,
        ['a' ... 'f'] = IN_DQ_UCODE2,
        ['A' ... 'F'] = IN_DQ_UCODE2,
    },
    [IN_DQ_UCODE0] = {
        ['0' ... '9'] = IN_DQ_UCODE1,
        ['a' ... 'f'] = IN_DQ_UCODE1,
        ['A' ... 'F'] = IN_DQ_UCODE1,
    },
    [IN_DQ_STRING_ESCAPE] = {
        ['b'] = IN_DQ_STRING,
        ['f'] = IN_DQ_STRING,
        ['n'] = IN_DQ_STRING,
        ['r'] = IN_DQ_STRING,
        ['t'] = IN_DQ_STRING,
        ['/'] = IN_DQ_STRING,
        ['\\'] = IN_DQ_STRING,
        ['\''] = IN_DQ_STRING,
        ['\"'] = IN_DQ_STRING,
        ['u'] = IN_DQ_UCODE0,
    },
    [IN_DQ_STRING] = {
        [1 ... 0xBF] = IN_DQ_STRING,
        [0xC2 ... 0xF4] = IN_DQ_STRING,
        ['\\'] = IN_DQ_STRING_ESCAPE,
        ['"'] = JSON_STRING,
    },

    /* single quote string */
    [IN_SQ_UCODE3] = {
        ['0' ... '9'] = IN_SQ_STRING,
        ['a' ... 'f'] = IN_SQ_STRING,
        ['A' ... 'F'] = IN_SQ_STRING,
    },
    [IN_SQ_UCODE2] = {
        ['0' ... '9'] = IN_SQ_UCODE3,
        ['a' ... 'f'] = IN_SQ_UCODE3,
        ['A' ... 'F'] = IN_SQ_UCODE3,
    },
    [IN_SQ_UCODE1] = {
        ['0' ... '9'] = IN_SQ_UCODE2,
        ['a' ... 'f'] = IN_SQ_UCODE2,
        ['A' ... 'F'] = IN_SQ_UCODE2,
    },
    [IN_SQ_UCODE0] = {
        ['0' ... '9'] = IN_SQ_UCODE1,
        ['a' ... 'f'] = IN_SQ_UCODE1,
        ['A' ... 'F'] = IN_SQ_UCODE1,
    },
    [IN_SQ_STRING_ESCAPE] = {
        ['b'] = IN_SQ_STRING,
        ['f'] = IN_SQ_STRING,
        ['n'] = IN_SQ_STRING,
        ['r'] = IN_SQ_STRING,
        ['t'] = IN_SQ_STRING,
        ['/'] = IN_SQ_STRING,
        ['\\'] = IN_SQ_STRING,
        ['\''] = IN_SQ_STRING,
        ['\"'] = IN_SQ_STRING,
        ['u'] = IN_SQ_UCODE0,
    },
    [IN_SQ_STRING] = {
        [1 ... 0xBF] = IN_SQ_STRING,
        [0xC2 ... 0xF4] = IN_SQ_STRING,
        ['\\'] = IN_SQ_STRING_ESCAPE,
        ['\''] = JSON_STRING,
    },

    /* Zero */
    [IN_ZERO] = {
        TERMINAL(JSON_INTEGER),
        ['0' ... '9'] = IN_ERROR,
        ['.'] = IN_MANTISSA,
    },

    /* Float */
    [IN_DIGITS] = {
        TERMINAL(JSON_FLOAT),
        ['0' ... '9'] = IN_DIGITS,
    },

    [IN_DIGIT] = {
        ['0' ... '9'] = IN_DIGITS,
    },

    [IN_EXP_E] = {
        ['-'] = IN_DIGIT,
        ['+'] = IN_DIGIT,
        ['0' ... '9'] = IN_DIGITS,
    },

    [IN_MANTISSA_DIGITS] = {
        TERMINAL(JSON_FLOAT),
        ['0' ... '9'] = IN_MANTISSA_DIGITS,
        ['e'] = IN_EXP_E,
        ['E'] = IN_EXP_E,
    },

    [IN_MANTISSA] = {
        ['0' ... '9'] = IN_MANTISSA_DIGITS,
    },

    /* Number */
    [IN_NONZERO_NUMBER] = {
        TERMINAL(JSON_INTEGER),
        ['0' ... '9'] = IN_NONZERO_NUMBER,
        ['e'] = IN_EXP_E,
        ['E'] = IN_EXP_E,
        ['.'] = IN_MANTISSA,
    },

    [IN_NEG_NONZERO_NUMBER] = {
        ['0'] = IN_ZERO,
        ['1' ... '9'] = IN_NONZERO_NUMBER,
    },

    /* keywords */
    [IN_KEYWORD] = {
        TERMINAL(JSON_KEYWORD),
        ['a' ... 'z'] = IN_KEYWORD,
    },

    /* whitespace */
    [IN_WHITESPACE] = {
        TERMINAL(JSON_SKIP),
        [' '] = IN_WHITESPACE,
        ['\t'] = IN_WHITESPACE,
        ['\r'] = IN_WHITESPACE,
        ['\n'] = IN_WHITESPACE,
    },

    /* escape */
    [IN_ESCAPE_LL] = {
        ['d'] = JSON_ESCAPE,
        ['u'] = JSON_ESCAPE,
    },

    [IN_ESCAPE_L] = {
        ['d'] = JSON_ESCAPE,
        ['l'] = IN_ESCAPE_LL,
        ['u'] = JSON_ESCAPE,
    },

    [IN_ESCAPE_I64] = {
        ['d'] = JSON_ESCAPE,
        ['u'] = JSON_ESCAPE,
    },

    [IN_ESCAPE_I6] = {
        ['4'] = IN_ESCAPE_I64,
    },

    [IN_ESCAPE_I] = {
        ['6'] = IN_ESCAPE_I6,
    },

    [IN_ESCAPE] = {
        ['d'] = JSON_ESCAPE,
        ['i'] = JSON_ESCAPE,
        ['p'] = JSON_ESCAPE,
        ['s'] = JSON_ESCAPE,
        ['u'] = JSON_ESCAPE,
        ['f'] = JSON_ESCAPE,
        ['l'] = IN_ESCAPE_L,
        ['I'] = IN_ESCAPE_I,
    },

    /* top level rule */
    [IN_START] = {
        ['"'] = IN_DQ_STRING,
        ['\''] = IN_SQ_STRING,
        ['0'] = IN_ZERO,
        ['1' ... '9'] = IN_NONZERO_NUMBER,
        ['-'] = IN_NEG_NONZERO_NUMBER,
        ['{'] = JSON_LCURLY,
        ['}'] = JSON_RCURLY,
        ['['] = JSON_LSQUARE,
        [']'] = JSON_RSQUARE,
        [','] = JSON_COMMA,
        [':'] = JSON_COLON,
        ['a' ... 'z'] = IN_KEYWORD,
        ['%'] = IN_ESCAPE,
        [' '] = IN_WHITESPACE,
        ['\t'] = IN_WHITESPACE,
        ['\r'] = IN_WHITESPACE,
        ['\n'] = IN_WHITESPACE,
    },
};

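/* Public entry points.
 *
 * A minimal usage sketch, assuming the JSONLexer/JSONLexerEmitter
 * declarations in qapi/qmp/json-lexer.h match the callback invocation made
 * from json_lexer_feed_char() below (lexer, token GString, token type,
 * x/y position):
 *
 *     static void emit(JSONLexer *lexer, GString *token,
 *                      JSONTokenType type, int x, int y)
 *     {
 *         ...handle one token...
 *     }
 *
 *     JSONLexer lexer;
 *     json_lexer_init(&lexer, emit);
 *     json_lexer_feed(&lexer, buf, len);
 *     json_lexer_flush(&lexer);
 *     json_lexer_destroy(&lexer);
 */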
void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func)
{
    lexer->emit = func;
    lexer->state = IN_START;
    lexer->token = g_string_sized_new(3);
    lexer->x = lexer->y = 0;
}

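/* Feed a single character into the state machine.  x and y track column and
 * line for error reporting.  A transition into a JSON_* pseudo-state emits
 * the accumulated token; if that transition needed lookahead, the loop
 * re-runs the same character from IN_START.  Tokens exceeding MAX_TOKEN_SIZE
 * are emitted early and the lexer reset, so a single token cannot grow
 * without bound.
 */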
static int json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush)
{
    int char_consumed, new_state;

    lexer->x++;
    if (ch == '\n') {
        lexer->x = 0;
        lexer->y++;
    }

    do {
        assert(lexer->state < ARRAY_SIZE(json_lexer));
        new_state = json_lexer[lexer->state][(uint8_t)ch];
        char_consumed = !TERMINAL_NEEDED_LOOKAHEAD(lexer->state, new_state);
        if (char_consumed) {
            g_string_append_c(lexer->token, ch);
        }

        switch (new_state) {
        case JSON_LCURLY:
        case JSON_RCURLY:
        case JSON_LSQUARE:
        case JSON_RSQUARE:
        case JSON_COLON:
        case JSON_COMMA:
        case JSON_ESCAPE:
        case JSON_INTEGER:
        case JSON_FLOAT:
        case JSON_KEYWORD:
        case JSON_STRING:
            lexer->emit(lexer, lexer->token, new_state, lexer->x, lexer->y);
            /* fall through */
        case JSON_SKIP:
            g_string_truncate(lexer->token, 0);
            new_state = IN_START;
            break;
        case IN_ERROR:
            /* XXX: To avoid having previous bad input leaving the parser in an
             * unresponsive state where we consume unpredictable amounts of
             * subsequent "good" input, percolate this error state up to the
             * tokenizer/parser by forcing a NULL object to be emitted, then
             * reset state.
             *
             * Also note that this handling is required for reliable channel
             * negotiation between QMP and the guest agent, since chr(0xFF)
             * is placed at the beginning of certain events to ensure proper
             * delivery when the channel is in an unknown state. chr(0xFF) is
             * never a valid ASCII/UTF-8 sequence, so this should reliably
             * induce an error/flush state.
             */
            lexer->emit(lexer, lexer->token, JSON_ERROR, lexer->x, lexer->y);
            g_string_truncate(lexer->token, 0);
            new_state = IN_START;
            lexer->state = new_state;
            return 0;
        default:
            break;
        }
        lexer->state = new_state;
    } while (!char_consumed && !flush);

    /* Do not let a single token grow to an arbitrarily large size,
     * this is a security consideration.
     */
    if (lexer->token->len > MAX_TOKEN_SIZE) {
        lexer->emit(lexer, lexer->token, lexer->state, lexer->x, lexer->y);
        g_string_truncate(lexer->token, 0);
        lexer->state = IN_START;
    }

    return 0;
}

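/* Feed a buffer of SIZE bytes, one character at a time.  Currently
 * json_lexer_feed_char() always returns 0, so this reports success
 * unconditionally.
 */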
int json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size)
{
    size_t i;

    for (i = 0; i < size; i++) {
        int err;

        err = json_lexer_feed_char(lexer, buffer[i], false);
        if (err < 0) {
            return err;
        }
    }

    return 0;
}

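/* Force out any partially accumulated token.  Feeding a NUL byte with
 * flush=true drives the current state to its terminal entry (or to IN_ERROR)
 * without waiting for more input; a no-op when already in IN_START.
 */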
int json_lexer_flush(JSONLexer *lexer)
{
    return lexer->state == IN_START ? 0 : json_lexer_feed_char(lexer, 0, true);
}

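/* Release the token buffer; the JSONLexer struct itself remains owned by the
 * caller and is not freed here.
 */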
void json_lexer_destroy(JSONLexer *lexer)
{
    g_string_free(lexer->token, true);
}