xref: /qemu/qobject/json-lexer.c (revision ac06724a)
1 /*
2  * JSON lexer
3  *
4  * Copyright IBM, Corp. 2009
5  *
6  * Authors:
7  *  Anthony Liguori   <aliguori@us.ibm.com>
8  *
9  * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
10  * See the COPYING.LIB file in the top-level directory.
11  *
12  */
13 
14 #include "qemu/osdep.h"
15 #include "qemu-common.h"
16 #include "qapi/qmp/json-lexer.h"
17 
18 #define MAX_TOKEN_SIZE (64ULL << 20)
19 
20 /*
21  * Required by JSON (RFC 7159):
22  *
23  * \"([^\\\"]|\\[\"'\\/bfnrt]|\\u[0-9a-fA-F]{4})*\"
 * -?(0|[1-9][0-9]*)(\.[0-9]+)?([eE][-+]?[0-9]+)?
25  * [{}\[\],:]
26  * [a-z]+   # covers null, true, false
27  *
28  * Extension of '' strings:
29  *
30  * '([^\\']|\\[\"'\\/bfnrt]|\\u[0-9a-fA-F]{4})*'
31  *
32  * Extension for vararg handling in JSON construction:
33  *
34  * %((l|ll|I64)?d|[ipsf])
35  *
36  */
37 
38 enum json_lexer_state {
39     IN_ERROR = 0,               /* must really be 0, see json_lexer[] */
40     IN_DQ_UCODE3,
41     IN_DQ_UCODE2,
42     IN_DQ_UCODE1,
43     IN_DQ_UCODE0,
44     IN_DQ_STRING_ESCAPE,
45     IN_DQ_STRING,
46     IN_SQ_UCODE3,
47     IN_SQ_UCODE2,
48     IN_SQ_UCODE1,
49     IN_SQ_UCODE0,
50     IN_SQ_STRING_ESCAPE,
51     IN_SQ_STRING,
52     IN_ZERO,
53     IN_DIGITS,
54     IN_DIGIT,
55     IN_EXP_E,
56     IN_MANTISSA,
57     IN_MANTISSA_DIGITS,
58     IN_NONZERO_NUMBER,
59     IN_NEG_NONZERO_NUMBER,
60     IN_KEYWORD,
61     IN_ESCAPE,
62     IN_ESCAPE_L,
63     IN_ESCAPE_LL,
64     IN_ESCAPE_I,
65     IN_ESCAPE_I6,
66     IN_ESCAPE_I64,
67     IN_WHITESPACE,
68     IN_START,
69 };
70 
71 QEMU_BUILD_BUG_ON((int)JSON_MIN <= (int)IN_START);
72 
/*
 * Initializer fragment for one row of json_lexer[]: map every input byte
 * in 0x00..0x7F to TOKEN.  Designators that follow it in the same row
 * override individual bytes.
 */
#define TERMINAL(token) [0x00 ... 0x7F] = (token)

/*
 * Return whether NEXT is a terminal state and the transition to it from
 * PREV required lookahead.  This happens whenever PREV's table row uses
 * the TERMINAL macro, which is detected by inspecting the row's cell for
 * byte 0.
 */
#define TERMINAL_NEEDED_LOOKAHEAD(prev, next) \
            ((next) == json_lexer[(prev)][0])
80 
81 static const uint8_t json_lexer[][256] =  {
82     /* Relies on default initialization to IN_ERROR! */
83 
84     /* double quote string */
85     [IN_DQ_UCODE3] = {
86         ['0' ... '9'] = IN_DQ_STRING,
87         ['a' ... 'f'] = IN_DQ_STRING,
88         ['A' ... 'F'] = IN_DQ_STRING,
89     },
90     [IN_DQ_UCODE2] = {
91         ['0' ... '9'] = IN_DQ_UCODE3,
92         ['a' ... 'f'] = IN_DQ_UCODE3,
93         ['A' ... 'F'] = IN_DQ_UCODE3,
94     },
95     [IN_DQ_UCODE1] = {
96         ['0' ... '9'] = IN_DQ_UCODE2,
97         ['a' ... 'f'] = IN_DQ_UCODE2,
98         ['A' ... 'F'] = IN_DQ_UCODE2,
99     },
100     [IN_DQ_UCODE0] = {
101         ['0' ... '9'] = IN_DQ_UCODE1,
102         ['a' ... 'f'] = IN_DQ_UCODE1,
103         ['A' ... 'F'] = IN_DQ_UCODE1,
104     },
105     [IN_DQ_STRING_ESCAPE] = {
106         ['b'] = IN_DQ_STRING,
107         ['f'] =  IN_DQ_STRING,
108         ['n'] =  IN_DQ_STRING,
109         ['r'] =  IN_DQ_STRING,
110         ['t'] =  IN_DQ_STRING,
111         ['/'] = IN_DQ_STRING,
112         ['\\'] = IN_DQ_STRING,
113         ['\''] = IN_DQ_STRING,
114         ['\"'] = IN_DQ_STRING,
115         ['u'] = IN_DQ_UCODE0,
116     },
117     [IN_DQ_STRING] = {
118         [1 ... 0xBF] = IN_DQ_STRING,
119         [0xC2 ... 0xF4] = IN_DQ_STRING,
120         ['\\'] = IN_DQ_STRING_ESCAPE,
121         ['"'] = JSON_STRING,
122     },
123 
124     /* single quote string */
125     [IN_SQ_UCODE3] = {
126         ['0' ... '9'] = IN_SQ_STRING,
127         ['a' ... 'f'] = IN_SQ_STRING,
128         ['A' ... 'F'] = IN_SQ_STRING,
129     },
130     [IN_SQ_UCODE2] = {
131         ['0' ... '9'] = IN_SQ_UCODE3,
132         ['a' ... 'f'] = IN_SQ_UCODE3,
133         ['A' ... 'F'] = IN_SQ_UCODE3,
134     },
135     [IN_SQ_UCODE1] = {
136         ['0' ... '9'] = IN_SQ_UCODE2,
137         ['a' ... 'f'] = IN_SQ_UCODE2,
138         ['A' ... 'F'] = IN_SQ_UCODE2,
139     },
140     [IN_SQ_UCODE0] = {
141         ['0' ... '9'] = IN_SQ_UCODE1,
142         ['a' ... 'f'] = IN_SQ_UCODE1,
143         ['A' ... 'F'] = IN_SQ_UCODE1,
144     },
145     [IN_SQ_STRING_ESCAPE] = {
146         ['b'] = IN_SQ_STRING,
147         ['f'] =  IN_SQ_STRING,
148         ['n'] =  IN_SQ_STRING,
149         ['r'] =  IN_SQ_STRING,
150         ['t'] =  IN_SQ_STRING,
151         ['/'] = IN_SQ_STRING,
152         ['\\'] = IN_SQ_STRING,
153         ['\''] = IN_SQ_STRING,
154         ['\"'] = IN_SQ_STRING,
155         ['u'] = IN_SQ_UCODE0,
156     },
157     [IN_SQ_STRING] = {
158         [1 ... 0xBF] = IN_SQ_STRING,
159         [0xC2 ... 0xF4] = IN_SQ_STRING,
160         ['\\'] = IN_SQ_STRING_ESCAPE,
161         ['\''] = JSON_STRING,
162     },
163 
164     /* Zero */
165     [IN_ZERO] = {
166         TERMINAL(JSON_INTEGER),
167         ['0' ... '9'] = IN_ERROR,
168         ['.'] = IN_MANTISSA,
169     },
170 
171     /* Float */
172     [IN_DIGITS] = {
173         TERMINAL(JSON_FLOAT),
174         ['0' ... '9'] = IN_DIGITS,
175     },
176 
177     [IN_DIGIT] = {
178         ['0' ... '9'] = IN_DIGITS,
179     },
180 
181     [IN_EXP_E] = {
182         ['-'] = IN_DIGIT,
183         ['+'] = IN_DIGIT,
184         ['0' ... '9'] = IN_DIGITS,
185     },
186 
187     [IN_MANTISSA_DIGITS] = {
188         TERMINAL(JSON_FLOAT),
189         ['0' ... '9'] = IN_MANTISSA_DIGITS,
190         ['e'] = IN_EXP_E,
191         ['E'] = IN_EXP_E,
192     },
193 
194     [IN_MANTISSA] = {
195         ['0' ... '9'] = IN_MANTISSA_DIGITS,
196     },
197 
198     /* Number */
199     [IN_NONZERO_NUMBER] = {
200         TERMINAL(JSON_INTEGER),
201         ['0' ... '9'] = IN_NONZERO_NUMBER,
202         ['e'] = IN_EXP_E,
203         ['E'] = IN_EXP_E,
204         ['.'] = IN_MANTISSA,
205     },
206 
207     [IN_NEG_NONZERO_NUMBER] = {
208         ['0'] = IN_ZERO,
209         ['1' ... '9'] = IN_NONZERO_NUMBER,
210     },
211 
212     /* keywords */
213     [IN_KEYWORD] = {
214         TERMINAL(JSON_KEYWORD),
215         ['a' ... 'z'] = IN_KEYWORD,
216     },
217 
218     /* whitespace */
219     [IN_WHITESPACE] = {
220         TERMINAL(JSON_SKIP),
221         [' '] = IN_WHITESPACE,
222         ['\t'] = IN_WHITESPACE,
223         ['\r'] = IN_WHITESPACE,
224         ['\n'] = IN_WHITESPACE,
225     },
226 
227     /* escape */
228     [IN_ESCAPE_LL] = {
229         ['d'] = JSON_ESCAPE,
230     },
231 
232     [IN_ESCAPE_L] = {
233         ['d'] = JSON_ESCAPE,
234         ['l'] = IN_ESCAPE_LL,
235     },
236 
237     [IN_ESCAPE_I64] = {
238         ['d'] = JSON_ESCAPE,
239     },
240 
241     [IN_ESCAPE_I6] = {
242         ['4'] = IN_ESCAPE_I64,
243     },
244 
245     [IN_ESCAPE_I] = {
246         ['6'] = IN_ESCAPE_I6,
247     },
248 
249     [IN_ESCAPE] = {
250         ['d'] = JSON_ESCAPE,
251         ['i'] = JSON_ESCAPE,
252         ['p'] = JSON_ESCAPE,
253         ['s'] = JSON_ESCAPE,
254         ['f'] = JSON_ESCAPE,
255         ['l'] = IN_ESCAPE_L,
256         ['I'] = IN_ESCAPE_I,
257     },
258 
259     /* top level rule */
260     [IN_START] = {
261         ['"'] = IN_DQ_STRING,
262         ['\''] = IN_SQ_STRING,
263         ['0'] = IN_ZERO,
264         ['1' ... '9'] = IN_NONZERO_NUMBER,
265         ['-'] = IN_NEG_NONZERO_NUMBER,
266         ['{'] = JSON_LCURLY,
267         ['}'] = JSON_RCURLY,
268         ['['] = JSON_LSQUARE,
269         [']'] = JSON_RSQUARE,
270         [','] = JSON_COMMA,
271         [':'] = JSON_COLON,
272         ['a' ... 'z'] = IN_KEYWORD,
273         ['%'] = IN_ESCAPE,
274         [' '] = IN_WHITESPACE,
275         ['\t'] = IN_WHITESPACE,
276         ['\r'] = IN_WHITESPACE,
277         ['\n'] = IN_WHITESPACE,
278     },
279 };
280 
281 void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func)
282 {
283     lexer->emit = func;
284     lexer->state = IN_START;
285     lexer->token = g_string_sized_new(3);
286     lexer->x = lexer->y = 0;
287 }
288 
289 static int json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush)
290 {
291     int char_consumed, new_state;
292 
293     lexer->x++;
294     if (ch == '\n') {
295         lexer->x = 0;
296         lexer->y++;
297     }
298 
299     do {
300         assert(lexer->state <= ARRAY_SIZE(json_lexer));
301         new_state = json_lexer[lexer->state][(uint8_t)ch];
302         char_consumed = !TERMINAL_NEEDED_LOOKAHEAD(lexer->state, new_state);
303         if (char_consumed) {
304             g_string_append_c(lexer->token, ch);
305         }
306 
307         switch (new_state) {
308         case JSON_LCURLY:
309         case JSON_RCURLY:
310         case JSON_LSQUARE:
311         case JSON_RSQUARE:
312         case JSON_COLON:
313         case JSON_COMMA:
314         case JSON_ESCAPE:
315         case JSON_INTEGER:
316         case JSON_FLOAT:
317         case JSON_KEYWORD:
318         case JSON_STRING:
319             lexer->emit(lexer, lexer->token, new_state, lexer->x, lexer->y);
320             /* fall through */
321         case JSON_SKIP:
322             g_string_truncate(lexer->token, 0);
323             new_state = IN_START;
324             break;
325         case IN_ERROR:
326             /* XXX: To avoid having previous bad input leaving the parser in an
327              * unresponsive state where we consume unpredictable amounts of
328              * subsequent "good" input, percolate this error state up to the
329              * tokenizer/parser by forcing a NULL object to be emitted, then
330              * reset state.
331              *
332              * Also note that this handling is required for reliable channel
333              * negotiation between QMP and the guest agent, since chr(0xFF)
334              * is placed at the beginning of certain events to ensure proper
335              * delivery when the channel is in an unknown state. chr(0xFF) is
336              * never a valid ASCII/UTF-8 sequence, so this should reliably
337              * induce an error/flush state.
338              */
339             lexer->emit(lexer, lexer->token, JSON_ERROR, lexer->x, lexer->y);
340             g_string_truncate(lexer->token, 0);
341             new_state = IN_START;
342             lexer->state = new_state;
343             return 0;
344         default:
345             break;
346         }
347         lexer->state = new_state;
348     } while (!char_consumed && !flush);
349 
350     /* Do not let a single token grow to an arbitrarily large size,
351      * this is a security consideration.
352      */
353     if (lexer->token->len > MAX_TOKEN_SIZE) {
354         lexer->emit(lexer, lexer->token, lexer->state, lexer->x, lexer->y);
355         g_string_truncate(lexer->token, 0);
356         lexer->state = IN_START;
357     }
358 
359     return 0;
360 }
361 
362 int json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size)
363 {
364     size_t i;
365 
366     for (i = 0; i < size; i++) {
367         int err;
368 
369         err = json_lexer_feed_char(lexer, buffer[i], false);
370         if (err < 0) {
371             return err;
372         }
373     }
374 
375     return 0;
376 }
377 
378 int json_lexer_flush(JSONLexer *lexer)
379 {
380     return lexer->state == IN_START ? 0 : json_lexer_feed_char(lexer, 0, true);
381 }
382 
383 void json_lexer_destroy(JSONLexer *lexer)
384 {
385     g_string_free(lexer->token, true);
386 }
387