xref: /qemu/qobject/json-parser.c (revision 727385c4)
1 /*
2  * JSON Parser
3  *
4  * Copyright IBM, Corp. 2009
5  *
6  * Authors:
7  *  Anthony Liguori   <aliguori@us.ibm.com>
8  *
9  * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
10  * See the COPYING.LIB file in the top-level directory.
11  *
12  */
13 
14 #include "qemu/osdep.h"
15 #include "qemu/ctype.h"
16 #include "qemu/cutils.h"
17 #include "qemu/unicode.h"
18 #include "qapi/error.h"
19 #include "qapi/qmp/qbool.h"
20 #include "qapi/qmp/qdict.h"
21 #include "qapi/qmp/qlist.h"
22 #include "qapi/qmp/qnull.h"
23 #include "qapi/qmp/qnum.h"
24 #include "qapi/qmp/qstring.h"
25 #include "json-parser-int.h"
26 
/*
 * One token handed to the parser.  json_token() allocates the structure
 * together with room for the token text in the trailing flexible array.
 */
struct JSONToken {
    JSONTokenType type; /* token class, e.g. JSON_STRING or JSON_COLON */
    int x;              /* stored by json_token(); not read in this file
                         * (presumably a source position -- confirm
                         * against the lexer) */
    int y;              /* likewise */
    char str[];         /* NUL-terminated copy of the token text */
};
33 
/*
 * State shared by all parsing functions during one json_parser_parse()
 * call.
 */
typedef struct JSONParserContext {
    Error *err;         /* first error recorded by parse_error(), else NULL */
    JSONToken *current; /* token most recently popped; freed by the next
                         * pop and at the end of json_parser_parse() */
    GQueue *buf;        /* tokens not consumed yet, head is next */
    va_list *ap;        /* arguments for %-interpolation; parse_string()
                         * treats NULL as "interpolation disabled" */
} JSONParserContext;
40 
/* Trip an assertion when @cond is true. */
#define BUG_ON(cond) assert(!(cond))

/**
 * TODO
 *
 * 0) make errors meaningful again
 * 1) add geometry information to tokens
 * 3) should we return a parsed size?
 * 4) deal with premature EOI
 */

/*
 * Forward declaration: parse_value() dispatches to the container
 * parsers, which in turn call parse_value() for their elements.
 */
static QObject *parse_value(JSONParserContext *ctxt);
53 
54 /**
55  * Error handler
56  */
57 static void GCC_FMT_ATTR(3, 4) parse_error(JSONParserContext *ctxt,
58                                            JSONToken *token, const char *msg, ...)
59 {
60     va_list ap;
61     char message[1024];
62 
63     if (ctxt->err) {
64         return;
65     }
66     va_start(ap, msg);
67     vsnprintf(message, sizeof(message), msg, ap);
68     va_end(ap);
69     error_setg(&ctxt->err, "JSON parse error, %s", message);
70 }
71 
/**
 * cvt4hex(): Convert exactly four hexadecimal digits to an integer
 *
 * @s: text to convert; only the first four characters are examined
 *
 * Both upper and lower case digits are accepted.  Scanning stops at the
 * first character that is not a hex digit, so a string shorter than
 * four characters is never read past its terminating NUL.
 *
 * Returns: the value in the range 0..0xFFFF, or -1 when any of the
 * four characters is not a hexadecimal digit.
 */
static int cvt4hex(const char *s)
{
    int cp = 0;
    int i;

    for (i = 0; i < 4; i++) {
        char c = s[i];
        int digit;

        /* Classify once; anything outside the three ranges is invalid */
        if (c >= '0' && c <= '9') {
            digit = c - '0';
        } else if (c >= 'a' && c <= 'f') {
            digit = 10 + c - 'a';
        } else if (c >= 'A' && c <= 'F') {
            digit = 10 + c - 'A';
        } else {
            return -1;
        }
        cp = (cp << 4) | digit;
    }
    return cp;
}
94 
95 /**
96  * parse_string(): Parse a JSON string
97  *
98  * From RFC 8259 "The JavaScript Object Notation (JSON) Data
99  * Interchange Format":
100  *
101  *    char = unescaped /
102  *        escape (
103  *            %x22 /          ; "    quotation mark  U+0022
104  *            %x5C /          ; \    reverse solidus U+005C
105  *            %x2F /          ; /    solidus         U+002F
106  *            %x62 /          ; b    backspace       U+0008
107  *            %x66 /          ; f    form feed       U+000C
108  *            %x6E /          ; n    line feed       U+000A
109  *            %x72 /          ; r    carriage return U+000D
110  *            %x74 /          ; t    tab             U+0009
111  *            %x75 4HEXDIG )  ; uXXXX                U+XXXX
112  *    escape = %x5C              ; \
113  *    quotation-mark = %x22      ; "
114  *    unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
115  *
116  * Extensions over RFC 8259:
117  * - Extra escape sequence in strings:
118  *   0x27 (apostrophe) is recognized after escape, too
119  * - Single-quoted strings:
120  *   Like double-quoted strings, except they're delimited by %x27
121  *   (apostrophe) instead of %x22 (quotation mark), and can't contain
122  *   unescaped apostrophe, but can contain unescaped quotation mark.
123  *
124  * Note:
125  * - Encoding is modified UTF-8.
126  * - Invalid Unicode characters are rejected.
127  * - Control characters \x00..\x1F are rejected by the lexer.
128  */
129 static QString *parse_string(JSONParserContext *ctxt, JSONToken *token)
130 {
131     const char *ptr = token->str;
132     GString *str;
133     char quote;
134     const char *beg;
135     int cp, trailing;
136     char *end;
137     ssize_t len;
138     char utf8_buf[5];
139 
140     assert(*ptr == '"' || *ptr == '\'');
141     quote = *ptr++;
142     str = g_string_new(NULL);
143 
144     while (*ptr != quote) {
145         assert(*ptr);
146         switch (*ptr) {
147         case '\\':
148             beg = ptr++;
149             switch (*ptr++) {
150             case '"':
151                 g_string_append_c(str, '"');
152                 break;
153             case '\'':
154                 g_string_append_c(str, '\'');
155                 break;
156             case '\\':
157                 g_string_append_c(str, '\\');
158                 break;
159             case '/':
160                 g_string_append_c(str, '/');
161                 break;
162             case 'b':
163                 g_string_append_c(str, '\b');
164                 break;
165             case 'f':
166                 g_string_append_c(str, '\f');
167                 break;
168             case 'n':
169                 g_string_append_c(str, '\n');
170                 break;
171             case 'r':
172                 g_string_append_c(str, '\r');
173                 break;
174             case 't':
175                 g_string_append_c(str, '\t');
176                 break;
177             case 'u':
178                 cp = cvt4hex(ptr);
179                 ptr += 4;
180 
181                 /* handle surrogate pairs */
182                 if (cp >= 0xD800 && cp <= 0xDBFF
183                     && ptr[0] == '\\' && ptr[1] == 'u') {
184                     /* leading surrogate followed by \u */
185                     cp = 0x10000 + ((cp & 0x3FF) << 10);
186                     trailing = cvt4hex(ptr + 2);
187                     if (trailing >= 0xDC00 && trailing <= 0xDFFF) {
188                         /* followed by trailing surrogate */
189                         cp |= trailing & 0x3FF;
190                         ptr += 6;
191                     } else {
192                         cp = -1; /* invalid */
193                     }
194                 }
195 
196                 if (mod_utf8_encode(utf8_buf, sizeof(utf8_buf), cp) < 0) {
197                     parse_error(ctxt, token,
198                                 "%.*s is not a valid Unicode character",
199                                 (int)(ptr - beg), beg);
200                     goto out;
201                 }
202                 g_string_append(str, utf8_buf);
203                 break;
204             default:
205                 parse_error(ctxt, token, "invalid escape sequence in string");
206                 goto out;
207             }
208             break;
209         case '%':
210             if (ctxt->ap) {
211                 if (ptr[1] != '%') {
212                     parse_error(ctxt, token, "can't interpolate into string");
213                     goto out;
214                 }
215                 ptr++;
216             }
217             /* fall through */
218         default:
219             cp = mod_utf8_codepoint(ptr, 6, &end);
220             if (cp < 0) {
221                 parse_error(ctxt, token, "invalid UTF-8 sequence in string");
222                 goto out;
223             }
224             ptr = end;
225             len = mod_utf8_encode(utf8_buf, sizeof(utf8_buf), cp);
226             assert(len >= 0);
227             g_string_append(str, utf8_buf);
228         }
229     }
230 
231     return qstring_from_gstring(str);
232 
233 out:
234     g_string_free(str, true);
235     return NULL;
236 }
237 
238 /* Note: the token object returned by parser_context_peek_token or
239  * parser_context_pop_token is deleted as soon as parser_context_pop_token
240  * is called again.
241  */
242 static JSONToken *parser_context_pop_token(JSONParserContext *ctxt)
243 {
244     g_free(ctxt->current);
245     ctxt->current = g_queue_pop_head(ctxt->buf);
246     return ctxt->current;
247 }
248 
249 static JSONToken *parser_context_peek_token(JSONParserContext *ctxt)
250 {
251     return g_queue_peek_head(ctxt->buf);
252 }
253 
254 /**
255  * Parsing rules
256  */
257 static int parse_pair(JSONParserContext *ctxt, QDict *dict)
258 {
259     QObject *key_obj = NULL;
260     QString *key;
261     QObject *value;
262     JSONToken *peek, *token;
263 
264     peek = parser_context_peek_token(ctxt);
265     if (peek == NULL) {
266         parse_error(ctxt, NULL, "premature EOI");
267         goto out;
268     }
269 
270     key_obj = parse_value(ctxt);
271     key = qobject_to(QString, key_obj);
272     if (!key) {
273         parse_error(ctxt, peek, "key is not a string in object");
274         goto out;
275     }
276 
277     token = parser_context_pop_token(ctxt);
278     if (token == NULL) {
279         parse_error(ctxt, NULL, "premature EOI");
280         goto out;
281     }
282 
283     if (token->type != JSON_COLON) {
284         parse_error(ctxt, token, "missing : in object pair");
285         goto out;
286     }
287 
288     value = parse_value(ctxt);
289     if (value == NULL) {
290         parse_error(ctxt, token, "Missing value in dict");
291         goto out;
292     }
293 
294     if (qdict_haskey(dict, qstring_get_str(key))) {
295         parse_error(ctxt, token, "duplicate key");
296         goto out;
297     }
298 
299     qdict_put_obj(dict, qstring_get_str(key), value);
300 
301     qobject_unref(key_obj);
302     return 0;
303 
304 out:
305     qobject_unref(key_obj);
306     return -1;
307 }
308 
309 static QObject *parse_object(JSONParserContext *ctxt)
310 {
311     QDict *dict = NULL;
312     JSONToken *token, *peek;
313 
314     token = parser_context_pop_token(ctxt);
315     assert(token && token->type == JSON_LCURLY);
316 
317     dict = qdict_new();
318 
319     peek = parser_context_peek_token(ctxt);
320     if (peek == NULL) {
321         parse_error(ctxt, NULL, "premature EOI");
322         goto out;
323     }
324 
325     if (peek->type != JSON_RCURLY) {
326         if (parse_pair(ctxt, dict) == -1) {
327             goto out;
328         }
329 
330         token = parser_context_pop_token(ctxt);
331         if (token == NULL) {
332             parse_error(ctxt, NULL, "premature EOI");
333             goto out;
334         }
335 
336         while (token->type != JSON_RCURLY) {
337             if (token->type != JSON_COMMA) {
338                 parse_error(ctxt, token, "expected separator in dict");
339                 goto out;
340             }
341 
342             if (parse_pair(ctxt, dict) == -1) {
343                 goto out;
344             }
345 
346             token = parser_context_pop_token(ctxt);
347             if (token == NULL) {
348                 parse_error(ctxt, NULL, "premature EOI");
349                 goto out;
350             }
351         }
352     } else {
353         (void)parser_context_pop_token(ctxt);
354     }
355 
356     return QOBJECT(dict);
357 
358 out:
359     qobject_unref(dict);
360     return NULL;
361 }
362 
363 static QObject *parse_array(JSONParserContext *ctxt)
364 {
365     QList *list = NULL;
366     JSONToken *token, *peek;
367 
368     token = parser_context_pop_token(ctxt);
369     assert(token && token->type == JSON_LSQUARE);
370 
371     list = qlist_new();
372 
373     peek = parser_context_peek_token(ctxt);
374     if (peek == NULL) {
375         parse_error(ctxt, NULL, "premature EOI");
376         goto out;
377     }
378 
379     if (peek->type != JSON_RSQUARE) {
380         QObject *obj;
381 
382         obj = parse_value(ctxt);
383         if (obj == NULL) {
384             parse_error(ctxt, token, "expecting value");
385             goto out;
386         }
387 
388         qlist_append_obj(list, obj);
389 
390         token = parser_context_pop_token(ctxt);
391         if (token == NULL) {
392             parse_error(ctxt, NULL, "premature EOI");
393             goto out;
394         }
395 
396         while (token->type != JSON_RSQUARE) {
397             if (token->type != JSON_COMMA) {
398                 parse_error(ctxt, token, "expected separator in list");
399                 goto out;
400             }
401 
402             obj = parse_value(ctxt);
403             if (obj == NULL) {
404                 parse_error(ctxt, token, "expecting value");
405                 goto out;
406             }
407 
408             qlist_append_obj(list, obj);
409 
410             token = parser_context_pop_token(ctxt);
411             if (token == NULL) {
412                 parse_error(ctxt, NULL, "premature EOI");
413                 goto out;
414             }
415         }
416     } else {
417         (void)parser_context_pop_token(ctxt);
418     }
419 
420     return QOBJECT(list);
421 
422 out:
423     qobject_unref(list);
424     return NULL;
425 }
426 
427 static QObject *parse_keyword(JSONParserContext *ctxt)
428 {
429     JSONToken *token;
430 
431     token = parser_context_pop_token(ctxt);
432     assert(token && token->type == JSON_KEYWORD);
433 
434     if (!strcmp(token->str, "true")) {
435         return QOBJECT(qbool_from_bool(true));
436     } else if (!strcmp(token->str, "false")) {
437         return QOBJECT(qbool_from_bool(false));
438     } else if (!strcmp(token->str, "null")) {
439         return QOBJECT(qnull());
440     }
441     parse_error(ctxt, token, "invalid keyword '%s'", token->str);
442     return NULL;
443 }
444 
445 static QObject *parse_interpolation(JSONParserContext *ctxt)
446 {
447     JSONToken *token;
448 
449     token = parser_context_pop_token(ctxt);
450     assert(token && token->type == JSON_INTERP);
451 
452     if (!strcmp(token->str, "%p")) {
453         return va_arg(*ctxt->ap, QObject *);
454     } else if (!strcmp(token->str, "%i")) {
455         return QOBJECT(qbool_from_bool(va_arg(*ctxt->ap, int)));
456     } else if (!strcmp(token->str, "%d")) {
457         return QOBJECT(qnum_from_int(va_arg(*ctxt->ap, int)));
458     } else if (!strcmp(token->str, "%ld")) {
459         return QOBJECT(qnum_from_int(va_arg(*ctxt->ap, long)));
460     } else if (!strcmp(token->str, "%lld")) {
461         return QOBJECT(qnum_from_int(va_arg(*ctxt->ap, long long)));
462     } else if (!strcmp(token->str, "%" PRId64)) {
463         return QOBJECT(qnum_from_int(va_arg(*ctxt->ap, int64_t)));
464     } else if (!strcmp(token->str, "%u")) {
465         return QOBJECT(qnum_from_uint(va_arg(*ctxt->ap, unsigned int)));
466     } else if (!strcmp(token->str, "%lu")) {
467         return QOBJECT(qnum_from_uint(va_arg(*ctxt->ap, unsigned long)));
468     } else if (!strcmp(token->str, "%llu")) {
469         return QOBJECT(qnum_from_uint(va_arg(*ctxt->ap, unsigned long long)));
470     } else if (!strcmp(token->str, "%" PRIu64)) {
471         return QOBJECT(qnum_from_uint(va_arg(*ctxt->ap, uint64_t)));
472     } else if (!strcmp(token->str, "%s")) {
473         return QOBJECT(qstring_from_str(va_arg(*ctxt->ap, const char *)));
474     } else if (!strcmp(token->str, "%f")) {
475         return QOBJECT(qnum_from_double(va_arg(*ctxt->ap, double)));
476     }
477     parse_error(ctxt, token, "invalid interpolation '%s'", token->str);
478     return NULL;
479 }
480 
481 static QObject *parse_literal(JSONParserContext *ctxt)
482 {
483     JSONToken *token;
484 
485     token = parser_context_pop_token(ctxt);
486     assert(token);
487 
488     switch (token->type) {
489     case JSON_STRING:
490         return QOBJECT(parse_string(ctxt, token));
491     case JSON_INTEGER: {
492         /*
493          * Represent JSON_INTEGER as QNUM_I64 if possible, else as
494          * QNUM_U64, else as QNUM_DOUBLE.  Note that qemu_strtoi64()
495          * and qemu_strtou64() fail with ERANGE when it's not
496          * possible.
497          *
498          * qnum_get_int() will then work for any signed 64-bit
499          * JSON_INTEGER, qnum_get_uint() for any unsigned 64-bit
500          * integer, and qnum_get_double() both for any JSON_INTEGER
501          * and any JSON_FLOAT (with precision loss for integers beyond
502          * 53 bits)
503          */
504         int ret;
505         int64_t value;
506         uint64_t uvalue;
507 
508         ret = qemu_strtoi64(token->str, NULL, 10, &value);
509         if (!ret) {
510             return QOBJECT(qnum_from_int(value));
511         }
512         assert(ret == -ERANGE);
513 
514         if (token->str[0] != '-') {
515             ret = qemu_strtou64(token->str, NULL, 10, &uvalue);
516             if (!ret) {
517                 return QOBJECT(qnum_from_uint(uvalue));
518             }
519             assert(ret == -ERANGE);
520         }
521     }
522     /* fall through to JSON_FLOAT */
523     case JSON_FLOAT:
524         /* FIXME dependent on locale; a pervasive issue in QEMU */
525         /* FIXME our lexer matches RFC 8259 in forbidding Inf or NaN,
526          * but those might be useful extensions beyond JSON */
527         return QOBJECT(qnum_from_double(strtod(token->str, NULL)));
528     default:
529         abort();
530     }
531 }
532 
533 static QObject *parse_value(JSONParserContext *ctxt)
534 {
535     JSONToken *token;
536 
537     token = parser_context_peek_token(ctxt);
538     if (token == NULL) {
539         parse_error(ctxt, NULL, "premature EOI");
540         return NULL;
541     }
542 
543     switch (token->type) {
544     case JSON_LCURLY:
545         return parse_object(ctxt);
546     case JSON_LSQUARE:
547         return parse_array(ctxt);
548     case JSON_INTERP:
549         return parse_interpolation(ctxt);
550     case JSON_INTEGER:
551     case JSON_FLOAT:
552     case JSON_STRING:
553         return parse_literal(ctxt);
554     case JSON_KEYWORD:
555         return parse_keyword(ctxt);
556     default:
557         parse_error(ctxt, token, "expecting value");
558         return NULL;
559     }
560 }
561 
562 JSONToken *json_token(JSONTokenType type, int x, int y, GString *tokstr)
563 {
564     JSONToken *token = g_malloc(sizeof(JSONToken) + tokstr->len + 1);
565 
566     token->type = type;
567     memcpy(token->str, tokstr->str, tokstr->len);
568     token->str[tokstr->len] = 0;
569     token->x = x;
570     token->y = y;
571     return token;
572 }
573 
574 QObject *json_parser_parse(GQueue *tokens, va_list *ap, Error **errp)
575 {
576     JSONParserContext ctxt = { .buf = tokens, .ap = ap };
577     QObject *result;
578 
579     result = parse_value(&ctxt);
580     assert(ctxt.err || g_queue_is_empty(ctxt.buf));
581 
582     error_propagate(errp, ctxt.err);
583 
584     while (!g_queue_is_empty(ctxt.buf)) {
585         parser_context_pop_token(&ctxt);
586     }
587     g_free(ctxt.current);
588 
589     return result;
590 }
591