1 /* 2 * JSON lexer 3 * 4 * Copyright IBM, Corp. 2009 5 * 6 * Authors: 7 * Anthony Liguori <aliguori@us.ibm.com> 8 * 9 * This work is licensed under the terms of the GNU LGPL, version 2.1 or later. 10 * See the COPYING.LIB file in the top-level directory. 11 * 12 */ 13 14 #include "qemu/osdep.h" 15 #include "qemu-common.h" 16 #include "qapi/qmp/json-lexer.h" 17 #include "qapi/qmp/json-streamer.h" 18 19 #define MAX_TOKEN_SIZE (64ULL << 20) 20 21 /* 22 * From RFC 8259 "The JavaScript Object Notation (JSON) Data 23 * Interchange Format", with [comments in brackets]: 24 * 25 * The set of tokens includes six structural characters, strings, 26 * numbers, and three literal names. 27 * 28 * These are the six structural characters: 29 * 30 * begin-array = ws %x5B ws ; [ left square bracket 31 * begin-object = ws %x7B ws ; { left curly bracket 32 * end-array = ws %x5D ws ; ] right square bracket 33 * end-object = ws %x7D ws ; } right curly bracket 34 * name-separator = ws %x3A ws ; : colon 35 * value-separator = ws %x2C ws ; , comma 36 * 37 * Insignificant whitespace is allowed before or after any of the six 38 * structural characters. 39 * [This lexer accepts it before or after any token, which is actually 40 * the same, as the grammar always has structural characters between 41 * other tokens.] 42 * 43 * ws = *( 44 * %x20 / ; Space 45 * %x09 / ; Horizontal tab 46 * %x0A / ; Line feed or New line 47 * %x0D ) ; Carriage return 48 * 49 * [...] three literal names: 50 * false null true 51 * [This lexer accepts [a-z]+, and leaves rejecting unknown literal 52 * names to the parser.] 53 * 54 * [Numbers:] 55 * 56 * number = [ minus ] int [ frac ] [ exp ] 57 * decimal-point = %x2E ; . 58 * digit1-9 = %x31-39 ; 1-9 59 * e = %x65 / %x45 ; e E 60 * exp = e [ minus / plus ] 1*DIGIT 61 * frac = decimal-point 1*DIGIT 62 * int = zero / ( digit1-9 *DIGIT ) 63 * minus = %x2D ; - 64 * plus = %x2B ; + 65 * zero = %x30 ; 0 66 * 67 * [Strings:] 68 * string = quotation-mark *char quotation-mark 69 * 70 * char = unescaped / 71 * escape ( 72 * %x22 / ; " quotation mark U+0022 73 * %x5C / ; \ reverse solidus U+005C 74 * %x2F / ; / solidus U+002F 75 * %x62 / ; b backspace U+0008 76 * %x66 / ; f form feed U+000C 77 * %x6E / ; n line feed U+000A 78 * %x72 / ; r carriage return U+000D 79 * %x74 / ; t tab U+0009 80 * %x75 4HEXDIG ) ; uXXXX U+XXXX 81 * escape = %x5C ; \ 82 * quotation-mark = %x22 ; " 83 * unescaped = %x20-21 / %x23-5B / %x5D-10FFFF 84 * [This lexer accepts any non-control character after escape, and 85 * leaves rejecting invalid ones to the parser.] 86 * 87 * 88 * Extensions over RFC 8259: 89 * - Extra escape sequence in strings: 90 * 0x27 (apostrophe) is recognized after escape, too 91 * - Single-quoted strings: 92 * Like double-quoted strings, except they're delimited by %x27 93 * (apostrophe) instead of %x22 (quotation mark), and can't contain 94 * unescaped apostrophe, but can contain unescaped quotation mark. 95 * - Interpolation: 96 * interpolation = %((l|ll|I64)[du]|[ipsf]) 97 * 98 * Note: 99 * - Input must be encoded in modified UTF-8. 100 * - Decoding and validating is left to the parser. 101 */ 102 103 enum json_lexer_state { 104 IN_ERROR = 0, /* must really be 0, see json_lexer[] */ 105 IN_DQ_STRING_ESCAPE, 106 IN_DQ_STRING, 107 IN_SQ_STRING_ESCAPE, 108 IN_SQ_STRING, 109 IN_ZERO, 110 IN_DIGITS, 111 IN_DIGIT, 112 IN_EXP_E, 113 IN_MANTISSA, 114 IN_MANTISSA_DIGITS, 115 IN_NONZERO_NUMBER, 116 IN_NEG_NONZERO_NUMBER, 117 IN_KEYWORD, 118 IN_ESCAPE, 119 IN_ESCAPE_L, 120 IN_ESCAPE_LL, 121 IN_ESCAPE_I, 122 IN_ESCAPE_I6, 123 IN_ESCAPE_I64, 124 IN_WHITESPACE, 125 IN_START, 126 }; 127 128 QEMU_BUILD_BUG_ON((int)JSON_MIN <= (int)IN_START); 129 130 #define TERMINAL(state) [0 ... 0x7F] = (state) 131 132 /* Return whether TERMINAL is a terminal state and the transition to it 133 from OLD_STATE required lookahead. This happens whenever the table 134 below uses the TERMINAL macro. */ 135 #define TERMINAL_NEEDED_LOOKAHEAD(old_state, terminal) \ 136 (terminal != IN_ERROR && json_lexer[(old_state)][0] == (terminal)) 137 138 static const uint8_t json_lexer[][256] = { 139 /* Relies on default initialization to IN_ERROR! */ 140 141 /* double quote string */ 142 [IN_DQ_STRING_ESCAPE] = { 143 [0x20 ... 0xFD] = IN_DQ_STRING, 144 }, 145 [IN_DQ_STRING] = { 146 [0x20 ... 0xFD] = IN_DQ_STRING, 147 ['\\'] = IN_DQ_STRING_ESCAPE, 148 ['"'] = JSON_STRING, 149 }, 150 151 /* single quote string */ 152 [IN_SQ_STRING_ESCAPE] = { 153 [0x20 ... 0xFD] = IN_SQ_STRING, 154 }, 155 [IN_SQ_STRING] = { 156 [0x20 ... 0xFD] = IN_SQ_STRING, 157 ['\\'] = IN_SQ_STRING_ESCAPE, 158 ['\''] = JSON_STRING, 159 }, 160 161 /* Zero */ 162 [IN_ZERO] = { 163 TERMINAL(JSON_INTEGER), 164 ['0' ... '9'] = IN_ERROR, 165 ['.'] = IN_MANTISSA, 166 }, 167 168 /* Float */ 169 [IN_DIGITS] = { 170 TERMINAL(JSON_FLOAT), 171 ['0' ... '9'] = IN_DIGITS, 172 }, 173 174 [IN_DIGIT] = { 175 ['0' ... '9'] = IN_DIGITS, 176 }, 177 178 [IN_EXP_E] = { 179 ['-'] = IN_DIGIT, 180 ['+'] = IN_DIGIT, 181 ['0' ... '9'] = IN_DIGITS, 182 }, 183 184 [IN_MANTISSA_DIGITS] = { 185 TERMINAL(JSON_FLOAT), 186 ['0' ... '9'] = IN_MANTISSA_DIGITS, 187 ['e'] = IN_EXP_E, 188 ['E'] = IN_EXP_E, 189 }, 190 191 [IN_MANTISSA] = { 192 ['0' ... '9'] = IN_MANTISSA_DIGITS, 193 }, 194 195 /* Number */ 196 [IN_NONZERO_NUMBER] = { 197 TERMINAL(JSON_INTEGER), 198 ['0' ... '9'] = IN_NONZERO_NUMBER, 199 ['e'] = IN_EXP_E, 200 ['E'] = IN_EXP_E, 201 ['.'] = IN_MANTISSA, 202 }, 203 204 [IN_NEG_NONZERO_NUMBER] = { 205 ['0'] = IN_ZERO, 206 ['1' ... '9'] = IN_NONZERO_NUMBER, 207 }, 208 209 /* keywords */ 210 [IN_KEYWORD] = { 211 TERMINAL(JSON_KEYWORD), 212 ['a' ... 'z'] = IN_KEYWORD, 213 }, 214 215 /* whitespace */ 216 [IN_WHITESPACE] = { 217 TERMINAL(JSON_SKIP), 218 [' '] = IN_WHITESPACE, 219 ['\t'] = IN_WHITESPACE, 220 ['\r'] = IN_WHITESPACE, 221 ['\n'] = IN_WHITESPACE, 222 }, 223 224 /* escape */ 225 [IN_ESCAPE_LL] = { 226 ['d'] = JSON_ESCAPE, 227 ['u'] = JSON_ESCAPE, 228 }, 229 230 [IN_ESCAPE_L] = { 231 ['d'] = JSON_ESCAPE, 232 ['l'] = IN_ESCAPE_LL, 233 ['u'] = JSON_ESCAPE, 234 }, 235 236 [IN_ESCAPE_I64] = { 237 ['d'] = JSON_ESCAPE, 238 ['u'] = JSON_ESCAPE, 239 }, 240 241 [IN_ESCAPE_I6] = { 242 ['4'] = IN_ESCAPE_I64, 243 }, 244 245 [IN_ESCAPE_I] = { 246 ['6'] = IN_ESCAPE_I6, 247 }, 248 249 [IN_ESCAPE] = { 250 ['d'] = JSON_ESCAPE, 251 ['i'] = JSON_ESCAPE, 252 ['p'] = JSON_ESCAPE, 253 ['s'] = JSON_ESCAPE, 254 ['u'] = JSON_ESCAPE, 255 ['f'] = JSON_ESCAPE, 256 ['l'] = IN_ESCAPE_L, 257 ['I'] = IN_ESCAPE_I, 258 }, 259 260 /* top level rule */ 261 [IN_START] = { 262 ['"'] = IN_DQ_STRING, 263 ['\''] = IN_SQ_STRING, 264 ['0'] = IN_ZERO, 265 ['1' ... '9'] = IN_NONZERO_NUMBER, 266 ['-'] = IN_NEG_NONZERO_NUMBER, 267 ['{'] = JSON_LCURLY, 268 ['}'] = JSON_RCURLY, 269 ['['] = JSON_LSQUARE, 270 [']'] = JSON_RSQUARE, 271 [','] = JSON_COMMA, 272 [':'] = JSON_COLON, 273 ['a' ... 'z'] = IN_KEYWORD, 274 ['%'] = IN_ESCAPE, 275 [' '] = IN_WHITESPACE, 276 ['\t'] = IN_WHITESPACE, 277 ['\r'] = IN_WHITESPACE, 278 ['\n'] = IN_WHITESPACE, 279 }, 280 }; 281 282 void json_lexer_init(JSONLexer *lexer) 283 { 284 lexer->state = IN_START; 285 lexer->token = g_string_sized_new(3); 286 lexer->x = lexer->y = 0; 287 } 288 289 static void json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush) 290 { 291 int char_consumed, new_state; 292 293 lexer->x++; 294 if (ch == '\n') { 295 lexer->x = 0; 296 lexer->y++; 297 } 298 299 do { 300 assert(lexer->state <= ARRAY_SIZE(json_lexer)); 301 new_state = json_lexer[lexer->state][(uint8_t)ch]; 302 char_consumed = !TERMINAL_NEEDED_LOOKAHEAD(lexer->state, new_state); 303 if (char_consumed && !flush) { 304 g_string_append_c(lexer->token, ch); 305 } 306 307 switch (new_state) { 308 case JSON_LCURLY: 309 case JSON_RCURLY: 310 case JSON_LSQUARE: 311 case JSON_RSQUARE: 312 case JSON_COLON: 313 case JSON_COMMA: 314 case JSON_ESCAPE: 315 case JSON_INTEGER: 316 case JSON_FLOAT: 317 case JSON_KEYWORD: 318 case JSON_STRING: 319 json_message_process_token(lexer, lexer->token, new_state, 320 lexer->x, lexer->y); 321 /* fall through */ 322 case JSON_SKIP: 323 g_string_truncate(lexer->token, 0); 324 new_state = IN_START; 325 break; 326 case IN_ERROR: 327 /* XXX: To avoid having previous bad input leaving the parser in an 328 * unresponsive state where we consume unpredictable amounts of 329 * subsequent "good" input, percolate this error state up to the 330 * tokenizer/parser by forcing a NULL object to be emitted, then 331 * reset state. 332 * 333 * Also note that this handling is required for reliable channel 334 * negotiation between QMP and the guest agent, since chr(0xFF) 335 * is placed at the beginning of certain events to ensure proper 336 * delivery when the channel is in an unknown state. chr(0xFF) is 337 * never a valid ASCII/UTF-8 sequence, so this should reliably 338 * induce an error/flush state. 339 */ 340 json_message_process_token(lexer, lexer->token, JSON_ERROR, 341 lexer->x, lexer->y); 342 g_string_truncate(lexer->token, 0); 343 new_state = IN_START; 344 lexer->state = new_state; 345 return; 346 default: 347 break; 348 } 349 lexer->state = new_state; 350 } while (!char_consumed && !flush); 351 352 /* Do not let a single token grow to an arbitrarily large size, 353 * this is a security consideration. 354 */ 355 if (lexer->token->len > MAX_TOKEN_SIZE) { 356 json_message_process_token(lexer, lexer->token, lexer->state, 357 lexer->x, lexer->y); 358 g_string_truncate(lexer->token, 0); 359 lexer->state = IN_START; 360 } 361 } 362 363 void json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size) 364 { 365 size_t i; 366 367 for (i = 0; i < size; i++) { 368 json_lexer_feed_char(lexer, buffer[i], false); 369 } 370 } 371 372 void json_lexer_flush(JSONLexer *lexer) 373 { 374 if (lexer->state != IN_START) { 375 json_lexer_feed_char(lexer, 0, true); 376 } 377 } 378 379 void json_lexer_destroy(JSONLexer *lexer) 380 { 381 g_string_free(lexer->token, true); 382 } 383