1 /* 2 * JSON parser, yacc/bison based. Manual lexer. 3 * Mikhail. 4 */ 5 6 %define api.pure full 7 8 %{ 9 /* * ** *** ***** ******** ************* ********************* */ 10 #include <tcl.h> 11 #include <ctype.h> 12 #include <math.h> 13 #include <string.h> 14 #include <stdlib.h> 15 #include <assert.h> 16 17 #include <json_y.h> 18 19 #define TOKEN(tok) TRACE (("TOKEN %s\n", tok)) 20 #define TOKEN1(tok) TRACE (("TOKEN %s (%s)\n", tok, Tcl_GetString(context->obj))) 21 #define REDUCE(rule) TRACE (("REDUCE %s\n", rule)) 22 23 #define TRUE_O (Tcl_NewStringObj("true", 4)) 24 #define FALSE_O (Tcl_NewStringObj("false", 5)) 25 #define NULL_O (Tcl_NewStringObj("null", 4)) 26 27 typedef union YYSTYPE YYSTYPE; 28 29 static void jsonerror (struct context *, const char *); 30 static int jsonlexp (YYSTYPE *lvalp, struct context *context); 31 32 #define yylex jsonlexp 33 #define yyerror jsonerror 34 35 /* * ** *** ***** ******** ************* ********************* 36 ** User declarations <EOF> 37 */ 38 %} 39 40 %union { 41 Tcl_Obj *obj; 42 struct { 43 Tcl_Obj *key; 44 Tcl_Obj *val; 45 } keyval; 46 }; 47 48 %lex-param {struct context* context} 49 %parse-param {struct context* context} 50 51 %token STRING CONSTANT 52 53 %type <obj> tree 54 %type <obj> json 55 %type <obj> object 56 %type <obj> list 57 %type <obj> values 58 %type <obj> members 59 %type <obj> value 60 %type <obj> string 61 %type <keyval> member 62 63 %% 64 65 tree : json 66 { 67 REDUCE("TREE"); 68 if (context->I) { 69 Tcl_SetObjResult(context->I, $1); 70 TRACE ((" RESULT (%s)\n", Tcl_GetString($1))); 71 } 72 context->result = TCL_OK; 73 } 74 ; 75 76 json : value 77 ; 78 79 object : '{' members '}' 80 { 81 $$ = $2; 82 } 83 | '{' '}' 84 { 85 $$ = Tcl_NewObj(); 86 } 87 ; 88 89 list : '[' values ']' 90 { 91 $$ = $2; 92 } 93 | '[' ']' 94 { 95 $$ = Tcl_NewObj(); 96 } 97 ; 98 99 values : value 100 { 101 $$ = Tcl_NewListObj(1, &$1); 102 } 103 | values ',' value 104 { 105 Tcl_ListObjAppendElement(NULL, $1, $3); 106 $$ = $1; 107 } 108 ; 109 110 members : member 111 { 112 $$ = Tcl_NewListObj(0, NULL); 113 Tcl_ListObjAppendElement(NULL, $$, $1.key); 114 Tcl_ListObjAppendElement(NULL, $$, $1.val); 115 } 116 | members ',' member 117 { 118 Tcl_ListObjAppendElement(NULL, $1, $3.key); 119 Tcl_ListObjAppendElement(NULL, $1, $3.val); 120 $$ = $1; 121 } 122 ; 123 124 member : string ':' value 125 { 126 $$.key = $1; 127 $$.val = $3; 128 } 129 ; 130 131 string : STRING 132 { 133 $$ = context->obj; 134 } 135 ; 136 137 value : CONSTANT 138 { 139 $$ = context->obj; 140 } 141 | string 142 | object 143 | list 144 ; 145 146 %% 147 /* * ** *** ***** ******** ************* ********************* 148 ** User definitions 149 */ 150 151 void 152 jsonparse (struct context* context) 153 { 154 yyparse (context); 155 } 156 157 #define HAVE(n) (context->remaining >= n) 158 159 #define DRAIN(n) context->text += n, context->remaining -= n 160 161 #define STORESTRINGSEGMENT() \ 162 if (initialized) { \ 163 if (context->text != bp) { \ 164 Tcl_AppendToObj(context->obj, \ 165 bp, context->text - bp); \ 166 } \ 167 } else { \ 168 context->obj = Tcl_NewStringObj( \ 169 bp, context->text - bp); \ 170 initialized = 1; \ 171 } 172 173 void 174 jsonskip(struct context *context) 175 { 176 while (context->remaining) { 177 switch (*context->text) { 178 case '\n': 179 case ' ': 180 case '\t': 181 case '\r': 182 DRAIN(1); 183 continue; 184 } 185 break; 186 } 187 } 188 189 static int 190 jsonlexp(YYSTYPE *lvalp, struct context *context) 191 { 192 const char *bp = NULL; 193 194 /* Question: Why not plain numbers 1,2 for the states 195 * but these specific hex patterns ? 196 */ 197 enum { 198 PLAIN = 0x0000ff00, 199 INSTR = 0x00ff0000 200 } lstate; 201 double d; 202 char *end; 203 const char *p; 204 int initialized = 0; 205 206 /* 207 * Do not auto-lex beyond a full json structure. 208 */ 209 if (context->result == TCL_OK) { 210 TOKEN ("<<eof>>"); 211 return 0; 212 } 213 214 /* 215 * Quickly skip and ignore whitespace. 216 */ 217 while (context->remaining) { 218 switch (*context->text) { 219 case '\n': 220 case ' ': 221 case '\t': 222 case '\r': 223 DRAIN(1); 224 continue; 225 } 226 break; 227 } 228 229 /* 230 * Handle the token following the whitespace. Small state machine to 231 * handle strings and escapes in them, and bare words (various 232 * contants, and numbers). 233 */ 234 for (lstate = PLAIN; context->remaining > 0; DRAIN(1)) { 235 if (lstate == INSTR) { 236 if (*context->text == '"') { 237 /* 238 * End of quoted string 239 */ 240 241 STORESTRINGSEGMENT(); 242 DRAIN(1); 243 TOKEN1 ("STRING"); 244 return STRING; 245 } 246 247 if (*context->text == '\\') { 248 /* 249 * Escaped sequence. The 9 sequences specified at json.org 250 * are: 251 * \" \\ \/ \b \f \n \r \t \uXXXX 252 */ 253 char buf[TCL_UTF_MAX]; 254 int len, consumed; 255 256 STORESTRINGSEGMENT(); 257 258 /* 259 * Perform additional checks to restrict the set of accepted 260 * escape sequence to what is allowed by json.org instead of 261 * Tcl_UtfBackslash. 262 */ 263 264 if (!HAVE(1)) { 265 Tcl_AppendToObj(context->obj, "\\", 1); 266 yyerror(context,"incomplete escape at <<eof> error"); 267 TOKEN("incomplete escape at <<eof>> error"); 268 return -1; 269 } 270 switch (context->text[1]) { 271 case '"': 272 case '\\': 273 case '/': 274 case 'b': 275 case 'f': 276 case 'n': 277 case 'r': 278 case 't': 279 break; 280 case 'u': 281 if (!HAVE(5)) { 282 Tcl_AppendToObj(context->obj, "\\u", 2); 283 yyerror(context,"incomplete escape at <<eof> error"); 284 TOKEN("incomplete escape at <<eof>> error"); 285 return -1; 286 } 287 break; 288 default: 289 Tcl_AppendToObj(context->obj, context->text + 1, 1); 290 yyerror(context,"bad escape"); 291 TOKEN("bad escape"); 292 return -1; 293 } 294 295 /* 296 * XXX Tcl_UtfBackslash() may be more 297 * XXX permissive, than JSON standard. 298 * XXX But that may be a good thing: 299 * XXX "be generous in what you accept". 300 */ 301 len = Tcl_UtfBackslash(context->text, 302 &consumed, buf); 303 DRAIN(consumed - 1); 304 bp = context->text + 1; 305 Tcl_AppendToObj(context->obj, buf, len); 306 } 307 continue; 308 } 309 310 switch (*context->text) { 311 case ',': 312 case '{': 313 case ':': 314 case '}': 315 case '[': 316 case ']': 317 DRAIN(1); 318 TOKEN (context->text[-1]); 319 return context->text[-1]; 320 case 't': 321 if ((context->remaining < 4) || 322 strncmp("rue", context->text + 1, 3)) 323 goto bareword; 324 DRAIN(4); 325 context->obj = TRUE_O; 326 TOKEN1 ("CONSTANT"); 327 return CONSTANT; 328 case 'f': 329 if ((context->remaining < 5) || 330 strncmp("alse", context->text + 1, 4)) 331 goto bareword; 332 DRAIN(5); 333 context->obj = FALSE_O; 334 TOKEN1 ("CONSTANT"); 335 return CONSTANT; 336 case 'n': 337 if ((context->remaining < 4) || 338 strncmp("ull", context->text + 1, 3)) 339 goto bareword; 340 DRAIN(4); 341 context->obj = NULL_O; 342 TOKEN1 ("CONSTANT"); 343 return CONSTANT; 344 case '"': 345 bp = context->text + 1; 346 lstate = INSTR; 347 continue; 348 case '\\': 349 yyerror(context,"Escape character outside of string"); 350 TOKEN ("escape error"); 351 return -1; 352 } 353 354 /* 355 * We already considered the null, true, and false 356 * above, so it can only be a number now. 357 * 358 * NOTE: At this point we do not care about double 359 * versus integer, nor about the possible integer 360 * range. We generate a plain string Tcl_Obj and leave 361 * it to the user of the generated structure to 362 * convert to a number when actually needed. This 363 * defered conversion also ensures that the Tcl and 364 * platform we are building against does not matter 365 * regarding integer range, only the abilities of the 366 * Tcl at runtime. 367 */ 368 369 d = strtod(context->text, &end); 370 if (end == context->text) 371 goto bareword; /* Nothing parsed */ 372 373 context->obj = Tcl_NewStringObj (context->text, 374 end - context->text); 375 376 context->remaining -= (end - context->text); 377 context->text = end; 378 TOKEN1 ("CONSTANT"); 379 return CONSTANT; 380 } 381 382 TOKEN ("<<eof>>"); 383 return 0; 384 bareword: 385 yyerror(context,"Bare word encountered"); 386 TOKEN ("bare word error"); 387 return -1; 388 } 389 390 #if 0 391 int 392 jsonlex(struct context *context) 393 { 394 const char *bp = NULL; 395 396 /* Question: Why not plain numbers 1,2 for the states 397 * but these specific hex patterns ? 398 */ 399 enum { 400 PLAIN = 0x0000ff00, 401 INSTR = 0x00ff0000 402 } lstate; 403 double d; 404 char *end; 405 const char *p; 406 int initialized = 0; 407 408 while (context->remaining) { 409 /* Iterate over the whole string and check all tokens. 410 * Nothing else. 411 */ 412 413 /* 414 * Quickly skip and ignore whitespace. 415 */ 416 while (context->remaining) { 417 switch (*context->text) { 418 case '\n': 419 case ' ': 420 case '\t': 421 case '\r': 422 DRAIN(1); 423 continue; 424 } 425 break; 426 } 427 428 /* 429 * Handle the token following the whitespace. Small state machine to 430 * handle strings and escapes in them, and bare words (various 431 * contants, and numbers). 432 */ 433 for (lstate = PLAIN; context->remaining > 0; DRAIN(1)) { 434 if (lstate == INSTR) { 435 if (*context->text == '"') { 436 /* 437 * End of quoted string 438 */ 439 DRAIN(1); 440 goto next_token; 441 } 442 443 if (*context->text == '\\') { 444 /* 445 * Escaped sequence 446 */ 447 char buf[TCL_UTF_MAX]; 448 int len, consumed; 449 450 /* 451 * XXX Tcl_UtfBackslash() may be more 452 * XXX permissive, than JSON standard. 453 * XXX But that may be a good thing: 454 * XXX "be generous in what you accept". 455 */ 456 len = Tcl_UtfBackslash(context->text, &consumed, buf); 457 DRAIN(consumed - 1); 458 } 459 continue; 460 } 461 462 switch (*context->text) { 463 case ',': 464 case '{': 465 case ':': 466 case '}': 467 case '[': 468 case ']': 469 DRAIN(1); 470 goto next_token; 471 472 case 't': 473 if ((context->remaining < 4) || 474 strncmp("rue", context->text + 1, 3)) 475 return -1; /* bare word */ 476 DRAIN(4); 477 goto next_token; 478 case 'f': 479 if ((context->remaining < 5) || 480 strncmp("alse", context->text + 1, 4)) 481 return -1; /* bare word */ 482 DRAIN(5); 483 goto next_token; 484 case 'n': 485 if ((context->remaining < 4) || 486 strncmp("ull", context->text + 1, 3)) 487 return -1; /* bare word */ 488 DRAIN(4); 489 goto next_token; 490 case '"': 491 bp = context->text + 1; 492 lstate = INSTR; 493 continue; 494 case '\\': 495 /* Escape outside string, abort. */ 496 return -1; 497 } 498 499 /* 500 * We already considered the null, true, and false 501 * above, so it can only be a number now. 502 * 503 * NOTE: At this point we do not care about double 504 * versus integer, nor about the possible integer 505 * range. We generate a plain string Tcl_Obj and leave 506 * it to the user of the generated structure to 507 * convert to a number when actually needed. This 508 * defered conversion also ensures that the Tcl and 509 * platform we are building against does not matter 510 * regarding integer range, only the abilities of the 511 * Tcl at runtime. 512 */ 513 514 d = strtod(context->text, &end); 515 if (end == context->text) 516 return -1; /* bare word */ 517 518 context->remaining -= (end - context->text); 519 context->text = end; 520 goto next_token; 521 } 522 523 return 0; 524 525 next_token: 526 continue; 527 } 528 } 529 #endif 530 531 static void 532 jsonerror(struct context *context, const char *message) 533 { 534 char *fullmessage; 535 char *yytext; 536 int yyleng; 537 538 if (context->has_error) return; 539 540 if (context->obj) { 541 yytext = Tcl_GetStringFromObj(context->obj, &yyleng); 542 fullmessage = Tcl_Alloc(strlen(message) + 63 + yyleng); 543 544 sprintf(fullmessage, "%s %d bytes before end, around ``%.*s''", 545 message, context->remaining, yyleng, yytext); 546 } else { 547 fullmessage = Tcl_Alloc(strlen(message) + 63); 548 549 sprintf(fullmessage, "%s %d bytes before end", 550 message, context->remaining); 551 } 552 553 TRACE ((">>> %s\n",fullmessage)); 554 Tcl_SetResult (context->I, fullmessage, TCL_DYNAMIC); 555 Tcl_SetErrorCode (context->I, "JSON", "SYNTAX", NULL); 556 context->has_error = 1; 557 } 558