1 /*
2  * JSON parser, yacc/bison based. Manual lexer.
3  * Mikhail.
4  */
5 
6 %define api.pure full
7 
8 %{
9 /* * ** *** ***** ******** ************* ********************* */
10 #include <tcl.h>
11 #include <ctype.h>
12 #include <math.h>
13 #include <string.h>
14 #include <stdlib.h>
15 #include <assert.h>
16 
17 #include <json_y.h>
18 
19 #define TOKEN(tok)   TRACE (("TOKEN  %s\n", tok))
20 #define TOKEN1(tok)  TRACE (("TOKEN  %s (%s)\n", tok, Tcl_GetString(context->obj)))
21 #define REDUCE(rule) TRACE (("REDUCE %s\n", rule))
22 
23 #define TRUE_O  (Tcl_NewStringObj("true", 4))
24 #define FALSE_O (Tcl_NewStringObj("false", 5))
25 #define NULL_O  (Tcl_NewStringObj("null", 4))
26 
27 typedef union YYSTYPE YYSTYPE;
28 
29 static void jsonerror (struct context *, const char *);
30 static int  jsonlexp  (YYSTYPE *lvalp, struct context *context);
31 
32 #define yylex   jsonlexp
33 #define yyerror jsonerror
34 
35 /* * ** *** ***** ******** ************* *********************
36 ** User declarations <EOF>
37  */
38 %}
39 
40 %union {
41 	Tcl_Obj		*obj;
42 	struct {
43 		Tcl_Obj	*key;
44 		Tcl_Obj	*val;
45 	} keyval;
46 };
47 
48 %lex-param   {struct context* context}
49 %parse-param {struct context* context}
50 
51 %token STRING CONSTANT
52 
53 %type <obj>	tree
54 %type <obj>	json
55 %type <obj>	object
56 %type <obj>	list
57 %type <obj>	values
58 %type <obj>	members
59 %type <obj>	value
60 %type <obj>	string
61 %type <keyval>	member
62 
63 %%
64 
65 tree    : json
66 	{
67 		REDUCE("TREE");
68 		if (context->I) {
69 		  Tcl_SetObjResult(context->I, $1);
70 		  TRACE (("  RESULT (%s)\n", Tcl_GetString($1)));
71 		}
72 		context->result = TCL_OK;
73 	}
74 	;
75 
76 json    : value
77 	;
78 
79 object	: '{' members '}'
80 	{
81 		$$ = $2;
82 	}
83 	| '{' '}'
84 	{
85 		$$ = Tcl_NewObj();
86 	}
87 	;
88 
89 list	: '[' values ']'
90 	{
91 		$$ = $2;
92 	}
93 	| '[' ']'
94 	{
95 		$$ = Tcl_NewObj();
96 	}
97 	;
98 
99 values	: value
100 	{
101 		$$ = Tcl_NewListObj(1, &$1);
102 	}
103 	| values ',' value
104 	{
105 		Tcl_ListObjAppendElement(NULL, $1, $3);
106 		$$ = $1;
107 	}
108 	;
109 
110 members	: member
111 	{
112 	        $$ = Tcl_NewListObj(0, NULL);
113 		Tcl_ListObjAppendElement(NULL, $$, $1.key);
114 		Tcl_ListObjAppendElement(NULL, $$, $1.val);
115 	}
116 	| members ',' member
117 	{
118 		Tcl_ListObjAppendElement(NULL, $1, $3.key);
119 		Tcl_ListObjAppendElement(NULL, $1, $3.val);
120 		$$ = $1;
121 	}
122 	;
123 
124 member	: string ':' value
125 	{
126 		$$.key = $1;
127 		$$.val = $3;
128 	}
129 	;
130 
131 string	: STRING
132 	{
133 		$$ = context->obj;
134 	}
135 	;
136 
137 value	: CONSTANT
138 	{
139 		$$ = context->obj;
140 	}
141 	| string
142 	| object
143 	| list
144 	;
145 
146 %%
147 /* * ** *** ***** ******** ************* *********************
148 ** User definitions
149  */
150 
151 void
152 jsonparse (struct context* context)
153 {
154   yyparse (context);
155 }
156 
157 #define HAVE(n) (context->remaining >= n)
158 
159 #define DRAIN(n) context->text += n, context->remaining -= n
160 
161 #define	STORESTRINGSEGMENT()				\
162 	if (initialized) {				\
163 		if (context->text != bp) {		\
164 			Tcl_AppendToObj(context->obj,	\
165 			    bp, context->text - bp);	\
166 		}					\
167 	} else {					\
168 		context->obj = Tcl_NewStringObj(	\
169 		    bp, context->text - bp);		\
170 		initialized = 1;			\
171 	}
172 
173 void
174 jsonskip(struct context *context)
175 {
176   while (context->remaining) {
177     switch (*context->text) {
178     case '\n':
179     case ' ':
180     case '\t':
181     case '\r':
182       DRAIN(1);
183       continue;
184     }
185     break;
186   }
187 }
188 
189 static int
190 jsonlexp(YYSTYPE *lvalp, struct context *context)
191 {
192   const char *bp = NULL;
193 
194   /* Question: Why not plain numbers 1,2 for the states
195    *           but these specific hex patterns ?
196    */
197   enum {
198     PLAIN	= 0x0000ff00,
199     INSTR	= 0x00ff0000
200   } lstate;
201   double 	 d;
202   char		*end;
203   const char	*p;
204   int		 initialized = 0;
205 
206   /*
207    * Do not auto-lex beyond a full json structure.
208    */
209   if (context->result == TCL_OK) {
210     TOKEN ("<<eof>>");
211     return 0;
212   }
213 
214   /*
215    * Quickly skip and ignore whitespace.
216    */
217   while (context->remaining) {
218     switch (*context->text) {
219     case '\n':
220     case ' ':
221     case '\t':
222     case '\r':
223       DRAIN(1);
224       continue;
225     }
226     break;
227   }
228 
229   /*
230    * Handle the token following the whitespace. Small state machine to
231    * handle strings and escapes in them, and bare words (various
232    * contants, and numbers).
233    */
234   for (lstate = PLAIN; context->remaining > 0; DRAIN(1)) {
235     if (lstate == INSTR) {
236       if (*context->text == '"') {
237 	/*
238 	 * End of quoted string
239 	 */
240 
241 	STORESTRINGSEGMENT();
242 	DRAIN(1);
243 	TOKEN1 ("STRING");
244 	return STRING;
245       }
246 
247       if (*context->text == '\\') {
248 	/*
249 	 * Escaped sequence. The 9 sequences specified at json.org
250 	 * are:
251 	 *       \"  \\  \/  \b  \f  \n  \r  \t  \uXXXX
252 	 */
253 	char	buf[TCL_UTF_MAX];
254 	int	len, consumed;
255 
256 	STORESTRINGSEGMENT();
257 
258 	/*
259 	 * Perform additional checks to restrict the set of accepted
260 	 * escape sequence to what is allowed by json.org instead of
261 	 * Tcl_UtfBackslash.
262 	 */
263 
264 	if (!HAVE(1)) {
265 	  Tcl_AppendToObj(context->obj, "\\", 1);
266 	  yyerror(context,"incomplete escape at <<eof> error");
267 	  TOKEN("incomplete escape at <<eof>> error");
268 	  return -1;
269 	}
270 	switch (context->text[1]) {
271 	  case '"':
272 	  case '\\':
273 	  case '/':
274 	  case 'b':
275 	  case 'f':
276 	  case 'n':
277 	  case 'r':
278 	  case 't':
279 	    break;
280 	  case 'u':
281 	    if (!HAVE(5)) {
282 	      Tcl_AppendToObj(context->obj, "\\u", 2);
283 	      yyerror(context,"incomplete escape at <<eof> error");
284 	      TOKEN("incomplete escape at <<eof>> error");
285 	      return -1;
286 	    }
287 	    break;
288 	  default:
289 	    Tcl_AppendToObj(context->obj, context->text + 1, 1);
290 	    yyerror(context,"bad escape");
291 	    TOKEN("bad escape");
292 	    return -1;
293 	}
294 
295 	/*
296 	 * XXX Tcl_UtfBackslash() may be more
297 	 * XXX permissive, than JSON standard.
298 	 * XXX But that may be a good thing:
299 	 * XXX "be generous in what you accept".
300 	 */
301 	len = Tcl_UtfBackslash(context->text,
302 			       &consumed, buf);
303 	DRAIN(consumed - 1);
304 	bp = context->text + 1;
305 	Tcl_AppendToObj(context->obj, buf, len);
306       }
307       continue;
308     }
309 
310     switch (*context->text) {
311     case ',':
312     case '{':
313     case ':':
314     case '}':
315     case '[':
316     case ']':
317       DRAIN(1);
318       TOKEN (context->text[-1]);
319       return context->text[-1];
320     case 't':
321       if ((context->remaining < 4) ||
322 	  strncmp("rue", context->text + 1, 3))
323 	goto bareword;
324       DRAIN(4);
325       context->obj = TRUE_O;
326       TOKEN1 ("CONSTANT");
327       return CONSTANT;
328     case 'f':
329       if ((context->remaining < 5) ||
330 	  strncmp("alse", context->text + 1, 4))
331 	goto bareword;
332       DRAIN(5);
333       context->obj = FALSE_O;
334       TOKEN1 ("CONSTANT");
335       return CONSTANT;
336     case 'n':
337       if ((context->remaining < 4) ||
338 	  strncmp("ull", context->text + 1, 3))
339 	goto bareword;
340       DRAIN(4);
341       context->obj = NULL_O;
342       TOKEN1 ("CONSTANT");
343       return CONSTANT;
344     case '"':
345       bp = context->text + 1;
346       lstate = INSTR;
347       continue;
348     case '\\':
349       yyerror(context,"Escape character outside of string");
350       TOKEN ("escape error");
351       return -1;
352     }
353 
354     /*
355      * We already considered the null, true, and false
356      * above, so it can only be a number now.
357      *
358      * NOTE: At this point we do not care about double
359      * versus integer, nor about the possible integer
360      * range. We generate a plain string Tcl_Obj and leave
361      * it to the user of the generated structure to
362      * convert to a number when actually needed. This
363      * defered conversion also ensures that the Tcl and
364      * platform we are building against does not matter
365      * regarding integer range, only the abilities of the
366      * Tcl at runtime.
367      */
368 
369     d = strtod(context->text, &end);
370     if (end == context->text)
371       goto bareword; /* Nothing parsed */
372 
373     context->obj = Tcl_NewStringObj (context->text,
374 				     end - context->text);
375 
376     context->remaining -= (end - context->text);
377     context->text = end;
378     TOKEN1 ("CONSTANT");
379     return CONSTANT;
380   }
381 
382   TOKEN ("<<eof>>");
383   return 0;
384  bareword:
385   yyerror(context,"Bare word encountered");
386   TOKEN ("bare word error");
387   return -1;
388 }
389 
390 #if 0
391 int
392 jsonlex(struct context *context)
393 {
394   const char *bp = NULL;
395 
396   /* Question: Why not plain numbers 1,2 for the states
397    *           but these specific hex patterns ?
398    */
399   enum {
400     PLAIN	= 0x0000ff00,
401     INSTR	= 0x00ff0000
402   } lstate;
403   double 	 d;
404   char		*end;
405   const char	*p;
406   int		 initialized = 0;
407 
408   while (context->remaining) {
409     /* Iterate over the whole string and check all tokens.
410      * Nothing else.
411      */
412 
413     /*
414      * Quickly skip and ignore whitespace.
415      */
416     while (context->remaining) {
417       switch (*context->text) {
418       case '\n':
419       case ' ':
420       case '\t':
421       case '\r':
422 	DRAIN(1);
423 	continue;
424       }
425       break;
426     }
427 
428   /*
429    * Handle the token following the whitespace. Small state machine to
430    * handle strings and escapes in them, and bare words (various
431    * contants, and numbers).
432    */
433   for (lstate = PLAIN; context->remaining > 0; DRAIN(1)) {
434     if (lstate == INSTR) {
435       if (*context->text == '"') {
436 	/*
437 	 * End of quoted string
438 	 */
439 	DRAIN(1);
440 	goto next_token;
441       }
442 
443       if (*context->text == '\\') {
444 	/*
445 	 * Escaped sequence
446 	 */
447 	char	buf[TCL_UTF_MAX];
448 	int	len, consumed;
449 
450 	/*
451 	 * XXX Tcl_UtfBackslash() may be more
452 	 * XXX permissive, than JSON standard.
453 	 * XXX But that may be a good thing:
454 	 * XXX "be generous in what you accept".
455 	 */
456 	len = Tcl_UtfBackslash(context->text, &consumed, buf);
457 	DRAIN(consumed - 1);
458       }
459       continue;
460     }
461 
462     switch (*context->text) {
463     case ',':
464     case '{':
465     case ':':
466     case '}':
467     case '[':
468     case ']':
469       DRAIN(1);
470       goto next_token;
471 
472     case 't':
473       if ((context->remaining < 4) ||
474 	  strncmp("rue", context->text + 1, 3))
475 	return -1; /* bare word */
476       DRAIN(4);
477       goto next_token;
478     case 'f':
479       if ((context->remaining < 5) ||
480 	  strncmp("alse", context->text + 1, 4))
481 	return -1; /* bare word */
482       DRAIN(5);
483       goto next_token;
484     case 'n':
485       if ((context->remaining < 4) ||
486 	  strncmp("ull", context->text + 1, 3))
487 	return -1; /* bare word */
488       DRAIN(4);
489       goto next_token;
490     case '"':
491       bp = context->text + 1;
492       lstate = INSTR;
493       continue;
494     case '\\':
495       /* Escape outside string, abort. */
496       return -1;
497     }
498 
499     /*
500      * We already considered the null, true, and false
501      * above, so it can only be a number now.
502      *
503      * NOTE: At this point we do not care about double
504      * versus integer, nor about the possible integer
505      * range. We generate a plain string Tcl_Obj and leave
506      * it to the user of the generated structure to
507      * convert to a number when actually needed. This
508      * defered conversion also ensures that the Tcl and
509      * platform we are building against does not matter
510      * regarding integer range, only the abilities of the
511      * Tcl at runtime.
512      */
513 
514     d = strtod(context->text, &end);
515     if (end == context->text)
516       return -1; /* bare word */
517 
518     context->remaining -= (end - context->text);
519     context->text = end;
520     goto next_token;
521   }
522 
523   return 0;
524 
525   next_token:
526   continue;
527   }
528 }
529 #endif
530 
531 static void
532 jsonerror(struct context *context, const char *message)
533 {
534   char *fullmessage;
535   char *yytext;
536   int   yyleng;
537 
538   if (context->has_error) return;
539 
540   if (context->obj) {
541     yytext = Tcl_GetStringFromObj(context->obj, &yyleng);
542     fullmessage = Tcl_Alloc(strlen(message) + 63 + yyleng);
543 
544     sprintf(fullmessage, "%s %d bytes before end, around ``%.*s''",
545 	    message, context->remaining, yyleng, yytext);
546   } else {
547     fullmessage = Tcl_Alloc(strlen(message) + 63);
548 
549     sprintf(fullmessage, "%s %d bytes before end",
550 	    message, context->remaining);
551   }
552 
553   TRACE ((">>> %s\n",fullmessage));
554   Tcl_SetResult    (context->I, fullmessage, TCL_DYNAMIC);
555   Tcl_SetErrorCode (context->I, "JSON", "SYNTAX", NULL);
556   context->has_error = 1;
557 }
558