1 /*
2  * Copyright (c) 2007-2014, Lloyd Hilaiel <me@lloyd.io>
3  *
4  * Permission to use, copy, modify, and/or distribute this software for any
5  * purpose with or without fee is hereby granted, provided that the above
6  * copyright notice and this permission notice appear in all copies.
7  *
8  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15  */
16 
17 #include "yajl_lex.h"
18 #include "yajl_buf.h"
19 
20 #include <stdlib.h>
21 #include <stdio.h>
22 #include <assert.h>
23 #include <string.h>
24 
#ifdef YAJL_LEXER_DEBUG
/* Debug-only helper: map a token type to a short printable label.
 * The left and right variants of braces/brackets intentionally share a
 * label; any token value without a case below yields "unknown". */
static const char *
tokToStr(yajl_tok tok)
{
    const char * label = "unknown";

    switch (tok) {
        case yajl_tok_bool:                label = "bool";                break;
        case yajl_tok_colon:               label = "colon";               break;
        case yajl_tok_comma:               label = "comma";               break;
        case yajl_tok_eof:                 label = "eof";                 break;
        case yajl_tok_error:               label = "error";               break;
        case yajl_tok_left_brace:          label = "brace";               break;
        case yajl_tok_left_bracket:        label = "bracket";             break;
        case yajl_tok_null:                label = "null";                break;
        case yajl_tok_integer:             label = "integer";             break;
        case yajl_tok_double:              label = "double";              break;
        case yajl_tok_right_brace:         label = "brace";               break;
        case yajl_tok_right_bracket:       label = "bracket";             break;
        case yajl_tok_string:              label = "string";              break;
        case yajl_tok_string_with_escapes: label = "string_with_escapes"; break;
    }

    return label;
}
#endif
48 
/* Impact of the stream parsing feature on the lexer:
 *
 * YAJL supports stream parsing.  That is, the ability to parse the first
 * bits of a chunk of JSON before the last bits are available (still on
 * the network or disk).  This makes the lexer more complex.  The
 * responsibility of the lexer is to handle transparently the case where
 * a chunk boundary falls in the middle of a token.  This is
 * accomplished via a buffer and a character reading abstraction.
 *
 * Overview of implementation
 *
 * When we lex to the end of the input string before the end of a token
 * is hit, we copy all of the input text composing the token into our
 * lexBuf.
 *
 * Every time we read a character, we do so through the readChar macro.
 * readChar's responsibility is to handle pulling all chars from the buffer
 * before pulling chars from input text
 */
67 
68 struct yajl_lexer_t {
69     /* the overal line and char offset into the data */
70     size_t lineOff;
71     size_t charOff;
72 
73     /* error */
74     yajl_lex_error error;
75 
76     /* a input buffer to handle the case where a token is spread over
77      * multiple chunks */
78     yajl_buf buf;
79 
80     /* in the case where we have data in the lexBuf, bufOff holds
81      * the current offset into the lexBuf. */
82     size_t bufOff;
83 
84     /* are we using the lex buf? */
85     unsigned int bufInUse;
86 
87     /* shall we allow comments? */
88     unsigned int allowComments;
89 
90     /* shall we validate utf8 inside strings? */
91     unsigned int validateUTF8;
92 
93     yajl_alloc_funcs * alloc;
94 };
95 
/* readChar(lxr, txt, off): fetch the next input byte and advance.
 *
 * While the lexer's buffer is in use and still has unread bytes
 * (bufOff < buffer length) the byte comes from the buffer and bufOff is
 * advanced; otherwise it comes from txt[*off] and *off is advanced.
 * No bounds check is made against the length of txt; callers must test
 * for end of input before invoking this macro.
 *
 * Every use of the lxr argument is parenthesized so that an arbitrary
 * pointer expression (e.g. &lexerStruct) may be passed safely. */
#define readChar(lxr, txt, off)                                          \
    (((lxr)->bufInUse && yajl_buf_len((lxr)->buf) &&                     \
      (lxr)->bufOff < yajl_buf_len((lxr)->buf)) ?                        \
     (*((const unsigned char *) yajl_buf_data((lxr)->buf) + ((lxr)->bufOff)++)) : \
     ((txt)[(*(off))++]))

/* unreadChar(lxr, off): step back one byte.  If the input offset is
 * non-zero it is decremented; otherwise the lexer's buffer offset is
 * decremented instead. */
#define unreadChar(lxr, off) ((*(off) > 0) ? (*(off))-- : ((lxr)->bufOff--))
102 
103 yajl_lexer
yajl_lex_alloc(yajl_alloc_funcs * alloc,unsigned int allowComments,unsigned int validateUTF8)104 yajl_lex_alloc(yajl_alloc_funcs * alloc,
105                unsigned int allowComments, unsigned int validateUTF8)
106 {
107     yajl_lexer lxr = (yajl_lexer) YA_MALLOC(alloc, sizeof(struct yajl_lexer_t));
108     memset((void *) lxr, 0, sizeof(struct yajl_lexer_t));
109     lxr->buf = yajl_buf_alloc(alloc);
110     lxr->allowComments = allowComments;
111     lxr->validateUTF8 = validateUTF8;
112     lxr->alloc = alloc;
113     return lxr;
114 }
115 
116 void
yajl_lex_free(yajl_lexer lxr)117 yajl_lex_free(yajl_lexer lxr)
118 {
119     yajl_buf_free(lxr->buf);
120     YA_FREE(lxr->alloc, lxr);
121     return;
122 }
123 
/* a lookup table which lets us quickly determine several things:
 * VEC - valid escaped control char
 * note.  the solidus '/' may be escaped or not.
 * IJC - invalid json char
 * VHC - valid hex char
 * NFP - needs further processing (from a string scanning perspective)
 * NUC - needs utf8 checking when enabled (from a string scanning perspective)
 */
/* per-character classification flags stored in charLookupTable */
#define VEC 0x01
#define IJC 0x02
#define VHC 0x04
#define NFP 0x08
#define NUC 0x10

/* classification of every possible byte value, indexed by the byte */
static const char charLookupTable[256] =
{
    /* 0x00 - 0x1f: control characters */
/*00*/ IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC,
/*08*/ IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC,
/*10*/ IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC,
/*18*/ IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC,

    /* 0x20 - 0x3f: '"' at 0x22, '/' at 0x2f, digits at 0x30 - 0x39 */
/*20*/ 0  , 0  , NFP|VEC|IJC, 0  , 0  , 0  , 0  , 0  ,
/*28*/ 0  , 0  , 0  , 0  , 0  , 0  , 0  , VEC,
/*30*/ VHC, VHC, VHC, VHC, VHC, VHC, VHC, VHC,
/*38*/ VHC, VHC, 0  , 0  , 0  , 0  , 0  , 0  ,

    /* 0x40 - 0x5f: 'A' - 'F' at 0x41 - 0x46, '\\' at 0x5c */
/*40*/ 0  , VHC, VHC, VHC, VHC, VHC, VHC, 0  ,
/*48*/ 0  , 0  , 0  , 0  , 0  , 0  , 0  , 0  ,
/*50*/ 0  , 0  , 0  , 0  , 0  , 0  , 0  , 0  ,
/*58*/ 0  , 0  , 0  , 0  , NFP|VEC|IJC, 0  , 0  , 0  ,

    /* 0x60 - 0x7f: 'a' - 'f' at 0x61 - 0x66, 'n' 0x6e, 'r' 0x72, 't' 0x74 */
/*60*/ 0  , VHC, VEC|VHC, VHC, VHC, VHC, VEC|VHC, 0  ,
/*68*/ 0  , 0  , 0  , 0  , 0  , 0  , VEC, 0  ,
/*70*/ 0  , 0  , VEC, 0  , VEC, 0  , 0  , 0  ,
/*78*/ 0  , 0  , 0  , 0  , 0  , 0  , 0  , 0  ,

    /* 0x80 - 0xff: bytes with the high bit set */
/*80*/ NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC,
/*88*/ NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC,
/*90*/ NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC,
/*98*/ NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC,

/*a0*/ NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC,
/*a8*/ NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC,
/*b0*/ NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC,
/*b8*/ NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC,

/*c0*/ NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC,
/*c8*/ NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC,
/*d0*/ NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC,
/*d8*/ NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC,

/*e0*/ NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC,
/*e8*/ NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC,
/*f0*/ NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC,
/*f8*/ NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC
};
180 
181 /** process a variable length utf8 encoded codepoint.
182  *
183  *  returns:
184  *    yajl_tok_string - if valid utf8 char was parsed and offset was
185  *                      advanced
186  *    yajl_tok_eof - if end of input was hit before validation could
187  *                   complete
188  *    yajl_tok_error - if invalid utf8 was encountered
189  *
190  *  NOTE: on error the offset will point to the first char of the
191  *  invalid utf8 */
192 #define UTF8_CHECK_EOF if (*offset >= jsonTextLen) { return yajl_tok_eof; }
193 
194 static yajl_tok
yajl_lex_utf8_char(yajl_lexer lexer,const unsigned char * jsonText,size_t jsonTextLen,size_t * offset,unsigned char curChar)195 yajl_lex_utf8_char(yajl_lexer lexer, const unsigned char * jsonText,
196                    size_t jsonTextLen, size_t * offset,
197                    unsigned char curChar)
198 {
199     if (curChar <= 0x7f) {
200         /* single byte */
201         return yajl_tok_string;
202     } else if ((curChar >> 5) == 0x6) {
203         /* two byte */
204         UTF8_CHECK_EOF;
205         curChar = readChar(lexer, jsonText, offset);
206         if ((curChar >> 6) == 0x2) return yajl_tok_string;
207     } else if ((curChar >> 4) == 0x0e) {
208         /* three byte */
209         UTF8_CHECK_EOF;
210         curChar = readChar(lexer, jsonText, offset);
211         if ((curChar >> 6) == 0x2) {
212             UTF8_CHECK_EOF;
213             curChar = readChar(lexer, jsonText, offset);
214             if ((curChar >> 6) == 0x2) return yajl_tok_string;
215         }
216     } else if ((curChar >> 3) == 0x1e) {
217         /* four byte */
218         UTF8_CHECK_EOF;
219         curChar = readChar(lexer, jsonText, offset);
220         if ((curChar >> 6) == 0x2) {
221             UTF8_CHECK_EOF;
222             curChar = readChar(lexer, jsonText, offset);
223             if ((curChar >> 6) == 0x2) {
224                 UTF8_CHECK_EOF;
225                 curChar = readChar(lexer, jsonText, offset);
226                 if ((curChar >> 6) == 0x2) return yajl_tok_string;
227             }
228         }
229     }
230 
231     return yajl_tok_error;
232 }
233 
234 /* lex a string.  input is the lexer, pointer to beginning of
235  * json text, and start of string (offset).
236  * a token is returned which has the following meanings:
237  * yajl_tok_string: lex of string was successful.  offset points to
238  *                  terminating '"'.
239  * yajl_tok_eof: end of text was encountered before we could complete
240  *               the lex.
241  * yajl_tok_error: embedded in the string were unallowable chars.  offset
242  *               points to the offending char
243  */
/* bail out of string lexing with yajl_tok_eof once the input text is
 * exhausted.  expects `tok`, `offset`, `jsonTextLen` and the label
 * `finish_string_lex` to exist at the point of use. */
#define STR_CHECK_EOF                       \
    do {                                    \
        if (*offset >= jsonTextLen) {       \
            tok = yajl_tok_eof;             \
            goto finish_string_lex;         \
        }                                   \
    } while (0)
249 
250 /** scan a string for interesting characters that might need further
251  *  review.  return the number of chars that are uninteresting and can
252  *  be skipped.
253  * (lth) hi world, any thoughts on how to make this routine faster? */
254 static size_t
yajl_string_scan(const unsigned char * buf,size_t len,int utf8check)255 yajl_string_scan(const unsigned char * buf, size_t len, int utf8check)
256 {
257     unsigned char mask = IJC|NFP|(utf8check ? NUC : 0);
258     size_t skip = 0;
259     while (skip < len && !(charLookupTable[*buf] & mask))
260     {
261         skip++;
262         buf++;
263     }
264     return skip;
265 }
266 
267 static yajl_tok
yajl_lex_string(yajl_lexer lexer,const unsigned char * jsonText,size_t jsonTextLen,size_t * offset)268 yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
269                 size_t jsonTextLen, size_t * offset)
270 {
271     yajl_tok tok = yajl_tok_error;
272     int hasEscapes = 0;
273 
274     for (;;) {
275         unsigned char curChar;
276 
277         /* now jump into a faster scanning routine to skip as much
278          * of the buffers as possible */
279         {
280             const unsigned char * p;
281             size_t len;
282 
283             if ((lexer->bufInUse && yajl_buf_len(lexer->buf) &&
284                  lexer->bufOff < yajl_buf_len(lexer->buf)))
285             {
286                 p = ((const unsigned char *) yajl_buf_data(lexer->buf) +
287                      (lexer->bufOff));
288                 len = yajl_buf_len(lexer->buf) - lexer->bufOff;
289                 lexer->bufOff += yajl_string_scan(p, len, lexer->validateUTF8);
290             }
291             else if (*offset < jsonTextLen)
292             {
293                 p = jsonText + *offset;
294                 len = jsonTextLen - *offset;
295                 *offset += yajl_string_scan(p, len, lexer->validateUTF8);
296             }
297         }
298 
299         STR_CHECK_EOF;
300 
301         curChar = readChar(lexer, jsonText, offset);
302 
303         /* quote terminates */
304         if (curChar == '"') {
305             tok = yajl_tok_string;
306             break;
307         }
308         /* backslash escapes a set of control chars, */
309         else if (curChar == '\\') {
310             hasEscapes = 1;
311             STR_CHECK_EOF;
312 
313             /* special case \u */
314             curChar = readChar(lexer, jsonText, offset);
315             if (curChar == 'u') {
316                 unsigned int i = 0;
317 
318                 for (i=0;i<4;i++) {
319                     STR_CHECK_EOF;
320                     curChar = readChar(lexer, jsonText, offset);
321                     if (!(charLookupTable[curChar] & VHC)) {
322                         /* back up to offending char */
323                         unreadChar(lexer, offset);
324                         lexer->error = yajl_lex_string_invalid_hex_char;
325                         goto finish_string_lex;
326                     }
327                 }
328             } else if (!(charLookupTable[curChar] & VEC)) {
329                 /* back up to offending char */
330                 unreadChar(lexer, offset);
331                 lexer->error = yajl_lex_string_invalid_escaped_char;
332                 goto finish_string_lex;
333             }
334         }
335         /* when not validating UTF8 it's a simple table lookup to determine
336          * if the present character is invalid */
337         else if(charLookupTable[curChar] & IJC) {
338             /* back up to offending char */
339             unreadChar(lexer, offset);
340             lexer->error = yajl_lex_string_invalid_json_char;
341             goto finish_string_lex;
342         }
343         /* when in validate UTF8 mode we need to do some extra work */
344         else if (lexer->validateUTF8) {
345             yajl_tok t = yajl_lex_utf8_char(lexer, jsonText, jsonTextLen,
346                                             offset, curChar);
347 
348             if (t == yajl_tok_eof) {
349                 tok = yajl_tok_eof;
350                 goto finish_string_lex;
351             } else if (t == yajl_tok_error) {
352                 lexer->error = yajl_lex_string_invalid_utf8;
353                 goto finish_string_lex;
354             }
355         }
356         /* accept it, and move on */
357     }
358   finish_string_lex:
359     /* tell our buddy, the parser, wether he needs to process this string
360      * again */
361     if (hasEscapes && tok == yajl_tok_string) {
362         tok = yajl_tok_string_with_escapes;
363     }
364 
365     return tok;
366 }
367 
368 #define RETURN_IF_EOF if (*offset >= jsonTextLen) return yajl_tok_eof;
369 
370 static yajl_tok
yajl_lex_number(yajl_lexer lexer,const unsigned char * jsonText,size_t jsonTextLen,size_t * offset)371 yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText,
372                 size_t jsonTextLen, size_t * offset)
373 {
374     /** XXX: numbers are the only entities in json that we must lex
375      *       _beyond_ in order to know that they are complete.  There
376      *       is an ambiguous case for integers at EOF. */
377 
378     unsigned char c;
379 
380     yajl_tok tok = yajl_tok_integer;
381 
382     RETURN_IF_EOF;
383     c = readChar(lexer, jsonText, offset);
384 
385     /* optional leading minus */
386     if (c == '-') {
387         RETURN_IF_EOF;
388         c = readChar(lexer, jsonText, offset);
389     }
390 
391     /* a single zero, or a series of integers */
392     if (c == '0') {
393         RETURN_IF_EOF;
394         c = readChar(lexer, jsonText, offset);
395     } else if (c >= '1' && c <= '9') {
396         do {
397             RETURN_IF_EOF;
398             c = readChar(lexer, jsonText, offset);
399         } while (c >= '0' && c <= '9');
400     } else {
401         unreadChar(lexer, offset);
402         lexer->error = yajl_lex_missing_integer_after_minus;
403         return yajl_tok_error;
404     }
405 
406     /* optional fraction (indicates this is floating point) */
407     if (c == '.') {
408         int numRd = 0;
409 
410         RETURN_IF_EOF;
411         c = readChar(lexer, jsonText, offset);
412 
413         while (c >= '0' && c <= '9') {
414             numRd++;
415             RETURN_IF_EOF;
416             c = readChar(lexer, jsonText, offset);
417         }
418 
419         if (!numRd) {
420             unreadChar(lexer, offset);
421             lexer->error = yajl_lex_missing_integer_after_decimal;
422             return yajl_tok_error;
423         }
424         tok = yajl_tok_double;
425     }
426 
427     /* optional exponent (indicates this is floating point) */
428     if (c == 'e' || c == 'E') {
429         RETURN_IF_EOF;
430         c = readChar(lexer, jsonText, offset);
431 
432         /* optional sign */
433         if (c == '+' || c == '-') {
434             RETURN_IF_EOF;
435             c = readChar(lexer, jsonText, offset);
436         }
437 
438         if (c >= '0' && c <= '9') {
439             do {
440                 RETURN_IF_EOF;
441                 c = readChar(lexer, jsonText, offset);
442             } while (c >= '0' && c <= '9');
443         } else {
444             unreadChar(lexer, offset);
445             lexer->error = yajl_lex_missing_integer_after_exponent;
446             return yajl_tok_error;
447         }
448         tok = yajl_tok_double;
449     }
450 
451     /* we always go "one too far" */
452     unreadChar(lexer, offset);
453 
454     return tok;
455 }
456 
457 static yajl_tok
yajl_lex_comment(yajl_lexer lexer,const unsigned char * jsonText,size_t jsonTextLen,size_t * offset)458 yajl_lex_comment(yajl_lexer lexer, const unsigned char * jsonText,
459                  size_t jsonTextLen, size_t * offset)
460 {
461     unsigned char c;
462 
463     yajl_tok tok = yajl_tok_comment;
464 
465     RETURN_IF_EOF;
466     c = readChar(lexer, jsonText, offset);
467 
468     /* either slash or star expected */
469     if (c == '/') {
470         /* now we throw away until end of line */
471         do {
472             RETURN_IF_EOF;
473             c = readChar(lexer, jsonText, offset);
474         } while (c != '\n');
475     } else if (c == '*') {
476         /* now we throw away until end of comment */
477         for (;;) {
478             RETURN_IF_EOF;
479             c = readChar(lexer, jsonText, offset);
480             if (c == '*') {
481                 RETURN_IF_EOF;
482                 c = readChar(lexer, jsonText, offset);
483                 if (c == '/') {
484                     break;
485                 } else {
486                     unreadChar(lexer, offset);
487                 }
488             }
489         }
490     } else {
491         lexer->error = yajl_lex_invalid_char;
492         tok = yajl_tok_error;
493     }
494 
495     return tok;
496 }
497 
498 yajl_tok
yajl_lex_lex(yajl_lexer lexer,const unsigned char * jsonText,size_t jsonTextLen,size_t * offset,const unsigned char ** outBuf,size_t * outLen)499 yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText,
500              size_t jsonTextLen, size_t * offset,
501              const unsigned char ** outBuf, size_t * outLen)
502 {
503     yajl_tok tok = yajl_tok_error;
504     unsigned char c;
505     size_t startOffset = *offset;
506 
507     *outBuf = NULL;
508     *outLen = 0;
509 
510     for (;;) {
511         assert(*offset <= jsonTextLen);
512 
513         if (*offset >= jsonTextLen) {
514             tok = yajl_tok_eof;
515             goto lexed;
516         }
517 
518         c = readChar(lexer, jsonText, offset);
519 
520         switch (c) {
521             case '{':
522                 tok = yajl_tok_left_bracket;
523                 goto lexed;
524             case '}':
525                 tok = yajl_tok_right_bracket;
526                 goto lexed;
527             case '[':
528                 tok = yajl_tok_left_brace;
529                 goto lexed;
530             case ']':
531                 tok = yajl_tok_right_brace;
532                 goto lexed;
533             case ',':
534                 tok = yajl_tok_comma;
535                 goto lexed;
536             case ':':
537                 tok = yajl_tok_colon;
538                 goto lexed;
539             case '\t': case '\n': case '\v': case '\f': case '\r': case ' ':
540                 startOffset++;
541                 break;
542             case 't': {
543                 const char * want = "rue";
544                 do {
545                     if (*offset >= jsonTextLen) {
546                         tok = yajl_tok_eof;
547                         goto lexed;
548                     }
549                     c = readChar(lexer, jsonText, offset);
550                     if (c != *want) {
551                         unreadChar(lexer, offset);
552                         lexer->error = yajl_lex_invalid_string;
553                         tok = yajl_tok_error;
554                         goto lexed;
555                     }
556                 } while (*(++want));
557                 tok = yajl_tok_bool;
558                 goto lexed;
559             }
560             case 'f': {
561                 const char * want = "alse";
562                 do {
563                     if (*offset >= jsonTextLen) {
564                         tok = yajl_tok_eof;
565                         goto lexed;
566                     }
567                     c = readChar(lexer, jsonText, offset);
568                     if (c != *want) {
569                         unreadChar(lexer, offset);
570                         lexer->error = yajl_lex_invalid_string;
571                         tok = yajl_tok_error;
572                         goto lexed;
573                     }
574                 } while (*(++want));
575                 tok = yajl_tok_bool;
576                 goto lexed;
577             }
578             case 'n': {
579                 const char * want = "ull";
580                 do {
581                     if (*offset >= jsonTextLen) {
582                         tok = yajl_tok_eof;
583                         goto lexed;
584                     }
585                     c = readChar(lexer, jsonText, offset);
586                     if (c != *want) {
587                         unreadChar(lexer, offset);
588                         lexer->error = yajl_lex_invalid_string;
589                         tok = yajl_tok_error;
590                         goto lexed;
591                     }
592                 } while (*(++want));
593                 tok = yajl_tok_null;
594                 goto lexed;
595             }
596             case '"': {
597                 tok = yajl_lex_string(lexer, (const unsigned char *) jsonText,
598                                       jsonTextLen, offset);
599                 goto lexed;
600             }
601             case '-':
602             case '0': case '1': case '2': case '3': case '4':
603             case '5': case '6': case '7': case '8': case '9': {
604                 /* integer parsing wants to start from the beginning */
605                 unreadChar(lexer, offset);
606                 tok = yajl_lex_number(lexer, (const unsigned char *) jsonText,
607                                       jsonTextLen, offset);
608                 goto lexed;
609             }
610             case '/':
611                 /* hey, look, a probable comment!  If comments are disabled
612                  * it's an error. */
613                 if (!lexer->allowComments) {
614                     unreadChar(lexer, offset);
615                     lexer->error = yajl_lex_unallowed_comment;
616                     tok = yajl_tok_error;
617                     goto lexed;
618                 }
619                 /* if comments are enabled, then we should try to lex
620                  * the thing.  possible outcomes are
621                  * - successful lex (tok_comment, which means continue),
622                  * - malformed comment opening (slash not followed by
623                  *   '*' or '/') (tok_error)
624                  * - eof hit. (tok_eof) */
625                 tok = yajl_lex_comment(lexer, (const unsigned char *) jsonText,
626                                        jsonTextLen, offset);
627                 if (tok == yajl_tok_comment) {
628                     /* "error" is silly, but that's the initial
629                      * state of tok.  guilty until proven innocent. */
630                     tok = yajl_tok_error;
631                     yajl_buf_clear(lexer->buf);
632                     lexer->bufInUse = 0;
633                     startOffset = *offset;
634                     break;
635                 }
636                 /* hit error or eof, bail */
637                 goto lexed;
638             default:
639                 lexer->error = yajl_lex_invalid_char;
640                 tok = yajl_tok_error;
641                 goto lexed;
642         }
643     }
644 
645 
646   lexed:
647     /* need to append to buffer if the buffer is in use or
648      * if it's an EOF token */
649     if (tok == yajl_tok_eof || lexer->bufInUse) {
650         if (!lexer->bufInUse) yajl_buf_clear(lexer->buf);
651         lexer->bufInUse = 1;
652         yajl_buf_append(lexer->buf, jsonText + startOffset, *offset - startOffset);
653         lexer->bufOff = 0;
654 
655         if (tok != yajl_tok_eof) {
656             *outBuf = yajl_buf_data(lexer->buf);
657             *outLen = yajl_buf_len(lexer->buf);
658             lexer->bufInUse = 0;
659         }
660     } else if (tok != yajl_tok_error) {
661         *outBuf = jsonText + startOffset;
662         *outLen = *offset - startOffset;
663     }
664 
665     /* special case for strings. skip the quotes. */
666     if (tok == yajl_tok_string || tok == yajl_tok_string_with_escapes)
667     {
668         assert(*outLen >= 2);
669         (*outBuf)++;
670         *outLen -= 2;
671     }
672 
673 
674 #ifdef YAJL_LEXER_DEBUG
675     if (tok == yajl_tok_error) {
676         printf("lexical error: %s\n",
677                yajl_lex_error_to_string(yajl_lex_get_error(lexer)));
678     } else if (tok == yajl_tok_eof) {
679         printf("EOF hit\n");
680     } else {
681         printf("lexed %s: '", tokToStr(tok));
682         fwrite(*outBuf, 1, *outLen, stdout);
683         printf("'\n");
684     }
685 #endif
686 
687     return tok;
688 }
689 
690 const char *
yajl_lex_error_to_string(yajl_lex_error error)691 yajl_lex_error_to_string(yajl_lex_error error)
692 {
693     switch (error) {
694         case yajl_lex_e_ok:
695             return "ok, no error";
696         case yajl_lex_string_invalid_utf8:
697             return "invalid bytes in UTF8 string.";
698         case yajl_lex_string_invalid_escaped_char:
699             return "inside a string, '\\' occurs before a character "
700                    "which it may not.";
701         case yajl_lex_string_invalid_json_char:
702             return "invalid character inside string.";
703         case yajl_lex_string_invalid_hex_char:
704             return "invalid (non-hex) character occurs after '\\u' inside "
705                    "string.";
706         case yajl_lex_invalid_char:
707             return "invalid char in json text.";
708         case yajl_lex_invalid_string:
709             return "invalid string in json text.";
710         case yajl_lex_missing_integer_after_exponent:
711             return "malformed number, a digit is required after the exponent.";
712         case yajl_lex_missing_integer_after_decimal:
713             return "malformed number, a digit is required after the "
714                    "decimal point.";
715         case yajl_lex_missing_integer_after_minus:
716             return "malformed number, a digit is required after the "
717                    "minus sign.";
718         case yajl_lex_unallowed_comment:
719             return "probable comment found in input text, comments are "
720                    "not enabled.";
721     }
722     return "unknown error code";
723 }
724 
725 
726 /** allows access to more specific information about the lexical
727  *  error when yajl_lex_lex returns yajl_tok_error. */
728 yajl_lex_error
yajl_lex_get_error(yajl_lexer lexer)729 yajl_lex_get_error(yajl_lexer lexer)
730 {
731     if (lexer == NULL) return (yajl_lex_error) -1;
732     return lexer->error;
733 }
734 
/* report the lexer's line offset (its lineOff field).
 * lexer must not be NULL. */
size_t
yajl_lex_current_line(yajl_lexer lexer)
{
    size_t line = lexer->lineOff;

    return line;
}
739 
/* report the lexer's character offset (its charOff field).
 * lexer must not be NULL. */
size_t
yajl_lex_current_char(yajl_lexer lexer)
{
    size_t column = lexer->charOff;

    return column;
}
744 
yajl_lex_peek(yajl_lexer lexer,const unsigned char * jsonText,size_t jsonTextLen,size_t offset)745 yajl_tok yajl_lex_peek(yajl_lexer lexer, const unsigned char * jsonText,
746                        size_t jsonTextLen, size_t offset)
747 {
748     const unsigned char * outBuf;
749     size_t outLen;
750     size_t bufLen = yajl_buf_len(lexer->buf);
751     size_t bufOff = lexer->bufOff;
752     unsigned int bufInUse = lexer->bufInUse;
753     yajl_tok tok;
754 
755     tok = yajl_lex_lex(lexer, jsonText, jsonTextLen, &offset,
756                        &outBuf, &outLen);
757 
758     lexer->bufOff = bufOff;
759     lexer->bufInUse = bufInUse;
760     yajl_buf_truncate(lexer->buf, bufLen);
761 
762     return tok;
763 }
764