1 /*
2 * Copyright (c) 2007-2014, Lloyd Hilaiel <me@lloyd.io>
3 *
4 * Permission to use, copy, modify, and/or distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
7 *
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15 */
16
17 #include "yajl_lex.h"
18 #include "yajl_buf.h"
19
20 #include <stdlib.h>
21 #include <stdio.h>
22 #include <assert.h>
23 #include <string.h>
24
25 #ifdef YAJL_LEXER_DEBUG
26 static const char *
tokToStr(yajl_tok tok)27 tokToStr(yajl_tok tok)
28 {
29 switch (tok) {
30 case yajl_tok_bool: return "bool";
31 case yajl_tok_colon: return "colon";
32 case yajl_tok_comma: return "comma";
33 case yajl_tok_eof: return "eof";
34 case yajl_tok_error: return "error";
35 case yajl_tok_left_brace: return "brace";
36 case yajl_tok_left_bracket: return "bracket";
37 case yajl_tok_null: return "null";
38 case yajl_tok_integer: return "integer";
39 case yajl_tok_double: return "double";
40 case yajl_tok_right_brace: return "brace";
41 case yajl_tok_right_bracket: return "bracket";
42 case yajl_tok_string: return "string";
43 case yajl_tok_string_with_escapes: return "string_with_escapes";
44 }
45 return "unknown";
46 }
47 #endif
48
/* Impact of the stream parsing feature on the lexer:
 *
 * YAJL supports stream parsing.  That is, the ability to parse the first
 * bits of a chunk of JSON before the last bits are available (still on
 * the network or disk).  This makes the lexer more complex.  The
 * responsibility of the lexer is to handle transparently the case where
 * a chunk boundary falls in the middle of a token.  This is
 * accomplished via a buffer and a character reading abstraction.
 *
 * Overview of implementation
 *
 * When we lex to end of input string before end of token is hit, we
 * copy all of the input text composing the token into our lexBuf.
 *
 * Every time we read a character, we do so through the readChar function.
 * readChar's responsibility is to handle pulling all chars from the buffer
 * before pulling chars from input text
 */
67
68 struct yajl_lexer_t {
69 /* the overal line and char offset into the data */
70 size_t lineOff;
71 size_t charOff;
72
73 /* error */
74 yajl_lex_error error;
75
76 /* a input buffer to handle the case where a token is spread over
77 * multiple chunks */
78 yajl_buf buf;
79
80 /* in the case where we have data in the lexBuf, bufOff holds
81 * the current offset into the lexBuf. */
82 size_t bufOff;
83
84 /* are we using the lex buf? */
85 unsigned int bufInUse;
86
87 /* shall we allow comments? */
88 unsigned int allowComments;
89
90 /* shall we validate utf8 inside strings? */
91 unsigned int validateUTF8;
92
93 yajl_alloc_funcs * alloc;
94 };
95
/* Fetch the next input byte.
 *
 * While the lexer's carry-over buffer is in use and still has unread bytes,
 * the byte is taken from that buffer and bufOff advances; otherwise it is
 * taken from txt and *off advances.  The macro performs no end-of-input
 * check of its own.  Every argument is parenthesized, but lxr and off are
 * evaluated more than once, so pass side-effect-free expressions. */
#define readChar(lxr, txt, off)                                              \
    (((lxr)->bufInUse && yajl_buf_len((lxr)->buf) &&                         \
      (lxr)->bufOff < yajl_buf_len((lxr)->buf)) ?                            \
     (*((const unsigned char *) yajl_buf_data((lxr)->buf) +                  \
        ((lxr)->bufOff)++)) :                                                \
     ((txt)[(*(off))++]))

/* Push the most recently read byte back: step *off back when it is
 * non-zero, otherwise step the carry-over buffer position back. */
#define unreadChar(lxr, off) ((*(off) > 0) ? (*(off))-- : ((lxr)->bufOff--))
102
103 yajl_lexer
yajl_lex_alloc(yajl_alloc_funcs * alloc,unsigned int allowComments,unsigned int validateUTF8)104 yajl_lex_alloc(yajl_alloc_funcs * alloc,
105 unsigned int allowComments, unsigned int validateUTF8)
106 {
107 yajl_lexer lxr = (yajl_lexer) YA_MALLOC(alloc, sizeof(struct yajl_lexer_t));
108 memset((void *) lxr, 0, sizeof(struct yajl_lexer_t));
109 lxr->buf = yajl_buf_alloc(alloc);
110 lxr->allowComments = allowComments;
111 lxr->validateUTF8 = validateUTF8;
112 lxr->alloc = alloc;
113 return lxr;
114 }
115
116 void
yajl_lex_free(yajl_lexer lxr)117 yajl_lex_free(yajl_lexer lxr)
118 {
119 yajl_buf_free(lxr->buf);
120 YA_FREE(lxr->alloc, lxr);
121 return;
122 }
123
/* Per-byte classification flags, indexed by byte value:
 *  VEC - valid escaped control char, i.e. the byte may follow a backslash
 *        (the solidus '/' may be escaped or not)
 *  IJC - invalid json char
 *  VHC - valid hex char
 *  NFP - needs further processing (from a string scanning perspective)
 *  NUC - needs utf8 checking when enabled (from a string scanning
 *        perspective)
 */
#define VEC 0x01
#define IJC 0x02
#define VHC 0x04
#define NFP 0x08
#define NUC 0x10

/* repetition helpers, local to the table definition */
#define LEX_REP8(v)  v, v, v, v, v, v, v, v
#define LEX_REP32(v) LEX_REP8(v), LEX_REP8(v), LEX_REP8(v), LEX_REP8(v)

static const char charLookupTable[256] =
{
    /* 0x00 - 0x1f */
    LEX_REP32(IJC),

    /* 0x20 - 0x2f: the double quote sits at 0x22, the solidus at 0x2f */
    0, 0, NFP|VEC|IJC, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, VEC,

    /* 0x30 - 0x3f: digits 0-9 */
    LEX_REP8(VHC),
    VHC, VHC, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x4f: letters A-F */
    0, VHC, VHC, VHC, VHC, VHC, VHC, 0,
    LEX_REP8(0),

    /* 0x50 - 0x5f: the backslash sits at 0x5c */
    LEX_REP8(0),
    0, 0, 0, 0, NFP|VEC|IJC, 0, 0, 0,

    /* 0x60 - 0x6f: letters a-f (b and f are also escapes), n */
    0, VHC, VEC|VHC, VHC, VHC, VHC, VEC|VHC, 0,
    0, 0, 0, 0, 0, 0, VEC, 0,

    /* 0x70 - 0x7f: r and t */
    0, 0, VEC, 0, VEC, 0, 0, 0,
    LEX_REP8(0),

    /* 0x80 - 0xff */
    LEX_REP32(NUC), LEX_REP32(NUC), LEX_REP32(NUC), LEX_REP32(NUC)
};

#undef LEX_REP32
#undef LEX_REP8
180
181 /** process a variable length utf8 encoded codepoint.
182 *
183 * returns:
184 * yajl_tok_string - if valid utf8 char was parsed and offset was
185 * advanced
186 * yajl_tok_eof - if end of input was hit before validation could
187 * complete
188 * yajl_tok_error - if invalid utf8 was encountered
189 *
190 * NOTE: on error the offset will point to the first char of the
191 * invalid utf8 */
192 #define UTF8_CHECK_EOF if (*offset >= jsonTextLen) { return yajl_tok_eof; }
193
194 static yajl_tok
yajl_lex_utf8_char(yajl_lexer lexer,const unsigned char * jsonText,size_t jsonTextLen,size_t * offset,unsigned char curChar)195 yajl_lex_utf8_char(yajl_lexer lexer, const unsigned char * jsonText,
196 size_t jsonTextLen, size_t * offset,
197 unsigned char curChar)
198 {
199 if (curChar <= 0x7f) {
200 /* single byte */
201 return yajl_tok_string;
202 } else if ((curChar >> 5) == 0x6) {
203 /* two byte */
204 UTF8_CHECK_EOF;
205 curChar = readChar(lexer, jsonText, offset);
206 if ((curChar >> 6) == 0x2) return yajl_tok_string;
207 } else if ((curChar >> 4) == 0x0e) {
208 /* three byte */
209 UTF8_CHECK_EOF;
210 curChar = readChar(lexer, jsonText, offset);
211 if ((curChar >> 6) == 0x2) {
212 UTF8_CHECK_EOF;
213 curChar = readChar(lexer, jsonText, offset);
214 if ((curChar >> 6) == 0x2) return yajl_tok_string;
215 }
216 } else if ((curChar >> 3) == 0x1e) {
217 /* four byte */
218 UTF8_CHECK_EOF;
219 curChar = readChar(lexer, jsonText, offset);
220 if ((curChar >> 6) == 0x2) {
221 UTF8_CHECK_EOF;
222 curChar = readChar(lexer, jsonText, offset);
223 if ((curChar >> 6) == 0x2) {
224 UTF8_CHECK_EOF;
225 curChar = readChar(lexer, jsonText, offset);
226 if ((curChar >> 6) == 0x2) return yajl_tok_string;
227 }
228 }
229 }
230
231 return yajl_tok_error;
232 }
233
/* lex a string.  input is the lexer, pointer to beginning of
 * json text, and start of string (offset).
 * a token is returned which has the following meanings:
 * yajl_tok_string / yajl_tok_string_with_escapes: lex of string was
 *   successful.  offset points just past the terminating '"'.  The
 *   second form is returned when a backslash was seen in the string.
 * yajl_tok_eof: end of text was encountered before we could complete
 *   the lex.
 * yajl_tok_error: embedded in the string were unallowable chars.  For an
 *   invalid escape, hex digit or json char, offset points to the
 *   offending char; for invalid utf8 nothing is pushed back.
 *
 * STR_CHECK_EOF is the end-of-input test used while lexing a string: it
 * sets tok to yajl_tok_eof and jumps to finish_string_lex.  It is wrapped
 * in do/while(0) so that "STR_CHECK_EOF;" is a single statement wherever
 * it is placed.
 */
#define STR_CHECK_EOF                                                       \
    do {                                                                    \
        if (*offset >= jsonTextLen) {                                       \
            tok = yajl_tok_eof;                                             \
            goto finish_string_lex;                                         \
        }                                                                   \
    } while (0)
249
250 /** scan a string for interesting characters that might need further
251 * review. return the number of chars that are uninteresting and can
252 * be skipped.
253 * (lth) hi world, any thoughts on how to make this routine faster? */
254 static size_t
yajl_string_scan(const unsigned char * buf,size_t len,int utf8check)255 yajl_string_scan(const unsigned char * buf, size_t len, int utf8check)
256 {
257 unsigned char mask = IJC|NFP|(utf8check ? NUC : 0);
258 size_t skip = 0;
259 while (skip < len && !(charLookupTable[*buf] & mask))
260 {
261 skip++;
262 buf++;
263 }
264 return skip;
265 }
266
267 static yajl_tok
yajl_lex_string(yajl_lexer lexer,const unsigned char * jsonText,size_t jsonTextLen,size_t * offset)268 yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
269 size_t jsonTextLen, size_t * offset)
270 {
271 yajl_tok tok = yajl_tok_error;
272 int hasEscapes = 0;
273
274 for (;;) {
275 unsigned char curChar;
276
277 /* now jump into a faster scanning routine to skip as much
278 * of the buffers as possible */
279 {
280 const unsigned char * p;
281 size_t len;
282
283 if ((lexer->bufInUse && yajl_buf_len(lexer->buf) &&
284 lexer->bufOff < yajl_buf_len(lexer->buf)))
285 {
286 p = ((const unsigned char *) yajl_buf_data(lexer->buf) +
287 (lexer->bufOff));
288 len = yajl_buf_len(lexer->buf) - lexer->bufOff;
289 lexer->bufOff += yajl_string_scan(p, len, lexer->validateUTF8);
290 }
291 else if (*offset < jsonTextLen)
292 {
293 p = jsonText + *offset;
294 len = jsonTextLen - *offset;
295 *offset += yajl_string_scan(p, len, lexer->validateUTF8);
296 }
297 }
298
299 STR_CHECK_EOF;
300
301 curChar = readChar(lexer, jsonText, offset);
302
303 /* quote terminates */
304 if (curChar == '"') {
305 tok = yajl_tok_string;
306 break;
307 }
308 /* backslash escapes a set of control chars, */
309 else if (curChar == '\\') {
310 hasEscapes = 1;
311 STR_CHECK_EOF;
312
313 /* special case \u */
314 curChar = readChar(lexer, jsonText, offset);
315 if (curChar == 'u') {
316 unsigned int i = 0;
317
318 for (i=0;i<4;i++) {
319 STR_CHECK_EOF;
320 curChar = readChar(lexer, jsonText, offset);
321 if (!(charLookupTable[curChar] & VHC)) {
322 /* back up to offending char */
323 unreadChar(lexer, offset);
324 lexer->error = yajl_lex_string_invalid_hex_char;
325 goto finish_string_lex;
326 }
327 }
328 } else if (!(charLookupTable[curChar] & VEC)) {
329 /* back up to offending char */
330 unreadChar(lexer, offset);
331 lexer->error = yajl_lex_string_invalid_escaped_char;
332 goto finish_string_lex;
333 }
334 }
335 /* when not validating UTF8 it's a simple table lookup to determine
336 * if the present character is invalid */
337 else if(charLookupTable[curChar] & IJC) {
338 /* back up to offending char */
339 unreadChar(lexer, offset);
340 lexer->error = yajl_lex_string_invalid_json_char;
341 goto finish_string_lex;
342 }
343 /* when in validate UTF8 mode we need to do some extra work */
344 else if (lexer->validateUTF8) {
345 yajl_tok t = yajl_lex_utf8_char(lexer, jsonText, jsonTextLen,
346 offset, curChar);
347
348 if (t == yajl_tok_eof) {
349 tok = yajl_tok_eof;
350 goto finish_string_lex;
351 } else if (t == yajl_tok_error) {
352 lexer->error = yajl_lex_string_invalid_utf8;
353 goto finish_string_lex;
354 }
355 }
356 /* accept it, and move on */
357 }
358 finish_string_lex:
359 /* tell our buddy, the parser, wether he needs to process this string
360 * again */
361 if (hasEscapes && tok == yajl_tok_string) {
362 tok = yajl_tok_string_with_escapes;
363 }
364
365 return tok;
366 }
367
/* Return yajl_tok_eof from the enclosing function once *offset has reached
 * jsonTextLen.  Wrapped in do/while(0) and written without a trailing
 * semicolon so that "RETURN_IF_EOF;" is exactly one statement. */
#define RETURN_IF_EOF \
    do { if (*offset >= jsonTextLen) return yajl_tok_eof; } while (0)
369
370 static yajl_tok
yajl_lex_number(yajl_lexer lexer,const unsigned char * jsonText,size_t jsonTextLen,size_t * offset)371 yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText,
372 size_t jsonTextLen, size_t * offset)
373 {
374 /** XXX: numbers are the only entities in json that we must lex
375 * _beyond_ in order to know that they are complete. There
376 * is an ambiguous case for integers at EOF. */
377
378 unsigned char c;
379
380 yajl_tok tok = yajl_tok_integer;
381
382 RETURN_IF_EOF;
383 c = readChar(lexer, jsonText, offset);
384
385 /* optional leading minus */
386 if (c == '-') {
387 RETURN_IF_EOF;
388 c = readChar(lexer, jsonText, offset);
389 }
390
391 /* a single zero, or a series of integers */
392 if (c == '0') {
393 RETURN_IF_EOF;
394 c = readChar(lexer, jsonText, offset);
395 } else if (c >= '1' && c <= '9') {
396 do {
397 RETURN_IF_EOF;
398 c = readChar(lexer, jsonText, offset);
399 } while (c >= '0' && c <= '9');
400 } else {
401 unreadChar(lexer, offset);
402 lexer->error = yajl_lex_missing_integer_after_minus;
403 return yajl_tok_error;
404 }
405
406 /* optional fraction (indicates this is floating point) */
407 if (c == '.') {
408 int numRd = 0;
409
410 RETURN_IF_EOF;
411 c = readChar(lexer, jsonText, offset);
412
413 while (c >= '0' && c <= '9') {
414 numRd++;
415 RETURN_IF_EOF;
416 c = readChar(lexer, jsonText, offset);
417 }
418
419 if (!numRd) {
420 unreadChar(lexer, offset);
421 lexer->error = yajl_lex_missing_integer_after_decimal;
422 return yajl_tok_error;
423 }
424 tok = yajl_tok_double;
425 }
426
427 /* optional exponent (indicates this is floating point) */
428 if (c == 'e' || c == 'E') {
429 RETURN_IF_EOF;
430 c = readChar(lexer, jsonText, offset);
431
432 /* optional sign */
433 if (c == '+' || c == '-') {
434 RETURN_IF_EOF;
435 c = readChar(lexer, jsonText, offset);
436 }
437
438 if (c >= '0' && c <= '9') {
439 do {
440 RETURN_IF_EOF;
441 c = readChar(lexer, jsonText, offset);
442 } while (c >= '0' && c <= '9');
443 } else {
444 unreadChar(lexer, offset);
445 lexer->error = yajl_lex_missing_integer_after_exponent;
446 return yajl_tok_error;
447 }
448 tok = yajl_tok_double;
449 }
450
451 /* we always go "one too far" */
452 unreadChar(lexer, offset);
453
454 return tok;
455 }
456
457 static yajl_tok
yajl_lex_comment(yajl_lexer lexer,const unsigned char * jsonText,size_t jsonTextLen,size_t * offset)458 yajl_lex_comment(yajl_lexer lexer, const unsigned char * jsonText,
459 size_t jsonTextLen, size_t * offset)
460 {
461 unsigned char c;
462
463 yajl_tok tok = yajl_tok_comment;
464
465 RETURN_IF_EOF;
466 c = readChar(lexer, jsonText, offset);
467
468 /* either slash or star expected */
469 if (c == '/') {
470 /* now we throw away until end of line */
471 do {
472 RETURN_IF_EOF;
473 c = readChar(lexer, jsonText, offset);
474 } while (c != '\n');
475 } else if (c == '*') {
476 /* now we throw away until end of comment */
477 for (;;) {
478 RETURN_IF_EOF;
479 c = readChar(lexer, jsonText, offset);
480 if (c == '*') {
481 RETURN_IF_EOF;
482 c = readChar(lexer, jsonText, offset);
483 if (c == '/') {
484 break;
485 } else {
486 unreadChar(lexer, offset);
487 }
488 }
489 }
490 } else {
491 lexer->error = yajl_lex_invalid_char;
492 tok = yajl_tok_error;
493 }
494
495 return tok;
496 }
497
498 yajl_tok
yajl_lex_lex(yajl_lexer lexer,const unsigned char * jsonText,size_t jsonTextLen,size_t * offset,const unsigned char ** outBuf,size_t * outLen)499 yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText,
500 size_t jsonTextLen, size_t * offset,
501 const unsigned char ** outBuf, size_t * outLen)
502 {
503 yajl_tok tok = yajl_tok_error;
504 unsigned char c;
505 size_t startOffset = *offset;
506
507 *outBuf = NULL;
508 *outLen = 0;
509
510 for (;;) {
511 assert(*offset <= jsonTextLen);
512
513 if (*offset >= jsonTextLen) {
514 tok = yajl_tok_eof;
515 goto lexed;
516 }
517
518 c = readChar(lexer, jsonText, offset);
519
520 switch (c) {
521 case '{':
522 tok = yajl_tok_left_bracket;
523 goto lexed;
524 case '}':
525 tok = yajl_tok_right_bracket;
526 goto lexed;
527 case '[':
528 tok = yajl_tok_left_brace;
529 goto lexed;
530 case ']':
531 tok = yajl_tok_right_brace;
532 goto lexed;
533 case ',':
534 tok = yajl_tok_comma;
535 goto lexed;
536 case ':':
537 tok = yajl_tok_colon;
538 goto lexed;
539 case '\t': case '\n': case '\v': case '\f': case '\r': case ' ':
540 startOffset++;
541 break;
542 case 't': {
543 const char * want = "rue";
544 do {
545 if (*offset >= jsonTextLen) {
546 tok = yajl_tok_eof;
547 goto lexed;
548 }
549 c = readChar(lexer, jsonText, offset);
550 if (c != *want) {
551 unreadChar(lexer, offset);
552 lexer->error = yajl_lex_invalid_string;
553 tok = yajl_tok_error;
554 goto lexed;
555 }
556 } while (*(++want));
557 tok = yajl_tok_bool;
558 goto lexed;
559 }
560 case 'f': {
561 const char * want = "alse";
562 do {
563 if (*offset >= jsonTextLen) {
564 tok = yajl_tok_eof;
565 goto lexed;
566 }
567 c = readChar(lexer, jsonText, offset);
568 if (c != *want) {
569 unreadChar(lexer, offset);
570 lexer->error = yajl_lex_invalid_string;
571 tok = yajl_tok_error;
572 goto lexed;
573 }
574 } while (*(++want));
575 tok = yajl_tok_bool;
576 goto lexed;
577 }
578 case 'n': {
579 const char * want = "ull";
580 do {
581 if (*offset >= jsonTextLen) {
582 tok = yajl_tok_eof;
583 goto lexed;
584 }
585 c = readChar(lexer, jsonText, offset);
586 if (c != *want) {
587 unreadChar(lexer, offset);
588 lexer->error = yajl_lex_invalid_string;
589 tok = yajl_tok_error;
590 goto lexed;
591 }
592 } while (*(++want));
593 tok = yajl_tok_null;
594 goto lexed;
595 }
596 case '"': {
597 tok = yajl_lex_string(lexer, (const unsigned char *) jsonText,
598 jsonTextLen, offset);
599 goto lexed;
600 }
601 case '-':
602 case '0': case '1': case '2': case '3': case '4':
603 case '5': case '6': case '7': case '8': case '9': {
604 /* integer parsing wants to start from the beginning */
605 unreadChar(lexer, offset);
606 tok = yajl_lex_number(lexer, (const unsigned char *) jsonText,
607 jsonTextLen, offset);
608 goto lexed;
609 }
610 case '/':
611 /* hey, look, a probable comment! If comments are disabled
612 * it's an error. */
613 if (!lexer->allowComments) {
614 unreadChar(lexer, offset);
615 lexer->error = yajl_lex_unallowed_comment;
616 tok = yajl_tok_error;
617 goto lexed;
618 }
619 /* if comments are enabled, then we should try to lex
620 * the thing. possible outcomes are
621 * - successful lex (tok_comment, which means continue),
622 * - malformed comment opening (slash not followed by
623 * '*' or '/') (tok_error)
624 * - eof hit. (tok_eof) */
625 tok = yajl_lex_comment(lexer, (const unsigned char *) jsonText,
626 jsonTextLen, offset);
627 if (tok == yajl_tok_comment) {
628 /* "error" is silly, but that's the initial
629 * state of tok. guilty until proven innocent. */
630 tok = yajl_tok_error;
631 yajl_buf_clear(lexer->buf);
632 lexer->bufInUse = 0;
633 startOffset = *offset;
634 break;
635 }
636 /* hit error or eof, bail */
637 goto lexed;
638 default:
639 lexer->error = yajl_lex_invalid_char;
640 tok = yajl_tok_error;
641 goto lexed;
642 }
643 }
644
645
646 lexed:
647 /* need to append to buffer if the buffer is in use or
648 * if it's an EOF token */
649 if (tok == yajl_tok_eof || lexer->bufInUse) {
650 if (!lexer->bufInUse) yajl_buf_clear(lexer->buf);
651 lexer->bufInUse = 1;
652 yajl_buf_append(lexer->buf, jsonText + startOffset, *offset - startOffset);
653 lexer->bufOff = 0;
654
655 if (tok != yajl_tok_eof) {
656 *outBuf = yajl_buf_data(lexer->buf);
657 *outLen = yajl_buf_len(lexer->buf);
658 lexer->bufInUse = 0;
659 }
660 } else if (tok != yajl_tok_error) {
661 *outBuf = jsonText + startOffset;
662 *outLen = *offset - startOffset;
663 }
664
665 /* special case for strings. skip the quotes. */
666 if (tok == yajl_tok_string || tok == yajl_tok_string_with_escapes)
667 {
668 assert(*outLen >= 2);
669 (*outBuf)++;
670 *outLen -= 2;
671 }
672
673
674 #ifdef YAJL_LEXER_DEBUG
675 if (tok == yajl_tok_error) {
676 printf("lexical error: %s\n",
677 yajl_lex_error_to_string(yajl_lex_get_error(lexer)));
678 } else if (tok == yajl_tok_eof) {
679 printf("EOF hit\n");
680 } else {
681 printf("lexed %s: '", tokToStr(tok));
682 fwrite(*outBuf, 1, *outLen, stdout);
683 printf("'\n");
684 }
685 #endif
686
687 return tok;
688 }
689
690 const char *
yajl_lex_error_to_string(yajl_lex_error error)691 yajl_lex_error_to_string(yajl_lex_error error)
692 {
693 switch (error) {
694 case yajl_lex_e_ok:
695 return "ok, no error";
696 case yajl_lex_string_invalid_utf8:
697 return "invalid bytes in UTF8 string.";
698 case yajl_lex_string_invalid_escaped_char:
699 return "inside a string, '\\' occurs before a character "
700 "which it may not.";
701 case yajl_lex_string_invalid_json_char:
702 return "invalid character inside string.";
703 case yajl_lex_string_invalid_hex_char:
704 return "invalid (non-hex) character occurs after '\\u' inside "
705 "string.";
706 case yajl_lex_invalid_char:
707 return "invalid char in json text.";
708 case yajl_lex_invalid_string:
709 return "invalid string in json text.";
710 case yajl_lex_missing_integer_after_exponent:
711 return "malformed number, a digit is required after the exponent.";
712 case yajl_lex_missing_integer_after_decimal:
713 return "malformed number, a digit is required after the "
714 "decimal point.";
715 case yajl_lex_missing_integer_after_minus:
716 return "malformed number, a digit is required after the "
717 "minus sign.";
718 case yajl_lex_unallowed_comment:
719 return "probable comment found in input text, comments are "
720 "not enabled.";
721 }
722 return "unknown error code";
723 }
724
725
726 /** allows access to more specific information about the lexical
727 * error when yajl_lex_lex returns yajl_tok_error. */
728 yajl_lex_error
yajl_lex_get_error(yajl_lexer lexer)729 yajl_lex_get_error(yajl_lexer lexer)
730 {
731 if (lexer == NULL) return (yajl_lex_error) -1;
732 return lexer->error;
733 }
734
/* Return the lexer's line offset, or 0 for a NULL lexer (mirroring the
 * NULL tolerance of yajl_lex_get_error).
 * NOTE(review): nothing in the visible part of this file advances lineOff
 * after it is zeroed at allocation -- confirm before relying on it. */
size_t yajl_lex_current_line(yajl_lexer lexer)
{
    if (lexer == NULL) return 0;
    return lexer->lineOff;
}
739
/* Return the lexer's character offset, or 0 for a NULL lexer (mirroring
 * the NULL tolerance of yajl_lex_get_error).
 * NOTE(review): nothing in the visible part of this file advances charOff
 * after it is zeroed at allocation -- confirm before relying on it. */
size_t yajl_lex_current_char(yajl_lexer lexer)
{
    if (lexer == NULL) return 0;
    return lexer->charOff;
}
744
yajl_lex_peek(yajl_lexer lexer,const unsigned char * jsonText,size_t jsonTextLen,size_t offset)745 yajl_tok yajl_lex_peek(yajl_lexer lexer, const unsigned char * jsonText,
746 size_t jsonTextLen, size_t offset)
747 {
748 const unsigned char * outBuf;
749 size_t outLen;
750 size_t bufLen = yajl_buf_len(lexer->buf);
751 size_t bufOff = lexer->bufOff;
752 unsigned int bufInUse = lexer->bufInUse;
753 yajl_tok tok;
754
755 tok = yajl_lex_lex(lexer, jsonText, jsonTextLen, &offset,
756 &outBuf, &outLen);
757
758 lexer->bufOff = bufOff;
759 lexer->bufInUse = bufInUse;
760 yajl_buf_truncate(lexer->buf, bufLen);
761
762 return tok;
763 }
764