1 /*-------------------------------------------------------------------------
2 *
3 * jsonapi.c
4 * JSON parser and lexer interfaces
5 *
6 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 * IDENTIFICATION
10 * src/common/jsonapi.c
11 *
12 *-------------------------------------------------------------------------
13 */
14 #ifndef FRONTEND
15 #include "postgres.h"
16 #else
17 #include "postgres_fe.h"
18 #endif
19
20 #include "common/jsonapi.h"
21 #include "mb/pg_wchar.h"
22
23 #ifdef FRONTEND
24 #include "common/logging.h"
25 #else
26 #include "miscadmin.h"
27 #endif
28
29 #ifdef FRONTEND
30 #define check_stack_depth()
31 #define json_log_and_abort(...) \
32 do { pg_log_fatal(__VA_ARGS__); exit(1); } while(0)
33 #else
34 #define json_log_and_abort(...) elog(ERROR, __VA_ARGS__)
35 #endif
36
37 /*
38 * The context of the parser is maintained by the recursive descent
39 * mechanism, but is passed explicitly to the error reporting routine
40 * for better diagnostics.
41 */
typedef enum					/* contexts of JSON parser */
{
	JSON_PARSE_VALUE,			/* expecting a value */
	JSON_PARSE_STRING,			/* expecting a string (for a field name) */
	JSON_PARSE_ARRAY_START,		/* saw '[', expecting value or ']' */
	JSON_PARSE_ARRAY_NEXT,		/* saw array element, expecting ',' or ']' */
	JSON_PARSE_OBJECT_START,	/* saw '{', expecting label or '}' */
	JSON_PARSE_OBJECT_LABEL,	/* saw object label, expecting ':' */
	JSON_PARSE_OBJECT_NEXT,		/* saw object value, expecting ',' or '}' */
	JSON_PARSE_OBJECT_COMMA,	/* saw object ',', expecting next label */
	JSON_PARSE_END				/* saw the end of a document, expect nothing */
	/* report_parse_error() maps each of these to a JsonParseErrorType */
} JsonParseContext;
54
55 static inline JsonParseErrorType json_lex_string(JsonLexContext *lex);
56 static inline JsonParseErrorType json_lex_number(JsonLexContext *lex, char *s,
57 bool *num_err, int *total_len);
58 static inline JsonParseErrorType parse_scalar(JsonLexContext *lex, JsonSemAction *sem);
59 static JsonParseErrorType parse_object_field(JsonLexContext *lex, JsonSemAction *sem);
60 static JsonParseErrorType parse_object(JsonLexContext *lex, JsonSemAction *sem);
61 static JsonParseErrorType parse_array_element(JsonLexContext *lex, JsonSemAction *sem);
62 static JsonParseErrorType parse_array(JsonLexContext *lex, JsonSemAction *sem);
63 static JsonParseErrorType report_parse_error(JsonParseContext ctx, JsonLexContext *lex);
64 static char *extract_token(JsonLexContext *lex);
65
/*
 * The null action object, used for pure validation: all ten semantic
 * callbacks (and the semstate pointer) are NULL, so the parser simply
 * walks the input and reports syntax errors without building anything.
 */
JsonSemAction nullSemAction =
{
	NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL
};
72
73 /* Recursive Descent parser support routines */
74
75 /*
76 * lex_peek
77 *
78 * what is the current look_ahead token?
79 */
static inline JsonTokenType
lex_peek(JsonLexContext *lex)
{
	/* Return the current look-ahead token without consuming it. */
	return lex->token_type;
}
85
86 /*
87 * lex_expect
88 *
89 * move the lexer to the next token if the current look_ahead token matches
90 * the parameter token. Otherwise, report an error.
91 */
92 static inline JsonParseErrorType
lex_expect(JsonParseContext ctx,JsonLexContext * lex,JsonTokenType token)93 lex_expect(JsonParseContext ctx, JsonLexContext *lex, JsonTokenType token)
94 {
95 if (lex_peek(lex) == token)
96 return json_lex(lex);
97 else
98 return report_parse_error(ctx, lex);
99 }
100
101 /* chars to consider as part of an alphanumeric token */
102 #define JSON_ALPHANUMERIC_CHAR(c) \
103 (((c) >= 'a' && (c) <= 'z') || \
104 ((c) >= 'A' && (c) <= 'Z') || \
105 ((c) >= '0' && (c) <= '9') || \
106 (c) == '_' || \
107 IS_HIGHBIT_SET(c))
108
109 /*
110 * Utility function to check if a string is a valid JSON number.
111 *
112 * str is of length len, and need not be null-terminated.
113 */
114 bool
IsValidJsonNumber(const char * str,int len)115 IsValidJsonNumber(const char *str, int len)
116 {
117 bool numeric_error;
118 int total_len;
119 JsonLexContext dummy_lex;
120
121 if (len <= 0)
122 return false;
123
124 /*
125 * json_lex_number expects a leading '-' to have been eaten already.
126 *
127 * having to cast away the constness of str is ugly, but there's not much
128 * easy alternative.
129 */
130 if (*str == '-')
131 {
132 dummy_lex.input = unconstify(char *, str) + 1;
133 dummy_lex.input_length = len - 1;
134 }
135 else
136 {
137 dummy_lex.input = unconstify(char *, str);
138 dummy_lex.input_length = len;
139 }
140
141 json_lex_number(&dummy_lex, dummy_lex.input, &numeric_error, &total_len);
142
143 return (!numeric_error) && (total_len == dummy_lex.input_length);
144 }
145
146 /*
147 * makeJsonLexContextCstringLen
148 *
149 * lex constructor, with or without StringInfo object for de-escaped lexemes.
150 *
151 * Without is better as it makes the processing faster, so only make one
152 * if really required.
153 */
JsonLexContext *
makeJsonLexContextCstringLen(char *json, int len, int encoding, bool need_escapes)
{
	/* palloc0 zeroes the struct, so unset fields start as 0/NULL */
	JsonLexContext *lex = palloc0(sizeof(JsonLexContext));

	/* position the lexer at the start of the input; nothing consumed yet */
	lex->input = lex->token_terminator = lex->line_start = json;
	lex->line_number = 1;
	lex->input_length = len;
	lex->input_encoding = encoding;
	/* only allocate a StringInfo if de-escaped lexemes were requested */
	if (need_escapes)
		lex->strval = makeStringInfo();
	return lex;
}
167
168 /*
169 * pg_parse_json
170 *
171 * Publicly visible entry point for the JSON parser.
172 *
173 * lex is a lexing context, set up for the json to be processed by calling
174 * makeJsonLexContext(). sem is a structure of function pointers to semantic
175 * action routines to be called at appropriate spots during parsing, and a
176 * pointer to a state object to be passed to those routines.
177 */
178 JsonParseErrorType
pg_parse_json(JsonLexContext * lex,JsonSemAction * sem)179 pg_parse_json(JsonLexContext *lex, JsonSemAction *sem)
180 {
181 JsonTokenType tok;
182 JsonParseErrorType result;
183
184 /* get the initial token */
185 result = json_lex(lex);
186 if (result != JSON_SUCCESS)
187 return result;
188
189 tok = lex_peek(lex);
190
191 /* parse by recursive descent */
192 switch (tok)
193 {
194 case JSON_TOKEN_OBJECT_START:
195 result = parse_object(lex, sem);
196 break;
197 case JSON_TOKEN_ARRAY_START:
198 result = parse_array(lex, sem);
199 break;
200 default:
201 result = parse_scalar(lex, sem); /* json can be a bare scalar */
202 }
203
204 if (result == JSON_SUCCESS)
205 result = lex_expect(JSON_PARSE_END, lex, JSON_TOKEN_END);
206
207 return result;
208 }
209
210 /*
211 * json_count_array_elements
212 *
213 * Returns number of array elements in lex context at start of array token
214 * until end of array token at same nesting level.
215 *
216 * Designed to be called from array_start routines.
217 */
JsonParseErrorType
json_count_array_elements(JsonLexContext *lex, int *elements)
{
	JsonLexContext copylex;
	int			count;
	JsonParseErrorType result;

	/*
	 * It's safe to do this with a shallow copy because the lexical routines
	 * don't scribble on the input. They do scribble on the other pointers
	 * etc, so doing this with a copy makes that safe.
	 */
	memcpy(&copylex, lex, sizeof(JsonLexContext));
	copylex.strval = NULL;		/* not interested in values here */
	copylex.lex_level++;

	count = 0;
	/* consume the '[' the caller is positioned on */
	result = lex_expect(JSON_PARSE_ARRAY_START, &copylex,
						JSON_TOKEN_ARRAY_START);
	if (result != JSON_SUCCESS)
		return result;
	if (lex_peek(&copylex) != JSON_TOKEN_ARRAY_END)
	{
		/* non-empty array: count comma-separated elements */
		while (1)
		{
			count++;
			result = parse_array_element(&copylex, &nullSemAction);
			if (result != JSON_SUCCESS)
				return result;
			if (copylex.token_type != JSON_TOKEN_COMMA)
				break;
			result = json_lex(&copylex);
			if (result != JSON_SUCCESS)
				return result;
		}
	}
	result = lex_expect(JSON_PARSE_ARRAY_NEXT, &copylex,
						JSON_TOKEN_ARRAY_END);
	if (result != JSON_SUCCESS)
		return result;

	/* note: the caller's lex context is left untouched */
	*elements = count;
	return JSON_SUCCESS;
}
262
263 /*
264 * Recursive Descent parse routines. There is one for each structural
265 * element in a json document:
266 * - scalar (string, number, true, false, null)
267 * - array ( [ ] )
268 * - array element
269 * - object ( { } )
270 * - object field
271 */
/*
 * Parse a scalar value (string, number, true, false, or null) and invoke
 * the semantic 'scalar' callback, if any, with the de-escaped or raw text.
 */
static inline JsonParseErrorType
parse_scalar(JsonLexContext *lex, JsonSemAction *sem)
{
	char	   *val = NULL;
	json_scalar_action sfunc = sem->scalar;
	JsonTokenType tok = lex_peek(lex);
	JsonParseErrorType result;

	/* a scalar must be a string, a number, true, false, or null */
	if (tok != JSON_TOKEN_STRING && tok != JSON_TOKEN_NUMBER &&
		tok != JSON_TOKEN_TRUE && tok != JSON_TOKEN_FALSE &&
		tok != JSON_TOKEN_NULL)
		return report_parse_error(JSON_PARSE_VALUE, lex);

	/* if no semantic function, just consume the token */
	if (sfunc == NULL)
		return json_lex(lex);

	/* extract the de-escaped string value, or the raw lexeme */
	if (lex_peek(lex) == JSON_TOKEN_STRING)
	{
		/* strval is only filled in when de-escaping was requested */
		if (lex->strval != NULL)
			val = pstrdup(lex->strval->data);
	}
	else
	{
		/* copy the raw token text, adding a NUL terminator */
		int			len = (lex->token_terminator - lex->token_start);

		val = palloc(len + 1);
		memcpy(val, lex->token_start, len);
		val[len] = '\0';
	}

	/* consume the token before invoking the callback */
	result = json_lex(lex);
	if (result != JSON_SUCCESS)
		return result;

	/* invoke the callback */
	(*sfunc) (sem->semstate, val, tok);

	return JSON_SUCCESS;
}
315
static JsonParseErrorType
parse_object_field(JsonLexContext *lex, JsonSemAction *sem)
{
	/*
	 * An object field is "fieldname" : value where value can be a scalar,
	 * object or array.  Note: in user-facing docs and error messages, we
	 * generally call a field name a "key".
	 */

	char	   *fname = NULL;	/* keep compiler quiet */
	json_ofield_action ostart = sem->object_field_start;
	json_ofield_action oend = sem->object_field_end;
	bool		isnull;
	JsonTokenType tok;
	JsonParseErrorType result;

	/* the field name must be a string token */
	if (lex_peek(lex) != JSON_TOKEN_STRING)
		return report_parse_error(JSON_PARSE_STRING, lex);
	/* only copy the name if some callback will actually see it */
	if ((ostart != NULL || oend != NULL) && lex->strval != NULL)
		fname = pstrdup(lex->strval->data);
	result = json_lex(lex);
	if (result != JSON_SUCCESS)
		return result;

	/* the name must be followed by a colon */
	result = lex_expect(JSON_PARSE_OBJECT_LABEL, lex, JSON_TOKEN_COLON);
	if (result != JSON_SUCCESS)
		return result;

	tok = lex_peek(lex);
	isnull = tok == JSON_TOKEN_NULL;

	if (ostart != NULL)
		(*ostart) (sem->semstate, fname, isnull);

	/* the value can be any JSON value */
	switch (tok)
	{
		case JSON_TOKEN_OBJECT_START:
			result = parse_object(lex, sem);
			break;
		case JSON_TOKEN_ARRAY_START:
			result = parse_array(lex, sem);
			break;
		default:
			result = parse_scalar(lex, sem);
	}
	if (result != JSON_SUCCESS)
		return result;

	if (oend != NULL)
		(*oend) (sem->semstate, fname, isnull);
	return JSON_SUCCESS;
}
368
static JsonParseErrorType
parse_object(JsonLexContext *lex, JsonSemAction *sem)
{
	/*
	 * an object is a possibly empty sequence of object fields, separated by
	 * commas and surrounded by curly braces.
	 */
	json_struct_action ostart = sem->object_start;
	json_struct_action oend = sem->object_end;
	JsonTokenType tok;
	JsonParseErrorType result;

	/* recursion guard; a no-op in frontend builds (see macro above) */
	check_stack_depth();

	if (ostart != NULL)
		(*ostart) (sem->semstate);

	/*
	 * Data inside an object is at a higher nesting level than the object
	 * itself. Note that we increment this after we call the semantic routine
	 * for the object start and restore it before we call the routine for the
	 * object end.
	 */
	lex->lex_level++;

	Assert(lex_peek(lex) == JSON_TOKEN_OBJECT_START);
	result = json_lex(lex);
	if (result != JSON_SUCCESS)
		return result;

	tok = lex_peek(lex);
	switch (tok)
	{
		case JSON_TOKEN_STRING:
			/* at least one field: parse fields separated by commas */
			result = parse_object_field(lex, sem);
			while (result == JSON_SUCCESS && lex_peek(lex) == JSON_TOKEN_COMMA)
			{
				result = json_lex(lex);
				if (result != JSON_SUCCESS)
					break;
				result = parse_object_field(lex, sem);
			}
			break;
		case JSON_TOKEN_OBJECT_END:
			/* empty object; '}' is consumed by lex_expect below */
			break;
		default:
			/* case of an invalid initial token inside the object */
			result = report_parse_error(JSON_PARSE_OBJECT_START, lex);
	}
	if (result != JSON_SUCCESS)
		return result;

	result = lex_expect(JSON_PARSE_OBJECT_NEXT, lex, JSON_TOKEN_OBJECT_END);
	if (result != JSON_SUCCESS)
		return result;

	lex->lex_level--;

	if (oend != NULL)
		(*oend) (sem->semstate);

	return JSON_SUCCESS;
}
432
433 static JsonParseErrorType
parse_array_element(JsonLexContext * lex,JsonSemAction * sem)434 parse_array_element(JsonLexContext *lex, JsonSemAction *sem)
435 {
436 json_aelem_action astart = sem->array_element_start;
437 json_aelem_action aend = sem->array_element_end;
438 JsonTokenType tok = lex_peek(lex);
439 JsonParseErrorType result;
440
441 bool isnull;
442
443 isnull = tok == JSON_TOKEN_NULL;
444
445 if (astart != NULL)
446 (*astart) (sem->semstate, isnull);
447
448 /* an array element is any object, array or scalar */
449 switch (tok)
450 {
451 case JSON_TOKEN_OBJECT_START:
452 result = parse_object(lex, sem);
453 break;
454 case JSON_TOKEN_ARRAY_START:
455 result = parse_array(lex, sem);
456 break;
457 default:
458 result = parse_scalar(lex, sem);
459 }
460
461 if (result != JSON_SUCCESS)
462 return result;
463
464 if (aend != NULL)
465 (*aend) (sem->semstate, isnull);
466
467 return JSON_SUCCESS;
468 }
469
static JsonParseErrorType
parse_array(JsonLexContext *lex, JsonSemAction *sem)
{
	/*
	 * an array is a possibly empty sequence of array elements, separated by
	 * commas and surrounded by square brackets.
	 */
	json_struct_action astart = sem->array_start;
	json_struct_action aend = sem->array_end;
	JsonParseErrorType result;

	/* recursion guard; a no-op in frontend builds (see macro above) */
	check_stack_depth();

	if (astart != NULL)
		(*astart) (sem->semstate);

	/*
	 * Data inside an array is at a higher nesting level than the array
	 * itself. Note that we increment this after we call the semantic routine
	 * for the array start and restore it before we call the routine for the
	 * array end.
	 */
	lex->lex_level++;

	result = lex_expect(JSON_PARSE_ARRAY_START, lex, JSON_TOKEN_ARRAY_START);
	if (result == JSON_SUCCESS && lex_peek(lex) != JSON_TOKEN_ARRAY_END)
	{
		/* non-empty array: parse comma-separated elements */
		result = parse_array_element(lex, sem);

		while (result == JSON_SUCCESS && lex_peek(lex) == JSON_TOKEN_COMMA)
		{
			result = json_lex(lex);
			if (result != JSON_SUCCESS)
				break;
			result = parse_array_element(lex, sem);
		}
	}
	if (result != JSON_SUCCESS)
		return result;

	result = lex_expect(JSON_PARSE_ARRAY_NEXT, lex, JSON_TOKEN_ARRAY_END);
	if (result != JSON_SUCCESS)
		return result;

	lex->lex_level--;

	if (aend != NULL)
		(*aend) (sem->semstate);

	return JSON_SUCCESS;
}
521
522 /*
523 * Lex one token from the input stream.
524 */
JsonParseErrorType
json_lex(JsonLexContext *lex)
{
	char	   *s;
	int			len;
	JsonParseErrorType result;

	/* Skip leading whitespace. */
	s = lex->token_terminator;
	len = s - lex->input;
	while (len < lex->input_length &&
		   (*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r'))
	{
		if (*s++ == '\n')
		{
			/* track line numbers for error reporting */
			++lex->line_number;
			lex->line_start = s;
		}
		len++;
	}
	lex->token_start = s;

	/* Determine token type. */
	if (len >= lex->input_length)
	{
		/* end of input: token_start == NULL flags "ended prematurely" */
		lex->token_start = NULL;
		lex->prev_token_terminator = lex->token_terminator;
		lex->token_terminator = s;
		lex->token_type = JSON_TOKEN_END;
	}
	else
	{
		switch (*s)
		{
				/* Single-character token, some kind of punctuation mark. */
			case '{':
				lex->prev_token_terminator = lex->token_terminator;
				lex->token_terminator = s + 1;
				lex->token_type = JSON_TOKEN_OBJECT_START;
				break;
			case '}':
				lex->prev_token_terminator = lex->token_terminator;
				lex->token_terminator = s + 1;
				lex->token_type = JSON_TOKEN_OBJECT_END;
				break;
			case '[':
				lex->prev_token_terminator = lex->token_terminator;
				lex->token_terminator = s + 1;
				lex->token_type = JSON_TOKEN_ARRAY_START;
				break;
			case ']':
				lex->prev_token_terminator = lex->token_terminator;
				lex->token_terminator = s + 1;
				lex->token_type = JSON_TOKEN_ARRAY_END;
				break;
			case ',':
				lex->prev_token_terminator = lex->token_terminator;
				lex->token_terminator = s + 1;
				lex->token_type = JSON_TOKEN_COMMA;
				break;
			case ':':
				lex->prev_token_terminator = lex->token_terminator;
				lex->token_terminator = s + 1;
				lex->token_type = JSON_TOKEN_COLON;
				break;
			case '"':
				/* string */
				result = json_lex_string(lex);
				if (result != JSON_SUCCESS)
					return result;
				lex->token_type = JSON_TOKEN_STRING;
				break;
			case '-':
				/* Negative number. */
				result = json_lex_number(lex, s + 1, NULL, NULL);
				if (result != JSON_SUCCESS)
					return result;
				lex->token_type = JSON_TOKEN_NUMBER;
				break;
			case '0':
			case '1':
			case '2':
			case '3':
			case '4':
			case '5':
			case '6':
			case '7':
			case '8':
			case '9':
				/* Positive number. */
				result = json_lex_number(lex, s, NULL, NULL);
				if (result != JSON_SUCCESS)
					return result;
				lex->token_type = JSON_TOKEN_NUMBER;
				break;
			default:
				{
					char	   *p;

					/*
					 * We're not dealing with a string, number, legal
					 * punctuation mark, or end of string.  The only legal
					 * tokens we might find here are true, false, and null,
					 * but for error reporting purposes we scan until we see a
					 * non-alphanumeric character.  That way, we can report
					 * the whole word as an unexpected token, rather than just
					 * some unintuitive prefix thereof.
					 */
					for (p = s; p - s < lex->input_length - len && JSON_ALPHANUMERIC_CHAR(*p); p++)
						 /* skip */ ;

					/*
					 * We got some sort of unexpected punctuation or an
					 * otherwise unexpected character, so just complain about
					 * that one character.
					 */
					if (p == s)
					{
						lex->prev_token_terminator = lex->token_terminator;
						lex->token_terminator = s + 1;
						return JSON_INVALID_TOKEN;
					}

					/*
					 * We've got a real alphanumeric token here.  If it
					 * happens to be true, false, or null, all is well.  If
					 * not, error out.
					 */
					lex->prev_token_terminator = lex->token_terminator;
					lex->token_terminator = p;
					if (p - s == 4)
					{
						if (memcmp(s, "true", 4) == 0)
							lex->token_type = JSON_TOKEN_TRUE;
						else if (memcmp(s, "null", 4) == 0)
							lex->token_type = JSON_TOKEN_NULL;
						else
							return JSON_INVALID_TOKEN;
					}
					else if (p - s == 5 && memcmp(s, "false", 5) == 0)
						lex->token_type = JSON_TOKEN_FALSE;
					else
						return JSON_INVALID_TOKEN;

				}
		}						/* end of switch */
	}

	return JSON_SUCCESS;
}
675
676 /*
677 * The next token in the input stream is known to be a string; lex it.
678 */
static inline JsonParseErrorType
json_lex_string(JsonLexContext *lex)
{
	char	   *s;
	int			len;
	int			hi_surrogate = -1;	/* pending first half of a UTF-16 pair */

	/* de-escaped output accumulates in strval, when the caller wants it */
	if (lex->strval != NULL)
		resetStringInfo(lex->strval);

	Assert(lex->input_length > 0);
	s = lex->token_start;
	len = lex->token_start - lex->input;
	for (;;)
	{
		s++;
		len++;
		/* Premature end of the string. */
		if (len >= lex->input_length)
		{
			lex->token_terminator = s;
			return JSON_INVALID_TOKEN;
		}
		else if (*s == '"')
			break;
		else if ((unsigned char) *s < 32)
		{
			/* Per RFC4627, these characters MUST be escaped. */
			/* Since *s isn't printable, exclude it from the context string */
			lex->token_terminator = s;
			return JSON_ESCAPING_REQUIRED;
		}
		else if (*s == '\\')
		{
			/* OK, we have an escape character. */
			s++;
			len++;
			if (len >= lex->input_length)
			{
				/* input ends right after the backslash */
				lex->token_terminator = s;
				return JSON_INVALID_TOKEN;
			}
			else if (*s == 'u')
			{
				/* \uXXXX escape: collect exactly four hex digits */
				int			i;
				int			ch = 0;

				for (i = 1; i <= 4; i++)
				{
					s++;
					len++;
					if (len >= lex->input_length)
					{
						lex->token_terminator = s;
						return JSON_INVALID_TOKEN;
					}
					else if (*s >= '0' && *s <= '9')
						ch = (ch * 16) + (*s - '0');
					else if (*s >= 'a' && *s <= 'f')
						ch = (ch * 16) + (*s - 'a') + 10;
					else if (*s >= 'A' && *s <= 'F')
						ch = (ch * 16) + (*s - 'A') + 10;
					else
					{
						/* non-hex digit where one was required */
						lex->token_terminator = s + pg_encoding_mblen_bounded(lex->input_encoding, s);
						return JSON_UNICODE_ESCAPE_FORMAT;
					}
				}
				if (lex->strval != NULL)
				{
					/*
					 * Combine surrogate pairs.
					 */
					if (is_utf16_surrogate_first(ch))
					{
						/* two high surrogates in a row is an error */
						if (hi_surrogate != -1)
							return JSON_UNICODE_HIGH_SURROGATE;
						hi_surrogate = ch;
						continue;
					}
					else if (is_utf16_surrogate_second(ch))
					{
						/* a low surrogate must have a pending high one */
						if (hi_surrogate == -1)
							return JSON_UNICODE_LOW_SURROGATE;
						ch = surrogate_pair_to_codepoint(hi_surrogate, ch);
						hi_surrogate = -1;
					}

					/* a pending high surrogate must be followed by a low one */
					if (hi_surrogate != -1)
						return JSON_UNICODE_LOW_SURROGATE;

					/*
					 * Reject invalid cases.  We can't have a value above
					 * 0xFFFF here (since we only accepted 4 hex digits
					 * above), so no need to test for out-of-range chars.
					 */
					if (ch == 0)
					{
						/* We can't allow this, since our TEXT type doesn't */
						return JSON_UNICODE_CODE_POINT_ZERO;
					}

					/*
					 * Add the represented character to lex->strval.  In the
					 * backend, we can let pg_unicode_to_server() handle any
					 * required character set conversion; in frontend, we can
					 * only deal with trivial conversions.
					 *
					 * Note: pg_unicode_to_server() will throw an error for a
					 * conversion failure, rather than returning a failure
					 * indication.  That seems OK.
					 */
#ifndef FRONTEND
					{
						char		cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];

						pg_unicode_to_server(ch, (unsigned char *) cbuf);
						appendStringInfoString(lex->strval, cbuf);
					}
#else
					if (lex->input_encoding == PG_UTF8)
					{
						/* OK, we can map the code point to UTF8 easily */
						char		utf8str[5];
						int			utf8len;

						unicode_to_utf8(ch, (unsigned char *) utf8str);
						utf8len = pg_utf_mblen((unsigned char *) utf8str);
						appendBinaryStringInfo(lex->strval, utf8str, utf8len);
					}
					else if (ch <= 0x007f)
					{
						/* The ASCII range is the same in all encodings */
						appendStringInfoChar(lex->strval, (char) ch);
					}
					else
						return JSON_UNICODE_HIGH_ESCAPE;
#endif							/* FRONTEND */
				}
			}
			else if (lex->strval != NULL)
			{
				/* simple one-character escape, with de-escaping requested */
				if (hi_surrogate != -1)
					return JSON_UNICODE_LOW_SURROGATE;

				switch (*s)
				{
					case '"':
					case '\\':
					case '/':
						appendStringInfoChar(lex->strval, *s);
						break;
					case 'b':
						appendStringInfoChar(lex->strval, '\b');
						break;
					case 'f':
						appendStringInfoChar(lex->strval, '\f');
						break;
					case 'n':
						appendStringInfoChar(lex->strval, '\n');
						break;
					case 'r':
						appendStringInfoChar(lex->strval, '\r');
						break;
					case 't':
						appendStringInfoChar(lex->strval, '\t');
						break;
					default:
						/* Not a valid string escape, so signal error. */
						lex->token_start = s;
						lex->token_terminator = s + pg_encoding_mblen_bounded(lex->input_encoding, s);
						return JSON_ESCAPING_INVALID;
				}
			}
			else if (strchr("\"\\/bfnrt", *s) == NULL)
			{
				/*
				 * Simpler processing if we're not bothered about de-escaping
				 *
				 * It's very tempting to remove the strchr() call here and
				 * replace it with a switch statement, but testing so far has
				 * shown it's not a performance win.
				 */
				lex->token_start = s;
				lex->token_terminator = s + pg_encoding_mblen_bounded(lex->input_encoding, s);
				return JSON_ESCAPING_INVALID;
			}

		}
		else if (lex->strval != NULL)
		{
			/* ordinary character, with de-escaping requested */
			if (hi_surrogate != -1)
				return JSON_UNICODE_LOW_SURROGATE;

			appendStringInfoChar(lex->strval, *s);
		}

	}

	/* a high surrogate must not be left dangling at end of string */
	if (hi_surrogate != -1)
		return JSON_UNICODE_LOW_SURROGATE;

	/* Hooray, we found the end of the string! */
	lex->prev_token_terminator = lex->token_terminator;
	lex->token_terminator = s + 1;
	return JSON_SUCCESS;
}
886
887 /*
888 * The next token in the input stream is known to be a number; lex it.
889 *
890 * In JSON, a number consists of four parts:
891 *
892 * (1) An optional minus sign ('-').
893 *
894 * (2) Either a single '0', or a string of one or more digits that does not
895 * begin with a '0'.
896 *
897 * (3) An optional decimal part, consisting of a period ('.') followed by
898 * one or more digits. (Note: While this part can be omitted
899 * completely, it's not OK to have only the decimal point without
900 * any digits afterwards.)
901 *
902 * (4) An optional exponent part, consisting of 'e' or 'E', optionally
903 * followed by '+' or '-', followed by one or more digits. (Note:
904 * As with the decimal part, if 'e' or 'E' is present, it must be
905 * followed by at least one digit.)
906 *
907 * The 's' argument to this function points to the ostensible beginning
908 * of part 2 - i.e. the character after any optional minus sign, or the
909 * first character of the string if there is none.
910 *
911 * If num_err is not NULL, we return an error flag to *num_err rather than
912 * raising an error for a badly-formed number. Also, if total_len is not NULL
913 * the distance from lex->input to the token end+1 is returned to *total_len.
914 */
static inline JsonParseErrorType
json_lex_number(JsonLexContext *lex, char *s,
				bool *num_err, int *total_len)
{
	bool		error = false;
	int			len = s - lex->input;	/* offset of s within the input */

	/* Part (1): leading sign indicator. */
	/* Caller already did this for us; so do nothing. */

	/* Part (2): parse main digit string. */
	if (len < lex->input_length && *s == '0')
	{
		/* a single '0'; no further leading digits allowed */
		s++;
		len++;
	}
	else if (len < lex->input_length && *s >= '1' && *s <= '9')
	{
		do
		{
			s++;
			len++;
		} while (len < lex->input_length && *s >= '0' && *s <= '9');
	}
	else
		error = true;			/* no digits at all */

	/* Part (3): parse optional decimal portion. */
	if (len < lex->input_length && *s == '.')
	{
		s++;
		len++;
		/* at least one digit must follow the decimal point */
		if (len == lex->input_length || *s < '0' || *s > '9')
			error = true;
		else
		{
			do
			{
				s++;
				len++;
			} while (len < lex->input_length && *s >= '0' && *s <= '9');
		}
	}

	/* Part (4): parse optional exponent. */
	if (len < lex->input_length && (*s == 'e' || *s == 'E'))
	{
		s++;
		len++;
		if (len < lex->input_length && (*s == '+' || *s == '-'))
		{
			s++;
			len++;
		}
		/* at least one digit must follow the exponent marker/sign */
		if (len == lex->input_length || *s < '0' || *s > '9')
			error = true;
		else
		{
			do
			{
				s++;
				len++;
			} while (len < lex->input_length && *s >= '0' && *s <= '9');
		}
	}

	/*
	 * Check for trailing garbage.  As in json_lex(), any alphanumeric stuff
	 * here should be considered part of the token for error-reporting
	 * purposes.
	 */
	for (; len < lex->input_length && JSON_ALPHANUMERIC_CHAR(*s); s++, len++)
		error = true;

	if (total_len != NULL)
		*total_len = len;

	if (num_err != NULL)
	{
		/* let the caller handle any error */
		*num_err = error;
	}
	else
	{
		/* return token endpoint */
		lex->prev_token_terminator = lex->token_terminator;
		lex->token_terminator = s;
		/* handle error if any */
		if (error)
			return JSON_INVALID_TOKEN;
	}

	return JSON_SUCCESS;
}
1009
1010 /*
1011 * Report a parse error.
1012 *
1013 * lex->token_start and lex->token_terminator must identify the current token.
1014 */
static JsonParseErrorType
report_parse_error(JsonParseContext ctx, JsonLexContext *lex)
{
	/* Handle case where the input ended prematurely. */
	if (lex->token_start == NULL || lex->token_type == JSON_TOKEN_END)
		return JSON_EXPECTED_MORE;

	/* Otherwise choose the error type based on the parsing context. */
	switch (ctx)
	{
		case JSON_PARSE_END:
			return JSON_EXPECTED_END;
		case JSON_PARSE_VALUE:
			return JSON_EXPECTED_JSON;
		case JSON_PARSE_STRING:
			return JSON_EXPECTED_STRING;
		case JSON_PARSE_ARRAY_START:
			return JSON_EXPECTED_ARRAY_FIRST;
		case JSON_PARSE_ARRAY_NEXT:
			return JSON_EXPECTED_ARRAY_NEXT;
		case JSON_PARSE_OBJECT_START:
			return JSON_EXPECTED_OBJECT_FIRST;
		case JSON_PARSE_OBJECT_LABEL:
			return JSON_EXPECTED_COLON;
		case JSON_PARSE_OBJECT_NEXT:
			return JSON_EXPECTED_OBJECT_NEXT;
		case JSON_PARSE_OBJECT_COMMA:
			return JSON_EXPECTED_STRING;
	}

	/*
	 * We don't use a default: case, so that the compiler will warn about
	 * unhandled enum values.  But this needs to be here anyway to cover the
	 * possibility of an incorrect input.
	 */
	json_log_and_abort("unexpected json parse state: %d", (int) ctx);
	return JSON_SUCCESS;		/* silence stupider compilers */
}
1053
1054 /*
1055 * Construct a detail message for a JSON error.
1056 */
char *
json_errdetail(JsonParseErrorType error, JsonLexContext *lex)
{
	switch (error)
	{
		case JSON_SUCCESS:
			/* fall through to the error code after switch */
			break;
		case JSON_ESCAPING_INVALID:
			return psprintf(_("Escape sequence \"\\%s\" is invalid."),
							extract_token(lex));
		case JSON_ESCAPING_REQUIRED:
			return psprintf(_("Character with value 0x%02x must be escaped."),
							(unsigned char) *(lex->token_terminator));
		case JSON_EXPECTED_END:
			return psprintf(_("Expected end of input, but found \"%s\"."),
							extract_token(lex));
		case JSON_EXPECTED_ARRAY_FIRST:
			return psprintf(_("Expected array element or \"]\", but found \"%s\"."),
							extract_token(lex));
		case JSON_EXPECTED_ARRAY_NEXT:
			return psprintf(_("Expected \",\" or \"]\", but found \"%s\"."),
							extract_token(lex));
		case JSON_EXPECTED_COLON:
			return psprintf(_("Expected \":\", but found \"%s\"."),
							extract_token(lex));
		case JSON_EXPECTED_JSON:
			return psprintf(_("Expected JSON value, but found \"%s\"."),
							extract_token(lex));
		case JSON_EXPECTED_MORE:
			return _("The input string ended unexpectedly.");
		case JSON_EXPECTED_OBJECT_FIRST:
			return psprintf(_("Expected string or \"}\", but found \"%s\"."),
							extract_token(lex));
		case JSON_EXPECTED_OBJECT_NEXT:
			return psprintf(_("Expected \",\" or \"}\", but found \"%s\"."),
							extract_token(lex));
		case JSON_EXPECTED_STRING:
			return psprintf(_("Expected string, but found \"%s\"."),
							extract_token(lex));
		case JSON_INVALID_TOKEN:
			return psprintf(_("Token \"%s\" is invalid."),
							extract_token(lex));
		case JSON_UNICODE_CODE_POINT_ZERO:
			return _("\\u0000 cannot be converted to text.");
		case JSON_UNICODE_ESCAPE_FORMAT:
			return _("\"\\u\" must be followed by four hexadecimal digits.");
		case JSON_UNICODE_HIGH_ESCAPE:
			/* note: this case is only reachable in frontend not backend */
			return _("Unicode escape values cannot be used for code point values above 007F when the encoding is not UTF8.");
		case JSON_UNICODE_HIGH_SURROGATE:
			return _("Unicode high surrogate must not follow a high surrogate.");
		case JSON_UNICODE_LOW_SURROGATE:
			return _("Unicode low surrogate must follow a high surrogate.");
	}

	/*
	 * We don't use a default: case, so that the compiler will warn about
	 * unhandled enum values.  But this needs to be here anyway to cover the
	 * possibility of an incorrect input.
	 */
	json_log_and_abort("unexpected json parse error type: %d", (int) error);
	return NULL;				/* silence stupider compilers */
}
1121
1122 /*
1123 * Extract the current token from a lexing context, for error reporting.
1124 */
1125 static char *
extract_token(JsonLexContext * lex)1126 extract_token(JsonLexContext *lex)
1127 {
1128 int toklen = lex->token_terminator - lex->token_start;
1129 char *token = palloc(toklen + 1);
1130
1131 memcpy(token, lex->token_start, toklen);
1132 token[toklen] = '\0';
1133 return token;
1134 }
1135