/*-------------------------------------------------------------------------
 *
 * jsonapi.c
 *		JSON parser and lexer interfaces
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/common/jsonapi.c
 *
 *-------------------------------------------------------------------------
 */
#ifndef FRONTEND
#include "postgres.h"
#else
#include "postgres_fe.h"
#endif

#include "common/jsonapi.h"
#include "mb/pg_wchar.h"

#ifdef FRONTEND
#include "common/logging.h"
#else
#include "miscadmin.h"
#endif

#ifdef FRONTEND
#define check_stack_depth()
#define json_log_and_abort(...) \
	do { pg_log_fatal(__VA_ARGS__); exit(1); } while(0)
#else
#define json_log_and_abort(...) elog(ERROR, __VA_ARGS__)
#endif

/*
 * The context of the parser is maintained by the recursive descent
 * mechanism, but is passed explicitly to the error reporting routine
 * for better diagnostics.
 */
typedef enum					/* contexts of JSON parser */
{
	JSON_PARSE_VALUE,			/* expecting a value */
	JSON_PARSE_STRING,			/* expecting a string (for a field name) */
	JSON_PARSE_ARRAY_START,		/* saw '[', expecting value or ']' */
	JSON_PARSE_ARRAY_NEXT,		/* saw array element, expecting ',' or ']' */
	JSON_PARSE_OBJECT_START,	/* saw '{', expecting label or '}' */
	JSON_PARSE_OBJECT_LABEL,	/* saw object label, expecting ':' */
	JSON_PARSE_OBJECT_NEXT,		/* saw object value, expecting ',' or '}' */
	JSON_PARSE_OBJECT_COMMA,	/* saw object ',', expecting next label */
	JSON_PARSE_END				/* saw the end of a document, expect nothing */
} JsonParseContext;
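
/*
 * Illustrative example (derived from report_parse_error() and
 * json_errdetail() below): parsing the malformed input {"a" 1} fails while
 * the parser is in JSON_PARSE_OBJECT_LABEL, since it has just seen the
 * label "a" and expects a ':'.  That context is mapped to
 * JSON_EXPECTED_COLON and reported as
 *		Expected ":", but found "1".
 */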

static inline JsonParseErrorType json_lex_string(JsonLexContext *lex);
static inline JsonParseErrorType json_lex_number(JsonLexContext *lex, char *s,
												 bool *num_err, int *total_len);
static inline JsonParseErrorType parse_scalar(JsonLexContext *lex, JsonSemAction *sem);
static JsonParseErrorType parse_object_field(JsonLexContext *lex, JsonSemAction *sem);
static JsonParseErrorType parse_object(JsonLexContext *lex, JsonSemAction *sem);
static JsonParseErrorType parse_array_element(JsonLexContext *lex, JsonSemAction *sem);
static JsonParseErrorType parse_array(JsonLexContext *lex, JsonSemAction *sem);
static JsonParseErrorType report_parse_error(JsonParseContext ctx, JsonLexContext *lex);
static char *extract_token(JsonLexContext *lex);

/* the null action object used for pure validation */
JsonSemAction nullSemAction =
{
	NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL
};

/* Recursive Descent parser support routines */

/*
 * lex_peek
 *
 * what is the current look_ahead token?
 */
static inline JsonTokenType
lex_peek(JsonLexContext *lex)
{
	return lex->token_type;
}

/*
 * lex_expect
 *
 * move the lexer to the next token if the current look_ahead token matches
 * the parameter token. Otherwise, report an error.
 */
static inline JsonParseErrorType
lex_expect(JsonParseContext ctx, JsonLexContext *lex, JsonTokenType token)
{
	if (lex_peek(lex) == token)
		return json_lex(lex);
	else
		return report_parse_error(ctx, lex);
}

/* chars to consider as part of an alphanumeric token */
#define JSON_ALPHANUMERIC_CHAR(c)  \
	(((c) >= 'a' && (c) <= 'z') || \
	 ((c) >= 'A' && (c) <= 'Z') || \
	 ((c) >= '0' && (c) <= '9') || \
	 (c) == '_' || \
	 IS_HIGHBIT_SET(c))

/*
 * Utility function to check if a string is a valid JSON number.
 *
 * str is of length len, and need not be null-terminated.
 */
bool
IsValidJsonNumber(const char *str, int len)
{
	bool		numeric_error;
	int			total_len;
	JsonLexContext dummy_lex;

	if (len <= 0)
		return false;

	/*
	 * json_lex_number expects a leading '-' to have been eaten already.
	 *
	 * having to cast away the constness of str is ugly, but there's not much
	 * easy alternative.
	 */
	if (*str == '-')
	{
		dummy_lex.input = unconstify(char *, str) + 1;
		dummy_lex.input_length = len - 1;
	}
	else
	{
		dummy_lex.input = unconstify(char *, str);
		dummy_lex.input_length = len;
	}

	json_lex_number(&dummy_lex, dummy_lex.input, &numeric_error, &total_len);

	return (!numeric_error) && (total_len == dummy_lex.input_length);
}
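
/*
 * A few illustrative results of IsValidJsonNumber (derived from the rules
 * json_lex_number enforces below; not an exhaustive list):
 *
 *		IsValidJsonNumber("0", 1)			true
 *		IsValidJsonNumber("-12.5e+3", 8)	true
 *		IsValidJsonNumber("01", 2)			false (leading zero)
 *		IsValidJsonNumber("1.", 2)			false (no digits after '.')
 *		IsValidJsonNumber(".5", 2)			false (no leading digit)
 *		IsValidJsonNumber("+1", 2)			false (only '-' may be used)
 */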

/*
 * makeJsonLexContextCstringLen
 *
 * lex constructor, with or without StringInfo object for de-escaped lexemes.
 *
 * Going without the StringInfo makes the processing faster, so only request
 * one (need_escapes = true) if the de-escaped lexemes are really required.
 */
JsonLexContext *
makeJsonLexContextCstringLen(char *json, int len, int encoding, bool need_escapes)
{
	JsonLexContext *lex = palloc0(sizeof(JsonLexContext));

	lex->input = lex->token_terminator = lex->line_start = json;
	lex->line_number = 1;
	lex->input_length = len;
	lex->input_encoding = encoding;
	if (need_escapes)
		lex->strval = makeStringInfo();
	return lex;
}
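
/*
 * Typical pure-validation usage (a sketch only; "json" and "len" stand for
 * caller-supplied input, and PG_UTF8 stands for whatever encoding that
 * input is actually in):
 *
 *		JsonLexContext *lex;
 *		JsonParseErrorType err;
 *
 *		lex = makeJsonLexContextCstringLen(json, len, PG_UTF8, false);
 *		err = pg_parse_json(lex, &nullSemAction);
 *		if (err != JSON_SUCCESS)
 *			... report json_errdetail(err, lex) ...
 *
 * need_escapes is false here because nullSemAction never looks at the
 * de-escaped lexemes.
 */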

/*
 * pg_parse_json
 *
 * Publicly visible entry point for the JSON parser.
 *
 * lex is a lexing context, set up for the json to be processed by calling
 * makeJsonLexContext(). sem is a structure of function pointers to semantic
 * action routines to be called at appropriate spots during parsing, and a
 * pointer to a state object to be passed to those routines.
 */
JsonParseErrorType
pg_parse_json(JsonLexContext *lex, JsonSemAction *sem)
{
	JsonTokenType tok;
	JsonParseErrorType result;

	/* get the initial token */
	result = json_lex(lex);
	if (result != JSON_SUCCESS)
		return result;

	tok = lex_peek(lex);

	/* parse by recursive descent */
	switch (tok)
	{
		case JSON_TOKEN_OBJECT_START:
			result = parse_object(lex, sem);
			break;
		case JSON_TOKEN_ARRAY_START:
			result = parse_array(lex, sem);
			break;
		default:
			result = parse_scalar(lex, sem);	/* json can be a bare scalar */
	}

	if (result == JSON_SUCCESS)
		result = lex_expect(JSON_PARSE_END, lex, JSON_TOKEN_END);

	return result;
}
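
/*
 * A minimal sketch of a caller that supplies semantic actions (the names
 * count_state and count_scalar are illustrative, not part of this API; the
 * JsonSemAction fields are as declared in common/jsonapi.h):
 *
 *		struct count_state
 *		{
 *			int			nscalars;
 *		};
 *
 *		static void
 *		count_scalar(void *state, char *token, JsonTokenType tokentype)
 *		{
 *			((struct count_state *) state)->nscalars++;
 *		}
 *
 *		...
 *		struct count_state cstate = {0};
 *		JsonSemAction sem = nullSemAction;
 *
 *		sem.semstate = &cstate;
 *		sem.scalar = count_scalar;
 *		(void) pg_parse_json(lex, &sem);
 *
 * Pass need_escapes = true to makeJsonLexContextCstringLen() if a callback
 * wants the de-escaped text of string tokens; with need_escapes = false,
 * string values are passed to the callbacks as NULL.
 */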

/*
 * json_count_array_elements
 *
 * Returns the number of elements in the array whose start token the lex
 * context is currently positioned at, counting up to the matching
 * end-of-array token at the same nesting level.
 *
 * Designed to be called from array_start routines.
 */
JsonParseErrorType
json_count_array_elements(JsonLexContext *lex, int *elements)
{
	JsonLexContext copylex;
	int			count;
	JsonParseErrorType result;

	/*
	 * It's safe to do this with a shallow copy because the lexical routines
	 * don't scribble on the input. They do scribble on the other pointers
	 * etc, so doing this with a copy makes that safe.
	 */
	memcpy(&copylex, lex, sizeof(JsonLexContext));
	copylex.strval = NULL;		/* not interested in values here */
	copylex.lex_level++;

	count = 0;
	result = lex_expect(JSON_PARSE_ARRAY_START, &copylex,
						JSON_TOKEN_ARRAY_START);
	if (result != JSON_SUCCESS)
		return result;
	if (lex_peek(&copylex) != JSON_TOKEN_ARRAY_END)
	{
		while (1)
		{
			count++;
			result = parse_array_element(&copylex, &nullSemAction);
			if (result != JSON_SUCCESS)
				return result;
			if (copylex.token_type != JSON_TOKEN_COMMA)
				break;
			result = json_lex(&copylex);
			if (result != JSON_SUCCESS)
				return result;
		}
	}
	result = lex_expect(JSON_PARSE_ARRAY_NEXT, &copylex,
						JSON_TOKEN_ARRAY_END);
	if (result != JSON_SUCCESS)
		return result;

	*elements = count;
	return JSON_SUCCESS;
}
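
/*
 * Sketch of use from an array_start callback (the MyState layout shown is
 * hypothetical; the point is that the callback needs access to the same
 * JsonLexContext the parser is using):
 *
 *		static void
 *		my_array_start(void *state)
 *		{
 *			MyState    *s = (MyState *) state;
 *			int			nelements;
 *
 *			if (json_count_array_elements(s->lex, &nelements) == JSON_SUCCESS)
 *				s->expected_elements = nelements;
 *		}
 */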

/*
 *	Recursive Descent parse routines. There is one for each structural
 *	element in a json document:
 *	  - scalar (string, number, true, false, null)
 *	  - array  ( [ ] )
 *	  - array element
 *	  - object ( { } )
 *	  - object field
 */
static inline JsonParseErrorType
parse_scalar(JsonLexContext *lex, JsonSemAction *sem)
{
	char	   *val = NULL;
	json_scalar_action sfunc = sem->scalar;
	JsonTokenType tok = lex_peek(lex);
	JsonParseErrorType result;

	/* a scalar must be a string, a number, true, false, or null */
	if (tok != JSON_TOKEN_STRING && tok != JSON_TOKEN_NUMBER &&
		tok != JSON_TOKEN_TRUE && tok != JSON_TOKEN_FALSE &&
		tok != JSON_TOKEN_NULL)
		return report_parse_error(JSON_PARSE_VALUE, lex);

	/* if no semantic function, just consume the token */
	if (sfunc == NULL)
		return json_lex(lex);

	/* extract the de-escaped string value, or the raw lexeme */
	if (lex_peek(lex) == JSON_TOKEN_STRING)
	{
		if (lex->strval != NULL)
			val = pstrdup(lex->strval->data);
	}
	else
	{
		int			len = (lex->token_terminator - lex->token_start);

		val = palloc(len + 1);
		memcpy(val, lex->token_start, len);
		val[len] = '\0';
	}

	/* consume the token */
	result = json_lex(lex);
	if (result != JSON_SUCCESS)
		return result;

	/* invoke the callback */
	(*sfunc) (sem->semstate, val, tok);

	return JSON_SUCCESS;
}

static JsonParseErrorType
parse_object_field(JsonLexContext *lex, JsonSemAction *sem)
{
	/*
	 * An object field is "fieldname" : value where value can be a scalar,
	 * object or array.  Note: in user-facing docs and error messages, we
	 * generally call a field name a "key".
	 */

	char	   *fname = NULL;	/* keep compiler quiet */
	json_ofield_action ostart = sem->object_field_start;
	json_ofield_action oend = sem->object_field_end;
	bool		isnull;
	JsonTokenType tok;
	JsonParseErrorType result;

	if (lex_peek(lex) != JSON_TOKEN_STRING)
		return report_parse_error(JSON_PARSE_STRING, lex);
	if ((ostart != NULL || oend != NULL) && lex->strval != NULL)
		fname = pstrdup(lex->strval->data);
	result = json_lex(lex);
	if (result != JSON_SUCCESS)
		return result;

	result = lex_expect(JSON_PARSE_OBJECT_LABEL, lex, JSON_TOKEN_COLON);
	if (result != JSON_SUCCESS)
		return result;

	tok = lex_peek(lex);
	isnull = tok == JSON_TOKEN_NULL;

	if (ostart != NULL)
		(*ostart) (sem->semstate, fname, isnull);

	switch (tok)
	{
		case JSON_TOKEN_OBJECT_START:
			result = parse_object(lex, sem);
			break;
		case JSON_TOKEN_ARRAY_START:
			result = parse_array(lex, sem);
			break;
		default:
			result = parse_scalar(lex, sem);
	}
	if (result != JSON_SUCCESS)
		return result;

	if (oend != NULL)
		(*oend) (sem->semstate, fname, isnull);
	return JSON_SUCCESS;
}

static JsonParseErrorType
parse_object(JsonLexContext *lex, JsonSemAction *sem)
{
	/*
	 * an object is a possibly empty sequence of object fields, separated by
	 * commas and surrounded by curly braces.
	 */
	json_struct_action ostart = sem->object_start;
	json_struct_action oend = sem->object_end;
	JsonTokenType tok;
	JsonParseErrorType result;

	check_stack_depth();

	if (ostart != NULL)
		(*ostart) (sem->semstate);

	/*
	 * Data inside an object is at a higher nesting level than the object
	 * itself. Note that we increment this after we call the semantic routine
	 * for the object start and restore it before we call the routine for the
	 * object end.
	 */
	lex->lex_level++;

	Assert(lex_peek(lex) == JSON_TOKEN_OBJECT_START);
	result = json_lex(lex);
	if (result != JSON_SUCCESS)
		return result;

	tok = lex_peek(lex);
	switch (tok)
	{
		case JSON_TOKEN_STRING:
			result = parse_object_field(lex, sem);
			while (result == JSON_SUCCESS && lex_peek(lex) == JSON_TOKEN_COMMA)
			{
				result = json_lex(lex);
				if (result != JSON_SUCCESS)
					break;
				result = parse_object_field(lex, sem);
			}
			break;
		case JSON_TOKEN_OBJECT_END:
			break;
		default:
			/* case of an invalid initial token inside the object */
			result = report_parse_error(JSON_PARSE_OBJECT_START, lex);
	}
	if (result != JSON_SUCCESS)
		return result;

	result = lex_expect(JSON_PARSE_OBJECT_NEXT, lex, JSON_TOKEN_OBJECT_END);
	if (result != JSON_SUCCESS)
		return result;

	lex->lex_level--;

	if (oend != NULL)
		(*oend) (sem->semstate);

	return JSON_SUCCESS;
}

static JsonParseErrorType
parse_array_element(JsonLexContext *lex, JsonSemAction *sem)
{
	json_aelem_action astart = sem->array_element_start;
	json_aelem_action aend = sem->array_element_end;
	JsonTokenType tok = lex_peek(lex);
	JsonParseErrorType result;

	bool		isnull;

	isnull = tok == JSON_TOKEN_NULL;

	if (astart != NULL)
		(*astart) (sem->semstate, isnull);

	/* an array element is any object, array or scalar */
	switch (tok)
	{
		case JSON_TOKEN_OBJECT_START:
			result = parse_object(lex, sem);
			break;
		case JSON_TOKEN_ARRAY_START:
			result = parse_array(lex, sem);
			break;
		default:
			result = parse_scalar(lex, sem);
	}

	if (result != JSON_SUCCESS)
		return result;

	if (aend != NULL)
		(*aend) (sem->semstate, isnull);

	return JSON_SUCCESS;
}

static JsonParseErrorType
parse_array(JsonLexContext *lex, JsonSemAction *sem)
{
	/*
	 * an array is a possibly empty sequence of array elements, separated by
	 * commas and surrounded by square brackets.
	 */
	json_struct_action astart = sem->array_start;
	json_struct_action aend = sem->array_end;
	JsonParseErrorType result;

	check_stack_depth();

	if (astart != NULL)
		(*astart) (sem->semstate);

	/*
	 * Data inside an array is at a higher nesting level than the array
	 * itself. Note that we increment this after we call the semantic routine
	 * for the array start and restore it before we call the routine for the
	 * array end.
	 */
	lex->lex_level++;

	result = lex_expect(JSON_PARSE_ARRAY_START, lex, JSON_TOKEN_ARRAY_START);
	if (result == JSON_SUCCESS && lex_peek(lex) != JSON_TOKEN_ARRAY_END)
	{
		result = parse_array_element(lex, sem);

		while (result == JSON_SUCCESS && lex_peek(lex) == JSON_TOKEN_COMMA)
		{
			result = json_lex(lex);
			if (result != JSON_SUCCESS)
				break;
			result = parse_array_element(lex, sem);
		}
	}
	if (result != JSON_SUCCESS)
		return result;

	result = lex_expect(JSON_PARSE_ARRAY_NEXT, lex, JSON_TOKEN_ARRAY_END);
	if (result != JSON_SUCCESS)
		return result;

	lex->lex_level--;

	if (aend != NULL)
		(*aend) (sem->semstate);

	return JSON_SUCCESS;
}

/*
 * Lex one token from the input stream.
 */
JsonParseErrorType
json_lex(JsonLexContext *lex)
{
	char	   *s;
	int			len;
	JsonParseErrorType result;

	/* Skip leading whitespace. */
	s = lex->token_terminator;
	len = s - lex->input;
	while (len < lex->input_length &&
		   (*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r'))
	{
		if (*s++ == '\n')
		{
			++lex->line_number;
			lex->line_start = s;
		}
		len++;
	}
	lex->token_start = s;

	/* Determine token type. */
	if (len >= lex->input_length)
	{
		lex->token_start = NULL;
		lex->prev_token_terminator = lex->token_terminator;
		lex->token_terminator = s;
		lex->token_type = JSON_TOKEN_END;
	}
	else
	{
		switch (*s)
		{
				/* Single-character token, some kind of punctuation mark. */
			case '{':
				lex->prev_token_terminator = lex->token_terminator;
				lex->token_terminator = s + 1;
				lex->token_type = JSON_TOKEN_OBJECT_START;
				break;
			case '}':
				lex->prev_token_terminator = lex->token_terminator;
				lex->token_terminator = s + 1;
				lex->token_type = JSON_TOKEN_OBJECT_END;
				break;
			case '[':
				lex->prev_token_terminator = lex->token_terminator;
				lex->token_terminator = s + 1;
				lex->token_type = JSON_TOKEN_ARRAY_START;
				break;
			case ']':
				lex->prev_token_terminator = lex->token_terminator;
				lex->token_terminator = s + 1;
				lex->token_type = JSON_TOKEN_ARRAY_END;
				break;
			case ',':
				lex->prev_token_terminator = lex->token_terminator;
				lex->token_terminator = s + 1;
				lex->token_type = JSON_TOKEN_COMMA;
				break;
			case ':':
				lex->prev_token_terminator = lex->token_terminator;
				lex->token_terminator = s + 1;
				lex->token_type = JSON_TOKEN_COLON;
				break;
			case '"':
				/* string */
				result = json_lex_string(lex);
				if (result != JSON_SUCCESS)
					return result;
				lex->token_type = JSON_TOKEN_STRING;
				break;
			case '-':
				/* Negative number. */
				result = json_lex_number(lex, s + 1, NULL, NULL);
				if (result != JSON_SUCCESS)
					return result;
				lex->token_type = JSON_TOKEN_NUMBER;
				break;
			case '0':
			case '1':
			case '2':
			case '3':
			case '4':
			case '5':
			case '6':
			case '7':
			case '8':
			case '9':
				/* Positive number. */
				result = json_lex_number(lex, s, NULL, NULL);
				if (result != JSON_SUCCESS)
					return result;
				lex->token_type = JSON_TOKEN_NUMBER;
				break;
			default:
				{
					char	   *p;

					/*
					 * We're not dealing with a string, number, legal
					 * punctuation mark, or end of string.  The only legal
					 * tokens we might find here are true, false, and null,
					 * but for error reporting purposes we scan until we see a
					 * non-alphanumeric character.  That way, we can report
					 * the whole word as an unexpected token, rather than just
					 * some unintuitive prefix thereof.
					 */
					for (p = s; p - s < lex->input_length - len && JSON_ALPHANUMERIC_CHAR(*p); p++)
						 /* skip */ ;

					/*
					 * We got some sort of unexpected punctuation or an
					 * otherwise unexpected character, so just complain about
					 * that one character.
					 */
					if (p == s)
					{
						lex->prev_token_terminator = lex->token_terminator;
						lex->token_terminator = s + 1;
						return JSON_INVALID_TOKEN;
					}

					/*
					 * We've got a real alphanumeric token here.  If it
					 * happens to be true, false, or null, all is well.  If
					 * not, error out.
					 */
					lex->prev_token_terminator = lex->token_terminator;
					lex->token_terminator = p;
					if (p - s == 4)
					{
						if (memcmp(s, "true", 4) == 0)
							lex->token_type = JSON_TOKEN_TRUE;
						else if (memcmp(s, "null", 4) == 0)
							lex->token_type = JSON_TOKEN_NULL;
						else
							return JSON_INVALID_TOKEN;
					}
					else if (p - s == 5 && memcmp(s, "false", 5) == 0)
						lex->token_type = JSON_TOKEN_FALSE;
					else
						return JSON_INVALID_TOKEN;

				}
		}						/* end of switch */
	}

	return JSON_SUCCESS;
}
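
/*
 * For example (illustrative), lexing the input
 *
 *		[10, "a\tb", null]
 *
 * yields the token sequence JSON_TOKEN_ARRAY_START, JSON_TOKEN_NUMBER,
 * JSON_TOKEN_COMMA, JSON_TOKEN_STRING, JSON_TOKEN_COMMA, JSON_TOKEN_NULL,
 * JSON_TOKEN_ARRAY_END and finally JSON_TOKEN_END, with lex->strval (when
 * escapes were requested) holding the de-escaped text of the string token.
 */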

/*
 * The next token in the input stream is known to be a string; lex it.
 */
static inline JsonParseErrorType
json_lex_string(JsonLexContext *lex)
{
	char	   *s;
	int			len;
	int			hi_surrogate = -1;

	if (lex->strval != NULL)
		resetStringInfo(lex->strval);

	Assert(lex->input_length > 0);
	s = lex->token_start;
	len = lex->token_start - lex->input;
	for (;;)
	{
		s++;
		len++;
		/* Premature end of the string. */
		if (len >= lex->input_length)
		{
			lex->token_terminator = s;
			return JSON_INVALID_TOKEN;
		}
		else if (*s == '"')
			break;
		else if ((unsigned char) *s < 32)
		{
			/* Per RFC4627, these characters MUST be escaped. */
			/* Since *s isn't printable, exclude it from the context string */
			lex->token_terminator = s;
			return JSON_ESCAPING_REQUIRED;
		}
		else if (*s == '\\')
		{
			/* OK, we have an escape character. */
			s++;
			len++;
			if (len >= lex->input_length)
			{
				lex->token_terminator = s;
				return JSON_INVALID_TOKEN;
			}
			else if (*s == 'u')
			{
				int			i;
				int			ch = 0;

				for (i = 1; i <= 4; i++)
				{
					s++;
					len++;
					if (len >= lex->input_length)
					{
						lex->token_terminator = s;
						return JSON_INVALID_TOKEN;
					}
					else if (*s >= '0' && *s <= '9')
						ch = (ch * 16) + (*s - '0');
					else if (*s >= 'a' && *s <= 'f')
						ch = (ch * 16) + (*s - 'a') + 10;
					else if (*s >= 'A' && *s <= 'F')
						ch = (ch * 16) + (*s - 'A') + 10;
					else
					{
						lex->token_terminator = s + pg_encoding_mblen_bounded(lex->input_encoding, s);
						return JSON_UNICODE_ESCAPE_FORMAT;
					}
				}
				if (lex->strval != NULL)
				{
					/*
					 * Combine surrogate pairs.
					 */
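					/*
					 * Worked example (illustrative): U+1F600 arrives as the
					 * escape sequence \uD83D\uDE00.  The first \u escape
					 * parks 0xD83D in hi_surrogate; the second one combines
					 * it with 0xDE00 via surrogate_pair_to_codepoint() to
					 * recover the code point 0x1F600.
					 */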
					if (is_utf16_surrogate_first(ch))
					{
						if (hi_surrogate != -1)
							return JSON_UNICODE_HIGH_SURROGATE;
						hi_surrogate = ch;
						continue;
					}
					else if (is_utf16_surrogate_second(ch))
					{
						if (hi_surrogate == -1)
							return JSON_UNICODE_LOW_SURROGATE;
						ch = surrogate_pair_to_codepoint(hi_surrogate, ch);
						hi_surrogate = -1;
					}

					if (hi_surrogate != -1)
						return JSON_UNICODE_LOW_SURROGATE;

					/*
					 * Reject invalid cases.  We can't have a value above
					 * 0xFFFF here (since we only accepted 4 hex digits
					 * above), so no need to test for out-of-range chars.
					 */
					if (ch == 0)
					{
						/* We can't allow this, since our TEXT type doesn't */
						return JSON_UNICODE_CODE_POINT_ZERO;
					}

					/*
					 * Add the represented character to lex->strval.  In the
					 * backend, we can let pg_unicode_to_server() handle any
					 * required character set conversion; in frontend, we can
					 * only deal with trivial conversions.
					 *
					 * Note: pg_unicode_to_server() will throw an error for a
					 * conversion failure, rather than returning a failure
					 * indication.  That seems OK.
					 */
#ifndef FRONTEND
					{
						char		cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];

						pg_unicode_to_server(ch, (unsigned char *) cbuf);
						appendStringInfoString(lex->strval, cbuf);
					}
#else
					if (lex->input_encoding == PG_UTF8)
					{
						/* OK, we can map the code point to UTF8 easily */
						char		utf8str[5];
						int			utf8len;

						unicode_to_utf8(ch, (unsigned char *) utf8str);
						utf8len = pg_utf_mblen((unsigned char *) utf8str);
						appendBinaryStringInfo(lex->strval, utf8str, utf8len);
					}
					else if (ch <= 0x007f)
					{
						/* The ASCII range is the same in all encodings */
						appendStringInfoChar(lex->strval, (char) ch);
					}
					else
						return JSON_UNICODE_HIGH_ESCAPE;
#endif							/* FRONTEND */
				}
			}
			else if (lex->strval != NULL)
			{
				if (hi_surrogate != -1)
					return JSON_UNICODE_LOW_SURROGATE;

				switch (*s)
				{
					case '"':
					case '\\':
					case '/':
						appendStringInfoChar(lex->strval, *s);
						break;
					case 'b':
						appendStringInfoChar(lex->strval, '\b');
						break;
					case 'f':
						appendStringInfoChar(lex->strval, '\f');
						break;
					case 'n':
						appendStringInfoChar(lex->strval, '\n');
						break;
					case 'r':
						appendStringInfoChar(lex->strval, '\r');
						break;
					case 't':
						appendStringInfoChar(lex->strval, '\t');
						break;
					default:
						/* Not a valid string escape, so signal error. */
						lex->token_start = s;
						lex->token_terminator = s + pg_encoding_mblen_bounded(lex->input_encoding, s);
						return JSON_ESCAPING_INVALID;
				}
			}
			else if (strchr("\"\\/bfnrt", *s) == NULL)
			{
				/*
				 * Simpler processing if we're not bothered about de-escaping
				 *
				 * It's very tempting to remove the strchr() call here and
				 * replace it with a switch statement, but testing so far has
				 * shown it's not a performance win.
				 */
				lex->token_start = s;
				lex->token_terminator = s + pg_encoding_mblen_bounded(lex->input_encoding, s);
				return JSON_ESCAPING_INVALID;
			}

		}
		else if (lex->strval != NULL)
		{
			if (hi_surrogate != -1)
				return JSON_UNICODE_LOW_SURROGATE;

			appendStringInfoChar(lex->strval, *s);
		}

	}

	if (hi_surrogate != -1)
		return JSON_UNICODE_LOW_SURROGATE;

	/* Hooray, we found the end of the string! */
	lex->prev_token_terminator = lex->token_terminator;
	lex->token_terminator = s + 1;
	return JSON_SUCCESS;
}

/*
 * The next token in the input stream is known to be a number; lex it.
 *
 * In JSON, a number consists of four parts:
 *
 * (1) An optional minus sign ('-').
 *
 * (2) Either a single '0', or a string of one or more digits that does not
 *	   begin with a '0'.
 *
 * (3) An optional decimal part, consisting of a period ('.') followed by
 *	   one or more digits.  (Note: While this part can be omitted
 *	   completely, it's not OK to have only the decimal point without
 *	   any digits afterwards.)
 *
 * (4) An optional exponent part, consisting of 'e' or 'E', optionally
 *	   followed by '+' or '-', followed by one or more digits.  (Note:
 *	   As with the decimal part, if 'e' or 'E' is present, it must be
 *	   followed by at least one digit.)
 *
 * The 's' argument to this function points to the ostensible beginning
 * of part 2 - i.e. the character after any optional minus sign, or the
 * first character of the string if there is none.
 *
 * If num_err is not NULL, we return an error flag to *num_err rather than
 * raising an error for a badly-formed number.  Also, if total_len is not NULL
 * the distance from lex->input to the token end+1 is returned to *total_len.
 */
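/*
 * For example (illustrative), the number -12.75e+2 breaks down as
 * part (1) "-", part (2) "12", part (3) ".75", and part (4) "e+2";
 * json_lex() has already consumed the "-" before calling here, so 's'
 * points at the "1".
 */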
static inline JsonParseErrorType
json_lex_number(JsonLexContext *lex, char *s,
				bool *num_err, int *total_len)
{
	bool		error = false;
	int			len = s - lex->input;

	/* Part (1): leading sign indicator. */
	/* Caller already did this for us; so do nothing. */

	/* Part (2): parse main digit string. */
	if (len < lex->input_length && *s == '0')
	{
		s++;
		len++;
	}
	else if (len < lex->input_length && *s >= '1' && *s <= '9')
	{
		do
		{
			s++;
			len++;
		} while (len < lex->input_length && *s >= '0' && *s <= '9');
	}
	else
		error = true;

	/* Part (3): parse optional decimal portion. */
	if (len < lex->input_length && *s == '.')
	{
		s++;
		len++;
		if (len == lex->input_length || *s < '0' || *s > '9')
			error = true;
		else
		{
			do
			{
				s++;
				len++;
			} while (len < lex->input_length && *s >= '0' && *s <= '9');
		}
	}

	/* Part (4): parse optional exponent. */
	if (len < lex->input_length && (*s == 'e' || *s == 'E'))
	{
		s++;
		len++;
		if (len < lex->input_length && (*s == '+' || *s == '-'))
		{
			s++;
			len++;
		}
		if (len == lex->input_length || *s < '0' || *s > '9')
			error = true;
		else
		{
			do
			{
				s++;
				len++;
			} while (len < lex->input_length && *s >= '0' && *s <= '9');
		}
	}

	/*
	 * Check for trailing garbage.  As in json_lex(), any alphanumeric stuff
	 * here should be considered part of the token for error-reporting
	 * purposes.
	 */
	for (; len < lex->input_length && JSON_ALPHANUMERIC_CHAR(*s); s++, len++)
		error = true;

	if (total_len != NULL)
		*total_len = len;

	if (num_err != NULL)
	{
		/* let the caller handle any error */
		*num_err = error;
	}
	else
	{
		/* return token endpoint */
		lex->prev_token_terminator = lex->token_terminator;
		lex->token_terminator = s;
		/* handle error if any */
		if (error)
			return JSON_INVALID_TOKEN;
	}

	return JSON_SUCCESS;
}

/*
 * Report a parse error.
 *
 * lex->token_start and lex->token_terminator must identify the current token.
 */
static JsonParseErrorType
report_parse_error(JsonParseContext ctx, JsonLexContext *lex)
{
	/* Handle case where the input ended prematurely. */
	if (lex->token_start == NULL || lex->token_type == JSON_TOKEN_END)
		return JSON_EXPECTED_MORE;

	/* Otherwise choose the error type based on the parsing context. */
	switch (ctx)
	{
		case JSON_PARSE_END:
			return JSON_EXPECTED_END;
		case JSON_PARSE_VALUE:
			return JSON_EXPECTED_JSON;
		case JSON_PARSE_STRING:
			return JSON_EXPECTED_STRING;
		case JSON_PARSE_ARRAY_START:
			return JSON_EXPECTED_ARRAY_FIRST;
		case JSON_PARSE_ARRAY_NEXT:
			return JSON_EXPECTED_ARRAY_NEXT;
		case JSON_PARSE_OBJECT_START:
			return JSON_EXPECTED_OBJECT_FIRST;
		case JSON_PARSE_OBJECT_LABEL:
			return JSON_EXPECTED_COLON;
		case JSON_PARSE_OBJECT_NEXT:
			return JSON_EXPECTED_OBJECT_NEXT;
		case JSON_PARSE_OBJECT_COMMA:
			return JSON_EXPECTED_STRING;
	}

	/*
	 * We don't use a default: case, so that the compiler will warn about
	 * unhandled enum values.  But this needs to be here anyway to cover the
	 * possibility of an incorrect input.
	 */
	json_log_and_abort("unexpected json parse state: %d", (int) ctx);
	return JSON_SUCCESS;		/* silence stupider compilers */
}

/*
 * Construct a detail message for a JSON error.
 */
char *
json_errdetail(JsonParseErrorType error, JsonLexContext *lex)
{
	switch (error)
	{
		case JSON_SUCCESS:
			/* fall through to the error code after switch */
			break;
		case JSON_ESCAPING_INVALID:
			return psprintf(_("Escape sequence \"\\%s\" is invalid."),
							extract_token(lex));
		case JSON_ESCAPING_REQUIRED:
			return psprintf(_("Character with value 0x%02x must be escaped."),
							(unsigned char) *(lex->token_terminator));
		case JSON_EXPECTED_END:
			return psprintf(_("Expected end of input, but found \"%s\"."),
							extract_token(lex));
		case JSON_EXPECTED_ARRAY_FIRST:
			return psprintf(_("Expected array element or \"]\", but found \"%s\"."),
							extract_token(lex));
		case JSON_EXPECTED_ARRAY_NEXT:
			return psprintf(_("Expected \",\" or \"]\", but found \"%s\"."),
							extract_token(lex));
		case JSON_EXPECTED_COLON:
			return psprintf(_("Expected \":\", but found \"%s\"."),
							extract_token(lex));
		case JSON_EXPECTED_JSON:
			return psprintf(_("Expected JSON value, but found \"%s\"."),
							extract_token(lex));
		case JSON_EXPECTED_MORE:
			return _("The input string ended unexpectedly.");
		case JSON_EXPECTED_OBJECT_FIRST:
			return psprintf(_("Expected string or \"}\", but found \"%s\"."),
							extract_token(lex));
		case JSON_EXPECTED_OBJECT_NEXT:
			return psprintf(_("Expected \",\" or \"}\", but found \"%s\"."),
							extract_token(lex));
		case JSON_EXPECTED_STRING:
			return psprintf(_("Expected string, but found \"%s\"."),
							extract_token(lex));
		case JSON_INVALID_TOKEN:
			return psprintf(_("Token \"%s\" is invalid."),
							extract_token(lex));
		case JSON_UNICODE_CODE_POINT_ZERO:
			return _("\\u0000 cannot be converted to text.");
		case JSON_UNICODE_ESCAPE_FORMAT:
			return _("\"\\u\" must be followed by four hexadecimal digits.");
		case JSON_UNICODE_HIGH_ESCAPE:
			/* note: this case is only reachable in frontend not backend */
			return _("Unicode escape values cannot be used for code point values above 007F when the encoding is not UTF8.");
		case JSON_UNICODE_HIGH_SURROGATE:
			return _("Unicode high surrogate must not follow a high surrogate.");
		case JSON_UNICODE_LOW_SURROGATE:
			return _("Unicode low surrogate must follow a high surrogate.");
	}

	/*
	 * We don't use a default: case, so that the compiler will warn about
	 * unhandled enum values.  But this needs to be here anyway to cover the
	 * possibility of an incorrect input.
	 */
	json_log_and_abort("unexpected json parse error type: %d", (int) error);
	return NULL;				/* silence stupider compilers */
}

/*
 * Extract the current token from a lexing context, for error reporting.
 */
static char *
extract_token(JsonLexContext *lex)
{
	int			toklen = lex->token_terminator - lex->token_start;
	char	   *token = palloc(toklen + 1);

	memcpy(token, lex->token_start, toklen);
	token[toklen] = '\0';
	return token;
}