1 /*-
2  * Copyright (c) 2014-2018 MongoDB, Inc.
3  * Copyright (c) 2008-2014 WiredTiger, Inc.
4  *	All rights reserved.
5  *
6  * See the file LICENSE for redistribution information.
7  */
8 
9 #include "wt_internal.h"
10 
11 static int __json_unpack_put(
12     WT_SESSION_IMPL *, void *, u_char *, size_t, WT_CONFIG_ITEM *, size_t *);
13 static inline int __json_struct_size(WT_SESSION_IMPL *, const void *, size_t,
14     const char *, WT_CONFIG_ITEM *, bool, size_t *);
15 static inline int __json_struct_unpackv(WT_SESSION_IMPL *, const void *, size_t,
16     const char *, WT_CONFIG_ITEM *, u_char *, size_t, bool, va_list);
17 static int json_string_arg(WT_SESSION_IMPL *, const char **, WT_ITEM *);
18 static int json_int_arg(WT_SESSION_IMPL *, const char **, int64_t *);
19 static int json_uint_arg(WT_SESSION_IMPL *, const char **, uint64_t *);
20 static int __json_pack_struct(WT_SESSION_IMPL *, void *, size_t, const char *,
21     const char *);
22 static int __json_pack_size(WT_SESSION_IMPL *, const char *, WT_CONFIG_ITEM *,
23     bool, const char *, size_t *);
24 
/*
 * WT_PACK_JSON_GET --
 *	Read the next JSON value from jstr into pv, choosing the parser from
 *	pv's format type; the argument helpers advance jstr past the token they
 *	consume. String-like values are retagged ('s' -> 'j', 'S' -> 'J',
 *	'u' -> 'K') after their raw text is captured. Parse failures are
 *	propagated through WT_RET.
 */
#define	WT_PACK_JSON_GET(session, pv, jstr) do {			\
	switch ((pv).type) {						\
	case 'x':							\
		/* Nothing is read from the JSON for this type. */	\
		break;							\
	case 'b': case 'h': case 'i': case 'l': case 'q':		\
		WT_RET(json_int_arg(session, &(jstr), &(pv).u.i));	\
		break;							\
	case 'B': case 'H': case 'I': case 'L': case 'Q':		\
	case 'r': case 'R': case 't':					\
		WT_RET(json_uint_arg(session, &(jstr), &(pv).u.u));	\
		break;							\
	case 's':							\
		WT_RET(json_string_arg(session, &(jstr), &(pv).u.item));\
		(pv).type = 'j';					\
		break;							\
	case 'S':							\
		WT_RET(json_string_arg(session, &(jstr), &(pv).u.item));\
		(pv).type = 'J';					\
		break;							\
	case 'u':							\
		WT_RET(json_string_arg(session, &(jstr), &(pv).u.item));\
		(pv).type = 'K';					\
		break;							\
	/* User format strings have already been validated. */		\
	WT_ILLEGAL_VALUE(session, (pv).type);				\
	}								\
} while (0)
59 
60 /*
61  * __json_unpack_put --
62  *	Calculate the size of a packed byte string as formatted for JSON.
63  */
64 static int
__json_unpack_put(WT_SESSION_IMPL * session,void * voidpv,u_char * buf,size_t bufsz,WT_CONFIG_ITEM * name,size_t * retsizep)65 __json_unpack_put(WT_SESSION_IMPL *session, void *voidpv,
66     u_char *buf, size_t bufsz, WT_CONFIG_ITEM *name, size_t *retsizep)
67 {
68 	WT_PACK_VALUE *pv;
69 	size_t s, n;
70 	const u_char *p, *end;
71 
72 	pv = (WT_PACK_VALUE *)voidpv;
73 
74 	WT_RET(__wt_snprintf_len_set(
75 	    (char *)buf, bufsz, &s, "\"%.*s\" : ", (int)name->len, name->str));
76 	if (s <= bufsz) {
77 		bufsz -= s;
78 		buf += s;
79 	} else
80 		bufsz = 0;
81 
82 	switch (pv->type) {
83 	case 'x':
84 		return (0);
85 	case 's':
86 	case 'S':
87 		/* Account for '"' quote in front and back. */
88 		s += 2;
89 		p = (const u_char *)pv->u.s;
90 		if (bufsz > 0) {
91 			*buf++ = '"';
92 			bufsz--;
93 		}
94 		if (pv->type == 's' || pv->havesize) {
95 			end = p + pv->size;
96 			for (; p < end; p++) {
97 				n = __wt_json_unpack_char(
98 				    *p, buf, bufsz, false);
99 				if (n > bufsz)
100 					bufsz = 0;
101 				else {
102 					bufsz -= n;
103 					buf += n;
104 				}
105 				s += n;
106 			}
107 		} else
108 			for (; *p; p++) {
109 				n = __wt_json_unpack_char(
110 				    *p, buf, bufsz, false);
111 				if (n > bufsz)
112 					bufsz = 0;
113 				else {
114 					bufsz -= n;
115 					buf += n;
116 				}
117 				s += n;
118 			}
119 		if (bufsz > 0)
120 			*buf++ = '"';
121 		*retsizep += s;
122 		return (0);
123 	case 'U':
124 	case 'u':
125 		s += 2;
126 		p = (const u_char *)pv->u.item.data;
127 		end = p + pv->u.item.size;
128 		if (bufsz > 0) {
129 			*buf++ = '"';
130 			bufsz--;
131 		}
132 		for (; p < end; p++) {
133 			n = __wt_json_unpack_char(*p, buf, bufsz, true);
134 			if (n > bufsz)
135 				bufsz = 0;
136 			else {
137 				bufsz -= n;
138 				buf += n;
139 			}
140 			s += n;
141 		}
142 		if (bufsz > 0)
143 			*buf++ = '"';
144 		*retsizep += s;
145 		return (0);
146 	case 'b':
147 	case 'h':
148 	case 'i':
149 	case 'l':
150 	case 'q':
151 		WT_RET(__wt_snprintf_len_incr(
152 		    (char *)buf, bufsz, &s, "%" PRId64, pv->u.i));
153 		*retsizep += s;
154 		return (0);
155 	case 'B':
156 	case 't':
157 	case 'H':
158 	case 'I':
159 	case 'L':
160 	case 'Q':
161 	case 'r':
162 	case 'R':
163 		WT_RET(__wt_snprintf_len_incr(
164 		    (char *)buf, bufsz, &s, "%" PRId64, pv->u.u));
165 		*retsizep += s;
166 		return (0);
167 	}
168 
169 	WT_RET_MSG(session, EINVAL,
170 	    "unknown pack-value type: %c", (int)pv->type);
171 }
172 
173 /*
174  * __json_struct_size --
175  *	Calculate the size of a packed byte string as formatted for JSON.
176  */
177 static inline int
__json_struct_size(WT_SESSION_IMPL * session,const void * buffer,size_t size,const char * fmt,WT_CONFIG_ITEM * names,bool iskey,size_t * presult)178 __json_struct_size(WT_SESSION_IMPL *session, const void *buffer,
179     size_t size, const char *fmt, WT_CONFIG_ITEM *names, bool iskey,
180     size_t *presult)
181 {
182 	WT_CONFIG_ITEM name;
183 	WT_DECL_PACK_VALUE(pv);
184 	WT_DECL_RET;
185 	WT_PACK pack;
186 	WT_PACK_NAME packname;
187 	size_t result;
188 	const uint8_t *p, *end;
189 	bool needcr;
190 
191 	p = buffer;
192 	end = p + size;
193 	result = 0;
194 	needcr = false;
195 
196 	__pack_name_init(session, names, iskey, &packname);
197 	WT_RET(__pack_init(session, &pack, fmt));
198 	while ((ret = __pack_next(&pack, &pv)) == 0) {
199 		if (needcr)
200 			result += 2;
201 		needcr = true;
202 		WT_RET(__unpack_read(session, &pv, &p, (size_t)(end - p)));
203 		WT_RET(__pack_name_next(&packname, &name));
204 		WT_RET(
205 		    __json_unpack_put(session, &pv, NULL, 0, &name, &result));
206 	}
207 	WT_RET_NOTFOUND_OK(ret);
208 
209 	/* Be paranoid - __pack_write should never overflow. */
210 	WT_ASSERT(session, p <= end);
211 
212 	*presult = result;
213 	return (0);
214 }
215 
216 /*
217  * __json_struct_unpackv --
218  *	Unpack a byte string to JSON (va_list version).
219  */
220 static inline int
__json_struct_unpackv(WT_SESSION_IMPL * session,const void * buffer,size_t size,const char * fmt,WT_CONFIG_ITEM * names,u_char * jbuf,size_t jbufsize,bool iskey,va_list ap)221 __json_struct_unpackv(WT_SESSION_IMPL *session,
222     const void *buffer, size_t size, const char *fmt, WT_CONFIG_ITEM *names,
223     u_char *jbuf, size_t jbufsize, bool iskey, va_list ap)
224 {
225 	WT_CONFIG_ITEM name;
226 	WT_DECL_PACK_VALUE(pv);
227 	WT_DECL_RET;
228 	WT_PACK pack;
229 	WT_PACK_NAME packname;
230 	size_t jsize;
231 	const uint8_t *p, *end;
232 	bool needcr;
233 
234 	p = buffer;
235 	end = p + size;
236 	needcr = false;
237 
238 	/* Unpacking a cursor marked as json implies a single arg. */
239 	*va_arg(ap, const char **) = (char *)jbuf;
240 
241 	__pack_name_init(session, names, iskey, &packname);
242 	WT_RET(__pack_init(session, &pack, fmt));
243 	while ((ret = __pack_next(&pack, &pv)) == 0) {
244 		if (needcr) {
245 			WT_ASSERT(session, jbufsize >= 3);
246 			strncat((char *)jbuf, ",\n", jbufsize);
247 			jbuf += 2;
248 			jbufsize -= 2;
249 		}
250 		needcr = true;
251 		WT_RET(__unpack_read(session, &pv, &p, (size_t)(end - p)));
252 		WT_RET(__pack_name_next(&packname, &name));
253 		jsize = 0;
254 		WT_RET(__json_unpack_put(session,
255 		    (u_char *)&pv, jbuf, jbufsize, &name, &jsize));
256 		WT_ASSERT(session, jsize <= jbufsize);
257 		jbuf += jsize;
258 		jbufsize -= jsize;
259 	}
260 	WT_RET_NOTFOUND_OK(ret);
261 
262 	/* Be paranoid - __unpack_read should never overflow. */
263 	WT_ASSERT(session, p <= end);
264 
265 	WT_ASSERT(session, jbufsize == 1);
266 
267 	return (0);
268 }
269 
270 /*
271  * __wt_json_alloc_unpack --
272  *	Allocate space for, and unpack an entry into JSON format.
273  */
274 int
__wt_json_alloc_unpack(WT_SESSION_IMPL * session,const void * buffer,size_t size,const char * fmt,WT_CURSOR_JSON * json,bool iskey,va_list ap)275 __wt_json_alloc_unpack(WT_SESSION_IMPL *session, const void *buffer,
276     size_t size, const char *fmt, WT_CURSOR_JSON *json,
277     bool iskey, va_list ap)
278 {
279 	WT_CONFIG_ITEM *names;
280 	size_t needed;
281 	char **json_bufp;
282 
283 	if (iskey) {
284 		names = &json->key_names;
285 		json_bufp = &json->key_buf;
286 	} else {
287 		names = &json->value_names;
288 		json_bufp = &json->value_buf;
289 	}
290 	needed = 0;
291 	WT_RET(__json_struct_size(session, buffer, size, fmt, names,
292 	    iskey, &needed));
293 	WT_RET(__wt_realloc(session, NULL, needed + 1, json_bufp));
294 	WT_RET(__json_struct_unpackv(session, buffer, size, fmt,
295 	    names, (u_char *)*json_bufp, needed + 1, iskey, ap));
296 
297 	return (0);
298 }
299 
300 /*
301  * __wt_json_close --
302  *	Release any json related resources.
303  */
304 void
__wt_json_close(WT_SESSION_IMPL * session,WT_CURSOR * cursor)305 __wt_json_close(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
306 {
307 	WT_CURSOR_JSON *json;
308 
309 	if ((json = (WT_CURSOR_JSON *)cursor->json_private) != NULL) {
310 		__wt_free(session, json->key_buf);
311 		__wt_free(session, json->value_buf);
312 		__wt_free(session, json);
313 	}
314 }
315 
316 /*
317  * __wt_json_unpack_char --
318  *	Unpack a single character into JSON escaped format.
319  *	Can be called with null buf for sizing.
320  */
321 size_t
__wt_json_unpack_char(u_char ch,u_char * buf,size_t bufsz,bool force_unicode)322 __wt_json_unpack_char(u_char ch, u_char *buf, size_t bufsz, bool force_unicode)
323     WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
324 {
325 	u_char abbrev;
326 
327 	if (!force_unicode) {
328 		if (__wt_isprint(ch) && ch != '\\' && ch != '"') {
329 			if (bufsz >= 1)
330 				*buf = ch;
331 			return (1);
332 		}
333 		abbrev = '\0';
334 		switch (ch) {
335 		case '\\':
336 		case '"':
337 			abbrev = ch;
338 			break;
339 		case '\f':
340 			abbrev = 'f';
341 			break;
342 		case '\n':
343 			abbrev = 'n';
344 			break;
345 		case '\r':
346 			abbrev = 'r';
347 			break;
348 		case '\t':
349 			abbrev = 't';
350 			break;
351 		}
352 		if (abbrev != '\0') {
353 			if (bufsz >= 2) {
354 				*buf++ = '\\';
355 				*buf = abbrev;
356 			}
357 			return (2);
358 		}
359 	}
360 	if (bufsz >= 6) {
361 		*buf++ = '\\';
362 		*buf++ = 'u';
363 		*buf++ = '0';
364 		*buf++ = '0';
365 		*buf++ = __wt_hex((ch & 0xf0) >> 4);
366 		*buf++ = __wt_hex(ch & 0x0f);
367 	}
368 	return (6);
369 }
370 
371 /*
372  * __wt_json_column_init --
373  *	Set json_key_names, json_value_names to comma separated lists
374  *	of column names.
375  */
376 void
__wt_json_column_init(WT_CURSOR * cursor,const char * uri,const char * keyformat,const WT_CONFIG_ITEM * idxconf,const WT_CONFIG_ITEM * colconf)377 __wt_json_column_init(WT_CURSOR *cursor, const char *uri, const char *keyformat,
378     const WT_CONFIG_ITEM *idxconf, const WT_CONFIG_ITEM *colconf)
379 {
380 	WT_CURSOR_JSON *json;
381 	uint32_t keycnt, nkeys;
382 	const char *beginkey, *end, *lparen, *p;
383 
384 	json = (WT_CURSOR_JSON *)cursor->json_private;
385 	beginkey = colconf->str;
386 	end = beginkey + colconf->len;
387 
388 	if (idxconf != NULL) {
389 		json->key_names.str = idxconf->str;
390 		json->key_names.len = idxconf->len;
391 	} else if (colconf->len > 0 && *beginkey == '(') {
392 		beginkey++;
393 		if (end[-1] == ')')
394 			end--;
395 	}
396 
397 	for (nkeys = 0; *keyformat; keyformat++)
398 		if (!__wt_isdigit((u_char)*keyformat))
399 			nkeys++;
400 
401 	p = beginkey;
402 	keycnt = 0;
403 	while (p < end && keycnt < nkeys) {
404 		if (*p == ',')
405 			keycnt++;
406 		p++;
407 	}
408 	if ((lparen = strchr(uri, '(')) != NULL) {
409 		/* This cursor is a projection. */
410 		json->value_names.str = lparen;
411 		json->value_names.len = strlen(lparen) - 1;
412 		WT_ASSERT((WT_SESSION_IMPL *)cursor->session,
413 		    json->value_names.str[json->value_names.len] == ')');
414 	} else {
415 		json->value_names.str = p;
416 		json->value_names.len = WT_PTRDIFF(end, p);
417 	}
418 	if (idxconf == NULL) {
419 		if (p > beginkey)
420 			p--;
421 		json->key_names.str = beginkey;
422 		json->key_names.len = WT_PTRDIFF(p, beginkey);
423 	}
424 }
425 
/*
 * MATCH_KEYWORD --
 *	Match a bare keyword at "in". On success "in" is advanced past the
 *	keyword and "result" is set to matchval. Otherwise "in" is advanced
 *	past the run of alphanumeric characters and the enclosing function
 *	returns EINVAL with a message naming the unknown word.
 */
#define	MATCH_KEYWORD(session, in, result, keyword, matchval) 	do {	\
	const char *_kwstart = (in);					\
	size_t _kwlen = strlen(keyword);				\
	if (strncmp(_kwstart, keyword, _kwlen) != 0 ||			\
	    __wt_isalnum((u_char)_kwstart[_kwlen])) {			\
		while (__wt_isalnum((u_char)*(in)))			\
			(in)++;						\
		WT_RET_MSG(session, EINVAL,				\
		    "unknown keyword \"%.*s\" in JSON",			\
		    (int)((in) - _kwstart), _kwstart);			\
	}								\
	(in) += _kwlen;							\
	(result) = matchval;						\
} while (0)
441 
442 /*
443  * __wt_json_token --
444  *	Return the type, start position and length of the next JSON
445  *	token in the input.  String tokens include the quotes.  JSON
446  *	can be entirely parsed using calls to this tokenizer, each
447  *	call using a src pointer that is the previously returned
448  *	tokstart + toklen.
449  *
450  *	The token type returned is one of:
451  *		0	:  EOF
452  *		's'	:  string
453  *		'i'	:  intnum
454  *		'f'	:  floatnum
455  *		':'	:  colon
456  *		','	:  comma
457  *		'{'	:  lbrace
458  *		'}'	:  rbrace
459  *		'['	:  lbracket
460  *		']'	:  rbracket
461  *		'N'	:  null
462  *		'T'	:  true
463  *		'F'	:  false
464  */
465 int
__wt_json_token(WT_SESSION * wt_session,const char * src,int * toktype,const char ** tokstart,size_t * toklen)466 __wt_json_token(WT_SESSION *wt_session, const char *src, int *toktype,
467     const char **tokstart, size_t *toklen)
468     WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
469 {
470 	WT_SESSION_IMPL *session;
471 	int result;
472 	const char *bad;
473 	char ch;
474 	bool backslash, isalph, isfloat;
475 
476 	result = -1;
477 	session = (WT_SESSION_IMPL *)wt_session;
478 	while (__wt_isspace((u_char)*src))
479 		src++;
480 	*tokstart = src;
481 
482 	if (*src == '\0') {
483 		*toktype = 0;
484 		*toklen = 0;
485 		return (0);
486 	}
487 
488 	/* JSON is specified in RFC 4627. */
489 	switch (*src) {
490 	case '"':
491 		backslash = false;
492 		src++;
493 		while ((ch = *src) != '\0') {
494 			if (!backslash) {
495 				if (ch == '"') {
496 					src++;
497 					result = 's';
498 					break;
499 				}
500 				if (ch == '\\')
501 					backslash = true;
502 			} else {
503 				/* We validate Unicode on this pass. */
504 				if (ch == 'u') {
505 					u_char ignored;
506 					const u_char *uc;
507 
508 					uc = (const u_char *)src;
509 					if (__wt_hex2byte(&uc[1], &ignored) ||
510 					    __wt_hex2byte(&uc[3], &ignored))
511 						WT_RET_MSG(session, EINVAL,
512 				    "invalid Unicode within JSON string");
513 					src += 4;
514 				}
515 				backslash = false;
516 			}
517 			src++;
518 		}
519 		if (result == 's')
520 			break;
521 		WT_RET_MSG(session, EINVAL, "unterminated string in JSON");
522 	case '-':
523 	case '0':
524 	case '1':
525 	case '2':
526 	case '3':
527 	case '4':
528 	case '5':
529 	case '6':
530 	case '7':
531 	case '8':
532 	case '9':
533 		isfloat = false;
534 		if (*src == '-')
535 			src++;
536 		while ((ch = *src) != '\0' && __wt_isdigit((u_char)ch))
537 			src++;
538 		if (*src == '.') {
539 			isfloat = true;
540 			src++;
541 			while ((ch = *src) != '\0' && __wt_isdigit((u_char)ch))
542 				src++;
543 		}
544 		if (*src == 'e' || *src == 'E') {
545 			isfloat = true;
546 			src++;
547 			if (*src == '+' || *src == '-')
548 				src++;
549 			while ((ch = *src) != '\0' && __wt_isdigit((u_char)ch))
550 				src++;
551 		}
552 		result = isfloat ? 'f' : 'i';
553 		break;
554 	case ':':
555 	case ',':
556 	case '{':
557 	case '}':
558 	case '[':
559 	case ']':
560 		result = *src++;
561 		break;
562 	case 'n':
563 		MATCH_KEYWORD(session, src, result, "null", 'N');
564 		break;
565 	case 't':
566 		MATCH_KEYWORD(session, src, result, "true", 'T');
567 		break;
568 	case 'f':
569 		MATCH_KEYWORD(session, src, result, "false", 'F');
570 		break;
571 	default:
572 		/* An illegal token, move past it anyway */
573 		bad = src;
574 		isalph = __wt_isalnum((u_char)*src);
575 		src++;
576 		if (isalph)
577 			while (*src != '\0' && __wt_isalnum((u_char)*src))
578 				src++;
579 		WT_RET_MSG(session, EINVAL,
580 		    "unknown token \"%.*s\" in JSON", (int)(src - bad), bad);
581 		/* NOTREACHED */
582 	}
583 	WT_ASSERT(session, result != -1);
584 
585 	*toklen = (size_t)(src - *tokstart);
586 	*toktype = result;
587 	return (0);
588 }
589 
590 /*
591  * __wt_json_tokname --
592  *	Return a descriptive name from the token type returned by
593  *	__wt_json_token.
594  */
595 const char *
__wt_json_tokname(int toktype)596 __wt_json_tokname(int toktype)
597     WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
598 {
599 	switch (toktype) {
600 	case 0:		return ("<EOF>");
601 	case 's':	return ("<string>");
602 	case 'i':	return ("<integer>");
603 	case 'f':	return ("<float>");
604 	case ':':	return ("':'");
605 	case ',':	return ("','");
606 	case '{':	return ("'{'");
607 	case '}':	return ("'}'");
608 	case '[':	return ("'['");
609 	case ']':	return ("']'");
610 	case 'N':	return ("'null'");
611 	case 'T':	return ("'true'");
612 	case 'F':	return ("'false'");
613 	default:	return ("<UNKNOWN>");
614 	}
615 }
616 
617 /*
618  * json_string_arg --
619  *	Returns a first cut of the needed string in item.
620  *	The result has not been stripped of escapes.
621  */
622 static int
json_string_arg(WT_SESSION_IMPL * session,const char ** jstr,WT_ITEM * item)623 json_string_arg(WT_SESSION_IMPL *session, const char **jstr, WT_ITEM *item)
624 {
625 	int tok;
626 	const char *tokstart;
627 
628 	WT_RET(__wt_json_token(
629 	    (WT_SESSION *)session, *jstr, &tok, &tokstart, &item->size));
630 	if (tok == 's') {
631 		*jstr = tokstart + item->size;
632 		/* The tokenizer includes the '"' chars */
633 		item->data = tokstart + 1;
634 		item->size -= 2;
635 	} else
636 		WT_RET_MSG(session, EINVAL,
637 		    "expected JSON <string>, got %s", __wt_json_tokname(tok));
638 	return (0);
639 }
640 
641 /*
642  * json_int_arg --
643  *	Returns a signed integral value from the current position
644  *	in the JSON string.
645  */
646 static int
json_int_arg(WT_SESSION_IMPL * session,const char ** jstr,int64_t * ip)647 json_int_arg(WT_SESSION_IMPL *session, const char **jstr, int64_t *ip)
648 {
649 	size_t toksize;
650 	int tok;
651 	char *end;
652 	const char *tokstart;
653 
654 	WT_RET(__wt_json_token((WT_SESSION *)session, *jstr, &tok, &tokstart,
655 		&toksize));
656 	if (tok == 'i') {
657 		/* JSON only allows decimal */
658 		*ip = strtoll(tokstart, &end, 10);
659 		if (end != tokstart + toksize)
660 			WT_RET_MSG(session, EINVAL,
661 			    "JSON <int> extraneous input");
662 		*jstr = tokstart + toksize;
663 	} else
664 		WT_RET_MSG(session, EINVAL,
665 		    "expected JSON <int>, got %s", __wt_json_tokname(tok));
666 	return (0);
667 }
668 
669 /*
670  * json_uint_arg --
671  *	Returns an unsigned integral value from the current position
672  *	in the JSON string.
673  */
674 static int
json_uint_arg(WT_SESSION_IMPL * session,const char ** jstr,uint64_t * up)675 json_uint_arg(WT_SESSION_IMPL *session, const char **jstr, uint64_t *up)
676 {
677 	size_t toksize;
678 	int tok;
679 	char *end;
680 	const char *tokstart;
681 
682 	WT_RET(__wt_json_token((WT_SESSION *)session, *jstr, &tok, &tokstart,
683 		&toksize));
684 	if (tok == 'i' && *tokstart != '-') {
685 		/* JSON only allows decimal */
686 		*up = strtoull(tokstart, &end, 10);
687 		if (end != tokstart + toksize)
688 			WT_RET_MSG(session, EINVAL,
689 			    "JSON <int> extraneous input");
690 		*jstr = tokstart + toksize;
691 	} else
692 		WT_RET_MSG(session, EINVAL,
693 		    "expected unsigned JSON <int>, got %s",
694 		    __wt_json_tokname(tok));
695 	return (0);
696 }
697 
/*
 * JSON_EXPECT_TOKEN_GET --
 *	Read the next token from jstr, storing its start and length in start
 *	and sz. If its type is not tokval the enclosing function returns
 *	EINVAL; otherwise jstr is advanced past the token.
 */
#define	JSON_EXPECT_TOKEN_GET(session, jstr, tokval, start, sz) do {	\
    int _jtok;								\
    WT_RET(__wt_json_token(						\
	(WT_SESSION *)(session), (jstr), &_jtok, &(start), &(sz)));	\
    if (_jtok != (tokval))						\
	    WT_RET_MSG(session, EINVAL,					\
		"expected JSON %s, got %s",				\
		__wt_json_tokname(tokval), __wt_json_tokname(_jtok));	\
    (jstr) = (start) + (sz);						\
} while (0)
708 
/*
 * JSON_EXPECT_TOKEN --
 *	As JSON_EXPECT_TOKEN_GET, for callers that have no use for the token's
 *	position or length.
 */
#define	JSON_EXPECT_TOKEN(session, jstr, tokval) do {			\
    const char *_jstart;						\
    size_t _jlen;							\
    JSON_EXPECT_TOKEN_GET(session, jstr, tokval, _jstart, _jlen);	\
} while (0)
714 
715 /*
716  * __json_pack_struct --
717  *	Pack a byte string from a JSON string.
718  */
719 static int
__json_pack_struct(WT_SESSION_IMPL * session,void * buffer,size_t size,const char * fmt,const char * jstr)720 __json_pack_struct(WT_SESSION_IMPL *session, void *buffer, size_t size,
721     const char *fmt, const char *jstr)
722 {
723 	WT_DECL_PACK_VALUE(pv);
724 	WT_DECL_RET;
725 	WT_PACK pack;
726 	size_t toksize;
727 	uint8_t *p, *end;
728 	const char *tokstart;
729 	bool multi;
730 
731 	p = buffer;
732 	end = p + size;
733 	multi = false;
734 
735 	if (fmt[0] != '\0' && fmt[1] == '\0') {
736 		JSON_EXPECT_TOKEN_GET(session, jstr, 's', tokstart, toksize);
737 		/* the key name was verified in __json_pack_size */
738 		JSON_EXPECT_TOKEN(session, jstr, ':');
739 		pv.type = fmt[0];
740 		WT_PACK_JSON_GET(session, pv, jstr);
741 		return (__pack_write(session, &pv, &p, size));
742 	}
743 
744 	WT_RET(__pack_init(session, &pack, fmt));
745 	while ((ret = __pack_next(&pack, &pv)) == 0) {
746 		if (multi)
747 			JSON_EXPECT_TOKEN(session, jstr, ',');
748 		JSON_EXPECT_TOKEN_GET(session, jstr, 's', tokstart, toksize);
749 		/* the key name was verified in __json_pack_size */
750 		JSON_EXPECT_TOKEN(session, jstr, ':');
751 		WT_PACK_JSON_GET(session, pv, jstr);
752 		WT_RET(__pack_write(session, &pv, &p, (size_t)(end - p)));
753 		multi = true;
754 	}
755 	WT_RET_NOTFOUND_OK(ret);
756 
757 	/* Be paranoid - __pack_write should never overflow. */
758 	WT_ASSERT(session, p <= end);
759 
760 	return (0);
761 }
762 
763 /*
764  * __json_pack_size --
765  *	Calculate the size of a packed byte string from a JSON string.
766  *	We verify that the names and value types provided in JSON match
767  *	the column names and type from the schema format, returning error
768  *	if not.
769  */
770 static int
__json_pack_size(WT_SESSION_IMPL * session,const char * fmt,WT_CONFIG_ITEM * names,bool iskey,const char * jstr,size_t * sizep)771 __json_pack_size(
772     WT_SESSION_IMPL *session, const char *fmt, WT_CONFIG_ITEM *names,
773 	bool iskey, const char *jstr, size_t *sizep)
774 {
775 	WT_CONFIG_ITEM name;
776 	WT_DECL_PACK_VALUE(pv);
777 	WT_DECL_RET;
778 	WT_PACK pack;
779 	WT_PACK_NAME packname;
780 	size_t toksize, v;
781 	const char *tokstart;
782 	bool multi;
783 
784 	__pack_name_init(session, names, iskey, &packname);
785 	multi = false;
786 	WT_RET(__pack_init(session, &pack, fmt));
787 	for (*sizep = 0; (ret = __pack_next(&pack, &pv)) == 0;) {
788 		if (multi)
789 			JSON_EXPECT_TOKEN(session, jstr, ',');
790 		JSON_EXPECT_TOKEN_GET(session, jstr, 's', tokstart, toksize);
791 		WT_RET(__pack_name_next(&packname, &name));
792 		if (toksize - 2 != name.len ||
793 		    strncmp(tokstart + 1, name.str, toksize - 2) != 0)
794 			WT_RET_MSG(session, EINVAL,
795 			    "JSON expected %s name: \"%.*s\"",
796 			    iskey ? "key" : "value", (int)name.len, name.str);
797 		JSON_EXPECT_TOKEN(session, jstr, ':');
798 		WT_PACK_JSON_GET(session, pv, jstr);
799 		WT_RET(__pack_size(session, &pv, &v));
800 		*sizep += v;
801 		multi = true;
802 	}
803 	WT_RET_NOTFOUND_OK(ret);
804 
805 	/* check end of string */
806 	JSON_EXPECT_TOKEN(session, jstr, 0);
807 
808 	return (0);
809 }
810 
811 /*
812  * __wt_json_to_item --
813  *	Convert a JSON input string for either key/value to a raw WT_ITEM.
814  *	Checks that the input matches the expected format.
815  */
816 int
__wt_json_to_item(WT_SESSION_IMPL * session,const char * jstr,const char * format,WT_CURSOR_JSON * json,bool iskey,WT_ITEM * item)817 __wt_json_to_item(WT_SESSION_IMPL *session, const char *jstr,
818     const char *format, WT_CURSOR_JSON *json, bool iskey, WT_ITEM *item)
819 {
820 	size_t sz;
821 	sz = 0; /* Initialize because GCC 4.1 is paranoid */
822 
823 	WT_RET(__json_pack_size(session, format,
824 	    iskey ? &json->key_names : &json->value_names, iskey, jstr, &sz));
825 	WT_RET(__wt_buf_initsize(session, item, sz));
826 	WT_RET(__json_pack_struct(session, item->mem, sz, format, jstr));
827 	return (0);
828 }
829 
830 /*
831  * __wt_json_strlen --
832  *	Return the number of bytes represented by a string in JSON format,
833  *	or -1 if the format is incorrect.
834  */
835 ssize_t
__wt_json_strlen(const char * src,size_t srclen)836 __wt_json_strlen(const char *src, size_t srclen)
837     WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
838 {
839 	size_t dstlen;
840 	u_char hi, lo;
841 	const char *srcend;
842 
843 	dstlen = 0;
844 	srcend = src + srclen;
845 	while (src < srcend) {
846 		/* JSON can include any UTF-8 expressed in 4 hex chars. */
847 		if (*src == '\\') {
848 			if (*++src == 'u') {
849 				if (__wt_hex2byte((const u_char *)++src, &hi))
850 					return (-1);
851 				src += 2;
852 				if (__wt_hex2byte((const u_char *)src, &lo))
853 					return (-1);
854 				src += 2;
855 				if (hi != 0)
856 					/*
857 					 * For our dump representation,
858 					 * every Unicode character on input
859 					 * represents a single byte.
860 					 */
861 					return (-1);
862 			}
863 		} else
864 			src++;
865 		dstlen++;
866 	}
867 	if (src != srcend)
868 		return (-1);   /* invalid input, e.g. final char is '\\' */
869 	return ((ssize_t)dstlen);
870 }
871 
872 /*
873  * __wt_json_strncpy --
874  *	Copy bytes of string in JSON format to a destination, up to dstlen
875  * bytes. If dstlen is greater than the needed size, the result if zero padded.
876  */
877 int
__wt_json_strncpy(WT_SESSION * wt_session,char ** pdst,size_t dstlen,const char * src,size_t srclen)878 __wt_json_strncpy(WT_SESSION *wt_session,
879     char **pdst, size_t dstlen, const char *src, size_t srclen)
880     WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
881 {
882 	WT_SESSION_IMPL *session;
883 	u_char hi, lo;
884 	char ch, *dst;
885 	const char *dstend, *srcend;
886 
887 	session = (WT_SESSION_IMPL *)wt_session;
888 
889 	dst = *pdst;
890 	dstend = dst + dstlen;
891 	srcend = src + srclen;
892 	while (src < srcend && dst < dstend) {
893 		/* JSON can include any UTF-8 expressed in 4 hex chars. */
894 		if ((ch = *src++) == '\\')
895 			switch (ch = *src++) {
896 			case 'u':
897 				if (__wt_hex2byte((const u_char *)src, &hi) ||
898 				    __wt_hex2byte((const u_char *)src + 2, &lo))
899 					WT_RET_MSG(session, EINVAL,
900 				    "invalid Unicode within JSON string");
901 				src += 4;
902 				if (hi != 0)
903 					WT_RET_MSG(session, EINVAL,
904 					    "Unicode \"%6.6s\" byte out of "
905 					    "range in JSON",
906 					    src - 6);
907 				*dst++ = (char)lo;
908 				break;
909 			case 'f':
910 				*dst++ = '\f';
911 				break;
912 			case 'n':
913 				*dst++ = '\n';
914 				break;
915 			case 'r':
916 				*dst++ = '\r';
917 				break;
918 			case 't':
919 				*dst++ = '\t';
920 				break;
921 			case '"':
922 			case '\\':
923 				*dst++ = ch;
924 				break;
925 			WT_ILLEGAL_VALUE(session, ch);
926 			}
927 		else
928 			*dst++ = ch;
929 	}
930 	if (src != srcend)
931 		WT_RET_MSG(session,
932 		    ENOMEM, "JSON string copy destination buffer too small");
933 	*pdst = dst;
934 	while (dst < dstend)
935 		*dst++ = '\0';
936 	return (0);
937 }
938