1 /*-------------------------------------------------------------------------
2  *
3  * jsonb_gin.c
4  *	 GIN support functions for jsonb
5  *
6  * Copyright (c) 2014-2018, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  *	  src/backend/utils/adt/jsonb_gin.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 #include "postgres.h"
15 
16 #include "access/gin.h"
17 #include "access/hash.h"
18 #include "access/stratnum.h"
19 #include "catalog/pg_collation.h"
20 #include "catalog/pg_type.h"
21 #include "utils/builtins.h"
22 #include "utils/jsonb.h"
23 #include "utils/varlena.h"
24 
25 typedef struct PathHashStack
26 {
27 	uint32		hash;
28 	struct PathHashStack *parent;
29 } PathHashStack;
30 
31 static Datum make_text_key(char flag, const char *str, int len);
32 static Datum make_scalar_key(const JsonbValue *scalarVal, bool is_key);
33 
34 /*
35  *
36  * jsonb_ops GIN opclass support functions
37  *
38  */
39 
40 Datum
gin_compare_jsonb(PG_FUNCTION_ARGS)41 gin_compare_jsonb(PG_FUNCTION_ARGS)
42 {
43 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
44 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
45 	int32		result;
46 	char	   *a1p,
47 			   *a2p;
48 	int			len1,
49 				len2;
50 
51 	a1p = VARDATA_ANY(arg1);
52 	a2p = VARDATA_ANY(arg2);
53 
54 	len1 = VARSIZE_ANY_EXHDR(arg1);
55 	len2 = VARSIZE_ANY_EXHDR(arg2);
56 
57 	/* Compare text as bttextcmp does, but always using C collation */
58 	result = varstr_cmp(a1p, len1, a2p, len2, C_COLLATION_OID);
59 
60 	PG_FREE_IF_COPY(arg1, 0);
61 	PG_FREE_IF_COPY(arg2, 1);
62 
63 	PG_RETURN_INT32(result);
64 }
65 
66 Datum
gin_extract_jsonb(PG_FUNCTION_ARGS)67 gin_extract_jsonb(PG_FUNCTION_ARGS)
68 {
69 	Jsonb	   *jb = (Jsonb *) PG_GETARG_JSONB_P(0);
70 	int32	   *nentries = (int32 *) PG_GETARG_POINTER(1);
71 	int			total = 2 * JB_ROOT_COUNT(jb);
72 	JsonbIterator *it;
73 	JsonbValue	v;
74 	JsonbIteratorToken r;
75 	int			i = 0;
76 	Datum	   *entries;
77 
78 	/* If the root level is empty, we certainly have no keys */
79 	if (total == 0)
80 	{
81 		*nentries = 0;
82 		PG_RETURN_POINTER(NULL);
83 	}
84 
85 	/* Otherwise, use 2 * root count as initial estimate of result size */
86 	entries = (Datum *) palloc(sizeof(Datum) * total);
87 
88 	it = JsonbIteratorInit(&jb->root);
89 
90 	while ((r = JsonbIteratorNext(&it, &v, false)) != WJB_DONE)
91 	{
92 		/* Since we recurse into the object, we might need more space */
93 		if (i >= total)
94 		{
95 			total *= 2;
96 			entries = (Datum *) repalloc(entries, sizeof(Datum) * total);
97 		}
98 
99 		switch (r)
100 		{
101 			case WJB_KEY:
102 				entries[i++] = make_scalar_key(&v, true);
103 				break;
104 			case WJB_ELEM:
105 				/* Pretend string array elements are keys, see jsonb.h */
106 				entries[i++] = make_scalar_key(&v, (v.type == jbvString));
107 				break;
108 			case WJB_VALUE:
109 				entries[i++] = make_scalar_key(&v, false);
110 				break;
111 			default:
112 				/* we can ignore structural items */
113 				break;
114 		}
115 	}
116 
117 	*nentries = i;
118 
119 	PG_RETURN_POINTER(entries);
120 }
121 
122 Datum
gin_extract_jsonb_query(PG_FUNCTION_ARGS)123 gin_extract_jsonb_query(PG_FUNCTION_ARGS)
124 {
125 	int32	   *nentries = (int32 *) PG_GETARG_POINTER(1);
126 	StrategyNumber strategy = PG_GETARG_UINT16(2);
127 	int32	   *searchMode = (int32 *) PG_GETARG_POINTER(6);
128 	Datum	   *entries;
129 
130 	if (strategy == JsonbContainsStrategyNumber)
131 	{
132 		/* Query is a jsonb, so just apply gin_extract_jsonb... */
133 		entries = (Datum *)
134 			DatumGetPointer(DirectFunctionCall2(gin_extract_jsonb,
135 												PG_GETARG_DATUM(0),
136 												PointerGetDatum(nentries)));
137 		/* ...although "contains {}" requires a full index scan */
138 		if (*nentries == 0)
139 			*searchMode = GIN_SEARCH_MODE_ALL;
140 	}
141 	else if (strategy == JsonbExistsStrategyNumber)
142 	{
143 		/* Query is a text string, which we treat as a key */
144 		text	   *query = PG_GETARG_TEXT_PP(0);
145 
146 		*nentries = 1;
147 		entries = (Datum *) palloc(sizeof(Datum));
148 		entries[0] = make_text_key(JGINFLAG_KEY,
149 								   VARDATA_ANY(query),
150 								   VARSIZE_ANY_EXHDR(query));
151 	}
152 	else if (strategy == JsonbExistsAnyStrategyNumber ||
153 			 strategy == JsonbExistsAllStrategyNumber)
154 	{
155 		/* Query is a text array; each element is treated as a key */
156 		ArrayType  *query = PG_GETARG_ARRAYTYPE_P(0);
157 		Datum	   *key_datums;
158 		bool	   *key_nulls;
159 		int			key_count;
160 		int			i,
161 					j;
162 
163 		deconstruct_array(query,
164 						  TEXTOID, -1, false, 'i',
165 						  &key_datums, &key_nulls, &key_count);
166 
167 		entries = (Datum *) palloc(sizeof(Datum) * key_count);
168 
169 		for (i = 0, j = 0; i < key_count; i++)
170 		{
171 			/* Nulls in the array are ignored */
172 			if (key_nulls[i])
173 				continue;
174 			entries[j++] = make_text_key(JGINFLAG_KEY,
175 										 VARDATA(key_datums[i]),
176 										 VARSIZE(key_datums[i]) - VARHDRSZ);
177 		}
178 
179 		*nentries = j;
180 		/* ExistsAll with no keys should match everything */
181 		if (j == 0 && strategy == JsonbExistsAllStrategyNumber)
182 			*searchMode = GIN_SEARCH_MODE_ALL;
183 	}
184 	else
185 	{
186 		elog(ERROR, "unrecognized strategy number: %d", strategy);
187 		entries = NULL;			/* keep compiler quiet */
188 	}
189 
190 	PG_RETURN_POINTER(entries);
191 }
192 
193 Datum
gin_consistent_jsonb(PG_FUNCTION_ARGS)194 gin_consistent_jsonb(PG_FUNCTION_ARGS)
195 {
196 	bool	   *check = (bool *) PG_GETARG_POINTER(0);
197 	StrategyNumber strategy = PG_GETARG_UINT16(1);
198 
199 	/* Jsonb	   *query = PG_GETARG_JSONB_P(2); */
200 	int32		nkeys = PG_GETARG_INT32(3);
201 
202 	/* Pointer	   *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
203 	bool	   *recheck = (bool *) PG_GETARG_POINTER(5);
204 	bool		res = true;
205 	int32		i;
206 
207 	if (strategy == JsonbContainsStrategyNumber)
208 	{
209 		/*
210 		 * We must always recheck, since we can't tell from the index whether
211 		 * the positions of the matched items match the structure of the query
212 		 * object.  (Even if we could, we'd also have to worry about hashed
213 		 * keys and the index's failure to distinguish keys from string array
214 		 * elements.)  However, the tuple certainly doesn't match unless it
215 		 * contains all the query keys.
216 		 */
217 		*recheck = true;
218 		for (i = 0; i < nkeys; i++)
219 		{
220 			if (!check[i])
221 			{
222 				res = false;
223 				break;
224 			}
225 		}
226 	}
227 	else if (strategy == JsonbExistsStrategyNumber)
228 	{
229 		/*
230 		 * Although the key is certainly present in the index, we must recheck
231 		 * because (1) the key might be hashed, and (2) the index match might
232 		 * be for a key that's not at top level of the JSON object.  For (1),
233 		 * we could look at the query key to see if it's hashed and not
234 		 * recheck if not, but the index lacks enough info to tell about (2).
235 		 */
236 		*recheck = true;
237 		res = true;
238 	}
239 	else if (strategy == JsonbExistsAnyStrategyNumber)
240 	{
241 		/* As for plain exists, we must recheck */
242 		*recheck = true;
243 		res = true;
244 	}
245 	else if (strategy == JsonbExistsAllStrategyNumber)
246 	{
247 		/* As for plain exists, we must recheck */
248 		*recheck = true;
249 		/* ... but unless all the keys are present, we can say "false" */
250 		for (i = 0; i < nkeys; i++)
251 		{
252 			if (!check[i])
253 			{
254 				res = false;
255 				break;
256 			}
257 		}
258 	}
259 	else
260 		elog(ERROR, "unrecognized strategy number: %d", strategy);
261 
262 	PG_RETURN_BOOL(res);
263 }
264 
265 Datum
gin_triconsistent_jsonb(PG_FUNCTION_ARGS)266 gin_triconsistent_jsonb(PG_FUNCTION_ARGS)
267 {
268 	GinTernaryValue *check = (GinTernaryValue *) PG_GETARG_POINTER(0);
269 	StrategyNumber strategy = PG_GETARG_UINT16(1);
270 
271 	/* Jsonb	   *query = PG_GETARG_JSONB_P(2); */
272 	int32		nkeys = PG_GETARG_INT32(3);
273 
274 	/* Pointer	   *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
275 	GinTernaryValue res = GIN_MAYBE;
276 	int32		i;
277 
278 	/*
279 	 * Note that we never return GIN_TRUE, only GIN_MAYBE or GIN_FALSE; this
280 	 * corresponds to always forcing recheck in the regular consistent
281 	 * function, for the reasons listed there.
282 	 */
283 	if (strategy == JsonbContainsStrategyNumber ||
284 		strategy == JsonbExistsAllStrategyNumber)
285 	{
286 		/* All extracted keys must be present */
287 		for (i = 0; i < nkeys; i++)
288 		{
289 			if (check[i] == GIN_FALSE)
290 			{
291 				res = GIN_FALSE;
292 				break;
293 			}
294 		}
295 	}
296 	else if (strategy == JsonbExistsStrategyNumber ||
297 			 strategy == JsonbExistsAnyStrategyNumber)
298 	{
299 		/* At least one extracted key must be present */
300 		res = GIN_FALSE;
301 		for (i = 0; i < nkeys; i++)
302 		{
303 			if (check[i] == GIN_TRUE ||
304 				check[i] == GIN_MAYBE)
305 			{
306 				res = GIN_MAYBE;
307 				break;
308 			}
309 		}
310 	}
311 	else
312 		elog(ERROR, "unrecognized strategy number: %d", strategy);
313 
314 	PG_RETURN_GIN_TERNARY_VALUE(res);
315 }
316 
317 /*
318  *
319  * jsonb_path_ops GIN opclass support functions
320  *
321  * In a jsonb_path_ops index, the GIN keys are uint32 hashes, one per JSON
322  * value; but the JSON key(s) leading to each value are also included in its
323  * hash computation.  This means we can only support containment queries,
324  * but the index can distinguish, for example, {"foo": 42} from {"bar": 42}
325  * since different hashes will be generated.
326  *
327  */
328 
329 Datum
gin_extract_jsonb_path(PG_FUNCTION_ARGS)330 gin_extract_jsonb_path(PG_FUNCTION_ARGS)
331 {
332 	Jsonb	   *jb = PG_GETARG_JSONB_P(0);
333 	int32	   *nentries = (int32 *) PG_GETARG_POINTER(1);
334 	int			total = 2 * JB_ROOT_COUNT(jb);
335 	JsonbIterator *it;
336 	JsonbValue	v;
337 	JsonbIteratorToken r;
338 	PathHashStack tail;
339 	PathHashStack *stack;
340 	int			i = 0;
341 	Datum	   *entries;
342 
343 	/* If the root level is empty, we certainly have no keys */
344 	if (total == 0)
345 	{
346 		*nentries = 0;
347 		PG_RETURN_POINTER(NULL);
348 	}
349 
350 	/* Otherwise, use 2 * root count as initial estimate of result size */
351 	entries = (Datum *) palloc(sizeof(Datum) * total);
352 
353 	/* We keep a stack of partial hashes corresponding to parent key levels */
354 	tail.parent = NULL;
355 	tail.hash = 0;
356 	stack = &tail;
357 
358 	it = JsonbIteratorInit(&jb->root);
359 
360 	while ((r = JsonbIteratorNext(&it, &v, false)) != WJB_DONE)
361 	{
362 		PathHashStack *parent;
363 
364 		/* Since we recurse into the object, we might need more space */
365 		if (i >= total)
366 		{
367 			total *= 2;
368 			entries = (Datum *) repalloc(entries, sizeof(Datum) * total);
369 		}
370 
371 		switch (r)
372 		{
373 			case WJB_BEGIN_ARRAY:
374 			case WJB_BEGIN_OBJECT:
375 				/* Push a stack level for this object */
376 				parent = stack;
377 				stack = (PathHashStack *) palloc(sizeof(PathHashStack));
378 
379 				/*
380 				 * We pass forward hashes from outer nesting levels so that
381 				 * the hashes for nested values will include outer keys as
382 				 * well as their own keys.
383 				 *
384 				 * Nesting an array within another array will not alter
385 				 * innermost scalar element hash values, but that seems
386 				 * inconsequential.
387 				 */
388 				stack->hash = parent->hash;
389 				stack->parent = parent;
390 				break;
391 			case WJB_KEY:
392 				/* mix this key into the current outer hash */
393 				JsonbHashScalarValue(&v, &stack->hash);
394 				/* hash is now ready to incorporate the value */
395 				break;
396 			case WJB_ELEM:
397 			case WJB_VALUE:
398 				/* mix the element or value's hash into the prepared hash */
399 				JsonbHashScalarValue(&v, &stack->hash);
400 				/* and emit an index entry */
401 				entries[i++] = UInt32GetDatum(stack->hash);
402 				/* reset hash for next key, value, or sub-object */
403 				stack->hash = stack->parent->hash;
404 				break;
405 			case WJB_END_ARRAY:
406 			case WJB_END_OBJECT:
407 				/* Pop the stack */
408 				parent = stack->parent;
409 				pfree(stack);
410 				stack = parent;
411 				/* reset hash for next key, value, or sub-object */
412 				if (stack->parent)
413 					stack->hash = stack->parent->hash;
414 				else
415 					stack->hash = 0;
416 				break;
417 			default:
418 				elog(ERROR, "invalid JsonbIteratorNext rc: %d", (int) r);
419 		}
420 	}
421 
422 	*nentries = i;
423 
424 	PG_RETURN_POINTER(entries);
425 }
426 
427 Datum
gin_extract_jsonb_query_path(PG_FUNCTION_ARGS)428 gin_extract_jsonb_query_path(PG_FUNCTION_ARGS)
429 {
430 	int32	   *nentries = (int32 *) PG_GETARG_POINTER(1);
431 	StrategyNumber strategy = PG_GETARG_UINT16(2);
432 	int32	   *searchMode = (int32 *) PG_GETARG_POINTER(6);
433 	Datum	   *entries;
434 
435 	if (strategy != JsonbContainsStrategyNumber)
436 		elog(ERROR, "unrecognized strategy number: %d", strategy);
437 
438 	/* Query is a jsonb, so just apply gin_extract_jsonb_path ... */
439 	entries = (Datum *)
440 		DatumGetPointer(DirectFunctionCall2(gin_extract_jsonb_path,
441 											PG_GETARG_DATUM(0),
442 											PointerGetDatum(nentries)));
443 
444 	/* ... although "contains {}" requires a full index scan */
445 	if (*nentries == 0)
446 		*searchMode = GIN_SEARCH_MODE_ALL;
447 
448 	PG_RETURN_POINTER(entries);
449 }
450 
451 Datum
gin_consistent_jsonb_path(PG_FUNCTION_ARGS)452 gin_consistent_jsonb_path(PG_FUNCTION_ARGS)
453 {
454 	bool	   *check = (bool *) PG_GETARG_POINTER(0);
455 	StrategyNumber strategy = PG_GETARG_UINT16(1);
456 
457 	/* Jsonb	   *query = PG_GETARG_JSONB_P(2); */
458 	int32		nkeys = PG_GETARG_INT32(3);
459 
460 	/* Pointer	   *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
461 	bool	   *recheck = (bool *) PG_GETARG_POINTER(5);
462 	bool		res = true;
463 	int32		i;
464 
465 	if (strategy != JsonbContainsStrategyNumber)
466 		elog(ERROR, "unrecognized strategy number: %d", strategy);
467 
468 	/*
469 	 * jsonb_path_ops is necessarily lossy, not only because of hash
470 	 * collisions but also because it doesn't preserve complete information
471 	 * about the structure of the JSON object.  Besides, there are some
472 	 * special rules around the containment of raw scalars in arrays that are
473 	 * not handled here.  So we must always recheck a match.  However, if not
474 	 * all of the keys are present, the tuple certainly doesn't match.
475 	 */
476 	*recheck = true;
477 	for (i = 0; i < nkeys; i++)
478 	{
479 		if (!check[i])
480 		{
481 			res = false;
482 			break;
483 		}
484 	}
485 
486 	PG_RETURN_BOOL(res);
487 }
488 
489 Datum
gin_triconsistent_jsonb_path(PG_FUNCTION_ARGS)490 gin_triconsistent_jsonb_path(PG_FUNCTION_ARGS)
491 {
492 	GinTernaryValue *check = (GinTernaryValue *) PG_GETARG_POINTER(0);
493 	StrategyNumber strategy = PG_GETARG_UINT16(1);
494 
495 	/* Jsonb	   *query = PG_GETARG_JSONB_P(2); */
496 	int32		nkeys = PG_GETARG_INT32(3);
497 
498 	/* Pointer	   *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
499 	GinTernaryValue res = GIN_MAYBE;
500 	int32		i;
501 
502 	if (strategy != JsonbContainsStrategyNumber)
503 		elog(ERROR, "unrecognized strategy number: %d", strategy);
504 
505 	/*
506 	 * Note that we never return GIN_TRUE, only GIN_MAYBE or GIN_FALSE; this
507 	 * corresponds to always forcing recheck in the regular consistent
508 	 * function, for the reasons listed there.
509 	 */
510 	for (i = 0; i < nkeys; i++)
511 	{
512 		if (check[i] == GIN_FALSE)
513 		{
514 			res = GIN_FALSE;
515 			break;
516 		}
517 	}
518 
519 	PG_RETURN_GIN_TERNARY_VALUE(res);
520 }
521 
522 /*
523  * Construct a jsonb_ops GIN key from a flag byte and a textual representation
524  * (which need not be null-terminated).  This function is responsible
525  * for hashing overlength text representations; it will add the
526  * JGINFLAG_HASHED bit to the flag value if it does that.
527  */
528 static Datum
make_text_key(char flag,const char * str,int len)529 make_text_key(char flag, const char *str, int len)
530 {
531 	text	   *item;
532 	char		hashbuf[10];
533 
534 	if (len > JGIN_MAXLENGTH)
535 	{
536 		uint32		hashval;
537 
538 		hashval = DatumGetUInt32(hash_any((const unsigned char *) str, len));
539 		snprintf(hashbuf, sizeof(hashbuf), "%08x", hashval);
540 		str = hashbuf;
541 		len = 8;
542 		flag |= JGINFLAG_HASHED;
543 	}
544 
545 	/*
546 	 * Now build the text Datum.  For simplicity we build a 4-byte-header
547 	 * varlena text Datum here, but we expect it will get converted to short
548 	 * header format when stored in the index.
549 	 */
550 	item = (text *) palloc(VARHDRSZ + len + 1);
551 	SET_VARSIZE(item, VARHDRSZ + len + 1);
552 
553 	*VARDATA(item) = flag;
554 
555 	memcpy(VARDATA(item) + 1, str, len);
556 
557 	return PointerGetDatum(item);
558 }
559 
560 /*
561  * Create a textual representation of a JsonbValue that will serve as a GIN
562  * key in a jsonb_ops index.  is_key is true if the JsonbValue is a key,
563  * or if it is a string array element (since we pretend those are keys,
564  * see jsonb.h).
565  */
566 static Datum
make_scalar_key(const JsonbValue * scalarVal,bool is_key)567 make_scalar_key(const JsonbValue *scalarVal, bool is_key)
568 {
569 	Datum		item;
570 	char	   *cstr;
571 
572 	switch (scalarVal->type)
573 	{
574 		case jbvNull:
575 			Assert(!is_key);
576 			item = make_text_key(JGINFLAG_NULL, "", 0);
577 			break;
578 		case jbvBool:
579 			Assert(!is_key);
580 			item = make_text_key(JGINFLAG_BOOL,
581 								 scalarVal->val.boolean ? "t" : "f", 1);
582 			break;
583 		case jbvNumeric:
584 			Assert(!is_key);
585 
586 			/*
587 			 * A normalized textual representation, free of trailing zeroes,
588 			 * is required so that numerically equal values will produce equal
589 			 * strings.
590 			 *
591 			 * It isn't ideal that numerics are stored in a relatively bulky
592 			 * textual format.  However, it's a notationally convenient way of
593 			 * storing a "union" type in the GIN B-Tree, and indexing Jsonb
594 			 * strings takes precedence.
595 			 */
596 			cstr = numeric_normalize(scalarVal->val.numeric);
597 			item = make_text_key(JGINFLAG_NUM, cstr, strlen(cstr));
598 			pfree(cstr);
599 			break;
600 		case jbvString:
601 			item = make_text_key(is_key ? JGINFLAG_KEY : JGINFLAG_STR,
602 								 scalarVal->val.string.val,
603 								 scalarVal->val.string.len);
604 			break;
605 		default:
606 			elog(ERROR, "unrecognized jsonb scalar type: %d", scalarVal->type);
607 			item = 0;			/* keep compiler quiet */
608 			break;
609 	}
610 
611 	return item;
612 }
613