1 /*-------------------------------------------------------------------------
2 *
3 * jsonb_gin.c
4 * GIN support functions for jsonb
5 *
6 * Copyright (c) 2014-2018, PostgreSQL Global Development Group
7 *
8 *
9 * IDENTIFICATION
10 * src/backend/utils/adt/jsonb_gin.c
11 *
12 *-------------------------------------------------------------------------
13 */
14 #include "postgres.h"
15
16 #include "access/gin.h"
17 #include "access/hash.h"
18 #include "access/stratnum.h"
19 #include "catalog/pg_collation.h"
20 #include "catalog/pg_type.h"
21 #include "utils/builtins.h"
22 #include "utils/jsonb.h"
23 #include "utils/varlena.h"
24
25 typedef struct PathHashStack
26 {
27 uint32 hash;
28 struct PathHashStack *parent;
29 } PathHashStack;
30
31 static Datum make_text_key(char flag, const char *str, int len);
32 static Datum make_scalar_key(const JsonbValue *scalarVal, bool is_key);
33
34 /*
35 *
36 * jsonb_ops GIN opclass support functions
37 *
38 */
39
40 Datum
gin_compare_jsonb(PG_FUNCTION_ARGS)41 gin_compare_jsonb(PG_FUNCTION_ARGS)
42 {
43 text *arg1 = PG_GETARG_TEXT_PP(0);
44 text *arg2 = PG_GETARG_TEXT_PP(1);
45 int32 result;
46 char *a1p,
47 *a2p;
48 int len1,
49 len2;
50
51 a1p = VARDATA_ANY(arg1);
52 a2p = VARDATA_ANY(arg2);
53
54 len1 = VARSIZE_ANY_EXHDR(arg1);
55 len2 = VARSIZE_ANY_EXHDR(arg2);
56
57 /* Compare text as bttextcmp does, but always using C collation */
58 result = varstr_cmp(a1p, len1, a2p, len2, C_COLLATION_OID);
59
60 PG_FREE_IF_COPY(arg1, 0);
61 PG_FREE_IF_COPY(arg2, 1);
62
63 PG_RETURN_INT32(result);
64 }
65
66 Datum
gin_extract_jsonb(PG_FUNCTION_ARGS)67 gin_extract_jsonb(PG_FUNCTION_ARGS)
68 {
69 Jsonb *jb = (Jsonb *) PG_GETARG_JSONB_P(0);
70 int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
71 int total = 2 * JB_ROOT_COUNT(jb);
72 JsonbIterator *it;
73 JsonbValue v;
74 JsonbIteratorToken r;
75 int i = 0;
76 Datum *entries;
77
78 /* If the root level is empty, we certainly have no keys */
79 if (total == 0)
80 {
81 *nentries = 0;
82 PG_RETURN_POINTER(NULL);
83 }
84
85 /* Otherwise, use 2 * root count as initial estimate of result size */
86 entries = (Datum *) palloc(sizeof(Datum) * total);
87
88 it = JsonbIteratorInit(&jb->root);
89
90 while ((r = JsonbIteratorNext(&it, &v, false)) != WJB_DONE)
91 {
92 /* Since we recurse into the object, we might need more space */
93 if (i >= total)
94 {
95 total *= 2;
96 entries = (Datum *) repalloc(entries, sizeof(Datum) * total);
97 }
98
99 switch (r)
100 {
101 case WJB_KEY:
102 entries[i++] = make_scalar_key(&v, true);
103 break;
104 case WJB_ELEM:
105 /* Pretend string array elements are keys, see jsonb.h */
106 entries[i++] = make_scalar_key(&v, (v.type == jbvString));
107 break;
108 case WJB_VALUE:
109 entries[i++] = make_scalar_key(&v, false);
110 break;
111 default:
112 /* we can ignore structural items */
113 break;
114 }
115 }
116
117 *nentries = i;
118
119 PG_RETURN_POINTER(entries);
120 }
121
122 Datum
gin_extract_jsonb_query(PG_FUNCTION_ARGS)123 gin_extract_jsonb_query(PG_FUNCTION_ARGS)
124 {
125 int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
126 StrategyNumber strategy = PG_GETARG_UINT16(2);
127 int32 *searchMode = (int32 *) PG_GETARG_POINTER(6);
128 Datum *entries;
129
130 if (strategy == JsonbContainsStrategyNumber)
131 {
132 /* Query is a jsonb, so just apply gin_extract_jsonb... */
133 entries = (Datum *)
134 DatumGetPointer(DirectFunctionCall2(gin_extract_jsonb,
135 PG_GETARG_DATUM(0),
136 PointerGetDatum(nentries)));
137 /* ...although "contains {}" requires a full index scan */
138 if (*nentries == 0)
139 *searchMode = GIN_SEARCH_MODE_ALL;
140 }
141 else if (strategy == JsonbExistsStrategyNumber)
142 {
143 /* Query is a text string, which we treat as a key */
144 text *query = PG_GETARG_TEXT_PP(0);
145
146 *nentries = 1;
147 entries = (Datum *) palloc(sizeof(Datum));
148 entries[0] = make_text_key(JGINFLAG_KEY,
149 VARDATA_ANY(query),
150 VARSIZE_ANY_EXHDR(query));
151 }
152 else if (strategy == JsonbExistsAnyStrategyNumber ||
153 strategy == JsonbExistsAllStrategyNumber)
154 {
155 /* Query is a text array; each element is treated as a key */
156 ArrayType *query = PG_GETARG_ARRAYTYPE_P(0);
157 Datum *key_datums;
158 bool *key_nulls;
159 int key_count;
160 int i,
161 j;
162
163 deconstruct_array(query,
164 TEXTOID, -1, false, 'i',
165 &key_datums, &key_nulls, &key_count);
166
167 entries = (Datum *) palloc(sizeof(Datum) * key_count);
168
169 for (i = 0, j = 0; i < key_count; i++)
170 {
171 /* Nulls in the array are ignored */
172 if (key_nulls[i])
173 continue;
174 entries[j++] = make_text_key(JGINFLAG_KEY,
175 VARDATA(key_datums[i]),
176 VARSIZE(key_datums[i]) - VARHDRSZ);
177 }
178
179 *nentries = j;
180 /* ExistsAll with no keys should match everything */
181 if (j == 0 && strategy == JsonbExistsAllStrategyNumber)
182 *searchMode = GIN_SEARCH_MODE_ALL;
183 }
184 else
185 {
186 elog(ERROR, "unrecognized strategy number: %d", strategy);
187 entries = NULL; /* keep compiler quiet */
188 }
189
190 PG_RETURN_POINTER(entries);
191 }
192
193 Datum
gin_consistent_jsonb(PG_FUNCTION_ARGS)194 gin_consistent_jsonb(PG_FUNCTION_ARGS)
195 {
196 bool *check = (bool *) PG_GETARG_POINTER(0);
197 StrategyNumber strategy = PG_GETARG_UINT16(1);
198
199 /* Jsonb *query = PG_GETARG_JSONB_P(2); */
200 int32 nkeys = PG_GETARG_INT32(3);
201
202 /* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
203 bool *recheck = (bool *) PG_GETARG_POINTER(5);
204 bool res = true;
205 int32 i;
206
207 if (strategy == JsonbContainsStrategyNumber)
208 {
209 /*
210 * We must always recheck, since we can't tell from the index whether
211 * the positions of the matched items match the structure of the query
212 * object. (Even if we could, we'd also have to worry about hashed
213 * keys and the index's failure to distinguish keys from string array
214 * elements.) However, the tuple certainly doesn't match unless it
215 * contains all the query keys.
216 */
217 *recheck = true;
218 for (i = 0; i < nkeys; i++)
219 {
220 if (!check[i])
221 {
222 res = false;
223 break;
224 }
225 }
226 }
227 else if (strategy == JsonbExistsStrategyNumber)
228 {
229 /*
230 * Although the key is certainly present in the index, we must recheck
231 * because (1) the key might be hashed, and (2) the index match might
232 * be for a key that's not at top level of the JSON object. For (1),
233 * we could look at the query key to see if it's hashed and not
234 * recheck if not, but the index lacks enough info to tell about (2).
235 */
236 *recheck = true;
237 res = true;
238 }
239 else if (strategy == JsonbExistsAnyStrategyNumber)
240 {
241 /* As for plain exists, we must recheck */
242 *recheck = true;
243 res = true;
244 }
245 else if (strategy == JsonbExistsAllStrategyNumber)
246 {
247 /* As for plain exists, we must recheck */
248 *recheck = true;
249 /* ... but unless all the keys are present, we can say "false" */
250 for (i = 0; i < nkeys; i++)
251 {
252 if (!check[i])
253 {
254 res = false;
255 break;
256 }
257 }
258 }
259 else
260 elog(ERROR, "unrecognized strategy number: %d", strategy);
261
262 PG_RETURN_BOOL(res);
263 }
264
265 Datum
gin_triconsistent_jsonb(PG_FUNCTION_ARGS)266 gin_triconsistent_jsonb(PG_FUNCTION_ARGS)
267 {
268 GinTernaryValue *check = (GinTernaryValue *) PG_GETARG_POINTER(0);
269 StrategyNumber strategy = PG_GETARG_UINT16(1);
270
271 /* Jsonb *query = PG_GETARG_JSONB_P(2); */
272 int32 nkeys = PG_GETARG_INT32(3);
273
274 /* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
275 GinTernaryValue res = GIN_MAYBE;
276 int32 i;
277
278 /*
279 * Note that we never return GIN_TRUE, only GIN_MAYBE or GIN_FALSE; this
280 * corresponds to always forcing recheck in the regular consistent
281 * function, for the reasons listed there.
282 */
283 if (strategy == JsonbContainsStrategyNumber ||
284 strategy == JsonbExistsAllStrategyNumber)
285 {
286 /* All extracted keys must be present */
287 for (i = 0; i < nkeys; i++)
288 {
289 if (check[i] == GIN_FALSE)
290 {
291 res = GIN_FALSE;
292 break;
293 }
294 }
295 }
296 else if (strategy == JsonbExistsStrategyNumber ||
297 strategy == JsonbExistsAnyStrategyNumber)
298 {
299 /* At least one extracted key must be present */
300 res = GIN_FALSE;
301 for (i = 0; i < nkeys; i++)
302 {
303 if (check[i] == GIN_TRUE ||
304 check[i] == GIN_MAYBE)
305 {
306 res = GIN_MAYBE;
307 break;
308 }
309 }
310 }
311 else
312 elog(ERROR, "unrecognized strategy number: %d", strategy);
313
314 PG_RETURN_GIN_TERNARY_VALUE(res);
315 }
316
317 /*
318 *
319 * jsonb_path_ops GIN opclass support functions
320 *
321 * In a jsonb_path_ops index, the GIN keys are uint32 hashes, one per JSON
322 * value; but the JSON key(s) leading to each value are also included in its
323 * hash computation. This means we can only support containment queries,
324 * but the index can distinguish, for example, {"foo": 42} from {"bar": 42}
325 * since different hashes will be generated.
326 *
327 */
328
329 Datum
gin_extract_jsonb_path(PG_FUNCTION_ARGS)330 gin_extract_jsonb_path(PG_FUNCTION_ARGS)
331 {
332 Jsonb *jb = PG_GETARG_JSONB_P(0);
333 int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
334 int total = 2 * JB_ROOT_COUNT(jb);
335 JsonbIterator *it;
336 JsonbValue v;
337 JsonbIteratorToken r;
338 PathHashStack tail;
339 PathHashStack *stack;
340 int i = 0;
341 Datum *entries;
342
343 /* If the root level is empty, we certainly have no keys */
344 if (total == 0)
345 {
346 *nentries = 0;
347 PG_RETURN_POINTER(NULL);
348 }
349
350 /* Otherwise, use 2 * root count as initial estimate of result size */
351 entries = (Datum *) palloc(sizeof(Datum) * total);
352
353 /* We keep a stack of partial hashes corresponding to parent key levels */
354 tail.parent = NULL;
355 tail.hash = 0;
356 stack = &tail;
357
358 it = JsonbIteratorInit(&jb->root);
359
360 while ((r = JsonbIteratorNext(&it, &v, false)) != WJB_DONE)
361 {
362 PathHashStack *parent;
363
364 /* Since we recurse into the object, we might need more space */
365 if (i >= total)
366 {
367 total *= 2;
368 entries = (Datum *) repalloc(entries, sizeof(Datum) * total);
369 }
370
371 switch (r)
372 {
373 case WJB_BEGIN_ARRAY:
374 case WJB_BEGIN_OBJECT:
375 /* Push a stack level for this object */
376 parent = stack;
377 stack = (PathHashStack *) palloc(sizeof(PathHashStack));
378
379 /*
380 * We pass forward hashes from outer nesting levels so that
381 * the hashes for nested values will include outer keys as
382 * well as their own keys.
383 *
384 * Nesting an array within another array will not alter
385 * innermost scalar element hash values, but that seems
386 * inconsequential.
387 */
388 stack->hash = parent->hash;
389 stack->parent = parent;
390 break;
391 case WJB_KEY:
392 /* mix this key into the current outer hash */
393 JsonbHashScalarValue(&v, &stack->hash);
394 /* hash is now ready to incorporate the value */
395 break;
396 case WJB_ELEM:
397 case WJB_VALUE:
398 /* mix the element or value's hash into the prepared hash */
399 JsonbHashScalarValue(&v, &stack->hash);
400 /* and emit an index entry */
401 entries[i++] = UInt32GetDatum(stack->hash);
402 /* reset hash for next key, value, or sub-object */
403 stack->hash = stack->parent->hash;
404 break;
405 case WJB_END_ARRAY:
406 case WJB_END_OBJECT:
407 /* Pop the stack */
408 parent = stack->parent;
409 pfree(stack);
410 stack = parent;
411 /* reset hash for next key, value, or sub-object */
412 if (stack->parent)
413 stack->hash = stack->parent->hash;
414 else
415 stack->hash = 0;
416 break;
417 default:
418 elog(ERROR, "invalid JsonbIteratorNext rc: %d", (int) r);
419 }
420 }
421
422 *nentries = i;
423
424 PG_RETURN_POINTER(entries);
425 }
426
427 Datum
gin_extract_jsonb_query_path(PG_FUNCTION_ARGS)428 gin_extract_jsonb_query_path(PG_FUNCTION_ARGS)
429 {
430 int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
431 StrategyNumber strategy = PG_GETARG_UINT16(2);
432 int32 *searchMode = (int32 *) PG_GETARG_POINTER(6);
433 Datum *entries;
434
435 if (strategy != JsonbContainsStrategyNumber)
436 elog(ERROR, "unrecognized strategy number: %d", strategy);
437
438 /* Query is a jsonb, so just apply gin_extract_jsonb_path ... */
439 entries = (Datum *)
440 DatumGetPointer(DirectFunctionCall2(gin_extract_jsonb_path,
441 PG_GETARG_DATUM(0),
442 PointerGetDatum(nentries)));
443
444 /* ... although "contains {}" requires a full index scan */
445 if (*nentries == 0)
446 *searchMode = GIN_SEARCH_MODE_ALL;
447
448 PG_RETURN_POINTER(entries);
449 }
450
451 Datum
gin_consistent_jsonb_path(PG_FUNCTION_ARGS)452 gin_consistent_jsonb_path(PG_FUNCTION_ARGS)
453 {
454 bool *check = (bool *) PG_GETARG_POINTER(0);
455 StrategyNumber strategy = PG_GETARG_UINT16(1);
456
457 /* Jsonb *query = PG_GETARG_JSONB_P(2); */
458 int32 nkeys = PG_GETARG_INT32(3);
459
460 /* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
461 bool *recheck = (bool *) PG_GETARG_POINTER(5);
462 bool res = true;
463 int32 i;
464
465 if (strategy != JsonbContainsStrategyNumber)
466 elog(ERROR, "unrecognized strategy number: %d", strategy);
467
468 /*
469 * jsonb_path_ops is necessarily lossy, not only because of hash
470 * collisions but also because it doesn't preserve complete information
471 * about the structure of the JSON object. Besides, there are some
472 * special rules around the containment of raw scalars in arrays that are
473 * not handled here. So we must always recheck a match. However, if not
474 * all of the keys are present, the tuple certainly doesn't match.
475 */
476 *recheck = true;
477 for (i = 0; i < nkeys; i++)
478 {
479 if (!check[i])
480 {
481 res = false;
482 break;
483 }
484 }
485
486 PG_RETURN_BOOL(res);
487 }
488
489 Datum
gin_triconsistent_jsonb_path(PG_FUNCTION_ARGS)490 gin_triconsistent_jsonb_path(PG_FUNCTION_ARGS)
491 {
492 GinTernaryValue *check = (GinTernaryValue *) PG_GETARG_POINTER(0);
493 StrategyNumber strategy = PG_GETARG_UINT16(1);
494
495 /* Jsonb *query = PG_GETARG_JSONB_P(2); */
496 int32 nkeys = PG_GETARG_INT32(3);
497
498 /* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
499 GinTernaryValue res = GIN_MAYBE;
500 int32 i;
501
502 if (strategy != JsonbContainsStrategyNumber)
503 elog(ERROR, "unrecognized strategy number: %d", strategy);
504
505 /*
506 * Note that we never return GIN_TRUE, only GIN_MAYBE or GIN_FALSE; this
507 * corresponds to always forcing recheck in the regular consistent
508 * function, for the reasons listed there.
509 */
510 for (i = 0; i < nkeys; i++)
511 {
512 if (check[i] == GIN_FALSE)
513 {
514 res = GIN_FALSE;
515 break;
516 }
517 }
518
519 PG_RETURN_GIN_TERNARY_VALUE(res);
520 }
521
522 /*
523 * Construct a jsonb_ops GIN key from a flag byte and a textual representation
524 * (which need not be null-terminated). This function is responsible
525 * for hashing overlength text representations; it will add the
526 * JGINFLAG_HASHED bit to the flag value if it does that.
527 */
528 static Datum
make_text_key(char flag,const char * str,int len)529 make_text_key(char flag, const char *str, int len)
530 {
531 text *item;
532 char hashbuf[10];
533
534 if (len > JGIN_MAXLENGTH)
535 {
536 uint32 hashval;
537
538 hashval = DatumGetUInt32(hash_any((const unsigned char *) str, len));
539 snprintf(hashbuf, sizeof(hashbuf), "%08x", hashval);
540 str = hashbuf;
541 len = 8;
542 flag |= JGINFLAG_HASHED;
543 }
544
545 /*
546 * Now build the text Datum. For simplicity we build a 4-byte-header
547 * varlena text Datum here, but we expect it will get converted to short
548 * header format when stored in the index.
549 */
550 item = (text *) palloc(VARHDRSZ + len + 1);
551 SET_VARSIZE(item, VARHDRSZ + len + 1);
552
553 *VARDATA(item) = flag;
554
555 memcpy(VARDATA(item) + 1, str, len);
556
557 return PointerGetDatum(item);
558 }
559
560 /*
561 * Create a textual representation of a JsonbValue that will serve as a GIN
562 * key in a jsonb_ops index. is_key is true if the JsonbValue is a key,
563 * or if it is a string array element (since we pretend those are keys,
564 * see jsonb.h).
565 */
566 static Datum
make_scalar_key(const JsonbValue * scalarVal,bool is_key)567 make_scalar_key(const JsonbValue *scalarVal, bool is_key)
568 {
569 Datum item;
570 char *cstr;
571
572 switch (scalarVal->type)
573 {
574 case jbvNull:
575 Assert(!is_key);
576 item = make_text_key(JGINFLAG_NULL, "", 0);
577 break;
578 case jbvBool:
579 Assert(!is_key);
580 item = make_text_key(JGINFLAG_BOOL,
581 scalarVal->val.boolean ? "t" : "f", 1);
582 break;
583 case jbvNumeric:
584 Assert(!is_key);
585
586 /*
587 * A normalized textual representation, free of trailing zeroes,
588 * is required so that numerically equal values will produce equal
589 * strings.
590 *
591 * It isn't ideal that numerics are stored in a relatively bulky
592 * textual format. However, it's a notationally convenient way of
593 * storing a "union" type in the GIN B-Tree, and indexing Jsonb
594 * strings takes precedence.
595 */
596 cstr = numeric_normalize(scalarVal->val.numeric);
597 item = make_text_key(JGINFLAG_NUM, cstr, strlen(cstr));
598 pfree(cstr);
599 break;
600 case jbvString:
601 item = make_text_key(is_key ? JGINFLAG_KEY : JGINFLAG_STR,
602 scalarVal->val.string.val,
603 scalarVal->val.string.len);
604 break;
605 default:
606 elog(ERROR, "unrecognized jsonb scalar type: %d", scalarVal->type);
607 item = 0; /* keep compiler quiet */
608 break;
609 }
610
611 return item;
612 }
613