1 /*
2 * brin_tuple.c
3 * Method implementations for tuples in BRIN indexes.
4 *
5 * Intended usage is that code outside this file only deals with
6 * BrinMemTuples, and convert to and from the on-disk representation through
7 * functions in this file.
8 *
9 * NOTES
10 *
11 * A BRIN tuple is similar to a heap tuple, with a few key differences. The
12 * first interesting difference is that the tuple header is much simpler, only
13 * containing its total length and a small area for flags. Also, the stored
14 * data does not match the relation tuple descriptor exactly: for each
15 * attribute in the descriptor, the index tuple carries an arbitrary number
16 * of values, depending on the opclass.
17 *
18 * Also, for each column of the index relation there are two null bits: one
19 * (hasnulls) stores whether any tuple within the page range has that column
20 * set to null; the other one (allnulls) stores whether the column values are
21 * all null. If allnulls is true, then the tuple data area does not contain
22 * values for that column at all; whereas it does if the hasnulls is set.
23 * Note the size of the null bitmask may not be the same as that of the
24 * datum array.
25 *
26 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
27 * Portions Copyright (c) 1994, Regents of the University of California
28 *
29 * IDENTIFICATION
30 * src/backend/access/brin/brin_tuple.c
31 */
32 #include "postgres.h"
33
34 #include "access/htup_details.h"
35 #include "access/brin_tuple.h"
36 #include "access/tupdesc.h"
37 #include "access/tupmacs.h"
38 #include "access/tuptoaster.h"
39 #include "utils/datum.h"
40 #include "utils/memutils.h"
41
42
43 static inline void brin_deconstruct_tuple(BrinDesc *brdesc,
44 char *tp, bits8 *nullbits, bool nulls,
45 Datum *values, bool *allnulls, bool *hasnulls);
46
47
48 /*
49 * Return a tuple descriptor used for on-disk storage of BRIN tuples.
50 */
51 static TupleDesc
brtuple_disk_tupdesc(BrinDesc * brdesc)52 brtuple_disk_tupdesc(BrinDesc *brdesc)
53 {
54 /* We cache these in the BrinDesc */
55 if (brdesc->bd_disktdesc == NULL)
56 {
57 int i;
58 int j;
59 AttrNumber attno = 1;
60 TupleDesc tupdesc;
61 MemoryContext oldcxt;
62
63 /* make sure it's in the bdesc's context */
64 oldcxt = MemoryContextSwitchTo(brdesc->bd_context);
65
66 tupdesc = CreateTemplateTupleDesc(brdesc->bd_totalstored);
67
68 for (i = 0; i < brdesc->bd_tupdesc->natts; i++)
69 {
70 for (j = 0; j < brdesc->bd_info[i]->oi_nstored; j++)
71 TupleDescInitEntry(tupdesc, attno++, NULL,
72 brdesc->bd_info[i]->oi_typcache[j]->type_id,
73 -1, 0);
74 }
75
76 MemoryContextSwitchTo(oldcxt);
77
78 brdesc->bd_disktdesc = tupdesc;
79 }
80
81 return brdesc->bd_disktdesc;
82 }
83
84 /*
85 * Generate a new on-disk tuple to be inserted in a BRIN index.
86 *
87 * See brin_form_placeholder_tuple if you touch this.
88 */
89 BrinTuple *
brin_form_tuple(BrinDesc * brdesc,BlockNumber blkno,BrinMemTuple * tuple,Size * size)90 brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple,
91 Size *size)
92 {
93 Datum *values;
94 bool *nulls;
95 bool anynulls = false;
96 BrinTuple *rettuple;
97 int keyno;
98 int idxattno;
99 uint16 phony_infomask = 0;
100 bits8 *phony_nullbitmap;
101 Size len,
102 hoff,
103 data_len;
104 int i;
105
106 #ifdef TOAST_INDEX_HACK
107 Datum *untoasted_values;
108 int nuntoasted = 0;
109 #endif
110
111 Assert(brdesc->bd_totalstored > 0);
112
113 values = (Datum *) palloc(sizeof(Datum) * brdesc->bd_totalstored);
114 nulls = (bool *) palloc0(sizeof(bool) * brdesc->bd_totalstored);
115 phony_nullbitmap = (bits8 *)
116 palloc(sizeof(bits8) * BITMAPLEN(brdesc->bd_totalstored));
117
118 #ifdef TOAST_INDEX_HACK
119 untoasted_values = (Datum *) palloc(sizeof(Datum) * brdesc->bd_totalstored);
120 #endif
121
122 /*
123 * Set up the values/nulls arrays for heap_fill_tuple
124 */
125 idxattno = 0;
126 for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++)
127 {
128 int datumno;
129
130 /*
131 * "allnulls" is set when there's no nonnull value in any row in the
132 * column; when this happens, there is no data to store. Thus set the
133 * nullable bits for all data elements of this column and we're done.
134 */
135 if (tuple->bt_columns[keyno].bv_allnulls)
136 {
137 for (datumno = 0;
138 datumno < brdesc->bd_info[keyno]->oi_nstored;
139 datumno++)
140 nulls[idxattno++] = true;
141 anynulls = true;
142 continue;
143 }
144
145 /*
146 * The "hasnulls" bit is set when there are some null values in the
147 * data. We still need to store a real value, but the presence of
148 * this means we need a null bitmap.
149 */
150 if (tuple->bt_columns[keyno].bv_hasnulls)
151 anynulls = true;
152
153 /*
154 * Now obtain the values of each stored datum. Note that some values
155 * might be toasted, and we cannot rely on the original heap values
156 * sticking around forever, so we must detoast them. Also try to
157 * compress them.
158 */
159 for (datumno = 0;
160 datumno < brdesc->bd_info[keyno]->oi_nstored;
161 datumno++)
162 {
163 Datum value = tuple->bt_columns[keyno].bv_values[datumno];
164
165 #ifdef TOAST_INDEX_HACK
166
167 /* We must look at the stored type, not at the index descriptor. */
168 TypeCacheEntry *atttype = brdesc->bd_info[keyno]->oi_typcache[datumno];
169
170 /* Do we need to free the value at the end? */
171 bool free_value = false;
172
173 /* For non-varlena types we don't need to do anything special */
174 if (atttype->typlen != -1)
175 {
176 values[idxattno++] = value;
177 continue;
178 }
179
180 /*
181 * Do nothing if value is not of varlena type. We don't need to
182 * care about NULL values here, thanks to bv_allnulls above.
183 *
184 * If value is stored EXTERNAL, must fetch it so we are not
185 * depending on outside storage.
186 *
187 * XXX Is this actually true? Could it be that the summary is
188 * NULL even for range with non-NULL data? E.g. degenerate bloom
189 * filter may be thrown away, etc.
190 */
191 if (VARATT_IS_EXTERNAL(DatumGetPointer(value)))
192 {
193 value = PointerGetDatum(heap_tuple_fetch_attr((struct varlena *)
194 DatumGetPointer(value)));
195 free_value = true;
196 }
197
198 /*
199 * If value is above size target, and is of a compressible datatype,
200 * try to compress it in-line.
201 */
202 if (!VARATT_IS_EXTENDED(DatumGetPointer(value)) &&
203 VARSIZE(DatumGetPointer(value)) > TOAST_INDEX_TARGET &&
204 (atttype->typstorage == 'x' || atttype->typstorage == 'm'))
205 {
206 Datum cvalue = toast_compress_datum(value);
207
208 if (DatumGetPointer(cvalue) != NULL)
209 {
210 /* successful compression */
211 if (free_value)
212 pfree(DatumGetPointer(value));
213
214 value = cvalue;
215 free_value = true;
216 }
217 }
218
219 /*
220 * If we untoasted / compressed the value, we need to free it
221 * after forming the index tuple.
222 */
223 if (free_value)
224 untoasted_values[nuntoasted++] = value;
225
226 #endif
227
228 values[idxattno++] = value;
229 }
230 }
231
232 /* Assert we did not overrun temp arrays */
233 Assert(idxattno <= brdesc->bd_totalstored);
234
235 /* compute total space needed */
236 len = SizeOfBrinTuple;
237 if (anynulls)
238 {
239 /*
240 * We need a double-length bitmap on an on-disk BRIN index tuple; the
241 * first half stores the "allnulls" bits, the second stores
242 * "hasnulls".
243 */
244 len += BITMAPLEN(brdesc->bd_tupdesc->natts * 2);
245 }
246
247 len = hoff = MAXALIGN(len);
248
249 data_len = heap_compute_data_size(brtuple_disk_tupdesc(brdesc),
250 values, nulls);
251 len += data_len;
252
253 len = MAXALIGN(len);
254
255 rettuple = palloc0(len);
256 rettuple->bt_blkno = blkno;
257 rettuple->bt_info = hoff;
258
259 /* Assert that hoff fits in the space available */
260 Assert((rettuple->bt_info & BRIN_OFFSET_MASK) == hoff);
261
262 /*
263 * The infomask and null bitmap as computed by heap_fill_tuple are useless
264 * to us. However, that function will not accept a null infomask; and we
265 * need to pass a valid null bitmap so that it will correctly skip
266 * outputting null attributes in the data area.
267 */
268 heap_fill_tuple(brtuple_disk_tupdesc(brdesc),
269 values,
270 nulls,
271 (char *) rettuple + hoff,
272 data_len,
273 &phony_infomask,
274 phony_nullbitmap);
275
276 /* done with these */
277 pfree(values);
278 pfree(nulls);
279 pfree(phony_nullbitmap);
280
281 #ifdef TOAST_INDEX_HACK
282 for (i = 0; i < nuntoasted; i++)
283 pfree(DatumGetPointer(untoasted_values[i]));
284 #endif
285
286 /*
287 * Now fill in the real null bitmasks. allnulls first.
288 */
289 if (anynulls)
290 {
291 bits8 *bitP;
292 int bitmask;
293
294 rettuple->bt_info |= BRIN_NULLS_MASK;
295
296 /*
297 * Note that we reverse the sense of null bits in this module: we
298 * store a 1 for a null attribute rather than a 0. So we must reverse
299 * the sense of the att_isnull test in brin_deconstruct_tuple as well.
300 */
301 bitP = ((bits8 *) ((char *) rettuple + SizeOfBrinTuple)) - 1;
302 bitmask = HIGHBIT;
303 for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++)
304 {
305 if (bitmask != HIGHBIT)
306 bitmask <<= 1;
307 else
308 {
309 bitP += 1;
310 *bitP = 0x0;
311 bitmask = 1;
312 }
313
314 if (!tuple->bt_columns[keyno].bv_allnulls)
315 continue;
316
317 *bitP |= bitmask;
318 }
319 /* hasnulls bits follow */
320 for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++)
321 {
322 if (bitmask != HIGHBIT)
323 bitmask <<= 1;
324 else
325 {
326 bitP += 1;
327 *bitP = 0x0;
328 bitmask = 1;
329 }
330
331 if (!tuple->bt_columns[keyno].bv_hasnulls)
332 continue;
333
334 *bitP |= bitmask;
335 }
336 bitP = ((bits8 *) (rettuple + SizeOfBrinTuple)) - 1;
337 }
338
339 if (tuple->bt_placeholder)
340 rettuple->bt_info |= BRIN_PLACEHOLDER_MASK;
341
342 *size = len;
343 return rettuple;
344 }
345
346 /*
347 * Generate a new on-disk tuple with no data values, marked as placeholder.
348 *
349 * This is a cut-down version of brin_form_tuple.
350 */
351 BrinTuple *
brin_form_placeholder_tuple(BrinDesc * brdesc,BlockNumber blkno,Size * size)352 brin_form_placeholder_tuple(BrinDesc *brdesc, BlockNumber blkno, Size *size)
353 {
354 Size len;
355 Size hoff;
356 BrinTuple *rettuple;
357 int keyno;
358 bits8 *bitP;
359 int bitmask;
360
361 /* compute total space needed: always add nulls */
362 len = SizeOfBrinTuple;
363 len += BITMAPLEN(brdesc->bd_tupdesc->natts * 2);
364 len = hoff = MAXALIGN(len);
365
366 rettuple = palloc0(len);
367 rettuple->bt_blkno = blkno;
368 rettuple->bt_info = hoff;
369 rettuple->bt_info |= BRIN_NULLS_MASK | BRIN_PLACEHOLDER_MASK;
370
371 bitP = ((bits8 *) ((char *) rettuple + SizeOfBrinTuple)) - 1;
372 bitmask = HIGHBIT;
373 /* set allnulls true for all attributes */
374 for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++)
375 {
376 if (bitmask != HIGHBIT)
377 bitmask <<= 1;
378 else
379 {
380 bitP += 1;
381 *bitP = 0x0;
382 bitmask = 1;
383 }
384
385 *bitP |= bitmask;
386 }
387 /* no need to set hasnulls */
388
389 *size = len;
390 return rettuple;
391 }
392
393 /*
394 * Free a tuple created by brin_form_tuple
395 */
396 void
brin_free_tuple(BrinTuple * tuple)397 brin_free_tuple(BrinTuple *tuple)
398 {
399 pfree(tuple);
400 }
401
402 /*
403 * Given a brin tuple of size len, create a copy of it. If 'dest' is not
404 * NULL, its size is destsz, and can be used as output buffer; if the tuple
405 * to be copied does not fit, it is enlarged by repalloc, and the size is
406 * updated to match. This avoids palloc/free cycles when many brin tuples
407 * are being processed in loops.
408 */
409 BrinTuple *
brin_copy_tuple(BrinTuple * tuple,Size len,BrinTuple * dest,Size * destsz)410 brin_copy_tuple(BrinTuple *tuple, Size len, BrinTuple *dest, Size *destsz)
411 {
412 if (!destsz || *destsz == 0)
413 dest = palloc(len);
414 else if (len > *destsz)
415 {
416 dest = repalloc(dest, len);
417 *destsz = len;
418 }
419
420 memcpy(dest, tuple, len);
421
422 return dest;
423 }
424
425 /*
426 * Return whether two BrinTuples are bitwise identical.
427 */
428 bool
brin_tuples_equal(const BrinTuple * a,Size alen,const BrinTuple * b,Size blen)429 brin_tuples_equal(const BrinTuple *a, Size alen, const BrinTuple *b, Size blen)
430 {
431 if (alen != blen)
432 return false;
433 if (memcmp(a, b, alen) != 0)
434 return false;
435 return true;
436 }
437
438 /*
439 * Create a new BrinMemTuple from scratch, and initialize it to an empty
440 * state.
441 *
442 * Note: we don't provide any means to free a deformed tuple, so make sure to
443 * use a temporary memory context.
444 */
445 BrinMemTuple *
brin_new_memtuple(BrinDesc * brdesc)446 brin_new_memtuple(BrinDesc *brdesc)
447 {
448 BrinMemTuple *dtup;
449 long basesize;
450
451 basesize = MAXALIGN(sizeof(BrinMemTuple) +
452 sizeof(BrinValues) * brdesc->bd_tupdesc->natts);
453 dtup = palloc0(basesize + sizeof(Datum) * brdesc->bd_totalstored);
454
455 dtup->bt_values = palloc(sizeof(Datum) * brdesc->bd_totalstored);
456 dtup->bt_allnulls = palloc(sizeof(bool) * brdesc->bd_tupdesc->natts);
457 dtup->bt_hasnulls = palloc(sizeof(bool) * brdesc->bd_tupdesc->natts);
458
459 dtup->bt_context = AllocSetContextCreate(CurrentMemoryContext,
460 "brin dtuple",
461 ALLOCSET_DEFAULT_SIZES);
462
463 brin_memtuple_initialize(dtup, brdesc);
464
465 return dtup;
466 }
467
468 /*
469 * Reset a BrinMemTuple to initial state. We return the same tuple, for
470 * notational convenience.
471 */
472 BrinMemTuple *
brin_memtuple_initialize(BrinMemTuple * dtuple,BrinDesc * brdesc)473 brin_memtuple_initialize(BrinMemTuple *dtuple, BrinDesc *brdesc)
474 {
475 int i;
476 char *currdatum;
477
478 MemoryContextReset(dtuple->bt_context);
479
480 currdatum = (char *) dtuple +
481 MAXALIGN(sizeof(BrinMemTuple) +
482 sizeof(BrinValues) * brdesc->bd_tupdesc->natts);
483 for (i = 0; i < brdesc->bd_tupdesc->natts; i++)
484 {
485 dtuple->bt_columns[i].bv_attno = i + 1;
486 dtuple->bt_columns[i].bv_allnulls = true;
487 dtuple->bt_columns[i].bv_hasnulls = false;
488 dtuple->bt_columns[i].bv_values = (Datum *) currdatum;
489 currdatum += sizeof(Datum) * brdesc->bd_info[i]->oi_nstored;
490 }
491
492 return dtuple;
493 }
494
495 /*
496 * Convert a BrinTuple back to a BrinMemTuple. This is the reverse of
497 * brin_form_tuple.
498 *
499 * As an optimization, the caller can pass a previously allocated 'dMemtuple'.
500 * This avoids having to allocate it here, which can be useful when this
501 * function is called many times in a loop. It is caller's responsibility
502 * that the given BrinMemTuple matches what we need here.
503 *
504 * Note we don't need the "on disk tupdesc" here; we rely on our own routine to
505 * deconstruct the tuple from the on-disk format.
506 */
507 BrinMemTuple *
brin_deform_tuple(BrinDesc * brdesc,BrinTuple * tuple,BrinMemTuple * dMemtuple)508 brin_deform_tuple(BrinDesc *brdesc, BrinTuple *tuple, BrinMemTuple *dMemtuple)
509 {
510 BrinMemTuple *dtup;
511 Datum *values;
512 bool *allnulls;
513 bool *hasnulls;
514 char *tp;
515 bits8 *nullbits;
516 int keyno;
517 int valueno;
518 MemoryContext oldcxt;
519
520 dtup = dMemtuple ? brin_memtuple_initialize(dMemtuple, brdesc) :
521 brin_new_memtuple(brdesc);
522
523 if (BrinTupleIsPlaceholder(tuple))
524 dtup->bt_placeholder = true;
525 dtup->bt_blkno = tuple->bt_blkno;
526
527 values = dtup->bt_values;
528 allnulls = dtup->bt_allnulls;
529 hasnulls = dtup->bt_hasnulls;
530
531 tp = (char *) tuple + BrinTupleDataOffset(tuple);
532
533 if (BrinTupleHasNulls(tuple))
534 nullbits = (bits8 *) ((char *) tuple + SizeOfBrinTuple);
535 else
536 nullbits = NULL;
537 brin_deconstruct_tuple(brdesc,
538 tp, nullbits, BrinTupleHasNulls(tuple),
539 values, allnulls, hasnulls);
540
541 /*
542 * Iterate to assign each of the values to the corresponding item in the
543 * values array of each column. The copies occur in the tuple's context.
544 */
545 oldcxt = MemoryContextSwitchTo(dtup->bt_context);
546 for (valueno = 0, keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++)
547 {
548 int i;
549
550 if (allnulls[keyno])
551 {
552 valueno += brdesc->bd_info[keyno]->oi_nstored;
553 continue;
554 }
555
556 /*
557 * We would like to skip datumCopy'ing the values datum in some cases,
558 * caller permitting ...
559 */
560 for (i = 0; i < brdesc->bd_info[keyno]->oi_nstored; i++)
561 dtup->bt_columns[keyno].bv_values[i] =
562 datumCopy(values[valueno++],
563 brdesc->bd_info[keyno]->oi_typcache[i]->typbyval,
564 brdesc->bd_info[keyno]->oi_typcache[i]->typlen);
565
566 dtup->bt_columns[keyno].bv_hasnulls = hasnulls[keyno];
567 dtup->bt_columns[keyno].bv_allnulls = false;
568 }
569
570 MemoryContextSwitchTo(oldcxt);
571
572 return dtup;
573 }
574
575 /*
576 * brin_deconstruct_tuple
577 * Guts of attribute extraction from an on-disk BRIN tuple.
578 *
579 * Its arguments are:
580 * brdesc BRIN descriptor for the stored tuple
581 * tp pointer to the tuple data area
582 * nullbits pointer to the tuple nulls bitmask
583 * nulls "has nulls" bit in tuple infomask
584 * values output values, array of size brdesc->bd_totalstored
585 * allnulls output "allnulls", size brdesc->bd_tupdesc->natts
586 * hasnulls output "hasnulls", size brdesc->bd_tupdesc->natts
587 *
588 * Output arrays must have been allocated by caller.
589 */
590 static inline void
brin_deconstruct_tuple(BrinDesc * brdesc,char * tp,bits8 * nullbits,bool nulls,Datum * values,bool * allnulls,bool * hasnulls)591 brin_deconstruct_tuple(BrinDesc *brdesc,
592 char *tp, bits8 *nullbits, bool nulls,
593 Datum *values, bool *allnulls, bool *hasnulls)
594 {
595 int attnum;
596 int stored;
597 TupleDesc diskdsc;
598 long off;
599
600 /*
601 * First iterate to natts to obtain both null flags for each attribute.
602 * Note that we reverse the sense of the att_isnull test, because we store
603 * 1 for a null value (rather than a 1 for a not null value as is the
604 * att_isnull convention used elsewhere.) See brin_form_tuple.
605 */
606 for (attnum = 0; attnum < brdesc->bd_tupdesc->natts; attnum++)
607 {
608 /*
609 * the "all nulls" bit means that all values in the page range for
610 * this column are nulls. Therefore there are no values in the tuple
611 * data area.
612 */
613 allnulls[attnum] = nulls && !att_isnull(attnum, nullbits);
614
615 /*
616 * the "has nulls" bit means that some tuples have nulls, but others
617 * have not-null values. Therefore we know the tuple contains data
618 * for this column.
619 *
620 * The hasnulls bits follow the allnulls bits in the same bitmask.
621 */
622 hasnulls[attnum] =
623 nulls && !att_isnull(brdesc->bd_tupdesc->natts + attnum, nullbits);
624 }
625
626 /*
627 * Iterate to obtain each attribute's stored values. Note that since we
628 * may reuse attribute entries for more than one column, we cannot cache
629 * offsets here.
630 */
631 diskdsc = brtuple_disk_tupdesc(brdesc);
632 stored = 0;
633 off = 0;
634 for (attnum = 0; attnum < brdesc->bd_tupdesc->natts; attnum++)
635 {
636 int datumno;
637
638 if (allnulls[attnum])
639 {
640 stored += brdesc->bd_info[attnum]->oi_nstored;
641 continue;
642 }
643
644 for (datumno = 0;
645 datumno < brdesc->bd_info[attnum]->oi_nstored;
646 datumno++)
647 {
648 Form_pg_attribute thisatt = TupleDescAttr(diskdsc, stored);
649
650 if (thisatt->attlen == -1)
651 {
652 off = att_align_pointer(off, thisatt->attalign, -1,
653 tp + off);
654 }
655 else
656 {
657 /* not varlena, so safe to use att_align_nominal */
658 off = att_align_nominal(off, thisatt->attalign);
659 }
660
661 values[stored++] = fetchatt(thisatt, tp + off);
662
663 off = att_addlength_pointer(off, thisatt->attlen, tp + off);
664 }
665 }
666 }
667