1 /*------------------------------------------------------------------------- 2 * 3 * toast_internals.c 4 * Functions for internal use by the TOAST system. 5 * 6 * Copyright (c) 2000-2021, PostgreSQL Global Development Group 7 * 8 * IDENTIFICATION 9 * src/backend/access/common/toast_internals.c 10 * 11 *------------------------------------------------------------------------- 12 */ 13 14 #include "postgres.h" 15 16 #include "access/detoast.h" 17 #include "access/genam.h" 18 #include "access/heapam.h" 19 #include "access/heaptoast.h" 20 #include "access/table.h" 21 #include "access/toast_internals.h" 22 #include "access/xact.h" 23 #include "catalog/catalog.h" 24 #include "common/pg_lzcompress.h" 25 #include "miscadmin.h" 26 #include "utils/fmgroids.h" 27 #include "utils/rel.h" 28 #include "utils/snapmgr.h" 29 30 static bool toastrel_valueid_exists(Relation toastrel, Oid valueid); 31 static bool toastid_valueid_exists(Oid toastrelid, Oid valueid); 32 33 /* ---------- 34 * toast_compress_datum - 35 * 36 * Create a compressed version of a varlena datum 37 * 38 * If we fail (ie, compressed result is actually bigger than original) 39 * then return NULL. We must not use compressed data if it'd expand 40 * the tuple! 41 * 42 * We use VAR{SIZE,DATA}_ANY so we can handle short varlenas here without 43 * copying them. But we can't handle external or compressed datums. 44 * ---------- 45 */ 46 Datum 47 toast_compress_datum(Datum value, char cmethod) 48 { 49 struct varlena *tmp = NULL; 50 int32 valsize; 51 ToastCompressionId cmid = TOAST_INVALID_COMPRESSION_ID; 52 53 Assert(!VARATT_IS_EXTERNAL(DatumGetPointer(value))); 54 Assert(!VARATT_IS_COMPRESSED(DatumGetPointer(value))); 55 56 valsize = VARSIZE_ANY_EXHDR(DatumGetPointer(value)); 57 58 /* If the compression method is not valid, use the current default */ 59 if (!CompressionMethodIsValid(cmethod)) 60 cmethod = default_toast_compression; 61 62 /* 63 * Call appropriate compression routine for the compression method. 64 */ 65 switch (cmethod) 66 { 67 case TOAST_PGLZ_COMPRESSION: 68 tmp = pglz_compress_datum((const struct varlena *) value); 69 cmid = TOAST_PGLZ_COMPRESSION_ID; 70 break; 71 case TOAST_LZ4_COMPRESSION: 72 tmp = lz4_compress_datum((const struct varlena *) value); 73 cmid = TOAST_LZ4_COMPRESSION_ID; 74 break; 75 default: 76 elog(ERROR, "invalid compression method %c", cmethod); 77 } 78 79 if (tmp == NULL) 80 return PointerGetDatum(NULL); 81 82 /* 83 * We recheck the actual size even if compression reports success, because 84 * it might be satisfied with having saved as little as one byte in the 85 * compressed data --- which could turn into a net loss once you consider 86 * header and alignment padding. Worst case, the compressed format might 87 * require three padding bytes (plus header, which is included in 88 * VARSIZE(tmp)), whereas the uncompressed format would take only one 89 * header byte and no padding if the value is short enough. So we insist 90 * on a savings of more than 2 bytes to ensure we have a gain. 91 */ 92 if (VARSIZE(tmp) < valsize - 2) 93 { 94 /* successful compression */ 95 Assert(cmid != TOAST_INVALID_COMPRESSION_ID); 96 TOAST_COMPRESS_SET_SIZE_AND_COMPRESS_METHOD(tmp, valsize, cmid); 97 return PointerGetDatum(tmp); 98 } 99 else 100 { 101 /* incompressible data */ 102 pfree(tmp); 103 return PointerGetDatum(NULL); 104 } 105 } 106 107 /* ---------- 108 * toast_save_datum - 109 * 110 * Save one single datum into the secondary relation and return 111 * a Datum reference for it. 112 * 113 * rel: the main relation we're working with (not the toast rel!) 114 * value: datum to be pushed to toast storage 115 * oldexternal: if not NULL, toast pointer previously representing the datum 116 * options: options to be passed to heap_insert() for toast rows 117 * ---------- 118 */ 119 Datum 120 toast_save_datum(Relation rel, Datum value, 121 struct varlena *oldexternal, int options) 122 { 123 Relation toastrel; 124 Relation *toastidxs; 125 HeapTuple toasttup; SyncScanShmemSize(void)126 TupleDesc toasttupDesc; 127 Datum t_values[3]; 128 bool t_isnull[3]; 129 CommandId mycid = GetCurrentCommandId(true); 130 struct varlena *result; 131 struct varatt_external toast_pointer; 132 union 133 { 134 struct varlena hdr; SyncScanShmemInit(void)135 /* this is to make the union big enough for a chunk: */ 136 char data[TOAST_MAX_CHUNK_SIZE + VARHDRSZ]; 137 /* ensure union is aligned well enough: */ 138 int32 align_it; 139 } chunk_data; 140 int32 chunk_size; 141 int32 chunk_seq = 0; 142 char *data_p; 143 int32 data_todo; 144 Pointer dval = DatumGetPointer(value); 145 int num_indexes; 146 int validIndex; 147 148 Assert(!VARATT_IS_EXTERNAL(value)); 149 150 /* 151 * Open the toast relation and its indexes. We can use the index to check 152 * uniqueness of the OID we assign to the toasted item, even though it has 153 * additional columns besides OID. 154 */ 155 toastrel = table_open(rel->rd_rel->reltoastrelid, RowExclusiveLock); 156 toasttupDesc = toastrel->rd_att; 157 158 /* Open all the toast indexes and look for the valid one */ 159 validIndex = toast_open_indexes(toastrel, 160 RowExclusiveLock, 161 &toastidxs, 162 &num_indexes); 163 164 /* 165 * Get the data pointer and length, and compute va_rawsize and va_extinfo. 166 * 167 * va_rawsize is the size of the equivalent fully uncompressed datum, so 168 * we have to adjust for short headers. 169 * 170 * va_extinfo stored the actual size of the data payload in the toast 171 * records and the compression method in first 2 bits if data is 172 * compressed. 173 */ 174 if (VARATT_IS_SHORT(dval)) 175 { 176 data_p = VARDATA_SHORT(dval); 177 data_todo = VARSIZE_SHORT(dval) - VARHDRSZ_SHORT; 178 toast_pointer.va_rawsize = data_todo + VARHDRSZ; /* as if not short */ 179 toast_pointer.va_extinfo = data_todo; 180 } 181 else if (VARATT_IS_COMPRESSED(dval)) 182 { 183 data_p = VARDATA(dval); 184 data_todo = VARSIZE(dval) - VARHDRSZ; 185 /* rawsize in a compressed datum is just the size of the payload */ 186 toast_pointer.va_rawsize = VARDATA_COMPRESSED_GET_EXTSIZE(dval) + VARHDRSZ; 187 188 /* set external size and compression method */ 189 VARATT_EXTERNAL_SET_SIZE_AND_COMPRESS_METHOD(toast_pointer, data_todo, 190 VARDATA_COMPRESSED_GET_COMPRESS_METHOD(dval)); ss_search(RelFileNode relfilenode,BlockNumber location,bool set)191 /* Assert that the numbers look like it's compressed */ 192 Assert(VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer)); 193 } 194 else 195 { 196 data_p = VARDATA(dval); 197 data_todo = VARSIZE(dval) - VARHDRSZ; 198 toast_pointer.va_rawsize = VARSIZE(dval); 199 toast_pointer.va_extinfo = data_todo; 200 } 201 202 /* 203 * Insert the correct table OID into the result TOAST pointer. 204 * 205 * Normally this is the actual OID of the target toast table, but during 206 * table-rewriting operations such as CLUSTER, we have to insert the OID 207 * of the table's real permanent toast table instead. rd_toastoid is set 208 * if we have to substitute such an OID. 209 */ 210 if (OidIsValid(rel->rd_toastoid)) 211 toast_pointer.va_toastrelid = rel->rd_toastoid; 212 else 213 toast_pointer.va_toastrelid = RelationGetRelid(toastrel); 214 215 /* 216 * Choose an OID to use as the value ID for this toast value. 217 * 218 * Normally we just choose an unused OID within the toast table. But 219 * during table-rewriting operations where we are preserving an existing 220 * toast table OID, we want to preserve toast value OIDs too. So, if 221 * rd_toastoid is set and we had a prior external value from that same 222 * toast table, re-use its value ID. If we didn't have a prior external 223 * value (which is a corner case, but possible if the table's attstorage 224 * options have been changed), we have to pick a value ID that doesn't 225 * conflict with either new or existing toast value OIDs. 226 */ 227 if (!OidIsValid(rel->rd_toastoid)) 228 { 229 /* normal case: just choose an unused OID */ 230 toast_pointer.va_valueid = 231 GetNewOidWithIndex(toastrel, 232 RelationGetRelid(toastidxs[validIndex]), 233 (AttrNumber) 1); 234 } 235 else 236 { 237 /* rewrite case: check to see if value was in old toast table */ 238 toast_pointer.va_valueid = InvalidOid; 239 if (oldexternal != NULL) 240 { 241 struct varatt_external old_toast_pointer; 242 243 Assert(VARATT_IS_EXTERNAL_ONDISK(oldexternal)); 244 /* Must copy to access aligned fields */ 245 VARATT_EXTERNAL_GET_POINTER(old_toast_pointer, oldexternal); 246 if (old_toast_pointer.va_toastrelid == rel->rd_toastoid) 247 { 248 /* This value came from the old toast table; reuse its OID */ 249 toast_pointer.va_valueid = old_toast_pointer.va_valueid; 250 251 /* 252 * There is a corner case here: the table rewrite might have ss_get_location(Relation rel,BlockNumber relnblocks)253 * to copy both live and recently-dead versions of a row, and 254 * those versions could easily reference the same toast value. 255 * When we copy the second or later version of such a row, 256 * reusing the OID will mean we select an OID that's already 257 * in the new toast table. Check for that, and if so, just 258 * fall through without writing the data again. 259 * 260 * While annoying and ugly-looking, this is a good thing 261 * because it ensures that we wind up with only one copy of 262 * the toast value when there is only one copy in the old 263 * toast table. Before we detected this case, we'd have made 264 * multiple copies, wasting space; and what's worse, the 265 * copies belonging to already-deleted heap tuples would not 266 * be reclaimed by VACUUM. 267 */ 268 if (toastrel_valueid_exists(toastrel, 269 toast_pointer.va_valueid)) 270 { 271 /* Match, so short-circuit the data storage loop below */ 272 data_todo = 0; 273 } 274 } 275 } 276 if (toast_pointer.va_valueid == InvalidOid) 277 { 278 /* 279 * new value; must choose an OID that doesn't conflict in either 280 * old or new toast table 281 */ 282 do 283 { 284 toast_pointer.va_valueid = 285 GetNewOidWithIndex(toastrel, 286 RelationGetRelid(toastidxs[validIndex]), 287 (AttrNumber) 1); ss_report_location(Relation rel,BlockNumber location)288 } while (toastid_valueid_exists(rel->rd_toastoid, 289 toast_pointer.va_valueid)); 290 } 291 } 292 293 /* 294 * Initialize constant parts of the tuple data 295 */ 296 t_values[0] = ObjectIdGetDatum(toast_pointer.va_valueid); 297 t_values[2] = PointerGetDatum(&chunk_data); 298 t_isnull[0] = false; 299 t_isnull[1] = false; 300 t_isnull[2] = false; 301 302 /* 303 * Split up the item into chunks 304 */ 305 while (data_todo > 0) 306 { 307 int i; 308 309 CHECK_FOR_INTERRUPTS(); 310 311 /* 312 * Calculate the size of this chunk 313 */ 314 chunk_size = Min(TOAST_MAX_CHUNK_SIZE, data_todo); 315 316 /* 317 * Build a tuple and store it 318 */ 319 t_values[1] = Int32GetDatum(chunk_seq++); 320 SET_VARSIZE(&chunk_data, chunk_size + VARHDRSZ); 321 memcpy(VARDATA(&chunk_data), data_p, chunk_size); 322 toasttup = heap_form_tuple(toasttupDesc, t_values, t_isnull); 323 324 heap_insert(toastrel, toasttup, mycid, options, NULL); 325 326 /* 327 * Create the index entry. We cheat a little here by not using 328 * FormIndexDatum: this relies on the knowledge that the index columns 329 * are the same as the initial columns of the table for all the 330 * indexes. We also cheat by not providing an IndexInfo: this is okay 331 * for now because btree doesn't need one, but we might have to be 332 * more honest someday. 333 * 334 * Note also that there had better not be any user-created index on 335 * the TOAST table, since we don't bother to update anything else. 336 */ 337 for (i = 0; i < num_indexes; i++) 338 { 339 /* Only index relations marked as ready can be updated */ 340 if (toastidxs[i]->rd_index->indisready) 341 index_insert(toastidxs[i], t_values, t_isnull, 342 &(toasttup->t_self), 343 toastrel, 344 toastidxs[i]->rd_index->indisunique ? 345 UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, 346 false, NULL); 347 } 348 349 /* 350 * Free memory 351 */ 352 heap_freetuple(toasttup); 353 354 /* 355 * Move on to next chunk 356 */ 357 data_todo -= chunk_size; 358 data_p += chunk_size; 359 } 360 361 /* 362 * Done - close toast relation and its indexes 363 */ 364 toast_close_indexes(toastidxs, num_indexes, RowExclusiveLock); 365 table_close(toastrel, RowExclusiveLock); 366 367 /* 368 * Create the TOAST pointer value that we'll return 369 */ 370 result = (struct varlena *) palloc(TOAST_POINTER_SIZE); 371 SET_VARTAG_EXTERNAL(result, VARTAG_ONDISK); 372 memcpy(VARDATA_EXTERNAL(result), &toast_pointer, sizeof(toast_pointer)); 373 374 return PointerGetDatum(result); 375 } 376 377 /* ---------- 378 * toast_delete_datum - 379 * 380 * Delete a single external stored value. 381 * ---------- 382 */ 383 void 384 toast_delete_datum(Relation rel, Datum value, bool is_speculative) 385 { 386 struct varlena *attr = (struct varlena *) DatumGetPointer(value); 387 struct varatt_external toast_pointer; 388 Relation toastrel; 389 Relation *toastidxs; 390 ScanKeyData toastkey; 391 SysScanDesc toastscan; 392 HeapTuple toasttup; 393 int num_indexes; 394 int validIndex; 395 SnapshotData SnapshotToast; 396 397 if (!VARATT_IS_EXTERNAL_ONDISK(attr)) 398 return; 399 400 /* Must copy to access aligned fields */ 401 VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); 402 403 /* 404 * Open the toast relation and its indexes 405 */ 406 toastrel = table_open(toast_pointer.va_toastrelid, RowExclusiveLock); 407 408 /* Fetch valid relation used for process */ 409 validIndex = toast_open_indexes(toastrel, 410 RowExclusiveLock, 411 &toastidxs, 412 &num_indexes); 413 414 /* 415 * Setup a scan key to find chunks with matching va_valueid 416 */ 417 ScanKeyInit(&toastkey, 418 (AttrNumber) 1, 419 BTEqualStrategyNumber, F_OIDEQ, 420 ObjectIdGetDatum(toast_pointer.va_valueid)); 421 422 /* 423 * Find all the chunks. (We don't actually care whether we see them in 424 * sequence or not, but since we've already locked the index we might as 425 * well use systable_beginscan_ordered.) 426 */ 427 init_toast_snapshot(&SnapshotToast); 428 toastscan = systable_beginscan_ordered(toastrel, toastidxs[validIndex], 429 &SnapshotToast, 1, &toastkey); 430 while ((toasttup = systable_getnext_ordered(toastscan, ForwardScanDirection)) != NULL) 431 { 432 /* 433 * Have a chunk, delete it 434 */ 435 if (is_speculative) 436 heap_abort_speculative(toastrel, &toasttup->t_self); 437 else 438 simple_heap_delete(toastrel, &toasttup->t_self); 439 } 440 441 /* 442 * End scan and close relations 443 */ 444 systable_endscan_ordered(toastscan); 445 toast_close_indexes(toastidxs, num_indexes, RowExclusiveLock); 446 table_close(toastrel, RowExclusiveLock); 447 } 448 449 /* ---------- 450 * toastrel_valueid_exists - 451 * 452 * Test whether a toast value with the given ID exists in the toast relation. 453 * For safety, we consider a value to exist if there are either live or dead 454 * toast rows with that ID; see notes for GetNewOidWithIndex(). 455 * ---------- 456 */ 457 static bool 458 toastrel_valueid_exists(Relation toastrel, Oid valueid) 459 { 460 bool result = false; 461 ScanKeyData toastkey; 462 SysScanDesc toastscan; 463 int num_indexes; 464 int validIndex; 465 Relation *toastidxs; 466 467 /* Fetch a valid index relation */ 468 validIndex = toast_open_indexes(toastrel, 469 RowExclusiveLock, 470 &toastidxs, 471 &num_indexes); 472 473 /* 474 * Setup a scan key to find chunks with matching va_valueid 475 */ 476 ScanKeyInit(&toastkey, 477 (AttrNumber) 1, 478 BTEqualStrategyNumber, F_OIDEQ, 479 ObjectIdGetDatum(valueid)); 480 481 /* 482 * Is there any such chunk? 483 */ 484 toastscan = systable_beginscan(toastrel, 485 RelationGetRelid(toastidxs[validIndex]), 486 true, SnapshotAny, 1, &toastkey); 487 488 if (systable_getnext(toastscan) != NULL) 489 result = true; 490 491 systable_endscan(toastscan); 492 493 /* Clean up */ 494 toast_close_indexes(toastidxs, num_indexes, RowExclusiveLock); 495 496 return result; 497 } 498 499 /* ---------- 500 * toastid_valueid_exists - 501 * 502 * As above, but work from toast rel's OID not an open relation 503 * ---------- 504 */ 505 static bool 506 toastid_valueid_exists(Oid toastrelid, Oid valueid) 507 { 508 bool result; 509 Relation toastrel; 510 511 toastrel = table_open(toastrelid, AccessShareLock); 512 513 result = toastrel_valueid_exists(toastrel, valueid); 514 515 table_close(toastrel, AccessShareLock); 516 517 return result; 518 } 519 520 /* ---------- 521 * toast_get_valid_index 522 * 523 * Get OID of valid index associated to given toast relation. A toast 524 * relation can have only one valid index at the same time. 525 */ 526 Oid 527 toast_get_valid_index(Oid toastoid, LOCKMODE lock) 528 { 529 int num_indexes; 530 int validIndex; 531 Oid validIndexOid; 532 Relation *toastidxs; 533 Relation toastrel; 534 535 /* Open the toast relation */ 536 toastrel = table_open(toastoid, lock); 537 538 /* Look for the valid index of the toast relation */ 539 validIndex = toast_open_indexes(toastrel, 540 lock, 541 &toastidxs, 542 &num_indexes); 543 validIndexOid = RelationGetRelid(toastidxs[validIndex]); 544 545 /* Close the toast relation and all its indexes */ 546 toast_close_indexes(toastidxs, num_indexes, NoLock); 547 table_close(toastrel, NoLock); 548 549 return validIndexOid; 550 } 551 552 /* ---------- 553 * toast_open_indexes 554 * 555 * Get an array of the indexes associated to the given toast relation 556 * and return as well the position of the valid index used by the toast 557 * relation in this array. It is the responsibility of the caller of this 558 * function to close the indexes as well as free them. 559 */ 560 int 561 toast_open_indexes(Relation toastrel, 562 LOCKMODE lock, 563 Relation **toastidxs, 564 int *num_indexes) 565 { 566 int i = 0; 567 int res = 0; 568 bool found = false; 569 List *indexlist; 570 ListCell *lc; 571 572 /* Get index list of the toast relation */ 573 indexlist = RelationGetIndexList(toastrel); 574 Assert(indexlist != NIL); 575 576 *num_indexes = list_length(indexlist); 577 578 /* Open all the index relations */ 579 *toastidxs = (Relation *) palloc(*num_indexes * sizeof(Relation)); 580 foreach(lc, indexlist) 581 (*toastidxs)[i++] = index_open(lfirst_oid(lc), lock); 582 583 /* Fetch the first valid index in list */ 584 for (i = 0; i < *num_indexes; i++) 585 { 586 Relation toastidx = (*toastidxs)[i]; 587 588 if (toastidx->rd_index->indisvalid) 589 { 590 res = i; 591 found = true; 592 break; 593 } 594 } 595 596 /* 597 * Free index list, not necessary anymore as relations are opened and a 598 * valid index has been found. 599 */ 600 list_free(indexlist); 601 602 /* 603 * The toast relation should have one valid index, so something is going 604 * wrong if there is nothing. 605 */ 606 if (!found) 607 elog(ERROR, "no valid index found for toast relation with Oid %u", 608 RelationGetRelid(toastrel)); 609 610 return res; 611 } 612 613 /* ---------- 614 * toast_close_indexes 615 * 616 * Close an array of indexes for a toast relation and free it. This should 617 * be called for a set of indexes opened previously with toast_open_indexes. 618 */ 619 void 620 toast_close_indexes(Relation *toastidxs, int num_indexes, LOCKMODE lock) 621 { 622 int i; 623 624 /* Close relations and clean up things */ 625 for (i = 0; i < num_indexes; i++) 626 index_close(toastidxs[i], lock); 627 pfree(toastidxs); 628 } 629 630 /* ---------- 631 * init_toast_snapshot 632 * 633 * Initialize an appropriate TOAST snapshot. We must use an MVCC snapshot 634 * to initialize the TOAST snapshot; since we don't know which one to use, 635 * just use the oldest one. This is safe: at worst, we will get a "snapshot 636 * too old" error that might have been avoided otherwise. 637 */ 638 void 639 init_toast_snapshot(Snapshot toast_snapshot) 640 { 641 Snapshot snapshot = GetOldestSnapshot(); 642 643 /* 644 * GetOldestSnapshot returns NULL if the session has no active snapshots. 645 * We can get that if, for example, a procedure fetches a toasted value 646 * into a local variable, commits, and then tries to detoast the value. 647 * Such coding is unsafe, because once we commit there is nothing to 648 * prevent the toast data from being deleted. Detoasting *must* happen in 649 * the same transaction that originally fetched the toast pointer. Hence, 650 * rather than trying to band-aid over the problem, throw an error. (This 651 * is not very much protection, because in many scenarios the procedure 652 * would have already created a new transaction snapshot, preventing us 653 * from detecting the problem. But it's better than nothing, and for sure 654 * we shouldn't expend code on masking the problem more.) 655 */ 656 if (snapshot == NULL) 657 elog(ERROR, "cannot fetch toast data without an active snapshot"); 658 659 InitToastSnapshot(*toast_snapshot, snapshot->lsn, snapshot->whenTaken); 660 } 661