/*-------------------------------------------------------------------------
 *
 * hashinsert.c
 *	  Item insertion in hash tables for Postgres.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/hash/hashinsert.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/hash.h"
#include "access/hash_xlog.h"
#include "miscadmin.h"
#include "utils/rel.h"
#include "storage/lwlock.h"
#include "storage/buf_internals.h"
#include "storage/predicate.h"

static void _hash_vacuum_one_page(Relation rel, Relation hrel,
								  Buffer metabuf, Buffer buf);

/*
 * _hash_doinsert() -- Handle insertion of a single index tuple.
 *
 *		This routine is called by the public interface routines, hashbuild
 *		and hashinsert.  By here, itup is completely filled in.
 *
 *		'heapRel' is the heap relation the index belongs to; it is only
 *		consulted when a full page forces us to reclaim LP_DEAD items via
 *		_hash_vacuum_one_page().
 */
void
_hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel)
{
	Buffer		buf = InvalidBuffer;
	Buffer		bucket_buf;
	Buffer		metabuf;
	HashMetaPage metap;
	HashMetaPage usedmetap = NULL;
	Page		metapage;
	Page		page;
	HashPageOpaque pageopaque;
	Size		itemsz;
	bool		do_expand;
	uint32		hashkey;
	Bucket		bucket;
	OffsetNumber itup_off;

	/*
	 * Get the hash key for the item (it's stored in the index tuple itself).
	 */
	hashkey = _hash_get_indextuple_hashkey(itup);

	/* compute item size too */
	itemsz = IndexTupleSize(itup);
	itemsz = MAXALIGN(itemsz);	/* be safe, PageAddItem will do this but we
								 * need to be consistent */

	/*
	 * We come back here (dropping all pins first) if we had to complete an
	 * in-progress bucket split before inserting.
	 */
restart_insert:

	/*
	 * Read the metapage.  We don't lock it yet; HashMaxItemSize() will
	 * examine pd_pagesize_version, but that can't change so we can examine it
	 * without a lock.
	 */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
	metapage = BufferGetPage(metabuf);

	/*
	 * Check whether the item can fit on a hash page at all. (Eventually, we
	 * ought to try to apply TOAST methods if not.)  Note that at this point,
	 * itemsz doesn't include the ItemId.
	 *
	 * XXX this is useless code if we are only storing hash keys.
	 */
	if (itemsz > HashMaxItemSize(metapage))
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("index row size %zu exceeds hash maximum %zu",
						itemsz, HashMaxItemSize(metapage)),
				 errhint("Values larger than a buffer page cannot be indexed.")));

	/* Lock the primary bucket page for the target bucket. */
	buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_WRITE,
										  &usedmetap);
	Assert(usedmetap != NULL);

	/*
	 * Check for serialization conflicts (SSI) against the target bucket's
	 * primary page before modifying anything.
	 */
	CheckForSerializableConflictIn(rel, NULL, buf);

	/* remember the primary bucket buffer to release the pin on it at end. */
	bucket_buf = buf;

	page = BufferGetPage(buf);
	pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
	bucket = pageopaque->hasho_bucket;

	/*
	 * If this bucket is in the process of being split, try to finish the
	 * split before inserting, because that might create room for the
	 * insertion to proceed without allocating an additional overflow page.
	 * It's only interesting to finish the split if we're trying to insert
	 * into the bucket from which we're removing tuples (the "old" bucket),
	 * not if we're trying to insert into the bucket into which tuples are
	 * being moved (the "new" bucket).
	 */
	if (H_BUCKET_BEING_SPLIT(pageopaque) && IsBufferCleanupOK(buf))
	{
		/* release the lock on bucket buffer, before completing the split. */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);

		_hash_finish_split(rel, metabuf, buf, bucket,
						   usedmetap->hashm_maxbucket,
						   usedmetap->hashm_highmask,
						   usedmetap->hashm_lowmask);

		/* release the pin on old and meta buffer.  retry for insert. */
		_hash_dropbuf(rel, buf);
		_hash_dropbuf(rel, metabuf);
		goto restart_insert;
	}

	/*
	 * Do the insertion: walk the bucket chain until we find (or create) a
	 * page with enough free space for the new tuple.
	 */
	while (PageGetFreeSpace(page) < itemsz)
	{
		BlockNumber nextblkno;

		/*
		 * Check if current page has any DEAD tuples. If yes, delete these
		 * tuples and see if we can get a space for the new item to be
		 * inserted before moving to the next page in the bucket chain.
		 */
		if (H_HAS_DEAD_TUPLES(pageopaque))
		{
			/* cleanup lock required before removing dead items */
			if (IsBufferCleanupOK(buf))
			{
				_hash_vacuum_one_page(rel, heapRel, metabuf, buf);

				if (PageGetFreeSpace(page) >= itemsz)
					break;		/* OK, now we have enough space */
			}
		}

		/*
		 * no space on this page; check for an overflow page
		 */
		nextblkno = pageopaque->hasho_nextblkno;

		if (BlockNumberIsValid(nextblkno))
		{
			/*
			 * ovfl page exists; go get it.  if it doesn't have room, we'll
			 * find out next pass through the loop test above.  we always
			 * release both the lock and pin if this is an overflow page, but
			 * only the lock if this is the primary bucket page, since the pin
			 * on the primary bucket must be retained throughout the scan.
			 */
			if (buf != bucket_buf)
				_hash_relbuf(rel, buf);
			else
				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
			page = BufferGetPage(buf);
		}
		else
		{
			/*
			 * we're at the end of the bucket chain and we haven't found a
			 * page with enough room.  allocate a new overflow page.
			 */

			/* release our write lock without modifying buffer */
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);

			/* chain to a new overflow page; returns it write-locked */
			buf = _hash_addovflpage(rel, metabuf, buf, (buf == bucket_buf) ? true : false);
			page = BufferGetPage(buf);

			/* should fit now, given test above */
			Assert(PageGetFreeSpace(page) >= itemsz);
		}
		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
		Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_OVERFLOW_PAGE);
		Assert(pageopaque->hasho_bucket == bucket);
	}

	/*
	 * Write-lock the metapage so we can increment the tuple count. After
	 * incrementing it, check to see if it's time for a split.
	 */
	LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

	/* Do the update.  No ereport(ERROR) until changes are logged */
	START_CRIT_SECTION();

	/* found page with enough space, so add the item here */
	itup_off = _hash_pgaddtup(rel, buf, itemsz, itup);
	MarkBufferDirty(buf);

	/* metapage operations */
	metap = HashPageGetMeta(metapage);
	metap->hashm_ntuples += 1;

	/* Make sure this stays in sync with _hash_expandtable() */
	do_expand = metap->hashm_ntuples >
		(double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1);

	MarkBufferDirty(metabuf);

	/* XLOG stuff */
	if (RelationNeedsWAL(rel))
	{
		xl_hash_insert xlrec;
		XLogRecPtr	recptr;

		xlrec.offnum = itup_off;

		XLogBeginInsert();
		XLogRegisterData((char *) &xlrec, SizeOfHashInsert);

		/* buffer 1: metapage (tuple-count update) */
		XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);

		/* buffer 0: target page; the new tuple travels as its block data */
		XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
		XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));

		recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INSERT);

		PageSetLSN(BufferGetPage(buf), recptr);
		PageSetLSN(BufferGetPage(metabuf), recptr);
	}

	END_CRIT_SECTION();

	/* drop lock on metapage, but keep pin */
	LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

	/*
	 * Release the modified page and ensure to release the pin on primary
	 * page.
	 */
	_hash_relbuf(rel, buf);
	if (buf != bucket_buf)
		_hash_dropbuf(rel, bucket_buf);

	/* Attempt to split if a split is needed */
	if (do_expand)
		_hash_expandtable(rel, metabuf);

	/* Finally drop our pin on the metapage */
	_hash_dropbuf(rel, metabuf);
}

/*
 * _hash_pgaddtup() -- add a tuple to a particular page in the index.
 *
 * This routine adds the tuple to the page as requested; it does not write out
 * the page.  It is an error to call pgaddtup() without pin and write lock on
 * the target buffer.
 *
 * Returns the offset number at which the tuple was inserted.  This function
 * is responsible for preserving the condition that tuples in a hash index
 * page are sorted by hashkey value.
 */
OffsetNumber
_hash_pgaddtup(Relation rel, Buffer buf, Size itemsize, IndexTuple itup)
{
	OffsetNumber itup_off;
	Page		page;
	uint32		hashkey;

	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
	page = BufferGetPage(buf);

	/* Find where to insert the tuple (preserving page's hashkey ordering) */
	hashkey = _hash_get_indextuple_hashkey(itup);
	itup_off = _hash_binsearch(page, hashkey);

	if (PageAddItem(page, (Item) itup, itemsize, itup_off, false, false)
		== InvalidOffsetNumber)
		elog(ERROR, "failed to add index item to \"%s\"",
			 RelationGetRelationName(rel));

	return itup_off;
}

/*
 * _hash_pgaddmultitup() -- add a tuple vector to a particular page in the
 * index.
 *
 * This routine has same requirements for locking and tuple ordering as
 * _hash_pgaddtup().
 *
 * The offsets at which the tuples were inserted are returned via the
 * caller-supplied itup_offsets array.
297 */ 298 void 299 _hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups, 300 OffsetNumber *itup_offsets, uint16 nitups) 301 { 302 OffsetNumber itup_off; 303 Page page; 304 uint32 hashkey; 305 int i; 306 307 _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); 308 page = BufferGetPage(buf); 309 310 for (i = 0; i < nitups; i++) 311 { 312 Size itemsize; 313 314 itemsize = IndexTupleSize(itups[i]); 315 itemsize = MAXALIGN(itemsize); 316 317 /* Find where to insert the tuple (preserving page's hashkey ordering) */ 318 hashkey = _hash_get_indextuple_hashkey(itups[i]); 319 itup_off = _hash_binsearch(page, hashkey); 320 321 itup_offsets[i] = itup_off; 322 323 if (PageAddItem(page, (Item) itups[i], itemsize, itup_off, false, false) 324 == InvalidOffsetNumber) 325 elog(ERROR, "failed to add index item to \"%s\"", 326 RelationGetRelationName(rel)); 327 } 328 } 329 330 /* 331 * _hash_vacuum_one_page - vacuum just one index page. 332 * 333 * Try to remove LP_DEAD items from the given page. We must acquire cleanup 334 * lock on the page being modified before calling this function. 
 */

static void
_hash_vacuum_one_page(Relation rel, Relation hrel, Buffer metabuf, Buffer buf)
{
	OffsetNumber deletable[MaxOffsetNumber];
	int			ndeletable = 0;
	OffsetNumber offnum,
				maxoff;
	Page		page = BufferGetPage(buf);
	HashPageOpaque pageopaque;
	HashMetaPage metap;

	/* Scan each tuple in page to see if it is marked as LP_DEAD */
	maxoff = PageGetMaxOffsetNumber(page);
	for (offnum = FirstOffsetNumber;
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		ItemId		itemId = PageGetItemId(page, offnum);

		if (ItemIdIsDead(itemId))
			deletable[ndeletable++] = offnum;
	}

	if (ndeletable > 0)
	{
		TransactionId latestRemovedXid;

		/*
		 * Compute the xid horizon for the dead tuples before entering the
		 * critical section below, since this call may need to visit the heap
		 * (hrel) and so can't be done once no errors are allowed.
		 */
		latestRemovedXid =
			index_compute_xid_horizon_for_tuples(rel, hrel, buf,
												 deletable, ndeletable);

		/*
		 * Write-lock the meta page so that we can decrement tuple count.
		 */
		LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

		/* No ereport(ERROR) until changes are logged */
		START_CRIT_SECTION();

		PageIndexMultiDelete(page, deletable, ndeletable);

		/*
		 * Mark the page as not containing any LP_DEAD items. This is not
		 * certainly true (there might be some that have recently been marked,
		 * but weren't included in our target-item list), but it will almost
		 * always be true and it doesn't seem worth an additional page scan to
		 * check it. Remember that LH_PAGE_HAS_DEAD_TUPLES is only a hint
		 * anyway.
		 */
		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
		pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;

		/* keep the shared tuple count in step with the deletions */
		metap = HashPageGetMeta(BufferGetPage(metabuf));
		metap->hashm_ntuples -= ndeletable;

		MarkBufferDirty(buf);
		MarkBufferDirty(metabuf);

		/* XLOG stuff */
		if (RelationNeedsWAL(rel))
		{
			xl_hash_vacuum_one_page xlrec;
			XLogRecPtr	recptr;

			xlrec.latestRemovedXid = latestRemovedXid;
			xlrec.ntuples = ndeletable;

			XLogBeginInsert();
			XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
			XLogRegisterData((char *) &xlrec, SizeOfHashVacuumOnePage);

			/*
			 * We need the target-offsets array whether or not we store the
			 * whole buffer, to allow us to find the latestRemovedXid on a
			 * standby server.
			 */
			XLogRegisterData((char *) deletable,
							 ndeletable * sizeof(OffsetNumber));

			/* buffer 1: metapage, for the tuple-count decrement */
			XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);

			recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_VACUUM_ONE_PAGE);

			PageSetLSN(BufferGetPage(buf), recptr);
			PageSetLSN(BufferGetPage(metabuf), recptr);
		}

		END_CRIT_SECTION();

		/*
		 * Releasing write lock on meta page as we have updated the tuple
		 * count.  (The caller retains its lock and pin on 'buf'.)
		 */
		LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
	}
}