/*-------------------------------------------------------------------------
 *
 * hashinsert.c
 *	  Item insertion in hash tables for Postgres.
 *
 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/hash/hashinsert.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/hash.h"
#include "access/hash_xlog.h"
#include "access/heapam.h"
#include "miscadmin.h"
#include "utils/rel.h"
#include "storage/lwlock.h"
#include "storage/buf_internals.h"
#include "storage/predicate.h"

static void _hash_vacuum_one_page(Relation rel, Buffer metabuf, Buffer buf,
                                  RelFileNode hnode);

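/*
 * Illustrative sketch (an assumption, not part of this file): callers reach
 * _hash_doinsert() via the index AM insert routine.  A caller shaped like
 * hashinsert() in hash.c would do roughly the following; exact signatures
 * may differ between versions:
 *
 *		Datum		index_values[1];
 *		bool		index_isnull[1];
 *		IndexTuple	itup;
 *
 *		-- hash the user datum into a 32-bit key; give up on a NULL input
 *		if (!_hash_convert_tuple(rel, values, isnull,
 *								 index_values, index_isnull))
 *			return false;
 *
 *		-- form an index tuple and point it at the heap tuple
 *		itup = index_form_tuple(RelationGetDescr(rel),
 *								index_values, index_isnull);
 *		itup->t_tid = *ht_ctid;
 *
 *		_hash_doinsert(rel, itup, heapRel);
 *		pfree(itup);
 */
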
/*
 * _hash_doinsert() -- Handle insertion of a single index tuple.
 *
 * This routine is called by the public interface routines, hashbuild
 * and hashinsert.  By here, itup is completely filled in.
 */
void
_hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel)
{
    Buffer      buf = InvalidBuffer;
    Buffer      bucket_buf;
    Buffer      metabuf;
    HashMetaPage metap;
    HashMetaPage usedmetap = NULL;
    Page        metapage;
    Page        page;
    HashPageOpaque pageopaque;
    Size        itemsz;
    bool        do_expand;
    uint32      hashkey;
    Bucket      bucket;
    OffsetNumber itup_off;

    /*
     * Get the hash key for the item (it's stored in the index tuple itself).
     */
    hashkey = _hash_get_indextuple_hashkey(itup);

    /* compute item size too */
    itemsz = IndexTupleSize(itup);
    itemsz = MAXALIGN(itemsz);  /* be safe, PageAddItem will do this but we
                                 * need to be consistent */

restart_insert:

    /*
     * Read the metapage.  We don't lock it yet; HashMaxItemSize() will
     * examine pd_pagesize_version, but that can't change so we can examine
     * it without a lock.
     */
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
    metapage = BufferGetPage(metabuf);

    /*
     * Check whether the item can fit on a hash page at all.  (Eventually, we
     * ought to try to apply TOAST methods if not.)  Note that at this point,
     * itemsz doesn't include the ItemId.
     *
     * XXX this is useless code if we are only storing hash keys.
     */
    if (itemsz > HashMaxItemSize(metapage))
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("index row size %zu exceeds hash maximum %zu",
                        itemsz, HashMaxItemSize(metapage)),
                 errhint("Values larger than a buffer page cannot be indexed.")));
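
    /*
     * For reference, HashMaxItemSize() (a macro in access/hash.h; the exact
     * definition may vary by version) allows roughly a page's worth of space
     * less the page header, one line pointer, and the special space:
     *
     *		MAXALIGN_DOWN(PageGetPageSize(page) -
     *					  SizeOfPageHeaderData -
     *					  sizeof(ItemIdData) -
     *					  MAXALIGN(sizeof(HashPageOpaqueData)))
     */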

    /* Lock the primary bucket page for the target bucket. */
    buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_WRITE,
                                          &usedmetap);
    Assert(usedmetap != NULL);

    CheckForSerializableConflictIn(rel, NULL, buf);

    /* remember the primary bucket buffer so we can release its pin at the end */
    bucket_buf = buf;

    page = BufferGetPage(buf);
    pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
    bucket = pageopaque->hasho_bucket;
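
    /*
     * Save the bucket number: it is passed to _hash_finish_split() below,
     * and it lets us assert that any overflow page we walk to still belongs
     * to the same bucket.
     */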

    /*
     * If this bucket is in the process of being split, try to finish the
     * split before inserting, because that might create room for the
     * insertion to proceed without allocating an additional overflow page.
     * It's only interesting to finish the split if we're trying to insert
     * into the bucket from which we're removing tuples (the "old" bucket),
     * not if we're trying to insert into the bucket into which tuples are
     * being moved (the "new" bucket).
     */
    if (H_BUCKET_BEING_SPLIT(pageopaque) && IsBufferCleanupOK(buf))
    {
        /* release the lock on bucket buffer, before completing the split. */
        LockBuffer(buf, BUFFER_LOCK_UNLOCK);

        _hash_finish_split(rel, metabuf, buf, bucket,
                           usedmetap->hashm_maxbucket,
                           usedmetap->hashm_highmask,
                           usedmetap->hashm_lowmask);

        /* release the pin on old and meta buffer.  retry for insert. */
        _hash_dropbuf(rel, buf);
        _hash_dropbuf(rel, metabuf);
        goto restart_insert;
    }

    /* Do the insertion */
    while (PageGetFreeSpace(page) < itemsz)
    {
        BlockNumber nextblkno;

        /*
         * Check if the current page has any DEAD tuples.  If yes, delete
         * those tuples and see if we can make room for the new item before
         * moving to the next page in the bucket chain.
         */
        if (H_HAS_DEAD_TUPLES(pageopaque))
        {
            if (IsBufferCleanupOK(buf))
            {
                _hash_vacuum_one_page(rel, metabuf, buf, heapRel->rd_node);

                if (PageGetFreeSpace(page) >= itemsz)
                    break;      /* OK, now we have enough space */
            }
        }

        /*
         * no space on this page; check for an overflow page
         */
        nextblkno = pageopaque->hasho_nextblkno;

        if (BlockNumberIsValid(nextblkno))
        {
            /*
             * ovfl page exists; go get it.  if it doesn't have room, we'll
             * find out next pass through the loop test above.  we always
             * release both the lock and pin if this is an overflow page, but
             * only the lock if this is the primary bucket page, since the
             * pin on the primary bucket must be retained throughout the
             * scan.
             */
            if (buf != bucket_buf)
                _hash_relbuf(rel, buf);
            else
                LockBuffer(buf, BUFFER_LOCK_UNLOCK);
            buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
            page = BufferGetPage(buf);
        }
        else
        {
            /*
             * we're at the end of the bucket chain and we haven't found a
             * page with enough room.  allocate a new overflow page.
             */

            /* release our write lock without modifying buffer */
            LockBuffer(buf, BUFFER_LOCK_UNLOCK);

            /* chain to a new overflow page */
            buf = _hash_addovflpage(rel, metabuf, buf, (buf == bucket_buf));
            page = BufferGetPage(buf);

            /* should fit now, given test above */
            Assert(PageGetFreeSpace(page) >= itemsz);
        }
        pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
        Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_OVERFLOW_PAGE);
        Assert(pageopaque->hasho_bucket == bucket);
    }
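
    /*
     * Loop invariant on exit: buf is pinned and write-locked on a page
     * (primary or overflow) with at least itemsz bytes free, so the
     * PageAddItem() inside _hash_pgaddtup() below cannot fail for lack of
     * space.
     */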

    /*
     * Write-lock the metapage so we can increment the tuple count.  After
     * incrementing it, check to see if it's time for a split.
     */
    LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

    /* Do the update.  No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    /* found page with enough space, so add the item here */
    itup_off = _hash_pgaddtup(rel, buf, itemsz, itup);
    MarkBufferDirty(buf);

    /* metapage operations */
    metap = HashPageGetMeta(metapage);
    metap->hashm_ntuples += 1;

    /* Make sure this stays in sync with _hash_expandtable() */
    do_expand = metap->hashm_ntuples >
        (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1);
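
    /*
     * The criterion above keeps the average number of tuples per bucket at
     * or below hashm_ffactor.  Illustrative numbers: with hashm_ffactor = 10
     * and hashm_maxbucket = 3 (four buckets, 0..3), do_expand becomes true
     * once hashm_ntuples exceeds 10 * 4 = 40.
     */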

    MarkBufferDirty(metabuf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        xl_hash_insert xlrec;
        XLogRecPtr  recptr;

        xlrec.offnum = itup_off;

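        /*
         * The metapage is registered as block 1 (for the ntuples bump); the
         * data page is block 0, with the new tuple attached as buffer data
         * so that replay can re-add it when the page is not restored from a
         * full-page image.
         */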
        XLogBeginInsert();
        XLogRegisterData((char *) &xlrec, SizeOfHashInsert);

        XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);

        XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
        XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));

        recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INSERT);

        PageSetLSN(BufferGetPage(buf), recptr);
        PageSetLSN(BufferGetPage(metabuf), recptr);
    }

    END_CRIT_SECTION();

    /* drop lock on metapage, but keep pin */
    LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

    /*
     * Release the modified page, and make sure the pin on the primary
     * bucket page is released as well.
     */
    _hash_relbuf(rel, buf);
    if (buf != bucket_buf)
        _hash_dropbuf(rel, bucket_buf);

    /* Attempt to split if a split is needed */
    if (do_expand)
        _hash_expandtable(rel, metabuf);

    /* Finally drop our pin on the metapage */
    _hash_dropbuf(rel, metabuf);
}

/*
 * _hash_pgaddtup() -- add a tuple to a particular page in the index.
 *
 * This routine adds the tuple to the page as requested; it does not write
 * out the page.  It is an error to call _hash_pgaddtup() without a pin and
 * write lock on the target buffer.
 *
 * Returns the offset number at which the tuple was inserted.  This function
 * is responsible for preserving the condition that tuples in a hash index
 * page are sorted by hashkey value.
 */
OffsetNumber
_hash_pgaddtup(Relation rel, Buffer buf, Size itemsize, IndexTuple itup)
{
    OffsetNumber itup_off;
    Page        page;
    uint32      hashkey;

    _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
    page = BufferGetPage(buf);

    /* Find where to insert the tuple (preserving page's hashkey ordering) */
    hashkey = _hash_get_indextuple_hashkey(itup);
    itup_off = _hash_binsearch(page, hashkey);
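
    /*
     * _hash_binsearch() returns the offset of the first existing entry whose
     * hash value is >= ours (or the max offset plus one), so inserting at
     * that offset preserves the page's hashkey ordering.
     */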

    if (PageAddItem(page, (Item) itup, itemsize, itup_off, false, false)
        == InvalidOffsetNumber)
        elog(ERROR, "failed to add index item to \"%s\"",
             RelationGetRelationName(rel));

    return itup_off;
}

/*
 * _hash_pgaddmultitup() -- add a tuple vector to a particular page in the
 *                          index.
 *
 * This routine has the same requirements for locking and tuple ordering as
 * _hash_pgaddtup().
 *
 * The offset numbers at which the tuples were inserted are returned via the
 * itup_offsets array.
 */
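/*
 * Note (an assumption about callers): this is used by batch paths such as
 * bucket splits and overflow-page squeezing, which are expected to have
 * verified that all nitups tuples fit on the target page before calling.
 */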
void
_hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups,
                    OffsetNumber *itup_offsets, uint16 nitups)
{
    OffsetNumber itup_off;
    Page        page;
    uint32      hashkey;
    int         i;

    _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
    page = BufferGetPage(buf);

    for (i = 0; i < nitups; i++)
    {
        Size        itemsize;

        itemsize = IndexTupleSize(itups[i]);
        itemsize = MAXALIGN(itemsize);

        /* Find where to insert the tuple (preserving page's hashkey ordering) */
        hashkey = _hash_get_indextuple_hashkey(itups[i]);
        itup_off = _hash_binsearch(page, hashkey);

        itup_offsets[i] = itup_off;

        if (PageAddItem(page, (Item) itups[i], itemsize, itup_off, false, false)
            == InvalidOffsetNumber)
            elog(ERROR, "failed to add index item to \"%s\"",
                 RelationGetRelationName(rel));
    }
}

/*
 * _hash_vacuum_one_page - vacuum just one index page.
 *
 * Try to remove LP_DEAD items from the given page.  The caller must have
 * acquired a cleanup lock on the page being modified before calling this
 * function.
 */
static void
_hash_vacuum_one_page(Relation rel, Buffer metabuf, Buffer buf,
                      RelFileNode hnode)
{
    OffsetNumber deletable[MaxOffsetNumber];
    int         ndeletable = 0;
    OffsetNumber offnum,
                maxoff;
    Page        page = BufferGetPage(buf);
    HashPageOpaque pageopaque;
    HashMetaPage metap;

    /* Scan each tuple in page to see if it is marked as LP_DEAD */
    maxoff = PageGetMaxOffsetNumber(page);
    for (offnum = FirstOffsetNumber;
         offnum <= maxoff;
         offnum = OffsetNumberNext(offnum))
    {
        ItemId      itemId = PageGetItemId(page, offnum);

        if (ItemIdIsDead(itemId))
            deletable[ndeletable++] = offnum;
    }

    if (ndeletable > 0)
    {
        /*
         * Write-lock the meta page so that we can decrement the tuple
         * count.
         */
        LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

        /* No ereport(ERROR) until changes are logged */
        START_CRIT_SECTION();

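        /*
         * PageIndexMultiDelete() removes all the listed line pointers and
         * compacts the remaining tuples in a single pass.
         */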
        PageIndexMultiDelete(page, deletable, ndeletable);

        /*
         * Mark the page as not containing any LP_DEAD items.  This is not
         * guaranteed to be true (there might be some that were marked only
         * recently and so weren't included in our target-item list), but it
         * will almost always be true, and it doesn't seem worth an
         * additional page scan to check.  Remember that
         * LH_PAGE_HAS_DEAD_TUPLES is only a hint anyway.
         */
        pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
        pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;

        metap = HashPageGetMeta(BufferGetPage(metabuf));
        metap->hashm_ntuples -= ndeletable;

        MarkBufferDirty(buf);
        MarkBufferDirty(metabuf);

        /* XLOG stuff */
        if (RelationNeedsWAL(rel))
        {
            xl_hash_vacuum_one_page xlrec;
            XLogRecPtr  recptr;

            xlrec.hnode = hnode;
            xlrec.ntuples = ndeletable;

            XLogBeginInsert();
            XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
            XLogRegisterData((char *) &xlrec, SizeOfHashVacuumOnePage);

            /*
             * We need the target-offsets array whether or not we store the
             * whole buffer, to allow us to find the latestRemovedXid on a
             * standby server.
             */
            XLogRegisterData((char *) deletable,
                             ndeletable * sizeof(OffsetNumber));

            XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);

            recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_VACUUM_ONE_PAGE);

            PageSetLSN(BufferGetPage(buf), recptr);
            PageSetLSN(BufferGetPage(metabuf), recptr);
        }

        END_CRIT_SECTION();

        /*
         * Release the write lock on the meta page, now that we have updated
         * the tuple count.
         */
        LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
    }
}