/*-------------------------------------------------------------------------
 *
 * hashinsert.c
 *	  Item insertion in hash tables for Postgres.
 *
 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/hash/hashinsert.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/hash.h"
#include "access/hash_xlog.h"
#include "miscadmin.h"
#include "storage/buf_internals.h"
#include "storage/lwlock.h"
#include "storage/predicate.h"
#include "utils/rel.h"

static void _hash_vacuum_one_page(Relation rel, Relation hrel,
								  Buffer metabuf, Buffer buf);

/*
 *	_hash_doinsert() -- Handle insertion of a single index tuple.
 *
 *		This routine is called by the public interface routines, hashbuild
 *		and hashinsert.  By here, itup is completely filled in.
 */
void
_hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel)
{
	Buffer		buf = InvalidBuffer;
	Buffer		bucket_buf;
	Buffer		metabuf;
	HashMetaPage metap;
	HashMetaPage usedmetap = NULL;
	Page		metapage;
	Page		page;
	HashPageOpaque pageopaque;
	Size		itemsz;
	bool		do_expand;
	uint32		hashkey;
	Bucket		bucket;
	OffsetNumber itup_off;

	/*
	 * Get the hash key for the item (it's stored in the index tuple itself).
	 */
	hashkey = _hash_get_indextuple_hashkey(itup);

	/* compute item size too */
	itemsz = IndexTupleSize(itup);
	itemsz = MAXALIGN(itemsz);	/* be safe, PageAddItem will do this but we
								 * need to be consistent */

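	/*
	 * If we have to give up our buffers to finish a concurrent bucket split,
	 * we jump back here and retry the insertion from scratch.
	 */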
restart_insert:

	/*
	 * Read the metapage.  We don't lock it yet; HashMaxItemSize() will
	 * examine pd_pagesize_version, but that can't change so we can examine it
	 * without a lock.
	 */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
	metapage = BufferGetPage(metabuf);

	/*
	 * Check whether the item can fit on a hash page at all. (Eventually, we
	 * ought to try to apply TOAST methods if not.)  Note that at this point,
	 * itemsz doesn't include the ItemId.
	 *
	 * XXX this is useless code if we are only storing hash keys.
	 */
	if (itemsz > HashMaxItemSize(metapage))
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("index row size %zu exceeds hash maximum %zu",
						itemsz, HashMaxItemSize(metapage)),
				 errhint("Values larger than a buffer page cannot be indexed.")));

	/* Lock the primary bucket page for the target bucket. */
	buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_WRITE,
										  &usedmetap);
	Assert(usedmetap != NULL);

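	/*
	 * Check whether this insertion conflicts with a predicate lock taken by
	 * a concurrent serializable transaction on this bucket's primary page.
	 */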
	CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(buf));

	/* remember the primary bucket buffer so we can release its pin at the end */
	bucket_buf = buf;

	page = BufferGetPage(buf);
	pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
	bucket = pageopaque->hasho_bucket;

	/*
	 * If this bucket is in the process of being split, try to finish the
	 * split before inserting, because that might create room for the
	 * insertion to proceed without allocating an additional overflow page.
	 * It's only interesting to finish the split if we're trying to insert
	 * into the bucket from which we're removing tuples (the "old" bucket),
	 * not if we're trying to insert into the bucket into which tuples are
	 * being moved (the "new" bucket).
	 */
	if (H_BUCKET_BEING_SPLIT(pageopaque) && IsBufferCleanupOK(buf))
	{
		/* release the lock on the bucket buffer before completing the split */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);

		_hash_finish_split(rel, metabuf, buf, bucket,
						   usedmetap->hashm_maxbucket,
						   usedmetap->hashm_highmask,
						   usedmetap->hashm_lowmask);

		/* release the pins on the old bucket and meta buffers; retry the insert */
		_hash_dropbuf(rel, buf);
		_hash_dropbuf(rel, metabuf);
		goto restart_insert;
	}

	/* Do the insertion */
	while (PageGetFreeSpace(page) < itemsz)
	{
		BlockNumber nextblkno;

		/*
		 * Check if the current page has any DEAD tuples.  If so, delete them
		 * and see if that frees enough space for the new item before moving
		 * on to the next page in the bucket chain.
		 */
		if (H_HAS_DEAD_TUPLES(pageopaque))
		{
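			/*
			 * Removing dead tuples requires a buffer cleanup lock, i.e. no
			 * other backend may hold a pin on this buffer.  If we can't get
			 * one, just continue down the bucket chain looking for space.
			 */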
			if (IsBufferCleanupOK(buf))
			{
				_hash_vacuum_one_page(rel, heapRel, metabuf, buf);

				if (PageGetFreeSpace(page) >= itemsz)
					break;		/* OK, now we have enough space */
			}
		}

		/*
		 * no space on this page; check for an overflow page
		 */
		nextblkno = pageopaque->hasho_nextblkno;

		if (BlockNumberIsValid(nextblkno))
		{
			/*
			 * ovfl page exists; go get it.  if it doesn't have room, we'll
			 * find out next pass through the loop test above.  we always
			 * release both the lock and pin if this is an overflow page, but
			 * only the lock if this is the primary bucket page, since the pin
			 * on the primary bucket must be retained throughout the scan.
			 */
			if (buf != bucket_buf)
				_hash_relbuf(rel, buf);
			else
				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
			page = BufferGetPage(buf);
		}
		else
		{
			/*
			 * we're at the end of the bucket chain and we haven't found a
			 * page with enough room.  allocate a new overflow page.
			 */

			/* release our write lock without modifying buffer */
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);

			/* chain to a new overflow page */
			buf = _hash_addovflpage(rel, metabuf, buf, (buf == bucket_buf) ? true : false);
			page = BufferGetPage(buf);

			/* should fit now, given test above */
			Assert(PageGetFreeSpace(page) >= itemsz);
		}
		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
		Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_OVERFLOW_PAGE);
		Assert(pageopaque->hasho_bucket == bucket);
	}

	/*
	 * Write-lock the metapage so we can increment the tuple count. After
	 * incrementing it, check to see if it's time for a split.
	 */
	LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

	/* Do the update.  No ereport(ERROR) until changes are logged */
	START_CRIT_SECTION();

	/* found page with enough space, so add the item here */
	itup_off = _hash_pgaddtup(rel, buf, itemsz, itup);
	MarkBufferDirty(buf);

	/* metapage operations */
	metap = HashPageGetMeta(metapage);
	metap->hashm_ntuples += 1;

	/* Make sure this stays in sync with _hash_expandtable() */
	do_expand = metap->hashm_ntuples >
		(double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1);
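	/* i.e. split once the average tuples per bucket exceeds the fill factor */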

	MarkBufferDirty(metabuf);

	/* XLOG stuff */
	if (RelationNeedsWAL(rel))
	{
		xl_hash_insert xlrec;
		XLogRecPtr	recptr;

		xlrec.offnum = itup_off;

		XLogBeginInsert();
		XLogRegisterData((char *) &xlrec, SizeOfHashInsert);

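		/*
		 * Block 0 is the page receiving the tuple (registered together with
		 * the tuple data for replay); block 1 is the metapage, whose tuple
		 * count we just bumped.
		 */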
		XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);

		XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
		XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));

		recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INSERT);

		PageSetLSN(BufferGetPage(buf), recptr);
		PageSetLSN(BufferGetPage(metabuf), recptr);
	}

	END_CRIT_SECTION();

	/* drop lock on metapage, but keep pin */
	LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

	/*
	 * Release the lock and pin on the page we modified; if it was an
	 * overflow page, also drop the pin we still hold on the primary bucket
	 * page.
	 */
	_hash_relbuf(rel, buf);
	if (buf != bucket_buf)
		_hash_dropbuf(rel, bucket_buf);

	/* Attempt to split if a split is needed */
	if (do_expand)
		_hash_expandtable(rel, metabuf);

	/* Finally drop our pin on the metapage */
	_hash_dropbuf(rel, metabuf);
}

/*
 *	_hash_pgaddtup() -- add a tuple to a particular page in the index.
 *
 * This routine adds the tuple to the page as requested; it does not write out
 * the page.  It is an error to call this function without pin and write lock
 * on the target buffer.
 *
 * Returns the offset number at which the tuple was inserted.  This function
 * is responsible for preserving the condition that tuples in a hash index
 * page are sorted by hashkey value.
 */
OffsetNumber
_hash_pgaddtup(Relation rel, Buffer buf, Size itemsize, IndexTuple itup)
{
	OffsetNumber itup_off;
	Page		page;
	uint32		hashkey;

	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
	page = BufferGetPage(buf);

	/* Find where to insert the tuple (preserving page's hashkey ordering) */
	hashkey = _hash_get_indextuple_hashkey(itup);
	itup_off = _hash_binsearch(page, hashkey);

	if (PageAddItem(page, (Item) itup, itemsize, itup_off, false, false)
		== InvalidOffsetNumber)
		elog(ERROR, "failed to add index item to \"%s\"",
			 RelationGetRelationName(rel));

	return itup_off;
}

/*
 *	_hash_pgaddmultitup() -- add a tuple vector to a particular page in the
 *							 index.
 *
 * This routine has the same requirements for locking and tuple ordering as
 * _hash_pgaddtup().
 *
 * The offset numbers at which the tuples were inserted are returned in the
 * itup_offsets array.
 */
void
_hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups,
					OffsetNumber *itup_offsets, uint16 nitups)
{
	OffsetNumber itup_off;
	Page		page;
	uint32		hashkey;
	int			i;

	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
	page = BufferGetPage(buf);

	for (i = 0; i < nitups; i++)
	{
		Size		itemsize;

		itemsize = IndexTupleSize(itups[i]);
		itemsize = MAXALIGN(itemsize);

		/* Find where to insert the tuple (preserving page's hashkey ordering) */
		hashkey = _hash_get_indextuple_hashkey(itups[i]);
		itup_off = _hash_binsearch(page, hashkey);

		itup_offsets[i] = itup_off;

		if (PageAddItem(page, (Item) itups[i], itemsize, itup_off, false, false)
			== InvalidOffsetNumber)
			elog(ERROR, "failed to add index item to \"%s\"",
				 RelationGetRelationName(rel));
	}
}

/*
 * _hash_vacuum_one_page - vacuum just one index page.
 *
 * Try to remove LP_DEAD items from the given page.  The caller must hold a
 * cleanup lock on the buffer being modified before calling this function.
 */
static void
_hash_vacuum_one_page(Relation rel, Relation hrel, Buffer metabuf, Buffer buf)
{
	OffsetNumber deletable[MaxOffsetNumber];
	int			ndeletable = 0;
	OffsetNumber offnum,
				maxoff;
	Page		page = BufferGetPage(buf);
	HashPageOpaque pageopaque;
	HashMetaPage metap;

	/* Scan each tuple in the page to see if it is marked as LP_DEAD */
	maxoff = PageGetMaxOffsetNumber(page);
	for (offnum = FirstOffsetNumber;
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		ItemId		itemId = PageGetItemId(page, offnum);

		if (ItemIdIsDead(itemId))
			deletable[ndeletable++] = offnum;
	}

	if (ndeletable > 0)
	{
		TransactionId latestRemovedXid;

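		/*
		 * Compute an XID horizon for the dead tuples being removed; it is
		 * included in the WAL record below so that replay on a hot standby
		 * can detect conflicts with queries that might still need them.
		 */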
		latestRemovedXid =
			index_compute_xid_horizon_for_tuples(rel, hrel, buf,
												 deletable, ndeletable);

		/*
		 * Write-lock the meta page so that we can decrement the tuple count.
		 */
		LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

		/* No ereport(ERROR) until changes are logged */
		START_CRIT_SECTION();

		PageIndexMultiDelete(page, deletable, ndeletable);

		/*
		 * Mark the page as not containing any LP_DEAD items.  This is not
		 * necessarily true (there might be some that have recently been
		 * marked, but weren't included in our target-item list), but it will
		 * almost always be true and it doesn't seem worth an additional page
		 * scan to check it.  Remember that LH_PAGE_HAS_DEAD_TUPLES is only a
		 * hint anyway.
		 */
		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
		pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;

		metap = HashPageGetMeta(BufferGetPage(metabuf));
		metap->hashm_ntuples -= ndeletable;

		MarkBufferDirty(buf);
		MarkBufferDirty(metabuf);

		/* XLOG stuff */
		if (RelationNeedsWAL(rel))
		{
			xl_hash_vacuum_one_page xlrec;
			XLogRecPtr	recptr;

			xlrec.latestRemovedXid = latestRemovedXid;
			xlrec.ntuples = ndeletable;

			XLogBeginInsert();
			XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
			XLogRegisterData((char *) &xlrec, SizeOfHashVacuumOnePage);

			/*
			 * We need the target-offsets array whether or not we store the
			 * whole buffer, to allow us to find the latestRemovedXid on a
			 * standby server.
			 */
			XLogRegisterData((char *) deletable,
							 ndeletable * sizeof(OffsetNumber));

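			/* also register the metapage, since we decremented hashm_ntuples */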
			XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);

			recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_VACUUM_ONE_PAGE);

			PageSetLSN(BufferGetPage(buf), recptr);
			PageSetLSN(BufferGetPage(metabuf), recptr);
		}

		END_CRIT_SECTION();

		/*
		 * Release the write lock on the meta page now that we have updated
		 * the tuple count.
		 */
		LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
	}
}