/*-------------------------------------------------------------------------
 *
 * hashinsert.c
 *	  Item insertion in hash tables for Postgres.
 *
 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/hash/hashinsert.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/hash.h"
#include "access/hash_xlog.h"
#include "access/heapam.h"
#include "miscadmin.h"
#include "utils/rel.h"
#include "storage/lwlock.h"
#include "storage/buf_internals.h"
#include "storage/predicate.h"

static void _hash_vacuum_one_page(Relation rel, Buffer metabuf, Buffer buf,
					  RelFileNode hnode);

/*
 *	_hash_doinsert() -- Handle insertion of a single index tuple.
 *
 *		This routine is called by the public interface routines, hashbuild
 *		and hashinsert.  By here, itup is completely filled in.
 */
void
_hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel)
{
	Buffer		buf = InvalidBuffer;
	Buffer		bucket_buf;
	Buffer		metabuf;
	HashMetaPage metap;
	HashMetaPage usedmetap = NULL;
	Page		metapage;
	Page		page;
	HashPageOpaque pageopaque;
	Size		itemsz;
	bool		do_expand;
	uint32		hashkey;
	Bucket		bucket;
	OffsetNumber itup_off;

	/*
	 * Get the hash key for the item (it's stored in the index tuple itself).
	 */
	hashkey = _hash_get_indextuple_hashkey(itup);

	/* compute item size too */
	itemsz = IndexTupleSize(itup);
	itemsz = MAXALIGN(itemsz);	/* be safe, PageAddItem will do this but we
								 * need to be consistent */

restart_insert:

	/*
	 * Read the metapage.  We don't lock it yet; HashMaxItemSize() will
	 * examine pd_pagesize_version, but that can't change so we can examine it
	 * without a lock.
	 */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
	metapage = BufferGetPage(metabuf);

	/*
	 * Check whether the item can fit on a hash page at all. (Eventually, we
	 * ought to try to apply TOAST methods if not.)  Note that at this point,
	 * itemsz doesn't include the ItemId.
	 *
	 * XXX this is useless code if we are only storing hash keys.
	 */
	if (itemsz > HashMaxItemSize(metapage))
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("index row size %zu exceeds hash maximum %zu",
						itemsz, HashMaxItemSize(metapage)),
				 errhint("Values larger than a buffer page cannot be indexed.")));

	/* Lock the primary bucket page for the target bucket. */
	buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_WRITE,
										  &usedmetap);
	Assert(usedmetap != NULL);
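
	/*
	 * Before inserting, check whether this insertion conflicts with any
	 * predicate locks taken by concurrent serializable transactions.
	 */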
	CheckForSerializableConflictIn(rel, NULL, buf);

	/* remember the primary bucket buffer so we can release its pin at the end. */
	bucket_buf = buf;

	page = BufferGetPage(buf);
	pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
	bucket = pageopaque->hasho_bucket;

	/*
	 * If this bucket is in the process of being split, try to finish the
	 * split before inserting, because that might create room for the
	 * insertion to proceed without allocating an additional overflow page.
	 * It's only interesting to finish the split if we're trying to insert
	 * into the bucket from which we're removing tuples (the "old" bucket),
	 * not if we're trying to insert into the bucket into which tuples are
	 * being moved (the "new" bucket).
	 */
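	/*
	 * Only attempt this if nobody else has the bucket page pinned, i.e. our
	 * exclusive lock is as good as a cleanup lock.
	 */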
	if (H_BUCKET_BEING_SPLIT(pageopaque) && IsBufferCleanupOK(buf))
	{
		/* release the lock on bucket buffer, before completing the split. */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);

		_hash_finish_split(rel, metabuf, buf, bucket,
						   usedmetap->hashm_maxbucket,
						   usedmetap->hashm_highmask,
						   usedmetap->hashm_lowmask);

		/* release the pin on old and meta buffer.  retry for insert. */
		_hash_dropbuf(rel, buf);
		_hash_dropbuf(rel, metabuf);
		goto restart_insert;
	}

	/* Do the insertion */
	while (PageGetFreeSpace(page) < itemsz)
	{
		BlockNumber nextblkno;

		/*
		 * Check whether the current page has any DEAD tuples.  If so, delete
		 * those tuples and see whether that frees enough space for the new
		 * item before moving on to the next page in the bucket chain.
		 */
		if (H_HAS_DEAD_TUPLES(pageopaque))
		{
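			/*
			 * We can vacuum the page only if our exclusive lock also
			 * qualifies as a cleanup lock, i.e. nobody else has the buffer
			 * pinned.
			 */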
			if (IsBufferCleanupOK(buf))
			{
				_hash_vacuum_one_page(rel, metabuf, buf, heapRel->rd_node);

				if (PageGetFreeSpace(page) >= itemsz)
					break;		/* OK, now we have enough space */
			}
		}

		/*
		 * no space on this page; check for an overflow page
		 */
		nextblkno = pageopaque->hasho_nextblkno;

		if (BlockNumberIsValid(nextblkno))
		{
			/*
			 * ovfl page exists; go get it.  if it doesn't have room, we'll
			 * find out next pass through the loop test above.  we always
			 * release both the lock and pin if this is an overflow page, but
			 * only the lock if this is the primary bucket page, since the pin
			 * on the primary bucket must be retained throughout the scan.
			 */
			if (buf != bucket_buf)
				_hash_relbuf(rel, buf);
			else
				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
			page = BufferGetPage(buf);
		}
		else
		{
			/*
			 * we're at the end of the bucket chain and we haven't found a
			 * page with enough room.  allocate a new overflow page.
			 */

			/* release our write lock without modifying buffer */
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);

			/* chain to a new overflow page */
			buf = _hash_addovflpage(rel, metabuf, buf, buf == bucket_buf);
			page = BufferGetPage(buf);

			/* should fit now, given test above */
			Assert(PageGetFreeSpace(page) >= itemsz);
		}
		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
		Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_OVERFLOW_PAGE);
		Assert(pageopaque->hasho_bucket == bucket);
	}

	/*
	 * Write-lock the metapage so we can increment the tuple count. After
	 * incrementing it, check to see if it's time for a split.
	 */
	LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

	/* Do the update.  No ereport(ERROR) until changes are logged */
	START_CRIT_SECTION();

	/* found page with enough space, so add the item here */
	itup_off = _hash_pgaddtup(rel, buf, itemsz, itup);
	MarkBufferDirty(buf);

	/* metapage operations */
	metap = HashPageGetMeta(metapage);
	metap->hashm_ntuples += 1;

	/* Make sure this stays in sync with _hash_expandtable() */
	do_expand = metap->hashm_ntuples >
		(double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1);

	MarkBufferDirty(metabuf);

	/* XLOG stuff */
	if (RelationNeedsWAL(rel))
	{
		xl_hash_insert xlrec;
		XLogRecPtr	recptr;

		xlrec.offnum = itup_off;

		XLogBeginInsert();
		XLogRegisterData((char *) &xlrec, SizeOfHashInsert);

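		/*
		 * Register the metapage as block 1 and the page receiving the new
		 * tuple as block 0; the tuple itself is attached as block 0's data.
		 */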
		XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);

		XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
		XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));

		recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INSERT);

		PageSetLSN(BufferGetPage(buf), recptr);
		PageSetLSN(BufferGetPage(metabuf), recptr);
	}

	END_CRIT_SECTION();

	/* drop lock on metapage, but keep pin */
	LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

	/*
	 * Release the modified page, and drop the pin on the primary bucket page
	 * if it is a different buffer.
	 */
	_hash_relbuf(rel, buf);
	if (buf != bucket_buf)
		_hash_dropbuf(rel, bucket_buf);

	/* Attempt to split if a split is needed */
	if (do_expand)
		_hash_expandtable(rel, metabuf);

	/* Finally drop our pin on the metapage */
	_hash_dropbuf(rel, metabuf);
}

/*
 *	_hash_pgaddtup() -- add a tuple to a particular page in the index.
 *
 * This routine adds the tuple to the page as requested; it does not write out
 * the page.  It is an error to call _hash_pgaddtup() without a pin and write
 * lock on the target buffer.
 *
 * Returns the offset number at which the tuple was inserted.  This function
 * is responsible for preserving the condition that tuples in a hash index
 * page are sorted by hashkey value.
 */
OffsetNumber
_hash_pgaddtup(Relation rel, Buffer buf, Size itemsize, IndexTuple itup)
{
	OffsetNumber itup_off;
	Page		page;
	uint32		hashkey;

	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
	page = BufferGetPage(buf);

	/* Find where to insert the tuple (preserving page's hashkey ordering) */
	hashkey = _hash_get_indextuple_hashkey(itup);
	itup_off = _hash_binsearch(page, hashkey);

	if (PageAddItem(page, (Item) itup, itemsize, itup_off, false, false)
		== InvalidOffsetNumber)
		elog(ERROR, "failed to add index item to \"%s\"",
			 RelationGetRelationName(rel));

	return itup_off;
}

/*
 *	_hash_pgaddmultitup() -- add a tuple vector to a particular page in the
 *							 index.
 *
 * This routine has the same locking and tuple-ordering requirements as
 * _hash_pgaddtup().
 *
 * The offset numbers at which the tuples were inserted are returned in the
 * itup_offsets array.
 */
void
_hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups,
					OffsetNumber *itup_offsets, uint16 nitups)
{
	OffsetNumber itup_off;
	Page		page;
	uint32		hashkey;
	int			i;

	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
	page = BufferGetPage(buf);

	for (i = 0; i < nitups; i++)
	{
		Size		itemsize;

		itemsize = IndexTupleSize(itups[i]);
		itemsize = MAXALIGN(itemsize);

		/* Find where to insert the tuple (preserving page's hashkey ordering) */
		hashkey = _hash_get_indextuple_hashkey(itups[i]);
		itup_off = _hash_binsearch(page, hashkey);

		itup_offsets[i] = itup_off;

		if (PageAddItem(page, (Item) itups[i], itemsize, itup_off, false, false)
			== InvalidOffsetNumber)
			elog(ERROR, "failed to add index item to \"%s\"",
				 RelationGetRelationName(rel));
	}
}

/*
 * _hash_vacuum_one_page - vacuum just one index page.
 *
 * Try to remove LP_DEAD items from the given page.  The caller must hold a
 * cleanup lock on the buffer containing the page before calling this
 * function.
 */

static void
_hash_vacuum_one_page(Relation rel, Buffer metabuf, Buffer buf,
					  RelFileNode hnode)
{
	OffsetNumber deletable[MaxOffsetNumber];
	int			ndeletable = 0;
	OffsetNumber offnum,
				maxoff;
	Page		page = BufferGetPage(buf);
	HashPageOpaque pageopaque;
	HashMetaPage metap;

	/* Scan each tuple in page to see if it is marked as LP_DEAD */
	maxoff = PageGetMaxOffsetNumber(page);
	for (offnum = FirstOffsetNumber;
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		ItemId		itemId = PageGetItemId(page, offnum);

		if (ItemIdIsDead(itemId))
			deletable[ndeletable++] = offnum;
	}

	if (ndeletable > 0)
	{
		/*
		 * Write-lock the meta page so that we can decrement tuple count.
		 */
		LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

		/* No ereport(ERROR) until changes are logged */
		START_CRIT_SECTION();

		PageIndexMultiDelete(page, deletable, ndeletable);

		/*
		 * Mark the page as not containing any LP_DEAD items.  This is not
		 * necessarily true (there might be some that have recently been
		 * marked, but weren't included in our target-item list), but it will
		 * almost always be true, and it doesn't seem worth an additional
		 * page scan to check.  Remember that LH_PAGE_HAS_DEAD_TUPLES is only
		 * a hint anyway.
		 */
		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
		pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;

		metap = HashPageGetMeta(BufferGetPage(metabuf));
		metap->hashm_ntuples -= ndeletable;

		MarkBufferDirty(buf);
		MarkBufferDirty(metabuf);

		/* XLOG stuff */
		if (RelationNeedsWAL(rel))
		{
			xl_hash_vacuum_one_page xlrec;
			XLogRecPtr	recptr;

			xlrec.hnode = hnode;
			xlrec.ntuples = ndeletable;

			XLogBeginInsert();
			XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
			XLogRegisterData((char *) &xlrec, SizeOfHashVacuumOnePage);

			/*
			 * We need the target-offsets array whether or not we store the
			 * whole buffer, to allow us to find the latestRemovedXid on a
			 * standby server.
			 */
			XLogRegisterData((char *) deletable,
							 ndeletable * sizeof(OffsetNumber));

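			/*
			 * Register the metapage as block 1 so that its tuple-count
			 * update is covered by the same WAL record.
			 */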
			XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);

			recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_VACUUM_ONE_PAGE);

			PageSetLSN(BufferGetPage(buf), recptr);
			PageSetLSN(BufferGetPage(metabuf), recptr);
		}

		END_CRIT_SECTION();

		/*
		 * Release the write lock on the meta page now that we have updated
		 * the tuple count.
		 */
		LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
	}
}