1 /*-------------------------------------------------------------------------
2  *
3  * ginpostinglist.c
4  *	  routines for dealing with posting lists.
5  *
6  *
7  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * IDENTIFICATION
11  *			src/backend/access/gin/ginpostinglist.c
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include "access/gin_private.h"
18 
19 #ifdef USE_ASSERT_CHECKING
20 #define CHECK_ENCODING_ROUNDTRIP
21 #endif
22 
23 /*
24  * For encoding purposes, item pointers are represented as 64-bit unsigned
25  * integers. The lowest 11 bits represent the offset number, and the next
26  * lowest 32 bits are the block number. That leaves 21 bits unused, i.e.
27  * only 43 low bits are used.
28  *
29  * 11 bits is enough for the offset number, because MaxHeapTuplesPerPage <
30  * 2^11 on all supported block sizes. We are frugal with the bits, because
31  * smaller integers use fewer bytes in the varbyte encoding, saving disk
32  * space. (If we get a new table AM in the future that wants to use the full
33  * range of possible offset numbers, we'll need to change this.)
34  *
35  * These 43-bit integers are encoded using varbyte encoding. In each byte,
36  * the 7 low bits contain data, while the highest bit is a continuation bit.
37  * When the continuation bit is set, the next byte is part of the same
38  * integer, otherwise this is the last byte of this integer. 43 bits need
39  * at most 7 bytes in this encoding:
40  *
41  * 0XXXXXXX
42  * 1XXXXXXX 0XXXXYYY
43  * 1XXXXXXX 1XXXXYYY 0YYYYYYY
44  * 1XXXXXXX 1XXXXYYY 1YYYYYYY 0YYYYYYY
45  * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY
46  * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY
47  * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 0uuuuuuY
48  *
49  * X = bits used for offset number
50  * Y = bits used for block number
51  * u = unused bit
52  *
53  * The bytes are in stored in little-endian order.
54  *
55  * An important property of this encoding is that removing an item from list
56  * never increases the size of the resulting compressed posting list. Proof:
57  *
58  * Removing number is actually replacement of two numbers with their sum. We
59  * have to prove that varbyte encoding of a sum can't be longer than varbyte
60  * encoding of its summands. Sum of two numbers is at most one bit wider than
61  * the larger of the summands. Widening a number by one bit enlarges its length
62  * in varbyte encoding by at most one byte. Therefore, varbyte encoding of sum
63  * is at most one byte longer than varbyte encoding of larger summand. Lesser
64  * summand is at least one byte, so the sum cannot take more space than the
65  * summands, Q.E.D.
66  *
67  * This property greatly simplifies VACUUM, which can assume that posting
68  * lists always fit on the same page after vacuuming. Note that even though
69  * that holds for removing items from a posting list, you must also be
70  * careful to not cause expansion e.g. when merging uncompressed items on the
71  * page into the compressed lists, when vacuuming.
72  */
73 
74 /*
75  * How many bits do you need to encode offset number? OffsetNumber is a 16-bit
76  * integer, but you can't fit that many items on a page. 11 ought to be more
77  * than enough. It's tempting to derive this from MaxHeapTuplesPerPage, and
78  * use the minimum number of bits, but that would require changing the on-disk
79  * format if MaxHeapTuplesPerPage changes. Better to leave some slack.
80  */
81 #define MaxHeapTuplesPerPageBits		11
82 
83 /* Max. number of bytes needed to encode the largest supported integer. */
84 #define MaxBytesPerInteger				7
85 
86 static inline uint64
itemptr_to_uint64(const ItemPointer iptr)87 itemptr_to_uint64(const ItemPointer iptr)
88 {
89 	uint64		val;
90 
91 	Assert(ItemPointerIsValid(iptr));
92 	Assert(GinItemPointerGetOffsetNumber(iptr) < (1 << MaxHeapTuplesPerPageBits));
93 
94 	val = GinItemPointerGetBlockNumber(iptr);
95 	val <<= MaxHeapTuplesPerPageBits;
96 	val |= GinItemPointerGetOffsetNumber(iptr);
97 
98 	return val;
99 }
100 
101 static inline void
uint64_to_itemptr(uint64 val,ItemPointer iptr)102 uint64_to_itemptr(uint64 val, ItemPointer iptr)
103 {
104 	GinItemPointerSetOffsetNumber(iptr, val & ((1 << MaxHeapTuplesPerPageBits) - 1));
105 	val = val >> MaxHeapTuplesPerPageBits;
106 	GinItemPointerSetBlockNumber(iptr, val);
107 
108 	Assert(ItemPointerIsValid(iptr));
109 }
110 
111 /*
112  * Varbyte-encode 'val' into *ptr. *ptr is incremented to next integer.
113  */
114 static void
encode_varbyte(uint64 val,unsigned char ** ptr)115 encode_varbyte(uint64 val, unsigned char **ptr)
116 {
117 	unsigned char *p = *ptr;
118 
119 	while (val > 0x7F)
120 	{
121 		*(p++) = 0x80 | (val & 0x7F);
122 		val >>= 7;
123 	}
124 	*(p++) = (unsigned char) val;
125 
126 	*ptr = p;
127 }
128 
129 /*
130  * Decode varbyte-encoded integer at *ptr. *ptr is incremented to next integer.
131  */
132 static uint64
decode_varbyte(unsigned char ** ptr)133 decode_varbyte(unsigned char **ptr)
134 {
135 	uint64		val;
136 	unsigned char *p = *ptr;
137 	uint64		c;
138 
139 	/* 1st byte */
140 	c = *(p++);
141 	val = c & 0x7F;
142 	if (c & 0x80)
143 	{
144 		/* 2nd byte */
145 		c = *(p++);
146 		val |= (c & 0x7F) << 7;
147 		if (c & 0x80)
148 		{
149 			/* 3rd byte */
150 			c = *(p++);
151 			val |= (c & 0x7F) << 14;
152 			if (c & 0x80)
153 			{
154 				/* 4th byte */
155 				c = *(p++);
156 				val |= (c & 0x7F) << 21;
157 				if (c & 0x80)
158 				{
159 					/* 5th byte */
160 					c = *(p++);
161 					val |= (c & 0x7F) << 28;
162 					if (c & 0x80)
163 					{
164 						/* 6th byte */
165 						c = *(p++);
166 						val |= (c & 0x7F) << 35;
167 						if (c & 0x80)
168 						{
169 							/* 7th byte, should not have continuation bit */
170 							c = *(p++);
171 							val |= c << 42;
172 							Assert((c & 0x80) == 0);
173 						}
174 					}
175 				}
176 			}
177 		}
178 	}
179 
180 	*ptr = p;
181 
182 	return val;
183 }
184 
185 /*
186  * Encode a posting list.
187  *
188  * The encoded list is returned in a palloc'd struct, which will be at most
189  * 'maxsize' bytes in size.  The number items in the returned segment is
190  * returned in *nwritten. If it's not equal to nipd, not all the items fit
191  * in 'maxsize', and only the first *nwritten were encoded.
192  *
193  * The allocated size of the returned struct is short-aligned, and the padding
194  * byte at the end, if any, is zero.
195  */
196 GinPostingList *
ginCompressPostingList(const ItemPointer ipd,int nipd,int maxsize,int * nwritten)197 ginCompressPostingList(const ItemPointer ipd, int nipd, int maxsize,
198 					   int *nwritten)
199 {
200 	uint64		prev;
201 	int			totalpacked = 0;
202 	int			maxbytes;
203 	GinPostingList *result;
204 	unsigned char *ptr;
205 	unsigned char *endptr;
206 
207 	maxsize = SHORTALIGN_DOWN(maxsize);
208 
209 	result = palloc(maxsize);
210 
211 	maxbytes = maxsize - offsetof(GinPostingList, bytes);
212 	Assert(maxbytes > 0);
213 
214 	/* Store the first special item */
215 	result->first = ipd[0];
216 
217 	prev = itemptr_to_uint64(&result->first);
218 
219 	ptr = result->bytes;
220 	endptr = result->bytes + maxbytes;
221 	for (totalpacked = 1; totalpacked < nipd; totalpacked++)
222 	{
223 		uint64		val = itemptr_to_uint64(&ipd[totalpacked]);
224 		uint64		delta = val - prev;
225 
226 		Assert(val > prev);
227 
228 		if (endptr - ptr >= MaxBytesPerInteger)
229 			encode_varbyte(delta, &ptr);
230 		else
231 		{
232 			/*
233 			 * There are less than 7 bytes left. Have to check if the next
234 			 * item fits in that space before writing it out.
235 			 */
236 			unsigned char buf[MaxBytesPerInteger];
237 			unsigned char *p = buf;
238 
239 			encode_varbyte(delta, &p);
240 			if (p - buf > (endptr - ptr))
241 				break;			/* output is full */
242 
243 			memcpy(ptr, buf, p - buf);
244 			ptr += (p - buf);
245 		}
246 		prev = val;
247 	}
248 	result->nbytes = ptr - result->bytes;
249 
250 	/*
251 	 * If we wrote an odd number of bytes, zero out the padding byte at the
252 	 * end.
253 	 */
254 	if (result->nbytes != SHORTALIGN(result->nbytes))
255 		result->bytes[result->nbytes] = 0;
256 
257 	if (nwritten)
258 		*nwritten = totalpacked;
259 
260 	Assert(SizeOfGinPostingList(result) <= maxsize);
261 
262 	/*
263 	 * Check that the encoded segment decodes back to the original items.
264 	 */
265 #if defined (CHECK_ENCODING_ROUNDTRIP)
266 	{
267 		int			ndecoded;
268 		ItemPointer tmp = ginPostingListDecode(result, &ndecoded);
269 		int			i;
270 
271 		Assert(ndecoded == totalpacked);
272 		for (i = 0; i < ndecoded; i++)
273 			Assert(memcmp(&tmp[i], &ipd[i], sizeof(ItemPointerData)) == 0);
274 		pfree(tmp);
275 	}
276 #endif
277 
278 	return result;
279 }
280 
281 /*
282  * Decode a compressed posting list into an array of item pointers.
283  * The number of items is returned in *ndecoded.
284  */
285 ItemPointer
ginPostingListDecode(GinPostingList * plist,int * ndecoded)286 ginPostingListDecode(GinPostingList *plist, int *ndecoded)
287 {
288 	return ginPostingListDecodeAllSegments(plist,
289 										   SizeOfGinPostingList(plist),
290 										   ndecoded);
291 }
292 
293 /*
294  * Decode multiple posting list segments into an array of item pointers.
295  * The number of items is returned in *ndecoded_out. The segments are stored
296  * one after each other, with total size 'len' bytes.
297  */
298 ItemPointer
ginPostingListDecodeAllSegments(GinPostingList * segment,int len,int * ndecoded_out)299 ginPostingListDecodeAllSegments(GinPostingList *segment, int len, int *ndecoded_out)
300 {
301 	ItemPointer result;
302 	int			nallocated;
303 	uint64		val;
304 	char	   *endseg = ((char *) segment) + len;
305 	int			ndecoded;
306 	unsigned char *ptr;
307 	unsigned char *endptr;
308 
309 	/*
310 	 * Guess an initial size of the array.
311 	 */
312 	nallocated = segment->nbytes * 2 + 1;
313 	result = palloc(nallocated * sizeof(ItemPointerData));
314 
315 	ndecoded = 0;
316 	while ((char *) segment < endseg)
317 	{
318 		/* enlarge output array if needed */
319 		if (ndecoded >= nallocated)
320 		{
321 			nallocated *= 2;
322 			result = repalloc(result, nallocated * sizeof(ItemPointerData));
323 		}
324 
325 		/* copy the first item */
326 		Assert(OffsetNumberIsValid(ItemPointerGetOffsetNumber(&segment->first)));
327 		Assert(ndecoded == 0 || ginCompareItemPointers(&segment->first, &result[ndecoded - 1]) > 0);
328 		result[ndecoded] = segment->first;
329 		ndecoded++;
330 
331 		val = itemptr_to_uint64(&segment->first);
332 		ptr = segment->bytes;
333 		endptr = segment->bytes + segment->nbytes;
334 		while (ptr < endptr)
335 		{
336 			/* enlarge output array if needed */
337 			if (ndecoded >= nallocated)
338 			{
339 				nallocated *= 2;
340 				result = repalloc(result, nallocated * sizeof(ItemPointerData));
341 			}
342 
343 			val += decode_varbyte(&ptr);
344 
345 			uint64_to_itemptr(val, &result[ndecoded]);
346 			ndecoded++;
347 		}
348 		segment = GinNextPostingListSegment(segment);
349 	}
350 
351 	if (ndecoded_out)
352 		*ndecoded_out = ndecoded;
353 	return result;
354 }
355 
356 /*
357  * Add all item pointers from a bunch of posting lists to a TIDBitmap.
358  */
359 int
ginPostingListDecodeAllSegmentsToTbm(GinPostingList * ptr,int len,TIDBitmap * tbm)360 ginPostingListDecodeAllSegmentsToTbm(GinPostingList *ptr, int len,
361 									 TIDBitmap *tbm)
362 {
363 	int			ndecoded;
364 	ItemPointer items;
365 
366 	items = ginPostingListDecodeAllSegments(ptr, len, &ndecoded);
367 	tbm_add_tuples(tbm, items, ndecoded, false);
368 	pfree(items);
369 
370 	return ndecoded;
371 }
372 
373 /*
374  * Merge two ordered arrays of itempointers, eliminating any duplicates.
375  *
376  * Returns a palloc'd array, and *nmerged is set to the number of items in
377  * the result, after eliminating duplicates.
378  */
379 ItemPointer
ginMergeItemPointers(ItemPointerData * a,uint32 na,ItemPointerData * b,uint32 nb,int * nmerged)380 ginMergeItemPointers(ItemPointerData *a, uint32 na,
381 					 ItemPointerData *b, uint32 nb,
382 					 int *nmerged)
383 {
384 	ItemPointerData *dst;
385 
386 	dst = (ItemPointer) palloc((na + nb) * sizeof(ItemPointerData));
387 
388 	/*
389 	 * If the argument arrays don't overlap, we can just append them to each
390 	 * other.
391 	 */
392 	if (na == 0 || nb == 0 || ginCompareItemPointers(&a[na - 1], &b[0]) < 0)
393 	{
394 		memcpy(dst, a, na * sizeof(ItemPointerData));
395 		memcpy(&dst[na], b, nb * sizeof(ItemPointerData));
396 		*nmerged = na + nb;
397 	}
398 	else if (ginCompareItemPointers(&b[nb - 1], &a[0]) < 0)
399 	{
400 		memcpy(dst, b, nb * sizeof(ItemPointerData));
401 		memcpy(&dst[nb], a, na * sizeof(ItemPointerData));
402 		*nmerged = na + nb;
403 	}
404 	else
405 	{
406 		ItemPointerData *dptr = dst;
407 		ItemPointerData *aptr = a;
408 		ItemPointerData *bptr = b;
409 
410 		while (aptr - a < na && bptr - b < nb)
411 		{
412 			int			cmp = ginCompareItemPointers(aptr, bptr);
413 
414 			if (cmp > 0)
415 				*dptr++ = *bptr++;
416 			else if (cmp == 0)
417 			{
418 				/* only keep one copy of the identical items */
419 				*dptr++ = *bptr++;
420 				aptr++;
421 			}
422 			else
423 				*dptr++ = *aptr++;
424 		}
425 
426 		while (aptr - a < na)
427 			*dptr++ = *aptr++;
428 
429 		while (bptr - b < nb)
430 			*dptr++ = *bptr++;
431 
432 		*nmerged = dptr - dst;
433 	}
434 
435 	return dst;
436 }
437