1 /*
2  * contrib/intarray/_int_gist.c
3  */
4 #include "postgres.h"
5 
6 #include <limits.h>
7 
8 #include "access/gist.h"
9 #include "access/stratnum.h"
10 
11 #include "_int.h"
12 
13 #define GETENTRY(vec,pos) ((ArrayType *) DatumGetPointer((vec)->vector[(pos)].key))
14 
15 /*
16  * Control the maximum sparseness of compressed keys.
17  *
18  * The upper safe bound for this limit is half the maximum allocatable array
19  * size. A lower bound would give more guarantees that pathological data
20  * wouldn't eat excessive CPU and memory, but at the expense of breaking
21  * possibly working (after a fashion) indexes.
22  */
23 #define MAXNUMELTS (Min((MaxAllocSize / sizeof(Datum)),((MaxAllocSize - ARR_OVERHEAD_NONULLS(1)) / sizeof(int)))/2)
24 /* or: #define MAXNUMELTS 1000000 */
25 
26 /*
27 ** GiST support methods
28 */
29 PG_FUNCTION_INFO_V1(g_int_consistent);
30 PG_FUNCTION_INFO_V1(g_int_compress);
31 PG_FUNCTION_INFO_V1(g_int_decompress);
32 PG_FUNCTION_INFO_V1(g_int_penalty);
33 PG_FUNCTION_INFO_V1(g_int_picksplit);
34 PG_FUNCTION_INFO_V1(g_int_union);
35 PG_FUNCTION_INFO_V1(g_int_same);
36 
37 
38 /*
39 ** The GiST Consistent method for _intments
40 ** Should return false if for all data items x below entry,
41 ** the predicate x op query == FALSE, where op is the oper
42 ** corresponding to strategy in the pg_amop table.
43 */
44 Datum
g_int_consistent(PG_FUNCTION_ARGS)45 g_int_consistent(PG_FUNCTION_ARGS)
46 {
47 	GISTENTRY  *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
48 	ArrayType  *query = PG_GETARG_ARRAYTYPE_P_COPY(1);
49 	StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
50 
51 	/* Oid		subtype = PG_GETARG_OID(3); */
52 	bool	   *recheck = (bool *) PG_GETARG_POINTER(4);
53 	bool		retval;
54 
55 	/* this is exact except for RTSameStrategyNumber */
56 	*recheck = (strategy == RTSameStrategyNumber);
57 
58 	if (strategy == BooleanSearchStrategy)
59 	{
60 		retval = execconsistent((QUERYTYPE *) query,
61 								(ArrayType *) DatumGetPointer(entry->key),
62 								GIST_LEAF(entry));
63 
64 		pfree(query);
65 		PG_RETURN_BOOL(retval);
66 	}
67 
68 	/* sort query for fast search, key is already sorted */
69 	CHECKARRVALID(query);
70 	PREPAREARR(query);
71 
72 	switch (strategy)
73 	{
74 		case RTOverlapStrategyNumber:
75 			retval = inner_int_overlap((ArrayType *) DatumGetPointer(entry->key),
76 									   query);
77 			break;
78 		case RTSameStrategyNumber:
79 			if (GIST_LEAF(entry))
80 				DirectFunctionCall3(g_int_same,
81 									entry->key,
82 									PointerGetDatum(query),
83 									PointerGetDatum(&retval));
84 			else
85 				retval = inner_int_contains((ArrayType *) DatumGetPointer(entry->key),
86 											query);
87 			break;
88 		case RTContainsStrategyNumber:
89 		case RTOldContainsStrategyNumber:
90 			retval = inner_int_contains((ArrayType *) DatumGetPointer(entry->key),
91 										query);
92 			break;
93 		case RTContainedByStrategyNumber:
94 		case RTOldContainedByStrategyNumber:
95 			if (GIST_LEAF(entry))
96 				retval = inner_int_contains(query,
97 											(ArrayType *) DatumGetPointer(entry->key));
98 			else
99 			{
100 				/*
101 				 * Unfortunately, because empty arrays could be anywhere in
102 				 * the index, we must search the whole tree.
103 				 */
104 				retval = true;
105 			}
106 			break;
107 		default:
108 			retval = FALSE;
109 	}
110 	pfree(query);
111 	PG_RETURN_BOOL(retval);
112 }
113 
114 Datum
g_int_union(PG_FUNCTION_ARGS)115 g_int_union(PG_FUNCTION_ARGS)
116 {
117 	GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0);
118 	int		   *size = (int *) PG_GETARG_POINTER(1);
119 	int32		i,
120 			   *ptr;
121 	ArrayType  *res;
122 	int			totlen = 0;
123 
124 	for (i = 0; i < entryvec->n; i++)
125 	{
126 		ArrayType  *ent = GETENTRY(entryvec, i);
127 
128 		CHECKARRVALID(ent);
129 		totlen += ARRNELEMS(ent);
130 	}
131 
132 	res = new_intArrayType(totlen);
133 	ptr = ARRPTR(res);
134 
135 	for (i = 0; i < entryvec->n; i++)
136 	{
137 		ArrayType  *ent = GETENTRY(entryvec, i);
138 		int			nel;
139 
140 		nel = ARRNELEMS(ent);
141 		memcpy(ptr, ARRPTR(ent), nel * sizeof(int32));
142 		ptr += nel;
143 	}
144 
145 	QSORT(res, 1);
146 	res = _int_unique(res);
147 	*size = VARSIZE(res);
148 	PG_RETURN_POINTER(res);
149 }
150 
151 /*
152 ** GiST Compress and Decompress methods
153 */
154 Datum
g_int_compress(PG_FUNCTION_ARGS)155 g_int_compress(PG_FUNCTION_ARGS)
156 {
157 	GISTENTRY  *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
158 	GISTENTRY  *retval;
159 	ArrayType  *r;
160 	int			len,
161 				lenr;
162 	int		   *dr;
163 	int			i,
164 				j,
165 				cand;
166 	int64		min;
167 
168 	if (entry->leafkey)
169 	{
170 		r = DatumGetArrayTypePCopy(entry->key);
171 		CHECKARRVALID(r);
172 		PREPAREARR(r);
173 
174 		if (ARRNELEMS(r) >= 2 * MAXNUMRANGE)
175 			elog(NOTICE, "input array is too big (%d maximum allowed, %d current), use gist__intbig_ops opclass instead",
176 				 2 * MAXNUMRANGE - 1, ARRNELEMS(r));
177 
178 		retval = palloc(sizeof(GISTENTRY));
179 		gistentryinit(*retval, PointerGetDatum(r),
180 					  entry->rel, entry->page, entry->offset, FALSE);
181 
182 		PG_RETURN_POINTER(retval);
183 	}
184 
185 	/*
186 	 * leaf entries never compress one more time, only when entry->leafkey
187 	 * ==true, so now we work only with internal keys
188 	 */
189 
190 	r = DatumGetArrayTypeP(entry->key);
191 	CHECKARRVALID(r);
192 	if (ARRISEMPTY(r))
193 	{
194 		if (r != (ArrayType *) DatumGetPointer(entry->key))
195 			pfree(r);
196 		PG_RETURN_POINTER(entry);
197 	}
198 
199 	if ((len = ARRNELEMS(r)) >= 2 * MAXNUMRANGE)
200 	{							/* compress */
201 		if (r == (ArrayType *) DatumGetPointer(entry->key))
202 			r = DatumGetArrayTypePCopy(entry->key);
203 		r = resize_intArrayType(r, 2 * (len));
204 
205 		dr = ARRPTR(r);
206 
207 		/*
208 		 * "len" at this point is the number of ranges we will construct.
209 		 * "lenr" is the number of ranges we must eventually remove by
210 		 * merging, we must be careful to remove no more than this number.
211 		 */
212 		lenr = len - MAXNUMRANGE;
213 
214 		/*
215 		 * Initially assume we can merge consecutive ints into a range. but we
216 		 * must count every value removed and stop when lenr runs out
217 		 */
218 		for (j = i = len - 1; i > 0 && lenr > 0; i--, j--)
219 		{
220 			int		r_end = dr[i];
221 			int		r_start = r_end;
222 			while (i > 0 && lenr > 0 && dr[i-1] == r_start - 1)
223 				--r_start, --i, --lenr;
224 			dr[2*j] = r_start;
225 			dr[2*j+1] = r_end;
226 		}
227 		/* just copy the rest, if any, as trivial ranges */
228 		for (; i >= 0; i--, j--)
229 			dr[2*j] = dr[2*j + 1] = dr[i];
230 
231 		if (++j)
232 		{
233 			/*
234 			 * shunt everything down to start at the right place
235 			 */
236 			memmove((void *) &dr[0], (void *) &dr[2*j], 2*(len - j) * sizeof(int32));
237 		}
238 		/*
239 		 * make "len" be number of array elements, not ranges
240 		 */
241 		len = 2*(len - j);
242 		cand = 1;
243 		while (len > MAXNUMRANGE * 2)
244 		{
245 			min = PG_INT64_MAX;
246 			for (i = 2; i < len; i += 2)
247 				if (min > ((int64)dr[i] - (int64)dr[i - 1]))
248 				{
249 					min = ((int64)dr[i] - (int64)dr[i - 1]);
250 					cand = i;
251 				}
252 			memmove((void *) &dr[cand - 1], (void *) &dr[cand + 1], (len - cand - 1) * sizeof(int32));
253 			len -= 2;
254 		}
255 		/*
256 		 * check sparseness of result
257 		 */
258 		lenr = internal_size(dr, len);
259 		if (lenr < 0 || lenr > MAXNUMELTS)
260 			ereport(ERROR,
261 					(errmsg("data is too sparse, recreate index using gist__intbig_ops opclass instead")));
262 
263 		r = resize_intArrayType(r, len);
264 		retval = palloc(sizeof(GISTENTRY));
265 		gistentryinit(*retval, PointerGetDatum(r),
266 					  entry->rel, entry->page, entry->offset, FALSE);
267 		PG_RETURN_POINTER(retval);
268 	}
269 	else
270 		PG_RETURN_POINTER(entry);
271 }
272 
273 Datum
g_int_decompress(PG_FUNCTION_ARGS)274 g_int_decompress(PG_FUNCTION_ARGS)
275 {
276 	GISTENTRY  *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
277 	GISTENTRY  *retval;
278 	ArrayType  *r;
279 	int		   *dr,
280 				lenr;
281 	ArrayType  *in;
282 	int			lenin;
283 	int		   *din;
284 	int			i,
285 				j;
286 
287 	in = DatumGetArrayTypeP(entry->key);
288 
289 	CHECKARRVALID(in);
290 	if (ARRISEMPTY(in))
291 	{
292 		if (in != (ArrayType *) DatumGetPointer(entry->key))
293 		{
294 			retval = palloc(sizeof(GISTENTRY));
295 			gistentryinit(*retval, PointerGetDatum(in),
296 						  entry->rel, entry->page, entry->offset, FALSE);
297 			PG_RETURN_POINTER(retval);
298 		}
299 
300 		PG_RETURN_POINTER(entry);
301 	}
302 
303 	lenin = ARRNELEMS(in);
304 
305 	if (lenin < 2 * MAXNUMRANGE)
306 	{							/* not compressed value */
307 		if (in != (ArrayType *) DatumGetPointer(entry->key))
308 		{
309 			retval = palloc(sizeof(GISTENTRY));
310 			gistentryinit(*retval, PointerGetDatum(in),
311 						  entry->rel, entry->page, entry->offset, FALSE);
312 
313 			PG_RETURN_POINTER(retval);
314 		}
315 		PG_RETURN_POINTER(entry);
316 	}
317 
318 	din = ARRPTR(in);
319 	lenr = internal_size(din, lenin);
320 	if (lenr < 0 || lenr > MAXNUMELTS)
321 		ereport(ERROR,
322 				(errmsg("compressed array is too big, recreate index using gist__intbig_ops opclass instead")));
323 
324 	r = new_intArrayType(lenr);
325 	dr = ARRPTR(r);
326 
327 	for (i = 0; i < lenin; i += 2)
328 		for (j = din[i]; j <= din[i + 1]; j++)
329 			if ((!i) || *(dr - 1) != j)
330 				*dr++ = j;
331 
332 	if (in != (ArrayType *) DatumGetPointer(entry->key))
333 		pfree(in);
334 	retval = palloc(sizeof(GISTENTRY));
335 	gistentryinit(*retval, PointerGetDatum(r),
336 				  entry->rel, entry->page, entry->offset, FALSE);
337 
338 	PG_RETURN_POINTER(retval);
339 }
340 
341 /*
342 ** The GiST Penalty method for _intments
343 */
344 Datum
g_int_penalty(PG_FUNCTION_ARGS)345 g_int_penalty(PG_FUNCTION_ARGS)
346 {
347 	GISTENTRY  *origentry = (GISTENTRY *) PG_GETARG_POINTER(0);
348 	GISTENTRY  *newentry = (GISTENTRY *) PG_GETARG_POINTER(1);
349 	float	   *result = (float *) PG_GETARG_POINTER(2);
350 	ArrayType  *ud;
351 	float		tmp1,
352 				tmp2;
353 
354 	ud = inner_int_union((ArrayType *) DatumGetPointer(origentry->key),
355 						 (ArrayType *) DatumGetPointer(newentry->key));
356 	rt__int_size(ud, &tmp1);
357 	rt__int_size((ArrayType *) DatumGetPointer(origentry->key), &tmp2);
358 	*result = tmp1 - tmp2;
359 	pfree(ud);
360 
361 	PG_RETURN_POINTER(result);
362 }
363 
364 
365 
366 Datum
g_int_same(PG_FUNCTION_ARGS)367 g_int_same(PG_FUNCTION_ARGS)
368 {
369 	ArrayType  *a = PG_GETARG_ARRAYTYPE_P(0);
370 	ArrayType  *b = PG_GETARG_ARRAYTYPE_P(1);
371 	bool	   *result = (bool *) PG_GETARG_POINTER(2);
372 	int32		n = ARRNELEMS(a);
373 	int32	   *da,
374 			   *db;
375 
376 	CHECKARRVALID(a);
377 	CHECKARRVALID(b);
378 
379 	if (n != ARRNELEMS(b))
380 	{
381 		*result = false;
382 		PG_RETURN_POINTER(result);
383 	}
384 	*result = TRUE;
385 	da = ARRPTR(a);
386 	db = ARRPTR(b);
387 	while (n--)
388 	{
389 		if (*da++ != *db++)
390 		{
391 			*result = FALSE;
392 			break;
393 		}
394 	}
395 
396 	PG_RETURN_POINTER(result);
397 }
398 
399 /*****************************************************************
400 ** Common GiST Method
401 *****************************************************************/
402 
403 typedef struct
404 {
405 	OffsetNumber pos;
406 	float		cost;
407 } SPLITCOST;
408 
409 static int
comparecost(const void * a,const void * b)410 comparecost(const void *a, const void *b)
411 {
412 	if (((const SPLITCOST *) a)->cost == ((const SPLITCOST *) b)->cost)
413 		return 0;
414 	else
415 		return (((const SPLITCOST *) a)->cost > ((const SPLITCOST *) b)->cost) ? 1 : -1;
416 }
417 
418 /*
419 ** The GiST PickSplit method for _intments
420 ** We use Guttman's poly time split algorithm
421 */
422 Datum
g_int_picksplit(PG_FUNCTION_ARGS)423 g_int_picksplit(PG_FUNCTION_ARGS)
424 {
425 	GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0);
426 	GIST_SPLITVEC *v = (GIST_SPLITVEC *) PG_GETARG_POINTER(1);
427 	OffsetNumber i,
428 				j;
429 	ArrayType  *datum_alpha,
430 			   *datum_beta;
431 	ArrayType  *datum_l,
432 			   *datum_r;
433 	ArrayType  *union_d,
434 			   *union_dl,
435 			   *union_dr;
436 	ArrayType  *inter_d;
437 	bool		firsttime;
438 	float		size_alpha,
439 				size_beta,
440 				size_union,
441 				size_inter;
442 	float		size_waste,
443 				waste;
444 	float		size_l,
445 				size_r;
446 	int			nbytes;
447 	OffsetNumber seed_1 = 0,
448 				seed_2 = 0;
449 	OffsetNumber *left,
450 			   *right;
451 	OffsetNumber maxoff;
452 	SPLITCOST  *costvector;
453 
454 #ifdef GIST_DEBUG
455 	elog(DEBUG3, "--------picksplit %d", entryvec->n);
456 #endif
457 
458 	maxoff = entryvec->n - 2;
459 	nbytes = (maxoff + 2) * sizeof(OffsetNumber);
460 	v->spl_left = (OffsetNumber *) palloc(nbytes);
461 	v->spl_right = (OffsetNumber *) palloc(nbytes);
462 
463 	firsttime = true;
464 	waste = 0.0;
465 	for (i = FirstOffsetNumber; i < maxoff; i = OffsetNumberNext(i))
466 	{
467 		datum_alpha = GETENTRY(entryvec, i);
468 		for (j = OffsetNumberNext(i); j <= maxoff; j = OffsetNumberNext(j))
469 		{
470 			datum_beta = GETENTRY(entryvec, j);
471 
472 			/* compute the wasted space by unioning these guys */
473 			/* size_waste = size_union - size_inter; */
474 			union_d = inner_int_union(datum_alpha, datum_beta);
475 			rt__int_size(union_d, &size_union);
476 			inter_d = inner_int_inter(datum_alpha, datum_beta);
477 			rt__int_size(inter_d, &size_inter);
478 			size_waste = size_union - size_inter;
479 
480 			pfree(union_d);
481 			pfree(inter_d);
482 
483 			/*
484 			 * are these a more promising split that what we've already seen?
485 			 */
486 
487 			if (size_waste > waste || firsttime)
488 			{
489 				waste = size_waste;
490 				seed_1 = i;
491 				seed_2 = j;
492 				firsttime = false;
493 			}
494 		}
495 	}
496 
497 	left = v->spl_left;
498 	v->spl_nleft = 0;
499 	right = v->spl_right;
500 	v->spl_nright = 0;
501 	if (seed_1 == 0 || seed_2 == 0)
502 	{
503 		seed_1 = 1;
504 		seed_2 = 2;
505 	}
506 
507 	datum_alpha = GETENTRY(entryvec, seed_1);
508 	datum_l = copy_intArrayType(datum_alpha);
509 	rt__int_size(datum_l, &size_l);
510 	datum_beta = GETENTRY(entryvec, seed_2);
511 	datum_r = copy_intArrayType(datum_beta);
512 	rt__int_size(datum_r, &size_r);
513 
514 	maxoff = OffsetNumberNext(maxoff);
515 
516 	/*
517 	 * sort entries
518 	 */
519 	costvector = (SPLITCOST *) palloc(sizeof(SPLITCOST) * maxoff);
520 	for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
521 	{
522 		costvector[i - 1].pos = i;
523 		datum_alpha = GETENTRY(entryvec, i);
524 		union_d = inner_int_union(datum_l, datum_alpha);
525 		rt__int_size(union_d, &size_alpha);
526 		pfree(union_d);
527 		union_d = inner_int_union(datum_r, datum_alpha);
528 		rt__int_size(union_d, &size_beta);
529 		pfree(union_d);
530 		costvector[i - 1].cost = Abs((size_alpha - size_l) - (size_beta - size_r));
531 	}
532 	qsort((void *) costvector, maxoff, sizeof(SPLITCOST), comparecost);
533 
534 	/*
535 	 * Now split up the regions between the two seeds.  An important property
536 	 * of this split algorithm is that the split vector v has the indices of
537 	 * items to be split in order in its left and right vectors.  We exploit
538 	 * this property by doing a merge in the code that actually splits the
539 	 * page.
540 	 *
541 	 * For efficiency, we also place the new index tuple in this loop. This is
542 	 * handled at the very end, when we have placed all the existing tuples
543 	 * and i == maxoff + 1.
544 	 */
545 
546 
547 	for (j = 0; j < maxoff; j++)
548 	{
549 		i = costvector[j].pos;
550 
551 		/*
552 		 * If we've already decided where to place this item, just put it on
553 		 * the right list.  Otherwise, we need to figure out which page needs
554 		 * the least enlargement in order to store the item.
555 		 */
556 
557 		if (i == seed_1)
558 		{
559 			*left++ = i;
560 			v->spl_nleft++;
561 			continue;
562 		}
563 		else if (i == seed_2)
564 		{
565 			*right++ = i;
566 			v->spl_nright++;
567 			continue;
568 		}
569 
570 		/* okay, which page needs least enlargement? */
571 		datum_alpha = GETENTRY(entryvec, i);
572 		union_dl = inner_int_union(datum_l, datum_alpha);
573 		union_dr = inner_int_union(datum_r, datum_alpha);
574 		rt__int_size(union_dl, &size_alpha);
575 		rt__int_size(union_dr, &size_beta);
576 
577 		/* pick which page to add it to */
578 		if (size_alpha - size_l < size_beta - size_r + WISH_F(v->spl_nleft, v->spl_nright, 0.01))
579 		{
580 			pfree(datum_l);
581 			pfree(union_dr);
582 			datum_l = union_dl;
583 			size_l = size_alpha;
584 			*left++ = i;
585 			v->spl_nleft++;
586 		}
587 		else
588 		{
589 			pfree(datum_r);
590 			pfree(union_dl);
591 			datum_r = union_dr;
592 			size_r = size_beta;
593 			*right++ = i;
594 			v->spl_nright++;
595 		}
596 	}
597 	pfree(costvector);
598 	*right = *left = FirstOffsetNumber;
599 
600 	v->spl_ldatum = PointerGetDatum(datum_l);
601 	v->spl_rdatum = PointerGetDatum(datum_r);
602 
603 	PG_RETURN_POINTER(v);
604 }
605