1 /*
2 * contrib/intarray/_int_gist.c
3 */
4 #include "postgres.h"
5
6 #include <limits.h>
7
8 #include "access/gist.h"
9 #include "access/stratnum.h"
10
11 #include "_int.h"
12
13 #define GETENTRY(vec,pos) ((ArrayType *) DatumGetPointer((vec)->vector[(pos)].key))
14
15 /*
16 * Control the maximum sparseness of compressed keys.
17 *
18 * The upper safe bound for this limit is half the maximum allocatable array
19 * size. A lower bound would give more guarantees that pathological data
20 * wouldn't eat excessive CPU and memory, but at the expense of breaking
21 * possibly working (after a fashion) indexes.
22 */
23 #define MAXNUMELTS (Min((MaxAllocSize / sizeof(Datum)),((MaxAllocSize - ARR_OVERHEAD_NONULLS(1)) / sizeof(int)))/2)
24 /* or: #define MAXNUMELTS 1000000 */
25
26 /*
27 ** GiST support methods
28 */
29 PG_FUNCTION_INFO_V1(g_int_consistent);
30 PG_FUNCTION_INFO_V1(g_int_compress);
31 PG_FUNCTION_INFO_V1(g_int_decompress);
32 PG_FUNCTION_INFO_V1(g_int_penalty);
33 PG_FUNCTION_INFO_V1(g_int_picksplit);
34 PG_FUNCTION_INFO_V1(g_int_union);
35 PG_FUNCTION_INFO_V1(g_int_same);
36
37
38 /*
39 ** The GiST Consistent method for _intments
40 ** Should return false if for all data items x below entry,
41 ** the predicate x op query == FALSE, where op is the oper
42 ** corresponding to strategy in the pg_amop table.
43 */
44 Datum
g_int_consistent(PG_FUNCTION_ARGS)45 g_int_consistent(PG_FUNCTION_ARGS)
46 {
47 GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
48 ArrayType *query = PG_GETARG_ARRAYTYPE_P_COPY(1);
49 StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
50
51 /* Oid subtype = PG_GETARG_OID(3); */
52 bool *recheck = (bool *) PG_GETARG_POINTER(4);
53 bool retval;
54
55 /* this is exact except for RTSameStrategyNumber */
56 *recheck = (strategy == RTSameStrategyNumber);
57
58 if (strategy == BooleanSearchStrategy)
59 {
60 retval = execconsistent((QUERYTYPE *) query,
61 (ArrayType *) DatumGetPointer(entry->key),
62 GIST_LEAF(entry));
63
64 pfree(query);
65 PG_RETURN_BOOL(retval);
66 }
67
68 /* sort query for fast search, key is already sorted */
69 CHECKARRVALID(query);
70 PREPAREARR(query);
71
72 switch (strategy)
73 {
74 case RTOverlapStrategyNumber:
75 retval = inner_int_overlap((ArrayType *) DatumGetPointer(entry->key),
76 query);
77 break;
78 case RTSameStrategyNumber:
79 if (GIST_LEAF(entry))
80 DirectFunctionCall3(g_int_same,
81 entry->key,
82 PointerGetDatum(query),
83 PointerGetDatum(&retval));
84 else
85 retval = inner_int_contains((ArrayType *) DatumGetPointer(entry->key),
86 query);
87 break;
88 case RTContainsStrategyNumber:
89 case RTOldContainsStrategyNumber:
90 retval = inner_int_contains((ArrayType *) DatumGetPointer(entry->key),
91 query);
92 break;
93 case RTContainedByStrategyNumber:
94 case RTOldContainedByStrategyNumber:
95 if (GIST_LEAF(entry))
96 retval = inner_int_contains(query,
97 (ArrayType *) DatumGetPointer(entry->key));
98 else
99 {
100 /*
101 * Unfortunately, because empty arrays could be anywhere in
102 * the index, we must search the whole tree.
103 */
104 retval = true;
105 }
106 break;
107 default:
108 retval = FALSE;
109 }
110 pfree(query);
111 PG_RETURN_BOOL(retval);
112 }
113
114 Datum
g_int_union(PG_FUNCTION_ARGS)115 g_int_union(PG_FUNCTION_ARGS)
116 {
117 GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0);
118 int *size = (int *) PG_GETARG_POINTER(1);
119 int32 i,
120 *ptr;
121 ArrayType *res;
122 int totlen = 0;
123
124 for (i = 0; i < entryvec->n; i++)
125 {
126 ArrayType *ent = GETENTRY(entryvec, i);
127
128 CHECKARRVALID(ent);
129 totlen += ARRNELEMS(ent);
130 }
131
132 res = new_intArrayType(totlen);
133 ptr = ARRPTR(res);
134
135 for (i = 0; i < entryvec->n; i++)
136 {
137 ArrayType *ent = GETENTRY(entryvec, i);
138 int nel;
139
140 nel = ARRNELEMS(ent);
141 memcpy(ptr, ARRPTR(ent), nel * sizeof(int32));
142 ptr += nel;
143 }
144
145 QSORT(res, 1);
146 res = _int_unique(res);
147 *size = VARSIZE(res);
148 PG_RETURN_POINTER(res);
149 }
150
151 /*
152 ** GiST Compress and Decompress methods
153 */
154 Datum
g_int_compress(PG_FUNCTION_ARGS)155 g_int_compress(PG_FUNCTION_ARGS)
156 {
157 GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
158 GISTENTRY *retval;
159 ArrayType *r;
160 int len,
161 lenr;
162 int *dr;
163 int i,
164 j,
165 cand;
166 int64 min;
167
168 if (entry->leafkey)
169 {
170 r = DatumGetArrayTypePCopy(entry->key);
171 CHECKARRVALID(r);
172 PREPAREARR(r);
173
174 if (ARRNELEMS(r) >= 2 * MAXNUMRANGE)
175 elog(NOTICE, "input array is too big (%d maximum allowed, %d current), use gist__intbig_ops opclass instead",
176 2 * MAXNUMRANGE - 1, ARRNELEMS(r));
177
178 retval = palloc(sizeof(GISTENTRY));
179 gistentryinit(*retval, PointerGetDatum(r),
180 entry->rel, entry->page, entry->offset, FALSE);
181
182 PG_RETURN_POINTER(retval);
183 }
184
185 /*
186 * leaf entries never compress one more time, only when entry->leafkey
187 * ==true, so now we work only with internal keys
188 */
189
190 r = DatumGetArrayTypeP(entry->key);
191 CHECKARRVALID(r);
192 if (ARRISEMPTY(r))
193 {
194 if (r != (ArrayType *) DatumGetPointer(entry->key))
195 pfree(r);
196 PG_RETURN_POINTER(entry);
197 }
198
199 if ((len = ARRNELEMS(r)) >= 2 * MAXNUMRANGE)
200 { /* compress */
201 if (r == (ArrayType *) DatumGetPointer(entry->key))
202 r = DatumGetArrayTypePCopy(entry->key);
203 r = resize_intArrayType(r, 2 * (len));
204
205 dr = ARRPTR(r);
206
207 /*
208 * "len" at this point is the number of ranges we will construct.
209 * "lenr" is the number of ranges we must eventually remove by
210 * merging, we must be careful to remove no more than this number.
211 */
212 lenr = len - MAXNUMRANGE;
213
214 /*
215 * Initially assume we can merge consecutive ints into a range. but we
216 * must count every value removed and stop when lenr runs out
217 */
218 for (j = i = len - 1; i > 0 && lenr > 0; i--, j--)
219 {
220 int r_end = dr[i];
221 int r_start = r_end;
222 while (i > 0 && lenr > 0 && dr[i-1] == r_start - 1)
223 --r_start, --i, --lenr;
224 dr[2*j] = r_start;
225 dr[2*j+1] = r_end;
226 }
227 /* just copy the rest, if any, as trivial ranges */
228 for (; i >= 0; i--, j--)
229 dr[2*j] = dr[2*j + 1] = dr[i];
230
231 if (++j)
232 {
233 /*
234 * shunt everything down to start at the right place
235 */
236 memmove((void *) &dr[0], (void *) &dr[2*j], 2*(len - j) * sizeof(int32));
237 }
238 /*
239 * make "len" be number of array elements, not ranges
240 */
241 len = 2*(len - j);
242 cand = 1;
243 while (len > MAXNUMRANGE * 2)
244 {
245 min = PG_INT64_MAX;
246 for (i = 2; i < len; i += 2)
247 if (min > ((int64)dr[i] - (int64)dr[i - 1]))
248 {
249 min = ((int64)dr[i] - (int64)dr[i - 1]);
250 cand = i;
251 }
252 memmove((void *) &dr[cand - 1], (void *) &dr[cand + 1], (len - cand - 1) * sizeof(int32));
253 len -= 2;
254 }
255 /*
256 * check sparseness of result
257 */
258 lenr = internal_size(dr, len);
259 if (lenr < 0 || lenr > MAXNUMELTS)
260 ereport(ERROR,
261 (errmsg("data is too sparse, recreate index using gist__intbig_ops opclass instead")));
262
263 r = resize_intArrayType(r, len);
264 retval = palloc(sizeof(GISTENTRY));
265 gistentryinit(*retval, PointerGetDatum(r),
266 entry->rel, entry->page, entry->offset, FALSE);
267 PG_RETURN_POINTER(retval);
268 }
269 else
270 PG_RETURN_POINTER(entry);
271 }
272
273 Datum
g_int_decompress(PG_FUNCTION_ARGS)274 g_int_decompress(PG_FUNCTION_ARGS)
275 {
276 GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
277 GISTENTRY *retval;
278 ArrayType *r;
279 int *dr,
280 lenr;
281 ArrayType *in;
282 int lenin;
283 int *din;
284 int i,
285 j;
286
287 in = DatumGetArrayTypeP(entry->key);
288
289 CHECKARRVALID(in);
290 if (ARRISEMPTY(in))
291 {
292 if (in != (ArrayType *) DatumGetPointer(entry->key))
293 {
294 retval = palloc(sizeof(GISTENTRY));
295 gistentryinit(*retval, PointerGetDatum(in),
296 entry->rel, entry->page, entry->offset, FALSE);
297 PG_RETURN_POINTER(retval);
298 }
299
300 PG_RETURN_POINTER(entry);
301 }
302
303 lenin = ARRNELEMS(in);
304
305 if (lenin < 2 * MAXNUMRANGE)
306 { /* not compressed value */
307 if (in != (ArrayType *) DatumGetPointer(entry->key))
308 {
309 retval = palloc(sizeof(GISTENTRY));
310 gistentryinit(*retval, PointerGetDatum(in),
311 entry->rel, entry->page, entry->offset, FALSE);
312
313 PG_RETURN_POINTER(retval);
314 }
315 PG_RETURN_POINTER(entry);
316 }
317
318 din = ARRPTR(in);
319 lenr = internal_size(din, lenin);
320 if (lenr < 0 || lenr > MAXNUMELTS)
321 ereport(ERROR,
322 (errmsg("compressed array is too big, recreate index using gist__intbig_ops opclass instead")));
323
324 r = new_intArrayType(lenr);
325 dr = ARRPTR(r);
326
327 for (i = 0; i < lenin; i += 2)
328 for (j = din[i]; j <= din[i + 1]; j++)
329 if ((!i) || *(dr - 1) != j)
330 *dr++ = j;
331
332 if (in != (ArrayType *) DatumGetPointer(entry->key))
333 pfree(in);
334 retval = palloc(sizeof(GISTENTRY));
335 gistentryinit(*retval, PointerGetDatum(r),
336 entry->rel, entry->page, entry->offset, FALSE);
337
338 PG_RETURN_POINTER(retval);
339 }
340
341 /*
342 ** The GiST Penalty method for _intments
343 */
344 Datum
g_int_penalty(PG_FUNCTION_ARGS)345 g_int_penalty(PG_FUNCTION_ARGS)
346 {
347 GISTENTRY *origentry = (GISTENTRY *) PG_GETARG_POINTER(0);
348 GISTENTRY *newentry = (GISTENTRY *) PG_GETARG_POINTER(1);
349 float *result = (float *) PG_GETARG_POINTER(2);
350 ArrayType *ud;
351 float tmp1,
352 tmp2;
353
354 ud = inner_int_union((ArrayType *) DatumGetPointer(origentry->key),
355 (ArrayType *) DatumGetPointer(newentry->key));
356 rt__int_size(ud, &tmp1);
357 rt__int_size((ArrayType *) DatumGetPointer(origentry->key), &tmp2);
358 *result = tmp1 - tmp2;
359 pfree(ud);
360
361 PG_RETURN_POINTER(result);
362 }
363
364
365
366 Datum
g_int_same(PG_FUNCTION_ARGS)367 g_int_same(PG_FUNCTION_ARGS)
368 {
369 ArrayType *a = PG_GETARG_ARRAYTYPE_P(0);
370 ArrayType *b = PG_GETARG_ARRAYTYPE_P(1);
371 bool *result = (bool *) PG_GETARG_POINTER(2);
372 int32 n = ARRNELEMS(a);
373 int32 *da,
374 *db;
375
376 CHECKARRVALID(a);
377 CHECKARRVALID(b);
378
379 if (n != ARRNELEMS(b))
380 {
381 *result = false;
382 PG_RETURN_POINTER(result);
383 }
384 *result = TRUE;
385 da = ARRPTR(a);
386 db = ARRPTR(b);
387 while (n--)
388 {
389 if (*da++ != *db++)
390 {
391 *result = FALSE;
392 break;
393 }
394 }
395
396 PG_RETURN_POINTER(result);
397 }
398
399 /*****************************************************************
400 ** Common GiST Method
401 *****************************************************************/
402
403 typedef struct
404 {
405 OffsetNumber pos;
406 float cost;
407 } SPLITCOST;
408
409 static int
comparecost(const void * a,const void * b)410 comparecost(const void *a, const void *b)
411 {
412 if (((const SPLITCOST *) a)->cost == ((const SPLITCOST *) b)->cost)
413 return 0;
414 else
415 return (((const SPLITCOST *) a)->cost > ((const SPLITCOST *) b)->cost) ? 1 : -1;
416 }
417
418 /*
419 ** The GiST PickSplit method for _intments
420 ** We use Guttman's poly time split algorithm
421 */
422 Datum
g_int_picksplit(PG_FUNCTION_ARGS)423 g_int_picksplit(PG_FUNCTION_ARGS)
424 {
425 GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0);
426 GIST_SPLITVEC *v = (GIST_SPLITVEC *) PG_GETARG_POINTER(1);
427 OffsetNumber i,
428 j;
429 ArrayType *datum_alpha,
430 *datum_beta;
431 ArrayType *datum_l,
432 *datum_r;
433 ArrayType *union_d,
434 *union_dl,
435 *union_dr;
436 ArrayType *inter_d;
437 bool firsttime;
438 float size_alpha,
439 size_beta,
440 size_union,
441 size_inter;
442 float size_waste,
443 waste;
444 float size_l,
445 size_r;
446 int nbytes;
447 OffsetNumber seed_1 = 0,
448 seed_2 = 0;
449 OffsetNumber *left,
450 *right;
451 OffsetNumber maxoff;
452 SPLITCOST *costvector;
453
454 #ifdef GIST_DEBUG
455 elog(DEBUG3, "--------picksplit %d", entryvec->n);
456 #endif
457
458 maxoff = entryvec->n - 2;
459 nbytes = (maxoff + 2) * sizeof(OffsetNumber);
460 v->spl_left = (OffsetNumber *) palloc(nbytes);
461 v->spl_right = (OffsetNumber *) palloc(nbytes);
462
463 firsttime = true;
464 waste = 0.0;
465 for (i = FirstOffsetNumber; i < maxoff; i = OffsetNumberNext(i))
466 {
467 datum_alpha = GETENTRY(entryvec, i);
468 for (j = OffsetNumberNext(i); j <= maxoff; j = OffsetNumberNext(j))
469 {
470 datum_beta = GETENTRY(entryvec, j);
471
472 /* compute the wasted space by unioning these guys */
473 /* size_waste = size_union - size_inter; */
474 union_d = inner_int_union(datum_alpha, datum_beta);
475 rt__int_size(union_d, &size_union);
476 inter_d = inner_int_inter(datum_alpha, datum_beta);
477 rt__int_size(inter_d, &size_inter);
478 size_waste = size_union - size_inter;
479
480 pfree(union_d);
481 pfree(inter_d);
482
483 /*
484 * are these a more promising split that what we've already seen?
485 */
486
487 if (size_waste > waste || firsttime)
488 {
489 waste = size_waste;
490 seed_1 = i;
491 seed_2 = j;
492 firsttime = false;
493 }
494 }
495 }
496
497 left = v->spl_left;
498 v->spl_nleft = 0;
499 right = v->spl_right;
500 v->spl_nright = 0;
501 if (seed_1 == 0 || seed_2 == 0)
502 {
503 seed_1 = 1;
504 seed_2 = 2;
505 }
506
507 datum_alpha = GETENTRY(entryvec, seed_1);
508 datum_l = copy_intArrayType(datum_alpha);
509 rt__int_size(datum_l, &size_l);
510 datum_beta = GETENTRY(entryvec, seed_2);
511 datum_r = copy_intArrayType(datum_beta);
512 rt__int_size(datum_r, &size_r);
513
514 maxoff = OffsetNumberNext(maxoff);
515
516 /*
517 * sort entries
518 */
519 costvector = (SPLITCOST *) palloc(sizeof(SPLITCOST) * maxoff);
520 for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
521 {
522 costvector[i - 1].pos = i;
523 datum_alpha = GETENTRY(entryvec, i);
524 union_d = inner_int_union(datum_l, datum_alpha);
525 rt__int_size(union_d, &size_alpha);
526 pfree(union_d);
527 union_d = inner_int_union(datum_r, datum_alpha);
528 rt__int_size(union_d, &size_beta);
529 pfree(union_d);
530 costvector[i - 1].cost = Abs((size_alpha - size_l) - (size_beta - size_r));
531 }
532 qsort((void *) costvector, maxoff, sizeof(SPLITCOST), comparecost);
533
534 /*
535 * Now split up the regions between the two seeds. An important property
536 * of this split algorithm is that the split vector v has the indices of
537 * items to be split in order in its left and right vectors. We exploit
538 * this property by doing a merge in the code that actually splits the
539 * page.
540 *
541 * For efficiency, we also place the new index tuple in this loop. This is
542 * handled at the very end, when we have placed all the existing tuples
543 * and i == maxoff + 1.
544 */
545
546
547 for (j = 0; j < maxoff; j++)
548 {
549 i = costvector[j].pos;
550
551 /*
552 * If we've already decided where to place this item, just put it on
553 * the right list. Otherwise, we need to figure out which page needs
554 * the least enlargement in order to store the item.
555 */
556
557 if (i == seed_1)
558 {
559 *left++ = i;
560 v->spl_nleft++;
561 continue;
562 }
563 else if (i == seed_2)
564 {
565 *right++ = i;
566 v->spl_nright++;
567 continue;
568 }
569
570 /* okay, which page needs least enlargement? */
571 datum_alpha = GETENTRY(entryvec, i);
572 union_dl = inner_int_union(datum_l, datum_alpha);
573 union_dr = inner_int_union(datum_r, datum_alpha);
574 rt__int_size(union_dl, &size_alpha);
575 rt__int_size(union_dr, &size_beta);
576
577 /* pick which page to add it to */
578 if (size_alpha - size_l < size_beta - size_r + WISH_F(v->spl_nleft, v->spl_nright, 0.01))
579 {
580 pfree(datum_l);
581 pfree(union_dr);
582 datum_l = union_dl;
583 size_l = size_alpha;
584 *left++ = i;
585 v->spl_nleft++;
586 }
587 else
588 {
589 pfree(datum_r);
590 pfree(union_dl);
591 datum_r = union_dr;
592 size_r = size_beta;
593 *right++ = i;
594 v->spl_nright++;
595 }
596 }
597 pfree(costvector);
598 *right = *left = FirstOffsetNumber;
599
600 v->spl_ldatum = PointerGetDatum(datum_l);
601 v->spl_rdatum = PointerGetDatum(datum_r);
602
603 PG_RETURN_POINTER(v);
604 }
605