1 /*
2  * contrib/pg_trgm/trgm_gist.c
3  */
4 #include "postgres.h"
5 
6 #include "trgm.h"
7 
8 #include "access/stratnum.h"
9 #include "fmgr.h"
10 #include "port/pg_bitutils.h"
11 
12 
13 typedef struct
14 {
15 	/* most recent inputs to gtrgm_consistent */
16 	StrategyNumber strategy;
17 	text	   *query;
18 	/* extracted trigrams for query */
19 	TRGM	   *trigrams;
20 	/* if a regex operator, the extracted graph */
21 	TrgmPackedGraph *graph;
22 
23 	/*
24 	 * The "query" and "trigrams" are stored in the same palloc block as this
25 	 * cache struct, at MAXALIGN'ed offsets.  The graph however isn't.
26 	 */
27 } gtrgm_consistent_cache;
28 
29 #define GETENTRY(vec,pos) ((TRGM *) DatumGetPointer((vec)->vector[(pos)].key))
30 
31 
32 PG_FUNCTION_INFO_V1(gtrgm_in);
33 PG_FUNCTION_INFO_V1(gtrgm_out);
34 PG_FUNCTION_INFO_V1(gtrgm_compress);
35 PG_FUNCTION_INFO_V1(gtrgm_decompress);
36 PG_FUNCTION_INFO_V1(gtrgm_consistent);
37 PG_FUNCTION_INFO_V1(gtrgm_distance);
38 PG_FUNCTION_INFO_V1(gtrgm_union);
39 PG_FUNCTION_INFO_V1(gtrgm_same);
40 PG_FUNCTION_INFO_V1(gtrgm_penalty);
41 PG_FUNCTION_INFO_V1(gtrgm_picksplit);
42 
43 
44 Datum
45 gtrgm_in(PG_FUNCTION_ARGS)
46 {
47 	elog(ERROR, "not implemented");
48 	PG_RETURN_DATUM(0);
49 }
50 
51 Datum
52 gtrgm_out(PG_FUNCTION_ARGS)
53 {
54 	elog(ERROR, "not implemented");
55 	PG_RETURN_DATUM(0);
56 }
57 
58 static void
59 makesign(BITVECP sign, TRGM *a)
60 {
61 	int32		k,
62 				len = ARRNELEM(a);
63 	trgm	   *ptr = GETARR(a);
64 	int32		tmp = 0;
65 
66 	MemSet((void *) sign, 0, sizeof(BITVEC));
67 	SETBIT(sign, SIGLENBIT);	/* set last unused bit */
68 	for (k = 0; k < len; k++)
69 	{
70 		CPTRGM(((char *) &tmp), ptr + k);
71 		HASH(sign, tmp);
72 	}
73 }
74 
75 Datum
76 gtrgm_compress(PG_FUNCTION_ARGS)
77 {
78 	GISTENTRY  *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
79 	GISTENTRY  *retval = entry;
80 
81 	if (entry->leafkey)
82 	{							/* trgm */
83 		TRGM	   *res;
84 		text	   *val = DatumGetTextPP(entry->key);
85 
86 		res = generate_trgm(VARDATA_ANY(val), VARSIZE_ANY_EXHDR(val));
87 		retval = (GISTENTRY *) palloc(sizeof(GISTENTRY));
88 		gistentryinit(*retval, PointerGetDatum(res),
89 					  entry->rel, entry->page,
90 					  entry->offset, false);
91 	}
92 	else if (ISSIGNKEY(DatumGetPointer(entry->key)) &&
93 			 !ISALLTRUE(DatumGetPointer(entry->key)))
94 	{
95 		int32		i,
96 					len;
97 		TRGM	   *res;
98 		BITVECP		sign = GETSIGN(DatumGetPointer(entry->key));
99 
100 		LOOPBYTE
101 		{
102 			if ((sign[i] & 0xff) != 0xff)
103 				PG_RETURN_POINTER(retval);
104 		}
105 
106 		len = CALCGTSIZE(SIGNKEY | ALLISTRUE, 0);
107 		res = (TRGM *) palloc(len);
108 		SET_VARSIZE(res, len);
109 		res->flag = SIGNKEY | ALLISTRUE;
110 
111 		retval = (GISTENTRY *) palloc(sizeof(GISTENTRY));
112 		gistentryinit(*retval, PointerGetDatum(res),
113 					  entry->rel, entry->page,
114 					  entry->offset, false);
115 	}
116 	PG_RETURN_POINTER(retval);
117 }
118 
119 Datum
120 gtrgm_decompress(PG_FUNCTION_ARGS)
121 {
122 	GISTENTRY  *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
123 	GISTENTRY  *retval;
124 	text	   *key;
125 
126 	key = DatumGetTextPP(entry->key);
127 
128 	if (key != (text *) DatumGetPointer(entry->key))
129 	{
130 		/* need to pass back the decompressed item */
131 		retval = palloc(sizeof(GISTENTRY));
132 		gistentryinit(*retval, PointerGetDatum(key),
133 					  entry->rel, entry->page, entry->offset, entry->leafkey);
134 		PG_RETURN_POINTER(retval);
135 	}
136 	else
137 	{
138 		/* we can return the entry as-is */
139 		PG_RETURN_POINTER(entry);
140 	}
141 }
142 
143 static int32
144 cnt_sml_sign_common(TRGM *qtrg, BITVECP sign)
145 {
146 	int32		count = 0;
147 	int32		k,
148 				len = ARRNELEM(qtrg);
149 	trgm	   *ptr = GETARR(qtrg);
150 	int32		tmp = 0;
151 
152 	for (k = 0; k < len; k++)
153 	{
154 		CPTRGM(((char *) &tmp), ptr + k);
155 		count += GETBIT(sign, HASHVAL(tmp));
156 	}
157 
158 	return count;
159 }
160 
161 Datum
162 gtrgm_consistent(PG_FUNCTION_ARGS)
163 {
164 	GISTENTRY  *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
165 	text	   *query = PG_GETARG_TEXT_P(1);
166 	StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
167 
168 	/* Oid		subtype = PG_GETARG_OID(3); */
169 	bool	   *recheck = (bool *) PG_GETARG_POINTER(4);
170 	TRGM	   *key = (TRGM *) DatumGetPointer(entry->key);
171 	TRGM	   *qtrg;
172 	bool		res;
173 	Size		querysize = VARSIZE(query);
174 	gtrgm_consistent_cache *cache;
175 	double		nlimit;
176 
177 	/*
178 	 * We keep the extracted trigrams in cache, because trigram extraction is
179 	 * relatively CPU-expensive.  When trying to reuse a cached value, check
180 	 * strategy number not just query itself, because trigram extraction
181 	 * depends on strategy.
182 	 *
183 	 * The cached structure is a single palloc chunk containing the
184 	 * gtrgm_consistent_cache header, then the input query (4-byte length
185 	 * word, uncompressed, starting at a MAXALIGN boundary), then the TRGM
186 	 * value (also starting at a MAXALIGN boundary).  However we don't try to
187 	 * include the regex graph (if any) in that struct.  (XXX currently, this
188 	 * approach can leak regex graphs across index rescans.  Not clear if
189 	 * that's worth fixing.)
190 	 */
191 	cache = (gtrgm_consistent_cache *) fcinfo->flinfo->fn_extra;
192 	if (cache == NULL ||
193 		cache->strategy != strategy ||
194 		VARSIZE(cache->query) != querysize ||
195 		memcmp((char *) cache->query, (char *) query, querysize) != 0)
196 	{
197 		gtrgm_consistent_cache *newcache;
198 		TrgmPackedGraph *graph = NULL;
199 		Size		qtrgsize;
200 
201 		switch (strategy)
202 		{
203 			case SimilarityStrategyNumber:
204 			case WordSimilarityStrategyNumber:
205 			case StrictWordSimilarityStrategyNumber:
206 				qtrg = generate_trgm(VARDATA(query),
207 									 querysize - VARHDRSZ);
208 				break;
209 			case ILikeStrategyNumber:
210 #ifndef IGNORECASE
211 				elog(ERROR, "cannot handle ~~* with case-sensitive trigrams");
212 #endif
213 				/* FALL THRU */
214 			case LikeStrategyNumber:
215 				qtrg = generate_wildcard_trgm(VARDATA(query),
216 											  querysize - VARHDRSZ);
217 				break;
218 			case RegExpICaseStrategyNumber:
219 #ifndef IGNORECASE
220 				elog(ERROR, "cannot handle ~* with case-sensitive trigrams");
221 #endif
222 				/* FALL THRU */
223 			case RegExpStrategyNumber:
224 				qtrg = createTrgmNFA(query, PG_GET_COLLATION(),
225 									 &graph, fcinfo->flinfo->fn_mcxt);
226 				/* just in case an empty array is returned ... */
227 				if (qtrg && ARRNELEM(qtrg) <= 0)
228 				{
229 					pfree(qtrg);
230 					qtrg = NULL;
231 				}
232 				break;
233 			default:
234 				elog(ERROR, "unrecognized strategy number: %d", strategy);
235 				qtrg = NULL;	/* keep compiler quiet */
236 				break;
237 		}
238 
239 		qtrgsize = qtrg ? VARSIZE(qtrg) : 0;
240 
241 		newcache = (gtrgm_consistent_cache *)
242 			MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
243 							   MAXALIGN(sizeof(gtrgm_consistent_cache)) +
244 							   MAXALIGN(querysize) +
245 							   qtrgsize);
246 
247 		newcache->strategy = strategy;
248 		newcache->query = (text *)
249 			((char *) newcache + MAXALIGN(sizeof(gtrgm_consistent_cache)));
250 		memcpy((char *) newcache->query, (char *) query, querysize);
251 		if (qtrg)
252 		{
253 			newcache->trigrams = (TRGM *)
254 				((char *) newcache->query + MAXALIGN(querysize));
255 			memcpy((char *) newcache->trigrams, (char *) qtrg, qtrgsize);
256 			/* release qtrg in case it was made in fn_mcxt */
257 			pfree(qtrg);
258 		}
259 		else
260 			newcache->trigrams = NULL;
261 		newcache->graph = graph;
262 
263 		if (cache)
264 			pfree(cache);
265 		fcinfo->flinfo->fn_extra = (void *) newcache;
266 		cache = newcache;
267 	}
268 
269 	qtrg = cache->trigrams;
270 
271 	switch (strategy)
272 	{
273 		case SimilarityStrategyNumber:
274 		case WordSimilarityStrategyNumber:
275 		case StrictWordSimilarityStrategyNumber:
276 
277 			/*
278 			 * Similarity search is exact. (Strict) word similarity search is
279 			 * inexact
280 			 */
281 			*recheck = (strategy != SimilarityStrategyNumber);
282 
283 			nlimit = index_strategy_get_limit(strategy);
284 
285 			if (GIST_LEAF(entry))
286 			{					/* all leafs contains orig trgm */
287 				double		tmpsml = cnt_sml(qtrg, key, *recheck);
288 
289 				res = (tmpsml >= nlimit);
290 			}
291 			else if (ISALLTRUE(key))
292 			{					/* non-leaf contains signature */
293 				res = true;
294 			}
295 			else
296 			{					/* non-leaf contains signature */
297 				int32		count = cnt_sml_sign_common(qtrg, GETSIGN(key));
298 				int32		len = ARRNELEM(qtrg);
299 
300 				if (len == 0)
301 					res = false;
302 				else
303 					res = (((((float8) count) / ((float8) len))) >= nlimit);
304 			}
305 			break;
306 		case ILikeStrategyNumber:
307 #ifndef IGNORECASE
308 			elog(ERROR, "cannot handle ~~* with case-sensitive trigrams");
309 #endif
310 			/* FALL THRU */
311 		case LikeStrategyNumber:
312 			/* Wildcard search is inexact */
313 			*recheck = true;
314 
315 			/*
316 			 * Check if all the extracted trigrams can be present in child
317 			 * nodes.
318 			 */
319 			if (GIST_LEAF(entry))
320 			{					/* all leafs contains orig trgm */
321 				res = trgm_contained_by(qtrg, key);
322 			}
323 			else if (ISALLTRUE(key))
324 			{					/* non-leaf contains signature */
325 				res = true;
326 			}
327 			else
328 			{					/* non-leaf contains signature */
329 				int32		k,
330 							tmp = 0,
331 							len = ARRNELEM(qtrg);
332 				trgm	   *ptr = GETARR(qtrg);
333 				BITVECP		sign = GETSIGN(key);
334 
335 				res = true;
336 				for (k = 0; k < len; k++)
337 				{
338 					CPTRGM(((char *) &tmp), ptr + k);
339 					if (!GETBIT(sign, HASHVAL(tmp)))
340 					{
341 						res = false;
342 						break;
343 					}
344 				}
345 			}
346 			break;
347 		case RegExpICaseStrategyNumber:
348 #ifndef IGNORECASE
349 			elog(ERROR, "cannot handle ~* with case-sensitive trigrams");
350 #endif
351 			/* FALL THRU */
352 		case RegExpStrategyNumber:
353 			/* Regexp search is inexact */
354 			*recheck = true;
355 
356 			/* Check regex match as much as we can with available info */
357 			if (qtrg)
358 			{
359 				if (GIST_LEAF(entry))
360 				{				/* all leafs contains orig trgm */
361 					bool	   *check;
362 
363 					check = trgm_presence_map(qtrg, key);
364 					res = trigramsMatchGraph(cache->graph, check);
365 					pfree(check);
366 				}
367 				else if (ISALLTRUE(key))
368 				{				/* non-leaf contains signature */
369 					res = true;
370 				}
371 				else
372 				{				/* non-leaf contains signature */
373 					int32		k,
374 								tmp = 0,
375 								len = ARRNELEM(qtrg);
376 					trgm	   *ptr = GETARR(qtrg);
377 					BITVECP		sign = GETSIGN(key);
378 					bool	   *check;
379 
380 					/*
381 					 * GETBIT() tests may give false positives, due to limited
382 					 * size of the sign array.  But since trigramsMatchGraph()
383 					 * implements a monotone boolean function, false positives
384 					 * in the check array can't lead to false negative answer.
385 					 * So we can apply trigramsMatchGraph despite uncertainty,
386 					 * and that usefully improves the quality of the search.
387 					 */
388 					check = (bool *) palloc(len * sizeof(bool));
389 					for (k = 0; k < len; k++)
390 					{
391 						CPTRGM(((char *) &tmp), ptr + k);
392 						check[k] = GETBIT(sign, HASHVAL(tmp));
393 					}
394 					res = trigramsMatchGraph(cache->graph, check);
395 					pfree(check);
396 				}
397 			}
398 			else
399 			{
400 				/* trigram-free query must be rechecked everywhere */
401 				res = true;
402 			}
403 			break;
404 		default:
405 			elog(ERROR, "unrecognized strategy number: %d", strategy);
406 			res = false;		/* keep compiler quiet */
407 			break;
408 	}
409 
410 	PG_RETURN_BOOL(res);
411 }
412 
413 Datum
414 gtrgm_distance(PG_FUNCTION_ARGS)
415 {
416 	GISTENTRY  *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
417 	text	   *query = PG_GETARG_TEXT_P(1);
418 	StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
419 
420 	/* Oid		subtype = PG_GETARG_OID(3); */
421 	bool	   *recheck = (bool *) PG_GETARG_POINTER(4);
422 	TRGM	   *key = (TRGM *) DatumGetPointer(entry->key);
423 	TRGM	   *qtrg;
424 	float8		res;
425 	Size		querysize = VARSIZE(query);
426 	char	   *cache = (char *) fcinfo->flinfo->fn_extra;
427 
428 	/*
429 	 * Cache the generated trigrams across multiple calls with the same query.
430 	 */
431 	if (cache == NULL ||
432 		VARSIZE(cache) != querysize ||
433 		memcmp(cache, query, querysize) != 0)
434 	{
435 		char	   *newcache;
436 
437 		qtrg = generate_trgm(VARDATA(query), querysize - VARHDRSZ);
438 
439 		newcache = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
440 									  MAXALIGN(querysize) +
441 									  VARSIZE(qtrg));
442 
443 		memcpy(newcache, query, querysize);
444 		memcpy(newcache + MAXALIGN(querysize), qtrg, VARSIZE(qtrg));
445 
446 		if (cache)
447 			pfree(cache);
448 		fcinfo->flinfo->fn_extra = newcache;
449 		cache = newcache;
450 	}
451 
452 	qtrg = (TRGM *) (cache + MAXALIGN(querysize));
453 
454 	switch (strategy)
455 	{
456 		case DistanceStrategyNumber:
457 		case WordDistanceStrategyNumber:
458 		case StrictWordDistanceStrategyNumber:
459 			/* Only plain trigram distance is exact */
460 			*recheck = (strategy != DistanceStrategyNumber);
461 			if (GIST_LEAF(entry))
462 			{					/* all leafs contains orig trgm */
463 
464 				/*
465 				 * Prevent gcc optimizing the sml variable using volatile
466 				 * keyword. Otherwise res can differ from the
467 				 * word_similarity_dist_op() function.
468 				 */
469 				float4 volatile sml = cnt_sml(qtrg, key, *recheck);
470 
471 				res = 1.0 - sml;
472 			}
473 			else if (ISALLTRUE(key))
474 			{					/* all leafs contains orig trgm */
475 				res = 0.0;
476 			}
477 			else
478 			{					/* non-leaf contains signature */
479 				int32		count = cnt_sml_sign_common(qtrg, GETSIGN(key));
480 				int32		len = ARRNELEM(qtrg);
481 
482 				res = (len == 0) ? -1.0 : 1.0 - ((float8) count) / ((float8) len);
483 			}
484 			break;
485 		default:
486 			elog(ERROR, "unrecognized strategy number: %d", strategy);
487 			res = 0;			/* keep compiler quiet */
488 			break;
489 	}
490 
491 	PG_RETURN_FLOAT8(res);
492 }
493 
494 static int32
495 unionkey(BITVECP sbase, TRGM *add)
496 {
497 	int32		i;
498 
499 	if (ISSIGNKEY(add))
500 	{
501 		BITVECP		sadd = GETSIGN(add);
502 
503 		if (ISALLTRUE(add))
504 			return 1;
505 
506 		LOOPBYTE
507 			sbase[i] |= sadd[i];
508 	}
509 	else
510 	{
511 		trgm	   *ptr = GETARR(add);
512 		int32		tmp = 0;
513 
514 		for (i = 0; i < ARRNELEM(add); i++)
515 		{
516 			CPTRGM(((char *) &tmp), ptr + i);
517 			HASH(sbase, tmp);
518 		}
519 	}
520 	return 0;
521 }
522 
523 
524 Datum
525 gtrgm_union(PG_FUNCTION_ARGS)
526 {
527 	GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0);
528 	int32		len = entryvec->n;
529 	int		   *size = (int *) PG_GETARG_POINTER(1);
530 	BITVEC		base;
531 	int32		i;
532 	int32		flag = 0;
533 	TRGM	   *result;
534 
535 	MemSet((void *) base, 0, sizeof(BITVEC));
536 	for (i = 0; i < len; i++)
537 	{
538 		if (unionkey(base, GETENTRY(entryvec, i)))
539 		{
540 			flag = ALLISTRUE;
541 			break;
542 		}
543 	}
544 
545 	flag |= SIGNKEY;
546 	len = CALCGTSIZE(flag, 0);
547 	result = (TRGM *) palloc(len);
548 	SET_VARSIZE(result, len);
549 	result->flag = flag;
550 	if (!ISALLTRUE(result))
551 		memcpy((void *) GETSIGN(result), (void *) base, sizeof(BITVEC));
552 	*size = len;
553 
554 	PG_RETURN_POINTER(result);
555 }
556 
557 Datum
558 gtrgm_same(PG_FUNCTION_ARGS)
559 {
560 	TRGM	   *a = (TRGM *) PG_GETARG_POINTER(0);
561 	TRGM	   *b = (TRGM *) PG_GETARG_POINTER(1);
562 	bool	   *result = (bool *) PG_GETARG_POINTER(2);
563 
564 	if (ISSIGNKEY(a))
565 	{							/* then b also ISSIGNKEY */
566 		if (ISALLTRUE(a) && ISALLTRUE(b))
567 			*result = true;
568 		else if (ISALLTRUE(a))
569 			*result = false;
570 		else if (ISALLTRUE(b))
571 			*result = false;
572 		else
573 		{
574 			int32		i;
575 			BITVECP		sa = GETSIGN(a),
576 						sb = GETSIGN(b);
577 
578 			*result = true;
579 			LOOPBYTE
580 			{
581 				if (sa[i] != sb[i])
582 				{
583 					*result = false;
584 					break;
585 				}
586 			}
587 		}
588 	}
589 	else
590 	{							/* a and b ISARRKEY */
591 		int32		lena = ARRNELEM(a),
592 					lenb = ARRNELEM(b);
593 
594 		if (lena != lenb)
595 			*result = false;
596 		else
597 		{
598 			trgm	   *ptra = GETARR(a),
599 					   *ptrb = GETARR(b);
600 			int32		i;
601 
602 			*result = true;
603 			for (i = 0; i < lena; i++)
604 				if (CMPTRGM(ptra + i, ptrb + i))
605 				{
606 					*result = false;
607 					break;
608 				}
609 		}
610 	}
611 
612 	PG_RETURN_POINTER(result);
613 }
614 
615 static int32
616 sizebitvec(BITVECP sign)
617 {
618 	return pg_popcount(sign, SIGLEN);
619 }
620 
621 static int
622 hemdistsign(BITVECP a, BITVECP b)
623 {
624 	int			i,
625 				diff,
626 				dist = 0;
627 
628 	LOOPBYTE
629 	{
630 		diff = (unsigned char) (a[i] ^ b[i]);
631 		/* Using the popcount functions here isn't likely to win */
632 		dist += pg_number_of_ones[diff];
633 	}
634 	return dist;
635 }
636 
637 static int
638 hemdist(TRGM *a, TRGM *b)
639 {
640 	if (ISALLTRUE(a))
641 	{
642 		if (ISALLTRUE(b))
643 			return 0;
644 		else
645 			return SIGLENBIT - sizebitvec(GETSIGN(b));
646 	}
647 	else if (ISALLTRUE(b))
648 		return SIGLENBIT - sizebitvec(GETSIGN(a));
649 
650 	return hemdistsign(GETSIGN(a), GETSIGN(b));
651 }
652 
653 Datum
654 gtrgm_penalty(PG_FUNCTION_ARGS)
655 {
656 	GISTENTRY  *origentry = (GISTENTRY *) PG_GETARG_POINTER(0); /* always ISSIGNKEY */
657 	GISTENTRY  *newentry = (GISTENTRY *) PG_GETARG_POINTER(1);
658 	float	   *penalty = (float *) PG_GETARG_POINTER(2);
659 	TRGM	   *origval = (TRGM *) DatumGetPointer(origentry->key);
660 	TRGM	   *newval = (TRGM *) DatumGetPointer(newentry->key);
661 	BITVECP		orig = GETSIGN(origval);
662 
663 	*penalty = 0.0;
664 
665 	if (ISARRKEY(newval))
666 	{
667 		char	   *cache = (char *) fcinfo->flinfo->fn_extra;
668 		TRGM	   *cachedVal = (TRGM *) (cache + MAXALIGN(sizeof(BITVEC)));
669 		Size		newvalsize = VARSIZE(newval);
670 		BITVECP		sign;
671 
672 		/*
673 		 * Cache the sign data across multiple calls with the same newval.
674 		 */
675 		if (cache == NULL ||
676 			VARSIZE(cachedVal) != newvalsize ||
677 			memcmp(cachedVal, newval, newvalsize) != 0)
678 		{
679 			char	   *newcache;
680 
681 			newcache = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
682 										  MAXALIGN(sizeof(BITVEC)) +
683 										  newvalsize);
684 
685 			makesign((BITVECP) newcache, newval);
686 
687 			cachedVal = (TRGM *) (newcache + MAXALIGN(sizeof(BITVEC)));
688 			memcpy(cachedVal, newval, newvalsize);
689 
690 			if (cache)
691 				pfree(cache);
692 			fcinfo->flinfo->fn_extra = newcache;
693 			cache = newcache;
694 		}
695 
696 		sign = (BITVECP) cache;
697 
698 		if (ISALLTRUE(origval))
699 			*penalty = ((float) (SIGLENBIT - sizebitvec(sign))) / (float) (SIGLENBIT + 1);
700 		else
701 			*penalty = hemdistsign(sign, orig);
702 	}
703 	else
704 		*penalty = hemdist(origval, newval);
705 	PG_RETURN_POINTER(penalty);
706 }
707 
708 typedef struct
709 {
710 	bool		allistrue;
711 	BITVEC		sign;
712 } CACHESIGN;
713 
714 static void
715 fillcache(CACHESIGN *item, TRGM *key)
716 {
717 	item->allistrue = false;
718 	if (ISARRKEY(key))
719 		makesign(item->sign, key);
720 	else if (ISALLTRUE(key))
721 		item->allistrue = true;
722 	else
723 		memcpy((void *) item->sign, (void *) GETSIGN(key), sizeof(BITVEC));
724 }
725 
726 #define WISH_F(a,b,c) (double)( -(double)(((a)-(b))*((a)-(b))*((a)-(b)))*(c) )
727 typedef struct
728 {
729 	OffsetNumber pos;
730 	int32		cost;
731 } SPLITCOST;
732 
733 static int
734 comparecost(const void *a, const void *b)
735 {
736 	if (((const SPLITCOST *) a)->cost == ((const SPLITCOST *) b)->cost)
737 		return 0;
738 	else
739 		return (((const SPLITCOST *) a)->cost > ((const SPLITCOST *) b)->cost) ? 1 : -1;
740 }
741 
742 
743 static int
744 hemdistcache(CACHESIGN *a, CACHESIGN *b)
745 {
746 	if (a->allistrue)
747 	{
748 		if (b->allistrue)
749 			return 0;
750 		else
751 			return SIGLENBIT - sizebitvec(b->sign);
752 	}
753 	else if (b->allistrue)
754 		return SIGLENBIT - sizebitvec(a->sign);
755 
756 	return hemdistsign(a->sign, b->sign);
757 }
758 
759 Datum
760 gtrgm_picksplit(PG_FUNCTION_ARGS)
761 {
762 	GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0);
763 	OffsetNumber maxoff = entryvec->n - 1;
764 	GIST_SPLITVEC *v = (GIST_SPLITVEC *) PG_GETARG_POINTER(1);
765 	OffsetNumber k,
766 				j;
767 	TRGM	   *datum_l,
768 			   *datum_r;
769 	BITVECP		union_l,
770 				union_r;
771 	int32		size_alpha,
772 				size_beta;
773 	int32		size_waste,
774 				waste = -1;
775 	int32		nbytes;
776 	OffsetNumber seed_1 = 0,
777 				seed_2 = 0;
778 	OffsetNumber *left,
779 			   *right;
780 	BITVECP		ptr;
781 	int			i;
782 	CACHESIGN  *cache;
783 	SPLITCOST  *costvector;
784 
785 	/* cache the sign data for each existing item */
786 	cache = (CACHESIGN *) palloc(sizeof(CACHESIGN) * (maxoff + 1));
787 	for (k = FirstOffsetNumber; k <= maxoff; k = OffsetNumberNext(k))
788 		fillcache(&cache[k], GETENTRY(entryvec, k));
789 
790 	/* now find the two furthest-apart items */
791 	for (k = FirstOffsetNumber; k < maxoff; k = OffsetNumberNext(k))
792 	{
793 		for (j = OffsetNumberNext(k); j <= maxoff; j = OffsetNumberNext(j))
794 		{
795 			size_waste = hemdistcache(&(cache[j]), &(cache[k]));
796 			if (size_waste > waste)
797 			{
798 				waste = size_waste;
799 				seed_1 = k;
800 				seed_2 = j;
801 			}
802 		}
803 	}
804 
805 	/* just in case we didn't make a selection ... */
806 	if (seed_1 == 0 || seed_2 == 0)
807 	{
808 		seed_1 = 1;
809 		seed_2 = 2;
810 	}
811 
812 	/* initialize the result vectors */
813 	nbytes = maxoff * sizeof(OffsetNumber);
814 	v->spl_left = left = (OffsetNumber *) palloc(nbytes);
815 	v->spl_right = right = (OffsetNumber *) palloc(nbytes);
816 	v->spl_nleft = 0;
817 	v->spl_nright = 0;
818 
819 	/* form initial .. */
820 	if (cache[seed_1].allistrue)
821 	{
822 		datum_l = (TRGM *) palloc(CALCGTSIZE(SIGNKEY | ALLISTRUE, 0));
823 		SET_VARSIZE(datum_l, CALCGTSIZE(SIGNKEY | ALLISTRUE, 0));
824 		datum_l->flag = SIGNKEY | ALLISTRUE;
825 	}
826 	else
827 	{
828 		datum_l = (TRGM *) palloc(CALCGTSIZE(SIGNKEY, 0));
829 		SET_VARSIZE(datum_l, CALCGTSIZE(SIGNKEY, 0));
830 		datum_l->flag = SIGNKEY;
831 		memcpy((void *) GETSIGN(datum_l), (void *) cache[seed_1].sign, sizeof(BITVEC));
832 	}
833 	if (cache[seed_2].allistrue)
834 	{
835 		datum_r = (TRGM *) palloc(CALCGTSIZE(SIGNKEY | ALLISTRUE, 0));
836 		SET_VARSIZE(datum_r, CALCGTSIZE(SIGNKEY | ALLISTRUE, 0));
837 		datum_r->flag = SIGNKEY | ALLISTRUE;
838 	}
839 	else
840 	{
841 		datum_r = (TRGM *) palloc(CALCGTSIZE(SIGNKEY, 0));
842 		SET_VARSIZE(datum_r, CALCGTSIZE(SIGNKEY, 0));
843 		datum_r->flag = SIGNKEY;
844 		memcpy((void *) GETSIGN(datum_r), (void *) cache[seed_2].sign, sizeof(BITVEC));
845 	}
846 
847 	union_l = GETSIGN(datum_l);
848 	union_r = GETSIGN(datum_r);
849 
850 	/* sort before ... */
851 	costvector = (SPLITCOST *) palloc(sizeof(SPLITCOST) * maxoff);
852 	for (j = FirstOffsetNumber; j <= maxoff; j = OffsetNumberNext(j))
853 	{
854 		costvector[j - 1].pos = j;
855 		size_alpha = hemdistcache(&(cache[seed_1]), &(cache[j]));
856 		size_beta = hemdistcache(&(cache[seed_2]), &(cache[j]));
857 		costvector[j - 1].cost = abs(size_alpha - size_beta);
858 	}
859 	qsort((void *) costvector, maxoff, sizeof(SPLITCOST), comparecost);
860 
861 	for (k = 0; k < maxoff; k++)
862 	{
863 		j = costvector[k].pos;
864 		if (j == seed_1)
865 		{
866 			*left++ = j;
867 			v->spl_nleft++;
868 			continue;
869 		}
870 		else if (j == seed_2)
871 		{
872 			*right++ = j;
873 			v->spl_nright++;
874 			continue;
875 		}
876 
877 		if (ISALLTRUE(datum_l) || cache[j].allistrue)
878 		{
879 			if (ISALLTRUE(datum_l) && cache[j].allistrue)
880 				size_alpha = 0;
881 			else
882 				size_alpha = SIGLENBIT - sizebitvec(
883 													(cache[j].allistrue) ? GETSIGN(datum_l) : GETSIGN(cache[j].sign)
884 					);
885 		}
886 		else
887 			size_alpha = hemdistsign(cache[j].sign, GETSIGN(datum_l));
888 
889 		if (ISALLTRUE(datum_r) || cache[j].allistrue)
890 		{
891 			if (ISALLTRUE(datum_r) && cache[j].allistrue)
892 				size_beta = 0;
893 			else
894 				size_beta = SIGLENBIT - sizebitvec(
895 												   (cache[j].allistrue) ? GETSIGN(datum_r) : GETSIGN(cache[j].sign)
896 					);
897 		}
898 		else
899 			size_beta = hemdistsign(cache[j].sign, GETSIGN(datum_r));
900 
901 		if (size_alpha < size_beta + WISH_F(v->spl_nleft, v->spl_nright, 0.1))
902 		{
903 			if (ISALLTRUE(datum_l) || cache[j].allistrue)
904 			{
905 				if (!ISALLTRUE(datum_l))
906 					MemSet((void *) GETSIGN(datum_l), 0xff, sizeof(BITVEC));
907 			}
908 			else
909 			{
910 				ptr = cache[j].sign;
911 				LOOPBYTE
912 					union_l[i] |= ptr[i];
913 			}
914 			*left++ = j;
915 			v->spl_nleft++;
916 		}
917 		else
918 		{
919 			if (ISALLTRUE(datum_r) || cache[j].allistrue)
920 			{
921 				if (!ISALLTRUE(datum_r))
922 					MemSet((void *) GETSIGN(datum_r), 0xff, sizeof(BITVEC));
923 			}
924 			else
925 			{
926 				ptr = cache[j].sign;
927 				LOOPBYTE
928 					union_r[i] |= ptr[i];
929 			}
930 			*right++ = j;
931 			v->spl_nright++;
932 		}
933 	}
934 
935 	v->spl_ldatum = PointerGetDatum(datum_l);
936 	v->spl_rdatum = PointerGetDatum(datum_r);
937 
938 	PG_RETURN_POINTER(v);
939 }
940