1 /*-------------------------------------------------------------------------
2 *
3 * partition.c
4 * Partitioning related data structures and functions.
5 *
6 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/catalog/partition.c
12 *
13 *-------------------------------------------------------------------------
14 */
15
16 #include "postgres.h"
17
18 #include "access/heapam.h"
19 #include "access/htup_details.h"
20 #include "access/nbtree.h"
21 #include "access/sysattr.h"
22 #include "catalog/dependency.h"
23 #include "catalog/indexing.h"
24 #include "catalog/objectaddress.h"
25 #include "catalog/partition.h"
26 #include "catalog/pg_collation.h"
27 #include "catalog/pg_inherits.h"
28 #include "catalog/pg_inherits_fn.h"
29 #include "catalog/pg_opclass.h"
30 #include "catalog/pg_type.h"
31 #include "executor/executor.h"
32 #include "miscadmin.h"
33 #include "nodes/makefuncs.h"
34 #include "nodes/nodeFuncs.h"
35 #include "nodes/parsenodes.h"
36 #include "optimizer/clauses.h"
37 #include "optimizer/planmain.h"
38 #include "optimizer/var.h"
39 #include "rewrite/rewriteManip.h"
40 #include "storage/lmgr.h"
41 #include "utils/array.h"
42 #include "utils/builtins.h"
43 #include "utils/datum.h"
44 #include "utils/memutils.h"
45 #include "utils/fmgroids.h"
46 #include "utils/inval.h"
47 #include "utils/lsyscache.h"
48 #include "utils/rel.h"
49 #include "utils/ruleutils.h"
50 #include "utils/syscache.h"
51
52 /*
53 * Information about bounds of a partitioned relation
54 *
55 * A list partition datum that is known to be NULL is never put into the
56 * datums array. Instead, it is tracked using the null_index field.
57 *
58 * In the case of range partitioning, ndatums will typically be far less than
59 * 2 * nparts, because a partition's upper bound and the next partition's lower
60 * bound are the same in most common cases, and we only store one of them (the
61 * upper bound).
62 *
63 * In the case of list partitioning, the indexes array stores one entry for
64 * every datum, which is the index of the partition that accepts a given datum.
65 * In case of range partitioning, it stores one entry per distinct range
66 * datum, which is the index of the partition for which a given datum
67 * is an upper bound.
68 */
69
70 typedef struct PartitionBoundInfoData
71 {
72 char strategy; /* list or range bounds? */
73 int ndatums; /* Length of the datums following array */
74 Datum **datums; /* Array of datum-tuples with key->partnatts
75 * datums each */
76 PartitionRangeDatumKind **kind; /* The kind of each range bound datum;
77 * NULL for list partitioned tables */
78 int *indexes; /* Partition indexes; one entry per member of
79 * the datums array (plus one if range
80 * partitioned table) */
81 int null_index; /* Index of the null-accepting partition; -1
82 * if there isn't one */
83 } PartitionBoundInfoData;
84
85 #define partition_bound_accepts_nulls(bi) ((bi)->null_index != -1)
86
87 /*
88 * When qsort'ing partition bounds after reading from the catalog, each bound
89 * is represented with one of the following structs.
90 */
91
92 /* One value coming from some (index'th) list partition */
93 typedef struct PartitionListValue
94 {
95 int index;
96 Datum value;
97 } PartitionListValue;
98
99 /* One bound of a range partition */
100 typedef struct PartitionRangeBound
101 {
102 int index;
103 Datum *datums; /* range bound datums */
104 PartitionRangeDatumKind *kind; /* the kind of each datum */
105 bool lower; /* this is the lower (vs upper) bound */
106 } PartitionRangeBound;
107
108 static int32 qsort_partition_list_value_cmp(const void *a, const void *b,
109 void *arg);
110 static int32 qsort_partition_rbound_cmp(const void *a, const void *b,
111 void *arg);
112
113 static Oid get_partition_operator(PartitionKey key, int col,
114 StrategyNumber strategy, bool *need_relabel);
115 static Expr *make_partition_op_expr(PartitionKey key, int keynum,
116 uint16 strategy, Expr *arg1, Expr *arg2);
117 static void get_range_key_properties(PartitionKey key, int keynum,
118 PartitionRangeDatum *ldatum,
119 PartitionRangeDatum *udatum,
120 ListCell **partexprs_item,
121 Expr **keyCol,
122 Const **lower_val, Const **upper_val);
123 static List *get_qual_for_list(PartitionKey key, PartitionBoundSpec *spec);
124 static List *get_qual_for_range(PartitionKey key, PartitionBoundSpec *spec);
125 static List *generate_partition_qual(Relation rel);
126
127 static PartitionRangeBound *make_one_range_bound(PartitionKey key, int index,
128 List *datums, bool lower);
129 static int32 partition_rbound_cmp(PartitionKey key,
130 Datum *datums1, PartitionRangeDatumKind *kind1,
131 bool lower1, PartitionRangeBound *b2);
132 static int32 partition_rbound_datum_cmp(PartitionKey key,
133 Datum *rb_datums, PartitionRangeDatumKind *rb_kind,
134 Datum *tuple_datums);
135
136 static int32 partition_bound_cmp(PartitionKey key,
137 PartitionBoundInfo boundinfo,
138 int offset, void *probe, bool probe_is_bound);
139 static int partition_bound_bsearch(PartitionKey key,
140 PartitionBoundInfo boundinfo,
141 void *probe, bool probe_is_bound, bool *is_equal);
142
143 /*
144 * RelationBuildPartitionDesc
145 * Form rel's partition descriptor, and store in relcache entry
146 *
147 * Note: the descriptor won't be flushed from the cache by
148 * RelationClearRelation() unless it's changed because of
149 * addition or removal of a partition. Hence, code holding a lock
150 * that's sufficient to prevent that can assume that rd_partdesc
151 * won't change underneath it.
152 */
153 void
RelationBuildPartitionDesc(Relation rel)154 RelationBuildPartitionDesc(Relation rel)
155 {
156 List *inhoids,
157 *partoids;
158 Oid *oids = NULL;
159 List *boundspecs = NIL;
160 ListCell *cell;
161 int i,
162 nparts;
163 PartitionKey key = RelationGetPartitionKey(rel);
164 PartitionDesc result;
165 MemoryContext oldcxt;
166
167 int ndatums = 0;
168
169 /* List partitioning specific */
170 PartitionListValue **all_values = NULL;
171 int null_index = -1;
172
173 /* Range partitioning specific */
174 PartitionRangeBound **rbounds = NULL;
175
176 /*
177 * The following could happen in situations where rel has a pg_class entry
178 * but not the pg_partitioned_table entry yet.
179 */
180 if (key == NULL)
181 return;
182
183 /* Get partition oids from pg_inherits */
184 inhoids = find_inheritance_children(RelationGetRelid(rel), NoLock);
185
186 /* Collect bound spec nodes in a list */
187 i = 0;
188 partoids = NIL;
189 foreach(cell, inhoids)
190 {
191 Oid inhrelid = lfirst_oid(cell);
192 HeapTuple tuple;
193 Datum datum;
194 bool isnull;
195 Node *boundspec;
196
197 tuple = SearchSysCache1(RELOID, inhrelid);
198 if (!HeapTupleIsValid(tuple))
199 elog(ERROR, "cache lookup failed for relation %u", inhrelid);
200
201 /*
202 * It is possible that the pg_class tuple of a partition has not been
203 * updated yet to set its relpartbound field. The only case where
204 * this happens is when we open the parent relation to check using its
205 * partition descriptor that a new partition's bound does not overlap
206 * some existing partition.
207 */
208 if (!((Form_pg_class) GETSTRUCT(tuple))->relispartition)
209 {
210 ReleaseSysCache(tuple);
211 continue;
212 }
213
214 datum = SysCacheGetAttr(RELOID, tuple,
215 Anum_pg_class_relpartbound,
216 &isnull);
217 Assert(!isnull);
218 boundspec = (Node *) stringToNode(TextDatumGetCString(datum));
219 boundspecs = lappend(boundspecs, boundspec);
220 partoids = lappend_oid(partoids, inhrelid);
221 ReleaseSysCache(tuple);
222 }
223
224 nparts = list_length(partoids);
225
226 if (nparts > 0)
227 {
228 oids = (Oid *) palloc(nparts * sizeof(Oid));
229 i = 0;
230 foreach(cell, partoids)
231 oids[i++] = lfirst_oid(cell);
232
233 /* Convert from node to the internal representation */
234 if (key->strategy == PARTITION_STRATEGY_LIST)
235 {
236 List *non_null_values = NIL;
237
238 /*
239 * Create a unified list of non-null values across all partitions.
240 */
241 i = 0;
242 null_index = -1;
243 foreach(cell, boundspecs)
244 {
245 PartitionBoundSpec *spec = castNode(PartitionBoundSpec,
246 lfirst(cell));
247 ListCell *c;
248
249 if (spec->strategy != PARTITION_STRATEGY_LIST)
250 elog(ERROR, "invalid strategy in partition bound spec");
251
252 foreach(c, spec->listdatums)
253 {
254 Const *val = castNode(Const, lfirst(c));
255 PartitionListValue *list_value = NULL;
256
257 if (!val->constisnull)
258 {
259 list_value = (PartitionListValue *)
260 palloc0(sizeof(PartitionListValue));
261 list_value->index = i;
262 list_value->value = val->constvalue;
263 }
264 else
265 {
266 /*
267 * Never put a null into the values array, flag
268 * instead for the code further down below where we
269 * construct the actual relcache struct.
270 */
271 if (null_index != -1)
272 elog(ERROR, "found null more than once");
273 null_index = i;
274 }
275
276 if (list_value)
277 non_null_values = lappend(non_null_values,
278 list_value);
279 }
280
281 i++;
282 }
283
284 ndatums = list_length(non_null_values);
285
286 /*
287 * Collect all list values in one array. Alongside the value, we
288 * also save the index of partition the value comes from.
289 */
290 all_values = (PartitionListValue **) palloc(ndatums *
291 sizeof(PartitionListValue *));
292 i = 0;
293 foreach(cell, non_null_values)
294 {
295 PartitionListValue *src = lfirst(cell);
296
297 all_values[i] = (PartitionListValue *)
298 palloc(sizeof(PartitionListValue));
299 all_values[i]->value = src->value;
300 all_values[i]->index = src->index;
301 i++;
302 }
303
304 qsort_arg(all_values, ndatums, sizeof(PartitionListValue *),
305 qsort_partition_list_value_cmp, (void *) key);
306 }
307 else if (key->strategy == PARTITION_STRATEGY_RANGE)
308 {
309 int j,
310 k;
311 PartitionRangeBound **all_bounds,
312 *prev;
313 bool *distinct_indexes;
314
315 all_bounds = (PartitionRangeBound **) palloc0(2 * nparts *
316 sizeof(PartitionRangeBound *));
317 distinct_indexes = (bool *) palloc(2 * nparts * sizeof(bool));
318
319 /*
320 * Create a unified list of range bounds across all the
321 * partitions.
322 */
323 i = j = 0;
324 foreach(cell, boundspecs)
325 {
326 PartitionBoundSpec *spec = castNode(PartitionBoundSpec,
327 lfirst(cell));
328 PartitionRangeBound *lower,
329 *upper;
330
331 if (spec->strategy != PARTITION_STRATEGY_RANGE)
332 elog(ERROR, "invalid strategy in partition bound spec");
333
334 lower = make_one_range_bound(key, i, spec->lowerdatums,
335 true);
336 upper = make_one_range_bound(key, i, spec->upperdatums,
337 false);
338 all_bounds[j] = lower;
339 all_bounds[j + 1] = upper;
340 j += 2;
341 i++;
342 }
343 Assert(j == 2 * nparts);
344
345 /* Sort all the bounds in ascending order */
346 qsort_arg(all_bounds, 2 * nparts,
347 sizeof(PartitionRangeBound *),
348 qsort_partition_rbound_cmp,
349 (void *) key);
350
351 /*
352 * Count the number of distinct bounds to allocate an array of
353 * that size.
354 */
355 ndatums = 0;
356 prev = NULL;
357 for (i = 0; i < 2 * nparts; i++)
358 {
359 PartitionRangeBound *cur = all_bounds[i];
360 bool is_distinct = false;
361 int j;
362
363 /* Is the current bound distinct from the previous one? */
364 for (j = 0; j < key->partnatts; j++)
365 {
366 Datum cmpval;
367
368 if (prev == NULL || cur->kind[j] != prev->kind[j])
369 {
370 is_distinct = true;
371 break;
372 }
373
374 /*
375 * If the bounds are both MINVALUE or MAXVALUE, stop now
376 * and treat them as equal, since any values after this
377 * point must be ignored.
378 */
379 if (cur->kind[j] != PARTITION_RANGE_DATUM_VALUE)
380 break;
381
382 cmpval = FunctionCall2Coll(&key->partsupfunc[j],
383 key->partcollation[j],
384 cur->datums[j],
385 prev->datums[j]);
386 if (DatumGetInt32(cmpval) != 0)
387 {
388 is_distinct = true;
389 break;
390 }
391 }
392
393 /*
394 * Count the current bound if it is distinct from the previous
395 * one. Also, store if the index i contains a distinct bound
396 * that we'd like put in the relcache array.
397 */
398 if (is_distinct)
399 {
400 distinct_indexes[i] = true;
401 ndatums++;
402 }
403 else
404 distinct_indexes[i] = false;
405
406 prev = cur;
407 }
408
409 /*
410 * Finally save them in an array from where they will be copied
411 * into the relcache.
412 */
413 rbounds = (PartitionRangeBound **) palloc(ndatums *
414 sizeof(PartitionRangeBound *));
415 k = 0;
416 for (i = 0; i < 2 * nparts; i++)
417 {
418 if (distinct_indexes[i])
419 rbounds[k++] = all_bounds[i];
420 }
421 Assert(k == ndatums);
422 }
423 else
424 elog(ERROR, "unexpected partition strategy: %d",
425 (int) key->strategy);
426 }
427
428 /* Assert we aren't about to leak any old data structure */
429 Assert(rel->rd_pdcxt == NULL);
430 Assert(rel->rd_partdesc == NULL);
431
432 /*
433 * Now build the actual relcache partition descriptor. Note that the
434 * order of operations here is fairly critical. If we fail partway
435 * through this code, we won't have leaked memory because the rd_pdcxt is
436 * attached to the relcache entry immediately, so it'll be freed whenever
437 * the entry is rebuilt or destroyed. However, we don't assign to
438 * rd_partdesc until the cached data structure is fully complete and
439 * valid, so that no other code might try to use it.
440 */
441 rel->rd_pdcxt = AllocSetContextCreate(CacheMemoryContext,
442 RelationGetRelationName(rel),
443 ALLOCSET_SMALL_SIZES);
444 oldcxt = MemoryContextSwitchTo(rel->rd_pdcxt);
445
446 result = (PartitionDescData *) palloc0(sizeof(PartitionDescData));
447 result->nparts = nparts;
448 if (nparts > 0)
449 {
450 PartitionBoundInfo boundinfo;
451 int *mapping;
452 int next_index = 0;
453
454 result->oids = (Oid *) palloc0(nparts * sizeof(Oid));
455
456 boundinfo = (PartitionBoundInfoData *)
457 palloc0(sizeof(PartitionBoundInfoData));
458 boundinfo->strategy = key->strategy;
459 boundinfo->ndatums = ndatums;
460 boundinfo->null_index = -1;
461 boundinfo->datums = (Datum **) palloc0(ndatums * sizeof(Datum *));
462
463 /* Initialize mapping array with invalid values */
464 mapping = (int *) palloc(sizeof(int) * nparts);
465 for (i = 0; i < nparts; i++)
466 mapping[i] = -1;
467
468 switch (key->strategy)
469 {
470 case PARTITION_STRATEGY_LIST:
471 {
472 boundinfo->indexes = (int *) palloc(ndatums * sizeof(int));
473
474 /*
475 * Copy values. Indexes of individual values are mapped
476 * to canonical values so that they match for any two list
477 * partitioned tables with same number of partitions and
478 * same lists per partition. One way to canonicalize is
479 * to assign the index in all_values[] of the smallest
480 * value of each partition, as the index of all of the
481 * partition's values.
482 */
483 for (i = 0; i < ndatums; i++)
484 {
485 boundinfo->datums[i] = (Datum *) palloc(sizeof(Datum));
486 boundinfo->datums[i][0] = datumCopy(all_values[i]->value,
487 key->parttypbyval[0],
488 key->parttyplen[0]);
489
490 /* If the old index has no mapping, assign one */
491 if (mapping[all_values[i]->index] == -1)
492 mapping[all_values[i]->index] = next_index++;
493
494 boundinfo->indexes[i] = mapping[all_values[i]->index];
495 }
496
497 /*
498 * If null-accepting partition has no mapped index yet,
499 * assign one. This could happen if such partition
500 * accepts only null and hence not covered in the above
501 * loop which only handled non-null values.
502 */
503 if (null_index != -1)
504 {
505 Assert(null_index >= 0);
506 if (mapping[null_index] == -1)
507 mapping[null_index] = next_index++;
508 boundinfo->null_index = mapping[null_index];
509 }
510
511 /* All partitions must now have a valid mapping */
512 Assert(next_index == nparts);
513 break;
514 }
515
516 case PARTITION_STRATEGY_RANGE:
517 {
518 boundinfo->kind = (PartitionRangeDatumKind **)
519 palloc(ndatums *
520 sizeof(PartitionRangeDatumKind *));
521 boundinfo->indexes = (int *) palloc((ndatums + 1) *
522 sizeof(int));
523
524 for (i = 0; i < ndatums; i++)
525 {
526 int j;
527
528 boundinfo->datums[i] = (Datum *) palloc(key->partnatts *
529 sizeof(Datum));
530 boundinfo->kind[i] = (PartitionRangeDatumKind *)
531 palloc(key->partnatts *
532 sizeof(PartitionRangeDatumKind));
533 for (j = 0; j < key->partnatts; j++)
534 {
535 if (rbounds[i]->kind[j] == PARTITION_RANGE_DATUM_VALUE)
536 boundinfo->datums[i][j] =
537 datumCopy(rbounds[i]->datums[j],
538 key->parttypbyval[j],
539 key->parttyplen[j]);
540 boundinfo->kind[i][j] = rbounds[i]->kind[j];
541 }
542
543 /*
544 * There is no mapping for invalid indexes.
545 *
546 * Any lower bounds in the rbounds array have invalid
547 * indexes assigned, because the values between the
548 * previous bound (if there is one) and this (lower)
549 * bound are not part of the range of any existing
550 * partition.
551 */
552 if (rbounds[i]->lower)
553 boundinfo->indexes[i] = -1;
554 else
555 {
556 int orig_index = rbounds[i]->index;
557
558 /* If the old index has no mapping, assign one */
559 if (mapping[orig_index] == -1)
560 mapping[orig_index] = next_index++;
561
562 boundinfo->indexes[i] = mapping[orig_index];
563 }
564 }
565 boundinfo->indexes[i] = -1;
566 break;
567 }
568
569 default:
570 elog(ERROR, "unexpected partition strategy: %d",
571 (int) key->strategy);
572 }
573
574 result->boundinfo = boundinfo;
575
576 /*
577 * Now assign OIDs from the original array into mapped indexes of the
578 * result array. Order of OIDs in the former is defined by the
579 * catalog scan that retrieved them, whereas that in the latter is
580 * defined by canonicalized representation of the list values or the
581 * range bounds.
582 */
583 for (i = 0; i < nparts; i++)
584 result->oids[mapping[i]] = oids[i];
585 pfree(mapping);
586 }
587
588 MemoryContextSwitchTo(oldcxt);
589 rel->rd_partdesc = result;
590 }
591
592 /*
593 * Are two partition bound collections logically equal?
594 *
595 * Used in the keep logic of relcache.c (ie, in RelationClearRelation()).
596 * This is also useful when b1 and b2 are bound collections of two separate
597 * relations, respectively, because PartitionBoundInfo is a canonical
598 * representation of partition bounds.
599 */
600 bool
partition_bounds_equal(PartitionKey key,PartitionBoundInfo b1,PartitionBoundInfo b2)601 partition_bounds_equal(PartitionKey key,
602 PartitionBoundInfo b1, PartitionBoundInfo b2)
603 {
604 int i;
605
606 if (b1->strategy != b2->strategy)
607 return false;
608
609 if (b1->ndatums != b2->ndatums)
610 return false;
611
612 if (b1->null_index != b2->null_index)
613 return false;
614
615 for (i = 0; i < b1->ndatums; i++)
616 {
617 int j;
618
619 for (j = 0; j < key->partnatts; j++)
620 {
621 /* For range partitions, the bounds might not be finite. */
622 if (b1->kind != NULL)
623 {
624 /* The different kinds of bound all differ from each other */
625 if (b1->kind[i][j] != b2->kind[i][j])
626 return false;
627
628 /* Non-finite bounds are equal without further examination. */
629 if (b1->kind[i][j] != PARTITION_RANGE_DATUM_VALUE)
630 continue;
631 }
632
633 /*
634 * Compare the actual values. Note that it would be both incorrect
635 * and unsafe to invoke the comparison operator derived from the
636 * partitioning specification here. It would be incorrect because
637 * we want the relcache entry to be updated for ANY change to the
638 * partition bounds, not just those that the partitioning operator
639 * thinks are significant. It would be unsafe because we might
640 * reach this code in the context of an aborted transaction, and
641 * an arbitrary partitioning operator might not be safe in that
642 * context. datumIsEqual() should be simple enough to be safe.
643 */
644 if (!datumIsEqual(b1->datums[i][j], b2->datums[i][j],
645 key->parttypbyval[j],
646 key->parttyplen[j]))
647 return false;
648 }
649
650 if (b1->indexes[i] != b2->indexes[i])
651 return false;
652 }
653
654 /* There are ndatums+1 indexes in case of range partitions */
655 if (key->strategy == PARTITION_STRATEGY_RANGE &&
656 b1->indexes[i] != b2->indexes[i])
657 return false;
658
659 return true;
660 }
661
662 /*
663 * check_new_partition_bound
664 *
665 * Checks if the new partition's bound overlaps any of the existing partitions
666 * of parent. Also performs additional checks as necessary per strategy.
667 */
668 void
check_new_partition_bound(char * relname,Relation parent,PartitionBoundSpec * spec)669 check_new_partition_bound(char *relname, Relation parent,
670 PartitionBoundSpec *spec)
671 {
672 PartitionKey key = RelationGetPartitionKey(parent);
673 PartitionDesc partdesc = RelationGetPartitionDesc(parent);
674 ParseState *pstate = make_parsestate(NULL);
675 int with = -1;
676 bool overlap = false;
677
678 switch (key->strategy)
679 {
680 case PARTITION_STRATEGY_LIST:
681 {
682 Assert(spec->strategy == PARTITION_STRATEGY_LIST);
683
684 if (partdesc->nparts > 0)
685 {
686 PartitionBoundInfo boundinfo = partdesc->boundinfo;
687 ListCell *cell;
688
689 Assert(boundinfo &&
690 boundinfo->strategy == PARTITION_STRATEGY_LIST &&
691 (boundinfo->ndatums > 0 ||
692 partition_bound_accepts_nulls(boundinfo)));
693
694 foreach(cell, spec->listdatums)
695 {
696 Const *val = castNode(Const, lfirst(cell));
697
698 if (!val->constisnull)
699 {
700 int offset;
701 bool equal;
702
703 offset = partition_bound_bsearch(key, boundinfo,
704 &val->constvalue,
705 true, &equal);
706 if (offset >= 0 && equal)
707 {
708 overlap = true;
709 with = boundinfo->indexes[offset];
710 break;
711 }
712 }
713 else if (partition_bound_accepts_nulls(boundinfo))
714 {
715 overlap = true;
716 with = boundinfo->null_index;
717 break;
718 }
719 }
720 }
721
722 break;
723 }
724
725 case PARTITION_STRATEGY_RANGE:
726 {
727 PartitionRangeBound *lower,
728 *upper;
729
730 Assert(spec->strategy == PARTITION_STRATEGY_RANGE);
731 lower = make_one_range_bound(key, -1, spec->lowerdatums, true);
732 upper = make_one_range_bound(key, -1, spec->upperdatums, false);
733
734 /*
735 * First check if the resulting range would be empty with
736 * specified lower and upper bounds
737 */
738 if (partition_rbound_cmp(key, lower->datums, lower->kind, true,
739 upper) >= 0)
740 {
741 ereport(ERROR,
742 (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
743 errmsg("empty range bound specified for partition \"%s\"",
744 relname),
745 errdetail("Specified lower bound %s is greater than or equal to upper bound %s.",
746 get_range_partbound_string(spec->lowerdatums),
747 get_range_partbound_string(spec->upperdatums)),
748 parser_errposition(pstate, spec->location)));
749 }
750
751 if (partdesc->nparts > 0)
752 {
753 PartitionBoundInfo boundinfo = partdesc->boundinfo;
754 int offset;
755 bool equal;
756
757 Assert(boundinfo && boundinfo->ndatums > 0 &&
758 boundinfo->strategy == PARTITION_STRATEGY_RANGE);
759
760 /*
761 * Test whether the new lower bound (which is treated
762 * inclusively as part of the new partition) lies inside
763 * an existing partition, or in a gap.
764 *
765 * If it's inside an existing partition, the bound at
766 * offset + 1 will be the upper bound of that partition,
767 * and its index will be >= 0.
768 *
769 * If it's in a gap, the bound at offset + 1 will be the
770 * lower bound of the next partition, and its index will
771 * be -1. This is also true if there is no next partition,
772 * since the index array is initialised with an extra -1
773 * at the end.
774 */
775 offset = partition_bound_bsearch(key, boundinfo, lower,
776 true, &equal);
777
778 if (boundinfo->indexes[offset + 1] < 0)
779 {
780 /*
781 * Check that the new partition will fit in the gap.
782 * For it to fit, the new upper bound must be less
783 * than or equal to the lower bound of the next
784 * partition, if there is one.
785 */
786 if (offset + 1 < boundinfo->ndatums)
787 {
788 int32 cmpval;
789
790 cmpval = partition_bound_cmp(key, boundinfo,
791 offset + 1, upper,
792 true);
793 if (cmpval < 0)
794 {
795 /*
796 * The new partition overlaps with the
797 * existing partition between offset + 1 and
798 * offset + 2.
799 */
800 overlap = true;
801 with = boundinfo->indexes[offset + 2];
802 }
803 }
804 }
805 else
806 {
807 /*
808 * The new partition overlaps with the existing
809 * partition between offset and offset + 1.
810 */
811 overlap = true;
812 with = boundinfo->indexes[offset + 1];
813 }
814 }
815
816 break;
817 }
818
819 default:
820 elog(ERROR, "unexpected partition strategy: %d",
821 (int) key->strategy);
822 }
823
824 if (overlap)
825 {
826 Assert(with >= 0);
827 ereport(ERROR,
828 (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
829 errmsg("partition \"%s\" would overlap partition \"%s\"",
830 relname, get_rel_name(partdesc->oids[with])),
831 parser_errposition(pstate, spec->location)));
832 }
833 }
834
835 /*
836 * get_partition_parent
837 *
838 * Returns inheritance parent of a partition by scanning pg_inherits
839 *
840 * Note: Because this function assumes that the relation whose OID is passed
841 * as an argument will have precisely one parent, it should only be called
842 * when it is known that the relation is a partition.
843 */
844 Oid
get_partition_parent(Oid relid)845 get_partition_parent(Oid relid)
846 {
847 Form_pg_inherits form;
848 Relation catalogRelation;
849 SysScanDesc scan;
850 ScanKeyData key[2];
851 HeapTuple tuple;
852 Oid result;
853
854 catalogRelation = heap_open(InheritsRelationId, AccessShareLock);
855
856 ScanKeyInit(&key[0],
857 Anum_pg_inherits_inhrelid,
858 BTEqualStrategyNumber, F_OIDEQ,
859 ObjectIdGetDatum(relid));
860 ScanKeyInit(&key[1],
861 Anum_pg_inherits_inhseqno,
862 BTEqualStrategyNumber, F_INT4EQ,
863 Int32GetDatum(1));
864
865 scan = systable_beginscan(catalogRelation, InheritsRelidSeqnoIndexId, true,
866 NULL, 2, key);
867
868 tuple = systable_getnext(scan);
869 if (!HeapTupleIsValid(tuple))
870 elog(ERROR, "could not find tuple for parent of relation %u", relid);
871
872 form = (Form_pg_inherits) GETSTRUCT(tuple);
873 result = form->inhparent;
874
875 systable_endscan(scan);
876 heap_close(catalogRelation, AccessShareLock);
877
878 return result;
879 }
880
881 /*
882 * get_qual_from_partbound
883 * Given a parser node for partition bound, return the list of executable
884 * expressions as partition constraint
885 */
886 List *
get_qual_from_partbound(Relation rel,Relation parent,PartitionBoundSpec * spec)887 get_qual_from_partbound(Relation rel, Relation parent,
888 PartitionBoundSpec *spec)
889 {
890 PartitionKey key = RelationGetPartitionKey(parent);
891 List *my_qual = NIL;
892
893 Assert(key != NULL);
894
895 switch (key->strategy)
896 {
897 case PARTITION_STRATEGY_LIST:
898 Assert(spec->strategy == PARTITION_STRATEGY_LIST);
899 my_qual = get_qual_for_list(key, spec);
900 break;
901
902 case PARTITION_STRATEGY_RANGE:
903 Assert(spec->strategy == PARTITION_STRATEGY_RANGE);
904 my_qual = get_qual_for_range(key, spec);
905 break;
906
907 default:
908 elog(ERROR, "unexpected partition strategy: %d",
909 (int) key->strategy);
910 }
911
912 return my_qual;
913 }
914
915 /*
916 * map_partition_varattnos - maps varattno of any Vars in expr from the
917 * parent attno to partition attno.
918 *
919 * We must allow for cases where physical attnos of a partition can be
920 * different from the parent's.
921 *
922 * If found_whole_row is not NULL, *found_whole_row returns whether a
923 * whole-row variable was found in the input expression.
924 *
925 * Note: this will work on any node tree, so really the argument and result
926 * should be declared "Node *". But a substantial majority of the callers
927 * are working on Lists, so it's less messy to do the casts internally.
928 */
929 List *
map_partition_varattnos(List * expr,int target_varno,Relation partrel,Relation parent,bool * found_whole_row)930 map_partition_varattnos(List *expr, int target_varno,
931 Relation partrel, Relation parent,
932 bool *found_whole_row)
933 {
934 bool my_found_whole_row = false;
935
936 if (expr != NIL)
937 {
938 AttrNumber *part_attnos;
939
940 part_attnos = convert_tuples_by_name_map(RelationGetDescr(partrel),
941 RelationGetDescr(parent),
942 gettext_noop("could not convert row type"));
943 expr = (List *) map_variable_attnos((Node *) expr,
944 target_varno, 0,
945 part_attnos,
946 RelationGetDescr(parent)->natts,
947 RelationGetForm(partrel)->reltype,
948 &my_found_whole_row);
949 }
950
951 if (found_whole_row)
952 *found_whole_row = my_found_whole_row;
953
954 return expr;
955 }
956
957 /*
958 * RelationGetPartitionQual
959 *
960 * Returns a list of partition quals
961 */
962 List *
RelationGetPartitionQual(Relation rel)963 RelationGetPartitionQual(Relation rel)
964 {
965 /* Quick exit */
966 if (!rel->rd_rel->relispartition)
967 return NIL;
968
969 return generate_partition_qual(rel);
970 }
971
972 /*
973 * get_partition_qual_relid
974 *
975 * Returns an expression tree describing the passed-in relation's partition
976 * constraint.
977 *
978 * If the relation is not found, or is not a partition, or there is no
979 * partition constraint, return NULL. We must guard against the first two
980 * cases because this supports a SQL function that could be passed any OID.
981 * The last case shouldn't happen in v10, but cope anyway.
982 */
983 Expr *
get_partition_qual_relid(Oid relid)984 get_partition_qual_relid(Oid relid)
985 {
986 Expr *result = NULL;
987
988 /* Do the work only if this relation exists and is a partition. */
989 if (get_rel_relispartition(relid))
990 {
991 Relation rel = relation_open(relid, AccessShareLock);
992 List *and_args;
993
994 and_args = generate_partition_qual(rel);
995
996 /* Convert implicit-AND list format to boolean expression */
997 if (and_args == NIL)
998 result = NULL;
999 else if (list_length(and_args) > 1)
1000 result = makeBoolExpr(AND_EXPR, and_args, -1);
1001 else
1002 result = linitial(and_args);
1003
1004 /* Keep the lock, to allow safe deparsing against the rel by caller. */
1005 relation_close(rel, NoLock);
1006 }
1007
1008 return result;
1009 }
1010
1011 /*
1012 * Append OIDs of rel's partitions to the list 'partoids' and for each OID,
1013 * append pointer rel to the list 'parents'.
1014 */
1015 #define APPEND_REL_PARTITION_OIDS(rel, partoids, parents) \
1016 do\
1017 {\
1018 int i;\
1019 for (i = 0; i < (rel)->rd_partdesc->nparts; i++)\
1020 {\
1021 (partoids) = lappend_oid((partoids), (rel)->rd_partdesc->oids[i]);\
1022 (parents) = lappend((parents), (rel));\
1023 }\
1024 } while(0)
1025
1026 /*
1027 * RelationGetPartitionDispatchInfo
1028 * Returns information necessary to route tuples down a partition tree
1029 *
1030 * The number of elements in the returned array (that is, the number of
1031 * PartitionDispatch objects for the partitioned tables in the partition tree)
1032 * is returned in *num_parted and a list of the OIDs of all the leaf
1033 * partitions of rel is returned in *leaf_part_oids.
1034 *
1035 * All the relations in the partition tree (including 'rel') must have been
1036 * locked (using at least the AccessShareLock) by the caller.
1037 */
1038 PartitionDispatch *
RelationGetPartitionDispatchInfo(Relation rel,int * num_parted,List ** leaf_part_oids)1039 RelationGetPartitionDispatchInfo(Relation rel,
1040 int *num_parted, List **leaf_part_oids)
1041 {
1042 PartitionDispatchData **pd;
1043 List *all_parts = NIL,
1044 *all_parents = NIL,
1045 *parted_rels,
1046 *parted_rel_parents;
1047 ListCell *lc1,
1048 *lc2;
1049 int i,
1050 k,
1051 offset;
1052
1053 /*
1054 * We rely on the relcache to traverse the partition tree to build both
1055 * the leaf partition OIDs list and the array of PartitionDispatch objects
1056 * for the partitioned tables in the tree. That means every partitioned
1057 * table in the tree must be locked, which is fine since we require the
1058 * caller to lock all the partitions anyway.
1059 *
1060 * For every partitioned table in the tree, starting with the root
1061 * partitioned table, add its relcache entry to parted_rels, while also
1062 * queuing its partitions (in the order in which they appear in the
1063 * partition descriptor) to be looked at later in the same loop. This is
1064 * a bit tricky but works because the foreach() macro doesn't fetch the
1065 * next list element until the bottom of the loop.
1066 */
1067 *num_parted = 1;
1068 parted_rels = list_make1(rel);
1069 /* Root partitioned table has no parent, so NULL for parent */
1070 parted_rel_parents = list_make1(NULL);
1071 APPEND_REL_PARTITION_OIDS(rel, all_parts, all_parents);
1072 forboth(lc1, all_parts, lc2, all_parents)
1073 {
1074 Oid partrelid = lfirst_oid(lc1);
1075 Relation parent = lfirst(lc2);
1076
1077 if (get_rel_relkind(partrelid) == RELKIND_PARTITIONED_TABLE)
1078 {
1079 /*
1080 * Already locked by the caller. Note that it is the
1081 * responsibility of the caller to close the below relcache entry,
1082 * once done using the information being collected here (for
1083 * example, in ExecEndModifyTable).
1084 */
1085 Relation partrel = heap_open(partrelid, NoLock);
1086
1087 (*num_parted)++;
1088 parted_rels = lappend(parted_rels, partrel);
1089 parted_rel_parents = lappend(parted_rel_parents, parent);
1090 APPEND_REL_PARTITION_OIDS(partrel, all_parts, all_parents);
1091 }
1092 }
1093
1094 /*
1095 * We want to create two arrays - one for leaf partitions and another for
1096 * partitioned tables (including the root table and internal partitions).
1097 * While we only create the latter here, leaf partition array of suitable
1098 * objects (such as, ResultRelInfo) is created by the caller using the
1099 * list of OIDs we return. Indexes into these arrays get assigned in a
1100 * breadth-first manner, whereby partitions of any given level are placed
1101 * consecutively in the respective arrays.
1102 */
1103 pd = (PartitionDispatchData **) palloc(*num_parted *
1104 sizeof(PartitionDispatchData *));
1105 *leaf_part_oids = NIL;
1106 i = k = offset = 0;
1107 forboth(lc1, parted_rels, lc2, parted_rel_parents)
1108 {
1109 Relation partrel = lfirst(lc1);
1110 Relation parent = lfirst(lc2);
1111 PartitionKey partkey = RelationGetPartitionKey(partrel);
1112 TupleDesc tupdesc = RelationGetDescr(partrel);
1113 PartitionDesc partdesc = RelationGetPartitionDesc(partrel);
1114 int j,
1115 m;
1116
1117 pd[i] = (PartitionDispatch) palloc(sizeof(PartitionDispatchData));
1118 pd[i]->reldesc = partrel;
1119 pd[i]->key = partkey;
1120 pd[i]->keystate = NIL;
1121 pd[i]->partdesc = partdesc;
1122 if (parent != NULL)
1123 {
1124 /*
1125 * For every partitioned table other than root, we must store a
1126 * tuple table slot initialized with its tuple descriptor and a
1127 * tuple conversion map to convert a tuple from its parent's
1128 * rowtype to its own. That is to make sure that we are looking at
1129 * the correct row using the correct tuple descriptor when
1130 * computing its partition key for tuple routing.
1131 */
1132 pd[i]->tupslot = MakeSingleTupleTableSlot(tupdesc);
1133 pd[i]->tupmap = convert_tuples_by_name(RelationGetDescr(parent),
1134 tupdesc,
1135 gettext_noop("could not convert row type"));
1136 }
1137 else
1138 {
1139 /* Not required for the root partitioned table */
1140 pd[i]->tupslot = NULL;
1141 pd[i]->tupmap = NULL;
1142 }
1143 pd[i]->indexes = (int *) palloc(partdesc->nparts * sizeof(int));
1144
1145 /*
1146 * Indexes corresponding to the internal partitions are multiplied by
1147 * -1 to distinguish them from those of leaf partitions. Encountering
1148 * an index >= 0 means we found a leaf partition, which is immediately
1149 * returned as the partition we are looking for. A negative index
1150 * means we found a partitioned table, whose PartitionDispatch object
1151 * is located at the above index multiplied back by -1. Using the
1152 * PartitionDispatch object, search is continued further down the
1153 * partition tree.
1154 */
1155 m = 0;
1156 for (j = 0; j < partdesc->nparts; j++)
1157 {
1158 Oid partrelid = partdesc->oids[j];
1159
1160 if (get_rel_relkind(partrelid) != RELKIND_PARTITIONED_TABLE)
1161 {
1162 *leaf_part_oids = lappend_oid(*leaf_part_oids, partrelid);
1163 pd[i]->indexes[j] = k++;
1164 }
1165 else
1166 {
1167 /*
1168 * offset denotes the number of partitioned tables of upper
1169 * levels including those of the current level. Any partition
1170 * of this table must belong to the next level and hence will
1171 * be placed after the last partitioned table of this level.
1172 */
1173 pd[i]->indexes[j] = -(1 + offset + m);
1174 m++;
1175 }
1176 }
1177 i++;
1178
1179 /*
1180 * This counts the number of partitioned tables at upper levels
1181 * including those of the current level.
1182 */
1183 offset += m;
1184 }
1185
1186 return pd;
1187 }
1188
1189 /* Module-local functions */
1190
1191 /*
1192 * get_partition_operator
1193 *
1194 * Return oid of the operator of given strategy for a given partition key
1195 * column. It is assumed that the partitioning key is of the same type as the
1196 * chosen partitioning opclass, or at least binary-compatible. In the latter
1197 * case, *need_relabel is set to true if the opclass is not of a polymorphic
1198 * type, otherwise false.
1199 */
1200 static Oid
get_partition_operator(PartitionKey key,int col,StrategyNumber strategy,bool * need_relabel)1201 get_partition_operator(PartitionKey key, int col, StrategyNumber strategy,
1202 bool *need_relabel)
1203 {
1204 Oid operoid;
1205
1206 /*
1207 * Get the operator in the partitioning opfamily using the opclass'
1208 * declared input type as both left- and righttype.
1209 */
1210 operoid = get_opfamily_member(key->partopfamily[col],
1211 key->partopcintype[col],
1212 key->partopcintype[col],
1213 strategy);
1214 if (!OidIsValid(operoid))
1215 elog(ERROR, "missing operator %d(%u,%u) in partition opfamily %u",
1216 strategy, key->partopcintype[col], key->partopcintype[col],
1217 key->partopfamily[col]);
1218
1219 /*
1220 * If the partition key column is not of the same type as the operator
1221 * class and not polymorphic, tell caller to wrap the non-Const expression
1222 * in a RelabelType. This matches what parse_coerce.c does.
1223 */
1224 *need_relabel = (key->parttypid[col] != key->partopcintype[col] &&
1225 key->partopcintype[col] != RECORDOID &&
1226 !IsPolymorphicType(key->partopcintype[col]));
1227
1228 return operoid;
1229 }
1230
1231 /*
1232 * make_partition_op_expr
1233 * Returns an Expr for the given partition key column with arg1 and
1234 * arg2 as its leftop and rightop, respectively
1235 */
1236 static Expr *
make_partition_op_expr(PartitionKey key,int keynum,uint16 strategy,Expr * arg1,Expr * arg2)1237 make_partition_op_expr(PartitionKey key, int keynum,
1238 uint16 strategy, Expr *arg1, Expr *arg2)
1239 {
1240 Oid operoid;
1241 bool need_relabel = false;
1242 Expr *result = NULL;
1243
1244 /* Get the correct btree operator for this partitioning column */
1245 operoid = get_partition_operator(key, keynum, strategy, &need_relabel);
1246
1247 /*
1248 * Chosen operator may be such that the non-Const operand needs to be
1249 * coerced, so apply the same; see the comment in
1250 * get_partition_operator().
1251 */
1252 if (!IsA(arg1, Const) &&
1253 (need_relabel ||
1254 key->partcollation[keynum] != key->parttypcoll[keynum]))
1255 arg1 = (Expr *) makeRelabelType(arg1,
1256 key->partopcintype[keynum],
1257 -1,
1258 key->partcollation[keynum],
1259 COERCE_EXPLICIT_CAST);
1260
1261 /* Generate the actual expression */
1262 switch (key->strategy)
1263 {
1264 case PARTITION_STRATEGY_LIST:
1265 {
1266 List *elems = (List *) arg2;
1267 int nelems = list_length(elems);
1268
1269 Assert(nelems >= 1);
1270 Assert(keynum == 0);
1271
1272 if (nelems > 1 &&
1273 !type_is_array(key->parttypid[keynum]))
1274 {
1275 ArrayExpr *arrexpr;
1276 ScalarArrayOpExpr *saopexpr;
1277
1278 /* Construct an ArrayExpr for the right-hand inputs */
1279 arrexpr = makeNode(ArrayExpr);
1280 arrexpr->array_typeid =
1281 get_array_type(key->parttypid[keynum]);
1282 arrexpr->array_collid = key->parttypcoll[keynum];
1283 arrexpr->element_typeid = key->parttypid[keynum];
1284 arrexpr->elements = elems;
1285 arrexpr->multidims = false;
1286 arrexpr->location = -1;
1287
1288 /* Build leftop = ANY (rightop) */
1289 saopexpr = makeNode(ScalarArrayOpExpr);
1290 saopexpr->opno = operoid;
1291 saopexpr->opfuncid = get_opcode(operoid);
1292 saopexpr->useOr = true;
1293 saopexpr->inputcollid = key->partcollation[keynum];
1294 saopexpr->args = list_make2(arg1, arrexpr);
1295 saopexpr->location = -1;
1296
1297 result = (Expr *) saopexpr;
1298 }
1299 else
1300 {
1301 List *elemops = NIL;
1302 ListCell *lc;
1303
1304 foreach (lc, elems)
1305 {
1306 Expr *elem = lfirst(lc),
1307 *elemop;
1308
1309 elemop = make_opclause(operoid,
1310 BOOLOID,
1311 false,
1312 arg1, elem,
1313 InvalidOid,
1314 key->partcollation[keynum]);
1315 elemops = lappend(elemops, elemop);
1316 }
1317
1318 result = nelems > 1 ? makeBoolExpr(OR_EXPR, elemops, -1) : linitial(elemops);
1319 }
1320 break;
1321 }
1322
1323 case PARTITION_STRATEGY_RANGE:
1324 result = make_opclause(operoid,
1325 BOOLOID,
1326 false,
1327 arg1, arg2,
1328 InvalidOid,
1329 key->partcollation[keynum]);
1330 break;
1331
1332 default:
1333 elog(ERROR, "invalid partitioning strategy");
1334 break;
1335 }
1336
1337 return result;
1338 }
1339
1340 /*
1341 * get_qual_for_list
1342 *
1343 * Returns an implicit-AND list of expressions to use as a list partition's
1344 * constraint, given the partition key and bound structures.
1345 */
1346 static List *
get_qual_for_list(PartitionKey key,PartitionBoundSpec * spec)1347 get_qual_for_list(PartitionKey key, PartitionBoundSpec *spec)
1348 {
1349 List *result;
1350 Expr *keyCol;
1351 Expr *opexpr;
1352 NullTest *nulltest;
1353 ListCell *cell;
1354 List *elems = NIL;
1355 bool list_has_null = false;
1356
1357 /*
1358 * Only single-column list partitioning is supported, so we are worried
1359 * only about the partition key with index 0.
1360 */
1361 Assert(key->partnatts == 1);
1362
1363 /* Construct Var or expression representing the partition column */
1364 if (key->partattrs[0] != 0)
1365 keyCol = (Expr *) makeVar(1,
1366 key->partattrs[0],
1367 key->parttypid[0],
1368 key->parttypmod[0],
1369 key->parttypcoll[0],
1370 0);
1371 else
1372 keyCol = (Expr *) copyObject(linitial(key->partexprs));
1373
1374 /* Create list of Consts for the allowed values, excluding any nulls */
1375 foreach(cell, spec->listdatums)
1376 {
1377 Const *val = castNode(Const, lfirst(cell));
1378
1379 if (val->constisnull)
1380 list_has_null = true;
1381 else
1382 elems = lappend(elems, copyObject(val));
1383 }
1384
1385 if (elems)
1386 {
1387 /*
1388 * Generate the operator expression from the non-null partition
1389 * values.
1390 */
1391 opexpr = make_partition_op_expr(key, 0, BTEqualStrategyNumber,
1392 keyCol, (Expr *) elems);
1393 }
1394 else
1395 {
1396 /*
1397 * If there are no partition values, we don't need an operator
1398 * expression.
1399 */
1400 opexpr = NULL;
1401 }
1402
1403 if (!list_has_null)
1404 {
1405 /*
1406 * Gin up a "col IS NOT NULL" test that will be AND'd with the main
1407 * expression. This might seem redundant, but the partition routing
1408 * machinery needs it.
1409 */
1410 nulltest = makeNode(NullTest);
1411 nulltest->arg = keyCol;
1412 nulltest->nulltesttype = IS_NOT_NULL;
1413 nulltest->argisrow = false;
1414 nulltest->location = -1;
1415
1416 result = opexpr ? list_make2(nulltest, opexpr) : list_make1(nulltest);
1417 }
1418 else
1419 {
1420 /*
1421 * Gin up a "col IS NULL" test that will be OR'd with the main
1422 * expression.
1423 */
1424 nulltest = makeNode(NullTest);
1425 nulltest->arg = keyCol;
1426 nulltest->nulltesttype = IS_NULL;
1427 nulltest->argisrow = false;
1428 nulltest->location = -1;
1429
1430 if (opexpr)
1431 {
1432 Expr *or;
1433
1434 or = makeBoolExpr(OR_EXPR, list_make2(nulltest, opexpr), -1);
1435 result = list_make1(or);
1436 }
1437 else
1438 result = list_make1(nulltest);
1439 }
1440
1441 return result;
1442 }
1443
1444 /*
1445 * get_range_key_properties
1446 * Returns range partition key information for a given column
1447 *
1448 * This is a subroutine for get_qual_for_range, and its API is pretty
1449 * specialized to that caller.
1450 *
1451 * Constructs an Expr for the key column (returned in *keyCol) and Consts
1452 * for the lower and upper range limits (returned in *lower_val and
1453 * *upper_val). For MINVALUE/MAXVALUE limits, NULL is returned instead of
1454 * a Const. All of these structures are freshly palloc'd.
1455 *
1456 * *partexprs_item points to the cell containing the next expression in
1457 * the key->partexprs list, or NULL. It may be advanced upon return.
1458 */
1459 static void
get_range_key_properties(PartitionKey key,int keynum,PartitionRangeDatum * ldatum,PartitionRangeDatum * udatum,ListCell ** partexprs_item,Expr ** keyCol,Const ** lower_val,Const ** upper_val)1460 get_range_key_properties(PartitionKey key, int keynum,
1461 PartitionRangeDatum *ldatum,
1462 PartitionRangeDatum *udatum,
1463 ListCell **partexprs_item,
1464 Expr **keyCol,
1465 Const **lower_val, Const **upper_val)
1466 {
1467 /* Get partition key expression for this column */
1468 if (key->partattrs[keynum] != 0)
1469 {
1470 *keyCol = (Expr *) makeVar(1,
1471 key->partattrs[keynum],
1472 key->parttypid[keynum],
1473 key->parttypmod[keynum],
1474 key->parttypcoll[keynum],
1475 0);
1476 }
1477 else
1478 {
1479 if (*partexprs_item == NULL)
1480 elog(ERROR, "wrong number of partition key expressions");
1481 *keyCol = copyObject(lfirst(*partexprs_item));
1482 *partexprs_item = lnext(*partexprs_item);
1483 }
1484
1485 /* Get appropriate Const nodes for the bounds */
1486 if (ldatum->kind == PARTITION_RANGE_DATUM_VALUE)
1487 *lower_val = castNode(Const, copyObject(ldatum->value));
1488 else
1489 *lower_val = NULL;
1490
1491 if (udatum->kind == PARTITION_RANGE_DATUM_VALUE)
1492 *upper_val = castNode(Const, copyObject(udatum->value));
1493 else
1494 *upper_val = NULL;
1495 }
1496
1497 /*
1498 * get_qual_for_range
1499 *
1500 * Returns an implicit-AND list of expressions to use as a range partition's
1501 * constraint, given the partition key and bound structures.
1502 *
1503 * For a multi-column range partition key, say (a, b, c), with (al, bl, cl)
1504 * as the lower bound tuple and (au, bu, cu) as the upper bound tuple, we
1505 * generate an expression tree of the following form:
1506 *
1507 * (a IS NOT NULL) and (b IS NOT NULL) and (c IS NOT NULL)
1508 * AND
1509 * (a > al OR (a = al AND b > bl) OR (a = al AND b = bl AND c >= cl))
1510 * AND
1511 * (a < au OR (a = au AND b < bu) OR (a = au AND b = bu AND c < cu))
1512 *
1513 * It is often the case that a prefix of lower and upper bound tuples contains
1514 * the same values, for example, (al = au), in which case, we will emit an
1515 * expression tree of the following form:
1516 *
1517 * (a IS NOT NULL) and (b IS NOT NULL) and (c IS NOT NULL)
1518 * AND
1519 * (a = al)
1520 * AND
1521 * (b > bl OR (b = bl AND c >= cl))
1522 * AND
1523 * (b < bu OR (b = bu AND c < cu))
1524 *
1525 * If a bound datum is either MINVALUE or MAXVALUE, these expressions are
1526 * simplified using the fact that any value is greater than MINVALUE and less
1527 * than MAXVALUE. So, for example, if cu = MAXVALUE, c < cu is automatically
1528 * true, and we need not emit any expression for it, and the last line becomes
1529 *
1530 * (b < bu) OR (b = bu), which is simplified to (b <= bu)
1531 *
1532 * In most common cases with only one partition column, say a, the following
1533 * expression tree will be generated: a IS NOT NULL AND a >= al AND a < au
1534 *
1535 * If we end up with an empty result list, we return a single-member list
1536 * containing a constant TRUE, because callers expect a non-empty list.
1537 */
1538 static List *
get_qual_for_range(PartitionKey key,PartitionBoundSpec * spec)1539 get_qual_for_range(PartitionKey key, PartitionBoundSpec *spec)
1540 {
1541 List *result = NIL;
1542 ListCell *cell1,
1543 *cell2,
1544 *partexprs_item,
1545 *partexprs_item_saved;
1546 int i,
1547 j;
1548 PartitionRangeDatum *ldatum,
1549 *udatum;
1550 Expr *keyCol;
1551 Const *lower_val,
1552 *upper_val;
1553 NullTest *nulltest;
1554 List *lower_or_arms,
1555 *upper_or_arms;
1556 int num_or_arms,
1557 current_or_arm;
1558 ListCell *lower_or_start_datum,
1559 *upper_or_start_datum;
1560 bool need_next_lower_arm,
1561 need_next_upper_arm;
1562
1563 lower_or_start_datum = list_head(spec->lowerdatums);
1564 upper_or_start_datum = list_head(spec->upperdatums);
1565 num_or_arms = key->partnatts;
1566
1567 /*
1568 * A range-partitioned table does not currently allow partition keys to be
1569 * null, so emit an IS NOT NULL expression for each key column.
1570 */
1571 partexprs_item = list_head(key->partexprs);
1572 for (i = 0; i < key->partnatts; i++)
1573 {
1574 Expr *keyCol;
1575
1576 if (key->partattrs[i] != 0)
1577 {
1578 keyCol = (Expr *) makeVar(1,
1579 key->partattrs[i],
1580 key->parttypid[i],
1581 key->parttypmod[i],
1582 key->parttypcoll[i],
1583 0);
1584 }
1585 else
1586 {
1587 if (partexprs_item == NULL)
1588 elog(ERROR, "wrong number of partition key expressions");
1589 keyCol = copyObject(lfirst(partexprs_item));
1590 partexprs_item = lnext(partexprs_item);
1591 }
1592
1593 nulltest = makeNode(NullTest);
1594 nulltest->arg = keyCol;
1595 nulltest->nulltesttype = IS_NOT_NULL;
1596 nulltest->argisrow = false;
1597 nulltest->location = -1;
1598 result = lappend(result, nulltest);
1599 }
1600
1601 /*
1602 * Iterate over the key columns and check if the corresponding lower and
1603 * upper datums are equal using the btree equality operator for the
1604 * column's type. If equal, we emit single keyCol = common_value
1605 * expression. Starting from the first column for which the corresponding
1606 * lower and upper bound datums are not equal, we generate OR expressions
1607 * as shown in the function's header comment.
1608 */
1609 i = 0;
1610 partexprs_item = list_head(key->partexprs);
1611 partexprs_item_saved = partexprs_item; /* placate compiler */
1612 forboth(cell1, spec->lowerdatums, cell2, spec->upperdatums)
1613 {
1614 EState *estate;
1615 MemoryContext oldcxt;
1616 Expr *test_expr;
1617 ExprState *test_exprstate;
1618 Datum test_result;
1619 bool isNull;
1620
1621 ldatum = castNode(PartitionRangeDatum, lfirst(cell1));
1622 udatum = castNode(PartitionRangeDatum, lfirst(cell2));
1623
1624 /*
1625 * Since get_range_key_properties() modifies partexprs_item, and we
1626 * might need to start over from the previous expression in the later
1627 * part of this function, save away the current value.
1628 */
1629 partexprs_item_saved = partexprs_item;
1630
1631 get_range_key_properties(key, i, ldatum, udatum,
1632 &partexprs_item,
1633 &keyCol,
1634 &lower_val, &upper_val);
1635
1636 /*
1637 * If either value is NULL, the corresponding partition bound is
1638 * either MINVALUE or MAXVALUE, and we treat them as unequal, because
1639 * even if they're the same, there is no common value to equate the
1640 * key column with.
1641 */
1642 if (!lower_val || !upper_val)
1643 break;
1644
1645 /* Create the test expression */
1646 estate = CreateExecutorState();
1647 oldcxt = MemoryContextSwitchTo(estate->es_query_cxt);
1648 test_expr = make_partition_op_expr(key, i, BTEqualStrategyNumber,
1649 (Expr *) lower_val,
1650 (Expr *) upper_val);
1651 fix_opfuncids((Node *) test_expr);
1652 test_exprstate = ExecInitExpr(test_expr, NULL);
1653 test_result = ExecEvalExprSwitchContext(test_exprstate,
1654 GetPerTupleExprContext(estate),
1655 &isNull);
1656 MemoryContextSwitchTo(oldcxt);
1657 FreeExecutorState(estate);
1658
1659 /* If not equal, go generate the OR expressions */
1660 if (!DatumGetBool(test_result))
1661 break;
1662
1663 /*
1664 * The bounds for the last key column can't be equal, because such a
1665 * range partition would never be allowed to be defined (it would have
1666 * an empty range otherwise).
1667 */
1668 if (i == key->partnatts - 1)
1669 elog(ERROR, "invalid range bound specification");
1670
1671 /* Equal, so generate keyCol = lower_val expression */
1672 result = lappend(result,
1673 make_partition_op_expr(key, i, BTEqualStrategyNumber,
1674 keyCol, (Expr *) lower_val));
1675
1676 i++;
1677 }
1678
1679 /* First pair of lower_val and upper_val that are not equal. */
1680 lower_or_start_datum = cell1;
1681 upper_or_start_datum = cell2;
1682
1683 /* OR will have as many arms as there are key columns left. */
1684 num_or_arms = key->partnatts - i;
1685 current_or_arm = 0;
1686 lower_or_arms = upper_or_arms = NIL;
1687 need_next_lower_arm = need_next_upper_arm = true;
1688 while (current_or_arm < num_or_arms)
1689 {
1690 List *lower_or_arm_args = NIL,
1691 *upper_or_arm_args = NIL;
1692
1693 /* Restart scan of columns from the i'th one */
1694 j = i;
1695 partexprs_item = partexprs_item_saved;
1696
1697 for_both_cell(cell1, lower_or_start_datum, cell2, upper_or_start_datum)
1698 {
1699 PartitionRangeDatum *ldatum_next = NULL,
1700 *udatum_next = NULL;
1701
1702 ldatum = castNode(PartitionRangeDatum, lfirst(cell1));
1703 if (lnext(cell1))
1704 ldatum_next = castNode(PartitionRangeDatum,
1705 lfirst(lnext(cell1)));
1706 udatum = castNode(PartitionRangeDatum, lfirst(cell2));
1707 if (lnext(cell2))
1708 udatum_next = castNode(PartitionRangeDatum,
1709 lfirst(lnext(cell2)));
1710 get_range_key_properties(key, j, ldatum, udatum,
1711 &partexprs_item,
1712 &keyCol,
1713 &lower_val, &upper_val);
1714
1715 if (need_next_lower_arm && lower_val)
1716 {
1717 uint16 strategy;
1718
1719 /*
1720 * For the non-last columns of this arm, use the EQ operator.
1721 * For the last column of this arm, use GT, unless this is the
1722 * last column of the whole bound check, or the next bound
1723 * datum is MINVALUE, in which case use GE.
1724 */
1725 if (j - i < current_or_arm)
1726 strategy = BTEqualStrategyNumber;
1727 else if (j == key->partnatts - 1 ||
1728 (ldatum_next &&
1729 ldatum_next->kind == PARTITION_RANGE_DATUM_MINVALUE))
1730 strategy = BTGreaterEqualStrategyNumber;
1731 else
1732 strategy = BTGreaterStrategyNumber;
1733
1734 lower_or_arm_args = lappend(lower_or_arm_args,
1735 make_partition_op_expr(key, j,
1736 strategy,
1737 keyCol,
1738 (Expr *) lower_val));
1739 }
1740
1741 if (need_next_upper_arm && upper_val)
1742 {
1743 uint16 strategy;
1744
1745 /*
1746 * For the non-last columns of this arm, use the EQ operator.
1747 * For the last column of this arm, use LT, unless the next
1748 * bound datum is MAXVALUE, in which case use LE.
1749 */
1750 if (j - i < current_or_arm)
1751 strategy = BTEqualStrategyNumber;
1752 else if (udatum_next &&
1753 udatum_next->kind == PARTITION_RANGE_DATUM_MAXVALUE)
1754 strategy = BTLessEqualStrategyNumber;
1755 else
1756 strategy = BTLessStrategyNumber;
1757
1758 upper_or_arm_args = lappend(upper_or_arm_args,
1759 make_partition_op_expr(key, j,
1760 strategy,
1761 keyCol,
1762 (Expr *) upper_val));
1763 }
1764
1765 /*
1766 * Did we generate enough of OR's arguments? First arm considers
1767 * the first of the remaining columns, second arm considers first
1768 * two of the remaining columns, and so on.
1769 */
1770 ++j;
1771 if (j - i > current_or_arm)
1772 {
1773 /*
1774 * We must not emit any more arms if the new column that will
1775 * be considered is unbounded, or this one was.
1776 */
1777 if (!lower_val || !ldatum_next ||
1778 ldatum_next->kind != PARTITION_RANGE_DATUM_VALUE)
1779 need_next_lower_arm = false;
1780 if (!upper_val || !udatum_next ||
1781 udatum_next->kind != PARTITION_RANGE_DATUM_VALUE)
1782 need_next_upper_arm = false;
1783 break;
1784 }
1785 }
1786
1787 if (lower_or_arm_args != NIL)
1788 lower_or_arms = lappend(lower_or_arms,
1789 list_length(lower_or_arm_args) > 1
1790 ? makeBoolExpr(AND_EXPR, lower_or_arm_args, -1)
1791 : linitial(lower_or_arm_args));
1792
1793 if (upper_or_arm_args != NIL)
1794 upper_or_arms = lappend(upper_or_arms,
1795 list_length(upper_or_arm_args) > 1
1796 ? makeBoolExpr(AND_EXPR, upper_or_arm_args, -1)
1797 : linitial(upper_or_arm_args));
1798
1799 /* If no work to do in the next iteration, break away. */
1800 if (!need_next_lower_arm && !need_next_upper_arm)
1801 break;
1802
1803 ++current_or_arm;
1804 }
1805
1806 /*
1807 * Generate the OR expressions for each of lower and upper bounds (if
1808 * required), and append to the list of implicitly ANDed list of
1809 * expressions.
1810 */
1811 if (lower_or_arms != NIL)
1812 result = lappend(result,
1813 list_length(lower_or_arms) > 1
1814 ? makeBoolExpr(OR_EXPR, lower_or_arms, -1)
1815 : linitial(lower_or_arms));
1816 if (upper_or_arms != NIL)
1817 result = lappend(result,
1818 list_length(upper_or_arms) > 1
1819 ? makeBoolExpr(OR_EXPR, upper_or_arms, -1)
1820 : linitial(upper_or_arms));
1821
1822 /* As noted above, caller expects the list to be non-empty. */
1823 if (result == NIL)
1824 result = list_make1(makeBoolConst(true, false));
1825
1826 return result;
1827 }
1828
1829 /*
1830 * generate_partition_qual
1831 *
1832 * Generate partition predicate from rel's partition bound expression
1833 *
1834 * We cache a copy of the result in the relcache entry, after constructing
1835 * it using the caller's context. This approach avoids leaking any data
1836 * into long-lived cache contexts, especially if we fail partway through.
1837 */
1838 static List *
generate_partition_qual(Relation rel)1839 generate_partition_qual(Relation rel)
1840 {
1841 HeapTuple tuple;
1842 MemoryContext oldcxt;
1843 Datum boundDatum;
1844 bool isnull;
1845 PartitionBoundSpec *bound;
1846 List *my_qual = NIL,
1847 *result = NIL;
1848 Relation parent;
1849 bool found_whole_row;
1850
1851 /* Guard against stack overflow due to overly deep partition tree */
1852 check_stack_depth();
1853
1854 /* If we already cached the result, just return a copy */
1855 if (rel->rd_partcheckvalid)
1856 return copyObject(rel->rd_partcheck);
1857
1858 /* Grab at least an AccessShareLock on the parent table */
1859 parent = relation_open(get_partition_parent(RelationGetRelid(rel)),
1860 AccessShareLock);
1861
1862 /* Get pg_class.relpartbound */
1863 tuple = SearchSysCache1(RELOID, RelationGetRelid(rel));
1864 if (!HeapTupleIsValid(tuple))
1865 elog(ERROR, "cache lookup failed for relation %u",
1866 RelationGetRelid(rel));
1867
1868 boundDatum = SysCacheGetAttr(RELOID, tuple,
1869 Anum_pg_class_relpartbound,
1870 &isnull);
1871 if (isnull) /* should not happen */
1872 elog(ERROR, "relation \"%s\" has relpartbound = null",
1873 RelationGetRelationName(rel));
1874 bound = castNode(PartitionBoundSpec,
1875 stringToNode(TextDatumGetCString(boundDatum)));
1876 ReleaseSysCache(tuple);
1877
1878 my_qual = get_qual_from_partbound(rel, parent, bound);
1879
1880 /* Add the parent's quals to the list (if any) */
1881 if (parent->rd_rel->relispartition)
1882 result = list_concat(generate_partition_qual(parent), my_qual);
1883 else
1884 result = my_qual;
1885
1886 /*
1887 * Change Vars to have partition's attnos instead of the parent's. We do
1888 * this after we concatenate the parent's quals, because we want every Var
1889 * in it to bear this relation's attnos. It's safe to assume varno = 1
1890 * here.
1891 */
1892 result = map_partition_varattnos(result, 1, rel, parent,
1893 &found_whole_row);
1894 /* There can never be a whole-row reference here */
1895 if (found_whole_row)
1896 elog(ERROR, "unexpected whole-row reference found in partition key");
1897
1898 /* Assert that we're not leaking any old data during assignments below */
1899 Assert(rel->rd_partcheckcxt == NULL);
1900 Assert(rel->rd_partcheck == NIL);
1901
1902 /*
1903 * Save a copy in the relcache. The order of these operations is fairly
1904 * critical to avoid memory leaks and ensure that we don't leave a corrupt
1905 * relcache entry if we fail partway through copyObject.
1906 *
1907 * If, as is definitely possible, the partcheck list is NIL, then we do
1908 * not need to make a context to hold it.
1909 */
1910 if (result != NIL)
1911 {
1912 rel->rd_partcheckcxt = AllocSetContextCreate(CacheMemoryContext,
1913 RelationGetRelationName(rel),
1914 ALLOCSET_SMALL_SIZES);
1915 oldcxt = MemoryContextSwitchTo(rel->rd_partcheckcxt);
1916 rel->rd_partcheck = copyObject(result);
1917 MemoryContextSwitchTo(oldcxt);
1918 }
1919 else
1920 rel->rd_partcheck = NIL;
1921 rel->rd_partcheckvalid = true;
1922
1923 /* Keep the parent locked until commit */
1924 relation_close(parent, NoLock);
1925
1926 /* Return the working copy to the caller */
1927 return result;
1928 }
1929
1930 /* ----------------
1931 * FormPartitionKeyDatum
1932 * Construct values[] and isnull[] arrays for the partition key
1933 * of a tuple.
1934 *
1935 * pd Partition dispatch object of the partitioned table
1936 * slot Heap tuple from which to extract partition key
1937 * estate executor state for evaluating any partition key
1938 * expressions (must be non-NULL)
1939 * values Array of partition key Datums (output area)
1940 * isnull Array of is-null indicators (output area)
1941 *
1942 * the ecxt_scantuple slot of estate's per-tuple expr context must point to
1943 * the heap tuple passed in.
1944 * ----------------
1945 */
1946 void
FormPartitionKeyDatum(PartitionDispatch pd,TupleTableSlot * slot,EState * estate,Datum * values,bool * isnull)1947 FormPartitionKeyDatum(PartitionDispatch pd,
1948 TupleTableSlot *slot,
1949 EState *estate,
1950 Datum *values,
1951 bool *isnull)
1952 {
1953 ListCell *partexpr_item;
1954 int i;
1955
1956 if (pd->key->partexprs != NIL && pd->keystate == NIL)
1957 {
1958 /* Check caller has set up context correctly */
1959 Assert(estate != NULL &&
1960 GetPerTupleExprContext(estate)->ecxt_scantuple == slot);
1961
1962 /* First time through, set up expression evaluation state */
1963 pd->keystate = ExecPrepareExprList(pd->key->partexprs, estate);
1964 }
1965
1966 partexpr_item = list_head(pd->keystate);
1967 for (i = 0; i < pd->key->partnatts; i++)
1968 {
1969 AttrNumber keycol = pd->key->partattrs[i];
1970 Datum datum;
1971 bool isNull;
1972
1973 if (keycol != 0)
1974 {
1975 /* Plain column; get the value directly from the heap tuple */
1976 datum = slot_getattr(slot, keycol, &isNull);
1977 }
1978 else
1979 {
1980 /* Expression; need to evaluate it */
1981 if (partexpr_item == NULL)
1982 elog(ERROR, "wrong number of partition key expressions");
1983 datum = ExecEvalExprSwitchContext((ExprState *) lfirst(partexpr_item),
1984 GetPerTupleExprContext(estate),
1985 &isNull);
1986 partexpr_item = lnext(partexpr_item);
1987 }
1988 values[i] = datum;
1989 isnull[i] = isNull;
1990 }
1991
1992 if (partexpr_item != NULL)
1993 elog(ERROR, "wrong number of partition key expressions");
1994 }
1995
1996 /*
1997 * get_partition_for_tuple
1998 * Finds a leaf partition for tuple contained in *slot
1999 *
2000 * Returned value is the sequence number of the leaf partition thus found,
2001 * or -1 if no leaf partition is found for the tuple. *failed_at is set
2002 * to the OID of the partitioned table whose partition was not found in
2003 * the latter case.
2004 */
2005 int
get_partition_for_tuple(PartitionDispatch * pd,TupleTableSlot * slot,EState * estate,PartitionDispatchData ** failed_at,TupleTableSlot ** failed_slot)2006 get_partition_for_tuple(PartitionDispatch *pd,
2007 TupleTableSlot *slot,
2008 EState *estate,
2009 PartitionDispatchData **failed_at,
2010 TupleTableSlot **failed_slot)
2011 {
2012 PartitionDispatch parent;
2013 Datum values[PARTITION_MAX_KEYS];
2014 bool isnull[PARTITION_MAX_KEYS];
2015 int cur_offset,
2016 cur_index;
2017 int i,
2018 result;
2019 ExprContext *ecxt = GetPerTupleExprContext(estate);
2020 TupleTableSlot *ecxt_scantuple_old = ecxt->ecxt_scantuple;
2021
2022 /* start with the root partitioned table */
2023 parent = pd[0];
2024 while (true)
2025 {
2026 PartitionKey key = parent->key;
2027 PartitionDesc partdesc = parent->partdesc;
2028 TupleTableSlot *myslot = parent->tupslot;
2029 TupleConversionMap *map = parent->tupmap;
2030
2031 if (myslot != NULL && map != NULL)
2032 {
2033 HeapTuple tuple = ExecFetchSlotTuple(slot);
2034
2035 ExecClearTuple(myslot);
2036 tuple = do_convert_tuple(tuple, map);
2037 ExecStoreTuple(tuple, myslot, InvalidBuffer, true);
2038 slot = myslot;
2039 }
2040
2041 /* Quick exit */
2042 if (partdesc->nparts == 0)
2043 {
2044 *failed_at = parent;
2045 *failed_slot = slot;
2046 result = -1;
2047 goto error_exit;
2048 }
2049
2050 /*
2051 * Extract partition key from tuple. Expression evaluation machinery
2052 * that FormPartitionKeyDatum() invokes expects ecxt_scantuple to
2053 * point to the correct tuple slot. The slot might have changed from
2054 * what was used for the parent table if the table of the current
2055 * partitioning level has different tuple descriptor from the parent.
2056 * So update ecxt_scantuple accordingly.
2057 */
2058 ecxt->ecxt_scantuple = slot;
2059 FormPartitionKeyDatum(parent, slot, estate, values, isnull);
2060
2061 if (key->strategy == PARTITION_STRATEGY_RANGE)
2062 {
2063 /*
2064 * Since we cannot route tuples with NULL partition keys through a
2065 * range-partitioned table, simply return that no partition exists
2066 */
2067 for (i = 0; i < key->partnatts; i++)
2068 {
2069 if (isnull[i])
2070 {
2071 *failed_at = parent;
2072 *failed_slot = slot;
2073 result = -1;
2074 goto error_exit;
2075 }
2076 }
2077 }
2078
2079 /*
2080 * A null partition key is only acceptable if null-accepting list
2081 * partition exists.
2082 */
2083 cur_index = -1;
2084 if (isnull[0] && partition_bound_accepts_nulls(partdesc->boundinfo))
2085 cur_index = partdesc->boundinfo->null_index;
2086 else if (!isnull[0])
2087 {
2088 /* Else bsearch in partdesc->boundinfo */
2089 bool equal = false;
2090
2091 cur_offset = partition_bound_bsearch(key, partdesc->boundinfo,
2092 values, false, &equal);
2093 switch (key->strategy)
2094 {
2095 case PARTITION_STRATEGY_LIST:
2096 if (cur_offset >= 0 && equal)
2097 cur_index = partdesc->boundinfo->indexes[cur_offset];
2098 else
2099 cur_index = -1;
2100 break;
2101
2102 case PARTITION_STRATEGY_RANGE:
2103
2104 /*
2105 * Offset returned is such that the bound at offset is
2106 * found to be less or equal with the tuple. So, the bound
2107 * at offset+1 would be the upper bound.
2108 */
2109 cur_index = partdesc->boundinfo->indexes[cur_offset + 1];
2110 break;
2111
2112 default:
2113 elog(ERROR, "unexpected partition strategy: %d",
2114 (int) key->strategy);
2115 }
2116 }
2117
2118 /*
2119 * cur_index < 0 means we failed to find a partition of this parent.
2120 * cur_index >= 0 means we either found the leaf partition, or the
2121 * next parent to find a partition of.
2122 */
2123 if (cur_index < 0)
2124 {
2125 result = -1;
2126 *failed_at = parent;
2127 *failed_slot = slot;
2128 break;
2129 }
2130 else if (parent->indexes[cur_index] >= 0)
2131 {
2132 result = parent->indexes[cur_index];
2133 break;
2134 }
2135 else
2136 parent = pd[-parent->indexes[cur_index]];
2137 }
2138
2139 error_exit:
2140 ecxt->ecxt_scantuple = ecxt_scantuple_old;
2141 return result;
2142 }
2143
2144 /*
2145 * qsort_partition_list_value_cmp
2146 *
2147 * Compare two list partition bound datums
2148 */
2149 static int32
qsort_partition_list_value_cmp(const void * a,const void * b,void * arg)2150 qsort_partition_list_value_cmp(const void *a, const void *b, void *arg)
2151 {
2152 Datum val1 = (*(const PartitionListValue **) a)->value,
2153 val2 = (*(const PartitionListValue **) b)->value;
2154 PartitionKey key = (PartitionKey) arg;
2155
2156 return DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[0],
2157 key->partcollation[0],
2158 val1, val2));
2159 }
2160
2161 /*
2162 * make_one_range_bound
2163 *
2164 * Return a PartitionRangeBound given a list of PartitionRangeDatum elements
2165 * and a flag telling whether the bound is lower or not. Made into a function
2166 * because there are multiple sites that want to use this facility.
2167 */
2168 static PartitionRangeBound *
make_one_range_bound(PartitionKey key,int index,List * datums,bool lower)2169 make_one_range_bound(PartitionKey key, int index, List *datums, bool lower)
2170 {
2171 PartitionRangeBound *bound;
2172 ListCell *lc;
2173 int i;
2174
2175 bound = (PartitionRangeBound *) palloc0(sizeof(PartitionRangeBound));
2176 bound->index = index;
2177 bound->datums = (Datum *) palloc0(key->partnatts * sizeof(Datum));
2178 bound->kind = (PartitionRangeDatumKind *) palloc0(key->partnatts *
2179 sizeof(PartitionRangeDatumKind));
2180 bound->lower = lower;
2181
2182 i = 0;
2183 foreach(lc, datums)
2184 {
2185 PartitionRangeDatum *datum = castNode(PartitionRangeDatum, lfirst(lc));
2186
2187 /* What's contained in this range datum? */
2188 bound->kind[i] = datum->kind;
2189
2190 if (datum->kind == PARTITION_RANGE_DATUM_VALUE)
2191 {
2192 Const *val = castNode(Const, datum->value);
2193
2194 if (val->constisnull)
2195 elog(ERROR, "invalid range bound datum");
2196 bound->datums[i] = val->constvalue;
2197 }
2198
2199 i++;
2200 }
2201
2202 return bound;
2203 }
2204
2205 /* Used when sorting range bounds across all range partitions */
2206 static int32
qsort_partition_rbound_cmp(const void * a,const void * b,void * arg)2207 qsort_partition_rbound_cmp(const void *a, const void *b, void *arg)
2208 {
2209 PartitionRangeBound *b1 = (*(PartitionRangeBound *const *) a);
2210 PartitionRangeBound *b2 = (*(PartitionRangeBound *const *) b);
2211 PartitionKey key = (PartitionKey) arg;
2212
2213 return partition_rbound_cmp(key, b1->datums, b1->kind, b1->lower, b2);
2214 }
2215
2216 /*
2217 * partition_rbound_cmp
2218 *
2219 * Return for two range bounds whether the 1st one (specified in datum1,
2220 * kind1, and lower1) is <, =, or > the bound specified in *b2.
2221 *
2222 * Note that if the values of the two range bounds compare equal, then we take
2223 * into account whether they are upper or lower bounds, and an upper bound is
2224 * considered to be smaller than a lower bound. This is important to the way
2225 * that RelationBuildPartitionDesc() builds the PartitionBoundInfoData
2226 * structure, which only stores the upper bound of a common boundary between
2227 * two contiguous partitions.
2228 */
2229 static int32
partition_rbound_cmp(PartitionKey key,Datum * datums1,PartitionRangeDatumKind * kind1,bool lower1,PartitionRangeBound * b2)2230 partition_rbound_cmp(PartitionKey key,
2231 Datum *datums1, PartitionRangeDatumKind *kind1,
2232 bool lower1, PartitionRangeBound *b2)
2233 {
2234 int32 cmpval = 0; /* placate compiler */
2235 int i;
2236 Datum *datums2 = b2->datums;
2237 PartitionRangeDatumKind *kind2 = b2->kind;
2238 bool lower2 = b2->lower;
2239
2240 for (i = 0; i < key->partnatts; i++)
2241 {
2242 /*
2243 * First, handle cases where the column is unbounded, which should not
2244 * invoke the comparison procedure, and should not consider any later
2245 * columns. Note that the PartitionRangeDatumKind enum elements
2246 * compare the same way as the values they represent.
2247 */
2248 if (kind1[i] < kind2[i])
2249 return -1;
2250 else if (kind1[i] > kind2[i])
2251 return 1;
2252 else if (kind1[i] != PARTITION_RANGE_DATUM_VALUE)
2253
2254 /*
2255 * The column bounds are both MINVALUE or both MAXVALUE. No later
2256 * columns should be considered, but we still need to compare
2257 * whether they are upper or lower bounds.
2258 */
2259 break;
2260
2261 cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[i],
2262 key->partcollation[i],
2263 datums1[i],
2264 datums2[i]));
2265 if (cmpval != 0)
2266 break;
2267 }
2268
2269 /*
2270 * If the comparison is anything other than equal, we're done. If they
2271 * compare equal though, we still have to consider whether the boundaries
2272 * are inclusive or exclusive. Exclusive one is considered smaller of the
2273 * two.
2274 */
2275 if (cmpval == 0 && lower1 != lower2)
2276 cmpval = lower1 ? 1 : -1;
2277
2278 return cmpval;
2279 }
2280
2281 /*
2282 * partition_rbound_datum_cmp
2283 *
2284 * Return whether range bound (specified in rb_datums, rb_kind, and rb_lower)
2285 * is <, =, or > partition key of tuple (tuple_datums)
2286 */
2287 static int32
partition_rbound_datum_cmp(PartitionKey key,Datum * rb_datums,PartitionRangeDatumKind * rb_kind,Datum * tuple_datums)2288 partition_rbound_datum_cmp(PartitionKey key,
2289 Datum *rb_datums, PartitionRangeDatumKind *rb_kind,
2290 Datum *tuple_datums)
2291 {
2292 int i;
2293 int32 cmpval = -1;
2294
2295 for (i = 0; i < key->partnatts; i++)
2296 {
2297 if (rb_kind[i] == PARTITION_RANGE_DATUM_MINVALUE)
2298 return -1;
2299 else if (rb_kind[i] == PARTITION_RANGE_DATUM_MAXVALUE)
2300 return 1;
2301
2302 cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[i],
2303 key->partcollation[i],
2304 rb_datums[i],
2305 tuple_datums[i]));
2306 if (cmpval != 0)
2307 break;
2308 }
2309
2310 return cmpval;
2311 }
2312
2313 /*
2314 * partition_bound_cmp
2315 *
2316 * Return whether the bound at offset in boundinfo is <, =, or > the argument
2317 * specified in *probe.
2318 */
2319 static int32
partition_bound_cmp(PartitionKey key,PartitionBoundInfo boundinfo,int offset,void * probe,bool probe_is_bound)2320 partition_bound_cmp(PartitionKey key, PartitionBoundInfo boundinfo,
2321 int offset, void *probe, bool probe_is_bound)
2322 {
2323 Datum *bound_datums = boundinfo->datums[offset];
2324 int32 cmpval = -1;
2325
2326 switch (key->strategy)
2327 {
2328 case PARTITION_STRATEGY_LIST:
2329 cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[0],
2330 key->partcollation[0],
2331 bound_datums[0],
2332 *(Datum *) probe));
2333 break;
2334
2335 case PARTITION_STRATEGY_RANGE:
2336 {
2337 PartitionRangeDatumKind *kind = boundinfo->kind[offset];
2338
2339 if (probe_is_bound)
2340 {
2341 /*
2342 * We need to pass whether the existing bound is a lower
2343 * bound, so that two equal-valued lower and upper bounds
2344 * are not regarded equal.
2345 */
2346 bool lower = boundinfo->indexes[offset] < 0;
2347
2348 cmpval = partition_rbound_cmp(key,
2349 bound_datums, kind, lower,
2350 (PartitionRangeBound *) probe);
2351 }
2352 else
2353 cmpval = partition_rbound_datum_cmp(key,
2354 bound_datums, kind,
2355 (Datum *) probe);
2356 break;
2357 }
2358
2359 default:
2360 elog(ERROR, "unexpected partition strategy: %d",
2361 (int) key->strategy);
2362 }
2363
2364 return cmpval;
2365 }
2366
2367 /*
2368 * Binary search on a collection of partition bounds. Returns greatest
2369 * bound in array boundinfo->datums which is less than or equal to *probe.
2370 * If all bounds in the array are greater than *probe, -1 is returned.
2371 *
2372 * *probe could either be a partition bound or a Datum array representing
2373 * the partition key of a tuple being routed; probe_is_bound tells which.
2374 * We pass that down to the comparison function so that it can interpret the
2375 * contents of *probe accordingly.
2376 *
2377 * *is_equal is set to whether the bound at the returned index is equal with
2378 * *probe.
2379 */
2380 static int
partition_bound_bsearch(PartitionKey key,PartitionBoundInfo boundinfo,void * probe,bool probe_is_bound,bool * is_equal)2381 partition_bound_bsearch(PartitionKey key, PartitionBoundInfo boundinfo,
2382 void *probe, bool probe_is_bound, bool *is_equal)
2383 {
2384 int lo,
2385 hi,
2386 mid;
2387
2388 lo = -1;
2389 hi = boundinfo->ndatums - 1;
2390 while (lo < hi)
2391 {
2392 int32 cmpval;
2393
2394 mid = (lo + hi + 1) / 2;
2395 cmpval = partition_bound_cmp(key, boundinfo, mid, probe,
2396 probe_is_bound);
2397 if (cmpval <= 0)
2398 {
2399 lo = mid;
2400 *is_equal = (cmpval == 0);
2401
2402 if (*is_equal)
2403 break;
2404 }
2405 else
2406 hi = mid - 1;
2407 }
2408
2409 return lo;
2410 }
2411