1 /*-------------------------------------------------------------------------
2  *
3  * partition.c
4  *		  Partitioning related data structures and functions.
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *		  src/backend/catalog/partition.c
12  *
13  *-------------------------------------------------------------------------
14 */
15 
16 #include "postgres.h"
17 
18 #include "access/heapam.h"
19 #include "access/htup_details.h"
20 #include "access/nbtree.h"
21 #include "access/sysattr.h"
22 #include "catalog/dependency.h"
23 #include "catalog/indexing.h"
24 #include "catalog/objectaddress.h"
25 #include "catalog/partition.h"
26 #include "catalog/pg_collation.h"
27 #include "catalog/pg_inherits.h"
28 #include "catalog/pg_inherits_fn.h"
29 #include "catalog/pg_opclass.h"
30 #include "catalog/pg_type.h"
31 #include "executor/executor.h"
32 #include "miscadmin.h"
33 #include "nodes/makefuncs.h"
34 #include "nodes/nodeFuncs.h"
35 #include "nodes/parsenodes.h"
36 #include "optimizer/clauses.h"
37 #include "optimizer/planmain.h"
38 #include "optimizer/var.h"
39 #include "rewrite/rewriteManip.h"
40 #include "storage/lmgr.h"
41 #include "utils/array.h"
42 #include "utils/builtins.h"
43 #include "utils/datum.h"
44 #include "utils/memutils.h"
45 #include "utils/fmgroids.h"
46 #include "utils/inval.h"
47 #include "utils/lsyscache.h"
48 #include "utils/rel.h"
49 #include "utils/ruleutils.h"
50 #include "utils/syscache.h"
51 
52 /*
53  * Information about bounds of a partitioned relation
54  *
55  * A list partition datum that is known to be NULL is never put into the
56  * datums array. Instead, it is tracked using the null_index field.
57  *
58  * In the case of range partitioning, ndatums will typically be far less than
59  * 2 * nparts, because a partition's upper bound and the next partition's lower
60  * bound are the same in most common cases, and we only store one of them (the
61  * upper bound).
62  *
63  * In the case of list partitioning, the indexes array stores one entry for
64  * every datum, which is the index of the partition that accepts a given datum.
65  * In case of range partitioning, it stores one entry per distinct range
66  * datum, which is the index of the partition for which a given datum
67  * is an upper bound.
68  */
69 
70 typedef struct PartitionBoundInfoData
71 {
72 	char		strategy;		/* list or range bounds? */
73 	int			ndatums;		/* Length of the datums following array */
74 	Datum	  **datums;			/* Array of datum-tuples with key->partnatts
75 								 * datums each */
76 	PartitionRangeDatumKind **kind; /* The kind of each range bound datum;
77 									 * NULL for list partitioned tables */
78 	int		   *indexes;		/* Partition indexes; one entry per member of
79 								 * the datums array (plus one if range
80 								 * partitioned table) */
81 	int			null_index;		/* Index of the null-accepting partition; -1
82 								 * if there isn't one */
83 } PartitionBoundInfoData;
84 
85 #define partition_bound_accepts_nulls(bi) ((bi)->null_index != -1)
86 
87 /*
88  * When qsort'ing partition bounds after reading from the catalog, each bound
89  * is represented with one of the following structs.
90  */
91 
92 /* One value coming from some (index'th) list partition */
93 typedef struct PartitionListValue
94 {
95 	int			index;
96 	Datum		value;
97 } PartitionListValue;
98 
99 /* One bound of a range partition */
100 typedef struct PartitionRangeBound
101 {
102 	int			index;
103 	Datum	   *datums;			/* range bound datums */
104 	PartitionRangeDatumKind *kind;	/* the kind of each datum */
105 	bool		lower;			/* this is the lower (vs upper) bound */
106 } PartitionRangeBound;
107 
108 static int32 qsort_partition_list_value_cmp(const void *a, const void *b,
109 							   void *arg);
110 static int32 qsort_partition_rbound_cmp(const void *a, const void *b,
111 						   void *arg);
112 
113 static Oid get_partition_operator(PartitionKey key, int col,
114 					   StrategyNumber strategy, bool *need_relabel);
115 static Expr *make_partition_op_expr(PartitionKey key, int keynum,
116 					   uint16 strategy, Expr *arg1, Expr *arg2);
117 static void get_range_key_properties(PartitionKey key, int keynum,
118 						 PartitionRangeDatum *ldatum,
119 						 PartitionRangeDatum *udatum,
120 						 ListCell **partexprs_item,
121 						 Expr **keyCol,
122 						 Const **lower_val, Const **upper_val);
123 static List *get_qual_for_list(PartitionKey key, PartitionBoundSpec *spec);
124 static List *get_qual_for_range(PartitionKey key, PartitionBoundSpec *spec);
125 static List *generate_partition_qual(Relation rel);
126 
127 static PartitionRangeBound *make_one_range_bound(PartitionKey key, int index,
128 					 List *datums, bool lower);
129 static int32 partition_rbound_cmp(PartitionKey key,
130 					 Datum *datums1, PartitionRangeDatumKind *kind1,
131 					 bool lower1, PartitionRangeBound *b2);
132 static int32 partition_rbound_datum_cmp(PartitionKey key,
133 						   Datum *rb_datums, PartitionRangeDatumKind *rb_kind,
134 						   Datum *tuple_datums);
135 
136 static int32 partition_bound_cmp(PartitionKey key,
137 					PartitionBoundInfo boundinfo,
138 					int offset, void *probe, bool probe_is_bound);
139 static int partition_bound_bsearch(PartitionKey key,
140 						PartitionBoundInfo boundinfo,
141 						void *probe, bool probe_is_bound, bool *is_equal);
142 
143 /*
144  * RelationBuildPartitionDesc
145  *		Form rel's partition descriptor, and store in relcache entry
146  *
147  * Note: the descriptor won't be flushed from the cache by
148  * RelationClearRelation() unless it's changed because of
149  * addition or removal of a partition.  Hence, code holding a lock
150  * that's sufficient to prevent that can assume that rd_partdesc
151  * won't change underneath it.
152  */
153 void
RelationBuildPartitionDesc(Relation rel)154 RelationBuildPartitionDesc(Relation rel)
155 {
156 	List	   *inhoids,
157 			   *partoids;
158 	Oid		   *oids = NULL;
159 	List	   *boundspecs = NIL;
160 	ListCell   *cell;
161 	int			i,
162 				nparts;
163 	PartitionKey key = RelationGetPartitionKey(rel);
164 	PartitionDesc result;
165 	MemoryContext oldcxt;
166 
167 	int			ndatums = 0;
168 
169 	/* List partitioning specific */
170 	PartitionListValue **all_values = NULL;
171 	int			null_index = -1;
172 
173 	/* Range partitioning specific */
174 	PartitionRangeBound **rbounds = NULL;
175 
176 	/*
177 	 * The following could happen in situations where rel has a pg_class entry
178 	 * but not the pg_partitioned_table entry yet.
179 	 */
180 	if (key == NULL)
181 		return;
182 
183 	/* Get partition oids from pg_inherits */
184 	inhoids = find_inheritance_children(RelationGetRelid(rel), NoLock);
185 
186 	/* Collect bound spec nodes in a list */
187 	i = 0;
188 	partoids = NIL;
189 	foreach(cell, inhoids)
190 	{
191 		Oid			inhrelid = lfirst_oid(cell);
192 		HeapTuple	tuple;
193 		Datum		datum;
194 		bool		isnull;
195 		Node	   *boundspec;
196 
197 		tuple = SearchSysCache1(RELOID, inhrelid);
198 		if (!HeapTupleIsValid(tuple))
199 			elog(ERROR, "cache lookup failed for relation %u", inhrelid);
200 
201 		/*
202 		 * It is possible that the pg_class tuple of a partition has not been
203 		 * updated yet to set its relpartbound field.  The only case where
204 		 * this happens is when we open the parent relation to check using its
205 		 * partition descriptor that a new partition's bound does not overlap
206 		 * some existing partition.
207 		 */
208 		if (!((Form_pg_class) GETSTRUCT(tuple))->relispartition)
209 		{
210 			ReleaseSysCache(tuple);
211 			continue;
212 		}
213 
214 		datum = SysCacheGetAttr(RELOID, tuple,
215 								Anum_pg_class_relpartbound,
216 								&isnull);
217 		Assert(!isnull);
218 		boundspec = (Node *) stringToNode(TextDatumGetCString(datum));
219 		boundspecs = lappend(boundspecs, boundspec);
220 		partoids = lappend_oid(partoids, inhrelid);
221 		ReleaseSysCache(tuple);
222 	}
223 
224 	nparts = list_length(partoids);
225 
226 	if (nparts > 0)
227 	{
228 		oids = (Oid *) palloc(nparts * sizeof(Oid));
229 		i = 0;
230 		foreach(cell, partoids)
231 			oids[i++] = lfirst_oid(cell);
232 
233 		/* Convert from node to the internal representation */
234 		if (key->strategy == PARTITION_STRATEGY_LIST)
235 		{
236 			List	   *non_null_values = NIL;
237 
238 			/*
239 			 * Create a unified list of non-null values across all partitions.
240 			 */
241 			i = 0;
242 			null_index = -1;
243 			foreach(cell, boundspecs)
244 			{
245 				PartitionBoundSpec *spec = castNode(PartitionBoundSpec,
246 													lfirst(cell));
247 				ListCell   *c;
248 
249 				if (spec->strategy != PARTITION_STRATEGY_LIST)
250 					elog(ERROR, "invalid strategy in partition bound spec");
251 
252 				foreach(c, spec->listdatums)
253 				{
254 					Const	   *val = castNode(Const, lfirst(c));
255 					PartitionListValue *list_value = NULL;
256 
257 					if (!val->constisnull)
258 					{
259 						list_value = (PartitionListValue *)
260 							palloc0(sizeof(PartitionListValue));
261 						list_value->index = i;
262 						list_value->value = val->constvalue;
263 					}
264 					else
265 					{
266 						/*
267 						 * Never put a null into the values array, flag
268 						 * instead for the code further down below where we
269 						 * construct the actual relcache struct.
270 						 */
271 						if (null_index != -1)
272 							elog(ERROR, "found null more than once");
273 						null_index = i;
274 					}
275 
276 					if (list_value)
277 						non_null_values = lappend(non_null_values,
278 												  list_value);
279 				}
280 
281 				i++;
282 			}
283 
284 			ndatums = list_length(non_null_values);
285 
286 			/*
287 			 * Collect all list values in one array. Alongside the value, we
288 			 * also save the index of partition the value comes from.
289 			 */
290 			all_values = (PartitionListValue **) palloc(ndatums *
291 														sizeof(PartitionListValue *));
292 			i = 0;
293 			foreach(cell, non_null_values)
294 			{
295 				PartitionListValue *src = lfirst(cell);
296 
297 				all_values[i] = (PartitionListValue *)
298 					palloc(sizeof(PartitionListValue));
299 				all_values[i]->value = src->value;
300 				all_values[i]->index = src->index;
301 				i++;
302 			}
303 
304 			qsort_arg(all_values, ndatums, sizeof(PartitionListValue *),
305 					  qsort_partition_list_value_cmp, (void *) key);
306 		}
307 		else if (key->strategy == PARTITION_STRATEGY_RANGE)
308 		{
309 			int			j,
310 						k;
311 			PartitionRangeBound **all_bounds,
312 					   *prev;
313 			bool	   *distinct_indexes;
314 
315 			all_bounds = (PartitionRangeBound **) palloc0(2 * nparts *
316 														  sizeof(PartitionRangeBound *));
317 			distinct_indexes = (bool *) palloc(2 * nparts * sizeof(bool));
318 
319 			/*
320 			 * Create a unified list of range bounds across all the
321 			 * partitions.
322 			 */
323 			i = j = 0;
324 			foreach(cell, boundspecs)
325 			{
326 				PartitionBoundSpec *spec = castNode(PartitionBoundSpec,
327 													lfirst(cell));
328 				PartitionRangeBound *lower,
329 						   *upper;
330 
331 				if (spec->strategy != PARTITION_STRATEGY_RANGE)
332 					elog(ERROR, "invalid strategy in partition bound spec");
333 
334 				lower = make_one_range_bound(key, i, spec->lowerdatums,
335 											 true);
336 				upper = make_one_range_bound(key, i, spec->upperdatums,
337 											 false);
338 				all_bounds[j] = lower;
339 				all_bounds[j + 1] = upper;
340 				j += 2;
341 				i++;
342 			}
343 			Assert(j == 2 * nparts);
344 
345 			/* Sort all the bounds in ascending order */
346 			qsort_arg(all_bounds, 2 * nparts,
347 					  sizeof(PartitionRangeBound *),
348 					  qsort_partition_rbound_cmp,
349 					  (void *) key);
350 
351 			/*
352 			 * Count the number of distinct bounds to allocate an array of
353 			 * that size.
354 			 */
355 			ndatums = 0;
356 			prev = NULL;
357 			for (i = 0; i < 2 * nparts; i++)
358 			{
359 				PartitionRangeBound *cur = all_bounds[i];
360 				bool		is_distinct = false;
361 				int			j;
362 
363 				/* Is the current bound distinct from the previous one? */
364 				for (j = 0; j < key->partnatts; j++)
365 				{
366 					Datum		cmpval;
367 
368 					if (prev == NULL || cur->kind[j] != prev->kind[j])
369 					{
370 						is_distinct = true;
371 						break;
372 					}
373 
374 					/*
375 					 * If the bounds are both MINVALUE or MAXVALUE, stop now
376 					 * and treat them as equal, since any values after this
377 					 * point must be ignored.
378 					 */
379 					if (cur->kind[j] != PARTITION_RANGE_DATUM_VALUE)
380 						break;
381 
382 					cmpval = FunctionCall2Coll(&key->partsupfunc[j],
383 											   key->partcollation[j],
384 											   cur->datums[j],
385 											   prev->datums[j]);
386 					if (DatumGetInt32(cmpval) != 0)
387 					{
388 						is_distinct = true;
389 						break;
390 					}
391 				}
392 
393 				/*
394 				 * Count the current bound if it is distinct from the previous
395 				 * one.  Also, store if the index i contains a distinct bound
396 				 * that we'd like put in the relcache array.
397 				 */
398 				if (is_distinct)
399 				{
400 					distinct_indexes[i] = true;
401 					ndatums++;
402 				}
403 				else
404 					distinct_indexes[i] = false;
405 
406 				prev = cur;
407 			}
408 
409 			/*
410 			 * Finally save them in an array from where they will be copied
411 			 * into the relcache.
412 			 */
413 			rbounds = (PartitionRangeBound **) palloc(ndatums *
414 													  sizeof(PartitionRangeBound *));
415 			k = 0;
416 			for (i = 0; i < 2 * nparts; i++)
417 			{
418 				if (distinct_indexes[i])
419 					rbounds[k++] = all_bounds[i];
420 			}
421 			Assert(k == ndatums);
422 		}
423 		else
424 			elog(ERROR, "unexpected partition strategy: %d",
425 				 (int) key->strategy);
426 	}
427 
428 	/* Assert we aren't about to leak any old data structure */
429 	Assert(rel->rd_pdcxt == NULL);
430 	Assert(rel->rd_partdesc == NULL);
431 
432 	/*
433 	 * Now build the actual relcache partition descriptor.  Note that the
434 	 * order of operations here is fairly critical.  If we fail partway
435 	 * through this code, we won't have leaked memory because the rd_pdcxt is
436 	 * attached to the relcache entry immediately, so it'll be freed whenever
437 	 * the entry is rebuilt or destroyed.  However, we don't assign to
438 	 * rd_partdesc until the cached data structure is fully complete and
439 	 * valid, so that no other code might try to use it.
440 	 */
441 	rel->rd_pdcxt = AllocSetContextCreate(CacheMemoryContext,
442 										  RelationGetRelationName(rel),
443 										  ALLOCSET_SMALL_SIZES);
444 	oldcxt = MemoryContextSwitchTo(rel->rd_pdcxt);
445 
446 	result = (PartitionDescData *) palloc0(sizeof(PartitionDescData));
447 	result->nparts = nparts;
448 	if (nparts > 0)
449 	{
450 		PartitionBoundInfo boundinfo;
451 		int		   *mapping;
452 		int			next_index = 0;
453 
454 		result->oids = (Oid *) palloc0(nparts * sizeof(Oid));
455 
456 		boundinfo = (PartitionBoundInfoData *)
457 			palloc0(sizeof(PartitionBoundInfoData));
458 		boundinfo->strategy = key->strategy;
459 		boundinfo->ndatums = ndatums;
460 		boundinfo->null_index = -1;
461 		boundinfo->datums = (Datum **) palloc0(ndatums * sizeof(Datum *));
462 
463 		/* Initialize mapping array with invalid values */
464 		mapping = (int *) palloc(sizeof(int) * nparts);
465 		for (i = 0; i < nparts; i++)
466 			mapping[i] = -1;
467 
468 		switch (key->strategy)
469 		{
470 			case PARTITION_STRATEGY_LIST:
471 				{
472 					boundinfo->indexes = (int *) palloc(ndatums * sizeof(int));
473 
474 					/*
475 					 * Copy values.  Indexes of individual values are mapped
476 					 * to canonical values so that they match for any two list
477 					 * partitioned tables with same number of partitions and
478 					 * same lists per partition.  One way to canonicalize is
479 					 * to assign the index in all_values[] of the smallest
480 					 * value of each partition, as the index of all of the
481 					 * partition's values.
482 					 */
483 					for (i = 0; i < ndatums; i++)
484 					{
485 						boundinfo->datums[i] = (Datum *) palloc(sizeof(Datum));
486 						boundinfo->datums[i][0] = datumCopy(all_values[i]->value,
487 															key->parttypbyval[0],
488 															key->parttyplen[0]);
489 
490 						/* If the old index has no mapping, assign one */
491 						if (mapping[all_values[i]->index] == -1)
492 							mapping[all_values[i]->index] = next_index++;
493 
494 						boundinfo->indexes[i] = mapping[all_values[i]->index];
495 					}
496 
497 					/*
498 					 * If null-accepting partition has no mapped index yet,
499 					 * assign one.  This could happen if such partition
500 					 * accepts only null and hence not covered in the above
501 					 * loop which only handled non-null values.
502 					 */
503 					if (null_index != -1)
504 					{
505 						Assert(null_index >= 0);
506 						if (mapping[null_index] == -1)
507 							mapping[null_index] = next_index++;
508 						boundinfo->null_index = mapping[null_index];
509 					}
510 
511 					/* All partitions must now have a valid mapping */
512 					Assert(next_index == nparts);
513 					break;
514 				}
515 
516 			case PARTITION_STRATEGY_RANGE:
517 				{
518 					boundinfo->kind = (PartitionRangeDatumKind **)
519 						palloc(ndatums *
520 							   sizeof(PartitionRangeDatumKind *));
521 					boundinfo->indexes = (int *) palloc((ndatums + 1) *
522 														sizeof(int));
523 
524 					for (i = 0; i < ndatums; i++)
525 					{
526 						int			j;
527 
528 						boundinfo->datums[i] = (Datum *) palloc(key->partnatts *
529 																sizeof(Datum));
530 						boundinfo->kind[i] = (PartitionRangeDatumKind *)
531 							palloc(key->partnatts *
532 								   sizeof(PartitionRangeDatumKind));
533 						for (j = 0; j < key->partnatts; j++)
534 						{
535 							if (rbounds[i]->kind[j] == PARTITION_RANGE_DATUM_VALUE)
536 								boundinfo->datums[i][j] =
537 									datumCopy(rbounds[i]->datums[j],
538 											  key->parttypbyval[j],
539 											  key->parttyplen[j]);
540 							boundinfo->kind[i][j] = rbounds[i]->kind[j];
541 						}
542 
543 						/*
544 						 * There is no mapping for invalid indexes.
545 						 *
546 						 * Any lower bounds in the rbounds array have invalid
547 						 * indexes assigned, because the values between the
548 						 * previous bound (if there is one) and this (lower)
549 						 * bound are not part of the range of any existing
550 						 * partition.
551 						 */
552 						if (rbounds[i]->lower)
553 							boundinfo->indexes[i] = -1;
554 						else
555 						{
556 							int			orig_index = rbounds[i]->index;
557 
558 							/* If the old index has no mapping, assign one */
559 							if (mapping[orig_index] == -1)
560 								mapping[orig_index] = next_index++;
561 
562 							boundinfo->indexes[i] = mapping[orig_index];
563 						}
564 					}
565 					boundinfo->indexes[i] = -1;
566 					break;
567 				}
568 
569 			default:
570 				elog(ERROR, "unexpected partition strategy: %d",
571 					 (int) key->strategy);
572 		}
573 
574 		result->boundinfo = boundinfo;
575 
576 		/*
577 		 * Now assign OIDs from the original array into mapped indexes of the
578 		 * result array.  Order of OIDs in the former is defined by the
579 		 * catalog scan that retrieved them, whereas that in the latter is
580 		 * defined by canonicalized representation of the list values or the
581 		 * range bounds.
582 		 */
583 		for (i = 0; i < nparts; i++)
584 			result->oids[mapping[i]] = oids[i];
585 		pfree(mapping);
586 	}
587 
588 	MemoryContextSwitchTo(oldcxt);
589 	rel->rd_partdesc = result;
590 }
591 
592 /*
593  * Are two partition bound collections logically equal?
594  *
595  * Used in the keep logic of relcache.c (ie, in RelationClearRelation()).
596  * This is also useful when b1 and b2 are bound collections of two separate
597  * relations, respectively, because PartitionBoundInfo is a canonical
598  * representation of partition bounds.
599  */
600 bool
partition_bounds_equal(PartitionKey key,PartitionBoundInfo b1,PartitionBoundInfo b2)601 partition_bounds_equal(PartitionKey key,
602 					   PartitionBoundInfo b1, PartitionBoundInfo b2)
603 {
604 	int			i;
605 
606 	if (b1->strategy != b2->strategy)
607 		return false;
608 
609 	if (b1->ndatums != b2->ndatums)
610 		return false;
611 
612 	if (b1->null_index != b2->null_index)
613 		return false;
614 
615 	for (i = 0; i < b1->ndatums; i++)
616 	{
617 		int			j;
618 
619 		for (j = 0; j < key->partnatts; j++)
620 		{
621 			/* For range partitions, the bounds might not be finite. */
622 			if (b1->kind != NULL)
623 			{
624 				/* The different kinds of bound all differ from each other */
625 				if (b1->kind[i][j] != b2->kind[i][j])
626 					return false;
627 
628 				/* Non-finite bounds are equal without further examination. */
629 				if (b1->kind[i][j] != PARTITION_RANGE_DATUM_VALUE)
630 					continue;
631 			}
632 
633 			/*
634 			 * Compare the actual values. Note that it would be both incorrect
635 			 * and unsafe to invoke the comparison operator derived from the
636 			 * partitioning specification here.  It would be incorrect because
637 			 * we want the relcache entry to be updated for ANY change to the
638 			 * partition bounds, not just those that the partitioning operator
639 			 * thinks are significant.  It would be unsafe because we might
640 			 * reach this code in the context of an aborted transaction, and
641 			 * an arbitrary partitioning operator might not be safe in that
642 			 * context.  datumIsEqual() should be simple enough to be safe.
643 			 */
644 			if (!datumIsEqual(b1->datums[i][j], b2->datums[i][j],
645 							  key->parttypbyval[j],
646 							  key->parttyplen[j]))
647 				return false;
648 		}
649 
650 		if (b1->indexes[i] != b2->indexes[i])
651 			return false;
652 	}
653 
654 	/* There are ndatums+1 indexes in case of range partitions */
655 	if (key->strategy == PARTITION_STRATEGY_RANGE &&
656 		b1->indexes[i] != b2->indexes[i])
657 		return false;
658 
659 	return true;
660 }
661 
662 /*
663  * check_new_partition_bound
664  *
665  * Checks if the new partition's bound overlaps any of the existing partitions
666  * of parent.  Also performs additional checks as necessary per strategy.
667  */
668 void
check_new_partition_bound(char * relname,Relation parent,PartitionBoundSpec * spec)669 check_new_partition_bound(char *relname, Relation parent,
670 						  PartitionBoundSpec *spec)
671 {
672 	PartitionKey key = RelationGetPartitionKey(parent);
673 	PartitionDesc partdesc = RelationGetPartitionDesc(parent);
674 	ParseState *pstate = make_parsestate(NULL);
675 	int			with = -1;
676 	bool		overlap = false;
677 
678 	switch (key->strategy)
679 	{
680 		case PARTITION_STRATEGY_LIST:
681 			{
682 				Assert(spec->strategy == PARTITION_STRATEGY_LIST);
683 
684 				if (partdesc->nparts > 0)
685 				{
686 					PartitionBoundInfo boundinfo = partdesc->boundinfo;
687 					ListCell   *cell;
688 
689 					Assert(boundinfo &&
690 						   boundinfo->strategy == PARTITION_STRATEGY_LIST &&
691 						   (boundinfo->ndatums > 0 ||
692 							partition_bound_accepts_nulls(boundinfo)));
693 
694 					foreach(cell, spec->listdatums)
695 					{
696 						Const	   *val = castNode(Const, lfirst(cell));
697 
698 						if (!val->constisnull)
699 						{
700 							int			offset;
701 							bool		equal;
702 
703 							offset = partition_bound_bsearch(key, boundinfo,
704 															 &val->constvalue,
705 															 true, &equal);
706 							if (offset >= 0 && equal)
707 							{
708 								overlap = true;
709 								with = boundinfo->indexes[offset];
710 								break;
711 							}
712 						}
713 						else if (partition_bound_accepts_nulls(boundinfo))
714 						{
715 							overlap = true;
716 							with = boundinfo->null_index;
717 							break;
718 						}
719 					}
720 				}
721 
722 				break;
723 			}
724 
725 		case PARTITION_STRATEGY_RANGE:
726 			{
727 				PartitionRangeBound *lower,
728 						   *upper;
729 
730 				Assert(spec->strategy == PARTITION_STRATEGY_RANGE);
731 				lower = make_one_range_bound(key, -1, spec->lowerdatums, true);
732 				upper = make_one_range_bound(key, -1, spec->upperdatums, false);
733 
734 				/*
735 				 * First check if the resulting range would be empty with
736 				 * specified lower and upper bounds
737 				 */
738 				if (partition_rbound_cmp(key, lower->datums, lower->kind, true,
739 										 upper) >= 0)
740 				{
741 					ereport(ERROR,
742 							(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
743 							 errmsg("empty range bound specified for partition \"%s\"",
744 									relname),
745 							 errdetail("Specified lower bound %s is greater than or equal to upper bound %s.",
746 									   get_range_partbound_string(spec->lowerdatums),
747 									   get_range_partbound_string(spec->upperdatums)),
748 							 parser_errposition(pstate, spec->location)));
749 				}
750 
751 				if (partdesc->nparts > 0)
752 				{
753 					PartitionBoundInfo boundinfo = partdesc->boundinfo;
754 					int			offset;
755 					bool		equal;
756 
757 					Assert(boundinfo && boundinfo->ndatums > 0 &&
758 						   boundinfo->strategy == PARTITION_STRATEGY_RANGE);
759 
760 					/*
761 					 * Test whether the new lower bound (which is treated
762 					 * inclusively as part of the new partition) lies inside
763 					 * an existing partition, or in a gap.
764 					 *
765 					 * If it's inside an existing partition, the bound at
766 					 * offset + 1 will be the upper bound of that partition,
767 					 * and its index will be >= 0.
768 					 *
769 					 * If it's in a gap, the bound at offset + 1 will be the
770 					 * lower bound of the next partition, and its index will
771 					 * be -1. This is also true if there is no next partition,
772 					 * since the index array is initialised with an extra -1
773 					 * at the end.
774 					 */
775 					offset = partition_bound_bsearch(key, boundinfo, lower,
776 													 true, &equal);
777 
778 					if (boundinfo->indexes[offset + 1] < 0)
779 					{
780 						/*
781 						 * Check that the new partition will fit in the gap.
782 						 * For it to fit, the new upper bound must be less
783 						 * than or equal to the lower bound of the next
784 						 * partition, if there is one.
785 						 */
786 						if (offset + 1 < boundinfo->ndatums)
787 						{
788 							int32		cmpval;
789 
790 							cmpval = partition_bound_cmp(key, boundinfo,
791 														 offset + 1, upper,
792 														 true);
793 							if (cmpval < 0)
794 							{
795 								/*
796 								 * The new partition overlaps with the
797 								 * existing partition between offset + 1 and
798 								 * offset + 2.
799 								 */
800 								overlap = true;
801 								with = boundinfo->indexes[offset + 2];
802 							}
803 						}
804 					}
805 					else
806 					{
807 						/*
808 						 * The new partition overlaps with the existing
809 						 * partition between offset and offset + 1.
810 						 */
811 						overlap = true;
812 						with = boundinfo->indexes[offset + 1];
813 					}
814 				}
815 
816 				break;
817 			}
818 
819 		default:
820 			elog(ERROR, "unexpected partition strategy: %d",
821 				 (int) key->strategy);
822 	}
823 
824 	if (overlap)
825 	{
826 		Assert(with >= 0);
827 		ereport(ERROR,
828 				(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
829 				 errmsg("partition \"%s\" would overlap partition \"%s\"",
830 						relname, get_rel_name(partdesc->oids[with])),
831 				 parser_errposition(pstate, spec->location)));
832 	}
833 }
834 
835 /*
836  * get_partition_parent
837  *
838  * Returns inheritance parent of a partition by scanning pg_inherits
839  *
840  * Note: Because this function assumes that the relation whose OID is passed
841  * as an argument will have precisely one parent, it should only be called
842  * when it is known that the relation is a partition.
843  */
844 Oid
get_partition_parent(Oid relid)845 get_partition_parent(Oid relid)
846 {
847 	Form_pg_inherits form;
848 	Relation	catalogRelation;
849 	SysScanDesc scan;
850 	ScanKeyData key[2];
851 	HeapTuple	tuple;
852 	Oid			result;
853 
854 	catalogRelation = heap_open(InheritsRelationId, AccessShareLock);
855 
856 	ScanKeyInit(&key[0],
857 				Anum_pg_inherits_inhrelid,
858 				BTEqualStrategyNumber, F_OIDEQ,
859 				ObjectIdGetDatum(relid));
860 	ScanKeyInit(&key[1],
861 				Anum_pg_inherits_inhseqno,
862 				BTEqualStrategyNumber, F_INT4EQ,
863 				Int32GetDatum(1));
864 
865 	scan = systable_beginscan(catalogRelation, InheritsRelidSeqnoIndexId, true,
866 							  NULL, 2, key);
867 
868 	tuple = systable_getnext(scan);
869 	if (!HeapTupleIsValid(tuple))
870 		elog(ERROR, "could not find tuple for parent of relation %u", relid);
871 
872 	form = (Form_pg_inherits) GETSTRUCT(tuple);
873 	result = form->inhparent;
874 
875 	systable_endscan(scan);
876 	heap_close(catalogRelation, AccessShareLock);
877 
878 	return result;
879 }
880 
881 /*
882  * get_qual_from_partbound
883  *		Given a parser node for partition bound, return the list of executable
884  *		expressions as partition constraint
885  */
886 List *
get_qual_from_partbound(Relation rel,Relation parent,PartitionBoundSpec * spec)887 get_qual_from_partbound(Relation rel, Relation parent,
888 						PartitionBoundSpec *spec)
889 {
890 	PartitionKey key = RelationGetPartitionKey(parent);
891 	List	   *my_qual = NIL;
892 
893 	Assert(key != NULL);
894 
895 	switch (key->strategy)
896 	{
897 		case PARTITION_STRATEGY_LIST:
898 			Assert(spec->strategy == PARTITION_STRATEGY_LIST);
899 			my_qual = get_qual_for_list(key, spec);
900 			break;
901 
902 		case PARTITION_STRATEGY_RANGE:
903 			Assert(spec->strategy == PARTITION_STRATEGY_RANGE);
904 			my_qual = get_qual_for_range(key, spec);
905 			break;
906 
907 		default:
908 			elog(ERROR, "unexpected partition strategy: %d",
909 				 (int) key->strategy);
910 	}
911 
912 	return my_qual;
913 }
914 
915 /*
916  * map_partition_varattnos - maps varattno of any Vars in expr from the
917  * parent attno to partition attno.
918  *
919  * We must allow for cases where physical attnos of a partition can be
920  * different from the parent's.
921  *
922  * If found_whole_row is not NULL, *found_whole_row returns whether a
923  * whole-row variable was found in the input expression.
924  *
925  * Note: this will work on any node tree, so really the argument and result
926  * should be declared "Node *".  But a substantial majority of the callers
927  * are working on Lists, so it's less messy to do the casts internally.
928  */
929 List *
map_partition_varattnos(List * expr,int target_varno,Relation partrel,Relation parent,bool * found_whole_row)930 map_partition_varattnos(List *expr, int target_varno,
931 						Relation partrel, Relation parent,
932 						bool *found_whole_row)
933 {
934 	bool		my_found_whole_row = false;
935 
936 	if (expr != NIL)
937 	{
938 		AttrNumber *part_attnos;
939 
940 		part_attnos = convert_tuples_by_name_map(RelationGetDescr(partrel),
941 												 RelationGetDescr(parent),
942 												 gettext_noop("could not convert row type"));
943 		expr = (List *) map_variable_attnos((Node *) expr,
944 											target_varno, 0,
945 											part_attnos,
946 											RelationGetDescr(parent)->natts,
947 											RelationGetForm(partrel)->reltype,
948 											&my_found_whole_row);
949 	}
950 
951 	if (found_whole_row)
952 		*found_whole_row = my_found_whole_row;
953 
954 	return expr;
955 }
956 
957 /*
958  * RelationGetPartitionQual
959  *
960  * Returns a list of partition quals
961  */
962 List *
RelationGetPartitionQual(Relation rel)963 RelationGetPartitionQual(Relation rel)
964 {
965 	/* Quick exit */
966 	if (!rel->rd_rel->relispartition)
967 		return NIL;
968 
969 	return generate_partition_qual(rel);
970 }
971 
972 /*
973  * get_partition_qual_relid
974  *
975  * Returns an expression tree describing the passed-in relation's partition
976  * constraint.
977  *
978  * If the relation is not found, or is not a partition, or there is no
979  * partition constraint, return NULL.  We must guard against the first two
980  * cases because this supports a SQL function that could be passed any OID.
981  * The last case shouldn't happen in v10, but cope anyway.
982  */
983 Expr *
get_partition_qual_relid(Oid relid)984 get_partition_qual_relid(Oid relid)
985 {
986 	Expr	   *result = NULL;
987 
988 	/* Do the work only if this relation exists and is a partition. */
989 	if (get_rel_relispartition(relid))
990 	{
991 		Relation	rel = relation_open(relid, AccessShareLock);
992 		List	   *and_args;
993 
994 		and_args = generate_partition_qual(rel);
995 
996 		/* Convert implicit-AND list format to boolean expression */
997 		if (and_args == NIL)
998 			result = NULL;
999 		else if (list_length(and_args) > 1)
1000 			result = makeBoolExpr(AND_EXPR, and_args, -1);
1001 		else
1002 			result = linitial(and_args);
1003 
1004 		/* Keep the lock, to allow safe deparsing against the rel by caller. */
1005 		relation_close(rel, NoLock);
1006 	}
1007 
1008 	return result;
1009 }
1010 
1011 /*
1012  * Append OIDs of rel's partitions to the list 'partoids' and for each OID,
1013  * append pointer rel to the list 'parents'.
1014  */
1015 #define APPEND_REL_PARTITION_OIDS(rel, partoids, parents) \
1016 	do\
1017 	{\
1018 		int		i;\
1019 		for (i = 0; i < (rel)->rd_partdesc->nparts; i++)\
1020 		{\
1021 			(partoids) = lappend_oid((partoids), (rel)->rd_partdesc->oids[i]);\
1022 			(parents) = lappend((parents), (rel));\
1023 		}\
1024 	} while(0)
1025 
1026 /*
1027  * RelationGetPartitionDispatchInfo
1028  *		Returns information necessary to route tuples down a partition tree
1029  *
1030  * The number of elements in the returned array (that is, the number of
1031  * PartitionDispatch objects for the partitioned tables in the partition tree)
1032  * is returned in *num_parted and a list of the OIDs of all the leaf
1033  * partitions of rel is returned in *leaf_part_oids.
1034  *
1035  * All the relations in the partition tree (including 'rel') must have been
1036  * locked (using at least the AccessShareLock) by the caller.
1037  */
1038 PartitionDispatch *
RelationGetPartitionDispatchInfo(Relation rel,int * num_parted,List ** leaf_part_oids)1039 RelationGetPartitionDispatchInfo(Relation rel,
1040 								 int *num_parted, List **leaf_part_oids)
1041 {
1042 	PartitionDispatchData **pd;
1043 	List	   *all_parts = NIL,
1044 			   *all_parents = NIL,
1045 			   *parted_rels,
1046 			   *parted_rel_parents;
1047 	ListCell   *lc1,
1048 			   *lc2;
1049 	int			i,
1050 				k,
1051 				offset;
1052 
1053 	/*
1054 	 * We rely on the relcache to traverse the partition tree to build both
1055 	 * the leaf partition OIDs list and the array of PartitionDispatch objects
1056 	 * for the partitioned tables in the tree.  That means every partitioned
1057 	 * table in the tree must be locked, which is fine since we require the
1058 	 * caller to lock all the partitions anyway.
1059 	 *
1060 	 * For every partitioned table in the tree, starting with the root
1061 	 * partitioned table, add its relcache entry to parted_rels, while also
1062 	 * queuing its partitions (in the order in which they appear in the
1063 	 * partition descriptor) to be looked at later in the same loop.  This is
1064 	 * a bit tricky but works because the foreach() macro doesn't fetch the
1065 	 * next list element until the bottom of the loop.
1066 	 */
1067 	*num_parted = 1;
1068 	parted_rels = list_make1(rel);
1069 	/* Root partitioned table has no parent, so NULL for parent */
1070 	parted_rel_parents = list_make1(NULL);
1071 	APPEND_REL_PARTITION_OIDS(rel, all_parts, all_parents);
1072 	forboth(lc1, all_parts, lc2, all_parents)
1073 	{
1074 		Oid			partrelid = lfirst_oid(lc1);
1075 		Relation	parent = lfirst(lc2);
1076 
1077 		if (get_rel_relkind(partrelid) == RELKIND_PARTITIONED_TABLE)
1078 		{
1079 			/*
1080 			 * Already locked by the caller.  Note that it is the
1081 			 * responsibility of the caller to close the below relcache entry,
1082 			 * once done using the information being collected here (for
1083 			 * example, in ExecEndModifyTable).
1084 			 */
1085 			Relation	partrel = heap_open(partrelid, NoLock);
1086 
1087 			(*num_parted)++;
1088 			parted_rels = lappend(parted_rels, partrel);
1089 			parted_rel_parents = lappend(parted_rel_parents, parent);
1090 			APPEND_REL_PARTITION_OIDS(partrel, all_parts, all_parents);
1091 		}
1092 	}
1093 
1094 	/*
1095 	 * We want to create two arrays - one for leaf partitions and another for
1096 	 * partitioned tables (including the root table and internal partitions).
1097 	 * While we only create the latter here, leaf partition array of suitable
1098 	 * objects (such as, ResultRelInfo) is created by the caller using the
1099 	 * list of OIDs we return.  Indexes into these arrays get assigned in a
1100 	 * breadth-first manner, whereby partitions of any given level are placed
1101 	 * consecutively in the respective arrays.
1102 	 */
1103 	pd = (PartitionDispatchData **) palloc(*num_parted *
1104 										   sizeof(PartitionDispatchData *));
1105 	*leaf_part_oids = NIL;
1106 	i = k = offset = 0;
1107 	forboth(lc1, parted_rels, lc2, parted_rel_parents)
1108 	{
1109 		Relation	partrel = lfirst(lc1);
1110 		Relation	parent = lfirst(lc2);
1111 		PartitionKey partkey = RelationGetPartitionKey(partrel);
1112 		TupleDesc	tupdesc = RelationGetDescr(partrel);
1113 		PartitionDesc partdesc = RelationGetPartitionDesc(partrel);
1114 		int			j,
1115 					m;
1116 
1117 		pd[i] = (PartitionDispatch) palloc(sizeof(PartitionDispatchData));
1118 		pd[i]->reldesc = partrel;
1119 		pd[i]->key = partkey;
1120 		pd[i]->keystate = NIL;
1121 		pd[i]->partdesc = partdesc;
1122 		if (parent != NULL)
1123 		{
1124 			/*
1125 			 * For every partitioned table other than root, we must store a
1126 			 * tuple table slot initialized with its tuple descriptor and a
1127 			 * tuple conversion map to convert a tuple from its parent's
1128 			 * rowtype to its own. That is to make sure that we are looking at
1129 			 * the correct row using the correct tuple descriptor when
1130 			 * computing its partition key for tuple routing.
1131 			 */
1132 			pd[i]->tupslot = MakeSingleTupleTableSlot(tupdesc);
1133 			pd[i]->tupmap = convert_tuples_by_name(RelationGetDescr(parent),
1134 												   tupdesc,
1135 												   gettext_noop("could not convert row type"));
1136 		}
1137 		else
1138 		{
1139 			/* Not required for the root partitioned table */
1140 			pd[i]->tupslot = NULL;
1141 			pd[i]->tupmap = NULL;
1142 		}
1143 		pd[i]->indexes = (int *) palloc(partdesc->nparts * sizeof(int));
1144 
1145 		/*
1146 		 * Indexes corresponding to the internal partitions are multiplied by
1147 		 * -1 to distinguish them from those of leaf partitions.  Encountering
1148 		 * an index >= 0 means we found a leaf partition, which is immediately
1149 		 * returned as the partition we are looking for.  A negative index
1150 		 * means we found a partitioned table, whose PartitionDispatch object
1151 		 * is located at the above index multiplied back by -1.  Using the
1152 		 * PartitionDispatch object, search is continued further down the
1153 		 * partition tree.
1154 		 */
1155 		m = 0;
1156 		for (j = 0; j < partdesc->nparts; j++)
1157 		{
1158 			Oid			partrelid = partdesc->oids[j];
1159 
1160 			if (get_rel_relkind(partrelid) != RELKIND_PARTITIONED_TABLE)
1161 			{
1162 				*leaf_part_oids = lappend_oid(*leaf_part_oids, partrelid);
1163 				pd[i]->indexes[j] = k++;
1164 			}
1165 			else
1166 			{
1167 				/*
1168 				 * offset denotes the number of partitioned tables of upper
1169 				 * levels including those of the current level.  Any partition
1170 				 * of this table must belong to the next level and hence will
1171 				 * be placed after the last partitioned table of this level.
1172 				 */
1173 				pd[i]->indexes[j] = -(1 + offset + m);
1174 				m++;
1175 			}
1176 		}
1177 		i++;
1178 
1179 		/*
1180 		 * This counts the number of partitioned tables at upper levels
1181 		 * including those of the current level.
1182 		 */
1183 		offset += m;
1184 	}
1185 
1186 	return pd;
1187 }
1188 
1189 /* Module-local functions */
1190 
1191 /*
1192  * get_partition_operator
1193  *
1194  * Return oid of the operator of given strategy for a given partition key
1195  * column.  It is assumed that the partitioning key is of the same type as the
1196  * chosen partitioning opclass, or at least binary-compatible.  In the latter
1197  * case, *need_relabel is set to true if the opclass is not of a polymorphic
1198  * type, otherwise false.
1199  */
1200 static Oid
get_partition_operator(PartitionKey key,int col,StrategyNumber strategy,bool * need_relabel)1201 get_partition_operator(PartitionKey key, int col, StrategyNumber strategy,
1202 					   bool *need_relabel)
1203 {
1204 	Oid			operoid;
1205 
1206 	/*
1207 	 * Get the operator in the partitioning opfamily using the opclass'
1208 	 * declared input type as both left- and righttype.
1209 	 */
1210 	operoid = get_opfamily_member(key->partopfamily[col],
1211 								  key->partopcintype[col],
1212 								  key->partopcintype[col],
1213 								  strategy);
1214 	if (!OidIsValid(operoid))
1215 		elog(ERROR, "missing operator %d(%u,%u) in partition opfamily %u",
1216 			 strategy, key->partopcintype[col], key->partopcintype[col],
1217 			 key->partopfamily[col]);
1218 
1219 	/*
1220 	 * If the partition key column is not of the same type as the operator
1221 	 * class and not polymorphic, tell caller to wrap the non-Const expression
1222 	 * in a RelabelType.  This matches what parse_coerce.c does.
1223 	 */
1224 	*need_relabel = (key->parttypid[col] != key->partopcintype[col] &&
1225 					 key->partopcintype[col] != RECORDOID &&
1226 					 !IsPolymorphicType(key->partopcintype[col]));
1227 
1228 	return operoid;
1229 }
1230 
1231 /*
1232  * make_partition_op_expr
1233  *		Returns an Expr for the given partition key column with arg1 and
1234  *		arg2 as its leftop and rightop, respectively
1235  */
1236 static Expr *
make_partition_op_expr(PartitionKey key,int keynum,uint16 strategy,Expr * arg1,Expr * arg2)1237 make_partition_op_expr(PartitionKey key, int keynum,
1238 					   uint16 strategy, Expr *arg1, Expr *arg2)
1239 {
1240 	Oid			operoid;
1241 	bool		need_relabel = false;
1242 	Expr	   *result = NULL;
1243 
1244 	/* Get the correct btree operator for this partitioning column */
1245 	operoid = get_partition_operator(key, keynum, strategy, &need_relabel);
1246 
1247 	/*
1248 	 * Chosen operator may be such that the non-Const operand needs to be
1249 	 * coerced, so apply the same; see the comment in
1250 	 * get_partition_operator().
1251 	 */
1252 	if (!IsA(arg1, Const) &&
1253 		(need_relabel ||
1254 		 key->partcollation[keynum] != key->parttypcoll[keynum]))
1255 		arg1 = (Expr *) makeRelabelType(arg1,
1256 										key->partopcintype[keynum],
1257 										-1,
1258 										key->partcollation[keynum],
1259 										COERCE_EXPLICIT_CAST);
1260 
1261 	/* Generate the actual expression */
1262 	switch (key->strategy)
1263 	{
1264 		case PARTITION_STRATEGY_LIST:
1265 			{
1266 				List	   *elems = (List *) arg2;
1267 				int			nelems = list_length(elems);
1268 
1269 				Assert(nelems >= 1);
1270 				Assert(keynum == 0);
1271 
1272 				if (nelems > 1 &&
1273 					!type_is_array(key->parttypid[keynum]))
1274 				{
1275 					ArrayExpr  *arrexpr;
1276 					ScalarArrayOpExpr *saopexpr;
1277 
1278 					/* Construct an ArrayExpr for the right-hand inputs */
1279 					arrexpr = makeNode(ArrayExpr);
1280 					arrexpr->array_typeid =
1281 									get_array_type(key->parttypid[keynum]);
1282 					arrexpr->array_collid = key->parttypcoll[keynum];
1283 					arrexpr->element_typeid = key->parttypid[keynum];
1284 					arrexpr->elements = elems;
1285 					arrexpr->multidims = false;
1286 					arrexpr->location = -1;
1287 
1288 					/* Build leftop = ANY (rightop) */
1289 					saopexpr = makeNode(ScalarArrayOpExpr);
1290 					saopexpr->opno = operoid;
1291 					saopexpr->opfuncid = get_opcode(operoid);
1292 					saopexpr->useOr = true;
1293 					saopexpr->inputcollid = key->partcollation[keynum];
1294 					saopexpr->args = list_make2(arg1, arrexpr);
1295 					saopexpr->location = -1;
1296 
1297 					result = (Expr *) saopexpr;
1298 				}
1299 				else
1300 				{
1301 					List	   *elemops = NIL;
1302 					ListCell   *lc;
1303 
1304 					foreach (lc, elems)
1305 					{
1306 						Expr   *elem = lfirst(lc),
1307 							   *elemop;
1308 
1309 						elemop = make_opclause(operoid,
1310 											   BOOLOID,
1311 											   false,
1312 											   arg1, elem,
1313 											   InvalidOid,
1314 											   key->partcollation[keynum]);
1315 						elemops = lappend(elemops, elemop);
1316 					}
1317 
1318 					result = nelems > 1 ? makeBoolExpr(OR_EXPR, elemops, -1) : linitial(elemops);
1319 				}
1320 				break;
1321 			}
1322 
1323 		case PARTITION_STRATEGY_RANGE:
1324 			result = make_opclause(operoid,
1325 								   BOOLOID,
1326 								   false,
1327 								   arg1, arg2,
1328 								   InvalidOid,
1329 								   key->partcollation[keynum]);
1330 			break;
1331 
1332 		default:
1333 			elog(ERROR, "invalid partitioning strategy");
1334 			break;
1335 	}
1336 
1337 	return result;
1338 }
1339 
1340 /*
1341  * get_qual_for_list
1342  *
1343  * Returns an implicit-AND list of expressions to use as a list partition's
1344  * constraint, given the partition key and bound structures.
1345  */
1346 static List *
get_qual_for_list(PartitionKey key,PartitionBoundSpec * spec)1347 get_qual_for_list(PartitionKey key, PartitionBoundSpec *spec)
1348 {
1349 	List	   *result;
1350 	Expr	   *keyCol;
1351 	Expr	   *opexpr;
1352 	NullTest   *nulltest;
1353 	ListCell   *cell;
1354 	List	   *elems = NIL;
1355 	bool		list_has_null = false;
1356 
1357 	/*
1358 	 * Only single-column list partitioning is supported, so we are worried
1359 	 * only about the partition key with index 0.
1360 	 */
1361 	Assert(key->partnatts == 1);
1362 
1363 	/* Construct Var or expression representing the partition column */
1364 	if (key->partattrs[0] != 0)
1365 		keyCol = (Expr *) makeVar(1,
1366 								  key->partattrs[0],
1367 								  key->parttypid[0],
1368 								  key->parttypmod[0],
1369 								  key->parttypcoll[0],
1370 								  0);
1371 	else
1372 		keyCol = (Expr *) copyObject(linitial(key->partexprs));
1373 
1374 	/* Create list of Consts for the allowed values, excluding any nulls */
1375 	foreach(cell, spec->listdatums)
1376 	{
1377 		Const	   *val = castNode(Const, lfirst(cell));
1378 
1379 		if (val->constisnull)
1380 			list_has_null = true;
1381 		else
1382 			elems = lappend(elems, copyObject(val));
1383 	}
1384 
1385 	if (elems)
1386 	{
1387 		/*
1388 		 * Generate the operator expression from the non-null partition
1389 		 * values.
1390 		 */
1391 		opexpr = make_partition_op_expr(key, 0, BTEqualStrategyNumber,
1392 										keyCol, (Expr *) elems);
1393 	}
1394 	else
1395 	{
1396 		/*
1397 		 * If there are no partition values, we don't need an operator
1398 		 * expression.
1399 		 */
1400 		opexpr = NULL;
1401 	}
1402 
1403 	if (!list_has_null)
1404 	{
1405 		/*
1406 		 * Gin up a "col IS NOT NULL" test that will be AND'd with the main
1407 		 * expression.  This might seem redundant, but the partition routing
1408 		 * machinery needs it.
1409 		 */
1410 		nulltest = makeNode(NullTest);
1411 		nulltest->arg = keyCol;
1412 		nulltest->nulltesttype = IS_NOT_NULL;
1413 		nulltest->argisrow = false;
1414 		nulltest->location = -1;
1415 
1416 		result = opexpr ? list_make2(nulltest, opexpr) : list_make1(nulltest);
1417 	}
1418 	else
1419 	{
1420 		/*
1421 		 * Gin up a "col IS NULL" test that will be OR'd with the main
1422 		 * expression.
1423 		 */
1424 		nulltest = makeNode(NullTest);
1425 		nulltest->arg = keyCol;
1426 		nulltest->nulltesttype = IS_NULL;
1427 		nulltest->argisrow = false;
1428 		nulltest->location = -1;
1429 
1430 		if (opexpr)
1431 		{
1432 			Expr	   *or;
1433 
1434 			or = makeBoolExpr(OR_EXPR, list_make2(nulltest, opexpr), -1);
1435 			result = list_make1(or);
1436 		}
1437 		else
1438 			result = list_make1(nulltest);
1439 	}
1440 
1441 	return result;
1442 }
1443 
1444 /*
1445  * get_range_key_properties
1446  *		Returns range partition key information for a given column
1447  *
1448  * This is a subroutine for get_qual_for_range, and its API is pretty
1449  * specialized to that caller.
1450  *
1451  * Constructs an Expr for the key column (returned in *keyCol) and Consts
1452  * for the lower and upper range limits (returned in *lower_val and
1453  * *upper_val).  For MINVALUE/MAXVALUE limits, NULL is returned instead of
1454  * a Const.  All of these structures are freshly palloc'd.
1455  *
1456  * *partexprs_item points to the cell containing the next expression in
1457  * the key->partexprs list, or NULL.  It may be advanced upon return.
1458  */
1459 static void
get_range_key_properties(PartitionKey key,int keynum,PartitionRangeDatum * ldatum,PartitionRangeDatum * udatum,ListCell ** partexprs_item,Expr ** keyCol,Const ** lower_val,Const ** upper_val)1460 get_range_key_properties(PartitionKey key, int keynum,
1461 						 PartitionRangeDatum *ldatum,
1462 						 PartitionRangeDatum *udatum,
1463 						 ListCell **partexprs_item,
1464 						 Expr **keyCol,
1465 						 Const **lower_val, Const **upper_val)
1466 {
1467 	/* Get partition key expression for this column */
1468 	if (key->partattrs[keynum] != 0)
1469 	{
1470 		*keyCol = (Expr *) makeVar(1,
1471 								   key->partattrs[keynum],
1472 								   key->parttypid[keynum],
1473 								   key->parttypmod[keynum],
1474 								   key->parttypcoll[keynum],
1475 								   0);
1476 	}
1477 	else
1478 	{
1479 		if (*partexprs_item == NULL)
1480 			elog(ERROR, "wrong number of partition key expressions");
1481 		*keyCol = copyObject(lfirst(*partexprs_item));
1482 		*partexprs_item = lnext(*partexprs_item);
1483 	}
1484 
1485 	/* Get appropriate Const nodes for the bounds */
1486 	if (ldatum->kind == PARTITION_RANGE_DATUM_VALUE)
1487 		*lower_val = castNode(Const, copyObject(ldatum->value));
1488 	else
1489 		*lower_val = NULL;
1490 
1491 	if (udatum->kind == PARTITION_RANGE_DATUM_VALUE)
1492 		*upper_val = castNode(Const, copyObject(udatum->value));
1493 	else
1494 		*upper_val = NULL;
1495 }
1496 
1497 /*
1498  * get_qual_for_range
1499  *
1500  * Returns an implicit-AND list of expressions to use as a range partition's
1501  * constraint, given the partition key and bound structures.
1502  *
1503  * For a multi-column range partition key, say (a, b, c), with (al, bl, cl)
1504  * as the lower bound tuple and (au, bu, cu) as the upper bound tuple, we
1505  * generate an expression tree of the following form:
1506  *
1507  *	(a IS NOT NULL) and (b IS NOT NULL) and (c IS NOT NULL)
1508  *		AND
1509  *	(a > al OR (a = al AND b > bl) OR (a = al AND b = bl AND c >= cl))
1510  *		AND
1511  *	(a < au OR (a = au AND b < bu) OR (a = au AND b = bu AND c < cu))
1512  *
1513  * It is often the case that a prefix of lower and upper bound tuples contains
1514  * the same values, for example, (al = au), in which case, we will emit an
1515  * expression tree of the following form:
1516  *
1517  *	(a IS NOT NULL) and (b IS NOT NULL) and (c IS NOT NULL)
1518  *		AND
1519  *	(a = al)
1520  *		AND
1521  *	(b > bl OR (b = bl AND c >= cl))
1522  *		AND
1523  *	(b < bu OR (b = bu AND c < cu))
1524  *
1525  * If a bound datum is either MINVALUE or MAXVALUE, these expressions are
1526  * simplified using the fact that any value is greater than MINVALUE and less
1527  * than MAXVALUE. So, for example, if cu = MAXVALUE, c < cu is automatically
1528  * true, and we need not emit any expression for it, and the last line becomes
1529  *
1530  *	(b < bu) OR (b = bu), which is simplified to (b <= bu)
1531  *
1532  * In most common cases with only one partition column, say a, the following
1533  * expression tree will be generated: a IS NOT NULL AND a >= al AND a < au
1534  *
1535  * If we end up with an empty result list, we return a single-member list
1536  * containing a constant TRUE, because callers expect a non-empty list.
1537  */
1538 static List *
get_qual_for_range(PartitionKey key,PartitionBoundSpec * spec)1539 get_qual_for_range(PartitionKey key, PartitionBoundSpec *spec)
1540 {
1541 	List	   *result = NIL;
1542 	ListCell   *cell1,
1543 			   *cell2,
1544 			   *partexprs_item,
1545 			   *partexprs_item_saved;
1546 	int			i,
1547 				j;
1548 	PartitionRangeDatum *ldatum,
1549 			   *udatum;
1550 	Expr	   *keyCol;
1551 	Const	   *lower_val,
1552 			   *upper_val;
1553 	NullTest   *nulltest;
1554 	List	   *lower_or_arms,
1555 			   *upper_or_arms;
1556 	int			num_or_arms,
1557 				current_or_arm;
1558 	ListCell   *lower_or_start_datum,
1559 			   *upper_or_start_datum;
1560 	bool		need_next_lower_arm,
1561 				need_next_upper_arm;
1562 
1563 	lower_or_start_datum = list_head(spec->lowerdatums);
1564 	upper_or_start_datum = list_head(spec->upperdatums);
1565 	num_or_arms = key->partnatts;
1566 
1567 	/*
1568 	 * A range-partitioned table does not currently allow partition keys to be
1569 	 * null, so emit an IS NOT NULL expression for each key column.
1570 	 */
1571 	partexprs_item = list_head(key->partexprs);
1572 	for (i = 0; i < key->partnatts; i++)
1573 	{
1574 		Expr	   *keyCol;
1575 
1576 		if (key->partattrs[i] != 0)
1577 		{
1578 			keyCol = (Expr *) makeVar(1,
1579 									  key->partattrs[i],
1580 									  key->parttypid[i],
1581 									  key->parttypmod[i],
1582 									  key->parttypcoll[i],
1583 									  0);
1584 		}
1585 		else
1586 		{
1587 			if (partexprs_item == NULL)
1588 				elog(ERROR, "wrong number of partition key expressions");
1589 			keyCol = copyObject(lfirst(partexprs_item));
1590 			partexprs_item = lnext(partexprs_item);
1591 		}
1592 
1593 		nulltest = makeNode(NullTest);
1594 		nulltest->arg = keyCol;
1595 		nulltest->nulltesttype = IS_NOT_NULL;
1596 		nulltest->argisrow = false;
1597 		nulltest->location = -1;
1598 		result = lappend(result, nulltest);
1599 	}
1600 
1601 	/*
1602 	 * Iterate over the key columns and check if the corresponding lower and
1603 	 * upper datums are equal using the btree equality operator for the
1604 	 * column's type.  If equal, we emit single keyCol = common_value
1605 	 * expression.  Starting from the first column for which the corresponding
1606 	 * lower and upper bound datums are not equal, we generate OR expressions
1607 	 * as shown in the function's header comment.
1608 	 */
1609 	i = 0;
1610 	partexprs_item = list_head(key->partexprs);
1611 	partexprs_item_saved = partexprs_item;	/* placate compiler */
1612 	forboth(cell1, spec->lowerdatums, cell2, spec->upperdatums)
1613 	{
1614 		EState	   *estate;
1615 		MemoryContext oldcxt;
1616 		Expr	   *test_expr;
1617 		ExprState  *test_exprstate;
1618 		Datum		test_result;
1619 		bool		isNull;
1620 
1621 		ldatum = castNode(PartitionRangeDatum, lfirst(cell1));
1622 		udatum = castNode(PartitionRangeDatum, lfirst(cell2));
1623 
1624 		/*
1625 		 * Since get_range_key_properties() modifies partexprs_item, and we
1626 		 * might need to start over from the previous expression in the later
1627 		 * part of this function, save away the current value.
1628 		 */
1629 		partexprs_item_saved = partexprs_item;
1630 
1631 		get_range_key_properties(key, i, ldatum, udatum,
1632 								 &partexprs_item,
1633 								 &keyCol,
1634 								 &lower_val, &upper_val);
1635 
1636 		/*
1637 		 * If either value is NULL, the corresponding partition bound is
1638 		 * either MINVALUE or MAXVALUE, and we treat them as unequal, because
1639 		 * even if they're the same, there is no common value to equate the
1640 		 * key column with.
1641 		 */
1642 		if (!lower_val || !upper_val)
1643 			break;
1644 
1645 		/* Create the test expression */
1646 		estate = CreateExecutorState();
1647 		oldcxt = MemoryContextSwitchTo(estate->es_query_cxt);
1648 		test_expr = make_partition_op_expr(key, i, BTEqualStrategyNumber,
1649 										   (Expr *) lower_val,
1650 										   (Expr *) upper_val);
1651 		fix_opfuncids((Node *) test_expr);
1652 		test_exprstate = ExecInitExpr(test_expr, NULL);
1653 		test_result = ExecEvalExprSwitchContext(test_exprstate,
1654 												GetPerTupleExprContext(estate),
1655 												&isNull);
1656 		MemoryContextSwitchTo(oldcxt);
1657 		FreeExecutorState(estate);
1658 
1659 		/* If not equal, go generate the OR expressions */
1660 		if (!DatumGetBool(test_result))
1661 			break;
1662 
1663 		/*
1664 		 * The bounds for the last key column can't be equal, because such a
1665 		 * range partition would never be allowed to be defined (it would have
1666 		 * an empty range otherwise).
1667 		 */
1668 		if (i == key->partnatts - 1)
1669 			elog(ERROR, "invalid range bound specification");
1670 
1671 		/* Equal, so generate keyCol = lower_val expression */
1672 		result = lappend(result,
1673 						 make_partition_op_expr(key, i, BTEqualStrategyNumber,
1674 												keyCol, (Expr *) lower_val));
1675 
1676 		i++;
1677 	}
1678 
1679 	/* First pair of lower_val and upper_val that are not equal. */
1680 	lower_or_start_datum = cell1;
1681 	upper_or_start_datum = cell2;
1682 
1683 	/* OR will have as many arms as there are key columns left. */
1684 	num_or_arms = key->partnatts - i;
1685 	current_or_arm = 0;
1686 	lower_or_arms = upper_or_arms = NIL;
1687 	need_next_lower_arm = need_next_upper_arm = true;
1688 	while (current_or_arm < num_or_arms)
1689 	{
1690 		List	   *lower_or_arm_args = NIL,
1691 				   *upper_or_arm_args = NIL;
1692 
1693 		/* Restart scan of columns from the i'th one */
1694 		j = i;
1695 		partexprs_item = partexprs_item_saved;
1696 
1697 		for_both_cell(cell1, lower_or_start_datum, cell2, upper_or_start_datum)
1698 		{
1699 			PartitionRangeDatum *ldatum_next = NULL,
1700 					   *udatum_next = NULL;
1701 
1702 			ldatum = castNode(PartitionRangeDatum, lfirst(cell1));
1703 			if (lnext(cell1))
1704 				ldatum_next = castNode(PartitionRangeDatum,
1705 									   lfirst(lnext(cell1)));
1706 			udatum = castNode(PartitionRangeDatum, lfirst(cell2));
1707 			if (lnext(cell2))
1708 				udatum_next = castNode(PartitionRangeDatum,
1709 									   lfirst(lnext(cell2)));
1710 			get_range_key_properties(key, j, ldatum, udatum,
1711 									 &partexprs_item,
1712 									 &keyCol,
1713 									 &lower_val, &upper_val);
1714 
1715 			if (need_next_lower_arm && lower_val)
1716 			{
1717 				uint16		strategy;
1718 
1719 				/*
1720 				 * For the non-last columns of this arm, use the EQ operator.
1721 				 * For the last column of this arm, use GT, unless this is the
1722 				 * last column of the whole bound check, or the next bound
1723 				 * datum is MINVALUE, in which case use GE.
1724 				 */
1725 				if (j - i < current_or_arm)
1726 					strategy = BTEqualStrategyNumber;
1727 				else if (j == key->partnatts - 1 ||
1728 						 (ldatum_next &&
1729 						  ldatum_next->kind == PARTITION_RANGE_DATUM_MINVALUE))
1730 					strategy = BTGreaterEqualStrategyNumber;
1731 				else
1732 					strategy = BTGreaterStrategyNumber;
1733 
1734 				lower_or_arm_args = lappend(lower_or_arm_args,
1735 											make_partition_op_expr(key, j,
1736 																   strategy,
1737 																   keyCol,
1738 																   (Expr *) lower_val));
1739 			}
1740 
1741 			if (need_next_upper_arm && upper_val)
1742 			{
1743 				uint16		strategy;
1744 
1745 				/*
1746 				 * For the non-last columns of this arm, use the EQ operator.
1747 				 * For the last column of this arm, use LT, unless the next
1748 				 * bound datum is MAXVALUE, in which case use LE.
1749 				 */
1750 				if (j - i < current_or_arm)
1751 					strategy = BTEqualStrategyNumber;
1752 				else if (udatum_next &&
1753 						 udatum_next->kind == PARTITION_RANGE_DATUM_MAXVALUE)
1754 					strategy = BTLessEqualStrategyNumber;
1755 				else
1756 					strategy = BTLessStrategyNumber;
1757 
1758 				upper_or_arm_args = lappend(upper_or_arm_args,
1759 											make_partition_op_expr(key, j,
1760 																   strategy,
1761 																   keyCol,
1762 																   (Expr *) upper_val));
1763 			}
1764 
1765 			/*
1766 			 * Did we generate enough of OR's arguments?  First arm considers
1767 			 * the first of the remaining columns, second arm considers first
1768 			 * two of the remaining columns, and so on.
1769 			 */
1770 			++j;
1771 			if (j - i > current_or_arm)
1772 			{
1773 				/*
1774 				 * We must not emit any more arms if the new column that will
1775 				 * be considered is unbounded, or this one was.
1776 				 */
1777 				if (!lower_val || !ldatum_next ||
1778 					ldatum_next->kind != PARTITION_RANGE_DATUM_VALUE)
1779 					need_next_lower_arm = false;
1780 				if (!upper_val || !udatum_next ||
1781 					udatum_next->kind != PARTITION_RANGE_DATUM_VALUE)
1782 					need_next_upper_arm = false;
1783 				break;
1784 			}
1785 		}
1786 
1787 		if (lower_or_arm_args != NIL)
1788 			lower_or_arms = lappend(lower_or_arms,
1789 									list_length(lower_or_arm_args) > 1
1790 									? makeBoolExpr(AND_EXPR, lower_or_arm_args, -1)
1791 									: linitial(lower_or_arm_args));
1792 
1793 		if (upper_or_arm_args != NIL)
1794 			upper_or_arms = lappend(upper_or_arms,
1795 									list_length(upper_or_arm_args) > 1
1796 									? makeBoolExpr(AND_EXPR, upper_or_arm_args, -1)
1797 									: linitial(upper_or_arm_args));
1798 
1799 		/* If no work to do in the next iteration, break away. */
1800 		if (!need_next_lower_arm && !need_next_upper_arm)
1801 			break;
1802 
1803 		++current_or_arm;
1804 	}
1805 
1806 	/*
1807 	 * Generate the OR expressions for each of lower and upper bounds (if
1808 	 * required), and append to the list of implicitly ANDed list of
1809 	 * expressions.
1810 	 */
1811 	if (lower_or_arms != NIL)
1812 		result = lappend(result,
1813 						 list_length(lower_or_arms) > 1
1814 						 ? makeBoolExpr(OR_EXPR, lower_or_arms, -1)
1815 						 : linitial(lower_or_arms));
1816 	if (upper_or_arms != NIL)
1817 		result = lappend(result,
1818 						 list_length(upper_or_arms) > 1
1819 						 ? makeBoolExpr(OR_EXPR, upper_or_arms, -1)
1820 						 : linitial(upper_or_arms));
1821 
1822 	/* As noted above, caller expects the list to be non-empty. */
1823 	if (result == NIL)
1824 		result = list_make1(makeBoolConst(true, false));
1825 
1826 	return result;
1827 }
1828 
1829 /*
1830  * generate_partition_qual
1831  *
1832  * Generate partition predicate from rel's partition bound expression
1833  *
1834  * We cache a copy of the result in the relcache entry, after constructing
1835  * it using the caller's context.  This approach avoids leaking any data
1836  * into long-lived cache contexts, especially if we fail partway through.
1837  */
1838 static List *
generate_partition_qual(Relation rel)1839 generate_partition_qual(Relation rel)
1840 {
1841 	HeapTuple	tuple;
1842 	MemoryContext oldcxt;
1843 	Datum		boundDatum;
1844 	bool		isnull;
1845 	PartitionBoundSpec *bound;
1846 	List	   *my_qual = NIL,
1847 			   *result = NIL;
1848 	Relation	parent;
1849 	bool		found_whole_row;
1850 
1851 	/* Guard against stack overflow due to overly deep partition tree */
1852 	check_stack_depth();
1853 
1854 	/* If we already cached the result, just return a copy */
1855 	if (rel->rd_partcheckvalid)
1856 		return copyObject(rel->rd_partcheck);
1857 
1858 	/* Grab at least an AccessShareLock on the parent table */
1859 	parent = relation_open(get_partition_parent(RelationGetRelid(rel)),
1860 						   AccessShareLock);
1861 
1862 	/* Get pg_class.relpartbound */
1863 	tuple = SearchSysCache1(RELOID, RelationGetRelid(rel));
1864 	if (!HeapTupleIsValid(tuple))
1865 		elog(ERROR, "cache lookup failed for relation %u",
1866 			 RelationGetRelid(rel));
1867 
1868 	boundDatum = SysCacheGetAttr(RELOID, tuple,
1869 								 Anum_pg_class_relpartbound,
1870 								 &isnull);
1871 	if (isnull)					/* should not happen */
1872 		elog(ERROR, "relation \"%s\" has relpartbound = null",
1873 			 RelationGetRelationName(rel));
1874 	bound = castNode(PartitionBoundSpec,
1875 					 stringToNode(TextDatumGetCString(boundDatum)));
1876 	ReleaseSysCache(tuple);
1877 
1878 	my_qual = get_qual_from_partbound(rel, parent, bound);
1879 
1880 	/* Add the parent's quals to the list (if any) */
1881 	if (parent->rd_rel->relispartition)
1882 		result = list_concat(generate_partition_qual(parent), my_qual);
1883 	else
1884 		result = my_qual;
1885 
1886 	/*
1887 	 * Change Vars to have partition's attnos instead of the parent's. We do
1888 	 * this after we concatenate the parent's quals, because we want every Var
1889 	 * in it to bear this relation's attnos. It's safe to assume varno = 1
1890 	 * here.
1891 	 */
1892 	result = map_partition_varattnos(result, 1, rel, parent,
1893 									 &found_whole_row);
1894 	/* There can never be a whole-row reference here */
1895 	if (found_whole_row)
1896 		elog(ERROR, "unexpected whole-row reference found in partition key");
1897 
1898 	/* Assert that we're not leaking any old data during assignments below */
1899 	Assert(rel->rd_partcheckcxt == NULL);
1900 	Assert(rel->rd_partcheck == NIL);
1901 
1902 	/*
1903 	 * Save a copy in the relcache.  The order of these operations is fairly
1904 	 * critical to avoid memory leaks and ensure that we don't leave a corrupt
1905 	 * relcache entry if we fail partway through copyObject.
1906 	 *
1907 	 * If, as is definitely possible, the partcheck list is NIL, then we do
1908 	 * not need to make a context to hold it.
1909 	 */
1910 	if (result != NIL)
1911 	{
1912 		rel->rd_partcheckcxt = AllocSetContextCreate(CacheMemoryContext,
1913 													 RelationGetRelationName(rel),
1914 													 ALLOCSET_SMALL_SIZES);
1915 		oldcxt = MemoryContextSwitchTo(rel->rd_partcheckcxt);
1916 		rel->rd_partcheck = copyObject(result);
1917 		MemoryContextSwitchTo(oldcxt);
1918 	}
1919 	else
1920 		rel->rd_partcheck = NIL;
1921 	rel->rd_partcheckvalid = true;
1922 
1923 	/* Keep the parent locked until commit */
1924 	relation_close(parent, NoLock);
1925 
1926 	/* Return the working copy to the caller */
1927 	return result;
1928 }
1929 
1930 /* ----------------
1931  *		FormPartitionKeyDatum
1932  *			Construct values[] and isnull[] arrays for the partition key
1933  *			of a tuple.
1934  *
1935  *	pd				Partition dispatch object of the partitioned table
1936  *	slot			Heap tuple from which to extract partition key
1937  *	estate			executor state for evaluating any partition key
1938  *					expressions (must be non-NULL)
1939  *	values			Array of partition key Datums (output area)
1940  *	isnull			Array of is-null indicators (output area)
1941  *
1942  * the ecxt_scantuple slot of estate's per-tuple expr context must point to
1943  * the heap tuple passed in.
1944  * ----------------
1945  */
1946 void
FormPartitionKeyDatum(PartitionDispatch pd,TupleTableSlot * slot,EState * estate,Datum * values,bool * isnull)1947 FormPartitionKeyDatum(PartitionDispatch pd,
1948 					  TupleTableSlot *slot,
1949 					  EState *estate,
1950 					  Datum *values,
1951 					  bool *isnull)
1952 {
1953 	ListCell   *partexpr_item;
1954 	int			i;
1955 
1956 	if (pd->key->partexprs != NIL && pd->keystate == NIL)
1957 	{
1958 		/* Check caller has set up context correctly */
1959 		Assert(estate != NULL &&
1960 			   GetPerTupleExprContext(estate)->ecxt_scantuple == slot);
1961 
1962 		/* First time through, set up expression evaluation state */
1963 		pd->keystate = ExecPrepareExprList(pd->key->partexprs, estate);
1964 	}
1965 
1966 	partexpr_item = list_head(pd->keystate);
1967 	for (i = 0; i < pd->key->partnatts; i++)
1968 	{
1969 		AttrNumber	keycol = pd->key->partattrs[i];
1970 		Datum		datum;
1971 		bool		isNull;
1972 
1973 		if (keycol != 0)
1974 		{
1975 			/* Plain column; get the value directly from the heap tuple */
1976 			datum = slot_getattr(slot, keycol, &isNull);
1977 		}
1978 		else
1979 		{
1980 			/* Expression; need to evaluate it */
1981 			if (partexpr_item == NULL)
1982 				elog(ERROR, "wrong number of partition key expressions");
1983 			datum = ExecEvalExprSwitchContext((ExprState *) lfirst(partexpr_item),
1984 											  GetPerTupleExprContext(estate),
1985 											  &isNull);
1986 			partexpr_item = lnext(partexpr_item);
1987 		}
1988 		values[i] = datum;
1989 		isnull[i] = isNull;
1990 	}
1991 
1992 	if (partexpr_item != NULL)
1993 		elog(ERROR, "wrong number of partition key expressions");
1994 }
1995 
1996 /*
1997  * get_partition_for_tuple
1998  *		Finds a leaf partition for tuple contained in *slot
1999  *
2000  * Returned value is the sequence number of the leaf partition thus found,
2001  * or -1 if no leaf partition is found for the tuple.  *failed_at is set
2002  * to the OID of the partitioned table whose partition was not found in
2003  * the latter case.
2004  */
2005 int
get_partition_for_tuple(PartitionDispatch * pd,TupleTableSlot * slot,EState * estate,PartitionDispatchData ** failed_at,TupleTableSlot ** failed_slot)2006 get_partition_for_tuple(PartitionDispatch *pd,
2007 						TupleTableSlot *slot,
2008 						EState *estate,
2009 						PartitionDispatchData **failed_at,
2010 						TupleTableSlot **failed_slot)
2011 {
2012 	PartitionDispatch parent;
2013 	Datum		values[PARTITION_MAX_KEYS];
2014 	bool		isnull[PARTITION_MAX_KEYS];
2015 	int			cur_offset,
2016 				cur_index;
2017 	int			i,
2018 				result;
2019 	ExprContext *ecxt = GetPerTupleExprContext(estate);
2020 	TupleTableSlot *ecxt_scantuple_old = ecxt->ecxt_scantuple;
2021 
2022 	/* start with the root partitioned table */
2023 	parent = pd[0];
2024 	while (true)
2025 	{
2026 		PartitionKey key = parent->key;
2027 		PartitionDesc partdesc = parent->partdesc;
2028 		TupleTableSlot *myslot = parent->tupslot;
2029 		TupleConversionMap *map = parent->tupmap;
2030 
2031 		if (myslot != NULL && map != NULL)
2032 		{
2033 			HeapTuple	tuple = ExecFetchSlotTuple(slot);
2034 
2035 			ExecClearTuple(myslot);
2036 			tuple = do_convert_tuple(tuple, map);
2037 			ExecStoreTuple(tuple, myslot, InvalidBuffer, true);
2038 			slot = myslot;
2039 		}
2040 
2041 		/* Quick exit */
2042 		if (partdesc->nparts == 0)
2043 		{
2044 			*failed_at = parent;
2045 			*failed_slot = slot;
2046 			result = -1;
2047 			goto error_exit;
2048 		}
2049 
2050 		/*
2051 		 * Extract partition key from tuple. Expression evaluation machinery
2052 		 * that FormPartitionKeyDatum() invokes expects ecxt_scantuple to
2053 		 * point to the correct tuple slot.  The slot might have changed from
2054 		 * what was used for the parent table if the table of the current
2055 		 * partitioning level has different tuple descriptor from the parent.
2056 		 * So update ecxt_scantuple accordingly.
2057 		 */
2058 		ecxt->ecxt_scantuple = slot;
2059 		FormPartitionKeyDatum(parent, slot, estate, values, isnull);
2060 
2061 		if (key->strategy == PARTITION_STRATEGY_RANGE)
2062 		{
2063 			/*
2064 			 * Since we cannot route tuples with NULL partition keys through a
2065 			 * range-partitioned table, simply return that no partition exists
2066 			 */
2067 			for (i = 0; i < key->partnatts; i++)
2068 			{
2069 				if (isnull[i])
2070 				{
2071 					*failed_at = parent;
2072 					*failed_slot = slot;
2073 					result = -1;
2074 					goto error_exit;
2075 				}
2076 			}
2077 		}
2078 
2079 		/*
2080 		 * A null partition key is only acceptable if null-accepting list
2081 		 * partition exists.
2082 		 */
2083 		cur_index = -1;
2084 		if (isnull[0] && partition_bound_accepts_nulls(partdesc->boundinfo))
2085 			cur_index = partdesc->boundinfo->null_index;
2086 		else if (!isnull[0])
2087 		{
2088 			/* Else bsearch in partdesc->boundinfo */
2089 			bool		equal = false;
2090 
2091 			cur_offset = partition_bound_bsearch(key, partdesc->boundinfo,
2092 												 values, false, &equal);
2093 			switch (key->strategy)
2094 			{
2095 				case PARTITION_STRATEGY_LIST:
2096 					if (cur_offset >= 0 && equal)
2097 						cur_index = partdesc->boundinfo->indexes[cur_offset];
2098 					else
2099 						cur_index = -1;
2100 					break;
2101 
2102 				case PARTITION_STRATEGY_RANGE:
2103 
2104 					/*
2105 					 * Offset returned is such that the bound at offset is
2106 					 * found to be less or equal with the tuple. So, the bound
2107 					 * at offset+1 would be the upper bound.
2108 					 */
2109 					cur_index = partdesc->boundinfo->indexes[cur_offset + 1];
2110 					break;
2111 
2112 				default:
2113 					elog(ERROR, "unexpected partition strategy: %d",
2114 						 (int) key->strategy);
2115 			}
2116 		}
2117 
2118 		/*
2119 		 * cur_index < 0 means we failed to find a partition of this parent.
2120 		 * cur_index >= 0 means we either found the leaf partition, or the
2121 		 * next parent to find a partition of.
2122 		 */
2123 		if (cur_index < 0)
2124 		{
2125 			result = -1;
2126 			*failed_at = parent;
2127 			*failed_slot = slot;
2128 			break;
2129 		}
2130 		else if (parent->indexes[cur_index] >= 0)
2131 		{
2132 			result = parent->indexes[cur_index];
2133 			break;
2134 		}
2135 		else
2136 			parent = pd[-parent->indexes[cur_index]];
2137 	}
2138 
2139 error_exit:
2140 	ecxt->ecxt_scantuple = ecxt_scantuple_old;
2141 	return result;
2142 }
2143 
2144 /*
2145  * qsort_partition_list_value_cmp
2146  *
2147  * Compare two list partition bound datums
2148  */
2149 static int32
qsort_partition_list_value_cmp(const void * a,const void * b,void * arg)2150 qsort_partition_list_value_cmp(const void *a, const void *b, void *arg)
2151 {
2152 	Datum		val1 = (*(const PartitionListValue **) a)->value,
2153 				val2 = (*(const PartitionListValue **) b)->value;
2154 	PartitionKey key = (PartitionKey) arg;
2155 
2156 	return DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[0],
2157 										   key->partcollation[0],
2158 										   val1, val2));
2159 }
2160 
2161 /*
2162  * make_one_range_bound
2163  *
2164  * Return a PartitionRangeBound given a list of PartitionRangeDatum elements
2165  * and a flag telling whether the bound is lower or not.  Made into a function
2166  * because there are multiple sites that want to use this facility.
2167  */
2168 static PartitionRangeBound *
make_one_range_bound(PartitionKey key,int index,List * datums,bool lower)2169 make_one_range_bound(PartitionKey key, int index, List *datums, bool lower)
2170 {
2171 	PartitionRangeBound *bound;
2172 	ListCell   *lc;
2173 	int			i;
2174 
2175 	bound = (PartitionRangeBound *) palloc0(sizeof(PartitionRangeBound));
2176 	bound->index = index;
2177 	bound->datums = (Datum *) palloc0(key->partnatts * sizeof(Datum));
2178 	bound->kind = (PartitionRangeDatumKind *) palloc0(key->partnatts *
2179 													  sizeof(PartitionRangeDatumKind));
2180 	bound->lower = lower;
2181 
2182 	i = 0;
2183 	foreach(lc, datums)
2184 	{
2185 		PartitionRangeDatum *datum = castNode(PartitionRangeDatum, lfirst(lc));
2186 
2187 		/* What's contained in this range datum? */
2188 		bound->kind[i] = datum->kind;
2189 
2190 		if (datum->kind == PARTITION_RANGE_DATUM_VALUE)
2191 		{
2192 			Const	   *val = castNode(Const, datum->value);
2193 
2194 			if (val->constisnull)
2195 				elog(ERROR, "invalid range bound datum");
2196 			bound->datums[i] = val->constvalue;
2197 		}
2198 
2199 		i++;
2200 	}
2201 
2202 	return bound;
2203 }
2204 
2205 /* Used when sorting range bounds across all range partitions */
2206 static int32
qsort_partition_rbound_cmp(const void * a,const void * b,void * arg)2207 qsort_partition_rbound_cmp(const void *a, const void *b, void *arg)
2208 {
2209 	PartitionRangeBound *b1 = (*(PartitionRangeBound *const *) a);
2210 	PartitionRangeBound *b2 = (*(PartitionRangeBound *const *) b);
2211 	PartitionKey key = (PartitionKey) arg;
2212 
2213 	return partition_rbound_cmp(key, b1->datums, b1->kind, b1->lower, b2);
2214 }
2215 
2216 /*
2217  * partition_rbound_cmp
2218  *
2219  * Return for two range bounds whether the 1st one (specified in datum1,
2220  * kind1, and lower1) is <, =, or > the bound specified in *b2.
2221  *
2222  * Note that if the values of the two range bounds compare equal, then we take
2223  * into account whether they are upper or lower bounds, and an upper bound is
2224  * considered to be smaller than a lower bound. This is important to the way
2225  * that RelationBuildPartitionDesc() builds the PartitionBoundInfoData
2226  * structure, which only stores the upper bound of a common boundary between
2227  * two contiguous partitions.
2228  */
2229 static int32
partition_rbound_cmp(PartitionKey key,Datum * datums1,PartitionRangeDatumKind * kind1,bool lower1,PartitionRangeBound * b2)2230 partition_rbound_cmp(PartitionKey key,
2231 					 Datum *datums1, PartitionRangeDatumKind *kind1,
2232 					 bool lower1, PartitionRangeBound *b2)
2233 {
2234 	int32		cmpval = 0;		/* placate compiler */
2235 	int			i;
2236 	Datum	   *datums2 = b2->datums;
2237 	PartitionRangeDatumKind *kind2 = b2->kind;
2238 	bool		lower2 = b2->lower;
2239 
2240 	for (i = 0; i < key->partnatts; i++)
2241 	{
2242 		/*
2243 		 * First, handle cases where the column is unbounded, which should not
2244 		 * invoke the comparison procedure, and should not consider any later
2245 		 * columns. Note that the PartitionRangeDatumKind enum elements
2246 		 * compare the same way as the values they represent.
2247 		 */
2248 		if (kind1[i] < kind2[i])
2249 			return -1;
2250 		else if (kind1[i] > kind2[i])
2251 			return 1;
2252 		else if (kind1[i] != PARTITION_RANGE_DATUM_VALUE)
2253 
2254 			/*
2255 			 * The column bounds are both MINVALUE or both MAXVALUE. No later
2256 			 * columns should be considered, but we still need to compare
2257 			 * whether they are upper or lower bounds.
2258 			 */
2259 			break;
2260 
2261 		cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[i],
2262 												 key->partcollation[i],
2263 												 datums1[i],
2264 												 datums2[i]));
2265 		if (cmpval != 0)
2266 			break;
2267 	}
2268 
2269 	/*
2270 	 * If the comparison is anything other than equal, we're done. If they
2271 	 * compare equal though, we still have to consider whether the boundaries
2272 	 * are inclusive or exclusive.  Exclusive one is considered smaller of the
2273 	 * two.
2274 	 */
2275 	if (cmpval == 0 && lower1 != lower2)
2276 		cmpval = lower1 ? 1 : -1;
2277 
2278 	return cmpval;
2279 }
2280 
2281 /*
2282  * partition_rbound_datum_cmp
2283  *
2284  * Return whether range bound (specified in rb_datums, rb_kind, and rb_lower)
2285  * is <, =, or > partition key of tuple (tuple_datums)
2286  */
2287 static int32
partition_rbound_datum_cmp(PartitionKey key,Datum * rb_datums,PartitionRangeDatumKind * rb_kind,Datum * tuple_datums)2288 partition_rbound_datum_cmp(PartitionKey key,
2289 						   Datum *rb_datums, PartitionRangeDatumKind *rb_kind,
2290 						   Datum *tuple_datums)
2291 {
2292 	int			i;
2293 	int32		cmpval = -1;
2294 
2295 	for (i = 0; i < key->partnatts; i++)
2296 	{
2297 		if (rb_kind[i] == PARTITION_RANGE_DATUM_MINVALUE)
2298 			return -1;
2299 		else if (rb_kind[i] == PARTITION_RANGE_DATUM_MAXVALUE)
2300 			return 1;
2301 
2302 		cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[i],
2303 												 key->partcollation[i],
2304 												 rb_datums[i],
2305 												 tuple_datums[i]));
2306 		if (cmpval != 0)
2307 			break;
2308 	}
2309 
2310 	return cmpval;
2311 }
2312 
2313 /*
2314  * partition_bound_cmp
2315  *
2316  * Return whether the bound at offset in boundinfo is <, =, or > the argument
2317  * specified in *probe.
2318  */
2319 static int32
partition_bound_cmp(PartitionKey key,PartitionBoundInfo boundinfo,int offset,void * probe,bool probe_is_bound)2320 partition_bound_cmp(PartitionKey key, PartitionBoundInfo boundinfo,
2321 					int offset, void *probe, bool probe_is_bound)
2322 {
2323 	Datum	   *bound_datums = boundinfo->datums[offset];
2324 	int32		cmpval = -1;
2325 
2326 	switch (key->strategy)
2327 	{
2328 		case PARTITION_STRATEGY_LIST:
2329 			cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[0],
2330 													 key->partcollation[0],
2331 													 bound_datums[0],
2332 													 *(Datum *) probe));
2333 			break;
2334 
2335 		case PARTITION_STRATEGY_RANGE:
2336 			{
2337 				PartitionRangeDatumKind *kind = boundinfo->kind[offset];
2338 
2339 				if (probe_is_bound)
2340 				{
2341 					/*
2342 					 * We need to pass whether the existing bound is a lower
2343 					 * bound, so that two equal-valued lower and upper bounds
2344 					 * are not regarded equal.
2345 					 */
2346 					bool		lower = boundinfo->indexes[offset] < 0;
2347 
2348 					cmpval = partition_rbound_cmp(key,
2349 												  bound_datums, kind, lower,
2350 												  (PartitionRangeBound *) probe);
2351 				}
2352 				else
2353 					cmpval = partition_rbound_datum_cmp(key,
2354 														bound_datums, kind,
2355 														(Datum *) probe);
2356 				break;
2357 			}
2358 
2359 		default:
2360 			elog(ERROR, "unexpected partition strategy: %d",
2361 				 (int) key->strategy);
2362 	}
2363 
2364 	return cmpval;
2365 }
2366 
2367 /*
2368  * Binary search on a collection of partition bounds. Returns greatest
2369  * bound in array boundinfo->datums which is less than or equal to *probe.
2370  * If all bounds in the array are greater than *probe, -1 is returned.
2371  *
2372  * *probe could either be a partition bound or a Datum array representing
2373  * the partition key of a tuple being routed; probe_is_bound tells which.
2374  * We pass that down to the comparison function so that it can interpret the
2375  * contents of *probe accordingly.
2376  *
2377  * *is_equal is set to whether the bound at the returned index is equal with
2378  * *probe.
2379  */
2380 static int
partition_bound_bsearch(PartitionKey key,PartitionBoundInfo boundinfo,void * probe,bool probe_is_bound,bool * is_equal)2381 partition_bound_bsearch(PartitionKey key, PartitionBoundInfo boundinfo,
2382 						void *probe, bool probe_is_bound, bool *is_equal)
2383 {
2384 	int			lo,
2385 				hi,
2386 				mid;
2387 
2388 	lo = -1;
2389 	hi = boundinfo->ndatums - 1;
2390 	while (lo < hi)
2391 	{
2392 		int32		cmpval;
2393 
2394 		mid = (lo + hi + 1) / 2;
2395 		cmpval = partition_bound_cmp(key, boundinfo, mid, probe,
2396 									 probe_is_bound);
2397 		if (cmpval <= 0)
2398 		{
2399 			lo = mid;
2400 			*is_equal = (cmpval == 0);
2401 
2402 			if (*is_equal)
2403 				break;
2404 		}
2405 		else
2406 			hi = mid - 1;
2407 	}
2408 
2409 	return lo;
2410 }
2411