/*-------------------------------------------------------------------------
 *
 * execPartition.c
 *      Support routines for partitioning.
 *
 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *      src/backend/executor/execPartition.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "catalog/partition.h"
#include "catalog/pg_inherits.h"
#include "catalog/pg_type.h"
#include "executor/execPartition.h"
#include "executor/executor.h"
#include "foreign/fdwapi.h"
#include "mb/pg_wchar.h"
#include "miscadmin.h"
#include "nodes/makefuncs.h"
#include "partitioning/partbounds.h"
#include "partitioning/partprune.h"
#include "rewrite/rewriteManip.h"
#include "utils/lsyscache.h"
#include "utils/partcache.h"
#include "utils/rel.h"
#include "utils/rls.h"
#include "utils/ruleutils.h"


/*-----------------------
 * PartitionDispatch - information about one partitioned table in a partition
 * hierarchy required to route a tuple to one of its partitions
 *
 *  reldesc     Relation descriptor of the table
 *  key         Partition key information of the table
 *  keystate    Execution state required for expressions in the partition key
 *  partdesc    Partition descriptor of the table
 *  tupslot     A standalone TupleTableSlot initialized with this table's
 *              tuple descriptor
 *  tupmap      TupleConversionMap to convert from the parent's rowtype to
 *              this table's rowtype (when extracting the partition key of a
 *              tuple just before routing it through this table)
 *  indexes     Array with partdesc->nparts members (for details on what
 *              individual members represent, see how they are set in
 *              get_partition_dispatch_recurse())
 *-----------------------
 */
typedef struct PartitionDispatchData
{
    Relation    reldesc;
    PartitionKey key;
    List       *keystate;       /* list of ExprState */
    PartitionDesc partdesc;
    TupleTableSlot *tupslot;
    TupleConversionMap *tupmap;
    int        *indexes;
} PartitionDispatchData;


static PartitionDispatch *RelationGetPartitionDispatchInfo(Relation rel,
                                 int *num_parted, List **leaf_part_oids);
static void get_partition_dispatch_recurse(Relation rel, Relation parent,
                               List **pds, List **leaf_part_oids);
static void FormPartitionKeyDatum(PartitionDispatch pd,
                      TupleTableSlot *slot,
                      EState *estate,
                      Datum *values,
                      bool *isnull);
static int  get_partition_for_tuple(Relation relation, Datum *values,
                         bool *isnull);
static char *ExecBuildSlotPartitionKeyDescription(Relation rel,
                                     Datum *values,
                                     bool *isnull,
                                     int maxfieldlen);
static List *adjust_partition_tlist(List *tlist, TupleConversionMap *map);
static void ExecInitPruningContext(PartitionPruneContext *context,
                       Oid reloid,
                       List *pruning_steps,
                       PlanState *planstate);
static void find_matching_subplans_recurse(PartitionPruningData *prunedata,
                               PartitionedRelPruningData *pprune,
                               bool initial_prune,
                               Bitmapset **validsubplans);


/*
 * ExecSetupPartitionTupleRouting - sets up information needed during
 * tuple routing for partitioned tables, encapsulates it in
 * PartitionTupleRouting, and returns it.
 *
 * Note that all the relations in the partition tree are locked using the
 * RowExclusiveLock mode upon return from this function.
 *
 * While we allocate the arrays of pointers to ResultRelInfo and
 * TupleConversionMap for all partitions here, the objects themselves are
 * lazily allocated for a given partition the first time a tuple is routed
 * to it; see ExecInitPartitionInfo.  However, if the function is invoked
 * for update tuple routing, the caller will already have initialized
 * ResultRelInfos for some of the partitions, which are reused and assigned
 * to their respective slots in the aforementioned array.  For such
 * partitions, we delay setting up objects such as TupleConversionMap until
 * those are actually chosen as the partitions to route tuples to.  See
 * ExecPrepareTupleRouting.
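 *
 * Callers (e.g., ExecInitModifyTable() for INSERT/UPDATE tuple routing, or
 * COPY) are expected to eventually call ExecCleanupTupleRouting() on the
 * returned PartitionTupleRouting to close the partitions opened here and
 * release the allocated slots.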
 */
PartitionTupleRouting *
ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, ResultRelInfo *rootResultRelInfo)
{
    List       *leaf_parts;
    ListCell   *cell;
    int         i;
    ResultRelInfo *update_rri = NULL;
    int         num_update_rri = 0,
                update_rri_index = 0;
    PartitionTupleRouting *proute;
    int         nparts;
    ModifyTable *node = mtstate ? (ModifyTable *) mtstate->ps.plan : NULL;

    /*
     * Get the information about the partition tree after locking all the
     * partitions.
     */
    (void) find_all_inheritors(RelationGetRelid(rootResultRelInfo->ri_RelationDesc),
                               RowExclusiveLock, NULL);
    proute = (PartitionTupleRouting *) palloc0(sizeof(PartitionTupleRouting));
    proute->partition_dispatch_info =
        RelationGetPartitionDispatchInfo(rootResultRelInfo->ri_RelationDesc,
                                         &proute->num_dispatch,
                                         &leaf_parts);
    proute->num_partitions = nparts = list_length(leaf_parts);
    proute->partitions =
        (ResultRelInfo **) palloc(nparts * sizeof(ResultRelInfo *));
    proute->parent_child_tupconv_maps =
        (TupleConversionMap **) palloc0(nparts * sizeof(TupleConversionMap *));
    proute->partition_oids = (Oid *) palloc(nparts * sizeof(Oid));

    /* Set up details specific to the type of tuple routing we are doing. */
    if (node && node->operation == CMD_UPDATE)
    {
        update_rri = mtstate->resultRelInfo;
        num_update_rri = list_length(node->plans);
        proute->subplan_partition_offsets =
            palloc(num_update_rri * sizeof(int));
        proute->num_subplan_partition_offsets = num_update_rri;

        /*
         * We need an additional tuple slot for storing transient tuples that
         * are converted to the root table descriptor.
         */
        proute->root_tuple_slot = MakeTupleTableSlot(NULL);
    }

    /*
     * Initialize an empty slot that will be used to manipulate tuples of any
     * given partition's rowtype.  It is attached to the caller-specified node
     * (such as ModifyTableState) and released when the node finishes
     * processing.
     */
    proute->partition_tuple_slot = MakeTupleTableSlot(NULL);

    i = 0;
    foreach(cell, leaf_parts)
    {
        ResultRelInfo *leaf_part_rri = NULL;
        Oid         leaf_oid = lfirst_oid(cell);

        proute->partition_oids[i] = leaf_oid;

        /*
         * If the leaf partition is already present in the per-subplan result
         * rels, we re-use that rather than initialize a new result rel.  The
         * per-subplan resultrels and the resultrels of the leaf partitions
         * are both in the same canonical order.  So while going through the
         * leaf partition oids, we need to keep track of the next per-subplan
         * result rel to be looked for in the leaf partition resultrels.
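         *
         * For example, if the leaf partitions are (p1, p2, p3, p4) in that
         * order and only (p2, p4) appear in the UPDATE subplans, the test
         * below matches p2 and p4 as the loop reaches them, while
         * proute->partitions stays NULL for p1 and p3, to be filled in
         * lazily by ExecInitPartitionInfo() if a tuple is ever routed there.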
         */
        if (update_rri_index < num_update_rri &&
            RelationGetRelid(update_rri[update_rri_index].ri_RelationDesc) == leaf_oid)
        {
            leaf_part_rri = &update_rri[update_rri_index];

            /*
             * This is required in order to convert the partition's tuple to
             * be compatible with the root partitioned table's tuple
             * descriptor.  When generating the per-subplan result rels, this
             * was not set.
             */
            leaf_part_rri->ri_RootResultRelInfo = rootResultRelInfo;

            /* Remember the subplan offset for this ResultRelInfo */
            proute->subplan_partition_offsets[update_rri_index] = i;

            update_rri_index++;
        }

        proute->partitions[i] = leaf_part_rri;
        i++;
    }

    /*
     * For UPDATE, we should have found all the per-subplan resultrels in the
     * leaf partitions.  (If this is an INSERT, both values will be zero.)
     */
    Assert(update_rri_index == num_update_rri);

    return proute;
}

/*
 * ExecFindPartition -- Find a leaf partition in the partition tree rooted
 * at parent, for the heap tuple contained in *slot
 *
 * estate must be non-NULL; we'll need it to compute any expressions in the
 * partition key(s)
 *
 * If no leaf partition is found, this routine errors out with the
 * appropriate error message, else it returns the leaf partition sequence
 * number as an index into the array of (ResultRelInfos of) all leaf
 * partitions in the partition tree.
 */
int
ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd,
                  TupleTableSlot *slot, EState *estate)
{
    int         result;
    Datum       values[PARTITION_MAX_KEYS];
    bool        isnull[PARTITION_MAX_KEYS];
    Relation    rel;
    PartitionDispatch dispatch;
    ExprContext *ecxt = GetPerTupleExprContext(estate);
    TupleTableSlot *ecxt_scantuple_old = ecxt->ecxt_scantuple;
    TupleTableSlot *myslot = NULL;
    MemoryContext oldcxt;
    HeapTuple   tuple;

    /* use per-tuple context here to avoid leaking memory */
    oldcxt = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));

    /*
     * First check the root table's partition constraint, if any.  No point
     * in routing the tuple if it doesn't belong in the root table itself.
     */
    if (resultRelInfo->ri_PartitionCheck)
        ExecPartitionCheck(resultRelInfo, slot, estate, true);

    /* start with the root partitioned table */
    tuple = ExecFetchSlotTuple(slot);
    dispatch = pd[0];
    while (true)
    {
        PartitionDesc partdesc;
        TupleConversionMap *map = dispatch->tupmap;
        int         cur_index = -1;

        rel = dispatch->reldesc;
        partdesc = RelationGetPartitionDesc(rel);

        /*
         * Use the slot dedicated to this level's parent.  All parents except
         * the root have a dedicated slot.  For the root parent, we just use
         * the original input slot.
         */
        myslot = dispatch->tupslot == NULL ? slot : dispatch->tupslot;

        /*
         * If the tuple layout of this level's parent is different from the
         * previous level's parent, convert the tuple and store it into its
         * dedicated slot.
         */
        if (myslot != slot)
        {
            if (map != NULL)
                tuple = do_convert_tuple(tuple, map);
            ExecStoreTuple(tuple, myslot, InvalidBuffer, true);
        }
        else
            Assert(map == NULL);

        /*
         * Extract partition key from tuple.  Expression evaluation machinery
         * that FormPartitionKeyDatum() invokes expects ecxt_scantuple to
         * point to the correct tuple slot.  The slot might have changed from
         * what was used for the parent table if the table of the current
         * partitioning level has a different tuple descriptor from the
         * parent.  So update ecxt_scantuple accordingly.
         */
        ecxt->ecxt_scantuple = myslot;
        FormPartitionKeyDatum(dispatch, myslot, estate, values, isnull);

        /*
         * Nothing for get_partition_for_tuple() to do if there are no
         * partitions to begin with.
         */
        if (partdesc->nparts == 0)
        {
            result = -1;
            break;
        }

        cur_index = get_partition_for_tuple(rel, values, isnull);

        /*
         * cur_index < 0 means we failed to find a partition of this parent.
         * cur_index >= 0 means we either found the leaf partition, or the
         * next parent to find a partition of.
         */
        if (cur_index < 0)
        {
            result = -1;
            break;
        }
        else if (dispatch->indexes[cur_index] >= 0)
        {
            result = dispatch->indexes[cur_index];
            /* success! */
            break;
        }
        else
        {
            /* move down one level */
            dispatch = pd[-dispatch->indexes[cur_index]];

            /*
             * Make a copy of the tuple for the next level of routing.  If
             * this level's parent had a dedicated slot, we must clear its
             * tuple too, which would be the copy we made in the last
             * iteration.
             */
            tuple = ExecCopySlotTuple(myslot);
            if (myslot != slot)
                ExecClearTuple(myslot);
        }
    }

    /* A partition was not found. */
    if (result < 0)
    {
        char       *val_desc;

        val_desc = ExecBuildSlotPartitionKeyDescription(rel,
                                                        values, isnull, 64);
        Assert(OidIsValid(RelationGetRelid(rel)));
        ereport(ERROR,
                (errcode(ERRCODE_CHECK_VIOLATION),
                 errmsg("no partition of relation \"%s\" found for row",
                        RelationGetRelationName(rel)),
                 val_desc ? errdetail("Partition key of the failing row contains %s.", val_desc) : 0));
    }

    /* Release the tuple in the lowest parent's dedicated slot. */
    if (myslot != slot)
        ExecClearTuple(myslot);

    MemoryContextSwitchTo(oldcxt);
    ecxt->ecxt_scantuple = ecxt_scantuple_old;

    return result;
}

/*
 * ExecInitPartitionInfo
 *      Initialize ResultRelInfo and other information for a partition
 *
 * Returns the ResultRelInfo
 */
ResultRelInfo *
ExecInitPartitionInfo(ModifyTableState *mtstate,
                      ResultRelInfo *rootResultRelInfo,
                      PartitionTupleRouting *proute,
                      EState *estate, int partidx)
{
    ModifyTable *node = (ModifyTable *) mtstate->ps.plan;
    Relation    partrel;
    int         firstVarno = mtstate->resultRelInfo[0].ri_RangeTableIndex;
    Relation    firstResultRel = mtstate->resultRelInfo[0].ri_RelationDesc;
    ResultRelInfo *leaf_part_rri;
    MemoryContext oldContext;
    AttrNumber *part_attnos = NULL;
    bool        found_whole_row;

    /*
     * We locked all the partitions in ExecSetupPartitionTupleRouting,
     * including the leaf partitions.
     */
    partrel = heap_open(proute->partition_oids[partidx], NoLock);

    /*
     * Keep ResultRelInfo and other information for this partition in the
     * per-query memory context so they'll survive throughout the query.
     */
    oldContext = MemoryContextSwitchTo(estate->es_query_cxt);

    leaf_part_rri = makeNode(ResultRelInfo);
    InitResultRelInfo(leaf_part_rri,
                      partrel,
                      0,
                      rootResultRelInfo,
                      estate->es_instrument);

    /*
     * Verify result relation is a valid target for an INSERT.  An UPDATE of
     * a partition-key becomes a DELETE+INSERT operation, so this check is
     * still required when the operation is CMD_UPDATE.
     */
    CheckValidResultRel(leaf_part_rri, CMD_INSERT);

    /*
     * Since we've just initialized this ResultRelInfo, it's not in any list
     * attached to the estate as yet.  Add it, so that it can be found later.
     *
     * Note that the entries in this list appear in no predetermined order,
     * because partition result rels are initialized as and when they're
     * needed.
     */
    estate->es_tuple_routing_result_relations =
        lappend(estate->es_tuple_routing_result_relations,
                leaf_part_rri);

    /*
     * Open partition indices.  The user may have asked to check for
     * conflicts within this leaf partition and do "nothing" instead of
     * throwing an error.  Be prepared in that case by initializing the index
     * information needed by ExecInsert() to perform speculative insertions.
     */
    if (partrel->rd_rel->relhasindex &&
        leaf_part_rri->ri_IndexRelationDescs == NULL)
        ExecOpenIndices(leaf_part_rri,
                        (node != NULL &&
                         node->onConflictAction != ONCONFLICT_NONE));

    /*
     * Build WITH CHECK OPTION constraints for the partition.  Note that we
     * didn't build the withCheckOptionList for partitions within the
     * planner, but simple translation of varattnos will suffice.  This only
     * occurs for the INSERT case or in the case of UPDATE tuple routing
     * where we didn't find a result rel to reuse in
     * ExecSetupPartitionTupleRouting().
     */
    if (node && node->withCheckOptionLists != NIL)
    {
        List       *wcoList;
        List       *wcoExprs = NIL;
        ListCell   *ll;

        /*
         * In the case of INSERT on a partitioned table, there is only one
         * plan.  Likewise, there is only one WCO list, not one per
         * partition.  For UPDATE, there are as many WCO lists as there are
         * plans.
         */
        Assert((node->operation == CMD_INSERT &&
                list_length(node->withCheckOptionLists) == 1 &&
                list_length(node->plans) == 1) ||
               (node->operation == CMD_UPDATE &&
                list_length(node->withCheckOptionLists) ==
                list_length(node->plans)));

        /*
         * Use the WCO list of the first plan as a reference to calculate
         * attno's for the WCO list of this partition.  In the INSERT case,
         * that refers to the root partitioned table, whereas in the UPDATE
         * tuple routing case, that refers to the first partition in the
         * mtstate->resultRelInfo array.  In any case, both that relation and
         * this partition should have the same columns, so we should be able
         * to map attributes successfully.
         */
        wcoList = linitial(node->withCheckOptionLists);

        /*
         * Convert Vars in it to contain this partition's attribute numbers.
         */
        part_attnos =
            convert_tuples_by_name_map(RelationGetDescr(partrel),
                                       RelationGetDescr(firstResultRel),
                                       gettext_noop("could not convert row type"));
        wcoList = (List *)
            map_variable_attnos((Node *) wcoList,
                                firstVarno, 0,
                                part_attnos,
                                RelationGetDescr(firstResultRel)->natts,
                                RelationGetForm(partrel)->reltype,
                                &found_whole_row);
        /* We ignore the value of found_whole_row. */

        foreach(ll, wcoList)
        {
            WithCheckOption *wco = castNode(WithCheckOption, lfirst(ll));
            ExprState  *wcoExpr = ExecInitQual(castNode(List, wco->qual),
                                               &mtstate->ps);

            wcoExprs = lappend(wcoExprs, wcoExpr);
        }

        leaf_part_rri->ri_WithCheckOptions = wcoList;
        leaf_part_rri->ri_WithCheckOptionExprs = wcoExprs;
    }

    /*
     * Build the RETURNING projection for the partition.  Note that we didn't
     * build the returningList for partitions within the planner, but simple
     * translation of varattnos will suffice.  This only occurs for the
     * INSERT case or in the case of UPDATE tuple routing where we didn't
     * find a result rel to reuse in ExecSetupPartitionTupleRouting().
     */
    if (node && node->returningLists != NIL)
    {
        TupleTableSlot *slot;
        ExprContext *econtext;
        List       *returningList;

        /* See the comment above for WCO lists. */
        Assert((node->operation == CMD_INSERT &&
                list_length(node->returningLists) == 1 &&
                list_length(node->plans) == 1) ||
               (node->operation == CMD_UPDATE &&
                list_length(node->returningLists) ==
                list_length(node->plans)));

        /*
         * Use the RETURNING list of the first plan as a reference to
         * calculate attno's for the RETURNING list of this partition.  See
         * the comment above for WCO lists for more details on why this is
         * okay.
         */
        returningList = linitial(node->returningLists);

        /*
         * Convert Vars in it to contain this partition's attribute numbers.
         */
        if (part_attnos == NULL)
            part_attnos =
                convert_tuples_by_name_map(RelationGetDescr(partrel),
                                           RelationGetDescr(firstResultRel),
                                           gettext_noop("could not convert row type"));
        returningList = (List *)
            map_variable_attnos((Node *) returningList,
                                firstVarno, 0,
                                part_attnos,
                                RelationGetDescr(firstResultRel)->natts,
                                RelationGetForm(partrel)->reltype,
                                &found_whole_row);
        /* We ignore the value of found_whole_row. */

        leaf_part_rri->ri_returningList = returningList;

        /*
         * Initialize the projection itself.
         *
         * Use the slot and the expression context that would have been set
         * up in ExecInitModifyTable() for projection's output.
         */
        Assert(mtstate->ps.ps_ResultTupleSlot != NULL);
        slot = mtstate->ps.ps_ResultTupleSlot;
        Assert(mtstate->ps.ps_ExprContext != NULL);
        econtext = mtstate->ps.ps_ExprContext;
        leaf_part_rri->ri_projectReturning =
            ExecBuildProjectionInfo(returningList, econtext, slot,
                                    &mtstate->ps, RelationGetDescr(partrel));
    }

    /* Set up information needed for routing tuples to the partition. */
    ExecInitRoutingInfo(mtstate, estate, proute, leaf_part_rri, partidx);

    /*
     * If there is an ON CONFLICT clause, initialize state for it.
     */
    if (node && node->onConflictAction != ONCONFLICT_NONE)
    {
        TupleConversionMap *map = proute->parent_child_tupconv_maps[partidx];
        TupleDesc   partrelDesc = RelationGetDescr(partrel);
        ExprContext *econtext = mtstate->ps.ps_ExprContext;
        ListCell   *lc;
        List       *arbiterIndexes = NIL;

        /*
         * If there is a list of arbiter indexes, map it to a list of indexes
         * in the partition.  We do that by scanning the partition's index
         * list and searching for ancestry relationships to each index in the
         * ancestor table.
         */
        if (list_length(rootResultRelInfo->ri_onConflictArbiterIndexes) > 0)
        {
            List       *childIdxs;

            childIdxs = RelationGetIndexList(leaf_part_rri->ri_RelationDesc);

            foreach(lc, childIdxs)
            {
                Oid         childIdx = lfirst_oid(lc);
                List       *ancestors;
                ListCell   *lc2;

                ancestors = get_partition_ancestors(childIdx);
                foreach(lc2, rootResultRelInfo->ri_onConflictArbiterIndexes)
                {
                    if (list_member_oid(ancestors, lfirst_oid(lc2)))
                        arbiterIndexes = lappend_oid(arbiterIndexes, childIdx);
                }
                list_free(ancestors);
            }
        }

        /*
         * If the resulting lists are of unequal length, something is wrong.
         * (This shouldn't happen, since arbiter index selection should not
         * pick up an invalid index.)
         */
        if (list_length(rootResultRelInfo->ri_onConflictArbiterIndexes) !=
            list_length(arbiterIndexes))
            elog(ERROR, "invalid arbiter index list");
        leaf_part_rri->ri_onConflictArbiterIndexes = arbiterIndexes;

        /*
         * In the DO UPDATE case, we have some more state to initialize.
         */
        if (node->onConflictAction == ONCONFLICT_UPDATE)
        {
            Assert(node->onConflictSet != NIL);
            Assert(rootResultRelInfo->ri_onConflict != NULL);

            /*
             * If the partition's tuple descriptor matches exactly the root
             * parent (the common case), we can simply re-use the parent's ON
             * CONFLICT SET state, skipping a bunch of work.  Otherwise, we
             * need to create state specific to this partition.
             */
            if (map == NULL)
                leaf_part_rri->ri_onConflict = rootResultRelInfo->ri_onConflict;
            else
            {
                OnConflictSetState *onconfl = makeNode(OnConflictSetState);
                List       *onconflset;
                bool        found_whole_row;

                leaf_part_rri->ri_onConflict = onconfl;

                /*
                 * Translate expressions in onConflictSet to account for
                 * different attribute numbers.  For that, map partition
                 * varattnos twice: first to catch the EXCLUDED
                 * pseudo-relation (INNER_VAR), and second to handle the main
                 * target relation (firstVarno).
                 */
                onconflset = copyObject(node->onConflictSet);
                if (part_attnos == NULL)
                    part_attnos =
                        convert_tuples_by_name_map(RelationGetDescr(partrel),
                                                   RelationGetDescr(firstResultRel),
                                                   gettext_noop("could not convert row type"));
                onconflset = (List *)
                    map_variable_attnos((Node *) onconflset,
                                        INNER_VAR, 0,
                                        part_attnos,
                                        RelationGetDescr(firstResultRel)->natts,
                                        RelationGetForm(partrel)->reltype,
                                        &found_whole_row);
                /* We ignore the value of found_whole_row. */
                onconflset = (List *)
                    map_variable_attnos((Node *) onconflset,
                                        firstVarno, 0,
                                        part_attnos,
                                        RelationGetDescr(firstResultRel)->natts,
                                        RelationGetForm(partrel)->reltype,
                                        &found_whole_row);
                /* We ignore the value of found_whole_row. */

                /* Finally, reorder the tlist to match the partition. */
                onconflset = adjust_partition_tlist(onconflset, map);

                /*
                 * Build UPDATE SET's projection info.  The user of this
                 * projection is responsible for setting the slot's tupdesc!
                 * We set aside a tupdesc that's good for the common case of
                 * a partition that's tupdesc-equal to the partitioned table;
                 * partitions of different tupdescs must generate their own.
                 */
                ExecSetSlotDescriptor(mtstate->mt_conflproj, partrelDesc);
                onconfl->oc_ProjInfo =
                    ExecBuildProjectionInfoExt(onconflset, econtext,
                                               mtstate->mt_conflproj, false,
                                               &mtstate->ps, partrelDesc);
                onconfl->oc_ProjTupdesc = partrelDesc;

                /*
                 * If there is a WHERE clause, initialize state where it will
                 * be evaluated, mapping the attribute numbers appropriately.
                 * As with onConflictSet, we need to map partition varattnos
                 * to the partition's tupdesc.
                 */
                if (node->onConflictWhere)
                {
                    List       *clause;

                    clause = copyObject((List *) node->onConflictWhere);
                    clause = (List *)
                        map_variable_attnos((Node *) clause,
                                            INNER_VAR, 0,
                                            part_attnos,
                                            RelationGetDescr(firstResultRel)->natts,
                                            RelationGetForm(partrel)->reltype,
                                            &found_whole_row);
                    /* We ignore the value of found_whole_row. */
                    clause = (List *)
                        map_variable_attnos((Node *) clause,
                                            firstVarno, 0,
                                            part_attnos,
                                            RelationGetDescr(firstResultRel)->natts,
                                            RelationGetForm(partrel)->reltype,
                                            &found_whole_row);
                    /* We ignore the value of found_whole_row. */
                    onconfl->oc_WhereClause =
                        ExecInitQual((List *) clause, &mtstate->ps);
                }
            }
        }
    }

    Assert(proute->partitions[partidx] == NULL);
    proute->partitions[partidx] = leaf_part_rri;

    MemoryContextSwitchTo(oldContext);

    return leaf_part_rri;
}

/*
 * ExecInitRoutingInfo
 *      Set up information needed for routing tuples to a leaf partition
 */
void
ExecInitRoutingInfo(ModifyTableState *mtstate,
                    EState *estate,
                    PartitionTupleRouting *proute,
                    ResultRelInfo *partRelInfo,
                    int partidx)
{
    ResultRelInfo *rootRelInfo = partRelInfo->ri_RootResultRelInfo;
    MemoryContext oldContext;

    /*
     * Switch into per-query memory context.
     */
    oldContext = MemoryContextSwitchTo(estate->es_query_cxt);

    /*
     * Set up a tuple conversion map to convert a tuple routed to the
     * partition from the parent's type to the partition's.
     */
    proute->parent_child_tupconv_maps[partidx] =
        convert_tuples_by_name(RelationGetDescr(rootRelInfo->ri_RelationDesc),
                               RelationGetDescr(partRelInfo->ri_RelationDesc),
                               gettext_noop("could not convert row type"));

    /*
     * If the partition is a foreign table, let the FDW init itself for
     * routing tuples to the partition.
     */
    if (partRelInfo->ri_FdwRoutine != NULL &&
        partRelInfo->ri_FdwRoutine->BeginForeignInsert != NULL)
        partRelInfo->ri_FdwRoutine->BeginForeignInsert(mtstate, partRelInfo);

    MemoryContextSwitchTo(oldContext);

    partRelInfo->ri_PartitionReadyForRouting = true;
}

/*
 * ExecSetupChildParentMapForLeaf -- Initialize the per-leaf-partition
 * child-to-root tuple conversion map array.
 *
 * This map is required for capturing transition tuples when the target table
 * is a partitioned table.  For a tuple that is routed by an INSERT or UPDATE,
 * we need to convert it from the leaf partition to the target table
 * descriptor.
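 * This arises, for example, when the statement fires AFTER triggers with
 * transition tables (REFERENCING OLD/NEW TABLE ...): the transition tuples
 * captured from each leaf partition must be presented in the target table's
 * rowtype.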
 */
void
ExecSetupChildParentMapForLeaf(PartitionTupleRouting *proute)
{
    Assert(proute != NULL);

    /*
     * These array elements get filled up with maps on an on-demand basis.
     * Initially just set all of them to NULL.
     */
    proute->child_parent_tupconv_maps =
        (TupleConversionMap **) palloc0(sizeof(TupleConversionMap *) *
                                        proute->num_partitions);

    /* Same is the case for this array; all the values are set to false. */
    proute->child_parent_map_not_required =
        (bool *) palloc0(sizeof(bool) * proute->num_partitions);
}

/*
 * TupConvMapForLeaf -- Get the tuple conversion map for a given leaf
 * partition index.
 */
TupleConversionMap *
TupConvMapForLeaf(PartitionTupleRouting *proute,
                  ResultRelInfo *rootRelInfo, int leaf_index)
{
    ResultRelInfo **resultRelInfos = proute->partitions;
    TupleConversionMap **map;
    TupleDesc   tupdesc;

    /* Don't call this if we're not supposed to be using this type of map. */
    Assert(proute->child_parent_tupconv_maps != NULL);

    /* If it's already known that we don't need a map, return NULL. */
    if (proute->child_parent_map_not_required[leaf_index])
        return NULL;

    /* If we've already got a map, return it. */
    map = &proute->child_parent_tupconv_maps[leaf_index];
    if (*map != NULL)
        return *map;

    /* No map yet; try to create one. */
    tupdesc = RelationGetDescr(resultRelInfos[leaf_index]->ri_RelationDesc);
    *map =
        convert_tuples_by_name(tupdesc,
                               RelationGetDescr(rootRelInfo->ri_RelationDesc),
                               gettext_noop("could not convert row type"));

    /* If it turns out no map is needed, remember for next time. */
    proute->child_parent_map_not_required[leaf_index] = (*map == NULL);

    return *map;
}

/*
 * ConvertPartitionTupleSlot -- convenience function for tuple conversion.
 *      The tuple, if converted, is stored in new_slot, and *p_my_slot is
 *      updated to point to it.  new_slot typically should be one of the
 *      dedicated partition tuple slots.  If map is NULL, *p_my_slot is not
 *      changed.
 *
 * Returns the converted tuple, unless map is NULL, in which case the
 * original tuple is returned unmodified.
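 *
 * For example (a sketch, assuming a PartitionTupleRouting named "proute"
 * and a leaf partition index "partidx"), a caller might write:
 *
 *      tuple = ConvertPartitionTupleSlot(
 *                  proute->parent_child_tupconv_maps[partidx],
 *                  tuple, proute->partition_tuple_slot, &slot);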
 */
HeapTuple
ConvertPartitionTupleSlot(TupleConversionMap *map,
                          HeapTuple tuple,
                          TupleTableSlot *new_slot,
                          TupleTableSlot **p_my_slot)
{
    if (!map)
        return tuple;

    tuple = do_convert_tuple(tuple, map);

    /*
     * Change the partition tuple slot descriptor, as per converted tuple.
     */
    *p_my_slot = new_slot;
    Assert(new_slot != NULL);
    ExecSetSlotDescriptor(new_slot, map->outdesc);
    ExecStoreTuple(tuple, new_slot, InvalidBuffer, true);

    return tuple;
}

/*
 * ExecCleanupTupleRouting -- Clean up objects allocated for partition tuple
 * routing.
 *
 * Close all the partitioned tables, leaf partitions, and their indices.
 */
void
ExecCleanupTupleRouting(ModifyTableState *mtstate,
                        PartitionTupleRouting *proute)
{
    int         i;
    int         subplan_index = 0;

    /*
     * Remember, proute->partition_dispatch_info[0] corresponds to the root
     * partitioned table, which we must not try to close, because it is the
     * main target table of the query that will be closed by callers such as
     * ExecEndPlan() or DoCopy().  Also, tupslot is NULL for the root
     * partitioned table.
     */
    for (i = 1; i < proute->num_dispatch; i++)
    {
        PartitionDispatch pd = proute->partition_dispatch_info[i];

        heap_close(pd->reldesc, NoLock);
        ExecDropSingleTupleTableSlot(pd->tupslot);
    }

    for (i = 0; i < proute->num_partitions; i++)
    {
        ResultRelInfo *resultRelInfo = proute->partitions[i];

        /* skip further processing for uninitialized partitions */
        if (resultRelInfo == NULL)
            continue;

        /* Allow any FDWs to shut down if they've been exercised */
        if (resultRelInfo->ri_PartitionReadyForRouting &&
            resultRelInfo->ri_FdwRoutine != NULL &&
            resultRelInfo->ri_FdwRoutine->EndForeignInsert != NULL)
            resultRelInfo->ri_FdwRoutine->EndForeignInsert(mtstate->ps.state,
                                                           resultRelInfo);

        /*
         * If this result rel is one of the UPDATE subplan result rels, let
         * ExecEndPlan() close it.  For INSERT or COPY,
         * proute->subplan_partition_offsets will always be NULL.  Note that
         * the subplan_partition_offsets array and the partitions array have
         * the partitions in the same order.  So, while we iterate over the
         * partitions array, we also iterate over the
         * subplan_partition_offsets array in order to figure out which of
         * the result rels are present in the UPDATE subplans.
         */
        if (proute->subplan_partition_offsets &&
            subplan_index < proute->num_subplan_partition_offsets &&
            proute->subplan_partition_offsets[subplan_index] == i)
        {
            subplan_index++;
            continue;
        }

        ExecCloseIndices(resultRelInfo);
        heap_close(resultRelInfo->ri_RelationDesc, NoLock);
    }

    /* Release the standalone partition tuple descriptors, if any */
    if (proute->root_tuple_slot)
        ExecDropSingleTupleTableSlot(proute->root_tuple_slot);
    if (proute->partition_tuple_slot)
        ExecDropSingleTupleTableSlot(proute->partition_tuple_slot);
}

/*
 * RelationGetPartitionDispatchInfo
 *      Returns information necessary to route tuples down a partition tree
 *
 * The number of elements in the returned array (that is, the number of
 * PartitionDispatch objects for the partitioned tables in the partition
 * tree) is returned in *num_parted and a list of the OIDs of all the leaf
 * partitions of rel is returned in *leaf_part_oids.
 *
 * All the relations in the partition tree (including 'rel') must have been
 * locked (using at least the AccessShareLock) by the caller.
 */
static PartitionDispatch *
RelationGetPartitionDispatchInfo(Relation rel,
                                 int *num_parted, List **leaf_part_oids)
{
    List       *pdlist = NIL;
    PartitionDispatchData **pd;
    ListCell   *lc;
    int         i;

    Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);

    *num_parted = 0;
    *leaf_part_oids = NIL;

    get_partition_dispatch_recurse(rel, NULL, &pdlist, leaf_part_oids);
    *num_parted = list_length(pdlist);
    pd = (PartitionDispatchData **) palloc(*num_parted *
                                           sizeof(PartitionDispatchData *));
    i = 0;
    foreach(lc, pdlist)
    {
        pd[i++] = lfirst(lc);
    }

    return pd;
}

/*
 * get_partition_dispatch_recurse
 *      Recursively expand partition tree rooted at rel
 *
 * As the partition tree is expanded in a depth-first manner, we maintain two
 * global lists: of PartitionDispatch objects corresponding to partitioned
 * tables in *pds and of the leaf partition OIDs in *leaf_part_oids.
 *
 * Note that the order of OIDs of leaf partitions in leaf_part_oids matches
 * the order in which the planner's expand_partitioned_rtentry() processes
 * them.  It's not necessarily the case that the offsets match up exactly,
 * because constraint exclusion might prune away some partitions on the
 * planner side, whereas we'll always have the complete list; but unpruned
 * partitions will appear in the same order in the plan as they are returned
 * here.
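 *
 * For example, given a root table R with partitions A (a leaf), B (itself
 * partitioned into leaves B1 and B2) and C (a leaf), the depth-first walk
 * produces *pds = (R, B) and *leaf_part_oids = (A, B1, B2, C).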
 */
static void
get_partition_dispatch_recurse(Relation rel, Relation parent,
                               List **pds, List **leaf_part_oids)
{
    TupleDesc   tupdesc = RelationGetDescr(rel);
    PartitionDesc partdesc = RelationGetPartitionDesc(rel);
    PartitionKey partkey = RelationGetPartitionKey(rel);
    PartitionDispatch pd;
    int         i;

    check_stack_depth();

    /* Build a PartitionDispatch for this table and add it to *pds. */
    pd = (PartitionDispatch) palloc(sizeof(PartitionDispatchData));
    *pds = lappend(*pds, pd);
    pd->reldesc = rel;
    pd->key = partkey;
    pd->keystate = NIL;
    pd->partdesc = partdesc;
    if (parent != NULL)
    {
        /*
         * For every partitioned table other than the root, we must store a
         * tuple table slot initialized with its tuple descriptor and a tuple
         * conversion map to convert a tuple from its parent's rowtype to its
         * own.  That is to make sure that we are looking at the correct row
         * using the correct tuple descriptor when computing its partition
         * key for tuple routing.
         */
        pd->tupslot = MakeSingleTupleTableSlot(tupdesc);
        pd->tupmap = convert_tuples_by_name(RelationGetDescr(parent),
                                            tupdesc,
                                            gettext_noop("could not convert row type"));
    }
    else
    {
        /* Not required for the root partitioned table */
        pd->tupslot = NULL;
        pd->tupmap = NULL;
    }

    /*
     * Go look at each partition of this table.  If it's a leaf partition,
     * simply add its OID to *leaf_part_oids.  If it's a partitioned table,
     * recursively call get_partition_dispatch_recurse(), so that its
     * partitions are processed as well and a corresponding PartitionDispatch
     * object gets added to *pds.
     *
     * The 'indexes' array is used when searching for a partition matching a
     * given tuple.  The actual value we store here depends on whether the
     * array element belongs to a leaf partition or a subpartitioned table.
     * For leaf partitions we store the index into *leaf_part_oids, and for
     * sub-partitioned tables we store a negative version of the index into
     * the *pds list.  Both indexes are 0-based, but the first element of the
     * *pds list is the root partition, so 0 always means the first leaf.
     * When searching, if we see a negative value, the search must continue
     * in the corresponding sub-partition; otherwise, we've identified the
     * correct partition.
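     *
     * Continuing the example in the header comment above (root R with
     * partitions A, B(B1, B2) and C), R's indexes array ends up as
     * {0, -1, 3}: A is leaf 0, B is the PartitionDispatch at position 1 in
     * *pds, and C becomes leaf 3 once B1 and B2 have been emitted; B's own
     * indexes array is {1, 2}.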
     */
    pd->indexes = (int *) palloc(partdesc->nparts * sizeof(int));
    for (i = 0; i < partdesc->nparts; i++)
    {
        Oid         partrelid = partdesc->oids[i];

        if (get_rel_relkind(partrelid) != RELKIND_PARTITIONED_TABLE)
        {
            *leaf_part_oids = lappend_oid(*leaf_part_oids, partrelid);
            pd->indexes[i] = list_length(*leaf_part_oids) - 1;
        }
        else
        {
            /*
             * We assume all tables in the partition tree were already locked
             * by the caller.
             */
            Relation    partrel = heap_open(partrelid, NoLock);

            pd->indexes[i] = -list_length(*pds);
            get_partition_dispatch_recurse(partrel, rel, pds, leaf_part_oids);
        }
    }
}

/* ----------------
 * FormPartitionKeyDatum
 *      Construct values[] and isnull[] arrays for the partition key
 *      of a tuple.
 *
 *  pd      Partition dispatch object of the partitioned table
 *  slot    Heap tuple from which to extract partition key
 *  estate  executor state for evaluating any partition key
 *          expressions (must be non-NULL)
 *  values  Array of partition key Datums (output area)
 *  isnull  Array of is-null indicators (output area)
 *
 * the ecxt_scantuple slot of estate's per-tuple expr context must point to
 * the heap tuple passed in.
 * ----------------
 */
static void
FormPartitionKeyDatum(PartitionDispatch pd,
                      TupleTableSlot *slot,
                      EState *estate,
                      Datum *values,
                      bool *isnull)
{
    ListCell   *partexpr_item;
    int         i;

    if (pd->key->partexprs != NIL && pd->keystate == NIL)
    {
        /* Check caller has set up context correctly */
        Assert(estate != NULL &&
               GetPerTupleExprContext(estate)->ecxt_scantuple == slot);

        /* First time through, set up expression evaluation state */
        pd->keystate = ExecPrepareExprList(pd->key->partexprs, estate);
    }

    partexpr_item = list_head(pd->keystate);
    for (i = 0; i < pd->key->partnatts; i++)
    {
        AttrNumber  keycol = pd->key->partattrs[i];
        Datum       datum;
        bool        isNull;

        if (keycol != 0)
        {
            /* Plain column; get the value directly from the heap tuple */
            datum = slot_getattr(slot, keycol, &isNull);
        }
        else
        {
            /* Expression; need to evaluate it */
            if (partexpr_item == NULL)
                elog(ERROR, "wrong number of partition key expressions");
            datum = ExecEvalExprSwitchContext((ExprState *) lfirst(partexpr_item),
                                              GetPerTupleExprContext(estate),
                                              &isNull);
            partexpr_item = lnext(partexpr_item);
        }
        values[i] = datum;
        isnull[i] = isNull;
    }

    if (partexpr_item != NULL)
        elog(ERROR, "wrong number of partition key expressions");
}

/*
 * get_partition_for_tuple
 *      Finds partition of relation which accepts the partition key specified
 *      in values and isnull
 *
 * Return value is index of the partition (>= 0 and < partdesc->nparts) if
 * one found or -1 if none found.
 */
static int
get_partition_for_tuple(Relation relation, Datum *values, bool *isnull)
{
    int         bound_offset;
    int         part_index = -1;
    PartitionKey key = RelationGetPartitionKey(relation);
    PartitionDesc partdesc = RelationGetPartitionDesc(relation);
    PartitionBoundInfo boundinfo = partdesc->boundinfo;

    /* Route as appropriate based on partitioning strategy. */
    switch (key->strategy)
    {
        case PARTITION_STRATEGY_HASH:
            {
                uint64      rowHash;

                rowHash = compute_partition_hash_value(key->partnatts,
                                                       key->partsupfunc,
                                                       values, isnull);

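                /*
                 * boundinfo->indexes has one element per hash remainder
                 * (nindexes is the greatest modulus among the hash
                 * partitions), so a partition defined with a smaller modulus
                 * simply occupies several positions in the array.
                 */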
                part_index = boundinfo->indexes[rowHash % boundinfo->nindexes];
            }
            break;

        case PARTITION_STRATEGY_LIST:
            if (isnull[0])
            {
                if (partition_bound_accepts_nulls(boundinfo))
                    part_index = boundinfo->null_index;
            }
            else
            {
                bool        equal = false;

                bound_offset = partition_list_bsearch(key->partsupfunc,
                                                      key->partcollation,
                                                      boundinfo,
                                                      values[0], &equal);
                if (bound_offset >= 0 && equal)
                    part_index = boundinfo->indexes[bound_offset];
            }
            break;

        case PARTITION_STRATEGY_RANGE:
            {
                bool        equal = false,
                            range_partkey_has_null = false;
                int         i;

                /*
                 * No range includes NULL, so this will be accepted by the
                 * default partition if there is one, and otherwise rejected.
                 */
                for (i = 0; i < key->partnatts; i++)
                {
                    if (isnull[i])
                    {
                        range_partkey_has_null = true;
                        break;
                    }
                }

                if (!range_partkey_has_null)
                {
                    bound_offset = partition_range_datum_bsearch(key->partsupfunc,
                                                                 key->partcollation,
                                                                 boundinfo,
                                                                 key->partnatts,
                                                                 values,
                                                                 &equal);

                    /*
                     * The bound at bound_offset is less than or equal to the
                     * tuple value, so the bound at offset+1 is the upper
                     * bound of the partition we're looking for, if there
                     * actually exists one.
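                     *
                     * For example, with range bounds (0), (10) and (20) and
                     * a tuple value of 15, the bsearch returns the offset of
                     * the bound (10), and indexes[bound_offset + 1] is the
                     * partition covering values from 10 up to, but not
                     * including, 20 (or -1 if no partition was defined for
                     * that range).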
                     */
                    part_index = boundinfo->indexes[bound_offset + 1];
                }
            }
            break;

        default:
            elog(ERROR, "unexpected partition strategy: %d",
                 (int) key->strategy);
    }

    /*
     * part_index < 0 means we failed to find a partition of this parent.
     * Use the default partition, if there is one.
     */
    if (part_index < 0)
        part_index = boundinfo->default_index;

    return part_index;
}

/*
 * ExecBuildSlotPartitionKeyDescription
 *
 * This works very much like BuildIndexValueDescription() and is currently
 * used for building error messages when ExecFindPartition() fails to find
 * a partition for a row.
 */
static char *
ExecBuildSlotPartitionKeyDescription(Relation rel,
                                     Datum *values,
                                     bool *isnull,
                                     int maxfieldlen)
{
    StringInfoData buf;
    PartitionKey key = RelationGetPartitionKey(rel);
    int         partnatts = get_partition_natts(key);
    int         i;
    Oid         relid = RelationGetRelid(rel);
    AclResult   aclresult;

    if (check_enable_rls(relid, InvalidOid, true) == RLS_ENABLED)
        return NULL;

    /* If the user has table-level access, just go build the description. */
    aclresult = pg_class_aclcheck(relid, GetUserId(), ACL_SELECT);
    if (aclresult != ACLCHECK_OK)
    {
        /*
         * Step through the columns of the partition key and make sure the
         * user has SELECT rights on all of them.
         */
        for (i = 0; i < partnatts; i++)
        {
            AttrNumber  attnum = get_partition_col_attnum(key, i);

            /*
             * If this partition key column is an expression, we return no
             * detail rather than try to figure out what column(s) the
             * expression includes and if the user has SELECT rights on them.
             */
            if (attnum == InvalidAttrNumber ||
                pg_attribute_aclcheck(relid, attnum, GetUserId(),
                                      ACL_SELECT) != ACLCHECK_OK)
                return NULL;
        }
    }

    initStringInfo(&buf);
    appendStringInfo(&buf, "(%s) = (",
                     pg_get_partkeydef_columns(relid, true));

    for (i = 0; i < partnatts; i++)
    {
        char       *val;
        int         vallen;

        if (isnull[i])
            val = "null";
        else
        {
            Oid         foutoid;
            bool        typisvarlena;

            getTypeOutputInfo(get_partition_col_typid(key, i),
                              &foutoid, &typisvarlena);
            val = OidOutputFunctionCall(foutoid, values[i]);
        }

        if (i > 0)
            appendStringInfoString(&buf, ", ");

        /* truncate if needed */
        vallen = strlen(val);
        if (vallen <= maxfieldlen)
            appendStringInfoString(&buf, val);
        else
        {
            vallen = pg_mbcliplen(val, vallen, maxfieldlen);
            appendBinaryStringInfo(&buf, val, vallen);
            appendStringInfoString(&buf, "...");
        }
    }

    appendStringInfoChar(&buf, ')');

    return buf.data;
}

/*
 * adjust_partition_tlist
 *      Re-order the targetlist entries for a given partition to account for
 *      column position differences between the parent and the partition.
 *
 * The expressions have already been fixed, but we must now re-order the
 * entries in case the partition has a different column order, and possibly
 * add or remove dummy entries for dropped columns.
 *
 * Although a new List is returned, this function feels free to scribble on
 * the resno fields of the given tlist, so that should be a working copy.
 */
static List *
adjust_partition_tlist(List *tlist, TupleConversionMap *map)
{
    List       *new_tlist = NIL;
    TupleDesc   tupdesc = map->outdesc;
    AttrNumber *attrMap = map->attrMap;
    AttrNumber  attrno;
    ListCell   *lc;

    for (attrno = 1; attrno <= tupdesc->natts; attrno++)
    {
        Form_pg_attribute att_tup = TupleDescAttr(tupdesc, attrno - 1);
        AttrNumber  parentattrno = attrMap[attrno - 1];
        TargetEntry *tle;

        if (parentattrno != InvalidAttrNumber)
        {
            /*
             * Use the corresponding entry from the parent's tlist, adjusting
             * the resno to match the partition's attno.
             */
            Assert(!att_tup->attisdropped);
            tle = (TargetEntry *) list_nth(tlist, parentattrno - 1);
            Assert(!tle->resjunk);
            Assert(tle->resno == parentattrno);
            tle->resno = attrno;
        }
        else
        {
            /*
             * For a dropped attribute in the partition, generate a dummy
             * entry with resno matching the partition's attno.  This should
             * match what expand_targetlist() does.
             */
            Const      *expr;

            Assert(att_tup->attisdropped);
            expr = makeConst(INT4OID,
                             -1,
                             InvalidOid,
                             sizeof(int32),
                             (Datum) 0,
                             true,  /* isnull */
                             true /* byval */ );
            tle = makeTargetEntry((Expr *) expr,
                                  attrno,
                                  pstrdup(NameStr(att_tup->attname)),
                                  false);
        }

        new_tlist = lappend(new_tlist, tle);
    }

    /* Finally, attach any resjunk entries to the end of the new tlist */
    foreach(lc, tlist)
    {
        TargetEntry *tle = (TargetEntry *) lfirst(lc);

        if (tle->resjunk)
        {
            tle->resno = list_length(new_tlist) + 1;
            new_tlist = lappend(new_tlist, tle);
        }
    }

    return new_tlist;
}

/*-------------------------------------------------------------------------
 * Run-Time Partition Pruning Support.
 *
 * The following series of functions exist to support the removal of unneeded
 * subplans for queries against partitioned tables.  The supporting functions
 * here are designed to work with any plan type which supports an arbitrary
 * number of subplans, e.g. Append, MergeAppend.
 *
 * When pruning involves comparison of a partition key to a constant, it's
 * done by the planner.  However, if we have a comparison to a non-constant
 * but not volatile expression, that presents an opportunity for run-time
 * pruning by the executor, allowing irrelevant partitions to be skipped
 * dynamically.
 *
 * We must distinguish expressions containing PARAM_EXEC Params from
 * expressions that don't contain those.  Even though a PARAM_EXEC Param is
 * considered to be a stable expression, it can change value from one plan
 * node scan to the next during query execution.  Stable comparison
 * expressions that don't involve such Params allow partition pruning to be
 * done once during executor startup.  Expressions that do involve such
 * Params require us to prune separately for each scan of the parent plan
 * node.
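 *
 * For example, in a generic plan for
 *      SELECT ... FROM parted_table WHERE partkey = $1
 * the external Param's value is known before execution begins, so pruning
 * can be performed once during executor startup.  But if parted_table is
 * scanned on the inner side of a nestloop join whose comparison value is
 * supplied by the outer side, that value arrives through a PARAM_EXEC Param
 * and can change from one scan to the next, so pruning must be repeated at
 * every rescan.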
 *
 * Note that pruning away unneeded subplans during executor startup has the
 * added benefit of not having to initialize the unneeded subplans at all.
 *
 *
 * Functions:
 *
 * ExecCreatePartitionPruneState:
 *      Creates the PartitionPruneState required by each of the two pruning
 *      functions.  Details stored include how to map the partition index
 *      returned by the partition pruning code into subplan indexes.
 *
 * ExecDestroyPartitionPruneState:
 *      Deletes a PartitionPruneState.  Must be called during executor
 *      shutdown.
 *
 * ExecFindInitialMatchingSubPlans:
 *      Returns indexes of matching subplans.  Partition pruning is attempted
 *      without any evaluation of expressions containing PARAM_EXEC Params.
 *      This function must be called during executor startup for the parent
 *      plan before the subplans themselves are initialized.  Subplans which
 *      are found not to match by this function must be removed from the
 *      plan's list of subplans during execution, as this function performs a
 *      remap of the partition index to subplan index map and the newly
 *      created map provides indexes only for subplans which remain after
 *      calling this function.
 *
 * ExecFindMatchingSubPlans:
 *      Returns indexes of matching subplans after evaluating all available
 *      expressions.  This function can only be called during execution and
 *      must be called again each time the value of a Param listed in
 *      PartitionPruneState's 'execparamids' changes.
 *-------------------------------------------------------------------------
 */

/*
 * ExecCreatePartitionPruneState
 *      Build the data structure required for calling
 *      ExecFindInitialMatchingSubPlans and ExecFindMatchingSubPlans.
 *
 * 'planstate' is the parent plan node's execution state.
 *
 * 'partitionpruneinfo' is a PartitionPruneInfo as generated by
 * make_partition_pruneinfo.  Here we build a PartitionPruneState containing
 * a PartitionPruningData for each partitioning hierarchy (i.e., each sublist
 * of partitionpruneinfo->prune_infos), each of which contains a
 * PartitionedRelPruningData for each PartitionedRelPruneInfo appearing in
 * that sublist.  This two-level system is needed to keep from confusing the
 * different hierarchies when a UNION ALL contains multiple partitioned
 * tables as children.  The data stored in each PartitionedRelPruningData can
 * be re-used each time we re-evaluate which partitions match the pruning
 * steps provided in each PartitionedRelPruneInfo.
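 * (For example, a UNION ALL over two separately partitioned tables yields
 * two PartitionPruningData structs here, one per partitioning hierarchy.)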
1484 */
1485 PartitionPruneState *
1486 ExecCreatePartitionPruneState(PlanState *planstate,
1487 PartitionPruneInfo *partitionpruneinfo)
1488 {
1489 PartitionPruneState *prunestate;
1490 int n_part_hierarchies;
1491 ListCell *lc;
1492 int i;
1493
1494 n_part_hierarchies = list_length(partitionpruneinfo->prune_infos);
1495 Assert(n_part_hierarchies > 0);
1496
1497 /*
1498 * Allocate the data structure
1499 */
1500 prunestate = (PartitionPruneState *)
1501 palloc(offsetof(PartitionPruneState, partprunedata) +
1502 sizeof(PartitionPruningData *) * n_part_hierarchies);
1503
1504 prunestate->execparamids = NULL;
1505 /* other_subplans can change at runtime, so we need our own copy */
1506 prunestate->other_subplans = bms_copy(partitionpruneinfo->other_subplans);
1507 prunestate->do_initial_prune = false; /* may be set below */
1508 prunestate->do_exec_prune = false; /* may be set below */
1509 prunestate->num_partprunedata = n_part_hierarchies;
1510
1511 /*
1512 * Create a short-term memory context which we'll use when making calls to
1513 * the partition pruning functions. This avoids possible memory leaks,
1514 * since the pruning functions call comparison functions that aren't under
1515 * our control.
1516 */
1517 prunestate->prune_context =
1518 AllocSetContextCreate(CurrentMemoryContext,
1519 "Partition Prune",
1520 ALLOCSET_DEFAULT_SIZES);
1521
1522 i = 0;
1523 foreach(lc, partitionpruneinfo->prune_infos)
1524 {
1525 List *partrelpruneinfos = lfirst_node(List, lc);
1526 int npartrelpruneinfos = list_length(partrelpruneinfos);
1527 PartitionPruningData *prunedata;
1528 ListCell *lc2;
1529 int j;
1530
1531 prunedata = (PartitionPruningData *)
1532 palloc(offsetof(PartitionPruningData, partrelprunedata) +
1533 npartrelpruneinfos * sizeof(PartitionedRelPruningData));
1534 prunestate->partprunedata[i] = prunedata;
1535 prunedata->num_partrelprunedata = npartrelpruneinfos;
1536
1537 j = 0;
1538 foreach(lc2, partrelpruneinfos)
1539 {
1540 PartitionedRelPruneInfo *pinfo = lfirst_node(PartitionedRelPruneInfo, lc2);
1541 PartitionedRelPruningData *pprune = &prunedata->partrelprunedata[j];
1542
1543 /*
1544 * We must copy the subplan_map rather than pointing directly to
1545 * the plan's version, as we may end up making modifications to it
1546 * later.
1547 */
1548 pprune->nparts = pinfo->nparts;
1549 pprune->subplan_map = palloc(sizeof(int) * pinfo->nparts);
1550 memcpy(pprune->subplan_map, pinfo->subplan_map,
1551 sizeof(int) * pinfo->nparts);
1552
1553 /* We can use the subpart_map verbatim, since we never modify it */
1554 pprune->subpart_map = pinfo->subpart_map;
1555
1556 /* present_parts is also subject to later modification */
1557 pprune->present_parts = bms_copy(pinfo->present_parts);
1558
1559 /*
1560 * Initialize pruning contexts as needed.
1561 */
1562 pprune->initial_pruning_steps = pinfo->initial_pruning_steps;
1563 if (pinfo->initial_pruning_steps)
1564 {
1565 ExecInitPruningContext(&pprune->initial_context,
1566 pinfo->reloid,
1567 pinfo->initial_pruning_steps,
1568 planstate);
1569 /* Record whether initial pruning is needed at any level */
1570 prunestate->do_initial_prune = true;
1571 }
1572 pprune->exec_pruning_steps = pinfo->exec_pruning_steps;
1573 if (pinfo->exec_pruning_steps)
1574 {
1575 ExecInitPruningContext(&pprune->exec_context,
1576 pinfo->reloid,
1577 pinfo->exec_pruning_steps,
1578 planstate);
1579 /* Record whether exec pruning is needed at any level */
1580 prunestate->do_exec_prune = true;
1581 }
1582
1583 /*
1584 * Accumulate the IDs of all PARAM_EXEC Params affecting the
1585 * partitioning decisions at this plan node.
1586 */
1587 prunestate->execparamids = bms_add_members(prunestate->execparamids,
1588 pinfo->execparamids);
1589
1590 j++;
1591 }
1592 i++;
1593 }
1594
1595 return prunestate;
1596 }
1597
1598 /*
1599 * ExecDestroyPartitionPruneState
1600 * Release resources at plan shutdown.
1601 *
1602 * We don't bother to free any memory here, since the whole executor context
1603 * will be going away shortly. We do need to release our relcache pins.
1604 */
void
ExecDestroyPartitionPruneState(PartitionPruneState *prunestate)
{
    PartitionPruningData **partprunedata = prunestate->partprunedata;
    int         i;

    for (i = 0; i < prunestate->num_partprunedata; i++)
    {
        PartitionPruningData *prunedata = partprunedata[i];
        PartitionedRelPruningData *pprune = prunedata->partrelprunedata;
        int         j;

        for (j = 0; j < prunedata->num_partrelprunedata; j++)
        {
            if (pprune[j].initial_pruning_steps)
                relation_close(pprune[j].initial_context.partrel, NoLock);
            if (pprune[j].exec_pruning_steps)
                relation_close(pprune[j].exec_context.partrel, NoLock);
        }
    }
}

/*
 * Initialize a PartitionPruneContext for the given list of pruning steps.
 */
static void
ExecInitPruningContext(PartitionPruneContext *context,
                       Oid reloid,
                       List *pruning_steps,
                       PlanState *planstate)
{
    PartitionKey partkey;
    PartitionDesc partdesc;
    int         n_steps;
    int         partnatts;
    ListCell   *lc;

    /*
     * We need to hold a pin on the partitioned table's relcache entry so
     * that we can rely on its copies of the table's partition key and
     * partition descriptor.  We need not get a lock though; one should have
     * been acquired already by InitPlan or ExecLockNonLeafAppendTables.
     */
    context->partrel = relation_open(reloid, NoLock);

    partkey = RelationGetPartitionKey(context->partrel);
    partdesc = RelationGetPartitionDesc(context->partrel);

    n_steps = list_length(pruning_steps);

    context->strategy = partkey->strategy;
    context->partnatts = partnatts = partkey->partnatts;
    context->nparts = partdesc->nparts;
    context->boundinfo = partdesc->boundinfo;
    context->partcollation = partkey->partcollation;
    context->partsupfunc = partkey->partsupfunc;

    /* We'll look up type-specific support functions as needed */
    context->stepcmpfuncs = (FmgrInfo *)
        palloc0(sizeof(FmgrInfo) * n_steps * partnatts);

    context->ppccontext = CurrentMemoryContext;
    context->planstate = planstate;

    /* Initialize expression state for each expression we need */
    context->exprstates = (ExprState **)
        palloc0(sizeof(ExprState *) * n_steps * partnatts);
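    /*
     * Layout note with a worked example: stepcmpfuncs and exprstates are
     * flat arrays addressed as n_steps-by-partnatts matrices.  Assuming
     * PruneCxtStateIdx(partnatts, step_id, keyno) expands to
     * (partnatts * step_id + keyno), then with partnatts = 2 the entry for
     * step_id = 3, keyno = 1 lives at index 3 * 2 + 1 = 7.
     */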
    foreach(lc, pruning_steps)
    {
        PartitionPruneStepOp *step = (PartitionPruneStepOp *) lfirst(lc);
        ListCell   *lc2;
        int         keyno;

        /* not needed for other step kinds */
        if (!IsA(step, PartitionPruneStepOp))
            continue;

        Assert(list_length(step->exprs) <= partnatts);

        keyno = 0;
        foreach(lc2, step->exprs)
        {
            Expr       *expr = (Expr *) lfirst(lc2);

            /* not needed for Consts */
            if (!IsA(expr, Const))
            {
                int         stateidx = PruneCxtStateIdx(partnatts,
                                                        step->step.step_id,
                                                        keyno);

                context->exprstates[stateidx] =
                    ExecInitExpr(expr, context->planstate);
            }
            keyno++;
        }
    }
}

/*
 * ExecFindInitialMatchingSubPlans
 *      Identify the set of subplans that cannot be eliminated by initial
 *      pruning (disregarding any pruning constraints involving PARAM_EXEC
 *      Params).  Also update the translation maps that convert partition
 *      indexes into subplan indexes, to account for the unneeded subplans
 *      having been removed.
 *
 * Must only be called once per 'prunestate', and only if initial pruning
 * is required.
 *
 * 'nsubplans' must be passed as the total number of unpruned subplans.
 */
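/*
 * A hedged usage sketch (field and variable names are borrowed from the
 * Append node for illustration; the actual call site may differ):
 *
 *      prunestate = ExecCreatePartitionPruneState(&appendstate->ps,
 *                                                 node->part_prune_infos);
 *      if (prunestate->do_initial_prune)
 *          validsubplans = ExecFindInitialMatchingSubPlans(prunestate,
 *                                          list_length(node->appendplans));
 */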
Bitmapset *
ExecFindInitialMatchingSubPlans(PartitionPruneState *prunestate, int nsubplans)
{
    Bitmapset  *result = NULL;
    MemoryContext oldcontext;
    int         i;

    Assert(prunestate->do_initial_prune);

    /*
     * Switch to a temp context to avoid leaking memory in the executor's
     * memory context.
     */
    oldcontext = MemoryContextSwitchTo(prunestate->prune_context);

    /*
     * For each hierarchy, do the pruning tests, and add surviving subplans'
     * indexes to "result".
     */
    for (i = 0; i < prunestate->num_partprunedata; i++)
    {
        PartitionPruningData *prunedata;
        PartitionedRelPruningData *pprune;

        prunedata = prunestate->partprunedata[i];
        pprune = &prunedata->partrelprunedata[0];

        /* Perform pruning without using PARAM_EXEC Params */
        find_matching_subplans_recurse(prunedata, pprune, true, &result);

        /* Expression eval may have used space in node's ps_ExprContext too */
        if (pprune->initial_pruning_steps)
            ResetExprContext(pprune->initial_context.planstate->ps_ExprContext);
    }

    MemoryContextSwitchTo(oldcontext);

    /* Copy result out of the temp context before we reset it */
    result = bms_copy(result);

    /* Add in any subplans that partition pruning didn't account for */
    result = bms_add_members(result, prunestate->other_subplans);

    MemoryContextReset(prunestate->prune_context);

    /*
     * If any subplans were pruned, we must re-sequence the subplan indexes so
     * that ExecFindMatchingSubPlans properly returns the indexes from the
     * subplans which will remain after execution of this function.
     */
    if (bms_num_members(result) < nsubplans)
    {
        int        *new_subplan_indexes;
        Bitmapset  *new_other_subplans;
        int         i;
        int         newidx;

        /*
         * First we must build a temporary array which maps old subplan
         * indexes to new ones.  While we're at it, also recompute the
         * other_subplans set, since indexes in it may change.
         */
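        /*
         * Worked example: with nsubplans = 4 and "result" = {0, 2, 3}, the
         * loop below yields new_subplan_indexes = [0, -1, 1, 2]; subplan 1
         * was pruned, and subplans 2 and 3 each shift down one slot.
         */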
        new_subplan_indexes = (int *) palloc(sizeof(int) * nsubplans);
        new_other_subplans = NULL;
        newidx = 0;
        for (i = 0; i < nsubplans; i++)
        {
            if (bms_is_member(i, result))
                new_subplan_indexes[i] = newidx++;
            else
                new_subplan_indexes[i] = -1;    /* Newly pruned */

            if (bms_is_member(i, prunestate->other_subplans))
                new_other_subplans = bms_add_member(new_other_subplans,
                                                    new_subplan_indexes[i]);
        }
        bms_free(prunestate->other_subplans);
        prunestate->other_subplans = new_other_subplans;

        /*
         * Now we can update each PartitionedRelPruneInfo's subplan_map with
         * new subplan indexes.  We must also recompute its present_parts
         * bitmap.
         */
        for (i = 0; i < prunestate->num_partprunedata; i++)
        {
            PartitionPruningData *prunedata = prunestate->partprunedata[i];
            int         j;

            /*
             * Within each hierarchy, we perform this loop in back-to-front
             * order so that we determine present_parts for the lowest-level
             * partitioned tables first.  This way we can tell whether a
             * sub-partitioned table's partitions were entirely pruned, in
             * which case it is excluded from its parent's present_parts.
             */
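            /*
             * Concrete illustration: if partition k of some level is itself
             * partitioned and all of its own partitions were pruned above,
             * its present_parts is empty by the time we reach its parent,
             * so k is omitted from the parent's present_parts too.
             */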
            for (j = prunedata->num_partrelprunedata - 1; j >= 0; j--)
            {
                PartitionedRelPruningData *pprune = &prunedata->partrelprunedata[j];
                int         nparts = pprune->nparts;
                int         k;

                /* We just rebuild present_parts from scratch */
                bms_free(pprune->present_parts);
                pprune->present_parts = NULL;

                for (k = 0; k < nparts; k++)
                {
                    int         oldidx = pprune->subplan_map[k];
                    int         subidx;

                    /*
                     * If this partition existed as a subplan then change the
                     * old subplan index to the new subplan index.  The new
                     * index may become -1 if the partition was pruned above,
                     * or it may be smaller than before if subplans earlier
                     * in the list were removed.  If it's a subpartition, add
                     * it to present_parts unless it's entirely pruned.
                     */
                    if (oldidx >= 0)
                    {
                        Assert(oldidx < nsubplans);
                        pprune->subplan_map[k] = new_subplan_indexes[oldidx];

                        if (new_subplan_indexes[oldidx] >= 0)
                            pprune->present_parts =
                                bms_add_member(pprune->present_parts, k);
                    }
                    else if ((subidx = pprune->subpart_map[k]) >= 0)
                    {
                        PartitionedRelPruningData *subprune;

                        subprune = &prunedata->partrelprunedata[subidx];

                        if (!bms_is_empty(subprune->present_parts))
                            pprune->present_parts =
                                bms_add_member(pprune->present_parts, k);
                    }
                }
            }
        }

        pfree(new_subplan_indexes);
    }

    return result;
}

/*
 * ExecFindMatchingSubPlans
 *      Determine which subplans match the pruning steps detailed in
 *      'prunestate' for the current comparison expression values.
 *
 * Here we assume we may evaluate PARAM_EXEC Params.
 */
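/*
 * A hedged usage sketch (names follow the Append node's fields for
 * illustration only; the actual call sites are in that node's subplan
 * selection and rescan logic):
 *
 *      if (node->as_valid_subplans == NULL)
 *          node->as_valid_subplans =
 *              ExecFindMatchingSubPlans(node->as_prune_state);
 */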
Bitmapset *
ExecFindMatchingSubPlans(PartitionPruneState *prunestate)
{
    Bitmapset  *result = NULL;
    MemoryContext oldcontext;
    int         i;

    /*
     * Switch to a temp context to avoid leaking memory in the executor's
     * memory context.
     */
    oldcontext = MemoryContextSwitchTo(prunestate->prune_context);

    /*
     * For each hierarchy, do the pruning tests, and add surviving subplans'
     * indexes to "result".
     */
    for (i = 0; i < prunestate->num_partprunedata; i++)
    {
        PartitionPruningData *prunedata;
        PartitionedRelPruningData *pprune;

        prunedata = prunestate->partprunedata[i];
        pprune = &prunedata->partrelprunedata[0];

        find_matching_subplans_recurse(prunedata, pprune, false, &result);

        /* Expression eval may have used space in node's ps_ExprContext too */
        if (pprune->exec_pruning_steps)
            ResetExprContext(pprune->exec_context.planstate->ps_ExprContext);
    }

    MemoryContextSwitchTo(oldcontext);

    /* Copy result out of the temp context before we reset it */
    result = bms_copy(result);

    /* Add in any subplans that partition pruning didn't account for */
    result = bms_add_members(result, prunestate->other_subplans);

    MemoryContextReset(prunestate->prune_context);

    return result;
}

/*
 * find_matching_subplans_recurse
 *      Recursive worker function for ExecFindMatchingSubPlans and
 *      ExecFindInitialMatchingSubPlans
 *
 * Adds valid (non-prunable) subplan IDs to *validsubplans
 */
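/*
 * Illustration of the two per-level index maps consulted below, for a
 * hypothetical table with three partitions where partition 0 is a leaf
 * (driven by subplan 5), partition 1 is sub-partitioned (entry 2 in
 * partrelprunedata), and partition 2 was pruned by the planner:
 *
 *      subplan_map = {5, -1, -1}
 *      subpart_map = {-1, 2, -1}
 *
 * (Values are hypothetical, not taken from any particular plan.)
 */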
static void
find_matching_subplans_recurse(PartitionPruningData *prunedata,
                               PartitionedRelPruningData *pprune,
                               bool initial_prune,
                               Bitmapset **validsubplans)
{
    Bitmapset  *partset;
    int         i;

    /* Guard against stack overflow due to overly deep partition hierarchy. */
    check_stack_depth();

    /* Only prune if pruning would be useful at this level. */
    if (initial_prune && pprune->initial_pruning_steps)
    {
        partset = get_matching_partitions(&pprune->initial_context,
                                          pprune->initial_pruning_steps);
    }
    else if (!initial_prune && pprune->exec_pruning_steps)
    {
        partset = get_matching_partitions(&pprune->exec_context,
                                          pprune->exec_pruning_steps);
    }
    else
    {
        /*
         * If no pruning is to be done, just include all partitions at this
         * level.
         */
        partset = pprune->present_parts;
    }

    /* Translate partset into subplan indexes */
    i = -1;
    while ((i = bms_next_member(partset, i)) >= 0)
    {
        if (pprune->subplan_map[i] >= 0)
            *validsubplans = bms_add_member(*validsubplans,
                                            pprune->subplan_map[i]);
        else
        {
            int         partidx = pprune->subpart_map[i];

            if (partidx >= 0)
                find_matching_subplans_recurse(prunedata,
                                               &prunedata->partrelprunedata[partidx],
                                               initial_prune, validsubplans);
            else
            {
                /*
                 * We get here if the planner already pruned all the
                 * sub-partitions for this partition.  Silently ignore this
                 * partition in this case.  The end result is the same: we
                 * would have pruned all partitions just the same, but we
                 * don't have any pruning steps to execute to verify this.
                 */
            }
        }
    }
}