1 /*-------------------------------------------------------------------------
2  *
3  * partdesc.c
4  *		Support routines for manipulating partition descriptors
5  *
6  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  *		  src/backend/partitioning/partdesc.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include "access/genam.h"
18 #include "access/htup_details.h"
19 #include "access/table.h"
20 #include "catalog/indexing.h"
21 #include "catalog/partition.h"
22 #include "catalog/pg_inherits.h"
23 #include "partitioning/partbounds.h"
24 #include "partitioning/partdesc.h"
25 #include "storage/bufmgr.h"
26 #include "storage/sinval.h"
27 #include "utils/builtins.h"
28 #include "utils/fmgroids.h"
29 #include "utils/hsearch.h"
30 #include "utils/inval.h"
31 #include "utils/lsyscache.h"
32 #include "utils/memutils.h"
33 #include "utils/partcache.h"
34 #include "utils/rel.h"
35 #include "utils/syscache.h"
36 
/*
 * PartitionDirectoryData
 *		State behind a PartitionDirectory handle.
 *
 * Built by CreatePartitionDirectory(); the hash table maps a relation OID
 * to the PartitionDirectoryEntry recorded for it by
 * PartitionDirectoryLookup().
 */
typedef struct PartitionDirectoryData
{
	/* memory context handed to CreatePartitionDirectory() */
	MemoryContext pdir_mcxt;
	/* hash table of PartitionDirectoryEntry, keyed by relation OID */
	HTAB	   *pdir_hash;
}			PartitionDirectoryData;
42 
/*
 * PartitionDirectoryEntry
 *		One hash table entry of a partition directory.
 *
 * Filled in by PartitionDirectoryLookup() the first time a relation is
 * looked up; the relcache reference held through "rel" is released by
 * DestroyPartitionDirectory().
 */
typedef struct PartitionDirectoryEntry
{
	/*
	 * Hash key: the relation's OID (the table is created with keysize
	 * sizeof(Oid)).  dynahash expects the key at the start of the entry, so
	 * keep this field first.
	 */
	Oid			reloid;
	/* relation on which we hold an extra reference count */
	Relation	rel;
	/* partition descriptor obtained at first lookup */
	PartitionDesc pd;
} PartitionDirectoryEntry;
49 
/* Defined below; called by RelationGetPartitionDesc() to build rd_partdesc. */
static void RelationBuildPartitionDesc(Relation rel);
51 
52 
53 /*
54  * RelationGetPartitionDesc -- get partition descriptor, if relation is partitioned
55  *
56  * Note: we arrange for partition descriptors to not get freed until the
57  * relcache entry's refcount goes to zero (see hacks in RelationClose,
58  * RelationClearRelation, and RelationBuildPartitionDesc).  Therefore, even
59  * though we hand back a direct pointer into the relcache entry, it's safe
60  * for callers to continue to use that pointer as long as (a) they hold the
61  * relation open, and (b) they hold a relation lock strong enough to ensure
62  * that the data doesn't become stale.
63  */
64 PartitionDesc
RelationGetPartitionDesc(Relation rel)65 RelationGetPartitionDesc(Relation rel)
66 {
67 	if (rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
68 		return NULL;
69 
70 	if (unlikely(rel->rd_partdesc == NULL))
71 		RelationBuildPartitionDesc(rel);
72 
73 	return rel->rd_partdesc;
74 }
75 
76 /*
77  * RelationBuildPartitionDesc
78  *		Form rel's partition descriptor, and store in relcache entry
79  *
80  * Partition descriptor is a complex structure; to avoid complicated logic to
81  * free individual elements whenever the relcache entry is flushed, we give it
82  * its own memory context, a child of CacheMemoryContext, which can easily be
83  * deleted on its own.  To avoid leaking memory in that context in case of an
84  * error partway through this function, the context is initially created as a
85  * child of CurTransactionContext and only re-parented to CacheMemoryContext
86  * at the end, when no further errors are possible.  Also, we don't make this
87  * context the current context except in very brief code sections, out of fear
88  * that some of our callees allocate memory on their own which would be leaked
89  * permanently.
90  */
91 static void
RelationBuildPartitionDesc(Relation rel)92 RelationBuildPartitionDesc(Relation rel)
93 {
94 	PartitionDesc partdesc;
95 	PartitionBoundInfo boundinfo = NULL;
96 	List	   *inhoids;
97 	PartitionBoundSpec **boundspecs = NULL;
98 	Oid		   *oids = NULL;
99 	bool	   *is_leaf = NULL;
100 	ListCell   *cell;
101 	int			i,
102 				nparts;
103 	PartitionKey key = RelationGetPartitionKey(rel);
104 	MemoryContext new_pdcxt;
105 	MemoryContext oldcxt;
106 	int		   *mapping;
107 
108 	/*
109 	 * Get partition oids from pg_inherits.  This uses a single snapshot to
110 	 * fetch the list of children, so while more children may be getting added
111 	 * concurrently, whatever this function returns will be accurate as of
112 	 * some well-defined point in time.
113 	 */
114 	inhoids = find_inheritance_children(RelationGetRelid(rel), NoLock);
115 	nparts = list_length(inhoids);
116 
117 	/* Allocate working arrays for OIDs, leaf flags, and boundspecs. */
118 	if (nparts > 0)
119 	{
120 		oids = (Oid *) palloc(nparts * sizeof(Oid));
121 		is_leaf = (bool *) palloc(nparts * sizeof(bool));
122 		boundspecs = palloc(nparts * sizeof(PartitionBoundSpec *));
123 	}
124 
125 	/* Collect bound spec nodes for each partition. */
126 	i = 0;
127 	foreach(cell, inhoids)
128 	{
129 		Oid			inhrelid = lfirst_oid(cell);
130 		HeapTuple	tuple;
131 		PartitionBoundSpec *boundspec = NULL;
132 
133 		/* Try fetching the tuple from the catcache, for speed. */
134 		tuple = SearchSysCache1(RELOID, inhrelid);
135 		if (HeapTupleIsValid(tuple))
136 		{
137 			Datum		datum;
138 			bool		isnull;
139 
140 			datum = SysCacheGetAttr(RELOID, tuple,
141 									Anum_pg_class_relpartbound,
142 									&isnull);
143 			if (!isnull)
144 				boundspec = stringToNode(TextDatumGetCString(datum));
145 			ReleaseSysCache(tuple);
146 		}
147 
148 		/*
149 		 * The system cache may be out of date; if so, we may find no pg_class
150 		 * tuple or an old one where relpartbound is NULL.  In that case, try
151 		 * the table directly.  We can't just AcceptInvalidationMessages() and
152 		 * retry the system cache lookup because it's possible that a
153 		 * concurrent ATTACH PARTITION operation has removed itself from the
154 		 * ProcArray but not yet added invalidation messages to the shared
155 		 * queue; InvalidateSystemCaches() would work, but seems excessive.
156 		 *
157 		 * Note that this algorithm assumes that PartitionBoundSpec we manage
158 		 * to fetch is the right one -- so this is only good enough for
159 		 * concurrent ATTACH PARTITION, not concurrent DETACH PARTITION or
160 		 * some hypothetical operation that changes the partition bounds.
161 		 */
162 		if (boundspec == NULL)
163 		{
164 			Relation	pg_class;
165 			SysScanDesc scan;
166 			ScanKeyData key[1];
167 			Datum		datum;
168 			bool		isnull;
169 
170 			pg_class = table_open(RelationRelationId, AccessShareLock);
171 			ScanKeyInit(&key[0],
172 						Anum_pg_class_oid,
173 						BTEqualStrategyNumber, F_OIDEQ,
174 						ObjectIdGetDatum(inhrelid));
175 			scan = systable_beginscan(pg_class, ClassOidIndexId, true,
176 									  NULL, 1, key);
177 			tuple = systable_getnext(scan);
178 			datum = heap_getattr(tuple, Anum_pg_class_relpartbound,
179 								 RelationGetDescr(pg_class), &isnull);
180 			if (!isnull)
181 				boundspec = stringToNode(TextDatumGetCString(datum));
182 			systable_endscan(scan);
183 			table_close(pg_class, AccessShareLock);
184 		}
185 
186 		/* Sanity checks. */
187 		if (!boundspec)
188 			elog(ERROR, "missing relpartbound for relation %u", inhrelid);
189 		if (!IsA(boundspec, PartitionBoundSpec))
190 			elog(ERROR, "invalid relpartbound for relation %u", inhrelid);
191 
192 		/*
193 		 * If the PartitionBoundSpec says this is the default partition, its
194 		 * OID should match pg_partitioned_table.partdefid; if not, the
195 		 * catalog is corrupt.
196 		 */
197 		if (boundspec->is_default)
198 		{
199 			Oid			partdefid;
200 
201 			partdefid = get_default_partition_oid(RelationGetRelid(rel));
202 			if (partdefid != inhrelid)
203 				elog(ERROR, "expected partdefid %u, but got %u",
204 					 inhrelid, partdefid);
205 		}
206 
207 		/* Save results. */
208 		oids[i] = inhrelid;
209 		is_leaf[i] = (get_rel_relkind(inhrelid) != RELKIND_PARTITIONED_TABLE);
210 		boundspecs[i] = boundspec;
211 		++i;
212 	}
213 
214 	/*
215 	 * Create PartitionBoundInfo and mapping, working in the caller's context.
216 	 * This could fail, but we haven't done any damage if so.
217 	 */
218 	if (nparts > 0)
219 		boundinfo = partition_bounds_create(boundspecs, nparts, key, &mapping);
220 
221 	/*
222 	 * Now build the actual relcache partition descriptor, copying all the
223 	 * data into a new, small context.  As per above comment, we don't make
224 	 * this a long-lived context until it's finished.
225 	 */
226 	new_pdcxt = AllocSetContextCreate(CurTransactionContext,
227 									  "partition descriptor",
228 									  ALLOCSET_SMALL_SIZES);
229 	MemoryContextCopyAndSetIdentifier(new_pdcxt,
230 									  RelationGetRelationName(rel));
231 
232 	partdesc = (PartitionDescData *)
233 		MemoryContextAllocZero(new_pdcxt, sizeof(PartitionDescData));
234 	partdesc->nparts = nparts;
235 	/* If there are no partitions, the rest of the partdesc can stay zero */
236 	if (nparts > 0)
237 	{
238 		oldcxt = MemoryContextSwitchTo(new_pdcxt);
239 		partdesc->boundinfo = partition_bounds_copy(boundinfo, key);
240 		partdesc->oids = (Oid *) palloc(nparts * sizeof(Oid));
241 		partdesc->is_leaf = (bool *) palloc(nparts * sizeof(bool));
242 
243 		/*
244 		 * Assign OIDs from the original array into mapped indexes of the
245 		 * result array.  The order of OIDs in the former is defined by the
246 		 * catalog scan that retrieved them, whereas that in the latter is
247 		 * defined by canonicalized representation of the partition bounds.
248 		 * Also save leaf-ness of each partition.
249 		 */
250 		for (i = 0; i < nparts; i++)
251 		{
252 			int			index = mapping[i];
253 
254 			partdesc->oids[index] = oids[i];
255 			partdesc->is_leaf[index] = is_leaf[i];
256 		}
257 		MemoryContextSwitchTo(oldcxt);
258 	}
259 
260 	/*
261 	 * We have a fully valid partdesc ready to store into the relcache.
262 	 * Reparent it so it has the right lifespan.
263 	 */
264 	MemoryContextSetParent(new_pdcxt, CacheMemoryContext);
265 
266 	/*
267 	 * But first, a kluge: if there's an old rd_pdcxt, it contains an old
268 	 * partition descriptor that may still be referenced somewhere.  Preserve
269 	 * it, while not leaking it, by reattaching it as a child context of the
270 	 * new rd_pdcxt.  Eventually it will get dropped by either RelationClose
271 	 * or RelationClearRelation.
272 	 */
273 	if (rel->rd_pdcxt != NULL)
274 		MemoryContextSetParent(rel->rd_pdcxt, new_pdcxt);
275 	rel->rd_pdcxt = new_pdcxt;
276 	rel->rd_partdesc = partdesc;
277 }
278 
279 /*
280  * CreatePartitionDirectory
281  *		Create a new partition directory object.
282  */
283 PartitionDirectory
CreatePartitionDirectory(MemoryContext mcxt)284 CreatePartitionDirectory(MemoryContext mcxt)
285 {
286 	MemoryContext oldcontext = MemoryContextSwitchTo(mcxt);
287 	PartitionDirectory pdir;
288 	HASHCTL		ctl;
289 
290 	MemSet(&ctl, 0, sizeof(HASHCTL));
291 	ctl.keysize = sizeof(Oid);
292 	ctl.entrysize = sizeof(PartitionDirectoryEntry);
293 	ctl.hcxt = mcxt;
294 
295 	pdir = palloc(sizeof(PartitionDirectoryData));
296 	pdir->pdir_mcxt = mcxt;
297 	pdir->pdir_hash = hash_create("partition directory", 256, &ctl,
298 								  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
299 
300 	MemoryContextSwitchTo(oldcontext);
301 	return pdir;
302 }
303 
304 /*
305  * PartitionDirectoryLookup
306  *		Look up the partition descriptor for a relation in the directory.
307  *
308  * The purpose of this function is to ensure that we get the same
309  * PartitionDesc for each relation every time we look it up.  In the
310  * face of concurrent DDL, different PartitionDescs may be constructed with
311  * different views of the catalog state, but any single particular OID
312  * will always get the same PartitionDesc for as long as the same
313  * PartitionDirectory is used.
314  */
315 PartitionDesc
PartitionDirectoryLookup(PartitionDirectory pdir,Relation rel)316 PartitionDirectoryLookup(PartitionDirectory pdir, Relation rel)
317 {
318 	PartitionDirectoryEntry *pde;
319 	Oid			relid = RelationGetRelid(rel);
320 	bool		found;
321 
322 	pde = hash_search(pdir->pdir_hash, &relid, HASH_ENTER, &found);
323 	if (!found)
324 	{
325 		/*
326 		 * We must keep a reference count on the relation so that the
327 		 * PartitionDesc to which we are pointing can't get destroyed.
328 		 */
329 		RelationIncrementReferenceCount(rel);
330 		pde->rel = rel;
331 		pde->pd = RelationGetPartitionDesc(rel);
332 		Assert(pde->pd != NULL);
333 	}
334 	return pde->pd;
335 }
336 
337 /*
338  * DestroyPartitionDirectory
339  *		Destroy a partition directory.
340  *
341  * Release the reference counts we're holding.
342  */
343 void
DestroyPartitionDirectory(PartitionDirectory pdir)344 DestroyPartitionDirectory(PartitionDirectory pdir)
345 {
346 	HASH_SEQ_STATUS status;
347 	PartitionDirectoryEntry *pde;
348 
349 	hash_seq_init(&status, pdir->pdir_hash);
350 	while ((pde = hash_seq_search(&status)) != NULL)
351 		RelationDecrementReferenceCount(pde->rel);
352 }
353 
354 /*
355  * get_default_oid_from_partdesc
356  *
357  * Given a partition descriptor, return the OID of the default partition, if
358  * one exists; else, return InvalidOid.
359  */
360 Oid
get_default_oid_from_partdesc(PartitionDesc partdesc)361 get_default_oid_from_partdesc(PartitionDesc partdesc)
362 {
363 	if (partdesc && partdesc->boundinfo &&
364 		partition_bound_has_default(partdesc->boundinfo))
365 		return partdesc->oids[partdesc->boundinfo->default_index];
366 
367 	return InvalidOid;
368 }
369