1 /*-------------------------------------------------------------------------
2  *
3  * partdesc.c
4  *		Support routines for manipulating partition descriptors
5  *
6  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  *		  src/backend/partitioning/partdesc.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include "access/genam.h"
18 #include "access/htup_details.h"
19 #include "access/table.h"
20 #include "catalog/indexing.h"
21 #include "catalog/partition.h"
22 #include "catalog/pg_inherits.h"
23 #include "partitioning/partbounds.h"
24 #include "partitioning/partdesc.h"
25 #include "storage/bufmgr.h"
26 #include "storage/sinval.h"
27 #include "utils/builtins.h"
28 #include "utils/inval.h"
29 #include "utils/fmgroids.h"
30 #include "utils/hsearch.h"
31 #include "utils/lsyscache.h"
32 #include "utils/memutils.h"
33 #include "utils/rel.h"
34 #include "utils/partcache.h"
35 #include "utils/syscache.h"
36 
37 typedef struct PartitionDirectoryData
38 {
39 	MemoryContext pdir_mcxt;
40 	HTAB	   *pdir_hash;
41 }			PartitionDirectoryData;
42 
43 typedef struct PartitionDirectoryEntry
44 {
45 	Oid			reloid;
46 	Relation	rel;
47 	PartitionDesc pd;
48 } PartitionDirectoryEntry;
49 
50 /*
51  * RelationBuildPartitionDesc
52  *		Form rel's partition descriptor, and store in relcache entry
53  *
54  * Note: the descriptor won't be flushed from the cache by
55  * RelationClearRelation() unless it's changed because of
56  * addition or removal of a partition.  Hence, code holding a lock
57  * that's sufficient to prevent that can assume that rd_partdesc
58  * won't change underneath it.
59  */
60 void
RelationBuildPartitionDesc(Relation rel)61 RelationBuildPartitionDesc(Relation rel)
62 {
63 	PartitionDesc partdesc;
64 	PartitionBoundInfo boundinfo = NULL;
65 	List	   *inhoids;
66 	PartitionBoundSpec **boundspecs = NULL;
67 	Oid		   *oids = NULL;
68 	ListCell   *cell;
69 	int			i,
70 				nparts;
71 	PartitionKey key = RelationGetPartitionKey(rel);
72 	MemoryContext oldcxt;
73 	int		   *mapping;
74 
75 	/*
76 	 * Get partition oids from pg_inherits.  This uses a single snapshot to
77 	 * fetch the list of children, so while more children may be getting added
78 	 * concurrently, whatever this function returns will be accurate as of
79 	 * some well-defined point in time.
80 	 */
81 	inhoids = find_inheritance_children(RelationGetRelid(rel), NoLock);
82 	nparts = list_length(inhoids);
83 
84 	/* Allocate arrays for OIDs and boundspecs. */
85 	if (nparts > 0)
86 	{
87 		oids = palloc(nparts * sizeof(Oid));
88 		boundspecs = palloc(nparts * sizeof(PartitionBoundSpec *));
89 	}
90 
91 	/* Collect bound spec nodes for each partition. */
92 	i = 0;
93 	foreach(cell, inhoids)
94 	{
95 		Oid			inhrelid = lfirst_oid(cell);
96 		HeapTuple	tuple;
97 		PartitionBoundSpec *boundspec = NULL;
98 
99 		/* Try fetching the tuple from the catcache, for speed. */
100 		tuple = SearchSysCache1(RELOID, inhrelid);
101 		if (HeapTupleIsValid(tuple))
102 		{
103 			Datum		datum;
104 			bool		isnull;
105 
106 			datum = SysCacheGetAttr(RELOID, tuple,
107 									Anum_pg_class_relpartbound,
108 									&isnull);
109 			if (!isnull)
110 				boundspec = stringToNode(TextDatumGetCString(datum));
111 			ReleaseSysCache(tuple);
112 		}
113 
114 		/*
115 		 * The system cache may be out of date; if so, we may find no pg_class
116 		 * tuple or an old one where relpartbound is NULL.  In that case, try
117 		 * the table directly.  We can't just AcceptInvalidationMessages() and
118 		 * retry the system cache lookup because it's possible that a
119 		 * concurrent ATTACH PARTITION operation has removed itself from the
120 		 * ProcArray but not yet added invalidation messages to the shared
121 		 * queue; InvalidateSystemCaches() would work, but seems excessive.
122 		 *
123 		 * Note that this algorithm assumes that PartitionBoundSpec we manage
124 		 * to fetch is the right one -- so this is only good enough for
125 		 * concurrent ATTACH PARTITION, not concurrent DETACH PARTITION or
126 		 * some hypothetical operation that changes the partition bounds.
127 		 */
128 		if (boundspec == NULL)
129 		{
130 			Relation	pg_class;
131 			SysScanDesc scan;
132 			ScanKeyData key[1];
133 			Datum		datum;
134 			bool		isnull;
135 
136 			pg_class = table_open(RelationRelationId, AccessShareLock);
137 			ScanKeyInit(&key[0],
138 						Anum_pg_class_oid,
139 						BTEqualStrategyNumber, F_OIDEQ,
140 						ObjectIdGetDatum(inhrelid));
141 			scan = systable_beginscan(pg_class, ClassOidIndexId, true,
142 									  NULL, 1, key);
143 			tuple = systable_getnext(scan);
144 			datum = heap_getattr(tuple, Anum_pg_class_relpartbound,
145 								 RelationGetDescr(pg_class), &isnull);
146 			if (!isnull)
147 				boundspec = stringToNode(TextDatumGetCString(datum));
148 			systable_endscan(scan);
149 			table_close(pg_class, AccessShareLock);
150 		}
151 
152 		/* Sanity checks. */
153 		if (!boundspec)
154 			elog(ERROR, "missing relpartbound for relation %u", inhrelid);
155 		if (!IsA(boundspec, PartitionBoundSpec))
156 			elog(ERROR, "invalid relpartbound for relation %u", inhrelid);
157 
158 		/*
159 		 * If the PartitionBoundSpec says this is the default partition, its
160 		 * OID should match pg_partitioned_table.partdefid; if not, the
161 		 * catalog is corrupt.
162 		 */
163 		if (boundspec->is_default)
164 		{
165 			Oid			partdefid;
166 
167 			partdefid = get_default_partition_oid(RelationGetRelid(rel));
168 			if (partdefid != inhrelid)
169 				elog(ERROR, "expected partdefid %u, but got %u",
170 					 inhrelid, partdefid);
171 		}
172 
173 		/* Save results. */
174 		oids[i] = inhrelid;
175 		boundspecs[i] = boundspec;
176 		++i;
177 	}
178 
179 	/* Assert we aren't about to leak any old data structure */
180 	Assert(rel->rd_pdcxt == NULL);
181 	Assert(rel->rd_partdesc == NULL);
182 
183 	/*
184 	 * Now build the actual relcache partition descriptor.  Note that the
185 	 * order of operations here is fairly critical.  If we fail partway
186 	 * through this code, we won't have leaked memory because the rd_pdcxt is
187 	 * attached to the relcache entry immediately, so it'll be freed whenever
188 	 * the entry is rebuilt or destroyed.  However, we don't assign to
189 	 * rd_partdesc until the cached data structure is fully complete and
190 	 * valid, so that no other code might try to use it.
191 	 */
192 	rel->rd_pdcxt = AllocSetContextCreate(CacheMemoryContext,
193 										  "partition descriptor",
194 										  ALLOCSET_SMALL_SIZES);
195 	MemoryContextCopyAndSetIdentifier(rel->rd_pdcxt,
196 									  RelationGetRelationName(rel));
197 
198 	partdesc = (PartitionDescData *)
199 		MemoryContextAllocZero(rel->rd_pdcxt, sizeof(PartitionDescData));
200 	partdesc->nparts = nparts;
201 	/* If there are no partitions, the rest of the partdesc can stay zero */
202 	if (nparts > 0)
203 	{
204 		/* Create PartitionBoundInfo, using the caller's context. */
205 		boundinfo = partition_bounds_create(boundspecs, nparts, key, &mapping);
206 
207 		/* Now copy all info into relcache's partdesc. */
208 		oldcxt = MemoryContextSwitchTo(rel->rd_pdcxt);
209 		partdesc->boundinfo = partition_bounds_copy(boundinfo, key);
210 		partdesc->oids = (Oid *) palloc(nparts * sizeof(Oid));
211 		partdesc->is_leaf = (bool *) palloc(nparts * sizeof(bool));
212 		MemoryContextSwitchTo(oldcxt);
213 
214 		/*
215 		 * Assign OIDs from the original array into mapped indexes of the
216 		 * result array.  The order of OIDs in the former is defined by the
217 		 * catalog scan that retrieved them, whereas that in the latter is
218 		 * defined by canonicalized representation of the partition bounds.
219 		 *
220 		 * Also record leaf-ness of each partition.  For this we use
221 		 * get_rel_relkind() which may leak memory, so be sure to run it in
222 		 * the caller's context.
223 		 */
224 		for (i = 0; i < nparts; i++)
225 		{
226 			int			index = mapping[i];
227 
228 			partdesc->oids[index] = oids[i];
229 			partdesc->is_leaf[index] =
230 				(get_rel_relkind(oids[i]) != RELKIND_PARTITIONED_TABLE);
231 		}
232 	}
233 
234 	rel->rd_partdesc = partdesc;
235 }
236 
237 /*
238  * CreatePartitionDirectory
239  *		Create a new partition directory object.
240  */
241 PartitionDirectory
CreatePartitionDirectory(MemoryContext mcxt)242 CreatePartitionDirectory(MemoryContext mcxt)
243 {
244 	MemoryContext oldcontext = MemoryContextSwitchTo(mcxt);
245 	PartitionDirectory pdir;
246 	HASHCTL		ctl;
247 
248 	MemSet(&ctl, 0, sizeof(HASHCTL));
249 	ctl.keysize = sizeof(Oid);
250 	ctl.entrysize = sizeof(PartitionDirectoryEntry);
251 	ctl.hcxt = mcxt;
252 
253 	pdir = palloc(sizeof(PartitionDirectoryData));
254 	pdir->pdir_mcxt = mcxt;
255 	pdir->pdir_hash = hash_create("partition directory", 256, &ctl,
256 								  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
257 
258 	MemoryContextSwitchTo(oldcontext);
259 	return pdir;
260 }
261 
262 /*
263  * PartitionDirectoryLookup
264  *		Look up the partition descriptor for a relation in the directory.
265  *
266  * The purpose of this function is to ensure that we get the same
267  * PartitionDesc for each relation every time we look it up.  In the
268  * face of current DDL, different PartitionDescs may be constructed with
269  * different views of the catalog state, but any single particular OID
270  * will always get the same PartitionDesc for as long as the same
271  * PartitionDirectory is used.
272  */
273 PartitionDesc
PartitionDirectoryLookup(PartitionDirectory pdir,Relation rel)274 PartitionDirectoryLookup(PartitionDirectory pdir, Relation rel)
275 {
276 	PartitionDirectoryEntry *pde;
277 	Oid			relid = RelationGetRelid(rel);
278 	bool		found;
279 
280 	pde = hash_search(pdir->pdir_hash, &relid, HASH_ENTER, &found);
281 	if (!found)
282 	{
283 		/*
284 		 * We must keep a reference count on the relation so that the
285 		 * PartitionDesc to which we are pointing can't get destroyed.
286 		 */
287 		RelationIncrementReferenceCount(rel);
288 		pde->rel = rel;
289 		pde->pd = RelationGetPartitionDesc(rel);
290 		Assert(pde->pd != NULL);
291 	}
292 	return pde->pd;
293 }
294 
295 /*
296  * DestroyPartitionDirectory
297  *		Destroy a partition directory.
298  *
299  * Release the reference counts we're holding.
300  */
301 void
DestroyPartitionDirectory(PartitionDirectory pdir)302 DestroyPartitionDirectory(PartitionDirectory pdir)
303 {
304 	HASH_SEQ_STATUS status;
305 	PartitionDirectoryEntry *pde;
306 
307 	hash_seq_init(&status, pdir->pdir_hash);
308 	while ((pde = hash_seq_search(&status)) != NULL)
309 		RelationDecrementReferenceCount(pde->rel);
310 }
311 
312 /*
313  * equalPartitionDescs
314  *		Compare two partition descriptors for logical equality
315  */
316 bool
equalPartitionDescs(PartitionKey key,PartitionDesc partdesc1,PartitionDesc partdesc2)317 equalPartitionDescs(PartitionKey key, PartitionDesc partdesc1,
318 					PartitionDesc partdesc2)
319 {
320 	int			i;
321 
322 	if (partdesc1 != NULL)
323 	{
324 		if (partdesc2 == NULL)
325 			return false;
326 		if (partdesc1->nparts != partdesc2->nparts)
327 			return false;
328 
329 		Assert(key != NULL || partdesc1->nparts == 0);
330 
331 		/*
332 		 * Same oids? If the partitioning structure did not change, that is,
333 		 * no partitions were added or removed to the relation, the oids array
334 		 * should still match element-by-element.
335 		 */
336 		for (i = 0; i < partdesc1->nparts; i++)
337 		{
338 			if (partdesc1->oids[i] != partdesc2->oids[i])
339 				return false;
340 		}
341 
342 		/*
343 		 * Now compare partition bound collections.  The logic to iterate over
344 		 * the collections is private to partition.c.
345 		 */
346 		if (partdesc1->boundinfo != NULL)
347 		{
348 			if (partdesc2->boundinfo == NULL)
349 				return false;
350 
351 			if (!partition_bounds_equal(key->partnatts, key->parttyplen,
352 										key->parttypbyval,
353 										partdesc1->boundinfo,
354 										partdesc2->boundinfo))
355 				return false;
356 		}
357 		else if (partdesc2->boundinfo != NULL)
358 			return false;
359 	}
360 	else if (partdesc2 != NULL)
361 		return false;
362 
363 	return true;
364 }
365 
366 /*
367  * get_default_oid_from_partdesc
368  *
369  * Given a partition descriptor, return the OID of the default partition, if
370  * one exists; else, return InvalidOid.
371  */
372 Oid
get_default_oid_from_partdesc(PartitionDesc partdesc)373 get_default_oid_from_partdesc(PartitionDesc partdesc)
374 {
375 	if (partdesc && partdesc->boundinfo &&
376 		partition_bound_has_default(partdesc->boundinfo))
377 		return partdesc->oids[partdesc->boundinfo->default_index];
378 
379 	return InvalidOid;
380 }
381