1 /*-------------------------------------------------------------------------
2  *
3  * genam.c
4  *	  general index access method routines
5  *
6  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *	  src/backend/access/index/genam.c
12  *
13  * NOTES
14  *	  many of the old access method routines have been turned into
15  *	  macros and moved to genam.h -cim 4/30/91
16  *
17  *-------------------------------------------------------------------------
18  */
19 
20 #include "postgres.h"
21 
22 #include "access/relscan.h"
23 #include "access/transam.h"
24 #include "catalog/index.h"
25 #include "lib/stringinfo.h"
26 #include "miscadmin.h"
27 #include "storage/bufmgr.h"
28 #include "utils/acl.h"
29 #include "utils/builtins.h"
30 #include "utils/lsyscache.h"
31 #include "utils/rel.h"
32 #include "utils/rls.h"
33 #include "utils/ruleutils.h"
34 #include "utils/snapmgr.h"
35 #include "utils/syscache.h"
36 #include "utils/tqual.h"
37 
38 
39 /* ----------------------------------------------------------------
40  *		general access method routines
41  *
42  *		All indexed access methods use an identical scan structure.
43  *		We don't know how the various AMs do locking, however, so we don't
44  *		do anything about that here.
45  *
46  *		The intent is that an AM implementor will define a beginscan routine
47  *		that calls RelationGetIndexScan, to fill in the scan, and then does
48  *		whatever kind of locking he wants.
49  *
50  *		At the end of a scan, the AM's endscan routine undoes the locking,
51  *		but does *not* call IndexScanEnd --- the higher-level index_endscan
52  *		routine does that.  (We can't do it in the AM because index_endscan
53  *		still needs to touch the IndexScanDesc after calling the AM.)
54  *
55  *		Because of this, the AM does not have a choice whether to call
56  *		RelationGetIndexScan or not; its beginscan routine must return an
57  *		object made by RelationGetIndexScan.  This is kinda ugly but not
58  *		worth cleaning up now.
59  * ----------------------------------------------------------------
60  */
61 
62 /* ----------------
63  *	RelationGetIndexScan -- Create and fill an IndexScanDesc.
64  *
65  *		This routine creates an index scan structure and sets up initial
66  *		contents for it.
67  *
68  *		Parameters:
69  *				indexRelation -- index relation for scan.
70  *				nkeys -- count of scan keys (index qual conditions).
71  *				norderbys -- count of index order-by operators.
72  *
73  *		Returns:
74  *				An initialized IndexScanDesc.
75  * ----------------
76  */
77 IndexScanDesc
RelationGetIndexScan(Relation indexRelation,int nkeys,int norderbys)78 RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
79 {
80 	IndexScanDesc scan;
81 
82 	scan = (IndexScanDesc) palloc(sizeof(IndexScanDescData));
83 
84 	scan->heapRelation = NULL;	/* may be set later */
85 	scan->indexRelation = indexRelation;
86 	scan->xs_snapshot = InvalidSnapshot;	/* caller must initialize this */
87 	scan->numberOfKeys = nkeys;
88 	scan->numberOfOrderBys = norderbys;
89 
90 	/*
91 	 * We allocate key workspace here, but it won't get filled until amrescan.
92 	 */
93 	if (nkeys > 0)
94 		scan->keyData = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys);
95 	else
96 		scan->keyData = NULL;
97 	if (norderbys > 0)
98 		scan->orderByData = (ScanKey) palloc(sizeof(ScanKeyData) * norderbys);
99 	else
100 		scan->orderByData = NULL;
101 
102 	scan->xs_want_itup = false; /* may be set later */
103 
104 	/*
105 	 * During recovery we ignore killed tuples and don't bother to kill them
106 	 * either. We do this because the xmin on the primary node could easily be
107 	 * later than the xmin on the standby node, so that what the primary
108 	 * thinks is killed is supposed to be visible on standby. So for correct
109 	 * MVCC for queries during recovery we must ignore these hints and check
110 	 * all tuples. Do *not* set ignore_killed_tuples to true when running in a
111 	 * transaction that was started during recovery. xactStartedInRecovery
112 	 * should not be altered by index AMs.
113 	 */
114 	scan->kill_prior_tuple = false;
115 	scan->xactStartedInRecovery = TransactionStartedDuringRecovery();
116 	scan->ignore_killed_tuples = !scan->xactStartedInRecovery;
117 
118 	scan->opaque = NULL;
119 
120 	scan->xs_itup = NULL;
121 	scan->xs_itupdesc = NULL;
122 	scan->xs_hitup = NULL;
123 	scan->xs_hitupdesc = NULL;
124 
125 	ItemPointerSetInvalid(&scan->xs_ctup.t_self);
126 	scan->xs_ctup.t_data = NULL;
127 	scan->xs_cbuf = InvalidBuffer;
128 	scan->xs_continue_hot = false;
129 
130 	return scan;
131 }
132 
133 /* ----------------
134  *	IndexScanEnd -- End an index scan.
135  *
136  *		This routine just releases the storage acquired by
137  *		RelationGetIndexScan().  Any AM-level resources are
138  *		assumed to already have been released by the AM's
139  *		endscan routine.
140  *
141  *	Returns:
142  *		None.
143  * ----------------
144  */
145 void
IndexScanEnd(IndexScanDesc scan)146 IndexScanEnd(IndexScanDesc scan)
147 {
148 	if (scan->keyData != NULL)
149 		pfree(scan->keyData);
150 	if (scan->orderByData != NULL)
151 		pfree(scan->orderByData);
152 
153 	pfree(scan);
154 }
155 
156 /*
157  * BuildIndexValueDescription
158  *
159  * Construct a string describing the contents of an index entry, in the
160  * form "(key_name, ...)=(key_value, ...)".  This is currently used
161  * for building unique-constraint and exclusion-constraint error messages,
162  * so only key columns of the index are checked and printed.
163  *
164  * Note that if the user does not have permissions to view all of the
165  * columns involved then a NULL is returned.  Returning a partial key seems
166  * unlikely to be useful and we have no way to know which of the columns the
167  * user provided (unlike in ExecBuildSlotValueDescription).
168  *
169  * The passed-in values/nulls arrays are the "raw" input to the index AM,
170  * e.g. results of FormIndexDatum --- this is not necessarily what is stored
171  * in the index, but it's what the user perceives to be stored.
172  *
173  * Note: if you change anything here, check whether
174  * ExecBuildSlotPartitionKeyDescription() in execMain.c needs a similar
175  * change.
176  */
177 char *
BuildIndexValueDescription(Relation indexRelation,Datum * values,bool * isnull)178 BuildIndexValueDescription(Relation indexRelation,
179 						   Datum *values, bool *isnull)
180 {
181 	StringInfoData buf;
182 	Form_pg_index idxrec;
183 	HeapTuple	ht_idx;
184 	int			indnkeyatts;
185 	int			i;
186 	int			keyno;
187 	Oid			indexrelid = RelationGetRelid(indexRelation);
188 	Oid			indrelid;
189 	AclResult	aclresult;
190 
191 	indnkeyatts = IndexRelationGetNumberOfKeyAttributes(indexRelation);
192 
193 	/*
194 	 * Check permissions- if the user does not have access to view all of the
195 	 * key columns then return NULL to avoid leaking data.
196 	 *
197 	 * First check if RLS is enabled for the relation.  If so, return NULL to
198 	 * avoid leaking data.
199 	 *
200 	 * Next we need to check table-level SELECT access and then, if there is
201 	 * no access there, check column-level permissions.
202 	 */
203 
204 	/*
205 	 * Fetch the pg_index tuple by the Oid of the index
206 	 */
207 	ht_idx = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(indexrelid));
208 	if (!HeapTupleIsValid(ht_idx))
209 		elog(ERROR, "cache lookup failed for index %u", indexrelid);
210 	idxrec = (Form_pg_index) GETSTRUCT(ht_idx);
211 
212 	indrelid = idxrec->indrelid;
213 	Assert(indexrelid == idxrec->indexrelid);
214 
215 	/* RLS check- if RLS is enabled then we don't return anything. */
216 	if (check_enable_rls(indrelid, InvalidOid, true) == RLS_ENABLED)
217 	{
218 		ReleaseSysCache(ht_idx);
219 		return NULL;
220 	}
221 
222 	/* Table-level SELECT is enough, if the user has it */
223 	aclresult = pg_class_aclcheck(indrelid, GetUserId(), ACL_SELECT);
224 	if (aclresult != ACLCHECK_OK)
225 	{
226 		/*
227 		 * No table-level access, so step through the columns in the index and
228 		 * make sure the user has SELECT rights on all of them.
229 		 */
230 		for (keyno = 0; keyno < idxrec->indnkeyatts; keyno++)
231 		{
232 			AttrNumber	attnum = idxrec->indkey.values[keyno];
233 
234 			/*
235 			 * Note that if attnum == InvalidAttrNumber, then this is an index
236 			 * based on an expression and we return no detail rather than try
237 			 * to figure out what column(s) the expression includes and if the
238 			 * user has SELECT rights on them.
239 			 */
240 			if (attnum == InvalidAttrNumber ||
241 				pg_attribute_aclcheck(indrelid, attnum, GetUserId(),
242 									  ACL_SELECT) != ACLCHECK_OK)
243 			{
244 				/* No access, so clean up and return */
245 				ReleaseSysCache(ht_idx);
246 				return NULL;
247 			}
248 		}
249 	}
250 	ReleaseSysCache(ht_idx);
251 
252 	initStringInfo(&buf);
253 	appendStringInfo(&buf, "(%s)=(",
254 					 pg_get_indexdef_columns(indexrelid, true));
255 
256 	for (i = 0; i < indnkeyatts; i++)
257 	{
258 		char	   *val;
259 
260 		if (isnull[i])
261 			val = "null";
262 		else
263 		{
264 			Oid			foutoid;
265 			bool		typisvarlena;
266 
267 			/*
268 			 * The provided data is not necessarily of the type stored in the
269 			 * index; rather it is of the index opclass's input type. So look
270 			 * at rd_opcintype not the index tupdesc.
271 			 *
272 			 * Note: this is a bit shaky for opclasses that have pseudotype
273 			 * input types such as ANYARRAY or RECORD.  Currently, the
274 			 * typoutput functions associated with the pseudotypes will work
275 			 * okay, but we might have to try harder in future.
276 			 */
277 			getTypeOutputInfo(indexRelation->rd_opcintype[i],
278 							  &foutoid, &typisvarlena);
279 			val = OidOutputFunctionCall(foutoid, values[i]);
280 		}
281 
282 		if (i > 0)
283 			appendStringInfoString(&buf, ", ");
284 		appendStringInfoString(&buf, val);
285 	}
286 
287 	appendStringInfoChar(&buf, ')');
288 
289 	return buf.data;
290 }
291 
292 
293 /* ----------------------------------------------------------------
294  *		heap-or-index-scan access to system catalogs
295  *
296  *		These functions support system catalog accesses that normally use
297  *		an index but need to be capable of being switched to heap scans
298  *		if the system indexes are unavailable.
299  *
300  *		The specified scan keys must be compatible with the named index.
301  *		Generally this means that they must constrain either all columns
302  *		of the index, or the first K columns of an N-column index.
303  *
304  *		These routines could work with non-system tables, actually,
305  *		but they're only useful when there is a known index to use with
306  *		the given scan keys; so in practice they're only good for
307  *		predetermined types of scans of system catalogs.
308  * ----------------------------------------------------------------
309  */
310 
311 /*
312  * systable_beginscan --- set up for heap-or-index scan
313  *
314  *	rel: catalog to scan, already opened and suitably locked
315  *	indexId: OID of index to conditionally use
316  *	indexOK: if false, forces a heap scan (see notes below)
317  *	snapshot: time qual to use (NULL for a recent catalog snapshot)
318  *	nkeys, key: scan keys
319  *
320  * The attribute numbers in the scan key should be set for the heap case.
321  * If we choose to index, we reset them to 1..n to reference the index
322  * columns.  Note this means there must be one scankey qualification per
323  * index column!  This is checked by the Asserts in the normal, index-using
324  * case, but won't be checked if the heapscan path is taken.
325  *
326  * The routine checks the normal cases for whether an indexscan is safe,
327  * but caller can make additional checks and pass indexOK=false if needed.
328  * In standard case indexOK can simply be constant TRUE.
329  */
330 SysScanDesc
systable_beginscan(Relation heapRelation,Oid indexId,bool indexOK,Snapshot snapshot,int nkeys,ScanKey key)331 systable_beginscan(Relation heapRelation,
332 				   Oid indexId,
333 				   bool indexOK,
334 				   Snapshot snapshot,
335 				   int nkeys, ScanKey key)
336 {
337 	SysScanDesc sysscan;
338 	Relation	irel;
339 
340 	if (indexOK &&
341 		!IgnoreSystemIndexes &&
342 		!ReindexIsProcessingIndex(indexId))
343 		irel = index_open(indexId, AccessShareLock);
344 	else
345 		irel = NULL;
346 
347 	sysscan = (SysScanDesc) palloc(sizeof(SysScanDescData));
348 
349 	sysscan->heap_rel = heapRelation;
350 	sysscan->irel = irel;
351 
352 	if (snapshot == NULL)
353 	{
354 		Oid			relid = RelationGetRelid(heapRelation);
355 
356 		snapshot = RegisterSnapshot(GetCatalogSnapshot(relid));
357 		sysscan->snapshot = snapshot;
358 	}
359 	else
360 	{
361 		/* Caller is responsible for any snapshot. */
362 		sysscan->snapshot = NULL;
363 	}
364 
365 	if (irel)
366 	{
367 		int			i;
368 
369 		/* Change attribute numbers to be index column numbers. */
370 		for (i = 0; i < nkeys; i++)
371 		{
372 			int			j;
373 
374 			for (j = 0; j < IndexRelationGetNumberOfAttributes(irel); j++)
375 			{
376 				if (key[i].sk_attno == irel->rd_index->indkey.values[j])
377 				{
378 					key[i].sk_attno = j + 1;
379 					break;
380 				}
381 			}
382 			if (j == IndexRelationGetNumberOfAttributes(irel))
383 				elog(ERROR, "column is not in index");
384 		}
385 
386 		sysscan->iscan = index_beginscan(heapRelation, irel,
387 										 snapshot, nkeys, 0);
388 		index_rescan(sysscan->iscan, key, nkeys, NULL, 0);
389 		sysscan->scan = NULL;
390 	}
391 	else
392 	{
393 		/*
394 		 * We disallow synchronized scans when forced to use a heapscan on a
395 		 * catalog.  In most cases the desired rows are near the front, so
396 		 * that the unpredictable start point of a syncscan is a serious
397 		 * disadvantage; and there are no compensating advantages, because
398 		 * it's unlikely that such scans will occur in parallel.
399 		 */
400 		sysscan->scan = heap_beginscan_strat(heapRelation, snapshot,
401 											 nkeys, key,
402 											 true, false);
403 		sysscan->iscan = NULL;
404 	}
405 
406 	return sysscan;
407 }
408 
409 /*
410  * systable_getnext --- get next tuple in a heap-or-index scan
411  *
412  * Returns NULL if no more tuples available.
413  *
414  * Note that returned tuple is a reference to data in a disk buffer;
415  * it must not be modified, and should be presumed inaccessible after
416  * next getnext() or endscan() call.
417  */
418 HeapTuple
systable_getnext(SysScanDesc sysscan)419 systable_getnext(SysScanDesc sysscan)
420 {
421 	HeapTuple	htup;
422 
423 	if (sysscan->irel)
424 	{
425 		htup = index_getnext(sysscan->iscan, ForwardScanDirection);
426 
427 		/*
428 		 * We currently don't need to support lossy index operators for any
429 		 * system catalog scan.  It could be done here, using the scan keys to
430 		 * drive the operator calls, if we arranged to save the heap attnums
431 		 * during systable_beginscan(); this is practical because we still
432 		 * wouldn't need to support indexes on expressions.
433 		 */
434 		if (htup && sysscan->iscan->xs_recheck)
435 			elog(ERROR, "system catalog scans with lossy index conditions are not implemented");
436 	}
437 	else
438 		htup = heap_getnext(sysscan->scan, ForwardScanDirection);
439 
440 	return htup;
441 }
442 
443 /*
444  * systable_recheck_tuple --- recheck visibility of most-recently-fetched tuple
445  *
446  * In particular, determine if this tuple would be visible to a catalog scan
447  * that started now.  We don't handle the case of a non-MVCC scan snapshot,
448  * because no caller needs that yet.
449  *
450  * This is useful to test whether an object was deleted while we waited to
451  * acquire lock on it.
452  *
453  * Note: we don't actually *need* the tuple to be passed in, but it's a
454  * good crosscheck that the caller is interested in the right tuple.
455  */
456 bool
systable_recheck_tuple(SysScanDesc sysscan,HeapTuple tup)457 systable_recheck_tuple(SysScanDesc sysscan, HeapTuple tup)
458 {
459 	Snapshot	freshsnap;
460 	bool		result;
461 
462 	/*
463 	 * Trust that LockBuffer() and HeapTupleSatisfiesMVCC() do not themselves
464 	 * acquire snapshots, so we need not register the snapshot.  Those
465 	 * facilities are too low-level to have any business scanning tables.
466 	 */
467 	freshsnap = GetCatalogSnapshot(RelationGetRelid(sysscan->heap_rel));
468 
469 	if (sysscan->irel)
470 	{
471 		IndexScanDesc scan = sysscan->iscan;
472 
473 		Assert(IsMVCCSnapshot(scan->xs_snapshot));
474 		Assert(tup == &scan->xs_ctup);
475 		Assert(BufferIsValid(scan->xs_cbuf));
476 		/* must hold a buffer lock to call HeapTupleSatisfiesVisibility */
477 		LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE);
478 		result = HeapTupleSatisfiesVisibility(tup, freshsnap, scan->xs_cbuf);
479 		LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);
480 	}
481 	else
482 	{
483 		HeapScanDesc scan = sysscan->scan;
484 
485 		Assert(IsMVCCSnapshot(scan->rs_snapshot));
486 		Assert(tup == &scan->rs_ctup);
487 		Assert(BufferIsValid(scan->rs_cbuf));
488 		/* must hold a buffer lock to call HeapTupleSatisfiesVisibility */
489 		LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
490 		result = HeapTupleSatisfiesVisibility(tup, freshsnap, scan->rs_cbuf);
491 		LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
492 	}
493 	return result;
494 }
495 
496 /*
497  * systable_endscan --- close scan, release resources
498  *
499  * Note that it's still up to the caller to close the heap relation.
500  */
501 void
systable_endscan(SysScanDesc sysscan)502 systable_endscan(SysScanDesc sysscan)
503 {
504 	if (sysscan->irel)
505 	{
506 		index_endscan(sysscan->iscan);
507 		index_close(sysscan->irel, AccessShareLock);
508 	}
509 	else
510 		heap_endscan(sysscan->scan);
511 
512 	if (sysscan->snapshot)
513 		UnregisterSnapshot(sysscan->snapshot);
514 
515 	pfree(sysscan);
516 }
517 
518 
519 /*
520  * systable_beginscan_ordered --- set up for ordered catalog scan
521  *
522  * These routines have essentially the same API as systable_beginscan etc,
523  * except that they guarantee to return multiple matching tuples in
524  * index order.  Also, for largely historical reasons, the index to use
525  * is opened and locked by the caller, not here.
526  *
527  * Currently we do not support non-index-based scans here.  (In principle
528  * we could do a heapscan and sort, but the uses are in places that
529  * probably don't need to still work with corrupted catalog indexes.)
530  * For the moment, therefore, these functions are merely the thinnest of
531  * wrappers around index_beginscan/index_getnext.  The main reason for their
532  * existence is to centralize possible future support of lossy operators
533  * in catalog scans.
534  */
535 SysScanDesc
systable_beginscan_ordered(Relation heapRelation,Relation indexRelation,Snapshot snapshot,int nkeys,ScanKey key)536 systable_beginscan_ordered(Relation heapRelation,
537 						   Relation indexRelation,
538 						   Snapshot snapshot,
539 						   int nkeys, ScanKey key)
540 {
541 	SysScanDesc sysscan;
542 	int			i;
543 
544 	/* REINDEX can probably be a hard error here ... */
545 	if (ReindexIsProcessingIndex(RelationGetRelid(indexRelation)))
546 		elog(ERROR, "cannot do ordered scan on index \"%s\", because it is being reindexed",
547 			 RelationGetRelationName(indexRelation));
548 	/* ... but we only throw a warning about violating IgnoreSystemIndexes */
549 	if (IgnoreSystemIndexes)
550 		elog(WARNING, "using index \"%s\" despite IgnoreSystemIndexes",
551 			 RelationGetRelationName(indexRelation));
552 
553 	sysscan = (SysScanDesc) palloc(sizeof(SysScanDescData));
554 
555 	sysscan->heap_rel = heapRelation;
556 	sysscan->irel = indexRelation;
557 
558 	if (snapshot == NULL)
559 	{
560 		Oid			relid = RelationGetRelid(heapRelation);
561 
562 		snapshot = RegisterSnapshot(GetCatalogSnapshot(relid));
563 		sysscan->snapshot = snapshot;
564 	}
565 	else
566 	{
567 		/* Caller is responsible for any snapshot. */
568 		sysscan->snapshot = NULL;
569 	}
570 
571 	/* Change attribute numbers to be index column numbers. */
572 	for (i = 0; i < nkeys; i++)
573 	{
574 		int			j;
575 
576 		for (j = 0; j < IndexRelationGetNumberOfAttributes(indexRelation); j++)
577 		{
578 			if (key[i].sk_attno == indexRelation->rd_index->indkey.values[j])
579 			{
580 				key[i].sk_attno = j + 1;
581 				break;
582 			}
583 		}
584 		if (j == IndexRelationGetNumberOfAttributes(indexRelation))
585 			elog(ERROR, "column is not in index");
586 	}
587 
588 	sysscan->iscan = index_beginscan(heapRelation, indexRelation,
589 									 snapshot, nkeys, 0);
590 	index_rescan(sysscan->iscan, key, nkeys, NULL, 0);
591 	sysscan->scan = NULL;
592 
593 	return sysscan;
594 }
595 
596 /*
597  * systable_getnext_ordered --- get next tuple in an ordered catalog scan
598  */
599 HeapTuple
systable_getnext_ordered(SysScanDesc sysscan,ScanDirection direction)600 systable_getnext_ordered(SysScanDesc sysscan, ScanDirection direction)
601 {
602 	HeapTuple	htup;
603 
604 	Assert(sysscan->irel);
605 	htup = index_getnext(sysscan->iscan, direction);
606 	/* See notes in systable_getnext */
607 	if (htup && sysscan->iscan->xs_recheck)
608 		elog(ERROR, "system catalog scans with lossy index conditions are not implemented");
609 
610 	return htup;
611 }
612 
613 /*
614  * systable_endscan_ordered --- close scan, release resources
615  */
616 void
systable_endscan_ordered(SysScanDesc sysscan)617 systable_endscan_ordered(SysScanDesc sysscan)
618 {
619 	Assert(sysscan->irel);
620 	index_endscan(sysscan->iscan);
621 	if (sysscan->snapshot)
622 		UnregisterSnapshot(sysscan->snapshot);
623 	pfree(sysscan);
624 }
625