1 /*-------------------------------------------------------------------------
2  *
3  * predicate_internals.h
4  *	  POSTGRES internal predicate locking definitions.
5  *
6  *
7  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * src/include/storage/predicate_internals.h
11  *
12  *-------------------------------------------------------------------------
13  */
14 #ifndef PREDICATE_INTERNALS_H
15 #define PREDICATE_INTERNALS_H
16 
17 #include "storage/lock.h"
18 #include "storage/lwlock.h"
19 
20 /*
21  * Commit number.
22  */
23 typedef uint64 SerCommitSeqNo;
24 
25 /*
26  * Reserved commit sequence numbers:
27  *	- 0 is reserved to indicate a non-existent SLRU entry; it cannot be
28  *	  used as a SerCommitSeqNo, even an invalid one
29  *	- InvalidSerCommitSeqNo is used to indicate a transaction that
30  *	  hasn't committed yet, so use a number greater than all valid
31  *	  ones to make comparison do the expected thing
32  *	- RecoverySerCommitSeqNo is used to refer to transactions that
33  *	  happened before a crash/recovery, since we restart the sequence
34  *	  at that point.  It's earlier than all normal sequence numbers,
35  *	  and is only used by recovered prepared transactions
36  */
37 #define InvalidSerCommitSeqNo		((SerCommitSeqNo) PG_UINT64_MAX)
38 #define RecoverySerCommitSeqNo		((SerCommitSeqNo) 1)
39 #define FirstNormalSerCommitSeqNo	((SerCommitSeqNo) 2)
40 
41 /*
42  * The SERIALIZABLEXACT struct contains information needed for each
43  * serializable database transaction to support SSI techniques.
44  *
45  * A home-grown list is maintained in shared memory to manage these.
46  * An entry is used when the serializable transaction acquires a snapshot.
47  * Unless the transaction is rolled back, this entry must generally remain
48  * until all concurrent transactions have completed.  (There are special
49  * optimizations for READ ONLY transactions which often allow them to be
50  * cleaned up earlier.)  A transaction which is rolled back is cleaned up
51  * as soon as possible.
52  *
53  * Eligibility for cleanup of committed transactions is generally determined
54  * by comparing the transaction's finishedBefore field to
55  * SxactGlobalXmin.
56  */
57 typedef struct SERIALIZABLEXACT
58 {
59 	VirtualTransactionId vxid;	/* The executing process always has one of
60 								 * these. */
61 
62 	/*
63 	 * We use two numbers to track the order that transactions commit. Before
64 	 * commit, a transaction is marked as prepared, and prepareSeqNo is set.
65 	 * Shortly after commit, it's marked as committed, and commitSeqNo is set.
66 	 * This doesn't give a strict commit order, but these two values together
67 	 * are good enough for us, as we can always err on the safe side and
68 	 * assume that there's a conflict, if we can't be sure of the exact
69 	 * ordering of two commits.
70 	 *
71 	 * Note that a transaction is marked as prepared for a short period during
72 	 * commit processing, even if two-phase commit is not used. But with
73 	 * two-phase commit, a transaction can stay in prepared state for some
74 	 * time.
75 	 */
76 	SerCommitSeqNo prepareSeqNo;
77 	SerCommitSeqNo commitSeqNo;
78 
79 	/* these values are not both interesting at the same time */
80 	union
81 	{
82 		SerCommitSeqNo earliestOutConflictCommit;	/* when committed with
83 													 * conflict out */
84 		SerCommitSeqNo lastCommitBeforeSnapshot;	/* when not committed or
85 													 * no conflict out */
86 	}			SeqNo;
87 	SHM_QUEUE	outConflicts;	/* list of write transactions whose data we
88 								 * couldn't read. */
89 	SHM_QUEUE	inConflicts;	/* list of read transactions which couldn't
90 								 * see our write. */
91 	SHM_QUEUE	predicateLocks; /* list of associated PREDICATELOCK objects */
92 	SHM_QUEUE	finishedLink;	/* list link in
93 								 * FinishedSerializableTransactions */
94 
95 	/*
96 	 * perXactPredicateListLock is only used in parallel queries: it protects
97 	 * this SERIALIZABLEXACT's predicate lock list against other workers of
98 	 * the same session.
99 	 */
100 	LWLock		perXactPredicateListLock;
101 
102 	/*
103 	 * for r/o transactions: list of concurrent r/w transactions that we could
104 	 * potentially have conflicts with, and vice versa for r/w transactions
105 	 */
106 	SHM_QUEUE	possibleUnsafeConflicts;
107 
108 	TransactionId topXid;		/* top level xid for the transaction, if one
109 								 * exists; else invalid */
110 	TransactionId finishedBefore;	/* invalid means still running; else the
111 									 * struct expires when no serializable
112 									 * xids are before this. */
113 	TransactionId xmin;			/* the transaction's snapshot xmin */
114 	uint32		flags;			/* OR'd combination of values defined below */
115 	int			pid;			/* pid of associated process */
116 } SERIALIZABLEXACT;
117 
118 #define SXACT_FLAG_COMMITTED			0x00000001	/* already committed */
119 #define SXACT_FLAG_PREPARED				0x00000002	/* about to commit */
120 #define SXACT_FLAG_ROLLED_BACK			0x00000004	/* already rolled back */
121 #define SXACT_FLAG_DOOMED				0x00000008	/* will roll back */
122 /*
123  * The following flag actually means that the flagged transaction has a
124  * conflict out *to a transaction which committed ahead of it*.  It's hard
125  * to get that into a name of a reasonable length.
126  */
127 #define SXACT_FLAG_CONFLICT_OUT			0x00000010
128 #define SXACT_FLAG_READ_ONLY			0x00000020
129 #define SXACT_FLAG_DEFERRABLE_WAITING	0x00000040
130 #define SXACT_FLAG_RO_SAFE				0x00000080
131 #define SXACT_FLAG_RO_UNSAFE			0x00000100
132 #define SXACT_FLAG_SUMMARY_CONFLICT_IN	0x00000200
133 #define SXACT_FLAG_SUMMARY_CONFLICT_OUT 0x00000400
134 /*
135  * The following flag means the transaction has been partially released
136  * already, but is being preserved because parallel workers might have a
137  * reference to it.  It'll be recycled by the leader at end-of-transaction.
138  */
139 #define SXACT_FLAG_PARTIALLY_RELEASED	0x00000800
140 
141 /*
142  * The following types are used to provide an ad hoc list for holding
143  * SERIALIZABLEXACT objects.  An HTAB is overkill, since there is no need to
144  * access these by key -- there are direct pointers to these objects where
145  * needed.  If a shared memory list is created, these types can probably be
146  * eliminated in favor of using the general solution.
147  */
148 typedef struct PredXactListElementData
149 {
150 	SHM_QUEUE	link;
151 	SERIALIZABLEXACT sxact;
152 }			PredXactListElementData;
153 
154 typedef struct PredXactListElementData *PredXactListElement;
155 
156 #define PredXactListElementDataSize \
157 		((Size)MAXALIGN(sizeof(PredXactListElementData)))
158 
159 typedef struct PredXactListData
160 {
161 	SHM_QUEUE	availableList;
162 	SHM_QUEUE	activeList;
163 
164 	/*
165 	 * These global variables are maintained when registering and cleaning up
166 	 * serializable transactions.  They must be global across all backends,
167 	 * but are not needed outside the predicate.c source file. Protected by
168 	 * SerializableXactHashLock.
169 	 */
170 	TransactionId SxactGlobalXmin;	/* global xmin for active serializable
171 									 * transactions */
172 	int			SxactGlobalXminCount;	/* how many active serializable
173 										 * transactions have this xmin */
174 	int			WritableSxactCount; /* how many non-read-only serializable
175 									 * transactions are active */
176 	SerCommitSeqNo LastSxactCommitSeqNo;	/* a strictly monotonically
177 											 * increasing number for commits
178 											 * of serializable transactions */
179 	/* Protected by SerializableXactHashLock. */
180 	SerCommitSeqNo CanPartialClearThrough;	/* can clear predicate locks and
181 											 * inConflicts for committed
182 											 * transactions through this seq
183 											 * no */
184 	/* Protected by SerializableFinishedListLock. */
185 	SerCommitSeqNo HavePartialClearedThrough;	/* have cleared through this
186 												 * seq no */
187 	SERIALIZABLEXACT *OldCommittedSxact;	/* shared copy of dummy sxact */
188 
189 	PredXactListElement element;
190 }			PredXactListData;
191 
192 typedef struct PredXactListData *PredXactList;
193 
194 #define PredXactListDataSize \
195 		((Size)MAXALIGN(sizeof(PredXactListData)))
196 
197 
198 /*
199  * The following types are used to provide lists of rw-conflicts between
200  * pairs of transactions.  Since exactly the same information is needed,
201  * they are also used to record possible unsafe transaction relationships
202  * for purposes of identifying safe snapshots for read-only transactions.
203  *
204  * When a RWConflictData is not in use to record either type of relationship
205  * between a pair of transactions, it is kept on an "available" list.  The
206  * outLink field is used for maintaining that list.
207  */
208 typedef struct RWConflictData
209 {
210 	SHM_QUEUE	outLink;		/* link for list of conflicts out from a sxact */
211 	SHM_QUEUE	inLink;			/* link for list of conflicts in to a sxact */
212 	SERIALIZABLEXACT *sxactOut;
213 	SERIALIZABLEXACT *sxactIn;
214 }			RWConflictData;
215 
216 typedef struct RWConflictData *RWConflict;
217 
218 #define RWConflictDataSize \
219 		((Size)MAXALIGN(sizeof(RWConflictData)))
220 
221 typedef struct RWConflictPoolHeaderData
222 {
223 	SHM_QUEUE	availableList;
224 	RWConflict	element;
225 }			RWConflictPoolHeaderData;
226 
227 typedef struct RWConflictPoolHeaderData *RWConflictPoolHeader;
228 
229 #define RWConflictPoolHeaderDataSize \
230 		((Size)MAXALIGN(sizeof(RWConflictPoolHeaderData)))
231 
232 
233 /*
234  * The SERIALIZABLEXIDTAG struct identifies an xid assigned to a serializable
235  * transaction or any of its subtransactions.
236  */
237 typedef struct SERIALIZABLEXIDTAG
238 {
239 	TransactionId xid;
240 } SERIALIZABLEXIDTAG;
241 
242 /*
243  * The SERIALIZABLEXID struct provides a link from a TransactionId for a
244  * serializable transaction to the related SERIALIZABLEXACT record, even if
245  * the transaction has completed and its connection has been closed.
246  *
247  * These are created as new top level transaction IDs are first assigned to
248  * transactions which are participating in predicate locking.  This may
249  * never happen for a particular transaction if it doesn't write anything.
250  * They are removed with their related serializable transaction objects.
251  *
252  * The SubTransGetTopmostTransaction method is used where necessary to get
253  * from an XID which might be from a subtransaction to the top level XID.
254  */
255 typedef struct SERIALIZABLEXID
256 {
257 	/* hash key */
258 	SERIALIZABLEXIDTAG tag;
259 
260 	/* data */
261 	SERIALIZABLEXACT *myXact;	/* pointer to the top level transaction data */
262 } SERIALIZABLEXID;
263 
264 
265 /*
266  * The PREDICATELOCKTARGETTAG struct identifies a database object which can
267  * be the target of predicate locks.
268  *
269  * Note that the hash function being used doesn't properly respect tag
270  * length -- if the length of the structure isn't a multiple of four bytes it
271  * will go to a four byte boundary past the end of the tag.  If you change
272  * this struct, make sure any slack space is initialized, so that any random
273  * bytes in the middle or at the end are not included in the hash.
274  *
275  * TODO SSI: If we always use the same fields for the same type of value, we
276  * should rename these.  Holding off until it's clear there are no exceptions.
277  * Since indexes are relations with blocks and tuples, it's looking likely that
278  * the rename will be possible.  If not, we may need to divide the last field
279  * and use part of it for a target type, so that we know how to interpret the
280  * data..
281  */
282 typedef struct PREDICATELOCKTARGETTAG
283 {
284 	uint32		locktag_field1; /* a 32-bit ID field */
285 	uint32		locktag_field2; /* a 32-bit ID field */
286 	uint32		locktag_field3; /* a 32-bit ID field */
287 	uint32		locktag_field4; /* a 32-bit ID field */
288 } PREDICATELOCKTARGETTAG;
289 
290 /*
291  * The PREDICATELOCKTARGET struct represents a database object on which there
292  * are predicate locks.
293  *
294  * A hash list of these objects is maintained in shared memory.  An entry is
295  * added when a predicate lock is requested on an object which doesn't
296  * already have one.  An entry is removed when the last lock is removed from
297  * its list.
298  */
299 typedef struct PREDICATELOCKTARGET
300 {
301 	/* hash key */
302 	PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */
303 
304 	/* data */
305 	SHM_QUEUE	predicateLocks; /* list of PREDICATELOCK objects assoc. with
306 								 * predicate lock target */
307 } PREDICATELOCKTARGET;
308 
309 
310 /*
311  * The PREDICATELOCKTAG struct identifies an individual predicate lock.
312  *
313  * It is the combination of predicate lock target (which is a lockable
314  * object) and a serializable transaction which has acquired a lock on that
315  * target.
316  */
317 typedef struct PREDICATELOCKTAG
318 {
319 	PREDICATELOCKTARGET *myTarget;
320 	SERIALIZABLEXACT *myXact;
321 } PREDICATELOCKTAG;
322 
323 /*
324  * The PREDICATELOCK struct represents an individual lock.
325  *
326  * An entry can be created here when the related database object is read, or
327  * by promotion of multiple finer-grained targets.  All entries related to a
328  * serializable transaction are removed when that serializable transaction is
329  * cleaned up.  Entries can also be removed when they are combined into a
330  * single coarser-grained lock entry.
331  */
332 typedef struct PREDICATELOCK
333 {
334 	/* hash key */
335 	PREDICATELOCKTAG tag;		/* unique identifier of lock */
336 
337 	/* data */
338 	SHM_QUEUE	targetLink;		/* list link in PREDICATELOCKTARGET's list of
339 								 * predicate locks */
340 	SHM_QUEUE	xactLink;		/* list link in SERIALIZABLEXACT's list of
341 								 * predicate locks */
342 	SerCommitSeqNo commitSeqNo; /* only used for summarized predicate locks */
343 } PREDICATELOCK;
344 
345 
346 /*
347  * The LOCALPREDICATELOCK struct represents a local copy of data which is
348  * also present in the PREDICATELOCK table, organized for fast access without
349  * needing to acquire a LWLock.  It is strictly for optimization.
350  *
351  * Each serializable transaction creates its own local hash table to hold a
352  * collection of these.  This information is used to determine when a number
353  * of fine-grained locks should be promoted to a single coarser-grained lock.
354  * The information is maintained more-or-less in parallel to the
355  * PREDICATELOCK data, but because this data is not protected by locks and is
356  * only used in an optimization heuristic, it is allowed to drift in a few
357  * corner cases where maintaining exact data would be expensive.
358  *
359  * The hash table is created when the serializable transaction acquires its
360  * snapshot, and its memory is released upon completion of the transaction.
361  */
362 typedef struct LOCALPREDICATELOCK
363 {
364 	/* hash key */
365 	PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */
366 
367 	/* data */
368 	bool		held;			/* is lock held, or just its children?	*/
369 	int			childLocks;		/* number of child locks currently held */
370 } LOCALPREDICATELOCK;
371 
372 
373 /*
374  * The types of predicate locks which can be acquired.
375  */
376 typedef enum PredicateLockTargetType
377 {
378 	PREDLOCKTAG_RELATION,
379 	PREDLOCKTAG_PAGE,
380 	PREDLOCKTAG_TUPLE
381 	/* TODO SSI: Other types may be needed for index locking */
382 } PredicateLockTargetType;
383 
384 
385 /*
386  * This structure is used to quickly capture a copy of all predicate
387  * locks.  This is currently used only by the pg_lock_status function,
388  * which in turn is used by the pg_locks view.
389  */
390 typedef struct PredicateLockData
391 {
392 	int			nelements;
393 	PREDICATELOCKTARGETTAG *locktags;
394 	SERIALIZABLEXACT *xacts;
395 } PredicateLockData;
396 
397 
398 /*
399  * These macros define how we map logical IDs of lockable objects into the
400  * physical fields of PREDICATELOCKTARGETTAG.   Use these to set up values,
401  * rather than accessing the fields directly.  Note multiple eval of target!
402  */
403 #define SET_PREDICATELOCKTARGETTAG_RELATION(locktag,dboid,reloid) \
404 	((locktag).locktag_field1 = (dboid), \
405 	 (locktag).locktag_field2 = (reloid), \
406 	 (locktag).locktag_field3 = InvalidBlockNumber, \
407 	 (locktag).locktag_field4 = InvalidOffsetNumber)
408 
409 #define SET_PREDICATELOCKTARGETTAG_PAGE(locktag,dboid,reloid,blocknum) \
410 	((locktag).locktag_field1 = (dboid), \
411 	 (locktag).locktag_field2 = (reloid), \
412 	 (locktag).locktag_field3 = (blocknum), \
413 	 (locktag).locktag_field4 = InvalidOffsetNumber)
414 
415 #define SET_PREDICATELOCKTARGETTAG_TUPLE(locktag,dboid,reloid,blocknum,offnum) \
416 	((locktag).locktag_field1 = (dboid), \
417 	 (locktag).locktag_field2 = (reloid), \
418 	 (locktag).locktag_field3 = (blocknum), \
419 	 (locktag).locktag_field4 = (offnum))
420 
421 #define GET_PREDICATELOCKTARGETTAG_DB(locktag) \
422 	((Oid) (locktag).locktag_field1)
423 #define GET_PREDICATELOCKTARGETTAG_RELATION(locktag) \
424 	((Oid) (locktag).locktag_field2)
425 #define GET_PREDICATELOCKTARGETTAG_PAGE(locktag) \
426 	((BlockNumber) (locktag).locktag_field3)
427 #define GET_PREDICATELOCKTARGETTAG_OFFSET(locktag) \
428 	((OffsetNumber) (locktag).locktag_field4)
429 #define GET_PREDICATELOCKTARGETTAG_TYPE(locktag)							 \
430 	(((locktag).locktag_field4 != InvalidOffsetNumber) ? PREDLOCKTAG_TUPLE : \
431 	 (((locktag).locktag_field3 != InvalidBlockNumber) ? PREDLOCKTAG_PAGE :   \
432 	  PREDLOCKTAG_RELATION))
433 
434 /*
435  * Two-phase commit statefile records. There are two types: for each
436  * transaction, we generate one per-transaction record and a variable
437  * number of per-predicate-lock records.
438  */
439 typedef enum TwoPhasePredicateRecordType
440 {
441 	TWOPHASEPREDICATERECORD_XACT,
442 	TWOPHASEPREDICATERECORD_LOCK
443 } TwoPhasePredicateRecordType;
444 
445 /*
446  * Per-transaction information to reconstruct a SERIALIZABLEXACT. Not
447  * much is needed because most of it not meaningful for a recovered
448  * prepared transaction.
449  *
450  * In particular, we do not record the in and out conflict lists for a
451  * prepared transaction because the associated SERIALIZABLEXACTs will
452  * not be available after recovery. Instead, we simply record the
453  * existence of each type of conflict by setting the transaction's
454  * summary conflict in/out flag.
455  */
456 typedef struct TwoPhasePredicateXactRecord
457 {
458 	TransactionId xmin;
459 	uint32		flags;
460 } TwoPhasePredicateXactRecord;
461 
462 /* Per-lock state */
463 typedef struct TwoPhasePredicateLockRecord
464 {
465 	PREDICATELOCKTARGETTAG target;
466 	uint32		filler;			/* to avoid length change in back-patched fix */
467 } TwoPhasePredicateLockRecord;
468 
469 typedef struct TwoPhasePredicateRecord
470 {
471 	TwoPhasePredicateRecordType type;
472 	union
473 	{
474 		TwoPhasePredicateXactRecord xactRecord;
475 		TwoPhasePredicateLockRecord lockRecord;
476 	}			data;
477 } TwoPhasePredicateRecord;
478 
479 /*
480  * Define a macro to use for an "empty" SERIALIZABLEXACT reference.
481  */
482 #define InvalidSerializableXact ((SERIALIZABLEXACT *) NULL)
483 
484 
485 /*
486  * Function definitions for functions needing awareness of predicate
487  * locking internals.
488  */
489 extern PredicateLockData *GetPredicateLockStatusData(void);
490 extern int	GetSafeSnapshotBlockingPids(int blocked_pid,
491 										int *output, int output_size);
492 
493 #endif							/* PREDICATE_INTERNALS_H */
494