1 /*-------------------------------------------------------------------------
2  *
3  * predicate_internals.h
4  *	  POSTGRES internal predicate locking definitions.
5  *
6  *
7  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * src/include/storage/predicate_internals.h
11  *
12  *-------------------------------------------------------------------------
13  */
14 #ifndef PREDICATE_INTERNALS_H
15 #define PREDICATE_INTERNALS_H
16 
17 #include "storage/lock.h"
18 
19 /*
20  * Commit number.
21  */
22 typedef uint64 SerCommitSeqNo;
23 
24 /*
25  * Reserved commit sequence numbers:
26  *	- 0 is reserved to indicate a non-existent SLRU entry; it cannot be
27  *	  used as a SerCommitSeqNo, even an invalid one
28  *	- InvalidSerCommitSeqNo is used to indicate a transaction that
29  *	  hasn't committed yet, so use a number greater than all valid
30  *	  ones to make comparison do the expected thing
31  *	- RecoverySerCommitSeqNo is used to refer to transactions that
32  *	  happened before a crash/recovery, since we restart the sequence
33  *	  at that point.  It's earlier than all normal sequence numbers,
34  *	  and is only used by recovered prepared transactions
35  */
36 #define InvalidSerCommitSeqNo		((SerCommitSeqNo) PG_UINT64_MAX)
37 #define RecoverySerCommitSeqNo		((SerCommitSeqNo) 1)
38 #define FirstNormalSerCommitSeqNo	((SerCommitSeqNo) 2)
39 
40 /*
41  * The SERIALIZABLEXACT struct contains information needed for each
42  * serializable database transaction to support SSI techniques.
43  *
44  * A home-grown list is maintained in shared memory to manage these.
45  * An entry is used when the serializable transaction acquires a snapshot.
46  * Unless the transaction is rolled back, this entry must generally remain
47  * until all concurrent transactions have completed.  (There are special
48  * optimizations for READ ONLY transactions which often allow them to be
49  * cleaned up earlier.)  A transaction which is rolled back is cleaned up
50  * as soon as possible.
51  *
52  * Eligibility for cleanup of committed transactions is generally determined
53  * by comparing the transaction's finishedBefore field to
54  * SerializableGlobalXmin.
55  */
56 typedef struct SERIALIZABLEXACT
57 {
58 	VirtualTransactionId vxid;	/* The executing process always has one of
59 								 * these. */
60 
61 	/*
62 	 * We use two numbers to track the order that transactions commit. Before
63 	 * commit, a transaction is marked as prepared, and prepareSeqNo is set.
64 	 * Shortly after commit, it's marked as committed, and commitSeqNo is set.
65 	 * This doesn't give a strict commit order, but these two values together
66 	 * are good enough for us, as we can always err on the safe side and
67 	 * assume that there's a conflict, if we can't be sure of the exact
68 	 * ordering of two commits.
69 	 *
70 	 * Note that a transaction is marked as prepared for a short period during
71 	 * commit processing, even if two-phase commit is not used. But with
72 	 * two-phase commit, a transaction can stay in prepared state for some
73 	 * time.
74 	 */
75 	SerCommitSeqNo prepareSeqNo;
76 	SerCommitSeqNo commitSeqNo;
77 
78 	/* these values are not both interesting at the same time */
79 	union
80 	{
81 		SerCommitSeqNo earliestOutConflictCommit;	/* when committed with
82 													 * conflict out */
83 		SerCommitSeqNo lastCommitBeforeSnapshot;	/* when not committed or
84 													 * no conflict out */
85 	}			SeqNo;
86 	SHM_QUEUE	outConflicts;	/* list of write transactions whose data we
87 								 * couldn't read. */
88 	SHM_QUEUE	inConflicts;	/* list of read transactions which couldn't
89 								 * see our write. */
90 	SHM_QUEUE	predicateLocks; /* list of associated PREDICATELOCK objects */
91 	SHM_QUEUE	finishedLink;	/* list link in
92 								 * FinishedSerializableTransactions */
93 
94 	/*
95 	 * for r/o transactions: list of concurrent r/w transactions that we could
96 	 * potentially have conflicts with, and vice versa for r/w transactions
97 	 */
98 	SHM_QUEUE	possibleUnsafeConflicts;
99 
100 	TransactionId topXid;		/* top level xid for the transaction, if one
101 								 * exists; else invalid */
102 	TransactionId finishedBefore;	/* invalid means still running; else the
103 									 * struct expires when no serializable
104 									 * xids are before this. */
105 	TransactionId xmin;			/* the transaction's snapshot xmin */
106 	uint32		flags;			/* OR'd combination of values defined below */
107 	int			pid;			/* pid of associated process */
108 } SERIALIZABLEXACT;
109 
110 #define SXACT_FLAG_COMMITTED			0x00000001	/* already committed */
111 #define SXACT_FLAG_PREPARED				0x00000002	/* about to commit */
112 #define SXACT_FLAG_ROLLED_BACK			0x00000004	/* already rolled back */
113 #define SXACT_FLAG_DOOMED				0x00000008	/* will roll back */
114 /*
115  * The following flag actually means that the flagged transaction has a
116  * conflict out *to a transaction which committed ahead of it*.  It's hard
117  * to get that into a name of a reasonable length.
118  */
119 #define SXACT_FLAG_CONFLICT_OUT			0x00000010
120 #define SXACT_FLAG_READ_ONLY			0x00000020
121 #define SXACT_FLAG_DEFERRABLE_WAITING	0x00000040
122 #define SXACT_FLAG_RO_SAFE				0x00000080
123 #define SXACT_FLAG_RO_UNSAFE			0x00000100
124 #define SXACT_FLAG_SUMMARY_CONFLICT_IN	0x00000200
125 #define SXACT_FLAG_SUMMARY_CONFLICT_OUT 0x00000400
126 
127 /*
128  * The following types are used to provide an ad hoc list for holding
129  * SERIALIZABLEXACT objects.  An HTAB is overkill, since there is no need to
130  * access these by key -- there are direct pointers to these objects where
131  * needed.  If a shared memory list is created, these types can probably be
132  * eliminated in favor of using the general solution.
133  */
134 typedef struct PredXactListElementData
135 {
136 	SHM_QUEUE	link;
137 	SERIALIZABLEXACT sxact;
138 }			PredXactListElementData;
139 
140 typedef struct PredXactListElementData *PredXactListElement;
141 
142 #define PredXactListElementDataSize \
143 		((Size)MAXALIGN(sizeof(PredXactListElementData)))
144 
145 typedef struct PredXactListData
146 {
147 	SHM_QUEUE	availableList;
148 	SHM_QUEUE	activeList;
149 
150 	/*
151 	 * These global variables are maintained when registering and cleaning up
152 	 * serializable transactions.  They must be global across all backends,
153 	 * but are not needed outside the predicate.c source file. Protected by
154 	 * SerializableXactHashLock.
155 	 */
156 	TransactionId SxactGlobalXmin;	/* global xmin for active serializable
157 									 * transactions */
158 	int			SxactGlobalXminCount;	/* how many active serializable
159 										 * transactions have this xmin */
160 	int			WritableSxactCount; /* how many non-read-only serializable
161 									 * transactions are active */
162 	SerCommitSeqNo LastSxactCommitSeqNo;	/* a strictly monotonically
163 											 * increasing number for commits
164 											 * of serializable transactions */
165 	/* Protected by SerializableXactHashLock. */
166 	SerCommitSeqNo CanPartialClearThrough;	/* can clear predicate locks and
167 											 * inConflicts for committed
168 											 * transactions through this seq
169 											 * no */
170 	/* Protected by SerializableFinishedListLock. */
171 	SerCommitSeqNo HavePartialClearedThrough;	/* have cleared through this
172 												 * seq no */
173 	SERIALIZABLEXACT *OldCommittedSxact;	/* shared copy of dummy sxact */
174 
175 	PredXactListElement element;
176 }			PredXactListData;
177 
178 typedef struct PredXactListData *PredXactList;
179 
180 #define PredXactListDataSize \
181 		((Size)MAXALIGN(sizeof(PredXactListData)))
182 
183 
184 /*
185  * The following types are used to provide lists of rw-conflicts between
186  * pairs of transactions.  Since exactly the same information is needed,
187  * they are also used to record possible unsafe transaction relationships
188  * for purposes of identifying safe snapshots for read-only transactions.
189  *
190  * When a RWConflictData is not in use to record either type of relationship
191  * between a pair of transactions, it is kept on an "available" list.  The
192  * outLink field is used for maintaining that list.
193  */
194 typedef struct RWConflictData
195 {
196 	SHM_QUEUE	outLink;		/* link for list of conflicts out from a sxact */
197 	SHM_QUEUE	inLink;			/* link for list of conflicts in to a sxact */
198 	SERIALIZABLEXACT *sxactOut;
199 	SERIALIZABLEXACT *sxactIn;
200 }			RWConflictData;
201 
202 typedef struct RWConflictData *RWConflict;
203 
204 #define RWConflictDataSize \
205 		((Size)MAXALIGN(sizeof(RWConflictData)))
206 
207 typedef struct RWConflictPoolHeaderData
208 {
209 	SHM_QUEUE	availableList;
210 	RWConflict	element;
211 }			RWConflictPoolHeaderData;
212 
213 typedef struct RWConflictPoolHeaderData *RWConflictPoolHeader;
214 
215 #define RWConflictPoolHeaderDataSize \
216 		((Size)MAXALIGN(sizeof(RWConflictPoolHeaderData)))
217 
218 
219 /*
220  * The SERIALIZABLEXIDTAG struct identifies an xid assigned to a serializable
221  * transaction or any of its subtransactions.
222  */
223 typedef struct SERIALIZABLEXIDTAG
224 {
225 	TransactionId xid;
226 } SERIALIZABLEXIDTAG;
227 
228 /*
229  * The SERIALIZABLEXID struct provides a link from a TransactionId for a
230  * serializable transaction to the related SERIALIZABLEXACT record, even if
231  * the transaction has completed and its connection has been closed.
232  *
233  * These are created as new top level transaction IDs are first assigned to
234  * transactions which are participating in predicate locking.  This may
235  * never happen for a particular transaction if it doesn't write anything.
236  * They are removed with their related serializable transaction objects.
237  *
238  * The SubTransGetTopmostTransaction method is used where necessary to get
239  * from an XID which might be from a subtransaction to the top level XID.
240  */
241 typedef struct SERIALIZABLEXID
242 {
243 	/* hash key */
244 	SERIALIZABLEXIDTAG tag;
245 
246 	/* data */
247 	SERIALIZABLEXACT *myXact;	/* pointer to the top level transaction data */
248 } SERIALIZABLEXID;
249 
250 
251 /*
252  * The PREDICATELOCKTARGETTAG struct identifies a database object which can
253  * be the target of predicate locks.
254  *
255  * Note that the hash function being used doesn't properly respect tag
256  * length -- if the length of the structure isn't a multiple of four bytes it
257  * will go to a four byte boundary past the end of the tag.  If you change
258  * this struct, make sure any slack space is initialized, so that any random
259  * bytes in the middle or at the end are not included in the hash.
260  *
261  * TODO SSI: If we always use the same fields for the same type of value, we
262  * should rename these.  Holding off until it's clear there are no exceptions.
263  * Since indexes are relations with blocks and tuples, it's looking likely that
264  * the rename will be possible.  If not, we may need to divide the last field
265  * and use part of it for a target type, so that we know how to interpret the
266  * data..
267  */
268 typedef struct PREDICATELOCKTARGETTAG
269 {
270 	uint32		locktag_field1; /* a 32-bit ID field */
271 	uint32		locktag_field2; /* a 32-bit ID field */
272 	uint32		locktag_field3; /* a 32-bit ID field */
273 	uint32		locktag_field4; /* a 32-bit ID field */
274 } PREDICATELOCKTARGETTAG;
275 
276 /*
277  * The PREDICATELOCKTARGET struct represents a database object on which there
278  * are predicate locks.
279  *
280  * A hash list of these objects is maintained in shared memory.  An entry is
281  * added when a predicate lock is requested on an object which doesn't
282  * already have one.  An entry is removed when the last lock is removed from
283  * its list.
284  */
285 typedef struct PREDICATELOCKTARGET
286 {
287 	/* hash key */
288 	PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */
289 
290 	/* data */
291 	SHM_QUEUE	predicateLocks; /* list of PREDICATELOCK objects assoc. with
292 								 * predicate lock target */
293 } PREDICATELOCKTARGET;
294 
295 
296 /*
297  * The PREDICATELOCKTAG struct identifies an individual predicate lock.
298  *
299  * It is the combination of predicate lock target (which is a lockable
300  * object) and a serializable transaction which has acquired a lock on that
301  * target.
302  */
303 typedef struct PREDICATELOCKTAG
304 {
305 	PREDICATELOCKTARGET *myTarget;
306 	SERIALIZABLEXACT *myXact;
307 } PREDICATELOCKTAG;
308 
309 /*
310  * The PREDICATELOCK struct represents an individual lock.
311  *
312  * An entry can be created here when the related database object is read, or
313  * by promotion of multiple finer-grained targets.  All entries related to a
314  * serializable transaction are removed when that serializable transaction is
315  * cleaned up.  Entries can also be removed when they are combined into a
316  * single coarser-grained lock entry.
317  */
318 typedef struct PREDICATELOCK
319 {
320 	/* hash key */
321 	PREDICATELOCKTAG tag;		/* unique identifier of lock */
322 
323 	/* data */
324 	SHM_QUEUE	targetLink;		/* list link in PREDICATELOCKTARGET's list of
325 								 * predicate locks */
326 	SHM_QUEUE	xactLink;		/* list link in SERIALIZABLEXACT's list of
327 								 * predicate locks */
328 	SerCommitSeqNo commitSeqNo; /* only used for summarized predicate locks */
329 } PREDICATELOCK;
330 
331 
332 /*
333  * The LOCALPREDICATELOCK struct represents a local copy of data which is
334  * also present in the PREDICATELOCK table, organized for fast access without
335  * needing to acquire a LWLock.  It is strictly for optimization.
336  *
337  * Each serializable transaction creates its own local hash table to hold a
338  * collection of these.  This information is used to determine when a number
339  * of fine-grained locks should be promoted to a single coarser-grained lock.
340  * The information is maintained more-or-less in parallel to the
341  * PREDICATELOCK data, but because this data is not protected by locks and is
342  * only used in an optimization heuristic, it is allowed to drift in a few
343  * corner cases where maintaining exact data would be expensive.
344  *
345  * The hash table is created when the serializable transaction acquires its
346  * snapshot, and its memory is released upon completion of the transaction.
347  */
348 typedef struct LOCALPREDICATELOCK
349 {
350 	/* hash key */
351 	PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */
352 
353 	/* data */
354 	bool		held;			/* is lock held, or just its children?	*/
355 	int			childLocks;		/* number of child locks currently held */
356 } LOCALPREDICATELOCK;
357 
358 
359 /*
360  * The types of predicate locks which can be acquired.
361  */
362 typedef enum PredicateLockTargetType
363 {
364 	PREDLOCKTAG_RELATION,
365 	PREDLOCKTAG_PAGE,
366 	PREDLOCKTAG_TUPLE
367 	/* TODO SSI: Other types may be needed for index locking */
368 } PredicateLockTargetType;
369 
370 
371 /*
372  * This structure is used to quickly capture a copy of all predicate
373  * locks.  This is currently used only by the pg_lock_status function,
374  * which in turn is used by the pg_locks view.
375  */
376 typedef struct PredicateLockData
377 {
378 	int			nelements;
379 	PREDICATELOCKTARGETTAG *locktags;
380 	SERIALIZABLEXACT *xacts;
381 } PredicateLockData;
382 
383 
384 /*
385  * These macros define how we map logical IDs of lockable objects into the
386  * physical fields of PREDICATELOCKTARGETTAG.   Use these to set up values,
387  * rather than accessing the fields directly.  Note multiple eval of target!
388  */
389 #define SET_PREDICATELOCKTARGETTAG_RELATION(locktag,dboid,reloid) \
390 	((locktag).locktag_field1 = (dboid), \
391 	 (locktag).locktag_field2 = (reloid), \
392 	 (locktag).locktag_field3 = InvalidBlockNumber, \
393 	 (locktag).locktag_field4 = InvalidOffsetNumber)
394 
395 #define SET_PREDICATELOCKTARGETTAG_PAGE(locktag,dboid,reloid,blocknum) \
396 	((locktag).locktag_field1 = (dboid), \
397 	 (locktag).locktag_field2 = (reloid), \
398 	 (locktag).locktag_field3 = (blocknum), \
399 	 (locktag).locktag_field4 = InvalidOffsetNumber)
400 
401 #define SET_PREDICATELOCKTARGETTAG_TUPLE(locktag,dboid,reloid,blocknum,offnum) \
402 	((locktag).locktag_field1 = (dboid), \
403 	 (locktag).locktag_field2 = (reloid), \
404 	 (locktag).locktag_field3 = (blocknum), \
405 	 (locktag).locktag_field4 = (offnum))
406 
407 #define GET_PREDICATELOCKTARGETTAG_DB(locktag) \
408 	((Oid) (locktag).locktag_field1)
409 #define GET_PREDICATELOCKTARGETTAG_RELATION(locktag) \
410 	((Oid) (locktag).locktag_field2)
411 #define GET_PREDICATELOCKTARGETTAG_PAGE(locktag) \
412 	((BlockNumber) (locktag).locktag_field3)
413 #define GET_PREDICATELOCKTARGETTAG_OFFSET(locktag) \
414 	((OffsetNumber) (locktag).locktag_field4)
415 #define GET_PREDICATELOCKTARGETTAG_TYPE(locktag)							 \
416 	(((locktag).locktag_field4 != InvalidOffsetNumber) ? PREDLOCKTAG_TUPLE : \
417 	 (((locktag).locktag_field3 != InvalidBlockNumber) ? PREDLOCKTAG_PAGE :   \
418 	  PREDLOCKTAG_RELATION))
419 
420 /*
421  * Two-phase commit statefile records. There are two types: for each
422  * transaction, we generate one per-transaction record and a variable
423  * number of per-predicate-lock records.
424  */
425 typedef enum TwoPhasePredicateRecordType
426 {
427 	TWOPHASEPREDICATERECORD_XACT,
428 	TWOPHASEPREDICATERECORD_LOCK
429 } TwoPhasePredicateRecordType;
430 
431 /*
432  * Per-transaction information to reconstruct a SERIALIZABLEXACT. Not
433  * much is needed because most of it not meaningful for a recovered
434  * prepared transaction.
435  *
436  * In particular, we do not record the in and out conflict lists for a
437  * prepared transaction because the associated SERIALIZABLEXACTs will
438  * not be available after recovery. Instead, we simply record the
439  * existence of each type of conflict by setting the transaction's
440  * summary conflict in/out flag.
441  */
442 typedef struct TwoPhasePredicateXactRecord
443 {
444 	TransactionId xmin;
445 	uint32		flags;
446 } TwoPhasePredicateXactRecord;
447 
448 /* Per-lock state */
449 typedef struct TwoPhasePredicateLockRecord
450 {
451 	PREDICATELOCKTARGETTAG target;
452 	uint32		filler;			/* to avoid length change in back-patched fix */
453 } TwoPhasePredicateLockRecord;
454 
455 typedef struct TwoPhasePredicateRecord
456 {
457 	TwoPhasePredicateRecordType type;
458 	union
459 	{
460 		TwoPhasePredicateXactRecord xactRecord;
461 		TwoPhasePredicateLockRecord lockRecord;
462 	}			data;
463 } TwoPhasePredicateRecord;
464 
465 /*
466  * Define a macro to use for an "empty" SERIALIZABLEXACT reference.
467  */
468 #define InvalidSerializableXact ((SERIALIZABLEXACT *) NULL)
469 
470 
471 /*
472  * Function definitions for functions needing awareness of predicate
473  * locking internals.
474  */
475 extern PredicateLockData *GetPredicateLockStatusData(void);
476 extern int GetSafeSnapshotBlockingPids(int blocked_pid,
477 							int *output, int output_size);
478 
479 #endif							/* PREDICATE_INTERNALS_H */
480