1 /*------------------------------------------------------------------------- 2 * 3 * predicate_internals.h 4 * POSTGRES internal predicate locking definitions. 5 * 6 * 7 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group 8 * Portions Copyright (c) 1994, Regents of the University of California 9 * 10 * src/include/storage/predicate_internals.h 11 * 12 *------------------------------------------------------------------------- 13 */ 14 #ifndef PREDICATE_INTERNALS_H 15 #define PREDICATE_INTERNALS_H 16 17 #include "storage/lock.h" 18 #include "storage/lwlock.h" 19 20 /* 21 * Commit number. 22 */ 23 typedef uint64 SerCommitSeqNo; 24 25 /* 26 * Reserved commit sequence numbers: 27 * - 0 is reserved to indicate a non-existent SLRU entry; it cannot be 28 * used as a SerCommitSeqNo, even an invalid one 29 * - InvalidSerCommitSeqNo is used to indicate a transaction that 30 * hasn't committed yet, so use a number greater than all valid 31 * ones to make comparison do the expected thing 32 * - RecoverySerCommitSeqNo is used to refer to transactions that 33 * happened before a crash/recovery, since we restart the sequence 34 * at that point. It's earlier than all normal sequence numbers, 35 * and is only used by recovered prepared transactions 36 */ 37 #define InvalidSerCommitSeqNo ((SerCommitSeqNo) PG_UINT64_MAX) 38 #define RecoverySerCommitSeqNo ((SerCommitSeqNo) 1) 39 #define FirstNormalSerCommitSeqNo ((SerCommitSeqNo) 2) 40 41 /* 42 * The SERIALIZABLEXACT struct contains information needed for each 43 * serializable database transaction to support SSI techniques. 44 * 45 * A home-grown list is maintained in shared memory to manage these. 46 * An entry is used when the serializable transaction acquires a snapshot. 47 * Unless the transaction is rolled back, this entry must generally remain 48 * until all concurrent transactions have completed. (There are special 49 * optimizations for READ ONLY transactions which often allow them to be 50 * cleaned up earlier.) A transaction which is rolled back is cleaned up 51 * as soon as possible. 52 * 53 * Eligibility for cleanup of committed transactions is generally determined 54 * by comparing the transaction's finishedBefore field to 55 * SxactGlobalXmin. 56 */ 57 typedef struct SERIALIZABLEXACT 58 { 59 VirtualTransactionId vxid; /* The executing process always has one of 60 * these. */ 61 62 /* 63 * We use two numbers to track the order that transactions commit. Before 64 * commit, a transaction is marked as prepared, and prepareSeqNo is set. 65 * Shortly after commit, it's marked as committed, and commitSeqNo is set. 66 * This doesn't give a strict commit order, but these two values together 67 * are good enough for us, as we can always err on the safe side and 68 * assume that there's a conflict, if we can't be sure of the exact 69 * ordering of two commits. 70 * 71 * Note that a transaction is marked as prepared for a short period during 72 * commit processing, even if two-phase commit is not used. But with 73 * two-phase commit, a transaction can stay in prepared state for some 74 * time. 75 */ 76 SerCommitSeqNo prepareSeqNo; 77 SerCommitSeqNo commitSeqNo; 78 79 /* these values are not both interesting at the same time */ 80 union 81 { 82 SerCommitSeqNo earliestOutConflictCommit; /* when committed with 83 * conflict out */ 84 SerCommitSeqNo lastCommitBeforeSnapshot; /* when not committed or 85 * no conflict out */ 86 } SeqNo; 87 SHM_QUEUE outConflicts; /* list of write transactions whose data we 88 * couldn't read. */ 89 SHM_QUEUE inConflicts; /* list of read transactions which couldn't 90 * see our write. */ 91 SHM_QUEUE predicateLocks; /* list of associated PREDICATELOCK objects */ 92 SHM_QUEUE finishedLink; /* list link in 93 * FinishedSerializableTransactions */ 94 95 /* 96 * perXactPredicateListLock is only used in parallel queries: it protects 97 * this SERIALIZABLEXACT's predicate lock list against other workers of 98 * the same session. 99 */ 100 LWLock perXactPredicateListLock; 101 102 /* 103 * for r/o transactions: list of concurrent r/w transactions that we could 104 * potentially have conflicts with, and vice versa for r/w transactions 105 */ 106 SHM_QUEUE possibleUnsafeConflicts; 107 108 TransactionId topXid; /* top level xid for the transaction, if one 109 * exists; else invalid */ 110 TransactionId finishedBefore; /* invalid means still running; else the 111 * struct expires when no serializable 112 * xids are before this. */ 113 TransactionId xmin; /* the transaction's snapshot xmin */ 114 uint32 flags; /* OR'd combination of values defined below */ 115 int pid; /* pid of associated process */ 116 } SERIALIZABLEXACT; 117 118 #define SXACT_FLAG_COMMITTED 0x00000001 /* already committed */ 119 #define SXACT_FLAG_PREPARED 0x00000002 /* about to commit */ 120 #define SXACT_FLAG_ROLLED_BACK 0x00000004 /* already rolled back */ 121 #define SXACT_FLAG_DOOMED 0x00000008 /* will roll back */ 122 /* 123 * The following flag actually means that the flagged transaction has a 124 * conflict out *to a transaction which committed ahead of it*. It's hard 125 * to get that into a name of a reasonable length. 126 */ 127 #define SXACT_FLAG_CONFLICT_OUT 0x00000010 128 #define SXACT_FLAG_READ_ONLY 0x00000020 129 #define SXACT_FLAG_DEFERRABLE_WAITING 0x00000040 130 #define SXACT_FLAG_RO_SAFE 0x00000080 131 #define SXACT_FLAG_RO_UNSAFE 0x00000100 132 #define SXACT_FLAG_SUMMARY_CONFLICT_IN 0x00000200 133 #define SXACT_FLAG_SUMMARY_CONFLICT_OUT 0x00000400 134 /* 135 * The following flag means the transaction has been partially released 136 * already, but is being preserved because parallel workers might have a 137 * reference to it. It'll be recycled by the leader at end-of-transaction. 138 */ 139 #define SXACT_FLAG_PARTIALLY_RELEASED 0x00000800 140 141 /* 142 * The following types are used to provide an ad hoc list for holding 143 * SERIALIZABLEXACT objects. An HTAB is overkill, since there is no need to 144 * access these by key -- there are direct pointers to these objects where 145 * needed. If a shared memory list is created, these types can probably be 146 * eliminated in favor of using the general solution. 147 */ 148 typedef struct PredXactListElementData 149 { 150 SHM_QUEUE link; 151 SERIALIZABLEXACT sxact; 152 } PredXactListElementData; 153 154 typedef struct PredXactListElementData *PredXactListElement; 155 156 #define PredXactListElementDataSize \ 157 ((Size)MAXALIGN(sizeof(PredXactListElementData))) 158 159 typedef struct PredXactListData 160 { 161 SHM_QUEUE availableList; 162 SHM_QUEUE activeList; 163 164 /* 165 * These global variables are maintained when registering and cleaning up 166 * serializable transactions. They must be global across all backends, 167 * but are not needed outside the predicate.c source file. Protected by 168 * SerializableXactHashLock. 169 */ 170 TransactionId SxactGlobalXmin; /* global xmin for active serializable 171 * transactions */ 172 int SxactGlobalXminCount; /* how many active serializable 173 * transactions have this xmin */ 174 int WritableSxactCount; /* how many non-read-only serializable 175 * transactions are active */ 176 SerCommitSeqNo LastSxactCommitSeqNo; /* a strictly monotonically 177 * increasing number for commits 178 * of serializable transactions */ 179 /* Protected by SerializableXactHashLock. */ 180 SerCommitSeqNo CanPartialClearThrough; /* can clear predicate locks and 181 * inConflicts for committed 182 * transactions through this seq 183 * no */ 184 /* Protected by SerializableFinishedListLock. */ 185 SerCommitSeqNo HavePartialClearedThrough; /* have cleared through this 186 * seq no */ 187 SERIALIZABLEXACT *OldCommittedSxact; /* shared copy of dummy sxact */ 188 189 PredXactListElement element; 190 } PredXactListData; 191 192 typedef struct PredXactListData *PredXactList; 193 194 #define PredXactListDataSize \ 195 ((Size)MAXALIGN(sizeof(PredXactListData))) 196 197 198 /* 199 * The following types are used to provide lists of rw-conflicts between 200 * pairs of transactions. Since exactly the same information is needed, 201 * they are also used to record possible unsafe transaction relationships 202 * for purposes of identifying safe snapshots for read-only transactions. 203 * 204 * When a RWConflictData is not in use to record either type of relationship 205 * between a pair of transactions, it is kept on an "available" list. The 206 * outLink field is used for maintaining that list. 207 */ 208 typedef struct RWConflictData 209 { 210 SHM_QUEUE outLink; /* link for list of conflicts out from a sxact */ 211 SHM_QUEUE inLink; /* link for list of conflicts in to a sxact */ 212 SERIALIZABLEXACT *sxactOut; 213 SERIALIZABLEXACT *sxactIn; 214 } RWConflictData; 215 216 typedef struct RWConflictData *RWConflict; 217 218 #define RWConflictDataSize \ 219 ((Size)MAXALIGN(sizeof(RWConflictData))) 220 221 typedef struct RWConflictPoolHeaderData 222 { 223 SHM_QUEUE availableList; 224 RWConflict element; 225 } RWConflictPoolHeaderData; 226 227 typedef struct RWConflictPoolHeaderData *RWConflictPoolHeader; 228 229 #define RWConflictPoolHeaderDataSize \ 230 ((Size)MAXALIGN(sizeof(RWConflictPoolHeaderData))) 231 232 233 /* 234 * The SERIALIZABLEXIDTAG struct identifies an xid assigned to a serializable 235 * transaction or any of its subtransactions. 236 */ 237 typedef struct SERIALIZABLEXIDTAG 238 { 239 TransactionId xid; 240 } SERIALIZABLEXIDTAG; 241 242 /* 243 * The SERIALIZABLEXID struct provides a link from a TransactionId for a 244 * serializable transaction to the related SERIALIZABLEXACT record, even if 245 * the transaction has completed and its connection has been closed. 246 * 247 * These are created as new top level transaction IDs are first assigned to 248 * transactions which are participating in predicate locking. This may 249 * never happen for a particular transaction if it doesn't write anything. 250 * They are removed with their related serializable transaction objects. 251 * 252 * The SubTransGetTopmostTransaction method is used where necessary to get 253 * from an XID which might be from a subtransaction to the top level XID. 254 */ 255 typedef struct SERIALIZABLEXID 256 { 257 /* hash key */ 258 SERIALIZABLEXIDTAG tag; 259 260 /* data */ 261 SERIALIZABLEXACT *myXact; /* pointer to the top level transaction data */ 262 } SERIALIZABLEXID; 263 264 265 /* 266 * The PREDICATELOCKTARGETTAG struct identifies a database object which can 267 * be the target of predicate locks. 268 * 269 * Note that the hash function being used doesn't properly respect tag 270 * length -- if the length of the structure isn't a multiple of four bytes it 271 * will go to a four byte boundary past the end of the tag. If you change 272 * this struct, make sure any slack space is initialized, so that any random 273 * bytes in the middle or at the end are not included in the hash. 274 * 275 * TODO SSI: If we always use the same fields for the same type of value, we 276 * should rename these. Holding off until it's clear there are no exceptions. 277 * Since indexes are relations with blocks and tuples, it's looking likely that 278 * the rename will be possible. If not, we may need to divide the last field 279 * and use part of it for a target type, so that we know how to interpret the 280 * data.. 281 */ 282 typedef struct PREDICATELOCKTARGETTAG 283 { 284 uint32 locktag_field1; /* a 32-bit ID field */ 285 uint32 locktag_field2; /* a 32-bit ID field */ 286 uint32 locktag_field3; /* a 32-bit ID field */ 287 uint32 locktag_field4; /* a 32-bit ID field */ 288 } PREDICATELOCKTARGETTAG; 289 290 /* 291 * The PREDICATELOCKTARGET struct represents a database object on which there 292 * are predicate locks. 293 * 294 * A hash list of these objects is maintained in shared memory. An entry is 295 * added when a predicate lock is requested on an object which doesn't 296 * already have one. An entry is removed when the last lock is removed from 297 * its list. 298 */ 299 typedef struct PREDICATELOCKTARGET 300 { 301 /* hash key */ 302 PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */ 303 304 /* data */ 305 SHM_QUEUE predicateLocks; /* list of PREDICATELOCK objects assoc. with 306 * predicate lock target */ 307 } PREDICATELOCKTARGET; 308 309 310 /* 311 * The PREDICATELOCKTAG struct identifies an individual predicate lock. 312 * 313 * It is the combination of predicate lock target (which is a lockable 314 * object) and a serializable transaction which has acquired a lock on that 315 * target. 316 */ 317 typedef struct PREDICATELOCKTAG 318 { 319 PREDICATELOCKTARGET *myTarget; 320 SERIALIZABLEXACT *myXact; 321 } PREDICATELOCKTAG; 322 323 /* 324 * The PREDICATELOCK struct represents an individual lock. 325 * 326 * An entry can be created here when the related database object is read, or 327 * by promotion of multiple finer-grained targets. All entries related to a 328 * serializable transaction are removed when that serializable transaction is 329 * cleaned up. Entries can also be removed when they are combined into a 330 * single coarser-grained lock entry. 331 */ 332 typedef struct PREDICATELOCK 333 { 334 /* hash key */ 335 PREDICATELOCKTAG tag; /* unique identifier of lock */ 336 337 /* data */ 338 SHM_QUEUE targetLink; /* list link in PREDICATELOCKTARGET's list of 339 * predicate locks */ 340 SHM_QUEUE xactLink; /* list link in SERIALIZABLEXACT's list of 341 * predicate locks */ 342 SerCommitSeqNo commitSeqNo; /* only used for summarized predicate locks */ 343 } PREDICATELOCK; 344 345 346 /* 347 * The LOCALPREDICATELOCK struct represents a local copy of data which is 348 * also present in the PREDICATELOCK table, organized for fast access without 349 * needing to acquire a LWLock. It is strictly for optimization. 350 * 351 * Each serializable transaction creates its own local hash table to hold a 352 * collection of these. This information is used to determine when a number 353 * of fine-grained locks should be promoted to a single coarser-grained lock. 354 * The information is maintained more-or-less in parallel to the 355 * PREDICATELOCK data, but because this data is not protected by locks and is 356 * only used in an optimization heuristic, it is allowed to drift in a few 357 * corner cases where maintaining exact data would be expensive. 358 * 359 * The hash table is created when the serializable transaction acquires its 360 * snapshot, and its memory is released upon completion of the transaction. 361 */ 362 typedef struct LOCALPREDICATELOCK 363 { 364 /* hash key */ 365 PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */ 366 367 /* data */ 368 bool held; /* is lock held, or just its children? */ 369 int childLocks; /* number of child locks currently held */ 370 } LOCALPREDICATELOCK; 371 372 373 /* 374 * The types of predicate locks which can be acquired. 375 */ 376 typedef enum PredicateLockTargetType 377 { 378 PREDLOCKTAG_RELATION, 379 PREDLOCKTAG_PAGE, 380 PREDLOCKTAG_TUPLE 381 /* TODO SSI: Other types may be needed for index locking */ 382 } PredicateLockTargetType; 383 384 385 /* 386 * This structure is used to quickly capture a copy of all predicate 387 * locks. This is currently used only by the pg_lock_status function, 388 * which in turn is used by the pg_locks view. 389 */ 390 typedef struct PredicateLockData 391 { 392 int nelements; 393 PREDICATELOCKTARGETTAG *locktags; 394 SERIALIZABLEXACT *xacts; 395 } PredicateLockData; 396 397 398 /* 399 * These macros define how we map logical IDs of lockable objects into the 400 * physical fields of PREDICATELOCKTARGETTAG. Use these to set up values, 401 * rather than accessing the fields directly. Note multiple eval of target! 402 */ 403 #define SET_PREDICATELOCKTARGETTAG_RELATION(locktag,dboid,reloid) \ 404 ((locktag).locktag_field1 = (dboid), \ 405 (locktag).locktag_field2 = (reloid), \ 406 (locktag).locktag_field3 = InvalidBlockNumber, \ 407 (locktag).locktag_field4 = InvalidOffsetNumber) 408 409 #define SET_PREDICATELOCKTARGETTAG_PAGE(locktag,dboid,reloid,blocknum) \ 410 ((locktag).locktag_field1 = (dboid), \ 411 (locktag).locktag_field2 = (reloid), \ 412 (locktag).locktag_field3 = (blocknum), \ 413 (locktag).locktag_field4 = InvalidOffsetNumber) 414 415 #define SET_PREDICATELOCKTARGETTAG_TUPLE(locktag,dboid,reloid,blocknum,offnum) \ 416 ((locktag).locktag_field1 = (dboid), \ 417 (locktag).locktag_field2 = (reloid), \ 418 (locktag).locktag_field3 = (blocknum), \ 419 (locktag).locktag_field4 = (offnum)) 420 421 #define GET_PREDICATELOCKTARGETTAG_DB(locktag) \ 422 ((Oid) (locktag).locktag_field1) 423 #define GET_PREDICATELOCKTARGETTAG_RELATION(locktag) \ 424 ((Oid) (locktag).locktag_field2) 425 #define GET_PREDICATELOCKTARGETTAG_PAGE(locktag) \ 426 ((BlockNumber) (locktag).locktag_field3) 427 #define GET_PREDICATELOCKTARGETTAG_OFFSET(locktag) \ 428 ((OffsetNumber) (locktag).locktag_field4) 429 #define GET_PREDICATELOCKTARGETTAG_TYPE(locktag) \ 430 (((locktag).locktag_field4 != InvalidOffsetNumber) ? PREDLOCKTAG_TUPLE : \ 431 (((locktag).locktag_field3 != InvalidBlockNumber) ? PREDLOCKTAG_PAGE : \ 432 PREDLOCKTAG_RELATION)) 433 434 /* 435 * Two-phase commit statefile records. There are two types: for each 436 * transaction, we generate one per-transaction record and a variable 437 * number of per-predicate-lock records. 438 */ 439 typedef enum TwoPhasePredicateRecordType 440 { 441 TWOPHASEPREDICATERECORD_XACT, 442 TWOPHASEPREDICATERECORD_LOCK 443 } TwoPhasePredicateRecordType; 444 445 /* 446 * Per-transaction information to reconstruct a SERIALIZABLEXACT. Not 447 * much is needed because most of it not meaningful for a recovered 448 * prepared transaction. 449 * 450 * In particular, we do not record the in and out conflict lists for a 451 * prepared transaction because the associated SERIALIZABLEXACTs will 452 * not be available after recovery. Instead, we simply record the 453 * existence of each type of conflict by setting the transaction's 454 * summary conflict in/out flag. 455 */ 456 typedef struct TwoPhasePredicateXactRecord 457 { 458 TransactionId xmin; 459 uint32 flags; 460 } TwoPhasePredicateXactRecord; 461 462 /* Per-lock state */ 463 typedef struct TwoPhasePredicateLockRecord 464 { 465 PREDICATELOCKTARGETTAG target; 466 uint32 filler; /* to avoid length change in back-patched fix */ 467 } TwoPhasePredicateLockRecord; 468 469 typedef struct TwoPhasePredicateRecord 470 { 471 TwoPhasePredicateRecordType type; 472 union 473 { 474 TwoPhasePredicateXactRecord xactRecord; 475 TwoPhasePredicateLockRecord lockRecord; 476 } data; 477 } TwoPhasePredicateRecord; 478 479 /* 480 * Define a macro to use for an "empty" SERIALIZABLEXACT reference. 481 */ 482 #define InvalidSerializableXact ((SERIALIZABLEXACT *) NULL) 483 484 485 /* 486 * Function definitions for functions needing awareness of predicate 487 * locking internals. 488 */ 489 extern PredicateLockData *GetPredicateLockStatusData(void); 490 extern int GetSafeSnapshotBlockingPids(int blocked_pid, 491 int *output, int output_size); 492 493 #endif /* PREDICATE_INTERNALS_H */ 494