1 /*------------------------------------------------------------------------- 2 * 3 * predicate_internals.h 4 * POSTGRES internal predicate locking definitions. 5 * 6 * 7 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group 8 * Portions Copyright (c) 1994, Regents of the University of California 9 * 10 * src/include/storage/predicate_internals.h 11 * 12 *------------------------------------------------------------------------- 13 */ 14 #ifndef PREDICATE_INTERNALS_H 15 #define PREDICATE_INTERNALS_H 16 17 #include "storage/lock.h" 18 19 /* 20 * Commit number. 21 */ 22 typedef uint64 SerCommitSeqNo; 23 24 /* 25 * Reserved commit sequence numbers: 26 * - 0 is reserved to indicate a non-existent SLRU entry; it cannot be 27 * used as a SerCommitSeqNo, even an invalid one 28 * - InvalidSerCommitSeqNo is used to indicate a transaction that 29 * hasn't committed yet, so use a number greater than all valid 30 * ones to make comparison do the expected thing 31 * - RecoverySerCommitSeqNo is used to refer to transactions that 32 * happened before a crash/recovery, since we restart the sequence 33 * at that point. It's earlier than all normal sequence numbers, 34 * and is only used by recovered prepared transactions 35 */ 36 #define InvalidSerCommitSeqNo ((SerCommitSeqNo) PG_UINT64_MAX) 37 #define RecoverySerCommitSeqNo ((SerCommitSeqNo) 1) 38 #define FirstNormalSerCommitSeqNo ((SerCommitSeqNo) 2) 39 40 /* 41 * The SERIALIZABLEXACT struct contains information needed for each 42 * serializable database transaction to support SSI techniques. 43 * 44 * A home-grown list is maintained in shared memory to manage these. 45 * An entry is used when the serializable transaction acquires a snapshot. 46 * Unless the transaction is rolled back, this entry must generally remain 47 * until all concurrent transactions have completed. (There are special 48 * optimizations for READ ONLY transactions which often allow them to be 49 * cleaned up earlier.) A transaction which is rolled back is cleaned up 50 * as soon as possible. 51 * 52 * Eligibility for cleanup of committed transactions is generally determined 53 * by comparing the transaction's finishedBefore field to 54 * SerializableGlobalXmin. 55 */ 56 typedef struct SERIALIZABLEXACT 57 { 58 VirtualTransactionId vxid; /* The executing process always has one of 59 * these. */ 60 61 /* 62 * We use two numbers to track the order that transactions commit. Before 63 * commit, a transaction is marked as prepared, and prepareSeqNo is set. 64 * Shortly after commit, it's marked as committed, and commitSeqNo is set. 65 * This doesn't give a strict commit order, but these two values together 66 * are good enough for us, as we can always err on the safe side and 67 * assume that there's a conflict, if we can't be sure of the exact 68 * ordering of two commits. 69 * 70 * Note that a transaction is marked as prepared for a short period during 71 * commit processing, even if two-phase commit is not used. But with 72 * two-phase commit, a transaction can stay in prepared state for some 73 * time. 74 */ 75 SerCommitSeqNo prepareSeqNo; 76 SerCommitSeqNo commitSeqNo; 77 78 /* these values are not both interesting at the same time */ 79 union 80 { 81 SerCommitSeqNo earliestOutConflictCommit; /* when committed with 82 * conflict out */ 83 SerCommitSeqNo lastCommitBeforeSnapshot; /* when not committed or 84 * no conflict out */ 85 } SeqNo; 86 SHM_QUEUE outConflicts; /* list of write transactions whose data we 87 * couldn't read. */ 88 SHM_QUEUE inConflicts; /* list of read transactions which couldn't 89 * see our write. */ 90 SHM_QUEUE predicateLocks; /* list of associated PREDICATELOCK objects */ 91 SHM_QUEUE finishedLink; /* list link in 92 * FinishedSerializableTransactions */ 93 94 /* 95 * for r/o transactions: list of concurrent r/w transactions that we could 96 * potentially have conflicts with, and vice versa for r/w transactions 97 */ 98 SHM_QUEUE possibleUnsafeConflicts; 99 100 TransactionId topXid; /* top level xid for the transaction, if one 101 * exists; else invalid */ 102 TransactionId finishedBefore; /* invalid means still running; else the 103 * struct expires when no serializable 104 * xids are before this. */ 105 TransactionId xmin; /* the transaction's snapshot xmin */ 106 uint32 flags; /* OR'd combination of values defined below */ 107 int pid; /* pid of associated process */ 108 } SERIALIZABLEXACT; 109 110 #define SXACT_FLAG_COMMITTED 0x00000001 /* already committed */ 111 #define SXACT_FLAG_PREPARED 0x00000002 /* about to commit */ 112 #define SXACT_FLAG_ROLLED_BACK 0x00000004 /* already rolled back */ 113 #define SXACT_FLAG_DOOMED 0x00000008 /* will roll back */ 114 /* 115 * The following flag actually means that the flagged transaction has a 116 * conflict out *to a transaction which committed ahead of it*. It's hard 117 * to get that into a name of a reasonable length. 118 */ 119 #define SXACT_FLAG_CONFLICT_OUT 0x00000010 120 #define SXACT_FLAG_READ_ONLY 0x00000020 121 #define SXACT_FLAG_DEFERRABLE_WAITING 0x00000040 122 #define SXACT_FLAG_RO_SAFE 0x00000080 123 #define SXACT_FLAG_RO_UNSAFE 0x00000100 124 #define SXACT_FLAG_SUMMARY_CONFLICT_IN 0x00000200 125 #define SXACT_FLAG_SUMMARY_CONFLICT_OUT 0x00000400 126 127 /* 128 * The following types are used to provide an ad hoc list for holding 129 * SERIALIZABLEXACT objects. An HTAB is overkill, since there is no need to 130 * access these by key -- there are direct pointers to these objects where 131 * needed. If a shared memory list is created, these types can probably be 132 * eliminated in favor of using the general solution. 133 */ 134 typedef struct PredXactListElementData 135 { 136 SHM_QUEUE link; 137 SERIALIZABLEXACT sxact; 138 } PredXactListElementData; 139 140 typedef struct PredXactListElementData *PredXactListElement; 141 142 #define PredXactListElementDataSize \ 143 ((Size)MAXALIGN(sizeof(PredXactListElementData))) 144 145 typedef struct PredXactListData 146 { 147 SHM_QUEUE availableList; 148 SHM_QUEUE activeList; 149 150 /* 151 * These global variables are maintained when registering and cleaning up 152 * serializable transactions. They must be global across all backends, 153 * but are not needed outside the predicate.c source file. Protected by 154 * SerializableXactHashLock. 155 */ 156 TransactionId SxactGlobalXmin; /* global xmin for active serializable 157 * transactions */ 158 int SxactGlobalXminCount; /* how many active serializable 159 * transactions have this xmin */ 160 int WritableSxactCount; /* how many non-read-only serializable 161 * transactions are active */ 162 SerCommitSeqNo LastSxactCommitSeqNo; /* a strictly monotonically 163 * increasing number for commits 164 * of serializable transactions */ 165 /* Protected by SerializableXactHashLock. */ 166 SerCommitSeqNo CanPartialClearThrough; /* can clear predicate locks and 167 * inConflicts for committed 168 * transactions through this seq 169 * no */ 170 /* Protected by SerializableFinishedListLock. */ 171 SerCommitSeqNo HavePartialClearedThrough; /* have cleared through this 172 * seq no */ 173 SERIALIZABLEXACT *OldCommittedSxact; /* shared copy of dummy sxact */ 174 175 PredXactListElement element; 176 } PredXactListData; 177 178 typedef struct PredXactListData *PredXactList; 179 180 #define PredXactListDataSize \ 181 ((Size)MAXALIGN(sizeof(PredXactListData))) 182 183 184 /* 185 * The following types are used to provide lists of rw-conflicts between 186 * pairs of transactions. Since exactly the same information is needed, 187 * they are also used to record possible unsafe transaction relationships 188 * for purposes of identifying safe snapshots for read-only transactions. 189 * 190 * When a RWConflictData is not in use to record either type of relationship 191 * between a pair of transactions, it is kept on an "available" list. The 192 * outLink field is used for maintaining that list. 193 */ 194 typedef struct RWConflictData 195 { 196 SHM_QUEUE outLink; /* link for list of conflicts out from a sxact */ 197 SHM_QUEUE inLink; /* link for list of conflicts in to a sxact */ 198 SERIALIZABLEXACT *sxactOut; 199 SERIALIZABLEXACT *sxactIn; 200 } RWConflictData; 201 202 typedef struct RWConflictData *RWConflict; 203 204 #define RWConflictDataSize \ 205 ((Size)MAXALIGN(sizeof(RWConflictData))) 206 207 typedef struct RWConflictPoolHeaderData 208 { 209 SHM_QUEUE availableList; 210 RWConflict element; 211 } RWConflictPoolHeaderData; 212 213 typedef struct RWConflictPoolHeaderData *RWConflictPoolHeader; 214 215 #define RWConflictPoolHeaderDataSize \ 216 ((Size)MAXALIGN(sizeof(RWConflictPoolHeaderData))) 217 218 219 /* 220 * The SERIALIZABLEXIDTAG struct identifies an xid assigned to a serializable 221 * transaction or any of its subtransactions. 222 */ 223 typedef struct SERIALIZABLEXIDTAG 224 { 225 TransactionId xid; 226 } SERIALIZABLEXIDTAG; 227 228 /* 229 * The SERIALIZABLEXID struct provides a link from a TransactionId for a 230 * serializable transaction to the related SERIALIZABLEXACT record, even if 231 * the transaction has completed and its connection has been closed. 232 * 233 * These are created as new top level transaction IDs are first assigned to 234 * transactions which are participating in predicate locking. This may 235 * never happen for a particular transaction if it doesn't write anything. 236 * They are removed with their related serializable transaction objects. 237 * 238 * The SubTransGetTopmostTransaction method is used where necessary to get 239 * from an XID which might be from a subtransaction to the top level XID. 240 */ 241 typedef struct SERIALIZABLEXID 242 { 243 /* hash key */ 244 SERIALIZABLEXIDTAG tag; 245 246 /* data */ 247 SERIALIZABLEXACT *myXact; /* pointer to the top level transaction data */ 248 } SERIALIZABLEXID; 249 250 251 /* 252 * The PREDICATELOCKTARGETTAG struct identifies a database object which can 253 * be the target of predicate locks. 254 * 255 * Note that the hash function being used doesn't properly respect tag 256 * length -- if the length of the structure isn't a multiple of four bytes it 257 * will go to a four byte boundary past the end of the tag. If you change 258 * this struct, make sure any slack space is initialized, so that any random 259 * bytes in the middle or at the end are not included in the hash. 260 * 261 * TODO SSI: If we always use the same fields for the same type of value, we 262 * should rename these. Holding off until it's clear there are no exceptions. 263 * Since indexes are relations with blocks and tuples, it's looking likely that 264 * the rename will be possible. If not, we may need to divide the last field 265 * and use part of it for a target type, so that we know how to interpret the 266 * data.. 267 */ 268 typedef struct PREDICATELOCKTARGETTAG 269 { 270 uint32 locktag_field1; /* a 32-bit ID field */ 271 uint32 locktag_field2; /* a 32-bit ID field */ 272 uint32 locktag_field3; /* a 32-bit ID field */ 273 uint32 locktag_field4; /* a 32-bit ID field */ 274 } PREDICATELOCKTARGETTAG; 275 276 /* 277 * The PREDICATELOCKTARGET struct represents a database object on which there 278 * are predicate locks. 279 * 280 * A hash list of these objects is maintained in shared memory. An entry is 281 * added when a predicate lock is requested on an object which doesn't 282 * already have one. An entry is removed when the last lock is removed from 283 * its list. 284 */ 285 typedef struct PREDICATELOCKTARGET 286 { 287 /* hash key */ 288 PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */ 289 290 /* data */ 291 SHM_QUEUE predicateLocks; /* list of PREDICATELOCK objects assoc. with 292 * predicate lock target */ 293 } PREDICATELOCKTARGET; 294 295 296 /* 297 * The PREDICATELOCKTAG struct identifies an individual predicate lock. 298 * 299 * It is the combination of predicate lock target (which is a lockable 300 * object) and a serializable transaction which has acquired a lock on that 301 * target. 302 */ 303 typedef struct PREDICATELOCKTAG 304 { 305 PREDICATELOCKTARGET *myTarget; 306 SERIALIZABLEXACT *myXact; 307 } PREDICATELOCKTAG; 308 309 /* 310 * The PREDICATELOCK struct represents an individual lock. 311 * 312 * An entry can be created here when the related database object is read, or 313 * by promotion of multiple finer-grained targets. All entries related to a 314 * serializable transaction are removed when that serializable transaction is 315 * cleaned up. Entries can also be removed when they are combined into a 316 * single coarser-grained lock entry. 317 */ 318 typedef struct PREDICATELOCK 319 { 320 /* hash key */ 321 PREDICATELOCKTAG tag; /* unique identifier of lock */ 322 323 /* data */ 324 SHM_QUEUE targetLink; /* list link in PREDICATELOCKTARGET's list of 325 * predicate locks */ 326 SHM_QUEUE xactLink; /* list link in SERIALIZABLEXACT's list of 327 * predicate locks */ 328 SerCommitSeqNo commitSeqNo; /* only used for summarized predicate locks */ 329 } PREDICATELOCK; 330 331 332 /* 333 * The LOCALPREDICATELOCK struct represents a local copy of data which is 334 * also present in the PREDICATELOCK table, organized for fast access without 335 * needing to acquire a LWLock. It is strictly for optimization. 336 * 337 * Each serializable transaction creates its own local hash table to hold a 338 * collection of these. This information is used to determine when a number 339 * of fine-grained locks should be promoted to a single coarser-grained lock. 340 * The information is maintained more-or-less in parallel to the 341 * PREDICATELOCK data, but because this data is not protected by locks and is 342 * only used in an optimization heuristic, it is allowed to drift in a few 343 * corner cases where maintaining exact data would be expensive. 344 * 345 * The hash table is created when the serializable transaction acquires its 346 * snapshot, and its memory is released upon completion of the transaction. 347 */ 348 typedef struct LOCALPREDICATELOCK 349 { 350 /* hash key */ 351 PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */ 352 353 /* data */ 354 bool held; /* is lock held, or just its children? */ 355 int childLocks; /* number of child locks currently held */ 356 } LOCALPREDICATELOCK; 357 358 359 /* 360 * The types of predicate locks which can be acquired. 361 */ 362 typedef enum PredicateLockTargetType 363 { 364 PREDLOCKTAG_RELATION, 365 PREDLOCKTAG_PAGE, 366 PREDLOCKTAG_TUPLE 367 /* TODO SSI: Other types may be needed for index locking */ 368 } PredicateLockTargetType; 369 370 371 /* 372 * This structure is used to quickly capture a copy of all predicate 373 * locks. This is currently used only by the pg_lock_status function, 374 * which in turn is used by the pg_locks view. 375 */ 376 typedef struct PredicateLockData 377 { 378 int nelements; 379 PREDICATELOCKTARGETTAG *locktags; 380 SERIALIZABLEXACT *xacts; 381 } PredicateLockData; 382 383 384 /* 385 * These macros define how we map logical IDs of lockable objects into the 386 * physical fields of PREDICATELOCKTARGETTAG. Use these to set up values, 387 * rather than accessing the fields directly. Note multiple eval of target! 388 */ 389 #define SET_PREDICATELOCKTARGETTAG_RELATION(locktag,dboid,reloid) \ 390 ((locktag).locktag_field1 = (dboid), \ 391 (locktag).locktag_field2 = (reloid), \ 392 (locktag).locktag_field3 = InvalidBlockNumber, \ 393 (locktag).locktag_field4 = InvalidOffsetNumber) 394 395 #define SET_PREDICATELOCKTARGETTAG_PAGE(locktag,dboid,reloid,blocknum) \ 396 ((locktag).locktag_field1 = (dboid), \ 397 (locktag).locktag_field2 = (reloid), \ 398 (locktag).locktag_field3 = (blocknum), \ 399 (locktag).locktag_field4 = InvalidOffsetNumber) 400 401 #define SET_PREDICATELOCKTARGETTAG_TUPLE(locktag,dboid,reloid,blocknum,offnum) \ 402 ((locktag).locktag_field1 = (dboid), \ 403 (locktag).locktag_field2 = (reloid), \ 404 (locktag).locktag_field3 = (blocknum), \ 405 (locktag).locktag_field4 = (offnum)) 406 407 #define GET_PREDICATELOCKTARGETTAG_DB(locktag) \ 408 ((Oid) (locktag).locktag_field1) 409 #define GET_PREDICATELOCKTARGETTAG_RELATION(locktag) \ 410 ((Oid) (locktag).locktag_field2) 411 #define GET_PREDICATELOCKTARGETTAG_PAGE(locktag) \ 412 ((BlockNumber) (locktag).locktag_field3) 413 #define GET_PREDICATELOCKTARGETTAG_OFFSET(locktag) \ 414 ((OffsetNumber) (locktag).locktag_field4) 415 #define GET_PREDICATELOCKTARGETTAG_TYPE(locktag) \ 416 (((locktag).locktag_field4 != InvalidOffsetNumber) ? PREDLOCKTAG_TUPLE : \ 417 (((locktag).locktag_field3 != InvalidBlockNumber) ? PREDLOCKTAG_PAGE : \ 418 PREDLOCKTAG_RELATION)) 419 420 /* 421 * Two-phase commit statefile records. There are two types: for each 422 * transaction, we generate one per-transaction record and a variable 423 * number of per-predicate-lock records. 424 */ 425 typedef enum TwoPhasePredicateRecordType 426 { 427 TWOPHASEPREDICATERECORD_XACT, 428 TWOPHASEPREDICATERECORD_LOCK 429 } TwoPhasePredicateRecordType; 430 431 /* 432 * Per-transaction information to reconstruct a SERIALIZABLEXACT. Not 433 * much is needed because most of it not meaningful for a recovered 434 * prepared transaction. 435 * 436 * In particular, we do not record the in and out conflict lists for a 437 * prepared transaction because the associated SERIALIZABLEXACTs will 438 * not be available after recovery. Instead, we simply record the 439 * existence of each type of conflict by setting the transaction's 440 * summary conflict in/out flag. 441 */ 442 typedef struct TwoPhasePredicateXactRecord 443 { 444 TransactionId xmin; 445 uint32 flags; 446 } TwoPhasePredicateXactRecord; 447 448 /* Per-lock state */ 449 typedef struct TwoPhasePredicateLockRecord 450 { 451 PREDICATELOCKTARGETTAG target; 452 uint32 filler; /* to avoid length change in back-patched fix */ 453 } TwoPhasePredicateLockRecord; 454 455 typedef struct TwoPhasePredicateRecord 456 { 457 TwoPhasePredicateRecordType type; 458 union 459 { 460 TwoPhasePredicateXactRecord xactRecord; 461 TwoPhasePredicateLockRecord lockRecord; 462 } data; 463 } TwoPhasePredicateRecord; 464 465 /* 466 * Define a macro to use for an "empty" SERIALIZABLEXACT reference. 467 */ 468 #define InvalidSerializableXact ((SERIALIZABLEXACT *) NULL) 469 470 471 /* 472 * Function definitions for functions needing awareness of predicate 473 * locking internals. 474 */ 475 extern PredicateLockData *GetPredicateLockStatusData(void); 476 extern int GetSafeSnapshotBlockingPids(int blocked_pid, 477 int *output, int output_size); 478 479 #endif /* PREDICATE_INTERNALS_H */ 480