/*-------------------------------------------------------------------------
 *
 * syncscan.c
 *	  scan synchronization support
 *
 * When multiple backends run a sequential scan on the same table, we try
 * to keep them synchronized to reduce the overall I/O needed.  The goal is
 * to read each page into shared buffer cache only once, and let all backends
 * that take part in the shared scan process the page before it falls out of
 * the cache.
 *
 * Since the "leader" in a pack of backends doing a seqscan will have to wait
 * for I/O, while the "followers" don't, there is a strong self-synchronizing
 * effect once we can get the backends examining approximately the same part
 * of the table at the same time.  Hence all that is really needed is to get
 * a new backend beginning a seqscan to begin it close to where other backends
 * are reading.  We can scan the table circularly, from block X up to the
 * end and then from block 0 to X-1, to ensure we visit all rows while still
 * participating in the common scan.
 *
 * To accomplish that, we keep track of the scan position of each table, and
 * start new scans close to where the previous scan(s) are.  We don't try to
 * do any extra synchronization to keep the scans together afterwards; some
 * scans might progress much more slowly than others, for example if the
 * results need to be transferred to the client over a slow network, and we
 * don't want such queries to slow down others.
 *
 * There can realistically only be a few large sequential scans on different
 * tables in progress at any time.  Therefore we just keep the scan positions
 * in a small LRU list which we scan every time we need to look up or update a
 * scan position.  The whole mechanism is only applied for tables exceeding
 * a threshold size (but that is not the concern of this module).
 *
 * INTERFACE ROUTINES
 *		ss_get_location		- return current scan location of a relation
 *		ss_report_location	- update current scan location
 *
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/common/syncscan.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/syncscan.h"
#include "miscadmin.h"
#include "storage/lwlock.h"
#include "storage/shmem.h"
#include "utils/rel.h"


/* GUC variables */
#ifdef TRACE_SYNCSCAN
bool		trace_syncscan = false;
#endif


/*
 * Size of the LRU list.
 *
 * Note: the code assumes that SYNC_SCAN_NELEM > 1.
 *
 * XXX: What's a good value? It should be large enough to hold the
 * maximum number of large tables scanned simultaneously.  But a larger value
 * means more traversing of the LRU list when starting a new scan.
 */
#define SYNC_SCAN_NELEM 20

/*
 * Interval between reports of the location of the current scan, in pages.
 *
 * Note: This should be smaller than the ring size (see buffer/freelist.c)
 * we use for bulk reads.  Otherwise a scan joining other scans might start
 * from a page that's no longer in the buffer cache.  This is a bit fuzzy;
 * there's no guarantee that the new scan will read the page before it leaves
 * the buffer cache anyway, and on the other hand the page is most likely
 * still in the OS cache.
 */
#define SYNC_SCAN_REPORT_INTERVAL (128 * 1024 / BLCKSZ)


/*
 * The scan locations structure is essentially a doubly-linked LRU with head
 * and tail pointer, but designed to hold a fixed maximum number of elements in
 * fixed-size shared memory.
 */

/* One tracked scan position: which relation, and the last block reported. */
typedef struct ss_scan_location_t
{
	RelFileNode relfilenode;	/* identity of a relation */
	BlockNumber location;		/* last-reported location in the relation */
} ss_scan_location_t;

/* One node of the doubly-linked LRU list; lives in shared memory. */
typedef struct ss_lru_item_t
{
	struct ss_lru_item_t *prev;
	struct ss_lru_item_t *next;
	ss_scan_location_t location;
} ss_lru_item_t;

/*
 * The whole shared structure: LRU head/tail pointers plus a fixed array of
 * SYNC_SCAN_NELEM items that the list links thread through.  All pointers
 * here point into the same shared-memory segment.
 */
typedef struct ss_scan_locations_t
{
	ss_lru_item_t *head;
	ss_lru_item_t *tail;
	ss_lru_item_t items[FLEXIBLE_ARRAY_MEMBER]; /* SYNC_SCAN_NELEM items */
} ss_scan_locations_t;

/* Shared-memory footprint for an N-element list (header + item array). */
#define SizeOfScanLocations(N) \
	(offsetof(ss_scan_locations_t, items) + (N) * sizeof(ss_lru_item_t))

/* Pointer to struct in shared memory */
static ss_scan_locations_t *scan_locations;

/* prototypes for internal functions */
static BlockNumber ss_search(RelFileNode relfilenode,
							 BlockNumber location, bool set);


/*
 * SyncScanShmemSize --- report amount of shared memory space needed
 */
Size
SyncScanShmemSize(void)
{
	return SizeOfScanLocations(SYNC_SCAN_NELEM);
}

/*
 * SyncScanShmemInit --- initialize this module's shared memory
 *
 * The first process to attach (the postmaster, or a standalone backend)
 * creates and initializes the structure; subsequently-forked backends find
 * it already set up and merely record the pointer.
 */
void
SyncScanShmemInit(void)
{
	int			i;
	bool		found;

	scan_locations = (ss_scan_locations_t *)
		ShmemInitStruct("Sync Scan Locations List",
						SizeOfScanLocations(SYNC_SCAN_NELEM),
						&found);

	if (!IsUnderPostmaster)
	{
		/* Initialize shared memory area */
		Assert(!found);

		/* Link the fixed item array into a doubly-linked list, in order. */
		scan_locations->head = &scan_locations->items[0];
		scan_locations->tail = &scan_locations->items[SYNC_SCAN_NELEM - 1];

		for (i = 0; i < SYNC_SCAN_NELEM; i++)
		{
			ss_lru_item_t *item = &scan_locations->items[i];

			/*
			 * Initialize all slots with invalid values. As scans are started,
			 * these invalid entries will fall off the LRU list and get
			 * replaced with real entries.
			 */
			item->location.relfilenode.spcNode = InvalidOid;
			item->location.relfilenode.dbNode = InvalidOid;
			item->location.relfilenode.relNode = InvalidOid;
			item->location.location = InvalidBlockNumber;

			/* First item has no prev; last item has no next. */
			item->prev = (i > 0) ?
				(&scan_locations->items[i - 1]) : NULL;
			item->next = (i < SYNC_SCAN_NELEM - 1) ?
				(&scan_locations->items[i + 1]) : NULL;
		}
	}
	else
		Assert(found);
}

/*
 * ss_search --- search the scan_locations structure for an entry with the
 *		given relfilenode.
 *
 * If "set" is true, the location is updated to the given location.  If no
 * entry for the given relfilenode is found, it will be created at the head
 * of the list with the given location, even if "set" is false.
 *
 * In any case, the location after possible update is returned.
 *
 * Caller is responsible for having acquired suitable lock on the shared
 * data structure.
 */
static BlockNumber
ss_search(RelFileNode relfilenode, BlockNumber location, bool set)
{
	ss_lru_item_t *item;

	/*
	 * Walk from the MRU end.  The loop always terminates: the tail item has
	 * next == NULL, and on reaching it we either match or evict it.
	 */
	item = scan_locations->head;
	for (;;)
	{
		bool		match;

		match = RelFileNodeEquals(item->location.relfilenode, relfilenode);

		if (match || item->next == NULL)
		{
			/*
			 * If we reached the end of list and no match was found, take over
			 * the last entry
			 */
			if (!match)
			{
				item->location.relfilenode = relfilenode;
				item->location.location = location;
			}
			else if (set)
				item->location.location = location;

			/* Move the entry to the front of the LRU list */
			if (item != scan_locations->head)
			{
				/* unlink */
				if (item == scan_locations->tail)
					scan_locations->tail = item->prev;
				item->prev->next = item->next;
				if (item->next)
					item->next->prev = item->prev;

				/* link */
				item->prev = NULL;
				item->next = scan_locations->head;
				scan_locations->head->prev = item;
				scan_locations->head = item;
			}

			return item->location.location;
		}

		item = item->next;
	}

	/* not reached */
}

/*
 * ss_get_location --- get the optimal starting location for scan
 *
 * Returns the last-reported location of a sequential scan on the
 * relation, or 0 if no valid location is found.
 *
 * We expect the caller has just done RelationGetNumberOfBlocks(), and
 * so that number is passed in rather than computing it again.  The result
 * is guaranteed less than relnblocks (assuming that's > 0).
 *
 * Note: per ss_search's contract, a lookup for a relation with no existing
 * entry creates one (at block 0), so a lone scan "syncs with itself".
 */
BlockNumber
ss_get_location(Relation rel, BlockNumber relnblocks)
{
	BlockNumber startloc;

	LWLockAcquire(SyncScanLock, LW_EXCLUSIVE);
	startloc = ss_search(rel->rd_node, 0, false);
	LWLockRelease(SyncScanLock);

	/*
	 * If the location is not a valid block number for this scan, start at 0.
	 *
	 * This can happen if for instance a VACUUM truncated the table since the
	 * location was saved.
	 */
	if (startloc >= relnblocks)
		startloc = 0;

#ifdef TRACE_SYNCSCAN
	if (trace_syncscan)
		elog(LOG,
			 "SYNC_SCAN: start \"%s\" (size %u) at %u",
			 RelationGetRelationName(rel), relnblocks, startloc);
#endif

	return startloc;
}

/*
 * ss_report_location --- update the current scan location
 *
 * Writes an entry into the shared Sync Scan state of the form
 * (relfilenode, blocknumber), overwriting any existing entry for the
 * same relfilenode.
 */
void
ss_report_location(Relation rel, BlockNumber location)
{
#ifdef TRACE_SYNCSCAN
	if (trace_syncscan)
	{
		if ((location % 1024) == 0)
			elog(LOG,
				 "SYNC_SCAN: scanning \"%s\" at %u",
				 RelationGetRelationName(rel), location);
	}
#endif

	/*
	 * To reduce lock contention, only report scan progress every N pages. For
	 * the same reason, don't block if the lock isn't immediately available.
	 * Missing a few updates isn't critical, it just means that a new scan
	 * that wants to join the pack will start a little bit behind the head of
	 * the scan.  Hopefully the pages are still in OS cache and the scan
	 * catches up quickly.
	 */
	if ((location % SYNC_SCAN_REPORT_INTERVAL) == 0)
	{
		if (LWLockConditionalAcquire(SyncScanLock, LW_EXCLUSIVE))
		{
			(void) ss_search(rel->rd_node, location, true);
			LWLockRelease(SyncScanLock);
		}
#ifdef TRACE_SYNCSCAN
		else if (trace_syncscan)
			elog(LOG,
				 "SYNC_SCAN: missed update for \"%s\" at %u",
				 RelationGetRelationName(rel), location);
#endif
	}
}