/*-------------------------------------------------------------------------
 *
 * syncscan.c
 *	  heap scan synchronization support
 *
 * When multiple backends run a sequential scan on the same table, we try
 * to keep them synchronized to reduce the overall I/O needed.  The goal is
 * to read each page into shared buffer cache only once, and let all backends
 * that take part in the shared scan process the page before it falls out of
 * the cache.
 *
 * Since the "leader" in a pack of backends doing a seqscan will have to wait
 * for I/O, while the "followers" don't, there is a strong self-synchronizing
 * effect once we can get the backends examining approximately the same part
 * of the table at the same time.  Hence all that is really needed is to get
 * a new backend beginning a seqscan to begin it close to where other backends
 * are reading.  We can scan the table circularly, from block X up to the
 * end and then from block 0 to X-1, to ensure we visit all rows while still
 * participating in the common scan.
 *
 * To accomplish that, we keep track of the scan position of each table, and
 * start new scans close to where the previous scan(s) are.  We don't try to
 * do any extra synchronization to keep the scans together afterwards; some
 * scans might progress much more slowly than others, for example if the
 * results need to be transferred to the client over a slow network, and we
 * don't want such queries to slow down others.
 *
 * There can realistically only be a few large sequential scans on different
 * tables in progress at any time.  Therefore we just keep the scan positions
 * in a small LRU list which we scan every time we need to look up or update a
 * scan position.  The whole mechanism is only applied for tables exceeding
 * a threshold size (but that is not the concern of this module).
 *
 * INTERFACE ROUTINES
 *		ss_get_location		- return current scan location of a relation
 *		ss_report_location	- update current scan location
 *
 * (A sketch of the expected calling pattern follows this header comment.)
 *
 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/heap/syncscan.c
 *
 *-------------------------------------------------------------------------
 */
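
/*
 * Expected usage (a sketch, not code from this module): the heapam code is
 * the intended caller, and it is assumed to drive a synchronized scan
 * roughly as follows.  The names "rel", "nblocks", "start", and "page" here
 * are illustrative only.
 *
 *		nblocks = RelationGetNumberOfBlocks(rel);
 *		start = ss_get_location(rel, nblocks);
 *		page = start;
 *		do
 *		{
 *			... read and process block "page" ...
 *			ss_report_location(rel, page);
 *			page = (page + 1) % nblocks;	(wrap around circularly)
 *		} while (page != start);
 */
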
#include "postgres.h"

#include "access/heapam.h"
#include "miscadmin.h"
#include "storage/lwlock.h"
#include "storage/shmem.h"
#include "utils/rel.h"


/* GUC variables */
#ifdef TRACE_SYNCSCAN
bool		trace_syncscan = false;
#endif


/*
 * Size of the LRU list.
 *
 * Note: the code assumes that SYNC_SCAN_NELEM > 1.
 *
 * XXX: What's a good value? It should be large enough to hold the
 * maximum number of large tables scanned simultaneously.  But a larger value
 * means more traversing of the LRU list when starting a new scan.
 */
#define SYNC_SCAN_NELEM 20

/*
 * Interval between reports of the location of the current scan, in pages.
 *
 * Note: This should be smaller than the ring size (see buffer/freelist.c)
 * we use for bulk reads.  Otherwise a scan joining other scans might start
 * from a page that's no longer in the buffer cache.  This is a bit fuzzy;
 * there's no guarantee that the new scan will read the page before it leaves
 * the buffer cache anyway, and on the other hand the page is most likely
 * still in the OS cache.
 */
#define SYNC_SCAN_REPORT_INTERVAL (128 * 1024 / BLCKSZ)
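
/*
 * Worked example: with the default BLCKSZ of 8192 bytes, the interval is
 * 128 * 1024 / 8192 = 16 pages, i.e. a report every 128 kB scanned.  The
 * bulk-read buffer ring in freelist.c is 256 kB by default, so with these
 * defaults a scan reports its position about twice per ring's worth of
 * pages.  (The exact ring size is freelist.c's business; this is just to
 * show the intended order of magnitude.)
 */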


/*
 * The scan locations structure is essentially a doubly-linked LRU with head
 * and tail pointer, but designed to hold a fixed maximum number of elements in
 * fixed-size shared memory.
 */
typedef struct ss_scan_location_t
{
	RelFileNode relfilenode;	/* identity of a relation */
	BlockNumber location;		/* last-reported location in the relation */
} ss_scan_location_t;

typedef struct ss_lru_item_t
{
	struct ss_lru_item_t *prev;
	struct ss_lru_item_t *next;
	ss_scan_location_t location;
} ss_lru_item_t;

typedef struct ss_scan_locations_t
{
	ss_lru_item_t *head;
	ss_lru_item_t *tail;
	ss_lru_item_t items[FLEXIBLE_ARRAY_MEMBER]; /* SYNC_SCAN_NELEM items */
} ss_scan_locations_t;

#define SizeOfScanLocations(N) \
	(offsetof(ss_scan_locations_t, items) + (N) * sizeof(ss_lru_item_t))
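
/*
 * For scale (an estimate, not a guarantee): on a typical 64-bit build,
 * sizeof(ss_lru_item_t) is 32 bytes (two 8-byte pointers, a 12-byte
 * RelFileNode, and a 4-byte BlockNumber), so SizeOfScanLocations(20) comes
 * to roughly 16 + 20 * 32 = 656 bytes of shared memory.  Exact figures
 * depend on the platform's alignment and padding rules.
 */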

/* Pointer to struct in shared memory */
static ss_scan_locations_t *scan_locations;

/* prototypes for internal functions */
static BlockNumber ss_search(RelFileNode relfilenode,
		  BlockNumber location, bool set);


/*
 * SyncScanShmemSize --- report amount of shared memory space needed
 */
Size
SyncScanShmemSize(void)
{
	return SizeOfScanLocations(SYNC_SCAN_NELEM);
}

/*
 * SyncScanShmemInit --- initialize this module's shared memory
 */
void
SyncScanShmemInit(void)
{
	int			i;
	bool		found;

	scan_locations = (ss_scan_locations_t *)
		ShmemInitStruct("Sync Scan Locations List",
						SizeOfScanLocations(SYNC_SCAN_NELEM),
						&found);

	if (!IsUnderPostmaster)
	{
		/* Initialize shared memory area */
		Assert(!found);

		scan_locations->head = &scan_locations->items[0];
		scan_locations->tail = &scan_locations->items[SYNC_SCAN_NELEM - 1];

		for (i = 0; i < SYNC_SCAN_NELEM; i++)
		{
			ss_lru_item_t *item = &scan_locations->items[i];

			/*
			 * Initialize all slots with invalid values. As scans are started,
			 * these invalid entries will fall off the LRU list and get
			 * replaced with real entries.
			 */
			item->location.relfilenode.spcNode = InvalidOid;
			item->location.relfilenode.dbNode = InvalidOid;
			item->location.relfilenode.relNode = InvalidOid;
			item->location.location = InvalidBlockNumber;

			item->prev = (i > 0) ?
				(&scan_locations->items[i - 1]) : NULL;
			item->next = (i < SYNC_SCAN_NELEM - 1) ?
				(&scan_locations->items[i + 1]) : NULL;
		}
	}
	else
		Assert(found);
}
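
/*
 * Both SyncScanShmemSize and SyncScanShmemInit are expected to be invoked
 * from the shared-memory setup code (see ipci.c) at postmaster startup;
 * backends started later attach to the already-initialized structure, which
 * is why the found/IsUnderPostmaster cases above are asserted rather than
 * handled.
 */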

/*
 * ss_search --- search the scan_locations structure for an entry with the
 *		given relfilenode.
 *
 * If "set" is true, the location is updated to the given location.  If no
 * entry for the given relfilenode is found, it will be created at the head
 * of the list with the given location, even if "set" is false.
 *
 * In any case, the location after possible update is returned.
 *
 * Caller is responsible for having acquired suitable lock on the shared
 * data structure.
 */
static BlockNumber
ss_search(RelFileNode relfilenode, BlockNumber location, bool set)
{
	ss_lru_item_t *item;

	item = scan_locations->head;
	for (;;)
	{
		bool		match;

		match = RelFileNodeEquals(item->location.relfilenode, relfilenode);

		if (match || item->next == NULL)
		{
			/*
			 * If we reached the end of list and no match was found, take over
			 * the last entry
			 */
			if (!match)
			{
				item->location.relfilenode = relfilenode;
				item->location.location = location;
			}
			else if (set)
				item->location.location = location;

			/* Move the entry to the front of the LRU list */
			if (item != scan_locations->head)
			{
				/* unlink */
				if (item == scan_locations->tail)
					scan_locations->tail = item->prev;
				item->prev->next = item->next;
				if (item->next)
					item->next->prev = item->prev;

				/* link */
				item->prev = NULL;
				item->next = scan_locations->head;
				scan_locations->head->prev = item;
				scan_locations->head = item;
			}

			return item->location.location;
		}

		item = item->next;
	}

	/* not reached */
}
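
/*
 * To illustrate the LRU behavior above (hypothetical relations A, B, C, D):
 * if the list reads [A, B, C] from head to tail, a call for C matches the
 * tail entry and moves it to the front, giving [C, A, B].  A subsequent call
 * for an unknown relation D finds no match, takes over the new tail entry
 * (B), and moves it to the front, giving [D, C, A].  Entries for relations
 * no longer being scanned thus age toward the tail and are eventually
 * recycled.
 */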

/*
 * ss_get_location --- get the optimal starting location for scan
 *
 * Returns the last-reported location of a sequential scan on the
 * relation, or 0 if no valid location is found.
 *
 * We expect the caller has just done RelationGetNumberOfBlocks(), and
 * so that number is passed in rather than computing it again.  The result
 * is guaranteed less than relnblocks (assuming that's > 0).
 */
BlockNumber
ss_get_location(Relation rel, BlockNumber relnblocks)
{
	BlockNumber startloc;

	LWLockAcquire(SyncScanLock, LW_EXCLUSIVE);
	startloc = ss_search(rel->rd_node, 0, false);
	LWLockRelease(SyncScanLock);

	/*
	 * If the location is not a valid block number for this scan, start at 0.
	 *
	 * This can happen if for instance a VACUUM truncated the table since the
	 * location was saved.
	 */
	if (startloc >= relnblocks)
		startloc = 0;

#ifdef TRACE_SYNCSCAN
	if (trace_syncscan)
		elog(LOG,
			 "SYNC_SCAN: start \"%s\" (size %u) at %u",
			 RelationGetRelationName(rel), relnblocks, startloc);
#endif

	return startloc;
}

/*
 * ss_report_location --- update the current scan location
 *
 * Writes an entry into the shared Sync Scan state of the form
 * (relfilenode, blocknumber), overwriting any existing entry for the
 * same relfilenode.
 */
void
ss_report_location(Relation rel, BlockNumber location)
{
#ifdef TRACE_SYNCSCAN
	if (trace_syncscan)
	{
		if ((location % 1024) == 0)
			elog(LOG,
				 "SYNC_SCAN: scanning \"%s\" at %u",
				 RelationGetRelationName(rel), location);
	}
#endif

	/*
	 * To reduce lock contention, only report scan progress every N pages. For
	 * the same reason, don't block if the lock isn't immediately available.
	 * Missing a few updates isn't critical, it just means that a new scan
	 * that wants to join the pack will start a little bit behind the head of
	 * the scan.  Hopefully the pages are still in OS cache and the scan
	 * catches up quickly.
	 */
	if ((location % SYNC_SCAN_REPORT_INTERVAL) == 0)
	{
		if (LWLockConditionalAcquire(SyncScanLock, LW_EXCLUSIVE))
		{
			(void) ss_search(rel->rd_node, location, true);
			LWLockRelease(SyncScanLock);
		}
#ifdef TRACE_SYNCSCAN
		else if (trace_syncscan)
			elog(LOG,
				 "SYNC_SCAN: missed update for \"%s\" at %u",
				 RelationGetRelationName(rel), location);
#endif
	}
}