/*-------------------------------------------------------------------------
 *
 * md.c
 *	  This code manages relations that reside on magnetic disk.
 *
 * Or at least, that was what the Berkeley folk had in mind when they named
 * this file.  In reality, what this code provides is an interface from
 * the smgr API to Unix-like filesystem APIs, so it will work with any type
 * of device for which the operating system provides filesystem support.
 * It doesn't matter whether the bits are on spinning rust or some other
 * storage technology.
 *
 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/storage/smgr/md.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <unistd.h>
#include <fcntl.h>
#include <sys/file.h>

#include "miscadmin.h"
#include "access/xlogutils.h"
#include "access/xlog.h"
#include "catalog/catalog.h"
#include "pgstat.h"
#include "portability/instr_time.h"
#include "postmaster/bgwriter.h"
#include "storage/fd.h"
#include "storage/bufmgr.h"
#include "storage/relfilenode.h"
#include "storage/smgr.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"
#include "pg_trace.h"


/* intervals for calling AbsorbFsyncRequests in mdsync and mdpostckpt */
#define FSYNCS_PER_ABSORB		10
#define UNLINKS_PER_ABSORB		10

/*
 * Special values for the segno arg to RememberFsyncRequest.
 *
 * Note that CompactCheckpointerRequestQueue assumes that it's OK to remove an
 * fsync request from the queue if an identical, subsequent request is found.
 * See comments there before making changes here.
 */
#define FORGET_RELATION_FSYNC	(InvalidBlockNumber)
#define FORGET_DATABASE_FSYNC	(InvalidBlockNumber-1)
#define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
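
/*
 * Illustrative sketch (not compiled): the sentinel values above sit at the
 * very top of the BlockNumber range, so they cannot collide with a real
 * segment number --- even a relation of 2^32-1 blocks has far fewer than
 * InvalidBlockNumber-2 segments.  The helper below is hypothetical and not
 * part of md.c; it merely shows how a reader of the request queue could
 * distinguish sentinels from ordinary segment numbers.
 */
#ifdef NOT_USED
static bool
segno_is_sentinel(BlockNumber segno)
{
	return segno == FORGET_RELATION_FSYNC ||
		segno == FORGET_DATABASE_FSYNC ||
		segno == UNLINK_RELATION_REQUEST;
}
#endif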

/*
 * On Windows, we have to interpret EACCES as possibly meaning the same as
 * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
 * that's what you get.  Ugh.  This code is designed so that we don't
 * actually believe these cases are okay without further evidence (namely,
 * a pending fsync request getting canceled ... see mdsync).
 */
#ifndef WIN32
#define FILE_POSSIBLY_DELETED(err)	((err) == ENOENT)
#else
#define FILE_POSSIBLY_DELETED(err)	((err) == ENOENT || (err) == EACCES)
#endif

/*
 *	The magnetic disk storage manager keeps track of open file
 *	descriptors in its own descriptor pool.  This is done to make it
 *	easier to support relations that are larger than the operating
 *	system's file size limit (often 2GBytes).  In order to do that,
 *	we break relations up into "segment" files that are each shorter than
 *	the OS file size limit.  The segment size is set by the RELSEG_SIZE
 *	configuration constant in pg_config.h.
 *
 *	On disk, a relation must consist of consecutively numbered segment
 *	files in the pattern
 *		-- Zero or more full segments of exactly RELSEG_SIZE blocks each
 *		-- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
 *		-- Optionally, any number of inactive segments of size 0 blocks.
 *	The full and partial segments are collectively the "active" segments.
 *	Inactive segments are those that once contained data but are currently
 *	not needed because of an mdtruncate() operation.  The reason for leaving
 *	them present at size zero, rather than unlinking them, is that other
 *	backends and/or the checkpointer might be holding open file references to
 *	such segments.  If the relation expands again after mdtruncate(), such
 *	that a deactivated segment becomes active again, it is important that
 *	such file references still be valid --- else data might get written
 *	out to an unlinked old copy of a segment file that will eventually
 *	disappear.
 *
 *	File descriptors are stored in the per-fork md_seg_fds arrays inside
 *	SMgrRelation. The length of these arrays is stored in md_num_open_segs.
 *	Note that a fork's md_num_open_segs having a specific value does not
 *	necessarily mean the relation doesn't have additional segments; we may
 *	just not have opened the next segment yet.  (We could not have "all
 *	segments are in the array" as an invariant anyway, since another backend
 *	could extend the relation while we aren't looking.)  We do not have
 *	entries for inactive segments, however; as soon as we find a partial
 *	segment, we assume that any subsequent segments are inactive.
 *
 *	The entire MdfdVec array is palloc'd in the MdCxt memory context.
 */

typedef struct _MdfdVec
{
	File		mdfd_vfd;		/* fd number in fd.c's pool */
	BlockNumber mdfd_segno;		/* segment number, from 0 */
} MdfdVec;
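
/*
 * Illustrative sketch (not compiled): the arithmetic used throughout this
 * file to map a relation-wide block number onto a segment number and a byte
 * offset within that segment file.  The helper names are hypothetical; the
 * real code simply inlines these expressions (see e.g. mdextend and
 * _mdfd_getseg).
 */
#ifdef NOT_USED
static BlockNumber
blkno_to_segno(BlockNumber blkno)
{
	return blkno / ((BlockNumber) RELSEG_SIZE);
}

static off_t
blkno_to_seekpos(BlockNumber blkno)
{
	/* byte position of the block within its segment file */
	return (off_t) BLCKSZ * (blkno % ((BlockNumber) RELSEG_SIZE));
}
#endif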

static MemoryContext MdCxt;		/* context for all MdfdVec objects */


/*
 * In some contexts (currently, standalone backends and the checkpointer)
 * we keep track of pending fsync operations: we need to remember all relation
 * segments that have been written since the last checkpoint, so that we can
 * fsync them down to disk before completing the next checkpoint.  This hash
 * table remembers the pending operations.  We use a hash table mostly as
 * a convenient way of merging duplicate requests.
 *
 * We use a similar mechanism to remember no-longer-needed files that can
 * be deleted after the next checkpoint, but we use a linked list instead of
 * a hash table, because we don't expect there to be any duplicate requests.
 *
 * These mechanisms are only used for non-temp relations; we never fsync
 * temp rels, nor do we need to postpone their deletion (see comments in
 * mdunlink).
 *
 * (Regular backends do not track pending operations locally, but forward
 * them to the checkpointer.)
 */
typedef uint16 CycleCtr;		/* can be any convenient integer size */

typedef struct
{
	RelFileNode rnode;			/* hash table key (must be first!) */
	CycleCtr	cycle_ctr;		/* mdsync_cycle_ctr of oldest request */
	/* requests[f] has bit n set if we need to fsync segment n of fork f */
	Bitmapset  *requests[MAX_FORKNUM + 1];
	/* canceled[f] is true if we canceled fsyncs for fork "recently" */
	bool		canceled[MAX_FORKNUM + 1];
} PendingOperationEntry;
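
/*
 * Illustrative sketch (not compiled): how a pending fsync request for
 * segment "segno" of fork "forknum" would be recorded in such an entry,
 * using the same Bitmapset primitives the request-absorbing code relies on.
 * The helper name is hypothetical and not part of md.c.
 */
#ifdef NOT_USED
static void
record_fsync_request(PendingOperationEntry *entry,
					 ForkNumber forknum, int segno)
{
	entry->requests[forknum] =
		bms_add_member(entry->requests[forknum], segno);
	/* a subsequent cancel frees the set and sets canceled[forknum] */
}
#endif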

typedef struct
{
	RelFileNode rnode;			/* the dead relation to delete */
	CycleCtr	cycle_ctr;		/* mdckpt_cycle_ctr when request was made */
} PendingUnlinkEntry;

static HTAB *pendingOpsTable = NULL;
static List *pendingUnlinks = NIL;
static MemoryContext pendingOpsCxt; /* context for the above */

static CycleCtr mdsync_cycle_ctr = 0;
static CycleCtr mdckpt_cycle_ctr = 0;
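
/*
 * Illustrative sketch (not compiled): CycleCtr comparisons are done in
 * modular arithmetic, so a uint16 counter wrapping from 65535 back to 0
 * still satisfies the "exactly one cycle behind" test that mdsync asserts
 * for entries it is about to process.  Hypothetical helper, not part of
 * md.c.
 */
#ifdef NOT_USED
static bool
cycle_is_previous(CycleCtr entry_ctr, CycleCtr current_ctr)
{
	/* true when entry_ctr + 1 == current_ctr, modulo 2^16 */
	return (CycleCtr) (entry_ctr + 1) == current_ctr;
}
#endif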


/*** behavior for mdopen & _mdfd_getseg ***/
/* ereport if segment not present */
#define EXTENSION_FAIL				(1 << 0)
/* return NULL if segment not present */
#define EXTENSION_RETURN_NULL		(1 << 1)
/* create new segments as needed */
#define EXTENSION_CREATE			(1 << 2)
/* create new segments if needed during recovery */
#define EXTENSION_CREATE_RECOVERY	(1 << 3)
/*
 * Allow opening segments which are preceded by segments smaller than
 * RELSEG_SIZE, e.g. inactive segments (see above). Note that this breaks
 * mdnblocks() and related functionality henceforth - which currently is ok,
 * because this is only required in the checkpointer which never uses
 * mdnblocks().
 */
#define EXTENSION_DONT_CHECK_SIZE	(1 << 4)


/* local routines */
static void mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum,
			 bool isRedo);
static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum, int behavior);
static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
					   MdfdVec *seg);
static void register_unlink(RelFileNodeBackend rnode);
static void _fdvec_resize(SMgrRelation reln,
			  ForkNumber forknum,
			  int nseg);
static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum,
			  BlockNumber segno);
static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forkno,
			  BlockNumber segno, int oflags);
static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forkno,
			 BlockNumber blkno, bool skipFsync, int behavior);
static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
		   MdfdVec *seg);


/*
 *	mdinit() -- Initialize private state for magnetic disk storage manager.
 */
void
mdinit(void)
{
	MdCxt = AllocSetContextCreate(TopMemoryContext,
								  "MdSmgr",
								  ALLOCSET_DEFAULT_SIZES);

	/*
	 * Create pending-operations hashtable if we need it.  Currently, we need
	 * it if we are standalone (not under a postmaster) or if we are a startup
	 * or checkpointer auxiliary process.
	 */
	if (!IsUnderPostmaster || AmStartupProcess() || AmCheckpointerProcess())
	{
		HASHCTL		hash_ctl;

		/*
		 * XXX: The checkpointer needs to add entries to the pending ops table
		 * when absorbing fsync requests.  That is done within a critical
		 * section, which isn't usually allowed, but we make an exception. It
		 * means that there's a theoretical possibility that you run out of
		 * memory while absorbing fsync requests, which leads to a PANIC.
		 * Fortunately the hash table is small so that's unlikely to happen in
		 * practice.
		 */
		pendingOpsCxt = AllocSetContextCreate(MdCxt,
											  "Pending ops context",
											  ALLOCSET_DEFAULT_SIZES);
		MemoryContextAllowInCriticalSection(pendingOpsCxt, true);

		MemSet(&hash_ctl, 0, sizeof(hash_ctl));
		hash_ctl.keysize = sizeof(RelFileNode);
		hash_ctl.entrysize = sizeof(PendingOperationEntry);
		hash_ctl.hcxt = pendingOpsCxt;
		pendingOpsTable = hash_create("Pending Ops Table",
									  100L,
									  &hash_ctl,
									  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
		pendingUnlinks = NIL;
	}
}

/*
 * In archive recovery, we rely on checkpointer to do fsyncs, but we will have
 * already created the pendingOpsTable during initialization of the startup
 * process.  Calling this function drops the local pendingOpsTable so that
 * subsequent requests will be forwarded to checkpointer.
 */
void
SetForwardFsyncRequests(void)
{
	/* Perform any pending fsyncs we may have queued up, then drop table */
	if (pendingOpsTable)
	{
		mdsync();
		hash_destroy(pendingOpsTable);
	}
	pendingOpsTable = NULL;

	/*
	 * We should not have any pending unlink requests, since mdunlink doesn't
	 * queue unlink requests when isRedo.
	 */
	Assert(pendingUnlinks == NIL);
}

/*
 *	mdexists() -- Does the physical file exist?
 *
 * Note: this will return true for lingering files, with pending deletions
 */
bool
mdexists(SMgrRelation reln, ForkNumber forkNum)
{
	/*
	 * Close it first, to ensure that we notice if the fork has been unlinked
	 * since we opened it.
	 */
	mdclose(reln, forkNum);

	return (mdopen(reln, forkNum, EXTENSION_RETURN_NULL) != NULL);
}

/*
 *	mdcreate() -- Create a new relation on magnetic disk.
 *
 * If isRedo is true, it's okay for the relation to exist already.
 */
void
mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
{
	MdfdVec    *mdfd;
	char	   *path;
	File		fd;

	if (isRedo && reln->md_num_open_segs[forkNum] > 0)
		return;					/* created and opened already... */

	Assert(reln->md_num_open_segs[forkNum] == 0);

	path = relpath(reln->smgr_rnode, forkNum);

	fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);

	if (fd < 0)
	{
		int			save_errno = errno;

		/*
		 * During bootstrap, there are cases where a system relation will be
		 * accessed (by internal backend processes) before the bootstrap
		 * script nominally creates it.  Therefore, allow the file to exist
		 * already, even if isRedo is not set.  (See also mdopen)
		 */
		if (isRedo || IsBootstrapProcessingMode())
			fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
		if (fd < 0)
		{
			/* be sure to report the error reported by create, not open */
			errno = save_errno;
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not create file \"%s\": %m", path)));
		}
	}

	pfree(path);

	_fdvec_resize(reln, forkNum, 1);
	mdfd = &reln->md_seg_fds[forkNum][0];
	mdfd->mdfd_vfd = fd;
	mdfd->mdfd_segno = 0;
}

/*
 *	mdunlink() -- Unlink a relation.
 *
 * Note that we're passed a RelFileNodeBackend --- by the time this is called,
 * there won't be an SMgrRelation hashtable entry anymore.
 *
 * forkNum can be a fork number to delete a specific fork, or InvalidForkNumber
 * to delete all forks.
 *
 * For regular relations, we don't unlink the first segment file of the rel,
 * but just truncate it to zero length, and record a request to unlink it after
 * the next checkpoint.  Additional segments can be unlinked immediately,
 * however.  Leaving the empty file in place prevents that relfilenode
 * number from being reused.  The scenario this protects us from is:
 * 1. We delete a relation (and commit, and actually remove its file).
 * 2. We create a new relation, which by chance gets the same relfilenode as
 *	  the just-deleted one (OIDs must've wrapped around for that to happen).
 * 3. We crash before another checkpoint occurs.
 * During replay, we would delete the file and then recreate it, which is fine
 * if the contents of the file were repopulated by subsequent WAL entries.
 * But if we didn't WAL-log insertions, but instead relied on fsyncing the
 * file after populating it (as for instance CLUSTER and CREATE INDEX do),
 * the contents of the file would be lost forever.  By leaving the empty file
 * until after the next checkpoint, we prevent reassignment of the relfilenode
 * number until it's safe, because relfilenode assignment skips over any
 * existing file.
 *
 * We do not need to go through this dance for temp relations, though, because
 * we never make WAL entries for temp rels, and so a temp rel poses no threat
 * to the health of a regular rel that has taken over its relfilenode number.
 * The fact that temp rels and regular rels have different file naming
 * patterns provides additional safety.
 *
 * All the above applies only to the relation's main fork; other forks can
 * just be removed immediately, since they are not needed to prevent the
 * relfilenode number from being recycled.  Also, we do not carefully
 * track whether other forks have been created or not, but just attempt to
 * unlink them unconditionally; so we should never complain about ENOENT.
 *
 * If isRedo is true, it's unsurprising for the relation to be already gone.
 * Also, we should remove the file immediately instead of queuing a request
 * for later, since during redo there's no possibility of creating a
 * conflicting relation.
 *
 * Note: any failure should be reported as WARNING not ERROR, because
 * we are usually not in a transaction anymore when this is called.
 */
void
mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
{
	/*
	 * We have to clean out any pending fsync requests for the doomed
	 * relation, else the next mdsync() will fail.  There can't be any such
	 * requests for a temp relation, though.  We can send just one request
	 * even when deleting multiple forks, since the fsync queuing code accepts
	 * the "InvalidForkNumber = all forks" convention.
	 */
	if (!RelFileNodeBackendIsTemp(rnode))
		ForgetRelationFsyncRequests(rnode.node, forkNum);

	/* Now do the per-fork work */
	if (forkNum == InvalidForkNumber)
	{
		for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++)
			mdunlinkfork(rnode, forkNum, isRedo);
	}
	else
		mdunlinkfork(rnode, forkNum, isRedo);
}

/*
 * Truncate a file to release disk space.
 */
static int
do_truncate(char *path)
{
	int			save_errno;
	int			ret;
	int			fd;

	/* truncate(2) would be easier here, but Windows hasn't got it */
	fd = OpenTransientFile(path, O_RDWR | PG_BINARY, 0);
	if (fd >= 0)
	{
		ret = ftruncate(fd, 0);
		save_errno = errno;
		CloseTransientFile(fd);
		errno = save_errno;
	}
	else
		ret = -1;

	/* Log a warning here to avoid repetition in callers. */
	if (ret < 0 && errno != ENOENT)
	{
		save_errno = errno;
		ereport(WARNING,
				(errcode_for_file_access(),
				 errmsg("could not truncate file \"%s\": %m", path)));
		errno = save_errno;
	}

	return ret;
}

static void
mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
{
	char	   *path;
	int			ret;

	path = relpath(rnode, forkNum);

	/*
	 * Delete or truncate the first segment.
	 */
	if (isRedo || forkNum != MAIN_FORKNUM || RelFileNodeBackendIsTemp(rnode))
	{
		if (!RelFileNodeBackendIsTemp(rnode))
		{
			/* Prevent other backends' fds from holding on to the disk space */
			ret = do_truncate(path);
		}
		else
			ret = 0;

		/* Next unlink the file, unless it was already found to be missing */
		if (ret == 0 || errno != ENOENT)
		{
			ret = unlink(path);
			if (ret < 0 && errno != ENOENT)
				ereport(WARNING,
						(errcode_for_file_access(),
						 errmsg("could not remove file \"%s\": %m", path)));
		}
	}
	else
	{
		/* Prevent other backends' fds from holding on to the disk space */
		ret = do_truncate(path);

		/* Register request to unlink first segment later */
		register_unlink(rnode);
	}

	/*
	 * Delete any additional segments.
	 */
	if (ret >= 0)
	{
		char	   *segpath = (char *) palloc(strlen(path) + 12);
		BlockNumber segno;

		/*
		 * Note that because we loop until getting ENOENT, we will correctly
		 * remove all inactive segments as well as active ones.
		 */
		for (segno = 1;; segno++)
		{
			sprintf(segpath, "%s.%u", path, segno);

			if (!RelFileNodeBackendIsTemp(rnode))
			{
				/*
				 * Prevent other backends' fds from holding on to the disk
				 * space.
				 */
				if (do_truncate(segpath) < 0 && errno == ENOENT)
					break;
			}

			if (unlink(segpath) < 0)
			{
				/* ENOENT is expected after the last segment... */
				if (errno != ENOENT)
					ereport(WARNING,
							(errcode_for_file_access(),
							 errmsg("could not remove file \"%s\": %m", segpath)));
				break;
			}
		}
		pfree(segpath);
	}

	pfree(path);
}

/*
 *	mdextend() -- Add a block to the specified relation.
 *
 *		The semantics are nearly the same as mdwrite(): write at the
 *		specified position.  However, this is to be used for the case of
 *		extending a relation (i.e., blocknum is at or beyond the current
 *		EOF).  Note that we assume writing a block beyond current EOF
 *		causes intervening file space to become filled with zeroes.
 */
void
mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
		 char *buffer, bool skipFsync)
{
	off_t		seekpos;
	int			nbytes;
	MdfdVec    *v;

	/* This assert is too expensive to have on normally ... */
#ifdef CHECK_WRITE_VS_EXTEND
	Assert(blocknum >= mdnblocks(reln, forknum));
#endif

	/*
	 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
	 * more --- we mustn't create a block whose number actually is
	 * InvalidBlockNumber.  (Note that this failure should be unreachable
	 * because of upstream checks in bufmgr.c.)
	 */
	if (blocknum == InvalidBlockNumber)
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("cannot extend file \"%s\" beyond %u blocks",
						relpath(reln->smgr_rnode, forknum),
						InvalidBlockNumber)));

	v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);

	seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));

	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

	/*
	 * Note: because caller usually obtained blocknum by calling mdnblocks,
	 * which did a seek(SEEK_END), this seek is often redundant and will be
	 * optimized away by fd.c.  It's not redundant, however, if there is a
	 * partial page at the end of the file. In that case we want to try to
	 * overwrite the partial page with a full page.  It's also not redundant
	 * if bufmgr.c had to dump another buffer of the same file to make room
	 * for the new page's buffer.
	 */
	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not seek to block %u in file \"%s\": %m",
						blocknum, FilePathName(v->mdfd_vfd))));

	if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
	{
		if (nbytes < 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not extend file \"%s\": %m",
							FilePathName(v->mdfd_vfd)),
					 errhint("Check free disk space.")));
		/* short write: complain appropriately */
		ereport(ERROR,
				(errcode(ERRCODE_DISK_FULL),
				 errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
						FilePathName(v->mdfd_vfd),
						nbytes, BLCKSZ, blocknum),
				 errhint("Check free disk space.")));
	}

	if (!skipFsync && !SmgrIsTemp(reln))
		register_dirty_segment(reln, forknum, v);

	Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
}

/*
 *	mdopen() -- Open the specified relation.
 *
 * Note we only open the first segment, when there are multiple segments.
 *
 * If first segment is not present, either ereport or return NULL according
 * to "behavior".  We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
 * EXTENSION_CREATE means it's OK to extend an existing relation, not to
 * invent one out of whole cloth.
 */
static MdfdVec *
mdopen(SMgrRelation reln, ForkNumber forknum, int behavior)
{
	MdfdVec    *mdfd;
	char	   *path;
	File		fd;

	/* No work if already open */
	if (reln->md_num_open_segs[forknum] > 0)
		return &reln->md_seg_fds[forknum][0];

	path = relpath(reln->smgr_rnode, forknum);

	fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);

	if (fd < 0)
	{
		/*
		 * During bootstrap, there are cases where a system relation will be
		 * accessed (by internal backend processes) before the bootstrap
		 * script nominally creates it.  Therefore, accept mdopen() as a
		 * substitute for mdcreate() in bootstrap mode only. (See mdcreate)
		 */
		if (IsBootstrapProcessingMode())
			fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
		if (fd < 0)
		{
			if ((behavior & EXTENSION_RETURN_NULL) &&
				FILE_POSSIBLY_DELETED(errno))
			{
				pfree(path);
				return NULL;
			}
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", path)));
		}
	}

	pfree(path);

	_fdvec_resize(reln, forknum, 1);
	mdfd = &reln->md_seg_fds[forknum][0];
	mdfd->mdfd_vfd = fd;
	mdfd->mdfd_segno = 0;

	Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));

	return mdfd;
}

/*
 *	mdclose() -- Close the specified relation, if it isn't closed already.
 */
void
mdclose(SMgrRelation reln, ForkNumber forknum)
{
	int			nopensegs = reln->md_num_open_segs[forknum];

	/* No work if already closed */
	if (nopensegs == 0)
		return;

	/* close segments starting from the end */
	while (nopensegs > 0)
	{
		MdfdVec    *v = &reln->md_seg_fds[forknum][nopensegs - 1];

		FileClose(v->mdfd_vfd);
		_fdvec_resize(reln, forknum, nopensegs - 1);
		nopensegs--;
	}
}

/*
 *	mdprefetch() -- Initiate asynchronous read of the specified block of a relation
 */
void
mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
{
#ifdef USE_PREFETCH
	off_t		seekpos;
	MdfdVec    *v;

	v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);

	seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));

	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

	(void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ, WAIT_EVENT_DATA_FILE_PREFETCH);
#endif							/* USE_PREFETCH */
}

/*
 * mdwriteback() -- Tell the kernel to write pages back to storage.
 *
 * This accepts a range of blocks because flushing several pages at once is
 * considerably more efficient than doing so individually.
 */
void
mdwriteback(SMgrRelation reln, ForkNumber forknum,
			BlockNumber blocknum, BlockNumber nblocks)
{
	/*
	 * Issue flush requests in as few requests as possible; have to split at
	 * segment boundaries though, since those are actually separate files.
	 */
	while (nblocks > 0)
	{
		BlockNumber nflush = nblocks;
		off_t		seekpos;
		MdfdVec    *v;
		int			segnum_start,
					segnum_end;

		v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ ,
						 EXTENSION_RETURN_NULL);

		/*
		 * We might be flushing buffers of already removed relations, that's
		 * ok, just ignore that case.
		 */
		if (!v)
			return;

		/* compute offset inside the current segment */
		segnum_start = blocknum / RELSEG_SIZE;

		/* compute number of desired writes within the current segment */
		segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE;
		if (segnum_start != segnum_end)
			nflush = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE));

		Assert(nflush >= 1);
		Assert(nflush <= nblocks);

		seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));

		FileWriteback(v->mdfd_vfd, seekpos, (off_t) BLCKSZ * nflush, WAIT_EVENT_DATA_FILE_FLUSH);

		nblocks -= nflush;
		blocknum += nflush;
	}
}

/*
 *	mdread() -- Read the specified block from a relation.
 */
void
mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
	   char *buffer)
{
	off_t		seekpos;
	int			nbytes;
	MdfdVec    *v;

	TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
										reln->smgr_rnode.node.spcNode,
										reln->smgr_rnode.node.dbNode,
										reln->smgr_rnode.node.relNode,
										reln->smgr_rnode.backend);

	v = _mdfd_getseg(reln, forknum, blocknum, false,
					 EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);

	seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));

	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not seek to block %u in file \"%s\": %m",
						blocknum, FilePathName(v->mdfd_vfd))));

	nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ, WAIT_EVENT_DATA_FILE_READ);

	TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
									   reln->smgr_rnode.node.spcNode,
									   reln->smgr_rnode.node.dbNode,
									   reln->smgr_rnode.node.relNode,
									   reln->smgr_rnode.backend,
									   nbytes,
									   BLCKSZ);

	if (nbytes != BLCKSZ)
	{
		if (nbytes < 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not read block %u in file \"%s\": %m",
							blocknum, FilePathName(v->mdfd_vfd))));

		/*
		 * Short read: we are at or past EOF, or we read a partial block at
		 * EOF.  Normally this is an error; upper levels should never try to
		 * read a nonexistent block.  However, if zero_damaged_pages is ON or
		 * we are InRecovery, we should instead return zeroes without
		 * complaining.  This allows, for example, the case of trying to
		 * update a block that was later truncated away.
		 */
		if (zero_damaged_pages || InRecovery)
			MemSet(buffer, 0, BLCKSZ);
		else
			ereport(ERROR,
					(errcode(ERRCODE_DATA_CORRUPTED),
					 errmsg("could not read block %u in file \"%s\": read only %d of %d bytes",
							blocknum, FilePathName(v->mdfd_vfd),
							nbytes, BLCKSZ)));
	}
}

/*
 *	mdwrite() -- Write the supplied block at the appropriate location.
 *
 *		This is to be used only for updating already-existing blocks of a
 *		relation (ie, those before the current EOF).  To extend a relation,
 *		use mdextend().
 */
void
mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
		char *buffer, bool skipFsync)
{
	off_t		seekpos;
	int			nbytes;
	MdfdVec    *v;

	/* This assert is too expensive to have on normally ... */
#ifdef CHECK_WRITE_VS_EXTEND
	Assert(blocknum < mdnblocks(reln, forknum));
#endif

	TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
										 reln->smgr_rnode.node.spcNode,
										 reln->smgr_rnode.node.dbNode,
										 reln->smgr_rnode.node.relNode,
										 reln->smgr_rnode.backend);

	v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
					 EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);

	seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));

	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not seek to block %u in file \"%s\": %m",
						blocknum, FilePathName(v->mdfd_vfd))));

	nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, WAIT_EVENT_DATA_FILE_WRITE);

	TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
										reln->smgr_rnode.node.spcNode,
										reln->smgr_rnode.node.dbNode,
										reln->smgr_rnode.node.relNode,
										reln->smgr_rnode.backend,
										nbytes,
										BLCKSZ);

	if (nbytes != BLCKSZ)
	{
		if (nbytes < 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not write block %u in file \"%s\": %m",
							blocknum, FilePathName(v->mdfd_vfd))));
		/* short write: complain appropriately */
		ereport(ERROR,
				(errcode(ERRCODE_DISK_FULL),
				 errmsg("could not write block %u in file \"%s\": wrote only %d of %d bytes",
						blocknum,
						FilePathName(v->mdfd_vfd),
						nbytes, BLCKSZ),
				 errhint("Check free disk space.")));
	}

	if (!skipFsync && !SmgrIsTemp(reln))
		register_dirty_segment(reln, forknum, v);
}

/*
 *	mdnblocks() -- Get the number of blocks stored in a relation.
 *
 *		Important side effect: all active segments of the relation are opened
 *		and added to the mdfd_seg_fds array.  If this routine has not been
 *		called, then only segments up to the last one actually touched
 *		are present in the array.
 */
BlockNumber
mdnblocks(SMgrRelation reln, ForkNumber forknum)
{
	MdfdVec    *v = mdopen(reln, forknum, EXTENSION_FAIL);
	BlockNumber nblocks;
	BlockNumber segno = 0;

	/* mdopen has opened the first segment */
	Assert(reln->md_num_open_segs[forknum] > 0);

	/*
	 * Start from the last open segment, to avoid redundant seeks.  We have
	 * previously verified that these segments are exactly RELSEG_SIZE long,
	 * and it's useless to recheck that each time.
	 *
	 * NOTE: this assumption could only be wrong if another backend has
	 * truncated the relation.  We rely on higher code levels to handle that
	 * scenario by closing and re-opening the md fd, which is handled via
	 * relcache flush.  (Since the checkpointer doesn't participate in
	 * relcache flush, it could have segment entries for inactive segments;
	 * that's OK because the checkpointer never needs to compute relation
	 * size.)
	 */
	segno = reln->md_num_open_segs[forknum] - 1;
	v = &reln->md_seg_fds[forknum][segno];

	for (;;)
	{
		nblocks = _mdnblocks(reln, forknum, v);
		if (nblocks > ((BlockNumber) RELSEG_SIZE))
			elog(FATAL, "segment too big");
		if (nblocks < ((BlockNumber) RELSEG_SIZE))
			return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;

		/*
		 * If segment is exactly RELSEG_SIZE, advance to next one.
		 */
		segno++;

		/*
		 * We used to pass O_CREAT here, but that has the disadvantage that
		 * it might create a segment which has vanished through some operating
		 * system misadventure.  In such a case, creating the segment here
		 * undermines _mdfd_getseg's attempts to notice and report an error
		 * upon access to a missing segment.
		 */
		v = _mdfd_openseg(reln, forknum, segno, 0);
		if (v == NULL)
			return segno * ((BlockNumber) RELSEG_SIZE);
	}
}

/*
 *	mdtruncate() -- Truncate relation to specified number of blocks.
 */
void
mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
{
	BlockNumber curnblk;
	BlockNumber priorblocks;
	int			curopensegs;

	/*
	 * NOTE: mdnblocks makes sure we have opened all active segments, so that
	 * truncation loop will get them all!
	 */
	curnblk = mdnblocks(reln, forknum);
	if (nblocks > curnblk)
	{
		/* Bogus request ... but no complaint if InRecovery */
		if (InRecovery)
			return;
		ereport(ERROR,
				(errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now",
						relpath(reln->smgr_rnode, forknum),
						nblocks, curnblk)));
	}
	if (nblocks == curnblk)
		return;					/* no work */

	/*
	 * Truncate segments, starting at the last one. Starting at the end makes
	 * managing the memory for the fd array easier, should there be errors.
	 */
	curopensegs = reln->md_num_open_segs[forknum];
	while (curopensegs > 0)
	{
		MdfdVec    *v;

		priorblocks = (curopensegs - 1) * RELSEG_SIZE;

		v = &reln->md_seg_fds[forknum][curopensegs - 1];

		if (priorblocks > nblocks)
		{
			/*
			 * This segment is no longer active. We truncate the file, but do
			 * not delete it, for reasons explained in the header comments.
			 */
			if (FileTruncate(v->mdfd_vfd, 0, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not truncate file \"%s\": %m",
								FilePathName(v->mdfd_vfd))));

			if (!SmgrIsTemp(reln))
				register_dirty_segment(reln, forknum, v);

			/* we never drop the 1st segment */
			Assert(v != &reln->md_seg_fds[forknum][0]);

			FileClose(v->mdfd_vfd);
			_fdvec_resize(reln, forknum, curopensegs - 1);
		}
		else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
		{
			/*
			 * This is the last segment we want to keep. Truncate the file to
			 * the right length. NOTE: if nblocks is exactly a multiple K of
			 * RELSEG_SIZE, we will truncate the K+1st segment to 0 length but
			 * keep it. This adheres to the invariant given in the header
			 * comments.
			 */
			BlockNumber lastsegblocks = nblocks - priorblocks;

			if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not truncate file \"%s\" to %u blocks: %m",
								FilePathName(v->mdfd_vfd),
								nblocks)));
			if (!SmgrIsTemp(reln))
				register_dirty_segment(reln, forknum, v);
		}
		else
		{
			/*
			 * We still need this segment, so nothing to do for this and any
			 * earlier segment.
			 */
			break;
		}
		curopensegs--;
	}
}

/*
 *	mdimmedsync() -- Immediately sync a relation to stable storage.
 *
 * Note that only writes already issued are synced; this routine knows
 * nothing of dirty buffers that may exist inside the buffer manager.
 */
void
mdimmedsync(SMgrRelation reln, ForkNumber forknum)
{
	int			segno;

	/*
	 * NOTE: mdnblocks makes sure we have opened all active segments, so that
	 * fsync loop will get them all!
	 */
	mdnblocks(reln, forknum);

	segno = reln->md_num_open_segs[forknum];

	while (segno > 0)
	{
		MdfdVec    *v = &reln->md_seg_fds[forknum][segno - 1];

		if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0)
			ereport(data_sync_elevel(ERROR),
					(errcode_for_file_access(),
					 errmsg("could not fsync file \"%s\": %m",
							FilePathName(v->mdfd_vfd))));
		segno--;
	}
}

/*
 *	mdsync() -- Sync previous writes to stable storage.
 */
void
mdsync(void)
{
	static bool mdsync_in_progress = false;

	HASH_SEQ_STATUS hstat;
	PendingOperationEntry *entry;
	int			absorb_counter;

	/* Statistics on sync times */
	int			processed = 0;
	instr_time	sync_start,
				sync_end,
				sync_diff;
	uint64		elapsed;
	uint64		longest = 0;
	uint64		total_elapsed = 0;

	/*
	 * This is only called during checkpoints, and checkpoints should only
	 * occur in processes that have created a pendingOpsTable.
	 */
	if (!pendingOpsTable)
		elog(ERROR, "cannot sync without a pendingOpsTable");

	/*
	 * If we are in the checkpointer, the sync had better include all fsync
	 * requests that were queued by backends up to this point.  The tightest
	 * race condition that could occur is that a buffer that must be written
	 * and fsync'd for the checkpoint could have been dumped by a backend just
	 * before it was visited by BufferSync().  We know the backend will have
	 * queued an fsync request before clearing the buffer's dirtybit, so we
	 * are safe as long as we do an Absorb after completing BufferSync().
	 */
	AbsorbFsyncRequests();

	/*
	 * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
	 * checkpoint), we want to ignore fsync requests that are entered into the
	 * hashtable after this point --- they should be processed next time,
	 * instead.  We use mdsync_cycle_ctr to tell old entries apart from new
	 * ones: new ones will have cycle_ctr equal to the incremented value of
	 * mdsync_cycle_ctr.
	 *
	 * In normal circumstances, all entries present in the table at this point
	 * will have cycle_ctr exactly equal to the current (about to be old)
	 * value of mdsync_cycle_ctr.  However, if we fail partway through the
	 * fsync'ing loop, then older values of cycle_ctr might remain when we
	 * come back here to try again.  Repeated checkpoint failures would
	 * eventually wrap the counter around to the point where an old entry
	 * might appear new, causing us to skip it, possibly allowing a checkpoint
	 * to succeed that should not have.  To forestall wraparound, any time the
	 * previous mdsync() failed to complete, run through the table and
	 * forcibly set cycle_ctr = mdsync_cycle_ctr.
	 *
	 * Think not to merge this loop with the main loop, as the problem is
	 * exactly that that loop may fail before having visited all the entries.
	 * From a performance point of view it doesn't matter anyway, as this path
	 * will never be taken in a system that's functioning normally.
	 */
	if (mdsync_in_progress)
	{
		/* prior try failed, so update any stale cycle_ctr values */
		hash_seq_init(&hstat, pendingOpsTable);
		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
		{
			entry->cycle_ctr = mdsync_cycle_ctr;
		}
	}

	/* Advance counter so that new hashtable entries are distinguishable */
	mdsync_cycle_ctr++;

	/* Set flag to detect failure if we don't reach the end of the loop */
	mdsync_in_progress = true;

	/* Now scan the hashtable for fsync requests to process */
	absorb_counter = FSYNCS_PER_ABSORB;
	hash_seq_init(&hstat, pendingOpsTable);
	while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
	{
		ForkNumber	forknum;

		/*
		 * If the entry is new then don't process it this time; it might
		 * contain multiple fsync-request bits, but they are all new.  Note
		 * "continue" bypasses the hash-remove call at the bottom of the loop.
		 */
		if (entry->cycle_ctr == mdsync_cycle_ctr)
			continue;

		/* Else assert we haven't missed it */
		Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);

		/*
		 * Scan over the forks and segments represented by the entry.
		 *
		 * The bitmap manipulations are slightly tricky, because we can call
		 * AbsorbFsyncRequests() inside the loop and that could result in
		 * bms_add_member() modifying and even re-palloc'ing the bitmapsets.
		 * So we detach it, but if we fail we'll merge it with any new
		 * requests that have arrived in the meantime.
		 */
		for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
		{
			Bitmapset  *requests = entry->requests[forknum];
			int			segno;

			entry->requests[forknum] = NULL;
			entry->canceled[forknum] = false;

			segno = -1;
			while ((segno = bms_next_member(requests, segno)) >= 0)
			{
				int			failures;

				/*
				 * If fsync is off then we don't have to bother opening the
				 * file at all.  (We delay checking until this point so that
				 * changing fsync on the fly behaves sensibly.)
				 */
				if (!enableFsync)
					continue;

				/*
				 * If in checkpointer, we want to absorb pending requests
				 * every so often to prevent overflow of the fsync request
				 * queue.  It is unspecified whether newly-added entries will
				 * be visited by hash_seq_search, but we don't care since we
				 * don't need to process them anyway.
				 */
				if (--absorb_counter <= 0)
				{
					AbsorbFsyncRequests();
					absorb_counter = FSYNCS_PER_ABSORB;
				}

				/*
				 * The fsync table could contain requests to fsync segments
				 * that have been deleted (unlinked) by the time we get to
				 * them. Rather than just hoping an ENOENT (or EACCES on
				 * Windows) error can be ignored, what we do on error is
				 * absorb pending requests and then retry.  Since mdunlink()
				 * queues a "cancel" message before actually unlinking, the
				 * fsync request is guaranteed to be marked canceled after the
				 * absorb if it really was this case. DROP DATABASE likewise
				 * has to tell us to forget fsync requests before it starts
				 * deletions.
				 */
				for (failures = 0;; failures++) /* loop exits at "break" */
				{
					SMgrRelation reln;
					MdfdVec    *seg;
					char	   *path;
					int			save_errno;

					/*
					 * Find or create an smgr hash entry for this relation.
					 * This may seem a bit unclean -- md calling smgr?	But
					 * it's really the best solution.  It ensures that the
					 * open file reference isn't permanently leaked if we get
					 * an error here. (You may say "but an unreferenced
					 * SMgrRelation is still a leak!" Not really, because the
					 * only case in which a checkpoint is done by a process
					 * that isn't about to shut down is in the checkpointer,
					 * and it will periodically do smgrcloseall(). This fact
					 * justifies our not closing the reln in the success path
					 * either, which is a good thing since in non-checkpointer
					 * cases we couldn't safely do that.)
					 */
					reln = smgropen(entry->rnode, InvalidBackendId);

					/* Attempt to open and fsync the target segment */
					seg = _mdfd_getseg(reln, forknum,
									   (BlockNumber) segno * (BlockNumber) RELSEG_SIZE,
									   false,
									   EXTENSION_RETURN_NULL
									   | EXTENSION_DONT_CHECK_SIZE);

					INSTR_TIME_SET_CURRENT(sync_start);

					if (seg != NULL &&
						FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) >= 0)
					{
						/* Success; update statistics about sync timing */
						INSTR_TIME_SET_CURRENT(sync_end);
						sync_diff = sync_end;
						INSTR_TIME_SUBTRACT(sync_diff, sync_start);
						elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
						if (elapsed > longest)
							longest = elapsed;
						total_elapsed += elapsed;
						processed++;
						requests = bms_del_member(requests, segno);
						if (log_checkpoints)
							elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec",
								 processed,
								 FilePathName(seg->mdfd_vfd),
								 (double) elapsed / 1000);

						break;	/* out of retry loop */
					}

					/* Compute file name for use in message */
					save_errno = errno;
					path = _mdfd_segpath(reln, forknum, (BlockNumber) segno);
					errno = save_errno;

					/*
					 * It is possible that the relation has been dropped or
					 * truncated since the fsync request was entered.
					 * Therefore, allow ENOENT, but only if we didn't fail
					 * already on this file.  This applies both for
					 * _mdfd_getseg() and for FileSync, since fd.c might have
					 * closed the file behind our back.
					 *
					 * XXX is there any point in allowing more than one retry?
					 * Don't see one at the moment, but easy to change the
					 * test here if so.
					 */
					if (!FILE_POSSIBLY_DELETED(errno) ||
						failures > 0)
					{
						Bitmapset  *new_requests;

						/*
						 * We need to merge these unsatisfied requests with
						 * any others that have arrived since we started.
						 */
						new_requests = entry->requests[forknum];
						entry->requests[forknum] =
							bms_join(new_requests, requests);

						errno = save_errno;
						ereport(data_sync_elevel(ERROR),
								(errcode_for_file_access(),
								 errmsg("could not fsync file \"%s\": %m",
										path)));
					}
					else
						ereport(DEBUG1,
								(errcode_for_file_access(),
								 errmsg("could not fsync file \"%s\" but retrying: %m",
										path)));
					pfree(path);

					/*
					 * Absorb incoming requests and check to see if a cancel
					 * arrived for this relation fork.
					 */
					AbsorbFsyncRequests();
					absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */

					if (entry->canceled[forknum])
						break;
				}				/* end retry loop */
			}
			bms_free(requests);
		}

		/*
		 * We've finished everything that was requested before we started to
		 * scan the entry.  If no new requests have been inserted meanwhile,
		 * remove the entry.  Otherwise, update its cycle counter, as all the
		 * requests now in it must have arrived during this cycle.
		 */
		for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
		{
			if (entry->requests[forknum] != NULL)
				break;
		}
		if (forknum <= MAX_FORKNUM)
			entry->cycle_ctr = mdsync_cycle_ctr;
		else
		{
			/* Okay to remove it */
			if (hash_search(pendingOpsTable, &entry->rnode,
							HASH_REMOVE, NULL) == NULL)
				elog(ERROR, "pendingOpsTable corrupted");
		}
	}							/* end loop over hashtable entries */

	/* Return sync performance metrics for report at checkpoint end */
	CheckpointStats.ckpt_sync_rels = processed;
	CheckpointStats.ckpt_longest_sync = longest;
	CheckpointStats.ckpt_agg_sync_time = total_elapsed;

	/* Flag successful completion of mdsync */
	mdsync_in_progress = false;
}

/*
 * mdpreckpt() -- Do pre-checkpoint work
 *
 * To distinguish unlink requests that arrived before this checkpoint
 * started from those that arrived during the checkpoint, we use a cycle
 * counter similar to the one we use for fsync requests. That cycle
 * counter is incremented here.
 *
 * This must be called *before* the checkpoint REDO point is determined.
 * That ensures that we won't delete files too soon.
 *
 * Note that we can't do anything here that depends on the assumption
 * that the checkpoint will be completed.
 */
void
mdpreckpt(void)
{
	/*
	 * Any unlink requests arriving after this point will be assigned the next
	 * cycle counter, and won't be unlinked until next checkpoint.
	 */
	mdckpt_cycle_ctr++;
}

/*
 * mdpostckpt() -- Do post-checkpoint work
 *
 * Remove any lingering files that can now be safely removed.
 */
void
mdpostckpt(void)
{
	int			absorb_counter;

	absorb_counter = UNLINKS_PER_ABSORB;
	while (pendingUnlinks != NIL)
	{
		PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
		char	   *path;

		/*
		 * New entries are appended to the end, so if the entry is new we've
		 * reached the end of old entries.
		 *
		 * Note: if just the right number of consecutive checkpoints fail, we
		 * could be fooled here by cycle_ctr wraparound.  However, the only
		 * consequence is that we'd delay unlinking for one more checkpoint,
		 * which is perfectly tolerable.
		 */
		if (entry->cycle_ctr == mdckpt_cycle_ctr)
			break;

		/* Unlink the file */
		path = relpathperm(entry->rnode, MAIN_FORKNUM);
		if (unlink(path) < 0)
		{
			/*
			 * There's a race condition, when the database is dropped at the
			 * same time that we process the pending unlink requests. If the
			 * DROP DATABASE deletes the file before we do, we will get ENOENT
			 * here. rmtree() also has to ignore ENOENT errors, to deal with
			 * the possibility that we delete the file first.
			 */
			if (errno != ENOENT)
				ereport(WARNING,
						(errcode_for_file_access(),
						 errmsg("could not remove file \"%s\": %m", path)));
		}
		pfree(path);

		/* And remove the list entry */
		pendingUnlinks = list_delete_first(pendingUnlinks);
		pfree(entry);

		/*
		 * As in mdsync, we don't want to stop absorbing fsync requests for a
		 * long time when there are many deletions to be done.  We can safely
		 * call AbsorbFsyncRequests() at this point in the loop (note it might
		 * try to delete list entries).
		 */
		if (--absorb_counter <= 0)
		{
			AbsorbFsyncRequests();
			absorb_counter = UNLINKS_PER_ABSORB;
		}
	}
}

/*
 * register_dirty_segment() -- Mark a relation segment as needing fsync
 *
 * If there is a local pending-ops table, just make an entry in it for
 * mdsync to process later.  Otherwise, try to pass off the fsync request
 * to the checkpointer process.  If that fails, just do the fsync
 * locally before returning (we hope this will not happen often enough
 * to be a performance problem).
 */
static void
register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
{
	/* Temp relations should never be fsync'd */
	Assert(!SmgrIsTemp(reln));

	if (pendingOpsTable)
	{
		/* push it into local pending-ops table */
		RememberFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno);
	}
	else
	{
		if (ForwardFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno))
			return;				/* passed it off successfully */

		ereport(DEBUG1,
				(errmsg("could not forward fsync request because request queue is full")));

		if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
			ereport(data_sync_elevel(ERROR),
					(errcode_for_file_access(),
					 errmsg("could not fsync file \"%s\": %m",
							FilePathName(seg->mdfd_vfd))));
	}
}
1502 
1503 /*
1504  * register_unlink() -- Schedule a file to be deleted after next checkpoint
1505  *
1506  * We don't bother passing in the fork number, because this is only used
1507  * with main forks.
1508  *
1509  * As with register_dirty_segment, this could involve either a local or
1510  * a remote pending-ops table.
1511  */
1512 static void
register_unlink(RelFileNodeBackend rnode)1513 register_unlink(RelFileNodeBackend rnode)
1514 {
1515 	/* Should never be used with temp relations */
1516 	Assert(!RelFileNodeBackendIsTemp(rnode));
1517 
1518 	if (pendingOpsTable)
1519 	{
1520 		/* push it into local pending-ops table */
1521 		RememberFsyncRequest(rnode.node, MAIN_FORKNUM,
1522 							 UNLINK_RELATION_REQUEST);
1523 	}
1524 	else
1525 	{
1526 		/*
1527 		 * Notify the checkpointer about it.  If we fail to queue the request
1528 		 * message, we have to sleep and try again, because we can't simply
1529 		 * delete the file now.  Ugly, but hopefully won't happen often.
1530 		 *
1531 		 * XXX should we just leave the file orphaned instead?
1532 		 */
1533 		Assert(IsUnderPostmaster);
1534 		while (!ForwardFsyncRequest(rnode.node, MAIN_FORKNUM,
1535 									UNLINK_RELATION_REQUEST))
1536 			pg_usleep(10000L);	/* 10 msec seems a good number */
1537 	}
1538 }
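
/*
 * Illustrative sketch, not part of md.c: during normal operation the
 * unlink of a main fork is deferred via register_unlink(), while redo,
 * non-main forks, and temp relations just remove the file at once.  A
 * hypothetical helper, loosely mirroring the decision made in the
 * unlink path:
 */
#ifdef NOT_USED
static void
example_drop_fork(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo)
{
	char	   *path = relpath(rnode, forknum);

	if (isRedo || forknum != MAIN_FORKNUM || RelFileNodeBackendIsTemp(rnode))
	{
		/* no checkpoint coordination needed: remove the file now */
		if (unlink(path) < 0 && errno != ENOENT)
			ereport(WARNING,
					(errcode_for_file_access(),
					 errmsg("could not remove file \"%s\": %m", path)));
	}
	else
	{
		/* defer deletion of the main-fork file past the next checkpoint */
		register_unlink(rnode);
	}
	pfree(path);
}
#endif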
1539 
1540 /*
1541  * RememberFsyncRequest() -- callback from checkpointer side of fsync request
1542  *
1543  * We stuff fsync requests into the local hash table for execution
1544  * during the checkpointer's next checkpoint.  UNLINK requests go into a
1545  * separate linked list, however, because they get processed separately.
1546  *
1547  * The range of possible segment numbers is way less than the range of
1548  * BlockNumber, so we can reserve high values of segno for special purposes.
1549  * We define three:
1550  * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation,
1551  *	 either for one fork, or all forks if forknum is InvalidForkNumber
1552  * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
1553  * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
1554  *	 checkpoint.
1555  * Note also that we're assuming real segment numbers don't exceed INT_MAX.
1556  *
1557  * (Handling FORGET_DATABASE_FSYNC requests is a tad slow because the hash
1558  * table has to be searched linearly, but dropping a database is a pretty
1559  * heavyweight operation anyhow, so we'll live with it.)
1560  */
1561 void
1562 RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
1563 {
1564 	Assert(pendingOpsTable);
1565 
1566 	if (segno == FORGET_RELATION_FSYNC)
1567 	{
1568 		/* Remove any pending requests for the relation (one or all forks) */
1569 		PendingOperationEntry *entry;
1570 
1571 		entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
1572 													  &rnode,
1573 													  HASH_FIND,
1574 													  NULL);
1575 		if (entry)
1576 		{
1577 			/*
1578 			 * We can't just delete the entry since mdsync could have an
1579 			 * active hashtable scan.  Instead we delete the bitmapsets; this
1580 			 * is safe because of the way mdsync is coded.  We also set the
1581 			 * "canceled" flags so that mdsync can tell that a cancel arrived
1582 			 * for the fork(s).
1583 			 */
1584 			if (forknum == InvalidForkNumber)
1585 			{
1586 				/* remove requests for all forks */
1587 				for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
1588 				{
1589 					bms_free(entry->requests[forknum]);
1590 					entry->requests[forknum] = NULL;
1591 					entry->canceled[forknum] = true;
1592 				}
1593 			}
1594 			else
1595 			{
1596 				/* remove requests for single fork */
1597 				bms_free(entry->requests[forknum]);
1598 				entry->requests[forknum] = NULL;
1599 				entry->canceled[forknum] = true;
1600 			}
1601 		}
1602 	}
1603 	else if (segno == FORGET_DATABASE_FSYNC)
1604 	{
1605 		/* Remove any pending requests for the entire database */
1606 		HASH_SEQ_STATUS hstat;
1607 		PendingOperationEntry *entry;
1608 		ListCell   *cell,
1609 				   *prev,
1610 				   *next;
1611 
1612 		/* Remove fsync requests */
1613 		hash_seq_init(&hstat, pendingOpsTable);
1614 		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
1615 		{
1616 			if (entry->rnode.dbNode == rnode.dbNode)
1617 			{
1618 				/* remove requests for all forks */
1619 				for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
1620 				{
1621 					bms_free(entry->requests[forknum]);
1622 					entry->requests[forknum] = NULL;
1623 					entry->canceled[forknum] = true;
1624 				}
1625 			}
1626 		}
1627 
1628 		/* Remove unlink requests */
1629 		prev = NULL;
1630 		for (cell = list_head(pendingUnlinks); cell; cell = next)
1631 		{
1632 			PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
1633 
1634 			next = lnext(cell);
1635 			if (entry->rnode.dbNode == rnode.dbNode)
1636 			{
1637 				pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev);
1638 				pfree(entry);
1639 			}
1640 			else
1641 				prev = cell;
1642 		}
1643 	}
1644 	else if (segno == UNLINK_RELATION_REQUEST)
1645 	{
1646 		/* Unlink request: put it in the linked list */
1647 		MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
1648 		PendingUnlinkEntry *entry;
1649 
1650 		/* PendingUnlinkEntry doesn't store forknum, since it's always MAIN */
1651 		Assert(forknum == MAIN_FORKNUM);
1652 
1653 		entry = palloc(sizeof(PendingUnlinkEntry));
1654 		entry->rnode = rnode;
1655 		entry->cycle_ctr = mdckpt_cycle_ctr;
1656 
1657 		pendingUnlinks = lappend(pendingUnlinks, entry);
1658 
1659 		MemoryContextSwitchTo(oldcxt);
1660 	}
1661 	else
1662 	{
1663 		/* Normal case: enter a request to fsync this segment */
1664 		MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
1665 		PendingOperationEntry *entry;
1666 		bool		found;
1667 
1668 		entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
1669 													  &rnode,
1670 													  HASH_ENTER,
1671 													  &found);
1672 		/* if new entry, initialize it */
1673 		if (!found)
1674 		{
1675 			entry->cycle_ctr = mdsync_cycle_ctr;
1676 			MemSet(entry->requests, 0, sizeof(entry->requests));
1677 			MemSet(entry->canceled, 0, sizeof(entry->canceled));
1678 		}
1679 
1680 		/*
1681 		 * NB: it's intentional that we don't change cycle_ctr if the entry
1682 		 * already exists.  The cycle_ctr must represent the oldest fsync
1683 		 * request that could be in the entry.
1684 		 */
1685 
1686 		entry->requests[forknum] = bms_add_member(entry->requests[forknum],
1687 												  (int) segno);
1688 
1689 		MemoryContextSwitchTo(oldcxt);
1690 	}
1691 }
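
/*
 * Illustrative sketch, not part of md.c: the kinds of request this
 * function understands, spelled out.  A hypothetical caller running in a
 * process that owns a local pendingOpsTable (checkpointer or standalone
 * backend); segno 2 is chosen arbitrarily.
 */
#ifdef NOT_USED
static void
example_requests(RelFileNode rnode)
{
	/* normal case: segment 2 of the main fork needs fsync */
	RememberFsyncRequest(rnode, MAIN_FORKNUM, (BlockNumber) 2);
	/* cancel pending fsyncs for one fork of the relation */
	RememberFsyncRequest(rnode, MAIN_FORKNUM, FORGET_RELATION_FSYNC);
	/* cancel pending fsyncs for all forks of the relation */
	RememberFsyncRequest(rnode, InvalidForkNumber, FORGET_RELATION_FSYNC);
	/* cancel pending fsyncs and unlinks for the whole database */
	RememberFsyncRequest(rnode, InvalidForkNumber, FORGET_DATABASE_FSYNC);
	/* delete the main-fork file after the next checkpoint */
	RememberFsyncRequest(rnode, MAIN_FORKNUM, UNLINK_RELATION_REQUEST);
}
#endif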
1692 
1693 /*
1694  * ForgetRelationFsyncRequests -- forget any fsyncs for a relation fork
1695  *
1696  * forknum == InvalidForkNumber means all forks, although this code doesn't
1697  * actually know that, since it's just forwarding the request elsewhere.
1698  */
1699 void
1700 ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum)
1701 {
1702 	if (pendingOpsTable)
1703 	{
1704 		/* standalone backend or startup process: fsync state is local */
1705 		RememberFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC);
1706 	}
1707 	else if (IsUnderPostmaster)
1708 	{
1709 		/*
1710 		 * Notify the checkpointer about it.  If we fail to queue the cancel
1711 		 * message, we have to sleep and try again ... ugly, but hopefully
1712 		 * won't happen often.
1713 		 *
1714 		 * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with an
1715 		 * error would leave the no-longer-used file still present on disk,
1716 		 * which would be bad, so I'm inclined to assume that the checkpointer
1717 		 * will always empty the queue soon.
1718 		 */
1719 		while (!ForwardFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC))
1720 			pg_usleep(10000L);	/* 10 msec seems a good number */
1721 
1722 		/*
1723 		 * Note we don't wait for the checkpointer to actually absorb the
1724 		 * cancel message; see mdsync() for the implications.
1725 		 */
1726 	}
1727 }
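
/*
 * Illustrative sketch, not part of md.c: before a relation's files are
 * unlinked, queued fsyncs for it must be canceled so a later mdsync()
 * doesn't fail on the vanished files.  Hypothetical caller.
 */
#ifdef NOT_USED
static void
example_cancel_before_unlink(RelFileNodeBackend rnode)
{
	/* InvalidForkNumber means: forget requests for every fork */
	ForgetRelationFsyncRequests(rnode.node, InvalidForkNumber);
}
#endif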
1728 
1729 /*
1730  * ForgetDatabaseFsyncRequests -- forget any fsyncs and unlinks for a DB
1731  */
1732 void
1733 ForgetDatabaseFsyncRequests(Oid dbid)
1734 {
1735 	RelFileNode rnode;
1736 
1737 	rnode.dbNode = dbid;
1738 	rnode.spcNode = 0;
1739 	rnode.relNode = 0;
1740 
1741 	if (pendingOpsTable)
1742 	{
1743 		/* standalone backend or startup process: fsync state is local */
1744 		RememberFsyncRequest(rnode, InvalidForkNumber, FORGET_DATABASE_FSYNC);
1745 	}
1746 	else if (IsUnderPostmaster)
1747 	{
1748 		/* see notes in ForgetRelationFsyncRequests */
1749 		while (!ForwardFsyncRequest(rnode, InvalidForkNumber,
1750 									FORGET_DATABASE_FSYNC))
1751 			pg_usleep(10000L);	/* 10 msec seems a good number */
1752 	}
1753 }
1754 
1755 /*
1756  * DropRelationFiles -- drop files of all given relations
1757  */
1758 void
1759 DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo)
1760 {
1761 	SMgrRelation *srels;
1762 	int			i;
1763 
1764 	srels = palloc(sizeof(SMgrRelation) * ndelrels);
1765 	for (i = 0; i < ndelrels; i++)
1766 	{
1767 		SMgrRelation srel = smgropen(delrels[i], InvalidBackendId);
1768 
1769 		if (isRedo)
1770 		{
1771 			ForkNumber	fork;
1772 
1773 			for (fork = 0; fork <= MAX_FORKNUM; fork++)
1774 				XLogDropRelation(delrels[i], fork);
1775 		}
1776 		srels[i] = srel;
1777 	}
1778 
1779 	smgrdounlinkall(srels, ndelrels, isRedo);
1780 
1781 	for (i = 0; i < ndelrels; i++)
1782 		smgrclose(srels[i]);
1783 	pfree(srels);
1784 }
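
/*
 * Illustrative sketch, not part of md.c: how a caller might drop the
 * files of two relations in one call.  The helper name is hypothetical
 * and the RelFileNode values come from the caller.
 */
#ifdef NOT_USED
static void
example_drop_two(RelFileNode a, RelFileNode b)
{
	RelFileNode nodes[2];

	nodes[0] = a;
	nodes[1] = b;
	/* true => we are in WAL replay, so XLogDropRelation() is also
	 * invoked for each fork before the files go away */
	DropRelationFiles(nodes, 2, true);
}
#endif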
1785 
1786 
1787 /*
1788  *	_fdvec_resize() -- Resize the fork's open segments array
1789  */
1790 static void
1791 _fdvec_resize(SMgrRelation reln,
1792 			  ForkNumber forknum,
1793 			  int nseg)
1794 {
1795 	if (nseg == 0)
1796 	{
1797 		if (reln->md_num_open_segs[forknum] > 0)
1798 		{
1799 			pfree(reln->md_seg_fds[forknum]);
1800 			reln->md_seg_fds[forknum] = NULL;
1801 		}
1802 	}
1803 	else if (reln->md_num_open_segs[forknum] == 0)
1804 	{
1805 		reln->md_seg_fds[forknum] =
1806 			MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg);
1807 	}
1808 	else
1809 	{
1810 		/*
1811 		 * It doesn't seem worthwhile complicating the code to amortize
1812 		 * repalloc() calls.  Those are far faster than PathNameOpenFile() or
1813 		 * FileClose(), and the memory context internally will sometimes avoid
1814 		 * doing an actual reallocation.
1815 		 */
1816 		reln->md_seg_fds[forknum] =
1817 			repalloc(reln->md_seg_fds[forknum],
1818 					 sizeof(MdfdVec) * nseg);
1819 	}
1820 
1821 	reln->md_num_open_segs[forknum] = nseg;
1822 }
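
/*
 * Illustrative sketch, not part of md.c: _fdvec_resize() only sizes the
 * array; when growing, the caller fills in the new entries (as
 * _mdfd_openseg does below), and when shrinking, the caller is expected
 * to have closed the dropped segments first.  Hypothetical usage.
 */
#ifdef NOT_USED
static void
example_resize(SMgrRelation reln)
{
	/* make room for segments 0..2 of the main fork ... */
	_fdvec_resize(reln, MAIN_FORKNUM, 3);
	/* ... entries 1 and 2 are uninitialized until the caller sets
	 * their mdfd_vfd and mdfd_segno fields */
}
#endif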
1823 
1824 /*
1825  * Return the filename for the specified segment of the relation. The
1826  * returned string is palloc'd.
1827  */
1828 static char *
1829 _mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
1830 {
1831 	char	   *path,
1832 			   *fullpath;
1833 
1834 	path = relpath(reln->smgr_rnode, forknum);
1835 
1836 	if (segno > 0)
1837 	{
1838 		fullpath = psprintf("%s.%u", path, segno);
1839 		pfree(path);
1840 	}
1841 	else
1842 		fullpath = path;
1843 
1844 	return fullpath;
1845 }
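
/*
 * Illustrative sketch, not part of md.c: the shape of the paths produced.
 * For a permanent relation with relfilenode 16385 in database 16384 (OIDs
 * made up for the example), the main fork's segment 0 is
 * "base/16384/16385" and segment 2 is "base/16384/16385.2".
 */
#ifdef NOT_USED
static void
example_segpath(SMgrRelation reln)
{
	char	   *seg0 = _mdfd_segpath(reln, MAIN_FORKNUM, 0);
	char	   *seg2 = _mdfd_segpath(reln, MAIN_FORKNUM, 2);

	/* both strings are palloc'd; callers must pfree them */
	pfree(seg0);
	pfree(seg2);
}
#endif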
1846 
1847 /*
1848  * Open the specified segment of the relation,
1849  * and make a MdfdVec object for it.  Returns NULL on failure.
1850  */
1851 static MdfdVec *
1852 _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
1853 			  int oflags)
1854 {
1855 	MdfdVec    *v;
1856 	int			fd;
1857 	char	   *fullpath;
1858 
1859 	fullpath = _mdfd_segpath(reln, forknum, segno);
1860 
1861 	/* open the file */
1862 	fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags, 0600);
1863 
1864 	pfree(fullpath);
1865 
1866 	if (fd < 0)
1867 		return NULL;
1868 
1869 	if (segno >= reln->md_num_open_segs[forknum])
1870 		_fdvec_resize(reln, forknum, segno + 1);
1871 
1872 	/* fill the entry */
1873 	v = &reln->md_seg_fds[forknum][segno];
1874 	v->mdfd_vfd = fd;
1875 	v->mdfd_segno = segno;
1876 
1877 	Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
1878 
1879 	/* all done */
1880 	return v;
1881 }
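
/*
 * Illustrative sketch, not part of md.c: _mdfd_openseg() is called with
 * oflags = 0 to open a segment that should already exist, or O_CREAT
 * when extending the relation, as _mdfd_getseg() does below.
 * Hypothetical caller.
 */
#ifdef NOT_USED
static MdfdVec *
example_open_or_create(SMgrRelation reln, BlockNumber segno)
{
	MdfdVec    *v;

	/* try to open an existing segment; NULL if it isn't there */
	v = _mdfd_openseg(reln, MAIN_FORKNUM, segno, 0);
	/* extension path: create the segment file if it was missing */
	if (v == NULL)
		v = _mdfd_openseg(reln, MAIN_FORKNUM, segno, O_CREAT);
	return v;
}
#endif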
1882 
1883 /*
1884  *	_mdfd_getseg() -- Find the segment of the relation holding the
1885  *		specified block.
1886  *
1887  * If the segment doesn't exist, we ereport, return NULL, or create the
1888  * segment, according to "behavior".  Note: skipFsync is only used in the
1889  * EXTENSION_CREATE case.
1890  */
1891 static MdfdVec *
1892 _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
1893 			 bool skipFsync, int behavior)
1894 {
1895 	MdfdVec    *v;
1896 	BlockNumber targetseg;
1897 	BlockNumber nextsegno;
1898 
1899 	/* some way to handle non-existent segments needs to be specified */
1900 	Assert(behavior &
1901 		   (EXTENSION_FAIL | EXTENSION_CREATE | EXTENSION_RETURN_NULL));
1902 
1903 	targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
1904 
1905 	/* if an existing and opened segment, we're done */
1906 	if (targetseg < reln->md_num_open_segs[forknum])
1907 	{
1908 		v = &reln->md_seg_fds[forknum][targetseg];
1909 		return v;
1910 	}
1911 
1912 	/*
1913 	 * The target segment is not yet open. Iterate over all the segments
1914 	 * between the last opened and the target segment. This way missing
1915 	 * segments either raise an error, or get created (according to
1916 	 * 'behavior'). Start with either the last opened, or the first segment if
1917 	 * none was opened before.
1918 	 */
1919 	if (reln->md_num_open_segs[forknum] > 0)
1920 		v = &reln->md_seg_fds[forknum][reln->md_num_open_segs[forknum] - 1];
1921 	else
1922 	{
1923 		v = mdopen(reln, forknum, behavior);
1924 		if (!v)
1925 			return NULL;		/* if behavior & EXTENSION_RETURN_NULL */
1926 	}
1927 
1928 	for (nextsegno = reln->md_num_open_segs[forknum];
1929 		 nextsegno <= targetseg; nextsegno++)
1930 	{
1931 		BlockNumber nblocks = _mdnblocks(reln, forknum, v);
1932 		int			flags = 0;
1933 
1934 		Assert(nextsegno == v->mdfd_segno + 1);
1935 
1936 		if (nblocks > ((BlockNumber) RELSEG_SIZE))
1937 			elog(FATAL, "segment too big");
1938 
1939 		if ((behavior & EXTENSION_CREATE) ||
1940 			(InRecovery && (behavior & EXTENSION_CREATE_RECOVERY)))
1941 		{
1942 			/*
1943 			 * Normally we will create new segments only if authorized by the
1944 			 * caller (i.e., we are doing mdextend()).  But when doing WAL
1945 			 * recovery, create segments anyway; this allows cases such as
1946 			 * replaying WAL data that has a write into a high-numbered
1947 			 * segment of a relation that was later deleted. We want to go
1948 			 * ahead and create the segments so we can finish out the replay.
1949 			 * However if the caller has specified
1950 			 * EXTENSION_REALLY_RETURN_NULL, then extension is not desired
1951 			 * even in recovery; we won't reach this point in that case.
1952 			 *
1953 			 * We have to maintain the invariant that segments before the last
1954 			 * active segment are of size RELSEG_SIZE; therefore, if
1955 			 * extending, pad them out with zeroes if needed.  (This only
1956 			 * matters if in recovery, or if the caller is extending the
1957 			 * relation discontiguously, but that can happen in hash indexes.)
1958 			 */
1959 			if (nblocks < ((BlockNumber) RELSEG_SIZE))
1960 			{
1961 				char	   *zerobuf = palloc0(BLCKSZ);
1962 
1963 				mdextend(reln, forknum,
1964 						 nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
1965 						 zerobuf, skipFsync);
1966 				pfree(zerobuf);
1967 			}
1968 			flags = O_CREAT;
1969 		}
1970 		else if (!(behavior & EXTENSION_DONT_CHECK_SIZE) &&
1971 				 nblocks < ((BlockNumber) RELSEG_SIZE))
1972 		{
1973 			/*
1974 			 * When not extending (or explicitly including truncated
1975 			 * segments), only open the next segment if the current one is
1976 			 * exactly RELSEG_SIZE.  If not (this branch), either return NULL
1977 			 * or fail.
1978 			 */
1979 			if (behavior & EXTENSION_RETURN_NULL)
1980 			{
1981 				/*
1982 				 * Some callers discern between reasons for _mdfd_getseg()
1983 				 * returning NULL based on errno. As there's no failing
1984 				 * syscall involved in this case, explicitly set errno to
1985 				 * ENOENT, as that seems the closest interpretation.
1986 				 */
1987 				errno = ENOENT;
1988 				return NULL;
1989 			}
1990 
1991 			ereport(ERROR,
1992 					(errcode_for_file_access(),
1993 					 errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks",
1994 							_mdfd_segpath(reln, forknum, nextsegno),
1995 							blkno, nblocks)));
1996 		}
1997 
1998 		v = _mdfd_openseg(reln, forknum, nextsegno, flags);
1999 
2000 		if (v == NULL)
2001 		{
2002 			if ((behavior & EXTENSION_RETURN_NULL) &&
2003 				FILE_POSSIBLY_DELETED(errno))
2004 				return NULL;
2005 			ereport(ERROR,
2006 					(errcode_for_file_access(),
2007 					 errmsg("could not open file \"%s\" (target block %u): %m",
2008 							_mdfd_segpath(reln, forknum, nextsegno),
2009 							blkno)));
2010 		}
2011 	}
2012 
2013 	return v;
2014 }
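
/*
 * Illustrative sketch, not part of md.c: the block-to-segment arithmetic
 * used above.  With the default BLCKSZ of 8192 and RELSEG_SIZE of 131072
 * (1GB segments), block 300000 lives in segment 2, at block offset 37856
 * within that segment's file.
 */
#ifdef NOT_USED
static void
example_block_arithmetic(void)
{
	BlockNumber blkno = 300000;
	BlockNumber segno = blkno / ((BlockNumber) RELSEG_SIZE);	/* 2 */
	BlockNumber segoff = blkno % ((BlockNumber) RELSEG_SIZE);	/* 37856 */

	(void) segno;
	(void) segoff;
}
#endif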
2015 
2016 /*
2017  * Get number of blocks present in a single disk file
2018  */
2019 static BlockNumber
2020 _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
2021 {
2022 	off_t		len;
2023 
2024 	len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
2025 	if (len < 0)
2026 		ereport(ERROR,
2027 				(errcode_for_file_access(),
2028 				 errmsg("could not seek to end of file \"%s\": %m",
2029 						FilePathName(seg->mdfd_vfd))));
2030 	/* note that this calculation will ignore any partial block at EOF */
2031 	return (BlockNumber) (len / BLCKSZ);
2032 }
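
/*
 * Illustrative sketch, not part of md.c: the length-to-blocks calculation
 * above.  With BLCKSZ = 8192, a 73728-byte file holds 9 complete blocks;
 * a trailing partial block (say, a 73729-byte file) still counts as 9.
 */
#ifdef NOT_USED
static BlockNumber
example_len_to_blocks(off_t len)
{
	/* integer division discards any partial block at EOF */
	return (BlockNumber) (len / BLCKSZ);
}
#endif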
2033