/*-------------------------------------------------------------------------
 *
 * md.c
 *	  This code manages relations that reside on magnetic disk.
 *
 * Or at least, that was what the Berkeley folk had in mind when they named
 * this file.  In reality, what this code provides is an interface from
 * the smgr API to Unix-like filesystem APIs, so it will work with any type
 * of device for which the operating system provides filesystem support.
 * It doesn't matter whether the bits are on spinning rust or some other
 * storage technology.
 *
 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/storage/smgr/md.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <unistd.h>
#include <fcntl.h>
#include <sys/file.h>

#include "miscadmin.h"
#include "access/xlogutils.h"
#include "access/xlog.h"
#include "pgstat.h"
#include "portability/instr_time.h"
#include "postmaster/bgwriter.h"
#include "storage/fd.h"
#include "storage/bufmgr.h"
#include "storage/relfilenode.h"
#include "storage/smgr.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"
#include "pg_trace.h"


/* intervals for calling AbsorbFsyncRequests in mdsync and mdpostckpt */
#define FSYNCS_PER_ABSORB		10
#define UNLINKS_PER_ABSORB		10

/*
 * Special values for the segno arg to RememberFsyncRequest.
 *
 * Note that CompactCheckpointerRequestQueue assumes that it's OK to remove an
 * fsync request from the queue if an identical, subsequent request is found.
 * See comments there before making changes here.
 */
#define FORGET_RELATION_FSYNC	(InvalidBlockNumber)
#define FORGET_DATABASE_FSYNC	(InvalidBlockNumber-1)
#define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
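
/*
 * (For concreteness: InvalidBlockNumber is 0xFFFFFFFF, so these reserved
 * values are 0xFFFFFFFF, 0xFFFFFFFE and 0xFFFFFFFD.  Real segment numbers
 * never approach this range; see RememberFsyncRequest.)
 */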

/*
 * On Windows, we have to interpret EACCES as possibly meaning the same as
 * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
 * that's what you get.  Ugh.  This code is designed so that we don't
 * actually believe these cases are okay without further evidence (namely,
 * a pending fsync request getting canceled ... see mdsync).
 */
#ifndef WIN32
#define FILE_POSSIBLY_DELETED(err)	((err) == ENOENT)
#else
#define FILE_POSSIBLY_DELETED(err)	((err) == ENOENT || (err) == EACCES)
#endif

/*
 *	The magnetic disk storage manager keeps track of open file
 *	descriptors in its own descriptor pool.  This is done to make it
 *	easier to support relations that are larger than the operating
 *	system's file size limit (often 2GBytes).  In order to do that,
 *	we break relations up into "segment" files that are each shorter than
 *	the OS file size limit.  The segment size is set by the RELSEG_SIZE
 *	configuration constant in pg_config.h.
 *
 *	On disk, a relation must consist of consecutively numbered segment
 *	files in the pattern
 *		-- Zero or more full segments of exactly RELSEG_SIZE blocks each
 *		-- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
 *		-- Optionally, any number of inactive segments of size 0 blocks.
 *	The full and partial segments are collectively the "active" segments.
 *	Inactive segments are those that once contained data but are currently
 *	not needed because of an mdtruncate() operation.  The reason for leaving
 *	them present at size zero, rather than unlinking them, is that other
 *	backends and/or the checkpointer might be holding open file references to
 *	such segments.  If the relation expands again after mdtruncate(), such
 *	that a deactivated segment becomes active again, it is important that
 *	such file references still be valid --- else data might get written
 *	out to an unlinked old copy of a segment file that will eventually
 *	disappear.
 *
 *	File descriptors are stored in the per-fork md_seg_fds arrays inside
 *	SMgrRelation. The length of these arrays is stored in md_num_open_segs.
 *	Note that a fork's md_num_open_segs having a specific value does not
 *	necessarily mean the relation doesn't have additional segments; we may
 *	just not have opened the next segment yet.  (We could not have "all
 *	segments are in the array" as an invariant anyway, since another backend
 *	could extend the relation while we aren't looking.)  We do not have
 *	entries for inactive segments, however; as soon as we find a partial
 *	segment, we assume that any subsequent segments are inactive.
 *
 *	The entire MdfdVec array is palloc'd in the MdCxt memory context.
 */
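
/*
 * Worked example (an illustration, assuming the default build settings of
 * BLCKSZ = 8192 and RELSEG_SIZE = 131072, i.e. 1GB segments): block 300000
 * of a fork lives in segment 300000 / RELSEG_SIZE = 2, at block offset
 * 300000 % RELSEG_SIZE = 37856 within that segment file, i.e. at byte
 * offset 37856 * BLCKSZ in the file named "<relfilenode>.2".
 */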

typedef struct _MdfdVec
{
	File		mdfd_vfd;		/* fd number in fd.c's pool */
	BlockNumber mdfd_segno;		/* segment number, from 0 */
} MdfdVec;

static MemoryContext MdCxt;		/* context for all MdfdVec objects */


/*
 * In some contexts (currently, standalone backends and the checkpointer)
 * we keep track of pending fsync operations: we need to remember all relation
 * segments that have been written since the last checkpoint, so that we can
 * fsync them down to disk before completing the next checkpoint.  This hash
 * table remembers the pending operations.  We use a hash table mostly as
 * a convenient way of merging duplicate requests.
 *
 * We use a similar mechanism to remember no-longer-needed files that can
 * be deleted after the next checkpoint, but we use a linked list instead of
 * a hash table, because we don't expect there to be any duplicate requests.
 *
 * These mechanisms are only used for non-temp relations; we never fsync
 * temp rels, nor do we need to postpone their deletion (see comments in
 * mdunlink).
 *
 * (Regular backends do not track pending operations locally, but forward
 * them to the checkpointer.)
 */
typedef uint16 CycleCtr;		/* can be any convenient integer size */

typedef struct
{
	RelFileNode rnode;			/* hash table key (must be first!) */
	CycleCtr	cycle_ctr;		/* mdsync_cycle_ctr of oldest request */
	/* requests[f] has bit n set if we need to fsync segment n of fork f */
	Bitmapset  *requests[MAX_FORKNUM + 1];
	/* canceled[f] is true if we canceled fsyncs for fork "recently" */
	bool		canceled[MAX_FORKNUM + 1];
} PendingOperationEntry;
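
/*
 * Example: a bitmapset with members {0, 2} in requests[MAIN_FORKNUM] means
 * that segments 0 and 2 of the relation's main fork have been written since
 * the last checkpoint and still need to be fsync'd.
 */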

typedef struct
{
	RelFileNode rnode;			/* the dead relation to delete */
	CycleCtr	cycle_ctr;		/* mdckpt_cycle_ctr when request was made */
} PendingUnlinkEntry;

static HTAB *pendingOpsTable = NULL;
static List *pendingUnlinks = NIL;
static MemoryContext pendingOpsCxt; /* context for the above */

static CycleCtr mdsync_cycle_ctr = 0;
static CycleCtr mdckpt_cycle_ctr = 0;


/*** behavior for mdopen & _mdfd_getseg ***/
/* ereport if segment not present */
#define EXTENSION_FAIL				(1 << 0)
/* return NULL if segment not present */
#define EXTENSION_RETURN_NULL		(1 << 1)
/* create new segments as needed */
#define EXTENSION_CREATE			(1 << 2)
/* create new segments if needed during recovery */
#define EXTENSION_CREATE_RECOVERY	(1 << 3)
/*
 * Allow opening segments which are preceded by segments smaller than
 * RELSEG_SIZE, e.g. inactive segments (see above).  Note that this breaks
 * mdnblocks() and related functionality from that point on - which is
 * currently OK, because this flag is only used in the checkpointer, which
 * never calls mdnblocks().
 */
#define EXTENSION_DONT_CHECK_SIZE	(1 << 4)
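
/*
 * For reference, the combinations used in this file: mdextend() passes
 * EXTENSION_CREATE; mdread() and mdwrite() pass EXTENSION_FAIL |
 * EXTENSION_CREATE_RECOVERY; mdprefetch() passes plain EXTENSION_FAIL;
 * mdwriteback() passes EXTENSION_RETURN_NULL; and mdsync() passes
 * EXTENSION_RETURN_NULL | EXTENSION_DONT_CHECK_SIZE.
 */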


/* local routines */
static void mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum,
			 bool isRedo);
static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum, int behavior);
static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
					   MdfdVec *seg);
static void register_unlink(RelFileNodeBackend rnode);
static void _fdvec_resize(SMgrRelation reln,
			  ForkNumber forknum,
			  int nseg);
static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum,
			  BlockNumber segno);
static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forkno,
			  BlockNumber segno, int oflags);
static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forkno,
			 BlockNumber blkno, bool skipFsync, int behavior);
static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
		   MdfdVec *seg);


/*
 *	mdinit() -- Initialize private state for magnetic disk storage manager.
 */
void
mdinit(void)
{
	MdCxt = AllocSetContextCreate(TopMemoryContext,
								  "MdSmgr",
								  ALLOCSET_DEFAULT_SIZES);

	/*
	 * Create pending-operations hashtable if we need it.  Currently, we need
	 * it if we are standalone (not under a postmaster) or if we are a startup
	 * or checkpointer auxiliary process.
	 */
	if (!IsUnderPostmaster || AmStartupProcess() || AmCheckpointerProcess())
	{
		HASHCTL		hash_ctl;

		/*
		 * XXX: The checkpointer needs to add entries to the pending ops table
		 * when absorbing fsync requests.  That is done within a critical
		 * section, which isn't usually allowed, but we make an exception. It
		 * means that there's a theoretical possibility that you run out of
		 * memory while absorbing fsync requests, which leads to a PANIC.
		 * Fortunately the hash table is small so that's unlikely to happen in
		 * practice.
		 */
		pendingOpsCxt = AllocSetContextCreate(MdCxt,
											  "Pending ops context",
											  ALLOCSET_DEFAULT_SIZES);
		MemoryContextAllowInCriticalSection(pendingOpsCxt, true);

		MemSet(&hash_ctl, 0, sizeof(hash_ctl));
		hash_ctl.keysize = sizeof(RelFileNode);
		hash_ctl.entrysize = sizeof(PendingOperationEntry);
		hash_ctl.hcxt = pendingOpsCxt;
		pendingOpsTable = hash_create("Pending Ops Table",
									  100L,
									  &hash_ctl,
									  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
		pendingUnlinks = NIL;
	}
}

/*
 * In archive recovery, we rely on checkpointer to do fsyncs, but we will have
 * already created the pendingOpsTable during initialization of the startup
 * process.  Calling this function drops the local pendingOpsTable so that
 * subsequent requests will be forwarded to checkpointer.
 */
void
SetForwardFsyncRequests(void)
{
	/* Perform any pending fsyncs we may have queued up, then drop table */
	if (pendingOpsTable)
	{
		mdsync();
		hash_destroy(pendingOpsTable);
	}
	pendingOpsTable = NULL;

	/*
	 * We should not have any pending unlink requests, since mdunlink doesn't
	 * queue unlink requests when isRedo.
	 */
	Assert(pendingUnlinks == NIL);
}

/*
 *	mdexists() -- Does the physical file exist?
 *
 * Note: this will return true for lingering files, with pending deletions
 */
bool
mdexists(SMgrRelation reln, ForkNumber forkNum)
{
	/*
	 * Close it first, to ensure that we notice if the fork has been unlinked
	 * since we opened it.
	 */
	mdclose(reln, forkNum);

	return (mdopen(reln, forkNum, EXTENSION_RETURN_NULL) != NULL);
}

/*
 *	mdcreate() -- Create a new relation on magnetic disk.
 *
 * If isRedo is true, it's okay for the relation to exist already.
 */
void
mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
{
	MdfdVec    *mdfd;
	char	   *path;
	File		fd;

	if (isRedo && reln->md_num_open_segs[forkNum] > 0)
		return;					/* created and opened already... */

	Assert(reln->md_num_open_segs[forkNum] == 0);

	path = relpath(reln->smgr_rnode, forkNum);

	fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);

	if (fd < 0)
	{
		int			save_errno = errno;

		/*
		 * During bootstrap, there are cases where a system relation will be
		 * accessed (by internal backend processes) before the bootstrap
		 * script nominally creates it.  Therefore, allow the file to exist
		 * already, even if isRedo is not set.  (See also mdopen)
		 */
		if (isRedo || IsBootstrapProcessingMode())
			fd = PathNameOpenFile(path, O_RDWR | PG_BINARY);
		if (fd < 0)
		{
			/* be sure to report the error reported by create, not open */
			errno = save_errno;
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not create file \"%s\": %m", path)));
		}
	}

	pfree(path);

	_fdvec_resize(reln, forkNum, 1);
	mdfd = &reln->md_seg_fds[forkNum][0];
	mdfd->mdfd_vfd = fd;
	mdfd->mdfd_segno = 0;
}

/*
 *	mdunlink() -- Unlink a relation.
 *
 * Note that we're passed a RelFileNodeBackend --- by the time this is called,
 * there won't be an SMgrRelation hashtable entry anymore.
 *
 * forkNum can be a fork number to delete a specific fork, or InvalidForkNumber
 * to delete all forks.
 *
 * For regular relations, we don't unlink the first segment file of the rel,
 * but just truncate it to zero length, and record a request to unlink it after
 * the next checkpoint.  Additional segments can be unlinked immediately,
 * however.  Leaving the empty file in place prevents that relfilenode
 * number from being reused.  The scenario this protects us from is:
 * 1. We delete a relation (and commit, and actually remove its file).
 * 2. We create a new relation, which by chance gets the same relfilenode as
 *	  the just-deleted one (OIDs must've wrapped around for that to happen).
 * 3. We crash before another checkpoint occurs.
 * During replay, we would delete the file and then recreate it, which is fine
 * if the contents of the file were repopulated by subsequent WAL entries.
 * But if we didn't WAL-log insertions, but instead relied on fsyncing the
 * file after populating it (as for instance CLUSTER and CREATE INDEX do),
 * the contents of the file would be lost forever.  By leaving the empty file
 * until after the next checkpoint, we prevent reassignment of the relfilenode
 * number until it's safe, because relfilenode assignment skips over any
 * existing file.
 *
 * We do not need to go through this dance for temp relations, though, because
 * we never make WAL entries for temp rels, and so a temp rel poses no threat
 * to the health of a regular rel that has taken over its relfilenode number.
 * The fact that temp rels and regular rels have different file naming
 * patterns provides additional safety.
 *
 * All the above applies only to the relation's main fork; other forks can
 * just be removed immediately, since they are not needed to prevent the
 * relfilenode number from being recycled.  Also, we do not carefully
 * track whether other forks have been created or not, but just attempt to
 * unlink them unconditionally; so we should never complain about ENOENT.
 *
 * If isRedo is true, it's unsurprising for the relation to be already gone.
 * Also, we should remove the file immediately instead of queuing a request
 * for later, since during redo there's no possibility of creating a
 * conflicting relation.
 *
 * Note: any failure should be reported as WARNING not ERROR, because
 * we are usually not in a transaction anymore when this is called.
 */
void
mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
{
	/*
	 * We have to clean out any pending fsync requests for the doomed
	 * relation, else the next mdsync() will fail.  There can't be any such
	 * requests for a temp relation, though.  We can send just one request
	 * even when deleting multiple forks, since the fsync queuing code accepts
	 * the "InvalidForkNumber = all forks" convention.
	 */
	if (!RelFileNodeBackendIsTemp(rnode))
		ForgetRelationFsyncRequests(rnode.node, forkNum);

	/* Now do the per-fork work */
	if (forkNum == InvalidForkNumber)
	{
		for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++)
			mdunlinkfork(rnode, forkNum, isRedo);
	}
	else
		mdunlinkfork(rnode, forkNum, isRedo);
}

/*
 * Truncate a file to release disk space.
 */
static int
do_truncate(const char *path)
{
	int			save_errno;
	int			ret;
	int			fd;

	/* truncate(2) would be easier here, but Windows hasn't got it */
	fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
	if (fd >= 0)
	{
		ret = ftruncate(fd, 0);
		save_errno = errno;
		CloseTransientFile(fd);
		errno = save_errno;
	}
	else
		ret = -1;

	/* Log a warning here to avoid repetition in callers. */
	if (ret < 0 && errno != ENOENT)
	{
		save_errno = errno;
		ereport(WARNING,
				(errcode_for_file_access(),
				 errmsg("could not truncate file \"%s\": %m", path)));
		errno = save_errno;
	}

	return ret;
}

static void
mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
{
	char	   *path;
	int			ret;

	path = relpath(rnode, forkNum);

	/*
	 * Delete or truncate the first segment.
	 */
	if (isRedo || forkNum != MAIN_FORKNUM || RelFileNodeBackendIsTemp(rnode))
	{
		if (!RelFileNodeBackendIsTemp(rnode))
		{
			/* Prevent other backends' fds from holding on to the disk space */
			ret = do_truncate(path);
		}
		else
			ret = 0;

		/* Next unlink the file, unless it was already found to be missing */
		if (ret == 0 || errno != ENOENT)
		{
			ret = unlink(path);
			if (ret < 0 && errno != ENOENT)
				ereport(WARNING,
						(errcode_for_file_access(),
						 errmsg("could not remove file \"%s\": %m", path)));
		}
	}
	else
	{
		/* Prevent other backends' fds from holding on to the disk space */
		ret = do_truncate(path);

		/* Register request to unlink first segment later */
		register_unlink(rnode);
	}

	/*
	 * Delete any additional segments.
	 */
	if (ret >= 0)
	{
		char	   *segpath = (char *) palloc(strlen(path) + 12);
		BlockNumber segno;

		/*
		 * Note that because we loop until getting ENOENT, we will correctly
		 * remove all inactive segments as well as active ones.
		 */
		for (segno = 1;; segno++)
		{
			sprintf(segpath, "%s.%u", path, segno);

			if (!RelFileNodeBackendIsTemp(rnode))
			{
				/*
				 * Prevent other backends' fds from holding on to the disk
				 * space.
				 */
				if (do_truncate(segpath) < 0 && errno == ENOENT)
					break;
			}

			if (unlink(segpath) < 0)
			{
				/* ENOENT is expected after the last segment... */
				if (errno != ENOENT)
					ereport(WARNING,
							(errcode_for_file_access(),
							 errmsg("could not remove file \"%s\": %m", segpath)));
				break;
			}
		}
		pfree(segpath);
	}

	pfree(path);
}

/*
 *	mdextend() -- Add a block to the specified relation.
 *
 *		The semantics are nearly the same as mdwrite(): write at the
 *		specified position.  However, this is to be used for the case of
 *		extending a relation (i.e., blocknum is at or beyond the current
 *		EOF).  Note that we assume writing a block beyond current EOF
 *		causes intervening file space to become filled with zeroes.
 */
void
mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
		 char *buffer, bool skipFsync)
{
	off_t		seekpos;
	int			nbytes;
	MdfdVec    *v;

	/* This assert is too expensive to have on normally ... */
#ifdef CHECK_WRITE_VS_EXTEND
	Assert(blocknum >= mdnblocks(reln, forknum));
#endif

	/*
	 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
	 * more --- we mustn't create a block whose number actually is
	 * InvalidBlockNumber.  (Note that this failure should be unreachable
	 * because of upstream checks in bufmgr.c.)
	 */
	if (blocknum == InvalidBlockNumber)
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("cannot extend file \"%s\" beyond %u blocks",
						relpath(reln->smgr_rnode, forknum),
						InvalidBlockNumber)));

	v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);

	seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));

	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

	/*
	 * Note: because caller usually obtained blocknum by calling mdnblocks,
	 * which did a seek(SEEK_END), this seek is often redundant and will be
	 * optimized away by fd.c.  It's not redundant, however, if there is a
	 * partial page at the end of the file. In that case we want to try to
	 * overwrite the partial page with a full page.  It's also not redundant
	 * if bufmgr.c had to dump another buffer of the same file to make room
	 * for the new page's buffer.
	 */
	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not seek to block %u in file \"%s\": %m",
						blocknum, FilePathName(v->mdfd_vfd))));

	if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
	{
		if (nbytes < 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not extend file \"%s\": %m",
							FilePathName(v->mdfd_vfd)),
					 errhint("Check free disk space.")));
		/* short write: complain appropriately */
		ereport(ERROR,
				(errcode(ERRCODE_DISK_FULL),
				 errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
						FilePathName(v->mdfd_vfd),
						nbytes, BLCKSZ, blocknum),
				 errhint("Check free disk space.")));
	}

	if (!skipFsync && !SmgrIsTemp(reln))
		register_dirty_segment(reln, forknum, v);

	Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
}

/*
 *	mdopen() -- Open the specified relation.
 *
 * Note we only open the first segment, when there are multiple segments.
 *
 * If first segment is not present, either ereport or return NULL according
 * to "behavior".  We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
 * EXTENSION_CREATE means it's OK to extend an existing relation, not to
 * invent one out of whole cloth.
 */
static MdfdVec *
mdopen(SMgrRelation reln, ForkNumber forknum, int behavior)
{
	MdfdVec    *mdfd;
	char	   *path;
	File		fd;

	/* No work if already open */
	if (reln->md_num_open_segs[forknum] > 0)
		return &reln->md_seg_fds[forknum][0];

	path = relpath(reln->smgr_rnode, forknum);

	fd = PathNameOpenFile(path, O_RDWR | PG_BINARY);

	if (fd < 0)
	{
		/*
		 * During bootstrap, there are cases where a system relation will be
		 * accessed (by internal backend processes) before the bootstrap
		 * script nominally creates it.  Therefore, accept mdopen() as a
		 * substitute for mdcreate() in bootstrap mode only. (See mdcreate)
		 */
		if (IsBootstrapProcessingMode())
			fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
		if (fd < 0)
		{
			if ((behavior & EXTENSION_RETURN_NULL) &&
				FILE_POSSIBLY_DELETED(errno))
			{
				pfree(path);
				return NULL;
			}
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", path)));
		}
	}

	pfree(path);

	_fdvec_resize(reln, forknum, 1);
	mdfd = &reln->md_seg_fds[forknum][0];
	mdfd->mdfd_vfd = fd;
	mdfd->mdfd_segno = 0;

	Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));

	return mdfd;
}

/*
 *	mdclose() -- Close the specified relation, if it isn't closed already.
 */
void
mdclose(SMgrRelation reln, ForkNumber forknum)
{
	int			nopensegs = reln->md_num_open_segs[forknum];

	/* No work if already closed */
	if (nopensegs == 0)
		return;

	/* close segments starting from the end */
	while (nopensegs > 0)
	{
		MdfdVec    *v = &reln->md_seg_fds[forknum][nopensegs - 1];

		FileClose(v->mdfd_vfd);
		_fdvec_resize(reln, forknum, nopensegs - 1);
		nopensegs--;
	}
}

/*
 *	mdprefetch() -- Initiate asynchronous read of the specified block of a relation
 */
void
mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
{
#ifdef USE_PREFETCH
	off_t		seekpos;
	MdfdVec    *v;

	v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);

	seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));

	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

	(void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ, WAIT_EVENT_DATA_FILE_PREFETCH);
#endif							/* USE_PREFETCH */
}

/*
 * mdwriteback() -- Tell the kernel to write pages back to storage.
 *
 * This accepts a range of blocks because flushing several pages at once is
 * considerably more efficient than doing so individually.
 */
void
mdwriteback(SMgrRelation reln, ForkNumber forknum,
			BlockNumber blocknum, BlockNumber nblocks)
{
	/*
	 * Issue flush requests in as few requests as possible; have to split at
	 * segment boundaries though, since those are actually separate files.
	 */
	while (nblocks > 0)
	{
		BlockNumber nflush = nblocks;
		off_t		seekpos;
		MdfdVec    *v;
		int			segnum_start,
					segnum_end;

		v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ ,
						 EXTENSION_RETURN_NULL);

		/*
		 * We might be flushing buffers of already removed relations; that's
		 * OK, just ignore that case.
		 */
		if (!v)
			return;

		/* compute the number of the segment containing the first block */
		segnum_start = blocknum / RELSEG_SIZE;

		/* compute the number of the segment containing the last block */
		segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE;

		/* if the range spans segments, stop this flush at the segment end */
		if (segnum_start != segnum_end)
			nflush = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE));
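
		/*
		 * Worked example (an illustration, assuming the default RELSEG_SIZE
		 * of 131072 blocks): for blocknum = 131000 and nblocks = 200, the
		 * first iteration flushes nflush = 131072 - 131000 = 72 blocks from
		 * segment 0, and the next iteration flushes the remaining 128 blocks
		 * from the start of segment 1.
		 */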

		Assert(nflush >= 1);
		Assert(nflush <= nblocks);

		seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));

		FileWriteback(v->mdfd_vfd, seekpos, (off_t) BLCKSZ * nflush, WAIT_EVENT_DATA_FILE_FLUSH);

		nblocks -= nflush;
		blocknum += nflush;
	}
}

/*
 *	mdread() -- Read the specified block from a relation.
 */
void
mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
	   char *buffer)
{
	off_t		seekpos;
	int			nbytes;
	MdfdVec    *v;

	TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
										reln->smgr_rnode.node.spcNode,
										reln->smgr_rnode.node.dbNode,
										reln->smgr_rnode.node.relNode,
										reln->smgr_rnode.backend);

	v = _mdfd_getseg(reln, forknum, blocknum, false,
					 EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);

	seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));

	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not seek to block %u in file \"%s\": %m",
						blocknum, FilePathName(v->mdfd_vfd))));

	nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ, WAIT_EVENT_DATA_FILE_READ);

	TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
									   reln->smgr_rnode.node.spcNode,
									   reln->smgr_rnode.node.dbNode,
									   reln->smgr_rnode.node.relNode,
									   reln->smgr_rnode.backend,
									   nbytes,
									   BLCKSZ);

	if (nbytes != BLCKSZ)
	{
		if (nbytes < 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not read block %u in file \"%s\": %m",
							blocknum, FilePathName(v->mdfd_vfd))));

		/*
		 * Short read: we are at or past EOF, or we read a partial block at
		 * EOF.  Normally this is an error; upper levels should never try to
		 * read a nonexistent block.  However, if zero_damaged_pages is ON or
		 * we are InRecovery, we should instead return zeroes without
		 * complaining.  This allows, for example, the case of trying to
		 * update a block that was later truncated away.
		 */
		if (zero_damaged_pages || InRecovery)
			MemSet(buffer, 0, BLCKSZ);
		else
			ereport(ERROR,
					(errcode(ERRCODE_DATA_CORRUPTED),
					 errmsg("could not read block %u in file \"%s\": read only %d of %d bytes",
							blocknum, FilePathName(v->mdfd_vfd),
							nbytes, BLCKSZ)));
	}
}

/*
 *	mdwrite() -- Write the supplied block at the appropriate location.
 *
 *		This is to be used only for updating already-existing blocks of a
 *		relation (ie, those before the current EOF).  To extend a relation,
 *		use mdextend().
 */
void
mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
		char *buffer, bool skipFsync)
{
	off_t		seekpos;
	int			nbytes;
	MdfdVec    *v;

	/* This assert is too expensive to have on normally ... */
#ifdef CHECK_WRITE_VS_EXTEND
	Assert(blocknum < mdnblocks(reln, forknum));
#endif

	TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
										 reln->smgr_rnode.node.spcNode,
										 reln->smgr_rnode.node.dbNode,
										 reln->smgr_rnode.node.relNode,
										 reln->smgr_rnode.backend);

	v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
					 EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);

	seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));

	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not seek to block %u in file \"%s\": %m",
						blocknum, FilePathName(v->mdfd_vfd))));

	nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, WAIT_EVENT_DATA_FILE_WRITE);

	TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
										reln->smgr_rnode.node.spcNode,
										reln->smgr_rnode.node.dbNode,
										reln->smgr_rnode.node.relNode,
										reln->smgr_rnode.backend,
										nbytes,
										BLCKSZ);

	if (nbytes != BLCKSZ)
	{
		if (nbytes < 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not write block %u in file \"%s\": %m",
							blocknum, FilePathName(v->mdfd_vfd))));
		/* short write: complain appropriately */
		ereport(ERROR,
				(errcode(ERRCODE_DISK_FULL),
				 errmsg("could not write block %u in file \"%s\": wrote only %d of %d bytes",
						blocknum,
						FilePathName(v->mdfd_vfd),
						nbytes, BLCKSZ),
				 errhint("Check free disk space.")));
	}

	if (!skipFsync && !SmgrIsTemp(reln))
		register_dirty_segment(reln, forknum, v);
}

/*
 *	mdnblocks() -- Get the number of blocks stored in a relation.
 *
 *		Important side effect: all active segments of the relation are opened
 *		and added to the mdfd_seg_fds array.  If this routine has not been
 *		called, then only segments up to the last one actually touched
 *		are present in the array.
 */
BlockNumber
mdnblocks(SMgrRelation reln, ForkNumber forknum)
{
	MdfdVec    *v = mdopen(reln, forknum, EXTENSION_FAIL);
	BlockNumber nblocks;
	BlockNumber segno = 0;

	/* mdopen has opened the first segment */
	Assert(reln->md_num_open_segs[forknum] > 0);

	/*
	 * Start from the last open segment, to avoid redundant seeks.  We have
	 * previously verified that all earlier open segments are exactly
	 * RELSEG_SIZE long, and it's useless to recheck that each time.
	 *
	 * NOTE: this assumption could only be wrong if another backend has
	 * truncated the relation.  We rely on higher code levels to handle that
	 * scenario by closing and re-opening the md fd, which is handled via
	 * relcache flush.  (Since the checkpointer doesn't participate in
	 * relcache flush, it could have segment entries for inactive segments;
	 * that's OK because the checkpointer never needs to compute relation
	 * size.)
	 */
	segno = reln->md_num_open_segs[forknum] - 1;
	v = &reln->md_seg_fds[forknum][segno];

	for (;;)
	{
		nblocks = _mdnblocks(reln, forknum, v);
		if (nblocks > ((BlockNumber) RELSEG_SIZE))
			elog(FATAL, "segment too big");
		if (nblocks < ((BlockNumber) RELSEG_SIZE))
			return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;

		/*
		 * If segment is exactly RELSEG_SIZE, advance to next one.
		 */
		segno++;

		/*
		 * We used to pass O_CREAT here, but that has the disadvantage that
		 * it might create a segment which has vanished through some operating
		 * system misadventure.  In such a case, creating the segment here
		 * undermines _mdfd_getseg's attempts to notice and report an error
		 * upon access to a missing segment.
		 */
		v = _mdfd_openseg(reln, forknum, segno, 0);
		if (v == NULL)
			return segno * ((BlockNumber) RELSEG_SIZE);
	}
}

/*
 *	mdtruncate() -- Truncate relation to specified number of blocks.
 */
void
mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
{
	BlockNumber curnblk;
	BlockNumber priorblocks;
	int			curopensegs;

	/*
	 * NOTE: mdnblocks makes sure we have opened all active segments, so that
	 * truncation loop will get them all!
	 */
	curnblk = mdnblocks(reln, forknum);
	if (nblocks > curnblk)
	{
		/* Bogus request ... but no complaint if InRecovery */
		if (InRecovery)
			return;
		ereport(ERROR,
				(errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now",
						relpath(reln->smgr_rnode, forknum),
						nblocks, curnblk)));
	}
	if (nblocks == curnblk)
		return;					/* no work */

	/*
	 * Truncate segments, starting at the last one. Starting at the end makes
	 * managing the memory for the fd array easier, should there be errors.
	 */
	curopensegs = reln->md_num_open_segs[forknum];
	while (curopensegs > 0)
	{
		MdfdVec    *v;

		priorblocks = (curopensegs - 1) * RELSEG_SIZE;

		v = &reln->md_seg_fds[forknum][curopensegs - 1];

		if (priorblocks > nblocks)
		{
			/*
			 * This segment is no longer active. We truncate the file, but do
			 * not delete it, for reasons explained in the header comments.
			 */
			if (FileTruncate(v->mdfd_vfd, 0, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not truncate file \"%s\": %m",
								FilePathName(v->mdfd_vfd))));

			if (!SmgrIsTemp(reln))
				register_dirty_segment(reln, forknum, v);

			/* we never drop the 1st segment */
			Assert(v != &reln->md_seg_fds[forknum][0]);

			FileClose(v->mdfd_vfd);
			_fdvec_resize(reln, forknum, curopensegs - 1);
		}
		else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
		{
			/*
			 * This is the last segment we want to keep. Truncate the file to
			 * the right length. NOTE: if nblocks is exactly a multiple K of
			 * RELSEG_SIZE, we will truncate the K+1st segment to 0 length but
			 * keep it. This adheres to the invariant given in the header
			 * comments.
			 */
			BlockNumber lastsegblocks = nblocks - priorblocks;

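			/*
			 * Worked example (an illustration, assuming the default
			 * RELSEG_SIZE of 131072 blocks): truncating to nblocks = 150000
			 * keeps segment 0 at its full length and cuts segment 1 back to
			 * lastsegblocks = 150000 - 131072 = 18928 blocks.
			 */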
			if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not truncate file \"%s\" to %u blocks: %m",
								FilePathName(v->mdfd_vfd),
								nblocks)));
			if (!SmgrIsTemp(reln))
				register_dirty_segment(reln, forknum, v);
		}
		else
		{
			/*
			 * We still need this segment, so nothing to do for this and any
			 * earlier segment.
			 */
			break;
		}
		curopensegs--;
	}
}

/*
 *	mdimmedsync() -- Immediately sync a relation to stable storage.
 *
 * Note that only writes already issued are synced; this routine knows
 * nothing of dirty buffers that may exist inside the buffer manager.
 */
void
mdimmedsync(SMgrRelation reln, ForkNumber forknum)
{
	int			segno;

	/*
	 * NOTE: mdnblocks makes sure we have opened all active segments, so that
	 * fsync loop will get them all!
	 */
	mdnblocks(reln, forknum);

	segno = reln->md_num_open_segs[forknum];

	while (segno > 0)
	{
		MdfdVec    *v = &reln->md_seg_fds[forknum][segno - 1];

		if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0)
			ereport(data_sync_elevel(ERROR),
					(errcode_for_file_access(),
					 errmsg("could not fsync file \"%s\": %m",
							FilePathName(v->mdfd_vfd))));
		segno--;
	}
}

/*
 *	mdsync() -- Sync previous writes to stable storage.
 */
void
mdsync(void)
{
	static bool mdsync_in_progress = false;

	HASH_SEQ_STATUS hstat;
	PendingOperationEntry *entry;
	int			absorb_counter;

	/* Statistics on sync times */
	int			processed = 0;
	instr_time	sync_start,
				sync_end,
				sync_diff;
	uint64		elapsed;
	uint64		longest = 0;
	uint64		total_elapsed = 0;

	/*
	 * This is only called during checkpoints, and checkpoints should only
	 * occur in processes that have created a pendingOpsTable.
	 */
	if (!pendingOpsTable)
		elog(ERROR, "cannot sync without a pendingOpsTable");

	/*
	 * If we are in the checkpointer, the sync had better include all fsync
	 * requests that were queued by backends up to this point.  The tightest
	 * race condition that could occur is that a buffer that must be written
	 * and fsync'd for the checkpoint could have been dumped by a backend just
	 * before it was visited by BufferSync().  We know the backend will have
	 * queued an fsync request before clearing the buffer's dirtybit, so we
	 * are safe as long as we do an Absorb after completing BufferSync().
	 */
	AbsorbFsyncRequests();

	/*
	 * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
	 * checkpoint), we want to ignore fsync requests that are entered into the
	 * hashtable after this point --- they should be processed next time,
	 * instead.  We use mdsync_cycle_ctr to tell old entries apart from new
	 * ones: new ones will have cycle_ctr equal to the incremented value of
	 * mdsync_cycle_ctr.
	 *
	 * In normal circumstances, all entries present in the table at this point
	 * will have cycle_ctr exactly equal to the current (about to be old)
	 * value of mdsync_cycle_ctr.  However, if we fail partway through the
	 * fsync'ing loop, then older values of cycle_ctr might remain when we
	 * come back here to try again.  Repeated checkpoint failures would
	 * eventually wrap the counter around to the point where an old entry
	 * might appear new, causing us to skip it, possibly allowing a checkpoint
	 * to succeed that should not have.  To forestall wraparound, any time the
	 * previous mdsync() failed to complete, run through the table and
	 * forcibly set cycle_ctr = mdsync_cycle_ctr.
	 *
	 * Think not to merge this loop with the main loop, as the problem is
	 * exactly that that loop may fail before having visited all the entries.
	 * From a performance point of view it doesn't matter anyway, as this path
	 * will never be taken in a system that's functioning normally.
	 */
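	/*
	 * (For scale: with CycleCtr being a uint16, it would take 65536
	 * consecutive failed checkpoint attempts to wrap the counter all the
	 * way around; the defensive reset below means we need not bet on that
	 * never happening.)
	 */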
1147 	if (mdsync_in_progress)
1148 	{
1149 		/* prior try failed, so update any stale cycle_ctr values */
1150 		hash_seq_init(&hstat, pendingOpsTable);
1151 		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
1152 		{
1153 			entry->cycle_ctr = mdsync_cycle_ctr;
1154 		}
1155 	}
1156 
1157 	/* Advance counter so that new hashtable entries are distinguishable */
1158 	mdsync_cycle_ctr++;
1159 
1160 	/* Set flag to detect failure if we don't reach the end of the loop */
1161 	mdsync_in_progress = true;
1162 
1163 	/* Now scan the hashtable for fsync requests to process */
1164 	absorb_counter = FSYNCS_PER_ABSORB;
1165 	hash_seq_init(&hstat, pendingOpsTable);
1166 	while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
1167 	{
1168 		ForkNumber	forknum;
1169 
1170 		/*
1171 		 * If the entry is new then don't process it this time; it might
1172 		 * contain multiple fsync-request bits, but they are all new.  Note
1173 		 * "continue" bypasses the hash-remove call at the bottom of the loop.
1174 		 */
1175 		if (entry->cycle_ctr == mdsync_cycle_ctr)
1176 			continue;
1177 
1178 		/* Else assert we haven't missed it */
1179 		Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);
1180 
1181 		/*
1182 		 * Scan over the forks and segments represented by the entry.
1183 		 *
1184 		 * The bitmap manipulations are slightly tricky, because we can call
1185 		 * AbsorbFsyncRequests() inside the loop and that could result in
1186 		 * bms_add_member() modifying and even re-palloc'ing the bitmapsets.
1187 		 * So we detach it, but if we fail we'll merge it with any new
1188 		 * requests that have arrived in the meantime.
1189 		 */
1190 		for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
1191 		{
1192 			Bitmapset  *requests = entry->requests[forknum];
1193 			int			segno;
1194 
1195 			entry->requests[forknum] = NULL;
1196 			entry->canceled[forknum] = false;
1197 
1198 			segno = -1;
1199 			while ((segno = bms_next_member(requests, segno)) >= 0)
1200 			{
1201 				int			failures;
1202 
1203 				/*
1204 				 * If fsync is off then we don't have to bother opening the
1205 				 * file at all.  (We delay checking until this point so that
1206 				 * changing fsync on the fly behaves sensibly.)
1207 				 */
1208 				if (!enableFsync)
1209 					continue;
1210 
1211 				/*
1212 				 * If in checkpointer, we want to absorb pending requests
1213 				 * every so often to prevent overflow of the fsync request
1214 				 * queue.  It is unspecified whether newly-added entries will
1215 				 * be visited by hash_seq_search, but we don't care since we
1216 				 * don't need to process them anyway.
1217 				 */
1218 				if (--absorb_counter <= 0)
1219 				{
1220 					AbsorbFsyncRequests();
1221 					absorb_counter = FSYNCS_PER_ABSORB;
1222 				}
1223 
1224 				/*
1225 				 * The fsync table could contain requests to fsync segments
1226 				 * that have been deleted (unlinked) by the time we get to
1227 				 * them. Rather than just hoping an ENOENT (or EACCES on
1228 				 * Windows) error can be ignored, what we do on error is
1229 				 * absorb pending requests and then retry.  Since mdunlink()
1230 				 * queues a "cancel" message before actually unlinking, the
1231 				 * fsync request is guaranteed to be marked canceled after the
1232 				 * absorb if it really was this case. DROP DATABASE likewise
1233 				 * has to tell us to forget fsync requests before it starts
1234 				 * deletions.
1235 				 */
1236 				for (failures = 0;; failures++) /* loop exits at "break" */
1237 				{
1238 					SMgrRelation reln;
1239 					MdfdVec    *seg;
1240 					char	   *path;
1241 					int			save_errno;
1242 
1243 					/*
1244 					 * Find or create an smgr hash entry for this relation.
1245 					 * This may seem a bit unclean -- md calling smgr?	But
1246 					 * it's really the best solution.  It ensures that the
1247 					 * open file reference isn't permanently leaked if we get
1248 					 * an error here. (You may say "but an unreferenced
1249 					 * SMgrRelation is still a leak!" Not really, because the
1250 					 * only case in which a checkpoint is done by a process
1251 					 * that isn't about to shut down is in the checkpointer,
1252 					 * and it will periodically do smgrcloseall(). This fact
1253 					 * justifies our not closing the reln in the success path
1254 					 * either, which is a good thing since in non-checkpointer
1255 					 * cases we couldn't safely do that.)
1256 					 */
1257 					reln = smgropen(entry->rnode, InvalidBackendId);
1258 
1259 					/* Attempt to open and fsync the target segment */
1260 					seg = _mdfd_getseg(reln, forknum,
1261 									   (BlockNumber) segno * (BlockNumber) RELSEG_SIZE,
1262 									   false,
1263 									   EXTENSION_RETURN_NULL
1264 									   | EXTENSION_DONT_CHECK_SIZE);
1265 
1266 					INSTR_TIME_SET_CURRENT(sync_start);
1267 
1268 					if (seg != NULL &&
1269 						FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) >= 0)
1270 					{
1271 						/* Success; update statistics about sync timing */
1272 						INSTR_TIME_SET_CURRENT(sync_end);
1273 						sync_diff = sync_end;
1274 						INSTR_TIME_SUBTRACT(sync_diff, sync_start);
1275 						elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
1276 						if (elapsed > longest)
1277 							longest = elapsed;
1278 						total_elapsed += elapsed;
1279 						processed++;
1280 						requests = bms_del_member(requests, segno);
1281 						if (log_checkpoints)
1282 							elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec",
1283 								 processed,
1284 								 FilePathName(seg->mdfd_vfd),
1285 								 (double) elapsed / 1000);
1286 
1287 						break;	/* out of retry loop */
1288 					}
1289 
1290 					/* Compute file name for use in message */
1291 					save_errno = errno;
1292 					path = _mdfd_segpath(reln, forknum, (BlockNumber) segno);
1293 					errno = save_errno;
1294 
1295 					/*
1296 					 * It is possible that the relation has been dropped or
1297 					 * truncated since the fsync request was entered.
1298 					 * Therefore, allow ENOENT, but only if we didn't fail
1299 					 * already on this file.  This applies both for
1300 					 * _mdfd_getseg() and for FileSync, since fd.c might have
1301 					 * closed the file behind our back.
1302 					 *
1303 					 * XXX is there any point in allowing more than one retry?
1304 					 * Don't see one at the moment, but easy to change the
1305 					 * test here if so.
1306 					 */
1307 					if (!FILE_POSSIBLY_DELETED(errno) ||
1308 						failures > 0)
1309 					{
1310 						Bitmapset  *new_requests;
1311 
1312 						/*
1313 						 * We need to merge these unsatisfied requests with
1314 						 * any others that have arrived since we started.
1315 						 */
1316 						new_requests = entry->requests[forknum];
1317 						entry->requests[forknum] =
1318 							bms_join(new_requests, requests);
1319 
1320 						errno = save_errno;
1321 						ereport(data_sync_elevel(ERROR),
1322 								(errcode_for_file_access(),
1323 								 errmsg("could not fsync file \"%s\": %m",
1324 										path)));
1325 					}
1326 					else
1327 						ereport(DEBUG1,
1328 								(errcode_for_file_access(),
1329 								 errmsg("could not fsync file \"%s\" but retrying: %m",
1330 										path)));
1331 					pfree(path);
1332 
1333 					/*
1334 					 * Absorb incoming requests and check to see if a cancel
1335 					 * arrived for this relation fork.
1336 					 */
1337 					AbsorbFsyncRequests();
1338 					absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */
1339 
1340 					if (entry->canceled[forknum])
1341 						break;
1342 				}				/* end retry loop */
1343 			}
1344 			bms_free(requests);
1345 		}
1346 
1347 		/*
1348 		 * We've finished everything that was requested before we started to
1349 		 * scan the entry.  If no new requests have been inserted meanwhile,
1350 		 * remove the entry.  Otherwise, update its cycle counter, as all the
1351 		 * requests now in it must have arrived during this cycle.
1352 		 */
1353 		for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
1354 		{
1355 			if (entry->requests[forknum] != NULL)
1356 				break;
1357 		}
1358 		if (forknum <= MAX_FORKNUM)
1359 			entry->cycle_ctr = mdsync_cycle_ctr;
1360 		else
1361 		{
1362 			/* Okay to remove it */
1363 			if (hash_search(pendingOpsTable, &entry->rnode,
1364 							HASH_REMOVE, NULL) == NULL)
1365 				elog(ERROR, "pendingOpsTable corrupted");
1366 		}
1367 	}							/* end loop over hashtable entries */
1368 
1369 	/* Return sync performance metrics for report at checkpoint end */
1370 	CheckpointStats.ckpt_sync_rels = processed;
1371 	CheckpointStats.ckpt_longest_sync = longest;
1372 	CheckpointStats.ckpt_agg_sync_time = total_elapsed;
1373 
1374 	/* Flag successful completion of mdsync */
1375 	mdsync_in_progress = false;
1376 }
1377 
1378 /*
1379  * mdpreckpt() -- Do pre-checkpoint work
1380  *
1381  * To distinguish unlink requests that arrived before this checkpoint
1382  * started from those that arrived during the checkpoint, we use a cycle
1383  * counter similar to the one we use for fsync requests. That cycle
1384  * counter is incremented here.
1385  *
1386  * This must be called *before* the checkpoint REDO point is determined.
1387  * That ensures that we won't delete files too soon.
1388  *
1389  * Note that we can't do anything here that depends on the assumption
1390  * that the checkpoint will be completed.
1391  */
1392 void
1393 mdpreckpt(void)
1394 {
1395 	/*
1396 	 * Any unlink requests arriving after this point will be assigned the next
1397 	 * cycle counter, and won't be unlinked until next checkpoint.
1398 	 */
1399 	mdckpt_cycle_ctr++;
1400 }
1401 
1402 /*
1403  * mdpostckpt() -- Do post-checkpoint work
1404  *
1405  * Remove any lingering files that can now be safely removed.
1406  */
1407 void
1408 mdpostckpt(void)
1409 {
1410 	int			absorb_counter;
1411 
1412 	absorb_counter = UNLINKS_PER_ABSORB;
1413 	while (pendingUnlinks != NIL)
1414 	{
1415 		PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
1416 		char	   *path;
1417 
1418 		/*
1419 		 * New entries are appended to the end, so if the entry is new we've
1420 		 * reached the end of old entries.
1421 		 *
1422 		 * Note: if just the right number of consecutive checkpoints fail, we
1423 		 * could be fooled here by cycle_ctr wraparound.  However, the only
1424 		 * consequence is that we'd delay unlinking for one more checkpoint,
1425 		 * which is perfectly tolerable.
1426 		 */
1427 		if (entry->cycle_ctr == mdckpt_cycle_ctr)
1428 			break;
1429 
1430 		/* Unlink the file */
1431 		path = relpathperm(entry->rnode, MAIN_FORKNUM);
1432 		if (unlink(path) < 0)
1433 		{
1434 			/*
1435 			 * There's a race condition, when the database is dropped at the
1436 			 * same time that we process the pending unlink requests. If the
1437 			 * DROP DATABASE deletes the file before we do, we will get ENOENT
1438 			 * here. rmtree() also has to ignore ENOENT errors, to deal with
1439 			 * the possibility that we delete the file first.
1440 			 */
1441 			if (errno != ENOENT)
1442 				ereport(WARNING,
1443 						(errcode_for_file_access(),
1444 						 errmsg("could not remove file \"%s\": %m", path)));
1445 		}
1446 		pfree(path);
1447 
1448 		/* And remove the list entry */
1449 		pendingUnlinks = list_delete_first(pendingUnlinks);
1450 		pfree(entry);
1451 
1452 		/*
1453 		 * As in mdsync, we don't want to stop absorbing fsync requests for a
1454 		 * long time when there are many deletions to be done.  We can safely
1455 		 * call AbsorbFsyncRequests() at this point in the loop (note it might
1456 		 * try to delete list entries).
1457 		 */
1458 		if (--absorb_counter <= 0)
1459 		{
1460 			AbsorbFsyncRequests();
1461 			absorb_counter = UNLINKS_PER_ABSORB;
1462 		}
1463 	}
1464 }
1465 
1466 /*
1467  * register_dirty_segment() -- Mark a relation segment as needing fsync
1468  *
1469  * If there is a local pending-ops table, just make an entry in it for
1470  * mdsync to process later.  Otherwise, try to pass off the fsync request
1471  * to the checkpointer process.  If that fails, just do the fsync
1472  * locally before returning (we hope this will not happen often enough
1473  * to be a performance problem).
1474  */
1475 static void
1476 register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
1477 {
1478 	/* Temp relations should never be fsync'd */
1479 	Assert(!SmgrIsTemp(reln));
1480 
1481 	if (pendingOpsTable)
1482 	{
1483 		/* push it into local pending-ops table */
1484 		RememberFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno);
1485 	}
1486 	else
1487 	{
1488 		if (ForwardFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno))
1489 			return;				/* passed it off successfully */
1490 
1491 		ereport(DEBUG1,
1492 				(errmsg("could not forward fsync request because request queue is full")));
1493 
1494 		if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
1495 			ereport(data_sync_elevel(ERROR),
1496 					(errcode_for_file_access(),
1497 					 errmsg("could not fsync file \"%s\": %m",
1498 							FilePathName(seg->mdfd_vfd))));
1499 	}
1500 }

/*
 * register_unlink() -- Schedule a file to be deleted after next checkpoint
 *
 * We don't bother passing in the fork number, because this is only used
 * with main forks.
 *
 * As with register_dirty_segment, this could involve either a local or
 * a remote pending-ops table.
 */
static void
register_unlink(RelFileNodeBackend rnode)
{
	/* Should never be used with temp relations */
	Assert(!RelFileNodeBackendIsTemp(rnode));

	if (pendingOpsTable)
	{
		/* push it into local pending-ops table */
		RememberFsyncRequest(rnode.node, MAIN_FORKNUM,
							 UNLINK_RELATION_REQUEST);
	}
	else
	{
		/*
		 * Notify the checkpointer about it.  If we fail to queue the request
		 * message, we have to sleep and try again, because we can't simply
		 * delete the file now.  Ugly, but hopefully won't happen often.
		 *
		 * XXX should we just leave the file orphaned instead?
		 */
		Assert(IsUnderPostmaster);
		while (!ForwardFsyncRequest(rnode.node, MAIN_FORKNUM,
									UNLINK_RELATION_REQUEST))
			pg_usleep(10000L);	/* 10 msec seems a good number */
	}
}

/*
 * RememberFsyncRequest() -- callback from checkpointer side of fsync request
 *
 * We stuff fsync requests into the local hash table for execution
 * during the checkpointer's next checkpoint.  UNLINK requests go into a
 * separate linked list, however, because they get processed separately.
 *
 * The range of possible segment numbers is way less than the range of
 * BlockNumber, so we can reserve high values of segno for special purposes.
 * We define three:
 * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation,
 *	 either for one fork, or all forks if forknum is InvalidForkNumber
 * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
 * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
 *	 checkpoint.
 * Note also that we're assuming real segment numbers don't exceed INT_MAX.
 *
 * (Handling FORGET_DATABASE_FSYNC requests is a tad slow because the hash
 * table has to be searched linearly, but dropping a database is a pretty
 * heavyweight operation anyhow, so we'll live with it.)
 */
void
RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
{
	Assert(pendingOpsTable);

	if (segno == FORGET_RELATION_FSYNC)
	{
		/* Remove any pending requests for the relation (one or all forks) */
		PendingOperationEntry *entry;

		entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
													  &rnode,
													  HASH_FIND,
													  NULL);
		if (entry)
		{
			/*
			 * We can't just delete the entry since mdsync could have an
			 * active hashtable scan.  Instead we delete the bitmapsets; this
			 * is safe because of the way mdsync is coded.  We also set the
			 * "canceled" flags so that mdsync can tell that a cancel arrived
			 * for the fork(s).
			 */
			if (forknum == InvalidForkNumber)
			{
				/* remove requests for all forks */
				for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
				{
					bms_free(entry->requests[forknum]);
					entry->requests[forknum] = NULL;
					entry->canceled[forknum] = true;
				}
			}
			else
			{
				/* remove requests for single fork */
				bms_free(entry->requests[forknum]);
				entry->requests[forknum] = NULL;
				entry->canceled[forknum] = true;
			}
		}
	}
	else if (segno == FORGET_DATABASE_FSYNC)
	{
		/* Remove any pending requests for the entire database */
		HASH_SEQ_STATUS hstat;
		PendingOperationEntry *entry;
		ListCell   *cell,
				   *prev,
				   *next;

		/* Remove fsync requests */
		hash_seq_init(&hstat, pendingOpsTable);
		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
		{
			if (entry->rnode.dbNode == rnode.dbNode)
			{
				/* remove requests for all forks */
				for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
				{
					bms_free(entry->requests[forknum]);
					entry->requests[forknum] = NULL;
					entry->canceled[forknum] = true;
				}
			}
		}

		/* Remove unlink requests */
		prev = NULL;
		for (cell = list_head(pendingUnlinks); cell; cell = next)
		{
			PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);

			next = lnext(cell);
			if (entry->rnode.dbNode == rnode.dbNode)
			{
				pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev);
				pfree(entry);
			}
			else
				prev = cell;
		}
	}
	else if (segno == UNLINK_RELATION_REQUEST)
	{
		/* Unlink request: put it in the linked list */
		MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
		PendingUnlinkEntry *entry;

		/* PendingUnlinkEntry doesn't store forknum, since it's always MAIN */
		Assert(forknum == MAIN_FORKNUM);

		entry = palloc(sizeof(PendingUnlinkEntry));
		entry->rnode = rnode;
		entry->cycle_ctr = mdckpt_cycle_ctr;

		pendingUnlinks = lappend(pendingUnlinks, entry);

		MemoryContextSwitchTo(oldcxt);
	}
	else
	{
		/* Normal case: enter a request to fsync this segment */
		MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
		PendingOperationEntry *entry;
		bool		found;

		entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
													  &rnode,
													  HASH_ENTER,
													  &found);
		/* if new entry, initialize it */
		if (!found)
		{
			entry->cycle_ctr = mdsync_cycle_ctr;
			MemSet(entry->requests, 0, sizeof(entry->requests));
			MemSet(entry->canceled, 0, sizeof(entry->canceled));
		}

		/*
		 * NB: it's intentional that we don't change cycle_ctr if the entry
		 * already exists.  The cycle_ctr must represent the oldest fsync
		 * request that could be in the entry.
		 */

		entry->requests[forknum] = bms_add_member(entry->requests[forknum],
												  (int) segno);

		MemoryContextSwitchTo(oldcxt);
	}
}
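
/*
 * A minimal sketch (guarded out of the build) of how the reserved segno
 * values relate to InvalidBlockNumber (0xFFFFFFFF): FORGET_RELATION_FSYNC,
 * FORGET_DATABASE_FSYNC and UNLINK_RELATION_REQUEST occupy 0xFFFFFFFF,
 * 0xFFFFFFFE and 0xFFFFFFFD respectively, far above any real segment
 * number (which is assumed to fit in an int).  The helper name below is
 * hypothetical, for exposition only.
 */
#ifdef NOT_USED
static bool
example_segno_is_special(BlockNumber segno)
{
	/* all reserved values are >= the lowest of them */
	return segno >= UNLINK_RELATION_REQUEST;
}
#endif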

/*
 * ForgetRelationFsyncRequests -- forget any fsyncs for a relation fork
 *
 * forknum == InvalidForkNumber means all forks, although this code doesn't
 * actually know that, since it's just forwarding the request elsewhere.
 */
void
ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum)
{
	if (pendingOpsTable)
	{
		/* standalone backend or startup process: fsync state is local */
		RememberFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC);
	}
	else if (IsUnderPostmaster)
	{
		/*
		 * Notify the checkpointer about it.  If we fail to queue the cancel
		 * message, we have to sleep and try again ... ugly, but hopefully
		 * won't happen often.
		 *
		 * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with an
		 * error would leave the no-longer-used file still present on disk,
		 * which would be bad, so I'm inclined to assume that the checkpointer
		 * will always empty the queue soon.
		 */
		while (!ForwardFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC))
			pg_usleep(10000L);	/* 10 msec seems a good number */

		/*
		 * Note we don't wait for the checkpointer to actually absorb the
		 * cancel message; see mdsync() for the implications.
		 */
	}
}

/*
 * ForgetDatabaseFsyncRequests -- forget any fsyncs and unlinks for a DB
 */
void
ForgetDatabaseFsyncRequests(Oid dbid)
{
	RelFileNode rnode;

	rnode.dbNode = dbid;
	rnode.spcNode = 0;
	rnode.relNode = 0;

	if (pendingOpsTable)
	{
		/* standalone backend or startup process: fsync state is local */
		RememberFsyncRequest(rnode, InvalidForkNumber, FORGET_DATABASE_FSYNC);
	}
	else if (IsUnderPostmaster)
	{
		/* see notes in ForgetRelationFsyncRequests */
		while (!ForwardFsyncRequest(rnode, InvalidForkNumber,
									FORGET_DATABASE_FSYNC))
			pg_usleep(10000L);	/* 10 msec seems a good number */
	}
}

/*
 * DropRelationFiles -- drop files of all given relations
 */
void
DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo)
{
	SMgrRelation *srels;
	int			i;

	srels = palloc(sizeof(SMgrRelation) * ndelrels);
	for (i = 0; i < ndelrels; i++)
	{
		SMgrRelation srel = smgropen(delrels[i], InvalidBackendId);

		if (isRedo)
		{
			ForkNumber	fork;

			for (fork = 0; fork <= MAX_FORKNUM; fork++)
				XLogDropRelation(delrels[i], fork);
		}
		srels[i] = srel;
	}

	smgrdounlinkall(srels, ndelrels, isRedo);

	for (i = 0; i < ndelrels; i++)
		smgrclose(srels[i]);
	pfree(srels);
}


/*
 *	_fdvec_resize() -- Resize the fork's open segments array
 */
static void
_fdvec_resize(SMgrRelation reln,
			  ForkNumber forknum,
			  int nseg)
{
	if (nseg == 0)
	{
		if (reln->md_num_open_segs[forknum] > 0)
		{
			pfree(reln->md_seg_fds[forknum]);
			reln->md_seg_fds[forknum] = NULL;
		}
	}
	else if (reln->md_num_open_segs[forknum] == 0)
	{
		reln->md_seg_fds[forknum] =
			MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg);
	}
	else
	{
		/*
		 * It doesn't seem worthwhile complicating the code to amortize
		 * repalloc() calls.  Those are far faster than PathNameOpenFile() or
		 * FileClose(), and the memory context internally will sometimes avoid
		 * doing an actual reallocation.
		 */
		reln->md_seg_fds[forknum] =
			repalloc(reln->md_seg_fds[forknum],
					 sizeof(MdfdVec) * nseg);
	}

	reln->md_num_open_segs[forknum] = nseg;
}

/*
 * Return the filename for the specified segment of the relation. The
 * returned string is palloc'd.
 */
static char *
_mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
{
	char	   *path,
			   *fullpath;

	path = relpath(reln->smgr_rnode, forknum);

	if (segno > 0)
	{
		fullpath = psprintf("%s.%u", path, segno);
		pfree(path);
	}
	else
		fullpath = path;

	return fullpath;
}
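
/*
 * A minimal sketch (guarded out of the build) of the naming scheme above.
 * The relation path and OIDs here are hypothetical examples; only the
 * ".<segno>" suffix rule comes from _mdfd_segpath() itself.
 */
#ifdef NOT_USED
static void
example_segpath_naming(SMgrRelation reln)
{
	/* Assuming relpath() yields "base/16384/16385": */
	char	   *p0 = _mdfd_segpath(reln, MAIN_FORKNUM, 0);	/* base/16384/16385 */
	char	   *p1 = _mdfd_segpath(reln, MAIN_FORKNUM, 1);	/* base/16384/16385.1 */

	pfree(p0);
	pfree(p1);
}
#endif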

/*
 * Open the specified segment of the relation,
 * and make a MdfdVec object for it.  Returns NULL on failure.
 */
static MdfdVec *
_mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
			  int oflags)
{
	MdfdVec    *v;
	int			fd;
	char	   *fullpath;

	fullpath = _mdfd_segpath(reln, forknum, segno);

	/* open the file */
	fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags);

	pfree(fullpath);

	if (fd < 0)
		return NULL;

	if (segno <= reln->md_num_open_segs[forknum])
		_fdvec_resize(reln, forknum, segno + 1);

	/* fill the entry */
	v = &reln->md_seg_fds[forknum][segno];
	v->mdfd_vfd = fd;
	v->mdfd_segno = segno;

	Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));

	/* all done */
	return v;
}

/*
 *	_mdfd_getseg() -- Find the segment of the relation holding the
 *		specified block.
 *
 * If the segment doesn't exist, we ereport, return NULL, or create the
 * segment, according to "behavior".  Note: skipFsync is only used in the
 * EXTENSION_CREATE case.
 */
static MdfdVec *
_mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
			 bool skipFsync, int behavior)
{
	MdfdVec    *v;
	BlockNumber targetseg;
	BlockNumber nextsegno;

	/* some way to handle non-existent segments needs to be specified */
	Assert(behavior &
		   (EXTENSION_FAIL | EXTENSION_CREATE | EXTENSION_RETURN_NULL));

	targetseg = blkno / ((BlockNumber) RELSEG_SIZE);

	/* if an existing and opened segment, we're done */
	if (targetseg < reln->md_num_open_segs[forknum])
	{
		v = &reln->md_seg_fds[forknum][targetseg];
		return v;
	}

	/*
	 * The target segment is not yet open. Iterate over all the segments
	 * between the last opened and the target segment. This way missing
	 * segments either raise an error, or get created (according to
	 * 'behavior'). Start with either the last opened, or the first segment if
	 * none was opened before.
	 */
	if (reln->md_num_open_segs[forknum] > 0)
		v = &reln->md_seg_fds[forknum][reln->md_num_open_segs[forknum] - 1];
	else
	{
		v = mdopen(reln, forknum, behavior);
		if (!v)
			return NULL;		/* if behavior & EXTENSION_RETURN_NULL */
	}

	for (nextsegno = reln->md_num_open_segs[forknum];
		 nextsegno <= targetseg; nextsegno++)
	{
		BlockNumber nblocks = _mdnblocks(reln, forknum, v);
		int			flags = 0;

		Assert(nextsegno == v->mdfd_segno + 1);

		if (nblocks > ((BlockNumber) RELSEG_SIZE))
			elog(FATAL, "segment too big");

		if ((behavior & EXTENSION_CREATE) ||
			(InRecovery && (behavior & EXTENSION_CREATE_RECOVERY)))
		{
			/*
			 * Normally we will create new segments only if authorized by the
			 * caller (i.e., we are doing mdextend()).  But when doing WAL
			 * recovery, create segments anyway; this allows cases such as
			 * replaying WAL data that has a write into a high-numbered
			 * segment of a relation that was later deleted. We want to go
			 * ahead and create the segments so we can finish out the replay.
			 * However if the caller has specified
			 * EXTENSION_REALLY_RETURN_NULL, then extension is not desired
			 * even in recovery; we won't reach this point in that case.
			 *
			 * We have to maintain the invariant that segments before the last
			 * active segment are of size RELSEG_SIZE; therefore, if
			 * extending, pad them out with zeroes if needed.  (This only
			 * matters if in recovery, or if the caller is extending the
			 * relation discontiguously, but that can happen in hash indexes.)
			 */
			if (nblocks < ((BlockNumber) RELSEG_SIZE))
			{
				char	   *zerobuf = palloc0(BLCKSZ);

				mdextend(reln, forknum,
						 nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
						 zerobuf, skipFsync);
				pfree(zerobuf);
			}
			flags = O_CREAT;
		}
		else if (!(behavior & EXTENSION_DONT_CHECK_SIZE) &&
				 nblocks < ((BlockNumber) RELSEG_SIZE))
		{
			/*
			 * When not extending (or explicitly including truncated
			 * segments), only open the next segment if the current one is
			 * exactly RELSEG_SIZE.  If not (this branch), either return NULL
			 * or fail.
			 */
			if (behavior & EXTENSION_RETURN_NULL)
			{
				/*
				 * Some callers discern between reasons for _mdfd_getseg()
				 * returning NULL based on errno. As there's no failing
				 * syscall involved in this case, explicitly set errno to
				 * ENOENT, as that seems the closest interpretation.
				 */
				errno = ENOENT;
				return NULL;
			}

			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks",
							_mdfd_segpath(reln, forknum, nextsegno),
							blkno, nblocks)));
		}

		v = _mdfd_openseg(reln, forknum, nextsegno, flags);

		if (v == NULL)
		{
			if ((behavior & EXTENSION_RETURN_NULL) &&
				FILE_POSSIBLY_DELETED(errno))
				return NULL;
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\" (target block %u): %m",
							_mdfd_segpath(reln, forknum, nextsegno),
							blkno)));
		}
	}

	return v;
}
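
/*
 * A worked example (guarded out of the build) of the block-to-segment
 * arithmetic above.  Assuming the default RELSEG_SIZE of 131072 blocks
 * (1GB at the default BLCKSZ of 8192), block 200000 maps to segment
 * 200000 / 131072 = 1 (the ".1" file) at offset 200000 % 131072 = 68928
 * within that segment.  The function name is hypothetical, for exposition
 * only.
 */
#ifdef NOT_USED
static void
example_block_to_segment(BlockNumber blkno,
						 BlockNumber *targetseg, BlockNumber *segoff)
{
	*targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
	*segoff = blkno % ((BlockNumber) RELSEG_SIZE);
}
#endif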

/*
 * Get number of blocks present in a single disk file
 */
static BlockNumber
_mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
{
	off_t		len;

	len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
	if (len < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not seek to end of file \"%s\": %m",
						FilePathName(seg->mdfd_vfd))));
	/* note that this calculation will ignore any partial block at EOF */
	return (BlockNumber) (len / BLCKSZ);
}
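
/*
 * A worked example of the rounding above: assuming the default BLCKSZ of
 * 8192, a segment file of 524288000 bytes reports 524288000 / 8192 = 64000
 * blocks, and a file of 524288100 bytes reports the same 64000 blocks,
 * since the partial block at EOF is ignored.
 */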