1 /*-------------------------------------------------------------------------
2  *
3  * fd.c
4  *	  Virtual file descriptor code.
5  *
6  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  *	  src/backend/storage/file/fd.c
11  *
12  * NOTES:
13  *
14  * This code manages a cache of 'virtual' file descriptors (VFDs).
15  * The server opens many file descriptors for a variety of reasons,
16  * including base tables, scratch files (e.g., sort and hash spool
17  * files), and random calls to C library routines like system(3); it
18  * is quite easy to exceed system limits on the number of open files a
19  * single process can have.  (This is around 1024 on many modern
20  * operating systems, but may be lower on others.)
21  *
 * VFDs are managed as an LRU pool, with actual OS file descriptors
 * being opened and closed as needed.  Obviously, if a file is
 * opened using these interfaces, all subsequent operations on it must
 * also be through these interfaces (the File type is not a real file
 * descriptor).
27  *
28  * For this scheme to work, most (if not all) routines throughout the
29  * server should use these interfaces instead of calling the C library
30  * routines (e.g., open(2) and fopen(3)) themselves.  Otherwise, we
31  * may find ourselves short of real file descriptors anyway.
32  *
33  * INTERFACE ROUTINES
34  *
35  * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36  * A File opened with OpenTemporaryFile is automatically deleted when the
37  * File is closed, either explicitly or implicitly at end of transaction or
38  * process exit. PathNameOpenFile is intended for files that are held open
39  * for a long time, like relation files. It is the caller's responsibility
40  * to close them, there is no automatic mechanism in fd.c for that.
41  *
42  * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43  * temporary files that have names so that they can be shared between
44  * backends.  Such files are automatically closed and count against the
45  * temporary file limit of the backend that creates them, but unlike anonymous
46  * files they are not automatically deleted.  See sharedfileset.c for a shared
47  * ownership mechanism that provides automatic cleanup for shared files when
48  * the last of a group of backends detaches.
49  *
50  * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51  * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52  * They behave like the corresponding native functions, except that the handle
53  * is registered with the current subtransaction, and will be automatically
54  * closed at abort. These are intended mainly for short operations like
55  * reading a configuration file; there is a limit on the number of files that
56  * can be opened using these functions at any one time.
57  *
58  * Finally, BasicOpenFile is just a thin wrapper around open() that can
59  * release file descriptors in use by the virtual file descriptors if
60  * necessary. There is no automatic cleanup of file descriptors returned by
61  * BasicOpenFile, it is solely the caller's responsibility to close the file
62  * descriptor by calling close(2).
63  *
64  * If a non-virtual file descriptor needs to be held open for any length of
65  * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
66  * (and eventually ReleaseExternalFD), so that we can take it into account
67  * while deciding how many VFDs can be open.  This applies to FDs obtained
68  * with BasicOpenFile as well as those obtained without use of any fd.c API.
69  *
70  *-------------------------------------------------------------------------
71  */
72 
73 #include "postgres.h"
74 
75 #include <dirent.h>
76 #include <sys/file.h>
77 #include <sys/param.h>
78 #include <sys/stat.h>
79 #include <sys/types.h>
80 #ifndef WIN32
81 #include <sys/mman.h>
82 #endif
83 #include <limits.h>
84 #include <unistd.h>
85 #include <fcntl.h>
86 #ifdef HAVE_SYS_RESOURCE_H
87 #include <sys/resource.h>		/* for getrlimit */
88 #endif
89 
90 #include "access/xact.h"
91 #include "access/xlog.h"
92 #include "catalog/pg_tablespace.h"
93 #include "common/file_perm.h"
94 #include "common/file_utils.h"
95 #include "miscadmin.h"
96 #include "pgstat.h"
97 #include "port/pg_iovec.h"
98 #include "portability/mem.h"
99 #include "storage/fd.h"
100 #include "storage/ipc.h"
101 #include "utils/guc.h"
102 #include "utils/resowner_private.h"
103 
104 /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
105 #if defined(HAVE_SYNC_FILE_RANGE)
106 #define PG_FLUSH_DATA_WORKS 1
107 #elif !defined(WIN32) && defined(MS_ASYNC)
108 #define PG_FLUSH_DATA_WORKS 1
109 #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
110 #define PG_FLUSH_DATA_WORKS 1
111 #endif
112 
113 /*
114  * We must leave some file descriptors free for system(), the dynamic loader,
115  * and other code that tries to open files without consulting fd.c.  This
116  * is the number left free.  (While we try fairly hard to prevent EMFILE
117  * errors, there's never any guarantee that we won't get ENFILE due to
118  * other processes chewing up FDs.  So it's a bad idea to try to open files
119  * without consulting fd.c.  Nonetheless we cannot control all code.)
120  *
121  * Because this is just a fixed setting, we are effectively assuming that
122  * no such code will leave FDs open over the long term; otherwise the slop
123  * is likely to be insufficient.  Note in particular that we expect that
124  * loading a shared library does not result in any permanent increase in
125  * the number of open files.  (This appears to be true on most if not
126  * all platforms as of Feb 2004.)
127  */
128 #define NUM_RESERVED_FDS		10
129 
130 /*
131  * If we have fewer than this many usable FDs after allowing for the reserved
132  * ones, choke.  (This value is chosen to work with "ulimit -n 64", but not
133  * much less than that.  Note that this value ensures numExternalFDs can be
134  * at least 16; as of this writing, the contrib/postgres_fdw regression tests
135  * will not pass unless that can grow to at least 14.)
136  */
137 #define FD_MINFREE				48
138 
139 /*
140  * A number of platforms allow individual processes to open many more files
141  * than they can really support when *many* processes do the same thing.
142  * This GUC parameter lets the DBA limit max_safe_fds to something less than
143  * what the postmaster's initial probe suggests will work.
144  */
145 int			max_files_per_process = 1000;
146 
147 /*
148  * Maximum number of file descriptors to open for operations that fd.c knows
149  * about (VFDs, AllocateFile etc, or "external" FDs).  This is initialized
150  * to a conservative value, and remains that way indefinitely in bootstrap or
151  * standalone-backend cases.  In normal postmaster operation, the postmaster
152  * calls set_max_safe_fds() late in initialization to update the value, and
153  * that value is then inherited by forked subprocesses.
154  *
155  * Note: the value of max_files_per_process is taken into account while
156  * setting this variable, and so need not be tested separately.
157  */
158 int			max_safe_fds = FD_MINFREE;	/* default if not changed */
159 
160 /* Whether it is safe to continue running after fsync() fails. */
161 bool		data_sync_retry = false;
162 
163 /* How SyncDataDirectory() should do its job. */
164 int			recovery_init_sync_method = RECOVERY_INIT_SYNC_METHOD_FSYNC;
165 
/*
 * Debugging support.
 *
 * DO_DB(A) executes the statement A only in builds that define FDDEBUG.
 * In that case errno is saved before A runs and restored afterwards, so
 * the debugging code cannot disturb the errno value seen by the caller.
 * Without FDDEBUG the macro expands to a no-op expression and A is not
 * evaluated at all.
 */

#ifdef FDDEBUG
#define DO_DB(A) \
	do { \
		int			_do_db_save_errno = errno; \
		A; \
		errno = _do_db_save_errno; \
	} while (0)
#else
#define DO_DB(A) \
	((void) 0)
#endif
179 
/* Value stored in a Vfd's "fd" field while it has no kernel FD assigned */
#define VFD_CLOSED (-1)

/*
 * FileIsValid: true if "file" is a strictly positive index that lies within
 * the current bounds of VfdCache and whose entry has a non-NULL fileName.
 * Index zero never qualifies.
 */
#define FileIsValid(file) \
	((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)

/*
 * FileIsNotOpen: true if the VFD's fd field is VFD_CLOSED.  No validity
 * check is made on "file" here.
 */
#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)

/* these are the assigned bits in fdstate below: */
#define FD_DELETE_AT_CLOSE	(1 << 0)	/* T = delete when closed */
#define FD_CLOSE_AT_EOXACT	(1 << 1)	/* T = close at eoXact */
#define FD_TEMP_FILE_LIMIT	(1 << 2)	/* T = respect temp_file_limit */
191 
/*
 * One entry of the virtual file descriptor cache.
 *
 * An entry is in use when fileName is non-NULL.  It then carries the
 * information needed to (re)open the file (fileName, fileFlags, fileMode),
 * plus the kernel FD currently assigned to it, or VFD_CLOSED if there is
 * none.  The nextFree and lru* fields hold VfdCache indexes rather than
 * pointers.
 */
typedef struct vfd
{
	int			fd;				/* current FD, or VFD_CLOSED if none */
	unsigned short fdstate;		/* bitflags for VFD's state (FD_* bits) */
	ResourceOwner resowner;		/* owner, for automatic cleanup */
	File		nextFree;		/* link to next free VFD, if in freelist */
	File		lruMoreRecently;	/* doubly linked recency-of-use list */
	File		lruLessRecently;
	off_t		fileSize;		/* current size of file (0 if not temporary) */
	char	   *fileName;		/* name of file, or NULL for unused VFD */
	/* NB: fileName is malloc'd, and must be free'd when closing the VFD */
	int			fileFlags;		/* open(2) flags for (re)opening the file */
	mode_t		fileMode;		/* mode to pass to open(2) */
} Vfd;
206 
207 /*
208  * Virtual File Descriptor array pointer and size.  This grows as
209  * needed.  'File' values are indexes into this array.
210  * Note that VfdCache[0] is not a usable VFD, just a list header.
211  */
212 static Vfd *VfdCache;
213 static Size SizeVfdCache = 0;
214 
215 /*
216  * Number of file descriptors known to be in use by VFD entries.
217  */
218 static int	nfile = 0;
219 
220 /*
221  * Flag to tell whether it's worth scanning VfdCache looking for temp files
222  * to close
223  */
224 static bool have_xact_temporary_files = false;
225 
226 /*
227  * Tracks the total size of all temporary files.  Note: when temp_file_limit
228  * is being enforced, this cannot overflow since the limit cannot be more
229  * than INT_MAX kilobytes.  When not enforcing, it could theoretically
230  * overflow, but we don't care.
231  */
232 static uint64 temporary_files_size = 0;
233 
/*
 * List of OS handles opened with AllocateFile, AllocateDir and
 * OpenTransientFile.
 *
 * NOTE(review): the kind enum also has a "pipe" member, presumably for
 * handles obtained through OpenPipeStream (described in the file header as
 * the popen(3) wrapper) -- confirm against the code that sets "kind".
 *
 * Each entry records which kind of handle it is, the subtransaction that
 * created it, and the handle itself.  The "desc" union holds a FILE *, a
 * DIR * or a raw integer descriptor; which member is valid is presumably
 * determined by "kind".
 */
typedef enum
{
	AllocateDescFile,
	AllocateDescPipe,
	AllocateDescDir,
	AllocateDescRawFD
} AllocateDescKind;

typedef struct
{
	AllocateDescKind kind;		/* discriminator for "desc" */
	SubTransactionId create_subid;	/* subtransaction that created it */
	union
	{
		FILE	   *file;
		DIR		   *dir;
		int			fd;
	}			desc;
} AllocateDesc;
257 
258 static int	numAllocatedDescs = 0;
259 static int	maxAllocatedDescs = 0;
260 static AllocateDesc *allocatedDescs = NULL;
261 
262 /*
263  * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
264  */
265 static int	numExternalFDs = 0;
266 
267 /*
268  * Number of temporary files opened during the current session;
269  * this is used in generation of tempfile names.
270  */
271 static long tempFileCounter = 0;
272 
273 /*
274  * Array of OIDs of temp tablespaces.  (Some entries may be InvalidOid,
275  * indicating that the current database's default tablespace should be used.)
276  * When numTempTableSpaces is -1, this has not been set in the current
277  * transaction.
278  */
279 static Oid *tempTableSpaces = NULL;
280 static int	numTempTableSpaces = -1;
281 static int	nextTempTableSpace = 0;
282 
283 
284 /*--------------------
285  *
286  * Private Routines
287  *
288  * Delete		   - delete a file from the Lru ring
289  * LruDelete	   - remove a file from the Lru ring and close its FD
290  * Insert		   - put a file at the front of the Lru ring
291  * LruInsert	   - put a file at the front of the Lru ring and open it
292  * ReleaseLruFile  - Release an fd by closing the last entry in the Lru ring
293  * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
294  * AllocateVfd	   - grab a free (or new) file record (from VfdCache)
295  * FreeVfd		   - free a file record
296  *
297  * The Least Recently Used ring is a doubly linked list that begins and
298  * ends on element zero.  Element zero is special -- it doesn't represent
299  * a file and its "fd" field always == VFD_CLOSED.  Element zero is just an
300  * anchor that shows us the beginning/end of the ring.
301  * Only VFD elements that are currently really open (have an FD assigned) are
302  * in the Lru ring.  Elements that are "virtually" open can be recognized
303  * by having a non-null fileName field.
304  *
305  * example:
306  *
307  *	   /--less----\				   /---------\
308  *	   v		   \			  v			  \
309  *	 #0 --more---> LeastRecentlyUsed --more-\ \
310  *	  ^\									| |
311  *	   \\less--> MostRecentlyUsedFile	<---/ |
312  *		\more---/					 \--less--/
313  *
314  *--------------------
315  */
316 static void Delete(File file);
317 static void LruDelete(File file);
318 static void Insert(File file);
319 static int	LruInsert(File file);
320 static bool ReleaseLruFile(void);
321 static void ReleaseLruFiles(void);
322 static File AllocateVfd(void);
323 static void FreeVfd(File file);
324 
325 static int	FileAccess(File file);
326 static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
327 static bool reserveAllocatedDesc(void);
328 static int	FreeDesc(AllocateDesc *desc);
329 
330 static void AtProcExit_Files(int code, Datum arg);
331 static void CleanupTempFiles(bool isCommit, bool isProcExit);
332 static void RemovePgTempRelationFiles(const char *tsdirname);
333 static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
334 
335 static void walkdir(const char *path,
336 					void (*action) (const char *fname, bool isdir, int elevel),
337 					bool process_symlinks,
338 					int elevel);
339 #ifdef PG_FLUSH_DATA_WORKS
340 static void pre_sync_fname(const char *fname, bool isdir, int elevel);
341 #endif
342 static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
343 static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
344 
345 static int	fsync_parent_path(const char *fname, int elevel);
346 
347 
/*
 * pg_fsync --- flush a file descriptor to disk, with or without writethrough
 *
 * Dispatches to pg_fsync_writethrough() when the build supports a distinct
 * writethrough method and sync_method selects it; otherwise dispatches to
 * pg_fsync_no_writethrough().  The result of the chosen routine is returned
 * unchanged.
 */
int
pg_fsync(int fd)
{
#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
	struct stat st;

	/*
	 * Portability cross-check, assert-enabled non-Windows builds only.
	 *
	 * Some fsync() implementations are picky about the access mode the
	 * descriptor was opened with, and the rules differ between regular files
	 * and directories.  Anything that may end up here should therefore have
	 * been opened in a mode acceptable everywhere: regular files writable
	 * (O_RDWR or O_WRONLY), directories read-only (O_RDONLY).
	 *
	 * A failing fstat() is ignored; the real fsync() below will report any
	 * genuine problem.  The check sits in this function, rather than next to
	 * the fsync() call, so that it is still exercised when fsync is disabled.
	 */
	if (fstat(fd, &st) == 0)
	{
		/* O_RDONLY is historically 0, so only the write bits can be tested */
		int			write_bits = fcntl(fd, F_GETFL) & (O_RDWR | O_WRONLY);

		if (S_ISDIR(st.st_mode))
			Assert(write_bits == 0);
		else
			Assert(write_bits != 0);
	}
	errno = 0;
#endif

	/* the sync_method test is compiled only when it can make a difference */
#if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
	if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
		return pg_fsync_writethrough(fd);
#endif

	return pg_fsync_no_writethrough(fd);
}
400 
401 
402 /*
403  * pg_fsync_no_writethrough --- same as fsync except does nothing if
404  *	enableFsync is off
405  */
406 int
pg_fsync_no_writethrough(int fd)407 pg_fsync_no_writethrough(int fd)
408 {
409 	if (enableFsync)
410 		return fsync(fd);
411 	else
412 		return 0;
413 }
414 
415 /*
416  * pg_fsync_writethrough
417  */
418 int
pg_fsync_writethrough(int fd)419 pg_fsync_writethrough(int fd)
420 {
421 	if (enableFsync)
422 	{
423 #ifdef WIN32
424 		return _commit(fd);
425 #elif defined(F_FULLFSYNC)
426 		return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
427 #else
428 		errno = ENOSYS;
429 		return -1;
430 #endif
431 	}
432 	else
433 		return 0;
434 }
435 
436 /*
437  * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
438  *
439  * Not all platforms have fdatasync; treat as fsync if not available.
440  */
441 int
pg_fdatasync(int fd)442 pg_fdatasync(int fd)
443 {
444 	if (enableFsync)
445 	{
446 #ifdef HAVE_FDATASYNC
447 		return fdatasync(fd);
448 #else
449 		return fsync(fd);
450 #endif
451 	}
452 	else
453 		return 0;
454 }
455 
/*
 * pg_flush_data --- advise OS that the described dirty data should be flushed
 *
 * fd:		descriptor of the file whose data should be written back
 * offset:	start of the byte range
 * nbytes:	length of the byte range
 *
 * offset of 0 with nbytes 0 means that the entire file should be flushed
 *
 * Nothing is returned; failures are reported through ereport().  Up to
 * three implementations are compiled in, depending on platform support, and
 * are tried in order: sync_file_range(), mmap()+msync(), posix_fadvise().
 * The first and last always return once attempted; the msync() variant
 * returns unless mmap() could not be performed, in which case control falls
 * through to whatever implementation follows.
 */
void
pg_flush_data(int fd, off_t offset, off_t nbytes)
{
	/*
	 * Right now file flushing is primarily used to reduce the impact of
	 * later fsync()/fdatasync() calls.  Thus don't trigger flushes if fsyncs
	 * are disabled - that's a decision we might want to make configurable at
	 * some point.
	 */
	if (!enableFsync)
		return;

	/*
	 * We compile all alternatives that are supported on the current platform,
	 * to find portability problems more easily.
	 */
#if defined(HAVE_SYNC_FILE_RANGE)
	{
		int			rc;
		/* set once the kernel has reported ENOSYS; disables this method */
		static bool not_implemented_by_kernel = false;

		if (not_implemented_by_kernel)
			return;

		/*
		 * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
		 * tells the OS that writeback for the specified blocks should be
		 * started, but that we don't want to wait for completion.  Note that
		 * this call might block if too much dirty data exists in the range.
		 * This is the preferable method on OSs supporting it, as it works
		 * reliably when available (contrast to msync()) and doesn't flush out
		 * clean data (like FADV_DONTNEED).
		 */
		rc = sync_file_range(fd, offset, nbytes,
							 SYNC_FILE_RANGE_WRITE);
		if (rc != 0)
		{
			int			elevel;

			/*
			 * For systems that don't have an implementation of
			 * sync_file_range() such as Windows WSL, generate only one
			 * warning and then suppress all further attempts by this process.
			 */
			if (errno == ENOSYS)
			{
				elevel = WARNING;
				not_implemented_by_kernel = true;
			}
			else
				elevel = data_sync_elevel(WARNING);

			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not flush dirty data: %m")));
		}

		/* this method never falls through to the ones below */
		return;
	}
#endif
#if !defined(WIN32) && defined(MS_ASYNC)
	{
		void	   *p;
		static int	pagesize = 0;

		/*
		 * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
		 * writeback. On linux it only does so if MS_SYNC is specified, but
		 * then it does the writeback synchronously. Luckily all common linux
		 * systems have sync_file_range().  This is preferable over
		 * FADV_DONTNEED because it doesn't flush out clean data.
		 *
		 * We map the file (mmap()), tell the kernel to sync back the contents
		 * (msync()), and then remove the mapping again (munmap()).
		 */

		/* mmap() needs actual length if we want to map whole file */
		if (offset == 0 && nbytes == 0)
		{
			nbytes = lseek(fd, 0, SEEK_END);
			if (nbytes < 0)
			{
				ereport(WARNING,
						(errcode_for_file_access(),
						 errmsg("could not determine dirty data size: %m")));
				return;
			}
		}

		/*
		 * Some platforms reject partial-page mmap() attempts.  To deal with
		 * that, just truncate the request to a page boundary.  If any extra
		 * bytes don't get flushed, well, it's only a hint anyway.
		 */

		/* fetch pagesize only once */
		if (pagesize == 0)
			pagesize = sysconf(_SC_PAGESIZE);

		/* align length to pagesize, dropping any fractional page */
		if (pagesize > 0)
			nbytes = (nbytes / pagesize) * pagesize;

		/* fractional-page request is a no-op */
		if (nbytes <= 0)
			return;

		/*
		 * mmap could well fail, particularly on 32-bit platforms where there
		 * may simply not be enough address space.  If so, silently fall
		 * through to the next implementation.  Requests larger than
		 * SSIZE_MAX are treated the same way, without attempting mmap().
		 */
		if (nbytes <= (off_t) SSIZE_MAX)
			p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
		else
			p = MAP_FAILED;

		if (p != MAP_FAILED)
		{
			int			rc;

			rc = msync(p, (size_t) nbytes, MS_ASYNC);
			if (rc != 0)
			{
				ereport(data_sync_elevel(WARNING),
						(errcode_for_file_access(),
						 errmsg("could not flush dirty data: %m")));
				/* NB: need to fall through to munmap()! */
			}

			rc = munmap(p, (size_t) nbytes);
			if (rc != 0)
			{
				/* FATAL error because mapping would remain */
				ereport(FATAL,
						(errcode_for_file_access(),
						 errmsg("could not munmap() while flushing data: %m")));
			}

			return;
		}
	}
#endif
#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
	{
		int			rc;

		/*
		 * Signal the kernel that the passed in range should not be cached
		 * anymore. This has the, desired, side effect of writing out dirty
		 * data, and the, undesired, side effect of likely discarding useful
		 * clean cached blocks.  For the latter reason this is the least
		 * preferable method.
		 */

		rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);

		/*
		 * NOTE(review): POSIX specifies that posix_fadvise() returns the
		 * error number itself rather than setting errno, so the %m below may
		 * not describe this failure -- confirm and consider assigning rc to
		 * errno before reporting.
		 */
		if (rc != 0)
		{
			/* don't error out, this is just a performance optimization */
			ereport(WARNING,
					(errcode_for_file_access(),
					 errmsg("could not flush dirty data: %m")));
		}

		return;
	}
#endif
}
630 
/*
 * pg_truncate --- truncate a file to a given length by name.
 *
 * path:	name of the file to truncate
 * length:	size in bytes the file should have afterwards
 *
 * Returns 0 on success, or -1 on failure with errno describing the problem.
 *
 * On Windows the file is opened and truncated through its descriptor;
 * elsewhere this is a direct call to truncate(2).  Both variants honor
 * "length".
 */
int
pg_truncate(const char *path, off_t length)
{
#ifdef WIN32
	int			save_errno;
	int			ret;
	int			fd;

	fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
	if (fd < 0)
		return -1;

	/* truncate to the requested length, not unconditionally to zero */
	ret = ftruncate(fd, length);

	/* keep ftruncate's errno from being clobbered by the close */
	save_errno = errno;
	CloseTransientFile(fd);
	errno = save_errno;

	return ret;
#else
	return truncate(path, length);
#endif
}
658 
659 /*
660  * fsync_fname -- fsync a file or directory, handling errors properly
661  *
662  * Try to fsync a file or directory. When doing the latter, ignore errors that
663  * indicate the OS just doesn't allow/require fsyncing directories.
664  */
665 void
fsync_fname(const char * fname,bool isdir)666 fsync_fname(const char *fname, bool isdir)
667 {
668 	fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
669 }
670 
671 /*
672  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
673  *
674  * This routine ensures that, after returning, the effect of renaming file
675  * persists in case of a crash. A crash while this routine is running will
676  * leave you with either the pre-existing or the moved file in place of the
677  * new file; no mixed state or truncated files are possible.
678  *
679  * It does so by using fsync on the old filename and the possibly existing
680  * target filename before the rename, and the target file and directory after.
681  *
682  * Note that rename() cannot be used across arbitrary directories, as they
683  * might not be on the same filesystem. Therefore this routine does not
684  * support renaming across directories.
685  *
686  * Log errors with the caller specified severity.
687  *
688  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
689  * valid upon return.
690  */
691 int
durable_rename(const char * oldfile,const char * newfile,int elevel)692 durable_rename(const char *oldfile, const char *newfile, int elevel)
693 {
694 	int			fd;
695 
696 	/*
697 	 * First fsync the old and target path (if it exists), to ensure that they
698 	 * are properly persistent on disk. Syncing the target file is not
699 	 * strictly necessary, but it makes it easier to reason about crashes;
700 	 * because it's then guaranteed that either source or target file exists
701 	 * after a crash.
702 	 */
703 	if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
704 		return -1;
705 
706 	fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
707 	if (fd < 0)
708 	{
709 		if (errno != ENOENT)
710 		{
711 			ereport(elevel,
712 					(errcode_for_file_access(),
713 					 errmsg("could not open file \"%s\": %m", newfile)));
714 			return -1;
715 		}
716 	}
717 	else
718 	{
719 		if (pg_fsync(fd) != 0)
720 		{
721 			int			save_errno;
722 
723 			/* close file upon error, might not be in transaction context */
724 			save_errno = errno;
725 			CloseTransientFile(fd);
726 			errno = save_errno;
727 
728 			ereport(elevel,
729 					(errcode_for_file_access(),
730 					 errmsg("could not fsync file \"%s\": %m", newfile)));
731 			return -1;
732 		}
733 
734 		if (CloseTransientFile(fd) != 0)
735 		{
736 			ereport(elevel,
737 					(errcode_for_file_access(),
738 					 errmsg("could not close file \"%s\": %m", newfile)));
739 			return -1;
740 		}
741 	}
742 
743 	/* Time to do the real deal... */
744 	if (rename(oldfile, newfile) < 0)
745 	{
746 		ereport(elevel,
747 				(errcode_for_file_access(),
748 				 errmsg("could not rename file \"%s\" to \"%s\": %m",
749 						oldfile, newfile)));
750 		return -1;
751 	}
752 
753 	/*
754 	 * To guarantee renaming the file is persistent, fsync the file with its
755 	 * new name, and its containing directory.
756 	 */
757 	if (fsync_fname_ext(newfile, false, false, elevel) != 0)
758 		return -1;
759 
760 	if (fsync_parent_path(newfile, elevel) != 0)
761 		return -1;
762 
763 	return 0;
764 }
765 
/*
 * durable_unlink -- remove a file in a durable manner
 *
 * Once this returns successfully, the removal survives a crash; a crash
 * while it is running leaves the system in no mixed state.  This is done
 * by fsyncing the file's parent directory after the removal.
 *
 * Errors are logged at the caller-specified severity "elevel".
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  Note that errno is
 * not valid upon return.
 */
int
durable_unlink(const char *fname, int elevel)
{
	int			rc = unlink(fname);

	if (rc < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove file \"%s\": %m",
						fname)));
		return -1;
	}

	/* the removal becomes persistent once the parent directory is synced */
	rc = fsync_parent_path(fname, elevel);

	return (rc != 0) ? -1 : 0;
}
802 
/*
 * durable_rename_excl -- rename a file in a durable manner.
 *
 * Like durable_rename(), but tries (without guaranteeing it) not to
 * overwrite an existing target file.
 *
 * A crash at an unfortunate moment can leave two links to the target file.
 *
 * Errors are logged at the caller-specified severity "elevel".
 *
 * On Windows, using a hard link followed by unlink() causes concurrency
 * issues, while a simple rename() does not, so be careful when changing the
 * logic of this routine.
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  Note that errno is
 * not valid upon return.
 */
int
durable_rename_excl(const char *oldfile, const char *newfile, int elevel)
{
	/*
	 * Sync the source first so that, should we crash right after the
	 * rename/link, the file that ends up in place has valid contents.
	 */
	if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
		return -1;

#ifdef HAVE_WORKING_LINK
	/* create the new name as a hard link, then drop the old name */
	if (link(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not link file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}
	/* the result of removing the old name is not checked */
	unlink(oldfile);
#else
	/* no usable link(): fall back to a plain rename */
	if (rename(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}
#endif

	/*
	 * To survive an OS crash, both the new entry and then its parent
	 * directory have to be flushed.
	 */
	if (fsync_fname_ext(newfile, false, false, elevel) != 0 ||
		fsync_parent_path(newfile, elevel) != 0)
		return -1;

	return 0;
}
865 
866 /*
867  * InitFileAccess --- initialize this module during backend startup
868  *
869  * This is called during either normal or standalone backend start.
870  * It is *not* called in the postmaster.
871  */
872 void
InitFileAccess(void)873 InitFileAccess(void)
874 {
875 	Assert(SizeVfdCache == 0);	/* call me only once */
876 
877 	/* initialize cache header entry */
878 	VfdCache = (Vfd *) malloc(sizeof(Vfd));
879 	if (VfdCache == NULL)
880 		ereport(FATAL,
881 				(errcode(ERRCODE_OUT_OF_MEMORY),
882 				 errmsg("out of memory")));
883 
884 	MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
885 	VfdCache->fd = VFD_CLOSED;
886 
887 	SizeVfdCache = 1;
888 
889 	/* register proc-exit hook to ensure temp files are dropped at exit */
890 	on_proc_exit(AtProcExit_Files, 0);
891 }
892 
893 /*
894  * count_usable_fds --- count how many FDs the system will let us open,
895  *		and estimate how many are already open.
896  *
897  * We stop counting if usable_fds reaches max_to_probe.  Note: a small
898  * value of max_to_probe might result in an underestimate of already_open;
899  * we must fill in any "gaps" in the set of used FDs before the calculation
900  * of already_open will give the right answer.  In practice, max_to_probe
901  * of a couple of dozen should be enough to ensure good results.
902  *
903  * We assume stderr (FD 2) is available for dup'ing.  While the calling
904  * script could theoretically close that, it would be a really bad idea,
905  * since then one risks loss of error messages from, e.g., libc.
906  */
907 static void
count_usable_fds(int max_to_probe,int * usable_fds,int * already_open)908 count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
909 {
910 	int		   *fd;
911 	int			size;
912 	int			used = 0;
913 	int			highestfd = 0;
914 	int			j;
915 
916 #ifdef HAVE_GETRLIMIT
917 	struct rlimit rlim;
918 	int			getrlimit_status;
919 #endif
920 
921 	size = 1024;
922 	fd = (int *) palloc(size * sizeof(int));
923 
924 #ifdef HAVE_GETRLIMIT
925 #ifdef RLIMIT_NOFILE			/* most platforms use RLIMIT_NOFILE */
926 	getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
927 #else							/* but BSD doesn't ... */
928 	getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
929 #endif							/* RLIMIT_NOFILE */
930 	if (getrlimit_status != 0)
931 		ereport(WARNING, (errmsg("getrlimit failed: %m")));
932 #endif							/* HAVE_GETRLIMIT */
933 
934 	/* dup until failure or probe limit reached */
935 	for (;;)
936 	{
937 		int			thisfd;
938 
939 #ifdef HAVE_GETRLIMIT
940 
941 		/*
942 		 * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
943 		 * some platforms
944 		 */
945 		if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
946 			break;
947 #endif
948 
949 		thisfd = dup(2);
950 		if (thisfd < 0)
951 		{
952 			/* Expect EMFILE or ENFILE, else it's fishy */
953 			if (errno != EMFILE && errno != ENFILE)
954 				elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
955 			break;
956 		}
957 
958 		if (used >= size)
959 		{
960 			size *= 2;
961 			fd = (int *) repalloc(fd, size * sizeof(int));
962 		}
963 		fd[used++] = thisfd;
964 
965 		if (highestfd < thisfd)
966 			highestfd = thisfd;
967 
968 		if (used >= max_to_probe)
969 			break;
970 	}
971 
972 	/* release the files we opened */
973 	for (j = 0; j < used; j++)
974 		close(fd[j]);
975 
976 	pfree(fd);
977 
978 	/*
979 	 * Return results.  usable_fds is just the number of successful dups. We
980 	 * assume that the system limit is highestfd+1 (remember 0 is a legal FD
981 	 * number) and so already_open is highestfd+1 - usable_fds.
982 	 */
983 	*usable_fds = used;
984 	*already_open = highestfd + 1 - used;
985 }
986 
987 /*
988  * set_max_safe_fds
989  *		Determine number of file descriptors that fd.c is allowed to use
990  */
991 void
set_max_safe_fds(void)992 set_max_safe_fds(void)
993 {
994 	int			usable_fds;
995 	int			already_open;
996 
997 	/*----------
998 	 * We want to set max_safe_fds to
999 	 *			MIN(usable_fds, max_files_per_process - already_open)
1000 	 * less the slop factor for files that are opened without consulting
1001 	 * fd.c.  This ensures that we won't exceed either max_files_per_process
1002 	 * or the experimentally-determined EMFILE limit.
1003 	 *----------
1004 	 */
1005 	count_usable_fds(max_files_per_process,
1006 					 &usable_fds, &already_open);
1007 
1008 	max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
1009 
1010 	/*
1011 	 * Take off the FDs reserved for system() etc.
1012 	 */
1013 	max_safe_fds -= NUM_RESERVED_FDS;
1014 
1015 	/*
1016 	 * Make sure we still have enough to get by.
1017 	 */
1018 	if (max_safe_fds < FD_MINFREE)
1019 		ereport(FATAL,
1020 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1021 				 errmsg("insufficient file descriptors available to start server process"),
1022 				 errdetail("System allows %d, we need at least %d.",
1023 						   max_safe_fds + NUM_RESERVED_FDS,
1024 						   FD_MINFREE + NUM_RESERVED_FDS)));
1025 
1026 	elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
1027 		 max_safe_fds, usable_fds, already_open);
1028 }
1029 
1030 /*
1031  * Open a file with BasicOpenFilePerm() and pass default file mode for the
1032  * fileMode parameter.
1033  */
1034 int
BasicOpenFile(const char * fileName,int fileFlags)1035 BasicOpenFile(const char *fileName, int fileFlags)
1036 {
1037 	return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1038 }
1039 
1040 /*
1041  * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
1042  *
1043  * This is exported for use by places that really want a plain kernel FD,
1044  * but need to be proof against running out of FDs.  Once an FD has been
1045  * successfully returned, it is the caller's responsibility to ensure that
1046  * it will not be leaked on ereport()!	Most users should *not* call this
1047  * routine directly, but instead use the VFD abstraction level, which
1048  * provides protection against descriptor leaks as well as management of
1049  * files that need to be open for more than a short period of time.
1050  *
1051  * Ideally this should be the *only* direct call of open() in the backend.
1052  * In practice, the postmaster calls open() directly, and there are some
1053  * direct open() calls done early in backend startup.  Those are OK since
1054  * this module wouldn't have any open files to close at that point anyway.
1055  */
1056 int
BasicOpenFilePerm(const char * fileName,int fileFlags,mode_t fileMode)1057 BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1058 {
1059 	int			fd;
1060 
1061 tryAgain:
1062 	fd = open(fileName, fileFlags, fileMode);
1063 
1064 	if (fd >= 0)
1065 		return fd;				/* success! */
1066 
1067 	if (errno == EMFILE || errno == ENFILE)
1068 	{
1069 		int			save_errno = errno;
1070 
1071 		ereport(LOG,
1072 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1073 				 errmsg("out of file descriptors: %m; release and retry")));
1074 		errno = 0;
1075 		if (ReleaseLruFile())
1076 			goto tryAgain;
1077 		errno = save_errno;
1078 	}
1079 
1080 	return -1;					/* failure */
1081 }
1082 
1083 /*
1084  * AcquireExternalFD - attempt to reserve an external file descriptor
1085  *
1086  * This should be used by callers that need to hold a file descriptor open
1087  * over more than a short interval, but cannot use any of the other facilities
1088  * provided by this module.
1089  *
1090  * The difference between this and the underlying ReserveExternalFD function
1091  * is that this will report failure (by setting errno and returning false)
1092  * if "too many" external FDs are already reserved.  This should be used in
1093  * any code where the total number of FDs to be reserved is not predictable
1094  * and small.
1095  */
1096 bool
AcquireExternalFD(void)1097 AcquireExternalFD(void)
1098 {
1099 	/*
1100 	 * We don't want more than max_safe_fds / 3 FDs to be consumed for
1101 	 * "external" FDs.
1102 	 */
1103 	if (numExternalFDs < max_safe_fds / 3)
1104 	{
1105 		ReserveExternalFD();
1106 		return true;
1107 	}
1108 	errno = EMFILE;
1109 	return false;
1110 }
1111 
1112 /*
1113  * ReserveExternalFD - report external consumption of a file descriptor
1114  *
1115  * This should be used by callers that need to hold a file descriptor open
1116  * over more than a short interval, but cannot use any of the other facilities
1117  * provided by this module.  This just tracks the use of the FD and closes
1118  * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
1119  *
1120  * Call this directly only in code where failure to reserve the FD would be
1121  * fatal; for example, the WAL-writing code does so, since the alternative is
1122  * session failure.  Also, it's very unwise to do so in code that could
1123  * consume more than one FD per process.
1124  *
1125  * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
1126  * available, it doesn't matter too much whether this is called before or
1127  * after actually opening the FD; but doing so beforehand reduces the risk of
1128  * an EMFILE failure if not everybody played nice.  In any case, it's solely
1129  * caller's responsibility to keep the external-FD count in sync with reality.
1130  */
1131 void
ReserveExternalFD(void)1132 ReserveExternalFD(void)
1133 {
1134 	/*
1135 	 * Release VFDs if needed to stay safe.  Because we do this before
1136 	 * incrementing numExternalFDs, the final state will be as desired, i.e.,
1137 	 * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
1138 	 */
1139 	ReleaseLruFiles();
1140 
1141 	numExternalFDs++;
1142 }
1143 
1144 /*
1145  * ReleaseExternalFD - report release of an external file descriptor
1146  *
1147  * This is guaranteed not to change errno, so it can be used in failure paths.
1148  */
1149 void
ReleaseExternalFD(void)1150 ReleaseExternalFD(void)
1151 {
1152 	Assert(numExternalFDs > 0);
1153 	numExternalFDs--;
1154 }
1155 
1156 
#if defined(FDDEBUG)

/*
 * _dump_lru --- debugging aid: log the contents of the LRU ring
 *
 * Emits one LOG line listing the VFD numbers found by following the
 * lruLessRecently links from VfdCache[0], ending with the 0 that closes
 * the ring.  The line is built in a fixed 2048-byte buffer, so a very long
 * ring is silently truncated by snprintf.
 */
static void
_dump_lru(void)
{
	char		buf[2048];
	int			cur = VfdCache[0].lruLessRecently;
	size_t		len;

	snprintf(buf, sizeof(buf), "LRU: MOST %d ", cur);

	while (cur != 0)
	{
		cur = VfdCache[cur].lruLessRecently;

		/* append at the current end of the string */
		len = strlen(buf);
		snprintf(buf + len, sizeof(buf) - len, "%d ", cur);
	}

	len = strlen(buf);
	snprintf(buf + len, sizeof(buf) - len, "LEAST");

	elog(LOG, "%s", buf);
}
#endif							/* FDDEBUG */
1177 
1178 static void
Delete(File file)1179 Delete(File file)
1180 {
1181 	Vfd		   *vfdP;
1182 
1183 	Assert(file != 0);
1184 
1185 	DO_DB(elog(LOG, "Delete %d (%s)",
1186 			   file, VfdCache[file].fileName));
1187 	DO_DB(_dump_lru());
1188 
1189 	vfdP = &VfdCache[file];
1190 
1191 	VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1192 	VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1193 
1194 	DO_DB(_dump_lru());
1195 }
1196 
1197 static void
LruDelete(File file)1198 LruDelete(File file)
1199 {
1200 	Vfd		   *vfdP;
1201 
1202 	Assert(file != 0);
1203 
1204 	DO_DB(elog(LOG, "LruDelete %d (%s)",
1205 			   file, VfdCache[file].fileName));
1206 
1207 	vfdP = &VfdCache[file];
1208 
1209 	/*
1210 	 * Close the file.  We aren't expecting this to fail; if it does, better
1211 	 * to leak the FD than to mess up our internal state.
1212 	 */
1213 	if (close(vfdP->fd) != 0)
1214 		elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1215 			 "could not close file \"%s\": %m", vfdP->fileName);
1216 	vfdP->fd = VFD_CLOSED;
1217 	--nfile;
1218 
1219 	/* delete the vfd record from the LRU ring */
1220 	Delete(file);
1221 }
1222 
1223 static void
Insert(File file)1224 Insert(File file)
1225 {
1226 	Vfd		   *vfdP;
1227 
1228 	Assert(file != 0);
1229 
1230 	DO_DB(elog(LOG, "Insert %d (%s)",
1231 			   file, VfdCache[file].fileName));
1232 	DO_DB(_dump_lru());
1233 
1234 	vfdP = &VfdCache[file];
1235 
1236 	vfdP->lruMoreRecently = 0;
1237 	vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1238 	VfdCache[0].lruLessRecently = file;
1239 	VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1240 
1241 	DO_DB(_dump_lru());
1242 }
1243 
1244 /* returns 0 on success, -1 on re-open failure (with errno set) */
1245 static int
LruInsert(File file)1246 LruInsert(File file)
1247 {
1248 	Vfd		   *vfdP;
1249 
1250 	Assert(file != 0);
1251 
1252 	DO_DB(elog(LOG, "LruInsert %d (%s)",
1253 			   file, VfdCache[file].fileName));
1254 
1255 	vfdP = &VfdCache[file];
1256 
1257 	if (FileIsNotOpen(file))
1258 	{
1259 		/* Close excess kernel FDs. */
1260 		ReleaseLruFiles();
1261 
1262 		/*
1263 		 * The open could still fail for lack of file descriptors, eg due to
1264 		 * overall system file table being full.  So, be prepared to release
1265 		 * another FD if necessary...
1266 		 */
1267 		vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1268 									 vfdP->fileMode);
1269 		if (vfdP->fd < 0)
1270 		{
1271 			DO_DB(elog(LOG, "re-open failed: %m"));
1272 			return -1;
1273 		}
1274 		else
1275 		{
1276 			++nfile;
1277 		}
1278 	}
1279 
1280 	/*
1281 	 * put it at the head of the Lru ring
1282 	 */
1283 
1284 	Insert(file);
1285 
1286 	return 0;
1287 }
1288 
1289 /*
1290  * Release one kernel FD by closing the least-recently-used VFD.
1291  */
1292 static bool
ReleaseLruFile(void)1293 ReleaseLruFile(void)
1294 {
1295 	DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1296 
1297 	if (nfile > 0)
1298 	{
1299 		/*
1300 		 * There are opened files and so there should be at least one used vfd
1301 		 * in the ring.
1302 		 */
1303 		Assert(VfdCache[0].lruMoreRecently != 0);
1304 		LruDelete(VfdCache[0].lruMoreRecently);
1305 		return true;			/* freed a file */
1306 	}
1307 	return false;				/* no files available to free */
1308 }
1309 
1310 /*
1311  * Release kernel FDs as needed to get under the max_safe_fds limit.
1312  * After calling this, it's OK to try to open another file.
1313  */
1314 static void
ReleaseLruFiles(void)1315 ReleaseLruFiles(void)
1316 {
1317 	while (nfile + numAllocatedDescs + numExternalFDs >= max_safe_fds)
1318 	{
1319 		if (!ReleaseLruFile())
1320 			break;
1321 	}
1322 }
1323 
1324 static File
AllocateVfd(void)1325 AllocateVfd(void)
1326 {
1327 	Index		i;
1328 	File		file;
1329 
1330 	DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1331 
1332 	Assert(SizeVfdCache > 0);	/* InitFileAccess not called? */
1333 
1334 	if (VfdCache[0].nextFree == 0)
1335 	{
1336 		/*
1337 		 * The free list is empty so it is time to increase the size of the
1338 		 * array.  We choose to double it each time this happens. However,
1339 		 * there's not much point in starting *real* small.
1340 		 */
1341 		Size		newCacheSize = SizeVfdCache * 2;
1342 		Vfd		   *newVfdCache;
1343 
1344 		if (newCacheSize < 32)
1345 			newCacheSize = 32;
1346 
1347 		/*
1348 		 * Be careful not to clobber VfdCache ptr if realloc fails.
1349 		 */
1350 		newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1351 		if (newVfdCache == NULL)
1352 			ereport(ERROR,
1353 					(errcode(ERRCODE_OUT_OF_MEMORY),
1354 					 errmsg("out of memory")));
1355 		VfdCache = newVfdCache;
1356 
1357 		/*
1358 		 * Initialize the new entries and link them into the free list.
1359 		 */
1360 		for (i = SizeVfdCache; i < newCacheSize; i++)
1361 		{
1362 			MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1363 			VfdCache[i].nextFree = i + 1;
1364 			VfdCache[i].fd = VFD_CLOSED;
1365 		}
1366 		VfdCache[newCacheSize - 1].nextFree = 0;
1367 		VfdCache[0].nextFree = SizeVfdCache;
1368 
1369 		/*
1370 		 * Record the new size
1371 		 */
1372 		SizeVfdCache = newCacheSize;
1373 	}
1374 
1375 	file = VfdCache[0].nextFree;
1376 
1377 	VfdCache[0].nextFree = VfdCache[file].nextFree;
1378 
1379 	return file;
1380 }
1381 
1382 static void
FreeVfd(File file)1383 FreeVfd(File file)
1384 {
1385 	Vfd		   *vfdP = &VfdCache[file];
1386 
1387 	DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1388 			   file, vfdP->fileName ? vfdP->fileName : ""));
1389 
1390 	if (vfdP->fileName != NULL)
1391 	{
1392 		free(vfdP->fileName);
1393 		vfdP->fileName = NULL;
1394 	}
1395 	vfdP->fdstate = 0x0;
1396 
1397 	vfdP->nextFree = VfdCache[0].nextFree;
1398 	VfdCache[0].nextFree = file;
1399 }
1400 
1401 /* returns 0 on success, -1 on re-open failure (with errno set) */
1402 static int
FileAccess(File file)1403 FileAccess(File file)
1404 {
1405 	int			returnValue;
1406 
1407 	DO_DB(elog(LOG, "FileAccess %d (%s)",
1408 			   file, VfdCache[file].fileName));
1409 
1410 	/*
1411 	 * Is the file open?  If not, open it and put it at the head of the LRU
1412 	 * ring (possibly closing the least recently used file to get an FD).
1413 	 */
1414 
1415 	if (FileIsNotOpen(file))
1416 	{
1417 		returnValue = LruInsert(file);
1418 		if (returnValue != 0)
1419 			return returnValue;
1420 	}
1421 	else if (VfdCache[0].lruLessRecently != file)
1422 	{
1423 		/*
1424 		 * We now know that the file is open and that it is not the last one
1425 		 * accessed, so we need to move it to the head of the Lru ring.
1426 		 */
1427 
1428 		Delete(file);
1429 		Insert(file);
1430 	}
1431 
1432 	return 0;
1433 }
1434 
1435 /*
1436  * Called whenever a temporary file is deleted to report its size.
1437  */
1438 static void
ReportTemporaryFileUsage(const char * path,off_t size)1439 ReportTemporaryFileUsage(const char *path, off_t size)
1440 {
1441 	pgstat_report_tempfile(size);
1442 
1443 	if (log_temp_files >= 0)
1444 	{
1445 		if ((size / 1024) >= log_temp_files)
1446 			ereport(LOG,
1447 					(errmsg("temporary file: path \"%s\", size %lu",
1448 							path, (unsigned long) size)));
1449 	}
1450 }
1451 
1452 /*
1453  * Called to register a temporary file for automatic close.
1454  * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
1455  * before the file was opened.
1456  */
1457 static void
RegisterTemporaryFile(File file)1458 RegisterTemporaryFile(File file)
1459 {
1460 	ResourceOwnerRememberFile(CurrentResourceOwner, file);
1461 	VfdCache[file].resowner = CurrentResourceOwner;
1462 
1463 	/* Backup mechanism for closing at end of xact. */
1464 	VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
1465 	have_xact_temporary_files = true;
1466 }
1467 
/*
 *	Called when we get a shared invalidation message on some relation.
 *
 *	Closes the VFD's kernel FD if it is currently open; the VFD itself
 *	remains valid.  Compiled only when NOT_USED is defined.
 */
#ifdef NOT_USED
void
FileInvalidate(File file)
{
	Assert(FileIsValid(file));

	if (FileIsNotOpen(file))
		return;					/* no kernel FD to close */

	LruDelete(file);
}
#endif
1480 
1481 /*
1482  * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1483  * fileMode parameter.
1484  */
1485 File
PathNameOpenFile(const char * fileName,int fileFlags)1486 PathNameOpenFile(const char *fileName, int fileFlags)
1487 {
1488 	return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1489 }
1490 
1491 /*
1492  * open a file in an arbitrary directory
1493  *
1494  * NB: if the passed pathname is relative (which it usually is),
1495  * it will be interpreted relative to the process' working directory
1496  * (which should always be $PGDATA when this code is running).
1497  */
1498 File
PathNameOpenFilePerm(const char * fileName,int fileFlags,mode_t fileMode)1499 PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1500 {
1501 	char	   *fnamecopy;
1502 	File		file;
1503 	Vfd		   *vfdP;
1504 
1505 	DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1506 			   fileName, fileFlags, fileMode));
1507 
1508 	/*
1509 	 * We need a malloc'd copy of the file name; fail cleanly if no room.
1510 	 */
1511 	fnamecopy = strdup(fileName);
1512 	if (fnamecopy == NULL)
1513 		ereport(ERROR,
1514 				(errcode(ERRCODE_OUT_OF_MEMORY),
1515 				 errmsg("out of memory")));
1516 
1517 	file = AllocateVfd();
1518 	vfdP = &VfdCache[file];
1519 
1520 	/* Close excess kernel FDs. */
1521 	ReleaseLruFiles();
1522 
1523 	vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1524 
1525 	if (vfdP->fd < 0)
1526 	{
1527 		int			save_errno = errno;
1528 
1529 		FreeVfd(file);
1530 		free(fnamecopy);
1531 		errno = save_errno;
1532 		return -1;
1533 	}
1534 	++nfile;
1535 	DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1536 			   vfdP->fd));
1537 
1538 	vfdP->fileName = fnamecopy;
1539 	/* Saved flags are adjusted to be OK for re-opening file */
1540 	vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1541 	vfdP->fileMode = fileMode;
1542 	vfdP->fileSize = 0;
1543 	vfdP->fdstate = 0x0;
1544 	vfdP->resowner = NULL;
1545 
1546 	Insert(file);
1547 
1548 	return file;
1549 }
1550 
1551 /*
1552  * Create directory 'directory'.  If necessary, create 'basedir', which must
1553  * be the directory above it.  This is designed for creating the top-level
1554  * temporary directory on demand before creating a directory underneath it.
1555  * Do nothing if the directory already exists.
1556  *
1557  * Directories created within the top-level temporary directory should begin
1558  * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1559  * deleted at startup by RemovePgTempFiles().  Further subdirectories below
1560  * that do not need any particular prefix.
1561 */
1562 void
PathNameCreateTemporaryDir(const char * basedir,const char * directory)1563 PathNameCreateTemporaryDir(const char *basedir, const char *directory)
1564 {
1565 	if (MakePGDirectory(directory) < 0)
1566 	{
1567 		if (errno == EEXIST)
1568 			return;
1569 
1570 		/*
1571 		 * Failed.  Try to create basedir first in case it's missing. Tolerate
1572 		 * EEXIST to close a race against another process following the same
1573 		 * algorithm.
1574 		 */
1575 		if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1576 			ereport(ERROR,
1577 					(errcode_for_file_access(),
1578 					 errmsg("cannot create temporary directory \"%s\": %m",
1579 							basedir)));
1580 
1581 		/* Try again. */
1582 		if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1583 			ereport(ERROR,
1584 					(errcode_for_file_access(),
1585 					 errmsg("cannot create temporary subdirectory \"%s\": %m",
1586 							directory)));
1587 	}
1588 }
1589 
1590 /*
1591  * Delete a directory and everything in it, if it exists.
1592  */
1593 void
PathNameDeleteTemporaryDir(const char * dirname)1594 PathNameDeleteTemporaryDir(const char *dirname)
1595 {
1596 	struct stat statbuf;
1597 
1598 	/* Silently ignore missing directory. */
1599 	if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1600 		return;
1601 
1602 	/*
1603 	 * Currently, walkdir doesn't offer a way for our passed in function to
1604 	 * maintain state.  Perhaps it should, so that we could tell the caller
1605 	 * whether this operation succeeded or failed.  Since this operation is
1606 	 * used in a cleanup path, we wouldn't actually behave differently: we'll
1607 	 * just log failures.
1608 	 */
1609 	walkdir(dirname, unlink_if_exists_fname, false, LOG);
1610 }
1611 
1612 /*
1613  * Open a temporary file that will disappear when we close it.
1614  *
1615  * This routine takes care of generating an appropriate tempfile name.
1616  * There's no need to pass in fileFlags or fileMode either, since only
1617  * one setting makes any sense for a temp file.
1618  *
1619  * Unless interXact is true, the file is remembered by CurrentResourceOwner
1620  * to ensure it's closed and deleted when it's no longer needed, typically at
1621  * the end-of-transaction. In most cases, you don't want temporary files to
1622  * outlive the transaction that created them, so this should be false -- but
1623  * if you need "somewhat" temporary storage, this might be useful. In either
1624  * case, the file is removed when the File is explicitly closed.
1625  */
1626 File
OpenTemporaryFile(bool interXact)1627 OpenTemporaryFile(bool interXact)
1628 {
1629 	File		file = 0;
1630 
1631 	/*
1632 	 * Make sure the current resource owner has space for this File before we
1633 	 * open it, if we'll be registering it below.
1634 	 */
1635 	if (!interXact)
1636 		ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1637 
1638 	/*
1639 	 * If some temp tablespace(s) have been given to us, try to use the next
1640 	 * one.  If a given tablespace can't be found, we silently fall back to
1641 	 * the database's default tablespace.
1642 	 *
1643 	 * BUT: if the temp file is slated to outlive the current transaction,
1644 	 * force it into the database's default tablespace, so that it will not
1645 	 * pose a threat to possible tablespace drop attempts.
1646 	 */
1647 	if (numTempTableSpaces > 0 && !interXact)
1648 	{
1649 		Oid			tblspcOid = GetNextTempTableSpace();
1650 
1651 		if (OidIsValid(tblspcOid))
1652 			file = OpenTemporaryFileInTablespace(tblspcOid, false);
1653 	}
1654 
1655 	/*
1656 	 * If not, or if tablespace is bad, create in database's default
1657 	 * tablespace.  MyDatabaseTableSpace should normally be set before we get
1658 	 * here, but just in case it isn't, fall back to pg_default tablespace.
1659 	 */
1660 	if (file <= 0)
1661 		file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
1662 											 MyDatabaseTableSpace :
1663 											 DEFAULTTABLESPACE_OID,
1664 											 true);
1665 
1666 	/* Mark it for deletion at close and temporary file size limit */
1667 	VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
1668 
1669 	/* Register it with the current resource owner */
1670 	if (!interXact)
1671 		RegisterTemporaryFile(file);
1672 
1673 	return file;
1674 }
1675 
1676 /*
1677  * Return the path of the temp directory in a given tablespace.
1678  */
1679 void
TempTablespacePath(char * path,Oid tablespace)1680 TempTablespacePath(char *path, Oid tablespace)
1681 {
1682 	/*
1683 	 * Identify the tempfile directory for this tablespace.
1684 	 *
1685 	 * If someone tries to specify pg_global, use pg_default instead.
1686 	 */
1687 	if (tablespace == InvalidOid ||
1688 		tablespace == DEFAULTTABLESPACE_OID ||
1689 		tablespace == GLOBALTABLESPACE_OID)
1690 		snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1691 	else
1692 	{
1693 		/* All other tablespaces are accessed via symlinks */
1694 		snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
1695 				 tablespace, TABLESPACE_VERSION_DIRECTORY,
1696 				 PG_TEMP_FILES_DIR);
1697 	}
1698 }
1699 
1700 /*
1701  * Open a temporary file in a specific tablespace.
1702  * Subroutine for OpenTemporaryFile, which see for details.
1703  */
1704 static File
OpenTemporaryFileInTablespace(Oid tblspcOid,bool rejectError)1705 OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1706 {
1707 	char		tempdirpath[MAXPGPATH];
1708 	char		tempfilepath[MAXPGPATH];
1709 	File		file;
1710 
1711 	TempTablespacePath(tempdirpath, tblspcOid);
1712 
1713 	/*
1714 	 * Generate a tempfile name that should be unique within the current
1715 	 * database instance.
1716 	 */
1717 	snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1718 			 tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1719 
1720 	/*
1721 	 * Open the file.  Note: we don't use O_EXCL, in case there is an orphaned
1722 	 * temp file that can be reused.
1723 	 */
1724 	file = PathNameOpenFile(tempfilepath,
1725 							O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1726 	if (file <= 0)
1727 	{
1728 		/*
1729 		 * We might need to create the tablespace's tempfile directory, if no
1730 		 * one has yet done so.
1731 		 *
1732 		 * Don't check for an error from MakePGDirectory; it could fail if
1733 		 * someone else just did the same thing.  If it doesn't work then
1734 		 * we'll bomb out on the second create attempt, instead.
1735 		 */
1736 		(void) MakePGDirectory(tempdirpath);
1737 
1738 		file = PathNameOpenFile(tempfilepath,
1739 								O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1740 		if (file <= 0 && rejectError)
1741 			elog(ERROR, "could not create temporary file \"%s\": %m",
1742 				 tempfilepath);
1743 	}
1744 
1745 	return file;
1746 }
1747 
1748 
1749 /*
1750  * Create a new file.  The directory containing it must already exist.  Files
1751  * created this way are subject to temp_file_limit and are automatically
1752  * closed at end of transaction, but are not automatically deleted on close
1753  * because they are intended to be shared between cooperating backends.
1754  *
1755  * If the file is inside the top-level temporary directory, its name should
1756  * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1757  * and deleted at startup by RemovePgTempFiles().  Alternatively, it can be
1758  * inside a directory created with PathNameCreateTemporaryDir(), in which case
1759  * the prefix isn't needed.
1760  */
1761 File
PathNameCreateTemporaryFile(const char * path,bool error_on_failure)1762 PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1763 {
1764 	File		file;
1765 
1766 	ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1767 
1768 	/*
1769 	 * Open the file.  Note: we don't use O_EXCL, in case there is an orphaned
1770 	 * temp file that can be reused.
1771 	 */
1772 	file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1773 	if (file <= 0)
1774 	{
1775 		if (error_on_failure)
1776 			ereport(ERROR,
1777 					(errcode_for_file_access(),
1778 					 errmsg("could not create temporary file \"%s\": %m",
1779 							path)));
1780 		else
1781 			return file;
1782 	}
1783 
1784 	/* Mark it for temp_file_limit accounting. */
1785 	VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
1786 
1787 	/* Register it for automatic close. */
1788 	RegisterTemporaryFile(file);
1789 
1790 	return file;
1791 }
1792 
1793 /*
1794  * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1795  * another backend.  Files opened this way don't count against the
1796  * temp_file_limit of the caller, are automatically closed at the end of the
1797  * transaction but are not deleted on close.
1798  */
1799 File
PathNameOpenTemporaryFile(const char * path,int mode)1800 PathNameOpenTemporaryFile(const char *path, int mode)
1801 {
1802 	File		file;
1803 
1804 	ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1805 
1806 	file = PathNameOpenFile(path, mode | PG_BINARY);
1807 
1808 	/* If no such file, then we don't raise an error. */
1809 	if (file <= 0 && errno != ENOENT)
1810 		ereport(ERROR,
1811 				(errcode_for_file_access(),
1812 				 errmsg("could not open temporary file \"%s\": %m",
1813 						path)));
1814 
1815 	if (file > 0)
1816 	{
1817 		/* Register it for automatic close. */
1818 		RegisterTemporaryFile(file);
1819 	}
1820 
1821 	return file;
1822 }
1823 
1824 /*
1825  * Delete a file by pathname.  Return true if the file existed, false if
1826  * didn't.
1827  */
1828 bool
PathNameDeleteTemporaryFile(const char * path,bool error_on_failure)1829 PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1830 {
1831 	struct stat filestats;
1832 	int			stat_errno;
1833 
1834 	/* Get the final size for pgstat reporting. */
1835 	if (stat(path, &filestats) != 0)
1836 		stat_errno = errno;
1837 	else
1838 		stat_errno = 0;
1839 
1840 	/*
1841 	 * Unlike FileClose's automatic file deletion code, we tolerate
1842 	 * non-existence to support BufFileDeleteShared which doesn't know how
1843 	 * many segments it has to delete until it runs out.
1844 	 */
1845 	if (stat_errno == ENOENT)
1846 		return false;
1847 
1848 	if (unlink(path) < 0)
1849 	{
1850 		if (errno != ENOENT)
1851 			ereport(error_on_failure ? ERROR : LOG,
1852 					(errcode_for_file_access(),
1853 					 errmsg("could not unlink temporary file \"%s\": %m",
1854 							path)));
1855 		return false;
1856 	}
1857 
1858 	if (stat_errno == 0)
1859 		ReportTemporaryFileUsage(path, filestats.st_size);
1860 	else
1861 	{
1862 		errno = stat_errno;
1863 		ereport(LOG,
1864 				(errcode_for_file_access(),
1865 				 errmsg("could not stat file \"%s\": %m", path)));
1866 	}
1867 
1868 	return true;
1869 }
1870 
/*
 * close a file when done with it
 *
 * Steps, in order:
 *	- if the VFD has an open kernel FD, close it, decrement nfile and take
 *	  the VFD out of the LRU ring;
 *	- if the VFD is flagged FD_TEMP_FILE_LIMIT, subtract its recorded size
 *	  from temporary_files_size;
 *	- if the VFD is flagged FD_DELETE_AT_CLOSE, unlink the file and report
 *	  its final size;
 *	- if the VFD has a resource owner, make that owner forget the file;
 *	- return the VFD slot to the free list.
 *
 * 'file' must satisfy FileIsValid().
 */
void
FileClose(File file)
{
	Vfd		   *vfdP;

	Assert(FileIsValid(file));

	DO_DB(elog(LOG, "FileClose: %d (%s)",
			   file, VfdCache[file].fileName));

	vfdP = &VfdCache[file];

	if (!FileIsNotOpen(file))
	{
		/* close the file */
		if (close(vfdP->fd) != 0)
		{
			/*
			 * We may need to panic on failure to close non-temporary files;
			 * see LruDelete.  Files flagged FD_TEMP_FILE_LIMIT always get
			 * plain LOG; for others the level comes from data_sync_elevel().
			 */
			elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
				 "could not close file \"%s\": %m", vfdP->fileName);
		}

		/* the bookkeeping is updated whether or not close() succeeded */
		--nfile;
		vfdP->fd = VFD_CLOSED;

		/* remove the file from the lru ring */
		Delete(file);
	}

	if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
	{
		/* Subtract its size from current usage (do first in case of error) */
		temporary_files_size -= vfdP->fileSize;
		vfdP->fileSize = 0;
	}

	/*
	 * Delete the file if it was temporary, and make a log entry if wanted
	 */
	if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
	{
		struct stat filestats;
		int			stat_errno;

		/*
		 * If we get an error, as could happen within the ereport/elog calls,
		 * we'll come right back here during transaction abort.  Reset the
		 * flag to ensure that we can't get into an infinite loop.  This code
		 * is arranged to ensure that the worst-case consequence is failing to
		 * emit log message(s), not failing to attempt the unlink.
		 */
		vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;


		/*
		 * first try the stat(); a failure is only remembered here and is
		 * reported after the unlink attempt
		 */
		if (stat(vfdP->fileName, &filestats))
			stat_errno = errno;
		else
			stat_errno = 0;

		/* in any case do the unlink */
		if (unlink(vfdP->fileName))
			ereport(LOG,
					(errcode_for_file_access(),
					 errmsg("could not delete file \"%s\": %m", vfdP->fileName)));

		/* and last report the stat results */
		if (stat_errno == 0)
			ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
		else
		{
			/* restore the stat() failure code so that %m describes it */
			errno = stat_errno;
			ereport(LOG,
					(errcode_for_file_access(),
					 errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
		}
	}

	/* Unregister it from the resource owner */
	if (vfdP->resowner)
		ResourceOwnerForgetFile(vfdP->resowner, file);

	/*
	 * Return the Vfd slot to the free list
	 */
	FreeVfd(file);
}
1964 
1965 /*
1966  * FilePrefetch - initiate asynchronous read of a given range of the file.
1967  *
1968  * Currently the only implementation of this function is using posix_fadvise
1969  * which is the simplest standardized interface that accomplishes this.
1970  * We could add an implementation using libaio in the future; but note that
1971  * this API is inappropriate for libaio, which wants to have a buffer provided
1972  * to read into.
1973  */
1974 int
FilePrefetch(File file,off_t offset,int amount,uint32 wait_event_info)1975 FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
1976 {
1977 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
1978 	int			returnCode;
1979 
1980 	Assert(FileIsValid(file));
1981 
1982 	DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
1983 			   file, VfdCache[file].fileName,
1984 			   (int64) offset, amount));
1985 
1986 	returnCode = FileAccess(file);
1987 	if (returnCode < 0)
1988 		return returnCode;
1989 
1990 	pgstat_report_wait_start(wait_event_info);
1991 	returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
1992 							   POSIX_FADV_WILLNEED);
1993 	pgstat_report_wait_end();
1994 
1995 	return returnCode;
1996 #else
1997 	Assert(FileIsValid(file));
1998 	return 0;
1999 #endif
2000 }
2001 
2002 void
FileWriteback(File file,off_t offset,off_t nbytes,uint32 wait_event_info)2003 FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
2004 {
2005 	int			returnCode;
2006 
2007 	Assert(FileIsValid(file));
2008 
2009 	DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2010 			   file, VfdCache[file].fileName,
2011 			   (int64) offset, (int64) nbytes));
2012 
2013 	if (nbytes <= 0)
2014 		return;
2015 
2016 	returnCode = FileAccess(file);
2017 	if (returnCode < 0)
2018 		return;
2019 
2020 	pgstat_report_wait_start(wait_event_info);
2021 	pg_flush_data(VfdCache[file].fd, offset, nbytes);
2022 	pgstat_report_wait_end();
2023 }
2024 
2025 int
FileRead(File file,char * buffer,int amount,off_t offset,uint32 wait_event_info)2026 FileRead(File file, char *buffer, int amount, off_t offset,
2027 		 uint32 wait_event_info)
2028 {
2029 	int			returnCode;
2030 	Vfd		   *vfdP;
2031 
2032 	Assert(FileIsValid(file));
2033 
2034 	DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
2035 			   file, VfdCache[file].fileName,
2036 			   (int64) offset,
2037 			   amount, buffer));
2038 
2039 	returnCode = FileAccess(file);
2040 	if (returnCode < 0)
2041 		return returnCode;
2042 
2043 	vfdP = &VfdCache[file];
2044 
2045 retry:
2046 	pgstat_report_wait_start(wait_event_info);
2047 	returnCode = pg_pread(vfdP->fd, buffer, amount, offset);
2048 	pgstat_report_wait_end();
2049 
2050 	if (returnCode < 0)
2051 	{
2052 		/*
2053 		 * Windows may run out of kernel buffers and return "Insufficient
2054 		 * system resources" error.  Wait a bit and retry to solve it.
2055 		 *
2056 		 * It is rumored that EINTR is also possible on some Unix filesystems,
2057 		 * in which case immediate retry is indicated.
2058 		 */
2059 #ifdef WIN32
2060 		DWORD		error = GetLastError();
2061 
2062 		switch (error)
2063 		{
2064 			case ERROR_NO_SYSTEM_RESOURCES:
2065 				pg_usleep(1000L);
2066 				errno = EINTR;
2067 				break;
2068 			default:
2069 				_dosmaperr(error);
2070 				break;
2071 		}
2072 #endif
2073 		/* OK to retry if interrupted */
2074 		if (errno == EINTR)
2075 			goto retry;
2076 	}
2077 
2078 	return returnCode;
2079 }
2080 
2081 int
FileWrite(File file,char * buffer,int amount,off_t offset,uint32 wait_event_info)2082 FileWrite(File file, char *buffer, int amount, off_t offset,
2083 		  uint32 wait_event_info)
2084 {
2085 	int			returnCode;
2086 	Vfd		   *vfdP;
2087 
2088 	Assert(FileIsValid(file));
2089 
2090 	DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
2091 			   file, VfdCache[file].fileName,
2092 			   (int64) offset,
2093 			   amount, buffer));
2094 
2095 	returnCode = FileAccess(file);
2096 	if (returnCode < 0)
2097 		return returnCode;
2098 
2099 	vfdP = &VfdCache[file];
2100 
2101 	/*
2102 	 * If enforcing temp_file_limit and it's a temp file, check to see if the
2103 	 * write would overrun temp_file_limit, and throw error if so.  Note: it's
2104 	 * really a modularity violation to throw error here; we should set errno
2105 	 * and return -1.  However, there's no way to report a suitable error
2106 	 * message if we do that.  All current callers would just throw error
2107 	 * immediately anyway, so this is safe at present.
2108 	 */
2109 	if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2110 	{
2111 		off_t		past_write = offset + amount;
2112 
2113 		if (past_write > vfdP->fileSize)
2114 		{
2115 			uint64		newTotal = temporary_files_size;
2116 
2117 			newTotal += past_write - vfdP->fileSize;
2118 			if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2119 				ereport(ERROR,
2120 						(errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
2121 						 errmsg("temporary file size exceeds temp_file_limit (%dkB)",
2122 								temp_file_limit)));
2123 		}
2124 	}
2125 
2126 retry:
2127 	errno = 0;
2128 	pgstat_report_wait_start(wait_event_info);
2129 	returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset);
2130 	pgstat_report_wait_end();
2131 
2132 	/* if write didn't set errno, assume problem is no disk space */
2133 	if (returnCode != amount && errno == 0)
2134 		errno = ENOSPC;
2135 
2136 	if (returnCode >= 0)
2137 	{
2138 		/*
2139 		 * Maintain fileSize and temporary_files_size if it's a temp file.
2140 		 */
2141 		if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2142 		{
2143 			off_t		past_write = offset + amount;
2144 
2145 			if (past_write > vfdP->fileSize)
2146 			{
2147 				temporary_files_size += past_write - vfdP->fileSize;
2148 				vfdP->fileSize = past_write;
2149 			}
2150 		}
2151 	}
2152 	else
2153 	{
2154 		/*
2155 		 * See comments in FileRead()
2156 		 */
2157 #ifdef WIN32
2158 		DWORD		error = GetLastError();
2159 
2160 		switch (error)
2161 		{
2162 			case ERROR_NO_SYSTEM_RESOURCES:
2163 				pg_usleep(1000L);
2164 				errno = EINTR;
2165 				break;
2166 			default:
2167 				_dosmaperr(error);
2168 				break;
2169 		}
2170 #endif
2171 		/* OK to retry if interrupted */
2172 		if (errno == EINTR)
2173 			goto retry;
2174 	}
2175 
2176 	return returnCode;
2177 }
2178 
2179 int
FileSync(File file,uint32 wait_event_info)2180 FileSync(File file, uint32 wait_event_info)
2181 {
2182 	int			returnCode;
2183 
2184 	Assert(FileIsValid(file));
2185 
2186 	DO_DB(elog(LOG, "FileSync: %d (%s)",
2187 			   file, VfdCache[file].fileName));
2188 
2189 	returnCode = FileAccess(file);
2190 	if (returnCode < 0)
2191 		return returnCode;
2192 
2193 	pgstat_report_wait_start(wait_event_info);
2194 	returnCode = pg_fsync(VfdCache[file].fd);
2195 	pgstat_report_wait_end();
2196 
2197 	return returnCode;
2198 }
2199 
2200 off_t
FileSize(File file)2201 FileSize(File file)
2202 {
2203 	Assert(FileIsValid(file));
2204 
2205 	DO_DB(elog(LOG, "FileSize %d (%s)",
2206 			   file, VfdCache[file].fileName));
2207 
2208 	if (FileIsNotOpen(file))
2209 	{
2210 		if (FileAccess(file) < 0)
2211 			return (off_t) -1;
2212 	}
2213 
2214 	return lseek(VfdCache[file].fd, 0, SEEK_END);
2215 }
2216 
2217 int
FileTruncate(File file,off_t offset,uint32 wait_event_info)2218 FileTruncate(File file, off_t offset, uint32 wait_event_info)
2219 {
2220 	int			returnCode;
2221 
2222 	Assert(FileIsValid(file));
2223 
2224 	DO_DB(elog(LOG, "FileTruncate %d (%s)",
2225 			   file, VfdCache[file].fileName));
2226 
2227 	returnCode = FileAccess(file);
2228 	if (returnCode < 0)
2229 		return returnCode;
2230 
2231 	pgstat_report_wait_start(wait_event_info);
2232 	returnCode = ftruncate(VfdCache[file].fd, offset);
2233 	pgstat_report_wait_end();
2234 
2235 	if (returnCode == 0 && VfdCache[file].fileSize > offset)
2236 	{
2237 		/* adjust our state for truncation of a temp file */
2238 		Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2239 		temporary_files_size -= VfdCache[file].fileSize - offset;
2240 		VfdCache[file].fileSize = offset;
2241 	}
2242 
2243 	return returnCode;
2244 }
2245 
2246 /*
2247  * Return the pathname associated with an open file.
2248  *
2249  * The returned string points to an internal buffer, which is valid until
2250  * the file is closed.
2251  */
2252 char *
FilePathName(File file)2253 FilePathName(File file)
2254 {
2255 	Assert(FileIsValid(file));
2256 
2257 	return VfdCache[file].fileName;
2258 }
2259 
2260 /*
2261  * Return the raw file descriptor of an opened file.
2262  *
2263  * The returned file descriptor will be valid until the file is closed, but
2264  * there are a lot of things that can make that happen.  So the caller should
2265  * be careful not to do much of anything else before it finishes using the
2266  * returned file descriptor.
2267  */
2268 int
FileGetRawDesc(File file)2269 FileGetRawDesc(File file)
2270 {
2271 	Assert(FileIsValid(file));
2272 	return VfdCache[file].fd;
2273 }
2274 
2275 /*
2276  * FileGetRawFlags - returns the file flags on open(2)
2277  */
2278 int
FileGetRawFlags(File file)2279 FileGetRawFlags(File file)
2280 {
2281 	Assert(FileIsValid(file));
2282 	return VfdCache[file].fileFlags;
2283 }
2284 
2285 /*
2286  * FileGetRawMode - returns the mode bitmask passed to open(2)
2287  */
2288 mode_t
FileGetRawMode(File file)2289 FileGetRawMode(File file)
2290 {
2291 	Assert(FileIsValid(file));
2292 	return VfdCache[file].fileMode;
2293 }
2294 
2295 /*
2296  * Make room for another allocatedDescs[] array entry if needed and possible.
2297  * Returns true if an array element is available.
2298  */
2299 static bool
reserveAllocatedDesc(void)2300 reserveAllocatedDesc(void)
2301 {
2302 	AllocateDesc *newDescs;
2303 	int			newMax;
2304 
2305 	/* Quick out if array already has a free slot. */
2306 	if (numAllocatedDescs < maxAllocatedDescs)
2307 		return true;
2308 
2309 	/*
2310 	 * If the array hasn't yet been created in the current process, initialize
2311 	 * it with FD_MINFREE / 3 elements.  In many scenarios this is as many as
2312 	 * we will ever need, anyway.  We don't want to look at max_safe_fds
2313 	 * immediately because set_max_safe_fds() may not have run yet.
2314 	 */
2315 	if (allocatedDescs == NULL)
2316 	{
2317 		newMax = FD_MINFREE / 3;
2318 		newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2319 		/* Out of memory already?  Treat as fatal error. */
2320 		if (newDescs == NULL)
2321 			ereport(ERROR,
2322 					(errcode(ERRCODE_OUT_OF_MEMORY),
2323 					 errmsg("out of memory")));
2324 		allocatedDescs = newDescs;
2325 		maxAllocatedDescs = newMax;
2326 		return true;
2327 	}
2328 
2329 	/*
2330 	 * Consider enlarging the array beyond the initial allocation used above.
2331 	 * By the time this happens, max_safe_fds should be known accurately.
2332 	 *
2333 	 * We mustn't let allocated descriptors hog all the available FDs, and in
2334 	 * practice we'd better leave a reasonable number of FDs for VFD use.  So
2335 	 * set the maximum to max_safe_fds / 3.  (This should certainly be at
2336 	 * least as large as the initial size, FD_MINFREE / 3, so we aren't
2337 	 * tightening the restriction here.)  Recall that "external" FDs are
2338 	 * allowed to consume another third of max_safe_fds.
2339 	 */
2340 	newMax = max_safe_fds / 3;
2341 	if (newMax > maxAllocatedDescs)
2342 	{
2343 		newDescs = (AllocateDesc *) realloc(allocatedDescs,
2344 											newMax * sizeof(AllocateDesc));
2345 		/* Treat out-of-memory as a non-fatal error. */
2346 		if (newDescs == NULL)
2347 			return false;
2348 		allocatedDescs = newDescs;
2349 		maxAllocatedDescs = newMax;
2350 		return true;
2351 	}
2352 
2353 	/* Can't enlarge allocatedDescs[] any more. */
2354 	return false;
2355 }
2356 
2357 /*
2358  * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2359  * rather than plain fopen().  This lets fd.c deal with freeing FDs if
2360  * necessary to open the file.  When done, call FreeFile rather than fclose.
2361  *
2362  * Note that files that will be open for any significant length of time
2363  * should NOT be handled this way, since they cannot share kernel file
2364  * descriptors with other files; there is grave risk of running out of FDs
2365  * if anyone locks down too many FDs.  Most callers of this routine are
2366  * simply reading a config file that they will read and close immediately.
2367  *
2368  * fd.c will automatically close all files opened with AllocateFile at
2369  * transaction commit or abort; this prevents FD leakage if a routine
2370  * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2371  *
2372  * Ideally this should be the *only* direct call of fopen() in the backend.
2373  */
2374 FILE *
AllocateFile(const char * name,const char * mode)2375 AllocateFile(const char *name, const char *mode)
2376 {
2377 	FILE	   *file;
2378 
2379 	DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2380 			   numAllocatedDescs, name));
2381 
2382 	/* Can we allocate another non-virtual FD? */
2383 	if (!reserveAllocatedDesc())
2384 		ereport(ERROR,
2385 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2386 				 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2387 						maxAllocatedDescs, name)));
2388 
2389 	/* Close excess kernel FDs. */
2390 	ReleaseLruFiles();
2391 
2392 TryAgain:
2393 	if ((file = fopen(name, mode)) != NULL)
2394 	{
2395 		AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2396 
2397 		desc->kind = AllocateDescFile;
2398 		desc->desc.file = file;
2399 		desc->create_subid = GetCurrentSubTransactionId();
2400 		numAllocatedDescs++;
2401 		return desc->desc.file;
2402 	}
2403 
2404 	if (errno == EMFILE || errno == ENFILE)
2405 	{
2406 		int			save_errno = errno;
2407 
2408 		ereport(LOG,
2409 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2410 				 errmsg("out of file descriptors: %m; release and retry")));
2411 		errno = 0;
2412 		if (ReleaseLruFile())
2413 			goto TryAgain;
2414 		errno = save_errno;
2415 	}
2416 
2417 	return NULL;
2418 }
2419 
2420 /*
2421  * Open a file with OpenTransientFilePerm() and pass default file mode for
2422  * the fileMode parameter.
2423  */
2424 int
OpenTransientFile(const char * fileName,int fileFlags)2425 OpenTransientFile(const char *fileName, int fileFlags)
2426 {
2427 	return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2428 }
2429 
2430 /*
2431  * Like AllocateFile, but returns an unbuffered fd like open(2)
2432  */
2433 int
OpenTransientFilePerm(const char * fileName,int fileFlags,mode_t fileMode)2434 OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
2435 {
2436 	int			fd;
2437 
2438 	DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2439 			   numAllocatedDescs, fileName));
2440 
2441 	/* Can we allocate another non-virtual FD? */
2442 	if (!reserveAllocatedDesc())
2443 		ereport(ERROR,
2444 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2445 				 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2446 						maxAllocatedDescs, fileName)));
2447 
2448 	/* Close excess kernel FDs. */
2449 	ReleaseLruFiles();
2450 
2451 	fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2452 
2453 	if (fd >= 0)
2454 	{
2455 		AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2456 
2457 		desc->kind = AllocateDescRawFD;
2458 		desc->desc.fd = fd;
2459 		desc->create_subid = GetCurrentSubTransactionId();
2460 		numAllocatedDescs++;
2461 
2462 		return fd;
2463 	}
2464 
2465 	return -1;					/* failure */
2466 }
2467 
2468 /*
2469  * Routines that want to initiate a pipe stream should use OpenPipeStream
2470  * rather than plain popen().  This lets fd.c deal with freeing FDs if
2471  * necessary.  When done, call ClosePipeStream rather than pclose.
2472  *
2473  * This function also ensures that the popen'd program is run with default
2474  * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2475  * uses.  This ensures desirable response to, eg, closing a read pipe early.
2476  */
2477 FILE *
OpenPipeStream(const char * command,const char * mode)2478 OpenPipeStream(const char *command, const char *mode)
2479 {
2480 	FILE	   *file;
2481 	int			save_errno;
2482 
2483 	DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2484 			   numAllocatedDescs, command));
2485 
2486 	/* Can we allocate another non-virtual FD? */
2487 	if (!reserveAllocatedDesc())
2488 		ereport(ERROR,
2489 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2490 				 errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2491 						maxAllocatedDescs, command)));
2492 
2493 	/* Close excess kernel FDs. */
2494 	ReleaseLruFiles();
2495 
2496 TryAgain:
2497 	fflush(stdout);
2498 	fflush(stderr);
2499 	pqsignal(SIGPIPE, SIG_DFL);
2500 	errno = 0;
2501 	file = popen(command, mode);
2502 	save_errno = errno;
2503 	pqsignal(SIGPIPE, SIG_IGN);
2504 	errno = save_errno;
2505 	if (file != NULL)
2506 	{
2507 		AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2508 
2509 		desc->kind = AllocateDescPipe;
2510 		desc->desc.file = file;
2511 		desc->create_subid = GetCurrentSubTransactionId();
2512 		numAllocatedDescs++;
2513 		return desc->desc.file;
2514 	}
2515 
2516 	if (errno == EMFILE || errno == ENFILE)
2517 	{
2518 		ereport(LOG,
2519 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2520 				 errmsg("out of file descriptors: %m; release and retry")));
2521 		if (ReleaseLruFile())
2522 			goto TryAgain;
2523 		errno = save_errno;
2524 	}
2525 
2526 	return NULL;
2527 }
2528 
2529 /*
2530  * Free an AllocateDesc of any type.
2531  *
2532  * The argument *must* point into the allocatedDescs[] array.
2533  */
2534 static int
FreeDesc(AllocateDesc * desc)2535 FreeDesc(AllocateDesc *desc)
2536 {
2537 	int			result;
2538 
2539 	/* Close the underlying object */
2540 	switch (desc->kind)
2541 	{
2542 		case AllocateDescFile:
2543 			result = fclose(desc->desc.file);
2544 			break;
2545 		case AllocateDescPipe:
2546 			result = pclose(desc->desc.file);
2547 			break;
2548 		case AllocateDescDir:
2549 			result = closedir(desc->desc.dir);
2550 			break;
2551 		case AllocateDescRawFD:
2552 			result = close(desc->desc.fd);
2553 			break;
2554 		default:
2555 			elog(ERROR, "AllocateDesc kind not recognized");
2556 			result = 0;			/* keep compiler quiet */
2557 			break;
2558 	}
2559 
2560 	/* Compact storage in the allocatedDescs array */
2561 	numAllocatedDescs--;
2562 	*desc = allocatedDescs[numAllocatedDescs];
2563 
2564 	return result;
2565 }
2566 
2567 /*
2568  * Close a file returned by AllocateFile.
2569  *
2570  * Note we do not check fclose's return value --- it is up to the caller
2571  * to handle close errors.
2572  */
2573 int
FreeFile(FILE * file)2574 FreeFile(FILE *file)
2575 {
2576 	int			i;
2577 
2578 	DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2579 
2580 	/* Remove file from list of allocated files, if it's present */
2581 	for (i = numAllocatedDescs; --i >= 0;)
2582 	{
2583 		AllocateDesc *desc = &allocatedDescs[i];
2584 
2585 		if (desc->kind == AllocateDescFile && desc->desc.file == file)
2586 			return FreeDesc(desc);
2587 	}
2588 
2589 	/* Only get here if someone passes us a file not in allocatedDescs */
2590 	elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2591 
2592 	return fclose(file);
2593 }
2594 
2595 /*
2596  * Close a file returned by OpenTransientFile.
2597  *
2598  * Note we do not check close's return value --- it is up to the caller
2599  * to handle close errors.
2600  */
2601 int
CloseTransientFile(int fd)2602 CloseTransientFile(int fd)
2603 {
2604 	int			i;
2605 
2606 	DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2607 
2608 	/* Remove fd from list of allocated files, if it's present */
2609 	for (i = numAllocatedDescs; --i >= 0;)
2610 	{
2611 		AllocateDesc *desc = &allocatedDescs[i];
2612 
2613 		if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2614 			return FreeDesc(desc);
2615 	}
2616 
2617 	/* Only get here if someone passes us a file not in allocatedDescs */
2618 	elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2619 
2620 	return close(fd);
2621 }
2622 
2623 /*
2624  * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2625  * rather than plain opendir().  This lets fd.c deal with freeing FDs if
2626  * necessary to open the directory, and with closing it after an elog.
2627  * When done, call FreeDir rather than closedir.
2628  *
2629  * Returns NULL, with errno set, on failure.  Note that failure detection
2630  * is commonly left to the following call of ReadDir or ReadDirExtended;
2631  * see the comments for ReadDir.
2632  *
2633  * Ideally this should be the *only* direct call of opendir() in the backend.
2634  */
2635 DIR *
AllocateDir(const char * dirname)2636 AllocateDir(const char *dirname)
2637 {
2638 	DIR		   *dir;
2639 
2640 	DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2641 			   numAllocatedDescs, dirname));
2642 
2643 	/* Can we allocate another non-virtual FD? */
2644 	if (!reserveAllocatedDesc())
2645 		ereport(ERROR,
2646 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2647 				 errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2648 						maxAllocatedDescs, dirname)));
2649 
2650 	/* Close excess kernel FDs. */
2651 	ReleaseLruFiles();
2652 
2653 TryAgain:
2654 	if ((dir = opendir(dirname)) != NULL)
2655 	{
2656 		AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2657 
2658 		desc->kind = AllocateDescDir;
2659 		desc->desc.dir = dir;
2660 		desc->create_subid = GetCurrentSubTransactionId();
2661 		numAllocatedDescs++;
2662 		return desc->desc.dir;
2663 	}
2664 
2665 	if (errno == EMFILE || errno == ENFILE)
2666 	{
2667 		int			save_errno = errno;
2668 
2669 		ereport(LOG,
2670 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2671 				 errmsg("out of file descriptors: %m; release and retry")));
2672 		errno = 0;
2673 		if (ReleaseLruFile())
2674 			goto TryAgain;
2675 		errno = save_errno;
2676 	}
2677 
2678 	return NULL;
2679 }
2680 
2681 /*
2682  * Read a directory opened with AllocateDir, ereport'ing any error.
2683  *
2684  * This is easier to use than raw readdir() since it takes care of some
2685  * otherwise rather tedious and error-prone manipulation of errno.  Also,
2686  * if you are happy with a generic error message for AllocateDir failure,
2687  * you can just do
2688  *
2689  *		dir = AllocateDir(path);
2690  *		while ((dirent = ReadDir(dir, path)) != NULL)
2691  *			process dirent;
2692  *		FreeDir(dir);
2693  *
2694  * since a NULL dir parameter is taken as indicating AllocateDir failed.
2695  * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2696  * use this shortcut.)
2697  *
2698  * The pathname passed to AllocateDir must be passed to this routine too,
2699  * but it is only used for error reporting.
2700  */
2701 struct dirent *
ReadDir(DIR * dir,const char * dirname)2702 ReadDir(DIR *dir, const char *dirname)
2703 {
2704 	return ReadDirExtended(dir, dirname, ERROR);
2705 }
2706 
2707 /*
2708  * Alternate version of ReadDir that allows caller to specify the elevel
2709  * for any error report (whether it's reporting an initial failure of
2710  * AllocateDir or a subsequent directory read failure).
2711  *
2712  * If elevel < ERROR, returns NULL after any error.  With the normal coding
2713  * pattern, this will result in falling out of the loop immediately as
2714  * though the directory contained no (more) entries.
2715  */
2716 struct dirent *
ReadDirExtended(DIR * dir,const char * dirname,int elevel)2717 ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2718 {
2719 	struct dirent *dent;
2720 
2721 	/* Give a generic message for AllocateDir failure, if caller didn't */
2722 	if (dir == NULL)
2723 	{
2724 		ereport(elevel,
2725 				(errcode_for_file_access(),
2726 				 errmsg("could not open directory \"%s\": %m",
2727 						dirname)));
2728 		return NULL;
2729 	}
2730 
2731 	errno = 0;
2732 	if ((dent = readdir(dir)) != NULL)
2733 		return dent;
2734 
2735 	if (errno)
2736 		ereport(elevel,
2737 				(errcode_for_file_access(),
2738 				 errmsg("could not read directory \"%s\": %m",
2739 						dirname)));
2740 	return NULL;
2741 }
2742 
2743 /*
2744  * Close a directory opened with AllocateDir.
2745  *
2746  * Returns closedir's return value (with errno set if it's not 0).
2747  * Note we do not check the return value --- it is up to the caller
2748  * to handle close errors if wanted.
2749  *
2750  * Does nothing if dir == NULL; we assume that directory open failure was
2751  * already reported if desired.
2752  */
2753 int
FreeDir(DIR * dir)2754 FreeDir(DIR *dir)
2755 {
2756 	int			i;
2757 
2758 	/* Nothing to do if AllocateDir failed */
2759 	if (dir == NULL)
2760 		return 0;
2761 
2762 	DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2763 
2764 	/* Remove dir from list of allocated dirs, if it's present */
2765 	for (i = numAllocatedDescs; --i >= 0;)
2766 	{
2767 		AllocateDesc *desc = &allocatedDescs[i];
2768 
2769 		if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2770 			return FreeDesc(desc);
2771 	}
2772 
2773 	/* Only get here if someone passes us a dir not in allocatedDescs */
2774 	elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
2775 
2776 	return closedir(dir);
2777 }
2778 
2779 
2780 /*
2781  * Close a pipe stream returned by OpenPipeStream.
2782  */
2783 int
ClosePipeStream(FILE * file)2784 ClosePipeStream(FILE *file)
2785 {
2786 	int			i;
2787 
2788 	DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
2789 
2790 	/* Remove file from list of allocated files, if it's present */
2791 	for (i = numAllocatedDescs; --i >= 0;)
2792 	{
2793 		AllocateDesc *desc = &allocatedDescs[i];
2794 
2795 		if (desc->kind == AllocateDescPipe && desc->desc.file == file)
2796 			return FreeDesc(desc);
2797 	}
2798 
2799 	/* Only get here if someone passes us a file not in allocatedDescs */
2800 	elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
2801 
2802 	return pclose(file);
2803 }
2804 
2805 /*
2806  * closeAllVfds
2807  *
2808  * Force all VFDs into the physically-closed state, so that the fewest
2809  * possible number of kernel file descriptors are in use.  There is no
2810  * change in the logical state of the VFDs.
2811  */
2812 void
closeAllVfds(void)2813 closeAllVfds(void)
2814 {
2815 	Index		i;
2816 
2817 	if (SizeVfdCache > 0)
2818 	{
2819 		Assert(FileIsNotOpen(0));	/* Make sure ring not corrupted */
2820 		for (i = 1; i < SizeVfdCache; i++)
2821 		{
2822 			if (!FileIsNotOpen(i))
2823 				LruDelete(i);
2824 		}
2825 	}
2826 }
2827 
2828 
2829 /*
2830  * SetTempTablespaces
2831  *
2832  * Define a list (actually an array) of OIDs of tablespaces to use for
2833  * temporary files.  This list will be used until end of transaction,
2834  * unless this function is called again before then.  It is caller's
2835  * responsibility that the passed-in array has adequate lifespan (typically
2836  * it'd be allocated in TopTransactionContext).
2837  *
2838  * Some entries of the array may be InvalidOid, indicating that the current
2839  * database's default tablespace should be used.
2840  */
2841 void
SetTempTablespaces(Oid * tableSpaces,int numSpaces)2842 SetTempTablespaces(Oid *tableSpaces, int numSpaces)
2843 {
2844 	Assert(numSpaces >= 0);
2845 	tempTableSpaces = tableSpaces;
2846 	numTempTableSpaces = numSpaces;
2847 
2848 	/*
2849 	 * Select a random starting point in the list.  This is to minimize
2850 	 * conflicts between backends that are most likely sharing the same list
2851 	 * of temp tablespaces.  Note that if we create multiple temp files in the
2852 	 * same transaction, we'll advance circularly through the list --- this
2853 	 * ensures that large temporary sort files are nicely spread across all
2854 	 * available tablespaces.
2855 	 */
2856 	if (numSpaces > 1)
2857 		nextTempTableSpace = random() % numSpaces;
2858 	else
2859 		nextTempTableSpace = 0;
2860 }
2861 
2862 /*
2863  * TempTablespacesAreSet
2864  *
2865  * Returns true if SetTempTablespaces has been called in current transaction.
2866  * (This is just so that tablespaces.c doesn't need its own per-transaction
2867  * state.)
2868  */
2869 bool
TempTablespacesAreSet(void)2870 TempTablespacesAreSet(void)
2871 {
2872 	return (numTempTableSpaces >= 0);
2873 }
2874 
2875 /*
2876  * GetTempTablespaces
2877  *
2878  * Populate an array with the OIDs of the tablespaces that should be used for
2879  * temporary files.  (Some entries may be InvalidOid, indicating that the
2880  * current database's default tablespace should be used.)  At most numSpaces
2881  * entries will be filled.
2882  * Returns the number of OIDs that were copied into the output array.
2883  */
2884 int
GetTempTablespaces(Oid * tableSpaces,int numSpaces)2885 GetTempTablespaces(Oid *tableSpaces, int numSpaces)
2886 {
2887 	int			i;
2888 
2889 	Assert(TempTablespacesAreSet());
2890 	for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
2891 		tableSpaces[i] = tempTableSpaces[i];
2892 
2893 	return i;
2894 }
2895 
2896 /*
2897  * GetNextTempTableSpace
2898  *
2899  * Select the next temp tablespace to use.  A result of InvalidOid means
2900  * to use the current database's default tablespace.
2901  */
2902 Oid
GetNextTempTableSpace(void)2903 GetNextTempTableSpace(void)
2904 {
2905 	if (numTempTableSpaces > 0)
2906 	{
2907 		/* Advance nextTempTableSpace counter with wraparound */
2908 		if (++nextTempTableSpace >= numTempTableSpaces)
2909 			nextTempTableSpace = 0;
2910 		return tempTableSpaces[nextTempTableSpace];
2911 	}
2912 	return InvalidOid;
2913 }
2914 
2915 
2916 /*
2917  * AtEOSubXact_Files
2918  *
2919  * Take care of subtransaction commit/abort.  At abort, we close temp files
2920  * that the subtransaction may have opened.  At commit, we reassign the
2921  * files that were opened to the parent subtransaction.
2922  */
2923 void
AtEOSubXact_Files(bool isCommit,SubTransactionId mySubid,SubTransactionId parentSubid)2924 AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
2925 				  SubTransactionId parentSubid)
2926 {
2927 	Index		i;
2928 
2929 	for (i = 0; i < numAllocatedDescs; i++)
2930 	{
2931 		if (allocatedDescs[i].create_subid == mySubid)
2932 		{
2933 			if (isCommit)
2934 				allocatedDescs[i].create_subid = parentSubid;
2935 			else
2936 			{
2937 				/* have to recheck the item after FreeDesc (ugly) */
2938 				FreeDesc(&allocatedDescs[i--]);
2939 			}
2940 		}
2941 	}
2942 }
2943 
2944 /*
2945  * AtEOXact_Files
2946  *
2947  * This routine is called during transaction commit or abort.  All still-open
2948  * per-transaction temporary file VFDs are closed, which also causes the
2949  * underlying files to be deleted (although they should've been closed already
2950  * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
2951  * closed. We also forget any transaction-local temp tablespace list.
2952  *
2953  * The isCommit flag is used only to decide whether to emit warnings about
2954  * unclosed files.
2955  */
2956 void
AtEOXact_Files(bool isCommit)2957 AtEOXact_Files(bool isCommit)
2958 {
2959 	CleanupTempFiles(isCommit, false);
2960 	tempTableSpaces = NULL;
2961 	numTempTableSpaces = -1;
2962 }
2963 
2964 /*
2965  * AtProcExit_Files
2966  *
2967  * on_proc_exit hook to clean up temp files during backend shutdown.
2968  * Here, we want to clean up *all* temp files including interXact ones.
2969  */
2970 static void
AtProcExit_Files(int code,Datum arg)2971 AtProcExit_Files(int code, Datum arg)
2972 {
2973 	CleanupTempFiles(false, true);
2974 }
2975 
2976 /*
2977  * Close temporary files and delete their underlying files.
2978  *
2979  * isCommit: if true, this is normal transaction commit, and we don't
2980  * expect any remaining files; warn if there are some.
2981  *
2982  * isProcExit: if true, this is being called as the backend process is
2983  * exiting. If that's the case, we should remove all temporary files; if
2984  * that's not the case, we are being called for transaction commit/abort
2985  * and should only remove transaction-local temp files.  In either case,
2986  * also clean up "allocated" stdio files, dirs and fds.
2987  */
2988 static void
CleanupTempFiles(bool isCommit,bool isProcExit)2989 CleanupTempFiles(bool isCommit, bool isProcExit)
2990 {
2991 	Index		i;
2992 
2993 	/*
2994 	 * Careful here: at proc_exit we need extra cleanup, not just
2995 	 * xact_temporary files.
2996 	 */
2997 	if (isProcExit || have_xact_temporary_files)
2998 	{
2999 		Assert(FileIsNotOpen(0));	/* Make sure ring not corrupted */
3000 		for (i = 1; i < SizeVfdCache; i++)
3001 		{
3002 			unsigned short fdstate = VfdCache[i].fdstate;
3003 
3004 			if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
3005 				VfdCache[i].fileName != NULL)
3006 			{
3007 				/*
3008 				 * If we're in the process of exiting a backend process, close
3009 				 * all temporary files. Otherwise, only close temporary files
3010 				 * local to the current transaction. They should be closed by
3011 				 * the ResourceOwner mechanism already, so this is just a
3012 				 * debugging cross-check.
3013 				 */
3014 				if (isProcExit)
3015 					FileClose(i);
3016 				else if (fdstate & FD_CLOSE_AT_EOXACT)
3017 				{
3018 					elog(WARNING,
3019 						 "temporary file %s not closed at end-of-transaction",
3020 						 VfdCache[i].fileName);
3021 					FileClose(i);
3022 				}
3023 			}
3024 		}
3025 
3026 		have_xact_temporary_files = false;
3027 	}
3028 
3029 	/* Complain if any allocated files remain open at commit. */
3030 	if (isCommit && numAllocatedDescs > 0)
3031 		elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
3032 			 numAllocatedDescs);
3033 
3034 	/* Clean up "allocated" stdio files, dirs and fds. */
3035 	while (numAllocatedDescs > 0)
3036 		FreeDesc(&allocatedDescs[0]);
3037 }
3038 
3039 
3040 /*
3041  * Remove temporary and temporary relation files left over from a prior
3042  * postmaster session
3043  *
3044  * This should be called during postmaster startup.  It will forcibly
3045  * remove any leftover files created by OpenTemporaryFile and any leftover
3046  * temporary relation files created by mdcreate.
3047  *
3048  * During post-backend-crash restart cycle, this routine is called when
3049  * remove_temp_files_after_crash GUC is enabled. Multiple crashes while
3050  * queries are using temp files could result in useless storage usage that can
3051  * only be reclaimed by a service restart. The argument against enabling it is
3052  * that someone might want to examine the temporary files for debugging
3053  * purposes. This does however mean that OpenTemporaryFile had better allow for
3054  * collision with an existing temp file name.
3055  *
3056  * NOTE: this function and its subroutines generally report syscall failures
3057  * with ereport(LOG) and keep going.  Removing temp files is not so critical
3058  * that we should fail to start the database when we can't do it.
3059  */
3060 void
RemovePgTempFiles(void)3061 RemovePgTempFiles(void)
3062 {
3063 	char		temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
3064 	DIR		   *spc_dir;
3065 	struct dirent *spc_de;
3066 
3067 	/*
3068 	 * First process temp files in pg_default ($PGDATA/base)
3069 	 */
3070 	snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3071 	RemovePgTempFilesInDir(temp_path, true, false);
3072 	RemovePgTempRelationFiles("base");
3073 
3074 	/*
3075 	 * Cycle through temp directories for all non-default tablespaces.
3076 	 */
3077 	spc_dir = AllocateDir("pg_tblspc");
3078 
3079 	while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL)
3080 	{
3081 		if (strcmp(spc_de->d_name, ".") == 0 ||
3082 			strcmp(spc_de->d_name, "..") == 0)
3083 			continue;
3084 
3085 		snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
3086 				 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
3087 		RemovePgTempFilesInDir(temp_path, true, false);
3088 
3089 		snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
3090 				 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
3091 		RemovePgTempRelationFiles(temp_path);
3092 	}
3093 
3094 	FreeDir(spc_dir);
3095 
3096 	/*
3097 	 * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3098 	 * DataDir as well.  However, that is *not* cleaned here because doing so
3099 	 * would create a race condition.  It's done separately, earlier in
3100 	 * postmaster startup.
3101 	 */
3102 }
3103 
3104 /*
3105  * Process one pgsql_tmp directory for RemovePgTempFiles.
3106  *
3107  * If missing_ok is true, it's all right for the named directory to not exist.
3108  * Any other problem results in a LOG message.  (missing_ok should be true at
3109  * the top level, since pgsql_tmp directories are not created until needed.)
3110  *
3111  * At the top level, this should be called with unlink_all = false, so that
3112  * only files matching the temporary name prefix will be unlinked.  When
3113  * recursing it will be called with unlink_all = true to unlink everything
3114  * under a top-level temporary directory.
3115  *
3116  * (These two flags could be replaced by one, but it seems clearer to keep
3117  * them separate.)
3118  */
3119 void
RemovePgTempFilesInDir(const char * tmpdirname,bool missing_ok,bool unlink_all)3120 RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3121 {
3122 	DIR		   *temp_dir;
3123 	struct dirent *temp_de;
3124 	char		rm_path[MAXPGPATH * 2];
3125 
3126 	temp_dir = AllocateDir(tmpdirname);
3127 
3128 	if (temp_dir == NULL && errno == ENOENT && missing_ok)
3129 		return;
3130 
3131 	while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
3132 	{
3133 		if (strcmp(temp_de->d_name, ".") == 0 ||
3134 			strcmp(temp_de->d_name, "..") == 0)
3135 			continue;
3136 
3137 		snprintf(rm_path, sizeof(rm_path), "%s/%s",
3138 				 tmpdirname, temp_de->d_name);
3139 
3140 		if (unlink_all ||
3141 			strncmp(temp_de->d_name,
3142 					PG_TEMP_FILE_PREFIX,
3143 					strlen(PG_TEMP_FILE_PREFIX)) == 0)
3144 		{
3145 			struct stat statbuf;
3146 
3147 			if (lstat(rm_path, &statbuf) < 0)
3148 			{
3149 				ereport(LOG,
3150 						(errcode_for_file_access(),
3151 						 errmsg("could not stat file \"%s\": %m", rm_path)));
3152 				continue;
3153 			}
3154 
3155 			if (S_ISDIR(statbuf.st_mode))
3156 			{
3157 				/* recursively remove contents, then directory itself */
3158 				RemovePgTempFilesInDir(rm_path, false, true);
3159 
3160 				if (rmdir(rm_path) < 0)
3161 					ereport(LOG,
3162 							(errcode_for_file_access(),
3163 							 errmsg("could not remove directory \"%s\": %m",
3164 									rm_path)));
3165 			}
3166 			else
3167 			{
3168 				if (unlink(rm_path) < 0)
3169 					ereport(LOG,
3170 							(errcode_for_file_access(),
3171 							 errmsg("could not remove file \"%s\": %m",
3172 									rm_path)));
3173 			}
3174 		}
3175 		else
3176 			ereport(LOG,
3177 					(errmsg("unexpected file found in temporary-files directory: \"%s\"",
3178 							rm_path)));
3179 	}
3180 
3181 	FreeDir(temp_dir);
3182 }
3183 
3184 /* Process one tablespace directory, look for per-DB subdirectories */
3185 static void
RemovePgTempRelationFiles(const char * tsdirname)3186 RemovePgTempRelationFiles(const char *tsdirname)
3187 {
3188 	DIR		   *ts_dir;
3189 	struct dirent *de;
3190 	char		dbspace_path[MAXPGPATH * 2];
3191 
3192 	ts_dir = AllocateDir(tsdirname);
3193 
3194 	while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3195 	{
3196 		/*
3197 		 * We're only interested in the per-database directories, which have
3198 		 * numeric names.  Note that this code will also (properly) ignore "."
3199 		 * and "..".
3200 		 */
3201 		if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3202 			continue;
3203 
3204 		snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3205 				 tsdirname, de->d_name);
3206 		RemovePgTempRelationFilesInDbspace(dbspace_path);
3207 	}
3208 
3209 	FreeDir(ts_dir);
3210 }
3211 
3212 /* Process one per-dbspace directory for RemovePgTempRelationFiles */
3213 static void
RemovePgTempRelationFilesInDbspace(const char * dbspacedirname)3214 RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3215 {
3216 	DIR		   *dbspace_dir;
3217 	struct dirent *de;
3218 	char		rm_path[MAXPGPATH * 2];
3219 
3220 	dbspace_dir = AllocateDir(dbspacedirname);
3221 
3222 	while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3223 	{
3224 		if (!looks_like_temp_rel_name(de->d_name))
3225 			continue;
3226 
3227 		snprintf(rm_path, sizeof(rm_path), "%s/%s",
3228 				 dbspacedirname, de->d_name);
3229 
3230 		if (unlink(rm_path) < 0)
3231 			ereport(LOG,
3232 					(errcode_for_file_access(),
3233 					 errmsg("could not remove file \"%s\": %m",
3234 							rm_path)));
3235 	}
3236 
3237 	FreeDir(dbspace_dir);
3238 }
3239 
/*
 * looks_like_temp_rel_name
 *
 * Report whether 'name' has the shape of a temporary relation file name:
 * t<digits>_<digits>, optionally followed by _<forkname>, optionally
 * followed by .<digits> (a segment number).  Nothing may follow that.
 */
bool
looks_like_temp_rel_name(const char *name)
{
	int			idx;
	int			digits_start;

	/* The name has to begin with "t". */
	if (name[0] != 't')
		return false;

	/* Next: at least one digit, terminated by an underscore. */
	idx = 1;
	while (isdigit((unsigned char) name[idx]))
		idx++;
	if (idx == 1 || name[idx] != '_')
		return false;

	/* After the underscore: another run of at least one digit. */
	idx++;
	digits_start = idx;
	while (isdigit((unsigned char) name[idx]))
		idx++;
	if (idx == digits_start)
		return false;

	/* Optional "_forkname"; the fork name is validated by forkname_chars. */
	if (name[idx] == '_')
	{
		int			forklen = forkname_chars(&name[idx + 1], NULL);

		if (forklen <= 0)
			return false;
		idx += forklen + 1;
	}

	/* Optional ".segment": a dot followed by at least one digit. */
	if (name[idx] == '.')
	{
		int			seglen = 1; /* counts the dot itself */

		while (isdigit((unsigned char) name[idx + seglen]))
			seglen++;
		if (seglen <= 1)
			return false;
		idx += seglen;
	}

	/* It is a match only if the whole string has been consumed. */
	return name[idx] == '\0';
}
3288 
#ifdef HAVE_SYNCFS
/*
 * do_syncfs
 *
 * Open 'path' read-only and call syncfs() on the resulting descriptor.
 * Failure to open or to sync is reported at LOG level; nothing is returned
 * to the caller.
 */
static void
do_syncfs(const char *path)
{
	int			fd = OpenTransientFile(path, O_RDONLY);

	if (fd >= 0)
	{
		if (syncfs(fd) < 0)
			ereport(LOG,
					(errcode_for_file_access(),
					 errmsg("could not synchronize file system for file \"%s\": %m", path)));
		CloseTransientFile(fd);
	}
	else
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", path)));
}
#endif
3310 
3311 /*
3312  * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
3313  * all potential filesystem, depending on recovery_init_sync_method setting.
3314  *
3315  * We fsync regular files and directories wherever they are, but we
3316  * follow symlinks only for pg_wal and immediately under pg_tblspc.
3317  * Other symlinks are presumed to point at files we're not responsible
3318  * for fsyncing, and might not have privileges to write at all.
3319  *
3320  * Errors are logged but not considered fatal; that's because this is used
3321  * only during database startup, to deal with the possibility that there are
3322  * issued-but-unsynced writes pending against the data directory.  We want to
3323  * ensure that such writes reach disk before anything that's done in the new
3324  * run.  However, aborting on error would result in failure to start for
3325  * harmless cases such as read-only files in the data directory, and that's
3326  * not good either.
3327  *
3328  * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3329  * rewriting all changes again during recovery.
3330  *
3331  * Note we assume we're chdir'd into PGDATA to begin with.
3332  */
3333 void
SyncDataDirectory(void)3334 SyncDataDirectory(void)
3335 {
3336 	bool		xlog_is_symlink;
3337 
3338 	/* We can skip this whole thing if fsync is disabled. */
3339 	if (!enableFsync)
3340 		return;
3341 
3342 	/*
3343 	 * If pg_wal is a symlink, we'll need to recurse into it separately,
3344 	 * because the first walkdir below will ignore it.
3345 	 */
3346 	xlog_is_symlink = false;
3347 
3348 #ifndef WIN32
3349 	{
3350 		struct stat st;
3351 
3352 		if (lstat("pg_wal", &st) < 0)
3353 			ereport(LOG,
3354 					(errcode_for_file_access(),
3355 					 errmsg("could not stat file \"%s\": %m",
3356 							"pg_wal")));
3357 		else if (S_ISLNK(st.st_mode))
3358 			xlog_is_symlink = true;
3359 	}
3360 #else
3361 	if (pgwin32_is_junction("pg_wal"))
3362 		xlog_is_symlink = true;
3363 #endif
3364 
3365 #ifdef HAVE_SYNCFS
3366 	if (recovery_init_sync_method == RECOVERY_INIT_SYNC_METHOD_SYNCFS)
3367 	{
3368 		DIR		   *dir;
3369 		struct dirent *de;
3370 
3371 		/*
3372 		 * On Linux, we don't have to open every single file one by one.  We
3373 		 * can use syncfs() to sync whole filesystems.  We only expect
3374 		 * filesystem boundaries to exist where we tolerate symlinks, namely
3375 		 * pg_wal and the tablespaces, so we call syncfs() for each of those
3376 		 * directories.
3377 		 */
3378 
3379 		/* Sync the top level pgdata directory. */
3380 		do_syncfs(".");
3381 		/* If any tablespaces are configured, sync each of those. */
3382 		dir = AllocateDir("pg_tblspc");
3383 		while ((de = ReadDirExtended(dir, "pg_tblspc", LOG)))
3384 		{
3385 			char		path[MAXPGPATH];
3386 
3387 			if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
3388 				continue;
3389 
3390 			snprintf(path, MAXPGPATH, "pg_tblspc/%s", de->d_name);
3391 			do_syncfs(path);
3392 		}
3393 		FreeDir(dir);
3394 		/* If pg_wal is a symlink, process that too. */
3395 		if (xlog_is_symlink)
3396 			do_syncfs("pg_wal");
3397 		return;
3398 	}
3399 #endif							/* !HAVE_SYNCFS */
3400 
3401 	/*
3402 	 * If possible, hint to the kernel that we're soon going to fsync the data
3403 	 * directory and its contents.  Errors in this step are even less
3404 	 * interesting than normal, so log them only at DEBUG1.
3405 	 */
3406 #ifdef PG_FLUSH_DATA_WORKS
3407 	walkdir(".", pre_sync_fname, false, DEBUG1);
3408 	if (xlog_is_symlink)
3409 		walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3410 	walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
3411 #endif
3412 
3413 	/*
3414 	 * Now we do the fsync()s in the same order.
3415 	 *
3416 	 * The main call ignores symlinks, so in addition to specially processing
3417 	 * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3418 	 * process_symlinks = true.  Note that if there are any plain directories
3419 	 * in pg_tblspc, they'll get fsync'd twice.  That's not an expected case
3420 	 * so we don't worry about optimizing it.
3421 	 */
3422 	walkdir(".", datadir_fsync_fname, false, LOG);
3423 	if (xlog_is_symlink)
3424 		walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3425 	walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
3426 }
3427 
3428 /*
3429  * walkdir: recursively walk a directory, applying the action to each
3430  * regular file and directory (including the named directory itself).
3431  *
3432  * If process_symlinks is true, the action and recursion are also applied
3433  * to regular files and directories that are pointed to by symlinks in the
3434  * given directory; otherwise symlinks are ignored.  Symlinks are always
3435  * ignored in subdirectories, ie we intentionally don't pass down the
3436  * process_symlinks flag to recursive calls.
3437  *
3438  * Errors are reported at level elevel, which might be ERROR or less.
3439  *
3440  * See also walkdir in file_utils.c, which is a frontend version of this
3441  * logic.
3442  */
3443 static void
walkdir(const char * path,void (* action)(const char * fname,bool isdir,int elevel),bool process_symlinks,int elevel)3444 walkdir(const char *path,
3445 		void (*action) (const char *fname, bool isdir, int elevel),
3446 		bool process_symlinks,
3447 		int elevel)
3448 {
3449 	DIR		   *dir;
3450 	struct dirent *de;
3451 
3452 	dir = AllocateDir(path);
3453 
3454 	while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3455 	{
3456 		char		subpath[MAXPGPATH * 2];
3457 
3458 		CHECK_FOR_INTERRUPTS();
3459 
3460 		if (strcmp(de->d_name, ".") == 0 ||
3461 			strcmp(de->d_name, "..") == 0)
3462 			continue;
3463 
3464 		snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3465 
3466 		switch (get_dirent_type(subpath, de, process_symlinks, elevel))
3467 		{
3468 			case PGFILETYPE_REG:
3469 				(*action) (subpath, false, elevel);
3470 				break;
3471 			case PGFILETYPE_DIR:
3472 				walkdir(subpath, action, false, elevel);
3473 				break;
3474 			default:
3475 
3476 				/*
3477 				 * Errors are already reported directly by get_dirent_type(),
3478 				 * and any remaining symlinks and unknown file types are
3479 				 * ignored.
3480 				 */
3481 				break;
3482 		}
3483 	}
3484 
3485 	FreeDir(dir);				/* we ignore any error here */
3486 
3487 	/*
3488 	 * It's important to fsync the destination directory itself as individual
3489 	 * file fsyncs don't guarantee that the directory entry for the file is
3490 	 * synced.  However, skip this if AllocateDir failed; the action function
3491 	 * might not be robust against that.
3492 	 */
3493 	if (dir)
3494 		(*action) (path, true, elevel);
3495 }
3496 
3497 
/*
 * pre_sync_fname
 *
 * Give the OS a hint that this file is about to be fsync()'d.
 *
 * A failure to open caused by missing permission (EACCES) is ignored
 * silently; other errors are logged at the level the caller asked for.
 * Directories are not touched at all.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Flushing a directory would most likely just fail, so don't try. */
	if (isdir)
		return;

	fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);
	if (fd < 0)
	{
		/* Unreadable files are skipped without comment. */
		if (errno != EACCES)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * Errors are ignored by pg_flush_data(); that is acceptable because
	 * this is nothing more than a hint.
	 */
	pg_flush_data(fd, 0, 0);

	if (CloseTransientFile(fd) != 0)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", fname)));
}

#endif							/* PG_FLUSH_DATA_WORKS */
3540 
/*
 * datadir_fsync_fname
 *
 * Action callback with the signature walkdir expects: fsync one file or
 * directory through fsync_fname_ext().  The result of fsync_fname_ext() is
 * not used.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	/*
	 * Errors about unreadable files should be ignored silently, so ask
	 * fsync_fname_ext() for that by passing ignore_perm = true.
	 */
	(void) fsync_fname_ext(fname, isdir, true, elevel);
}
3550 
/*
 * unlink_if_exists_fname
 *
 * Action callback with the same signature as the one walkdir takes: remove
 * one file or directory.  For a directory, rmdir() is used and a failure
 * other than ENOENT is reported at 'elevel'.  A file is removed through
 * PathNameDeleteTemporaryFile().
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (!isdir)
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
		return;
	}

	/* A directory that is already gone is not an error. */
	if (rmdir(fname) != 0 && errno != ENOENT)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove directory \"%s\": %m", fname)));
}
3567 
3568 /*
3569  * fsync_fname_ext -- Try to fsync a file or directory
3570  *
3571  * If ignore_perm is true, ignore errors upon trying to open unreadable
3572  * files. Logs other errors at a caller-specified level.
3573  *
3574  * Returns 0 if the operation succeeded, -1 otherwise.
3575  */
3576 int
fsync_fname_ext(const char * fname,bool isdir,bool ignore_perm,int elevel)3577 fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3578 {
3579 	int			fd;
3580 	int			flags;
3581 	int			returncode;
3582 
3583 	/*
3584 	 * Some OSs require directories to be opened read-only whereas other
3585 	 * systems don't allow us to fsync files opened read-only; so we need both
3586 	 * cases here.  Using O_RDWR will cause us to fail to fsync files that are
3587 	 * not writable by our userid, but we assume that's OK.
3588 	 */
3589 	flags = PG_BINARY;
3590 	if (!isdir)
3591 		flags |= O_RDWR;
3592 	else
3593 		flags |= O_RDONLY;
3594 
3595 	fd = OpenTransientFile(fname, flags);
3596 
3597 	/*
3598 	 * Some OSs don't allow us to open directories at all (Windows returns
3599 	 * EACCES), just ignore the error in that case.  If desired also silently
3600 	 * ignoring errors about unreadable files. Log others.
3601 	 */
3602 	if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3603 		return 0;
3604 	else if (fd < 0 && ignore_perm && errno == EACCES)
3605 		return 0;
3606 	else if (fd < 0)
3607 	{
3608 		ereport(elevel,
3609 				(errcode_for_file_access(),
3610 				 errmsg("could not open file \"%s\": %m", fname)));
3611 		return -1;
3612 	}
3613 
3614 	returncode = pg_fsync(fd);
3615 
3616 	/*
3617 	 * Some OSes don't allow us to fsync directories at all, so we can ignore
3618 	 * those errors. Anything else needs to be logged.
3619 	 */
3620 	if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3621 	{
3622 		int			save_errno;
3623 
3624 		/* close file upon error, might not be in transaction context */
3625 		save_errno = errno;
3626 		(void) CloseTransientFile(fd);
3627 		errno = save_errno;
3628 
3629 		ereport(elevel,
3630 				(errcode_for_file_access(),
3631 				 errmsg("could not fsync file \"%s\": %m", fname)));
3632 		return -1;
3633 	}
3634 
3635 	if (CloseTransientFile(fd) != 0)
3636 	{
3637 		ereport(elevel,
3638 				(errcode_for_file_access(),
3639 				 errmsg("could not close file \"%s\": %m", fname)));
3640 		return -1;
3641 	}
3642 
3643 	return 0;
3644 }
3645 
3646 /*
3647  * fsync_parent_path -- fsync the parent path of a file or directory
3648  *
3649  * This is aimed at making file operations persistent on disk in case of
3650  * an OS crash or power failure.
3651  */
3652 static int
fsync_parent_path(const char * fname,int elevel)3653 fsync_parent_path(const char *fname, int elevel)
3654 {
3655 	char		parentpath[MAXPGPATH];
3656 
3657 	strlcpy(parentpath, fname, MAXPGPATH);
3658 	get_parent_directory(parentpath);
3659 
3660 	/*
3661 	 * get_parent_directory() returns an empty string if the input argument is
3662 	 * just a file name (see comments in path.c), so handle that as being the
3663 	 * current directory.
3664 	 */
3665 	if (strlen(parentpath) == 0)
3666 		strlcpy(parentpath, ".", MAXPGPATH);
3667 
3668 	if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3669 		return -1;
3670 
3671 	return 0;
3672 }
3673 
3674 /*
3675  * Create a PostgreSQL data sub-directory
3676  *
3677  * The data directory itself, and most of its sub-directories, are created at
3678  * initdb time, but we do have some occasions when we create directories in
3679  * the backend (CREATE TABLESPACE, for example).  In those cases, we want to
3680  * make sure that those directories are created consistently.  Today, that means
3681  * making sure that the created directory has the correct permissions, which is
3682  * what pg_dir_create_mode tracks for us.
3683  *
3684  * Note that we also set the umask() based on what we understand the correct
3685  * permissions to be (see file_perm.c).
3686  *
3687  * For permissions other than the default, mkdir() can be used directly, but
3688  * be sure to consider carefully such cases -- a sub-directory with incorrect
3689  * permissions in a PostgreSQL data directory could cause backups and other
3690  * processes to fail.
3691  */
3692 int
MakePGDirectory(const char * directoryName)3693 MakePGDirectory(const char *directoryName)
3694 {
3695 	return mkdir(directoryName, pg_dir_create_mode);
3696 }
3697 
3698 /*
3699  * Return the passed-in error level, or PANIC if data_sync_retry is off.
3700  *
3701  * Failure to fsync any data file is cause for immediate panic, unless
3702  * data_sync_retry is enabled.  Data may have been written to the operating
3703  * system and removed from our buffer pool already, and if we are running on
3704  * an operating system that forgets dirty data on write-back failure, there
3705  * may be only one copy of the data remaining: in the WAL.  A later attempt to
3706  * fsync again might falsely report success.  Therefore we must not allow any
3707  * further checkpoints to be attempted.  data_sync_retry can in theory be
3708  * enabled on systems known not to drop dirty buffered data on write-back
3709  * failure (with the likely outcome that checkpoints will continue to fail
3710  * until the underlying problem is fixed).
3711  *
3712  * Any code that reports a failure from fsync() or related functions should
3713  * filter the error level with this function.
3714  */
3715 int
data_sync_elevel(int elevel)3716 data_sync_elevel(int elevel)
3717 {
3718 	return data_sync_retry ? elevel : PANIC;
3719 }
3720 
3721 /*
3722  * A convenience wrapper for pg_pwritev() that retries on partial write.  If an
3723  * error is returned, it is unspecified how much has been written.
3724  */
3725 ssize_t
pg_pwritev_with_retry(int fd,const struct iovec * iov,int iovcnt,off_t offset)3726 pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
3727 {
3728 	struct iovec iov_copy[PG_IOV_MAX];
3729 	ssize_t		sum = 0;
3730 	ssize_t		part;
3731 
3732 	/* We'd better have space to make a copy, in case we need to retry. */
3733 	if (iovcnt > PG_IOV_MAX)
3734 	{
3735 		errno = EINVAL;
3736 		return -1;
3737 	}
3738 
3739 	for (;;)
3740 	{
3741 		/* Write as much as we can. */
3742 		part = pg_pwritev(fd, iov, iovcnt, offset);
3743 		if (part < 0)
3744 			return -1;
3745 
3746 #ifdef SIMULATE_SHORT_WRITE
3747 		part = Min(part, 4096);
3748 #endif
3749 
3750 		/* Count our progress. */
3751 		sum += part;
3752 		offset += part;
3753 
3754 		/* Step over iovecs that are done. */
3755 		while (iovcnt > 0 && iov->iov_len <= part)
3756 		{
3757 			part -= iov->iov_len;
3758 			++iov;
3759 			--iovcnt;
3760 		}
3761 
3762 		/* Are they all done? */
3763 		if (iovcnt == 0)
3764 		{
3765 			/* We don't expect the kernel to write more than requested. */
3766 			Assert(part == 0);
3767 			break;
3768 		}
3769 
3770 		/*
3771 		 * Move whatever's left to the front of our mutable copy and adjust
3772 		 * the leading iovec.
3773 		 */
3774 		Assert(iovcnt > 0);
3775 		memmove(iov_copy, iov, sizeof(*iov) * iovcnt);
3776 		Assert(iov->iov_len > part);
3777 		iov_copy[0].iov_base = (char *) iov_copy[0].iov_base + part;
3778 		iov_copy[0].iov_len -= part;
3779 		iov = iov_copy;
3780 	}
3781 
3782 	return sum;
3783 }
3784