1 /*-------------------------------------------------------------------------
2  *
3  * fd.c
4  *	  Virtual file descriptor code.
5  *
6  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  *	  src/backend/storage/file/fd.c
11  *
12  * NOTES:
13  *
14  * This code manages a cache of 'virtual' file descriptors (VFDs).
15  * The server opens many file descriptors for a variety of reasons,
16  * including base tables, scratch files (e.g., sort and hash spool
17  * files), and random calls to C library routines like system(3); it
18  * is quite easy to exceed system limits on the number of open files a
19  * single process can have.  (This is around 1024 on many modern
20  * operating systems, but may be lower on others.)
21  *
22  * VFDs are managed as an LRU pool, with actual OS file descriptors
23  * being opened and closed as needed.  Obviously, if a routine is
24  * opened using these interfaces, all subsequent operations must also
25  * be through these interfaces (the File type is not a real file
26  * descriptor).
27  *
28  * For this scheme to work, most (if not all) routines throughout the
29  * server should use these interfaces instead of calling the C library
30  * routines (e.g., open(2) and fopen(3)) themselves.  Otherwise, we
31  * may find ourselves short of real file descriptors anyway.
32  *
33  * INTERFACE ROUTINES
34  *
35  * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36  * A File opened with OpenTemporaryFile is automatically deleted when the
37  * File is closed, either explicitly or implicitly at end of transaction or
38  * process exit. PathNameOpenFile is intended for files that are held open
39  * for a long time, like relation files. It is the caller's responsibility
40  * to close them; there is no automatic mechanism in fd.c for that.
41  *
42  * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43  * temporary files that have names so that they can be shared between
44  * backends.  Such files are automatically closed and count against the
45  * temporary file limit of the backend that creates them, but unlike anonymous
46  * files they are not automatically deleted.  See sharedfileset.c for a shared
47  * ownership mechanism that provides automatic cleanup for shared files when
48  * the last of a group of backends detaches.
49  *
50  * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51  * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52  * They behave like the corresponding native functions, except that the handle
53  * is registered with the current subtransaction, and will be automatically
54  * closed at abort. These are intended mainly for short operations like
55  * reading a configuration file; there is a limit on the number of files that
56  * can be opened using these functions at any one time.
57  *
58  * Finally, BasicOpenFile is just a thin wrapper around open() that can
59  * release file descriptors in use by the virtual file descriptors if
60  * necessary. There is no automatic cleanup of file descriptors returned by
61  * BasicOpenFile; it is solely the caller's responsibility to close the file
62  * descriptor by calling close(2).
63  *
64  * If a non-virtual file descriptor needs to be held open for any length of
65  * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
66  * (and eventually ReleaseExternalFD), so that we can take it into account
67  * while deciding how many VFDs can be open.  This applies to FDs obtained
68  * with BasicOpenFile as well as those obtained without use of any fd.c API.
69  *
70  *-------------------------------------------------------------------------
71  */
72 
73 #include "postgres.h"
74 
75 #include <sys/file.h>
76 #include <sys/param.h>
77 #include <sys/stat.h>
78 #ifndef WIN32
79 #include <sys/mman.h>
80 #endif
81 #include <limits.h>
82 #include <unistd.h>
83 #include <fcntl.h>
84 #ifdef HAVE_SYS_RESOURCE_H
85 #include <sys/resource.h>		/* for getrlimit */
86 #endif
87 
88 #include "access/xact.h"
89 #include "access/xlog.h"
90 #include "catalog/pg_tablespace.h"
91 #include "common/file_perm.h"
92 #include "miscadmin.h"
93 #include "pgstat.h"
94 #include "portability/mem.h"
95 #include "storage/fd.h"
96 #include "storage/ipc.h"
97 #include "utils/guc.h"
98 #include "utils/resowner_private.h"
99 
100 /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
101 #if defined(HAVE_SYNC_FILE_RANGE)
102 #define PG_FLUSH_DATA_WORKS 1
103 #elif !defined(WIN32) && defined(MS_ASYNC)
104 #define PG_FLUSH_DATA_WORKS 1
105 #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
106 #define PG_FLUSH_DATA_WORKS 1
107 #endif
108 
109 /*
110  * We must leave some file descriptors free for system(), the dynamic loader,
111  * and other code that tries to open files without consulting fd.c.  This
112  * is the number left free.  (While we try fairly hard to prevent EMFILE
113  * errors, there's never any guarantee that we won't get ENFILE due to
114  * other processes chewing up FDs.  So it's a bad idea to try to open files
115  * without consulting fd.c.  Nonetheless we cannot control all code.)
116  *
117  * Because this is just a fixed setting, we are effectively assuming that
118  * no such code will leave FDs open over the long term; otherwise the slop
119  * is likely to be insufficient.  Note in particular that we expect that
120  * loading a shared library does not result in any permanent increase in
121  * the number of open files.  (This appears to be true on most if not
122  * all platforms as of Feb 2004.)
123  */
124 #define NUM_RESERVED_FDS		10
125 
126 /*
127  * If we have fewer than this many usable FDs after allowing for the reserved
128  * ones, choke.  (This value is chosen to work with "ulimit -n 64", but not
129  * much less than that.  Note that this value ensures numExternalFDs can be
130  * at least 16; as of this writing, the contrib/postgres_fdw regression tests
131  * will not pass unless that can grow to at least 14.)
132  */
133 #define FD_MINFREE				48
134 
135 /*
136  * A number of platforms allow individual processes to open many more files
137  * than they can really support when *many* processes do the same thing.
138  * This GUC parameter lets the DBA limit max_safe_fds to something less than
139  * what the postmaster's initial probe suggests will work.
140  */
141 int			max_files_per_process = 1000;
142 
143 /*
144  * Maximum number of file descriptors to open for operations that fd.c knows
145  * about (VFDs, AllocateFile etc, or "external" FDs).  This is initialized
146  * to a conservative value, and remains that way indefinitely in bootstrap or
147  * standalone-backend cases.  In normal postmaster operation, the postmaster
148  * calls set_max_safe_fds() late in initialization to update the value, and
149  * that value is then inherited by forked subprocesses.
150  *
151  * Note: the value of max_files_per_process is taken into account while
152  * setting this variable, and so need not be tested separately.
153  */
154 int			max_safe_fds = FD_MINFREE;	/* default if not changed */
155 
156 /* Whether it is safe to continue running after fsync() fails. */
157 bool		data_sync_retry = false;
158 
159 /* Debugging.... */
160 
161 #ifdef FDDEBUG
162 #define DO_DB(A) \
163 	do { \
164 		int			_do_db_save_errno = errno; \
165 		A; \
166 		errno = _do_db_save_errno; \
167 	} while (0)
168 #else
169 #define DO_DB(A) \
170 	((void) 0)
171 #endif
172 
173 #define VFD_CLOSED (-1)
174 
175 #define FileIsValid(file) \
176 	((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
177 
178 #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
179 
180 /* these are the assigned bits in fdstate below: */
181 #define FD_DELETE_AT_CLOSE	(1 << 0)	/* T = delete when closed */
182 #define FD_CLOSE_AT_EOXACT	(1 << 1)	/* T = close at eoXact */
183 #define FD_TEMP_FILE_LIMIT	(1 << 2)	/* T = respect temp_file_limit */
184 
185 typedef struct vfd
186 {
187 	int			fd;				/* current FD, or VFD_CLOSED if none */
188 	unsigned short fdstate;		/* bitflags for VFD's state */
189 	ResourceOwner resowner;		/* owner, for automatic cleanup */
190 	File		nextFree;		/* link to next free VFD, if in freelist */
191 	File		lruMoreRecently;	/* doubly linked recency-of-use list */
192 	File		lruLessRecently;
193 	off_t		fileSize;		/* current size of file (0 if not temporary) */
194 	char	   *fileName;		/* name of file, or NULL for unused VFD */
195 	/* NB: fileName is malloc'd, and must be free'd when closing the VFD */
196 	int			fileFlags;		/* open(2) flags for (re)opening the file */
197 	mode_t		fileMode;		/* mode to pass to open(2) */
198 } Vfd;
199 
200 /*
201  * Virtual File Descriptor array pointer and size.  This grows as
202  * needed.  'File' values are indexes into this array.
203  * Note that VfdCache[0] is not a usable VFD, just a list header.
204  */
205 static Vfd *VfdCache;
206 static Size SizeVfdCache = 0;
207 
208 /*
209  * Number of file descriptors known to be in use by VFD entries.
210  */
211 static int	nfile = 0;
212 
213 /*
214  * Flag to tell whether it's worth scanning VfdCache looking for temp files
215  * to close
216  */
217 static bool have_xact_temporary_files = false;
218 
219 /*
220  * Tracks the total size of all temporary files.  Note: when temp_file_limit
221  * is being enforced, this cannot overflow since the limit cannot be more
222  * than INT_MAX kilobytes.  When not enforcing, it could theoretically
223  * overflow, but we don't care.
224  */
225 static uint64 temporary_files_size = 0;
226 
/*
 * Kinds of OS handle tracked in the allocatedDescs list: stdio streams,
 * pipe streams, directory handles and raw file descriptors (see the
 * AllocateFile, OpenPipeStream, AllocateDir and OpenTransientFile wrappers
 * described in the file header).
 */
typedef enum
{
	AllocateDescFile,			/* stdio stream for a file */
	AllocateDescPipe,			/* stdio stream for a pipe */
	AllocateDescDir,			/* directory handle */
	AllocateDescRawFD			/* raw file descriptor */
} AllocateDescKind;
238 
239 typedef struct
240 {
241 	AllocateDescKind kind;
242 	SubTransactionId create_subid;
243 	union
244 	{
245 		FILE	   *file;
246 		DIR		   *dir;
247 		int			fd;
248 	}			desc;
249 } AllocateDesc;
250 
251 static int	numAllocatedDescs = 0;
252 static int	maxAllocatedDescs = 0;
253 static AllocateDesc *allocatedDescs = NULL;
254 
255 /*
256  * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
257  */
258 static int	numExternalFDs = 0;
259 
260 /*
261  * Number of temporary files opened during the current session;
262  * this is used in generation of tempfile names.
263  */
264 static long tempFileCounter = 0;
265 
266 /*
267  * Array of OIDs of temp tablespaces.  (Some entries may be InvalidOid,
268  * indicating that the current database's default tablespace should be used.)
269  * When numTempTableSpaces is -1, this has not been set in the current
270  * transaction.
271  */
272 static Oid *tempTableSpaces = NULL;
273 static int	numTempTableSpaces = -1;
274 static int	nextTempTableSpace = 0;
275 
276 
277 /*--------------------
278  *
279  * Private Routines
280  *
281  * Delete		   - delete a file from the Lru ring
282  * LruDelete	   - remove a file from the Lru ring and close its FD
283  * Insert		   - put a file at the front of the Lru ring
284  * LruInsert	   - put a file at the front of the Lru ring and open it
285  * ReleaseLruFile  - Release an fd by closing the last entry in the Lru ring
286  * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
287  * AllocateVfd	   - grab a free (or new) file record (from VfdCache)
288  * FreeVfd		   - free a file record
289  *
290  * The Least Recently Used ring is a doubly linked list that begins and
291  * ends on element zero.  Element zero is special -- it doesn't represent
292  * a file and its "fd" field always == VFD_CLOSED.  Element zero is just an
293  * anchor that shows us the beginning/end of the ring.
294  * Only VFD elements that are currently really open (have an FD assigned) are
295  * in the Lru ring.  Elements that are "virtually" open can be recognized
296  * by having a non-null fileName field.
297  *
298  * example:
299  *
300  *	   /--less----\				   /---------\
301  *	   v		   \			  v			  \
302  *	 #0 --more---> LeastRecentlyUsed --more-\ \
303  *	  ^\									| |
304  *	   \\less--> MostRecentlyUsedFile	<---/ |
305  *		\more---/					 \--less--/
306  *
307  *--------------------
308  */
309 static void Delete(File file);
310 static void LruDelete(File file);
311 static void Insert(File file);
312 static int	LruInsert(File file);
313 static bool ReleaseLruFile(void);
314 static void ReleaseLruFiles(void);
315 static File AllocateVfd(void);
316 static void FreeVfd(File file);
317 
318 static int	FileAccess(File file);
319 static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
320 static bool reserveAllocatedDesc(void);
321 static int	FreeDesc(AllocateDesc *desc);
322 
323 static void AtProcExit_Files(int code, Datum arg);
324 static void CleanupTempFiles(bool isCommit, bool isProcExit);
325 static void RemovePgTempRelationFiles(const char *tsdirname);
326 static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
327 
328 static void walkdir(const char *path,
329 					void (*action) (const char *fname, bool isdir, int elevel),
330 					bool process_symlinks,
331 					int elevel);
332 #ifdef PG_FLUSH_DATA_WORKS
333 static void pre_sync_fname(const char *fname, bool isdir, int elevel);
334 #endif
335 static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
336 static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
337 
338 static int	fsync_parent_path(const char *fname, int elevel);
339 
340 
/*
 * pg_fsync --- do fsync with or without writethrough
 *
 * Dispatches to pg_fsync_writethrough() when sync_method asks for it (and
 * the platform distinguishes the two), otherwise to
 * pg_fsync_no_writethrough().  Returns whatever the chosen routine returns.
 */
int
pg_fsync(int fd)
{
#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
	struct stat statbuf;

	/*
	 * Sanity-check the access mode the descriptor was opened with.
	 *
	 * fsync() implementations differ in what access modes they require of
	 * the descriptor, and the requirements differ between plain files and
	 * directories.  To stay portable, any descriptor that may be handed to
	 * fsync() must have been opened in a mode acceptable everywhere: with
	 * write permission (O_RDWR or O_WRONLY) for a file, and without write
	 * permission (O_RDONLY) for a directory.
	 *
	 * The check is made here, ahead of the actual sync call, so that it is
	 * also performed when fsync() is disabled.  fstat() failures are
	 * ignored; the follow-up fsync() will deal with a bad descriptor.
	 */
	if (fstat(fd, &statbuf) == 0)
	{
		int			desc_flags = fcntl(fd, F_GETFL);

		/*
		 * O_RDONLY is historically 0, so for directories we can only verify
		 * that none of the write flags is set.
		 */
		if (S_ISDIR(statbuf.st_mode))
			Assert((desc_flags & (O_RDWR | O_WRONLY)) == 0);
		else
			Assert((desc_flags & (O_RDWR | O_WRONLY)) != 0);
	}

	/* don't let the checks above leave a stray errno behind */
	errno = 0;
#endif

	/* #if is to skip the sync_method test if there's no need for it */
#if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
	if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
		return pg_fsync_writethrough(fd);
#endif

	return pg_fsync_no_writethrough(fd);
}
393 
394 
395 /*
396  * pg_fsync_no_writethrough --- same as fsync except does nothing if
397  *	enableFsync is off
398  */
399 int
pg_fsync_no_writethrough(int fd)400 pg_fsync_no_writethrough(int fd)
401 {
402 	if (enableFsync)
403 		return fsync(fd);
404 	else
405 		return 0;
406 }
407 
408 /*
409  * pg_fsync_writethrough
410  */
411 int
pg_fsync_writethrough(int fd)412 pg_fsync_writethrough(int fd)
413 {
414 	if (enableFsync)
415 	{
416 #ifdef WIN32
417 		return _commit(fd);
418 #elif defined(F_FULLFSYNC)
419 		return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
420 #else
421 		errno = ENOSYS;
422 		return -1;
423 #endif
424 	}
425 	else
426 		return 0;
427 }
428 
429 /*
430  * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
431  *
432  * Not all platforms have fdatasync; treat as fsync if not available.
433  */
434 int
pg_fdatasync(int fd)435 pg_fdatasync(int fd)
436 {
437 	if (enableFsync)
438 	{
439 #ifdef HAVE_FDATASYNC
440 		return fdatasync(fd);
441 #else
442 		return fsync(fd);
443 #endif
444 	}
445 	else
446 		return 0;
447 }
448 
449 /*
450  * pg_flush_data --- advise OS that the described dirty data should be flushed
451  *
452  * offset of 0 with nbytes 0 means that the entire file should be flushed
453  */
454 void
pg_flush_data(int fd,off_t offset,off_t nbytes)455 pg_flush_data(int fd, off_t offset, off_t nbytes)
456 {
457 	/*
458 	 * Right now file flushing is primarily used to avoid making later
459 	 * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
460 	 * if fsyncs are disabled - that's a decision we might want to make
461 	 * configurable at some point.
462 	 */
463 	if (!enableFsync)
464 		return;
465 
466 	/*
467 	 * We compile all alternatives that are supported on the current platform,
468 	 * to find portability problems more easily.
469 	 */
470 #if defined(HAVE_SYNC_FILE_RANGE)
471 	{
472 		int			rc;
473 		static bool not_implemented_by_kernel = false;
474 
475 		if (not_implemented_by_kernel)
476 			return;
477 
478 		/*
479 		 * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
480 		 * tells the OS that writeback for the specified blocks should be
481 		 * started, but that we don't want to wait for completion.  Note that
482 		 * this call might block if too much dirty data exists in the range.
483 		 * This is the preferable method on OSs supporting it, as it works
484 		 * reliably when available (contrast to msync()) and doesn't flush out
485 		 * clean data (like FADV_DONTNEED).
486 		 */
487 		rc = sync_file_range(fd, offset, nbytes,
488 							 SYNC_FILE_RANGE_WRITE);
489 		if (rc != 0)
490 		{
491 			int			elevel;
492 
493 			/*
494 			 * For systems that don't have an implementation of
495 			 * sync_file_range() such as Windows WSL, generate only one
496 			 * warning and then suppress all further attempts by this process.
497 			 */
498 			if (errno == ENOSYS)
499 			{
500 				elevel = WARNING;
501 				not_implemented_by_kernel = true;
502 			}
503 			else
504 				elevel = data_sync_elevel(WARNING);
505 
506 			ereport(elevel,
507 					(errcode_for_file_access(),
508 					 errmsg("could not flush dirty data: %m")));
509 		}
510 
511 		return;
512 	}
513 #endif
514 #if !defined(WIN32) && defined(MS_ASYNC)
515 	{
516 		void	   *p;
517 		static int	pagesize = 0;
518 
519 		/*
520 		 * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
521 		 * writeback. On linux it only does so if MS_SYNC is specified, but
522 		 * then it does the writeback synchronously. Luckily all common linux
523 		 * systems have sync_file_range().  This is preferable over
524 		 * FADV_DONTNEED because it doesn't flush out clean data.
525 		 *
526 		 * We map the file (mmap()), tell the kernel to sync back the contents
527 		 * (msync()), and then remove the mapping again (munmap()).
528 		 */
529 
530 		/* mmap() needs actual length if we want to map whole file */
531 		if (offset == 0 && nbytes == 0)
532 		{
533 			nbytes = lseek(fd, 0, SEEK_END);
534 			if (nbytes < 0)
535 			{
536 				ereport(WARNING,
537 						(errcode_for_file_access(),
538 						 errmsg("could not determine dirty data size: %m")));
539 				return;
540 			}
541 		}
542 
543 		/*
544 		 * Some platforms reject partial-page mmap() attempts.  To deal with
545 		 * that, just truncate the request to a page boundary.  If any extra
546 		 * bytes don't get flushed, well, it's only a hint anyway.
547 		 */
548 
549 		/* fetch pagesize only once */
550 		if (pagesize == 0)
551 			pagesize = sysconf(_SC_PAGESIZE);
552 
553 		/* align length to pagesize, dropping any fractional page */
554 		if (pagesize > 0)
555 			nbytes = (nbytes / pagesize) * pagesize;
556 
557 		/* fractional-page request is a no-op */
558 		if (nbytes <= 0)
559 			return;
560 
561 		/*
562 		 * mmap could well fail, particularly on 32-bit platforms where there
563 		 * may simply not be enough address space.  If so, silently fall
564 		 * through to the next implementation.
565 		 */
566 		if (nbytes <= (off_t) SSIZE_MAX)
567 			p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
568 		else
569 			p = MAP_FAILED;
570 
571 		if (p != MAP_FAILED)
572 		{
573 			int			rc;
574 
575 			rc = msync(p, (size_t) nbytes, MS_ASYNC);
576 			if (rc != 0)
577 			{
578 				ereport(data_sync_elevel(WARNING),
579 						(errcode_for_file_access(),
580 						 errmsg("could not flush dirty data: %m")));
581 				/* NB: need to fall through to munmap()! */
582 			}
583 
584 			rc = munmap(p, (size_t) nbytes);
585 			if (rc != 0)
586 			{
587 				/* FATAL error because mapping would remain */
588 				ereport(FATAL,
589 						(errcode_for_file_access(),
590 						 errmsg("could not munmap() while flushing data: %m")));
591 			}
592 
593 			return;
594 		}
595 	}
596 #endif
597 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
598 	{
599 		int			rc;
600 
601 		/*
602 		 * Signal the kernel that the passed in range should not be cached
603 		 * anymore. This has the, desired, side effect of writing out dirty
604 		 * data, and the, undesired, side effect of likely discarding useful
605 		 * clean cached blocks.  For the latter reason this is the least
606 		 * preferable method.
607 		 */
608 
609 		rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
610 
611 		if (rc != 0)
612 		{
613 			/* don't error out, this is just a performance optimization */
614 			ereport(WARNING,
615 					(errcode_for_file_access(),
616 					 errmsg("could not flush dirty data: %m")));
617 		}
618 
619 		return;
620 	}
621 #endif
622 }
623 
624 
625 /*
626  * fsync_fname -- fsync a file or directory, handling errors properly
627  *
628  * Try to fsync a file or directory. When doing the latter, ignore errors that
629  * indicate the OS just doesn't allow/require fsyncing directories.
630  */
631 void
fsync_fname(const char * fname,bool isdir)632 fsync_fname(const char *fname, bool isdir)
633 {
634 	fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
635 }
636 
637 /*
638  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
639  *
640  * This routine ensures that, after returning, the effect of renaming file
641  * persists in case of a crash. A crash while this routine is running will
642  * leave you with either the pre-existing or the moved file in place of the
643  * new file; no mixed state or truncated files are possible.
644  *
645  * It does so by using fsync on the old filename and the possibly existing
646  * target filename before the rename, and the target file and directory after.
647  *
648  * Note that rename() cannot be used across arbitrary directories, as they
649  * might not be on the same filesystem. Therefore this routine does not
650  * support renaming across directories.
651  *
652  * Log errors with the caller specified severity.
653  *
654  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
655  * valid upon return.
656  */
657 int
durable_rename(const char * oldfile,const char * newfile,int elevel)658 durable_rename(const char *oldfile, const char *newfile, int elevel)
659 {
660 	int			fd;
661 
662 	/*
663 	 * First fsync the old and target path (if it exists), to ensure that they
664 	 * are properly persistent on disk. Syncing the target file is not
665 	 * strictly necessary, but it makes it easier to reason about crashes;
666 	 * because it's then guaranteed that either source or target file exists
667 	 * after a crash.
668 	 */
669 	if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
670 		return -1;
671 
672 	fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
673 	if (fd < 0)
674 	{
675 		if (errno != ENOENT)
676 		{
677 			ereport(elevel,
678 					(errcode_for_file_access(),
679 					 errmsg("could not open file \"%s\": %m", newfile)));
680 			return -1;
681 		}
682 	}
683 	else
684 	{
685 		if (pg_fsync(fd) != 0)
686 		{
687 			int			save_errno;
688 
689 			/* close file upon error, might not be in transaction context */
690 			save_errno = errno;
691 			CloseTransientFile(fd);
692 			errno = save_errno;
693 
694 			ereport(elevel,
695 					(errcode_for_file_access(),
696 					 errmsg("could not fsync file \"%s\": %m", newfile)));
697 			return -1;
698 		}
699 
700 		if (CloseTransientFile(fd) != 0)
701 		{
702 			ereport(elevel,
703 					(errcode_for_file_access(),
704 					 errmsg("could not close file \"%s\": %m", newfile)));
705 			return -1;
706 		}
707 	}
708 
709 	/* Time to do the real deal... */
710 	if (rename(oldfile, newfile) < 0)
711 	{
712 		ereport(elevel,
713 				(errcode_for_file_access(),
714 				 errmsg("could not rename file \"%s\" to \"%s\": %m",
715 						oldfile, newfile)));
716 		return -1;
717 	}
718 
719 	/*
720 	 * To guarantee renaming the file is persistent, fsync the file with its
721 	 * new name, and its containing directory.
722 	 */
723 	if (fsync_fname_ext(newfile, false, false, elevel) != 0)
724 		return -1;
725 
726 	if (fsync_parent_path(newfile, elevel) != 0)
727 		return -1;
728 
729 	return 0;
730 }
731 
/*
 * durable_unlink -- remove a file in a durable manner
 *
 * After this routine returns successfully, the removal survives a crash.  A
 * crash while it is running leaves the system in no mixed state.
 *
 * This is achieved by fsyncing the file's parent directory once the file
 * has actually been removed.
 *
 * Errors are logged with the severity specified by the caller.
 *
 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
 * valid upon return.
 */
int
durable_unlink(const char *fname, int elevel)
{
	if (unlink(fname) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove file \"%s\": %m",
						fname)));
		return -1;
	}

	/* The removal only becomes persistent once the directory is synced */
	return (fsync_parent_path(fname, elevel) != 0) ? -1 : 0;
}
768 
/*
 * durable_rename_excl -- rename a file in a durable manner.
 *
 * Like durable_rename(), except that this routine tries (but does not
 * guarantee) not to overwrite the target file.
 *
 * A crash at an unfortunate moment can leave two links to the target file.
 *
 * Errors are logged with the caller-specified severity.
 *
 * On Windows, using a hard link followed by unlink() causes concurrency
 * issues, while a simple rename() does not cause that, so be careful when
 * changing the logic of this routine.
 *
 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
 * valid upon return.
 */
int
durable_rename_excl(const char *oldfile, const char *newfile, int elevel)
{
	/*
	 * Sync the source first, so that a crash directly after the rename/link
	 * still leaves a file with valid contents in place.
	 */
	if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
		return -1;

#ifdef HAVE_WORKING_LINK
	/* link() fails if newfile already exists, so nothing gets overwritten */
	if (link(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not link file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}

	/* the result of removing the old name is deliberately not checked */
	(void) unlink(oldfile);
#else
	if (rename(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}
#endif

	/*
	 * To survive an OS crash, both the new entry and its parent directory
	 * need to be flushed (in that order; the second is skipped if the first
	 * fails).
	 */
	if (fsync_fname_ext(newfile, false, false, elevel) != 0 ||
		fsync_parent_path(newfile, elevel) != 0)
		return -1;

	return 0;
}
831 
832 /*
833  * InitFileAccess --- initialize this module during backend startup
834  *
835  * This is called during either normal or standalone backend start.
836  * It is *not* called in the postmaster.
837  */
838 void
InitFileAccess(void)839 InitFileAccess(void)
840 {
841 	Assert(SizeVfdCache == 0);	/* call me only once */
842 
843 	/* initialize cache header entry */
844 	VfdCache = (Vfd *) malloc(sizeof(Vfd));
845 	if (VfdCache == NULL)
846 		ereport(FATAL,
847 				(errcode(ERRCODE_OUT_OF_MEMORY),
848 				 errmsg("out of memory")));
849 
850 	MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
851 	VfdCache->fd = VFD_CLOSED;
852 
853 	SizeVfdCache = 1;
854 
855 	/* register proc-exit hook to ensure temp files are dropped at exit */
856 	on_proc_exit(AtProcExit_Files, 0);
857 }
858 
859 /*
860  * count_usable_fds --- count how many FDs the system will let us open,
861  *		and estimate how many are already open.
862  *
863  * We stop counting if usable_fds reaches max_to_probe.  Note: a small
864  * value of max_to_probe might result in an underestimate of already_open;
865  * we must fill in any "gaps" in the set of used FDs before the calculation
866  * of already_open will give the right answer.  In practice, max_to_probe
867  * of a couple of dozen should be enough to ensure good results.
868  *
869  * We assume stdin (FD 0) is available for dup'ing
870  */
871 static void
count_usable_fds(int max_to_probe,int * usable_fds,int * already_open)872 count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
873 {
874 	int		   *fd;
875 	int			size;
876 	int			used = 0;
877 	int			highestfd = 0;
878 	int			j;
879 
880 #ifdef HAVE_GETRLIMIT
881 	struct rlimit rlim;
882 	int			getrlimit_status;
883 #endif
884 
885 	size = 1024;
886 	fd = (int *) palloc(size * sizeof(int));
887 
888 #ifdef HAVE_GETRLIMIT
889 #ifdef RLIMIT_NOFILE			/* most platforms use RLIMIT_NOFILE */
890 	getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
891 #else							/* but BSD doesn't ... */
892 	getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
893 #endif							/* RLIMIT_NOFILE */
894 	if (getrlimit_status != 0)
895 		ereport(WARNING, (errmsg("getrlimit failed: %m")));
896 #endif							/* HAVE_GETRLIMIT */
897 
898 	/* dup until failure or probe limit reached */
899 	for (;;)
900 	{
901 		int			thisfd;
902 
903 #ifdef HAVE_GETRLIMIT
904 
905 		/*
906 		 * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
907 		 * some platforms
908 		 */
909 		if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
910 			break;
911 #endif
912 
913 		thisfd = dup(0);
914 		if (thisfd < 0)
915 		{
916 			/* Expect EMFILE or ENFILE, else it's fishy */
917 			if (errno != EMFILE && errno != ENFILE)
918 				elog(WARNING, "dup(0) failed after %d successes: %m", used);
919 			break;
920 		}
921 
922 		if (used >= size)
923 		{
924 			size *= 2;
925 			fd = (int *) repalloc(fd, size * sizeof(int));
926 		}
927 		fd[used++] = thisfd;
928 
929 		if (highestfd < thisfd)
930 			highestfd = thisfd;
931 
932 		if (used >= max_to_probe)
933 			break;
934 	}
935 
936 	/* release the files we opened */
937 	for (j = 0; j < used; j++)
938 		close(fd[j]);
939 
940 	pfree(fd);
941 
942 	/*
943 	 * Return results.  usable_fds is just the number of successful dups. We
944 	 * assume that the system limit is highestfd+1 (remember 0 is a legal FD
945 	 * number) and so already_open is highestfd+1 - usable_fds.
946 	 */
947 	*usable_fds = used;
948 	*already_open = highestfd + 1 - used;
949 }
950 
951 /*
952  * set_max_safe_fds
953  *		Determine number of file descriptors that fd.c is allowed to use
954  */
955 void
set_max_safe_fds(void)956 set_max_safe_fds(void)
957 {
958 	int			usable_fds;
959 	int			already_open;
960 
961 	/*----------
962 	 * We want to set max_safe_fds to
963 	 *			MIN(usable_fds, max_files_per_process - already_open)
964 	 * less the slop factor for files that are opened without consulting
965 	 * fd.c.  This ensures that we won't exceed either max_files_per_process
966 	 * or the experimentally-determined EMFILE limit.
967 	 *----------
968 	 */
969 	count_usable_fds(max_files_per_process,
970 					 &usable_fds, &already_open);
971 
972 	max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
973 
974 	/*
975 	 * Take off the FDs reserved for system() etc.
976 	 */
977 	max_safe_fds -= NUM_RESERVED_FDS;
978 
979 	/*
980 	 * Make sure we still have enough to get by.
981 	 */
982 	if (max_safe_fds < FD_MINFREE)
983 		ereport(FATAL,
984 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
985 				 errmsg("insufficient file descriptors available to start server process"),
986 				 errdetail("System allows %d, we need at least %d.",
987 						   max_safe_fds + NUM_RESERVED_FDS,
988 						   FD_MINFREE + NUM_RESERVED_FDS)));
989 
990 	elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
991 		 max_safe_fds, usable_fds, already_open);
992 }
993 
994 /*
995  * Open a file with BasicOpenFilePerm() and pass default file mode for the
996  * fileMode parameter.
997  */
998 int
BasicOpenFile(const char * fileName,int fileFlags)999 BasicOpenFile(const char *fileName, int fileFlags)
1000 {
1001 	return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1002 }
1003 
1004 /*
1005  * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
1006  *
1007  * This is exported for use by places that really want a plain kernel FD,
1008  * but need to be proof against running out of FDs.  Once an FD has been
1009  * successfully returned, it is the caller's responsibility to ensure that
1010  * it will not be leaked on ereport()!	Most users should *not* call this
1011  * routine directly, but instead use the VFD abstraction level, which
1012  * provides protection against descriptor leaks as well as management of
1013  * files that need to be open for more than a short period of time.
1014  *
1015  * Ideally this should be the *only* direct call of open() in the backend.
1016  * In practice, the postmaster calls open() directly, and there are some
1017  * direct open() calls done early in backend startup.  Those are OK since
1018  * this module wouldn't have any open files to close at that point anyway.
1019  */
1020 int
BasicOpenFilePerm(const char * fileName,int fileFlags,mode_t fileMode)1021 BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1022 {
1023 	int			fd;
1024 
1025 tryAgain:
1026 	fd = open(fileName, fileFlags, fileMode);
1027 
1028 	if (fd >= 0)
1029 		return fd;				/* success! */
1030 
1031 	if (errno == EMFILE || errno == ENFILE)
1032 	{
1033 		int			save_errno = errno;
1034 
1035 		ereport(LOG,
1036 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1037 				 errmsg("out of file descriptors: %m; release and retry")));
1038 		errno = 0;
1039 		if (ReleaseLruFile())
1040 			goto tryAgain;
1041 		errno = save_errno;
1042 	}
1043 
1044 	return -1;					/* failure */
1045 }
1046 
1047 /*
1048  * AcquireExternalFD - attempt to reserve an external file descriptor
1049  *
1050  * This should be used by callers that need to hold a file descriptor open
1051  * over more than a short interval, but cannot use any of the other facilities
1052  * provided by this module.
1053  *
1054  * The difference between this and the underlying ReserveExternalFD function
1055  * is that this will report failure (by setting errno and returning false)
1056  * if "too many" external FDs are already reserved.  This should be used in
1057  * any code where the total number of FDs to be reserved is not predictable
1058  * and small.
1059  */
1060 bool
AcquireExternalFD(void)1061 AcquireExternalFD(void)
1062 {
1063 	/*
1064 	 * We don't want more than max_safe_fds / 3 FDs to be consumed for
1065 	 * "external" FDs.
1066 	 */
1067 	if (numExternalFDs < max_safe_fds / 3)
1068 	{
1069 		ReserveExternalFD();
1070 		return true;
1071 	}
1072 	errno = EMFILE;
1073 	return false;
1074 }
1075 
1076 /*
1077  * ReserveExternalFD - report external consumption of a file descriptor
1078  *
1079  * This should be used by callers that need to hold a file descriptor open
1080  * over more than a short interval, but cannot use any of the other facilities
1081  * provided by this module.  This just tracks the use of the FD and closes
1082  * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
1083  *
1084  * Call this directly only in code where failure to reserve the FD would be
1085  * fatal; for example, the WAL-writing code does so, since the alternative is
1086  * session failure.  Also, it's very unwise to do so in code that could
1087  * consume more than one FD per process.
1088  *
1089  * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
1090  * available, it doesn't matter too much whether this is called before or
1091  * after actually opening the FD; but doing so beforehand reduces the risk of
1092  * an EMFILE failure if not everybody played nice.  In any case, it's solely
1093  * caller's responsibility to keep the external-FD count in sync with reality.
1094  */
1095 void
ReserveExternalFD(void)1096 ReserveExternalFD(void)
1097 {
1098 	/*
1099 	 * Release VFDs if needed to stay safe.  Because we do this before
1100 	 * incrementing numExternalFDs, the final state will be as desired, i.e.,
1101 	 * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
1102 	 */
1103 	ReleaseLruFiles();
1104 
1105 	numExternalFDs++;
1106 }
1107 
1108 /*
1109  * ReleaseExternalFD - report release of an external file descriptor
1110  *
1111  * This is guaranteed not to change errno, so it can be used in failure paths.
1112  */
1113 void
ReleaseExternalFD(void)1114 ReleaseExternalFD(void)
1115 {
1116 	Assert(numExternalFDs > 0);
1117 	numExternalFDs--;
1118 }
1119 
1120 
#if defined(FDDEBUG)

/*
 * _dump_lru --- debugging aid that logs the contents of the LRU ring
 *
 * Starting from the entry that VfdCache[0].lruLessRecently points to, follow
 * the lruLessRecently links until index 0 is reached again, and emit the
 * visited indexes as a single LOG line.  Output is truncated if it does not
 * fit the local buffer.
 */
static void
_dump_lru(void)
{
	char		line[2048];
	size_t		len;
	int			cur = VfdCache[0].lruLessRecently;

	snprintf(line, sizeof(line), "LRU: MOST %d ", cur);

	while (cur != 0)
	{
		cur = VfdCache[cur].lruLessRecently;

		/* append at the current end of the string */
		len = strlen(line);
		snprintf(line + len, sizeof(line) - len, "%d ", cur);
	}

	len = strlen(line);
	snprintf(line + len, sizeof(line) - len, "LEAST");

	elog(LOG, "%s", line);
}
#endif							/* FDDEBUG */
1141 
1142 static void
Delete(File file)1143 Delete(File file)
1144 {
1145 	Vfd		   *vfdP;
1146 
1147 	Assert(file != 0);
1148 
1149 	DO_DB(elog(LOG, "Delete %d (%s)",
1150 			   file, VfdCache[file].fileName));
1151 	DO_DB(_dump_lru());
1152 
1153 	vfdP = &VfdCache[file];
1154 
1155 	VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1156 	VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1157 
1158 	DO_DB(_dump_lru());
1159 }
1160 
1161 static void
LruDelete(File file)1162 LruDelete(File file)
1163 {
1164 	Vfd		   *vfdP;
1165 
1166 	Assert(file != 0);
1167 
1168 	DO_DB(elog(LOG, "LruDelete %d (%s)",
1169 			   file, VfdCache[file].fileName));
1170 
1171 	vfdP = &VfdCache[file];
1172 
1173 	/*
1174 	 * Close the file.  We aren't expecting this to fail; if it does, better
1175 	 * to leak the FD than to mess up our internal state.
1176 	 */
1177 	if (close(vfdP->fd) != 0)
1178 		elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1179 			 "could not close file \"%s\": %m", vfdP->fileName);
1180 	vfdP->fd = VFD_CLOSED;
1181 	--nfile;
1182 
1183 	/* delete the vfd record from the LRU ring */
1184 	Delete(file);
1185 }
1186 
1187 static void
Insert(File file)1188 Insert(File file)
1189 {
1190 	Vfd		   *vfdP;
1191 
1192 	Assert(file != 0);
1193 
1194 	DO_DB(elog(LOG, "Insert %d (%s)",
1195 			   file, VfdCache[file].fileName));
1196 	DO_DB(_dump_lru());
1197 
1198 	vfdP = &VfdCache[file];
1199 
1200 	vfdP->lruMoreRecently = 0;
1201 	vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1202 	VfdCache[0].lruLessRecently = file;
1203 	VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1204 
1205 	DO_DB(_dump_lru());
1206 }
1207 
1208 /* returns 0 on success, -1 on re-open failure (with errno set) */
1209 static int
LruInsert(File file)1210 LruInsert(File file)
1211 {
1212 	Vfd		   *vfdP;
1213 
1214 	Assert(file != 0);
1215 
1216 	DO_DB(elog(LOG, "LruInsert %d (%s)",
1217 			   file, VfdCache[file].fileName));
1218 
1219 	vfdP = &VfdCache[file];
1220 
1221 	if (FileIsNotOpen(file))
1222 	{
1223 		/* Close excess kernel FDs. */
1224 		ReleaseLruFiles();
1225 
1226 		/*
1227 		 * The open could still fail for lack of file descriptors, eg due to
1228 		 * overall system file table being full.  So, be prepared to release
1229 		 * another FD if necessary...
1230 		 */
1231 		vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1232 									 vfdP->fileMode);
1233 		if (vfdP->fd < 0)
1234 		{
1235 			DO_DB(elog(LOG, "re-open failed: %m"));
1236 			return -1;
1237 		}
1238 		else
1239 		{
1240 			++nfile;
1241 		}
1242 	}
1243 
1244 	/*
1245 	 * put it at the head of the Lru ring
1246 	 */
1247 
1248 	Insert(file);
1249 
1250 	return 0;
1251 }
1252 
1253 /*
1254  * Release one kernel FD by closing the least-recently-used VFD.
1255  */
1256 static bool
ReleaseLruFile(void)1257 ReleaseLruFile(void)
1258 {
1259 	DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1260 
1261 	if (nfile > 0)
1262 	{
1263 		/*
1264 		 * There are opened files and so there should be at least one used vfd
1265 		 * in the ring.
1266 		 */
1267 		Assert(VfdCache[0].lruMoreRecently != 0);
1268 		LruDelete(VfdCache[0].lruMoreRecently);
1269 		return true;			/* freed a file */
1270 	}
1271 	return false;				/* no files available to free */
1272 }
1273 
1274 /*
1275  * Release kernel FDs as needed to get under the max_safe_fds limit.
1276  * After calling this, it's OK to try to open another file.
1277  */
1278 static void
ReleaseLruFiles(void)1279 ReleaseLruFiles(void)
1280 {
1281 	while (nfile + numAllocatedDescs + numExternalFDs >= max_safe_fds)
1282 	{
1283 		if (!ReleaseLruFile())
1284 			break;
1285 	}
1286 }
1287 
1288 static File
AllocateVfd(void)1289 AllocateVfd(void)
1290 {
1291 	Index		i;
1292 	File		file;
1293 
1294 	DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1295 
1296 	Assert(SizeVfdCache > 0);	/* InitFileAccess not called? */
1297 
1298 	if (VfdCache[0].nextFree == 0)
1299 	{
1300 		/*
1301 		 * The free list is empty so it is time to increase the size of the
1302 		 * array.  We choose to double it each time this happens. However,
1303 		 * there's not much point in starting *real* small.
1304 		 */
1305 		Size		newCacheSize = SizeVfdCache * 2;
1306 		Vfd		   *newVfdCache;
1307 
1308 		if (newCacheSize < 32)
1309 			newCacheSize = 32;
1310 
1311 		/*
1312 		 * Be careful not to clobber VfdCache ptr if realloc fails.
1313 		 */
1314 		newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1315 		if (newVfdCache == NULL)
1316 			ereport(ERROR,
1317 					(errcode(ERRCODE_OUT_OF_MEMORY),
1318 					 errmsg("out of memory")));
1319 		VfdCache = newVfdCache;
1320 
1321 		/*
1322 		 * Initialize the new entries and link them into the free list.
1323 		 */
1324 		for (i = SizeVfdCache; i < newCacheSize; i++)
1325 		{
1326 			MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1327 			VfdCache[i].nextFree = i + 1;
1328 			VfdCache[i].fd = VFD_CLOSED;
1329 		}
1330 		VfdCache[newCacheSize - 1].nextFree = 0;
1331 		VfdCache[0].nextFree = SizeVfdCache;
1332 
1333 		/*
1334 		 * Record the new size
1335 		 */
1336 		SizeVfdCache = newCacheSize;
1337 	}
1338 
1339 	file = VfdCache[0].nextFree;
1340 
1341 	VfdCache[0].nextFree = VfdCache[file].nextFree;
1342 
1343 	return file;
1344 }
1345 
1346 static void
FreeVfd(File file)1347 FreeVfd(File file)
1348 {
1349 	Vfd		   *vfdP = &VfdCache[file];
1350 
1351 	DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1352 			   file, vfdP->fileName ? vfdP->fileName : ""));
1353 
1354 	if (vfdP->fileName != NULL)
1355 	{
1356 		free(vfdP->fileName);
1357 		vfdP->fileName = NULL;
1358 	}
1359 	vfdP->fdstate = 0x0;
1360 
1361 	vfdP->nextFree = VfdCache[0].nextFree;
1362 	VfdCache[0].nextFree = file;
1363 }
1364 
1365 /* returns 0 on success, -1 on re-open failure (with errno set) */
1366 static int
FileAccess(File file)1367 FileAccess(File file)
1368 {
1369 	int			returnValue;
1370 
1371 	DO_DB(elog(LOG, "FileAccess %d (%s)",
1372 			   file, VfdCache[file].fileName));
1373 
1374 	/*
1375 	 * Is the file open?  If not, open it and put it at the head of the LRU
1376 	 * ring (possibly closing the least recently used file to get an FD).
1377 	 */
1378 
1379 	if (FileIsNotOpen(file))
1380 	{
1381 		returnValue = LruInsert(file);
1382 		if (returnValue != 0)
1383 			return returnValue;
1384 	}
1385 	else if (VfdCache[0].lruLessRecently != file)
1386 	{
1387 		/*
1388 		 * We now know that the file is open and that it is not the last one
1389 		 * accessed, so we need to move it to the head of the Lru ring.
1390 		 */
1391 
1392 		Delete(file);
1393 		Insert(file);
1394 	}
1395 
1396 	return 0;
1397 }
1398 
1399 /*
1400  * Called whenever a temporary file is deleted to report its size.
1401  */
1402 static void
ReportTemporaryFileUsage(const char * path,off_t size)1403 ReportTemporaryFileUsage(const char *path, off_t size)
1404 {
1405 	pgstat_report_tempfile(size);
1406 
1407 	if (log_temp_files >= 0)
1408 	{
1409 		if ((size / 1024) >= log_temp_files)
1410 			ereport(LOG,
1411 					(errmsg("temporary file: path \"%s\", size %lu",
1412 							path, (unsigned long) size)));
1413 	}
1414 }
1415 
1416 /*
1417  * Called to register a temporary file for automatic close.
1418  * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
1419  * before the file was opened.
1420  */
1421 static void
RegisterTemporaryFile(File file)1422 RegisterTemporaryFile(File file)
1423 {
1424 	ResourceOwnerRememberFile(CurrentResourceOwner, file);
1425 	VfdCache[file].resowner = CurrentResourceOwner;
1426 
1427 	/* Backup mechanism for closing at end of xact. */
1428 	VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
1429 	have_xact_temporary_files = true;
1430 }
1431 
/*
 * FileInvalidate --- called when a shared invalidation message arrives for
 * some relation
 *
 * If the file currently holds a kernel FD, close it via LruDelete().
 * Compiled only when NOT_USED is defined.
 */
#ifdef NOT_USED
void
FileInvalidate(File file)
{
	Assert(FileIsValid(file));

	if (FileIsNotOpen(file))
		return;

	LruDelete(file);
}
#endif
1444 
1445 /*
1446  * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1447  * fileMode parameter.
1448  */
1449 File
PathNameOpenFile(const char * fileName,int fileFlags)1450 PathNameOpenFile(const char *fileName, int fileFlags)
1451 {
1452 	return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1453 }
1454 
1455 /*
1456  * open a file in an arbitrary directory
1457  *
1458  * NB: if the passed pathname is relative (which it usually is),
1459  * it will be interpreted relative to the process' working directory
1460  * (which should always be $PGDATA when this code is running).
1461  */
1462 File
PathNameOpenFilePerm(const char * fileName,int fileFlags,mode_t fileMode)1463 PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1464 {
1465 	char	   *fnamecopy;
1466 	File		file;
1467 	Vfd		   *vfdP;
1468 
1469 	DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1470 			   fileName, fileFlags, fileMode));
1471 
1472 	/*
1473 	 * We need a malloc'd copy of the file name; fail cleanly if no room.
1474 	 */
1475 	fnamecopy = strdup(fileName);
1476 	if (fnamecopy == NULL)
1477 		ereport(ERROR,
1478 				(errcode(ERRCODE_OUT_OF_MEMORY),
1479 				 errmsg("out of memory")));
1480 
1481 	file = AllocateVfd();
1482 	vfdP = &VfdCache[file];
1483 
1484 	/* Close excess kernel FDs. */
1485 	ReleaseLruFiles();
1486 
1487 	vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1488 
1489 	if (vfdP->fd < 0)
1490 	{
1491 		int			save_errno = errno;
1492 
1493 		FreeVfd(file);
1494 		free(fnamecopy);
1495 		errno = save_errno;
1496 		return -1;
1497 	}
1498 	++nfile;
1499 	DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1500 			   vfdP->fd));
1501 
1502 	vfdP->fileName = fnamecopy;
1503 	/* Saved flags are adjusted to be OK for re-opening file */
1504 	vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1505 	vfdP->fileMode = fileMode;
1506 	vfdP->fileSize = 0;
1507 	vfdP->fdstate = 0x0;
1508 	vfdP->resowner = NULL;
1509 
1510 	Insert(file);
1511 
1512 	return file;
1513 }
1514 
1515 /*
1516  * Create directory 'directory'.  If necessary, create 'basedir', which must
1517  * be the directory above it.  This is designed for creating the top-level
1518  * temporary directory on demand before creating a directory underneath it.
1519  * Do nothing if the directory already exists.
1520  *
1521  * Directories created within the top-level temporary directory should begin
1522  * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1523  * deleted at startup by RemovePgTempFiles().  Further subdirectories below
1524  * that do not need any particular prefix.
1525 */
1526 void
PathNameCreateTemporaryDir(const char * basedir,const char * directory)1527 PathNameCreateTemporaryDir(const char *basedir, const char *directory)
1528 {
1529 	if (MakePGDirectory(directory) < 0)
1530 	{
1531 		if (errno == EEXIST)
1532 			return;
1533 
1534 		/*
1535 		 * Failed.  Try to create basedir first in case it's missing. Tolerate
1536 		 * EEXIST to close a race against another process following the same
1537 		 * algorithm.
1538 		 */
1539 		if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1540 			ereport(ERROR,
1541 					(errcode_for_file_access(),
1542 					 errmsg("cannot create temporary directory \"%s\": %m",
1543 							basedir)));
1544 
1545 		/* Try again. */
1546 		if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1547 			ereport(ERROR,
1548 					(errcode_for_file_access(),
1549 					 errmsg("cannot create temporary subdirectory \"%s\": %m",
1550 							directory)));
1551 	}
1552 }
1553 
1554 /*
1555  * Delete a directory and everything in it, if it exists.
1556  */
1557 void
PathNameDeleteTemporaryDir(const char * dirname)1558 PathNameDeleteTemporaryDir(const char *dirname)
1559 {
1560 	struct stat statbuf;
1561 
1562 	/* Silently ignore missing directory. */
1563 	if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1564 		return;
1565 
1566 	/*
1567 	 * Currently, walkdir doesn't offer a way for our passed in function to
1568 	 * maintain state.  Perhaps it should, so that we could tell the caller
1569 	 * whether this operation succeeded or failed.  Since this operation is
1570 	 * used in a cleanup path, we wouldn't actually behave differently: we'll
1571 	 * just log failures.
1572 	 */
1573 	walkdir(dirname, unlink_if_exists_fname, false, LOG);
1574 }
1575 
1576 /*
1577  * Open a temporary file that will disappear when we close it.
1578  *
1579  * This routine takes care of generating an appropriate tempfile name.
1580  * There's no need to pass in fileFlags or fileMode either, since only
1581  * one setting makes any sense for a temp file.
1582  *
1583  * Unless interXact is true, the file is remembered by CurrentResourceOwner
1584  * to ensure it's closed and deleted when it's no longer needed, typically at
1585  * the end-of-transaction. In most cases, you don't want temporary files to
1586  * outlive the transaction that created them, so this should be false -- but
1587  * if you need "somewhat" temporary storage, this might be useful. In either
1588  * case, the file is removed when the File is explicitly closed.
1589  */
1590 File
OpenTemporaryFile(bool interXact)1591 OpenTemporaryFile(bool interXact)
1592 {
1593 	File		file = 0;
1594 
1595 	/*
1596 	 * Make sure the current resource owner has space for this File before we
1597 	 * open it, if we'll be registering it below.
1598 	 */
1599 	if (!interXact)
1600 		ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1601 
1602 	/*
1603 	 * If some temp tablespace(s) have been given to us, try to use the next
1604 	 * one.  If a given tablespace can't be found, we silently fall back to
1605 	 * the database's default tablespace.
1606 	 *
1607 	 * BUT: if the temp file is slated to outlive the current transaction,
1608 	 * force it into the database's default tablespace, so that it will not
1609 	 * pose a threat to possible tablespace drop attempts.
1610 	 */
1611 	if (numTempTableSpaces > 0 && !interXact)
1612 	{
1613 		Oid			tblspcOid = GetNextTempTableSpace();
1614 
1615 		if (OidIsValid(tblspcOid))
1616 			file = OpenTemporaryFileInTablespace(tblspcOid, false);
1617 	}
1618 
1619 	/*
1620 	 * If not, or if tablespace is bad, create in database's default
1621 	 * tablespace.  MyDatabaseTableSpace should normally be set before we get
1622 	 * here, but just in case it isn't, fall back to pg_default tablespace.
1623 	 */
1624 	if (file <= 0)
1625 		file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
1626 											 MyDatabaseTableSpace :
1627 											 DEFAULTTABLESPACE_OID,
1628 											 true);
1629 
1630 	/* Mark it for deletion at close and temporary file size limit */
1631 	VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
1632 
1633 	/* Register it with the current resource owner */
1634 	if (!interXact)
1635 		RegisterTemporaryFile(file);
1636 
1637 	return file;
1638 }
1639 
1640 /*
1641  * Return the path of the temp directory in a given tablespace.
1642  */
1643 void
TempTablespacePath(char * path,Oid tablespace)1644 TempTablespacePath(char *path, Oid tablespace)
1645 {
1646 	/*
1647 	 * Identify the tempfile directory for this tablespace.
1648 	 *
1649 	 * If someone tries to specify pg_global, use pg_default instead.
1650 	 */
1651 	if (tablespace == InvalidOid ||
1652 		tablespace == DEFAULTTABLESPACE_OID ||
1653 		tablespace == GLOBALTABLESPACE_OID)
1654 		snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1655 	else
1656 	{
1657 		/* All other tablespaces are accessed via symlinks */
1658 		snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
1659 				 tablespace, TABLESPACE_VERSION_DIRECTORY,
1660 				 PG_TEMP_FILES_DIR);
1661 	}
1662 }
1663 
1664 /*
1665  * Open a temporary file in a specific tablespace.
1666  * Subroutine for OpenTemporaryFile, which see for details.
1667  */
1668 static File
OpenTemporaryFileInTablespace(Oid tblspcOid,bool rejectError)1669 OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1670 {
1671 	char		tempdirpath[MAXPGPATH];
1672 	char		tempfilepath[MAXPGPATH];
1673 	File		file;
1674 
1675 	TempTablespacePath(tempdirpath, tblspcOid);
1676 
1677 	/*
1678 	 * Generate a tempfile name that should be unique within the current
1679 	 * database instance.
1680 	 */
1681 	snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1682 			 tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1683 
1684 	/*
1685 	 * Open the file.  Note: we don't use O_EXCL, in case there is an orphaned
1686 	 * temp file that can be reused.
1687 	 */
1688 	file = PathNameOpenFile(tempfilepath,
1689 							O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1690 	if (file <= 0)
1691 	{
1692 		/*
1693 		 * We might need to create the tablespace's tempfile directory, if no
1694 		 * one has yet done so.
1695 		 *
1696 		 * Don't check for an error from MakePGDirectory; it could fail if
1697 		 * someone else just did the same thing.  If it doesn't work then
1698 		 * we'll bomb out on the second create attempt, instead.
1699 		 */
1700 		(void) MakePGDirectory(tempdirpath);
1701 
1702 		file = PathNameOpenFile(tempfilepath,
1703 								O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1704 		if (file <= 0 && rejectError)
1705 			elog(ERROR, "could not create temporary file \"%s\": %m",
1706 				 tempfilepath);
1707 	}
1708 
1709 	return file;
1710 }
1711 
1712 
1713 /*
1714  * Create a new file.  The directory containing it must already exist.  Files
1715  * created this way are subject to temp_file_limit and are automatically
1716  * closed at end of transaction, but are not automatically deleted on close
1717  * because they are intended to be shared between cooperating backends.
1718  *
1719  * If the file is inside the top-level temporary directory, its name should
1720  * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1721  * and deleted at startup by RemovePgTempFiles().  Alternatively, it can be
1722  * inside a directory created with PathNameCreateTemporaryDir(), in which case
1723  * the prefix isn't needed.
1724  */
1725 File
PathNameCreateTemporaryFile(const char * path,bool error_on_failure)1726 PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1727 {
1728 	File		file;
1729 
1730 	ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1731 
1732 	/*
1733 	 * Open the file.  Note: we don't use O_EXCL, in case there is an orphaned
1734 	 * temp file that can be reused.
1735 	 */
1736 	file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1737 	if (file <= 0)
1738 	{
1739 		if (error_on_failure)
1740 			ereport(ERROR,
1741 					(errcode_for_file_access(),
1742 					 errmsg("could not create temporary file \"%s\": %m",
1743 							path)));
1744 		else
1745 			return file;
1746 	}
1747 
1748 	/* Mark it for temp_file_limit accounting. */
1749 	VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
1750 
1751 	/* Register it for automatic close. */
1752 	RegisterTemporaryFile(file);
1753 
1754 	return file;
1755 }
1756 
1757 /*
1758  * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1759  * another backend.  Files opened this way don't count against the
1760  * temp_file_limit of the caller, are read-only and are automatically closed
1761  * at the end of the transaction but are not deleted on close.
1762  */
1763 File
PathNameOpenTemporaryFile(const char * path)1764 PathNameOpenTemporaryFile(const char *path)
1765 {
1766 	File		file;
1767 
1768 	ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1769 
1770 	/* We open the file read-only. */
1771 	file = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
1772 
1773 	/* If no such file, then we don't raise an error. */
1774 	if (file <= 0 && errno != ENOENT)
1775 		ereport(ERROR,
1776 				(errcode_for_file_access(),
1777 				 errmsg("could not open temporary file \"%s\": %m",
1778 						path)));
1779 
1780 	if (file > 0)
1781 	{
1782 		/* Register it for automatic close. */
1783 		RegisterTemporaryFile(file);
1784 	}
1785 
1786 	return file;
1787 }
1788 
1789 /*
1790  * Delete a file by pathname.  Return true if the file existed, false if
1791  * didn't.
1792  */
1793 bool
PathNameDeleteTemporaryFile(const char * path,bool error_on_failure)1794 PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1795 {
1796 	struct stat filestats;
1797 	int			stat_errno;
1798 
1799 	/* Get the final size for pgstat reporting. */
1800 	if (stat(path, &filestats) != 0)
1801 		stat_errno = errno;
1802 	else
1803 		stat_errno = 0;
1804 
1805 	/*
1806 	 * Unlike FileClose's automatic file deletion code, we tolerate
1807 	 * non-existence to support BufFileDeleteShared which doesn't know how
1808 	 * many segments it has to delete until it runs out.
1809 	 */
1810 	if (stat_errno == ENOENT)
1811 		return false;
1812 
1813 	if (unlink(path) < 0)
1814 	{
1815 		if (errno != ENOENT)
1816 			ereport(error_on_failure ? ERROR : LOG,
1817 					(errcode_for_file_access(),
1818 					 errmsg("could not unlink temporary file \"%s\": %m",
1819 							path)));
1820 		return false;
1821 	}
1822 
1823 	if (stat_errno == 0)
1824 		ReportTemporaryFileUsage(path, filestats.st_size);
1825 	else
1826 	{
1827 		errno = stat_errno;
1828 		ereport(LOG,
1829 				(errcode_for_file_access(),
1830 				 errmsg("could not stat file \"%s\": %m", path)));
1831 	}
1832 
1833 	return true;
1834 }
1835 
1836 /*
1837  * close a file when done with it
1838  */
1839 void
FileClose(File file)1840 FileClose(File file)
1841 {
1842 	Vfd		   *vfdP;
1843 
1844 	Assert(FileIsValid(file));
1845 
1846 	DO_DB(elog(LOG, "FileClose: %d (%s)",
1847 			   file, VfdCache[file].fileName));
1848 
1849 	vfdP = &VfdCache[file];
1850 
1851 	if (!FileIsNotOpen(file))
1852 	{
1853 		/* close the file */
1854 		if (close(vfdP->fd) != 0)
1855 		{
1856 			/*
1857 			 * We may need to panic on failure to close non-temporary files;
1858 			 * see LruDelete.
1859 			 */
1860 			elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1861 				 "could not close file \"%s\": %m", vfdP->fileName);
1862 		}
1863 
1864 		--nfile;
1865 		vfdP->fd = VFD_CLOSED;
1866 
1867 		/* remove the file from the lru ring */
1868 		Delete(file);
1869 	}
1870 
1871 	if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1872 	{
1873 		/* Subtract its size from current usage (do first in case of error) */
1874 		temporary_files_size -= vfdP->fileSize;
1875 		vfdP->fileSize = 0;
1876 	}
1877 
1878 	/*
1879 	 * Delete the file if it was temporary, and make a log entry if wanted
1880 	 */
1881 	if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
1882 	{
1883 		struct stat filestats;
1884 		int			stat_errno;
1885 
1886 		/*
1887 		 * If we get an error, as could happen within the ereport/elog calls,
1888 		 * we'll come right back here during transaction abort.  Reset the
1889 		 * flag to ensure that we can't get into an infinite loop.  This code
1890 		 * is arranged to ensure that the worst-case consequence is failing to
1891 		 * emit log message(s), not failing to attempt the unlink.
1892 		 */
1893 		vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
1894 
1895 
1896 		/* first try the stat() */
1897 		if (stat(vfdP->fileName, &filestats))
1898 			stat_errno = errno;
1899 		else
1900 			stat_errno = 0;
1901 
1902 		/* in any case do the unlink */
1903 		if (unlink(vfdP->fileName))
1904 			elog(LOG, "could not unlink file \"%s\": %m", vfdP->fileName);
1905 
1906 		/* and last report the stat results */
1907 		if (stat_errno == 0)
1908 			ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
1909 		else
1910 		{
1911 			errno = stat_errno;
1912 			elog(LOG, "could not stat file \"%s\": %m", vfdP->fileName);
1913 		}
1914 	}
1915 
1916 	/* Unregister it from the resource owner */
1917 	if (vfdP->resowner)
1918 		ResourceOwnerForgetFile(vfdP->resowner, file);
1919 
1920 	/*
1921 	 * Return the Vfd slot to the free list
1922 	 */
1923 	FreeVfd(file);
1924 }
1925 
1926 /*
1927  * FilePrefetch - initiate asynchronous read of a given range of the file.
1928  *
1929  * Currently the only implementation of this function is using posix_fadvise
1930  * which is the simplest standardized interface that accomplishes this.
1931  * We could add an implementation using libaio in the future; but note that
1932  * this API is inappropriate for libaio, which wants to have a buffer provided
1933  * to read into.
1934  */
1935 int
FilePrefetch(File file,off_t offset,int amount,uint32 wait_event_info)1936 FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
1937 {
1938 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
1939 	int			returnCode;
1940 
1941 	Assert(FileIsValid(file));
1942 
1943 	DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
1944 			   file, VfdCache[file].fileName,
1945 			   (int64) offset, amount));
1946 
1947 	returnCode = FileAccess(file);
1948 	if (returnCode < 0)
1949 		return returnCode;
1950 
1951 	pgstat_report_wait_start(wait_event_info);
1952 	returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
1953 							   POSIX_FADV_WILLNEED);
1954 	pgstat_report_wait_end();
1955 
1956 	return returnCode;
1957 #else
1958 	Assert(FileIsValid(file));
1959 	return 0;
1960 #endif
1961 }
1962 
1963 void
FileWriteback(File file,off_t offset,off_t nbytes,uint32 wait_event_info)1964 FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
1965 {
1966 	int			returnCode;
1967 
1968 	Assert(FileIsValid(file));
1969 
1970 	DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
1971 			   file, VfdCache[file].fileName,
1972 			   (int64) offset, (int64) nbytes));
1973 
1974 	if (nbytes <= 0)
1975 		return;
1976 
1977 	returnCode = FileAccess(file);
1978 	if (returnCode < 0)
1979 		return;
1980 
1981 	pgstat_report_wait_start(wait_event_info);
1982 	pg_flush_data(VfdCache[file].fd, offset, nbytes);
1983 	pgstat_report_wait_end();
1984 }
1985 
1986 int
FileRead(File file,char * buffer,int amount,off_t offset,uint32 wait_event_info)1987 FileRead(File file, char *buffer, int amount, off_t offset,
1988 		 uint32 wait_event_info)
1989 {
1990 	int			returnCode;
1991 	Vfd		   *vfdP;
1992 
1993 	Assert(FileIsValid(file));
1994 
1995 	DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
1996 			   file, VfdCache[file].fileName,
1997 			   (int64) offset,
1998 			   amount, buffer));
1999 
2000 	returnCode = FileAccess(file);
2001 	if (returnCode < 0)
2002 		return returnCode;
2003 
2004 	vfdP = &VfdCache[file];
2005 
2006 retry:
2007 	pgstat_report_wait_start(wait_event_info);
2008 	returnCode = pg_pread(vfdP->fd, buffer, amount, offset);
2009 	pgstat_report_wait_end();
2010 
2011 	if (returnCode < 0)
2012 	{
2013 		/*
2014 		 * Windows may run out of kernel buffers and return "Insufficient
2015 		 * system resources" error.  Wait a bit and retry to solve it.
2016 		 *
2017 		 * It is rumored that EINTR is also possible on some Unix filesystems,
2018 		 * in which case immediate retry is indicated.
2019 		 */
2020 #ifdef WIN32
2021 		DWORD		error = GetLastError();
2022 
2023 		switch (error)
2024 		{
2025 			case ERROR_NO_SYSTEM_RESOURCES:
2026 				pg_usleep(1000L);
2027 				errno = EINTR;
2028 				break;
2029 			default:
2030 				_dosmaperr(error);
2031 				break;
2032 		}
2033 #endif
2034 		/* OK to retry if interrupted */
2035 		if (errno == EINTR)
2036 			goto retry;
2037 	}
2038 
2039 	return returnCode;
2040 }
2041 
2042 int
FileWrite(File file,char * buffer,int amount,off_t offset,uint32 wait_event_info)2043 FileWrite(File file, char *buffer, int amount, off_t offset,
2044 		  uint32 wait_event_info)
2045 {
2046 	int			returnCode;
2047 	Vfd		   *vfdP;
2048 
2049 	Assert(FileIsValid(file));
2050 
2051 	DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
2052 			   file, VfdCache[file].fileName,
2053 			   (int64) offset,
2054 			   amount, buffer));
2055 
2056 	returnCode = FileAccess(file);
2057 	if (returnCode < 0)
2058 		return returnCode;
2059 
2060 	vfdP = &VfdCache[file];
2061 
2062 	/*
2063 	 * If enforcing temp_file_limit and it's a temp file, check to see if the
2064 	 * write would overrun temp_file_limit, and throw error if so.  Note: it's
2065 	 * really a modularity violation to throw error here; we should set errno
2066 	 * and return -1.  However, there's no way to report a suitable error
2067 	 * message if we do that.  All current callers would just throw error
2068 	 * immediately anyway, so this is safe at present.
2069 	 */
2070 	if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2071 	{
2072 		off_t		past_write = offset + amount;
2073 
2074 		if (past_write > vfdP->fileSize)
2075 		{
2076 			uint64		newTotal = temporary_files_size;
2077 
2078 			newTotal += past_write - vfdP->fileSize;
2079 			if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2080 				ereport(ERROR,
2081 						(errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
2082 						 errmsg("temporary file size exceeds temp_file_limit (%dkB)",
2083 								temp_file_limit)));
2084 		}
2085 	}
2086 
2087 retry:
2088 	errno = 0;
2089 	pgstat_report_wait_start(wait_event_info);
2090 	returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset);
2091 	pgstat_report_wait_end();
2092 
2093 	/* if write didn't set errno, assume problem is no disk space */
2094 	if (returnCode != amount && errno == 0)
2095 		errno = ENOSPC;
2096 
2097 	if (returnCode >= 0)
2098 	{
2099 		/*
2100 		 * Maintain fileSize and temporary_files_size if it's a temp file.
2101 		 */
2102 		if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2103 		{
2104 			off_t		past_write = offset + amount;
2105 
2106 			if (past_write > vfdP->fileSize)
2107 			{
2108 				temporary_files_size += past_write - vfdP->fileSize;
2109 				vfdP->fileSize = past_write;
2110 			}
2111 		}
2112 	}
2113 	else
2114 	{
2115 		/*
2116 		 * See comments in FileRead()
2117 		 */
2118 #ifdef WIN32
2119 		DWORD		error = GetLastError();
2120 
2121 		switch (error)
2122 		{
2123 			case ERROR_NO_SYSTEM_RESOURCES:
2124 				pg_usleep(1000L);
2125 				errno = EINTR;
2126 				break;
2127 			default:
2128 				_dosmaperr(error);
2129 				break;
2130 		}
2131 #endif
2132 		/* OK to retry if interrupted */
2133 		if (errno == EINTR)
2134 			goto retry;
2135 	}
2136 
2137 	return returnCode;
2138 }
2139 
2140 int
FileSync(File file,uint32 wait_event_info)2141 FileSync(File file, uint32 wait_event_info)
2142 {
2143 	int			returnCode;
2144 
2145 	Assert(FileIsValid(file));
2146 
2147 	DO_DB(elog(LOG, "FileSync: %d (%s)",
2148 			   file, VfdCache[file].fileName));
2149 
2150 	returnCode = FileAccess(file);
2151 	if (returnCode < 0)
2152 		return returnCode;
2153 
2154 	pgstat_report_wait_start(wait_event_info);
2155 	returnCode = pg_fsync(VfdCache[file].fd);
2156 	pgstat_report_wait_end();
2157 
2158 	return returnCode;
2159 }
2160 
2161 off_t
FileSize(File file)2162 FileSize(File file)
2163 {
2164 	Assert(FileIsValid(file));
2165 
2166 	DO_DB(elog(LOG, "FileSize %d (%s)",
2167 			   file, VfdCache[file].fileName));
2168 
2169 	if (FileIsNotOpen(file))
2170 	{
2171 		if (FileAccess(file) < 0)
2172 			return (off_t) -1;
2173 	}
2174 
2175 	return lseek(VfdCache[file].fd, 0, SEEK_END);
2176 }
2177 
2178 int
FileTruncate(File file,off_t offset,uint32 wait_event_info)2179 FileTruncate(File file, off_t offset, uint32 wait_event_info)
2180 {
2181 	int			returnCode;
2182 
2183 	Assert(FileIsValid(file));
2184 
2185 	DO_DB(elog(LOG, "FileTruncate %d (%s)",
2186 			   file, VfdCache[file].fileName));
2187 
2188 	returnCode = FileAccess(file);
2189 	if (returnCode < 0)
2190 		return returnCode;
2191 
2192 	pgstat_report_wait_start(wait_event_info);
2193 	returnCode = ftruncate(VfdCache[file].fd, offset);
2194 	pgstat_report_wait_end();
2195 
2196 	if (returnCode == 0 && VfdCache[file].fileSize > offset)
2197 	{
2198 		/* adjust our state for truncation of a temp file */
2199 		Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2200 		temporary_files_size -= VfdCache[file].fileSize - offset;
2201 		VfdCache[file].fileSize = offset;
2202 	}
2203 
2204 	return returnCode;
2205 }
2206 
2207 /*
2208  * Return the pathname associated with an open file.
2209  *
2210  * The returned string points to an internal buffer, which is valid until
2211  * the file is closed.
2212  */
2213 char *
FilePathName(File file)2214 FilePathName(File file)
2215 {
2216 	Assert(FileIsValid(file));
2217 
2218 	return VfdCache[file].fileName;
2219 }
2220 
2221 /*
2222  * Return the raw file descriptor of an opened file.
2223  *
2224  * The returned file descriptor will be valid until the file is closed, but
2225  * there are a lot of things that can make that happen.  So the caller should
2226  * be careful not to do much of anything else before it finishes using the
2227  * returned file descriptor.
2228  */
2229 int
FileGetRawDesc(File file)2230 FileGetRawDesc(File file)
2231 {
2232 	Assert(FileIsValid(file));
2233 	return VfdCache[file].fd;
2234 }
2235 
2236 /*
2237  * FileGetRawFlags - returns the file flags on open(2)
2238  */
2239 int
FileGetRawFlags(File file)2240 FileGetRawFlags(File file)
2241 {
2242 	Assert(FileIsValid(file));
2243 	return VfdCache[file].fileFlags;
2244 }
2245 
2246 /*
2247  * FileGetRawMode - returns the mode bitmask passed to open(2)
2248  */
2249 mode_t
FileGetRawMode(File file)2250 FileGetRawMode(File file)
2251 {
2252 	Assert(FileIsValid(file));
2253 	return VfdCache[file].fileMode;
2254 }
2255 
2256 /*
2257  * Make room for another allocatedDescs[] array entry if needed and possible.
2258  * Returns true if an array element is available.
2259  */
2260 static bool
reserveAllocatedDesc(void)2261 reserveAllocatedDesc(void)
2262 {
2263 	AllocateDesc *newDescs;
2264 	int			newMax;
2265 
2266 	/* Quick out if array already has a free slot. */
2267 	if (numAllocatedDescs < maxAllocatedDescs)
2268 		return true;
2269 
2270 	/*
2271 	 * If the array hasn't yet been created in the current process, initialize
2272 	 * it with FD_MINFREE / 3 elements.  In many scenarios this is as many as
2273 	 * we will ever need, anyway.  We don't want to look at max_safe_fds
2274 	 * immediately because set_max_safe_fds() may not have run yet.
2275 	 */
2276 	if (allocatedDescs == NULL)
2277 	{
2278 		newMax = FD_MINFREE / 3;
2279 		newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2280 		/* Out of memory already?  Treat as fatal error. */
2281 		if (newDescs == NULL)
2282 			ereport(ERROR,
2283 					(errcode(ERRCODE_OUT_OF_MEMORY),
2284 					 errmsg("out of memory")));
2285 		allocatedDescs = newDescs;
2286 		maxAllocatedDescs = newMax;
2287 		return true;
2288 	}
2289 
2290 	/*
2291 	 * Consider enlarging the array beyond the initial allocation used above.
2292 	 * By the time this happens, max_safe_fds should be known accurately.
2293 	 *
2294 	 * We mustn't let allocated descriptors hog all the available FDs, and in
2295 	 * practice we'd better leave a reasonable number of FDs for VFD use.  So
2296 	 * set the maximum to max_safe_fds / 3.  (This should certainly be at
2297 	 * least as large as the initial size, FD_MINFREE / 3, so we aren't
2298 	 * tightening the restriction here.)  Recall that "external" FDs are
2299 	 * allowed to consume another third of max_safe_fds.
2300 	 */
2301 	newMax = max_safe_fds / 3;
2302 	if (newMax > maxAllocatedDescs)
2303 	{
2304 		newDescs = (AllocateDesc *) realloc(allocatedDescs,
2305 											newMax * sizeof(AllocateDesc));
2306 		/* Treat out-of-memory as a non-fatal error. */
2307 		if (newDescs == NULL)
2308 			return false;
2309 		allocatedDescs = newDescs;
2310 		maxAllocatedDescs = newMax;
2311 		return true;
2312 	}
2313 
2314 	/* Can't enlarge allocatedDescs[] any more. */
2315 	return false;
2316 }
2317 
2318 /*
2319  * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2320  * rather than plain fopen().  This lets fd.c deal with freeing FDs if
2321  * necessary to open the file.  When done, call FreeFile rather than fclose.
2322  *
2323  * Note that files that will be open for any significant length of time
2324  * should NOT be handled this way, since they cannot share kernel file
2325  * descriptors with other files; there is grave risk of running out of FDs
2326  * if anyone locks down too many FDs.  Most callers of this routine are
2327  * simply reading a config file that they will read and close immediately.
2328  *
2329  * fd.c will automatically close all files opened with AllocateFile at
2330  * transaction commit or abort; this prevents FD leakage if a routine
2331  * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2332  *
2333  * Ideally this should be the *only* direct call of fopen() in the backend.
2334  */
2335 FILE *
AllocateFile(const char * name,const char * mode)2336 AllocateFile(const char *name, const char *mode)
2337 {
2338 	FILE	   *file;
2339 
2340 	DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2341 			   numAllocatedDescs, name));
2342 
2343 	/* Can we allocate another non-virtual FD? */
2344 	if (!reserveAllocatedDesc())
2345 		ereport(ERROR,
2346 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2347 				 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2348 						maxAllocatedDescs, name)));
2349 
2350 	/* Close excess kernel FDs. */
2351 	ReleaseLruFiles();
2352 
2353 TryAgain:
2354 	if ((file = fopen(name, mode)) != NULL)
2355 	{
2356 		AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2357 
2358 		desc->kind = AllocateDescFile;
2359 		desc->desc.file = file;
2360 		desc->create_subid = GetCurrentSubTransactionId();
2361 		numAllocatedDescs++;
2362 		return desc->desc.file;
2363 	}
2364 
2365 	if (errno == EMFILE || errno == ENFILE)
2366 	{
2367 		int			save_errno = errno;
2368 
2369 		ereport(LOG,
2370 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2371 				 errmsg("out of file descriptors: %m; release and retry")));
2372 		errno = 0;
2373 		if (ReleaseLruFile())
2374 			goto TryAgain;
2375 		errno = save_errno;
2376 	}
2377 
2378 	return NULL;
2379 }
2380 
2381 /*
2382  * Open a file with OpenTransientFilePerm() and pass default file mode for
2383  * the fileMode parameter.
2384  */
2385 int
OpenTransientFile(const char * fileName,int fileFlags)2386 OpenTransientFile(const char *fileName, int fileFlags)
2387 {
2388 	return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2389 }
2390 
2391 /*
2392  * Like AllocateFile, but returns an unbuffered fd like open(2)
2393  */
2394 int
OpenTransientFilePerm(const char * fileName,int fileFlags,mode_t fileMode)2395 OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
2396 {
2397 	int			fd;
2398 
2399 	DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2400 			   numAllocatedDescs, fileName));
2401 
2402 	/* Can we allocate another non-virtual FD? */
2403 	if (!reserveAllocatedDesc())
2404 		ereport(ERROR,
2405 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2406 				 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2407 						maxAllocatedDescs, fileName)));
2408 
2409 	/* Close excess kernel FDs. */
2410 	ReleaseLruFiles();
2411 
2412 	fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2413 
2414 	if (fd >= 0)
2415 	{
2416 		AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2417 
2418 		desc->kind = AllocateDescRawFD;
2419 		desc->desc.fd = fd;
2420 		desc->create_subid = GetCurrentSubTransactionId();
2421 		numAllocatedDescs++;
2422 
2423 		return fd;
2424 	}
2425 
2426 	return -1;					/* failure */
2427 }
2428 
2429 /*
2430  * Routines that want to initiate a pipe stream should use OpenPipeStream
2431  * rather than plain popen().  This lets fd.c deal with freeing FDs if
2432  * necessary.  When done, call ClosePipeStream rather than pclose.
2433  *
2434  * This function also ensures that the popen'd program is run with default
2435  * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2436  * uses.  This ensures desirable response to, eg, closing a read pipe early.
2437  */
2438 FILE *
OpenPipeStream(const char * command,const char * mode)2439 OpenPipeStream(const char *command, const char *mode)
2440 {
2441 	FILE	   *file;
2442 	int			save_errno;
2443 
2444 	DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2445 			   numAllocatedDescs, command));
2446 
2447 	/* Can we allocate another non-virtual FD? */
2448 	if (!reserveAllocatedDesc())
2449 		ereport(ERROR,
2450 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2451 				 errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2452 						maxAllocatedDescs, command)));
2453 
2454 	/* Close excess kernel FDs. */
2455 	ReleaseLruFiles();
2456 
2457 TryAgain:
2458 	fflush(stdout);
2459 	fflush(stderr);
2460 	pqsignal(SIGPIPE, SIG_DFL);
2461 	errno = 0;
2462 	file = popen(command, mode);
2463 	save_errno = errno;
2464 	pqsignal(SIGPIPE, SIG_IGN);
2465 	errno = save_errno;
2466 	if (file != NULL)
2467 	{
2468 		AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2469 
2470 		desc->kind = AllocateDescPipe;
2471 		desc->desc.file = file;
2472 		desc->create_subid = GetCurrentSubTransactionId();
2473 		numAllocatedDescs++;
2474 		return desc->desc.file;
2475 	}
2476 
2477 	if (errno == EMFILE || errno == ENFILE)
2478 	{
2479 		ereport(LOG,
2480 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2481 				 errmsg("out of file descriptors: %m; release and retry")));
2482 		if (ReleaseLruFile())
2483 			goto TryAgain;
2484 		errno = save_errno;
2485 	}
2486 
2487 	return NULL;
2488 }
2489 
2490 /*
2491  * Free an AllocateDesc of any type.
2492  *
2493  * The argument *must* point into the allocatedDescs[] array.
2494  */
2495 static int
FreeDesc(AllocateDesc * desc)2496 FreeDesc(AllocateDesc *desc)
2497 {
2498 	int			result;
2499 
2500 	/* Close the underlying object */
2501 	switch (desc->kind)
2502 	{
2503 		case AllocateDescFile:
2504 			result = fclose(desc->desc.file);
2505 			break;
2506 		case AllocateDescPipe:
2507 			result = pclose(desc->desc.file);
2508 			break;
2509 		case AllocateDescDir:
2510 			result = closedir(desc->desc.dir);
2511 			break;
2512 		case AllocateDescRawFD:
2513 			result = close(desc->desc.fd);
2514 			break;
2515 		default:
2516 			elog(ERROR, "AllocateDesc kind not recognized");
2517 			result = 0;			/* keep compiler quiet */
2518 			break;
2519 	}
2520 
2521 	/* Compact storage in the allocatedDescs array */
2522 	numAllocatedDescs--;
2523 	*desc = allocatedDescs[numAllocatedDescs];
2524 
2525 	return result;
2526 }
2527 
2528 /*
2529  * Close a file returned by AllocateFile.
2530  *
2531  * Note we do not check fclose's return value --- it is up to the caller
2532  * to handle close errors.
2533  */
2534 int
FreeFile(FILE * file)2535 FreeFile(FILE *file)
2536 {
2537 	int			i;
2538 
2539 	DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2540 
2541 	/* Remove file from list of allocated files, if it's present */
2542 	for (i = numAllocatedDescs; --i >= 0;)
2543 	{
2544 		AllocateDesc *desc = &allocatedDescs[i];
2545 
2546 		if (desc->kind == AllocateDescFile && desc->desc.file == file)
2547 			return FreeDesc(desc);
2548 	}
2549 
2550 	/* Only get here if someone passes us a file not in allocatedDescs */
2551 	elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2552 
2553 	return fclose(file);
2554 }
2555 
2556 /*
2557  * Close a file returned by OpenTransientFile.
2558  *
2559  * Note we do not check close's return value --- it is up to the caller
2560  * to handle close errors.
2561  */
2562 int
CloseTransientFile(int fd)2563 CloseTransientFile(int fd)
2564 {
2565 	int			i;
2566 
2567 	DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2568 
2569 	/* Remove fd from list of allocated files, if it's present */
2570 	for (i = numAllocatedDescs; --i >= 0;)
2571 	{
2572 		AllocateDesc *desc = &allocatedDescs[i];
2573 
2574 		if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2575 			return FreeDesc(desc);
2576 	}
2577 
2578 	/* Only get here if someone passes us a file not in allocatedDescs */
2579 	elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2580 
2581 	return close(fd);
2582 }
2583 
2584 /*
2585  * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2586  * rather than plain opendir().  This lets fd.c deal with freeing FDs if
2587  * necessary to open the directory, and with closing it after an elog.
2588  * When done, call FreeDir rather than closedir.
2589  *
2590  * Returns NULL, with errno set, on failure.  Note that failure detection
2591  * is commonly left to the following call of ReadDir or ReadDirExtended;
2592  * see the comments for ReadDir.
2593  *
2594  * Ideally this should be the *only* direct call of opendir() in the backend.
2595  */
2596 DIR *
AllocateDir(const char * dirname)2597 AllocateDir(const char *dirname)
2598 {
2599 	DIR		   *dir;
2600 
2601 	DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2602 			   numAllocatedDescs, dirname));
2603 
2604 	/* Can we allocate another non-virtual FD? */
2605 	if (!reserveAllocatedDesc())
2606 		ereport(ERROR,
2607 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2608 				 errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2609 						maxAllocatedDescs, dirname)));
2610 
2611 	/* Close excess kernel FDs. */
2612 	ReleaseLruFiles();
2613 
2614 TryAgain:
2615 	if ((dir = opendir(dirname)) != NULL)
2616 	{
2617 		AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2618 
2619 		desc->kind = AllocateDescDir;
2620 		desc->desc.dir = dir;
2621 		desc->create_subid = GetCurrentSubTransactionId();
2622 		numAllocatedDescs++;
2623 		return desc->desc.dir;
2624 	}
2625 
2626 	if (errno == EMFILE || errno == ENFILE)
2627 	{
2628 		int			save_errno = errno;
2629 
2630 		ereport(LOG,
2631 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2632 				 errmsg("out of file descriptors: %m; release and retry")));
2633 		errno = 0;
2634 		if (ReleaseLruFile())
2635 			goto TryAgain;
2636 		errno = save_errno;
2637 	}
2638 
2639 	return NULL;
2640 }
2641 
2642 /*
2643  * Read a directory opened with AllocateDir, ereport'ing any error.
2644  *
2645  * This is easier to use than raw readdir() since it takes care of some
2646  * otherwise rather tedious and error-prone manipulation of errno.  Also,
2647  * if you are happy with a generic error message for AllocateDir failure,
2648  * you can just do
2649  *
2650  *		dir = AllocateDir(path);
2651  *		while ((dirent = ReadDir(dir, path)) != NULL)
2652  *			process dirent;
2653  *		FreeDir(dir);
2654  *
2655  * since a NULL dir parameter is taken as indicating AllocateDir failed.
2656  * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2657  * use this shortcut.)
2658  *
2659  * The pathname passed to AllocateDir must be passed to this routine too,
2660  * but it is only used for error reporting.
2661  */
2662 struct dirent *
ReadDir(DIR * dir,const char * dirname)2663 ReadDir(DIR *dir, const char *dirname)
2664 {
2665 	return ReadDirExtended(dir, dirname, ERROR);
2666 }
2667 
2668 /*
2669  * Alternate version of ReadDir that allows caller to specify the elevel
2670  * for any error report (whether it's reporting an initial failure of
2671  * AllocateDir or a subsequent directory read failure).
2672  *
2673  * If elevel < ERROR, returns NULL after any error.  With the normal coding
2674  * pattern, this will result in falling out of the loop immediately as
2675  * though the directory contained no (more) entries.
2676  */
2677 struct dirent *
ReadDirExtended(DIR * dir,const char * dirname,int elevel)2678 ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2679 {
2680 	struct dirent *dent;
2681 
2682 	/* Give a generic message for AllocateDir failure, if caller didn't */
2683 	if (dir == NULL)
2684 	{
2685 		ereport(elevel,
2686 				(errcode_for_file_access(),
2687 				 errmsg("could not open directory \"%s\": %m",
2688 						dirname)));
2689 		return NULL;
2690 	}
2691 
2692 	errno = 0;
2693 	if ((dent = readdir(dir)) != NULL)
2694 		return dent;
2695 
2696 	if (errno)
2697 		ereport(elevel,
2698 				(errcode_for_file_access(),
2699 				 errmsg("could not read directory \"%s\": %m",
2700 						dirname)));
2701 	return NULL;
2702 }
2703 
2704 /*
2705  * Close a directory opened with AllocateDir.
2706  *
2707  * Returns closedir's return value (with errno set if it's not 0).
2708  * Note we do not check the return value --- it is up to the caller
2709  * to handle close errors if wanted.
2710  *
2711  * Does nothing if dir == NULL; we assume that directory open failure was
2712  * already reported if desired.
2713  */
2714 int
FreeDir(DIR * dir)2715 FreeDir(DIR *dir)
2716 {
2717 	int			i;
2718 
2719 	/* Nothing to do if AllocateDir failed */
2720 	if (dir == NULL)
2721 		return 0;
2722 
2723 	DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2724 
2725 	/* Remove dir from list of allocated dirs, if it's present */
2726 	for (i = numAllocatedDescs; --i >= 0;)
2727 	{
2728 		AllocateDesc *desc = &allocatedDescs[i];
2729 
2730 		if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2731 			return FreeDesc(desc);
2732 	}
2733 
2734 	/* Only get here if someone passes us a dir not in allocatedDescs */
2735 	elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
2736 
2737 	return closedir(dir);
2738 }
2739 
2740 
2741 /*
2742  * Close a pipe stream returned by OpenPipeStream.
2743  */
2744 int
ClosePipeStream(FILE * file)2745 ClosePipeStream(FILE *file)
2746 {
2747 	int			i;
2748 
2749 	DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
2750 
2751 	/* Remove file from list of allocated files, if it's present */
2752 	for (i = numAllocatedDescs; --i >= 0;)
2753 	{
2754 		AllocateDesc *desc = &allocatedDescs[i];
2755 
2756 		if (desc->kind == AllocateDescPipe && desc->desc.file == file)
2757 			return FreeDesc(desc);
2758 	}
2759 
2760 	/* Only get here if someone passes us a file not in allocatedDescs */
2761 	elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
2762 
2763 	return pclose(file);
2764 }
2765 
2766 /*
2767  * closeAllVfds
2768  *
2769  * Force all VFDs into the physically-closed state, so that the fewest
2770  * possible number of kernel file descriptors are in use.  There is no
2771  * change in the logical state of the VFDs.
2772  */
2773 void
closeAllVfds(void)2774 closeAllVfds(void)
2775 {
2776 	Index		i;
2777 
2778 	if (SizeVfdCache > 0)
2779 	{
2780 		Assert(FileIsNotOpen(0));	/* Make sure ring not corrupted */
2781 		for (i = 1; i < SizeVfdCache; i++)
2782 		{
2783 			if (!FileIsNotOpen(i))
2784 				LruDelete(i);
2785 		}
2786 	}
2787 }
2788 
2789 
2790 /*
2791  * SetTempTablespaces
2792  *
2793  * Define a list (actually an array) of OIDs of tablespaces to use for
2794  * temporary files.  This list will be used until end of transaction,
2795  * unless this function is called again before then.  It is caller's
2796  * responsibility that the passed-in array has adequate lifespan (typically
2797  * it'd be allocated in TopTransactionContext).
2798  *
2799  * Some entries of the array may be InvalidOid, indicating that the current
2800  * database's default tablespace should be used.
2801  */
2802 void
SetTempTablespaces(Oid * tableSpaces,int numSpaces)2803 SetTempTablespaces(Oid *tableSpaces, int numSpaces)
2804 {
2805 	Assert(numSpaces >= 0);
2806 	tempTableSpaces = tableSpaces;
2807 	numTempTableSpaces = numSpaces;
2808 
2809 	/*
2810 	 * Select a random starting point in the list.  This is to minimize
2811 	 * conflicts between backends that are most likely sharing the same list
2812 	 * of temp tablespaces.  Note that if we create multiple temp files in the
2813 	 * same transaction, we'll advance circularly through the list --- this
2814 	 * ensures that large temporary sort files are nicely spread across all
2815 	 * available tablespaces.
2816 	 */
2817 	if (numSpaces > 1)
2818 		nextTempTableSpace = random() % numSpaces;
2819 	else
2820 		nextTempTableSpace = 0;
2821 }
2822 
2823 /*
2824  * TempTablespacesAreSet
2825  *
2826  * Returns true if SetTempTablespaces has been called in current transaction.
2827  * (This is just so that tablespaces.c doesn't need its own per-transaction
2828  * state.)
2829  */
2830 bool
TempTablespacesAreSet(void)2831 TempTablespacesAreSet(void)
2832 {
2833 	return (numTempTableSpaces >= 0);
2834 }
2835 
2836 /*
2837  * GetTempTablespaces
2838  *
2839  * Populate an array with the OIDs of the tablespaces that should be used for
2840  * temporary files.  (Some entries may be InvalidOid, indicating that the
2841  * current database's default tablespace should be used.)  At most numSpaces
2842  * entries will be filled.
2843  * Returns the number of OIDs that were copied into the output array.
2844  */
2845 int
GetTempTablespaces(Oid * tableSpaces,int numSpaces)2846 GetTempTablespaces(Oid *tableSpaces, int numSpaces)
2847 {
2848 	int			i;
2849 
2850 	Assert(TempTablespacesAreSet());
2851 	for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
2852 		tableSpaces[i] = tempTableSpaces[i];
2853 
2854 	return i;
2855 }
2856 
2857 /*
2858  * GetNextTempTableSpace
2859  *
2860  * Select the next temp tablespace to use.  A result of InvalidOid means
2861  * to use the current database's default tablespace.
2862  */
2863 Oid
GetNextTempTableSpace(void)2864 GetNextTempTableSpace(void)
2865 {
2866 	if (numTempTableSpaces > 0)
2867 	{
2868 		/* Advance nextTempTableSpace counter with wraparound */
2869 		if (++nextTempTableSpace >= numTempTableSpaces)
2870 			nextTempTableSpace = 0;
2871 		return tempTableSpaces[nextTempTableSpace];
2872 	}
2873 	return InvalidOid;
2874 }
2875 
2876 
2877 /*
2878  * AtEOSubXact_Files
2879  *
2880  * Take care of subtransaction commit/abort.  At abort, we close temp files
2881  * that the subtransaction may have opened.  At commit, we reassign the
2882  * files that were opened to the parent subtransaction.
2883  */
2884 void
AtEOSubXact_Files(bool isCommit,SubTransactionId mySubid,SubTransactionId parentSubid)2885 AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
2886 				  SubTransactionId parentSubid)
2887 {
2888 	Index		i;
2889 
2890 	for (i = 0; i < numAllocatedDescs; i++)
2891 	{
2892 		if (allocatedDescs[i].create_subid == mySubid)
2893 		{
2894 			if (isCommit)
2895 				allocatedDescs[i].create_subid = parentSubid;
2896 			else
2897 			{
2898 				/* have to recheck the item after FreeDesc (ugly) */
2899 				FreeDesc(&allocatedDescs[i--]);
2900 			}
2901 		}
2902 	}
2903 }
2904 
2905 /*
2906  * AtEOXact_Files
2907  *
2908  * This routine is called during transaction commit or abort.  All still-open
2909  * per-transaction temporary file VFDs are closed, which also causes the
2910  * underlying files to be deleted (although they should've been closed already
2911  * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
2912  * closed. We also forget any transaction-local temp tablespace list.
2913  *
2914  * The isCommit flag is used only to decide whether to emit warnings about
2915  * unclosed files.
2916  */
2917 void
AtEOXact_Files(bool isCommit)2918 AtEOXact_Files(bool isCommit)
2919 {
2920 	CleanupTempFiles(isCommit, false);
2921 	tempTableSpaces = NULL;
2922 	numTempTableSpaces = -1;
2923 }
2924 
2925 /*
2926  * AtProcExit_Files
2927  *
2928  * on_proc_exit hook to clean up temp files during backend shutdown.
2929  * Here, we want to clean up *all* temp files including interXact ones.
2930  */
2931 static void
AtProcExit_Files(int code,Datum arg)2932 AtProcExit_Files(int code, Datum arg)
2933 {
2934 	CleanupTempFiles(false, true);
2935 }
2936 
2937 /*
2938  * Close temporary files and delete their underlying files.
2939  *
2940  * isCommit: if true, this is normal transaction commit, and we don't
2941  * expect any remaining files; warn if there are some.
2942  *
2943  * isProcExit: if true, this is being called as the backend process is
2944  * exiting. If that's the case, we should remove all temporary files; if
2945  * that's not the case, we are being called for transaction commit/abort
2946  * and should only remove transaction-local temp files.  In either case,
2947  * also clean up "allocated" stdio files, dirs and fds.
2948  */
2949 static void
CleanupTempFiles(bool isCommit,bool isProcExit)2950 CleanupTempFiles(bool isCommit, bool isProcExit)
2951 {
2952 	Index		i;
2953 
2954 	/*
2955 	 * Careful here: at proc_exit we need extra cleanup, not just
2956 	 * xact_temporary files.
2957 	 */
2958 	if (isProcExit || have_xact_temporary_files)
2959 	{
2960 		Assert(FileIsNotOpen(0));	/* Make sure ring not corrupted */
2961 		for (i = 1; i < SizeVfdCache; i++)
2962 		{
2963 			unsigned short fdstate = VfdCache[i].fdstate;
2964 
2965 			if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
2966 				VfdCache[i].fileName != NULL)
2967 			{
2968 				/*
2969 				 * If we're in the process of exiting a backend process, close
2970 				 * all temporary files. Otherwise, only close temporary files
2971 				 * local to the current transaction. They should be closed by
2972 				 * the ResourceOwner mechanism already, so this is just a
2973 				 * debugging cross-check.
2974 				 */
2975 				if (isProcExit)
2976 					FileClose(i);
2977 				else if (fdstate & FD_CLOSE_AT_EOXACT)
2978 				{
2979 					elog(WARNING,
2980 						 "temporary file %s not closed at end-of-transaction",
2981 						 VfdCache[i].fileName);
2982 					FileClose(i);
2983 				}
2984 			}
2985 		}
2986 
2987 		have_xact_temporary_files = false;
2988 	}
2989 
2990 	/* Complain if any allocated files remain open at commit. */
2991 	if (isCommit && numAllocatedDescs > 0)
2992 		elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
2993 			 numAllocatedDescs);
2994 
2995 	/* Clean up "allocated" stdio files, dirs and fds. */
2996 	while (numAllocatedDescs > 0)
2997 		FreeDesc(&allocatedDescs[0]);
2998 }
2999 
3000 
3001 /*
3002  * Remove temporary and temporary relation files left over from a prior
3003  * postmaster session
3004  *
3005  * This should be called during postmaster startup.  It will forcibly
3006  * remove any leftover files created by OpenTemporaryFile and any leftover
3007  * temporary relation files created by mdcreate.
3008  *
3009  * NOTE: we could, but don't, call this during a post-backend-crash restart
3010  * cycle.  The argument for not doing it is that someone might want to examine
3011  * the temp files for debugging purposes.  This does however mean that
3012  * OpenTemporaryFile had better allow for collision with an existing temp
3013  * file name.
3014  *
3015  * NOTE: this function and its subroutines generally report syscall failures
3016  * with ereport(LOG) and keep going.  Removing temp files is not so critical
3017  * that we should fail to start the database when we can't do it.
3018  */
3019 void
RemovePgTempFiles(void)3020 RemovePgTempFiles(void)
3021 {
3022 	char		temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
3023 	DIR		   *spc_dir;
3024 	struct dirent *spc_de;
3025 
3026 	/*
3027 	 * First process temp files in pg_default ($PGDATA/base)
3028 	 */
3029 	snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3030 	RemovePgTempFilesInDir(temp_path, true, false);
3031 	RemovePgTempRelationFiles("base");
3032 
3033 	/*
3034 	 * Cycle through temp directories for all non-default tablespaces.
3035 	 */
3036 	spc_dir = AllocateDir("pg_tblspc");
3037 
3038 	while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL)
3039 	{
3040 		if (strcmp(spc_de->d_name, ".") == 0 ||
3041 			strcmp(spc_de->d_name, "..") == 0)
3042 			continue;
3043 
3044 		snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
3045 				 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
3046 		RemovePgTempFilesInDir(temp_path, true, false);
3047 
3048 		snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
3049 				 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
3050 		RemovePgTempRelationFiles(temp_path);
3051 	}
3052 
3053 	FreeDir(spc_dir);
3054 
3055 	/*
3056 	 * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3057 	 * DataDir as well.  However, that is *not* cleaned here because doing so
3058 	 * would create a race condition.  It's done separately, earlier in
3059 	 * postmaster startup.
3060 	 */
3061 }
3062 
3063 /*
3064  * Process one pgsql_tmp directory for RemovePgTempFiles.
3065  *
3066  * If missing_ok is true, it's all right for the named directory to not exist.
3067  * Any other problem results in a LOG message.  (missing_ok should be true at
3068  * the top level, since pgsql_tmp directories are not created until needed.)
3069  *
3070  * At the top level, this should be called with unlink_all = false, so that
3071  * only files matching the temporary name prefix will be unlinked.  When
3072  * recursing it will be called with unlink_all = true to unlink everything
3073  * under a top-level temporary directory.
3074  *
3075  * (These two flags could be replaced by one, but it seems clearer to keep
3076  * them separate.)
3077  */
3078 void
RemovePgTempFilesInDir(const char * tmpdirname,bool missing_ok,bool unlink_all)3079 RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3080 {
3081 	DIR		   *temp_dir;
3082 	struct dirent *temp_de;
3083 	char		rm_path[MAXPGPATH * 2];
3084 
3085 	temp_dir = AllocateDir(tmpdirname);
3086 
3087 	if (temp_dir == NULL && errno == ENOENT && missing_ok)
3088 		return;
3089 
3090 	while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
3091 	{
3092 		if (strcmp(temp_de->d_name, ".") == 0 ||
3093 			strcmp(temp_de->d_name, "..") == 0)
3094 			continue;
3095 
3096 		snprintf(rm_path, sizeof(rm_path), "%s/%s",
3097 				 tmpdirname, temp_de->d_name);
3098 
3099 		if (unlink_all ||
3100 			strncmp(temp_de->d_name,
3101 					PG_TEMP_FILE_PREFIX,
3102 					strlen(PG_TEMP_FILE_PREFIX)) == 0)
3103 		{
3104 			struct stat statbuf;
3105 
3106 			if (lstat(rm_path, &statbuf) < 0)
3107 			{
3108 				ereport(LOG,
3109 						(errcode_for_file_access(),
3110 						 errmsg("could not stat file \"%s\": %m", rm_path)));
3111 				continue;
3112 			}
3113 
3114 			if (S_ISDIR(statbuf.st_mode))
3115 			{
3116 				/* recursively remove contents, then directory itself */
3117 				RemovePgTempFilesInDir(rm_path, false, true);
3118 
3119 				if (rmdir(rm_path) < 0)
3120 					ereport(LOG,
3121 							(errcode_for_file_access(),
3122 							 errmsg("could not remove directory \"%s\": %m",
3123 									rm_path)));
3124 			}
3125 			else
3126 			{
3127 				if (unlink(rm_path) < 0)
3128 					ereport(LOG,
3129 							(errcode_for_file_access(),
3130 							 errmsg("could not remove file \"%s\": %m",
3131 									rm_path)));
3132 			}
3133 		}
3134 		else
3135 			ereport(LOG,
3136 					(errmsg("unexpected file found in temporary-files directory: \"%s\"",
3137 							rm_path)));
3138 	}
3139 
3140 	FreeDir(temp_dir);
3141 }
3142 
3143 /* Process one tablespace directory, look for per-DB subdirectories */
3144 static void
RemovePgTempRelationFiles(const char * tsdirname)3145 RemovePgTempRelationFiles(const char *tsdirname)
3146 {
3147 	DIR		   *ts_dir;
3148 	struct dirent *de;
3149 	char		dbspace_path[MAXPGPATH * 2];
3150 
3151 	ts_dir = AllocateDir(tsdirname);
3152 
3153 	while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3154 	{
3155 		/*
3156 		 * We're only interested in the per-database directories, which have
3157 		 * numeric names.  Note that this code will also (properly) ignore "."
3158 		 * and "..".
3159 		 */
3160 		if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3161 			continue;
3162 
3163 		snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3164 				 tsdirname, de->d_name);
3165 		RemovePgTempRelationFilesInDbspace(dbspace_path);
3166 	}
3167 
3168 	FreeDir(ts_dir);
3169 }
3170 
3171 /* Process one per-dbspace directory for RemovePgTempRelationFiles */
3172 static void
RemovePgTempRelationFilesInDbspace(const char * dbspacedirname)3173 RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3174 {
3175 	DIR		   *dbspace_dir;
3176 	struct dirent *de;
3177 	char		rm_path[MAXPGPATH * 2];
3178 
3179 	dbspace_dir = AllocateDir(dbspacedirname);
3180 
3181 	while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3182 	{
3183 		if (!looks_like_temp_rel_name(de->d_name))
3184 			continue;
3185 
3186 		snprintf(rm_path, sizeof(rm_path), "%s/%s",
3187 				 dbspacedirname, de->d_name);
3188 
3189 		if (unlink(rm_path) < 0)
3190 			ereport(LOG,
3191 					(errcode_for_file_access(),
3192 					 errmsg("could not remove file \"%s\": %m",
3193 							rm_path)));
3194 	}
3195 
3196 	FreeDir(dbspace_dir);
3197 }
3198 
/*
 * looks_like_temp_rel_name -- does "name" have the shape of a temporary
 * relation file name?
 *
 * Accepted shapes are t<digits>_<digits> and t<digits>_<digits>_<forkname>,
 * each optionally followed by .<digits> (a segment suffix).
 */
bool
looks_like_temp_rel_name(const char *name)
{
	const char *cur = name;
	const char *run;

	/* Leading "t" is mandatory. */
	if (*cur != 't')
		return false;
	cur++;

	/* First digit string: non-empty, terminated by an underscore. */
	run = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == run || *cur != '_')
		return false;
	cur++;

	/* Second digit string: non-empty as well. */
	run = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == run)
		return false;

	/* Optional _forkname; forkname_chars reports how many chars it spans. */
	if (*cur == '_')
	{
		int			forklen = forkname_chars(cur + 1, NULL);

		if (forklen <= 0)
			return false;
		cur += forklen + 1;
	}

	/* Optional .segment: a dot followed by at least one digit. */
	if (*cur == '.')
	{
		const char *seg = cur + 1;

		while (isdigit((unsigned char) *seg))
			seg++;
		if (seg == cur + 1)
			return false;
		cur = seg;
	}

	/* Anything left over disqualifies the name. */
	return *cur == '\0';
}
3247 
3248 
3249 /*
3250  * Issue fsync recursively on PGDATA and all its contents.
3251  *
3252  * We fsync regular files and directories wherever they are, but we
3253  * follow symlinks only for pg_wal and immediately under pg_tblspc.
3254  * Other symlinks are presumed to point at files we're not responsible
3255  * for fsyncing, and might not have privileges to write at all.
3256  *
3257  * Errors are logged but not considered fatal; that's because this is used
3258  * only during database startup, to deal with the possibility that there are
3259  * issued-but-unsynced writes pending against the data directory.  We want to
3260  * ensure that such writes reach disk before anything that's done in the new
3261  * run.  However, aborting on error would result in failure to start for
3262  * harmless cases such as read-only files in the data directory, and that's
3263  * not good either.
3264  *
3265  * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3266  * rewriting all changes again during recovery.
3267  *
3268  * Note we assume we're chdir'd into PGDATA to begin with.
3269  */
3270 void
SyncDataDirectory(void)3271 SyncDataDirectory(void)
3272 {
3273 	bool		xlog_is_symlink;
3274 
3275 	/* We can skip this whole thing if fsync is disabled. */
3276 	if (!enableFsync)
3277 		return;
3278 
3279 	/*
3280 	 * If pg_wal is a symlink, we'll need to recurse into it separately,
3281 	 * because the first walkdir below will ignore it.
3282 	 */
3283 	xlog_is_symlink = false;
3284 
3285 #ifndef WIN32
3286 	{
3287 		struct stat st;
3288 
3289 		if (lstat("pg_wal", &st) < 0)
3290 			ereport(LOG,
3291 					(errcode_for_file_access(),
3292 					 errmsg("could not stat file \"%s\": %m",
3293 							"pg_wal")));
3294 		else if (S_ISLNK(st.st_mode))
3295 			xlog_is_symlink = true;
3296 	}
3297 #else
3298 	if (pgwin32_is_junction("pg_wal"))
3299 		xlog_is_symlink = true;
3300 #endif
3301 
3302 	/*
3303 	 * If possible, hint to the kernel that we're soon going to fsync the data
3304 	 * directory and its contents.  Errors in this step are even less
3305 	 * interesting than normal, so log them only at DEBUG1.
3306 	 */
3307 #ifdef PG_FLUSH_DATA_WORKS
3308 	walkdir(".", pre_sync_fname, false, DEBUG1);
3309 	if (xlog_is_symlink)
3310 		walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3311 	walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
3312 #endif
3313 
3314 	/*
3315 	 * Now we do the fsync()s in the same order.
3316 	 *
3317 	 * The main call ignores symlinks, so in addition to specially processing
3318 	 * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3319 	 * process_symlinks = true.  Note that if there are any plain directories
3320 	 * in pg_tblspc, they'll get fsync'd twice.  That's not an expected case
3321 	 * so we don't worry about optimizing it.
3322 	 */
3323 	walkdir(".", datadir_fsync_fname, false, LOG);
3324 	if (xlog_is_symlink)
3325 		walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3326 	walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
3327 }
3328 
3329 /*
3330  * walkdir: recursively walk a directory, applying the action to each
3331  * regular file and directory (including the named directory itself).
3332  *
3333  * If process_symlinks is true, the action and recursion are also applied
3334  * to regular files and directories that are pointed to by symlinks in the
3335  * given directory; otherwise symlinks are ignored.  Symlinks are always
3336  * ignored in subdirectories, ie we intentionally don't pass down the
3337  * process_symlinks flag to recursive calls.
3338  *
3339  * Errors are reported at level elevel, which might be ERROR or less.
3340  *
3341  * See also walkdir in file_utils.c, which is a frontend version of this
3342  * logic.
3343  */
3344 static void
walkdir(const char * path,void (* action)(const char * fname,bool isdir,int elevel),bool process_symlinks,int elevel)3345 walkdir(const char *path,
3346 		void (*action) (const char *fname, bool isdir, int elevel),
3347 		bool process_symlinks,
3348 		int elevel)
3349 {
3350 	DIR		   *dir;
3351 	struct dirent *de;
3352 
3353 	dir = AllocateDir(path);
3354 
3355 	while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3356 	{
3357 		char		subpath[MAXPGPATH * 2];
3358 		struct stat fst;
3359 		int			sret;
3360 
3361 		CHECK_FOR_INTERRUPTS();
3362 
3363 		if (strcmp(de->d_name, ".") == 0 ||
3364 			strcmp(de->d_name, "..") == 0)
3365 			continue;
3366 
3367 		snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3368 
3369 		if (process_symlinks)
3370 			sret = stat(subpath, &fst);
3371 		else
3372 			sret = lstat(subpath, &fst);
3373 
3374 		if (sret < 0)
3375 		{
3376 			ereport(elevel,
3377 					(errcode_for_file_access(),
3378 					 errmsg("could not stat file \"%s\": %m", subpath)));
3379 			continue;
3380 		}
3381 
3382 		if (S_ISREG(fst.st_mode))
3383 			(*action) (subpath, false, elevel);
3384 		else if (S_ISDIR(fst.st_mode))
3385 			walkdir(subpath, action, false, elevel);
3386 	}
3387 
3388 	FreeDir(dir);				/* we ignore any error here */
3389 
3390 	/*
3391 	 * It's important to fsync the destination directory itself as individual
3392 	 * file fsyncs don't guarantee that the directory entry for the file is
3393 	 * synced.  However, skip this if AllocateDir failed; the action function
3394 	 * might not be robust against that.
3395 	 */
3396 	if (dir)
3397 		(*action) (path, true, elevel);
3398 }
3399 
3400 
/*
 * pre_sync_fname -- hint to the OS that this file is about to be fsync()'d.
 *
 * Directories are skipped.  A failure to open an unreadable file (EACCES)
 * is ignored; other open or close failures are reported at the
 * caller-specified level.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Flushing a directory would most likely just fail, so don't try. */
	if (isdir)
		return;

	fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);

	if (fd < 0)
	{
		/* Unreadable files are passed over without comment. */
		if (errno != EACCES)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * pg_flush_data() ignores errors; that is acceptable because this is
	 * merely a hint.
	 */
	pg_flush_data(fd, 0, 0);

	if (CloseTransientFile(fd) != 0)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", fname)));
}

#endif							/* PG_FLUSH_DATA_WORKS */
3443 
/*
 * datadir_fsync_fname -- walkdir action that fsyncs one file or directory
 * via fsync_fname_ext, discarding its result.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	/*
	 * Errors about unreadable files should be silently ignored here, so
	 * request that from fsync_fname_ext() by passing ignore_perm = true.
	 */
	(void) fsync_fname_ext(fname, isdir, true, elevel);
}
3453 
/*
 * unlink_if_exists_fname -- remove one file or directory.
 *
 * For a directory, a failing rmdir() is reported at elevel unless the
 * directory simply did not exist (ENOENT).  Files are handed to
 * PathNameDeleteTemporaryFile; elevel is not used on that path.
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (!isdir)
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		(void) PathNameDeleteTemporaryFile(fname, false);
		return;
	}

	if (rmdir(fname) != 0 && errno != ENOENT)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove directory \"%s\": %m", fname)));
}
3470 
3471 /*
3472  * fsync_fname_ext -- Try to fsync a file or directory
3473  *
3474  * If ignore_perm is true, ignore errors upon trying to open unreadable
3475  * files. Logs other errors at a caller-specified level.
3476  *
3477  * Returns 0 if the operation succeeded, -1 otherwise.
3478  */
3479 int
fsync_fname_ext(const char * fname,bool isdir,bool ignore_perm,int elevel)3480 fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3481 {
3482 	int			fd;
3483 	int			flags;
3484 	int			returncode;
3485 
3486 	/*
3487 	 * Some OSs require directories to be opened read-only whereas other
3488 	 * systems don't allow us to fsync files opened read-only; so we need both
3489 	 * cases here.  Using O_RDWR will cause us to fail to fsync files that are
3490 	 * not writable by our userid, but we assume that's OK.
3491 	 */
3492 	flags = PG_BINARY;
3493 	if (!isdir)
3494 		flags |= O_RDWR;
3495 	else
3496 		flags |= O_RDONLY;
3497 
3498 	fd = OpenTransientFile(fname, flags);
3499 
3500 	/*
3501 	 * Some OSs don't allow us to open directories at all (Windows returns
3502 	 * EACCES), just ignore the error in that case.  If desired also silently
3503 	 * ignoring errors about unreadable files. Log others.
3504 	 */
3505 	if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3506 		return 0;
3507 	else if (fd < 0 && ignore_perm && errno == EACCES)
3508 		return 0;
3509 	else if (fd < 0)
3510 	{
3511 		ereport(elevel,
3512 				(errcode_for_file_access(),
3513 				 errmsg("could not open file \"%s\": %m", fname)));
3514 		return -1;
3515 	}
3516 
3517 	returncode = pg_fsync(fd);
3518 
3519 	/*
3520 	 * Some OSes don't allow us to fsync directories at all, so we can ignore
3521 	 * those errors. Anything else needs to be logged.
3522 	 */
3523 	if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3524 	{
3525 		int			save_errno;
3526 
3527 		/* close file upon error, might not be in transaction context */
3528 		save_errno = errno;
3529 		(void) CloseTransientFile(fd);
3530 		errno = save_errno;
3531 
3532 		ereport(elevel,
3533 				(errcode_for_file_access(),
3534 				 errmsg("could not fsync file \"%s\": %m", fname)));
3535 		return -1;
3536 	}
3537 
3538 	if (CloseTransientFile(fd) != 0)
3539 	{
3540 		ereport(elevel,
3541 				(errcode_for_file_access(),
3542 				 errmsg("could not close file \"%s\": %m", fname)));
3543 		return -1;
3544 	}
3545 
3546 	return 0;
3547 }
3548 
3549 /*
3550  * fsync_parent_path -- fsync the parent path of a file or directory
3551  *
3552  * This is aimed at making file operations persistent on disk in case of
3553  * an OS crash or power failure.
3554  */
3555 static int
fsync_parent_path(const char * fname,int elevel)3556 fsync_parent_path(const char *fname, int elevel)
3557 {
3558 	char		parentpath[MAXPGPATH];
3559 
3560 	strlcpy(parentpath, fname, MAXPGPATH);
3561 	get_parent_directory(parentpath);
3562 
3563 	/*
3564 	 * get_parent_directory() returns an empty string if the input argument is
3565 	 * just a file name (see comments in path.c), so handle that as being the
3566 	 * current directory.
3567 	 */
3568 	if (strlen(parentpath) == 0)
3569 		strlcpy(parentpath, ".", MAXPGPATH);
3570 
3571 	if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3572 		return -1;
3573 
3574 	return 0;
3575 }
3576 
3577 /*
3578  * Create a PostgreSQL data sub-directory
3579  *
3580  * The data directory itself, and most of its sub-directories, are created at
3581  * initdb time, but we do have some occasions when we create directories in
3582  * the backend (CREATE TABLESPACE, for example).  In those cases, we want to
3583  * make sure that those directories are created consistently.  Today, that means
3584  * making sure that the created directory has the correct permissions, which is
3585  * what pg_dir_create_mode tracks for us.
3586  *
3587  * Note that we also set the umask() based on what we understand the correct
3588  * permissions to be (see file_perm.c).
3589  *
3590  * For permissions other than the default, mkdir() can be used directly, but
3591  * be sure to consider carefully such cases -- a sub-directory with incorrect
3592  * permissions in a PostgreSQL data directory could cause backups and other
3593  * processes to fail.
3594  */
3595 int
MakePGDirectory(const char * directoryName)3596 MakePGDirectory(const char *directoryName)
3597 {
3598 	return mkdir(directoryName, pg_dir_create_mode);
3599 }
3600 
3601 /*
3602  * Return the passed-in error level, or PANIC if data_sync_retry is off.
3603  *
3604  * Failure to fsync any data file is cause for immediate panic, unless
3605  * data_sync_retry is enabled.  Data may have been written to the operating
3606  * system and removed from our buffer pool already, and if we are running on
3607  * an operating system that forgets dirty data on write-back failure, there
3608  * may be only one copy of the data remaining: in the WAL.  A later attempt to
3609  * fsync again might falsely report success.  Therefore we must not allow any
3610  * further checkpoints to be attempted.  data_sync_retry can in theory be
3611  * enabled on systems known not to drop dirty buffered data on write-back
3612  * failure (with the likely outcome that checkpoints will continue to fail
3613  * until the underlying problem is fixed).
3614  *
3615  * Any code that reports a failure from fsync() or related functions should
3616  * filter the error level with this function.
3617  */
3618 int
data_sync_elevel(int elevel)3619 data_sync_elevel(int elevel)
3620 {
3621 	return data_sync_retry ? elevel : PANIC;
3622 }
3623