1 /*-------------------------------------------------------------------------
2  *
3  * fd.c
4  *	  Virtual file descriptor code.
5  *
6  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  *	  src/backend/storage/file/fd.c
11  *
12  * NOTES:
13  *
14  * This code manages a cache of 'virtual' file descriptors (VFDs).
15  * The server opens many file descriptors for a variety of reasons,
16  * including base tables, scratch files (e.g., sort and hash spool
17  * files), and random calls to C library routines like system(3); it
18  * is quite easy to exceed system limits on the number of open files a
19  * single process can have.  (This is around 1024 on many modern
20  * operating systems, but may be lower on others.)
21  *
22  * VFDs are managed as an LRU pool, with actual OS file descriptors
23  * being opened and closed as needed.  Obviously, if a file is
24  * opened using these interfaces, all subsequent operations must also
25  * be through these interfaces (the File type is not a real file
26  * descriptor).
27  *
28  * For this scheme to work, most (if not all) routines throughout the
29  * server should use these interfaces instead of calling the C library
30  * routines (e.g., open(2) and fopen(3)) themselves.  Otherwise, we
31  * may find ourselves short of real file descriptors anyway.
32  *
33  * INTERFACE ROUTINES
34  *
35  * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36  * A File opened with OpenTemporaryFile is automatically deleted when the
37  * File is closed, either explicitly or implicitly at end of transaction or
38  * process exit. PathNameOpenFile is intended for files that are held open
39  * for a long time, like relation files. It is the caller's responsibility
40  * to close them; there is no automatic mechanism in fd.c for that.
41  *
42  * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43  * temporary files that have names so that they can be shared between
44  * backends.  Such files are automatically closed and count against the
45  * temporary file limit of the backend that creates them, but unlike anonymous
46  * files they are not automatically deleted.  See sharedfileset.c for a shared
47  * ownership mechanism that provides automatic cleanup for shared files when
48  * the last of a group of backends detaches.
49  *
50  * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51  * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52  * They behave like the corresponding native functions, except that the handle
53  * is registered with the current subtransaction, and will be automatically
54  * closed at abort. These are intended mainly for short operations like
55  * reading a configuration file; there is a limit on the number of files that
56  * can be opened using these functions at any one time.
57  *
58  * Finally, BasicOpenFile is just a thin wrapper around open() that can
59  * release file descriptors in use by the virtual file descriptors if
60  * necessary. There is no automatic cleanup of file descriptors returned by
61  * BasicOpenFile; it is solely the caller's responsibility to close the file
62  * descriptor by calling close(2).
63  *
64  *-------------------------------------------------------------------------
65  */
66 
67 #include "postgres.h"
68 
69 #include <sys/file.h>
70 #include <sys/param.h>
71 #include <sys/stat.h>
72 #ifndef WIN32
73 #include <sys/mman.h>
74 #endif
75 #include <limits.h>
76 #include <unistd.h>
77 #include <fcntl.h>
78 #ifdef HAVE_SYS_RESOURCE_H
79 #include <sys/resource.h>		/* for getrlimit */
80 #endif
81 
82 #include "miscadmin.h"
83 #include "access/xact.h"
84 #include "access/xlog.h"
85 #include "catalog/pg_tablespace.h"
86 #include "common/file_perm.h"
87 #include "pgstat.h"
88 #include "portability/mem.h"
89 #include "storage/fd.h"
90 #include "storage/ipc.h"
91 #include "utils/guc.h"
92 #include "utils/resowner_private.h"
93 
94 
95 /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
96 #if defined(HAVE_SYNC_FILE_RANGE)
97 #define PG_FLUSH_DATA_WORKS 1
98 #elif !defined(WIN32) && defined(MS_ASYNC)
99 #define PG_FLUSH_DATA_WORKS 1
100 #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
101 #define PG_FLUSH_DATA_WORKS 1
102 #endif
103 
104 /*
105  * We must leave some file descriptors free for system(), the dynamic loader,
106  * and other code that tries to open files without consulting fd.c.  This
107  * is the number left free.  (While we can be pretty sure we won't get
108  * EMFILE, there's never any guarantee that we won't get ENFILE due to
109  * other processes chewing up FDs.  So it's a bad idea to try to open files
110  * without consulting fd.c.  Nonetheless we cannot control all code.)
111  *
112  * Because this is just a fixed setting, we are effectively assuming that
113  * no such code will leave FDs open over the long term; otherwise the slop
114  * is likely to be insufficient.  Note in particular that we expect that
115  * loading a shared library does not result in any permanent increase in
116  * the number of open files.  (This appears to be true on most if not
117  * all platforms as of Feb 2004.)
118  */
119 #define NUM_RESERVED_FDS		10
120 
121 /*
122  * If we have fewer than this many usable FDs after allowing for the reserved
123  * ones, choke.
124  */
125 #define FD_MINFREE				10
126 
127 /*
128  * A number of platforms allow individual processes to open many more files
129  * than they can really support when *many* processes do the same thing.
130  * This GUC parameter lets the DBA limit max_safe_fds to something less than
131  * what the postmaster's initial probe suggests will work.
132  */
133 int			max_files_per_process = 1000;
134 
135 /*
136  * Maximum number of file descriptors to open for either VFD entries or
137  * AllocateFile/AllocateDir/OpenTransientFile operations.  This is initialized
138  * to a conservative value, and remains that way indefinitely in bootstrap or
139  * standalone-backend cases.  In normal postmaster operation, the postmaster
140  * calls set_max_safe_fds() late in initialization to update the value, and
141  * that value is then inherited by forked subprocesses.
142  *
143  * Note: the value of max_files_per_process is taken into account while
144  * setting this variable, and so need not be tested separately.
145  */
146 int			max_safe_fds = 32;	/* default if not changed */
147 
148 /* Whether it is safe to continue running after fsync() fails. */
149 bool		data_sync_retry = false;
150 
151 /* Debugging.... */
152 
153 #ifdef FDDEBUG
154 #define DO_DB(A) \
155 	do { \
156 		int			_do_db_save_errno = errno; \
157 		A; \
158 		errno = _do_db_save_errno; \
159 	} while (0)
160 #else
161 #define DO_DB(A) \
162 	((void) 0)
163 #endif
164 
165 #define VFD_CLOSED (-1)
166 
167 #define FileIsValid(file) \
168 	((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
169 
170 #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
171 
172 /* these are the assigned bits in fdstate below: */
173 #define FD_DELETE_AT_CLOSE	(1 << 0)	/* T = delete when closed */
174 #define FD_CLOSE_AT_EOXACT	(1 << 1)	/* T = close at eoXact */
175 #define FD_TEMP_FILE_LIMIT	(1 << 2)	/* T = respect temp_file_limit */
176 
177 typedef struct vfd
178 {
179 	int			fd;				/* current FD, or VFD_CLOSED if none */
180 	unsigned short fdstate;		/* bitflags for VFD's state */
181 	ResourceOwner resowner;		/* owner, for automatic cleanup */
182 	File		nextFree;		/* link to next free VFD, if in freelist */
183 	File		lruMoreRecently;	/* doubly linked recency-of-use list */
184 	File		lruLessRecently;
185 	off_t		fileSize;		/* current size of file (0 if not temporary) */
186 	char	   *fileName;		/* name of file, or NULL for unused VFD */
187 	/* NB: fileName is malloc'd, and must be free'd when closing the VFD */
188 	int			fileFlags;		/* open(2) flags for (re)opening the file */
189 	mode_t		fileMode;		/* mode to pass to open(2) */
190 } Vfd;
191 
192 /*
193  * Virtual File Descriptor array pointer and size.  This grows as
194  * needed.  'File' values are indexes into this array.
195  * Note that VfdCache[0] is not a usable VFD, just a list header.
196  */
197 static Vfd *VfdCache;
198 static Size SizeVfdCache = 0;
199 
200 /*
201  * Number of file descriptors known to be in use by VFD entries.
202  */
203 static int	nfile = 0;
204 
205 /*
206  * Flag to tell whether it's worth scanning VfdCache looking for temp files
207  * to close
208  */
209 static bool have_xact_temporary_files = false;
210 
211 /*
212  * Tracks the total size of all temporary files.  Note: when temp_file_limit
213  * is being enforced, this cannot overflow since the limit cannot be more
214  * than INT_MAX kilobytes.  When not enforcing, it could theoretically
215  * overflow, but we don't care.
216  */
217 static uint64 temporary_files_size = 0;
218 
219 /*
220  * List of OS handles opened with AllocateFile, AllocateDir and
221  * OpenTransientFile.
222  */
223 typedef enum
224 {
225 	AllocateDescFile,
226 	AllocateDescPipe,
227 	AllocateDescDir,
228 	AllocateDescRawFD
229 } AllocateDescKind;
230 
231 typedef struct
232 {
233 	AllocateDescKind kind;
234 	SubTransactionId create_subid;
235 	union
236 	{
237 		FILE	   *file;
238 		DIR		   *dir;
239 		int			fd;
240 	}			desc;
241 } AllocateDesc;
242 
243 static int	numAllocatedDescs = 0;
244 static int	maxAllocatedDescs = 0;
245 static AllocateDesc *allocatedDescs = NULL;
246 
247 /*
248  * Number of temporary files opened during the current session;
249  * this is used in generation of tempfile names.
250  */
251 static long tempFileCounter = 0;
252 
253 /*
254  * Array of OIDs of temp tablespaces.  (Some entries may be InvalidOid,
255  * indicating that the current database's default tablespace should be used.)
256  * When numTempTableSpaces is -1, this has not been set in the current
257  * transaction.
258  */
259 static Oid *tempTableSpaces = NULL;
260 static int	numTempTableSpaces = -1;
261 static int	nextTempTableSpace = 0;
262 
263 
264 /*--------------------
265  *
266  * Private Routines
267  *
268  * Delete		   - delete a file from the Lru ring
269  * LruDelete	   - remove a file from the Lru ring and close its FD
270  * Insert		   - put a file at the front of the Lru ring
271  * LruInsert	   - put a file at the front of the Lru ring and open it
272  * ReleaseLruFile  - Release an fd by closing the last entry in the Lru ring
273  * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
274  * AllocateVfd	   - grab a free (or new) file record (from VfdCache)
275  * FreeVfd		   - free a file record
276  *
277  * The Least Recently Used ring is a doubly linked list that begins and
278  * ends on element zero.  Element zero is special -- it doesn't represent
279  * a file and its "fd" field always == VFD_CLOSED.  Element zero is just an
280  * anchor that shows us the beginning/end of the ring.
281  * Only VFD elements that are currently really open (have an FD assigned) are
282  * in the Lru ring.  Elements that are "virtually" open can be recognized
283  * by having a non-null fileName field.
284  *
285  * example:
286  *
287  *	   /--less----\				   /---------\
288  *	   v		   \			  v			  \
289  *	 #0 --more---> LeastRecentlyUsed --more-\ \
290  *	  ^\									| |
291  *	   \\less--> MostRecentlyUsedFile	<---/ |
292  *		\more---/					 \--less--/
293  *
294  *--------------------
295  */
296 static void Delete(File file);
297 static void LruDelete(File file);
298 static void Insert(File file);
299 static int	LruInsert(File file);
300 static bool ReleaseLruFile(void);
301 static void ReleaseLruFiles(void);
302 static File AllocateVfd(void);
303 static void FreeVfd(File file);
304 
305 static int	FileAccess(File file);
306 static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
307 static bool reserveAllocatedDesc(void);
308 static int	FreeDesc(AllocateDesc *desc);
309 
310 static void AtProcExit_Files(int code, Datum arg);
311 static void CleanupTempFiles(bool isCommit, bool isProcExit);
312 static void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok,
313 								   bool unlink_all);
314 static void RemovePgTempRelationFiles(const char *tsdirname);
315 static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
316 
317 static void walkdir(const char *path,
318 					void (*action) (const char *fname, bool isdir, int elevel),
319 					bool process_symlinks,
320 					int elevel);
321 #ifdef PG_FLUSH_DATA_WORKS
322 static void pre_sync_fname(const char *fname, bool isdir, int elevel);
323 #endif
324 static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
325 static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
326 
327 static int	fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
328 static int	fsync_parent_path(const char *fname, int elevel);
329 
330 
/*
 * pg_fsync --- flush a file descriptor, with or without writethrough
 *
 * Hands the descriptor to pg_fsync_writethrough() when sync_method selects
 * writethrough fsync, and to pg_fsync_no_writethrough() in every other case.
 * The chosen helper's return value is passed back unchanged.
 */
int
pg_fsync(int fd)
{
#if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
	/* sync_method is only consulted where writethrough is a distinct call */
	if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
		return pg_fsync_writethrough(fd);
#endif

	return pg_fsync_no_writethrough(fd);
}
345 
346 
347 /*
348  * pg_fsync_no_writethrough --- same as fsync except does nothing if
349  *	enableFsync is off
350  */
351 int
pg_fsync_no_writethrough(int fd)352 pg_fsync_no_writethrough(int fd)
353 {
354 	if (enableFsync)
355 		return fsync(fd);
356 	else
357 		return 0;
358 }
359 
360 /*
361  * pg_fsync_writethrough
362  */
363 int
pg_fsync_writethrough(int fd)364 pg_fsync_writethrough(int fd)
365 {
366 	if (enableFsync)
367 	{
368 #ifdef WIN32
369 		return _commit(fd);
370 #elif defined(F_FULLFSYNC)
371 		return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
372 #else
373 		errno = ENOSYS;
374 		return -1;
375 #endif
376 	}
377 	else
378 		return 0;
379 }
380 
381 /*
382  * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
383  *
384  * Not all platforms have fdatasync; treat as fsync if not available.
385  */
386 int
pg_fdatasync(int fd)387 pg_fdatasync(int fd)
388 {
389 	if (enableFsync)
390 	{
391 #ifdef HAVE_FDATASYNC
392 		return fdatasync(fd);
393 #else
394 		return fsync(fd);
395 #endif
396 	}
397 	else
398 		return 0;
399 }
400 
401 /*
402  * pg_flush_data --- advise OS that the described dirty data should be flushed
403  *
404  * offset of 0 with nbytes 0 means that the entire file should be flushed
405  */
406 void
pg_flush_data(int fd,off_t offset,off_t nbytes)407 pg_flush_data(int fd, off_t offset, off_t nbytes)
408 {
409 	/*
410 	 * Right now file flushing is primarily used to avoid making later
411 	 * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
412 	 * if fsyncs are disabled - that's a decision we might want to make
413 	 * configurable at some point.
414 	 */
415 	if (!enableFsync)
416 		return;
417 
418 	/*
419 	 * We compile all alternatives that are supported on the current platform,
420 	 * to find portability problems more easily.
421 	 */
422 #if defined(HAVE_SYNC_FILE_RANGE)
423 	{
424 		int			rc;
425 		static bool not_implemented_by_kernel = false;
426 
427 		if (not_implemented_by_kernel)
428 			return;
429 
430 		/*
431 		 * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
432 		 * tells the OS that writeback for the specified blocks should be
433 		 * started, but that we don't want to wait for completion.  Note that
434 		 * this call might block if too much dirty data exists in the range.
435 		 * This is the preferable method on OSs supporting it, as it works
436 		 * reliably when available (contrast to msync()) and doesn't flush out
437 		 * clean data (like FADV_DONTNEED).
438 		 */
439 		rc = sync_file_range(fd, offset, nbytes,
440 							 SYNC_FILE_RANGE_WRITE);
441 		if (rc != 0)
442 		{
443 			int			elevel;
444 
445 			/*
446 			 * For systems that don't have an implementation of
447 			 * sync_file_range() such as Windows WSL, generate only one
448 			 * warning and then suppress all further attempts by this process.
449 			 */
450 			if (errno == ENOSYS)
451 			{
452 				elevel = WARNING;
453 				not_implemented_by_kernel = true;
454 			}
455 			else
456 				elevel = data_sync_elevel(WARNING);
457 
458 			ereport(elevel,
459 					(errcode_for_file_access(),
460 					 errmsg("could not flush dirty data: %m")));
461 		}
462 
463 		return;
464 	}
465 #endif
466 #if !defined(WIN32) && defined(MS_ASYNC)
467 	{
468 		void	   *p;
469 		static int	pagesize = 0;
470 
471 		/*
472 		 * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
473 		 * writeback. On linux it only does so if MS_SYNC is specified, but
474 		 * then it does the writeback synchronously. Luckily all common linux
475 		 * systems have sync_file_range().  This is preferable over
476 		 * FADV_DONTNEED because it doesn't flush out clean data.
477 		 *
478 		 * We map the file (mmap()), tell the kernel to sync back the contents
479 		 * (msync()), and then remove the mapping again (munmap()).
480 		 */
481 
482 		/* mmap() needs actual length if we want to map whole file */
483 		if (offset == 0 && nbytes == 0)
484 		{
485 			nbytes = lseek(fd, 0, SEEK_END);
486 			if (nbytes < 0)
487 			{
488 				ereport(WARNING,
489 						(errcode_for_file_access(),
490 						 errmsg("could not determine dirty data size: %m")));
491 				return;
492 			}
493 		}
494 
495 		/*
496 		 * Some platforms reject partial-page mmap() attempts.  To deal with
497 		 * that, just truncate the request to a page boundary.  If any extra
498 		 * bytes don't get flushed, well, it's only a hint anyway.
499 		 */
500 
501 		/* fetch pagesize only once */
502 		if (pagesize == 0)
503 			pagesize = sysconf(_SC_PAGESIZE);
504 
505 		/* align length to pagesize, dropping any fractional page */
506 		if (pagesize > 0)
507 			nbytes = (nbytes / pagesize) * pagesize;
508 
509 		/* fractional-page request is a no-op */
510 		if (nbytes <= 0)
511 			return;
512 
513 		/*
514 		 * mmap could well fail, particularly on 32-bit platforms where there
515 		 * may simply not be enough address space.  If so, silently fall
516 		 * through to the next implementation.
517 		 */
518 		if (nbytes <= (off_t) SSIZE_MAX)
519 			p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
520 		else
521 			p = MAP_FAILED;
522 
523 		if (p != MAP_FAILED)
524 		{
525 			int			rc;
526 
527 			rc = msync(p, (size_t) nbytes, MS_ASYNC);
528 			if (rc != 0)
529 			{
530 				ereport(data_sync_elevel(WARNING),
531 						(errcode_for_file_access(),
532 						 errmsg("could not flush dirty data: %m")));
533 				/* NB: need to fall through to munmap()! */
534 			}
535 
536 			rc = munmap(p, (size_t) nbytes);
537 			if (rc != 0)
538 			{
539 				/* FATAL error because mapping would remain */
540 				ereport(FATAL,
541 						(errcode_for_file_access(),
542 						 errmsg("could not munmap() while flushing data: %m")));
543 			}
544 
545 			return;
546 		}
547 	}
548 #endif
549 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
550 	{
551 		int			rc;
552 
553 		/*
554 		 * Signal the kernel that the passed in range should not be cached
555 		 * anymore. This has the, desired, side effect of writing out dirty
556 		 * data, and the, undesired, side effect of likely discarding useful
557 		 * clean cached blocks.  For the latter reason this is the least
558 		 * preferable method.
559 		 */
560 
561 		rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
562 
563 		if (rc != 0)
564 		{
565 			/* don't error out, this is just a performance optimization */
566 			ereport(WARNING,
567 					(errcode_for_file_access(),
568 					 errmsg("could not flush dirty data: %m")));
569 		}
570 
571 		return;
572 	}
573 #endif
574 }
575 
576 
577 /*
578  * fsync_fname -- fsync a file or directory, handling errors properly
579  *
580  * Try to fsync a file or directory. When doing the latter, ignore errors that
581  * indicate the OS just doesn't allow/require fsyncing directories.
582  */
583 void
fsync_fname(const char * fname,bool isdir)584 fsync_fname(const char *fname, bool isdir)
585 {
586 	fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
587 }
588 
589 /*
590  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
591  *
592  * This routine ensures that, after returning, the effect of renaming file
593  * persists in case of a crash. A crash while this routine is running will
594  * leave you with either the pre-existing or the moved file in place of the
595  * new file; no mixed state or truncated files are possible.
596  *
597  * It does so by using fsync on the old filename and the possibly existing
598  * target filename before the rename, and the target file and directory after.
599  *
600  * Note that rename() cannot be used across arbitrary directories, as they
601  * might not be on the same filesystem. Therefore this routine does not
602  * support renaming across directories.
603  *
604  * Log errors with the caller specified severity.
605  *
606  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
607  * valid upon return.
608  */
609 int
durable_rename(const char * oldfile,const char * newfile,int elevel)610 durable_rename(const char *oldfile, const char *newfile, int elevel)
611 {
612 	int			fd;
613 
614 	/*
615 	 * First fsync the old and target path (if it exists), to ensure that they
616 	 * are properly persistent on disk. Syncing the target file is not
617 	 * strictly necessary, but it makes it easier to reason about crashes;
618 	 * because it's then guaranteed that either source or target file exists
619 	 * after a crash.
620 	 */
621 	if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
622 		return -1;
623 
624 	fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
625 	if (fd < 0)
626 	{
627 		if (errno != ENOENT)
628 		{
629 			ereport(elevel,
630 					(errcode_for_file_access(),
631 					 errmsg("could not open file \"%s\": %m", newfile)));
632 			return -1;
633 		}
634 	}
635 	else
636 	{
637 		if (pg_fsync(fd) != 0)
638 		{
639 			int			save_errno;
640 
641 			/* close file upon error, might not be in transaction context */
642 			save_errno = errno;
643 			CloseTransientFile(fd);
644 			errno = save_errno;
645 
646 			ereport(elevel,
647 					(errcode_for_file_access(),
648 					 errmsg("could not fsync file \"%s\": %m", newfile)));
649 			return -1;
650 		}
651 
652 		if (CloseTransientFile(fd))
653 		{
654 			ereport(elevel,
655 					(errcode_for_file_access(),
656 					 errmsg("could not close file \"%s\": %m", newfile)));
657 			return -1;
658 		}
659 	}
660 
661 	/* Time to do the real deal... */
662 	if (rename(oldfile, newfile) < 0)
663 	{
664 		ereport(elevel,
665 				(errcode_for_file_access(),
666 				 errmsg("could not rename file \"%s\" to \"%s\": %m",
667 						oldfile, newfile)));
668 		return -1;
669 	}
670 
671 	/*
672 	 * To guarantee renaming the file is persistent, fsync the file with its
673 	 * new name, and its containing directory.
674 	 */
675 	if (fsync_fname_ext(newfile, false, false, elevel) != 0)
676 		return -1;
677 
678 	if (fsync_parent_path(newfile, elevel) != 0)
679 		return -1;
680 
681 	return 0;
682 }
683 
/*
 * durable_unlink -- remove a file in a durable manner
 *
 * The file is unlinked and then its parent directory is fsync'ed, so that
 * the removal persists across a crash.  A crash while this routine is
 * running leaves the system in no mixed state.
 *
 * Errors are logged at the caller-supplied elevel.
 *
 * Returns 0 on success, -1 otherwise.  errno is not valid upon return.
 */
int
durable_unlink(const char *fname, int elevel)
{
	if (unlink(fname) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove file \"%s\": %m",
						fname)));
		return -1;
	}

	/* The removal only becomes persistent once the directory is synced. */
	return (fsync_parent_path(fname, elevel) != 0) ? -1 : 0;
}
720 
/*
 * durable_link_or_rename -- rename a file in a durable manner.
 *
 * Like durable_rename(), except that this routine tries (but does not
 * guarantee) not to overwrite the target file.  With HAVE_WORKING_LINK it
 * does link() followed by unlink() of the old name; otherwise it falls back
 * to rename().
 *
 * A crash at an unfortunate moment can leave two links to the target file.
 *
 * Errors are logged at the caller-supplied elevel.
 *
 * Returns 0 on success, -1 otherwise.  errno is not valid upon return.
 */
int
durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
{
	/*
	 * Sync the source first, so that a crash right after the link/rename
	 * still leaves a file with valid contents in place.
	 */
	if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
		return -1;

#if HAVE_WORKING_LINK
	if (link(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not link file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}
	/* the old name is dropped without checking the result */
	unlink(oldfile);
#else
	/* XXX: Add racy file existence check? */
	if (rename(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}
#endif

	/*
	 * To survive an OS crash, flush the new entry and then its parent
	 * directory, in that order.
	 */
	if (fsync_fname_ext(newfile, false, false, elevel) != 0 ||
		fsync_parent_path(newfile, elevel) != 0)
		return -1;

	return 0;
}
780 
781 /*
782  * InitFileAccess --- initialize this module during backend startup
783  *
784  * This is called during either normal or standalone backend start.
785  * It is *not* called in the postmaster.
786  */
787 void
InitFileAccess(void)788 InitFileAccess(void)
789 {
790 	Assert(SizeVfdCache == 0);	/* call me only once */
791 
792 	/* initialize cache header entry */
793 	VfdCache = (Vfd *) malloc(sizeof(Vfd));
794 	if (VfdCache == NULL)
795 		ereport(FATAL,
796 				(errcode(ERRCODE_OUT_OF_MEMORY),
797 				 errmsg("out of memory")));
798 
799 	MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
800 	VfdCache->fd = VFD_CLOSED;
801 
802 	SizeVfdCache = 1;
803 
804 	/* register proc-exit hook to ensure temp files are dropped at exit */
805 	on_proc_exit(AtProcExit_Files, 0);
806 }
807 
808 /*
809  * count_usable_fds --- count how many FDs the system will let us open,
810  *		and estimate how many are already open.
811  *
812  * We stop counting if usable_fds reaches max_to_probe.  Note: a small
813  * value of max_to_probe might result in an underestimate of already_open;
814  * we must fill in any "gaps" in the set of used FDs before the calculation
815  * of already_open will give the right answer.  In practice, max_to_probe
816  * of a couple of dozen should be enough to ensure good results.
817  *
818  * We assume stdin (FD 0) is available for dup'ing
819  */
820 static void
count_usable_fds(int max_to_probe,int * usable_fds,int * already_open)821 count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
822 {
823 	int		   *fd;
824 	int			size;
825 	int			used = 0;
826 	int			highestfd = 0;
827 	int			j;
828 
829 #ifdef HAVE_GETRLIMIT
830 	struct rlimit rlim;
831 	int			getrlimit_status;
832 #endif
833 
834 	size = 1024;
835 	fd = (int *) palloc(size * sizeof(int));
836 
837 #ifdef HAVE_GETRLIMIT
838 #ifdef RLIMIT_NOFILE			/* most platforms use RLIMIT_NOFILE */
839 	getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
840 #else							/* but BSD doesn't ... */
841 	getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
842 #endif							/* RLIMIT_NOFILE */
843 	if (getrlimit_status != 0)
844 		ereport(WARNING, (errmsg("getrlimit failed: %m")));
845 #endif							/* HAVE_GETRLIMIT */
846 
847 	/* dup until failure or probe limit reached */
848 	for (;;)
849 	{
850 		int			thisfd;
851 
852 #ifdef HAVE_GETRLIMIT
853 
854 		/*
855 		 * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
856 		 * some platforms
857 		 */
858 		if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
859 			break;
860 #endif
861 
862 		thisfd = dup(0);
863 		if (thisfd < 0)
864 		{
865 			/* Expect EMFILE or ENFILE, else it's fishy */
866 			if (errno != EMFILE && errno != ENFILE)
867 				elog(WARNING, "dup(0) failed after %d successes: %m", used);
868 			break;
869 		}
870 
871 		if (used >= size)
872 		{
873 			size *= 2;
874 			fd = (int *) repalloc(fd, size * sizeof(int));
875 		}
876 		fd[used++] = thisfd;
877 
878 		if (highestfd < thisfd)
879 			highestfd = thisfd;
880 
881 		if (used >= max_to_probe)
882 			break;
883 	}
884 
885 	/* release the files we opened */
886 	for (j = 0; j < used; j++)
887 		close(fd[j]);
888 
889 	pfree(fd);
890 
891 	/*
892 	 * Return results.  usable_fds is just the number of successful dups. We
893 	 * assume that the system limit is highestfd+1 (remember 0 is a legal FD
894 	 * number) and so already_open is highestfd+1 - usable_fds.
895 	 */
896 	*usable_fds = used;
897 	*already_open = highestfd + 1 - used;
898 }
899 
900 /*
901  * set_max_safe_fds
902  *		Determine number of filedescriptors that fd.c is allowed to use
903  */
904 void
set_max_safe_fds(void)905 set_max_safe_fds(void)
906 {
907 	int			usable_fds;
908 	int			already_open;
909 
910 	/*----------
911 	 * We want to set max_safe_fds to
912 	 *			MIN(usable_fds, max_files_per_process - already_open)
913 	 * less the slop factor for files that are opened without consulting
914 	 * fd.c.  This ensures that we won't exceed either max_files_per_process
915 	 * or the experimentally-determined EMFILE limit.
916 	 *----------
917 	 */
918 	count_usable_fds(max_files_per_process,
919 					 &usable_fds, &already_open);
920 
921 	max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
922 
923 	/*
924 	 * Take off the FDs reserved for system() etc.
925 	 */
926 	max_safe_fds -= NUM_RESERVED_FDS;
927 
928 	/*
929 	 * Make sure we still have enough to get by.
930 	 */
931 	if (max_safe_fds < FD_MINFREE)
932 		ereport(FATAL,
933 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
934 				 errmsg("insufficient file descriptors available to start server process"),
935 				 errdetail("System allows %d, we need at least %d.",
936 						   max_safe_fds + NUM_RESERVED_FDS,
937 						   FD_MINFREE + NUM_RESERVED_FDS)));
938 
939 	elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
940 		 max_safe_fds, usable_fds, already_open);
941 }
942 
943 /*
944  * Open a file with BasicOpenFilePerm() and pass default file mode for the
945  * fileMode parameter.
946  */
947 int
BasicOpenFile(const char * fileName,int fileFlags)948 BasicOpenFile(const char *fileName, int fileFlags)
949 {
950 	return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
951 }
952 
953 /*
954  * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
955  *
956  * This is exported for use by places that really want a plain kernel FD,
957  * but need to be proof against running out of FDs.  Once an FD has been
958  * successfully returned, it is the caller's responsibility to ensure that
959  * it will not be leaked on ereport()!	Most users should *not* call this
960  * routine directly, but instead use the VFD abstraction level, which
961  * provides protection against descriptor leaks as well as management of
962  * files that need to be open for more than a short period of time.
963  *
964  * Ideally this should be the *only* direct call of open() in the backend.
965  * In practice, the postmaster calls open() directly, and there are some
966  * direct open() calls done early in backend startup.  Those are OK since
967  * this module wouldn't have any open files to close at that point anyway.
968  */
969 int
BasicOpenFilePerm(const char * fileName,int fileFlags,mode_t fileMode)970 BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
971 {
972 	int			fd;
973 
974 tryAgain:
975 	fd = open(fileName, fileFlags, fileMode);
976 
977 	if (fd >= 0)
978 		return fd;				/* success! */
979 
980 	if (errno == EMFILE || errno == ENFILE)
981 	{
982 		int			save_errno = errno;
983 
984 		ereport(LOG,
985 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
986 				 errmsg("out of file descriptors: %m; release and retry")));
987 		errno = 0;
988 		if (ReleaseLruFile())
989 			goto tryAgain;
990 		errno = save_errno;
991 	}
992 
993 	return -1;					/* failure */
994 }
995 
#if defined(FDDEBUG)

/*
 * _dump_lru --- debugging aid: log the VFD indexes found in the LRU ring
 *
 * Starts at VfdCache[0].lruLessRecently and follows the lruLessRecently
 * links until index 0 comes around again, emitting everything as one LOG
 * line.  Each append is bounded by the space left in the local buffer.
 */
static void
_dump_lru(void)
{
	char		buf[2048];
	int			cur = VfdCache[0].lruLessRecently;

	snprintf(buf, sizeof(buf), "LRU: MOST %d ", cur);

	while (cur != 0)
	{
		size_t		used;

		/* step to the next (less recently used) entry and append its index */
		cur = VfdCache[cur].lruLessRecently;
		used = strlen(buf);
		snprintf(buf + used, sizeof(buf) - used, "%d ", cur);
	}

	snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
	elog(LOG, "%s", buf);
}
#endif							/* FDDEBUG */
1016 
1017 static void
Delete(File file)1018 Delete(File file)
1019 {
1020 	Vfd		   *vfdP;
1021 
1022 	Assert(file != 0);
1023 
1024 	DO_DB(elog(LOG, "Delete %d (%s)",
1025 			   file, VfdCache[file].fileName));
1026 	DO_DB(_dump_lru());
1027 
1028 	vfdP = &VfdCache[file];
1029 
1030 	VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1031 	VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1032 
1033 	DO_DB(_dump_lru());
1034 }
1035 
1036 static void
LruDelete(File file)1037 LruDelete(File file)
1038 {
1039 	Vfd		   *vfdP;
1040 
1041 	Assert(file != 0);
1042 
1043 	DO_DB(elog(LOG, "LruDelete %d (%s)",
1044 			   file, VfdCache[file].fileName));
1045 
1046 	vfdP = &VfdCache[file];
1047 
1048 	/*
1049 	 * Close the file.  We aren't expecting this to fail; if it does, better
1050 	 * to leak the FD than to mess up our internal state.
1051 	 */
1052 	if (close(vfdP->fd))
1053 		elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1054 			 "could not close file \"%s\": %m", vfdP->fileName);
1055 	vfdP->fd = VFD_CLOSED;
1056 	--nfile;
1057 
1058 	/* delete the vfd record from the LRU ring */
1059 	Delete(file);
1060 }
1061 
1062 static void
Insert(File file)1063 Insert(File file)
1064 {
1065 	Vfd		   *vfdP;
1066 
1067 	Assert(file != 0);
1068 
1069 	DO_DB(elog(LOG, "Insert %d (%s)",
1070 			   file, VfdCache[file].fileName));
1071 	DO_DB(_dump_lru());
1072 
1073 	vfdP = &VfdCache[file];
1074 
1075 	vfdP->lruMoreRecently = 0;
1076 	vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1077 	VfdCache[0].lruLessRecently = file;
1078 	VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1079 
1080 	DO_DB(_dump_lru());
1081 }
1082 
1083 /* returns 0 on success, -1 on re-open failure (with errno set) */
1084 static int
LruInsert(File file)1085 LruInsert(File file)
1086 {
1087 	Vfd		   *vfdP;
1088 
1089 	Assert(file != 0);
1090 
1091 	DO_DB(elog(LOG, "LruInsert %d (%s)",
1092 			   file, VfdCache[file].fileName));
1093 
1094 	vfdP = &VfdCache[file];
1095 
1096 	if (FileIsNotOpen(file))
1097 	{
1098 		/* Close excess kernel FDs. */
1099 		ReleaseLruFiles();
1100 
1101 		/*
1102 		 * The open could still fail for lack of file descriptors, eg due to
1103 		 * overall system file table being full.  So, be prepared to release
1104 		 * another FD if necessary...
1105 		 */
1106 		vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1107 									 vfdP->fileMode);
1108 		if (vfdP->fd < 0)
1109 		{
1110 			DO_DB(elog(LOG, "re-open failed: %m"));
1111 			return -1;
1112 		}
1113 		else
1114 		{
1115 			++nfile;
1116 		}
1117 	}
1118 
1119 	/*
1120 	 * put it at the head of the Lru ring
1121 	 */
1122 
1123 	Insert(file);
1124 
1125 	return 0;
1126 }
1127 
1128 /*
1129  * Release one kernel FD by closing the least-recently-used VFD.
1130  */
1131 static bool
ReleaseLruFile(void)1132 ReleaseLruFile(void)
1133 {
1134 	DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1135 
1136 	if (nfile > 0)
1137 	{
1138 		/*
1139 		 * There are opened files and so there should be at least one used vfd
1140 		 * in the ring.
1141 		 */
1142 		Assert(VfdCache[0].lruMoreRecently != 0);
1143 		LruDelete(VfdCache[0].lruMoreRecently);
1144 		return true;			/* freed a file */
1145 	}
1146 	return false;				/* no files available to free */
1147 }
1148 
1149 /*
1150  * Release kernel FDs as needed to get under the max_safe_fds limit.
1151  * After calling this, it's OK to try to open another file.
1152  */
1153 static void
ReleaseLruFiles(void)1154 ReleaseLruFiles(void)
1155 {
1156 	while (nfile + numAllocatedDescs >= max_safe_fds)
1157 	{
1158 		if (!ReleaseLruFile())
1159 			break;
1160 	}
1161 }
1162 
1163 static File
AllocateVfd(void)1164 AllocateVfd(void)
1165 {
1166 	Index		i;
1167 	File		file;
1168 
1169 	DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1170 
1171 	Assert(SizeVfdCache > 0);	/* InitFileAccess not called? */
1172 
1173 	if (VfdCache[0].nextFree == 0)
1174 	{
1175 		/*
1176 		 * The free list is empty so it is time to increase the size of the
1177 		 * array.  We choose to double it each time this happens. However,
1178 		 * there's not much point in starting *real* small.
1179 		 */
1180 		Size		newCacheSize = SizeVfdCache * 2;
1181 		Vfd		   *newVfdCache;
1182 
1183 		if (newCacheSize < 32)
1184 			newCacheSize = 32;
1185 
1186 		/*
1187 		 * Be careful not to clobber VfdCache ptr if realloc fails.
1188 		 */
1189 		newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1190 		if (newVfdCache == NULL)
1191 			ereport(ERROR,
1192 					(errcode(ERRCODE_OUT_OF_MEMORY),
1193 					 errmsg("out of memory")));
1194 		VfdCache = newVfdCache;
1195 
1196 		/*
1197 		 * Initialize the new entries and link them into the free list.
1198 		 */
1199 		for (i = SizeVfdCache; i < newCacheSize; i++)
1200 		{
1201 			MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1202 			VfdCache[i].nextFree = i + 1;
1203 			VfdCache[i].fd = VFD_CLOSED;
1204 		}
1205 		VfdCache[newCacheSize - 1].nextFree = 0;
1206 		VfdCache[0].nextFree = SizeVfdCache;
1207 
1208 		/*
1209 		 * Record the new size
1210 		 */
1211 		SizeVfdCache = newCacheSize;
1212 	}
1213 
1214 	file = VfdCache[0].nextFree;
1215 
1216 	VfdCache[0].nextFree = VfdCache[file].nextFree;
1217 
1218 	return file;
1219 }
1220 
1221 static void
FreeVfd(File file)1222 FreeVfd(File file)
1223 {
1224 	Vfd		   *vfdP = &VfdCache[file];
1225 
1226 	DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1227 			   file, vfdP->fileName ? vfdP->fileName : ""));
1228 
1229 	if (vfdP->fileName != NULL)
1230 	{
1231 		free(vfdP->fileName);
1232 		vfdP->fileName = NULL;
1233 	}
1234 	vfdP->fdstate = 0x0;
1235 
1236 	vfdP->nextFree = VfdCache[0].nextFree;
1237 	VfdCache[0].nextFree = file;
1238 }
1239 
1240 /* returns 0 on success, -1 on re-open failure (with errno set) */
1241 static int
FileAccess(File file)1242 FileAccess(File file)
1243 {
1244 	int			returnValue;
1245 
1246 	DO_DB(elog(LOG, "FileAccess %d (%s)",
1247 			   file, VfdCache[file].fileName));
1248 
1249 	/*
1250 	 * Is the file open?  If not, open it and put it at the head of the LRU
1251 	 * ring (possibly closing the least recently used file to get an FD).
1252 	 */
1253 
1254 	if (FileIsNotOpen(file))
1255 	{
1256 		returnValue = LruInsert(file);
1257 		if (returnValue != 0)
1258 			return returnValue;
1259 	}
1260 	else if (VfdCache[0].lruLessRecently != file)
1261 	{
1262 		/*
1263 		 * We now know that the file is open and that it is not the last one
1264 		 * accessed, so we need to move it to the head of the Lru ring.
1265 		 */
1266 
1267 		Delete(file);
1268 		Insert(file);
1269 	}
1270 
1271 	return 0;
1272 }
1273 
1274 /*
1275  * Called whenever a temporary file is deleted to report its size.
1276  */
1277 static void
ReportTemporaryFileUsage(const char * path,off_t size)1278 ReportTemporaryFileUsage(const char *path, off_t size)
1279 {
1280 	pgstat_report_tempfile(size);
1281 
1282 	if (log_temp_files >= 0)
1283 	{
1284 		if ((size / 1024) >= log_temp_files)
1285 			ereport(LOG,
1286 					(errmsg("temporary file: path \"%s\", size %lu",
1287 							path, (unsigned long) size)));
1288 	}
1289 }
1290 
1291 /*
1292  * Called to register a temporary file for automatic close.
1293  * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
1294  * before the file was opened.
1295  */
1296 static void
RegisterTemporaryFile(File file)1297 RegisterTemporaryFile(File file)
1298 {
1299 	ResourceOwnerRememberFile(CurrentResourceOwner, file);
1300 	VfdCache[file].resowner = CurrentResourceOwner;
1301 
1302 	/* Backup mechanism for closing at end of xact. */
1303 	VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
1304 	have_xact_temporary_files = true;
1305 }
1306 
/*
 * FileInvalidate --- called when we get a shared invalidation message on
 * some relation
 *
 * Closes the kernel FD of the given VFD if it is currently open.  Only
 * compiled when NOT_USED is defined.
 */
#ifdef NOT_USED
void
FileInvalidate(File file)
{
	Assert(FileIsValid(file));

	if (FileIsNotOpen(file))
		return;					/* nothing to close */

	LruDelete(file);
}
#endif
1319 
1320 /*
1321  * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1322  * fileMode parameter.
1323  */
1324 File
PathNameOpenFile(const char * fileName,int fileFlags)1325 PathNameOpenFile(const char *fileName, int fileFlags)
1326 {
1327 	return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1328 }
1329 
1330 /*
1331  * open a file in an arbitrary directory
1332  *
1333  * NB: if the passed pathname is relative (which it usually is),
1334  * it will be interpreted relative to the process' working directory
1335  * (which should always be $PGDATA when this code is running).
1336  */
1337 File
PathNameOpenFilePerm(const char * fileName,int fileFlags,mode_t fileMode)1338 PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1339 {
1340 	char	   *fnamecopy;
1341 	File		file;
1342 	Vfd		   *vfdP;
1343 
1344 	DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1345 			   fileName, fileFlags, fileMode));
1346 
1347 	/*
1348 	 * We need a malloc'd copy of the file name; fail cleanly if no room.
1349 	 */
1350 	fnamecopy = strdup(fileName);
1351 	if (fnamecopy == NULL)
1352 		ereport(ERROR,
1353 				(errcode(ERRCODE_OUT_OF_MEMORY),
1354 				 errmsg("out of memory")));
1355 
1356 	file = AllocateVfd();
1357 	vfdP = &VfdCache[file];
1358 
1359 	/* Close excess kernel FDs. */
1360 	ReleaseLruFiles();
1361 
1362 	vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1363 
1364 	if (vfdP->fd < 0)
1365 	{
1366 		int			save_errno = errno;
1367 
1368 		FreeVfd(file);
1369 		free(fnamecopy);
1370 		errno = save_errno;
1371 		return -1;
1372 	}
1373 	++nfile;
1374 	DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1375 			   vfdP->fd));
1376 
1377 	vfdP->fileName = fnamecopy;
1378 	/* Saved flags are adjusted to be OK for re-opening file */
1379 	vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1380 	vfdP->fileMode = fileMode;
1381 	vfdP->fileSize = 0;
1382 	vfdP->fdstate = 0x0;
1383 	vfdP->resowner = NULL;
1384 
1385 	Insert(file);
1386 
1387 	return file;
1388 }
1389 
1390 /*
1391  * Create directory 'directory'.  If necessary, create 'basedir', which must
1392  * be the directory above it.  This is designed for creating the top-level
1393  * temporary directory on demand before creating a directory underneath it.
1394  * Do nothing if the directory already exists.
1395  *
1396  * Directories created within the top-level temporary directory should begin
1397  * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1398  * deleted at startup by RemovePgTempFiles().  Further subdirectories below
1399  * that do not need any particular prefix.
1400 */
1401 void
PathNameCreateTemporaryDir(const char * basedir,const char * directory)1402 PathNameCreateTemporaryDir(const char *basedir, const char *directory)
1403 {
1404 	if (MakePGDirectory(directory) < 0)
1405 	{
1406 		if (errno == EEXIST)
1407 			return;
1408 
1409 		/*
1410 		 * Failed.  Try to create basedir first in case it's missing. Tolerate
1411 		 * EEXIST to close a race against another process following the same
1412 		 * algorithm.
1413 		 */
1414 		if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1415 			ereport(ERROR,
1416 					(errcode_for_file_access(),
1417 					 errmsg("cannot create temporary directory \"%s\": %m",
1418 							basedir)));
1419 
1420 		/* Try again. */
1421 		if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1422 			ereport(ERROR,
1423 					(errcode_for_file_access(),
1424 					 errmsg("cannot create temporary subdirectory \"%s\": %m",
1425 							directory)));
1426 	}
1427 }
1428 
1429 /*
1430  * Delete a directory and everything in it, if it exists.
1431  */
1432 void
PathNameDeleteTemporaryDir(const char * dirname)1433 PathNameDeleteTemporaryDir(const char *dirname)
1434 {
1435 	struct stat statbuf;
1436 
1437 	/* Silently ignore missing directory. */
1438 	if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1439 		return;
1440 
1441 	/*
1442 	 * Currently, walkdir doesn't offer a way for our passed in function to
1443 	 * maintain state.  Perhaps it should, so that we could tell the caller
1444 	 * whether this operation succeeded or failed.  Since this operation is
1445 	 * used in a cleanup path, we wouldn't actually behave differently: we'll
1446 	 * just log failures.
1447 	 */
1448 	walkdir(dirname, unlink_if_exists_fname, false, LOG);
1449 }
1450 
1451 /*
1452  * Open a temporary file that will disappear when we close it.
1453  *
1454  * This routine takes care of generating an appropriate tempfile name.
1455  * There's no need to pass in fileFlags or fileMode either, since only
1456  * one setting makes any sense for a temp file.
1457  *
1458  * Unless interXact is true, the file is remembered by CurrentResourceOwner
1459  * to ensure it's closed and deleted when it's no longer needed, typically at
1460  * the end-of-transaction. In most cases, you don't want temporary files to
1461  * outlive the transaction that created them, so this should be false -- but
1462  * if you need "somewhat" temporary storage, this might be useful. In either
1463  * case, the file is removed when the File is explicitly closed.
1464  */
1465 File
OpenTemporaryFile(bool interXact)1466 OpenTemporaryFile(bool interXact)
1467 {
1468 	File		file = 0;
1469 
1470 	/*
1471 	 * Make sure the current resource owner has space for this File before we
1472 	 * open it, if we'll be registering it below.
1473 	 */
1474 	if (!interXact)
1475 		ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1476 
1477 	/*
1478 	 * If some temp tablespace(s) have been given to us, try to use the next
1479 	 * one.  If a given tablespace can't be found, we silently fall back to
1480 	 * the database's default tablespace.
1481 	 *
1482 	 * BUT: if the temp file is slated to outlive the current transaction,
1483 	 * force it into the database's default tablespace, so that it will not
1484 	 * pose a threat to possible tablespace drop attempts.
1485 	 */
1486 	if (numTempTableSpaces > 0 && !interXact)
1487 	{
1488 		Oid			tblspcOid = GetNextTempTableSpace();
1489 
1490 		if (OidIsValid(tblspcOid))
1491 			file = OpenTemporaryFileInTablespace(tblspcOid, false);
1492 	}
1493 
1494 	/*
1495 	 * If not, or if tablespace is bad, create in database's default
1496 	 * tablespace.  MyDatabaseTableSpace should normally be set before we get
1497 	 * here, but just in case it isn't, fall back to pg_default tablespace.
1498 	 */
1499 	if (file <= 0)
1500 		file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
1501 											 MyDatabaseTableSpace :
1502 											 DEFAULTTABLESPACE_OID,
1503 											 true);
1504 
1505 	/* Mark it for deletion at close and temporary file size limit */
1506 	VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
1507 
1508 	/* Register it with the current resource owner */
1509 	if (!interXact)
1510 		RegisterTemporaryFile(file);
1511 
1512 	return file;
1513 }
1514 
1515 /*
1516  * Return the path of the temp directory in a given tablespace.
1517  */
1518 void
TempTablespacePath(char * path,Oid tablespace)1519 TempTablespacePath(char *path, Oid tablespace)
1520 {
1521 	/*
1522 	 * Identify the tempfile directory for this tablespace.
1523 	 *
1524 	 * If someone tries to specify pg_global, use pg_default instead.
1525 	 */
1526 	if (tablespace == InvalidOid ||
1527 		tablespace == DEFAULTTABLESPACE_OID ||
1528 		tablespace == GLOBALTABLESPACE_OID)
1529 		snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1530 	else
1531 	{
1532 		/* All other tablespaces are accessed via symlinks */
1533 		snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
1534 				 tablespace, TABLESPACE_VERSION_DIRECTORY,
1535 				 PG_TEMP_FILES_DIR);
1536 	}
1537 }
1538 
1539 /*
1540  * Open a temporary file in a specific tablespace.
1541  * Subroutine for OpenTemporaryFile, which see for details.
1542  */
1543 static File
OpenTemporaryFileInTablespace(Oid tblspcOid,bool rejectError)1544 OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1545 {
1546 	char		tempdirpath[MAXPGPATH];
1547 	char		tempfilepath[MAXPGPATH];
1548 	File		file;
1549 
1550 	TempTablespacePath(tempdirpath, tblspcOid);
1551 
1552 	/*
1553 	 * Generate a tempfile name that should be unique within the current
1554 	 * database instance.
1555 	 */
1556 	snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1557 			 tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1558 
1559 	/*
1560 	 * Open the file.  Note: we don't use O_EXCL, in case there is an orphaned
1561 	 * temp file that can be reused.
1562 	 */
1563 	file = PathNameOpenFile(tempfilepath,
1564 							O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1565 	if (file <= 0)
1566 	{
1567 		/*
1568 		 * We might need to create the tablespace's tempfile directory, if no
1569 		 * one has yet done so.
1570 		 *
1571 		 * Don't check for an error from MakePGDirectory; it could fail if
1572 		 * someone else just did the same thing.  If it doesn't work then
1573 		 * we'll bomb out on the second create attempt, instead.
1574 		 */
1575 		(void) MakePGDirectory(tempdirpath);
1576 
1577 		file = PathNameOpenFile(tempfilepath,
1578 								O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1579 		if (file <= 0 && rejectError)
1580 			elog(ERROR, "could not create temporary file \"%s\": %m",
1581 				 tempfilepath);
1582 	}
1583 
1584 	return file;
1585 }
1586 
1587 
1588 /*
1589  * Create a new file.  The directory containing it must already exist.  Files
1590  * created this way are subject to temp_file_limit and are automatically
1591  * closed at end of transaction, but are not automatically deleted on close
1592  * because they are intended to be shared between cooperating backends.
1593  *
1594  * If the file is inside the top-level temporary directory, its name should
1595  * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1596  * and deleted at startup by RemovePgTempFiles().  Alternatively, it can be
1597  * inside a directory created with PathNameCreateTemporaryDir(), in which case
1598  * the prefix isn't needed.
1599  */
1600 File
PathNameCreateTemporaryFile(const char * path,bool error_on_failure)1601 PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1602 {
1603 	File		file;
1604 
1605 	ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1606 
1607 	/*
1608 	 * Open the file.  Note: we don't use O_EXCL, in case there is an orphaned
1609 	 * temp file that can be reused.
1610 	 */
1611 	file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1612 	if (file <= 0)
1613 	{
1614 		if (error_on_failure)
1615 			ereport(ERROR,
1616 					(errcode_for_file_access(),
1617 					 errmsg("could not create temporary file \"%s\": %m",
1618 							path)));
1619 		else
1620 			return file;
1621 	}
1622 
1623 	/* Mark it for temp_file_limit accounting. */
1624 	VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
1625 
1626 	/* Register it for automatic close. */
1627 	RegisterTemporaryFile(file);
1628 
1629 	return file;
1630 }
1631 
1632 /*
1633  * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1634  * another backend.  Files opened this way don't count against the
1635  * temp_file_limit of the caller, are read-only and are automatically closed
1636  * at the end of the transaction but are not deleted on close.
1637  */
1638 File
PathNameOpenTemporaryFile(const char * path)1639 PathNameOpenTemporaryFile(const char *path)
1640 {
1641 	File		file;
1642 
1643 	ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1644 
1645 	/* We open the file read-only. */
1646 	file = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
1647 
1648 	/* If no such file, then we don't raise an error. */
1649 	if (file <= 0 && errno != ENOENT)
1650 		ereport(ERROR,
1651 				(errcode_for_file_access(),
1652 				 errmsg("could not open temporary file \"%s\": %m",
1653 						path)));
1654 
1655 	if (file > 0)
1656 	{
1657 		/* Register it for automatic close. */
1658 		RegisterTemporaryFile(file);
1659 	}
1660 
1661 	return file;
1662 }
1663 
1664 /*
1665  * Delete a file by pathname.  Return true if the file existed, false if
1666  * didn't.
1667  */
1668 bool
PathNameDeleteTemporaryFile(const char * path,bool error_on_failure)1669 PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1670 {
1671 	struct stat filestats;
1672 	int			stat_errno;
1673 
1674 	/* Get the final size for pgstat reporting. */
1675 	if (stat(path, &filestats) != 0)
1676 		stat_errno = errno;
1677 	else
1678 		stat_errno = 0;
1679 
1680 	/*
1681 	 * Unlike FileClose's automatic file deletion code, we tolerate
1682 	 * non-existence to support BufFileDeleteShared which doesn't know how
1683 	 * many segments it has to delete until it runs out.
1684 	 */
1685 	if (stat_errno == ENOENT)
1686 		return false;
1687 
1688 	if (unlink(path) < 0)
1689 	{
1690 		if (errno != ENOENT)
1691 			ereport(error_on_failure ? ERROR : LOG,
1692 					(errcode_for_file_access(),
1693 					 errmsg("could not unlink temporary file \"%s\": %m",
1694 							path)));
1695 		return false;
1696 	}
1697 
1698 	if (stat_errno == 0)
1699 		ReportTemporaryFileUsage(path, filestats.st_size);
1700 	else
1701 	{
1702 		errno = stat_errno;
1703 		ereport(LOG,
1704 				(errcode_for_file_access(),
1705 				 errmsg("could not stat file \"%s\": %m", path)));
1706 	}
1707 
1708 	return true;
1709 }
1710 
/*
 * FileClose --- close a file when done with it
 *
 * Closes the kernel FD if the VFD currently has one and removes the VFD
 * from the LRU ring, subtracts the file's size from temporary_files_size
 * when the VFD is flagged FD_TEMP_FILE_LIMIT, unlinks the file when it is
 * flagged FD_DELETE_AT_CLOSE, unregisters it from its resource owner (if
 * any), and finally returns the VFD slot to the free list.
 *
 * The order of the steps is deliberate; see the comments below about what
 * happens if an error is raised part-way through.
 */
void
FileClose(File file)
{
	Vfd		   *vfdP;

	Assert(FileIsValid(file));

	DO_DB(elog(LOG, "FileClose: %d (%s)",
			   file, VfdCache[file].fileName));

	vfdP = &VfdCache[file];

	if (!FileIsNotOpen(file))
	{
		/* close the file */
		if (close(vfdP->fd))
		{
			/*
			 * Temporary files (FD_TEMP_FILE_LIMIT) are reported at LOG; for
			 * all other files the level is chosen by data_sync_elevel(),
			 * since we may need to panic on failure to close non-temporary
			 * files.  LruDelete uses the same rule.
			 */
			elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
				 "could not close file \"%s\": %m", vfdP->fileName);
		}

		/* bookkeeping is updated whether or not close() succeeded */
		--nfile;
		vfdP->fd = VFD_CLOSED;

		/* remove the file from the lru ring */
		Delete(file);
	}

	if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
	{
		/* Subtract its size from current usage (do first in case of error) */
		temporary_files_size -= vfdP->fileSize;
		vfdP->fileSize = 0;
	}

	/*
	 * Delete the file if it was temporary, and make a log entry if wanted
	 */
	if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
	{
		struct stat filestats;
		int			stat_errno;

		/*
		 * If we get an error, as could happen within the ereport/elog calls,
		 * we'll come right back here during transaction abort.  Reset the
		 * flag to ensure that we can't get into an infinite loop.  This code
		 * is arranged to ensure that the worst-case consequence is failing to
		 * emit log message(s), not failing to attempt the unlink.
		 */
		vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;


		/* first try the stat(), remembering its errno for later reporting */
		if (stat(vfdP->fileName, &filestats))
			stat_errno = errno;
		else
			stat_errno = 0;

		/* in any case do the unlink */
		if (unlink(vfdP->fileName))
			elog(LOG, "could not unlink file \"%s\": %m", vfdP->fileName);

		/* and last report the stat results */
		if (stat_errno == 0)
			ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
		else
		{
			/* restore stat()'s errno so that %m describes that failure */
			errno = stat_errno;
			elog(LOG, "could not stat file \"%s\": %m", vfdP->fileName);
		}
	}

	/* Unregister it from the resource owner */
	if (vfdP->resowner)
		ResourceOwnerForgetFile(vfdP->resowner, file);

	/*
	 * Return the Vfd slot to the free list
	 */
	FreeVfd(file);
}
1800 
1801 /*
1802  * FilePrefetch - initiate asynchronous read of a given range of the file.
1803  *
1804  * Currently the only implementation of this function is using posix_fadvise
1805  * which is the simplest standardized interface that accomplishes this.
1806  * We could add an implementation using libaio in the future; but note that
1807  * this API is inappropriate for libaio, which wants to have a buffer provided
1808  * to read into.
1809  */
1810 int
FilePrefetch(File file,off_t offset,int amount,uint32 wait_event_info)1811 FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
1812 {
1813 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
1814 	int			returnCode;
1815 
1816 	Assert(FileIsValid(file));
1817 
1818 	DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
1819 			   file, VfdCache[file].fileName,
1820 			   (int64) offset, amount));
1821 
1822 	returnCode = FileAccess(file);
1823 	if (returnCode < 0)
1824 		return returnCode;
1825 
1826 	pgstat_report_wait_start(wait_event_info);
1827 	returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
1828 							   POSIX_FADV_WILLNEED);
1829 	pgstat_report_wait_end();
1830 
1831 	return returnCode;
1832 #else
1833 	Assert(FileIsValid(file));
1834 	return 0;
1835 #endif
1836 }
1837 
1838 void
FileWriteback(File file,off_t offset,off_t nbytes,uint32 wait_event_info)1839 FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
1840 {
1841 	int			returnCode;
1842 
1843 	Assert(FileIsValid(file));
1844 
1845 	DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
1846 			   file, VfdCache[file].fileName,
1847 			   (int64) offset, (int64) nbytes));
1848 
1849 	if (nbytes <= 0)
1850 		return;
1851 
1852 	returnCode = FileAccess(file);
1853 	if (returnCode < 0)
1854 		return;
1855 
1856 	pgstat_report_wait_start(wait_event_info);
1857 	pg_flush_data(VfdCache[file].fd, offset, nbytes);
1858 	pgstat_report_wait_end();
1859 }
1860 
1861 int
FileRead(File file,char * buffer,int amount,off_t offset,uint32 wait_event_info)1862 FileRead(File file, char *buffer, int amount, off_t offset,
1863 		 uint32 wait_event_info)
1864 {
1865 	int			returnCode;
1866 	Vfd		   *vfdP;
1867 
1868 	Assert(FileIsValid(file));
1869 
1870 	DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
1871 			   file, VfdCache[file].fileName,
1872 			   (int64) offset,
1873 			   amount, buffer));
1874 
1875 	returnCode = FileAccess(file);
1876 	if (returnCode < 0)
1877 		return returnCode;
1878 
1879 	vfdP = &VfdCache[file];
1880 
1881 retry:
1882 	pgstat_report_wait_start(wait_event_info);
1883 	returnCode = pg_pread(vfdP->fd, buffer, amount, offset);
1884 	pgstat_report_wait_end();
1885 
1886 	if (returnCode < 0)
1887 	{
1888 		/*
1889 		 * Windows may run out of kernel buffers and return "Insufficient
1890 		 * system resources" error.  Wait a bit and retry to solve it.
1891 		 *
1892 		 * It is rumored that EINTR is also possible on some Unix filesystems,
1893 		 * in which case immediate retry is indicated.
1894 		 */
1895 #ifdef WIN32
1896 		DWORD		error = GetLastError();
1897 
1898 		switch (error)
1899 		{
1900 			case ERROR_NO_SYSTEM_RESOURCES:
1901 				pg_usleep(1000L);
1902 				errno = EINTR;
1903 				break;
1904 			default:
1905 				_dosmaperr(error);
1906 				break;
1907 		}
1908 #endif
1909 		/* OK to retry if interrupted */
1910 		if (errno == EINTR)
1911 			goto retry;
1912 	}
1913 
1914 	return returnCode;
1915 }
1916 
1917 int
FileWrite(File file,char * buffer,int amount,off_t offset,uint32 wait_event_info)1918 FileWrite(File file, char *buffer, int amount, off_t offset,
1919 		  uint32 wait_event_info)
1920 {
1921 	int			returnCode;
1922 	Vfd		   *vfdP;
1923 
1924 	Assert(FileIsValid(file));
1925 
1926 	DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
1927 			   file, VfdCache[file].fileName,
1928 			   (int64) offset,
1929 			   amount, buffer));
1930 
1931 	returnCode = FileAccess(file);
1932 	if (returnCode < 0)
1933 		return returnCode;
1934 
1935 	vfdP = &VfdCache[file];
1936 
1937 	/*
1938 	 * If enforcing temp_file_limit and it's a temp file, check to see if the
1939 	 * write would overrun temp_file_limit, and throw error if so.  Note: it's
1940 	 * really a modularity violation to throw error here; we should set errno
1941 	 * and return -1.  However, there's no way to report a suitable error
1942 	 * message if we do that.  All current callers would just throw error
1943 	 * immediately anyway, so this is safe at present.
1944 	 */
1945 	if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
1946 	{
1947 		off_t		past_write = offset + amount;
1948 
1949 		if (past_write > vfdP->fileSize)
1950 		{
1951 			uint64		newTotal = temporary_files_size;
1952 
1953 			newTotal += past_write - vfdP->fileSize;
1954 			if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
1955 				ereport(ERROR,
1956 						(errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
1957 						 errmsg("temporary file size exceeds temp_file_limit (%dkB)",
1958 								temp_file_limit)));
1959 		}
1960 	}
1961 
1962 retry:
1963 	errno = 0;
1964 	pgstat_report_wait_start(wait_event_info);
1965 	returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset);
1966 	pgstat_report_wait_end();
1967 
1968 	/* if write didn't set errno, assume problem is no disk space */
1969 	if (returnCode != amount && errno == 0)
1970 		errno = ENOSPC;
1971 
1972 	if (returnCode >= 0)
1973 	{
1974 		/*
1975 		 * Maintain fileSize and temporary_files_size if it's a temp file.
1976 		 *
1977 		 * If seekPos is -1 (unknown), this will do nothing; but we could only
1978 		 * get here in that state if we're not enforcing temporary_files_size,
1979 		 * so we don't care.
1980 		 */
1981 		if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1982 		{
1983 			off_t		past_write = offset + amount;
1984 
1985 			if (past_write > vfdP->fileSize)
1986 			{
1987 				temporary_files_size += past_write - vfdP->fileSize;
1988 				vfdP->fileSize = past_write;
1989 			}
1990 		}
1991 	}
1992 	else
1993 	{
1994 		/*
1995 		 * See comments in FileRead()
1996 		 */
1997 #ifdef WIN32
1998 		DWORD		error = GetLastError();
1999 
2000 		switch (error)
2001 		{
2002 			case ERROR_NO_SYSTEM_RESOURCES:
2003 				pg_usleep(1000L);
2004 				errno = EINTR;
2005 				break;
2006 			default:
2007 				_dosmaperr(error);
2008 				break;
2009 		}
2010 #endif
2011 		/* OK to retry if interrupted */
2012 		if (errno == EINTR)
2013 			goto retry;
2014 	}
2015 
2016 	return returnCode;
2017 }
2018 
2019 int
FileSync(File file,uint32 wait_event_info)2020 FileSync(File file, uint32 wait_event_info)
2021 {
2022 	int			returnCode;
2023 
2024 	Assert(FileIsValid(file));
2025 
2026 	DO_DB(elog(LOG, "FileSync: %d (%s)",
2027 			   file, VfdCache[file].fileName));
2028 
2029 	returnCode = FileAccess(file);
2030 	if (returnCode < 0)
2031 		return returnCode;
2032 
2033 	pgstat_report_wait_start(wait_event_info);
2034 	returnCode = pg_fsync(VfdCache[file].fd);
2035 	pgstat_report_wait_end();
2036 
2037 	return returnCode;
2038 }
2039 
2040 off_t
FileSize(File file)2041 FileSize(File file)
2042 {
2043 	Assert(FileIsValid(file));
2044 
2045 	DO_DB(elog(LOG, "FileSize %d (%s)",
2046 			   file, VfdCache[file].fileName));
2047 
2048 	if (FileIsNotOpen(file))
2049 	{
2050 		if (FileAccess(file) < 0)
2051 			return (off_t) -1;
2052 	}
2053 
2054 	return lseek(VfdCache[file].fd, 0, SEEK_END);
2055 }
2056 
2057 int
FileTruncate(File file,off_t offset,uint32 wait_event_info)2058 FileTruncate(File file, off_t offset, uint32 wait_event_info)
2059 {
2060 	int			returnCode;
2061 
2062 	Assert(FileIsValid(file));
2063 
2064 	DO_DB(elog(LOG, "FileTruncate %d (%s)",
2065 			   file, VfdCache[file].fileName));
2066 
2067 	returnCode = FileAccess(file);
2068 	if (returnCode < 0)
2069 		return returnCode;
2070 
2071 	pgstat_report_wait_start(wait_event_info);
2072 	returnCode = ftruncate(VfdCache[file].fd, offset);
2073 	pgstat_report_wait_end();
2074 
2075 	if (returnCode == 0 && VfdCache[file].fileSize > offset)
2076 	{
2077 		/* adjust our state for truncation of a temp file */
2078 		Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2079 		temporary_files_size -= VfdCache[file].fileSize - offset;
2080 		VfdCache[file].fileSize = offset;
2081 	}
2082 
2083 	return returnCode;
2084 }
2085 
2086 /*
2087  * Return the pathname associated with an open file.
2088  *
2089  * The returned string points to an internal buffer, which is valid until
2090  * the file is closed.
2091  */
2092 char *
FilePathName(File file)2093 FilePathName(File file)
2094 {
2095 	Assert(FileIsValid(file));
2096 
2097 	return VfdCache[file].fileName;
2098 }
2099 
2100 /*
2101  * Return the raw file descriptor of an opened file.
2102  *
2103  * The returned file descriptor will be valid until the file is closed, but
2104  * there are a lot of things that can make that happen.  So the caller should
2105  * be careful not to do much of anything else before it finishes using the
2106  * returned file descriptor.
2107  */
2108 int
FileGetRawDesc(File file)2109 FileGetRawDesc(File file)
2110 {
2111 	Assert(FileIsValid(file));
2112 	return VfdCache[file].fd;
2113 }
2114 
2115 /*
2116  * FileGetRawFlags - returns the file flags on open(2)
2117  */
2118 int
FileGetRawFlags(File file)2119 FileGetRawFlags(File file)
2120 {
2121 	Assert(FileIsValid(file));
2122 	return VfdCache[file].fileFlags;
2123 }
2124 
2125 /*
2126  * FileGetRawMode - returns the mode bitmask passed to open(2)
2127  */
2128 mode_t
FileGetRawMode(File file)2129 FileGetRawMode(File file)
2130 {
2131 	Assert(FileIsValid(file));
2132 	return VfdCache[file].fileMode;
2133 }
2134 
2135 /*
2136  * Make room for another allocatedDescs[] array entry if needed and possible.
2137  * Returns true if an array element is available.
2138  */
2139 static bool
reserveAllocatedDesc(void)2140 reserveAllocatedDesc(void)
2141 {
2142 	AllocateDesc *newDescs;
2143 	int			newMax;
2144 
2145 	/* Quick out if array already has a free slot. */
2146 	if (numAllocatedDescs < maxAllocatedDescs)
2147 		return true;
2148 
2149 	/*
2150 	 * If the array hasn't yet been created in the current process, initialize
2151 	 * it with FD_MINFREE / 2 elements.  In many scenarios this is as many as
2152 	 * we will ever need, anyway.  We don't want to look at max_safe_fds
2153 	 * immediately because set_max_safe_fds() may not have run yet.
2154 	 */
2155 	if (allocatedDescs == NULL)
2156 	{
2157 		newMax = FD_MINFREE / 2;
2158 		newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2159 		/* Out of memory already?  Treat as fatal error. */
2160 		if (newDescs == NULL)
2161 			ereport(ERROR,
2162 					(errcode(ERRCODE_OUT_OF_MEMORY),
2163 					 errmsg("out of memory")));
2164 		allocatedDescs = newDescs;
2165 		maxAllocatedDescs = newMax;
2166 		return true;
2167 	}
2168 
2169 	/*
2170 	 * Consider enlarging the array beyond the initial allocation used above.
2171 	 * By the time this happens, max_safe_fds should be known accurately.
2172 	 *
2173 	 * We mustn't let allocated descriptors hog all the available FDs, and in
2174 	 * practice we'd better leave a reasonable number of FDs for VFD use.  So
2175 	 * set the maximum to max_safe_fds / 2.  (This should certainly be at
2176 	 * least as large as the initial size, FD_MINFREE / 2.)
2177 	 */
2178 	newMax = max_safe_fds / 2;
2179 	if (newMax > maxAllocatedDescs)
2180 	{
2181 		newDescs = (AllocateDesc *) realloc(allocatedDescs,
2182 											newMax * sizeof(AllocateDesc));
2183 		/* Treat out-of-memory as a non-fatal error. */
2184 		if (newDescs == NULL)
2185 			return false;
2186 		allocatedDescs = newDescs;
2187 		maxAllocatedDescs = newMax;
2188 		return true;
2189 	}
2190 
2191 	/* Can't enlarge allocatedDescs[] any more. */
2192 	return false;
2193 }
2194 
2195 /*
2196  * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2197  * rather than plain fopen().  This lets fd.c deal with freeing FDs if
2198  * necessary to open the file.  When done, call FreeFile rather than fclose.
2199  *
2200  * Note that files that will be open for any significant length of time
2201  * should NOT be handled this way, since they cannot share kernel file
2202  * descriptors with other files; there is grave risk of running out of FDs
2203  * if anyone locks down too many FDs.  Most callers of this routine are
2204  * simply reading a config file that they will read and close immediately.
2205  *
2206  * fd.c will automatically close all files opened with AllocateFile at
2207  * transaction commit or abort; this prevents FD leakage if a routine
2208  * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2209  *
2210  * Ideally this should be the *only* direct call of fopen() in the backend.
2211  */
2212 FILE *
AllocateFile(const char * name,const char * mode)2213 AllocateFile(const char *name, const char *mode)
2214 {
2215 	FILE	   *file;
2216 
2217 	DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2218 			   numAllocatedDescs, name));
2219 
2220 	/* Can we allocate another non-virtual FD? */
2221 	if (!reserveAllocatedDesc())
2222 		ereport(ERROR,
2223 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2224 				 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2225 						maxAllocatedDescs, name)));
2226 
2227 	/* Close excess kernel FDs. */
2228 	ReleaseLruFiles();
2229 
2230 TryAgain:
2231 	if ((file = fopen(name, mode)) != NULL)
2232 	{
2233 		AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2234 
2235 		desc->kind = AllocateDescFile;
2236 		desc->desc.file = file;
2237 		desc->create_subid = GetCurrentSubTransactionId();
2238 		numAllocatedDescs++;
2239 		return desc->desc.file;
2240 	}
2241 
2242 	if (errno == EMFILE || errno == ENFILE)
2243 	{
2244 		int			save_errno = errno;
2245 
2246 		ereport(LOG,
2247 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2248 				 errmsg("out of file descriptors: %m; release and retry")));
2249 		errno = 0;
2250 		if (ReleaseLruFile())
2251 			goto TryAgain;
2252 		errno = save_errno;
2253 	}
2254 
2255 	return NULL;
2256 }
2257 
2258 /*
2259  * Open a file with OpenTransientFilePerm() and pass default file mode for
2260  * the fileMode parameter.
2261  */
2262 int
OpenTransientFile(const char * fileName,int fileFlags)2263 OpenTransientFile(const char *fileName, int fileFlags)
2264 {
2265 	return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2266 }
2267 
2268 /*
2269  * Like AllocateFile, but returns an unbuffered fd like open(2)
2270  */
2271 int
OpenTransientFilePerm(const char * fileName,int fileFlags,mode_t fileMode)2272 OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
2273 {
2274 	int			fd;
2275 
2276 	DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2277 			   numAllocatedDescs, fileName));
2278 
2279 	/* Can we allocate another non-virtual FD? */
2280 	if (!reserveAllocatedDesc())
2281 		ereport(ERROR,
2282 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2283 				 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2284 						maxAllocatedDescs, fileName)));
2285 
2286 	/* Close excess kernel FDs. */
2287 	ReleaseLruFiles();
2288 
2289 	fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2290 
2291 	if (fd >= 0)
2292 	{
2293 		AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2294 
2295 		desc->kind = AllocateDescRawFD;
2296 		desc->desc.fd = fd;
2297 		desc->create_subid = GetCurrentSubTransactionId();
2298 		numAllocatedDescs++;
2299 
2300 		return fd;
2301 	}
2302 
2303 	return -1;					/* failure */
2304 }
2305 
2306 /*
2307  * Routines that want to initiate a pipe stream should use OpenPipeStream
2308  * rather than plain popen().  This lets fd.c deal with freeing FDs if
2309  * necessary.  When done, call ClosePipeStream rather than pclose.
2310  *
2311  * This function also ensures that the popen'd program is run with default
2312  * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2313  * uses.  This ensures desirable response to, eg, closing a read pipe early.
2314  */
2315 FILE *
OpenPipeStream(const char * command,const char * mode)2316 OpenPipeStream(const char *command, const char *mode)
2317 {
2318 	FILE	   *file;
2319 	int			save_errno;
2320 
2321 	DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2322 			   numAllocatedDescs, command));
2323 
2324 	/* Can we allocate another non-virtual FD? */
2325 	if (!reserveAllocatedDesc())
2326 		ereport(ERROR,
2327 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2328 				 errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2329 						maxAllocatedDescs, command)));
2330 
2331 	/* Close excess kernel FDs. */
2332 	ReleaseLruFiles();
2333 
2334 TryAgain:
2335 	fflush(stdout);
2336 	fflush(stderr);
2337 	pqsignal(SIGPIPE, SIG_DFL);
2338 	errno = 0;
2339 	file = popen(command, mode);
2340 	save_errno = errno;
2341 	pqsignal(SIGPIPE, SIG_IGN);
2342 	errno = save_errno;
2343 	if (file != NULL)
2344 	{
2345 		AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2346 
2347 		desc->kind = AllocateDescPipe;
2348 		desc->desc.file = file;
2349 		desc->create_subid = GetCurrentSubTransactionId();
2350 		numAllocatedDescs++;
2351 		return desc->desc.file;
2352 	}
2353 
2354 	if (errno == EMFILE || errno == ENFILE)
2355 	{
2356 		ereport(LOG,
2357 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2358 				 errmsg("out of file descriptors: %m; release and retry")));
2359 		if (ReleaseLruFile())
2360 			goto TryAgain;
2361 		errno = save_errno;
2362 	}
2363 
2364 	return NULL;
2365 }
2366 
2367 /*
2368  * Free an AllocateDesc of any type.
2369  *
2370  * The argument *must* point into the allocatedDescs[] array.
2371  */
2372 static int
FreeDesc(AllocateDesc * desc)2373 FreeDesc(AllocateDesc *desc)
2374 {
2375 	int			result;
2376 
2377 	/* Close the underlying object */
2378 	switch (desc->kind)
2379 	{
2380 		case AllocateDescFile:
2381 			result = fclose(desc->desc.file);
2382 			break;
2383 		case AllocateDescPipe:
2384 			result = pclose(desc->desc.file);
2385 			break;
2386 		case AllocateDescDir:
2387 			result = closedir(desc->desc.dir);
2388 			break;
2389 		case AllocateDescRawFD:
2390 			result = close(desc->desc.fd);
2391 			break;
2392 		default:
2393 			elog(ERROR, "AllocateDesc kind not recognized");
2394 			result = 0;			/* keep compiler quiet */
2395 			break;
2396 	}
2397 
2398 	/* Compact storage in the allocatedDescs array */
2399 	numAllocatedDescs--;
2400 	*desc = allocatedDescs[numAllocatedDescs];
2401 
2402 	return result;
2403 }
2404 
2405 /*
2406  * Close a file returned by AllocateFile.
2407  *
2408  * Note we do not check fclose's return value --- it is up to the caller
2409  * to handle close errors.
2410  */
2411 int
FreeFile(FILE * file)2412 FreeFile(FILE *file)
2413 {
2414 	int			i;
2415 
2416 	DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2417 
2418 	/* Remove file from list of allocated files, if it's present */
2419 	for (i = numAllocatedDescs; --i >= 0;)
2420 	{
2421 		AllocateDesc *desc = &allocatedDescs[i];
2422 
2423 		if (desc->kind == AllocateDescFile && desc->desc.file == file)
2424 			return FreeDesc(desc);
2425 	}
2426 
2427 	/* Only get here if someone passes us a file not in allocatedDescs */
2428 	elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2429 
2430 	return fclose(file);
2431 }
2432 
2433 /*
2434  * Close a file returned by OpenTransientFile.
2435  *
2436  * Note we do not check close's return value --- it is up to the caller
2437  * to handle close errors.
2438  */
2439 int
CloseTransientFile(int fd)2440 CloseTransientFile(int fd)
2441 {
2442 	int			i;
2443 
2444 	DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2445 
2446 	/* Remove fd from list of allocated files, if it's present */
2447 	for (i = numAllocatedDescs; --i >= 0;)
2448 	{
2449 		AllocateDesc *desc = &allocatedDescs[i];
2450 
2451 		if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2452 			return FreeDesc(desc);
2453 	}
2454 
2455 	/* Only get here if someone passes us a file not in allocatedDescs */
2456 	elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2457 
2458 	return close(fd);
2459 }
2460 
2461 /*
2462  * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2463  * rather than plain opendir().  This lets fd.c deal with freeing FDs if
2464  * necessary to open the directory, and with closing it after an elog.
2465  * When done, call FreeDir rather than closedir.
2466  *
2467  * Returns NULL, with errno set, on failure.  Note that failure detection
2468  * is commonly left to the following call of ReadDir or ReadDirExtended;
2469  * see the comments for ReadDir.
2470  *
2471  * Ideally this should be the *only* direct call of opendir() in the backend.
2472  */
2473 DIR *
AllocateDir(const char * dirname)2474 AllocateDir(const char *dirname)
2475 {
2476 	DIR		   *dir;
2477 
2478 	DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2479 			   numAllocatedDescs, dirname));
2480 
2481 	/* Can we allocate another non-virtual FD? */
2482 	if (!reserveAllocatedDesc())
2483 		ereport(ERROR,
2484 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2485 				 errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2486 						maxAllocatedDescs, dirname)));
2487 
2488 	/* Close excess kernel FDs. */
2489 	ReleaseLruFiles();
2490 
2491 TryAgain:
2492 	if ((dir = opendir(dirname)) != NULL)
2493 	{
2494 		AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2495 
2496 		desc->kind = AllocateDescDir;
2497 		desc->desc.dir = dir;
2498 		desc->create_subid = GetCurrentSubTransactionId();
2499 		numAllocatedDescs++;
2500 		return desc->desc.dir;
2501 	}
2502 
2503 	if (errno == EMFILE || errno == ENFILE)
2504 	{
2505 		int			save_errno = errno;
2506 
2507 		ereport(LOG,
2508 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2509 				 errmsg("out of file descriptors: %m; release and retry")));
2510 		errno = 0;
2511 		if (ReleaseLruFile())
2512 			goto TryAgain;
2513 		errno = save_errno;
2514 	}
2515 
2516 	return NULL;
2517 }
2518 
2519 /*
2520  * Read a directory opened with AllocateDir, ereport'ing any error.
2521  *
2522  * This is easier to use than raw readdir() since it takes care of some
2523  * otherwise rather tedious and error-prone manipulation of errno.  Also,
2524  * if you are happy with a generic error message for AllocateDir failure,
2525  * you can just do
2526  *
2527  *		dir = AllocateDir(path);
2528  *		while ((dirent = ReadDir(dir, path)) != NULL)
2529  *			process dirent;
2530  *		FreeDir(dir);
2531  *
2532  * since a NULL dir parameter is taken as indicating AllocateDir failed.
2533  * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2534  * use this shortcut.)
2535  *
2536  * The pathname passed to AllocateDir must be passed to this routine too,
2537  * but it is only used for error reporting.
2538  */
2539 struct dirent *
ReadDir(DIR * dir,const char * dirname)2540 ReadDir(DIR *dir, const char *dirname)
2541 {
2542 	return ReadDirExtended(dir, dirname, ERROR);
2543 }
2544 
2545 /*
2546  * Alternate version of ReadDir that allows caller to specify the elevel
2547  * for any error report (whether it's reporting an initial failure of
2548  * AllocateDir or a subsequent directory read failure).
2549  *
2550  * If elevel < ERROR, returns NULL after any error.  With the normal coding
2551  * pattern, this will result in falling out of the loop immediately as
2552  * though the directory contained no (more) entries.
2553  */
2554 struct dirent *
ReadDirExtended(DIR * dir,const char * dirname,int elevel)2555 ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2556 {
2557 	struct dirent *dent;
2558 
2559 	/* Give a generic message for AllocateDir failure, if caller didn't */
2560 	if (dir == NULL)
2561 	{
2562 		ereport(elevel,
2563 				(errcode_for_file_access(),
2564 				 errmsg("could not open directory \"%s\": %m",
2565 						dirname)));
2566 		return NULL;
2567 	}
2568 
2569 	errno = 0;
2570 	if ((dent = readdir(dir)) != NULL)
2571 		return dent;
2572 
2573 	if (errno)
2574 		ereport(elevel,
2575 				(errcode_for_file_access(),
2576 				 errmsg("could not read directory \"%s\": %m",
2577 						dirname)));
2578 	return NULL;
2579 }
2580 
2581 /*
2582  * Close a directory opened with AllocateDir.
2583  *
2584  * Returns closedir's return value (with errno set if it's not 0).
2585  * Note we do not check the return value --- it is up to the caller
2586  * to handle close errors if wanted.
2587  *
2588  * Does nothing if dir == NULL; we assume that directory open failure was
2589  * already reported if desired.
2590  */
2591 int
FreeDir(DIR * dir)2592 FreeDir(DIR *dir)
2593 {
2594 	int			i;
2595 
2596 	/* Nothing to do if AllocateDir failed */
2597 	if (dir == NULL)
2598 		return 0;
2599 
2600 	DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2601 
2602 	/* Remove dir from list of allocated dirs, if it's present */
2603 	for (i = numAllocatedDescs; --i >= 0;)
2604 	{
2605 		AllocateDesc *desc = &allocatedDescs[i];
2606 
2607 		if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2608 			return FreeDesc(desc);
2609 	}
2610 
2611 	/* Only get here if someone passes us a dir not in allocatedDescs */
2612 	elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
2613 
2614 	return closedir(dir);
2615 }
2616 
2617 
2618 /*
2619  * Close a pipe stream returned by OpenPipeStream.
2620  */
2621 int
ClosePipeStream(FILE * file)2622 ClosePipeStream(FILE *file)
2623 {
2624 	int			i;
2625 
2626 	DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
2627 
2628 	/* Remove file from list of allocated files, if it's present */
2629 	for (i = numAllocatedDescs; --i >= 0;)
2630 	{
2631 		AllocateDesc *desc = &allocatedDescs[i];
2632 
2633 		if (desc->kind == AllocateDescPipe && desc->desc.file == file)
2634 			return FreeDesc(desc);
2635 	}
2636 
2637 	/* Only get here if someone passes us a file not in allocatedDescs */
2638 	elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
2639 
2640 	return pclose(file);
2641 }
2642 
2643 /*
2644  * closeAllVfds
2645  *
2646  * Force all VFDs into the physically-closed state, so that the fewest
2647  * possible number of kernel file descriptors are in use.  There is no
2648  * change in the logical state of the VFDs.
2649  */
2650 void
closeAllVfds(void)2651 closeAllVfds(void)
2652 {
2653 	Index		i;
2654 
2655 	if (SizeVfdCache > 0)
2656 	{
2657 		Assert(FileIsNotOpen(0));	/* Make sure ring not corrupted */
2658 		for (i = 1; i < SizeVfdCache; i++)
2659 		{
2660 			if (!FileIsNotOpen(i))
2661 				LruDelete(i);
2662 		}
2663 	}
2664 }
2665 
2666 
2667 /*
2668  * SetTempTablespaces
2669  *
2670  * Define a list (actually an array) of OIDs of tablespaces to use for
2671  * temporary files.  This list will be used until end of transaction,
2672  * unless this function is called again before then.  It is caller's
2673  * responsibility that the passed-in array has adequate lifespan (typically
2674  * it'd be allocated in TopTransactionContext).
2675  *
2676  * Some entries of the array may be InvalidOid, indicating that the current
2677  * database's default tablespace should be used.
2678  */
2679 void
SetTempTablespaces(Oid * tableSpaces,int numSpaces)2680 SetTempTablespaces(Oid *tableSpaces, int numSpaces)
2681 {
2682 	Assert(numSpaces >= 0);
2683 	tempTableSpaces = tableSpaces;
2684 	numTempTableSpaces = numSpaces;
2685 
2686 	/*
2687 	 * Select a random starting point in the list.  This is to minimize
2688 	 * conflicts between backends that are most likely sharing the same list
2689 	 * of temp tablespaces.  Note that if we create multiple temp files in the
2690 	 * same transaction, we'll advance circularly through the list --- this
2691 	 * ensures that large temporary sort files are nicely spread across all
2692 	 * available tablespaces.
2693 	 */
2694 	if (numSpaces > 1)
2695 		nextTempTableSpace = random() % numSpaces;
2696 	else
2697 		nextTempTableSpace = 0;
2698 }
2699 
2700 /*
2701  * TempTablespacesAreSet
2702  *
2703  * Returns true if SetTempTablespaces has been called in current transaction.
2704  * (This is just so that tablespaces.c doesn't need its own per-transaction
2705  * state.)
2706  */
2707 bool
TempTablespacesAreSet(void)2708 TempTablespacesAreSet(void)
2709 {
2710 	return (numTempTableSpaces >= 0);
2711 }
2712 
2713 /*
2714  * GetTempTablespaces
2715  *
2716  * Populate an array with the OIDs of the tablespaces that should be used for
2717  * temporary files.  (Some entries may be InvalidOid, indicating that the
2718  * current database's default tablespace should be used.)  At most numSpaces
2719  * entries will be filled.
2720  * Returns the number of OIDs that were copied into the output array.
2721  */
2722 int
GetTempTablespaces(Oid * tableSpaces,int numSpaces)2723 GetTempTablespaces(Oid *tableSpaces, int numSpaces)
2724 {
2725 	int			i;
2726 
2727 	Assert(TempTablespacesAreSet());
2728 	for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
2729 		tableSpaces[i] = tempTableSpaces[i];
2730 
2731 	return i;
2732 }
2733 
2734 /*
2735  * GetNextTempTableSpace
2736  *
2737  * Select the next temp tablespace to use.  A result of InvalidOid means
2738  * to use the current database's default tablespace.
2739  */
2740 Oid
GetNextTempTableSpace(void)2741 GetNextTempTableSpace(void)
2742 {
2743 	if (numTempTableSpaces > 0)
2744 	{
2745 		/* Advance nextTempTableSpace counter with wraparound */
2746 		if (++nextTempTableSpace >= numTempTableSpaces)
2747 			nextTempTableSpace = 0;
2748 		return tempTableSpaces[nextTempTableSpace];
2749 	}
2750 	return InvalidOid;
2751 }
2752 
2753 
2754 /*
2755  * AtEOSubXact_Files
2756  *
2757  * Take care of subtransaction commit/abort.  At abort, we close temp files
2758  * that the subtransaction may have opened.  At commit, we reassign the
2759  * files that were opened to the parent subtransaction.
2760  */
2761 void
AtEOSubXact_Files(bool isCommit,SubTransactionId mySubid,SubTransactionId parentSubid)2762 AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
2763 				  SubTransactionId parentSubid)
2764 {
2765 	Index		i;
2766 
2767 	for (i = 0; i < numAllocatedDescs; i++)
2768 	{
2769 		if (allocatedDescs[i].create_subid == mySubid)
2770 		{
2771 			if (isCommit)
2772 				allocatedDescs[i].create_subid = parentSubid;
2773 			else
2774 			{
2775 				/* have to recheck the item after FreeDesc (ugly) */
2776 				FreeDesc(&allocatedDescs[i--]);
2777 			}
2778 		}
2779 	}
2780 }
2781 
2782 /*
2783  * AtEOXact_Files
2784  *
2785  * This routine is called during transaction commit or abort.  All still-open
2786  * per-transaction temporary file VFDs are closed, which also causes the
2787  * underlying files to be deleted (although they should've been closed already
2788  * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
2789  * closed. We also forget any transaction-local temp tablespace list.
2790  *
2791  * The isCommit flag is used only to decide whether to emit warnings about
2792  * unclosed files.
2793  */
2794 void
AtEOXact_Files(bool isCommit)2795 AtEOXact_Files(bool isCommit)
2796 {
2797 	CleanupTempFiles(isCommit, false);
2798 	tempTableSpaces = NULL;
2799 	numTempTableSpaces = -1;
2800 }
2801 
2802 /*
2803  * AtProcExit_Files
2804  *
2805  * on_proc_exit hook to clean up temp files during backend shutdown.
2806  * Here, we want to clean up *all* temp files including interXact ones.
2807  */
2808 static void
AtProcExit_Files(int code,Datum arg)2809 AtProcExit_Files(int code, Datum arg)
2810 {
2811 	CleanupTempFiles(false, true);
2812 }
2813 
2814 /*
2815  * Close temporary files and delete their underlying files.
2816  *
2817  * isCommit: if true, this is normal transaction commit, and we don't
2818  * expect any remaining files; warn if there are some.
2819  *
2820  * isProcExit: if true, this is being called as the backend process is
2821  * exiting. If that's the case, we should remove all temporary files; if
2822  * that's not the case, we are being called for transaction commit/abort
2823  * and should only remove transaction-local temp files.  In either case,
2824  * also clean up "allocated" stdio files, dirs and fds.
2825  */
2826 static void
CleanupTempFiles(bool isCommit,bool isProcExit)2827 CleanupTempFiles(bool isCommit, bool isProcExit)
2828 {
2829 	Index		i;
2830 
2831 	/*
2832 	 * Careful here: at proc_exit we need extra cleanup, not just
2833 	 * xact_temporary files.
2834 	 */
2835 	if (isProcExit || have_xact_temporary_files)
2836 	{
2837 		Assert(FileIsNotOpen(0));	/* Make sure ring not corrupted */
2838 		for (i = 1; i < SizeVfdCache; i++)
2839 		{
2840 			unsigned short fdstate = VfdCache[i].fdstate;
2841 
2842 			if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
2843 				VfdCache[i].fileName != NULL)
2844 			{
2845 				/*
2846 				 * If we're in the process of exiting a backend process, close
2847 				 * all temporary files. Otherwise, only close temporary files
2848 				 * local to the current transaction. They should be closed by
2849 				 * the ResourceOwner mechanism already, so this is just a
2850 				 * debugging cross-check.
2851 				 */
2852 				if (isProcExit)
2853 					FileClose(i);
2854 				else if (fdstate & FD_CLOSE_AT_EOXACT)
2855 				{
2856 					elog(WARNING,
2857 						 "temporary file %s not closed at end-of-transaction",
2858 						 VfdCache[i].fileName);
2859 					FileClose(i);
2860 				}
2861 			}
2862 		}
2863 
2864 		have_xact_temporary_files = false;
2865 	}
2866 
2867 	/* Complain if any allocated files remain open at commit. */
2868 	if (isCommit && numAllocatedDescs > 0)
2869 		elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
2870 			 numAllocatedDescs);
2871 
2872 	/* Clean up "allocated" stdio files, dirs and fds. */
2873 	while (numAllocatedDescs > 0)
2874 		FreeDesc(&allocatedDescs[0]);
2875 }
2876 
2877 
2878 /*
2879  * Remove temporary and temporary relation files left over from a prior
2880  * postmaster session
2881  *
2882  * This should be called during postmaster startup.  It will forcibly
2883  * remove any leftover files created by OpenTemporaryFile and any leftover
2884  * temporary relation files created by mdcreate.
2885  *
2886  * NOTE: we could, but don't, call this during a post-backend-crash restart
2887  * cycle.  The argument for not doing it is that someone might want to examine
2888  * the temp files for debugging purposes.  This does however mean that
2889  * OpenTemporaryFile had better allow for collision with an existing temp
2890  * file name.
2891  *
2892  * NOTE: this function and its subroutines generally report syscall failures
2893  * with ereport(LOG) and keep going.  Removing temp files is not so critical
2894  * that we should fail to start the database when we can't do it.
2895  */
2896 void
RemovePgTempFiles(void)2897 RemovePgTempFiles(void)
2898 {
2899 	char		temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
2900 	DIR		   *spc_dir;
2901 	struct dirent *spc_de;
2902 
2903 	/*
2904 	 * First process temp files in pg_default ($PGDATA/base)
2905 	 */
2906 	snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
2907 	RemovePgTempFilesInDir(temp_path, true, false);
2908 	RemovePgTempRelationFiles("base");
2909 
2910 	/*
2911 	 * Cycle through temp directories for all non-default tablespaces.
2912 	 */
2913 	spc_dir = AllocateDir("pg_tblspc");
2914 
2915 	while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL)
2916 	{
2917 		if (strcmp(spc_de->d_name, ".") == 0 ||
2918 			strcmp(spc_de->d_name, "..") == 0)
2919 			continue;
2920 
2921 		snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
2922 				 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
2923 		RemovePgTempFilesInDir(temp_path, true, false);
2924 
2925 		snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
2926 				 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
2927 		RemovePgTempRelationFiles(temp_path);
2928 	}
2929 
2930 	FreeDir(spc_dir);
2931 
2932 	/*
2933 	 * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
2934 	 * DataDir as well.
2935 	 */
2936 #ifdef EXEC_BACKEND
2937 	RemovePgTempFilesInDir(PG_TEMP_FILES_DIR, true, false);
2938 #endif
2939 }
2940 
2941 /*
2942  * Process one pgsql_tmp directory for RemovePgTempFiles.
2943  *
2944  * If missing_ok is true, it's all right for the named directory to not exist.
2945  * Any other problem results in a LOG message.  (missing_ok should be true at
2946  * the top level, since pgsql_tmp directories are not created until needed.)
2947  *
2948  * At the top level, this should be called with unlink_all = false, so that
2949  * only files matching the temporary name prefix will be unlinked.  When
2950  * recursing it will be called with unlink_all = true to unlink everything
2951  * under a top-level temporary directory.
2952  *
2953  * (These two flags could be replaced by one, but it seems clearer to keep
2954  * them separate.)
2955  */
2956 static void
RemovePgTempFilesInDir(const char * tmpdirname,bool missing_ok,bool unlink_all)2957 RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
2958 {
2959 	DIR		   *temp_dir;
2960 	struct dirent *temp_de;
2961 	char		rm_path[MAXPGPATH * 2];
2962 
2963 	temp_dir = AllocateDir(tmpdirname);
2964 
2965 	if (temp_dir == NULL && errno == ENOENT && missing_ok)
2966 		return;
2967 
2968 	while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
2969 	{
2970 		if (strcmp(temp_de->d_name, ".") == 0 ||
2971 			strcmp(temp_de->d_name, "..") == 0)
2972 			continue;
2973 
2974 		snprintf(rm_path, sizeof(rm_path), "%s/%s",
2975 				 tmpdirname, temp_de->d_name);
2976 
2977 		if (unlink_all ||
2978 			strncmp(temp_de->d_name,
2979 					PG_TEMP_FILE_PREFIX,
2980 					strlen(PG_TEMP_FILE_PREFIX)) == 0)
2981 		{
2982 			struct stat statbuf;
2983 
2984 			if (lstat(rm_path, &statbuf) < 0)
2985 			{
2986 				ereport(LOG,
2987 						(errcode_for_file_access(),
2988 						 errmsg("could not stat file \"%s\": %m", rm_path)));
2989 				continue;
2990 			}
2991 
2992 			if (S_ISDIR(statbuf.st_mode))
2993 			{
2994 				/* recursively remove contents, then directory itself */
2995 				RemovePgTempFilesInDir(rm_path, false, true);
2996 
2997 				if (rmdir(rm_path) < 0)
2998 					ereport(LOG,
2999 							(errcode_for_file_access(),
3000 							 errmsg("could not remove directory \"%s\": %m",
3001 									rm_path)));
3002 			}
3003 			else
3004 			{
3005 				if (unlink(rm_path) < 0)
3006 					ereport(LOG,
3007 							(errcode_for_file_access(),
3008 							 errmsg("could not remove file \"%s\": %m",
3009 									rm_path)));
3010 			}
3011 		}
3012 		else
3013 			ereport(LOG,
3014 					(errmsg("unexpected file found in temporary-files directory: \"%s\"",
3015 							rm_path)));
3016 	}
3017 
3018 	FreeDir(temp_dir);
3019 }
3020 
3021 /* Process one tablespace directory, look for per-DB subdirectories */
3022 static void
RemovePgTempRelationFiles(const char * tsdirname)3023 RemovePgTempRelationFiles(const char *tsdirname)
3024 {
3025 	DIR		   *ts_dir;
3026 	struct dirent *de;
3027 	char		dbspace_path[MAXPGPATH * 2];
3028 
3029 	ts_dir = AllocateDir(tsdirname);
3030 
3031 	while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3032 	{
3033 		/*
3034 		 * We're only interested in the per-database directories, which have
3035 		 * numeric names.  Note that this code will also (properly) ignore "."
3036 		 * and "..".
3037 		 */
3038 		if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3039 			continue;
3040 
3041 		snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3042 				 tsdirname, de->d_name);
3043 		RemovePgTempRelationFilesInDbspace(dbspace_path);
3044 	}
3045 
3046 	FreeDir(ts_dir);
3047 }
3048 
3049 /* Process one per-dbspace directory for RemovePgTempRelationFiles */
3050 static void
RemovePgTempRelationFilesInDbspace(const char * dbspacedirname)3051 RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3052 {
3053 	DIR		   *dbspace_dir;
3054 	struct dirent *de;
3055 	char		rm_path[MAXPGPATH * 2];
3056 
3057 	dbspace_dir = AllocateDir(dbspacedirname);
3058 
3059 	while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3060 	{
3061 		if (!looks_like_temp_rel_name(de->d_name))
3062 			continue;
3063 
3064 		snprintf(rm_path, sizeof(rm_path), "%s/%s",
3065 				 dbspacedirname, de->d_name);
3066 
3067 		if (unlink(rm_path) < 0)
3068 			ereport(LOG,
3069 					(errcode_for_file_access(),
3070 					 errmsg("could not remove file \"%s\": %m",
3071 							rm_path)));
3072 	}
3073 
3074 	FreeDir(dbspace_dir);
3075 }
3076 
/*
 * Does "name" have the form of a temporary relation file name?
 *
 * Accepted is t<digits>_<digits>, optionally followed by _<forkname>, and
 * then optionally by .<digits> (a segment number).  Nothing may follow.
 */
bool
looks_like_temp_rel_name(const char *name)
{
	const char *cur = name;
	const char *mark;

	/* Must start with "t". */
	if (*cur != 't')
		return false;
	cur++;

	/* Then a non-empty run of digits, terminated by an underscore. */
	mark = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == mark || *cur != '_')
		return false;
	cur++;

	/* Then a second non-empty run of digits. */
	mark = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == mark)
		return false;

	/* Optional "_forkname" part. */
	if (*cur == '_')
	{
		/*
		 * A result <= 0 is treated as "no fork name here"; otherwise the
		 * result is used as the number of characters to step over
		 * (presumably the fork name's length -- see forkname_chars).
		 */
		int			forklen = forkname_chars(cur + 1, NULL);

		if (forklen <= 0)
			return false;
		cur += forklen + 1;
	}

	/* Optional ".segment" part: a dot and at least one digit. */
	if (*cur == '.')
	{
		mark = cur + 1;
		cur = mark;
		while (isdigit((unsigned char) *cur))
			cur++;
		if (cur == mark)
			return false;
	}

	/* Now we should be at the end. */
	return *cur == '\0';
}
3125 
3126 
3127 /*
3128  * Issue fsync recursively on PGDATA and all its contents.
3129  *
3130  * We fsync regular files and directories wherever they are, but we
3131  * follow symlinks only for pg_wal and immediately under pg_tblspc.
3132  * Other symlinks are presumed to point at files we're not responsible
3133  * for fsyncing, and might not have privileges to write at all.
3134  *
3135  * Errors are logged but not considered fatal; that's because this is used
3136  * only during database startup, to deal with the possibility that there are
3137  * issued-but-unsynced writes pending against the data directory.  We want to
3138  * ensure that such writes reach disk before anything that's done in the new
3139  * run.  However, aborting on error would result in failure to start for
3140  * harmless cases such as read-only files in the data directory, and that's
3141  * not good either.
3142  *
3143  * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3144  * rewriting all changes again during recovery.
3145  *
3146  * Note we assume we're chdir'd into PGDATA to begin with.
3147  */
3148 void
SyncDataDirectory(void)3149 SyncDataDirectory(void)
3150 {
3151 	bool		xlog_is_symlink;
3152 
3153 	/* We can skip this whole thing if fsync is disabled. */
3154 	if (!enableFsync)
3155 		return;
3156 
3157 	/*
3158 	 * If pg_wal is a symlink, we'll need to recurse into it separately,
3159 	 * because the first walkdir below will ignore it.
3160 	 */
3161 	xlog_is_symlink = false;
3162 
3163 #ifndef WIN32
3164 	{
3165 		struct stat st;
3166 
3167 		if (lstat("pg_wal", &st) < 0)
3168 			ereport(LOG,
3169 					(errcode_for_file_access(),
3170 					 errmsg("could not stat file \"%s\": %m",
3171 							"pg_wal")));
3172 		else if (S_ISLNK(st.st_mode))
3173 			xlog_is_symlink = true;
3174 	}
3175 #else
3176 	if (pgwin32_is_junction("pg_wal"))
3177 		xlog_is_symlink = true;
3178 #endif
3179 
3180 	/*
3181 	 * If possible, hint to the kernel that we're soon going to fsync the data
3182 	 * directory and its contents.  Errors in this step are even less
3183 	 * interesting than normal, so log them only at DEBUG1.
3184 	 */
3185 #ifdef PG_FLUSH_DATA_WORKS
3186 	walkdir(".", pre_sync_fname, false, DEBUG1);
3187 	if (xlog_is_symlink)
3188 		walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3189 	walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
3190 #endif
3191 
3192 	/*
3193 	 * Now we do the fsync()s in the same order.
3194 	 *
3195 	 * The main call ignores symlinks, so in addition to specially processing
3196 	 * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3197 	 * process_symlinks = true.  Note that if there are any plain directories
3198 	 * in pg_tblspc, they'll get fsync'd twice.  That's not an expected case
3199 	 * so we don't worry about optimizing it.
3200 	 */
3201 	walkdir(".", datadir_fsync_fname, false, LOG);
3202 	if (xlog_is_symlink)
3203 		walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3204 	walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
3205 }
3206 
3207 /*
3208  * walkdir: recursively walk a directory, applying the action to each
3209  * regular file and directory (including the named directory itself).
3210  *
3211  * If process_symlinks is true, the action and recursion are also applied
3212  * to regular files and directories that are pointed to by symlinks in the
3213  * given directory; otherwise symlinks are ignored.  Symlinks are always
3214  * ignored in subdirectories, ie we intentionally don't pass down the
3215  * process_symlinks flag to recursive calls.
3216  *
3217  * Errors are reported at level elevel, which might be ERROR or less.
3218  *
3219  * See also walkdir in initdb.c, which is a frontend version of this logic.
3220  */
3221 static void
walkdir(const char * path,void (* action)(const char * fname,bool isdir,int elevel),bool process_symlinks,int elevel)3222 walkdir(const char *path,
3223 		void (*action) (const char *fname, bool isdir, int elevel),
3224 		bool process_symlinks,
3225 		int elevel)
3226 {
3227 	DIR		   *dir;
3228 	struct dirent *de;
3229 
3230 	dir = AllocateDir(path);
3231 
3232 	while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3233 	{
3234 		char		subpath[MAXPGPATH * 2];
3235 		struct stat fst;
3236 		int			sret;
3237 
3238 		CHECK_FOR_INTERRUPTS();
3239 
3240 		if (strcmp(de->d_name, ".") == 0 ||
3241 			strcmp(de->d_name, "..") == 0)
3242 			continue;
3243 
3244 		snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3245 
3246 		if (process_symlinks)
3247 			sret = stat(subpath, &fst);
3248 		else
3249 			sret = lstat(subpath, &fst);
3250 
3251 		if (sret < 0)
3252 		{
3253 			ereport(elevel,
3254 					(errcode_for_file_access(),
3255 					 errmsg("could not stat file \"%s\": %m", subpath)));
3256 			continue;
3257 		}
3258 
3259 		if (S_ISREG(fst.st_mode))
3260 			(*action) (subpath, false, elevel);
3261 		else if (S_ISDIR(fst.st_mode))
3262 			walkdir(subpath, action, false, elevel);
3263 	}
3264 
3265 	FreeDir(dir);				/* we ignore any error here */
3266 
3267 	/*
3268 	 * It's important to fsync the destination directory itself as individual
3269 	 * file fsyncs don't guarantee that the directory entry for the file is
3270 	 * synced.  However, skip this if AllocateDir failed; the action function
3271 	 * might not be robust against that.
3272 	 */
3273 	if (dir)
3274 		(*action) (path, true, elevel);
3275 }
3276 
3277 
/*
 * Hint to the OS that it should get ready to fsync() this file.
 *
 * Directories are skipped.  A file that cannot be opened because of EACCES
 * is skipped silently; other open or close failures are reported at the
 * caller-specified level.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Don't try to flush directories, it'll likely just fail */
	if (isdir)
		return;

	fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);

	if (fd < 0)
	{
		/* unreadable files are not worth a message; anything else is */
		if (errno != EACCES)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * pg_flush_data() ignores errors, which is ok because this is only a
	 * hint.
	 */
	pg_flush_data(fd, 0, 0);

	if (CloseTransientFile(fd) != 0)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", fname)));
}

#endif							/* PG_FLUSH_DATA_WORKS */
3320 
/*
 * fsync one file or directory; SyncDataDirectory passes this to walkdir as
 * the action for its fsync pass.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	/*
	 * We want to silently ignore errors about unreadable files, so pass
	 * ignore_perm = true to fsync_fname_ext().  Its result is not needed
	 * here.
	 */
	(void) fsync_fname_ext(fname, isdir, true, elevel);
}
3330 
/*
 * Remove one file or directory; the signature matches what walkdir expects
 * of its action argument.
 *
 * For a directory, rmdir() is used and a failure other than ENOENT is
 * reported at elevel.  For anything else the removal is delegated to
 * PathNameDeleteTemporaryFile, and elevel is not used.
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (!isdir)
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
		return;
	}

	/* A directory that is already gone is not an error. */
	if (rmdir(fname) == 0 || errno == ENOENT)
		return;

	ereport(elevel,
			(errcode_for_file_access(),
			 errmsg("could not remove directory \"%s\": %m", fname)));
}
3347 
3348 /*
3349  * fsync_fname_ext -- Try to fsync a file or directory
3350  *
3351  * If ignore_perm is true, ignore errors upon trying to open unreadable
3352  * files. Logs other errors at a caller-specified level.
3353  *
3354  * Returns 0 if the operation succeeded, -1 otherwise.
3355  */
3356 static int
fsync_fname_ext(const char * fname,bool isdir,bool ignore_perm,int elevel)3357 fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3358 {
3359 	int			fd;
3360 	int			flags;
3361 	int			returncode;
3362 
3363 	/*
3364 	 * Some OSs require directories to be opened read-only whereas other
3365 	 * systems don't allow us to fsync files opened read-only; so we need both
3366 	 * cases here.  Using O_RDWR will cause us to fail to fsync files that are
3367 	 * not writable by our userid, but we assume that's OK.
3368 	 */
3369 	flags = PG_BINARY;
3370 	if (!isdir)
3371 		flags |= O_RDWR;
3372 	else
3373 		flags |= O_RDONLY;
3374 
3375 	fd = OpenTransientFile(fname, flags);
3376 
3377 	/*
3378 	 * Some OSs don't allow us to open directories at all (Windows returns
3379 	 * EACCES), just ignore the error in that case.  If desired also silently
3380 	 * ignoring errors about unreadable files. Log others.
3381 	 */
3382 	if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3383 		return 0;
3384 	else if (fd < 0 && ignore_perm && errno == EACCES)
3385 		return 0;
3386 	else if (fd < 0)
3387 	{
3388 		ereport(elevel,
3389 				(errcode_for_file_access(),
3390 				 errmsg("could not open file \"%s\": %m", fname)));
3391 		return -1;
3392 	}
3393 
3394 	returncode = pg_fsync(fd);
3395 
3396 	/*
3397 	 * Some OSes don't allow us to fsync directories at all, so we can ignore
3398 	 * those errors. Anything else needs to be logged.
3399 	 */
3400 	if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3401 	{
3402 		int			save_errno;
3403 
3404 		/* close file upon error, might not be in transaction context */
3405 		save_errno = errno;
3406 		(void) CloseTransientFile(fd);
3407 		errno = save_errno;
3408 
3409 		ereport(elevel,
3410 				(errcode_for_file_access(),
3411 				 errmsg("could not fsync file \"%s\": %m", fname)));
3412 		return -1;
3413 	}
3414 
3415 	if (CloseTransientFile(fd))
3416 	{
3417 		ereport(elevel,
3418 				(errcode_for_file_access(),
3419 				 errmsg("could not close file \"%s\": %m", fname)));
3420 		return -1;
3421 	}
3422 
3423 	return 0;
3424 }
3425 
3426 /*
3427  * fsync_parent_path -- fsync the parent path of a file or directory
3428  *
3429  * This is aimed at making file operations persistent on disk in case of
3430  * an OS crash or power failure.
3431  */
3432 static int
fsync_parent_path(const char * fname,int elevel)3433 fsync_parent_path(const char *fname, int elevel)
3434 {
3435 	char		parentpath[MAXPGPATH];
3436 
3437 	strlcpy(parentpath, fname, MAXPGPATH);
3438 	get_parent_directory(parentpath);
3439 
3440 	/*
3441 	 * get_parent_directory() returns an empty string if the input argument is
3442 	 * just a file name (see comments in path.c), so handle that as being the
3443 	 * current directory.
3444 	 */
3445 	if (strlen(parentpath) == 0)
3446 		strlcpy(parentpath, ".", MAXPGPATH);
3447 
3448 	if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3449 		return -1;
3450 
3451 	return 0;
3452 }
3453 
3454 /*
3455  * Create a PostgreSQL data sub-directory
3456  *
3457  * The data directory itself, and most of its sub-directories, are created at
3458  * initdb time, but we do have some occasions when we create directories in
3459  * the backend (CREATE TABLESPACE, for example).  In those cases, we want to
3460  * make sure that those directories are created consistently.  Today, that means
3461  * making sure that the created directory has the correct permissions, which is
3462  * what pg_dir_create_mode tracks for us.
3463  *
3464  * Note that we also set the umask() based on what we understand the correct
3465  * permissions to be (see file_perm.c).
3466  *
3467  * For permissions other than the default, mkdir() can be used directly, but
3468  * be sure to consider carefully such cases -- a sub-directory with incorrect
3469  * permissions in a PostgreSQL data directory could cause backups and other
3470  * processes to fail.
3471  */
3472 int
MakePGDirectory(const char * directoryName)3473 MakePGDirectory(const char *directoryName)
3474 {
3475 	return mkdir(directoryName, pg_dir_create_mode);
3476 }
3477 
3478 /*
3479  * Return the passed-in error level, or PANIC if data_sync_retry is off.
3480  *
3481  * Failure to fsync any data file is cause for immediate panic, unless
3482  * data_sync_retry is enabled.  Data may have been written to the operating
3483  * system and removed from our buffer pool already, and if we are running on
3484  * an operating system that forgets dirty data on write-back failure, there
3485  * may be only one copy of the data remaining: in the WAL.  A later attempt to
3486  * fsync again might falsely report success.  Therefore we must not allow any
3487  * further checkpoints to be attempted.  data_sync_retry can in theory be
3488  * enabled on systems known not to drop dirty buffered data on write-back
3489  * failure (with the likely outcome that checkpoints will continue to fail
3490  * until the underlying problem is fixed).
3491  *
3492  * Any code that reports a failure from fsync() or related functions should
3493  * filter the error level with this function.
3494  */
3495 int
data_sync_elevel(int elevel)3496 data_sync_elevel(int elevel)
3497 {
3498 	return data_sync_retry ? elevel : PANIC;
3499 }
3500