1 /*-------------------------------------------------------------------------
2  *
3  * fd.c
4  *	  Virtual file descriptor code.
5  *
6  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  *	  src/backend/storage/file/fd.c
11  *
12  * NOTES:
13  *
14  * This code manages a cache of 'virtual' file descriptors (VFDs).
15  * The server opens many file descriptors for a variety of reasons,
16  * including base tables, scratch files (e.g., sort and hash spool
17  * files), and random calls to C library routines like system(3); it
18  * is quite easy to exceed system limits on the number of open files a
19  * single process can have.  (This is around 256 on many modern
20  * operating systems, but can be as low as 32 on others.)
21  *
22  * VFDs are managed as an LRU pool, with actual OS file descriptors
 * being opened and closed as needed.  Obviously, if a file is
 * opened using these interfaces, all subsequent operations must also
25  * be through these interfaces (the File type is not a real file
26  * descriptor).
27  *
28  * For this scheme to work, most (if not all) routines throughout the
29  * server should use these interfaces instead of calling the C library
30  * routines (e.g., open(2) and fopen(3)) themselves.  Otherwise, we
31  * may find ourselves short of real file descriptors anyway.
32  *
33  * INTERFACE ROUTINES
34  *
35  * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36  * A File opened with OpenTemporaryFile is automatically deleted when the
37  * File is closed, either explicitly or implicitly at end of transaction or
38  * process exit. PathNameOpenFile is intended for files that are held open
 * for a long time, like relation files. It is the caller's responsibility
 * to close them; there is no automatic mechanism in fd.c for that.
41  *
42  * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43  * temporary files that have names so that they can be shared between
44  * backends.  Such files are automatically closed and count against the
45  * temporary file limit of the backend that creates them, but unlike anonymous
46  * files they are not automatically deleted.  See sharedfileset.c for a shared
47  * ownership mechanism that provides automatic cleanup for shared files when
48  * the last of a group of backends detaches.
49  *
50  * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51  * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52  * They behave like the corresponding native functions, except that the handle
53  * is registered with the current subtransaction, and will be automatically
54  * closed at abort. These are intended mainly for short operations like
55  * reading a configuration file; there is a limit on the number of files that
56  * can be opened using these functions at any one time.
57  *
58  * Finally, BasicOpenFile is just a thin wrapper around open() that can
59  * release file descriptors in use by the virtual file descriptors if
 * necessary. There is no automatic cleanup of file descriptors returned by
 * BasicOpenFile; it is solely the caller's responsibility to close the file
62  * descriptor by calling close(2).
63  *
64  *-------------------------------------------------------------------------
65  */
66 
67 #include "postgres.h"
68 
69 #include <sys/file.h>
70 #include <sys/param.h>
71 #include <sys/stat.h>
72 #ifndef WIN32
73 #include <sys/mman.h>
74 #endif
75 #include <limits.h>
76 #include <unistd.h>
77 #include <fcntl.h>
78 #ifdef HAVE_SYS_RESOURCE_H
79 #include <sys/resource.h>		/* for getrlimit */
80 #endif
81 
82 #include "miscadmin.h"
83 #include "access/xact.h"
84 #include "access/xlog.h"
85 #include "catalog/pg_tablespace.h"
86 #include "common/file_perm.h"
87 #include "pgstat.h"
88 #include "portability/mem.h"
89 #include "storage/fd.h"
90 #include "storage/ipc.h"
91 #include "utils/guc.h"
92 #include "utils/resowner_private.h"
93 
94 
95 /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
96 #if defined(HAVE_SYNC_FILE_RANGE)
97 #define PG_FLUSH_DATA_WORKS 1
98 #elif !defined(WIN32) && defined(MS_ASYNC)
99 #define PG_FLUSH_DATA_WORKS 1
100 #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
101 #define PG_FLUSH_DATA_WORKS 1
102 #endif
103 
104 /*
105  * We must leave some file descriptors free for system(), the dynamic loader,
106  * and other code that tries to open files without consulting fd.c.  This
107  * is the number left free.  (While we can be pretty sure we won't get
108  * EMFILE, there's never any guarantee that we won't get ENFILE due to
109  * other processes chewing up FDs.  So it's a bad idea to try to open files
110  * without consulting fd.c.  Nonetheless we cannot control all code.)
111  *
112  * Because this is just a fixed setting, we are effectively assuming that
113  * no such code will leave FDs open over the long term; otherwise the slop
114  * is likely to be insufficient.  Note in particular that we expect that
115  * loading a shared library does not result in any permanent increase in
116  * the number of open files.  (This appears to be true on most if not
117  * all platforms as of Feb 2004.)
118  */
119 #define NUM_RESERVED_FDS		10
120 
121 /*
122  * If we have fewer than this many usable FDs after allowing for the reserved
123  * ones, choke.
124  */
125 #define FD_MINFREE				10
126 
127 /*
128  * A number of platforms allow individual processes to open many more files
129  * than they can really support when *many* processes do the same thing.
130  * This GUC parameter lets the DBA limit max_safe_fds to something less than
131  * what the postmaster's initial probe suggests will work.
132  */
133 int			max_files_per_process = 1000;
134 
135 /*
136  * Maximum number of file descriptors to open for either VFD entries or
137  * AllocateFile/AllocateDir/OpenTransientFile operations.  This is initialized
138  * to a conservative value, and remains that way indefinitely in bootstrap or
139  * standalone-backend cases.  In normal postmaster operation, the postmaster
140  * calls set_max_safe_fds() late in initialization to update the value, and
141  * that value is then inherited by forked subprocesses.
142  *
143  * Note: the value of max_files_per_process is taken into account while
144  * setting this variable, and so need not be tested separately.
145  */
146 int			max_safe_fds = 32;	/* default if not changed */
147 
148 /* Whether it is safe to continue running after fsync() fails. */
149 bool		data_sync_retry = false;
150 
151 /* Debugging.... */
152 
153 #ifdef FDDEBUG
154 #define DO_DB(A) \
155 	do { \
156 		int			_do_db_save_errno = errno; \
157 		A; \
158 		errno = _do_db_save_errno; \
159 	} while (0)
160 #else
161 #define DO_DB(A) \
162 	((void) 0)
163 #endif
164 
165 #define VFD_CLOSED (-1)
166 
167 #define FileIsValid(file) \
168 	((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
169 
170 #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
171 
172 /*
173  * Note: a VFD's seekPos is normally always valid, but if for some reason
174  * an lseek() fails, it might become set to FileUnknownPos.  We can struggle
175  * along without knowing the seek position in many cases, but in some places
176  * we have to fail if we don't have it.
177  */
178 #define FileUnknownPos ((off_t) -1)
179 #define FilePosIsUnknown(pos) ((pos) < 0)
180 
181 /* these are the assigned bits in fdstate below: */
182 #define FD_DELETE_AT_CLOSE	(1 << 0)	/* T = delete when closed */
183 #define FD_CLOSE_AT_EOXACT	(1 << 1)	/* T = close at eoXact */
184 #define FD_TEMP_FILE_LIMIT	(1 << 2)	/* T = respect temp_file_limit */
185 
/*
 * One slot of the virtual file descriptor table (VfdCache).  A slot is in
 * use when fileName is non-NULL, and additionally holds a real kernel
 * descriptor when fd is not VFD_CLOSED.
 */
typedef struct vfd
{
	int			fd;				/* current FD, or VFD_CLOSED if none */
	unsigned short fdstate;		/* bitflags for VFD's state (FD_* bits above) */
	ResourceOwner resowner;		/* owner, for automatic cleanup */
	File		nextFree;		/* link to next free VFD, if in freelist */
	File		lruMoreRecently;	/* doubly linked recency-of-use list */
	File		lruLessRecently;	/* other direction of the same list */
	off_t		seekPos;		/* current logical file position, or -1
								 * (FileUnknownPos) if not known */
	off_t		fileSize;		/* current size of file (0 if not temporary) */
	char	   *fileName;		/* name of file, or NULL for unused VFD */
	/* NB: fileName is malloc'd, and must be free'd when closing the VFD */
	int			fileFlags;		/* open(2) flags for (re)opening the file */
	mode_t		fileMode;		/* mode to pass to open(2) */
} Vfd;
201 
202 /*
203  * Virtual File Descriptor array pointer and size.  This grows as
204  * needed.  'File' values are indexes into this array.
205  * Note that VfdCache[0] is not a usable VFD, just a list header.
206  */
207 static Vfd *VfdCache;
208 static Size SizeVfdCache = 0;
209 
210 /*
211  * Number of file descriptors known to be in use by VFD entries.
212  */
213 static int	nfile = 0;
214 
215 /*
216  * Flag to tell whether it's worth scanning VfdCache looking for temp files
217  * to close
218  */
219 static bool have_xact_temporary_files = false;
220 
221 /*
222  * Tracks the total size of all temporary files.  Note: when temp_file_limit
223  * is being enforced, this cannot overflow since the limit cannot be more
224  * than INT_MAX kilobytes.  When not enforcing, it could theoretically
225  * overflow, but we don't care.
226  */
227 static uint64 temporary_files_size = 0;
228 
229 /*
230  * List of OS handles opened with AllocateFile, AllocateDir and
231  * OpenTransientFile.
232  */
/*
 * Discriminator for AllocateDesc entries: says what sort of OS handle an
 * entry holds.
 */
typedef enum
{
	AllocateDescFile,			/* presumably a FILE * from AllocateFile */
	AllocateDescPipe,			/* presumably a FILE * from OpenPipeStream */
	AllocateDescDir,			/* presumably a DIR * from AllocateDir */
	AllocateDescRawFD			/* presumably a plain FD from
								 * OpenTransientFile */
} AllocateDescKind;
240 
/*
 * One tracked OS handle.  'kind' tells which member of 'desc' is meaningful.
 */
typedef struct
{
	AllocateDescKind kind;		/* which sort of handle this entry holds */
	SubTransactionId create_subid;	/* NOTE(review): looks like the
									 * subtransaction the handle was opened
									 * in -- confirm against the code that
									 * fills it in */
	union
	{
		FILE	   *file;		/* stdio stream handle */
		DIR		   *dir;		/* directory stream handle */
		int			fd;			/* bare kernel file descriptor */
	}			desc;
} AllocateDesc;
252 
253 static int	numAllocatedDescs = 0;
254 static int	maxAllocatedDescs = 0;
255 static AllocateDesc *allocatedDescs = NULL;
256 
257 /*
258  * Number of temporary files opened during the current session;
259  * this is used in generation of tempfile names.
260  */
261 static long tempFileCounter = 0;
262 
263 /*
264  * Array of OIDs of temp tablespaces.  (Some entries may be InvalidOid,
265  * indicating that the current database's default tablespace should be used.)
266  * When numTempTableSpaces is -1, this has not been set in the current
267  * transaction.
268  */
269 static Oid *tempTableSpaces = NULL;
270 static int	numTempTableSpaces = -1;
271 static int	nextTempTableSpace = 0;
272 
273 
274 /*--------------------
275  *
276  * Private Routines
277  *
278  * Delete		   - delete a file from the Lru ring
279  * LruDelete	   - remove a file from the Lru ring and close its FD
280  * Insert		   - put a file at the front of the Lru ring
281  * LruInsert	   - put a file at the front of the Lru ring and open it
282  * ReleaseLruFile  - Release an fd by closing the last entry in the Lru ring
283  * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
 * AllocateVfd	   - grab a free (or new) file record (from VfdCache)
285  * FreeVfd		   - free a file record
286  *
287  * The Least Recently Used ring is a doubly linked list that begins and
288  * ends on element zero.  Element zero is special -- it doesn't represent
289  * a file and its "fd" field always == VFD_CLOSED.  Element zero is just an
290  * anchor that shows us the beginning/end of the ring.
291  * Only VFD elements that are currently really open (have an FD assigned) are
292  * in the Lru ring.  Elements that are "virtually" open can be recognized
293  * by having a non-null fileName field.
294  *
295  * example:
296  *
297  *	   /--less----\				   /---------\
298  *	   v		   \			  v			  \
299  *	 #0 --more---> LeastRecentlyUsed --more-\ \
300  *	  ^\									| |
301  *	   \\less--> MostRecentlyUsedFile	<---/ |
302  *		\more---/					 \--less--/
303  *
304  *--------------------
305  */
306 static void Delete(File file);
307 static void LruDelete(File file);
308 static void Insert(File file);
309 static int	LruInsert(File file);
310 static bool ReleaseLruFile(void);
311 static void ReleaseLruFiles(void);
312 static File AllocateVfd(void);
313 static void FreeVfd(File file);
314 
315 static int	FileAccess(File file);
316 static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
317 static bool reserveAllocatedDesc(void);
318 static int	FreeDesc(AllocateDesc *desc);
319 
320 static void AtProcExit_Files(int code, Datum arg);
321 static void CleanupTempFiles(bool isCommit, bool isProcExit);
322 static void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok,
323 					   bool unlink_all);
324 static void RemovePgTempRelationFiles(const char *tsdirname);
325 static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
326 
327 static void walkdir(const char *path,
328 		void (*action) (const char *fname, bool isdir, int elevel),
329 		bool process_symlinks,
330 		int elevel);
331 #ifdef PG_FLUSH_DATA_WORKS
332 static void pre_sync_fname(const char *fname, bool isdir, int elevel);
333 #endif
334 static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
335 static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
336 
337 static int	fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
338 static int	fsync_parent_path(const char *fname, int elevel);
339 
340 
/*
 * pg_fsync --- flush a file descriptor, with or without writethrough
 *
 * When the platform offers a writethrough fsync that is distinct from plain
 * fsync and sync_method asks for it, hand off to pg_fsync_writethrough();
 * in every other case hand off to pg_fsync_no_writethrough().  The result of
 * the chosen routine is returned unchanged.
 */
int
pg_fsync(int fd)
{
	/* the sync_method test is compiled out where it could never be true */
#if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
	if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
		return pg_fsync_writethrough(fd);
#endif

	return pg_fsync_no_writethrough(fd);
}
355 
356 
357 /*
358  * pg_fsync_no_writethrough --- same as fsync except does nothing if
359  *	enableFsync is off
360  */
361 int
pg_fsync_no_writethrough(int fd)362 pg_fsync_no_writethrough(int fd)
363 {
364 	if (enableFsync)
365 		return fsync(fd);
366 	else
367 		return 0;
368 }
369 
370 /*
371  * pg_fsync_writethrough
372  */
373 int
pg_fsync_writethrough(int fd)374 pg_fsync_writethrough(int fd)
375 {
376 	if (enableFsync)
377 	{
378 #ifdef WIN32
379 		return _commit(fd);
380 #elif defined(F_FULLFSYNC)
381 		return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
382 #else
383 		errno = ENOSYS;
384 		return -1;
385 #endif
386 	}
387 	else
388 		return 0;
389 }
390 
391 /*
392  * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
393  *
394  * Not all platforms have fdatasync; treat as fsync if not available.
395  */
396 int
pg_fdatasync(int fd)397 pg_fdatasync(int fd)
398 {
399 	if (enableFsync)
400 	{
401 #ifdef HAVE_FDATASYNC
402 		return fdatasync(fd);
403 #else
404 		return fsync(fd);
405 #endif
406 	}
407 	else
408 		return 0;
409 }
410 
411 /*
412  * pg_flush_data --- advise OS that the described dirty data should be flushed
413  *
414  * offset of 0 with nbytes 0 means that the entire file should be flushed;
415  * in this case, this function may have side-effects on the file's
416  * seek position!
417  */
418 void
pg_flush_data(int fd,off_t offset,off_t nbytes)419 pg_flush_data(int fd, off_t offset, off_t nbytes)
420 {
421 	/*
422 	 * Right now file flushing is primarily used to avoid making later
423 	 * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
424 	 * if fsyncs are disabled - that's a decision we might want to make
425 	 * configurable at some point.
426 	 */
427 	if (!enableFsync)
428 		return;
429 
430 	/*
431 	 * We compile all alternatives that are supported on the current platform,
432 	 * to find portability problems more easily.
433 	 */
434 #if defined(HAVE_SYNC_FILE_RANGE)
435 	{
436 		int			rc;
437 		static bool not_implemented_by_kernel = false;
438 
439 		if (not_implemented_by_kernel)
440 			return;
441 
442 		/*
443 		 * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
444 		 * tells the OS that writeback for the specified blocks should be
445 		 * started, but that we don't want to wait for completion.  Note that
446 		 * this call might block if too much dirty data exists in the range.
447 		 * This is the preferable method on OSs supporting it, as it works
448 		 * reliably when available (contrast to msync()) and doesn't flush out
449 		 * clean data (like FADV_DONTNEED).
450 		 */
451 		rc = sync_file_range(fd, offset, nbytes,
452 							 SYNC_FILE_RANGE_WRITE);
453 		if (rc != 0)
454 		{
455 			int			elevel;
456 
457 			/*
458 			 * For systems that don't have an implementation of
459 			 * sync_file_range() such as Windows WSL, generate only one
460 			 * warning and then suppress all further attempts by this process.
461 			 */
462 			if (errno == ENOSYS)
463 			{
464 				elevel = WARNING;
465 				not_implemented_by_kernel = true;
466 			}
467 			else
468 				elevel = data_sync_elevel(WARNING);
469 
470 			ereport(elevel,
471 					(errcode_for_file_access(),
472 					 errmsg("could not flush dirty data: %m")));
473 		}
474 
475 		return;
476 	}
477 #endif
478 #if !defined(WIN32) && defined(MS_ASYNC)
479 	{
480 		void	   *p;
481 		static int	pagesize = 0;
482 
483 		/*
484 		 * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
485 		 * writeback. On linux it only does so if MS_SYNC is specified, but
486 		 * then it does the writeback synchronously. Luckily all common linux
487 		 * systems have sync_file_range().  This is preferable over
488 		 * FADV_DONTNEED because it doesn't flush out clean data.
489 		 *
490 		 * We map the file (mmap()), tell the kernel to sync back the contents
491 		 * (msync()), and then remove the mapping again (munmap()).
492 		 */
493 
494 		/* mmap() needs actual length if we want to map whole file */
495 		if (offset == 0 && nbytes == 0)
496 		{
497 			nbytes = lseek(fd, 0, SEEK_END);
498 			if (nbytes < 0)
499 			{
500 				ereport(WARNING,
501 						(errcode_for_file_access(),
502 						 errmsg("could not determine dirty data size: %m")));
503 				return;
504 			}
505 		}
506 
507 		/*
508 		 * Some platforms reject partial-page mmap() attempts.  To deal with
509 		 * that, just truncate the request to a page boundary.  If any extra
510 		 * bytes don't get flushed, well, it's only a hint anyway.
511 		 */
512 
513 		/* fetch pagesize only once */
514 		if (pagesize == 0)
515 			pagesize = sysconf(_SC_PAGESIZE);
516 
517 		/* align length to pagesize, dropping any fractional page */
518 		if (pagesize > 0)
519 			nbytes = (nbytes / pagesize) * pagesize;
520 
521 		/* fractional-page request is a no-op */
522 		if (nbytes <= 0)
523 			return;
524 
525 		/*
526 		 * mmap could well fail, particularly on 32-bit platforms where there
527 		 * may simply not be enough address space.  If so, silently fall
528 		 * through to the next implementation.
529 		 */
530 		if (nbytes <= (off_t) SSIZE_MAX)
531 			p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
532 		else
533 			p = MAP_FAILED;
534 
535 		if (p != MAP_FAILED)
536 		{
537 			int			rc;
538 
539 			rc = msync(p, (size_t) nbytes, MS_ASYNC);
540 			if (rc != 0)
541 			{
542 				ereport(data_sync_elevel(WARNING),
543 						(errcode_for_file_access(),
544 						 errmsg("could not flush dirty data: %m")));
545 				/* NB: need to fall through to munmap()! */
546 			}
547 
548 			rc = munmap(p, (size_t) nbytes);
549 			if (rc != 0)
550 			{
551 				/* FATAL error because mapping would remain */
552 				ereport(FATAL,
553 						(errcode_for_file_access(),
554 						 errmsg("could not munmap() while flushing data: %m")));
555 			}
556 
557 			return;
558 		}
559 	}
560 #endif
561 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
562 	{
563 		int			rc;
564 
565 		/*
566 		 * Signal the kernel that the passed in range should not be cached
567 		 * anymore. This has the, desired, side effect of writing out dirty
568 		 * data, and the, undesired, side effect of likely discarding useful
569 		 * clean cached blocks.  For the latter reason this is the least
570 		 * preferable method.
571 		 */
572 
573 		rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
574 
575 		if (rc != 0)
576 		{
577 			/* don't error out, this is just a performance optimization */
578 			ereport(WARNING,
579 					(errcode_for_file_access(),
580 					 errmsg("could not flush dirty data: %m")));
581 		}
582 
583 		return;
584 	}
585 #endif
586 }
587 
588 
589 /*
590  * fsync_fname -- fsync a file or directory, handling errors properly
591  *
592  * Try to fsync a file or directory. When doing the latter, ignore errors that
593  * indicate the OS just doesn't allow/require fsyncing directories.
594  */
595 void
fsync_fname(const char * fname,bool isdir)596 fsync_fname(const char *fname, bool isdir)
597 {
598 	fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
599 }
600 
601 /*
602  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
603  *
604  * This routine ensures that, after returning, the effect of renaming file
605  * persists in case of a crash. A crash while this routine is running will
606  * leave you with either the pre-existing or the moved file in place of the
607  * new file; no mixed state or truncated files are possible.
608  *
609  * It does so by using fsync on the old filename and the possibly existing
610  * target filename before the rename, and the target file and directory after.
611  *
612  * Note that rename() cannot be used across arbitrary directories, as they
613  * might not be on the same filesystem. Therefore this routine does not
614  * support renaming across directories.
615  *
616  * Log errors with the caller specified severity.
617  *
618  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
619  * valid upon return.
620  */
621 int
durable_rename(const char * oldfile,const char * newfile,int elevel)622 durable_rename(const char *oldfile, const char *newfile, int elevel)
623 {
624 	int			fd;
625 
626 	/*
627 	 * First fsync the old and target path (if it exists), to ensure that they
628 	 * are properly persistent on disk. Syncing the target file is not
629 	 * strictly necessary, but it makes it easier to reason about crashes;
630 	 * because it's then guaranteed that either source or target file exists
631 	 * after a crash.
632 	 */
633 	if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
634 		return -1;
635 
636 	fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
637 	if (fd < 0)
638 	{
639 		if (errno != ENOENT)
640 		{
641 			ereport(elevel,
642 					(errcode_for_file_access(),
643 					 errmsg("could not open file \"%s\": %m", newfile)));
644 			return -1;
645 		}
646 	}
647 	else
648 	{
649 		if (pg_fsync(fd) != 0)
650 		{
651 			int			save_errno;
652 
653 			/* close file upon error, might not be in transaction context */
654 			save_errno = errno;
655 			CloseTransientFile(fd);
656 			errno = save_errno;
657 
658 			ereport(elevel,
659 					(errcode_for_file_access(),
660 					 errmsg("could not fsync file \"%s\": %m", newfile)));
661 			return -1;
662 		}
663 		CloseTransientFile(fd);
664 	}
665 
666 	/* Time to do the real deal... */
667 	if (rename(oldfile, newfile) < 0)
668 	{
669 		ereport(elevel,
670 				(errcode_for_file_access(),
671 				 errmsg("could not rename file \"%s\" to \"%s\": %m",
672 						oldfile, newfile)));
673 		return -1;
674 	}
675 
676 	/*
677 	 * To guarantee renaming the file is persistent, fsync the file with its
678 	 * new name, and its containing directory.
679 	 */
680 	if (fsync_fname_ext(newfile, false, false, elevel) != 0)
681 		return -1;
682 
683 	if (fsync_parent_path(newfile, elevel) != 0)
684 		return -1;
685 
686 	return 0;
687 }
688 
/*
 * durable_unlink -- remove a file in a durable manner
 *
 * After this returns successfully, the removal of fname persists across a
 * crash; a crash while the routine is running leaves the system in no mixed
 * state.  That is achieved by fsyncing the file's parent directory once the
 * unlink itself has been done.
 *
 * Errors are logged with the severity given in elevel.
 *
 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
 * valid upon return.
 */
int
durable_unlink(const char *fname, int elevel)
{
	int			rc = unlink(fname);

	if (rc < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove file \"%s\": %m",
						fname)));
		return -1;
	}

	/* the directory entry is gone; make that fact durable as well */
	return (fsync_parent_path(fname, elevel) != 0) ? -1 : 0;
}
725 
/*
 * durable_link_or_rename -- rename a file in a durable manner.
 *
 * Works like durable_rename(), but tries (without guaranteeing it) not to
 * overwrite the target file: where HAVE_WORKING_LINK is set the new name is
 * created with link() and the old name removed afterwards, otherwise a plain
 * rename() is used.
 *
 * Note that a crash in an unfortunate moment can leave you with two links to
 * the target file.
 *
 * Errors are logged with the caller specified severity (elevel).
 *
 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
 * valid upon return.
 */
int
durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
{
	/*
	 * The source must be safely on disk first, so that a crash right after
	 * the rename/link leaves a file with valid contents in place.
	 */
	if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
		return -1;

#if HAVE_WORKING_LINK
	if (link(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not link file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}
	/* drop the old name; its result is deliberately not examined */
	unlink(oldfile);
#else
	/* XXX: Add racy file existence check? */
	if (rename(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}
#endif

	/*
	 * To survive an OS crash, flush the new entry first and then the
	 * directory that contains it.
	 */
	if (fsync_fname_ext(newfile, false, false, elevel) != 0 ||
		fsync_parent_path(newfile, elevel) != 0)
		return -1;

	return 0;
}
785 
786 /*
787  * InitFileAccess --- initialize this module during backend startup
788  *
789  * This is called during either normal or standalone backend start.
790  * It is *not* called in the postmaster.
791  */
792 void
InitFileAccess(void)793 InitFileAccess(void)
794 {
795 	Assert(SizeVfdCache == 0);	/* call me only once */
796 
797 	/* initialize cache header entry */
798 	VfdCache = (Vfd *) malloc(sizeof(Vfd));
799 	if (VfdCache == NULL)
800 		ereport(FATAL,
801 				(errcode(ERRCODE_OUT_OF_MEMORY),
802 				 errmsg("out of memory")));
803 
804 	MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
805 	VfdCache->fd = VFD_CLOSED;
806 
807 	SizeVfdCache = 1;
808 
809 	/* register proc-exit hook to ensure temp files are dropped at exit */
810 	on_proc_exit(AtProcExit_Files, 0);
811 }
812 
813 /*
814  * count_usable_fds --- count how many FDs the system will let us open,
815  *		and estimate how many are already open.
816  *
817  * We stop counting if usable_fds reaches max_to_probe.  Note: a small
818  * value of max_to_probe might result in an underestimate of already_open;
819  * we must fill in any "gaps" in the set of used FDs before the calculation
820  * of already_open will give the right answer.  In practice, max_to_probe
821  * of a couple of dozen should be enough to ensure good results.
822  *
823  * We assume stdin (FD 0) is available for dup'ing
824  */
825 static void
count_usable_fds(int max_to_probe,int * usable_fds,int * already_open)826 count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
827 {
828 	int		   *fd;
829 	int			size;
830 	int			used = 0;
831 	int			highestfd = 0;
832 	int			j;
833 
834 #ifdef HAVE_GETRLIMIT
835 	struct rlimit rlim;
836 	int			getrlimit_status;
837 #endif
838 
839 	size = 1024;
840 	fd = (int *) palloc(size * sizeof(int));
841 
842 #ifdef HAVE_GETRLIMIT
843 #ifdef RLIMIT_NOFILE			/* most platforms use RLIMIT_NOFILE */
844 	getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
845 #else							/* but BSD doesn't ... */
846 	getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
847 #endif							/* RLIMIT_NOFILE */
848 	if (getrlimit_status != 0)
849 		ereport(WARNING, (errmsg("getrlimit failed: %m")));
850 #endif							/* HAVE_GETRLIMIT */
851 
852 	/* dup until failure or probe limit reached */
853 	for (;;)
854 	{
855 		int			thisfd;
856 
857 #ifdef HAVE_GETRLIMIT
858 
859 		/*
860 		 * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
861 		 * some platforms
862 		 */
863 		if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
864 			break;
865 #endif
866 
867 		thisfd = dup(0);
868 		if (thisfd < 0)
869 		{
870 			/* Expect EMFILE or ENFILE, else it's fishy */
871 			if (errno != EMFILE && errno != ENFILE)
872 				elog(WARNING, "dup(0) failed after %d successes: %m", used);
873 			break;
874 		}
875 
876 		if (used >= size)
877 		{
878 			size *= 2;
879 			fd = (int *) repalloc(fd, size * sizeof(int));
880 		}
881 		fd[used++] = thisfd;
882 
883 		if (highestfd < thisfd)
884 			highestfd = thisfd;
885 
886 		if (used >= max_to_probe)
887 			break;
888 	}
889 
890 	/* release the files we opened */
891 	for (j = 0; j < used; j++)
892 		close(fd[j]);
893 
894 	pfree(fd);
895 
896 	/*
897 	 * Return results.  usable_fds is just the number of successful dups. We
898 	 * assume that the system limit is highestfd+1 (remember 0 is a legal FD
899 	 * number) and so already_open is highestfd+1 - usable_fds.
900 	 */
901 	*usable_fds = used;
902 	*already_open = highestfd + 1 - used;
903 }
904 
905 /*
906  * set_max_safe_fds
907  *		Determine number of filedescriptors that fd.c is allowed to use
908  */
909 void
set_max_safe_fds(void)910 set_max_safe_fds(void)
911 {
912 	int			usable_fds;
913 	int			already_open;
914 
915 	/*----------
916 	 * We want to set max_safe_fds to
917 	 *			MIN(usable_fds, max_files_per_process - already_open)
918 	 * less the slop factor for files that are opened without consulting
919 	 * fd.c.  This ensures that we won't exceed either max_files_per_process
920 	 * or the experimentally-determined EMFILE limit.
921 	 *----------
922 	 */
923 	count_usable_fds(max_files_per_process,
924 					 &usable_fds, &already_open);
925 
926 	max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
927 
928 	/*
929 	 * Take off the FDs reserved for system() etc.
930 	 */
931 	max_safe_fds -= NUM_RESERVED_FDS;
932 
933 	/*
934 	 * Make sure we still have enough to get by.
935 	 */
936 	if (max_safe_fds < FD_MINFREE)
937 		ereport(FATAL,
938 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
939 				 errmsg("insufficient file descriptors available to start server process"),
940 				 errdetail("System allows %d, we need at least %d.",
941 						   max_safe_fds + NUM_RESERVED_FDS,
942 						   FD_MINFREE + NUM_RESERVED_FDS)));
943 
944 	elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
945 		 max_safe_fds, usable_fds, already_open);
946 }
947 
948 /*
949  * Open a file with BasicOpenFilePerm() and pass default file mode for the
950  * fileMode parameter.
951  */
952 int
BasicOpenFile(const char * fileName,int fileFlags)953 BasicOpenFile(const char *fileName, int fileFlags)
954 {
955 	return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
956 }
957 
958 /*
959  * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
960  *
961  * This is exported for use by places that really want a plain kernel FD,
962  * but need to be proof against running out of FDs.  Once an FD has been
963  * successfully returned, it is the caller's responsibility to ensure that
964  * it will not be leaked on ereport()!	Most users should *not* call this
965  * routine directly, but instead use the VFD abstraction level, which
966  * provides protection against descriptor leaks as well as management of
967  * files that need to be open for more than a short period of time.
968  *
969  * Ideally this should be the *only* direct call of open() in the backend.
970  * In practice, the postmaster calls open() directly, and there are some
971  * direct open() calls done early in backend startup.  Those are OK since
972  * this module wouldn't have any open files to close at that point anyway.
973  */
974 int
BasicOpenFilePerm(const char * fileName,int fileFlags,mode_t fileMode)975 BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
976 {
977 	int			fd;
978 
979 tryAgain:
980 	fd = open(fileName, fileFlags, fileMode);
981 
982 	if (fd >= 0)
983 		return fd;				/* success! */
984 
985 	if (errno == EMFILE || errno == ENFILE)
986 	{
987 		int			save_errno = errno;
988 
989 		ereport(LOG,
990 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
991 				 errmsg("out of file descriptors: %m; release and retry")));
992 		errno = 0;
993 		if (ReleaseLruFile())
994 			goto tryAgain;
995 		errno = save_errno;
996 	}
997 
998 	return -1;					/* failure */
999 }
1000 
#if defined(FDDEBUG)

/*
 * _dump_lru --- debugging aid: log the contents of the LRU ring
 *
 * Follows the lruLessRecently links starting from the ring anchor
 * (VfdCache[0]) until it arrives back at entry 0, and emits the visited
 * indexes as one LOG line.  Output that does not fit in the local buffer is
 * truncated by snprintf().
 */
static void
_dump_lru(void)
{
	int			cur = VfdCache[0].lruLessRecently;
	char		buf[2048];
	size_t		len;

	snprintf(buf, sizeof(buf), "LRU: MOST %d ", cur);

	while (cur != 0)
	{
		/* step to the next less-recently-used entry and append its index */
		cur = VfdCache[cur].lruLessRecently;
		len = strlen(buf);
		snprintf(buf + len, sizeof(buf) - len, "%d ", cur);
	}

	len = strlen(buf);
	snprintf(buf + len, sizeof(buf) - len, "LEAST");

	elog(LOG, "%s", buf);
}
#endif							/* FDDEBUG */
1021 
1022 static void
Delete(File file)1023 Delete(File file)
1024 {
1025 	Vfd		   *vfdP;
1026 
1027 	Assert(file != 0);
1028 
1029 	DO_DB(elog(LOG, "Delete %d (%s)",
1030 			   file, VfdCache[file].fileName));
1031 	DO_DB(_dump_lru());
1032 
1033 	vfdP = &VfdCache[file];
1034 
1035 	VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1036 	VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1037 
1038 	DO_DB(_dump_lru());
1039 }
1040 
1041 static void
LruDelete(File file)1042 LruDelete(File file)
1043 {
1044 	Vfd		   *vfdP;
1045 
1046 	Assert(file != 0);
1047 
1048 	DO_DB(elog(LOG, "LruDelete %d (%s)",
1049 			   file, VfdCache[file].fileName));
1050 
1051 	vfdP = &VfdCache[file];
1052 
1053 	/*
1054 	 * Normally we should know the seek position, but if for some reason we
1055 	 * have lost track of it, try again to get it.  If we still can't get it,
1056 	 * we have a problem: we will be unable to restore the file seek position
1057 	 * when and if the file is re-opened.  But we can't really throw an error
1058 	 * and refuse to close the file, or activities such as transaction cleanup
1059 	 * will be broken.
1060 	 */
1061 	if (FilePosIsUnknown(vfdP->seekPos))
1062 	{
1063 		vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
1064 		if (FilePosIsUnknown(vfdP->seekPos))
1065 			elog(LOG, "could not seek file \"%s\" before closing: %m",
1066 				 vfdP->fileName);
1067 	}
1068 
1069 	/*
1070 	 * Close the file.  We aren't expecting this to fail; if it does, better
1071 	 * to leak the FD than to mess up our internal state.
1072 	 */
1073 	if (close(vfdP->fd))
1074 		elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1075 			 "could not close file \"%s\": %m", vfdP->fileName);
1076 	vfdP->fd = VFD_CLOSED;
1077 	--nfile;
1078 
1079 	/* delete the vfd record from the LRU ring */
1080 	Delete(file);
1081 }
1082 
1083 static void
Insert(File file)1084 Insert(File file)
1085 {
1086 	Vfd		   *vfdP;
1087 
1088 	Assert(file != 0);
1089 
1090 	DO_DB(elog(LOG, "Insert %d (%s)",
1091 			   file, VfdCache[file].fileName));
1092 	DO_DB(_dump_lru());
1093 
1094 	vfdP = &VfdCache[file];
1095 
1096 	vfdP->lruMoreRecently = 0;
1097 	vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1098 	VfdCache[0].lruLessRecently = file;
1099 	VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1100 
1101 	DO_DB(_dump_lru());
1102 }
1103 
1104 /* returns 0 on success, -1 on re-open failure (with errno set) */
1105 static int
LruInsert(File file)1106 LruInsert(File file)
1107 {
1108 	Vfd		   *vfdP;
1109 
1110 	Assert(file != 0);
1111 
1112 	DO_DB(elog(LOG, "LruInsert %d (%s)",
1113 			   file, VfdCache[file].fileName));
1114 
1115 	vfdP = &VfdCache[file];
1116 
1117 	if (FileIsNotOpen(file))
1118 	{
1119 		/* Close excess kernel FDs. */
1120 		ReleaseLruFiles();
1121 
1122 		/*
1123 		 * The open could still fail for lack of file descriptors, eg due to
1124 		 * overall system file table being full.  So, be prepared to release
1125 		 * another FD if necessary...
1126 		 */
1127 		vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1128 									 vfdP->fileMode);
1129 		if (vfdP->fd < 0)
1130 		{
1131 			DO_DB(elog(LOG, "re-open failed: %m"));
1132 			return -1;
1133 		}
1134 		else
1135 		{
1136 			++nfile;
1137 		}
1138 
1139 		/*
1140 		 * Seek to the right position.  We need no special case for seekPos
1141 		 * equal to FileUnknownPos, as lseek() will certainly reject that
1142 		 * (thus completing the logic noted in LruDelete() that we will fail
1143 		 * to re-open a file if we couldn't get its seek position before
1144 		 * closing).
1145 		 */
1146 		if (vfdP->seekPos != (off_t) 0)
1147 		{
1148 			if (lseek(vfdP->fd, vfdP->seekPos, SEEK_SET) < 0)
1149 			{
1150 				/*
1151 				 * If we fail to restore the seek position, treat it like an
1152 				 * open() failure.
1153 				 */
1154 				int			save_errno = errno;
1155 
1156 				elog(LOG, "could not seek file \"%s\" after re-opening: %m",
1157 					 vfdP->fileName);
1158 				(void) close(vfdP->fd);
1159 				vfdP->fd = VFD_CLOSED;
1160 				--nfile;
1161 				errno = save_errno;
1162 				return -1;
1163 			}
1164 		}
1165 	}
1166 
1167 	/*
1168 	 * put it at the head of the Lru ring
1169 	 */
1170 
1171 	Insert(file);
1172 
1173 	return 0;
1174 }
1175 
1176 /*
1177  * Release one kernel FD by closing the least-recently-used VFD.
1178  */
1179 static bool
ReleaseLruFile(void)1180 ReleaseLruFile(void)
1181 {
1182 	DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1183 
1184 	if (nfile > 0)
1185 	{
1186 		/*
1187 		 * There are opened files and so there should be at least one used vfd
1188 		 * in the ring.
1189 		 */
1190 		Assert(VfdCache[0].lruMoreRecently != 0);
1191 		LruDelete(VfdCache[0].lruMoreRecently);
1192 		return true;			/* freed a file */
1193 	}
1194 	return false;				/* no files available to free */
1195 }
1196 
1197 /*
1198  * Release kernel FDs as needed to get under the max_safe_fds limit.
1199  * After calling this, it's OK to try to open another file.
1200  */
1201 static void
ReleaseLruFiles(void)1202 ReleaseLruFiles(void)
1203 {
1204 	while (nfile + numAllocatedDescs >= max_safe_fds)
1205 	{
1206 		if (!ReleaseLruFile())
1207 			break;
1208 	}
1209 }
1210 
1211 static File
AllocateVfd(void)1212 AllocateVfd(void)
1213 {
1214 	Index		i;
1215 	File		file;
1216 
1217 	DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1218 
1219 	Assert(SizeVfdCache > 0);	/* InitFileAccess not called? */
1220 
1221 	if (VfdCache[0].nextFree == 0)
1222 	{
1223 		/*
1224 		 * The free list is empty so it is time to increase the size of the
1225 		 * array.  We choose to double it each time this happens. However,
1226 		 * there's not much point in starting *real* small.
1227 		 */
1228 		Size		newCacheSize = SizeVfdCache * 2;
1229 		Vfd		   *newVfdCache;
1230 
1231 		if (newCacheSize < 32)
1232 			newCacheSize = 32;
1233 
1234 		/*
1235 		 * Be careful not to clobber VfdCache ptr if realloc fails.
1236 		 */
1237 		newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1238 		if (newVfdCache == NULL)
1239 			ereport(ERROR,
1240 					(errcode(ERRCODE_OUT_OF_MEMORY),
1241 					 errmsg("out of memory")));
1242 		VfdCache = newVfdCache;
1243 
1244 		/*
1245 		 * Initialize the new entries and link them into the free list.
1246 		 */
1247 		for (i = SizeVfdCache; i < newCacheSize; i++)
1248 		{
1249 			MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1250 			VfdCache[i].nextFree = i + 1;
1251 			VfdCache[i].fd = VFD_CLOSED;
1252 		}
1253 		VfdCache[newCacheSize - 1].nextFree = 0;
1254 		VfdCache[0].nextFree = SizeVfdCache;
1255 
1256 		/*
1257 		 * Record the new size
1258 		 */
1259 		SizeVfdCache = newCacheSize;
1260 	}
1261 
1262 	file = VfdCache[0].nextFree;
1263 
1264 	VfdCache[0].nextFree = VfdCache[file].nextFree;
1265 
1266 	return file;
1267 }
1268 
1269 static void
FreeVfd(File file)1270 FreeVfd(File file)
1271 {
1272 	Vfd		   *vfdP = &VfdCache[file];
1273 
1274 	DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1275 			   file, vfdP->fileName ? vfdP->fileName : ""));
1276 
1277 	if (vfdP->fileName != NULL)
1278 	{
1279 		free(vfdP->fileName);
1280 		vfdP->fileName = NULL;
1281 	}
1282 	vfdP->fdstate = 0x0;
1283 
1284 	vfdP->nextFree = VfdCache[0].nextFree;
1285 	VfdCache[0].nextFree = file;
1286 }
1287 
1288 /* returns 0 on success, -1 on re-open failure (with errno set) */
1289 static int
FileAccess(File file)1290 FileAccess(File file)
1291 {
1292 	int			returnValue;
1293 
1294 	DO_DB(elog(LOG, "FileAccess %d (%s)",
1295 			   file, VfdCache[file].fileName));
1296 
1297 	/*
1298 	 * Is the file open?  If not, open it and put it at the head of the LRU
1299 	 * ring (possibly closing the least recently used file to get an FD).
1300 	 */
1301 
1302 	if (FileIsNotOpen(file))
1303 	{
1304 		returnValue = LruInsert(file);
1305 		if (returnValue != 0)
1306 			return returnValue;
1307 	}
1308 	else if (VfdCache[0].lruLessRecently != file)
1309 	{
1310 		/*
1311 		 * We now know that the file is open and that it is not the last one
1312 		 * accessed, so we need to move it to the head of the Lru ring.
1313 		 */
1314 
1315 		Delete(file);
1316 		Insert(file);
1317 	}
1318 
1319 	return 0;
1320 }
1321 
1322 /*
1323  * Called whenever a temporary file is deleted to report its size.
1324  */
1325 static void
ReportTemporaryFileUsage(const char * path,off_t size)1326 ReportTemporaryFileUsage(const char *path, off_t size)
1327 {
1328 	pgstat_report_tempfile(size);
1329 
1330 	if (log_temp_files >= 0)
1331 	{
1332 		if ((size / 1024) >= log_temp_files)
1333 			ereport(LOG,
1334 					(errmsg("temporary file: path \"%s\", size %lu",
1335 							path, (unsigned long) size)));
1336 	}
1337 }
1338 
1339 /*
1340  * Called to register a temporary file for automatic close.
1341  * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
1342  * before the file was opened.
1343  */
1344 static void
RegisterTemporaryFile(File file)1345 RegisterTemporaryFile(File file)
1346 {
1347 	ResourceOwnerRememberFile(CurrentResourceOwner, file);
1348 	VfdCache[file].resowner = CurrentResourceOwner;
1349 
1350 	/* Backup mechanism for closing at end of xact. */
1351 	VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
1352 	have_xact_temporary_files = true;
1353 }
1354 
/*
 * FileInvalidate --- called when we get a shared invalidation message on
 * some relation
 *
 * Closes the kernel FD of the given VFD if it is currently open; the VFD
 * itself remains allocated.  (Compiled only when NOT_USED is defined.)
 */
#ifdef NOT_USED
void
FileInvalidate(File file)
{
	Assert(FileIsValid(file));

	/* nothing to do unless a kernel FD is open */
	if (FileIsNotOpen(file))
		return;

	LruDelete(file);
}
#endif
1367 
1368 /*
1369  * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1370  * fileMode parameter.
1371  */
1372 File
PathNameOpenFile(const char * fileName,int fileFlags)1373 PathNameOpenFile(const char *fileName, int fileFlags)
1374 {
1375 	return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1376 }
1377 
1378 /*
1379  * open a file in an arbitrary directory
1380  *
1381  * NB: if the passed pathname is relative (which it usually is),
1382  * it will be interpreted relative to the process' working directory
1383  * (which should always be $PGDATA when this code is running).
1384  */
1385 File
PathNameOpenFilePerm(const char * fileName,int fileFlags,mode_t fileMode)1386 PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1387 {
1388 	char	   *fnamecopy;
1389 	File		file;
1390 	Vfd		   *vfdP;
1391 
1392 	DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1393 			   fileName, fileFlags, fileMode));
1394 
1395 	/*
1396 	 * We need a malloc'd copy of the file name; fail cleanly if no room.
1397 	 */
1398 	fnamecopy = strdup(fileName);
1399 	if (fnamecopy == NULL)
1400 		ereport(ERROR,
1401 				(errcode(ERRCODE_OUT_OF_MEMORY),
1402 				 errmsg("out of memory")));
1403 
1404 	file = AllocateVfd();
1405 	vfdP = &VfdCache[file];
1406 
1407 	/* Close excess kernel FDs. */
1408 	ReleaseLruFiles();
1409 
1410 	vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1411 
1412 	if (vfdP->fd < 0)
1413 	{
1414 		int			save_errno = errno;
1415 
1416 		FreeVfd(file);
1417 		free(fnamecopy);
1418 		errno = save_errno;
1419 		return -1;
1420 	}
1421 	++nfile;
1422 	DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1423 			   vfdP->fd));
1424 
1425 	vfdP->fileName = fnamecopy;
1426 	/* Saved flags are adjusted to be OK for re-opening file */
1427 	vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1428 	vfdP->fileMode = fileMode;
1429 	vfdP->seekPos = 0;
1430 	vfdP->fileSize = 0;
1431 	vfdP->fdstate = 0x0;
1432 	vfdP->resowner = NULL;
1433 
1434 	Insert(file);
1435 
1436 	return file;
1437 }
1438 
1439 /*
1440  * Create directory 'directory'.  If necessary, create 'basedir', which must
1441  * be the directory above it.  This is designed for creating the top-level
1442  * temporary directory on demand before creating a directory underneath it.
1443  * Do nothing if the directory already exists.
1444  *
1445  * Directories created within the top-level temporary directory should begin
1446  * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1447  * deleted at startup by RemovePgTempFiles().  Further subdirectories below
1448  * that do not need any particular prefix.
1449 */
1450 void
PathNameCreateTemporaryDir(const char * basedir,const char * directory)1451 PathNameCreateTemporaryDir(const char *basedir, const char *directory)
1452 {
1453 	if (MakePGDirectory(directory) < 0)
1454 	{
1455 		if (errno == EEXIST)
1456 			return;
1457 
1458 		/*
1459 		 * Failed.  Try to create basedir first in case it's missing. Tolerate
1460 		 * EEXIST to close a race against another process following the same
1461 		 * algorithm.
1462 		 */
1463 		if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1464 			ereport(ERROR,
1465 					(errcode_for_file_access(),
1466 					 errmsg("cannot create temporary directory \"%s\": %m",
1467 							basedir)));
1468 
1469 		/* Try again. */
1470 		if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1471 			ereport(ERROR,
1472 					(errcode_for_file_access(),
1473 					 errmsg("cannot create temporary subdirectory \"%s\": %m",
1474 							directory)));
1475 	}
1476 }
1477 
1478 /*
1479  * Delete a directory and everything in it, if it exists.
1480  */
1481 void
PathNameDeleteTemporaryDir(const char * dirname)1482 PathNameDeleteTemporaryDir(const char *dirname)
1483 {
1484 	struct stat statbuf;
1485 
1486 	/* Silently ignore missing directory. */
1487 	if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1488 		return;
1489 
1490 	/*
1491 	 * Currently, walkdir doesn't offer a way for our passed in function to
1492 	 * maintain state.  Perhaps it should, so that we could tell the caller
1493 	 * whether this operation succeeded or failed.  Since this operation is
1494 	 * used in a cleanup path, we wouldn't actually behave differently: we'll
1495 	 * just log failures.
1496 	 */
1497 	walkdir(dirname, unlink_if_exists_fname, false, LOG);
1498 }
1499 
1500 /*
1501  * Open a temporary file that will disappear when we close it.
1502  *
1503  * This routine takes care of generating an appropriate tempfile name.
1504  * There's no need to pass in fileFlags or fileMode either, since only
1505  * one setting makes any sense for a temp file.
1506  *
1507  * Unless interXact is true, the file is remembered by CurrentResourceOwner
1508  * to ensure it's closed and deleted when it's no longer needed, typically at
1509  * the end-of-transaction. In most cases, you don't want temporary files to
1510  * outlive the transaction that created them, so this should be false -- but
1511  * if you need "somewhat" temporary storage, this might be useful. In either
1512  * case, the file is removed when the File is explicitly closed.
1513  */
1514 File
OpenTemporaryFile(bool interXact)1515 OpenTemporaryFile(bool interXact)
1516 {
1517 	File		file = 0;
1518 
1519 	/*
1520 	 * Make sure the current resource owner has space for this File before we
1521 	 * open it, if we'll be registering it below.
1522 	 */
1523 	if (!interXact)
1524 		ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1525 
1526 	/*
1527 	 * If some temp tablespace(s) have been given to us, try to use the next
1528 	 * one.  If a given tablespace can't be found, we silently fall back to
1529 	 * the database's default tablespace.
1530 	 *
1531 	 * BUT: if the temp file is slated to outlive the current transaction,
1532 	 * force it into the database's default tablespace, so that it will not
1533 	 * pose a threat to possible tablespace drop attempts.
1534 	 */
1535 	if (numTempTableSpaces > 0 && !interXact)
1536 	{
1537 		Oid			tblspcOid = GetNextTempTableSpace();
1538 
1539 		if (OidIsValid(tblspcOid))
1540 			file = OpenTemporaryFileInTablespace(tblspcOid, false);
1541 	}
1542 
1543 	/*
1544 	 * If not, or if tablespace is bad, create in database's default
1545 	 * tablespace.  MyDatabaseTableSpace should normally be set before we get
1546 	 * here, but just in case it isn't, fall back to pg_default tablespace.
1547 	 */
1548 	if (file <= 0)
1549 		file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
1550 											 MyDatabaseTableSpace :
1551 											 DEFAULTTABLESPACE_OID,
1552 											 true);
1553 
1554 	/* Mark it for deletion at close and temporary file size limit */
1555 	VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
1556 
1557 	/* Register it with the current resource owner */
1558 	if (!interXact)
1559 		RegisterTemporaryFile(file);
1560 
1561 	return file;
1562 }
1563 
1564 /*
1565  * Return the path of the temp directory in a given tablespace.
1566  */
1567 void
TempTablespacePath(char * path,Oid tablespace)1568 TempTablespacePath(char *path, Oid tablespace)
1569 {
1570 	/*
1571 	 * Identify the tempfile directory for this tablespace.
1572 	 *
1573 	 * If someone tries to specify pg_global, use pg_default instead.
1574 	 */
1575 	if (tablespace == InvalidOid ||
1576 		tablespace == DEFAULTTABLESPACE_OID ||
1577 		tablespace == GLOBALTABLESPACE_OID)
1578 		snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1579 	else
1580 	{
1581 		/* All other tablespaces are accessed via symlinks */
1582 		snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
1583 				 tablespace, TABLESPACE_VERSION_DIRECTORY,
1584 				 PG_TEMP_FILES_DIR);
1585 	}
1586 }
1587 
1588 /*
1589  * Open a temporary file in a specific tablespace.
1590  * Subroutine for OpenTemporaryFile, which see for details.
1591  */
1592 static File
OpenTemporaryFileInTablespace(Oid tblspcOid,bool rejectError)1593 OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1594 {
1595 	char		tempdirpath[MAXPGPATH];
1596 	char		tempfilepath[MAXPGPATH];
1597 	File		file;
1598 
1599 	TempTablespacePath(tempdirpath, tblspcOid);
1600 
1601 	/*
1602 	 * Generate a tempfile name that should be unique within the current
1603 	 * database instance.
1604 	 */
1605 	snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1606 			 tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1607 
1608 	/*
1609 	 * Open the file.  Note: we don't use O_EXCL, in case there is an orphaned
1610 	 * temp file that can be reused.
1611 	 */
1612 	file = PathNameOpenFile(tempfilepath,
1613 							O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1614 	if (file <= 0)
1615 	{
1616 		/*
1617 		 * We might need to create the tablespace's tempfile directory, if no
1618 		 * one has yet done so.
1619 		 *
1620 		 * Don't check for an error from MakePGDirectory; it could fail if
1621 		 * someone else just did the same thing.  If it doesn't work then
1622 		 * we'll bomb out on the second create attempt, instead.
1623 		 */
1624 		(void) MakePGDirectory(tempdirpath);
1625 
1626 		file = PathNameOpenFile(tempfilepath,
1627 								O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1628 		if (file <= 0 && rejectError)
1629 			elog(ERROR, "could not create temporary file \"%s\": %m",
1630 				 tempfilepath);
1631 	}
1632 
1633 	return file;
1634 }
1635 
1636 
1637 /*
1638  * Create a new file.  The directory containing it must already exist.  Files
1639  * created this way are subject to temp_file_limit and are automatically
1640  * closed at end of transaction, but are not automatically deleted on close
1641  * because they are intended to be shared between cooperating backends.
1642  *
1643  * If the file is inside the top-level temporary directory, its name should
1644  * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1645  * and deleted at startup by RemovePgTempFiles().  Alternatively, it can be
1646  * inside a directory created with PathnameCreateTemporaryDir(), in which case
1647  * the prefix isn't needed.
1648  */
1649 File
PathNameCreateTemporaryFile(const char * path,bool error_on_failure)1650 PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1651 {
1652 	File		file;
1653 
1654 	ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1655 
1656 	/*
1657 	 * Open the file.  Note: we don't use O_EXCL, in case there is an orphaned
1658 	 * temp file that can be reused.
1659 	 */
1660 	file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1661 	if (file <= 0)
1662 	{
1663 		if (error_on_failure)
1664 			ereport(ERROR,
1665 					(errcode_for_file_access(),
1666 					 errmsg("could not create temporary file \"%s\": %m",
1667 							path)));
1668 		else
1669 			return file;
1670 	}
1671 
1672 	/* Mark it for temp_file_limit accounting. */
1673 	VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
1674 
1675 	/* Register it for automatic close. */
1676 	RegisterTemporaryFile(file);
1677 
1678 	return file;
1679 }
1680 
1681 /*
1682  * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1683  * another backend.  Files opened this way don't count against the
1684  * temp_file_limit of the caller, are read-only and are automatically closed
1685  * at the end of the transaction but are not deleted on close.
1686  */
1687 File
PathNameOpenTemporaryFile(const char * path)1688 PathNameOpenTemporaryFile(const char *path)
1689 {
1690 	File		file;
1691 
1692 	ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1693 
1694 	/* We open the file read-only. */
1695 	file = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
1696 
1697 	/* If no such file, then we don't raise an error. */
1698 	if (file <= 0 && errno != ENOENT)
1699 		ereport(ERROR,
1700 				(errcode_for_file_access(),
1701 				 errmsg("could not open temporary file \"%s\": %m",
1702 						path)));
1703 
1704 	if (file > 0)
1705 	{
1706 		/* Register it for automatic close. */
1707 		RegisterTemporaryFile(file);
1708 	}
1709 
1710 	return file;
1711 }
1712 
1713 /*
1714  * Delete a file by pathname.  Return true if the file existed, false if
1715  * didn't.
1716  */
1717 bool
PathNameDeleteTemporaryFile(const char * path,bool error_on_failure)1718 PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1719 {
1720 	struct stat filestats;
1721 	int			stat_errno;
1722 
1723 	/* Get the final size for pgstat reporting. */
1724 	if (stat(path, &filestats) != 0)
1725 		stat_errno = errno;
1726 	else
1727 		stat_errno = 0;
1728 
1729 	/*
1730 	 * Unlike FileClose's automatic file deletion code, we tolerate
1731 	 * non-existence to support BufFileDeleteShared which doesn't know how
1732 	 * many segments it has to delete until it runs out.
1733 	 */
1734 	if (stat_errno == ENOENT)
1735 		return false;
1736 
1737 	if (unlink(path) < 0)
1738 	{
1739 		if (errno != ENOENT)
1740 			ereport(error_on_failure ? ERROR : LOG,
1741 					(errcode_for_file_access(),
1742 					 errmsg("cannot unlink temporary file \"%s\": %m",
1743 							path)));
1744 		return false;
1745 	}
1746 
1747 	if (stat_errno == 0)
1748 		ReportTemporaryFileUsage(path, filestats.st_size);
1749 	else
1750 	{
1751 		errno = stat_errno;
1752 		ereport(LOG,
1753 				(errcode_for_file_access(),
1754 				 errmsg("could not stat file \"%s\": %m", path)));
1755 	}
1756 
1757 	return true;
1758 }
1759 
1760 /*
1761  * close a file when done with it
1762  */
1763 void
FileClose(File file)1764 FileClose(File file)
1765 {
1766 	Vfd		   *vfdP;
1767 
1768 	Assert(FileIsValid(file));
1769 
1770 	DO_DB(elog(LOG, "FileClose: %d (%s)",
1771 			   file, VfdCache[file].fileName));
1772 
1773 	vfdP = &VfdCache[file];
1774 
1775 	if (!FileIsNotOpen(file))
1776 	{
1777 		/* close the file */
1778 		if (close(vfdP->fd))
1779 		{
1780 			/*
1781 			 * We may need to panic on failure to close non-temporary files;
1782 			 * see LruDelete.
1783 			 */
1784 			elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1785 				"could not close file \"%s\": %m", vfdP->fileName);
1786 		}
1787 
1788 		--nfile;
1789 		vfdP->fd = VFD_CLOSED;
1790 
1791 		/* remove the file from the lru ring */
1792 		Delete(file);
1793 	}
1794 
1795 	if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1796 	{
1797 		/* Subtract its size from current usage (do first in case of error) */
1798 		temporary_files_size -= vfdP->fileSize;
1799 		vfdP->fileSize = 0;
1800 	}
1801 
1802 	/*
1803 	 * Delete the file if it was temporary, and make a log entry if wanted
1804 	 */
1805 	if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
1806 	{
1807 		struct stat filestats;
1808 		int			stat_errno;
1809 
1810 		/*
1811 		 * If we get an error, as could happen within the ereport/elog calls,
1812 		 * we'll come right back here during transaction abort.  Reset the
1813 		 * flag to ensure that we can't get into an infinite loop.  This code
1814 		 * is arranged to ensure that the worst-case consequence is failing to
1815 		 * emit log message(s), not failing to attempt the unlink.
1816 		 */
1817 		vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
1818 
1819 
1820 		/* first try the stat() */
1821 		if (stat(vfdP->fileName, &filestats))
1822 			stat_errno = errno;
1823 		else
1824 			stat_errno = 0;
1825 
1826 		/* in any case do the unlink */
1827 		if (unlink(vfdP->fileName))
1828 			elog(LOG, "could not unlink file \"%s\": %m", vfdP->fileName);
1829 
1830 		/* and last report the stat results */
1831 		if (stat_errno == 0)
1832 			ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
1833 		else
1834 		{
1835 			errno = stat_errno;
1836 			elog(LOG, "could not stat file \"%s\": %m", vfdP->fileName);
1837 		}
1838 	}
1839 
1840 	/* Unregister it from the resource owner */
1841 	if (vfdP->resowner)
1842 		ResourceOwnerForgetFile(vfdP->resowner, file);
1843 
1844 	/*
1845 	 * Return the Vfd slot to the free list
1846 	 */
1847 	FreeVfd(file);
1848 }
1849 
1850 /*
1851  * FilePrefetch - initiate asynchronous read of a given range of the file.
1852  * The logical seek position is unaffected.
1853  *
1854  * Currently the only implementation of this function is using posix_fadvise
1855  * which is the simplest standardized interface that accomplishes this.
1856  * We could add an implementation using libaio in the future; but note that
1857  * this API is inappropriate for libaio, which wants to have a buffer provided
1858  * to read into.
1859  */
1860 int
FilePrefetch(File file,off_t offset,int amount,uint32 wait_event_info)1861 FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
1862 {
1863 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
1864 	int			returnCode;
1865 
1866 	Assert(FileIsValid(file));
1867 
1868 	DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
1869 			   file, VfdCache[file].fileName,
1870 			   (int64) offset, amount));
1871 
1872 	returnCode = FileAccess(file);
1873 	if (returnCode < 0)
1874 		return returnCode;
1875 
1876 	pgstat_report_wait_start(wait_event_info);
1877 	returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
1878 							   POSIX_FADV_WILLNEED);
1879 	pgstat_report_wait_end();
1880 
1881 	return returnCode;
1882 #else
1883 	Assert(FileIsValid(file));
1884 	return 0;
1885 #endif
1886 }
1887 
1888 void
FileWriteback(File file,off_t offset,off_t nbytes,uint32 wait_event_info)1889 FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
1890 {
1891 	int			returnCode;
1892 
1893 	Assert(FileIsValid(file));
1894 
1895 	DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
1896 			   file, VfdCache[file].fileName,
1897 			   (int64) offset, (int64) nbytes));
1898 
1899 	/*
1900 	 * Caution: do not call pg_flush_data with nbytes = 0, it could trash the
1901 	 * file's seek position.  We prefer to define that as a no-op here.
1902 	 */
1903 	if (nbytes <= 0)
1904 		return;
1905 
1906 	returnCode = FileAccess(file);
1907 	if (returnCode < 0)
1908 		return;
1909 
1910 	pgstat_report_wait_start(wait_event_info);
1911 	pg_flush_data(VfdCache[file].fd, offset, nbytes);
1912 	pgstat_report_wait_end();
1913 }
1914 
1915 int
FileRead(File file,char * buffer,int amount,uint32 wait_event_info)1916 FileRead(File file, char *buffer, int amount, uint32 wait_event_info)
1917 {
1918 	int			returnCode;
1919 	Vfd		   *vfdP;
1920 
1921 	Assert(FileIsValid(file));
1922 
1923 	DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
1924 			   file, VfdCache[file].fileName,
1925 			   (int64) VfdCache[file].seekPos,
1926 			   amount, buffer));
1927 
1928 	returnCode = FileAccess(file);
1929 	if (returnCode < 0)
1930 		return returnCode;
1931 
1932 	vfdP = &VfdCache[file];
1933 
1934 retry:
1935 	pgstat_report_wait_start(wait_event_info);
1936 	returnCode = read(vfdP->fd, buffer, amount);
1937 	pgstat_report_wait_end();
1938 
1939 	if (returnCode >= 0)
1940 	{
1941 		/* if seekPos is unknown, leave it that way */
1942 		if (!FilePosIsUnknown(vfdP->seekPos))
1943 			vfdP->seekPos += returnCode;
1944 	}
1945 	else
1946 	{
1947 		/*
1948 		 * Windows may run out of kernel buffers and return "Insufficient
1949 		 * system resources" error.  Wait a bit and retry to solve it.
1950 		 *
1951 		 * It is rumored that EINTR is also possible on some Unix filesystems,
1952 		 * in which case immediate retry is indicated.
1953 		 */
1954 #ifdef WIN32
1955 		DWORD		error = GetLastError();
1956 
1957 		switch (error)
1958 		{
1959 			case ERROR_NO_SYSTEM_RESOURCES:
1960 				pg_usleep(1000L);
1961 				errno = EINTR;
1962 				break;
1963 			default:
1964 				_dosmaperr(error);
1965 				break;
1966 		}
1967 #endif
1968 		/* OK to retry if interrupted */
1969 		if (errno == EINTR)
1970 			goto retry;
1971 
1972 		/* Trouble, so assume we don't know the file position anymore */
1973 		vfdP->seekPos = FileUnknownPos;
1974 	}
1975 
1976 	return returnCode;
1977 }
1978 
/*
 * FileWrite
 *
 * Write "amount" bytes from "buffer" to a virtual file, bracketing the
 * write() call with pgstat wait-event reporting for wait_event_info.
 *
 * Returns the result of write(), or the negative result of FileAccess() if
 * that fails first.  When the write is short or fails without setting errno,
 * errno is set to ENOSPC.  For files flagged FD_TEMP_FILE_LIMIT, an ERROR is
 * thrown before writing if the write would push the total temporary file
 * size past temp_file_limit, and fileSize / temporary_files_size are updated
 * after a successful write.
 */
int
FileWrite(File file, char *buffer, int amount, uint32 wait_event_info)
{
	int			returnCode;
	Vfd		   *vfdP;

	Assert(FileIsValid(file));

	DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
			   file, VfdCache[file].fileName,
			   (int64) VfdCache[file].seekPos,
			   amount, buffer));

	/* Give up right away if FileAccess() reports failure */
	returnCode = FileAccess(file);
	if (returnCode < 0)
		return returnCode;

	vfdP = &VfdCache[file];

	/*
	 * If enforcing temp_file_limit and it's a temp file, check to see if the
	 * write would overrun temp_file_limit, and throw error if so.  Note: it's
	 * really a modularity violation to throw error here; we should set errno
	 * and return -1.  However, there's no way to report a suitable error
	 * message if we do that.  All current callers would just throw error
	 * immediately anyway, so this is safe at present.
	 */
	if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
	{
		off_t		newPos;

		/*
		 * Normally we should know the seek position, but if for some reason
		 * we have lost track of it, try again to get it.  Here, it's fine to
		 * throw an error if we still can't get it.
		 */
		if (FilePosIsUnknown(vfdP->seekPos))
		{
			vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
			if (FilePosIsUnknown(vfdP->seekPos))
				elog(ERROR, "could not seek file \"%s\": %m", vfdP->fileName);
		}

		/* Only growth beyond the current fileSize counts against the limit */
		newPos = vfdP->seekPos + amount;
		if (newPos > vfdP->fileSize)
		{
			uint64		newTotal = temporary_files_size;

			newTotal += newPos - vfdP->fileSize;
			/* temp_file_limit is multiplied by 1024 to compare against bytes */
			if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
				ereport(ERROR,
						(errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
						 errmsg("temporary file size exceeds temp_file_limit (%dkB)",
								temp_file_limit)));
		}
	}

retry:
	/* Clear errno so we can tell below whether write() set it */
	errno = 0;
	pgstat_report_wait_start(wait_event_info);
	returnCode = write(vfdP->fd, buffer, amount);
	pgstat_report_wait_end();

	/* if write didn't set errno, assume problem is no disk space */
	if (returnCode != amount && errno == 0)
		errno = ENOSPC;

	if (returnCode >= 0)
	{
		/* if seekPos is unknown, leave it that way */
		if (!FilePosIsUnknown(vfdP->seekPos))
			vfdP->seekPos += returnCode;

		/*
		 * Maintain fileSize and temporary_files_size if it's a temp file.
		 *
		 * If seekPos is -1 (unknown), this will do nothing; but we could only
		 * get here in that state if we're not enforcing temporary_files_size,
		 * so we don't care.
		 */
		if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
		{
			off_t		newPos = vfdP->seekPos;

			if (newPos > vfdP->fileSize)
			{
				temporary_files_size += newPos - vfdP->fileSize;
				vfdP->fileSize = newPos;
			}
		}
	}
	else
	{
		/*
		 * See comments in FileRead()
		 */
#ifdef WIN32
		DWORD		error = GetLastError();

		switch (error)
		{
			case ERROR_NO_SYSTEM_RESOURCES:
				/* sleep briefly, then present the failure as EINTR to retry */
				pg_usleep(1000L);
				errno = EINTR;
				break;
			default:
				_dosmaperr(error);
				break;
		}
#endif
		/* OK to retry if interrupted */
		if (errno == EINTR)
			goto retry;

		/* Trouble, so assume we don't know the file position anymore */
		vfdP->seekPos = FileUnknownPos;
	}

	return returnCode;
}
2099 
2100 int
FileSync(File file,uint32 wait_event_info)2101 FileSync(File file, uint32 wait_event_info)
2102 {
2103 	int			returnCode;
2104 
2105 	Assert(FileIsValid(file));
2106 
2107 	DO_DB(elog(LOG, "FileSync: %d (%s)",
2108 			   file, VfdCache[file].fileName));
2109 
2110 	returnCode = FileAccess(file);
2111 	if (returnCode < 0)
2112 		return returnCode;
2113 
2114 	pgstat_report_wait_start(wait_event_info);
2115 	returnCode = pg_fsync(VfdCache[file].fd);
2116 	pgstat_report_wait_end();
2117 
2118 	return returnCode;
2119 }
2120 
2121 off_t
FileSeek(File file,off_t offset,int whence)2122 FileSeek(File file, off_t offset, int whence)
2123 {
2124 	Vfd		   *vfdP;
2125 
2126 	Assert(FileIsValid(file));
2127 
2128 	DO_DB(elog(LOG, "FileSeek: %d (%s) " INT64_FORMAT " " INT64_FORMAT " %d",
2129 			   file, VfdCache[file].fileName,
2130 			   (int64) VfdCache[file].seekPos,
2131 			   (int64) offset, whence));
2132 
2133 	vfdP = &VfdCache[file];
2134 
2135 	if (FileIsNotOpen(file))
2136 	{
2137 		switch (whence)
2138 		{
2139 			case SEEK_SET:
2140 				if (offset < 0)
2141 				{
2142 					errno = EINVAL;
2143 					return (off_t) -1;
2144 				}
2145 				vfdP->seekPos = offset;
2146 				break;
2147 			case SEEK_CUR:
2148 				if (FilePosIsUnknown(vfdP->seekPos) ||
2149 					vfdP->seekPos + offset < 0)
2150 				{
2151 					errno = EINVAL;
2152 					return (off_t) -1;
2153 				}
2154 				vfdP->seekPos += offset;
2155 				break;
2156 			case SEEK_END:
2157 				if (FileAccess(file) < 0)
2158 					return (off_t) -1;
2159 				vfdP->seekPos = lseek(vfdP->fd, offset, whence);
2160 				break;
2161 			default:
2162 				elog(ERROR, "invalid whence: %d", whence);
2163 				break;
2164 		}
2165 	}
2166 	else
2167 	{
2168 		switch (whence)
2169 		{
2170 			case SEEK_SET:
2171 				if (offset < 0)
2172 				{
2173 					errno = EINVAL;
2174 					return (off_t) -1;
2175 				}
2176 				if (vfdP->seekPos != offset)
2177 					vfdP->seekPos = lseek(vfdP->fd, offset, whence);
2178 				break;
2179 			case SEEK_CUR:
2180 				if (offset != 0 || FilePosIsUnknown(vfdP->seekPos))
2181 					vfdP->seekPos = lseek(vfdP->fd, offset, whence);
2182 				break;
2183 			case SEEK_END:
2184 				vfdP->seekPos = lseek(vfdP->fd, offset, whence);
2185 				break;
2186 			default:
2187 				elog(ERROR, "invalid whence: %d", whence);
2188 				break;
2189 		}
2190 	}
2191 
2192 	return vfdP->seekPos;
2193 }
2194 
/*
 * FileTell --- return the seek position cached for a virtual file.
 *
 * XXX compiled only under NOT_USED; kept for completeness.
 */
#ifdef NOT_USED
off_t
FileTell(File file)
{
	off_t		pos;

	Assert(FileIsValid(file));
	DO_DB(elog(LOG, "FileTell %d (%s)",
			   file, VfdCache[file].fileName));

	pos = VfdCache[file].seekPos;
	return pos;
}
#endif
2208 
2209 int
FileTruncate(File file,off_t offset,uint32 wait_event_info)2210 FileTruncate(File file, off_t offset, uint32 wait_event_info)
2211 {
2212 	int			returnCode;
2213 
2214 	Assert(FileIsValid(file));
2215 
2216 	DO_DB(elog(LOG, "FileTruncate %d (%s)",
2217 			   file, VfdCache[file].fileName));
2218 
2219 	returnCode = FileAccess(file);
2220 	if (returnCode < 0)
2221 		return returnCode;
2222 
2223 	pgstat_report_wait_start(wait_event_info);
2224 	returnCode = ftruncate(VfdCache[file].fd, offset);
2225 	pgstat_report_wait_end();
2226 
2227 	if (returnCode == 0 && VfdCache[file].fileSize > offset)
2228 	{
2229 		/* adjust our state for truncation of a temp file */
2230 		Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2231 		temporary_files_size -= VfdCache[file].fileSize - offset;
2232 		VfdCache[file].fileSize = offset;
2233 	}
2234 
2235 	return returnCode;
2236 }
2237 
2238 /*
2239  * Return the pathname associated with an open file.
2240  *
2241  * The returned string points to an internal buffer, which is valid until
2242  * the file is closed.
2243  */
2244 char *
FilePathName(File file)2245 FilePathName(File file)
2246 {
2247 	Assert(FileIsValid(file));
2248 
2249 	return VfdCache[file].fileName;
2250 }
2251 
2252 /*
2253  * Return the raw file descriptor of an opened file.
2254  *
2255  * The returned file descriptor will be valid until the file is closed, but
2256  * there are a lot of things that can make that happen.  So the caller should
2257  * be careful not to do much of anything else before it finishes using the
2258  * returned file descriptor.
2259  */
2260 int
FileGetRawDesc(File file)2261 FileGetRawDesc(File file)
2262 {
2263 	Assert(FileIsValid(file));
2264 	return VfdCache[file].fd;
2265 }
2266 
2267 /*
2268  * FileGetRawFlags - returns the file flags on open(2)
2269  */
2270 int
FileGetRawFlags(File file)2271 FileGetRawFlags(File file)
2272 {
2273 	Assert(FileIsValid(file));
2274 	return VfdCache[file].fileFlags;
2275 }
2276 
2277 /*
2278  * FileGetRawMode - returns the mode bitmask passed to open(2)
2279  */
2280 mode_t
FileGetRawMode(File file)2281 FileGetRawMode(File file)
2282 {
2283 	Assert(FileIsValid(file));
2284 	return VfdCache[file].fileMode;
2285 }
2286 
2287 /*
2288  * Make room for another allocatedDescs[] array entry if needed and possible.
2289  * Returns true if an array element is available.
2290  */
2291 static bool
reserveAllocatedDesc(void)2292 reserveAllocatedDesc(void)
2293 {
2294 	AllocateDesc *newDescs;
2295 	int			newMax;
2296 
2297 	/* Quick out if array already has a free slot. */
2298 	if (numAllocatedDescs < maxAllocatedDescs)
2299 		return true;
2300 
2301 	/*
2302 	 * If the array hasn't yet been created in the current process, initialize
2303 	 * it with FD_MINFREE / 2 elements.  In many scenarios this is as many as
2304 	 * we will ever need, anyway.  We don't want to look at max_safe_fds
2305 	 * immediately because set_max_safe_fds() may not have run yet.
2306 	 */
2307 	if (allocatedDescs == NULL)
2308 	{
2309 		newMax = FD_MINFREE / 2;
2310 		newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2311 		/* Out of memory already?  Treat as fatal error. */
2312 		if (newDescs == NULL)
2313 			ereport(ERROR,
2314 					(errcode(ERRCODE_OUT_OF_MEMORY),
2315 					 errmsg("out of memory")));
2316 		allocatedDescs = newDescs;
2317 		maxAllocatedDescs = newMax;
2318 		return true;
2319 	}
2320 
2321 	/*
2322 	 * Consider enlarging the array beyond the initial allocation used above.
2323 	 * By the time this happens, max_safe_fds should be known accurately.
2324 	 *
2325 	 * We mustn't let allocated descriptors hog all the available FDs, and in
2326 	 * practice we'd better leave a reasonable number of FDs for VFD use.  So
2327 	 * set the maximum to max_safe_fds / 2.  (This should certainly be at
2328 	 * least as large as the initial size, FD_MINFREE / 2.)
2329 	 */
2330 	newMax = max_safe_fds / 2;
2331 	if (newMax > maxAllocatedDescs)
2332 	{
2333 		newDescs = (AllocateDesc *) realloc(allocatedDescs,
2334 											newMax * sizeof(AllocateDesc));
2335 		/* Treat out-of-memory as a non-fatal error. */
2336 		if (newDescs == NULL)
2337 			return false;
2338 		allocatedDescs = newDescs;
2339 		maxAllocatedDescs = newMax;
2340 		return true;
2341 	}
2342 
2343 	/* Can't enlarge allocatedDescs[] any more. */
2344 	return false;
2345 }
2346 
2347 /*
2348  * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2349  * rather than plain fopen().  This lets fd.c deal with freeing FDs if
2350  * necessary to open the file.  When done, call FreeFile rather than fclose.
2351  *
2352  * Note that files that will be open for any significant length of time
2353  * should NOT be handled this way, since they cannot share kernel file
2354  * descriptors with other files; there is grave risk of running out of FDs
2355  * if anyone locks down too many FDs.  Most callers of this routine are
2356  * simply reading a config file that they will read and close immediately.
2357  *
2358  * fd.c will automatically close all files opened with AllocateFile at
2359  * transaction commit or abort; this prevents FD leakage if a routine
2360  * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2361  *
2362  * Ideally this should be the *only* direct call of fopen() in the backend.
2363  */
2364 FILE *
AllocateFile(const char * name,const char * mode)2365 AllocateFile(const char *name, const char *mode)
2366 {
2367 	FILE	   *file;
2368 
2369 	DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2370 			   numAllocatedDescs, name));
2371 
2372 	/* Can we allocate another non-virtual FD? */
2373 	if (!reserveAllocatedDesc())
2374 		ereport(ERROR,
2375 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2376 				 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2377 						maxAllocatedDescs, name)));
2378 
2379 	/* Close excess kernel FDs. */
2380 	ReleaseLruFiles();
2381 
2382 TryAgain:
2383 	if ((file = fopen(name, mode)) != NULL)
2384 	{
2385 		AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2386 
2387 		desc->kind = AllocateDescFile;
2388 		desc->desc.file = file;
2389 		desc->create_subid = GetCurrentSubTransactionId();
2390 		numAllocatedDescs++;
2391 		return desc->desc.file;
2392 	}
2393 
2394 	if (errno == EMFILE || errno == ENFILE)
2395 	{
2396 		int			save_errno = errno;
2397 
2398 		ereport(LOG,
2399 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2400 				 errmsg("out of file descriptors: %m; release and retry")));
2401 		errno = 0;
2402 		if (ReleaseLruFile())
2403 			goto TryAgain;
2404 		errno = save_errno;
2405 	}
2406 
2407 	return NULL;
2408 }
2409 
2410 /*
2411  * Open a file with OpenTransientFilePerm() and pass default file mode for
2412  * the fileMode parameter.
2413  */
2414 int
OpenTransientFile(const char * fileName,int fileFlags)2415 OpenTransientFile(const char *fileName, int fileFlags)
2416 {
2417 	return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2418 }
2419 
2420 /*
2421  * Like AllocateFile, but returns an unbuffered fd like open(2)
2422  */
2423 int
OpenTransientFilePerm(const char * fileName,int fileFlags,mode_t fileMode)2424 OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
2425 {
2426 	int			fd;
2427 
2428 	DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2429 			   numAllocatedDescs, fileName));
2430 
2431 	/* Can we allocate another non-virtual FD? */
2432 	if (!reserveAllocatedDesc())
2433 		ereport(ERROR,
2434 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2435 				 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2436 						maxAllocatedDescs, fileName)));
2437 
2438 	/* Close excess kernel FDs. */
2439 	ReleaseLruFiles();
2440 
2441 	fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2442 
2443 	if (fd >= 0)
2444 	{
2445 		AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2446 
2447 		desc->kind = AllocateDescRawFD;
2448 		desc->desc.fd = fd;
2449 		desc->create_subid = GetCurrentSubTransactionId();
2450 		numAllocatedDescs++;
2451 
2452 		return fd;
2453 	}
2454 
2455 	return -1;					/* failure */
2456 }
2457 
/*
 * Routines that want to initiate a pipe stream should use OpenPipeStream
 * rather than plain popen().  This lets fd.c deal with freeing FDs if
 * necessary.  When done, call ClosePipeStream rather than pclose.
 *
 * This function also ensures that the popen'd program is run with default
 * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
 * uses.  This ensures desirable response to, eg, closing a read pipe early.
 *
 * Returns the stream on success, or NULL with errno set to popen()'s error
 * on failure.  Raises an ERROR if no allocatedDescs[] slot can be reserved.
 */
FILE *
OpenPipeStream(const char *command, const char *mode)
{
	FILE	   *file;
	int			save_errno;

	DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
			   numAllocatedDescs, command));

	/* Can we allocate another non-virtual FD? */
	if (!reserveAllocatedDesc())
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
				 errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
						maxAllocatedDescs, command)));

	/* Close excess kernel FDs. */
	ReleaseLruFiles();

TryAgain:
	/* Flush our stdio buffers before starting the command */
	fflush(stdout);
	fflush(stderr);
	/* Switch SIGPIPE to its default action just for the popen() call */
	pqsignal(SIGPIPE, SIG_DFL);
	errno = 0;
	file = popen(command, mode);
	/* Capture popen's errno before pqsignal() has a chance to change it */
	save_errno = errno;
	pqsignal(SIGPIPE, SIG_IGN);
	errno = save_errno;
	if (file != NULL)
	{
		/* Record the stream in the slot reserved above */
		AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];

		desc->kind = AllocateDescPipe;
		desc->desc.file = file;
		desc->create_subid = GetCurrentSubTransactionId();
		numAllocatedDescs++;
		return desc->desc.file;
	}

	/* Out of descriptors: release one VFD's kernel FD and retry if we can */
	if (errno == EMFILE || errno == ENFILE)
	{
		ereport(LOG,
				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
				 errmsg("out of file descriptors: %m; release and retry")));
		if (ReleaseLruFile())
			goto TryAgain;
		/* nothing left to release; report popen's original failure */
		errno = save_errno;
	}

	return NULL;
}
2518 
2519 /*
2520  * Free an AllocateDesc of any type.
2521  *
2522  * The argument *must* point into the allocatedDescs[] array.
2523  */
2524 static int
FreeDesc(AllocateDesc * desc)2525 FreeDesc(AllocateDesc *desc)
2526 {
2527 	int			result;
2528 
2529 	/* Close the underlying object */
2530 	switch (desc->kind)
2531 	{
2532 		case AllocateDescFile:
2533 			result = fclose(desc->desc.file);
2534 			break;
2535 		case AllocateDescPipe:
2536 			result = pclose(desc->desc.file);
2537 			break;
2538 		case AllocateDescDir:
2539 			result = closedir(desc->desc.dir);
2540 			break;
2541 		case AllocateDescRawFD:
2542 			result = close(desc->desc.fd);
2543 			break;
2544 		default:
2545 			elog(ERROR, "AllocateDesc kind not recognized");
2546 			result = 0;			/* keep compiler quiet */
2547 			break;
2548 	}
2549 
2550 	/* Compact storage in the allocatedDescs array */
2551 	numAllocatedDescs--;
2552 	*desc = allocatedDescs[numAllocatedDescs];
2553 
2554 	return result;
2555 }
2556 
2557 /*
2558  * Close a file returned by AllocateFile.
2559  *
2560  * Note we do not check fclose's return value --- it is up to the caller
2561  * to handle close errors.
2562  */
2563 int
FreeFile(FILE * file)2564 FreeFile(FILE *file)
2565 {
2566 	int			i;
2567 
2568 	DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2569 
2570 	/* Remove file from list of allocated files, if it's present */
2571 	for (i = numAllocatedDescs; --i >= 0;)
2572 	{
2573 		AllocateDesc *desc = &allocatedDescs[i];
2574 
2575 		if (desc->kind == AllocateDescFile && desc->desc.file == file)
2576 			return FreeDesc(desc);
2577 	}
2578 
2579 	/* Only get here if someone passes us a file not in allocatedDescs */
2580 	elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2581 
2582 	return fclose(file);
2583 }
2584 
2585 /*
2586  * Close a file returned by OpenTransientFile.
2587  *
2588  * Note we do not check close's return value --- it is up to the caller
2589  * to handle close errors.
2590  */
2591 int
CloseTransientFile(int fd)2592 CloseTransientFile(int fd)
2593 {
2594 	int			i;
2595 
2596 	DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2597 
2598 	/* Remove fd from list of allocated files, if it's present */
2599 	for (i = numAllocatedDescs; --i >= 0;)
2600 	{
2601 		AllocateDesc *desc = &allocatedDescs[i];
2602 
2603 		if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2604 			return FreeDesc(desc);
2605 	}
2606 
2607 	/* Only get here if someone passes us a file not in allocatedDescs */
2608 	elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2609 
2610 	return close(fd);
2611 }
2612 
2613 /*
2614  * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2615  * rather than plain opendir().  This lets fd.c deal with freeing FDs if
2616  * necessary to open the directory, and with closing it after an elog.
2617  * When done, call FreeDir rather than closedir.
2618  *
2619  * Returns NULL, with errno set, on failure.  Note that failure detection
2620  * is commonly left to the following call of ReadDir or ReadDirExtended;
2621  * see the comments for ReadDir.
2622  *
2623  * Ideally this should be the *only* direct call of opendir() in the backend.
2624  */
2625 DIR *
AllocateDir(const char * dirname)2626 AllocateDir(const char *dirname)
2627 {
2628 	DIR		   *dir;
2629 
2630 	DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2631 			   numAllocatedDescs, dirname));
2632 
2633 	/* Can we allocate another non-virtual FD? */
2634 	if (!reserveAllocatedDesc())
2635 		ereport(ERROR,
2636 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2637 				 errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2638 						maxAllocatedDescs, dirname)));
2639 
2640 	/* Close excess kernel FDs. */
2641 	ReleaseLruFiles();
2642 
2643 TryAgain:
2644 	if ((dir = opendir(dirname)) != NULL)
2645 	{
2646 		AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2647 
2648 		desc->kind = AllocateDescDir;
2649 		desc->desc.dir = dir;
2650 		desc->create_subid = GetCurrentSubTransactionId();
2651 		numAllocatedDescs++;
2652 		return desc->desc.dir;
2653 	}
2654 
2655 	if (errno == EMFILE || errno == ENFILE)
2656 	{
2657 		int			save_errno = errno;
2658 
2659 		ereport(LOG,
2660 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2661 				 errmsg("out of file descriptors: %m; release and retry")));
2662 		errno = 0;
2663 		if (ReleaseLruFile())
2664 			goto TryAgain;
2665 		errno = save_errno;
2666 	}
2667 
2668 	return NULL;
2669 }
2670 
2671 /*
2672  * Read a directory opened with AllocateDir, ereport'ing any error.
2673  *
2674  * This is easier to use than raw readdir() since it takes care of some
2675  * otherwise rather tedious and error-prone manipulation of errno.  Also,
2676  * if you are happy with a generic error message for AllocateDir failure,
2677  * you can just do
2678  *
2679  *		dir = AllocateDir(path);
2680  *		while ((dirent = ReadDir(dir, path)) != NULL)
2681  *			process dirent;
2682  *		FreeDir(dir);
2683  *
2684  * since a NULL dir parameter is taken as indicating AllocateDir failed.
2685  * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2686  * use this shortcut.)
2687  *
2688  * The pathname passed to AllocateDir must be passed to this routine too,
2689  * but it is only used for error reporting.
2690  */
2691 struct dirent *
ReadDir(DIR * dir,const char * dirname)2692 ReadDir(DIR *dir, const char *dirname)
2693 {
2694 	return ReadDirExtended(dir, dirname, ERROR);
2695 }
2696 
2697 /*
2698  * Alternate version of ReadDir that allows caller to specify the elevel
2699  * for any error report (whether it's reporting an initial failure of
2700  * AllocateDir or a subsequent directory read failure).
2701  *
2702  * If elevel < ERROR, returns NULL after any error.  With the normal coding
2703  * pattern, this will result in falling out of the loop immediately as
2704  * though the directory contained no (more) entries.
2705  */
2706 struct dirent *
ReadDirExtended(DIR * dir,const char * dirname,int elevel)2707 ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2708 {
2709 	struct dirent *dent;
2710 
2711 	/* Give a generic message for AllocateDir failure, if caller didn't */
2712 	if (dir == NULL)
2713 	{
2714 		ereport(elevel,
2715 				(errcode_for_file_access(),
2716 				 errmsg("could not open directory \"%s\": %m",
2717 						dirname)));
2718 		return NULL;
2719 	}
2720 
2721 	errno = 0;
2722 	if ((dent = readdir(dir)) != NULL)
2723 		return dent;
2724 
2725 	if (errno)
2726 		ereport(elevel,
2727 				(errcode_for_file_access(),
2728 				 errmsg("could not read directory \"%s\": %m",
2729 						dirname)));
2730 	return NULL;
2731 }
2732 
2733 /*
2734  * Close a directory opened with AllocateDir.
2735  *
2736  * Returns closedir's return value (with errno set if it's not 0).
2737  * Note we do not check the return value --- it is up to the caller
2738  * to handle close errors if wanted.
2739  *
2740  * Does nothing if dir == NULL; we assume that directory open failure was
2741  * already reported if desired.
2742  */
2743 int
FreeDir(DIR * dir)2744 FreeDir(DIR *dir)
2745 {
2746 	int			i;
2747 
2748 	/* Nothing to do if AllocateDir failed */
2749 	if (dir == NULL)
2750 		return 0;
2751 
2752 	DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2753 
2754 	/* Remove dir from list of allocated dirs, if it's present */
2755 	for (i = numAllocatedDescs; --i >= 0;)
2756 	{
2757 		AllocateDesc *desc = &allocatedDescs[i];
2758 
2759 		if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2760 			return FreeDesc(desc);
2761 	}
2762 
2763 	/* Only get here if someone passes us a dir not in allocatedDescs */
2764 	elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
2765 
2766 	return closedir(dir);
2767 }
2768 
2769 
2770 /*
2771  * Close a pipe stream returned by OpenPipeStream.
2772  */
2773 int
ClosePipeStream(FILE * file)2774 ClosePipeStream(FILE *file)
2775 {
2776 	int			i;
2777 
2778 	DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
2779 
2780 	/* Remove file from list of allocated files, if it's present */
2781 	for (i = numAllocatedDescs; --i >= 0;)
2782 	{
2783 		AllocateDesc *desc = &allocatedDescs[i];
2784 
2785 		if (desc->kind == AllocateDescPipe && desc->desc.file == file)
2786 			return FreeDesc(desc);
2787 	}
2788 
2789 	/* Only get here if someone passes us a file not in allocatedDescs */
2790 	elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
2791 
2792 	return pclose(file);
2793 }
2794 
2795 /*
2796  * closeAllVfds
2797  *
2798  * Force all VFDs into the physically-closed state, so that the fewest
2799  * possible number of kernel file descriptors are in use.  There is no
2800  * change in the logical state of the VFDs.
2801  */
2802 void
closeAllVfds(void)2803 closeAllVfds(void)
2804 {
2805 	Index		i;
2806 
2807 	if (SizeVfdCache > 0)
2808 	{
2809 		Assert(FileIsNotOpen(0));	/* Make sure ring not corrupted */
2810 		for (i = 1; i < SizeVfdCache; i++)
2811 		{
2812 			if (!FileIsNotOpen(i))
2813 				LruDelete(i);
2814 		}
2815 	}
2816 }
2817 
2818 
2819 /*
2820  * SetTempTablespaces
2821  *
2822  * Define a list (actually an array) of OIDs of tablespaces to use for
2823  * temporary files.  This list will be used until end of transaction,
2824  * unless this function is called again before then.  It is caller's
2825  * responsibility that the passed-in array has adequate lifespan (typically
2826  * it'd be allocated in TopTransactionContext).
2827  *
2828  * Some entries of the array may be InvalidOid, indicating that the current
2829  * database's default tablespace should be used.
2830  */
2831 void
SetTempTablespaces(Oid * tableSpaces,int numSpaces)2832 SetTempTablespaces(Oid *tableSpaces, int numSpaces)
2833 {
2834 	Assert(numSpaces >= 0);
2835 	tempTableSpaces = tableSpaces;
2836 	numTempTableSpaces = numSpaces;
2837 
2838 	/*
2839 	 * Select a random starting point in the list.  This is to minimize
2840 	 * conflicts between backends that are most likely sharing the same list
2841 	 * of temp tablespaces.  Note that if we create multiple temp files in the
2842 	 * same transaction, we'll advance circularly through the list --- this
2843 	 * ensures that large temporary sort files are nicely spread across all
2844 	 * available tablespaces.
2845 	 */
2846 	if (numSpaces > 1)
2847 		nextTempTableSpace = random() % numSpaces;
2848 	else
2849 		nextTempTableSpace = 0;
2850 }
2851 
2852 /*
2853  * TempTablespacesAreSet
2854  *
2855  * Returns true if SetTempTablespaces has been called in current transaction.
2856  * (This is just so that tablespaces.c doesn't need its own per-transaction
2857  * state.)
2858  */
2859 bool
TempTablespacesAreSet(void)2860 TempTablespacesAreSet(void)
2861 {
2862 	return (numTempTableSpaces >= 0);
2863 }
2864 
2865 /*
2866  * GetTempTablespaces
2867  *
2868  * Populate an array with the OIDs of the tablespaces that should be used for
2869  * temporary files.  (Some entries may be InvalidOid, indicating that the
2870  * current database's default tablespace should be used.)  At most numSpaces
2871  * entries will be filled.
2872  * Returns the number of OIDs that were copied into the output array.
2873  */
2874 int
GetTempTablespaces(Oid * tableSpaces,int numSpaces)2875 GetTempTablespaces(Oid *tableSpaces, int numSpaces)
2876 {
2877 	int			i;
2878 
2879 	Assert(TempTablespacesAreSet());
2880 	for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
2881 		tableSpaces[i] = tempTableSpaces[i];
2882 
2883 	return i;
2884 }
2885 
2886 /*
2887  * GetNextTempTableSpace
2888  *
2889  * Select the next temp tablespace to use.  A result of InvalidOid means
2890  * to use the current database's default tablespace.
2891  */
2892 Oid
GetNextTempTableSpace(void)2893 GetNextTempTableSpace(void)
2894 {
2895 	if (numTempTableSpaces > 0)
2896 	{
2897 		/* Advance nextTempTableSpace counter with wraparound */
2898 		if (++nextTempTableSpace >= numTempTableSpaces)
2899 			nextTempTableSpace = 0;
2900 		return tempTableSpaces[nextTempTableSpace];
2901 	}
2902 	return InvalidOid;
2903 }
2904 
2905 
2906 /*
2907  * AtEOSubXact_Files
2908  *
2909  * Take care of subtransaction commit/abort.  At abort, we close temp files
2910  * that the subtransaction may have opened.  At commit, we reassign the
2911  * files that were opened to the parent subtransaction.
2912  */
2913 void
AtEOSubXact_Files(bool isCommit,SubTransactionId mySubid,SubTransactionId parentSubid)2914 AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
2915 				  SubTransactionId parentSubid)
2916 {
2917 	Index		i;
2918 
2919 	for (i = 0; i < numAllocatedDescs; i++)
2920 	{
2921 		if (allocatedDescs[i].create_subid == mySubid)
2922 		{
2923 			if (isCommit)
2924 				allocatedDescs[i].create_subid = parentSubid;
2925 			else
2926 			{
2927 				/* have to recheck the item after FreeDesc (ugly) */
2928 				FreeDesc(&allocatedDescs[i--]);
2929 			}
2930 		}
2931 	}
2932 }
2933 
2934 /*
2935  * AtEOXact_Files
2936  *
2937  * This routine is called during transaction commit or abort.  All still-open
2938  * per-transaction temporary file VFDs are closed, which also causes the
2939  * underlying files to be deleted (although they should've been closed already
2940  * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
2941  * closed. We also forget any transaction-local temp tablespace list.
2942  *
2943  * The isCommit flag is used only to decide whether to emit warnings about
2944  * unclosed files.
2945  */
2946 void
AtEOXact_Files(bool isCommit)2947 AtEOXact_Files(bool isCommit)
2948 {
2949 	CleanupTempFiles(isCommit, false);
2950 	tempTableSpaces = NULL;
2951 	numTempTableSpaces = -1;
2952 }
2953 
2954 /*
2955  * AtProcExit_Files
2956  *
2957  * on_proc_exit hook to clean up temp files during backend shutdown.
2958  * Here, we want to clean up *all* temp files including interXact ones.
2959  */
2960 static void
AtProcExit_Files(int code,Datum arg)2961 AtProcExit_Files(int code, Datum arg)
2962 {
2963 	CleanupTempFiles(false, true);
2964 }
2965 
2966 /*
2967  * Close temporary files and delete their underlying files.
2968  *
2969  * isCommit: if true, this is normal transaction commit, and we don't
2970  * expect any remaining files; warn if there are some.
2971  *
2972  * isProcExit: if true, this is being called as the backend process is
2973  * exiting. If that's the case, we should remove all temporary files; if
2974  * that's not the case, we are being called for transaction commit/abort
2975  * and should only remove transaction-local temp files.  In either case,
2976  * also clean up "allocated" stdio files, dirs and fds.
2977  */
2978 static void
CleanupTempFiles(bool isCommit,bool isProcExit)2979 CleanupTempFiles(bool isCommit, bool isProcExit)
2980 {
2981 	Index		i;
2982 
2983 	/*
2984 	 * Careful here: at proc_exit we need extra cleanup, not just
2985 	 * xact_temporary files.
2986 	 */
2987 	if (isProcExit || have_xact_temporary_files)
2988 	{
2989 		Assert(FileIsNotOpen(0));	/* Make sure ring not corrupted */
2990 		for (i = 1; i < SizeVfdCache; i++)
2991 		{
2992 			unsigned short fdstate = VfdCache[i].fdstate;
2993 
2994 			if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
2995 				VfdCache[i].fileName != NULL)
2996 			{
2997 				/*
2998 				 * If we're in the process of exiting a backend process, close
2999 				 * all temporary files. Otherwise, only close temporary files
3000 				 * local to the current transaction. They should be closed by
3001 				 * the ResourceOwner mechanism already, so this is just a
3002 				 * debugging cross-check.
3003 				 */
3004 				if (isProcExit)
3005 					FileClose(i);
3006 				else if (fdstate & FD_CLOSE_AT_EOXACT)
3007 				{
3008 					elog(WARNING,
3009 						 "temporary file %s not closed at end-of-transaction",
3010 						 VfdCache[i].fileName);
3011 					FileClose(i);
3012 				}
3013 			}
3014 		}
3015 
3016 		have_xact_temporary_files = false;
3017 	}
3018 
3019 	/* Complain if any allocated files remain open at commit. */
3020 	if (isCommit && numAllocatedDescs > 0)
3021 		elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
3022 			 numAllocatedDescs);
3023 
3024 	/* Clean up "allocated" stdio files, dirs and fds. */
3025 	while (numAllocatedDescs > 0)
3026 		FreeDesc(&allocatedDescs[0]);
3027 }
3028 
3029 
3030 /*
3031  * Remove temporary and temporary relation files left over from a prior
3032  * postmaster session
3033  *
3034  * This should be called during postmaster startup.  It will forcibly
3035  * remove any leftover files created by OpenTemporaryFile and any leftover
3036  * temporary relation files created by mdcreate.
3037  *
3038  * NOTE: we could, but don't, call this during a post-backend-crash restart
3039  * cycle.  The argument for not doing it is that someone might want to examine
3040  * the temp files for debugging purposes.  This does however mean that
3041  * OpenTemporaryFile had better allow for collision with an existing temp
3042  * file name.
3043  *
3044  * NOTE: this function and its subroutines generally report syscall failures
3045  * with ereport(LOG) and keep going.  Removing temp files is not so critical
3046  * that we should fail to start the database when we can't do it.
3047  */
3048 void
RemovePgTempFiles(void)3049 RemovePgTempFiles(void)
3050 {
3051 	char		temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
3052 	DIR		   *spc_dir;
3053 	struct dirent *spc_de;
3054 
3055 	/*
3056 	 * First process temp files in pg_default ($PGDATA/base)
3057 	 */
3058 	snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3059 	RemovePgTempFilesInDir(temp_path, true, false);
3060 	RemovePgTempRelationFiles("base");
3061 
3062 	/*
3063 	 * Cycle through temp directories for all non-default tablespaces.
3064 	 */
3065 	spc_dir = AllocateDir("pg_tblspc");
3066 
3067 	while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL)
3068 	{
3069 		if (strcmp(spc_de->d_name, ".") == 0 ||
3070 			strcmp(spc_de->d_name, "..") == 0)
3071 			continue;
3072 
3073 		snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
3074 				 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
3075 		RemovePgTempFilesInDir(temp_path, true, false);
3076 
3077 		snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
3078 				 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
3079 		RemovePgTempRelationFiles(temp_path);
3080 	}
3081 
3082 	FreeDir(spc_dir);
3083 
3084 	/*
3085 	 * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3086 	 * DataDir as well.
3087 	 */
3088 #ifdef EXEC_BACKEND
3089 	RemovePgTempFilesInDir(PG_TEMP_FILES_DIR, true, false);
3090 #endif
3091 }
3092 
3093 /*
3094  * Process one pgsql_tmp directory for RemovePgTempFiles.
3095  *
3096  * If missing_ok is true, it's all right for the named directory to not exist.
3097  * Any other problem results in a LOG message.  (missing_ok should be true at
3098  * the top level, since pgsql_tmp directories are not created until needed.)
3099  *
3100  * At the top level, this should be called with unlink_all = false, so that
3101  * only files matching the temporary name prefix will be unlinked.  When
3102  * recursing it will be called with unlink_all = true to unlink everything
3103  * under a top-level temporary directory.
3104  *
3105  * (These two flags could be replaced by one, but it seems clearer to keep
3106  * them separate.)
3107  */
3108 static void
RemovePgTempFilesInDir(const char * tmpdirname,bool missing_ok,bool unlink_all)3109 RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3110 {
3111 	DIR		   *temp_dir;
3112 	struct dirent *temp_de;
3113 	char		rm_path[MAXPGPATH * 2];
3114 
3115 	temp_dir = AllocateDir(tmpdirname);
3116 
3117 	if (temp_dir == NULL && errno == ENOENT && missing_ok)
3118 		return;
3119 
3120 	while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
3121 	{
3122 		if (strcmp(temp_de->d_name, ".") == 0 ||
3123 			strcmp(temp_de->d_name, "..") == 0)
3124 			continue;
3125 
3126 		snprintf(rm_path, sizeof(rm_path), "%s/%s",
3127 				 tmpdirname, temp_de->d_name);
3128 
3129 		if (unlink_all ||
3130 			strncmp(temp_de->d_name,
3131 					PG_TEMP_FILE_PREFIX,
3132 					strlen(PG_TEMP_FILE_PREFIX)) == 0)
3133 		{
3134 			struct stat statbuf;
3135 
3136 			if (lstat(rm_path, &statbuf) < 0)
3137 			{
3138 				ereport(LOG,
3139 						(errcode_for_file_access(),
3140 						 errmsg("could not stat file \"%s\": %m", rm_path)));
3141 				continue;
3142 			}
3143 
3144 			if (S_ISDIR(statbuf.st_mode))
3145 			{
3146 				/* recursively remove contents, then directory itself */
3147 				RemovePgTempFilesInDir(rm_path, false, true);
3148 
3149 				if (rmdir(rm_path) < 0)
3150 					ereport(LOG,
3151 							(errcode_for_file_access(),
3152 							 errmsg("could not remove directory \"%s\": %m",
3153 									rm_path)));
3154 			}
3155 			else
3156 			{
3157 				if (unlink(rm_path) < 0)
3158 					ereport(LOG,
3159 							(errcode_for_file_access(),
3160 							 errmsg("could not remove file \"%s\": %m",
3161 									rm_path)));
3162 			}
3163 		}
3164 		else
3165 			ereport(LOG,
3166 					(errmsg("unexpected file found in temporary-files directory: \"%s\"",
3167 							rm_path)));
3168 	}
3169 
3170 	FreeDir(temp_dir);
3171 }
3172 
3173 /* Process one tablespace directory, look for per-DB subdirectories */
3174 static void
RemovePgTempRelationFiles(const char * tsdirname)3175 RemovePgTempRelationFiles(const char *tsdirname)
3176 {
3177 	DIR		   *ts_dir;
3178 	struct dirent *de;
3179 	char		dbspace_path[MAXPGPATH * 2];
3180 
3181 	ts_dir = AllocateDir(tsdirname);
3182 
3183 	while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3184 	{
3185 		/*
3186 		 * We're only interested in the per-database directories, which have
3187 		 * numeric names.  Note that this code will also (properly) ignore "."
3188 		 * and "..".
3189 		 */
3190 		if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3191 			continue;
3192 
3193 		snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3194 				 tsdirname, de->d_name);
3195 		RemovePgTempRelationFilesInDbspace(dbspace_path);
3196 	}
3197 
3198 	FreeDir(ts_dir);
3199 }
3200 
3201 /* Process one per-dbspace directory for RemovePgTempRelationFiles */
3202 static void
RemovePgTempRelationFilesInDbspace(const char * dbspacedirname)3203 RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3204 {
3205 	DIR		   *dbspace_dir;
3206 	struct dirent *de;
3207 	char		rm_path[MAXPGPATH * 2];
3208 
3209 	dbspace_dir = AllocateDir(dbspacedirname);
3210 
3211 	while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3212 	{
3213 		if (!looks_like_temp_rel_name(de->d_name))
3214 			continue;
3215 
3216 		snprintf(rm_path, sizeof(rm_path), "%s/%s",
3217 				 dbspacedirname, de->d_name);
3218 
3219 		if (unlink(rm_path) < 0)
3220 			ereport(LOG,
3221 					(errcode_for_file_access(),
3222 					 errmsg("could not remove file \"%s\": %m",
3223 							rm_path)));
3224 	}
3225 
3226 	FreeDir(dbspace_dir);
3227 }
3228 
/*
 * looks_like_temp_rel_name
 *
 * Returns true if "name" has the shape of a temporary relation file name:
 *
 *		t<digits>_<digits>[_<forkname>][.<digits>]
 *
 * Both digit runs after the "t" must be non-empty.  The optional fork name
 * is recognized by forkname_chars(), and the optional ".<digits>" segment
 * suffix needs at least one digit.  Nothing may follow.
 */
bool
looks_like_temp_rel_name(const char *name)
{
	const char *cur = name;
	const char *run_start;

	/* Must start with "t". */
	if (*cur != 't')
		return false;
	cur++;

	/* A non-empty run of digits, terminated by an underscore. */
	run_start = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == run_start || *cur != '_')
		return false;
	cur++;

	/* A second non-empty run of digits. */
	run_start = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == run_start)
		return false;

	/* Optional "_forkname". */
	if (*cur == '_')
	{
		int			forklen = forkname_chars(cur + 1, NULL);

		if (forklen <= 0)
			return false;
		cur += forklen + 1;		/* the underscore plus the fork name */
	}

	/* Optional ".segment", needing at least one digit after the dot. */
	if (*cur == '.')
	{
		run_start = cur + 1;
		cur = run_start;
		while (isdigit((unsigned char) *cur))
			cur++;
		if (cur == run_start)
			return false;
	}

	/* Anything still left over disqualifies the name. */
	return *cur == '\0';
}
3277 
3278 
3279 /*
3280  * Issue fsync recursively on PGDATA and all its contents.
3281  *
3282  * We fsync regular files and directories wherever they are, but we
3283  * follow symlinks only for pg_wal and immediately under pg_tblspc.
3284  * Other symlinks are presumed to point at files we're not responsible
3285  * for fsyncing, and might not have privileges to write at all.
3286  *
3287  * Errors are logged but not considered fatal; that's because this is used
3288  * only during database startup, to deal with the possibility that there are
3289  * issued-but-unsynced writes pending against the data directory.  We want to
3290  * ensure that such writes reach disk before anything that's done in the new
3291  * run.  However, aborting on error would result in failure to start for
3292  * harmless cases such as read-only files in the data directory, and that's
3293  * not good either.
3294  *
3295  * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3296  * rewriting all changes again during recovery.
3297  *
3298  * Note we assume we're chdir'd into PGDATA to begin with.
3299  */
3300 void
SyncDataDirectory(void)3301 SyncDataDirectory(void)
3302 {
3303 	bool		xlog_is_symlink;
3304 
3305 	/* We can skip this whole thing if fsync is disabled. */
3306 	if (!enableFsync)
3307 		return;
3308 
3309 	/*
3310 	 * If pg_wal is a symlink, we'll need to recurse into it separately,
3311 	 * because the first walkdir below will ignore it.
3312 	 */
3313 	xlog_is_symlink = false;
3314 
3315 #ifndef WIN32
3316 	{
3317 		struct stat st;
3318 
3319 		if (lstat("pg_wal", &st) < 0)
3320 			ereport(LOG,
3321 					(errcode_for_file_access(),
3322 					 errmsg("could not stat file \"%s\": %m",
3323 							"pg_wal")));
3324 		else if (S_ISLNK(st.st_mode))
3325 			xlog_is_symlink = true;
3326 	}
3327 #else
3328 	if (pgwin32_is_junction("pg_wal"))
3329 		xlog_is_symlink = true;
3330 #endif
3331 
3332 	/*
3333 	 * If possible, hint to the kernel that we're soon going to fsync the data
3334 	 * directory and its contents.  Errors in this step are even less
3335 	 * interesting than normal, so log them only at DEBUG1.
3336 	 */
3337 #ifdef PG_FLUSH_DATA_WORKS
3338 	walkdir(".", pre_sync_fname, false, DEBUG1);
3339 	if (xlog_is_symlink)
3340 		walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3341 	walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
3342 #endif
3343 
3344 	/*
3345 	 * Now we do the fsync()s in the same order.
3346 	 *
3347 	 * The main call ignores symlinks, so in addition to specially processing
3348 	 * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3349 	 * process_symlinks = true.  Note that if there are any plain directories
3350 	 * in pg_tblspc, they'll get fsync'd twice.  That's not an expected case
3351 	 * so we don't worry about optimizing it.
3352 	 */
3353 	walkdir(".", datadir_fsync_fname, false, LOG);
3354 	if (xlog_is_symlink)
3355 		walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3356 	walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
3357 }
3358 
3359 /*
3360  * walkdir: recursively walk a directory, applying the action to each
3361  * regular file and directory (including the named directory itself).
3362  *
3363  * If process_symlinks is true, the action and recursion are also applied
3364  * to regular files and directories that are pointed to by symlinks in the
3365  * given directory; otherwise symlinks are ignored.  Symlinks are always
3366  * ignored in subdirectories, ie we intentionally don't pass down the
3367  * process_symlinks flag to recursive calls.
3368  *
3369  * Errors are reported at level elevel, which might be ERROR or less.
3370  *
3371  * See also walkdir in initdb.c, which is a frontend version of this logic.
3372  */
3373 static void
walkdir(const char * path,void (* action)(const char * fname,bool isdir,int elevel),bool process_symlinks,int elevel)3374 walkdir(const char *path,
3375 		void (*action) (const char *fname, bool isdir, int elevel),
3376 		bool process_symlinks,
3377 		int elevel)
3378 {
3379 	DIR		   *dir;
3380 	struct dirent *de;
3381 
3382 	dir = AllocateDir(path);
3383 
3384 	while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3385 	{
3386 		char		subpath[MAXPGPATH * 2];
3387 		struct stat fst;
3388 		int			sret;
3389 
3390 		CHECK_FOR_INTERRUPTS();
3391 
3392 		if (strcmp(de->d_name, ".") == 0 ||
3393 			strcmp(de->d_name, "..") == 0)
3394 			continue;
3395 
3396 		snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3397 
3398 		if (process_symlinks)
3399 			sret = stat(subpath, &fst);
3400 		else
3401 			sret = lstat(subpath, &fst);
3402 
3403 		if (sret < 0)
3404 		{
3405 			ereport(elevel,
3406 					(errcode_for_file_access(),
3407 					 errmsg("could not stat file \"%s\": %m", subpath)));
3408 			continue;
3409 		}
3410 
3411 		if (S_ISREG(fst.st_mode))
3412 			(*action) (subpath, false, elevel);
3413 		else if (S_ISDIR(fst.st_mode))
3414 			walkdir(subpath, action, false, elevel);
3415 	}
3416 
3417 	FreeDir(dir);				/* we ignore any error here */
3418 
3419 	/*
3420 	 * It's important to fsync the destination directory itself as individual
3421 	 * file fsyncs don't guarantee that the directory entry for the file is
3422 	 * synced.  However, skip this if AllocateDir failed; the action function
3423 	 * might not be robust against that.
3424 	 */
3425 	if (dir)
3426 		(*action) (path, true, elevel);
3427 }
3428 
3429 
/*
 * pre_sync_fname
 *
 * walkdir action that hints to the OS that it should get ready to fsync()
 * this file.
 *
 * Directories are skipped.  Failure to open a file because it is unreadable
 * (EACCES) is ignored silently; other open failures are reported at the
 * caller-specified elevel.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Flushing a directory would likely just fail, so don't try. */
	if (isdir)
		return;

	fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);
	if (fd < 0)
	{
		/* Stay quiet about unreadable files; report anything else. */
		if (errno != EACCES)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * This is only a hint, so it is fine that pg_flush_data() ignores
	 * errors.
	 */
	pg_flush_data(fd, 0, 0);

	(void) CloseTransientFile(fd);
}

#endif							/* PG_FLUSH_DATA_WORKS */
3469 
/*
 * datadir_fsync_fname
 *
 * walkdir action used by SyncDataDirectory: fsync one file or directory via
 * fsync_fname_ext().  The result of fsync_fname_ext() is discarded.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	/*
	 * We want to silently ignore errors about unreadable files.  Pass that
	 * desire on to fsync_fname_ext() by setting ignore_perm = true.
	 */
	fsync_fname_ext(fname, isdir, true, elevel);
}
3479 
/*
 * unlink_if_exists_fname
 *
 * Remove one file or directory; the signature matches a walkdir() action.
 *
 * A directory is removed with rmdir(); a failure other than ENOENT is
 * reported at elevel.  A plain file is handed to
 * PathNameDeleteTemporaryFile(), and elevel is not used on that path.
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (!isdir)
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
		return;
	}

	/* A directory that is already gone is not worth a complaint. */
	if (rmdir(fname) != 0 && errno != ENOENT)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not rmdir directory \"%s\": %m", fname)));
}
3496 
3497 /*
3498  * fsync_fname_ext -- Try to fsync a file or directory
3499  *
3500  * If ignore_perm is true, ignore errors upon trying to open unreadable
3501  * files. Logs other errors at a caller-specified level.
3502  *
3503  * Returns 0 if the operation succeeded, -1 otherwise.
3504  */
3505 static int
fsync_fname_ext(const char * fname,bool isdir,bool ignore_perm,int elevel)3506 fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3507 {
3508 	int			fd;
3509 	int			flags;
3510 	int			returncode;
3511 
3512 	/*
3513 	 * Some OSs require directories to be opened read-only whereas other
3514 	 * systems don't allow us to fsync files opened read-only; so we need both
3515 	 * cases here.  Using O_RDWR will cause us to fail to fsync files that are
3516 	 * not writable by our userid, but we assume that's OK.
3517 	 */
3518 	flags = PG_BINARY;
3519 	if (!isdir)
3520 		flags |= O_RDWR;
3521 	else
3522 		flags |= O_RDONLY;
3523 
3524 	fd = OpenTransientFile(fname, flags);
3525 
3526 	/*
3527 	 * Some OSs don't allow us to open directories at all (Windows returns
3528 	 * EACCES), just ignore the error in that case.  If desired also silently
3529 	 * ignoring errors about unreadable files. Log others.
3530 	 */
3531 	if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3532 		return 0;
3533 	else if (fd < 0 && ignore_perm && errno == EACCES)
3534 		return 0;
3535 	else if (fd < 0)
3536 	{
3537 		ereport(elevel,
3538 				(errcode_for_file_access(),
3539 				 errmsg("could not open file \"%s\": %m", fname)));
3540 		return -1;
3541 	}
3542 
3543 	returncode = pg_fsync(fd);
3544 
3545 	/*
3546 	 * Some OSes don't allow us to fsync directories at all, so we can ignore
3547 	 * those errors. Anything else needs to be logged.
3548 	 */
3549 	if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3550 	{
3551 		int			save_errno;
3552 
3553 		/* close file upon error, might not be in transaction context */
3554 		save_errno = errno;
3555 		(void) CloseTransientFile(fd);
3556 		errno = save_errno;
3557 
3558 		ereport(elevel,
3559 				(errcode_for_file_access(),
3560 				 errmsg("could not fsync file \"%s\": %m", fname)));
3561 		return -1;
3562 	}
3563 
3564 	(void) CloseTransientFile(fd);
3565 
3566 	return 0;
3567 }
3568 
3569 /*
3570  * fsync_parent_path -- fsync the parent path of a file or directory
3571  *
3572  * This is aimed at making file operations persistent on disk in case of
3573  * an OS crash or power failure.
3574  */
3575 static int
fsync_parent_path(const char * fname,int elevel)3576 fsync_parent_path(const char *fname, int elevel)
3577 {
3578 	char		parentpath[MAXPGPATH];
3579 
3580 	strlcpy(parentpath, fname, MAXPGPATH);
3581 	get_parent_directory(parentpath);
3582 
3583 	/*
3584 	 * get_parent_directory() returns an empty string if the input argument is
3585 	 * just a file name (see comments in path.c), so handle that as being the
3586 	 * current directory.
3587 	 */
3588 	if (strlen(parentpath) == 0)
3589 		strlcpy(parentpath, ".", MAXPGPATH);
3590 
3591 	if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3592 		return -1;
3593 
3594 	return 0;
3595 }
3596 
/*
 * Create a PostgreSQL data sub-directory
 *
 * The data directory itself, and most of its sub-directories, are created at
 * initdb time, but we do have some occasions when we create directories in
 * the backend (CREATE TABLESPACE, for example).  In those cases, we want to
 * make sure that those directories are created consistently.  Today, that means
 * making sure that the created directory has the correct permissions, which is
 * what pg_dir_create_mode tracks for us.
 *
 * Note that we also set the umask() based on what we understand the correct
 * permissions to be (see file_perm.c).
 *
 * For permissions other than the default, mkdir() can be used directly, but
 * be sure to consider carefully such cases -- a sub-directory with incorrect
 * permissions in a PostgreSQL data directory could cause backups and other
 * processes to fail.
 *
 * The result of mkdir() is returned unchanged; nothing is logged here, so
 * reporting a failure is left to the caller.
 */
int
MakePGDirectory(const char *directoryName)
{
	return mkdir(directoryName, pg_dir_create_mode);
}
3620 
3621 /*
3622  * Return the passed-in error level, or PANIC if data_sync_retry is off.
3623  *
3624  * Failure to fsync any data file is cause for immediate panic, unless
3625  * data_sync_retry is enabled.  Data may have been written to the operating
3626  * system and removed from our buffer pool already, and if we are running on
3627  * an operating system that forgets dirty data on write-back failure, there
3628  * may be only one copy of the data remaining: in the WAL.  A later attempt to
3629  * fsync again might falsely report success.  Therefore we must not allow any
3630  * further checkpoints to be attempted.  data_sync_retry can in theory be
3631  * enabled on systems known not to drop dirty buffered data on write-back
3632  * failure (with the likely outcome that checkpoints will continue to fail
3633  * until the underlying problem is fixed).
3634  *
3635  * Any code that reports a failure from fsync() or related functions should
3636  * filter the error level with this function.
3637  */
3638 int
data_sync_elevel(int elevel)3639 data_sync_elevel(int elevel)
3640 {
3641 	return data_sync_retry ? elevel : PANIC;
3642 }
3643