1 /*-------------------------------------------------------------------------
2  *
3  * fd.c
4  *	  Virtual file descriptor code.
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  *	  src/backend/storage/file/fd.c
11  *
12  * NOTES:
13  *
14  * This code manages a cache of 'virtual' file descriptors (VFDs).
15  * The server opens many file descriptors for a variety of reasons,
16  * including base tables, scratch files (e.g., sort and hash spool
17  * files), and random calls to C library routines like system(3); it
18  * is quite easy to exceed system limits on the number of open files a
19  * single process can have.  (This is around 256 on many modern
20  * operating systems, but can be as low as 32 on others.)
21  *
22  * VFDs are managed as an LRU pool, with actual OS file descriptors
23  * being opened and closed as needed.  Obviously, if a file is
24  * opened using these interfaces, all subsequent operations must also
25  * be through these interfaces (the File type is not a real file
26  * descriptor).
27  *
28  * For this scheme to work, most (if not all) routines throughout the
29  * server should use these interfaces instead of calling the C library
30  * routines (e.g., open(2) and fopen(3)) themselves.  Otherwise, we
31  * may find ourselves short of real file descriptors anyway.
32  *
33  * INTERFACE ROUTINES
34  *
35  * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36  * A File opened with OpenTemporaryFile is automatically deleted when the
37  * File is closed, either explicitly or implicitly at end of transaction or
38  * process exit. PathNameOpenFile is intended for files that are held open
39  * for a long time, like relation files. It is the caller's responsibility
40  * to close them; there is no automatic mechanism in fd.c for that.
41  *
42  * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
43  * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
44  * They behave like the corresponding native functions, except that the handle
45  * is registered with the current subtransaction, and will be automatically
46  * closed at abort. These are intended mainly for short operations like
47  * reading a configuration file; there is a limit on the number of files that
48  * can be opened using these functions at any one time.
49  *
50  * Finally, BasicOpenFile is just a thin wrapper around open() that can
51  * release file descriptors in use by the virtual file descriptors if
52  * necessary. There is no automatic cleanup of file descriptors returned by
53  * BasicOpenFile; it is solely the caller's responsibility to close the file
54  * descriptor by calling close(2).
55  *
56  *-------------------------------------------------------------------------
57  */
58 
59 #include "postgres.h"
60 
61 #include <sys/file.h>
62 #include <sys/param.h>
63 #include <sys/stat.h>
64 #ifndef WIN32
65 #include <sys/mman.h>
66 #endif
67 #include <limits.h>
68 #include <unistd.h>
69 #include <fcntl.h>
70 #ifdef HAVE_SYS_RESOURCE_H
71 #include <sys/resource.h>		/* for getrlimit */
72 #endif
73 
74 #include "miscadmin.h"
75 #include "access/xact.h"
76 #include "access/xlog.h"
77 #include "catalog/catalog.h"
78 #include "catalog/pg_tablespace.h"
79 #include "pgstat.h"
80 #include "portability/mem.h"
81 #include "storage/fd.h"
82 #include "storage/ipc.h"
83 #include "utils/guc.h"
84 #include "utils/resowner_private.h"
85 
86 
/*
 * Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data.
 *
 * The three conditions tested here are the same three, in the same order,
 * that guard the implementation sections inside pg_flush_data():
 * sync_file_range(), msync(MS_ASYNC) on a mapped region, and
 * posix_fadvise(POSIX_FADV_DONTNEED).  If none holds, the macro stays
 * undefined.
 */
#if defined(HAVE_SYNC_FILE_RANGE)
#define PG_FLUSH_DATA_WORKS 1
#elif !defined(WIN32) && defined(MS_ASYNC)
#define PG_FLUSH_DATA_WORKS 1
#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
#define PG_FLUSH_DATA_WORKS 1
#endif
95 
/*
 * We must leave some file descriptors free for system(), the dynamic loader,
 * and other code that tries to open files without consulting fd.c.  This
 * is the number left free.  (While we can be pretty sure we won't get
 * EMFILE, there's never any guarantee that we won't get ENFILE due to
 * other processes chewing up FDs.  So it's a bad idea to try to open files
 * without consulting fd.c.  Nonetheless we cannot control all code.)
 *
 * Because this is just a fixed setting, we are effectively assuming that
 * no such code will leave FDs open over the long term; otherwise the slop
 * is likely to be insufficient.  Note in particular that we expect that
 * loading a shared library does not result in any permanent increase in
 * the number of open files.  (This appears to be true on most if not
 * all platforms as of Feb 2004.)
 *
 * set_max_safe_fds() subtracts this value from the probed FD budget.
 */
#define NUM_RESERVED_FDS		10

/*
 * If we have fewer than this many usable FDs after allowing for the reserved
 * ones, choke.  (set_max_safe_fds() reports FATAL in that case.)
 */
#define FD_MINFREE				10
118 
119 
/*
 * A number of platforms allow individual processes to open many more files
 * than they can really support when *many* processes do the same thing.
 * This GUC parameter lets the DBA limit max_safe_fds to something less than
 * what the postmaster's initial probe suggests will work.
 *
 * set_max_safe_fds() also passes this value to count_usable_fds() as the
 * upper bound on how many descriptors to probe.
 */
int			max_files_per_process = 1000;

/*
 * Maximum number of file descriptors to open for either VFD entries or
 * AllocateFile/AllocateDir/OpenTransientFile operations.  This is initialized
 * to a conservative value, and remains that way indefinitely in bootstrap or
 * standalone-backend cases.  In normal postmaster operation, the postmaster
 * calls set_max_safe_fds() late in initialization to update the value, and
 * that value is then inherited by forked subprocesses.
 *
 * Note: the value of max_files_per_process is taken into account while
 * setting this variable, and so need not be tested separately.
 */
int			max_safe_fds = 32;	/* default if not changed */

/*
 * Whether it is safe to continue running after fsync() fails.
 * Defaults to false.
 */
bool		data_sync_retry = false;
143 
/* Debugging.... */

/*
 * DO_DB(A) runs the statement A only in FDDEBUG builds.  errno is saved
 * before A and restored afterwards, so debug output cannot disturb the
 * errno value seen by surrounding code.  Without FDDEBUG it is a no-op and
 * A is not evaluated at all.
 */
#ifdef FDDEBUG
#define DO_DB(A) \
	do { \
		int			_do_db_save_errno = errno; \
		A; \
		errno = _do_db_save_errno; \
	} while (0)
#else
#define DO_DB(A) \
	((void) 0)
#endif

/* value of Vfd.fd when the VFD has no kernel file descriptor assigned */
#define VFD_CLOSED (-1)

/*
 * FileIsValid: true if 'file' is a usable index into VfdCache, i.e. it is
 * greater than zero (entry 0 is only the list header), within the current
 * array size, and has a non-NULL fileName.
 *
 * NB: the argument is evaluated more than once.
 */
#define FileIsValid(file) \
	((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)

/* true if the VFD currently has no kernel FD; does not range-check 'file' */
#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)

/*
 * Note: a VFD's seekPos is normally always valid, but if for some reason
 * an lseek() fails, it might become set to FileUnknownPos.  We can struggle
 * along without knowing the seek position in many cases, but in some places
 * we have to fail if we don't have it.
 *
 * FilePosIsUnknown() treats any negative position as unknown, not only
 * FileUnknownPos itself.
 */
#define FileUnknownPos ((off_t) -1)
#define FilePosIsUnknown(pos) ((pos) < 0)
173 
/* these are the assigned bits in fdstate below: */
#define FD_TEMPORARY		(1 << 0)	/* T = delete when closed */
#define FD_XACT_TEMPORARY	(1 << 1)	/* T = delete at eoXact */

/*
 * One entry of the VfdCache array.  A 'File' value is an index selecting one
 * of these.  The entry remembers the name, flags and mode used to open the
 * file (per the field comments, for reopening it later) together with its
 * logical seek position, so the kernel FD can be given up and reacquired.
 *
 * The same struct serves three roles, distinguished by which link fields are
 * meaningful: nextFree chains entries in the freelist, while lruMoreRecently
 * and lruLessRecently chain entries in the LRU ring.
 */
typedef struct vfd
{
	int			fd;				/* current FD, or VFD_CLOSED if none */
	unsigned short fdstate;		/* bitflags for VFD's state */
	ResourceOwner resowner;		/* owner, for automatic cleanup */
	File		nextFree;		/* link to next free VFD, if in freelist */
	File		lruMoreRecently;	/* doubly linked recency-of-use list */
	File		lruLessRecently;
	off_t		seekPos;		/* current logical file position, or -1 */
	off_t		fileSize;		/* current size of file (0 if not temporary) */
	char	   *fileName;		/* name of file, or NULL for unused VFD */
	/* NB: fileName is malloc'd, and must be free'd when closing the VFD */
	int			fileFlags;		/* open(2) flags for (re)opening the file */
	int			fileMode;		/* mode to pass to open(2) */
} Vfd;
193 
/*
 * Virtual File Descriptor array pointer and size.  This grows as
 * needed.  'File' values are indexes into this array.
 * Note that VfdCache[0] is not a usable VFD, just a list header.
 *
 * SizeVfdCache stays 0 until InitFileAccess() has allocated the header
 * entry; InitFileAccess() asserts on that to detect a second call.
 */
static Vfd *VfdCache;
static Size SizeVfdCache = 0;

/*
 * Number of file descriptors known to be in use by VFD entries.
 */
static int	nfile = 0;

/*
 * Flag to tell whether it's worth scanning VfdCache looking for temp files
 * to close
 */
static bool have_xact_temporary_files = false;

/*
 * Tracks the total size of all temporary files.  Note: when temp_file_limit
 * is being enforced, this cannot overflow since the limit cannot be more
 * than INT_MAX kilobytes.  When not enforcing, it could theoretically
 * overflow, but we don't care.
 */
static uint64 temporary_files_size = 0;
220 
/*
 * List of OS handles opened with AllocateFile, AllocateDir and
 * OpenTransientFile.
 *
 * AllocateDescKind says which member of the 'desc' union is in use.
 * NOTE(review): AllocateDescPipe presumably corresponds to OpenPipeStream and
 * shares the FILE * member with AllocateDescFile -- confirm against the
 * routines that fill in these entries.
 */
typedef enum
{
	AllocateDescFile,
	AllocateDescPipe,
	AllocateDescDir,
	AllocateDescRawFD
} AllocateDescKind;

typedef struct
{
	AllocateDescKind kind;		/* discriminator for 'desc' */
	SubTransactionId create_subid;	/* subtransaction recorded for the handle */
	union
	{
		FILE	   *file;
		DIR		   *dir;
		int			fd;
	}			desc;
} AllocateDesc;

/* the array itself, with its used-entry count and its allocated capacity */
static int	numAllocatedDescs = 0;
static int	maxAllocatedDescs = 0;
static AllocateDesc *allocatedDescs = NULL;
248 
/*
 * Number of temporary files opened during the current session;
 * this is used in generation of tempfile names.
 */
static long tempFileCounter = 0;

/*
 * Array of OIDs of temp tablespaces.  (Some entries may be InvalidOid,
 * indicating that the current database's default tablespace should be used.)
 * When numTempTableSpaces is -1, this has not been set in the current
 * transaction.
 *
 * NOTE(review): nextTempTableSpace looks like a cursor into tempTableSpaces
 * selecting the next tablespace to use -- confirm against its users.
 */
static Oid *tempTableSpaces = NULL;
static int	numTempTableSpaces = -1;
static int	nextTempTableSpace = 0;
264 
265 
266 /*--------------------
267  *
268  * Private Routines
269  *
270  * Delete		   - delete a file from the Lru ring
271  * LruDelete	   - remove a file from the Lru ring and close its FD
272  * Insert		   - put a file at the front of the Lru ring
273  * LruInsert	   - put a file at the front of the Lru ring and open it
274  * ReleaseLruFile  - Release an fd by closing the last entry in the Lru ring
275  * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
276  * AllocateVfd	   - grab a free (or new) file record (from VfdCache)
277  * FreeVfd		   - free a file record
278  *
279  * The Least Recently Used ring is a doubly linked list that begins and
280  * ends on element zero.  Element zero is special -- it doesn't represent
281  * a file and its "fd" field always == VFD_CLOSED.  Element zero is just an
282  * anchor that shows us the beginning/end of the ring.
283  * Only VFD elements that are currently really open (have an FD assigned) are
284  * in the Lru ring.  Elements that are "virtually" open can be recognized
285  * by having a non-null fileName field.
286  *
287  * example:
288  *
289  *	   /--less----\				   /---------\
290  *	   v		   \			  v			  \
291  *	 #0 --more---> LeastRecentlyUsed --more-\ \
292  *	  ^\									| |
293  *	   \\less--> MostRecentlyUsedFile	<---/ |
294  *		\more---/					 \--less--/
295  *
296  *--------------------
297  */
/* LRU ring and VFD slot management */
static void Delete(File file);
static void LruDelete(File file);
static void Insert(File file);
static int	LruInsert(File file);
static bool ReleaseLruFile(void);
static void ReleaseLruFiles(void);
static File AllocateVfd(void);
static void FreeVfd(File file);

/* VFD access, temporary files, and the allocatedDescs array */
static int	FileAccess(File file);
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
static bool reserveAllocatedDesc(void);
static int	FreeDesc(AllocateDesc *desc);

/* cleanup at process exit and removal of leftover temporary files */
static void AtProcExit_Files(int code, Datum arg);
static void CleanupTempFiles(bool isProcExit);
static void RemovePgTempFilesInDir(const char *tmpdirname);
static void RemovePgTempRelationFiles(const char *tsdirname);
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
static bool looks_like_temp_rel_name(const char *name);

/* directory walking, with per-entry callbacks of the 'action' signature */
static void walkdir(const char *path,
		void (*action) (const char *fname, bool isdir, int elevel),
		bool process_symlinks,
		int elevel);
#ifdef PG_FLUSH_DATA_WORKS
/* only declared when pg_flush_data() has a working implementation */
static void pre_sync_fname(const char *fname, bool isdir, int elevel);
#endif
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);

/* fsync helpers that report at a caller-supplied elevel and return a status */
static int	fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
static int	fsync_parent_path(const char *fname, int elevel);
330 
331 
/*
 * pg_fsync --- do fsync with or without writethrough
 *
 * Hands the descriptor to pg_fsync_writethrough() when sync_method selects
 * SYNC_METHOD_FSYNC_WRITETHROUGH, and to pg_fsync_no_writethrough() in every
 * other case.  The return value is whatever the chosen routine returns.
 */
int
pg_fsync(int fd)
{
	/*
	 * The sync_method test is compiled only where writethrough exists and is
	 * a distinct operation from plain fsync.
	 */
#if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
	if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
		return pg_fsync_writethrough(fd);
#endif

	return pg_fsync_no_writethrough(fd);
}
346 
347 
348 /*
349  * pg_fsync_no_writethrough --- same as fsync except does nothing if
350  *	enableFsync is off
351  */
352 int
pg_fsync_no_writethrough(int fd)353 pg_fsync_no_writethrough(int fd)
354 {
355 	if (enableFsync)
356 		return fsync(fd);
357 	else
358 		return 0;
359 }
360 
361 /*
362  * pg_fsync_writethrough
363  */
364 int
pg_fsync_writethrough(int fd)365 pg_fsync_writethrough(int fd)
366 {
367 	if (enableFsync)
368 	{
369 #ifdef WIN32
370 		return _commit(fd);
371 #elif defined(F_FULLFSYNC)
372 		return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
373 #else
374 		errno = ENOSYS;
375 		return -1;
376 #endif
377 	}
378 	else
379 		return 0;
380 }
381 
382 /*
383  * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
384  *
385  * Not all platforms have fdatasync; treat as fsync if not available.
386  */
387 int
pg_fdatasync(int fd)388 pg_fdatasync(int fd)
389 {
390 	if (enableFsync)
391 	{
392 #ifdef HAVE_FDATASYNC
393 		return fdatasync(fd);
394 #else
395 		return fsync(fd);
396 #endif
397 	}
398 	else
399 		return 0;
400 }
401 
402 /*
403  * pg_flush_data --- advise OS that the described dirty data should be flushed
404  *
405  * offset of 0 with nbytes 0 means that the entire file should be flushed;
406  * in this case, this function may have side-effects on the file's
407  * seek position!
408  */
409 void
pg_flush_data(int fd,off_t offset,off_t nbytes)410 pg_flush_data(int fd, off_t offset, off_t nbytes)
411 {
412 	/*
413 	 * Right now file flushing is primarily used to avoid making later
414 	 * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
415 	 * if fsyncs are disabled - that's a decision we might want to make
416 	 * configurable at some point.
417 	 */
418 	if (!enableFsync)
419 		return;
420 
421 	/*
422 	 * We compile all alternatives that are supported on the current platform,
423 	 * to find portability problems more easily.
424 	 */
425 #if defined(HAVE_SYNC_FILE_RANGE)
426 	{
427 		int			rc;
428 		static bool not_implemented_by_kernel = false;
429 
430 		if (not_implemented_by_kernel)
431 			return;
432 
433 		/*
434 		 * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
435 		 * tells the OS that writeback for the specified blocks should be
436 		 * started, but that we don't want to wait for completion.  Note that
437 		 * this call might block if too much dirty data exists in the range.
438 		 * This is the preferable method on OSs supporting it, as it works
439 		 * reliably when available (contrast to msync()) and doesn't flush out
440 		 * clean data (like FADV_DONTNEED).
441 		 */
442 		rc = sync_file_range(fd, offset, nbytes,
443 							 SYNC_FILE_RANGE_WRITE);
444 		if (rc != 0)
445 		{
446 			int			elevel;
447 
448 			/*
449 			 * For systems that don't have an implementation of
450 			 * sync_file_range() such as Windows WSL, generate only one
451 			 * warning and then suppress all further attempts by this process.
452 			 */
453 			if (errno == ENOSYS)
454 			{
455 				elevel = WARNING;
456 				not_implemented_by_kernel = true;
457 			}
458 			else
459 				elevel = data_sync_elevel(WARNING);
460 
461 			ereport(elevel,
462 					(errcode_for_file_access(),
463 					 errmsg("could not flush dirty data: %m")));
464 		}
465 
466 		return;
467 	}
468 #endif
469 #if !defined(WIN32) && defined(MS_ASYNC)
470 	{
471 		void	   *p;
472 		static int	pagesize = 0;
473 
474 		/*
475 		 * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
476 		 * writeback. On linux it only does so if MS_SYNC is specified, but
477 		 * then it does the writeback synchronously. Luckily all common linux
478 		 * systems have sync_file_range().  This is preferable over
479 		 * FADV_DONTNEED because it doesn't flush out clean data.
480 		 *
481 		 * We map the file (mmap()), tell the kernel to sync back the contents
482 		 * (msync()), and then remove the mapping again (munmap()).
483 		 */
484 
485 		/* mmap() needs actual length if we want to map whole file */
486 		if (offset == 0 && nbytes == 0)
487 		{
488 			nbytes = lseek(fd, 0, SEEK_END);
489 			if (nbytes < 0)
490 			{
491 				ereport(WARNING,
492 						(errcode_for_file_access(),
493 						 errmsg("could not determine dirty data size: %m")));
494 				return;
495 			}
496 		}
497 
498 		/*
499 		 * Some platforms reject partial-page mmap() attempts.  To deal with
500 		 * that, just truncate the request to a page boundary.  If any extra
501 		 * bytes don't get flushed, well, it's only a hint anyway.
502 		 */
503 
504 		/* fetch pagesize only once */
505 		if (pagesize == 0)
506 			pagesize = sysconf(_SC_PAGESIZE);
507 
508 		/* align length to pagesize, dropping any fractional page */
509 		if (pagesize > 0)
510 			nbytes = (nbytes / pagesize) * pagesize;
511 
512 		/* fractional-page request is a no-op */
513 		if (nbytes <= 0)
514 			return;
515 
516 		/*
517 		 * mmap could well fail, particularly on 32-bit platforms where there
518 		 * may simply not be enough address space.  If so, silently fall
519 		 * through to the next implementation.
520 		 */
521 		if (nbytes <= (off_t) SSIZE_MAX)
522 			p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
523 		else
524 			p = MAP_FAILED;
525 
526 		if (p != MAP_FAILED)
527 		{
528 			int			rc;
529 
530 			rc = msync(p, (size_t) nbytes, MS_ASYNC);
531 			if (rc != 0)
532 			{
533 				ereport(data_sync_elevel(WARNING),
534 						(errcode_for_file_access(),
535 						 errmsg("could not flush dirty data: %m")));
536 				/* NB: need to fall through to munmap()! */
537 			}
538 
539 			rc = munmap(p, (size_t) nbytes);
540 			if (rc != 0)
541 			{
542 				/* FATAL error because mapping would remain */
543 				ereport(FATAL,
544 						(errcode_for_file_access(),
545 						 errmsg("could not munmap() while flushing data: %m")));
546 			}
547 
548 			return;
549 		}
550 	}
551 #endif
552 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
553 	{
554 		int			rc;
555 
556 		/*
557 		 * Signal the kernel that the passed in range should not be cached
558 		 * anymore. This has the, desired, side effect of writing out dirty
559 		 * data, and the, undesired, side effect of likely discarding useful
560 		 * clean cached blocks.  For the latter reason this is the least
561 		 * preferable method.
562 		 */
563 
564 		rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
565 
566 		if (rc != 0)
567 		{
568 			/* don't error out, this is just a performance optimization */
569 			ereport(WARNING,
570 					(errcode_for_file_access(),
571 					 errmsg("could not flush dirty data: %m")));
572 		}
573 
574 		return;
575 	}
576 #endif
577 }
578 
579 
580 /*
581  * fsync_fname -- fsync a file or directory, handling errors properly
582  *
583  * Try to fsync a file or directory. When doing the latter, ignore errors that
584  * indicate the OS just doesn't allow/require fsyncing directories.
585  */
586 void
fsync_fname(const char * fname,bool isdir)587 fsync_fname(const char *fname, bool isdir)
588 {
589 	fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
590 }
591 
592 /*
593  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
594  *
595  * This routine ensures that, after returning, the effect of renaming file
596  * persists in case of a crash. A crash while this routine is running will
597  * leave you with either the pre-existing or the moved file in place of the
598  * new file; no mixed state or truncated files are possible.
599  *
600  * It does so by using fsync on the old filename and the possibly existing
601  * target filename before the rename, and the target file and directory after.
602  *
603  * Note that rename() cannot be used across arbitrary directories, as they
604  * might not be on the same filesystem. Therefore this routine does not
605  * support renaming across directories.
606  *
607  * Log errors with the caller specified severity.
608  *
609  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
610  * valid upon return.
611  */
612 int
durable_rename(const char * oldfile,const char * newfile,int elevel)613 durable_rename(const char *oldfile, const char *newfile, int elevel)
614 {
615 	int			fd;
616 
617 	/*
618 	 * First fsync the old and target path (if it exists), to ensure that they
619 	 * are properly persistent on disk. Syncing the target file is not
620 	 * strictly necessary, but it makes it easier to reason about crashes;
621 	 * because it's then guaranteed that either source or target file exists
622 	 * after a crash.
623 	 */
624 	if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
625 		return -1;
626 
627 	fd = OpenTransientFile((char *) newfile, PG_BINARY | O_RDWR, 0);
628 	if (fd < 0)
629 	{
630 		if (errno != ENOENT)
631 		{
632 			ereport(elevel,
633 					(errcode_for_file_access(),
634 					 errmsg("could not open file \"%s\": %m", newfile)));
635 			return -1;
636 		}
637 	}
638 	else
639 	{
640 		if (pg_fsync(fd) != 0)
641 		{
642 			int			save_errno;
643 
644 			/* close file upon error, might not be in transaction context */
645 			save_errno = errno;
646 			CloseTransientFile(fd);
647 			errno = save_errno;
648 
649 			ereport(elevel,
650 					(errcode_for_file_access(),
651 					 errmsg("could not fsync file \"%s\": %m", newfile)));
652 			return -1;
653 		}
654 		CloseTransientFile(fd);
655 	}
656 
657 	/* Time to do the real deal... */
658 	if (rename(oldfile, newfile) < 0)
659 	{
660 		ereport(elevel,
661 				(errcode_for_file_access(),
662 				 errmsg("could not rename file \"%s\" to \"%s\": %m",
663 						oldfile, newfile)));
664 		return -1;
665 	}
666 
667 	/*
668 	 * To guarantee renaming the file is persistent, fsync the file with its
669 	 * new name, and its containing directory.
670 	 */
671 	if (fsync_fname_ext(newfile, false, false, elevel) != 0)
672 		return -1;
673 
674 	if (fsync_parent_path(newfile, elevel) != 0)
675 		return -1;
676 
677 	return 0;
678 }
679 
/*
 * durable_unlink -- remove a file in a durable manner
 *
 * Unlinks fname and then fsyncs its parent directory, so that the removal
 * persists in case of a crash.  A crash while this routine is running will
 * leave the system in no mixed state.
 *
 * Errors are logged with the severity given in elevel.
 *
 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
 * valid upon return.
 */
int
durable_unlink(const char *fname, int elevel)
{
	int			rc = unlink(fname);

	if (rc < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove file \"%s\": %m",
						fname)));
		return -1;
	}

	/* the removal only becomes persistent once the directory is synced */
	return (fsync_parent_path(fname, elevel) != 0) ? -1 : 0;
}
716 
/*
 * durable_link_or_rename -- rename a file in a durable manner.
 *
 * Like durable_rename(), but tries (without guaranteeing it) not to
 * overwrite the target file: where HAVE_WORKING_LINK is set, the new name is
 * created with link() and the old name is then unlinked; elsewhere a plain
 * rename() is used.
 *
 * Note that a crash in an unfortunate moment can leave you with two links to
 * the target file.
 *
 * Errors are logged with the caller specified severity, elevel.
 *
 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
 * valid upon return.
 */
int
durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
{
	/*
	 * Sync the source first so that, should we crash right after the
	 * rename/link, a file with valid contents is what ends up in place.
	 */
	if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
		return -1;

#if HAVE_WORKING_LINK
	if (link(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not link file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}
	/* result deliberately not checked */
	unlink(oldfile);
#else
	/* XXX: Add racy file existence check? */
	if (rename(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}
#endif

	/*
	 * Make the change persistent in case of an OS crash: flush the new entry
	 * first, then its parent directory.  The second sync is attempted only
	 * if the first one succeeded.
	 */
	if (fsync_fname_ext(newfile, false, false, elevel) != 0 ||
		fsync_parent_path(newfile, elevel) != 0)
		return -1;

	return 0;
}
776 
777 /*
778  * InitFileAccess --- initialize this module during backend startup
779  *
780  * This is called during either normal or standalone backend start.
781  * It is *not* called in the postmaster.
782  */
783 void
InitFileAccess(void)784 InitFileAccess(void)
785 {
786 	Assert(SizeVfdCache == 0);	/* call me only once */
787 
788 	/* initialize cache header entry */
789 	VfdCache = (Vfd *) malloc(sizeof(Vfd));
790 	if (VfdCache == NULL)
791 		ereport(FATAL,
792 				(errcode(ERRCODE_OUT_OF_MEMORY),
793 				 errmsg("out of memory")));
794 
795 	MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
796 	VfdCache->fd = VFD_CLOSED;
797 
798 	SizeVfdCache = 1;
799 
800 	/* register proc-exit hook to ensure temp files are dropped at exit */
801 	on_proc_exit(AtProcExit_Files, 0);
802 }
803 
804 /*
805  * count_usable_fds --- count how many FDs the system will let us open,
806  *		and estimate how many are already open.
807  *
808  * We stop counting if usable_fds reaches max_to_probe.  Note: a small
809  * value of max_to_probe might result in an underestimate of already_open;
810  * we must fill in any "gaps" in the set of used FDs before the calculation
811  * of already_open will give the right answer.  In practice, max_to_probe
812  * of a couple of dozen should be enough to ensure good results.
813  *
814  * We assume stdin (FD 0) is available for dup'ing
815  */
816 static void
count_usable_fds(int max_to_probe,int * usable_fds,int * already_open)817 count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
818 {
819 	int		   *fd;
820 	int			size;
821 	int			used = 0;
822 	int			highestfd = 0;
823 	int			j;
824 
825 #ifdef HAVE_GETRLIMIT
826 	struct rlimit rlim;
827 	int			getrlimit_status;
828 #endif
829 
830 	size = 1024;
831 	fd = (int *) palloc(size * sizeof(int));
832 
833 #ifdef HAVE_GETRLIMIT
834 #ifdef RLIMIT_NOFILE			/* most platforms use RLIMIT_NOFILE */
835 	getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
836 #else							/* but BSD doesn't ... */
837 	getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
838 #endif							/* RLIMIT_NOFILE */
839 	if (getrlimit_status != 0)
840 		ereport(WARNING, (errmsg("getrlimit failed: %m")));
841 #endif							/* HAVE_GETRLIMIT */
842 
843 	/* dup until failure or probe limit reached */
844 	for (;;)
845 	{
846 		int			thisfd;
847 
848 #ifdef HAVE_GETRLIMIT
849 
850 		/*
851 		 * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
852 		 * some platforms
853 		 */
854 		if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
855 			break;
856 #endif
857 
858 		thisfd = dup(0);
859 		if (thisfd < 0)
860 		{
861 			/* Expect EMFILE or ENFILE, else it's fishy */
862 			if (errno != EMFILE && errno != ENFILE)
863 				elog(WARNING, "dup(0) failed after %d successes: %m", used);
864 			break;
865 		}
866 
867 		if (used >= size)
868 		{
869 			size *= 2;
870 			fd = (int *) repalloc(fd, size * sizeof(int));
871 		}
872 		fd[used++] = thisfd;
873 
874 		if (highestfd < thisfd)
875 			highestfd = thisfd;
876 
877 		if (used >= max_to_probe)
878 			break;
879 	}
880 
881 	/* release the files we opened */
882 	for (j = 0; j < used; j++)
883 		close(fd[j]);
884 
885 	pfree(fd);
886 
887 	/*
888 	 * Return results.  usable_fds is just the number of successful dups. We
889 	 * assume that the system limit is highestfd+1 (remember 0 is a legal FD
890 	 * number) and so already_open is highestfd+1 - usable_fds.
891 	 */
892 	*usable_fds = used;
893 	*already_open = highestfd + 1 - used;
894 }
895 
896 /*
897  * set_max_safe_fds
898  *		Determine number of filedescriptors that fd.c is allowed to use
899  */
900 void
set_max_safe_fds(void)901 set_max_safe_fds(void)
902 {
903 	int			usable_fds;
904 	int			already_open;
905 
906 	/*----------
907 	 * We want to set max_safe_fds to
908 	 *			MIN(usable_fds, max_files_per_process - already_open)
909 	 * less the slop factor for files that are opened without consulting
910 	 * fd.c.  This ensures that we won't exceed either max_files_per_process
911 	 * or the experimentally-determined EMFILE limit.
912 	 *----------
913 	 */
914 	count_usable_fds(max_files_per_process,
915 					 &usable_fds, &already_open);
916 
917 	max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
918 
919 	/*
920 	 * Take off the FDs reserved for system() etc.
921 	 */
922 	max_safe_fds -= NUM_RESERVED_FDS;
923 
924 	/*
925 	 * Make sure we still have enough to get by.
926 	 */
927 	if (max_safe_fds < FD_MINFREE)
928 		ereport(FATAL,
929 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
930 				 errmsg("insufficient file descriptors available to start server process"),
931 				 errdetail("System allows %d, we need at least %d.",
932 						   max_safe_fds + NUM_RESERVED_FDS,
933 						   FD_MINFREE + NUM_RESERVED_FDS)));
934 
935 	elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
936 		 max_safe_fds, usable_fds, already_open);
937 }
938 
939 /*
940  * BasicOpenFile --- same as open(2) except can free other FDs if needed
941  *
942  * This is exported for use by places that really want a plain kernel FD,
943  * but need to be proof against running out of FDs.  Once an FD has been
944  * successfully returned, it is the caller's responsibility to ensure that
945  * it will not be leaked on ereport()!	Most users should *not* call this
946  * routine directly, but instead use the VFD abstraction level, which
947  * provides protection against descriptor leaks as well as management of
948  * files that need to be open for more than a short period of time.
949  *
950  * Ideally this should be the *only* direct call of open() in the backend.
951  * In practice, the postmaster calls open() directly, and there are some
952  * direct open() calls done early in backend startup.  Those are OK since
953  * this module wouldn't have any open files to close at that point anyway.
954  */
955 int
BasicOpenFile(FileName fileName,int fileFlags,int fileMode)956 BasicOpenFile(FileName fileName, int fileFlags, int fileMode)
957 {
958 	int			fd;
959 
960 tryAgain:
961 	fd = open(fileName, fileFlags, fileMode);
962 
963 	if (fd >= 0)
964 		return fd;				/* success! */
965 
966 	if (errno == EMFILE || errno == ENFILE)
967 	{
968 		int			save_errno = errno;
969 
970 		ereport(LOG,
971 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
972 				 errmsg("out of file descriptors: %m; release and retry")));
973 		errno = 0;
974 		if (ReleaseLruFile())
975 			goto tryAgain;
976 		errno = save_errno;
977 	}
978 
979 	return -1;					/* failure */
980 }
981 
982 #if defined(FDDEBUG)
983 
984 static void
_dump_lru(void)985 _dump_lru(void)
986 {
987 	int			mru = VfdCache[0].lruLessRecently;
988 	Vfd		   *vfdP = &VfdCache[mru];
989 	char		buf[2048];
990 
991 	snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
992 	while (mru != 0)
993 	{
994 		mru = vfdP->lruLessRecently;
995 		vfdP = &VfdCache[mru];
996 		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
997 	}
998 	snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
999 	elog(LOG, "%s", buf);
1000 }
1001 #endif							/* FDDEBUG */
1002 
1003 static void
Delete(File file)1004 Delete(File file)
1005 {
1006 	Vfd		   *vfdP;
1007 
1008 	Assert(file != 0);
1009 
1010 	DO_DB(elog(LOG, "Delete %d (%s)",
1011 			   file, VfdCache[file].fileName));
1012 	DO_DB(_dump_lru());
1013 
1014 	vfdP = &VfdCache[file];
1015 
1016 	VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1017 	VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1018 
1019 	DO_DB(_dump_lru());
1020 }
1021 
1022 static void
LruDelete(File file)1023 LruDelete(File file)
1024 {
1025 	Vfd		   *vfdP;
1026 
1027 	Assert(file != 0);
1028 
1029 	DO_DB(elog(LOG, "LruDelete %d (%s)",
1030 			   file, VfdCache[file].fileName));
1031 
1032 	vfdP = &VfdCache[file];
1033 
1034 	/*
1035 	 * Normally we should know the seek position, but if for some reason we
1036 	 * have lost track of it, try again to get it.  If we still can't get it,
1037 	 * we have a problem: we will be unable to restore the file seek position
1038 	 * when and if the file is re-opened.  But we can't really throw an error
1039 	 * and refuse to close the file, or activities such as transaction cleanup
1040 	 * will be broken.
1041 	 */
1042 	if (FilePosIsUnknown(vfdP->seekPos))
1043 	{
1044 		vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
1045 		if (FilePosIsUnknown(vfdP->seekPos))
1046 			elog(LOG, "could not seek file \"%s\" before closing: %m",
1047 				 vfdP->fileName);
1048 	}
1049 
1050 	/*
1051 	 * Close the file.  We aren't expecting this to fail; if it does, better
1052 	 * to leak the FD than to mess up our internal state.
1053 	 */
1054 	if (close(vfdP->fd))
1055 		elog(vfdP->fdstate & FD_TEMPORARY ? LOG : data_sync_elevel(LOG),
1056 			 "could not close file \"%s\": %m", vfdP->fileName);
1057 	vfdP->fd = VFD_CLOSED;
1058 	--nfile;
1059 
1060 	/* delete the vfd record from the LRU ring */
1061 	Delete(file);
1062 }
1063 
1064 static void
Insert(File file)1065 Insert(File file)
1066 {
1067 	Vfd		   *vfdP;
1068 
1069 	Assert(file != 0);
1070 
1071 	DO_DB(elog(LOG, "Insert %d (%s)",
1072 			   file, VfdCache[file].fileName));
1073 	DO_DB(_dump_lru());
1074 
1075 	vfdP = &VfdCache[file];
1076 
1077 	vfdP->lruMoreRecently = 0;
1078 	vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1079 	VfdCache[0].lruLessRecently = file;
1080 	VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1081 
1082 	DO_DB(_dump_lru());
1083 }
1084 
1085 /* returns 0 on success, -1 on re-open failure (with errno set) */
1086 static int
LruInsert(File file)1087 LruInsert(File file)
1088 {
1089 	Vfd		   *vfdP;
1090 
1091 	Assert(file != 0);
1092 
1093 	DO_DB(elog(LOG, "LruInsert %d (%s)",
1094 			   file, VfdCache[file].fileName));
1095 
1096 	vfdP = &VfdCache[file];
1097 
1098 	if (FileIsNotOpen(file))
1099 	{
1100 		/* Close excess kernel FDs. */
1101 		ReleaseLruFiles();
1102 
1103 		/*
1104 		 * The open could still fail for lack of file descriptors, eg due to
1105 		 * overall system file table being full.  So, be prepared to release
1106 		 * another FD if necessary...
1107 		 */
1108 		vfdP->fd = BasicOpenFile(vfdP->fileName, vfdP->fileFlags,
1109 								 vfdP->fileMode);
1110 		if (vfdP->fd < 0)
1111 		{
1112 			DO_DB(elog(LOG, "re-open failed: %m"));
1113 			return -1;
1114 		}
1115 		else
1116 		{
1117 			++nfile;
1118 		}
1119 
1120 		/*
1121 		 * Seek to the right position.  We need no special case for seekPos
1122 		 * equal to FileUnknownPos, as lseek() will certainly reject that
1123 		 * (thus completing the logic noted in LruDelete() that we will fail
1124 		 * to re-open a file if we couldn't get its seek position before
1125 		 * closing).
1126 		 */
1127 		if (vfdP->seekPos != (off_t) 0)
1128 		{
1129 			if (lseek(vfdP->fd, vfdP->seekPos, SEEK_SET) < 0)
1130 			{
1131 				/*
1132 				 * If we fail to restore the seek position, treat it like an
1133 				 * open() failure.
1134 				 */
1135 				int			save_errno = errno;
1136 
1137 				elog(LOG, "could not seek file \"%s\" after re-opening: %m",
1138 					 vfdP->fileName);
1139 				(void) close(vfdP->fd);
1140 				vfdP->fd = VFD_CLOSED;
1141 				--nfile;
1142 				errno = save_errno;
1143 				return -1;
1144 			}
1145 		}
1146 	}
1147 
1148 	/*
1149 	 * put it at the head of the Lru ring
1150 	 */
1151 
1152 	Insert(file);
1153 
1154 	return 0;
1155 }
1156 
1157 /*
1158  * Release one kernel FD by closing the least-recently-used VFD.
1159  */
1160 static bool
ReleaseLruFile(void)1161 ReleaseLruFile(void)
1162 {
1163 	DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1164 
1165 	if (nfile > 0)
1166 	{
1167 		/*
1168 		 * There are opened files and so there should be at least one used vfd
1169 		 * in the ring.
1170 		 */
1171 		Assert(VfdCache[0].lruMoreRecently != 0);
1172 		LruDelete(VfdCache[0].lruMoreRecently);
1173 		return true;			/* freed a file */
1174 	}
1175 	return false;				/* no files available to free */
1176 }
1177 
1178 /*
1179  * Release kernel FDs as needed to get under the max_safe_fds limit.
1180  * After calling this, it's OK to try to open another file.
1181  */
1182 static void
ReleaseLruFiles(void)1183 ReleaseLruFiles(void)
1184 {
1185 	while (nfile + numAllocatedDescs >= max_safe_fds)
1186 	{
1187 		if (!ReleaseLruFile())
1188 			break;
1189 	}
1190 }
1191 
1192 static File
AllocateVfd(void)1193 AllocateVfd(void)
1194 {
1195 	Index		i;
1196 	File		file;
1197 
1198 	DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1199 
1200 	Assert(SizeVfdCache > 0);	/* InitFileAccess not called? */
1201 
1202 	if (VfdCache[0].nextFree == 0)
1203 	{
1204 		/*
1205 		 * The free list is empty so it is time to increase the size of the
1206 		 * array.  We choose to double it each time this happens. However,
1207 		 * there's not much point in starting *real* small.
1208 		 */
1209 		Size		newCacheSize = SizeVfdCache * 2;
1210 		Vfd		   *newVfdCache;
1211 
1212 		if (newCacheSize < 32)
1213 			newCacheSize = 32;
1214 
1215 		/*
1216 		 * Be careful not to clobber VfdCache ptr if realloc fails.
1217 		 */
1218 		newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1219 		if (newVfdCache == NULL)
1220 			ereport(ERROR,
1221 					(errcode(ERRCODE_OUT_OF_MEMORY),
1222 					 errmsg("out of memory")));
1223 		VfdCache = newVfdCache;
1224 
1225 		/*
1226 		 * Initialize the new entries and link them into the free list.
1227 		 */
1228 		for (i = SizeVfdCache; i < newCacheSize; i++)
1229 		{
1230 			MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1231 			VfdCache[i].nextFree = i + 1;
1232 			VfdCache[i].fd = VFD_CLOSED;
1233 		}
1234 		VfdCache[newCacheSize - 1].nextFree = 0;
1235 		VfdCache[0].nextFree = SizeVfdCache;
1236 
1237 		/*
1238 		 * Record the new size
1239 		 */
1240 		SizeVfdCache = newCacheSize;
1241 	}
1242 
1243 	file = VfdCache[0].nextFree;
1244 
1245 	VfdCache[0].nextFree = VfdCache[file].nextFree;
1246 
1247 	return file;
1248 }
1249 
1250 static void
FreeVfd(File file)1251 FreeVfd(File file)
1252 {
1253 	Vfd		   *vfdP = &VfdCache[file];
1254 
1255 	DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1256 			   file, vfdP->fileName ? vfdP->fileName : ""));
1257 
1258 	if (vfdP->fileName != NULL)
1259 	{
1260 		free(vfdP->fileName);
1261 		vfdP->fileName = NULL;
1262 	}
1263 	vfdP->fdstate = 0x0;
1264 
1265 	vfdP->nextFree = VfdCache[0].nextFree;
1266 	VfdCache[0].nextFree = file;
1267 }
1268 
1269 /* returns 0 on success, -1 on re-open failure (with errno set) */
1270 static int
FileAccess(File file)1271 FileAccess(File file)
1272 {
1273 	int			returnValue;
1274 
1275 	DO_DB(elog(LOG, "FileAccess %d (%s)",
1276 			   file, VfdCache[file].fileName));
1277 
1278 	/*
1279 	 * Is the file open?  If not, open it and put it at the head of the LRU
1280 	 * ring (possibly closing the least recently used file to get an FD).
1281 	 */
1282 
1283 	if (FileIsNotOpen(file))
1284 	{
1285 		returnValue = LruInsert(file);
1286 		if (returnValue != 0)
1287 			return returnValue;
1288 	}
1289 	else if (VfdCache[0].lruLessRecently != file)
1290 	{
1291 		/*
1292 		 * We now know that the file is open and that it is not the last one
1293 		 * accessed, so we need to move it to the head of the Lru ring.
1294 		 */
1295 
1296 		Delete(file);
1297 		Insert(file);
1298 	}
1299 
1300 	return 0;
1301 }
1302 
/*
 * FileInvalidate --- called when we get a shared invalidation message on
 * some relation.
 *
 * Closes the file's kernel FD if it currently has one; the VFD itself
 * remains allocated.  Compiled only when NOT_USED is defined.
 */
#ifdef NOT_USED
void
FileInvalidate(File file)
{
	Assert(FileIsValid(file));

	if (FileIsNotOpen(file))
		return;

	LruDelete(file);
}
#endif
1315 
1316 /*
1317  * open a file in an arbitrary directory
1318  *
1319  * NB: if the passed pathname is relative (which it usually is),
1320  * it will be interpreted relative to the process' working directory
1321  * (which should always be $PGDATA when this code is running).
1322  */
1323 File
PathNameOpenFile(FileName fileName,int fileFlags,int fileMode)1324 PathNameOpenFile(FileName fileName, int fileFlags, int fileMode)
1325 {
1326 	char	   *fnamecopy;
1327 	File		file;
1328 	Vfd		   *vfdP;
1329 
1330 	DO_DB(elog(LOG, "PathNameOpenFile: %s %x %o",
1331 			   fileName, fileFlags, fileMode));
1332 
1333 	/*
1334 	 * We need a malloc'd copy of the file name; fail cleanly if no room.
1335 	 */
1336 	fnamecopy = strdup(fileName);
1337 	if (fnamecopy == NULL)
1338 		ereport(ERROR,
1339 				(errcode(ERRCODE_OUT_OF_MEMORY),
1340 				 errmsg("out of memory")));
1341 
1342 	file = AllocateVfd();
1343 	vfdP = &VfdCache[file];
1344 
1345 	/* Close excess kernel FDs. */
1346 	ReleaseLruFiles();
1347 
1348 	vfdP->fd = BasicOpenFile(fileName, fileFlags, fileMode);
1349 
1350 	if (vfdP->fd < 0)
1351 	{
1352 		int			save_errno = errno;
1353 
1354 		FreeVfd(file);
1355 		free(fnamecopy);
1356 		errno = save_errno;
1357 		return -1;
1358 	}
1359 	++nfile;
1360 	DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1361 			   vfdP->fd));
1362 
1363 	vfdP->fileName = fnamecopy;
1364 	/* Saved flags are adjusted to be OK for re-opening file */
1365 	vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1366 	vfdP->fileMode = fileMode;
1367 	vfdP->seekPos = 0;
1368 	vfdP->fileSize = 0;
1369 	vfdP->fdstate = 0x0;
1370 	vfdP->resowner = NULL;
1371 
1372 	Insert(file);
1373 
1374 	return file;
1375 }
1376 
1377 /*
1378  * Open a temporary file that will disappear when we close it.
1379  *
1380  * This routine takes care of generating an appropriate tempfile name.
1381  * There's no need to pass in fileFlags or fileMode either, since only
1382  * one setting makes any sense for a temp file.
1383  *
1384  * Unless interXact is true, the file is remembered by CurrentResourceOwner
1385  * to ensure it's closed and deleted when it's no longer needed, typically at
1386  * the end-of-transaction. In most cases, you don't want temporary files to
1387  * outlive the transaction that created them, so this should be false -- but
1388  * if you need "somewhat" temporary storage, this might be useful. In either
1389  * case, the file is removed when the File is explicitly closed.
1390  */
1391 File
OpenTemporaryFile(bool interXact)1392 OpenTemporaryFile(bool interXact)
1393 {
1394 	File		file = 0;
1395 
1396 	/*
1397 	 * Make sure the current resource owner has space for this File before we
1398 	 * open it, if we'll be registering it below.
1399 	 */
1400 	if (!interXact)
1401 		ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1402 
1403 	/*
1404 	 * If some temp tablespace(s) have been given to us, try to use the next
1405 	 * one.  If a given tablespace can't be found, we silently fall back to
1406 	 * the database's default tablespace.
1407 	 *
1408 	 * BUT: if the temp file is slated to outlive the current transaction,
1409 	 * force it into the database's default tablespace, so that it will not
1410 	 * pose a threat to possible tablespace drop attempts.
1411 	 */
1412 	if (numTempTableSpaces > 0 && !interXact)
1413 	{
1414 		Oid			tblspcOid = GetNextTempTableSpace();
1415 
1416 		if (OidIsValid(tblspcOid))
1417 			file = OpenTemporaryFileInTablespace(tblspcOid, false);
1418 	}
1419 
1420 	/*
1421 	 * If not, or if tablespace is bad, create in database's default
1422 	 * tablespace.  MyDatabaseTableSpace should normally be set before we get
1423 	 * here, but just in case it isn't, fall back to pg_default tablespace.
1424 	 */
1425 	if (file <= 0)
1426 		file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
1427 											 MyDatabaseTableSpace :
1428 											 DEFAULTTABLESPACE_OID,
1429 											 true);
1430 
1431 	/* Mark it for deletion at close */
1432 	VfdCache[file].fdstate |= FD_TEMPORARY;
1433 
1434 	/* Register it with the current resource owner */
1435 	if (!interXact)
1436 	{
1437 		VfdCache[file].fdstate |= FD_XACT_TEMPORARY;
1438 
1439 		VfdCache[file].resowner = CurrentResourceOwner;
1440 		ResourceOwnerRememberFile(CurrentResourceOwner, file);
1441 
1442 		/* ensure cleanup happens at eoxact */
1443 		have_xact_temporary_files = true;
1444 	}
1445 
1446 	return file;
1447 }
1448 
1449 /*
1450  * Open a temporary file in a specific tablespace.
1451  * Subroutine for OpenTemporaryFile, which see for details.
1452  */
1453 static File
OpenTemporaryFileInTablespace(Oid tblspcOid,bool rejectError)1454 OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1455 {
1456 	char		tempdirpath[MAXPGPATH];
1457 	char		tempfilepath[MAXPGPATH];
1458 	File		file;
1459 
1460 	/*
1461 	 * Identify the tempfile directory for this tablespace.
1462 	 *
1463 	 * If someone tries to specify pg_global, use pg_default instead.
1464 	 */
1465 	if (tblspcOid == DEFAULTTABLESPACE_OID ||
1466 		tblspcOid == GLOBALTABLESPACE_OID)
1467 	{
1468 		/* The default tablespace is {datadir}/base */
1469 		snprintf(tempdirpath, sizeof(tempdirpath), "base/%s",
1470 				 PG_TEMP_FILES_DIR);
1471 	}
1472 	else
1473 	{
1474 		/* All other tablespaces are accessed via symlinks */
1475 		snprintf(tempdirpath, sizeof(tempdirpath), "pg_tblspc/%u/%s/%s",
1476 				 tblspcOid, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
1477 	}
1478 
1479 	/*
1480 	 * Generate a tempfile name that should be unique within the current
1481 	 * database instance.
1482 	 */
1483 	snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1484 			 tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1485 
1486 	/*
1487 	 * Open the file.  Note: we don't use O_EXCL, in case there is an orphaned
1488 	 * temp file that can be reused.
1489 	 */
1490 	file = PathNameOpenFile(tempfilepath,
1491 							O_RDWR | O_CREAT | O_TRUNC | PG_BINARY,
1492 							0600);
1493 	if (file <= 0)
1494 	{
1495 		/*
1496 		 * We might need to create the tablespace's tempfile directory, if no
1497 		 * one has yet done so.
1498 		 *
1499 		 * Don't check for error from mkdir; it could fail if someone else
1500 		 * just did the same thing.  If it doesn't work then we'll bomb out on
1501 		 * the second create attempt, instead.
1502 		 */
1503 		mkdir(tempdirpath, S_IRWXU);
1504 
1505 		file = PathNameOpenFile(tempfilepath,
1506 								O_RDWR | O_CREAT | O_TRUNC | PG_BINARY,
1507 								0600);
1508 		if (file <= 0 && rejectError)
1509 			elog(ERROR, "could not create temporary file \"%s\": %m",
1510 				 tempfilepath);
1511 	}
1512 
1513 	return file;
1514 }
1515 
1516 /*
1517  * close a file when done with it
1518  */
1519 void
FileClose(File file)1520 FileClose(File file)
1521 {
1522 	Vfd		   *vfdP;
1523 
1524 	Assert(FileIsValid(file));
1525 
1526 	DO_DB(elog(LOG, "FileClose: %d (%s)",
1527 			   file, VfdCache[file].fileName));
1528 
1529 	vfdP = &VfdCache[file];
1530 
1531 	if (!FileIsNotOpen(file))
1532 	{
1533 		/* close the file */
1534 		if (close(vfdP->fd))
1535 		{
1536 			/*
1537 			 * We may need to panic on failure to close non-temporary files;
1538 			 * see LruDelete.
1539 			 */
1540 			elog(vfdP->fdstate & FD_TEMPORARY ? LOG : data_sync_elevel(LOG),
1541 				"could not close file \"%s\": %m", vfdP->fileName);
1542 		}
1543 
1544 		--nfile;
1545 		vfdP->fd = VFD_CLOSED;
1546 
1547 		/* remove the file from the lru ring */
1548 		Delete(file);
1549 	}
1550 
1551 	/*
1552 	 * Delete the file if it was temporary, and make a log entry if wanted
1553 	 */
1554 	if (vfdP->fdstate & FD_TEMPORARY)
1555 	{
1556 		struct stat filestats;
1557 		int			stat_errno;
1558 
1559 		/*
1560 		 * If we get an error, as could happen within the ereport/elog calls,
1561 		 * we'll come right back here during transaction abort.  Reset the
1562 		 * flag to ensure that we can't get into an infinite loop.  This code
1563 		 * is arranged to ensure that the worst-case consequence is failing to
1564 		 * emit log message(s), not failing to attempt the unlink.
1565 		 */
1566 		vfdP->fdstate &= ~FD_TEMPORARY;
1567 
1568 		/* Subtract its size from current usage (do first in case of error) */
1569 		temporary_files_size -= vfdP->fileSize;
1570 		vfdP->fileSize = 0;
1571 
1572 		/* first try the stat() */
1573 		if (stat(vfdP->fileName, &filestats))
1574 			stat_errno = errno;
1575 		else
1576 			stat_errno = 0;
1577 
1578 		/* in any case do the unlink */
1579 		if (unlink(vfdP->fileName))
1580 			elog(LOG, "could not unlink file \"%s\": %m", vfdP->fileName);
1581 
1582 		/* and last report the stat results */
1583 		if (stat_errno == 0)
1584 		{
1585 			pgstat_report_tempfile(filestats.st_size);
1586 
1587 			if (log_temp_files >= 0)
1588 			{
1589 				if ((filestats.st_size / 1024) >= log_temp_files)
1590 					ereport(LOG,
1591 							(errmsg("temporary file: path \"%s\", size %lu",
1592 									vfdP->fileName,
1593 									(unsigned long) filestats.st_size)));
1594 			}
1595 		}
1596 		else
1597 		{
1598 			errno = stat_errno;
1599 			elog(LOG, "could not stat file \"%s\": %m", vfdP->fileName);
1600 		}
1601 	}
1602 
1603 	/* Unregister it from the resource owner */
1604 	if (vfdP->resowner)
1605 		ResourceOwnerForgetFile(vfdP->resowner, file);
1606 
1607 	/*
1608 	 * Return the Vfd slot to the free list
1609 	 */
1610 	FreeVfd(file);
1611 }
1612 
1613 /*
1614  * FilePrefetch - initiate asynchronous read of a given range of the file.
1615  * The logical seek position is unaffected.
1616  *
1617  * Currently the only implementation of this function is using posix_fadvise
1618  * which is the simplest standardized interface that accomplishes this.
1619  * We could add an implementation using libaio in the future; but note that
1620  * this API is inappropriate for libaio, which wants to have a buffer provided
1621  * to read into.
1622  */
1623 int
FilePrefetch(File file,off_t offset,int amount,uint32 wait_event_info)1624 FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
1625 {
1626 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
1627 	int			returnCode;
1628 
1629 	Assert(FileIsValid(file));
1630 
1631 	DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
1632 			   file, VfdCache[file].fileName,
1633 			   (int64) offset, amount));
1634 
1635 	returnCode = FileAccess(file);
1636 	if (returnCode < 0)
1637 		return returnCode;
1638 
1639 	pgstat_report_wait_start(wait_event_info);
1640 	returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
1641 							   POSIX_FADV_WILLNEED);
1642 	pgstat_report_wait_end();
1643 
1644 	return returnCode;
1645 #else
1646 	Assert(FileIsValid(file));
1647 	return 0;
1648 #endif
1649 }
1650 
1651 void
FileWriteback(File file,off_t offset,off_t nbytes,uint32 wait_event_info)1652 FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
1653 {
1654 	int			returnCode;
1655 
1656 	Assert(FileIsValid(file));
1657 
1658 	DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
1659 			   file, VfdCache[file].fileName,
1660 			   (int64) offset, (int64) nbytes));
1661 
1662 	/*
1663 	 * Caution: do not call pg_flush_data with nbytes = 0, it could trash the
1664 	 * file's seek position.  We prefer to define that as a no-op here.
1665 	 */
1666 	if (nbytes <= 0)
1667 		return;
1668 
1669 	returnCode = FileAccess(file);
1670 	if (returnCode < 0)
1671 		return;
1672 
1673 	pgstat_report_wait_start(wait_event_info);
1674 	pg_flush_data(VfdCache[file].fd, offset, nbytes);
1675 	pgstat_report_wait_end();
1676 }
1677 
1678 int
FileRead(File file,char * buffer,int amount,uint32 wait_event_info)1679 FileRead(File file, char *buffer, int amount, uint32 wait_event_info)
1680 {
1681 	int			returnCode;
1682 	Vfd		   *vfdP;
1683 
1684 	Assert(FileIsValid(file));
1685 
1686 	DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
1687 			   file, VfdCache[file].fileName,
1688 			   (int64) VfdCache[file].seekPos,
1689 			   amount, buffer));
1690 
1691 	returnCode = FileAccess(file);
1692 	if (returnCode < 0)
1693 		return returnCode;
1694 
1695 	vfdP = &VfdCache[file];
1696 
1697 retry:
1698 	pgstat_report_wait_start(wait_event_info);
1699 	returnCode = read(vfdP->fd, buffer, amount);
1700 	pgstat_report_wait_end();
1701 
1702 	if (returnCode >= 0)
1703 	{
1704 		/* if seekPos is unknown, leave it that way */
1705 		if (!FilePosIsUnknown(vfdP->seekPos))
1706 			vfdP->seekPos += returnCode;
1707 	}
1708 	else
1709 	{
1710 		/*
1711 		 * Windows may run out of kernel buffers and return "Insufficient
1712 		 * system resources" error.  Wait a bit and retry to solve it.
1713 		 *
1714 		 * It is rumored that EINTR is also possible on some Unix filesystems,
1715 		 * in which case immediate retry is indicated.
1716 		 */
1717 #ifdef WIN32
1718 		DWORD		error = GetLastError();
1719 
1720 		switch (error)
1721 		{
1722 			case ERROR_NO_SYSTEM_RESOURCES:
1723 				pg_usleep(1000L);
1724 				errno = EINTR;
1725 				break;
1726 			default:
1727 				_dosmaperr(error);
1728 				break;
1729 		}
1730 #endif
1731 		/* OK to retry if interrupted */
1732 		if (errno == EINTR)
1733 			goto retry;
1734 
1735 		/* Trouble, so assume we don't know the file position anymore */
1736 		vfdP->seekPos = FileUnknownPos;
1737 	}
1738 
1739 	return returnCode;
1740 }
1741 
1742 int
FileWrite(File file,char * buffer,int amount,uint32 wait_event_info)1743 FileWrite(File file, char *buffer, int amount, uint32 wait_event_info)
1744 {
1745 	int			returnCode;
1746 	Vfd		   *vfdP;
1747 
1748 	Assert(FileIsValid(file));
1749 
1750 	DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
1751 			   file, VfdCache[file].fileName,
1752 			   (int64) VfdCache[file].seekPos,
1753 			   amount, buffer));
1754 
1755 	returnCode = FileAccess(file);
1756 	if (returnCode < 0)
1757 		return returnCode;
1758 
1759 	vfdP = &VfdCache[file];
1760 
1761 	/*
1762 	 * If enforcing temp_file_limit and it's a temp file, check to see if the
1763 	 * write would overrun temp_file_limit, and throw error if so.  Note: it's
1764 	 * really a modularity violation to throw error here; we should set errno
1765 	 * and return -1.  However, there's no way to report a suitable error
1766 	 * message if we do that.  All current callers would just throw error
1767 	 * immediately anyway, so this is safe at present.
1768 	 */
1769 	if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMPORARY))
1770 	{
1771 		off_t		newPos;
1772 
1773 		/*
1774 		 * Normally we should know the seek position, but if for some reason
1775 		 * we have lost track of it, try again to get it.  Here, it's fine to
1776 		 * throw an error if we still can't get it.
1777 		 */
1778 		if (FilePosIsUnknown(vfdP->seekPos))
1779 		{
1780 			vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
1781 			if (FilePosIsUnknown(vfdP->seekPos))
1782 				elog(ERROR, "could not seek file \"%s\": %m", vfdP->fileName);
1783 		}
1784 
1785 		newPos = vfdP->seekPos + amount;
1786 		if (newPos > vfdP->fileSize)
1787 		{
1788 			uint64		newTotal = temporary_files_size;
1789 
1790 			newTotal += newPos - vfdP->fileSize;
1791 			if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
1792 				ereport(ERROR,
1793 						(errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
1794 						 errmsg("temporary file size exceeds temp_file_limit (%dkB)",
1795 								temp_file_limit)));
1796 		}
1797 	}
1798 
1799 retry:
1800 	errno = 0;
1801 	pgstat_report_wait_start(wait_event_info);
1802 	returnCode = write(vfdP->fd, buffer, amount);
1803 	pgstat_report_wait_end();
1804 
1805 	/* if write didn't set errno, assume problem is no disk space */
1806 	if (returnCode != amount && errno == 0)
1807 		errno = ENOSPC;
1808 
1809 	if (returnCode >= 0)
1810 	{
1811 		/* if seekPos is unknown, leave it that way */
1812 		if (!FilePosIsUnknown(vfdP->seekPos))
1813 			vfdP->seekPos += returnCode;
1814 
1815 		/*
1816 		 * Maintain fileSize and temporary_files_size if it's a temp file.
1817 		 *
1818 		 * If seekPos is -1 (unknown), this will do nothing; but we could only
1819 		 * get here in that state if we're not enforcing temporary_files_size,
1820 		 * so we don't care.
1821 		 */
1822 		if (vfdP->fdstate & FD_TEMPORARY)
1823 		{
1824 			off_t		newPos = vfdP->seekPos;
1825 
1826 			if (newPos > vfdP->fileSize)
1827 			{
1828 				temporary_files_size += newPos - vfdP->fileSize;
1829 				vfdP->fileSize = newPos;
1830 			}
1831 		}
1832 	}
1833 	else
1834 	{
1835 		/*
1836 		 * See comments in FileRead()
1837 		 */
1838 #ifdef WIN32
1839 		DWORD		error = GetLastError();
1840 
1841 		switch (error)
1842 		{
1843 			case ERROR_NO_SYSTEM_RESOURCES:
1844 				pg_usleep(1000L);
1845 				errno = EINTR;
1846 				break;
1847 			default:
1848 				_dosmaperr(error);
1849 				break;
1850 		}
1851 #endif
1852 		/* OK to retry if interrupted */
1853 		if (errno == EINTR)
1854 			goto retry;
1855 
1856 		/* Trouble, so assume we don't know the file position anymore */
1857 		vfdP->seekPos = FileUnknownPos;
1858 	}
1859 
1860 	return returnCode;
1861 }
1862 
1863 int
FileSync(File file,uint32 wait_event_info)1864 FileSync(File file, uint32 wait_event_info)
1865 {
1866 	int			returnCode;
1867 
1868 	Assert(FileIsValid(file));
1869 
1870 	DO_DB(elog(LOG, "FileSync: %d (%s)",
1871 			   file, VfdCache[file].fileName));
1872 
1873 	returnCode = FileAccess(file);
1874 	if (returnCode < 0)
1875 		return returnCode;
1876 
1877 	pgstat_report_wait_start(wait_event_info);
1878 	returnCode = pg_fsync(VfdCache[file].fd);
1879 	pgstat_report_wait_end();
1880 
1881 	return returnCode;
1882 }
1883 
1884 off_t
FileSeek(File file,off_t offset,int whence)1885 FileSeek(File file, off_t offset, int whence)
1886 {
1887 	Vfd		   *vfdP;
1888 
1889 	Assert(FileIsValid(file));
1890 
1891 	DO_DB(elog(LOG, "FileSeek: %d (%s) " INT64_FORMAT " " INT64_FORMAT " %d",
1892 			   file, VfdCache[file].fileName,
1893 			   (int64) VfdCache[file].seekPos,
1894 			   (int64) offset, whence));
1895 
1896 	vfdP = &VfdCache[file];
1897 
1898 	if (FileIsNotOpen(file))
1899 	{
1900 		switch (whence)
1901 		{
1902 			case SEEK_SET:
1903 				if (offset < 0)
1904 				{
1905 					errno = EINVAL;
1906 					return (off_t) -1;
1907 				}
1908 				vfdP->seekPos = offset;
1909 				break;
1910 			case SEEK_CUR:
1911 				if (FilePosIsUnknown(vfdP->seekPos) ||
1912 					vfdP->seekPos + offset < 0)
1913 				{
1914 					errno = EINVAL;
1915 					return (off_t) -1;
1916 				}
1917 				vfdP->seekPos += offset;
1918 				break;
1919 			case SEEK_END:
1920 				if (FileAccess(file) < 0)
1921 					return (off_t) -1;
1922 				vfdP->seekPos = lseek(vfdP->fd, offset, whence);
1923 				break;
1924 			default:
1925 				elog(ERROR, "invalid whence: %d", whence);
1926 				break;
1927 		}
1928 	}
1929 	else
1930 	{
1931 		switch (whence)
1932 		{
1933 			case SEEK_SET:
1934 				if (offset < 0)
1935 				{
1936 					errno = EINVAL;
1937 					return (off_t) -1;
1938 				}
1939 				if (vfdP->seekPos != offset)
1940 					vfdP->seekPos = lseek(vfdP->fd, offset, whence);
1941 				break;
1942 			case SEEK_CUR:
1943 				if (offset != 0 || FilePosIsUnknown(vfdP->seekPos))
1944 					vfdP->seekPos = lseek(vfdP->fd, offset, whence);
1945 				break;
1946 			case SEEK_END:
1947 				vfdP->seekPos = lseek(vfdP->fd, offset, whence);
1948 				break;
1949 			default:
1950 				elog(ERROR, "invalid whence: %d", whence);
1951 				break;
1952 		}
1953 	}
1954 
1955 	return vfdP->seekPos;
1956 }
1957 
/*
 * FileTell --- report the recorded seek position of a virtual file
 *
 * No system call is made; the value comes straight from the VFD.
 *
 * XXX not actually used but here for completeness
 */
#ifdef NOT_USED
off_t
FileTell(File file)
{
	off_t		pos;

	Assert(FileIsValid(file));

	DO_DB(elog(LOG, "FileTell %d (%s)",
			   file, VfdCache[file].fileName));

	pos = VfdCache[file].seekPos;
	return pos;
}
#endif
1971 
1972 int
FileTruncate(File file,off_t offset,uint32 wait_event_info)1973 FileTruncate(File file, off_t offset, uint32 wait_event_info)
1974 {
1975 	int			returnCode;
1976 
1977 	Assert(FileIsValid(file));
1978 
1979 	DO_DB(elog(LOG, "FileTruncate %d (%s)",
1980 			   file, VfdCache[file].fileName));
1981 
1982 	returnCode = FileAccess(file);
1983 	if (returnCode < 0)
1984 		return returnCode;
1985 
1986 	pgstat_report_wait_start(wait_event_info);
1987 	returnCode = ftruncate(VfdCache[file].fd, offset);
1988 	pgstat_report_wait_end();
1989 
1990 	if (returnCode == 0 && VfdCache[file].fileSize > offset)
1991 	{
1992 		/* adjust our state for truncation of a temp file */
1993 		Assert(VfdCache[file].fdstate & FD_TEMPORARY);
1994 		temporary_files_size -= VfdCache[file].fileSize - offset;
1995 		VfdCache[file].fileSize = offset;
1996 	}
1997 
1998 	return returnCode;
1999 }
2000 
2001 /*
2002  * Return the pathname associated with an open file.
2003  *
2004  * The returned string points to an internal buffer, which is valid until
2005  * the file is closed.
2006  */
2007 char *
FilePathName(File file)2008 FilePathName(File file)
2009 {
2010 	Assert(FileIsValid(file));
2011 
2012 	return VfdCache[file].fileName;
2013 }
2014 
2015 /*
2016  * Return the raw file descriptor of an opened file.
2017  *
2018  * The returned file descriptor will be valid until the file is closed, but
2019  * there are a lot of things that can make that happen.  So the caller should
2020  * be careful not to do much of anything else before it finishes using the
2021  * returned file descriptor.
2022  */
2023 int
FileGetRawDesc(File file)2024 FileGetRawDesc(File file)
2025 {
2026 	Assert(FileIsValid(file));
2027 	return VfdCache[file].fd;
2028 }
2029 
2030 /*
2031  * FileGetRawFlags - returns the file flags on open(2)
2032  */
2033 int
FileGetRawFlags(File file)2034 FileGetRawFlags(File file)
2035 {
2036 	Assert(FileIsValid(file));
2037 	return VfdCache[file].fileFlags;
2038 }
2039 
2040 /*
2041  * FileGetRawMode - returns the mode bitmask passed to open(2)
2042  */
2043 int
FileGetRawMode(File file)2044 FileGetRawMode(File file)
2045 {
2046 	Assert(FileIsValid(file));
2047 	return VfdCache[file].fileMode;
2048 }
2049 
2050 /*
2051  * Make room for another allocatedDescs[] array entry if needed and possible.
2052  * Returns true if an array element is available.
2053  */
2054 static bool
reserveAllocatedDesc(void)2055 reserveAllocatedDesc(void)
2056 {
2057 	AllocateDesc *newDescs;
2058 	int			newMax;
2059 
2060 	/* Quick out if array already has a free slot. */
2061 	if (numAllocatedDescs < maxAllocatedDescs)
2062 		return true;
2063 
2064 	/*
2065 	 * If the array hasn't yet been created in the current process, initialize
2066 	 * it with FD_MINFREE / 2 elements.  In many scenarios this is as many as
2067 	 * we will ever need, anyway.  We don't want to look at max_safe_fds
2068 	 * immediately because set_max_safe_fds() may not have run yet.
2069 	 */
2070 	if (allocatedDescs == NULL)
2071 	{
2072 		newMax = FD_MINFREE / 2;
2073 		newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2074 		/* Out of memory already?  Treat as fatal error. */
2075 		if (newDescs == NULL)
2076 			ereport(ERROR,
2077 					(errcode(ERRCODE_OUT_OF_MEMORY),
2078 					 errmsg("out of memory")));
2079 		allocatedDescs = newDescs;
2080 		maxAllocatedDescs = newMax;
2081 		return true;
2082 	}
2083 
2084 	/*
2085 	 * Consider enlarging the array beyond the initial allocation used above.
2086 	 * By the time this happens, max_safe_fds should be known accurately.
2087 	 *
2088 	 * We mustn't let allocated descriptors hog all the available FDs, and in
2089 	 * practice we'd better leave a reasonable number of FDs for VFD use.  So
2090 	 * set the maximum to max_safe_fds / 2.  (This should certainly be at
2091 	 * least as large as the initial size, FD_MINFREE / 2.)
2092 	 */
2093 	newMax = max_safe_fds / 2;
2094 	if (newMax > maxAllocatedDescs)
2095 	{
2096 		newDescs = (AllocateDesc *) realloc(allocatedDescs,
2097 											newMax * sizeof(AllocateDesc));
2098 		/* Treat out-of-memory as a non-fatal error. */
2099 		if (newDescs == NULL)
2100 			return false;
2101 		allocatedDescs = newDescs;
2102 		maxAllocatedDescs = newMax;
2103 		return true;
2104 	}
2105 
2106 	/* Can't enlarge allocatedDescs[] any more. */
2107 	return false;
2108 }
2109 
2110 /*
2111  * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2112  * rather than plain fopen().  This lets fd.c deal with freeing FDs if
2113  * necessary to open the file.  When done, call FreeFile rather than fclose.
2114  *
2115  * Note that files that will be open for any significant length of time
2116  * should NOT be handled this way, since they cannot share kernel file
2117  * descriptors with other files; there is grave risk of running out of FDs
2118  * if anyone locks down too many FDs.  Most callers of this routine are
2119  * simply reading a config file that they will read and close immediately.
2120  *
2121  * fd.c will automatically close all files opened with AllocateFile at
2122  * transaction commit or abort; this prevents FD leakage if a routine
2123  * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2124  *
2125  * Ideally this should be the *only* direct call of fopen() in the backend.
2126  */
2127 FILE *
AllocateFile(const char * name,const char * mode)2128 AllocateFile(const char *name, const char *mode)
2129 {
2130 	FILE	   *file;
2131 
2132 	DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2133 			   numAllocatedDescs, name));
2134 
2135 	/* Can we allocate another non-virtual FD? */
2136 	if (!reserveAllocatedDesc())
2137 		ereport(ERROR,
2138 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2139 				 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2140 						maxAllocatedDescs, name)));
2141 
2142 	/* Close excess kernel FDs. */
2143 	ReleaseLruFiles();
2144 
2145 TryAgain:
2146 	if ((file = fopen(name, mode)) != NULL)
2147 	{
2148 		AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2149 
2150 		desc->kind = AllocateDescFile;
2151 		desc->desc.file = file;
2152 		desc->create_subid = GetCurrentSubTransactionId();
2153 		numAllocatedDescs++;
2154 		return desc->desc.file;
2155 	}
2156 
2157 	if (errno == EMFILE || errno == ENFILE)
2158 	{
2159 		int			save_errno = errno;
2160 
2161 		ereport(LOG,
2162 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2163 				 errmsg("out of file descriptors: %m; release and retry")));
2164 		errno = 0;
2165 		if (ReleaseLruFile())
2166 			goto TryAgain;
2167 		errno = save_errno;
2168 	}
2169 
2170 	return NULL;
2171 }
2172 
2173 
2174 /*
2175  * Like AllocateFile, but returns an unbuffered fd like open(2)
2176  */
2177 int
OpenTransientFile(FileName fileName,int fileFlags,int fileMode)2178 OpenTransientFile(FileName fileName, int fileFlags, int fileMode)
2179 {
2180 	int			fd;
2181 
2182 	DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2183 			   numAllocatedDescs, fileName));
2184 
2185 	/* Can we allocate another non-virtual FD? */
2186 	if (!reserveAllocatedDesc())
2187 		ereport(ERROR,
2188 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2189 				 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2190 						maxAllocatedDescs, fileName)));
2191 
2192 	/* Close excess kernel FDs. */
2193 	ReleaseLruFiles();
2194 
2195 	fd = BasicOpenFile(fileName, fileFlags, fileMode);
2196 
2197 	if (fd >= 0)
2198 	{
2199 		AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2200 
2201 		desc->kind = AllocateDescRawFD;
2202 		desc->desc.fd = fd;
2203 		desc->create_subid = GetCurrentSubTransactionId();
2204 		numAllocatedDescs++;
2205 
2206 		return fd;
2207 	}
2208 
2209 	return -1;					/* failure */
2210 }
2211 
2212 /*
2213  * Routines that want to initiate a pipe stream should use OpenPipeStream
2214  * rather than plain popen().  This lets fd.c deal with freeing FDs if
2215  * necessary.  When done, call ClosePipeStream rather than pclose.
2216  *
2217  * This function also ensures that the popen'd program is run with default
2218  * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2219  * uses.  This ensures desirable response to, eg, closing a read pipe early.
2220  */
2221 FILE *
OpenPipeStream(const char * command,const char * mode)2222 OpenPipeStream(const char *command, const char *mode)
2223 {
2224 	FILE	   *file;
2225 	int			save_errno;
2226 
2227 	DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2228 			   numAllocatedDescs, command));
2229 
2230 	/* Can we allocate another non-virtual FD? */
2231 	if (!reserveAllocatedDesc())
2232 		ereport(ERROR,
2233 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2234 				 errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2235 						maxAllocatedDescs, command)));
2236 
2237 	/* Close excess kernel FDs. */
2238 	ReleaseLruFiles();
2239 
2240 TryAgain:
2241 	fflush(stdout);
2242 	fflush(stderr);
2243 	pqsignal(SIGPIPE, SIG_DFL);
2244 	errno = 0;
2245 	file = popen(command, mode);
2246 	save_errno = errno;
2247 	pqsignal(SIGPIPE, SIG_IGN);
2248 	errno = save_errno;
2249 	if (file != NULL)
2250 	{
2251 		AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2252 
2253 		desc->kind = AllocateDescPipe;
2254 		desc->desc.file = file;
2255 		desc->create_subid = GetCurrentSubTransactionId();
2256 		numAllocatedDescs++;
2257 		return desc->desc.file;
2258 	}
2259 
2260 	if (errno == EMFILE || errno == ENFILE)
2261 	{
2262 		ereport(LOG,
2263 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2264 				 errmsg("out of file descriptors: %m; release and retry")));
2265 		if (ReleaseLruFile())
2266 			goto TryAgain;
2267 		errno = save_errno;
2268 	}
2269 
2270 	return NULL;
2271 }
2272 
2273 /*
2274  * Free an AllocateDesc of any type.
2275  *
2276  * The argument *must* point into the allocatedDescs[] array.
2277  */
2278 static int
FreeDesc(AllocateDesc * desc)2279 FreeDesc(AllocateDesc *desc)
2280 {
2281 	int			result;
2282 
2283 	/* Close the underlying object */
2284 	switch (desc->kind)
2285 	{
2286 		case AllocateDescFile:
2287 			result = fclose(desc->desc.file);
2288 			break;
2289 		case AllocateDescPipe:
2290 			result = pclose(desc->desc.file);
2291 			break;
2292 		case AllocateDescDir:
2293 			result = closedir(desc->desc.dir);
2294 			break;
2295 		case AllocateDescRawFD:
2296 			result = close(desc->desc.fd);
2297 			break;
2298 		default:
2299 			elog(ERROR, "AllocateDesc kind not recognized");
2300 			result = 0;			/* keep compiler quiet */
2301 			break;
2302 	}
2303 
2304 	/* Compact storage in the allocatedDescs array */
2305 	numAllocatedDescs--;
2306 	*desc = allocatedDescs[numAllocatedDescs];
2307 
2308 	return result;
2309 }
2310 
2311 /*
2312  * Close a file returned by AllocateFile.
2313  *
2314  * Note we do not check fclose's return value --- it is up to the caller
2315  * to handle close errors.
2316  */
2317 int
FreeFile(FILE * file)2318 FreeFile(FILE *file)
2319 {
2320 	int			i;
2321 
2322 	DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2323 
2324 	/* Remove file from list of allocated files, if it's present */
2325 	for (i = numAllocatedDescs; --i >= 0;)
2326 	{
2327 		AllocateDesc *desc = &allocatedDescs[i];
2328 
2329 		if (desc->kind == AllocateDescFile && desc->desc.file == file)
2330 			return FreeDesc(desc);
2331 	}
2332 
2333 	/* Only get here if someone passes us a file not in allocatedDescs */
2334 	elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2335 
2336 	return fclose(file);
2337 }
2338 
2339 /*
2340  * Close a file returned by OpenTransientFile.
2341  *
2342  * Note we do not check close's return value --- it is up to the caller
2343  * to handle close errors.
2344  */
2345 int
CloseTransientFile(int fd)2346 CloseTransientFile(int fd)
2347 {
2348 	int			i;
2349 
2350 	DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2351 
2352 	/* Remove fd from list of allocated files, if it's present */
2353 	for (i = numAllocatedDescs; --i >= 0;)
2354 	{
2355 		AllocateDesc *desc = &allocatedDescs[i];
2356 
2357 		if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2358 			return FreeDesc(desc);
2359 	}
2360 
2361 	/* Only get here if someone passes us a file not in allocatedDescs */
2362 	elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2363 
2364 	return close(fd);
2365 }
2366 
2367 /*
2368  * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2369  * rather than plain opendir().  This lets fd.c deal with freeing FDs if
2370  * necessary to open the directory, and with closing it after an elog.
2371  * When done, call FreeDir rather than closedir.
2372  *
2373  * Returns NULL, with errno set, on failure.  Note that failure detection
2374  * is commonly left to the following call of ReadDir or ReadDirExtended;
2375  * see the comments for ReadDir.
2376  *
2377  * Ideally this should be the *only* direct call of opendir() in the backend.
2378  */
2379 DIR *
AllocateDir(const char * dirname)2380 AllocateDir(const char *dirname)
2381 {
2382 	DIR		   *dir;
2383 
2384 	DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2385 			   numAllocatedDescs, dirname));
2386 
2387 	/* Can we allocate another non-virtual FD? */
2388 	if (!reserveAllocatedDesc())
2389 		ereport(ERROR,
2390 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2391 				 errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2392 						maxAllocatedDescs, dirname)));
2393 
2394 	/* Close excess kernel FDs. */
2395 	ReleaseLruFiles();
2396 
2397 TryAgain:
2398 	if ((dir = opendir(dirname)) != NULL)
2399 	{
2400 		AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2401 
2402 		desc->kind = AllocateDescDir;
2403 		desc->desc.dir = dir;
2404 		desc->create_subid = GetCurrentSubTransactionId();
2405 		numAllocatedDescs++;
2406 		return desc->desc.dir;
2407 	}
2408 
2409 	if (errno == EMFILE || errno == ENFILE)
2410 	{
2411 		int			save_errno = errno;
2412 
2413 		ereport(LOG,
2414 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2415 				 errmsg("out of file descriptors: %m; release and retry")));
2416 		errno = 0;
2417 		if (ReleaseLruFile())
2418 			goto TryAgain;
2419 		errno = save_errno;
2420 	}
2421 
2422 	return NULL;
2423 }
2424 
2425 /*
2426  * Read a directory opened with AllocateDir, ereport'ing any error.
2427  *
2428  * This is easier to use than raw readdir() since it takes care of some
2429  * otherwise rather tedious and error-prone manipulation of errno.  Also,
2430  * if you are happy with a generic error message for AllocateDir failure,
2431  * you can just do
2432  *
2433  *		dir = AllocateDir(path);
2434  *		while ((dirent = ReadDir(dir, path)) != NULL)
2435  *			process dirent;
2436  *		FreeDir(dir);
2437  *
2438  * since a NULL dir parameter is taken as indicating AllocateDir failed.
2439  * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2440  * use this shortcut.)
2441  *
2442  * The pathname passed to AllocateDir must be passed to this routine too,
2443  * but it is only used for error reporting.
2444  */
2445 struct dirent *
ReadDir(DIR * dir,const char * dirname)2446 ReadDir(DIR *dir, const char *dirname)
2447 {
2448 	return ReadDirExtended(dir, dirname, ERROR);
2449 }
2450 
2451 /*
2452  * Alternate version of ReadDir that allows caller to specify the elevel
2453  * for any error report (whether it's reporting an initial failure of
2454  * AllocateDir or a subsequent directory read failure).
2455  *
2456  * If elevel < ERROR, returns NULL after any error.  With the normal coding
2457  * pattern, this will result in falling out of the loop immediately as
2458  * though the directory contained no (more) entries.
2459  */
2460 struct dirent *
ReadDirExtended(DIR * dir,const char * dirname,int elevel)2461 ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2462 {
2463 	struct dirent *dent;
2464 
2465 	/* Give a generic message for AllocateDir failure, if caller didn't */
2466 	if (dir == NULL)
2467 	{
2468 		ereport(elevel,
2469 				(errcode_for_file_access(),
2470 				 errmsg("could not open directory \"%s\": %m",
2471 						dirname)));
2472 		return NULL;
2473 	}
2474 
2475 	errno = 0;
2476 	if ((dent = readdir(dir)) != NULL)
2477 		return dent;
2478 
2479 	if (errno)
2480 		ereport(elevel,
2481 				(errcode_for_file_access(),
2482 				 errmsg("could not read directory \"%s\": %m",
2483 						dirname)));
2484 	return NULL;
2485 }
2486 
2487 /*
2488  * Close a directory opened with AllocateDir.
2489  *
2490  * Returns closedir's return value (with errno set if it's not 0).
2491  * Note we do not check the return value --- it is up to the caller
2492  * to handle close errors if wanted.
2493  *
2494  * Does nothing if dir == NULL; we assume that directory open failure was
2495  * already reported if desired.
2496  */
2497 int
FreeDir(DIR * dir)2498 FreeDir(DIR *dir)
2499 {
2500 	int			i;
2501 
2502 	/* Nothing to do if AllocateDir failed */
2503 	if (dir == NULL)
2504 		return 0;
2505 
2506 	DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2507 
2508 	/* Remove dir from list of allocated dirs, if it's present */
2509 	for (i = numAllocatedDescs; --i >= 0;)
2510 	{
2511 		AllocateDesc *desc = &allocatedDescs[i];
2512 
2513 		if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2514 			return FreeDesc(desc);
2515 	}
2516 
2517 	/* Only get here if someone passes us a dir not in allocatedDescs */
2518 	elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
2519 
2520 	return closedir(dir);
2521 }
2522 
2523 
2524 /*
2525  * Close a pipe stream returned by OpenPipeStream.
2526  */
2527 int
ClosePipeStream(FILE * file)2528 ClosePipeStream(FILE *file)
2529 {
2530 	int			i;
2531 
2532 	DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
2533 
2534 	/* Remove file from list of allocated files, if it's present */
2535 	for (i = numAllocatedDescs; --i >= 0;)
2536 	{
2537 		AllocateDesc *desc = &allocatedDescs[i];
2538 
2539 		if (desc->kind == AllocateDescPipe && desc->desc.file == file)
2540 			return FreeDesc(desc);
2541 	}
2542 
2543 	/* Only get here if someone passes us a file not in allocatedDescs */
2544 	elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
2545 
2546 	return pclose(file);
2547 }
2548 
2549 /*
2550  * closeAllVfds
2551  *
2552  * Force all VFDs into the physically-closed state, so that the fewest
2553  * possible number of kernel file descriptors are in use.  There is no
2554  * change in the logical state of the VFDs.
2555  */
2556 void
closeAllVfds(void)2557 closeAllVfds(void)
2558 {
2559 	Index		i;
2560 
2561 	if (SizeVfdCache > 0)
2562 	{
2563 		Assert(FileIsNotOpen(0));	/* Make sure ring not corrupted */
2564 		for (i = 1; i < SizeVfdCache; i++)
2565 		{
2566 			if (!FileIsNotOpen(i))
2567 				LruDelete(i);
2568 		}
2569 	}
2570 }
2571 
2572 
2573 /*
2574  * SetTempTablespaces
2575  *
2576  * Define a list (actually an array) of OIDs of tablespaces to use for
2577  * temporary files.  This list will be used until end of transaction,
2578  * unless this function is called again before then.  It is caller's
2579  * responsibility that the passed-in array has adequate lifespan (typically
2580  * it'd be allocated in TopTransactionContext).
2581  *
2582  * Some entries of the array may be InvalidOid, indicating that the current
2583  * database's default tablespace should be used.
2584  */
2585 void
SetTempTablespaces(Oid * tableSpaces,int numSpaces)2586 SetTempTablespaces(Oid *tableSpaces, int numSpaces)
2587 {
2588 	Assert(numSpaces >= 0);
2589 	tempTableSpaces = tableSpaces;
2590 	numTempTableSpaces = numSpaces;
2591 
2592 	/*
2593 	 * Select a random starting point in the list.  This is to minimize
2594 	 * conflicts between backends that are most likely sharing the same list
2595 	 * of temp tablespaces.  Note that if we create multiple temp files in the
2596 	 * same transaction, we'll advance circularly through the list --- this
2597 	 * ensures that large temporary sort files are nicely spread across all
2598 	 * available tablespaces.
2599 	 */
2600 	if (numSpaces > 1)
2601 		nextTempTableSpace = random() % numSpaces;
2602 	else
2603 		nextTempTableSpace = 0;
2604 }
2605 
2606 /*
2607  * TempTablespacesAreSet
2608  *
2609  * Returns TRUE if SetTempTablespaces has been called in current transaction.
2610  * (This is just so that tablespaces.c doesn't need its own per-transaction
2611  * state.)
2612  */
2613 bool
TempTablespacesAreSet(void)2614 TempTablespacesAreSet(void)
2615 {
2616 	return (numTempTableSpaces >= 0);
2617 }
2618 
2619 /*
2620  * GetNextTempTableSpace
2621  *
2622  * Select the next temp tablespace to use.  A result of InvalidOid means
2623  * to use the current database's default tablespace.
2624  */
2625 Oid
GetNextTempTableSpace(void)2626 GetNextTempTableSpace(void)
2627 {
2628 	if (numTempTableSpaces > 0)
2629 	{
2630 		/* Advance nextTempTableSpace counter with wraparound */
2631 		if (++nextTempTableSpace >= numTempTableSpaces)
2632 			nextTempTableSpace = 0;
2633 		return tempTableSpaces[nextTempTableSpace];
2634 	}
2635 	return InvalidOid;
2636 }
2637 
2638 
2639 /*
2640  * AtEOSubXact_Files
2641  *
2642  * Take care of subtransaction commit/abort.  At abort, we close temp files
2643  * that the subtransaction may have opened.  At commit, we reassign the
2644  * files that were opened to the parent subtransaction.
2645  */
2646 void
AtEOSubXact_Files(bool isCommit,SubTransactionId mySubid,SubTransactionId parentSubid)2647 AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
2648 				  SubTransactionId parentSubid)
2649 {
2650 	Index		i;
2651 
2652 	for (i = 0; i < numAllocatedDescs; i++)
2653 	{
2654 		if (allocatedDescs[i].create_subid == mySubid)
2655 		{
2656 			if (isCommit)
2657 				allocatedDescs[i].create_subid = parentSubid;
2658 			else
2659 			{
2660 				/* have to recheck the item after FreeDesc (ugly) */
2661 				FreeDesc(&allocatedDescs[i--]);
2662 			}
2663 		}
2664 	}
2665 }
2666 
/*
 * AtEOXact_Files
 *
 * This routine is called during transaction commit or abort (it doesn't
 * particularly care which).  All still-open per-transaction temporary file
 * VFDs are closed, which also causes the underlying files to be deleted
 * (although they should've been closed already by the ResourceOwner
 * cleanup). Furthermore, all "allocated" stdio files are closed. We also
 * forget any transaction-local temp tablespace list.
 */
void
AtEOXact_Files(void)
{
	/* false: not a process exit, so only transaction-local files go away */
	CleanupTempFiles(false);

	/*
	 * Drop the temp tablespace list.  The count of -1 is what makes
	 * TempTablespacesAreSet() report "not set" again.
	 */
	tempTableSpaces = NULL;
	numTempTableSpaces = -1;
}
2684 
/*
 * AtProcExit_Files
 *
 * on_proc_exit hook to clean up temp files during backend shutdown.
 * Here, we want to clean up *all* temp files including interXact ones.
 *
 * Neither "code" nor "arg" is used by this function.
 */
static void
AtProcExit_Files(int code, Datum arg)
{
	/* true: process exit, so every temporary file is removed */
	CleanupTempFiles(true);
}
2696 
2697 /*
2698  * Close temporary files and delete their underlying files.
2699  *
2700  * isProcExit: if true, this is being called as the backend process is
2701  * exiting. If that's the case, we should remove all temporary files; if
2702  * that's not the case, we are being called for transaction commit/abort
2703  * and should only remove transaction-local temp files.  In either case,
2704  * also clean up "allocated" stdio files, dirs and fds.
2705  */
2706 static void
CleanupTempFiles(bool isProcExit)2707 CleanupTempFiles(bool isProcExit)
2708 {
2709 	Index		i;
2710 
2711 	/*
2712 	 * Careful here: at proc_exit we need extra cleanup, not just
2713 	 * xact_temporary files.
2714 	 */
2715 	if (isProcExit || have_xact_temporary_files)
2716 	{
2717 		Assert(FileIsNotOpen(0));	/* Make sure ring not corrupted */
2718 		for (i = 1; i < SizeVfdCache; i++)
2719 		{
2720 			unsigned short fdstate = VfdCache[i].fdstate;
2721 
2722 			if ((fdstate & FD_TEMPORARY) && VfdCache[i].fileName != NULL)
2723 			{
2724 				/*
2725 				 * If we're in the process of exiting a backend process, close
2726 				 * all temporary files. Otherwise, only close temporary files
2727 				 * local to the current transaction. They should be closed by
2728 				 * the ResourceOwner mechanism already, so this is just a
2729 				 * debugging cross-check.
2730 				 */
2731 				if (isProcExit)
2732 					FileClose(i);
2733 				else if (fdstate & FD_XACT_TEMPORARY)
2734 				{
2735 					elog(WARNING,
2736 						 "temporary file %s not closed at end-of-transaction",
2737 						 VfdCache[i].fileName);
2738 					FileClose(i);
2739 				}
2740 			}
2741 		}
2742 
2743 		have_xact_temporary_files = false;
2744 	}
2745 
2746 	/* Clean up "allocated" stdio files, dirs and fds. */
2747 	while (numAllocatedDescs > 0)
2748 		FreeDesc(&allocatedDescs[0]);
2749 }
2750 
2751 
2752 /*
2753  * Remove temporary and temporary relation files left over from a prior
2754  * postmaster session
2755  *
2756  * This should be called during postmaster startup.  It will forcibly
2757  * remove any leftover files created by OpenTemporaryFile and any leftover
2758  * temporary relation files created by mdcreate.
2759  *
2760  * NOTE: we could, but don't, call this during a post-backend-crash restart
2761  * cycle.  The argument for not doing it is that someone might want to examine
2762  * the temp files for debugging purposes.  This does however mean that
2763  * OpenTemporaryFile had better allow for collision with an existing temp
2764  * file name.
2765  */
2766 void
RemovePgTempFiles(void)2767 RemovePgTempFiles(void)
2768 {
2769 	char		temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
2770 	DIR		   *spc_dir;
2771 	struct dirent *spc_de;
2772 
2773 	/*
2774 	 * First process temp files in pg_default ($PGDATA/base)
2775 	 */
2776 	snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
2777 	RemovePgTempFilesInDir(temp_path);
2778 	RemovePgTempRelationFiles("base");
2779 
2780 	/*
2781 	 * Cycle through temp directories for all non-default tablespaces.
2782 	 */
2783 	spc_dir = AllocateDir("pg_tblspc");
2784 
2785 	while ((spc_de = ReadDir(spc_dir, "pg_tblspc")) != NULL)
2786 	{
2787 		if (strcmp(spc_de->d_name, ".") == 0 ||
2788 			strcmp(spc_de->d_name, "..") == 0)
2789 			continue;
2790 
2791 		snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
2792 				 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
2793 		RemovePgTempFilesInDir(temp_path);
2794 
2795 		snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
2796 				 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
2797 		RemovePgTempRelationFiles(temp_path);
2798 	}
2799 
2800 	FreeDir(spc_dir);
2801 
2802 	/*
2803 	 * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
2804 	 * DataDir as well.
2805 	 */
2806 #ifdef EXEC_BACKEND
2807 	RemovePgTempFilesInDir(PG_TEMP_FILES_DIR);
2808 #endif
2809 }
2810 
2811 /* Process one pgsql_tmp directory for RemovePgTempFiles */
2812 static void
RemovePgTempFilesInDir(const char * tmpdirname)2813 RemovePgTempFilesInDir(const char *tmpdirname)
2814 {
2815 	DIR		   *temp_dir;
2816 	struct dirent *temp_de;
2817 	char		rm_path[MAXPGPATH * 2];
2818 
2819 	temp_dir = AllocateDir(tmpdirname);
2820 	if (temp_dir == NULL)
2821 	{
2822 		/* anything except ENOENT is fishy */
2823 		if (errno != ENOENT)
2824 			elog(LOG,
2825 				 "could not open temporary-files directory \"%s\": %m",
2826 				 tmpdirname);
2827 		return;
2828 	}
2829 
2830 	while ((temp_de = ReadDir(temp_dir, tmpdirname)) != NULL)
2831 	{
2832 		if (strcmp(temp_de->d_name, ".") == 0 ||
2833 			strcmp(temp_de->d_name, "..") == 0)
2834 			continue;
2835 
2836 		snprintf(rm_path, sizeof(rm_path), "%s/%s",
2837 				 tmpdirname, temp_de->d_name);
2838 
2839 		if (strncmp(temp_de->d_name,
2840 					PG_TEMP_FILE_PREFIX,
2841 					strlen(PG_TEMP_FILE_PREFIX)) == 0)
2842 			unlink(rm_path);	/* note we ignore any error */
2843 		else
2844 			elog(LOG,
2845 				 "unexpected file found in temporary-files directory: \"%s\"",
2846 				 rm_path);
2847 	}
2848 
2849 	FreeDir(temp_dir);
2850 }
2851 
2852 /* Process one tablespace directory, look for per-DB subdirectories */
2853 static void
RemovePgTempRelationFiles(const char * tsdirname)2854 RemovePgTempRelationFiles(const char *tsdirname)
2855 {
2856 	DIR		   *ts_dir;
2857 	struct dirent *de;
2858 	char		dbspace_path[MAXPGPATH * 2];
2859 
2860 	ts_dir = AllocateDir(tsdirname);
2861 	if (ts_dir == NULL)
2862 	{
2863 		/* anything except ENOENT is fishy */
2864 		if (errno != ENOENT)
2865 			elog(LOG,
2866 				 "could not open tablespace directory \"%s\": %m",
2867 				 tsdirname);
2868 		return;
2869 	}
2870 
2871 	while ((de = ReadDir(ts_dir, tsdirname)) != NULL)
2872 	{
2873 		int			i = 0;
2874 
2875 		/*
2876 		 * We're only interested in the per-database directories, which have
2877 		 * numeric names.  Note that this code will also (properly) ignore "."
2878 		 * and "..".
2879 		 */
2880 		while (isdigit((unsigned char) de->d_name[i]))
2881 			++i;
2882 		if (de->d_name[i] != '\0' || i == 0)
2883 			continue;
2884 
2885 		snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
2886 				 tsdirname, de->d_name);
2887 		RemovePgTempRelationFilesInDbspace(dbspace_path);
2888 	}
2889 
2890 	FreeDir(ts_dir);
2891 }
2892 
2893 /* Process one per-dbspace directory for RemovePgTempRelationFiles */
2894 static void
RemovePgTempRelationFilesInDbspace(const char * dbspacedirname)2895 RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
2896 {
2897 	DIR		   *dbspace_dir;
2898 	struct dirent *de;
2899 	char		rm_path[MAXPGPATH * 2];
2900 
2901 	dbspace_dir = AllocateDir(dbspacedirname);
2902 	if (dbspace_dir == NULL)
2903 	{
2904 		/* we just saw this directory, so it really ought to be there */
2905 		elog(LOG,
2906 			 "could not open dbspace directory \"%s\": %m",
2907 			 dbspacedirname);
2908 		return;
2909 	}
2910 
2911 	while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
2912 	{
2913 		if (!looks_like_temp_rel_name(de->d_name))
2914 			continue;
2915 
2916 		snprintf(rm_path, sizeof(rm_path), "%s/%s",
2917 				 dbspacedirname, de->d_name);
2918 
2919 		unlink(rm_path);		/* note we ignore any error */
2920 	}
2921 
2922 	FreeDir(dbspace_dir);
2923 }
2924 
/*
 * Does the name have the form of a temporary relation file name?
 *
 * Accepted: t<digits>_<digits>, optionally followed by _<forkname>,
 * optionally followed by .<digits> (a segment suffix), and nothing else.
 */
static bool
looks_like_temp_rel_name(const char *name)
{
	const char *cur = name;
	const char *mark;

	/* A leading "t" is mandatory. */
	if (*cur != 't')
		return false;
	cur++;

	/* Next: one or more digits, terminated by an underscore. */
	mark = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == mark || *cur != '_')
		return false;
	cur++;

	/* Next: a second run of one or more digits. */
	mark = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == mark)
		return false;

	/*
	 * Optional _forkname.  forkname_chars() tells us how many characters to
	 * step over; a non-positive result is treated as "not a fork name".
	 */
	if (*cur == '_')
	{
		int			forklen = forkname_chars(cur + 1, NULL);

		if (forklen <= 0)
			return false;
		cur += forklen + 1;
	}

	/* Optional .segment, which needs at least one digit after the dot. */
	if (*cur == '.')
	{
		cur++;
		mark = cur;
		while (isdigit((unsigned char) *cur))
			cur++;
		if (cur == mark)
			return false;
	}

	/* Anything left over disqualifies the name. */
	return (*cur == '\0');
}
2973 
2974 
2975 /*
2976  * Issue fsync recursively on PGDATA and all its contents.
2977  *
2978  * We fsync regular files and directories wherever they are, but we
2979  * follow symlinks only for pg_wal and immediately under pg_tblspc.
2980  * Other symlinks are presumed to point at files we're not responsible
2981  * for fsyncing, and might not have privileges to write at all.
2982  *
2983  * Errors are logged but not considered fatal; that's because this is used
2984  * only during database startup, to deal with the possibility that there are
2985  * issued-but-unsynced writes pending against the data directory.  We want to
2986  * ensure that such writes reach disk before anything that's done in the new
2987  * run.  However, aborting on error would result in failure to start for
2988  * harmless cases such as read-only files in the data directory, and that's
2989  * not good either.
2990  *
2991  * Note that if we previously crashed due to a PANIC on fsync(), we'll be
2992  * rewriting all changes again during recovery.
2993  *
2994  * Note we assume we're chdir'd into PGDATA to begin with.
2995  */
2996 void
SyncDataDirectory(void)2997 SyncDataDirectory(void)
2998 {
2999 	bool		xlog_is_symlink;
3000 
3001 	/* We can skip this whole thing if fsync is disabled. */
3002 	if (!enableFsync)
3003 		return;
3004 
3005 	/*
3006 	 * If pg_wal is a symlink, we'll need to recurse into it separately,
3007 	 * because the first walkdir below will ignore it.
3008 	 */
3009 	xlog_is_symlink = false;
3010 
3011 #ifndef WIN32
3012 	{
3013 		struct stat st;
3014 
3015 		if (lstat("pg_wal", &st) < 0)
3016 			ereport(LOG,
3017 					(errcode_for_file_access(),
3018 					 errmsg("could not stat file \"%s\": %m",
3019 							"pg_wal")));
3020 		else if (S_ISLNK(st.st_mode))
3021 			xlog_is_symlink = true;
3022 	}
3023 #else
3024 	if (pgwin32_is_junction("pg_wal"))
3025 		xlog_is_symlink = true;
3026 #endif
3027 
3028 	/*
3029 	 * If possible, hint to the kernel that we're soon going to fsync the data
3030 	 * directory and its contents.  Errors in this step are even less
3031 	 * interesting than normal, so log them only at DEBUG1.
3032 	 */
3033 #ifdef PG_FLUSH_DATA_WORKS
3034 	walkdir(".", pre_sync_fname, false, DEBUG1);
3035 	if (xlog_is_symlink)
3036 		walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3037 	walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
3038 #endif
3039 
3040 	/*
3041 	 * Now we do the fsync()s in the same order.
3042 	 *
3043 	 * The main call ignores symlinks, so in addition to specially processing
3044 	 * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3045 	 * process_symlinks = true.  Note that if there are any plain directories
3046 	 * in pg_tblspc, they'll get fsync'd twice.  That's not an expected case
3047 	 * so we don't worry about optimizing it.
3048 	 */
3049 	walkdir(".", datadir_fsync_fname, false, LOG);
3050 	if (xlog_is_symlink)
3051 		walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3052 	walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
3053 }
3054 
3055 /*
3056  * walkdir: recursively walk a directory, applying the action to each
3057  * regular file and directory (including the named directory itself).
3058  *
3059  * If process_symlinks is true, the action and recursion are also applied
3060  * to regular files and directories that are pointed to by symlinks in the
3061  * given directory; otherwise symlinks are ignored.  Symlinks are always
3062  * ignored in subdirectories, ie we intentionally don't pass down the
3063  * process_symlinks flag to recursive calls.
3064  *
3065  * Errors are reported at level elevel, which might be ERROR or less.
3066  *
3067  * See also walkdir in initdb.c, which is a frontend version of this logic.
3068  */
3069 static void
walkdir(const char * path,void (* action)(const char * fname,bool isdir,int elevel),bool process_symlinks,int elevel)3070 walkdir(const char *path,
3071 		void (*action) (const char *fname, bool isdir, int elevel),
3072 		bool process_symlinks,
3073 		int elevel)
3074 {
3075 	DIR		   *dir;
3076 	struct dirent *de;
3077 
3078 	dir = AllocateDir(path);
3079 	if (dir == NULL)
3080 	{
3081 		ereport(elevel,
3082 				(errcode_for_file_access(),
3083 				 errmsg("could not open directory \"%s\": %m", path)));
3084 		return;
3085 	}
3086 
3087 	while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3088 	{
3089 		char		subpath[MAXPGPATH * 2];
3090 		struct stat fst;
3091 		int			sret;
3092 
3093 		CHECK_FOR_INTERRUPTS();
3094 
3095 		if (strcmp(de->d_name, ".") == 0 ||
3096 			strcmp(de->d_name, "..") == 0)
3097 			continue;
3098 
3099 		snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3100 
3101 		if (process_symlinks)
3102 			sret = stat(subpath, &fst);
3103 		else
3104 			sret = lstat(subpath, &fst);
3105 
3106 		if (sret < 0)
3107 		{
3108 			ereport(elevel,
3109 					(errcode_for_file_access(),
3110 					 errmsg("could not stat file \"%s\": %m", subpath)));
3111 			continue;
3112 		}
3113 
3114 		if (S_ISREG(fst.st_mode))
3115 			(*action) (subpath, false, elevel);
3116 		else if (S_ISDIR(fst.st_mode))
3117 			walkdir(subpath, action, false, elevel);
3118 	}
3119 
3120 	FreeDir(dir);				/* we ignore any error here */
3121 
3122 	/*
3123 	 * It's important to fsync the destination directory itself as individual
3124 	 * file fsyncs don't guarantee that the directory entry for the file is
3125 	 * synced.
3126 	 */
3127 	(*action) (path, true, elevel);
3128 }
3129 
3130 
/*
 * pre_sync_fname: give the OS advance notice that this file is about to be
 * fsync()'d.
 *
 * Directories are skipped.  Failure to open a file because it is unreadable
 * (EACCES) is silently ignored; any other open failure is reported at the
 * caller-supplied elevel.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Flushing a directory would most likely just fail, so don't bother */
	if (isdir)
		return;

	fd = OpenTransientFile((char *) fname, O_RDONLY | PG_BINARY, 0);

	if (fd >= 0)
	{
		/*
		 * This is merely a hint, so it is fine that pg_flush_data() does not
		 * report errors.
		 */
		pg_flush_data(fd, 0, 0);

		(void) CloseTransientFile(fd);
		return;
	}

	/* Open failed: complain unless the file was simply unreadable. */
	if (errno != EACCES)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", fname)));
}

#endif							/* PG_FLUSH_DATA_WORKS */
3170 
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	/*
	 * Unreadable files are not worth complaining about here, so ask
	 * fsync_fname_ext() to silently ignore such errors (ignore_perm = true).
	 * Its result code is deliberately discarded.
	 */
	(void) fsync_fname_ext(fname, isdir, true, elevel);
}
3180 
3181 /*
3182  * fsync_fname_ext -- Try to fsync a file or directory
3183  *
3184  * If ignore_perm is true, ignore errors upon trying to open unreadable
3185  * files. Logs other errors at a caller-specified level.
3186  *
3187  * Returns 0 if the operation succeeded, -1 otherwise.
3188  */
3189 static int
fsync_fname_ext(const char * fname,bool isdir,bool ignore_perm,int elevel)3190 fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3191 {
3192 	int			fd;
3193 	int			flags;
3194 	int			returncode;
3195 
3196 	/*
3197 	 * Some OSs require directories to be opened read-only whereas other
3198 	 * systems don't allow us to fsync files opened read-only; so we need both
3199 	 * cases here.  Using O_RDWR will cause us to fail to fsync files that are
3200 	 * not writable by our userid, but we assume that's OK.
3201 	 */
3202 	flags = PG_BINARY;
3203 	if (!isdir)
3204 		flags |= O_RDWR;
3205 	else
3206 		flags |= O_RDONLY;
3207 
3208 	fd = OpenTransientFile((char *) fname, flags, 0);
3209 
3210 	/*
3211 	 * Some OSs don't allow us to open directories at all (Windows returns
3212 	 * EACCES), just ignore the error in that case.  If desired also silently
3213 	 * ignoring errors about unreadable files. Log others.
3214 	 */
3215 	if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3216 		return 0;
3217 	else if (fd < 0 && ignore_perm && errno == EACCES)
3218 		return 0;
3219 	else if (fd < 0)
3220 	{
3221 		ereport(elevel,
3222 				(errcode_for_file_access(),
3223 				 errmsg("could not open file \"%s\": %m", fname)));
3224 		return -1;
3225 	}
3226 
3227 	returncode = pg_fsync(fd);
3228 
3229 	/*
3230 	 * Some OSes don't allow us to fsync directories at all, so we can ignore
3231 	 * those errors. Anything else needs to be logged.
3232 	 */
3233 	if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3234 	{
3235 		int			save_errno;
3236 
3237 		/* close file upon error, might not be in transaction context */
3238 		save_errno = errno;
3239 		(void) CloseTransientFile(fd);
3240 		errno = save_errno;
3241 
3242 		ereport(elevel,
3243 				(errcode_for_file_access(),
3244 				 errmsg("could not fsync file \"%s\": %m", fname)));
3245 		return -1;
3246 	}
3247 
3248 	(void) CloseTransientFile(fd);
3249 
3250 	return 0;
3251 }
3252 
3253 /*
3254  * fsync_parent_path -- fsync the parent path of a file or directory
3255  *
3256  * This is aimed at making file operations persistent on disk in case of
3257  * an OS crash or power failure.
3258  */
3259 static int
fsync_parent_path(const char * fname,int elevel)3260 fsync_parent_path(const char *fname, int elevel)
3261 {
3262 	char		parentpath[MAXPGPATH];
3263 
3264 	strlcpy(parentpath, fname, MAXPGPATH);
3265 	get_parent_directory(parentpath);
3266 
3267 	/*
3268 	 * get_parent_directory() returns an empty string if the input argument is
3269 	 * just a file name (see comments in path.c), so handle that as being the
3270 	 * current directory.
3271 	 */
3272 	if (strlen(parentpath) == 0)
3273 		strlcpy(parentpath, ".", MAXPGPATH);
3274 
3275 	if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3276 		return -1;
3277 
3278 	return 0;
3279 }
3280 
3281 /*
3282  * Return the passed-in error level, or PANIC if data_sync_retry is off.
3283  *
3284  * Failure to fsync any data file is cause for immediate panic, unless
3285  * data_sync_retry is enabled.  Data may have been written to the operating
3286  * system and removed from our buffer pool already, and if we are running on
3287  * an operating system that forgets dirty data on write-back failure, there
3288  * may be only one copy of the data remaining: in the WAL.  A later attempt to
3289  * fsync again might falsely report success.  Therefore we must not allow any
3290  * further checkpoints to be attempted.  data_sync_retry can in theory be
3291  * enabled on systems known not to drop dirty buffered data on write-back
3292  * failure (with the likely outcome that checkpoints will continue to fail
3293  * until the underlying problem is fixed).
3294  *
3295  * Any code that reports a failure from fsync() or related functions should
3296  * filter the error level with this function.
3297  */
3298 int
data_sync_elevel(int elevel)3299 data_sync_elevel(int elevel)
3300 {
3301 	return data_sync_retry ? elevel : PANIC;
3302 }
3303