1 /*-------------------------------------------------------------------------
2 *
3 * fd.c
4 * Virtual file descriptor code.
5 *
6 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 * IDENTIFICATION
10 * src/backend/storage/file/fd.c
11 *
12 * NOTES:
13 *
14 * This code manages a cache of 'virtual' file descriptors (VFDs).
15 * The server opens many file descriptors for a variety of reasons,
16 * including base tables, scratch files (e.g., sort and hash spool
17 * files), and random calls to C library routines like system(3); it
18 * is quite easy to exceed system limits on the number of open files a
19 * single process can have. (This is around 1024 on many modern
20 * operating systems, but may be lower on others.)
21 *
22 * VFDs are managed as an LRU pool, with actual OS file descriptors
23 * being opened and closed as needed. Obviously, if a routine is
24 * opened using these interfaces, all subsequent operations must also
25 * be through these interfaces (the File type is not a real file
26 * descriptor).
27 *
28 * For this scheme to work, most (if not all) routines throughout the
29 * server should use these interfaces instead of calling the C library
30 * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31 * may find ourselves short of real file descriptors anyway.
32 *
33 * INTERFACE ROUTINES
34 *
35 * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36 * A File opened with OpenTemporaryFile is automatically deleted when the
37 * File is closed, either explicitly or implicitly at end of transaction or
38 * process exit. PathNameOpenFile is intended for files that are held open
39 * for a long time, like relation files. It is the caller's responsibility
40 * to close them, there is no automatic mechanism in fd.c for that.
41 *
42 * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43 * temporary files that have names so that they can be shared between
44 * backends. Such files are automatically closed and count against the
45 * temporary file limit of the backend that creates them, but unlike anonymous
46 * files they are not automatically deleted. See sharedfileset.c for a shared
47 * ownership mechanism that provides automatic cleanup for shared files when
48 * the last of a group of backends detaches.
49 *
50 * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51 * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52 * They behave like the corresponding native functions, except that the handle
53 * is registered with the current subtransaction, and will be automatically
54 * closed at abort. These are intended mainly for short operations like
55 * reading a configuration file; there is a limit on the number of files that
56 * can be opened using these functions at any one time.
57 *
58 * Finally, BasicOpenFile is just a thin wrapper around open() that can
59 * release file descriptors in use by the virtual file descriptors if
60 * necessary. There is no automatic cleanup of file descriptors returned by
61 * BasicOpenFile, it is solely the caller's responsibility to close the file
62 * descriptor by calling close(2).
63 *
64 *-------------------------------------------------------------------------
65 */
66
67 #include "postgres.h"
68
69 #include <sys/file.h>
70 #include <sys/param.h>
71 #include <sys/stat.h>
72 #ifndef WIN32
73 #include <sys/mman.h>
74 #endif
75 #include <limits.h>
76 #include <unistd.h>
77 #include <fcntl.h>
78 #ifdef HAVE_SYS_RESOURCE_H
79 #include <sys/resource.h> /* for getrlimit */
80 #endif
81
82 #include "miscadmin.h"
83 #include "access/xact.h"
84 #include "access/xlog.h"
85 #include "catalog/pg_tablespace.h"
86 #include "common/file_perm.h"
87 #include "pgstat.h"
88 #include "portability/mem.h"
89 #include "storage/fd.h"
90 #include "storage/ipc.h"
91 #include "utils/guc.h"
92 #include "utils/resowner_private.h"
93
94
95 /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
96 #if defined(HAVE_SYNC_FILE_RANGE)
97 #define PG_FLUSH_DATA_WORKS 1
98 #elif !defined(WIN32) && defined(MS_ASYNC)
99 #define PG_FLUSH_DATA_WORKS 1
100 #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
101 #define PG_FLUSH_DATA_WORKS 1
102 #endif
103
104 /*
105 * We must leave some file descriptors free for system(), the dynamic loader,
106 * and other code that tries to open files without consulting fd.c. This
107 * is the number left free. (While we can be pretty sure we won't get
108 * EMFILE, there's never any guarantee that we won't get ENFILE due to
109 * other processes chewing up FDs. So it's a bad idea to try to open files
110 * without consulting fd.c. Nonetheless we cannot control all code.)
111 *
112 * Because this is just a fixed setting, we are effectively assuming that
113 * no such code will leave FDs open over the long term; otherwise the slop
114 * is likely to be insufficient. Note in particular that we expect that
115 * loading a shared library does not result in any permanent increase in
116 * the number of open files. (This appears to be true on most if not
117 * all platforms as of Feb 2004.)
118 */
119 #define NUM_RESERVED_FDS 10
120
121 /*
122 * If we have fewer than this many usable FDs after allowing for the reserved
123 * ones, choke.
124 */
125 #define FD_MINFREE 10
126
127 /*
128 * A number of platforms allow individual processes to open many more files
129 * than they can really support when *many* processes do the same thing.
130 * This GUC parameter lets the DBA limit max_safe_fds to something less than
131 * what the postmaster's initial probe suggests will work.
132 */
133 int max_files_per_process = 1000;
134
135 /*
136 * Maximum number of file descriptors to open for either VFD entries or
137 * AllocateFile/AllocateDir/OpenTransientFile operations. This is initialized
138 * to a conservative value, and remains that way indefinitely in bootstrap or
139 * standalone-backend cases. In normal postmaster operation, the postmaster
140 * calls set_max_safe_fds() late in initialization to update the value, and
141 * that value is then inherited by forked subprocesses.
142 *
143 * Note: the value of max_files_per_process is taken into account while
144 * setting this variable, and so need not be tested separately.
145 */
146 int max_safe_fds = 32; /* default if not changed */
147
148 /* Whether it is safe to continue running after fsync() fails. */
149 bool data_sync_retry = false;
150
151 /* Debugging.... */
152
153 #ifdef FDDEBUG
154 #define DO_DB(A) \
155 do { \
156 int _do_db_save_errno = errno; \
157 A; \
158 errno = _do_db_save_errno; \
159 } while (0)
160 #else
161 #define DO_DB(A) \
162 ((void) 0)
163 #endif
164
165 #define VFD_CLOSED (-1)
166
167 #define FileIsValid(file) \
168 ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
169
170 #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
171
172 /* these are the assigned bits in fdstate below: */
173 #define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */
174 #define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */
175 #define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */
176
/*
 * One slot of the VfdCache array; a 'File' value is the index of such a slot.
 *
 * A slot is "virtually open" when fileName is non-NULL.  It additionally
 * holds a real kernel descriptor only while fd is not VFD_CLOSED; per the
 * notes on the LRU ring, only slots with a real descriptor are linked into
 * that ring.
 */
typedef struct vfd
{
	int			fd;				/* current FD, or VFD_CLOSED if none */
	unsigned short fdstate;		/* bitflags for VFD's state */
	ResourceOwner resowner;		/* owner, for automatic cleanup */
	File		nextFree;		/* link to next free VFD, if in freelist */
	File		lruMoreRecently;	/* doubly linked recency-of-use list */
	File		lruLessRecently;
	off_t		fileSize;		/* current size of file (0 if not temporary) */
	char	   *fileName;		/* name of file, or NULL for unused VFD */
	/* NB: fileName is malloc'd, and must be free'd when closing the VFD */
	int			fileFlags;		/* open(2) flags for (re)opening the file */
	mode_t		fileMode;		/* mode to pass to open(2) */
} Vfd;
191
192 /*
193 * Virtual File Descriptor array pointer and size. This grows as
194 * needed. 'File' values are indexes into this array.
195 * Note that VfdCache[0] is not a usable VFD, just a list header.
196 */
197 static Vfd *VfdCache;
198 static Size SizeVfdCache = 0;
199
200 /*
201 * Number of file descriptors known to be in use by VFD entries.
202 */
203 static int nfile = 0;
204
205 /*
206 * Flag to tell whether it's worth scanning VfdCache looking for temp files
207 * to close
208 */
209 static bool have_xact_temporary_files = false;
210
211 /*
212 * Tracks the total size of all temporary files. Note: when temp_file_limit
213 * is being enforced, this cannot overflow since the limit cannot be more
214 * than INT_MAX kilobytes. When not enforcing, it could theoretically
215 * overflow, but we don't care.
216 */
217 static uint64 temporary_files_size = 0;
218
219 /*
220 * List of OS handles opened with AllocateFile, AllocateDir and
221 * OpenTransientFile.
222 */
typedef enum
{
	AllocateDescFile,			/* presumably a FILE * from AllocateFile() --
								 * confirm against the users of this enum */
	AllocateDescPipe,			/* presumably a FILE * from OpenPipeStream() --
								 * confirm */
	AllocateDescDir,			/* presumably a DIR * from AllocateDir() --
								 * confirm */
	AllocateDescRawFD			/* presumably a kernel FD from
								 * OpenTransientFile() -- confirm */
} AllocateDescKind;
230
/*
 * One tracked OS-level handle.  Entries live in the allocatedDescs array
 * declared just below.
 */
typedef struct
{
	AllocateDescKind kind;		/* what sort of handle this entry tracks */
	SubTransactionId create_subid;	/* NOTE(review): looks like the
									 * subtransaction the handle was
									 * registered with -- confirm */
	union
	{
		FILE	   *file;		/* stdio stream */
		DIR		   *dir;		/* directory stream */
		int			fd;			/* plain file descriptor */
	}			desc;			/* the handle itself; presumably the member
								 * to use is selected by 'kind' -- confirm */
} AllocateDesc;
242
243 static int numAllocatedDescs = 0;
244 static int maxAllocatedDescs = 0;
245 static AllocateDesc *allocatedDescs = NULL;
246
247 /*
248 * Number of temporary files opened during the current session;
249 * this is used in generation of tempfile names.
250 */
251 static long tempFileCounter = 0;
252
253 /*
254 * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid,
255 * indicating that the current database's default tablespace should be used.)
256 * When numTempTableSpaces is -1, this has not been set in the current
257 * transaction.
258 */
259 static Oid *tempTableSpaces = NULL;
260 static int numTempTableSpaces = -1;
261 static int nextTempTableSpace = 0;
262
263
264 /*--------------------
265 *
266 * Private Routines
267 *
268 * Delete - delete a file from the Lru ring
269 * LruDelete - remove a file from the Lru ring and close its FD
270 * Insert - put a file at the front of the Lru ring
271 * LruInsert - put a file at the front of the Lru ring and open it
272 * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
273 * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
274 * AllocateVfd - grab a free (or new) file record (from VfdArray)
275 * FreeVfd - free a file record
276 *
277 * The Least Recently Used ring is a doubly linked list that begins and
278 * ends on element zero. Element zero is special -- it doesn't represent
279 * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
280 * anchor that shows us the beginning/end of the ring.
281 * Only VFD elements that are currently really open (have an FD assigned) are
282 * in the Lru ring. Elements that are "virtually" open can be recognized
283 * by having a non-null fileName field.
284 *
285 * example:
286 *
287 * /--less----\ /---------\
288 * v \ v \
289 * #0 --more---> LeastRecentlyUsed --more-\ \
290 * ^\ | |
291 * \\less--> MostRecentlyUsedFile <---/ |
292 * \more---/ \--less--/
293 *
294 *--------------------
295 */
296 static void Delete(File file);
297 static void LruDelete(File file);
298 static void Insert(File file);
299 static int LruInsert(File file);
300 static bool ReleaseLruFile(void);
301 static void ReleaseLruFiles(void);
302 static File AllocateVfd(void);
303 static void FreeVfd(File file);
304
305 static int FileAccess(File file);
306 static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
307 static bool reserveAllocatedDesc(void);
308 static int FreeDesc(AllocateDesc *desc);
309
310 static void AtProcExit_Files(int code, Datum arg);
311 static void CleanupTempFiles(bool isCommit, bool isProcExit);
312 static void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok,
313 bool unlink_all);
314 static void RemovePgTempRelationFiles(const char *tsdirname);
315 static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
316
317 static void walkdir(const char *path,
318 void (*action) (const char *fname, bool isdir, int elevel),
319 bool process_symlinks,
320 int elevel);
321 #ifdef PG_FLUSH_DATA_WORKS
322 static void pre_sync_fname(const char *fname, bool isdir, int elevel);
323 #endif
324 static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
325 static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
326
327 static int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
328 static int fsync_parent_path(const char *fname, int elevel);
329
330
/*
 * pg_fsync --- do fsync with or without writethrough
 *
 * Hands the descriptor to pg_fsync_writethrough() when sync_method selects
 * write-through, and to pg_fsync_no_writethrough() otherwise.  The result of
 * whichever routine ran is returned unchanged.
 */
int
pg_fsync(int fd)
{
	/*
	 * The sync_method test is compiled only when HAVE_FSYNC_WRITETHROUGH is
	 * defined and FSYNC_WRITETHROUGH_IS_FSYNC is not; in every other build
	 * there is nothing to choose between.
	 */
#if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
	if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
		return pg_fsync_writethrough(fd);
#endif

	return pg_fsync_no_writethrough(fd);
}
345
346
347 /*
348 * pg_fsync_no_writethrough --- same as fsync except does nothing if
349 * enableFsync is off
350 */
351 int
pg_fsync_no_writethrough(int fd)352 pg_fsync_no_writethrough(int fd)
353 {
354 if (enableFsync)
355 return fsync(fd);
356 else
357 return 0;
358 }
359
360 /*
361 * pg_fsync_writethrough
362 */
363 int
pg_fsync_writethrough(int fd)364 pg_fsync_writethrough(int fd)
365 {
366 if (enableFsync)
367 {
368 #ifdef WIN32
369 return _commit(fd);
370 #elif defined(F_FULLFSYNC)
371 return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
372 #else
373 errno = ENOSYS;
374 return -1;
375 #endif
376 }
377 else
378 return 0;
379 }
380
381 /*
382 * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
383 *
384 * Not all platforms have fdatasync; treat as fsync if not available.
385 */
386 int
pg_fdatasync(int fd)387 pg_fdatasync(int fd)
388 {
389 if (enableFsync)
390 {
391 #ifdef HAVE_FDATASYNC
392 return fdatasync(fd);
393 #else
394 return fsync(fd);
395 #endif
396 }
397 else
398 return 0;
399 }
400
401 /*
402 * pg_flush_data --- advise OS that the described dirty data should be flushed
403 *
404 * offset of 0 with nbytes 0 means that the entire file should be flushed
405 */
406 void
pg_flush_data(int fd,off_t offset,off_t nbytes)407 pg_flush_data(int fd, off_t offset, off_t nbytes)
408 {
409 /*
410 * Right now file flushing is primarily used to avoid making later
411 * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
412 * if fsyncs are disabled - that's a decision we might want to make
413 * configurable at some point.
414 */
415 if (!enableFsync)
416 return;
417
418 /*
419 * We compile all alternatives that are supported on the current platform,
420 * to find portability problems more easily.
421 */
422 #if defined(HAVE_SYNC_FILE_RANGE)
423 {
424 int rc;
425 static bool not_implemented_by_kernel = false;
426
427 if (not_implemented_by_kernel)
428 return;
429
430 /*
431 * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
432 * tells the OS that writeback for the specified blocks should be
433 * started, but that we don't want to wait for completion. Note that
434 * this call might block if too much dirty data exists in the range.
435 * This is the preferable method on OSs supporting it, as it works
436 * reliably when available (contrast to msync()) and doesn't flush out
437 * clean data (like FADV_DONTNEED).
438 */
439 rc = sync_file_range(fd, offset, nbytes,
440 SYNC_FILE_RANGE_WRITE);
441 if (rc != 0)
442 {
443 int elevel;
444
445 /*
446 * For systems that don't have an implementation of
447 * sync_file_range() such as Windows WSL, generate only one
448 * warning and then suppress all further attempts by this process.
449 */
450 if (errno == ENOSYS)
451 {
452 elevel = WARNING;
453 not_implemented_by_kernel = true;
454 }
455 else
456 elevel = data_sync_elevel(WARNING);
457
458 ereport(elevel,
459 (errcode_for_file_access(),
460 errmsg("could not flush dirty data: %m")));
461 }
462
463 return;
464 }
465 #endif
466 #if !defined(WIN32) && defined(MS_ASYNC)
467 {
468 void *p;
469 static int pagesize = 0;
470
471 /*
472 * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
473 * writeback. On linux it only does so if MS_SYNC is specified, but
474 * then it does the writeback synchronously. Luckily all common linux
475 * systems have sync_file_range(). This is preferable over
476 * FADV_DONTNEED because it doesn't flush out clean data.
477 *
478 * We map the file (mmap()), tell the kernel to sync back the contents
479 * (msync()), and then remove the mapping again (munmap()).
480 */
481
482 /* mmap() needs actual length if we want to map whole file */
483 if (offset == 0 && nbytes == 0)
484 {
485 nbytes = lseek(fd, 0, SEEK_END);
486 if (nbytes < 0)
487 {
488 ereport(WARNING,
489 (errcode_for_file_access(),
490 errmsg("could not determine dirty data size: %m")));
491 return;
492 }
493 }
494
495 /*
496 * Some platforms reject partial-page mmap() attempts. To deal with
497 * that, just truncate the request to a page boundary. If any extra
498 * bytes don't get flushed, well, it's only a hint anyway.
499 */
500
501 /* fetch pagesize only once */
502 if (pagesize == 0)
503 pagesize = sysconf(_SC_PAGESIZE);
504
505 /* align length to pagesize, dropping any fractional page */
506 if (pagesize > 0)
507 nbytes = (nbytes / pagesize) * pagesize;
508
509 /* fractional-page request is a no-op */
510 if (nbytes <= 0)
511 return;
512
513 /*
514 * mmap could well fail, particularly on 32-bit platforms where there
515 * may simply not be enough address space. If so, silently fall
516 * through to the next implementation.
517 */
518 if (nbytes <= (off_t) SSIZE_MAX)
519 p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
520 else
521 p = MAP_FAILED;
522
523 if (p != MAP_FAILED)
524 {
525 int rc;
526
527 rc = msync(p, (size_t) nbytes, MS_ASYNC);
528 if (rc != 0)
529 {
530 ereport(data_sync_elevel(WARNING),
531 (errcode_for_file_access(),
532 errmsg("could not flush dirty data: %m")));
533 /* NB: need to fall through to munmap()! */
534 }
535
536 rc = munmap(p, (size_t) nbytes);
537 if (rc != 0)
538 {
539 /* FATAL error because mapping would remain */
540 ereport(FATAL,
541 (errcode_for_file_access(),
542 errmsg("could not munmap() while flushing data: %m")));
543 }
544
545 return;
546 }
547 }
548 #endif
549 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
550 {
551 int rc;
552
553 /*
554 * Signal the kernel that the passed in range should not be cached
555 * anymore. This has the, desired, side effect of writing out dirty
556 * data, and the, undesired, side effect of likely discarding useful
557 * clean cached blocks. For the latter reason this is the least
558 * preferable method.
559 */
560
561 rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
562
563 if (rc != 0)
564 {
565 /* don't error out, this is just a performance optimization */
566 ereport(WARNING,
567 (errcode_for_file_access(),
568 errmsg("could not flush dirty data: %m")));
569 }
570
571 return;
572 }
573 #endif
574 }
575
576
577 /*
578 * fsync_fname -- fsync a file or directory, handling errors properly
579 *
580 * Try to fsync a file or directory. When doing the latter, ignore errors that
581 * indicate the OS just doesn't allow/require fsyncing directories.
582 */
583 void
fsync_fname(const char * fname,bool isdir)584 fsync_fname(const char *fname, bool isdir)
585 {
586 fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
587 }
588
589 /*
590 * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
591 *
592 * This routine ensures that, after returning, the effect of renaming file
593 * persists in case of a crash. A crash while this routine is running will
594 * leave you with either the pre-existing or the moved file in place of the
595 * new file; no mixed state or truncated files are possible.
596 *
597 * It does so by using fsync on the old filename and the possibly existing
598 * target filename before the rename, and the target file and directory after.
599 *
600 * Note that rename() cannot be used across arbitrary directories, as they
601 * might not be on the same filesystem. Therefore this routine does not
602 * support renaming across directories.
603 *
604 * Log errors with the caller specified severity.
605 *
606 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
607 * valid upon return.
608 */
609 int
durable_rename(const char * oldfile,const char * newfile,int elevel)610 durable_rename(const char *oldfile, const char *newfile, int elevel)
611 {
612 int fd;
613
614 /*
615 * First fsync the old and target path (if it exists), to ensure that they
616 * are properly persistent on disk. Syncing the target file is not
617 * strictly necessary, but it makes it easier to reason about crashes;
618 * because it's then guaranteed that either source or target file exists
619 * after a crash.
620 */
621 if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
622 return -1;
623
624 fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
625 if (fd < 0)
626 {
627 if (errno != ENOENT)
628 {
629 ereport(elevel,
630 (errcode_for_file_access(),
631 errmsg("could not open file \"%s\": %m", newfile)));
632 return -1;
633 }
634 }
635 else
636 {
637 if (pg_fsync(fd) != 0)
638 {
639 int save_errno;
640
641 /* close file upon error, might not be in transaction context */
642 save_errno = errno;
643 CloseTransientFile(fd);
644 errno = save_errno;
645
646 ereport(elevel,
647 (errcode_for_file_access(),
648 errmsg("could not fsync file \"%s\": %m", newfile)));
649 return -1;
650 }
651
652 if (CloseTransientFile(fd))
653 {
654 ereport(elevel,
655 (errcode_for_file_access(),
656 errmsg("could not close file \"%s\": %m", newfile)));
657 return -1;
658 }
659 }
660
661 /* Time to do the real deal... */
662 if (rename(oldfile, newfile) < 0)
663 {
664 ereport(elevel,
665 (errcode_for_file_access(),
666 errmsg("could not rename file \"%s\" to \"%s\": %m",
667 oldfile, newfile)));
668 return -1;
669 }
670
671 /*
672 * To guarantee renaming the file is persistent, fsync the file with its
673 * new name, and its containing directory.
674 */
675 if (fsync_fname_ext(newfile, false, false, elevel) != 0)
676 return -1;
677
678 if (fsync_parent_path(newfile, elevel) != 0)
679 return -1;
680
681 return 0;
682 }
683
/*
 * durable_unlink -- remove a file in a durable manner
 *
 * Once this routine has returned successfully, the removal persists even if
 * the system crashes.  A crash while the routine is running will leave the
 * system in no mixed state.
 *
 * This is achieved by fsyncing the parent directory of the file after the
 * file itself has been removed.
 *
 * Errors are logged with the severity given by the caller in elevel.
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  Note that errno is not
 * valid upon return.
 */
int
durable_unlink(const char *fname, int elevel)
{
	int			parent_rc;

	if (unlink(fname) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove file \"%s\": %m",
						fname)));
		return -1;
	}

	/*
	 * The directory entry is gone; fsync the parent directory so that the
	 * removal itself is persistent.
	 */
	parent_rc = fsync_parent_path(fname, elevel);

	return (parent_rc != 0) ? -1 : 0;
}
720
/*
 * durable_link_or_rename -- rename a file in a durable manner.
 *
 * Like durable_rename(), except that this routine tries (but does not
 * guarantee) not to overwrite the target file.
 *
 * When HAVE_WORKING_LINK is set, the new name is created with link() and the
 * old name is then removed with unlink(); otherwise plain rename() is used.
 *
 * Note that a crash in an unfortunate moment can leave you with two links to
 * the target file.
 *
 * Errors are logged with the severity given by the caller in elevel.
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  Note that errno is not
 * valid upon return.
 */
int
durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
{
	/*
	 * Flush the source file first, so that a crash directly after the
	 * rename/link still leaves a file with valid contents in place.
	 */
	if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
		return -1;

#if HAVE_WORKING_LINK
	if (link(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not link file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}

	/* the new name exists now; drop the old one (result is not checked) */
	unlink(oldfile);
#else
	/* XXX: Add racy file existence check? */
	if (rename(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}
#endif

	/*
	 * To make the change survive an OS crash, flush the new entry first and
	 * then its parent directory.  The directory is only attempted if the
	 * file flush succeeded.
	 */
	if (fsync_fname_ext(newfile, false, false, elevel) != 0 ||
		fsync_parent_path(newfile, elevel) != 0)
		return -1;

	return 0;
}
780
781 /*
782 * InitFileAccess --- initialize this module during backend startup
783 *
784 * This is called during either normal or standalone backend start.
785 * It is *not* called in the postmaster.
786 */
787 void
InitFileAccess(void)788 InitFileAccess(void)
789 {
790 Assert(SizeVfdCache == 0); /* call me only once */
791
792 /* initialize cache header entry */
793 VfdCache = (Vfd *) malloc(sizeof(Vfd));
794 if (VfdCache == NULL)
795 ereport(FATAL,
796 (errcode(ERRCODE_OUT_OF_MEMORY),
797 errmsg("out of memory")));
798
799 MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
800 VfdCache->fd = VFD_CLOSED;
801
802 SizeVfdCache = 1;
803
804 /* register proc-exit hook to ensure temp files are dropped at exit */
805 on_proc_exit(AtProcExit_Files, 0);
806 }
807
808 /*
809 * count_usable_fds --- count how many FDs the system will let us open,
810 * and estimate how many are already open.
811 *
812 * We stop counting if usable_fds reaches max_to_probe. Note: a small
813 * value of max_to_probe might result in an underestimate of already_open;
814 * we must fill in any "gaps" in the set of used FDs before the calculation
815 * of already_open will give the right answer. In practice, max_to_probe
816 * of a couple of dozen should be enough to ensure good results.
817 *
818 * We assume stdin (FD 0) is available for dup'ing
819 */
820 static void
count_usable_fds(int max_to_probe,int * usable_fds,int * already_open)821 count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
822 {
823 int *fd;
824 int size;
825 int used = 0;
826 int highestfd = 0;
827 int j;
828
829 #ifdef HAVE_GETRLIMIT
830 struct rlimit rlim;
831 int getrlimit_status;
832 #endif
833
834 size = 1024;
835 fd = (int *) palloc(size * sizeof(int));
836
837 #ifdef HAVE_GETRLIMIT
838 #ifdef RLIMIT_NOFILE /* most platforms use RLIMIT_NOFILE */
839 getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
840 #else /* but BSD doesn't ... */
841 getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
842 #endif /* RLIMIT_NOFILE */
843 if (getrlimit_status != 0)
844 ereport(WARNING, (errmsg("getrlimit failed: %m")));
845 #endif /* HAVE_GETRLIMIT */
846
847 /* dup until failure or probe limit reached */
848 for (;;)
849 {
850 int thisfd;
851
852 #ifdef HAVE_GETRLIMIT
853
854 /*
855 * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
856 * some platforms
857 */
858 if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
859 break;
860 #endif
861
862 thisfd = dup(0);
863 if (thisfd < 0)
864 {
865 /* Expect EMFILE or ENFILE, else it's fishy */
866 if (errno != EMFILE && errno != ENFILE)
867 elog(WARNING, "dup(0) failed after %d successes: %m", used);
868 break;
869 }
870
871 if (used >= size)
872 {
873 size *= 2;
874 fd = (int *) repalloc(fd, size * sizeof(int));
875 }
876 fd[used++] = thisfd;
877
878 if (highestfd < thisfd)
879 highestfd = thisfd;
880
881 if (used >= max_to_probe)
882 break;
883 }
884
885 /* release the files we opened */
886 for (j = 0; j < used; j++)
887 close(fd[j]);
888
889 pfree(fd);
890
891 /*
892 * Return results. usable_fds is just the number of successful dups. We
893 * assume that the system limit is highestfd+1 (remember 0 is a legal FD
894 * number) and so already_open is highestfd+1 - usable_fds.
895 */
896 *usable_fds = used;
897 *already_open = highestfd + 1 - used;
898 }
899
900 /*
901 * set_max_safe_fds
902 * Determine number of filedescriptors that fd.c is allowed to use
903 */
904 void
set_max_safe_fds(void)905 set_max_safe_fds(void)
906 {
907 int usable_fds;
908 int already_open;
909
910 /*----------
911 * We want to set max_safe_fds to
912 * MIN(usable_fds, max_files_per_process - already_open)
913 * less the slop factor for files that are opened without consulting
914 * fd.c. This ensures that we won't exceed either max_files_per_process
915 * or the experimentally-determined EMFILE limit.
916 *----------
917 */
918 count_usable_fds(max_files_per_process,
919 &usable_fds, &already_open);
920
921 max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
922
923 /*
924 * Take off the FDs reserved for system() etc.
925 */
926 max_safe_fds -= NUM_RESERVED_FDS;
927
928 /*
929 * Make sure we still have enough to get by.
930 */
931 if (max_safe_fds < FD_MINFREE)
932 ereport(FATAL,
933 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
934 errmsg("insufficient file descriptors available to start server process"),
935 errdetail("System allows %d, we need at least %d.",
936 max_safe_fds + NUM_RESERVED_FDS,
937 FD_MINFREE + NUM_RESERVED_FDS)));
938
939 elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
940 max_safe_fds, usable_fds, already_open);
941 }
942
943 /*
944 * Open a file with BasicOpenFilePerm() and pass default file mode for the
945 * fileMode parameter.
946 */
947 int
BasicOpenFile(const char * fileName,int fileFlags)948 BasicOpenFile(const char *fileName, int fileFlags)
949 {
950 return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
951 }
952
953 /*
954 * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
955 *
956 * This is exported for use by places that really want a plain kernel FD,
957 * but need to be proof against running out of FDs. Once an FD has been
958 * successfully returned, it is the caller's responsibility to ensure that
959 * it will not be leaked on ereport()! Most users should *not* call this
960 * routine directly, but instead use the VFD abstraction level, which
961 * provides protection against descriptor leaks as well as management of
962 * files that need to be open for more than a short period of time.
963 *
964 * Ideally this should be the *only* direct call of open() in the backend.
965 * In practice, the postmaster calls open() directly, and there are some
966 * direct open() calls done early in backend startup. Those are OK since
967 * this module wouldn't have any open files to close at that point anyway.
968 */
969 int
BasicOpenFilePerm(const char * fileName,int fileFlags,mode_t fileMode)970 BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
971 {
972 int fd;
973
974 tryAgain:
975 fd = open(fileName, fileFlags, fileMode);
976
977 if (fd >= 0)
978 return fd; /* success! */
979
980 if (errno == EMFILE || errno == ENFILE)
981 {
982 int save_errno = errno;
983
984 ereport(LOG,
985 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
986 errmsg("out of file descriptors: %m; release and retry")));
987 errno = 0;
988 if (ReleaseLruFile())
989 goto tryAgain;
990 errno = save_errno;
991 }
992
993 return -1; /* failure */
994 }
995
#if defined(FDDEBUG)

/*
 * _dump_lru --- debugging aid: log the LRU ring, most recently used first
 *
 * The ring is walked via the lruLessRecently links starting from the
 * VfdCache[0] header until we arrive back at entry 0.
 */
static void
_dump_lru(void)
{
	char		buf[2048];
	int			cur = VfdCache[0].lruLessRecently;
	size_t		used;

	snprintf(buf, sizeof(buf), "LRU: MOST %d ", cur);
	while (cur != 0)
	{
		cur = VfdCache[cur].lruLessRecently;
		used = strlen(buf);
		snprintf(buf + used, sizeof(buf) - used, "%d ", cur);
	}
	used = strlen(buf);
	snprintf(buf + used, sizeof(buf) - used, "LEAST");
	elog(LOG, "%s", buf);
}
#endif							/* FDDEBUG */
1016
1017 static void
Delete(File file)1018 Delete(File file)
1019 {
1020 Vfd *vfdP;
1021
1022 Assert(file != 0);
1023
1024 DO_DB(elog(LOG, "Delete %d (%s)",
1025 file, VfdCache[file].fileName));
1026 DO_DB(_dump_lru());
1027
1028 vfdP = &VfdCache[file];
1029
1030 VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1031 VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1032
1033 DO_DB(_dump_lru());
1034 }
1035
1036 static void
LruDelete(File file)1037 LruDelete(File file)
1038 {
1039 Vfd *vfdP;
1040
1041 Assert(file != 0);
1042
1043 DO_DB(elog(LOG, "LruDelete %d (%s)",
1044 file, VfdCache[file].fileName));
1045
1046 vfdP = &VfdCache[file];
1047
1048 /*
1049 * Close the file. We aren't expecting this to fail; if it does, better
1050 * to leak the FD than to mess up our internal state.
1051 */
1052 if (close(vfdP->fd))
1053 elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1054 "could not close file \"%s\": %m", vfdP->fileName);
1055 vfdP->fd = VFD_CLOSED;
1056 --nfile;
1057
1058 /* delete the vfd record from the LRU ring */
1059 Delete(file);
1060 }
1061
1062 static void
Insert(File file)1063 Insert(File file)
1064 {
1065 Vfd *vfdP;
1066
1067 Assert(file != 0);
1068
1069 DO_DB(elog(LOG, "Insert %d (%s)",
1070 file, VfdCache[file].fileName));
1071 DO_DB(_dump_lru());
1072
1073 vfdP = &VfdCache[file];
1074
1075 vfdP->lruMoreRecently = 0;
1076 vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1077 VfdCache[0].lruLessRecently = file;
1078 VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1079
1080 DO_DB(_dump_lru());
1081 }
1082
1083 /* returns 0 on success, -1 on re-open failure (with errno set) */
1084 static int
LruInsert(File file)1085 LruInsert(File file)
1086 {
1087 Vfd *vfdP;
1088
1089 Assert(file != 0);
1090
1091 DO_DB(elog(LOG, "LruInsert %d (%s)",
1092 file, VfdCache[file].fileName));
1093
1094 vfdP = &VfdCache[file];
1095
1096 if (FileIsNotOpen(file))
1097 {
1098 /* Close excess kernel FDs. */
1099 ReleaseLruFiles();
1100
1101 /*
1102 * The open could still fail for lack of file descriptors, eg due to
1103 * overall system file table being full. So, be prepared to release
1104 * another FD if necessary...
1105 */
1106 vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1107 vfdP->fileMode);
1108 if (vfdP->fd < 0)
1109 {
1110 DO_DB(elog(LOG, "re-open failed: %m"));
1111 return -1;
1112 }
1113 else
1114 {
1115 ++nfile;
1116 }
1117 }
1118
1119 /*
1120 * put it at the head of the Lru ring
1121 */
1122
1123 Insert(file);
1124
1125 return 0;
1126 }
1127
1128 /*
1129 * Release one kernel FD by closing the least-recently-used VFD.
1130 */
1131 static bool
ReleaseLruFile(void)1132 ReleaseLruFile(void)
1133 {
1134 DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1135
1136 if (nfile > 0)
1137 {
1138 /*
1139 * There are opened files and so there should be at least one used vfd
1140 * in the ring.
1141 */
1142 Assert(VfdCache[0].lruMoreRecently != 0);
1143 LruDelete(VfdCache[0].lruMoreRecently);
1144 return true; /* freed a file */
1145 }
1146 return false; /* no files available to free */
1147 }
1148
1149 /*
1150 * Release kernel FDs as needed to get under the max_safe_fds limit.
1151 * After calling this, it's OK to try to open another file.
1152 */
1153 static void
ReleaseLruFiles(void)1154 ReleaseLruFiles(void)
1155 {
1156 while (nfile + numAllocatedDescs >= max_safe_fds)
1157 {
1158 if (!ReleaseLruFile())
1159 break;
1160 }
1161 }
1162
1163 static File
AllocateVfd(void)1164 AllocateVfd(void)
1165 {
1166 Index i;
1167 File file;
1168
1169 DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1170
1171 Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1172
1173 if (VfdCache[0].nextFree == 0)
1174 {
1175 /*
1176 * The free list is empty so it is time to increase the size of the
1177 * array. We choose to double it each time this happens. However,
1178 * there's not much point in starting *real* small.
1179 */
1180 Size newCacheSize = SizeVfdCache * 2;
1181 Vfd *newVfdCache;
1182
1183 if (newCacheSize < 32)
1184 newCacheSize = 32;
1185
1186 /*
1187 * Be careful not to clobber VfdCache ptr if realloc fails.
1188 */
1189 newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1190 if (newVfdCache == NULL)
1191 ereport(ERROR,
1192 (errcode(ERRCODE_OUT_OF_MEMORY),
1193 errmsg("out of memory")));
1194 VfdCache = newVfdCache;
1195
1196 /*
1197 * Initialize the new entries and link them into the free list.
1198 */
1199 for (i = SizeVfdCache; i < newCacheSize; i++)
1200 {
1201 MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1202 VfdCache[i].nextFree = i + 1;
1203 VfdCache[i].fd = VFD_CLOSED;
1204 }
1205 VfdCache[newCacheSize - 1].nextFree = 0;
1206 VfdCache[0].nextFree = SizeVfdCache;
1207
1208 /*
1209 * Record the new size
1210 */
1211 SizeVfdCache = newCacheSize;
1212 }
1213
1214 file = VfdCache[0].nextFree;
1215
1216 VfdCache[0].nextFree = VfdCache[file].nextFree;
1217
1218 return file;
1219 }
1220
1221 static void
FreeVfd(File file)1222 FreeVfd(File file)
1223 {
1224 Vfd *vfdP = &VfdCache[file];
1225
1226 DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1227 file, vfdP->fileName ? vfdP->fileName : ""));
1228
1229 if (vfdP->fileName != NULL)
1230 {
1231 free(vfdP->fileName);
1232 vfdP->fileName = NULL;
1233 }
1234 vfdP->fdstate = 0x0;
1235
1236 vfdP->nextFree = VfdCache[0].nextFree;
1237 VfdCache[0].nextFree = file;
1238 }
1239
1240 /* returns 0 on success, -1 on re-open failure (with errno set) */
1241 static int
FileAccess(File file)1242 FileAccess(File file)
1243 {
1244 int returnValue;
1245
1246 DO_DB(elog(LOG, "FileAccess %d (%s)",
1247 file, VfdCache[file].fileName));
1248
1249 /*
1250 * Is the file open? If not, open it and put it at the head of the LRU
1251 * ring (possibly closing the least recently used file to get an FD).
1252 */
1253
1254 if (FileIsNotOpen(file))
1255 {
1256 returnValue = LruInsert(file);
1257 if (returnValue != 0)
1258 return returnValue;
1259 }
1260 else if (VfdCache[0].lruLessRecently != file)
1261 {
1262 /*
1263 * We now know that the file is open and that it is not the last one
1264 * accessed, so we need to move it to the head of the Lru ring.
1265 */
1266
1267 Delete(file);
1268 Insert(file);
1269 }
1270
1271 return 0;
1272 }
1273
1274 /*
1275 * Called whenever a temporary file is deleted to report its size.
1276 */
1277 static void
ReportTemporaryFileUsage(const char * path,off_t size)1278 ReportTemporaryFileUsage(const char *path, off_t size)
1279 {
1280 pgstat_report_tempfile(size);
1281
1282 if (log_temp_files >= 0)
1283 {
1284 if ((size / 1024) >= log_temp_files)
1285 ereport(LOG,
1286 (errmsg("temporary file: path \"%s\", size %lu",
1287 path, (unsigned long) size)));
1288 }
1289 }
1290
1291 /*
1292 * Called to register a temporary file for automatic close.
1293 * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
1294 * before the file was opened.
1295 */
1296 static void
RegisterTemporaryFile(File file)1297 RegisterTemporaryFile(File file)
1298 {
1299 ResourceOwnerRememberFile(CurrentResourceOwner, file);
1300 VfdCache[file].resowner = CurrentResourceOwner;
1301
1302 /* Backup mechanism for closing at end of xact. */
1303 VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
1304 have_xact_temporary_files = true;
1305 }
1306
/*
 * FileInvalidate --- called when we get a shared invalidation message on
 * some relation
 *
 * Closes the kernel FD of the given VFD if it is currently open.
 */
#ifdef NOT_USED
void
FileInvalidate(File file)
{
	Assert(FileIsValid(file));

	if (FileIsNotOpen(file))
		return;					/* nothing to close */

	LruDelete(file);
}
#endif
1319
1320 /*
1321 * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1322 * fileMode parameter.
1323 */
1324 File
PathNameOpenFile(const char * fileName,int fileFlags)1325 PathNameOpenFile(const char *fileName, int fileFlags)
1326 {
1327 return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1328 }
1329
1330 /*
1331 * open a file in an arbitrary directory
1332 *
1333 * NB: if the passed pathname is relative (which it usually is),
1334 * it will be interpreted relative to the process' working directory
1335 * (which should always be $PGDATA when this code is running).
1336 */
1337 File
PathNameOpenFilePerm(const char * fileName,int fileFlags,mode_t fileMode)1338 PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1339 {
1340 char *fnamecopy;
1341 File file;
1342 Vfd *vfdP;
1343
1344 DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1345 fileName, fileFlags, fileMode));
1346
1347 /*
1348 * We need a malloc'd copy of the file name; fail cleanly if no room.
1349 */
1350 fnamecopy = strdup(fileName);
1351 if (fnamecopy == NULL)
1352 ereport(ERROR,
1353 (errcode(ERRCODE_OUT_OF_MEMORY),
1354 errmsg("out of memory")));
1355
1356 file = AllocateVfd();
1357 vfdP = &VfdCache[file];
1358
1359 /* Close excess kernel FDs. */
1360 ReleaseLruFiles();
1361
1362 vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1363
1364 if (vfdP->fd < 0)
1365 {
1366 int save_errno = errno;
1367
1368 FreeVfd(file);
1369 free(fnamecopy);
1370 errno = save_errno;
1371 return -1;
1372 }
1373 ++nfile;
1374 DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1375 vfdP->fd));
1376
1377 vfdP->fileName = fnamecopy;
1378 /* Saved flags are adjusted to be OK for re-opening file */
1379 vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1380 vfdP->fileMode = fileMode;
1381 vfdP->fileSize = 0;
1382 vfdP->fdstate = 0x0;
1383 vfdP->resowner = NULL;
1384
1385 Insert(file);
1386
1387 return file;
1388 }
1389
1390 /*
1391 * Create directory 'directory'. If necessary, create 'basedir', which must
1392 * be the directory above it. This is designed for creating the top-level
1393 * temporary directory on demand before creating a directory underneath it.
1394 * Do nothing if the directory already exists.
1395 *
1396 * Directories created within the top-level temporary directory should begin
1397 * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1398 * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1399 * that do not need any particular prefix.
1400 */
1401 void
PathNameCreateTemporaryDir(const char * basedir,const char * directory)1402 PathNameCreateTemporaryDir(const char *basedir, const char *directory)
1403 {
1404 if (MakePGDirectory(directory) < 0)
1405 {
1406 if (errno == EEXIST)
1407 return;
1408
1409 /*
1410 * Failed. Try to create basedir first in case it's missing. Tolerate
1411 * EEXIST to close a race against another process following the same
1412 * algorithm.
1413 */
1414 if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1415 ereport(ERROR,
1416 (errcode_for_file_access(),
1417 errmsg("cannot create temporary directory \"%s\": %m",
1418 basedir)));
1419
1420 /* Try again. */
1421 if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1422 ereport(ERROR,
1423 (errcode_for_file_access(),
1424 errmsg("cannot create temporary subdirectory \"%s\": %m",
1425 directory)));
1426 }
1427 }
1428
1429 /*
1430 * Delete a directory and everything in it, if it exists.
1431 */
1432 void
PathNameDeleteTemporaryDir(const char * dirname)1433 PathNameDeleteTemporaryDir(const char *dirname)
1434 {
1435 struct stat statbuf;
1436
1437 /* Silently ignore missing directory. */
1438 if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1439 return;
1440
1441 /*
1442 * Currently, walkdir doesn't offer a way for our passed in function to
1443 * maintain state. Perhaps it should, so that we could tell the caller
1444 * whether this operation succeeded or failed. Since this operation is
1445 * used in a cleanup path, we wouldn't actually behave differently: we'll
1446 * just log failures.
1447 */
1448 walkdir(dirname, unlink_if_exists_fname, false, LOG);
1449 }
1450
1451 /*
1452 * Open a temporary file that will disappear when we close it.
1453 *
1454 * This routine takes care of generating an appropriate tempfile name.
1455 * There's no need to pass in fileFlags or fileMode either, since only
1456 * one setting makes any sense for a temp file.
1457 *
1458 * Unless interXact is true, the file is remembered by CurrentResourceOwner
1459 * to ensure it's closed and deleted when it's no longer needed, typically at
1460 * the end-of-transaction. In most cases, you don't want temporary files to
1461 * outlive the transaction that created them, so this should be false -- but
1462 * if you need "somewhat" temporary storage, this might be useful. In either
1463 * case, the file is removed when the File is explicitly closed.
1464 */
1465 File
OpenTemporaryFile(bool interXact)1466 OpenTemporaryFile(bool interXact)
1467 {
1468 File file = 0;
1469
1470 /*
1471 * Make sure the current resource owner has space for this File before we
1472 * open it, if we'll be registering it below.
1473 */
1474 if (!interXact)
1475 ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1476
1477 /*
1478 * If some temp tablespace(s) have been given to us, try to use the next
1479 * one. If a given tablespace can't be found, we silently fall back to
1480 * the database's default tablespace.
1481 *
1482 * BUT: if the temp file is slated to outlive the current transaction,
1483 * force it into the database's default tablespace, so that it will not
1484 * pose a threat to possible tablespace drop attempts.
1485 */
1486 if (numTempTableSpaces > 0 && !interXact)
1487 {
1488 Oid tblspcOid = GetNextTempTableSpace();
1489
1490 if (OidIsValid(tblspcOid))
1491 file = OpenTemporaryFileInTablespace(tblspcOid, false);
1492 }
1493
1494 /*
1495 * If not, or if tablespace is bad, create in database's default
1496 * tablespace. MyDatabaseTableSpace should normally be set before we get
1497 * here, but just in case it isn't, fall back to pg_default tablespace.
1498 */
1499 if (file <= 0)
1500 file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
1501 MyDatabaseTableSpace :
1502 DEFAULTTABLESPACE_OID,
1503 true);
1504
1505 /* Mark it for deletion at close and temporary file size limit */
1506 VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
1507
1508 /* Register it with the current resource owner */
1509 if (!interXact)
1510 RegisterTemporaryFile(file);
1511
1512 return file;
1513 }
1514
1515 /*
1516 * Return the path of the temp directory in a given tablespace.
1517 */
1518 void
TempTablespacePath(char * path,Oid tablespace)1519 TempTablespacePath(char *path, Oid tablespace)
1520 {
1521 /*
1522 * Identify the tempfile directory for this tablespace.
1523 *
1524 * If someone tries to specify pg_global, use pg_default instead.
1525 */
1526 if (tablespace == InvalidOid ||
1527 tablespace == DEFAULTTABLESPACE_OID ||
1528 tablespace == GLOBALTABLESPACE_OID)
1529 snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1530 else
1531 {
1532 /* All other tablespaces are accessed via symlinks */
1533 snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
1534 tablespace, TABLESPACE_VERSION_DIRECTORY,
1535 PG_TEMP_FILES_DIR);
1536 }
1537 }
1538
1539 /*
1540 * Open a temporary file in a specific tablespace.
1541 * Subroutine for OpenTemporaryFile, which see for details.
1542 */
1543 static File
OpenTemporaryFileInTablespace(Oid tblspcOid,bool rejectError)1544 OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1545 {
1546 char tempdirpath[MAXPGPATH];
1547 char tempfilepath[MAXPGPATH];
1548 File file;
1549
1550 TempTablespacePath(tempdirpath, tblspcOid);
1551
1552 /*
1553 * Generate a tempfile name that should be unique within the current
1554 * database instance.
1555 */
1556 snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1557 tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1558
1559 /*
1560 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1561 * temp file that can be reused.
1562 */
1563 file = PathNameOpenFile(tempfilepath,
1564 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1565 if (file <= 0)
1566 {
1567 /*
1568 * We might need to create the tablespace's tempfile directory, if no
1569 * one has yet done so.
1570 *
1571 * Don't check for an error from MakePGDirectory; it could fail if
1572 * someone else just did the same thing. If it doesn't work then
1573 * we'll bomb out on the second create attempt, instead.
1574 */
1575 (void) MakePGDirectory(tempdirpath);
1576
1577 file = PathNameOpenFile(tempfilepath,
1578 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1579 if (file <= 0 && rejectError)
1580 elog(ERROR, "could not create temporary file \"%s\": %m",
1581 tempfilepath);
1582 }
1583
1584 return file;
1585 }
1586
1587
1588 /*
1589 * Create a new file. The directory containing it must already exist. Files
1590 * created this way are subject to temp_file_limit and are automatically
1591 * closed at end of transaction, but are not automatically deleted on close
1592 * because they are intended to be shared between cooperating backends.
1593 *
1594 * If the file is inside the top-level temporary directory, its name should
1595 * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1596 * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1597 * inside a directory created with PathNameCreateTemporaryDir(), in which case
1598 * the prefix isn't needed.
1599 */
1600 File
PathNameCreateTemporaryFile(const char * path,bool error_on_failure)1601 PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1602 {
1603 File file;
1604
1605 ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1606
1607 /*
1608 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1609 * temp file that can be reused.
1610 */
1611 file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1612 if (file <= 0)
1613 {
1614 if (error_on_failure)
1615 ereport(ERROR,
1616 (errcode_for_file_access(),
1617 errmsg("could not create temporary file \"%s\": %m",
1618 path)));
1619 else
1620 return file;
1621 }
1622
1623 /* Mark it for temp_file_limit accounting. */
1624 VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
1625
1626 /* Register it for automatic close. */
1627 RegisterTemporaryFile(file);
1628
1629 return file;
1630 }
1631
1632 /*
1633 * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1634 * another backend. Files opened this way don't count against the
1635 * temp_file_limit of the caller, are read-only and are automatically closed
1636 * at the end of the transaction but are not deleted on close.
1637 */
1638 File
PathNameOpenTemporaryFile(const char * path)1639 PathNameOpenTemporaryFile(const char *path)
1640 {
1641 File file;
1642
1643 ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1644
1645 /* We open the file read-only. */
1646 file = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
1647
1648 /* If no such file, then we don't raise an error. */
1649 if (file <= 0 && errno != ENOENT)
1650 ereport(ERROR,
1651 (errcode_for_file_access(),
1652 errmsg("could not open temporary file \"%s\": %m",
1653 path)));
1654
1655 if (file > 0)
1656 {
1657 /* Register it for automatic close. */
1658 RegisterTemporaryFile(file);
1659 }
1660
1661 return file;
1662 }
1663
1664 /*
1665 * Delete a file by pathname. Return true if the file existed, false if
1666 * didn't.
1667 */
1668 bool
PathNameDeleteTemporaryFile(const char * path,bool error_on_failure)1669 PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1670 {
1671 struct stat filestats;
1672 int stat_errno;
1673
1674 /* Get the final size for pgstat reporting. */
1675 if (stat(path, &filestats) != 0)
1676 stat_errno = errno;
1677 else
1678 stat_errno = 0;
1679
1680 /*
1681 * Unlike FileClose's automatic file deletion code, we tolerate
1682 * non-existence to support BufFileDeleteShared which doesn't know how
1683 * many segments it has to delete until it runs out.
1684 */
1685 if (stat_errno == ENOENT)
1686 return false;
1687
1688 if (unlink(path) < 0)
1689 {
1690 if (errno != ENOENT)
1691 ereport(error_on_failure ? ERROR : LOG,
1692 (errcode_for_file_access(),
1693 errmsg("could not unlink temporary file \"%s\": %m",
1694 path)));
1695 return false;
1696 }
1697
1698 if (stat_errno == 0)
1699 ReportTemporaryFileUsage(path, filestats.st_size);
1700 else
1701 {
1702 errno = stat_errno;
1703 ereport(LOG,
1704 (errcode_for_file_access(),
1705 errmsg("could not stat file \"%s\": %m", path)));
1706 }
1707
1708 return true;
1709 }
1710
1711 /*
1712 * close a file when done with it
1713 */
1714 void
FileClose(File file)1715 FileClose(File file)
1716 {
1717 Vfd *vfdP;
1718
1719 Assert(FileIsValid(file));
1720
1721 DO_DB(elog(LOG, "FileClose: %d (%s)",
1722 file, VfdCache[file].fileName));
1723
1724 vfdP = &VfdCache[file];
1725
1726 if (!FileIsNotOpen(file))
1727 {
1728 /* close the file */
1729 if (close(vfdP->fd))
1730 {
1731 /*
1732 * We may need to panic on failure to close non-temporary files;
1733 * see LruDelete.
1734 */
1735 elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1736 "could not close file \"%s\": %m", vfdP->fileName);
1737 }
1738
1739 --nfile;
1740 vfdP->fd = VFD_CLOSED;
1741
1742 /* remove the file from the lru ring */
1743 Delete(file);
1744 }
1745
1746 if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1747 {
1748 /* Subtract its size from current usage (do first in case of error) */
1749 temporary_files_size -= vfdP->fileSize;
1750 vfdP->fileSize = 0;
1751 }
1752
1753 /*
1754 * Delete the file if it was temporary, and make a log entry if wanted
1755 */
1756 if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
1757 {
1758 struct stat filestats;
1759 int stat_errno;
1760
1761 /*
1762 * If we get an error, as could happen within the ereport/elog calls,
1763 * we'll come right back here during transaction abort. Reset the
1764 * flag to ensure that we can't get into an infinite loop. This code
1765 * is arranged to ensure that the worst-case consequence is failing to
1766 * emit log message(s), not failing to attempt the unlink.
1767 */
1768 vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
1769
1770
1771 /* first try the stat() */
1772 if (stat(vfdP->fileName, &filestats))
1773 stat_errno = errno;
1774 else
1775 stat_errno = 0;
1776
1777 /* in any case do the unlink */
1778 if (unlink(vfdP->fileName))
1779 elog(LOG, "could not unlink file \"%s\": %m", vfdP->fileName);
1780
1781 /* and last report the stat results */
1782 if (stat_errno == 0)
1783 ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
1784 else
1785 {
1786 errno = stat_errno;
1787 elog(LOG, "could not stat file \"%s\": %m", vfdP->fileName);
1788 }
1789 }
1790
1791 /* Unregister it from the resource owner */
1792 if (vfdP->resowner)
1793 ResourceOwnerForgetFile(vfdP->resowner, file);
1794
1795 /*
1796 * Return the Vfd slot to the free list
1797 */
1798 FreeVfd(file);
1799 }
1800
1801 /*
1802 * FilePrefetch - initiate asynchronous read of a given range of the file.
1803 *
1804 * Currently the only implementation of this function is using posix_fadvise
1805 * which is the simplest standardized interface that accomplishes this.
1806 * We could add an implementation using libaio in the future; but note that
1807 * this API is inappropriate for libaio, which wants to have a buffer provided
1808 * to read into.
1809 */
1810 int
FilePrefetch(File file,off_t offset,int amount,uint32 wait_event_info)1811 FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
1812 {
1813 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
1814 int returnCode;
1815
1816 Assert(FileIsValid(file));
1817
1818 DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
1819 file, VfdCache[file].fileName,
1820 (int64) offset, amount));
1821
1822 returnCode = FileAccess(file);
1823 if (returnCode < 0)
1824 return returnCode;
1825
1826 pgstat_report_wait_start(wait_event_info);
1827 returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
1828 POSIX_FADV_WILLNEED);
1829 pgstat_report_wait_end();
1830
1831 return returnCode;
1832 #else
1833 Assert(FileIsValid(file));
1834 return 0;
1835 #endif
1836 }
1837
1838 void
FileWriteback(File file,off_t offset,off_t nbytes,uint32 wait_event_info)1839 FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
1840 {
1841 int returnCode;
1842
1843 Assert(FileIsValid(file));
1844
1845 DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
1846 file, VfdCache[file].fileName,
1847 (int64) offset, (int64) nbytes));
1848
1849 if (nbytes <= 0)
1850 return;
1851
1852 returnCode = FileAccess(file);
1853 if (returnCode < 0)
1854 return;
1855
1856 pgstat_report_wait_start(wait_event_info);
1857 pg_flush_data(VfdCache[file].fd, offset, nbytes);
1858 pgstat_report_wait_end();
1859 }
1860
1861 int
FileRead(File file,char * buffer,int amount,off_t offset,uint32 wait_event_info)1862 FileRead(File file, char *buffer, int amount, off_t offset,
1863 uint32 wait_event_info)
1864 {
1865 int returnCode;
1866 Vfd *vfdP;
1867
1868 Assert(FileIsValid(file));
1869
1870 DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
1871 file, VfdCache[file].fileName,
1872 (int64) offset,
1873 amount, buffer));
1874
1875 returnCode = FileAccess(file);
1876 if (returnCode < 0)
1877 return returnCode;
1878
1879 vfdP = &VfdCache[file];
1880
1881 retry:
1882 pgstat_report_wait_start(wait_event_info);
1883 returnCode = pg_pread(vfdP->fd, buffer, amount, offset);
1884 pgstat_report_wait_end();
1885
1886 if (returnCode < 0)
1887 {
1888 /*
1889 * Windows may run out of kernel buffers and return "Insufficient
1890 * system resources" error. Wait a bit and retry to solve it.
1891 *
1892 * It is rumored that EINTR is also possible on some Unix filesystems,
1893 * in which case immediate retry is indicated.
1894 */
1895 #ifdef WIN32
1896 DWORD error = GetLastError();
1897
1898 switch (error)
1899 {
1900 case ERROR_NO_SYSTEM_RESOURCES:
1901 pg_usleep(1000L);
1902 errno = EINTR;
1903 break;
1904 default:
1905 _dosmaperr(error);
1906 break;
1907 }
1908 #endif
1909 /* OK to retry if interrupted */
1910 if (errno == EINTR)
1911 goto retry;
1912 }
1913
1914 return returnCode;
1915 }
1916
1917 int
FileWrite(File file,char * buffer,int amount,off_t offset,uint32 wait_event_info)1918 FileWrite(File file, char *buffer, int amount, off_t offset,
1919 uint32 wait_event_info)
1920 {
1921 int returnCode;
1922 Vfd *vfdP;
1923
1924 Assert(FileIsValid(file));
1925
1926 DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
1927 file, VfdCache[file].fileName,
1928 (int64) offset,
1929 amount, buffer));
1930
1931 returnCode = FileAccess(file);
1932 if (returnCode < 0)
1933 return returnCode;
1934
1935 vfdP = &VfdCache[file];
1936
1937 /*
1938 * If enforcing temp_file_limit and it's a temp file, check to see if the
1939 * write would overrun temp_file_limit, and throw error if so. Note: it's
1940 * really a modularity violation to throw error here; we should set errno
1941 * and return -1. However, there's no way to report a suitable error
1942 * message if we do that. All current callers would just throw error
1943 * immediately anyway, so this is safe at present.
1944 */
1945 if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
1946 {
1947 off_t past_write = offset + amount;
1948
1949 if (past_write > vfdP->fileSize)
1950 {
1951 uint64 newTotal = temporary_files_size;
1952
1953 newTotal += past_write - vfdP->fileSize;
1954 if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
1955 ereport(ERROR,
1956 (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
1957 errmsg("temporary file size exceeds temp_file_limit (%dkB)",
1958 temp_file_limit)));
1959 }
1960 }
1961
1962 retry:
1963 errno = 0;
1964 pgstat_report_wait_start(wait_event_info);
1965 returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset);
1966 pgstat_report_wait_end();
1967
1968 /* if write didn't set errno, assume problem is no disk space */
1969 if (returnCode != amount && errno == 0)
1970 errno = ENOSPC;
1971
1972 if (returnCode >= 0)
1973 {
1974 /*
1975 * Maintain fileSize and temporary_files_size if it's a temp file.
1976 *
1977 * If seekPos is -1 (unknown), this will do nothing; but we could only
1978 * get here in that state if we're not enforcing temporary_files_size,
1979 * so we don't care.
1980 */
1981 if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1982 {
1983 off_t past_write = offset + amount;
1984
1985 if (past_write > vfdP->fileSize)
1986 {
1987 temporary_files_size += past_write - vfdP->fileSize;
1988 vfdP->fileSize = past_write;
1989 }
1990 }
1991 }
1992 else
1993 {
1994 /*
1995 * See comments in FileRead()
1996 */
1997 #ifdef WIN32
1998 DWORD error = GetLastError();
1999
2000 switch (error)
2001 {
2002 case ERROR_NO_SYSTEM_RESOURCES:
2003 pg_usleep(1000L);
2004 errno = EINTR;
2005 break;
2006 default:
2007 _dosmaperr(error);
2008 break;
2009 }
2010 #endif
2011 /* OK to retry if interrupted */
2012 if (errno == EINTR)
2013 goto retry;
2014 }
2015
2016 return returnCode;
2017 }
2018
2019 int
FileSync(File file,uint32 wait_event_info)2020 FileSync(File file, uint32 wait_event_info)
2021 {
2022 int returnCode;
2023
2024 Assert(FileIsValid(file));
2025
2026 DO_DB(elog(LOG, "FileSync: %d (%s)",
2027 file, VfdCache[file].fileName));
2028
2029 returnCode = FileAccess(file);
2030 if (returnCode < 0)
2031 return returnCode;
2032
2033 pgstat_report_wait_start(wait_event_info);
2034 returnCode = pg_fsync(VfdCache[file].fd);
2035 pgstat_report_wait_end();
2036
2037 return returnCode;
2038 }
2039
2040 off_t
FileSize(File file)2041 FileSize(File file)
2042 {
2043 Assert(FileIsValid(file));
2044
2045 DO_DB(elog(LOG, "FileSize %d (%s)",
2046 file, VfdCache[file].fileName));
2047
2048 if (FileIsNotOpen(file))
2049 {
2050 if (FileAccess(file) < 0)
2051 return (off_t) -1;
2052 }
2053
2054 return lseek(VfdCache[file].fd, 0, SEEK_END);
2055 }
2056
2057 int
FileTruncate(File file,off_t offset,uint32 wait_event_info)2058 FileTruncate(File file, off_t offset, uint32 wait_event_info)
2059 {
2060 int returnCode;
2061
2062 Assert(FileIsValid(file));
2063
2064 DO_DB(elog(LOG, "FileTruncate %d (%s)",
2065 file, VfdCache[file].fileName));
2066
2067 returnCode = FileAccess(file);
2068 if (returnCode < 0)
2069 return returnCode;
2070
2071 pgstat_report_wait_start(wait_event_info);
2072 returnCode = ftruncate(VfdCache[file].fd, offset);
2073 pgstat_report_wait_end();
2074
2075 if (returnCode == 0 && VfdCache[file].fileSize > offset)
2076 {
2077 /* adjust our state for truncation of a temp file */
2078 Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2079 temporary_files_size -= VfdCache[file].fileSize - offset;
2080 VfdCache[file].fileSize = offset;
2081 }
2082
2083 return returnCode;
2084 }
2085
2086 /*
2087 * Return the pathname associated with an open file.
2088 *
2089 * The returned string points to an internal buffer, which is valid until
2090 * the file is closed.
2091 */
2092 char *
FilePathName(File file)2093 FilePathName(File file)
2094 {
2095 Assert(FileIsValid(file));
2096
2097 return VfdCache[file].fileName;
2098 }
2099
2100 /*
2101 * Return the raw file descriptor of an opened file.
2102 *
2103 * The returned file descriptor will be valid until the file is closed, but
2104 * there are a lot of things that can make that happen. So the caller should
2105 * be careful not to do much of anything else before it finishes using the
2106 * returned file descriptor.
2107 */
2108 int
FileGetRawDesc(File file)2109 FileGetRawDesc(File file)
2110 {
2111 Assert(FileIsValid(file));
2112 return VfdCache[file].fd;
2113 }
2114
2115 /*
2116 * FileGetRawFlags - returns the file flags on open(2)
2117 */
2118 int
FileGetRawFlags(File file)2119 FileGetRawFlags(File file)
2120 {
2121 Assert(FileIsValid(file));
2122 return VfdCache[file].fileFlags;
2123 }
2124
2125 /*
2126 * FileGetRawMode - returns the mode bitmask passed to open(2)
2127 */
2128 mode_t
FileGetRawMode(File file)2129 FileGetRawMode(File file)
2130 {
2131 Assert(FileIsValid(file));
2132 return VfdCache[file].fileMode;
2133 }
2134
2135 /*
2136 * Make room for another allocatedDescs[] array entry if needed and possible.
2137 * Returns true if an array element is available.
2138 */
2139 static bool
reserveAllocatedDesc(void)2140 reserveAllocatedDesc(void)
2141 {
2142 AllocateDesc *newDescs;
2143 int newMax;
2144
2145 /* Quick out if array already has a free slot. */
2146 if (numAllocatedDescs < maxAllocatedDescs)
2147 return true;
2148
2149 /*
2150 * If the array hasn't yet been created in the current process, initialize
2151 * it with FD_MINFREE / 2 elements. In many scenarios this is as many as
2152 * we will ever need, anyway. We don't want to look at max_safe_fds
2153 * immediately because set_max_safe_fds() may not have run yet.
2154 */
2155 if (allocatedDescs == NULL)
2156 {
2157 newMax = FD_MINFREE / 2;
2158 newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2159 /* Out of memory already? Treat as fatal error. */
2160 if (newDescs == NULL)
2161 ereport(ERROR,
2162 (errcode(ERRCODE_OUT_OF_MEMORY),
2163 errmsg("out of memory")));
2164 allocatedDescs = newDescs;
2165 maxAllocatedDescs = newMax;
2166 return true;
2167 }
2168
2169 /*
2170 * Consider enlarging the array beyond the initial allocation used above.
2171 * By the time this happens, max_safe_fds should be known accurately.
2172 *
2173 * We mustn't let allocated descriptors hog all the available FDs, and in
2174 * practice we'd better leave a reasonable number of FDs for VFD use. So
2175 * set the maximum to max_safe_fds / 2. (This should certainly be at
2176 * least as large as the initial size, FD_MINFREE / 2.)
2177 */
2178 newMax = max_safe_fds / 2;
2179 if (newMax > maxAllocatedDescs)
2180 {
2181 newDescs = (AllocateDesc *) realloc(allocatedDescs,
2182 newMax * sizeof(AllocateDesc));
2183 /* Treat out-of-memory as a non-fatal error. */
2184 if (newDescs == NULL)
2185 return false;
2186 allocatedDescs = newDescs;
2187 maxAllocatedDescs = newMax;
2188 return true;
2189 }
2190
2191 /* Can't enlarge allocatedDescs[] any more. */
2192 return false;
2193 }
2194
2195 /*
2196 * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2197 * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2198 * necessary to open the file. When done, call FreeFile rather than fclose.
2199 *
2200 * Note that files that will be open for any significant length of time
2201 * should NOT be handled this way, since they cannot share kernel file
2202 * descriptors with other files; there is grave risk of running out of FDs
2203 * if anyone locks down too many FDs. Most callers of this routine are
2204 * simply reading a config file that they will read and close immediately.
2205 *
2206 * fd.c will automatically close all files opened with AllocateFile at
2207 * transaction commit or abort; this prevents FD leakage if a routine
2208 * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2209 *
2210 * Ideally this should be the *only* direct call of fopen() in the backend.
2211 */
2212 FILE *
AllocateFile(const char * name,const char * mode)2213 AllocateFile(const char *name, const char *mode)
2214 {
2215 FILE *file;
2216
2217 DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2218 numAllocatedDescs, name));
2219
2220 /* Can we allocate another non-virtual FD? */
2221 if (!reserveAllocatedDesc())
2222 ereport(ERROR,
2223 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2224 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2225 maxAllocatedDescs, name)));
2226
2227 /* Close excess kernel FDs. */
2228 ReleaseLruFiles();
2229
2230 TryAgain:
2231 if ((file = fopen(name, mode)) != NULL)
2232 {
2233 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2234
2235 desc->kind = AllocateDescFile;
2236 desc->desc.file = file;
2237 desc->create_subid = GetCurrentSubTransactionId();
2238 numAllocatedDescs++;
2239 return desc->desc.file;
2240 }
2241
2242 if (errno == EMFILE || errno == ENFILE)
2243 {
2244 int save_errno = errno;
2245
2246 ereport(LOG,
2247 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2248 errmsg("out of file descriptors: %m; release and retry")));
2249 errno = 0;
2250 if (ReleaseLruFile())
2251 goto TryAgain;
2252 errno = save_errno;
2253 }
2254
2255 return NULL;
2256 }
2257
2258 /*
2259 * Open a file with OpenTransientFilePerm() and pass default file mode for
2260 * the fileMode parameter.
2261 */
2262 int
OpenTransientFile(const char * fileName,int fileFlags)2263 OpenTransientFile(const char *fileName, int fileFlags)
2264 {
2265 return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2266 }
2267
2268 /*
2269 * Like AllocateFile, but returns an unbuffered fd like open(2)
2270 */
2271 int
OpenTransientFilePerm(const char * fileName,int fileFlags,mode_t fileMode)2272 OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
2273 {
2274 int fd;
2275
2276 DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2277 numAllocatedDescs, fileName));
2278
2279 /* Can we allocate another non-virtual FD? */
2280 if (!reserveAllocatedDesc())
2281 ereport(ERROR,
2282 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2283 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2284 maxAllocatedDescs, fileName)));
2285
2286 /* Close excess kernel FDs. */
2287 ReleaseLruFiles();
2288
2289 fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2290
2291 if (fd >= 0)
2292 {
2293 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2294
2295 desc->kind = AllocateDescRawFD;
2296 desc->desc.fd = fd;
2297 desc->create_subid = GetCurrentSubTransactionId();
2298 numAllocatedDescs++;
2299
2300 return fd;
2301 }
2302
2303 return -1; /* failure */
2304 }
2305
2306 /*
2307 * Routines that want to initiate a pipe stream should use OpenPipeStream
2308 * rather than plain popen(). This lets fd.c deal with freeing FDs if
2309 * necessary. When done, call ClosePipeStream rather than pclose.
2310 *
2311 * This function also ensures that the popen'd program is run with default
2312 * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2313 * uses. This ensures desirable response to, eg, closing a read pipe early.
2314 */
2315 FILE *
OpenPipeStream(const char * command,const char * mode)2316 OpenPipeStream(const char *command, const char *mode)
2317 {
2318 FILE *file;
2319 int save_errno;
2320
2321 DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2322 numAllocatedDescs, command));
2323
2324 /* Can we allocate another non-virtual FD? */
2325 if (!reserveAllocatedDesc())
2326 ereport(ERROR,
2327 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2328 errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2329 maxAllocatedDescs, command)));
2330
2331 /* Close excess kernel FDs. */
2332 ReleaseLruFiles();
2333
2334 TryAgain:
2335 fflush(stdout);
2336 fflush(stderr);
2337 pqsignal(SIGPIPE, SIG_DFL);
2338 errno = 0;
2339 file = popen(command, mode);
2340 save_errno = errno;
2341 pqsignal(SIGPIPE, SIG_IGN);
2342 errno = save_errno;
2343 if (file != NULL)
2344 {
2345 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2346
2347 desc->kind = AllocateDescPipe;
2348 desc->desc.file = file;
2349 desc->create_subid = GetCurrentSubTransactionId();
2350 numAllocatedDescs++;
2351 return desc->desc.file;
2352 }
2353
2354 if (errno == EMFILE || errno == ENFILE)
2355 {
2356 ereport(LOG,
2357 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2358 errmsg("out of file descriptors: %m; release and retry")));
2359 if (ReleaseLruFile())
2360 goto TryAgain;
2361 errno = save_errno;
2362 }
2363
2364 return NULL;
2365 }
2366
2367 /*
2368 * Free an AllocateDesc of any type.
2369 *
2370 * The argument *must* point into the allocatedDescs[] array.
2371 */
2372 static int
FreeDesc(AllocateDesc * desc)2373 FreeDesc(AllocateDesc *desc)
2374 {
2375 int result;
2376
2377 /* Close the underlying object */
2378 switch (desc->kind)
2379 {
2380 case AllocateDescFile:
2381 result = fclose(desc->desc.file);
2382 break;
2383 case AllocateDescPipe:
2384 result = pclose(desc->desc.file);
2385 break;
2386 case AllocateDescDir:
2387 result = closedir(desc->desc.dir);
2388 break;
2389 case AllocateDescRawFD:
2390 result = close(desc->desc.fd);
2391 break;
2392 default:
2393 elog(ERROR, "AllocateDesc kind not recognized");
2394 result = 0; /* keep compiler quiet */
2395 break;
2396 }
2397
2398 /* Compact storage in the allocatedDescs array */
2399 numAllocatedDescs--;
2400 *desc = allocatedDescs[numAllocatedDescs];
2401
2402 return result;
2403 }
2404
2405 /*
2406 * Close a file returned by AllocateFile.
2407 *
2408 * Note we do not check fclose's return value --- it is up to the caller
2409 * to handle close errors.
2410 */
2411 int
FreeFile(FILE * file)2412 FreeFile(FILE *file)
2413 {
2414 int i;
2415
2416 DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2417
2418 /* Remove file from list of allocated files, if it's present */
2419 for (i = numAllocatedDescs; --i >= 0;)
2420 {
2421 AllocateDesc *desc = &allocatedDescs[i];
2422
2423 if (desc->kind == AllocateDescFile && desc->desc.file == file)
2424 return FreeDesc(desc);
2425 }
2426
2427 /* Only get here if someone passes us a file not in allocatedDescs */
2428 elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2429
2430 return fclose(file);
2431 }
2432
2433 /*
2434 * Close a file returned by OpenTransientFile.
2435 *
2436 * Note we do not check close's return value --- it is up to the caller
2437 * to handle close errors.
2438 */
2439 int
CloseTransientFile(int fd)2440 CloseTransientFile(int fd)
2441 {
2442 int i;
2443
2444 DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2445
2446 /* Remove fd from list of allocated files, if it's present */
2447 for (i = numAllocatedDescs; --i >= 0;)
2448 {
2449 AllocateDesc *desc = &allocatedDescs[i];
2450
2451 if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2452 return FreeDesc(desc);
2453 }
2454
2455 /* Only get here if someone passes us a file not in allocatedDescs */
2456 elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2457
2458 return close(fd);
2459 }
2460
2461 /*
2462 * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2463 * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2464 * necessary to open the directory, and with closing it after an elog.
2465 * When done, call FreeDir rather than closedir.
2466 *
2467 * Returns NULL, with errno set, on failure. Note that failure detection
2468 * is commonly left to the following call of ReadDir or ReadDirExtended;
2469 * see the comments for ReadDir.
2470 *
2471 * Ideally this should be the *only* direct call of opendir() in the backend.
2472 */
2473 DIR *
AllocateDir(const char * dirname)2474 AllocateDir(const char *dirname)
2475 {
2476 DIR *dir;
2477
2478 DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2479 numAllocatedDescs, dirname));
2480
2481 /* Can we allocate another non-virtual FD? */
2482 if (!reserveAllocatedDesc())
2483 ereport(ERROR,
2484 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2485 errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2486 maxAllocatedDescs, dirname)));
2487
2488 /* Close excess kernel FDs. */
2489 ReleaseLruFiles();
2490
2491 TryAgain:
2492 if ((dir = opendir(dirname)) != NULL)
2493 {
2494 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2495
2496 desc->kind = AllocateDescDir;
2497 desc->desc.dir = dir;
2498 desc->create_subid = GetCurrentSubTransactionId();
2499 numAllocatedDescs++;
2500 return desc->desc.dir;
2501 }
2502
2503 if (errno == EMFILE || errno == ENFILE)
2504 {
2505 int save_errno = errno;
2506
2507 ereport(LOG,
2508 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2509 errmsg("out of file descriptors: %m; release and retry")));
2510 errno = 0;
2511 if (ReleaseLruFile())
2512 goto TryAgain;
2513 errno = save_errno;
2514 }
2515
2516 return NULL;
2517 }
2518
2519 /*
2520 * Read a directory opened with AllocateDir, ereport'ing any error.
2521 *
2522 * This is easier to use than raw readdir() since it takes care of some
2523 * otherwise rather tedious and error-prone manipulation of errno. Also,
2524 * if you are happy with a generic error message for AllocateDir failure,
2525 * you can just do
2526 *
2527 * dir = AllocateDir(path);
2528 * while ((dirent = ReadDir(dir, path)) != NULL)
2529 * process dirent;
2530 * FreeDir(dir);
2531 *
2532 * since a NULL dir parameter is taken as indicating AllocateDir failed.
2533 * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2534 * use this shortcut.)
2535 *
2536 * The pathname passed to AllocateDir must be passed to this routine too,
2537 * but it is only used for error reporting.
2538 */
2539 struct dirent *
ReadDir(DIR * dir,const char * dirname)2540 ReadDir(DIR *dir, const char *dirname)
2541 {
2542 return ReadDirExtended(dir, dirname, ERROR);
2543 }
2544
2545 /*
2546 * Alternate version of ReadDir that allows caller to specify the elevel
2547 * for any error report (whether it's reporting an initial failure of
2548 * AllocateDir or a subsequent directory read failure).
2549 *
2550 * If elevel < ERROR, returns NULL after any error. With the normal coding
2551 * pattern, this will result in falling out of the loop immediately as
2552 * though the directory contained no (more) entries.
2553 */
2554 struct dirent *
ReadDirExtended(DIR * dir,const char * dirname,int elevel)2555 ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2556 {
2557 struct dirent *dent;
2558
2559 /* Give a generic message for AllocateDir failure, if caller didn't */
2560 if (dir == NULL)
2561 {
2562 ereport(elevel,
2563 (errcode_for_file_access(),
2564 errmsg("could not open directory \"%s\": %m",
2565 dirname)));
2566 return NULL;
2567 }
2568
2569 errno = 0;
2570 if ((dent = readdir(dir)) != NULL)
2571 return dent;
2572
2573 if (errno)
2574 ereport(elevel,
2575 (errcode_for_file_access(),
2576 errmsg("could not read directory \"%s\": %m",
2577 dirname)));
2578 return NULL;
2579 }
2580
2581 /*
2582 * Close a directory opened with AllocateDir.
2583 *
2584 * Returns closedir's return value (with errno set if it's not 0).
2585 * Note we do not check the return value --- it is up to the caller
2586 * to handle close errors if wanted.
2587 *
2588 * Does nothing if dir == NULL; we assume that directory open failure was
2589 * already reported if desired.
2590 */
2591 int
FreeDir(DIR * dir)2592 FreeDir(DIR *dir)
2593 {
2594 int i;
2595
2596 /* Nothing to do if AllocateDir failed */
2597 if (dir == NULL)
2598 return 0;
2599
2600 DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2601
2602 /* Remove dir from list of allocated dirs, if it's present */
2603 for (i = numAllocatedDescs; --i >= 0;)
2604 {
2605 AllocateDesc *desc = &allocatedDescs[i];
2606
2607 if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2608 return FreeDesc(desc);
2609 }
2610
2611 /* Only get here if someone passes us a dir not in allocatedDescs */
2612 elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
2613
2614 return closedir(dir);
2615 }
2616
2617
2618 /*
2619 * Close a pipe stream returned by OpenPipeStream.
2620 */
2621 int
ClosePipeStream(FILE * file)2622 ClosePipeStream(FILE *file)
2623 {
2624 int i;
2625
2626 DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
2627
2628 /* Remove file from list of allocated files, if it's present */
2629 for (i = numAllocatedDescs; --i >= 0;)
2630 {
2631 AllocateDesc *desc = &allocatedDescs[i];
2632
2633 if (desc->kind == AllocateDescPipe && desc->desc.file == file)
2634 return FreeDesc(desc);
2635 }
2636
2637 /* Only get here if someone passes us a file not in allocatedDescs */
2638 elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
2639
2640 return pclose(file);
2641 }
2642
2643 /*
2644 * closeAllVfds
2645 *
2646 * Force all VFDs into the physically-closed state, so that the fewest
2647 * possible number of kernel file descriptors are in use. There is no
2648 * change in the logical state of the VFDs.
2649 */
2650 void
closeAllVfds(void)2651 closeAllVfds(void)
2652 {
2653 Index i;
2654
2655 if (SizeVfdCache > 0)
2656 {
2657 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2658 for (i = 1; i < SizeVfdCache; i++)
2659 {
2660 if (!FileIsNotOpen(i))
2661 LruDelete(i);
2662 }
2663 }
2664 }
2665
2666
2667 /*
2668 * SetTempTablespaces
2669 *
2670 * Define a list (actually an array) of OIDs of tablespaces to use for
2671 * temporary files. This list will be used until end of transaction,
2672 * unless this function is called again before then. It is caller's
2673 * responsibility that the passed-in array has adequate lifespan (typically
2674 * it'd be allocated in TopTransactionContext).
2675 *
2676 * Some entries of the array may be InvalidOid, indicating that the current
2677 * database's default tablespace should be used.
2678 */
2679 void
SetTempTablespaces(Oid * tableSpaces,int numSpaces)2680 SetTempTablespaces(Oid *tableSpaces, int numSpaces)
2681 {
2682 Assert(numSpaces >= 0);
2683 tempTableSpaces = tableSpaces;
2684 numTempTableSpaces = numSpaces;
2685
2686 /*
2687 * Select a random starting point in the list. This is to minimize
2688 * conflicts between backends that are most likely sharing the same list
2689 * of temp tablespaces. Note that if we create multiple temp files in the
2690 * same transaction, we'll advance circularly through the list --- this
2691 * ensures that large temporary sort files are nicely spread across all
2692 * available tablespaces.
2693 */
2694 if (numSpaces > 1)
2695 nextTempTableSpace = random() % numSpaces;
2696 else
2697 nextTempTableSpace = 0;
2698 }
2699
2700 /*
2701 * TempTablespacesAreSet
2702 *
2703 * Returns true if SetTempTablespaces has been called in current transaction.
2704 * (This is just so that tablespaces.c doesn't need its own per-transaction
2705 * state.)
2706 */
2707 bool
TempTablespacesAreSet(void)2708 TempTablespacesAreSet(void)
2709 {
2710 return (numTempTableSpaces >= 0);
2711 }
2712
2713 /*
2714 * GetTempTablespaces
2715 *
2716 * Populate an array with the OIDs of the tablespaces that should be used for
2717 * temporary files. (Some entries may be InvalidOid, indicating that the
2718 * current database's default tablespace should be used.) At most numSpaces
2719 * entries will be filled.
2720 * Returns the number of OIDs that were copied into the output array.
2721 */
2722 int
GetTempTablespaces(Oid * tableSpaces,int numSpaces)2723 GetTempTablespaces(Oid *tableSpaces, int numSpaces)
2724 {
2725 int i;
2726
2727 Assert(TempTablespacesAreSet());
2728 for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
2729 tableSpaces[i] = tempTableSpaces[i];
2730
2731 return i;
2732 }
2733
2734 /*
2735 * GetNextTempTableSpace
2736 *
2737 * Select the next temp tablespace to use. A result of InvalidOid means
2738 * to use the current database's default tablespace.
2739 */
2740 Oid
GetNextTempTableSpace(void)2741 GetNextTempTableSpace(void)
2742 {
2743 if (numTempTableSpaces > 0)
2744 {
2745 /* Advance nextTempTableSpace counter with wraparound */
2746 if (++nextTempTableSpace >= numTempTableSpaces)
2747 nextTempTableSpace = 0;
2748 return tempTableSpaces[nextTempTableSpace];
2749 }
2750 return InvalidOid;
2751 }
2752
2753
2754 /*
2755 * AtEOSubXact_Files
2756 *
2757 * Take care of subtransaction commit/abort. At abort, we close temp files
2758 * that the subtransaction may have opened. At commit, we reassign the
2759 * files that were opened to the parent subtransaction.
2760 */
2761 void
AtEOSubXact_Files(bool isCommit,SubTransactionId mySubid,SubTransactionId parentSubid)2762 AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
2763 SubTransactionId parentSubid)
2764 {
2765 Index i;
2766
2767 for (i = 0; i < numAllocatedDescs; i++)
2768 {
2769 if (allocatedDescs[i].create_subid == mySubid)
2770 {
2771 if (isCommit)
2772 allocatedDescs[i].create_subid = parentSubid;
2773 else
2774 {
2775 /* have to recheck the item after FreeDesc (ugly) */
2776 FreeDesc(&allocatedDescs[i--]);
2777 }
2778 }
2779 }
2780 }
2781
2782 /*
2783 * AtEOXact_Files
2784 *
2785 * This routine is called during transaction commit or abort. All still-open
2786 * per-transaction temporary file VFDs are closed, which also causes the
2787 * underlying files to be deleted (although they should've been closed already
2788 * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
2789 * closed. We also forget any transaction-local temp tablespace list.
2790 *
2791 * The isCommit flag is used only to decide whether to emit warnings about
2792 * unclosed files.
2793 */
2794 void
AtEOXact_Files(bool isCommit)2795 AtEOXact_Files(bool isCommit)
2796 {
2797 CleanupTempFiles(isCommit, false);
2798 tempTableSpaces = NULL;
2799 numTempTableSpaces = -1;
2800 }
2801
2802 /*
2803 * AtProcExit_Files
2804 *
2805 * on_proc_exit hook to clean up temp files during backend shutdown.
2806 * Here, we want to clean up *all* temp files including interXact ones.
2807 */
2808 static void
AtProcExit_Files(int code,Datum arg)2809 AtProcExit_Files(int code, Datum arg)
2810 {
2811 CleanupTempFiles(false, true);
2812 }
2813
2814 /*
2815 * Close temporary files and delete their underlying files.
2816 *
2817 * isCommit: if true, this is normal transaction commit, and we don't
2818 * expect any remaining files; warn if there are some.
2819 *
2820 * isProcExit: if true, this is being called as the backend process is
2821 * exiting. If that's the case, we should remove all temporary files; if
2822 * that's not the case, we are being called for transaction commit/abort
2823 * and should only remove transaction-local temp files. In either case,
2824 * also clean up "allocated" stdio files, dirs and fds.
2825 */
2826 static void
CleanupTempFiles(bool isCommit,bool isProcExit)2827 CleanupTempFiles(bool isCommit, bool isProcExit)
2828 {
2829 Index i;
2830
2831 /*
2832 * Careful here: at proc_exit we need extra cleanup, not just
2833 * xact_temporary files.
2834 */
2835 if (isProcExit || have_xact_temporary_files)
2836 {
2837 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2838 for (i = 1; i < SizeVfdCache; i++)
2839 {
2840 unsigned short fdstate = VfdCache[i].fdstate;
2841
2842 if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
2843 VfdCache[i].fileName != NULL)
2844 {
2845 /*
2846 * If we're in the process of exiting a backend process, close
2847 * all temporary files. Otherwise, only close temporary files
2848 * local to the current transaction. They should be closed by
2849 * the ResourceOwner mechanism already, so this is just a
2850 * debugging cross-check.
2851 */
2852 if (isProcExit)
2853 FileClose(i);
2854 else if (fdstate & FD_CLOSE_AT_EOXACT)
2855 {
2856 elog(WARNING,
2857 "temporary file %s not closed at end-of-transaction",
2858 VfdCache[i].fileName);
2859 FileClose(i);
2860 }
2861 }
2862 }
2863
2864 have_xact_temporary_files = false;
2865 }
2866
2867 /* Complain if any allocated files remain open at commit. */
2868 if (isCommit && numAllocatedDescs > 0)
2869 elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
2870 numAllocatedDescs);
2871
2872 /* Clean up "allocated" stdio files, dirs and fds. */
2873 while (numAllocatedDescs > 0)
2874 FreeDesc(&allocatedDescs[0]);
2875 }
2876
2877
2878 /*
2879 * Remove temporary and temporary relation files left over from a prior
2880 * postmaster session
2881 *
2882 * This should be called during postmaster startup. It will forcibly
2883 * remove any leftover files created by OpenTemporaryFile and any leftover
2884 * temporary relation files created by mdcreate.
2885 *
2886 * NOTE: we could, but don't, call this during a post-backend-crash restart
2887 * cycle. The argument for not doing it is that someone might want to examine
2888 * the temp files for debugging purposes. This does however mean that
2889 * OpenTemporaryFile had better allow for collision with an existing temp
2890 * file name.
2891 *
2892 * NOTE: this function and its subroutines generally report syscall failures
2893 * with ereport(LOG) and keep going. Removing temp files is not so critical
2894 * that we should fail to start the database when we can't do it.
2895 */
2896 void
RemovePgTempFiles(void)2897 RemovePgTempFiles(void)
2898 {
2899 char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
2900 DIR *spc_dir;
2901 struct dirent *spc_de;
2902
2903 /*
2904 * First process temp files in pg_default ($PGDATA/base)
2905 */
2906 snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
2907 RemovePgTempFilesInDir(temp_path, true, false);
2908 RemovePgTempRelationFiles("base");
2909
2910 /*
2911 * Cycle through temp directories for all non-default tablespaces.
2912 */
2913 spc_dir = AllocateDir("pg_tblspc");
2914
2915 while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL)
2916 {
2917 if (strcmp(spc_de->d_name, ".") == 0 ||
2918 strcmp(spc_de->d_name, "..") == 0)
2919 continue;
2920
2921 snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
2922 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
2923 RemovePgTempFilesInDir(temp_path, true, false);
2924
2925 snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
2926 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
2927 RemovePgTempRelationFiles(temp_path);
2928 }
2929
2930 FreeDir(spc_dir);
2931
2932 /*
2933 * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
2934 * DataDir as well.
2935 */
2936 #ifdef EXEC_BACKEND
2937 RemovePgTempFilesInDir(PG_TEMP_FILES_DIR, true, false);
2938 #endif
2939 }
2940
2941 /*
2942 * Process one pgsql_tmp directory for RemovePgTempFiles.
2943 *
2944 * If missing_ok is true, it's all right for the named directory to not exist.
2945 * Any other problem results in a LOG message. (missing_ok should be true at
2946 * the top level, since pgsql_tmp directories are not created until needed.)
2947 *
2948 * At the top level, this should be called with unlink_all = false, so that
2949 * only files matching the temporary name prefix will be unlinked. When
2950 * recursing it will be called with unlink_all = true to unlink everything
2951 * under a top-level temporary directory.
2952 *
2953 * (These two flags could be replaced by one, but it seems clearer to keep
2954 * them separate.)
2955 */
2956 static void
RemovePgTempFilesInDir(const char * tmpdirname,bool missing_ok,bool unlink_all)2957 RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
2958 {
2959 DIR *temp_dir;
2960 struct dirent *temp_de;
2961 char rm_path[MAXPGPATH * 2];
2962
2963 temp_dir = AllocateDir(tmpdirname);
2964
2965 if (temp_dir == NULL && errno == ENOENT && missing_ok)
2966 return;
2967
2968 while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
2969 {
2970 if (strcmp(temp_de->d_name, ".") == 0 ||
2971 strcmp(temp_de->d_name, "..") == 0)
2972 continue;
2973
2974 snprintf(rm_path, sizeof(rm_path), "%s/%s",
2975 tmpdirname, temp_de->d_name);
2976
2977 if (unlink_all ||
2978 strncmp(temp_de->d_name,
2979 PG_TEMP_FILE_PREFIX,
2980 strlen(PG_TEMP_FILE_PREFIX)) == 0)
2981 {
2982 struct stat statbuf;
2983
2984 if (lstat(rm_path, &statbuf) < 0)
2985 {
2986 ereport(LOG,
2987 (errcode_for_file_access(),
2988 errmsg("could not stat file \"%s\": %m", rm_path)));
2989 continue;
2990 }
2991
2992 if (S_ISDIR(statbuf.st_mode))
2993 {
2994 /* recursively remove contents, then directory itself */
2995 RemovePgTempFilesInDir(rm_path, false, true);
2996
2997 if (rmdir(rm_path) < 0)
2998 ereport(LOG,
2999 (errcode_for_file_access(),
3000 errmsg("could not remove directory \"%s\": %m",
3001 rm_path)));
3002 }
3003 else
3004 {
3005 if (unlink(rm_path) < 0)
3006 ereport(LOG,
3007 (errcode_for_file_access(),
3008 errmsg("could not remove file \"%s\": %m",
3009 rm_path)));
3010 }
3011 }
3012 else
3013 ereport(LOG,
3014 (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3015 rm_path)));
3016 }
3017
3018 FreeDir(temp_dir);
3019 }
3020
3021 /* Process one tablespace directory, look for per-DB subdirectories */
3022 static void
RemovePgTempRelationFiles(const char * tsdirname)3023 RemovePgTempRelationFiles(const char *tsdirname)
3024 {
3025 DIR *ts_dir;
3026 struct dirent *de;
3027 char dbspace_path[MAXPGPATH * 2];
3028
3029 ts_dir = AllocateDir(tsdirname);
3030
3031 while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3032 {
3033 /*
3034 * We're only interested in the per-database directories, which have
3035 * numeric names. Note that this code will also (properly) ignore "."
3036 * and "..".
3037 */
3038 if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3039 continue;
3040
3041 snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3042 tsdirname, de->d_name);
3043 RemovePgTempRelationFilesInDbspace(dbspace_path);
3044 }
3045
3046 FreeDir(ts_dir);
3047 }
3048
3049 /* Process one per-dbspace directory for RemovePgTempRelationFiles */
3050 static void
RemovePgTempRelationFilesInDbspace(const char * dbspacedirname)3051 RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3052 {
3053 DIR *dbspace_dir;
3054 struct dirent *de;
3055 char rm_path[MAXPGPATH * 2];
3056
3057 dbspace_dir = AllocateDir(dbspacedirname);
3058
3059 while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3060 {
3061 if (!looks_like_temp_rel_name(de->d_name))
3062 continue;
3063
3064 snprintf(rm_path, sizeof(rm_path), "%s/%s",
3065 dbspacedirname, de->d_name);
3066
3067 if (unlink(rm_path) < 0)
3068 ereport(LOG,
3069 (errcode_for_file_access(),
3070 errmsg("could not remove file \"%s\": %m",
3071 rm_path)));
3072 }
3073
3074 FreeDir(dbspace_dir);
3075 }
3076
/*
 * looks_like_temp_rel_name
 *
 * Decide whether "name" has the form of a temporary relation file name:
 *
 *      t<digits>_<digits>, or t<digits>_<digits>_<forkname>
 *
 * optionally followed by ".<digits>" (a segment suffix).  Returns true
 * only if the whole string matches.
 */
bool
looks_like_temp_rel_name(const char *name)
{
    int         pos;
    int         start;

    /* Must start with "t". */
    if (name[0] != 't')
        return false;

    /* Followed by a non-empty string of digits and then an underscore. */
    pos = 1;
    while (isdigit((unsigned char) name[pos]))
        pos++;
    if (pos == 1 || name[pos] != '_')
        return false;

    /* Followed by another nonempty string of digits. */
    pos++;
    start = pos;
    while (isdigit((unsigned char) name[pos]))
        pos++;
    if (pos == start)
        return false;

    /* We might have _forkname or .segment or both. */
    if (name[pos] == '_')
    {
        int         forkchar = forkname_chars(&name[pos + 1], NULL);

        if (forkchar <= 0)
            return false;
        /* step over the underscore plus the fork name */
        pos += forkchar + 1;
    }
    if (name[pos] == '.')
    {
        int         ndigits = 0;

        /* The dot must be followed by at least one digit. */
        while (isdigit((unsigned char) name[pos + 1 + ndigits]))
            ndigits++;
        if (ndigits == 0)
            return false;
        /* step over the dot plus the digits */
        pos += ndigits + 1;
    }

    /* Now we should be at the end. */
    return name[pos] == '\0';
}
3125
3126
3127 /*
3128 * Issue fsync recursively on PGDATA and all its contents.
3129 *
3130 * We fsync regular files and directories wherever they are, but we
3131 * follow symlinks only for pg_wal and immediately under pg_tblspc.
3132 * Other symlinks are presumed to point at files we're not responsible
3133 * for fsyncing, and might not have privileges to write at all.
3134 *
3135 * Errors are logged but not considered fatal; that's because this is used
3136 * only during database startup, to deal with the possibility that there are
3137 * issued-but-unsynced writes pending against the data directory. We want to
3138 * ensure that such writes reach disk before anything that's done in the new
3139 * run. However, aborting on error would result in failure to start for
3140 * harmless cases such as read-only files in the data directory, and that's
3141 * not good either.
3142 *
3143 * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3144 * rewriting all changes again during recovery.
3145 *
3146 * Note we assume we're chdir'd into PGDATA to begin with.
3147 */
3148 void
SyncDataDirectory(void)3149 SyncDataDirectory(void)
3150 {
3151 bool xlog_is_symlink;
3152
3153 /* We can skip this whole thing if fsync is disabled. */
3154 if (!enableFsync)
3155 return;
3156
3157 /*
3158 * If pg_wal is a symlink, we'll need to recurse into it separately,
3159 * because the first walkdir below will ignore it.
3160 */
3161 xlog_is_symlink = false;
3162
3163 #ifndef WIN32
3164 {
3165 struct stat st;
3166
3167 if (lstat("pg_wal", &st) < 0)
3168 ereport(LOG,
3169 (errcode_for_file_access(),
3170 errmsg("could not stat file \"%s\": %m",
3171 "pg_wal")));
3172 else if (S_ISLNK(st.st_mode))
3173 xlog_is_symlink = true;
3174 }
3175 #else
3176 if (pgwin32_is_junction("pg_wal"))
3177 xlog_is_symlink = true;
3178 #endif
3179
3180 /*
3181 * If possible, hint to the kernel that we're soon going to fsync the data
3182 * directory and its contents. Errors in this step are even less
3183 * interesting than normal, so log them only at DEBUG1.
3184 */
3185 #ifdef PG_FLUSH_DATA_WORKS
3186 walkdir(".", pre_sync_fname, false, DEBUG1);
3187 if (xlog_is_symlink)
3188 walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3189 walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
3190 #endif
3191
3192 /*
3193 * Now we do the fsync()s in the same order.
3194 *
3195 * The main call ignores symlinks, so in addition to specially processing
3196 * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3197 * process_symlinks = true. Note that if there are any plain directories
3198 * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3199 * so we don't worry about optimizing it.
3200 */
3201 walkdir(".", datadir_fsync_fname, false, LOG);
3202 if (xlog_is_symlink)
3203 walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3204 walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
3205 }
3206
3207 /*
3208 * walkdir: recursively walk a directory, applying the action to each
3209 * regular file and directory (including the named directory itself).
3210 *
3211 * If process_symlinks is true, the action and recursion are also applied
3212 * to regular files and directories that are pointed to by symlinks in the
3213 * given directory; otherwise symlinks are ignored. Symlinks are always
3214 * ignored in subdirectories, ie we intentionally don't pass down the
3215 * process_symlinks flag to recursive calls.
3216 *
3217 * Errors are reported at level elevel, which might be ERROR or less.
3218 *
3219 * See also walkdir in initdb.c, which is a frontend version of this logic.
3220 */
3221 static void
walkdir(const char * path,void (* action)(const char * fname,bool isdir,int elevel),bool process_symlinks,int elevel)3222 walkdir(const char *path,
3223 void (*action) (const char *fname, bool isdir, int elevel),
3224 bool process_symlinks,
3225 int elevel)
3226 {
3227 DIR *dir;
3228 struct dirent *de;
3229
3230 dir = AllocateDir(path);
3231
3232 while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3233 {
3234 char subpath[MAXPGPATH * 2];
3235 struct stat fst;
3236 int sret;
3237
3238 CHECK_FOR_INTERRUPTS();
3239
3240 if (strcmp(de->d_name, ".") == 0 ||
3241 strcmp(de->d_name, "..") == 0)
3242 continue;
3243
3244 snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3245
3246 if (process_symlinks)
3247 sret = stat(subpath, &fst);
3248 else
3249 sret = lstat(subpath, &fst);
3250
3251 if (sret < 0)
3252 {
3253 ereport(elevel,
3254 (errcode_for_file_access(),
3255 errmsg("could not stat file \"%s\": %m", subpath)));
3256 continue;
3257 }
3258
3259 if (S_ISREG(fst.st_mode))
3260 (*action) (subpath, false, elevel);
3261 else if (S_ISDIR(fst.st_mode))
3262 walkdir(subpath, action, false, elevel);
3263 }
3264
3265 FreeDir(dir); /* we ignore any error here */
3266
3267 /*
3268 * It's important to fsync the destination directory itself as individual
3269 * file fsyncs don't guarantee that the directory entry for the file is
3270 * synced. However, skip this if AllocateDir failed; the action function
3271 * might not be robust against that.
3272 */
3273 if (dir)
3274 (*action) (path, true, elevel);
3275 }
3276
3277
/*
 * pre_sync_fname -- hint to the OS that this file is about to be fsync()'d.
 *
 * Directories are skipped.  A failure to open a file because it is
 * unreadable (EACCES) is ignored silently; other open failures, and close
 * failures, are reported at the caller-specified elevel.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			hintfd;

	/* Flushing a directory would most likely just fail, so don't try. */
	if (isdir)
		return;

	hintfd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);
	if (hintfd < 0)
	{
		/* Stay quiet about unreadable files; report anything else. */
		if (errno != EACCES)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * This is only a hint, so it is fine that pg_flush_data() ignores
	 * errors.
	 */
	pg_flush_data(hintfd, 0, 0);

	if (CloseTransientFile(hintfd) != 0)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", fname)));
}

#endif							/* PG_FLUSH_DATA_WORKS */
3320
/*
 * datadir_fsync_fname -- walkdir action that fsyncs one file or directory.
 *
 * Errors about unreadable files should be ignored silently, so that wish is
 * passed on to fsync_fname_ext().  Its result code is not used.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	const bool	ignore_perm = true;

	(void) fsync_fname_ext(fname, isdir, ignore_perm, elevel);
}
3330
/*
 * unlink_if_exists_fname -- remove one file or directory.
 *
 * Directories are removed with rmdir(); a directory that is already gone
 * (ENOENT) is not an error, while any other failure is reported at elevel.
 * Plain files are handed to PathNameDeleteTemporaryFile(); elevel is not
 * used on that path.
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (!isdir)
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
		return;
	}

	/* Successful removal: nothing more to do. */
	if (rmdir(fname) == 0)
		return;

	/* The directory not existing is fine as well. */
	if (errno == ENOENT)
		return;

	ereport(elevel,
			(errcode_for_file_access(),
			 errmsg("could not remove directory \"%s\": %m", fname)));
}
3347
3348 /*
3349 * fsync_fname_ext -- Try to fsync a file or directory
3350 *
3351 * If ignore_perm is true, ignore errors upon trying to open unreadable
3352 * files. Logs other errors at a caller-specified level.
3353 *
3354 * Returns 0 if the operation succeeded, -1 otherwise.
3355 */
3356 static int
fsync_fname_ext(const char * fname,bool isdir,bool ignore_perm,int elevel)3357 fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3358 {
3359 int fd;
3360 int flags;
3361 int returncode;
3362
3363 /*
3364 * Some OSs require directories to be opened read-only whereas other
3365 * systems don't allow us to fsync files opened read-only; so we need both
3366 * cases here. Using O_RDWR will cause us to fail to fsync files that are
3367 * not writable by our userid, but we assume that's OK.
3368 */
3369 flags = PG_BINARY;
3370 if (!isdir)
3371 flags |= O_RDWR;
3372 else
3373 flags |= O_RDONLY;
3374
3375 fd = OpenTransientFile(fname, flags);
3376
3377 /*
3378 * Some OSs don't allow us to open directories at all (Windows returns
3379 * EACCES), just ignore the error in that case. If desired also silently
3380 * ignoring errors about unreadable files. Log others.
3381 */
3382 if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3383 return 0;
3384 else if (fd < 0 && ignore_perm && errno == EACCES)
3385 return 0;
3386 else if (fd < 0)
3387 {
3388 ereport(elevel,
3389 (errcode_for_file_access(),
3390 errmsg("could not open file \"%s\": %m", fname)));
3391 return -1;
3392 }
3393
3394 returncode = pg_fsync(fd);
3395
3396 /*
3397 * Some OSes don't allow us to fsync directories at all, so we can ignore
3398 * those errors. Anything else needs to be logged.
3399 */
3400 if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3401 {
3402 int save_errno;
3403
3404 /* close file upon error, might not be in transaction context */
3405 save_errno = errno;
3406 (void) CloseTransientFile(fd);
3407 errno = save_errno;
3408
3409 ereport(elevel,
3410 (errcode_for_file_access(),
3411 errmsg("could not fsync file \"%s\": %m", fname)));
3412 return -1;
3413 }
3414
3415 if (CloseTransientFile(fd))
3416 {
3417 ereport(elevel,
3418 (errcode_for_file_access(),
3419 errmsg("could not close file \"%s\": %m", fname)));
3420 return -1;
3421 }
3422
3423 return 0;
3424 }
3425
3426 /*
3427 * fsync_parent_path -- fsync the parent path of a file or directory
3428 *
3429 * This is aimed at making file operations persistent on disk in case of
3430 * an OS crash or power failure.
3431 */
3432 static int
fsync_parent_path(const char * fname,int elevel)3433 fsync_parent_path(const char *fname, int elevel)
3434 {
3435 char parentpath[MAXPGPATH];
3436
3437 strlcpy(parentpath, fname, MAXPGPATH);
3438 get_parent_directory(parentpath);
3439
3440 /*
3441 * get_parent_directory() returns an empty string if the input argument is
3442 * just a file name (see comments in path.c), so handle that as being the
3443 * current directory.
3444 */
3445 if (strlen(parentpath) == 0)
3446 strlcpy(parentpath, ".", MAXPGPATH);
3447
3448 if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3449 return -1;
3450
3451 return 0;
3452 }
3453
3454 /*
3455 * Create a PostgreSQL data sub-directory
3456 *
3457 * The data directory itself, and most of its sub-directories, are created at
3458 * initdb time, but we do have some occasions when we create directories in
3459 * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3460 * make sure that those directories are created consistently. Today, that means
3461 * making sure that the created directory has the correct permissions, which is
3462 * what pg_dir_create_mode tracks for us.
3463 *
3464 * Note that we also set the umask() based on what we understand the correct
3465 * permissions to be (see file_perm.c).
3466 *
3467 * For permissions other than the default, mkdir() can be used directly, but
3468 * be sure to consider carefully such cases -- a sub-directory with incorrect
3469 * permissions in a PostgreSQL data directory could cause backups and other
3470 * processes to fail.
3471 */
3472 int
MakePGDirectory(const char * directoryName)3473 MakePGDirectory(const char *directoryName)
3474 {
3475 return mkdir(directoryName, pg_dir_create_mode);
3476 }
3477
3478 /*
3479 * Return the passed-in error level, or PANIC if data_sync_retry is off.
3480 *
3481 * Failure to fsync any data file is cause for immediate panic, unless
3482 * data_sync_retry is enabled. Data may have been written to the operating
3483 * system and removed from our buffer pool already, and if we are running on
3484 * an operating system that forgets dirty data on write-back failure, there
3485 * may be only one copy of the data remaining: in the WAL. A later attempt to
3486 * fsync again might falsely report success. Therefore we must not allow any
3487 * further checkpoints to be attempted. data_sync_retry can in theory be
3488 * enabled on systems known not to drop dirty buffered data on write-back
3489 * failure (with the likely outcome that checkpoints will continue to fail
3490 * until the underlying problem is fixed).
3491 *
3492 * Any code that reports a failure from fsync() or related functions should
3493 * filter the error level with this function.
3494 */
3495 int
data_sync_elevel(int elevel)3496 data_sync_elevel(int elevel)
3497 {
3498 return data_sync_retry ? elevel : PANIC;
3499 }
3500