1 /*-------------------------------------------------------------------------
2 *
3 * fd.c
4 * Virtual file descriptor code.
5 *
6 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 * IDENTIFICATION
10 * src/backend/storage/file/fd.c
11 *
12 * NOTES:
13 *
14 * This code manages a cache of 'virtual' file descriptors (VFDs).
15 * The server opens many file descriptors for a variety of reasons,
16 * including base tables, scratch files (e.g., sort and hash spool
17 * files), and random calls to C library routines like system(3); it
18 * is quite easy to exceed system limits on the number of open files a
19 * single process can have. (This is around 1024 on many modern
20 * operating systems, but may be lower on others.)
21 *
 * VFDs are managed as an LRU pool, with actual OS file descriptors
 * being opened and closed as needed.  Obviously, if a file is
 * opened using these interfaces, all subsequent operations on it must
 * also go through these interfaces (the File type is not a real file
 * descriptor).
27 *
28 * For this scheme to work, most (if not all) routines throughout the
29 * server should use these interfaces instead of calling the C library
30 * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31 * may find ourselves short of real file descriptors anyway.
32 *
33 * INTERFACE ROUTINES
34 *
35 * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36 * A File opened with OpenTemporaryFile is automatically deleted when the
37 * File is closed, either explicitly or implicitly at end of transaction or
 * process exit.  PathNameOpenFile is intended for files that are held open
 * for a long time, like relation files.  It is the caller's responsibility
 * to close them; there is no automatic mechanism in fd.c for that.
41 *
42 * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43 * temporary files that have names so that they can be shared between
44 * backends. Such files are automatically closed and count against the
45 * temporary file limit of the backend that creates them, but unlike anonymous
46 * files they are not automatically deleted. See sharedfileset.c for a shared
47 * ownership mechanism that provides automatic cleanup for shared files when
48 * the last of a group of backends detaches.
49 *
50 * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51 * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52 * They behave like the corresponding native functions, except that the handle
53 * is registered with the current subtransaction, and will be automatically
54 * closed at abort. These are intended mainly for short operations like
55 * reading a configuration file; there is a limit on the number of files that
56 * can be opened using these functions at any one time.
57 *
 * Finally, BasicOpenFile is just a thin wrapper around open() that can
 * release file descriptors in use by the virtual file descriptors if
 * necessary.  There is no automatic cleanup of file descriptors returned by
 * BasicOpenFile; it is solely the caller's responsibility to close the file
 * descriptor by calling close(2).
63 *
64 * If a non-virtual file descriptor needs to be held open for any length of
65 * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
66 * (and eventually ReleaseExternalFD), so that we can take it into account
67 * while deciding how many VFDs can be open. This applies to FDs obtained
68 * with BasicOpenFile as well as those obtained without use of any fd.c API.
69 *
70 *-------------------------------------------------------------------------
71 */
72
73 #include "postgres.h"
74
75 #include <sys/file.h>
76 #include <sys/param.h>
77 #include <sys/stat.h>
78 #ifndef WIN32
79 #include <sys/mman.h>
80 #endif
81 #include <limits.h>
82 #include <unistd.h>
83 #include <fcntl.h>
84 #ifdef HAVE_SYS_RESOURCE_H
85 #include <sys/resource.h> /* for getrlimit */
86 #endif
87
88 #include "access/xact.h"
89 #include "access/xlog.h"
90 #include "catalog/pg_tablespace.h"
91 #include "common/file_perm.h"
92 #include "miscadmin.h"
93 #include "pgstat.h"
94 #include "portability/mem.h"
95 #include "storage/fd.h"
96 #include "storage/ipc.h"
97 #include "utils/guc.h"
98 #include "utils/resowner_private.h"
99
/*
 * PG_FLUSH_DATA_WORKS is defined whenever at least one of the flushing
 * methods compiled into pg_flush_data() is available on this platform:
 * sync_file_range(), msync(MS_ASYNC) on non-Windows systems, or
 * posix_fadvise(POSIX_FADV_DONTNEED).
 */
#if defined(HAVE_SYNC_FILE_RANGE) || \
    (!defined(WIN32) && defined(MS_ASYNC)) || \
    (defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED))
#define PG_FLUSH_DATA_WORKS 1
#endif
108
/*
 * We must leave some file descriptors free for system(), the dynamic loader,
 * and other code that tries to open files without consulting fd.c.  This
 * is the number left free.  (While we try fairly hard to prevent EMFILE
 * errors, there's never any guarantee that we won't get ENFILE due to
 * other processes chewing up FDs.  So it's a bad idea to try to open files
 * without consulting fd.c.  Nonetheless we cannot control all code.)
 *
 * Because this is just a fixed setting, we are effectively assuming that
 * no such code will leave FDs open over the long term; otherwise the slop
 * is likely to be insufficient.  Note in particular that we expect that
 * loading a shared library does not result in any permanent increase in
 * the number of open files.  (This appears to be true on most if not
 * all platforms as of Feb 2004.)
 *
 * set_max_safe_fds() subtracts this amount from the probed FD budget.
 */
#define NUM_RESERVED_FDS        10

/*
 * If we have fewer than this many usable FDs after allowing for the reserved
 * ones, choke.  (This value is chosen to work with "ulimit -n 64", but not
 * much less than that.  Note that this value ensures numExternalFDs can be
 * at least 16; as of this writing, the contrib/postgres_fdw regression tests
 * will not pass unless that can grow to at least 14.)
 *
 * set_max_safe_fds() reports FATAL when the computed max_safe_fds is below
 * this value; it is also the initial value of max_safe_fds.
 */
#define FD_MINFREE              48

/*
 * A number of platforms allow individual processes to open many more files
 * than they can really support when *many* processes do the same thing.
 * This GUC parameter lets the DBA limit max_safe_fds to something less than
 * what the postmaster's initial probe suggests will work.
 *
 * set_max_safe_fds() uses it both as the probe limit handed to
 * count_usable_fds() and as an upper bound on the resulting budget.
 */
int         max_files_per_process = 1000;

/*
 * Maximum number of file descriptors to open for operations that fd.c knows
 * about (VFDs, AllocateFile etc, or "external" FDs).  This is initialized
 * to a conservative value, and remains that way indefinitely in bootstrap or
 * standalone-backend cases.  In normal postmaster operation, the postmaster
 * calls set_max_safe_fds() late in initialization to update the value, and
 * that value is then inherited by forked subprocesses.
 *
 * Note: the value of max_files_per_process is taken into account while
 * setting this variable, and so need not be tested separately.
 */
int         max_safe_fds = FD_MINFREE;  /* default if not changed */

/* Whether it is safe to continue running after fsync() fails. */
bool        data_sync_retry = false;
158
/* Debugging.... */

/*
 * DO_DB(A) executes the debugging statement A only when FDDEBUG is defined.
 * errno is saved before A runs and restored afterwards, so whatever A does
 * cannot change the errno seen by the surrounding code.  Without FDDEBUG the
 * macro expands to a no-op expression and A is not evaluated at all.
 */
#ifdef FDDEBUG
#define DO_DB(A) \
    do { \
        int         _do_db_save_errno = errno; \
        A; \
        errno = _do_db_save_errno; \
    } while (0)
#else
#define DO_DB(A) \
    ((void) 0)
#endif
172
/* Value of Vfd.fd while no kernel file descriptor is assigned to the VFD */
#define VFD_CLOSED (-1)

/*
 * A File is valid when it is a nonzero index within the current VfdCache
 * array and the entry has a file name.  Index 0 never qualifies.
 * Note: the argument is evaluated more than once.
 */
#define FileIsValid(file) \
    ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)

/* True when the VFD currently has no kernel file descriptor (no bounds check) */
#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)

/* these are the assigned bits in fdstate below: */
#define FD_DELETE_AT_CLOSE  (1 << 0)    /* T = delete when closed */
#define FD_CLOSE_AT_EOXACT  (1 << 1)    /* T = close at eoXact */
#define FD_TEMP_FILE_LIMIT  (1 << 2)    /* T = respect temp_file_limit */
184
/*
 * One entry of the virtual file descriptor cache (VfdCache).
 *
 * An entry is in use when fileName is non-NULL.  fileName, fileFlags and
 * fileMode hold what is needed to (re)open the underlying file, while fd
 * holds the kernel descriptor currently assigned, or VFD_CLOSED if none.
 * The link fields are File values, i.e. indexes into VfdCache.
 */
typedef struct vfd
{
    int         fd;             /* current FD, or VFD_CLOSED if none */
    unsigned short fdstate;     /* bitflags for VFD's state */
    ResourceOwner resowner;     /* owner, for automatic cleanup */
    File        nextFree;       /* link to next free VFD, if in freelist */
    File        lruMoreRecently;    /* doubly linked recency-of-use list */
    File        lruLessRecently;
    off_t       fileSize;       /* current size of file (0 if not temporary) */
    char       *fileName;       /* name of file, or NULL for unused VFD */
    /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
    int         fileFlags;      /* open(2) flags for (re)opening the file */
    mode_t      fileMode;       /* mode to pass to open(2) */
} Vfd;
199
/*
 * Virtual File Descriptor array pointer and size.  This grows as
 * needed.  'File' values are indexes into this array.
 * Note that VfdCache[0] is not a usable VFD, just a list header.
 *
 * InitFileAccess() allocates the header entry and sets SizeVfdCache to 1;
 * until then SizeVfdCache is 0.
 */
static Vfd *VfdCache;
static Size SizeVfdCache = 0;

/*
 * Number of file descriptors known to be in use by VFD entries.
 */
static int  nfile = 0;

/*
 * Flag to tell whether it's worth scanning VfdCache looking for temp files
 * to close
 */
static bool have_xact_temporary_files = false;

/*
 * Tracks the total size of all temporary files.  Note: when temp_file_limit
 * is being enforced, this cannot overflow since the limit cannot be more
 * than INT_MAX kilobytes.  When not enforcing, it could theoretically
 * overflow, but we don't care.
 */
static uint64 temporary_files_size = 0;
226
/*
 * List of OS handles opened with AllocateFile, AllocateDir and
 * OpenTransientFile.
 *
 * NOTE(review): AllocateDescPipe presumably covers handles obtained from
 * OpenPipeStream, which the file header lists alongside the functions
 * above -- confirm against that function.
 */
typedef enum
{
    AllocateDescFile,
    AllocateDescPipe,
    AllocateDescDir,
    AllocateDescRawFD
} AllocateDescKind;

/*
 * One tracked handle.  'kind' tells which member of 'desc' is meaningful.
 */
typedef struct
{
    AllocateDescKind kind;
    /* presumably the subtransaction that opened the handle -- verify */
    SubTransactionId create_subid;
    union
    {
        FILE       *file;
        DIR        *dir;
        int         fd;
    }           desc;
} AllocateDesc;

/* entries in use, allocated capacity, and the array itself */
static int  numAllocatedDescs = 0;
static int  maxAllocatedDescs = 0;
static AllocateDesc *allocatedDescs = NULL;
254
/*
 * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
 */
static int  numExternalFDs = 0;

/*
 * Number of temporary files opened during the current session;
 * this is used in generation of tempfile names.
 */
static long tempFileCounter = 0;

/*
 * Array of OIDs of temp tablespaces.  (Some entries may be InvalidOid,
 * indicating that the current database's default tablespace should be used.)
 * When numTempTableSpaces is -1, this has not been set in the current
 * transaction.
 */
static Oid *tempTableSpaces = NULL;
static int  numTempTableSpaces = -1;    /* -1 means "not set", see above */
static int  nextTempTableSpace = 0;
275
276
/*--------------------
 *
 * Private Routines
 *
 * Delete          - delete a file from the Lru ring
 * LruDelete       - remove a file from the Lru ring and close its FD
 * Insert          - put a file at the front of the Lru ring
 * LruInsert       - put a file at the front of the Lru ring and open it
 * ReleaseLruFile  - Release an fd by closing the last entry in the Lru ring
 * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
 * AllocateVfd     - grab a free (or new) file record (from VfdCache)
 * FreeVfd         - free a file record
 *
 * The Least Recently Used ring is a doubly linked list that begins and
 * ends on element zero.  Element zero is special -- it doesn't represent
 * a file and its "fd" field always == VFD_CLOSED.  Element zero is just an
 * anchor that shows us the beginning/end of the ring.
 * Only VFD elements that are currently really open (have an FD assigned) are
 * in the Lru ring.  Elements that are "virtually" open can be recognized
 * by having a non-null fileName field.
 *
 * example:
 *
 *     /--less----\                /---------\
 *     v           \              v           \
 *   #0 --more---> LeastRecentlyUsed --more-\ \
 *    ^\                                    | |
 *     \\less--> MostRecentlyUsedFile   <---/ |
 *      \more---/                    \--less--/
 *
 *--------------------
 */
static void Delete(File file);
static void LruDelete(File file);
static void Insert(File file);
static int  LruInsert(File file);
static bool ReleaseLruFile(void);
static void ReleaseLruFiles(void);
static File AllocateVfd(void);
static void FreeVfd(File file);

static int  FileAccess(File file);
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
static bool reserveAllocatedDesc(void);
static int  FreeDesc(AllocateDesc *desc);

/* AtProcExit_Files is registered with on_proc_exit() by InitFileAccess() */
static void AtProcExit_Files(int code, Datum arg);
static void CleanupTempFiles(bool isCommit, bool isProcExit);
static void RemovePgTempRelationFiles(const char *tsdirname);
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);

/*
 * walkdir and callbacks whose signatures match its 'action' parameter.
 * pre_sync_fname exists only when pg_flush_data has an implementation.
 */
static void walkdir(const char *path,
                    void (*action) (const char *fname, bool isdir, int elevel),
                    bool process_symlinks,
                    int elevel);
#ifdef PG_FLUSH_DATA_WORKS
static void pre_sync_fname(const char *fname, bool isdir, int elevel);
#endif
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);

/* used by the durable_* routines to fsync the directory containing fname */
static int  fsync_parent_path(const char *fname, int elevel);
339
340
/*
 * pg_fsync --- flush a descriptor using the configured sync method
 *
 * When the build has a separate fsync-writethrough implementation and
 * sync_method selects it, the work is handed to pg_fsync_writethrough();
 * in every other case pg_fsync_no_writethrough() is used.  The result of
 * the chosen routine is returned unchanged.
 */
int
pg_fsync(int fd)
{
#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
    struct stat statbuf;

    /*
     * fsync() implementations differ in which access modes they require of
     * the descriptor, and the requirement depends on whether the descriptor
     * refers to a directory.  To keep callers portable, insist here that
     * plain files were opened writable (O_RDWR or O_WRONLY) and directories
     * were opened without any write flag (i.e. O_RDONLY).
     *
     * The check is made before deciding whether to sync at all, so it is
     * also exercised when fsync is disabled.  An fstat() failure is simply
     * ignored; the fsync() that follows gets to report any real problem.
     */
    if (fstat(fd, &statbuf) == 0)
    {
        /* O_RDONLY is historically 0, so only the write bits are examined */
        int         write_bits = fcntl(fd, F_GETFL) & (O_RDWR | O_WRONLY);

        if (S_ISDIR(statbuf.st_mode))
            Assert(write_bits == 0);
        else
            Assert(write_bits != 0);
    }
    errno = 0;
#endif

    /* the sync_method test is compiled out when it could never matter */
#if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
    if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
        return pg_fsync_writethrough(fd);
#endif

    return pg_fsync_no_writethrough(fd);
}
393
394
395 /*
396 * pg_fsync_no_writethrough --- same as fsync except does nothing if
397 * enableFsync is off
398 */
399 int
pg_fsync_no_writethrough(int fd)400 pg_fsync_no_writethrough(int fd)
401 {
402 if (enableFsync)
403 return fsync(fd);
404 else
405 return 0;
406 }
407
408 /*
409 * pg_fsync_writethrough
410 */
411 int
pg_fsync_writethrough(int fd)412 pg_fsync_writethrough(int fd)
413 {
414 if (enableFsync)
415 {
416 #ifdef WIN32
417 return _commit(fd);
418 #elif defined(F_FULLFSYNC)
419 return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
420 #else
421 errno = ENOSYS;
422 return -1;
423 #endif
424 }
425 else
426 return 0;
427 }
428
429 /*
430 * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
431 *
432 * Not all platforms have fdatasync; treat as fsync if not available.
433 */
434 int
pg_fdatasync(int fd)435 pg_fdatasync(int fd)
436 {
437 if (enableFsync)
438 {
439 #ifdef HAVE_FDATASYNC
440 return fdatasync(fd);
441 #else
442 return fsync(fd);
443 #endif
444 }
445 else
446 return 0;
447 }
448
449 /*
450 * pg_flush_data --- advise OS that the described dirty data should be flushed
451 *
452 * offset of 0 with nbytes 0 means that the entire file should be flushed
453 */
454 void
pg_flush_data(int fd,off_t offset,off_t nbytes)455 pg_flush_data(int fd, off_t offset, off_t nbytes)
456 {
457 /*
458 * Right now file flushing is primarily used to avoid making later
459 * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
460 * if fsyncs are disabled - that's a decision we might want to make
461 * configurable at some point.
462 */
463 if (!enableFsync)
464 return;
465
466 /*
467 * We compile all alternatives that are supported on the current platform,
468 * to find portability problems more easily.
469 */
470 #if defined(HAVE_SYNC_FILE_RANGE)
471 {
472 int rc;
473 static bool not_implemented_by_kernel = false;
474
475 if (not_implemented_by_kernel)
476 return;
477
478 /*
479 * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
480 * tells the OS that writeback for the specified blocks should be
481 * started, but that we don't want to wait for completion. Note that
482 * this call might block if too much dirty data exists in the range.
483 * This is the preferable method on OSs supporting it, as it works
484 * reliably when available (contrast to msync()) and doesn't flush out
485 * clean data (like FADV_DONTNEED).
486 */
487 rc = sync_file_range(fd, offset, nbytes,
488 SYNC_FILE_RANGE_WRITE);
489 if (rc != 0)
490 {
491 int elevel;
492
493 /*
494 * For systems that don't have an implementation of
495 * sync_file_range() such as Windows WSL, generate only one
496 * warning and then suppress all further attempts by this process.
497 */
498 if (errno == ENOSYS)
499 {
500 elevel = WARNING;
501 not_implemented_by_kernel = true;
502 }
503 else
504 elevel = data_sync_elevel(WARNING);
505
506 ereport(elevel,
507 (errcode_for_file_access(),
508 errmsg("could not flush dirty data: %m")));
509 }
510
511 return;
512 }
513 #endif
514 #if !defined(WIN32) && defined(MS_ASYNC)
515 {
516 void *p;
517 static int pagesize = 0;
518
519 /*
520 * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
521 * writeback. On linux it only does so if MS_SYNC is specified, but
522 * then it does the writeback synchronously. Luckily all common linux
523 * systems have sync_file_range(). This is preferable over
524 * FADV_DONTNEED because it doesn't flush out clean data.
525 *
526 * We map the file (mmap()), tell the kernel to sync back the contents
527 * (msync()), and then remove the mapping again (munmap()).
528 */
529
530 /* mmap() needs actual length if we want to map whole file */
531 if (offset == 0 && nbytes == 0)
532 {
533 nbytes = lseek(fd, 0, SEEK_END);
534 if (nbytes < 0)
535 {
536 ereport(WARNING,
537 (errcode_for_file_access(),
538 errmsg("could not determine dirty data size: %m")));
539 return;
540 }
541 }
542
543 /*
544 * Some platforms reject partial-page mmap() attempts. To deal with
545 * that, just truncate the request to a page boundary. If any extra
546 * bytes don't get flushed, well, it's only a hint anyway.
547 */
548
549 /* fetch pagesize only once */
550 if (pagesize == 0)
551 pagesize = sysconf(_SC_PAGESIZE);
552
553 /* align length to pagesize, dropping any fractional page */
554 if (pagesize > 0)
555 nbytes = (nbytes / pagesize) * pagesize;
556
557 /* fractional-page request is a no-op */
558 if (nbytes <= 0)
559 return;
560
561 /*
562 * mmap could well fail, particularly on 32-bit platforms where there
563 * may simply not be enough address space. If so, silently fall
564 * through to the next implementation.
565 */
566 if (nbytes <= (off_t) SSIZE_MAX)
567 p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
568 else
569 p = MAP_FAILED;
570
571 if (p != MAP_FAILED)
572 {
573 int rc;
574
575 rc = msync(p, (size_t) nbytes, MS_ASYNC);
576 if (rc != 0)
577 {
578 ereport(data_sync_elevel(WARNING),
579 (errcode_for_file_access(),
580 errmsg("could not flush dirty data: %m")));
581 /* NB: need to fall through to munmap()! */
582 }
583
584 rc = munmap(p, (size_t) nbytes);
585 if (rc != 0)
586 {
587 /* FATAL error because mapping would remain */
588 ereport(FATAL,
589 (errcode_for_file_access(),
590 errmsg("could not munmap() while flushing data: %m")));
591 }
592
593 return;
594 }
595 }
596 #endif
597 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
598 {
599 int rc;
600
601 /*
602 * Signal the kernel that the passed in range should not be cached
603 * anymore. This has the, desired, side effect of writing out dirty
604 * data, and the, undesired, side effect of likely discarding useful
605 * clean cached blocks. For the latter reason this is the least
606 * preferable method.
607 */
608
609 rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
610
611 if (rc != 0)
612 {
613 /* don't error out, this is just a performance optimization */
614 ereport(WARNING,
615 (errcode_for_file_access(),
616 errmsg("could not flush dirty data: %m")));
617 }
618
619 return;
620 }
621 #endif
622 }
623
624
625 /*
626 * fsync_fname -- fsync a file or directory, handling errors properly
627 *
628 * Try to fsync a file or directory. When doing the latter, ignore errors that
629 * indicate the OS just doesn't allow/require fsyncing directories.
630 */
631 void
fsync_fname(const char * fname,bool isdir)632 fsync_fname(const char *fname, bool isdir)
633 {
634 fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
635 }
636
637 /*
638 * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
639 *
640 * This routine ensures that, after returning, the effect of renaming file
641 * persists in case of a crash. A crash while this routine is running will
642 * leave you with either the pre-existing or the moved file in place of the
643 * new file; no mixed state or truncated files are possible.
644 *
645 * It does so by using fsync on the old filename and the possibly existing
646 * target filename before the rename, and the target file and directory after.
647 *
648 * Note that rename() cannot be used across arbitrary directories, as they
649 * might not be on the same filesystem. Therefore this routine does not
650 * support renaming across directories.
651 *
652 * Log errors with the caller specified severity.
653 *
654 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
655 * valid upon return.
656 */
657 int
durable_rename(const char * oldfile,const char * newfile,int elevel)658 durable_rename(const char *oldfile, const char *newfile, int elevel)
659 {
660 int fd;
661
662 /*
663 * First fsync the old and target path (if it exists), to ensure that they
664 * are properly persistent on disk. Syncing the target file is not
665 * strictly necessary, but it makes it easier to reason about crashes;
666 * because it's then guaranteed that either source or target file exists
667 * after a crash.
668 */
669 if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
670 return -1;
671
672 fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
673 if (fd < 0)
674 {
675 if (errno != ENOENT)
676 {
677 ereport(elevel,
678 (errcode_for_file_access(),
679 errmsg("could not open file \"%s\": %m", newfile)));
680 return -1;
681 }
682 }
683 else
684 {
685 if (pg_fsync(fd) != 0)
686 {
687 int save_errno;
688
689 /* close file upon error, might not be in transaction context */
690 save_errno = errno;
691 CloseTransientFile(fd);
692 errno = save_errno;
693
694 ereport(elevel,
695 (errcode_for_file_access(),
696 errmsg("could not fsync file \"%s\": %m", newfile)));
697 return -1;
698 }
699
700 if (CloseTransientFile(fd) != 0)
701 {
702 ereport(elevel,
703 (errcode_for_file_access(),
704 errmsg("could not close file \"%s\": %m", newfile)));
705 return -1;
706 }
707 }
708
709 /* Time to do the real deal... */
710 if (rename(oldfile, newfile) < 0)
711 {
712 ereport(elevel,
713 (errcode_for_file_access(),
714 errmsg("could not rename file \"%s\" to \"%s\": %m",
715 oldfile, newfile)));
716 return -1;
717 }
718
719 /*
720 * To guarantee renaming the file is persistent, fsync the file with its
721 * new name, and its containing directory.
722 */
723 if (fsync_fname_ext(newfile, false, false, elevel) != 0)
724 return -1;
725
726 if (fsync_parent_path(newfile, elevel) != 0)
727 return -1;
728
729 return 0;
730 }
731
/*
 * durable_unlink -- remove a file and make the removal crash-safe
 *
 * After a successful return the removal persists across a crash; a crash
 * while this routine runs leaves no mixed state.  This is achieved by
 * fsyncing the file's parent directory once the unlink itself is done.
 *
 * Failures are reported at the caller-supplied elevel.
 *
 * Returns 0 on success and -1 on failure.  errno is not meaningful on
 * return.
 */
int
durable_unlink(const char *fname, int elevel)
{
    if (unlink(fname) < 0)
    {
        ereport(elevel,
                (errcode_for_file_access(),
                 errmsg("could not remove file \"%s\": %m",
                        fname)));
        return -1;
    }

    /* The directory entry is gone; flush the parent so that it stays gone */
    return (fsync_parent_path(fname, elevel) != 0) ? -1 : 0;
}
768
769 /*
770 * durable_rename_excl -- rename a file in a durable manner.
771 *
772 * Similar to durable_rename(), except that this routine tries (but does not
773 * guarantee) not to overwrite the target file.
774 *
775 * Note that a crash in an unfortunate moment can leave you with two links to
776 * the target file.
777 *
778 * Log errors with the caller specified severity.
779 *
780 * On Windows, using a hard link followed by unlink() causes concurrency
781 * issues, while a simple rename() does not cause that, so be careful when
782 * changing the logic of this routine.
783 *
784 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
785 * valid upon return.
786 */
787 int
durable_rename_excl(const char * oldfile,const char * newfile,int elevel)788 durable_rename_excl(const char *oldfile, const char *newfile, int elevel)
789 {
790 /*
791 * Ensure that, if we crash directly after the rename/link, a file with
792 * valid contents is moved into place.
793 */
794 if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
795 return -1;
796
797 #ifdef HAVE_WORKING_LINK
798 if (link(oldfile, newfile) < 0)
799 {
800 ereport(elevel,
801 (errcode_for_file_access(),
802 errmsg("could not link file \"%s\" to \"%s\": %m",
803 oldfile, newfile)));
804 return -1;
805 }
806 unlink(oldfile);
807 #else
808 if (rename(oldfile, newfile) < 0)
809 {
810 ereport(elevel,
811 (errcode_for_file_access(),
812 errmsg("could not rename file \"%s\" to \"%s\": %m",
813 oldfile, newfile)));
814 return -1;
815 }
816 #endif
817
818 /*
819 * Make change persistent in case of an OS crash, both the new entry and
820 * its parent directory need to be flushed.
821 */
822 if (fsync_fname_ext(newfile, false, false, elevel) != 0)
823 return -1;
824
825 /* Same for parent directory */
826 if (fsync_parent_path(newfile, elevel) != 0)
827 return -1;
828
829 return 0;
830 }
831
832 /*
833 * InitFileAccess --- initialize this module during backend startup
834 *
835 * This is called during either normal or standalone backend start.
836 * It is *not* called in the postmaster.
837 */
838 void
InitFileAccess(void)839 InitFileAccess(void)
840 {
841 Assert(SizeVfdCache == 0); /* call me only once */
842
843 /* initialize cache header entry */
844 VfdCache = (Vfd *) malloc(sizeof(Vfd));
845 if (VfdCache == NULL)
846 ereport(FATAL,
847 (errcode(ERRCODE_OUT_OF_MEMORY),
848 errmsg("out of memory")));
849
850 MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
851 VfdCache->fd = VFD_CLOSED;
852
853 SizeVfdCache = 1;
854
855 /* register proc-exit hook to ensure temp files are dropped at exit */
856 on_proc_exit(AtProcExit_Files, 0);
857 }
858
859 /*
860 * count_usable_fds --- count how many FDs the system will let us open,
861 * and estimate how many are already open.
862 *
863 * We stop counting if usable_fds reaches max_to_probe. Note: a small
864 * value of max_to_probe might result in an underestimate of already_open;
865 * we must fill in any "gaps" in the set of used FDs before the calculation
866 * of already_open will give the right answer. In practice, max_to_probe
867 * of a couple of dozen should be enough to ensure good results.
868 *
869 * We assume stdin (FD 0) is available for dup'ing
870 */
871 static void
count_usable_fds(int max_to_probe,int * usable_fds,int * already_open)872 count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
873 {
874 int *fd;
875 int size;
876 int used = 0;
877 int highestfd = 0;
878 int j;
879
880 #ifdef HAVE_GETRLIMIT
881 struct rlimit rlim;
882 int getrlimit_status;
883 #endif
884
885 size = 1024;
886 fd = (int *) palloc(size * sizeof(int));
887
888 #ifdef HAVE_GETRLIMIT
889 #ifdef RLIMIT_NOFILE /* most platforms use RLIMIT_NOFILE */
890 getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
891 #else /* but BSD doesn't ... */
892 getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
893 #endif /* RLIMIT_NOFILE */
894 if (getrlimit_status != 0)
895 ereport(WARNING, (errmsg("getrlimit failed: %m")));
896 #endif /* HAVE_GETRLIMIT */
897
898 /* dup until failure or probe limit reached */
899 for (;;)
900 {
901 int thisfd;
902
903 #ifdef HAVE_GETRLIMIT
904
905 /*
906 * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
907 * some platforms
908 */
909 if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
910 break;
911 #endif
912
913 thisfd = dup(0);
914 if (thisfd < 0)
915 {
916 /* Expect EMFILE or ENFILE, else it's fishy */
917 if (errno != EMFILE && errno != ENFILE)
918 elog(WARNING, "dup(0) failed after %d successes: %m", used);
919 break;
920 }
921
922 if (used >= size)
923 {
924 size *= 2;
925 fd = (int *) repalloc(fd, size * sizeof(int));
926 }
927 fd[used++] = thisfd;
928
929 if (highestfd < thisfd)
930 highestfd = thisfd;
931
932 if (used >= max_to_probe)
933 break;
934 }
935
936 /* release the files we opened */
937 for (j = 0; j < used; j++)
938 close(fd[j]);
939
940 pfree(fd);
941
942 /*
943 * Return results. usable_fds is just the number of successful dups. We
944 * assume that the system limit is highestfd+1 (remember 0 is a legal FD
945 * number) and so already_open is highestfd+1 - usable_fds.
946 */
947 *usable_fds = used;
948 *already_open = highestfd + 1 - used;
949 }
950
951 /*
952 * set_max_safe_fds
953 * Determine number of file descriptors that fd.c is allowed to use
954 */
955 void
set_max_safe_fds(void)956 set_max_safe_fds(void)
957 {
958 int usable_fds;
959 int already_open;
960
961 /*----------
962 * We want to set max_safe_fds to
963 * MIN(usable_fds, max_files_per_process - already_open)
964 * less the slop factor for files that are opened without consulting
965 * fd.c. This ensures that we won't exceed either max_files_per_process
966 * or the experimentally-determined EMFILE limit.
967 *----------
968 */
969 count_usable_fds(max_files_per_process,
970 &usable_fds, &already_open);
971
972 max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
973
974 /*
975 * Take off the FDs reserved for system() etc.
976 */
977 max_safe_fds -= NUM_RESERVED_FDS;
978
979 /*
980 * Make sure we still have enough to get by.
981 */
982 if (max_safe_fds < FD_MINFREE)
983 ereport(FATAL,
984 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
985 errmsg("insufficient file descriptors available to start server process"),
986 errdetail("System allows %d, we need at least %d.",
987 max_safe_fds + NUM_RESERVED_FDS,
988 FD_MINFREE + NUM_RESERVED_FDS)));
989
990 elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
991 max_safe_fds, usable_fds, already_open);
992 }
993
994 /*
995 * Open a file with BasicOpenFilePerm() and pass default file mode for the
996 * fileMode parameter.
997 */
998 int
BasicOpenFile(const char * fileName,int fileFlags)999 BasicOpenFile(const char *fileName, int fileFlags)
1000 {
1001 return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1002 }
1003
1004 /*
1005 * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
1006 *
1007 * This is exported for use by places that really want a plain kernel FD,
1008 * but need to be proof against running out of FDs. Once an FD has been
1009 * successfully returned, it is the caller's responsibility to ensure that
1010 * it will not be leaked on ereport()! Most users should *not* call this
1011 * routine directly, but instead use the VFD abstraction level, which
1012 * provides protection against descriptor leaks as well as management of
1013 * files that need to be open for more than a short period of time.
1014 *
1015 * Ideally this should be the *only* direct call of open() in the backend.
1016 * In practice, the postmaster calls open() directly, and there are some
1017 * direct open() calls done early in backend startup. Those are OK since
1018 * this module wouldn't have any open files to close at that point anyway.
1019 */
1020 int
BasicOpenFilePerm(const char * fileName,int fileFlags,mode_t fileMode)1021 BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1022 {
1023 int fd;
1024
1025 tryAgain:
1026 fd = open(fileName, fileFlags, fileMode);
1027
1028 if (fd >= 0)
1029 return fd; /* success! */
1030
1031 if (errno == EMFILE || errno == ENFILE)
1032 {
1033 int save_errno = errno;
1034
1035 ereport(LOG,
1036 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1037 errmsg("out of file descriptors: %m; release and retry")));
1038 errno = 0;
1039 if (ReleaseLruFile())
1040 goto tryAgain;
1041 errno = save_errno;
1042 }
1043
1044 return -1; /* failure */
1045 }
1046
1047 /*
1048 * AcquireExternalFD - attempt to reserve an external file descriptor
1049 *
1050 * This should be used by callers that need to hold a file descriptor open
1051 * over more than a short interval, but cannot use any of the other facilities
1052 * provided by this module.
1053 *
1054 * The difference between this and the underlying ReserveExternalFD function
1055 * is that this will report failure (by setting errno and returning false)
1056 * if "too many" external FDs are already reserved. This should be used in
1057 * any code where the total number of FDs to be reserved is not predictable
1058 * and small.
1059 */
1060 bool
AcquireExternalFD(void)1061 AcquireExternalFD(void)
1062 {
1063 /*
1064 * We don't want more than max_safe_fds / 3 FDs to be consumed for
1065 * "external" FDs.
1066 */
1067 if (numExternalFDs < max_safe_fds / 3)
1068 {
1069 ReserveExternalFD();
1070 return true;
1071 }
1072 errno = EMFILE;
1073 return false;
1074 }
1075
1076 /*
1077 * ReserveExternalFD - report external consumption of a file descriptor
1078 *
1079 * This should be used by callers that need to hold a file descriptor open
1080 * over more than a short interval, but cannot use any of the other facilities
1081 * provided by this module. This just tracks the use of the FD and closes
1082 * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
1083 *
1084 * Call this directly only in code where failure to reserve the FD would be
1085 * fatal; for example, the WAL-writing code does so, since the alternative is
1086 * session failure. Also, it's very unwise to do so in code that could
1087 * consume more than one FD per process.
1088 *
1089 * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
1090 * available, it doesn't matter too much whether this is called before or
1091 * after actually opening the FD; but doing so beforehand reduces the risk of
1092 * an EMFILE failure if not everybody played nice. In any case, it's solely
1093 * caller's responsibility to keep the external-FD count in sync with reality.
1094 */
1095 void
ReserveExternalFD(void)1096 ReserveExternalFD(void)
1097 {
1098 /*
1099 * Release VFDs if needed to stay safe. Because we do this before
1100 * incrementing numExternalFDs, the final state will be as desired, i.e.,
1101 * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
1102 */
1103 ReleaseLruFiles();
1104
1105 numExternalFDs++;
1106 }
1107
1108 /*
1109 * ReleaseExternalFD - report release of an external file descriptor
1110 *
1111 * This is guaranteed not to change errno, so it can be used in failure paths.
1112 */
1113 void
ReleaseExternalFD(void)1114 ReleaseExternalFD(void)
1115 {
1116 Assert(numExternalFDs > 0);
1117 numExternalFDs--;
1118 }
1119
1120
#if defined(FDDEBUG)

/*
 * _dump_lru --- debugging aid: log the contents of the LRU ring
 *
 * Starting from VfdCache[0], follows the lruLessRecently links until the
 * ring returns to entry 0, and emits the visited VFD numbers as a single
 * LOG message.
 */
static void
_dump_lru(void)
{
	char		buf[2048];
	size_t		used;
	int			cur = VfdCache[0].lruLessRecently;

	snprintf(buf, sizeof(buf), "LRU: MOST %d ", cur);

	while (cur != 0)
	{
		/* step to the next-less-recently-used entry and append it */
		cur = VfdCache[cur].lruLessRecently;
		used = strlen(buf);
		snprintf(buf + used, sizeof(buf) - used, "%d ", cur);
	}

	used = strlen(buf);
	snprintf(buf + used, sizeof(buf) - used, "LEAST");

	elog(LOG, "%s", buf);
}
#endif							/* FDDEBUG */
1141
1142 static void
Delete(File file)1143 Delete(File file)
1144 {
1145 Vfd *vfdP;
1146
1147 Assert(file != 0);
1148
1149 DO_DB(elog(LOG, "Delete %d (%s)",
1150 file, VfdCache[file].fileName));
1151 DO_DB(_dump_lru());
1152
1153 vfdP = &VfdCache[file];
1154
1155 VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1156 VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1157
1158 DO_DB(_dump_lru());
1159 }
1160
1161 static void
LruDelete(File file)1162 LruDelete(File file)
1163 {
1164 Vfd *vfdP;
1165
1166 Assert(file != 0);
1167
1168 DO_DB(elog(LOG, "LruDelete %d (%s)",
1169 file, VfdCache[file].fileName));
1170
1171 vfdP = &VfdCache[file];
1172
1173 /*
1174 * Close the file. We aren't expecting this to fail; if it does, better
1175 * to leak the FD than to mess up our internal state.
1176 */
1177 if (close(vfdP->fd) != 0)
1178 elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1179 "could not close file \"%s\": %m", vfdP->fileName);
1180 vfdP->fd = VFD_CLOSED;
1181 --nfile;
1182
1183 /* delete the vfd record from the LRU ring */
1184 Delete(file);
1185 }
1186
1187 static void
Insert(File file)1188 Insert(File file)
1189 {
1190 Vfd *vfdP;
1191
1192 Assert(file != 0);
1193
1194 DO_DB(elog(LOG, "Insert %d (%s)",
1195 file, VfdCache[file].fileName));
1196 DO_DB(_dump_lru());
1197
1198 vfdP = &VfdCache[file];
1199
1200 vfdP->lruMoreRecently = 0;
1201 vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1202 VfdCache[0].lruLessRecently = file;
1203 VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1204
1205 DO_DB(_dump_lru());
1206 }
1207
1208 /* returns 0 on success, -1 on re-open failure (with errno set) */
1209 static int
LruInsert(File file)1210 LruInsert(File file)
1211 {
1212 Vfd *vfdP;
1213
1214 Assert(file != 0);
1215
1216 DO_DB(elog(LOG, "LruInsert %d (%s)",
1217 file, VfdCache[file].fileName));
1218
1219 vfdP = &VfdCache[file];
1220
1221 if (FileIsNotOpen(file))
1222 {
1223 /* Close excess kernel FDs. */
1224 ReleaseLruFiles();
1225
1226 /*
1227 * The open could still fail for lack of file descriptors, eg due to
1228 * overall system file table being full. So, be prepared to release
1229 * another FD if necessary...
1230 */
1231 vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1232 vfdP->fileMode);
1233 if (vfdP->fd < 0)
1234 {
1235 DO_DB(elog(LOG, "re-open failed: %m"));
1236 return -1;
1237 }
1238 else
1239 {
1240 ++nfile;
1241 }
1242 }
1243
1244 /*
1245 * put it at the head of the Lru ring
1246 */
1247
1248 Insert(file);
1249
1250 return 0;
1251 }
1252
1253 /*
1254 * Release one kernel FD by closing the least-recently-used VFD.
1255 */
1256 static bool
ReleaseLruFile(void)1257 ReleaseLruFile(void)
1258 {
1259 DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1260
1261 if (nfile > 0)
1262 {
1263 /*
1264 * There are opened files and so there should be at least one used vfd
1265 * in the ring.
1266 */
1267 Assert(VfdCache[0].lruMoreRecently != 0);
1268 LruDelete(VfdCache[0].lruMoreRecently);
1269 return true; /* freed a file */
1270 }
1271 return false; /* no files available to free */
1272 }
1273
1274 /*
1275 * Release kernel FDs as needed to get under the max_safe_fds limit.
1276 * After calling this, it's OK to try to open another file.
1277 */
1278 static void
ReleaseLruFiles(void)1279 ReleaseLruFiles(void)
1280 {
1281 while (nfile + numAllocatedDescs + numExternalFDs >= max_safe_fds)
1282 {
1283 if (!ReleaseLruFile())
1284 break;
1285 }
1286 }
1287
1288 static File
AllocateVfd(void)1289 AllocateVfd(void)
1290 {
1291 Index i;
1292 File file;
1293
1294 DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1295
1296 Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1297
1298 if (VfdCache[0].nextFree == 0)
1299 {
1300 /*
1301 * The free list is empty so it is time to increase the size of the
1302 * array. We choose to double it each time this happens. However,
1303 * there's not much point in starting *real* small.
1304 */
1305 Size newCacheSize = SizeVfdCache * 2;
1306 Vfd *newVfdCache;
1307
1308 if (newCacheSize < 32)
1309 newCacheSize = 32;
1310
1311 /*
1312 * Be careful not to clobber VfdCache ptr if realloc fails.
1313 */
1314 newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1315 if (newVfdCache == NULL)
1316 ereport(ERROR,
1317 (errcode(ERRCODE_OUT_OF_MEMORY),
1318 errmsg("out of memory")));
1319 VfdCache = newVfdCache;
1320
1321 /*
1322 * Initialize the new entries and link them into the free list.
1323 */
1324 for (i = SizeVfdCache; i < newCacheSize; i++)
1325 {
1326 MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1327 VfdCache[i].nextFree = i + 1;
1328 VfdCache[i].fd = VFD_CLOSED;
1329 }
1330 VfdCache[newCacheSize - 1].nextFree = 0;
1331 VfdCache[0].nextFree = SizeVfdCache;
1332
1333 /*
1334 * Record the new size
1335 */
1336 SizeVfdCache = newCacheSize;
1337 }
1338
1339 file = VfdCache[0].nextFree;
1340
1341 VfdCache[0].nextFree = VfdCache[file].nextFree;
1342
1343 return file;
1344 }
1345
1346 static void
FreeVfd(File file)1347 FreeVfd(File file)
1348 {
1349 Vfd *vfdP = &VfdCache[file];
1350
1351 DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1352 file, vfdP->fileName ? vfdP->fileName : ""));
1353
1354 if (vfdP->fileName != NULL)
1355 {
1356 free(vfdP->fileName);
1357 vfdP->fileName = NULL;
1358 }
1359 vfdP->fdstate = 0x0;
1360
1361 vfdP->nextFree = VfdCache[0].nextFree;
1362 VfdCache[0].nextFree = file;
1363 }
1364
1365 /* returns 0 on success, -1 on re-open failure (with errno set) */
1366 static int
FileAccess(File file)1367 FileAccess(File file)
1368 {
1369 int returnValue;
1370
1371 DO_DB(elog(LOG, "FileAccess %d (%s)",
1372 file, VfdCache[file].fileName));
1373
1374 /*
1375 * Is the file open? If not, open it and put it at the head of the LRU
1376 * ring (possibly closing the least recently used file to get an FD).
1377 */
1378
1379 if (FileIsNotOpen(file))
1380 {
1381 returnValue = LruInsert(file);
1382 if (returnValue != 0)
1383 return returnValue;
1384 }
1385 else if (VfdCache[0].lruLessRecently != file)
1386 {
1387 /*
1388 * We now know that the file is open and that it is not the last one
1389 * accessed, so we need to move it to the head of the Lru ring.
1390 */
1391
1392 Delete(file);
1393 Insert(file);
1394 }
1395
1396 return 0;
1397 }
1398
1399 /*
1400 * Called whenever a temporary file is deleted to report its size.
1401 */
1402 static void
ReportTemporaryFileUsage(const char * path,off_t size)1403 ReportTemporaryFileUsage(const char *path, off_t size)
1404 {
1405 pgstat_report_tempfile(size);
1406
1407 if (log_temp_files >= 0)
1408 {
1409 if ((size / 1024) >= log_temp_files)
1410 ereport(LOG,
1411 (errmsg("temporary file: path \"%s\", size %lu",
1412 path, (unsigned long) size)));
1413 }
1414 }
1415
1416 /*
1417 * Called to register a temporary file for automatic close.
1418 * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
1419 * before the file was opened.
1420 */
1421 static void
RegisterTemporaryFile(File file)1422 RegisterTemporaryFile(File file)
1423 {
1424 ResourceOwnerRememberFile(CurrentResourceOwner, file);
1425 VfdCache[file].resowner = CurrentResourceOwner;
1426
1427 /* Backup mechanism for closing at end of xact. */
1428 VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
1429 have_xact_temporary_files = true;
1430 }
1431
/*
 * FileInvalidate --- called when we get a shared invalidation message on
 * some relation.
 *
 * Closes the VFD's kernel FD if it currently has one.  Compiled only when
 * NOT_USED is defined.
 */
#ifdef NOT_USED
void
FileInvalidate(File file)
{
	Assert(FileIsValid(file));

	if (FileIsNotOpen(file))
		return;					/* nothing to close */

	LruDelete(file);
}
#endif
1444
1445 /*
1446 * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1447 * fileMode parameter.
1448 */
1449 File
PathNameOpenFile(const char * fileName,int fileFlags)1450 PathNameOpenFile(const char *fileName, int fileFlags)
1451 {
1452 return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1453 }
1454
1455 /*
1456 * open a file in an arbitrary directory
1457 *
1458 * NB: if the passed pathname is relative (which it usually is),
1459 * it will be interpreted relative to the process' working directory
1460 * (which should always be $PGDATA when this code is running).
1461 */
1462 File
PathNameOpenFilePerm(const char * fileName,int fileFlags,mode_t fileMode)1463 PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1464 {
1465 char *fnamecopy;
1466 File file;
1467 Vfd *vfdP;
1468
1469 DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1470 fileName, fileFlags, fileMode));
1471
1472 /*
1473 * We need a malloc'd copy of the file name; fail cleanly if no room.
1474 */
1475 fnamecopy = strdup(fileName);
1476 if (fnamecopy == NULL)
1477 ereport(ERROR,
1478 (errcode(ERRCODE_OUT_OF_MEMORY),
1479 errmsg("out of memory")));
1480
1481 file = AllocateVfd();
1482 vfdP = &VfdCache[file];
1483
1484 /* Close excess kernel FDs. */
1485 ReleaseLruFiles();
1486
1487 vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1488
1489 if (vfdP->fd < 0)
1490 {
1491 int save_errno = errno;
1492
1493 FreeVfd(file);
1494 free(fnamecopy);
1495 errno = save_errno;
1496 return -1;
1497 }
1498 ++nfile;
1499 DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1500 vfdP->fd));
1501
1502 vfdP->fileName = fnamecopy;
1503 /* Saved flags are adjusted to be OK for re-opening file */
1504 vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1505 vfdP->fileMode = fileMode;
1506 vfdP->fileSize = 0;
1507 vfdP->fdstate = 0x0;
1508 vfdP->resowner = NULL;
1509
1510 Insert(file);
1511
1512 return file;
1513 }
1514
1515 /*
1516 * Create directory 'directory'. If necessary, create 'basedir', which must
1517 * be the directory above it. This is designed for creating the top-level
1518 * temporary directory on demand before creating a directory underneath it.
1519 * Do nothing if the directory already exists.
1520 *
1521 * Directories created within the top-level temporary directory should begin
1522 * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1523 * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1524 * that do not need any particular prefix.
1525 */
1526 void
PathNameCreateTemporaryDir(const char * basedir,const char * directory)1527 PathNameCreateTemporaryDir(const char *basedir, const char *directory)
1528 {
1529 if (MakePGDirectory(directory) < 0)
1530 {
1531 if (errno == EEXIST)
1532 return;
1533
1534 /*
1535 * Failed. Try to create basedir first in case it's missing. Tolerate
1536 * EEXIST to close a race against another process following the same
1537 * algorithm.
1538 */
1539 if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1540 ereport(ERROR,
1541 (errcode_for_file_access(),
1542 errmsg("cannot create temporary directory \"%s\": %m",
1543 basedir)));
1544
1545 /* Try again. */
1546 if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1547 ereport(ERROR,
1548 (errcode_for_file_access(),
1549 errmsg("cannot create temporary subdirectory \"%s\": %m",
1550 directory)));
1551 }
1552 }
1553
1554 /*
1555 * Delete a directory and everything in it, if it exists.
1556 */
1557 void
PathNameDeleteTemporaryDir(const char * dirname)1558 PathNameDeleteTemporaryDir(const char *dirname)
1559 {
1560 struct stat statbuf;
1561
1562 /* Silently ignore missing directory. */
1563 if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1564 return;
1565
1566 /*
1567 * Currently, walkdir doesn't offer a way for our passed in function to
1568 * maintain state. Perhaps it should, so that we could tell the caller
1569 * whether this operation succeeded or failed. Since this operation is
1570 * used in a cleanup path, we wouldn't actually behave differently: we'll
1571 * just log failures.
1572 */
1573 walkdir(dirname, unlink_if_exists_fname, false, LOG);
1574 }
1575
1576 /*
1577 * Open a temporary file that will disappear when we close it.
1578 *
1579 * This routine takes care of generating an appropriate tempfile name.
1580 * There's no need to pass in fileFlags or fileMode either, since only
1581 * one setting makes any sense for a temp file.
1582 *
1583 * Unless interXact is true, the file is remembered by CurrentResourceOwner
1584 * to ensure it's closed and deleted when it's no longer needed, typically at
1585 * the end-of-transaction. In most cases, you don't want temporary files to
1586 * outlive the transaction that created them, so this should be false -- but
1587 * if you need "somewhat" temporary storage, this might be useful. In either
1588 * case, the file is removed when the File is explicitly closed.
1589 */
1590 File
OpenTemporaryFile(bool interXact)1591 OpenTemporaryFile(bool interXact)
1592 {
1593 File file = 0;
1594
1595 /*
1596 * Make sure the current resource owner has space for this File before we
1597 * open it, if we'll be registering it below.
1598 */
1599 if (!interXact)
1600 ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1601
1602 /*
1603 * If some temp tablespace(s) have been given to us, try to use the next
1604 * one. If a given tablespace can't be found, we silently fall back to
1605 * the database's default tablespace.
1606 *
1607 * BUT: if the temp file is slated to outlive the current transaction,
1608 * force it into the database's default tablespace, so that it will not
1609 * pose a threat to possible tablespace drop attempts.
1610 */
1611 if (numTempTableSpaces > 0 && !interXact)
1612 {
1613 Oid tblspcOid = GetNextTempTableSpace();
1614
1615 if (OidIsValid(tblspcOid))
1616 file = OpenTemporaryFileInTablespace(tblspcOid, false);
1617 }
1618
1619 /*
1620 * If not, or if tablespace is bad, create in database's default
1621 * tablespace. MyDatabaseTableSpace should normally be set before we get
1622 * here, but just in case it isn't, fall back to pg_default tablespace.
1623 */
1624 if (file <= 0)
1625 file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
1626 MyDatabaseTableSpace :
1627 DEFAULTTABLESPACE_OID,
1628 true);
1629
1630 /* Mark it for deletion at close and temporary file size limit */
1631 VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
1632
1633 /* Register it with the current resource owner */
1634 if (!interXact)
1635 RegisterTemporaryFile(file);
1636
1637 return file;
1638 }
1639
1640 /*
1641 * Return the path of the temp directory in a given tablespace.
1642 */
1643 void
TempTablespacePath(char * path,Oid tablespace)1644 TempTablespacePath(char *path, Oid tablespace)
1645 {
1646 /*
1647 * Identify the tempfile directory for this tablespace.
1648 *
1649 * If someone tries to specify pg_global, use pg_default instead.
1650 */
1651 if (tablespace == InvalidOid ||
1652 tablespace == DEFAULTTABLESPACE_OID ||
1653 tablespace == GLOBALTABLESPACE_OID)
1654 snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1655 else
1656 {
1657 /* All other tablespaces are accessed via symlinks */
1658 snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
1659 tablespace, TABLESPACE_VERSION_DIRECTORY,
1660 PG_TEMP_FILES_DIR);
1661 }
1662 }
1663
1664 /*
1665 * Open a temporary file in a specific tablespace.
1666 * Subroutine for OpenTemporaryFile, which see for details.
1667 */
1668 static File
OpenTemporaryFileInTablespace(Oid tblspcOid,bool rejectError)1669 OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1670 {
1671 char tempdirpath[MAXPGPATH];
1672 char tempfilepath[MAXPGPATH];
1673 File file;
1674
1675 TempTablespacePath(tempdirpath, tblspcOid);
1676
1677 /*
1678 * Generate a tempfile name that should be unique within the current
1679 * database instance.
1680 */
1681 snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1682 tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1683
1684 /*
1685 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1686 * temp file that can be reused.
1687 */
1688 file = PathNameOpenFile(tempfilepath,
1689 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1690 if (file <= 0)
1691 {
1692 /*
1693 * We might need to create the tablespace's tempfile directory, if no
1694 * one has yet done so.
1695 *
1696 * Don't check for an error from MakePGDirectory; it could fail if
1697 * someone else just did the same thing. If it doesn't work then
1698 * we'll bomb out on the second create attempt, instead.
1699 */
1700 (void) MakePGDirectory(tempdirpath);
1701
1702 file = PathNameOpenFile(tempfilepath,
1703 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1704 if (file <= 0 && rejectError)
1705 elog(ERROR, "could not create temporary file \"%s\": %m",
1706 tempfilepath);
1707 }
1708
1709 return file;
1710 }
1711
1712
1713 /*
1714 * Create a new file. The directory containing it must already exist. Files
1715 * created this way are subject to temp_file_limit and are automatically
1716 * closed at end of transaction, but are not automatically deleted on close
1717 * because they are intended to be shared between cooperating backends.
1718 *
1719 * If the file is inside the top-level temporary directory, its name should
1720 * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1721 * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1722 * inside a directory created with PathNameCreateTemporaryDir(), in which case
1723 * the prefix isn't needed.
1724 */
1725 File
PathNameCreateTemporaryFile(const char * path,bool error_on_failure)1726 PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1727 {
1728 File file;
1729
1730 ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1731
1732 /*
1733 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1734 * temp file that can be reused.
1735 */
1736 file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1737 if (file <= 0)
1738 {
1739 if (error_on_failure)
1740 ereport(ERROR,
1741 (errcode_for_file_access(),
1742 errmsg("could not create temporary file \"%s\": %m",
1743 path)));
1744 else
1745 return file;
1746 }
1747
1748 /* Mark it for temp_file_limit accounting. */
1749 VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
1750
1751 /* Register it for automatic close. */
1752 RegisterTemporaryFile(file);
1753
1754 return file;
1755 }
1756
1757 /*
1758 * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1759 * another backend. Files opened this way don't count against the
1760 * temp_file_limit of the caller, are read-only and are automatically closed
1761 * at the end of the transaction but are not deleted on close.
1762 */
1763 File
PathNameOpenTemporaryFile(const char * path)1764 PathNameOpenTemporaryFile(const char *path)
1765 {
1766 File file;
1767
1768 ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1769
1770 /* We open the file read-only. */
1771 file = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
1772
1773 /* If no such file, then we don't raise an error. */
1774 if (file <= 0 && errno != ENOENT)
1775 ereport(ERROR,
1776 (errcode_for_file_access(),
1777 errmsg("could not open temporary file \"%s\": %m",
1778 path)));
1779
1780 if (file > 0)
1781 {
1782 /* Register it for automatic close. */
1783 RegisterTemporaryFile(file);
1784 }
1785
1786 return file;
1787 }
1788
1789 /*
1790 * Delete a file by pathname. Return true if the file existed, false if
1791 * didn't.
1792 */
1793 bool
PathNameDeleteTemporaryFile(const char * path,bool error_on_failure)1794 PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1795 {
1796 struct stat filestats;
1797 int stat_errno;
1798
1799 /* Get the final size for pgstat reporting. */
1800 if (stat(path, &filestats) != 0)
1801 stat_errno = errno;
1802 else
1803 stat_errno = 0;
1804
1805 /*
1806 * Unlike FileClose's automatic file deletion code, we tolerate
1807 * non-existence to support BufFileDeleteShared which doesn't know how
1808 * many segments it has to delete until it runs out.
1809 */
1810 if (stat_errno == ENOENT)
1811 return false;
1812
1813 if (unlink(path) < 0)
1814 {
1815 if (errno != ENOENT)
1816 ereport(error_on_failure ? ERROR : LOG,
1817 (errcode_for_file_access(),
1818 errmsg("could not unlink temporary file \"%s\": %m",
1819 path)));
1820 return false;
1821 }
1822
1823 if (stat_errno == 0)
1824 ReportTemporaryFileUsage(path, filestats.st_size);
1825 else
1826 {
1827 errno = stat_errno;
1828 ereport(LOG,
1829 (errcode_for_file_access(),
1830 errmsg("could not stat file \"%s\": %m", path)));
1831 }
1832
1833 return true;
1834 }
1835
1836 /*
1837 * close a file when done with it
1838 */
1839 void
FileClose(File file)1840 FileClose(File file)
1841 {
1842 Vfd *vfdP;
1843
1844 Assert(FileIsValid(file));
1845
1846 DO_DB(elog(LOG, "FileClose: %d (%s)",
1847 file, VfdCache[file].fileName));
1848
1849 vfdP = &VfdCache[file];
1850
1851 if (!FileIsNotOpen(file))
1852 {
1853 /* close the file */
1854 if (close(vfdP->fd) != 0)
1855 {
1856 /*
1857 * We may need to panic on failure to close non-temporary files;
1858 * see LruDelete.
1859 */
1860 elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1861 "could not close file \"%s\": %m", vfdP->fileName);
1862 }
1863
1864 --nfile;
1865 vfdP->fd = VFD_CLOSED;
1866
1867 /* remove the file from the lru ring */
1868 Delete(file);
1869 }
1870
1871 if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1872 {
1873 /* Subtract its size from current usage (do first in case of error) */
1874 temporary_files_size -= vfdP->fileSize;
1875 vfdP->fileSize = 0;
1876 }
1877
1878 /*
1879 * Delete the file if it was temporary, and make a log entry if wanted
1880 */
1881 if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
1882 {
1883 struct stat filestats;
1884 int stat_errno;
1885
1886 /*
1887 * If we get an error, as could happen within the ereport/elog calls,
1888 * we'll come right back here during transaction abort. Reset the
1889 * flag to ensure that we can't get into an infinite loop. This code
1890 * is arranged to ensure that the worst-case consequence is failing to
1891 * emit log message(s), not failing to attempt the unlink.
1892 */
1893 vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
1894
1895
1896 /* first try the stat() */
1897 if (stat(vfdP->fileName, &filestats))
1898 stat_errno = errno;
1899 else
1900 stat_errno = 0;
1901
1902 /* in any case do the unlink */
1903 if (unlink(vfdP->fileName))
1904 elog(LOG, "could not unlink file \"%s\": %m", vfdP->fileName);
1905
1906 /* and last report the stat results */
1907 if (stat_errno == 0)
1908 ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
1909 else
1910 {
1911 errno = stat_errno;
1912 elog(LOG, "could not stat file \"%s\": %m", vfdP->fileName);
1913 }
1914 }
1915
1916 /* Unregister it from the resource owner */
1917 if (vfdP->resowner)
1918 ResourceOwnerForgetFile(vfdP->resowner, file);
1919
1920 /*
1921 * Return the Vfd slot to the free list
1922 */
1923 FreeVfd(file);
1924 }
1925
1926 /*
1927 * FilePrefetch - initiate asynchronous read of a given range of the file.
1928 *
1929 * Currently the only implementation of this function is using posix_fadvise
1930 * which is the simplest standardized interface that accomplishes this.
1931 * We could add an implementation using libaio in the future; but note that
1932 * this API is inappropriate for libaio, which wants to have a buffer provided
1933 * to read into.
1934 */
1935 int
FilePrefetch(File file,off_t offset,int amount,uint32 wait_event_info)1936 FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
1937 {
1938 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
1939 int returnCode;
1940
1941 Assert(FileIsValid(file));
1942
1943 DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
1944 file, VfdCache[file].fileName,
1945 (int64) offset, amount));
1946
1947 returnCode = FileAccess(file);
1948 if (returnCode < 0)
1949 return returnCode;
1950
1951 pgstat_report_wait_start(wait_event_info);
1952 returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
1953 POSIX_FADV_WILLNEED);
1954 pgstat_report_wait_end();
1955
1956 return returnCode;
1957 #else
1958 Assert(FileIsValid(file));
1959 return 0;
1960 #endif
1961 }
1962
1963 void
FileWriteback(File file,off_t offset,off_t nbytes,uint32 wait_event_info)1964 FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
1965 {
1966 int returnCode;
1967
1968 Assert(FileIsValid(file));
1969
1970 DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
1971 file, VfdCache[file].fileName,
1972 (int64) offset, (int64) nbytes));
1973
1974 if (nbytes <= 0)
1975 return;
1976
1977 returnCode = FileAccess(file);
1978 if (returnCode < 0)
1979 return;
1980
1981 pgstat_report_wait_start(wait_event_info);
1982 pg_flush_data(VfdCache[file].fd, offset, nbytes);
1983 pgstat_report_wait_end();
1984 }
1985
1986 int
FileRead(File file,char * buffer,int amount,off_t offset,uint32 wait_event_info)1987 FileRead(File file, char *buffer, int amount, off_t offset,
1988 uint32 wait_event_info)
1989 {
1990 int returnCode;
1991 Vfd *vfdP;
1992
1993 Assert(FileIsValid(file));
1994
1995 DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
1996 file, VfdCache[file].fileName,
1997 (int64) offset,
1998 amount, buffer));
1999
2000 returnCode = FileAccess(file);
2001 if (returnCode < 0)
2002 return returnCode;
2003
2004 vfdP = &VfdCache[file];
2005
2006 retry:
2007 pgstat_report_wait_start(wait_event_info);
2008 returnCode = pg_pread(vfdP->fd, buffer, amount, offset);
2009 pgstat_report_wait_end();
2010
2011 if (returnCode < 0)
2012 {
2013 /*
2014 * Windows may run out of kernel buffers and return "Insufficient
2015 * system resources" error. Wait a bit and retry to solve it.
2016 *
2017 * It is rumored that EINTR is also possible on some Unix filesystems,
2018 * in which case immediate retry is indicated.
2019 */
2020 #ifdef WIN32
2021 DWORD error = GetLastError();
2022
2023 switch (error)
2024 {
2025 case ERROR_NO_SYSTEM_RESOURCES:
2026 pg_usleep(1000L);
2027 errno = EINTR;
2028 break;
2029 default:
2030 _dosmaperr(error);
2031 break;
2032 }
2033 #endif
2034 /* OK to retry if interrupted */
2035 if (errno == EINTR)
2036 goto retry;
2037 }
2038
2039 return returnCode;
2040 }
2041
2042 int
FileWrite(File file,char * buffer,int amount,off_t offset,uint32 wait_event_info)2043 FileWrite(File file, char *buffer, int amount, off_t offset,
2044 uint32 wait_event_info)
2045 {
2046 int returnCode;
2047 Vfd *vfdP;
2048
2049 Assert(FileIsValid(file));
2050
2051 DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
2052 file, VfdCache[file].fileName,
2053 (int64) offset,
2054 amount, buffer));
2055
2056 returnCode = FileAccess(file);
2057 if (returnCode < 0)
2058 return returnCode;
2059
2060 vfdP = &VfdCache[file];
2061
2062 /*
2063 * If enforcing temp_file_limit and it's a temp file, check to see if the
2064 * write would overrun temp_file_limit, and throw error if so. Note: it's
2065 * really a modularity violation to throw error here; we should set errno
2066 * and return -1. However, there's no way to report a suitable error
2067 * message if we do that. All current callers would just throw error
2068 * immediately anyway, so this is safe at present.
2069 */
2070 if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2071 {
2072 off_t past_write = offset + amount;
2073
2074 if (past_write > vfdP->fileSize)
2075 {
2076 uint64 newTotal = temporary_files_size;
2077
2078 newTotal += past_write - vfdP->fileSize;
2079 if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2080 ereport(ERROR,
2081 (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
2082 errmsg("temporary file size exceeds temp_file_limit (%dkB)",
2083 temp_file_limit)));
2084 }
2085 }
2086
2087 retry:
2088 errno = 0;
2089 pgstat_report_wait_start(wait_event_info);
2090 returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset);
2091 pgstat_report_wait_end();
2092
2093 /* if write didn't set errno, assume problem is no disk space */
2094 if (returnCode != amount && errno == 0)
2095 errno = ENOSPC;
2096
2097 if (returnCode >= 0)
2098 {
2099 /*
2100 * Maintain fileSize and temporary_files_size if it's a temp file.
2101 */
2102 if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2103 {
2104 off_t past_write = offset + amount;
2105
2106 if (past_write > vfdP->fileSize)
2107 {
2108 temporary_files_size += past_write - vfdP->fileSize;
2109 vfdP->fileSize = past_write;
2110 }
2111 }
2112 }
2113 else
2114 {
2115 /*
2116 * See comments in FileRead()
2117 */
2118 #ifdef WIN32
2119 DWORD error = GetLastError();
2120
2121 switch (error)
2122 {
2123 case ERROR_NO_SYSTEM_RESOURCES:
2124 pg_usleep(1000L);
2125 errno = EINTR;
2126 break;
2127 default:
2128 _dosmaperr(error);
2129 break;
2130 }
2131 #endif
2132 /* OK to retry if interrupted */
2133 if (errno == EINTR)
2134 goto retry;
2135 }
2136
2137 return returnCode;
2138 }
2139
2140 int
FileSync(File file,uint32 wait_event_info)2141 FileSync(File file, uint32 wait_event_info)
2142 {
2143 int returnCode;
2144
2145 Assert(FileIsValid(file));
2146
2147 DO_DB(elog(LOG, "FileSync: %d (%s)",
2148 file, VfdCache[file].fileName));
2149
2150 returnCode = FileAccess(file);
2151 if (returnCode < 0)
2152 return returnCode;
2153
2154 pgstat_report_wait_start(wait_event_info);
2155 returnCode = pg_fsync(VfdCache[file].fd);
2156 pgstat_report_wait_end();
2157
2158 return returnCode;
2159 }
2160
2161 off_t
FileSize(File file)2162 FileSize(File file)
2163 {
2164 Assert(FileIsValid(file));
2165
2166 DO_DB(elog(LOG, "FileSize %d (%s)",
2167 file, VfdCache[file].fileName));
2168
2169 if (FileIsNotOpen(file))
2170 {
2171 if (FileAccess(file) < 0)
2172 return (off_t) -1;
2173 }
2174
2175 return lseek(VfdCache[file].fd, 0, SEEK_END);
2176 }
2177
2178 int
FileTruncate(File file,off_t offset,uint32 wait_event_info)2179 FileTruncate(File file, off_t offset, uint32 wait_event_info)
2180 {
2181 int returnCode;
2182
2183 Assert(FileIsValid(file));
2184
2185 DO_DB(elog(LOG, "FileTruncate %d (%s)",
2186 file, VfdCache[file].fileName));
2187
2188 returnCode = FileAccess(file);
2189 if (returnCode < 0)
2190 return returnCode;
2191
2192 pgstat_report_wait_start(wait_event_info);
2193 returnCode = ftruncate(VfdCache[file].fd, offset);
2194 pgstat_report_wait_end();
2195
2196 if (returnCode == 0 && VfdCache[file].fileSize > offset)
2197 {
2198 /* adjust our state for truncation of a temp file */
2199 Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2200 temporary_files_size -= VfdCache[file].fileSize - offset;
2201 VfdCache[file].fileSize = offset;
2202 }
2203
2204 return returnCode;
2205 }
2206
2207 /*
2208 * Return the pathname associated with an open file.
2209 *
2210 * The returned string points to an internal buffer, which is valid until
2211 * the file is closed.
2212 */
2213 char *
FilePathName(File file)2214 FilePathName(File file)
2215 {
2216 Assert(FileIsValid(file));
2217
2218 return VfdCache[file].fileName;
2219 }
2220
2221 /*
2222 * Return the raw file descriptor of an opened file.
2223 *
2224 * The returned file descriptor will be valid until the file is closed, but
2225 * there are a lot of things that can make that happen. So the caller should
2226 * be careful not to do much of anything else before it finishes using the
2227 * returned file descriptor.
2228 */
2229 int
FileGetRawDesc(File file)2230 FileGetRawDesc(File file)
2231 {
2232 Assert(FileIsValid(file));
2233 return VfdCache[file].fd;
2234 }
2235
2236 /*
2237 * FileGetRawFlags - returns the file flags on open(2)
2238 */
2239 int
FileGetRawFlags(File file)2240 FileGetRawFlags(File file)
2241 {
2242 Assert(FileIsValid(file));
2243 return VfdCache[file].fileFlags;
2244 }
2245
2246 /*
2247 * FileGetRawMode - returns the mode bitmask passed to open(2)
2248 */
2249 mode_t
FileGetRawMode(File file)2250 FileGetRawMode(File file)
2251 {
2252 Assert(FileIsValid(file));
2253 return VfdCache[file].fileMode;
2254 }
2255
2256 /*
2257 * Make room for another allocatedDescs[] array entry if needed and possible.
2258 * Returns true if an array element is available.
2259 */
2260 static bool
reserveAllocatedDesc(void)2261 reserveAllocatedDesc(void)
2262 {
2263 AllocateDesc *newDescs;
2264 int newMax;
2265
2266 /* Quick out if array already has a free slot. */
2267 if (numAllocatedDescs < maxAllocatedDescs)
2268 return true;
2269
2270 /*
2271 * If the array hasn't yet been created in the current process, initialize
2272 * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
2273 * we will ever need, anyway. We don't want to look at max_safe_fds
2274 * immediately because set_max_safe_fds() may not have run yet.
2275 */
2276 if (allocatedDescs == NULL)
2277 {
2278 newMax = FD_MINFREE / 3;
2279 newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2280 /* Out of memory already? Treat as fatal error. */
2281 if (newDescs == NULL)
2282 ereport(ERROR,
2283 (errcode(ERRCODE_OUT_OF_MEMORY),
2284 errmsg("out of memory")));
2285 allocatedDescs = newDescs;
2286 maxAllocatedDescs = newMax;
2287 return true;
2288 }
2289
2290 /*
2291 * Consider enlarging the array beyond the initial allocation used above.
2292 * By the time this happens, max_safe_fds should be known accurately.
2293 *
2294 * We mustn't let allocated descriptors hog all the available FDs, and in
2295 * practice we'd better leave a reasonable number of FDs for VFD use. So
2296 * set the maximum to max_safe_fds / 3. (This should certainly be at
2297 * least as large as the initial size, FD_MINFREE / 3, so we aren't
2298 * tightening the restriction here.) Recall that "external" FDs are
2299 * allowed to consume another third of max_safe_fds.
2300 */
2301 newMax = max_safe_fds / 3;
2302 if (newMax > maxAllocatedDescs)
2303 {
2304 newDescs = (AllocateDesc *) realloc(allocatedDescs,
2305 newMax * sizeof(AllocateDesc));
2306 /* Treat out-of-memory as a non-fatal error. */
2307 if (newDescs == NULL)
2308 return false;
2309 allocatedDescs = newDescs;
2310 maxAllocatedDescs = newMax;
2311 return true;
2312 }
2313
2314 /* Can't enlarge allocatedDescs[] any more. */
2315 return false;
2316 }
2317
2318 /*
2319 * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2320 * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2321 * necessary to open the file. When done, call FreeFile rather than fclose.
2322 *
2323 * Note that files that will be open for any significant length of time
2324 * should NOT be handled this way, since they cannot share kernel file
2325 * descriptors with other files; there is grave risk of running out of FDs
2326 * if anyone locks down too many FDs. Most callers of this routine are
2327 * simply reading a config file that they will read and close immediately.
2328 *
2329 * fd.c will automatically close all files opened with AllocateFile at
2330 * transaction commit or abort; this prevents FD leakage if a routine
2331 * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2332 *
2333 * Ideally this should be the *only* direct call of fopen() in the backend.
2334 */
2335 FILE *
AllocateFile(const char * name,const char * mode)2336 AllocateFile(const char *name, const char *mode)
2337 {
2338 FILE *file;
2339
2340 DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2341 numAllocatedDescs, name));
2342
2343 /* Can we allocate another non-virtual FD? */
2344 if (!reserveAllocatedDesc())
2345 ereport(ERROR,
2346 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2347 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2348 maxAllocatedDescs, name)));
2349
2350 /* Close excess kernel FDs. */
2351 ReleaseLruFiles();
2352
2353 TryAgain:
2354 if ((file = fopen(name, mode)) != NULL)
2355 {
2356 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2357
2358 desc->kind = AllocateDescFile;
2359 desc->desc.file = file;
2360 desc->create_subid = GetCurrentSubTransactionId();
2361 numAllocatedDescs++;
2362 return desc->desc.file;
2363 }
2364
2365 if (errno == EMFILE || errno == ENFILE)
2366 {
2367 int save_errno = errno;
2368
2369 ereport(LOG,
2370 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2371 errmsg("out of file descriptors: %m; release and retry")));
2372 errno = 0;
2373 if (ReleaseLruFile())
2374 goto TryAgain;
2375 errno = save_errno;
2376 }
2377
2378 return NULL;
2379 }
2380
2381 /*
2382 * Open a file with OpenTransientFilePerm() and pass default file mode for
2383 * the fileMode parameter.
2384 */
2385 int
OpenTransientFile(const char * fileName,int fileFlags)2386 OpenTransientFile(const char *fileName, int fileFlags)
2387 {
2388 return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2389 }
2390
2391 /*
2392 * Like AllocateFile, but returns an unbuffered fd like open(2)
2393 */
2394 int
OpenTransientFilePerm(const char * fileName,int fileFlags,mode_t fileMode)2395 OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
2396 {
2397 int fd;
2398
2399 DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2400 numAllocatedDescs, fileName));
2401
2402 /* Can we allocate another non-virtual FD? */
2403 if (!reserveAllocatedDesc())
2404 ereport(ERROR,
2405 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2406 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2407 maxAllocatedDescs, fileName)));
2408
2409 /* Close excess kernel FDs. */
2410 ReleaseLruFiles();
2411
2412 fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2413
2414 if (fd >= 0)
2415 {
2416 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2417
2418 desc->kind = AllocateDescRawFD;
2419 desc->desc.fd = fd;
2420 desc->create_subid = GetCurrentSubTransactionId();
2421 numAllocatedDescs++;
2422
2423 return fd;
2424 }
2425
2426 return -1; /* failure */
2427 }
2428
2429 /*
2430 * Routines that want to initiate a pipe stream should use OpenPipeStream
2431 * rather than plain popen(). This lets fd.c deal with freeing FDs if
2432 * necessary. When done, call ClosePipeStream rather than pclose.
2433 *
2434 * This function also ensures that the popen'd program is run with default
2435 * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2436 * uses. This ensures desirable response to, eg, closing a read pipe early.
2437 */
2438 FILE *
OpenPipeStream(const char * command,const char * mode)2439 OpenPipeStream(const char *command, const char *mode)
2440 {
2441 FILE *file;
2442 int save_errno;
2443
2444 DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2445 numAllocatedDescs, command));
2446
2447 /* Can we allocate another non-virtual FD? */
2448 if (!reserveAllocatedDesc())
2449 ereport(ERROR,
2450 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2451 errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2452 maxAllocatedDescs, command)));
2453
2454 /* Close excess kernel FDs. */
2455 ReleaseLruFiles();
2456
2457 TryAgain:
2458 fflush(stdout);
2459 fflush(stderr);
2460 pqsignal(SIGPIPE, SIG_DFL);
2461 errno = 0;
2462 file = popen(command, mode);
2463 save_errno = errno;
2464 pqsignal(SIGPIPE, SIG_IGN);
2465 errno = save_errno;
2466 if (file != NULL)
2467 {
2468 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2469
2470 desc->kind = AllocateDescPipe;
2471 desc->desc.file = file;
2472 desc->create_subid = GetCurrentSubTransactionId();
2473 numAllocatedDescs++;
2474 return desc->desc.file;
2475 }
2476
2477 if (errno == EMFILE || errno == ENFILE)
2478 {
2479 ereport(LOG,
2480 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2481 errmsg("out of file descriptors: %m; release and retry")));
2482 if (ReleaseLruFile())
2483 goto TryAgain;
2484 errno = save_errno;
2485 }
2486
2487 return NULL;
2488 }
2489
2490 /*
2491 * Free an AllocateDesc of any type.
2492 *
2493 * The argument *must* point into the allocatedDescs[] array.
2494 */
2495 static int
FreeDesc(AllocateDesc * desc)2496 FreeDesc(AllocateDesc *desc)
2497 {
2498 int result;
2499
2500 /* Close the underlying object */
2501 switch (desc->kind)
2502 {
2503 case AllocateDescFile:
2504 result = fclose(desc->desc.file);
2505 break;
2506 case AllocateDescPipe:
2507 result = pclose(desc->desc.file);
2508 break;
2509 case AllocateDescDir:
2510 result = closedir(desc->desc.dir);
2511 break;
2512 case AllocateDescRawFD:
2513 result = close(desc->desc.fd);
2514 break;
2515 default:
2516 elog(ERROR, "AllocateDesc kind not recognized");
2517 result = 0; /* keep compiler quiet */
2518 break;
2519 }
2520
2521 /* Compact storage in the allocatedDescs array */
2522 numAllocatedDescs--;
2523 *desc = allocatedDescs[numAllocatedDescs];
2524
2525 return result;
2526 }
2527
2528 /*
2529 * Close a file returned by AllocateFile.
2530 *
2531 * Note we do not check fclose's return value --- it is up to the caller
2532 * to handle close errors.
2533 */
2534 int
FreeFile(FILE * file)2535 FreeFile(FILE *file)
2536 {
2537 int i;
2538
2539 DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2540
2541 /* Remove file from list of allocated files, if it's present */
2542 for (i = numAllocatedDescs; --i >= 0;)
2543 {
2544 AllocateDesc *desc = &allocatedDescs[i];
2545
2546 if (desc->kind == AllocateDescFile && desc->desc.file == file)
2547 return FreeDesc(desc);
2548 }
2549
2550 /* Only get here if someone passes us a file not in allocatedDescs */
2551 elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2552
2553 return fclose(file);
2554 }
2555
2556 /*
2557 * Close a file returned by OpenTransientFile.
2558 *
2559 * Note we do not check close's return value --- it is up to the caller
2560 * to handle close errors.
2561 */
2562 int
CloseTransientFile(int fd)2563 CloseTransientFile(int fd)
2564 {
2565 int i;
2566
2567 DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2568
2569 /* Remove fd from list of allocated files, if it's present */
2570 for (i = numAllocatedDescs; --i >= 0;)
2571 {
2572 AllocateDesc *desc = &allocatedDescs[i];
2573
2574 if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2575 return FreeDesc(desc);
2576 }
2577
2578 /* Only get here if someone passes us a file not in allocatedDescs */
2579 elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2580
2581 return close(fd);
2582 }
2583
2584 /*
2585 * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2586 * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2587 * necessary to open the directory, and with closing it after an elog.
2588 * When done, call FreeDir rather than closedir.
2589 *
2590 * Returns NULL, with errno set, on failure. Note that failure detection
2591 * is commonly left to the following call of ReadDir or ReadDirExtended;
2592 * see the comments for ReadDir.
2593 *
2594 * Ideally this should be the *only* direct call of opendir() in the backend.
2595 */
2596 DIR *
AllocateDir(const char * dirname)2597 AllocateDir(const char *dirname)
2598 {
2599 DIR *dir;
2600
2601 DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2602 numAllocatedDescs, dirname));
2603
2604 /* Can we allocate another non-virtual FD? */
2605 if (!reserveAllocatedDesc())
2606 ereport(ERROR,
2607 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2608 errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2609 maxAllocatedDescs, dirname)));
2610
2611 /* Close excess kernel FDs. */
2612 ReleaseLruFiles();
2613
2614 TryAgain:
2615 if ((dir = opendir(dirname)) != NULL)
2616 {
2617 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2618
2619 desc->kind = AllocateDescDir;
2620 desc->desc.dir = dir;
2621 desc->create_subid = GetCurrentSubTransactionId();
2622 numAllocatedDescs++;
2623 return desc->desc.dir;
2624 }
2625
2626 if (errno == EMFILE || errno == ENFILE)
2627 {
2628 int save_errno = errno;
2629
2630 ereport(LOG,
2631 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2632 errmsg("out of file descriptors: %m; release and retry")));
2633 errno = 0;
2634 if (ReleaseLruFile())
2635 goto TryAgain;
2636 errno = save_errno;
2637 }
2638
2639 return NULL;
2640 }
2641
2642 /*
2643 * Read a directory opened with AllocateDir, ereport'ing any error.
2644 *
2645 * This is easier to use than raw readdir() since it takes care of some
2646 * otherwise rather tedious and error-prone manipulation of errno. Also,
2647 * if you are happy with a generic error message for AllocateDir failure,
2648 * you can just do
2649 *
2650 * dir = AllocateDir(path);
2651 * while ((dirent = ReadDir(dir, path)) != NULL)
2652 * process dirent;
2653 * FreeDir(dir);
2654 *
2655 * since a NULL dir parameter is taken as indicating AllocateDir failed.
2656 * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2657 * use this shortcut.)
2658 *
2659 * The pathname passed to AllocateDir must be passed to this routine too,
2660 * but it is only used for error reporting.
2661 */
2662 struct dirent *
ReadDir(DIR * dir,const char * dirname)2663 ReadDir(DIR *dir, const char *dirname)
2664 {
2665 return ReadDirExtended(dir, dirname, ERROR);
2666 }
2667
2668 /*
2669 * Alternate version of ReadDir that allows caller to specify the elevel
2670 * for any error report (whether it's reporting an initial failure of
2671 * AllocateDir or a subsequent directory read failure).
2672 *
2673 * If elevel < ERROR, returns NULL after any error. With the normal coding
2674 * pattern, this will result in falling out of the loop immediately as
2675 * though the directory contained no (more) entries.
2676 */
2677 struct dirent *
ReadDirExtended(DIR * dir,const char * dirname,int elevel)2678 ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2679 {
2680 struct dirent *dent;
2681
2682 /* Give a generic message for AllocateDir failure, if caller didn't */
2683 if (dir == NULL)
2684 {
2685 ereport(elevel,
2686 (errcode_for_file_access(),
2687 errmsg("could not open directory \"%s\": %m",
2688 dirname)));
2689 return NULL;
2690 }
2691
2692 errno = 0;
2693 if ((dent = readdir(dir)) != NULL)
2694 return dent;
2695
2696 if (errno)
2697 ereport(elevel,
2698 (errcode_for_file_access(),
2699 errmsg("could not read directory \"%s\": %m",
2700 dirname)));
2701 return NULL;
2702 }
2703
2704 /*
2705 * Close a directory opened with AllocateDir.
2706 *
2707 * Returns closedir's return value (with errno set if it's not 0).
2708 * Note we do not check the return value --- it is up to the caller
2709 * to handle close errors if wanted.
2710 *
2711 * Does nothing if dir == NULL; we assume that directory open failure was
2712 * already reported if desired.
2713 */
2714 int
FreeDir(DIR * dir)2715 FreeDir(DIR *dir)
2716 {
2717 int i;
2718
2719 /* Nothing to do if AllocateDir failed */
2720 if (dir == NULL)
2721 return 0;
2722
2723 DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2724
2725 /* Remove dir from list of allocated dirs, if it's present */
2726 for (i = numAllocatedDescs; --i >= 0;)
2727 {
2728 AllocateDesc *desc = &allocatedDescs[i];
2729
2730 if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2731 return FreeDesc(desc);
2732 }
2733
2734 /* Only get here if someone passes us a dir not in allocatedDescs */
2735 elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
2736
2737 return closedir(dir);
2738 }
2739
2740
2741 /*
2742 * Close a pipe stream returned by OpenPipeStream.
2743 */
2744 int
ClosePipeStream(FILE * file)2745 ClosePipeStream(FILE *file)
2746 {
2747 int i;
2748
2749 DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
2750
2751 /* Remove file from list of allocated files, if it's present */
2752 for (i = numAllocatedDescs; --i >= 0;)
2753 {
2754 AllocateDesc *desc = &allocatedDescs[i];
2755
2756 if (desc->kind == AllocateDescPipe && desc->desc.file == file)
2757 return FreeDesc(desc);
2758 }
2759
2760 /* Only get here if someone passes us a file not in allocatedDescs */
2761 elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
2762
2763 return pclose(file);
2764 }
2765
2766 /*
2767 * closeAllVfds
2768 *
2769 * Force all VFDs into the physically-closed state, so that the fewest
2770 * possible number of kernel file descriptors are in use. There is no
2771 * change in the logical state of the VFDs.
2772 */
2773 void
closeAllVfds(void)2774 closeAllVfds(void)
2775 {
2776 Index i;
2777
2778 if (SizeVfdCache > 0)
2779 {
2780 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2781 for (i = 1; i < SizeVfdCache; i++)
2782 {
2783 if (!FileIsNotOpen(i))
2784 LruDelete(i);
2785 }
2786 }
2787 }
2788
2789
2790 /*
2791 * SetTempTablespaces
2792 *
2793 * Define a list (actually an array) of OIDs of tablespaces to use for
2794 * temporary files. This list will be used until end of transaction,
2795 * unless this function is called again before then. It is caller's
2796 * responsibility that the passed-in array has adequate lifespan (typically
2797 * it'd be allocated in TopTransactionContext).
2798 *
2799 * Some entries of the array may be InvalidOid, indicating that the current
2800 * database's default tablespace should be used.
2801 */
2802 void
SetTempTablespaces(Oid * tableSpaces,int numSpaces)2803 SetTempTablespaces(Oid *tableSpaces, int numSpaces)
2804 {
2805 Assert(numSpaces >= 0);
2806 tempTableSpaces = tableSpaces;
2807 numTempTableSpaces = numSpaces;
2808
2809 /*
2810 * Select a random starting point in the list. This is to minimize
2811 * conflicts between backends that are most likely sharing the same list
2812 * of temp tablespaces. Note that if we create multiple temp files in the
2813 * same transaction, we'll advance circularly through the list --- this
2814 * ensures that large temporary sort files are nicely spread across all
2815 * available tablespaces.
2816 */
2817 if (numSpaces > 1)
2818 nextTempTableSpace = random() % numSpaces;
2819 else
2820 nextTempTableSpace = 0;
2821 }
2822
2823 /*
2824 * TempTablespacesAreSet
2825 *
2826 * Returns true if SetTempTablespaces has been called in current transaction.
2827 * (This is just so that tablespaces.c doesn't need its own per-transaction
2828 * state.)
2829 */
2830 bool
TempTablespacesAreSet(void)2831 TempTablespacesAreSet(void)
2832 {
2833 return (numTempTableSpaces >= 0);
2834 }
2835
2836 /*
2837 * GetTempTablespaces
2838 *
2839 * Populate an array with the OIDs of the tablespaces that should be used for
2840 * temporary files. (Some entries may be InvalidOid, indicating that the
2841 * current database's default tablespace should be used.) At most numSpaces
2842 * entries will be filled.
2843 * Returns the number of OIDs that were copied into the output array.
2844 */
2845 int
GetTempTablespaces(Oid * tableSpaces,int numSpaces)2846 GetTempTablespaces(Oid *tableSpaces, int numSpaces)
2847 {
2848 int i;
2849
2850 Assert(TempTablespacesAreSet());
2851 for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
2852 tableSpaces[i] = tempTableSpaces[i];
2853
2854 return i;
2855 }
2856
2857 /*
2858 * GetNextTempTableSpace
2859 *
2860 * Select the next temp tablespace to use. A result of InvalidOid means
2861 * to use the current database's default tablespace.
2862 */
2863 Oid
GetNextTempTableSpace(void)2864 GetNextTempTableSpace(void)
2865 {
2866 if (numTempTableSpaces > 0)
2867 {
2868 /* Advance nextTempTableSpace counter with wraparound */
2869 if (++nextTempTableSpace >= numTempTableSpaces)
2870 nextTempTableSpace = 0;
2871 return tempTableSpaces[nextTempTableSpace];
2872 }
2873 return InvalidOid;
2874 }
2875
2876
2877 /*
2878 * AtEOSubXact_Files
2879 *
2880 * Take care of subtransaction commit/abort. At abort, we close temp files
2881 * that the subtransaction may have opened. At commit, we reassign the
2882 * files that were opened to the parent subtransaction.
2883 */
2884 void
AtEOSubXact_Files(bool isCommit,SubTransactionId mySubid,SubTransactionId parentSubid)2885 AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
2886 SubTransactionId parentSubid)
2887 {
2888 Index i;
2889
2890 for (i = 0; i < numAllocatedDescs; i++)
2891 {
2892 if (allocatedDescs[i].create_subid == mySubid)
2893 {
2894 if (isCommit)
2895 allocatedDescs[i].create_subid = parentSubid;
2896 else
2897 {
2898 /* have to recheck the item after FreeDesc (ugly) */
2899 FreeDesc(&allocatedDescs[i--]);
2900 }
2901 }
2902 }
2903 }
2904
2905 /*
2906 * AtEOXact_Files
2907 *
2908 * This routine is called during transaction commit or abort. All still-open
2909 * per-transaction temporary file VFDs are closed, which also causes the
2910 * underlying files to be deleted (although they should've been closed already
2911 * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
2912 * closed. We also forget any transaction-local temp tablespace list.
2913 *
2914 * The isCommit flag is used only to decide whether to emit warnings about
2915 * unclosed files.
2916 */
2917 void
AtEOXact_Files(bool isCommit)2918 AtEOXact_Files(bool isCommit)
2919 {
2920 CleanupTempFiles(isCommit, false);
2921 tempTableSpaces = NULL;
2922 numTempTableSpaces = -1;
2923 }
2924
2925 /*
2926 * AtProcExit_Files
2927 *
2928 * on_proc_exit hook to clean up temp files during backend shutdown.
2929 * Here, we want to clean up *all* temp files including interXact ones.
2930 */
2931 static void
AtProcExit_Files(int code,Datum arg)2932 AtProcExit_Files(int code, Datum arg)
2933 {
2934 CleanupTempFiles(false, true);
2935 }
2936
2937 /*
2938 * Close temporary files and delete their underlying files.
2939 *
2940 * isCommit: if true, this is normal transaction commit, and we don't
2941 * expect any remaining files; warn if there are some.
2942 *
2943 * isProcExit: if true, this is being called as the backend process is
2944 * exiting. If that's the case, we should remove all temporary files; if
2945 * that's not the case, we are being called for transaction commit/abort
2946 * and should only remove transaction-local temp files. In either case,
2947 * also clean up "allocated" stdio files, dirs and fds.
2948 */
2949 static void
CleanupTempFiles(bool isCommit,bool isProcExit)2950 CleanupTempFiles(bool isCommit, bool isProcExit)
2951 {
2952 Index i;
2953
2954 /*
2955 * Careful here: at proc_exit we need extra cleanup, not just
2956 * xact_temporary files.
2957 */
2958 if (isProcExit || have_xact_temporary_files)
2959 {
2960 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2961 for (i = 1; i < SizeVfdCache; i++)
2962 {
2963 unsigned short fdstate = VfdCache[i].fdstate;
2964
2965 if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
2966 VfdCache[i].fileName != NULL)
2967 {
2968 /*
2969 * If we're in the process of exiting a backend process, close
2970 * all temporary files. Otherwise, only close temporary files
2971 * local to the current transaction. They should be closed by
2972 * the ResourceOwner mechanism already, so this is just a
2973 * debugging cross-check.
2974 */
2975 if (isProcExit)
2976 FileClose(i);
2977 else if (fdstate & FD_CLOSE_AT_EOXACT)
2978 {
2979 elog(WARNING,
2980 "temporary file %s not closed at end-of-transaction",
2981 VfdCache[i].fileName);
2982 FileClose(i);
2983 }
2984 }
2985 }
2986
2987 have_xact_temporary_files = false;
2988 }
2989
2990 /* Complain if any allocated files remain open at commit. */
2991 if (isCommit && numAllocatedDescs > 0)
2992 elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
2993 numAllocatedDescs);
2994
2995 /* Clean up "allocated" stdio files, dirs and fds. */
2996 while (numAllocatedDescs > 0)
2997 FreeDesc(&allocatedDescs[0]);
2998 }
2999
3000
3001 /*
3002 * Remove temporary and temporary relation files left over from a prior
3003 * postmaster session
3004 *
3005 * This should be called during postmaster startup. It will forcibly
3006 * remove any leftover files created by OpenTemporaryFile and any leftover
3007 * temporary relation files created by mdcreate.
3008 *
3009 * NOTE: we could, but don't, call this during a post-backend-crash restart
3010 * cycle. The argument for not doing it is that someone might want to examine
3011 * the temp files for debugging purposes. This does however mean that
3012 * OpenTemporaryFile had better allow for collision with an existing temp
3013 * file name.
3014 *
3015 * NOTE: this function and its subroutines generally report syscall failures
3016 * with ereport(LOG) and keep going. Removing temp files is not so critical
3017 * that we should fail to start the database when we can't do it.
3018 */
3019 void
RemovePgTempFiles(void)3020 RemovePgTempFiles(void)
3021 {
3022 char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
3023 DIR *spc_dir;
3024 struct dirent *spc_de;
3025
3026 /*
3027 * First process temp files in pg_default ($PGDATA/base)
3028 */
3029 snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3030 RemovePgTempFilesInDir(temp_path, true, false);
3031 RemovePgTempRelationFiles("base");
3032
3033 /*
3034 * Cycle through temp directories for all non-default tablespaces.
3035 */
3036 spc_dir = AllocateDir("pg_tblspc");
3037
3038 while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL)
3039 {
3040 if (strcmp(spc_de->d_name, ".") == 0 ||
3041 strcmp(spc_de->d_name, "..") == 0)
3042 continue;
3043
3044 snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
3045 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
3046 RemovePgTempFilesInDir(temp_path, true, false);
3047
3048 snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
3049 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
3050 RemovePgTempRelationFiles(temp_path);
3051 }
3052
3053 FreeDir(spc_dir);
3054
3055 /*
3056 * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3057 * DataDir as well. However, that is *not* cleaned here because doing so
3058 * would create a race condition. It's done separately, earlier in
3059 * postmaster startup.
3060 */
3061 }
3062
3063 /*
3064 * Process one pgsql_tmp directory for RemovePgTempFiles.
3065 *
3066 * If missing_ok is true, it's all right for the named directory to not exist.
3067 * Any other problem results in a LOG message. (missing_ok should be true at
3068 * the top level, since pgsql_tmp directories are not created until needed.)
3069 *
3070 * At the top level, this should be called with unlink_all = false, so that
3071 * only files matching the temporary name prefix will be unlinked. When
3072 * recursing it will be called with unlink_all = true to unlink everything
3073 * under a top-level temporary directory.
3074 *
3075 * (These two flags could be replaced by one, but it seems clearer to keep
3076 * them separate.)
3077 */
3078 void
RemovePgTempFilesInDir(const char * tmpdirname,bool missing_ok,bool unlink_all)3079 RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3080 {
3081 DIR *temp_dir;
3082 struct dirent *temp_de;
3083 char rm_path[MAXPGPATH * 2];
3084
3085 temp_dir = AllocateDir(tmpdirname);
3086
3087 if (temp_dir == NULL && errno == ENOENT && missing_ok)
3088 return;
3089
3090 while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
3091 {
3092 if (strcmp(temp_de->d_name, ".") == 0 ||
3093 strcmp(temp_de->d_name, "..") == 0)
3094 continue;
3095
3096 snprintf(rm_path, sizeof(rm_path), "%s/%s",
3097 tmpdirname, temp_de->d_name);
3098
3099 if (unlink_all ||
3100 strncmp(temp_de->d_name,
3101 PG_TEMP_FILE_PREFIX,
3102 strlen(PG_TEMP_FILE_PREFIX)) == 0)
3103 {
3104 struct stat statbuf;
3105
3106 if (lstat(rm_path, &statbuf) < 0)
3107 {
3108 ereport(LOG,
3109 (errcode_for_file_access(),
3110 errmsg("could not stat file \"%s\": %m", rm_path)));
3111 continue;
3112 }
3113
3114 if (S_ISDIR(statbuf.st_mode))
3115 {
3116 /* recursively remove contents, then directory itself */
3117 RemovePgTempFilesInDir(rm_path, false, true);
3118
3119 if (rmdir(rm_path) < 0)
3120 ereport(LOG,
3121 (errcode_for_file_access(),
3122 errmsg("could not remove directory \"%s\": %m",
3123 rm_path)));
3124 }
3125 else
3126 {
3127 if (unlink(rm_path) < 0)
3128 ereport(LOG,
3129 (errcode_for_file_access(),
3130 errmsg("could not remove file \"%s\": %m",
3131 rm_path)));
3132 }
3133 }
3134 else
3135 ereport(LOG,
3136 (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3137 rm_path)));
3138 }
3139
3140 FreeDir(temp_dir);
3141 }
3142
3143 /* Process one tablespace directory, look for per-DB subdirectories */
3144 static void
RemovePgTempRelationFiles(const char * tsdirname)3145 RemovePgTempRelationFiles(const char *tsdirname)
3146 {
3147 DIR *ts_dir;
3148 struct dirent *de;
3149 char dbspace_path[MAXPGPATH * 2];
3150
3151 ts_dir = AllocateDir(tsdirname);
3152
3153 while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3154 {
3155 /*
3156 * We're only interested in the per-database directories, which have
3157 * numeric names. Note that this code will also (properly) ignore "."
3158 * and "..".
3159 */
3160 if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3161 continue;
3162
3163 snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3164 tsdirname, de->d_name);
3165 RemovePgTempRelationFilesInDbspace(dbspace_path);
3166 }
3167
3168 FreeDir(ts_dir);
3169 }
3170
3171 /* Process one per-dbspace directory for RemovePgTempRelationFiles */
3172 static void
RemovePgTempRelationFilesInDbspace(const char * dbspacedirname)3173 RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3174 {
3175 DIR *dbspace_dir;
3176 struct dirent *de;
3177 char rm_path[MAXPGPATH * 2];
3178
3179 dbspace_dir = AllocateDir(dbspacedirname);
3180
3181 while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3182 {
3183 if (!looks_like_temp_rel_name(de->d_name))
3184 continue;
3185
3186 snprintf(rm_path, sizeof(rm_path), "%s/%s",
3187 dbspacedirname, de->d_name);
3188
3189 if (unlink(rm_path) < 0)
3190 ereport(LOG,
3191 (errcode_for_file_access(),
3192 errmsg("could not remove file \"%s\": %m",
3193 rm_path)));
3194 }
3195
3196 FreeDir(dbspace_dir);
3197 }
3198
/*
 * Does "name" have the form of a temporary relation file name?
 *
 * Accepted shapes are t<digits>_<digits>, optionally followed by _<forkname>,
 * optionally followed by .<digits> (a segment number).  Nothing may follow
 * after that.
 */
bool
looks_like_temp_rel_name(const char *name)
{
	const char *cur = name;
	const char *mark;

	/* The name has to begin with "t". */
	if (*cur != 't')
		return false;
	cur++;

	/* Next: at least one digit, terminated by an underscore. */
	mark = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == mark || *cur != '_')
		return false;
	cur++;

	/* Next: a second non-empty run of digits. */
	mark = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == mark)
		return false;

	/* Optional "_forkname" part. */
	if (*cur == '_')
	{
		int			forklen = forkname_chars(cur + 1, NULL);

		if (forklen <= 0)
			return false;
		/* step over the underscore as well as the fork name */
		cur += forklen + 1;
	}

	/* Optional ".segment" part: a dot followed by at least one digit. */
	if (*cur == '.')
	{
		const char *seg = cur + 1;

		while (isdigit((unsigned char) *seg))
			seg++;
		if (seg == cur + 1)
			return false;
		cur = seg;
	}

	/* Whatever we matched must have consumed the entire name. */
	return *cur == '\0';
}
3247
3248
3249 /*
3250 * Issue fsync recursively on PGDATA and all its contents.
3251 *
3252 * We fsync regular files and directories wherever they are, but we
3253 * follow symlinks only for pg_wal and immediately under pg_tblspc.
3254 * Other symlinks are presumed to point at files we're not responsible
3255 * for fsyncing, and might not have privileges to write at all.
3256 *
3257 * Errors are logged but not considered fatal; that's because this is used
3258 * only during database startup, to deal with the possibility that there are
3259 * issued-but-unsynced writes pending against the data directory. We want to
3260 * ensure that such writes reach disk before anything that's done in the new
3261 * run. However, aborting on error would result in failure to start for
3262 * harmless cases such as read-only files in the data directory, and that's
3263 * not good either.
3264 *
3265 * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3266 * rewriting all changes again during recovery.
3267 *
3268 * Note we assume we're chdir'd into PGDATA to begin with.
3269 */
3270 void
SyncDataDirectory(void)3271 SyncDataDirectory(void)
3272 {
3273 bool xlog_is_symlink;
3274
3275 /* We can skip this whole thing if fsync is disabled. */
3276 if (!enableFsync)
3277 return;
3278
3279 /*
3280 * If pg_wal is a symlink, we'll need to recurse into it separately,
3281 * because the first walkdir below will ignore it.
3282 */
3283 xlog_is_symlink = false;
3284
3285 #ifndef WIN32
3286 {
3287 struct stat st;
3288
3289 if (lstat("pg_wal", &st) < 0)
3290 ereport(LOG,
3291 (errcode_for_file_access(),
3292 errmsg("could not stat file \"%s\": %m",
3293 "pg_wal")));
3294 else if (S_ISLNK(st.st_mode))
3295 xlog_is_symlink = true;
3296 }
3297 #else
3298 if (pgwin32_is_junction("pg_wal"))
3299 xlog_is_symlink = true;
3300 #endif
3301
3302 /*
3303 * If possible, hint to the kernel that we're soon going to fsync the data
3304 * directory and its contents. Errors in this step are even less
3305 * interesting than normal, so log them only at DEBUG1.
3306 */
3307 #ifdef PG_FLUSH_DATA_WORKS
3308 walkdir(".", pre_sync_fname, false, DEBUG1);
3309 if (xlog_is_symlink)
3310 walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3311 walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
3312 #endif
3313
3314 /*
3315 * Now we do the fsync()s in the same order.
3316 *
3317 * The main call ignores symlinks, so in addition to specially processing
3318 * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3319 * process_symlinks = true. Note that if there are any plain directories
3320 * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3321 * so we don't worry about optimizing it.
3322 */
3323 walkdir(".", datadir_fsync_fname, false, LOG);
3324 if (xlog_is_symlink)
3325 walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3326 walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
3327 }
3328
3329 /*
3330 * walkdir: recursively walk a directory, applying the action to each
3331 * regular file and directory (including the named directory itself).
3332 *
3333 * If process_symlinks is true, the action and recursion are also applied
3334 * to regular files and directories that are pointed to by symlinks in the
3335 * given directory; otherwise symlinks are ignored. Symlinks are always
3336 * ignored in subdirectories, ie we intentionally don't pass down the
3337 * process_symlinks flag to recursive calls.
3338 *
3339 * Errors are reported at level elevel, which might be ERROR or less.
3340 *
3341 * See also walkdir in file_utils.c, which is a frontend version of this
3342 * logic.
3343 */
3344 static void
walkdir(const char * path,void (* action)(const char * fname,bool isdir,int elevel),bool process_symlinks,int elevel)3345 walkdir(const char *path,
3346 void (*action) (const char *fname, bool isdir, int elevel),
3347 bool process_symlinks,
3348 int elevel)
3349 {
3350 DIR *dir;
3351 struct dirent *de;
3352
3353 dir = AllocateDir(path);
3354
3355 while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3356 {
3357 char subpath[MAXPGPATH * 2];
3358 struct stat fst;
3359 int sret;
3360
3361 CHECK_FOR_INTERRUPTS();
3362
3363 if (strcmp(de->d_name, ".") == 0 ||
3364 strcmp(de->d_name, "..") == 0)
3365 continue;
3366
3367 snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3368
3369 if (process_symlinks)
3370 sret = stat(subpath, &fst);
3371 else
3372 sret = lstat(subpath, &fst);
3373
3374 if (sret < 0)
3375 {
3376 ereport(elevel,
3377 (errcode_for_file_access(),
3378 errmsg("could not stat file \"%s\": %m", subpath)));
3379 continue;
3380 }
3381
3382 if (S_ISREG(fst.st_mode))
3383 (*action) (subpath, false, elevel);
3384 else if (S_ISDIR(fst.st_mode))
3385 walkdir(subpath, action, false, elevel);
3386 }
3387
3388 FreeDir(dir); /* we ignore any error here */
3389
3390 /*
3391 * It's important to fsync the destination directory itself as individual
3392 * file fsyncs don't guarantee that the directory entry for the file is
3393 * synced. However, skip this if AllocateDir failed; the action function
3394 * might not be robust against that.
3395 */
3396 if (dir)
3397 (*action) (path, true, elevel);
3398 }
3399
3400
/*
 * Give the OS advance notice that this file is about to be fsync()'d.
 *
 * Directories are skipped.  A failure to open the file because it is
 * unreadable (EACCES) is ignored; other errors are logged at the level
 * given by the caller.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Flushing a directory would likely just fail, so don't bother. */
	if (isdir)
		return;

	fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);

	if (fd < 0)
	{
		/* stay quiet about files we are not allowed to read */
		if (errno != EACCES)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * This is merely a hint, so it is fine that pg_flush_data() ignores
	 * errors.
	 */
	pg_flush_data(fd, 0, 0);

	if (CloseTransientFile(fd) != 0)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", fname)));
}

#endif							/* PG_FLUSH_DATA_WORKS */
3443
/*
 * walkdir action used by SyncDataDirectory: fsync one file or directory.
 *
 * Errors about unreadable files should be ignored silently, so
 * fsync_fname_ext() is called with ignore_perm = true.  Its result is not
 * examined.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	(void) fsync_fname_ext(fname, isdir, true, elevel);
}
3453
/*
 * Action function with the walkdir callback signature: remove one file or
 * directory, not complaining if a directory is already gone.
 *
 * Directories are removed with rmdir(), and failures other than ENOENT are
 * reported at elevel.  Files are handed to PathNameDeleteTemporaryFile().
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (!isdir)
	{
		/* Go through PathNameDeleteTemporaryFile so the file size is reported */
		PathNameDeleteTemporaryFile(fname, false);
		return;
	}

	if (rmdir(fname) != 0 && errno != ENOENT)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove directory \"%s\": %m", fname)));
}
3470
3471 /*
3472 * fsync_fname_ext -- Try to fsync a file or directory
3473 *
3474 * If ignore_perm is true, ignore errors upon trying to open unreadable
3475 * files. Logs other errors at a caller-specified level.
3476 *
3477 * Returns 0 if the operation succeeded, -1 otherwise.
3478 */
3479 int
fsync_fname_ext(const char * fname,bool isdir,bool ignore_perm,int elevel)3480 fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3481 {
3482 int fd;
3483 int flags;
3484 int returncode;
3485
3486 /*
3487 * Some OSs require directories to be opened read-only whereas other
3488 * systems don't allow us to fsync files opened read-only; so we need both
3489 * cases here. Using O_RDWR will cause us to fail to fsync files that are
3490 * not writable by our userid, but we assume that's OK.
3491 */
3492 flags = PG_BINARY;
3493 if (!isdir)
3494 flags |= O_RDWR;
3495 else
3496 flags |= O_RDONLY;
3497
3498 fd = OpenTransientFile(fname, flags);
3499
3500 /*
3501 * Some OSs don't allow us to open directories at all (Windows returns
3502 * EACCES), just ignore the error in that case. If desired also silently
3503 * ignoring errors about unreadable files. Log others.
3504 */
3505 if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3506 return 0;
3507 else if (fd < 0 && ignore_perm && errno == EACCES)
3508 return 0;
3509 else if (fd < 0)
3510 {
3511 ereport(elevel,
3512 (errcode_for_file_access(),
3513 errmsg("could not open file \"%s\": %m", fname)));
3514 return -1;
3515 }
3516
3517 returncode = pg_fsync(fd);
3518
3519 /*
3520 * Some OSes don't allow us to fsync directories at all, so we can ignore
3521 * those errors. Anything else needs to be logged.
3522 */
3523 if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3524 {
3525 int save_errno;
3526
3527 /* close file upon error, might not be in transaction context */
3528 save_errno = errno;
3529 (void) CloseTransientFile(fd);
3530 errno = save_errno;
3531
3532 ereport(elevel,
3533 (errcode_for_file_access(),
3534 errmsg("could not fsync file \"%s\": %m", fname)));
3535 return -1;
3536 }
3537
3538 if (CloseTransientFile(fd) != 0)
3539 {
3540 ereport(elevel,
3541 (errcode_for_file_access(),
3542 errmsg("could not close file \"%s\": %m", fname)));
3543 return -1;
3544 }
3545
3546 return 0;
3547 }
3548
3549 /*
3550 * fsync_parent_path -- fsync the parent path of a file or directory
3551 *
3552 * This is aimed at making file operations persistent on disk in case of
3553 * an OS crash or power failure.
3554 */
3555 static int
fsync_parent_path(const char * fname,int elevel)3556 fsync_parent_path(const char *fname, int elevel)
3557 {
3558 char parentpath[MAXPGPATH];
3559
3560 strlcpy(parentpath, fname, MAXPGPATH);
3561 get_parent_directory(parentpath);
3562
3563 /*
3564 * get_parent_directory() returns an empty string if the input argument is
3565 * just a file name (see comments in path.c), so handle that as being the
3566 * current directory.
3567 */
3568 if (strlen(parentpath) == 0)
3569 strlcpy(parentpath, ".", MAXPGPATH);
3570
3571 if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3572 return -1;
3573
3574 return 0;
3575 }
3576
3577 /*
3578 * Create a PostgreSQL data sub-directory
3579 *
3580 * The data directory itself, and most of its sub-directories, are created at
3581 * initdb time, but we do have some occasions when we create directories in
3582 * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3583 * make sure that those directories are created consistently. Today, that means
3584 * making sure that the created directory has the correct permissions, which is
3585 * what pg_dir_create_mode tracks for us.
3586 *
3587 * Note that we also set the umask() based on what we understand the correct
3588 * permissions to be (see file_perm.c).
3589 *
3590 * For permissions other than the default, mkdir() can be used directly, but
3591 * be sure to consider carefully such cases -- a sub-directory with incorrect
3592 * permissions in a PostgreSQL data directory could cause backups and other
3593 * processes to fail.
3594 */
3595 int
MakePGDirectory(const char * directoryName)3596 MakePGDirectory(const char *directoryName)
3597 {
3598 return mkdir(directoryName, pg_dir_create_mode);
3599 }
3600
3601 /*
3602 * Return the passed-in error level, or PANIC if data_sync_retry is off.
3603 *
3604 * Failure to fsync any data file is cause for immediate panic, unless
3605 * data_sync_retry is enabled. Data may have been written to the operating
3606 * system and removed from our buffer pool already, and if we are running on
3607 * an operating system that forgets dirty data on write-back failure, there
3608 * may be only one copy of the data remaining: in the WAL. A later attempt to
3609 * fsync again might falsely report success. Therefore we must not allow any
3610 * further checkpoints to be attempted. data_sync_retry can in theory be
3611 * enabled on systems known not to drop dirty buffered data on write-back
3612 * failure (with the likely outcome that checkpoints will continue to fail
3613 * until the underlying problem is fixed).
3614 *
3615 * Any code that reports a failure from fsync() or related functions should
3616 * filter the error level with this function.
3617 */
3618 int
data_sync_elevel(int elevel)3619 data_sync_elevel(int elevel)
3620 {
3621 return data_sync_retry ? elevel : PANIC;
3622 }
3623