1 /*-------------------------------------------------------------------------
2 *
3 * fd.c
4 * Virtual file descriptor code.
5 *
6 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 * IDENTIFICATION
10 * src/backend/storage/file/fd.c
11 *
12 * NOTES:
13 *
14 * This code manages a cache of 'virtual' file descriptors (VFDs).
15 * The server opens many file descriptors for a variety of reasons,
16 * including base tables, scratch files (e.g., sort and hash spool
17 * files), and random calls to C library routines like system(3); it
18 * is quite easy to exceed system limits on the number of open files a
19 * single process can have. (This is around 1024 on many modern
20 * operating systems, but may be lower on others.)
21 *
 * VFDs are managed as an LRU pool, with actual OS file descriptors
 * being opened and closed as needed.  Obviously, if a file is opened
 * using these interfaces, all subsequent operations on it must also go
 * through these interfaces (the File type is not a real file
 * descriptor).
27 *
28 * For this scheme to work, most (if not all) routines throughout the
29 * server should use these interfaces instead of calling the C library
30 * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31 * may find ourselves short of real file descriptors anyway.
32 *
33 * INTERFACE ROUTINES
34 *
35 * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36 * A File opened with OpenTemporaryFile is automatically deleted when the
37 * File is closed, either explicitly or implicitly at end of transaction or
38 * process exit. PathNameOpenFile is intended for files that are held open
39 * for a long time, like relation files. It is the caller's responsibility
40 * to close them, there is no automatic mechanism in fd.c for that.
41 *
42 * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43 * temporary files that have names so that they can be shared between
44 * backends. Such files are automatically closed and count against the
45 * temporary file limit of the backend that creates them, but unlike anonymous
46 * files they are not automatically deleted. See sharedfileset.c for a shared
47 * ownership mechanism that provides automatic cleanup for shared files when
48 * the last of a group of backends detaches.
49 *
50 * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51 * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52 * They behave like the corresponding native functions, except that the handle
53 * is registered with the current subtransaction, and will be automatically
54 * closed at abort. These are intended mainly for short operations like
55 * reading a configuration file; there is a limit on the number of files that
56 * can be opened using these functions at any one time.
57 *
58 * Finally, BasicOpenFile is just a thin wrapper around open() that can
59 * release file descriptors in use by the virtual file descriptors if
60 * necessary. There is no automatic cleanup of file descriptors returned by
61 * BasicOpenFile, it is solely the caller's responsibility to close the file
62 * descriptor by calling close(2).
63 *
64 * If a non-virtual file descriptor needs to be held open for any length of
65 * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
66 * (and eventually ReleaseExternalFD), so that we can take it into account
67 * while deciding how many VFDs can be open. This applies to FDs obtained
68 * with BasicOpenFile as well as those obtained without use of any fd.c API.
69 *
70 *-------------------------------------------------------------------------
71 */
72
73 #include "postgres.h"
74
75 #include <dirent.h>
76 #include <sys/file.h>
77 #include <sys/param.h>
78 #include <sys/stat.h>
79 #include <sys/types.h>
80 #ifndef WIN32
81 #include <sys/mman.h>
82 #endif
83 #include <limits.h>
84 #include <unistd.h>
85 #include <fcntl.h>
86 #ifdef HAVE_SYS_RESOURCE_H
87 #include <sys/resource.h> /* for getrlimit */
88 #endif
89
90 #include "access/xact.h"
91 #include "access/xlog.h"
92 #include "catalog/pg_tablespace.h"
93 #include "common/file_perm.h"
94 #include "common/file_utils.h"
95 #include "miscadmin.h"
96 #include "pgstat.h"
97 #include "port/pg_iovec.h"
98 #include "portability/mem.h"
99 #include "storage/fd.h"
100 #include "storage/ipc.h"
101 #include "utils/guc.h"
102 #include "utils/resowner_private.h"
103
/*
 * PG_FLUSH_DATA_WORKS is defined (as 1) whenever at least one of the flush
 * strategies that pg_flush_data() can be compiled with is available:
 * sync_file_range(), msync(MS_ASYNC) on non-Windows platforms, or
 * posix_fadvise(POSIX_FADV_DONTNEED).
 */
#if defined(HAVE_SYNC_FILE_RANGE) || \
	(!defined(WIN32) && defined(MS_ASYNC)) || \
	(defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED))
#define PG_FLUSH_DATA_WORKS 1
#endif
112
/*
 * NUM_RESERVED_FDS is the number of file descriptors we always leave free
 * for code that opens files without going through fd.c: system(), the
 * dynamic loader, and so on.  (We try fairly hard to prevent EMFILE errors,
 * but ENFILE can still happen because other processes may be chewing up
 * FDs; so bypassing fd.c is a bad idea, even though we cannot stop all
 * code from doing it.)
 *
 * Since this is a fixed setting, we are in effect assuming that no such
 * code keeps FDs open over the long term; if it did, the slop would likely
 * be insufficient.  In particular, we expect that loading a shared library
 * does not permanently increase the number of open files.  (That appeared
 * to be true on most if not all platforms as of Feb 2004.)
 */
#define NUM_RESERVED_FDS		10

/*
 * FD_MINFREE is the minimum number of usable FDs we insist on having after
 * the reserved ones have been set aside; with fewer than this, we choke.
 * (The value is chosen to work with "ulimit -n 64", but not much less than
 * that.  It also ensures numExternalFDs can be at least 16; as of this
 * writing, the contrib/postgres_fdw regression tests will not pass unless
 * that can grow to at least 14.)
 */
#define FD_MINFREE				48
138
139 /*
140 * A number of platforms allow individual processes to open many more files
141 * than they can really support when *many* processes do the same thing.
142 * This GUC parameter lets the DBA limit max_safe_fds to something less than
143 * what the postmaster's initial probe suggests will work.
144 */
145 int max_files_per_process = 1000;
146
147 /*
148 * Maximum number of file descriptors to open for operations that fd.c knows
149 * about (VFDs, AllocateFile etc, or "external" FDs). This is initialized
150 * to a conservative value, and remains that way indefinitely in bootstrap or
151 * standalone-backend cases. In normal postmaster operation, the postmaster
152 * calls set_max_safe_fds() late in initialization to update the value, and
153 * that value is then inherited by forked subprocesses.
154 *
155 * Note: the value of max_files_per_process is taken into account while
156 * setting this variable, and so need not be tested separately.
157 */
158 int max_safe_fds = FD_MINFREE; /* default if not changed */
159
160 /* Whether it is safe to continue running after fsync() fails. */
161 bool data_sync_retry = false;
162
163 /* How SyncDataDirectory() should do its job. */
164 int recovery_init_sync_method = RECOVERY_INIT_SYNC_METHOD_FSYNC;
165
/*
 * Debugging support.
 *
 * DO_DB(A) expands to a no-op expression unless FDDEBUG is defined.  With
 * FDDEBUG, it executes statement A, saving errno beforehand and restoring
 * it afterwards.
 */
#ifndef FDDEBUG
#define DO_DB(A) \
	((void) 0)
#else
#define DO_DB(A) \
	do { \
		int			_do_db_save_errno = errno; \
		A; \
		errno = _do_db_save_errno; \
	} while (0)
#endif
179
/* Value stored in Vfd.fd while no kernel descriptor is assigned */
#define VFD_CLOSED (-1)

/* True if the VFD currently has no kernel descriptor */
#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)

/*
 * True if "file" is a positive index inside VfdCache and that entry has a
 * non-NULL fileName.
 */
#define FileIsValid(file) \
	((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)

/* Bit assignments for Vfd.fdstate: */
#define FD_DELETE_AT_CLOSE	(1 << 0)	/* set => delete when closed */
#define FD_CLOSE_AT_EOXACT	(1 << 1)	/* set => close at eoXact */
#define FD_TEMP_FILE_LIMIT	(1 << 2)	/* set => respect temp_file_limit */
191
192 typedef struct vfd
193 {
194 int fd; /* current FD, or VFD_CLOSED if none */
195 unsigned short fdstate; /* bitflags for VFD's state */
196 ResourceOwner resowner; /* owner, for automatic cleanup */
197 File nextFree; /* link to next free VFD, if in freelist */
198 File lruMoreRecently; /* doubly linked recency-of-use list */
199 File lruLessRecently;
200 off_t fileSize; /* current size of file (0 if not temporary) */
201 char *fileName; /* name of file, or NULL for unused VFD */
202 /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
203 int fileFlags; /* open(2) flags for (re)opening the file */
204 mode_t fileMode; /* mode to pass to open(2) */
205 } Vfd;
206
207 /*
208 * Virtual File Descriptor array pointer and size. This grows as
209 * needed. 'File' values are indexes into this array.
210 * Note that VfdCache[0] is not a usable VFD, just a list header.
211 */
212 static Vfd *VfdCache;
213 static Size SizeVfdCache = 0;
214
215 /*
216 * Number of file descriptors known to be in use by VFD entries.
217 */
218 static int nfile = 0;
219
220 /*
221 * Flag to tell whether it's worth scanning VfdCache looking for temp files
222 * to close
223 */
224 static bool have_xact_temporary_files = false;
225
226 /*
227 * Tracks the total size of all temporary files. Note: when temp_file_limit
228 * is being enforced, this cannot overflow since the limit cannot be more
229 * than INT_MAX kilobytes. When not enforcing, it could theoretically
230 * overflow, but we don't care.
231 */
232 static uint64 temporary_files_size = 0;
233
234 /*
235 * List of OS handles opened with AllocateFile, AllocateDir and
236 * OpenTransientFile.
237 */
238 typedef enum
239 {
240 AllocateDescFile,
241 AllocateDescPipe,
242 AllocateDescDir,
243 AllocateDescRawFD
244 } AllocateDescKind;
245
246 typedef struct
247 {
248 AllocateDescKind kind;
249 SubTransactionId create_subid;
250 union
251 {
252 FILE *file;
253 DIR *dir;
254 int fd;
255 } desc;
256 } AllocateDesc;
257
258 static int numAllocatedDescs = 0;
259 static int maxAllocatedDescs = 0;
260 static AllocateDesc *allocatedDescs = NULL;
261
262 /*
263 * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
264 */
265 static int numExternalFDs = 0;
266
267 /*
268 * Number of temporary files opened during the current session;
269 * this is used in generation of tempfile names.
270 */
271 static long tempFileCounter = 0;
272
273 /*
274 * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid,
275 * indicating that the current database's default tablespace should be used.)
276 * When numTempTableSpaces is -1, this has not been set in the current
277 * transaction.
278 */
279 static Oid *tempTableSpaces = NULL;
280 static int numTempTableSpaces = -1;
281 static int nextTempTableSpace = 0;
282
283
284 /*--------------------
285 *
286 * Private Routines
287 *
288 * Delete - delete a file from the Lru ring
289 * LruDelete - remove a file from the Lru ring and close its FD
290 * Insert - put a file at the front of the Lru ring
291 * LruInsert - put a file at the front of the Lru ring and open it
292 * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
293 * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
294 * AllocateVfd - grab a free (or new) file record (from VfdCache)
295 * FreeVfd - free a file record
296 *
297 * The Least Recently Used ring is a doubly linked list that begins and
298 * ends on element zero. Element zero is special -- it doesn't represent
299 * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
300 * anchor that shows us the beginning/end of the ring.
301 * Only VFD elements that are currently really open (have an FD assigned) are
302 * in the Lru ring. Elements that are "virtually" open can be recognized
303 * by having a non-null fileName field.
304 *
305 * example:
306 *
307 * /--less----\ /---------\
308 * v \ v \
309 * #0 --more---> LeastRecentlyUsed --more-\ \
310 * ^\ | |
311 * \\less--> MostRecentlyUsedFile <---/ |
312 * \more---/ \--less--/
313 *
314 *--------------------
315 */
316 static void Delete(File file);
317 static void LruDelete(File file);
318 static void Insert(File file);
319 static int LruInsert(File file);
320 static bool ReleaseLruFile(void);
321 static void ReleaseLruFiles(void);
322 static File AllocateVfd(void);
323 static void FreeVfd(File file);
324
325 static int FileAccess(File file);
326 static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
327 static bool reserveAllocatedDesc(void);
328 static int FreeDesc(AllocateDesc *desc);
329
330 static void AtProcExit_Files(int code, Datum arg);
331 static void CleanupTempFiles(bool isCommit, bool isProcExit);
332 static void RemovePgTempRelationFiles(const char *tsdirname);
333 static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
334
335 static void walkdir(const char *path,
336 void (*action) (const char *fname, bool isdir, int elevel),
337 bool process_symlinks,
338 int elevel);
339 #ifdef PG_FLUSH_DATA_WORKS
340 static void pre_sync_fname(const char *fname, bool isdir, int elevel);
341 #endif
342 static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
343 static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
344
345 static int fsync_parent_path(const char *fname, int elevel);
346
347
/*
 * pg_fsync --- do fsync with or without writethrough
 *
 * Calls pg_fsync_writethrough() when sync_method is
 * SYNC_METHOD_FSYNC_WRITETHROUGH (and the platform has a write-through
 * fsync that differs from plain fsync); otherwise calls
 * pg_fsync_no_writethrough().  The chosen routine's result is returned
 * as-is.
 */
int
pg_fsync(int fd)
{
#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
	struct stat statbuf;

	/*
	 * Depending on the operating system, fsync() may impose requirements on
	 * the access mode its file descriptor was opened with, and those
	 * requirements differ between regular files and directories.
	 *
	 * Any descriptor that might eventually reach fsync() should therefore
	 * have been opened with an access mode that fsync() accepts on every
	 * supported system; otherwise the code is not portable even if it
	 * happens to work here.
	 *
	 * So assert that a file was opened for writing (O_RDWR or O_WRONLY) and
	 * a directory was opened without write access (O_RDONLY).
	 *
	 * fstat failures are ignored; the fsync() that follows will deal with
	 * them.  Performing the check in this function means it is done even
	 * when fsync() is disabled.
	 */
	if (fstat(fd, &statbuf) == 0)
	{
		int			desc_flags = fcntl(fd, F_GETFL);

		/*
		 * O_RDONLY is historically 0, so for directories we can only verify
		 * that no write flag is present.
		 */
		if (S_ISDIR(statbuf.st_mode))
			Assert((desc_flags & (O_RDWR | O_WRONLY)) == 0);
		else
			Assert((desc_flags & (O_RDWR | O_WRONLY)) != 0);
	}
	errno = 0;
#endif

	/* the sync_method test is compiled out when it cannot matter */
#if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
	if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
		return pg_fsync_writethrough(fd);
#endif

	return pg_fsync_no_writethrough(fd);
}
400
401
402 /*
403 * pg_fsync_no_writethrough --- same as fsync except does nothing if
404 * enableFsync is off
405 */
406 int
pg_fsync_no_writethrough(int fd)407 pg_fsync_no_writethrough(int fd)
408 {
409 if (enableFsync)
410 return fsync(fd);
411 else
412 return 0;
413 }
414
415 /*
416 * pg_fsync_writethrough
417 */
418 int
pg_fsync_writethrough(int fd)419 pg_fsync_writethrough(int fd)
420 {
421 if (enableFsync)
422 {
423 #ifdef WIN32
424 return _commit(fd);
425 #elif defined(F_FULLFSYNC)
426 return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
427 #else
428 errno = ENOSYS;
429 return -1;
430 #endif
431 }
432 else
433 return 0;
434 }
435
436 /*
437 * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
438 *
439 * Not all platforms have fdatasync; treat as fsync if not available.
440 */
441 int
pg_fdatasync(int fd)442 pg_fdatasync(int fd)
443 {
444 if (enableFsync)
445 {
446 #ifdef HAVE_FDATASYNC
447 return fdatasync(fd);
448 #else
449 return fsync(fd);
450 #endif
451 }
452 else
453 return 0;
454 }
455
456 /*
457 * pg_flush_data --- advise OS that the described dirty data should be flushed
458 *
459 * offset of 0 with nbytes 0 means that the entire file should be flushed
460 */
461 void
pg_flush_data(int fd,off_t offset,off_t nbytes)462 pg_flush_data(int fd, off_t offset, off_t nbytes)
463 {
464 /*
465 * Right now file flushing is primarily used to avoid making later
466 * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
467 * if fsyncs are disabled - that's a decision we might want to make
468 * configurable at some point.
469 */
470 if (!enableFsync)
471 return;
472
473 /*
474 * We compile all alternatives that are supported on the current platform,
475 * to find portability problems more easily.
476 */
477 #if defined(HAVE_SYNC_FILE_RANGE)
478 {
479 int rc;
480 static bool not_implemented_by_kernel = false;
481
482 if (not_implemented_by_kernel)
483 return;
484
485 /*
486 * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
487 * tells the OS that writeback for the specified blocks should be
488 * started, but that we don't want to wait for completion. Note that
489 * this call might block if too much dirty data exists in the range.
490 * This is the preferable method on OSs supporting it, as it works
491 * reliably when available (contrast to msync()) and doesn't flush out
492 * clean data (like FADV_DONTNEED).
493 */
494 rc = sync_file_range(fd, offset, nbytes,
495 SYNC_FILE_RANGE_WRITE);
496 if (rc != 0)
497 {
498 int elevel;
499
500 /*
501 * For systems that don't have an implementation of
502 * sync_file_range() such as Windows WSL, generate only one
503 * warning and then suppress all further attempts by this process.
504 */
505 if (errno == ENOSYS)
506 {
507 elevel = WARNING;
508 not_implemented_by_kernel = true;
509 }
510 else
511 elevel = data_sync_elevel(WARNING);
512
513 ereport(elevel,
514 (errcode_for_file_access(),
515 errmsg("could not flush dirty data: %m")));
516 }
517
518 return;
519 }
520 #endif
521 #if !defined(WIN32) && defined(MS_ASYNC)
522 {
523 void *p;
524 static int pagesize = 0;
525
526 /*
527 * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
528 * writeback. On linux it only does so if MS_SYNC is specified, but
529 * then it does the writeback synchronously. Luckily all common linux
530 * systems have sync_file_range(). This is preferable over
531 * FADV_DONTNEED because it doesn't flush out clean data.
532 *
533 * We map the file (mmap()), tell the kernel to sync back the contents
534 * (msync()), and then remove the mapping again (munmap()).
535 */
536
537 /* mmap() needs actual length if we want to map whole file */
538 if (offset == 0 && nbytes == 0)
539 {
540 nbytes = lseek(fd, 0, SEEK_END);
541 if (nbytes < 0)
542 {
543 ereport(WARNING,
544 (errcode_for_file_access(),
545 errmsg("could not determine dirty data size: %m")));
546 return;
547 }
548 }
549
550 /*
551 * Some platforms reject partial-page mmap() attempts. To deal with
552 * that, just truncate the request to a page boundary. If any extra
553 * bytes don't get flushed, well, it's only a hint anyway.
554 */
555
556 /* fetch pagesize only once */
557 if (pagesize == 0)
558 pagesize = sysconf(_SC_PAGESIZE);
559
560 /* align length to pagesize, dropping any fractional page */
561 if (pagesize > 0)
562 nbytes = (nbytes / pagesize) * pagesize;
563
564 /* fractional-page request is a no-op */
565 if (nbytes <= 0)
566 return;
567
568 /*
569 * mmap could well fail, particularly on 32-bit platforms where there
570 * may simply not be enough address space. If so, silently fall
571 * through to the next implementation.
572 */
573 if (nbytes <= (off_t) SSIZE_MAX)
574 p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
575 else
576 p = MAP_FAILED;
577
578 if (p != MAP_FAILED)
579 {
580 int rc;
581
582 rc = msync(p, (size_t) nbytes, MS_ASYNC);
583 if (rc != 0)
584 {
585 ereport(data_sync_elevel(WARNING),
586 (errcode_for_file_access(),
587 errmsg("could not flush dirty data: %m")));
588 /* NB: need to fall through to munmap()! */
589 }
590
591 rc = munmap(p, (size_t) nbytes);
592 if (rc != 0)
593 {
594 /* FATAL error because mapping would remain */
595 ereport(FATAL,
596 (errcode_for_file_access(),
597 errmsg("could not munmap() while flushing data: %m")));
598 }
599
600 return;
601 }
602 }
603 #endif
604 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
605 {
606 int rc;
607
608 /*
609 * Signal the kernel that the passed in range should not be cached
610 * anymore. This has the, desired, side effect of writing out dirty
611 * data, and the, undesired, side effect of likely discarding useful
612 * clean cached blocks. For the latter reason this is the least
613 * preferable method.
614 */
615
616 rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
617
618 if (rc != 0)
619 {
620 /* don't error out, this is just a performance optimization */
621 ereport(WARNING,
622 (errcode_for_file_access(),
623 errmsg("could not flush dirty data: %m")));
624 }
625
626 return;
627 }
628 #endif
629 }
630
/*
 * pg_truncate --- truncate a file to a given length by name
 *
 * path:	name of the file to truncate
 * length:	size, in bytes, the file should have afterwards
 *
 * Returns 0 on success, or -1 on failure with errno set.
 *
 * On WIN32 the file is opened with OpenTransientFile() and truncated through
 * the descriptor; elsewhere truncate(2) is called directly.
 */
int
pg_truncate(const char *path, off_t length)
{
#ifdef WIN32
	int			save_errno;
	int			ret;
	int			fd;

	fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
	if (fd >= 0)
	{
		/* honor the requested length, exactly as the truncate() path does */
		ret = ftruncate(fd, length);

		/* keep ftruncate's errno from being clobbered by the close */
		save_errno = errno;
		CloseTransientFile(fd);
		errno = save_errno;
	}
	else
		ret = -1;

	return ret;
#else
	return truncate(path, length);
#endif
}
658
659 /*
660 * fsync_fname -- fsync a file or directory, handling errors properly
661 *
662 * Try to fsync a file or directory. When doing the latter, ignore errors that
663 * indicate the OS just doesn't allow/require fsyncing directories.
664 */
665 void
fsync_fname(const char * fname,bool isdir)666 fsync_fname(const char *fname, bool isdir)
667 {
668 fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
669 }
670
671 /*
672 * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
673 *
674 * This routine ensures that, after returning, the effect of renaming file
675 * persists in case of a crash. A crash while this routine is running will
676 * leave you with either the pre-existing or the moved file in place of the
677 * new file; no mixed state or truncated files are possible.
678 *
679 * It does so by using fsync on the old filename and the possibly existing
680 * target filename before the rename, and the target file and directory after.
681 *
682 * Note that rename() cannot be used across arbitrary directories, as they
683 * might not be on the same filesystem. Therefore this routine does not
684 * support renaming across directories.
685 *
686 * Log errors with the caller specified severity.
687 *
688 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
689 * valid upon return.
690 */
691 int
durable_rename(const char * oldfile,const char * newfile,int elevel)692 durable_rename(const char *oldfile, const char *newfile, int elevel)
693 {
694 int fd;
695
696 /*
697 * First fsync the old and target path (if it exists), to ensure that they
698 * are properly persistent on disk. Syncing the target file is not
699 * strictly necessary, but it makes it easier to reason about crashes;
700 * because it's then guaranteed that either source or target file exists
701 * after a crash.
702 */
703 if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
704 return -1;
705
706 fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
707 if (fd < 0)
708 {
709 if (errno != ENOENT)
710 {
711 ereport(elevel,
712 (errcode_for_file_access(),
713 errmsg("could not open file \"%s\": %m", newfile)));
714 return -1;
715 }
716 }
717 else
718 {
719 if (pg_fsync(fd) != 0)
720 {
721 int save_errno;
722
723 /* close file upon error, might not be in transaction context */
724 save_errno = errno;
725 CloseTransientFile(fd);
726 errno = save_errno;
727
728 ereport(elevel,
729 (errcode_for_file_access(),
730 errmsg("could not fsync file \"%s\": %m", newfile)));
731 return -1;
732 }
733
734 if (CloseTransientFile(fd) != 0)
735 {
736 ereport(elevel,
737 (errcode_for_file_access(),
738 errmsg("could not close file \"%s\": %m", newfile)));
739 return -1;
740 }
741 }
742
743 /* Time to do the real deal... */
744 if (rename(oldfile, newfile) < 0)
745 {
746 ereport(elevel,
747 (errcode_for_file_access(),
748 errmsg("could not rename file \"%s\" to \"%s\": %m",
749 oldfile, newfile)));
750 return -1;
751 }
752
753 /*
754 * To guarantee renaming the file is persistent, fsync the file with its
755 * new name, and its containing directory.
756 */
757 if (fsync_fname_ext(newfile, false, false, elevel) != 0)
758 return -1;
759
760 if (fsync_parent_path(newfile, elevel) != 0)
761 return -1;
762
763 return 0;
764 }
765
/*
 * durable_unlink -- remove a file in a durable manner
 *
 * Once this routine returns, the removal persists across a crash.  A crash
 * while it is running does not leave the system in a mixed state.
 *
 * This is achieved by fsyncing the file's parent directory after the file
 * has actually been removed.
 *
 * Errors are logged with the severity given by the caller.
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  Note that errno is not
 * valid upon return.
 */
int
durable_unlink(const char *fname, int elevel)
{
	if (unlink(fname) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove file \"%s\": %m",
						fname)));
		return -1;
	}

	/* The removal only becomes persistent once the parent dir is synced. */
	return (fsync_parent_path(fname, elevel) != 0) ? -1 : 0;
}
802
/*
 * durable_rename_excl -- rename a file in a durable manner.
 *
 * Like durable_rename(), except that this routine tries (but does not
 * guarantee) not to overwrite the target file.
 *
 * A crash at an unfortunate moment can leave two links to the target file.
 *
 * Errors are logged with the severity given by the caller.
 *
 * On Windows, a hard link followed by unlink() causes concurrency issues,
 * whereas a simple rename() does not; keep that in mind when changing the
 * logic of this routine.
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  Note that errno is not
 * valid upon return.
 */
int
durable_rename_excl(const char *oldfile, const char *newfile, int elevel)
{
	/*
	 * Sync the source first, so that a crash right after the rename/link
	 * still leaves a file with valid contents in place.
	 */
	if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
		return -1;

#ifndef HAVE_WORKING_LINK
	if (rename(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}
#else
	if (link(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not link file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}
	/* drop the old name; the result of unlink() is not checked */
	unlink(oldfile);
#endif

	/*
	 * To survive an OS crash, both the new entry and then its parent
	 * directory have to be flushed.
	 */
	if (fsync_fname_ext(newfile, false, false, elevel) != 0 ||
		fsync_parent_path(newfile, elevel) != 0)
		return -1;

	return 0;
}
865
866 /*
867 * InitFileAccess --- initialize this module during backend startup
868 *
869 * This is called during either normal or standalone backend start.
870 * It is *not* called in the postmaster.
871 */
872 void
InitFileAccess(void)873 InitFileAccess(void)
874 {
875 Assert(SizeVfdCache == 0); /* call me only once */
876
877 /* initialize cache header entry */
878 VfdCache = (Vfd *) malloc(sizeof(Vfd));
879 if (VfdCache == NULL)
880 ereport(FATAL,
881 (errcode(ERRCODE_OUT_OF_MEMORY),
882 errmsg("out of memory")));
883
884 MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
885 VfdCache->fd = VFD_CLOSED;
886
887 SizeVfdCache = 1;
888
889 /* register proc-exit hook to ensure temp files are dropped at exit */
890 on_proc_exit(AtProcExit_Files, 0);
891 }
892
893 /*
894 * count_usable_fds --- count how many FDs the system will let us open,
895 * and estimate how many are already open.
896 *
897 * We stop counting if usable_fds reaches max_to_probe. Note: a small
898 * value of max_to_probe might result in an underestimate of already_open;
899 * we must fill in any "gaps" in the set of used FDs before the calculation
900 * of already_open will give the right answer. In practice, max_to_probe
901 * of a couple of dozen should be enough to ensure good results.
902 *
903 * We assume stderr (FD 2) is available for dup'ing. While the calling
904 * script could theoretically close that, it would be a really bad idea,
905 * since then one risks loss of error messages from, e.g., libc.
906 */
907 static void
count_usable_fds(int max_to_probe,int * usable_fds,int * already_open)908 count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
909 {
910 int *fd;
911 int size;
912 int used = 0;
913 int highestfd = 0;
914 int j;
915
916 #ifdef HAVE_GETRLIMIT
917 struct rlimit rlim;
918 int getrlimit_status;
919 #endif
920
921 size = 1024;
922 fd = (int *) palloc(size * sizeof(int));
923
924 #ifdef HAVE_GETRLIMIT
925 #ifdef RLIMIT_NOFILE /* most platforms use RLIMIT_NOFILE */
926 getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
927 #else /* but BSD doesn't ... */
928 getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
929 #endif /* RLIMIT_NOFILE */
930 if (getrlimit_status != 0)
931 ereport(WARNING, (errmsg("getrlimit failed: %m")));
932 #endif /* HAVE_GETRLIMIT */
933
934 /* dup until failure or probe limit reached */
935 for (;;)
936 {
937 int thisfd;
938
939 #ifdef HAVE_GETRLIMIT
940
941 /*
942 * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
943 * some platforms
944 */
945 if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
946 break;
947 #endif
948
949 thisfd = dup(2);
950 if (thisfd < 0)
951 {
952 /* Expect EMFILE or ENFILE, else it's fishy */
953 if (errno != EMFILE && errno != ENFILE)
954 elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
955 break;
956 }
957
958 if (used >= size)
959 {
960 size *= 2;
961 fd = (int *) repalloc(fd, size * sizeof(int));
962 }
963 fd[used++] = thisfd;
964
965 if (highestfd < thisfd)
966 highestfd = thisfd;
967
968 if (used >= max_to_probe)
969 break;
970 }
971
972 /* release the files we opened */
973 for (j = 0; j < used; j++)
974 close(fd[j]);
975
976 pfree(fd);
977
978 /*
979 * Return results. usable_fds is just the number of successful dups. We
980 * assume that the system limit is highestfd+1 (remember 0 is a legal FD
981 * number) and so already_open is highestfd+1 - usable_fds.
982 */
983 *usable_fds = used;
984 *already_open = highestfd + 1 - used;
985 }
986
987 /*
988 * set_max_safe_fds
989 * Determine number of file descriptors that fd.c is allowed to use
990 */
991 void
set_max_safe_fds(void)992 set_max_safe_fds(void)
993 {
994 int usable_fds;
995 int already_open;
996
997 /*----------
998 * We want to set max_safe_fds to
999 * MIN(usable_fds, max_files_per_process - already_open)
1000 * less the slop factor for files that are opened without consulting
1001 * fd.c. This ensures that we won't exceed either max_files_per_process
1002 * or the experimentally-determined EMFILE limit.
1003 *----------
1004 */
1005 count_usable_fds(max_files_per_process,
1006 &usable_fds, &already_open);
1007
1008 max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
1009
1010 /*
1011 * Take off the FDs reserved for system() etc.
1012 */
1013 max_safe_fds -= NUM_RESERVED_FDS;
1014
1015 /*
1016 * Make sure we still have enough to get by.
1017 */
1018 if (max_safe_fds < FD_MINFREE)
1019 ereport(FATAL,
1020 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1021 errmsg("insufficient file descriptors available to start server process"),
1022 errdetail("System allows %d, we need at least %d.",
1023 max_safe_fds + NUM_RESERVED_FDS,
1024 FD_MINFREE + NUM_RESERVED_FDS)));
1025
1026 elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
1027 max_safe_fds, usable_fds, already_open);
1028 }
1029
1030 /*
1031 * Open a file with BasicOpenFilePerm() and pass default file mode for the
1032 * fileMode parameter.
1033 */
1034 int
BasicOpenFile(const char * fileName,int fileFlags)1035 BasicOpenFile(const char *fileName, int fileFlags)
1036 {
1037 return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1038 }
1039
1040 /*
1041 * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
1042 *
1043 * This is exported for use by places that really want a plain kernel FD,
1044 * but need to be proof against running out of FDs. Once an FD has been
1045 * successfully returned, it is the caller's responsibility to ensure that
1046 * it will not be leaked on ereport()! Most users should *not* call this
1047 * routine directly, but instead use the VFD abstraction level, which
1048 * provides protection against descriptor leaks as well as management of
1049 * files that need to be open for more than a short period of time.
1050 *
1051 * Ideally this should be the *only* direct call of open() in the backend.
1052 * In practice, the postmaster calls open() directly, and there are some
1053 * direct open() calls done early in backend startup. Those are OK since
1054 * this module wouldn't have any open files to close at that point anyway.
1055 */
1056 int
BasicOpenFilePerm(const char * fileName,int fileFlags,mode_t fileMode)1057 BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1058 {
1059 int fd;
1060
1061 tryAgain:
1062 fd = open(fileName, fileFlags, fileMode);
1063
1064 if (fd >= 0)
1065 return fd; /* success! */
1066
1067 if (errno == EMFILE || errno == ENFILE)
1068 {
1069 int save_errno = errno;
1070
1071 ereport(LOG,
1072 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1073 errmsg("out of file descriptors: %m; release and retry")));
1074 errno = 0;
1075 if (ReleaseLruFile())
1076 goto tryAgain;
1077 errno = save_errno;
1078 }
1079
1080 return -1; /* failure */
1081 }
1082
1083 /*
1084 * AcquireExternalFD - attempt to reserve an external file descriptor
1085 *
1086 * This should be used by callers that need to hold a file descriptor open
1087 * over more than a short interval, but cannot use any of the other facilities
1088 * provided by this module.
1089 *
1090 * The difference between this and the underlying ReserveExternalFD function
1091 * is that this will report failure (by setting errno and returning false)
1092 * if "too many" external FDs are already reserved. This should be used in
1093 * any code where the total number of FDs to be reserved is not predictable
1094 * and small.
1095 */
1096 bool
AcquireExternalFD(void)1097 AcquireExternalFD(void)
1098 {
1099 /*
1100 * We don't want more than max_safe_fds / 3 FDs to be consumed for
1101 * "external" FDs.
1102 */
1103 if (numExternalFDs < max_safe_fds / 3)
1104 {
1105 ReserveExternalFD();
1106 return true;
1107 }
1108 errno = EMFILE;
1109 return false;
1110 }
1111
1112 /*
1113 * ReserveExternalFD - report external consumption of a file descriptor
1114 *
1115 * This should be used by callers that need to hold a file descriptor open
1116 * over more than a short interval, but cannot use any of the other facilities
1117 * provided by this module. This just tracks the use of the FD and closes
1118 * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
1119 *
1120 * Call this directly only in code where failure to reserve the FD would be
1121 * fatal; for example, the WAL-writing code does so, since the alternative is
1122 * session failure. Also, it's very unwise to do so in code that could
1123 * consume more than one FD per process.
1124 *
1125 * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
1126 * available, it doesn't matter too much whether this is called before or
1127 * after actually opening the FD; but doing so beforehand reduces the risk of
1128 * an EMFILE failure if not everybody played nice. In any case, it's solely
1129 * caller's responsibility to keep the external-FD count in sync with reality.
1130 */
1131 void
ReserveExternalFD(void)1132 ReserveExternalFD(void)
1133 {
1134 /*
1135 * Release VFDs if needed to stay safe. Because we do this before
1136 * incrementing numExternalFDs, the final state will be as desired, i.e.,
1137 * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
1138 */
1139 ReleaseLruFiles();
1140
1141 numExternalFDs++;
1142 }
1143
1144 /*
1145 * ReleaseExternalFD - report release of an external file descriptor
1146 *
1147 * This is guaranteed not to change errno, so it can be used in failure paths.
1148 */
1149 void
ReleaseExternalFD(void)1150 ReleaseExternalFD(void)
1151 {
1152 Assert(numExternalFDs > 0);
1153 numExternalFDs--;
1154 }
1155
1156
#if defined(FDDEBUG)

/*
 * Debugging aid: log the VFD indexes on the LRU ring as a single line,
 * starting from VfdCache[0].lruLessRecently and following lruLessRecently
 * links until the ring returns to slot 0.  Output that does not fit in the
 * local buffer is truncated.
 */
static void
_dump_lru(void)
{
	char		line[2048];
	size_t		used;
	int			cur = VfdCache[0].lruLessRecently;

	snprintf(line, sizeof(line), "LRU: MOST %d ", cur);

	while (cur != 0)
	{
		cur = VfdCache[cur].lruLessRecently;

		/* append at the current end of the string */
		used = strlen(line);
		snprintf(line + used, sizeof(line) - used, "%d ", cur);
	}

	used = strlen(line);
	snprintf(line + used, sizeof(line) - used, "LEAST");

	elog(LOG, "%s", line);
}
#endif							/* FDDEBUG */
1177
1178 static void
Delete(File file)1179 Delete(File file)
1180 {
1181 Vfd *vfdP;
1182
1183 Assert(file != 0);
1184
1185 DO_DB(elog(LOG, "Delete %d (%s)",
1186 file, VfdCache[file].fileName));
1187 DO_DB(_dump_lru());
1188
1189 vfdP = &VfdCache[file];
1190
1191 VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1192 VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1193
1194 DO_DB(_dump_lru());
1195 }
1196
1197 static void
LruDelete(File file)1198 LruDelete(File file)
1199 {
1200 Vfd *vfdP;
1201
1202 Assert(file != 0);
1203
1204 DO_DB(elog(LOG, "LruDelete %d (%s)",
1205 file, VfdCache[file].fileName));
1206
1207 vfdP = &VfdCache[file];
1208
1209 /*
1210 * Close the file. We aren't expecting this to fail; if it does, better
1211 * to leak the FD than to mess up our internal state.
1212 */
1213 if (close(vfdP->fd) != 0)
1214 elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1215 "could not close file \"%s\": %m", vfdP->fileName);
1216 vfdP->fd = VFD_CLOSED;
1217 --nfile;
1218
1219 /* delete the vfd record from the LRU ring */
1220 Delete(file);
1221 }
1222
1223 static void
Insert(File file)1224 Insert(File file)
1225 {
1226 Vfd *vfdP;
1227
1228 Assert(file != 0);
1229
1230 DO_DB(elog(LOG, "Insert %d (%s)",
1231 file, VfdCache[file].fileName));
1232 DO_DB(_dump_lru());
1233
1234 vfdP = &VfdCache[file];
1235
1236 vfdP->lruMoreRecently = 0;
1237 vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1238 VfdCache[0].lruLessRecently = file;
1239 VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1240
1241 DO_DB(_dump_lru());
1242 }
1243
1244 /* returns 0 on success, -1 on re-open failure (with errno set) */
1245 static int
LruInsert(File file)1246 LruInsert(File file)
1247 {
1248 Vfd *vfdP;
1249
1250 Assert(file != 0);
1251
1252 DO_DB(elog(LOG, "LruInsert %d (%s)",
1253 file, VfdCache[file].fileName));
1254
1255 vfdP = &VfdCache[file];
1256
1257 if (FileIsNotOpen(file))
1258 {
1259 /* Close excess kernel FDs. */
1260 ReleaseLruFiles();
1261
1262 /*
1263 * The open could still fail for lack of file descriptors, eg due to
1264 * overall system file table being full. So, be prepared to release
1265 * another FD if necessary...
1266 */
1267 vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1268 vfdP->fileMode);
1269 if (vfdP->fd < 0)
1270 {
1271 DO_DB(elog(LOG, "re-open failed: %m"));
1272 return -1;
1273 }
1274 else
1275 {
1276 ++nfile;
1277 }
1278 }
1279
1280 /*
1281 * put it at the head of the Lru ring
1282 */
1283
1284 Insert(file);
1285
1286 return 0;
1287 }
1288
1289 /*
1290 * Release one kernel FD by closing the least-recently-used VFD.
1291 */
1292 static bool
ReleaseLruFile(void)1293 ReleaseLruFile(void)
1294 {
1295 DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1296
1297 if (nfile > 0)
1298 {
1299 /*
1300 * There are opened files and so there should be at least one used vfd
1301 * in the ring.
1302 */
1303 Assert(VfdCache[0].lruMoreRecently != 0);
1304 LruDelete(VfdCache[0].lruMoreRecently);
1305 return true; /* freed a file */
1306 }
1307 return false; /* no files available to free */
1308 }
1309
1310 /*
1311 * Release kernel FDs as needed to get under the max_safe_fds limit.
1312 * After calling this, it's OK to try to open another file.
1313 */
1314 static void
ReleaseLruFiles(void)1315 ReleaseLruFiles(void)
1316 {
1317 while (nfile + numAllocatedDescs + numExternalFDs >= max_safe_fds)
1318 {
1319 if (!ReleaseLruFile())
1320 break;
1321 }
1322 }
1323
1324 static File
AllocateVfd(void)1325 AllocateVfd(void)
1326 {
1327 Index i;
1328 File file;
1329
1330 DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1331
1332 Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1333
1334 if (VfdCache[0].nextFree == 0)
1335 {
1336 /*
1337 * The free list is empty so it is time to increase the size of the
1338 * array. We choose to double it each time this happens. However,
1339 * there's not much point in starting *real* small.
1340 */
1341 Size newCacheSize = SizeVfdCache * 2;
1342 Vfd *newVfdCache;
1343
1344 if (newCacheSize < 32)
1345 newCacheSize = 32;
1346
1347 /*
1348 * Be careful not to clobber VfdCache ptr if realloc fails.
1349 */
1350 newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1351 if (newVfdCache == NULL)
1352 ereport(ERROR,
1353 (errcode(ERRCODE_OUT_OF_MEMORY),
1354 errmsg("out of memory")));
1355 VfdCache = newVfdCache;
1356
1357 /*
1358 * Initialize the new entries and link them into the free list.
1359 */
1360 for (i = SizeVfdCache; i < newCacheSize; i++)
1361 {
1362 MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1363 VfdCache[i].nextFree = i + 1;
1364 VfdCache[i].fd = VFD_CLOSED;
1365 }
1366 VfdCache[newCacheSize - 1].nextFree = 0;
1367 VfdCache[0].nextFree = SizeVfdCache;
1368
1369 /*
1370 * Record the new size
1371 */
1372 SizeVfdCache = newCacheSize;
1373 }
1374
1375 file = VfdCache[0].nextFree;
1376
1377 VfdCache[0].nextFree = VfdCache[file].nextFree;
1378
1379 return file;
1380 }
1381
1382 static void
FreeVfd(File file)1383 FreeVfd(File file)
1384 {
1385 Vfd *vfdP = &VfdCache[file];
1386
1387 DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1388 file, vfdP->fileName ? vfdP->fileName : ""));
1389
1390 if (vfdP->fileName != NULL)
1391 {
1392 free(vfdP->fileName);
1393 vfdP->fileName = NULL;
1394 }
1395 vfdP->fdstate = 0x0;
1396
1397 vfdP->nextFree = VfdCache[0].nextFree;
1398 VfdCache[0].nextFree = file;
1399 }
1400
1401 /* returns 0 on success, -1 on re-open failure (with errno set) */
1402 static int
FileAccess(File file)1403 FileAccess(File file)
1404 {
1405 int returnValue;
1406
1407 DO_DB(elog(LOG, "FileAccess %d (%s)",
1408 file, VfdCache[file].fileName));
1409
1410 /*
1411 * Is the file open? If not, open it and put it at the head of the LRU
1412 * ring (possibly closing the least recently used file to get an FD).
1413 */
1414
1415 if (FileIsNotOpen(file))
1416 {
1417 returnValue = LruInsert(file);
1418 if (returnValue != 0)
1419 return returnValue;
1420 }
1421 else if (VfdCache[0].lruLessRecently != file)
1422 {
1423 /*
1424 * We now know that the file is open and that it is not the last one
1425 * accessed, so we need to move it to the head of the Lru ring.
1426 */
1427
1428 Delete(file);
1429 Insert(file);
1430 }
1431
1432 return 0;
1433 }
1434
1435 /*
1436 * Called whenever a temporary file is deleted to report its size.
1437 */
1438 static void
ReportTemporaryFileUsage(const char * path,off_t size)1439 ReportTemporaryFileUsage(const char *path, off_t size)
1440 {
1441 pgstat_report_tempfile(size);
1442
1443 if (log_temp_files >= 0)
1444 {
1445 if ((size / 1024) >= log_temp_files)
1446 ereport(LOG,
1447 (errmsg("temporary file: path \"%s\", size %lu",
1448 path, (unsigned long) size)));
1449 }
1450 }
1451
1452 /*
1453 * Called to register a temporary file for automatic close.
1454 * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
1455 * before the file was opened.
1456 */
1457 static void
RegisterTemporaryFile(File file)1458 RegisterTemporaryFile(File file)
1459 {
1460 ResourceOwnerRememberFile(CurrentResourceOwner, file);
1461 VfdCache[file].resowner = CurrentResourceOwner;
1462
1463 /* Backup mechanism for closing at end of xact. */
1464 VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
1465 have_xact_temporary_files = true;
1466 }
1467
/*
 * Called when we get a shared invalidation message on some relation.
 *
 * Closes the kernel FD of the given VFD if it currently has one; the VFD
 * itself stays allocated.  (Compiled only when NOT_USED is defined.)
 */
#ifdef NOT_USED
void
FileInvalidate(File file)
{
	Assert(FileIsValid(file));

	if (FileIsNotOpen(file))
		return;					/* nothing to close */

	LruDelete(file);
}
#endif
1480
1481 /*
1482 * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1483 * fileMode parameter.
1484 */
1485 File
PathNameOpenFile(const char * fileName,int fileFlags)1486 PathNameOpenFile(const char *fileName, int fileFlags)
1487 {
1488 return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1489 }
1490
1491 /*
1492 * open a file in an arbitrary directory
1493 *
1494 * NB: if the passed pathname is relative (which it usually is),
1495 * it will be interpreted relative to the process' working directory
1496 * (which should always be $PGDATA when this code is running).
1497 */
1498 File
PathNameOpenFilePerm(const char * fileName,int fileFlags,mode_t fileMode)1499 PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1500 {
1501 char *fnamecopy;
1502 File file;
1503 Vfd *vfdP;
1504
1505 DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1506 fileName, fileFlags, fileMode));
1507
1508 /*
1509 * We need a malloc'd copy of the file name; fail cleanly if no room.
1510 */
1511 fnamecopy = strdup(fileName);
1512 if (fnamecopy == NULL)
1513 ereport(ERROR,
1514 (errcode(ERRCODE_OUT_OF_MEMORY),
1515 errmsg("out of memory")));
1516
1517 file = AllocateVfd();
1518 vfdP = &VfdCache[file];
1519
1520 /* Close excess kernel FDs. */
1521 ReleaseLruFiles();
1522
1523 vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1524
1525 if (vfdP->fd < 0)
1526 {
1527 int save_errno = errno;
1528
1529 FreeVfd(file);
1530 free(fnamecopy);
1531 errno = save_errno;
1532 return -1;
1533 }
1534 ++nfile;
1535 DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1536 vfdP->fd));
1537
1538 vfdP->fileName = fnamecopy;
1539 /* Saved flags are adjusted to be OK for re-opening file */
1540 vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1541 vfdP->fileMode = fileMode;
1542 vfdP->fileSize = 0;
1543 vfdP->fdstate = 0x0;
1544 vfdP->resowner = NULL;
1545
1546 Insert(file);
1547
1548 return file;
1549 }
1550
1551 /*
1552 * Create directory 'directory'. If necessary, create 'basedir', which must
1553 * be the directory above it. This is designed for creating the top-level
1554 * temporary directory on demand before creating a directory underneath it.
1555 * Do nothing if the directory already exists.
1556 *
1557 * Directories created within the top-level temporary directory should begin
1558 * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1559 * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1560 * that do not need any particular prefix.
1561 */
1562 void
PathNameCreateTemporaryDir(const char * basedir,const char * directory)1563 PathNameCreateTemporaryDir(const char *basedir, const char *directory)
1564 {
1565 if (MakePGDirectory(directory) < 0)
1566 {
1567 if (errno == EEXIST)
1568 return;
1569
1570 /*
1571 * Failed. Try to create basedir first in case it's missing. Tolerate
1572 * EEXIST to close a race against another process following the same
1573 * algorithm.
1574 */
1575 if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1576 ereport(ERROR,
1577 (errcode_for_file_access(),
1578 errmsg("cannot create temporary directory \"%s\": %m",
1579 basedir)));
1580
1581 /* Try again. */
1582 if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1583 ereport(ERROR,
1584 (errcode_for_file_access(),
1585 errmsg("cannot create temporary subdirectory \"%s\": %m",
1586 directory)));
1587 }
1588 }
1589
1590 /*
1591 * Delete a directory and everything in it, if it exists.
1592 */
1593 void
PathNameDeleteTemporaryDir(const char * dirname)1594 PathNameDeleteTemporaryDir(const char *dirname)
1595 {
1596 struct stat statbuf;
1597
1598 /* Silently ignore missing directory. */
1599 if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1600 return;
1601
1602 /*
1603 * Currently, walkdir doesn't offer a way for our passed in function to
1604 * maintain state. Perhaps it should, so that we could tell the caller
1605 * whether this operation succeeded or failed. Since this operation is
1606 * used in a cleanup path, we wouldn't actually behave differently: we'll
1607 * just log failures.
1608 */
1609 walkdir(dirname, unlink_if_exists_fname, false, LOG);
1610 }
1611
1612 /*
1613 * Open a temporary file that will disappear when we close it.
1614 *
1615 * This routine takes care of generating an appropriate tempfile name.
1616 * There's no need to pass in fileFlags or fileMode either, since only
1617 * one setting makes any sense for a temp file.
1618 *
1619 * Unless interXact is true, the file is remembered by CurrentResourceOwner
1620 * to ensure it's closed and deleted when it's no longer needed, typically at
1621 * the end-of-transaction. In most cases, you don't want temporary files to
1622 * outlive the transaction that created them, so this should be false -- but
1623 * if you need "somewhat" temporary storage, this might be useful. In either
1624 * case, the file is removed when the File is explicitly closed.
1625 */
1626 File
OpenTemporaryFile(bool interXact)1627 OpenTemporaryFile(bool interXact)
1628 {
1629 File file = 0;
1630
1631 /*
1632 * Make sure the current resource owner has space for this File before we
1633 * open it, if we'll be registering it below.
1634 */
1635 if (!interXact)
1636 ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1637
1638 /*
1639 * If some temp tablespace(s) have been given to us, try to use the next
1640 * one. If a given tablespace can't be found, we silently fall back to
1641 * the database's default tablespace.
1642 *
1643 * BUT: if the temp file is slated to outlive the current transaction,
1644 * force it into the database's default tablespace, so that it will not
1645 * pose a threat to possible tablespace drop attempts.
1646 */
1647 if (numTempTableSpaces > 0 && !interXact)
1648 {
1649 Oid tblspcOid = GetNextTempTableSpace();
1650
1651 if (OidIsValid(tblspcOid))
1652 file = OpenTemporaryFileInTablespace(tblspcOid, false);
1653 }
1654
1655 /*
1656 * If not, or if tablespace is bad, create in database's default
1657 * tablespace. MyDatabaseTableSpace should normally be set before we get
1658 * here, but just in case it isn't, fall back to pg_default tablespace.
1659 */
1660 if (file <= 0)
1661 file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
1662 MyDatabaseTableSpace :
1663 DEFAULTTABLESPACE_OID,
1664 true);
1665
1666 /* Mark it for deletion at close and temporary file size limit */
1667 VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
1668
1669 /* Register it with the current resource owner */
1670 if (!interXact)
1671 RegisterTemporaryFile(file);
1672
1673 return file;
1674 }
1675
1676 /*
1677 * Return the path of the temp directory in a given tablespace.
1678 */
1679 void
TempTablespacePath(char * path,Oid tablespace)1680 TempTablespacePath(char *path, Oid tablespace)
1681 {
1682 /*
1683 * Identify the tempfile directory for this tablespace.
1684 *
1685 * If someone tries to specify pg_global, use pg_default instead.
1686 */
1687 if (tablespace == InvalidOid ||
1688 tablespace == DEFAULTTABLESPACE_OID ||
1689 tablespace == GLOBALTABLESPACE_OID)
1690 snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1691 else
1692 {
1693 /* All other tablespaces are accessed via symlinks */
1694 snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
1695 tablespace, TABLESPACE_VERSION_DIRECTORY,
1696 PG_TEMP_FILES_DIR);
1697 }
1698 }
1699
1700 /*
1701 * Open a temporary file in a specific tablespace.
1702 * Subroutine for OpenTemporaryFile, which see for details.
1703 */
1704 static File
OpenTemporaryFileInTablespace(Oid tblspcOid,bool rejectError)1705 OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1706 {
1707 char tempdirpath[MAXPGPATH];
1708 char tempfilepath[MAXPGPATH];
1709 File file;
1710
1711 TempTablespacePath(tempdirpath, tblspcOid);
1712
1713 /*
1714 * Generate a tempfile name that should be unique within the current
1715 * database instance.
1716 */
1717 snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1718 tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1719
1720 /*
1721 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1722 * temp file that can be reused.
1723 */
1724 file = PathNameOpenFile(tempfilepath,
1725 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1726 if (file <= 0)
1727 {
1728 /*
1729 * We might need to create the tablespace's tempfile directory, if no
1730 * one has yet done so.
1731 *
1732 * Don't check for an error from MakePGDirectory; it could fail if
1733 * someone else just did the same thing. If it doesn't work then
1734 * we'll bomb out on the second create attempt, instead.
1735 */
1736 (void) MakePGDirectory(tempdirpath);
1737
1738 file = PathNameOpenFile(tempfilepath,
1739 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1740 if (file <= 0 && rejectError)
1741 elog(ERROR, "could not create temporary file \"%s\": %m",
1742 tempfilepath);
1743 }
1744
1745 return file;
1746 }
1747
1748
1749 /*
1750 * Create a new file. The directory containing it must already exist. Files
1751 * created this way are subject to temp_file_limit and are automatically
1752 * closed at end of transaction, but are not automatically deleted on close
1753 * because they are intended to be shared between cooperating backends.
1754 *
1755 * If the file is inside the top-level temporary directory, its name should
1756 * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1757 * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1758 * inside a directory created with PathNameCreateTemporaryDir(), in which case
1759 * the prefix isn't needed.
1760 */
1761 File
PathNameCreateTemporaryFile(const char * path,bool error_on_failure)1762 PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1763 {
1764 File file;
1765
1766 ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1767
1768 /*
1769 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1770 * temp file that can be reused.
1771 */
1772 file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1773 if (file <= 0)
1774 {
1775 if (error_on_failure)
1776 ereport(ERROR,
1777 (errcode_for_file_access(),
1778 errmsg("could not create temporary file \"%s\": %m",
1779 path)));
1780 else
1781 return file;
1782 }
1783
1784 /* Mark it for temp_file_limit accounting. */
1785 VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
1786
1787 /* Register it for automatic close. */
1788 RegisterTemporaryFile(file);
1789
1790 return file;
1791 }
1792
1793 /*
1794 * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1795 * another backend. Files opened this way don't count against the
1796 * temp_file_limit of the caller, are automatically closed at the end of the
1797 * transaction but are not deleted on close.
1798 */
1799 File
PathNameOpenTemporaryFile(const char * path,int mode)1800 PathNameOpenTemporaryFile(const char *path, int mode)
1801 {
1802 File file;
1803
1804 ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1805
1806 file = PathNameOpenFile(path, mode | PG_BINARY);
1807
1808 /* If no such file, then we don't raise an error. */
1809 if (file <= 0 && errno != ENOENT)
1810 ereport(ERROR,
1811 (errcode_for_file_access(),
1812 errmsg("could not open temporary file \"%s\": %m",
1813 path)));
1814
1815 if (file > 0)
1816 {
1817 /* Register it for automatic close. */
1818 RegisterTemporaryFile(file);
1819 }
1820
1821 return file;
1822 }
1823
1824 /*
1825 * Delete a file by pathname. Return true if the file existed, false if
1826 * didn't.
1827 */
1828 bool
PathNameDeleteTemporaryFile(const char * path,bool error_on_failure)1829 PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1830 {
1831 struct stat filestats;
1832 int stat_errno;
1833
1834 /* Get the final size for pgstat reporting. */
1835 if (stat(path, &filestats) != 0)
1836 stat_errno = errno;
1837 else
1838 stat_errno = 0;
1839
1840 /*
1841 * Unlike FileClose's automatic file deletion code, we tolerate
1842 * non-existence to support BufFileDeleteShared which doesn't know how
1843 * many segments it has to delete until it runs out.
1844 */
1845 if (stat_errno == ENOENT)
1846 return false;
1847
1848 if (unlink(path) < 0)
1849 {
1850 if (errno != ENOENT)
1851 ereport(error_on_failure ? ERROR : LOG,
1852 (errcode_for_file_access(),
1853 errmsg("could not unlink temporary file \"%s\": %m",
1854 path)));
1855 return false;
1856 }
1857
1858 if (stat_errno == 0)
1859 ReportTemporaryFileUsage(path, filestats.st_size);
1860 else
1861 {
1862 errno = stat_errno;
1863 ereport(LOG,
1864 (errcode_for_file_access(),
1865 errmsg("could not stat file \"%s\": %m", path)));
1866 }
1867
1868 return true;
1869 }
1870
1871 /*
1872 * close a file when done with it
1873 */
1874 void
FileClose(File file)1875 FileClose(File file)
1876 {
1877 Vfd *vfdP;
1878
1879 Assert(FileIsValid(file));
1880
1881 DO_DB(elog(LOG, "FileClose: %d (%s)",
1882 file, VfdCache[file].fileName));
1883
1884 vfdP = &VfdCache[file];
1885
1886 if (!FileIsNotOpen(file))
1887 {
1888 /* close the file */
1889 if (close(vfdP->fd) != 0)
1890 {
1891 /*
1892 * We may need to panic on failure to close non-temporary files;
1893 * see LruDelete.
1894 */
1895 elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1896 "could not close file \"%s\": %m", vfdP->fileName);
1897 }
1898
1899 --nfile;
1900 vfdP->fd = VFD_CLOSED;
1901
1902 /* remove the file from the lru ring */
1903 Delete(file);
1904 }
1905
1906 if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1907 {
1908 /* Subtract its size from current usage (do first in case of error) */
1909 temporary_files_size -= vfdP->fileSize;
1910 vfdP->fileSize = 0;
1911 }
1912
1913 /*
1914 * Delete the file if it was temporary, and make a log entry if wanted
1915 */
1916 if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
1917 {
1918 struct stat filestats;
1919 int stat_errno;
1920
1921 /*
1922 * If we get an error, as could happen within the ereport/elog calls,
1923 * we'll come right back here during transaction abort. Reset the
1924 * flag to ensure that we can't get into an infinite loop. This code
1925 * is arranged to ensure that the worst-case consequence is failing to
1926 * emit log message(s), not failing to attempt the unlink.
1927 */
1928 vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
1929
1930
1931 /* first try the stat() */
1932 if (stat(vfdP->fileName, &filestats))
1933 stat_errno = errno;
1934 else
1935 stat_errno = 0;
1936
1937 /* in any case do the unlink */
1938 if (unlink(vfdP->fileName))
1939 ereport(LOG,
1940 (errcode_for_file_access(),
1941 errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
1942
1943 /* and last report the stat results */
1944 if (stat_errno == 0)
1945 ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
1946 else
1947 {
1948 errno = stat_errno;
1949 ereport(LOG,
1950 (errcode_for_file_access(),
1951 errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
1952 }
1953 }
1954
1955 /* Unregister it from the resource owner */
1956 if (vfdP->resowner)
1957 ResourceOwnerForgetFile(vfdP->resowner, file);
1958
1959 /*
1960 * Return the Vfd slot to the free list
1961 */
1962 FreeVfd(file);
1963 }
1964
1965 /*
1966 * FilePrefetch - initiate asynchronous read of a given range of the file.
1967 *
1968 * Currently the only implementation of this function is using posix_fadvise
1969 * which is the simplest standardized interface that accomplishes this.
1970 * We could add an implementation using libaio in the future; but note that
1971 * this API is inappropriate for libaio, which wants to have a buffer provided
1972 * to read into.
1973 */
1974 int
FilePrefetch(File file,off_t offset,int amount,uint32 wait_event_info)1975 FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
1976 {
1977 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
1978 int returnCode;
1979
1980 Assert(FileIsValid(file));
1981
1982 DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
1983 file, VfdCache[file].fileName,
1984 (int64) offset, amount));
1985
1986 returnCode = FileAccess(file);
1987 if (returnCode < 0)
1988 return returnCode;
1989
1990 pgstat_report_wait_start(wait_event_info);
1991 returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
1992 POSIX_FADV_WILLNEED);
1993 pgstat_report_wait_end();
1994
1995 return returnCode;
1996 #else
1997 Assert(FileIsValid(file));
1998 return 0;
1999 #endif
2000 }
2001
2002 void
FileWriteback(File file,off_t offset,off_t nbytes,uint32 wait_event_info)2003 FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
2004 {
2005 int returnCode;
2006
2007 Assert(FileIsValid(file));
2008
2009 DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2010 file, VfdCache[file].fileName,
2011 (int64) offset, (int64) nbytes));
2012
2013 if (nbytes <= 0)
2014 return;
2015
2016 returnCode = FileAccess(file);
2017 if (returnCode < 0)
2018 return;
2019
2020 pgstat_report_wait_start(wait_event_info);
2021 pg_flush_data(VfdCache[file].fd, offset, nbytes);
2022 pgstat_report_wait_end();
2023 }
2024
2025 int
FileRead(File file,char * buffer,int amount,off_t offset,uint32 wait_event_info)2026 FileRead(File file, char *buffer, int amount, off_t offset,
2027 uint32 wait_event_info)
2028 {
2029 int returnCode;
2030 Vfd *vfdP;
2031
2032 Assert(FileIsValid(file));
2033
2034 DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
2035 file, VfdCache[file].fileName,
2036 (int64) offset,
2037 amount, buffer));
2038
2039 returnCode = FileAccess(file);
2040 if (returnCode < 0)
2041 return returnCode;
2042
2043 vfdP = &VfdCache[file];
2044
2045 retry:
2046 pgstat_report_wait_start(wait_event_info);
2047 returnCode = pg_pread(vfdP->fd, buffer, amount, offset);
2048 pgstat_report_wait_end();
2049
2050 if (returnCode < 0)
2051 {
2052 /*
2053 * Windows may run out of kernel buffers and return "Insufficient
2054 * system resources" error. Wait a bit and retry to solve it.
2055 *
2056 * It is rumored that EINTR is also possible on some Unix filesystems,
2057 * in which case immediate retry is indicated.
2058 */
2059 #ifdef WIN32
2060 DWORD error = GetLastError();
2061
2062 switch (error)
2063 {
2064 case ERROR_NO_SYSTEM_RESOURCES:
2065 pg_usleep(1000L);
2066 errno = EINTR;
2067 break;
2068 default:
2069 _dosmaperr(error);
2070 break;
2071 }
2072 #endif
2073 /* OK to retry if interrupted */
2074 if (errno == EINTR)
2075 goto retry;
2076 }
2077
2078 return returnCode;
2079 }
2080
2081 int
FileWrite(File file,char * buffer,int amount,off_t offset,uint32 wait_event_info)2082 FileWrite(File file, char *buffer, int amount, off_t offset,
2083 uint32 wait_event_info)
2084 {
2085 int returnCode;
2086 Vfd *vfdP;
2087
2088 Assert(FileIsValid(file));
2089
2090 DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
2091 file, VfdCache[file].fileName,
2092 (int64) offset,
2093 amount, buffer));
2094
2095 returnCode = FileAccess(file);
2096 if (returnCode < 0)
2097 return returnCode;
2098
2099 vfdP = &VfdCache[file];
2100
2101 /*
2102 * If enforcing temp_file_limit and it's a temp file, check to see if the
2103 * write would overrun temp_file_limit, and throw error if so. Note: it's
2104 * really a modularity violation to throw error here; we should set errno
2105 * and return -1. However, there's no way to report a suitable error
2106 * message if we do that. All current callers would just throw error
2107 * immediately anyway, so this is safe at present.
2108 */
2109 if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2110 {
2111 off_t past_write = offset + amount;
2112
2113 if (past_write > vfdP->fileSize)
2114 {
2115 uint64 newTotal = temporary_files_size;
2116
2117 newTotal += past_write - vfdP->fileSize;
2118 if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2119 ereport(ERROR,
2120 (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
2121 errmsg("temporary file size exceeds temp_file_limit (%dkB)",
2122 temp_file_limit)));
2123 }
2124 }
2125
2126 retry:
2127 errno = 0;
2128 pgstat_report_wait_start(wait_event_info);
2129 returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset);
2130 pgstat_report_wait_end();
2131
2132 /* if write didn't set errno, assume problem is no disk space */
2133 if (returnCode != amount && errno == 0)
2134 errno = ENOSPC;
2135
2136 if (returnCode >= 0)
2137 {
2138 /*
2139 * Maintain fileSize and temporary_files_size if it's a temp file.
2140 */
2141 if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2142 {
2143 off_t past_write = offset + amount;
2144
2145 if (past_write > vfdP->fileSize)
2146 {
2147 temporary_files_size += past_write - vfdP->fileSize;
2148 vfdP->fileSize = past_write;
2149 }
2150 }
2151 }
2152 else
2153 {
2154 /*
2155 * See comments in FileRead()
2156 */
2157 #ifdef WIN32
2158 DWORD error = GetLastError();
2159
2160 switch (error)
2161 {
2162 case ERROR_NO_SYSTEM_RESOURCES:
2163 pg_usleep(1000L);
2164 errno = EINTR;
2165 break;
2166 default:
2167 _dosmaperr(error);
2168 break;
2169 }
2170 #endif
2171 /* OK to retry if interrupted */
2172 if (errno == EINTR)
2173 goto retry;
2174 }
2175
2176 return returnCode;
2177 }
2178
2179 int
FileSync(File file,uint32 wait_event_info)2180 FileSync(File file, uint32 wait_event_info)
2181 {
2182 int returnCode;
2183
2184 Assert(FileIsValid(file));
2185
2186 DO_DB(elog(LOG, "FileSync: %d (%s)",
2187 file, VfdCache[file].fileName));
2188
2189 returnCode = FileAccess(file);
2190 if (returnCode < 0)
2191 return returnCode;
2192
2193 pgstat_report_wait_start(wait_event_info);
2194 returnCode = pg_fsync(VfdCache[file].fd);
2195 pgstat_report_wait_end();
2196
2197 return returnCode;
2198 }
2199
2200 off_t
FileSize(File file)2201 FileSize(File file)
2202 {
2203 Assert(FileIsValid(file));
2204
2205 DO_DB(elog(LOG, "FileSize %d (%s)",
2206 file, VfdCache[file].fileName));
2207
2208 if (FileIsNotOpen(file))
2209 {
2210 if (FileAccess(file) < 0)
2211 return (off_t) -1;
2212 }
2213
2214 return lseek(VfdCache[file].fd, 0, SEEK_END);
2215 }
2216
2217 int
FileTruncate(File file,off_t offset,uint32 wait_event_info)2218 FileTruncate(File file, off_t offset, uint32 wait_event_info)
2219 {
2220 int returnCode;
2221
2222 Assert(FileIsValid(file));
2223
2224 DO_DB(elog(LOG, "FileTruncate %d (%s)",
2225 file, VfdCache[file].fileName));
2226
2227 returnCode = FileAccess(file);
2228 if (returnCode < 0)
2229 return returnCode;
2230
2231 pgstat_report_wait_start(wait_event_info);
2232 returnCode = ftruncate(VfdCache[file].fd, offset);
2233 pgstat_report_wait_end();
2234
2235 if (returnCode == 0 && VfdCache[file].fileSize > offset)
2236 {
2237 /* adjust our state for truncation of a temp file */
2238 Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2239 temporary_files_size -= VfdCache[file].fileSize - offset;
2240 VfdCache[file].fileSize = offset;
2241 }
2242
2243 return returnCode;
2244 }
2245
2246 /*
2247 * Return the pathname associated with an open file.
2248 *
2249 * The returned string points to an internal buffer, which is valid until
2250 * the file is closed.
2251 */
2252 char *
FilePathName(File file)2253 FilePathName(File file)
2254 {
2255 Assert(FileIsValid(file));
2256
2257 return VfdCache[file].fileName;
2258 }
2259
2260 /*
2261 * Return the raw file descriptor of an opened file.
2262 *
2263 * The returned file descriptor will be valid until the file is closed, but
2264 * there are a lot of things that can make that happen. So the caller should
2265 * be careful not to do much of anything else before it finishes using the
2266 * returned file descriptor.
2267 */
2268 int
FileGetRawDesc(File file)2269 FileGetRawDesc(File file)
2270 {
2271 Assert(FileIsValid(file));
2272 return VfdCache[file].fd;
2273 }
2274
2275 /*
2276 * FileGetRawFlags - returns the file flags on open(2)
2277 */
2278 int
FileGetRawFlags(File file)2279 FileGetRawFlags(File file)
2280 {
2281 Assert(FileIsValid(file));
2282 return VfdCache[file].fileFlags;
2283 }
2284
2285 /*
2286 * FileGetRawMode - returns the mode bitmask passed to open(2)
2287 */
2288 mode_t
FileGetRawMode(File file)2289 FileGetRawMode(File file)
2290 {
2291 Assert(FileIsValid(file));
2292 return VfdCache[file].fileMode;
2293 }
2294
2295 /*
2296 * Make room for another allocatedDescs[] array entry if needed and possible.
2297 * Returns true if an array element is available.
2298 */
2299 static bool
reserveAllocatedDesc(void)2300 reserveAllocatedDesc(void)
2301 {
2302 AllocateDesc *newDescs;
2303 int newMax;
2304
2305 /* Quick out if array already has a free slot. */
2306 if (numAllocatedDescs < maxAllocatedDescs)
2307 return true;
2308
2309 /*
2310 * If the array hasn't yet been created in the current process, initialize
2311 * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
2312 * we will ever need, anyway. We don't want to look at max_safe_fds
2313 * immediately because set_max_safe_fds() may not have run yet.
2314 */
2315 if (allocatedDescs == NULL)
2316 {
2317 newMax = FD_MINFREE / 3;
2318 newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2319 /* Out of memory already? Treat as fatal error. */
2320 if (newDescs == NULL)
2321 ereport(ERROR,
2322 (errcode(ERRCODE_OUT_OF_MEMORY),
2323 errmsg("out of memory")));
2324 allocatedDescs = newDescs;
2325 maxAllocatedDescs = newMax;
2326 return true;
2327 }
2328
2329 /*
2330 * Consider enlarging the array beyond the initial allocation used above.
2331 * By the time this happens, max_safe_fds should be known accurately.
2332 *
2333 * We mustn't let allocated descriptors hog all the available FDs, and in
2334 * practice we'd better leave a reasonable number of FDs for VFD use. So
2335 * set the maximum to max_safe_fds / 3. (This should certainly be at
2336 * least as large as the initial size, FD_MINFREE / 3, so we aren't
2337 * tightening the restriction here.) Recall that "external" FDs are
2338 * allowed to consume another third of max_safe_fds.
2339 */
2340 newMax = max_safe_fds / 3;
2341 if (newMax > maxAllocatedDescs)
2342 {
2343 newDescs = (AllocateDesc *) realloc(allocatedDescs,
2344 newMax * sizeof(AllocateDesc));
2345 /* Treat out-of-memory as a non-fatal error. */
2346 if (newDescs == NULL)
2347 return false;
2348 allocatedDescs = newDescs;
2349 maxAllocatedDescs = newMax;
2350 return true;
2351 }
2352
2353 /* Can't enlarge allocatedDescs[] any more. */
2354 return false;
2355 }
2356
2357 /*
2358 * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2359 * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2360 * necessary to open the file. When done, call FreeFile rather than fclose.
2361 *
2362 * Note that files that will be open for any significant length of time
2363 * should NOT be handled this way, since they cannot share kernel file
2364 * descriptors with other files; there is grave risk of running out of FDs
2365 * if anyone locks down too many FDs. Most callers of this routine are
2366 * simply reading a config file that they will read and close immediately.
2367 *
2368 * fd.c will automatically close all files opened with AllocateFile at
2369 * transaction commit or abort; this prevents FD leakage if a routine
2370 * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2371 *
2372 * Ideally this should be the *only* direct call of fopen() in the backend.
2373 */
2374 FILE *
AllocateFile(const char * name,const char * mode)2375 AllocateFile(const char *name, const char *mode)
2376 {
2377 FILE *file;
2378
2379 DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2380 numAllocatedDescs, name));
2381
2382 /* Can we allocate another non-virtual FD? */
2383 if (!reserveAllocatedDesc())
2384 ereport(ERROR,
2385 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2386 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2387 maxAllocatedDescs, name)));
2388
2389 /* Close excess kernel FDs. */
2390 ReleaseLruFiles();
2391
2392 TryAgain:
2393 if ((file = fopen(name, mode)) != NULL)
2394 {
2395 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2396
2397 desc->kind = AllocateDescFile;
2398 desc->desc.file = file;
2399 desc->create_subid = GetCurrentSubTransactionId();
2400 numAllocatedDescs++;
2401 return desc->desc.file;
2402 }
2403
2404 if (errno == EMFILE || errno == ENFILE)
2405 {
2406 int save_errno = errno;
2407
2408 ereport(LOG,
2409 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2410 errmsg("out of file descriptors: %m; release and retry")));
2411 errno = 0;
2412 if (ReleaseLruFile())
2413 goto TryAgain;
2414 errno = save_errno;
2415 }
2416
2417 return NULL;
2418 }
2419
2420 /*
2421 * Open a file with OpenTransientFilePerm() and pass default file mode for
2422 * the fileMode parameter.
2423 */
2424 int
OpenTransientFile(const char * fileName,int fileFlags)2425 OpenTransientFile(const char *fileName, int fileFlags)
2426 {
2427 return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2428 }
2429
2430 /*
2431 * Like AllocateFile, but returns an unbuffered fd like open(2)
2432 */
2433 int
OpenTransientFilePerm(const char * fileName,int fileFlags,mode_t fileMode)2434 OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
2435 {
2436 int fd;
2437
2438 DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2439 numAllocatedDescs, fileName));
2440
2441 /* Can we allocate another non-virtual FD? */
2442 if (!reserveAllocatedDesc())
2443 ereport(ERROR,
2444 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2445 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2446 maxAllocatedDescs, fileName)));
2447
2448 /* Close excess kernel FDs. */
2449 ReleaseLruFiles();
2450
2451 fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2452
2453 if (fd >= 0)
2454 {
2455 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2456
2457 desc->kind = AllocateDescRawFD;
2458 desc->desc.fd = fd;
2459 desc->create_subid = GetCurrentSubTransactionId();
2460 numAllocatedDescs++;
2461
2462 return fd;
2463 }
2464
2465 return -1; /* failure */
2466 }
2467
2468 /*
2469 * Routines that want to initiate a pipe stream should use OpenPipeStream
2470 * rather than plain popen(). This lets fd.c deal with freeing FDs if
2471 * necessary. When done, call ClosePipeStream rather than pclose.
2472 *
2473 * This function also ensures that the popen'd program is run with default
2474 * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2475 * uses. This ensures desirable response to, eg, closing a read pipe early.
2476 */
2477 FILE *
OpenPipeStream(const char * command,const char * mode)2478 OpenPipeStream(const char *command, const char *mode)
2479 {
2480 FILE *file;
2481 int save_errno;
2482
2483 DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2484 numAllocatedDescs, command));
2485
2486 /* Can we allocate another non-virtual FD? */
2487 if (!reserveAllocatedDesc())
2488 ereport(ERROR,
2489 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2490 errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2491 maxAllocatedDescs, command)));
2492
2493 /* Close excess kernel FDs. */
2494 ReleaseLruFiles();
2495
2496 TryAgain:
2497 fflush(stdout);
2498 fflush(stderr);
2499 pqsignal(SIGPIPE, SIG_DFL);
2500 errno = 0;
2501 file = popen(command, mode);
2502 save_errno = errno;
2503 pqsignal(SIGPIPE, SIG_IGN);
2504 errno = save_errno;
2505 if (file != NULL)
2506 {
2507 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2508
2509 desc->kind = AllocateDescPipe;
2510 desc->desc.file = file;
2511 desc->create_subid = GetCurrentSubTransactionId();
2512 numAllocatedDescs++;
2513 return desc->desc.file;
2514 }
2515
2516 if (errno == EMFILE || errno == ENFILE)
2517 {
2518 ereport(LOG,
2519 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2520 errmsg("out of file descriptors: %m; release and retry")));
2521 if (ReleaseLruFile())
2522 goto TryAgain;
2523 errno = save_errno;
2524 }
2525
2526 return NULL;
2527 }
2528
2529 /*
2530 * Free an AllocateDesc of any type.
2531 *
2532 * The argument *must* point into the allocatedDescs[] array.
2533 */
2534 static int
FreeDesc(AllocateDesc * desc)2535 FreeDesc(AllocateDesc *desc)
2536 {
2537 int result;
2538
2539 /* Close the underlying object */
2540 switch (desc->kind)
2541 {
2542 case AllocateDescFile:
2543 result = fclose(desc->desc.file);
2544 break;
2545 case AllocateDescPipe:
2546 result = pclose(desc->desc.file);
2547 break;
2548 case AllocateDescDir:
2549 result = closedir(desc->desc.dir);
2550 break;
2551 case AllocateDescRawFD:
2552 result = close(desc->desc.fd);
2553 break;
2554 default:
2555 elog(ERROR, "AllocateDesc kind not recognized");
2556 result = 0; /* keep compiler quiet */
2557 break;
2558 }
2559
2560 /* Compact storage in the allocatedDescs array */
2561 numAllocatedDescs--;
2562 *desc = allocatedDescs[numAllocatedDescs];
2563
2564 return result;
2565 }
2566
2567 /*
2568 * Close a file returned by AllocateFile.
2569 *
2570 * Note we do not check fclose's return value --- it is up to the caller
2571 * to handle close errors.
2572 */
2573 int
FreeFile(FILE * file)2574 FreeFile(FILE *file)
2575 {
2576 int i;
2577
2578 DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2579
2580 /* Remove file from list of allocated files, if it's present */
2581 for (i = numAllocatedDescs; --i >= 0;)
2582 {
2583 AllocateDesc *desc = &allocatedDescs[i];
2584
2585 if (desc->kind == AllocateDescFile && desc->desc.file == file)
2586 return FreeDesc(desc);
2587 }
2588
2589 /* Only get here if someone passes us a file not in allocatedDescs */
2590 elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2591
2592 return fclose(file);
2593 }
2594
2595 /*
2596 * Close a file returned by OpenTransientFile.
2597 *
2598 * Note we do not check close's return value --- it is up to the caller
2599 * to handle close errors.
2600 */
2601 int
CloseTransientFile(int fd)2602 CloseTransientFile(int fd)
2603 {
2604 int i;
2605
2606 DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2607
2608 /* Remove fd from list of allocated files, if it's present */
2609 for (i = numAllocatedDescs; --i >= 0;)
2610 {
2611 AllocateDesc *desc = &allocatedDescs[i];
2612
2613 if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2614 return FreeDesc(desc);
2615 }
2616
2617 /* Only get here if someone passes us a file not in allocatedDescs */
2618 elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2619
2620 return close(fd);
2621 }
2622
2623 /*
2624 * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2625 * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2626 * necessary to open the directory, and with closing it after an elog.
2627 * When done, call FreeDir rather than closedir.
2628 *
2629 * Returns NULL, with errno set, on failure. Note that failure detection
2630 * is commonly left to the following call of ReadDir or ReadDirExtended;
2631 * see the comments for ReadDir.
2632 *
2633 * Ideally this should be the *only* direct call of opendir() in the backend.
2634 */
2635 DIR *
AllocateDir(const char * dirname)2636 AllocateDir(const char *dirname)
2637 {
2638 DIR *dir;
2639
2640 DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2641 numAllocatedDescs, dirname));
2642
2643 /* Can we allocate another non-virtual FD? */
2644 if (!reserveAllocatedDesc())
2645 ereport(ERROR,
2646 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2647 errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2648 maxAllocatedDescs, dirname)));
2649
2650 /* Close excess kernel FDs. */
2651 ReleaseLruFiles();
2652
2653 TryAgain:
2654 if ((dir = opendir(dirname)) != NULL)
2655 {
2656 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2657
2658 desc->kind = AllocateDescDir;
2659 desc->desc.dir = dir;
2660 desc->create_subid = GetCurrentSubTransactionId();
2661 numAllocatedDescs++;
2662 return desc->desc.dir;
2663 }
2664
2665 if (errno == EMFILE || errno == ENFILE)
2666 {
2667 int save_errno = errno;
2668
2669 ereport(LOG,
2670 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2671 errmsg("out of file descriptors: %m; release and retry")));
2672 errno = 0;
2673 if (ReleaseLruFile())
2674 goto TryAgain;
2675 errno = save_errno;
2676 }
2677
2678 return NULL;
2679 }
2680
2681 /*
2682 * Read a directory opened with AllocateDir, ereport'ing any error.
2683 *
2684 * This is easier to use than raw readdir() since it takes care of some
2685 * otherwise rather tedious and error-prone manipulation of errno. Also,
2686 * if you are happy with a generic error message for AllocateDir failure,
2687 * you can just do
2688 *
2689 * dir = AllocateDir(path);
2690 * while ((dirent = ReadDir(dir, path)) != NULL)
2691 * process dirent;
2692 * FreeDir(dir);
2693 *
2694 * since a NULL dir parameter is taken as indicating AllocateDir failed.
2695 * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2696 * use this shortcut.)
2697 *
2698 * The pathname passed to AllocateDir must be passed to this routine too,
2699 * but it is only used for error reporting.
2700 */
2701 struct dirent *
ReadDir(DIR * dir,const char * dirname)2702 ReadDir(DIR *dir, const char *dirname)
2703 {
2704 return ReadDirExtended(dir, dirname, ERROR);
2705 }
2706
2707 /*
2708 * Alternate version of ReadDir that allows caller to specify the elevel
2709 * for any error report (whether it's reporting an initial failure of
2710 * AllocateDir or a subsequent directory read failure).
2711 *
2712 * If elevel < ERROR, returns NULL after any error. With the normal coding
2713 * pattern, this will result in falling out of the loop immediately as
2714 * though the directory contained no (more) entries.
2715 */
2716 struct dirent *
ReadDirExtended(DIR * dir,const char * dirname,int elevel)2717 ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2718 {
2719 struct dirent *dent;
2720
2721 /* Give a generic message for AllocateDir failure, if caller didn't */
2722 if (dir == NULL)
2723 {
2724 ereport(elevel,
2725 (errcode_for_file_access(),
2726 errmsg("could not open directory \"%s\": %m",
2727 dirname)));
2728 return NULL;
2729 }
2730
2731 errno = 0;
2732 if ((dent = readdir(dir)) != NULL)
2733 return dent;
2734
2735 if (errno)
2736 ereport(elevel,
2737 (errcode_for_file_access(),
2738 errmsg("could not read directory \"%s\": %m",
2739 dirname)));
2740 return NULL;
2741 }
2742
2743 /*
2744 * Close a directory opened with AllocateDir.
2745 *
2746 * Returns closedir's return value (with errno set if it's not 0).
2747 * Note we do not check the return value --- it is up to the caller
2748 * to handle close errors if wanted.
2749 *
2750 * Does nothing if dir == NULL; we assume that directory open failure was
2751 * already reported if desired.
2752 */
2753 int
FreeDir(DIR * dir)2754 FreeDir(DIR *dir)
2755 {
2756 int i;
2757
2758 /* Nothing to do if AllocateDir failed */
2759 if (dir == NULL)
2760 return 0;
2761
2762 DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2763
2764 /* Remove dir from list of allocated dirs, if it's present */
2765 for (i = numAllocatedDescs; --i >= 0;)
2766 {
2767 AllocateDesc *desc = &allocatedDescs[i];
2768
2769 if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2770 return FreeDesc(desc);
2771 }
2772
2773 /* Only get here if someone passes us a dir not in allocatedDescs */
2774 elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
2775
2776 return closedir(dir);
2777 }
2778
2779
2780 /*
2781 * Close a pipe stream returned by OpenPipeStream.
2782 */
2783 int
ClosePipeStream(FILE * file)2784 ClosePipeStream(FILE *file)
2785 {
2786 int i;
2787
2788 DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
2789
2790 /* Remove file from list of allocated files, if it's present */
2791 for (i = numAllocatedDescs; --i >= 0;)
2792 {
2793 AllocateDesc *desc = &allocatedDescs[i];
2794
2795 if (desc->kind == AllocateDescPipe && desc->desc.file == file)
2796 return FreeDesc(desc);
2797 }
2798
2799 /* Only get here if someone passes us a file not in allocatedDescs */
2800 elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
2801
2802 return pclose(file);
2803 }
2804
2805 /*
2806 * closeAllVfds
2807 *
2808 * Force all VFDs into the physically-closed state, so that the fewest
2809 * possible number of kernel file descriptors are in use. There is no
2810 * change in the logical state of the VFDs.
2811 */
2812 void
closeAllVfds(void)2813 closeAllVfds(void)
2814 {
2815 Index i;
2816
2817 if (SizeVfdCache > 0)
2818 {
2819 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2820 for (i = 1; i < SizeVfdCache; i++)
2821 {
2822 if (!FileIsNotOpen(i))
2823 LruDelete(i);
2824 }
2825 }
2826 }
2827
2828
2829 /*
2830 * SetTempTablespaces
2831 *
2832 * Define a list (actually an array) of OIDs of tablespaces to use for
2833 * temporary files. This list will be used until end of transaction,
2834 * unless this function is called again before then. It is caller's
2835 * responsibility that the passed-in array has adequate lifespan (typically
2836 * it'd be allocated in TopTransactionContext).
2837 *
2838 * Some entries of the array may be InvalidOid, indicating that the current
2839 * database's default tablespace should be used.
2840 */
2841 void
SetTempTablespaces(Oid * tableSpaces,int numSpaces)2842 SetTempTablespaces(Oid *tableSpaces, int numSpaces)
2843 {
2844 Assert(numSpaces >= 0);
2845 tempTableSpaces = tableSpaces;
2846 numTempTableSpaces = numSpaces;
2847
2848 /*
2849 * Select a random starting point in the list. This is to minimize
2850 * conflicts between backends that are most likely sharing the same list
2851 * of temp tablespaces. Note that if we create multiple temp files in the
2852 * same transaction, we'll advance circularly through the list --- this
2853 * ensures that large temporary sort files are nicely spread across all
2854 * available tablespaces.
2855 */
2856 if (numSpaces > 1)
2857 nextTempTableSpace = random() % numSpaces;
2858 else
2859 nextTempTableSpace = 0;
2860 }
2861
2862 /*
2863 * TempTablespacesAreSet
2864 *
2865 * Returns true if SetTempTablespaces has been called in current transaction.
2866 * (This is just so that tablespaces.c doesn't need its own per-transaction
2867 * state.)
2868 */
2869 bool
TempTablespacesAreSet(void)2870 TempTablespacesAreSet(void)
2871 {
2872 return (numTempTableSpaces >= 0);
2873 }
2874
2875 /*
2876 * GetTempTablespaces
2877 *
2878 * Populate an array with the OIDs of the tablespaces that should be used for
2879 * temporary files. (Some entries may be InvalidOid, indicating that the
2880 * current database's default tablespace should be used.) At most numSpaces
2881 * entries will be filled.
2882 * Returns the number of OIDs that were copied into the output array.
2883 */
2884 int
GetTempTablespaces(Oid * tableSpaces,int numSpaces)2885 GetTempTablespaces(Oid *tableSpaces, int numSpaces)
2886 {
2887 int i;
2888
2889 Assert(TempTablespacesAreSet());
2890 for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
2891 tableSpaces[i] = tempTableSpaces[i];
2892
2893 return i;
2894 }
2895
2896 /*
2897 * GetNextTempTableSpace
2898 *
2899 * Select the next temp tablespace to use. A result of InvalidOid means
2900 * to use the current database's default tablespace.
2901 */
2902 Oid
GetNextTempTableSpace(void)2903 GetNextTempTableSpace(void)
2904 {
2905 if (numTempTableSpaces > 0)
2906 {
2907 /* Advance nextTempTableSpace counter with wraparound */
2908 if (++nextTempTableSpace >= numTempTableSpaces)
2909 nextTempTableSpace = 0;
2910 return tempTableSpaces[nextTempTableSpace];
2911 }
2912 return InvalidOid;
2913 }
2914
2915
2916 /*
2917 * AtEOSubXact_Files
2918 *
2919 * Take care of subtransaction commit/abort. At abort, we close temp files
2920 * that the subtransaction may have opened. At commit, we reassign the
2921 * files that were opened to the parent subtransaction.
2922 */
2923 void
AtEOSubXact_Files(bool isCommit,SubTransactionId mySubid,SubTransactionId parentSubid)2924 AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
2925 SubTransactionId parentSubid)
2926 {
2927 Index i;
2928
2929 for (i = 0; i < numAllocatedDescs; i++)
2930 {
2931 if (allocatedDescs[i].create_subid == mySubid)
2932 {
2933 if (isCommit)
2934 allocatedDescs[i].create_subid = parentSubid;
2935 else
2936 {
2937 /* have to recheck the item after FreeDesc (ugly) */
2938 FreeDesc(&allocatedDescs[i--]);
2939 }
2940 }
2941 }
2942 }
2943
2944 /*
2945 * AtEOXact_Files
2946 *
2947 * This routine is called during transaction commit or abort. All still-open
2948 * per-transaction temporary file VFDs are closed, which also causes the
2949 * underlying files to be deleted (although they should've been closed already
2950 * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
2951 * closed. We also forget any transaction-local temp tablespace list.
2952 *
2953 * The isCommit flag is used only to decide whether to emit warnings about
2954 * unclosed files.
2955 */
2956 void
AtEOXact_Files(bool isCommit)2957 AtEOXact_Files(bool isCommit)
2958 {
2959 CleanupTempFiles(isCommit, false);
2960 tempTableSpaces = NULL;
2961 numTempTableSpaces = -1;
2962 }
2963
2964 /*
2965 * AtProcExit_Files
2966 *
2967 * on_proc_exit hook to clean up temp files during backend shutdown.
2968 * Here, we want to clean up *all* temp files including interXact ones.
2969 */
2970 static void
AtProcExit_Files(int code,Datum arg)2971 AtProcExit_Files(int code, Datum arg)
2972 {
2973 CleanupTempFiles(false, true);
2974 }
2975
2976 /*
2977 * Close temporary files and delete their underlying files.
2978 *
2979 * isCommit: if true, this is normal transaction commit, and we don't
2980 * expect any remaining files; warn if there are some.
2981 *
2982 * isProcExit: if true, this is being called as the backend process is
2983 * exiting. If that's the case, we should remove all temporary files; if
2984 * that's not the case, we are being called for transaction commit/abort
2985 * and should only remove transaction-local temp files. In either case,
2986 * also clean up "allocated" stdio files, dirs and fds.
2987 */
2988 static void
CleanupTempFiles(bool isCommit,bool isProcExit)2989 CleanupTempFiles(bool isCommit, bool isProcExit)
2990 {
2991 Index i;
2992
2993 /*
2994 * Careful here: at proc_exit we need extra cleanup, not just
2995 * xact_temporary files.
2996 */
2997 if (isProcExit || have_xact_temporary_files)
2998 {
2999 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3000 for (i = 1; i < SizeVfdCache; i++)
3001 {
3002 unsigned short fdstate = VfdCache[i].fdstate;
3003
3004 if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
3005 VfdCache[i].fileName != NULL)
3006 {
3007 /*
3008 * If we're in the process of exiting a backend process, close
3009 * all temporary files. Otherwise, only close temporary files
3010 * local to the current transaction. They should be closed by
3011 * the ResourceOwner mechanism already, so this is just a
3012 * debugging cross-check.
3013 */
3014 if (isProcExit)
3015 FileClose(i);
3016 else if (fdstate & FD_CLOSE_AT_EOXACT)
3017 {
3018 elog(WARNING,
3019 "temporary file %s not closed at end-of-transaction",
3020 VfdCache[i].fileName);
3021 FileClose(i);
3022 }
3023 }
3024 }
3025
3026 have_xact_temporary_files = false;
3027 }
3028
3029 /* Complain if any allocated files remain open at commit. */
3030 if (isCommit && numAllocatedDescs > 0)
3031 elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
3032 numAllocatedDescs);
3033
3034 /* Clean up "allocated" stdio files, dirs and fds. */
3035 while (numAllocatedDescs > 0)
3036 FreeDesc(&allocatedDescs[0]);
3037 }
3038
3039
3040 /*
3041 * Remove temporary and temporary relation files left over from a prior
3042 * postmaster session
3043 *
3044 * This should be called during postmaster startup. It will forcibly
3045 * remove any leftover files created by OpenTemporaryFile and any leftover
3046 * temporary relation files created by mdcreate.
3047 *
3048 * During post-backend-crash restart cycle, this routine is called when
3049 * remove_temp_files_after_crash GUC is enabled. Multiple crashes while
3050 * queries are using temp files could result in useless storage usage that can
3051 * only be reclaimed by a service restart. The argument against enabling it is
3052 * that someone might want to examine the temporary files for debugging
3053 * purposes. This does however mean that OpenTemporaryFile had better allow for
3054 * collision with an existing temp file name.
3055 *
3056 * NOTE: this function and its subroutines generally report syscall failures
3057 * with ereport(LOG) and keep going. Removing temp files is not so critical
3058 * that we should fail to start the database when we can't do it.
3059 */
3060 void
RemovePgTempFiles(void)3061 RemovePgTempFiles(void)
3062 {
3063 char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
3064 DIR *spc_dir;
3065 struct dirent *spc_de;
3066
3067 /*
3068 * First process temp files in pg_default ($PGDATA/base)
3069 */
3070 snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3071 RemovePgTempFilesInDir(temp_path, true, false);
3072 RemovePgTempRelationFiles("base");
3073
3074 /*
3075 * Cycle through temp directories for all non-default tablespaces.
3076 */
3077 spc_dir = AllocateDir("pg_tblspc");
3078
3079 while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL)
3080 {
3081 if (strcmp(spc_de->d_name, ".") == 0 ||
3082 strcmp(spc_de->d_name, "..") == 0)
3083 continue;
3084
3085 snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
3086 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
3087 RemovePgTempFilesInDir(temp_path, true, false);
3088
3089 snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
3090 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
3091 RemovePgTempRelationFiles(temp_path);
3092 }
3093
3094 FreeDir(spc_dir);
3095
3096 /*
3097 * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3098 * DataDir as well. However, that is *not* cleaned here because doing so
3099 * would create a race condition. It's done separately, earlier in
3100 * postmaster startup.
3101 */
3102 }
3103
3104 /*
3105 * Process one pgsql_tmp directory for RemovePgTempFiles.
3106 *
3107 * If missing_ok is true, it's all right for the named directory to not exist.
3108 * Any other problem results in a LOG message. (missing_ok should be true at
3109 * the top level, since pgsql_tmp directories are not created until needed.)
3110 *
3111 * At the top level, this should be called with unlink_all = false, so that
3112 * only files matching the temporary name prefix will be unlinked. When
3113 * recursing it will be called with unlink_all = true to unlink everything
3114 * under a top-level temporary directory.
3115 *
3116 * (These two flags could be replaced by one, but it seems clearer to keep
3117 * them separate.)
3118 */
3119 void
RemovePgTempFilesInDir(const char * tmpdirname,bool missing_ok,bool unlink_all)3120 RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3121 {
3122 DIR *temp_dir;
3123 struct dirent *temp_de;
3124 char rm_path[MAXPGPATH * 2];
3125
3126 temp_dir = AllocateDir(tmpdirname);
3127
3128 if (temp_dir == NULL && errno == ENOENT && missing_ok)
3129 return;
3130
3131 while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
3132 {
3133 if (strcmp(temp_de->d_name, ".") == 0 ||
3134 strcmp(temp_de->d_name, "..") == 0)
3135 continue;
3136
3137 snprintf(rm_path, sizeof(rm_path), "%s/%s",
3138 tmpdirname, temp_de->d_name);
3139
3140 if (unlink_all ||
3141 strncmp(temp_de->d_name,
3142 PG_TEMP_FILE_PREFIX,
3143 strlen(PG_TEMP_FILE_PREFIX)) == 0)
3144 {
3145 struct stat statbuf;
3146
3147 if (lstat(rm_path, &statbuf) < 0)
3148 {
3149 ereport(LOG,
3150 (errcode_for_file_access(),
3151 errmsg("could not stat file \"%s\": %m", rm_path)));
3152 continue;
3153 }
3154
3155 if (S_ISDIR(statbuf.st_mode))
3156 {
3157 /* recursively remove contents, then directory itself */
3158 RemovePgTempFilesInDir(rm_path, false, true);
3159
3160 if (rmdir(rm_path) < 0)
3161 ereport(LOG,
3162 (errcode_for_file_access(),
3163 errmsg("could not remove directory \"%s\": %m",
3164 rm_path)));
3165 }
3166 else
3167 {
3168 if (unlink(rm_path) < 0)
3169 ereport(LOG,
3170 (errcode_for_file_access(),
3171 errmsg("could not remove file \"%s\": %m",
3172 rm_path)));
3173 }
3174 }
3175 else
3176 ereport(LOG,
3177 (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3178 rm_path)));
3179 }
3180
3181 FreeDir(temp_dir);
3182 }
3183
3184 /* Process one tablespace directory, look for per-DB subdirectories */
3185 static void
RemovePgTempRelationFiles(const char * tsdirname)3186 RemovePgTempRelationFiles(const char *tsdirname)
3187 {
3188 DIR *ts_dir;
3189 struct dirent *de;
3190 char dbspace_path[MAXPGPATH * 2];
3191
3192 ts_dir = AllocateDir(tsdirname);
3193
3194 while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3195 {
3196 /*
3197 * We're only interested in the per-database directories, which have
3198 * numeric names. Note that this code will also (properly) ignore "."
3199 * and "..".
3200 */
3201 if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3202 continue;
3203
3204 snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3205 tsdirname, de->d_name);
3206 RemovePgTempRelationFilesInDbspace(dbspace_path);
3207 }
3208
3209 FreeDir(ts_dir);
3210 }
3211
3212 /* Process one per-dbspace directory for RemovePgTempRelationFiles */
3213 static void
RemovePgTempRelationFilesInDbspace(const char * dbspacedirname)3214 RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3215 {
3216 DIR *dbspace_dir;
3217 struct dirent *de;
3218 char rm_path[MAXPGPATH * 2];
3219
3220 dbspace_dir = AllocateDir(dbspacedirname);
3221
3222 while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3223 {
3224 if (!looks_like_temp_rel_name(de->d_name))
3225 continue;
3226
3227 snprintf(rm_path, sizeof(rm_path), "%s/%s",
3228 dbspacedirname, de->d_name);
3229
3230 if (unlink(rm_path) < 0)
3231 ereport(LOG,
3232 (errcode_for_file_access(),
3233 errmsg("could not remove file \"%s\": %m",
3234 rm_path)));
3235 }
3236
3237 FreeDir(dbspace_dir);
3238 }
3239
/*
 * Does "name" have the shape of a temporary relation file name?
 *
 * Accepted forms are t<digits>_<digits> and t<digits>_<digits>_<forkname>,
 * each optionally followed by .<digits> (a segment suffix).
 */
bool
looks_like_temp_rel_name(const char *name)
{
	const char *p = name;
	const char *run;

	/* The leading character has to be "t". */
	if (*p != 't')
		return false;
	p++;

	/* Then at least one digit, terminated by an underscore. */
	run = p;
	while (isdigit((unsigned char) *p))
		p++;
	if (p == run || *p != '_')
		return false;
	p++;

	/* Then a second run of at least one digit. */
	run = p;
	while (isdigit((unsigned char) *p))
		p++;
	if (p == run)
		return false;

	/* Optional "_forkname"; forkname_chars tells us how long the name is. */
	if (*p == '_')
	{
		int			forklen = forkname_chars(p + 1, NULL);

		if (forklen <= 0)
			return false;
		p += forklen + 1;
	}

	/* Optional ".segment": a dot followed by at least one digit. */
	if (*p == '.')
	{
		run = p + 1;
		p = run;
		while (isdigit((unsigned char) *p))
			p++;
		if (p == run)
			return false;
	}

	/* Nothing may follow. */
	return *p == '\0';
}
3288
#ifdef HAVE_SYNCFS
/*
 * Open "path" read-only and call syncfs() on the resulting descriptor.
 *
 * Failure to open the path or to sync is reported at LOG level; nothing is
 * returned to the caller.
 */
static void
do_syncfs(const char *path)
{
	int			fd = OpenTransientFile(path, O_RDONLY);

	if (fd >= 0)
	{
		if (syncfs(fd) < 0)
			ereport(LOG,
					(errcode_for_file_access(),
					 errmsg("could not synchronize file system for file \"%s\": %m", path)));
		CloseTransientFile(fd);
		return;
	}

	ereport(LOG,
			(errcode_for_file_access(),
			 errmsg("could not open file \"%s\": %m", path)));
}
#endif
3310
3311 /*
3312 * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
3313 * all potential filesystem, depending on recovery_init_sync_method setting.
3314 *
3315 * We fsync regular files and directories wherever they are, but we
3316 * follow symlinks only for pg_wal and immediately under pg_tblspc.
3317 * Other symlinks are presumed to point at files we're not responsible
3318 * for fsyncing, and might not have privileges to write at all.
3319 *
3320 * Errors are logged but not considered fatal; that's because this is used
3321 * only during database startup, to deal with the possibility that there are
3322 * issued-but-unsynced writes pending against the data directory. We want to
3323 * ensure that such writes reach disk before anything that's done in the new
3324 * run. However, aborting on error would result in failure to start for
3325 * harmless cases such as read-only files in the data directory, and that's
3326 * not good either.
3327 *
3328 * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3329 * rewriting all changes again during recovery.
3330 *
3331 * Note we assume we're chdir'd into PGDATA to begin with.
3332 */
3333 void
SyncDataDirectory(void)3334 SyncDataDirectory(void)
3335 {
3336 bool xlog_is_symlink;
3337
3338 /* We can skip this whole thing if fsync is disabled. */
3339 if (!enableFsync)
3340 return;
3341
3342 /*
3343 * If pg_wal is a symlink, we'll need to recurse into it separately,
3344 * because the first walkdir below will ignore it.
3345 */
3346 xlog_is_symlink = false;
3347
3348 #ifndef WIN32
3349 {
3350 struct stat st;
3351
3352 if (lstat("pg_wal", &st) < 0)
3353 ereport(LOG,
3354 (errcode_for_file_access(),
3355 errmsg("could not stat file \"%s\": %m",
3356 "pg_wal")));
3357 else if (S_ISLNK(st.st_mode))
3358 xlog_is_symlink = true;
3359 }
3360 #else
3361 if (pgwin32_is_junction("pg_wal"))
3362 xlog_is_symlink = true;
3363 #endif
3364
3365 #ifdef HAVE_SYNCFS
3366 if (recovery_init_sync_method == RECOVERY_INIT_SYNC_METHOD_SYNCFS)
3367 {
3368 DIR *dir;
3369 struct dirent *de;
3370
3371 /*
3372 * On Linux, we don't have to open every single file one by one. We
3373 * can use syncfs() to sync whole filesystems. We only expect
3374 * filesystem boundaries to exist where we tolerate symlinks, namely
3375 * pg_wal and the tablespaces, so we call syncfs() for each of those
3376 * directories.
3377 */
3378
3379 /* Sync the top level pgdata directory. */
3380 do_syncfs(".");
3381 /* If any tablespaces are configured, sync each of those. */
3382 dir = AllocateDir("pg_tblspc");
3383 while ((de = ReadDirExtended(dir, "pg_tblspc", LOG)))
3384 {
3385 char path[MAXPGPATH];
3386
3387 if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
3388 continue;
3389
3390 snprintf(path, MAXPGPATH, "pg_tblspc/%s", de->d_name);
3391 do_syncfs(path);
3392 }
3393 FreeDir(dir);
3394 /* If pg_wal is a symlink, process that too. */
3395 if (xlog_is_symlink)
3396 do_syncfs("pg_wal");
3397 return;
3398 }
3399 #endif /* !HAVE_SYNCFS */
3400
3401 /*
3402 * If possible, hint to the kernel that we're soon going to fsync the data
3403 * directory and its contents. Errors in this step are even less
3404 * interesting than normal, so log them only at DEBUG1.
3405 */
3406 #ifdef PG_FLUSH_DATA_WORKS
3407 walkdir(".", pre_sync_fname, false, DEBUG1);
3408 if (xlog_is_symlink)
3409 walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3410 walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
3411 #endif
3412
3413 /*
3414 * Now we do the fsync()s in the same order.
3415 *
3416 * The main call ignores symlinks, so in addition to specially processing
3417 * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3418 * process_symlinks = true. Note that if there are any plain directories
3419 * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3420 * so we don't worry about optimizing it.
3421 */
3422 walkdir(".", datadir_fsync_fname, false, LOG);
3423 if (xlog_is_symlink)
3424 walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3425 walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
3426 }
3427
3428 /*
3429 * walkdir: recursively walk a directory, applying the action to each
3430 * regular file and directory (including the named directory itself).
3431 *
3432 * If process_symlinks is true, the action and recursion are also applied
3433 * to regular files and directories that are pointed to by symlinks in the
3434 * given directory; otherwise symlinks are ignored. Symlinks are always
3435 * ignored in subdirectories, ie we intentionally don't pass down the
3436 * process_symlinks flag to recursive calls.
3437 *
3438 * Errors are reported at level elevel, which might be ERROR or less.
3439 *
3440 * See also walkdir in file_utils.c, which is a frontend version of this
3441 * logic.
3442 */
3443 static void
walkdir(const char * path,void (* action)(const char * fname,bool isdir,int elevel),bool process_symlinks,int elevel)3444 walkdir(const char *path,
3445 void (*action) (const char *fname, bool isdir, int elevel),
3446 bool process_symlinks,
3447 int elevel)
3448 {
3449 DIR *dir;
3450 struct dirent *de;
3451
3452 dir = AllocateDir(path);
3453
3454 while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3455 {
3456 char subpath[MAXPGPATH * 2];
3457
3458 CHECK_FOR_INTERRUPTS();
3459
3460 if (strcmp(de->d_name, ".") == 0 ||
3461 strcmp(de->d_name, "..") == 0)
3462 continue;
3463
3464 snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3465
3466 switch (get_dirent_type(subpath, de, process_symlinks, elevel))
3467 {
3468 case PGFILETYPE_REG:
3469 (*action) (subpath, false, elevel);
3470 break;
3471 case PGFILETYPE_DIR:
3472 walkdir(subpath, action, false, elevel);
3473 break;
3474 default:
3475
3476 /*
3477 * Errors are already reported directly by get_dirent_type(),
3478 * and any remaining symlinks and unknown file types are
3479 * ignored.
3480 */
3481 break;
3482 }
3483 }
3484
3485 FreeDir(dir); /* we ignore any error here */
3486
3487 /*
3488 * It's important to fsync the destination directory itself as individual
3489 * file fsyncs don't guarantee that the directory entry for the file is
3490 * synced. However, skip this if AllocateDir failed; the action function
3491 * might not be robust against that.
3492 */
3493 if (dir)
3494 (*action) (path, true, elevel);
3495 }
3496
3497
/*
 * Tell the OS that an fsync() of this file is coming, so it can get ready.
 *
 * Directories are skipped.  A file that cannot be opened because of EACCES
 * is silently ignored; other open or close failures are reported at the
 * caller-specified level.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Flushing a directory would likely just fail, so don't bother. */
	if (isdir)
		return;

	fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);
	if (fd < 0)
	{
		/* Unreadable files are not worth a message. */
		if (errno != EACCES)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * This is only a hint, so it is fine that pg_flush_data() ignores
	 * errors.
	 */
	pg_flush_data(fd, 0, 0);

	if (CloseTransientFile(fd) != 0)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", fname)));
}

#endif							/* PG_FLUSH_DATA_WORKS */
3540
/*
 * fsync one file or directory of the data directory.  The signature matches
 * the action callback accepted by walkdir.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	/*
	 * Errors about unreadable files should be silently ignored here; asking
	 * fsync_fname_ext() for ignore_perm = true conveys that.  Its result is
	 * deliberately not examined.
	 */
	(void) fsync_fname_ext(fname, isdir, true, elevel);
}
3550
/*
 * Remove one file or directory, tolerating a directory that is already
 * gone.  The signature matches the action callback accepted by walkdir.
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (!isdir)
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
		return;
	}

	/* A directory: ENOENT is not an error, anything else is reported. */
	if (rmdir(fname) != 0 && errno != ENOENT)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove directory \"%s\": %m", fname)));
}
3567
3568 /*
3569 * fsync_fname_ext -- Try to fsync a file or directory
3570 *
3571 * If ignore_perm is true, ignore errors upon trying to open unreadable
3572 * files. Logs other errors at a caller-specified level.
3573 *
3574 * Returns 0 if the operation succeeded, -1 otherwise.
3575 */
3576 int
fsync_fname_ext(const char * fname,bool isdir,bool ignore_perm,int elevel)3577 fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3578 {
3579 int fd;
3580 int flags;
3581 int returncode;
3582
3583 /*
3584 * Some OSs require directories to be opened read-only whereas other
3585 * systems don't allow us to fsync files opened read-only; so we need both
3586 * cases here. Using O_RDWR will cause us to fail to fsync files that are
3587 * not writable by our userid, but we assume that's OK.
3588 */
3589 flags = PG_BINARY;
3590 if (!isdir)
3591 flags |= O_RDWR;
3592 else
3593 flags |= O_RDONLY;
3594
3595 fd = OpenTransientFile(fname, flags);
3596
3597 /*
3598 * Some OSs don't allow us to open directories at all (Windows returns
3599 * EACCES), just ignore the error in that case. If desired also silently
3600 * ignoring errors about unreadable files. Log others.
3601 */
3602 if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3603 return 0;
3604 else if (fd < 0 && ignore_perm && errno == EACCES)
3605 return 0;
3606 else if (fd < 0)
3607 {
3608 ereport(elevel,
3609 (errcode_for_file_access(),
3610 errmsg("could not open file \"%s\": %m", fname)));
3611 return -1;
3612 }
3613
3614 returncode = pg_fsync(fd);
3615
3616 /*
3617 * Some OSes don't allow us to fsync directories at all, so we can ignore
3618 * those errors. Anything else needs to be logged.
3619 */
3620 if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3621 {
3622 int save_errno;
3623
3624 /* close file upon error, might not be in transaction context */
3625 save_errno = errno;
3626 (void) CloseTransientFile(fd);
3627 errno = save_errno;
3628
3629 ereport(elevel,
3630 (errcode_for_file_access(),
3631 errmsg("could not fsync file \"%s\": %m", fname)));
3632 return -1;
3633 }
3634
3635 if (CloseTransientFile(fd) != 0)
3636 {
3637 ereport(elevel,
3638 (errcode_for_file_access(),
3639 errmsg("could not close file \"%s\": %m", fname)));
3640 return -1;
3641 }
3642
3643 return 0;
3644 }
3645
3646 /*
3647 * fsync_parent_path -- fsync the parent path of a file or directory
3648 *
3649 * This is aimed at making file operations persistent on disk in case of
3650 * an OS crash or power failure.
3651 */
3652 static int
fsync_parent_path(const char * fname,int elevel)3653 fsync_parent_path(const char *fname, int elevel)
3654 {
3655 char parentpath[MAXPGPATH];
3656
3657 strlcpy(parentpath, fname, MAXPGPATH);
3658 get_parent_directory(parentpath);
3659
3660 /*
3661 * get_parent_directory() returns an empty string if the input argument is
3662 * just a file name (see comments in path.c), so handle that as being the
3663 * current directory.
3664 */
3665 if (strlen(parentpath) == 0)
3666 strlcpy(parentpath, ".", MAXPGPATH);
3667
3668 if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3669 return -1;
3670
3671 return 0;
3672 }
3673
3674 /*
3675 * Create a PostgreSQL data sub-directory
3676 *
3677 * The data directory itself, and most of its sub-directories, are created at
3678 * initdb time, but we do have some occasions when we create directories in
3679 * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3680 * make sure that those directories are created consistently. Today, that means
3681 * making sure that the created directory has the correct permissions, which is
3682 * what pg_dir_create_mode tracks for us.
3683 *
3684 * Note that we also set the umask() based on what we understand the correct
3685 * permissions to be (see file_perm.c).
3686 *
3687 * For permissions other than the default, mkdir() can be used directly, but
3688 * be sure to consider carefully such cases -- a sub-directory with incorrect
3689 * permissions in a PostgreSQL data directory could cause backups and other
3690 * processes to fail.
3691 */
3692 int
MakePGDirectory(const char * directoryName)3693 MakePGDirectory(const char *directoryName)
3694 {
3695 return mkdir(directoryName, pg_dir_create_mode);
3696 }
3697
3698 /*
3699 * Return the passed-in error level, or PANIC if data_sync_retry is off.
3700 *
3701 * Failure to fsync any data file is cause for immediate panic, unless
3702 * data_sync_retry is enabled. Data may have been written to the operating
3703 * system and removed from our buffer pool already, and if we are running on
3704 * an operating system that forgets dirty data on write-back failure, there
3705 * may be only one copy of the data remaining: in the WAL. A later attempt to
3706 * fsync again might falsely report success. Therefore we must not allow any
3707 * further checkpoints to be attempted. data_sync_retry can in theory be
3708 * enabled on systems known not to drop dirty buffered data on write-back
3709 * failure (with the likely outcome that checkpoints will continue to fail
3710 * until the underlying problem is fixed).
3711 *
3712 * Any code that reports a failure from fsync() or related functions should
3713 * filter the error level with this function.
3714 */
3715 int
data_sync_elevel(int elevel)3716 data_sync_elevel(int elevel)
3717 {
3718 return data_sync_retry ? elevel : PANIC;
3719 }
3720
3721 /*
3722 * A convenience wrapper for pg_pwritev() that retries on partial write. If an
3723 * error is returned, it is unspecified how much has been written.
3724 */
3725 ssize_t
pg_pwritev_with_retry(int fd,const struct iovec * iov,int iovcnt,off_t offset)3726 pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
3727 {
3728 struct iovec iov_copy[PG_IOV_MAX];
3729 ssize_t sum = 0;
3730 ssize_t part;
3731
3732 /* We'd better have space to make a copy, in case we need to retry. */
3733 if (iovcnt > PG_IOV_MAX)
3734 {
3735 errno = EINVAL;
3736 return -1;
3737 }
3738
3739 for (;;)
3740 {
3741 /* Write as much as we can. */
3742 part = pg_pwritev(fd, iov, iovcnt, offset);
3743 if (part < 0)
3744 return -1;
3745
3746 #ifdef SIMULATE_SHORT_WRITE
3747 part = Min(part, 4096);
3748 #endif
3749
3750 /* Count our progress. */
3751 sum += part;
3752 offset += part;
3753
3754 /* Step over iovecs that are done. */
3755 while (iovcnt > 0 && iov->iov_len <= part)
3756 {
3757 part -= iov->iov_len;
3758 ++iov;
3759 --iovcnt;
3760 }
3761
3762 /* Are they all done? */
3763 if (iovcnt == 0)
3764 {
3765 /* We don't expect the kernel to write more than requested. */
3766 Assert(part == 0);
3767 break;
3768 }
3769
3770 /*
3771 * Move whatever's left to the front of our mutable copy and adjust
3772 * the leading iovec.
3773 */
3774 Assert(iovcnt > 0);
3775 memmove(iov_copy, iov, sizeof(*iov) * iovcnt);
3776 Assert(iov->iov_len > part);
3777 iov_copy[0].iov_base = (char *) iov_copy[0].iov_base + part;
3778 iov_copy[0].iov_len -= part;
3779 iov = iov_copy;
3780 }
3781
3782 return sum;
3783 }
3784