1 /*-------------------------------------------------------------------------
2 *
3 * fd.c
4 * Virtual file descriptor code.
5 *
6 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 * IDENTIFICATION
10 * src/backend/storage/file/fd.c
11 *
12 * NOTES:
13 *
14 * This code manages a cache of 'virtual' file descriptors (VFDs).
15 * The server opens many file descriptors for a variety of reasons,
16 * including base tables, scratch files (e.g., sort and hash spool
17 * files), and random calls to C library routines like system(3); it
18 * is quite easy to exceed system limits on the number of open files a
19 * single process can have. (This is around 256 on many modern
20 * operating systems, but can be as low as 32 on others.)
21 *
 * VFDs are managed as an LRU pool, with actual OS file descriptors
 * being opened and closed as needed.  Obviously, if a file is
 * opened using these interfaces, all subsequent operations on it must
 * also go through these interfaces (the File type is not a real file
 * descriptor).
27 *
28 * For this scheme to work, most (if not all) routines throughout the
29 * server should use these interfaces instead of calling the C library
30 * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31 * may find ourselves short of real file descriptors anyway.
32 *
33 * INTERFACE ROUTINES
34 *
35 * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36 * A File opened with OpenTemporaryFile is automatically deleted when the
37 * File is closed, either explicitly or implicitly at end of transaction or
38 * process exit. PathNameOpenFile is intended for files that are held open
39 * for a long time, like relation files. It is the caller's responsibility
40 * to close them, there is no automatic mechanism in fd.c for that.
41 *
42 * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43 * temporary files that have names so that they can be shared between
44 * backends. Such files are automatically closed and count against the
45 * temporary file limit of the backend that creates them, but unlike anonymous
46 * files they are not automatically deleted. See sharedfileset.c for a shared
47 * ownership mechanism that provides automatic cleanup for shared files when
48 * the last of a group of backends detaches.
49 *
50 * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51 * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52 * They behave like the corresponding native functions, except that the handle
53 * is registered with the current subtransaction, and will be automatically
54 * closed at abort. These are intended mainly for short operations like
55 * reading a configuration file; there is a limit on the number of files that
56 * can be opened using these functions at any one time.
57 *
 * Finally, BasicOpenFile is just a thin wrapper around open() that can
 * release file descriptors in use by the virtual file descriptors if
 * necessary.  There is no automatic cleanup of file descriptors returned by
 * BasicOpenFile; it is solely the caller's responsibility to close the file
 * descriptor by calling close(2).
63 *
64 *-------------------------------------------------------------------------
65 */
66
67 #include "postgres.h"
68
69 #include <sys/file.h>
70 #include <sys/param.h>
71 #include <sys/stat.h>
72 #ifndef WIN32
73 #include <sys/mman.h>
74 #endif
75 #include <limits.h>
76 #include <unistd.h>
77 #include <fcntl.h>
78 #ifdef HAVE_SYS_RESOURCE_H
79 #include <sys/resource.h> /* for getrlimit */
80 #endif
81
82 #include "miscadmin.h"
83 #include "access/xact.h"
84 #include "access/xlog.h"
85 #include "catalog/pg_tablespace.h"
86 #include "common/file_perm.h"
87 #include "pgstat.h"
88 #include "portability/mem.h"
89 #include "storage/fd.h"
90 #include "storage/ipc.h"
91 #include "utils/guc.h"
92 #include "utils/resowner_private.h"
93
94
/*
 * Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data.
 *
 * The three conditions tested here are the same three conditions that guard
 * the alternative code paths inside pg_flush_data(): sync_file_range(),
 * msync(MS_ASYNC) on a temporary mapping, and
 * posix_fadvise(POSIX_FADV_DONTNEED).  If none of them holds, the macro is
 * left undefined.
 */
#if defined(HAVE_SYNC_FILE_RANGE)
#define PG_FLUSH_DATA_WORKS 1
#elif !defined(WIN32) && defined(MS_ASYNC)
#define PG_FLUSH_DATA_WORKS 1
#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
#define PG_FLUSH_DATA_WORKS 1
#endif
103
/*
 * We must leave some file descriptors free for system(), the dynamic loader,
 * and other code that tries to open files without consulting fd.c.  This
 * is the number left free.  (While we can be pretty sure we won't get
 * EMFILE, there's never any guarantee that we won't get ENFILE due to
 * other processes chewing up FDs.  So it's a bad idea to try to open files
 * without consulting fd.c.  Nonetheless we cannot control all code.)
 *
 * Because this is just a fixed setting, we are effectively assuming that
 * no such code will leave FDs open over the long term; otherwise the slop
 * is likely to be insufficient.  Note in particular that we expect that
 * loading a shared library does not result in any permanent increase in
 * the number of open files.  (This appears to be true on most if not
 * all platforms as of Feb 2004.)
 *
 * set_max_safe_fds() subtracts this amount from its computed limit.
 */
#define NUM_RESERVED_FDS		10

/*
 * If we have fewer than this many usable FDs after allowing for the reserved
 * ones, choke.  (set_max_safe_fds() reports FATAL in that case.)
 */
#define FD_MINFREE				10
126
/*
 * A number of platforms allow individual processes to open many more files
 * than they can really support when *many* processes do the same thing.
 * This GUC parameter lets the DBA limit max_safe_fds to something less than
 * what the postmaster's initial probe suggests will work.
 *
 * It is also used by set_max_safe_fds() as the upper bound on how many
 * descriptors the probe will try to open.
 */
int			max_files_per_process = 1000;

/*
 * Maximum number of file descriptors to open for either VFD entries or
 * AllocateFile/AllocateDir/OpenTransientFile operations.  This is initialized
 * to a conservative value, and remains that way indefinitely in bootstrap or
 * standalone-backend cases.  In normal postmaster operation, the postmaster
 * calls set_max_safe_fds() late in initialization to update the value, and
 * that value is then inherited by forked subprocesses.
 *
 * Note: the value of max_files_per_process is taken into account while
 * setting this variable, and so need not be tested separately.
 */
int			max_safe_fds = 32;	/* default if not changed */

/* Whether it is safe to continue running after fsync() fails. */
bool		data_sync_retry = false;
150
/*
 * Debugging support.
 *
 * With FDDEBUG defined, DO_DB(A) executes the statement A and then restores
 * errno to the value it had beforehand, so that debug output cannot disturb
 * the error state seen by the surrounding code.  Without FDDEBUG, DO_DB(A)
 * expands to a no-op and A is not evaluated at all.
 */

#ifdef FDDEBUG
#define DO_DB(A) \
	do { \
		int			_do_db_save_errno = errno; \
		A; \
		errno = _do_db_save_errno; \
	} while (0)
#else
#define DO_DB(A) \
	((void) 0)
#endif
164
/* Value stored in Vfd.fd when no kernel descriptor is currently assigned */
#define VFD_CLOSED (-1)

/*
 * True if "file" is a usable VFD index: greater than zero (slot 0 is only a
 * list header), within the current array, and with a non-null fileName.
 * Note that the argument is evaluated more than once.
 */
#define FileIsValid(file) \
	((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)

/* True if the VFD currently has no kernel descriptor; no range checking */
#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)

/*
 * Note: a VFD's seekPos is normally always valid, but if for some reason
 * an lseek() fails, it might become set to FileUnknownPos.  We can struggle
 * along without knowing the seek position in many cases, but in some places
 * we have to fail if we don't have it.
 *
 * FilePosIsUnknown() treats any negative position as unknown.
 */
#define FileUnknownPos ((off_t) -1)
#define FilePosIsUnknown(pos) ((pos) < 0)

/* these are the assigned bits in fdstate below: */
#define FD_DELETE_AT_CLOSE	(1 << 0)	/* T = delete when closed */
#define FD_CLOSE_AT_EOXACT	(1 << 1)	/* T = close at eoXact */
#define FD_TEMP_FILE_LIMIT	(1 << 2)	/* T = respect temp_file_limit */
185
/*
 * One entry of the virtual file descriptor cache.
 *
 * An entry is in use when fileName is non-null.  Entries that are not in use
 * are chained through nextFree.  lruMoreRecently/lruLessRecently are the
 * links of the recency-of-use ring; all three link fields hold indexes into
 * the VfdCache array rather than pointers.
 */
typedef struct vfd
{
	int			fd;				/* current FD, or VFD_CLOSED if none */
	unsigned short fdstate;		/* bitflags for VFD's state (FD_* bits) */
	ResourceOwner resowner;		/* owner, for automatic cleanup */
	File		nextFree;		/* link to next free VFD, if in freelist */
	File		lruMoreRecently;	/* doubly linked recency-of-use list */
	File		lruLessRecently;
	off_t		seekPos;		/* current logical file position, or -1 */
	off_t		fileSize;		/* current size of file (0 if not temporary) */
	char	   *fileName;		/* name of file, or NULL for unused VFD */
	/* NB: fileName is malloc'd, and must be free'd when closing the VFD */
	int			fileFlags;		/* open(2) flags for (re)opening the file */
	mode_t		fileMode;		/* mode to pass to open(2) */
} Vfd;
201
/*
 * Virtual File Descriptor array pointer and size.  This grows as
 * needed.  'File' values are indexes into this array.
 * Note that VfdCache[0] is not a usable VFD, just a list header.
 *
 * SizeVfdCache stays zero until InitFileAccess() has allocated the header
 * entry, at which point it becomes 1.
 */
static Vfd *VfdCache;
static Size SizeVfdCache = 0;

/*
 * Number of file descriptors known to be in use by VFD entries.
 */
static int	nfile = 0;

/*
 * Flag to tell whether it's worth scanning VfdCache looking for temp files
 * to close
 */
static bool have_xact_temporary_files = false;

/*
 * Tracks the total size of all temporary files.  Note: when temp_file_limit
 * is being enforced, this cannot overflow since the limit cannot be more
 * than INT_MAX kilobytes.  When not enforcing, it could theoretically
 * overflow, but we don't care.
 */
static uint64 temporary_files_size = 0;
228
/*
 * List of OS handles opened with AllocateFile, AllocateDir and
 * OpenTransientFile.
 *
 * "kind" selects which member of the "desc" union is meaningful, and
 * create_subid records a subtransaction ID for the entry.
 */
typedef enum
{
	AllocateDescFile,
	AllocateDescPipe,
	AllocateDescDir,
	AllocateDescRawFD
} AllocateDescKind;

typedef struct
{
	AllocateDescKind kind;
	SubTransactionId create_subid;
	union
	{
		FILE	   *file;
		DIR		   *dir;
		int			fd;
	}			desc;
} AllocateDesc;

/* Entries in use, allocated capacity, and the array itself */
static int	numAllocatedDescs = 0;
static int	maxAllocatedDescs = 0;
static AllocateDesc *allocatedDescs = NULL;
256
/*
 * Number of temporary files opened during the current session;
 * this is used in generation of tempfile names.
 */
static long tempFileCounter = 0;

/*
 * Array of OIDs of temp tablespaces.  (Some entries may be InvalidOid,
 * indicating that the current database's default tablespace should be used.)
 * When numTempTableSpaces is -1, this has not been set in the current
 * transaction.
 *
 * NOTE(review): nextTempTableSpace is presumably the index of the next
 * array entry to hand out; its users are not visible in this part of the
 * file, so confirm against them.
 */
static Oid *tempTableSpaces = NULL;
static int	numTempTableSpaces = -1;
static int	nextTempTableSpace = 0;
272
273
274 /*--------------------
275 *
276 * Private Routines
277 *
278 * Delete - delete a file from the Lru ring
279 * LruDelete - remove a file from the Lru ring and close its FD
280 * Insert - put a file at the front of the Lru ring
281 * LruInsert - put a file at the front of the Lru ring and open it
282 * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
283 * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
 * AllocateVfd	   - grab a free (or new) file record (from VfdCache)
285 * FreeVfd - free a file record
286 *
287 * The Least Recently Used ring is a doubly linked list that begins and
288 * ends on element zero. Element zero is special -- it doesn't represent
289 * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
290 * anchor that shows us the beginning/end of the ring.
291 * Only VFD elements that are currently really open (have an FD assigned) are
292 * in the Lru ring. Elements that are "virtually" open can be recognized
293 * by having a non-null fileName field.
294 *
295 * example:
296 *
297 * /--less----\ /---------\
298 * v \ v \
299 * #0 --more---> LeastRecentlyUsed --more-\ \
300 * ^\ | |
301 * \\less--> MostRecentlyUsedFile <---/ |
302 * \more---/ \--less--/
303 *
304 *--------------------
305 */
/* LRU ring maintenance and VFD slot allocation */
static void Delete(File file);
static void LruDelete(File file);
static void Insert(File file);
static int	LruInsert(File file);
static bool ReleaseLruFile(void);
static void ReleaseLruFiles(void);
static File AllocateVfd(void);
static void FreeVfd(File file);

/* VFD access, temporary file creation, and AllocateDesc bookkeeping */
static int	FileAccess(File file);
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
static bool reserveAllocatedDesc(void);
static int	FreeDesc(AllocateDesc *desc);

/*
 * Cleanup of temporary files.  AtProcExit_Files is the callback that
 * InitFileAccess() registers with on_proc_exit().
 */
static void AtProcExit_Files(int code, Datum arg);
static void CleanupTempFiles(bool isCommit, bool isProcExit);
static void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok,
					   bool unlink_all);
static void RemovePgTempRelationFiles(const char *tsdirname);
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);

/*
 * Directory tree walker.  The three functions declared after it share the
 * signature of its "action" callback; pre_sync_fname is only declared when
 * PG_FLUSH_DATA_WORKS is defined.
 */
static void walkdir(const char *path,
		void (*action) (const char *fname, bool isdir, int elevel),
		bool process_symlinks,
		int elevel);
#ifdef PG_FLUSH_DATA_WORKS
static void pre_sync_fname(const char *fname, bool isdir, int elevel);
#endif
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);

/*
 * fsync helpers taking a caller-chosen error level.  Callers in this file
 * treat a nonzero result as failure.
 */
static int	fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
static int	fsync_parent_path(const char *fname, int elevel);
339
340
/*
 * pg_fsync --- do fsync with or without writethrough
 *
 * fd is the kernel file descriptor to flush.  The call is routed to
 * pg_fsync_writethrough() when sync_method asks for write-through and the
 * platform has a distinct write-through implementation; in every other case
 * it goes to pg_fsync_no_writethrough().  The chosen helper's result is
 * returned unchanged.
 */
int
pg_fsync(int fd)
{
	/* The sync_method test is compiled out entirely when it cannot matter */
#if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
	if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
		return pg_fsync_writethrough(fd);
#endif

	return pg_fsync_no_writethrough(fd);
}
355
356
357 /*
358 * pg_fsync_no_writethrough --- same as fsync except does nothing if
359 * enableFsync is off
360 */
361 int
pg_fsync_no_writethrough(int fd)362 pg_fsync_no_writethrough(int fd)
363 {
364 if (enableFsync)
365 return fsync(fd);
366 else
367 return 0;
368 }
369
370 /*
371 * pg_fsync_writethrough
372 */
373 int
pg_fsync_writethrough(int fd)374 pg_fsync_writethrough(int fd)
375 {
376 if (enableFsync)
377 {
378 #ifdef WIN32
379 return _commit(fd);
380 #elif defined(F_FULLFSYNC)
381 return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
382 #else
383 errno = ENOSYS;
384 return -1;
385 #endif
386 }
387 else
388 return 0;
389 }
390
391 /*
392 * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
393 *
394 * Not all platforms have fdatasync; treat as fsync if not available.
395 */
396 int
pg_fdatasync(int fd)397 pg_fdatasync(int fd)
398 {
399 if (enableFsync)
400 {
401 #ifdef HAVE_FDATASYNC
402 return fdatasync(fd);
403 #else
404 return fsync(fd);
405 #endif
406 }
407 else
408 return 0;
409 }
410
411 /*
412 * pg_flush_data --- advise OS that the described dirty data should be flushed
413 *
414 * offset of 0 with nbytes 0 means that the entire file should be flushed;
415 * in this case, this function may have side-effects on the file's
416 * seek position!
417 */
418 void
pg_flush_data(int fd,off_t offset,off_t nbytes)419 pg_flush_data(int fd, off_t offset, off_t nbytes)
420 {
421 /*
422 * Right now file flushing is primarily used to avoid making later
423 * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
424 * if fsyncs are disabled - that's a decision we might want to make
425 * configurable at some point.
426 */
427 if (!enableFsync)
428 return;
429
430 /*
431 * We compile all alternatives that are supported on the current platform,
432 * to find portability problems more easily.
433 */
434 #if defined(HAVE_SYNC_FILE_RANGE)
435 {
436 int rc;
437 static bool not_implemented_by_kernel = false;
438
439 if (not_implemented_by_kernel)
440 return;
441
442 /*
443 * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
444 * tells the OS that writeback for the specified blocks should be
445 * started, but that we don't want to wait for completion. Note that
446 * this call might block if too much dirty data exists in the range.
447 * This is the preferable method on OSs supporting it, as it works
448 * reliably when available (contrast to msync()) and doesn't flush out
449 * clean data (like FADV_DONTNEED).
450 */
451 rc = sync_file_range(fd, offset, nbytes,
452 SYNC_FILE_RANGE_WRITE);
453 if (rc != 0)
454 {
455 int elevel;
456
457 /*
458 * For systems that don't have an implementation of
459 * sync_file_range() such as Windows WSL, generate only one
460 * warning and then suppress all further attempts by this process.
461 */
462 if (errno == ENOSYS)
463 {
464 elevel = WARNING;
465 not_implemented_by_kernel = true;
466 }
467 else
468 elevel = data_sync_elevel(WARNING);
469
470 ereport(elevel,
471 (errcode_for_file_access(),
472 errmsg("could not flush dirty data: %m")));
473 }
474
475 return;
476 }
477 #endif
478 #if !defined(WIN32) && defined(MS_ASYNC)
479 {
480 void *p;
481 static int pagesize = 0;
482
483 /*
484 * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
485 * writeback. On linux it only does so if MS_SYNC is specified, but
486 * then it does the writeback synchronously. Luckily all common linux
487 * systems have sync_file_range(). This is preferable over
488 * FADV_DONTNEED because it doesn't flush out clean data.
489 *
490 * We map the file (mmap()), tell the kernel to sync back the contents
491 * (msync()), and then remove the mapping again (munmap()).
492 */
493
494 /* mmap() needs actual length if we want to map whole file */
495 if (offset == 0 && nbytes == 0)
496 {
497 nbytes = lseek(fd, 0, SEEK_END);
498 if (nbytes < 0)
499 {
500 ereport(WARNING,
501 (errcode_for_file_access(),
502 errmsg("could not determine dirty data size: %m")));
503 return;
504 }
505 }
506
507 /*
508 * Some platforms reject partial-page mmap() attempts. To deal with
509 * that, just truncate the request to a page boundary. If any extra
510 * bytes don't get flushed, well, it's only a hint anyway.
511 */
512
513 /* fetch pagesize only once */
514 if (pagesize == 0)
515 pagesize = sysconf(_SC_PAGESIZE);
516
517 /* align length to pagesize, dropping any fractional page */
518 if (pagesize > 0)
519 nbytes = (nbytes / pagesize) * pagesize;
520
521 /* fractional-page request is a no-op */
522 if (nbytes <= 0)
523 return;
524
525 /*
526 * mmap could well fail, particularly on 32-bit platforms where there
527 * may simply not be enough address space. If so, silently fall
528 * through to the next implementation.
529 */
530 if (nbytes <= (off_t) SSIZE_MAX)
531 p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
532 else
533 p = MAP_FAILED;
534
535 if (p != MAP_FAILED)
536 {
537 int rc;
538
539 rc = msync(p, (size_t) nbytes, MS_ASYNC);
540 if (rc != 0)
541 {
542 ereport(data_sync_elevel(WARNING),
543 (errcode_for_file_access(),
544 errmsg("could not flush dirty data: %m")));
545 /* NB: need to fall through to munmap()! */
546 }
547
548 rc = munmap(p, (size_t) nbytes);
549 if (rc != 0)
550 {
551 /* FATAL error because mapping would remain */
552 ereport(FATAL,
553 (errcode_for_file_access(),
554 errmsg("could not munmap() while flushing data: %m")));
555 }
556
557 return;
558 }
559 }
560 #endif
561 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
562 {
563 int rc;
564
565 /*
566 * Signal the kernel that the passed in range should not be cached
567 * anymore. This has the, desired, side effect of writing out dirty
568 * data, and the, undesired, side effect of likely discarding useful
569 * clean cached blocks. For the latter reason this is the least
570 * preferable method.
571 */
572
573 rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
574
575 if (rc != 0)
576 {
577 /* don't error out, this is just a performance optimization */
578 ereport(WARNING,
579 (errcode_for_file_access(),
580 errmsg("could not flush dirty data: %m")));
581 }
582
583 return;
584 }
585 #endif
586 }
587
588
589 /*
590 * fsync_fname -- fsync a file or directory, handling errors properly
591 *
592 * Try to fsync a file or directory. When doing the latter, ignore errors that
593 * indicate the OS just doesn't allow/require fsyncing directories.
594 */
595 void
fsync_fname(const char * fname,bool isdir)596 fsync_fname(const char *fname, bool isdir)
597 {
598 fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
599 }
600
601 /*
602 * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
603 *
604 * This routine ensures that, after returning, the effect of renaming file
605 * persists in case of a crash. A crash while this routine is running will
606 * leave you with either the pre-existing or the moved file in place of the
607 * new file; no mixed state or truncated files are possible.
608 *
609 * It does so by using fsync on the old filename and the possibly existing
610 * target filename before the rename, and the target file and directory after.
611 *
612 * Note that rename() cannot be used across arbitrary directories, as they
613 * might not be on the same filesystem. Therefore this routine does not
614 * support renaming across directories.
615 *
616 * Log errors with the caller specified severity.
617 *
618 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
619 * valid upon return.
620 */
621 int
durable_rename(const char * oldfile,const char * newfile,int elevel)622 durable_rename(const char *oldfile, const char *newfile, int elevel)
623 {
624 int fd;
625
626 /*
627 * First fsync the old and target path (if it exists), to ensure that they
628 * are properly persistent on disk. Syncing the target file is not
629 * strictly necessary, but it makes it easier to reason about crashes;
630 * because it's then guaranteed that either source or target file exists
631 * after a crash.
632 */
633 if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
634 return -1;
635
636 fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
637 if (fd < 0)
638 {
639 if (errno != ENOENT)
640 {
641 ereport(elevel,
642 (errcode_for_file_access(),
643 errmsg("could not open file \"%s\": %m", newfile)));
644 return -1;
645 }
646 }
647 else
648 {
649 if (pg_fsync(fd) != 0)
650 {
651 int save_errno;
652
653 /* close file upon error, might not be in transaction context */
654 save_errno = errno;
655 CloseTransientFile(fd);
656 errno = save_errno;
657
658 ereport(elevel,
659 (errcode_for_file_access(),
660 errmsg("could not fsync file \"%s\": %m", newfile)));
661 return -1;
662 }
663 CloseTransientFile(fd);
664 }
665
666 /* Time to do the real deal... */
667 if (rename(oldfile, newfile) < 0)
668 {
669 ereport(elevel,
670 (errcode_for_file_access(),
671 errmsg("could not rename file \"%s\" to \"%s\": %m",
672 oldfile, newfile)));
673 return -1;
674 }
675
676 /*
677 * To guarantee renaming the file is persistent, fsync the file with its
678 * new name, and its containing directory.
679 */
680 if (fsync_fname_ext(newfile, false, false, elevel) != 0)
681 return -1;
682
683 if (fsync_parent_path(newfile, elevel) != 0)
684 return -1;
685
686 return 0;
687 }
688
/*
 * durable_unlink -- remove a file in a durable manner
 *
 * This routine ensures that, after returning, the effect of removing file
 * persists in case of a crash.  A crash while this routine is running will
 * leave the system in no mixed state.
 *
 * The file is unlinked first, and then its parent directory is fsync'ed so
 * that the removal itself reaches disk.
 *
 * fname is the path to remove; elevel is the severity used to log errors.
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  Note that errno is not
 * valid upon return.
 */
int
durable_unlink(const char *fname, int elevel)
{
	int			rc = unlink(fname);

	if (rc < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove file \"%s\": %m",
						fname)));
		return -1;
	}

	/* Persist the removal by flushing the containing directory. */
	return (fsync_parent_path(fname, elevel) != 0) ? -1 : 0;
}
725
/*
 * durable_link_or_rename -- rename a file in a durable manner.
 *
 * Similar to durable_rename(), except that this routine tries (but does not
 * guarantee) not to overwrite the target file.
 *
 * Where HAVE_WORKING_LINK is set, the new name is created with link() and
 * the old name is then unlinked; elsewhere plain rename() is used.  Note
 * that a crash in an unfortunate moment can leave you with two links to
 * the target file.
 *
 * oldfile is the existing path, newfile is the target path, and elevel is
 * the severity used to log errors.
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  Note that errno is not
 * valid upon return.
 */
int
durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
{
	/*
	 * Flush the source first, so that a crash right after the rename/link
	 * cannot expose a target with invalid contents.
	 */
	if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
		return -1;

#if HAVE_WORKING_LINK
	if (link(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not link file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}
	/* result of removing the old name is not checked */
	(void) unlink(oldfile);
#else
	/* XXX: Add racy file existence check? */
	if (rename(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}
#endif

	/*
	 * Make the change persistent in case of an OS crash: flush the new entry
	 * first and, only if that worked, its parent directory.
	 */
	if (fsync_fname_ext(newfile, false, false, elevel) != 0 ||
		fsync_parent_path(newfile, elevel) != 0)
		return -1;

	return 0;
}
785
786 /*
787 * InitFileAccess --- initialize this module during backend startup
788 *
789 * This is called during either normal or standalone backend start.
790 * It is *not* called in the postmaster.
791 */
792 void
InitFileAccess(void)793 InitFileAccess(void)
794 {
795 Assert(SizeVfdCache == 0); /* call me only once */
796
797 /* initialize cache header entry */
798 VfdCache = (Vfd *) malloc(sizeof(Vfd));
799 if (VfdCache == NULL)
800 ereport(FATAL,
801 (errcode(ERRCODE_OUT_OF_MEMORY),
802 errmsg("out of memory")));
803
804 MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
805 VfdCache->fd = VFD_CLOSED;
806
807 SizeVfdCache = 1;
808
809 /* register proc-exit hook to ensure temp files are dropped at exit */
810 on_proc_exit(AtProcExit_Files, 0);
811 }
812
813 /*
814 * count_usable_fds --- count how many FDs the system will let us open,
815 * and estimate how many are already open.
816 *
817 * We stop counting if usable_fds reaches max_to_probe. Note: a small
818 * value of max_to_probe might result in an underestimate of already_open;
819 * we must fill in any "gaps" in the set of used FDs before the calculation
820 * of already_open will give the right answer. In practice, max_to_probe
821 * of a couple of dozen should be enough to ensure good results.
822 *
823 * We assume stdin (FD 0) is available for dup'ing
824 */
825 static void
count_usable_fds(int max_to_probe,int * usable_fds,int * already_open)826 count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
827 {
828 int *fd;
829 int size;
830 int used = 0;
831 int highestfd = 0;
832 int j;
833
834 #ifdef HAVE_GETRLIMIT
835 struct rlimit rlim;
836 int getrlimit_status;
837 #endif
838
839 size = 1024;
840 fd = (int *) palloc(size * sizeof(int));
841
842 #ifdef HAVE_GETRLIMIT
843 #ifdef RLIMIT_NOFILE /* most platforms use RLIMIT_NOFILE */
844 getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
845 #else /* but BSD doesn't ... */
846 getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
847 #endif /* RLIMIT_NOFILE */
848 if (getrlimit_status != 0)
849 ereport(WARNING, (errmsg("getrlimit failed: %m")));
850 #endif /* HAVE_GETRLIMIT */
851
852 /* dup until failure or probe limit reached */
853 for (;;)
854 {
855 int thisfd;
856
857 #ifdef HAVE_GETRLIMIT
858
859 /*
860 * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
861 * some platforms
862 */
863 if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
864 break;
865 #endif
866
867 thisfd = dup(0);
868 if (thisfd < 0)
869 {
870 /* Expect EMFILE or ENFILE, else it's fishy */
871 if (errno != EMFILE && errno != ENFILE)
872 elog(WARNING, "dup(0) failed after %d successes: %m", used);
873 break;
874 }
875
876 if (used >= size)
877 {
878 size *= 2;
879 fd = (int *) repalloc(fd, size * sizeof(int));
880 }
881 fd[used++] = thisfd;
882
883 if (highestfd < thisfd)
884 highestfd = thisfd;
885
886 if (used >= max_to_probe)
887 break;
888 }
889
890 /* release the files we opened */
891 for (j = 0; j < used; j++)
892 close(fd[j]);
893
894 pfree(fd);
895
896 /*
897 * Return results. usable_fds is just the number of successful dups. We
898 * assume that the system limit is highestfd+1 (remember 0 is a legal FD
899 * number) and so already_open is highestfd+1 - usable_fds.
900 */
901 *usable_fds = used;
902 *already_open = highestfd + 1 - used;
903 }
904
905 /*
906 * set_max_safe_fds
907 * Determine number of filedescriptors that fd.c is allowed to use
908 */
909 void
set_max_safe_fds(void)910 set_max_safe_fds(void)
911 {
912 int usable_fds;
913 int already_open;
914
915 /*----------
916 * We want to set max_safe_fds to
917 * MIN(usable_fds, max_files_per_process - already_open)
918 * less the slop factor for files that are opened without consulting
919 * fd.c. This ensures that we won't exceed either max_files_per_process
920 * or the experimentally-determined EMFILE limit.
921 *----------
922 */
923 count_usable_fds(max_files_per_process,
924 &usable_fds, &already_open);
925
926 max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
927
928 /*
929 * Take off the FDs reserved for system() etc.
930 */
931 max_safe_fds -= NUM_RESERVED_FDS;
932
933 /*
934 * Make sure we still have enough to get by.
935 */
936 if (max_safe_fds < FD_MINFREE)
937 ereport(FATAL,
938 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
939 errmsg("insufficient file descriptors available to start server process"),
940 errdetail("System allows %d, we need at least %d.",
941 max_safe_fds + NUM_RESERVED_FDS,
942 FD_MINFREE + NUM_RESERVED_FDS)));
943
944 elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
945 max_safe_fds, usable_fds, already_open);
946 }
947
948 /*
949 * Open a file with BasicOpenFilePerm() and pass default file mode for the
950 * fileMode parameter.
951 */
952 int
BasicOpenFile(const char * fileName,int fileFlags)953 BasicOpenFile(const char *fileName, int fileFlags)
954 {
955 return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
956 }
957
958 /*
959 * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
960 *
961 * This is exported for use by places that really want a plain kernel FD,
962 * but need to be proof against running out of FDs. Once an FD has been
963 * successfully returned, it is the caller's responsibility to ensure that
964 * it will not be leaked on ereport()! Most users should *not* call this
965 * routine directly, but instead use the VFD abstraction level, which
966 * provides protection against descriptor leaks as well as management of
967 * files that need to be open for more than a short period of time.
968 *
969 * Ideally this should be the *only* direct call of open() in the backend.
970 * In practice, the postmaster calls open() directly, and there are some
971 * direct open() calls done early in backend startup. Those are OK since
972 * this module wouldn't have any open files to close at that point anyway.
973 */
974 int
BasicOpenFilePerm(const char * fileName,int fileFlags,mode_t fileMode)975 BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
976 {
977 int fd;
978
979 tryAgain:
980 fd = open(fileName, fileFlags, fileMode);
981
982 if (fd >= 0)
983 return fd; /* success! */
984
985 if (errno == EMFILE || errno == ENFILE)
986 {
987 int save_errno = errno;
988
989 ereport(LOG,
990 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
991 errmsg("out of file descriptors: %m; release and retry")));
992 errno = 0;
993 if (ReleaseLruFile())
994 goto tryAgain;
995 errno = save_errno;
996 }
997
998 return -1; /* failure */
999 }
1000
#if defined(FDDEBUG)

/*
 * _dump_lru --- debugging aid: log the LRU ring as a single line
 *
 * Walks the ring from the most recently used entry, following the
 * lruLessRecently links until it arrives back at the header (slot 0), and
 * emits the visited VFD numbers at LOG level.
 */
static void
_dump_lru(void)
{
	char		line[2048];
	size_t		used;
	int			cur = VfdCache[0].lruLessRecently;

	snprintf(line, sizeof(line), "LRU: MOST %d ", cur);

	while (cur != 0)
	{
		cur = VfdCache[cur].lruLessRecently;
		used = strlen(line);
		snprintf(line + used, sizeof(line) - used, "%d ", cur);
	}

	used = strlen(line);
	snprintf(line + used, sizeof(line) - used, "LEAST");

	elog(LOG, "%s", line);
}
#endif							/* FDDEBUG */
1021
1022 static void
Delete(File file)1023 Delete(File file)
1024 {
1025 Vfd *vfdP;
1026
1027 Assert(file != 0);
1028
1029 DO_DB(elog(LOG, "Delete %d (%s)",
1030 file, VfdCache[file].fileName));
1031 DO_DB(_dump_lru());
1032
1033 vfdP = &VfdCache[file];
1034
1035 VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1036 VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1037
1038 DO_DB(_dump_lru());
1039 }
1040
1041 static void
LruDelete(File file)1042 LruDelete(File file)
1043 {
1044 Vfd *vfdP;
1045
1046 Assert(file != 0);
1047
1048 DO_DB(elog(LOG, "LruDelete %d (%s)",
1049 file, VfdCache[file].fileName));
1050
1051 vfdP = &VfdCache[file];
1052
1053 /*
1054 * Normally we should know the seek position, but if for some reason we
1055 * have lost track of it, try again to get it. If we still can't get it,
1056 * we have a problem: we will be unable to restore the file seek position
1057 * when and if the file is re-opened. But we can't really throw an error
1058 * and refuse to close the file, or activities such as transaction cleanup
1059 * will be broken.
1060 */
1061 if (FilePosIsUnknown(vfdP->seekPos))
1062 {
1063 vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
1064 if (FilePosIsUnknown(vfdP->seekPos))
1065 elog(LOG, "could not seek file \"%s\" before closing: %m",
1066 vfdP->fileName);
1067 }
1068
1069 /*
1070 * Close the file. We aren't expecting this to fail; if it does, better
1071 * to leak the FD than to mess up our internal state.
1072 */
1073 if (close(vfdP->fd))
1074 elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1075 "could not close file \"%s\": %m", vfdP->fileName);
1076 vfdP->fd = VFD_CLOSED;
1077 --nfile;
1078
1079 /* delete the vfd record from the LRU ring */
1080 Delete(file);
1081 }
1082
1083 static void
Insert(File file)1084 Insert(File file)
1085 {
1086 Vfd *vfdP;
1087
1088 Assert(file != 0);
1089
1090 DO_DB(elog(LOG, "Insert %d (%s)",
1091 file, VfdCache[file].fileName));
1092 DO_DB(_dump_lru());
1093
1094 vfdP = &VfdCache[file];
1095
1096 vfdP->lruMoreRecently = 0;
1097 vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1098 VfdCache[0].lruLessRecently = file;
1099 VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1100
1101 DO_DB(_dump_lru());
1102 }
1103
1104 /* returns 0 on success, -1 on re-open failure (with errno set) */
1105 static int
LruInsert(File file)1106 LruInsert(File file)
1107 {
1108 Vfd *vfdP;
1109
1110 Assert(file != 0);
1111
1112 DO_DB(elog(LOG, "LruInsert %d (%s)",
1113 file, VfdCache[file].fileName));
1114
1115 vfdP = &VfdCache[file];
1116
1117 if (FileIsNotOpen(file))
1118 {
1119 /* Close excess kernel FDs. */
1120 ReleaseLruFiles();
1121
1122 /*
1123 * The open could still fail for lack of file descriptors, eg due to
1124 * overall system file table being full. So, be prepared to release
1125 * another FD if necessary...
1126 */
1127 vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1128 vfdP->fileMode);
1129 if (vfdP->fd < 0)
1130 {
1131 DO_DB(elog(LOG, "re-open failed: %m"));
1132 return -1;
1133 }
1134 else
1135 {
1136 ++nfile;
1137 }
1138
1139 /*
1140 * Seek to the right position. We need no special case for seekPos
1141 * equal to FileUnknownPos, as lseek() will certainly reject that
1142 * (thus completing the logic noted in LruDelete() that we will fail
1143 * to re-open a file if we couldn't get its seek position before
1144 * closing).
1145 */
1146 if (vfdP->seekPos != (off_t) 0)
1147 {
1148 if (lseek(vfdP->fd, vfdP->seekPos, SEEK_SET) < 0)
1149 {
1150 /*
1151 * If we fail to restore the seek position, treat it like an
1152 * open() failure.
1153 */
1154 int save_errno = errno;
1155
1156 elog(LOG, "could not seek file \"%s\" after re-opening: %m",
1157 vfdP->fileName);
1158 (void) close(vfdP->fd);
1159 vfdP->fd = VFD_CLOSED;
1160 --nfile;
1161 errno = save_errno;
1162 return -1;
1163 }
1164 }
1165 }
1166
1167 /*
1168 * put it at the head of the Lru ring
1169 */
1170
1171 Insert(file);
1172
1173 return 0;
1174 }
1175
1176 /*
1177 * Release one kernel FD by closing the least-recently-used VFD.
1178 */
1179 static bool
ReleaseLruFile(void)1180 ReleaseLruFile(void)
1181 {
1182 DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1183
1184 if (nfile > 0)
1185 {
1186 /*
1187 * There are opened files and so there should be at least one used vfd
1188 * in the ring.
1189 */
1190 Assert(VfdCache[0].lruMoreRecently != 0);
1191 LruDelete(VfdCache[0].lruMoreRecently);
1192 return true; /* freed a file */
1193 }
1194 return false; /* no files available to free */
1195 }
1196
1197 /*
1198 * Release kernel FDs as needed to get under the max_safe_fds limit.
1199 * After calling this, it's OK to try to open another file.
1200 */
1201 static void
ReleaseLruFiles(void)1202 ReleaseLruFiles(void)
1203 {
1204 while (nfile + numAllocatedDescs >= max_safe_fds)
1205 {
1206 if (!ReleaseLruFile())
1207 break;
1208 }
1209 }
1210
1211 static File
AllocateVfd(void)1212 AllocateVfd(void)
1213 {
1214 Index i;
1215 File file;
1216
1217 DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1218
1219 Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1220
1221 if (VfdCache[0].nextFree == 0)
1222 {
1223 /*
1224 * The free list is empty so it is time to increase the size of the
1225 * array. We choose to double it each time this happens. However,
1226 * there's not much point in starting *real* small.
1227 */
1228 Size newCacheSize = SizeVfdCache * 2;
1229 Vfd *newVfdCache;
1230
1231 if (newCacheSize < 32)
1232 newCacheSize = 32;
1233
1234 /*
1235 * Be careful not to clobber VfdCache ptr if realloc fails.
1236 */
1237 newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1238 if (newVfdCache == NULL)
1239 ereport(ERROR,
1240 (errcode(ERRCODE_OUT_OF_MEMORY),
1241 errmsg("out of memory")));
1242 VfdCache = newVfdCache;
1243
1244 /*
1245 * Initialize the new entries and link them into the free list.
1246 */
1247 for (i = SizeVfdCache; i < newCacheSize; i++)
1248 {
1249 MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1250 VfdCache[i].nextFree = i + 1;
1251 VfdCache[i].fd = VFD_CLOSED;
1252 }
1253 VfdCache[newCacheSize - 1].nextFree = 0;
1254 VfdCache[0].nextFree = SizeVfdCache;
1255
1256 /*
1257 * Record the new size
1258 */
1259 SizeVfdCache = newCacheSize;
1260 }
1261
1262 file = VfdCache[0].nextFree;
1263
1264 VfdCache[0].nextFree = VfdCache[file].nextFree;
1265
1266 return file;
1267 }
1268
1269 static void
FreeVfd(File file)1270 FreeVfd(File file)
1271 {
1272 Vfd *vfdP = &VfdCache[file];
1273
1274 DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1275 file, vfdP->fileName ? vfdP->fileName : ""));
1276
1277 if (vfdP->fileName != NULL)
1278 {
1279 free(vfdP->fileName);
1280 vfdP->fileName = NULL;
1281 }
1282 vfdP->fdstate = 0x0;
1283
1284 vfdP->nextFree = VfdCache[0].nextFree;
1285 VfdCache[0].nextFree = file;
1286 }
1287
1288 /* returns 0 on success, -1 on re-open failure (with errno set) */
1289 static int
FileAccess(File file)1290 FileAccess(File file)
1291 {
1292 int returnValue;
1293
1294 DO_DB(elog(LOG, "FileAccess %d (%s)",
1295 file, VfdCache[file].fileName));
1296
1297 /*
1298 * Is the file open? If not, open it and put it at the head of the LRU
1299 * ring (possibly closing the least recently used file to get an FD).
1300 */
1301
1302 if (FileIsNotOpen(file))
1303 {
1304 returnValue = LruInsert(file);
1305 if (returnValue != 0)
1306 return returnValue;
1307 }
1308 else if (VfdCache[0].lruLessRecently != file)
1309 {
1310 /*
1311 * We now know that the file is open and that it is not the last one
1312 * accessed, so we need to move it to the head of the Lru ring.
1313 */
1314
1315 Delete(file);
1316 Insert(file);
1317 }
1318
1319 return 0;
1320 }
1321
1322 /*
1323 * Called whenever a temporary file is deleted to report its size.
1324 */
1325 static void
ReportTemporaryFileUsage(const char * path,off_t size)1326 ReportTemporaryFileUsage(const char *path, off_t size)
1327 {
1328 pgstat_report_tempfile(size);
1329
1330 if (log_temp_files >= 0)
1331 {
1332 if ((size / 1024) >= log_temp_files)
1333 ereport(LOG,
1334 (errmsg("temporary file: path \"%s\", size %lu",
1335 path, (unsigned long) size)));
1336 }
1337 }
1338
1339 /*
1340 * Called to register a temporary file for automatic close.
1341 * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
1342 * before the file was opened.
1343 */
1344 static void
RegisterTemporaryFile(File file)1345 RegisterTemporaryFile(File file)
1346 {
1347 ResourceOwnerRememberFile(CurrentResourceOwner, file);
1348 VfdCache[file].resowner = CurrentResourceOwner;
1349
1350 /* Backup mechanism for closing at end of xact. */
1351 VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
1352 have_xact_temporary_files = true;
1353 }
1354
/*
 * FileInvalidate --- called when a shared invalidation message arrives for
 * some relation
 *
 * Closes the kernel FD of the given VFD if it currently has one.  Compiled
 * only when NOT_USED is defined.
 */
#ifdef NOT_USED
void
FileInvalidate(File file)
{
	Assert(FileIsValid(file));

	if (FileIsNotOpen(file))
		return;

	LruDelete(file);
}
#endif
1367
1368 /*
1369 * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1370 * fileMode parameter.
1371 */
1372 File
PathNameOpenFile(const char * fileName,int fileFlags)1373 PathNameOpenFile(const char *fileName, int fileFlags)
1374 {
1375 return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1376 }
1377
1378 /*
1379 * open a file in an arbitrary directory
1380 *
1381 * NB: if the passed pathname is relative (which it usually is),
1382 * it will be interpreted relative to the process' working directory
1383 * (which should always be $PGDATA when this code is running).
1384 */
1385 File
PathNameOpenFilePerm(const char * fileName,int fileFlags,mode_t fileMode)1386 PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1387 {
1388 char *fnamecopy;
1389 File file;
1390 Vfd *vfdP;
1391
1392 DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1393 fileName, fileFlags, fileMode));
1394
1395 /*
1396 * We need a malloc'd copy of the file name; fail cleanly if no room.
1397 */
1398 fnamecopy = strdup(fileName);
1399 if (fnamecopy == NULL)
1400 ereport(ERROR,
1401 (errcode(ERRCODE_OUT_OF_MEMORY),
1402 errmsg("out of memory")));
1403
1404 file = AllocateVfd();
1405 vfdP = &VfdCache[file];
1406
1407 /* Close excess kernel FDs. */
1408 ReleaseLruFiles();
1409
1410 vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1411
1412 if (vfdP->fd < 0)
1413 {
1414 int save_errno = errno;
1415
1416 FreeVfd(file);
1417 free(fnamecopy);
1418 errno = save_errno;
1419 return -1;
1420 }
1421 ++nfile;
1422 DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1423 vfdP->fd));
1424
1425 vfdP->fileName = fnamecopy;
1426 /* Saved flags are adjusted to be OK for re-opening file */
1427 vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1428 vfdP->fileMode = fileMode;
1429 vfdP->seekPos = 0;
1430 vfdP->fileSize = 0;
1431 vfdP->fdstate = 0x0;
1432 vfdP->resowner = NULL;
1433
1434 Insert(file);
1435
1436 return file;
1437 }
1438
1439 /*
1440 * Create directory 'directory'. If necessary, create 'basedir', which must
1441 * be the directory above it. This is designed for creating the top-level
1442 * temporary directory on demand before creating a directory underneath it.
1443 * Do nothing if the directory already exists.
1444 *
1445 * Directories created within the top-level temporary directory should begin
1446 * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1447 * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1448 * that do not need any particular prefix.
1449 */
1450 void
PathNameCreateTemporaryDir(const char * basedir,const char * directory)1451 PathNameCreateTemporaryDir(const char *basedir, const char *directory)
1452 {
1453 if (MakePGDirectory(directory) < 0)
1454 {
1455 if (errno == EEXIST)
1456 return;
1457
1458 /*
1459 * Failed. Try to create basedir first in case it's missing. Tolerate
1460 * EEXIST to close a race against another process following the same
1461 * algorithm.
1462 */
1463 if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1464 ereport(ERROR,
1465 (errcode_for_file_access(),
1466 errmsg("cannot create temporary directory \"%s\": %m",
1467 basedir)));
1468
1469 /* Try again. */
1470 if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1471 ereport(ERROR,
1472 (errcode_for_file_access(),
1473 errmsg("cannot create temporary subdirectory \"%s\": %m",
1474 directory)));
1475 }
1476 }
1477
1478 /*
1479 * Delete a directory and everything in it, if it exists.
1480 */
1481 void
PathNameDeleteTemporaryDir(const char * dirname)1482 PathNameDeleteTemporaryDir(const char *dirname)
1483 {
1484 struct stat statbuf;
1485
1486 /* Silently ignore missing directory. */
1487 if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1488 return;
1489
1490 /*
1491 * Currently, walkdir doesn't offer a way for our passed in function to
1492 * maintain state. Perhaps it should, so that we could tell the caller
1493 * whether this operation succeeded or failed. Since this operation is
1494 * used in a cleanup path, we wouldn't actually behave differently: we'll
1495 * just log failures.
1496 */
1497 walkdir(dirname, unlink_if_exists_fname, false, LOG);
1498 }
1499
1500 /*
1501 * Open a temporary file that will disappear when we close it.
1502 *
1503 * This routine takes care of generating an appropriate tempfile name.
1504 * There's no need to pass in fileFlags or fileMode either, since only
1505 * one setting makes any sense for a temp file.
1506 *
1507 * Unless interXact is true, the file is remembered by CurrentResourceOwner
1508 * to ensure it's closed and deleted when it's no longer needed, typically at
1509 * the end-of-transaction. In most cases, you don't want temporary files to
1510 * outlive the transaction that created them, so this should be false -- but
1511 * if you need "somewhat" temporary storage, this might be useful. In either
1512 * case, the file is removed when the File is explicitly closed.
1513 */
1514 File
OpenTemporaryFile(bool interXact)1515 OpenTemporaryFile(bool interXact)
1516 {
1517 File file = 0;
1518
1519 /*
1520 * Make sure the current resource owner has space for this File before we
1521 * open it, if we'll be registering it below.
1522 */
1523 if (!interXact)
1524 ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1525
1526 /*
1527 * If some temp tablespace(s) have been given to us, try to use the next
1528 * one. If a given tablespace can't be found, we silently fall back to
1529 * the database's default tablespace.
1530 *
1531 * BUT: if the temp file is slated to outlive the current transaction,
1532 * force it into the database's default tablespace, so that it will not
1533 * pose a threat to possible tablespace drop attempts.
1534 */
1535 if (numTempTableSpaces > 0 && !interXact)
1536 {
1537 Oid tblspcOid = GetNextTempTableSpace();
1538
1539 if (OidIsValid(tblspcOid))
1540 file = OpenTemporaryFileInTablespace(tblspcOid, false);
1541 }
1542
1543 /*
1544 * If not, or if tablespace is bad, create in database's default
1545 * tablespace. MyDatabaseTableSpace should normally be set before we get
1546 * here, but just in case it isn't, fall back to pg_default tablespace.
1547 */
1548 if (file <= 0)
1549 file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
1550 MyDatabaseTableSpace :
1551 DEFAULTTABLESPACE_OID,
1552 true);
1553
1554 /* Mark it for deletion at close and temporary file size limit */
1555 VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
1556
1557 /* Register it with the current resource owner */
1558 if (!interXact)
1559 RegisterTemporaryFile(file);
1560
1561 return file;
1562 }
1563
1564 /*
1565 * Return the path of the temp directory in a given tablespace.
1566 */
1567 void
TempTablespacePath(char * path,Oid tablespace)1568 TempTablespacePath(char *path, Oid tablespace)
1569 {
1570 /*
1571 * Identify the tempfile directory for this tablespace.
1572 *
1573 * If someone tries to specify pg_global, use pg_default instead.
1574 */
1575 if (tablespace == InvalidOid ||
1576 tablespace == DEFAULTTABLESPACE_OID ||
1577 tablespace == GLOBALTABLESPACE_OID)
1578 snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1579 else
1580 {
1581 /* All other tablespaces are accessed via symlinks */
1582 snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
1583 tablespace, TABLESPACE_VERSION_DIRECTORY,
1584 PG_TEMP_FILES_DIR);
1585 }
1586 }
1587
1588 /*
1589 * Open a temporary file in a specific tablespace.
1590 * Subroutine for OpenTemporaryFile, which see for details.
1591 */
1592 static File
OpenTemporaryFileInTablespace(Oid tblspcOid,bool rejectError)1593 OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1594 {
1595 char tempdirpath[MAXPGPATH];
1596 char tempfilepath[MAXPGPATH];
1597 File file;
1598
1599 TempTablespacePath(tempdirpath, tblspcOid);
1600
1601 /*
1602 * Generate a tempfile name that should be unique within the current
1603 * database instance.
1604 */
1605 snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1606 tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1607
1608 /*
1609 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1610 * temp file that can be reused.
1611 */
1612 file = PathNameOpenFile(tempfilepath,
1613 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1614 if (file <= 0)
1615 {
1616 /*
1617 * We might need to create the tablespace's tempfile directory, if no
1618 * one has yet done so.
1619 *
1620 * Don't check for an error from MakePGDirectory; it could fail if
1621 * someone else just did the same thing. If it doesn't work then
1622 * we'll bomb out on the second create attempt, instead.
1623 */
1624 (void) MakePGDirectory(tempdirpath);
1625
1626 file = PathNameOpenFile(tempfilepath,
1627 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1628 if (file <= 0 && rejectError)
1629 elog(ERROR, "could not create temporary file \"%s\": %m",
1630 tempfilepath);
1631 }
1632
1633 return file;
1634 }
1635
1636
1637 /*
1638 * Create a new file. The directory containing it must already exist. Files
1639 * created this way are subject to temp_file_limit and are automatically
1640 * closed at end of transaction, but are not automatically deleted on close
1641 * because they are intended to be shared between cooperating backends.
1642 *
1643 * If the file is inside the top-level temporary directory, its name should
1644 * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1645 * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1646 * inside a directory created with PathnameCreateTemporaryDir(), in which case
1647 * the prefix isn't needed.
1648 */
1649 File
PathNameCreateTemporaryFile(const char * path,bool error_on_failure)1650 PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1651 {
1652 File file;
1653
1654 ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1655
1656 /*
1657 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1658 * temp file that can be reused.
1659 */
1660 file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1661 if (file <= 0)
1662 {
1663 if (error_on_failure)
1664 ereport(ERROR,
1665 (errcode_for_file_access(),
1666 errmsg("could not create temporary file \"%s\": %m",
1667 path)));
1668 else
1669 return file;
1670 }
1671
1672 /* Mark it for temp_file_limit accounting. */
1673 VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
1674
1675 /* Register it for automatic close. */
1676 RegisterTemporaryFile(file);
1677
1678 return file;
1679 }
1680
1681 /*
1682 * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1683 * another backend. Files opened this way don't count against the
1684 * temp_file_limit of the caller, are read-only and are automatically closed
1685 * at the end of the transaction but are not deleted on close.
1686 */
1687 File
PathNameOpenTemporaryFile(const char * path)1688 PathNameOpenTemporaryFile(const char *path)
1689 {
1690 File file;
1691
1692 ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1693
1694 /* We open the file read-only. */
1695 file = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
1696
1697 /* If no such file, then we don't raise an error. */
1698 if (file <= 0 && errno != ENOENT)
1699 ereport(ERROR,
1700 (errcode_for_file_access(),
1701 errmsg("could not open temporary file \"%s\": %m",
1702 path)));
1703
1704 if (file > 0)
1705 {
1706 /* Register it for automatic close. */
1707 RegisterTemporaryFile(file);
1708 }
1709
1710 return file;
1711 }
1712
1713 /*
1714 * Delete a file by pathname. Return true if the file existed, false if
1715 * didn't.
1716 */
1717 bool
PathNameDeleteTemporaryFile(const char * path,bool error_on_failure)1718 PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1719 {
1720 struct stat filestats;
1721 int stat_errno;
1722
1723 /* Get the final size for pgstat reporting. */
1724 if (stat(path, &filestats) != 0)
1725 stat_errno = errno;
1726 else
1727 stat_errno = 0;
1728
1729 /*
1730 * Unlike FileClose's automatic file deletion code, we tolerate
1731 * non-existence to support BufFileDeleteShared which doesn't know how
1732 * many segments it has to delete until it runs out.
1733 */
1734 if (stat_errno == ENOENT)
1735 return false;
1736
1737 if (unlink(path) < 0)
1738 {
1739 if (errno != ENOENT)
1740 ereport(error_on_failure ? ERROR : LOG,
1741 (errcode_for_file_access(),
1742 errmsg("cannot unlink temporary file \"%s\": %m",
1743 path)));
1744 return false;
1745 }
1746
1747 if (stat_errno == 0)
1748 ReportTemporaryFileUsage(path, filestats.st_size);
1749 else
1750 {
1751 errno = stat_errno;
1752 ereport(LOG,
1753 (errcode_for_file_access(),
1754 errmsg("could not stat file \"%s\": %m", path)));
1755 }
1756
1757 return true;
1758 }
1759
1760 /*
1761 * close a file when done with it
1762 */
1763 void
FileClose(File file)1764 FileClose(File file)
1765 {
1766 Vfd *vfdP;
1767
1768 Assert(FileIsValid(file));
1769
1770 DO_DB(elog(LOG, "FileClose: %d (%s)",
1771 file, VfdCache[file].fileName));
1772
1773 vfdP = &VfdCache[file];
1774
1775 if (!FileIsNotOpen(file))
1776 {
1777 /* close the file */
1778 if (close(vfdP->fd))
1779 {
1780 /*
1781 * We may need to panic on failure to close non-temporary files;
1782 * see LruDelete.
1783 */
1784 elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1785 "could not close file \"%s\": %m", vfdP->fileName);
1786 }
1787
1788 --nfile;
1789 vfdP->fd = VFD_CLOSED;
1790
1791 /* remove the file from the lru ring */
1792 Delete(file);
1793 }
1794
1795 if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1796 {
1797 /* Subtract its size from current usage (do first in case of error) */
1798 temporary_files_size -= vfdP->fileSize;
1799 vfdP->fileSize = 0;
1800 }
1801
1802 /*
1803 * Delete the file if it was temporary, and make a log entry if wanted
1804 */
1805 if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
1806 {
1807 struct stat filestats;
1808 int stat_errno;
1809
1810 /*
1811 * If we get an error, as could happen within the ereport/elog calls,
1812 * we'll come right back here during transaction abort. Reset the
1813 * flag to ensure that we can't get into an infinite loop. This code
1814 * is arranged to ensure that the worst-case consequence is failing to
1815 * emit log message(s), not failing to attempt the unlink.
1816 */
1817 vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
1818
1819
1820 /* first try the stat() */
1821 if (stat(vfdP->fileName, &filestats))
1822 stat_errno = errno;
1823 else
1824 stat_errno = 0;
1825
1826 /* in any case do the unlink */
1827 if (unlink(vfdP->fileName))
1828 elog(LOG, "could not unlink file \"%s\": %m", vfdP->fileName);
1829
1830 /* and last report the stat results */
1831 if (stat_errno == 0)
1832 ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
1833 else
1834 {
1835 errno = stat_errno;
1836 elog(LOG, "could not stat file \"%s\": %m", vfdP->fileName);
1837 }
1838 }
1839
1840 /* Unregister it from the resource owner */
1841 if (vfdP->resowner)
1842 ResourceOwnerForgetFile(vfdP->resowner, file);
1843
1844 /*
1845 * Return the Vfd slot to the free list
1846 */
1847 FreeVfd(file);
1848 }
1849
1850 /*
1851 * FilePrefetch - initiate asynchronous read of a given range of the file.
1852 * The logical seek position is unaffected.
1853 *
1854 * Currently the only implementation of this function is using posix_fadvise
1855 * which is the simplest standardized interface that accomplishes this.
1856 * We could add an implementation using libaio in the future; but note that
1857 * this API is inappropriate for libaio, which wants to have a buffer provided
1858 * to read into.
1859 */
1860 int
FilePrefetch(File file,off_t offset,int amount,uint32 wait_event_info)1861 FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
1862 {
1863 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
1864 int returnCode;
1865
1866 Assert(FileIsValid(file));
1867
1868 DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
1869 file, VfdCache[file].fileName,
1870 (int64) offset, amount));
1871
1872 returnCode = FileAccess(file);
1873 if (returnCode < 0)
1874 return returnCode;
1875
1876 pgstat_report_wait_start(wait_event_info);
1877 returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
1878 POSIX_FADV_WILLNEED);
1879 pgstat_report_wait_end();
1880
1881 return returnCode;
1882 #else
1883 Assert(FileIsValid(file));
1884 return 0;
1885 #endif
1886 }
1887
1888 void
FileWriteback(File file,off_t offset,off_t nbytes,uint32 wait_event_info)1889 FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
1890 {
1891 int returnCode;
1892
1893 Assert(FileIsValid(file));
1894
1895 DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
1896 file, VfdCache[file].fileName,
1897 (int64) offset, (int64) nbytes));
1898
1899 /*
1900 * Caution: do not call pg_flush_data with nbytes = 0, it could trash the
1901 * file's seek position. We prefer to define that as a no-op here.
1902 */
1903 if (nbytes <= 0)
1904 return;
1905
1906 returnCode = FileAccess(file);
1907 if (returnCode < 0)
1908 return;
1909
1910 pgstat_report_wait_start(wait_event_info);
1911 pg_flush_data(VfdCache[file].fd, offset, nbytes);
1912 pgstat_report_wait_end();
1913 }
1914
1915 int
FileRead(File file,char * buffer,int amount,uint32 wait_event_info)1916 FileRead(File file, char *buffer, int amount, uint32 wait_event_info)
1917 {
1918 int returnCode;
1919 Vfd *vfdP;
1920
1921 Assert(FileIsValid(file));
1922
1923 DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
1924 file, VfdCache[file].fileName,
1925 (int64) VfdCache[file].seekPos,
1926 amount, buffer));
1927
1928 returnCode = FileAccess(file);
1929 if (returnCode < 0)
1930 return returnCode;
1931
1932 vfdP = &VfdCache[file];
1933
1934 retry:
1935 pgstat_report_wait_start(wait_event_info);
1936 returnCode = read(vfdP->fd, buffer, amount);
1937 pgstat_report_wait_end();
1938
1939 if (returnCode >= 0)
1940 {
1941 /* if seekPos is unknown, leave it that way */
1942 if (!FilePosIsUnknown(vfdP->seekPos))
1943 vfdP->seekPos += returnCode;
1944 }
1945 else
1946 {
1947 /*
1948 * Windows may run out of kernel buffers and return "Insufficient
1949 * system resources" error. Wait a bit and retry to solve it.
1950 *
1951 * It is rumored that EINTR is also possible on some Unix filesystems,
1952 * in which case immediate retry is indicated.
1953 */
1954 #ifdef WIN32
1955 DWORD error = GetLastError();
1956
1957 switch (error)
1958 {
1959 case ERROR_NO_SYSTEM_RESOURCES:
1960 pg_usleep(1000L);
1961 errno = EINTR;
1962 break;
1963 default:
1964 _dosmaperr(error);
1965 break;
1966 }
1967 #endif
1968 /* OK to retry if interrupted */
1969 if (errno == EINTR)
1970 goto retry;
1971
1972 /* Trouble, so assume we don't know the file position anymore */
1973 vfdP->seekPos = FileUnknownPos;
1974 }
1975
1976 return returnCode;
1977 }
1978
1979 int
FileWrite(File file,char * buffer,int amount,uint32 wait_event_info)1980 FileWrite(File file, char *buffer, int amount, uint32 wait_event_info)
1981 {
1982 int returnCode;
1983 Vfd *vfdP;
1984
1985 Assert(FileIsValid(file));
1986
1987 DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
1988 file, VfdCache[file].fileName,
1989 (int64) VfdCache[file].seekPos,
1990 amount, buffer));
1991
1992 returnCode = FileAccess(file);
1993 if (returnCode < 0)
1994 return returnCode;
1995
1996 vfdP = &VfdCache[file];
1997
1998 /*
1999 * If enforcing temp_file_limit and it's a temp file, check to see if the
2000 * write would overrun temp_file_limit, and throw error if so. Note: it's
2001 * really a modularity violation to throw error here; we should set errno
2002 * and return -1. However, there's no way to report a suitable error
2003 * message if we do that. All current callers would just throw error
2004 * immediately anyway, so this is safe at present.
2005 */
2006 if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2007 {
2008 off_t newPos;
2009
2010 /*
2011 * Normally we should know the seek position, but if for some reason
2012 * we have lost track of it, try again to get it. Here, it's fine to
2013 * throw an error if we still can't get it.
2014 */
2015 if (FilePosIsUnknown(vfdP->seekPos))
2016 {
2017 vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
2018 if (FilePosIsUnknown(vfdP->seekPos))
2019 elog(ERROR, "could not seek file \"%s\": %m", vfdP->fileName);
2020 }
2021
2022 newPos = vfdP->seekPos + amount;
2023 if (newPos > vfdP->fileSize)
2024 {
2025 uint64 newTotal = temporary_files_size;
2026
2027 newTotal += newPos - vfdP->fileSize;
2028 if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2029 ereport(ERROR,
2030 (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
2031 errmsg("temporary file size exceeds temp_file_limit (%dkB)",
2032 temp_file_limit)));
2033 }
2034 }
2035
2036 retry:
2037 errno = 0;
2038 pgstat_report_wait_start(wait_event_info);
2039 returnCode = write(vfdP->fd, buffer, amount);
2040 pgstat_report_wait_end();
2041
2042 /* if write didn't set errno, assume problem is no disk space */
2043 if (returnCode != amount && errno == 0)
2044 errno = ENOSPC;
2045
2046 if (returnCode >= 0)
2047 {
2048 /* if seekPos is unknown, leave it that way */
2049 if (!FilePosIsUnknown(vfdP->seekPos))
2050 vfdP->seekPos += returnCode;
2051
2052 /*
2053 * Maintain fileSize and temporary_files_size if it's a temp file.
2054 *
2055 * If seekPos is -1 (unknown), this will do nothing; but we could only
2056 * get here in that state if we're not enforcing temporary_files_size,
2057 * so we don't care.
2058 */
2059 if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2060 {
2061 off_t newPos = vfdP->seekPos;
2062
2063 if (newPos > vfdP->fileSize)
2064 {
2065 temporary_files_size += newPos - vfdP->fileSize;
2066 vfdP->fileSize = newPos;
2067 }
2068 }
2069 }
2070 else
2071 {
2072 /*
2073 * See comments in FileRead()
2074 */
2075 #ifdef WIN32
2076 DWORD error = GetLastError();
2077
2078 switch (error)
2079 {
2080 case ERROR_NO_SYSTEM_RESOURCES:
2081 pg_usleep(1000L);
2082 errno = EINTR;
2083 break;
2084 default:
2085 _dosmaperr(error);
2086 break;
2087 }
2088 #endif
2089 /* OK to retry if interrupted */
2090 if (errno == EINTR)
2091 goto retry;
2092
2093 /* Trouble, so assume we don't know the file position anymore */
2094 vfdP->seekPos = FileUnknownPos;
2095 }
2096
2097 return returnCode;
2098 }
2099
2100 int
FileSync(File file,uint32 wait_event_info)2101 FileSync(File file, uint32 wait_event_info)
2102 {
2103 int returnCode;
2104
2105 Assert(FileIsValid(file));
2106
2107 DO_DB(elog(LOG, "FileSync: %d (%s)",
2108 file, VfdCache[file].fileName));
2109
2110 returnCode = FileAccess(file);
2111 if (returnCode < 0)
2112 return returnCode;
2113
2114 pgstat_report_wait_start(wait_event_info);
2115 returnCode = pg_fsync(VfdCache[file].fd);
2116 pgstat_report_wait_end();
2117
2118 return returnCode;
2119 }
2120
2121 off_t
FileSeek(File file,off_t offset,int whence)2122 FileSeek(File file, off_t offset, int whence)
2123 {
2124 Vfd *vfdP;
2125
2126 Assert(FileIsValid(file));
2127
2128 DO_DB(elog(LOG, "FileSeek: %d (%s) " INT64_FORMAT " " INT64_FORMAT " %d",
2129 file, VfdCache[file].fileName,
2130 (int64) VfdCache[file].seekPos,
2131 (int64) offset, whence));
2132
2133 vfdP = &VfdCache[file];
2134
2135 if (FileIsNotOpen(file))
2136 {
2137 switch (whence)
2138 {
2139 case SEEK_SET:
2140 if (offset < 0)
2141 {
2142 errno = EINVAL;
2143 return (off_t) -1;
2144 }
2145 vfdP->seekPos = offset;
2146 break;
2147 case SEEK_CUR:
2148 if (FilePosIsUnknown(vfdP->seekPos) ||
2149 vfdP->seekPos + offset < 0)
2150 {
2151 errno = EINVAL;
2152 return (off_t) -1;
2153 }
2154 vfdP->seekPos += offset;
2155 break;
2156 case SEEK_END:
2157 if (FileAccess(file) < 0)
2158 return (off_t) -1;
2159 vfdP->seekPos = lseek(vfdP->fd, offset, whence);
2160 break;
2161 default:
2162 elog(ERROR, "invalid whence: %d", whence);
2163 break;
2164 }
2165 }
2166 else
2167 {
2168 switch (whence)
2169 {
2170 case SEEK_SET:
2171 if (offset < 0)
2172 {
2173 errno = EINVAL;
2174 return (off_t) -1;
2175 }
2176 if (vfdP->seekPos != offset)
2177 vfdP->seekPos = lseek(vfdP->fd, offset, whence);
2178 break;
2179 case SEEK_CUR:
2180 if (offset != 0 || FilePosIsUnknown(vfdP->seekPos))
2181 vfdP->seekPos = lseek(vfdP->fd, offset, whence);
2182 break;
2183 case SEEK_END:
2184 vfdP->seekPos = lseek(vfdP->fd, offset, whence);
2185 break;
2186 default:
2187 elog(ERROR, "invalid whence: %d", whence);
2188 break;
2189 }
2190 }
2191
2192 return vfdP->seekPos;
2193 }
2194
/*
 * FileTell --- return the seek position currently recorded for a VFD.
 *
 * XXX not actually used but here for completeness; it is only compiled when
 * NOT_USED is defined.
 */
#ifdef NOT_USED
off_t
FileTell(File file)
{
	Vfd		   *vfdP;

	Assert(FileIsValid(file));
	DO_DB(elog(LOG, "FileTell %d (%s)",
			   file, VfdCache[file].fileName));

	vfdP = &VfdCache[file];
	return vfdP->seekPos;
}
#endif
2208
2209 int
FileTruncate(File file,off_t offset,uint32 wait_event_info)2210 FileTruncate(File file, off_t offset, uint32 wait_event_info)
2211 {
2212 int returnCode;
2213
2214 Assert(FileIsValid(file));
2215
2216 DO_DB(elog(LOG, "FileTruncate %d (%s)",
2217 file, VfdCache[file].fileName));
2218
2219 returnCode = FileAccess(file);
2220 if (returnCode < 0)
2221 return returnCode;
2222
2223 pgstat_report_wait_start(wait_event_info);
2224 returnCode = ftruncate(VfdCache[file].fd, offset);
2225 pgstat_report_wait_end();
2226
2227 if (returnCode == 0 && VfdCache[file].fileSize > offset)
2228 {
2229 /* adjust our state for truncation of a temp file */
2230 Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2231 temporary_files_size -= VfdCache[file].fileSize - offset;
2232 VfdCache[file].fileSize = offset;
2233 }
2234
2235 return returnCode;
2236 }
2237
2238 /*
2239 * Return the pathname associated with an open file.
2240 *
2241 * The returned string points to an internal buffer, which is valid until
2242 * the file is closed.
2243 */
2244 char *
FilePathName(File file)2245 FilePathName(File file)
2246 {
2247 Assert(FileIsValid(file));
2248
2249 return VfdCache[file].fileName;
2250 }
2251
2252 /*
2253 * Return the raw file descriptor of an opened file.
2254 *
2255 * The returned file descriptor will be valid until the file is closed, but
2256 * there are a lot of things that can make that happen. So the caller should
2257 * be careful not to do much of anything else before it finishes using the
2258 * returned file descriptor.
2259 */
2260 int
FileGetRawDesc(File file)2261 FileGetRawDesc(File file)
2262 {
2263 Assert(FileIsValid(file));
2264 return VfdCache[file].fd;
2265 }
2266
2267 /*
2268 * FileGetRawFlags - returns the file flags on open(2)
2269 */
2270 int
FileGetRawFlags(File file)2271 FileGetRawFlags(File file)
2272 {
2273 Assert(FileIsValid(file));
2274 return VfdCache[file].fileFlags;
2275 }
2276
2277 /*
2278 * FileGetRawMode - returns the mode bitmask passed to open(2)
2279 */
2280 mode_t
FileGetRawMode(File file)2281 FileGetRawMode(File file)
2282 {
2283 Assert(FileIsValid(file));
2284 return VfdCache[file].fileMode;
2285 }
2286
2287 /*
2288 * Make room for another allocatedDescs[] array entry if needed and possible.
2289 * Returns true if an array element is available.
2290 */
2291 static bool
reserveAllocatedDesc(void)2292 reserveAllocatedDesc(void)
2293 {
2294 AllocateDesc *newDescs;
2295 int newMax;
2296
2297 /* Quick out if array already has a free slot. */
2298 if (numAllocatedDescs < maxAllocatedDescs)
2299 return true;
2300
2301 /*
2302 * If the array hasn't yet been created in the current process, initialize
2303 * it with FD_MINFREE / 2 elements. In many scenarios this is as many as
2304 * we will ever need, anyway. We don't want to look at max_safe_fds
2305 * immediately because set_max_safe_fds() may not have run yet.
2306 */
2307 if (allocatedDescs == NULL)
2308 {
2309 newMax = FD_MINFREE / 2;
2310 newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2311 /* Out of memory already? Treat as fatal error. */
2312 if (newDescs == NULL)
2313 ereport(ERROR,
2314 (errcode(ERRCODE_OUT_OF_MEMORY),
2315 errmsg("out of memory")));
2316 allocatedDescs = newDescs;
2317 maxAllocatedDescs = newMax;
2318 return true;
2319 }
2320
2321 /*
2322 * Consider enlarging the array beyond the initial allocation used above.
2323 * By the time this happens, max_safe_fds should be known accurately.
2324 *
2325 * We mustn't let allocated descriptors hog all the available FDs, and in
2326 * practice we'd better leave a reasonable number of FDs for VFD use. So
2327 * set the maximum to max_safe_fds / 2. (This should certainly be at
2328 * least as large as the initial size, FD_MINFREE / 2.)
2329 */
2330 newMax = max_safe_fds / 2;
2331 if (newMax > maxAllocatedDescs)
2332 {
2333 newDescs = (AllocateDesc *) realloc(allocatedDescs,
2334 newMax * sizeof(AllocateDesc));
2335 /* Treat out-of-memory as a non-fatal error. */
2336 if (newDescs == NULL)
2337 return false;
2338 allocatedDescs = newDescs;
2339 maxAllocatedDescs = newMax;
2340 return true;
2341 }
2342
2343 /* Can't enlarge allocatedDescs[] any more. */
2344 return false;
2345 }
2346
2347 /*
2348 * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2349 * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2350 * necessary to open the file. When done, call FreeFile rather than fclose.
2351 *
2352 * Note that files that will be open for any significant length of time
2353 * should NOT be handled this way, since they cannot share kernel file
2354 * descriptors with other files; there is grave risk of running out of FDs
2355 * if anyone locks down too many FDs. Most callers of this routine are
2356 * simply reading a config file that they will read and close immediately.
2357 *
2358 * fd.c will automatically close all files opened with AllocateFile at
2359 * transaction commit or abort; this prevents FD leakage if a routine
2360 * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2361 *
2362 * Ideally this should be the *only* direct call of fopen() in the backend.
2363 */
2364 FILE *
AllocateFile(const char * name,const char * mode)2365 AllocateFile(const char *name, const char *mode)
2366 {
2367 FILE *file;
2368
2369 DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2370 numAllocatedDescs, name));
2371
2372 /* Can we allocate another non-virtual FD? */
2373 if (!reserveAllocatedDesc())
2374 ereport(ERROR,
2375 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2376 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2377 maxAllocatedDescs, name)));
2378
2379 /* Close excess kernel FDs. */
2380 ReleaseLruFiles();
2381
2382 TryAgain:
2383 if ((file = fopen(name, mode)) != NULL)
2384 {
2385 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2386
2387 desc->kind = AllocateDescFile;
2388 desc->desc.file = file;
2389 desc->create_subid = GetCurrentSubTransactionId();
2390 numAllocatedDescs++;
2391 return desc->desc.file;
2392 }
2393
2394 if (errno == EMFILE || errno == ENFILE)
2395 {
2396 int save_errno = errno;
2397
2398 ereport(LOG,
2399 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2400 errmsg("out of file descriptors: %m; release and retry")));
2401 errno = 0;
2402 if (ReleaseLruFile())
2403 goto TryAgain;
2404 errno = save_errno;
2405 }
2406
2407 return NULL;
2408 }
2409
2410 /*
2411 * Open a file with OpenTransientFilePerm() and pass default file mode for
2412 * the fileMode parameter.
2413 */
2414 int
OpenTransientFile(const char * fileName,int fileFlags)2415 OpenTransientFile(const char *fileName, int fileFlags)
2416 {
2417 return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2418 }
2419
2420 /*
2421 * Like AllocateFile, but returns an unbuffered fd like open(2)
2422 */
2423 int
OpenTransientFilePerm(const char * fileName,int fileFlags,mode_t fileMode)2424 OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
2425 {
2426 int fd;
2427
2428 DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2429 numAllocatedDescs, fileName));
2430
2431 /* Can we allocate another non-virtual FD? */
2432 if (!reserveAllocatedDesc())
2433 ereport(ERROR,
2434 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2435 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2436 maxAllocatedDescs, fileName)));
2437
2438 /* Close excess kernel FDs. */
2439 ReleaseLruFiles();
2440
2441 fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2442
2443 if (fd >= 0)
2444 {
2445 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2446
2447 desc->kind = AllocateDescRawFD;
2448 desc->desc.fd = fd;
2449 desc->create_subid = GetCurrentSubTransactionId();
2450 numAllocatedDescs++;
2451
2452 return fd;
2453 }
2454
2455 return -1; /* failure */
2456 }
2457
2458 /*
2459 * Routines that want to initiate a pipe stream should use OpenPipeStream
2460 * rather than plain popen(). This lets fd.c deal with freeing FDs if
2461 * necessary. When done, call ClosePipeStream rather than pclose.
2462 *
2463 * This function also ensures that the popen'd program is run with default
2464 * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2465 * uses. This ensures desirable response to, eg, closing a read pipe early.
2466 */
2467 FILE *
OpenPipeStream(const char * command,const char * mode)2468 OpenPipeStream(const char *command, const char *mode)
2469 {
2470 FILE *file;
2471 int save_errno;
2472
2473 DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2474 numAllocatedDescs, command));
2475
2476 /* Can we allocate another non-virtual FD? */
2477 if (!reserveAllocatedDesc())
2478 ereport(ERROR,
2479 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2480 errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2481 maxAllocatedDescs, command)));
2482
2483 /* Close excess kernel FDs. */
2484 ReleaseLruFiles();
2485
2486 TryAgain:
2487 fflush(stdout);
2488 fflush(stderr);
2489 pqsignal(SIGPIPE, SIG_DFL);
2490 errno = 0;
2491 file = popen(command, mode);
2492 save_errno = errno;
2493 pqsignal(SIGPIPE, SIG_IGN);
2494 errno = save_errno;
2495 if (file != NULL)
2496 {
2497 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2498
2499 desc->kind = AllocateDescPipe;
2500 desc->desc.file = file;
2501 desc->create_subid = GetCurrentSubTransactionId();
2502 numAllocatedDescs++;
2503 return desc->desc.file;
2504 }
2505
2506 if (errno == EMFILE || errno == ENFILE)
2507 {
2508 ereport(LOG,
2509 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2510 errmsg("out of file descriptors: %m; release and retry")));
2511 if (ReleaseLruFile())
2512 goto TryAgain;
2513 errno = save_errno;
2514 }
2515
2516 return NULL;
2517 }
2518
2519 /*
2520 * Free an AllocateDesc of any type.
2521 *
2522 * The argument *must* point into the allocatedDescs[] array.
2523 */
2524 static int
FreeDesc(AllocateDesc * desc)2525 FreeDesc(AllocateDesc *desc)
2526 {
2527 int result;
2528
2529 /* Close the underlying object */
2530 switch (desc->kind)
2531 {
2532 case AllocateDescFile:
2533 result = fclose(desc->desc.file);
2534 break;
2535 case AllocateDescPipe:
2536 result = pclose(desc->desc.file);
2537 break;
2538 case AllocateDescDir:
2539 result = closedir(desc->desc.dir);
2540 break;
2541 case AllocateDescRawFD:
2542 result = close(desc->desc.fd);
2543 break;
2544 default:
2545 elog(ERROR, "AllocateDesc kind not recognized");
2546 result = 0; /* keep compiler quiet */
2547 break;
2548 }
2549
2550 /* Compact storage in the allocatedDescs array */
2551 numAllocatedDescs--;
2552 *desc = allocatedDescs[numAllocatedDescs];
2553
2554 return result;
2555 }
2556
2557 /*
2558 * Close a file returned by AllocateFile.
2559 *
2560 * Note we do not check fclose's return value --- it is up to the caller
2561 * to handle close errors.
2562 */
2563 int
FreeFile(FILE * file)2564 FreeFile(FILE *file)
2565 {
2566 int i;
2567
2568 DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2569
2570 /* Remove file from list of allocated files, if it's present */
2571 for (i = numAllocatedDescs; --i >= 0;)
2572 {
2573 AllocateDesc *desc = &allocatedDescs[i];
2574
2575 if (desc->kind == AllocateDescFile && desc->desc.file == file)
2576 return FreeDesc(desc);
2577 }
2578
2579 /* Only get here if someone passes us a file not in allocatedDescs */
2580 elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2581
2582 return fclose(file);
2583 }
2584
2585 /*
2586 * Close a file returned by OpenTransientFile.
2587 *
2588 * Note we do not check close's return value --- it is up to the caller
2589 * to handle close errors.
2590 */
2591 int
CloseTransientFile(int fd)2592 CloseTransientFile(int fd)
2593 {
2594 int i;
2595
2596 DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2597
2598 /* Remove fd from list of allocated files, if it's present */
2599 for (i = numAllocatedDescs; --i >= 0;)
2600 {
2601 AllocateDesc *desc = &allocatedDescs[i];
2602
2603 if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2604 return FreeDesc(desc);
2605 }
2606
2607 /* Only get here if someone passes us a file not in allocatedDescs */
2608 elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2609
2610 return close(fd);
2611 }
2612
2613 /*
2614 * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2615 * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2616 * necessary to open the directory, and with closing it after an elog.
2617 * When done, call FreeDir rather than closedir.
2618 *
2619 * Returns NULL, with errno set, on failure. Note that failure detection
2620 * is commonly left to the following call of ReadDir or ReadDirExtended;
2621 * see the comments for ReadDir.
2622 *
2623 * Ideally this should be the *only* direct call of opendir() in the backend.
2624 */
2625 DIR *
AllocateDir(const char * dirname)2626 AllocateDir(const char *dirname)
2627 {
2628 DIR *dir;
2629
2630 DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2631 numAllocatedDescs, dirname));
2632
2633 /* Can we allocate another non-virtual FD? */
2634 if (!reserveAllocatedDesc())
2635 ereport(ERROR,
2636 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2637 errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2638 maxAllocatedDescs, dirname)));
2639
2640 /* Close excess kernel FDs. */
2641 ReleaseLruFiles();
2642
2643 TryAgain:
2644 if ((dir = opendir(dirname)) != NULL)
2645 {
2646 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2647
2648 desc->kind = AllocateDescDir;
2649 desc->desc.dir = dir;
2650 desc->create_subid = GetCurrentSubTransactionId();
2651 numAllocatedDescs++;
2652 return desc->desc.dir;
2653 }
2654
2655 if (errno == EMFILE || errno == ENFILE)
2656 {
2657 int save_errno = errno;
2658
2659 ereport(LOG,
2660 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2661 errmsg("out of file descriptors: %m; release and retry")));
2662 errno = 0;
2663 if (ReleaseLruFile())
2664 goto TryAgain;
2665 errno = save_errno;
2666 }
2667
2668 return NULL;
2669 }
2670
2671 /*
2672 * Read a directory opened with AllocateDir, ereport'ing any error.
2673 *
2674 * This is easier to use than raw readdir() since it takes care of some
2675 * otherwise rather tedious and error-prone manipulation of errno. Also,
2676 * if you are happy with a generic error message for AllocateDir failure,
2677 * you can just do
2678 *
2679 * dir = AllocateDir(path);
2680 * while ((dirent = ReadDir(dir, path)) != NULL)
2681 * process dirent;
2682 * FreeDir(dir);
2683 *
2684 * since a NULL dir parameter is taken as indicating AllocateDir failed.
2685 * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2686 * use this shortcut.)
2687 *
2688 * The pathname passed to AllocateDir must be passed to this routine too,
2689 * but it is only used for error reporting.
2690 */
2691 struct dirent *
ReadDir(DIR * dir,const char * dirname)2692 ReadDir(DIR *dir, const char *dirname)
2693 {
2694 return ReadDirExtended(dir, dirname, ERROR);
2695 }
2696
2697 /*
2698 * Alternate version of ReadDir that allows caller to specify the elevel
2699 * for any error report (whether it's reporting an initial failure of
2700 * AllocateDir or a subsequent directory read failure).
2701 *
2702 * If elevel < ERROR, returns NULL after any error. With the normal coding
2703 * pattern, this will result in falling out of the loop immediately as
2704 * though the directory contained no (more) entries.
2705 */
2706 struct dirent *
ReadDirExtended(DIR * dir,const char * dirname,int elevel)2707 ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2708 {
2709 struct dirent *dent;
2710
2711 /* Give a generic message for AllocateDir failure, if caller didn't */
2712 if (dir == NULL)
2713 {
2714 ereport(elevel,
2715 (errcode_for_file_access(),
2716 errmsg("could not open directory \"%s\": %m",
2717 dirname)));
2718 return NULL;
2719 }
2720
2721 errno = 0;
2722 if ((dent = readdir(dir)) != NULL)
2723 return dent;
2724
2725 if (errno)
2726 ereport(elevel,
2727 (errcode_for_file_access(),
2728 errmsg("could not read directory \"%s\": %m",
2729 dirname)));
2730 return NULL;
2731 }
2732
2733 /*
2734 * Close a directory opened with AllocateDir.
2735 *
2736 * Returns closedir's return value (with errno set if it's not 0).
2737 * Note we do not check the return value --- it is up to the caller
2738 * to handle close errors if wanted.
2739 *
2740 * Does nothing if dir == NULL; we assume that directory open failure was
2741 * already reported if desired.
2742 */
2743 int
FreeDir(DIR * dir)2744 FreeDir(DIR *dir)
2745 {
2746 int i;
2747
2748 /* Nothing to do if AllocateDir failed */
2749 if (dir == NULL)
2750 return 0;
2751
2752 DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2753
2754 /* Remove dir from list of allocated dirs, if it's present */
2755 for (i = numAllocatedDescs; --i >= 0;)
2756 {
2757 AllocateDesc *desc = &allocatedDescs[i];
2758
2759 if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2760 return FreeDesc(desc);
2761 }
2762
2763 /* Only get here if someone passes us a dir not in allocatedDescs */
2764 elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
2765
2766 return closedir(dir);
2767 }
2768
2769
2770 /*
2771 * Close a pipe stream returned by OpenPipeStream.
2772 */
2773 int
ClosePipeStream(FILE * file)2774 ClosePipeStream(FILE *file)
2775 {
2776 int i;
2777
2778 DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
2779
2780 /* Remove file from list of allocated files, if it's present */
2781 for (i = numAllocatedDescs; --i >= 0;)
2782 {
2783 AllocateDesc *desc = &allocatedDescs[i];
2784
2785 if (desc->kind == AllocateDescPipe && desc->desc.file == file)
2786 return FreeDesc(desc);
2787 }
2788
2789 /* Only get here if someone passes us a file not in allocatedDescs */
2790 elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
2791
2792 return pclose(file);
2793 }
2794
2795 /*
2796 * closeAllVfds
2797 *
2798 * Force all VFDs into the physically-closed state, so that the fewest
2799 * possible number of kernel file descriptors are in use. There is no
2800 * change in the logical state of the VFDs.
2801 */
2802 void
closeAllVfds(void)2803 closeAllVfds(void)
2804 {
2805 Index i;
2806
2807 if (SizeVfdCache > 0)
2808 {
2809 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2810 for (i = 1; i < SizeVfdCache; i++)
2811 {
2812 if (!FileIsNotOpen(i))
2813 LruDelete(i);
2814 }
2815 }
2816 }
2817
2818
2819 /*
2820 * SetTempTablespaces
2821 *
2822 * Define a list (actually an array) of OIDs of tablespaces to use for
2823 * temporary files. This list will be used until end of transaction,
2824 * unless this function is called again before then. It is caller's
2825 * responsibility that the passed-in array has adequate lifespan (typically
2826 * it'd be allocated in TopTransactionContext).
2827 *
2828 * Some entries of the array may be InvalidOid, indicating that the current
2829 * database's default tablespace should be used.
2830 */
2831 void
SetTempTablespaces(Oid * tableSpaces,int numSpaces)2832 SetTempTablespaces(Oid *tableSpaces, int numSpaces)
2833 {
2834 Assert(numSpaces >= 0);
2835 tempTableSpaces = tableSpaces;
2836 numTempTableSpaces = numSpaces;
2837
2838 /*
2839 * Select a random starting point in the list. This is to minimize
2840 * conflicts between backends that are most likely sharing the same list
2841 * of temp tablespaces. Note that if we create multiple temp files in the
2842 * same transaction, we'll advance circularly through the list --- this
2843 * ensures that large temporary sort files are nicely spread across all
2844 * available tablespaces.
2845 */
2846 if (numSpaces > 1)
2847 nextTempTableSpace = random() % numSpaces;
2848 else
2849 nextTempTableSpace = 0;
2850 }
2851
2852 /*
2853 * TempTablespacesAreSet
2854 *
2855 * Returns true if SetTempTablespaces has been called in current transaction.
2856 * (This is just so that tablespaces.c doesn't need its own per-transaction
2857 * state.)
2858 */
2859 bool
TempTablespacesAreSet(void)2860 TempTablespacesAreSet(void)
2861 {
2862 return (numTempTableSpaces >= 0);
2863 }
2864
2865 /*
2866 * GetTempTablespaces
2867 *
2868 * Populate an array with the OIDs of the tablespaces that should be used for
2869 * temporary files. (Some entries may be InvalidOid, indicating that the
2870 * current database's default tablespace should be used.) At most numSpaces
2871 * entries will be filled.
2872 * Returns the number of OIDs that were copied into the output array.
2873 */
2874 int
GetTempTablespaces(Oid * tableSpaces,int numSpaces)2875 GetTempTablespaces(Oid *tableSpaces, int numSpaces)
2876 {
2877 int i;
2878
2879 Assert(TempTablespacesAreSet());
2880 for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
2881 tableSpaces[i] = tempTableSpaces[i];
2882
2883 return i;
2884 }
2885
2886 /*
2887 * GetNextTempTableSpace
2888 *
2889 * Select the next temp tablespace to use. A result of InvalidOid means
2890 * to use the current database's default tablespace.
2891 */
2892 Oid
GetNextTempTableSpace(void)2893 GetNextTempTableSpace(void)
2894 {
2895 if (numTempTableSpaces > 0)
2896 {
2897 /* Advance nextTempTableSpace counter with wraparound */
2898 if (++nextTempTableSpace >= numTempTableSpaces)
2899 nextTempTableSpace = 0;
2900 return tempTableSpaces[nextTempTableSpace];
2901 }
2902 return InvalidOid;
2903 }
2904
2905
2906 /*
2907 * AtEOSubXact_Files
2908 *
2909 * Take care of subtransaction commit/abort. At abort, we close temp files
2910 * that the subtransaction may have opened. At commit, we reassign the
2911 * files that were opened to the parent subtransaction.
2912 */
2913 void
AtEOSubXact_Files(bool isCommit,SubTransactionId mySubid,SubTransactionId parentSubid)2914 AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
2915 SubTransactionId parentSubid)
2916 {
2917 Index i;
2918
2919 for (i = 0; i < numAllocatedDescs; i++)
2920 {
2921 if (allocatedDescs[i].create_subid == mySubid)
2922 {
2923 if (isCommit)
2924 allocatedDescs[i].create_subid = parentSubid;
2925 else
2926 {
2927 /* have to recheck the item after FreeDesc (ugly) */
2928 FreeDesc(&allocatedDescs[i--]);
2929 }
2930 }
2931 }
2932 }
2933
2934 /*
2935 * AtEOXact_Files
2936 *
2937 * This routine is called during transaction commit or abort. All still-open
2938 * per-transaction temporary file VFDs are closed, which also causes the
2939 * underlying files to be deleted (although they should've been closed already
2940 * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
2941 * closed. We also forget any transaction-local temp tablespace list.
2942 *
2943 * The isCommit flag is used only to decide whether to emit warnings about
2944 * unclosed files.
2945 */
2946 void
AtEOXact_Files(bool isCommit)2947 AtEOXact_Files(bool isCommit)
2948 {
2949 CleanupTempFiles(isCommit, false);
2950 tempTableSpaces = NULL;
2951 numTempTableSpaces = -1;
2952 }
2953
2954 /*
2955 * AtProcExit_Files
2956 *
2957 * on_proc_exit hook to clean up temp files during backend shutdown.
2958 * Here, we want to clean up *all* temp files including interXact ones.
2959 */
2960 static void
AtProcExit_Files(int code,Datum arg)2961 AtProcExit_Files(int code, Datum arg)
2962 {
2963 CleanupTempFiles(false, true);
2964 }
2965
2966 /*
2967 * Close temporary files and delete their underlying files.
2968 *
2969 * isCommit: if true, this is normal transaction commit, and we don't
2970 * expect any remaining files; warn if there are some.
2971 *
2972 * isProcExit: if true, this is being called as the backend process is
2973 * exiting. If that's the case, we should remove all temporary files; if
2974 * that's not the case, we are being called for transaction commit/abort
2975 * and should only remove transaction-local temp files. In either case,
2976 * also clean up "allocated" stdio files, dirs and fds.
2977 */
2978 static void
CleanupTempFiles(bool isCommit,bool isProcExit)2979 CleanupTempFiles(bool isCommit, bool isProcExit)
2980 {
2981 Index i;
2982
2983 /*
2984 * Careful here: at proc_exit we need extra cleanup, not just
2985 * xact_temporary files.
2986 */
2987 if (isProcExit || have_xact_temporary_files)
2988 {
2989 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2990 for (i = 1; i < SizeVfdCache; i++)
2991 {
2992 unsigned short fdstate = VfdCache[i].fdstate;
2993
2994 if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
2995 VfdCache[i].fileName != NULL)
2996 {
2997 /*
2998 * If we're in the process of exiting a backend process, close
2999 * all temporary files. Otherwise, only close temporary files
3000 * local to the current transaction. They should be closed by
3001 * the ResourceOwner mechanism already, so this is just a
3002 * debugging cross-check.
3003 */
3004 if (isProcExit)
3005 FileClose(i);
3006 else if (fdstate & FD_CLOSE_AT_EOXACT)
3007 {
3008 elog(WARNING,
3009 "temporary file %s not closed at end-of-transaction",
3010 VfdCache[i].fileName);
3011 FileClose(i);
3012 }
3013 }
3014 }
3015
3016 have_xact_temporary_files = false;
3017 }
3018
3019 /* Complain if any allocated files remain open at commit. */
3020 if (isCommit && numAllocatedDescs > 0)
3021 elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
3022 numAllocatedDescs);
3023
3024 /* Clean up "allocated" stdio files, dirs and fds. */
3025 while (numAllocatedDescs > 0)
3026 FreeDesc(&allocatedDescs[0]);
3027 }
3028
3029
3030 /*
3031 * Remove temporary and temporary relation files left over from a prior
3032 * postmaster session
3033 *
3034 * This should be called during postmaster startup. It will forcibly
3035 * remove any leftover files created by OpenTemporaryFile and any leftover
3036 * temporary relation files created by mdcreate.
3037 *
3038 * NOTE: we could, but don't, call this during a post-backend-crash restart
3039 * cycle. The argument for not doing it is that someone might want to examine
3040 * the temp files for debugging purposes. This does however mean that
3041 * OpenTemporaryFile had better allow for collision with an existing temp
3042 * file name.
3043 *
3044 * NOTE: this function and its subroutines generally report syscall failures
3045 * with ereport(LOG) and keep going. Removing temp files is not so critical
3046 * that we should fail to start the database when we can't do it.
3047 */
3048 void
RemovePgTempFiles(void)3049 RemovePgTempFiles(void)
3050 {
3051 char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
3052 DIR *spc_dir;
3053 struct dirent *spc_de;
3054
3055 /*
3056 * First process temp files in pg_default ($PGDATA/base)
3057 */
3058 snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3059 RemovePgTempFilesInDir(temp_path, true, false);
3060 RemovePgTempRelationFiles("base");
3061
3062 /*
3063 * Cycle through temp directories for all non-default tablespaces.
3064 */
3065 spc_dir = AllocateDir("pg_tblspc");
3066
3067 while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL)
3068 {
3069 if (strcmp(spc_de->d_name, ".") == 0 ||
3070 strcmp(spc_de->d_name, "..") == 0)
3071 continue;
3072
3073 snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
3074 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
3075 RemovePgTempFilesInDir(temp_path, true, false);
3076
3077 snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
3078 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
3079 RemovePgTempRelationFiles(temp_path);
3080 }
3081
3082 FreeDir(spc_dir);
3083
3084 /*
3085 * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3086 * DataDir as well.
3087 */
3088 #ifdef EXEC_BACKEND
3089 RemovePgTempFilesInDir(PG_TEMP_FILES_DIR, true, false);
3090 #endif
3091 }
3092
3093 /*
3094 * Process one pgsql_tmp directory for RemovePgTempFiles.
3095 *
3096 * If missing_ok is true, it's all right for the named directory to not exist.
3097 * Any other problem results in a LOG message. (missing_ok should be true at
3098 * the top level, since pgsql_tmp directories are not created until needed.)
3099 *
3100 * At the top level, this should be called with unlink_all = false, so that
3101 * only files matching the temporary name prefix will be unlinked. When
3102 * recursing it will be called with unlink_all = true to unlink everything
3103 * under a top-level temporary directory.
3104 *
3105 * (These two flags could be replaced by one, but it seems clearer to keep
3106 * them separate.)
3107 */
3108 static void
RemovePgTempFilesInDir(const char * tmpdirname,bool missing_ok,bool unlink_all)3109 RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3110 {
3111 DIR *temp_dir;
3112 struct dirent *temp_de;
3113 char rm_path[MAXPGPATH * 2];
3114
3115 temp_dir = AllocateDir(tmpdirname);
3116
3117 if (temp_dir == NULL && errno == ENOENT && missing_ok)
3118 return;
3119
3120 while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
3121 {
3122 if (strcmp(temp_de->d_name, ".") == 0 ||
3123 strcmp(temp_de->d_name, "..") == 0)
3124 continue;
3125
3126 snprintf(rm_path, sizeof(rm_path), "%s/%s",
3127 tmpdirname, temp_de->d_name);
3128
3129 if (unlink_all ||
3130 strncmp(temp_de->d_name,
3131 PG_TEMP_FILE_PREFIX,
3132 strlen(PG_TEMP_FILE_PREFIX)) == 0)
3133 {
3134 struct stat statbuf;
3135
3136 if (lstat(rm_path, &statbuf) < 0)
3137 {
3138 ereport(LOG,
3139 (errcode_for_file_access(),
3140 errmsg("could not stat file \"%s\": %m", rm_path)));
3141 continue;
3142 }
3143
3144 if (S_ISDIR(statbuf.st_mode))
3145 {
3146 /* recursively remove contents, then directory itself */
3147 RemovePgTempFilesInDir(rm_path, false, true);
3148
3149 if (rmdir(rm_path) < 0)
3150 ereport(LOG,
3151 (errcode_for_file_access(),
3152 errmsg("could not remove directory \"%s\": %m",
3153 rm_path)));
3154 }
3155 else
3156 {
3157 if (unlink(rm_path) < 0)
3158 ereport(LOG,
3159 (errcode_for_file_access(),
3160 errmsg("could not remove file \"%s\": %m",
3161 rm_path)));
3162 }
3163 }
3164 else
3165 ereport(LOG,
3166 (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3167 rm_path)));
3168 }
3169
3170 FreeDir(temp_dir);
3171 }
3172
3173 /* Process one tablespace directory, look for per-DB subdirectories */
3174 static void
RemovePgTempRelationFiles(const char * tsdirname)3175 RemovePgTempRelationFiles(const char *tsdirname)
3176 {
3177 DIR *ts_dir;
3178 struct dirent *de;
3179 char dbspace_path[MAXPGPATH * 2];
3180
3181 ts_dir = AllocateDir(tsdirname);
3182
3183 while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3184 {
3185 /*
3186 * We're only interested in the per-database directories, which have
3187 * numeric names. Note that this code will also (properly) ignore "."
3188 * and "..".
3189 */
3190 if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3191 continue;
3192
3193 snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3194 tsdirname, de->d_name);
3195 RemovePgTempRelationFilesInDbspace(dbspace_path);
3196 }
3197
3198 FreeDir(ts_dir);
3199 }
3200
3201 /* Process one per-dbspace directory for RemovePgTempRelationFiles */
3202 static void
RemovePgTempRelationFilesInDbspace(const char * dbspacedirname)3203 RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3204 {
3205 DIR *dbspace_dir;
3206 struct dirent *de;
3207 char rm_path[MAXPGPATH * 2];
3208
3209 dbspace_dir = AllocateDir(dbspacedirname);
3210
3211 while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3212 {
3213 if (!looks_like_temp_rel_name(de->d_name))
3214 continue;
3215
3216 snprintf(rm_path, sizeof(rm_path), "%s/%s",
3217 dbspacedirname, de->d_name);
3218
3219 if (unlink(rm_path) < 0)
3220 ereport(LOG,
3221 (errcode_for_file_access(),
3222 errmsg("could not remove file \"%s\": %m",
3223 rm_path)));
3224 }
3225
3226 FreeDir(dbspace_dir);
3227 }
3228
/*
 * Does "name" have the shape of a temporary relation file name?
 *
 * Accepted forms are t<digits>_<digits>, optionally followed by
 * _<forkname>, optionally followed by .<digits> (a segment number).
 * Nothing may follow the last recognized part.
 */
bool
looks_like_temp_rel_name(const char *name)
{
	const char *cur = name;
	const char *run_start;

	/* A leading "t" is mandatory. */
	if (*cur != 't')
		return false;
	cur++;

	/* Then at least one digit, terminated by an underscore. */
	run_start = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == run_start || *cur != '_')
		return false;
	cur++;

	/* Then a second, likewise non-empty, run of digits. */
	run_start = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == run_start)
		return false;

	/* Optional "_forkname"; forkname_chars reports how long the name is. */
	if (*cur == '_')
	{
		int			forklen = forkname_chars(cur + 1, NULL);

		if (forklen <= 0)
			return false;
		cur += forklen + 1;
	}

	/* Optional ".segment", which needs at least one digit after the dot. */
	if (*cur == '.')
	{
		cur++;
		run_start = cur;
		while (isdigit((unsigned char) *cur))
			cur++;
		if (cur == run_start)
			return false;
	}

	/* Anything left over disqualifies the name. */
	return *cur == '\0';
}
3277
3278
3279 /*
3280 * Issue fsync recursively on PGDATA and all its contents.
3281 *
3282 * We fsync regular files and directories wherever they are, but we
3283 * follow symlinks only for pg_wal and immediately under pg_tblspc.
3284 * Other symlinks are presumed to point at files we're not responsible
3285 * for fsyncing, and might not have privileges to write at all.
3286 *
3287 * Errors are logged but not considered fatal; that's because this is used
3288 * only during database startup, to deal with the possibility that there are
3289 * issued-but-unsynced writes pending against the data directory. We want to
3290 * ensure that such writes reach disk before anything that's done in the new
3291 * run. However, aborting on error would result in failure to start for
3292 * harmless cases such as read-only files in the data directory, and that's
3293 * not good either.
3294 *
3295 * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3296 * rewriting all changes again during recovery.
3297 *
3298 * Note we assume we're chdir'd into PGDATA to begin with.
3299 */
3300 void
SyncDataDirectory(void)3301 SyncDataDirectory(void)
3302 {
3303 bool xlog_is_symlink;
3304
3305 /* We can skip this whole thing if fsync is disabled. */
3306 if (!enableFsync)
3307 return;
3308
3309 /*
3310 * If pg_wal is a symlink, we'll need to recurse into it separately,
3311 * because the first walkdir below will ignore it.
3312 */
3313 xlog_is_symlink = false;
3314
3315 #ifndef WIN32
3316 {
3317 struct stat st;
3318
3319 if (lstat("pg_wal", &st) < 0)
3320 ereport(LOG,
3321 (errcode_for_file_access(),
3322 errmsg("could not stat file \"%s\": %m",
3323 "pg_wal")));
3324 else if (S_ISLNK(st.st_mode))
3325 xlog_is_symlink = true;
3326 }
3327 #else
3328 if (pgwin32_is_junction("pg_wal"))
3329 xlog_is_symlink = true;
3330 #endif
3331
3332 /*
3333 * If possible, hint to the kernel that we're soon going to fsync the data
3334 * directory and its contents. Errors in this step are even less
3335 * interesting than normal, so log them only at DEBUG1.
3336 */
3337 #ifdef PG_FLUSH_DATA_WORKS
3338 walkdir(".", pre_sync_fname, false, DEBUG1);
3339 if (xlog_is_symlink)
3340 walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3341 walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
3342 #endif
3343
3344 /*
3345 * Now we do the fsync()s in the same order.
3346 *
3347 * The main call ignores symlinks, so in addition to specially processing
3348 * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3349 * process_symlinks = true. Note that if there are any plain directories
3350 * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3351 * so we don't worry about optimizing it.
3352 */
3353 walkdir(".", datadir_fsync_fname, false, LOG);
3354 if (xlog_is_symlink)
3355 walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3356 walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
3357 }
3358
3359 /*
3360 * walkdir: recursively walk a directory, applying the action to each
3361 * regular file and directory (including the named directory itself).
3362 *
3363 * If process_symlinks is true, the action and recursion are also applied
3364 * to regular files and directories that are pointed to by symlinks in the
3365 * given directory; otherwise symlinks are ignored. Symlinks are always
3366 * ignored in subdirectories, ie we intentionally don't pass down the
3367 * process_symlinks flag to recursive calls.
3368 *
3369 * Errors are reported at level elevel, which might be ERROR or less.
3370 *
3371 * See also walkdir in initdb.c, which is a frontend version of this logic.
3372 */
3373 static void
walkdir(const char * path,void (* action)(const char * fname,bool isdir,int elevel),bool process_symlinks,int elevel)3374 walkdir(const char *path,
3375 void (*action) (const char *fname, bool isdir, int elevel),
3376 bool process_symlinks,
3377 int elevel)
3378 {
3379 DIR *dir;
3380 struct dirent *de;
3381
3382 dir = AllocateDir(path);
3383
3384 while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3385 {
3386 char subpath[MAXPGPATH * 2];
3387 struct stat fst;
3388 int sret;
3389
3390 CHECK_FOR_INTERRUPTS();
3391
3392 if (strcmp(de->d_name, ".") == 0 ||
3393 strcmp(de->d_name, "..") == 0)
3394 continue;
3395
3396 snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3397
3398 if (process_symlinks)
3399 sret = stat(subpath, &fst);
3400 else
3401 sret = lstat(subpath, &fst);
3402
3403 if (sret < 0)
3404 {
3405 ereport(elevel,
3406 (errcode_for_file_access(),
3407 errmsg("could not stat file \"%s\": %m", subpath)));
3408 continue;
3409 }
3410
3411 if (S_ISREG(fst.st_mode))
3412 (*action) (subpath, false, elevel);
3413 else if (S_ISDIR(fst.st_mode))
3414 walkdir(subpath, action, false, elevel);
3415 }
3416
3417 FreeDir(dir); /* we ignore any error here */
3418
3419 /*
3420 * It's important to fsync the destination directory itself as individual
3421 * file fsyncs don't guarantee that the directory entry for the file is
3422 * synced. However, skip this if AllocateDir failed; the action function
3423 * might not be robust against that.
3424 */
3425 if (dir)
3426 (*action) (path, true, elevel);
3427 }
3428
3429
/*
 * Hint to the OS that it should get ready to fsync() this file.
 *
 * fname is the file to hint about; isdir tells whether it is a directory,
 * in which case nothing is done.  Errors from trying to open unreadable
 * files (EACCES) are ignored.  Other errors, including a failure to close
 * the file again, are reported at the caller-specified level elevel.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Don't try to flush directories, it'll likely just fail */
	if (isdir)
		return;

	fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);

	if (fd < 0)
	{
		/* Unreadable files are skipped silently. */
		if (errno != EACCES)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * pg_flush_data() ignores errors, which is ok because this is only a
	 * hint.
	 */
	pg_flush_data(fd, 0, 0);

	/* Don't let a close failure pass unnoticed; report it like the rest. */
	if (CloseTransientFile(fd) != 0)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", fname)));
}

#endif							/* PG_FLUSH_DATA_WORKS */
3469
/*
 * walkdir action used by SyncDataDirectory: fsync one file or directory.
 *
 * Errors about unreadable files are to be ignored silently, so that wish is
 * passed on to fsync_fname_ext() via its ignore_perm argument.  The result
 * code is deliberately discarded.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	(void) fsync_fname_ext(fname, isdir, true, elevel);
}
3479
/*
 * Remove one file or directory, tolerating its absence.
 *
 * The signature matches what walkdir expects of an action function.  For a
 * directory, rmdir() is used; a failure other than ENOENT is reported at
 * level elevel.  For anything else the work is delegated to
 * PathNameDeleteTemporaryFile, and elevel is not consulted here.
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (!isdir)
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
		return;
	}

	/* Same wording as the other directory-removal failures in this file. */
	if (rmdir(fname) != 0 && errno != ENOENT)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove directory \"%s\": %m", fname)));
}
3496
3497 /*
3498 * fsync_fname_ext -- Try to fsync a file or directory
3499 *
3500 * If ignore_perm is true, ignore errors upon trying to open unreadable
3501 * files. Logs other errors at a caller-specified level.
3502 *
3503 * Returns 0 if the operation succeeded, -1 otherwise.
3504 */
3505 static int
fsync_fname_ext(const char * fname,bool isdir,bool ignore_perm,int elevel)3506 fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3507 {
3508 int fd;
3509 int flags;
3510 int returncode;
3511
3512 /*
3513 * Some OSs require directories to be opened read-only whereas other
3514 * systems don't allow us to fsync files opened read-only; so we need both
3515 * cases here. Using O_RDWR will cause us to fail to fsync files that are
3516 * not writable by our userid, but we assume that's OK.
3517 */
3518 flags = PG_BINARY;
3519 if (!isdir)
3520 flags |= O_RDWR;
3521 else
3522 flags |= O_RDONLY;
3523
3524 fd = OpenTransientFile(fname, flags);
3525
3526 /*
3527 * Some OSs don't allow us to open directories at all (Windows returns
3528 * EACCES), just ignore the error in that case. If desired also silently
3529 * ignoring errors about unreadable files. Log others.
3530 */
3531 if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3532 return 0;
3533 else if (fd < 0 && ignore_perm && errno == EACCES)
3534 return 0;
3535 else if (fd < 0)
3536 {
3537 ereport(elevel,
3538 (errcode_for_file_access(),
3539 errmsg("could not open file \"%s\": %m", fname)));
3540 return -1;
3541 }
3542
3543 returncode = pg_fsync(fd);
3544
3545 /*
3546 * Some OSes don't allow us to fsync directories at all, so we can ignore
3547 * those errors. Anything else needs to be logged.
3548 */
3549 if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3550 {
3551 int save_errno;
3552
3553 /* close file upon error, might not be in transaction context */
3554 save_errno = errno;
3555 (void) CloseTransientFile(fd);
3556 errno = save_errno;
3557
3558 ereport(elevel,
3559 (errcode_for_file_access(),
3560 errmsg("could not fsync file \"%s\": %m", fname)));
3561 return -1;
3562 }
3563
3564 (void) CloseTransientFile(fd);
3565
3566 return 0;
3567 }
3568
3569 /*
3570 * fsync_parent_path -- fsync the parent path of a file or directory
3571 *
3572 * This is aimed at making file operations persistent on disk in case of
3573 * an OS crash or power failure.
3574 */
3575 static int
fsync_parent_path(const char * fname,int elevel)3576 fsync_parent_path(const char *fname, int elevel)
3577 {
3578 char parentpath[MAXPGPATH];
3579
3580 strlcpy(parentpath, fname, MAXPGPATH);
3581 get_parent_directory(parentpath);
3582
3583 /*
3584 * get_parent_directory() returns an empty string if the input argument is
3585 * just a file name (see comments in path.c), so handle that as being the
3586 * current directory.
3587 */
3588 if (strlen(parentpath) == 0)
3589 strlcpy(parentpath, ".", MAXPGPATH);
3590
3591 if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3592 return -1;
3593
3594 return 0;
3595 }
3596
3597 /*
3598 * Create a PostgreSQL data sub-directory
3599 *
3600 * The data directory itself, and most of its sub-directories, are created at
3601 * initdb time, but we do have some occasions when we create directories in
3602 * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3603 * make sure that those directories are created consistently. Today, that means
3604 * making sure that the created directory has the correct permissions, which is
3605 * what pg_dir_create_mode tracks for us.
3606 *
3607 * Note that we also set the umask() based on what we understand the correct
3608 * permissions to be (see file_perm.c).
3609 *
3610 * For permissions other than the default, mkdir() can be used directly, but
3611 * be sure to consider carefully such cases -- a sub-directory with incorrect
3612 * permissions in a PostgreSQL data directory could cause backups and other
3613 * processes to fail.
3614 */
3615 int
MakePGDirectory(const char * directoryName)3616 MakePGDirectory(const char *directoryName)
3617 {
3618 return mkdir(directoryName, pg_dir_create_mode);
3619 }
3620
3621 /*
3622 * Return the passed-in error level, or PANIC if data_sync_retry is off.
3623 *
3624 * Failure to fsync any data file is cause for immediate panic, unless
3625 * data_sync_retry is enabled. Data may have been written to the operating
3626 * system and removed from our buffer pool already, and if we are running on
3627 * an operating system that forgets dirty data on write-back failure, there
3628 * may be only one copy of the data remaining: in the WAL. A later attempt to
3629 * fsync again might falsely report success. Therefore we must not allow any
3630 * further checkpoints to be attempted. data_sync_retry can in theory be
3631 * enabled on systems known not to drop dirty buffered data on write-back
3632 * failure (with the likely outcome that checkpoints will continue to fail
3633 * until the underlying problem is fixed).
3634 *
3635 * Any code that reports a failure from fsync() or related functions should
3636 * filter the error level with this function.
3637 */
3638 int
data_sync_elevel(int elevel)3639 data_sync_elevel(int elevel)
3640 {
3641 return data_sync_retry ? elevel : PANIC;
3642 }
3643