1 /*-------------------------------------------------------------------------
2 *
3 * fd.c
4 * Virtual file descriptor code.
5 *
6 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 * IDENTIFICATION
10 * src/backend/storage/file/fd.c
11 *
12 * NOTES:
13 *
14 * This code manages a cache of 'virtual' file descriptors (VFDs).
15 * The server opens many file descriptors for a variety of reasons,
16 * including base tables, scratch files (e.g., sort and hash spool
17 * files), and random calls to C library routines like system(3); it
18 * is quite easy to exceed system limits on the number of open files a
19 * single process can have. (This is around 256 on many modern
20 * operating systems, but can be as low as 32 on others.)
21 *
22 * VFDs are managed as an LRU pool, with actual OS file descriptors
 * being opened and closed as needed.  Obviously, if a file is
 * opened using these interfaces, all subsequent operations must also
25 * be through these interfaces (the File type is not a real file
26 * descriptor).
27 *
28 * For this scheme to work, most (if not all) routines throughout the
29 * server should use these interfaces instead of calling the C library
30 * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31 * may find ourselves short of real file descriptors anyway.
32 *
33 * INTERFACE ROUTINES
34 *
35 * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36 * A File opened with OpenTemporaryFile is automatically deleted when the
37 * File is closed, either explicitly or implicitly at end of transaction or
38 * process exit. PathNameOpenFile is intended for files that are held open
 * for a long time, like relation files.  It is the caller's responsibility
 * to close them; there is no automatic mechanism in fd.c for that.
41 *
42 * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
43 * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
44 * They behave like the corresponding native functions, except that the handle
45 * is registered with the current subtransaction, and will be automatically
46 * closed at abort. These are intended mainly for short operations like
47 * reading a configuration file; there is a limit on the number of files that
48 * can be opened using these functions at any one time.
49 *
50 * Finally, BasicOpenFile is just a thin wrapper around open() that can
51 * release file descriptors in use by the virtual file descriptors if
52 * necessary. There is no automatic cleanup of file descriptors returned by
53 * BasicOpenFile, it is solely the caller's responsibility to close the file
54 * descriptor by calling close(2).
55 *
56 *-------------------------------------------------------------------------
57 */
58
59 #include "postgres.h"
60
61 #include <sys/file.h>
62 #include <sys/param.h>
63 #include <sys/stat.h>
64 #ifndef WIN32
65 #include <sys/mman.h>
66 #endif
67 #include <limits.h>
68 #include <unistd.h>
69 #include <fcntl.h>
70 #ifdef HAVE_SYS_RESOURCE_H
71 #include <sys/resource.h> /* for getrlimit */
72 #endif
73
74 #include "miscadmin.h"
75 #include "access/xact.h"
76 #include "access/xlog.h"
77 #include "catalog/catalog.h"
78 #include "catalog/pg_tablespace.h"
79 #include "pgstat.h"
80 #include "portability/mem.h"
81 #include "storage/fd.h"
82 #include "storage/ipc.h"
83 #include "utils/guc.h"
84 #include "utils/resowner_private.h"
85
86
87 /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
88 #if defined(HAVE_SYNC_FILE_RANGE)
89 #define PG_FLUSH_DATA_WORKS 1
90 #elif !defined(WIN32) && defined(MS_ASYNC)
91 #define PG_FLUSH_DATA_WORKS 1
92 #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
93 #define PG_FLUSH_DATA_WORKS 1
94 #endif
95
96 /*
97 * We must leave some file descriptors free for system(), the dynamic loader,
98 * and other code that tries to open files without consulting fd.c. This
99 * is the number left free. (While we can be pretty sure we won't get
100 * EMFILE, there's never any guarantee that we won't get ENFILE due to
101 * other processes chewing up FDs. So it's a bad idea to try to open files
102 * without consulting fd.c. Nonetheless we cannot control all code.)
103 *
104 * Because this is just a fixed setting, we are effectively assuming that
105 * no such code will leave FDs open over the long term; otherwise the slop
106 * is likely to be insufficient. Note in particular that we expect that
107 * loading a shared library does not result in any permanent increase in
108 * the number of open files. (This appears to be true on most if not
109 * all platforms as of Feb 2004.)
110 */
111 #define NUM_RESERVED_FDS 10
112
113 /*
114 * If we have fewer than this many usable FDs after allowing for the reserved
115 * ones, choke.
116 */
117 #define FD_MINFREE 10
118
119
120 /*
121 * A number of platforms allow individual processes to open many more files
122 * than they can really support when *many* processes do the same thing.
123 * This GUC parameter lets the DBA limit max_safe_fds to something less than
124 * what the postmaster's initial probe suggests will work.
125 */
126 int max_files_per_process = 1000;
127
128 /*
129 * Maximum number of file descriptors to open for either VFD entries or
130 * AllocateFile/AllocateDir/OpenTransientFile operations. This is initialized
131 * to a conservative value, and remains that way indefinitely in bootstrap or
132 * standalone-backend cases. In normal postmaster operation, the postmaster
133 * calls set_max_safe_fds() late in initialization to update the value, and
134 * that value is then inherited by forked subprocesses.
135 *
136 * Note: the value of max_files_per_process is taken into account while
137 * setting this variable, and so need not be tested separately.
138 */
139 int max_safe_fds = 32; /* default if not changed */
140
141 /* Whether it is safe to continue running after fsync() fails. */
142 bool data_sync_retry = false;
143
144 /* Debugging.... */
145
146 #ifdef FDDEBUG
147 #define DO_DB(A) \
148 do { \
149 int _do_db_save_errno = errno; \
150 A; \
151 errno = _do_db_save_errno; \
152 } while (0)
153 #else
154 #define DO_DB(A) \
155 ((void) 0)
156 #endif
157
158 #define VFD_CLOSED (-1)
159
160 #define FileIsValid(file) \
161 ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
162
163 #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
164
165 /*
166 * Note: a VFD's seekPos is normally always valid, but if for some reason
167 * an lseek() fails, it might become set to FileUnknownPos. We can struggle
168 * along without knowing the seek position in many cases, but in some places
169 * we have to fail if we don't have it.
170 */
171 #define FileUnknownPos ((off_t) -1)
172 #define FilePosIsUnknown(pos) ((pos) < 0)
173
174 /* these are the assigned bits in fdstate below: */
175 #define FD_TEMPORARY (1 << 0) /* T = delete when closed */
176 #define FD_XACT_TEMPORARY (1 << 1) /* T = delete at eoXact */
177
/*
 * One entry of the virtual file descriptor cache.
 *
 * An entry is "virtually open" while fileName is non-NULL; it additionally
 * holds a real kernel descriptor only while fd != VFD_CLOSED.
 */
typedef struct vfd
{
    int         fd;             /* current FD, or VFD_CLOSED if none */
    unsigned short fdstate;     /* bitflags for VFD's state (FD_TEMPORARY,
                                 * FD_XACT_TEMPORARY) */
    ResourceOwner resowner;     /* owner, for automatic cleanup */
    File        nextFree;       /* link to next free VFD, if in freelist */
    File        lruMoreRecently;    /* doubly linked recency-of-use list:
                                     * neighbor used more recently (0 is the
                                     * ring anchor) */
    File        lruLessRecently;    /* neighbor used less recently (0 is the
                                     * ring anchor) */
    off_t       seekPos;        /* current logical file position, or -1
                                 * (FileUnknownPos) if not known */
    off_t       fileSize;       /* current size of file (0 if not temporary) */
    char       *fileName;       /* name of file, or NULL for unused VFD */
    /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
    int         fileFlags;      /* open(2) flags for (re)opening the file */
    int         fileMode;       /* mode to pass to open(2) */
} Vfd;
193
194 /*
195 * Virtual File Descriptor array pointer and size. This grows as
196 * needed. 'File' values are indexes into this array.
197 * Note that VfdCache[0] is not a usable VFD, just a list header.
198 */
199 static Vfd *VfdCache;
200 static Size SizeVfdCache = 0;
201
202 /*
203 * Number of file descriptors known to be in use by VFD entries.
204 */
205 static int nfile = 0;
206
207 /*
208 * Flag to tell whether it's worth scanning VfdCache looking for temp files
209 * to close
210 */
211 static bool have_xact_temporary_files = false;
212
213 /*
214 * Tracks the total size of all temporary files. Note: when temp_file_limit
215 * is being enforced, this cannot overflow since the limit cannot be more
216 * than INT_MAX kilobytes. When not enforcing, it could theoretically
217 * overflow, but we don't care.
218 */
219 static uint64 temporary_files_size = 0;
220
/*
 * List of OS handles opened with AllocateFile, AllocateDir and
 * OpenTransientFile.
 *
 * AllocateDescKind tags which kind of handle an AllocateDesc entry holds.
 * NOTE(review): judging by the names, the kinds presumably correspond to
 * fopen(3) streams, popen(3) streams, opendir(3) handles and raw open(2)
 * descriptors respectively -- confirm against the allocation routines.
 */
typedef enum
{
    AllocateDescFile,
    AllocateDescPipe,
    AllocateDescDir,
    AllocateDescRawFD
} AllocateDescKind;

/*
 * One tracked OS handle.  "kind" says which member of "desc" is meaningful.
 */
typedef struct
{
    AllocateDescKind kind;      /* discriminator for the union below */
    SubTransactionId create_subid;  /* presumably the subtransaction that
                                     * registered the handle -- TODO confirm */
    union
    {
        FILE       *file;       /* stdio stream */
        DIR        *dir;        /* directory handle */
        int         fd;         /* kernel file descriptor */
    }           desc;
} AllocateDesc;
244
245 static int numAllocatedDescs = 0;
246 static int maxAllocatedDescs = 0;
247 static AllocateDesc *allocatedDescs = NULL;
248
249 /*
250 * Number of temporary files opened during the current session;
251 * this is used in generation of tempfile names.
252 */
253 static long tempFileCounter = 0;
254
255 /*
256 * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid,
257 * indicating that the current database's default tablespace should be used.)
258 * When numTempTableSpaces is -1, this has not been set in the current
259 * transaction.
260 */
261 static Oid *tempTableSpaces = NULL;
262 static int numTempTableSpaces = -1;
263 static int nextTempTableSpace = 0;
264
265
266 /*--------------------
267 *
268 * Private Routines
269 *
270 * Delete - delete a file from the Lru ring
271 * LruDelete - remove a file from the Lru ring and close its FD
272 * Insert - put a file at the front of the Lru ring
273 * LruInsert - put a file at the front of the Lru ring and open it
274 * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
275 * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
 *	AllocateVfd	   - grab a free (or new) file record (from VfdCache)
277 * FreeVfd - free a file record
278 *
279 * The Least Recently Used ring is a doubly linked list that begins and
280 * ends on element zero. Element zero is special -- it doesn't represent
281 * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
282 * anchor that shows us the beginning/end of the ring.
283 * Only VFD elements that are currently really open (have an FD assigned) are
284 * in the Lru ring. Elements that are "virtually" open can be recognized
285 * by having a non-null fileName field.
286 *
287 * example:
288 *
289 * /--less----\ /---------\
290 * v \ v \
291 * #0 --more---> LeastRecentlyUsed --more-\ \
292 * ^\ | |
293 * \\less--> MostRecentlyUsedFile <---/ |
294 * \more---/ \--less--/
295 *
296 *--------------------
297 */
298 static void Delete(File file);
299 static void LruDelete(File file);
300 static void Insert(File file);
301 static int LruInsert(File file);
302 static bool ReleaseLruFile(void);
303 static void ReleaseLruFiles(void);
304 static File AllocateVfd(void);
305 static void FreeVfd(File file);
306
307 static int FileAccess(File file);
308 static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
309 static bool reserveAllocatedDesc(void);
310 static int FreeDesc(AllocateDesc *desc);
311
312 static void AtProcExit_Files(int code, Datum arg);
313 static void CleanupTempFiles(bool isProcExit);
314 static void RemovePgTempFilesInDir(const char *tmpdirname);
315 static void RemovePgTempRelationFiles(const char *tsdirname);
316 static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
317 static bool looks_like_temp_rel_name(const char *name);
318
319 static void walkdir(const char *path,
320 void (*action) (const char *fname, bool isdir, int elevel),
321 bool process_symlinks,
322 int elevel);
323 #ifdef PG_FLUSH_DATA_WORKS
324 static void pre_sync_fname(const char *fname, bool isdir, int elevel);
325 #endif
326 static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
327
328 static int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
329 static int fsync_parent_path(const char *fname, int elevel);
330
331
/*
 * pg_fsync --- do fsync with or without writethrough
 *
 * When the platform has a write-through primitive that is distinct from
 * plain fsync and sync_method asks for it, hand off to
 * pg_fsync_writethrough(); in every other case use
 * pg_fsync_no_writethrough().  The chosen routine's result is returned
 * unchanged.
 */
int
pg_fsync(int fd)
{
#if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
    /* the sync_method test is compiled only where it can matter */
    if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
        return pg_fsync_writethrough(fd);
#endif

    return pg_fsync_no_writethrough(fd);
}
346
347
348 /*
349 * pg_fsync_no_writethrough --- same as fsync except does nothing if
350 * enableFsync is off
351 */
352 int
pg_fsync_no_writethrough(int fd)353 pg_fsync_no_writethrough(int fd)
354 {
355 if (enableFsync)
356 return fsync(fd);
357 else
358 return 0;
359 }
360
361 /*
362 * pg_fsync_writethrough
363 */
364 int
pg_fsync_writethrough(int fd)365 pg_fsync_writethrough(int fd)
366 {
367 if (enableFsync)
368 {
369 #ifdef WIN32
370 return _commit(fd);
371 #elif defined(F_FULLFSYNC)
372 return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
373 #else
374 errno = ENOSYS;
375 return -1;
376 #endif
377 }
378 else
379 return 0;
380 }
381
382 /*
383 * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
384 *
385 * Not all platforms have fdatasync; treat as fsync if not available.
386 */
387 int
pg_fdatasync(int fd)388 pg_fdatasync(int fd)
389 {
390 if (enableFsync)
391 {
392 #ifdef HAVE_FDATASYNC
393 return fdatasync(fd);
394 #else
395 return fsync(fd);
396 #endif
397 }
398 else
399 return 0;
400 }
401
402 /*
403 * pg_flush_data --- advise OS that the described dirty data should be flushed
404 *
405 * offset of 0 with nbytes 0 means that the entire file should be flushed;
406 * in this case, this function may have side-effects on the file's
407 * seek position!
408 */
409 void
pg_flush_data(int fd,off_t offset,off_t nbytes)410 pg_flush_data(int fd, off_t offset, off_t nbytes)
411 {
412 /*
413 * Right now file flushing is primarily used to avoid making later
414 * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
415 * if fsyncs are disabled - that's a decision we might want to make
416 * configurable at some point.
417 */
418 if (!enableFsync)
419 return;
420
421 /*
422 * We compile all alternatives that are supported on the current platform,
423 * to find portability problems more easily.
424 */
425 #if defined(HAVE_SYNC_FILE_RANGE)
426 {
427 int rc;
428 static bool not_implemented_by_kernel = false;
429
430 if (not_implemented_by_kernel)
431 return;
432
433 /*
434 * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
435 * tells the OS that writeback for the specified blocks should be
436 * started, but that we don't want to wait for completion. Note that
437 * this call might block if too much dirty data exists in the range.
438 * This is the preferable method on OSs supporting it, as it works
439 * reliably when available (contrast to msync()) and doesn't flush out
440 * clean data (like FADV_DONTNEED).
441 */
442 rc = sync_file_range(fd, offset, nbytes,
443 SYNC_FILE_RANGE_WRITE);
444 if (rc != 0)
445 {
446 int elevel;
447
448 /*
449 * For systems that don't have an implementation of
450 * sync_file_range() such as Windows WSL, generate only one
451 * warning and then suppress all further attempts by this process.
452 */
453 if (errno == ENOSYS)
454 {
455 elevel = WARNING;
456 not_implemented_by_kernel = true;
457 }
458 else
459 elevel = data_sync_elevel(WARNING);
460
461 ereport(elevel,
462 (errcode_for_file_access(),
463 errmsg("could not flush dirty data: %m")));
464 }
465
466 return;
467 }
468 #endif
469 #if !defined(WIN32) && defined(MS_ASYNC)
470 {
471 void *p;
472 static int pagesize = 0;
473
474 /*
475 * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
476 * writeback. On linux it only does so if MS_SYNC is specified, but
477 * then it does the writeback synchronously. Luckily all common linux
478 * systems have sync_file_range(). This is preferable over
479 * FADV_DONTNEED because it doesn't flush out clean data.
480 *
481 * We map the file (mmap()), tell the kernel to sync back the contents
482 * (msync()), and then remove the mapping again (munmap()).
483 */
484
485 /* mmap() needs actual length if we want to map whole file */
486 if (offset == 0 && nbytes == 0)
487 {
488 nbytes = lseek(fd, 0, SEEK_END);
489 if (nbytes < 0)
490 {
491 ereport(WARNING,
492 (errcode_for_file_access(),
493 errmsg("could not determine dirty data size: %m")));
494 return;
495 }
496 }
497
498 /*
499 * Some platforms reject partial-page mmap() attempts. To deal with
500 * that, just truncate the request to a page boundary. If any extra
501 * bytes don't get flushed, well, it's only a hint anyway.
502 */
503
504 /* fetch pagesize only once */
505 if (pagesize == 0)
506 pagesize = sysconf(_SC_PAGESIZE);
507
508 /* align length to pagesize, dropping any fractional page */
509 if (pagesize > 0)
510 nbytes = (nbytes / pagesize) * pagesize;
511
512 /* fractional-page request is a no-op */
513 if (nbytes <= 0)
514 return;
515
516 /*
517 * mmap could well fail, particularly on 32-bit platforms where there
518 * may simply not be enough address space. If so, silently fall
519 * through to the next implementation.
520 */
521 if (nbytes <= (off_t) SSIZE_MAX)
522 p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
523 else
524 p = MAP_FAILED;
525
526 if (p != MAP_FAILED)
527 {
528 int rc;
529
530 rc = msync(p, (size_t) nbytes, MS_ASYNC);
531 if (rc != 0)
532 {
533 ereport(data_sync_elevel(WARNING),
534 (errcode_for_file_access(),
535 errmsg("could not flush dirty data: %m")));
536 /* NB: need to fall through to munmap()! */
537 }
538
539 rc = munmap(p, (size_t) nbytes);
540 if (rc != 0)
541 {
542 /* FATAL error because mapping would remain */
543 ereport(FATAL,
544 (errcode_for_file_access(),
545 errmsg("could not munmap() while flushing data: %m")));
546 }
547
548 return;
549 }
550 }
551 #endif
552 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
553 {
554 int rc;
555
556 /*
557 * Signal the kernel that the passed in range should not be cached
558 * anymore. This has the, desired, side effect of writing out dirty
559 * data, and the, undesired, side effect of likely discarding useful
560 * clean cached blocks. For the latter reason this is the least
561 * preferable method.
562 */
563
564 rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
565
566 if (rc != 0)
567 {
568 /* don't error out, this is just a performance optimization */
569 ereport(WARNING,
570 (errcode_for_file_access(),
571 errmsg("could not flush dirty data: %m")));
572 }
573
574 return;
575 }
576 #endif
577 }
578
579
580 /*
581 * fsync_fname -- fsync a file or directory, handling errors properly
582 *
583 * Try to fsync a file or directory. When doing the latter, ignore errors that
584 * indicate the OS just doesn't allow/require fsyncing directories.
585 */
586 void
fsync_fname(const char * fname,bool isdir)587 fsync_fname(const char *fname, bool isdir)
588 {
589 fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
590 }
591
592 /*
593 * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
594 *
595 * This routine ensures that, after returning, the effect of renaming file
596 * persists in case of a crash. A crash while this routine is running will
597 * leave you with either the pre-existing or the moved file in place of the
598 * new file; no mixed state or truncated files are possible.
599 *
600 * It does so by using fsync on the old filename and the possibly existing
601 * target filename before the rename, and the target file and directory after.
602 *
603 * Note that rename() cannot be used across arbitrary directories, as they
604 * might not be on the same filesystem. Therefore this routine does not
605 * support renaming across directories.
606 *
607 * Log errors with the caller specified severity.
608 *
609 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
610 * valid upon return.
611 */
612 int
durable_rename(const char * oldfile,const char * newfile,int elevel)613 durable_rename(const char *oldfile, const char *newfile, int elevel)
614 {
615 int fd;
616
617 /*
618 * First fsync the old and target path (if it exists), to ensure that they
619 * are properly persistent on disk. Syncing the target file is not
620 * strictly necessary, but it makes it easier to reason about crashes;
621 * because it's then guaranteed that either source or target file exists
622 * after a crash.
623 */
624 if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
625 return -1;
626
627 fd = OpenTransientFile((char *) newfile, PG_BINARY | O_RDWR, 0);
628 if (fd < 0)
629 {
630 if (errno != ENOENT)
631 {
632 ereport(elevel,
633 (errcode_for_file_access(),
634 errmsg("could not open file \"%s\": %m", newfile)));
635 return -1;
636 }
637 }
638 else
639 {
640 if (pg_fsync(fd) != 0)
641 {
642 int save_errno;
643
644 /* close file upon error, might not be in transaction context */
645 save_errno = errno;
646 CloseTransientFile(fd);
647 errno = save_errno;
648
649 ereport(elevel,
650 (errcode_for_file_access(),
651 errmsg("could not fsync file \"%s\": %m", newfile)));
652 return -1;
653 }
654 CloseTransientFile(fd);
655 }
656
657 /* Time to do the real deal... */
658 if (rename(oldfile, newfile) < 0)
659 {
660 ereport(elevel,
661 (errcode_for_file_access(),
662 errmsg("could not rename file \"%s\" to \"%s\": %m",
663 oldfile, newfile)));
664 return -1;
665 }
666
667 /*
668 * To guarantee renaming the file is persistent, fsync the file with its
669 * new name, and its containing directory.
670 */
671 if (fsync_fname_ext(newfile, false, false, elevel) != 0)
672 return -1;
673
674 if (fsync_parent_path(newfile, elevel) != 0)
675 return -1;
676
677 return 0;
678 }
679
/*
 * durable_unlink -- remove a file in a durable manner
 *
 * After this returns successfully, the removal of fname survives a crash;
 * a crash while it is running does not leave a mixed state.  That is
 * achieved by fsyncing the file's parent directory once the unlink itself
 * has been done.
 *
 * Errors are logged with the severity specified by the caller (elevel).
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  Note that errno is not
 * valid upon return.
 */
int
durable_unlink(const char *fname, int elevel)
{
    int         parent_synced;

    if (unlink(fname) < 0)
    {
        ereport(elevel,
                (errcode_for_file_access(),
                 errmsg("could not remove file \"%s\": %m",
                        fname)));
        return -1;
    }

    /* the directory entry is gone; make that fact persistent */
    parent_synced = fsync_parent_path(fname, elevel);

    return (parent_synced != 0) ? -1 : 0;
}
716
/*
 * durable_link_or_rename -- rename a file in a durable manner.
 *
 * Works like durable_rename(), but tries (without guaranteeing it) not to
 * overwrite an existing target: where HAVE_WORKING_LINK is set, the new name
 * is created with link() and the old name is then unlinked; elsewhere a
 * plain rename() is used.
 *
 * Note that a crash in an unfortunate moment can leave you with two links to
 * the target file.
 *
 * Errors are logged with the caller specified severity (elevel).
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  Note that errno is not
 * valid upon return.
 */
int
durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
{
    /*
     * Flush the source first, so that a crash right after the link/rename
     * cannot leave a file with invalid contents under the new name.
     */
    if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
        return -1;

#if HAVE_WORKING_LINK
    if (link(oldfile, newfile) < 0)
    {
        ereport(elevel,
                (errcode_for_file_access(),
                 errmsg("could not link file \"%s\" to \"%s\": %m",
                        oldfile, newfile)));
        return -1;
    }
    /* the old name is dropped without checking the result */
    unlink(oldfile);
#else
    /* XXX: Add racy file existence check? */
    if (rename(oldfile, newfile) < 0)
    {
        ereport(elevel,
                (errcode_for_file_access(),
                 errmsg("could not rename file \"%s\" to \"%s\": %m",
                        oldfile, newfile)));
        return -1;
    }
#endif

    /*
     * To survive an OS crash, both the new entry and then its parent
     * directory must be flushed; stop at the first failure.
     */
    if (fsync_fname_ext(newfile, false, false, elevel) != 0 ||
        fsync_parent_path(newfile, elevel) != 0)
        return -1;

    return 0;
}
776
777 /*
778 * InitFileAccess --- initialize this module during backend startup
779 *
780 * This is called during either normal or standalone backend start.
781 * It is *not* called in the postmaster.
782 */
783 void
InitFileAccess(void)784 InitFileAccess(void)
785 {
786 Assert(SizeVfdCache == 0); /* call me only once */
787
788 /* initialize cache header entry */
789 VfdCache = (Vfd *) malloc(sizeof(Vfd));
790 if (VfdCache == NULL)
791 ereport(FATAL,
792 (errcode(ERRCODE_OUT_OF_MEMORY),
793 errmsg("out of memory")));
794
795 MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
796 VfdCache->fd = VFD_CLOSED;
797
798 SizeVfdCache = 1;
799
800 /* register proc-exit hook to ensure temp files are dropped at exit */
801 on_proc_exit(AtProcExit_Files, 0);
802 }
803
804 /*
805 * count_usable_fds --- count how many FDs the system will let us open,
806 * and estimate how many are already open.
807 *
808 * We stop counting if usable_fds reaches max_to_probe. Note: a small
809 * value of max_to_probe might result in an underestimate of already_open;
810 * we must fill in any "gaps" in the set of used FDs before the calculation
811 * of already_open will give the right answer. In practice, max_to_probe
812 * of a couple of dozen should be enough to ensure good results.
813 *
814 * We assume stdin (FD 0) is available for dup'ing
815 */
816 static void
count_usable_fds(int max_to_probe,int * usable_fds,int * already_open)817 count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
818 {
819 int *fd;
820 int size;
821 int used = 0;
822 int highestfd = 0;
823 int j;
824
825 #ifdef HAVE_GETRLIMIT
826 struct rlimit rlim;
827 int getrlimit_status;
828 #endif
829
830 size = 1024;
831 fd = (int *) palloc(size * sizeof(int));
832
833 #ifdef HAVE_GETRLIMIT
834 #ifdef RLIMIT_NOFILE /* most platforms use RLIMIT_NOFILE */
835 getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
836 #else /* but BSD doesn't ... */
837 getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
838 #endif /* RLIMIT_NOFILE */
839 if (getrlimit_status != 0)
840 ereport(WARNING, (errmsg("getrlimit failed: %m")));
841 #endif /* HAVE_GETRLIMIT */
842
843 /* dup until failure or probe limit reached */
844 for (;;)
845 {
846 int thisfd;
847
848 #ifdef HAVE_GETRLIMIT
849
850 /*
851 * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
852 * some platforms
853 */
854 if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
855 break;
856 #endif
857
858 thisfd = dup(0);
859 if (thisfd < 0)
860 {
861 /* Expect EMFILE or ENFILE, else it's fishy */
862 if (errno != EMFILE && errno != ENFILE)
863 elog(WARNING, "dup(0) failed after %d successes: %m", used);
864 break;
865 }
866
867 if (used >= size)
868 {
869 size *= 2;
870 fd = (int *) repalloc(fd, size * sizeof(int));
871 }
872 fd[used++] = thisfd;
873
874 if (highestfd < thisfd)
875 highestfd = thisfd;
876
877 if (used >= max_to_probe)
878 break;
879 }
880
881 /* release the files we opened */
882 for (j = 0; j < used; j++)
883 close(fd[j]);
884
885 pfree(fd);
886
887 /*
888 * Return results. usable_fds is just the number of successful dups. We
889 * assume that the system limit is highestfd+1 (remember 0 is a legal FD
890 * number) and so already_open is highestfd+1 - usable_fds.
891 */
892 *usable_fds = used;
893 *already_open = highestfd + 1 - used;
894 }
895
896 /*
897 * set_max_safe_fds
898 * Determine number of filedescriptors that fd.c is allowed to use
899 */
900 void
set_max_safe_fds(void)901 set_max_safe_fds(void)
902 {
903 int usable_fds;
904 int already_open;
905
906 /*----------
907 * We want to set max_safe_fds to
908 * MIN(usable_fds, max_files_per_process - already_open)
909 * less the slop factor for files that are opened without consulting
910 * fd.c. This ensures that we won't exceed either max_files_per_process
911 * or the experimentally-determined EMFILE limit.
912 *----------
913 */
914 count_usable_fds(max_files_per_process,
915 &usable_fds, &already_open);
916
917 max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
918
919 /*
920 * Take off the FDs reserved for system() etc.
921 */
922 max_safe_fds -= NUM_RESERVED_FDS;
923
924 /*
925 * Make sure we still have enough to get by.
926 */
927 if (max_safe_fds < FD_MINFREE)
928 ereport(FATAL,
929 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
930 errmsg("insufficient file descriptors available to start server process"),
931 errdetail("System allows %d, we need at least %d.",
932 max_safe_fds + NUM_RESERVED_FDS,
933 FD_MINFREE + NUM_RESERVED_FDS)));
934
935 elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
936 max_safe_fds, usable_fds, already_open);
937 }
938
939 /*
940 * BasicOpenFile --- same as open(2) except can free other FDs if needed
941 *
942 * This is exported for use by places that really want a plain kernel FD,
943 * but need to be proof against running out of FDs. Once an FD has been
944 * successfully returned, it is the caller's responsibility to ensure that
945 * it will not be leaked on ereport()! Most users should *not* call this
946 * routine directly, but instead use the VFD abstraction level, which
947 * provides protection against descriptor leaks as well as management of
948 * files that need to be open for more than a short period of time.
949 *
950 * Ideally this should be the *only* direct call of open() in the backend.
951 * In practice, the postmaster calls open() directly, and there are some
952 * direct open() calls done early in backend startup. Those are OK since
953 * this module wouldn't have any open files to close at that point anyway.
954 */
955 int
BasicOpenFile(FileName fileName,int fileFlags,int fileMode)956 BasicOpenFile(FileName fileName, int fileFlags, int fileMode)
957 {
958 int fd;
959
960 tryAgain:
961 fd = open(fileName, fileFlags, fileMode);
962
963 if (fd >= 0)
964 return fd; /* success! */
965
966 if (errno == EMFILE || errno == ENFILE)
967 {
968 int save_errno = errno;
969
970 ereport(LOG,
971 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
972 errmsg("out of file descriptors: %m; release and retry")));
973 errno = 0;
974 if (ReleaseLruFile())
975 goto tryAgain;
976 errno = save_errno;
977 }
978
979 return -1; /* failure */
980 }
981
#if defined(FDDEBUG)

/*
 * _dump_lru --- debugging aid: log the LRU ring as one line
 *
 * Starts at the most recently used entry (the header's lruLessRecently
 * link) and follows the lruLessRecently links until the header (index 0) is
 * reached again; that terminating 0 is printed too.  Output that does not
 * fit the local buffer is truncated by snprintf.
 */
static void
_dump_lru(void)
{
    char        line[2048];
    int         cur = VfdCache[0].lruLessRecently;
    size_t      len;

    snprintf(line, sizeof(line), "LRU: MOST %d ", cur);
    while (cur != 0)
    {
        cur = VfdCache[cur].lruLessRecently;
        len = strlen(line);
        snprintf(line + len, sizeof(line) - len, "%d ", cur);
    }
    len = strlen(line);
    snprintf(line + len, sizeof(line) - len, "LEAST");
    elog(LOG, "%s", line);
}
#endif                          /* FDDEBUG */
1002
1003 static void
Delete(File file)1004 Delete(File file)
1005 {
1006 Vfd *vfdP;
1007
1008 Assert(file != 0);
1009
1010 DO_DB(elog(LOG, "Delete %d (%s)",
1011 file, VfdCache[file].fileName));
1012 DO_DB(_dump_lru());
1013
1014 vfdP = &VfdCache[file];
1015
1016 VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1017 VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1018
1019 DO_DB(_dump_lru());
1020 }
1021
1022 static void
LruDelete(File file)1023 LruDelete(File file)
1024 {
1025 Vfd *vfdP;
1026
1027 Assert(file != 0);
1028
1029 DO_DB(elog(LOG, "LruDelete %d (%s)",
1030 file, VfdCache[file].fileName));
1031
1032 vfdP = &VfdCache[file];
1033
1034 /*
1035 * Normally we should know the seek position, but if for some reason we
1036 * have lost track of it, try again to get it. If we still can't get it,
1037 * we have a problem: we will be unable to restore the file seek position
1038 * when and if the file is re-opened. But we can't really throw an error
1039 * and refuse to close the file, or activities such as transaction cleanup
1040 * will be broken.
1041 */
1042 if (FilePosIsUnknown(vfdP->seekPos))
1043 {
1044 vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
1045 if (FilePosIsUnknown(vfdP->seekPos))
1046 elog(LOG, "could not seek file \"%s\" before closing: %m",
1047 vfdP->fileName);
1048 }
1049
1050 /*
1051 * Close the file. We aren't expecting this to fail; if it does, better
1052 * to leak the FD than to mess up our internal state.
1053 */
1054 if (close(vfdP->fd))
1055 elog(vfdP->fdstate & FD_TEMPORARY ? LOG : data_sync_elevel(LOG),
1056 "could not close file \"%s\": %m", vfdP->fileName);
1057 vfdP->fd = VFD_CLOSED;
1058 --nfile;
1059
1060 /* delete the vfd record from the LRU ring */
1061 Delete(file);
1062 }
1063
1064 static void
Insert(File file)1065 Insert(File file)
1066 {
1067 Vfd *vfdP;
1068
1069 Assert(file != 0);
1070
1071 DO_DB(elog(LOG, "Insert %d (%s)",
1072 file, VfdCache[file].fileName));
1073 DO_DB(_dump_lru());
1074
1075 vfdP = &VfdCache[file];
1076
1077 vfdP->lruMoreRecently = 0;
1078 vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1079 VfdCache[0].lruLessRecently = file;
1080 VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1081
1082 DO_DB(_dump_lru());
1083 }
1084
1085 /* returns 0 on success, -1 on re-open failure (with errno set) */
1086 static int
LruInsert(File file)1087 LruInsert(File file)
1088 {
1089 Vfd *vfdP;
1090
1091 Assert(file != 0);
1092
1093 DO_DB(elog(LOG, "LruInsert %d (%s)",
1094 file, VfdCache[file].fileName));
1095
1096 vfdP = &VfdCache[file];
1097
1098 if (FileIsNotOpen(file))
1099 {
1100 /* Close excess kernel FDs. */
1101 ReleaseLruFiles();
1102
1103 /*
1104 * The open could still fail for lack of file descriptors, eg due to
1105 * overall system file table being full. So, be prepared to release
1106 * another FD if necessary...
1107 */
1108 vfdP->fd = BasicOpenFile(vfdP->fileName, vfdP->fileFlags,
1109 vfdP->fileMode);
1110 if (vfdP->fd < 0)
1111 {
1112 DO_DB(elog(LOG, "re-open failed: %m"));
1113 return -1;
1114 }
1115 else
1116 {
1117 ++nfile;
1118 }
1119
1120 /*
1121 * Seek to the right position. We need no special case for seekPos
1122 * equal to FileUnknownPos, as lseek() will certainly reject that
1123 * (thus completing the logic noted in LruDelete() that we will fail
1124 * to re-open a file if we couldn't get its seek position before
1125 * closing).
1126 */
1127 if (vfdP->seekPos != (off_t) 0)
1128 {
1129 if (lseek(vfdP->fd, vfdP->seekPos, SEEK_SET) < 0)
1130 {
1131 /*
1132 * If we fail to restore the seek position, treat it like an
1133 * open() failure.
1134 */
1135 int save_errno = errno;
1136
1137 elog(LOG, "could not seek file \"%s\" after re-opening: %m",
1138 vfdP->fileName);
1139 (void) close(vfdP->fd);
1140 vfdP->fd = VFD_CLOSED;
1141 --nfile;
1142 errno = save_errno;
1143 return -1;
1144 }
1145 }
1146 }
1147
1148 /*
1149 * put it at the head of the Lru ring
1150 */
1151
1152 Insert(file);
1153
1154 return 0;
1155 }
1156
1157 /*
1158 * Release one kernel FD by closing the least-recently-used VFD.
1159 */
1160 static bool
ReleaseLruFile(void)1161 ReleaseLruFile(void)
1162 {
1163 DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1164
1165 if (nfile > 0)
1166 {
1167 /*
1168 * There are opened files and so there should be at least one used vfd
1169 * in the ring.
1170 */
1171 Assert(VfdCache[0].lruMoreRecently != 0);
1172 LruDelete(VfdCache[0].lruMoreRecently);
1173 return true; /* freed a file */
1174 }
1175 return false; /* no files available to free */
1176 }
1177
1178 /*
1179 * Release kernel FDs as needed to get under the max_safe_fds limit.
1180 * After calling this, it's OK to try to open another file.
1181 */
1182 static void
ReleaseLruFiles(void)1183 ReleaseLruFiles(void)
1184 {
1185 while (nfile + numAllocatedDescs >= max_safe_fds)
1186 {
1187 if (!ReleaseLruFile())
1188 break;
1189 }
1190 }
1191
1192 static File
AllocateVfd(void)1193 AllocateVfd(void)
1194 {
1195 Index i;
1196 File file;
1197
1198 DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1199
1200 Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1201
1202 if (VfdCache[0].nextFree == 0)
1203 {
1204 /*
1205 * The free list is empty so it is time to increase the size of the
1206 * array. We choose to double it each time this happens. However,
1207 * there's not much point in starting *real* small.
1208 */
1209 Size newCacheSize = SizeVfdCache * 2;
1210 Vfd *newVfdCache;
1211
1212 if (newCacheSize < 32)
1213 newCacheSize = 32;
1214
1215 /*
1216 * Be careful not to clobber VfdCache ptr if realloc fails.
1217 */
1218 newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1219 if (newVfdCache == NULL)
1220 ereport(ERROR,
1221 (errcode(ERRCODE_OUT_OF_MEMORY),
1222 errmsg("out of memory")));
1223 VfdCache = newVfdCache;
1224
1225 /*
1226 * Initialize the new entries and link them into the free list.
1227 */
1228 for (i = SizeVfdCache; i < newCacheSize; i++)
1229 {
1230 MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1231 VfdCache[i].nextFree = i + 1;
1232 VfdCache[i].fd = VFD_CLOSED;
1233 }
1234 VfdCache[newCacheSize - 1].nextFree = 0;
1235 VfdCache[0].nextFree = SizeVfdCache;
1236
1237 /*
1238 * Record the new size
1239 */
1240 SizeVfdCache = newCacheSize;
1241 }
1242
1243 file = VfdCache[0].nextFree;
1244
1245 VfdCache[0].nextFree = VfdCache[file].nextFree;
1246
1247 return file;
1248 }
1249
1250 static void
FreeVfd(File file)1251 FreeVfd(File file)
1252 {
1253 Vfd *vfdP = &VfdCache[file];
1254
1255 DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1256 file, vfdP->fileName ? vfdP->fileName : ""));
1257
1258 if (vfdP->fileName != NULL)
1259 {
1260 free(vfdP->fileName);
1261 vfdP->fileName = NULL;
1262 }
1263 vfdP->fdstate = 0x0;
1264
1265 vfdP->nextFree = VfdCache[0].nextFree;
1266 VfdCache[0].nextFree = file;
1267 }
1268
1269 /* returns 0 on success, -1 on re-open failure (with errno set) */
1270 static int
FileAccess(File file)1271 FileAccess(File file)
1272 {
1273 int returnValue;
1274
1275 DO_DB(elog(LOG, "FileAccess %d (%s)",
1276 file, VfdCache[file].fileName));
1277
1278 /*
1279 * Is the file open? If not, open it and put it at the head of the LRU
1280 * ring (possibly closing the least recently used file to get an FD).
1281 */
1282
1283 if (FileIsNotOpen(file))
1284 {
1285 returnValue = LruInsert(file);
1286 if (returnValue != 0)
1287 return returnValue;
1288 }
1289 else if (VfdCache[0].lruLessRecently != file)
1290 {
1291 /*
1292 * We now know that the file is open and that it is not the last one
1293 * accessed, so we need to move it to the head of the Lru ring.
1294 */
1295
1296 Delete(file);
1297 Insert(file);
1298 }
1299
1300 return 0;
1301 }
1302
1303 /*
1304 * Called when we get a shared invalidation message on some relation.
1305 */
1306 #ifdef NOT_USED
1307 void
FileInvalidate(File file)1308 FileInvalidate(File file)
1309 {
1310 Assert(FileIsValid(file));
1311 if (!FileIsNotOpen(file))
1312 LruDelete(file);
1313 }
1314 #endif
1315
1316 /*
1317 * open a file in an arbitrary directory
1318 *
1319 * NB: if the passed pathname is relative (which it usually is),
1320 * it will be interpreted relative to the process' working directory
1321 * (which should always be $PGDATA when this code is running).
1322 */
1323 File
PathNameOpenFile(FileName fileName,int fileFlags,int fileMode)1324 PathNameOpenFile(FileName fileName, int fileFlags, int fileMode)
1325 {
1326 char *fnamecopy;
1327 File file;
1328 Vfd *vfdP;
1329
1330 DO_DB(elog(LOG, "PathNameOpenFile: %s %x %o",
1331 fileName, fileFlags, fileMode));
1332
1333 /*
1334 * We need a malloc'd copy of the file name; fail cleanly if no room.
1335 */
1336 fnamecopy = strdup(fileName);
1337 if (fnamecopy == NULL)
1338 ereport(ERROR,
1339 (errcode(ERRCODE_OUT_OF_MEMORY),
1340 errmsg("out of memory")));
1341
1342 file = AllocateVfd();
1343 vfdP = &VfdCache[file];
1344
1345 /* Close excess kernel FDs. */
1346 ReleaseLruFiles();
1347
1348 vfdP->fd = BasicOpenFile(fileName, fileFlags, fileMode);
1349
1350 if (vfdP->fd < 0)
1351 {
1352 int save_errno = errno;
1353
1354 FreeVfd(file);
1355 free(fnamecopy);
1356 errno = save_errno;
1357 return -1;
1358 }
1359 ++nfile;
1360 DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1361 vfdP->fd));
1362
1363 vfdP->fileName = fnamecopy;
1364 /* Saved flags are adjusted to be OK for re-opening file */
1365 vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1366 vfdP->fileMode = fileMode;
1367 vfdP->seekPos = 0;
1368 vfdP->fileSize = 0;
1369 vfdP->fdstate = 0x0;
1370 vfdP->resowner = NULL;
1371
1372 Insert(file);
1373
1374 return file;
1375 }
1376
1377 /*
1378 * Open a temporary file that will disappear when we close it.
1379 *
1380 * This routine takes care of generating an appropriate tempfile name.
1381 * There's no need to pass in fileFlags or fileMode either, since only
1382 * one setting makes any sense for a temp file.
1383 *
1384 * Unless interXact is true, the file is remembered by CurrentResourceOwner
1385 * to ensure it's closed and deleted when it's no longer needed, typically at
1386 * the end-of-transaction. In most cases, you don't want temporary files to
1387 * outlive the transaction that created them, so this should be false -- but
1388 * if you need "somewhat" temporary storage, this might be useful. In either
1389 * case, the file is removed when the File is explicitly closed.
1390 */
1391 File
OpenTemporaryFile(bool interXact)1392 OpenTemporaryFile(bool interXact)
1393 {
1394 File file = 0;
1395
1396 /*
1397 * Make sure the current resource owner has space for this File before we
1398 * open it, if we'll be registering it below.
1399 */
1400 if (!interXact)
1401 ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1402
1403 /*
1404 * If some temp tablespace(s) have been given to us, try to use the next
1405 * one. If a given tablespace can't be found, we silently fall back to
1406 * the database's default tablespace.
1407 *
1408 * BUT: if the temp file is slated to outlive the current transaction,
1409 * force it into the database's default tablespace, so that it will not
1410 * pose a threat to possible tablespace drop attempts.
1411 */
1412 if (numTempTableSpaces > 0 && !interXact)
1413 {
1414 Oid tblspcOid = GetNextTempTableSpace();
1415
1416 if (OidIsValid(tblspcOid))
1417 file = OpenTemporaryFileInTablespace(tblspcOid, false);
1418 }
1419
1420 /*
1421 * If not, or if tablespace is bad, create in database's default
1422 * tablespace. MyDatabaseTableSpace should normally be set before we get
1423 * here, but just in case it isn't, fall back to pg_default tablespace.
1424 */
1425 if (file <= 0)
1426 file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
1427 MyDatabaseTableSpace :
1428 DEFAULTTABLESPACE_OID,
1429 true);
1430
1431 /* Mark it for deletion at close */
1432 VfdCache[file].fdstate |= FD_TEMPORARY;
1433
1434 /* Register it with the current resource owner */
1435 if (!interXact)
1436 {
1437 VfdCache[file].fdstate |= FD_XACT_TEMPORARY;
1438
1439 VfdCache[file].resowner = CurrentResourceOwner;
1440 ResourceOwnerRememberFile(CurrentResourceOwner, file);
1441
1442 /* ensure cleanup happens at eoxact */
1443 have_xact_temporary_files = true;
1444 }
1445
1446 return file;
1447 }
1448
1449 /*
1450 * Open a temporary file in a specific tablespace.
1451 * Subroutine for OpenTemporaryFile, which see for details.
1452 */
1453 static File
OpenTemporaryFileInTablespace(Oid tblspcOid,bool rejectError)1454 OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1455 {
1456 char tempdirpath[MAXPGPATH];
1457 char tempfilepath[MAXPGPATH];
1458 File file;
1459
1460 /*
1461 * Identify the tempfile directory for this tablespace.
1462 *
1463 * If someone tries to specify pg_global, use pg_default instead.
1464 */
1465 if (tblspcOid == DEFAULTTABLESPACE_OID ||
1466 tblspcOid == GLOBALTABLESPACE_OID)
1467 {
1468 /* The default tablespace is {datadir}/base */
1469 snprintf(tempdirpath, sizeof(tempdirpath), "base/%s",
1470 PG_TEMP_FILES_DIR);
1471 }
1472 else
1473 {
1474 /* All other tablespaces are accessed via symlinks */
1475 snprintf(tempdirpath, sizeof(tempdirpath), "pg_tblspc/%u/%s/%s",
1476 tblspcOid, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
1477 }
1478
1479 /*
1480 * Generate a tempfile name that should be unique within the current
1481 * database instance.
1482 */
1483 snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1484 tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1485
1486 /*
1487 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1488 * temp file that can be reused.
1489 */
1490 file = PathNameOpenFile(tempfilepath,
1491 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY,
1492 0600);
1493 if (file <= 0)
1494 {
1495 /*
1496 * We might need to create the tablespace's tempfile directory, if no
1497 * one has yet done so.
1498 *
1499 * Don't check for error from mkdir; it could fail if someone else
1500 * just did the same thing. If it doesn't work then we'll bomb out on
1501 * the second create attempt, instead.
1502 */
1503 mkdir(tempdirpath, S_IRWXU);
1504
1505 file = PathNameOpenFile(tempfilepath,
1506 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY,
1507 0600);
1508 if (file <= 0 && rejectError)
1509 elog(ERROR, "could not create temporary file \"%s\": %m",
1510 tempfilepath);
1511 }
1512
1513 return file;
1514 }
1515
1516 /*
1517 * close a file when done with it
1518 */
1519 void
FileClose(File file)1520 FileClose(File file)
1521 {
1522 Vfd *vfdP;
1523
1524 Assert(FileIsValid(file));
1525
1526 DO_DB(elog(LOG, "FileClose: %d (%s)",
1527 file, VfdCache[file].fileName));
1528
1529 vfdP = &VfdCache[file];
1530
1531 if (!FileIsNotOpen(file))
1532 {
1533 /* close the file */
1534 if (close(vfdP->fd))
1535 {
1536 /*
1537 * We may need to panic on failure to close non-temporary files;
1538 * see LruDelete.
1539 */
1540 elog(vfdP->fdstate & FD_TEMPORARY ? LOG : data_sync_elevel(LOG),
1541 "could not close file \"%s\": %m", vfdP->fileName);
1542 }
1543
1544 --nfile;
1545 vfdP->fd = VFD_CLOSED;
1546
1547 /* remove the file from the lru ring */
1548 Delete(file);
1549 }
1550
1551 /*
1552 * Delete the file if it was temporary, and make a log entry if wanted
1553 */
1554 if (vfdP->fdstate & FD_TEMPORARY)
1555 {
1556 struct stat filestats;
1557 int stat_errno;
1558
1559 /*
1560 * If we get an error, as could happen within the ereport/elog calls,
1561 * we'll come right back here during transaction abort. Reset the
1562 * flag to ensure that we can't get into an infinite loop. This code
1563 * is arranged to ensure that the worst-case consequence is failing to
1564 * emit log message(s), not failing to attempt the unlink.
1565 */
1566 vfdP->fdstate &= ~FD_TEMPORARY;
1567
1568 /* Subtract its size from current usage (do first in case of error) */
1569 temporary_files_size -= vfdP->fileSize;
1570 vfdP->fileSize = 0;
1571
1572 /* first try the stat() */
1573 if (stat(vfdP->fileName, &filestats))
1574 stat_errno = errno;
1575 else
1576 stat_errno = 0;
1577
1578 /* in any case do the unlink */
1579 if (unlink(vfdP->fileName))
1580 elog(LOG, "could not unlink file \"%s\": %m", vfdP->fileName);
1581
1582 /* and last report the stat results */
1583 if (stat_errno == 0)
1584 {
1585 pgstat_report_tempfile(filestats.st_size);
1586
1587 if (log_temp_files >= 0)
1588 {
1589 if ((filestats.st_size / 1024) >= log_temp_files)
1590 ereport(LOG,
1591 (errmsg("temporary file: path \"%s\", size %lu",
1592 vfdP->fileName,
1593 (unsigned long) filestats.st_size)));
1594 }
1595 }
1596 else
1597 {
1598 errno = stat_errno;
1599 elog(LOG, "could not stat file \"%s\": %m", vfdP->fileName);
1600 }
1601 }
1602
1603 /* Unregister it from the resource owner */
1604 if (vfdP->resowner)
1605 ResourceOwnerForgetFile(vfdP->resowner, file);
1606
1607 /*
1608 * Return the Vfd slot to the free list
1609 */
1610 FreeVfd(file);
1611 }
1612
1613 /*
1614 * FilePrefetch - initiate asynchronous read of a given range of the file.
1615 * The logical seek position is unaffected.
1616 *
1617 * Currently the only implementation of this function is using posix_fadvise
1618 * which is the simplest standardized interface that accomplishes this.
1619 * We could add an implementation using libaio in the future; but note that
1620 * this API is inappropriate for libaio, which wants to have a buffer provided
1621 * to read into.
1622 */
1623 int
FilePrefetch(File file,off_t offset,int amount,uint32 wait_event_info)1624 FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
1625 {
1626 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
1627 int returnCode;
1628
1629 Assert(FileIsValid(file));
1630
1631 DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
1632 file, VfdCache[file].fileName,
1633 (int64) offset, amount));
1634
1635 returnCode = FileAccess(file);
1636 if (returnCode < 0)
1637 return returnCode;
1638
1639 pgstat_report_wait_start(wait_event_info);
1640 returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
1641 POSIX_FADV_WILLNEED);
1642 pgstat_report_wait_end();
1643
1644 return returnCode;
1645 #else
1646 Assert(FileIsValid(file));
1647 return 0;
1648 #endif
1649 }
1650
1651 void
FileWriteback(File file,off_t offset,off_t nbytes,uint32 wait_event_info)1652 FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
1653 {
1654 int returnCode;
1655
1656 Assert(FileIsValid(file));
1657
1658 DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
1659 file, VfdCache[file].fileName,
1660 (int64) offset, (int64) nbytes));
1661
1662 /*
1663 * Caution: do not call pg_flush_data with nbytes = 0, it could trash the
1664 * file's seek position. We prefer to define that as a no-op here.
1665 */
1666 if (nbytes <= 0)
1667 return;
1668
1669 returnCode = FileAccess(file);
1670 if (returnCode < 0)
1671 return;
1672
1673 pgstat_report_wait_start(wait_event_info);
1674 pg_flush_data(VfdCache[file].fd, offset, nbytes);
1675 pgstat_report_wait_end();
1676 }
1677
1678 int
FileRead(File file,char * buffer,int amount,uint32 wait_event_info)1679 FileRead(File file, char *buffer, int amount, uint32 wait_event_info)
1680 {
1681 int returnCode;
1682 Vfd *vfdP;
1683
1684 Assert(FileIsValid(file));
1685
1686 DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
1687 file, VfdCache[file].fileName,
1688 (int64) VfdCache[file].seekPos,
1689 amount, buffer));
1690
1691 returnCode = FileAccess(file);
1692 if (returnCode < 0)
1693 return returnCode;
1694
1695 vfdP = &VfdCache[file];
1696
1697 retry:
1698 pgstat_report_wait_start(wait_event_info);
1699 returnCode = read(vfdP->fd, buffer, amount);
1700 pgstat_report_wait_end();
1701
1702 if (returnCode >= 0)
1703 {
1704 /* if seekPos is unknown, leave it that way */
1705 if (!FilePosIsUnknown(vfdP->seekPos))
1706 vfdP->seekPos += returnCode;
1707 }
1708 else
1709 {
1710 /*
1711 * Windows may run out of kernel buffers and return "Insufficient
1712 * system resources" error. Wait a bit and retry to solve it.
1713 *
1714 * It is rumored that EINTR is also possible on some Unix filesystems,
1715 * in which case immediate retry is indicated.
1716 */
1717 #ifdef WIN32
1718 DWORD error = GetLastError();
1719
1720 switch (error)
1721 {
1722 case ERROR_NO_SYSTEM_RESOURCES:
1723 pg_usleep(1000L);
1724 errno = EINTR;
1725 break;
1726 default:
1727 _dosmaperr(error);
1728 break;
1729 }
1730 #endif
1731 /* OK to retry if interrupted */
1732 if (errno == EINTR)
1733 goto retry;
1734
1735 /* Trouble, so assume we don't know the file position anymore */
1736 vfdP->seekPos = FileUnknownPos;
1737 }
1738
1739 return returnCode;
1740 }
1741
1742 int
FileWrite(File file,char * buffer,int amount,uint32 wait_event_info)1743 FileWrite(File file, char *buffer, int amount, uint32 wait_event_info)
1744 {
1745 int returnCode;
1746 Vfd *vfdP;
1747
1748 Assert(FileIsValid(file));
1749
1750 DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
1751 file, VfdCache[file].fileName,
1752 (int64) VfdCache[file].seekPos,
1753 amount, buffer));
1754
1755 returnCode = FileAccess(file);
1756 if (returnCode < 0)
1757 return returnCode;
1758
1759 vfdP = &VfdCache[file];
1760
1761 /*
1762 * If enforcing temp_file_limit and it's a temp file, check to see if the
1763 * write would overrun temp_file_limit, and throw error if so. Note: it's
1764 * really a modularity violation to throw error here; we should set errno
1765 * and return -1. However, there's no way to report a suitable error
1766 * message if we do that. All current callers would just throw error
1767 * immediately anyway, so this is safe at present.
1768 */
1769 if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMPORARY))
1770 {
1771 off_t newPos;
1772
1773 /*
1774 * Normally we should know the seek position, but if for some reason
1775 * we have lost track of it, try again to get it. Here, it's fine to
1776 * throw an error if we still can't get it.
1777 */
1778 if (FilePosIsUnknown(vfdP->seekPos))
1779 {
1780 vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
1781 if (FilePosIsUnknown(vfdP->seekPos))
1782 elog(ERROR, "could not seek file \"%s\": %m", vfdP->fileName);
1783 }
1784
1785 newPos = vfdP->seekPos + amount;
1786 if (newPos > vfdP->fileSize)
1787 {
1788 uint64 newTotal = temporary_files_size;
1789
1790 newTotal += newPos - vfdP->fileSize;
1791 if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
1792 ereport(ERROR,
1793 (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
1794 errmsg("temporary file size exceeds temp_file_limit (%dkB)",
1795 temp_file_limit)));
1796 }
1797 }
1798
1799 retry:
1800 errno = 0;
1801 pgstat_report_wait_start(wait_event_info);
1802 returnCode = write(vfdP->fd, buffer, amount);
1803 pgstat_report_wait_end();
1804
1805 /* if write didn't set errno, assume problem is no disk space */
1806 if (returnCode != amount && errno == 0)
1807 errno = ENOSPC;
1808
1809 if (returnCode >= 0)
1810 {
1811 /* if seekPos is unknown, leave it that way */
1812 if (!FilePosIsUnknown(vfdP->seekPos))
1813 vfdP->seekPos += returnCode;
1814
1815 /*
1816 * Maintain fileSize and temporary_files_size if it's a temp file.
1817 *
1818 * If seekPos is -1 (unknown), this will do nothing; but we could only
1819 * get here in that state if we're not enforcing temporary_files_size,
1820 * so we don't care.
1821 */
1822 if (vfdP->fdstate & FD_TEMPORARY)
1823 {
1824 off_t newPos = vfdP->seekPos;
1825
1826 if (newPos > vfdP->fileSize)
1827 {
1828 temporary_files_size += newPos - vfdP->fileSize;
1829 vfdP->fileSize = newPos;
1830 }
1831 }
1832 }
1833 else
1834 {
1835 /*
1836 * See comments in FileRead()
1837 */
1838 #ifdef WIN32
1839 DWORD error = GetLastError();
1840
1841 switch (error)
1842 {
1843 case ERROR_NO_SYSTEM_RESOURCES:
1844 pg_usleep(1000L);
1845 errno = EINTR;
1846 break;
1847 default:
1848 _dosmaperr(error);
1849 break;
1850 }
1851 #endif
1852 /* OK to retry if interrupted */
1853 if (errno == EINTR)
1854 goto retry;
1855
1856 /* Trouble, so assume we don't know the file position anymore */
1857 vfdP->seekPos = FileUnknownPos;
1858 }
1859
1860 return returnCode;
1861 }
1862
1863 int
FileSync(File file,uint32 wait_event_info)1864 FileSync(File file, uint32 wait_event_info)
1865 {
1866 int returnCode;
1867
1868 Assert(FileIsValid(file));
1869
1870 DO_DB(elog(LOG, "FileSync: %d (%s)",
1871 file, VfdCache[file].fileName));
1872
1873 returnCode = FileAccess(file);
1874 if (returnCode < 0)
1875 return returnCode;
1876
1877 pgstat_report_wait_start(wait_event_info);
1878 returnCode = pg_fsync(VfdCache[file].fd);
1879 pgstat_report_wait_end();
1880
1881 return returnCode;
1882 }
1883
1884 off_t
FileSeek(File file,off_t offset,int whence)1885 FileSeek(File file, off_t offset, int whence)
1886 {
1887 Vfd *vfdP;
1888
1889 Assert(FileIsValid(file));
1890
1891 DO_DB(elog(LOG, "FileSeek: %d (%s) " INT64_FORMAT " " INT64_FORMAT " %d",
1892 file, VfdCache[file].fileName,
1893 (int64) VfdCache[file].seekPos,
1894 (int64) offset, whence));
1895
1896 vfdP = &VfdCache[file];
1897
1898 if (FileIsNotOpen(file))
1899 {
1900 switch (whence)
1901 {
1902 case SEEK_SET:
1903 if (offset < 0)
1904 {
1905 errno = EINVAL;
1906 return (off_t) -1;
1907 }
1908 vfdP->seekPos = offset;
1909 break;
1910 case SEEK_CUR:
1911 if (FilePosIsUnknown(vfdP->seekPos) ||
1912 vfdP->seekPos + offset < 0)
1913 {
1914 errno = EINVAL;
1915 return (off_t) -1;
1916 }
1917 vfdP->seekPos += offset;
1918 break;
1919 case SEEK_END:
1920 if (FileAccess(file) < 0)
1921 return (off_t) -1;
1922 vfdP->seekPos = lseek(vfdP->fd, offset, whence);
1923 break;
1924 default:
1925 elog(ERROR, "invalid whence: %d", whence);
1926 break;
1927 }
1928 }
1929 else
1930 {
1931 switch (whence)
1932 {
1933 case SEEK_SET:
1934 if (offset < 0)
1935 {
1936 errno = EINVAL;
1937 return (off_t) -1;
1938 }
1939 if (vfdP->seekPos != offset)
1940 vfdP->seekPos = lseek(vfdP->fd, offset, whence);
1941 break;
1942 case SEEK_CUR:
1943 if (offset != 0 || FilePosIsUnknown(vfdP->seekPos))
1944 vfdP->seekPos = lseek(vfdP->fd, offset, whence);
1945 break;
1946 case SEEK_END:
1947 vfdP->seekPos = lseek(vfdP->fd, offset, whence);
1948 break;
1949 default:
1950 elog(ERROR, "invalid whence: %d", whence);
1951 break;
1952 }
1953 }
1954
1955 return vfdP->seekPos;
1956 }
1957
1958 /*
1959 * XXX not actually used but here for completeness
1960 */
1961 #ifdef NOT_USED
1962 off_t
FileTell(File file)1963 FileTell(File file)
1964 {
1965 Assert(FileIsValid(file));
1966 DO_DB(elog(LOG, "FileTell %d (%s)",
1967 file, VfdCache[file].fileName));
1968 return VfdCache[file].seekPos;
1969 }
1970 #endif
1971
1972 int
FileTruncate(File file,off_t offset,uint32 wait_event_info)1973 FileTruncate(File file, off_t offset, uint32 wait_event_info)
1974 {
1975 int returnCode;
1976
1977 Assert(FileIsValid(file));
1978
1979 DO_DB(elog(LOG, "FileTruncate %d (%s)",
1980 file, VfdCache[file].fileName));
1981
1982 returnCode = FileAccess(file);
1983 if (returnCode < 0)
1984 return returnCode;
1985
1986 pgstat_report_wait_start(wait_event_info);
1987 returnCode = ftruncate(VfdCache[file].fd, offset);
1988 pgstat_report_wait_end();
1989
1990 if (returnCode == 0 && VfdCache[file].fileSize > offset)
1991 {
1992 /* adjust our state for truncation of a temp file */
1993 Assert(VfdCache[file].fdstate & FD_TEMPORARY);
1994 temporary_files_size -= VfdCache[file].fileSize - offset;
1995 VfdCache[file].fileSize = offset;
1996 }
1997
1998 return returnCode;
1999 }
2000
2001 /*
2002 * Return the pathname associated with an open file.
2003 *
2004 * The returned string points to an internal buffer, which is valid until
2005 * the file is closed.
2006 */
2007 char *
FilePathName(File file)2008 FilePathName(File file)
2009 {
2010 Assert(FileIsValid(file));
2011
2012 return VfdCache[file].fileName;
2013 }
2014
2015 /*
2016 * Return the raw file descriptor of an opened file.
2017 *
2018 * The returned file descriptor will be valid until the file is closed, but
2019 * there are a lot of things that can make that happen. So the caller should
2020 * be careful not to do much of anything else before it finishes using the
2021 * returned file descriptor.
2022 */
2023 int
FileGetRawDesc(File file)2024 FileGetRawDesc(File file)
2025 {
2026 Assert(FileIsValid(file));
2027 return VfdCache[file].fd;
2028 }
2029
2030 /*
2031 * FileGetRawFlags - returns the file flags on open(2)
2032 */
2033 int
FileGetRawFlags(File file)2034 FileGetRawFlags(File file)
2035 {
2036 Assert(FileIsValid(file));
2037 return VfdCache[file].fileFlags;
2038 }
2039
2040 /*
2041 * FileGetRawMode - returns the mode bitmask passed to open(2)
2042 */
2043 int
FileGetRawMode(File file)2044 FileGetRawMode(File file)
2045 {
2046 Assert(FileIsValid(file));
2047 return VfdCache[file].fileMode;
2048 }
2049
2050 /*
2051 * Make room for another allocatedDescs[] array entry if needed and possible.
2052 * Returns true if an array element is available.
2053 */
2054 static bool
reserveAllocatedDesc(void)2055 reserveAllocatedDesc(void)
2056 {
2057 AllocateDesc *newDescs;
2058 int newMax;
2059
2060 /* Quick out if array already has a free slot. */
2061 if (numAllocatedDescs < maxAllocatedDescs)
2062 return true;
2063
2064 /*
2065 * If the array hasn't yet been created in the current process, initialize
2066 * it with FD_MINFREE / 2 elements. In many scenarios this is as many as
2067 * we will ever need, anyway. We don't want to look at max_safe_fds
2068 * immediately because set_max_safe_fds() may not have run yet.
2069 */
2070 if (allocatedDescs == NULL)
2071 {
2072 newMax = FD_MINFREE / 2;
2073 newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2074 /* Out of memory already? Treat as fatal error. */
2075 if (newDescs == NULL)
2076 ereport(ERROR,
2077 (errcode(ERRCODE_OUT_OF_MEMORY),
2078 errmsg("out of memory")));
2079 allocatedDescs = newDescs;
2080 maxAllocatedDescs = newMax;
2081 return true;
2082 }
2083
2084 /*
2085 * Consider enlarging the array beyond the initial allocation used above.
2086 * By the time this happens, max_safe_fds should be known accurately.
2087 *
2088 * We mustn't let allocated descriptors hog all the available FDs, and in
2089 * practice we'd better leave a reasonable number of FDs for VFD use. So
2090 * set the maximum to max_safe_fds / 2. (This should certainly be at
2091 * least as large as the initial size, FD_MINFREE / 2.)
2092 */
2093 newMax = max_safe_fds / 2;
2094 if (newMax > maxAllocatedDescs)
2095 {
2096 newDescs = (AllocateDesc *) realloc(allocatedDescs,
2097 newMax * sizeof(AllocateDesc));
2098 /* Treat out-of-memory as a non-fatal error. */
2099 if (newDescs == NULL)
2100 return false;
2101 allocatedDescs = newDescs;
2102 maxAllocatedDescs = newMax;
2103 return true;
2104 }
2105
2106 /* Can't enlarge allocatedDescs[] any more. */
2107 return false;
2108 }
2109
2110 /*
2111 * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2112 * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2113 * necessary to open the file. When done, call FreeFile rather than fclose.
2114 *
2115 * Note that files that will be open for any significant length of time
2116 * should NOT be handled this way, since they cannot share kernel file
2117 * descriptors with other files; there is grave risk of running out of FDs
2118 * if anyone locks down too many FDs. Most callers of this routine are
2119 * simply reading a config file that they will read and close immediately.
2120 *
2121 * fd.c will automatically close all files opened with AllocateFile at
2122 * transaction commit or abort; this prevents FD leakage if a routine
2123 * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2124 *
2125 * Ideally this should be the *only* direct call of fopen() in the backend.
2126 */
2127 FILE *
AllocateFile(const char * name,const char * mode)2128 AllocateFile(const char *name, const char *mode)
2129 {
2130 FILE *file;
2131
2132 DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2133 numAllocatedDescs, name));
2134
2135 /* Can we allocate another non-virtual FD? */
2136 if (!reserveAllocatedDesc())
2137 ereport(ERROR,
2138 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2139 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2140 maxAllocatedDescs, name)));
2141
2142 /* Close excess kernel FDs. */
2143 ReleaseLruFiles();
2144
2145 TryAgain:
2146 if ((file = fopen(name, mode)) != NULL)
2147 {
2148 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2149
2150 desc->kind = AllocateDescFile;
2151 desc->desc.file = file;
2152 desc->create_subid = GetCurrentSubTransactionId();
2153 numAllocatedDescs++;
2154 return desc->desc.file;
2155 }
2156
2157 if (errno == EMFILE || errno == ENFILE)
2158 {
2159 int save_errno = errno;
2160
2161 ereport(LOG,
2162 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2163 errmsg("out of file descriptors: %m; release and retry")));
2164 errno = 0;
2165 if (ReleaseLruFile())
2166 goto TryAgain;
2167 errno = save_errno;
2168 }
2169
2170 return NULL;
2171 }
2172
2173
2174 /*
2175 * Like AllocateFile, but returns an unbuffered fd like open(2)
2176 */
2177 int
OpenTransientFile(FileName fileName,int fileFlags,int fileMode)2178 OpenTransientFile(FileName fileName, int fileFlags, int fileMode)
2179 {
2180 int fd;
2181
2182 DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2183 numAllocatedDescs, fileName));
2184
2185 /* Can we allocate another non-virtual FD? */
2186 if (!reserveAllocatedDesc())
2187 ereport(ERROR,
2188 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2189 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2190 maxAllocatedDescs, fileName)));
2191
2192 /* Close excess kernel FDs. */
2193 ReleaseLruFiles();
2194
2195 fd = BasicOpenFile(fileName, fileFlags, fileMode);
2196
2197 if (fd >= 0)
2198 {
2199 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2200
2201 desc->kind = AllocateDescRawFD;
2202 desc->desc.fd = fd;
2203 desc->create_subid = GetCurrentSubTransactionId();
2204 numAllocatedDescs++;
2205
2206 return fd;
2207 }
2208
2209 return -1; /* failure */
2210 }
2211
2212 /*
2213 * Routines that want to initiate a pipe stream should use OpenPipeStream
2214 * rather than plain popen(). This lets fd.c deal with freeing FDs if
2215 * necessary. When done, call ClosePipeStream rather than pclose.
2216 *
2217 * This function also ensures that the popen'd program is run with default
2218 * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2219 * uses. This ensures desirable response to, eg, closing a read pipe early.
2220 */
2221 FILE *
OpenPipeStream(const char * command,const char * mode)2222 OpenPipeStream(const char *command, const char *mode)
2223 {
2224 FILE *file;
2225 int save_errno;
2226
2227 DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2228 numAllocatedDescs, command));
2229
2230 /* Can we allocate another non-virtual FD? */
2231 if (!reserveAllocatedDesc())
2232 ereport(ERROR,
2233 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2234 errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2235 maxAllocatedDescs, command)));
2236
2237 /* Close excess kernel FDs. */
2238 ReleaseLruFiles();
2239
2240 TryAgain:
2241 fflush(stdout);
2242 fflush(stderr);
2243 pqsignal(SIGPIPE, SIG_DFL);
2244 errno = 0;
2245 file = popen(command, mode);
2246 save_errno = errno;
2247 pqsignal(SIGPIPE, SIG_IGN);
2248 errno = save_errno;
2249 if (file != NULL)
2250 {
2251 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2252
2253 desc->kind = AllocateDescPipe;
2254 desc->desc.file = file;
2255 desc->create_subid = GetCurrentSubTransactionId();
2256 numAllocatedDescs++;
2257 return desc->desc.file;
2258 }
2259
2260 if (errno == EMFILE || errno == ENFILE)
2261 {
2262 ereport(LOG,
2263 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2264 errmsg("out of file descriptors: %m; release and retry")));
2265 if (ReleaseLruFile())
2266 goto TryAgain;
2267 errno = save_errno;
2268 }
2269
2270 return NULL;
2271 }
2272
2273 /*
2274 * Free an AllocateDesc of any type.
2275 *
2276 * The argument *must* point into the allocatedDescs[] array.
2277 */
2278 static int
FreeDesc(AllocateDesc * desc)2279 FreeDesc(AllocateDesc *desc)
2280 {
2281 int result;
2282
2283 /* Close the underlying object */
2284 switch (desc->kind)
2285 {
2286 case AllocateDescFile:
2287 result = fclose(desc->desc.file);
2288 break;
2289 case AllocateDescPipe:
2290 result = pclose(desc->desc.file);
2291 break;
2292 case AllocateDescDir:
2293 result = closedir(desc->desc.dir);
2294 break;
2295 case AllocateDescRawFD:
2296 result = close(desc->desc.fd);
2297 break;
2298 default:
2299 elog(ERROR, "AllocateDesc kind not recognized");
2300 result = 0; /* keep compiler quiet */
2301 break;
2302 }
2303
2304 /* Compact storage in the allocatedDescs array */
2305 numAllocatedDescs--;
2306 *desc = allocatedDescs[numAllocatedDescs];
2307
2308 return result;
2309 }
2310
2311 /*
2312 * Close a file returned by AllocateFile.
2313 *
2314 * Note we do not check fclose's return value --- it is up to the caller
2315 * to handle close errors.
2316 */
2317 int
FreeFile(FILE * file)2318 FreeFile(FILE *file)
2319 {
2320 int i;
2321
2322 DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2323
2324 /* Remove file from list of allocated files, if it's present */
2325 for (i = numAllocatedDescs; --i >= 0;)
2326 {
2327 AllocateDesc *desc = &allocatedDescs[i];
2328
2329 if (desc->kind == AllocateDescFile && desc->desc.file == file)
2330 return FreeDesc(desc);
2331 }
2332
2333 /* Only get here if someone passes us a file not in allocatedDescs */
2334 elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2335
2336 return fclose(file);
2337 }
2338
2339 /*
2340 * Close a file returned by OpenTransientFile.
2341 *
2342 * Note we do not check close's return value --- it is up to the caller
2343 * to handle close errors.
2344 */
2345 int
CloseTransientFile(int fd)2346 CloseTransientFile(int fd)
2347 {
2348 int i;
2349
2350 DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2351
2352 /* Remove fd from list of allocated files, if it's present */
2353 for (i = numAllocatedDescs; --i >= 0;)
2354 {
2355 AllocateDesc *desc = &allocatedDescs[i];
2356
2357 if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2358 return FreeDesc(desc);
2359 }
2360
2361 /* Only get here if someone passes us a file not in allocatedDescs */
2362 elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2363
2364 return close(fd);
2365 }
2366
2367 /*
2368 * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2369 * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2370 * necessary to open the directory, and with closing it after an elog.
2371 * When done, call FreeDir rather than closedir.
2372 *
2373 * Returns NULL, with errno set, on failure. Note that failure detection
2374 * is commonly left to the following call of ReadDir or ReadDirExtended;
2375 * see the comments for ReadDir.
2376 *
2377 * Ideally this should be the *only* direct call of opendir() in the backend.
2378 */
2379 DIR *
AllocateDir(const char * dirname)2380 AllocateDir(const char *dirname)
2381 {
2382 DIR *dir;
2383
2384 DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2385 numAllocatedDescs, dirname));
2386
2387 /* Can we allocate another non-virtual FD? */
2388 if (!reserveAllocatedDesc())
2389 ereport(ERROR,
2390 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2391 errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2392 maxAllocatedDescs, dirname)));
2393
2394 /* Close excess kernel FDs. */
2395 ReleaseLruFiles();
2396
2397 TryAgain:
2398 if ((dir = opendir(dirname)) != NULL)
2399 {
2400 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2401
2402 desc->kind = AllocateDescDir;
2403 desc->desc.dir = dir;
2404 desc->create_subid = GetCurrentSubTransactionId();
2405 numAllocatedDescs++;
2406 return desc->desc.dir;
2407 }
2408
2409 if (errno == EMFILE || errno == ENFILE)
2410 {
2411 int save_errno = errno;
2412
2413 ereport(LOG,
2414 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2415 errmsg("out of file descriptors: %m; release and retry")));
2416 errno = 0;
2417 if (ReleaseLruFile())
2418 goto TryAgain;
2419 errno = save_errno;
2420 }
2421
2422 return NULL;
2423 }
2424
2425 /*
2426 * Read a directory opened with AllocateDir, ereport'ing any error.
2427 *
2428 * This is easier to use than raw readdir() since it takes care of some
2429 * otherwise rather tedious and error-prone manipulation of errno. Also,
2430 * if you are happy with a generic error message for AllocateDir failure,
2431 * you can just do
2432 *
2433 * dir = AllocateDir(path);
2434 * while ((dirent = ReadDir(dir, path)) != NULL)
2435 * process dirent;
2436 * FreeDir(dir);
2437 *
2438 * since a NULL dir parameter is taken as indicating AllocateDir failed.
2439 * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2440 * use this shortcut.)
2441 *
2442 * The pathname passed to AllocateDir must be passed to this routine too,
2443 * but it is only used for error reporting.
2444 */
2445 struct dirent *
ReadDir(DIR * dir,const char * dirname)2446 ReadDir(DIR *dir, const char *dirname)
2447 {
2448 return ReadDirExtended(dir, dirname, ERROR);
2449 }
2450
2451 /*
2452 * Alternate version of ReadDir that allows caller to specify the elevel
2453 * for any error report (whether it's reporting an initial failure of
2454 * AllocateDir or a subsequent directory read failure).
2455 *
2456 * If elevel < ERROR, returns NULL after any error. With the normal coding
2457 * pattern, this will result in falling out of the loop immediately as
2458 * though the directory contained no (more) entries.
2459 */
2460 struct dirent *
ReadDirExtended(DIR * dir,const char * dirname,int elevel)2461 ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2462 {
2463 struct dirent *dent;
2464
2465 /* Give a generic message for AllocateDir failure, if caller didn't */
2466 if (dir == NULL)
2467 {
2468 ereport(elevel,
2469 (errcode_for_file_access(),
2470 errmsg("could not open directory \"%s\": %m",
2471 dirname)));
2472 return NULL;
2473 }
2474
2475 errno = 0;
2476 if ((dent = readdir(dir)) != NULL)
2477 return dent;
2478
2479 if (errno)
2480 ereport(elevel,
2481 (errcode_for_file_access(),
2482 errmsg("could not read directory \"%s\": %m",
2483 dirname)));
2484 return NULL;
2485 }
2486
2487 /*
2488 * Close a directory opened with AllocateDir.
2489 *
2490 * Returns closedir's return value (with errno set if it's not 0).
2491 * Note we do not check the return value --- it is up to the caller
2492 * to handle close errors if wanted.
2493 *
2494 * Does nothing if dir == NULL; we assume that directory open failure was
2495 * already reported if desired.
2496 */
2497 int
FreeDir(DIR * dir)2498 FreeDir(DIR *dir)
2499 {
2500 int i;
2501
2502 /* Nothing to do if AllocateDir failed */
2503 if (dir == NULL)
2504 return 0;
2505
2506 DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2507
2508 /* Remove dir from list of allocated dirs, if it's present */
2509 for (i = numAllocatedDescs; --i >= 0;)
2510 {
2511 AllocateDesc *desc = &allocatedDescs[i];
2512
2513 if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2514 return FreeDesc(desc);
2515 }
2516
2517 /* Only get here if someone passes us a dir not in allocatedDescs */
2518 elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
2519
2520 return closedir(dir);
2521 }
2522
2523
2524 /*
2525 * Close a pipe stream returned by OpenPipeStream.
2526 */
2527 int
ClosePipeStream(FILE * file)2528 ClosePipeStream(FILE *file)
2529 {
2530 int i;
2531
2532 DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
2533
2534 /* Remove file from list of allocated files, if it's present */
2535 for (i = numAllocatedDescs; --i >= 0;)
2536 {
2537 AllocateDesc *desc = &allocatedDescs[i];
2538
2539 if (desc->kind == AllocateDescPipe && desc->desc.file == file)
2540 return FreeDesc(desc);
2541 }
2542
2543 /* Only get here if someone passes us a file not in allocatedDescs */
2544 elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
2545
2546 return pclose(file);
2547 }
2548
2549 /*
2550 * closeAllVfds
2551 *
2552 * Force all VFDs into the physically-closed state, so that the fewest
2553 * possible number of kernel file descriptors are in use. There is no
2554 * change in the logical state of the VFDs.
2555 */
2556 void
closeAllVfds(void)2557 closeAllVfds(void)
2558 {
2559 Index i;
2560
2561 if (SizeVfdCache > 0)
2562 {
2563 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2564 for (i = 1; i < SizeVfdCache; i++)
2565 {
2566 if (!FileIsNotOpen(i))
2567 LruDelete(i);
2568 }
2569 }
2570 }
2571
2572
2573 /*
2574 * SetTempTablespaces
2575 *
2576 * Define a list (actually an array) of OIDs of tablespaces to use for
2577 * temporary files. This list will be used until end of transaction,
2578 * unless this function is called again before then. It is caller's
2579 * responsibility that the passed-in array has adequate lifespan (typically
2580 * it'd be allocated in TopTransactionContext).
2581 *
2582 * Some entries of the array may be InvalidOid, indicating that the current
2583 * database's default tablespace should be used.
2584 */
2585 void
SetTempTablespaces(Oid * tableSpaces,int numSpaces)2586 SetTempTablespaces(Oid *tableSpaces, int numSpaces)
2587 {
2588 Assert(numSpaces >= 0);
2589 tempTableSpaces = tableSpaces;
2590 numTempTableSpaces = numSpaces;
2591
2592 /*
2593 * Select a random starting point in the list. This is to minimize
2594 * conflicts between backends that are most likely sharing the same list
2595 * of temp tablespaces. Note that if we create multiple temp files in the
2596 * same transaction, we'll advance circularly through the list --- this
2597 * ensures that large temporary sort files are nicely spread across all
2598 * available tablespaces.
2599 */
2600 if (numSpaces > 1)
2601 nextTempTableSpace = random() % numSpaces;
2602 else
2603 nextTempTableSpace = 0;
2604 }
2605
2606 /*
2607 * TempTablespacesAreSet
2608 *
2609 * Returns TRUE if SetTempTablespaces has been called in current transaction.
2610 * (This is just so that tablespaces.c doesn't need its own per-transaction
2611 * state.)
2612 */
2613 bool
TempTablespacesAreSet(void)2614 TempTablespacesAreSet(void)
2615 {
2616 return (numTempTableSpaces >= 0);
2617 }
2618
2619 /*
2620 * GetNextTempTableSpace
2621 *
2622 * Select the next temp tablespace to use. A result of InvalidOid means
2623 * to use the current database's default tablespace.
2624 */
2625 Oid
GetNextTempTableSpace(void)2626 GetNextTempTableSpace(void)
2627 {
2628 if (numTempTableSpaces > 0)
2629 {
2630 /* Advance nextTempTableSpace counter with wraparound */
2631 if (++nextTempTableSpace >= numTempTableSpaces)
2632 nextTempTableSpace = 0;
2633 return tempTableSpaces[nextTempTableSpace];
2634 }
2635 return InvalidOid;
2636 }
2637
2638
2639 /*
2640 * AtEOSubXact_Files
2641 *
2642 * Take care of subtransaction commit/abort. At abort, we close temp files
2643 * that the subtransaction may have opened. At commit, we reassign the
2644 * files that were opened to the parent subtransaction.
2645 */
2646 void
AtEOSubXact_Files(bool isCommit,SubTransactionId mySubid,SubTransactionId parentSubid)2647 AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
2648 SubTransactionId parentSubid)
2649 {
2650 Index i;
2651
2652 for (i = 0; i < numAllocatedDescs; i++)
2653 {
2654 if (allocatedDescs[i].create_subid == mySubid)
2655 {
2656 if (isCommit)
2657 allocatedDescs[i].create_subid = parentSubid;
2658 else
2659 {
2660 /* have to recheck the item after FreeDesc (ugly) */
2661 FreeDesc(&allocatedDescs[i--]);
2662 }
2663 }
2664 }
2665 }
2666
2667 /*
2668 * AtEOXact_Files
2669 *
2670 * This routine is called during transaction commit or abort (it doesn't
2671 * particularly care which). All still-open per-transaction temporary file
2672 * VFDs are closed, which also causes the underlying files to be deleted
2673 * (although they should've been closed already by the ResourceOwner
2674 * cleanup). Furthermore, all "allocated" stdio files are closed. We also
2675 * forget any transaction-local temp tablespace list.
2676 */
2677 void
AtEOXact_Files(void)2678 AtEOXact_Files(void)
2679 {
2680 CleanupTempFiles(false);
2681 tempTableSpaces = NULL;
2682 numTempTableSpaces = -1;
2683 }
2684
2685 /*
2686 * AtProcExit_Files
2687 *
2688 * on_proc_exit hook to clean up temp files during backend shutdown.
2689 * Here, we want to clean up *all* temp files including interXact ones.
2690 */
2691 static void
AtProcExit_Files(int code,Datum arg)2692 AtProcExit_Files(int code, Datum arg)
2693 {
2694 CleanupTempFiles(true);
2695 }
2696
2697 /*
2698 * Close temporary files and delete their underlying files.
2699 *
2700 * isProcExit: if true, this is being called as the backend process is
2701 * exiting. If that's the case, we should remove all temporary files; if
2702 * that's not the case, we are being called for transaction commit/abort
2703 * and should only remove transaction-local temp files. In either case,
2704 * also clean up "allocated" stdio files, dirs and fds.
2705 */
2706 static void
CleanupTempFiles(bool isProcExit)2707 CleanupTempFiles(bool isProcExit)
2708 {
2709 Index i;
2710
2711 /*
2712 * Careful here: at proc_exit we need extra cleanup, not just
2713 * xact_temporary files.
2714 */
2715 if (isProcExit || have_xact_temporary_files)
2716 {
2717 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2718 for (i = 1; i < SizeVfdCache; i++)
2719 {
2720 unsigned short fdstate = VfdCache[i].fdstate;
2721
2722 if ((fdstate & FD_TEMPORARY) && VfdCache[i].fileName != NULL)
2723 {
2724 /*
2725 * If we're in the process of exiting a backend process, close
2726 * all temporary files. Otherwise, only close temporary files
2727 * local to the current transaction. They should be closed by
2728 * the ResourceOwner mechanism already, so this is just a
2729 * debugging cross-check.
2730 */
2731 if (isProcExit)
2732 FileClose(i);
2733 else if (fdstate & FD_XACT_TEMPORARY)
2734 {
2735 elog(WARNING,
2736 "temporary file %s not closed at end-of-transaction",
2737 VfdCache[i].fileName);
2738 FileClose(i);
2739 }
2740 }
2741 }
2742
2743 have_xact_temporary_files = false;
2744 }
2745
2746 /* Clean up "allocated" stdio files, dirs and fds. */
2747 while (numAllocatedDescs > 0)
2748 FreeDesc(&allocatedDescs[0]);
2749 }
2750
2751
2752 /*
2753 * Remove temporary and temporary relation files left over from a prior
2754 * postmaster session
2755 *
2756 * This should be called during postmaster startup. It will forcibly
2757 * remove any leftover files created by OpenTemporaryFile and any leftover
2758 * temporary relation files created by mdcreate.
2759 *
2760 * NOTE: we could, but don't, call this during a post-backend-crash restart
2761 * cycle. The argument for not doing it is that someone might want to examine
2762 * the temp files for debugging purposes. This does however mean that
2763 * OpenTemporaryFile had better allow for collision with an existing temp
2764 * file name.
2765 */
2766 void
RemovePgTempFiles(void)2767 RemovePgTempFiles(void)
2768 {
2769 char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
2770 DIR *spc_dir;
2771 struct dirent *spc_de;
2772
2773 /*
2774 * First process temp files in pg_default ($PGDATA/base)
2775 */
2776 snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
2777 RemovePgTempFilesInDir(temp_path);
2778 RemovePgTempRelationFiles("base");
2779
2780 /*
2781 * Cycle through temp directories for all non-default tablespaces.
2782 */
2783 spc_dir = AllocateDir("pg_tblspc");
2784
2785 while ((spc_de = ReadDir(spc_dir, "pg_tblspc")) != NULL)
2786 {
2787 if (strcmp(spc_de->d_name, ".") == 0 ||
2788 strcmp(spc_de->d_name, "..") == 0)
2789 continue;
2790
2791 snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
2792 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
2793 RemovePgTempFilesInDir(temp_path);
2794
2795 snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
2796 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
2797 RemovePgTempRelationFiles(temp_path);
2798 }
2799
2800 FreeDir(spc_dir);
2801
2802 /*
2803 * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
2804 * DataDir as well.
2805 */
2806 #ifdef EXEC_BACKEND
2807 RemovePgTempFilesInDir(PG_TEMP_FILES_DIR);
2808 #endif
2809 }
2810
2811 /* Process one pgsql_tmp directory for RemovePgTempFiles */
2812 static void
RemovePgTempFilesInDir(const char * tmpdirname)2813 RemovePgTempFilesInDir(const char *tmpdirname)
2814 {
2815 DIR *temp_dir;
2816 struct dirent *temp_de;
2817 char rm_path[MAXPGPATH * 2];
2818
2819 temp_dir = AllocateDir(tmpdirname);
2820 if (temp_dir == NULL)
2821 {
2822 /* anything except ENOENT is fishy */
2823 if (errno != ENOENT)
2824 elog(LOG,
2825 "could not open temporary-files directory \"%s\": %m",
2826 tmpdirname);
2827 return;
2828 }
2829
2830 while ((temp_de = ReadDir(temp_dir, tmpdirname)) != NULL)
2831 {
2832 if (strcmp(temp_de->d_name, ".") == 0 ||
2833 strcmp(temp_de->d_name, "..") == 0)
2834 continue;
2835
2836 snprintf(rm_path, sizeof(rm_path), "%s/%s",
2837 tmpdirname, temp_de->d_name);
2838
2839 if (strncmp(temp_de->d_name,
2840 PG_TEMP_FILE_PREFIX,
2841 strlen(PG_TEMP_FILE_PREFIX)) == 0)
2842 unlink(rm_path); /* note we ignore any error */
2843 else
2844 elog(LOG,
2845 "unexpected file found in temporary-files directory: \"%s\"",
2846 rm_path);
2847 }
2848
2849 FreeDir(temp_dir);
2850 }
2851
2852 /* Process one tablespace directory, look for per-DB subdirectories */
2853 static void
RemovePgTempRelationFiles(const char * tsdirname)2854 RemovePgTempRelationFiles(const char *tsdirname)
2855 {
2856 DIR *ts_dir;
2857 struct dirent *de;
2858 char dbspace_path[MAXPGPATH * 2];
2859
2860 ts_dir = AllocateDir(tsdirname);
2861 if (ts_dir == NULL)
2862 {
2863 /* anything except ENOENT is fishy */
2864 if (errno != ENOENT)
2865 elog(LOG,
2866 "could not open tablespace directory \"%s\": %m",
2867 tsdirname);
2868 return;
2869 }
2870
2871 while ((de = ReadDir(ts_dir, tsdirname)) != NULL)
2872 {
2873 int i = 0;
2874
2875 /*
2876 * We're only interested in the per-database directories, which have
2877 * numeric names. Note that this code will also (properly) ignore "."
2878 * and "..".
2879 */
2880 while (isdigit((unsigned char) de->d_name[i]))
2881 ++i;
2882 if (de->d_name[i] != '\0' || i == 0)
2883 continue;
2884
2885 snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
2886 tsdirname, de->d_name);
2887 RemovePgTempRelationFilesInDbspace(dbspace_path);
2888 }
2889
2890 FreeDir(ts_dir);
2891 }
2892
2893 /* Process one per-dbspace directory for RemovePgTempRelationFiles */
2894 static void
RemovePgTempRelationFilesInDbspace(const char * dbspacedirname)2895 RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
2896 {
2897 DIR *dbspace_dir;
2898 struct dirent *de;
2899 char rm_path[MAXPGPATH * 2];
2900
2901 dbspace_dir = AllocateDir(dbspacedirname);
2902 if (dbspace_dir == NULL)
2903 {
2904 /* we just saw this directory, so it really ought to be there */
2905 elog(LOG,
2906 "could not open dbspace directory \"%s\": %m",
2907 dbspacedirname);
2908 return;
2909 }
2910
2911 while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
2912 {
2913 if (!looks_like_temp_rel_name(de->d_name))
2914 continue;
2915
2916 snprintf(rm_path, sizeof(rm_path), "%s/%s",
2917 dbspacedirname, de->d_name);
2918
2919 unlink(rm_path); /* note we ignore any error */
2920 }
2921
2922 FreeDir(dbspace_dir);
2923 }
2924
/*
 * Does the name have the form of a temporary relation file name?
 *
 * Accepted forms are t<digits>_<digits> or t<digits>_<digits>_<forkname>,
 * each optionally followed by .<digits> (a segment number).
 */
static bool
looks_like_temp_rel_name(const char *name)
{
	const char *cur = name;
	const char *mark;

	/* Leading "t". */
	if (*cur != 't')
		return false;
	cur++;

	/* At least one digit, then an underscore. */
	mark = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == mark || *cur != '_')
		return false;
	cur++;

	/* A second run of at least one digit. */
	mark = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == mark)
		return false;

	/* Optional "_forkname"; forkname_chars reports how long the name is. */
	if (*cur == '_')
	{
		int			forklen = forkname_chars(cur + 1, NULL);

		if (forklen <= 0)
			return false;
		cur += forklen + 1;
	}

	/* Optional ".segment": a dot followed by at least one digit. */
	if (*cur == '.')
	{
		const char *segstart = cur + 1;
		const char *segend = segstart;

		while (isdigit((unsigned char) *segend))
			segend++;
		if (segend == segstart)
			return false;
		cur = segend;
	}

	/* Nothing may follow. */
	return *cur == '\0';
}
2973
2974
2975 /*
2976 * Issue fsync recursively on PGDATA and all its contents.
2977 *
2978 * We fsync regular files and directories wherever they are, but we
2979 * follow symlinks only for pg_wal and immediately under pg_tblspc.
2980 * Other symlinks are presumed to point at files we're not responsible
2981 * for fsyncing, and might not have privileges to write at all.
2982 *
2983 * Errors are logged but not considered fatal; that's because this is used
2984 * only during database startup, to deal with the possibility that there are
2985 * issued-but-unsynced writes pending against the data directory. We want to
2986 * ensure that such writes reach disk before anything that's done in the new
2987 * run. However, aborting on error would result in failure to start for
2988 * harmless cases such as read-only files in the data directory, and that's
2989 * not good either.
2990 *
2991 * Note that if we previously crashed due to a PANIC on fsync(), we'll be
2992 * rewriting all changes again during recovery.
2993 *
2994 * Note we assume we're chdir'd into PGDATA to begin with.
2995 */
2996 void
SyncDataDirectory(void)2997 SyncDataDirectory(void)
2998 {
2999 bool xlog_is_symlink;
3000
3001 /* We can skip this whole thing if fsync is disabled. */
3002 if (!enableFsync)
3003 return;
3004
3005 /*
3006 * If pg_wal is a symlink, we'll need to recurse into it separately,
3007 * because the first walkdir below will ignore it.
3008 */
3009 xlog_is_symlink = false;
3010
3011 #ifndef WIN32
3012 {
3013 struct stat st;
3014
3015 if (lstat("pg_wal", &st) < 0)
3016 ereport(LOG,
3017 (errcode_for_file_access(),
3018 errmsg("could not stat file \"%s\": %m",
3019 "pg_wal")));
3020 else if (S_ISLNK(st.st_mode))
3021 xlog_is_symlink = true;
3022 }
3023 #else
3024 if (pgwin32_is_junction("pg_wal"))
3025 xlog_is_symlink = true;
3026 #endif
3027
3028 /*
3029 * If possible, hint to the kernel that we're soon going to fsync the data
3030 * directory and its contents. Errors in this step are even less
3031 * interesting than normal, so log them only at DEBUG1.
3032 */
3033 #ifdef PG_FLUSH_DATA_WORKS
3034 walkdir(".", pre_sync_fname, false, DEBUG1);
3035 if (xlog_is_symlink)
3036 walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3037 walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
3038 #endif
3039
3040 /*
3041 * Now we do the fsync()s in the same order.
3042 *
3043 * The main call ignores symlinks, so in addition to specially processing
3044 * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3045 * process_symlinks = true. Note that if there are any plain directories
3046 * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3047 * so we don't worry about optimizing it.
3048 */
3049 walkdir(".", datadir_fsync_fname, false, LOG);
3050 if (xlog_is_symlink)
3051 walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3052 walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
3053 }
3054
3055 /*
3056 * walkdir: recursively walk a directory, applying the action to each
3057 * regular file and directory (including the named directory itself).
3058 *
3059 * If process_symlinks is true, the action and recursion are also applied
3060 * to regular files and directories that are pointed to by symlinks in the
3061 * given directory; otherwise symlinks are ignored. Symlinks are always
3062 * ignored in subdirectories, ie we intentionally don't pass down the
3063 * process_symlinks flag to recursive calls.
3064 *
3065 * Errors are reported at level elevel, which might be ERROR or less.
3066 *
3067 * See also walkdir in initdb.c, which is a frontend version of this logic.
3068 */
3069 static void
walkdir(const char * path,void (* action)(const char * fname,bool isdir,int elevel),bool process_symlinks,int elevel)3070 walkdir(const char *path,
3071 void (*action) (const char *fname, bool isdir, int elevel),
3072 bool process_symlinks,
3073 int elevel)
3074 {
3075 DIR *dir;
3076 struct dirent *de;
3077
3078 dir = AllocateDir(path);
3079 if (dir == NULL)
3080 {
3081 ereport(elevel,
3082 (errcode_for_file_access(),
3083 errmsg("could not open directory \"%s\": %m", path)));
3084 return;
3085 }
3086
3087 while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3088 {
3089 char subpath[MAXPGPATH * 2];
3090 struct stat fst;
3091 int sret;
3092
3093 CHECK_FOR_INTERRUPTS();
3094
3095 if (strcmp(de->d_name, ".") == 0 ||
3096 strcmp(de->d_name, "..") == 0)
3097 continue;
3098
3099 snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3100
3101 if (process_symlinks)
3102 sret = stat(subpath, &fst);
3103 else
3104 sret = lstat(subpath, &fst);
3105
3106 if (sret < 0)
3107 {
3108 ereport(elevel,
3109 (errcode_for_file_access(),
3110 errmsg("could not stat file \"%s\": %m", subpath)));
3111 continue;
3112 }
3113
3114 if (S_ISREG(fst.st_mode))
3115 (*action) (subpath, false, elevel);
3116 else if (S_ISDIR(fst.st_mode))
3117 walkdir(subpath, action, false, elevel);
3118 }
3119
3120 FreeDir(dir); /* we ignore any error here */
3121
3122 /*
3123 * It's important to fsync the destination directory itself as individual
3124 * file fsyncs don't guarantee that the directory entry for the file is
3125 * synced.
3126 */
3127 (*action) (path, true, elevel);
3128 }
3129
3130
/*
 * pre_sync_fname --- hint to the OS that it should get ready to fsync()
 * this file.
 *
 * Failure to open an unreadable file (EACCES) is ignored; other open
 * errors are logged at the caller-specified level.  Directories are
 * skipped altogether.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			rawfd;

	/* Don't try to flush directories, it'll likely just fail */
	if (isdir)
		return;

	rawfd = OpenTransientFile((char *) fname, O_RDONLY | PG_BINARY, 0);

	if (rawfd < 0)
	{
		/* Stay quiet about files we are not permitted to read. */
		if (errno != EACCES)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * pg_flush_data() ignores errors, which is ok because this is only a
	 * hint.
	 */
	pg_flush_data(rawfd, 0, 0);

	(void) CloseTransientFile(rawfd);
}

#endif							/* PG_FLUSH_DATA_WORKS */
3170
/*
 * datadir_fsync_fname --- walkdir action that fsyncs one file or directory.
 *
 * The result of fsync_fname_ext() is discarded.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	/*
	 * We want to silently ignore errors about unreadable files, so pass
	 * ignore_perm = true on to fsync_fname_ext().
	 */
	(void) fsync_fname_ext(fname, isdir, true, elevel);
}
3180
3181 /*
3182 * fsync_fname_ext -- Try to fsync a file or directory
3183 *
3184 * If ignore_perm is true, ignore errors upon trying to open unreadable
3185 * files. Logs other errors at a caller-specified level.
3186 *
3187 * Returns 0 if the operation succeeded, -1 otherwise.
3188 */
3189 static int
fsync_fname_ext(const char * fname,bool isdir,bool ignore_perm,int elevel)3190 fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3191 {
3192 int fd;
3193 int flags;
3194 int returncode;
3195
3196 /*
3197 * Some OSs require directories to be opened read-only whereas other
3198 * systems don't allow us to fsync files opened read-only; so we need both
3199 * cases here. Using O_RDWR will cause us to fail to fsync files that are
3200 * not writable by our userid, but we assume that's OK.
3201 */
3202 flags = PG_BINARY;
3203 if (!isdir)
3204 flags |= O_RDWR;
3205 else
3206 flags |= O_RDONLY;
3207
3208 fd = OpenTransientFile((char *) fname, flags, 0);
3209
3210 /*
3211 * Some OSs don't allow us to open directories at all (Windows returns
3212 * EACCES), just ignore the error in that case. If desired also silently
3213 * ignoring errors about unreadable files. Log others.
3214 */
3215 if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3216 return 0;
3217 else if (fd < 0 && ignore_perm && errno == EACCES)
3218 return 0;
3219 else if (fd < 0)
3220 {
3221 ereport(elevel,
3222 (errcode_for_file_access(),
3223 errmsg("could not open file \"%s\": %m", fname)));
3224 return -1;
3225 }
3226
3227 returncode = pg_fsync(fd);
3228
3229 /*
3230 * Some OSes don't allow us to fsync directories at all, so we can ignore
3231 * those errors. Anything else needs to be logged.
3232 */
3233 if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3234 {
3235 int save_errno;
3236
3237 /* close file upon error, might not be in transaction context */
3238 save_errno = errno;
3239 (void) CloseTransientFile(fd);
3240 errno = save_errno;
3241
3242 ereport(elevel,
3243 (errcode_for_file_access(),
3244 errmsg("could not fsync file \"%s\": %m", fname)));
3245 return -1;
3246 }
3247
3248 (void) CloseTransientFile(fd);
3249
3250 return 0;
3251 }
3252
3253 /*
3254 * fsync_parent_path -- fsync the parent path of a file or directory
3255 *
3256 * This is aimed at making file operations persistent on disk in case of
3257 * an OS crash or power failure.
3258 */
3259 static int
fsync_parent_path(const char * fname,int elevel)3260 fsync_parent_path(const char *fname, int elevel)
3261 {
3262 char parentpath[MAXPGPATH];
3263
3264 strlcpy(parentpath, fname, MAXPGPATH);
3265 get_parent_directory(parentpath);
3266
3267 /*
3268 * get_parent_directory() returns an empty string if the input argument is
3269 * just a file name (see comments in path.c), so handle that as being the
3270 * current directory.
3271 */
3272 if (strlen(parentpath) == 0)
3273 strlcpy(parentpath, ".", MAXPGPATH);
3274
3275 if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3276 return -1;
3277
3278 return 0;
3279 }
3280
3281 /*
3282 * Return the passed-in error level, or PANIC if data_sync_retry is off.
3283 *
3284 * Failure to fsync any data file is cause for immediate panic, unless
3285 * data_sync_retry is enabled. Data may have been written to the operating
3286 * system and removed from our buffer pool already, and if we are running on
3287 * an operating system that forgets dirty data on write-back failure, there
3288 * may be only one copy of the data remaining: in the WAL. A later attempt to
3289 * fsync again might falsely report success. Therefore we must not allow any
3290 * further checkpoints to be attempted. data_sync_retry can in theory be
3291 * enabled on systems known not to drop dirty buffered data on write-back
3292 * failure (with the likely outcome that checkpoints will continue to fail
3293 * until the underlying problem is fixed).
3294 *
3295 * Any code that reports a failure from fsync() or related functions should
3296 * filter the error level with this function.
3297 */
3298 int
data_sync_elevel(int elevel)3299 data_sync_elevel(int elevel)
3300 {
3301 return data_sync_retry ? elevel : PANIC;
3302 }
3303