1 /*-------------------------------------------------------------------------
2 *
3 * fd.c
4 * Virtual file descriptor code.
5 *
6 * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 * IDENTIFICATION
10 * src/backend/storage/file/fd.c
11 *
12 * NOTES:
13 *
14 * This code manages a cache of 'virtual' file descriptors (VFDs).
15 * The server opens many file descriptors for a variety of reasons,
16 * including base tables, scratch files (e.g., sort and hash spool
17 * files), and random calls to C library routines like system(3); it
18 * is quite easy to exceed system limits on the number of open files a
19 * single process can have. (This is around 256 on many modern
20 * operating systems, but can be as low as 32 on others.)
21 *
22 * VFDs are managed as an LRU pool, with actual OS file descriptors
23 * being opened and closed as needed. Obviously, if a file is
24 * opened using these interfaces, all subsequent operations on it must
25 * also be through these interfaces (the File type is not a real file
26 * descriptor).
27 *
28 * For this scheme to work, most (if not all) routines throughout the
29 * server should use these interfaces instead of calling the C library
30 * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31 * may find ourselves short of real file descriptors anyway.
32 *
33 * INTERFACE ROUTINES
34 *
35 * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36 * A File opened with OpenTemporaryFile is automatically deleted when the
37 * File is closed, either explicitly or implicitly at end of transaction or
38 * process exit. PathNameOpenFile is intended for files that are held open
39 * for a long time, like relation files. It is the caller's responsibility
40 * to close them; there is no automatic mechanism in fd.c for that.
41 *
42 * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
43 * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
44 * They behave like the corresponding native functions, except that the handle
45 * is registered with the current subtransaction, and will be automatically
46 * closed at abort. These are intended mainly for short operations like
47 * reading a configuration file; there is a limit on the number of files that
48 * can be opened using these functions at any one time.
49 *
50 * Finally, BasicOpenFile is just a thin wrapper around open() that can
51 * release file descriptors in use by the virtual file descriptors if
52 * necessary. There is no automatic cleanup of file descriptors returned by
53 * BasicOpenFile; it is solely the caller's responsibility to close the file
54 * descriptor by calling close(2).
55 *
56 *-------------------------------------------------------------------------
57 */
58
59 #include "postgres.h"
60
61 #include <sys/file.h>
62 #include <sys/param.h>
63 #include <sys/stat.h>
64 #ifndef WIN32
65 #include <sys/mman.h>
66 #endif
67 #include <limits.h>
68 #include <unistd.h>
69 #include <fcntl.h>
70 #ifdef HAVE_SYS_RESOURCE_H
71 #include <sys/resource.h> /* for getrlimit */
72 #endif
73
74 #include "miscadmin.h"
75 #include "access/xact.h"
76 #include "access/xlog.h"
77 #include "catalog/catalog.h"
78 #include "catalog/pg_tablespace.h"
79 #include "pgstat.h"
80 #include "portability/mem.h"
81 #include "storage/fd.h"
82 #include "storage/ipc.h"
83 #include "utils/guc.h"
84 #include "utils/resowner_private.h"
85
86
/*
 * PG_FLUSH_DATA_WORKS is defined (as 1) whenever at least one of the flush
 * strategies compiled into pg_flush_data() is available on this platform:
 * sync_file_range(), msync(MS_ASYNC) on non-Windows systems, or
 * posix_fadvise(POSIX_FADV_DONTNEED).
 */
#if defined(HAVE_SYNC_FILE_RANGE) || \
	(!defined(WIN32) && defined(MS_ASYNC)) || \
	(defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED))
#define PG_FLUSH_DATA_WORKS 1
#endif
95
/*
 * NUM_RESERVED_FDS is the number of file descriptors fd.c deliberately
 * leaves unused, so that system(), the dynamic loader, and any other code
 * that opens files without going through fd.c still has some available.
 * (We can be fairly confident of avoiding EMFILE this way, but ENFILE can
 * always happen because other processes consume descriptors too; that is
 * why bypassing fd.c is a bad idea, even though we cannot prevent it
 * everywhere.)
 *
 * Since this is a fixed allowance, it silently assumes that such code does
 * not keep descriptors open for long; if it did, the slop would probably be
 * too small.  In particular, loading a shared library is expected not to
 * raise the number of open files permanently.  (That appeared to hold on
 * most if not all platforms as of Feb 2004.)
 */
#define NUM_RESERVED_FDS		10

/*
 * FD_MINFREE is the smallest number of usable descriptors, counted after
 * the reserved ones have been subtracted, that we are willing to run with.
 * Anything less makes us give up.
 */
#define FD_MINFREE				10
118
119
/*
 * GUC parameter: upper bound, chosen by the DBA, on the descriptors a single
 * process may use.  Many platforms let one process open far more files than
 * the system can sustain when *many* processes do likewise, so this allows
 * max_safe_fds to be capped below what the postmaster's initial probe finds.
 */
int			max_files_per_process = 1000;

/*
 * Budget of descriptors available for VFD entries plus
 * AllocateFile/AllocateDir/OpenTransientFile handles.
 *
 * The initial value is deliberately conservative and is never changed in
 * bootstrap or standalone-backend operation.  Under a postmaster,
 * set_max_safe_fds() is called late in postmaster initialization to compute
 * the real value, which forked subprocesses then inherit.
 *
 * max_files_per_process is already folded into this value when it is
 * computed, so callers need not check that setting separately.
 */
int			max_safe_fds = 32;	/* default if not changed */

/* Is it safe to keep running after an fsync() failure? */
bool		data_sync_retry = false;
143
/*
 * Debugging support.
 *
 * DO_DB(A) executes statement A only in builds compiled with FDDEBUG, and
 * restores errno afterwards so that the debugging code cannot disturb the
 * error state seen by the surrounding logic.  In normal builds it expands
 * to a no-op and A is not evaluated at all.
 */
#ifndef FDDEBUG
#define DO_DB(A)	((void) 0)
#else
#define DO_DB(A) \
	do { \
		int			do_db_errno_backup = errno; \
		A; \
		errno = do_db_errno_backup; \
	} while (0)
#endif
157
/* Value stored in a VFD's "fd" field while no kernel descriptor is attached */
#define VFD_CLOSED (-1)

/*
 * A File is valid when it is a nonzero index inside VfdCache and the slot
 * has a file name, i.e. the VFD is in use.
 */
#define FileIsValid(file) \
	((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)

/* True when the VFD currently has no kernel descriptor behind it */
#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)

/*
 * A VFD's seekPos is normally valid at all times.  Should an lseek() ever
 * fail, however, seekPos may be left as FileUnknownPos.  Many operations
 * can carry on without knowing the position, but a few have to fail when it
 * is not available.
 */
#define FileUnknownPos ((off_t) -1)
#define FilePosIsUnknown(pos) ((pos) < 0)

/* Bits that may be set in a VFD's fdstate field: */
#define FD_TEMPORARY		(1 << 0)	/* set = delete file when closed */
#define FD_XACT_TEMPORARY	(1 << 1)	/* set = delete file at eoXact */
177
178 typedef struct vfd
179 {
180 int fd; /* current FD, or VFD_CLOSED if none */
181 unsigned short fdstate; /* bitflags for VFD's state */
182 ResourceOwner resowner; /* owner, for automatic cleanup */
183 File nextFree; /* link to next free VFD, if in freelist */
184 File lruMoreRecently; /* doubly linked recency-of-use list */
185 File lruLessRecently;
186 off_t seekPos; /* current logical file position, or -1 */
187 off_t fileSize; /* current size of file (0 if not temporary) */
188 char *fileName; /* name of file, or NULL for unused VFD */
189 /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
190 int fileFlags; /* open(2) flags for (re)opening the file */
191 int fileMode; /* mode to pass to open(2) */
192 } Vfd;
193
194 /*
195 * Virtual File Descriptor array pointer and size. This grows as
196 * needed. 'File' values are indexes into this array.
197 * Note that VfdCache[0] is not a usable VFD, just a list header.
198 */
199 static Vfd *VfdCache;
200 static Size SizeVfdCache = 0;
201
202 /*
203 * Number of file descriptors known to be in use by VFD entries.
204 */
205 static int nfile = 0;
206
207 /*
208 * Flag to tell whether it's worth scanning VfdCache looking for temp files
209 * to close
210 */
211 static bool have_xact_temporary_files = false;
212
213 /*
214 * Tracks the total size of all temporary files. Note: when temp_file_limit
215 * is being enforced, this cannot overflow since the limit cannot be more
216 * than INT_MAX kilobytes. When not enforcing, it could theoretically
217 * overflow, but we don't care.
218 */
219 static uint64 temporary_files_size = 0;
220
221 /*
222 * List of OS handles opened with AllocateFile, AllocateDir and
223 * OpenTransientFile.
224 */
225 typedef enum
226 {
227 AllocateDescFile,
228 AllocateDescPipe,
229 AllocateDescDir,
230 AllocateDescRawFD
231 } AllocateDescKind;
232
233 typedef struct
234 {
235 AllocateDescKind kind;
236 SubTransactionId create_subid;
237 union
238 {
239 FILE *file;
240 DIR *dir;
241 int fd;
242 } desc;
243 } AllocateDesc;
244
245 static int numAllocatedDescs = 0;
246 static int maxAllocatedDescs = 0;
247 static AllocateDesc *allocatedDescs = NULL;
248
249 /*
250 * Number of temporary files opened during the current session;
251 * this is used in generation of tempfile names.
252 */
253 static long tempFileCounter = 0;
254
255 /*
256 * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid,
257 * indicating that the current database's default tablespace should be used.)
258 * When numTempTableSpaces is -1, this has not been set in the current
259 * transaction.
260 */
261 static Oid *tempTableSpaces = NULL;
262 static int numTempTableSpaces = -1;
263 static int nextTempTableSpace = 0;
264
265
266 /*--------------------
267 *
268 * Private Routines
269 *
270 * Delete - delete a file from the Lru ring
271 * LruDelete - remove a file from the Lru ring and close its FD
272 * Insert - put a file at the front of the Lru ring
273 * LruInsert - put a file at the front of the Lru ring and open it
274 * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
275 * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
276 * AllocateVfd - grab a free (or new) file record (from VfdArray)
277 * FreeVfd - free a file record
278 *
279 * The Least Recently Used ring is a doubly linked list that begins and
280 * ends on element zero. Element zero is special -- it doesn't represent
281 * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
282 * anchor that shows us the beginning/end of the ring.
283 * Only VFD elements that are currently really open (have an FD assigned) are
284 * in the Lru ring. Elements that are "virtually" open can be recognized
285 * by having a non-null fileName field.
286 *
287 * example:
288 *
289 * /--less----\ /---------\
290 * v \ v \
291 * #0 --more---> LeastRecentlyUsed --more-\ \
292 * ^\ | |
293 * \\less--> MostRecentlyUsedFile <---/ |
294 * \more---/ \--less--/
295 *
296 *--------------------
297 */
298 static void Delete(File file);
299 static void LruDelete(File file);
300 static void Insert(File file);
301 static int LruInsert(File file);
302 static bool ReleaseLruFile(void);
303 static void ReleaseLruFiles(void);
304 static File AllocateVfd(void);
305 static void FreeVfd(File file);
306
307 static int FileAccess(File file);
308 static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
309 static bool reserveAllocatedDesc(void);
310 static int FreeDesc(AllocateDesc *desc);
311
312 static void AtProcExit_Files(int code, Datum arg);
313 static void CleanupTempFiles(bool isProcExit);
314 static void RemovePgTempFilesInDir(const char *tmpdirname);
315 static void RemovePgTempRelationFiles(const char *tsdirname);
316 static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
317 static bool looks_like_temp_rel_name(const char *name);
318
319 static void walkdir(const char *path,
320 void (*action) (const char *fname, bool isdir, int elevel),
321 bool process_symlinks,
322 int elevel);
323 #ifdef PG_FLUSH_DATA_WORKS
324 static void pre_sync_fname(const char *fname, bool isdir, int elevel);
325 #endif
326 static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
327
328 static int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
329 static int fsync_parent_path(const char *fname, int elevel);
330
331
/*
 * pg_fsync --- fsync a descriptor, with or without writethrough
 *
 * Dispatches to pg_fsync_writethrough() when sync_method asks for it, and to
 * pg_fsync_no_writethrough() otherwise.  Returns whatever the chosen routine
 * returns.
 */
int
pg_fsync(int fd)
{
	/*
	 * The sync_method test is only compiled in on platforms where
	 * writethrough exists and is something different from plain fsync.
	 */
#if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
	if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
		return pg_fsync_writethrough(fd);
#endif

	return pg_fsync_no_writethrough(fd);
}
346
347
348 /*
349 * pg_fsync_no_writethrough --- same as fsync except does nothing if
350 * enableFsync is off
351 */
352 int
pg_fsync_no_writethrough(int fd)353 pg_fsync_no_writethrough(int fd)
354 {
355 if (enableFsync)
356 return fsync(fd);
357 else
358 return 0;
359 }
360
361 /*
362 * pg_fsync_writethrough
363 */
364 int
pg_fsync_writethrough(int fd)365 pg_fsync_writethrough(int fd)
366 {
367 if (enableFsync)
368 {
369 #ifdef WIN32
370 return _commit(fd);
371 #elif defined(F_FULLFSYNC)
372 return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
373 #else
374 errno = ENOSYS;
375 return -1;
376 #endif
377 }
378 else
379 return 0;
380 }
381
382 /*
383 * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
384 *
385 * Not all platforms have fdatasync; treat as fsync if not available.
386 */
387 int
pg_fdatasync(int fd)388 pg_fdatasync(int fd)
389 {
390 if (enableFsync)
391 {
392 #ifdef HAVE_FDATASYNC
393 return fdatasync(fd);
394 #else
395 return fsync(fd);
396 #endif
397 }
398 else
399 return 0;
400 }
401
402 /*
403 * pg_flush_data --- advise OS that the described dirty data should be flushed
404 *
405 * offset of 0 with nbytes 0 means that the entire file should be flushed;
406 * in this case, this function may have side-effects on the file's
407 * seek position!
408 */
409 void
pg_flush_data(int fd,off_t offset,off_t nbytes)410 pg_flush_data(int fd, off_t offset, off_t nbytes)
411 {
412 /*
413 * Right now file flushing is primarily used to avoid making later
414 * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
415 * if fsyncs are disabled - that's a decision we might want to make
416 * configurable at some point.
417 */
418 if (!enableFsync)
419 return;
420
421 /*
422 * We compile all alternatives that are supported on the current platform,
423 * to find portability problems more easily.
424 */
425 #if defined(HAVE_SYNC_FILE_RANGE)
426 {
427 int rc;
428 static bool not_implemented_by_kernel = false;
429
430 if (not_implemented_by_kernel)
431 return;
432
433 /*
434 * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
435 * tells the OS that writeback for the specified blocks should be
436 * started, but that we don't want to wait for completion. Note that
437 * this call might block if too much dirty data exists in the range.
438 * This is the preferable method on OSs supporting it, as it works
439 * reliably when available (contrast to msync()) and doesn't flush out
440 * clean data (like FADV_DONTNEED).
441 */
442 rc = sync_file_range(fd, offset, nbytes,
443 SYNC_FILE_RANGE_WRITE);
444 if (rc != 0)
445 {
446 int elevel;
447
448 /*
449 * For systems that don't have an implementation of
450 * sync_file_range() such as Windows WSL, generate only one
451 * warning and then suppress all further attempts by this process.
452 */
453 if (errno == ENOSYS)
454 {
455 elevel = WARNING;
456 not_implemented_by_kernel = true;
457 }
458 else
459 elevel = data_sync_elevel(WARNING);
460
461 ereport(elevel,
462 (errcode_for_file_access(),
463 errmsg("could not flush dirty data: %m")));
464 }
465
466 return;
467 }
468 #endif
469 #if !defined(WIN32) && defined(MS_ASYNC)
470 {
471 void *p;
472 static int pagesize = 0;
473
474 /*
475 * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
476 * writeback. On linux it only does so if MS_SYNC is specified, but
477 * then it does the writeback synchronously. Luckily all common linux
478 * systems have sync_file_range(). This is preferable over
479 * FADV_DONTNEED because it doesn't flush out clean data.
480 *
481 * We map the file (mmap()), tell the kernel to sync back the contents
482 * (msync()), and then remove the mapping again (munmap()).
483 */
484
485 /* mmap() needs actual length if we want to map whole file */
486 if (offset == 0 && nbytes == 0)
487 {
488 nbytes = lseek(fd, 0, SEEK_END);
489 if (nbytes < 0)
490 {
491 ereport(WARNING,
492 (errcode_for_file_access(),
493 errmsg("could not determine dirty data size: %m")));
494 return;
495 }
496 }
497
498 /*
499 * Some platforms reject partial-page mmap() attempts. To deal with
500 * that, just truncate the request to a page boundary. If any extra
501 * bytes don't get flushed, well, it's only a hint anyway.
502 */
503
504 /* fetch pagesize only once */
505 if (pagesize == 0)
506 pagesize = sysconf(_SC_PAGESIZE);
507
508 /* align length to pagesize, dropping any fractional page */
509 if (pagesize > 0)
510 nbytes = (nbytes / pagesize) * pagesize;
511
512 /* fractional-page request is a no-op */
513 if (nbytes <= 0)
514 return;
515
516 /*
517 * mmap could well fail, particularly on 32-bit platforms where there
518 * may simply not be enough address space. If so, silently fall
519 * through to the next implementation.
520 */
521 if (nbytes <= (off_t) SSIZE_MAX)
522 p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
523 else
524 p = MAP_FAILED;
525
526 if (p != MAP_FAILED)
527 {
528 int rc;
529
530 rc = msync(p, (size_t) nbytes, MS_ASYNC);
531 if (rc != 0)
532 {
533 ereport(data_sync_elevel(WARNING),
534 (errcode_for_file_access(),
535 errmsg("could not flush dirty data: %m")));
536 /* NB: need to fall through to munmap()! */
537 }
538
539 rc = munmap(p, (size_t) nbytes);
540 if (rc != 0)
541 {
542 /* FATAL error because mapping would remain */
543 ereport(FATAL,
544 (errcode_for_file_access(),
545 errmsg("could not munmap() while flushing data: %m")));
546 }
547
548 return;
549 }
550 }
551 #endif
552 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
553 {
554 int rc;
555
556 /*
557 * Signal the kernel that the passed in range should not be cached
558 * anymore. This has the, desired, side effect of writing out dirty
559 * data, and the, undesired, side effect of likely discarding useful
560 * clean cached blocks. For the latter reason this is the least
561 * preferable method.
562 */
563
564 rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
565
566 if (rc != 0)
567 {
568 /* don't error out, this is just a performance optimization */
569 ereport(WARNING,
570 (errcode_for_file_access(),
571 errmsg("could not flush dirty data: %m")));
572 }
573
574 return;
575 }
576 #endif
577 }
578
579
580 /*
581 * fsync_fname -- fsync a file or directory, handling errors properly
582 *
583 * Try to fsync a file or directory. When doing the latter, ignore errors that
584 * indicate the OS just doesn't allow/require fsyncing directories.
585 */
586 void
fsync_fname(const char * fname,bool isdir)587 fsync_fname(const char *fname, bool isdir)
588 {
589 fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
590 }
591
592 /*
593 * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
594 *
595 * This routine ensures that, after returning, the effect of renaming file
596 * persists in case of a crash. A crash while this routine is running will
597 * leave you with either the pre-existing or the moved file in place of the
598 * new file; no mixed state or truncated files are possible.
599 *
600 * It does so by using fsync on the old filename and the possibly existing
601 * target filename before the rename, and the target file and directory after.
602 *
603 * Note that rename() cannot be used across arbitrary directories, as they
604 * might not be on the same filesystem. Therefore this routine does not
605 * support renaming across directories.
606 *
607 * Log errors with the caller specified severity.
608 *
609 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
610 * valid upon return.
611 */
612 int
durable_rename(const char * oldfile,const char * newfile,int elevel)613 durable_rename(const char *oldfile, const char *newfile, int elevel)
614 {
615 int fd;
616
617 /*
618 * First fsync the old and target path (if it exists), to ensure that they
619 * are properly persistent on disk. Syncing the target file is not
620 * strictly necessary, but it makes it easier to reason about crashes;
621 * because it's then guaranteed that either source or target file exists
622 * after a crash.
623 */
624 if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
625 return -1;
626
627 fd = OpenTransientFile((char *) newfile, PG_BINARY | O_RDWR, 0);
628 if (fd < 0)
629 {
630 if (errno != ENOENT)
631 {
632 ereport(elevel,
633 (errcode_for_file_access(),
634 errmsg("could not open file \"%s\": %m", newfile)));
635 return -1;
636 }
637 }
638 else
639 {
640 if (pg_fsync(fd) != 0)
641 {
642 int save_errno;
643
644 /* close file upon error, might not be in transaction context */
645 save_errno = errno;
646 CloseTransientFile(fd);
647 errno = save_errno;
648
649 ereport(elevel,
650 (errcode_for_file_access(),
651 errmsg("could not fsync file \"%s\": %m", newfile)));
652 return -1;
653 }
654 CloseTransientFile(fd);
655 }
656
657 /* Time to do the real deal... */
658 if (rename(oldfile, newfile) < 0)
659 {
660 ereport(elevel,
661 (errcode_for_file_access(),
662 errmsg("could not rename file \"%s\" to \"%s\": %m",
663 oldfile, newfile)));
664 return -1;
665 }
666
667 /*
668 * To guarantee renaming the file is persistent, fsync the file with its
669 * new name, and its containing directory.
670 */
671 if (fsync_fname_ext(newfile, false, false, elevel) != 0)
672 return -1;
673
674 if (fsync_parent_path(newfile, elevel) != 0)
675 return -1;
676
677 return 0;
678 }
679
/*
 * durable_link_or_rename -- rename a file in a durable manner.
 *
 * Similar to durable_rename(), except that this routine tries (but does not
 * guarantee) not to overwrite the target file.
 *
 * Note that a crash in an unfortunate moment can leave you with two links to
 * the target file.  The same state results if the old name cannot be removed
 * after a successful link(); that case is logged, but since the new name is
 * in place the operation is still treated as successful.
 *
 * oldfile is the existing path, newfile the path it should appear under, and
 * elevel the severity with which failures are logged.
 *
 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
 * valid upon return.
 */
int
durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
{
	/*
	 * Ensure that, if we crash directly after the rename/link, a file with
	 * valid contents is moved into place.
	 */
	if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
		return -1;

#if HAVE_WORKING_LINK
	if (link(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not link file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}

	/*
	 * Removing the old name is best effort: report a failure so the stray
	 * link does not go unnoticed, but carry on.
	 */
	if (unlink(oldfile) < 0)
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not remove file \"%s\": %m", oldfile)));
#else
	/* XXX: Add racy file existence check? */
	if (rename(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}
#endif

	/*
	 * Make change persistent in case of an OS crash, both the new entry and
	 * its parent directory need to be flushed.
	 */
	if (fsync_fname_ext(newfile, false, false, elevel) != 0)
		return -1;

	/* Same for parent directory */
	if (fsync_parent_path(newfile, elevel) != 0)
		return -1;

	return 0;
}
739
740 /*
741 * InitFileAccess --- initialize this module during backend startup
742 *
743 * This is called during either normal or standalone backend start.
744 * It is *not* called in the postmaster.
745 */
746 void
InitFileAccess(void)747 InitFileAccess(void)
748 {
749 Assert(SizeVfdCache == 0); /* call me only once */
750
751 /* initialize cache header entry */
752 VfdCache = (Vfd *) malloc(sizeof(Vfd));
753 if (VfdCache == NULL)
754 ereport(FATAL,
755 (errcode(ERRCODE_OUT_OF_MEMORY),
756 errmsg("out of memory")));
757
758 MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
759 VfdCache->fd = VFD_CLOSED;
760
761 SizeVfdCache = 1;
762
763 /* register proc-exit hook to ensure temp files are dropped at exit */
764 on_proc_exit(AtProcExit_Files, 0);
765 }
766
767 /*
768 * count_usable_fds --- count how many FDs the system will let us open,
769 * and estimate how many are already open.
770 *
771 * We stop counting if usable_fds reaches max_to_probe. Note: a small
772 * value of max_to_probe might result in an underestimate of already_open;
773 * we must fill in any "gaps" in the set of used FDs before the calculation
774 * of already_open will give the right answer. In practice, max_to_probe
775 * of a couple of dozen should be enough to ensure good results.
776 *
777 * We assume stdin (FD 0) is available for dup'ing
778 */
779 static void
count_usable_fds(int max_to_probe,int * usable_fds,int * already_open)780 count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
781 {
782 int *fd;
783 int size;
784 int used = 0;
785 int highestfd = 0;
786 int j;
787
788 #ifdef HAVE_GETRLIMIT
789 struct rlimit rlim;
790 int getrlimit_status;
791 #endif
792
793 size = 1024;
794 fd = (int *) palloc(size * sizeof(int));
795
796 #ifdef HAVE_GETRLIMIT
797 #ifdef RLIMIT_NOFILE /* most platforms use RLIMIT_NOFILE */
798 getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
799 #else /* but BSD doesn't ... */
800 getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
801 #endif /* RLIMIT_NOFILE */
802 if (getrlimit_status != 0)
803 ereport(WARNING, (errmsg("getrlimit failed: %m")));
804 #endif /* HAVE_GETRLIMIT */
805
806 /* dup until failure or probe limit reached */
807 for (;;)
808 {
809 int thisfd;
810
811 #ifdef HAVE_GETRLIMIT
812
813 /*
814 * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
815 * some platforms
816 */
817 if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
818 break;
819 #endif
820
821 thisfd = dup(0);
822 if (thisfd < 0)
823 {
824 /* Expect EMFILE or ENFILE, else it's fishy */
825 if (errno != EMFILE && errno != ENFILE)
826 elog(WARNING, "dup(0) failed after %d successes: %m", used);
827 break;
828 }
829
830 if (used >= size)
831 {
832 size *= 2;
833 fd = (int *) repalloc(fd, size * sizeof(int));
834 }
835 fd[used++] = thisfd;
836
837 if (highestfd < thisfd)
838 highestfd = thisfd;
839
840 if (used >= max_to_probe)
841 break;
842 }
843
844 /* release the files we opened */
845 for (j = 0; j < used; j++)
846 close(fd[j]);
847
848 pfree(fd);
849
850 /*
851 * Return results. usable_fds is just the number of successful dups. We
852 * assume that the system limit is highestfd+1 (remember 0 is a legal FD
853 * number) and so already_open is highestfd+1 - usable_fds.
854 */
855 *usable_fds = used;
856 *already_open = highestfd + 1 - used;
857 }
858
859 /*
860 * set_max_safe_fds
861 * Determine number of filedescriptors that fd.c is allowed to use
862 */
863 void
set_max_safe_fds(void)864 set_max_safe_fds(void)
865 {
866 int usable_fds;
867 int already_open;
868
869 /*----------
870 * We want to set max_safe_fds to
871 * MIN(usable_fds, max_files_per_process - already_open)
872 * less the slop factor for files that are opened without consulting
873 * fd.c. This ensures that we won't exceed either max_files_per_process
874 * or the experimentally-determined EMFILE limit.
875 *----------
876 */
877 count_usable_fds(max_files_per_process,
878 &usable_fds, &already_open);
879
880 max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
881
882 /*
883 * Take off the FDs reserved for system() etc.
884 */
885 max_safe_fds -= NUM_RESERVED_FDS;
886
887 /*
888 * Make sure we still have enough to get by.
889 */
890 if (max_safe_fds < FD_MINFREE)
891 ereport(FATAL,
892 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
893 errmsg("insufficient file descriptors available to start server process"),
894 errdetail("System allows %d, we need at least %d.",
895 max_safe_fds + NUM_RESERVED_FDS,
896 FD_MINFREE + NUM_RESERVED_FDS)));
897
898 elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
899 max_safe_fds, usable_fds, already_open);
900 }
901
902 /*
903 * BasicOpenFile --- same as open(2) except can free other FDs if needed
904 *
905 * This is exported for use by places that really want a plain kernel FD,
906 * but need to be proof against running out of FDs. Once an FD has been
907 * successfully returned, it is the caller's responsibility to ensure that
908 * it will not be leaked on ereport()! Most users should *not* call this
909 * routine directly, but instead use the VFD abstraction level, which
910 * provides protection against descriptor leaks as well as management of
911 * files that need to be open for more than a short period of time.
912 *
913 * Ideally this should be the *only* direct call of open() in the backend.
914 * In practice, the postmaster calls open() directly, and there are some
915 * direct open() calls done early in backend startup. Those are OK since
916 * this module wouldn't have any open files to close at that point anyway.
917 */
918 int
BasicOpenFile(FileName fileName,int fileFlags,int fileMode)919 BasicOpenFile(FileName fileName, int fileFlags, int fileMode)
920 {
921 int fd;
922
923 tryAgain:
924 fd = open(fileName, fileFlags, fileMode);
925
926 if (fd >= 0)
927 return fd; /* success! */
928
929 if (errno == EMFILE || errno == ENFILE)
930 {
931 int save_errno = errno;
932
933 ereport(LOG,
934 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
935 errmsg("out of file descriptors: %m; release and retry")));
936 errno = 0;
937 if (ReleaseLruFile())
938 goto tryAgain;
939 errno = save_errno;
940 }
941
942 return -1; /* failure */
943 }
944
#if defined(FDDEBUG)

/*
 * _dump_lru --- debugging aid: log the LRU ring as one line
 *
 * Starting at the most recently used entry, the VFD indexes are listed in
 * order of decreasing recency, ending with the anchor (index 0).  Output
 * that does not fit into the fixed-size buffer is cut off.
 */
static void
_dump_lru(void)
{
	char		buf[2048];
	size_t		len;
	int			cur = VfdCache[0].lruLessRecently;

	snprintf(buf, sizeof(buf), "LRU: MOST %d ", cur);

	while (cur != 0)
	{
		cur = VfdCache[cur].lruLessRecently;
		len = strlen(buf);
		snprintf(buf + len, sizeof(buf) - len, "%d ", cur);
	}

	len = strlen(buf);
	snprintf(buf + len, sizeof(buf) - len, "LEAST");

	elog(LOG, "%s", buf);
}
#endif   /* FDDEBUG */
965
966 static void
Delete(File file)967 Delete(File file)
968 {
969 Vfd *vfdP;
970
971 Assert(file != 0);
972
973 DO_DB(elog(LOG, "Delete %d (%s)",
974 file, VfdCache[file].fileName));
975 DO_DB(_dump_lru());
976
977 vfdP = &VfdCache[file];
978
979 VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
980 VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
981
982 DO_DB(_dump_lru());
983 }
984
985 static void
LruDelete(File file)986 LruDelete(File file)
987 {
988 Vfd *vfdP;
989
990 Assert(file != 0);
991
992 DO_DB(elog(LOG, "LruDelete %d (%s)",
993 file, VfdCache[file].fileName));
994
995 vfdP = &VfdCache[file];
996
997 /*
998 * Normally we should know the seek position, but if for some reason we
999 * have lost track of it, try again to get it. If we still can't get it,
1000 * we have a problem: we will be unable to restore the file seek position
1001 * when and if the file is re-opened. But we can't really throw an error
1002 * and refuse to close the file, or activities such as transaction cleanup
1003 * will be broken.
1004 */
1005 if (FilePosIsUnknown(vfdP->seekPos))
1006 {
1007 vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
1008 if (FilePosIsUnknown(vfdP->seekPos))
1009 elog(LOG, "could not seek file \"%s\" before closing: %m",
1010 vfdP->fileName);
1011 }
1012
1013 /*
1014 * Close the file. We aren't expecting this to fail; if it does, better
1015 * to leak the FD than to mess up our internal state.
1016 */
1017 if (close(vfdP->fd))
1018 elog(vfdP->fdstate & FD_TEMPORARY ? LOG : data_sync_elevel(LOG),
1019 "could not close file \"%s\": %m", vfdP->fileName);
1020 vfdP->fd = VFD_CLOSED;
1021 --nfile;
1022
1023 /* delete the vfd record from the LRU ring */
1024 Delete(file);
1025 }
1026
1027 static void
Insert(File file)1028 Insert(File file)
1029 {
1030 Vfd *vfdP;
1031
1032 Assert(file != 0);
1033
1034 DO_DB(elog(LOG, "Insert %d (%s)",
1035 file, VfdCache[file].fileName));
1036 DO_DB(_dump_lru());
1037
1038 vfdP = &VfdCache[file];
1039
1040 vfdP->lruMoreRecently = 0;
1041 vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1042 VfdCache[0].lruLessRecently = file;
1043 VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1044
1045 DO_DB(_dump_lru());
1046 }
1047
1048 /* returns 0 on success, -1 on re-open failure (with errno set) */
1049 static int
LruInsert(File file)1050 LruInsert(File file)
1051 {
1052 Vfd *vfdP;
1053
1054 Assert(file != 0);
1055
1056 DO_DB(elog(LOG, "LruInsert %d (%s)",
1057 file, VfdCache[file].fileName));
1058
1059 vfdP = &VfdCache[file];
1060
1061 if (FileIsNotOpen(file))
1062 {
1063 /* Close excess kernel FDs. */
1064 ReleaseLruFiles();
1065
1066 /*
1067 * The open could still fail for lack of file descriptors, eg due to
1068 * overall system file table being full. So, be prepared to release
1069 * another FD if necessary...
1070 */
1071 vfdP->fd = BasicOpenFile(vfdP->fileName, vfdP->fileFlags,
1072 vfdP->fileMode);
1073 if (vfdP->fd < 0)
1074 {
1075 DO_DB(elog(LOG, "re-open failed: %m"));
1076 return -1;
1077 }
1078 else
1079 {
1080 ++nfile;
1081 }
1082
1083 /*
1084 * Seek to the right position. We need no special case for seekPos
1085 * equal to FileUnknownPos, as lseek() will certainly reject that
1086 * (thus completing the logic noted in LruDelete() that we will fail
1087 * to re-open a file if we couldn't get its seek position before
1088 * closing).
1089 */
1090 if (vfdP->seekPos != (off_t) 0)
1091 {
1092 if (lseek(vfdP->fd, vfdP->seekPos, SEEK_SET) < 0)
1093 {
1094 /*
1095 * If we fail to restore the seek position, treat it like an
1096 * open() failure.
1097 */
1098 int save_errno = errno;
1099
1100 elog(LOG, "could not seek file \"%s\" after re-opening: %m",
1101 vfdP->fileName);
1102 (void) close(vfdP->fd);
1103 vfdP->fd = VFD_CLOSED;
1104 --nfile;
1105 errno = save_errno;
1106 return -1;
1107 }
1108 }
1109 }
1110
1111 /*
1112 * put it at the head of the Lru ring
1113 */
1114
1115 Insert(file);
1116
1117 return 0;
1118 }
1119
1120 /*
1121 * Release one kernel FD by closing the least-recently-used VFD.
1122 */
1123 static bool
ReleaseLruFile(void)1124 ReleaseLruFile(void)
1125 {
1126 DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1127
1128 if (nfile > 0)
1129 {
1130 /*
1131 * There are opened files and so there should be at least one used vfd
1132 * in the ring.
1133 */
1134 Assert(VfdCache[0].lruMoreRecently != 0);
1135 LruDelete(VfdCache[0].lruMoreRecently);
1136 return true; /* freed a file */
1137 }
1138 return false; /* no files available to free */
1139 }
1140
1141 /*
1142 * Release kernel FDs as needed to get under the max_safe_fds limit.
1143 * After calling this, it's OK to try to open another file.
1144 */
1145 static void
ReleaseLruFiles(void)1146 ReleaseLruFiles(void)
1147 {
1148 while (nfile + numAllocatedDescs >= max_safe_fds)
1149 {
1150 if (!ReleaseLruFile())
1151 break;
1152 }
1153 }
1154
1155 static File
AllocateVfd(void)1156 AllocateVfd(void)
1157 {
1158 Index i;
1159 File file;
1160
1161 DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1162
1163 Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1164
1165 if (VfdCache[0].nextFree == 0)
1166 {
1167 /*
1168 * The free list is empty so it is time to increase the size of the
1169 * array. We choose to double it each time this happens. However,
1170 * there's not much point in starting *real* small.
1171 */
1172 Size newCacheSize = SizeVfdCache * 2;
1173 Vfd *newVfdCache;
1174
1175 if (newCacheSize < 32)
1176 newCacheSize = 32;
1177
1178 /*
1179 * Be careful not to clobber VfdCache ptr if realloc fails.
1180 */
1181 newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1182 if (newVfdCache == NULL)
1183 ereport(ERROR,
1184 (errcode(ERRCODE_OUT_OF_MEMORY),
1185 errmsg("out of memory")));
1186 VfdCache = newVfdCache;
1187
1188 /*
1189 * Initialize the new entries and link them into the free list.
1190 */
1191 for (i = SizeVfdCache; i < newCacheSize; i++)
1192 {
1193 MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1194 VfdCache[i].nextFree = i + 1;
1195 VfdCache[i].fd = VFD_CLOSED;
1196 }
1197 VfdCache[newCacheSize - 1].nextFree = 0;
1198 VfdCache[0].nextFree = SizeVfdCache;
1199
1200 /*
1201 * Record the new size
1202 */
1203 SizeVfdCache = newCacheSize;
1204 }
1205
1206 file = VfdCache[0].nextFree;
1207
1208 VfdCache[0].nextFree = VfdCache[file].nextFree;
1209
1210 return file;
1211 }
1212
1213 static void
FreeVfd(File file)1214 FreeVfd(File file)
1215 {
1216 Vfd *vfdP = &VfdCache[file];
1217
1218 DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1219 file, vfdP->fileName ? vfdP->fileName : ""));
1220
1221 if (vfdP->fileName != NULL)
1222 {
1223 free(vfdP->fileName);
1224 vfdP->fileName = NULL;
1225 }
1226 vfdP->fdstate = 0x0;
1227
1228 vfdP->nextFree = VfdCache[0].nextFree;
1229 VfdCache[0].nextFree = file;
1230 }
1231
1232 /* returns 0 on success, -1 on re-open failure (with errno set) */
1233 static int
FileAccess(File file)1234 FileAccess(File file)
1235 {
1236 int returnValue;
1237
1238 DO_DB(elog(LOG, "FileAccess %d (%s)",
1239 file, VfdCache[file].fileName));
1240
1241 /*
1242 * Is the file open? If not, open it and put it at the head of the LRU
1243 * ring (possibly closing the least recently used file to get an FD).
1244 */
1245
1246 if (FileIsNotOpen(file))
1247 {
1248 returnValue = LruInsert(file);
1249 if (returnValue != 0)
1250 return returnValue;
1251 }
1252 else if (VfdCache[0].lruLessRecently != file)
1253 {
1254 /*
1255 * We now know that the file is open and that it is not the last one
1256 * accessed, so we need to move it to the head of the Lru ring.
1257 */
1258
1259 Delete(file);
1260 Insert(file);
1261 }
1262
1263 return 0;
1264 }
1265
1266 /*
1267 * Called when we get a shared invalidation message on some relation.
1268 */
1269 #ifdef NOT_USED
1270 void
FileInvalidate(File file)1271 FileInvalidate(File file)
1272 {
1273 Assert(FileIsValid(file));
1274 if (!FileIsNotOpen(file))
1275 LruDelete(file);
1276 }
1277 #endif
1278
1279 /*
1280 * open a file in an arbitrary directory
1281 *
1282 * NB: if the passed pathname is relative (which it usually is),
1283 * it will be interpreted relative to the process' working directory
1284 * (which should always be $PGDATA when this code is running).
1285 */
1286 File
PathNameOpenFile(FileName fileName,int fileFlags,int fileMode)1287 PathNameOpenFile(FileName fileName, int fileFlags, int fileMode)
1288 {
1289 char *fnamecopy;
1290 File file;
1291 Vfd *vfdP;
1292
1293 DO_DB(elog(LOG, "PathNameOpenFile: %s %x %o",
1294 fileName, fileFlags, fileMode));
1295
1296 /*
1297 * We need a malloc'd copy of the file name; fail cleanly if no room.
1298 */
1299 fnamecopy = strdup(fileName);
1300 if (fnamecopy == NULL)
1301 ereport(ERROR,
1302 (errcode(ERRCODE_OUT_OF_MEMORY),
1303 errmsg("out of memory")));
1304
1305 file = AllocateVfd();
1306 vfdP = &VfdCache[file];
1307
1308 /* Close excess kernel FDs. */
1309 ReleaseLruFiles();
1310
1311 vfdP->fd = BasicOpenFile(fileName, fileFlags, fileMode);
1312
1313 if (vfdP->fd < 0)
1314 {
1315 int save_errno = errno;
1316
1317 FreeVfd(file);
1318 free(fnamecopy);
1319 errno = save_errno;
1320 return -1;
1321 }
1322 ++nfile;
1323 DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1324 vfdP->fd));
1325
1326 vfdP->fileName = fnamecopy;
1327 /* Saved flags are adjusted to be OK for re-opening file */
1328 vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1329 vfdP->fileMode = fileMode;
1330 vfdP->seekPos = 0;
1331 vfdP->fileSize = 0;
1332 vfdP->fdstate = 0x0;
1333 vfdP->resowner = NULL;
1334
1335 Insert(file);
1336
1337 return file;
1338 }
1339
1340 /*
1341 * Open a temporary file that will disappear when we close it.
1342 *
1343 * This routine takes care of generating an appropriate tempfile name.
1344 * There's no need to pass in fileFlags or fileMode either, since only
1345 * one setting makes any sense for a temp file.
1346 *
1347 * Unless interXact is true, the file is remembered by CurrentResourceOwner
1348 * to ensure it's closed and deleted when it's no longer needed, typically at
1349 * the end-of-transaction. In most cases, you don't want temporary files to
1350 * outlive the transaction that created them, so this should be false -- but
1351 * if you need "somewhat" temporary storage, this might be useful. In either
1352 * case, the file is removed when the File is explicitly closed.
1353 */
1354 File
OpenTemporaryFile(bool interXact)1355 OpenTemporaryFile(bool interXact)
1356 {
1357 File file = 0;
1358
1359 /*
1360 * Make sure the current resource owner has space for this File before we
1361 * open it, if we'll be registering it below.
1362 */
1363 if (!interXact)
1364 ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1365
1366 /*
1367 * If some temp tablespace(s) have been given to us, try to use the next
1368 * one. If a given tablespace can't be found, we silently fall back to
1369 * the database's default tablespace.
1370 *
1371 * BUT: if the temp file is slated to outlive the current transaction,
1372 * force it into the database's default tablespace, so that it will not
1373 * pose a threat to possible tablespace drop attempts.
1374 */
1375 if (numTempTableSpaces > 0 && !interXact)
1376 {
1377 Oid tblspcOid = GetNextTempTableSpace();
1378
1379 if (OidIsValid(tblspcOid))
1380 file = OpenTemporaryFileInTablespace(tblspcOid, false);
1381 }
1382
1383 /*
1384 * If not, or if tablespace is bad, create in database's default
1385 * tablespace. MyDatabaseTableSpace should normally be set before we get
1386 * here, but just in case it isn't, fall back to pg_default tablespace.
1387 */
1388 if (file <= 0)
1389 file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
1390 MyDatabaseTableSpace :
1391 DEFAULTTABLESPACE_OID,
1392 true);
1393
1394 /* Mark it for deletion at close */
1395 VfdCache[file].fdstate |= FD_TEMPORARY;
1396
1397 /* Register it with the current resource owner */
1398 if (!interXact)
1399 {
1400 VfdCache[file].fdstate |= FD_XACT_TEMPORARY;
1401
1402 VfdCache[file].resowner = CurrentResourceOwner;
1403 ResourceOwnerRememberFile(CurrentResourceOwner, file);
1404
1405 /* ensure cleanup happens at eoxact */
1406 have_xact_temporary_files = true;
1407 }
1408
1409 return file;
1410 }
1411
1412 /*
1413 * Open a temporary file in a specific tablespace.
1414 * Subroutine for OpenTemporaryFile, which see for details.
1415 */
1416 static File
OpenTemporaryFileInTablespace(Oid tblspcOid,bool rejectError)1417 OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1418 {
1419 char tempdirpath[MAXPGPATH];
1420 char tempfilepath[MAXPGPATH];
1421 File file;
1422
1423 /*
1424 * Identify the tempfile directory for this tablespace.
1425 *
1426 * If someone tries to specify pg_global, use pg_default instead.
1427 */
1428 if (tblspcOid == DEFAULTTABLESPACE_OID ||
1429 tblspcOid == GLOBALTABLESPACE_OID)
1430 {
1431 /* The default tablespace is {datadir}/base */
1432 snprintf(tempdirpath, sizeof(tempdirpath), "base/%s",
1433 PG_TEMP_FILES_DIR);
1434 }
1435 else
1436 {
1437 /* All other tablespaces are accessed via symlinks */
1438 snprintf(tempdirpath, sizeof(tempdirpath), "pg_tblspc/%u/%s/%s",
1439 tblspcOid, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
1440 }
1441
1442 /*
1443 * Generate a tempfile name that should be unique within the current
1444 * database instance.
1445 */
1446 snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1447 tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1448
1449 /*
1450 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1451 * temp file that can be reused.
1452 */
1453 file = PathNameOpenFile(tempfilepath,
1454 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY,
1455 0600);
1456 if (file <= 0)
1457 {
1458 /*
1459 * We might need to create the tablespace's tempfile directory, if no
1460 * one has yet done so.
1461 *
1462 * Don't check for error from mkdir; it could fail if someone else
1463 * just did the same thing. If it doesn't work then we'll bomb out on
1464 * the second create attempt, instead.
1465 */
1466 mkdir(tempdirpath, S_IRWXU);
1467
1468 file = PathNameOpenFile(tempfilepath,
1469 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY,
1470 0600);
1471 if (file <= 0 && rejectError)
1472 elog(ERROR, "could not create temporary file \"%s\": %m",
1473 tempfilepath);
1474 }
1475
1476 return file;
1477 }
1478
1479 /*
1480 * close a file when done with it
1481 */
1482 void
FileClose(File file)1483 FileClose(File file)
1484 {
1485 Vfd *vfdP;
1486
1487 Assert(FileIsValid(file));
1488
1489 DO_DB(elog(LOG, "FileClose: %d (%s)",
1490 file, VfdCache[file].fileName));
1491
1492 vfdP = &VfdCache[file];
1493
1494 if (!FileIsNotOpen(file))
1495 {
1496 /* close the file */
1497 if (close(vfdP->fd))
1498 {
1499 /*
1500 * We may need to panic on failure to close non-temporary files;
1501 * see LruDelete.
1502 */
1503 elog(vfdP->fdstate & FD_TEMPORARY ? LOG : data_sync_elevel(LOG),
1504 "could not close file \"%s\": %m", vfdP->fileName);
1505 }
1506
1507 --nfile;
1508 vfdP->fd = VFD_CLOSED;
1509
1510 /* remove the file from the lru ring */
1511 Delete(file);
1512 }
1513
1514 /*
1515 * Delete the file if it was temporary, and make a log entry if wanted
1516 */
1517 if (vfdP->fdstate & FD_TEMPORARY)
1518 {
1519 struct stat filestats;
1520 int stat_errno;
1521
1522 /*
1523 * If we get an error, as could happen within the ereport/elog calls,
1524 * we'll come right back here during transaction abort. Reset the
1525 * flag to ensure that we can't get into an infinite loop. This code
1526 * is arranged to ensure that the worst-case consequence is failing to
1527 * emit log message(s), not failing to attempt the unlink.
1528 */
1529 vfdP->fdstate &= ~FD_TEMPORARY;
1530
1531 /* Subtract its size from current usage (do first in case of error) */
1532 temporary_files_size -= vfdP->fileSize;
1533 vfdP->fileSize = 0;
1534
1535 /* first try the stat() */
1536 if (stat(vfdP->fileName, &filestats))
1537 stat_errno = errno;
1538 else
1539 stat_errno = 0;
1540
1541 /* in any case do the unlink */
1542 if (unlink(vfdP->fileName))
1543 elog(LOG, "could not unlink file \"%s\": %m", vfdP->fileName);
1544
1545 /* and last report the stat results */
1546 if (stat_errno == 0)
1547 {
1548 pgstat_report_tempfile(filestats.st_size);
1549
1550 if (log_temp_files >= 0)
1551 {
1552 if ((filestats.st_size / 1024) >= log_temp_files)
1553 ereport(LOG,
1554 (errmsg("temporary file: path \"%s\", size %lu",
1555 vfdP->fileName,
1556 (unsigned long) filestats.st_size)));
1557 }
1558 }
1559 else
1560 {
1561 errno = stat_errno;
1562 elog(LOG, "could not stat file \"%s\": %m", vfdP->fileName);
1563 }
1564 }
1565
1566 /* Unregister it from the resource owner */
1567 if (vfdP->resowner)
1568 ResourceOwnerForgetFile(vfdP->resowner, file);
1569
1570 /*
1571 * Return the Vfd slot to the free list
1572 */
1573 FreeVfd(file);
1574 }
1575
1576 /*
1577 * FilePrefetch - initiate asynchronous read of a given range of the file.
1578 * The logical seek position is unaffected.
1579 *
1580 * Currently the only implementation of this function is using posix_fadvise
1581 * which is the simplest standardized interface that accomplishes this.
1582 * We could add an implementation using libaio in the future; but note that
1583 * this API is inappropriate for libaio, which wants to have a buffer provided
1584 * to read into.
1585 */
1586 int
FilePrefetch(File file,off_t offset,int amount)1587 FilePrefetch(File file, off_t offset, int amount)
1588 {
1589 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
1590 int returnCode;
1591
1592 Assert(FileIsValid(file));
1593
1594 DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
1595 file, VfdCache[file].fileName,
1596 (int64) offset, amount));
1597
1598 returnCode = FileAccess(file);
1599 if (returnCode < 0)
1600 return returnCode;
1601
1602 returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
1603 POSIX_FADV_WILLNEED);
1604
1605 return returnCode;
1606 #else
1607 Assert(FileIsValid(file));
1608 return 0;
1609 #endif
1610 }
1611
1612 void
FileWriteback(File file,off_t offset,off_t nbytes)1613 FileWriteback(File file, off_t offset, off_t nbytes)
1614 {
1615 int returnCode;
1616
1617 Assert(FileIsValid(file));
1618
1619 DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
1620 file, VfdCache[file].fileName,
1621 (int64) offset, (int64) nbytes));
1622
1623 /*
1624 * Caution: do not call pg_flush_data with nbytes = 0, it could trash the
1625 * file's seek position. We prefer to define that as a no-op here.
1626 */
1627 if (nbytes <= 0)
1628 return;
1629
1630 returnCode = FileAccess(file);
1631 if (returnCode < 0)
1632 return;
1633
1634 pg_flush_data(VfdCache[file].fd, offset, nbytes);
1635 }
1636
1637 int
FileRead(File file,char * buffer,int amount)1638 FileRead(File file, char *buffer, int amount)
1639 {
1640 int returnCode;
1641 Vfd *vfdP;
1642
1643 Assert(FileIsValid(file));
1644
1645 DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
1646 file, VfdCache[file].fileName,
1647 (int64) VfdCache[file].seekPos,
1648 amount, buffer));
1649
1650 returnCode = FileAccess(file);
1651 if (returnCode < 0)
1652 return returnCode;
1653
1654 vfdP = &VfdCache[file];
1655
1656 retry:
1657 returnCode = read(vfdP->fd, buffer, amount);
1658
1659 if (returnCode >= 0)
1660 {
1661 /* if seekPos is unknown, leave it that way */
1662 if (!FilePosIsUnknown(vfdP->seekPos))
1663 vfdP->seekPos += returnCode;
1664 }
1665 else
1666 {
1667 /*
1668 * Windows may run out of kernel buffers and return "Insufficient
1669 * system resources" error. Wait a bit and retry to solve it.
1670 *
1671 * It is rumored that EINTR is also possible on some Unix filesystems,
1672 * in which case immediate retry is indicated.
1673 */
1674 #ifdef WIN32
1675 DWORD error = GetLastError();
1676
1677 switch (error)
1678 {
1679 case ERROR_NO_SYSTEM_RESOURCES:
1680 pg_usleep(1000L);
1681 errno = EINTR;
1682 break;
1683 default:
1684 _dosmaperr(error);
1685 break;
1686 }
1687 #endif
1688 /* OK to retry if interrupted */
1689 if (errno == EINTR)
1690 goto retry;
1691
1692 /* Trouble, so assume we don't know the file position anymore */
1693 vfdP->seekPos = FileUnknownPos;
1694 }
1695
1696 return returnCode;
1697 }
1698
1699 int
FileWrite(File file,char * buffer,int amount)1700 FileWrite(File file, char *buffer, int amount)
1701 {
1702 int returnCode;
1703 Vfd *vfdP;
1704
1705 Assert(FileIsValid(file));
1706
1707 DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
1708 file, VfdCache[file].fileName,
1709 (int64) VfdCache[file].seekPos,
1710 amount, buffer));
1711
1712 returnCode = FileAccess(file);
1713 if (returnCode < 0)
1714 return returnCode;
1715
1716 vfdP = &VfdCache[file];
1717
1718 /*
1719 * If enforcing temp_file_limit and it's a temp file, check to see if the
1720 * write would overrun temp_file_limit, and throw error if so. Note: it's
1721 * really a modularity violation to throw error here; we should set errno
1722 * and return -1. However, there's no way to report a suitable error
1723 * message if we do that. All current callers would just throw error
1724 * immediately anyway, so this is safe at present.
1725 */
1726 if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMPORARY))
1727 {
1728 off_t newPos;
1729
1730 /*
1731 * Normally we should know the seek position, but if for some reason
1732 * we have lost track of it, try again to get it. Here, it's fine to
1733 * throw an error if we still can't get it.
1734 */
1735 if (FilePosIsUnknown(vfdP->seekPos))
1736 {
1737 vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
1738 if (FilePosIsUnknown(vfdP->seekPos))
1739 elog(ERROR, "could not seek file \"%s\": %m", vfdP->fileName);
1740 }
1741
1742 newPos = vfdP->seekPos + amount;
1743 if (newPos > vfdP->fileSize)
1744 {
1745 uint64 newTotal = temporary_files_size;
1746
1747 newTotal += newPos - vfdP->fileSize;
1748 if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
1749 ereport(ERROR,
1750 (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
1751 errmsg("temporary file size exceeds temp_file_limit (%dkB)",
1752 temp_file_limit)));
1753 }
1754 }
1755
1756 retry:
1757 errno = 0;
1758 returnCode = write(vfdP->fd, buffer, amount);
1759
1760 /* if write didn't set errno, assume problem is no disk space */
1761 if (returnCode != amount && errno == 0)
1762 errno = ENOSPC;
1763
1764 if (returnCode >= 0)
1765 {
1766 /* if seekPos is unknown, leave it that way */
1767 if (!FilePosIsUnknown(vfdP->seekPos))
1768 vfdP->seekPos += returnCode;
1769
1770 /*
1771 * Maintain fileSize and temporary_files_size if it's a temp file.
1772 *
1773 * If seekPos is -1 (unknown), this will do nothing; but we could only
1774 * get here in that state if we're not enforcing temporary_files_size,
1775 * so we don't care.
1776 */
1777 if (vfdP->fdstate & FD_TEMPORARY)
1778 {
1779 off_t newPos = vfdP->seekPos;
1780
1781 if (newPos > vfdP->fileSize)
1782 {
1783 temporary_files_size += newPos - vfdP->fileSize;
1784 vfdP->fileSize = newPos;
1785 }
1786 }
1787 }
1788 else
1789 {
1790 /*
1791 * See comments in FileRead()
1792 */
1793 #ifdef WIN32
1794 DWORD error = GetLastError();
1795
1796 switch (error)
1797 {
1798 case ERROR_NO_SYSTEM_RESOURCES:
1799 pg_usleep(1000L);
1800 errno = EINTR;
1801 break;
1802 default:
1803 _dosmaperr(error);
1804 break;
1805 }
1806 #endif
1807 /* OK to retry if interrupted */
1808 if (errno == EINTR)
1809 goto retry;
1810
1811 /* Trouble, so assume we don't know the file position anymore */
1812 vfdP->seekPos = FileUnknownPos;
1813 }
1814
1815 return returnCode;
1816 }
1817
1818 int
FileSync(File file)1819 FileSync(File file)
1820 {
1821 int returnCode;
1822
1823 Assert(FileIsValid(file));
1824
1825 DO_DB(elog(LOG, "FileSync: %d (%s)",
1826 file, VfdCache[file].fileName));
1827
1828 returnCode = FileAccess(file);
1829 if (returnCode < 0)
1830 return returnCode;
1831
1832 return pg_fsync(VfdCache[file].fd);
1833 }
1834
1835 off_t
FileSeek(File file,off_t offset,int whence)1836 FileSeek(File file, off_t offset, int whence)
1837 {
1838 Vfd *vfdP;
1839
1840 Assert(FileIsValid(file));
1841
1842 DO_DB(elog(LOG, "FileSeek: %d (%s) " INT64_FORMAT " " INT64_FORMAT " %d",
1843 file, VfdCache[file].fileName,
1844 (int64) VfdCache[file].seekPos,
1845 (int64) offset, whence));
1846
1847 vfdP = &VfdCache[file];
1848
1849 if (FileIsNotOpen(file))
1850 {
1851 switch (whence)
1852 {
1853 case SEEK_SET:
1854 if (offset < 0)
1855 {
1856 errno = EINVAL;
1857 return (off_t) -1;
1858 }
1859 vfdP->seekPos = offset;
1860 break;
1861 case SEEK_CUR:
1862 if (FilePosIsUnknown(vfdP->seekPos) ||
1863 vfdP->seekPos + offset < 0)
1864 {
1865 errno = EINVAL;
1866 return (off_t) -1;
1867 }
1868 vfdP->seekPos += offset;
1869 break;
1870 case SEEK_END:
1871 if (FileAccess(file) < 0)
1872 return (off_t) -1;
1873 vfdP->seekPos = lseek(vfdP->fd, offset, whence);
1874 break;
1875 default:
1876 elog(ERROR, "invalid whence: %d", whence);
1877 break;
1878 }
1879 }
1880 else
1881 {
1882 switch (whence)
1883 {
1884 case SEEK_SET:
1885 if (offset < 0)
1886 {
1887 errno = EINVAL;
1888 return (off_t) -1;
1889 }
1890 if (vfdP->seekPos != offset)
1891 vfdP->seekPos = lseek(vfdP->fd, offset, whence);
1892 break;
1893 case SEEK_CUR:
1894 if (offset != 0 || FilePosIsUnknown(vfdP->seekPos))
1895 vfdP->seekPos = lseek(vfdP->fd, offset, whence);
1896 break;
1897 case SEEK_END:
1898 vfdP->seekPos = lseek(vfdP->fd, offset, whence);
1899 break;
1900 default:
1901 elog(ERROR, "invalid whence: %d", whence);
1902 break;
1903 }
1904 }
1905
1906 return vfdP->seekPos;
1907 }
1908
1909 /*
1910 * XXX not actually used but here for completeness
1911 */
1912 #ifdef NOT_USED
1913 off_t
FileTell(File file)1914 FileTell(File file)
1915 {
1916 Assert(FileIsValid(file));
1917 DO_DB(elog(LOG, "FileTell %d (%s)",
1918 file, VfdCache[file].fileName));
1919 return VfdCache[file].seekPos;
1920 }
1921 #endif
1922
1923 int
FileTruncate(File file,off_t offset)1924 FileTruncate(File file, off_t offset)
1925 {
1926 int returnCode;
1927
1928 Assert(FileIsValid(file));
1929
1930 DO_DB(elog(LOG, "FileTruncate %d (%s)",
1931 file, VfdCache[file].fileName));
1932
1933 returnCode = FileAccess(file);
1934 if (returnCode < 0)
1935 return returnCode;
1936
1937 returnCode = ftruncate(VfdCache[file].fd, offset);
1938
1939 if (returnCode == 0 && VfdCache[file].fileSize > offset)
1940 {
1941 /* adjust our state for truncation of a temp file */
1942 Assert(VfdCache[file].fdstate & FD_TEMPORARY);
1943 temporary_files_size -= VfdCache[file].fileSize - offset;
1944 VfdCache[file].fileSize = offset;
1945 }
1946
1947 return returnCode;
1948 }
1949
1950 /*
1951 * Return the pathname associated with an open file.
1952 *
1953 * The returned string points to an internal buffer, which is valid until
1954 * the file is closed.
1955 */
1956 char *
FilePathName(File file)1957 FilePathName(File file)
1958 {
1959 Assert(FileIsValid(file));
1960
1961 return VfdCache[file].fileName;
1962 }
1963
1964 /*
1965 * Return the raw file descriptor of an opened file.
1966 *
1967 * The returned file descriptor will be valid until the file is closed, but
1968 * there are a lot of things that can make that happen. So the caller should
1969 * be careful not to do much of anything else before it finishes using the
1970 * returned file descriptor.
1971 */
1972 int
FileGetRawDesc(File file)1973 FileGetRawDesc(File file)
1974 {
1975 Assert(FileIsValid(file));
1976 return VfdCache[file].fd;
1977 }
1978
1979 /*
1980 * FileGetRawFlags - returns the file flags on open(2)
1981 */
1982 int
FileGetRawFlags(File file)1983 FileGetRawFlags(File file)
1984 {
1985 Assert(FileIsValid(file));
1986 return VfdCache[file].fileFlags;
1987 }
1988
1989 /*
1990 * FileGetRawMode - returns the mode bitmask passed to open(2)
1991 */
1992 int
FileGetRawMode(File file)1993 FileGetRawMode(File file)
1994 {
1995 Assert(FileIsValid(file));
1996 return VfdCache[file].fileMode;
1997 }
1998
1999 /*
2000 * Make room for another allocatedDescs[] array entry if needed and possible.
2001 * Returns true if an array element is available.
2002 */
2003 static bool
reserveAllocatedDesc(void)2004 reserveAllocatedDesc(void)
2005 {
2006 AllocateDesc *newDescs;
2007 int newMax;
2008
2009 /* Quick out if array already has a free slot. */
2010 if (numAllocatedDescs < maxAllocatedDescs)
2011 return true;
2012
2013 /*
2014 * If the array hasn't yet been created in the current process, initialize
2015 * it with FD_MINFREE / 2 elements. In many scenarios this is as many as
2016 * we will ever need, anyway. We don't want to look at max_safe_fds
2017 * immediately because set_max_safe_fds() may not have run yet.
2018 */
2019 if (allocatedDescs == NULL)
2020 {
2021 newMax = FD_MINFREE / 2;
2022 newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2023 /* Out of memory already? Treat as fatal error. */
2024 if (newDescs == NULL)
2025 ereport(ERROR,
2026 (errcode(ERRCODE_OUT_OF_MEMORY),
2027 errmsg("out of memory")));
2028 allocatedDescs = newDescs;
2029 maxAllocatedDescs = newMax;
2030 return true;
2031 }
2032
2033 /*
2034 * Consider enlarging the array beyond the initial allocation used above.
2035 * By the time this happens, max_safe_fds should be known accurately.
2036 *
2037 * We mustn't let allocated descriptors hog all the available FDs, and in
2038 * practice we'd better leave a reasonable number of FDs for VFD use. So
2039 * set the maximum to max_safe_fds / 2. (This should certainly be at
2040 * least as large as the initial size, FD_MINFREE / 2.)
2041 */
2042 newMax = max_safe_fds / 2;
2043 if (newMax > maxAllocatedDescs)
2044 {
2045 newDescs = (AllocateDesc *) realloc(allocatedDescs,
2046 newMax * sizeof(AllocateDesc));
2047 /* Treat out-of-memory as a non-fatal error. */
2048 if (newDescs == NULL)
2049 return false;
2050 allocatedDescs = newDescs;
2051 maxAllocatedDescs = newMax;
2052 return true;
2053 }
2054
2055 /* Can't enlarge allocatedDescs[] any more. */
2056 return false;
2057 }
2058
2059 /*
2060 * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2061 * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2062 * necessary to open the file. When done, call FreeFile rather than fclose.
2063 *
2064 * Note that files that will be open for any significant length of time
2065 * should NOT be handled this way, since they cannot share kernel file
2066 * descriptors with other files; there is grave risk of running out of FDs
2067 * if anyone locks down too many FDs. Most callers of this routine are
2068 * simply reading a config file that they will read and close immediately.
2069 *
2070 * fd.c will automatically close all files opened with AllocateFile at
2071 * transaction commit or abort; this prevents FD leakage if a routine
2072 * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2073 *
2074 * Ideally this should be the *only* direct call of fopen() in the backend.
2075 */
2076 FILE *
AllocateFile(const char * name,const char * mode)2077 AllocateFile(const char *name, const char *mode)
2078 {
2079 FILE *file;
2080
2081 DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2082 numAllocatedDescs, name));
2083
2084 /* Can we allocate another non-virtual FD? */
2085 if (!reserveAllocatedDesc())
2086 ereport(ERROR,
2087 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2088 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2089 maxAllocatedDescs, name)));
2090
2091 /* Close excess kernel FDs. */
2092 ReleaseLruFiles();
2093
2094 TryAgain:
2095 if ((file = fopen(name, mode)) != NULL)
2096 {
2097 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2098
2099 desc->kind = AllocateDescFile;
2100 desc->desc.file = file;
2101 desc->create_subid = GetCurrentSubTransactionId();
2102 numAllocatedDescs++;
2103 return desc->desc.file;
2104 }
2105
2106 if (errno == EMFILE || errno == ENFILE)
2107 {
2108 int save_errno = errno;
2109
2110 ereport(LOG,
2111 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2112 errmsg("out of file descriptors: %m; release and retry")));
2113 errno = 0;
2114 if (ReleaseLruFile())
2115 goto TryAgain;
2116 errno = save_errno;
2117 }
2118
2119 return NULL;
2120 }
2121
2122
2123 /*
2124 * Like AllocateFile, but returns an unbuffered fd like open(2)
2125 */
2126 int
OpenTransientFile(FileName fileName,int fileFlags,int fileMode)2127 OpenTransientFile(FileName fileName, int fileFlags, int fileMode)
2128 {
2129 int fd;
2130
2131 DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2132 numAllocatedDescs, fileName));
2133
2134 /* Can we allocate another non-virtual FD? */
2135 if (!reserveAllocatedDesc())
2136 ereport(ERROR,
2137 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2138 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2139 maxAllocatedDescs, fileName)));
2140
2141 /* Close excess kernel FDs. */
2142 ReleaseLruFiles();
2143
2144 fd = BasicOpenFile(fileName, fileFlags, fileMode);
2145
2146 if (fd >= 0)
2147 {
2148 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2149
2150 desc->kind = AllocateDescRawFD;
2151 desc->desc.fd = fd;
2152 desc->create_subid = GetCurrentSubTransactionId();
2153 numAllocatedDescs++;
2154
2155 return fd;
2156 }
2157
2158 return -1; /* failure */
2159 }
2160
2161 /*
2162 * Routines that want to initiate a pipe stream should use OpenPipeStream
2163 * rather than plain popen(). This lets fd.c deal with freeing FDs if
2164 * necessary. When done, call ClosePipeStream rather than pclose.
2165 */
2166 FILE *
OpenPipeStream(const char * command,const char * mode)2167 OpenPipeStream(const char *command, const char *mode)
2168 {
2169 FILE *file;
2170
2171 DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2172 numAllocatedDescs, command));
2173
2174 /* Can we allocate another non-virtual FD? */
2175 if (!reserveAllocatedDesc())
2176 ereport(ERROR,
2177 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2178 errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2179 maxAllocatedDescs, command)));
2180
2181 /* Close excess kernel FDs. */
2182 ReleaseLruFiles();
2183
2184 TryAgain:
2185 fflush(stdout);
2186 fflush(stderr);
2187 errno = 0;
2188 if ((file = popen(command, mode)) != NULL)
2189 {
2190 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2191
2192 desc->kind = AllocateDescPipe;
2193 desc->desc.file = file;
2194 desc->create_subid = GetCurrentSubTransactionId();
2195 numAllocatedDescs++;
2196 return desc->desc.file;
2197 }
2198
2199 if (errno == EMFILE || errno == ENFILE)
2200 {
2201 int save_errno = errno;
2202
2203 ereport(LOG,
2204 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2205 errmsg("out of file descriptors: %m; release and retry")));
2206 errno = 0;
2207 if (ReleaseLruFile())
2208 goto TryAgain;
2209 errno = save_errno;
2210 }
2211
2212 return NULL;
2213 }
2214
2215 /*
2216 * Free an AllocateDesc of any type.
2217 *
2218 * The argument *must* point into the allocatedDescs[] array.
2219 */
2220 static int
FreeDesc(AllocateDesc * desc)2221 FreeDesc(AllocateDesc *desc)
2222 {
2223 int result;
2224
2225 /* Close the underlying object */
2226 switch (desc->kind)
2227 {
2228 case AllocateDescFile:
2229 result = fclose(desc->desc.file);
2230 break;
2231 case AllocateDescPipe:
2232 result = pclose(desc->desc.file);
2233 break;
2234 case AllocateDescDir:
2235 result = closedir(desc->desc.dir);
2236 break;
2237 case AllocateDescRawFD:
2238 result = close(desc->desc.fd);
2239 break;
2240 default:
2241 elog(ERROR, "AllocateDesc kind not recognized");
2242 result = 0; /* keep compiler quiet */
2243 break;
2244 }
2245
2246 /* Compact storage in the allocatedDescs array */
2247 numAllocatedDescs--;
2248 *desc = allocatedDescs[numAllocatedDescs];
2249
2250 return result;
2251 }
2252
2253 /*
2254 * Close a file returned by AllocateFile.
2255 *
2256 * Note we do not check fclose's return value --- it is up to the caller
2257 * to handle close errors.
2258 */
2259 int
FreeFile(FILE * file)2260 FreeFile(FILE *file)
2261 {
2262 int i;
2263
2264 DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2265
2266 /* Remove file from list of allocated files, if it's present */
2267 for (i = numAllocatedDescs; --i >= 0;)
2268 {
2269 AllocateDesc *desc = &allocatedDescs[i];
2270
2271 if (desc->kind == AllocateDescFile && desc->desc.file == file)
2272 return FreeDesc(desc);
2273 }
2274
2275 /* Only get here if someone passes us a file not in allocatedDescs */
2276 elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2277
2278 return fclose(file);
2279 }
2280
2281 /*
2282 * Close a file returned by OpenTransientFile.
2283 *
2284 * Note we do not check close's return value --- it is up to the caller
2285 * to handle close errors.
2286 */
2287 int
CloseTransientFile(int fd)2288 CloseTransientFile(int fd)
2289 {
2290 int i;
2291
2292 DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2293
2294 /* Remove fd from list of allocated files, if it's present */
2295 for (i = numAllocatedDescs; --i >= 0;)
2296 {
2297 AllocateDesc *desc = &allocatedDescs[i];
2298
2299 if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2300 return FreeDesc(desc);
2301 }
2302
2303 /* Only get here if someone passes us a file not in allocatedDescs */
2304 elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2305
2306 return close(fd);
2307 }
2308
2309 /*
2310 * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2311 * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2312 * necessary to open the directory, and with closing it after an elog.
2313 * When done, call FreeDir rather than closedir.
2314 *
2315 * Returns NULL, with errno set, on failure. Note that failure detection
2316 * is commonly left to the following call of ReadDir or ReadDirExtended;
2317 * see the comments for ReadDir.
2318 *
2319 * Ideally this should be the *only* direct call of opendir() in the backend.
2320 */
2321 DIR *
AllocateDir(const char * dirname)2322 AllocateDir(const char *dirname)
2323 {
2324 DIR *dir;
2325
2326 DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2327 numAllocatedDescs, dirname));
2328
2329 /* Can we allocate another non-virtual FD? */
2330 if (!reserveAllocatedDesc())
2331 ereport(ERROR,
2332 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2333 errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2334 maxAllocatedDescs, dirname)));
2335
2336 /* Close excess kernel FDs. */
2337 ReleaseLruFiles();
2338
2339 TryAgain:
2340 if ((dir = opendir(dirname)) != NULL)
2341 {
2342 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2343
2344 desc->kind = AllocateDescDir;
2345 desc->desc.dir = dir;
2346 desc->create_subid = GetCurrentSubTransactionId();
2347 numAllocatedDescs++;
2348 return desc->desc.dir;
2349 }
2350
2351 if (errno == EMFILE || errno == ENFILE)
2352 {
2353 int save_errno = errno;
2354
2355 ereport(LOG,
2356 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2357 errmsg("out of file descriptors: %m; release and retry")));
2358 errno = 0;
2359 if (ReleaseLruFile())
2360 goto TryAgain;
2361 errno = save_errno;
2362 }
2363
2364 return NULL;
2365 }
2366
2367 /*
2368 * Read a directory opened with AllocateDir, ereport'ing any error.
2369 *
2370 * This is easier to use than raw readdir() since it takes care of some
2371 * otherwise rather tedious and error-prone manipulation of errno. Also,
2372 * if you are happy with a generic error message for AllocateDir failure,
2373 * you can just do
2374 *
2375 * dir = AllocateDir(path);
2376 * while ((dirent = ReadDir(dir, path)) != NULL)
2377 * process dirent;
2378 * FreeDir(dir);
2379 *
2380 * since a NULL dir parameter is taken as indicating AllocateDir failed.
2381 * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2382 * use this shortcut.)
2383 *
2384 * The pathname passed to AllocateDir must be passed to this routine too,
2385 * but it is only used for error reporting.
2386 */
2387 struct dirent *
ReadDir(DIR * dir,const char * dirname)2388 ReadDir(DIR *dir, const char *dirname)
2389 {
2390 return ReadDirExtended(dir, dirname, ERROR);
2391 }
2392
2393 /*
2394 * Alternate version of ReadDir that allows caller to specify the elevel
2395 * for any error report (whether it's reporting an initial failure of
2396 * AllocateDir or a subsequent directory read failure).
2397 *
2398 * If elevel < ERROR, returns NULL after any error. With the normal coding
2399 * pattern, this will result in falling out of the loop immediately as
2400 * though the directory contained no (more) entries.
2401 */
2402 struct dirent *
ReadDirExtended(DIR * dir,const char * dirname,int elevel)2403 ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2404 {
2405 struct dirent *dent;
2406
2407 /* Give a generic message for AllocateDir failure, if caller didn't */
2408 if (dir == NULL)
2409 {
2410 ereport(elevel,
2411 (errcode_for_file_access(),
2412 errmsg("could not open directory \"%s\": %m",
2413 dirname)));
2414 return NULL;
2415 }
2416
2417 errno = 0;
2418 if ((dent = readdir(dir)) != NULL)
2419 return dent;
2420
2421 if (errno)
2422 ereport(elevel,
2423 (errcode_for_file_access(),
2424 errmsg("could not read directory \"%s\": %m",
2425 dirname)));
2426 return NULL;
2427 }
2428
2429 /*
2430 * Close a directory opened with AllocateDir.
2431 *
2432 * Returns closedir's return value (with errno set if it's not 0).
2433 * Note we do not check the return value --- it is up to the caller
2434 * to handle close errors if wanted.
2435 *
2436 * Does nothing if dir == NULL; we assume that directory open failure was
2437 * already reported if desired.
2438 */
2439 int
FreeDir(DIR * dir)2440 FreeDir(DIR *dir)
2441 {
2442 int i;
2443
2444 /* Nothing to do if AllocateDir failed */
2445 if (dir == NULL)
2446 return 0;
2447
2448 DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2449
2450 /* Remove dir from list of allocated dirs, if it's present */
2451 for (i = numAllocatedDescs; --i >= 0;)
2452 {
2453 AllocateDesc *desc = &allocatedDescs[i];
2454
2455 if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2456 return FreeDesc(desc);
2457 }
2458
2459 /* Only get here if someone passes us a dir not in allocatedDescs */
2460 elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
2461
2462 return closedir(dir);
2463 }
2464
2465
2466 /*
2467 * Close a pipe stream returned by OpenPipeStream.
2468 */
2469 int
ClosePipeStream(FILE * file)2470 ClosePipeStream(FILE *file)
2471 {
2472 int i;
2473
2474 DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
2475
2476 /* Remove file from list of allocated files, if it's present */
2477 for (i = numAllocatedDescs; --i >= 0;)
2478 {
2479 AllocateDesc *desc = &allocatedDescs[i];
2480
2481 if (desc->kind == AllocateDescPipe && desc->desc.file == file)
2482 return FreeDesc(desc);
2483 }
2484
2485 /* Only get here if someone passes us a file not in allocatedDescs */
2486 elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
2487
2488 return pclose(file);
2489 }
2490
2491 /*
2492 * closeAllVfds
2493 *
2494 * Force all VFDs into the physically-closed state, so that the fewest
2495 * possible number of kernel file descriptors are in use. There is no
2496 * change in the logical state of the VFDs.
2497 */
2498 void
closeAllVfds(void)2499 closeAllVfds(void)
2500 {
2501 Index i;
2502
2503 if (SizeVfdCache > 0)
2504 {
2505 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2506 for (i = 1; i < SizeVfdCache; i++)
2507 {
2508 if (!FileIsNotOpen(i))
2509 LruDelete(i);
2510 }
2511 }
2512 }
2513
2514
2515 /*
2516 * SetTempTablespaces
2517 *
2518 * Define a list (actually an array) of OIDs of tablespaces to use for
2519 * temporary files. This list will be used until end of transaction,
2520 * unless this function is called again before then. It is caller's
2521 * responsibility that the passed-in array has adequate lifespan (typically
2522 * it'd be allocated in TopTransactionContext).
2523 *
2524 * Some entries of the array may be InvalidOid, indicating that the current
2525 * database's default tablespace should be used.
2526 */
2527 void
SetTempTablespaces(Oid * tableSpaces,int numSpaces)2528 SetTempTablespaces(Oid *tableSpaces, int numSpaces)
2529 {
2530 Assert(numSpaces >= 0);
2531 tempTableSpaces = tableSpaces;
2532 numTempTableSpaces = numSpaces;
2533
2534 /*
2535 * Select a random starting point in the list. This is to minimize
2536 * conflicts between backends that are most likely sharing the same list
2537 * of temp tablespaces. Note that if we create multiple temp files in the
2538 * same transaction, we'll advance circularly through the list --- this
2539 * ensures that large temporary sort files are nicely spread across all
2540 * available tablespaces.
2541 */
2542 if (numSpaces > 1)
2543 nextTempTableSpace = random() % numSpaces;
2544 else
2545 nextTempTableSpace = 0;
2546 }
2547
2548 /*
2549 * TempTablespacesAreSet
2550 *
2551 * Returns TRUE if SetTempTablespaces has been called in current transaction.
2552 * (This is just so that tablespaces.c doesn't need its own per-transaction
2553 * state.)
2554 */
2555 bool
TempTablespacesAreSet(void)2556 TempTablespacesAreSet(void)
2557 {
2558 return (numTempTableSpaces >= 0);
2559 }
2560
2561 /*
2562 * GetNextTempTableSpace
2563 *
2564 * Select the next temp tablespace to use. A result of InvalidOid means
2565 * to use the current database's default tablespace.
2566 */
2567 Oid
GetNextTempTableSpace(void)2568 GetNextTempTableSpace(void)
2569 {
2570 if (numTempTableSpaces > 0)
2571 {
2572 /* Advance nextTempTableSpace counter with wraparound */
2573 if (++nextTempTableSpace >= numTempTableSpaces)
2574 nextTempTableSpace = 0;
2575 return tempTableSpaces[nextTempTableSpace];
2576 }
2577 return InvalidOid;
2578 }
2579
2580
2581 /*
2582 * AtEOSubXact_Files
2583 *
2584 * Take care of subtransaction commit/abort. At abort, we close temp files
2585 * that the subtransaction may have opened. At commit, we reassign the
2586 * files that were opened to the parent subtransaction.
2587 */
2588 void
AtEOSubXact_Files(bool isCommit,SubTransactionId mySubid,SubTransactionId parentSubid)2589 AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
2590 SubTransactionId parentSubid)
2591 {
2592 Index i;
2593
2594 for (i = 0; i < numAllocatedDescs; i++)
2595 {
2596 if (allocatedDescs[i].create_subid == mySubid)
2597 {
2598 if (isCommit)
2599 allocatedDescs[i].create_subid = parentSubid;
2600 else
2601 {
2602 /* have to recheck the item after FreeDesc (ugly) */
2603 FreeDesc(&allocatedDescs[i--]);
2604 }
2605 }
2606 }
2607 }
2608
2609 /*
2610 * AtEOXact_Files
2611 *
2612 * This routine is called during transaction commit or abort (it doesn't
2613 * particularly care which). All still-open per-transaction temporary file
2614 * VFDs are closed, which also causes the underlying files to be deleted
2615 * (although they should've been closed already by the ResourceOwner
2616 * cleanup). Furthermore, all "allocated" stdio files are closed. We also
2617 * forget any transaction-local temp tablespace list.
2618 */
2619 void
AtEOXact_Files(void)2620 AtEOXact_Files(void)
2621 {
2622 CleanupTempFiles(false);
2623 tempTableSpaces = NULL;
2624 numTempTableSpaces = -1;
2625 }
2626
2627 /*
2628 * AtProcExit_Files
2629 *
2630 * on_proc_exit hook to clean up temp files during backend shutdown.
2631 * Here, we want to clean up *all* temp files including interXact ones.
2632 */
2633 static void
AtProcExit_Files(int code,Datum arg)2634 AtProcExit_Files(int code, Datum arg)
2635 {
2636 CleanupTempFiles(true);
2637 }
2638
2639 /*
2640 * Close temporary files and delete their underlying files.
2641 *
2642 * isProcExit: if true, this is being called as the backend process is
2643 * exiting. If that's the case, we should remove all temporary files; if
2644 * that's not the case, we are being called for transaction commit/abort
2645 * and should only remove transaction-local temp files. In either case,
2646 * also clean up "allocated" stdio files, dirs and fds.
2647 */
2648 static void
CleanupTempFiles(bool isProcExit)2649 CleanupTempFiles(bool isProcExit)
2650 {
2651 Index i;
2652
2653 /*
2654 * Careful here: at proc_exit we need extra cleanup, not just
2655 * xact_temporary files.
2656 */
2657 if (isProcExit || have_xact_temporary_files)
2658 {
2659 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2660 for (i = 1; i < SizeVfdCache; i++)
2661 {
2662 unsigned short fdstate = VfdCache[i].fdstate;
2663
2664 if ((fdstate & FD_TEMPORARY) && VfdCache[i].fileName != NULL)
2665 {
2666 /*
2667 * If we're in the process of exiting a backend process, close
2668 * all temporary files. Otherwise, only close temporary files
2669 * local to the current transaction. They should be closed by
2670 * the ResourceOwner mechanism already, so this is just a
2671 * debugging cross-check.
2672 */
2673 if (isProcExit)
2674 FileClose(i);
2675 else if (fdstate & FD_XACT_TEMPORARY)
2676 {
2677 elog(WARNING,
2678 "temporary file %s not closed at end-of-transaction",
2679 VfdCache[i].fileName);
2680 FileClose(i);
2681 }
2682 }
2683 }
2684
2685 have_xact_temporary_files = false;
2686 }
2687
2688 /* Clean up "allocated" stdio files, dirs and fds. */
2689 while (numAllocatedDescs > 0)
2690 FreeDesc(&allocatedDescs[0]);
2691 }
2692
2693
2694 /*
2695 * Remove temporary and temporary relation files left over from a prior
2696 * postmaster session
2697 *
2698 * This should be called during postmaster startup. It will forcibly
2699 * remove any leftover files created by OpenTemporaryFile and any leftover
2700 * temporary relation files created by mdcreate.
2701 *
2702 * NOTE: we could, but don't, call this during a post-backend-crash restart
2703 * cycle. The argument for not doing it is that someone might want to examine
2704 * the temp files for debugging purposes. This does however mean that
2705 * OpenTemporaryFile had better allow for collision with an existing temp
2706 * file name.
2707 */
2708 void
RemovePgTempFiles(void)2709 RemovePgTempFiles(void)
2710 {
2711 char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
2712 DIR *spc_dir;
2713 struct dirent *spc_de;
2714
2715 /*
2716 * First process temp files in pg_default ($PGDATA/base)
2717 */
2718 snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
2719 RemovePgTempFilesInDir(temp_path);
2720 RemovePgTempRelationFiles("base");
2721
2722 /*
2723 * Cycle through temp directories for all non-default tablespaces.
2724 */
2725 spc_dir = AllocateDir("pg_tblspc");
2726
2727 while ((spc_de = ReadDir(spc_dir, "pg_tblspc")) != NULL)
2728 {
2729 if (strcmp(spc_de->d_name, ".") == 0 ||
2730 strcmp(spc_de->d_name, "..") == 0)
2731 continue;
2732
2733 snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
2734 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
2735 RemovePgTempFilesInDir(temp_path);
2736
2737 snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
2738 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
2739 RemovePgTempRelationFiles(temp_path);
2740 }
2741
2742 FreeDir(spc_dir);
2743
2744 /*
2745 * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
2746 * DataDir as well.
2747 */
2748 #ifdef EXEC_BACKEND
2749 RemovePgTempFilesInDir(PG_TEMP_FILES_DIR);
2750 #endif
2751 }
2752
2753 /* Process one pgsql_tmp directory for RemovePgTempFiles */
2754 static void
RemovePgTempFilesInDir(const char * tmpdirname)2755 RemovePgTempFilesInDir(const char *tmpdirname)
2756 {
2757 DIR *temp_dir;
2758 struct dirent *temp_de;
2759 char rm_path[MAXPGPATH * 2];
2760
2761 temp_dir = AllocateDir(tmpdirname);
2762 if (temp_dir == NULL)
2763 {
2764 /* anything except ENOENT is fishy */
2765 if (errno != ENOENT)
2766 elog(LOG,
2767 "could not open temporary-files directory \"%s\": %m",
2768 tmpdirname);
2769 return;
2770 }
2771
2772 while ((temp_de = ReadDir(temp_dir, tmpdirname)) != NULL)
2773 {
2774 if (strcmp(temp_de->d_name, ".") == 0 ||
2775 strcmp(temp_de->d_name, "..") == 0)
2776 continue;
2777
2778 snprintf(rm_path, sizeof(rm_path), "%s/%s",
2779 tmpdirname, temp_de->d_name);
2780
2781 if (strncmp(temp_de->d_name,
2782 PG_TEMP_FILE_PREFIX,
2783 strlen(PG_TEMP_FILE_PREFIX)) == 0)
2784 unlink(rm_path); /* note we ignore any error */
2785 else
2786 elog(LOG,
2787 "unexpected file found in temporary-files directory: \"%s\"",
2788 rm_path);
2789 }
2790
2791 FreeDir(temp_dir);
2792 }
2793
2794 /* Process one tablespace directory, look for per-DB subdirectories */
2795 static void
RemovePgTempRelationFiles(const char * tsdirname)2796 RemovePgTempRelationFiles(const char *tsdirname)
2797 {
2798 DIR *ts_dir;
2799 struct dirent *de;
2800 char dbspace_path[MAXPGPATH * 2];
2801
2802 ts_dir = AllocateDir(tsdirname);
2803 if (ts_dir == NULL)
2804 {
2805 /* anything except ENOENT is fishy */
2806 if (errno != ENOENT)
2807 elog(LOG,
2808 "could not open tablespace directory \"%s\": %m",
2809 tsdirname);
2810 return;
2811 }
2812
2813 while ((de = ReadDir(ts_dir, tsdirname)) != NULL)
2814 {
2815 int i = 0;
2816
2817 /*
2818 * We're only interested in the per-database directories, which have
2819 * numeric names. Note that this code will also (properly) ignore "."
2820 * and "..".
2821 */
2822 while (isdigit((unsigned char) de->d_name[i]))
2823 ++i;
2824 if (de->d_name[i] != '\0' || i == 0)
2825 continue;
2826
2827 snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
2828 tsdirname, de->d_name);
2829 RemovePgTempRelationFilesInDbspace(dbspace_path);
2830 }
2831
2832 FreeDir(ts_dir);
2833 }
2834
2835 /* Process one per-dbspace directory for RemovePgTempRelationFiles */
2836 static void
RemovePgTempRelationFilesInDbspace(const char * dbspacedirname)2837 RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
2838 {
2839 DIR *dbspace_dir;
2840 struct dirent *de;
2841 char rm_path[MAXPGPATH * 2];
2842
2843 dbspace_dir = AllocateDir(dbspacedirname);
2844 if (dbspace_dir == NULL)
2845 {
2846 /* we just saw this directory, so it really ought to be there */
2847 elog(LOG,
2848 "could not open dbspace directory \"%s\": %m",
2849 dbspacedirname);
2850 return;
2851 }
2852
2853 while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
2854 {
2855 if (!looks_like_temp_rel_name(de->d_name))
2856 continue;
2857
2858 snprintf(rm_path, sizeof(rm_path), "%s/%s",
2859 dbspacedirname, de->d_name);
2860
2861 unlink(rm_path); /* note we ignore any error */
2862 }
2863
2864 FreeDir(dbspace_dir);
2865 }
2866
/*
 * looks_like_temp_rel_name --- does this file name have the shape of a
 * temporary relation file?
 *
 * The accepted shape is t<digits>_<digits>, optionally followed by
 * _<forkname>, optionally followed by .<digits> (a segment number).
 */
static bool
looks_like_temp_rel_name(const char *name)
{
	const char *cur = name;
	const char *run;

	/* Leading "t". */
	if (*cur != 't')
		return false;
	cur++;

	/* A non-empty run of digits, terminated by an underscore. */
	run = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == run || *cur != '_')
		return false;
	cur++;

	/* A second non-empty run of digits. */
	run = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == run)
		return false;

	/* Optional "_forkname"; forkname_chars tells us how long the name is. */
	if (*cur == '_')
	{
		int			forklen = forkname_chars(cur + 1, NULL);

		if (forklen <= 0)
			return false;
		cur += forklen + 1;
	}

	/* Optional ".segment": a dot followed by at least one digit. */
	if (*cur == '.')
	{
		run = cur + 1;
		cur = run;
		while (isdigit((unsigned char) *cur))
			cur++;
		if (cur == run)
			return false;
	}

	/* Nothing may follow. */
	return (*cur == '\0');
}
2915
2916
2917 /*
2918 * Issue fsync recursively on PGDATA and all its contents.
2919 *
2920 * We fsync regular files and directories wherever they are, but we
2921 * follow symlinks only for pg_xlog and immediately under pg_tblspc.
2922 * Other symlinks are presumed to point at files we're not responsible
2923 * for fsyncing, and might not have privileges to write at all.
2924 *
2925 * Errors are logged but not considered fatal; that's because this is used
2926 * only during database startup, to deal with the possibility that there are
2927 * issued-but-unsynced writes pending against the data directory. We want to
2928 * ensure that such writes reach disk before anything that's done in the new
2929 * run. However, aborting on error would result in failure to start for
2930 * harmless cases such as read-only files in the data directory, and that's
2931 * not good either.
2932 *
2933 * Note that if we previously crashed due to a PANIC on fsync(), we'll be
2934 * rewriting all changes again during recovery.
2935 *
2936 * Note we assume we're chdir'd into PGDATA to begin with.
2937 */
2938 void
SyncDataDirectory(void)2939 SyncDataDirectory(void)
2940 {
2941 bool xlog_is_symlink;
2942
2943 /* We can skip this whole thing if fsync is disabled. */
2944 if (!enableFsync)
2945 return;
2946
2947 /*
2948 * If pg_xlog is a symlink, we'll need to recurse into it separately,
2949 * because the first walkdir below will ignore it.
2950 */
2951 xlog_is_symlink = false;
2952
2953 #ifndef WIN32
2954 {
2955 struct stat st;
2956
2957 if (lstat("pg_xlog", &st) < 0)
2958 ereport(LOG,
2959 (errcode_for_file_access(),
2960 errmsg("could not stat file \"%s\": %m",
2961 "pg_xlog")));
2962 else if (S_ISLNK(st.st_mode))
2963 xlog_is_symlink = true;
2964 }
2965 #else
2966 if (pgwin32_is_junction("pg_xlog"))
2967 xlog_is_symlink = true;
2968 #endif
2969
2970 /*
2971 * If possible, hint to the kernel that we're soon going to fsync the data
2972 * directory and its contents. Errors in this step are even less
2973 * interesting than normal, so log them only at DEBUG1.
2974 */
2975 #ifdef PG_FLUSH_DATA_WORKS
2976 walkdir(".", pre_sync_fname, false, DEBUG1);
2977 if (xlog_is_symlink)
2978 walkdir("pg_xlog", pre_sync_fname, false, DEBUG1);
2979 walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
2980 #endif
2981
2982 /*
2983 * Now we do the fsync()s in the same order.
2984 *
2985 * The main call ignores symlinks, so in addition to specially processing
2986 * pg_xlog if it's a symlink, pg_tblspc has to be visited separately with
2987 * process_symlinks = true. Note that if there are any plain directories
2988 * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
2989 * so we don't worry about optimizing it.
2990 */
2991 walkdir(".", datadir_fsync_fname, false, LOG);
2992 if (xlog_is_symlink)
2993 walkdir("pg_xlog", datadir_fsync_fname, false, LOG);
2994 walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
2995 }
2996
2997 /*
2998 * walkdir: recursively walk a directory, applying the action to each
2999 * regular file and directory (including the named directory itself).
3000 *
3001 * If process_symlinks is true, the action and recursion are also applied
3002 * to regular files and directories that are pointed to by symlinks in the
3003 * given directory; otherwise symlinks are ignored. Symlinks are always
3004 * ignored in subdirectories, ie we intentionally don't pass down the
3005 * process_symlinks flag to recursive calls.
3006 *
3007 * Errors are reported at level elevel, which might be ERROR or less.
3008 *
3009 * See also walkdir in initdb.c, which is a frontend version of this logic.
3010 */
3011 static void
walkdir(const char * path,void (* action)(const char * fname,bool isdir,int elevel),bool process_symlinks,int elevel)3012 walkdir(const char *path,
3013 void (*action) (const char *fname, bool isdir, int elevel),
3014 bool process_symlinks,
3015 int elevel)
3016 {
3017 DIR *dir;
3018 struct dirent *de;
3019
3020 dir = AllocateDir(path);
3021 if (dir == NULL)
3022 {
3023 ereport(elevel,
3024 (errcode_for_file_access(),
3025 errmsg("could not open directory \"%s\": %m", path)));
3026 return;
3027 }
3028
3029 while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3030 {
3031 char subpath[MAXPGPATH * 2];
3032 struct stat fst;
3033 int sret;
3034
3035 CHECK_FOR_INTERRUPTS();
3036
3037 if (strcmp(de->d_name, ".") == 0 ||
3038 strcmp(de->d_name, "..") == 0)
3039 continue;
3040
3041 snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3042
3043 if (process_symlinks)
3044 sret = stat(subpath, &fst);
3045 else
3046 sret = lstat(subpath, &fst);
3047
3048 if (sret < 0)
3049 {
3050 ereport(elevel,
3051 (errcode_for_file_access(),
3052 errmsg("could not stat file \"%s\": %m", subpath)));
3053 continue;
3054 }
3055
3056 if (S_ISREG(fst.st_mode))
3057 (*action) (subpath, false, elevel);
3058 else if (S_ISDIR(fst.st_mode))
3059 walkdir(subpath, action, false, elevel);
3060 }
3061
3062 FreeDir(dir); /* we ignore any error here */
3063
3064 /*
3065 * It's important to fsync the destination directory itself as individual
3066 * file fsyncs don't guarantee that the directory entry for the file is
3067 * synced.
3068 */
3069 (*action) (path, true, elevel);
3070 }
3071
3072
/*
 * pre_sync_fname --- hint to the OS that it should get ready to fsync()
 * this file.
 *
 * Failure to open an unreadable file (EACCES) is ignored; other open
 * failures are logged at the caller-specified level.  Directories are
 * skipped entirely.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			rawfd;

	/* Don't try to flush directories, it'll likely just fail */
	if (isdir)
		return;

	rawfd = OpenTransientFile((char *) fname, O_RDONLY | PG_BINARY, 0);
	if (rawfd < 0)
	{
		/* Stay quiet about files we are not allowed to read. */
		if (errno != EACCES)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * pg_flush_data() ignores errors, which is ok because this is only a
	 * hint.
	 */
	pg_flush_data(rawfd, 0, 0);

	(void) CloseTransientFile(rawfd);
}

#endif   /* PG_FLUSH_DATA_WORKS */
3112
/*
 * datadir_fsync_fname --- walkdir action that fsyncs one file or directory
 * of the data directory.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	/*
	 * We want to silently ignore errors about unreadable files, so ask
	 * fsync_fname_ext() for that by passing ignore_perm = true.  Its result
	 * is deliberately discarded.
	 */
	(void) fsync_fname_ext(fname, isdir, true, elevel);
}
3122
3123 /*
3124 * fsync_fname_ext -- Try to fsync a file or directory
3125 *
3126 * If ignore_perm is true, ignore errors upon trying to open unreadable
3127 * files. Logs other errors at a caller-specified level.
3128 *
3129 * Returns 0 if the operation succeeded, -1 otherwise.
3130 */
3131 static int
fsync_fname_ext(const char * fname,bool isdir,bool ignore_perm,int elevel)3132 fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3133 {
3134 int fd;
3135 int flags;
3136 int returncode;
3137
3138 /*
3139 * Some OSs require directories to be opened read-only whereas other
3140 * systems don't allow us to fsync files opened read-only; so we need both
3141 * cases here. Using O_RDWR will cause us to fail to fsync files that are
3142 * not writable by our userid, but we assume that's OK.
3143 */
3144 flags = PG_BINARY;
3145 if (!isdir)
3146 flags |= O_RDWR;
3147 else
3148 flags |= O_RDONLY;
3149
3150 fd = OpenTransientFile((char *) fname, flags, 0);
3151
3152 /*
3153 * Some OSs don't allow us to open directories at all (Windows returns
3154 * EACCES), just ignore the error in that case. If desired also silently
3155 * ignoring errors about unreadable files. Log others.
3156 */
3157 if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3158 return 0;
3159 else if (fd < 0 && ignore_perm && errno == EACCES)
3160 return 0;
3161 else if (fd < 0)
3162 {
3163 ereport(elevel,
3164 (errcode_for_file_access(),
3165 errmsg("could not open file \"%s\": %m", fname)));
3166 return -1;
3167 }
3168
3169 returncode = pg_fsync(fd);
3170
3171 /*
3172 * Some OSes don't allow us to fsync directories at all, so we can ignore
3173 * those errors. Anything else needs to be logged.
3174 */
3175 if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3176 {
3177 int save_errno;
3178
3179 /* close file upon error, might not be in transaction context */
3180 save_errno = errno;
3181 (void) CloseTransientFile(fd);
3182 errno = save_errno;
3183
3184 ereport(elevel,
3185 (errcode_for_file_access(),
3186 errmsg("could not fsync file \"%s\": %m", fname)));
3187 return -1;
3188 }
3189
3190 (void) CloseTransientFile(fd);
3191
3192 return 0;
3193 }
3194
3195 /*
3196 * fsync_parent_path -- fsync the parent path of a file or directory
3197 *
3198 * This is aimed at making file operations persistent on disk in case of
3199 * an OS crash or power failure.
3200 */
3201 static int
fsync_parent_path(const char * fname,int elevel)3202 fsync_parent_path(const char *fname, int elevel)
3203 {
3204 char parentpath[MAXPGPATH];
3205
3206 strlcpy(parentpath, fname, MAXPGPATH);
3207 get_parent_directory(parentpath);
3208
3209 /*
3210 * get_parent_directory() returns an empty string if the input argument is
3211 * just a file name (see comments in path.c), so handle that as being the
3212 * current directory.
3213 */
3214 if (strlen(parentpath) == 0)
3215 strlcpy(parentpath, ".", MAXPGPATH);
3216
3217 if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3218 return -1;
3219
3220 return 0;
3221 }
3222
3223 /*
3224 * Return the passed-in error level, or PANIC if data_sync_retry is off.
3225 *
3226 * Failure to fsync any data file is cause for immediate panic, unless
3227 * data_sync_retry is enabled. Data may have been written to the operating
3228 * system and removed from our buffer pool already, and if we are running on
3229 * an operating system that forgets dirty data on write-back failure, there
3230 * may be only one copy of the data remaining: in the WAL. A later attempt to
3231 * fsync again might falsely report success. Therefore we must not allow any
3232 * further checkpoints to be attempted. data_sync_retry can in theory be
3233 * enabled on systems known not to drop dirty buffered data on write-back
3234 * failure (with the likely outcome that checkpoints will continue to fail
3235 * until the underlying problem is fixed).
3236 *
3237 * Any code that reports a failure from fsync() or related functions should
3238 * filter the error level with this function.
3239 */
3240 int
data_sync_elevel(int elevel)3241 data_sync_elevel(int elevel)
3242 {
3243 return data_sync_retry ? elevel : PANIC;
3244 }
3245