1 /*-------------------------------------------------------------------------
2 *
3 * File-processing utility routines.
4 *
5 * Assorted utility functions to work on files.
6 *
7 *
8 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
9 * Portions Copyright (c) 1994, Regents of the University of California
10 *
11 * src/common/file_utils.c
12 *
13 *-------------------------------------------------------------------------
14 */
15 #include "postgres_fe.h"
16
17 #include <dirent.h>
18 #include <fcntl.h>
19 #include <sys/stat.h>
20 #include <unistd.h>
21
22 #include "common/file_utils.h"
23 #include "common/logging.h"
24
25
/*
 * PG_FLUSH_DATA_WORKS is defined whenever we have some way to implement
 * pg_flush_data: either sync_file_range(), or posix_fadvise() together with
 * POSIX_FADV_DONTNEED.
 */
#if defined(HAVE_SYNC_FILE_RANGE) || \
	(defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED))
#define PG_FLUSH_DATA_WORKS 1
#endif

/*
 * Version 10 renamed pg_xlog to pg_wal; servers with a version number below
 * this one still use the old directory name.
 */
#define MINIMUM_VERSION_FOR_PG_WAL	100000

/* Forward declarations of file-local helpers. */
#ifdef PG_FLUSH_DATA_WORKS
static int	pre_sync_fname(const char *fname, bool isdir);
#endif
static void walkdir(const char *path,
					int (*action) (const char *fname, bool isdir),
					bool process_symlinks);
44
45 /*
46 * Issue fsync recursively on PGDATA and all its contents.
47 *
48 * We fsync regular files and directories wherever they are, but we follow
49 * symlinks only for pg_wal (or pg_xlog) and immediately under pg_tblspc.
50 * Other symlinks are presumed to point at files we're not responsible for
51 * fsyncing, and might not have privileges to write at all.
52 *
53 * serverVersion indicates the version of the server to be fsync'd.
54 */
55 void
fsync_pgdata(const char * pg_data,int serverVersion)56 fsync_pgdata(const char *pg_data,
57 int serverVersion)
58 {
59 bool xlog_is_symlink;
60 char pg_wal[MAXPGPATH];
61 char pg_tblspc[MAXPGPATH];
62
63 /* handle renaming of pg_xlog to pg_wal in post-10 clusters */
64 snprintf(pg_wal, MAXPGPATH, "%s/%s", pg_data,
65 serverVersion < MINIMUM_VERSION_FOR_PG_WAL ? "pg_xlog" : "pg_wal");
66 snprintf(pg_tblspc, MAXPGPATH, "%s/pg_tblspc", pg_data);
67
68 /*
69 * If pg_wal is a symlink, we'll need to recurse into it separately,
70 * because the first walkdir below will ignore it.
71 */
72 xlog_is_symlink = false;
73
74 #ifndef WIN32
75 {
76 struct stat st;
77
78 if (lstat(pg_wal, &st) < 0)
79 pg_log_error("could not stat file \"%s\": %m", pg_wal);
80 else if (S_ISLNK(st.st_mode))
81 xlog_is_symlink = true;
82 }
83 #else
84 if (pgwin32_is_junction(pg_wal))
85 xlog_is_symlink = true;
86 #endif
87
88 /*
89 * If possible, hint to the kernel that we're soon going to fsync the data
90 * directory and its contents.
91 */
92 #ifdef PG_FLUSH_DATA_WORKS
93 walkdir(pg_data, pre_sync_fname, false);
94 if (xlog_is_symlink)
95 walkdir(pg_wal, pre_sync_fname, false);
96 walkdir(pg_tblspc, pre_sync_fname, true);
97 #endif
98
99 /*
100 * Now we do the fsync()s in the same order.
101 *
102 * The main call ignores symlinks, so in addition to specially processing
103 * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
104 * process_symlinks = true. Note that if there are any plain directories
105 * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
106 * so we don't worry about optimizing it.
107 */
108 walkdir(pg_data, fsync_fname, false);
109 if (xlog_is_symlink)
110 walkdir(pg_wal, fsync_fname, false);
111 walkdir(pg_tblspc, fsync_fname, true);
112 }
113
114 /*
115 * Issue fsync recursively on the given directory and all its contents.
116 *
117 * This is a convenient wrapper on top of walkdir().
118 */
119 void
fsync_dir_recurse(const char * dir)120 fsync_dir_recurse(const char *dir)
121 {
122 /*
123 * If possible, hint to the kernel that we're soon going to fsync the data
124 * directory and its contents.
125 */
126 #ifdef PG_FLUSH_DATA_WORKS
127 walkdir(dir, pre_sync_fname, false);
128 #endif
129
130 walkdir(dir, fsync_fname, false);
131 }
132
133 /*
134 * walkdir: recursively walk a directory, applying the action to each
135 * regular file and directory (including the named directory itself).
136 *
137 * If process_symlinks is true, the action and recursion are also applied
138 * to regular files and directories that are pointed to by symlinks in the
139 * given directory; otherwise symlinks are ignored. Symlinks are always
140 * ignored in subdirectories, ie we intentionally don't pass down the
141 * process_symlinks flag to recursive calls.
142 *
143 * Errors are reported but not considered fatal.
144 *
145 * See also walkdir in fd.c, which is a backend version of this logic.
146 */
147 static void
walkdir(const char * path,int (* action)(const char * fname,bool isdir),bool process_symlinks)148 walkdir(const char *path,
149 int (*action) (const char *fname, bool isdir),
150 bool process_symlinks)
151 {
152 DIR *dir;
153 struct dirent *de;
154
155 dir = opendir(path);
156 if (dir == NULL)
157 {
158 pg_log_error("could not open directory \"%s\": %m", path);
159 return;
160 }
161
162 while (errno = 0, (de = readdir(dir)) != NULL)
163 {
164 char subpath[MAXPGPATH * 2];
165 struct stat fst;
166 int sret;
167
168 if (strcmp(de->d_name, ".") == 0 ||
169 strcmp(de->d_name, "..") == 0)
170 continue;
171
172 snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
173
174 if (process_symlinks)
175 sret = stat(subpath, &fst);
176 else
177 sret = lstat(subpath, &fst);
178
179 if (sret < 0)
180 {
181 pg_log_error("could not stat file \"%s\": %m", subpath);
182 continue;
183 }
184
185 if (S_ISREG(fst.st_mode))
186 (*action) (subpath, false);
187 else if (S_ISDIR(fst.st_mode))
188 walkdir(subpath, action, false);
189 }
190
191 if (errno)
192 pg_log_error("could not read directory \"%s\": %m", path);
193
194 (void) closedir(dir);
195
196 /*
197 * It's important to fsync the destination directory itself as individual
198 * file fsyncs don't guarantee that the directory entry for the file is
199 * synced. Recent versions of ext4 have made the window much wider but
200 * it's been an issue for ext3 and other filesystems in the past.
201 */
202 (*action) (path, true);
203 }
204
/*
 * pre_sync_fname -- hint to the OS that this file is about to be fsync()'d.
 *
 * A file that cannot be opened because of EACCES (or, for a directory,
 * EISDIR) is silently skipped and 0 is returned.  Any other open failure is
 * reported non-fatally and -1 is returned.  Errors from the hint call
 * itself are ignored.
 */
#ifdef PG_FLUSH_DATA_WORKS

static int
pre_sync_fname(const char *fname, bool isdir)
{
	int			fd = open(fname, O_RDONLY | PG_BINARY, 0);

	if (fd < 0)
	{
		bool		ignorable = (errno == EACCES) || (isdir && errno == EISDIR);

		if (!ignorable)
		{
			pg_log_error("could not open file \"%s\": %m", fname);
			return -1;
		}
		return 0;
	}

	/*
	 * Mirror what pg_flush_data() would do in the backend: sync_file_range
	 * is preferred, with posix_fadvise as the fallback.  This is only a
	 * hint, so the result is thrown away.
	 */
#if defined(HAVE_SYNC_FILE_RANGE)
	(void) sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE);
#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
	(void) posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
#else
#error PG_FLUSH_DATA_WORKS should not have been defined
#endif

	(void) close(fd);
	return 0;
}

#endif							/* PG_FLUSH_DATA_WORKS */
246
247 /*
248 * fsync_fname -- Try to fsync a file or directory
249 *
250 * Ignores errors trying to open unreadable files, or trying to fsync
251 * directories on systems where that isn't allowed/required. All other errors
252 * are fatal.
253 */
254 int
fsync_fname(const char * fname,bool isdir)255 fsync_fname(const char *fname, bool isdir)
256 {
257 int fd;
258 int flags;
259 int returncode;
260
261 /*
262 * Some OSs require directories to be opened read-only whereas other
263 * systems don't allow us to fsync files opened read-only; so we need both
264 * cases here. Using O_RDWR will cause us to fail to fsync files that are
265 * not writable by our userid, but we assume that's OK.
266 */
267 flags = PG_BINARY;
268 if (!isdir)
269 flags |= O_RDWR;
270 else
271 flags |= O_RDONLY;
272
273 /*
274 * Open the file, silently ignoring errors about unreadable files (or
275 * unsupported operations, e.g. opening a directory under Windows), and
276 * logging others.
277 */
278 fd = open(fname, flags, 0);
279 if (fd < 0)
280 {
281 if (errno == EACCES || (isdir && errno == EISDIR))
282 return 0;
283 pg_log_error("could not open file \"%s\": %m", fname);
284 return -1;
285 }
286
287 returncode = fsync(fd);
288
289 /*
290 * Some OSes don't allow us to fsync directories at all, so we can ignore
291 * those errors. Anything else needs to be reported.
292 */
293 if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
294 {
295 pg_log_fatal("could not fsync file \"%s\": %m", fname);
296 (void) close(fd);
297 exit(EXIT_FAILURE);
298 }
299
300 (void) close(fd);
301 return 0;
302 }
303
304 /*
305 * fsync_parent_path -- fsync the parent path of a file or directory
306 *
307 * This is aimed at making file operations persistent on disk in case of
308 * an OS crash or power failure.
309 */
310 int
fsync_parent_path(const char * fname)311 fsync_parent_path(const char *fname)
312 {
313 char parentpath[MAXPGPATH];
314
315 strlcpy(parentpath, fname, MAXPGPATH);
316 get_parent_directory(parentpath);
317
318 /*
319 * get_parent_directory() returns an empty string if the input argument is
320 * just a file name (see comments in path.c), so handle that as being the
321 * current directory.
322 */
323 if (strlen(parentpath) == 0)
324 strlcpy(parentpath, ".", MAXPGPATH);
325
326 if (fsync_fname(parentpath, true) != 0)
327 return -1;
328
329 return 0;
330 }
331
332 /*
333 * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
334 *
335 * Wrapper around rename, similar to the backend version.
336 */
337 int
durable_rename(const char * oldfile,const char * newfile)338 durable_rename(const char *oldfile, const char *newfile)
339 {
340 int fd;
341
342 /*
343 * First fsync the old and target path (if it exists), to ensure that they
344 * are properly persistent on disk. Syncing the target file is not
345 * strictly necessary, but it makes it easier to reason about crashes;
346 * because it's then guaranteed that either source or target file exists
347 * after a crash.
348 */
349 if (fsync_fname(oldfile, false) != 0)
350 return -1;
351
352 fd = open(newfile, PG_BINARY | O_RDWR, 0);
353 if (fd < 0)
354 {
355 if (errno != ENOENT)
356 {
357 pg_log_error("could not open file \"%s\": %m", newfile);
358 return -1;
359 }
360 }
361 else
362 {
363 if (fsync(fd) != 0)
364 {
365 pg_log_fatal("could not fsync file \"%s\": %m", newfile);
366 close(fd);
367 exit(EXIT_FAILURE);
368 }
369 close(fd);
370 }
371
372 /* Time to do the real deal... */
373 if (rename(oldfile, newfile) != 0)
374 {
375 pg_log_error("could not rename file \"%s\" to \"%s\": %m",
376 oldfile, newfile);
377 return -1;
378 }
379
380 /*
381 * To guarantee renaming the file is persistent, fsync the file with its
382 * new name, and its containing directory.
383 */
384 if (fsync_fname(newfile, false) != 0)
385 return -1;
386
387 if (fsync_parent_path(newfile) != 0)
388 return -1;
389
390 return 0;
391 }
392