1 /*-------------------------------------------------------------------------
2  *
3  * File-processing utility routines.
4  *
5  * Assorted utility functions to work on files.
6  *
7  *
8  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
9  * Portions Copyright (c) 1994, Regents of the University of California
10  *
11  * src/common/file_utils.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres_fe.h"
16 
17 #include <dirent.h>
18 #include <fcntl.h>
19 #include <sys/stat.h>
20 #include <unistd.h>
21 
22 #include "common/file_utils.h"
23 #include "common/logging.h"
24 
25 
/*
 * PG_FLUSH_DATA_WORKS is defined when one of the primitives used by
 * pre_sync_fname() is available: either sync_file_range(), or
 * posix_fadvise() together with POSIX_FADV_DONTNEED.
 */
#if defined(HAVE_SYNC_FILE_RANGE) || \
	(defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED))
#define PG_FLUSH_DATA_WORKS 1
#endif

/*
 * Lowest server version number whose WAL directory is called pg_wal; older
 * servers use pg_xlog (the rename happened in version 10).
 */
#define MINIMUM_VERSION_FOR_PG_WAL	100000

/* Forward declarations of the file-local helpers defined further down. */
#ifdef PG_FLUSH_DATA_WORKS
static int	pre_sync_fname(const char *fname, bool isdir);
#endif
static void walkdir(const char *path,
					int (*action) (const char *fname, bool isdir),
					bool process_symlinks);
44 
45 /*
46  * Issue fsync recursively on PGDATA and all its contents.
47  *
48  * We fsync regular files and directories wherever they are, but we follow
49  * symlinks only for pg_wal (or pg_xlog) and immediately under pg_tblspc.
50  * Other symlinks are presumed to point at files we're not responsible for
51  * fsyncing, and might not have privileges to write at all.
52  *
53  * serverVersion indicates the version of the server to be fsync'd.
54  *
55  * Errors are reported but not considered fatal.
56  */
57 void
fsync_pgdata(const char * pg_data,int serverVersion)58 fsync_pgdata(const char *pg_data,
59 			 int serverVersion)
60 {
61 	bool		xlog_is_symlink;
62 	char		pg_wal[MAXPGPATH];
63 	char		pg_tblspc[MAXPGPATH];
64 
65 	/* handle renaming of pg_xlog to pg_wal in post-10 clusters */
66 	snprintf(pg_wal, MAXPGPATH, "%s/%s", pg_data,
67 			 serverVersion < MINIMUM_VERSION_FOR_PG_WAL ? "pg_xlog" : "pg_wal");
68 	snprintf(pg_tblspc, MAXPGPATH, "%s/pg_tblspc", pg_data);
69 
70 	/*
71 	 * If pg_wal is a symlink, we'll need to recurse into it separately,
72 	 * because the first walkdir below will ignore it.
73 	 */
74 	xlog_is_symlink = false;
75 
76 #ifndef WIN32
77 	{
78 		struct stat st;
79 
80 		if (lstat(pg_wal, &st) < 0)
81 			pg_log_error("could not stat file \"%s\": %m", pg_wal);
82 		else if (S_ISLNK(st.st_mode))
83 			xlog_is_symlink = true;
84 	}
85 #else
86 	if (pgwin32_is_junction(pg_wal))
87 		xlog_is_symlink = true;
88 #endif
89 
90 	/*
91 	 * If possible, hint to the kernel that we're soon going to fsync the data
92 	 * directory and its contents.
93 	 */
94 #ifdef PG_FLUSH_DATA_WORKS
95 	walkdir(pg_data, pre_sync_fname, false);
96 	if (xlog_is_symlink)
97 		walkdir(pg_wal, pre_sync_fname, false);
98 	walkdir(pg_tblspc, pre_sync_fname, true);
99 #endif
100 
101 	/*
102 	 * Now we do the fsync()s in the same order.
103 	 *
104 	 * The main call ignores symlinks, so in addition to specially processing
105 	 * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
106 	 * process_symlinks = true.  Note that if there are any plain directories
107 	 * in pg_tblspc, they'll get fsync'd twice.  That's not an expected case
108 	 * so we don't worry about optimizing it.
109 	 */
110 	walkdir(pg_data, fsync_fname, false);
111 	if (xlog_is_symlink)
112 		walkdir(pg_wal, fsync_fname, false);
113 	walkdir(pg_tblspc, fsync_fname, true);
114 }
115 
116 /*
117  * Issue fsync recursively on the given directory and all its contents.
118  *
119  * This is a convenient wrapper on top of walkdir().
120  */
121 void
fsync_dir_recurse(const char * dir)122 fsync_dir_recurse(const char *dir)
123 {
124 	/*
125 	 * If possible, hint to the kernel that we're soon going to fsync the data
126 	 * directory and its contents.
127 	 */
128 #ifdef PG_FLUSH_DATA_WORKS
129 	walkdir(dir, pre_sync_fname, false);
130 #endif
131 
132 	walkdir(dir, fsync_fname, false);
133 }
134 
135 /*
136  * walkdir: recursively walk a directory, applying the action to each
137  * regular file and directory (including the named directory itself).
138  *
139  * If process_symlinks is true, the action and recursion are also applied
140  * to regular files and directories that are pointed to by symlinks in the
141  * given directory; otherwise symlinks are ignored.  Symlinks are always
142  * ignored in subdirectories, ie we intentionally don't pass down the
143  * process_symlinks flag to recursive calls.
144  *
145  * Errors are reported but not considered fatal.
146  *
147  * See also walkdir in fd.c, which is a backend version of this logic.
148  */
149 static void
walkdir(const char * path,int (* action)(const char * fname,bool isdir),bool process_symlinks)150 walkdir(const char *path,
151 		int (*action) (const char *fname, bool isdir),
152 		bool process_symlinks)
153 {
154 	DIR		   *dir;
155 	struct dirent *de;
156 
157 	dir = opendir(path);
158 	if (dir == NULL)
159 	{
160 		pg_log_error("could not open directory \"%s\": %m", path);
161 		return;
162 	}
163 
164 	while (errno = 0, (de = readdir(dir)) != NULL)
165 	{
166 		char		subpath[MAXPGPATH * 2];
167 		struct stat fst;
168 		int			sret;
169 
170 		if (strcmp(de->d_name, ".") == 0 ||
171 			strcmp(de->d_name, "..") == 0)
172 			continue;
173 
174 		snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
175 
176 		if (process_symlinks)
177 			sret = stat(subpath, &fst);
178 		else
179 			sret = lstat(subpath, &fst);
180 
181 		if (sret < 0)
182 		{
183 			pg_log_error("could not stat file \"%s\": %m", subpath);
184 			continue;
185 		}
186 
187 		if (S_ISREG(fst.st_mode))
188 			(*action) (subpath, false);
189 		else if (S_ISDIR(fst.st_mode))
190 			walkdir(subpath, action, false);
191 	}
192 
193 	if (errno)
194 		pg_log_error("could not read directory \"%s\": %m", path);
195 
196 	(void) closedir(dir);
197 
198 	/*
199 	 * It's important to fsync the destination directory itself as individual
200 	 * file fsyncs don't guarantee that the directory entry for the file is
201 	 * synced.  Recent versions of ext4 have made the window much wider but
202 	 * it's been an issue for ext3 and other filesystems in the past.
203 	 */
204 	(*action) (path, true);
205 }
206 
/*
 * pre_sync_fname -- hint to the OS that fname is about to be fsync()'d.
 *
 * Returns 0 when the file could be opened, or when opening it failed with
 * EACCES (or with EISDIR for a directory); those failures are silently
 * ignored.  Any other open failure is reported non-fatally and -1 is
 * returned.  The outcome of the hint itself is never checked.
 */
#ifdef PG_FLUSH_DATA_WORKS

static int
pre_sync_fname(const char *fname, bool isdir)
{
	int			fd = open(fname, O_RDONLY | PG_BINARY, 0);

	if (fd >= 0)
	{
		/*
		 * Same strategy as pg_flush_data() in the backend: use
		 * sync_file_range when available, otherwise posix_fadvise.  This is
		 * only a hint, so failures are deliberately ignored.
		 */
#if defined(HAVE_SYNC_FILE_RANGE)
		(void) sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE);
#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
		(void) posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
#else
#error PG_FLUSH_DATA_WORKS should not have been defined
#endif

		(void) close(fd);
		return 0;
	}

	/* Unreadable files are skipped without complaint. */
	if (errno == EACCES)
		return 0;

	/* So are directories that this platform refuses to open. */
	if (isdir && errno == EISDIR)
		return 0;

	pg_log_error("could not open file \"%s\": %m", fname);
	return -1;
}

#endif							/* PG_FLUSH_DATA_WORKS */
248 
249 /*
250  * fsync_fname -- Try to fsync a file or directory
251  *
252  * Ignores errors trying to open unreadable files, or trying to fsync
253  * directories on systems where that isn't allowed/required.  Reports
254  * other errors non-fatally.
255  */
256 int
fsync_fname(const char * fname,bool isdir)257 fsync_fname(const char *fname, bool isdir)
258 {
259 	int			fd;
260 	int			flags;
261 	int			returncode;
262 
263 	/*
264 	 * Some OSs require directories to be opened read-only whereas other
265 	 * systems don't allow us to fsync files opened read-only; so we need both
266 	 * cases here.  Using O_RDWR will cause us to fail to fsync files that are
267 	 * not writable by our userid, but we assume that's OK.
268 	 */
269 	flags = PG_BINARY;
270 	if (!isdir)
271 		flags |= O_RDWR;
272 	else
273 		flags |= O_RDONLY;
274 
275 	/*
276 	 * Open the file, silently ignoring errors about unreadable files (or
277 	 * unsupported operations, e.g. opening a directory under Windows), and
278 	 * logging others.
279 	 */
280 	fd = open(fname, flags, 0);
281 	if (fd < 0)
282 	{
283 		if (errno == EACCES || (isdir && errno == EISDIR))
284 			return 0;
285 		pg_log_error("could not open file \"%s\": %m", fname);
286 		return -1;
287 	}
288 
289 	returncode = fsync(fd);
290 
291 	/*
292 	 * Some OSes don't allow us to fsync directories at all, so we can ignore
293 	 * those errors. Anything else needs to be reported.
294 	 */
295 	if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
296 	{
297 		pg_log_error("could not fsync file \"%s\": %m", fname);
298 		(void) close(fd);
299 		return -1;
300 	}
301 
302 	(void) close(fd);
303 	return 0;
304 }
305 
306 /*
307  * fsync_parent_path -- fsync the parent path of a file or directory
308  *
309  * This is aimed at making file operations persistent on disk in case of
310  * an OS crash or power failure.
311  */
312 int
fsync_parent_path(const char * fname)313 fsync_parent_path(const char *fname)
314 {
315 	char		parentpath[MAXPGPATH];
316 
317 	strlcpy(parentpath, fname, MAXPGPATH);
318 	get_parent_directory(parentpath);
319 
320 	/*
321 	 * get_parent_directory() returns an empty string if the input argument is
322 	 * just a file name (see comments in path.c), so handle that as being the
323 	 * current directory.
324 	 */
325 	if (strlen(parentpath) == 0)
326 		strlcpy(parentpath, ".", MAXPGPATH);
327 
328 	if (fsync_fname(parentpath, true) != 0)
329 		return -1;
330 
331 	return 0;
332 }
333 
334 /*
335  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
336  *
337  * Wrapper around rename, similar to the backend version.
338  */
339 int
durable_rename(const char * oldfile,const char * newfile)340 durable_rename(const char *oldfile, const char *newfile)
341 {
342 	int			fd;
343 
344 	/*
345 	 * First fsync the old and target path (if it exists), to ensure that they
346 	 * are properly persistent on disk. Syncing the target file is not
347 	 * strictly necessary, but it makes it easier to reason about crashes;
348 	 * because it's then guaranteed that either source or target file exists
349 	 * after a crash.
350 	 */
351 	if (fsync_fname(oldfile, false) != 0)
352 		return -1;
353 
354 	fd = open(newfile, PG_BINARY | O_RDWR, 0);
355 	if (fd < 0)
356 	{
357 		if (errno != ENOENT)
358 		{
359 			pg_log_error("could not open file \"%s\": %m", newfile);
360 			return -1;
361 		}
362 	}
363 	else
364 	{
365 		if (fsync(fd) != 0)
366 		{
367 			pg_log_error("could not fsync file \"%s\": %m", newfile);
368 			close(fd);
369 			return -1;
370 		}
371 		close(fd);
372 	}
373 
374 	/* Time to do the real deal... */
375 	if (rename(oldfile, newfile) != 0)
376 	{
377 		pg_log_error("could not rename file \"%s\" to \"%s\": %m",
378 					 oldfile, newfile);
379 		return -1;
380 	}
381 
382 	/*
383 	 * To guarantee renaming the file is persistent, fsync the file with its
384 	 * new name, and its containing directory.
385 	 */
386 	if (fsync_fname(newfile, false) != 0)
387 		return -1;
388 
389 	if (fsync_parent_path(newfile) != 0)
390 		return -1;
391 
392 	return 0;
393 }
394