1 /*-------------------------------------------------------------------------
2  *
3  * File-processing utility routines.
4  *
5  * Assorted utility functions to work on files.
6  *
7  *
8  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
9  * Portions Copyright (c) 1994, Regents of the University of California
10  *
11  * src/common/file_utils.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres_fe.h"
16 
17 #include <dirent.h>
18 #include <fcntl.h>
19 #include <sys/stat.h>
20 #include <unistd.h>
21 
22 #include "common/file_utils.h"
23 
24 
/*
 * PG_FLUSH_DATA_WORKS is defined whenever this build has a way to implement
 * the pre-fsync hint issued by pre_sync_fname(): either sync_file_range(),
 * or posix_fadvise() together with POSIX_FADV_DONTNEED.
 */
#if defined(HAVE_SYNC_FILE_RANGE) || \
	(defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED))
#define PG_FLUSH_DATA_WORKS 1
#endif

/*
 * Server version number from which the WAL directory is called pg_wal;
 * older servers call it pg_xlog (the rename happened in version 10).
 */
#define MINIMUM_VERSION_FOR_PG_WAL	100000

/* Prototypes for the file-local helpers defined further down. */
#ifdef PG_FLUSH_DATA_WORKS
static int	pre_sync_fname(const char *fname, bool isdir, const char *progname);
#endif
static void walkdir(const char *path,
		int (*action) (const char *fname, bool isdir, const char *progname),
		bool process_symlinks,
		const char *progname);
44 
45 /*
46  * Issue fsync recursively on PGDATA and all its contents.
47  *
48  * We fsync regular files and directories wherever they are, but we follow
49  * symlinks only for pg_wal (or pg_xlog) and immediately under pg_tblspc.
50  * Other symlinks are presumed to point at files we're not responsible for
51  * fsyncing, and might not have privileges to write at all.
52  *
53  * serverVersion indicates the version of the server to be fsync'd.
54  *
55  * Errors are reported but not considered fatal.
56  */
57 void
fsync_pgdata(const char * pg_data,const char * progname,int serverVersion)58 fsync_pgdata(const char *pg_data,
59 			 const char *progname,
60 			 int serverVersion)
61 {
62 	bool		xlog_is_symlink;
63 	char		pg_wal[MAXPGPATH];
64 	char		pg_tblspc[MAXPGPATH];
65 
66 	/* handle renaming of pg_xlog to pg_wal in post-10 clusters */
67 	snprintf(pg_wal, MAXPGPATH, "%s/%s", pg_data,
68 			 serverVersion < MINIMUM_VERSION_FOR_PG_WAL ? "pg_xlog" : "pg_wal");
69 	snprintf(pg_tblspc, MAXPGPATH, "%s/pg_tblspc", pg_data);
70 
71 	/*
72 	 * If pg_wal is a symlink, we'll need to recurse into it separately,
73 	 * because the first walkdir below will ignore it.
74 	 */
75 	xlog_is_symlink = false;
76 
77 #ifndef WIN32
78 	{
79 		struct stat st;
80 
81 		if (lstat(pg_wal, &st) < 0)
82 			fprintf(stderr, _("%s: could not stat file \"%s\": %s\n"),
83 					progname, pg_wal, strerror(errno));
84 		else if (S_ISLNK(st.st_mode))
85 			xlog_is_symlink = true;
86 	}
87 #else
88 	if (pgwin32_is_junction(pg_wal))
89 		xlog_is_symlink = true;
90 #endif
91 
92 	/*
93 	 * If possible, hint to the kernel that we're soon going to fsync the data
94 	 * directory and its contents.
95 	 */
96 #ifdef PG_FLUSH_DATA_WORKS
97 	walkdir(pg_data, pre_sync_fname, false, progname);
98 	if (xlog_is_symlink)
99 		walkdir(pg_wal, pre_sync_fname, false, progname);
100 	walkdir(pg_tblspc, pre_sync_fname, true, progname);
101 #endif
102 
103 	/*
104 	 * Now we do the fsync()s in the same order.
105 	 *
106 	 * The main call ignores symlinks, so in addition to specially processing
107 	 * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
108 	 * process_symlinks = true.  Note that if there are any plain directories
109 	 * in pg_tblspc, they'll get fsync'd twice.  That's not an expected case
110 	 * so we don't worry about optimizing it.
111 	 */
112 	walkdir(pg_data, fsync_fname, false, progname);
113 	if (xlog_is_symlink)
114 		walkdir(pg_wal, fsync_fname, false, progname);
115 	walkdir(pg_tblspc, fsync_fname, true, progname);
116 }
117 
118 /*
119  * Issue fsync recursively on the given directory and all its contents.
120  *
121  * This is a convenient wrapper on top of walkdir().
122  */
123 void
fsync_dir_recurse(const char * dir,const char * progname)124 fsync_dir_recurse(const char *dir, const char *progname)
125 {
126 	/*
127 	 * If possible, hint to the kernel that we're soon going to fsync the data
128 	 * directory and its contents.
129 	 */
130 #ifdef PG_FLUSH_DATA_WORKS
131 	walkdir(dir, pre_sync_fname, false, progname);
132 #endif
133 
134 	walkdir(dir, fsync_fname, false, progname);
135 }
136 
137 /*
138  * walkdir: recursively walk a directory, applying the action to each
139  * regular file and directory (including the named directory itself).
140  *
141  * If process_symlinks is true, the action and recursion are also applied
142  * to regular files and directories that are pointed to by symlinks in the
143  * given directory; otherwise symlinks are ignored.  Symlinks are always
144  * ignored in subdirectories, ie we intentionally don't pass down the
145  * process_symlinks flag to recursive calls.
146  *
147  * Errors are reported but not considered fatal.
148  *
149  * See also walkdir in fd.c, which is a backend version of this logic.
150  */
151 static void
walkdir(const char * path,int (* action)(const char * fname,bool isdir,const char * progname),bool process_symlinks,const char * progname)152 walkdir(const char *path,
153 		int (*action) (const char *fname, bool isdir, const char *progname),
154 		bool process_symlinks, const char *progname)
155 {
156 	DIR		   *dir;
157 	struct dirent *de;
158 
159 	dir = opendir(path);
160 	if (dir == NULL)
161 	{
162 		fprintf(stderr, _("%s: could not open directory \"%s\": %s\n"),
163 				progname, path, strerror(errno));
164 		return;
165 	}
166 
167 	while (errno = 0, (de = readdir(dir)) != NULL)
168 	{
169 		char		subpath[MAXPGPATH * 2];
170 		struct stat fst;
171 		int			sret;
172 
173 		if (strcmp(de->d_name, ".") == 0 ||
174 			strcmp(de->d_name, "..") == 0)
175 			continue;
176 
177 		snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
178 
179 		if (process_symlinks)
180 			sret = stat(subpath, &fst);
181 		else
182 			sret = lstat(subpath, &fst);
183 
184 		if (sret < 0)
185 		{
186 			fprintf(stderr, _("%s: could not stat file \"%s\": %s\n"),
187 					progname, subpath, strerror(errno));
188 			continue;
189 		}
190 
191 		if (S_ISREG(fst.st_mode))
192 			(*action) (subpath, false, progname);
193 		else if (S_ISDIR(fst.st_mode))
194 			walkdir(subpath, action, false, progname);
195 	}
196 
197 	if (errno)
198 		fprintf(stderr, _("%s: could not read directory \"%s\": %s\n"),
199 				progname, path, strerror(errno));
200 
201 	(void) closedir(dir);
202 
203 	/*
204 	 * It's important to fsync the destination directory itself as individual
205 	 * file fsyncs don't guarantee that the directory entry for the file is
206 	 * synced.  Recent versions of ext4 have made the window much wider but
207 	 * it's been an issue for ext3 and other filesystems in the past.
208 	 */
209 	(*action) (path, true, progname);
210 }
211 
/*
 * pre_sync_fname -- hint to the OS that a file is about to be fsync()'d
 *
 * Returns 0 on success, and also when the file could not be opened because
 * of EACCES or, for a directory, EISDIR.  Any other open failure is reported
 * on stderr and yields -1.  Failures of the hint itself are ignored.
 */
#ifdef PG_FLUSH_DATA_WORKS

static int
pre_sync_fname(const char *fname, bool isdir, const char *progname)
{
	int			hint_fd;

	hint_fd = open(fname, O_RDONLY | PG_BINARY);

	if (hint_fd >= 0)
	{
		/*
		 * Same policy as pg_flush_data() in the backend: sync_file_range is
		 * preferred, posix_fadvise is the fallback.  The result is discarded
		 * because this is only a hint.
		 */
#if defined(HAVE_SYNC_FILE_RANGE)
		(void) sync_file_range(hint_fd, 0, 0, SYNC_FILE_RANGE_WRITE);
#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
		(void) posix_fadvise(hint_fd, 0, 0, POSIX_FADV_DONTNEED);
#else
#error PG_FLUSH_DATA_WORKS should not have been defined
#endif

		(void) close(hint_fd);
		return 0;
	}

	/* Unreadable files, and directories that cannot be opened, are skipped */
	if (errno == EACCES || (isdir && errno == EISDIR))
		return 0;

	fprintf(stderr, _("%s: could not open file \"%s\": %s\n"),
			progname, fname, strerror(errno));
	return -1;
}

#endif							/* PG_FLUSH_DATA_WORKS */
254 
255 /*
256  * fsync_fname -- Try to fsync a file or directory
257  *
258  * Ignores errors trying to open unreadable files, or trying to fsync
259  * directories on systems where that isn't allowed/required.  Reports
260  * other errors non-fatally.
261  */
262 int
fsync_fname(const char * fname,bool isdir,const char * progname)263 fsync_fname(const char *fname, bool isdir, const char *progname)
264 {
265 	int			fd;
266 	int			flags;
267 	int			returncode;
268 
269 	/*
270 	 * Some OSs require directories to be opened read-only whereas other
271 	 * systems don't allow us to fsync files opened read-only; so we need both
272 	 * cases here.  Using O_RDWR will cause us to fail to fsync files that are
273 	 * not writable by our userid, but we assume that's OK.
274 	 */
275 	flags = PG_BINARY;
276 	if (!isdir)
277 		flags |= O_RDWR;
278 	else
279 		flags |= O_RDONLY;
280 
281 	/*
282 	 * Open the file, silently ignoring errors about unreadable files (or
283 	 * unsupported operations, e.g. opening a directory under Windows), and
284 	 * logging others.
285 	 */
286 	fd = open(fname, flags);
287 	if (fd < 0)
288 	{
289 		if (errno == EACCES || (isdir && errno == EISDIR))
290 			return 0;
291 		fprintf(stderr, _("%s: could not open file \"%s\": %s\n"),
292 				progname, fname, strerror(errno));
293 		return -1;
294 	}
295 
296 	returncode = fsync(fd);
297 
298 	/*
299 	 * Some OSes don't allow us to fsync directories at all, so we can ignore
300 	 * those errors. Anything else needs to be reported.
301 	 */
302 	if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
303 	{
304 		fprintf(stderr, _("%s: could not fsync file \"%s\": %s\n"),
305 				progname, fname, strerror(errno));
306 		(void) close(fd);
307 		return -1;
308 	}
309 
310 	(void) close(fd);
311 	return 0;
312 }
313 
314 /*
315  * fsync_parent_path -- fsync the parent path of a file or directory
316  *
317  * This is aimed at making file operations persistent on disk in case of
318  * an OS crash or power failure.
319  */
320 int
fsync_parent_path(const char * fname,const char * progname)321 fsync_parent_path(const char *fname, const char *progname)
322 {
323 	char		parentpath[MAXPGPATH];
324 
325 	strlcpy(parentpath, fname, MAXPGPATH);
326 	get_parent_directory(parentpath);
327 
328 	/*
329 	 * get_parent_directory() returns an empty string if the input argument is
330 	 * just a file name (see comments in path.c), so handle that as being the
331 	 * current directory.
332 	 */
333 	if (strlen(parentpath) == 0)
334 		strlcpy(parentpath, ".", MAXPGPATH);
335 
336 	if (fsync_fname(parentpath, true, progname) != 0)
337 		return -1;
338 
339 	return 0;
340 }
341 
342 /*
343  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
344  *
345  * Wrapper around rename, similar to the backend version.
346  */
347 int
durable_rename(const char * oldfile,const char * newfile,const char * progname)348 durable_rename(const char *oldfile, const char *newfile, const char *progname)
349 {
350 	int			fd;
351 
352 	/*
353 	 * First fsync the old and target path (if it exists), to ensure that they
354 	 * are properly persistent on disk. Syncing the target file is not
355 	 * strictly necessary, but it makes it easier to reason about crashes;
356 	 * because it's then guaranteed that either source or target file exists
357 	 * after a crash.
358 	 */
359 	if (fsync_fname(oldfile, false, progname) != 0)
360 		return -1;
361 
362 	fd = open(newfile, PG_BINARY | O_RDWR, 0);
363 	if (fd < 0)
364 	{
365 		if (errno != ENOENT)
366 		{
367 			fprintf(stderr, _("%s: could not open file \"%s\": %s\n"),
368 					progname, newfile, strerror(errno));
369 			return -1;
370 		}
371 	}
372 	else
373 	{
374 		if (fsync(fd) != 0)
375 		{
376 			fprintf(stderr, _("%s: could not fsync file \"%s\": %s\n"),
377 					progname, newfile, strerror(errno));
378 			close(fd);
379 			return -1;
380 		}
381 		close(fd);
382 	}
383 
384 	/* Time to do the real deal... */
385 	if (rename(oldfile, newfile) != 0)
386 	{
387 		fprintf(stderr, _("%s: could not rename file \"%s\" to \"%s\": %s\n"),
388 				progname, oldfile, newfile, strerror(errno));
389 		return -1;
390 	}
391 
392 	/*
393 	 * To guarantee renaming the file is persistent, fsync the file with its
394 	 * new name, and its containing directory.
395 	 */
396 	if (fsync_fname(newfile, false, progname) != 0)
397 		return -1;
398 
399 	if (fsync_parent_path(newfile, progname) != 0)
400 		return -1;
401 
402 	return 0;
403 }
404