1 /*
2  *	file.c
3  *
4  *	file system operations
5  *
6  *	Copyright (c) 2010-2019, PostgreSQL Global Development Group
7  *	src/bin/pg_upgrade/file.c
8  */
9 
10 #include "postgres_fe.h"
11 
12 #include "access/visibilitymap.h"
13 #include "common/file_perm.h"
14 #include "pg_upgrade.h"
15 #include "storage/bufpage.h"
16 #include "storage/checksum.h"
17 #include "storage/checksum_impl.h"
18 
19 #include <sys/stat.h>
20 #include <fcntl.h>
21 #ifdef HAVE_COPYFILE_H
22 #include <copyfile.h>
23 #endif
24 #ifdef __linux__
25 #include <sys/ioctl.h>
26 #include <linux/fs.h>
27 #endif
28 
29 
30 #ifdef WIN32
31 static int	win32_pghardlink(const char *src, const char *dst);
32 #endif
33 
34 
/*
 * cloneFile()
 *
 * Clones/reflinks a relation file from src to dst.
 *
 * schemaName/relName are relation's SQL name (used for error messages only).
 *
 * Two implementations are selected at compile time: copyfile() with
 * COPYFILE_CLONE_FORCE where that is available, otherwise the Linux FICLONE
 * ioctl.  When neither is available the function body is empty.  Every
 * failure is reported through pg_fatal().
 */
void
cloneFile(const char *src, const char *dst,
		  const char *schemaName, const char *relName)
{
#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE)
	if (copyfile(src, dst, NULL, COPYFILE_CLONE_FORCE) < 0)
		pg_fatal("error while cloning relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n",
				 schemaName, relName, src, dst, strerror(errno));
#elif defined(__linux__) && defined(FICLONE)
	int			src_fd;
	int			dest_fd;

	if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0)
		pg_fatal("error while cloning relation \"%s.%s\": could not open file \"%s\": %s\n",
				 schemaName, relName, src, strerror(errno));

	/* O_EXCL: the destination must not exist yet */
	if ((dest_fd = open(dst, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
						pg_file_create_mode)) < 0)
		pg_fatal("error while cloning relation \"%s.%s\": could not create file \"%s\": %s\n",
				 schemaName, relName, dst, strerror(errno));

	if (ioctl(dest_fd, FICLONE, src_fd) < 0)
	{
		/*
		 * Remember why the clone failed before cleaning up.  unlink() may
		 * itself fail and overwrite errno, in which case the message below
		 * would blame the wrong cause.
		 */
		int			save_errno = errno;

		/* Don't leave the just-created, empty destination file behind */
		unlink(dst);

		pg_fatal("error while cloning relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n",
				 schemaName, relName, src, dst, strerror(save_errno));
	}

	close(src_fd);
	close(dest_fd);
#endif
}
74 
75 
76 /*
77  * copyFile()
78  *
79  * Copies a relation file from src to dst.
80  * schemaName/relName are relation's SQL name (used for error messages only).
81  */
82 void
copyFile(const char * src,const char * dst,const char * schemaName,const char * relName)83 copyFile(const char *src, const char *dst,
84 		 const char *schemaName, const char *relName)
85 {
86 #ifndef WIN32
87 	int			src_fd;
88 	int			dest_fd;
89 	char	   *buffer;
90 
91 	if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0)
92 		pg_fatal("error while copying relation \"%s.%s\": could not open file \"%s\": %s\n",
93 				 schemaName, relName, src, strerror(errno));
94 
95 	if ((dest_fd = open(dst, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
96 						pg_file_create_mode)) < 0)
97 		pg_fatal("error while copying relation \"%s.%s\": could not create file \"%s\": %s\n",
98 				 schemaName, relName, dst, strerror(errno));
99 
100 	/* copy in fairly large chunks for best efficiency */
101 #define COPY_BUF_SIZE (50 * BLCKSZ)
102 
103 	buffer = (char *) pg_malloc(COPY_BUF_SIZE);
104 
105 	/* perform data copying i.e read src source, write to destination */
106 	while (true)
107 	{
108 		ssize_t		nbytes = read(src_fd, buffer, COPY_BUF_SIZE);
109 
110 		if (nbytes < 0)
111 			pg_fatal("error while copying relation \"%s.%s\": could not read file \"%s\": %s\n",
112 					 schemaName, relName, src, strerror(errno));
113 
114 		if (nbytes == 0)
115 			break;
116 
117 		errno = 0;
118 		if (write(dest_fd, buffer, nbytes) != nbytes)
119 		{
120 			/* if write didn't set errno, assume problem is no disk space */
121 			if (errno == 0)
122 				errno = ENOSPC;
123 			pg_fatal("error while copying relation \"%s.%s\": could not write file \"%s\": %s\n",
124 					 schemaName, relName, dst, strerror(errno));
125 		}
126 	}
127 
128 	pg_free(buffer);
129 	close(src_fd);
130 	close(dest_fd);
131 
132 #else							/* WIN32 */
133 
134 	if (CopyFile(src, dst, true) == 0)
135 	{
136 		_dosmaperr(GetLastError());
137 		pg_fatal("error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n",
138 				 schemaName, relName, src, dst, strerror(errno));
139 	}
140 
141 #endif							/* WIN32 */
142 }
143 
144 
/*
 * linkFile()
 *
 * Makes dst a hard link to the relation file src.
 * schemaName/relName are relation's SQL name (used for error messages only).
 *
 * A negative result from pg_link_file() is reported through pg_fatal().
 */
void
linkFile(const char *src, const char *dst,
		 const char *schemaName, const char *relName)
{
	/* Nothing more to do once the link exists */
	if (pg_link_file(src, dst) >= 0)
		return;

	pg_fatal("error while creating link for relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n",
			 schemaName, relName, src, dst, strerror(errno));
}
159 
160 
161 /*
162  * rewriteVisibilityMap()
163  *
164  * Transform a visibility map file, copying from src to dst.
165  * schemaName/relName are relation's SQL name (used for error messages only).
166  *
167  * In versions of PostgreSQL prior to catversion 201603011, PostgreSQL's
168  * visibility map included one bit per heap page; it now includes two.
169  * When upgrading a cluster from before that time to a current PostgreSQL
170  * version, we could refuse to copy visibility maps from the old cluster
171  * to the new cluster; the next VACUUM would recreate them, but at the
172  * price of scanning the entire table.  So, instead, we rewrite the old
173  * visibility maps in the new format.  That way, the all-visible bits
174  * remain set for the pages for which they were set previously.  The
175  * all-frozen bits are never set by this conversion; we leave that to VACUUM.
176  */
177 void
rewriteVisibilityMap(const char * fromfile,const char * tofile,const char * schemaName,const char * relName)178 rewriteVisibilityMap(const char *fromfile, const char *tofile,
179 					 const char *schemaName, const char *relName)
180 {
181 	int			src_fd;
182 	int			dst_fd;
183 	PGAlignedBlock buffer;
184 	PGAlignedBlock new_vmbuf;
185 	ssize_t		totalBytesRead = 0;
186 	ssize_t		src_filesize;
187 	int			rewriteVmBytesPerPage;
188 	BlockNumber new_blkno = 0;
189 	struct stat statbuf;
190 
191 	/* Compute number of old-format bytes per new page */
192 	rewriteVmBytesPerPage = (BLCKSZ - SizeOfPageHeaderData) / 2;
193 
194 	if ((src_fd = open(fromfile, O_RDONLY | PG_BINARY, 0)) < 0)
195 		pg_fatal("error while copying relation \"%s.%s\": could not open file \"%s\": %s\n",
196 				 schemaName, relName, fromfile, strerror(errno));
197 
198 	if (fstat(src_fd, &statbuf) != 0)
199 		pg_fatal("error while copying relation \"%s.%s\": could not stat file \"%s\": %s\n",
200 				 schemaName, relName, fromfile, strerror(errno));
201 
202 	if ((dst_fd = open(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
203 					   pg_file_create_mode)) < 0)
204 		pg_fatal("error while copying relation \"%s.%s\": could not create file \"%s\": %s\n",
205 				 schemaName, relName, tofile, strerror(errno));
206 
207 	/* Save old file size */
208 	src_filesize = statbuf.st_size;
209 
210 	/*
211 	 * Turn each visibility map page into 2 pages one by one. Each new page
212 	 * has the same page header as the old one.  If the last section of the
213 	 * last page is empty, we skip it, mostly to avoid turning one-page
214 	 * visibility maps for small relations into two pages needlessly.
215 	 */
216 	while (totalBytesRead < src_filesize)
217 	{
218 		ssize_t		bytesRead;
219 		char	   *old_cur;
220 		char	   *old_break;
221 		char	   *old_blkend;
222 		PageHeaderData pageheader;
223 		bool		old_lastblk;
224 
225 		if ((bytesRead = read(src_fd, buffer.data, BLCKSZ)) != BLCKSZ)
226 		{
227 			if (bytesRead < 0)
228 				pg_fatal("error while copying relation \"%s.%s\": could not read file \"%s\": %s\n",
229 						 schemaName, relName, fromfile, strerror(errno));
230 			else
231 				pg_fatal("error while copying relation \"%s.%s\": partial page found in file \"%s\"\n",
232 						 schemaName, relName, fromfile);
233 		}
234 
235 		totalBytesRead += BLCKSZ;
236 		old_lastblk = (totalBytesRead == src_filesize);
237 
238 		/* Save the page header data */
239 		memcpy(&pageheader, buffer.data, SizeOfPageHeaderData);
240 
241 		/*
242 		 * These old_* variables point to old visibility map page. old_cur
243 		 * points to current position on old page. old_blkend points to end of
244 		 * old block.  old_break is the end+1 position on the old page for the
245 		 * data that will be transferred to the current new page.
246 		 */
247 		old_cur = buffer.data + SizeOfPageHeaderData;
248 		old_blkend = buffer.data + bytesRead;
249 		old_break = old_cur + rewriteVmBytesPerPage;
250 
251 		while (old_break <= old_blkend)
252 		{
253 			char	   *new_cur;
254 			bool		empty = true;
255 			bool		old_lastpart;
256 
257 			/* First, copy old page header to new page */
258 			memcpy(new_vmbuf.data, &pageheader, SizeOfPageHeaderData);
259 
260 			/* Rewriting the last part of the last old page? */
261 			old_lastpart = old_lastblk && (old_break == old_blkend);
262 
263 			new_cur = new_vmbuf.data + SizeOfPageHeaderData;
264 
265 			/* Process old page bytes one by one, and turn it into new page. */
266 			while (old_cur < old_break)
267 			{
268 				uint8		byte = *(uint8 *) old_cur;
269 				uint16		new_vmbits = 0;
270 				int			i;
271 
272 				/* Generate new format bits while keeping old information */
273 				for (i = 0; i < BITS_PER_BYTE; i++)
274 				{
275 					if (byte & (1 << i))
276 					{
277 						empty = false;
278 						new_vmbits |=
279 							VISIBILITYMAP_ALL_VISIBLE << (BITS_PER_HEAPBLOCK * i);
280 					}
281 				}
282 
283 				/* Copy new visibility map bytes to new-format page */
284 				new_cur[0] = (char) (new_vmbits & 0xFF);
285 				new_cur[1] = (char) (new_vmbits >> 8);
286 
287 				old_cur++;
288 				new_cur += BITS_PER_HEAPBLOCK;
289 			}
290 
291 			/* If the last part of the last page is empty, skip writing it */
292 			if (old_lastpart && empty)
293 				break;
294 
295 			/* Set new checksum for visibility map page, if enabled */
296 			if (new_cluster.controldata.data_checksum_version != 0)
297 				((PageHeader) new_vmbuf.data)->pd_checksum =
298 					pg_checksum_page(new_vmbuf.data, new_blkno);
299 
300 			errno = 0;
301 			if (write(dst_fd, new_vmbuf.data, BLCKSZ) != BLCKSZ)
302 			{
303 				/* if write didn't set errno, assume problem is no disk space */
304 				if (errno == 0)
305 					errno = ENOSPC;
306 				pg_fatal("error while copying relation \"%s.%s\": could not write file \"%s\": %s\n",
307 						 schemaName, relName, tofile, strerror(errno));
308 			}
309 
310 			/* Advance for next new page */
311 			old_break += rewriteVmBytesPerPage;
312 			new_blkno++;
313 		}
314 	}
315 
316 	/* Clean up */
317 	close(dst_fd);
318 	close(src_fd);
319 }
320 
321 void
check_file_clone(void)322 check_file_clone(void)
323 {
324 	char		existing_file[MAXPGPATH];
325 	char		new_link_file[MAXPGPATH];
326 
327 	snprintf(existing_file, sizeof(existing_file), "%s/PG_VERSION", old_cluster.pgdata);
328 	snprintf(new_link_file, sizeof(new_link_file), "%s/PG_VERSION.clonetest", new_cluster.pgdata);
329 	unlink(new_link_file);		/* might fail */
330 
331 #if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE)
332 	if (copyfile(existing_file, new_link_file, NULL, COPYFILE_CLONE_FORCE) < 0)
333 		pg_fatal("could not clone file between old and new data directories: %s\n",
334 				 strerror(errno));
335 #elif defined(__linux__) && defined(FICLONE)
336 	{
337 		int			src_fd;
338 		int			dest_fd;
339 
340 		if ((src_fd = open(existing_file, O_RDONLY | PG_BINARY, 0)) < 0)
341 			pg_fatal("could not open file \"%s\": %s\n",
342 					 existing_file, strerror(errno));
343 
344 		if ((dest_fd = open(new_link_file, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
345 							pg_file_create_mode)) < 0)
346 			pg_fatal("could not create file \"%s\": %s\n",
347 					 new_link_file, strerror(errno));
348 
349 		if (ioctl(dest_fd, FICLONE, src_fd) < 0)
350 			pg_fatal("could not clone file between old and new data directories: %s\n",
351 					 strerror(errno));
352 
353 		close(src_fd);
354 		close(dest_fd);
355 	}
356 #else
357 	pg_fatal("file cloning not supported on this platform\n");
358 #endif
359 
360 	unlink(new_link_file);
361 }
362 
363 void
check_hard_link(void)364 check_hard_link(void)
365 {
366 	char		existing_file[MAXPGPATH];
367 	char		new_link_file[MAXPGPATH];
368 
369 	snprintf(existing_file, sizeof(existing_file), "%s/PG_VERSION", old_cluster.pgdata);
370 	snprintf(new_link_file, sizeof(new_link_file), "%s/PG_VERSION.linktest", new_cluster.pgdata);
371 	unlink(new_link_file);		/* might fail */
372 
373 	if (pg_link_file(existing_file, new_link_file) < 0)
374 		pg_fatal("could not create hard link between old and new data directories: %s\n"
375 				 "In link mode the old and new data directories must be on the same file system.\n",
376 				 strerror(errno));
377 
378 	unlink(new_link_file);
379 }
380 
#ifdef WIN32
/*
 * win32_pghardlink()
 *
 * Implementation of pg_link_file() on Windows: creates dst as a hard link
 * to the existing file src.  Returns 0 on success and -1 on failure.
 */
static int
win32_pghardlink(const char *src, const char *dst)
{
	/*
	 * CreateHardLinkA takes the new link name first and the existing file
	 * second, and returns zero for failure; see
	 * http://msdn.microsoft.com/en-us/library/aa363860(VS.85).aspx
	 */
	if (CreateHardLinkA(dst, src, NULL) != 0)
		return 0;

	/* presumably maps the Windows error code onto errno for the caller */
	_dosmaperr(GetLastError());
	return -1;
}
#endif
399