1 /*
2  *	file.c
3  *
4  *	file system operations
5  *
6  *	Copyright (c) 2010-2021, PostgreSQL Global Development Group
7  *	src/bin/pg_upgrade/file.c
8  */
9 
10 #include "postgres_fe.h"
11 
12 #include <sys/stat.h>
13 #include <fcntl.h>
14 #ifdef HAVE_COPYFILE_H
15 #include <copyfile.h>
16 #endif
17 #ifdef __linux__
18 #include <sys/ioctl.h>
19 #include <linux/fs.h>
20 #endif
21 
22 #include "access/visibilitymapdefs.h"
23 #include "common/file_perm.h"
24 #include "pg_upgrade.h"
25 #include "storage/bufpage.h"
26 #include "storage/checksum.h"
27 #include "storage/checksum_impl.h"
28 
29 
/*
 * cloneFile()
 *
 * Clones/reflinks a relation file from src to dst.
 *
 * schemaName/relName are relation's SQL name (used for error messages only).
 *
 * copyfile() with COPYFILE_CLONE_FORCE is used when it is available;
 * otherwise, on Linux, the FICLONE ioctl is used.  When neither facility is
 * compiled in, the function body is empty.  All failures are reported via
 * pg_fatal().
 */
void
cloneFile(const char *src, const char *dst,
		  const char *schemaName, const char *relName)
{
#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE)
	if (copyfile(src, dst, NULL, COPYFILE_CLONE_FORCE) < 0)
		pg_fatal("error while cloning relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n",
				 schemaName, relName, src, dst, strerror(errno));
#elif defined(__linux__) && defined(FICLONE)
	int			src_fd;
	int			dest_fd;

	if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0)
		pg_fatal("error while cloning relation \"%s.%s\": could not open file \"%s\": %s\n",
				 schemaName, relName, src, strerror(errno));

	/* O_EXCL: refuse to clone on top of an already existing file */
	if ((dest_fd = open(dst, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
						pg_file_create_mode)) < 0)
		pg_fatal("error while cloning relation \"%s.%s\": could not create file \"%s\": %s\n",
				 schemaName, relName, dst, strerror(errno));

	if (ioctl(dest_fd, FICLONE, src_fd) < 0)
	{
		/*
		 * Remember why the ioctl failed before cleaning up: unlink() may
		 * itself set errno, which would make the message below misleading.
		 */
		int			save_errno = errno;

		/* remove the empty file created above */
		unlink(dst);

		pg_fatal("error while cloning relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n",
				 schemaName, relName, src, dst, strerror(save_errno));
	}

	close(src_fd);
	close(dest_fd);
#endif
}
69 
70 
71 /*
72  * copyFile()
73  *
74  * Copies a relation file from src to dst.
75  * schemaName/relName are relation's SQL name (used for error messages only).
76  */
77 void
copyFile(const char * src,const char * dst,const char * schemaName,const char * relName)78 copyFile(const char *src, const char *dst,
79 		 const char *schemaName, const char *relName)
80 {
81 #ifndef WIN32
82 	int			src_fd;
83 	int			dest_fd;
84 	char	   *buffer;
85 
86 	if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0)
87 		pg_fatal("error while copying relation \"%s.%s\": could not open file \"%s\": %s\n",
88 				 schemaName, relName, src, strerror(errno));
89 
90 	if ((dest_fd = open(dst, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
91 						pg_file_create_mode)) < 0)
92 		pg_fatal("error while copying relation \"%s.%s\": could not create file \"%s\": %s\n",
93 				 schemaName, relName, dst, strerror(errno));
94 
95 	/* copy in fairly large chunks for best efficiency */
96 #define COPY_BUF_SIZE (50 * BLCKSZ)
97 
98 	buffer = (char *) pg_malloc(COPY_BUF_SIZE);
99 
100 	/* perform data copying i.e read src source, write to destination */
101 	while (true)
102 	{
103 		ssize_t		nbytes = read(src_fd, buffer, COPY_BUF_SIZE);
104 
105 		if (nbytes < 0)
106 			pg_fatal("error while copying relation \"%s.%s\": could not read file \"%s\": %s\n",
107 					 schemaName, relName, src, strerror(errno));
108 
109 		if (nbytes == 0)
110 			break;
111 
112 		errno = 0;
113 		if (write(dest_fd, buffer, nbytes) != nbytes)
114 		{
115 			/* if write didn't set errno, assume problem is no disk space */
116 			if (errno == 0)
117 				errno = ENOSPC;
118 			pg_fatal("error while copying relation \"%s.%s\": could not write file \"%s\": %s\n",
119 					 schemaName, relName, dst, strerror(errno));
120 		}
121 	}
122 
123 	pg_free(buffer);
124 	close(src_fd);
125 	close(dest_fd);
126 
127 #else							/* WIN32 */
128 
129 	if (CopyFile(src, dst, true) == 0)
130 	{
131 		_dosmaperr(GetLastError());
132 		pg_fatal("error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n",
133 				 schemaName, relName, src, dst, strerror(errno));
134 	}
135 
136 #endif							/* WIN32 */
137 }
138 
139 
/*
 * linkFile()
 *
 * Creates dst as a hard link to the relation file src.
 * schemaName/relName are relation's SQL name (used for error messages only).
 *
 * A failing link() call is reported via pg_fatal().
 */
void
linkFile(const char *src, const char *dst,
		 const char *schemaName, const char *relName)
{
	int			rc = link(src, dst);

	/* nothing more to do once the link exists */
	if (rc >= 0)
		return;

	pg_fatal("error while creating link for relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n",
			 schemaName, relName, src, dst, strerror(errno));
}
154 
155 
156 /*
157  * rewriteVisibilityMap()
158  *
159  * Transform a visibility map file, copying from src to dst.
160  * schemaName/relName are relation's SQL name (used for error messages only).
161  *
162  * In versions of PostgreSQL prior to catversion 201603011, PostgreSQL's
163  * visibility map included one bit per heap page; it now includes two.
164  * When upgrading a cluster from before that time to a current PostgreSQL
165  * version, we could refuse to copy visibility maps from the old cluster
166  * to the new cluster; the next VACUUM would recreate them, but at the
167  * price of scanning the entire table.  So, instead, we rewrite the old
168  * visibility maps in the new format.  That way, the all-visible bits
169  * remain set for the pages for which they were set previously.  The
170  * all-frozen bits are never set by this conversion; we leave that to VACUUM.
171  */
172 void
rewriteVisibilityMap(const char * fromfile,const char * tofile,const char * schemaName,const char * relName)173 rewriteVisibilityMap(const char *fromfile, const char *tofile,
174 					 const char *schemaName, const char *relName)
175 {
176 	int			src_fd;
177 	int			dst_fd;
178 	PGAlignedBlock buffer;
179 	PGAlignedBlock new_vmbuf;
180 	ssize_t		totalBytesRead = 0;
181 	ssize_t		src_filesize;
182 	int			rewriteVmBytesPerPage;
183 	BlockNumber new_blkno = 0;
184 	struct stat statbuf;
185 
186 	/* Compute number of old-format bytes per new page */
187 	rewriteVmBytesPerPage = (BLCKSZ - SizeOfPageHeaderData) / 2;
188 
189 	if ((src_fd = open(fromfile, O_RDONLY | PG_BINARY, 0)) < 0)
190 		pg_fatal("error while copying relation \"%s.%s\": could not open file \"%s\": %s\n",
191 				 schemaName, relName, fromfile, strerror(errno));
192 
193 	if (fstat(src_fd, &statbuf) != 0)
194 		pg_fatal("error while copying relation \"%s.%s\": could not stat file \"%s\": %s\n",
195 				 schemaName, relName, fromfile, strerror(errno));
196 
197 	if ((dst_fd = open(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
198 					   pg_file_create_mode)) < 0)
199 		pg_fatal("error while copying relation \"%s.%s\": could not create file \"%s\": %s\n",
200 				 schemaName, relName, tofile, strerror(errno));
201 
202 	/* Save old file size */
203 	src_filesize = statbuf.st_size;
204 
205 	/*
206 	 * Turn each visibility map page into 2 pages one by one. Each new page
207 	 * has the same page header as the old one.  If the last section of the
208 	 * last page is empty, we skip it, mostly to avoid turning one-page
209 	 * visibility maps for small relations into two pages needlessly.
210 	 */
211 	while (totalBytesRead < src_filesize)
212 	{
213 		ssize_t		bytesRead;
214 		char	   *old_cur;
215 		char	   *old_break;
216 		char	   *old_blkend;
217 		PageHeaderData pageheader;
218 		bool		old_lastblk;
219 
220 		if ((bytesRead = read(src_fd, buffer.data, BLCKSZ)) != BLCKSZ)
221 		{
222 			if (bytesRead < 0)
223 				pg_fatal("error while copying relation \"%s.%s\": could not read file \"%s\": %s\n",
224 						 schemaName, relName, fromfile, strerror(errno));
225 			else
226 				pg_fatal("error while copying relation \"%s.%s\": partial page found in file \"%s\"\n",
227 						 schemaName, relName, fromfile);
228 		}
229 
230 		totalBytesRead += BLCKSZ;
231 		old_lastblk = (totalBytesRead == src_filesize);
232 
233 		/* Save the page header data */
234 		memcpy(&pageheader, buffer.data, SizeOfPageHeaderData);
235 
236 		/*
237 		 * These old_* variables point to old visibility map page. old_cur
238 		 * points to current position on old page. old_blkend points to end of
239 		 * old block.  old_break is the end+1 position on the old page for the
240 		 * data that will be transferred to the current new page.
241 		 */
242 		old_cur = buffer.data + SizeOfPageHeaderData;
243 		old_blkend = buffer.data + bytesRead;
244 		old_break = old_cur + rewriteVmBytesPerPage;
245 
246 		while (old_break <= old_blkend)
247 		{
248 			char	   *new_cur;
249 			bool		empty = true;
250 			bool		old_lastpart;
251 
252 			/* First, copy old page header to new page */
253 			memcpy(new_vmbuf.data, &pageheader, SizeOfPageHeaderData);
254 
255 			/* Rewriting the last part of the last old page? */
256 			old_lastpart = old_lastblk && (old_break == old_blkend);
257 
258 			new_cur = new_vmbuf.data + SizeOfPageHeaderData;
259 
260 			/* Process old page bytes one by one, and turn it into new page. */
261 			while (old_cur < old_break)
262 			{
263 				uint8		byte = *(uint8 *) old_cur;
264 				uint16		new_vmbits = 0;
265 				int			i;
266 
267 				/* Generate new format bits while keeping old information */
268 				for (i = 0; i < BITS_PER_BYTE; i++)
269 				{
270 					if (byte & (1 << i))
271 					{
272 						empty = false;
273 						new_vmbits |=
274 							VISIBILITYMAP_ALL_VISIBLE << (BITS_PER_HEAPBLOCK * i);
275 					}
276 				}
277 
278 				/* Copy new visibility map bytes to new-format page */
279 				new_cur[0] = (char) (new_vmbits & 0xFF);
280 				new_cur[1] = (char) (new_vmbits >> 8);
281 
282 				old_cur++;
283 				new_cur += BITS_PER_HEAPBLOCK;
284 			}
285 
286 			/* If the last part of the last page is empty, skip writing it */
287 			if (old_lastpart && empty)
288 				break;
289 
290 			/* Set new checksum for visibility map page, if enabled */
291 			if (new_cluster.controldata.data_checksum_version != 0)
292 				((PageHeader) new_vmbuf.data)->pd_checksum =
293 					pg_checksum_page(new_vmbuf.data, new_blkno);
294 
295 			errno = 0;
296 			if (write(dst_fd, new_vmbuf.data, BLCKSZ) != BLCKSZ)
297 			{
298 				/* if write didn't set errno, assume problem is no disk space */
299 				if (errno == 0)
300 					errno = ENOSPC;
301 				pg_fatal("error while copying relation \"%s.%s\": could not write file \"%s\": %s\n",
302 						 schemaName, relName, tofile, strerror(errno));
303 			}
304 
305 			/* Advance for next new page */
306 			old_break += rewriteVmBytesPerPage;
307 			new_blkno++;
308 		}
309 	}
310 
311 	/* Clean up */
312 	close(dst_fd);
313 	close(src_fd);
314 }
315 
316 void
check_file_clone(void)317 check_file_clone(void)
318 {
319 	char		existing_file[MAXPGPATH];
320 	char		new_link_file[MAXPGPATH];
321 
322 	snprintf(existing_file, sizeof(existing_file), "%s/PG_VERSION", old_cluster.pgdata);
323 	snprintf(new_link_file, sizeof(new_link_file), "%s/PG_VERSION.clonetest", new_cluster.pgdata);
324 	unlink(new_link_file);		/* might fail */
325 
326 #if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE)
327 	if (copyfile(existing_file, new_link_file, NULL, COPYFILE_CLONE_FORCE) < 0)
328 		pg_fatal("could not clone file between old and new data directories: %s\n",
329 				 strerror(errno));
330 #elif defined(__linux__) && defined(FICLONE)
331 	{
332 		int			src_fd;
333 		int			dest_fd;
334 
335 		if ((src_fd = open(existing_file, O_RDONLY | PG_BINARY, 0)) < 0)
336 			pg_fatal("could not open file \"%s\": %s\n",
337 					 existing_file, strerror(errno));
338 
339 		if ((dest_fd = open(new_link_file, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
340 							pg_file_create_mode)) < 0)
341 			pg_fatal("could not create file \"%s\": %s\n",
342 					 new_link_file, strerror(errno));
343 
344 		if (ioctl(dest_fd, FICLONE, src_fd) < 0)
345 			pg_fatal("could not clone file between old and new data directories: %s\n",
346 					 strerror(errno));
347 
348 		close(src_fd);
349 		close(dest_fd);
350 	}
351 #else
352 	pg_fatal("file cloning not supported on this platform\n");
353 #endif
354 
355 	unlink(new_link_file);
356 }
357 
358 void
check_hard_link(void)359 check_hard_link(void)
360 {
361 	char		existing_file[MAXPGPATH];
362 	char		new_link_file[MAXPGPATH];
363 
364 	snprintf(existing_file, sizeof(existing_file), "%s/PG_VERSION", old_cluster.pgdata);
365 	snprintf(new_link_file, sizeof(new_link_file), "%s/PG_VERSION.linktest", new_cluster.pgdata);
366 	unlink(new_link_file);		/* might fail */
367 
368 	if (link(existing_file, new_link_file) < 0)
369 		pg_fatal("could not create hard link between old and new data directories: %s\n"
370 				 "In link mode the old and new data directories must be on the same file system.\n",
371 				 strerror(errno));
372 
373 	unlink(new_link_file);
374 }
375