1 /* 2 * file.c 3 * 4 * file system operations 5 * 6 * Copyright (c) 2010-2019, PostgreSQL Global Development Group 7 * src/bin/pg_upgrade/file.c 8 */ 9 10 #include "postgres_fe.h" 11 12 #include "access/visibilitymap.h" 13 #include "common/file_perm.h" 14 #include "pg_upgrade.h" 15 #include "storage/bufpage.h" 16 #include "storage/checksum.h" 17 #include "storage/checksum_impl.h" 18 19 #include <sys/stat.h> 20 #include <fcntl.h> 21 #ifdef HAVE_COPYFILE_H 22 #include <copyfile.h> 23 #endif 24 #ifdef __linux__ 25 #include <sys/ioctl.h> 26 #include <linux/fs.h> 27 #endif 28 29 30 #ifdef WIN32 31 static int win32_pghardlink(const char *src, const char *dst); 32 #endif 33 34 35 /* 36 * cloneFile() 37 * 38 * Clones/reflinks a relation file from src to dst. 39 * 40 * schemaName/relName are relation's SQL name (used for error messages only). 41 */ 42 void 43 cloneFile(const char *src, const char *dst, 44 const char *schemaName, const char *relName) 45 { 46 #if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE) 47 if (copyfile(src, dst, NULL, COPYFILE_CLONE_FORCE) < 0) 48 pg_fatal("error while cloning relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", 49 schemaName, relName, src, dst, strerror(errno)); 50 #elif defined(__linux__) && defined(FICLONE) 51 int src_fd; 52 int dest_fd; 53 54 if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0) 55 pg_fatal("error while cloning relation \"%s.%s\": could not open file \"%s\": %s\n", 56 schemaName, relName, src, strerror(errno)); 57 58 if ((dest_fd = open(dst, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 59 pg_file_create_mode)) < 0) 60 pg_fatal("error while cloning relation \"%s.%s\": could not create file \"%s\": %s\n", 61 schemaName, relName, dst, strerror(errno)); 62 63 if (ioctl(dest_fd, FICLONE, src_fd) < 0) 64 { 65 unlink(dst); 66 pg_fatal("error while cloning relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", 67 schemaName, relName, src, dst, strerror(errno)); 68 } 69 70 close(src_fd); 71 close(dest_fd); 72 #endif 73 } 74 75 76 /* 77 * copyFile() 78 * 79 * Copies a relation file from src to dst. 80 * schemaName/relName are relation's SQL name (used for error messages only). 81 */ 82 void 83 copyFile(const char *src, const char *dst, 84 const char *schemaName, const char *relName) 85 { 86 #ifndef WIN32 87 int src_fd; 88 int dest_fd; 89 char *buffer; 90 91 if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0) 92 pg_fatal("error while copying relation \"%s.%s\": could not open file \"%s\": %s\n", 93 schemaName, relName, src, strerror(errno)); 94 95 if ((dest_fd = open(dst, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 96 pg_file_create_mode)) < 0) 97 pg_fatal("error while copying relation \"%s.%s\": could not create file \"%s\": %s\n", 98 schemaName, relName, dst, strerror(errno)); 99 100 /* copy in fairly large chunks for best efficiency */ 101 #define COPY_BUF_SIZE (50 * BLCKSZ) 102 103 buffer = (char *) pg_malloc(COPY_BUF_SIZE); 104 105 /* perform data copying i.e read src source, write to destination */ 106 while (true) 107 { 108 ssize_t nbytes = read(src_fd, buffer, COPY_BUF_SIZE); 109 110 if (nbytes < 0) 111 pg_fatal("error while copying relation \"%s.%s\": could not read file \"%s\": %s\n", 112 schemaName, relName, src, strerror(errno)); 113 114 if (nbytes == 0) 115 break; 116 117 errno = 0; 118 if (write(dest_fd, buffer, nbytes) != nbytes) 119 { 120 /* if write didn't set errno, assume problem is no disk space */ 121 if (errno == 0) 122 errno = ENOSPC; 123 pg_fatal("error while copying relation \"%s.%s\": could not write file \"%s\": %s\n", 124 schemaName, relName, dst, strerror(errno)); 125 } 126 } 127 128 pg_free(buffer); 129 close(src_fd); 130 close(dest_fd); 131 132 #else /* WIN32 */ 133 134 if (CopyFile(src, dst, true) == 0) 135 { 136 _dosmaperr(GetLastError()); 137 pg_fatal("error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", 138 schemaName, relName, src, dst, strerror(errno)); 139 } 140 141 #endif /* WIN32 */ 142 } 143 144 145 /* 146 * linkFile() 147 * 148 * Hard-links a relation file from src to dst. 149 * schemaName/relName are relation's SQL name (used for error messages only). 150 */ 151 void 152 linkFile(const char *src, const char *dst, 153 const char *schemaName, const char *relName) 154 { 155 if (pg_link_file(src, dst) < 0) 156 pg_fatal("error while creating link for relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", 157 schemaName, relName, src, dst, strerror(errno)); 158 } 159 160 161 /* 162 * rewriteVisibilityMap() 163 * 164 * Transform a visibility map file, copying from src to dst. 165 * schemaName/relName are relation's SQL name (used for error messages only). 166 * 167 * In versions of PostgreSQL prior to catversion 201603011, PostgreSQL's 168 * visibility map included one bit per heap page; it now includes two. 169 * When upgrading a cluster from before that time to a current PostgreSQL 170 * version, we could refuse to copy visibility maps from the old cluster 171 * to the new cluster; the next VACUUM would recreate them, but at the 172 * price of scanning the entire table. So, instead, we rewrite the old 173 * visibility maps in the new format. That way, the all-visible bits 174 * remain set for the pages for which they were set previously. The 175 * all-frozen bits are never set by this conversion; we leave that to VACUUM. 176 */ 177 void 178 rewriteVisibilityMap(const char *fromfile, const char *tofile, 179 const char *schemaName, const char *relName) 180 { 181 int src_fd; 182 int dst_fd; 183 PGAlignedBlock buffer; 184 PGAlignedBlock new_vmbuf; 185 ssize_t totalBytesRead = 0; 186 ssize_t src_filesize; 187 int rewriteVmBytesPerPage; 188 BlockNumber new_blkno = 0; 189 struct stat statbuf; 190 191 /* Compute number of old-format bytes per new page */ 192 rewriteVmBytesPerPage = (BLCKSZ - SizeOfPageHeaderData) / 2; 193 194 if ((src_fd = open(fromfile, O_RDONLY | PG_BINARY, 0)) < 0) 195 pg_fatal("error while copying relation \"%s.%s\": could not open file \"%s\": %s\n", 196 schemaName, relName, fromfile, strerror(errno)); 197 198 if (fstat(src_fd, &statbuf) != 0) 199 pg_fatal("error while copying relation \"%s.%s\": could not stat file \"%s\": %s\n", 200 schemaName, relName, fromfile, strerror(errno)); 201 202 if ((dst_fd = open(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 203 pg_file_create_mode)) < 0) 204 pg_fatal("error while copying relation \"%s.%s\": could not create file \"%s\": %s\n", 205 schemaName, relName, tofile, strerror(errno)); 206 207 /* Save old file size */ 208 src_filesize = statbuf.st_size; 209 210 /* 211 * Turn each visibility map page into 2 pages one by one. Each new page 212 * has the same page header as the old one. If the last section of the 213 * last page is empty, we skip it, mostly to avoid turning one-page 214 * visibility maps for small relations into two pages needlessly. 215 */ 216 while (totalBytesRead < src_filesize) 217 { 218 ssize_t bytesRead; 219 char *old_cur; 220 char *old_break; 221 char *old_blkend; 222 PageHeaderData pageheader; 223 bool old_lastblk; 224 225 if ((bytesRead = read(src_fd, buffer.data, BLCKSZ)) != BLCKSZ) 226 { 227 if (bytesRead < 0) 228 pg_fatal("error while copying relation \"%s.%s\": could not read file \"%s\": %s\n", 229 schemaName, relName, fromfile, strerror(errno)); 230 else 231 pg_fatal("error while copying relation \"%s.%s\": partial page found in file \"%s\"\n", 232 schemaName, relName, fromfile); 233 } 234 235 totalBytesRead += BLCKSZ; 236 old_lastblk = (totalBytesRead == src_filesize); 237 238 /* Save the page header data */ 239 memcpy(&pageheader, buffer.data, SizeOfPageHeaderData); 240 241 /* 242 * These old_* variables point to old visibility map page. old_cur 243 * points to current position on old page. old_blkend points to end of 244 * old block. old_break is the end+1 position on the old page for the 245 * data that will be transferred to the current new page. 246 */ 247 old_cur = buffer.data + SizeOfPageHeaderData; 248 old_blkend = buffer.data + bytesRead; 249 old_break = old_cur + rewriteVmBytesPerPage; 250 251 while (old_break <= old_blkend) 252 { 253 char *new_cur; 254 bool empty = true; 255 bool old_lastpart; 256 257 /* First, copy old page header to new page */ 258 memcpy(new_vmbuf.data, &pageheader, SizeOfPageHeaderData); 259 260 /* Rewriting the last part of the last old page? */ 261 old_lastpart = old_lastblk && (old_break == old_blkend); 262 263 new_cur = new_vmbuf.data + SizeOfPageHeaderData; 264 265 /* Process old page bytes one by one, and turn it into new page. */ 266 while (old_cur < old_break) 267 { 268 uint8 byte = *(uint8 *) old_cur; 269 uint16 new_vmbits = 0; 270 int i; 271 272 /* Generate new format bits while keeping old information */ 273 for (i = 0; i < BITS_PER_BYTE; i++) 274 { 275 if (byte & (1 << i)) 276 { 277 empty = false; 278 new_vmbits |= 279 VISIBILITYMAP_ALL_VISIBLE << (BITS_PER_HEAPBLOCK * i); 280 } 281 } 282 283 /* Copy new visibility map bytes to new-format page */ 284 new_cur[0] = (char) (new_vmbits & 0xFF); 285 new_cur[1] = (char) (new_vmbits >> 8); 286 287 old_cur++; 288 new_cur += BITS_PER_HEAPBLOCK; 289 } 290 291 /* If the last part of the last page is empty, skip writing it */ 292 if (old_lastpart && empty) 293 break; 294 295 /* Set new checksum for visibility map page, if enabled */ 296 if (new_cluster.controldata.data_checksum_version != 0) 297 ((PageHeader) new_vmbuf.data)->pd_checksum = 298 pg_checksum_page(new_vmbuf.data, new_blkno); 299 300 errno = 0; 301 if (write(dst_fd, new_vmbuf.data, BLCKSZ) != BLCKSZ) 302 { 303 /* if write didn't set errno, assume problem is no disk space */ 304 if (errno == 0) 305 errno = ENOSPC; 306 pg_fatal("error while copying relation \"%s.%s\": could not write file \"%s\": %s\n", 307 schemaName, relName, tofile, strerror(errno)); 308 } 309 310 /* Advance for next new page */ 311 old_break += rewriteVmBytesPerPage; 312 new_blkno++; 313 } 314 } 315 316 /* Clean up */ 317 close(dst_fd); 318 close(src_fd); 319 } 320 321 void 322 check_file_clone(void) 323 { 324 char existing_file[MAXPGPATH]; 325 char new_link_file[MAXPGPATH]; 326 327 snprintf(existing_file, sizeof(existing_file), "%s/PG_VERSION", old_cluster.pgdata); 328 snprintf(new_link_file, sizeof(new_link_file), "%s/PG_VERSION.clonetest", new_cluster.pgdata); 329 unlink(new_link_file); /* might fail */ 330 331 #if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE) 332 if (copyfile(existing_file, new_link_file, NULL, COPYFILE_CLONE_FORCE) < 0) 333 pg_fatal("could not clone file between old and new data directories: %s\n", 334 strerror(errno)); 335 #elif defined(__linux__) && defined(FICLONE) 336 { 337 int src_fd; 338 int dest_fd; 339 340 if ((src_fd = open(existing_file, O_RDONLY | PG_BINARY, 0)) < 0) 341 pg_fatal("could not open file \"%s\": %s\n", 342 existing_file, strerror(errno)); 343 344 if ((dest_fd = open(new_link_file, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 345 pg_file_create_mode)) < 0) 346 pg_fatal("could not create file \"%s\": %s\n", 347 new_link_file, strerror(errno)); 348 349 if (ioctl(dest_fd, FICLONE, src_fd) < 0) 350 pg_fatal("could not clone file between old and new data directories: %s\n", 351 strerror(errno)); 352 353 close(src_fd); 354 close(dest_fd); 355 } 356 #else 357 pg_fatal("file cloning not supported on this platform\n"); 358 #endif 359 360 unlink(new_link_file); 361 } 362 363 void 364 check_hard_link(void) 365 { 366 char existing_file[MAXPGPATH]; 367 char new_link_file[MAXPGPATH]; 368 369 snprintf(existing_file, sizeof(existing_file), "%s/PG_VERSION", old_cluster.pgdata); 370 snprintf(new_link_file, sizeof(new_link_file), "%s/PG_VERSION.linktest", new_cluster.pgdata); 371 unlink(new_link_file); /* might fail */ 372 373 if (pg_link_file(existing_file, new_link_file) < 0) 374 pg_fatal("could not create hard link between old and new data directories: %s\n" 375 "In link mode the old and new data directories must be on the same file system.\n", 376 strerror(errno)); 377 378 unlink(new_link_file); 379 } 380 381 #ifdef WIN32 382 /* implementation of pg_link_file() on Windows */ 383 static int 384 win32_pghardlink(const char *src, const char *dst) 385 { 386 /* 387 * CreateHardLinkA returns zero for failure 388 * http://msdn.microsoft.com/en-us/library/aa363860(VS.85).aspx 389 */ 390 if (CreateHardLinkA(dst, src, NULL) == 0) 391 { 392 _dosmaperr(GetLastError()); 393 return -1; 394 } 395 else 396 return 0; 397 } 398 #endif 399