/*-------------------------------------------------------------------------
 *
 * md.c
 *	  This code manages relations that reside on magnetic disk.
 *
 * Or at least, that was what the Berkeley folk had in mind when they named
 * this file.  In reality, what this code provides is an interface from
 * the smgr API to Unix-like filesystem APIs, so it will work with any type
 * of device for which the operating system provides filesystem support.
 * It doesn't matter whether the bits are on spinning rust or some other
 * storage technology.
 *
 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/storage/smgr/md.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <unistd.h>
#include <fcntl.h>
#include <sys/file.h>

#include "miscadmin.h"
#include "access/xlogutils.h"
#include "access/xlog.h"
#include "pgstat.h"
#include "portability/instr_time.h"
#include "postmaster/bgwriter.h"
#include "storage/fd.h"
#include "storage/bufmgr.h"
#include "storage/relfilenode.h"
#include "storage/smgr.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"
#include "pg_trace.h"


/* intervals for calling AbsorbFsyncRequests in mdsync and mdpostckpt */
#define FSYNCS_PER_ABSORB		10
#define UNLINKS_PER_ABSORB		10

/*
 * Special values for the segno arg to RememberFsyncRequest.
 *
 * Note that CompactCheckpointerRequestQueue assumes that it's OK to remove an
 * fsync request from the queue if an identical, subsequent request is found.
 * See comments there before making changes here.
 */
#define FORGET_RELATION_FSYNC	(InvalidBlockNumber)
#define FORGET_DATABASE_FSYNC	(InvalidBlockNumber-1)
#define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)

/*
 * On Windows, we have to interpret EACCES as possibly meaning the same as
 * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
 * that's what you get.  Ugh.  This code is designed so that we don't
 * actually believe these cases are okay without further evidence (namely,
 * a pending fsync request getting canceled ... see mdsync).
 */
#ifndef WIN32
#define FILE_POSSIBLY_DELETED(err)	((err) == ENOENT)
#else
#define FILE_POSSIBLY_DELETED(err)	((err) == ENOENT || (err) == EACCES)
#endif

/*
 *	The magnetic disk storage manager keeps track of open file
 *	descriptors in its own descriptor pool.  This is done to make it
 *	easier to support relations that are larger than the operating
 *	system's file size limit (often 2GBytes).  In order to do that,
 *	we break relations up into "segment" files that are each shorter than
 *	the OS file size limit.  The segment size is set by the RELSEG_SIZE
 *	configuration constant in pg_config.h.
 *
 *	On disk, a relation must consist of consecutively numbered segment
 *	files in the pattern
 *		-- Zero or more full segments of exactly RELSEG_SIZE blocks each
 *		-- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
 *		-- Optionally, any number of inactive segments of size 0 blocks.
 *	The full and partial segments are collectively the "active" segments.
 *	Inactive segments are those that once contained data but are currently
 *	not needed because of an mdtruncate() operation.  The reason for leaving
 *	them present at size zero, rather than unlinking them, is that other
 *	backends and/or the checkpointer might be holding open file references to
 *	such segments.  If the relation expands again after mdtruncate(), such
 *	that a deactivated segment becomes active again, it is important that
 *	such file references still be valid --- else data might get written
 *	out to an unlinked old copy of a segment file that will eventually
 *	disappear.
 *
 *	File descriptors are stored in the per-fork md_seg_fds arrays inside
 *	SMgrRelation.  The length of these arrays is stored in md_num_open_segs.
 *	Note that a fork's md_num_open_segs having a specific value does not
 *	necessarily mean the relation doesn't have additional segments; we may
 *	just not have opened the next segment yet.  (We could not have "all
 *	segments are in the array" as an invariant anyway, since another backend
 *	could extend the relation while we aren't looking.)  We do not have
 *	entries for inactive segments, however; as soon as we find a partial
 *	segment, we assume that any subsequent segments are inactive.
 *
 *	The entire MdfdVec array is palloc'd in the MdCxt memory context.
 */

/* One open segment file of a relation fork. */
typedef struct _MdfdVec
{
	File		mdfd_vfd;		/* fd number in fd.c's pool */
	BlockNumber mdfd_segno;		/* segment number, from 0 */
} MdfdVec;

static MemoryContext MdCxt;		/* context for all MdfdVec objects */


/*
 * In some contexts (currently, standalone backends and the checkpointer)
 * we keep track of pending fsync operations: we need to remember all relation
 * segments that have been written since the last checkpoint, so that we can
 * fsync them down to disk before completing the next checkpoint.  This hash
 * table remembers the pending operations.  We use a hash table mostly as
 * a convenient way of merging duplicate requests.
 *
 * We use a similar mechanism to remember no-longer-needed files that can
 * be deleted after the next checkpoint, but we use a linked list instead of
 * a hash table, because we don't expect there to be any duplicate requests.
 *
 * These mechanisms are only used for non-temp relations; we never fsync
 * temp rels, nor do we need to postpone their deletion (see comments in
 * mdunlink).
 *
 * (Regular backends do not track pending operations locally, but forward
 * them to the checkpointer.)
 */
typedef uint16 CycleCtr;		/* can be any convenient integer size */

typedef struct
{
	RelFileNode rnode;			/* hash table key (must be first!) */
	CycleCtr	cycle_ctr;		/* mdsync_cycle_ctr of oldest request */
	/* requests[f] has bit n set if we need to fsync segment n of fork f */
	Bitmapset  *requests[MAX_FORKNUM + 1];
	/* canceled[f] is true if we canceled fsyncs for fork "recently" */
	bool		canceled[MAX_FORKNUM + 1];
} PendingOperationEntry;

typedef struct
{
	RelFileNode rnode;			/* the dead relation to delete */
	CycleCtr	cycle_ctr;		/* mdckpt_cycle_ctr when request was made */
} PendingUnlinkEntry;

static HTAB *pendingOpsTable = NULL;
static List *pendingUnlinks = NIL;
static MemoryContext pendingOpsCxt; /* context for the above */

static CycleCtr mdsync_cycle_ctr = 0;
static CycleCtr mdckpt_cycle_ctr = 0;


/*** behavior for mdopen & _mdfd_getseg ***/
/* ereport if segment not present */
#define EXTENSION_FAIL				(1 << 0)
/* return NULL if segment not present */
#define EXTENSION_RETURN_NULL		(1 << 1)
/* create new segments as needed */
#define EXTENSION_CREATE			(1 << 2)
/* create new segments if needed during recovery */
#define EXTENSION_CREATE_RECOVERY	(1 << 3)
/*
 * Allow opening segments which are preceded by segments smaller than
 * RELSEG_SIZE, e.g. inactive segments (see above).  Note that this is breaks
 * mdnblocks() and related functionality henceforth - which currently is ok,
 * because this is only required in the checkpointer which never uses
 * mdnblocks().
 */
#define EXTENSION_DONT_CHECK_SIZE	(1 << 4)


/* local routines */
static void mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum,
			 bool isRedo);
static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum, int behavior);
static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
					   MdfdVec *seg);
static void register_unlink(RelFileNodeBackend rnode);
static void _fdvec_resize(SMgrRelation reln,
			  ForkNumber forknum,
			  int nseg);
static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum,
			  BlockNumber segno);
static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forkno,
			  BlockNumber segno, int oflags);
static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forkno,
			 BlockNumber blkno, bool skipFsync, int behavior);
static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
		   MdfdVec *seg);


/*
 *	mdinit() -- Initialize private state for magnetic disk storage manager.
 */
void
mdinit(void)
{
	MdCxt = AllocSetContextCreate(TopMemoryContext,
								  "MdSmgr",
								  ALLOCSET_DEFAULT_SIZES);

	/*
	 * Create pending-operations hashtable if we need it.  Currently, we need
	 * it if we are standalone (not under a postmaster) or if we are a startup
	 * or checkpointer auxiliary process.
	 */
	if (!IsUnderPostmaster || AmStartupProcess() || AmCheckpointerProcess())
	{
		HASHCTL		hash_ctl;

		/*
		 * XXX: The checkpointer needs to add entries to the pending ops table
		 * when absorbing fsync requests.  That is done within a critical
		 * section, which isn't usually allowed, but we make an exception. It
		 * means that there's a theoretical possibility that you run out of
		 * memory while absorbing fsync requests, which leads to a PANIC.
		 * Fortunately the hash table is small so that's unlikely to happen in
		 * practice.
		 */
		pendingOpsCxt = AllocSetContextCreate(MdCxt,
											  "Pending ops context",
											  ALLOCSET_DEFAULT_SIZES);
		MemoryContextAllowInCriticalSection(pendingOpsCxt, true);

		MemSet(&hash_ctl, 0, sizeof(hash_ctl));
		hash_ctl.keysize = sizeof(RelFileNode);
		hash_ctl.entrysize = sizeof(PendingOperationEntry);
		hash_ctl.hcxt = pendingOpsCxt;
		pendingOpsTable = hash_create("Pending Ops Table",
									  100L,
									  &hash_ctl,
									  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
		pendingUnlinks = NIL;
	}
}

/*
 * In archive recovery, we rely on checkpointer to do fsyncs, but we will have
 * already created the pendingOpsTable during initialization of the startup
 * process.  Calling this function drops the local pendingOpsTable so that
 * subsequent requests will be forwarded to checkpointer.
 */
void
SetForwardFsyncRequests(void)
{
	/* Perform any pending fsyncs we may have queued up, then drop table */
	if (pendingOpsTable)
	{
		mdsync();
		hash_destroy(pendingOpsTable);
	}
	pendingOpsTable = NULL;

	/*
	 * We should not have any pending unlink requests, since mdunlink doesn't
	 * queue unlink requests when isRedo.
	 */
	Assert(pendingUnlinks == NIL);
}

/*
 *	mdexists() -- Does the physical file exist?
 *
 * Note: this will return true for lingering files, with pending deletions
 */
bool
mdexists(SMgrRelation reln, ForkNumber forkNum)
{
	/*
	 * Close it first, to ensure that we notice if the fork has been unlinked
	 * since we opened it.
	 */
	mdclose(reln, forkNum);

	/* EXTENSION_RETURN_NULL makes a missing file yield false, not ERROR */
	return (mdopen(reln, forkNum, EXTENSION_RETURN_NULL) != NULL);
}

/*
 *	mdcreate() -- Create a new relation on magnetic disk.
 *
 * If isRedo is true, it's okay for the relation to exist already.
 */
void
mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
{
	MdfdVec    *mdfd;
	char	   *path;
	File		fd;

	if (isRedo && reln->md_num_open_segs[forkNum] > 0)
		return;					/* created and opened already... */

	Assert(reln->md_num_open_segs[forkNum] == 0);

	path = relpath(reln->smgr_rnode, forkNum);

	/* O_EXCL: normally we insist on creating the file ourselves */
	fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);

	if (fd < 0)
	{
		int			save_errno = errno;

		/*
		 * During bootstrap, there are cases where a system relation will be
		 * accessed (by internal backend processes) before the bootstrap
		 * script nominally creates it.  Therefore, allow the file to exist
		 * already, even if isRedo is not set.  (See also mdopen)
		 */
		if (isRedo || IsBootstrapProcessingMode())
			fd = PathNameOpenFile(path, O_RDWR | PG_BINARY);
		if (fd < 0)
		{
			/* be sure to report the error reported by create, not open */
			errno = save_errno;
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not create file \"%s\": %m", path)));
		}
	}

	pfree(path);

	/* install the new segment 0 as the fork's only open segment */
	_fdvec_resize(reln, forkNum, 1);
	mdfd = &reln->md_seg_fds[forkNum][0];
	mdfd->mdfd_vfd = fd;
	mdfd->mdfd_segno = 0;
}

/*
 *	mdunlink() -- Unlink a relation.
 *
 * Note that we're passed a RelFileNodeBackend --- by the time this is called,
 * there won't be an SMgrRelation hashtable entry anymore.
 *
 * forkNum can be a fork number to delete a specific fork, or InvalidForkNumber
 * to delete all forks.
 *
 * For regular relations, we don't unlink the first segment file of the rel,
 * but just truncate it to zero length, and record a request to unlink it after
 * the next checkpoint.  Additional segments can be unlinked immediately,
 * however.  Leaving the empty file in place prevents that relfilenode
 * number from being reused.  The scenario this protects us from is:
 * 1. We delete a relation (and commit, and actually remove its file).
 * 2. We create a new relation, which by chance gets the same relfilenode as
 *	  the just-deleted one (OIDs must've wrapped around for that to happen).
 * 3. We crash before another checkpoint occurs.
 * During replay, we would delete the file and then recreate it, which is fine
 * if the contents of the file were repopulated by subsequent WAL entries.
 * But if we didn't WAL-log insertions, but instead relied on fsyncing the
 * file after populating it (as for instance CLUSTER and CREATE INDEX do),
 * the contents of the file would be lost forever.  By leaving the empty file
 * until after the next checkpoint, we prevent reassignment of the relfilenode
 * number until it's safe, because relfilenode assignment skips over any
 * existing file.
 *
 * We do not need to go through this dance for temp relations, though, because
 * we never make WAL entries for temp rels, and so a temp rel poses no threat
 * to the health of a regular rel that has taken over its relfilenode number.
 * The fact that temp rels and regular rels have different file naming
 * patterns provides additional safety.
 *
 * All the above applies only to the relation's main fork; other forks can
 * just be removed immediately, since they are not needed to prevent the
 * relfilenode number from being recycled.  Also, we do not carefully
 * track whether other forks have been created or not, but just attempt to
 * unlink them unconditionally; so we should never complain about ENOENT.
 *
 * If isRedo is true, it's unsurprising for the relation to be already gone.
 * Also, we should remove the file immediately instead of queuing a request
 * for later, since during redo there's no possibility of creating a
 * conflicting relation.
 *
 * Note: any failure should be reported as WARNING not ERROR, because
 * we are usually not in a transaction anymore when this is called.
 */
void
mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
{
	/*
	 * We have to clean out any pending fsync requests for the doomed
	 * relation, else the next mdsync() will fail.  There can't be any such
	 * requests for a temp relation, though.  We can send just one request
	 * even when deleting multiple forks, since the fsync queuing code accepts
	 * the "InvalidForkNumber = all forks" convention.
	 */
	if (!RelFileNodeBackendIsTemp(rnode))
		ForgetRelationFsyncRequests(rnode.node, forkNum);

	/* Now do the per-fork work */
	if (forkNum == InvalidForkNumber)
	{
		for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++)
			mdunlinkfork(rnode, forkNum, isRedo);
	}
	else
		mdunlinkfork(rnode, forkNum, isRedo);
}

/*
 * Truncate a file to release disk space.
 *
 * Returns 0 on success, -1 on failure; on failure errno is preserved for
 * the caller (see below) and a WARNING has already been issued unless the
 * failure was ENOENT.
 */
static int
do_truncate(const char *path)
{
	int			save_errno;
	int			ret;
	int			fd;

	/* truncate(2) would be easier here, but Windows hasn't got it */
	fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
	if (fd >= 0)
	{
		ret = ftruncate(fd, 0);
		/* preserve ftruncate's errno across the close */
		save_errno = errno;
		CloseTransientFile(fd);
		errno = save_errno;
	}
	else
		ret = -1;

	/* Log a warning here to avoid repetition in callers. */
	if (ret < 0 && errno != ENOENT)
	{
		save_errno = errno;
		ereport(WARNING,
				(errcode_for_file_access(),
				 errmsg("could not truncate file \"%s\": %m", path)));
		errno = save_errno;
	}

	return ret;
}

/*
 * Unlink (or truncate-and-defer-unlink) one fork of a relation; workhorse
 * for mdunlink().  See mdunlink's header comment for the full rationale.
 */
static void
mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
{
	char	   *path;
	int			ret;

	path = relpath(rnode, forkNum);

	/*
	 * Delete or truncate the first segment.
	 */
	if (isRedo || forkNum != MAIN_FORKNUM || RelFileNodeBackendIsTemp(rnode))
	{
		if (!RelFileNodeBackendIsTemp(rnode))
		{
			/* Prevent other backends' fds from holding on to the disk space */
			ret = do_truncate(path);
		}
		else
			ret = 0;

		/* Next unlink the file, unless it was already found to be missing */
		if (ret == 0 || errno != ENOENT)
		{
			ret = unlink(path);
			if (ret < 0 && errno != ENOENT)
				ereport(WARNING,
						(errcode_for_file_access(),
						 errmsg("could not remove file \"%s\": %m", path)));
		}
	}
	else
	{
		/* Prevent other backends' fds from holding on to the disk space */
		ret = do_truncate(path);

		/* Register request to unlink first segment later */
		register_unlink(rnode);
	}

	/*
	 * Delete any additional segments.
	 */
	if (ret >= 0)
	{
		/* room for path plus '.' plus a segment number in decimal */
		char	   *segpath = (char *) palloc(strlen(path) + 12);
		BlockNumber segno;

		/*
		 * Note that because we loop until getting ENOENT, we will correctly
		 * remove all inactive segments as well as active ones.
		 */
		for (segno = 1;; segno++)
		{
			sprintf(segpath, "%s.%u", path, segno);

			if (!RelFileNodeBackendIsTemp(rnode))
			{
				/*
				 * Prevent other backends' fds from holding on to the disk
				 * space.
				 */
				if (do_truncate(segpath) < 0 && errno == ENOENT)
					break;
			}

			if (unlink(segpath) < 0)
			{
				/* ENOENT is expected after the last segment... */
				if (errno != ENOENT)
					ereport(WARNING,
							(errcode_for_file_access(),
							 errmsg("could not remove file \"%s\": %m", segpath)));
				break;
			}
		}
		pfree(segpath);
	}

	pfree(path);
}

/*
 *	mdextend() -- Add a block to the specified relation.
 *
 *		The semantics are nearly the same as mdwrite(): write at the
 *		specified position.  However, this is to be used for the case of
 *		extending a relation (i.e., blocknum is at or beyond the current
 *		EOF).
 *		Note that we assume writing a block beyond current EOF
 *		causes intervening file space to become filled with zeroes.
 */
void
mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
		 char *buffer, bool skipFsync)
{
	off_t		seekpos;
	int			nbytes;
	MdfdVec    *v;

	/* This assert is too expensive to have on normally ... */
#ifdef CHECK_WRITE_VS_EXTEND
	Assert(blocknum >= mdnblocks(reln, forknum));
#endif

	/*
	 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
	 * more --- we mustn't create a block whose number actually is
	 * InvalidBlockNumber.  (Note that this failure should be unreachable
	 * because of upstream checks in bufmgr.c.)
	 */
	if (blocknum == InvalidBlockNumber)
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("cannot extend file \"%s\" beyond %u blocks",
						relpath(reln->smgr_rnode, forknum),
						InvalidBlockNumber)));

	v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);

	/* offset of the block within its segment file */
	seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));

	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

	/*
	 * Note: because caller usually obtained blocknum by calling mdnblocks,
	 * which did a seek(SEEK_END), this seek is often redundant and will be
	 * optimized away by fd.c.  It's not redundant, however, if there is a
	 * partial page at the end of the file.  In that case we want to try to
	 * overwrite the partial page with a full page.  It's also not redundant
	 * if bufmgr.c had to dump another buffer of the same file to make room
	 * for the new page's buffer.
	 */
	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not seek to block %u in file \"%s\": %m",
						blocknum, FilePathName(v->mdfd_vfd))));

	if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
	{
		if (nbytes < 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not extend file \"%s\": %m",
							FilePathName(v->mdfd_vfd)),
					 errhint("Check free disk space.")));
		/* short write: complain appropriately */
		ereport(ERROR,
				(errcode(ERRCODE_DISK_FULL),
				 errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
						FilePathName(v->mdfd_vfd),
						nbytes, BLCKSZ, blocknum),
				 errhint("Check free disk space.")));
	}

	if (!skipFsync && !SmgrIsTemp(reln))
		register_dirty_segment(reln, forknum, v);

	Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
}

/*
 *	mdopen() -- Open the specified relation.
 *
 * Note we only open the first segment, when there are multiple segments.
 *
 * If first segment is not present, either ereport or return NULL according
 * to "behavior".  We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
 * EXTENSION_CREATE means it's OK to extend an existing relation, not to
 * invent one out of whole cloth.
 */
static MdfdVec *
mdopen(SMgrRelation reln, ForkNumber forknum, int behavior)
{
	MdfdVec    *mdfd;
	char	   *path;
	File		fd;

	/* No work if already open */
	if (reln->md_num_open_segs[forknum] > 0)
		return &reln->md_seg_fds[forknum][0];

	path = relpath(reln->smgr_rnode, forknum);

	fd = PathNameOpenFile(path, O_RDWR | PG_BINARY);

	if (fd < 0)
	{
		/*
		 * During bootstrap, there are cases where a system relation will be
		 * accessed (by internal backend processes) before the bootstrap
		 * script nominally creates it.  Therefore, accept mdopen() as a
		 * substitute for mdcreate() in bootstrap mode only. (See mdcreate)
		 */
		if (IsBootstrapProcessingMode())
			fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
		if (fd < 0)
		{
			if ((behavior & EXTENSION_RETURN_NULL) &&
				FILE_POSSIBLY_DELETED(errno))
			{
				pfree(path);
				return NULL;
			}
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", path)));
		}
	}

	pfree(path);

	/* record segment 0 as the fork's only open segment */
	_fdvec_resize(reln, forknum, 1);
	mdfd = &reln->md_seg_fds[forknum][0];
	mdfd->mdfd_vfd = fd;
	mdfd->mdfd_segno = 0;

	Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));

	return mdfd;
}

/*
 *	mdclose() -- Close the specified relation, if it isn't closed already.
 */
void
mdclose(SMgrRelation reln, ForkNumber forknum)
{
	int			nopensegs = reln->md_num_open_segs[forknum];

	/* No work if already closed */
	if (nopensegs == 0)
		return;

	/* close segments starting from the end, shrinking the array as we go */
	while (nopensegs > 0)
	{
		MdfdVec    *v = &reln->md_seg_fds[forknum][nopensegs - 1];

		FileClose(v->mdfd_vfd);
		_fdvec_resize(reln, forknum, nopensegs - 1);
		nopensegs--;
	}
}

/*
 *	mdprefetch() -- Initiate asynchronous read of the specified block of a relation
 */
void
mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
{
#ifdef USE_PREFETCH
	off_t		seekpos;
	MdfdVec    *v;

	v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);

	seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));

	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

	/* advisory only; failure to prefetch is deliberately ignored */
	(void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ, WAIT_EVENT_DATA_FILE_PREFETCH);
#endif							/* USE_PREFETCH */
}

/*
 * mdwriteback() -- Tell the kernel to write pages back to storage.
 *
 * This accepts a range of blocks because flushing several pages at once is
 * considerably more efficient than doing so individually.
 */
void
mdwriteback(SMgrRelation reln, ForkNumber forknum,
			BlockNumber blocknum, BlockNumber nblocks)
{
	/*
	 * Issue flush requests in as few requests as possible; have to split at
	 * segment boundaries though, since those are actually separate files.
	 */
	while (nblocks > 0)
	{
		BlockNumber nflush = nblocks;
		off_t		seekpos;
		MdfdVec    *v;
		int			segnum_start,
					segnum_end;

		v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ ,
						 EXTENSION_RETURN_NULL);

		/*
		 * We might be flushing buffers of already removed relations, that's
		 * ok, just ignore that case.
		 */
		if (!v)
			return;

		/* compute offset inside the current segment */
		segnum_start = blocknum / RELSEG_SIZE;

		/* compute number of desired writes within the current segment */
		segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE;
		if (segnum_start != segnum_end)
			nflush = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE));

		Assert(nflush >= 1);
		Assert(nflush <= nblocks);

		seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));

		FileWriteback(v->mdfd_vfd, seekpos, (off_t) BLCKSZ * nflush, WAIT_EVENT_DATA_FILE_FLUSH);

		nblocks -= nflush;
		blocknum += nflush;
	}
}

/*
 *	mdread() -- Read the specified block from a relation.
 */
void
mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
	   char *buffer)
{
	off_t		seekpos;
	int			nbytes;
	MdfdVec    *v;

	TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
										reln->smgr_rnode.node.spcNode,
										reln->smgr_rnode.node.dbNode,
										reln->smgr_rnode.node.relNode,
										reln->smgr_rnode.backend);

	v = _mdfd_getseg(reln, forknum, blocknum, false,
					 EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);

	seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));

	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not seek to block %u in file \"%s\": %m",
						blocknum, FilePathName(v->mdfd_vfd))));

	nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ, WAIT_EVENT_DATA_FILE_READ);

	TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
									   reln->smgr_rnode.node.spcNode,
									   reln->smgr_rnode.node.dbNode,
									   reln->smgr_rnode.node.relNode,
									   reln->smgr_rnode.backend,
									   nbytes,
									   BLCKSZ);

	if (nbytes != BLCKSZ)
	{
		if (nbytes < 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not read block %u in file \"%s\": %m",
							blocknum, FilePathName(v->mdfd_vfd))));

		/*
		 * Short read: we are at or past EOF, or we read a partial block at
		 * EOF.  Normally this is an error; upper levels should never try to
		 * read a nonexistent block.  However, if zero_damaged_pages is ON or
		 * we are InRecovery, we should instead return zeroes without
		 * complaining.  This allows, for example, the case of trying to
		 * update a block that was later truncated away.
		 */
		if (zero_damaged_pages || InRecovery)
			MemSet(buffer, 0, BLCKSZ);
		else
			ereport(ERROR,
					(errcode(ERRCODE_DATA_CORRUPTED),
					 errmsg("could not read block %u in file \"%s\": read only %d of %d bytes",
							blocknum, FilePathName(v->mdfd_vfd),
							nbytes, BLCKSZ)));
	}
}

/*
 *	mdwrite() -- Write the supplied block at the appropriate location.
 *
 *		This is to be used only for updating already-existing blocks of a
 *		relation (ie, those before the current EOF).  To extend a relation,
 *		use mdextend().
 */
void
mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
		char *buffer, bool skipFsync)
{
	off_t		seekpos;
	int			nbytes;
	MdfdVec    *v;

	/* This assert is too expensive to have on normally ... */
#ifdef CHECK_WRITE_VS_EXTEND
	Assert(blocknum < mdnblocks(reln, forknum));
#endif

	TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
										 reln->smgr_rnode.node.spcNode,
										 reln->smgr_rnode.node.dbNode,
										 reln->smgr_rnode.node.relNode,
										 reln->smgr_rnode.backend);

	v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
					 EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);

	seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));

	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not seek to block %u in file \"%s\": %m",
						blocknum, FilePathName(v->mdfd_vfd))));

	nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, WAIT_EVENT_DATA_FILE_WRITE);

	TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
										reln->smgr_rnode.node.spcNode,
										reln->smgr_rnode.node.dbNode,
										reln->smgr_rnode.node.relNode,
										reln->smgr_rnode.backend,
										nbytes,
										BLCKSZ);

	if (nbytes != BLCKSZ)
	{
		if (nbytes < 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not write block %u in file \"%s\": %m",
							blocknum, FilePathName(v->mdfd_vfd))));
		/* short write: complain appropriately */
		ereport(ERROR,
				(errcode(ERRCODE_DISK_FULL),
				 errmsg("could not write block %u in file \"%s\": wrote only %d of %d bytes",
						blocknum,
						FilePathName(v->mdfd_vfd),
						nbytes, BLCKSZ),
				 errhint("Check free disk space.")));
	}

	if (!skipFsync && !SmgrIsTemp(reln))
		register_dirty_segment(reln, forknum, v);
}

/*
 *	mdnblocks() -- Get the number of blocks stored in a relation.
 *
 *		Important side effect: all active segments of the relation are opened
 *		and added to the mdfd_seg_fds array.  If this routine has not been
 *		called, then only segments up to the last one actually touched
 *		are present in the array.
 */
BlockNumber
mdnblocks(SMgrRelation reln, ForkNumber forknum)
{
	MdfdVec    *v = mdopen(reln, forknum, EXTENSION_FAIL);
	BlockNumber nblocks;
	BlockNumber segno = 0;

	/* mdopen has opened the first segment */
	Assert(reln->md_num_open_segs[forknum] > 0);

	/*
	 * Start from the last open segments, to avoid redundant seeks.  We have
	 * previously verified that these segments are exactly RELSEG_SIZE long,
	 * and it's useless to recheck that each time.
	 *
	 * NOTE: this assumption could only be wrong if another backend has
	 * truncated the relation.  We rely on higher code levels to handle that
	 * scenario by closing and re-opening the md fd, which is handled via
	 * relcache flush.  (Since the checkpointer doesn't participate in
	 * relcache flush, it could have segment entries for inactive segments;
	 * that's OK because the checkpointer never needs to compute relation
	 * size.)
	 */
	segno = reln->md_num_open_segs[forknum] - 1;
	v = &reln->md_seg_fds[forknum][segno];

	for (;;)
	{
		nblocks = _mdnblocks(reln, forknum, v);
		if (nblocks > ((BlockNumber) RELSEG_SIZE))
			elog(FATAL, "segment too big");
		if (nblocks < ((BlockNumber) RELSEG_SIZE))
			return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;

		/*
		 * If segment is exactly RELSEG_SIZE, advance to next one.
		 */
		segno++;

		/*
		 * We used to pass O_CREAT here, but that's has the disadvantage that
		 * it might create a segment which has vanished through some operating
		 * system misadventure.  In such a case, creating the segment here
		 * undermines _mdfd_getseg's attempts to notice and report an error
		 * upon access to a missing segment.
		 */
		v = _mdfd_openseg(reln, forknum, segno, 0);
		if (v == NULL)
			return segno * ((BlockNumber) RELSEG_SIZE);
	}
}

/*
 *	mdtruncate() -- Truncate relation to specified number of blocks.
 */
void
mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
{
	BlockNumber curnblk;
	BlockNumber priorblocks;
	int			curopensegs;

	/*
	 * NOTE: mdnblocks makes sure we have opened all active segments, so that
	 * truncation loop will get them all!
	 */
	curnblk = mdnblocks(reln, forknum);
	if (nblocks > curnblk)
	{
		/* Bogus request ... but no complaint if InRecovery */
		if (InRecovery)
			return;
		ereport(ERROR,
				(errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now",
						relpath(reln->smgr_rnode, forknum),
						nblocks, curnblk)));
	}
	if (nblocks == curnblk)
		return;					/* no work */

	/*
	 * Truncate segments, starting at the last one.  Starting at the end makes
	 * managing the memory for the fd array easier, should there be errors.
	 */
	curopensegs = reln->md_num_open_segs[forknum];
	while (curopensegs > 0)
	{
		MdfdVec    *v;

		/* number of blocks in all segments before this one */
		priorblocks = (curopensegs - 1) * RELSEG_SIZE;

		v = &reln->md_seg_fds[forknum][curopensegs - 1];

		if (priorblocks > nblocks)
		{
			/*
			 * This segment is no longer active.  We truncate the file, but do
			 * not delete it, for reasons explained in the header comments.
			 */
			if (FileTruncate(v->mdfd_vfd, 0, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not truncate file \"%s\": %m",
								FilePathName(v->mdfd_vfd))));

			if (!SmgrIsTemp(reln))
				register_dirty_segment(reln, forknum, v);

			/* we never drop the 1st segment */
			Assert(v != &reln->md_seg_fds[forknum][0]);

			FileClose(v->mdfd_vfd);
			_fdvec_resize(reln, forknum, curopensegs - 1);
		}
		else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
		{
			/*
			 * This is the last segment we want to keep.  Truncate the file to
			 * the right length.  NOTE: if nblocks is exactly a multiple K of
			 * RELSEG_SIZE, we will truncate the K+1st segment to 0 length but
			 * keep it.  This adheres to the invariant given in the header
			 * comments.
			 */
			BlockNumber lastsegblocks = nblocks - priorblocks;

			if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not truncate file \"%s\" to %u blocks: %m",
								FilePathName(v->mdfd_vfd),
								nblocks)));
			if (!SmgrIsTemp(reln))
				register_dirty_segment(reln, forknum, v);
		}
		else
		{
			/*
			 * We still need this segment, so nothing to do for this and any
			 * earlier segment.
			 */
			break;
		}
		curopensegs--;
	}
}

/*
 *	mdimmedsync() -- Immediately sync a relation to stable storage.
 *
 * Note that only writes already issued are synced; this routine knows
 * nothing of dirty buffers that may exist inside the buffer manager.
 */
void
mdimmedsync(SMgrRelation reln, ForkNumber forknum)
{
	int			segno;

	/*
	 * NOTE: mdnblocks makes sure we have opened all active segments, so that
	 * fsync loop will get them all!
	 */
	mdnblocks(reln, forknum);

	segno = reln->md_num_open_segs[forknum];

	/* fsync each open segment, last to first */
	while (segno > 0)
	{
		MdfdVec    *v = &reln->md_seg_fds[forknum][segno - 1];

		if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0)
			ereport(data_sync_elevel(ERROR),
					(errcode_for_file_access(),
					 errmsg("could not fsync file \"%s\": %m",
							FilePathName(v->mdfd_vfd))));
		segno--;
	}
}

/*
 *	mdsync() -- Sync previous writes to stable storage.
 */
void
mdsync(void)
{
	/* true while a sync is in progress; survives across failed attempts */
	static bool mdsync_in_progress = false;

	HASH_SEQ_STATUS hstat;
	PendingOperationEntry *entry;
	int			absorb_counter;

	/* Statistics on sync times */
	int			processed = 0;
	instr_time	sync_start,
				sync_end,
				sync_diff;
	uint64		elapsed;
	uint64		longest = 0;
	uint64		total_elapsed = 0;

	/*
	 * This is only called during checkpoints, and checkpoints should only
	 * occur in processes that have created a pendingOpsTable.
	 */
	if (!pendingOpsTable)
		elog(ERROR, "cannot sync without a pendingOpsTable");

	/*
	 * If we are in the checkpointer, the sync had better include all fsync
	 * requests that were queued by backends up to this point.  The tightest
	 * race condition that could occur is that a buffer that must be written
	 * and fsync'd for the checkpoint could have been dumped by a backend just
	 * before it was visited by BufferSync().  We know the backend will have
	 * queued an fsync request before clearing the buffer's dirtybit, so we
	 * are safe as long as we do an Absorb after completing BufferSync().
	 */
	AbsorbFsyncRequests();

	/*
	 * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
	 * checkpoint), we want to ignore fsync requests that are entered into the
	 * hashtable after this point --- they should be processed next time,
	 * instead.  We use mdsync_cycle_ctr to tell old entries apart from new
	 * ones: new ones will have cycle_ctr equal to the incremented value of
	 * mdsync_cycle_ctr.
	 *
	 * In normal circumstances, all entries present in the table at this point
	 * will have cycle_ctr exactly equal to the current (about to be old)
	 * value of mdsync_cycle_ctr.  However, if we fail partway through the
	 * fsync'ing loop, then older values of cycle_ctr might remain when we
	 * come back here to try again.  Repeated checkpoint failures would
	 * eventually wrap the counter around to the point where an old entry
	 * might appear new, causing us to skip it, possibly allowing a checkpoint
	 * to succeed that should not have.  To forestall wraparound, any time the
	 * previous mdsync() failed to complete, run through the table and
	 * forcibly set cycle_ctr = mdsync_cycle_ctr.
	 *
	 * Think not to merge this loop with the main loop, as the problem is
	 * exactly that that loop may fail before having visited all the entries.
	 * From a performance point of view it doesn't matter anyway, as this path
	 * will never be taken in a system that's functioning normally.
	 */
	if (mdsync_in_progress)
	{
		/* prior try failed, so update any stale cycle_ctr values */
		hash_seq_init(&hstat, pendingOpsTable);
		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
		{
			entry->cycle_ctr = mdsync_cycle_ctr;
		}
	}

	/* Advance counter so that new hashtable entries are distinguishable */
	mdsync_cycle_ctr++;

	/* Set flag to detect failure if we don't reach the end of the loop */
	mdsync_in_progress = true;

	/* Now scan the hashtable for fsync requests to process */
	absorb_counter = FSYNCS_PER_ABSORB;
	hash_seq_init(&hstat, pendingOpsTable);
	while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
	{
		ForkNumber	forknum;

		/*
		 * If the entry is new then don't process it this time; it might
		 * contain multiple fsync-request bits, but they are all new.  Note
		 * "continue" bypasses the hash-remove call at the bottom of the loop.
		 */
		if (entry->cycle_ctr == mdsync_cycle_ctr)
			continue;

		/* Else assert we haven't missed it */
		Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);

		/*
		 * Scan over the forks and segments represented by the entry.
		 *
		 * The bitmap manipulations are slightly tricky, because we can call
		 * AbsorbFsyncRequests() inside the loop and that could result in
		 * bms_add_member() modifying and even re-palloc'ing the bitmapsets.
		 * So we detach it, but if we fail we'll merge it with any new
		 * requests that have arrived in the meantime.
		 */
		for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
		{
			Bitmapset  *requests = entry->requests[forknum];
			int			segno;

			/* detach the bitmapset; see comment above */
			entry->requests[forknum] = NULL;
			entry->canceled[forknum] = false;

			segno = -1;
			while ((segno = bms_next_member(requests, segno)) >= 0)
			{
				int			failures;

				/*
				 * If fsync is off then we don't have to bother opening the
				 * file at all.  (We delay checking until this point so that
				 * changing fsync on the fly behaves sensibly.)
				 */
				if (!enableFsync)
					continue;

				/*
				 * If in checkpointer, we want to absorb pending requests
				 * every so often to prevent overflow of the fsync request
				 * queue.  It is unspecified whether newly-added entries will
				 * be visited by hash_seq_search, but we don't care since we
				 * don't need to process them anyway.
				 */
				if (--absorb_counter <= 0)
				{
					AbsorbFsyncRequests();
					absorb_counter = FSYNCS_PER_ABSORB;
				}

				/*
				 * The fsync table could contain requests to fsync segments
				 * that have been deleted (unlinked) by the time we get to
				 * them.  Rather than just hoping an ENOENT (or EACCES on
				 * Windows) error can be ignored, what we do on error is
				 * absorb pending requests and then retry.  Since mdunlink()
				 * queues a "cancel" message before actually unlinking, the
				 * fsync request is guaranteed to be marked canceled after the
				 * absorb if it really was this case.  DROP DATABASE likewise
				 * has to tell us to forget fsync requests before it starts
				 * deletions.
				 */
				for (failures = 0;; failures++) /* loop exits at "break" */
				{
					SMgrRelation reln;
					MdfdVec    *seg;
					char	   *path;
					int			save_errno;

					/*
					 * Find or create an smgr hash entry for this relation.
					 * This may seem a bit unclean -- md calling smgr?	But
					 * it's really the best solution.  It ensures that the
					 * open file reference isn't permanently leaked if we get
					 * an error here.  (You may say "but an unreferenced
					 * SMgrRelation is still a leak!"  Not really, because the
					 * only case in which a checkpoint is done by a process
					 * that isn't about to shut down is in the checkpointer,
					 * and it will periodically do smgrcloseall().  This fact
					 * justifies our not closing the reln in the success path
					 * either, which is a good thing since in non-checkpointer
					 * cases we couldn't safely do that.)
					 */
					reln = smgropen(entry->rnode, InvalidBackendId);

					/* Attempt to open and fsync the target segment */
					seg = _mdfd_getseg(reln, forknum,
									   (BlockNumber) segno * (BlockNumber) RELSEG_SIZE,
									   false,
									   EXTENSION_RETURN_NULL
									   | EXTENSION_DONT_CHECK_SIZE);

					INSTR_TIME_SET_CURRENT(sync_start);

					if (seg != NULL &&
						FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) >= 0)
					{
						/* Success; update statistics about sync timing */
						INSTR_TIME_SET_CURRENT(sync_end);
						sync_diff = sync_end;
						INSTR_TIME_SUBTRACT(sync_diff, sync_start);
						elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
						if (elapsed > longest)
							longest = elapsed;
						total_elapsed += elapsed;
						processed++;
						requests = bms_del_member(requests, segno);
						if (log_checkpoints)
							elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec",
								 processed,
								 FilePathName(seg->mdfd_vfd),
								 (double) elapsed / 1000);

						break;	/* out of retry loop */
					}

					/* Compute file name for use in message */
					save_errno = errno;
					path = _mdfd_segpath(reln, forknum, (BlockNumber) segno);
					errno = save_errno;

					/*
					 * It is possible that the relation has been dropped or
					 * truncated since the fsync request was entered.
					 * Therefore, allow ENOENT, but only if we didn't fail
					 * already on this file.  This applies both for
					 * _mdfd_getseg() and for FileSync, since fd.c might have
					 * closed the file behind our back.
					 *
					 * XXX is there any point in allowing more than one retry?
					 * Don't see one at the moment, but easy to change the
					 * test here if so.
					 */
					if (!FILE_POSSIBLY_DELETED(errno) ||
						failures > 0)
					{
						Bitmapset  *new_requests;

						/*
						 * We need to merge these unsatisfied requests with
						 * any others that have arrived since we started.
						 */
						new_requests = entry->requests[forknum];
						entry->requests[forknum] =
							bms_join(new_requests, requests);

						errno = save_errno;
						ereport(data_sync_elevel(ERROR),
								(errcode_for_file_access(),
								 errmsg("could not fsync file \"%s\": %m",
										path)));
					}
					else
						ereport(DEBUG1,
								(errcode_for_file_access(),
								 errmsg("could not fsync file \"%s\" but retrying: %m",
										path)));
					pfree(path);

					/*
					 * Absorb incoming requests and check to see if a cancel
					 * arrived for this relation fork.
					 */
					AbsorbFsyncRequests();
					absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */

					if (entry->canceled[forknum])
						break;
				}				/* end retry loop */
			}
			bms_free(requests);
		}

		/*
		 * We've finished everything that was requested before we started to
		 * scan the entry.  If no new requests have been inserted meanwhile,
		 * remove the entry.  Otherwise, update its cycle counter, as all the
		 * requests now in it must have arrived during this cycle.
		 */
		for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
		{
			if (entry->requests[forknum] != NULL)
				break;
		}
		if (forknum <= MAX_FORKNUM)
			entry->cycle_ctr = mdsync_cycle_ctr;
		else
		{
			/* Okay to remove it */
			if (hash_search(pendingOpsTable, &entry->rnode,
							HASH_REMOVE, NULL) == NULL)
				elog(ERROR, "pendingOpsTable corrupted");
		}
	}							/* end loop over hashtable entries */

	/* Return sync performance metrics for report at checkpoint end */
	CheckpointStats.ckpt_sync_rels = processed;
	CheckpointStats.ckpt_longest_sync = longest;
	CheckpointStats.ckpt_agg_sync_time = total_elapsed;

	/* Flag successful completion of mdsync */
	mdsync_in_progress = false;
}

/*
 *	mdpreckpt() -- Do pre-checkpoint work
 *
 * To distinguish unlink requests that arrived before this checkpoint
 * started from those that arrived during the checkpoint, we use a cycle
 * counter similar to the one we use for fsync requests.  That cycle
 * counter is incremented here.
 *
 * This must be called *before* the checkpoint REDO point is determined.
 * That ensures that we won't delete files too soon.
 *
 * Note that we can't do anything here that depends on the assumption
 * that the checkpoint will be completed.
 */
void
mdpreckpt(void)
{
	/*
	 * Any unlink requests arriving after this point will be assigned the next
	 * cycle counter, and won't be unlinked until next checkpoint.
	 */
	mdckpt_cycle_ctr++;
}

/*
 *	mdpostckpt() -- Do post-checkpoint work
 *
 * Remove any lingering files that can now be safely removed.
 */
void
mdpostckpt(void)
{
	int			absorb_counter;

	absorb_counter = UNLINKS_PER_ABSORB;
	while (pendingUnlinks != NIL)
	{
		PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
		char	   *path;

		/*
		 * New entries are appended to the end, so if the entry is new we've
		 * reached the end of old entries.
		 *
		 * Note: if just the right number of consecutive checkpoints fail, we
		 * could be fooled here by cycle_ctr wraparound.  However, the only
		 * consequence is that we'd delay unlinking for one more checkpoint,
		 * which is perfectly tolerable.
		 */
		if (entry->cycle_ctr == mdckpt_cycle_ctr)
			break;

		/* Unlink the file */
		path = relpathperm(entry->rnode, MAIN_FORKNUM);
		if (unlink(path) < 0)
		{
			/*
			 * There's a race condition, when the database is dropped at the
			 * same time that we process the pending unlink requests. If the
			 * DROP DATABASE deletes the file before we do, we will get ENOENT
			 * here. rmtree() also has to ignore ENOENT errors, to deal with
			 * the possibility that we delete the file first.
			 */
			if (errno != ENOENT)
				ereport(WARNING,
						(errcode_for_file_access(),
						 errmsg("could not remove file \"%s\": %m", path)));
		}
		pfree(path);

		/* And remove the list entry */
		pendingUnlinks = list_delete_first(pendingUnlinks);
		pfree(entry);

		/*
		 * As in mdsync, we don't want to stop absorbing fsync requests for a
		 * long time when there are many deletions to be done.  We can safely
		 * call AbsorbFsyncRequests() at this point in the loop (note it might
		 * try to delete list entries).
		 */
		if (--absorb_counter <= 0)
		{
			AbsorbFsyncRequests();
			absorb_counter = UNLINKS_PER_ABSORB;
		}
	}
}

/*
 * register_dirty_segment() -- Mark a relation segment as needing fsync
 *
 * If there is a local pending-ops table, just make an entry in it for
 * mdsync to process later.  Otherwise, try to pass off the fsync request
 * to the checkpointer process.  If that fails, just do the fsync
 * locally before returning (we hope this will not happen often enough
 * to be a performance problem).
 */
static void
register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
{
	/* Temp relations should never be fsync'd */
	Assert(!SmgrIsTemp(reln));

	if (pendingOpsTable)
	{
		/* push it into local pending-ops table */
		RememberFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno);
	}
	else
	{
		if (ForwardFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno))
			return;				/* passed it off successfully */

		ereport(DEBUG1,
				(errmsg("could not forward fsync request because request queue is full")));

		/* fall back to fsync'ing the segment ourselves */
		if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
			ereport(data_sync_elevel(ERROR),
					(errcode_for_file_access(),
					 errmsg("could not fsync file \"%s\": %m",
							FilePathName(seg->mdfd_vfd))));
	}
}

/*
 * register_unlink() -- Schedule a file to be deleted after next checkpoint
 *
 * We don't bother passing in the fork number, because this is only used
 * with main forks.
 *
 * As with register_dirty_segment, this could involve either a local or
 * a remote pending-ops table.
 */
static void
register_unlink(RelFileNodeBackend rnode)
{
	/* Should never be used with temp relations */
	Assert(!RelFileNodeBackendIsTemp(rnode));

	if (pendingOpsTable)
	{
		/* push it into local pending-ops table */
		RememberFsyncRequest(rnode.node, MAIN_FORKNUM,
							 UNLINK_RELATION_REQUEST);
	}
	else
	{
		/*
		 * Notify the checkpointer about it.  If we fail to queue the request
		 * message, we have to sleep and try again, because we can't simply
		 * delete the file now.  Ugly, but hopefully won't happen often.
		 *
		 * XXX should we just leave the file orphaned instead?
		 */
		Assert(IsUnderPostmaster);
		while (!ForwardFsyncRequest(rnode.node, MAIN_FORKNUM,
									UNLINK_RELATION_REQUEST))
			pg_usleep(10000L);	/* 10 msec seems a good number */
	}
}

/*
 * RememberFsyncRequest() -- callback from checkpointer side of fsync request
 *
 * We stuff fsync requests into the local hash table for execution
 * during the checkpointer's next checkpoint.  UNLINK requests go into a
 * separate linked list, however, because they get processed separately.
 *
 * The range of possible segment numbers is way less than the range of
 * BlockNumber, so we can reserve high values of segno for special purposes.
 * We define three:
 * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation,
 *	 either for one fork, or all forks if forknum is InvalidForkNumber
 * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
 * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
 *	 checkpoint.
 * Note also that we're assuming real segment numbers don't exceed INT_MAX.
 *
 * (Handling FORGET_DATABASE_FSYNC requests is a tad slow because the hash
 * table has to be searched linearly, but dropping a database is a pretty
 * heavyweight operation anyhow, so we'll live with it.)
 */
void
RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
{
	Assert(pendingOpsTable);

	if (segno == FORGET_RELATION_FSYNC)
	{
		/* Remove any pending requests for the relation (one or all forks) */
		PendingOperationEntry *entry;

		entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
													  &rnode,
													  HASH_FIND,
													  NULL);
		if (entry)
		{
			/*
			 * We can't just delete the entry since mdsync could have an
			 * active hashtable scan.  Instead we delete the bitmapsets; this
			 * is safe because of the way mdsync is coded.  We also set the
			 * "canceled" flags so that mdsync can tell that a cancel arrived
			 * for the fork(s).
			 */
			if (forknum == InvalidForkNumber)
			{
				/* remove requests for all forks */
				for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
				{
					bms_free(entry->requests[forknum]);
					entry->requests[forknum] = NULL;
					entry->canceled[forknum] = true;
				}
			}
			else
			{
				/* remove requests for single fork */
				bms_free(entry->requests[forknum]);
				entry->requests[forknum] = NULL;
				entry->canceled[forknum] = true;
			}
		}
	}
	else if (segno == FORGET_DATABASE_FSYNC)
	{
		/* Remove any pending requests for the entire database */
		HASH_SEQ_STATUS hstat;
		PendingOperationEntry *entry;
		ListCell   *cell,
				   *prev,
				   *next;

		/* Remove fsync requests */
		hash_seq_init(&hstat, pendingOpsTable);
		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
		{
			if (entry->rnode.dbNode == rnode.dbNode)
			{
				/* remove requests for all forks */
				for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
				{
					bms_free(entry->requests[forknum]);
					entry->requests[forknum] = NULL;
					entry->canceled[forknum] = true;
				}
			}
		}

		/* Remove unlink requests */
		prev = NULL;
		for (cell = list_head(pendingUnlinks); cell; cell = next)
		{
			PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);

			next = lnext(cell);
			if (entry->rnode.dbNode == rnode.dbNode)
			{
				pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev);
				pfree(entry);
			}
			else
				prev = cell;
		}
	}
	else if (segno == UNLINK_RELATION_REQUEST)
	{
		/* Unlink request: put it in the linked list */
		MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
		PendingUnlinkEntry *entry;

		/* PendingUnlinkEntry doesn't store forknum, since it's always MAIN */
		Assert(forknum == MAIN_FORKNUM);

		entry = palloc(sizeof(PendingUnlinkEntry));
		entry->rnode = rnode;
		entry->cycle_ctr = mdckpt_cycle_ctr;

		pendingUnlinks = lappend(pendingUnlinks, entry);

		MemoryContextSwitchTo(oldcxt);
	}
	else
	{
		/* Normal case: enter a request to fsync this segment */
		MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
		PendingOperationEntry *entry;
		bool		found;

		entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
													  &rnode,
													  HASH_ENTER,
													  &found);
		/* if new entry, initialize it */
		if (!found)
		{
			entry->cycle_ctr = mdsync_cycle_ctr;
			MemSet(entry->requests, 0, sizeof(entry->requests));
			MemSet(entry->canceled, 0, sizeof(entry->canceled));
		}

		/*
		 * NB: it's intentional that we don't change cycle_ctr if the entry
		 * already exists.  The cycle_ctr must represent the oldest fsync
		 * request that could be in the entry.
		 */

		entry->requests[forknum] = bms_add_member(entry->requests[forknum],
												  (int) segno);

		MemoryContextSwitchTo(oldcxt);
	}
}

/*
 * ForgetRelationFsyncRequests -- forget any fsyncs for a relation fork
 *
 * forknum == InvalidForkNumber means all forks, although this code doesn't
 * actually know that, since it's just forwarding the request elsewhere.
 */
void
ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum)
{
	if (pendingOpsTable)
	{
		/* standalone backend or startup process: fsync state is local */
		RememberFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC);
	}
	else if (IsUnderPostmaster)
	{
		/*
		 * Notify the checkpointer about it.  If we fail to queue the cancel
		 * message, we have to sleep and try again ... ugly, but hopefully
		 * won't happen often.
		 *
		 * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with an
		 * error would leave the no-longer-used file still present on disk,
		 * which would be bad, so I'm inclined to assume that the checkpointer
		 * will always empty the queue soon.
		 */
		while (!ForwardFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC))
			pg_usleep(10000L);	/* 10 msec seems a good number */

		/*
		 * Note we don't wait for the checkpointer to actually absorb the
		 * cancel message; see mdsync() for the implications.
		 */
	}
}

/*
 * ForgetDatabaseFsyncRequests -- forget any fsyncs and unlinks for a DB
 */
void
ForgetDatabaseFsyncRequests(Oid dbid)
{
	RelFileNode rnode;

	/* only dbNode matters for a database-wide forget request */
	rnode.dbNode = dbid;
	rnode.spcNode = 0;
	rnode.relNode = 0;

	if (pendingOpsTable)
	{
		/* standalone backend or startup process: fsync state is local */
		RememberFsyncRequest(rnode, InvalidForkNumber, FORGET_DATABASE_FSYNC);
	}
	else if (IsUnderPostmaster)
	{
		/* see notes in ForgetRelationFsyncRequests */
		while (!ForwardFsyncRequest(rnode, InvalidForkNumber,
									FORGET_DATABASE_FSYNC))
			pg_usleep(10000L);	/* 10 msec seems a good number */
	}
}

/*
 * DropRelationFiles -- drop files of all given relations
 */
void
DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo)
{
	SMgrRelation *srels;
	int			i;

	srels = palloc(sizeof(SMgrRelation) * ndelrels);
	for (i = 0; i < ndelrels; i++)
	{
		SMgrRelation srel = smgropen(delrels[i], InvalidBackendId);

		if (isRedo)
		{
			ForkNumber	fork;

			/* in redo, tell xlog to forget references to these rels */
			for (fork = 0; fork <= MAX_FORKNUM; fork++)
				XLogDropRelation(delrels[i], fork);
		}
		srels[i] = srel;
	}

	smgrdounlinkall(srels, ndelrels, isRedo);

	for (i = 0; i < ndelrels; i++)
		smgrclose(srels[i]);
	pfree(srels);
}


/*
 * _fdvec_resize() -- Resize the fork's open segments array
 */
static void
_fdvec_resize(SMgrRelation reln,
			  ForkNumber forknum,
			  int nseg)
{
	if (nseg == 0)
	{
		/* shrinking to empty: release the array entirely */
		if (reln->md_num_open_segs[forknum] > 0)
		{
			pfree(reln->md_seg_fds[forknum]);
			reln->md_seg_fds[forknum] = NULL;
		}
	}
	else if (reln->md_num_open_segs[forknum] == 0)
	{
		/* first segment for this fork: allocate a fresh array */
		reln->md_seg_fds[forknum] =
			MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg);
	}
	else
	{
		/*
		 * It doesn't seem worthwhile complicating the code to amortize
		 * repalloc() calls.  Those are far faster than PathNameOpenFile() or
		 * FileClose(), and the memory context internally will sometimes avoid
		 * doing an actual reallocation.
		 */
		reln->md_seg_fds[forknum] =
			repalloc(reln->md_seg_fds[forknum],
					 sizeof(MdfdVec) * nseg);
	}

	reln->md_num_open_segs[forknum] = nseg;
}

/*
 * Return the filename for the specified segment of the relation. The
 * returned string is palloc'd.
 */
static char *
_mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
{
	char	   *path,
			   *fullpath;

	path = relpath(reln->smgr_rnode, forknum);

	if (segno > 0)
	{
		/* segment files after the first carry a ".N" suffix */
		fullpath = psprintf("%s.%u", path, segno);
		pfree(path);
	}
	else
		fullpath = path;

	return fullpath;
}

/*
 * Open the specified segment of the relation,
 * and make a MdfdVec object for it.  Returns NULL on failure.
 */
static MdfdVec *
_mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
			  int oflags)
{
	MdfdVec    *v;
	int			fd;
	char	   *fullpath;

	fullpath = _mdfd_segpath(reln, forknum, segno);

	/* open the file */
	fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags);

	pfree(fullpath);

	if (fd < 0)
		return NULL;

	/* if not already open, extend the open-segments array to include it */
	if (segno <= reln->md_num_open_segs[forknum])
		_fdvec_resize(reln, forknum, segno + 1);

	/* fill the entry */
	v = &reln->md_seg_fds[forknum][segno];
	v->mdfd_vfd = fd;
	v->mdfd_segno = segno;

	Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));

	/* all done */
	return v;
}

/*
 *	_mdfd_getseg() -- Find the segment of the relation holding the
 *		specified block.
 *
 * If the segment doesn't exist, we ereport, return NULL, or create the
 * segment, according to "behavior".  Note: skipFsync is only used in the
 * EXTENSION_CREATE case.
 */
static MdfdVec *
_mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
			 bool skipFsync, int behavior)
{
	MdfdVec    *v;
	BlockNumber targetseg;
	BlockNumber nextsegno;

	/* some way to handle non-existent segments needs to be specified */
	Assert(behavior &
		   (EXTENSION_FAIL | EXTENSION_CREATE | EXTENSION_RETURN_NULL));

	targetseg = blkno / ((BlockNumber) RELSEG_SIZE);

	/* if an existing and opened segment, we're done */
	if (targetseg < reln->md_num_open_segs[forknum])
	{
		v = &reln->md_seg_fds[forknum][targetseg];
		return v;
	}

	/*
	 * The target segment is not yet open. Iterate over all the segments
	 * between the last opened and the target segment. This way missing
	 * segments either raise an error, or get created (according to
	 * 'behavior'). Start with either the last opened, or the first segment if
	 * none was opened before.
	 */
	if (reln->md_num_open_segs[forknum] > 0)
		v = &reln->md_seg_fds[forknum][reln->md_num_open_segs[forknum] - 1];
	else
	{
		v = mdopen(reln, forknum, behavior);
		if (!v)
			return NULL;		/* if behavior & EXTENSION_RETURN_NULL */
	}

	for (nextsegno = reln->md_num_open_segs[forknum];
		 nextsegno <= targetseg; nextsegno++)
	{
		BlockNumber nblocks = _mdnblocks(reln, forknum, v);
		int			flags = 0;

		Assert(nextsegno == v->mdfd_segno + 1);

		if (nblocks > ((BlockNumber) RELSEG_SIZE))
			elog(FATAL, "segment too big");

		if ((behavior & EXTENSION_CREATE) ||
			(InRecovery && (behavior & EXTENSION_CREATE_RECOVERY)))
		{
			/*
			 * Normally we will create new segments only if authorized by the
			 * caller (i.e., we are doing mdextend()).  But when doing WAL
			 * recovery, create segments anyway; this allows cases such as
			 * replaying WAL data that has a write into a high-numbered
			 * segment of a relation that was later deleted.  We want to go
			 * ahead and create the segments so we can finish out the replay.
			 * However if the caller has specified
			 * EXTENSION_REALLY_RETURN_NULL, then extension is not desired
			 * even in recovery; we won't reach this point in that case.
			 *
			 * We have to maintain the invariant that segments before the last
			 * active segment are of size RELSEG_SIZE; therefore, if
			 * extending, pad them out with zeroes if needed.  (This only
			 * matters if in recovery, or if the caller is extending the
			 * relation discontiguously, but that can happen in hash indexes.)
			 */
			if (nblocks < ((BlockNumber) RELSEG_SIZE))
			{
				char	   *zerobuf = palloc0(BLCKSZ);

				/* write the last block of the segment to fill it out */
				mdextend(reln, forknum,
						 nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
						 zerobuf, skipFsync);
				pfree(zerobuf);
			}
			flags = O_CREAT;
		}
		else if (!(behavior & EXTENSION_DONT_CHECK_SIZE) &&
				 nblocks < ((BlockNumber) RELSEG_SIZE))
		{
			/*
			 * When not extending (or explicitly including truncated
			 * segments), only open the next segment if the current one is
			 * exactly RELSEG_SIZE.  If not (this branch), either return NULL
			 * or fail.
			 */
			if (behavior & EXTENSION_RETURN_NULL)
			{
				/*
				 * Some callers discern between reasons for _mdfd_getseg()
				 * returning NULL based on errno.  As there's no failing
				 * syscall involved in this case, explicitly set errno to
				 * ENOENT, as that seems the closest interpretation.
				 */
				errno = ENOENT;
				return NULL;
			}

			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks",
							_mdfd_segpath(reln, forknum, nextsegno),
							blkno, nblocks)));
		}

		v = _mdfd_openseg(reln, forknum, nextsegno, flags);

		if (v == NULL)
		{
			if ((behavior & EXTENSION_RETURN_NULL) &&
				FILE_POSSIBLY_DELETED(errno))
				return NULL;
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\" (target block %u): %m",
							_mdfd_segpath(reln, forknum, nextsegno),
							blkno)));
		}
	}

	return v;
}

/*
 * Get number of blocks present in a single disk file
 */
static BlockNumber
_mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
{
	off_t		len;

	len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
	if (len < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not seek to end of file \"%s\": %m",
						FilePathName(seg->mdfd_vfd))));
	/* note that this calculation will ignore any partial block at EOF */
	return (BlockNumber) (len / BLCKSZ);
}