1 /* 2 * Copyright (c) 2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 /* 35 * HAMMER mirroring ioctls - serialize and deserialize modifications made 36 * to a filesystem. 
37 */ 38 39 #include "hammer.h" 40 41 static int hammer_mirror_check(hammer_cursor_t cursor, 42 struct hammer_ioc_mrecord_rec *mrec); 43 static int hammer_mirror_update(hammer_cursor_t cursor, 44 struct hammer_ioc_mrecord_rec *mrec); 45 static int hammer_ioc_mirror_write_rec(hammer_cursor_t cursor, 46 struct hammer_ioc_mrecord_rec *mrec, 47 struct hammer_ioc_mirror_rw *mirror, 48 uint32_t localization, 49 char *uptr); 50 static int hammer_ioc_mirror_write_pass(hammer_cursor_t cursor, 51 struct hammer_ioc_mrecord_rec *mrec, 52 struct hammer_ioc_mirror_rw *mirror, 53 uint32_t localization); 54 static int hammer_ioc_mirror_write_skip(hammer_cursor_t cursor, 55 struct hammer_ioc_mrecord_skip *mrec, 56 struct hammer_ioc_mirror_rw *mirror, 57 uint32_t localization); 58 static int hammer_mirror_delete_to(hammer_cursor_t cursor, 59 struct hammer_ioc_mirror_rw *mirror); 60 static int hammer_mirror_nomirror(hammer_base_elm_t base); 61 62 /* 63 * All B-Tree records within the specified key range which also conform 64 * to the transaction id range are returned. Mirroring code keeps track 65 * of the last transaction id fully scanned and can efficiently pick up 66 * where it left off if interrupted. 67 * 68 * The PFS is identified in the mirror structure. The passed ip is just 69 * some directory in the overall HAMMER filesystem and has nothing to 70 * do with the PFS. 
 */
int
hammer_ioc_mirror_read(hammer_transaction_t trans, hammer_inode_t ip,
		       struct hammer_ioc_mirror_rw *mirror)
{
	struct hammer_cmirror cmirror;
	struct hammer_cursor cursor;
	union hammer_ioc_mrecord_any mrec;
	hammer_btree_leaf_elm_t elm;
	char *uptr;		/* userland output position in mirror->ubuf */
	int error;
	int data_len;		/* size of the record's data payload, if any */
	int bytes;		/* size of the current mrec being emitted */
	int eatdisk;		/* advance past the cursor position on iterate */
	int mrec_flags;
	uint32_t localization;
	hammer_crc_t rec_crc;

	localization = pfs_to_lo(mirror->pfs_id);

	/*
	 * The localization is derived from the PFS id above; the caller
	 * may not encode a pseudofs id directly in the key range.
	 */
	if ((mirror->key_beg.localization | mirror->key_end.localization) &
	    HAMMER_LOCALIZE_PSEUDOFS_MASK) {
		return(EINVAL);
	}
	if (hammer_btree_cmp(&mirror->key_beg, &mirror->key_end) > 0)
		return(EINVAL);

	/*
	 * key_cur tracks scan progress so an interrupted scan can resume.
	 */
	mirror->key_cur = mirror->key_beg;
	mirror->key_cur.localization &= HAMMER_LOCALIZE_MASK;
	mirror->key_cur.localization |= localization;
	bzero(&mrec, sizeof(mrec));
	bzero(&cmirror, sizeof(cmirror));

	/*
	 * Make CRC errors non-fatal (at least on data), causing an EDOM
	 * error instead of EIO.
	 */
	trans->flags |= HAMMER_TRANSF_CRCDOM;

retry:
	error = hammer_init_cursor(trans, &cursor, NULL, NULL);
	if (error) {
		hammer_done_cursor(&cursor);
		goto failed;
	}
	cursor.key_beg = mirror->key_cur;
	cursor.key_end = mirror->key_end;
	cursor.key_end.localization &= HAMMER_LOCALIZE_MASK;
	cursor.key_end.localization |= localization;

	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
	cursor.flags |= HAMMER_CURSOR_BACKEND;

	/*
	 * This flag filters the search to only return elements whose create
	 * or delete TID is >= mirror_tid.  The B-Tree uses the mirror_tid
	 * field stored with internal and leaf nodes to shortcut the scan.
	 */
	cursor.flags |= HAMMER_CURSOR_MIRROR_FILTERED;
	cursor.cmirror = &cmirror;
	cmirror.mirror_tid = mirror->tid_beg;

	error = hammer_btree_first(&cursor);
	while (error == 0) {
		/*
		 * Yield to more important tasks.
		 *
		 * NOTE(review): the inner (error == 0) test is redundant,
		 * the while condition already guarantees it.
		 */
		if (error == 0) {
			error = hammer_signal_check(trans->hmp);
			if (error)
				break;
		}

		/*
		 * An internal node can be returned in mirror-filtered
		 * mode and indicates that the scan is returning a skip
		 * range in the cursor->cmirror structure.
		 */
		uptr = (char *)mirror->ubuf + mirror->count;
		if (cursor.node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
			/*
			 * Check space in the user buffer; stop the scan
			 * (without error) if the SKIP record won't fit.
			 */
			mirror->key_cur = cmirror.skip_beg;
			bytes = sizeof(mrec.skip);
			if (mirror->count + HAMMER_HEAD_DOALIGN(bytes) >
			    mirror->size) {
				break;
			}

			/*
			 * Fill mrec with a SKIP record covering
			 * [skip_beg, skip_end) and copy it out.
			 */
			mrec.head.signature = HAMMER_IOC_MIRROR_SIGNATURE;
			mrec.head.type = HAMMER_MREC_TYPE_SKIP;
			mrec.head.rec_size = bytes;
			mrec.skip.skip_beg = cmirror.skip_beg;
			mrec.skip.skip_end = cmirror.skip_end;
			hammer_crc_set_mrec_head(&mrec.head, bytes);
			error = copyout(&mrec, uptr, bytes);
			eatdisk = 0;
			goto didwrite;
		}

		/*
		 * Leaf node.  In full-history mode we could filter out
		 * elements modified outside the user-requested TID range.
		 *
		 * However, such elements must be returned so the writer
		 * can compare them against the target to determine what
		 * needs to be deleted on the target, particularly for
		 * no-history mirrors.
		 */
		KKASSERT(cursor.node->ondisk->type == HAMMER_BTREE_TYPE_LEAF);
		elm = &cursor.node->ondisk->elms[cursor.index].leaf;
		mirror->key_cur = elm->base;

		/*
		 * If the record was created after our end point we just
		 * ignore it.
		 */
		if (elm->base.create_tid > mirror->tid_end) {
			error = 0;
			bytes = 0;
			eatdisk = 1;
			goto didwrite;
		}

		/*
		 * Determine if we should generate a PASS or a REC.  PASS
		 * records are records without any data payload.  Such
		 * records will be generated if the target is already expected
		 * to have the record, allowing it to delete the gaps.
		 *
		 * A PASS record is also used to perform deletions on the
		 * target.
		 *
		 * Such deletions are needed if the master or files on the
		 * master are no-history, or if the slave is so far behind
		 * the master has already been pruned.
		 */
		if (elm->base.create_tid < mirror->tid_beg) {
			bytes = sizeof(mrec.rec);
			if (mirror->count + HAMMER_HEAD_DOALIGN(bytes) >
			    mirror->size) {
				break;
			}

			/*
			 * Fill mrec with a PASS record (leaf only, no data).
			 */
			mrec.head.signature = HAMMER_IOC_MIRROR_SIGNATURE;
			mrec.head.type = HAMMER_MREC_TYPE_PASS;
			mrec.head.rec_size = bytes;
			mrec.rec.leaf = *elm;
			hammer_crc_set_mrec_head(&mrec.head, bytes);
			error = copyout(&mrec, uptr, bytes);
			eatdisk = 1;
			goto didwrite;
		}

		/*
		 * The core code exports the data to userland.
		 *
		 * CRC errors on data are reported but passed through,
		 * but the data must be washed by the user program.
		 *
		 * If userland just wants the btree records it can
		 * request that bulk data not be returned.  This is
		 * used during mirror-stream histogram generation.
		 */
		mrec_flags = 0;
		data_len = (elm->data_offset) ? elm->data_len : 0;
		if (data_len &&
		    (mirror->head.flags & HAMMER_IOC_MIRROR_NODATA)) {
			data_len = 0;
			mrec_flags |= HAMMER_MRECF_NODATA;
		}
		if (data_len) {
			error = hammer_btree_extract_data(&cursor);
			if (error) {
				if (error != EDOM)
					break;
				/* EDOM == data CRC error, pass through */
				mrec_flags |= HAMMER_MRECF_CRC_ERROR |
					      HAMMER_MRECF_DATA_CRC_BAD;
			}
		}

		bytes = sizeof(mrec.rec) + data_len;
		if (mirror->count + HAMMER_HEAD_DOALIGN(bytes) > mirror->size)
			break;

		/*
		 * Construct the record for userland and copyout.
		 *
		 * The user is asking for a snapshot, if the record was
		 * deleted beyond the user-requested ending tid, the record
		 * is not considered deleted from the point of view of
		 * userland and delete_tid is cleared.
		 */
		mrec.head.signature = HAMMER_IOC_MIRROR_SIGNATURE;
		mrec.head.type = HAMMER_MREC_TYPE_REC | mrec_flags;
		mrec.head.rec_size = bytes;
		mrec.rec.leaf = *elm;

		if (elm->base.delete_tid > mirror->tid_end)
			mrec.rec.leaf.base.delete_tid = 0;
		/*
		 * The record CRC covers the head plus the data payload
		 * (extended via crc32_ext when data is present).
		 */
		rec_crc = hammer_crc_get_mrec_head(&mrec.head,
						   sizeof(mrec.rec));
		if (data_len)
			rec_crc = crc32_ext(cursor.data, data_len, rec_crc);
		mrec.head.rec_crc = rec_crc;
		error = copyout(&mrec, uptr, sizeof(mrec.rec));
		if (data_len && error == 0) {
			error = copyout(cursor.data, uptr + sizeof(mrec.rec),
					data_len);
		}
		eatdisk = 1;

		/*
		 * eatdisk controls whether we skip the current cursor
		 * position on the next scan or not.  If doing a SKIP
		 * the cursor is already positioned properly for the next
		 * scan and eatdisk will be 0.
		 */
didwrite:
		if (error == 0) {
			mirror->count += HAMMER_HEAD_DOALIGN(bytes);
			if (eatdisk)
				cursor.flags |= HAMMER_CURSOR_ATEDISK;
			else
				cursor.flags &= ~HAMMER_CURSOR_ATEDISK;
			error = hammer_btree_iterate(&cursor);
		}
	}
	if (error == ENOENT) {
		/* scan completed; report the full range as consumed */
		mirror->key_cur = mirror->key_end;
		error = 0;
	}
	hammer_done_cursor(&cursor);
	if (error == EDEADLK)
		goto retry;
	if (error == EINTR) {
		/* interrupted scans are restartable from key_cur */
		mirror->head.flags |= HAMMER_IOC_HEAD_INTR;
		error = 0;
	}
failed:
	mirror->key_cur.localization &= HAMMER_LOCALIZE_MASK;
	return(error);
}

/*
 * Copy records from userland to the target mirror.
 *
 * The PFS is identified in the mirror structure.  The passed ip is just
 * some directory in the overall HAMMER filesystem and has nothing to
 * do with the PFS.  In fact, there might not even be a root directory for
 * the PFS yet!
 */
int
hammer_ioc_mirror_write(hammer_transaction_t trans, hammer_inode_t ip,
			struct hammer_ioc_mirror_rw *mirror)
{
	union hammer_ioc_mrecord_any mrec;
	struct hammer_cursor cursor;
	uint32_t localization;
	int checkspace_count = 0;	/* consecutive low-space retries */
	int error;
	int bytes;
	char *uptr;
	int seq;			/* flusher sequence ticket */

	localization = pfs_to_lo(mirror->pfs_id);
	seq = trans->hmp->flusher.done;

	/*
	 * Validate the mirror structure and relocalize the tracking keys
	 * to the target PFS.
	 */
	if (mirror->size < 0 || mirror->size > 0x70000000)
		return(EINVAL);
	mirror->key_beg.localization &= HAMMER_LOCALIZE_MASK;
	mirror->key_beg.localization |= localization;
	mirror->key_end.localization &= HAMMER_LOCALIZE_MASK;
	mirror->key_end.localization |= localization;
	mirror->key_cur.localization &= HAMMER_LOCALIZE_MASK;
	mirror->key_cur.localization |= localization;

	/*
	 * Set up our tracking cursor for the loop.  The tracking cursor
	 * is used to delete records that are no longer present on the
	 * master.
 * The last handled record at key_cur must be skipped.
 */
	error = hammer_init_cursor(trans, &cursor, NULL, NULL);

	cursor.key_beg = mirror->key_cur;
	cursor.key_end = mirror->key_end;
	cursor.flags |= HAMMER_CURSOR_BACKEND;
	error = hammer_btree_first(&cursor);
	if (error == 0)
		cursor.flags |= HAMMER_CURSOR_ATEDISK;
	if (error == ENOENT)
		error = 0;

	/*
	 * Loop until our input buffer has been exhausted.
	 */
	while (error == 0 &&
	       mirror->count + sizeof(mrec.head) <= mirror->size) {

		/*
		 * Don't blow out the buffer cache.  Leave room for frontend
		 * cache as well.
		 *
		 * WARNING: See warnings in hammer_unlock_cursor() function.
		 */
		while (hammer_flusher_meta_halflimit(trans->hmp) ||
		       hammer_flusher_undo_exhausted(trans, 2)) {
			hammer_unlock_cursor(&cursor);
			hammer_flusher_wait(trans->hmp, seq);
			hammer_lock_cursor(&cursor);
			seq = hammer_flusher_async_one(trans->hmp);
		}

		/*
		 * If there is insufficient free space it may be due to
		 * reserved big-blocks, which flushing might fix.  Give
		 * up with ENOSPC after 10 consecutive low-space passes.
		 */
		if (hammer_checkspace(trans->hmp, HAMMER_CHKSPC_MIRROR)) {
			if (++checkspace_count == 10) {
				error = ENOSPC;
				break;
			}
			hammer_unlock_cursor(&cursor);
			hammer_flusher_wait(trans->hmp, seq);
			hammer_lock_cursor(&cursor);
			seq = hammer_flusher_async(trans->hmp, NULL);
		}


		/*
		 * Acquire and validate header from the user buffer.
		 */
		if ((bytes = mirror->size - mirror->count) > sizeof(mrec))
			bytes = sizeof(mrec);
		uptr = (char *)mirror->ubuf + mirror->count;
		error = copyin(uptr, &mrec, bytes);
		if (error)
			break;
		if (mrec.head.signature != HAMMER_IOC_MIRROR_SIGNATURE) {
			error = EINVAL;
			break;
		}
		if (mrec.head.rec_size < sizeof(mrec.head) ||
		    mrec.head.rec_size > sizeof(mrec) + HAMMER_XBUFSIZE ||
		    mirror->count + mrec.head.rec_size > mirror->size) {
			error = EINVAL;
			break;
		}

		/*
		 * Dispatch on record type; each helper revalidates the
		 * per-type record size first.
		 */
		switch(mrec.head.type & HAMMER_MRECF_TYPE_MASK) {
		case HAMMER_MREC_TYPE_SKIP:
			if (mrec.head.rec_size != sizeof(mrec.skip))
				error = EINVAL;
			if (error == 0)
				error = hammer_ioc_mirror_write_skip(
						&cursor, &mrec.skip,
						mirror, localization);
			break;
		case HAMMER_MREC_TYPE_REC:
			if (mrec.head.rec_size < sizeof(mrec.rec))
				error = EINVAL;
			if (error == 0)
				error = hammer_ioc_mirror_write_rec(
						&cursor, &mrec.rec,
						mirror, localization,
						uptr + sizeof(mrec.rec));
			break;
		case HAMMER_MREC_TYPE_REC_NODATA:
		case HAMMER_MREC_TYPE_REC_BADCRC:
			/*
			 * Records with bad data payloads are ignored XXX.
			 * Records with no data payload have to be skipped
			 * (they shouldn't have been written in the first
			 * place).
			 */
			if (mrec.head.rec_size < sizeof(mrec.rec))
				error = EINVAL;
			break;
		case HAMMER_MREC_TYPE_PASS:
			if (mrec.head.rec_size != sizeof(mrec.rec))
				error = EINVAL;
			if (error == 0)
				error = hammer_ioc_mirror_write_pass(
						&cursor, &mrec.rec,
						mirror, localization);
			break;
		default:
			error = EINVAL;
			break;
		}

		/*
		 * Retry the current record on deadlock, otherwise setup
		 * for the next loop.  EALREADY (record already present)
		 * is not an error for mirroring purposes.
		 */
		if (error == EDEADLK) {
			while (error == EDEADLK) {
				hammer_sync_lock_sh(trans);
				hammer_recover_cursor(&cursor);
				error = hammer_cursor_upgrade(&cursor);
				hammer_sync_unlock(trans);
			}
		} else {
			if (error == EALREADY)
				error = 0;
			if (error == 0) {
				mirror->count +=
					HAMMER_HEAD_DOALIGN(mrec.head.rec_size);
			}
		}
	}
	hammer_done_cursor(&cursor);

	/*
	 * cumulative error, reported via the head structure
	 */
	if (error) {
		mirror->head.flags |= HAMMER_IOC_HEAD_ERROR;
		mirror->head.error = error;
	}

	/*
	 * ioctls don't update the RW data structure if an error is returned,
	 * always return 0.
	 */
	return(0);
}

/*
 * Handle skip records.
 *
 * We must iterate from the last resolved record position at mirror->key_cur
 * to skip_beg non-inclusive and delete any records encountered.
 *
 * mirror->key_cur must be carefully set when we succeed in processing
 * this mrec.
 */
static int
hammer_ioc_mirror_write_skip(hammer_cursor_t cursor,
			     struct hammer_ioc_mrecord_skip *mrec,
			     struct hammer_ioc_mirror_rw *mirror,
			     uint32_t localization)
{
	int error;

	/*
	 * Relocalize the skip range to the target PFS.
	 */
	mrec->skip_beg.localization &= HAMMER_LOCALIZE_MASK;
	mrec->skip_beg.localization |= localization;
	mrec->skip_end.localization &= HAMMER_LOCALIZE_MASK;
	mrec->skip_end.localization |= localization;

	/*
	 * Iterate from current position to skip_beg, deleting any records
	 * we encounter.
 * The record at skip_beg is not included (it is
 * skipped).
 */
	cursor->key_end = mrec->skip_beg;
	cursor->flags &= ~HAMMER_CURSOR_END_INCLUSIVE;
	cursor->flags |= HAMMER_CURSOR_BACKEND;
	error = hammer_mirror_delete_to(cursor, mirror);

	/*
	 * Now skip past the skip (which is the whole point of
	 * having a skip record).  The sender has not sent us any records
	 * for the skip area so we wouldn't know what to keep and what
	 * to delete anyway.
	 *
	 * Clear ATEDISK because skip_end is non-inclusive, so we can't
	 * count an exact match if we happened to get one.
	 */
	if (error == 0) {
		mirror->key_cur = mrec->skip_end;
		cursor->key_beg = mrec->skip_end;
		error = hammer_btree_lookup(cursor);
		cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
		if (error == ENOENT)
			error = 0;
	}
	return(error);
}

/*
 * Handle B-Tree records.
 *
 * We must iterate to mrec->base.key (non-inclusively), and then process
 * the record.  We are allowed to write a new record or delete an existing
 * record, but cannot replace an existing record.
 *
 * mirror->key_cur must be carefully set when we succeed in processing
 * this mrec.
 */
static int
hammer_ioc_mirror_write_rec(hammer_cursor_t cursor,
			    struct hammer_ioc_mrecord_rec *mrec,
			    struct hammer_ioc_mirror_rw *mirror,
			    uint32_t localization,
			    char *uptr)
{
	int error;

	/*
	 * Sanity-check the data payload length against the record size
	 * claimed by the (untrusted) userland header.
	 */
	if (mrec->leaf.data_len < 0 ||
	    mrec->leaf.data_len > HAMMER_XBUFSIZE ||
	    mrec->leaf.data_len + sizeof(*mrec) > mrec->head.rec_size) {
		return(EINVAL);
	}

	/*
	 * Re-localize for target.  Relocalization of data is handled
	 * by hammer_create_at_cursor().
	 */
	mrec->leaf.base.localization &= HAMMER_LOCALIZE_MASK;
	mrec->leaf.base.localization |= localization;

	/*
	 * Delete records through until we reach (non-inclusively) the
	 * target record.
	 */
	cursor->key_end = mrec->leaf.base;
	cursor->flags &= ~HAMMER_CURSOR_END_INCLUSIVE;
	cursor->flags |= HAMMER_CURSOR_BACKEND;
	error = hammer_mirror_delete_to(cursor, mirror);

	/*
	 * Certain records are not part of the mirroring operation
	 */
	if (error == 0 && hammer_mirror_nomirror(&mrec->leaf.base))
		return(0);

	/*
	 * Locate the record.
	 *
	 * If the record exists only the delete_tid may be updated.
	 *
	 * If the record does not exist we can create it only if the
	 * create_tid is not too old.  If the create_tid is too old
	 * it may have already been destroyed on the slave from pruning.
	 *
	 * Note that mirror operations are effectively as-of operations
	 * and delete_tid can be 0 for mirroring purposes even if it is
	 * not actually 0 at the originator.
	 *
	 * These functions can return EDEADLK
	 */
	if (error == 0) {
		cursor->key_beg = mrec->leaf.base;
		cursor->flags |= HAMMER_CURSOR_BACKEND;
		cursor->flags &= ~HAMMER_CURSOR_INSERT;
		error = hammer_btree_lookup(cursor);
	}

	if (error == 0 && hammer_mirror_check(cursor, mrec)) {
		error = hammer_mirror_update(cursor, mrec);
	} else if (error == ENOENT) {
		if (mrec->leaf.base.create_tid >= mirror->tid_beg) {
			error = hammer_create_at_cursor(
					cursor, &mrec->leaf,
					uptr, HAMMER_CREATE_MODE_UMIRROR);
		} else {
			/* too old, already pruned away; not an error */
			error = 0;
		}
	}
	if (error == 0 || error == EALREADY)
		mirror->key_cur = mrec->leaf.base;
	return(error);
}

/*
 * This works like write_rec but no write or update is necessary,
 * and no data payload is included so we couldn't do a write even
 * if we wanted to.
 *
 * We must still iterate for deletions, and we can validate the
 * record header which is a good way to test for corrupted mirror
 * targets XXX.
 *
 * mirror->key_cur must be carefully set when we succeed in processing
 * this mrec.
 */
static
int
hammer_ioc_mirror_write_pass(hammer_cursor_t cursor,
			     struct hammer_ioc_mrecord_rec *mrec,
			     struct hammer_ioc_mirror_rw *mirror,
			     uint32_t localization)
{
	int error;

	/*
	 * Re-localize for target.  Relocalization of data is handled
	 * by hammer_create_at_cursor().
	 */
	mrec->leaf.base.localization &= HAMMER_LOCALIZE_MASK;
	mrec->leaf.base.localization |= localization;

	/*
	 * Delete records through until we reach (non-inclusively) the
	 * target record.
	 */
	cursor->key_end = mrec->leaf.base;
	cursor->flags &= ~HAMMER_CURSOR_END_INCLUSIVE;
	cursor->flags |= HAMMER_CURSOR_BACKEND;
	error = hammer_mirror_delete_to(cursor, mirror);

	/*
	 * Certain records are not part of the mirroring operation.
	 *
	 * NOTE(review): unlike hammer_ioc_mirror_write_rec() this check
	 * does not test (error == 0) first, so a delete_to error is
	 * discarded for nomirror records here — confirm intentional.
	 */
	if (hammer_mirror_nomirror(&mrec->leaf.base))
		return(0);

	/*
	 * Locate the record and get past it by setting ATEDISK.  Perform
	 * any necessary deletions.  We have no data payload and cannot
	 * create a new record.
	 */
	if (error == 0) {
		mirror->key_cur = mrec->leaf.base;
		cursor->key_beg = mrec->leaf.base;
		cursor->flags |= HAMMER_CURSOR_BACKEND;
		cursor->flags &= ~HAMMER_CURSOR_INSERT;
		error = hammer_btree_lookup(cursor);
		if (error == 0) {
			if (hammer_mirror_check(cursor, mrec))
				error = hammer_mirror_update(cursor, mrec);
			cursor->flags |= HAMMER_CURSOR_ATEDISK;
		} else {
			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
		}
		if (error == ENOENT)
			error = 0;
	}
	return(error);
}

/*
 * As part of the mirror write we iterate across swaths of records
 * on the target which no longer exist on the source, and mark them
 * deleted.
 *
 * The caller has indexed the cursor and set up key_end.  We iterate
 * through to key_end.
 *
 * There is an edge case where the master has deleted a record whose
 * create_tid exactly matches our end_tid.
 * We cannot delete this
 * record on the slave yet because we cannot assign delete_tid == create_tid.
 * The deletion should be picked up on the next sequence since in order
 * to have been deleted on the master a transaction must have occurred with
 * a TID greater than the create_tid of the record.
 *
 * To support incremental re-mirroring, just for robustness, we do not
 * touch any records created beyond (or equal to) mirror->tid_end.
 */
static
int
hammer_mirror_delete_to(hammer_cursor_t cursor,
			struct hammer_ioc_mirror_rw *mirror)
{
	hammer_btree_leaf_elm_t elm;
	int error;

	error = hammer_btree_iterate(cursor);
	while (error == 0) {
		elm = &cursor->node->ondisk->elms[cursor->index].leaf;
		KKASSERT(elm->base.btype == HAMMER_BTREE_TYPE_RECORD);
		cursor->flags |= HAMMER_CURSOR_ATEDISK;

		/*
		 * Certain records are not part of the mirroring operation
		 */
		if (hammer_mirror_nomirror(&elm->base)) {
			error = hammer_btree_iterate(cursor);
			continue;
		}

		/*
		 * Note: Must still delete records with create_tid < tid_beg,
		 * as record may have been pruned-away on source.
		 */
		if (elm->base.delete_tid == 0 &&
		    elm->base.create_tid < mirror->tid_end) {
			error = hammer_delete_at_cursor(cursor,
							HAMMER_DELETE_ADJUST,
							mirror->tid_end,
							time_second,
							1, NULL);
		}
		if (error == 0)
			error = hammer_btree_iterate(cursor);
	}
	/* ENOENT just means we reached key_end; not an error */
	if (error == ENOENT)
		error = 0;
	return(error);
}

/*
 * Check whether an update is needed in the case where a match already
 * exists on the target.  The only type of update allowed in this case
 * is an update of the delete_tid.
 *
 * Return non-zero if the update should proceed.
774 */ 775 static 776 int 777 hammer_mirror_check(hammer_cursor_t cursor, struct hammer_ioc_mrecord_rec *mrec) 778 { 779 hammer_btree_leaf_elm_t leaf = cursor->leaf; 780 781 if (leaf->base.delete_tid != mrec->leaf.base.delete_tid) { 782 if (mrec->leaf.base.delete_tid != 0) 783 return(1); 784 } 785 return(0); 786 } 787 788 /* 789 * Filter out records which are never mirrored, such as configuration space 790 * records (for hammer cleanup). 791 * 792 * NOTE: We currently allow HAMMER_RECTYPE_SNAPSHOT records to be mirrored. 793 */ 794 static 795 int 796 hammer_mirror_nomirror(hammer_base_elm_t base) 797 { 798 /* 799 * Certain types of records are never updated when mirroring. 800 * Slaves have their own configuration space. 801 */ 802 if (base->rec_type == HAMMER_RECTYPE_CONFIG) 803 return(1); 804 return(0); 805 } 806 807 808 /* 809 * Update a record in-place. Only the delete_tid can change, and 810 * only from zero to non-zero. 811 */ 812 static 813 int 814 hammer_mirror_update(hammer_cursor_t cursor, 815 struct hammer_ioc_mrecord_rec *mrec) 816 { 817 int error; 818 819 /* 820 * This case shouldn't occur. 821 */ 822 if (mrec->leaf.base.delete_tid == 0) 823 return(0); 824 825 /* 826 * Mark the record deleted on the mirror target. 827 */ 828 error = hammer_delete_at_cursor(cursor, HAMMER_DELETE_ADJUST, 829 mrec->leaf.base.delete_tid, 830 mrec->leaf.delete_ts, 831 1, NULL); 832 cursor->flags |= HAMMER_CURSOR_ATEDISK; 833 return(error); 834 } 835