/*
 * Copyright (c) 2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_mirror.c,v 1.17 2008/07/31 22:30:33 dillon Exp $
 */
/*
 * HAMMER mirroring ioctls - serialize and deserialize modifications made
 * to a filesystem.
 */

#include "hammer.h"

/*
 * Forward declarations for the local helpers used by the mirror-write
 * ioctl path below.
 */
static int hammer_mirror_check(hammer_cursor_t cursor,
		struct hammer_ioc_mrecord_rec *mrec);
static int hammer_mirror_update(hammer_cursor_t cursor,
		struct hammer_ioc_mrecord_rec *mrec);
static int hammer_ioc_mirror_write_rec(hammer_cursor_t cursor,
		struct hammer_ioc_mrecord_rec *mrec,
		struct hammer_ioc_mirror_rw *mirror,
		u_int32_t localization,
		char *uptr);
static int hammer_ioc_mirror_write_pass(hammer_cursor_t cursor,
		struct hammer_ioc_mrecord_rec *mrec,
		struct hammer_ioc_mirror_rw *mirror,
		u_int32_t localization);
static int hammer_ioc_mirror_write_skip(hammer_cursor_t cursor,
		struct hammer_ioc_mrecord_skip *mrec,
		struct hammer_ioc_mirror_rw *mirror,
		u_int32_t localization);
static int hammer_mirror_delete_to(hammer_cursor_t cursor,
		struct hammer_ioc_mirror_rw *mirror);
static int hammer_mirror_nomirror(struct hammer_base_elm *base);

/*
 * All B-Tree records within the specified key range which also conform
 * to the transaction id range are returned.  Mirroring code keeps track
 * of the last transaction id fully scanned and can efficiently pick up
 * where it left off if interrupted.
 *
 * The PFS is identified in the mirror structure.  The passed ip is just
 * some directory in the overall HAMMER filesystem and has nothing to
 * do with the PFS.
73 */ 74 int 75 hammer_ioc_mirror_read(hammer_transaction_t trans, hammer_inode_t ip, 76 struct hammer_ioc_mirror_rw *mirror) 77 { 78 struct hammer_cmirror cmirror; 79 struct hammer_cursor cursor; 80 union hammer_ioc_mrecord_any mrec; 81 hammer_btree_leaf_elm_t elm; 82 const int crc_start = HAMMER_MREC_CRCOFF; 83 char *uptr; 84 int error; 85 int data_len; 86 int bytes; 87 int eatdisk; 88 int mrec_flags; 89 u_int32_t localization; 90 u_int32_t rec_crc; 91 92 localization = (u_int32_t)mirror->pfs_id << 16; 93 94 if ((mirror->key_beg.localization | mirror->key_end.localization) & 95 HAMMER_LOCALIZE_PSEUDOFS_MASK) { 96 return(EINVAL); 97 } 98 if (hammer_btree_cmp(&mirror->key_beg, &mirror->key_end) > 0) 99 return(EINVAL); 100 101 mirror->key_cur = mirror->key_beg; 102 mirror->key_cur.localization &= HAMMER_LOCALIZE_MASK; 103 mirror->key_cur.localization += localization; 104 bzero(&mrec, sizeof(mrec)); 105 bzero(&cmirror, sizeof(cmirror)); 106 107 /* 108 * Make CRC errors non-fatal (at least on data), causing an EDOM 109 * error instead of EIO. 110 */ 111 trans->flags |= HAMMER_TRANSF_CRCDOM; 112 113 retry: 114 error = hammer_init_cursor(trans, &cursor, NULL, NULL); 115 if (error) { 116 hammer_done_cursor(&cursor); 117 goto failed; 118 } 119 cursor.key_beg = mirror->key_cur; 120 cursor.key_end = mirror->key_end; 121 cursor.key_end.localization &= HAMMER_LOCALIZE_MASK; 122 cursor.key_end.localization += localization; 123 124 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; 125 cursor.flags |= HAMMER_CURSOR_BACKEND; 126 127 /* 128 * This flag filters the search to only return elements whos create 129 * or delete TID is >= mirror_tid. The B-Tree uses the mirror_tid 130 * field stored with internal and leaf nodes to shortcut the scan. 
131 */ 132 cursor.flags |= HAMMER_CURSOR_MIRROR_FILTERED; 133 cursor.cmirror = &cmirror; 134 cmirror.mirror_tid = mirror->tid_beg; 135 136 error = hammer_btree_first(&cursor); 137 while (error == 0) { 138 /* 139 * Yield to more important tasks 140 */ 141 if (error == 0) { 142 error = hammer_signal_check(trans->hmp); 143 if (error) 144 break; 145 } 146 147 /* 148 * An internal node can be returned in mirror-filtered 149 * mode and indicates that the scan is returning a skip 150 * range in the cursor->cmirror structure. 151 */ 152 uptr = (char *)mirror->ubuf + mirror->count; 153 if (cursor.node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) { 154 /* 155 * Check space 156 */ 157 mirror->key_cur = cmirror.skip_beg; 158 bytes = sizeof(mrec.skip); 159 if (mirror->count + HAMMER_HEAD_DOALIGN(bytes) > 160 mirror->size) { 161 break; 162 } 163 164 /* 165 * Fill mrec 166 */ 167 mrec.head.signature = HAMMER_IOC_MIRROR_SIGNATURE; 168 mrec.head.type = HAMMER_MREC_TYPE_SKIP; 169 mrec.head.rec_size = bytes; 170 mrec.skip.skip_beg = cmirror.skip_beg; 171 mrec.skip.skip_end = cmirror.skip_end; 172 mrec.head.rec_crc = crc32(&mrec.head.rec_size, 173 bytes - crc_start); 174 error = copyout(&mrec, uptr, bytes); 175 eatdisk = 0; 176 goto didwrite; 177 } 178 179 /* 180 * Leaf node. In full-history mode we could filter out 181 * elements modified outside the user-requested TID range. 182 * 183 * However, such elements must be returned so the writer 184 * can compare them against the target to determine what 185 * needs to be deleted on the target, particular for 186 * no-history mirrors. 187 */ 188 KKASSERT(cursor.node->ondisk->type == HAMMER_BTREE_TYPE_LEAF); 189 elm = &cursor.node->ondisk->elms[cursor.index].leaf; 190 mirror->key_cur = elm->base; 191 192 /* 193 * If the record was created after our end point we just 194 * ignore it. 
195 */ 196 if (elm->base.create_tid > mirror->tid_end) { 197 error = 0; 198 bytes = 0; 199 eatdisk = 1; 200 goto didwrite; 201 } 202 203 /* 204 * Determine if we should generate a PASS or a REC. PASS 205 * records are records without any data payload. Such 206 * records will be generated if the target is already expected 207 * to have the record, allowing it to delete the gaps. 208 * 209 * A PASS record is also used to perform deletions on the 210 * target. 211 * 212 * Such deletions are needed if the master or files on the 213 * master are no-history, or if the slave is so far behind 214 * the master has already been pruned. 215 */ 216 if (elm->base.create_tid < mirror->tid_beg) { 217 bytes = sizeof(mrec.rec); 218 if (mirror->count + HAMMER_HEAD_DOALIGN(bytes) > 219 mirror->size) { 220 break; 221 } 222 223 /* 224 * Fill mrec. 225 */ 226 mrec.head.signature = HAMMER_IOC_MIRROR_SIGNATURE; 227 mrec.head.type = HAMMER_MREC_TYPE_PASS; 228 mrec.head.rec_size = bytes; 229 mrec.rec.leaf = *elm; 230 mrec.head.rec_crc = crc32(&mrec.head.rec_size, 231 bytes - crc_start); 232 error = copyout(&mrec, uptr, bytes); 233 eatdisk = 1; 234 goto didwrite; 235 236 } 237 238 /* 239 * The core code exports the data to userland. 240 * 241 * CRC errors on data are reported but passed through, 242 * but the data must be washed by the user program. 243 */ 244 mrec_flags = 0; 245 data_len = (elm->data_offset) ? elm->data_len : 0; 246 if (data_len) { 247 error = hammer_btree_extract(&cursor, 248 HAMMER_CURSOR_GET_DATA); 249 if (error) { 250 if (error != EDOM) 251 break; 252 mrec_flags |= HAMMER_MRECF_CRC_ERROR | 253 HAMMER_MRECF_DATA_CRC_BAD; 254 } 255 } 256 257 bytes = sizeof(mrec.rec) + data_len; 258 if (mirror->count + HAMMER_HEAD_DOALIGN(bytes) > mirror->size) 259 break; 260 261 /* 262 * Construct the record for userland and copyout. 
263 * 264 * The user is asking for a snapshot, if the record was 265 * deleted beyond the user-requested ending tid, the record 266 * is not considered deleted from the point of view of 267 * userland and delete_tid is cleared. 268 */ 269 mrec.head.signature = HAMMER_IOC_MIRROR_SIGNATURE; 270 mrec.head.type = HAMMER_MREC_TYPE_REC | mrec_flags; 271 mrec.head.rec_size = bytes; 272 mrec.rec.leaf = *elm; 273 274 if (elm->base.delete_tid > mirror->tid_end) 275 mrec.rec.leaf.base.delete_tid = 0; 276 rec_crc = crc32(&mrec.head.rec_size, 277 sizeof(mrec.rec) - crc_start); 278 if (data_len) 279 rec_crc = crc32_ext(cursor.data, data_len, rec_crc); 280 mrec.head.rec_crc = rec_crc; 281 error = copyout(&mrec, uptr, sizeof(mrec.rec)); 282 if (data_len && error == 0) { 283 error = copyout(cursor.data, uptr + sizeof(mrec.rec), 284 data_len); 285 } 286 eatdisk = 1; 287 288 /* 289 * eatdisk controls whether we skip the current cursor 290 * position on the next scan or not. If doing a SKIP 291 * the cursor is already positioned properly for the next 292 * scan and eatdisk will be 0. 293 */ 294 didwrite: 295 if (error == 0) { 296 mirror->count += HAMMER_HEAD_DOALIGN(bytes); 297 if (eatdisk) 298 cursor.flags |= HAMMER_CURSOR_ATEDISK; 299 else 300 cursor.flags &= ~HAMMER_CURSOR_ATEDISK; 301 error = hammer_btree_iterate(&cursor); 302 } 303 } 304 if (error == ENOENT) { 305 mirror->key_cur = mirror->key_end; 306 error = 0; 307 } 308 hammer_done_cursor(&cursor); 309 if (error == EDEADLK) 310 goto retry; 311 if (error == EINTR) { 312 mirror->head.flags |= HAMMER_IOC_HEAD_INTR; 313 error = 0; 314 } 315 failed: 316 mirror->key_cur.localization &= HAMMER_LOCALIZE_MASK; 317 return(error); 318 } 319 320 /* 321 * Copy records from userland to the target mirror. 322 * 323 * The PFS is identified in the mirror structure. The passed ip is just 324 * some directory in the overall HAMMER filesystem and has nothing to 325 * do with the PFS. 
In fact, there might not even be a root directory for 326 * the PFS yet! 327 */ 328 int 329 hammer_ioc_mirror_write(hammer_transaction_t trans, hammer_inode_t ip, 330 struct hammer_ioc_mirror_rw *mirror) 331 { 332 union hammer_ioc_mrecord_any mrec; 333 struct hammer_cursor cursor; 334 u_int32_t localization; 335 int checkspace_count = 0; 336 int error; 337 int bytes; 338 char *uptr; 339 int seq; 340 341 localization = (u_int32_t)mirror->pfs_id << 16; 342 seq = trans->hmp->flusher.act; 343 344 /* 345 * Validate the mirror structure and relocalize the tracking keys. 346 */ 347 if (mirror->size < 0 || mirror->size > 0x70000000) 348 return(EINVAL); 349 mirror->key_beg.localization &= HAMMER_LOCALIZE_MASK; 350 mirror->key_beg.localization += localization; 351 mirror->key_end.localization &= HAMMER_LOCALIZE_MASK; 352 mirror->key_end.localization += localization; 353 mirror->key_cur.localization &= HAMMER_LOCALIZE_MASK; 354 mirror->key_cur.localization += localization; 355 356 /* 357 * Set up our tracking cursor for the loop. The tracking cursor 358 * is used to delete records that are no longer present on the 359 * master. The last handled record at key_cur must be skipped. 360 */ 361 error = hammer_init_cursor(trans, &cursor, NULL, NULL); 362 363 cursor.key_beg = mirror->key_cur; 364 cursor.key_end = mirror->key_end; 365 cursor.flags |= HAMMER_CURSOR_BACKEND; 366 error = hammer_btree_first(&cursor); 367 if (error == 0) 368 cursor.flags |= HAMMER_CURSOR_ATEDISK; 369 if (error == ENOENT) 370 error = 0; 371 372 /* 373 * Loop until our input buffer has been exhausted. 374 */ 375 while (error == 0 && 376 mirror->count + sizeof(mrec.head) <= mirror->size) { 377 378 /* 379 * Don't blow out the buffer cache. Leave room for frontend 380 * cache as well. 381 * 382 * WARNING: See warnings in hammer_unlock_cursor() function. 
383 */ 384 while (hammer_flusher_meta_halflimit(trans->hmp) || 385 hammer_flusher_undo_exhausted(trans, 2)) { 386 hammer_unlock_cursor(&cursor); 387 hammer_flusher_wait(trans->hmp, seq); 388 hammer_lock_cursor(&cursor); 389 seq = hammer_flusher_async_one(trans->hmp); 390 } 391 392 /* 393 * If there is insufficient free space it may be due to 394 * reserved bigblocks, which flushing might fix. 395 */ 396 if (hammer_checkspace(trans->hmp, HAMMER_CHKSPC_MIRROR)) { 397 if (++checkspace_count == 10) { 398 error = ENOSPC; 399 break; 400 } 401 hammer_unlock_cursor(&cursor); 402 hammer_flusher_wait(trans->hmp, seq); 403 hammer_lock_cursor(&cursor); 404 seq = hammer_flusher_async(trans->hmp, NULL); 405 } 406 407 408 /* 409 * Acquire and validate header 410 */ 411 if ((bytes = mirror->size - mirror->count) > sizeof(mrec)) 412 bytes = sizeof(mrec); 413 uptr = (char *)mirror->ubuf + mirror->count; 414 error = copyin(uptr, &mrec, bytes); 415 if (error) 416 break; 417 if (mrec.head.signature != HAMMER_IOC_MIRROR_SIGNATURE) { 418 error = EINVAL; 419 break; 420 } 421 if (mrec.head.rec_size < sizeof(mrec.head) || 422 mrec.head.rec_size > sizeof(mrec) + HAMMER_XBUFSIZE || 423 mirror->count + mrec.head.rec_size > mirror->size) { 424 error = EINVAL; 425 break; 426 } 427 428 switch(mrec.head.type & HAMMER_MRECF_TYPE_MASK) { 429 case HAMMER_MREC_TYPE_SKIP: 430 if (mrec.head.rec_size != sizeof(mrec.skip)) 431 error = EINVAL; 432 if (error == 0) 433 error = hammer_ioc_mirror_write_skip(&cursor, &mrec.skip, mirror, localization); 434 break; 435 case HAMMER_MREC_TYPE_REC: 436 if (mrec.head.rec_size < sizeof(mrec.rec)) 437 error = EINVAL; 438 if (error == 0) 439 error = hammer_ioc_mirror_write_rec(&cursor, &mrec.rec, mirror, localization, uptr + sizeof(mrec.rec)); 440 break; 441 case HAMMER_MREC_TYPE_REC_BADCRC: 442 /* 443 * Records with bad data payloads are ignored XXX. 
444 */ 445 if (mrec.head.rec_size < sizeof(mrec.rec)) 446 error = EINVAL; 447 break; 448 case HAMMER_MREC_TYPE_PASS: 449 if (mrec.head.rec_size != sizeof(mrec.rec)) 450 error = EINVAL; 451 if (error == 0) 452 error = hammer_ioc_mirror_write_pass(&cursor, &mrec.rec, mirror, localization); 453 break; 454 default: 455 error = EINVAL; 456 break; 457 } 458 459 /* 460 * Retry the current record on deadlock, otherwise setup 461 * for the next loop. 462 */ 463 if (error == EDEADLK) { 464 while (error == EDEADLK) { 465 hammer_recover_cursor(&cursor); 466 error = hammer_cursor_upgrade(&cursor); 467 } 468 } else { 469 if (error == EALREADY) 470 error = 0; 471 if (error == 0) { 472 mirror->count += 473 HAMMER_HEAD_DOALIGN(mrec.head.rec_size); 474 } 475 } 476 } 477 hammer_done_cursor(&cursor); 478 479 /* 480 * cumulative error 481 */ 482 if (error) { 483 mirror->head.flags |= HAMMER_IOC_HEAD_ERROR; 484 mirror->head.error = error; 485 } 486 487 /* 488 * ioctls don't update the RW data structure if an error is returned, 489 * always return 0. 490 */ 491 return(0); 492 } 493 494 /* 495 * Handle skip records. 496 * 497 * We must iterate from the last resolved record position at mirror->key_cur 498 * to skip_beg non-inclusive and delete any records encountered. 499 * 500 * mirror->key_cur must be carefully set when we succeed in processing 501 * this mrec. 502 */ 503 static int 504 hammer_ioc_mirror_write_skip(hammer_cursor_t cursor, 505 struct hammer_ioc_mrecord_skip *mrec, 506 struct hammer_ioc_mirror_rw *mirror, 507 u_int32_t localization) 508 { 509 int error; 510 511 /* 512 * Relocalize the skip range 513 */ 514 mrec->skip_beg.localization &= HAMMER_LOCALIZE_MASK; 515 mrec->skip_beg.localization += localization; 516 mrec->skip_end.localization &= HAMMER_LOCALIZE_MASK; 517 mrec->skip_end.localization += localization; 518 519 /* 520 * Iterate from current position to skip_beg, deleting any records 521 * we encounter. The record at skip_beg is not included (it is 522 * skipped). 
523 */ 524 cursor->key_end = mrec->skip_beg; 525 cursor->flags &= ~HAMMER_CURSOR_END_INCLUSIVE; 526 cursor->flags |= HAMMER_CURSOR_BACKEND; 527 error = hammer_mirror_delete_to(cursor, mirror); 528 529 /* 530 * Now skip past the skip (which is the whole point point of 531 * having a skip record). The sender has not sent us any records 532 * for the skip area so we wouldn't know what to keep and what 533 * to delete anyway. 534 * 535 * Clear ATEDISK because skip_end is non-inclusive, so we can't 536 * count an exact match if we happened to get one. 537 */ 538 if (error == 0) { 539 mirror->key_cur = mrec->skip_end; 540 cursor->key_beg = mrec->skip_end; 541 error = hammer_btree_lookup(cursor); 542 cursor->flags &= ~HAMMER_CURSOR_ATEDISK; 543 if (error == ENOENT) 544 error = 0; 545 } 546 return(error); 547 } 548 549 /* 550 * Handle B-Tree records. 551 * 552 * We must iterate to mrec->base.key (non-inclusively), and then process 553 * the record. We are allowed to write a new record or delete an existing 554 * record, but cannot replace an existing record. 555 * 556 * mirror->key_cur must be carefully set when we succeed in processing 557 * this mrec. 558 */ 559 static int 560 hammer_ioc_mirror_write_rec(hammer_cursor_t cursor, 561 struct hammer_ioc_mrecord_rec *mrec, 562 struct hammer_ioc_mirror_rw *mirror, 563 u_int32_t localization, 564 char *uptr) 565 { 566 hammer_transaction_t trans; 567 u_int32_t rec_crc; 568 int error; 569 570 trans = cursor->trans; 571 rec_crc = crc32(mrec, sizeof(*mrec)); 572 573 if (mrec->leaf.data_len < 0 || 574 mrec->leaf.data_len > HAMMER_XBUFSIZE || 575 mrec->leaf.data_len + sizeof(*mrec) > mrec->head.rec_size) { 576 return(EINVAL); 577 } 578 579 /* 580 * Re-localize for target. relocalization of data is handled 581 * by hammer_mirror_write(). 
582 */ 583 mrec->leaf.base.localization &= HAMMER_LOCALIZE_MASK; 584 mrec->leaf.base.localization += localization; 585 586 /* 587 * Delete records through until we reach (non-inclusively) the 588 * target record. 589 */ 590 cursor->key_end = mrec->leaf.base; 591 cursor->flags &= ~HAMMER_CURSOR_END_INCLUSIVE; 592 cursor->flags |= HAMMER_CURSOR_BACKEND; 593 error = hammer_mirror_delete_to(cursor, mirror); 594 595 /* 596 * Certain records are not part of the mirroring operation 597 */ 598 if (hammer_mirror_nomirror(&mrec->leaf.base)) 599 return(0); 600 601 /* 602 * Locate the record. 603 * 604 * If the record exists only the delete_tid may be updated. 605 * 606 * If the record does not exist we can create it only if the 607 * create_tid is not too old. If the create_tid is too old 608 * it may have already been destroyed on the slave from pruning. 609 * 610 * Note that mirror operations are effectively as-of operations 611 * and delete_tid can be 0 for mirroring purposes even if it is 612 * not actually 0 at the originator. 613 * 614 * These functions can return EDEADLK 615 */ 616 cursor->key_beg = mrec->leaf.base; 617 cursor->flags |= HAMMER_CURSOR_BACKEND; 618 cursor->flags &= ~HAMMER_CURSOR_INSERT; 619 error = hammer_btree_lookup(cursor); 620 621 if (error == 0 && hammer_mirror_check(cursor, mrec)) { 622 error = hammer_mirror_update(cursor, mrec); 623 } else if (error == ENOENT) { 624 if (mrec->leaf.base.create_tid >= mirror->tid_beg) { 625 error = hammer_create_at_cursor( 626 cursor, &mrec->leaf, 627 uptr, HAMMER_CREATE_MODE_UMIRROR); 628 } else { 629 error = 0; 630 } 631 } 632 if (error == 0 || error == EALREADY) 633 mirror->key_cur = mrec->leaf.base; 634 return(error); 635 } 636 637 /* 638 * This works like write_rec but no write or update is necessary, 639 * and no data payload is included so we couldn't do a write even 640 * if we wanted to. 
 *
 * We must still iterate for deletions, and we can validate the
 * record header which is a good way to test for corrupted mirror
 * targets XXX.
 *
 * mirror->key_cur must be carefully set when we succeed in processing
 * this mrec.
 */
static
int
hammer_ioc_mirror_write_pass(hammer_cursor_t cursor,
			     struct hammer_ioc_mrecord_rec *mrec,
			     struct hammer_ioc_mirror_rw *mirror,
			     u_int32_t localization)
{
	hammer_transaction_t trans;
	u_int32_t rec_crc;
	int error;

	trans = cursor->trans;
	/*
	 * NOTE(review): rec_crc is computed but never compared against
	 * mrec->head.rec_crc, and trans is otherwise unused here -- the
	 * record header CRC does not appear to actually be verified.
	 * Confirm whether validation was intended.
	 */
	rec_crc = crc32(mrec, sizeof(*mrec));

	/*
	 * Re-localize for target.  Relocalization of data is handled
	 * by hammer_mirror_write().
	 */
	mrec->leaf.base.localization &= HAMMER_LOCALIZE_MASK;
	mrec->leaf.base.localization += localization;

	/*
	 * Delete records through until we reach (non-inclusively) the
	 * target record.
	 */
	cursor->key_end = mrec->leaf.base;
	cursor->flags &= ~HAMMER_CURSOR_END_INCLUSIVE;
	cursor->flags |= HAMMER_CURSOR_BACKEND;
	error = hammer_mirror_delete_to(cursor, mirror);

	/*
	 * Certain records are not part of the mirroring operation
	 */
	if (hammer_mirror_nomirror(&mrec->leaf.base))
		return(0);

	/*
	 * Locate the record and get past it by setting ATEDISK.  Perform
	 * any necessary deletions.  We have no data payload and cannot
	 * create a new record.
	 */
	if (error == 0) {
		mirror->key_cur = mrec->leaf.base;
		cursor->key_beg = mrec->leaf.base;
		cursor->flags |= HAMMER_CURSOR_BACKEND;
		cursor->flags &= ~HAMMER_CURSOR_INSERT;
		error = hammer_btree_lookup(cursor);
		if (error == 0) {
			/* exact match: at most adjust delete_tid */
			if (hammer_mirror_check(cursor, mrec))
				error = hammer_mirror_update(cursor, mrec);
			cursor->flags |= HAMMER_CURSOR_ATEDISK;
		} else {
			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
		}
		/* a missing record on the target is not an error for PASS */
		if (error == ENOENT)
			error = 0;
	}
	return(error);
}

/*
 * As part of the mirror write we iterate across swaths of records
 * on the target which no longer exist on the source, and mark them
 * deleted.
 *
 * The caller has indexed the cursor and set up key_end.  We iterate
 * through to key_end.
 *
 * There is an edge case where the master has deleted a record whose
 * create_tid exactly matches our end_tid.  We cannot delete this
 * record on the slave yet because we cannot assign delete_tid == create_tid.
 * The deletion should be picked up on the next sequence since in order
 * to have been deleted on the master a transaction must have occurred with
 * a TID greater than the create_tid of the record.
 *
 * To support incremental re-mirroring, just for robustness, we do not
 * touch any records created beyond (or equal to) mirror->tid_end.
 */
static
int
hammer_mirror_delete_to(hammer_cursor_t cursor,
			struct hammer_ioc_mirror_rw *mirror)
{
	hammer_btree_leaf_elm_t elm;
	int error;

	error = hammer_btree_iterate(cursor);
	while (error == 0) {
		elm = &cursor->node->ondisk->elms[cursor->index].leaf;
		KKASSERT(elm->base.btype == HAMMER_BTREE_TYPE_RECORD);
		cursor->flags |= HAMMER_CURSOR_ATEDISK;

		/*
		 * Certain records are not part of the mirroring operation
		 */
		if (hammer_mirror_nomirror(&elm->base)) {
			error = hammer_btree_iterate(cursor);
			continue;
		}

		/*
		 * Note: Must still delete records with create_tid < tid_beg,
		 * as record may have been pruned-away on source.
		 */
		if (elm->base.delete_tid == 0 &&
		    elm->base.create_tid < mirror->tid_end) {
			error = hammer_delete_at_cursor(cursor,
							HAMMER_DELETE_ADJUST,
							mirror->tid_end,
							time_second,
							1, NULL);
		}
		if (error == 0)
			error = hammer_btree_iterate(cursor);
	}
	/* running off the end of the range is the normal termination */
	if (error == ENOENT)
		error = 0;
	return(error);
}

/*
 * Check whether an update is needed in the case where a match already
 * exists on the target.  The only type of update allowed in this case
 * is an update of the delete_tid.
 *
 * Return non-zero if the update should proceed.
 */
static
int
hammer_mirror_check(hammer_cursor_t cursor, struct hammer_ioc_mrecord_rec *mrec)
{
	hammer_btree_leaf_elm_t leaf = cursor->leaf;

	/* update only when the source supplies a non-zero delete_tid */
	if (leaf->base.delete_tid != mrec->leaf.base.delete_tid) {
		if (mrec->leaf.base.delete_tid != 0)
			return(1);
	}
	return(0);
}

/*
 * Filter out records which are never mirrored, such as configuration space
 * records (for hammer cleanup).
 *
 * NOTE: We currently allow HAMMER_RECTYPE_SNAPSHOT records to be mirrored.
 */
static
int
hammer_mirror_nomirror(struct hammer_base_elm *base)
{
	/*
	 * Certain types of records are never updated when mirroring.
	 * Slaves have their own configuration space.
	 */
	if (base->rec_type == HAMMER_RECTYPE_CONFIG)
		return(1);
	return(0);
}


/*
 * Update a record in-place.  Only the delete_tid can change, and
 * only from zero to non-zero.
 */
static
int
hammer_mirror_update(hammer_cursor_t cursor,
		     struct hammer_ioc_mrecord_rec *mrec)
{
	int error;

	/*
	 * This case shouldn't occur (hammer_mirror_check() rejects a
	 * zero delete_tid before calling us).
	 */
	if (mrec->leaf.base.delete_tid == 0)
		return(0);

	/*
	 * Mark the record deleted on the mirror target.
	 */
	error = hammer_delete_at_cursor(cursor, HAMMER_DELETE_ADJUST,
					mrec->leaf.base.delete_tid,
					mrec->leaf.delete_ts,
					1, NULL);
	cursor->flags |= HAMMER_CURSOR_ATEDISK;
	return(error);
}

#if 0
/*
 * MOVED TO HAMMER_OBJECT.C: hammer_create_at_cursor()
 */

static int hammer_mirror_localize_data(hammer_data_ondisk_t data,
			hammer_btree_leaf_elm_t leaf);

/*
 * Write out a new record.
 *
 * (This entire region is compiled out via #if 0 -- the functionality
 * was moved to hammer_create_at_cursor() in hammer_object.c; it is
 * retained here for reference only.)
 */
static
int
hammer_mirror_write(hammer_cursor_t cursor,
		    struct hammer_ioc_mrecord_rec *mrec,
		    char *udata)
{
	hammer_transaction_t trans;
	hammer_buffer_t data_buffer;
	hammer_off_t ndata_offset;
	hammer_tid_t high_tid;
	void *ndata;
	int error;
	int doprop;

	trans = cursor->trans;
	data_buffer = NULL;

	/*
	 * Get the sync lock so the whole mess is atomic
	 */
	hammer_sync_lock_sh(trans);

	/*
	 * Allocate and adjust data
	 */
	if (mrec->leaf.data_len && mrec->leaf.data_offset) {
		ndata = hammer_alloc_data(trans, mrec->leaf.data_len,
					  mrec->leaf.base.rec_type,
					  &ndata_offset, &data_buffer,
					  0, &error);
		/*
		 * NOTE(review): this early return leaves the sync lock
		 * held (no hammer_sync_unlock()).  Dead code under #if 0,
		 * but would be a lock leak if ever re-enabled.
		 */
		if (ndata == NULL)
			return(error);
		mrec->leaf.data_offset = ndata_offset;
		hammer_modify_buffer(trans, data_buffer, NULL, 0);
		error = copyin(udata, ndata, mrec->leaf.data_len);
		if (error == 0) {
			if (hammer_crc_test_leaf(ndata, &mrec->leaf) == 0) {
				kprintf("data crc mismatch on pipe\n");
				error = EINVAL;
			} else {
				error = hammer_mirror_localize_data(
						ndata, &mrec->leaf);
			}
		}
		hammer_modify_buffer_done(data_buffer);
	} else {
		mrec->leaf.data_offset = 0;
		error = 0;
		ndata = NULL;
	}
	if (error)
		goto failed;

	/*
	 * Do the insertion.  This can fail with a EDEADLK or EALREADY
	 */
	cursor->flags |= HAMMER_CURSOR_INSERT;
	error = hammer_btree_lookup(cursor);
	if (error != ENOENT) {
		if (error == 0)
			error = EALREADY;
		goto failed;
	}

	error = hammer_btree_insert(cursor, &mrec->leaf, &doprop);

	/*
	 * Cursor is left on the current element, we want to skip it now.
	 */
	cursor->flags |= HAMMER_CURSOR_ATEDISK;
	cursor->flags &= ~HAMMER_CURSOR_INSERT;

	/*
	 * Track a count of active inodes.
	 */
	if (error == 0 &&
	    mrec->leaf.base.rec_type == HAMMER_RECTYPE_INODE &&
	    mrec->leaf.base.delete_tid == 0) {
		hammer_modify_volume_field(trans,
					   trans->rootvol,
					   vol0_stat_inodes);
		++trans->hmp->rootvol->ondisk->vol0_stat_inodes;
		hammer_modify_volume_done(trans->rootvol);
	}

	/*
	 * vol0_next_tid must track the highest TID stored in the filesystem.
	 * We do not need to generate undo for this update.
	 */
	high_tid = mrec->leaf.base.create_tid;
	if (high_tid < mrec->leaf.base.delete_tid)
		high_tid = mrec->leaf.base.delete_tid;
	if (trans->rootvol->ondisk->vol0_next_tid < high_tid) {
		hammer_modify_volume(trans, trans->rootvol, NULL, 0);
		trans->rootvol->ondisk->vol0_next_tid = high_tid;
		hammer_modify_volume_done(trans->rootvol);
	}

	/*
	 * WARNING!  cursor's leaf pointer may have changed after
	 * 	     do_propagation returns.
	 */
	if (error == 0 && doprop)
		hammer_btree_do_propagation(cursor, NULL, &mrec->leaf);

failed:
	/*
	 * Cleanup
	 */
	if (error && mrec->leaf.data_offset) {
		hammer_blockmap_free(cursor->trans,
				     mrec->leaf.data_offset,
				     mrec->leaf.data_len);
	}
	hammer_sync_unlock(trans);
	if (data_buffer)
		hammer_rel_buffer(data_buffer, 0);
	return(error);
}

/*
 * Localize the data payload.  Directory entries may need their
 * localization adjusted.
 */
static
int
hammer_mirror_localize_data(hammer_data_ondisk_t data,
			    hammer_btree_leaf_elm_t leaf)
{
	u_int32_t localization;

	if (leaf->base.rec_type == HAMMER_RECTYPE_DIRENTRY) {
		localization = leaf->base.localization &
			       HAMMER_LOCALIZE_PSEUDOFS_MASK;
		if (data->entry.localization != localization) {
			data->entry.localization = localization;
			/* re-seal the leaf CRC after rewriting the payload */
			hammer_crc_set_leaf(data, leaf);
		}
	}
	return(0);
}

#endif