1 /* 2 * Copyright (c) 2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * $DragonFly: src/sys/vfs/hammer/hammer_mirror.c,v 1.17 2008/07/31 22:30:33 dillon Exp $ 35 */ 36 /* 37 * HAMMER mirroring ioctls - serialize and deserialize modifications made 38 * to a filesystem. 
39 */ 40 41 #include "hammer.h" 42 43 static int hammer_mirror_check(hammer_cursor_t cursor, 44 struct hammer_ioc_mrecord_rec *mrec); 45 static int hammer_mirror_update(hammer_cursor_t cursor, 46 struct hammer_ioc_mrecord_rec *mrec); 47 static int hammer_ioc_mirror_write_rec(hammer_cursor_t cursor, 48 struct hammer_ioc_mrecord_rec *mrec, 49 struct hammer_ioc_mirror_rw *mirror, 50 u_int32_t localization, 51 char *uptr); 52 static int hammer_ioc_mirror_write_pass(hammer_cursor_t cursor, 53 struct hammer_ioc_mrecord_rec *mrec, 54 struct hammer_ioc_mirror_rw *mirror, 55 u_int32_t localization); 56 static int hammer_ioc_mirror_write_skip(hammer_cursor_t cursor, 57 struct hammer_ioc_mrecord_skip *mrec, 58 struct hammer_ioc_mirror_rw *mirror, 59 u_int32_t localization); 60 static int hammer_mirror_delete_to(hammer_cursor_t cursor, 61 struct hammer_ioc_mirror_rw *mirror); 62 static int hammer_mirror_nomirror(struct hammer_base_elm *base); 63 64 /* 65 * All B-Tree records within the specified key range which also conform 66 * to the transaction id range are returned. Mirroring code keeps track 67 * of the last transaction id fully scanned and can efficiently pick up 68 * where it left off if interrupted. 69 * 70 * The PFS is identified in the mirror structure. The passed ip is just 71 * some directory in the overall HAMMER filesystem and has nothing to 72 * do with the PFS. 
73 */ 74 int 75 hammer_ioc_mirror_read(hammer_transaction_t trans, hammer_inode_t ip, 76 struct hammer_ioc_mirror_rw *mirror) 77 { 78 struct hammer_cmirror cmirror; 79 struct hammer_cursor cursor; 80 union hammer_ioc_mrecord_any mrec; 81 hammer_btree_leaf_elm_t elm; 82 const int crc_start = HAMMER_MREC_CRCOFF; 83 char *uptr; 84 int error; 85 int data_len; 86 int bytes; 87 int eatdisk; 88 int mrec_flags; 89 u_int32_t localization; 90 u_int32_t rec_crc; 91 92 localization = (u_int32_t)mirror->pfs_id << 16; 93 94 if ((mirror->key_beg.localization | mirror->key_end.localization) & 95 HAMMER_LOCALIZE_PSEUDOFS_MASK) { 96 return(EINVAL); 97 } 98 if (hammer_btree_cmp(&mirror->key_beg, &mirror->key_end) > 0) 99 return(EINVAL); 100 101 mirror->key_cur = mirror->key_beg; 102 mirror->key_cur.localization &= HAMMER_LOCALIZE_MASK; 103 mirror->key_cur.localization += localization; 104 bzero(&mrec, sizeof(mrec)); 105 bzero(&cmirror, sizeof(cmirror)); 106 107 /* 108 * Make CRC errors non-fatal (at least on data), causing an EDOM 109 * error instead of EIO. 110 */ 111 trans->flags |= HAMMER_TRANSF_CRCDOM; 112 113 retry: 114 error = hammer_init_cursor(trans, &cursor, NULL, NULL); 115 if (error) { 116 hammer_done_cursor(&cursor); 117 goto failed; 118 } 119 cursor.key_beg = mirror->key_cur; 120 cursor.key_end = mirror->key_end; 121 cursor.key_end.localization &= HAMMER_LOCALIZE_MASK; 122 cursor.key_end.localization += localization; 123 124 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; 125 cursor.flags |= HAMMER_CURSOR_BACKEND; 126 127 /* 128 * This flag filters the search to only return elements whos create 129 * or delete TID is >= mirror_tid. The B-Tree uses the mirror_tid 130 * field stored with internal and leaf nodes to shortcut the scan. 
131 */ 132 cursor.flags |= HAMMER_CURSOR_MIRROR_FILTERED; 133 cursor.cmirror = &cmirror; 134 cmirror.mirror_tid = mirror->tid_beg; 135 136 error = hammer_btree_first(&cursor); 137 while (error == 0) { 138 /* 139 * Yield to more important tasks 140 */ 141 if (error == 0) { 142 error = hammer_signal_check(trans->hmp); 143 if (error) 144 break; 145 } 146 147 /* 148 * An internal node can be returned in mirror-filtered 149 * mode and indicates that the scan is returning a skip 150 * range in the cursor->cmirror structure. 151 */ 152 uptr = (char *)mirror->ubuf + mirror->count; 153 if (cursor.node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) { 154 /* 155 * Check space 156 */ 157 mirror->key_cur = cmirror.skip_beg; 158 bytes = sizeof(mrec.skip); 159 if (mirror->count + HAMMER_HEAD_DOALIGN(bytes) > 160 mirror->size) { 161 break; 162 } 163 164 /* 165 * Fill mrec 166 */ 167 mrec.head.signature = HAMMER_IOC_MIRROR_SIGNATURE; 168 mrec.head.type = HAMMER_MREC_TYPE_SKIP; 169 mrec.head.rec_size = bytes; 170 mrec.skip.skip_beg = cmirror.skip_beg; 171 mrec.skip.skip_end = cmirror.skip_end; 172 mrec.head.rec_crc = crc32(&mrec.head.rec_size, 173 bytes - crc_start); 174 error = copyout(&mrec, uptr, bytes); 175 eatdisk = 0; 176 goto didwrite; 177 } 178 179 /* 180 * Leaf node. In full-history mode we could filter out 181 * elements modified outside the user-requested TID range. 182 * 183 * However, such elements must be returned so the writer 184 * can compare them against the target to determine what 185 * needs to be deleted on the target, particular for 186 * no-history mirrors. 187 */ 188 KKASSERT(cursor.node->ondisk->type == HAMMER_BTREE_TYPE_LEAF); 189 elm = &cursor.node->ondisk->elms[cursor.index].leaf; 190 mirror->key_cur = elm->base; 191 192 /* 193 * If the record was created after our end point we just 194 * ignore it. 
195 */ 196 if (elm->base.create_tid > mirror->tid_end) { 197 error = 0; 198 bytes = 0; 199 eatdisk = 1; 200 goto didwrite; 201 } 202 203 /* 204 * Determine if we should generate a PASS or a REC. PASS 205 * records are records without any data payload. Such 206 * records will be generated if the target is already expected 207 * to have the record, allowing it to delete the gaps. 208 * 209 * A PASS record is also used to perform deletions on the 210 * target. 211 * 212 * Such deletions are needed if the master or files on the 213 * master are no-history, or if the slave is so far behind 214 * the master has already been pruned. 215 */ 216 if (elm->base.create_tid < mirror->tid_beg) { 217 bytes = sizeof(mrec.rec); 218 if (mirror->count + HAMMER_HEAD_DOALIGN(bytes) > 219 mirror->size) { 220 break; 221 } 222 223 /* 224 * Fill mrec. 225 */ 226 mrec.head.signature = HAMMER_IOC_MIRROR_SIGNATURE; 227 mrec.head.type = HAMMER_MREC_TYPE_PASS; 228 mrec.head.rec_size = bytes; 229 mrec.rec.leaf = *elm; 230 mrec.head.rec_crc = crc32(&mrec.head.rec_size, 231 bytes - crc_start); 232 error = copyout(&mrec, uptr, bytes); 233 eatdisk = 1; 234 goto didwrite; 235 236 } 237 238 /* 239 * The core code exports the data to userland. 240 * 241 * CRC errors on data are reported but passed through, 242 * but the data must be washed by the user program. 243 * 244 * If userland just wants the btree records it can 245 * request that bulk data not be returned. This is 246 * use during mirror-stream histogram generation. 247 */ 248 mrec_flags = 0; 249 data_len = (elm->data_offset) ? 
elm->data_len : 0; 250 if (data_len && 251 (mirror->head.flags & HAMMER_IOC_MIRROR_NODATA)) { 252 data_len = 0; 253 mrec_flags |= HAMMER_MRECF_NODATA; 254 } 255 if (data_len) { 256 error = hammer_btree_extract(&cursor, 257 HAMMER_CURSOR_GET_DATA); 258 if (error) { 259 if (error != EDOM) 260 break; 261 mrec_flags |= HAMMER_MRECF_CRC_ERROR | 262 HAMMER_MRECF_DATA_CRC_BAD; 263 } 264 } 265 266 bytes = sizeof(mrec.rec) + data_len; 267 if (mirror->count + HAMMER_HEAD_DOALIGN(bytes) > mirror->size) 268 break; 269 270 /* 271 * Construct the record for userland and copyout. 272 * 273 * The user is asking for a snapshot, if the record was 274 * deleted beyond the user-requested ending tid, the record 275 * is not considered deleted from the point of view of 276 * userland and delete_tid is cleared. 277 */ 278 mrec.head.signature = HAMMER_IOC_MIRROR_SIGNATURE; 279 mrec.head.type = HAMMER_MREC_TYPE_REC | mrec_flags; 280 mrec.head.rec_size = bytes; 281 mrec.rec.leaf = *elm; 282 283 if (elm->base.delete_tid > mirror->tid_end) 284 mrec.rec.leaf.base.delete_tid = 0; 285 rec_crc = crc32(&mrec.head.rec_size, 286 sizeof(mrec.rec) - crc_start); 287 if (data_len) 288 rec_crc = crc32_ext(cursor.data, data_len, rec_crc); 289 mrec.head.rec_crc = rec_crc; 290 error = copyout(&mrec, uptr, sizeof(mrec.rec)); 291 if (data_len && error == 0) { 292 error = copyout(cursor.data, uptr + sizeof(mrec.rec), 293 data_len); 294 } 295 eatdisk = 1; 296 297 /* 298 * eatdisk controls whether we skip the current cursor 299 * position on the next scan or not. If doing a SKIP 300 * the cursor is already positioned properly for the next 301 * scan and eatdisk will be 0. 
302 */ 303 didwrite: 304 if (error == 0) { 305 mirror->count += HAMMER_HEAD_DOALIGN(bytes); 306 if (eatdisk) 307 cursor.flags |= HAMMER_CURSOR_ATEDISK; 308 else 309 cursor.flags &= ~HAMMER_CURSOR_ATEDISK; 310 error = hammer_btree_iterate(&cursor); 311 } 312 } 313 if (error == ENOENT) { 314 mirror->key_cur = mirror->key_end; 315 error = 0; 316 } 317 hammer_done_cursor(&cursor); 318 if (error == EDEADLK) 319 goto retry; 320 if (error == EINTR) { 321 mirror->head.flags |= HAMMER_IOC_HEAD_INTR; 322 error = 0; 323 } 324 failed: 325 mirror->key_cur.localization &= HAMMER_LOCALIZE_MASK; 326 return(error); 327 } 328 329 /* 330 * Copy records from userland to the target mirror. 331 * 332 * The PFS is identified in the mirror structure. The passed ip is just 333 * some directory in the overall HAMMER filesystem and has nothing to 334 * do with the PFS. In fact, there might not even be a root directory for 335 * the PFS yet! 336 */ 337 int 338 hammer_ioc_mirror_write(hammer_transaction_t trans, hammer_inode_t ip, 339 struct hammer_ioc_mirror_rw *mirror) 340 { 341 union hammer_ioc_mrecord_any mrec; 342 struct hammer_cursor cursor; 343 u_int32_t localization; 344 int checkspace_count = 0; 345 int error; 346 int bytes; 347 char *uptr; 348 int seq; 349 350 localization = (u_int32_t)mirror->pfs_id << 16; 351 seq = trans->hmp->flusher.act; 352 353 /* 354 * Validate the mirror structure and relocalize the tracking keys. 355 */ 356 if (mirror->size < 0 || mirror->size > 0x70000000) 357 return(EINVAL); 358 mirror->key_beg.localization &= HAMMER_LOCALIZE_MASK; 359 mirror->key_beg.localization += localization; 360 mirror->key_end.localization &= HAMMER_LOCALIZE_MASK; 361 mirror->key_end.localization += localization; 362 mirror->key_cur.localization &= HAMMER_LOCALIZE_MASK; 363 mirror->key_cur.localization += localization; 364 365 /* 366 * Set up our tracking cursor for the loop. The tracking cursor 367 * is used to delete records that are no longer present on the 368 * master. 
The last handled record at key_cur must be skipped. 369 */ 370 error = hammer_init_cursor(trans, &cursor, NULL, NULL); 371 372 cursor.key_beg = mirror->key_cur; 373 cursor.key_end = mirror->key_end; 374 cursor.flags |= HAMMER_CURSOR_BACKEND; 375 error = hammer_btree_first(&cursor); 376 if (error == 0) 377 cursor.flags |= HAMMER_CURSOR_ATEDISK; 378 if (error == ENOENT) 379 error = 0; 380 381 /* 382 * Loop until our input buffer has been exhausted. 383 */ 384 while (error == 0 && 385 mirror->count + sizeof(mrec.head) <= mirror->size) { 386 387 /* 388 * Don't blow out the buffer cache. Leave room for frontend 389 * cache as well. 390 * 391 * WARNING: See warnings in hammer_unlock_cursor() function. 392 */ 393 while (hammer_flusher_meta_halflimit(trans->hmp) || 394 hammer_flusher_undo_exhausted(trans, 2)) { 395 hammer_unlock_cursor(&cursor); 396 hammer_flusher_wait(trans->hmp, seq); 397 hammer_lock_cursor(&cursor); 398 seq = hammer_flusher_async_one(trans->hmp); 399 } 400 401 /* 402 * If there is insufficient free space it may be due to 403 * reserved bigblocks, which flushing might fix. 
404 */ 405 if (hammer_checkspace(trans->hmp, HAMMER_CHKSPC_MIRROR)) { 406 if (++checkspace_count == 10) { 407 error = ENOSPC; 408 break; 409 } 410 hammer_unlock_cursor(&cursor); 411 hammer_flusher_wait(trans->hmp, seq); 412 hammer_lock_cursor(&cursor); 413 seq = hammer_flusher_async(trans->hmp, NULL); 414 } 415 416 417 /* 418 * Acquire and validate header 419 */ 420 if ((bytes = mirror->size - mirror->count) > sizeof(mrec)) 421 bytes = sizeof(mrec); 422 uptr = (char *)mirror->ubuf + mirror->count; 423 error = copyin(uptr, &mrec, bytes); 424 if (error) 425 break; 426 if (mrec.head.signature != HAMMER_IOC_MIRROR_SIGNATURE) { 427 error = EINVAL; 428 break; 429 } 430 if (mrec.head.rec_size < sizeof(mrec.head) || 431 mrec.head.rec_size > sizeof(mrec) + HAMMER_XBUFSIZE || 432 mirror->count + mrec.head.rec_size > mirror->size) { 433 error = EINVAL; 434 break; 435 } 436 437 switch(mrec.head.type & HAMMER_MRECF_TYPE_MASK) { 438 case HAMMER_MREC_TYPE_SKIP: 439 if (mrec.head.rec_size != sizeof(mrec.skip)) 440 error = EINVAL; 441 if (error == 0) 442 error = hammer_ioc_mirror_write_skip(&cursor, &mrec.skip, mirror, localization); 443 break; 444 case HAMMER_MREC_TYPE_REC: 445 if (mrec.head.rec_size < sizeof(mrec.rec)) 446 error = EINVAL; 447 if (error == 0) 448 error = hammer_ioc_mirror_write_rec(&cursor, &mrec.rec, mirror, localization, uptr + sizeof(mrec.rec)); 449 break; 450 case HAMMER_MREC_TYPE_REC_NODATA: 451 case HAMMER_MREC_TYPE_REC_BADCRC: 452 /* 453 * Records with bad data payloads are ignored XXX. 454 * Records with no data payload have to be skipped 455 * (they shouldn't have been written in the first 456 * place). 
457 */ 458 if (mrec.head.rec_size < sizeof(mrec.rec)) 459 error = EINVAL; 460 break; 461 case HAMMER_MREC_TYPE_PASS: 462 if (mrec.head.rec_size != sizeof(mrec.rec)) 463 error = EINVAL; 464 if (error == 0) 465 error = hammer_ioc_mirror_write_pass(&cursor, &mrec.rec, mirror, localization); 466 break; 467 default: 468 error = EINVAL; 469 break; 470 } 471 472 /* 473 * Retry the current record on deadlock, otherwise setup 474 * for the next loop. 475 */ 476 if (error == EDEADLK) { 477 while (error == EDEADLK) { 478 hammer_sync_lock_sh(trans); 479 hammer_recover_cursor(&cursor); 480 error = hammer_cursor_upgrade(&cursor); 481 hammer_sync_unlock(trans); 482 } 483 } else { 484 if (error == EALREADY) 485 error = 0; 486 if (error == 0) { 487 mirror->count += 488 HAMMER_HEAD_DOALIGN(mrec.head.rec_size); 489 } 490 } 491 } 492 hammer_done_cursor(&cursor); 493 494 /* 495 * cumulative error 496 */ 497 if (error) { 498 mirror->head.flags |= HAMMER_IOC_HEAD_ERROR; 499 mirror->head.error = error; 500 } 501 502 /* 503 * ioctls don't update the RW data structure if an error is returned, 504 * always return 0. 505 */ 506 return(0); 507 } 508 509 /* 510 * Handle skip records. 511 * 512 * We must iterate from the last resolved record position at mirror->key_cur 513 * to skip_beg non-inclusive and delete any records encountered. 514 * 515 * mirror->key_cur must be carefully set when we succeed in processing 516 * this mrec. 517 */ 518 static int 519 hammer_ioc_mirror_write_skip(hammer_cursor_t cursor, 520 struct hammer_ioc_mrecord_skip *mrec, 521 struct hammer_ioc_mirror_rw *mirror, 522 u_int32_t localization) 523 { 524 int error; 525 526 /* 527 * Relocalize the skip range 528 */ 529 mrec->skip_beg.localization &= HAMMER_LOCALIZE_MASK; 530 mrec->skip_beg.localization += localization; 531 mrec->skip_end.localization &= HAMMER_LOCALIZE_MASK; 532 mrec->skip_end.localization += localization; 533 534 /* 535 * Iterate from current position to skip_beg, deleting any records 536 * we encounter. 
The record at skip_beg is not included (it is 537 * skipped). 538 */ 539 cursor->key_end = mrec->skip_beg; 540 cursor->flags &= ~HAMMER_CURSOR_END_INCLUSIVE; 541 cursor->flags |= HAMMER_CURSOR_BACKEND; 542 error = hammer_mirror_delete_to(cursor, mirror); 543 544 /* 545 * Now skip past the skip (which is the whole point point of 546 * having a skip record). The sender has not sent us any records 547 * for the skip area so we wouldn't know what to keep and what 548 * to delete anyway. 549 * 550 * Clear ATEDISK because skip_end is non-inclusive, so we can't 551 * count an exact match if we happened to get one. 552 */ 553 if (error == 0) { 554 mirror->key_cur = mrec->skip_end; 555 cursor->key_beg = mrec->skip_end; 556 error = hammer_btree_lookup(cursor); 557 cursor->flags &= ~HAMMER_CURSOR_ATEDISK; 558 if (error == ENOENT) 559 error = 0; 560 } 561 return(error); 562 } 563 564 /* 565 * Handle B-Tree records. 566 * 567 * We must iterate to mrec->base.key (non-inclusively), and then process 568 * the record. We are allowed to write a new record or delete an existing 569 * record, but cannot replace an existing record. 570 * 571 * mirror->key_cur must be carefully set when we succeed in processing 572 * this mrec. 573 */ 574 static int 575 hammer_ioc_mirror_write_rec(hammer_cursor_t cursor, 576 struct hammer_ioc_mrecord_rec *mrec, 577 struct hammer_ioc_mirror_rw *mirror, 578 u_int32_t localization, 579 char *uptr) 580 { 581 hammer_transaction_t trans; 582 u_int32_t rec_crc; 583 int error; 584 585 trans = cursor->trans; 586 rec_crc = crc32(mrec, sizeof(*mrec)); 587 588 if (mrec->leaf.data_len < 0 || 589 mrec->leaf.data_len > HAMMER_XBUFSIZE || 590 mrec->leaf.data_len + sizeof(*mrec) > mrec->head.rec_size) { 591 return(EINVAL); 592 } 593 594 /* 595 * Re-localize for target. relocalization of data is handled 596 * by hammer_mirror_write(). 
597 */ 598 mrec->leaf.base.localization &= HAMMER_LOCALIZE_MASK; 599 mrec->leaf.base.localization += localization; 600 601 /* 602 * Delete records through until we reach (non-inclusively) the 603 * target record. 604 */ 605 cursor->key_end = mrec->leaf.base; 606 cursor->flags &= ~HAMMER_CURSOR_END_INCLUSIVE; 607 cursor->flags |= HAMMER_CURSOR_BACKEND; 608 error = hammer_mirror_delete_to(cursor, mirror); 609 610 /* 611 * Certain records are not part of the mirroring operation 612 */ 613 if (error == 0 && hammer_mirror_nomirror(&mrec->leaf.base)) 614 return(0); 615 616 /* 617 * Locate the record. 618 * 619 * If the record exists only the delete_tid may be updated. 620 * 621 * If the record does not exist we can create it only if the 622 * create_tid is not too old. If the create_tid is too old 623 * it may have already been destroyed on the slave from pruning. 624 * 625 * Note that mirror operations are effectively as-of operations 626 * and delete_tid can be 0 for mirroring purposes even if it is 627 * not actually 0 at the originator. 628 * 629 * These functions can return EDEADLK 630 */ 631 if (error == 0) { 632 cursor->key_beg = mrec->leaf.base; 633 cursor->flags |= HAMMER_CURSOR_BACKEND; 634 cursor->flags &= ~HAMMER_CURSOR_INSERT; 635 error = hammer_btree_lookup(cursor); 636 } 637 638 if (error == 0 && hammer_mirror_check(cursor, mrec)) { 639 error = hammer_mirror_update(cursor, mrec); 640 } else if (error == ENOENT) { 641 if (mrec->leaf.base.create_tid >= mirror->tid_beg) { 642 error = hammer_create_at_cursor( 643 cursor, &mrec->leaf, 644 uptr, HAMMER_CREATE_MODE_UMIRROR); 645 } else { 646 error = 0; 647 } 648 } 649 if (error == 0 || error == EALREADY) 650 mirror->key_cur = mrec->leaf.base; 651 return(error); 652 } 653 654 /* 655 * This works like write_rec but no write or update is necessary, 656 * and no data payload is included so we couldn't do a write even 657 * if we wanted to. 
658 * 659 * We must still iterate for deletions, and we can validate the 660 * record header which is a good way to test for corrupted mirror 661 * targets XXX. 662 * 663 * mirror->key_cur must be carefully set when we succeed in processing 664 * this mrec. 665 */ 666 static 667 int 668 hammer_ioc_mirror_write_pass(hammer_cursor_t cursor, 669 struct hammer_ioc_mrecord_rec *mrec, 670 struct hammer_ioc_mirror_rw *mirror, 671 u_int32_t localization) 672 { 673 hammer_transaction_t trans; 674 u_int32_t rec_crc; 675 int error; 676 677 trans = cursor->trans; 678 rec_crc = crc32(mrec, sizeof(*mrec)); 679 680 /* 681 * Re-localize for target. Relocalization of data is handled 682 * by hammer_mirror_write(). 683 */ 684 mrec->leaf.base.localization &= HAMMER_LOCALIZE_MASK; 685 mrec->leaf.base.localization += localization; 686 687 /* 688 * Delete records through until we reach (non-inclusively) the 689 * target record. 690 */ 691 cursor->key_end = mrec->leaf.base; 692 cursor->flags &= ~HAMMER_CURSOR_END_INCLUSIVE; 693 cursor->flags |= HAMMER_CURSOR_BACKEND; 694 error = hammer_mirror_delete_to(cursor, mirror); 695 696 /* 697 * Certain records are not part of the mirroring operation 698 */ 699 if (hammer_mirror_nomirror(&mrec->leaf.base)) 700 return(0); 701 702 /* 703 * Locate the record and get past it by setting ATEDISK. Perform 704 * any necessary deletions. We have no data payload and cannot 705 * create a new record. 
706 */ 707 if (error == 0) { 708 mirror->key_cur = mrec->leaf.base; 709 cursor->key_beg = mrec->leaf.base; 710 cursor->flags |= HAMMER_CURSOR_BACKEND; 711 cursor->flags &= ~HAMMER_CURSOR_INSERT; 712 error = hammer_btree_lookup(cursor); 713 if (error == 0) { 714 if (hammer_mirror_check(cursor, mrec)) 715 error = hammer_mirror_update(cursor, mrec); 716 cursor->flags |= HAMMER_CURSOR_ATEDISK; 717 } else { 718 cursor->flags &= ~HAMMER_CURSOR_ATEDISK; 719 } 720 if (error == ENOENT) 721 error = 0; 722 } 723 return(error); 724 } 725 726 /* 727 * As part of the mirror write we iterate across swaths of records 728 * on the target which no longer exist on the source, and mark them 729 * deleted. 730 * 731 * The caller has indexed the cursor and set up key_end. We iterate 732 * through to key_end. 733 * 734 * There is an edge case where the master has deleted a record whos 735 * create_tid exactly matches our end_tid. We cannot delete this 736 * record on the slave yet because we cannot assign delete_tid == create_tid. 737 * The deletion should be picked up on the next sequence since in order 738 * to have been deleted on the master a transaction must have occured with 739 * a TID greater then the create_tid of the record. 740 * 741 * To support incremental re-mirroring, just for robustness, we do not 742 * touch any records created beyond (or equal to) mirror->tid_end. 
 */
static
int
hammer_mirror_delete_to(hammer_cursor_t cursor,
		       struct hammer_ioc_mirror_rw *mirror)
{
	hammer_btree_leaf_elm_t elm;
	int error;

	error = hammer_btree_iterate(cursor);
	while (error == 0) {
		elm = &cursor->node->ondisk->elms[cursor->index].leaf;
		KKASSERT(elm->base.btype == HAMMER_BTREE_TYPE_RECORD);
		/* do not revisit this element when we iterate again */
		cursor->flags |= HAMMER_CURSOR_ATEDISK;

		/*
		 * Certain records are not part of the mirroring operation
		 * (e.g. config-space records) and are left untouched.
		 */
		if (hammer_mirror_nomirror(&elm->base)) {
			error = hammer_btree_iterate(cursor);
			continue;
		}

		/*
		 * Only live records (delete_tid == 0) created strictly
		 * before tid_end are deleted; see the edge-case note in
		 * the function header about create_tid == tid_end.
		 *
		 * Note: Must still delete records with create_tid < tid_beg,
		 * as record may have been pruned-away on source.
		 */
		if (elm->base.delete_tid == 0 &&
		    elm->base.create_tid < mirror->tid_end) {
			error = hammer_delete_at_cursor(cursor,
							HAMMER_DELETE_ADJUST,
							mirror->tid_end,
							time_second,
							1, NULL);
		}
		if (error == 0)
			error = hammer_btree_iterate(cursor);
	}
	/* running off the end of the range is the normal termination */
	if (error == ENOENT)
		error = 0;
	return(error);
}

/*
 * Check whether an update is needed in the case where a match already
 * exists on the target.  The only type of update allowed in this case
 * is an update of the delete_tid.
 *
 * Return non-zero if the update should proceed.
 */
static
int
hammer_mirror_check(hammer_cursor_t cursor, struct hammer_ioc_mrecord_rec *mrec)
{
	hammer_btree_leaf_elm_t leaf = cursor->leaf;

	/*
	 * Update only when the delete_tids differ and the source side is
	 * actually deleted; delete_tid can only go from zero to non-zero.
	 */
	if (leaf->base.delete_tid != mrec->leaf.base.delete_tid) {
		if (mrec->leaf.base.delete_tid != 0)
			return(1);
	}
	return(0);
}

/*
 * Filter out records which are never mirrored, such as configuration space
 * records (for hammer cleanup).
 *
 * NOTE: We currently allow HAMMER_RECTYPE_SNAPSHOT records to be mirrored.
811 */ 812 static 813 int 814 hammer_mirror_nomirror(struct hammer_base_elm *base) 815 { 816 /* 817 * Certain types of records are never updated when mirroring. 818 * Slaves have their own configuration space. 819 */ 820 if (base->rec_type == HAMMER_RECTYPE_CONFIG) 821 return(1); 822 return(0); 823 } 824 825 826 /* 827 * Update a record in-place. Only the delete_tid can change, and 828 * only from zero to non-zero. 829 */ 830 static 831 int 832 hammer_mirror_update(hammer_cursor_t cursor, 833 struct hammer_ioc_mrecord_rec *mrec) 834 { 835 int error; 836 837 /* 838 * This case shouldn't occur. 839 */ 840 if (mrec->leaf.base.delete_tid == 0) 841 return(0); 842 843 /* 844 * Mark the record deleted on the mirror target. 845 */ 846 error = hammer_delete_at_cursor(cursor, HAMMER_DELETE_ADJUST, 847 mrec->leaf.base.delete_tid, 848 mrec->leaf.delete_ts, 849 1, NULL); 850 cursor->flags |= HAMMER_CURSOR_ATEDISK; 851 return(error); 852 } 853 854 #if 0 855 /* 856 * MOVED TO HAMMER_OBJECT.C: hammer_create_at_cursor() 857 */ 858 859 static int hammer_mirror_localize_data(hammer_data_ondisk_t data, 860 hammer_btree_leaf_elm_t leaf); 861 862 /* 863 * Write out a new record. 
864 */ 865 static 866 int 867 hammer_mirror_write(hammer_cursor_t cursor, 868 struct hammer_ioc_mrecord_rec *mrec, 869 char *udata) 870 { 871 hammer_transaction_t trans; 872 hammer_buffer_t data_buffer; 873 hammer_off_t ndata_offset; 874 hammer_tid_t high_tid; 875 void *ndata; 876 int error; 877 int doprop; 878 879 trans = cursor->trans; 880 data_buffer = NULL; 881 882 /* 883 * Get the sync lock so the whole mess is atomic 884 */ 885 hammer_sync_lock_sh(trans); 886 887 /* 888 * Allocate and adjust data 889 */ 890 if (mrec->leaf.data_len && mrec->leaf.data_offset) { 891 ndata = hammer_alloc_data(trans, mrec->leaf.data_len, 892 mrec->leaf.base.rec_type, 893 &ndata_offset, &data_buffer, 894 0, &error); 895 if (ndata == NULL) 896 return(error); 897 mrec->leaf.data_offset = ndata_offset; 898 hammer_modify_buffer(trans, data_buffer, NULL, 0); 899 error = copyin(udata, ndata, mrec->leaf.data_len); 900 if (error == 0) { 901 if (hammer_crc_test_leaf(ndata, &mrec->leaf) == 0) { 902 kprintf("data crc mismatch on pipe\n"); 903 error = EINVAL; 904 } else { 905 error = hammer_mirror_localize_data( 906 ndata, &mrec->leaf); 907 } 908 } 909 hammer_modify_buffer_done(data_buffer); 910 } else { 911 mrec->leaf.data_offset = 0; 912 error = 0; 913 ndata = NULL; 914 } 915 if (error) 916 goto failed; 917 918 /* 919 * Do the insertion. This can fail with a EDEADLK or EALREADY 920 */ 921 cursor->flags |= HAMMER_CURSOR_INSERT; 922 error = hammer_btree_lookup(cursor); 923 if (error != ENOENT) { 924 if (error == 0) 925 error = EALREADY; 926 goto failed; 927 } 928 929 error = hammer_btree_insert(cursor, &mrec->leaf, &doprop); 930 931 /* 932 * Cursor is left on the current element, we want to skip it now. 933 */ 934 cursor->flags |= HAMMER_CURSOR_ATEDISK; 935 cursor->flags &= ~HAMMER_CURSOR_INSERT; 936 937 /* 938 * Track a count of active inodes. 
939 */ 940 if (error == 0 && 941 mrec->leaf.base.rec_type == HAMMER_RECTYPE_INODE && 942 mrec->leaf.base.delete_tid == 0) { 943 hammer_modify_volume_field(trans, 944 trans->rootvol, 945 vol0_stat_inodes); 946 ++trans->hmp->rootvol->ondisk->vol0_stat_inodes; 947 hammer_modify_volume_done(trans->rootvol); 948 } 949 950 /* 951 * vol0_next_tid must track the highest TID stored in the filesystem. 952 * We do not need to generate undo for this update. 953 */ 954 high_tid = mrec->leaf.base.create_tid; 955 if (high_tid < mrec->leaf.base.delete_tid) 956 high_tid = mrec->leaf.base.delete_tid; 957 if (trans->rootvol->ondisk->vol0_next_tid < high_tid) { 958 hammer_modify_volume(trans, trans->rootvol, NULL, 0); 959 trans->rootvol->ondisk->vol0_next_tid = high_tid; 960 hammer_modify_volume_done(trans->rootvol); 961 } 962 963 /* 964 * WARNING! cursor's leaf pointer may have changed after 965 * do_propagation returns. 966 */ 967 if (error == 0 && doprop) 968 hammer_btree_do_propagation(cursor, NULL, &mrec->leaf); 969 970 failed: 971 /* 972 * Cleanup 973 */ 974 if (error && mrec->leaf.data_offset) { 975 hammer_blockmap_free(cursor->trans, 976 mrec->leaf.data_offset, 977 mrec->leaf.data_len); 978 } 979 hammer_sync_unlock(trans); 980 if (data_buffer) 981 hammer_rel_buffer(data_buffer, 0); 982 return(error); 983 } 984 985 /* 986 * Localize the data payload. Directory entries may need their 987 * localization adjusted. 988 */ 989 static 990 int 991 hammer_mirror_localize_data(hammer_data_ondisk_t data, 992 hammer_btree_leaf_elm_t leaf) 993 { 994 u_int32_t localization; 995 996 if (leaf->base.rec_type == HAMMER_RECTYPE_DIRENTRY) { 997 localization = leaf->base.localization & 998 HAMMER_LOCALIZE_PSEUDOFS_MASK; 999 if (data->entry.localization != localization) { 1000 data->entry.localization = localization; 1001 hammer_crc_set_leaf(data, leaf); 1002 } 1003 } 1004 return(0); 1005 } 1006 1007 #endif 1008