1 /* 2 * Copyright (c) 2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 /* 35 * HAMMER mirroring ioctls - serialize and deserialize modifications made 36 * to a filesystem. 
37 */ 38 39 #include "hammer.h" 40 41 static int hammer_mirror_check(hammer_cursor_t cursor, 42 struct hammer_ioc_mrecord_rec *mrec); 43 static int hammer_mirror_update(hammer_cursor_t cursor, 44 struct hammer_ioc_mrecord_rec *mrec); 45 static int hammer_ioc_mirror_write_rec(hammer_cursor_t cursor, 46 struct hammer_ioc_mrecord_rec *mrec, 47 struct hammer_ioc_mirror_rw *mirror, 48 u_int32_t localization, 49 char *uptr); 50 static int hammer_ioc_mirror_write_pass(hammer_cursor_t cursor, 51 struct hammer_ioc_mrecord_rec *mrec, 52 struct hammer_ioc_mirror_rw *mirror, 53 u_int32_t localization); 54 static int hammer_ioc_mirror_write_skip(hammer_cursor_t cursor, 55 struct hammer_ioc_mrecord_skip *mrec, 56 struct hammer_ioc_mirror_rw *mirror, 57 u_int32_t localization); 58 static int hammer_mirror_delete_to(hammer_cursor_t cursor, 59 struct hammer_ioc_mirror_rw *mirror); 60 static int hammer_mirror_nomirror(struct hammer_base_elm *base); 61 62 /* 63 * All B-Tree records within the specified key range which also conform 64 * to the transaction id range are returned. Mirroring code keeps track 65 * of the last transaction id fully scanned and can efficiently pick up 66 * where it left off if interrupted. 67 * 68 * The PFS is identified in the mirror structure. The passed ip is just 69 * some directory in the overall HAMMER filesystem and has nothing to 70 * do with the PFS. 
 */
int
hammer_ioc_mirror_read(hammer_transaction_t trans, hammer_inode_t ip,
		       struct hammer_ioc_mirror_rw *mirror)
{
	struct hammer_cmirror cmirror;
	struct hammer_cursor cursor;
	union hammer_ioc_mrecord_any mrec;
	hammer_btree_leaf_elm_t elm;
	const int crc_start = HAMMER_MREC_CRCOFF;
	char *uptr;
	int error;
	int data_len;
	int bytes;
	int eatdisk;
	int mrec_flags;
	u_int32_t localization;
	u_int32_t rec_crc;

	/*
	 * The target PFS occupies the upper 16 bits of the localization
	 * field.
	 */
	localization = (u_int32_t)mirror->pfs_id << 16;

	/*
	 * Sanity check the key range supplied by userland.  The caller
	 * may not specify a PFS in the keys (we fold in pfs_id ourselves).
	 */
	if ((mirror->key_beg.localization | mirror->key_end.localization) &
	    HAMMER_LOCALIZE_PSEUDOFS_MASK) {
		return(EINVAL);
	}
	if (hammer_btree_cmp(&mirror->key_beg, &mirror->key_end) > 0)
		return(EINVAL);

	mirror->key_cur = mirror->key_beg;
	mirror->key_cur.localization &= HAMMER_LOCALIZE_MASK;
	mirror->key_cur.localization += localization;
	bzero(&mrec, sizeof(mrec));
	bzero(&cmirror, sizeof(cmirror));

	/*
	 * Make CRC errors non-fatal (at least on data), causing an EDOM
	 * error instead of EIO.
	 */
	trans->flags |= HAMMER_TRANSF_CRCDOM;

retry:
	error = hammer_init_cursor(trans, &cursor, NULL, NULL);
	if (error) {
		hammer_done_cursor(&cursor);
		goto failed;
	}
	cursor.key_beg = mirror->key_cur;
	cursor.key_end = mirror->key_end;
	cursor.key_end.localization &= HAMMER_LOCALIZE_MASK;
	cursor.key_end.localization += localization;

	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
	cursor.flags |= HAMMER_CURSOR_BACKEND;

	/*
	 * This flag filters the search to only return elements whose create
	 * or delete TID is >= mirror_tid.  The B-Tree uses the mirror_tid
	 * field stored with internal and leaf nodes to shortcut the scan.
	 */
	cursor.flags |= HAMMER_CURSOR_MIRROR_FILTERED;
	cursor.cmirror = &cmirror;
	cmirror.mirror_tid = mirror->tid_beg;

	error = hammer_btree_first(&cursor);
	while (error == 0) {
		/*
		 * Yield to more important tasks
		 */
		if (error == 0) {
			error = hammer_signal_check(trans->hmp);
			if (error)
				break;
		}

		/*
		 * An internal node can be returned in mirror-filtered
		 * mode and indicates that the scan is returning a skip
		 * range in the cursor->cmirror structure.
		 */
		uptr = (char *)mirror->ubuf + mirror->count;
		if (cursor.node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
			/*
			 * Check space
			 */
			mirror->key_cur = cmirror.skip_beg;
			bytes = sizeof(mrec.skip);
			if (mirror->count + HAMMER_HEAD_DOALIGN(bytes) >
			    mirror->size) {
				break;
			}

			/*
			 * Fill mrec
			 */
			mrec.head.signature = HAMMER_IOC_MIRROR_SIGNATURE;
			mrec.head.type = HAMMER_MREC_TYPE_SKIP;
			mrec.head.rec_size = bytes;
			mrec.skip.skip_beg = cmirror.skip_beg;
			mrec.skip.skip_end = cmirror.skip_end;
			mrec.head.rec_crc = crc32(&mrec.head.rec_size,
						  bytes - crc_start);
			error = copyout(&mrec, uptr, bytes);
			eatdisk = 0;
			goto didwrite;
		}

		/*
		 * Leaf node.  In full-history mode we could filter out
		 * elements modified outside the user-requested TID range.
		 *
		 * However, such elements must be returned so the writer
		 * can compare them against the target to determine what
		 * needs to be deleted on the target, particularly for
		 * no-history mirrors.
		 */
		KKASSERT(cursor.node->ondisk->type == HAMMER_BTREE_TYPE_LEAF);
		elm = &cursor.node->ondisk->elms[cursor.index].leaf;
		mirror->key_cur = elm->base;

		/*
		 * If the record was created after our end point we just
		 * ignore it.
		 */
		if (elm->base.create_tid > mirror->tid_end) {
			error = 0;
			bytes = 0;
			eatdisk = 1;
			goto didwrite;
		}

		/*
		 * Determine if we should generate a PASS or a REC.  PASS
		 * records are records without any data payload.  Such
		 * records will be generated if the target is already expected
		 * to have the record, allowing it to delete the gaps.
		 *
		 * A PASS record is also used to perform deletions on the
		 * target.
		 *
		 * Such deletions are needed if the master or files on the
		 * master are no-history, or if the slave is so far behind
		 * the master has already been pruned.
		 */
		if (elm->base.create_tid < mirror->tid_beg) {
			bytes = sizeof(mrec.rec);
			if (mirror->count + HAMMER_HEAD_DOALIGN(bytes) >
			    mirror->size) {
				break;
			}

			/*
			 * Fill mrec.
			 */
			mrec.head.signature = HAMMER_IOC_MIRROR_SIGNATURE;
			mrec.head.type = HAMMER_MREC_TYPE_PASS;
			mrec.head.rec_size = bytes;
			mrec.rec.leaf = *elm;
			mrec.head.rec_crc = crc32(&mrec.head.rec_size,
						  bytes - crc_start);
			error = copyout(&mrec, uptr, bytes);
			eatdisk = 1;
			goto didwrite;

		}

		/*
		 * The core code exports the data to userland.
		 *
		 * CRC errors on data are reported but passed through,
		 * but the data must be washed by the user program.
		 *
		 * If userland just wants the btree records it can
		 * request that bulk data not be returned.  This is
		 * used during mirror-stream histogram generation.
		 */
		mrec_flags = 0;
		data_len = (elm->data_offset) ? elm->data_len : 0;
		if (data_len &&
		    (mirror->head.flags & HAMMER_IOC_MIRROR_NODATA)) {
			data_len = 0;
			mrec_flags |= HAMMER_MRECF_NODATA;
		}
		if (data_len) {
			error = hammer_btree_extract(&cursor,
						     HAMMER_CURSOR_GET_DATA);
			if (error) {
				if (error != EDOM)
					break;
				/* EDOM == data CRC failure, pass through */
				mrec_flags |= HAMMER_MRECF_CRC_ERROR |
					      HAMMER_MRECF_DATA_CRC_BAD;
			}
		}

		bytes = sizeof(mrec.rec) + data_len;
		if (mirror->count + HAMMER_HEAD_DOALIGN(bytes) > mirror->size)
			break;

		/*
		 * Construct the record for userland and copyout.
		 *
		 * The user is asking for a snapshot, if the record was
		 * deleted beyond the user-requested ending tid, the record
		 * is not considered deleted from the point of view of
		 * userland and delete_tid is cleared.
		 */
		mrec.head.signature = HAMMER_IOC_MIRROR_SIGNATURE;
		mrec.head.type = HAMMER_MREC_TYPE_REC | mrec_flags;
		mrec.head.rec_size = bytes;
		mrec.rec.leaf = *elm;

		if (elm->base.delete_tid > mirror->tid_end)
			mrec.rec.leaf.base.delete_tid = 0;
		rec_crc = crc32(&mrec.head.rec_size,
				sizeof(mrec.rec) - crc_start);
		if (data_len)
			rec_crc = crc32_ext(cursor.data, data_len, rec_crc);
		mrec.head.rec_crc = rec_crc;
		error = copyout(&mrec, uptr, sizeof(mrec.rec));
		if (data_len && error == 0) {
			error = copyout(cursor.data, uptr + sizeof(mrec.rec),
					data_len);
		}
		eatdisk = 1;

		/*
		 * eatdisk controls whether we skip the current cursor
		 * position on the next scan or not.  If doing a SKIP
		 * the cursor is already positioned properly for the next
		 * scan and eatdisk will be 0.
		 */
didwrite:
		if (error == 0) {
			mirror->count += HAMMER_HEAD_DOALIGN(bytes);
			if (eatdisk)
				cursor.flags |= HAMMER_CURSOR_ATEDISK;
			else
				cursor.flags &= ~HAMMER_CURSOR_ATEDISK;
			error = hammer_btree_iterate(&cursor);
		}
	}
	/*
	 * ENOENT from the iteration simply means we scanned to the end
	 * of the requested key range.
	 */
	if (error == ENOENT) {
		mirror->key_cur = mirror->key_end;
		error = 0;
	}
	hammer_done_cursor(&cursor);
	if (error == EDEADLK)
		goto retry;
	if (error == EINTR) {
		mirror->head.flags |= HAMMER_IOC_HEAD_INTR;
		error = 0;
	}
failed:
	/* strip the PFS id back out of the returned resume key */
	mirror->key_cur.localization &= HAMMER_LOCALIZE_MASK;
	return(error);
}

/*
 * Copy records from userland to the target mirror.
 *
 * The PFS is identified in the mirror structure.  The passed ip is just
 * some directory in the overall HAMMER filesystem and has nothing to
 * do with the PFS.  In fact, there might not even be a root directory for
 * the PFS yet!
 */
int
hammer_ioc_mirror_write(hammer_transaction_t trans, hammer_inode_t ip,
		       struct hammer_ioc_mirror_rw *mirror)
{
	union hammer_ioc_mrecord_any mrec;
	struct hammer_cursor cursor;
	u_int32_t localization;
	int checkspace_count = 0;
	int error;
	int bytes;
	char *uptr;
	int seq;

	localization = (u_int32_t)mirror->pfs_id << 16;
	seq = trans->hmp->flusher.done;

	/*
	 * Validate the mirror structure and relocalize the tracking keys.
	 */
	if (mirror->size < 0 || mirror->size > 0x70000000)
		return(EINVAL);
	mirror->key_beg.localization &= HAMMER_LOCALIZE_MASK;
	mirror->key_beg.localization += localization;
	mirror->key_end.localization &= HAMMER_LOCALIZE_MASK;
	mirror->key_end.localization += localization;
	mirror->key_cur.localization &= HAMMER_LOCALIZE_MASK;
	mirror->key_cur.localization += localization;

	/*
	 * Set up our tracking cursor for the loop.  The tracking cursor
	 * is used to delete records that are no longer present on the
	 * master.  The last handled record at key_cur must be skipped.
	 */
	error = hammer_init_cursor(trans, &cursor, NULL, NULL);

	cursor.key_beg = mirror->key_cur;
	cursor.key_end = mirror->key_end;
	cursor.flags |= HAMMER_CURSOR_BACKEND;
	error = hammer_btree_first(&cursor);
	if (error == 0)
		cursor.flags |= HAMMER_CURSOR_ATEDISK;
	if (error == ENOENT)
		error = 0;

	/*
	 * Loop until our input buffer has been exhausted.
	 */
	while (error == 0 &&
	       mirror->count + sizeof(mrec.head) <= mirror->size) {

		/*
		 * Don't blow out the buffer cache.  Leave room for frontend
		 * cache as well.
		 *
		 * WARNING: See warnings in hammer_unlock_cursor() function.
		 */
		while (hammer_flusher_meta_halflimit(trans->hmp) ||
		       hammer_flusher_undo_exhausted(trans, 2)) {
			hammer_unlock_cursor(&cursor);
			hammer_flusher_wait(trans->hmp, seq);
			hammer_lock_cursor(&cursor);
			seq = hammer_flusher_async_one(trans->hmp);
		}

		/*
		 * If there is insufficient free space it may be due to
		 * reserved bigblocks, which flushing might fix.
		 */
		if (hammer_checkspace(trans->hmp, HAMMER_CHKSPC_MIRROR)) {
			if (++checkspace_count == 10) {
				error = ENOSPC;
				break;
			}
			hammer_unlock_cursor(&cursor);
			hammer_flusher_wait(trans->hmp, seq);
			hammer_lock_cursor(&cursor);
			seq = hammer_flusher_async(trans->hmp, NULL);
		}


		/*
		 * Acquire and validate header
		 */
		if ((bytes = mirror->size - mirror->count) > sizeof(mrec))
			bytes = sizeof(mrec);
		uptr = (char *)mirror->ubuf + mirror->count;
		error = copyin(uptr, &mrec, bytes);
		if (error)
			break;
		if (mrec.head.signature != HAMMER_IOC_MIRROR_SIGNATURE) {
			error = EINVAL;
			break;
		}
		if (mrec.head.rec_size < sizeof(mrec.head) ||
		    mrec.head.rec_size > sizeof(mrec) + HAMMER_XBUFSIZE ||
		    mirror->count + mrec.head.rec_size > mirror->size) {
			error = EINVAL;
			break;
		}

		switch(mrec.head.type & HAMMER_MRECF_TYPE_MASK) {
		case HAMMER_MREC_TYPE_SKIP:
			if (mrec.head.rec_size != sizeof(mrec.skip))
				error = EINVAL;
			if (error == 0)
				error = hammer_ioc_mirror_write_skip(&cursor,
						&mrec.skip, mirror,
						localization);
			break;
		case HAMMER_MREC_TYPE_REC:
			if (mrec.head.rec_size < sizeof(mrec.rec))
				error = EINVAL;
			if (error == 0)
				error = hammer_ioc_mirror_write_rec(&cursor,
						&mrec.rec, mirror,
						localization,
						uptr + sizeof(mrec.rec));
			break;
		case HAMMER_MREC_TYPE_REC_NODATA:
		case HAMMER_MREC_TYPE_REC_BADCRC:
			/*
			 * Records with bad data payloads are ignored XXX.
			 * Records with no data payload have to be skipped
			 * (they shouldn't have been written in the first
			 * place).
			 */
			if (mrec.head.rec_size < sizeof(mrec.rec))
				error = EINVAL;
			break;
		case HAMMER_MREC_TYPE_PASS:
			if (mrec.head.rec_size != sizeof(mrec.rec))
				error = EINVAL;
			if (error == 0)
				error = hammer_ioc_mirror_write_pass(&cursor,
						&mrec.rec, mirror,
						localization);
			break;
		default:
			error = EINVAL;
			break;
		}

		/*
		 * Retry the current record on deadlock, otherwise setup
		 * for the next loop.
		 */
		if (error == EDEADLK) {
			while (error == EDEADLK) {
				hammer_sync_lock_sh(trans);
				hammer_recover_cursor(&cursor);
				error = hammer_cursor_upgrade(&cursor);
				hammer_sync_unlock(trans);
			}
		} else {
			if (error == EALREADY)
				error = 0;
			if (error == 0) {
				mirror->count +=
					HAMMER_HEAD_DOALIGN(mrec.head.rec_size);
			}
		}
	}
	hammer_done_cursor(&cursor);

	/*
	 * cumulative error
	 */
	if (error) {
		mirror->head.flags |= HAMMER_IOC_HEAD_ERROR;
		mirror->head.error = error;
	}

	/*
	 * ioctls don't update the RW data structure if an error is returned,
	 * always return 0.
	 */
	return(0);
}

/*
 * Handle skip records.
 *
 * We must iterate from the last resolved record position at mirror->key_cur
 * to skip_beg non-inclusive and delete any records encountered.
 *
 * mirror->key_cur must be carefully set when we succeed in processing
 * this mrec.
 */
static int
hammer_ioc_mirror_write_skip(hammer_cursor_t cursor,
			     struct hammer_ioc_mrecord_skip *mrec,
			     struct hammer_ioc_mirror_rw *mirror,
			     u_int32_t localization)
{
	int error;

	/*
	 * Relocalize the skip range
	 */
	mrec->skip_beg.localization &= HAMMER_LOCALIZE_MASK;
	mrec->skip_beg.localization += localization;
	mrec->skip_end.localization &= HAMMER_LOCALIZE_MASK;
	mrec->skip_end.localization += localization;

	/*
	 * Iterate from current position to skip_beg, deleting any records
	 * we encounter.  The record at skip_beg is not included (it is
	 * skipped).
	 */
	cursor->key_end = mrec->skip_beg;
	cursor->flags &= ~HAMMER_CURSOR_END_INCLUSIVE;
	cursor->flags |= HAMMER_CURSOR_BACKEND;
	error = hammer_mirror_delete_to(cursor, mirror);

	/*
	 * Now skip past the skip (which is the whole point of
	 * having a skip record).  The sender has not sent us any records
	 * for the skip area so we wouldn't know what to keep and what
	 * to delete anyway.
	 *
	 * Clear ATEDISK because skip_end is non-inclusive, so we can't
	 * count an exact match if we happened to get one.
	 */
	if (error == 0) {
		mirror->key_cur = mrec->skip_end;
		cursor->key_beg = mrec->skip_end;
		error = hammer_btree_lookup(cursor);
		cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
		if (error == ENOENT)
			error = 0;
	}
	return(error);
}

/*
 * Handle B-Tree records.
 *
 * We must iterate to mrec->base.key (non-inclusively), and then process
 * the record.  We are allowed to write a new record or delete an existing
 * record, but cannot replace an existing record.
 *
 * mirror->key_cur must be carefully set when we succeed in processing
 * this mrec.
 */
static int
hammer_ioc_mirror_write_rec(hammer_cursor_t cursor,
			    struct hammer_ioc_mrecord_rec *mrec,
			    struct hammer_ioc_mirror_rw *mirror,
			    u_int32_t localization,
			    char *uptr)
{
	int error;

	/*
	 * Validate the record header against the record size supplied
	 * in the stream, which was already range-checked by the caller.
	 */
	if (mrec->leaf.data_len < 0 ||
	    mrec->leaf.data_len > HAMMER_XBUFSIZE ||
	    mrec->leaf.data_len + sizeof(*mrec) > mrec->head.rec_size) {
		return(EINVAL);
	}

	/*
	 * Re-localize for target.  relocalization of data is handled
	 * by hammer_mirror_write().
	 */
	mrec->leaf.base.localization &= HAMMER_LOCALIZE_MASK;
	mrec->leaf.base.localization += localization;

	/*
	 * Delete records through until we reach (non-inclusively) the
	 * target record.
	 */
	cursor->key_end = mrec->leaf.base;
	cursor->flags &= ~HAMMER_CURSOR_END_INCLUSIVE;
	cursor->flags |= HAMMER_CURSOR_BACKEND;
	error = hammer_mirror_delete_to(cursor, mirror);

	/*
	 * Certain records are not part of the mirroring operation
	 */
	if (error == 0 && hammer_mirror_nomirror(&mrec->leaf.base))
		return(0);

	/*
	 * Locate the record.
	 *
	 * If the record exists only the delete_tid may be updated.
	 *
	 * If the record does not exist we can create it only if the
	 * create_tid is not too old.  If the create_tid is too old
	 * it may have already been destroyed on the slave from pruning.
	 *
	 * Note that mirror operations are effectively as-of operations
	 * and delete_tid can be 0 for mirroring purposes even if it is
	 * not actually 0 at the originator.
	 *
	 * These functions can return EDEADLK
	 */
	if (error == 0) {
		cursor->key_beg = mrec->leaf.base;
		cursor->flags |= HAMMER_CURSOR_BACKEND;
		cursor->flags &= ~HAMMER_CURSOR_INSERT;
		error = hammer_btree_lookup(cursor);
	}

	if (error == 0 && hammer_mirror_check(cursor, mrec)) {
		error = hammer_mirror_update(cursor, mrec);
	} else if (error == ENOENT) {
		if (mrec->leaf.base.create_tid >= mirror->tid_beg) {
			error = hammer_create_at_cursor(
					cursor, &mrec->leaf,
					uptr, HAMMER_CREATE_MODE_UMIRROR);
		} else {
			error = 0;
		}
	}
	if (error == 0 || error == EALREADY)
		mirror->key_cur = mrec->leaf.base;
	return(error);
}

/*
 * This works like write_rec but no write or update is necessary,
 * and no data payload is included so we couldn't do a write even
 * if we wanted to.
 *
 * We must still iterate for deletions, and we can validate the
 * record header which is a good way to test for corrupted mirror
 * targets XXX.
 *
 * mirror->key_cur must be carefully set when we succeed in processing
 * this mrec.
 */
static
int
hammer_ioc_mirror_write_pass(hammer_cursor_t cursor,
			     struct hammer_ioc_mrecord_rec *mrec,
			     struct hammer_ioc_mirror_rw *mirror,
			     u_int32_t localization)
{
	int error;

	/*
	 * Re-localize for target.  Relocalization of data is handled
	 * by hammer_mirror_write().
	 */
	mrec->leaf.base.localization &= HAMMER_LOCALIZE_MASK;
	mrec->leaf.base.localization += localization;

	/*
	 * Delete records through until we reach (non-inclusively) the
	 * target record.
	 */
	cursor->key_end = mrec->leaf.base;
	cursor->flags &= ~HAMMER_CURSOR_END_INCLUSIVE;
	cursor->flags |= HAMMER_CURSOR_BACKEND;
	error = hammer_mirror_delete_to(cursor, mirror);

	/*
	 * Certain records are not part of the mirroring operation
	 *
	 * NOTE(review): unlike hammer_ioc_mirror_write_rec() this check
	 * does not test (error == 0) first, so a delete_to error is
	 * discarded for nomirror records here — confirm intentional.
	 */
	if (hammer_mirror_nomirror(&mrec->leaf.base))
		return(0);

	/*
	 * Locate the record and get past it by setting ATEDISK.  Perform
	 * any necessary deletions.  We have no data payload and cannot
	 * create a new record.
	 */
	if (error == 0) {
		mirror->key_cur = mrec->leaf.base;
		cursor->key_beg = mrec->leaf.base;
		cursor->flags |= HAMMER_CURSOR_BACKEND;
		cursor->flags &= ~HAMMER_CURSOR_INSERT;
		error = hammer_btree_lookup(cursor);
		if (error == 0) {
			if (hammer_mirror_check(cursor, mrec))
				error = hammer_mirror_update(cursor, mrec);
			cursor->flags |= HAMMER_CURSOR_ATEDISK;
		} else {
			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
		}
		if (error == ENOENT)
			error = 0;
	}
	return(error);
}

/*
 * As part of the mirror write we iterate across swaths of records
 * on the target which no longer exist on the source, and mark them
 * deleted.
 *
 * The caller has indexed the cursor and set up key_end.  We iterate
 * through to key_end.
 *
 * There is an edge case where the master has deleted a record whose
 * create_tid exactly matches our end_tid.  We cannot delete this
 * record on the slave yet because we cannot assign delete_tid == create_tid.
 * The deletion should be picked up on the next sequence since in order
 * to have been deleted on the master a transaction must have occurred with
 * a TID greater than the create_tid of the record.
 *
 * To support incremental re-mirroring, just for robustness, we do not
 * touch any records created beyond (or equal to) mirror->tid_end.
 */
static
int
hammer_mirror_delete_to(hammer_cursor_t cursor,
		       struct hammer_ioc_mirror_rw *mirror)
{
	hammer_btree_leaf_elm_t elm;
	int error;

	error = hammer_btree_iterate(cursor);
	while (error == 0) {
		elm = &cursor->node->ondisk->elms[cursor->index].leaf;
		KKASSERT(elm->base.btype == HAMMER_BTREE_TYPE_RECORD);
		cursor->flags |= HAMMER_CURSOR_ATEDISK;

		/*
		 * Certain records are not part of the mirroring operation
		 */
		if (hammer_mirror_nomirror(&elm->base)) {
			error = hammer_btree_iterate(cursor);
			continue;
		}

		/*
		 * Note: Must still delete records with create_tid < tid_beg,
		 *	 as record may have been pruned-away on source.
		 */
		if (elm->base.delete_tid == 0 &&
		    elm->base.create_tid < mirror->tid_end) {
			error = hammer_delete_at_cursor(cursor,
							HAMMER_DELETE_ADJUST,
							mirror->tid_end,
							time_second,
							1, NULL);
		}
		if (error == 0)
			error = hammer_btree_iterate(cursor);
	}
	/* ENOENT just means we hit the end of the delete-to range */
	if (error == ENOENT)
		error = 0;
	return(error);
}

/*
 * Check whether an update is needed in the case where a match already
 * exists on the target.  The only type of update allowed in this case
 * is an update of the delete_tid.
 *
 * Return non-zero if the update should proceed.
 */
static
int
hammer_mirror_check(hammer_cursor_t cursor, struct hammer_ioc_mrecord_rec *mrec)
{
	hammer_btree_leaf_elm_t leaf = cursor->leaf;

	/*
	 * Only a transition of delete_tid from 0 to non-zero is
	 * propagated; a differing but zero source delete_tid is ignored.
	 */
	if (leaf->base.delete_tid != mrec->leaf.base.delete_tid) {
		if (mrec->leaf.base.delete_tid != 0)
			return(1);
	}
	return(0);
}

/*
 * Filter out records which are never mirrored, such as configuration space
 * records (for hammer cleanup).
 *
 * NOTE: We currently allow HAMMER_RECTYPE_SNAPSHOT records to be mirrored.
 */
static
int
hammer_mirror_nomirror(struct hammer_base_elm *base)
{
	/*
	 * Certain types of records are never updated when mirroring.
	 * Slaves have their own configuration space.
	 */
	if (base->rec_type == HAMMER_RECTYPE_CONFIG)
		return(1);
	return(0);
}


/*
 * Update a record in-place.  Only the delete_tid can change, and
 * only from zero to non-zero.
 */
static
int
hammer_mirror_update(hammer_cursor_t cursor,
		     struct hammer_ioc_mrecord_rec *mrec)
{
	int error;

	/*
	 * This case shouldn't occur.
	 */
	if (mrec->leaf.base.delete_tid == 0)
		return(0);

	/*
	 * Mark the record deleted on the mirror target.
	 */
	error = hammer_delete_at_cursor(cursor, HAMMER_DELETE_ADJUST,
					mrec->leaf.base.delete_tid,
					mrec->leaf.delete_ts,
					1, NULL);
	cursor->flags |= HAMMER_CURSOR_ATEDISK;
	return(error);
}

#if 0
/*
 * MOVED TO HAMMER_OBJECT.C: hammer_create_at_cursor()
 *
 * (Dead code retained below for reference only; compiled out.)
 */

static int hammer_mirror_localize_data(hammer_data_ondisk_t data,
				hammer_btree_leaf_elm_t leaf);

/*
 * Write out a new record.
 */
static
int
hammer_mirror_write(hammer_cursor_t cursor,
		    struct hammer_ioc_mrecord_rec *mrec,
		    char *udata)
{
	hammer_transaction_t trans;
	hammer_buffer_t data_buffer;
	hammer_off_t ndata_offset;
	hammer_tid_t high_tid;
	void *ndata;
	int error;
	int doprop;

	trans = cursor->trans;
	data_buffer = NULL;

	/*
	 * Get the sync lock so the whole mess is atomic
	 */
	hammer_sync_lock_sh(trans);

	/*
	 * Allocate and adjust data
	 */
	if (mrec->leaf.data_len && mrec->leaf.data_offset) {
		ndata = hammer_alloc_data(trans, mrec->leaf.data_len,
					  mrec->leaf.base.rec_type,
					  &ndata_offset, &data_buffer,
					  0, &error);
		if (ndata == NULL)
			return(error);
		mrec->leaf.data_offset = ndata_offset;
		hammer_modify_buffer(trans, data_buffer, NULL, 0);
		error = copyin(udata, ndata, mrec->leaf.data_len);
		if (error == 0) {
			if (hammer_crc_test_leaf(ndata, &mrec->leaf) == 0) {
				kprintf("data crc mismatch on pipe\n");
				error = EINVAL;
			} else {
				error = hammer_mirror_localize_data(
						ndata, &mrec->leaf);
			}
		}
		hammer_modify_buffer_done(data_buffer);
	} else {
		mrec->leaf.data_offset = 0;
		error = 0;
		ndata = NULL;
	}
	if (error)
		goto failed;

	/*
	 * Do the insertion.  This can fail with a EDEADLK or EALREADY
	 */
	cursor->flags |= HAMMER_CURSOR_INSERT;
	error = hammer_btree_lookup(cursor);
	if (error != ENOENT) {
		if (error == 0)
			error = EALREADY;
		goto failed;
	}

	error = hammer_btree_insert(cursor, &mrec->leaf, &doprop);

	/*
	 * Cursor is left on the current element, we want to skip it now.
	 */
	cursor->flags |= HAMMER_CURSOR_ATEDISK;
	cursor->flags &= ~HAMMER_CURSOR_INSERT;

	/*
	 * Track a count of active inodes.
	 */
	if (error == 0 &&
	    mrec->leaf.base.rec_type == HAMMER_RECTYPE_INODE &&
	    mrec->leaf.base.delete_tid == 0) {
		hammer_modify_volume_field(trans,
					   trans->rootvol,
					   vol0_stat_inodes);
		++trans->hmp->rootvol->ondisk->vol0_stat_inodes;
		hammer_modify_volume_done(trans->rootvol);
	}

	/*
	 * vol0_next_tid must track the highest TID stored in the filesystem.
	 * We do not need to generate undo for this update.
	 */
	high_tid = mrec->leaf.base.create_tid;
	if (high_tid < mrec->leaf.base.delete_tid)
		high_tid = mrec->leaf.base.delete_tid;
	if (trans->rootvol->ondisk->vol0_next_tid < high_tid) {
		hammer_modify_volume(trans, trans->rootvol, NULL, 0);
		trans->rootvol->ondisk->vol0_next_tid = high_tid;
		hammer_modify_volume_done(trans->rootvol);
	}

	/*
	 * WARNING!  cursor's leaf pointer may have changed after
	 * do_propagation returns.
	 */
	if (error == 0 && doprop)
		hammer_btree_do_propagation(cursor, NULL, &mrec->leaf);

failed:
	/*
	 * Cleanup
	 */
	if (error && mrec->leaf.data_offset) {
		hammer_blockmap_free(cursor->trans,
				     mrec->leaf.data_offset,
				     mrec->leaf.data_len);
	}
	hammer_sync_unlock(trans);
	if (data_buffer)
		hammer_rel_buffer(data_buffer, 0);
	return(error);
}

/*
 * Localize the data payload.  Directory entries may need their
 * localization adjusted.
 */
static
int
hammer_mirror_localize_data(hammer_data_ondisk_t data,
			    hammer_btree_leaf_elm_t leaf)
{
	u_int32_t localization;

	if (leaf->base.rec_type == HAMMER_RECTYPE_DIRENTRY) {
		localization = leaf->base.localization &
			       HAMMER_LOCALIZE_PSEUDOFS_MASK;
		if (data->entry.localization != localization) {
			data->entry.localization = localization;
			hammer_crc_set_leaf(data, leaf);
		}
	}
	return(0);
}

#endif