1 /* 2 * Copyright (c) 2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 /* 35 * HAMMER mirroring ioctls - serialize and deserialize modifications made 36 * to a filesystem. 
 */

#include "hammer.h"

/*
 * Forward declarations for the mirror-write helpers below.  All of them
 * operate on a shared tracking cursor owned by hammer_ioc_mirror_write().
 */
static int hammer_mirror_check(hammer_cursor_t cursor,
		struct hammer_ioc_mrecord_rec *mrec);
static int hammer_mirror_update(hammer_cursor_t cursor,
		struct hammer_ioc_mrecord_rec *mrec);
static int hammer_ioc_mirror_write_rec(hammer_cursor_t cursor,
		struct hammer_ioc_mrecord_rec *mrec,
		struct hammer_ioc_mirror_rw *mirror,
		u_int32_t localization,
		char *uptr);
static int hammer_ioc_mirror_write_pass(hammer_cursor_t cursor,
		struct hammer_ioc_mrecord_rec *mrec,
		struct hammer_ioc_mirror_rw *mirror,
		u_int32_t localization);
static int hammer_ioc_mirror_write_skip(hammer_cursor_t cursor,
		struct hammer_ioc_mrecord_skip *mrec,
		struct hammer_ioc_mirror_rw *mirror,
		u_int32_t localization);
static int hammer_mirror_delete_to(hammer_cursor_t cursor,
		struct hammer_ioc_mirror_rw *mirror);
static int hammer_mirror_nomirror(struct hammer_base_elm *base);

/*
 * All B-Tree records within the specified key range which also conform
 * to the transaction id range are returned.  Mirroring code keeps track
 * of the last transaction id fully scanned and can efficiently pick up
 * where it left off if interrupted.
 *
 * The PFS is identified in the mirror structure.  The passed ip is just
 * some directory in the overall HAMMER filesystem and has nothing to
 * do with the PFS.
 */
int
hammer_ioc_mirror_read(hammer_transaction_t trans, hammer_inode_t ip,
		       struct hammer_ioc_mirror_rw *mirror)
{
	struct hammer_cmirror cmirror;
	struct hammer_cursor cursor;
	union hammer_ioc_mrecord_any mrec;
	hammer_btree_leaf_elm_t elm;
	const int crc_start = HAMMER_MREC_CRCOFF;
	char *uptr;
	int error;
	int data_len;
	int bytes;
	int eatdisk;
	int mrec_flags;
	u_int32_t localization;
	u_int32_t rec_crc;

	/* PFS id forms the upper 16 bits of the localization field */
	localization = (u_int32_t)mirror->pfs_id << 16;

	/*
	 * The caller may not specify a pseudofs localization directly;
	 * it comes from pfs_id.  key_beg must not be beyond key_end.
	 */
	if ((mirror->key_beg.localization | mirror->key_end.localization) &
	    HAMMER_LOCALIZE_PSEUDOFS_MASK) {
		return(EINVAL);
	}
	if (hammer_btree_cmp(&mirror->key_beg, &mirror->key_end) > 0)
		return(EINVAL);

	mirror->key_cur = mirror->key_beg;
	mirror->key_cur.localization &= HAMMER_LOCALIZE_MASK;
	mirror->key_cur.localization += localization;
	bzero(&mrec, sizeof(mrec));
	bzero(&cmirror, sizeof(cmirror));

	/*
	 * Make CRC errors non-fatal (at least on data), causing an EDOM
	 * error instead of EIO.
	 */
	trans->flags |= HAMMER_TRANSF_CRCDOM;

retry:
	error = hammer_init_cursor(trans, &cursor, NULL, NULL);
	if (error) {
		hammer_done_cursor(&cursor);
		goto failed;
	}
	cursor.key_beg = mirror->key_cur;
	cursor.key_end = mirror->key_end;
	cursor.key_end.localization &= HAMMER_LOCALIZE_MASK;
	cursor.key_end.localization += localization;

	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
	cursor.flags |= HAMMER_CURSOR_BACKEND;

	/*
	 * This flag filters the search to only return elements whose
	 * create or delete TID is >= mirror_tid.  The B-Tree uses the
	 * mirror_tid field stored with internal and leaf nodes to shortcut
	 * the scan.
	 */
	cursor.flags |= HAMMER_CURSOR_MIRROR_FILTERED;
	cursor.cmirror = &cmirror;
	cmirror.mirror_tid = mirror->tid_beg;

	error = hammer_btree_first(&cursor);
	while (error == 0) {
		/*
		 * Yield to more important tasks
		 * (the error == 0 guard is redundant here given the loop
		 * condition, but kept as-is).
		 */
		if (error == 0) {
			error = hammer_signal_check(trans->hmp);
			if (error)
				break;
		}

		/*
		 * An internal node can be returned in mirror-filtered
		 * mode and indicates that the scan is returning a skip
		 * range in the cursor->cmirror structure.
		 */
		uptr = (char *)mirror->ubuf + mirror->count;
		if (cursor.node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
			/*
			 * Check space in the user buffer for a SKIP record.
			 */
			mirror->key_cur = cmirror.skip_beg;
			bytes = sizeof(mrec.skip);
			if (mirror->count + HAMMER_HEAD_DOALIGN(bytes) >
			    mirror->size) {
				break;
			}

			/*
			 * Fill mrec and copy out.  The CRC covers everything
			 * from rec_size onward (crc_start == offset of the
			 * rec_crc field's successor).
			 */
			mrec.head.signature = HAMMER_IOC_MIRROR_SIGNATURE;
			mrec.head.type = HAMMER_MREC_TYPE_SKIP;
			mrec.head.rec_size = bytes;
			mrec.skip.skip_beg = cmirror.skip_beg;
			mrec.skip.skip_end = cmirror.skip_end;
			mrec.head.rec_crc = crc32(&mrec.head.rec_size,
						  bytes - crc_start);
			error = copyout(&mrec, uptr, bytes);
			eatdisk = 0;
			goto didwrite;
		}

		/*
		 * Leaf node.  In full-history mode we could filter out
		 * elements modified outside the user-requested TID range.
		 *
		 * However, such elements must be returned so the writer
		 * can compare them against the target to determine what
		 * needs to be deleted on the target, particularly for
		 * no-history mirrors.
		 */
		KKASSERT(cursor.node->ondisk->type == HAMMER_BTREE_TYPE_LEAF);
		elm = &cursor.node->ondisk->elms[cursor.index].leaf;
		mirror->key_cur = elm->base;

		/*
		 * If the record was created after our end point we just
		 * ignore it.
		 */
		if (elm->base.create_tid > mirror->tid_end) {
			error = 0;
			bytes = 0;
			eatdisk = 1;
			goto didwrite;
		}

		/*
		 * Determine if we should generate a PASS or a REC.  PASS
		 * records are records without any data payload.  Such
		 * records will be generated if the target is already expected
		 * to have the record, allowing it to delete the gaps.
		 *
		 * A PASS record is also used to perform deletions on the
		 * target.
		 *
		 * Such deletions are needed if the master or files on the
		 * master are no-history, or if the slave is so far behind
		 * the master has already been pruned.
		 */
		if (elm->base.create_tid < mirror->tid_beg) {
			bytes = sizeof(mrec.rec);
			if (mirror->count + HAMMER_HEAD_DOALIGN(bytes) >
			    mirror->size) {
				break;
			}

			/*
			 * Fill mrec.  PASS records carry the leaf but no
			 * data payload.
			 */
			mrec.head.signature = HAMMER_IOC_MIRROR_SIGNATURE;
			mrec.head.type = HAMMER_MREC_TYPE_PASS;
			mrec.head.rec_size = bytes;
			mrec.rec.leaf = *elm;
			mrec.head.rec_crc = crc32(&mrec.head.rec_size,
						  bytes - crc_start);
			error = copyout(&mrec, uptr, bytes);
			eatdisk = 1;
			goto didwrite;
		}

		/*
		 * The core code exports the data to userland.
		 *
		 * CRC errors on data are reported but passed through,
		 * but the data must be washed by the user program.
		 *
		 * If userland just wants the btree records it can
		 * request that bulk data not be returned.  This is
		 * used during mirror-stream histogram generation.
		 */
		mrec_flags = 0;
		data_len = (elm->data_offset) ? elm->data_len : 0;
		if (data_len &&
		    (mirror->head.flags & HAMMER_IOC_MIRROR_NODATA)) {
			data_len = 0;
			mrec_flags |= HAMMER_MRECF_NODATA;
		}
		if (data_len) {
			error = hammer_btree_extract(&cursor,
						     HAMMER_CURSOR_GET_DATA);
			if (error) {
				if (error != EDOM)
					break;
				/* EDOM == data CRC failure: flag and pass */
				mrec_flags |= HAMMER_MRECF_CRC_ERROR |
					      HAMMER_MRECF_DATA_CRC_BAD;
			}
		}

		bytes = sizeof(mrec.rec) + data_len;
		if (mirror->count + HAMMER_HEAD_DOALIGN(bytes) > mirror->size)
			break;

		/*
		 * Construct the record for userland and copyout.
		 *
		 * The user is asking for a snapshot, if the record was
		 * deleted beyond the user-requested ending tid, the record
		 * is not considered deleted from the point of view of
		 * userland and delete_tid is cleared.
		 */
		mrec.head.signature = HAMMER_IOC_MIRROR_SIGNATURE;
		mrec.head.type = HAMMER_MREC_TYPE_REC | mrec_flags;
		mrec.head.rec_size = bytes;
		mrec.rec.leaf = *elm;

		if (elm->base.delete_tid > mirror->tid_end)
			mrec.rec.leaf.base.delete_tid = 0;
		/* CRC covers the header+leaf, then is extended over data */
		rec_crc = crc32(&mrec.head.rec_size,
				sizeof(mrec.rec) - crc_start);
		if (data_len)
			rec_crc = crc32_ext(cursor.data, data_len, rec_crc);
		mrec.head.rec_crc = rec_crc;
		error = copyout(&mrec, uptr, sizeof(mrec.rec));
		if (data_len && error == 0) {
			error = copyout(cursor.data, uptr + sizeof(mrec.rec),
					data_len);
		}
		eatdisk = 1;

		/*
		 * eatdisk controls whether we skip the current cursor
		 * position on the next scan or not.  If doing a SKIP
		 * the cursor is already positioned properly for the next
		 * scan and eatdisk will be 0.
		 */
didwrite:
		if (error == 0) {
			mirror->count += HAMMER_HEAD_DOALIGN(bytes);
			if (eatdisk)
				cursor.flags |= HAMMER_CURSOR_ATEDISK;
			else
				cursor.flags &= ~HAMMER_CURSOR_ATEDISK;
			error = hammer_btree_iterate(&cursor);
		}
	}
	if (error == ENOENT) {
		/* normal termination: the whole range was scanned */
		mirror->key_cur = mirror->key_end;
		error = 0;
	}
	hammer_done_cursor(&cursor);
	if (error == EDEADLK)
		goto retry;
	if (error == EINTR) {
		/* interrupted: report partial progress, not an error */
		mirror->head.flags |= HAMMER_IOC_HEAD_INTR;
		error = 0;
	}
failed:
	mirror->key_cur.localization &= HAMMER_LOCALIZE_MASK;
	return(error);
}

/*
 * Copy records from userland to the target mirror.
 *
 * The PFS is identified in the mirror structure.  The passed ip is just
 * some directory in the overall HAMMER filesystem and has nothing to
 * do with the PFS.  In fact, there might not even be a root directory for
 * the PFS yet!
 */
int
hammer_ioc_mirror_write(hammer_transaction_t trans, hammer_inode_t ip,
			struct hammer_ioc_mirror_rw *mirror)
{
	union hammer_ioc_mrecord_any mrec;
	struct hammer_cursor cursor;
	u_int32_t localization;
	int checkspace_count = 0;
	int error;
	int bytes;
	char *uptr;
	int seq;

	localization = (u_int32_t)mirror->pfs_id << 16;
	seq = trans->hmp->flusher.done;

	/*
	 * Validate the mirror structure and relocalize the tracking keys.
	 */
	if (mirror->size < 0 || mirror->size > 0x70000000)
		return(EINVAL);
	mirror->key_beg.localization &= HAMMER_LOCALIZE_MASK;
	mirror->key_beg.localization += localization;
	mirror->key_end.localization &= HAMMER_LOCALIZE_MASK;
	mirror->key_end.localization += localization;
	mirror->key_cur.localization &= HAMMER_LOCALIZE_MASK;
	mirror->key_cur.localization += localization;

	/*
	 * Set up our tracking cursor for the loop.  The tracking cursor
	 * is used to delete records that are no longer present on the
	 * master.
 * The last handled record at key_cur must be skipped.
	 */
	error = hammer_init_cursor(trans, &cursor, NULL, NULL);

	cursor.key_beg = mirror->key_cur;
	cursor.key_end = mirror->key_end;
	cursor.flags |= HAMMER_CURSOR_BACKEND;
	error = hammer_btree_first(&cursor);
	if (error == 0)
		cursor.flags |= HAMMER_CURSOR_ATEDISK;
	if (error == ENOENT)
		error = 0;

	/*
	 * Loop until our input buffer has been exhausted.
	 */
	while (error == 0 &&
	       mirror->count + sizeof(mrec.head) <= mirror->size) {

		/*
		 * Don't blow out the buffer cache.  Leave room for frontend
		 * cache as well.
		 *
		 * WARNING: See warnings in hammer_unlock_cursor() function.
		 */
		while (hammer_flusher_meta_halflimit(trans->hmp) ||
		       hammer_flusher_undo_exhausted(trans, 2)) {
			hammer_unlock_cursor(&cursor);
			hammer_flusher_wait(trans->hmp, seq);
			hammer_lock_cursor(&cursor);
			seq = hammer_flusher_async_one(trans->hmp);
		}

		/*
		 * If there is insufficient free space it may be due to
		 * reserved big-blocks, which flushing might fix.
		 * Give up after 10 consecutive flush attempts.
		 */
		if (hammer_checkspace(trans->hmp, HAMMER_CHKSPC_MIRROR)) {
			if (++checkspace_count == 10) {
				error = ENOSPC;
				break;
			}
			hammer_unlock_cursor(&cursor);
			hammer_flusher_wait(trans->hmp, seq);
			hammer_lock_cursor(&cursor);
			seq = hammer_flusher_async(trans->hmp, NULL);
		}


		/*
		 * Acquire and validate header.  A partial copyin is fine;
		 * rec_size is checked against what was actually available.
		 */
		if ((bytes = mirror->size - mirror->count) > sizeof(mrec))
			bytes = sizeof(mrec);
		uptr = (char *)mirror->ubuf + mirror->count;
		error = copyin(uptr, &mrec, bytes);
		if (error)
			break;
		if (mrec.head.signature != HAMMER_IOC_MIRROR_SIGNATURE) {
			error = EINVAL;
			break;
		}
		if (mrec.head.rec_size < sizeof(mrec.head) ||
		    mrec.head.rec_size > sizeof(mrec) + HAMMER_XBUFSIZE ||
		    mirror->count + mrec.head.rec_size > mirror->size) {
			error = EINVAL;
			break;
		}

		/*
		 * Dispatch on the record type.  Each handler validates
		 * rec_size for its own record layout first.
		 */
		switch(mrec.head.type & HAMMER_MRECF_TYPE_MASK) {
		case HAMMER_MREC_TYPE_SKIP:
			if (mrec.head.rec_size != sizeof(mrec.skip))
				error = EINVAL;
			if (error == 0)
				error = hammer_ioc_mirror_write_skip(
						&cursor, &mrec.skip,
						mirror, localization);
			break;
		case HAMMER_MREC_TYPE_REC:
			if (mrec.head.rec_size < sizeof(mrec.rec))
				error = EINVAL;
			if (error == 0)
				error = hammer_ioc_mirror_write_rec(
						&cursor, &mrec.rec,
						mirror, localization,
						uptr + sizeof(mrec.rec));
			break;
		case HAMMER_MREC_TYPE_REC_NODATA:
		case HAMMER_MREC_TYPE_REC_BADCRC:
			/*
			 * Records with bad data payloads are ignored XXX.
			 * Records with no data payload have to be skipped
			 * (they shouldn't have been written in the first
			 * place).
			 */
			if (mrec.head.rec_size < sizeof(mrec.rec))
				error = EINVAL;
			break;
		case HAMMER_MREC_TYPE_PASS:
			if (mrec.head.rec_size != sizeof(mrec.rec))
				error = EINVAL;
			if (error == 0)
				error = hammer_ioc_mirror_write_pass(
						&cursor, &mrec.rec,
						mirror, localization);
			break;
		default:
			error = EINVAL;
			break;
		}

		/*
		 * Retry the current record on deadlock, otherwise setup
		 * for the next loop.
		 */
		if (error == EDEADLK) {
			while (error == EDEADLK) {
				hammer_sync_lock_sh(trans);
				hammer_recover_cursor(&cursor);
				error = hammer_cursor_upgrade(&cursor);
				hammer_sync_unlock(trans);
			}
		} else {
			if (error == EALREADY)
				error = 0;
			if (error == 0) {
				mirror->count +=
					HAMMER_HEAD_DOALIGN(mrec.head.rec_size);
			}
		}
	}
	hammer_done_cursor(&cursor);

	/*
	 * cumulative error
	 */
	if (error) {
		mirror->head.flags |= HAMMER_IOC_HEAD_ERROR;
		mirror->head.error = error;
	}

	/*
	 * ioctls don't update the RW data structure if an error is returned,
	 * always return 0.
	 */
	return(0);
}

/*
 * Handle skip records.
 *
 * We must iterate from the last resolved record position at mirror->key_cur
 * to skip_beg non-inclusive and delete any records encountered.
 *
 * mirror->key_cur must be carefully set when we succeed in processing
 * this mrec.
 */
static int
hammer_ioc_mirror_write_skip(hammer_cursor_t cursor,
			     struct hammer_ioc_mrecord_skip *mrec,
			     struct hammer_ioc_mirror_rw *mirror,
			     u_int32_t localization)
{
	int error;

	/*
	 * Relocalize the skip range for the target PFS.
	 */
	mrec->skip_beg.localization &= HAMMER_LOCALIZE_MASK;
	mrec->skip_beg.localization += localization;
	mrec->skip_end.localization &= HAMMER_LOCALIZE_MASK;
	mrec->skip_end.localization += localization;

	/*
	 * Iterate from current position to skip_beg, deleting any records
	 * we encounter.
 * The record at skip_beg is not included (it is
	 * skipped).
	 */
	cursor->key_end = mrec->skip_beg;
	cursor->flags &= ~HAMMER_CURSOR_END_INCLUSIVE;
	cursor->flags |= HAMMER_CURSOR_BACKEND;
	error = hammer_mirror_delete_to(cursor, mirror);

	/*
	 * Now skip past the skip (which is the whole point of
	 * having a skip record).  The sender has not sent us any records
	 * for the skip area so we wouldn't know what to keep and what
	 * to delete anyway.
	 *
	 * Clear ATEDISK because skip_end is non-inclusive, so we can't
	 * count an exact match if we happened to get one.
	 */
	if (error == 0) {
		mirror->key_cur = mrec->skip_end;
		cursor->key_beg = mrec->skip_end;
		error = hammer_btree_lookup(cursor);
		cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
		if (error == ENOENT)
			error = 0;
	}
	return(error);
}

/*
 * Handle B-Tree records.
 *
 * We must iterate to mrec->base.key (non-inclusively), and then process
 * the record.  We are allowed to write a new record or delete an existing
 * record, but cannot replace an existing record.
 *
 * mirror->key_cur must be carefully set when we succeed in processing
 * this mrec.
 */
static int
hammer_ioc_mirror_write_rec(hammer_cursor_t cursor,
			    struct hammer_ioc_mrecord_rec *mrec,
			    struct hammer_ioc_mirror_rw *mirror,
			    u_int32_t localization,
			    char *uptr)
{
	int error;

	/*
	 * Sanity check the data payload length against the record size
	 * validated by the caller.
	 */
	if (mrec->leaf.data_len < 0 ||
	    mrec->leaf.data_len > HAMMER_XBUFSIZE ||
	    mrec->leaf.data_len + sizeof(*mrec) > mrec->head.rec_size) {
		return(EINVAL);
	}

	/*
	 * Re-localize for target.  relocalization of data is handled
	 * by hammer_mirror_write().
	 */
	mrec->leaf.base.localization &= HAMMER_LOCALIZE_MASK;
	mrec->leaf.base.localization += localization;

	/*
	 * Delete records through until we reach (non-inclusively) the
	 * target record.
	 */
	cursor->key_end = mrec->leaf.base;
	cursor->flags &= ~HAMMER_CURSOR_END_INCLUSIVE;
	cursor->flags |= HAMMER_CURSOR_BACKEND;
	error = hammer_mirror_delete_to(cursor, mirror);

	/*
	 * Certain records are not part of the mirroring operation
	 */
	if (error == 0 && hammer_mirror_nomirror(&mrec->leaf.base))
		return(0);

	/*
	 * Locate the record.
	 *
	 * If the record exists only the delete_tid may be updated.
	 *
	 * If the record does not exist we can create it only if the
	 * create_tid is not too old.  If the create_tid is too old
	 * it may have already been destroyed on the slave from pruning.
	 *
	 * Note that mirror operations are effectively as-of operations
	 * and delete_tid can be 0 for mirroring purposes even if it is
	 * not actually 0 at the originator.
	 *
	 * These functions can return EDEADLK
	 */
	if (error == 0) {
		cursor->key_beg = mrec->leaf.base;
		cursor->flags |= HAMMER_CURSOR_BACKEND;
		cursor->flags &= ~HAMMER_CURSOR_INSERT;
		error = hammer_btree_lookup(cursor);
	}

	if (error == 0 && hammer_mirror_check(cursor, mrec)) {
		error = hammer_mirror_update(cursor, mrec);
	} else if (error == ENOENT) {
		if (mrec->leaf.base.create_tid >= mirror->tid_beg) {
			error = hammer_create_at_cursor(
					cursor, &mrec->leaf,
					uptr, HAMMER_CREATE_MODE_UMIRROR);
		} else {
			/* too old to matter; already pruned on the slave */
			error = 0;
		}
	}
	if (error == 0 || error == EALREADY)
		mirror->key_cur = mrec->leaf.base;
	return(error);
}

/*
 * This works like write_rec but no write or update is necessary,
 * and no data payload is included so we couldn't do a write even
 * if we wanted to.
 *
 * We must still iterate for deletions, and we can validate the
 * record header which is a good way to test for corrupted mirror
 * targets XXX.
 *
 * mirror->key_cur must be carefully set when we succeed in processing
 * this mrec.
 */
static
int
hammer_ioc_mirror_write_pass(hammer_cursor_t cursor,
			     struct hammer_ioc_mrecord_rec *mrec,
			     struct hammer_ioc_mirror_rw *mirror,
			     u_int32_t localization)
{
	int error;

	/*
	 * Re-localize for target.  Relocalization of data is handled
	 * by hammer_mirror_write().
	 */
	mrec->leaf.base.localization &= HAMMER_LOCALIZE_MASK;
	mrec->leaf.base.localization += localization;

	/*
	 * Delete records through until we reach (non-inclusively) the
	 * target record.
	 */
	cursor->key_end = mrec->leaf.base;
	cursor->flags &= ~HAMMER_CURSOR_END_INCLUSIVE;
	cursor->flags |= HAMMER_CURSOR_BACKEND;
	error = hammer_mirror_delete_to(cursor, mirror);

	/*
	 * Certain records are not part of the mirroring operation.
	 *
	 * NOTE(review): unlike write_rec this early-return does not check
	 * the error from hammer_mirror_delete_to() first, so a delete_to
	 * failure is silently discarded for nomirror records — confirm
	 * intended.
	 */
	if (hammer_mirror_nomirror(&mrec->leaf.base))
		return(0);

	/*
	 * Locate the record and get past it by setting ATEDISK.  Perform
	 * any necessary deletions.  We have no data payload and cannot
	 * create a new record.
	 */
	if (error == 0) {
		mirror->key_cur = mrec->leaf.base;
		cursor->key_beg = mrec->leaf.base;
		cursor->flags |= HAMMER_CURSOR_BACKEND;
		cursor->flags &= ~HAMMER_CURSOR_INSERT;
		error = hammer_btree_lookup(cursor);
		if (error == 0) {
			if (hammer_mirror_check(cursor, mrec))
				error = hammer_mirror_update(cursor, mrec);
			cursor->flags |= HAMMER_CURSOR_ATEDISK;
		} else {
			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
		}
		if (error == ENOENT)
			error = 0;
	}
	return(error);
}

/*
 * As part of the mirror write we iterate across swaths of records
 * on the target which no longer exist on the source, and mark them
 * deleted.
 *
 * The caller has indexed the cursor and set up key_end.  We iterate
 * through to key_end.
 *
 * There is an edge case where the master has deleted a record whose
 * create_tid exactly matches our end_tid.
 * We cannot delete this
 * record on the slave yet because we cannot assign delete_tid == create_tid.
 * The deletion should be picked up on the next sequence since in order
 * to have been deleted on the master a transaction must have occurred with
 * a TID greater than the create_tid of the record.
 *
 * To support incremental re-mirroring, just for robustness, we do not
 * touch any records created beyond (or equal to) mirror->tid_end.
 */
static
int
hammer_mirror_delete_to(hammer_cursor_t cursor,
			struct hammer_ioc_mirror_rw *mirror)
{
	hammer_btree_leaf_elm_t elm;
	int error;

	error = hammer_btree_iterate(cursor);
	while (error == 0) {
		elm = &cursor->node->ondisk->elms[cursor->index].leaf;
		KKASSERT(elm->base.btype == HAMMER_BTREE_TYPE_RECORD);
		cursor->flags |= HAMMER_CURSOR_ATEDISK;

		/*
		 * Certain records are not part of the mirroring operation
		 */
		if (hammer_mirror_nomirror(&elm->base)) {
			error = hammer_btree_iterate(cursor);
			continue;
		}

		/*
		 * Note: Must still delete records with create_tid < tid_beg,
		 * as record may have been pruned-away on source.
		 */
		if (elm->base.delete_tid == 0 &&
		    elm->base.create_tid < mirror->tid_end) {
			error = hammer_delete_at_cursor(cursor,
							HAMMER_DELETE_ADJUST,
							mirror->tid_end,
							time_second,
							1, NULL);
		}
		if (error == 0)
			error = hammer_btree_iterate(cursor);
	}
	if (error == ENOENT)
		error = 0;
	return(error);
}

/*
 * Check whether an update is needed in the case where a match already
 * exists on the target.  The only type of update allowed in this case
 * is an update of the delete_tid.
 *
 * Return non-zero if the update should proceed.
779 */ 780 static 781 int 782 hammer_mirror_check(hammer_cursor_t cursor, struct hammer_ioc_mrecord_rec *mrec) 783 { 784 hammer_btree_leaf_elm_t leaf = cursor->leaf; 785 786 if (leaf->base.delete_tid != mrec->leaf.base.delete_tid) { 787 if (mrec->leaf.base.delete_tid != 0) 788 return(1); 789 } 790 return(0); 791 } 792 793 /* 794 * Filter out records which are never mirrored, such as configuration space 795 * records (for hammer cleanup). 796 * 797 * NOTE: We currently allow HAMMER_RECTYPE_SNAPSHOT records to be mirrored. 798 */ 799 static 800 int 801 hammer_mirror_nomirror(struct hammer_base_elm *base) 802 { 803 /* 804 * Certain types of records are never updated when mirroring. 805 * Slaves have their own configuration space. 806 */ 807 if (base->rec_type == HAMMER_RECTYPE_CONFIG) 808 return(1); 809 return(0); 810 } 811 812 813 /* 814 * Update a record in-place. Only the delete_tid can change, and 815 * only from zero to non-zero. 816 */ 817 static 818 int 819 hammer_mirror_update(hammer_cursor_t cursor, 820 struct hammer_ioc_mrecord_rec *mrec) 821 { 822 int error; 823 824 /* 825 * This case shouldn't occur. 826 */ 827 if (mrec->leaf.base.delete_tid == 0) 828 return(0); 829 830 /* 831 * Mark the record deleted on the mirror target. 832 */ 833 error = hammer_delete_at_cursor(cursor, HAMMER_DELETE_ADJUST, 834 mrec->leaf.base.delete_tid, 835 mrec->leaf.delete_ts, 836 1, NULL); 837 cursor->flags |= HAMMER_CURSOR_ATEDISK; 838 return(error); 839 } 840 841 #if 0 842 /* 843 * MOVED TO HAMMER_OBJECT.C: hammer_create_at_cursor() 844 */ 845 846 static int hammer_mirror_localize_data(hammer_data_ondisk_t data, 847 hammer_btree_leaf_elm_t leaf); 848 849 /* 850 * Write out a new record. 
 */
static
int
hammer_mirror_write(hammer_cursor_t cursor,
		    struct hammer_ioc_mrecord_rec *mrec,
		    char *udata)
{
	hammer_transaction_t trans;
	hammer_buffer_t data_buffer;
	hammer_off_t ndata_offset;
	hammer_tid_t high_tid;
	void *ndata;
	int error;
	int doprop;

	trans = cursor->trans;
	data_buffer = NULL;

	/*
	 * Get the sync lock so the whole mess is atomic
	 */
	hammer_sync_lock_sh(trans);

	/*
	 * Allocate and adjust data
	 */
	if (mrec->leaf.data_len && mrec->leaf.data_offset) {
		ndata = hammer_alloc_data(trans, mrec->leaf.data_len,
					  mrec->leaf.base.rec_type,
					  &ndata_offset, &data_buffer,
					  0, &error);
		/*
		 * NOTE(review): this early return does not drop the sync
		 * lock acquired above (code is under #if 0 / dead).
		 */
		if (ndata == NULL)
			return(error);
		mrec->leaf.data_offset = ndata_offset;
		hammer_modify_buffer_noundo(trans, data_buffer);
		error = copyin(udata, ndata, mrec->leaf.data_len);
		if (error == 0) {
			if (hammer_crc_test_leaf(ndata, &mrec->leaf) == 0) {
				kprintf("data crc mismatch on pipe\n");
				error = EINVAL;
			} else {
				error = hammer_mirror_localize_data(
						ndata, &mrec->leaf);
			}
		}
		hammer_modify_buffer_done(data_buffer);
	} else {
		mrec->leaf.data_offset = 0;
		error = 0;
		ndata = NULL;
	}
	if (error)
		goto failed;

	/*
	 * Do the insertion.  This can fail with a EDEADLK or EALREADY
	 */
	cursor->flags |= HAMMER_CURSOR_INSERT;
	error = hammer_btree_lookup(cursor);
	if (error != ENOENT) {
		if (error == 0)
			error = EALREADY;
		goto failed;
	}

	error = hammer_btree_insert(cursor, &mrec->leaf, &doprop);

	/*
	 * Cursor is left on the current element, we want to skip it now.
	 */
	cursor->flags |= HAMMER_CURSOR_ATEDISK;
	cursor->flags &= ~HAMMER_CURSOR_INSERT;

	/*
	 * Track a count of active inodes.
 */
	if (error == 0 &&
	    mrec->leaf.base.rec_type == HAMMER_RECTYPE_INODE &&
	    mrec->leaf.base.delete_tid == 0) {
		hammer_modify_volume_field(trans,
					   trans->rootvol,
					   vol0_stat_inodes);
		++trans->hmp->rootvol->ondisk->vol0_stat_inodes;
		hammer_modify_volume_done(trans->rootvol);
	}

	/*
	 * vol0_next_tid must track the highest TID stored in the filesystem.
	 * We do not need to generate undo for this update.
	 */
	high_tid = mrec->leaf.base.create_tid;
	if (high_tid < mrec->leaf.base.delete_tid)
		high_tid = mrec->leaf.base.delete_tid;
	if (trans->rootvol->ondisk->vol0_next_tid < high_tid) {
		hammer_modify_volume_noundo(trans, trans->rootvol);
		trans->rootvol->ondisk->vol0_next_tid = high_tid;
		hammer_modify_volume_done(trans->rootvol);
	}

	/*
	 * WARNING!  cursor's leaf pointer may have changed after
	 * do_propagation returns.
	 */
	if (error == 0 && doprop)
		hammer_btree_do_propagation(cursor, NULL, &mrec->leaf);

failed:
	/*
	 * Cleanup: release any allocated data block on error, drop the
	 * sync lock and the data buffer reference.
	 */
	if (error && mrec->leaf.data_offset) {
		hammer_blockmap_free(cursor->trans,
				     mrec->leaf.data_offset,
				     mrec->leaf.data_len);
	}
	hammer_sync_unlock(trans);
	if (data_buffer)
		hammer_rel_buffer(data_buffer, 0);
	return(error);
}

/*
 * Localize the data payload.  Directory entries may need their
 * localization adjusted.
 */
static
int
hammer_mirror_localize_data(hammer_data_ondisk_t data,
			    hammer_btree_leaf_elm_t leaf)
{
	u_int32_t localization;

	if (leaf->base.rec_type == HAMMER_RECTYPE_DIRENTRY) {
		localization = leaf->base.localization &
			       HAMMER_LOCALIZE_PSEUDOFS_MASK;
		if (data->entry.localization != localization) {
			data->entry.localization = localization;
			/* data changed: the stored leaf data CRC must follow */
			hammer_crc_set_leaf(data, leaf);
		}
	}
	return(0);
}

#endif