1 /* 2 * Copyright (c) 2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * $DragonFly: src/sys/vfs/hammer/hammer_mirror.c,v 1.9 2008/07/09 10:29:20 dillon Exp $ 35 */ 36 /* 37 * HAMMER mirroring ioctls - serialize and deserialize modifications made 38 * to a filesystem. 39 */ 40 41 #include "hammer.h" 42 43 static int hammer_mirror_check(hammer_cursor_t cursor, 44 struct hammer_ioc_mrecord *mrec); 45 static int hammer_mirror_update(hammer_cursor_t cursor, 46 struct hammer_ioc_mrecord *mrec); 47 static int hammer_mirror_write(hammer_cursor_t cursor, 48 struct hammer_ioc_mrecord *mrec, 49 hammer_inode_t ip, char *udata); 50 static int hammer_mirror_localize_data(hammer_data_ondisk_t data, 51 hammer_btree_leaf_elm_t leaf); 52 53 /* 54 * All B-Tree records within the specified key range which also conform 55 * to the transaction id range are returned. Mirroring code keeps track 56 * of the last transaction id fully scanned and can efficiently pick up 57 * where it left off if interrupted. 58 * 59 * The PFS is identified in the mirror structure. The passed ip is just 60 * some directory in the overall HAMMER filesystem and has nothing to 61 * do with the PFS. 62 */ 63 int 64 hammer_ioc_mirror_read(hammer_transaction_t trans, hammer_inode_t ip, 65 struct hammer_ioc_mirror_rw *mirror) 66 { 67 struct hammer_cursor cursor; 68 struct hammer_ioc_mrecord mrec; 69 hammer_btree_leaf_elm_t elm; 70 const int head_size = HAMMER_MREC_HEADSIZE; 71 const int crc_start = HAMMER_MREC_CRCOFF; 72 char *uptr; 73 int error; 74 int data_len; 75 int bytes; 76 u_int32_t localization; 77 78 localization = (u_int32_t)mirror->pfs_id << 16; 79 80 if ((mirror->key_beg.localization | mirror->key_end.localization) & 81 HAMMER_LOCALIZE_PSEUDOFS_MASK) { 82 return(EINVAL); 83 } 84 if (hammer_btree_cmp(&mirror->key_beg, &mirror->key_end) > 0) 85 return(EINVAL); 86 87 mirror->key_cur = mirror->key_beg; 88 mirror->key_cur.localization += localization; 89 bzero(&mrec, sizeof(mrec)); 90 91 retry: 92 error = hammer_init_cursor(trans, &cursor, NULL, NULL); 93 if (error) { 94 hammer_done_cursor(&cursor); 95 goto failed; 96 } 97 cursor.key_beg = mirror->key_cur; 98 cursor.key_end = mirror->key_end; 99 cursor.key_end.localization += localization; 100 101 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; 102 cursor.flags |= HAMMER_CURSOR_BACKEND; 103 104 /* 105 * This flag filters the search to only return elements whos create 106 * or delete TID is >= mirror_tid. The B-Tree uses the mirror_tid 107 * field stored with internal and leaf nodes to shortcut the scan. 108 */ 109 cursor.flags |= HAMMER_CURSOR_MIRROR_FILTERED; 110 cursor.mirror_tid = mirror->tid_beg; 111 112 error = hammer_btree_first(&cursor); 113 while (error == 0) { 114 /* 115 * Leaf node. Only return elements modified in the range 116 * requested by userland. 117 */ 118 KKASSERT(cursor.node->ondisk->type == HAMMER_BTREE_TYPE_LEAF); 119 elm = &cursor.node->ondisk->elms[cursor.index].leaf; 120 121 if (elm->base.create_tid < mirror->tid_beg || 122 elm->base.create_tid >= mirror->tid_end) { 123 if (elm->base.delete_tid < mirror->tid_beg || 124 elm->base.delete_tid >= mirror->tid_end) { 125 goto skip; 126 } 127 } 128 129 mirror->key_cur = elm->base; 130 131 /* 132 * Yield to more important tasks 133 */ 134 if ((error = hammer_signal_check(trans->hmp)) != 0) 135 break; 136 if (trans->hmp->sync_lock.wanted) { 137 tsleep(trans, 0, "hmrslo", hz / 10); 138 } 139 if (trans->hmp->locked_dirty_space + 140 trans->hmp->io_running_space > hammer_limit_dirtybufspace) { 141 hammer_flusher_async(trans->hmp); 142 tsleep(trans, 0, "hmrslo", hz / 10); 143 } 144 145 /* 146 * The core code exports the data to userland. 147 */ 148 data_len = (elm->data_offset) ? elm->data_len : 0; 149 if (data_len) { 150 error = hammer_btree_extract(&cursor, 151 HAMMER_CURSOR_GET_DATA); 152 if (error) 153 break; 154 } 155 bytes = sizeof(struct hammer_ioc_mrecord) + data_len; 156 bytes = (bytes + HAMMER_HEAD_ALIGN_MASK) & 157 ~HAMMER_HEAD_ALIGN_MASK; 158 if (mirror->count + bytes > mirror->size) 159 break; 160 161 /* 162 * Construct the record for userland and copyout. 163 * 164 * The user is asking for a snapshot, if the record was 165 * deleted beyond the user-requested ending tid, the record 166 * is not considered deleted from the point of view of 167 * userland and delete_tid is cleared. 168 */ 169 mrec.signature = HAMMER_IOC_MIRROR_SIGNATURE; 170 mrec.type = HAMMER_MREC_TYPE_REC; 171 mrec.rec_size = bytes; 172 mrec.leaf = *elm; 173 if (elm->base.delete_tid >= mirror->tid_end) 174 mrec.leaf.base.delete_tid = 0; 175 mrec.rec_crc = crc32(&mrec.rec_size, head_size - crc_start); 176 uptr = (char *)mirror->ubuf + mirror->count; 177 error = copyout(&mrec, uptr, head_size); 178 if (data_len && error == 0) { 179 error = copyout(cursor.data, uptr + head_size, 180 data_len); 181 } 182 if (error == 0) 183 mirror->count += bytes; 184 skip: 185 if (error == 0) { 186 cursor.flags |= HAMMER_CURSOR_ATEDISK; 187 error = hammer_btree_iterate(&cursor); 188 } 189 } 190 if (error == ENOENT) { 191 mirror->key_cur = mirror->key_end; 192 error = 0; 193 } 194 hammer_done_cursor(&cursor); 195 if (error == EDEADLK) 196 goto retry; 197 if (error == EINTR) { 198 mirror->head.flags |= HAMMER_IOC_HEAD_INTR; 199 error = 0; 200 } 201 failed: 202 mirror->key_cur.localization &= HAMMER_LOCALIZE_MASK; 203 return(error); 204 } 205 206 /* 207 * Copy records from userland to the target mirror. Records which already 208 * exist may only have their delete_tid updated. 209 * 210 * The PFS is identified in the mirror structure. The passed ip is just 211 * some directory in the overall HAMMER filesystem and has nothing to 212 * do with the PFS. In fact, there might not even be a root directory for 213 * the PFS yet! 214 */ 215 int 216 hammer_ioc_mirror_write(hammer_transaction_t trans, hammer_inode_t ip, 217 struct hammer_ioc_mirror_rw *mirror) 218 { 219 struct hammer_cursor cursor; 220 struct hammer_ioc_mrecord mrec; 221 const int head_size = HAMMER_MREC_HEADSIZE; 222 const int crc_start = HAMMER_MREC_CRCOFF; 223 u_int32_t rec_crc; 224 int error; 225 char *uptr; 226 u_int32_t localization; 227 228 localization = (u_int32_t)mirror->pfs_id << 16; 229 230 if (mirror->size < 0 || mirror->size > 0x70000000) 231 return(EINVAL); 232 233 error = hammer_init_cursor(trans, &cursor, NULL, NULL); 234 retry: 235 hammer_normalize_cursor(&cursor); 236 237 while (error == 0 && mirror->count + head_size <= mirror->size) { 238 /* 239 * Acquire and validate header 240 */ 241 uptr = (char *)mirror->ubuf + mirror->count; 242 error = copyin(uptr, &mrec, head_size); 243 if (error) 244 break; 245 rec_crc = crc32(&mrec.rec_size, head_size - crc_start); 246 if (mrec.signature != HAMMER_IOC_MIRROR_SIGNATURE) { 247 error = EINVAL; 248 break; 249 } 250 if (mrec.type != HAMMER_MREC_TYPE_REC) { 251 error = EINVAL; 252 break; 253 } 254 if (rec_crc != mrec.rec_crc) { 255 error = EINVAL; 256 break; 257 } 258 if (mrec.rec_size < head_size || 259 mrec.rec_size > head_size + HAMMER_XBUFSIZE + 16 || 260 mirror->count + mrec.rec_size > mirror->size) { 261 error = EINVAL; 262 break; 263 } 264 if (mrec.leaf.data_len < 0 || 265 mrec.leaf.data_len > HAMMER_XBUFSIZE || 266 sizeof(struct hammer_ioc_mrecord) + mrec.leaf.data_len > mrec.rec_size) { 267 error = EINVAL; 268 } 269 270 /* 271 * Re-localize for target. relocalization of data is handled 272 * by hammer_mirror_write(). 273 */ 274 mrec.leaf.base.localization &= HAMMER_LOCALIZE_MASK; 275 mrec.leaf.base.localization += localization; 276 277 /* 278 * Locate the record. 279 * 280 * If the record exists only the delete_tid may be updated. 281 * 282 * If the record does not exist we create it. For now we 283 * ignore records with a non-zero delete_tid. Note that 284 * mirror operations are effective an as-of operation and 285 * delete_tid can be 0 for mirroring purposes even if it is 286 * not actually 0 at the originator. 287 */ 288 hammer_normalize_cursor(&cursor); 289 cursor.key_beg = mrec.leaf.base; 290 cursor.flags |= HAMMER_CURSOR_BACKEND; 291 cursor.flags &= ~HAMMER_CURSOR_INSERT; 292 error = hammer_btree_lookup(&cursor); 293 294 if (error == 0 && hammer_mirror_check(&cursor, &mrec)) { 295 hammer_sync_lock_sh(trans); 296 error = hammer_mirror_update(&cursor, &mrec); 297 hammer_sync_unlock(trans); 298 } else if (error == ENOENT && mrec.leaf.base.delete_tid == 0) { 299 hammer_sync_lock_sh(trans); 300 error = hammer_mirror_write(&cursor, &mrec, ip, 301 uptr + head_size); 302 hammer_sync_unlock(trans); 303 } else if (error == ENOENT) { 304 error = 0; 305 } 306 307 /* 308 * Clean for loop. It is ok if the record already exists 309 * on the target. 310 */ 311 if (error == EDEADLK) { 312 hammer_done_cursor(&cursor); 313 error = hammer_init_cursor(trans, &cursor, NULL, NULL); 314 goto retry; 315 } 316 317 if (error == EALREADY) 318 error = 0; 319 if (error == 0) 320 mirror->count += mrec.rec_size; 321 } 322 hammer_done_cursor(&cursor); 323 return(0); 324 } 325 326 /* 327 * Check whether an update is needed in the case where a match already 328 * exists on the target. The only type of update allowed in this case 329 * is an update of the delete_tid. 330 * 331 * Return non-zero if the update should proceed. 332 */ 333 static 334 int 335 hammer_mirror_check(hammer_cursor_t cursor, struct hammer_ioc_mrecord *mrec) 336 { 337 hammer_btree_leaf_elm_t leaf = cursor->leaf; 338 339 if (leaf->base.delete_tid != mrec->leaf.base.delete_tid) { 340 if (mrec->leaf.base.delete_tid != 0) 341 return(1); 342 } 343 return(0); 344 } 345 346 /* 347 * Update a record in-place. Only the delete_tid can change. 348 */ 349 static 350 int 351 hammer_mirror_update(hammer_cursor_t cursor, struct hammer_ioc_mrecord *mrec) 352 { 353 hammer_transaction_t trans; 354 hammer_btree_leaf_elm_t elm; 355 356 elm = cursor->leaf; 357 trans = cursor->trans; 358 359 if (mrec->leaf.base.delete_tid == 0) { 360 kprintf("mirror_write: object %016llx:%016llx deleted on " 361 "target, not deleted on source\n", 362 elm->base.obj_id, elm->base.key); 363 return(0); 364 } 365 366 KKASSERT(elm->base.create_tid < mrec->leaf.base.delete_tid); 367 hammer_modify_node(trans, cursor->node, elm, sizeof(*elm)); 368 elm->base.delete_tid = mrec->leaf.base.delete_tid; 369 elm->delete_ts = mrec->leaf.delete_ts; 370 hammer_modify_node_done(cursor->node); 371 372 /* 373 * Track a count of active inodes. 374 */ 375 if (elm->base.obj_type == HAMMER_RECTYPE_INODE) { 376 hammer_modify_volume_field(trans, 377 trans->rootvol, 378 vol0_stat_inodes); 379 --trans->hmp->rootvol->ondisk->vol0_stat_inodes; 380 hammer_modify_volume_done(trans->rootvol); 381 } 382 383 return(0); 384 } 385 386 /* 387 * Write out a new record. 388 */ 389 static 390 int 391 hammer_mirror_write(hammer_cursor_t cursor, struct hammer_ioc_mrecord *mrec, 392 hammer_inode_t ip, char *udata) 393 { 394 hammer_transaction_t trans; 395 hammer_buffer_t data_buffer; 396 hammer_off_t ndata_offset; 397 hammer_tid_t high_tid; 398 void *ndata; 399 int error; 400 int doprop; 401 402 #if 0 403 /* 404 * removed: all records are now duplicated, including the root 405 * inode. 406 */ 407 if (mrec->leaf.base.obj_id == HAMMER_OBJID_ROOT) { 408 if (mrec->leaf.base.rec_type == HAMMER_RECTYPE_INODE || 409 mrec->leaf.base.rec_type == HAMMER_RECTYPE_FIX) { 410 return(0); 411 } 412 } 413 #endif 414 415 trans = cursor->trans; 416 data_buffer = NULL; 417 418 /* 419 * Allocate and adjust data 420 */ 421 if (mrec->leaf.data_len && mrec->leaf.data_offset) { 422 ndata = hammer_alloc_data(trans, mrec->leaf.data_len, 423 mrec->leaf.base.rec_type, 424 &ndata_offset, &data_buffer, &error); 425 if (ndata == NULL) 426 return(error); 427 mrec->leaf.data_offset = ndata_offset; 428 hammer_modify_buffer(trans, data_buffer, NULL, 0); 429 error = copyin(udata, ndata, mrec->leaf.data_len); 430 if (error == 0) { 431 if (hammer_crc_test_leaf(ndata, &mrec->leaf) == 0) { 432 kprintf("data crc mismatch on pipe\n"); 433 error = EINVAL; 434 } else { 435 error = hammer_mirror_localize_data( 436 ndata, &mrec->leaf); 437 } 438 } 439 hammer_modify_buffer_done(data_buffer); 440 } else { 441 mrec->leaf.data_offset = 0; 442 error = 0; 443 ndata = NULL; 444 } 445 if (error) 446 goto failed; 447 448 /* 449 * Do the insertion 450 */ 451 cursor->flags |= HAMMER_CURSOR_INSERT; 452 error = hammer_btree_lookup(cursor); 453 if (error != ENOENT) { 454 if (error == 0) 455 error = EALREADY; 456 goto failed; 457 } 458 error = 0; 459 460 error = hammer_btree_insert(cursor, &mrec->leaf, &doprop); 461 462 /* 463 * Track a count of active inodes. 464 */ 465 if (error == 0 && mrec->leaf.base.delete_tid == 0 && 466 mrec->leaf.base.obj_type == HAMMER_RECTYPE_INODE) { 467 hammer_modify_volume_field(trans, 468 trans->rootvol, 469 vol0_stat_inodes); 470 ++trans->hmp->rootvol->ondisk->vol0_stat_inodes; 471 hammer_modify_volume_done(trans->rootvol); 472 } 473 474 /* 475 * vol0_next_tid must track the highest TID stored in the filesystem. 476 * We do not need to generate undo for this update. 477 */ 478 high_tid = mrec->leaf.base.create_tid; 479 if (high_tid < mrec->leaf.base.delete_tid) 480 high_tid = mrec->leaf.base.delete_tid; 481 if (trans->rootvol->ondisk->vol0_next_tid < high_tid) { 482 hammer_modify_volume(trans, trans->rootvol, NULL, 0); 483 trans->rootvol->ondisk->vol0_next_tid = high_tid; 484 hammer_modify_volume_done(trans->rootvol); 485 } 486 487 if (error == 0 && doprop) 488 hammer_btree_do_propagation(cursor, ip, &mrec->leaf); 489 490 failed: 491 /* 492 * Cleanup 493 */ 494 if (error && mrec->leaf.data_offset) { 495 hammer_blockmap_free(cursor->trans, 496 mrec->leaf.data_offset, 497 mrec->leaf.data_len); 498 } 499 if (data_buffer) 500 hammer_rel_buffer(data_buffer, 0); 501 return(error); 502 } 503 504 /* 505 * Localize the data payload. Directory entries may need their 506 * localization adjusted. 507 * 508 * PFS directory entries must be skipped entirely (return EALREADY). 509 */ 510 static 511 int 512 hammer_mirror_localize_data(hammer_data_ondisk_t data, 513 hammer_btree_leaf_elm_t leaf) 514 { 515 u_int32_t localization; 516 517 if (leaf->base.rec_type == HAMMER_RECTYPE_DIRENTRY) { 518 if (data->entry.obj_id == HAMMER_OBJID_ROOT) 519 return(EALREADY); 520 localization = leaf->base.localization & 521 HAMMER_LOCALIZE_PSEUDOFS_MASK; 522 if (data->entry.localization != localization) { 523 data->entry.localization = localization; 524 hammer_crc_set_leaf(data, leaf); 525 } 526 } 527 return(0); 528 } 529 530 /* 531 * Auto-detect the pseudofs. 532 */ 533 static 534 void 535 hammer_mirror_autodetect(struct hammer_ioc_pseudofs_rw *pfs, hammer_inode_t ip) 536 { 537 if (pfs->pfs_id == -1) 538 pfs->pfs_id = (int)(ip->obj_localization >> 16); 539 } 540 541 /* 542 * Get mirroring/pseudo-fs information 543 */ 544 int 545 hammer_ioc_get_pseudofs(hammer_transaction_t trans, hammer_inode_t ip, 546 struct hammer_ioc_pseudofs_rw *pfs) 547 { 548 hammer_pseudofs_inmem_t pfsm; 549 u_int32_t localization; 550 int error; 551 552 hammer_mirror_autodetect(pfs, ip); 553 if (pfs->pfs_id < 0 || pfs->pfs_id >= HAMMER_MAX_PFS) 554 return(EINVAL); 555 localization = (u_int32_t)pfs->pfs_id << 16; 556 pfs->bytes = sizeof(struct hammer_pseudofs_data); 557 pfs->version = HAMMER_IOC_PSEUDOFS_VERSION; 558 559 pfsm = hammer_load_pseudofs(trans, localization, &error); 560 if (error) { 561 hammer_rel_pseudofs(trans->hmp, pfsm); 562 return(error); 563 } 564 565 /* 566 * If the PFS is a master the sync tid is set by normal operation 567 * rather then the mirroring code, and will always track the 568 * real HAMMER filesystem. 569 */ 570 if (pfsm->pfsd.master_id >= 0) 571 pfsm->pfsd.sync_end_tid = trans->rootvol->ondisk->vol0_next_tid; 572 573 /* 574 * Copy out to userland. 575 */ 576 error = 0; 577 if (pfs->ondisk && error == 0) 578 error = copyout(&pfsm->pfsd, pfs->ondisk, sizeof(pfsm->pfsd)); 579 hammer_rel_pseudofs(trans->hmp, pfsm); 580 return(error); 581 } 582 583 /* 584 * Set mirroring/pseudo-fs information 585 */ 586 int 587 hammer_ioc_set_pseudofs(hammer_transaction_t trans, hammer_inode_t ip, 588 struct ucred *cred, struct hammer_ioc_pseudofs_rw *pfs) 589 { 590 hammer_pseudofs_inmem_t pfsm; 591 int error; 592 u_int32_t localization; 593 594 error = 0; 595 hammer_mirror_autodetect(pfs, ip); 596 if (pfs->pfs_id < 0 || pfs->pfs_id >= HAMMER_MAX_PFS) 597 error = EINVAL; 598 if (pfs->bytes != sizeof(pfsm->pfsd)) 599 error = EINVAL; 600 if (pfs->version != HAMMER_IOC_PSEUDOFS_VERSION) 601 error = EINVAL; 602 if (error == 0 && pfs->ondisk) { 603 /* 604 * Load the PFS so we can modify our in-core copy. 605 */ 606 localization = (u_int32_t)pfs->pfs_id << 16; 607 pfsm = hammer_load_pseudofs(trans, localization, &error); 608 error = copyin(pfs->ondisk, &pfsm->pfsd, sizeof(pfsm->pfsd)); 609 610 /* 611 * Save it back, create a root inode if we are in master 612 * mode and no root exists. 613 */ 614 if (error == 0) 615 error = hammer_mkroot_pseudofs(trans, cred, pfsm); 616 if (error == 0) 617 error = hammer_save_pseudofs(trans, pfsm); 618 hammer_rel_pseudofs(trans->hmp, pfsm); 619 } 620 return(error); 621 } 622 623