1 /* 2 * Copyright (c) 2010 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Ilya Dryomov <idryomov@gmail.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
 */

#include "hammer.h"

static __inline int validate_zone(hammer_off_t data_offset);

/*
 * Dedup two records (ioctl backend for the batched dedup pass).
 *
 * dedup->elm1 and dedup->elm2 identify two B-Tree leaf elements whose
 * data blocks are candidates for sharing.  Both elements are looked up
 * and extracted, their zones and data lengths are cross-checked, and
 * the data is compared byte-by-byte.  On success a reference is added
 * to elm1's data block (hammer_blockmap_dedup()), elm2's block is
 * freed and elm2's leaf is repointed at elm1's data offset.
 *
 * Returns 0 both on success and on "soft" failures, which are instead
 * reported to userland via dedup->head.flags (INVALID_ZONE,
 * CMP_FAILURE, UNDERFLOW).  A non-zero errno return means the
 * candidate should go to the caller's pass2 list and be retried later.
 *
 * NOTE: the ip argument is currently unused in this function.
 */
int
hammer_ioc_dedup(hammer_transaction_t trans, hammer_inode_t ip,
		 struct hammer_ioc_dedup *dedup)
{
	struct hammer_cursor cursor1, cursor2;
	int error;
	int seq;

	/*
	 * Enforce hammer filesystem version requirements
	 */
	if (trans->hmp->version < HAMMER_VOL_VERSION_FIVE) {
		kprintf("hammer: Filesystem must be upgraded to v5 "
			"before you can run dedup\n");
		return (EOPNOTSUPP);
	}

	/*
	 * Cursor1, return an error -> candidate goes to pass2 list
	 */
	error = hammer_init_cursor(trans, &cursor1, NULL, NULL);
	if (error)
		goto done_cursor;
	cursor1.key_beg = dedup->elm1;
	cursor1.flags |= HAMMER_CURSOR_BACKEND;

	error = hammer_btree_lookup(&cursor1);
	if (error)
		goto done_cursor;
	error = hammer_btree_extract(&cursor1, HAMMER_CURSOR_GET_LEAF |
				     HAMMER_CURSOR_GET_DATA);
	if (error)
		goto done_cursor;

	/*
	 * Cursor2, return an error -> candidate goes to pass2 list
	 */
	error = hammer_init_cursor(trans, &cursor2, NULL, NULL);
	if (error)
		goto done_cursors;
	cursor2.key_beg = dedup->elm2;
	cursor2.flags |= HAMMER_CURSOR_BACKEND;

	error = hammer_btree_lookup(&cursor2);
	if (error)
		goto done_cursors;
	error = hammer_btree_extract(&cursor2, HAMMER_CURSOR_GET_LEAF |
				     HAMMER_CURSOR_GET_DATA);
	if (error)
		goto done_cursors;

	/*
	 * Zone validation.  We can't de-dup any of the other zones
	 * (BTREE or META) or bad things will happen.
	 *
	 * Return with error = 0, but set an INVALID_ZONE flag.
	 */
	error = validate_zone(cursor1.leaf->data_offset) +
		validate_zone(cursor2.leaf->data_offset);
	if (error) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_INVALID_ZONE;
		error = 0;
		goto done_cursors;
	}

	/*
	 * Comparison checks
	 *
	 * If zones don't match or data_len fields aren't the same
	 * we consider it to be a comparison failure.
	 *
	 * Return with error = 0, but set a CMP_FAILURE flag.
	 */
	if ((cursor1.leaf->data_offset & HAMMER_OFF_ZONE_MASK) !=
	    (cursor2.leaf->data_offset & HAMMER_OFF_ZONE_MASK)) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
		goto done_cursors;
	}
	if (cursor1.leaf->data_len != cursor2.leaf->data_len) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
		goto done_cursors;
	}

	/* byte-by-byte comparison to be sure */
	if (bcmp(cursor1.data, cursor2.data, cursor1.leaf->data_len)) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
		goto done_cursors;
	}

	/*
	 * Upgrade both cursors together to an exclusive lock
	 *
	 * Return an error -> candidate goes to pass2 list
	 */
	hammer_sync_lock_sh(trans);
	error = hammer_cursor_upgrade2(&cursor1, &cursor2);
	if (error) {
		hammer_sync_unlock(trans);
		goto done_cursors;
	}

	/*
	 * Bump the reference count on elm1's data block.  An ERANGE
	 * from hammer_blockmap_dedup() means the block's refcount
	 * would underflow/overflow its accounting limits.
	 */
	error = hammer_blockmap_dedup(cursor1.trans,
			cursor1.leaf->data_offset, cursor1.leaf->data_len);
	if (error) {
		if (error == ERANGE) {
			/*
			 * Return with error = 0, but set an UNDERFLOW flag
			 */
			dedup->head.flags |= HAMMER_IOC_DEDUP_UNDERFLOW;
			error = 0;
			goto downgrade_cursors;
		} else {
			/*
			 * Return an error -> block goes to pass2 list
			 */
			goto downgrade_cursors;
		}
	}

	/*
	 * The cursor2's cache must be invalidated before calling
	 * hammer_blockmap_free(), otherwise it will not be able to
	 * invalidate the underlying data buffer.
	 */
	hammer_cursor_invalidate_cache(&cursor2);
	hammer_blockmap_free(cursor2.trans,
			cursor2.leaf->data_offset, cursor2.leaf->data_len);

	/*
	 * Repoint elm2's leaf at elm1's (now shared) data block.
	 */
	hammer_modify_node(cursor2.trans, cursor2.node,
			&cursor2.leaf->data_offset, sizeof(hammer_off_t));
	cursor2.leaf->data_offset = cursor1.leaf->data_offset;
	hammer_modify_node_done(cursor2.node);

downgrade_cursors:
	hammer_cursor_downgrade2(&cursor1, &cursor2);
	hammer_sync_unlock(trans);
done_cursors:
	/* NOTE: cursors are torn down in the reverse order of init */
	hammer_done_cursor(&cursor2);
done_cursor:
	hammer_done_cursor(&cursor1);

	/*
	 * Avoid deadlocking the buffer cache: throttle by flushing
	 * whenever meta-data or UNDO space is getting tight.
	 */
	seq = trans->hmp->flusher.done;
	while (hammer_flusher_meta_halflimit(trans->hmp) ||
	       hammer_flusher_undo_exhausted(trans, 2)) {
		hammer_flusher_wait(trans->hmp, seq);
		seq = hammer_flusher_async_one(trans->hmp);
	}
	return (error);
}

/*
 * Only the small-data and large-data zones contain de-dupable user
 * data.  Returns 0 if data_offset falls within one of them, 1 for any
 * other zone.
 */
static __inline int
validate_zone(hammer_off_t data_offset)
{
	switch(data_offset & HAMMER_OFF_ZONE_MASK) {
	case HAMMER_ZONE_LARGE_DATA:
	case HAMMER_ZONE_SMALL_DATA:
		return (0);
	default:
		return (1);
	}
}

/************************************************************************
 *				LIVE DEDUP				*
 ************************************************************************
 *
 * HAMMER Live Dedup (aka as efficient cp(1) implementation)
 *
 * The dedup cache is operated in a LRU fashion and destroyed on
 * unmount, so essentially this is a live dedup on a cached dataset and
 * not a full-fledged fs-wide one - we have a batched dedup for that.
 * We allow duplicate entries in the buffer cache, data blocks are
 * deduplicated only on their way to media.  By default the cache is
 * populated on reads only, but it can be populated on writes too.
 *
 * The main implementation gotcha is on-media requirement - in order for
 * a data block to be added to a dedup cache it has to be present on
 * disk.
This simplifies cache logic a lot - once data is laid out on
 * media it remains valid on media all the way up to the point where the
 * related big block the data was stored in is freed - so there is only
 * one place we need to bother with invalidation code.
 */

/*
 * RB-Tree support for dedup cache structures.  Entries are indexed
 * both by data CRC (for lookup by content) and by media data offset
 * (for invalidation when a big block is freed).
 */
RB_GENERATE2(hammer_dedup_crc_rb_tree, hammer_dedup_cache, crc_entry,
		hammer_dedup_crc_rb_compare, hammer_crc_t, crc);
RB_GENERATE2(hammer_dedup_off_rb_tree, hammer_dedup_cache, off_entry,
		hammer_dedup_off_rb_compare, hammer_off_t, data_offset);

/*
 * Argument bundle passed through RB_SCAN() to the invalidation
 * callback (see hammer_dedup_cache_inval()).
 */
struct hammer_dedup_inval {
	struct hammer_mount *hmp;
	hammer_off_t base_offset;
};

static int hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data);
static int hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc,
		void *data);
static __inline int _vnode_validate(hammer_dedup_cache_t dcp, void *data,
		int *errorp);
static __inline int _dev_validate(hammer_dedup_cache_t dcp, void *data,
		int *errorp);

/*
 * RB compare function for the CRC-indexed tree: orders entries by
 * data CRC.
 */
int
hammer_dedup_crc_rb_compare(hammer_dedup_cache_t dc1,
				hammer_dedup_cache_t dc2)
{
	if (dc1->crc < dc2->crc)
		return (-1);
	if (dc1->crc > dc2->crc)
		return (1);

	return (0);
}

/*
 * RB compare function for the offset-indexed tree: orders entries by
 * media data offset.
 */
int
hammer_dedup_off_rb_compare(hammer_dedup_cache_t dc1,
				hammer_dedup_cache_t dc2)
{
	if (dc1->data_offset < dc2->data_offset)
		return (-1);
	if (dc1->data_offset > dc2->data_offset)
		return (1);

	return (0);
}

/*
 * Range scan compare for invalidation: matches (returns 0 for) any
 * cache entry whose data_offset falls inside the large-block starting
 * at base_offset, i.e. [base_offset, base_offset + LARGEBLOCK_SIZE).
 */
static int
hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data)
{
	hammer_off_t off = ((struct hammer_dedup_inval *)data)->base_offset;

	if (dc->data_offset < off)
		return (-1);
	if (dc->data_offset >= off + HAMMER_LARGEBLOCK_SIZE)
		return (1);

	return (0);
}

/*
 * RB_SCAN callback: remove a matched entry from both trees and the
 * LRU list and free it.
 */
static int
hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc, void *data)
{
	hammer_mount_t hmp = ((struct hammer_dedup_inval *)data)->hmp;

	RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dc);
	RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dc);
	TAILQ_REMOVE(&hmp->dedup_lru_list, dc, lru_entry);

	--hmp->dedup_cache_count;
	kfree(dc, hmp->m_misc);

	return (0);
}

/*
 * Add (or refresh) a dedup cache entry for the data block described
 * by the B-Tree leaf element.
 *
 * If an entry with the same data CRC already exists it is re-used and
 * re-populated (moved to the LRU tail); otherwise a preallocated free
 * entry is consumed, or - when the cache is at its size limit - the
 * coldest entry (LRU head) is recycled.
 *
 * Returns the (re)inserted cache entry.
 */
hammer_dedup_cache_t
hammer_dedup_cache_add(hammer_inode_t ip, hammer_btree_leaf_elm_t leaf)
{
	hammer_dedup_cache_t dcp, tmp;
	hammer_mount_t hmp = ip->hmp;

	/*
	 * Ensure a spare entry is available.  kmalloc(M_WAITOK) may
	 * block, so re-test dedup_free_cache afterwards in case the
	 * slot was filled while we slept and free the extra one.
	 */
	if (hmp->dedup_free_cache == NULL) {
		tmp = kmalloc(sizeof(*tmp), hmp->m_misc, M_WAITOK | M_ZERO);
		if (hmp->dedup_free_cache == NULL)
			hmp->dedup_free_cache = tmp;
		else
			kfree(tmp, hmp->m_misc);
	}

	KKASSERT(leaf != NULL);

	/*
	 * Same-CRC entry already cached: pull it off the offset tree
	 * and LRU (it stays in the CRC tree) and just re-populate it.
	 */
	dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree,
			&hmp->rb_dedup_crc_root, leaf->data_crc);
	if (dcp != NULL) {
		RB_REMOVE(hammer_dedup_off_rb_tree,
				&hmp->rb_dedup_off_root, dcp);
		TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
		goto populate;
	}

	/*
	 * Take the preallocated entry while below the cache limit,
	 * otherwise recycle the least-recently-used entry.
	 */
	if (hmp->dedup_cache_count < hammer_live_dedup_cache_size) {
		dcp = hmp->dedup_free_cache;
		hmp->dedup_free_cache = NULL;
		++hmp->dedup_cache_count;
	} else {
		dcp = TAILQ_FIRST(&hmp->dedup_lru_list);
		RB_REMOVE(hammer_dedup_crc_rb_tree,
				&hmp->rb_dedup_crc_root, dcp);
		RB_REMOVE(hammer_dedup_off_rb_tree,
				&hmp->rb_dedup_off_root, dcp);
		TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
	}

	dcp->crc = leaf->data_crc;
	tmp = RB_INSERT(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp);
	KKASSERT(tmp == NULL);

populate:
	dcp->hmp = ip->hmp;
	dcp->obj_id = ip->obj_id;
	dcp->localization = ip->obj_localization;
	/* base.key - data_len: key appears to address the record end */
	dcp->file_offset = leaf->base.key - leaf->data_len;
	dcp->bytes = leaf->data_len;
	dcp->data_offset = leaf->data_offset;

	tmp = RB_INSERT(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp);
	KKASSERT(tmp == NULL);
	TAILQ_INSERT_TAIL(&hmp->dedup_lru_list, dcp, lru_entry);

	return (dcp);
}

/*
 * Look up a dedup cache entry by data CRC.  Returns NULL if not
 * cached.
 */
__inline hammer_dedup_cache_t
hammer_dedup_cache_lookup(hammer_mount_t hmp, hammer_crc_t crc)
{
	hammer_dedup_cache_t dcp;

	dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree,
			&hmp->rb_dedup_crc_root, crc);
	return dcp;
}

/*
 * Invalidate all dedup cache entries backed by the large-block
 * starting at base_offset (called when that big block is freed -
 * the single invalidation point mentioned in the header comment).
 */
void hammer_dedup_cache_inval(hammer_mount_t hmp, hammer_off_t base_offset)
{
	struct hammer_dedup_inval di;

	di.hmp = hmp;
	di.base_offset = base_offset;

	RB_SCAN(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root,
		hammer_dedup_scancmp, hammer_dedup_cache_inval_callback, &di);
}

/*
 * Tear down the entire dedup cache (unmount path): free every entry
 * and assert that both trees, the LRU and the count end up empty.
 */
void
hammer_destroy_dedup_cache(hammer_mount_t hmp)
{
	hammer_dedup_cache_t dcp;

	while ((dcp = TAILQ_FIRST(&hmp->dedup_lru_list)) != NULL) {
		RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp);
		RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp);
		TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
		--hmp->dedup_cache_count;
		kfree(dcp, hmp->m_misc);
	}

	KKASSERT(RB_EMPTY(&hmp->rb_dedup_crc_root));
	KKASSERT(RB_EMPTY(&hmp->rb_dedup_off_root));
	KKASSERT(TAILQ_EMPTY(&hmp->dedup_lru_list));

	KKASSERT(hmp->dedup_cache_count == 0);
}

/*
 * Validate that a cached dedup entry still describes a block that
 * matches 'data' (zone, length, then a byte-by-byte compare against
 * the on-media copy).  Returns 1 if the entry is a valid dedup match,
 * 0 otherwise.
 */
int
hammer_dedup_validate(hammer_dedup_cache_t dcp, int zone, int bytes,
			void *data)
{
	int error;

	/*
	 * Zone validation
	 */
	if (HAMMER_ZONE_DECODE(dcp->data_offset) != zone)
		return (0);

	/*
	 * Block length validation
	 */
	if (dcp->bytes != bytes)
		return (0);

	/*
	 * Byte-by-byte data comparison
	 *
	 * The data we need for validation may already be present in the
	 * buffer cache in two flavours: vnode-based buffer or
	 * block-device-based buffer.  In case vnode-based buffer wasn't
	 * there or if a non-blocking attempt to acquire it failed use
	 * device-based buffer (for large-zone data blocks it will
	 * generate a separate read).
	 *
	 * XXX vnode-based checking is not MP safe so when live-dedup
	 * is turned on we must always use the device buffer.
	 */
#if 0
	if (hammer_double_buffer) {
		error = 1;
	} else if (_vnode_validate(dcp, data, &error)) {
		hammer_live_dedup_vnode_bcmps++;
		return (1);
	} else {
		if (error == 3)
			hammer_live_dedup_findblk_failures++;
	}

	/*
	 * If there was an error previously or if double buffering is
	 * enabled.
	 */
	if (error) {
		if (_dev_validate(dcp, data, &error)) {
			hammer_live_dedup_device_bcmps++;
			return (1);
		}
	}
#endif
	if (_dev_validate(dcp, data, &error)) {
		hammer_live_dedup_device_bcmps++;
		return (1);
	}

	return (0);
}

/*
 * Validate a dedup candidate against the copy cached in the vnode's
 * buffer cache, avoiding a media read when possible (currently only
 * used by the disabled #if 0 path above - see the MP-safety XXX).
 *
 * Returns 1 if the cached buffer matches 'data', 0 otherwise.
 * *errorp distinguishes the failure reasons:
 *	1 - inode lookup failed
 *	2 - vnode could not be acquired
 *	3 - no buffer found (non-blocking findblk() failed)
 *	4 - reserved (buffer mapped to userspace, see XXX below)
 *	5 - buffer not valid (no B_CACHE) or dirty
 *	6 - VOP_BMAP() failed
 *	7 - buffer is not backed by the expected media offset
 */
static __inline int
_vnode_validate(hammer_dedup_cache_t dcp, void *data, int *errorp)
{
	struct hammer_transaction trans;
	hammer_inode_t ip;
	struct vnode *vp;
	struct buf *bp;
	off_t dooffset;
	int result, error;

	result = error = 0;
	*errorp = 0;

	hammer_simple_transaction(&trans, dcp->hmp);

	ip = hammer_get_inode(&trans, NULL, dcp->obj_id, HAMMER_MAX_TID,
		dcp->localization, 0, &error);
	if (ip == NULL) {
		kprintf("dedup: unable to find objid %016jx:%08x\n",
			(intmax_t)dcp->obj_id, dcp->localization);
		*errorp = 1;
		goto failed2;
	}

	error = hammer_get_vnode(ip, &vp);
	if (error) {
		kprintf("dedup: unable to acquire vnode for %016jx:%08x\n",
			(intmax_t)dcp->obj_id, dcp->localization);
		*errorp = 2;
		goto failed;
	}

	if ((bp = findblk(ip->vp, dcp->file_offset, FINDBLK_NBLOCK)) != NULL) {
		bremfree(bp);

		/* XXX if (mapped to userspace) goto done, *errorp = 4 */

		if ((bp->b_flags & B_CACHE) == 0 || bp->b_flags & B_DIRTY) {
			*errorp = 5;
			goto done;
		}

		/*
		 * Confirm the buffer is actually backed by the media
		 * offset recorded in the cache entry; fall back to
		 * VOP_BMAP() when bio2 isn't translated yet.
		 */
		if (bp->b_bio2.bio_offset != dcp->data_offset) {
			error = VOP_BMAP(ip->vp, dcp->file_offset, &dooffset,
				NULL, NULL, BUF_CMD_READ);
			if (error) {
				*errorp = 6;
				goto done;
			}

			if (dooffset != dcp->data_offset) {
				*errorp = 7;
				goto done;
			}
			hammer_live_dedup_bmap_saves++;
		}

		if (bcmp(data, bp->b_data, dcp->bytes) == 0)
			result = 1;

done:
		bqrelse(bp);
	} else {
		*errorp = 3;
	}
	vput(vp);

failed:
	hammer_rel_inode(ip, 0);
failed2:
	hammer_done_transaction(&trans);
	return (result);
}

/*
 * Validate a dedup candidate against the on-media copy: read the
 * block through the block device (hammer_bread_ext()) and compare.
 *
 * Returns 1 if the media data matches 'data', 0 otherwise.  *errorp
 * is set to 1 if the media read itself failed.
 */
static __inline int
_dev_validate(hammer_dedup_cache_t dcp, void *data, int *errorp)
{
	hammer_buffer_t buffer = NULL;
	void *ondisk_data;
	int result, error;

	result = error = 0;
	*errorp = 0;

	ondisk_data = hammer_bread_ext(dcp->hmp, dcp->data_offset,
		dcp->bytes, &error, &buffer);
	if (error) {
		*errorp = 1;
		goto failed;
	}

	if (bcmp(data, ondisk_data, dcp->bytes) == 0)
		result = 1;

failed:
	if (buffer)
		hammer_rel_buffer(buffer, 0);

	return (result);
}