/*
 * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Ilya Dryomov <idryomov@gmail.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "hammer.h"

static __inline int validate_zone(hammer_off_t data_offset);

/*
 * hammer_ioc_dedup() - backend for the batched dedup ioctl.
 *
 * The caller hands us a dedup candidate: two B-Tree elements
 * (dedup->elm1 and dedup->elm2).  If both elements reference
 * byte-identical data in a dedup-capable zone, elm2's data reference
 * is dropped (hammer_blockmap_free) and its leaf is rewired to point
 * at elm1's data, whose blockmap reference count is bumped via
 * hammer_blockmap_dedup().
 *
 * Returns 0 on success, possibly with a status flag (INVALID_ZONE,
 * CMP_FAILURE or UNDERFLOW) set in dedup->head.flags.  A non-zero
 * errno return means the candidate should be retried later (the
 * caller's "pass2 list").
 */
int
hammer_ioc_dedup(hammer_transaction_t trans, hammer_inode_t ip,
		 struct hammer_ioc_dedup *dedup)
{
	struct hammer_cursor cursor1, cursor2;
	int error;

	/*
	 * Enforce hammer filesystem version requirements
	 */
	if (trans->hmp->version < HAMMER_VOL_VERSION_FIVE) {
		kprintf("hammer: Filesystem must be upgraded to v5 "
			"before you can run dedup\n");
		return (EOPNOTSUPP);
	}

	/*
	 * Cursor1, return an error -> candidate goes to pass2 list
	 */
	error = hammer_init_cursor(trans, &cursor1, NULL, NULL);
	if (error)
		goto done_cursor;
	cursor1.key_beg = dedup->elm1;
	cursor1.flags |= HAMMER_CURSOR_BACKEND;

	error = hammer_btree_lookup(&cursor1);
	if (error)
		goto done_cursor;
	/* Need both the leaf element and its data for the checks below */
	error = hammer_btree_extract(&cursor1, HAMMER_CURSOR_GET_LEAF |
						HAMMER_CURSOR_GET_DATA);
	if (error)
		goto done_cursor;

	/*
	 * Cursor2, return an error -> candidate goes to pass2 list
	 */
	error = hammer_init_cursor(trans, &cursor2, NULL, NULL);
	if (error)
		goto done_cursors;
	cursor2.key_beg = dedup->elm2;
	cursor2.flags |= HAMMER_CURSOR_BACKEND;

	error = hammer_btree_lookup(&cursor2);
	if (error)
		goto done_cursors;
	error = hammer_btree_extract(&cursor2, HAMMER_CURSOR_GET_LEAF |
						HAMMER_CURSOR_GET_DATA);
	if (error)
		goto done_cursors;

	/*
	 * Zone validation.  We can't de-dup any of the other zones
	 * (BTREE or META) or bad things will happen.
	 *
	 * Return with error = 0, but set an INVALID_ZONE flag.
	 */
	error = validate_zone(cursor1.leaf->data_offset) +
		    validate_zone(cursor2.leaf->data_offset);
	if (error) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_INVALID_ZONE;
		error = 0;
		goto done_cursors;
	}

	/*
	 * Comparison checks
	 *
	 * If zones don't match or data_len fields aren't the same
	 * we consider it to be a comparison failure.
	 *
	 * Return with error = 0, but set a CMP_FAILURE flag.
	 */
	if ((cursor1.leaf->data_offset & HAMMER_OFF_ZONE_MASK) !=
	    (cursor2.leaf->data_offset & HAMMER_OFF_ZONE_MASK)) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
		goto done_cursors;
	}
	if (cursor1.leaf->data_len != cursor2.leaf->data_len) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
		goto done_cursors;
	}

	/* byte-by-byte comparison to be sure */
	if (bcmp(cursor1.data, cursor2.data, cursor1.leaf->data_len)) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
		goto done_cursors;
	}

	/*
	 * Upgrade both cursors together to an exclusive lock
	 *
	 * Return an error -> candidate goes to pass2 list
	 */
	hammer_sync_lock_sh(trans);
	error = hammer_cursor_upgrade2(&cursor1, &cursor2);
	if (error) {
		hammer_sync_unlock(trans);
		goto done_cursors;
	}

	/* Bump the blockmap reference on the surviving (elm1) data */
	error = hammer_blockmap_dedup(cursor1.trans,
			cursor1.leaf->data_offset, cursor1.leaf->data_len);
	if (error) {
		if (error == ERANGE) {
			/*
			 * Return with error = 0, but set an UNDERFLOW flag
			 */
			dedup->head.flags |= HAMMER_IOC_DEDUP_UNDERFLOW;
			error = 0;
			goto downgrade_cursors;
		} else {
			/*
			 * Return an error -> block goes to pass2 list
			 */
			goto downgrade_cursors;
		}
	}

	/*
	 * The cursor2's cache must be invalidated before calling
	 * hammer_blockmap_free(), otherwise it will not be able to
	 * invalidate the underlying data buffer.
	 */
	hammer_cursor_invalidate_cache(&cursor2);
	hammer_blockmap_free(cursor2.trans,
			cursor2.leaf->data_offset, cursor2.leaf->data_len);

	/* Rewire elm2's leaf to reference elm1's (identical) data */
	hammer_modify_node(cursor2.trans, cursor2.node,
			&cursor2.leaf->data_offset, sizeof(hammer_off_t));
	cursor2.leaf->data_offset = cursor1.leaf->data_offset;
	hammer_modify_node_done(cursor2.node);

downgrade_cursors:
	hammer_cursor_downgrade2(&cursor1, &cursor2);
	hammer_sync_unlock(trans);
done_cursors:
	hammer_done_cursor(&cursor2);
done_cursor:
	hammer_done_cursor(&cursor1);
	return (error);
}

/*
 * Return 0 if data_offset lies in a zone that may be deduplicated
 * (small-data or large-data), 1 for any other zone.
 */
static __inline int
validate_zone(hammer_off_t data_offset)
{
	switch(data_offset & HAMMER_OFF_ZONE_MASK) {
	case HAMMER_ZONE_LARGE_DATA:
	case HAMMER_ZONE_SMALL_DATA:
		return (0);
	default:
		return (1);
	}
}

/************************************************************************
 *				LIVE DEDUP 				*
 ************************************************************************
 *
 * HAMMER Live Dedup (aka as efficient cp(1) implementation)
 *
 * The dedup cache is operated in a LRU fashion and destroyed on
 * unmount, so essentially this is a live dedup on a cached dataset and
 * not a full-fledged fs-wide one - we have a batched dedup for that.
 * We allow duplicate entries in the buffer cache, data blocks are
 * deduplicated only on their way to media.  By default the cache is
 * populated on reads only, but it can be populated on writes too.
 *
 * The main implementation gotcha is on-media requirement - in order for
 * a data block to be added to a dedup cache it has to be present on
 * disk.  This simplifies cache logic a lot - once data is laid out on
 * media it remains valid on media all the way up to the point where the
 * related big block the data was stored in is freed - so there is only
 * one place we need to bother with invalidation code.
 */

/*
 * RB-Tree support for dedup cache structures
 *
 * Entries are indexed two ways: by data CRC (for lookup on write-out)
 * and by on-media data offset (for invalidation when a big block is
 * freed).
 */
RB_GENERATE2(hammer_dedup_crc_rb_tree, hammer_dedup_cache, crc_entry,
		hammer_dedup_crc_rb_compare, hammer_crc_t, crc);
RB_GENERATE2(hammer_dedup_off_rb_tree, hammer_dedup_cache, off_entry,
		hammer_dedup_off_rb_compare, hammer_off_t, data_offset);

/* Argument bundle passed through RB_SCAN to the invalidation callback */
struct hammer_dedup_inval {
	struct hammer_mount *hmp;
	hammer_off_t base_offset;
};

static int hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data);
static int hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc,
		void *data);
static __inline int _vnode_validate(hammer_dedup_cache_t dcp, void *data,
		int *errorp);
static __inline int _dev_validate(hammer_dedup_cache_t dcp, void *data,
		int *errorp);

/*
 * Standard three-way comparator on the data CRC key.
 */
int
hammer_dedup_crc_rb_compare(hammer_dedup_cache_t dc1,
				hammer_dedup_cache_t dc2)
{
	if (dc1->crc < dc2->crc)
		return (-1);
	if (dc1->crc > dc2->crc)
		return (1);

	return (0);
}

/*
 * Standard three-way comparator on the on-media data offset key.
 */
int
hammer_dedup_off_rb_compare(hammer_dedup_cache_t dc1,
				hammer_dedup_cache_t dc2)
{
	if (dc1->data_offset < dc2->data_offset)
		return (-1);
	if (dc1->data_offset > dc2->data_offset)
		return (1);

	return (0);
}

/*
 * RB_SCAN comparator: match any cache entry whose data_offset falls
 * inside the large-block starting at base_offset (the block being
 * invalidated).
 */
static int
hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data)
{
	hammer_off_t off = ((struct hammer_dedup_inval *)data)->base_offset;

	if (dc->data_offset < off)
		return (-1);
	if (dc->data_offset >= off + HAMMER_LARGEBLOCK_SIZE)
		return (1);

	return (0);
}

/*
 * RB_SCAN callback: unlink the matched entry from both RB trees and
 * the LRU list, then free it.
 */
static int
hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc, void *data)
{
	hammer_mount_t hmp = ((struct hammer_dedup_inval *)data)->hmp;

	RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dc);
	RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dc);
	TAILQ_REMOVE(&hmp->dedup_lru_list, dc, lru_entry);

	--hmp->dedup_cache_count;
	kfree(dc, hmp->m_misc);

	return (0);
}

/*
 * Add (or refresh) a dedup cache entry for the data block described by
 * the B-Tree leaf element.
 *
 * If the CRC is already cached, that entry is recycled in place
 * (re-keyed by offset and moved to the LRU tail).  Otherwise a new
 * entry is taken from the preallocated spare, or - once the cache is
 * at hammer_live_dedup_cache_size - the LRU head is evicted and
 * reused.  Returns the entry that now describes the block.
 */
hammer_dedup_cache_t
hammer_dedup_cache_add(hammer_inode_t ip, hammer_btree_leaf_elm_t leaf)
{
	hammer_dedup_cache_t dcp, tmp;
	hammer_mount_t hmp = ip->hmp;

	/*
	 * Replenish the spare entry.  The second NULL check guards
	 * against another thread having installed a spare while we
	 * were blocked in kmalloc(); if so, free ours.
	 */
	if (hmp->dedup_free_cache == NULL) {
		tmp = kmalloc(sizeof(*tmp), hmp->m_misc, M_WAITOK | M_ZERO);
		if (hmp->dedup_free_cache == NULL)
			hmp->dedup_free_cache = tmp;
		else
			kfree(tmp, hmp->m_misc);
	}

	KKASSERT(leaf != NULL);

	/* Already cached by CRC?  Recycle that entry in place. */
	dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree,
		&hmp->rb_dedup_crc_root, leaf->data_crc);
	if (dcp != NULL) {
		RB_REMOVE(hammer_dedup_off_rb_tree,
			&hmp->rb_dedup_off_root, dcp);
		TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
		goto populate;
	}

	/* Take the spare if under the size limit, else evict LRU head */
	if (hmp->dedup_cache_count < hammer_live_dedup_cache_size) {
		dcp = hmp->dedup_free_cache;
		hmp->dedup_free_cache = NULL;
		++hmp->dedup_cache_count;
	} else {
		dcp = TAILQ_FIRST(&hmp->dedup_lru_list);
		RB_REMOVE(hammer_dedup_crc_rb_tree,
			&hmp->rb_dedup_crc_root, dcp);
		RB_REMOVE(hammer_dedup_off_rb_tree,
			&hmp->rb_dedup_off_root, dcp);
		TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
	}

	dcp->crc = leaf->data_crc;
	tmp = RB_INSERT(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp);
	KKASSERT(tmp == NULL);

populate:
	dcp->hmp = ip->hmp;
	dcp->obj_id = ip->obj_id;
	dcp->localization = ip->obj_localization;
	dcp->file_offset = leaf->base.key - leaf->data_len;
	dcp->bytes = leaf->data_len;
	dcp->data_offset = leaf->data_offset;

	tmp = RB_INSERT(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp);
	KKASSERT(tmp == NULL);
	TAILQ_INSERT_TAIL(&hmp->dedup_lru_list, dcp, lru_entry);

	return (dcp);
}

/*
 * Look up a dedup cache entry by data CRC.  Returns NULL on miss.
 */
__inline hammer_dedup_cache_t
hammer_dedup_cache_lookup(hammer_mount_t hmp, hammer_crc_t crc)
{
	hammer_dedup_cache_t dcp;

	dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree,
		&hmp->rb_dedup_crc_root, crc);
	return dcp;
}

/*
 * Invalidate every dedup cache entry whose data lives in the
 * large-block starting at base_offset.  Called when that big block is
 * freed - the only point at which cached on-media data stops being
 * valid (see the LIVE DEDUP notes above).
 */
void hammer_dedup_cache_inval(hammer_mount_t hmp, hammer_off_t base_offset)
{
	struct hammer_dedup_inval di;

	di.hmp = hmp;
	di.base_offset = base_offset;

	RB_SCAN(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root,
		hammer_dedup_scancmp, hammer_dedup_cache_inval_callback, &di);
}

/*
 * Tear down the entire dedup cache (the cache is destroyed on
 * unmount).  Walks the LRU list, unlinking and freeing every entry,
 * then asserts all indexes and the count are empty.
 */
void
hammer_destroy_dedup_cache(hammer_mount_t hmp)
{
	hammer_dedup_cache_t dcp;

	while ((dcp = TAILQ_FIRST(&hmp->dedup_lru_list)) != NULL) {
		RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp);
		RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp);
		TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
		--hmp->dedup_cache_count;
		kfree(dcp, hmp->m_misc);
	}

	KKASSERT(RB_EMPTY(&hmp->rb_dedup_crc_root));
	KKASSERT(RB_EMPTY(&hmp->rb_dedup_off_root));
	KKASSERT(TAILQ_EMPTY(&hmp->dedup_lru_list));

	KKASSERT(hmp->dedup_cache_count == 0);
}

/*
 * Validate that the cached entry dcp really describes a block
 * identical to the caller's candidate (zone, bytes, data).  A CRC
 * match alone is not enough - finish with a byte-by-byte compare.
 *
 * Returns 1 if the data matches, 0 otherwise.
 */
int
hammer_dedup_validate(hammer_dedup_cache_t dcp, int zone, int bytes,
		void *data)
{
	int error;

	/*
	 * Zone validation
	 */
	if (HAMMER_ZONE_DECODE(dcp->data_offset) != zone)
		return (0);

	/*
	 * Block length validation
	 */
	if (dcp->bytes != bytes)
		return (0);

	/*
	 * Byte-by-byte data comparison
	 *
	 * The data we need for validation may already be present in the
	 * buffer cache in two flavours: vnode-based buffer or
	 * block-device-based buffer.  In case vnode-based buffer wasn't
	 * there or if a non-blocking attempt to acquire it failed use
	 * device-based buffer (for large-zone data blocks it will
	 * generate a separate read).
	 *
	 * XXX vnode-based checking is not MP safe so when live-dedup
	 * is turned on we must always use the device buffer.
	 */
#if 0
	/* Disabled vnode-buffer fast path (not MP safe, see XXX above) */
	if (hammer_double_buffer) {
		error = 1;
	} else if (_vnode_validate(dcp, data, &error)) {
		hammer_live_dedup_vnode_bcmps++;
		return (1);
	} else {
		if (error == 3)
			hammer_live_dedup_findblk_failures++;
	}

	/*
	 * If there was an error previously or if double buffering is
	 * enabled.
	 */
	if (error) {
		if (_dev_validate(dcp, data, &error)) {
			hammer_live_dedup_device_bcmps++;
			return (1);
		}
	}
#endif
	if (_dev_validate(dcp, data, &error)) {
		hammer_live_dedup_device_bcmps++;
		return (1);
	}

	return (0);
}

/*
 * Compare the candidate data against the vnode-based buffer cache
 * copy of the cached block.  Returns 1 on a byte-for-byte match, 0
 * otherwise.  *errorp reports why validation could not be performed:
 *   1 - inode lookup failed		2 - vnode acquisition failed
 *   3 - buffer not in cache (findblk)	5 - buffer not B_CACHE / dirty
 *   6 - VOP_BMAP failed		7 - bmap offset mismatch
 * (Currently only referenced from the disabled #if 0 path above.)
 */
static __inline int
_vnode_validate(hammer_dedup_cache_t dcp, void *data, int *errorp)
{
	struct hammer_transaction trans;
	hammer_inode_t ip;
	struct vnode *vp;
	struct buf *bp;
	off_t dooffset;
	int result, error;

	result = error = 0;
	*errorp = 0;

	hammer_simple_transaction(&trans, dcp->hmp);

	ip = hammer_get_inode(&trans, NULL, dcp->obj_id, HAMMER_MAX_TID,
		dcp->localization, 0, &error);
	if (ip == NULL) {
		kprintf("dedup: unable to find objid %016jx:%08x\n",
			(intmax_t)dcp->obj_id, dcp->localization);
		*errorp = 1;
		goto failed2;
	}

	error = hammer_get_vnode(ip, &vp);
	if (error) {
		kprintf("dedup: unable to acquire vnode for %016jx:%08x\n",
			(intmax_t)dcp->obj_id, dcp->localization);
		*errorp = 2;
		goto failed;
	}

	/* Non-blocking buffer lookup; fall through with *errorp = 3 on miss */
	if ((bp = findblk(ip->vp, dcp->file_offset, FINDBLK_NBLOCK)) != NULL) {
		bremfree(bp);

		/* XXX if (mapped to userspace) goto done, *errorp = 4 */

		if ((bp->b_flags & B_CACHE) == 0 || bp->b_flags & B_DIRTY) {
			*errorp = 5;
			goto done;
		}

		/*
		 * Confirm the buffer really maps to the cached media
		 * offset; consult VOP_BMAP if bio2 isn't translated yet.
		 */
		if (bp->b_bio2.bio_offset != dcp->data_offset) {
			error = VOP_BMAP(ip->vp, dcp->file_offset, &dooffset,
				NULL, NULL, BUF_CMD_READ);
			if (error) {
				*errorp = 6;
				goto done;
			}

			if (dooffset != dcp->data_offset) {
				*errorp = 7;
				goto done;
			}
			hammer_live_dedup_bmap_saves++;
		}

		if (bcmp(data, bp->b_data, dcp->bytes) == 0)
			result = 1;

done:
		bqrelse(bp);
	} else {
		*errorp = 3;
	}
	vput(vp);

failed:
	hammer_rel_inode(ip, 0);
failed2:
	hammer_done_transaction(&trans);
	return (result);
}

/*
 * Compare the candidate data against the on-media copy of the cached
 * block, read through the block-device buffer (hammer_bread_ext).
 * Returns 1 on a byte-for-byte match, 0 otherwise; *errorp is set to
 * 1 if the media read failed.
 */
static __inline int
_dev_validate(hammer_dedup_cache_t dcp, void *data, int *errorp)
{
	hammer_buffer_t buffer = NULL;
	void *ondisk_data;
	int result, error;

	result = error = 0;
	*errorp = 0;

	ondisk_data = hammer_bread_ext(dcp->hmp, dcp->data_offset,
		dcp->bytes, &error, &buffer);
	if (error) {
		*errorp = 1;
		goto failed;
	}

	if (bcmp(data, ondisk_data, dcp->bytes) == 0)
		result = 1;

failed:
	if (buffer)
		hammer_rel_buffer(buffer, 0);

	return (result);
}