/*
 * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Ilya Dryomov <idryomov@gmail.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
33 */ 34 35 #include "hammer.h" 36 37 static __inline int validate_zone(hammer_off_t data_offset); 38 39 int 40 hammer_ioc_dedup(hammer_transaction_t trans, hammer_inode_t ip, 41 struct hammer_ioc_dedup *dedup) 42 { 43 struct hammer_cursor cursor1, cursor2; 44 int error; 45 int seq; 46 47 /* 48 * Enforce hammer filesystem version requirements 49 */ 50 if (trans->hmp->version < HAMMER_VOL_VERSION_FIVE) { 51 hkprintf("Filesystem must be upgraded to v5 " 52 "before you can run dedup\n"); 53 return (EOPNOTSUPP); 54 } 55 56 /* 57 * Cursor1, return an error -> candidate goes to pass2 list 58 */ 59 error = hammer_init_cursor(trans, &cursor1, NULL, NULL); 60 if (error) 61 goto done_cursor; 62 cursor1.key_beg = dedup->elm1; 63 cursor1.flags |= HAMMER_CURSOR_BACKEND; 64 65 error = hammer_btree_lookup(&cursor1); 66 if (error) 67 goto done_cursor; 68 error = hammer_btree_extract(&cursor1, HAMMER_CURSOR_GET_LEAF | 69 HAMMER_CURSOR_GET_DATA); 70 if (error) 71 goto done_cursor; 72 73 /* 74 * Cursor2, return an error -> candidate goes to pass2 list 75 */ 76 error = hammer_init_cursor(trans, &cursor2, NULL, NULL); 77 if (error) 78 goto done_cursors; 79 cursor2.key_beg = dedup->elm2; 80 cursor2.flags |= HAMMER_CURSOR_BACKEND; 81 82 error = hammer_btree_lookup(&cursor2); 83 if (error) 84 goto done_cursors; 85 error = hammer_btree_extract(&cursor2, HAMMER_CURSOR_GET_LEAF | 86 HAMMER_CURSOR_GET_DATA); 87 if (error) 88 goto done_cursors; 89 90 /* 91 * Zone validation. We can't de-dup any of the other zones 92 * (BTREE or META) or bad things will happen. 93 * 94 * Return with error = 0, but set an INVALID_ZONE flag. 95 */ 96 error = validate_zone(cursor1.leaf->data_offset) + 97 validate_zone(cursor2.leaf->data_offset); 98 if (error) { 99 dedup->head.flags |= HAMMER_IOC_DEDUP_INVALID_ZONE; 100 error = 0; 101 goto done_cursors; 102 } 103 104 /* 105 * Comparison checks 106 * 107 * If zones don't match or data_len fields aren't the same 108 * we consider it to be a comparison failure. 
109 * 110 * Return with error = 0, but set a CMP_FAILURE flag. 111 */ 112 if ((cursor1.leaf->data_offset & HAMMER_OFF_ZONE_MASK) != 113 (cursor2.leaf->data_offset & HAMMER_OFF_ZONE_MASK)) { 114 dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE; 115 goto done_cursors; 116 } 117 if (cursor1.leaf->data_len != cursor2.leaf->data_len) { 118 dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE; 119 goto done_cursors; 120 } 121 122 /* byte-by-byte comparison to be sure */ 123 if (bcmp(cursor1.data, cursor2.data, cursor1.leaf->data_len)) { 124 dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE; 125 goto done_cursors; 126 } 127 128 /* 129 * Upgrade both cursors together to an exclusive lock 130 * 131 * Return an error -> candidate goes to pass2 list 132 */ 133 hammer_sync_lock_sh(trans); 134 error = hammer_cursor_upgrade2(&cursor1, &cursor2); 135 if (error) { 136 hammer_sync_unlock(trans); 137 goto done_cursors; 138 } 139 140 error = hammer_blockmap_dedup(cursor1.trans, 141 cursor1.leaf->data_offset, cursor1.leaf->data_len); 142 if (error) { 143 if (error == ERANGE) { 144 /* Return with error = 0, but set an UNDERFLOW flag */ 145 dedup->head.flags |= HAMMER_IOC_DEDUP_UNDERFLOW; 146 error = 0; 147 } 148 149 /* Return all other errors -> block goes to pass2 list */ 150 goto downgrade_cursors; 151 } 152 153 /* 154 * The cursor2's cache must be invalidated before calling 155 * hammer_blockmap_free(), otherwise it will not be able to 156 * invalidate the underlying data buffer. 
157 */ 158 hammer_cursor_invalidate_cache(&cursor2); 159 hammer_blockmap_free(cursor2.trans, 160 cursor2.leaf->data_offset, cursor2.leaf->data_len); 161 162 hammer_modify_node(cursor2.trans, cursor2.node, 163 &cursor2.leaf->data_offset, sizeof(hammer_off_t)); 164 cursor2.leaf->data_offset = cursor1.leaf->data_offset; 165 hammer_modify_node_done(cursor2.node); 166 167 downgrade_cursors: 168 hammer_cursor_downgrade2(&cursor1, &cursor2); 169 hammer_sync_unlock(trans); 170 done_cursors: 171 hammer_done_cursor(&cursor2); 172 done_cursor: 173 hammer_done_cursor(&cursor1); 174 175 /* 176 * Avoid deadlocking the buffer cache 177 */ 178 seq = trans->hmp->flusher.done; 179 while (hammer_flusher_meta_halflimit(trans->hmp) || 180 hammer_flusher_undo_exhausted(trans, 2)) { 181 hammer_flusher_wait(trans->hmp, seq); 182 seq = hammer_flusher_async_one(trans->hmp); 183 } 184 return (error); 185 } 186 187 static __inline int 188 validate_zone(hammer_off_t data_offset) 189 { 190 switch(data_offset & HAMMER_OFF_ZONE_MASK) { 191 case HAMMER_ZONE_LARGE_DATA: 192 case HAMMER_ZONE_SMALL_DATA: 193 return (0); 194 default: 195 return (1); 196 } 197 } 198 199 /************************************************************************ 200 * LIVE DEDUP * 201 ************************************************************************ 202 * 203 * HAMMER Live Dedup (aka as efficient cp(1) implementation) 204 * 205 * The dedup cache is operated in a LRU fashion and destroyed on 206 * unmount, so essentially this is a live dedup on a cached dataset and 207 * not a full-fledged fs-wide one - we have a batched dedup for that. 208 * We allow duplicate entries in the buffer cache, data blocks are 209 * deduplicated only on their way to media. By default the cache is 210 * populated on reads only, but it can be populated on writes too. 211 * 212 * The main implementation gotcha is on-media requirement - in order for 213 * a data block to be added to a dedup cache it has to be present on 214 * disk. 
This simplifies cache logic a lot - once data is laid out on 215 * media it remains valid on media all the way up to the point where the 216 * related big-block the data was stored in is freed - so there is only 217 * one place we need to bother with invalidation code. 218 */ 219 220 /* 221 * RB-Tree support for dedup cache structures 222 */ 223 RB_GENERATE2(hammer_dedup_crc_rb_tree, hammer_dedup_cache, crc_entry, 224 hammer_dedup_crc_rb_compare, hammer_crc_t, crc); 225 RB_GENERATE2(hammer_dedup_off_rb_tree, hammer_dedup_cache, off_entry, 226 hammer_dedup_off_rb_compare, hammer_off_t, data_offset); 227 228 struct hammer_dedup_inval { 229 struct hammer_mount *hmp; 230 hammer_off_t base_offset; 231 }; 232 233 static int hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data); 234 static int hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc, 235 void *data); 236 static __inline int _vnode_validate(hammer_dedup_cache_t dcp, void *data, 237 int *errorp); 238 static __inline int _dev_validate(hammer_dedup_cache_t dcp, void *data, 239 int *errorp); 240 241 int 242 hammer_dedup_crc_rb_compare(hammer_dedup_cache_t dc1, hammer_dedup_cache_t dc2) 243 { 244 if (dc1->crc < dc2->crc) 245 return (-1); 246 if (dc1->crc > dc2->crc) 247 return (1); 248 249 return (0); 250 } 251 252 int 253 hammer_dedup_off_rb_compare(hammer_dedup_cache_t dc1, hammer_dedup_cache_t dc2) 254 { 255 if (dc1->data_offset < dc2->data_offset) 256 return (-1); 257 if (dc1->data_offset > dc2->data_offset) 258 return (1); 259 260 return (0); 261 } 262 263 static int 264 hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data) 265 { 266 hammer_off_t off = ((struct hammer_dedup_inval *)data)->base_offset; 267 268 if (dc->data_offset < off) 269 return (-1); 270 if (dc->data_offset >= off + HAMMER_BIGBLOCK_SIZE) 271 return (1); 272 273 return (0); 274 } 275 276 static int 277 hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc, void *data) 278 { 279 hammer_mount_t hmp = ((struct hammer_dedup_inval 
*)data)->hmp; 280 281 RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dc); 282 RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dc); 283 TAILQ_REMOVE(&hmp->dedup_lru_list, dc, lru_entry); 284 285 --hmp->dedup_cache_count; 286 kfree(dc, hmp->m_misc); 287 288 return (0); 289 } 290 291 hammer_dedup_cache_t 292 hammer_dedup_cache_add(hammer_inode_t ip, hammer_btree_leaf_elm_t leaf) 293 { 294 hammer_dedup_cache_t dcp, tmp; 295 hammer_mount_t hmp = ip->hmp; 296 297 if (hmp->dedup_free_cache == NULL) { 298 tmp = kmalloc(sizeof(*tmp), hmp->m_misc, M_WAITOK | M_ZERO); 299 if (hmp->dedup_free_cache == NULL) 300 hmp->dedup_free_cache = tmp; 301 else 302 kfree(tmp, hmp->m_misc); 303 } 304 305 KKASSERT(leaf != NULL); 306 307 dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree, 308 &hmp->rb_dedup_crc_root, leaf->data_crc); 309 if (dcp != NULL) { 310 RB_REMOVE(hammer_dedup_off_rb_tree, 311 &hmp->rb_dedup_off_root, dcp); 312 TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry); 313 goto populate; 314 } 315 316 if (hmp->dedup_cache_count < hammer_live_dedup_cache_size) { 317 dcp = hmp->dedup_free_cache; 318 hmp->dedup_free_cache = NULL; 319 ++hmp->dedup_cache_count; 320 } else { 321 dcp = TAILQ_FIRST(&hmp->dedup_lru_list); 322 RB_REMOVE(hammer_dedup_crc_rb_tree, 323 &hmp->rb_dedup_crc_root, dcp); 324 RB_REMOVE(hammer_dedup_off_rb_tree, 325 &hmp->rb_dedup_off_root, dcp); 326 TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry); 327 } 328 329 dcp->crc = leaf->data_crc; 330 tmp = RB_INSERT(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp); 331 KKASSERT(tmp == NULL); 332 333 populate: 334 dcp->hmp = ip->hmp; 335 dcp->obj_id = ip->obj_id; 336 dcp->localization = ip->obj_localization; 337 dcp->file_offset = leaf->base.key - leaf->data_len; 338 dcp->bytes = leaf->data_len; 339 dcp->data_offset = leaf->data_offset; 340 341 tmp = RB_INSERT(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp); 342 KKASSERT(tmp == NULL); 343 TAILQ_INSERT_TAIL(&hmp->dedup_lru_list, dcp, 
lru_entry); 344 345 return (dcp); 346 } 347 348 __inline hammer_dedup_cache_t 349 hammer_dedup_cache_lookup(hammer_mount_t hmp, hammer_crc_t crc) 350 { 351 hammer_dedup_cache_t dcp; 352 353 dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree, 354 &hmp->rb_dedup_crc_root, crc); 355 return dcp; 356 } 357 358 void hammer_dedup_cache_inval(hammer_mount_t hmp, hammer_off_t base_offset) 359 { 360 struct hammer_dedup_inval di; 361 362 di.hmp = hmp; 363 di.base_offset = base_offset; 364 365 RB_SCAN(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, 366 hammer_dedup_scancmp, hammer_dedup_cache_inval_callback, &di); 367 } 368 369 void 370 hammer_destroy_dedup_cache(hammer_mount_t hmp) 371 { 372 hammer_dedup_cache_t dcp; 373 374 while ((dcp = TAILQ_FIRST(&hmp->dedup_lru_list)) != NULL) { 375 RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp); 376 RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp); 377 TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry); 378 --hmp->dedup_cache_count; 379 kfree(dcp, hmp->m_misc); 380 } 381 382 KKASSERT(RB_EMPTY(&hmp->rb_dedup_crc_root)); 383 KKASSERT(RB_EMPTY(&hmp->rb_dedup_off_root)); 384 KKASSERT(TAILQ_EMPTY(&hmp->dedup_lru_list)); 385 386 KKASSERT(hmp->dedup_cache_count == 0); 387 } 388 389 int 390 hammer_dedup_validate(hammer_dedup_cache_t dcp, int zone, int bytes, void *data) 391 { 392 int error; 393 394 /* 395 * Zone validation 396 */ 397 if (HAMMER_ZONE_DECODE(dcp->data_offset) != zone) 398 return (0); 399 400 /* 401 * Block length validation 402 */ 403 if (dcp->bytes != bytes) 404 return (0); 405 406 /* 407 * Byte-by-byte data comparison 408 * 409 * The data we need for validation may already be present in the 410 * buffer cache in two flavours: vnode-based buffer or 411 * block-device-based buffer. In case vnode-based buffer wasn't 412 * there or if a non-blocking attempt to acquire it failed use 413 * device-based buffer (for large-zone data blocks it will 414 * generate a separate read). 
415 * 416 * XXX vnode-based checking is not MP safe so when live-dedup 417 * is turned on we must always use the device buffer. 418 */ 419 #if 0 420 if (hammer_double_buffer) { 421 error = 1; 422 } else if (_vnode_validate(dcp, data, &error)) { 423 hammer_live_dedup_vnode_bcmps++; 424 return (1); 425 } else { 426 if (error == 3) 427 hammer_live_dedup_findblk_failures++; 428 } 429 430 /* 431 * If there was an error previously or if double buffering is 432 * enabled. 433 */ 434 if (error) { 435 if (_dev_validate(dcp, data, &error)) { 436 hammer_live_dedup_device_bcmps++; 437 return (1); 438 } 439 } 440 #endif 441 if (_dev_validate(dcp, data, &error)) { 442 hammer_live_dedup_device_bcmps++; 443 return (1); 444 } 445 446 return (0); 447 } 448 449 static __inline int 450 _vnode_validate(hammer_dedup_cache_t dcp, void *data, int *errorp) 451 { 452 struct hammer_transaction trans; 453 hammer_inode_t ip; 454 struct vnode *vp; 455 struct buf *bp; 456 off_t dooffset; 457 int result, error; 458 459 result = error = 0; 460 *errorp = 0; 461 462 hammer_simple_transaction(&trans, dcp->hmp); 463 464 ip = hammer_get_inode(&trans, NULL, dcp->obj_id, HAMMER_MAX_TID, 465 dcp->localization, 0, &error); 466 if (ip == NULL) { 467 hkprintf("dedup: unable to find objid %016jx:%08x\n", 468 (intmax_t)dcp->obj_id, dcp->localization); 469 *errorp = 1; 470 goto failed2; 471 } 472 473 error = hammer_get_vnode(ip, &vp); 474 if (error) { 475 hkprintf("dedup: unable to acquire vnode for %016jx:%08x\n", 476 (intmax_t)dcp->obj_id, dcp->localization); 477 *errorp = 2; 478 goto failed; 479 } 480 481 if ((bp = findblk(ip->vp, dcp->file_offset, FINDBLK_NBLOCK)) != NULL) { 482 bremfree(bp); 483 484 /* XXX if (mapped to userspace) goto done, *errorp = 4 */ 485 486 if ((bp->b_flags & B_CACHE) == 0 || bp->b_flags & B_DIRTY) { 487 *errorp = 5; 488 goto done; 489 } 490 491 if (bp->b_bio2.bio_offset != dcp->data_offset) { 492 error = VOP_BMAP(ip->vp, dcp->file_offset, &dooffset, 493 NULL, NULL, BUF_CMD_READ); 
494 if (error) { 495 *errorp = 6; 496 goto done; 497 } 498 499 if (dooffset != dcp->data_offset) { 500 *errorp = 7; 501 goto done; 502 } 503 hammer_live_dedup_bmap_saves++; 504 } 505 506 if (bcmp(data, bp->b_data, dcp->bytes) == 0) 507 result = 1; 508 509 done: 510 bqrelse(bp); 511 } else { 512 *errorp = 3; 513 } 514 vput(vp); 515 516 failed: 517 hammer_rel_inode(ip, 0); 518 failed2: 519 hammer_done_transaction(&trans); 520 return (result); 521 } 522 523 static __inline int 524 _dev_validate(hammer_dedup_cache_t dcp, void *data, int *errorp) 525 { 526 hammer_buffer_t buffer = NULL; 527 void *ondisk_data; 528 int result, error; 529 530 result = error = 0; 531 *errorp = 0; 532 533 ondisk_data = hammer_bread_ext(dcp->hmp, dcp->data_offset, 534 dcp->bytes, &error, &buffer); 535 if (error) { 536 *errorp = 1; 537 goto failed; 538 } 539 540 if (bcmp(data, ondisk_data, dcp->bytes) == 0) 541 result = 1; 542 543 failed: 544 if (buffer) 545 hammer_rel_buffer(buffer, 0); 546 547 return (result); 548 } 549