1 /* 2 * Copyright (c) 2010 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Ilya Dryomov <idryomov@gmail.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
 */

#include "hammer.h"

static __inline int validate_zone(hammer_off_t data_offset);

/*
 * hammer_ioc_dedup() - backend for the batched dedup ioctl.
 *
 * Attempt to de-duplicate the two on-media records addressed by
 * dedup->elm1 and dedup->elm2.  On success the second record's data
 * reference is repointed at the first record's data block and the
 * second data block is freed.
 *
 * Soft failures (invalid zone, comparison failure, reference-count
 * underflow) return 0 with a status flag set in dedup->head.flags so
 * the caller can move the candidate to its pass2 list; hard failures
 * are returned as errno values.
 */
int
hammer_ioc_dedup(hammer_transaction_t trans, hammer_inode_t ip,
		 struct hammer_ioc_dedup *dedup)
{
	struct hammer_cursor cursor1, cursor2;
	int error;
	int seq;

	/*
	 * Enforce hammer filesystem version requirements
	 */
	if (trans->hmp->version < HAMMER_VOL_VERSION_FIVE) {
		hkprintf("Filesystem must be upgraded to v5 "
			"before you can run dedup\n");
		return (EOPNOTSUPP);
	}

	/*
	 * Cursor1, return an error -> candidate goes to pass2 list.
	 *
	 * Look up the first element and extract its data so it can be
	 * byte-compared against the second element below.
	 */
	error = hammer_init_cursor(trans, &cursor1, NULL, NULL);
	if (error)
		goto done_cursor;
	cursor1.key_beg = dedup->elm1;
	cursor1.flags |= HAMMER_CURSOR_BACKEND;

	error = hammer_btree_lookup(&cursor1);
	if (error)
		goto done_cursor;
	error = hammer_btree_extract_data(&cursor1);
	if (error)
		goto done_cursor;

	/*
	 * Cursor2, return an error -> candidate goes to pass2 list
	 */
	error = hammer_init_cursor(trans, &cursor2, NULL, NULL);
	if (error)
		goto done_cursors;
	cursor2.key_beg = dedup->elm2;
	cursor2.flags |= HAMMER_CURSOR_BACKEND;

	error = hammer_btree_lookup(&cursor2);
	if (error)
		goto done_cursors;
	error = hammer_btree_extract_data(&cursor2);
	if (error)
		goto done_cursors;

	/*
	 * Zone validation. We can't de-dup any of the other zones
	 * (BTREE or META) or bad things will happen.
	 *
	 * Return with error = 0, but set an INVALID_ZONE flag.
	 */
	error = validate_zone(cursor1.leaf->data_offset) +
		    validate_zone(cursor2.leaf->data_offset);
	if (error) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_INVALID_ZONE;
		error = 0;
		goto done_cursors;
	}

	/*
	 * Comparison checks
	 *
	 * If zones don't match or data_len fields aren't the same
	 * we consider it to be a comparison failure.
	 *
	 * Return with error = 0, but set a CMP_FAILURE flag.
	 */
	if (HAMMER_ZONE(cursor1.leaf->data_offset) !=
	    HAMMER_ZONE(cursor2.leaf->data_offset)) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
		goto done_cursors;
	}
	if (cursor1.leaf->data_len != cursor2.leaf->data_len) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
		goto done_cursors;
	}

	/* byte-by-byte comparison to be sure */
	if (bcmp(cursor1.data, cursor2.data, cursor1.leaf->data_len)) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
		goto done_cursors;
	}

	/*
	 * Upgrade both cursors together to an exclusive lock
	 *
	 * Return an error -> candidate goes to pass2 list
	 */
	hammer_sync_lock_sh(trans);
	error = hammer_cursor_upgrade2(&cursor1, &cursor2);
	if (error) {
		hammer_sync_unlock(trans);
		goto done_cursors;
	}

	/*
	 * Adjust the blockmap accounting for cursor1's data block so it
	 * can absorb cursor2's reference.  ERANGE presumably indicates
	 * the block's accounting would underflow; it is mapped to the
	 * UNDERFLOW status flag rather than returned as an error.
	 */
	error = hammer_blockmap_dedup(cursor1.trans,
			cursor1.leaf->data_offset, cursor1.leaf->data_len);
	if (error) {
		if (error == ERANGE) {
			/* Return with error = 0, but set an UNDERFLOW flag */
			dedup->head.flags |= HAMMER_IOC_DEDUP_UNDERFLOW;
			error = 0;
		}

		/* Return all other errors -> block goes to pass2 list */
		goto downgrade_cursors;
	}

	/*
	 * The cursor2's cache must be invalidated before calling
	 * hammer_blockmap_free(), otherwise it will not be able to
	 * invalidate the underlying data buffer.
	 */
	hammer_cursor_invalidate_cache(&cursor2);
	hammer_blockmap_free(cursor2.trans,
			cursor2.leaf->data_offset, cursor2.leaf->data_len);

	/*
	 * Repoint the second record's data reference at the first
	 * record's data block, inside the hammer_modify_node()
	 * bracketing so HAMMER can track the B-Tree node modification.
	 */
	hammer_modify_node(cursor2.trans, cursor2.node,
			&cursor2.leaf->data_offset, sizeof(hammer_off_t));
	cursor2.leaf->data_offset = cursor1.leaf->data_offset;
	hammer_modify_node_done(cursor2.node);

downgrade_cursors:
	hammer_cursor_downgrade2(&cursor1, &cursor2);
	hammer_sync_unlock(trans);
done_cursors:
	hammer_done_cursor(&cursor2);
done_cursor:
	hammer_done_cursor(&cursor1);

	/*
	 * Avoid deadlocking the buffer cache: while the flusher is past
	 * its meta-data half-limit or the UNDO FIFO is nearly exhausted,
	 * wait for the current flush cycle and kick off another one.
	 */
	seq = trans->hmp->flusher.done;
	while (hammer_flusher_meta_halflimit(trans->hmp) ||
	       hammer_flusher_undo_exhausted(trans, 2)) {
		hammer_flusher_wait(trans->hmp, seq);
		seq = hammer_flusher_async_one(trans->hmp);
	}
	return (error);
}

/*
 * Return 0 if data_offset lies in a zone that may be deduplicated
 * (small-data or large-data), 1 for any other zone (e.g. B-Tree or
 * meta zones, which must never be deduplicated).
 */
static __inline int
validate_zone(hammer_off_t data_offset)
{
	switch(HAMMER_ZONE(data_offset)) {
	case HAMMER_ZONE_LARGE_DATA:
	case HAMMER_ZONE_SMALL_DATA:
		return (0);
	default:
		return (1);
	}
}

/************************************************************************
 *				LIVE DEDUP 				*
 ************************************************************************
 *
 * HAMMER Live Dedup (aka as efficient cp(1) implementation)
 *
 * The dedup cache is operated in a LRU fashion and destroyed on
 * unmount, so essentially this is a live dedup on a cached dataset and
 * not a full-fledged fs-wide one - we have a batched dedup for that.
 * We allow duplicate entries in the buffer cache, data blocks are
 * deduplicated only on their way to media.  By default the cache is
 * populated on reads only, but it can be populated on writes too.
 *
 * The main implementation gotcha is on-media requirement - in order for
 * a data block to be added to a dedup cache it has to be present on
 * disk.
 * This simplifies cache logic a lot - once data is laid out on
 * media it remains valid on media all the way up to the point where the
 * related big-block the data was stored in is freed - so there is only
 * one place we need to bother with invalidation code.
 */

/*
 * RB-Tree support for dedup cache structures.
 *
 * Each cache entry is indexed twice: by data CRC (crc_entry) and by
 * media data offset (off_entry).  The offset tree is what allows
 * big-block-range invalidation in hammer_dedup_cache_inval().
 */
RB_GENERATE2(hammer_dedup_crc_rb_tree, hammer_dedup_cache, crc_entry,
		hammer_dedup_crc_rb_compare, hammer_crc_t, crc);
RB_GENERATE2(hammer_dedup_off_rb_tree, hammer_dedup_cache, off_entry,
		hammer_dedup_off_rb_compare, hammer_off_t, data_offset);

/*
 * Argument bundle handed through RB_SCAN() to the invalidation
 * callback below.
 */
struct hammer_dedup_inval {
	struct hammer_mount *hmp;
	hammer_off_t base_offset;
};

static int hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data);
static int hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc,
		void *data);
static __inline int _vnode_validate(hammer_dedup_cache_t dcp, void *data,
		int *errorp);
static __inline int _dev_validate(hammer_dedup_cache_t dcp, void *data,
		int *errorp);

/*
 * Three-way comparator ordering the CRC tree by dc->crc.
 */
int
hammer_dedup_crc_rb_compare(hammer_dedup_cache_t dc1, hammer_dedup_cache_t dc2)
{
	if (dc1->crc < dc2->crc)
		return (-1);
	if (dc1->crc > dc2->crc)
		return (1);

	return (0);
}

/*
 * Three-way comparator ordering the offset tree by dc->data_offset.
 */
int
hammer_dedup_off_rb_compare(hammer_dedup_cache_t dc1, hammer_dedup_cache_t dc2)
{
	if (dc1->data_offset < dc2->data_offset)
		return (-1);
	if (dc1->data_offset > dc2->data_offset)
		return (1);

	return (0);
}

/*
 * RB_SCAN comparator: match (return 0 for) any entry whose data_offset
 * falls inside the big-block starting at base_offset, i.e. the range
 * [base_offset, base_offset + HAMMER_BIGBLOCK_SIZE).
 */
static int
hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data)
{
	hammer_off_t off = ((struct hammer_dedup_inval *)data)->base_offset;

	if (dc->data_offset < off)
		return (-1);
	if (dc->data_offset >= off + HAMMER_BIGBLOCK_SIZE)
		return (1);

	return (0);
}

/*
 * Invalidation callback: unlink the entry from both RB trees and the
 * LRU list, then free it and drop the cache count.
 */
static int
hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc, void *data)
{
	hammer_mount_t hmp = ((struct hammer_dedup_inval *)data)->hmp;

	RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dc);
	RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dc);
	TAILQ_REMOVE(&hmp->dedup_lru_list, dc, lru_entry);

	--hmp->dedup_cache_count;
	kfree(dc, hmp->m_misc);

	return (0);
}

/*
 * Insert or refresh the dedup-cache entry describing the given B-Tree
 * leaf, keyed on leaf->data_crc.  An existing entry with the same CRC
 * is re-populated in place; otherwise a preallocated entry is used, or
 * the least-recently-used entry is recycled once the cache is at
 * hammer_live_dedup_cache_size.  The entry always ends up at the LRU
 * tail.  Returns the entry.
 */
hammer_dedup_cache_t
hammer_dedup_cache_add(hammer_inode_t ip, hammer_btree_leaf_elm_t leaf)
{
	hammer_dedup_cache_t dcp, tmp;
	hammer_mount_t hmp = ip->hmp;

	/*
	 * Ensure a spare entry is available.  kmalloc(M_WAITOK) may
	 * block, so dedup_free_cache is re-checked afterwards and the
	 * duplicate discarded — NOTE(review): presumably guards against
	 * another thread installing a spare while we slept; confirm the
	 * locking model.
	 */
	if (hmp->dedup_free_cache == NULL) {
		tmp = kmalloc(sizeof(*tmp), hmp->m_misc, M_WAITOK | M_ZERO);
		if (hmp->dedup_free_cache == NULL)
			hmp->dedup_free_cache = tmp;
		else
			kfree(tmp, hmp->m_misc);
	}

	KKASSERT(leaf != NULL);

	/*
	 * CRC already cached: pull the entry off the offset tree and
	 * LRU (it is re-inserted with fresh values at populate:), but
	 * leave it in the CRC tree.
	 */
	dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree,
		&hmp->rb_dedup_crc_root, leaf->data_crc);
	if (dcp != NULL) {
		RB_REMOVE(hammer_dedup_off_rb_tree,
			&hmp->rb_dedup_off_root, dcp);
		TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
		goto populate;
	}

	/*
	 * New CRC: consume the preallocated spare while under the size
	 * limit, otherwise recycle the oldest (LRU head) entry.
	 */
	if (hmp->dedup_cache_count < hammer_live_dedup_cache_size) {
		dcp = hmp->dedup_free_cache;
		hmp->dedup_free_cache = NULL;
		++hmp->dedup_cache_count;
	} else {
		dcp = TAILQ_FIRST(&hmp->dedup_lru_list);
		RB_REMOVE(hammer_dedup_crc_rb_tree,
			&hmp->rb_dedup_crc_root, dcp);
		RB_REMOVE(hammer_dedup_off_rb_tree,
			&hmp->rb_dedup_off_root, dcp);
		TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
	}

	dcp->crc = leaf->data_crc;
	tmp = RB_INSERT(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp);
	KKASSERT(tmp == NULL);

populate:
	dcp->hmp = ip->hmp;
	dcp->obj_id = ip->obj_id;
	dcp->localization = ip->obj_localization;
	/*
	 * NOTE(review): file offset derived as leaf key minus record
	 * length — presumably base.key addresses the end of the data
	 * range; confirm against the B-Tree key conventions.
	 */
	dcp->file_offset = leaf->base.key - leaf->data_len;
	dcp->bytes = leaf->data_len;
	dcp->data_offset = leaf->data_offset;

	tmp = RB_INSERT(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp);
	KKASSERT(tmp == NULL);
	TAILQ_INSERT_TAIL(&hmp->dedup_lru_list, dcp, lru_entry);

	return (dcp);
}

/*
 * Look up a dedup-cache entry by data CRC; returns NULL if not cached.
 */
__inline hammer_dedup_cache_t
hammer_dedup_cache_lookup(hammer_mount_t hmp, hammer_crc_t crc)
{
	hammer_dedup_cache_t dcp;

	dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree,
		&hmp->rb_dedup_crc_root, crc);
	return dcp;
}

/*
 * Invalidate (remove and free) every cache entry whose data lives in
 * the big-block beginning at base_offset.  Called when the related
 * big-block is freed.
 */
void hammer_dedup_cache_inval(hammer_mount_t hmp, hammer_off_t base_offset)
{
	struct hammer_dedup_inval di;

	di.hmp = hmp;
	di.base_offset = base_offset;

	RB_SCAN(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root,
		hammer_dedup_scancmp, hammer_dedup_cache_inval_callback, &di);
}

/*
 * Tear down the entire dedup cache (unmount path).  Both trees, the
 * LRU list and the entry count must all end up empty/zero.
 */
void
hammer_destroy_dedup_cache(hammer_mount_t hmp)
{
	hammer_dedup_cache_t dcp;

	while ((dcp = TAILQ_FIRST(&hmp->dedup_lru_list)) != NULL) {
		RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp);
		RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp);
		TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
		--hmp->dedup_cache_count;
		kfree(dcp, hmp->m_misc);
	}

	KKASSERT(RB_EMPTY(&hmp->rb_dedup_crc_root));
	KKASSERT(RB_EMPTY(&hmp->rb_dedup_off_root));
	KKASSERT(TAILQ_EMPTY(&hmp->dedup_lru_list));

	KKASSERT(hmp->dedup_cache_count == 0);
}

/*
 * Verify that the cached entry dcp still describes the caller's data:
 * the zone and length must match and the on-media bytes must compare
 * equal.  Returns 1 if the entry is valid, 0 otherwise.
 */
int
hammer_dedup_validate(hammer_dedup_cache_t dcp, int zone, int bytes, void *data)
{
	int error;

	/*
	 * Zone validation
	 */
	if (HAMMER_ZONE_DECODE(dcp->data_offset) != zone)
		return (0);

	/*
	 * Block length validation
	 */
	if (dcp->bytes != bytes)
		return (0);

	/*
	 * Byte-by-byte data comparison
	 *
	 * The data we need for validation may already be present in the
	 * buffer cache in two flavours: vnode-based buffer or
	 * block-device-based buffer.  In case vnode-based buffer wasn't
	 * there or if a non-blocking attempt to acquire it failed use
	 * device-based buffer (for large-zone data blocks it will
	 * generate a separate read).
	 *
	 * XXX vnode-based checking is not MP safe so when live-dedup
	 * is turned on we must always use the device buffer.
	 */
#if 0
	if (hammer_double_buffer) {
		error = 1;
	} else if (_vnode_validate(dcp, data, &error)) {
		hammer_live_dedup_vnode_bcmps++;
		return (1);
	} else {
		if (error == 3)
			hammer_live_dedup_findblk_failures++;
	}

	/*
	 * If there was an error previously or if double buffering is
	 * enabled.
	 */
	if (error) {
		if (_dev_validate(dcp, data, &error)) {
			hammer_live_dedup_device_bcmps++;
			return (1);
		}
	}
#endif
	/* Device-buffer comparison only; see XXX note above. */
	if (_dev_validate(dcp, data, &error)) {
		hammer_live_dedup_device_bcmps++;
		return (1);
	}

	return (0);
}

/*
 * Compare the caller's data against the vnode-based buffer-cache
 * buffer for the cached record, if one can be acquired non-blocking.
 * Returns 1 on a byte-identical match, 0 otherwise.
 *
 * *errorp reports why the vnode path could not be used:
 *	1 - inode lookup failed
 *	2 - vnode could not be acquired
 *	3 - findblk() found no buffer (non-blocking attempt)
 *	5 - buffer not fully cached, or dirty
 *	6 - VOP_BMAP() failed
 *	7 - translated media offset does not match the cached one
 * (4 is reserved, see the XXX below; 0 means the comparison ran.)
 */
static __inline int
_vnode_validate(hammer_dedup_cache_t dcp, void *data, int *errorp)
{
	struct hammer_transaction trans;
	hammer_inode_t ip;
	struct vnode *vp;
	struct buf *bp;
	off_t dooffset;
	int result, error;

	result = error = 0;
	*errorp = 0;

	hammer_simple_transaction(&trans, dcp->hmp);

	ip = hammer_get_inode(&trans, NULL, dcp->obj_id, HAMMER_MAX_TID,
	    dcp->localization, 0, &error);
	if (ip == NULL) {
		hkprintf("dedup: unable to find objid %016jx:%08x\n",
		    (intmax_t)dcp->obj_id, dcp->localization);
		*errorp = 1;
		goto failed2;
	}

	error = hammer_get_vnode(ip, &vp);
	if (error) {
		hkprintf("dedup: unable to acquire vnode for %016jx:%08x\n",
		    (intmax_t)dcp->obj_id, dcp->localization);
		*errorp = 2;
		goto failed;
	}

	if ((bp = findblk(ip->vp, dcp->file_offset, FINDBLK_NBLOCK)) != NULL) {
		bremfree(bp);

		/* XXX if (mapped to userspace) goto done, *errorp = 4 */

		if ((bp->b_flags & B_CACHE) == 0 || bp->b_flags & B_DIRTY) {
			*errorp = 5;
			goto done;
		}

		/*
		 * If the buffer's translated offset does not already
		 * match, verify via VOP_BMAP() that the buffer really
		 * backs the cached media offset before comparing.
		 */
		if (bp->b_bio2.bio_offset != dcp->data_offset) {
			error = VOP_BMAP(ip->vp, dcp->file_offset, &dooffset,
			    NULL, NULL, BUF_CMD_READ);
			if (error) {
				*errorp = 6;
				goto done;
			}

			if (dooffset != dcp->data_offset) {
				*errorp = 7;
				goto done;
			}
			hammer_live_dedup_bmap_saves++;
		}

		if (bcmp(data, bp->b_data, dcp->bytes) == 0)
			result = 1;

done:
		bqrelse(bp);
	} else {
		*errorp = 3;
	}
	vput(vp);

failed:
	hammer_rel_inode(ip, 0);
failed2:
	hammer_done_transaction(&trans);
	return (result);
}

/*
 * Compare the caller's data against the on-media copy read through the
 * block-device buffer via hammer_bread_ext().  Returns 1 on a
 * byte-identical match, 0 otherwise; *errorp is set to 1 if the read
 * failed.
 */
static __inline int
_dev_validate(hammer_dedup_cache_t dcp, void *data, int *errorp)
{
	hammer_buffer_t buffer = NULL;
	void *ondisk_data;
	int result, error;

	result = error = 0;
	*errorp = 0;

	ondisk_data = hammer_bread_ext(dcp->hmp, dcp->data_offset,
	    dcp->bytes, &error, &buffer);
	if (error) {
		*errorp = 1;
		goto failed;
	}

	if (bcmp(data, ondisk_data, dcp->bytes) == 0)
		result = 1;

failed:
	if (buffer)
		hammer_rel_buffer(buffer, 0);

	return (result);
}