/*
 * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Ilya Dryomov <idryomov@gmail.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "hammer.h"

int
hammer_ioc_dedup(hammer_transaction_t trans, hammer_inode_t ip,
                 struct hammer_ioc_dedup *dedup)
{
        struct hammer_cursor cursor1, cursor2;
        int error;
        int seq;

        /*
         * Enforce hammer filesystem version requirements
         */
        if (trans->hmp->version < HAMMER_VOL_VERSION_FIVE) {
                hkprintf("Filesystem must be upgraded to v5 "
                        "before you can run dedup\n");
                return (EOPNOTSUPP);
        }

        /*
         * Cursor1, return an error -> candidate goes to pass2 list
         */
        error = hammer_init_cursor(trans, &cursor1, NULL, NULL);
        if (error)
                goto done_cursor;
        cursor1.key_beg = dedup->elm1;
        cursor1.flags |= HAMMER_CURSOR_BACKEND;

        error = hammer_btree_lookup(&cursor1);
        if (error)
                goto done_cursor;
        error = hammer_btree_extract_data(&cursor1);
        if (error)
                goto done_cursor;

        /*
         * Cursor2, return an error -> candidate goes to pass2 list
         */
        error = hammer_init_cursor(trans, &cursor2, NULL, NULL);
        if (error)
                goto done_cursors;
        cursor2.key_beg = dedup->elm2;
        cursor2.flags |= HAMMER_CURSOR_BACKEND;

        error = hammer_btree_lookup(&cursor2);
        if (error)
                goto done_cursors;
        error = hammer_btree_extract_data(&cursor2);
        if (error)
                goto done_cursors;

        /*
         * Zone validation.
         * We can only de-dup data zones or bad things will happen.
         *
         * Return with error = 0, but set an INVALID_ZONE flag.
         */
        if (!hammer_is_zone_data(cursor1.leaf->data_offset) ||
            !hammer_is_zone_data(cursor2.leaf->data_offset)) {
                dedup->head.flags |= HAMMER_IOC_DEDUP_INVALID_ZONE;
                goto done_cursors;
        }

        /*
         * Comparison checks
         *
         * If zones don't match or data_len fields aren't the same
         * we consider it to be a comparison failure.
         *
         * Return with error = 0, but set a CMP_FAILURE flag.
         */
        if (HAMMER_ZONE(cursor1.leaf->data_offset) !=
            HAMMER_ZONE(cursor2.leaf->data_offset)) {
                dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
                goto done_cursors;
        }
        if (cursor1.leaf->data_len != cursor2.leaf->data_len) {
                dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
                goto done_cursors;
        }

        /* byte-by-byte comparison to be sure */
        if (bcmp(cursor1.data, cursor2.data, cursor1.leaf->data_len)) {
                dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
                goto done_cursors;
        }

        /*
         * Upgrade both cursors together to an exclusive lock
         *
         * Return an error -> candidate goes to pass2 list
         */
        hammer_sync_lock_sh(trans);
        error = hammer_cursor_upgrade2(&cursor1, &cursor2);
        if (error) {
                hammer_sync_unlock(trans);
                goto done_cursors;
        }

        error = hammer_blockmap_dedup(cursor1.trans,
                        cursor1.leaf->data_offset, cursor1.leaf->data_len);
        if (error) {
                if (error == ERANGE) {
                        /* Return with error = 0, but set an UNDERFLOW flag */
                        dedup->head.flags |= HAMMER_IOC_DEDUP_UNDERFLOW;
                        error = 0;
                }

                /* Return all other errors -> block goes to pass2 list */
                goto downgrade_cursors;
        }

        /*
         * The cursor2's cache must be invalidated before calling
         * hammer_blockmap_free(), otherwise it will not be able to
         * invalidate the underlying data buffer.
         */
        hammer_cursor_invalidate_cache(&cursor2);
        hammer_blockmap_free(cursor2.trans,
                        cursor2.leaf->data_offset, cursor2.leaf->data_len);

        hammer_modify_node(cursor2.trans, cursor2.node,
                        &cursor2.leaf->data_offset, sizeof(hammer_off_t));
        cursor2.leaf->data_offset = cursor1.leaf->data_offset;
        hammer_modify_node_done(cursor2.node);

downgrade_cursors:
        hammer_cursor_downgrade2(&cursor1, &cursor2);
        hammer_sync_unlock(trans);
done_cursors:
        hammer_done_cursor(&cursor2);
done_cursor:
        hammer_done_cursor(&cursor1);

        /*
         * Avoid deadlocking the buffer cache
         */
        seq = trans->hmp->flusher.done;
        while (hammer_flusher_meta_halflimit(trans->hmp) ||
               hammer_flusher_undo_exhausted(trans, 2)) {
                hammer_flusher_wait(trans->hmp, seq);
                seq = hammer_flusher_async_one(trans->hmp);
        }
        return (error);
}

/************************************************************************
 *                              LIVE DEDUP                              *
 ************************************************************************
 *
 * HAMMER Live Dedup (aka an efficient cp(1) implementation)
 *
 * The dedup cache is operated in an LRU fashion and destroyed on
 * unmount, so essentially this is a live dedup on a cached dataset and
 * not a full-fledged fs-wide one - we have a batched dedup for that.
 * We allow duplicate entries in the buffer cache; data blocks are
 * deduplicated only on their way to media.  By default the cache is
 * populated on reads only, but it can be populated on writes too.
 *
 * The main implementation gotcha is the on-media requirement - in order
 * for a data block to be added to a dedup cache it has to be present on
 * disk.  This simplifies cache logic a lot - once data is laid out on
 * media it remains valid on media all the way up to the point where the
 * related big-block the data was stored in is freed - so there is only
 * one place we need to bother with invalidation code.
 */

/*
 * RB-Tree support for dedup cache structures
 */
RB_GENERATE2(hammer_dedup_crc_rb_tree, hammer_dedup_cache, crc_entry,
                hammer_dedup_crc_rb_compare, hammer_crc_t, crc);
RB_GENERATE2(hammer_dedup_off_rb_tree, hammer_dedup_cache, off_entry,
                hammer_dedup_off_rb_compare, hammer_off_t, data_offset);

struct hammer_dedup_inval {
        hammer_mount_t hmp;
        hammer_off_t base_offset;
};

static int hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data);
static int hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc,
                void *data);
static __inline int _vnode_validate(hammer_dedup_cache_t dcp, void *data,
                int *errorp);
static __inline int _dev_validate(hammer_dedup_cache_t dcp, void *data,
                int *errorp);

int
hammer_dedup_crc_rb_compare(hammer_dedup_cache_t dc1, hammer_dedup_cache_t dc2)
{
        if (dc1->crc < dc2->crc)
                return (-1);
        if (dc1->crc > dc2->crc)
                return (1);

        return (0);
}

int
hammer_dedup_off_rb_compare(hammer_dedup_cache_t dc1, hammer_dedup_cache_t dc2)
{
        if (dc1->data_offset < dc2->data_offset)
                return (-1);
        if (dc1->data_offset > dc2->data_offset)
                return (1);

        return (0);
}

static int
hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data)
{
        hammer_off_t off = ((struct hammer_dedup_inval *)data)->base_offset;

        if (dc->data_offset < off)
                return (-1);
        if (dc->data_offset >= off + HAMMER_BIGBLOCK_SIZE)
                return (1);

        return (0);
}

static int
hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc, void *data)
{
        hammer_mount_t hmp = ((struct hammer_dedup_inval *)data)->hmp;

        RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dc);
        RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dc);
        TAILQ_REMOVE(&hmp->dedup_lru_list, dc, lru_entry);

        --hmp->dedup_cache_count;
        kfree(dc, hmp->m_misc);

        return (0);
}

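/*
 * Add a dedup cache entry describing the on-media data referenced by
 * the B-Tree leaf, or refresh the existing entry if the data CRC is
 * already cached.  A new entry is taken from the preallocated spare
 * (or the LRU head is recycled once the cache is full), populated from
 * the inode and leaf, and inserted into both the CRC-keyed and
 * offset-keyed RB trees as well as the tail of the LRU list.
 */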
hammer_dedup_cache_t
hammer_dedup_cache_add(hammer_inode_t ip, hammer_btree_leaf_elm_t leaf)
{
        hammer_dedup_cache_t dcp, tmp;
        hammer_mount_t hmp = ip->hmp;

        if (hmp->dedup_free_cache == NULL) {
                tmp = kmalloc(sizeof(*tmp), hmp->m_misc, M_WAITOK | M_ZERO);
                if (hmp->dedup_free_cache == NULL)
                        hmp->dedup_free_cache = tmp;
                else
                        kfree(tmp, hmp->m_misc);
        }

        KKASSERT(leaf != NULL);

        dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree,
                        &hmp->rb_dedup_crc_root, leaf->data_crc);
        if (dcp != NULL) {
                RB_REMOVE(hammer_dedup_off_rb_tree,
                                &hmp->rb_dedup_off_root, dcp);
                TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
                goto populate;
        }

        if (hmp->dedup_cache_count < hammer_live_dedup_cache_size) {
                dcp = hmp->dedup_free_cache;
                hmp->dedup_free_cache = NULL;
                ++hmp->dedup_cache_count;
        } else {
                dcp = TAILQ_FIRST(&hmp->dedup_lru_list);
                RB_REMOVE(hammer_dedup_crc_rb_tree,
                                &hmp->rb_dedup_crc_root, dcp);
                RB_REMOVE(hammer_dedup_off_rb_tree,
                                &hmp->rb_dedup_off_root, dcp);
                TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
        }

        dcp->crc = leaf->data_crc;
        tmp = RB_INSERT(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp);
        KKASSERT(tmp == NULL);

populate:
        dcp->hmp = ip->hmp;
        dcp->obj_id = ip->obj_id;
        dcp->localization = ip->obj_localization;
        dcp->file_offset = leaf->base.key - leaf->data_len;
        dcp->bytes = leaf->data_len;
        dcp->data_offset = leaf->data_offset;

        tmp = RB_INSERT(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp);
        KKASSERT(tmp == NULL);
        TAILQ_INSERT_TAIL(&hmp->dedup_lru_list, dcp, lru_entry);

        return (dcp);
}

__inline hammer_dedup_cache_t
hammer_dedup_cache_lookup(hammer_mount_t hmp, hammer_crc_t crc)
{
        hammer_dedup_cache_t dcp;

        dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree,
                        &hmp->rb_dedup_crc_root, crc);
        return dcp;
}

void
hammer_dedup_cache_inval(hammer_mount_t hmp, hammer_off_t base_offset)
{
        struct hammer_dedup_inval di;

        di.hmp = hmp;
        di.base_offset = base_offset;

        RB_SCAN(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root,
                hammer_dedup_scancmp, hammer_dedup_cache_inval_callback, &di);
}

void
hammer_destroy_dedup_cache(hammer_mount_t hmp)
{
        hammer_dedup_cache_t dcp;

        while ((dcp = TAILQ_FIRST(&hmp->dedup_lru_list)) != NULL) {
                RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp);
                RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp);
                TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
                --hmp->dedup_cache_count;
                kfree(dcp, hmp->m_misc);
        }

        KKASSERT(RB_EMPTY(&hmp->rb_dedup_crc_root));
        KKASSERT(RB_EMPTY(&hmp->rb_dedup_off_root));
        KKASSERT(TAILQ_EMPTY(&hmp->dedup_lru_list));

        KKASSERT(hmp->dedup_cache_count == 0);
}

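/*
 * Validate a dedup cache hit against the caller-supplied data block:
 * the cached entry must reference the same blockmap zone, have the
 * same length, and compare equal byte-by-byte against the on-media
 * copy.  Returns 1 if the entry is usable for deduplication, 0 if not.
 * The vnode-based comparison path is currently disabled (see the
 * #if 0 block below).
 */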
int
hammer_dedup_validate(hammer_dedup_cache_t dcp, int zone, int bytes, void *data)
{
        int error;

        /*
         * Zone validation
         */
        if (HAMMER_ZONE_DECODE(dcp->data_offset) != zone)
                return (0);

        /*
         * Block length validation
         */
        if (dcp->bytes != bytes)
                return (0);

        /*
         * Byte-by-byte data comparison
         *
         * The data we need for validation may already be present in the
         * buffer cache in two flavours: vnode-based buffer or
         * block-device-based buffer.  In case the vnode-based buffer isn't
         * there, or if a non-blocking attempt to acquire it fails, use the
         * device-based buffer (for large-zone data blocks this will
         * generate a separate read).
         *
         * XXX vnode-based checking is not MP safe so when live-dedup
         *     is turned on we must always use the device buffer.
         */
#if 0
        if (hammer_double_buffer) {
                error = 1;
        } else if (_vnode_validate(dcp, data, &error)) {
                hammer_live_dedup_vnode_bcmps++;
                return (1);
        } else {
                if (error == 3)
                        hammer_live_dedup_findblk_failures++;
        }

        /*
         * If there was an error previously or if double buffering is
         * enabled, fall back to the device buffer.
         */
        if (error) {
                if (_dev_validate(dcp, data, &error)) {
                        hammer_live_dedup_device_bcmps++;
                        return (1);
                }
        }
#endif
        if (_dev_validate(dcp, data, &error)) {
                hammer_live_dedup_device_bcmps++;
                return (1);
        }

        return (0);
}

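/*
 * Try to validate the candidate against the vnode-based (logical)
 * buffer cache copy of the cached block.  Returns 1 on an exact match;
 * otherwise returns 0 and sets *errorp to indicate why the buffer
 * could not be used (inode/vnode lookup failure, buffer not found or
 * not clean, or the buffer maps to a different data offset).
 */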
static __inline int
_vnode_validate(hammer_dedup_cache_t dcp, void *data, int *errorp)
{
        struct hammer_transaction trans;
        hammer_inode_t ip;
        struct vnode *vp;
        struct buf *bp;
        off_t dooffset;
        int result, error;

        result = error = 0;
        *errorp = 0;

        hammer_simple_transaction(&trans, dcp->hmp);

        ip = hammer_get_inode(&trans, NULL, dcp->obj_id, HAMMER_MAX_TID,
                        dcp->localization, 0, &error);
        if (ip == NULL) {
                hkprintf("dedup: unable to find objid %016jx:%08x\n",
                        (intmax_t)dcp->obj_id, dcp->localization);
                *errorp = 1;
                goto failed2;
        }

        error = hammer_get_vnode(ip, &vp);
        if (error) {
                hkprintf("dedup: unable to acquire vnode for %016jx:%08x\n",
                        (intmax_t)dcp->obj_id, dcp->localization);
                *errorp = 2;
                goto failed;
        }

        if ((bp = findblk(ip->vp, dcp->file_offset, FINDBLK_NBLOCK)) != NULL) {
                bremfree(bp);

                /* XXX if (mapped to userspace) goto done, *errorp = 4 */

                if ((bp->b_flags & B_CACHE) == 0 || bp->b_flags & B_DIRTY) {
                        *errorp = 5;
                        goto done;
                }

                if (bp->b_bio2.bio_offset != dcp->data_offset) {
                        error = VOP_BMAP(ip->vp, dcp->file_offset, &dooffset,
                                        NULL, NULL, BUF_CMD_READ);
                        if (error) {
                                *errorp = 6;
                                goto done;
                        }

                        if (dooffset != dcp->data_offset) {
                                *errorp = 7;
                                goto done;
                        }
                        hammer_live_dedup_bmap_saves++;
                }

                if (bcmp(data, bp->b_data, dcp->bytes) == 0)
                        result = 1;

done:
                bqrelse(bp);
        } else {
                *errorp = 3;
        }
        vput(vp);

failed:
        hammer_rel_inode(ip, 0);
failed2:
        hammer_done_transaction(&trans);
        return (result);
}

/*
 * Validate the candidate against the on-media copy read through the
 * block device (hammer_bread_ext()).  Returns 1 on an exact match.
 */
static __inline int
_dev_validate(hammer_dedup_cache_t dcp, void *data, int *errorp)
{
        hammer_buffer_t buffer = NULL;
        void *ondisk_data;
        int result, error;

        result = error = 0;
        *errorp = 0;

        ondisk_data = hammer_bread_ext(dcp->hmp, dcp->data_offset,
                        dcp->bytes, &error, &buffer);
        if (error) {
                *errorp = 1;
                goto failed;
        }

        if (bcmp(data, ondisk_data, dcp->bytes) == 0)
                result = 1;

failed:
        if (buffer)
                hammer_rel_buffer(buffer, 0);

        return (result);
}