xref: /dragonfly/sys/vfs/hammer/hammer_dedup.c (revision 65cc0652)
1 /*
2  * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Ilya Dryomov <idryomov@gmail.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #include "hammer.h"
36 
/*
 * hammer_ioc_dedup() - backend for the batched (offline) dedup ioctl.
 *
 * dedup->elm1 keys the B-Tree record whose data block is retained,
 * dedup->elm2 keys the record whose data block is released.  On full
 * success cursor2's leaf is repointed at cursor1's data block.
 *
 * "Soft" failures return 0 with a HAMMER_IOC_DEDUP_* flag set in
 * dedup->head.flags; hard errors are returned to userland, which puts
 * the candidate block on its pass2 (retry) list.
 */
int
hammer_ioc_dedup(hammer_transaction_t trans, hammer_inode_t ip,
		 struct hammer_ioc_dedup *dedup)
{
	struct hammer_cursor cursor1, cursor2;
	int error;
	int seq;

	/*
	 * Enforce hammer filesystem version requirements
	 */
	if (trans->hmp->version < HAMMER_VOL_VERSION_FIVE) {
		hkprintf("Filesystem must be upgraded to v5 "
			"before you can run dedup\n");
		return (EOPNOTSUPP);
	}

	/*
	 * Cursor1, return an error -> candidate goes to pass2 list
	 *
	 * Look up the record that will keep its data and extract that
	 * data for comparison.
	 */
	error = hammer_init_cursor(trans, &cursor1, NULL, NULL);
	if (error)
		goto done_cursor;
	cursor1.key_beg = dedup->elm1;
	cursor1.flags |= HAMMER_CURSOR_BACKEND;

	error = hammer_btree_lookup(&cursor1);
	if (error)
		goto done_cursor;
	error = hammer_btree_extract_data(&cursor1);
	if (error)
		goto done_cursor;

	/*
	 * Cursor2, return an error -> candidate goes to pass2 list
	 *
	 * Look up the record to be de-duplicated and extract its data.
	 *
	 * NOTE(review): an init failure here still reaches
	 * hammer_done_cursor(&cursor2) via done_cursors -- this assumes
	 * a cursor that failed hammer_init_cursor() is safe to finish;
	 * confirm against hammer_cursor.c.
	 */
	error = hammer_init_cursor(trans, &cursor2, NULL, NULL);
	if (error)
		goto done_cursors;
	cursor2.key_beg = dedup->elm2;
	cursor2.flags |= HAMMER_CURSOR_BACKEND;

	error = hammer_btree_lookup(&cursor2);
	if (error)
		goto done_cursors;
	error = hammer_btree_extract_data(&cursor2);
	if (error)
		goto done_cursors;

	/*
	 * Zone validation.
	 * We can only de-dup data zones or bad things will happen.
	 *
	 * Return with error = 0, but set an INVALID_ZONE flag.
	 */
	if (!hammer_is_zone_data(cursor1.leaf->data_offset) ||
	    !hammer_is_zone_data(cursor2.leaf->data_offset)) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_INVALID_ZONE;
		goto done_cursors;
	}

	/*
	 * Comparison checks
	 *
	 * If zones don't match or data_len fields aren't the same
	 * we consider it to be a comparison failure.
	 *
	 * Return with error = 0, but set a CMP_FAILURE flag.
	 */
	if (HAMMER_ZONE(cursor1.leaf->data_offset) !=
	    HAMMER_ZONE(cursor2.leaf->data_offset)) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
		goto done_cursors;
	}
	if (cursor1.leaf->data_len != cursor2.leaf->data_len) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
		goto done_cursors;
	}

	/* byte-by-byte comparison to be sure */
	if (bcmp(cursor1.data, cursor2.data, cursor1.leaf->data_len)) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
		goto done_cursors;
	}

	/*
	 * Upgrade both cursors together to an exclusive lock
	 *
	 * Return an error -> candidate goes to pass2 list
	 */
	hammer_sync_lock_sh(trans);
	error = hammer_cursor_upgrade2(&cursor1, &cursor2);
	if (error) {
		hammer_sync_unlock(trans);
		goto done_cursors;
	}

	/*
	 * Add a media-level reference for cursor1's data block
	 * (presumably bumps the big-block dedup refcount -- see
	 * hammer_blockmap_dedup()); ERANGE signals the counter could
	 * not be adjusted and is reported as a soft UNDERFLOW failure.
	 */
	error = hammer_blockmap_dedup(cursor1.trans,
			cursor1.leaf->data_offset, cursor1.leaf->data_len);
	if (error) {
		if (error == ERANGE) {
			/* Return with error = 0, but set an UNDERFLOW flag */
			dedup->head.flags |= HAMMER_IOC_DEDUP_UNDERFLOW;
			error = 0;
		}

		/* Return all other errors -> block goes to pass2 list */
		goto downgrade_cursors;
	}

	/*
	 * The cursor2's cache must be invalidated before calling
	 * hammer_blockmap_free(), otherwise it will not be able to
	 * invalidate the underlying data buffer.
	 */
	hammer_cursor_invalidate_cache(&cursor2);
	hammer_blockmap_free(cursor2.trans,
			cursor2.leaf->data_offset, cursor2.leaf->data_len);

	/*
	 * Repoint cursor2's leaf at cursor1's (now shared) data block.
	 */
	hammer_modify_node(cursor2.trans, cursor2.node,
			&cursor2.leaf->data_offset, sizeof(hammer_off_t));
	cursor2.leaf->data_offset = cursor1.leaf->data_offset;
	hammer_modify_node_done(cursor2.node);

downgrade_cursors:
	hammer_cursor_downgrade2(&cursor1, &cursor2);
	hammer_sync_unlock(trans);
done_cursors:
	hammer_done_cursor(&cursor2);
done_cursor:
	hammer_done_cursor(&cursor1);

	/*
	 * Avoid deadlocking the buffer cache: while the flusher is at
	 * its meta-data half-limit or low on UNDO space, wait for the
	 * current flush cycle and kick off another one.
	 */
	seq = trans->hmp->flusher.done;
	while (hammer_flusher_meta_halflimit(trans->hmp) ||
	       hammer_flusher_undo_exhausted(trans, 2)) {
		hammer_flusher_wait(trans->hmp, seq);
		seq = hammer_flusher_async_one(trans->hmp);
	}
	return (error);
}
180 
181 /************************************************************************
182  *				LIVE DEDUP 				*
183  ************************************************************************
184  *
 * HAMMER Live Dedup (aka an efficient cp(1) implementation)
186  *
 * The dedup cache is operated in an LRU fashion and destroyed on
188  * unmount, so essentially this is a live dedup on a cached dataset and
189  * not a full-fledged fs-wide one - we have a batched dedup for that.
190  * We allow duplicate entries in the buffer cache, data blocks are
191  * deduplicated only on their way to media.  By default the cache is
192  * populated on reads only, but it can be populated on writes too.
193  *
194  * The main implementation gotcha is on-media requirement - in order for
195  * a data block to be added to a dedup cache it has to be present on
196  * disk.  This simplifies cache logic a lot - once data is laid out on
197  * media it remains valid on media all the way up to the point where the
198  * related big-block the data was stored in is freed - so there is only
199  * one place we need to bother with invalidation code.
200  */
201 
/*
 * RB-Tree support for dedup cache structures
 *
 * Dedup cache entries are indexed twice: by data CRC (lookup path,
 * hammer_dedup_cache_lookup/add) and by media data offset
 * (invalidation path, hammer_dedup_cache_inval).
 */
RB_GENERATE2(hammer_dedup_crc_rb_tree, hammer_dedup_cache, crc_entry,
		hammer_dedup_crc_rb_compare, hammer_crc_t, crc);
RB_GENERATE2(hammer_dedup_off_rb_tree, hammer_dedup_cache, off_entry,
		hammer_dedup_off_rb_compare, hammer_off_t, data_offset);

/*
 * Argument bundle handed through RB_SCAN's void *data parameter to the
 * big-block invalidation scan callback/compare functions below.
 */
struct hammer_dedup_inval {
	hammer_mount_t hmp;
	hammer_off_t base_offset;	/* start of the freed big-block */
};

static int hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data);
static int hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc,
		void *data);
static __inline int _vnode_validate(hammer_dedup_cache_t dcp, void *data,
		int *errorp);
static __inline int _dev_validate(hammer_dedup_cache_t dcp, void *data,
		int *errorp);
222 
223 int
224 hammer_dedup_crc_rb_compare(hammer_dedup_cache_t dc1, hammer_dedup_cache_t dc2)
225 {
226 	if (dc1->crc < dc2->crc)
227 		return (-1);
228 	if (dc1->crc > dc2->crc)
229 		return (1);
230 
231 	return (0);
232 }
233 
234 int
235 hammer_dedup_off_rb_compare(hammer_dedup_cache_t dc1, hammer_dedup_cache_t dc2)
236 {
237 	if (dc1->data_offset < dc2->data_offset)
238 		return (-1);
239 	if (dc1->data_offset > dc2->data_offset)
240 		return (1);
241 
242 	return (0);
243 }
244 
245 static int
246 hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data)
247 {
248 	hammer_off_t off = ((struct hammer_dedup_inval *)data)->base_offset;
249 
250 	if (dc->data_offset < off)
251 		return (-1);
252 	if (dc->data_offset >= off + HAMMER_BIGBLOCK_SIZE)
253 		return (1);
254 
255 	return (0);
256 }
257 
258 static int
259 hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc, void *data)
260 {
261 	hammer_mount_t hmp = ((struct hammer_dedup_inval *)data)->hmp;
262 
263 	RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dc);
264 	RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dc);
265 	TAILQ_REMOVE(&hmp->dedup_lru_list, dc, lru_entry);
266 
267 	--hmp->dedup_cache_count;
268 	kfree(dc, hmp->m_misc);
269 
270 	return (0);
271 }
272 
/*
 * Insert or refresh a live dedup cache entry describing the on-media
 * data block referenced by @leaf, owned by inode @ip.  Returns the
 * (re)populated entry.
 *
 * Entries are indexed by data CRC and by media offset and kept on an
 * LRU list, capped at hammer_live_dedup_cache_size entries.
 */
hammer_dedup_cache_t
hammer_dedup_cache_add(hammer_inode_t ip, hammer_btree_leaf_elm_t leaf)
{
	hammer_dedup_cache_t dcp, tmp;
	hammer_mount_t hmp = ip->hmp;

	/*
	 * Keep one pre-allocated entry on standby.  kmalloc(M_WAITOK)
	 * can block, so re-check dedup_free_cache after it returns and
	 * discard the allocation if someone else re-armed the slot in
	 * the meantime.
	 */
	if (hmp->dedup_free_cache == NULL) {
		tmp = kmalloc(sizeof(*tmp), hmp->m_misc, M_WAITOK | M_ZERO);
		if (hmp->dedup_free_cache == NULL)
			hmp->dedup_free_cache = tmp;
		else
			kfree(tmp, hmp->m_misc);
	}

	KKASSERT(leaf != NULL);

	/*
	 * If an entry with this CRC already exists, recycle it in
	 * place: it keeps its node in the CRC tree but is pulled off
	 * the offset tree and the LRU so it can be re-keyed and
	 * re-queued below.
	 */
	dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree,
				&hmp->rb_dedup_crc_root, leaf->data_crc);
	if (dcp != NULL) {
		RB_REMOVE(hammer_dedup_off_rb_tree,
				&hmp->rb_dedup_off_root, dcp);
		TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
		goto populate;
	}

	/*
	 * Otherwise obtain storage for a new entry: consume the
	 * standby entry while under the cache size limit, else evict
	 * the least-recently-used entry and reuse its memory.
	 */
	if (hmp->dedup_cache_count < hammer_live_dedup_cache_size) {
		dcp = hmp->dedup_free_cache;
		hmp->dedup_free_cache = NULL;
		++hmp->dedup_cache_count;
	} else {
		dcp = TAILQ_FIRST(&hmp->dedup_lru_list);
		RB_REMOVE(hammer_dedup_crc_rb_tree,
				&hmp->rb_dedup_crc_root, dcp);
		RB_REMOVE(hammer_dedup_off_rb_tree,
				&hmp->rb_dedup_off_root, dcp);
		TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
	}

	dcp->crc = leaf->data_crc;
	tmp = RB_INSERT(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp);
	KKASSERT(tmp == NULL);

populate:
	/*
	 * (Re)describe the data block, index the entry by offset, and
	 * queue it at the LRU tail (most recently used end).
	 */
	dcp->hmp = ip->hmp;
	dcp->obj_id = ip->obj_id;
	dcp->localization = ip->obj_localization;
	dcp->file_offset = leaf->base.key - leaf->data_len;
	dcp->bytes = leaf->data_len;
	dcp->data_offset = leaf->data_offset;

	tmp = RB_INSERT(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp);
	KKASSERT(tmp == NULL);
	TAILQ_INSERT_TAIL(&hmp->dedup_lru_list, dcp, lru_entry);

	return (dcp);
}
329 
330 __inline hammer_dedup_cache_t
331 hammer_dedup_cache_lookup(hammer_mount_t hmp, hammer_crc_t crc)
332 {
333 	hammer_dedup_cache_t dcp;
334 
335 	dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree,
336 				&hmp->rb_dedup_crc_root, crc);
337 	return dcp;
338 }
339 
340 void hammer_dedup_cache_inval(hammer_mount_t hmp, hammer_off_t base_offset)
341 {
342 	struct hammer_dedup_inval di;
343 
344 	di.hmp = hmp;
345 	di.base_offset = base_offset;
346 
347 	RB_SCAN(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root,
348 		hammer_dedup_scancmp, hammer_dedup_cache_inval_callback, &di);
349 }
350 
351 void
352 hammer_destroy_dedup_cache(hammer_mount_t hmp)
353 {
354 	hammer_dedup_cache_t dcp;
355 
356 	while ((dcp = TAILQ_FIRST(&hmp->dedup_lru_list)) != NULL) {
357 		RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp);
358 		RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp);
359 		TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
360 		--hmp->dedup_cache_count;
361 		kfree(dcp, hmp->m_misc);
362 	}
363 
364 	KKASSERT(RB_EMPTY(&hmp->rb_dedup_crc_root));
365 	KKASSERT(RB_EMPTY(&hmp->rb_dedup_off_root));
366 	KKASSERT(TAILQ_EMPTY(&hmp->dedup_lru_list));
367 
368 	KKASSERT(hmp->dedup_cache_count == 0);
369 }
370 
/*
 * Decide whether the cached dedup entry @dcp can stand in for @bytes
 * bytes of new data @data destined for allocation zone @zone.
 *
 * Returns non-zero only when the zone and length match AND a
 * byte-by-byte comparison against the on-media copy succeeds;
 * returns zero otherwise.
 */
int
hammer_dedup_validate(hammer_dedup_cache_t dcp, int zone, int bytes, void *data)
{
	int error;

	/*
	 * Zone validation
	 */
	if (HAMMER_ZONE_DECODE(dcp->data_offset) != zone)
		return (0);

	/*
	 * Block length validation
	 */
	if (dcp->bytes != bytes)
		return (0);

	/*
	 * Byte-by-byte data comparison
	 *
	 * The data we need for validation may already be present in the
	 * buffer cache in two flavours: vnode-based buffer or
	 * block-device-based buffer.  In case vnode-based buffer wasn't
	 * there or if a non-blocking attempt to acquire it failed use
	 * device-based buffer (for large-zone data blocks it will
	 * generate a separate read).
	 *
	 * XXX vnode-based checking is not MP safe so when live-dedup
	 *     is turned on we must always use the device buffer.
	 */
#if 0
	if (hammer_double_buffer) {
		error = 1;
	} else if (_vnode_validate(dcp, data, &error)) {
		hammer_live_dedup_vnode_bcmps++;
		return (1);
	} else {
		if (error == 3)
			hammer_live_dedup_findblk_failures++;
	}

	/*
	 * If there was an error previously or if double buffering is
	 * enabled.
	 */
	if (error) {
		if (_dev_validate(dcp, data, &error)) {
			hammer_live_dedup_device_bcmps++;
			return (1);
		}
	}
#endif
	/* Device-buffer comparison is the only path currently enabled */
	if (_dev_validate(dcp, data, &error)) {
		hammer_live_dedup_device_bcmps++;
		return (1);
	}

	return (0);
}
430 
/*
 * Validate @data against the vnode-based buffer cache copy of the
 * block described by @dcp.
 *
 * Returns non-zero if a byte-by-byte comparison succeeded.  Returns
 * zero otherwise; when a step failed, *errorp carries a diagnostic
 * code 1-7 (see the assignments below).  *errorp == 0 with a zero
 * return means the bytes simply did not match.
 *
 * NOTE: within this file the only caller is inside the #if 0 section
 * of hammer_dedup_validate(), so this path is currently disabled
 * (flagged not MP safe there).
 */
static __inline int
_vnode_validate(hammer_dedup_cache_t dcp, void *data, int *errorp)
{
	struct hammer_transaction trans;
	hammer_inode_t ip;
	struct vnode *vp;
	struct buf *bp;
	off_t dooffset;
	int result, error;

	result = error = 0;
	*errorp = 0;

	hammer_simple_transaction(&trans, dcp->hmp);

	/* *errorp = 1: owning inode could not be found */
	ip = hammer_get_inode(&trans, NULL, dcp->obj_id, HAMMER_MAX_TID,
	    dcp->localization, 0, &error);
	if (ip == NULL) {
		hkprintf("dedup: unable to find objid %016jx:%08x\n",
		    (intmax_t)dcp->obj_id, dcp->localization);
		*errorp = 1;
		goto failed2;
	}

	/* *errorp = 2: could not acquire a vnode for the inode */
	error = hammer_get_vnode(ip, &vp);
	if (error) {
		hkprintf("dedup: unable to acquire vnode for %016jx:%08x\n",
		    (intmax_t)dcp->obj_id, dcp->localization);
		*errorp = 2;
		goto failed;
	}

	/*
	 * Non-blocking buffer lookup; *errorp = 3 when the buffer is
	 * not resident (or its lock could not be obtained).
	 */
	if ((bp = findblk(ip->vp, dcp->file_offset, FINDBLK_NBLOCK)) != NULL) {
		bremfree(bp);

		/* XXX if (mapped to userspace) goto done, *errorp = 4 */

		/* *errorp = 5: contents not fully cached, or dirty */
		if ((bp->b_flags & B_CACHE) == 0 || bp->b_flags & B_DIRTY) {
			*errorp = 5;
			goto done;
		}

		/*
		 * Confirm the buffer really backs the cached media
		 * offset.  If the bio offset does not already match,
		 * fall back to VOP_BMAP: *errorp = 6 on bmap failure,
		 * 7 when the mapped offset disagrees with the cache.
		 */
		if (bp->b_bio2.bio_offset != dcp->data_offset) {
			error = VOP_BMAP(ip->vp, dcp->file_offset, &dooffset,
			    NULL, NULL, BUF_CMD_READ);
			if (error) {
				*errorp = 6;
				goto done;
			}

			if (dooffset != dcp->data_offset) {
				*errorp = 7;
				goto done;
			}
			hammer_live_dedup_bmap_saves++;
		}

		if (bcmp(data, bp->b_data, dcp->bytes) == 0)
			result = 1;

done:
		bqrelse(bp);
	} else {
		*errorp = 3;
	}
	vput(vp);

failed:
	hammer_rel_inode(ip, 0);
failed2:
	hammer_done_transaction(&trans);
	return (result);
}
504 
505 static __inline int
506 _dev_validate(hammer_dedup_cache_t dcp, void *data, int *errorp)
507 {
508 	hammer_buffer_t buffer = NULL;
509 	void *ondisk_data;
510 	int result, error;
511 
512 	result = error = 0;
513 	*errorp = 0;
514 
515 	ondisk_data = hammer_bread_ext(dcp->hmp, dcp->data_offset,
516 	    dcp->bytes, &error, &buffer);
517 	if (error) {
518 		*errorp = 1;
519 		goto failed;
520 	}
521 
522 	if (bcmp(data, ondisk_data, dcp->bytes) == 0)
523 		result = 1;
524 
525 failed:
526 	if (buffer)
527 		hammer_rel_buffer(buffer, 0);
528 
529 	return (result);
530 }
531