xref: /dragonfly/sys/vfs/hammer/hammer_dedup.c (revision aeaecd48)
/*
 * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Ilya Dryomov <idryomov@gmail.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "hammer.h"

static __inline int validate_zone(hammer_off_t data_offset);

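/*
 * De-duplicate the pair of B-tree leaf elements (dedup->elm1 and
 * dedup->elm2) handed in by the batched dedup ioctl.  Both elements
 * must reference data in a *_DATA zone with matching zone and data_len
 * and byte-for-byte identical contents.  On success a reference is
 * added to elm1's data (hammer_blockmap_dedup()), elm2's data is freed
 * and elm2's leaf is rewired to point at elm1's data.  Soft failures
 * (invalid zone, comparison mismatch, ERANGE from
 * hammer_blockmap_dedup()) are reported via dedup->head.flags with
 * error = 0; hard errors are returned so the caller can queue the
 * candidate for the pass2 list.
 */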
int
hammer_ioc_dedup(hammer_transaction_t trans, hammer_inode_t ip,
		 struct hammer_ioc_dedup *dedup)
{
	struct hammer_cursor cursor1, cursor2;
	int error;
	int seq;

	/*
	 * Enforce hammer filesystem version requirements
	 */
	if (trans->hmp->version < HAMMER_VOL_VERSION_FIVE) {
		hkprintf("Filesystem must be upgraded to v5 "
			"before you can run dedup\n");
		return (EOPNOTSUPP);
	}

	/*
	 * Cursor1, return an error -> candidate goes to pass2 list
	 */
	error = hammer_init_cursor(trans, &cursor1, NULL, NULL);
	if (error)
		goto done_cursor;
	cursor1.key_beg = dedup->elm1;
	cursor1.flags |= HAMMER_CURSOR_BACKEND;

	error = hammer_btree_lookup(&cursor1);
	if (error)
		goto done_cursor;
	error = hammer_btree_extract(&cursor1, HAMMER_CURSOR_GET_LEAF |
						HAMMER_CURSOR_GET_DATA);
	if (error)
		goto done_cursor;

	/*
	 * Cursor2, return an error -> candidate goes to pass2 list
	 */
	error = hammer_init_cursor(trans, &cursor2, NULL, NULL);
	if (error)
		goto done_cursors;
	cursor2.key_beg = dedup->elm2;
	cursor2.flags |= HAMMER_CURSOR_BACKEND;

	error = hammer_btree_lookup(&cursor2);
	if (error)
		goto done_cursors;
	error = hammer_btree_extract(&cursor2, HAMMER_CURSOR_GET_LEAF |
						HAMMER_CURSOR_GET_DATA);
	if (error)
		goto done_cursors;

	/*
	 * Zone validation.  We can't de-dup any of the other zones
	 * (BTREE or META) or bad things will happen.
	 *
	 * Return with error = 0, but set an INVALID_ZONE flag.
	 */
	error = validate_zone(cursor1.leaf->data_offset) +
			    validate_zone(cursor2.leaf->data_offset);
	if (error) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_INVALID_ZONE;
		error = 0;
		goto done_cursors;
	}

	/*
	 * Comparison checks
	 *
	 * If zones don't match or data_len fields aren't the same
	 * we consider it to be a comparison failure.
	 *
	 * Return with error = 0, but set a CMP_FAILURE flag.
	 */
	if ((cursor1.leaf->data_offset & HAMMER_OFF_ZONE_MASK) !=
	    (cursor2.leaf->data_offset & HAMMER_OFF_ZONE_MASK)) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
		goto done_cursors;
	}
	if (cursor1.leaf->data_len != cursor2.leaf->data_len) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
		goto done_cursors;
	}

	/* byte-by-byte comparison to be sure */
	if (bcmp(cursor1.data, cursor2.data, cursor1.leaf->data_len)) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
		goto done_cursors;
	}

	/*
	 * Upgrade both cursors together to an exclusive lock
	 *
	 * Return an error -> candidate goes to pass2 list
	 */
	hammer_sync_lock_sh(trans);
	error = hammer_cursor_upgrade2(&cursor1, &cursor2);
	if (error) {
		hammer_sync_unlock(trans);
		goto done_cursors;
	}

	error = hammer_blockmap_dedup(cursor1.trans,
			cursor1.leaf->data_offset, cursor1.leaf->data_len);
	if (error) {
		if (error == ERANGE) {
			/* Return with error = 0, but set an UNDERFLOW flag */
			dedup->head.flags |= HAMMER_IOC_DEDUP_UNDERFLOW;
			error = 0;
		}

		/* Return all other errors -> block goes to pass2 list */
		goto downgrade_cursors;
	}

	/*
	 * Cursor2's cache must be invalidated before calling
	 * hammer_blockmap_free(), otherwise it will not be able to
	 * invalidate the underlying data buffer.
	 */
	hammer_cursor_invalidate_cache(&cursor2);
	hammer_blockmap_free(cursor2.trans,
			cursor2.leaf->data_offset, cursor2.leaf->data_len);

	hammer_modify_node(cursor2.trans, cursor2.node,
			&cursor2.leaf->data_offset, sizeof(hammer_off_t));
	cursor2.leaf->data_offset = cursor1.leaf->data_offset;
	hammer_modify_node_done(cursor2.node);

downgrade_cursors:
	hammer_cursor_downgrade2(&cursor1, &cursor2);
	hammer_sync_unlock(trans);
done_cursors:
	hammer_done_cursor(&cursor2);
done_cursor:
	hammer_done_cursor(&cursor1);

	/*
	 * Avoid deadlocking the buffer cache
	 */
	seq = trans->hmp->flusher.done;
	while (hammer_flusher_meta_halflimit(trans->hmp) ||
	       hammer_flusher_undo_exhausted(trans, 2)) {
		hammer_flusher_wait(trans->hmp, seq);
		seq = hammer_flusher_async_one(trans->hmp);
	}
	return (error);
}

static __inline int
validate_zone(hammer_off_t data_offset)
{
	switch(data_offset & HAMMER_OFF_ZONE_MASK) {
	case HAMMER_ZONE_LARGE_DATA:
	case HAMMER_ZONE_SMALL_DATA:
		return (0);
	default:
		return (1);
	}
}

/************************************************************************
 *				LIVE DEDUP 				*
 ************************************************************************
 *
 * HAMMER Live Dedup (aka an efficient cp(1) implementation)
 *
 * The dedup cache is operated in an LRU fashion and destroyed on
 * unmount, so essentially this is a live dedup on a cached dataset and
 * not a full-fledged fs-wide one - we have a batched dedup for that.
 * We allow duplicate entries in the buffer cache; data blocks are
 * deduplicated only on their way to media.  By default the cache is
 * populated on reads only, but it can be populated on writes too.
 *
 * The main implementation gotcha is the on-media requirement - in order
 * for a data block to be added to the dedup cache it has to be present
 * on disk.  This simplifies the cache logic a lot - once data is laid
 * out on media it remains valid on media all the way up to the point
 * where the related big-block the data was stored in is freed - so
 * there is only one place we need to bother with invalidation code.
 */

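/*
 * Rough lifecycle of the cache (a sketch; the exact call sites live
 * elsewhere in the HAMMER code):
 *
 *	populate:	hammer_dedup_cache_add(ip, leaf) as data blocks
 *			are read (and optionally written)
 *	dedup a write:	dcp = hammer_dedup_cache_lookup(hmp, data_crc);
 *			if (dcp != NULL &&
 *			    hammer_dedup_validate(dcp, zone, bytes, data))
 *				reuse dcp->data_offset instead of
 *				allocating new media space
 *	invalidate:	hammer_dedup_cache_inval(hmp, base_offset) when
 *			the underlying big-block is freed
 *	teardown:	hammer_destroy_dedup_cache(hmp) on unmount
 */
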
/*
 * RB-Tree support for dedup cache structures
 */
RB_GENERATE2(hammer_dedup_crc_rb_tree, hammer_dedup_cache, crc_entry,
		hammer_dedup_crc_rb_compare, hammer_crc_t, crc);
RB_GENERATE2(hammer_dedup_off_rb_tree, hammer_dedup_cache, off_entry,
		hammer_dedup_off_rb_compare, hammer_off_t, data_offset);

struct hammer_dedup_inval {
	struct hammer_mount *hmp;
	hammer_off_t base_offset;
};

static int hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data);
static int hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc,
		void *data);
static __inline int _vnode_validate(hammer_dedup_cache_t dcp, void *data,
		int *errorp);
static __inline int _dev_validate(hammer_dedup_cache_t dcp, void *data,
		int *errorp);

int
hammer_dedup_crc_rb_compare(hammer_dedup_cache_t dc1, hammer_dedup_cache_t dc2)
{
	if (dc1->crc < dc2->crc)
		return (-1);
	if (dc1->crc > dc2->crc)
		return (1);

	return (0);
}

int
hammer_dedup_off_rb_compare(hammer_dedup_cache_t dc1, hammer_dedup_cache_t dc2)
{
	if (dc1->data_offset < dc2->data_offset)
		return (-1);
	if (dc1->data_offset > dc2->data_offset)
		return (1);

	return (0);
}

static int
hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data)
{
	hammer_off_t off = ((struct hammer_dedup_inval *)data)->base_offset;

	if (dc->data_offset < off)
		return (-1);
	if (dc->data_offset >= off + HAMMER_BIGBLOCK_SIZE)
		return (1);

	return (0);
}

static int
hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc, void *data)
{
	hammer_mount_t hmp = ((struct hammer_dedup_inval *)data)->hmp;

	RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dc);
	RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dc);
	TAILQ_REMOVE(&hmp->dedup_lru_list, dc, lru_entry);

	--hmp->dedup_cache_count;
	kfree(dc, hmp->m_misc);

	return (0);
}

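/*
 * Insert a dedup cache entry describing the given B-tree leaf, or
 * refresh the existing entry if the data CRC is already cached.  A
 * free entry is pre-allocated up front since kmalloc(M_WAITOK) can
 * block; the free pointer is re-checked afterwards in case it was
 * refilled while we slept.  Once the cache is full
 * (hammer_live_dedup_cache_size) the least recently used entry is
 * recycled instead.  The entry is indexed by both CRC and data offset
 * and placed at the tail of the LRU list.
 */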
hammer_dedup_cache_t
hammer_dedup_cache_add(hammer_inode_t ip, hammer_btree_leaf_elm_t leaf)
{
	hammer_dedup_cache_t dcp, tmp;
	hammer_mount_t hmp = ip->hmp;

	if (hmp->dedup_free_cache == NULL) {
		tmp = kmalloc(sizeof(*tmp), hmp->m_misc, M_WAITOK | M_ZERO);
		if (hmp->dedup_free_cache == NULL)
			hmp->dedup_free_cache = tmp;
		else
			kfree(tmp, hmp->m_misc);
	}

	KKASSERT(leaf != NULL);

	dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree,
				&hmp->rb_dedup_crc_root, leaf->data_crc);
	if (dcp != NULL) {
		RB_REMOVE(hammer_dedup_off_rb_tree,
				&hmp->rb_dedup_off_root, dcp);
		TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
		goto populate;
	}

	if (hmp->dedup_cache_count < hammer_live_dedup_cache_size) {
		dcp = hmp->dedup_free_cache;
		hmp->dedup_free_cache = NULL;
		++hmp->dedup_cache_count;
	} else {
		dcp = TAILQ_FIRST(&hmp->dedup_lru_list);
		RB_REMOVE(hammer_dedup_crc_rb_tree,
				&hmp->rb_dedup_crc_root, dcp);
		RB_REMOVE(hammer_dedup_off_rb_tree,
				&hmp->rb_dedup_off_root, dcp);
		TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
	}

	dcp->crc = leaf->data_crc;
	tmp = RB_INSERT(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp);
	KKASSERT(tmp == NULL);

populate:
	dcp->hmp = ip->hmp;
	dcp->obj_id = ip->obj_id;
	dcp->localization = ip->obj_localization;
	dcp->file_offset = leaf->base.key - leaf->data_len;
	dcp->bytes = leaf->data_len;
	dcp->data_offset = leaf->data_offset;

	tmp = RB_INSERT(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp);
	KKASSERT(tmp == NULL);
	TAILQ_INSERT_TAIL(&hmp->dedup_lru_list, dcp, lru_entry);

	return (dcp);
}

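/*
 * Lookup a dedup cache entry by data CRC.  Returns NULL if the CRC is
 * not cached.
 */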
__inline hammer_dedup_cache_t
hammer_dedup_cache_lookup(hammer_mount_t hmp, hammer_crc_t crc)
{
	hammer_dedup_cache_t dcp;

	dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree,
				&hmp->rb_dedup_crc_root, crc);
	return (dcp);
}

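/*
 * Invalidate all dedup cache entries backed by the big-block at
 * base_offset.  This is the sole invalidation point: on-media data
 * only becomes stale when its big-block is freed.
 */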
void
hammer_dedup_cache_inval(hammer_mount_t hmp, hammer_off_t base_offset)
{
	struct hammer_dedup_inval di;

	di.hmp = hmp;
	di.base_offset = base_offset;

	RB_SCAN(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root,
		hammer_dedup_scancmp, hammer_dedup_cache_inval_callback, &di);
}

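/*
 * Destroy the entire dedup cache.  Called on unmount.
 */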
void
hammer_destroy_dedup_cache(hammer_mount_t hmp)
{
	hammer_dedup_cache_t dcp;

	while ((dcp = TAILQ_FIRST(&hmp->dedup_lru_list)) != NULL) {
		RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp);
		RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp);
		TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
		--hmp->dedup_cache_count;
		kfree(dcp, hmp->m_misc);
	}

	KKASSERT(RB_EMPTY(&hmp->rb_dedup_crc_root));
	KKASSERT(RB_EMPTY(&hmp->rb_dedup_off_root));
	KKASSERT(TAILQ_EMPTY(&hmp->dedup_lru_list));

	KKASSERT(hmp->dedup_cache_count == 0);
}

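/*
 * Validate a dedup cache hit against the candidate write data.
 * Returns non-zero if the cached block matches (same zone, same
 * length, identical bytes) and may be reused, 0 otherwise.
 */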
int
hammer_dedup_validate(hammer_dedup_cache_t dcp, int zone, int bytes, void *data)
{
	int error;

	/*
	 * Zone validation
	 */
	if (HAMMER_ZONE_DECODE(dcp->data_offset) != zone)
		return (0);

	/*
	 * Block length validation
	 */
	if (dcp->bytes != bytes)
		return (0);

	/*
	 * Byte-by-byte data comparison
	 *
	 * The data we need for validation may already be present in the
	 * buffer cache in two flavours: a vnode-based buffer or a
	 * block-device-based buffer.  If the vnode-based buffer isn't
	 * there, or a non-blocking attempt to acquire it fails, use the
	 * device-based buffer (for large-zone data blocks this will
	 * generate a separate read).
	 *
	 * XXX vnode-based checking is not MP safe so when live-dedup
	 *     is turned on we must always use the device buffer.
	 */
#if 0
	if (hammer_double_buffer) {
		error = 1;
	} else if (_vnode_validate(dcp, data, &error)) {
		hammer_live_dedup_vnode_bcmps++;
		return (1);
	} else {
		if (error == 3)
			hammer_live_dedup_findblk_failures++;
	}

	/*
	 * If there was an error previously or if double buffering is
	 * enabled.
	 */
	if (error) {
		if (_dev_validate(dcp, data, &error)) {
			hammer_live_dedup_device_bcmps++;
			return (1);
		}
	}
#endif
	if (_dev_validate(dcp, data, &error)) {
		hammer_live_dedup_device_bcmps++;
		return (1);
	}

	return (0);
}

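/*
 * Compare the candidate data against the vnode-based buffer cache copy
 * of the cached block, avoiding a separate device read.  Returns
 * non-zero on a match; otherwise *errorp is set to a small diagnostic
 * code (e.g. 3 when findblk() fails) so the caller can fall back to
 * _dev_validate().  Currently only referenced by the #if 0 code above.
 */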
static __inline int
_vnode_validate(hammer_dedup_cache_t dcp, void *data, int *errorp)
{
	struct hammer_transaction trans;
	hammer_inode_t ip;
	struct vnode *vp;
	struct buf *bp;
	off_t dooffset;
	int result, error;

	result = error = 0;
	*errorp = 0;

	hammer_simple_transaction(&trans, dcp->hmp);

	ip = hammer_get_inode(&trans, NULL, dcp->obj_id, HAMMER_MAX_TID,
	    dcp->localization, 0, &error);
	if (ip == NULL) {
		hkprintf("dedup: unable to find objid %016jx:%08x\n",
		    (intmax_t)dcp->obj_id, dcp->localization);
		*errorp = 1;
		goto failed2;
	}

	error = hammer_get_vnode(ip, &vp);
	if (error) {
		hkprintf("dedup: unable to acquire vnode for %016jx:%08x\n",
		    (intmax_t)dcp->obj_id, dcp->localization);
		*errorp = 2;
		goto failed;
	}

	if ((bp = findblk(ip->vp, dcp->file_offset, FINDBLK_NBLOCK)) != NULL) {
		bremfree(bp);

		/* XXX if (mapped to userspace) goto done, *errorp = 4 */

		if ((bp->b_flags & B_CACHE) == 0 || bp->b_flags & B_DIRTY) {
			*errorp = 5;
			goto done;
		}

		if (bp->b_bio2.bio_offset != dcp->data_offset) {
			error = VOP_BMAP(ip->vp, dcp->file_offset, &dooffset,
			    NULL, NULL, BUF_CMD_READ);
			if (error) {
				*errorp = 6;
				goto done;
			}

			if (dooffset != dcp->data_offset) {
				*errorp = 7;
				goto done;
			}
			hammer_live_dedup_bmap_saves++;
		}

		if (bcmp(data, bp->b_data, dcp->bytes) == 0)
			result = 1;

done:
		bqrelse(bp);
	} else {
		*errorp = 3;
	}
	vput(vp);

failed:
	hammer_rel_inode(ip, 0);
failed2:
	hammer_done_transaction(&trans);
	return (result);
}

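/*
 * Compare the candidate data against the on-media copy of the cached
 * block, read through the block device via hammer_bread_ext().
 * Returns non-zero on a match.
 */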
static __inline int
_dev_validate(hammer_dedup_cache_t dcp, void *data, int *errorp)
{
	hammer_buffer_t buffer = NULL;
	void *ondisk_data;
	int result, error;

	result = error = 0;
	*errorp = 0;

	ondisk_data = hammer_bread_ext(dcp->hmp, dcp->data_offset,
	    dcp->bytes, &error, &buffer);
	if (error) {
		*errorp = 1;
		goto failed;
	}

	if (bcmp(data, ondisk_data, dcp->bytes) == 0)
		result = 1;

failed:
	if (buffer)
		hammer_rel_buffer(buffer, 0);

	return (result);
}