xref: /dragonfly/sys/vfs/hammer/hammer_object.c (revision ce0e08e2)
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.97 2008/09/23 22:28:56 dillon Exp $
35  */
36 
37 #include "hammer.h"
38 
39 static int hammer_mem_lookup(hammer_cursor_t cursor);
40 static int hammer_mem_first(hammer_cursor_t cursor);
41 static int hammer_frontend_trunc_callback(hammer_record_t record,
42 				void *data __unused);
43 static int hammer_bulk_scan_callback(hammer_record_t record, void *data);
44 static int hammer_record_needs_overwrite_delete(hammer_record_t record);
45 static int hammer_delete_general(hammer_cursor_t cursor, hammer_inode_t ip,
46 		      hammer_btree_leaf_elm_t leaf);
47 
48 struct rec_trunc_info {
49 	u_int16_t	rec_type;
50 	int64_t		trunc_off;
51 };
52 
53 struct hammer_bulk_info {
54 	hammer_record_t record;
55 	struct hammer_btree_leaf_elm leaf;
56 };
57 
58 /*
59  * Red-black tree support.  Comparison code for insertion.
60  */
61 static int
62 hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
63 {
64 	if (rec1->leaf.base.rec_type < rec2->leaf.base.rec_type)
65 		return(-1);
66 	if (rec1->leaf.base.rec_type > rec2->leaf.base.rec_type)
67 		return(1);
68 
69 	if (rec1->leaf.base.key < rec2->leaf.base.key)
70 		return(-1);
71 	if (rec1->leaf.base.key > rec2->leaf.base.key)
72 		return(1);
73 
74 	/*
75 	 * Never match against an item deleted by the front-end.
76 	 *
77  * rec1 is greater than rec2 if rec1 is marked deleted.
78  * rec1 is less than rec2 if rec2 is marked deleted.
79  *
80  * Multiple deleted records may be present; do not return 0
81  * if both are marked deleted.
82 	 */
83 	if (rec1->flags & HAMMER_RECF_DELETED_FE)
84 		return(1);
85 	if (rec2->flags & HAMMER_RECF_DELETED_FE)
86 		return(-1);
87 
88 	return(0);
89 }
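
/*
 * For example (hypothetical caller; names and error handling are
 * illustrative only), a new in-memory record is ordered by the
 * comparator above when it is inserted into the inode's rec_tree:
 *
 *	record = hammer_alloc_mem_record(ip, 0);
 *	record->leaf.base.rec_type = HAMMER_RECTYPE_DATA;
 *	record->leaf.base.key = file_offset + bytes;
 *	RB_INSERT(hammer_rec_rb_tree, &ip->rec_tree, record);
 */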
90 
91 /*
92  * Basic record comparison code similar to hammer_btree_cmp().
93  */
94 static int
95 hammer_rec_cmp(hammer_base_elm_t elm, hammer_record_t rec)
96 {
97 	if (elm->rec_type < rec->leaf.base.rec_type)
98 		return(-3);
99 	if (elm->rec_type > rec->leaf.base.rec_type)
100 		return(3);
101 
102 	if (elm->key < rec->leaf.base.key)
103 		return(-2);
104 	if (elm->key > rec->leaf.base.key)
105 		return(2);
106 
107 	/*
108 	 * Never match against an item deleted by the front-end.
109 	 * elm is less than rec if rec is marked deleted.
110 	 */
111 	if (rec->flags & HAMMER_RECF_DELETED_FE)
112 		return(-1);
113 	return(0);
114 }
115 
116 /*
117  * Ranged scan to locate overlapping record(s).  This is used by
118  * hammer_ip_get_bulk() to locate an overlapping record.  We have
119  * to use a ranged scan because the keys for data records with the
120  * same file base offset can be different due to differing data_len's.
121  *
122  * NOTE: The base file offset of a data record is (key - data_len), not (key).
123  */
124 static int
125 hammer_rec_overlap_cmp(hammer_record_t rec, void *data)
126 {
127 	struct hammer_bulk_info *info = data;
128 	hammer_btree_leaf_elm_t leaf = &info->leaf;
129 
130 	if (rec->leaf.base.rec_type < leaf->base.rec_type)
131 		return(-3);
132 	if (rec->leaf.base.rec_type > leaf->base.rec_type)
133 		return(3);
134 
135 	/*
136 	 * Overlap compare
137 	 */
138 	if (leaf->base.rec_type == HAMMER_RECTYPE_DATA) {
139 		/* rec_beg >= leaf_end */
140 		if (rec->leaf.base.key - rec->leaf.data_len >= leaf->base.key)
141 			return(2);
142 		/* rec_end <= leaf_beg */
143 		if (rec->leaf.base.key <= leaf->base.key - leaf->data_len)
144 			return(-2);
145 	} else {
146 		if (rec->leaf.base.key < leaf->base.key)
147 			return(-2);
148 		if (rec->leaf.base.key > leaf->base.key)
149 			return(2);
150 	}
151 
152 	/*
153 	 * We have to return 0 at this point, even if DELETED_FE is set,
154 	 * because returning anything else will cause the scan to ignore
155 	 * one of the branches when we really want it to check both.
156 	 */
157 	return(0);
158 }
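
/*
 * Worked example of the overlap test above: a data record covering
 * file offsets [0x0000,0x4000) is keyed at 0x4000 with data_len
 * 0x4000, so its base offset is (key - data_len) == 0.  Against a
 * leaf covering [0x0000,0x4000), a record keyed at 0x8000 with
 * data_len 0x4000 begins at 0x4000 >= leaf_end and compares as 2
 * (no overlap), while a record keyed at 0x6000 with data_len 0x4000
 * begins at 0x2000 and compares as 0 (overlap).
 */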
159 
160 /*
161  * RB_SCAN comparison code for hammer_mem_first().  The argument order
162  * is reversed so the comparison result has to be negated.  key_beg and
163  * key_end are both range-inclusive.
164  *
165  * Localized deletions are not cached in-memory.
166  */
167 static
168 int
169 hammer_rec_scan_cmp(hammer_record_t rec, void *data)
170 {
171 	hammer_cursor_t cursor = data;
172 	int r;
173 
174 	r = hammer_rec_cmp(&cursor->key_beg, rec);
175 	if (r > 1)
176 		return(-1);
177 	r = hammer_rec_cmp(&cursor->key_end, rec);
178 	if (r < -1)
179 		return(1);
180 	return(0);
181 }
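
/*
 * e.g. a record sorting below key_beg makes hammer_rec_cmp() return
 * +2/+3, which we report as -1 (record is to the left of the range);
 * a record sorting above key_end yields -2/-3 and we report +1.  The
 * -1/0/+1 results are deliberately treated as in-range so records
 * marked DELETED_FE are filtered by the scan callback rather than
 * prematurely terminating the scan.
 */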
182 
183 /*
184  * This compare function is used when simply looking up key_beg.
185  */
186 static
187 int
188 hammer_rec_find_cmp(hammer_record_t rec, void *data)
189 {
190 	hammer_cursor_t cursor = data;
191 	int r;
192 
193 	r = hammer_rec_cmp(&cursor->key_beg, rec);
194 	if (r > 1)
195 		return(-1);
196 	if (r < -1)
197 		return(1);
198 	return(0);
199 }
200 
201 /*
202  * Locate blocks within the truncation range.  Partial blocks do not count.
203  */
204 static
205 int
206 hammer_rec_trunc_cmp(hammer_record_t rec, void *data)
207 {
208 	struct rec_trunc_info *info = data;
209 
210 	if (rec->leaf.base.rec_type < info->rec_type)
211 		return(-1);
212 	if (rec->leaf.base.rec_type > info->rec_type)
213 		return(1);
214 
215 	switch(rec->leaf.base.rec_type) {
216 	case HAMMER_RECTYPE_DB:
217 		/*
218 		 * DB record key is not beyond the truncation point, retain.
219 		 */
220 		if (rec->leaf.base.key < info->trunc_off)
221 			return(-1);
222 		break;
223 	case HAMMER_RECTYPE_DATA:
224 		/*
225 		 * DATA record offset start is not beyond the truncation point,
226 		 * retain.
227 		 */
228 		if (rec->leaf.base.key - rec->leaf.data_len < info->trunc_off)
229 			return(-1);
230 		break;
231 	default:
232 		panic("hammer_rec_trunc_cmp: unexpected record type");
233 	}
234 
235 	/*
236 	 * The record start is >= the truncation point, return match,
237 	 * the record should be destroyed.
238 	 */
239 	return(0);
240 }
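
/*
 * e.g. truncating a regular file at trunc_off 0x8000: a DATA record
 * keyed at 0x8000 with data_len 0x4000 starts at 0x4000, below the
 * truncation point, and is retained, while a DATA record keyed at
 * 0xC000 with data_len 0x4000 starts exactly at 0x8000 and matches
 * for destruction.
 */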
241 
242 RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
243 
244 /*
245  * Allocate a record for the caller to finish filling in.  The record is
246  * returned referenced.
247  */
248 hammer_record_t
249 hammer_alloc_mem_record(hammer_inode_t ip, int data_len)
250 {
251 	hammer_record_t record;
252 	hammer_mount_t hmp;
253 
254 	hmp = ip->hmp;
255 	++hammer_count_records;
256 	record = kmalloc(sizeof(*record), hmp->m_misc,
257 			 M_WAITOK | M_ZERO | M_USE_RESERVE);
258 	record->flush_state = HAMMER_FST_IDLE;
259 	record->ip = ip;
260 	record->leaf.base.btype = HAMMER_BTREE_TYPE_RECORD;
261 	record->leaf.data_len = data_len;
262 	hammer_ref(&record->lock);
263 
264 	if (data_len) {
265 		record->data = kmalloc(data_len, hmp->m_misc, M_WAITOK | M_ZERO);
266 		record->flags |= HAMMER_RECF_ALLOCDATA;
267 		++hammer_count_record_datas;
268 	}
269 
270 	return (record);
271 }
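
/*
 * Sketch of typical use (illustrative only; see hammer_ip_add_record()
 * and hammer_ip_add_directory() for real callers).  The returned
 * reference is eventually eaten by hammer_mem_add():
 *
 *	record = hammer_alloc_mem_record(ip, data_len);
 *	record->leaf.base.localization = ...;
 *	record->leaf.base.key = ...;
 *	record->leaf.base.rec_type = ...;
 *	...fill in record->data...
 *	error = hammer_mem_add(record);
 */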
272 
273 void
274 hammer_wait_mem_record_ident(hammer_record_t record, const char *ident)
275 {
276 	while (record->flush_state == HAMMER_FST_FLUSH) {
277 		record->flags |= HAMMER_RECF_WANTED;
278 		tsleep(record, 0, ident, 0);
279 	}
280 }
281 
282 /*
283  * Called from the backend, hammer_inode.c, after a record has been
284  * flushed to disk.  The record has been exclusively locked by the
285  * caller and interlocked with BE.
286  *
287  * We clean up the state, unlock, and release the record (the record
288  * was referenced by the fact that it was in the HAMMER_FST_FLUSH state).
289  */
290 void
291 hammer_flush_record_done(hammer_record_t record, int error)
292 {
293 	hammer_inode_t target_ip;
294 
295 	KKASSERT(record->flush_state == HAMMER_FST_FLUSH);
296 	KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE);
297 
298 	if (error) {
299 		/*
300 		 * An error occurred: the backend was unable to sync the
301 		 * record to its media.  Leave the record intact.
302 		 */
303 		hammer_critical_error(record->ip->hmp, record->ip, error,
304 				      "while flushing record");
305 	}
306 
307 	--record->flush_group->refs;
308 	record->flush_group = NULL;
309 
310 	if (record->flags & HAMMER_RECF_DELETED_BE) {
311 		if ((target_ip = record->target_ip) != NULL) {
312 			TAILQ_REMOVE(&target_ip->target_list, record,
313 				     target_entry);
314 			record->target_ip = NULL;
315 			hammer_test_inode(target_ip);
316 		}
317 		record->flush_state = HAMMER_FST_IDLE;
318 	} else {
319 		if (record->target_ip) {
320 			record->flush_state = HAMMER_FST_SETUP;
321 			hammer_test_inode(record->ip);
322 			hammer_test_inode(record->target_ip);
323 		} else {
324 			record->flush_state = HAMMER_FST_IDLE;
325 		}
326 	}
327 	record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
328 	if (record->flags & HAMMER_RECF_WANTED) {
329 		record->flags &= ~HAMMER_RECF_WANTED;
330 		wakeup(record);
331 	}
332 	hammer_rel_mem_record(record);
333 }
334 
335 /*
336  * Release a memory record.  Records marked for deletion are immediately
337  * removed from the RB-Tree but otherwise left intact until the last ref
338  * goes away.
339  */
340 void
341 hammer_rel_mem_record(struct hammer_record *record)
342 {
343 	hammer_mount_t hmp;
344 	hammer_reserve_t resv;
345 	hammer_inode_t ip;
346 	hammer_inode_t target_ip;
347 
348 	hammer_unref(&record->lock);
349 
350 	if (record->lock.refs == 0) {
351 		/*
352 		 * Upon release of the last reference wakeup any waiters.
353 		 * The record structure may get destroyed so callers will
354 		 * loop up and do a relookup.
355 		 *
356 		 * WARNING!  The record must be removed from the RB-Tree
357 		 * before anything that can block is called.
358 		 * hammer_test_inode() can block!
358 		 */
359 		ip = record->ip;
360 		hmp = ip->hmp;
361 
362 		/*
363 		 * Upon release of the last reference a record marked deleted
364 		 * is destroyed.
365 		 */
366 		if (record->flags & HAMMER_RECF_DELETED_FE) {
367 			KKASSERT(ip->lock.refs > 0);
368 			KKASSERT(record->flush_state != HAMMER_FST_FLUSH);
369 
370 			/*
371 			 * target_ip may have zero refs, we have to ref it
372 			 * to prevent it from being ripped out from under
373 			 * us.
374 			 */
375 			if ((target_ip = record->target_ip) != NULL) {
376 				TAILQ_REMOVE(&target_ip->target_list,
377 					     record, target_entry);
378 				record->target_ip = NULL;
379 				hammer_ref(&target_ip->lock);
380 			}
381 
382 			if (record->flags & HAMMER_RECF_ONRBTREE) {
383 				RB_REMOVE(hammer_rec_rb_tree,
384 					  &record->ip->rec_tree,
385 					  record);
386 				KKASSERT(ip->rsv_recs > 0);
387 				--hmp->rsv_recs;
388 				--ip->rsv_recs;
389 				hmp->rsv_databytes -= record->leaf.data_len;
390 				record->flags &= ~HAMMER_RECF_ONRBTREE;
391 
392 				if (RB_EMPTY(&record->ip->rec_tree)) {
393 					record->ip->flags &= ~HAMMER_INODE_XDIRTY;
394 					record->ip->sync_flags &= ~HAMMER_INODE_XDIRTY;
395 					hammer_test_inode(record->ip);
396 				}
397 			}
398 
399 			/*
400 			 * We must wait for any direct-IO to complete before
401 			 * we can destroy the record because the bio may
402 			 * have a reference to it.
403 			 */
404 			if (record->flags &
405 			   (HAMMER_RECF_DIRECT_IO | HAMMER_RECF_DIRECT_INVAL)) {
406 				hammer_io_direct_wait(record);
407 			}
408 
409 
410 			/*
411 			 * Do this test after removing the record from the
412 			 * RB-Tree.
412 			 */
413 			if (target_ip) {
414 				hammer_test_inode(target_ip);
415 				hammer_rel_inode(target_ip, 0);
416 			}
417 
418 			if (record->flags & HAMMER_RECF_ALLOCDATA) {
419 				--hammer_count_record_datas;
420 				kfree(record->data, hmp->m_misc);
421 				record->flags &= ~HAMMER_RECF_ALLOCDATA;
422 			}
423 
424 			/*
425 			 * Release the reservation.  If the record was not
426 			 * committed return the reservation before
427 			 * releasing it.
428 			 */
429 			if ((resv = record->resv) != NULL) {
430 #if 0
431 				if ((record->flags & HAMMER_RECF_COMMITTED) == 0) {
432 					hammer_blockmap_reserve_undo(
433 						hmp, resv,
434 						record->leaf.data_offset,
435 						record->leaf.data_len);
436 				}
437 #endif
438 				hammer_blockmap_reserve_complete(hmp, resv);
439 				record->resv = NULL;
440 			}
441 			record->data = NULL;
442 			--hammer_count_records;
443 			kfree(record, hmp->m_misc);
444 		}
445 	}
446 }
447 
448 /*
449  * Record visibility depends on whether the record is being accessed by
450  * the backend or the frontend.
451  *
452  * Return non-zero if the record is visible, zero if it isn't or if it is
453  * deleted.
454  *
455  * If HAMMER_CURSOR_DELETE_VISIBILITY is set we allow deleted memory
456  * records to be returned.  This is so pending deletions are detected
457  * when using an iterator to locate an unused hash key, or when we need
458  * to locate historical records on-disk to destroy.
459  */
460 static __inline
461 int
462 hammer_ip_iterate_mem_good(hammer_cursor_t cursor, hammer_record_t record)
463 {
464 	if (cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY)
465 		return(1);
466 	if (cursor->flags & HAMMER_CURSOR_BACKEND) {
467 		if (record->flags & HAMMER_RECF_DELETED_BE)
468 			return(0);
469 	} else {
470 		if (record->flags & HAMMER_RECF_DELETED_FE)
471 			return(0);
472 	}
473 	return(1);
474 }
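
/*
 * Visibility summary for the tests above:
 *
 *	cursor flags		DELETED_FE rec	DELETED_BE rec	live rec
 *	DELETE_VISIBILITY	visible		visible		visible
 *	BACKEND			visible		hidden		visible
 *	(frontend)		hidden		visible		visible
 */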
475 
476 /*
477  * This callback is used as part of the RB_SCAN function for in-memory
478  * records.  We terminate it (return -1) as soon as we get a match.
479  *
480  * This routine is used by frontend code.
481  *
482  * The primary compare code does not account for ASOF lookups.  This
483  * code handles that case as well as a few others.
484  */
485 static
486 int
487 hammer_rec_scan_callback(hammer_record_t rec, void *data)
488 {
489 	hammer_cursor_t cursor = data;
490 
491 	/*
492 	 * We terminate on success, so this should be NULL on entry.
493 	 */
494 	KKASSERT(cursor->iprec == NULL);
495 
496 	/*
497 	 * Skip if the record was marked deleted.
498 	 */
499 	if (hammer_ip_iterate_mem_good(cursor, rec) == 0)
500 		return(0);
501 
502 	/*
503 	 * Skip if not visible due to our as-of TID
504 	 */
505 	if (cursor->flags & HAMMER_CURSOR_ASOF) {
506 		if (cursor->asof < rec->leaf.base.create_tid)
507 			return(0);
508 		if (rec->leaf.base.delete_tid &&
509 		    cursor->asof >= rec->leaf.base.delete_tid) {
510 			return(0);
511 		}
512 	}
513 
514 	/*
515 	 * ref the record.  The record is protected from backend B-Tree
516 	 * interactions by virtue of the cursor's IP lock.
517 	 */
518 	hammer_ref(&rec->lock);
519 
520 	/*
521 	 * The record may have been deleted while we were blocked.
522 	 */
523 	if (hammer_ip_iterate_mem_good(cursor, rec) == 0) {
524 		hammer_rel_mem_record(rec);
525 		return(0);
526 	}
527 
528 	/*
529 	 * Set the matching record and stop the scan.
530 	 */
531 	cursor->iprec = rec;
532 	return(-1);
533 }
534 
535 
536 /*
537  * Lookup an in-memory record given the key specified in the cursor.  Works
538  * just like hammer_btree_lookup() but operates on an inode's in-memory
539  * record list.
540  *
541  * The lookup must fail if the record is marked for deferred deletion.
542  */
543 static
544 int
545 hammer_mem_lookup(hammer_cursor_t cursor)
546 {
547 	int error;
548 
549 	KKASSERT(cursor->ip);
550 	if (cursor->iprec) {
551 		hammer_rel_mem_record(cursor->iprec);
552 		cursor->iprec = NULL;
553 	}
554 	hammer_rec_rb_tree_RB_SCAN(&cursor->ip->rec_tree, hammer_rec_find_cmp,
555 				   hammer_rec_scan_callback, cursor);
556 
557 	if (cursor->iprec == NULL)
558 		error = ENOENT;
559 	else
560 		error = 0;
561 	return(error);
562 }
563 
564 /*
565  * hammer_mem_first() - locate the first in-memory record matching the
566  * cursor within the bounds of the key range.
567  */
568 static
569 int
570 hammer_mem_first(hammer_cursor_t cursor)
571 {
572 	hammer_inode_t ip;
573 
574 	ip = cursor->ip;
575 	KKASSERT(ip != NULL);
576 
577 	if (cursor->iprec) {
578 		hammer_rel_mem_record(cursor->iprec);
579 		cursor->iprec = NULL;
580 	}
581 
582 	hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
583 				   hammer_rec_scan_callback, cursor);
584 
585 	/*
586 	 * Adjust scan.node and keep it linked into the RB-tree so we can
587 	 * hold the cursor through third party modifications of the RB-tree.
588 	 */
589 	if (cursor->iprec)
590 		return(0);
591 	return(ENOENT);
592 }
593 
594 /************************************************************************
595  *		     HAMMER IN-MEMORY RECORD FUNCTIONS			*
596  ************************************************************************
597  *
598  * These functions manipulate in-memory records.  Such records typically
599  * exist prior to being committed to disk or indexed via the on-disk B-Tree.
600  */
601 
602 /*
603  * Add a directory entry (dip,ncp) which references inode (ip).
604  *
605  * Note that the low 32 bits of the namekey are set temporarily to create
606  * a unique in-memory record, and may be modified a second time when the
607  * record is synchronized to disk.  In particular, the low 32 bits cannot be
608  * all 0's when synching to disk, which is not handled here.
609  *
610  * NOTE: bytes does not include any terminating \0 on name, and name might
611  * not be terminated.
612  */
613 int
614 hammer_ip_add_directory(struct hammer_transaction *trans,
615 		     struct hammer_inode *dip, const char *name, int bytes,
616 		     struct hammer_inode *ip)
617 {
618 	struct hammer_cursor cursor;
619 	hammer_record_t record;
620 	int error;
621 	u_int32_t max_iterations;
622 
623 	record = hammer_alloc_mem_record(dip, HAMMER_ENTRY_SIZE(bytes));
624 
625 	record->type = HAMMER_MEM_RECORD_ADD;
626 	record->leaf.base.localization = dip->obj_localization +
627 					 HAMMER_LOCALIZE_MISC;
628 	record->leaf.base.obj_id = dip->obj_id;
629 	record->leaf.base.key = hammer_directory_namekey(dip, name, bytes,
630 							 &max_iterations);
631 	record->leaf.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
632 	record->leaf.base.obj_type = ip->ino_leaf.base.obj_type;
633 	record->data->entry.obj_id = ip->obj_id;
634 	record->data->entry.localization = ip->obj_localization;
635 	bcopy(name, record->data->entry.name, bytes);
636 
637 	++ip->ino_data.nlinks;
638 	hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
639 
640 	/*
641 	 * Find an unused namekey.  Both the in-memory record tree and
642 	 * the B-Tree are checked.  We do not want historically deleted
643 	 * names to create a collision as our iteration space may be limited,
644  * and since create_tid wouldn't match anyway, an ASOF search
645  * must be used to locate collisions.
646 	 *
647 	 * delete-visibility is set so pending deletions do not give us
648 	 * a false-negative on our ability to use an iterator.
649 	 *
650 	 * The iterator must not rollover the key.  Directory keys only
651 	 * use the positive key space.
652 	 */
653 	hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
654 	cursor.key_beg = record->leaf.base;
655 	cursor.flags |= HAMMER_CURSOR_ASOF;
656 	cursor.flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
657 	cursor.asof = ip->obj_asof;
658 
659 	while (hammer_ip_lookup(&cursor) == 0) {
660 		++record->leaf.base.key;
661 		KKASSERT(record->leaf.base.key > 0);
662 		cursor.key_beg.key = record->leaf.base.key;
663 		if (--max_iterations == 0) {
664 			hammer_rel_mem_record(record);
665 			error = ENOSPC;
666 			goto failed;
667 		}
668 	}
669 
670 	/*
671 	 * The target inode and the directory entry are bound together.
672 	 */
673 	record->target_ip = ip;
674 	record->flush_state = HAMMER_FST_SETUP;
675 	TAILQ_INSERT_TAIL(&ip->target_list, record, target_entry);
676 
677 	/*
678 	 * The inode now has a dependency and must be taken out of the idle
679 	 * state.  An inode not in an idle state is given an extra reference.
680 	 *
681 	 * When transitioning to a SETUP state, flag the inode for an
682 	 * automatic reflush once the dependencies are disposed of if
683 	 * someone is waiting on it.
684 	 */
685 	if (ip->flush_state == HAMMER_FST_IDLE) {
686 		hammer_ref(&ip->lock);
687 		ip->flush_state = HAMMER_FST_SETUP;
688 		if (ip->flags & HAMMER_INODE_FLUSHW)
689 			ip->flags |= HAMMER_INODE_REFLUSH;
690 	}
691 	error = hammer_mem_add(record);
692 	if (error == 0) {
693 		dip->ino_data.mtime = trans->time;
694 		hammer_modify_inode(dip, HAMMER_INODE_MTIME);
695 	}
696 failed:
697 	hammer_done_cursor(&cursor);
698 	return(error);
699 }
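
/*
 * e.g. if two names hash to the same upper-32-bit namekey the loop
 * above bumps the low 32 bits of the second entry's key until an
 * unused key is found, failing with ENOSPC only after max_iterations
 * candidate keys have been exhausted.
 */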
700 
701 /*
702  * Delete the directory entry and update the inode link count.  The
703  * cursor must be seeked to the directory entry record being deleted.
704  *
705  * The related inode should be share-locked by the caller.  The caller is
706  * on the frontend.
707  *
708  * This function can return EDEADLK, requiring the caller to terminate
709  * the cursor and any locks, wait on the returned record, and retry.
710  */
711 int
712 hammer_ip_del_directory(struct hammer_transaction *trans,
713 		     hammer_cursor_t cursor, struct hammer_inode *dip,
714 		     struct hammer_inode *ip)
715 {
716 	hammer_record_t record;
717 	int error;
718 
719 	if (hammer_cursor_inmem(cursor)) {
720 		/*
721 		 * In-memory (unsynchronized) records can simply be freed.
722 		 * Even though the HAMMER_RECF_DELETED_FE flag is ignored
723 		 * by the backend, we must still avoid races against the
724 		 * backend potentially syncing the record to the media.
725 		 *
726 		 * We cannot call hammer_ip_delete_record(), that routine may
727 		 * only be called from the backend.
728 		 */
729 		record = cursor->iprec;
730 		if (record->flags & HAMMER_RECF_INTERLOCK_BE) {
731 			KKASSERT(cursor->deadlk_rec == NULL);
732 			hammer_ref(&record->lock);
733 			cursor->deadlk_rec = record;
734 			error = EDEADLK;
735 		} else {
736 			KKASSERT(record->type == HAMMER_MEM_RECORD_ADD);
737 			record->flags |= HAMMER_RECF_DELETED_FE;
738 			error = 0;
739 		}
740 	} else {
741 		/*
742 		 * If the record is on-disk we have to queue the deletion by
743 		 * the record's key.  This also causes lookups to skip the
744 		 * record.
745 		 */
746 		KKASSERT(dip->flags &
747 			 (HAMMER_INODE_ONDISK | HAMMER_INODE_DONDISK));
748 		record = hammer_alloc_mem_record(dip, 0);
749 		record->type = HAMMER_MEM_RECORD_DEL;
750 		record->leaf.base = cursor->leaf->base;
751 
752 		record->target_ip = ip;
753 		record->flush_state = HAMMER_FST_SETUP;
754 		TAILQ_INSERT_TAIL(&ip->target_list, record, target_entry);
755 
756 		/*
757 		 * The inode now has a dependency and must be taken out of
758 		 * the idle state.  An inode not in an idle state is given
759 		 * an extra reference.
760 		 *
761 		 * When transitioning to a SETUP state, flag the inode for
762 		 * an automatic reflush once the dependencies are disposed
763 		 * of if someone is waiting on it.
764 		 */
765 		if (ip->flush_state == HAMMER_FST_IDLE) {
766 			hammer_ref(&ip->lock);
767 			ip->flush_state = HAMMER_FST_SETUP;
768 			if (ip->flags & HAMMER_INODE_FLUSHW)
769 				ip->flags |= HAMMER_INODE_REFLUSH;
770 		}
771 
772 		error = hammer_mem_add(record);
773 	}
774 
775 	/*
776 	 * One less link.  The file may still be open in the OS even after
777 	 * all links have gone away.
778 	 *
779 	 * We have to terminate the cursor before syncing the inode to
780 	 * avoid deadlocking against ourselves.  XXX this may no longer
781 	 * be true.
782 	 *
783 	 * If nlinks drops to zero and the vnode is inactive (or there is
784 	 * no vnode), call hammer_inode_unloadable_check() to zonk the
785 	 * inode.  If we don't do this here the inode will not be destroyed
786 	 * on-media until we unmount.
787 	 */
788 	if (error == 0) {
789 		--ip->ino_data.nlinks;
790 		dip->ino_data.mtime = trans->time;
791 		hammer_modify_inode(dip, HAMMER_INODE_MTIME);
792 		hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
793 		if (ip->ino_data.nlinks == 0 &&
794 		    (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) {
795 			hammer_done_cursor(cursor);
796 			hammer_inode_unloadable_check(ip, 1);
797 			hammer_flush_inode(ip, 0);
798 		}
799 
800 	}
801 	return(error);
802 }
803 
804 /*
805  * Add a record to an inode.
806  *
807  * The caller must allocate the record with hammer_alloc_mem_record(ip) and
808  * initialize the following additional fields:
809  *
810  * record->rec.entry.base.base.key
811  * record->rec.entry.base.base.rec_type
812  * record->rec.entry.base.base.data_len
813  * record->data		(a copy will be kmalloc'd if it cannot be embedded)
814  *
815  * The related inode should be share-locked by the caller.  The caller is
816  * on the frontend.
817  */
818 int
819 hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record)
820 {
821 	hammer_inode_t ip = record->ip;
822 	int error;
823 
824 	KKASSERT(record->leaf.base.localization != 0);
825 	record->leaf.base.obj_id = ip->obj_id;
826 	record->leaf.base.obj_type = ip->ino_leaf.base.obj_type;
827 	error = hammer_mem_add(record);
828 	return(error);
829 }
830 
831 /*
832  * Locate a bulk record in-memory.  Bulk records allow disk space to be
833  * reserved so the front-end can flush large data writes without having
834  * to queue the BIO to the flusher.  Only the related record gets queued
835  * to the flusher.
836  */
837 
838 static hammer_record_t
839 hammer_ip_get_bulk(hammer_inode_t ip, off_t file_offset, int bytes)
840 {
841 	struct hammer_bulk_info info;
842 
843 	bzero(&info, sizeof(info));
844 	info.leaf.base.obj_id = ip->obj_id;
845 	info.leaf.base.key = file_offset + bytes;
846 	info.leaf.base.create_tid = 0;
847 	info.leaf.base.delete_tid = 0;
848 	info.leaf.base.rec_type = HAMMER_RECTYPE_DATA;
849 	info.leaf.base.obj_type = 0;				/* unused */
850 	info.leaf.base.btype = HAMMER_BTREE_TYPE_RECORD;	/* unused */
851 	info.leaf.base.localization = ip->obj_localization +	/* unused */
852 				      HAMMER_LOCALIZE_MISC;
853 	info.leaf.data_len = bytes;
854 
855 	hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_overlap_cmp,
856 				   hammer_bulk_scan_callback, &info);
857 
858 	return(info.record);	/* may be NULL */
859 }
860 
861 /*
862  * Take records vetted by overlap_cmp.  The first non-deleted record
863  * (if any) stops the scan.
864  */
865 static int
866 hammer_bulk_scan_callback(hammer_record_t record, void *data)
867 {
868 	struct hammer_bulk_info *info = data;
869 
870 	if (record->flags & HAMMER_RECF_DELETED_FE)
871 		return(0);
872 	hammer_ref(&record->lock);
873 	info->record = record;
874 	return(-1);			/* stop scan */
875 }
876 
877 /*
878  * Reserve blockmap space placemarked with an in-memory record.
879  *
880  * This routine is called by the frontend in order to be able to directly
881  * flush a buffer cache buffer.  The frontend has locked the related buffer
882  * cache buffers and we should be able to manipulate any overlapping
883  * in-memory records.
884  *
885  * The caller is responsible for adding the returned record.
886  */
887 hammer_record_t
888 hammer_ip_add_bulk(hammer_inode_t ip, off_t file_offset, void *data, int bytes,
889 		   int *errorp)
890 {
891 	hammer_record_t record;
892 	hammer_record_t conflict;
893 	int zone;
894 
895 	/*
896 	 * Deal with conflicting in-memory records.  We cannot have multiple
897 	 * in-memory records for the same base offset without seriously
898 	 * confusing the backend, including but not limited to the backend
899 	 * issuing delete-create-delete or create-delete-create sequences
900 	 * and asserting on the delete_tid being the same as the create_tid.
901 	 *
902 	 * If we encounter a record with the backend interlock set we cannot
903 	 * immediately delete it without confusing the backend.
904 	 */
905 	while ((conflict = hammer_ip_get_bulk(ip, file_offset, bytes)) !=NULL) {
906 		if (conflict->flags & HAMMER_RECF_INTERLOCK_BE) {
907 			conflict->flags |= HAMMER_RECF_WANTED;
908 			tsleep(conflict, 0, "hmrrc3", 0);
909 		} else {
910 			conflict->flags |= HAMMER_RECF_DELETED_FE;
911 		}
912 		hammer_rel_mem_record(conflict);
913 	}
914 
915 	/*
916 	 * Create a record to cover the direct write.  This is called with
917 	 * the related BIO locked so there should be no possible conflict.
918 	 *
919 	 * The backend is responsible for finalizing the space reserved in
920 	 * this record.
921 	 *
922 	 * XXX bytes not aligned, depend on the reservation code to
923 	 * align the reservation.
924 	 */
925 	record = hammer_alloc_mem_record(ip, 0);
926 	zone = (bytes >= HAMMER_BUFSIZE) ? HAMMER_ZONE_LARGE_DATA_INDEX :
927 					   HAMMER_ZONE_SMALL_DATA_INDEX;
928 	record->resv = hammer_blockmap_reserve(ip->hmp, zone, bytes,
929 					       &record->leaf.data_offset,
930 					       errorp);
931 	if (record->resv == NULL) {
932 		kprintf("hammer_ip_add_bulk: reservation failed\n");
933 		hammer_rel_mem_record(record);
934 		return(NULL);
935 	}
936 	record->type = HAMMER_MEM_RECORD_DATA;
937 	record->leaf.base.rec_type = HAMMER_RECTYPE_DATA;
938 	record->leaf.base.obj_type = ip->ino_leaf.base.obj_type;
939 	record->leaf.base.obj_id = ip->obj_id;
940 	record->leaf.base.key = file_offset + bytes;
941 	record->leaf.base.localization = ip->obj_localization +
942 					 HAMMER_LOCALIZE_MISC;
943 	record->leaf.data_len = bytes;
944 	hammer_crc_set_leaf(data, &record->leaf);
945 	KKASSERT(*errorp == 0);
946 	return(record);
947 }
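
/*
 * Sketch of the intended frontend flow (simplified and illustrative;
 * in the real system the caller is the strategy write path and error
 * handling is omitted here):
 *
 *	record = hammer_ip_add_bulk(ip, file_offset, data, bytes, &error);
 *	if (record) {
 *		...direct-write the buffer against the reserved
 *		   record->leaf.data_offset...
 *		hammer_mem_add(record);
 *	}
 */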
948 
949 /*
950  * Frontend truncation code.  Scan in-memory records only.  On-disk records
951  * and records in a flushing state are handled by the backend.  The vnops
952  * setattr code will handle the block containing the truncation point.
953  *
954  * Partial blocks are not deleted.
955  */
956 int
957 hammer_ip_frontend_trunc(struct hammer_inode *ip, off_t file_size)
958 {
959 	struct rec_trunc_info info;
960 
961 	switch(ip->ino_data.obj_type) {
962 	case HAMMER_OBJTYPE_REGFILE:
963 		info.rec_type = HAMMER_RECTYPE_DATA;
964 		break;
965 	case HAMMER_OBJTYPE_DBFILE:
966 		info.rec_type = HAMMER_RECTYPE_DB;
967 		break;
968 	default:
969 		return(EINVAL);
970 	}
971 	info.trunc_off = file_size;
972 	hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_trunc_cmp,
973 				   hammer_frontend_trunc_callback, &info);
974 	return(0);
975 }
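
/*
 * e.g. truncating a regular file to 0x100000 destroys only in-memory
 * DATA records whose base offset is >= 0x100000.  The block straddling
 * the new EOF is left to the vnops setattr code, and records already
 * flushing or on-disk are left to the backend.
 */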
976 
977 static int
978 hammer_frontend_trunc_callback(hammer_record_t record, void *data __unused)
979 {
980 	if (record->flags & HAMMER_RECF_DELETED_FE)
981 		return(0);
982 	if (record->flush_state == HAMMER_FST_FLUSH)
983 		return(0);
984 	KKASSERT((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
985 	hammer_ref(&record->lock);
986 	record->flags |= HAMMER_RECF_DELETED_FE;
987 	hammer_rel_mem_record(record);
988 	return(0);
989 }
990 
991 /*
992  * Return 1 if the caller must check for and delete existing records
993  * before writing out a new data record.
994  *
995  * Return 0 if the caller can just insert the record into the B-Tree without
996  * checking.
997  */
998 static int
999 hammer_record_needs_overwrite_delete(hammer_record_t record)
1000 {
1001 	hammer_inode_t ip = record->ip;
1002 	int64_t file_offset;
1003 	int r;
1004 
1005 	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE)
1006 		file_offset = record->leaf.base.key;
1007 	else
1008 		file_offset = record->leaf.base.key - record->leaf.data_len;
1009 	r = (file_offset < ip->save_trunc_off);
1010 	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
1011 		if (ip->save_trunc_off <= record->leaf.base.key)
1012 			ip->save_trunc_off = record->leaf.base.key + 1;
1013 	} else {
1014 		if (ip->save_trunc_off < record->leaf.base.key)
1015 			ip->save_trunc_off = record->leaf.base.key;
1016 	}
1017 	return(r);
1018 }
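
/*
 * e.g. with save_trunc_off at 0x10000, a DATA record covering
 * [0xC000,0x10000) (key 0x10000) starts below the truncation point
 * and returns 1, forcing the overwrite-delete path; save_trunc_off
 * remains 0x10000, so a following record covering [0x10000,0x14000)
 * returns 0 and advances save_trunc_off to 0x14000.
 */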
1019 
1020 /*
1021  * Backend code.  Sync a record to the media.
1022  */
1023 int
1024 hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record)
1025 {
1026 	hammer_transaction_t trans = cursor->trans;
1027 	int64_t file_offset;
1028 	int bytes;
1029 	void *bdata;
1030 	int error;
1031 	int doprop;
1032 
1033 	KKASSERT(record->flush_state == HAMMER_FST_FLUSH);
1034 	KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE);
1035 	KKASSERT(record->leaf.base.localization != 0);
1036 
1037 	/*
1038 	 * Any direct-write related to the record must complete before we
1039 	 * can sync the record to the on-disk media.
1040 	 */
1041 	if (record->flags & (HAMMER_RECF_DIRECT_IO | HAMMER_RECF_DIRECT_INVAL))
1042 		hammer_io_direct_wait(record);
1043 
1044 	/*
1045 	 * If this is a bulk-data record placemarker there may be an existing
1046 	 * record on-disk, indicating a data overwrite.  If there is the
1047 	 * on-disk record must be deleted before we can insert our new record.
1048 	 *
1049 	 * We've synthesized this record and do not know what the create_tid
1050 	 * on-disk is, nor how much data it represents.
1051 	 *
1052 	 * Keep in mind that (key) for data records is (base_offset + len),
1053 	 * not (base_offset).  Also, we only want to get rid of on-disk
1054 	 * records since we are trying to sync our in-memory record; call
1055 	 * hammer_ip_delete_range() with truncating set to 1 to make sure
1056 	 * it skips in-memory records.
1057 	 *
1058 	 * It is ok for the lookup to return ENOENT.
1059 	 *
1060 	 * NOTE OPTIMIZATION: sync_trunc_off is used to determine if we have
1061 	 * to call hammer_ip_delete_range() or not.  This also means we must
1062 	 * update sync_trunc_off as we write.
1063 	 */
1064 	if (record->type == HAMMER_MEM_RECORD_DATA &&
1065 	    hammer_record_needs_overwrite_delete(record)) {
1066 		file_offset = record->leaf.base.key - record->leaf.data_len;
1067 		bytes = (record->leaf.data_len + HAMMER_BUFMASK) &
1068 			~HAMMER_BUFMASK;
1069 		KKASSERT((file_offset & HAMMER_BUFMASK) == 0);
1070 		error = hammer_ip_delete_range(
1071 				cursor, record->ip,
1072 				file_offset, file_offset + bytes - 1,
1073 				1);
1074 		if (error && error != ENOENT)
1075 			goto done;
1076 	}
1077 
1078 	/*
1079 	 * If this is a general record there may be an on-disk version
1080 	 * that must be deleted before we can insert the new record.
1081 	 */
1082 	if (record->type == HAMMER_MEM_RECORD_GENERAL) {
1083 		error = hammer_delete_general(cursor, record->ip,
1084 					      &record->leaf);
1085 		if (error && error != ENOENT)
1086 			goto done;
1087 	}
1088 
1089 	/*
1090 	 * Setup the cursor.
1091 	 */
1092 	hammer_normalize_cursor(cursor);
1093 	cursor->key_beg = record->leaf.base;
1094 	cursor->flags &= ~HAMMER_CURSOR_INITMASK;
1095 	cursor->flags |= HAMMER_CURSOR_BACKEND;
1096 	cursor->flags &= ~HAMMER_CURSOR_INSERT;
1097 
1098 	/*
1099 	 * Records can wind up on-media before the inode itself is on-media.
1100 	 * Flag the case.
1101 	 */
1102 	record->ip->flags |= HAMMER_INODE_DONDISK;
1103 
1104 	/*
1105 	 * If we are deleting a directory entry an exact match must be
1106 	 * found on-disk.
1107 	 */
1108 	if (record->type == HAMMER_MEM_RECORD_DEL) {
1109 		error = hammer_btree_lookup(cursor);
1110 		if (error == 0) {
1111 			KKASSERT(cursor->iprec == NULL);
1112 			error = hammer_ip_delete_record(cursor, record->ip,
1113 							trans->tid);
1114 			if (error == 0) {
1115 				record->flags |= HAMMER_RECF_DELETED_FE;
1116 				record->flags |= HAMMER_RECF_DELETED_BE;
1117 				record->flags |= HAMMER_RECF_COMMITTED;
1118 			}
1119 		}
1120 		goto done;
1121 	}
1122 
1123 	/*
1124 	 * We are inserting.
1125 	 *
1126 	 * Issue a lookup to position the cursor and locate the cluster.  The
1127 	 * target key should not exist.  If we are creating a directory entry
1128 	 * we may have to iterate the low 32 bits of the key to find an unused
1129 	 * key.
1130 	 */
1131 	hammer_sync_lock_sh(trans);
1132 	cursor->flags |= HAMMER_CURSOR_INSERT;
1133 	error = hammer_btree_lookup(cursor);
1134 	if (hammer_debug_inode)
1135 		kprintf("DOINSERT LOOKUP %d\n", error);
1136 	if (error == 0) {
1137 		kprintf("hammer_ip_sync_record: duplicate rec "
1138 			"at (%016llx)\n", record->leaf.base.key);
1139 		Debugger("duplicate record1");
1140 		error = EIO;
1141 	}
1142 #if 0
1143 	if (record->type == HAMMER_MEM_RECORD_DATA)
1144 		kprintf("sync_record  %016llx ---------------- %016llx %d\n",
1145 			record->leaf.base.key - record->leaf.data_len,
1146 			record->leaf.data_offset, error);
1147 #endif
1148 
1149 	if (error != ENOENT)
1150 		goto done_unlock;
1151 
1152 	/*
1153 	 * Allocate the record and data.  The result buffers will be
1154 	 * marked as being modified and further calls to
1155 	 * hammer_modify_buffer() will result in unneeded UNDO records.
1156 	 *
1157 	 * Support zero-fill records (data == NULL and data_len != 0)
1158 	 */
1159 	if (record->type == HAMMER_MEM_RECORD_DATA) {
1160 		/*
1161 		 * The data portion of a bulk-data record has already been
1162 		 * committed to disk, we need only adjust the layer2
1163 		 * statistics in the same transaction as our B-Tree insert.
1164 		 */
1165 		KKASSERT(record->leaf.data_offset != 0);
1166 		error = hammer_blockmap_finalize(trans,
1167 						 record->resv,
1168 						 record->leaf.data_offset,
1169 						 record->leaf.data_len);
1170 	} else if (record->data && record->leaf.data_len) {
1171 		/*
1172 		 * Wholly cached record, with data.  Allocate the data.
1173 		 */
1174 		bdata = hammer_alloc_data(trans, record->leaf.data_len,
1175 					  record->leaf.base.rec_type,
1176 					  &record->leaf.data_offset,
1177 					  &cursor->data_buffer, &error);
1178 		if (bdata == NULL)
1179 			goto done_unlock;
1180 		hammer_crc_set_leaf(record->data, &record->leaf);
1181 		hammer_modify_buffer(trans, cursor->data_buffer, NULL, 0);
1182 		bcopy(record->data, bdata, record->leaf.data_len);
1183 		hammer_modify_buffer_done(cursor->data_buffer);
1184 	} else {
1185 		/*
1186 		 * Wholly cached record, without data.
1187 		 */
1188 		record->leaf.data_offset = 0;
1189 		record->leaf.data_crc = 0;
1190 	}
1191 
1192 	error = hammer_btree_insert(cursor, &record->leaf, &doprop);
1193 	if (hammer_debug_inode && error)
1194 		kprintf("BTREE INSERT error %d @ %016llx:%d key %016llx\n", error, cursor->node->node_offset, cursor->index, record->leaf.base.key);
1195 
1196 	/*
1197 	 * Our record is on-disk, normally mark the in-memory version as
1198 	 * deleted.  If the record represented a directory deletion but
1199 	 * we had to sync a valid directory entry to disk we must convert
1200 	 * the record to a covering delete so the frontend does not have
1201 	 * visibility on the synced entry.
1202 	 */
1203 	if (error == 0) {
1204 		if (doprop) {
1205 			hammer_btree_do_propagation(cursor,
1206 						    record->ip->pfsm,
1207 						    &record->leaf);
1208 		}
1209 		if (record->flags & HAMMER_RECF_CONVERT_DELETE) {
1210 			KKASSERT(record->type == HAMMER_MEM_RECORD_ADD);
1211 			record->flags &= ~HAMMER_RECF_DELETED_FE;
1212 			record->type = HAMMER_MEM_RECORD_DEL;
1213 			KKASSERT(record->flush_state == HAMMER_FST_FLUSH);
1214 			record->flags &= ~HAMMER_RECF_CONVERT_DELETE;
1215 			/* hammer_flush_record_done takes care of the rest */
1216 		} else {
1217 			record->flags |= HAMMER_RECF_DELETED_FE;
1218 			record->flags |= HAMMER_RECF_DELETED_BE;
1219 		}
1220 		record->flags |= HAMMER_RECF_COMMITTED;
1221 	} else {
1222 		if (record->leaf.data_offset) {
1223 			hammer_blockmap_free(trans, record->leaf.data_offset,
1224 					     record->leaf.data_len);
1225 		}
1226 	}
1227 done_unlock:
1228 	hammer_sync_unlock(trans);
1229 done:
1230 	return(error);
1231 }
1232 
1233 /*
1234  * Add the record to the inode's rec_tree.  The low 32 bits of a directory
1235  * entry's key is used to deal with hash collisions in the upper 32 bits.
1236  * A unique 64 bit key is generated in-memory and may be regenerated a
1237  * second time when the directory record is flushed to the on-disk B-Tree.
1238  *
1239  * A referenced record is passed to this function.  This function
1240  * eats the reference.  If an error occurs the record will be deleted.
1241  *
1242  * A copy of the temporary record->data pointer provided by the caller
1243  * will be made.
1244  */
1245 int
1246 hammer_mem_add(hammer_record_t record)
1247 {
1248 	hammer_mount_t hmp = record->ip->hmp;
1249 
1250 	/*
1251 	 * Make a private copy of record->data
1252 	 */
1253 	if (record->data)
1254 		KKASSERT(record->flags & HAMMER_RECF_ALLOCDATA);
1255 
1256 	/*
1257 	 * Insert into the RB tree.  A unique key should have already
1258 	 * been selected if this is a directory entry.
1259 	 */
1260 	if (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
1261 		record->flags |= HAMMER_RECF_DELETED_FE;
1262 		hammer_rel_mem_record(record);
1263 		return (EEXIST);
1264 	}
1265 	++hmp->count_newrecords;
1266 	++hmp->rsv_recs;
1267 	++record->ip->rsv_recs;
1268 	record->ip->hmp->rsv_databytes += record->leaf.data_len;
1269 	record->flags |= HAMMER_RECF_ONRBTREE;
1270 	hammer_modify_inode(record->ip, HAMMER_INODE_XDIRTY);
1271 	hammer_rel_mem_record(record);
1272 	return(0);
1273 }
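
/*
 * Note for callers: the reference is eaten even on failure, so the
 * EEXIST path needs no cleanup, e.g.:
 *
 *	error = hammer_mem_add(record);
 *	if (error)
 *		return(error);	...record already released...
 */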
1274 
1275 /************************************************************************
1276  *		     HAMMER INODE MERGED-RECORD FUNCTIONS		*
1277  ************************************************************************
1278  *
1279  * These functions augment the B-Tree scanning functions in hammer_btree.c
1280  * by merging in-memory records with on-disk records.
1281  */
1282 
1283 /*
1284  * Locate a particular record either in-memory or on-disk.
1285  *
1286  * NOTE: This is basically a standalone routine, hammer_ip_next() may
1287  * NOT be called to iterate results.
1288  */
1289 int
1290 hammer_ip_lookup(hammer_cursor_t cursor)
1291 {
1292 	int error;
1293 
1294 	/*
1295 	 * If the element is in-memory, return it without searching the
1296 	 * on-disk B-Tree.
1297 	 */
1298 	KKASSERT(cursor->ip);
1299 	error = hammer_mem_lookup(cursor);
1300 	if (error == 0) {
1301 		cursor->leaf = &cursor->iprec->leaf;
1302 		return(error);
1303 	}
1304 	if (error != ENOENT)
1305 		return(error);
1306 
1307 	/*
1308 	 * If the inode has on-disk components search the on-disk B-Tree.
1309 	 */
1310 	if ((cursor->ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) == 0)
1311 		return(error);
1312 	error = hammer_btree_lookup(cursor);
1313 	if (error == 0)
1314 		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_LEAF);
1315 	return(error);
1316 }
1317 
1318 /*
1319  * Locate the first record within the cursor's key_beg/key_end range,
1320  * restricted to a particular inode.  0 is returned on success, ENOENT
1321  * if no records matched the requested range, or some other error.
1322  *
1323  * When 0 is returned hammer_ip_next() may be used to iterate additional
1324  * records within the requested range.
1325  *
1326  * This function can return EDEADLK, requiring the caller to terminate
1327  * the cursor and try again.
1328  */
1329 int
1330 hammer_ip_first(hammer_cursor_t cursor)
1331 {
1332 	hammer_inode_t ip = cursor->ip;
1333 	int error;
1334 
1335 	KKASSERT(ip != NULL);
1336 
1337 	/*
1338 	 * Clean up fields and setup for merged scan
1339 	 */
1340 	cursor->flags &= ~HAMMER_CURSOR_RETEST;
1341 	cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
1342 	cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
1343 	if (cursor->iprec) {
1344 		hammer_rel_mem_record(cursor->iprec);
1345 		cursor->iprec = NULL;
1346 	}
1347 
1348 	/*
1349 	 * Search the on-disk B-Tree.  hammer_btree_lookup() only does an
1350 	 * exact lookup so if we get ENOENT we have to call the iterate
1351 	 * function to validate the first record after the begin key.
1352 	 *
1353 	 * The ATEDISK flag is used by hammer_btree_iterate to determine
1354 	 * whether it must index forwards or not.  It is also used here
1355 	 * to select the next record from in-memory or on-disk.
1356 	 *
1357 	 * EDEADLK can only occur if the lookup hit an empty internal
1358 	 * element and couldn't delete it.  Since this could only occur
1359 	 * in-range, we can just iterate from the failure point.
1360 	 */
1361 	if (ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) {
1362 		error = hammer_btree_lookup(cursor);
1363 		if (error == ENOENT || error == EDEADLK) {
1364 			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1365 			if (hammer_debug_general & 0x2000)
1366 				kprintf("error %d node %p %016llx index %d\n", error, cursor->node, cursor->node->node_offset, cursor->index);
1367 			error = hammer_btree_iterate(cursor);
1368 		}
1369 		if (error && error != ENOENT)
1370 			return(error);
1371 		if (error == 0) {
1372 			cursor->flags &= ~HAMMER_CURSOR_DISKEOF;
1373 			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1374 		} else {
1375 			cursor->flags |= HAMMER_CURSOR_ATEDISK;
1376 		}
1377 	}
1378 
1379 	/*
1380 	 * Search the in-memory record list (Red-Black tree).  Unlike the
1381 	 * B-Tree search, mem_first checks for records in the range.
1382 	 */
1383 	error = hammer_mem_first(cursor);
1384 	if (error && error != ENOENT)
1385 		return(error);
1386 	if (error == 0) {
1387 		cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
1388 		cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
1389 		if (hammer_ip_iterate_mem_good(cursor, cursor->iprec) == 0)
1390 			cursor->flags |= HAMMER_CURSOR_ATEMEM;
1391 	}
1392 
1393 	/*
1394 	 * This will return the first matching record.
1395 	 */
1396 	return(hammer_ip_next(cursor));
1397 }
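
/*
 * Typical merged-scan loop (sketch; cursor setup and EDEADLK handling
 * omitted).  hammer_ip_delete_range() below follows this pattern:
 *
 *	error = hammer_ip_first(cursor);
 *	while (error == 0) {
 *		...inspect cursor->leaf, optionally calling
 *		   hammer_ip_resolve_data(cursor)...
 *		error = hammer_ip_next(cursor);
 *	}
 *	if (error == ENOENT)
 *		error = 0;
 */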
1398 
1399 /*
1400  * Retrieve the next record in a merged iteration within the bounds of the
1401  * cursor.  This call may be made multiple times after the cursor has been
1402  * initially searched with hammer_ip_first().
1403  *
1404  * 0 is returned on success, ENOENT if no further records match the
1405  * requested range, or some other error code.
1406  */
1407 int
1408 hammer_ip_next(hammer_cursor_t cursor)
1409 {
1410 	hammer_btree_elm_t elm;
1411 	hammer_record_t rec, save;
1412 	int error;
1413 	int r;
1414 
1415 next_btree:
1416 	/*
1417 	 * Load the current on-disk and in-memory record.  If we ate any
1418 	 * records we have to get the next one.
1419 	 *
1420 	 * If we deleted the last on-disk record we had scanned ATEDISK will
1421 	 * be clear and RETEST will be set, forcing a call to iterate.  The
1422 	 * fact that ATEDISK is clear causes iterate to re-test the 'current'
1423 	 * element.  If ATEDISK is set, iterate will skip the 'current'
1424 	 * element.
1425 	 *
1426 	 * Get the next on-disk record
1427 	 */
1428 	if (cursor->flags & (HAMMER_CURSOR_ATEDISK|HAMMER_CURSOR_RETEST)) {
1429 		if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
1430 			error = hammer_btree_iterate(cursor);
1431 			cursor->flags &= ~HAMMER_CURSOR_RETEST;
1432 			if (error == 0) {
1433 				cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1434 				hammer_cache_node(&cursor->ip->cache[1],
1435 						  cursor->node);
1436 			} else {
1437 				cursor->flags |= HAMMER_CURSOR_DISKEOF |
1438 						 HAMMER_CURSOR_ATEDISK;
1439 			}
1440 		}
1441 	}
1442 
1443 next_memory:
1444 	/*
1445 	 * Get the next in-memory record.
1446 	 *
1447 	 * hammer_rec_scan_cmp:  Is the record still in our general range,
1448 	 *			 (non-inclusive of snapshot exclusions)?
1449 	 * hammer_rec_scan_callback: Is the record in our snapshot?
1450 	 */
1451 	if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
1452 		if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
1453 			save = cursor->iprec;
1454 			cursor->iprec = NULL;
1455 			rec = save ? hammer_rec_rb_tree_RB_NEXT(save) : NULL;
1456 			while (rec) {
1457 				if (hammer_rec_scan_cmp(rec, cursor) != 0)
1458 					break;
1459 				if (hammer_rec_scan_callback(rec, cursor) != 0)
1460 					break;
1461 				rec = hammer_rec_rb_tree_RB_NEXT(rec);
1462 			}
1463 			if (save)
1464 				hammer_rel_mem_record(save);
1465 			if (cursor->iprec) {
1466 				KKASSERT(cursor->iprec == rec);
1467 				cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
1468 			} else {
1469 				cursor->flags |= HAMMER_CURSOR_MEMEOF;
1470 			}
1471 		}
1472 	}
1473 
1474 	/*
1475 	 * The memory record may have become stale while being held in
1476 	 * cursor->iprec.  We are interlocked against the backend
1477 	 * with regard to B-Tree entries.
1478 	 */
1479 	if ((cursor->flags & HAMMER_CURSOR_ATEMEM) == 0) {
1480 		if (hammer_ip_iterate_mem_good(cursor, cursor->iprec) == 0) {
1481 			cursor->flags |= HAMMER_CURSOR_ATEMEM;
1482 			goto next_memory;
1483 		}
1484 	}
1485 
1486 	/*
1487 	 * Extract either the disk or memory record depending on their
1488 	 * relative position.
1489 	 */
1490 	error = 0;
1491 	switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
1492 	case 0:
1493 		/*
1494 		 * Both entries valid.   Compare the entries and nominally
1495 		 * return the first one in the sort order.  Numerous cases
1496 		 * require special attention, however.
1497 		 */
1498 		elm = &cursor->node->ondisk->elms[cursor->index];
1499 		r = hammer_btree_cmp(&elm->base, &cursor->iprec->leaf.base);
1500 
1501 		/*
1502 		 * If the two entries differ only by their key (-2/2) or
1503 		 * create_tid (-1/1), and are DATA records, we may have a
1504 		 * nominal match.  We have to calculate the base file
1505 		 * offset of the data.
1506 		 */
1507 		if (r <= 2 && r >= -2 && r != 0 &&
1508 		    cursor->ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE &&
1509 		    cursor->iprec->type == HAMMER_MEM_RECORD_DATA) {
1510 			int64_t base1 = elm->leaf.base.key - elm->leaf.data_len;
1511 			int64_t base2 = cursor->iprec->leaf.base.key -
1512 					cursor->iprec->leaf.data_len;
1513 			if (base1 == base2)
1514 				r = 0;
1515 		}
1516 
1517 		if (r < 0) {
1518 			error = hammer_btree_extract(cursor,
1519 						     HAMMER_CURSOR_GET_LEAF);
1520 			cursor->flags |= HAMMER_CURSOR_ATEDISK;
1521 			break;
1522 		}
1523 
1524 		/*
1525 		 * If the entries match exactly the memory entry is either
1526 		 * an on-disk directory entry deletion or a bulk data
1527 		 * overwrite.  If it is a directory entry deletion we eat
1528 		 * both entries.
1529 		 *
1530 		 * For the bulk-data overwrite case it is possible to have
1531 		 * visibility into both, which simply means the syncer
1532 		 * hasn't gotten around to doing the delete+insert sequence
1533 		 * on the B-Tree.  Use the memory entry and throw away the
1534 		 * on-disk entry.
1535 		 *
1536 		 * If the in-memory record is not either of these we
1537 		 * probably caught the syncer while it was syncing it to
1538 		 * the media.  Since we hold a shared lock on the cursor,
1539 		 * the in-memory record had better be marked deleted at
1540 		 * this point.
1541 		 */
1542 		if (r == 0) {
1543 			if (cursor->iprec->type == HAMMER_MEM_RECORD_DEL) {
1544 				if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
1545 					cursor->flags |= HAMMER_CURSOR_ATEDISK;
1546 					cursor->flags |= HAMMER_CURSOR_ATEMEM;
1547 					goto next_btree;
1548 				}
1549 			} else if (cursor->iprec->type == HAMMER_MEM_RECORD_DATA) {
1550 				if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
1551 					cursor->flags |= HAMMER_CURSOR_ATEDISK;
1552 				}
1553 				/* fall through to memory entry */
1554 			} else {
1555 				panic("hammer_ip_next: duplicate mem/b-tree entry %p %d %08x", cursor->iprec, cursor->iprec->type, cursor->iprec->flags);
1556 				cursor->flags |= HAMMER_CURSOR_ATEMEM;
1557 				goto next_memory;
1558 			}
1559 		}
1560 		/* fall through to the memory entry */
1561 	case HAMMER_CURSOR_ATEDISK:
1562 		/*
1563 		 * Only the memory entry is valid.
1564 		 */
1565 		cursor->leaf = &cursor->iprec->leaf;
1566 		cursor->flags |= HAMMER_CURSOR_ATEMEM;
1567 
1568 		/*
1569 		 * If the memory entry is an on-disk deletion we should also
1570 		 * have found a B-Tree record.  If the backend beat us
1571 		 * to it, it would have interlocked the cursor and we should
1572 		 * have seen the in-memory record marked DELETED_FE.
1573 		 */
1574 		if (cursor->iprec->type == HAMMER_MEM_RECORD_DEL &&
1575 		    (cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
1576 			panic("hammer_ip_next: del-on-disk with no b-tree entry iprec %p flags %08x", cursor->iprec, cursor->iprec->flags);
1577 		}
1578 		break;
1579 	case HAMMER_CURSOR_ATEMEM:
1580 		/*
1581 		 * Only the disk entry is valid
1582 		 */
1583 		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_LEAF);
1584 		cursor->flags |= HAMMER_CURSOR_ATEDISK;
1585 		break;
1586 	default:
1587 		/*
1588 		 * Neither entry is valid
1589 		 *
1590 		 * XXX error not set properly
1591 		 */
1592 		cursor->leaf = NULL;
1593 		error = ENOENT;
1594 		break;
1595 	}
1596 	return(error);
1597 }
1598 
1599 /*
1600  * Resolve the cursor->data pointer for the current cursor position in
1601  * a merged iteration.
1602  */
1603 int
1604 hammer_ip_resolve_data(hammer_cursor_t cursor)
1605 {
1606 	hammer_record_t record;
1607 	int error;
1608 
1609 	if (hammer_cursor_inmem(cursor)) {
1610 		/*
1611 		 * The data associated with an in-memory record is usually
1612 		 * kmalloced, but reserve-ahead data records will have an
1613 		 * on-disk reference.
1614 		 *
1615 		 * NOTE: Reserve-ahead data records must be handled in the
1616 		 * context of the related high level buffer cache buffer
1617 		 * to interlock against async writes.
1618 		 */
1619 		record = cursor->iprec;
1620 		cursor->data = record->data;
1621 		error = 0;
1622 		if (cursor->data == NULL) {
1623 			KKASSERT(record->leaf.base.rec_type ==
1624 				 HAMMER_RECTYPE_DATA);
1625 			cursor->data = hammer_bread_ext(cursor->trans->hmp,
1626 						    record->leaf.data_offset,
1627 						    record->leaf.data_len,
1628 						    &error,
1629 						    &cursor->data_buffer);
1630 		}
1631 	} else {
1632 		cursor->leaf = &cursor->node->ondisk->elms[cursor->index].leaf;
1633 		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
1634 	}
1635 	return(error);
1636 }
1637 
1638 /*
1639  * Backend truncation / record replacement - delete records in range.
1640  *
1641  * Delete all records within the specified range for inode ip.  In-memory
1642  * records still associated with the frontend are ignored.
1643  *
1644  * If truncating is non-zero in-memory records associated with the back-end
1645  * are ignored.  If truncating is > 1 we can return EWOULDBLOCK.
1646  *
1647  * NOTES:
1648  *
1649  *	* An unaligned range will cause new records to be added to cover
1650  *        the edge cases. (XXX not implemented yet).
1651  *
1652  *	* Replacement via reservations (see hammer_ip_sync_record_cursor())
1653  *        also do not deal with unaligned ranges.
1654  *
1655  *	* ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
1656  *
1657  *	* Record keys for regular file data have to be special-cased since
1658  * 	  they indicate the end of the range (key = base + bytes).
1659  *
1660  *	* This function may be asked to delete ridiculously huge ranges, for
1661  *	  example if someone truncates or removes a 1TB regular file.  We
1662  *	  must be very careful on restarts and we may have to stop w/
1663  *	  EWOULDBLOCK to avoid blowing out the buffer cache.
1664  */
1665 int
1666 hammer_ip_delete_range(hammer_cursor_t cursor, hammer_inode_t ip,
1667 		       int64_t ran_beg, int64_t ran_end, int truncating)
1668 {
1669 	hammer_transaction_t trans = cursor->trans;
1670 	hammer_btree_leaf_elm_t leaf;
1671 	int error;
1672 	int64_t off;
1673 	int64_t tmp64;
1674 
1675 #if 0
1676 	kprintf("delete_range %p %016llx-%016llx\n", ip, ran_beg, ran_end);
1677 #endif
1678 
1679 	KKASSERT(trans->type == HAMMER_TRANS_FLS);
1680 retry:
1681 	hammer_normalize_cursor(cursor);
1682 	cursor->key_beg.localization = ip->obj_localization +
1683 				       HAMMER_LOCALIZE_MISC;
1684 	cursor->key_beg.obj_id = ip->obj_id;
1685 	cursor->key_beg.create_tid = 0;
1686 	cursor->key_beg.delete_tid = 0;
1687 	cursor->key_beg.obj_type = 0;
1688 
1689 	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
1690 		cursor->key_beg.key = ran_beg;
1691 		cursor->key_beg.rec_type = HAMMER_RECTYPE_DB;
1692 	} else {
1693 		/*
1694 		 * The key in the B-Tree is (base+bytes), so the first possible
1695 		 * matching key is ran_beg + 1.
1696 		 */
1697 		cursor->key_beg.key = ran_beg + 1;
1698 		cursor->key_beg.rec_type = HAMMER_RECTYPE_DATA;
1699 	}
1700 
1701 	cursor->key_end = cursor->key_beg;
1702 	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
1703 		cursor->key_end.key = ran_end;
1704 	} else {
1705 		tmp64 = ran_end + MAXPHYS + 1;	/* work around GCC-4 bug */
1706 		if (tmp64 < ran_end)
1707 			cursor->key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1708 		else
1709 			cursor->key_end.key = ran_end + MAXPHYS + 1;
1710 	}
1711 
1712 	cursor->asof = ip->obj_asof;
1713 	cursor->flags &= ~HAMMER_CURSOR_INITMASK;
1714 	cursor->flags |= HAMMER_CURSOR_ASOF;
1715 	cursor->flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
1716 	cursor->flags |= HAMMER_CURSOR_BACKEND;
1717 	cursor->flags |= HAMMER_CURSOR_END_INCLUSIVE;
1718 
1719 	error = hammer_ip_first(cursor);
1720 
1721 	/*
1722 	 * Iterate through matching records and mark them as deleted.
1723 	 */
1724 	while (error == 0) {
1725 		leaf = cursor->leaf;
1726 
1727 		KKASSERT(leaf->base.delete_tid == 0);
1728 		KKASSERT(leaf->base.obj_id == ip->obj_id);
1729 
1730 		/*
1731 		 * There may be overlap cases for regular file data.  Also
1732 		 * remember the key for a regular file record is (base + len),
1733 		 * NOT (base).
1734 		 *
1735 		 * Note that due to duplicates (mem & media) allowed by
1736 		 * DELETE_VISIBILITY, off can wind up less than ran_beg.
1737 		 */
1738 		if (leaf->base.rec_type == HAMMER_RECTYPE_DATA) {
1739 			off = leaf->base.key - leaf->data_len;
1740 			/*
1741 			 * Check the left edge case.  We currently do not
1742 			 * split existing records.
1743 			 */
1744 			if (off < ran_beg && leaf->base.key > ran_beg) {
1745 				panic("hammer left edge case %016llx %d\n",
1746 					leaf->base.key, leaf->data_len);
1747 			}
1748 
1749 			/*
1750 			 * Check the right edge case.  Note that the
1751 			 * record can be completely out of bounds, which
1752 			 * terminates the search.
1753 			 *
1754 			 * base->key is exclusive of the right edge while
1755 			 * ran_end is inclusive of the right edge.  The
1756 			 * (key - data_len) left boundary is inclusive.
1757 			 *
1758 			 * XXX theory-check this test at some point, are
1759 			 * we missing a + 1 somewhere?  Note that ran_end
1760 			 * could overflow.
1761 			 */
1762 			if (leaf->base.key - 1 > ran_end) {
1763 				if (leaf->base.key - leaf->data_len > ran_end)
1764 					break;
1765 				panic("hammer right edge case\n");
1766 			}
1767 		} else {
1768 			off = leaf->base.key;
1769 		}
1770 
1771 		/*
1772 		 * Delete the record.  When truncating we do not delete
1773 		 * in-memory (data) records because they represent data
1774 		 * written after the truncation.
1775 		 *
1776 		 * This will also physically destroy the B-Tree entry and
1777 		 * data if the retention policy dictates.  The function
1778 		 * will set HAMMER_CURSOR_RETEST to cause hammer_ip_next()
1779 		 * to retest the new 'current' element.
1780 		 */
1781 		if (truncating == 0 || hammer_cursor_ondisk(cursor)) {
1782 			error = hammer_ip_delete_record(cursor, ip, trans->tid);
1783 			/*
1784 			 * If we have built up too many meta-buffers we risk
1785 			 * deadlocking the kernel and must stop.  This can
1786 			 * occur when deleting ridiculously huge files.
1787 			 * sync_trunc_off is updated so the next cycle does
1788 			 * not re-iterate records we have already deleted.
1789 			 *
1790 			 * This is only done with formal truncations.
1791 			 */
1792 			if (truncating > 1 && error == 0 &&
1793 			    hammer_flusher_meta_limit(ip->hmp)) {
1794 				ip->sync_trunc_off = off;
1795 				error = EWOULDBLOCK;
1796 			}
1797 		}
1798 		if (error)
1799 			break;
1800 		ran_beg = off;	/* for restart */
1801 		error = hammer_ip_next(cursor);
1802 	}
1803 	if (cursor->node)
1804 		hammer_cache_node(&ip->cache[1], cursor->node);
1805 
1806 	if (error == EDEADLK) {
1807 		hammer_done_cursor(cursor);
1808 		error = hammer_init_cursor(trans, cursor, &ip->cache[1], ip);
1809 		if (error == 0)
1810 			goto retry;
1811 	}
1812 	if (error == ENOENT)
1813 		error = 0;
1814 	return(error);
1815 }
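
/*
 * Illustrative sketch, not compiled (wrapped in #if 0): a formal
 * truncation (truncating > 1) may stop early with EWOULDBLOCK once
 * hammer_flusher_meta_limit() trips.  ip->sync_trunc_off has already
 * been advanced past the deleted records, so the caller simply requeues
 * the inode and the next flush cycle resumes from there.  This fragment
 * is a simplification of the real logic in the inode sync path.
 */
#if 0
	error = hammer_ip_delete_range(cursor, ip, ip->sync_trunc_off,
				       0x7FFFFFFFFFFFFFFFLL, 2);
	if (error == EWOULDBLOCK) {
		/* requeue the inode; deletion resumes at sync_trunc_off */
		error = 0;
	}
#endif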
1816 
1817 /*
1818  * This backend function deletes the specified record on-disk, similar to
1819  * delete_range but for a specific record.  Unlike the exact deletions
1820  * used when deleting a directory entry this function uses an ASOF search
1821  * like delete_range.
1822  *
1823  * This function may be called with ip->obj_asof set for a slave snapshot,
1824  * so do not use ip->obj_asof.  We only delete non-historical records.
1825  */
1826 static int
1827 hammer_delete_general(hammer_cursor_t cursor, hammer_inode_t ip,
1828 		      hammer_btree_leaf_elm_t leaf)
1829 {
1830 	hammer_transaction_t trans = cursor->trans;
1831 	int error;
1832 
1833 	KKASSERT(trans->type == HAMMER_TRANS_FLS);
1834 retry:
1835 	hammer_normalize_cursor(cursor);
1836 	cursor->key_beg = leaf->base;
1837 	cursor->asof = HAMMER_MAX_TID;
1838 	cursor->flags &= ~HAMMER_CURSOR_INITMASK;
1839 	cursor->flags |= HAMMER_CURSOR_ASOF;
1840 	cursor->flags |= HAMMER_CURSOR_BACKEND;
1841 	cursor->flags &= ~HAMMER_CURSOR_INSERT;
1842 
1843 	error = hammer_btree_lookup(cursor);
1844 	if (error == 0) {
1845 		error = hammer_ip_delete_record(cursor, ip, trans->tid);
1846 	}
1847 	if (error == EDEADLK) {
1848 		hammer_done_cursor(cursor);
1849 		error = hammer_init_cursor(trans, cursor, &ip->cache[1], ip);
1850 		if (error == 0)
1851 			goto retry;
1852 	}
1853 	return(error);
1854 }
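
/*
 * Illustrative sketch, not compiled (wrapped in #if 0): callers hand
 * hammer_delete_general() a leaf element captured during an earlier
 * scan (e.g. from a bulk-scan callback); the function re-looks the
 * element up with an ASOF search before deleting it.  'saved_leaf' is
 * a placeholder name.
 */
#if 0
	struct hammer_btree_leaf_elm saved_leaf;

	/* saved_leaf filled in from a prior iteration */
	error = hammer_delete_general(cursor, ip, &saved_leaf);
#endif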
1855 
1856 /*
1857  * This function deletes remaining auxiliary records when an inode is
1858  * being deleted.  This function explicitly does not delete the
1859  * inode record, directory entry, data, or db records.  Those must be
1860  * properly disposed of prior to this call.
1861  */
1862 int
1863 hammer_ip_delete_clean(hammer_cursor_t cursor, hammer_inode_t ip, int *countp)
1864 {
1865 	hammer_transaction_t trans = cursor->trans;
1866 	hammer_btree_leaf_elm_t leaf;
1867 	int error;
1868 
1869 	KKASSERT(trans->type == HAMMER_TRANS_FLS);
1870 retry:
1871 	hammer_normalize_cursor(cursor);
1872 	cursor->key_beg.localization = ip->obj_localization +
1873 				       HAMMER_LOCALIZE_MISC;
1874 	cursor->key_beg.obj_id = ip->obj_id;
1875 	cursor->key_beg.create_tid = 0;
1876 	cursor->key_beg.delete_tid = 0;
1877 	cursor->key_beg.obj_type = 0;
1878 	cursor->key_beg.rec_type = HAMMER_RECTYPE_CLEAN_START;
1879 	cursor->key_beg.key = HAMMER_MIN_KEY;
1880 
1881 	cursor->key_end = cursor->key_beg;
1882 	cursor->key_end.rec_type = HAMMER_RECTYPE_MAX;
1883 	cursor->key_end.key = HAMMER_MAX_KEY;
1884 
1885 	cursor->asof = ip->obj_asof;
1886 	cursor->flags &= ~HAMMER_CURSOR_INITMASK;
1887 	cursor->flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1888 	cursor->flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
1889 	cursor->flags |= HAMMER_CURSOR_BACKEND;
1890 
1891 	error = hammer_ip_first(cursor);
1892 
1893 	/*
1894 	 * Iterate through matching records and mark them as deleted.
1895 	 */
1896 	while (error == 0) {
1897 		leaf = cursor->leaf;
1898 
1899 		KKASSERT(leaf->base.delete_tid == 0);
1900 
1901 		/*
1902 		 * Mark the record and B-Tree entry as deleted.  This will
1903 		 * also physically delete the B-Tree entry, record, and
1904 		 * data if the retention policy dictates.  The function
1905 		 * will set HAMMER_CURSOR_RETEST to cause hammer_ip_next()
1906 		 * to retest the new 'current' element.
1907 		 *
1908 		 * Directory entries (and delete-on-disk directory entries)
1909 		 * must be synced and cannot be deleted.
1910 		 */
1911 		error = hammer_ip_delete_record(cursor, ip, trans->tid);
1912 		++*countp;
1913 		if (error)
1914 			break;
1915 		error = hammer_ip_next(cursor);
1916 	}
1917 	if (cursor->node)
1918 		hammer_cache_node(&ip->cache[1], cursor->node);
1919 	if (error == EDEADLK) {
1920 		hammer_done_cursor(cursor);
1921 		error = hammer_init_cursor(trans, cursor, &ip->cache[1], ip);
1922 		if (error == 0)
1923 			goto retry;
1924 	}
1925 	if (error == ENOENT)
1926 		error = 0;
1927 	return(error);
1928 }
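
/*
 * Illustrative sketch, not compiled (wrapped in #if 0): the inode
 * destruction path disposes of data, db, directory, and inode records
 * first, then sweeps any remaining auxiliary records.  'count' is a
 * placeholder accumulating the number of records marked deleted.
 */
#if 0
	int count = 0;

	error = hammer_ip_delete_clean(cursor, ip, &count);
	if (error == 0 && count)
		kprintf("swept %d auxiliary records\n", count);
#endif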
1929 
1930 /*
1931  * Delete the record at the current cursor.  On success the cursor will
1932  * be positioned appropriately for an iteration but may no longer be at
1933  * a leaf node.
1934  *
1935  * This routine is only called from the backend.
1936  *
1937  * NOTE: This can return EDEADLK, requiring the caller to terminate the
1938  * cursor and retry.
1939  */
1940 int
1941 hammer_ip_delete_record(hammer_cursor_t cursor, hammer_inode_t ip,
1942 			hammer_tid_t tid)
1943 {
1944 	hammer_record_t iprec;
1945 	hammer_mount_t hmp;
1946 	int error;
1947 
1948 	KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND);
1949 	KKASSERT(tid != 0);
1950 	hmp = cursor->node->hmp;
1951 
1952 	/*
1953 	 * In-memory (unsynchronized) records can simply be freed.  This
1954 	 * only occurs in range iterations since all other records are
1955 	 * individually synchronized.  Thus there should be no confusion with
1956 	 * the interlock.
1957 	 *
1958 	 * An in-memory record may be deleted before being committed to disk,
1959 	 * but could have been accessed in the mean time.  The reservation
1960 	 * code will deal with the case.
1961 	 */
1962 	if (hammer_cursor_inmem(cursor)) {
1963 		iprec = cursor->iprec;
1964 	KKASSERT((iprec->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
1965 		iprec->flags |= HAMMER_RECF_DELETED_FE;
1966 		iprec->flags |= HAMMER_RECF_DELETED_BE;
1967 		return(0);
1968 	}
1969 
1970 	/*
1971 	 * On-disk records are marked as deleted by updating their delete_tid.
1972 	 * This does not affect their position in the B-Tree (which is based
1973 	 * on their create_tid).
1974 	 *
1975 	 * Frontend B-Tree operations track inodes so we tell
1976 	 * hammer_delete_at_cursor() not to.
1977 	 */
1978 	error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_LEAF);
1979 
1980 	if (error == 0) {
1981 		error = hammer_delete_at_cursor(
1982 				cursor,
1983 				HAMMER_DELETE_ADJUST | hammer_nohistory(ip),
1984 				cursor->trans->tid,
1985 				cursor->trans->time32,
1986 				0, NULL);
1987 	}
1988 	return(error);
1989 }
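
/*
 * Illustrative sketch, not compiled (wrapped in #if 0): on EDEADLK the
 * caller must tear the cursor down, rebuild it, reposition, and retry,
 * exactly as the range-deletion loops above do.
 */
#if 0
	error = hammer_ip_delete_record(cursor, ip, trans->tid);
	if (error == EDEADLK) {
		hammer_done_cursor(cursor);
		error = hammer_init_cursor(trans, cursor, &ip->cache[1], ip);
		/* reposition the cursor and retry the deletion */
	}
#endif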
1990 
1991 /*
1992  * Delete the B-Tree element at the current cursor and do any necessary
1993  * mirror propagation.
1994  *
1995  * The cursor must be properly positioned for an iteration on return but
1996  * may be pointing at an internal element.
1997  *
1998  * An element can be un-deleted by passing a delete_tid of 0 with
1999  * HAMMER_DELETE_ADJUST.
2000  */
2001 int
2002 hammer_delete_at_cursor(hammer_cursor_t cursor, int delete_flags,
2003 			hammer_tid_t delete_tid, u_int32_t delete_ts,
2004 			int track, int64_t *stat_bytes)
2005 {
2006 	struct hammer_btree_leaf_elm save_leaf;
2007 	hammer_transaction_t trans;
2008 	hammer_btree_leaf_elm_t leaf;
2009 	hammer_node_t node;
2010 	hammer_btree_elm_t elm;
2011 	hammer_off_t data_offset;
2012 	int32_t data_len;
2013 	u_int16_t rec_type;
2014 	int error;
2015 	int icount;
2016 	int doprop;
2017 
2018 	error = hammer_cursor_upgrade(cursor);
2019 	if (error)
2020 		return(error);
2021 
2022 	trans = cursor->trans;
2023 	node = cursor->node;
2024 	elm = &node->ondisk->elms[cursor->index];
2025 	leaf = &elm->leaf;
2026 	KKASSERT(elm->base.btype == HAMMER_BTREE_TYPE_RECORD);
2027 
2028 	hammer_sync_lock_sh(trans);
2029 	doprop = 0;
2030 	icount = 0;
2031 
2032 	/*
2033 	 * Adjust the delete_tid.  Update the mirror_tid propagation field
2034 	 * as well.  delete_tid can be 0 (undelete -- used by mirroring).
2035 	 */
2036 	if (delete_flags & HAMMER_DELETE_ADJUST) {
2037 		if (elm->base.rec_type == HAMMER_RECTYPE_INODE) {
2038 			if (elm->leaf.base.delete_tid == 0 && delete_tid)
2039 				icount = -1;
2040 			if (elm->leaf.base.delete_tid && delete_tid == 0)
2041 				icount = 1;
2042 		}
2043 
2044 		hammer_modify_node(trans, node, elm, sizeof(*elm));
2045 		elm->leaf.base.delete_tid = delete_tid;
2046 		elm->leaf.delete_ts = delete_ts;
2047 		hammer_modify_node_done(node);
2048 
2049 		if (elm->leaf.base.delete_tid > node->ondisk->mirror_tid) {
2050 			hammer_modify_node_field(trans, node, mirror_tid);
2051 			node->ondisk->mirror_tid = elm->leaf.base.delete_tid;
2052 			hammer_modify_node_done(node);
2053 			doprop = 1;
2054 			if (hammer_debug_general & 0x0002) {
2055 				kprintf("delete_at_cursor: propagate %016llx"
2056 					" @%016llx\n",
2057 					elm->leaf.base.delete_tid,
2058 					node->node_offset);
2059 			}
2060 		}
2061 
2062 		/*
2063 		 * Adjust for the iteration.  We have deleted the current
2064 		 * element and want to clear ATEDISK so the iteration does
2065 		 * not skip the element after, which now becomes the current
2066 		 * element.  This element must be re-tested if doing an
2067 		 * iteration, which is handled by the RETEST flag.
2068 		 */
2069 		if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
2070 			cursor->flags |= HAMMER_CURSOR_RETEST;
2071 			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
2072 		}
2073 
2074 		/*
2075 		 * An on-disk record cannot have the same delete_tid
2076 		 * as its create_tid.  In a chain of record updates
2077 		 * this could result in a duplicate record.
2078 		 */
2079 		KKASSERT(elm->leaf.base.delete_tid !=
2080 			 elm->leaf.base.create_tid);
2081 	}
2082 
2083 	/*
2084 	 * Destroy the B-Tree element if asked (typically if a nohistory
2085 	 * file or mount, or when called by the pruning code).
2086 	 *
2087 	 * Adjust the ATEDISK flag to properly support iterations.
2088 	 */
2089 	if (delete_flags & HAMMER_DELETE_DESTROY) {
2090 		data_offset = elm->leaf.data_offset;
2091 		data_len = elm->leaf.data_len;
2092 		rec_type = elm->leaf.base.rec_type;
2093 		if (doprop) {
2094 			save_leaf = elm->leaf;
2095 			leaf = &save_leaf;
2096 		}
2097 		if (elm->base.rec_type == HAMMER_RECTYPE_INODE &&
2098 		    elm->leaf.base.delete_tid == 0) {
2099 			icount = -1;
2100 		}
2101 
2102 		error = hammer_btree_delete(cursor);
2103 		if (error == 0) {
2104 			/*
2105 			 * The deletion moves the next element (if any) to
2106 			 * the current element position.  We must clear
2107 			 * ATEDISK so this element is not skipped and we
2108 			 * must set RETEST to force any iteration to re-test
2109 			 * the element.
2110 			 */
2111 			if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
2112 				cursor->flags |= HAMMER_CURSOR_RETEST;
2113 				cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
2114 			}
2115 		}
2116 		if (error == 0) {
2117 			switch(data_offset & HAMMER_OFF_ZONE_MASK) {
2118 			case HAMMER_ZONE_LARGE_DATA:
2119 			case HAMMER_ZONE_SMALL_DATA:
2120 			case HAMMER_ZONE_META:
2121 				hammer_blockmap_free(trans,
2122 						     data_offset, data_len);
2123 				break;
2124 			default:
2125 				break;
2126 			}
2127 		}
2128 	}
2129 
2130 	/*
2131 	 * Track inode count and next_tid.  This is used by the mirroring
2132 	 * and PFS code.  icount can be negative, zero, or positive.
2133 	 */
2134 	if (error == 0 && track) {
2135 		if (icount) {
2136 			hammer_modify_volume_field(trans, trans->rootvol,
2137 						   vol0_stat_inodes);
2138 			trans->rootvol->ondisk->vol0_stat_inodes += icount;
2139 			hammer_modify_volume_done(trans->rootvol);
2140 		}
2141 		if (trans->rootvol->ondisk->vol0_next_tid < delete_tid) {
2142 			hammer_modify_volume(trans, trans->rootvol, NULL, 0);
2143 			trans->rootvol->ondisk->vol0_next_tid = delete_tid;
2144 			hammer_modify_volume_done(trans->rootvol);
2145 		}
2146 	}
2147 
2148 	/*
2149 	 * mirror_tid propagation occurs if the node's mirror_tid had to be
2150 	 * updated while adjusting the delete_tid.
2151 	 *
2152 	 * This occurs when deleting even in nohistory mode, but does not
2153 	 * occur when pruning an already-deleted node.
2154 	 *
2155 	 * cursor->ip is NULL when called from the pruning, mirroring,
2156 	 * and pfs code.  If non-NULL propagation will be conditionalized
2157 	 * on whether the PFS is in no-history mode or not.
2158 	 */
2159 	if (doprop) {
2160 		if (cursor->ip)
2161 			hammer_btree_do_propagation(cursor, cursor->ip->pfsm, leaf);
2162 		else
2163 			hammer_btree_do_propagation(cursor, NULL, leaf);
2164 	}
2165 	hammer_sync_unlock(trans);
2166 	return (error);
2167 }
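
/*
 * Illustrative sketch, not compiled (wrapped in #if 0): as noted above,
 * HAMMER_DELETE_ADJUST with a delete_tid of 0 un-deletes an element
 * (used by the mirroring code), while HAMMER_DELETE_DESTROY physically
 * frees the element and its data (pruning, nohistory operation).  The
 * delete_tid/delete_ts values are placeholders.
 */
#if 0
	/* un-delete the current element */
	error = hammer_delete_at_cursor(cursor, HAMMER_DELETE_ADJUST,
					0, 0, 0, NULL);

	/* destroy the current element and free its data */
	error = hammer_delete_at_cursor(cursor, HAMMER_DELETE_DESTROY,
					delete_tid, delete_ts, 0, NULL);
#endif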
2168 
2169 /*
2170  * Determine whether we can remove a directory.  This routine checks whether
2171  * a directory is empty or not and enforces flush connectivity.
2172  *
2173  * Flush connectivity requires that we block if the target directory is
2174  * currently flushing, otherwise it may not end up in the same flush group.
2175  *
2176  * Returns 0 on success, ENOTEMPTY or EDEADLK (or other errors) on failure.
2177  */
2178 int
2179 hammer_ip_check_directory_empty(hammer_transaction_t trans, hammer_inode_t ip)
2180 {
2181 	struct hammer_cursor cursor;
2182 	int error;
2183 
2184 	/*
2185 	 * Check directory empty
2186 	 */
2187 	hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);
2188 
2189 	cursor.key_beg.localization = ip->obj_localization +
2190 				      HAMMER_LOCALIZE_MISC;
2191 	cursor.key_beg.obj_id = ip->obj_id;
2192 	cursor.key_beg.create_tid = 0;
2193 	cursor.key_beg.delete_tid = 0;
2194 	cursor.key_beg.obj_type = 0;
2195 	cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
2196 	cursor.key_beg.key = HAMMER_MIN_KEY;
2197 
2198 	cursor.key_end = cursor.key_beg;
2199 	cursor.key_end.rec_type = HAMMER_RECTYPE_MAX;
2200 	cursor.key_end.key = HAMMER_MAX_KEY;
2201 
2202 	cursor.asof = ip->obj_asof;
2203 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
2204 
2205 	error = hammer_ip_first(&cursor);
2206 	if (error == ENOENT)
2207 		error = 0;
2208 	else if (error == 0)
2209 		error = ENOTEMPTY;
2210 	hammer_done_cursor(&cursor);
2211 	return(error);
2212 }
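
/*
 * Illustrative sketch, not compiled (wrapped in #if 0): the rmdir path
 * performs this check before queueing the directory for deletion;
 * ENOTEMPTY propagates back to the caller and EDEADLK forces a retry
 * at a higher level.
 */
#if 0
	error = hammer_ip_check_directory_empty(trans, ip);
	if (error == 0) {
		/* directory is empty; safe to proceed with the removal */
	}
#endif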
2213 
2214