/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.97 2008/09/23 22:28:56 dillon Exp $
 */

#include "hammer.h"

static int hammer_mem_lookup(hammer_cursor_t cursor);
static int hammer_mem_first(hammer_cursor_t cursor);
static int hammer_frontend_trunc_callback(hammer_record_t record,
				void *data __unused);
static int hammer_bulk_scan_callback(hammer_record_t record, void *data);
static int hammer_record_needs_overwrite_delete(hammer_record_t record);
static int hammer_delete_general(hammer_cursor_t cursor, hammer_inode_t ip,
		      hammer_btree_leaf_elm_t leaf);

struct rec_trunc_info {
	u_int16_t	rec_type;
	int64_t		trunc_off;
};

struct hammer_bulk_info {
	hammer_record_t record;
	struct hammer_btree_leaf_elm leaf;
};

/*
 * Red-black tree support.  Comparison code for insertion.
 */
static int
hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
{
	if (rec1->leaf.base.rec_type < rec2->leaf.base.rec_type)
		return(-1);
	if (rec1->leaf.base.rec_type > rec2->leaf.base.rec_type)
		return(1);

	if (rec1->leaf.base.key < rec2->leaf.base.key)
		return(-1);
	if (rec1->leaf.base.key > rec2->leaf.base.key)
		return(1);

	/*
	 * Never match against an item deleted by the front-end.
	 *
	 * rec1 is greater than rec2 if rec1 is marked deleted.
	 * rec1 is less than rec2 if rec2 is marked deleted.
	 *
	 * Multiple deleted records may be present, do not return 0
	 * if both are marked deleted.
	 */
	if (rec1->flags & HAMMER_RECF_DELETED_FE)
		return(1);
	if (rec2->flags & HAMMER_RECF_DELETED_FE)
		return(-1);

	return(0);
}

/*
 * Basic record comparison code similar to hammer_btree_cmp().
 */
static int
hammer_rec_cmp(hammer_base_elm_t elm, hammer_record_t rec)
{
	if (elm->rec_type < rec->leaf.base.rec_type)
		return(-3);
	if (elm->rec_type > rec->leaf.base.rec_type)
		return(3);

	if (elm->key < rec->leaf.base.key)
		return(-2);
	if (elm->key > rec->leaf.base.key)
		return(2);

	/*
	 * Never match against an item deleted by the front-end.
	 * elm is less than rec if rec is marked deleted.
	 */
	if (rec->flags & HAMMER_RECF_DELETED_FE)
		return(-1);
	return(0);
}

/*
 * Ranged scan to locate overlapping record(s).  This is used by
 * hammer_ip_get_bulk() to locate an overlapping record.  We have
 * to use a ranged scan because the keys for data records with the
 * same file base offset can be different due to differing data_len's.
 *
 * NOTE: The base file offset of a data record is (key - data_len), not (key).
 */
static int
hammer_rec_overlap_cmp(hammer_record_t rec, void *data)
{
	struct hammer_bulk_info *info = data;
	hammer_btree_leaf_elm_t leaf = &info->leaf;

	if (rec->leaf.base.rec_type < leaf->base.rec_type)
		return(-3);
	if (rec->leaf.base.rec_type > leaf->base.rec_type)
		return(3);

	/*
	 * Overlap compare
	 */
	if (leaf->base.rec_type == HAMMER_RECTYPE_DATA) {
		/* rec_beg >= leaf_end */
		if (rec->leaf.base.key - rec->leaf.data_len >= leaf->base.key)
			return(2);
		/* rec_end <= leaf_beg */
		if (rec->leaf.base.key <= leaf->base.key - leaf->data_len)
			return(-2);
	} else {
		if (rec->leaf.base.key < leaf->base.key)
			return(-2);
		if (rec->leaf.base.key > leaf->base.key)
			return(2);
	}

	/*
	 * We have to return 0 at this point, even if DELETED_FE is set,
	 * because returning anything else will cause the scan to ignore
	 * one of the branches when we really want it to check both.
	 */
	return(0);
}
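
/*
 * Worked example for the overlap comparator above (illustrative only,
 * values assumed): a DATA record covering file offsets [0, 16384) is
 * stored with key 16384 and data_len 16384.  Probing with a leaf
 * describing [16384, 32768) (key 32768, data_len 16384): rec_beg (0)
 * >= leaf_end (32768)?  no; rec_end (16384) <= leaf_beg (16384)?  yes,
 * so the record sorts to the left (-2) and does not overlap.  Widening
 * the probe to [8192, 32768) makes both tests fail and the comparator
 * reports an overlap (0).
 */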

/*
 * RB_SCAN comparison code for hammer_mem_first().  The argument order
 * is reversed so the comparison result has to be negated.  key_beg and
 * key_end are both range-inclusive.
 *
 * Localized deletions are not cached in-memory.
 */
static
int
hammer_rec_scan_cmp(hammer_record_t rec, void *data)
{
	hammer_cursor_t cursor = data;
	int r;

	r = hammer_rec_cmp(&cursor->key_beg, rec);
	if (r > 1)
		return(-1);
	r = hammer_rec_cmp(&cursor->key_end, rec);
	if (r < -1)
		return(1);
	return(0);
}

/*
 * This compare function is used when simply looking up key_beg.
 */
static
int
hammer_rec_find_cmp(hammer_record_t rec, void *data)
{
	hammer_cursor_t cursor = data;
	int r;

	r = hammer_rec_cmp(&cursor->key_beg, rec);
	if (r > 1)
		return(-1);
	if (r < -1)
		return(1);
	return(0);
}

/*
 * Locate blocks within the truncation range.  Partial blocks do not count.
 */
static
int
hammer_rec_trunc_cmp(hammer_record_t rec, void *data)
{
	struct rec_trunc_info *info = data;

	if (rec->leaf.base.rec_type < info->rec_type)
		return(-1);
	if (rec->leaf.base.rec_type > info->rec_type)
		return(1);

	switch(rec->leaf.base.rec_type) {
	case HAMMER_RECTYPE_DB:
		/*
		 * DB record key is not beyond the truncation point, retain.
		 */
		if (rec->leaf.base.key < info->trunc_off)
			return(-1);
		break;
	case HAMMER_RECTYPE_DATA:
		/*
		 * DATA record offset start is not beyond the truncation point,
		 * retain.
		 */
		if (rec->leaf.base.key - rec->leaf.data_len < info->trunc_off)
			return(-1);
		break;
	default:
		panic("hammer_rec_trunc_cmp: unexpected record type");
	}

	/*
	 * The record start is >= the truncation point, return match,
	 * the record should be destroyed.
	 */
	return(0);
}
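
/*
 * Worked example for the truncation comparator above (illustrative
 * only, values assumed): with trunc_off = 65536, a DATA record with
 * key 131072 and data_len 65536 starts at 65536, which is not below
 * the truncation point, so it matches and is destroyed.  A DATA record
 * with key 98304 and data_len 65536 starts at 32768 and is retained;
 * the partial block spanning the truncation point is handled by the
 * vnops setattr code, not here.
 */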

RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);

/*
 * Allocate a record for the caller to finish filling in.  The record is
 * returned referenced.
 */
hammer_record_t
hammer_alloc_mem_record(hammer_inode_t ip, int data_len)
{
	hammer_record_t record;
	hammer_mount_t hmp;

	hmp = ip->hmp;
	++hammer_count_records;
	record = kmalloc(sizeof(*record), hmp->m_misc,
			 M_WAITOK | M_ZERO | M_USE_RESERVE);
	record->flush_state = HAMMER_FST_IDLE;
	record->ip = ip;
	record->leaf.base.btype = HAMMER_BTREE_TYPE_RECORD;
	record->leaf.data_len = data_len;
	hammer_ref(&record->lock);

	if (data_len) {
		record->data = kmalloc(data_len, hmp->m_misc, M_WAITOK | M_ZERO);
		record->flags |= HAMMER_RECF_ALLOCDATA;
		++hammer_count_record_datas;
	}

	return (record);
}

void
hammer_wait_mem_record_ident(hammer_record_t record, const char *ident)
{
	while (record->flush_state == HAMMER_FST_FLUSH) {
		record->flags |= HAMMER_RECF_WANTED;
		tsleep(record, 0, ident, 0);
	}
}
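
/*
 * NOTE: Callers typically reach the function above via a
 * hammer_wait_mem_record() convenience wrapper (see hammer.h) that
 * supplies a default tsleep identifier.  The loop re-tests flush_state
 * after waking because the record may have been placed back into a
 * flush group in the meantime.
 */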

/*
 * Called from the backend, hammer_inode.c, after a record has been
 * flushed to disk.  The record has been exclusively locked by the
 * caller and interlocked with BE.
 *
 * We clean up the state, unlock, and release the record (the record
 * was referenced by the fact that it was in the HAMMER_FST_FLUSH state).
 */
void
hammer_flush_record_done(hammer_record_t record, int error)
{
	hammer_inode_t target_ip;

	KKASSERT(record->flush_state == HAMMER_FST_FLUSH);
	KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE);

	if (error) {
		/*
		 * An error occurred, the backend was unable to sync the
		 * record to its media.  Leave the record intact.
		 */
		hammer_critical_error(record->ip->hmp, record->ip, error,
				      "while flushing record");
	}

	--record->flush_group->refs;
	record->flush_group = NULL;

	if (record->flags & HAMMER_RECF_DELETED_BE) {
		if ((target_ip = record->target_ip) != NULL) {
			TAILQ_REMOVE(&target_ip->target_list, record,
				     target_entry);
			record->target_ip = NULL;
			hammer_test_inode(target_ip);
		}
		record->flush_state = HAMMER_FST_IDLE;
	} else {
		if (record->target_ip) {
			record->flush_state = HAMMER_FST_SETUP;
			hammer_test_inode(record->ip);
			hammer_test_inode(record->target_ip);
		} else {
			record->flush_state = HAMMER_FST_IDLE;
		}
	}
	record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
	if (record->flags & HAMMER_RECF_WANTED) {
		record->flags &= ~HAMMER_RECF_WANTED;
		wakeup(record);
	}
	hammer_rel_mem_record(record);
}

/*
 * Release a memory record.  Records marked for deletion are immediately
 * removed from the RB-Tree but otherwise left intact until the last ref
 * goes away.
 */
void
hammer_rel_mem_record(struct hammer_record *record)
{
	hammer_mount_t hmp;
	hammer_reserve_t resv;
	hammer_inode_t ip;
	hammer_inode_t target_ip;

	hammer_unref(&record->lock);

	if (record->lock.refs == 0) {
		/*
		 * Upon release of the last reference wakeup any waiters.
		 * The record structure may get destroyed so callers will
		 * loop up and do a relookup.
		 *
		 * WARNING!  Record must be removed from RB-TREE before we
		 * might possibly block.  hammer_test_inode() can block!
		 */
		ip = record->ip;
		hmp = ip->hmp;

		/*
		 * Upon release of the last reference a record marked deleted
		 * is destroyed.
		 */
		if (record->flags & HAMMER_RECF_DELETED_FE) {
			KKASSERT(ip->lock.refs > 0);
			KKASSERT(record->flush_state != HAMMER_FST_FLUSH);

			/*
			 * target_ip may have zero refs, we have to ref it
			 * to prevent it from being ripped out from under
			 * us.
			 */
			if ((target_ip = record->target_ip) != NULL) {
				TAILQ_REMOVE(&target_ip->target_list,
					     record, target_entry);
				record->target_ip = NULL;
				hammer_ref(&target_ip->lock);
			}

			if (record->flags & HAMMER_RECF_ONRBTREE) {
				RB_REMOVE(hammer_rec_rb_tree,
					  &record->ip->rec_tree,
					  record);
				KKASSERT(ip->rsv_recs > 0);
				--hmp->rsv_recs;
				--ip->rsv_recs;
				hmp->rsv_databytes -= record->leaf.data_len;
				record->flags &= ~HAMMER_RECF_ONRBTREE;

				if (RB_EMPTY(&record->ip->rec_tree)) {
					record->ip->flags &= ~HAMMER_INODE_XDIRTY;
					record->ip->sync_flags &= ~HAMMER_INODE_XDIRTY;
					hammer_test_inode(record->ip);
				}
			}

			/*
			 * We must wait for any direct-IO to complete before
			 * we can destroy the record because the bio may
			 * have a reference to it.
			 */
			if (record->flags &
			   (HAMMER_RECF_DIRECT_IO | HAMMER_RECF_DIRECT_INVAL)) {
				hammer_io_direct_wait(record);
			}

			/*
			 * Do this test after removing record from the B-Tree.
			 */
			if (target_ip) {
				hammer_test_inode(target_ip);
				hammer_rel_inode(target_ip, 0);
			}

			if (record->flags & HAMMER_RECF_ALLOCDATA) {
				--hammer_count_record_datas;
				kfree(record->data, hmp->m_misc);
				record->flags &= ~HAMMER_RECF_ALLOCDATA;
			}

			/*
			 * Release the reservation.  If the record was not
			 * committed return the reservation before
			 * releasing it.
			 */
			if ((resv = record->resv) != NULL) {
#if 0
				if ((record->flags & HAMMER_RECF_COMMITTED) == 0) {
					hammer_blockmap_reserve_undo(
						hmp, resv,
						record->leaf.data_offset,
						record->leaf.data_len);
				}
#endif
				hammer_blockmap_reserve_complete(hmp, resv);
				record->resv = NULL;
			}
			record->data = NULL;
			--hammer_count_records;
			kfree(record, hmp->m_misc);
		}
	}
}

/*
 * Record visibility depends on whether the record is being accessed by
 * the backend or the frontend.
 *
 * Return non-zero if the record is visible, zero if it isn't or if it is
 * deleted.
 *
 * If HAMMER_CURSOR_DELETE_VISIBILITY is set we allow deleted memory
 * records to be returned.  This is so pending deletions are detected
 * when using an iterator to locate an unused hash key, or when we need
 * to locate historical records on-disk to destroy.
 */
static __inline
int
hammer_ip_iterate_mem_good(hammer_cursor_t cursor, hammer_record_t record)
{
	if (cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY)
		return(1);
	if (cursor->flags & HAMMER_CURSOR_BACKEND) {
		if (record->flags & HAMMER_RECF_DELETED_BE)
			return(0);
	} else {
		if (record->flags & HAMMER_RECF_DELETED_FE)
			return(0);
	}
	return(1);
}
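
/*
 * Visibility summary for the helper above, derived from the tests it
 * performs:
 *
 *	cursor flags		record flags		visible
 *	DELETE_VISIBILITY	(any)			yes
 *	BACKEND			DELETED_BE		no
 *	BACKEND			!DELETED_BE		yes
 *	(frontend)		DELETED_FE		no
 *	(frontend)		!DELETED_FE		yes
 */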

/*
 * This callback is used as part of the RB_SCAN function for in-memory
 * records.  We terminate it (return -1) as soon as we get a match.
 *
 * This routine is used by frontend code.
 *
 * The primary compare code does not account for ASOF lookups.  This
 * code handles that case as well as a few others.
 */
static
int
hammer_rec_scan_callback(hammer_record_t rec, void *data)
{
	hammer_cursor_t cursor = data;

	/*
	 * We terminate on success, so this should be NULL on entry.
	 */
	KKASSERT(cursor->iprec == NULL);

	/*
	 * Skip if the record was marked deleted.
	 */
	if (hammer_ip_iterate_mem_good(cursor, rec) == 0)
		return(0);

	/*
	 * Skip if not visible due to our as-of TID
	 */
	if (cursor->flags & HAMMER_CURSOR_ASOF) {
		if (cursor->asof < rec->leaf.base.create_tid)
			return(0);
		if (rec->leaf.base.delete_tid &&
		    cursor->asof >= rec->leaf.base.delete_tid) {
			return(0);
		}
	}

	/*
	 * ref the record.  The record is protected from backend B-Tree
	 * interactions by virtue of the cursor's IP lock.
	 */
	hammer_ref(&rec->lock);

	/*
	 * The record may have been deleted while we were blocked.
	 */
	if (hammer_ip_iterate_mem_good(cursor, rec) == 0) {
		hammer_rel_mem_record(rec);
		return(0);
	}

	/*
	 * Set the matching record and stop the scan.
	 */
	cursor->iprec = rec;
	return(-1);
}

/*
 * Lookup an in-memory record given the key specified in the cursor.  Works
 * just like hammer_btree_lookup() but operates on an inode's in-memory
 * record list.
 *
 * The lookup must fail if the record is marked for deferred deletion.
 */
static
int
hammer_mem_lookup(hammer_cursor_t cursor)
{
	int error;

	KKASSERT(cursor->ip);
	if (cursor->iprec) {
		hammer_rel_mem_record(cursor->iprec);
		cursor->iprec = NULL;
	}
	hammer_rec_rb_tree_RB_SCAN(&cursor->ip->rec_tree, hammer_rec_find_cmp,
				   hammer_rec_scan_callback, cursor);

	if (cursor->iprec == NULL)
		error = ENOENT;
	else
		error = 0;
	return(error);
}

/*
 * hammer_mem_first() - locate the first in-memory record matching the
 * cursor within the bounds of the key range.
 */
static
int
hammer_mem_first(hammer_cursor_t cursor)
{
	hammer_inode_t ip;

	ip = cursor->ip;
	KKASSERT(ip != NULL);

	if (cursor->iprec) {
		hammer_rel_mem_record(cursor->iprec);
		cursor->iprec = NULL;
	}

	hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
				   hammer_rec_scan_callback, cursor);

	/*
	 * Adjust scan.node and keep it linked into the RB-tree so we can
	 * hold the cursor through third party modifications of the RB-tree.
	 */
	if (cursor->iprec)
		return(0);
	return(ENOENT);
}

/************************************************************************
 *		     HAMMER IN-MEMORY RECORD FUNCTIONS			*
 ************************************************************************
 *
 * These functions manipulate in-memory records.  Such records typically
 * exist prior to being committed to disk or indexed via the on-disk B-Tree.
 */

/*
 * Add a directory entry (dip,ncp) which references inode (ip).
 *
 * Note that the low 32 bits of the namekey are set temporarily to create
 * a unique in-memory record, and may be modified a second time when the
 * record is synchronized to disk.  In particular, the low 32 bits cannot be
 * all 0's when syncing to disk, which is not handled here.
 *
 * NOTE: bytes does not include any terminating \0 on name, and name might
 * not be terminated.
 */
int
hammer_ip_add_directory(struct hammer_transaction *trans,
		     struct hammer_inode *dip, const char *name, int bytes,
		     struct hammer_inode *ip)
{
	struct hammer_cursor cursor;
	hammer_record_t record;
	int error;
	u_int32_t max_iterations;

	record = hammer_alloc_mem_record(dip, HAMMER_ENTRY_SIZE(bytes));

	record->type = HAMMER_MEM_RECORD_ADD;
	record->leaf.base.localization = dip->obj_localization +
					 HAMMER_LOCALIZE_MISC;
	record->leaf.base.obj_id = dip->obj_id;
	record->leaf.base.key = hammer_directory_namekey(dip, name, bytes,
							 &max_iterations);
	record->leaf.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
	record->leaf.base.obj_type = ip->ino_leaf.base.obj_type;
	record->data->entry.obj_id = ip->obj_id;
	record->data->entry.localization = ip->obj_localization;
	bcopy(name, record->data->entry.name, bytes);

	++ip->ino_data.nlinks;
	hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);

	/*
	 * Find an unused namekey.  Both the in-memory record tree and
	 * the B-Tree are checked.  We do not want historically deleted
	 * names to create a collision as our iteration space may be limited,
	 * and since create_tid wouldn't match anyway an ASOF search
	 * must be used to locate collisions.
	 *
	 * delete-visibility is set so pending deletions do not give us
	 * a false-negative on our ability to use an iterator.
	 *
	 * The iterator must not rollover the key.  Directory keys only
	 * use the positive key space.
	 */
	hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg = record->leaf.base;
	cursor.flags |= HAMMER_CURSOR_ASOF;
	cursor.flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
	cursor.asof = ip->obj_asof;

	while (hammer_ip_lookup(&cursor) == 0) {
		++record->leaf.base.key;
		KKASSERT(record->leaf.base.key > 0);
		cursor.key_beg.key = record->leaf.base.key;
		if (--max_iterations == 0) {
			hammer_rel_mem_record(record);
			error = ENOSPC;
			goto failed;
		}
	}

	/*
	 * The target inode and the directory entry are bound together.
	 */
	record->target_ip = ip;
	record->flush_state = HAMMER_FST_SETUP;
	TAILQ_INSERT_TAIL(&ip->target_list, record, target_entry);

	/*
	 * The inode now has a dependency and must be taken out of the idle
	 * state.  An inode not in an idle state is given an extra reference.
	 *
	 * When transitioning to a SETUP state flag for an automatic reflush
	 * when the dependencies are disposed of if someone is waiting on
	 * the inode.
	 */
	if (ip->flush_state == HAMMER_FST_IDLE) {
		hammer_ref(&ip->lock);
		ip->flush_state = HAMMER_FST_SETUP;
		if (ip->flags & HAMMER_INODE_FLUSHW)
			ip->flags |= HAMMER_INODE_REFLUSH;
	}
	error = hammer_mem_add(record);
	if (error == 0) {
		dip->ino_data.mtime = trans->time;
		hammer_modify_inode(dip, HAMMER_INODE_MTIME);
	}
failed:
	hammer_done_cursor(&cursor);
	return(error);
}
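
/*
 * Illustrative note on the namekey iteration above (values assumed):
 * if hammer_directory_namekey() hashes "foo" to 0x1234ABCD00000000 and
 * that key already exists in the as-of view, the loop probes
 * 0x1234ABCD00000001, 0x1234ABCD00000002, and so on, giving up with
 * ENOSPC after max_iterations collisions.  Only the low 32 bits
 * iterate; the upper bits carry the name hash.
 */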

/*
 * Delete the directory entry and update the inode link count.  The
 * cursor must be positioned at the directory entry record being deleted.
 *
 * The related inode (ip) should be share-locked by the caller.  The caller
 * is on the frontend.  ip may also be NULL, indicating that the directory
 * entry being removed has no related inode.
 *
 * This function can return EDEADLK requiring the caller to terminate
 * the cursor, any locks, wait on the returned record, and retry.
 */
int
hammer_ip_del_directory(struct hammer_transaction *trans,
		     hammer_cursor_t cursor, struct hammer_inode *dip,
		     struct hammer_inode *ip)
{
	hammer_record_t record;
	int error;

	if (hammer_cursor_inmem(cursor)) {
		/*
		 * In-memory (unsynchronized) records can simply be freed.
		 * Even though the HAMMER_RECF_DELETED_FE flag is ignored
		 * by the backend, we must still avoid races against the
		 * backend potentially syncing the record to the media.
		 *
		 * We cannot call hammer_ip_delete_record(), that routine may
		 * only be called from the backend.
		 */
		record = cursor->iprec;
		if (record->flags & HAMMER_RECF_INTERLOCK_BE) {
			KKASSERT(cursor->deadlk_rec == NULL);
			hammer_ref(&record->lock);
			cursor->deadlk_rec = record;
			error = EDEADLK;
		} else {
			KKASSERT(record->type == HAMMER_MEM_RECORD_ADD);
			record->flags |= HAMMER_RECF_DELETED_FE;
			error = 0;
		}
	} else {
		/*
		 * If the record is on-disk we have to queue the deletion by
		 * the record's key.  This also causes lookups to skip the
		 * record.
		 */
		KKASSERT(dip->flags &
			 (HAMMER_INODE_ONDISK | HAMMER_INODE_DONDISK));
		record = hammer_alloc_mem_record(dip, 0);
		record->type = HAMMER_MEM_RECORD_DEL;
		record->leaf.base = cursor->leaf->base;

		/*
		 * ip may be NULL, indicating the deletion of a directory
		 * entry which has no related inode.
		 */
		record->target_ip = ip;
		if (ip) {
			record->flush_state = HAMMER_FST_SETUP;
			TAILQ_INSERT_TAIL(&ip->target_list, record,
					  target_entry);
		} else {
			record->flush_state = HAMMER_FST_IDLE;
		}

		/*
		 * The inode now has a dependency and must be taken out of
		 * the idle state.  An inode not in an idle state is given
		 * an extra reference.
		 *
		 * When transitioning to a SETUP state flag for an automatic
		 * reflush when the dependencies are disposed of if someone
		 * is waiting on the inode.
		 */
		if (ip && ip->flush_state == HAMMER_FST_IDLE) {
			hammer_ref(&ip->lock);
			ip->flush_state = HAMMER_FST_SETUP;
			if (ip->flags & HAMMER_INODE_FLUSHW)
				ip->flags |= HAMMER_INODE_REFLUSH;
		}

		error = hammer_mem_add(record);
	}

	/*
	 * One less link.  The file may still be open in the OS even after
	 * all links have gone away.
	 *
	 * We have to terminate the cursor before syncing the inode to
	 * avoid deadlocking against ourselves.  XXX this may no longer
	 * be true.
	 *
	 * If nlinks drops to zero and the vnode is inactive (or there is
	 * no vnode), call hammer_inode_unloadable_check() to zonk the
	 * inode.  If we don't do this here the inode will not be destroyed
	 * on-media until we unmount.
	 */
	if (error == 0) {
		if (ip)
			--ip->ino_data.nlinks;	/* do before we might block */
		dip->ino_data.mtime = trans->time;
		hammer_modify_inode(dip, HAMMER_INODE_MTIME);
		if (ip) {
			hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
			if (ip->ino_data.nlinks == 0 &&
			    (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) {
				hammer_done_cursor(cursor);
				hammer_inode_unloadable_check(ip, 1);
				hammer_flush_inode(ip, 0);
			}
		}
	}
	return(error);
}

/*
 * Add a record to an inode.
 *
 * The caller must allocate the record with hammer_alloc_mem_record(ip) and
 * initialize the following additional fields:
 *
 * record->rec.entry.base.base.key
 * record->rec.entry.base.base.rec_type
 * record->rec.entry.base.base.data_len
 * record->data		(a copy will be kmalloc'd if it cannot be embedded)
 *
 * The related inode should be share-locked by the caller.  The caller is
 * on the frontend.
 */
int
hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record)
{
	hammer_inode_t ip = record->ip;
	int error;

	KKASSERT(record->leaf.base.localization != 0);
	record->leaf.base.obj_id = ip->obj_id;
	record->leaf.base.obj_type = ip->ino_leaf.base.obj_type;
	error = hammer_mem_add(record);
	return(error);
}
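
/*
 * Illustrative calling pattern for hammer_ip_add_record() (a sketch
 * only, not compiled; "key", "rec_type", "src" and "len" stand in for
 * caller-supplied values):
 */
#if 0
	record = hammer_alloc_mem_record(ip, len);
	record->leaf.base.localization = ip->obj_localization +
					 HAMMER_LOCALIZE_MISC;
	record->leaf.base.key = key;
	record->leaf.base.rec_type = rec_type;
	bcopy(src, record->data, len);	/* data was allocated with record */
	error = hammer_ip_add_record(trans, record);
#endif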

/*
 * Locate a bulk record in-memory.  Bulk records allow disk space to be
 * reserved so the front-end can flush large data writes without having
 * to queue the BIO to the flusher.  Only the related record gets queued
 * to the flusher.
 */
static hammer_record_t
hammer_ip_get_bulk(hammer_inode_t ip, off_t file_offset, int bytes)
{
	struct hammer_bulk_info info;

	bzero(&info, sizeof(info));
	info.leaf.base.obj_id = ip->obj_id;
	info.leaf.base.key = file_offset + bytes;
	info.leaf.base.create_tid = 0;
	info.leaf.base.delete_tid = 0;
	info.leaf.base.rec_type = HAMMER_RECTYPE_DATA;
	info.leaf.base.obj_type = 0;				/* unused */
	info.leaf.base.btype = HAMMER_BTREE_TYPE_RECORD;	/* unused */
	info.leaf.base.localization = ip->obj_localization +	/* unused */
				      HAMMER_LOCALIZE_MISC;
	info.leaf.data_len = bytes;

	hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_overlap_cmp,
				   hammer_bulk_scan_callback, &info);

	return(info.record);	/* may be NULL */
}
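
/*
 * NOTE: Any record returned by hammer_ip_get_bulk() has been referenced
 * by the scan callback below and must eventually be disposed of with
 * hammer_rel_mem_record(), as the conflict loop in hammer_ip_add_bulk()
 * does.
 */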

/*
 * Take records vetted by overlap_cmp.  The first non-deleted record
 * (if any) stops the scan.
 */
static int
hammer_bulk_scan_callback(hammer_record_t record, void *data)
{
	struct hammer_bulk_info *info = data;

	if (record->flags & HAMMER_RECF_DELETED_FE)
		return(0);
	hammer_ref(&record->lock);
	info->record = record;
	return(-1);			/* stop scan */
}

/*
 * Reserve blockmap space placemarked with an in-memory record.
 *
 * This routine is called by the frontend in order to be able to directly
 * flush a buffer cache buffer.  The frontend has locked the related buffer
 * cache buffers and we should be able to manipulate any overlapping
 * in-memory records.
 *
 * The caller is responsible for adding the returned record.
 */
hammer_record_t
hammer_ip_add_bulk(hammer_inode_t ip, off_t file_offset, void *data, int bytes,
		   int *errorp)
{
	hammer_record_t record;
	hammer_record_t conflict;
	int zone;

	/*
	 * Deal with conflicting in-memory records.  We cannot have multiple
	 * in-memory records for the same base offset without seriously
	 * confusing the backend, including but not limited to the backend
	 * issuing delete-create-delete or create-delete-create sequences
	 * and asserting on the delete_tid being the same as the create_tid.
	 *
	 * If we encounter a record with the backend interlock set we cannot
	 * immediately delete it without confusing the backend.
	 */
	while ((conflict = hammer_ip_get_bulk(ip, file_offset, bytes)) !=NULL) {
		if (conflict->flags & HAMMER_RECF_INTERLOCK_BE) {
			conflict->flags |= HAMMER_RECF_WANTED;
			tsleep(conflict, 0, "hmrrc3", 0);
		} else {
			conflict->flags |= HAMMER_RECF_DELETED_FE;
		}
		hammer_rel_mem_record(conflict);
	}

	/*
	 * Create a record to cover the direct write.  This is called with
	 * the related BIO locked so there should be no possible conflict.
	 *
	 * The backend is responsible for finalizing the space reserved in
	 * this record.
	 *
	 * XXX bytes not aligned, depend on the reservation code to
	 * align the reservation.
	 */
	record = hammer_alloc_mem_record(ip, 0);
	zone = (bytes >= HAMMER_BUFSIZE) ? HAMMER_ZONE_LARGE_DATA_INDEX :
					   HAMMER_ZONE_SMALL_DATA_INDEX;
	record->resv = hammer_blockmap_reserve(ip->hmp, zone, bytes,
					       &record->leaf.data_offset,
					       errorp);
	if (record->resv == NULL) {
		kprintf("hammer_ip_add_bulk: reservation failed\n");
		hammer_rel_mem_record(record);
		return(NULL);
	}
	record->type = HAMMER_MEM_RECORD_DATA;
	record->leaf.base.rec_type = HAMMER_RECTYPE_DATA;
	record->leaf.base.obj_type = ip->ino_leaf.base.obj_type;
	record->leaf.base.obj_id = ip->obj_id;
	record->leaf.base.key = file_offset + bytes;
	record->leaf.base.localization = ip->obj_localization +
					 HAMMER_LOCALIZE_MISC;
	record->leaf.data_len = bytes;
	hammer_crc_set_leaf(data, &record->leaf);
	KKASSERT(*errorp == 0);
	return(record);
}
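
/*
 * Rough sketch of the intended usage (descriptive only): the frontend
 * reserves space here, direct-writes the buffer cache buffer into the
 * reserved space, and then adds the returned record to the inode's
 * in-memory record tree, leaving the backend to finalize the
 * reservation when it syncs the record (see
 * hammer_ip_sync_record_cursor() below).
 */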

/*
 * Frontend truncation code.  Scan in-memory records only.  On-disk records
 * and records in a flushing state are handled by the backend.  The vnops
 * setattr code will handle the block containing the truncation point.
 *
 * Partial blocks are not deleted.
 */
int
hammer_ip_frontend_trunc(struct hammer_inode *ip, off_t file_size)
{
	struct rec_trunc_info info;

	switch(ip->ino_data.obj_type) {
	case HAMMER_OBJTYPE_REGFILE:
		info.rec_type = HAMMER_RECTYPE_DATA;
		break;
	case HAMMER_OBJTYPE_DBFILE:
		info.rec_type = HAMMER_RECTYPE_DB;
		break;
	default:
		return(EINVAL);
	}
	info.trunc_off = file_size;
	hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_trunc_cmp,
				   hammer_frontend_trunc_callback, &info);
	return(0);
}

static int
hammer_frontend_trunc_callback(hammer_record_t record, void *data __unused)
{
	if (record->flags & HAMMER_RECF_DELETED_FE)
		return(0);
	if (record->flush_state == HAMMER_FST_FLUSH)
		return(0);
	KKASSERT((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
	hammer_ref(&record->lock);
	record->flags |= HAMMER_RECF_DELETED_FE;
	hammer_rel_mem_record(record);
	return(0);
}

/*
 * Return 1 if the caller must check for and delete existing records
 * before writing out a new data record.
 *
 * Return 0 if the caller can just insert the record into the B-Tree without
 * checking.
 */
static int
hammer_record_needs_overwrite_delete(hammer_record_t record)
{
	hammer_inode_t ip = record->ip;
	int64_t file_offset;
	int r;

	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE)
		file_offset = record->leaf.base.key;
	else
		file_offset = record->leaf.base.key - record->leaf.data_len;
	r = (file_offset < ip->save_trunc_off);
	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
		if (ip->save_trunc_off <= record->leaf.base.key)
			ip->save_trunc_off = record->leaf.base.key + 1;
	} else {
		if (ip->save_trunc_off < record->leaf.base.key)
			ip->save_trunc_off = record->leaf.base.key;
	}
	return(r);
}
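
/*
 * Worked example for the test above (illustrative, values assumed):
 * with save_trunc_off at 65536, a regular-file DATA record covering
 * [65536, 131072) (key 131072) has file_offset 65536, which is not
 * below save_trunc_off, so no overwrite-delete is needed and
 * save_trunc_off advances to 131072.  A later record covering
 * [32768, 65536) would return 1, forcing the caller to delete any
 * overlapping on-disk records first.
 */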

/*
 * Backend code.  Sync a record to the media.
 */
int
hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record)
{
	hammer_transaction_t trans = cursor->trans;
	int64_t file_offset;
	int bytes;
	void *bdata;
	int error;
	int doprop;

	KKASSERT(record->flush_state == HAMMER_FST_FLUSH);
	KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE);
	KKASSERT(record->leaf.base.localization != 0);

	/*
	 * Any direct-write related to the record must complete before we
	 * can sync the record to the on-disk media.
	 */
	if (record->flags & (HAMMER_RECF_DIRECT_IO | HAMMER_RECF_DIRECT_INVAL))
		hammer_io_direct_wait(record);

	/*
	 * If this is a bulk-data record placemarker there may be an existing
	 * record on-disk, indicating a data overwrite.  If there is the
	 * on-disk record must be deleted before we can insert our new record.
	 *
	 * We've synthesized this record and do not know what the create_tid
	 * on-disk is, nor how much data it represents.
	 *
	 * Keep in mind that (key) for data records is (base_offset + len),
	 * not (base_offset).  Also, we only want to get rid of on-disk
	 * records since we are trying to sync our in-memory record, call
	 * hammer_ip_delete_range() with truncating set to 1 to make sure
	 * it skips in-memory records.
	 *
	 * It is ok for the lookup to return ENOENT.
	 *
	 * NOTE OPTIMIZATION: sync_trunc_off is used to determine if we have
	 * to call hammer_ip_delete_range() or not.  This also means we must
	 * update sync_trunc_off as we write.
	 */
	if (record->type == HAMMER_MEM_RECORD_DATA &&
	    hammer_record_needs_overwrite_delete(record)) {
		file_offset = record->leaf.base.key - record->leaf.data_len;
		bytes = (record->leaf.data_len + HAMMER_BUFMASK) &
			~HAMMER_BUFMASK;
		KKASSERT((file_offset & HAMMER_BUFMASK) == 0);
		error = hammer_ip_delete_range(
				cursor, record->ip,
				file_offset, file_offset + bytes - 1,
				1);
		if (error && error != ENOENT)
			goto done;
	}

	/*
	 * If this is a general record there may be an on-disk version
	 * that must be deleted before we can insert the new record.
	 */
	if (record->type == HAMMER_MEM_RECORD_GENERAL) {
		error = hammer_delete_general(cursor, record->ip,
					      &record->leaf);
		if (error && error != ENOENT)
			goto done;
	}

	/*
	 * Setup the cursor.
	 */
	hammer_normalize_cursor(cursor);
	cursor->key_beg = record->leaf.base;
	cursor->flags &= ~HAMMER_CURSOR_INITMASK;
	cursor->flags |= HAMMER_CURSOR_BACKEND;
	cursor->flags &= ~HAMMER_CURSOR_INSERT;

	/*
	 * Records can wind up on-media before the inode itself is on-media.
	 * Flag the case.
	 */
	record->ip->flags |= HAMMER_INODE_DONDISK;

	/*
	 * If we are deleting a directory entry an exact match must be
	 * found on-disk.
	 */
	if (record->type == HAMMER_MEM_RECORD_DEL) {
		error = hammer_btree_lookup(cursor);
		if (error == 0) {
			KKASSERT(cursor->iprec == NULL);
			error = hammer_ip_delete_record(cursor, record->ip,
							trans->tid);
			if (error == 0) {
				record->flags |= HAMMER_RECF_DELETED_FE;
				record->flags |= HAMMER_RECF_DELETED_BE;
				record->flags |= HAMMER_RECF_COMMITTED;
			}
		}
		goto done;
	}

	/*
	 * We are inserting.
	 *
	 * Issue a lookup to position the cursor and locate the cluster.  The
	 * target key should not exist.  If we are creating a directory entry
	 * we may have to iterate the low 32 bits of the key to find an unused
	 * key.
	 */
	hammer_sync_lock_sh(trans);
	cursor->flags |= HAMMER_CURSOR_INSERT;
	error = hammer_btree_lookup(cursor);
	if (hammer_debug_inode)
		kprintf("DOINSERT LOOKUP %d\n", error);
	if (error == 0) {
		kprintf("hammer_ip_sync_record: duplicate rec "
			"at (%016llx)\n", record->leaf.base.key);
		Debugger("duplicate record1");
		error = EIO;
	}
#if 0
	if (record->type == HAMMER_MEM_RECORD_DATA)
		kprintf("sync_record  %016llx ---------------- %016llx %d\n",
			record->leaf.base.key - record->leaf.data_len,
			record->leaf.data_offset, error);
#endif

	if (error != ENOENT)
		goto done_unlock;

	/*
	 * Allocate the record and data.  The result buffers will be
	 * marked as being modified and further calls to
	 * hammer_modify_buffer() will result in unneeded UNDO records.
	 *
	 * Support zero-fill records (data == NULL and data_len != 0)
	 */
	if (record->type == HAMMER_MEM_RECORD_DATA) {
		/*
		 * The data portion of a bulk-data record has already been
		 * committed to disk, we need only adjust the layer2
		 * statistics in the same transaction as our B-Tree insert.
		 */
		KKASSERT(record->leaf.data_offset != 0);
		error = hammer_blockmap_finalize(trans,
						 record->resv,
						 record->leaf.data_offset,
						 record->leaf.data_len);
	} else if (record->data && record->leaf.data_len) {
		/*
		 * Wholly cached record, with data.  Allocate the data.
		 */
		bdata = hammer_alloc_data(trans, record->leaf.data_len,
					  record->leaf.base.rec_type,
					  &record->leaf.data_offset,
					  &cursor->data_buffer, &error);
		if (bdata == NULL)
			goto done_unlock;
		hammer_crc_set_leaf(record->data, &record->leaf);
		hammer_modify_buffer(trans, cursor->data_buffer, NULL, 0);
		bcopy(record->data, bdata, record->leaf.data_len);
		hammer_modify_buffer_done(cursor->data_buffer);
	} else {
		/*
		 * Wholly cached record, without data.
		 */
		record->leaf.data_offset = 0;
		record->leaf.data_crc = 0;
	}

	error = hammer_btree_insert(cursor, &record->leaf, &doprop);
	if (hammer_debug_inode && error) {
		kprintf("BTREE INSERT error %d @ %016llx:%d key %016llx\n",
			error, cursor->node->node_offset, cursor->index,
			record->leaf.base.key);
	}

	/*
	 * Our record is on-disk, normally mark the in-memory version as
	 * deleted.  If the record represented a directory deletion but
	 * we had to sync a valid directory entry to disk we must convert
	 * the record to a covering delete so the frontend does not have
	 * visibility on the synced entry.
	 */
	if (error == 0) {
		if (doprop) {
			hammer_btree_do_propagation(cursor,
						    record->ip->pfsm,
						    &record->leaf);
		}
		if (record->flags & HAMMER_RECF_CONVERT_DELETE) {
			KKASSERT(record->type == HAMMER_MEM_RECORD_ADD);
			record->flags &= ~HAMMER_RECF_DELETED_FE;
			record->type = HAMMER_MEM_RECORD_DEL;
			KKASSERT(record->flush_state == HAMMER_FST_FLUSH);
			record->flags &= ~HAMMER_RECF_CONVERT_DELETE;
			/* hammer_flush_record_done takes care of the rest */
		} else {
			record->flags |= HAMMER_RECF_DELETED_FE;
			record->flags |= HAMMER_RECF_DELETED_BE;
		}
		record->flags |= HAMMER_RECF_COMMITTED;
	} else {
		if (record->leaf.data_offset) {
			hammer_blockmap_free(trans, record->leaf.data_offset,
					     record->leaf.data_len);
		}
	}
done_unlock:
	hammer_sync_unlock(trans);
done:
	return(error);
}

/*
 * Add the record to the inode's rec_tree.  The low 32 bits of a directory
 * entry's key is used to deal with hash collisions in the upper 32 bits.
 * A unique 64 bit key is generated in-memory and may be regenerated a
 * second time when the directory record is flushed to the on-disk B-Tree.
 *
 * A referenced record is passed to this function.  This function
 * eats the reference.  If an error occurs the record will be deleted.
 *
 * A copy of the temporary record->data pointer provided by the caller
 * will be made.
 */
int
hammer_mem_add(hammer_record_t record)
{
	hammer_mount_t hmp = record->ip->hmp;

	/*
	 * Make a private copy of record->data
	 */
	if (record->data)
		KKASSERT(record->flags & HAMMER_RECF_ALLOCDATA);

	/*
	 * Insert into the RB tree.  A unique key should have already
	 * been selected if this is a directory entry.
	 */
	if (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
		record->flags |= HAMMER_RECF_DELETED_FE;
		hammer_rel_mem_record(record);
		return (EEXIST);
	}
	++hmp->count_newrecords;
	++hmp->rsv_recs;
	++record->ip->rsv_recs;
	record->ip->hmp->rsv_databytes += record->leaf.data_len;
	record->flags |= HAMMER_RECF_ONRBTREE;
	hammer_modify_inode(record->ip, HAMMER_INODE_XDIRTY);
	hammer_rel_mem_record(record);
	return(0);
}
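
/*
 * NOTE: The rsv_recs and rsv_databytes bumps above are the mirror image
 * of the decrements performed in hammer_rel_mem_record() when a record
 * flagged ONRBTREE is finally destroyed, keeping the mount-wide and
 * per-inode reservation accounting balanced.
 */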

/************************************************************************
 *		     HAMMER INODE MERGED-RECORD FUNCTIONS		*
 ************************************************************************
 *
 * These functions augment the B-Tree scanning functions in hammer_btree.c
 * by merging in-memory records with on-disk records.
 */

/*
 * Locate a particular record either in-memory or on-disk.
 *
 * NOTE: This is basically a standalone routine, hammer_ip_next() may
 * NOT be called to iterate results.
 */
int
hammer_ip_lookup(hammer_cursor_t cursor)
{
	int error;

	/*
	 * If the element is in-memory return it without searching the
	 * on-disk B-Tree
	 */
	KKASSERT(cursor->ip);
	error = hammer_mem_lookup(cursor);
	if (error == 0) {
		cursor->leaf = &cursor->iprec->leaf;
		return(error);
	}
	if (error != ENOENT)
		return(error);

	/*
	 * If the inode has on-disk components search the on-disk B-Tree.
	 */
	if ((cursor->ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) == 0)
		return(error);
	error = hammer_btree_lookup(cursor);
	if (error == 0)
		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_LEAF);
	return(error);
}

/*
 * Locate the first record within the cursor's key_beg/key_end range,
 * restricted to a particular inode.  0 is returned on success, ENOENT
 * if no records matched the requested range, or some other error.
 *
 * When 0 is returned hammer_ip_next() may be used to iterate additional
 * records within the requested range.
 *
 * This function can return EDEADLK, requiring the caller to terminate
 * the cursor and try again.
 */
int
hammer_ip_first(hammer_cursor_t cursor)
{
	hammer_inode_t ip = cursor->ip;
	int error;

	KKASSERT(ip != NULL);

	/*
	 * Clean up fields and setup for merged scan
	 */
	cursor->flags &= ~HAMMER_CURSOR_RETEST;
	cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
	cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
	if (cursor->iprec) {
		hammer_rel_mem_record(cursor->iprec);
		cursor->iprec = NULL;
	}

	/*
	 * Search the on-disk B-Tree.  hammer_btree_lookup() only does an
	 * exact lookup so if we get ENOENT we have to call the iterate
	 * function to validate the first record after the begin key.
	 *
	 * The ATEDISK flag is used by hammer_btree_iterate to determine
	 * whether it must index forwards or not.  It is also used here
	 * to select the next record from in-memory or on-disk.
	 *
	 * EDEADLK can only occur if the lookup hit an empty internal
	 * element and couldn't delete it.  Since this could only occur
	 * in-range, we can just iterate from the failure point.
	 */
	if (ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) {
		error = hammer_btree_lookup(cursor);
		if (error == ENOENT || error == EDEADLK) {
			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
			if (hammer_debug_general & 0x2000) {
				kprintf("error %d node %p %016llx index %d\n",
					error, cursor->node,
					cursor->node->node_offset,
					cursor->index);
			}
			error = hammer_btree_iterate(cursor);
		}
		if (error && error != ENOENT)
			return(error);
		if (error == 0) {
			cursor->flags &= ~HAMMER_CURSOR_DISKEOF;
			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
		} else {
			cursor->flags |= HAMMER_CURSOR_ATEDISK;
		}
	}

	/*
	 * Search the in-memory record list (Red-Black tree).  Unlike the
	 * B-Tree search, mem_first checks for records in the range.
	 */
	error = hammer_mem_first(cursor);
	if (error && error != ENOENT)
		return(error);
	if (error == 0) {
		cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
		cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
		if (hammer_ip_iterate_mem_good(cursor, cursor->iprec) == 0)
			cursor->flags |= HAMMER_CURSOR_ATEMEM;
	}

	/*
	 * This will return the first matching record.
	 */
	return(hammer_ip_next(cursor));
}
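
/*
 * State summary for the merged iteration, as used above and in
 * hammer_ip_next(): ATEDISK/ATEMEM mean the current disk or memory
 * element has been consumed and the next call must advance that side;
 * DISKEOF/MEMEOF mean that side is exhausted.  hammer_ip_first()
 * starts with all four flags set, then clears the pairs for whichever
 * sides produced a record.
 */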
1411 
1412 /*
1413  * Retrieve the next record in a merged iteration within the bounds of the
1414  * cursor.  This call may be made multiple times after the cursor has been
1415  * initially searched with hammer_ip_first().
1416  *
1417  * 0 is returned on success, ENOENT if no further records match the
1418  * requested range, or some other error code is returned.
1419  */
1420 int
1421 hammer_ip_next(hammer_cursor_t cursor)
1422 {
1423 	hammer_btree_elm_t elm;
1424 	hammer_record_t rec, save;
1425 	int error;
1426 	int r;
1427 
1428 next_btree:
1429 	/*
1430 	 * Load the current on-disk and in-memory record.  If we ate any
1431 	 * records we have to get the next one.
1432 	 *
1433 	 * If we deleted the last on-disk record we had scanned ATEDISK will
1434 	 * be clear and RETEST will be set, forcing a call to iterate.  The
1435 	 * fact that ATEDISK is clear causes iterate to re-test the 'current'
1436 	 * element.  If ATEDISK is set, iterate will skip the 'current'
1437 	 * element.
1438 	 *
1439 	 * Get the next on-disk record
1440 	 */
1441 	if (cursor->flags & (HAMMER_CURSOR_ATEDISK|HAMMER_CURSOR_RETEST)) {
1442 		if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
1443 			error = hammer_btree_iterate(cursor);
1444 			cursor->flags &= ~HAMMER_CURSOR_RETEST;
1445 			if (error == 0) {
1446 				cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1447 				hammer_cache_node(&cursor->ip->cache[1],
1448 						  cursor->node);
1449 			} else {
1450 				cursor->flags |= HAMMER_CURSOR_DISKEOF |
1451 						 HAMMER_CURSOR_ATEDISK;
1452 			}
1453 		}
1454 	}
1455 
1456 next_memory:
1457 	/*
1458 	 * Get the next in-memory record.
1459 	 *
1460 	 * hammer_rec_scan_cmp:  Is the record still in our general range,
1461 	 *			 (non-inclusive of snapshot exclusions)?
1462 	 * hammer_rec_scan_callback: Is the record in our snapshot?
1463 	 */
1464 	if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
1465 		if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
1466 			save = cursor->iprec;
1467 			cursor->iprec = NULL;
1468 			rec = save ? hammer_rec_rb_tree_RB_NEXT(save) : NULL;
1469 			while (rec) {
1470 				if (hammer_rec_scan_cmp(rec, cursor) != 0)
1471 					break;
1472 				if (hammer_rec_scan_callback(rec, cursor) != 0)
1473 					break;
1474 				rec = hammer_rec_rb_tree_RB_NEXT(rec);
1475 			}
1476 			if (save)
1477 				hammer_rel_mem_record(save);
1478 			if (cursor->iprec) {
1479 				KKASSERT(cursor->iprec == rec);
1480 				cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
1481 			} else {
1482 				cursor->flags |= HAMMER_CURSOR_MEMEOF;
1483 			}
1484 		}
1485 	}
1486 
1487 	/*
1488 	 * The memory record may have become stale while being held in
1489 	 * cursor->iprec.  We are interlocked against the backend on
1490 	 * with regards to B-Tree entries.
1491 	 */
1492 	if ((cursor->flags & HAMMER_CURSOR_ATEMEM) == 0) {
1493 		if (hammer_ip_iterate_mem_good(cursor, cursor->iprec) == 0) {
1494 			cursor->flags |= HAMMER_CURSOR_ATEMEM;
1495 			goto next_memory;
1496 		}
1497 	}
1498 
1499 	/*
1500 	 * Extract either the disk or memory record depending on their
1501 	 * relative position.
1502 	 */
1503 	error = 0;
1504 	switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
1505 	case 0:
1506 		/*
1507 		 * Both entries valid.   Compare the entries and nominally
1508 		 * return the first one in the sort order.  Numerous cases
1509 		 * require special attention, however.
1510 		 */
1511 		elm = &cursor->node->ondisk->elms[cursor->index];
1512 		r = hammer_btree_cmp(&elm->base, &cursor->iprec->leaf.base);
1513 
1514 		/*
1515 		 * If the two entries differ only by their key (-2/2) or
1516 		 * create_tid (-1/1), and are DATA records, we may have a
1517 		 * nominal match.  We have to calculate the base file
1518 		 * offset of the data.
1519 		 */
1520 		if (r <= 2 && r >= -2 && r != 0 &&
1521 		    cursor->ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE &&
1522 		    cursor->iprec->type == HAMMER_MEM_RECORD_DATA) {
1523 			int64_t base1 = elm->leaf.base.key - elm->leaf.data_len;
1524 			int64_t base2 = cursor->iprec->leaf.base.key -
1525 					cursor->iprec->leaf.data_len;
1526 			if (base1 == base2)
1527 				r = 0;
1528 		}
1529 
1530 		if (r < 0) {
1531 			error = hammer_btree_extract(cursor,
1532 						     HAMMER_CURSOR_GET_LEAF);
1533 			cursor->flags |= HAMMER_CURSOR_ATEDISK;
1534 			break;
1535 		}
1536 
1537 		/*
1538 		 * If the entries match exactly the memory entry is either
1539 		 * an on-disk directory entry deletion or a bulk data
1540 		 * overwrite.  If it is a directory entry deletion we eat
1541 		 * both entries.
1542 		 *
1543 		 * For the bulk-data overwrite case it is possible to have
1544 		 * visibility into both, which simply means the syncer
1545 		 * hasn't gotten around to doing the delete+insert sequence
1546 		 * on the B-Tree.  Use the memory entry and throw away the
1547 		 * on-disk entry.
1548 		 *
1549 		 * If the in-memory record is not either of these we
1550 		 * probably caught the syncer while it was syncing it to
1551 		 * the media.  Since we hold a shared lock on the cursor,
1552 		 * the in-memory record had better be marked deleted at
1553 		 * this point.
1554 		 */
1555 		if (r == 0) {
1556 			if (cursor->iprec->type == HAMMER_MEM_RECORD_DEL) {
1557 				if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
1558 					cursor->flags |= HAMMER_CURSOR_ATEDISK;
1559 					cursor->flags |= HAMMER_CURSOR_ATEMEM;
1560 					goto next_btree;
1561 				}
1562 			} else if (cursor->iprec->type == HAMMER_MEM_RECORD_DATA) {
1563 				if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
1564 					cursor->flags |= HAMMER_CURSOR_ATEDISK;
1565 				}
1566 				/* fall through to memory entry */
1567 			} else {
1568 				panic("hammer_ip_next: duplicate mem/b-tree entry %p %d %08x", cursor->iprec, cursor->iprec->type, cursor->iprec->flags);
1569 				cursor->flags |= HAMMER_CURSOR_ATEMEM;
1570 				goto next_memory;
1571 			}
1572 		}
1573 		/* fall through to the memory entry */
1574 	case HAMMER_CURSOR_ATEDISK:
1575 		/*
1576 		 * Only the memory entry is valid.
1577 		 */
1578 		cursor->leaf = &cursor->iprec->leaf;
1579 		cursor->flags |= HAMMER_CURSOR_ATEMEM;
1580 
1581 		/*
1582 		 * If the memory entry is an on-disk deletion we should have
1583 		 * also had found a B-Tree record.  If the backend beat us
1584 		 * to it it would have interlocked the cursor and we should
1585 		 * have seen the in-memory record marked DELETED_FE.
1586 		 */
1587 		if (cursor->iprec->type == HAMMER_MEM_RECORD_DEL &&
1588 		    (cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
1589 			panic("hammer_ip_next: del-on-disk with no b-tree entry iprec %p flags %08x", cursor->iprec, cursor->iprec->flags);
1590 		}
1591 		break;
1592 	case HAMMER_CURSOR_ATEMEM:
1593 		/*
1594 		 * Only the disk entry is valid
1595 		 */
1596 		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_LEAF);
1597 		cursor->flags |= HAMMER_CURSOR_ATEDISK;
1598 		break;
1599 	default:
1600 		/*
1601 		 * Neither entry is valid
1602 		 *
1603 		 * XXX error not set properly
1604 		 */
1605 		cursor->leaf = NULL;
1606 		error = ENOENT;
1607 		break;
1608 	}
1609 	return(error);
1610 }
1611 
1612 /*
1613  * Resolve the cursor->data pointer for the current cursor position in
1614  * a merged iteration.
1615  */
1616 int
1617 hammer_ip_resolve_data(hammer_cursor_t cursor)
1618 {
1619 	hammer_record_t record;
1620 	int error;
1621 
1622 	if (hammer_cursor_inmem(cursor)) {
1623 		/*
1624 		 * The data associated with an in-memory record is usually
1625 		 * kmalloced, but reserve-ahead data records will have an
1626 		 * on-disk reference.
1627 		 *
1628 		 * NOTE: Reserve-ahead data records must be handled in the
1629 		 * context of the related high level buffer cache buffer
1630 		 * to interlock against async writes.
1631 		 */
1632 		record = cursor->iprec;
1633 		cursor->data = record->data;
1634 		error = 0;
1635 		if (cursor->data == NULL) {
1636 			KKASSERT(record->leaf.base.rec_type ==
1637 				 HAMMER_RECTYPE_DATA);
1638 			cursor->data = hammer_bread_ext(cursor->trans->hmp,
1639 						    record->leaf.data_offset,
1640 						    record->leaf.data_len,
1641 						    &error,
1642 						    &cursor->data_buffer);
1643 		}
1644 	} else {
1645 		cursor->leaf = &cursor->node->ondisk->elms[cursor->index].leaf;
1646 		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
1647 	}
1648 	return(error);
1649 }
1650 
1651 /*
1652  * Backend truncation / record replacement - delete records in range.
1653  *
1654  * Delete all records within the specified range for inode ip.  In-memory
1655  * records still associated with the frontend are ignored.
1656  *
1657  * If truncating is non-zero in-memory records associated with the back-end
1658  * are ignored.  If truncating is > 1 we can return EWOULDBLOCK.
1659  *
1660  * NOTES:
1661  *
1662  *	* An unaligned range will cause new records to be added to cover
1663  *        the edge cases. (XXX not implemented yet).
1664  *
1665  *	* Replacement via reservations (see hammer_ip_sync_record_cursor())
1666  *        also do not deal with unaligned ranges.
1667  *
1668  *	* ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
1669  *
1670  *	* Record keys for regular file data have to be special-cased since
1671  * 	  they indicate the end of the range (key = base + bytes).
1672  *
1673  *	* This function may be asked to delete ridiculously huge ranges, for
1674  *	  example if someone truncates or removes a 1TB regular file.  We
1675  *	  must be very careful on restarts and we may have to stop w/
1676  *	  EWOULDBLOCK to avoid blowing out the buffer cache.
1677  */
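/*
 * Worked example (added for illustration): a 16KB data record covering
 * file offsets 0..16383 carries key 16384 (base + bytes).  Truncating
 * a regular file to length 0 therefore scans from key_beg.key = 1, the
 * smallest key any record overlapping offset 0 can have, up through
 * the inclusive ran_end.
 */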
1678 int
1679 hammer_ip_delete_range(hammer_cursor_t cursor, hammer_inode_t ip,
1680 		       int64_t ran_beg, int64_t ran_end, int truncating)
1681 {
1682 	hammer_transaction_t trans = cursor->trans;
1683 	hammer_btree_leaf_elm_t leaf;
1684 	int error;
1685 	int64_t off;
1686 	int64_t tmp64;
1687 
1688 #if 0
1689 	kprintf("delete_range %p %016llx-%016llx\n", ip, ran_beg, ran_end);
1690 #endif
1691 
1692 	KKASSERT(trans->type == HAMMER_TRANS_FLS);
1693 retry:
1694 	hammer_normalize_cursor(cursor);
1695 	cursor->key_beg.localization = ip->obj_localization +
1696 				       HAMMER_LOCALIZE_MISC;
1697 	cursor->key_beg.obj_id = ip->obj_id;
1698 	cursor->key_beg.create_tid = 0;
1699 	cursor->key_beg.delete_tid = 0;
1700 	cursor->key_beg.obj_type = 0;
1701 
1702 	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
1703 		cursor->key_beg.key = ran_beg;
1704 		cursor->key_beg.rec_type = HAMMER_RECTYPE_DB;
1705 	} else {
1706 		/*
1707 		 * The key in the B-Tree is (base+bytes), so the first possible
1708 		 * matching key is ran_beg + 1.
1709 		 */
1710 		cursor->key_beg.key = ran_beg + 1;
1711 		cursor->key_beg.rec_type = HAMMER_RECTYPE_DATA;
1712 	}
1713 
1714 	cursor->key_end = cursor->key_beg;
1715 	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
1716 		cursor->key_end.key = ran_end;
1717 	} else {
1718 		tmp64 = ran_end + MAXPHYS + 1;	/* work around GCC-4 bug */
1719 		if (tmp64 < ran_end)
1720 			cursor->key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1721 		else
1722 			cursor->key_end.key = ran_end + MAXPHYS + 1;
1723 	}
1724 
1725 	cursor->asof = ip->obj_asof;
1726 	cursor->flags &= ~HAMMER_CURSOR_INITMASK;
1727 	cursor->flags |= HAMMER_CURSOR_ASOF;
1728 	cursor->flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
1729 	cursor->flags |= HAMMER_CURSOR_BACKEND;
1730 	cursor->flags |= HAMMER_CURSOR_END_INCLUSIVE;
1731 
1732 	error = hammer_ip_first(cursor);
1733 
1734 	/*
1735 	 * Iterate through matching records and mark them as deleted.
1736 	 */
1737 	while (error == 0) {
1738 		leaf = cursor->leaf;
1739 
1740 		KKASSERT(leaf->base.delete_tid == 0);
1741 		KKASSERT(leaf->base.obj_id == ip->obj_id);
1742 
1743 		/*
1744 		 * There may be overlap cases for regular file data.  Also
1745 		 * remember the key for a regular file record is (base + len),
1746 		 * NOT (base).
1747 		 *
1748 		 * Note that due to duplicates (mem & media) allowed by
1749 		 * DELETE_VISIBILITY, off can wind up less than ran_beg.
1750 		 */
1751 		if (leaf->base.rec_type == HAMMER_RECTYPE_DATA) {
1752 			off = leaf->base.key - leaf->data_len;
1753 			/*
1754 			 * Check the left edge case.  We currently do not
1755 			 * split existing records.
1756 			 */
1757 			if (off < ran_beg && leaf->base.key > ran_beg) {
1758 				panic("hammer left edge case %016llx %d\n",
1759 					leaf->base.key, leaf->data_len);
1760 			}
1761 
1762 			/*
1763 			 * Check the right edge case.  Note that the
1764 			 * record can be completely out of bounds, which
1765 			 * terminates the search.
1766 			 *
1767 			 * base->key is exclusive of the right edge while
1768 			 * ran_end is inclusive of the right edge.  The
1769 			 * (key - data_len) left boundary is inclusive.
1770 			 *
1771 			 * XXX theory-check this test at some point, are
1772 			 * XXX theory-check this test at some point; are
1773 			 * could overflow.
1774 			 */
1775 			if (leaf->base.key - 1 > ran_end) {
1776 				if (leaf->base.key - leaf->data_len > ran_end)
1777 					break;
1778 				panic("hammer right edge case\n");
1779 			}
1780 		} else {
1781 			off = leaf->base.key;
1782 		}
1783 
1784 		/*
1785 		 * Delete the record.  When truncating we do not delete
1786 		 * in-memory (data) records because they represent data
1787 		 * written after the truncation.
1788 		 *
1789 		 * This will also physically destroy the B-Tree entry and
1790 		 * data if the retention policy dictates.  The function
1791 		 * will set HAMMER_CURSOR_RETEST to cause hammer_ip_next()
1792 		 * to retest the new 'current' element.
1793 		 */
1794 		if (truncating == 0 || hammer_cursor_ondisk(cursor)) {
1795 			error = hammer_ip_delete_record(cursor, ip, trans->tid);
1796 			/*
1797 			 * If we have built up too many meta-buffers we risk
1798 			 * deadlocking the kernel and must stop.  This can
1799 			 * occur when deleting ridiculously huge files.
1800 			 * sync_trunc_off is updated so the next cycle does
1801 			 * not re-iterate records we have already deleted.
1802 			 *
1803 			 * This is only done with formal truncations.
1804 			 */
1805 			if (truncating > 1 && error == 0 &&
1806 			    hammer_flusher_meta_limit(ip->hmp)) {
1807 				ip->sync_trunc_off = off;
1808 				error = EWOULDBLOCK;
1809 			}
1810 		}
1811 		if (error)
1812 			break;
1813 		ran_beg = off;	/* for restart */
1814 		error = hammer_ip_next(cursor);
1815 	}
1816 	if (cursor->node)
1817 		hammer_cache_node(&ip->cache[1], cursor->node);
1818 
1819 	if (error == EDEADLK) {
1820 		hammer_done_cursor(cursor);
1821 		error = hammer_init_cursor(trans, cursor, &ip->cache[1], ip);
1822 		if (error == 0)
1823 			goto retry;
1824 	}
1825 	if (error == ENOENT)
1826 		error = 0;
1827 	return(error);
1828 }
1829 
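/*
 * Restart note (added commentary): when called with truncating > 1 a
 * caller should treat EWOULDBLOCK as a soft stop: flush the
 * accumulated meta-data buffers and re-enter the truncation later.
 * Because ip->sync_trunc_off was updated before returning, the next
 * pass resumes at the stopping point instead of re-iterating records
 * that were already deleted.
 */
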
1830 /*
1831  * This backend function deletes the specified record on-disk, similar to
1832  * delete_range but for a specific record.  Unlike the exact deletions
1833  * used when deleting a directory entry this function uses an ASOF search
1834  * like delete_range.
1835  *
1836  * This function may be called with ip->obj_asof set for a slave snapshot,
1837  * so we do not use it here.  We always delete only non-historical records.
1838  */
1839 static int
1840 hammer_delete_general(hammer_cursor_t cursor, hammer_inode_t ip,
1841 		      hammer_btree_leaf_elm_t leaf)
1842 {
1843 	hammer_transaction_t trans = cursor->trans;
1844 	int error;
1845 
1846 	KKASSERT(trans->type == HAMMER_TRANS_FLS);
1847 retry:
1848 	hammer_normalize_cursor(cursor);
1849 	cursor->key_beg = leaf->base;
1850 	cursor->asof = HAMMER_MAX_TID;
1851 	cursor->flags &= ~HAMMER_CURSOR_INITMASK;
1852 	cursor->flags |= HAMMER_CURSOR_ASOF;
1853 	cursor->flags |= HAMMER_CURSOR_BACKEND;
1854 	cursor->flags &= ~HAMMER_CURSOR_INSERT;
1855 
1856 	error = hammer_btree_lookup(cursor);
1857 	if (error == 0) {
1858 		error = hammer_ip_delete_record(cursor, ip, trans->tid);
1859 	}
1860 	if (error == EDEADLK) {
1861 		hammer_done_cursor(cursor);
1862 		error = hammer_init_cursor(trans, cursor, &ip->cache[1], ip);
1863 		if (error == 0)
1864 			goto retry;
1865 	}
1866 	return(error);
1867 }
1868 
1869 /*
1870  * This function deletes remaining auxiliary records when an inode is
1871  * being deleted.  This function explicitly does not delete the
1872  * inode record, directory entry, data, or db records.  Those must be
1873  * properly disposed of prior to this call.
1874  */
1875 int
1876 hammer_ip_delete_clean(hammer_cursor_t cursor, hammer_inode_t ip, int *countp)
1877 {
1878 	hammer_transaction_t trans = cursor->trans;
1879 	hammer_btree_leaf_elm_t leaf;
1880 	int error;
1881 
1882 	KKASSERT(trans->type == HAMMER_TRANS_FLS);
1883 retry:
1884 	hammer_normalize_cursor(cursor);
1885 	cursor->key_beg.localization = ip->obj_localization +
1886 				       HAMMER_LOCALIZE_MISC;
1887 	cursor->key_beg.obj_id = ip->obj_id;
1888 	cursor->key_beg.create_tid = 0;
1889 	cursor->key_beg.delete_tid = 0;
1890 	cursor->key_beg.obj_type = 0;
1891 	cursor->key_beg.rec_type = HAMMER_RECTYPE_CLEAN_START;
1892 	cursor->key_beg.key = HAMMER_MIN_KEY;
1893 
1894 	cursor->key_end = cursor->key_beg;
1895 	cursor->key_end.rec_type = HAMMER_RECTYPE_MAX;
1896 	cursor->key_end.key = HAMMER_MAX_KEY;
1897 
1898 	cursor->asof = ip->obj_asof;
1899 	cursor->flags &= ~HAMMER_CURSOR_INITMASK;
1900 	cursor->flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1901 	cursor->flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
1902 	cursor->flags |= HAMMER_CURSOR_BACKEND;
1903 
1904 	error = hammer_ip_first(cursor);
1905 
1906 	/*
1907 	 * Iterate through matching records and mark them as deleted.
1908 	 */
1909 	while (error == 0) {
1910 		leaf = cursor->leaf;
1911 
1912 		KKASSERT(leaf->base.delete_tid == 0);
1913 
1914 		/*
1915 		 * Mark the record and B-Tree entry as deleted.  This will
1916 		 * also physically delete the B-Tree entry, record, and
1917 		 * data if the retention policy dictates.  The function
1918 		 * will set HAMMER_CURSOR_RETEST to cause hammer_ip_next()
1919 		 * to retest the new 'current' element.
1920 		 *
1921 		 * Directory entries (and delete-on-disk directory entries)
1922 		 * must be synced and cannot be deleted.
1923 		 */
1924 		error = hammer_ip_delete_record(cursor, ip, trans->tid);
1925 		++*countp;
1926 		if (error)
1927 			break;
1928 		error = hammer_ip_next(cursor);
1929 	}
1930 	if (cursor->node)
1931 		hammer_cache_node(&ip->cache[1], cursor->node);
1932 	if (error == EDEADLK) {
1933 		hammer_done_cursor(cursor);
1934 		error = hammer_init_cursor(trans, cursor, &ip->cache[1], ip);
1935 		if (error == 0)
1936 			goto retry;
1937 	}
1938 	if (error == ENOENT)
1939 		error = 0;
1940 	return(error);
1941 }
1942 
1943 /*
1944  * Delete the record at the current cursor.  On success the cursor will
1945  * be positioned appropriately for an iteration but may no longer be at
1946  * a leaf node.
1947  *
1948  * This routine is only called from the backend.
1949  *
1950  * NOTE: This can return EDEADLK, requiring the caller to terminate the
1951  * cursor and retry.
1952  */
1953 int
1954 hammer_ip_delete_record(hammer_cursor_t cursor, hammer_inode_t ip,
1955 			hammer_tid_t tid)
1956 {
1957 	hammer_record_t iprec;
1958 	hammer_mount_t hmp;
1959 	int error;
1960 
1961 	KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND);
1962 	KKASSERT(tid != 0);
1963 	hmp = cursor->node->hmp;
1964 
1965 	/*
1966 	 * In-memory (unsynchronized) records can simply be freed.  This
1967 	 * only occurs in range iterations since all other records are
1968 	 * individually synchronized.  Thus there should be no confusion with
1969 	 * the interlock.
1970 	 *
1971 	 * An in-memory record may be deleted before being committed to disk,
1972 	 * but could have been accessed in the meantime.  The reservation
1973 	 * code will deal with the case.
1974 	 */
1975 	if (hammer_cursor_inmem(cursor)) {
1976 		iprec = cursor->iprec;
1977 		KKASSERT((iprec->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
1978 		iprec->flags |= HAMMER_RECF_DELETED_FE;
1979 		iprec->flags |= HAMMER_RECF_DELETED_BE;
1980 		return(0);
1981 	}
1982 
1983 	/*
1984 	 * On-disk records are marked as deleted by updating their delete_tid.
1985 	 * This does not affect their position in the B-Tree (which is based
1986 	 * on their create_tid).
1987 	 *
1988 	 * Frontend B-Tree operations track inodes so we tell
1989 	 * hammer_delete_at_cursor() not to.
1990 	 */
1991 	error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_LEAF);
1992 
1993 	if (error == 0) {
1994 		error = hammer_delete_at_cursor(
1995 				cursor,
1996 				HAMMER_DELETE_ADJUST | hammer_nohistory(ip),
1997 				cursor->trans->tid,
1998 				cursor->trans->time32,
1999 				0, NULL);
2000 	}
2001 	return(error);
2002 }
2003 
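/*
 * Summary of the two paths above (added commentary): an in-memory
 * record that was never synchronized is simply flagged DELETED_FE and
 * DELETED_BE and never reaches the media, while an on-disk record is
 * 'deleted' by stamping its delete_tid/delete_ts via
 * hammer_delete_at_cursor(), preserving history unless nohistory
 * semantics dictate outright destruction.
 */
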
2004 /*
2005  * Delete the B-Tree element at the current cursor and do any necessary
2006  * mirror propagation.
2007  *
2008  * The cursor must be properly positioned for an iteration on return but
2009  * may be pointing at an internal element.
2010  *
2011  * An element can be un-deleted by passing a delete_tid of 0 with
2012  * HAMMER_DELETE_ADJUST.
2013  */
2014 int
2015 hammer_delete_at_cursor(hammer_cursor_t cursor, int delete_flags,
2016 			hammer_tid_t delete_tid, u_int32_t delete_ts,
2017 			int track, int64_t *stat_bytes)
2018 {
2019 	struct hammer_btree_leaf_elm save_leaf;
2020 	hammer_transaction_t trans;
2021 	hammer_btree_leaf_elm_t leaf;
2022 	hammer_node_t node;
2023 	hammer_btree_elm_t elm;
2024 	hammer_off_t data_offset;
2025 	int32_t data_len;
2026 	u_int16_t rec_type;
2027 	int error;
2028 	int icount;
2029 	int doprop;
2030 
2031 	error = hammer_cursor_upgrade(cursor);
2032 	if (error)
2033 		return(error);
2034 
2035 	trans = cursor->trans;
2036 	node = cursor->node;
2037 	elm = &node->ondisk->elms[cursor->index];
2038 	leaf = &elm->leaf;
2039 	KKASSERT(elm->base.btype == HAMMER_BTREE_TYPE_RECORD);
2040 
2041 	hammer_sync_lock_sh(trans);
2042 	doprop = 0;
2043 	icount = 0;
2044 
2045 	/*
2046 	 * Adjust the delete_tid.  Update the mirror_tid propagation field
2047 	 * as well.  delete_tid can be 0 (undelete -- used by mirroring).
2048 	 */
2049 	if (delete_flags & HAMMER_DELETE_ADJUST) {
2050 		if (elm->base.rec_type == HAMMER_RECTYPE_INODE) {
2051 			if (elm->leaf.base.delete_tid == 0 && delete_tid)
2052 				icount = -1;
2053 			if (elm->leaf.base.delete_tid && delete_tid == 0)
2054 				icount = 1;
2055 		}
2056 
2057 		hammer_modify_node(trans, node, elm, sizeof(*elm));
2058 		elm->leaf.base.delete_tid = delete_tid;
2059 		elm->leaf.delete_ts = delete_ts;
2060 		hammer_modify_node_done(node);
2061 
2062 		if (elm->leaf.base.delete_tid > node->ondisk->mirror_tid) {
2063 			hammer_modify_node_field(trans, node, mirror_tid);
2064 			node->ondisk->mirror_tid = elm->leaf.base.delete_tid;
2065 			hammer_modify_node_done(node);
2066 			doprop = 1;
2067 			if (hammer_debug_general & 0x0002) {
2068 				kprintf("delete_at_cursor: propagate %016llx"
2069 					" @%016llx\n",
2070 					elm->leaf.base.delete_tid,
2071 					node->node_offset);
2072 			}
2073 		}
2074 
2075 		/*
2076 		 * Adjust for the iteration.  We have deleted the current
2077 		 * element and want to clear ATEDISK so the iteration does
2078 		 * not skip the element after, which now becomes the current
2079 		 * element.  This element must be re-tested if doing an
2080 		 * iteration, which is handled by the RETEST flag.
2081 		 */
2082 		if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
2083 			cursor->flags |= HAMMER_CURSOR_RETEST;
2084 			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
2085 		}
2086 
2087 		/*
2088 		 * An on-disk record cannot have the same delete_tid
2089 		 * as its create_tid.  In a chain of record updates
2090 		 * this could result in a duplicate record.
2091 		 */
2092 		KKASSERT(elm->leaf.base.delete_tid !=
2093 			 elm->leaf.base.create_tid);
2094 	}
2095 
2096 	/*
2097 	 * Destroy the B-Tree element if asked (typically if a nohistory
2098 	 * file or mount, or when called by the pruning code).
2099 	 *
2100 	 * Adjust the ATEDISK flag to properly support iterations.
2101 	 */
2102 	if (delete_flags & HAMMER_DELETE_DESTROY) {
2103 		data_offset = elm->leaf.data_offset;
2104 		data_len = elm->leaf.data_len;
2105 		rec_type = elm->leaf.base.rec_type;
2106 		if (doprop) {
2107 			save_leaf = elm->leaf;
2108 			leaf = &save_leaf;
2109 		}
2110 		if (elm->base.rec_type == HAMMER_RECTYPE_INODE &&
2111 		    elm->leaf.base.delete_tid == 0) {
2112 			icount = -1;
2113 		}
2114 
2115 		error = hammer_btree_delete(cursor);
2116 		if (error == 0) {
2117 			/*
2118 			 * The deletion moves the next element (if any) to
2119 			 * the current element position.  We must clear
2120 			 * ATEDISK so this element is not skipped and we
2121 			 * must set RETEST to force any iteration to re-test
2122 			 * the element.
2123 			 */
2124 			if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
2125 				cursor->flags |= HAMMER_CURSOR_RETEST;
2126 				cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
2127 			}
2128 		}
2129 		if (error == 0) {
2130 			switch(data_offset & HAMMER_OFF_ZONE_MASK) {
2131 			case HAMMER_ZONE_LARGE_DATA:
2132 			case HAMMER_ZONE_SMALL_DATA:
2133 			case HAMMER_ZONE_META:
2134 				hammer_blockmap_free(trans,
2135 						     data_offset, data_len);
2136 				break;
2137 			default:
2138 				break;
2139 			}
2140 		}
2141 	}
2142 
2143 	/*
2144 	 * Track inode count and next_tid.  This is used by the mirroring
2145 	 * and PFS code.  icount can be negative, zero, or positive.
2146 	 */
2147 	if (error == 0 && track) {
2148 		if (icount) {
2149 			hammer_modify_volume_field(trans, trans->rootvol,
2150 						   vol0_stat_inodes);
2151 			trans->rootvol->ondisk->vol0_stat_inodes += icount;
2152 			hammer_modify_volume_done(trans->rootvol);
2153 		}
2154 		if (trans->rootvol->ondisk->vol0_next_tid < delete_tid) {
2155 			hammer_modify_volume(trans, trans->rootvol, NULL, 0);
2156 			trans->rootvol->ondisk->vol0_next_tid = delete_tid;
2157 			hammer_modify_volume_done(trans->rootvol);
2158 		}
2159 	}
2160 
2161 	/*
2162 	 * mirror_tid propagation occurs if the node's mirror_tid had to be
2163 	 * updated while adjusting the delete_tid.
2164 	 *
2165 	 * This occurs when deleting even in nohistory mode, but does not
2166 	 * occur when pruning an already-deleted node.
2167 	 *
2168 	 * cursor->ip is NULL when called from the pruning, mirroring,
2169 	 * and pfs code.  If non-NULL, propagation is made conditional
2170 	 * on whether the PFS is in no-history mode or not.
2171 	 */
2172 	if (doprop) {
2173 		if (cursor->ip)
2174 			hammer_btree_do_propagation(cursor, cursor->ip->pfsm, leaf);
2175 		else
2176 			hammer_btree_do_propagation(cursor, NULL, leaf);
2177 	}
2178 	hammer_sync_unlock(trans);
2179 	return (error);
2180 }
2181 
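/*
 * Un-delete sketch (illustrative, not from the original source): a
 * deletion can be reversed by re-adjusting the element with a zero
 * delete_tid, e.g.
 *
 *	error = hammer_delete_at_cursor(cursor, HAMMER_DELETE_ADJUST,
 *					0, 0, 0, NULL);
 *
 * which clears delete_tid/delete_ts on the element.  Mirroring uses
 * this form; with track == 0 no volume statistics are adjusted.
 */
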
2182 /*
2183  * Determine whether we can remove a directory.  This routine checks whether
2184  * a directory is empty or not and enforces flush connectivity.
2185  *
2186  * Flush connectivity requires that we block if the target directory is
2187  * currently flushing, otherwise it may not end up in the same flush group.
2188  *
2189  * Returns 0 on success, ENOTEMPTY or EDEADLK (or other errors) on failure.
2190  */
2191 int
2192 hammer_ip_check_directory_empty(hammer_transaction_t trans, hammer_inode_t ip)
2193 {
2194 	struct hammer_cursor cursor;
2195 	int error;
2196 
2197 	/*
2198 	 * Check directory empty
2199 	 */
2200 	hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);
2201 
2202 	cursor.key_beg.localization = ip->obj_localization +
2203 				      HAMMER_LOCALIZE_MISC;
2204 	cursor.key_beg.obj_id = ip->obj_id;
2205 	cursor.key_beg.create_tid = 0;
2206 	cursor.key_beg.delete_tid = 0;
2207 	cursor.key_beg.obj_type = 0;
2208 	cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
2209 	cursor.key_beg.key = HAMMER_MIN_KEY;
2210 
2211 	cursor.key_end = cursor.key_beg;
2212 	cursor.key_end.rec_type = 0xFFFF;
2213 	cursor.key_end.key = HAMMER_MAX_KEY;
2214 
2215 	cursor.asof = ip->obj_asof;
2216 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
2217 
2218 	error = hammer_ip_first(&cursor);
2219 	if (error == ENOENT)
2220 		error = 0;
2221 	else if (error == 0)
2222 		error = ENOTEMPTY;
2223 	hammer_done_cursor(&cursor);
2224 	return(error);
2225 }
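
/*
 * Usage sketch (hypothetical caller, for illustration): an rmdir-style
 * path would verify emptiness before removing the directory:
 *
 *	error = hammer_ip_check_directory_empty(trans, dip);
 *	if (error == 0)
 *		... proceed with removing the directory entry ...
 */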
2226 
2227