xref: /dragonfly/sys/vfs/hammer/hammer_object.c (revision 8a7bdfea)
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.48 2008/05/02 01:00:42 dillon Exp $
35  */
36 
37 #include "hammer.h"
38 
39 static int hammer_mem_add(hammer_transaction_t trans, hammer_record_t record);
40 static int hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip);
41 static int hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip);
42 
43 /*
44  * Red-black tree support.
45  */
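/*
 * In-memory records sort first by rec_type, then by key, and finally by
 * create_tid.  A create_tid of 0 (a record not yet assigned a commit TID)
 * sorts after any committed create_tid, and a record deleted by the
 * front-end never compares equal to anything.
 */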
46 static int
47 hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
48 {
49 	if (rec1->rec.base.base.rec_type < rec2->rec.base.base.rec_type)
50 		return(-1);
51 	if (rec1->rec.base.base.rec_type > rec2->rec.base.base.rec_type)
52 		return(1);
53 
54 	if (rec1->rec.base.base.key < rec2->rec.base.base.key)
55 		return(-1);
56 	if (rec1->rec.base.base.key > rec2->rec.base.base.key)
57 		return(1);
58 
59 	if (rec1->rec.base.base.create_tid == 0) {
60 		if (rec2->rec.base.base.create_tid == 0)
61 			return(0);
62 		return(1);
63 	}
64 	if (rec2->rec.base.base.create_tid == 0)
65 		return(-1);
66 
67 	if (rec1->rec.base.base.create_tid < rec2->rec.base.base.create_tid)
68 		return(-1);
69 	if (rec1->rec.base.base.create_tid > rec2->rec.base.base.create_tid)
70 		return(1);
71 
72 	/*
73 	 * Never match against an item deleted by the front-end.
74 	 */
75 	if (rec1->flags & HAMMER_RECF_DELETED_FE)
76 		return(1);
77 	if (rec2->flags & HAMMER_RECF_DELETED_FE)
78 		return(-1);
79 
80 	return(0);
81 }
82 
83 static int
84 hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec)
85 {
86 	if (info->rec_type < rec->rec.base.base.rec_type)
87 		return(-3);
88 	if (info->rec_type > rec->rec.base.base.rec_type)
89 		return(3);
90 
91 	if (info->key < rec->rec.base.base.key)
92 		return(-2);
93 	if (info->key > rec->rec.base.base.key)
94 		return(2);
95 
96 	if (info->create_tid == 0) {
97 		if (rec->rec.base.base.create_tid == 0)
98 			return(0);
99 		return(1);
100 	}
101 	if (rec->rec.base.base.create_tid == 0)
102 		return(-1);
103 	if (info->create_tid < rec->rec.base.base.create_tid)
104 		return(-1);
105 	if (info->create_tid > rec->rec.base.base.create_tid)
106 		return(1);
107 	return(0);
108 }
109 
110 /*
111  * RB_SCAN comparison code for hammer_mem_first().  The argument order
112  * is reversed so the comparison result has to be negated.  key_beg and
113  * key_end are both range-inclusive.
114  *
115  * The creation timestamp can cause hammer_rec_compare() to return -1 or +1.
116  * These do not stop the scan.
117  *
118  * Localized deletions are not cached in-memory.
119  */
120 static
121 int
122 hammer_rec_scan_cmp(hammer_record_t rec, void *data)
123 {
124 	hammer_cursor_t cursor = data;
125 	int r;
126 
127 	r = hammer_rec_compare(&cursor->key_beg, rec);
128 	if (r > 1)
129 		return(-1);
130 	r = hammer_rec_compare(&cursor->key_end, rec);
131 	if (r < -1)
132 		return(1);
133 	return(0);
134 }
135 
136 /*
137  * This compare function is used when simply looking up key_beg.
138  */
139 static
140 int
141 hammer_rec_find_cmp(hammer_record_t rec, void *data)
142 {
143 	hammer_cursor_t cursor = data;
144 	int r;
145 
146 	r = hammer_rec_compare(&cursor->key_beg, rec);
147 	if (r > 1)
148 		return(-1);
149 	if (r < -1)
150 		return(1);
151 	return(0);
152 }
153 
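/*
 * Instantiate the RB-Tree operations.  The XLOOKUP variant additionally
 * generates the INFO-keyed lookup and scan entry points, which compare
 * records against a hammer_base_elm_t rather than against another record.
 */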
154 RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
155 RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
156 		    hammer_rec_compare, hammer_base_elm_t);
157 
158 /*
159  * Allocate a record for the caller to finish filling in.  The record is
160  * returned referenced.
161  */
162 hammer_record_t
163 hammer_alloc_mem_record(hammer_inode_t ip)
164 {
165 	hammer_record_t record;
166 
167 	++hammer_count_records;
168 	record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO);
169 	record->flush_state = HAMMER_FST_IDLE;
170 	record->ip = ip;
171 	record->rec.base.base.btype = HAMMER_BTREE_TYPE_RECORD;
172 	hammer_ref(&record->lock);
173 	return (record);
174 }
175 
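/*
 * Wait for a record to leave the flusher.  The waiter sets
 * HAMMER_RECF_WANTED and sleeps; hammer_flush_record_done() clears the
 * flag and issues the wakeup when the flush completes.
 */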
176 void
177 hammer_wait_mem_record(hammer_record_t record)
178 {
179 	while (record->flush_state == HAMMER_FST_FLUSH) {
180 		record->flags |= HAMMER_RECF_WANTED;
181 		tsleep(record, 0, "hmrrc2", 0);
182 	}
183 }
184 
185 /*
186  * Called from the backend, hammer_inode.c, after a record has been
187  * flushed to disk.  The record has been exclusively locked by the
188  * caller and interlocked with BE.
189  *
190  * We clean up the state, unlock, and release the record (the record
191  * was referenced by the fact that it was in the HAMMER_FST_FLUSH state).
192  */
193 void
194 hammer_flush_record_done(hammer_record_t record, int error)
195 {
196 	hammer_inode_t target_ip;
197 	int cleanup = 0;
198 
199 	KKASSERT(record->flush_state == HAMMER_FST_FLUSH);
200 	KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE);
201 
202 	if (error) {
203 		/*
204 		 * An error occurred: the backend was unable to sync the
205 		 * record to its media.  Leave the record intact.
206 		 */
207 		Debugger("flush_record_done error");
208 	} else if (record->flags & HAMMER_RECF_CONVERT_DELETE) {
209 		/*
210 		 * A deleted-record to delete-on-disk conversion occurs when
211 		 * we sync a record to disk which is marked deleted by the
212 		 * frontend but is not deleted from the point of view of the
213 		 * backend.
214 		 */
215 		if (record->flags & HAMMER_RECF_DELETED_BE) {
216 			record->flags |= HAMMER_RECF_DELETED_FE;
217 			cleanup = 1;
218 		} else {
219 			KKASSERT(record->type == HAMMER_MEM_RECORD_DEL);
220 		}
221 	} else {
222 		/*
223 		 * Normal completion, record has been disposed of (by
224 		 * having been synchronized to the media).
225 		 */
226 		record->flags |= HAMMER_RECF_DELETED_FE;
227 		record->flags |= HAMMER_RECF_DELETED_BE;
228 		cleanup = 1;
229 	}
230 	if (cleanup) {
231 		if ((target_ip = record->target_ip) != NULL) {
232 			TAILQ_REMOVE(&target_ip->target_list, record,
233 				     target_entry);
234 			record->target_ip = NULL;
235 			hammer_test_inode(target_ip);
236 		}
237 		record->flush_state = HAMMER_FST_IDLE;
238 	} else {
239 		if (record->target_ip)
240 			record->flush_state = HAMMER_FST_SETUP;
241 		else
242 			record->flush_state = HAMMER_FST_IDLE;
243 	}
244 
245 	record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
246 	record->flags &= ~HAMMER_RECF_CONVERT_DELETE;
247 	if (record->flags & HAMMER_RECF_WANTED) {
248 		record->flags &= ~HAMMER_RECF_WANTED;
249 		wakeup(record);
250 	}
251 	hammer_rel_mem_record(record);
252 }
253 
254 /*
255  * Release a memory record.  Records marked for deletion are immediately
256  * removed from the RB-Tree but otherwise left intact until the last ref
257  * goes away.
258  */
259 void
260 hammer_rel_mem_record(struct hammer_record *record)
261 {
262 	hammer_inode_t ip, target_ip;
263 
264 	hammer_unref(&record->lock);
265 
266 	if (record->flags & HAMMER_RECF_DELETED_FE) {
267 		if (record->lock.refs == 0) {
268 			KKASSERT(record->flush_state != HAMMER_FST_FLUSH);
269 
270 			ip = record->ip;
271 			if ((target_ip = record->target_ip) != NULL) {
272 				TAILQ_REMOVE(&target_ip->target_list,
273 					     record, target_entry);
274 				record->target_ip = NULL;
275 				hammer_test_inode(target_ip);
276 			}
277 
278 			if (record->flags & HAMMER_RECF_ONRBTREE) {
279 				RB_REMOVE(hammer_rec_rb_tree,
280 					  &record->ip->rec_tree,
281 					  record);
282 				record->flags &= ~HAMMER_RECF_ONRBTREE;
283 			}
284 			if (record->flags & HAMMER_RECF_ALLOCDATA) {
285 				--hammer_count_record_datas;
286 				kfree(record->data, M_HAMMER);
287 				record->flags &= ~HAMMER_RECF_ALLOCDATA;
288 			}
289 			record->data = NULL;
290 			--hammer_count_records;
291 			kfree(record, M_HAMMER);
292 			return;
293 		}
294 	}
295 }
296 
297 /*
298  * Record visibility depends on whether the record is being accessed by
299  * the backend or the frontend.
300  *
301  * Return non-zero if the record is visible, zero if it isn't or if it is
302  * deleted.
303  */
304 static __inline
305 int
306 hammer_ip_iterate_mem_good(hammer_cursor_t cursor, hammer_record_t record)
307 {
308 	if (cursor->flags & HAMMER_CURSOR_BACKEND) {
309 		if (record->flags & HAMMER_RECF_DELETED_BE)
310 			return(0);
311 		if ((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0)
312 			return(0);
313 	} else {
314 		if (record->flags & HAMMER_RECF_DELETED_FE)
315 			return(0);
316 	}
317 	return(1);
318 }
319 
320 /*
321  * This callback is used as part of the RB_SCAN function for in-memory
322  * records.  We terminate it (return -1) as soon as we get a match.
323  *
324  * This routine is used by frontend code.
325  *
326  * The primary compare code does not account for ASOF lookups.  This
327  * code handles that case as well as a few others.
328  */
329 static
330 int
331 hammer_rec_scan_callback(hammer_record_t rec, void *data)
332 {
333 	hammer_cursor_t cursor = data;
334 
335 	/*
336 	 * We terminate on success, so this should be NULL on entry.
337 	 */
338 	KKASSERT(cursor->iprec == NULL);
339 
340 	/*
341 	 * Skip if the record was marked deleted.
342 	 */
343 	if (hammer_ip_iterate_mem_good(cursor, rec) == 0)
344 		return(0);
345 
346 	/*
347 	 * Skip if not visible due to our as-of TID
348 	 */
349 	if (cursor->flags & HAMMER_CURSOR_ASOF) {
350 		if (cursor->asof < rec->rec.base.base.create_tid)
351 			return(0);
352 		if (rec->rec.base.base.delete_tid &&
353 		    cursor->asof >= rec->rec.base.base.delete_tid) {
354 			return(0);
355 		}
356 	}
357 
358 	/*
359 	 * If the record is queued to the flusher we have to block until
360 	 * it isn't.  Otherwise we may see duplication between our memory
361 	 * cache and the media.
362 	 */
363 	hammer_ref(&rec->lock);
364 
365 #warning "This deadlocks"
366 #if 0
367 	if (rec->flush_state == HAMMER_FST_FLUSH)
368 		hammer_wait_mem_record(rec);
369 #endif
370 
371 	/*
372 	 * The record may have been deleted while we were blocked.
373 	 */
374 	if (hammer_ip_iterate_mem_good(cursor, rec) == 0) {
375 		hammer_rel_mem_record(rec);
376 		return(0);
377 	}
378 
379 	/*
380 	 * Set the matching record and stop the scan.
381 	 */
382 	cursor->iprec = rec;
383 	return(-1);
384 }
385 
386 
387 /*
388  * Lookup an in-memory record given the key specified in the cursor.  Works
389  * just like hammer_btree_lookup() but operates on an inode's in-memory
390  * record list.
391  *
392  * The lookup must fail if the record is marked for deferred deletion.
393  */
394 static
395 int
396 hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip)
397 {
398 	int error;
399 
400 	if (cursor->iprec) {
401 		hammer_rel_mem_record(cursor->iprec);
402 		cursor->iprec = NULL;
403 	}
404 	if (cursor->ip) {
405 		KKASSERT(cursor->ip->cursor_ip_refs > 0);
406 		--cursor->ip->cursor_ip_refs;
407 #if 0
408 		hammer_rec_rb_tree_scan_info_done(&cursor->scan,
409 						  &cursor->ip->rec_tree);
410 #endif
411 	}
412 	cursor->ip = ip;
413 #if 0
414 	hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
415 #endif
416 	++ip->cursor_ip_refs;
417 
418 #if 0
419 	cursor->scan.node = NULL;
420 #endif
421 	hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_find_cmp,
422 				   hammer_rec_scan_callback, cursor);
423 
424 	if (cursor->iprec == NULL)
425 		error = ENOENT;
426 	else
427 		error = 0;
428 	return(error);
429 }
430 
431 /*
432  * hammer_mem_first() - locate the first in-memory record matching the
433  * cursor within the bounds of the key range.
434  */
435 static
436 int
437 hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip)
438 {
439 	if (cursor->iprec) {
440 		hammer_rel_mem_record(cursor->iprec);
441 		cursor->iprec = NULL;
442 	}
443 	if (cursor->ip) {
444 		KKASSERT(cursor->ip->cursor_ip_refs > 0);
445 		--cursor->ip->cursor_ip_refs;
446 #if 0
447 		hammer_rec_rb_tree_scan_info_done(&cursor->scan,
448 						  &cursor->ip->rec_tree);
449 #endif
450 	}
451 	cursor->ip = ip;
452 #if 0
453 	hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
454 #endif
455 	++ip->cursor_ip_refs;
456 
457 #if 0
458 	cursor->scan.node = NULL;
459 #endif
460 	hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
461 				   hammer_rec_scan_callback, cursor);
462 
463 	/*
464 	 * Adjust scan.node and keep it linked into the RB-tree so we can
465 	 * hold the cursor through third party modifications of the RB-tree.
466 	 */
467 	if (cursor->iprec) {
468 #if 0
469 		cursor->scan.node = hammer_rec_rb_tree_RB_NEXT(cursor->iprec);
470 #endif
471 		return(0);
472 	}
473 	return(ENOENT);
474 }
475 
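/*
 * Finish an in-memory scan: drop the per-inode cursor reference taken by
 * hammer_mem_lookup()/hammer_mem_first() and release any record still
 * cached in cursor->iprec.
 */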
476 void
477 hammer_mem_done(hammer_cursor_t cursor)
478 {
479 	if (cursor->ip) {
480 		KKASSERT(cursor->ip->cursor_ip_refs > 0);
481 		--cursor->ip->cursor_ip_refs;
482 #if 0
483 		hammer_rec_rb_tree_scan_info_done(&cursor->scan,
484 						  &cursor->ip->rec_tree);
485 #endif
486 		cursor->ip = NULL;
487 	}
488 	if (cursor->iprec) {
489 		hammer_rel_mem_record(cursor->iprec);
490 		cursor->iprec = NULL;
491 	}
492 }
493 
494 /************************************************************************
495  *		     HAMMER IN-MEMORY RECORD FUNCTIONS			*
496  ************************************************************************
497  *
498  * These functions manipulate in-memory records.  Such records typically
499  * exist prior to being committed to disk or indexed via the on-disk B-Tree.
500  */
501 
502 /*
503  * Add a directory entry (dip,ncp) which references inode (ip).
504  *
505  * Note that the low 32 bits of the namekey are set temporarily to create
506  * a unique in-memory record, and may be modified a second time when the
507  * record is synchronized to disk.  In particular, the low 32 bits cannot be
508  * all 0's when syncing to disk, which is not handled here.
509  */
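/*
 * A sketch of the key composition used below.  The name hash occupies the
 * upper bits and the low 32 bits of the hash are 0, as the ~0xFFFFFFFFLL
 * masking in hammer_mem_add() and hammer_ip_sync_record() implies, which
 * is why the iterator can simply be added:
 *
 *	key  = hammer_directory_namekey(ncp->nc_name, bytes);
 *	key += trans->hmp->namekey_iterator;
 */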
510 int
511 hammer_ip_add_directory(struct hammer_transaction *trans,
512 		     struct hammer_inode *dip, struct namecache *ncp,
513 		     struct hammer_inode *ip)
514 {
515 	hammer_record_t record;
516 	int error;
517 	int bytes;
518 
519 	record = hammer_alloc_mem_record(dip);
520 
521 	bytes = ncp->nc_nlen;	/* NOTE: terminating \0 is NOT included */
522 	if (++trans->hmp->namekey_iterator == 0)
523 		++trans->hmp->namekey_iterator;
524 
525 	record->type = HAMMER_MEM_RECORD_ADD;
526 	record->rec.entry.base.base.obj_id = dip->obj_id;
527 	record->rec.entry.base.base.key =
528 		hammer_directory_namekey(ncp->nc_name, bytes);
529 	record->rec.entry.base.base.key += trans->hmp->namekey_iterator;
530 	record->rec.entry.base.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
531 	record->rec.entry.base.base.obj_type = ip->ino_rec.base.base.obj_type;
532 	record->rec.entry.obj_id = ip->obj_id;
533 	record->data = (void *)ncp->nc_name;
534 	record->rec.entry.base.data_len = bytes;
535 	++ip->ino_rec.ino_nlinks;
536 	hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
537 
538 	/*
539 	 * The target inode and the directory entry are bound together.
540 	 */
541 	record->target_ip = ip;
542 	record->flush_state = HAMMER_FST_SETUP;
543 	TAILQ_INSERT_TAIL(&ip->target_list, record, target_entry);
544 
545 	/*
546 	 * The inode now has a dependency and must be taken out of the idle
547 	 * state.  An inode not in an idle state is given an extra reference.
548 	 */
549 	if (ip->flush_state == HAMMER_FST_IDLE) {
550 		hammer_ref(&ip->lock);
551 		ip->flush_state = HAMMER_FST_SETUP;
552 	}
553 
554 	/* NOTE: copies record->data */
555 	error = hammer_mem_add(trans, record);
556 	return(error);
557 }
558 
559 /*
560  * Delete the directory entry and update the inode link count.  The
561  * cursor must be positioned at the directory entry record being deleted.
562  *
563  * The related inode should be share-locked by the caller.  The caller is
564  * on the frontend.
565  *
566  * This function can return EDEADLK requiring the caller to terminate
567  * the cursor, any locks, wait on the returned record, and retry.
568  */
569 int
570 hammer_ip_del_directory(struct hammer_transaction *trans,
571 		     hammer_cursor_t cursor, struct hammer_inode *dip,
572 		     struct hammer_inode *ip)
573 {
574 	hammer_record_t record;
575 	int error;
576 
577 	if (cursor->record == &cursor->iprec->rec) {
578 		/*
579 		 * In-memory (unsynchronized) records can simply be freed.
580 		 * Even though the HAMMER_RECF_DELETED_FE flag is ignored
581 		 * by the backend, we must still avoid races against the
582 		 * backend potentially syncing the record to the media.
583 		 *
584 		 * We cannot call hammer_ip_delete_record(), that routine may
585 		 * only be called from the backend.
586 		 */
587 		record = cursor->iprec;
588 		if (record->flags & HAMMER_RECF_INTERLOCK_BE) {
589 			KKASSERT(cursor->deadlk_rec == NULL);
590 			hammer_ref(&record->lock);
591 			cursor->deadlk_rec = record;
592 			error = EDEADLK;
593 		} else {
594 			KKASSERT(record->type == HAMMER_MEM_RECORD_ADD);
595 			record->flags |= HAMMER_RECF_DELETED_FE;
596 			error = 0;
597 		}
598 	} else {
599 		/*
600 		 * If the record is on-disk we have to queue the deletion by
601 		 * the record's key.  This also causes lookups to skip the
602 		 * record.
603 		 */
604 		record = hammer_alloc_mem_record(dip);
605 		record->type = HAMMER_MEM_RECORD_DEL;
606 		record->rec.entry.base.base = cursor->record->base.base;
607 		hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
608 
609 		record->target_ip = ip;
610 		record->flush_state = HAMMER_FST_SETUP;
611 		TAILQ_INSERT_TAIL(&ip->target_list, record, target_entry);
612 
613 		/*
614 		 * The inode now has a dependency and must be taken out of
615 		 * the idle state.  An inode not in an idle state is given
616 		 * an extra reference.
617 		 */
618 		if (ip->flush_state == HAMMER_FST_IDLE) {
619 			hammer_ref(&ip->lock);
620 			ip->flush_state = HAMMER_FST_SETUP;
621 		}
622 
623 		error = hammer_mem_add(trans, record);
624 	}
625 
626 	/*
627 	 * One less link.  The file may still be open in the OS even after
628 	 * all links have gone away so we only try to sync if the OS has
629 	 * no references and nlinks falls to 0.
630 	 *
631 	 * We have to terminate the cursor before syncing the inode to
632 	 * avoid deadlocking against ourselves.
633 	 *
634 	 * XXX we can't sync the inode here because the encompassing
635 	 * transaction might be a rename and might update the inode
636 	 * again with a new link.  That would force the delete_tid to be
637 	 * the same as the create_tid and cause a panic.
638 	 */
639 	if (error == 0) {
640 		--ip->ino_rec.ino_nlinks;
641 		hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
642 		if (ip->ino_rec.ino_nlinks == 0 &&
643 		    (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) {
644 			hammer_done_cursor(cursor);
645 		}
646 
647 	}
648 	return(error);
649 }
650 
651 /*
652  * Add a record to an inode.
653  *
654  * The caller must allocate the record with hammer_alloc_mem_record(ip) and
655  * initialize the following additional fields:
656  *
657  * record->rec.entry.base.base.key
658  * record->rec.entry.base.base.rec_type
659  * record->rec.entry.base.base.data_len
660  * record->data		(a copy will be kmalloc'd if it cannot be embedded)
661  *
662  * The related inode should be share-locked by the caller.  The caller is
663  * on the frontend.
664  */
665 int
666 hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record)
667 {
668 	hammer_inode_t ip = record->ip;
669 	int error;
670 
671 	record->rec.base.base.obj_id = ip->obj_id;
672 	record->rec.base.base.obj_type = ip->ino_rec.base.base.obj_type;
673 
674 	hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
675 
676 	/* NOTE: copies record->data */
677 	error = hammer_mem_add(trans, record);
678 	return(error);
679 }
680 
681 /*
682  * Sync data from a buffer cache buffer (typically) to the filesystem.  This
683  * is called via the strategy routine from a cached data source.  This code
684  * is responsible for actually writing a data record out to the disk.
685  *
686  * This can only occur non-historically (i.e. 'current' data only).
687  *
688  * The file offset must be HAMMER_BUFSIZE aligned but the data length
689  * can be truncated.  The record (currently) always represents a BUFSIZE
690  * swath of space whether the data is truncated or not.
691  */
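/*
 * Note the B-Tree key convention for data records: the key stored is the
 * file offset of the byte just past the end of the record (offset + bytes).
 * hammer_ip_delete_range() depends on this when computing its search bounds.
 */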
692 int
693 hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip,
694 		       int64_t offset, void *data, int bytes)
695 {
696 	struct hammer_cursor cursor;
697 	hammer_record_ondisk_t rec;
698 	union hammer_btree_elm elm;
699 	hammer_off_t rec_offset;
700 	void *bdata;
701 	int error;
702 
703 	KKASSERT((offset & HAMMER_BUFMASK) == 0);
704 	KKASSERT(trans->type == HAMMER_TRANS_FLS);
705 retry:
706 	error = hammer_init_cursor(trans, &cursor, &ip->cache[0]);
707 	if (error)
708 		return(error);
709 	cursor.key_beg.obj_id = ip->obj_id;
710 	cursor.key_beg.key = offset + bytes;
711 	cursor.key_beg.create_tid = trans->tid;
712 	cursor.key_beg.delete_tid = 0;
713 	cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
714 	cursor.asof = trans->tid;
715 	cursor.flags |= HAMMER_CURSOR_INSERT;
716 	cursor.flags |= HAMMER_CURSOR_BACKEND;
717 
718 	/*
719 	 * Issue a lookup to position the cursor.
720 	 */
721 	error = hammer_btree_lookup(&cursor);
722 	if (error == 0) {
723 		kprintf("hammer_ip_sync_data: duplicate data at "
724 			"(%lld,%d) tid %016llx\n",
725 			offset, bytes, trans->tid);
726 		hammer_print_btree_elm(&cursor.node->ondisk->elms[cursor.index],
727 				       HAMMER_BTREE_TYPE_LEAF, cursor.index);
728 		panic("Duplicate data");
729 		error = EIO;
730 	}
731 	if (error != ENOENT)
732 		goto done;
733 
734 	/*
735 	 * Allocate record and data space.  HAMMER_RECTYPE_DATA records
736 	 * can cross buffer boundaries so we may have to split our bcopy.
737 	 */
738 	rec = hammer_alloc_record(trans, &rec_offset, HAMMER_RECTYPE_DATA,
739 				  &cursor.record_buffer,
740 				  bytes, &bdata,
741 				  &cursor.data_buffer, &error);
742 	if (rec == NULL)
743 		goto done;
744 	if (hammer_debug_general & 0x1000)
745 		kprintf("OOB RECORD DATA REC %016llx DATA %016llx LEN=%d\n", rec_offset, rec->base.data_off, rec->base.data_len);
746 
747 	/*
748 	 * Fill everything in and insert our B-Tree node.
749 	 *
750 	 * NOTE: hammer_alloc_record() has already marked the related
751 	 * buffers as modified.  If we do it again we will generate
752 	 * unnecessary undo elements.
753 	 */
754 	hammer_modify_buffer(trans, cursor.record_buffer, NULL, 0);
755 	rec->base.base.btype = HAMMER_BTREE_TYPE_RECORD;
756 	rec->base.base.obj_id = ip->obj_id;
757 	rec->base.base.key = offset + bytes;
758 	rec->base.base.create_tid = trans->tid;
759 	rec->base.base.delete_tid = 0;
760 	rec->base.base.rec_type = HAMMER_RECTYPE_DATA;
761 	rec->base.data_crc = crc32(data, bytes);
762 	hammer_modify_buffer_done(cursor.record_buffer);
763 	KKASSERT(rec->base.data_len == bytes);
764 
765 	hammer_modify_buffer(trans, cursor.data_buffer, NULL, 0);
766 	bcopy(data, bdata, bytes);
767 	hammer_modify_buffer_done(cursor.data_buffer);
768 
769 	elm.leaf.base = rec->base.base;
770 	elm.leaf.rec_offset = rec_offset;
771 	elm.leaf.data_offset = rec->base.data_off;
772 	elm.leaf.data_len = bytes;
773 	elm.leaf.data_crc = rec->base.data_crc;
774 
775 	/*
776 	 * Data records can wind up on-disk before the inode itself is
777 	 * on-disk.  One must assume data records may be on-disk if either
778 	 * HAMMER_INODE_DONDISK or HAMMER_INODE_ONDISK is set.
779 	 */
780 	ip->flags |= HAMMER_INODE_DONDISK;
781 
782 	error = hammer_btree_insert(&cursor, &elm);
783 	if (error == 0)
784 		goto done;
785 
786 	hammer_blockmap_free(trans, rec_offset, HAMMER_RECORD_SIZE);
787 done:
788 	hammer_done_cursor(&cursor);
789 	if (error == EDEADLK)
790 		goto retry;
791 	return(error);
792 }
793 
794 /*
795  * Sync an in-memory record to the disk.  This is called by the backend.
796  * This code is responsible for actually writing a record out to the disk.
797  *
798  * This routine can only be called by the backend and the record
799  * must have been interlocked with BE.  It will remain interlocked on
800  * return.  The caller is responsible for the record's disposition.
801  */
802 int
803 hammer_ip_sync_record(hammer_transaction_t trans, hammer_record_t record)
804 {
805 	struct hammer_cursor cursor;
806 	hammer_record_ondisk_t rec;
807 	union hammer_btree_elm elm;
808 	hammer_off_t rec_offset;
809 	void *bdata;
810 	int error;
811 
812 	KKASSERT(record->flush_state == HAMMER_FST_FLUSH);
813 	KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE);
814 
815 retry:
816 	/*
817 	 * Get a cursor, we will either be inserting or deleting.
818 	 */
819 	error = hammer_init_cursor(trans, &cursor, &record->ip->cache[0]);
820 	if (error)
821 		return(error);
822 	cursor.key_beg = record->rec.base.base;
823 	cursor.flags |= HAMMER_CURSOR_BACKEND;
824 
825 	/*
826 	 * If we are deleting, an exact match must be found on-disk.
827 	 */
828 	if (record->type == HAMMER_MEM_RECORD_DEL) {
829 		error = hammer_btree_lookup(&cursor);
830 		if (error == 0)
831 			error = hammer_ip_delete_record(&cursor, trans->tid);
832 		goto done;
833 	}
834 
835 	/*
836 	 * We are inserting.
837 	 *
838 	 * Issue a lookup to position the cursor and locate the cluster.  The
839 	 * target key should not exist.  If we are creating a directory entry
840 	 * we may have to iterate the low 32 bits of the key to find an unused
841 	 * key.
842 	 */
843 	cursor.flags |= HAMMER_CURSOR_INSERT;
844 
845 	for (;;) {
846 		error = hammer_btree_lookup(&cursor);
847 		if (error)
848 			break;
849 		if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY) {
850 			kprintf("hammer_ip_sync_record: duplicate rec "
851 				"at (%016llx)\n", record->rec.base.base.key);
852 			Debugger("duplicate record1");
853 			error = EIO;
854 			break;
855 		}
856 		if (++trans->hmp->namekey_iterator == 0)
857 			++trans->hmp->namekey_iterator;
858 		record->rec.base.base.key &= ~(0xFFFFFFFFLL);
859 		record->rec.base.base.key |= trans->hmp->namekey_iterator;
860 		cursor.key_beg.key = record->rec.base.base.key;
861 	}
862 	if (error != ENOENT)
863 		goto done;
864 
865 	/*
866 	 * Allocate the record and data.  The result buffers will be
867 	 * marked as being modified and further calls to
868 	 * hammer_modify_buffer() will result in unneeded UNDO records.
869 	 *
870 	 * Support zero-fill records (data == NULL and data_len != 0)
871 	 */
872 	if (record->data == NULL) {
873 		rec = hammer_alloc_record(trans, &rec_offset,
874 					  record->rec.base.base.rec_type,
875 					  &cursor.record_buffer,
876 					  0, &bdata,
877 					  NULL, &error);
878 		if (hammer_debug_general & 0x1000)
879 			kprintf("NULL RECORD DATA\n");
880 	} else if (record->flags & HAMMER_RECF_INBAND) {
881 		rec = hammer_alloc_record(trans, &rec_offset,
882 					  record->rec.base.base.rec_type,
883 					  &cursor.record_buffer,
884 					  record->rec.base.data_len, &bdata,
885 					  NULL, &error);
886 		if (hammer_debug_general & 0x1000)
887 			kprintf("INBAND RECORD DATA %016llx DATA %016llx LEN=%d\n", rec_offset, rec->base.data_off, record->rec.base.data_len);
888 	} else {
889 		rec = hammer_alloc_record(trans, &rec_offset,
890 					  record->rec.base.base.rec_type,
891 					  &cursor.record_buffer,
892 					  record->rec.base.data_len, &bdata,
893 					  &cursor.data_buffer, &error);
894 		if (hammer_debug_general & 0x1000)
895 			kprintf("OOB RECORD DATA REC %016llx DATA %016llx LEN=%d\n", rec_offset, rec->base.data_off, record->rec.base.data_len);
896 	}
897 
898 	if (rec == NULL)
899 		goto done;
900 
901 	/*
902 	 * Fill in the remaining fields and insert our B-Tree node.
903 	 */
904 	hammer_modify_buffer(trans, cursor.record_buffer, NULL, 0);
905 	rec->base.base = record->rec.base.base;
906 	bcopy(&record->rec.base + 1, &rec->base + 1,
907 	      HAMMER_RECORD_SIZE - sizeof(record->rec.base));
908 
909 	/*
910 	 * Copy the data and deal with zero-fill support.
911 	 */
912 	if (record->data && (record->flags & HAMMER_RECF_INBAND)) {
913 		rec->base.data_crc = crc32(record->data, rec->base.data_len);
914 		bcopy(record->data, bdata, rec->base.data_len);
915 	} else if (record->data) {
916 		rec->base.data_crc = crc32(record->data, rec->base.data_len);
917 		hammer_modify_buffer(trans, cursor.data_buffer, NULL, 0);
918 		bcopy(record->data, bdata, rec->base.data_len);
919 		hammer_modify_buffer_done(cursor.data_buffer);
920 	} else {
921 		rec->base.data_len = record->rec.base.data_len;
922 	}
923 	hammer_modify_buffer_done(cursor.record_buffer);
924 
925 	elm.leaf.base = record->rec.base.base;
926 	elm.leaf.rec_offset = rec_offset;
927 	elm.leaf.data_offset = rec->base.data_off;
928 	elm.leaf.data_len = rec->base.data_len;
929 	elm.leaf.data_crc = rec->base.data_crc;
930 
931 	error = hammer_btree_insert(&cursor, &elm);
932 
933 	/*
934 	 * This occurs when the frontend creates a record and queues it to
935 	 * the backend, then tries to delete the record.  The backend must
936 	 * still sync the record to the media as if it were not deleted,
937 	 * but must interlock with the frontend to ensure that the
938 	 * synchronized record is not visible to the frontend, which means
939 	 * converting it from an ADD record to a DEL record.
940 	 *
941 	 * The DEL record then masks the record synced to disk until another
942 	 * round can delete it for real.
943 	 */
944 	if (error == 0 && (record->flags & HAMMER_RECF_CONVERT_DELETE)) {
945 		KKASSERT(record->type == HAMMER_MEM_RECORD_ADD);
946 		record->flags &= ~HAMMER_RECF_DELETED_FE;
947 		record->type = HAMMER_MEM_RECORD_DEL;
948 		if (record->flush_state == HAMMER_FST_SETUP) {
949 			hammer_test_inode(record->ip);
950 			hammer_test_inode(record->target_ip);
951 		}
952 	}
953 
954 	/*
955 	 * If an error occurred, unwind the operation.
956 	 */
957 	if (error)
958 		hammer_blockmap_free(trans, rec_offset, HAMMER_RECORD_SIZE);
959 
960 done:
961 	hammer_done_cursor(&cursor);
962 	if (error == EDEADLK)
963 		goto retry;
964 	return(error);
965 }
966 
967 /*
968  * Add the record to the inode's rec_tree.  The low 32 bits of a directory
969  * entry's key is used to deal with hash collisions in the upper 32 bits.
970  * A unique 64 bit key is generated in-memory and may be regenerated a
971  * second time when the directory record is flushed to the on-disk B-Tree.
972  *
973  * A referenced record is passed to this function.  This function
974  * eats the reference.  If an error occurs the record will be deleted.
975  *
976  * A copy of the temporary record->data pointer provided by the caller
977  * will be made.
978  */
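/*
 * The private copy is kept "in-band" in the spare space of the record
 * union when it fits (reclen + data_len <= HAMMER_RECORD_SIZE); otherwise
 * a separate buffer is kmalloc'd and flagged HAMMER_RECF_ALLOCDATA.
 */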
979 static
980 int
981 hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record)
982 {
983 	void *data;
984 	int bytes;
985 	int reclen;
986 
987 	/*
988 	 * Make a private copy of record->data
989 	 */
990 	if (record->data) {
991 		/*
992 		 * Try to embed the data in extra space in the record
993 		 * union, otherwise allocate a copy.
994 		 */
995 		bytes = record->rec.base.data_len;
996 		switch(record->rec.base.base.rec_type) {
997 		case HAMMER_RECTYPE_DIRENTRY:
998 			reclen = offsetof(struct hammer_entry_record, name[0]);
999 			break;
1000 		case HAMMER_RECTYPE_DATA:
1001 			reclen = offsetof(struct hammer_data_record, data[0]);
1002 			break;
1003 		default:
1004 			reclen = sizeof(record->rec);
1005 			break;
1006 		}
1007 		if (reclen + bytes <= HAMMER_RECORD_SIZE) {
1008 			bcopy(record->data, (char *)&record->rec + reclen,
1009 			      bytes);
1010 			record->data = (void *)((char *)&record->rec + reclen);
1011 			record->flags |= HAMMER_RECF_INBAND;
1012 		} else {
1013 			++hammer_count_record_datas;
1014 			data = kmalloc(bytes, M_HAMMER, M_WAITOK);
1015 			record->flags |= HAMMER_RECF_ALLOCDATA;
1016 			bcopy(record->data, data, bytes);
1017 			record->data = data;
1018 		}
1019 	}
1020 
1021 	/*
1022 	 * Insert into the RB tree.  If this is a directory entry, iterate
1023 	 * the low 32 bits of the key until an unused key is found.
1024 	 */
1025 	while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
1026 		if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){
1027 			record->flags |= HAMMER_RECF_DELETED_FE;
1028 			hammer_rel_mem_record(record);
1029 			return (EEXIST);
1030 		}
1031 		if (++trans->hmp->namekey_iterator == 0)
1032 			++trans->hmp->namekey_iterator;
1033 		record->rec.base.base.key &= ~(0xFFFFFFFFLL);
1034 		record->rec.base.base.key |= trans->hmp->namekey_iterator;
1035 	}
1036 	record->flags |= HAMMER_RECF_ONRBTREE;
1037 	hammer_modify_inode(trans, record->ip, HAMMER_INODE_XDIRTY);
1038 	hammer_rel_mem_record(record);
1039 	return(0);
1040 }
1041 
1042 /************************************************************************
1043  *		     HAMMER INODE MERGED-RECORD FUNCTIONS		*
1044  ************************************************************************
1045  *
1046  * These functions augment the B-Tree scanning functions in hammer_btree.c
1047  * by merging in-memory records with on-disk records.
1048  */
1049 
1050 /*
1051  * Locate a particular record either in-memory or on-disk.
1052  *
1053  * NOTE: This is basically a standalone routine, hammer_ip_next() may
1054  * NOT be called to iterate results.
1055  */
1056 int
1057 hammer_ip_lookup(hammer_cursor_t cursor, struct hammer_inode *ip)
1058 {
1059 	int error;
1060 
1061 	/*
1062 	 * If the element is in-memory, return it without searching the
1063 	 * on-disk B-Tree.
1064 	 */
1065 	error = hammer_mem_lookup(cursor, ip);
1066 	if (error == 0) {
1067 		cursor->record = &cursor->iprec->rec;
1068 		return(error);
1069 	}
1070 	if (error != ENOENT)
1071 		return(error);
1072 
1073 	/*
1074 	 * If the inode has on-disk components, search the on-disk B-Tree.
1075 	 */
1076 	if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) == 0)
1077 		return(error);
1078 	error = hammer_btree_lookup(cursor);
1079 	if (error == 0)
1080 		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
1081 	return(error);
1082 }
1083 
1084 /*
1085  * Locate the first record within the cursor's key_beg/key_end range,
1086  * restricted to a particular inode.  0 is returned on success, ENOENT
1087  * if no records matched the requested range, or some other error.
1088  *
1089  * When 0 is returned hammer_ip_next() may be used to iterate additional
1090  * records within the requested range.
1091  *
1092  * This function can return EDEADLK, requiring the caller to terminate
1093  * the cursor and try again.
1094  */
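/*
 * A typical merged-iteration loop, modeled on hammer_ip_delete_range()
 * below (cursor setup and error handling elided):
 *
 *	error = hammer_ip_first(&cursor, ip);
 *	while (error == 0) {
 *		... consume cursor.record ...
 *		error = hammer_ip_next(&cursor);
 *	}
 *	if (error == ENOENT)
 *		error = 0;
 */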
1095 int
1096 hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip)
1097 {
1098 	int error;
1099 
1100 	/*
1101 	 * Clean up fields and setup for merged scan
1102 	 * Clean up fields and set up for a merged scan.
1103 	cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
1104 	cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
1105 	cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
1106 	if (cursor->iprec) {
1107 		hammer_rel_mem_record(cursor->iprec);
1108 		cursor->iprec = NULL;
1109 	}
1110 
1111 	/*
1112 	 * Search the on-disk B-Tree.  hammer_btree_lookup() only does an
1113 	 * exact lookup so if we get ENOENT we have to call the iterate
1114 	 * function to validate the first record after the begin key.
1115 	 *
1116 	 * The ATEDISK flag is used by hammer_btree_iterate to determine
1117 	 * whether it must index forwards or not.  It is also used here
1118 	 * to select the next record from in-memory or on-disk.
1119 	 *
1120 	 * EDEADLK can only occur if the lookup hit an empty internal
1121 	 * element and couldn't delete it.  Since this could only occur
1122 	 * in-range, we can just iterate from the failure point.
1123 	 */
1124 	if (ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) {
1125 		error = hammer_btree_lookup(cursor);
1126 		if (error == ENOENT || error == EDEADLK) {
1127 			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1128 			if (hammer_debug_general & 0x2000)
1129 				kprintf("error %d node %p %016llx index %d\n", error, cursor->node, cursor->node->node_offset, cursor->index);
1130 			error = hammer_btree_iterate(cursor);
1131 		}
1132 		if (error && error != ENOENT)
1133 			return(error);
1134 		if (error == 0) {
1135 			cursor->flags &= ~HAMMER_CURSOR_DISKEOF;
1136 			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1137 		} else {
1138 			cursor->flags |= HAMMER_CURSOR_ATEDISK;
1139 		}
1140 	}
1141 
1142 	/*
1143 	 * Search the in-memory record list (Red-Black tree).  Unlike the
1144 	 * B-Tree search, mem_first checks for records in the range.
1145 	 */
1146 	error = hammer_mem_first(cursor, ip);
1147 	if (error && error != ENOENT)
1148 		return(error);
1149 	if (error == 0) {
1150 		cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
1151 		cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
1152 		if (hammer_ip_iterate_mem_good(cursor, cursor->iprec) == 0)
1153 			cursor->flags |= HAMMER_CURSOR_ATEMEM;
1154 	}
1155 
1156 	/*
1157 	 * This will return the first matching record.
1158 	 */
1159 	return(hammer_ip_next(cursor));
1160 }
1161 
1162 /*
1163  * Retrieve the next record in a merged iteration within the bounds of the
1164  * cursor.  This call may be made multiple times after the cursor has been
1165  * initially searched with hammer_ip_first().
1166  *
1167  * 0 is returned on success, ENOENT if no further records match the
1168  * requested range, or some other error code is returned.
1169  */
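/*
 * Cursor flag summary as used by the merged iteration:
 *
 *	ATEDISK/ATEMEM	- the current disk/memory record has been consumed
 *	DISKEOF/MEMEOF	- the disk/memory side of the scan is exhausted
 *	DELBTREE	- the previously returned B-Tree element was deleted,
 *			  forcing an iteration fixup
 */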
1170 int
1171 hammer_ip_next(hammer_cursor_t cursor)
1172 {
1173 	hammer_btree_elm_t elm;
1174 	hammer_record_t rec, save;
1175 	int error;
1176 	int r;
1177 
1178 next_btree:
1179 	/*
1180 	 * Load the current on-disk and in-memory record.  If we ate any
1181 	 * records we have to get the next one.
1182 	 *
1183 	 * If we deleted the last on-disk record we had scanned ATEDISK will
1184 	 * be clear and DELBTREE will be set, forcing a call to iterate. The
1185 	 * fact that ATEDISK is clear causes iterate to re-test the 'current'
1186 	 * element.  If ATEDISK is set, iterate will skip the 'current'
1187 	 * element.
1188 	 *
1189 	 * Get the next on-disk record
1190 	 */
1191 	if (cursor->flags & (HAMMER_CURSOR_ATEDISK|HAMMER_CURSOR_DELBTREE)) {
1192 		if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
1193 			error = hammer_btree_iterate(cursor);
1194 			cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
1195 			if (error == 0)
1196 				cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1197 			else
1198 				cursor->flags |= HAMMER_CURSOR_DISKEOF |
1199 						 HAMMER_CURSOR_ATEDISK;
1200 		}
1201 	}
1202 
1203 next_memory:
1204 	/*
1205 	 * Get the next in-memory record.  The record can be ripped out
1206 	 * of the RB tree so we maintain a scan_info structure to track
1207 	 * the next node.
1208 	 *
1209 	 * hammer_rec_scan_cmp:  Is the record still in our general range,
1210 	 *			 (non-inclusive of snapshot exclusions)?
1211 	 * hammer_rec_scan_callback: Is the record in our snapshot?
1212 	 */
1213 	if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
1214 		if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
1215 			save = cursor->iprec;
1216 			cursor->iprec = NULL;
1217 			rec = save ? hammer_rec_rb_tree_RB_NEXT(save) : NULL;
1218 			while (rec) {
1219 				if (hammer_rec_scan_cmp(rec, cursor) != 0)
1220 					break;
1221 				if (hammer_rec_scan_callback(rec, cursor) != 0)
1222 					break;
1223 				rec = hammer_rec_rb_tree_RB_NEXT(rec);
1224 			}
1225 			if (save)
1226 				hammer_rel_mem_record(save);
1227 			if (cursor->iprec) {
1228 				KKASSERT(cursor->iprec == rec);
1229 				cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
1230 #if 0
1231 				cursor->scan.node =
1232 					hammer_rec_rb_tree_RB_NEXT(rec);
1233 #endif
1234 			} else {
1235 				cursor->flags |= HAMMER_CURSOR_MEMEOF;
1236 			}
1237 		}
1238 	}
1239 
1240 	/*
1241 	 * Extract either the disk or memory record depending on their
1242 	 * relative position.
1243 	 */
1244 	error = 0;
1245 	switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
1246 	case 0:
1247 		/*
1248 		 * Both entries valid
1249 		 */
1250 		elm = &cursor->node->ondisk->elms[cursor->index];
1251 		r = hammer_btree_cmp(&elm->base, &cursor->iprec->rec.base.base);
1252 		if (r < 0) {
1253 			error = hammer_btree_extract(cursor,
1254 						     HAMMER_CURSOR_GET_RECORD);
1255 			cursor->flags |= HAMMER_CURSOR_ATEDISK;
1256 			break;
1257 		}
1258 
1259 		/*
1260 		 * If the entries match exactly the memory entry typically
1261 		 * specifies an on-disk deletion and we eat both entries.
1262 		 *
1263 		 * If the in-memory record is not an on-disk deletion we
1264 		 * probably caught the syncer while it was syncing it to
1265 		 * the media.  Since we hold a shared lock on the cursor,
1266 		 * the in-memory record had better be marked deleted at
1267 		 * this point.
1268 		 */
1269 		if (r == 0) {
1270 			if (cursor->iprec->type == HAMMER_MEM_RECORD_DEL) {
1271 				if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
1272 					cursor->flags |= HAMMER_CURSOR_ATEDISK;
1273 					cursor->flags |= HAMMER_CURSOR_ATEMEM;
1274 					goto next_btree;
1275 				}
1276 			} else {
1277 				KKASSERT(hammer_ip_iterate_mem_good(cursor, cursor->iprec) == 0);
1278 				cursor->flags |= HAMMER_CURSOR_ATEMEM;
1279 				goto next_memory;
1280 			}
1281 		}
1282 		/* fall through to the memory entry */
1283 	case HAMMER_CURSOR_ATEDISK:
1284 		/*
1285 		 * Only the memory entry is valid.  If the record is
1286 		 * placemarking an on-disk deletion, we skip it unless
1287 		 * the caller wants special record visibility.
1288 		 */
1289 		cursor->record = &cursor->iprec->rec;
1290 		cursor->flags |= HAMMER_CURSOR_ATEMEM;
1291 		if (cursor->iprec->type == HAMMER_MEM_RECORD_DEL) {
1292 			if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0)
1293 				goto next_memory;
1294 		}
1295 		break;
1296 	case HAMMER_CURSOR_ATEMEM:
1297 		/*
1298 		 * Only the disk entry is valid
1299 		 */
1300 		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
1301 		cursor->flags |= HAMMER_CURSOR_ATEDISK;
1302 		break;
1303 	default:
1304 		/*
1305 		 * Neither entry is valid
1306 		 *
1307 		 * XXX error not set properly
1308 		 */
1309 		cursor->record = NULL;
1310 		error = ENOENT;
1311 		break;
1312 	}
1313 	return(error);
1314 }
1315 
1316 /*
1317  * Resolve the cursor->data pointer for the current cursor position in
1318  * a merged iteration.
1319  */
1320 int
1321 hammer_ip_resolve_data(hammer_cursor_t cursor)
1322 {
1323 	int error;
1324 
1325 	if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
1326 		cursor->data = cursor->iprec->data;
1327 		error = 0;
1328 	} else {
1329 		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
1330 	}
1331 	return(error);
1332 }
1333 
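/*
 * As above, but also extract the on-disk record when the cursor is not
 * positioned on an in-memory record.
 */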
1334 int
1335 hammer_ip_resolve_record_and_data(hammer_cursor_t cursor)
1336 {
1337 	int error;
1338 
1339 	if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
1340 		cursor->data = cursor->iprec->data;
1341 		error = 0;
1342 	} else {
1343 		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA |
1344 						     HAMMER_CURSOR_GET_RECORD);
1345 	}
1346 	return(error);
1347 }
1348 
1349 /*
1350  * Delete all records within the specified range for inode ip.
1351  *
1352  * NOTE: An unaligned range will cause new records to be added to cover
1353  * the edge cases. (XXX not implemented yet).
1354  *
1355  * NOTE: ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
1356  *
1357  * NOTE: Record keys for regular file data have to be special-cased since
1358  * they indicate the end of the range (key = base + bytes).
1359  */
1360 int
1361 hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip,
1362 		       int64_t ran_beg, int64_t ran_end)
1363 {
1364 	struct hammer_cursor cursor;
1365 	hammer_record_ondisk_t rec;
1366 	hammer_base_elm_t base;
1367 	int error;
1368 	int64_t off;
1369 
1370 #if 0
1371 	kprintf("delete_range %p %016llx-%016llx\n", ip, ran_beg, ran_end);
1372 #endif
1373 
1374 	KKASSERT(trans->type == HAMMER_TRANS_FLS);
1375 retry:
1376 	hammer_init_cursor(trans, &cursor, &ip->cache[0]);
1377 
1378 	cursor.key_beg.obj_id = ip->obj_id;
1379 	cursor.key_beg.create_tid = 0;
1380 	cursor.key_beg.delete_tid = 0;
1381 	cursor.key_beg.obj_type = 0;
1382 	cursor.asof = ip->obj_asof;
1383 	cursor.flags |= HAMMER_CURSOR_ASOF;
1384 	cursor.flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
1385 	cursor.flags |= HAMMER_CURSOR_BACKEND;
1386 
1387 	cursor.key_end = cursor.key_beg;
1388 	if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
1389 		cursor.key_beg.key = ran_beg;
1390 		cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
1391 		cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
1392 		cursor.key_end.key = ran_end;
1393 	} else {
1394 		/*
1395 		 * The key in the B-Tree is (base+bytes), so the first possible
1396 		 * matching key is ran_beg + 1.
1397 		 */
1398 		int64_t tmp64;
1399 
1400 		cursor.key_beg.key = ran_beg + 1;
1401 		cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
1402 		cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
1403 
1404 		tmp64 = ran_end + MAXPHYS + 1;	/* work around GCC-4 bug */
1405 		if (tmp64 < ran_end)
1406 			cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1407 		else
1408 			cursor.key_end.key = ran_end + MAXPHYS + 1;
1409 	}
1410 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
1411 
1412 	error = hammer_ip_first(&cursor, ip);
1413 
1414 	/*
1415 	 * Iterate through matching records and mark them as deleted.
1416 	 */
1417 	while (error == 0) {
1418 		rec = cursor.record;
1419 		base = &rec->base.base;
1420 
1421 		KKASSERT(base->delete_tid == 0);
1422 
1423 		/*
1424 		 * There may be overlap cases for regular file data.  Also
1425 		 * remember the key for a regular file record is the ending
1426 		 * offset of the record (base + len), NOT the base offset
1427 		 * (see the exclusive right-edge test below).
1428 		 */
1429 #if 0
1430 		kprintf("delete_range rec_type %02x\n", base->rec_type);
1431 #endif
1432 		if (base->rec_type == HAMMER_RECTYPE_DATA) {
1433 #if 0
1434 			kprintf("delete_range loop key %016llx,%d\n",
1435 				base->key - rec->base.data_len, rec->base.data_len);
1436 #endif
1437 			off = base->key - rec->base.data_len;
1438 			/*
1439 			 * Check the left edge case.  We currently do not
1440 			 * split existing records.
1441 			 */
1442 			if (off < ran_beg) {
1443 				panic("hammer left edge case %016llx %d\n",
1444 					base->key, rec->base.data_len);
1445 			}
1446 
1447 			/*
1448 			 * Check the right edge case.  Note that the
1449 			 * record can be completely out of bounds, which
1450 			 * terminates the search.
1451 			 *
1452 			 * base->key is exclusive of the right edge while
1453 			 * ran_end is inclusive of the right edge.  The
1454 			 * (key - data_len) left boundary is inclusive.
1455 			 *
1456 			 * XXX theory-check this test at some point, are
1457 			 * we missing a + 1 somewhere?  Note that ran_end
1458 			 * could overflow.
1459 			 */
1460 			if (base->key - 1 > ran_end) {
1461 				if (base->key - rec->base.data_len > ran_end)
1462 					break;
1463 				panic("hammer right edge case\n");
1464 			}
1465 		}
1466 
1467 		/*
1468 		 * Mark the record and B-Tree entry as deleted.  This will
1469 		 * also physically delete the B-Tree entry, record, and
1470 		 * data if the retention policy dictates.  The function
1471 		 * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
1472 		 * uses to perform a fixup.
1473 		 */
1474 		error = hammer_ip_delete_record(&cursor, trans->tid);
1475 		if (error)
1476 			break;
1477 		error = hammer_ip_next(&cursor);
1478 	}
1479 	hammer_done_cursor(&cursor);
1480 	if (error == EDEADLK)
1481 		goto retry;
1482 	if (error == ENOENT)
1483 		error = 0;
1484 	return(error);
1485 }
1486 
1487 /*
1488  * Delete all user records associated with an inode except the inode record
1489  * itself.  Directory entries are not deleted (they must be properly disposed
1490  * of or nlinks would get upset).
1491  */
1492 int
1493 hammer_ip_delete_range_all(hammer_transaction_t trans, hammer_inode_t ip)
1494 {
1495 	struct hammer_cursor cursor;
1496 	hammer_record_ondisk_t rec;
1497 	hammer_base_elm_t base;
1498 	int error;
1499 
1500 	KKASSERT(trans->type == HAMMER_TRANS_FLS);
1501 retry:
1502 	hammer_init_cursor(trans, &cursor, &ip->cache[0]);
1503 
1504 	cursor.key_beg.obj_id = ip->obj_id;
1505 	cursor.key_beg.create_tid = 0;
1506 	cursor.key_beg.delete_tid = 0;
1507 	cursor.key_beg.obj_type = 0;
1508 	cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
1509 	cursor.key_beg.key = HAMMER_MIN_KEY;
1510 
1511 	cursor.key_end = cursor.key_beg;
1512 	cursor.key_end.rec_type = 0xFFFF;
1513 	cursor.key_end.key = HAMMER_MAX_KEY;
1514 
1515 	cursor.asof = ip->obj_asof;
1516 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1517 	cursor.flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
1518 	cursor.flags |= HAMMER_CURSOR_BACKEND;
1519 
1520 	error = hammer_ip_first(&cursor, ip);
1521 
1522 	/*
1523 	 * Iterate through matching records and mark them as deleted.
1524 	 */
1525 	while (error == 0) {
1526 		rec = cursor.record;
1527 		base = &rec->base.base;
1528 
1529 		KKASSERT(base->delete_tid == 0);
1530 
1531 		/*
1532 		 * Mark the record and B-Tree entry as deleted.  This will
1533 		 * also physically delete the B-Tree entry, record, and
1534 		 * data if the retention policy dictates.  The function
1535 		 * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
1536 		 * uses to perform a fixup.
1537 		 *
1538 		 * Directory entries (and delete-on-disk directory entries)
1539 		 * must be synced and cannot be deleted.
1540 		 */
1541 		if (rec->base.base.rec_type != HAMMER_RECTYPE_DIRENTRY)
1542 			error = hammer_ip_delete_record(&cursor, trans->tid);
1543 		if (error)
1544 			break;
1545 		error = hammer_ip_next(&cursor);
1546 	}
1547 	hammer_done_cursor(&cursor);
1548 	if (error == EDEADLK)
1549 		goto retry;
1550 	if (error == ENOENT)
1551 		error = 0;
1552 	return(error);
1553 }
1554 
1555 /*
1556  * Delete the record at the current cursor.  On success the cursor will
1557  * be positioned appropriately for an iteration but may no longer be at
1558  * a leaf node.
1559  *
1560  * This routine is only called from the backend.
1561  *
1562  * NOTE: This can return EDEADLK, requiring the caller to terminate the
1563  * cursor and retry.
1564  */
1565 int
1566 hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid)
1567 {
1568 	hammer_btree_elm_t elm;
1569 	hammer_mount_t hmp;
1570 	int error;
1571 	int dodelete;
1572 
1573 	KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND);
1574 
1575 	/*
1576 	 * In-memory (unsynchronized) records can simply be freed.  This
1577 	 * only occurs in range iterations since all other records are
1578 	 * individually synchronized.  Thus there should be no confusion with
1579 	 * the interlock.
1580 	 */
1581 	if (cursor->record == &cursor->iprec->rec) {
1582 		KKASSERT((cursor->iprec->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
1583 		cursor->iprec->flags |= HAMMER_RECF_DELETED_FE;
1584 		cursor->iprec->flags |= HAMMER_RECF_DELETED_BE;
1585 		return(0);
1586 	}
1587 
1588 	/*
1589 	 * On-disk records are marked as deleted by updating their delete_tid.
1590 	 * This does not affect their position in the B-Tree (which is based
1591 	 * on their create_tid).
1592 	 */
1593 	error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
1594 	elm = NULL;
1595 	hmp = cursor->node->hmp;
1596 
1597 	dodelete = 0;
1598 	if (error == 0) {
1599 		error = hammer_cursor_upgrade(cursor);
1600 		if (error == 0) {
1601 			elm = &cursor->node->ondisk->elms[cursor->index];
1602 			hammer_modify_node(cursor->trans, cursor->node,
1603 					   &elm->leaf.base.delete_tid,
1604 					   sizeof(elm->leaf.base.delete_tid));
1605 			elm->leaf.base.delete_tid = tid;
1606 			hammer_modify_node_done(cursor->node);
1607 
1608 			/*
1609 			 * An on-disk record cannot have the same delete_tid
1610 			 * as its create_tid.  In a chain of record updates
1611 			 * this could result in a duplicate record.
1612 			 */
1613 			KKASSERT(elm->leaf.base.delete_tid != elm->leaf.base.create_tid);
1614 			hammer_modify_buffer(cursor->trans, cursor->record_buffer, &cursor->record->base.base.delete_tid, sizeof(hammer_tid_t));
1615 			cursor->record->base.base.delete_tid = tid;
1616 			hammer_modify_buffer_done(cursor->record_buffer);
1617 		}
1618 	}
1619 
1620 	/*
1621 	 * If we were mounted with the nohistory option, we physically
1622 	 * delete the record.
1623 	 */
1624 	if (hmp->hflags & HMNT_NOHISTORY)
1625 		dodelete = 1;
1626 
1627 	if (error == 0 && dodelete) {
1628 		error = hammer_delete_at_cursor(cursor, NULL);
1629 		if (error) {
1630 			panic("hammer_ip_delete_record: unable to physically delete the record!\n");
1631 			error = 0;
1632 		}
1633 	}
1634 	return(error);
1635 }
1636 
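/*
 * Physically delete the B-Tree element at the cursor, then free the
 * record and any out-of-band data it references.  On success, and if
 * more on-disk records remain, the cursor is flagged for an iteration
 * fixup.  (stat_bytes is accepted but not currently used.)
 */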
1637 int
1638 hammer_delete_at_cursor(hammer_cursor_t cursor, int64_t *stat_bytes)
1639 {
1640 	hammer_btree_elm_t elm;
1641 	hammer_off_t rec_offset;
1642 	hammer_off_t data_offset;
1643 	int32_t data_len;
1644 	u_int16_t rec_type;
1645 	int error;
1646 
1647 	elm = &cursor->node->ondisk->elms[cursor->index];
1648 	KKASSERT(elm->base.btype == HAMMER_BTREE_TYPE_RECORD);
1649 
1650 	rec_offset = elm->leaf.rec_offset;
1651 	data_offset = elm->leaf.data_offset;
1652 	data_len = elm->leaf.data_len;
1653 	rec_type = elm->leaf.base.rec_type;
1654 
1655 	error = hammer_btree_delete(cursor);
1656 	if (error == 0) {
1657 		/*
1658 		 * This forces a fixup for the iteration because
1659 		 * the cursor is now either sitting at the 'next'
1660 		 * element or sitting at the end of a leaf.
1661 		 */
1662 		if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
1663 			cursor->flags |= HAMMER_CURSOR_DELBTREE;
1664 			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1665 		}
1666 	}
1667 	if (error == 0) {
1668 		hammer_blockmap_free(cursor->trans, rec_offset,
1669 				     sizeof(union hammer_record_ondisk));
1670 	}
1671 	if (error == 0) {
1672 		switch(data_offset & HAMMER_OFF_ZONE_MASK) {
1673 		case HAMMER_ZONE_LARGE_DATA:
1674 		case HAMMER_ZONE_SMALL_DATA:
1675 			hammer_blockmap_free(cursor->trans,
1676 					     data_offset, data_len);
1677 			break;
1678 		default:
1679 			break;
1680 		}
1681 	}
1682 #if 0
1683 	kprintf("hammer_delete_at_cursor: %d:%d:%08x %08x/%d "
1684 		"(%d remain in cluster)\n",
1685 		cluster->volume->vol_no, cluster->clu_no,
1686 		rec_offset, data_offset, data_len,
1687 		cluster->ondisk->stat_records);
1688 #endif
1689 	return (error);
1690 }
1691 
1692 /*
1693  * Determine whether we can remove a directory.  This routine checks whether
1694  * a directory is empty or not and enforces flush connectivity.
1695  *
1696  * Flush connectivity requires that we block if the target directory is
1697  * currently flushing, otherwise it may not end up in the same flush group.
1698  *
1699  * Returns 0 on success, ENOTEMPTY or EDEADLK (or other errors) on failure.
1700  */
1701 int
1702 hammer_ip_check_directory_empty(hammer_transaction_t trans,
1703 			hammer_cursor_t parent_cursor, hammer_inode_t ip)
1704 {
1705 	struct hammer_cursor cursor;
1706 	int error;
1707 
1708 #if 0
1709 	/*
1710 	 * Check flush connectivity
1711 	 */
1712 	if (ip->flush_state != HAMMER_FST_IDLE) {
1713 		kprintf("FWAIT\n");
1714 		hammer_done_cursor(parent_cursor);
1715 		hammer_flush_inode(ip, HAMMER_FLUSH_FORCE|HAMMER_FLUSH_SIGNAL);
1716 		hammer_wait_inode(ip);
1717 		return (EDEADLK);
1718 	}
1719 #endif
1720 
1721 	/*
1722 	 * Check directory empty
1723 	 */
1724 	hammer_init_cursor(trans, &cursor, &ip->cache[0]);
1725 
1726 	cursor.key_beg.obj_id = ip->obj_id;
1727 	cursor.key_beg.create_tid = 0;
1728 	cursor.key_beg.delete_tid = 0;
1729 	cursor.key_beg.obj_type = 0;
1730 	cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
1731 	cursor.key_beg.key = HAMMER_MIN_KEY;
1732 
1733 	cursor.key_end = cursor.key_beg;
1734 	cursor.key_end.rec_type = 0xFFFF;
1735 	cursor.key_end.key = HAMMER_MAX_KEY;
1736 
1737 	cursor.asof = ip->obj_asof;
1738 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1739 
1740 	error = hammer_ip_first(&cursor, ip);
1741 	if (error == ENOENT)
1742 		error = 0;
1743 	else if (error == 0)
1744 		error = ENOTEMPTY;
1745 	hammer_done_cursor(&cursor);
1746 	return(error);
1747 }
1748 
1749