xref: /dragonfly/sys/vfs/hammer/hammer_object.c (revision 1465342b)
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.9 2007/12/14 08:05:39 dillon Exp $
35  */
36 
37 #include "hammer.h"
38 
39 static int hammer_mem_add(hammer_transaction_t trans,
40 			     hammer_record_t record);
41 static int hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip);
42 static int hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip);
43 
44 /*
45  * Red-black tree support.
46  */
47 static int
48 hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
49 {
50 	if (rec1->rec.base.base.rec_type < rec2->rec.base.base.rec_type)
51 		return(-1);
52 	if (rec1->rec.base.base.rec_type > rec2->rec.base.base.rec_type)
53 		return(1);
54 
55 	if (rec1->rec.base.base.key < rec2->rec.base.base.key)
56 		return(-1);
57 	if (rec1->rec.base.base.key > rec2->rec.base.base.key)
58 		return(1);
59 
60 	if (rec1->rec.base.base.create_tid < rec2->rec.base.base.create_tid)
61 		return(-1);
62 	if (rec1->rec.base.base.create_tid > rec2->rec.base.base.create_tid)
63 		return(1);
64         return(0);
65 }
66 
67 static int
68 hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec)
69 {
70         /*
71          * A key1->rec_type of 0 matches any record type.
72          */
73         if (info->rec_type) {
74                 if (info->rec_type < rec->rec.base.base.rec_type)
75                         return(-3);
76                 if (info->rec_type > rec->rec.base.base.rec_type)
77                         return(3);
78         }
79 
80         /*
81          * There is no special case for key.  0 means 0.
82          */
83         if (info->key < rec->rec.base.base.key)
84                 return(-2);
85         if (info->key > rec->rec.base.base.key)
86                 return(2);
87 
88         /*
89          * This test has a number of special cases.  create_tid in key1 is
90          * the as-of transction id, and delete_tid in key1 is NOT USED.
91          *
92          * A key1->create_tid of 0 matches any record regardles of when
93          * it was created or destroyed.  0xFFFFFFFFFFFFFFFFULL should be
94          * used to search for the most current state of the object.
95          *
96          * key2->create_tid is a HAMMER record and will never be
97          * 0.   key2->delete_tid is the deletion transaction id or 0 if
98          * the record has not yet been deleted.
99          */
100         if (info->create_tid) {
101                 if (info->create_tid < rec->rec.base.base.create_tid)
102                         return(-1);
103                 if (rec->rec.base.base.delete_tid &&
104 		    info->create_tid >= rec->rec.base.base.delete_tid) {
105                         return(1);
106 		}
107         }
108         return(0);
109 }
110 
111 /*
112  * RB_SCAN comparison code for hammer_mem_first().  The argument order
113  * is reversed so the comparison result has to be negated.  key_beg and
114  * key_end are both range-inclusive.
115  *
116  * The creation timestamp can cause hammer_rec_compare() to return -1 or +1.
117  * These do not stop the scan.
118  *
119  * Localized deletions are not cached in-memory.
120  */
121 static
122 int
123 hammer_rec_scan_cmp(hammer_record_t rec, void *data)
124 {
125 	hammer_cursor_t cursor = data;
126 	int r;
127 
128 	r = hammer_rec_compare(&cursor->key_beg, rec);
129 	if (r > 1)
130 		return(-1);
131 	if (r == 0)
132 		return(0);
133 	r = hammer_rec_compare(&cursor->key_end, rec);
134 	if (r < -1)
135 		return(1);
136 	return(0);
137 }
138 
/*
 * Instantiate the red-black tree support functions for the per-inode
 * in-memory record tree, including the INFO-augmented lookup used to
 * find records by a hammer_base_elm key (via hammer_rec_compare).
 */
RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
		    hammer_rec_compare, hammer_base_elm_t);
142 
143 /*
144  * Allocate a record for the caller to finish filling in
145  */
146 hammer_record_t
147 hammer_alloc_mem_record(hammer_inode_t ip)
148 {
149 	hammer_record_t record;
150 
151 	record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO);
152 	record->ip = ip;
153 	return (record);
154 }
155 
156 /*
157  * Release a memory record.  If the record is marked for defered deletion,
158  * destroy the record when the last reference goes away.
159  */
160 void
161 hammer_rel_mem_record(struct hammer_record **recordp)
162 {
163 	hammer_record_t rec;
164 
165 	if ((rec = *recordp) != NULL) {
166 		if (hammer_islastref(&rec->lock)) {
167 			hammer_unref(&rec->lock);
168 			if (rec->flags & HAMMER_RECF_DELETED)
169 				hammer_free_mem_record(rec);
170 		} else {
171 			hammer_unref(&rec->lock);
172 		}
173 		*recordp = NULL;
174 	}
175 }
176 
/*
 * Free a record.  Clean the structure up even though we are throwing it
 * away as a sanity check.  The actual free operation is delayed while
 * the record is referenced.  However, the record is removed from the RB
 * tree immediately.
 */
void
hammer_free_mem_record(hammer_record_t record)
{
	/*
	 * Unconditionally remove the record from the inode's in-memory
	 * record tree first so lookups and scans can no longer find it.
	 */
	if (record->flags & HAMMER_RECF_ONRBTREE) {
		RB_REMOVE(hammer_rec_rb_tree, &record->ip->rec_tree, record);
		record->flags &= ~HAMMER_RECF_ONRBTREE;
	}
	/*
	 * Still referenced: defer the actual free to the final
	 * hammer_rel_mem_record(), which re-calls this function.
	 */
	if (record->lock.refs) {
		record->flags |= HAMMER_RECF_DELETED;
		return;
	}
	/*
	 * Free separately-allocated data, then the record itself.
	 * Embedded data (HAMMER_RECF_EMBEDDED_DATA) points into the
	 * record structure and must not be freed separately.
	 */
	if (record->flags & HAMMER_RECF_ALLOCDATA) {
		kfree(record->data, M_HAMMER);
		record->flags &= ~HAMMER_RECF_ALLOCDATA;
	}
	record->data = NULL;
	kfree(record, M_HAMMER);
}
201 
/*
 * Lookup an in-memory record given the key specified in the cursor.  Works
 * just like hammer_btree_lookup() but operates on an inode's in-memory
 * record list.
 *
 * The lookup must fail if the record is marked for deferred deletion
 * (hammer_free_mem_record() removes a record from the RB tree before
 * marking it HAMMER_RECF_DELETED, so the tree lookup below cannot
 * return such a record).
 */
static
int
hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip)
{
	int error;

	/*
	 * Release any record held from a prior lookup and detach the
	 * cursor's scan position from whatever tree it was tracking.
	 */
	if (cursor->iprec)
		hammer_rel_mem_record(&cursor->iprec);
	if (cursor->ip) {
		hammer_rec_rb_tree_scan_info_done(&cursor->scan,
						  &cursor->ip->rec_tree);
	}
	/*
	 * Retarget the cursor at ip's record tree and do an exact lookup
	 * on key_beg.  A found record is referenced on behalf of the
	 * cursor (released by the next lookup or hammer_mem_done()).
	 */
	cursor->ip = ip;
	hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
	cursor->scan.node = NULL;
	cursor->iprec = hammer_rec_rb_tree_RB_LOOKUP_INFO(
				&ip->rec_tree, &cursor->key_beg);
	if (cursor->iprec == NULL) {
		error = ENOENT;
	} else {
		hammer_ref(&cursor->iprec->lock);
		error = 0;
	}
	return(error);
}
234 
/*
 * hammer_mem_first() - locate the first in-memory record matching the
 * cursor.
 *
 * The RB_SCAN function we use is designed as a callback.  We terminate it
 * (return -1) as soon as we get a match.
 */
static
int
hammer_rec_scan_callback(hammer_record_t rec, void *data)
{
	hammer_cursor_t cursor = data;

	/*
	 * Skip if not visible due to our as-of TID.  A key_beg.create_tid
	 * of 0 disables the visibility check entirely.
	 */
        if (cursor->key_beg.create_tid) {
                if (cursor->key_beg.create_tid < rec->rec.base.base.create_tid)
                        return(0);
                if (rec->rec.base.base.delete_tid &&
		    cursor->key_beg.create_tid >=
		     rec->rec.base.base.delete_tid) {
                        return(0);
		}
        }

	/*
	 * Return the first matching record and stop the scan.  NOTE: the
	 * record is referenced here on behalf of the cursor; callers of
	 * this callback must not add a second reference.
	 */
	if (cursor->iprec == NULL) {
		cursor->iprec = rec;
		hammer_ref(&rec->lock);
		return(-1);
	}
	return(0);
}
271 
static
int
hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip)
{
	/*
	 * Release any previously held record and detach the cursor's
	 * scan position from a prior tree before retargeting it at ip.
	 */
	if (cursor->iprec)
		hammer_rel_mem_record(&cursor->iprec);
	if (cursor->ip) {
		hammer_rec_rb_tree_scan_info_done(&cursor->scan,
						  &cursor->ip->rec_tree);
	}
	cursor->ip = ip;
	hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
	cursor->scan.node = NULL;
	/*
	 * Scan the requested range.  The callback stops the scan on the
	 * first visible match, setting and referencing cursor->iprec.
	 */
	hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
				   hammer_rec_scan_callback, cursor);

	/*
	 * Adjust scan.node and keep it linked into the RB-tree so we can
	 * hold the cursor through third party modifications of the RB-tree.
	 */
	if (cursor->iprec) {
		cursor->scan.node = hammer_rec_rb_tree_RB_NEXT(cursor->iprec);
		return(0);
	}
	return(ENOENT);
}
298 
299 void
300 hammer_mem_done(hammer_cursor_t cursor)
301 {
302 	if (cursor->ip) {
303 		hammer_rec_rb_tree_scan_info_done(&cursor->scan,
304 						  &cursor->ip->rec_tree);
305 		cursor->ip = NULL;
306 	}
307         if (cursor->iprec)
308 		hammer_rel_mem_record(&cursor->iprec);
309 }
310 
311 /************************************************************************
312  *		     HAMMER IN-MEMORY RECORD FUNCTIONS			*
313  ************************************************************************
314  *
315  * These functions manipulate in-memory records.  Such records typically
316  * exist prior to being committed to disk or indexed via the on-disk B-Tree.
317  */
318 
/*
 * Add a directory entry (dip,ncp) which references inode (ip).
 *
 * Note that the low 32 bits of the namekey are set temporarily to create
 * a unique in-memory record, and may be modified a second time when the
 * record is synchronized to disk.  In particular, the low 32 bits cannot be
 * all 0's when synching to disk, which is not handled here.
 */
int
hammer_ip_add_directory(struct hammer_transaction *trans,
		     struct hammer_inode *dip, struct namecache *ncp,
		     struct hammer_inode *ip)
{
	hammer_record_t record;
	int error;
	int bytes;

	record = hammer_alloc_mem_record(dip);

	bytes = ncp->nc_nlen;	/* NOTE: terminating \0 is NOT included */
	/*
	 * The mount-wide iterator disambiguates key hash collisions in
	 * the low 32 bits; 0 is reserved and skipped.
	 */
	if (++trans->hmp->namekey_iterator == 0)
		++trans->hmp->namekey_iterator;

	record->rec.entry.base.base.obj_id = dip->obj_id;
	record->rec.entry.base.base.key =
		hammer_directory_namekey(ncp->nc_name, bytes);
	record->rec.entry.base.base.key += trans->hmp->namekey_iterator;
	record->rec.entry.base.base.create_tid = trans->tid;
	record->rec.entry.base.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
	record->rec.entry.base.base.obj_type = ip->ino_rec.base.base.obj_type;
	record->rec.entry.obj_id = ip->obj_id;
	/*
	 * Short names are embedded directly in the record; longer names
	 * get a separate allocation which is freed along with the record
	 * (HAMMER_RECF_ALLOCDATA, see hammer_free_mem_record()).
	 */
	if (bytes <= sizeof(record->rec.entry.den_name)) {
		record->data = (void *)record->rec.entry.den_name;
		record->flags |= HAMMER_RECF_EMBEDDED_DATA;
	} else {
		record->data = kmalloc(bytes, M_HAMMER, M_WAITOK);
		record->flags |= HAMMER_RECF_ALLOCDATA;
	}
	bcopy(ncp->nc_name, record->data, bytes);
	record->rec.entry.base.data_len = bytes;
	/*
	 * The target inode gains a link, the inode is flagged dirty, and
	 * the record is inserted into dip's in-memory record tree.
	 */
	++ip->ino_rec.ino_nlinks;
	hammer_modify_inode(trans, ip,
			    HAMMER_INODE_RDIRTY | HAMMER_INODE_TID);
	error = hammer_mem_add(trans, record);
	return(error);
}
365 
/*
 * Delete the directory entry and update the inode link count.  The
 * cursor must be seeked to the directory entry record being deleted.
 *
 * NOTE: HAMMER_CURSOR_DELETE may not have been set.  XXX remove flag.
 */
int
hammer_ip_del_directory(struct hammer_transaction *trans,
		     hammer_cursor_t cursor, struct hammer_inode *dip,
		     struct hammer_inode *ip)
{
	int error;

	error = hammer_ip_delete_record(cursor, trans->tid);

	/*
	 * One less link.  The file may still be open in the OS even after
	 * all links have gone away so we don't destroy the inode's data
	 * here.
	 */
	if (error == 0) {
		--ip->ino_rec.ino_nlinks;
		hammer_modify_inode(trans, ip,
				    HAMMER_INODE_RDIRTY | HAMMER_INODE_TID);
		/*
		 * If there is no vnode, or the vnode is inactive, sync
		 * the inode now (non-blocking) rather than leaving the
		 * link-count change pending in memory.
		 */
		if (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))
			hammer_sync_inode(ip, MNT_NOWAIT, 1);

	}
	return(error);
}
396 
/*
 * Sync data from a buffer cache buffer (typically) to the filesystem.  This
 * is called via the strategy called from a cached data source.  This code
 * is responsible for actually writing a data record out to the disk.
 */
int
hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip,
		       int64_t offset, void *data, int bytes)
{
	struct hammer_cursor cursor;
	hammer_record_ondisk_t rec;
	union hammer_btree_elm elm;
	void *bdata;
	int error;

	error = hammer_init_cursor_ip(&cursor, ip);
	if (error)
		return(error);
	cursor.key_beg.obj_id = ip->obj_id;
	/*
	 * Data records are keyed by the END of their range (base + bytes),
	 * see hammer_ip_delete_range().
	 */
	cursor.key_beg.key = offset + bytes;
	cursor.key_beg.create_tid = trans->tid;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
	cursor.flags = HAMMER_CURSOR_INSERT;

	/*
	 * Issue a lookup to position the cursor and locate the cluster.
	 * For an insert, an exact match is a duplicate-record error and
	 * ENOENT is the expected (successful) result.
	 */
	error = hammer_btree_lookup(&cursor);
	if (error == 0) {
		kprintf("hammer_ip_sync_data: duplicate data at (%lld,%d)\n",
			offset, bytes);
		hammer_print_btree_elm(&cursor.node->ondisk->elms[cursor.index],
				       HAMMER_BTREE_TYPE_LEAF, cursor.index);
		error = EIO;
	}
	if (error != ENOENT)
		goto done;

	/*
	 * Allocate record and data space now that we know which cluster
	 * the B-Tree node ended up in.
	 */
	bdata = hammer_alloc_data(cursor.node->cluster, bytes, &error,
				  &cursor.data_buffer);
	if (bdata == NULL)
		goto done;
	rec = hammer_alloc_record(cursor.node->cluster, &error,
				  &cursor.record_buffer);
	if (rec == NULL)
		goto fail1;

	/*
	 * Fill everything in and insert our B-Tree node.
	 */
	rec->base.base = cursor.key_beg;
	rec->base.data_crc = crc32(data, bytes);
	rec->base.rec_id = 0;	/* XXX */
	rec->base.data_offset = hammer_bclu_offset(cursor.data_buffer, bdata);
	rec->base.data_len = bytes;
	hammer_modify_buffer(cursor.record_buffer);

	bcopy(data, bdata, bytes);
	hammer_modify_buffer(cursor.data_buffer);

	elm.leaf.base = cursor.key_beg;
	elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec);
	elm.leaf.data_offset = rec->base.data_offset;
	elm.leaf.data_len = bytes;
	elm.leaf.data_crc = rec->base.data_crc;

	error = hammer_btree_insert(&cursor, &elm);
	if (error == 0)
		goto done;

	/*
	 * The insertion failed: back out the record and data allocations.
	 */
	hammer_free_record_ptr(cursor.record_buffer, rec);
fail1:
	hammer_free_data_ptr(cursor.data_buffer, bdata, bytes);
done:
	hammer_done_cursor(&cursor);
	return(error);
}
479 
480 /*
481  * Sync an in-memory record to the disk.  this is typically called via fsync
482  * from a cached record source.  This code is responsible for actually
483  * writing a record out to the disk.
484  */
485 int
486 hammer_ip_sync_record(hammer_record_t record)
487 {
488 	struct hammer_cursor cursor;
489 	hammer_record_ondisk_t rec;
490 	union hammer_btree_elm elm;
491 	void *bdata;
492 	int error;
493 
494 	error = hammer_init_cursor_ip(&cursor, record->ip);
495 	if (error)
496 		return(error);
497 	cursor.key_beg = record->rec.base.base;
498 	cursor.flags = HAMMER_CURSOR_INSERT;
499 
500 	/*
501 	 * Issue a lookup to position the cursor and locate the cluster
502 	 */
503 	error = hammer_btree_lookup(&cursor);
504 	if (error == 0) {
505 		kprintf("hammer_ip_sync_record: duplicate rec at (%016llx)\n",
506 			record->rec.base.base.key);
507 		error = EIO;
508 	}
509 	if (error != ENOENT)
510 		goto done;
511 
512 	/*
513 	 * Allocate record and data space now that we know which cluster
514 	 * the B-Tree node ended up in.
515 	 */
516 	if (record->data == NULL ||
517 	    (record->flags & HAMMER_RECF_EMBEDDED_DATA)) {
518 		bdata = record->data;
519 	} else {
520 		bdata = hammer_alloc_data(cursor.node->cluster,
521 					  record->rec.base.data_len, &error,
522 					  &cursor.data_buffer);
523 		if (bdata == NULL)
524 			goto done;
525 	}
526 	rec = hammer_alloc_record(cursor.node->cluster, &error,
527 				  &cursor.record_buffer);
528 	if (rec == NULL)
529 		goto fail1;
530 
531 	/*
532 	 * Fill everything in and insert our B-Tree node.
533 	 *
534 	 * XXX assign rec_id here
535 	 */
536 	*rec = record->rec;
537 	if (bdata) {
538 		rec->base.data_crc = crc32(record->data,
539 					   record->rec.base.data_len);
540 		if (record->flags & HAMMER_RECF_EMBEDDED_DATA) {
541 			/*
542 			 * Data embedded in record
543 			 */
544 			rec->base.data_offset = ((char *)bdata -
545 						 (char *)&record->rec);
546 			KKASSERT(rec->base.data_offset >= 0 &&
547 				 rec->base.data_offset + rec->base.data_len <
548 				  sizeof(*rec));
549 			rec->base.data_offset += hammer_bclu_offset(cursor.record_buffer, rec);
550 		} else {
551 			/*
552 			 * Data separate from record
553 			 */
554 			rec->base.data_offset = hammer_bclu_offset(cursor.data_buffer,bdata);
555 			bcopy(record->data, bdata, rec->base.data_len);
556 			hammer_modify_buffer(cursor.data_buffer);
557 		}
558 	}
559 	rec->base.rec_id = 0;	/* XXX */
560 
561 	hammer_modify_buffer(cursor.record_buffer);
562 
563 	elm.leaf.base = cursor.key_beg;
564 	elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec);
565 	elm.leaf.data_offset = rec->base.data_offset;
566 	elm.leaf.data_len = rec->base.data_len;
567 	elm.leaf.data_crc = rec->base.data_crc;
568 
569 	error = hammer_btree_insert(&cursor, &elm);
570 	if (error == 0)
571 		goto done;
572 
573 	hammer_free_record_ptr(cursor.record_buffer, rec);
574 fail1:
575 	if (record->data && (record->flags & HAMMER_RECF_EMBEDDED_DATA) == 0) {
576 		hammer_free_data_ptr(cursor.data_buffer, bdata,
577 				     rec->base.data_len);
578 	}
579 done:
580 	hammer_done_cursor(&cursor);
581 	return(error);
582 }
583 
584 
585 /*
586  * Add the record to the inode's rec_tree.  The low 32 bits of a directory
587  * entry's key is used to deal with hash collisions in the upper 32 bits.
588  * A unique 64 bit key is generated in-memory and may be regenerated a
589  * second time when the directory record is flushed to the on-disk B-Tree.
590  */
591 static
592 int
593 hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record)
594 {
595 	while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
596 		if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){
597 			hammer_free_mem_record(record);
598 			return (EEXIST);
599 		}
600 		if (++trans->hmp->namekey_iterator == 0)
601 			++trans->hmp->namekey_iterator;
602 		record->rec.base.base.key &= ~(0xFFFFFFFFLL);
603 		record->rec.base.base.key |= trans->hmp->namekey_iterator;
604 	}
605 	record->flags |= HAMMER_RECF_ONRBTREE;
606 	return(0);
607 }
608 
609 /************************************************************************
610  *		     HAMMER INODE MERGED-RECORD FUNCTIONS		*
611  ************************************************************************
612  *
613  * These functions augment the B-Tree scanning functions in hammer_btree.c
614  * by merging in-memory records with on-disk records.
615  */
616 
617 /*
618  * Locate a particular record either in-memory or on-disk.
619  *
620  * NOTE: This is basically a standalone routine, hammer_ip_next() may
621  * NOT be called to iterate results.
622  */
623 int
624 hammer_ip_lookup(hammer_cursor_t cursor, struct hammer_inode *ip)
625 {
626 	int error;
627 
628 	/*
629 	 * If the element is in-memory return it without searching the
630 	 * on-disk B-Tree
631 	 */
632 	error = hammer_mem_lookup(cursor, ip);
633 	if (error == 0) {
634 		cursor->record = &cursor->iprec->rec;
635 		return(error);
636 	}
637 	if (error != ENOENT)
638 		return(error);
639 
640 	/*
641 	 * If the inode has on-disk components search the on-disk B-Tree.
642 	 */
643 	if ((ip->flags & HAMMER_INODE_ONDISK) == 0)
644 		return(error);
645 	error = hammer_btree_lookup(cursor);
646 	if (error == 0)
647 		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
648 	return(error);
649 }
650 
/*
 * Locate the first record within the cursor's key_beg/key_end range,
 * restricted to a particular inode.  0 is returned on success, ENOENT
 * if no records matched the requested range, or some other error.
 *
 * When 0 is returned hammer_ip_next() may be used to iterate additional
 * records within the requested range.
 */
int
hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip)
{
	int error;

	/*
	 * Clean up fields and setup for merged scan.  Both sources start
	 * in the eaten/EOF state and are re-armed below as they produce
	 * records.
	 */
	cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
	cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
	cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
	if (cursor->iprec)
		hammer_rel_mem_record(&cursor->iprec);

	/*
	 * Search the on-disk B-Tree.  hammer_btree_lookup() only does an
	 * exact lookup so if we get ENOENT we have to call the iterate
	 * function to validate the first record after the begin key.
	 *
	 * The ATEDISK flag is used by hammer_btree_iterate to determine
	 * whether it must index forwards or not.
	 */
	if (ip->flags & HAMMER_INODE_ONDISK) {
		error = hammer_btree_lookup(cursor);
		if (error == ENOENT) {
			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
			error = hammer_btree_iterate(cursor);
		}
		if (error && error != ENOENT)
			return(error);
		if (error == 0) {
			cursor->flags &= ~HAMMER_CURSOR_DISKEOF;
			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
		} else {
			cursor->flags |= HAMMER_CURSOR_ATEDISK;
		}
	}

	/*
	 * Search the in-memory record list (Red-Black tree).  Unlike the
	 * B-Tree search, mem_first checks for records in the range.
	 */
	error = hammer_mem_first(cursor, ip);
	if (error && error != ENOENT)
		return(error);
	if (error == 0) {
		cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
		cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
	}

	/*
	 * This will return the first matching record.
	 */
	return(hammer_ip_next(cursor));
}
714 
715 /*
716  * Retrieve the next record in a merged iteration within the bounds of the
717  * cursor.  This call may be made multiple times after the cursor has been
718  * initially searched with hammer_ip_first().
719  *
720  * 0 is returned on success, ENOENT if no further records match the
721  * requested range, or some other error code is returned.
722  */
723 int
724 hammer_ip_next(hammer_cursor_t cursor)
725 {
726 	hammer_btree_elm_t elm;
727 	hammer_record_t rec;
728 	int error;
729 	int r;
730 
731 	/*
732 	 * Load the current on-disk and in-memory record.  If we ate any
733 	 * records we have to get the next one.
734 	 *
735 	 * If we deleted the last on-disk record we had scanned ATEDISK will
736 	 * be clear and DELBTREE will be set, forcing a call to iterate. The
737 	 * fact that ATEDISK is clear causes iterate to re-test the 'current'
738 	 * element.  If ATEDISK is set, iterate will skip the 'current'
739 	 * element.
740 	 *
741 	 * Get the next on-disk record
742 	 */
743 	if (cursor->flags & (HAMMER_CURSOR_ATEDISK|HAMMER_CURSOR_DELBTREE)) {
744 		if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
745 			error = hammer_btree_iterate(cursor);
746 			if (error == 0)
747 				cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
748 			else
749 				cursor->flags |= HAMMER_CURSOR_DISKEOF |
750 						 HAMMER_CURSOR_ATEDISK;
751 		}
752 	}
753 
754 	/*
755 	 * Get the next in-memory record.  The record can be ripped out
756 	 * of the RB tree so we maintain a scan_info structure to track
757 	 * the next node.
758 	 *
759 	 * hammer_rec_scan_cmp:  Is the record still in our general range,
760 	 *			 (non-inclusive of snapshot exclusions)?
761 	 * hammer_rec_scan_callback: Is the record in our snapshot?
762 	 */
763 	if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
764 		if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
765 			hammer_rel_mem_record(&cursor->iprec);
766 			rec = cursor->scan.node;	/* next node */
767 			while (rec) {
768 				if (hammer_rec_scan_cmp(rec, cursor) != 0)
769 					break;
770 				if (hammer_rec_scan_callback(rec, cursor) != 0)
771 					break;
772 				rec = hammer_rec_rb_tree_RB_NEXT(rec);
773 			}
774 			if (cursor->iprec) {
775 				cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
776 				hammer_ref(&cursor->iprec->lock);
777 				cursor->scan.node =
778 					hammer_rec_rb_tree_RB_NEXT(rec);
779 			} else {
780 				cursor->flags |= HAMMER_CURSOR_MEMEOF;
781 			}
782 		}
783 	}
784 
785 	/*
786 	 * Extract either the disk or memory record depending on their
787 	 * relative position.
788 	 */
789 	error = 0;
790 	switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
791 	case 0:
792 		/*
793 		 * Both entries valid
794 		 */
795 		elm = &cursor->node->ondisk->elms[cursor->index];
796 		r = hammer_btree_cmp(&elm->base,
797 				     &cursor->iprec->rec.base.base);
798 		if (r < 0) {
799 			error = hammer_btree_extract(cursor,
800 						     HAMMER_CURSOR_GET_RECORD);
801 			cursor->flags |= HAMMER_CURSOR_ATEDISK;
802 			break;
803 		}
804 		/* fall through to the memory entry */
805 	case HAMMER_CURSOR_ATEDISK:
806 		/*
807 		 * Only the memory entry is valid
808 		 */
809 		cursor->record = &cursor->iprec->rec;
810 		cursor->flags |= HAMMER_CURSOR_ATEMEM;
811 		break;
812 	case HAMMER_CURSOR_ATEMEM:
813 		/*
814 		 * Only the disk entry is valid
815 		 */
816 		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
817 		cursor->flags |= HAMMER_CURSOR_ATEDISK;
818 		break;
819 	default:
820 		/*
821 		 * Neither entry is valid
822 		 *
823 		 * XXX error not set properly
824 		 */
825 		cursor->record = NULL;
826 		error = ENOENT;
827 		break;
828 	}
829 	return(error);
830 }
831 
832 /*
833  * Resolve the cursor->data pointer for the current cursor position in
834  * a merged iteration.
835  */
836 int
837 hammer_ip_resolve_data(hammer_cursor_t cursor)
838 {
839 	int error;
840 
841 	if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
842 		cursor->data = cursor->iprec->data;
843 		error = 0;
844 	} else {
845 		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
846 	}
847 	return(error);
848 }
849 
850 /*
851  * Delete all records within the specified range for inode ip.
852  *
853  * NOTE: An unaligned range will cause new records to be added to cover
854  * the edge cases. (XXX not implemented yet).
855  *
856  * NOTE: ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
857  *
858  * NOTE: Record keys for regular file data have to be special-cased since
859  * they indicate the end of the range (key = base + bytes).
860  */
861 int
862 hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip,
863 		       int64_t ran_beg, int64_t ran_end)
864 {
865 	struct hammer_cursor cursor;
866 	hammer_record_ondisk_t rec;
867 	hammer_base_elm_t base;
868 	int error;
869 	int isregfile;
870 	int64_t off;
871 
872 	hammer_init_cursor_ip(&cursor, ip);
873 
874 	cursor.key_beg.obj_id = ip->obj_id;
875 	cursor.key_beg.create_tid = ip->obj_asof;
876 	cursor.key_beg.delete_tid = 0;
877 	cursor.key_beg.obj_type = 0;
878 
879 	cursor.key_end = cursor.key_beg;
880 	if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
881 		cursor.key_beg.key = ran_beg;
882 		cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
883 		cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
884 		cursor.key_end.key = ran_end;
885 		isregfile = 0;
886 	} else {
887 		/*
888 		 * The key in the B-Tree is (base+bytes), so the first possible
889 		 * matching key is ran_beg + 1.
890 		 */
891 		int64_t tmp64;
892 
893 		cursor.key_beg.key = ran_beg + 1;
894 		cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
895 		cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
896 
897 		tmp64 = ran_end + MAXPHYS + 1;	/* work around GCC-4 bug */
898 		if (tmp64 < ran_end)
899 			cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
900 		else
901 			cursor.key_end.key = ran_end + MAXPHYS + 1;
902 		isregfile = 1;
903 	}
904 
905 	error = hammer_ip_first(&cursor, ip);
906 
907 	/*
908 	 * Iterate through matching records and mark them as deleted.
909 	 */
910 	while (error == 0) {
911 		rec = cursor.record;
912 		base = &rec->base.base;
913 
914 		KKASSERT(base->delete_tid == 0);
915 
916 		/*
917 		 * There may be overlap cases for regular file data.  Also
918 		 * remember the key for a regular file record is the offset
919 		 * of the last byte of the record (base + len - 1), NOT the
920 		 * base offset.
921 		 */
922 #if 0
923 		kprintf("delete_range rec_type %02x\n", base->rec_type);
924 #endif
925 		if (base->rec_type == HAMMER_RECTYPE_DATA) {
926 #if 0
927 			kprintf("delete_range loop key %016llx\n",
928 				base->key - rec->base.data_len);
929 #endif
930 			off = base->key - rec->base.data_len;
931 			/*
932 			 * Check the left edge case.  We currently do not
933 			 * split existing records.
934 			 */
935 			if (off < ran_beg) {
936 				panic("hammer left edge case %016llx %d\n",
937 					base->key, rec->base.data_len);
938 			}
939 
940 			/*
941 			 * Check the right edge case.  Note that the
942 			 * record can be completely out of bounds, which
943 			 * terminates the search.
944 			 *
945 			 * base->key is exclusive of the right edge while
946 			 * ran_end is inclusive of the right edge.  The
947 			 * (key - data_len) left boundary is inclusive.
948 			 *
949 			 * XXX theory-check this test at some point, are
950 			 * we missing a + 1 somewhere?  Note that ran_end
951 			 * could overflow.
952 			 */
953 			if (base->key > ran_end) {
954 				if (base->key - rec->base.data_len > ran_end) {
955 					kprintf("right edge OOB\n");
956 					break;
957 				}
958 				panic("hammer right edge case\n");
959 			}
960 		}
961 
962 		/*
963 		 * Mark the record and B-Tree entry as deleted.  This will
964 		 * also physically delete the B-Tree entry, record, and
965 		 * data if the retention policy dictates.  The function
966 		 * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
967 		 * uses to perform a fixup.
968 		 */
969 		error = hammer_ip_delete_record(&cursor, trans->tid);
970 		if (error)
971 			break;
972 		error = hammer_ip_next(&cursor);
973 	}
974 	hammer_done_cursor(&cursor);
975 	if (error == ENOENT)
976 		error = 0;
977 	return(error);
978 }
979 
980 /*
981  * Delete the record at the current cursor
982  */
983 int
984 hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid)
985 {
986 	hammer_btree_elm_t elm;
987 	hammer_mount_t hmp;
988 	int error;
989 
990 	/*
991 	 * In-memory (unsynchronized) records can simply be freed.
992 	 */
993 	cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
994 	if (cursor->record == &cursor->iprec->rec) {
995 		hammer_free_mem_record(cursor->iprec);
996 		return(0);
997 	}
998 
999 	/*
1000 	 * On-disk records are marked as deleted by updating their delete_tid.
1001 	 */
1002 	error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
1003 	elm = NULL;
1004 	hmp = cursor->node->cluster->volume->hmp;
1005 
1006 	if (error == 0) {
1007 		elm = &cursor->node->ondisk->elms[cursor->index];
1008 		cursor->record->base.base.delete_tid = tid;
1009 		elm->leaf.base.delete_tid = tid;
1010 		hammer_modify_buffer(cursor->record_buffer);
1011 		hammer_modify_node(cursor->node);
1012 	}
1013 
1014 	/*
1015 	 * If we were mounted with the nohistory option, we physically
1016 	 * delete the record.
1017 	 */
1018 	if (error == 0 && (hmp->hflags & HMNT_NOHISTORY)) {
1019 		int32_t rec_offset;
1020 		int32_t data_offset;
1021 		int32_t data_len;
1022 		hammer_cluster_t cluster;
1023 
1024 		rec_offset = elm->leaf.rec_offset;
1025 		data_offset = elm->leaf.data_offset;
1026 		data_len = elm->leaf.data_len;
1027 #if 0
1028 		kprintf("hammer_ip_delete_record: %08x %08x/%d\n",
1029 			rec_offset, data_offset, data_len);
1030 #endif
1031 		cluster = cursor->node->cluster;
1032 		hammer_ref_cluster(cluster);
1033 
1034 		error = hammer_btree_delete(cursor);
1035 		if (error == 0) {
1036 			/*
1037 			 * This forces a fixup for the iteration because
1038 			 * the cursor is now either sitting at the 'next'
1039 			 * element or sitting at the end of a leaf.
1040 			 */
1041 			if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
1042 				cursor->flags |= HAMMER_CURSOR_DELBTREE;
1043 				cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1044 			}
1045 			hammer_free_record(cluster, rec_offset);
1046 			if (data_offset - rec_offset < 0 ||
1047 			    data_offset - rec_offset >= HAMMER_RECORD_SIZE) {
1048 				hammer_free_data(cluster, data_offset,data_len);
1049 			}
1050 		}
1051 		hammer_rel_cluster(cluster, 0);
1052 		if (error) {
1053 			kprintf("hammer_ip_delete_record: unable to physically delete the record!\n");
1054 			error = 0;
1055 		}
1056 	}
1057 	return(error);
1058 }
1059 
1060