1 /*
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 2022 Tomohiro Kusumi <tkusumi@netbsd.org>
5 * Copyright (c) 2011-2022 The DragonFly Project. All rights reserved.
6 *
7 * This code is derived from software contributed to The DragonFly Project
8 * by Matthew Dillon <dillon@dragonflybsd.org>
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in
18 * the documentation and/or other materials provided with the
19 * distribution.
20 * 3. Neither the name of The DragonFly Project nor the names of its
21 * contributors may be used to endorse or promote products derived
22 * from this software without specific, prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
26 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
27 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
28 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
29 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
30 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
31 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
32 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
33 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
34 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 */
37 /*
38 * This subsystem implements most of the core support functions for
39 * the hammer2_chain structure.
40 *
 * Chains are the in-memory version of media objects (volume header, inodes,
42 * indirect blocks, data blocks, etc). Chains represent a portion of the
43 * HAMMER2 topology.
44 *
45 * Chains are no-longer delete-duplicated. Instead, the original in-memory
46 * chain will be moved along with its block reference (e.g. for things like
47 * renames, hardlink operations, modifications, etc), and will be indexed
48 * on a secondary list for flush handling instead of propagating a flag
49 * upward to the root.
50 *
51 * Concurrent front-end operations can still run against backend flushes
52 * as long as they do not cross the current flush boundary. An operation
53 * running above the current flush (in areas not yet flushed) can become
 * part of the current flush while an operation running below the current
55 * flush can become part of the next flush.
56 */
57 /*
58 #include <sys/cdefs.h>
59 #include <sys/param.h>
60 #include <sys/systm.h>
61 #include <sys/types.h>
62 #include <sys/lock.h>
63 #include <sys/buf.h>
64
65 #include <crypto/sha2/sha2.h>
66 */
67
68 #include "hammer2.h"
69
70 static hammer2_chain_t *hammer2_chain_create_indirect(
71 hammer2_chain_t *parent,
72 hammer2_key_t key, int keybits,
73 hammer2_tid_t mtid, int for_type, int *errorp);
74 static int hammer2_chain_delete_obref(hammer2_chain_t *parent,
75 hammer2_chain_t *chain,
76 hammer2_tid_t mtid, int flags,
77 hammer2_blockref_t *obref);
78 static hammer2_chain_t *hammer2_combined_find(
79 hammer2_chain_t *parent,
80 hammer2_blockref_t *base, int count,
81 hammer2_key_t *key_nextp,
82 hammer2_key_t key_beg, hammer2_key_t key_end,
83 hammer2_blockref_t **brefp);
84 static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_chain_t *chain,
85 int depth);
86 /*
87 * There are many degenerate situations where an extreme rate of console
88 * output can occur from warnings and errors. Make sure this output does
89 * not impede operations.
90 */
91 /*
92 static struct krate krate_h2chk = { .freq = 5 };
93 static struct krate krate_h2me = { .freq = 1 };
94 static struct krate krate_h2em = { .freq = 1 };
95 */
96
97 /*
98 * Basic RBTree for chains (core.rbtree).
99 */
100 RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
101
102 int
hammer2_chain_cmp(hammer2_chain_t * chain1,hammer2_chain_t * chain2)103 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
104 {
105 hammer2_key_t c1_beg;
106 hammer2_key_t c1_end;
107 hammer2_key_t c2_beg;
108 hammer2_key_t c2_end;
109
110 /*
111 * Compare chains. Overlaps are not supposed to happen and catch
112 * any software issues early we count overlaps as a match.
113 */
114 c1_beg = chain1->bref.key;
115 c1_end = c1_beg + ((hammer2_key_t)1 << chain1->bref.keybits) - 1;
116 c2_beg = chain2->bref.key;
117 c2_end = c2_beg + ((hammer2_key_t)1 << chain2->bref.keybits) - 1;
118
119 if (c1_end < c2_beg) /* fully to the left */
120 return(-1);
121 if (c1_beg > c2_end) /* fully to the right */
122 return(1);
123 return(0); /* overlap (must not cross edge boundary) */
124 }
125
126 /*
127 * Assert that a chain has no media data associated with it.
128 */
129 static __inline void
hammer2_chain_assert_no_data(hammer2_chain_t * chain)130 hammer2_chain_assert_no_data(hammer2_chain_t *chain)
131 {
132 KKASSERT(chain->dio == NULL);
133 if (chain->bref.type != HAMMER2_BREF_TYPE_VOLUME &&
134 chain->bref.type != HAMMER2_BREF_TYPE_FREEMAP &&
135 chain->data) {
136 panic("hammer2_chain_assert_no_data: chain %p still has data",
137 chain);
138 }
139 }
140
141 /*
142 * Make a chain visible to the flusher. The flusher operates using a top-down
143 * recursion based on the ONFLUSH flag. It locates MODIFIED and UPDATE chains,
144 * flushes them, and updates blocks back to the volume root.
145 *
146 * This routine sets the ONFLUSH flag upward from the triggering chain until
147 * it hits an inode root or the volume root. Inode chains serve as inflection
148 * points, requiring the flusher to bridge across trees. Inodes include
149 * regular inodes, PFS roots (pmp->iroot), and the media super root
150 * (spmp->iroot).
151 */
void
hammer2_chain_setflush(hammer2_chain_t *chain)
{
	hammer2_chain_t *parent;

	/*
	 * Walk upward setting ONFLUSH on each chain, using hand-over-hand
	 * shared spinlocks so that chain->parent remains stable at every
	 * step.  Stop when ONFLUSH is already set (some other thread
	 * already propagated it), at an inode (flush inflection point),
	 * or at the topology root (parent == NULL).
	 */
	if ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
		hammer2_spin_sh(&chain->core.spin);
		while ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
			if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
				break;
			if ((parent = chain->parent) == NULL)
				break;
			/* acquire parent before releasing child */
			hammer2_spin_sh(&parent->core.spin);
			hammer2_spin_unsh(&chain->core.spin);
			chain = parent;
		}
		hammer2_spin_unsh(&chain->core.spin);
	}
}
172
173 /*
174 * Allocate a new disconnected chain element representing the specified
175 * bref. chain->refs is set to 1 and the passed bref is copied to
176 * chain->bref. chain->bytes is derived from the bref.
177 *
178 * chain->pmp inherits pmp unless the chain is an inode (other than the
179 * super-root inode).
180 *
181 * NOTE: Returns a referenced but unlocked (because there is no core) chain.
182 */
183 hammer2_chain_t *
hammer2_chain_alloc(hammer2_dev_t * hmp,hammer2_pfs_t * pmp,hammer2_blockref_t * bref)184 hammer2_chain_alloc(hammer2_dev_t *hmp, hammer2_pfs_t *pmp,
185 hammer2_blockref_t *bref)
186 {
187 hammer2_chain_t *chain;
188 u_int bytes;
189
190 /*
191 * Special case - radix of 0 indicates a chain that does not
192 * need a data reference (context is completely embedded in the
193 * bref).
194 */
195 if ((int)(bref->data_off & HAMMER2_OFF_MASK_RADIX))
196 bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
197 else
198 bytes = 0;
199
200 switch(bref->type) {
201 case HAMMER2_BREF_TYPE_INODE:
202 case HAMMER2_BREF_TYPE_INDIRECT:
203 case HAMMER2_BREF_TYPE_DATA:
204 case HAMMER2_BREF_TYPE_DIRENT:
205 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
206 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
207 case HAMMER2_BREF_TYPE_FREEMAP:
208 case HAMMER2_BREF_TYPE_VOLUME:
209 chain = kmalloc_obj(sizeof(*chain), hmp->mchain,
210 M_WAITOK | M_ZERO);
211 atomic_add_long(&hammer2_chain_allocs, 1);
212 break;
213 case HAMMER2_BREF_TYPE_EMPTY:
214 default:
215 panic("hammer2_chain_alloc: unrecognized blockref type: %d",
216 bref->type);
217 break;
218 }
219
220 /*
221 * Initialize the new chain structure. pmp must be set to NULL for
222 * chains belonging to the super-root topology of a device mount.
223 */
224 if (pmp == hmp->spmp)
225 chain->pmp = NULL;
226 else
227 chain->pmp = pmp;
228
229 chain->hmp = hmp;
230 chain->bref = *bref;
231 chain->bytes = bytes;
232 chain->refs = 1;
233 chain->flags = HAMMER2_CHAIN_ALLOCATED;
234
235 /*
236 * Set the PFS boundary flag if this chain represents a PFS root.
237 */
238 if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
239 atomic_set_int(&chain->flags, HAMMER2_CHAIN_PFSBOUNDARY);
240 hammer2_chain_init(chain);
241
242 return (chain);
243 }
244
245 /*
246 * A common function to initialize chains including fchain and vchain.
247 */
248 void
hammer2_chain_init(hammer2_chain_t * chain)249 hammer2_chain_init(hammer2_chain_t *chain)
250 {
251 RB_INIT(&chain->core.rbtree); /* live chains */
252 hammer2_mtx_init(&chain->lock, "h2chain");
253 hammer2_spin_init(&chain->core.spin, "h2chain");
254 lockinit(&chain->diolk, "chdio", 0, 0);
255 }
256
257 /*
258 * Add a reference to a chain element, preventing its destruction.
259 * Undone via hammer2_chain_drop()
260 *
261 * (can be called with spinlock held)
262 */
263 void
hammer2_chain_ref(hammer2_chain_t * chain)264 hammer2_chain_ref(hammer2_chain_t *chain)
265 {
266 if (atomic_fetchadd_int(&chain->refs, 1) == 0) {
267 /* NOP */
268 }
269 }
270
271 /*
272 * Ref a locked chain and force the data to be held across an unlock.
273 * Chain must be currently locked. The user of the chain who desires
274 * to release the hold must call hammer2_chain_lock_unhold() to relock
275 * and unhold the chain, then unlock normally, or may simply call
276 * hammer2_chain_drop_unhold() (which is safer against deadlocks).
277 */
void
hammer2_chain_ref_hold(hammer2_chain_t *chain)
{
	/* bump the data-persistence hold first, then gain the ref */
	atomic_add_int(&chain->lockcnt, 1);
	hammer2_chain_ref(chain);
}
284
285 /*
286 * Insert the chain in the core rbtree.
287 *
288 * Normal insertions are placed in the live rbtree. Insertion of a deleted
289 * chain is a special case used by the flush code that is placed on the
290 * unstaged deleted list to avoid confusing the live view.
291 */
292 #define HAMMER2_CHAIN_INSERT_SPIN 0x0001
293 #define HAMMER2_CHAIN_INSERT_LIVE 0x0002
294 #define HAMMER2_CHAIN_INSERT_RACE 0x0004
295
296 static
297 int
hammer2_chain_insert(hammer2_chain_t * parent,hammer2_chain_t * chain,int flags,int generation)298 hammer2_chain_insert(hammer2_chain_t *parent, hammer2_chain_t *chain,
299 int flags, int generation)
300 {
301 hammer2_chain_t *xchain __debugvar;
302 int error = 0;
303
304 if (flags & HAMMER2_CHAIN_INSERT_SPIN)
305 hammer2_spin_ex(&parent->core.spin);
306
307 /*
308 * Interlocked by spinlock, check for race
309 */
310 if ((flags & HAMMER2_CHAIN_INSERT_RACE) &&
311 parent->core.generation != generation) {
312 error = HAMMER2_ERROR_EAGAIN;
313 goto failed;
314 }
315
316 /*
317 * Insert chain
318 */
319 xchain = RB_INSERT(hammer2_chain_tree, &parent->core.rbtree, chain);
320 KASSERT(xchain == NULL,
321 ("hammer2_chain_insert: collision %p %p (key=%016jx)",
322 chain, xchain, chain->bref.key));
323 atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
324 chain->parent = parent;
325 ++parent->core.chain_count;
326 ++parent->core.generation; /* XXX incs for _get() too, XXX */
327
328 /*
329 * We have to keep track of the effective live-view blockref count
330 * so the create code knows when to push an indirect block.
331 */
332 if (flags & HAMMER2_CHAIN_INSERT_LIVE)
333 atomic_add_int(&parent->core.live_count, 1);
334 failed:
335 if (flags & HAMMER2_CHAIN_INSERT_SPIN)
336 hammer2_spin_unex(&parent->core.spin);
337 return error;
338 }
339
340 /*
341 * Drop the caller's reference to the chain. When the ref count drops to
342 * zero this function will try to disassociate the chain from its parent and
 * deallocate it, then recursively drop the parent using the implied ref
344 * from the chain's chain->parent.
345 *
346 * Nobody should own chain's mutex on the 1->0 transition, unless this drop
347 * races an acquisition by another cpu. Therefore we can loop if we are
348 * unable to acquire the mutex, and refs is unlikely to be 1 unless we again
349 * race against another drop.
350 */
void
hammer2_chain_drop(hammer2_chain_t *chain)
{
	u_int refs;

	KKASSERT(chain->refs > 0);

	while (chain) {
		refs = chain->refs;
		cpu_ccfence();
		KKASSERT(refs > 0);

		if (refs == 1) {
			/*
			 * Possible 1->0 transition.  Acquire the mutex to
			 * interlock, then let lastdrop do the work.  It
			 * returns NULL when done, the same chain to retry,
			 * or a parent chain requiring a recursive drop.
			 */
			if (hammer2_mtx_ex_try(&chain->lock) == 0)
				chain = hammer2_chain_lastdrop(chain, 0);
			/* retry the same chain, or chain from lastdrop */
		} else {
			/* normal ref decrement, raced by other cpus */
			if (atomic_cmpset_int(&chain->refs, refs, refs - 1))
				break;
			/* retry the same chain */
		}
		cpu_pause();
	}
}
375
376 /*
377 * Unhold a held and probably not-locked chain, ensure that the data is
378 * dropped on the 1->0 transition of lockcnt by obtaining an exclusive
379 * lock and then simply unlocking the chain.
380 */
void
hammer2_chain_unhold(hammer2_chain_t *chain)
{
	u_int lockcnt;
	int iter = 0;

	for (;;) {
		lockcnt = chain->lockcnt;
		cpu_ccfence();
		if (lockcnt > 1) {
			/* not the last hold, just decrement */
			if (atomic_cmpset_int(&chain->lockcnt,
					      lockcnt, lockcnt - 1)) {
				break;
			}
		} else if (hammer2_mtx_ex_try(&chain->lock) == 0) {
			/*
			 * Last hold (lockcnt == 1).  With the mutex held
			 * the normal unlock path disposes of the data on
			 * the 1->0 transition.
			 */
			hammer2_chain_unlock(chain);
			break;
		} else {
			/*
			 * This situation can easily occur on SMP due to
			 * the gap inbetween the 1->0 transition and the
			 * final unlock.  We cannot safely block on the
			 * mutex because lockcnt might go above 1.
			 *
			 * XXX Sleep for one tick if it takes too long.
			 */
			if (++iter > 1000) {
				if (iter > 1000 + hz) {
					kprintf("hammer2: h2race1 %p\n", chain);
					iter = 1000;
				}
				tsleep(&iter, 0, "h2race1", 1);
			}
			cpu_pause();
		}
	}
}
418
/*
 * Release a hold and then drop the caller's ref, in that order.
 */
void
hammer2_chain_drop_unhold(hammer2_chain_t *chain)
{
	hammer2_chain_unhold(chain);
	hammer2_chain_drop(chain);
}
425
/*
 * Re-establish a data hold on an unheld chain: a shared lock
 * (re)resolves the data, the extra lockcnt keeps it valid past the
 * unlock.
 */
void
hammer2_chain_rehold(hammer2_chain_t *chain)
{
	hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED);
	atomic_add_int(&chain->lockcnt, 1);
	hammer2_chain_unlock(chain);
}
433
434 /*
435 * Handles the (potential) last drop of chain->refs from 1->0. Called with
436 * the mutex exclusively locked, refs == 1, and lockcnt 0. SMP races are
437 * possible against refs and lockcnt. We must dispose of the mutex on chain.
438 *
439 * This function returns an unlocked chain for recursive drop or NULL. It
440 * can return the same chain if it determines it has raced another ref.
441 *
442 * --
443 *
444 * When two chains need to be recursively dropped we use the chain we
445 * would otherwise free to placehold the additional chain. It's a bit
446 * convoluted but we can't just recurse without potentially blowing out
447 * the kernel stack.
448 *
449 * The chain cannot be freed if it has any children.
450 * The chain cannot be freed if flagged MODIFIED unless we can dispose of it.
451 * The chain cannot be freed if flagged UPDATE unless we can dispose of it.
452 * Any dedup registration can remain intact.
453 *
454 * The core spinlock is allowed to nest child-to-parent (not parent-to-child).
455 */
456 static
457 hammer2_chain_t *
hammer2_chain_lastdrop(hammer2_chain_t * chain,int depth)458 hammer2_chain_lastdrop(hammer2_chain_t *chain, int depth)
459 {
460 hammer2_dev_t *hmp;
461 hammer2_chain_t *parent;
462 hammer2_chain_t *rdrop;
463
464 /*
465 * We need chain's spinlock to interlock the sub-tree test.
466 * We already have chain's mutex, protecting chain->parent.
467 *
468 * Remember that chain->refs can be in flux.
469 */
470 hammer2_spin_ex(&chain->core.spin);
471
472 if (chain->parent != NULL) {
473 /*
474 * If the chain has a parent the UPDATE bit prevents scrapping
475 * as the chain is needed to properly flush the parent. Try
476 * to complete the 1->0 transition and return NULL. Retry
477 * (return chain) if we are unable to complete the 1->0
478 * transition, else return NULL (nothing more to do).
479 *
480 * If the chain has a parent the MODIFIED bit prevents
481 * scrapping.
482 *
483 * Chains with UPDATE/MODIFIED are *not* put on the LRU list!
484 */
485 if (chain->flags & (HAMMER2_CHAIN_UPDATE |
486 HAMMER2_CHAIN_MODIFIED)) {
487 if (atomic_cmpset_int(&chain->refs, 1, 0)) {
488 hammer2_spin_unex(&chain->core.spin);
489 hammer2_chain_assert_no_data(chain);
490 hammer2_mtx_unlock(&chain->lock);
491 chain = NULL;
492 } else {
493 hammer2_spin_unex(&chain->core.spin);
494 hammer2_mtx_unlock(&chain->lock);
495 }
496 return (chain);
497 }
498 /* spinlock still held */
499 } else if (chain->bref.type == HAMMER2_BREF_TYPE_VOLUME ||
500 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP) {
501 /*
502 * Retain the static vchain and fchain. Clear bits that
503 * are not relevant. Do not clear the MODIFIED bit,
504 * and certainly do not put it on the delayed-flush queue.
505 */
506 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
507 } else {
508 /*
509 * The chain has no parent and can be flagged for destruction.
510 * Since it has no parent, UPDATE can also be cleared.
511 */
512 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);
513 if (chain->flags & HAMMER2_CHAIN_UPDATE)
514 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
515
516 /*
517 * If the chain has children we must propagate the DESTROY
518 * flag downward and rip the disconnected topology apart.
519 * This is accomplished by calling hammer2_flush() on the
520 * chain.
521 *
522 * Any dedup is already handled by the underlying DIO, so
523 * we do not have to specifically flush it here.
524 */
525 if (chain->core.chain_count) {
526 hammer2_spin_unex(&chain->core.spin);
527 hammer2_flush(chain, HAMMER2_FLUSH_TOP |
528 HAMMER2_FLUSH_ALL);
529 hammer2_mtx_unlock(&chain->lock);
530
531 return(chain); /* retry drop */
532 }
533
534 /*
535 * Otherwise we can scrap the MODIFIED bit if it is set,
536 * and continue along the freeing path.
537 *
538 * Be sure to clean-out any dedup bits. Without a parent
539 * this chain will no longer be visible to the flush code.
540 * Easy check data_off to avoid the volume root.
541 */
542 if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
543 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
544 atomic_add_long(&hammer2_count_modified_chains, -1);
545 if (chain->pmp)
546 hammer2_pfs_memory_wakeup(chain->pmp, -1);
547 }
548 /* spinlock still held */
549 }
550
551 /* spinlock still held */
552
553 /*
554 * If any children exist we must leave the chain intact with refs == 0.
555 * They exist because chains are retained below us which have refs or
556 * may require flushing.
557 *
558 * Retry (return chain) if we fail to transition the refs to 0, else
559 * return NULL indication nothing more to do.
560 *
561 * Chains with children are NOT put on the LRU list.
562 */
563 if (chain->core.chain_count) {
564 if (atomic_cmpset_int(&chain->refs, 1, 0)) {
565 hammer2_spin_unex(&chain->core.spin);
566 hammer2_chain_assert_no_data(chain);
567 hammer2_mtx_unlock(&chain->lock);
568 chain = NULL;
569 } else {
570 hammer2_spin_unex(&chain->core.spin);
571 hammer2_mtx_unlock(&chain->lock);
572 }
573 return (chain);
574 }
575 /* spinlock still held */
576 /* no chains left under us */
577
578 /*
579 * chain->core has no children left so no accessors can get to our
580 * chain from there. Now we have to lock the parent core to interlock
581 * remaining possible accessors that might bump chain's refs before
582 * we can safely drop chain's refs with intent to free the chain.
583 */
584 hmp = chain->hmp;
585 rdrop = NULL;
586
587 parent = chain->parent;
588
589 /*
590 * WARNING! chain's spin lock is still held here, and other spinlocks
591 * will be acquired and released in the code below. We
592 * cannot be making fancy procedure calls!
593 */
594
595 /*
596 * Spinlock the parent and try to drop the last ref on chain.
597 * On success determine if we should dispose of the chain
598 * (remove the chain from its parent, etc).
599 *
600 * (normal core locks are top-down recursive but we define
601 * core spinlocks as bottom-up recursive, so this is safe).
602 */
603 if (parent) {
604 hammer2_spin_ex(&parent->core.spin);
605 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) {
606 /*
607 * 1->0 transition failed, retry.
608 */
609 hammer2_spin_unex(&parent->core.spin);
610 hammer2_spin_unex(&chain->core.spin);
611 hammer2_mtx_unlock(&chain->lock);
612
613 return(chain);
614 }
615
616 /*
617 * 1->0 transition successful, parent spin held to prevent
618 * new lookups, chain spinlock held to protect parent field.
619 * Remove chain from the parent.
620 *
621 * If the chain is being removed from the parent's rbtree but
622 * is not blkmapped, we have to adjust live_count downward. If
623 * it is blkmapped then the blockref is retained in the parent
624 * as is its associated live_count. This case can occur when
625 * a chain added to the topology is unable to flush and is
626 * then later deleted.
627 */
628 if (chain->flags & HAMMER2_CHAIN_ONRBTREE) {
629 if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) &&
630 (chain->flags & HAMMER2_CHAIN_BLKMAPPED) == 0) {
631 atomic_add_int(&parent->core.live_count, -1);
632 }
633 RB_REMOVE(hammer2_chain_tree,
634 &parent->core.rbtree, chain);
635 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
636 --parent->core.chain_count;
637 chain->parent = NULL;
638 }
639
640 /*
641 * If our chain was the last chain in the parent's core the
642 * core is now empty and its parent might have to be
643 * re-dropped if it has 0 refs.
644 */
645 if (parent->core.chain_count == 0) {
646 rdrop = parent;
647 atomic_add_int(&rdrop->refs, 1);
648 /*
649 if (atomic_cmpset_int(&rdrop->refs, 0, 1) == 0)
650 rdrop = NULL;
651 */
652 }
653 hammer2_spin_unex(&parent->core.spin);
654 parent = NULL; /* safety */
655 /* FALL THROUGH */
656 } else {
657 /*
658 * No-parent case.
659 */
660 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) {
661 /*
662 * 1->0 transition failed, retry.
663 */
664 hammer2_spin_unex(&parent->core.spin);
665 hammer2_spin_unex(&chain->core.spin);
666 hammer2_mtx_unlock(&chain->lock);
667
668 return(chain);
669 }
670 }
671
672 /*
673 * Successful 1->0 transition, no parent, no children... no way for
674 * anyone to ref this chain any more. We can clean-up and free it.
675 *
676 * We still have the core spinlock, and core's chain_count is 0.
677 * Any parent spinlock is gone.
678 */
679 hammer2_spin_unex(&chain->core.spin);
680 hammer2_chain_assert_no_data(chain);
681 hammer2_mtx_unlock(&chain->lock);
682 KKASSERT(RB_EMPTY(&chain->core.rbtree) &&
683 chain->core.chain_count == 0);
684
685 /*
686 * All locks are gone, no pointers remain to the chain, finish
687 * freeing it.
688 */
689 KKASSERT((chain->flags & (HAMMER2_CHAIN_UPDATE |
690 HAMMER2_CHAIN_MODIFIED)) == 0);
691
692 /*
693 * Once chain resources are gone we can use the now dead chain
694 * structure to placehold what might otherwise require a recursive
695 * drop, because we have potentially two things to drop and can only
696 * return one directly.
697 */
698 if (chain->flags & HAMMER2_CHAIN_ALLOCATED) {
699 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ALLOCATED);
700 chain->hmp = NULL;
701 kfree_obj(chain, hmp->mchain);
702 atomic_add_long(&hammer2_chain_allocs, -1);
703 }
704
705 /*
706 * Possible chaining loop when parent re-drop needed.
707 */
708 return(rdrop);
709 }
710
711 /*
712 * On last lock release.
713 */
714 static hammer2_io_t *
hammer2_chain_drop_data(hammer2_chain_t * chain)715 hammer2_chain_drop_data(hammer2_chain_t *chain)
716 {
717 hammer2_io_t *dio;
718
719 if ((dio = chain->dio) != NULL) {
720 chain->dio = NULL;
721 chain->data = NULL;
722 } else {
723 switch(chain->bref.type) {
724 case HAMMER2_BREF_TYPE_VOLUME:
725 case HAMMER2_BREF_TYPE_FREEMAP:
726 break;
727 default:
728 if (chain->data != NULL) {
729 hammer2_spin_unex(&chain->core.spin);
730 panic("chain data not null: "
731 "chain %p bref %016jx.%02x "
732 "refs %d parent %p dio %p data %p",
733 chain, chain->bref.data_off,
734 chain->bref.type, chain->refs,
735 chain->parent,
736 chain->dio, chain->data);
737 }
738 KKASSERT(chain->data == NULL);
739 break;
740 }
741 }
742 return dio;
743 }
744
745 /*
746 * Lock a referenced chain element, acquiring its data with I/O if necessary,
747 * and specify how you would like the data to be resolved.
748 *
749 * If an I/O or other fatal error occurs, chain->error will be set to non-zero.
750 *
751 * The lock is allowed to recurse, multiple locking ops will aggregate
752 * the requested resolve types. Once data is assigned it will not be
753 * removed until the last unlock.
754 *
755 * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
756 * (typically used to avoid device/logical buffer
757 * aliasing for data)
758 *
759 * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
760 * the INITIAL-create state (indirect blocks only).
761 *
762 * Do not resolve data elements for DATA chains.
763 * (typically used to avoid device/logical buffer
764 * aliasing for data)
765 *
766 * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
767 *
768 * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise
769 * it will be locked exclusive.
770 *
771 * HAMMER2_RESOLVE_NONBLOCK- (flag) The chain is locked non-blocking. If
772 * the lock fails, EAGAIN is returned.
773 *
774 * NOTE: Embedded elements (volume header, inodes) are always resolved
775 * regardless.
776 *
777 * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
778 * element will instantiate and zero its buffer, and flush it on
779 * release.
780 *
781 * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
782 * so as not to instantiate a device buffer, which could alias against
783 * a logical file buffer. However, if ALWAYS is specified the
784 * device buffer will be instantiated anyway.
785 *
786 * NOTE: The return value is always 0 unless NONBLOCK is specified, in which
787 * case it can be either 0 or EAGAIN.
788 *
789 * WARNING! This function blocks on I/O if data needs to be fetched. This
790 * blocking can run concurrent with other compatible lock holders
791 * who do not need data returning. The lock is not upgraded to
792 * exclusive during a data fetch, a separate bit is used to
793 * interlock I/O. However, an exclusive lock holder can still count
794 * on being interlocked against an I/O fetch managed by a shared
795 * lock holder.
796 */
int
hammer2_chain_lock(hammer2_chain_t *chain, int how)
{
	KKASSERT(chain->refs > 0);

	if (how & HAMMER2_RESOLVE_NONBLOCK) {
		/*
		 * We still have to bump lockcnt before acquiring the lock,
		 * even for non-blocking operation, because the unlock code
		 * live-loops on lockcnt == 1 when dropping the last lock.
		 *
		 * If the non-blocking operation fails we have to use an
		 * unhold sequence to undo the mess.
		 *
		 * NOTE: LOCKAGAIN must always succeed without blocking,
		 *	 even if NONBLOCK is specified.
		 */
		atomic_add_int(&chain->lockcnt, 1);
		if (how & HAMMER2_RESOLVE_SHARED) {
			if (how & HAMMER2_RESOLVE_LOCKAGAIN) {
				/* caller already holds a shared lock */
				hammer2_mtx_sh_again(&chain->lock);
			} else {
				if (hammer2_mtx_sh_try(&chain->lock) != 0) {
					hammer2_chain_unhold(chain);
					return EAGAIN;
				}
			}
		} else {
			if (hammer2_mtx_ex_try(&chain->lock) != 0) {
				hammer2_chain_unhold(chain);
				return EAGAIN;
			}
		}
	} else {
		/*
		 * Get the appropriate lock.  If LOCKAGAIN is flagged with
		 * SHARED the caller expects a shared lock to already be
		 * present and we are giving it another ref.  This case must
		 * importantly not block if there is a pending exclusive lock
		 * request.
		 */
		atomic_add_int(&chain->lockcnt, 1);
		if (how & HAMMER2_RESOLVE_SHARED) {
			if (how & HAMMER2_RESOLVE_LOCKAGAIN) {
				hammer2_mtx_sh_again(&chain->lock);
			} else {
				hammer2_mtx_sh(&chain->lock);
			}
		} else {
			hammer2_mtx_ex(&chain->lock);
		}
	}

	/*
	 * If we already have a valid data pointer make sure the data is
	 * synchronized to the current cpu, and then no further action is
	 * necessary.
	 */
	if (chain->data) {
		if (chain->dio)
			hammer2_io_bkvasync(chain->dio);
		return 0;
	}

	/*
	 * Do we have to resolve the data?  This is generally only
	 * applicable to HAMMER2_BREF_TYPE_DATA which is special-cased.
	 * Other BREF types expect the data to be there.
	 */
	switch(how & HAMMER2_RESOLVE_MASK) {
	case HAMMER2_RESOLVE_NEVER:
		return 0;
	case HAMMER2_RESOLVE_MAYBE:
		/* skip data resolution for INITIAL-state and DATA chains */
		if (chain->flags & HAMMER2_CHAIN_INITIAL)
			return 0;
		if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
			return 0;
#if 0
		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE)
			return 0;
		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF)
			return 0;
#endif
		/* fall through */
	case HAMMER2_RESOLVE_ALWAYS:
	default:
		break;
	}

	/*
	 * Caller requires data; may block on I/O.
	 */
	hammer2_chain_load_data(chain);

	return 0;
}
893
894 #if 0
895 /*
896 * Lock the chain, retain the hold, and drop the data persistence count.
897 * The data should remain valid because we never transitioned lockcnt
898 * through 0.
899 */
900 void
901 hammer2_chain_lock_unhold(hammer2_chain_t *chain, int how)
902 {
903 hammer2_chain_lock(chain, how);
904 atomic_add_int(&chain->lockcnt, -1);
905 }
906
907 /*
908 * Downgrade an exclusive chain lock to a shared chain lock.
909 *
910 * NOTE: There is no upgrade equivalent due to the ease of
911 * deadlocks in that direction.
912 */
913 void
914 hammer2_chain_lock_downgrade(hammer2_chain_t *chain)
915 {
916 hammer2_mtx_downgrade(&chain->lock);
917 }
918 #endif
919
920 /*
921 * Issue I/O and install chain->data. Caller must hold a chain lock, lock
922 * may be of any type.
923 *
924 * Once chain->data is set it cannot be disposed of until all locks are
925 * released.
926 *
927 * Make sure the data is synchronized to the current cpu.
928 */
void
hammer2_chain_load_data(hammer2_chain_t *chain)
{
	hammer2_blockref_t *bref;
	hammer2_dev_t *hmp;
	hammer2_io_t *dio;
	char *bdata;
	int error;

	/*
	 * Degenerate case, data already present, or chain has no media
	 * reference to load.
	 *
	 * The lock may be shared or exclusive; MTX_MASK covers both.
	 */
	KKASSERT(chain->lock.mtx_lock & MTX_MASK);
	if (chain->data) {
		if (chain->dio)
			hammer2_io_bkvasync(chain->dio);
		return;
	}
	if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0)
		return;

	hmp = chain->hmp;
	KKASSERT(hmp != NULL);

	/*
	 * Gain the IOINPROG bit, interlocked block.
	 *
	 * Only one thread may perform the media load at a time.  If
	 * another thread owns IOINPROG we request a wakeup (IOSIGNAL)
	 * and sleep, then retry from the top.
	 */
	for (;;) {
		u_int oflags;
		u_int nflags;

		oflags = chain->flags;
		cpu_ccfence();
		if (oflags & HAMMER2_CHAIN_IOINPROG) {
			/* another thread is loading; wait to be signaled */
			nflags = oflags | HAMMER2_CHAIN_IOSIGNAL;
			tsleep_interlock(&chain->flags, 0);
			if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
				tsleep(&chain->flags, PINTERLOCKED,
				       "h2iocw", 0);
			}
			/* retry */
		} else {
			nflags = oflags | HAMMER2_CHAIN_IOINPROG;
			if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
				break;
			}
			/* retry */
		}
	}

	/*
	 * We own CHAIN_IOINPROG
	 *
	 * Degenerate case if we raced another load.
	 */
	if (chain->data) {
		if (chain->dio)
			hammer2_io_bkvasync(chain->dio);
		goto done;
	}

	/*
	 * We must resolve to a device buffer, either by issuing I/O or
	 * by creating a zero-fill element.  We do not mark the buffer
	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
	 * API must still be used to do that).
	 *
	 * The device buffer is variable-sized in powers of 2 down
	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
	 * chunk always contains buffers of the same size. (XXX)
	 *
	 * The minimum physical IO size may be larger than the variable
	 * block size.
	 */
	bref = &chain->bref;

	/*
	 * The getblk() optimization can only be used on newly created
	 * elements if the physical block size matches the request.
	 */
	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		/* freshly allocated, no media read necessary */
		error = hammer2_io_new(hmp, bref->type,
				       bref->data_off, chain->bytes,
				       &chain->dio);
	} else {
		error = hammer2_io_bread(hmp, bref->type,
					 bref->data_off, chain->bytes,
					 &chain->dio);
		hammer2_adjreadcounter(chain->bref.type, chain->bytes);
	}
	if (error) {
		chain->error = HAMMER2_ERROR_EIO;
		kprintf("hammer2_chain_load_data: I/O error %016jx: %d\n",
			(intmax_t)bref->data_off, error);
		hammer2_io_bqrelse(&chain->dio);
		goto done;
	}
	chain->error = 0;

	/*
	 * This isn't perfect and can be ignored on OSs which do not have
	 * an indication as to whether a buffer is coming from cache or
	 * if I/O was actually issued for the read.  TESTEDGOOD will work
	 * pretty well without the B_IOISSUED logic because chains are
	 * cached, but in that situation (without B_IOISSUED) it will not
	 * detect whether a re-read via I/O is corrupted verses the original
	 * read.
	 *
	 * We can't re-run the CRC on every fresh lock.  That would be
	 * insanely expensive.
	 *
	 * If the underlying kernel buffer covers the entire chain we can
	 * use the B_IOISSUED indication to determine if we have to re-run
	 * the CRC on chain data for chains that managed to stay cached
	 * across the kernel disposal of the original buffer.
	 */
	if ((dio = chain->dio) != NULL && dio->bp) {
		//struct m_buf *bp = dio->bp;

		/* B_IOISSUED check disabled in this port */
		if (dio->psize == chain->bytes //&&
		    /*(bp->b_flags & B_IOISSUED)*/) {
			atomic_clear_int(&chain->flags,
					 HAMMER2_CHAIN_TESTEDGOOD);
			//bp->b_flags &= ~B_IOISSUED;
		}
	}

	/*
	 * NOTE: A locked chain's data cannot be modified without first
	 *	 calling hammer2_chain_modify().
	 */

	/*
	 * NOTE: hammer2_io_data() call issues bkvasync()
	 */
	bdata = hammer2_io_data(chain->dio, chain->bref.data_off);

	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		/*
		 * Clear INITIAL.  In this case we used io_new() and the
		 * buffer has been zero'd and marked dirty.
		 *
		 * CHAIN_MODIFIED has not been set yet, and we leave it
		 * that way for now.  Set a temporary CHAIN_NOTTESTED flag
		 * to prevent hammer2_chain_testcheck() from trying to match
		 * a check code that has not yet been generated.  This bit
		 * should NOT end up on the actual media.
		 */
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_NOTTESTED);
	} else if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
		/*
		 * check data not currently synchronized due to
		 * modification.  XXX assumes data stays in the buffer
		 * cache, which might not be true (need biodep on flush
		 * to calculate crc?  or simple crc?).
		 */
	} else if ((chain->flags & HAMMER2_CHAIN_TESTEDGOOD) == 0) {
		/* first validation of this media data; run the check code */
		if (hammer2_chain_testcheck(chain, bdata) == 0) {
			chain->error = HAMMER2_ERROR_CHECK;
		} else {
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_TESTEDGOOD);
		}
	}

	/*
	 * Setup the data pointer by pointing it into the buffer.
	 * WARNING! Other threads can start using the data the instant we
	 *	    set chain->data non-NULL.
	 */
	switch (bref->type) {
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		/* these are resolved at mount time, never here */
		panic("hammer2_chain_load_data: unresolved volume header");
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		KKASSERT(chain->bytes != 0);
		/* fall through */
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	default:
		/*
		 * Point data at the device buffer and leave dio intact.
		 */
		chain->data = (void *)bdata;
		break;
	}

	/*
	 * Release HAMMER2_CHAIN_IOINPROG and signal waiters if requested.
	 */
done:
	for (;;) {
		u_int oflags;
		u_int nflags;

		oflags = chain->flags;
		nflags = oflags & ~(HAMMER2_CHAIN_IOINPROG |
				    HAMMER2_CHAIN_IOSIGNAL);
		KKASSERT(oflags & HAMMER2_CHAIN_IOINPROG);
		if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
			if (oflags & HAMMER2_CHAIN_IOSIGNAL)
				wakeup(&chain->flags);
			break;
		}
	}
}
1140
1141 /*
1142 * Unlock and deref a chain element.
1143 *
1144 * Remember that the presence of children under chain prevent the chain's
1145 * destruction but do not add additional references, so the dio will still
1146 * be dropped.
1147 */
void
hammer2_chain_unlock(hammer2_chain_t *chain)
{
	hammer2_io_t *dio;
	u_int lockcnt;
	int iter = 0;

	/*
	 * If multiple locks are present (or being attempted) on this
	 * particular chain we can just unlock, drop refs, and return.
	 *
	 * Otherwise fall-through on the 1->0 transition.
	 */
	for (;;) {
		lockcnt = chain->lockcnt;
		KKASSERT(lockcnt > 0);
		cpu_ccfence();
		if (lockcnt > 1) {
			/* not the last lock, simple decrement + unlock */
			if (atomic_cmpset_int(&chain->lockcnt,
					      lockcnt, lockcnt - 1)) {
				hammer2_mtx_unlock(&chain->lock);
				return;
			}
		} else if (hammer2_mtx_upgrade_try(&chain->lock) == 0) {
			/* while holding the mutex exclusively */
			/* 1->0 transition can only be done exclusively */
			if (atomic_cmpset_int(&chain->lockcnt, 1, 0))
				break;
		} else {
			/*
			 * This situation can easily occur on SMP due to
			 * the gap inbetween the 1->0 transition and the
			 * final unlock.  We cannot safely block on the
			 * mutex because lockcnt might go above 1.
			 *
			 * XXX Sleep for one tick if it takes too long.
			 */
			if (++iter > 1000) {
				if (iter > 1000 + hz) {
					kprintf("hammer2: h2race2 %p\n", chain);
					iter = 1000;
				}
				tsleep(&iter, 0, "h2race2", 1);
			}
			cpu_pause();
		}
		/* retry */
	}

	/*
	 * Last unlock / mutex upgraded to exclusive.  Drop the data
	 * reference.
	 */
	dio = hammer2_chain_drop_data(chain);
	if (dio)
		hammer2_io_bqrelse(&dio);
	hammer2_mtx_unlock(&chain->lock);
}
1205
1206 #if 0
1207 /*
1208 * Unlock and hold chain data intact
1209 */
void
hammer2_chain_unlock_hold(hammer2_chain_t *chain)
{
	/*
	 * Bump lockcnt so hammer2_chain_unlock() does not perform the
	 * 1->0 transition, leaving chain->data (and the dio) intact.
	 */
	atomic_add_int(&chain->lockcnt, 1);
	hammer2_chain_unlock(chain);
}
1216 #endif
1217
1218 /*
1219 * Helper to obtain the blockref[] array base and count for a chain.
1220 *
1221 * XXX Not widely used yet, various use cases need to be validated and
1222 * converted to use this function.
1223 */
1224 static
1225 hammer2_blockref_t *
hammer2_chain_base_and_count(hammer2_chain_t * parent,int * countp)1226 hammer2_chain_base_and_count(hammer2_chain_t *parent, int *countp)
1227 {
1228 hammer2_blockref_t *base;
1229 int count;
1230
1231 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1232 base = NULL;
1233
1234 switch(parent->bref.type) {
1235 case HAMMER2_BREF_TYPE_INODE:
1236 count = HAMMER2_SET_COUNT;
1237 break;
1238 case HAMMER2_BREF_TYPE_INDIRECT:
1239 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1240 count = parent->bytes / sizeof(hammer2_blockref_t);
1241 break;
1242 case HAMMER2_BREF_TYPE_VOLUME:
1243 count = HAMMER2_SET_COUNT;
1244 break;
1245 case HAMMER2_BREF_TYPE_FREEMAP:
1246 count = HAMMER2_SET_COUNT;
1247 break;
1248 default:
1249 panic("hammer2_chain_base_and_count: "
1250 "unrecognized blockref type: %d",
1251 parent->bref.type);
1252 count = 0;
1253 break;
1254 }
1255 } else {
1256 switch(parent->bref.type) {
1257 case HAMMER2_BREF_TYPE_INODE:
1258 base = &parent->data->ipdata.u.blockset.blockref[0];
1259 count = HAMMER2_SET_COUNT;
1260 break;
1261 case HAMMER2_BREF_TYPE_INDIRECT:
1262 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1263 base = &parent->data->npdata[0];
1264 count = parent->bytes / sizeof(hammer2_blockref_t);
1265 break;
1266 case HAMMER2_BREF_TYPE_VOLUME:
1267 base = &parent->data->voldata.
1268 sroot_blockset.blockref[0];
1269 count = HAMMER2_SET_COUNT;
1270 break;
1271 case HAMMER2_BREF_TYPE_FREEMAP:
1272 base = &parent->data->blkset.blockref[0];
1273 count = HAMMER2_SET_COUNT;
1274 break;
1275 default:
1276 panic("hammer2_chain_base_and_count: "
1277 "unrecognized blockref type: %d",
1278 parent->bref.type);
1279 base = NULL;
1280 count = 0;
1281 break;
1282 }
1283 }
1284 *countp = count;
1285
1286 return base;
1287 }
1288
1289 /*
1290 * This counts the number of live blockrefs in a block array and
1291 * also calculates the point at which all remaining blockrefs are empty.
1292 * This routine can only be called on a live chain.
1293 *
1294 * Caller holds the chain locked, but possibly with a shared lock. We
1295 * must use an exclusive spinlock to prevent corruption.
1296 *
1297 * NOTE: Flag is not set until after the count is complete, allowing
1298 * callers to test the flag without holding the spinlock.
1299 *
1300 * NOTE: If base is NULL the related chain is still in the INITIAL
1301 * state and there are no blockrefs to count.
1302 *
1303 * NOTE: live_count may already have some counts accumulated due to
1304 * creation and deletion and could even be initially negative.
1305 */
1306 void
hammer2_chain_countbrefs(hammer2_chain_t * chain,hammer2_blockref_t * base,int count)1307 hammer2_chain_countbrefs(hammer2_chain_t *chain,
1308 hammer2_blockref_t *base, int count)
1309 {
1310 hammer2_spin_ex(&chain->core.spin);
1311 if ((chain->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) {
1312 if (base) {
1313 while (--count >= 0) {
1314 if (base[count].type != HAMMER2_BREF_TYPE_EMPTY)
1315 break;
1316 }
1317 chain->core.live_zero = count + 1;
1318 while (count >= 0) {
1319 if (base[count].type != HAMMER2_BREF_TYPE_EMPTY)
1320 atomic_add_int(&chain->core.live_count,
1321 1);
1322 --count;
1323 }
1324 } else {
1325 chain->core.live_zero = 0;
1326 }
1327 /* else do not modify live_count */
1328 atomic_set_int(&chain->flags, HAMMER2_CHAIN_COUNTEDBREFS);
1329 }
1330 hammer2_spin_unex(&chain->core.spin);
1331 }
1332
1333 /*
1334 * Resize the chain's physical storage allocation in-place. This function does
1335 * not usually adjust the data pointer and must be followed by (typically) a
1336 * hammer2_chain_modify() call to copy any old data over and adjust the
1337 * data pointer.
1338 *
1339 * Chains can be resized smaller without reallocating the storage. Resizing
1340 * larger will reallocate the storage. Excess or prior storage is reclaimed
1341 * asynchronously at a later time.
1342 *
1343 * An nradix value of 0 is special-cased to mean that the storage should
1344 * be disassociated, that is the chain is being resized to 0 bytes (not 1
1345 * byte).
1346 *
1347 * Must be passed an exclusively locked parent and chain.
1348 *
1349 * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
1350 * to avoid instantiating a device buffer that conflicts with the vnode data
1351 * buffer. However, because H2 can compress or encrypt data, the chain may
1352 * have a dio assigned to it in those situations, and they do not conflict.
1353 *
1354 * XXX return error if cannot resize.
1355 */
int
hammer2_chain_resize(hammer2_chain_t *chain,
		     hammer2_tid_t mtid, hammer2_off_t dedup_off,
		     int nradix, int flags)
{
	hammer2_dev_t *hmp;
	size_t obytes;
	size_t nbytes;
	int error;

	hmp = chain->hmp;

	/*
	 * Only data and indirect blocks can be resized for now.
	 * (The volu root, inodes, and freemap elements use a fixed size).
	 */
	KKASSERT(chain != &hmp->vchain);
	KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
		 chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
		 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT);

	/*
	 * Nothing to do if the element is already the proper size
	 */
	obytes = chain->bytes;
	nbytes = (nradix) ? (1U << nradix) : 0;	/* nradix 0 means 0 bytes */
	if (obytes == nbytes)
		return (chain->error);

	/*
	 * Make sure the old data is instantiated so we can copy it.  If this
	 * is a data block, the device data may be superfluous since the data
	 * might be in a logical block, but compressed or encrypted data is
	 * another matter.
	 *
	 * NOTE: The modify will set BLKMAPUPD for us if BLKMAPPED is set.
	 */
	error = hammer2_chain_modify(chain, mtid, dedup_off, 0);
	if (error)
		return error;

	/*
	 * Reallocate the block, even if making it smaller (because different
	 * block sizes may be in different regions).
	 *
	 * NOTE: Operation does not copy the data and may only be used
	 *	 to resize data blocks in-place, or directory entry blocks
	 *	 which are about to be modified in some manner.
	 */
	error = hammer2_freemap_alloc(chain, nbytes);
	if (error)
		return error;

	/* new allocation succeeded, commit the new size */
	chain->bytes = nbytes;

	/*
	 * We don't want the followup chain_modify() to try to copy data
	 * from the old (wrong-sized) buffer.  It won't know how much to
	 * copy.  This case should only occur during writes when the
	 * originator already has the data to write in-hand.
	 */
	if (chain->dio) {
		KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
			 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT);
		hammer2_io_brelse(&chain->dio);
		chain->data = NULL;
	}
	return (chain->error);
}
1425
1426 /*
1427 * Set the chain modified so its data can be changed by the caller, or
1428 * install deduplicated data. The caller must call this routine for each
1429 * set of modifications it makes, even if the chain is already flagged
1430 * MODIFIED.
1431 *
1432 * Sets bref.modify_tid to mtid only if mtid != 0. Note that bref.modify_tid
1433 * is a CLC (cluster level change) field and is not updated by parent
1434 * propagation during a flush.
1435 *
1436 * Returns an appropriate HAMMER2_ERROR_* code, which will generally reflect
1437 * chain->error except for HAMMER2_ERROR_ENOSPC. If the allocation fails
1438 * due to no space available, HAMMER2_ERROR_ENOSPC is returned and the chain
1439 * remains unmodified with its old data ref intact and chain->error
1440 * unchanged.
1441 *
1442 * Dedup Handling
1443 *
1444 * If the DEDUPABLE flag is set in the chain the storage must be reallocated
1445 * even if the chain is still flagged MODIFIED. In this case the chain's
1446 * DEDUPABLE flag will be cleared once the new storage has been assigned.
1447 *
1448 * If the caller passes a non-zero dedup_off we will use it to assign the
1449 * new storage. The MODIFIED flag will be *CLEARED* in this case, and
1450 * DEDUPABLE will be set (NOTE: the UPDATE flag is always set). The caller
1451 * must not modify the data content upon return.
1452 */
int
hammer2_chain_modify(hammer2_chain_t *chain, hammer2_tid_t mtid,
		     hammer2_off_t dedup_off, int flags)
{
	hammer2_dev_t *hmp;
	hammer2_io_t *dio;
	int error;
	int wasinitial;
	int setmodified;	/* we set MODIFIED here; undo on error */
	int setupdate;		/* we set UPDATE here; undo on error */
	int newmod;		/* non-zero if a new block must be allocated */
	char *bdata;

	hmp = chain->hmp;
	KKASSERT(chain->lock.mtx_lock & MTX_EXCLUSIVE);

	/*
	 * Data is not optional for freemap chains (we must always be sure
	 * to copy the data on COW storage allocations).
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) ||
			 (flags & HAMMER2_MODIFY_OPTDATA) == 0);
	}

	/*
	 * Data must be resolved if already assigned, unless explicitly
	 * flagged otherwise.  If we cannot safety load the data the
	 * modification fails and we return early.
	 */
	if (chain->data == NULL && chain->bytes != 0 &&
	    (flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
	    (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) {
		hammer2_chain_load_data(chain);
		if (chain->error)
			return (chain->error);
	}
	error = 0;

	/*
	 * Set MODIFIED to indicate that the chain has been modified.  A new
	 * allocation is required when modifying a chain.
	 *
	 * Set UPDATE to ensure that the blockref is updated in the parent.
	 *
	 * If MODIFIED is already set determine if we can reuse the assigned
	 * data block or if we need a new data block.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
		/*
		 * Must set modified bit.
		 */
		atomic_add_long(&hammer2_count_modified_chains, 1);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
		hammer2_pfs_memory_inc(chain->pmp);	/* can be NULL */
		setmodified = 1;

		/*
		 * We may be able to avoid a copy-on-write if the chain's
		 * check mode is set to NONE and the chain's current
		 * modify_tid is beyond the last explicit snapshot tid.
		 *
		 * This implements HAMMER2's overwrite-in-place feature.
		 *
		 * NOTE! This data-block cannot be used as a de-duplication
		 *	 source when the check mode is set to NONE.
		 */
		if ((chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
		     chain->bref.type == HAMMER2_BREF_TYPE_DIRENT) &&
		    (chain->flags & HAMMER2_CHAIN_INITIAL) == 0 &&
		    (chain->flags & HAMMER2_CHAIN_DEDUPABLE) == 0 &&
		    HAMMER2_DEC_CHECK(chain->bref.methods) ==
		     HAMMER2_CHECK_NONE &&
		    chain->pmp &&
		    chain->bref.modify_tid >
		     chain->pmp->iroot->meta.pfs_lsnap_tid) {
			/*
			 * Sector overwrite allowed.
			 */
			newmod = 0;
		} else if ((hmp->hflags & HMNT2_EMERG) &&
			   chain->pmp &&
			   chain->bref.modify_tid >
			    chain->pmp->iroot->meta.pfs_lsnap_tid) {
			/*
			 * If in emergency delete mode then do a modify-in-
			 * place on any chain type belonging to the PFS as
			 * long as it doesn't mess up a snapshot.  We might
			 * be forced to do this anyway a little further down
			 * in the code if the allocation fails.
			 *
			 * Also note that in emergency mode, these modify-in-
			 * place operations are NOT SAFE.  A storage failure,
			 * power failure, or panic can corrupt the filesystem.
			 */
			newmod = 0;
		} else {
			/*
			 * Sector overwrite not allowed, must copy-on-write.
			 */
			newmod = 1;
		}
	} else if (chain->flags & HAMMER2_CHAIN_DEDUPABLE) {
		/*
		 * If the modified chain was registered for dedup we need
		 * a new allocation.  This only happens for delayed-flush
		 * chains (i.e. which run through the front-end buffer
		 * cache).
		 */
		newmod = 1;
		setmodified = 0;
	} else {
		/*
		 * Already flagged modified, no new allocation is needed.
		 */
		newmod = 0;
		setmodified = 0;
	}

	/*
	 * Flag parent update required.
	 */
	if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0) {
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		setupdate = 1;
	} else {
		setupdate = 0;
	}

	/*
	 * The XOP code returns held but unlocked focus chains.  This
	 * prevents the chain from being destroyed but does not prevent
	 * it from being modified.  diolk is used to interlock modifications
	 * against XOP frontend accesses to the focus.
	 *
	 * This allows us to theoretically avoid deadlocking the frontend
	 * if one of the backends lock up by not formally locking the
	 * focused chain in the frontend.  In addition, the synchronization
	 * code relies on this mechanism to avoid deadlocking concurrent
	 * synchronization threads.
	 */
	lockmgr(&chain->diolk, LK_EXCLUSIVE);

	/*
	 * The modification or re-modification requires an allocation and
	 * possible COW.  If an error occurs, the previous content and data
	 * reference is retained and the modification fails.
	 *
	 * If dedup_off is non-zero, the caller is requesting a deduplication
	 * rather than a modification.  The MODIFIED bit is not set and the
	 * data offset is set to the deduplication offset.  The data cannot
	 * be modified.
	 *
	 * NOTE: The dedup offset is allowed to be in a partially free state
	 *	 and we must be sure to reset it to a fully allocated state
	 *	 to force two bulkfree passes to free it again.
	 *
	 * NOTE: Only applicable when chain->bytes != 0.
	 *
	 * XXX can a chain already be marked MODIFIED without a data
	 * assignment?  If not, assert here instead of testing the case.
	 */
	if (chain != &hmp->vchain && chain != &hmp->fchain &&
	    chain->bytes) {
		if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 ||
		     newmod
		) {
			/*
			 * NOTE: We do not have to remove the dedup
			 *	 registration because the area is still
			 *	 allocated and the underlying DIO will
			 *	 still be flushed.
			 */
			if (dedup_off) {
				/*
				 * Install the dedup block.  The low bits of
				 * dedup_off encode the size radix.
				 */
				chain->bref.data_off = dedup_off;
				if ((int)(dedup_off & HAMMER2_OFF_MASK_RADIX))
					chain->bytes = 1 <<
						(int)(dedup_off &
						HAMMER2_OFF_MASK_RADIX);
				else
					chain->bytes = 0;
				chain->error = 0;
				atomic_clear_int(&chain->flags,
						 HAMMER2_CHAIN_MODIFIED);
				atomic_add_long(&hammer2_count_modified_chains,
						-1);
				if (chain->pmp) {
					hammer2_pfs_memory_wakeup(
						chain->pmp, -1);
				}
				hammer2_freemap_adjust(hmp, &chain->bref,
						HAMMER2_FREEMAP_DORECOVER);
				atomic_set_int(&chain->flags,
						HAMMER2_CHAIN_DEDUPABLE);
			} else {
				error = hammer2_freemap_alloc(chain,
							      chain->bytes);
				atomic_clear_int(&chain->flags,
						HAMMER2_CHAIN_DEDUPABLE);

				/*
				 * If we are unable to allocate a new block
				 * but we are in emergency mode, issue a
				 * warning to the console and reuse the same
				 * block.
				 *
				 * We behave as if the allocation were
				 * successful.
				 *
				 * THIS IS IMPORTANT: These modifications
				 * are virtually guaranteed to corrupt any
				 * snapshots related to this filesystem.
				 */
				if (error && (hmp->hflags & HMNT2_EMERG)) {
					error = 0;
					chain->bref.flags |=
						HAMMER2_BREF_FLAG_EMERG_MIP;

					krateprintf(&krate_h2em,
					    "hammer2: Emergency Mode WARNING: "
					    "Operation will likely corrupt "
					    "related snapshot: "
					    "%016jx.%02x key=%016jx\n",
					    chain->bref.data_off,
					    chain->bref.type,
					    chain->bref.key);
				} else if (error == 0) {
					chain->bref.flags &=
						~HAMMER2_BREF_FLAG_EMERG_MIP;
				}
			}
		}
	}

	/*
	 * Stop here if error.  We have to undo any flag bits we might
	 * have set above.
	 */
	if (error) {
		if (setmodified) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
			atomic_add_long(&hammer2_count_modified_chains, -1);
			if (chain->pmp)
				hammer2_pfs_memory_wakeup(chain->pmp, -1);
		}
		if (setupdate) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		}
		lockmgr(&chain->diolk, LK_RELEASE);

		return error;
	}

	/*
	 * Update mirror_tid and modify_tid.  modify_tid is only updated
	 * if not passed as zero (during flushes, parent propagation passes
	 * the value 0).
	 *
	 * NOTE: chain->pmp could be the device spmp.
	 */
	chain->bref.mirror_tid = hmp->voldata.mirror_tid + 1;
	if (mtid)
		chain->bref.modify_tid = mtid;

	/*
	 * Set BLKMAPUPD to tell the flush code that an existing blockmap entry
	 * requires updating as well as to tell the delete code that the
	 * chain's blockref might not exactly match (in terms of physical size
	 * or block offset) the one in the parent's blocktable.  The base key
	 * of course will still match.
	 */
	if (chain->flags & HAMMER2_CHAIN_BLKMAPPED)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_BLKMAPUPD);

	/*
	 * Short-cut data block handling when the caller does not need an
	 * actual data reference to (aka OPTDATA), as long as the chain does
	 * not already have a data pointer to the data and no de-duplication
	 * occurred.
	 *
	 * This generally means that the modifications are being done via the
	 * logical buffer cache.
	 *
	 * NOTE: If deduplication occurred we have to run through the data
	 *	 stuff to clear INITIAL, and the caller will likely want to
	 *	 assign the check code anyway.  Leaving INITIAL set on a
	 *	 dedup can be deadly (it can cause the block to be zero'd!).
	 *
	 * This code also handles bytes == 0 (most dirents).
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_DATA &&
	    (flags & HAMMER2_MODIFY_OPTDATA) &&
	    chain->data == NULL) {
		if (dedup_off == 0) {
			KKASSERT(chain->dio == NULL);
			goto skip2;
		}
	}

	/*
	 * Clearing the INITIAL flag (for indirect blocks) indicates that
	 * we've processed the uninitialized storage allocation.
	 *
	 * If this flag is already clear we are likely in a copy-on-write
	 * situation but we have to be sure NOT to bzero the storage if
	 * no data is present.
	 *
	 * Clearing of NOTTESTED is allowed if the MODIFIED bit is set,
	 */
	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
		wasinitial = 1;
	} else {
		wasinitial = 0;
	}

	/*
	 * Instantiate data buffer and possibly execute COW operation
	 */
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		/*
		 * The data is embedded, no copy-on-write operation is
		 * needed.
		 */
		KKASSERT(chain->dio == NULL);
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		/*
		 * The data might be fully embedded.
		 */
		if (chain->bytes == 0) {
			KKASSERT(chain->dio == NULL);
			break;
		}
		/* fall through */
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		/*
		 * Perform the copy-on-write operation
		 *
		 * zero-fill or copy-on-write depending on whether
		 * chain->data exists or not and set the dirty state for
		 * the new buffer.  hammer2_io_new() will handle the
		 * zero-fill.
		 *
		 * If a dedup_off was supplied this is an existing block
		 * and no COW, copy, or further modification is required.
		 */
		KKASSERT(chain != &hmp->vchain && chain != &hmp->fchain);

		if (wasinitial && dedup_off == 0) {
			error = hammer2_io_new(hmp, chain->bref.type,
					       chain->bref.data_off,
					       chain->bytes, &dio);
		} else {
			error = hammer2_io_bread(hmp, chain->bref.type,
						 chain->bref.data_off,
						 chain->bytes, &dio);
		}
		hammer2_adjreadcounter(chain->bref.type, chain->bytes);

		/*
		 * If an I/O error occurs make sure callers cannot accidently
		 * modify the old buffer's contents and corrupt the filesystem.
		 *
		 * NOTE: hammer2_io_data() call issues bkvasync()
		 */
		if (error) {
			kprintf("hammer2_chain_modify: hmp=%p I/O error\n",
				hmp);
			chain->error = HAMMER2_ERROR_EIO;
			hammer2_io_brelse(&dio);
			hammer2_io_brelse(&chain->dio);
			chain->data = NULL;
			break;
		}
		chain->error = 0;
		bdata = hammer2_io_data(dio, chain->bref.data_off);

		if (chain->data) {
			/*
			 * COW (unless a dedup).
			 */
			KKASSERT(chain->dio != NULL);
			if (chain->data != (void *)bdata && dedup_off == 0) {
				bcopy(chain->data, bdata, chain->bytes);
			}
		} else if (wasinitial == 0 && dedup_off == 0) {
			/*
			 * We have a problem.  We were asked to COW but
			 * we don't have any data to COW with!
			 */
			panic("hammer2_chain_modify: having a COW %p\n",
			      chain);
		}

		/*
		 * Retire the old buffer, replace with the new.  Dirty or
		 * redirty the new buffer.
		 *
		 * WARNING! The system buffer cache may have already flushed
		 *	    the buffer, so we must be sure to [re]dirty it
		 *	    for further modification.
		 *
		 *	    If dedup_off was supplied, the caller is not
		 *	    expected to make any further modification to the
		 *	    buffer.
		 *
		 * WARNING! hammer2_get_gdata() assumes dio never transitions
		 *	    through NULL in order to optimize away unnecessary
		 *	    diolk operations.
		 */
		{
			hammer2_io_t *tio;

			if ((tio = chain->dio) != NULL)
				hammer2_io_bqrelse(&tio);
			chain->data = (void *)bdata;
			chain->dio = dio;
			if (dedup_off == 0)
				hammer2_io_setdirty(dio);
		}
		break;
	default:
		panic("hammer2_chain_modify: illegal non-embedded type %d",
		      chain->bref.type);
		break;

	}
skip2:
	/*
	 * setflush on parent indicating that the parent must recurse down
	 * to us.  Do not call on chain itself which might already have it
	 * set.
	 */
	if (chain->parent)
		hammer2_chain_setflush(chain->parent);
	lockmgr(&chain->diolk, LK_RELEASE);

	return (chain->error);
}
1900
1901 /*
1902 * Modify the chain associated with an inode.
1903 */
1904 int
hammer2_chain_modify_ip(hammer2_inode_t * ip,hammer2_chain_t * chain,hammer2_tid_t mtid,int flags)1905 hammer2_chain_modify_ip(hammer2_inode_t *ip, hammer2_chain_t *chain,
1906 hammer2_tid_t mtid, int flags)
1907 {
1908 int error;
1909
1910 hammer2_inode_modify(ip);
1911 error = hammer2_chain_modify(chain, mtid, 0, flags);
1912
1913 return error;
1914 }
1915
1916 /*
1917 * This function returns the chain at the nearest key within the specified
1918 * range. The returned chain will be referenced but not locked.
1919 *
1920 * This function will recurse through chain->rbtree as necessary and will
1921 * return a *key_nextp suitable for iteration. *key_nextp is only set if
1922 * the iteration value is less than the current value of *key_nextp.
1923 *
1924 * The caller should use (*key_nextp) to calculate the actual range of
1925 * the returned element, which will be (key_beg to *key_nextp - 1), because
1926 * there might be another element which is superior to the returned element
1927 * and overlaps it.
1928 *
1929 * (*key_nextp) can be passed as key_beg in an iteration only while non-NULL
1930 * chains continue to be returned. On EOF (*key_nextp) may overflow since
1931 * it will wind up being (key_end + 1).
1932 *
1933 * WARNING! Must be called with child's spinlock held. Spinlock remains
1934 * held through the operation.
1935 */
struct hammer2_chain_find_info {
	hammer2_chain_t *best;		/* best (nearest-key) match so far */
	hammer2_key_t key_beg;		/* search range start (inclusive) */
	hammer2_key_t key_end;		/* search range end (inclusive) */
	hammer2_key_t key_next;		/* in/out: next iteration key */
};
1942
1943 static int hammer2_chain_find_cmp(hammer2_chain_t *child, void *data);
1944 static int hammer2_chain_find_callback(hammer2_chain_t *child, void *data);
1945
1946 static
1947 hammer2_chain_t *
hammer2_chain_find(hammer2_chain_t * parent,hammer2_key_t * key_nextp,hammer2_key_t key_beg,hammer2_key_t key_end)1948 hammer2_chain_find(hammer2_chain_t *parent, hammer2_key_t *key_nextp,
1949 hammer2_key_t key_beg, hammer2_key_t key_end)
1950 {
1951 struct hammer2_chain_find_info info;
1952
1953 info.best = NULL;
1954 info.key_beg = key_beg;
1955 info.key_end = key_end;
1956 info.key_next = *key_nextp;
1957
1958 RB_SCAN(hammer2_chain_tree, &parent->core.rbtree,
1959 hammer2_chain_find_cmp, hammer2_chain_find_callback,
1960 &info);
1961 *key_nextp = info.key_next;
1962 #if 0
1963 kprintf("chain_find %p %016jx:%016jx next=%016jx\n",
1964 parent, key_beg, key_end, *key_nextp);
1965 #endif
1966
1967 return (info.best);
1968 }
1969
1970 static
1971 int
hammer2_chain_find_cmp(hammer2_chain_t * child,void * data)1972 hammer2_chain_find_cmp(hammer2_chain_t *child, void *data)
1973 {
1974 struct hammer2_chain_find_info *info = data;
1975 hammer2_key_t child_beg;
1976 hammer2_key_t child_end;
1977
1978 child_beg = child->bref.key;
1979 child_end = child_beg + ((hammer2_key_t)1 << child->bref.keybits) - 1;
1980
1981 if (child_end < info->key_beg)
1982 return(-1);
1983 if (child_beg > info->key_end)
1984 return(1);
1985 return(0);
1986 }
1987
/*
 * RB_SCAN callback for hammer2_chain_find().  Children passed in have
 * already been filtered by hammer2_chain_find_cmp() to overlap the search
 * range.  Track the chain with the lowest key in info->best and truncate
 * info->key_next so the caller's next iteration does not skip overlapping
 * elements.
 *
 * Always returns 0 so the scan visits every overlapping child.
 */
static
int
hammer2_chain_find_callback(hammer2_chain_t *child, void *data)
{
	struct hammer2_chain_find_info *info = data;
	hammer2_chain_t *best;
	hammer2_key_t child_end;

	if ((best = info->best) == NULL) {
		/*
		 * No previous best.  Assign best
		 */
		info->best = child;
	} else if (best->bref.key <= info->key_beg &&
		   child->bref.key <= info->key_beg) {
		/*
		 * Illegal overlap.
		 */
		KKASSERT(0);
		/*info->best = child;*/
	} else if (child->bref.key < best->bref.key) {
		/*
		 * Child has a nearer key and best is not flush with key_beg.
		 * Set best to child.  Truncate key_next to the old best key.
		 */
		info->best = child;
		if (info->key_next > best->bref.key || info->key_next == 0)
			info->key_next = best->bref.key;
	} else if (child->bref.key == best->bref.key) {
		/*
		 * If our current best is flush with the child then this
		 * is an illegal overlap.
		 *
		 * key_next will automatically be limited to the smaller of
		 * the two end-points.
		 */
		KKASSERT(0);
		info->best = child;
	} else {
		/*
		 * Keep the current best but truncate key_next to the child's
		 * base.
		 *
		 * key_next will also automatically be limited to the smaller
		 * of the two end-points (probably not necessary for this case
		 * but we do it anyway).
		 */
		if (info->key_next > child->bref.key || info->key_next == 0)
			info->key_next = child->bref.key;
	}

	/*
	 * Always truncate key_next based on child's end-of-range.
	 * (a child_end of 0 means the range wrapped past the top of the
	 * 64-bit key space and imposes no limit).
	 */
	child_end = child->bref.key + ((hammer2_key_t)1 << child->bref.keybits);
	if (child_end && (info->key_next > child_end || info->key_next == 0))
		info->key_next = child_end;

	return(0);
}
2048
2049 /*
2050 * Retrieve the specified chain from a media blockref, creating the
2051 * in-memory chain structure which reflects it. The returned chain is
2052 * held and locked according to (how) (HAMMER2_RESOLVE_*). The caller must
2053 * handle crc-checks and so forth, and should check chain->error before
2054 * assuming that the data is good.
2055 *
2056 * To handle insertion races pass the INSERT_RACE flag along with the
2057 * generation number of the core. NULL will be returned if the generation
2058 * number changes before we have a chance to insert the chain. Insert
2059 * races can occur because the parent might be held shared.
2060 *
2061 * Caller must hold the parent locked shared or exclusive since we may
2062 * need the parent's bref array to find our block.
2063 *
2064 * WARNING! chain->pmp is always set to NULL for any chain representing
2065 * part of the super-root topology.
2066 */
hammer2_chain_t *
hammer2_chain_get(hammer2_chain_t *parent, int generation,
		  hammer2_blockref_t *bref, int how)
{
	hammer2_dev_t *hmp = parent->hmp;
	hammer2_chain_t *chain;
	int error;

	/*
	 * Allocate a chain structure representing the existing media
	 * entry.  Resulting chain has one ref and is not locked.
	 *
	 * A PFS root is allocated with a NULL pmp rather than inheriting
	 * the parent's.
	 */
	if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
		chain = hammer2_chain_alloc(hmp, NULL, bref);
	else
		chain = hammer2_chain_alloc(hmp, parent->pmp, bref);
	/* ref'd chain returned */

	/*
	 * Flag that the chain is in the parent's blockmap so delete/flush
	 * knows what to do with it.
	 */
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_BLKMAPPED);

	/*
	 * chain must be locked to avoid unexpected ripouts
	 */
	hammer2_chain_lock(chain, how);

	/*
	 * Link the chain into its parent.  A spinlock is required to safely
	 * access the RBTREE, and it is possible to collide with another
	 * hammer2_chain_get() operation because the caller might only hold
	 * a shared lock on the parent.
	 *
	 * NOTE: Get races can occur quite often when we distribute
	 *	 asynchronous read-aheads across multiple threads.
	 */
	KKASSERT(parent->refs > 0);
	error = hammer2_chain_insert(parent, chain,
				     HAMMER2_CHAIN_INSERT_SPIN |
				     HAMMER2_CHAIN_INSERT_RACE,
				     generation);
	if (error) {
		/*
		 * Lost the insertion race (the parent's generation
		 * changed); dispose of the chain so the caller can retry.
		 */
		KKASSERT((chain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
		/*kprintf("chain %p get race\n", chain);*/
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;
	} else {
		KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
	}

	/*
	 * Return our new chain with one ref, locked per (how), or NULL if
	 * a race occurred.
	 */
	return (chain);
}
2126
2127 /*
2128 * Lookup initialization/completion API
2129 */
2130 hammer2_chain_t *
hammer2_chain_lookup_init(hammer2_chain_t * parent,int flags)2131 hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags)
2132 {
2133 hammer2_chain_ref(parent);
2134 if (flags & HAMMER2_LOOKUP_SHARED) {
2135 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
2136 HAMMER2_RESOLVE_SHARED);
2137 } else {
2138 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
2139 }
2140 return (parent);
2141 }
2142
2143 void
hammer2_chain_lookup_done(hammer2_chain_t * parent)2144 hammer2_chain_lookup_done(hammer2_chain_t *parent)
2145 {
2146 if (parent) {
2147 hammer2_chain_unlock(parent);
2148 hammer2_chain_drop(parent);
2149 }
2150 }
2151
2152 /*
2153 * Take the locked chain and return a locked parent. The chain remains
2154 * locked on return, but may have to be temporarily unlocked to acquire
2155 * the parent. Because of this, (chain) must be stable and cannot be
2156 * deleted while it was temporarily unlocked (typically means that (chain)
2157 * is an inode).
2158 *
2159 * Pass HAMMER2_RESOLVE_* flags in flags.
2160 *
2161 * This will work even if the chain is errored, and the caller can check
2162 * parent->error on return if desired since the parent will be locked.
2163 *
2164 * This function handles the lock order reversal.
2165 */
hammer2_chain_t *
hammer2_chain_getparent(hammer2_chain_t *chain, int flags)
{
	hammer2_chain_t *parent;

	/*
	 * Be careful of order, chain must be unlocked before parent
	 * is locked below to avoid a deadlock.  Try it trivially first:
	 * a non-blocking lock attempt lets us avoid releasing chain's
	 * lock at all.
	 */
	parent = chain->parent;
	if (parent == NULL)
		panic("hammer2_chain_getparent: no parent");
	hammer2_chain_ref(parent);
	if (hammer2_chain_lock(parent, flags|HAMMER2_RESOLVE_NONBLOCK) == 0)
		return parent;

	for (;;) {
		/*
		 * Lock-order reversal: release chain, lock parent, re-lock
		 * chain.  The caller guarantees (chain) is stable and
		 * cannot be deleted while temporarily unlocked.
		 */
		hammer2_chain_unlock(chain);
		hammer2_chain_lock(parent, flags);
		hammer2_chain_lock(chain, flags);

		/*
		 * Parent relinking races are quite common.  We have to get
		 * it right or we will blow up the block table.
		 */
		if (chain->parent == parent)
			break;
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
		cpu_ccfence();
		parent = chain->parent;
		if (parent == NULL)
			panic("hammer2_chain_getparent: no parent");
		hammer2_chain_ref(parent);
	}
	return parent;
}
2203
2204 /*
2205 * Take the locked chain and return a locked parent. The chain is unlocked
2206 * and dropped. *chainp is set to the returned parent as a convenience.
2207 * Pass HAMMER2_RESOLVE_* flags in flags.
2208 *
2209 * This will work even if the chain is errored, and the caller can check
2210 * parent->error on return if desired since the parent will be locked.
2211 *
2212 * The chain does NOT need to be stable. We use a tracking structure
2213 * to track the expected parent if the chain is deleted out from under us.
2214 *
2215 * This function handles the lock order reversal.
2216 */
hammer2_chain_t *
hammer2_chain_repparent(hammer2_chain_t **chainp, int flags)
{
	hammer2_chain_t *chain;
	hammer2_chain_t *parent;
	struct hammer2_reptrack reptrack;
	struct hammer2_reptrack **repp;

	/*
	 * Be careful of order, chain must be unlocked before parent
	 * is locked below to avoid a deadlock.  Try it trivially first.
	 */
	chain = *chainp;
	parent = chain->parent;
	if (parent == NULL) {
		/*
		 * NOTE(review): releases chain's core spinlock before
		 * panicing -- presumably a defensive unwind; confirm
		 * whether any caller can reach here with the spinlock
		 * held.
		 */
		hammer2_spin_unex(&chain->core.spin);
		panic("hammer2_chain_repparent: no parent");
	}
	hammer2_chain_ref(parent);
	if (hammer2_chain_lock(parent, flags|HAMMER2_RESOLVE_NONBLOCK) == 0) {
		/*
		 * Non-blocking attempt succeeded, no lock-order reversal
		 * needed.  Replace *chainp with the locked+ref'd parent.
		 */
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		*chainp = parent;

		return parent;
	}

	/*
	 * Ok, now it gets a bit nasty.  There are multiple situations where
	 * the parent might be in the middle of a deletion, or where the child
	 * (chain) might be deleted the instant we let go of its lock.
	 * We can potentially end up in a no-win situation!
	 *
	 * In particular, the indirect_maintenance() case can cause these
	 * situations.
	 *
	 * To deal with this we install a reptrack structure in the parent
	 * This reptrack structure 'owns' the parent ref and will automatically
	 * migrate to the parent's parent if the parent is deleted permanently.
	 */
	hammer2_spin_init(&reptrack.spin, "h2reptrk");
	reptrack.chain = parent;
	hammer2_chain_ref(parent);		/* for the reptrack */

	hammer2_spin_ex(&parent->core.spin);
	reptrack.next = parent->core.reptrack;
	parent->core.reptrack = &reptrack;
	hammer2_spin_unex(&parent->core.spin);

	hammer2_chain_unlock(chain);
	hammer2_chain_drop(chain);
	chain = NULL;	/* gone */

	/*
	 * At the top of this loop, chain is gone and parent is refd both
	 * by us explicitly AND via our reptrack.  We are attempting to
	 * lock parent.
	 */
	for (;;) {
		hammer2_chain_lock(parent, flags);

		if (reptrack.chain == parent)
			break;
		/*
		 * The reptrack migrated while we waited for the lock;
		 * release the stale parent and chase the new one.
		 */
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);

		kprintf("hammer2: debug REPTRACK %p->%p\n",
			parent, reptrack.chain);
		hammer2_spin_ex(&reptrack.spin);
		parent = reptrack.chain;
		hammer2_chain_ref(parent);
		hammer2_spin_unex(&reptrack.spin);
	}

	/*
	 * Once parent is locked and matches our reptrack, our reptrack
	 * will be stable and we have our parent.  We can unlink our
	 * reptrack.
	 *
	 * WARNING!  Remember that the chain lock might be shared.  Chains
	 *	     locked shared have stable parent linkages.
	 */
	hammer2_spin_ex(&parent->core.spin);
	repp = &parent->core.reptrack;
	while (*repp != &reptrack)
		repp = &(*repp)->next;
	*repp = reptrack.next;
	hammer2_spin_unex(&parent->core.spin);

	hammer2_chain_drop(parent);	/* reptrack ref */
	*chainp = parent;		/* return parent lock+ref */

	return parent;
}
2311
2312 /*
2313 * Dispose of any linked reptrack structures in (chain) by shifting them to
2314 * (parent). Both (chain) and (parent) must be exclusively locked.
2315 *
2316 * This is interlocked against any children of (chain) on the other side.
 * No children may remain as of when this is called, so we can test
 * core.reptrack without holding the spin-lock.
2319 *
2320 * Used whenever the caller intends to permanently delete chains related
2321 * to topological recursions (BREF_TYPE_INDIRECT, BREF_TYPE_FREEMAP_NODE),
2322 * where the chains underneath the node being deleted are given a new parent
2323 * above the node being deleted.
2324 */
2325 static
2326 void
hammer2_chain_repchange(hammer2_chain_t * parent,hammer2_chain_t * chain)2327 hammer2_chain_repchange(hammer2_chain_t *parent, hammer2_chain_t *chain)
2328 {
2329 struct hammer2_reptrack *reptrack;
2330
2331 KKASSERT(chain->core.live_count == 0 && RB_EMPTY(&chain->core.rbtree));
2332 while (chain->core.reptrack) {
2333 hammer2_spin_ex(&parent->core.spin);
2334 hammer2_spin_ex(&chain->core.spin);
2335 reptrack = chain->core.reptrack;
2336 if (reptrack == NULL) {
2337 hammer2_spin_unex(&chain->core.spin);
2338 hammer2_spin_unex(&parent->core.spin);
2339 break;
2340 }
2341 hammer2_spin_ex(&reptrack->spin);
2342 chain->core.reptrack = reptrack->next;
2343 reptrack->chain = parent;
2344 reptrack->next = parent->core.reptrack;
2345 parent->core.reptrack = reptrack;
2346 hammer2_chain_ref(parent); /* reptrack */
2347
2348 hammer2_spin_unex(&chain->core.spin);
2349 hammer2_spin_unex(&parent->core.spin);
2350 kprintf("hammer2: debug repchange %p %p->%p\n",
2351 reptrack, chain, parent);
2352 hammer2_chain_drop(chain); /* reptrack */
2353 }
2354 }
2355
2356 /*
 * Locate the first chain whose key range overlaps (key_beg, key_end) inclusive.
2358 * (*parentp) typically points to an inode but can also point to a related
2359 * indirect block and this function will recurse upwards and find the inode
2360 * or the nearest undeleted indirect block covering the key range.
2361 *
2362 * This function unconditionally sets *errorp, replacing any previous value.
2363 *
2364 * (*parentp) must be exclusive or shared locked (depending on flags) and
2365 * referenced and can be an inode or an existing indirect block within the
2366 * inode.
2367 *
2368 * If (*parent) is errored out, this function will not attempt to recurse
2369 * the radix tree and will return NULL along with an appropriate *errorp.
2370 * If NULL is returned and *errorp is 0, the requested lookup could not be
2371 * located.
2372 *
2373 * On return (*parentp) will be modified to point at the deepest parent chain
2374 * element encountered during the search, as a helper for an insertion or
2375 * deletion.
2376 *
2377 * The new (*parentp) will be locked shared or exclusive (depending on flags),
2378 * and referenced, and the old will be unlocked and dereferenced (no change
2379 * if they are both the same). This is particularly important if the caller
2380 * wishes to insert a new chain, (*parentp) will be set properly even if NULL
2381 * is returned, as long as no error occurred.
2382 *
2383 * The matching chain will be returned locked according to flags.
2384 *
2385 * --
2386 *
2387 * NULL is returned if no match was found, but (*parentp) will still
2388 * potentially be adjusted.
2389 *
2390 * On return (*key_nextp) will point to an iterative value for key_beg.
2391 * (If NULL is returned (*key_nextp) is set to (key_end + 1)).
2392 *
2393 * This function will also recurse up the chain if the key is not within the
2394 * current parent's range. (*parentp) can never be set to NULL. An iteration
2395 * can simply allow (*parentp) to float inside the loop.
2396 *
2397 * NOTE! chain->data is not always resolved. By default it will not be
2398 * resolved for BREF_TYPE_DATA, FREEMAP_NODE, or FREEMAP_LEAF. Use
2399 * HAMMER2_LOOKUP_ALWAYS to force resolution (but be careful w/
2400 * BREF_TYPE_DATA as the device buffer can alias the logical file
2401 * buffer).
2402 */
hammer2_chain_t *
hammer2_chain_lookup(hammer2_chain_t **parentp, hammer2_key_t *key_nextp,
		     hammer2_key_t key_beg, hammer2_key_t key_end,
		     int *errorp, int flags)
{
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bsave;	/* stable copy of the selected bref */
	hammer2_key_t scan_beg;
	hammer2_key_t scan_end;
	int count = 0;
	int how_always = HAMMER2_RESOLVE_ALWAYS;
	int how_maybe = HAMMER2_RESOLVE_MAYBE;
	int how;
	int generation;
	int maxloops = 300000;		/* sanity limit on retry loops */

	/*
	 * Translate LOOKUP flags into chain lock modes.  'how' is used for
	 * leaf chains, 'how_maybe' for topology (indirect/freemap node)
	 * chains, 'how_always' for the embedded-data inode shortcut.
	 */
	if (flags & HAMMER2_LOOKUP_ALWAYS) {
		how_maybe = how_always;
		how = HAMMER2_RESOLVE_ALWAYS;
	} else if (flags & HAMMER2_LOOKUP_NODATA) {
		how = HAMMER2_RESOLVE_NEVER;
	} else {
		how = HAMMER2_RESOLVE_MAYBE;
	}
	if (flags & HAMMER2_LOOKUP_SHARED) {
		how_maybe |= HAMMER2_RESOLVE_SHARED;
		how_always |= HAMMER2_RESOLVE_SHARED;
		how |= HAMMER2_RESOLVE_SHARED;
	}

	/*
	 * Recurse (*parentp) upward if necessary until the parent completely
	 * encloses the key range or we hit the inode.
	 *
	 * Handle races against the flusher deleting indirect nodes on its
	 * way back up by continuing to recurse upward past the deletion.
	 */
	parent = *parentp;
	*errorp = 0;

	while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	       parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		scan_beg = parent->bref.key;
		scan_end = scan_beg +
			   ((hammer2_key_t)1 << parent->bref.keybits) - 1;
		if ((parent->flags & HAMMER2_CHAIN_DELETED) == 0) {
			if (key_beg >= scan_beg && key_end <= scan_end)
				break;
		}
		parent = hammer2_chain_repparent(parentp, how_maybe);
	}
again:
	if (--maxloops == 0)
		panic("hammer2_chain_lookup: maxloops");

	/*
	 * MATCHIND case that does not require parent->data (do prior to
	 * parent->error check).
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		if (flags & HAMMER2_LOOKUP_MATCHIND) {
			scan_beg = parent->bref.key;
			scan_end = scan_beg +
			       ((hammer2_key_t)1 << parent->bref.keybits) - 1;
			if (key_beg == scan_beg && key_end == scan_end) {
				/*
				 * The parent itself matches exactly;
				 * ref+lock it a second time and return it.
				 */
				chain = parent;
				hammer2_chain_ref(chain);
				hammer2_chain_lock(chain, how_maybe);
				*key_nextp = scan_end + 1;
				goto done;
			}
		}
		break;
	default:
		break;
	}

	/*
	 * No lookup is possible if the parent is errored.  We delayed
	 * this check as long as we could to ensure that the parent backup,
	 * embedded data, and MATCHIND code could still execute.
	 */
	if (parent->error) {
		*errorp = parent->error;
		return NULL;
	}

	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Special shortcut for embedded data returns the inode
		 * itself.  Callers must detect this condition and access
		 * the embedded data (the strategy code does this for us).
		 *
		 * This is only applicable to regular files and softlinks.
		 *
		 * We need a second lock on parent.  Since we already have
		 * a lock we must pass LOCKAGAIN to prevent unexpected
		 * blocking (we don't want to block on a second shared
		 * ref if an exclusive lock is pending)
		 */
		if (parent->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			if (flags & HAMMER2_LOOKUP_NODIRECT) {
				chain = NULL;
				*key_nextp = key_end + 1;
				goto done;
			}
			hammer2_chain_ref(parent);
			hammer2_chain_lock(parent, how_always |
						   HAMMER2_RESOLVE_LOCKAGAIN);
			*key_nextp = key_end + 1;
			return (parent);
		}
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 *
		 * Debugging: Enter permanent wait state instead of
		 * panicing on unexpectedly NULL data for the moment.
		 */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			if (parent->data == NULL) {
				kprintf("hammer2: unexpected NULL data "
					"on %p\n", parent);
				while (1)
					tsleep(parent, 0, "xxx", 0);
			}
			base = &parent->data->npdata[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &parent->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &parent->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_lookup: unrecognized "
		      "blockref(B) type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
		break;
	}

	/*
	 * Merged scan to find next candidate.
	 *
	 * hammer2_base_*() functions require the parent->core.live_* fields
	 * to be synchronized.
	 *
	 * We need to hold the spinlock to access the block array and RB tree
	 * and to interlock chain creation.
	 */
	if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
		hammer2_chain_countbrefs(parent, base, count);

	/*
	 * Combined search
	 */
	hammer2_spin_ex(&parent->core.spin);
	chain = hammer2_combined_find(parent, base, count,
				      key_nextp,
				      key_beg, key_end,
				      &bref);
	generation = parent->core.generation;

	/*
	 * Exhausted parent chain, iterate.
	 */
	if (bref == NULL) {
		KKASSERT(chain == NULL);
		hammer2_spin_unex(&parent->core.spin);
		if (key_beg == key_end)	/* short cut single-key case */
			return (NULL);

		/*
		 * Stop if we reached the end of the iteration.
		 */
		if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
		    parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
			return (NULL);
		}

		/*
		 * Calculate next key, stop if we reached the end of the
		 * iteration, otherwise go up one level and loop.
		 */
		key_beg = parent->bref.key +
			  ((hammer2_key_t)1 << parent->bref.keybits);
		if (key_beg == 0 || key_beg > key_end)
			return (NULL);
		parent = hammer2_chain_repparent(parentp, how_maybe);
		goto again;
	}

	/*
	 * Selected from blockref or in-memory chain.
	 *
	 * Copy the bref before releasing the spinlock so it can be
	 * revalidated against the locked chain below.
	 */
	bsave = *bref;
	if (chain == NULL) {
		hammer2_spin_unex(&parent->core.spin);
		if (bsave.type == HAMMER2_BREF_TYPE_INDIRECT ||
		    bsave.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
			chain = hammer2_chain_get(parent, generation,
						  &bsave, how_maybe);
		} else {
			chain = hammer2_chain_get(parent, generation,
						  &bsave, how);
		}
		if (chain == NULL)
			goto again;
	} else {
		hammer2_chain_ref(chain);
		hammer2_spin_unex(&parent->core.spin);

		/*
		 * chain is referenced but not locked.  We must lock the
		 * chain to obtain definitive state.
		 */
		if (bsave.type == HAMMER2_BREF_TYPE_INDIRECT ||
		    bsave.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
			hammer2_chain_lock(chain, how_maybe);
		} else {
			hammer2_chain_lock(chain, how);
		}
		KKASSERT(chain->parent == parent);
	}
	if (bcmp(&bsave, &chain->bref, sizeof(bsave)) ||
	    chain->parent != parent) {
		/*
		 * The chain was modified or reparented while we acquired
		 * its lock; discard it and retry.
		 */
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;	/* SAFETY */
		goto again;
	}

	/*
	 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
	 *
	 * NOTE: Chain's key range is not relevant as there might be
	 *	 one-offs within the range that are not deleted.
	 *
	 * NOTE: Lookups can race delete-duplicate because
	 *	 delete-duplicate does not lock the parent's core
	 *	 (they just use the spinlock on the core).
	 */
	if (chain->flags & HAMMER2_CHAIN_DELETED) {
		kprintf("skip deleted chain %016jx.%02x key=%016jx\n",
			chain->bref.data_off, chain->bref.type,
			chain->bref.key);
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;	/* SAFETY */
		key_beg = *key_nextp;
		if (key_beg == 0 || key_beg > key_end)
			return(NULL);
		goto again;
	}

	/*
	 * If the chain element is an indirect block it becomes the new
	 * parent and we loop on it.  We must maintain our top-down locks
	 * to prevent the flusher from interfering (i.e. doing a
	 * delete-duplicate and leaving us recursing down a deleted chain).
	 *
	 * The parent always has to be locked with at least RESOLVE_MAYBE
	 * so we can access its data.  It might need a fixup if the caller
	 * passed incompatible flags.  Be careful not to cause a deadlock
	 * as a data-load requires an exclusive lock.
	 *
	 * If HAMMER2_LOOKUP_MATCHIND is set and the indirect block's key
	 * range is within the requested key range we return the indirect
	 * block and do NOT loop.  This is usually only used to acquire
	 * freemap nodes.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
		*parentp = parent = chain;
		chain = NULL;	/* SAFETY */
		goto again;
	}
done:
	/*
	 * All done, return the locked chain.
	 *
	 * If the caller does not want a locked chain, replace the lock with
	 * a ref.  Perhaps this can eventually be optimized to not obtain the
	 * lock in the first place for situations where the data does not
	 * need to be resolved.
	 *
	 * NOTE! A chain->error must be tested by the caller upon return.
	 *	 *errorp is only set based on issues which occur while
	 *	 trying to reach the chain.
	 */
	return (chain);
}
2722
2723 /*
2724 * After having issued a lookup we can iterate all matching keys.
2725 *
 * If chain is non-NULL we continue the iteration from just after its index.
2727 *
2728 * If chain is NULL we assume the parent was exhausted and continue the
2729 * iteration at the next parent.
2730 *
2731 * If a fatal error occurs (typically an I/O error), a dummy chain is
2732 * returned with chain->error and error-identifying information set. This
2733 * chain will assert if you try to do anything fancy with it.
2734 *
2735 * XXX Depending on where the error occurs we should allow continued iteration.
2736 *
2737 * parent must be locked on entry and remains locked throughout. chain's
2738 * lock status must match flags. Chain is always at least referenced.
2739 *
2740 * WARNING! The MATCHIND flag does not apply to this function.
2741 */
hammer2_chain_t *
hammer2_chain_next(hammer2_chain_t **parentp, hammer2_chain_t *chain,
		   hammer2_key_t *key_nextp,
		   hammer2_key_t key_beg, hammer2_key_t key_end,
		   int *errorp, int flags)
{
	hammer2_chain_t *parent;
	int how_maybe;

	/*
	 * Calculate locking flags for upward recursion.
	 */
	how_maybe = HAMMER2_RESOLVE_MAYBE;
	if (flags & HAMMER2_LOOKUP_SHARED)
		how_maybe |= HAMMER2_RESOLVE_SHARED;

	parent = *parentp;
	*errorp = 0;

	/*
	 * Calculate the next index and recalculate the parent if necessary.
	 */
	if (chain) {
		/*
		 * Next key is one past the current chain's range; the
		 * addition may overflow to 0, which terminates iteration.
		 */
		key_beg = chain->bref.key +
			  ((hammer2_key_t)1 << chain->bref.keybits);
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);

		/*
		 * chain invalid past this point, but we can still do a
		 * pointer comparison w/parent.
		 *
		 * Any scan where the lookup returned degenerate data embedded
		 * in the inode has an invalid index and must terminate.
		 */
		if (chain == parent)
			return(NULL);
		if (key_beg == 0 || key_beg > key_end)
			return(NULL);
		chain = NULL;
	} else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
		   parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		/*
		 * We reached the end of the iteration.
		 */
		return (NULL);
	} else {
		/*
		 * Continue iteration with next parent unless the current
		 * parent covers the range.
		 *
		 * (This also handles the case of a deleted, empty indirect
		 * node).
		 */
		key_beg = parent->bref.key +
			  ((hammer2_key_t)1 << parent->bref.keybits);
		if (key_beg == 0 || key_beg > key_end)
			return (NULL);
		parent = hammer2_chain_repparent(parentp, how_maybe);
	}

	/*
	 * And execute
	 */
	return (hammer2_chain_lookup(parentp, key_nextp,
				     key_beg, key_end,
				     errorp, flags));
}
2810
2811 /*
2812 * Caller wishes to iterate chains under parent, loading new chains into
2813 * chainp. Caller must initialize *chainp to NULL and *firstp to 1, and
2814 * then call hammer2_chain_scan() repeatedly until a non-zero return.
2815 * During the scan, *firstp will be set to 0 and (*chainp) will be replaced
2816 * with the returned chain for the scan. The returned *chainp will be
2817 * locked and referenced. Any prior contents will be unlocked and dropped.
2818 *
2819 * Caller should check the return value. A normal scan EOF will return
2820 * exactly HAMMER2_ERROR_EOF. Any other non-zero value indicates an
2821 * error trying to access parent data. Any error in the returned chain
2822 * must be tested separately by the caller.
2823 *
2824 * (*chainp) is dropped on each scan, but will only be set if the returned
2825 * element itself can recurse. Leaf elements are NOT resolved, loaded, or
2826 * returned via *chainp. The caller will get their bref only.
2827 *
2828 * The raw scan function is similar to lookup/next but does not seek to a key.
2829 * Blockrefs are iterated via first_bref = (parent, NULL) and
2830 * next_chain = (parent, bref).
2831 *
2832 * The passed-in parent must be locked and its data resolved. The function
2833 * nominally returns a locked and referenced *chainp != NULL for chains
 * the caller might need to recurse on (and will dispose of any *chainp passed
2835 * in). The caller must check the chain->bref.type either way.
2836 */
int
hammer2_chain_scan(hammer2_chain_t *parent, hammer2_chain_t **chainp,
		   hammer2_blockref_t *bref, int *firstp,
		   int flags)
{
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref_ptr;
	hammer2_key_t key;
	hammer2_key_t next_key;
	hammer2_chain_t *chain = NULL;
	int count = 0;
	int how;
	int generation;
	int maxloops = 300000;
	int error;

	error = 0;

	/*
	 * Scan flags borrowed from lookup.  These control how (and whether)
	 * a chain's media data is resolved when it is locked below.
	 */
	if (flags & HAMMER2_LOOKUP_ALWAYS) {
		how = HAMMER2_RESOLVE_ALWAYS;
	} else if (flags & HAMMER2_LOOKUP_NODATA) {
		how = HAMMER2_RESOLVE_NEVER;
	} else {
		how = HAMMER2_RESOLVE_MAYBE;
	}
	if (flags & HAMMER2_LOOKUP_SHARED) {
		how |= HAMMER2_RESOLVE_SHARED;
	}

	/*
	 * Calculate key to locate first/next element, unlocking the previous
	 * element as we go.  Be careful, the key calculation can overflow.
	 *
	 * On the first call (*firstp != 0) we start at key 0; on subsequent
	 * calls we resume just past the keyspace of the bref returned last
	 * time.  A wrap to 0 means the 64-bit keyspace is exhausted.
	 *
	 * (also reset bref to NULL)
	 */
	if (*firstp) {
		key = 0;
		*firstp = 0;
	} else {
		key = bref->key + ((hammer2_key_t)1 << bref->keybits);
		if ((chain = *chainp) != NULL) {
			*chainp = NULL;
			hammer2_chain_unlock(chain);
			hammer2_chain_drop(chain);
			chain = NULL;
		}
		if (key == 0) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
	}

again:
	if (parent->error) {
		error = parent->error;
		goto done;
	}
	if (--maxloops == 0)
		panic("hammer2_chain_scan: maxloops");

	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * An inode with embedded data has no sub-chains.
		 *
		 * WARNING! Bulk scan code may pass a static chain marked
		 *	    as BREF_TYPE_INODE with a copy of the volume
		 *	    root blockset to snapshot the volume.
		 */
		if (parent->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.  An INITIAL indirect block has no media data yet,
		 * so only the in-memory RB-tree can contribute chains.
		 */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			if (parent->data == NULL)
				panic("parent->data is NULL");
			base = &parent->data->npdata[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &parent->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &parent->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_scan: unrecognized blockref type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
		break;
	}

	/*
	 * Merged scan to find next candidate.
	 *
	 * hammer2_base_*() functions require the parent->core.live_* fields
	 * to be synchronized.
	 *
	 * We need to hold the spinlock to access the block array and RB tree
	 * and to interlock chain creation.
	 */
	if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
		hammer2_chain_countbrefs(parent, base, count);

	next_key = 0;
	bref_ptr = NULL;
	hammer2_spin_ex(&parent->core.spin);
	chain = hammer2_combined_find(parent, base, count,
				      &next_key,
				      key, HAMMER2_KEY_MAX,
				      &bref_ptr);
	/*
	 * Snapshot the parent's generation while still holding the spinlock;
	 * it is handed to hammer2_chain_get() below so (presumably) racing
	 * modifications of the parent can be detected after we unlock.
	 */
	generation = parent->core.generation;

	/*
	 * Exhausted parent chain, we're done.
	 */
	if (bref_ptr == NULL) {
		hammer2_spin_unex(&parent->core.spin);
		KKASSERT(chain == NULL);
		error |= HAMMER2_ERROR_EOF;
		goto done;
	}

	/*
	 * Copy into the supplied stack-based blockref.
	 */
	*bref = *bref_ptr;

	/*
	 * Selected from blockref or in-memory chain.
	 */
	if (chain == NULL) {
		switch(bref->type) {
		case HAMMER2_BREF_TYPE_INODE:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_VOLUME:
		case HAMMER2_BREF_TYPE_FREEMAP:
			/*
			 * Recursion, always get the chain
			 */
			hammer2_spin_unex(&parent->core.spin);
			chain = hammer2_chain_get(parent, generation,
						  bref, how);
			if (chain == NULL)
				goto again;
			break;
		default:
			/*
			 * No recursion, do not waste time instantiating
			 * a chain, just iterate using the bref.
			 */
			hammer2_spin_unex(&parent->core.spin);
			break;
		}
	} else {
		/*
		 * Recursion or not we need the chain in order to supply
		 * the bref.
		 */
		hammer2_chain_ref(chain);
		hammer2_spin_unex(&parent->core.spin);
		hammer2_chain_lock(chain, how);
	}

	/*
	 * If the chain raced a modification (its bref no longer matches the
	 * copy we took, or it was reparented) discard it and retry the scan.
	 */
	if (chain &&
	    (bcmp(bref, &chain->bref, sizeof(*bref)) ||
	     chain->parent != parent)) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;
		goto again;
	}

	/*
	 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
	 *
	 * NOTE: chain's key range is not relevant as there might be
	 *	 one-offs within the range that are not deleted.
	 *
	 * NOTE: XXX this could create problems with scans used in
	 *	 situations other than mount-time recovery.
	 *
	 * NOTE: Lookups can race delete-duplicate because
	 *	 delete-duplicate does not lock the parent's core
	 *	 (they just use the spinlock on the core).
	 */
	if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;

		key = next_key;
		if (key == 0) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
		goto again;
	}

done:
	/*
	 * All done, return the bref or NULL, supply chain if necessary.
	 */
	if (chain)
		*chainp = chain;
	return (error);
}
3067
3068 /*
3069 * Create and return a new hammer2 system memory structure of the specified
3070 * key, type and size and insert it under (*parentp). This is a full
3071 * insertion, based on the supplied key/keybits, and may involve creating
3072 * indirect blocks and moving other chains around via delete/duplicate.
3073 *
 * This call can be made with parent == NULL as long as a non -1 methods
 * value is supplied.  hmp must also be supplied in this situation (otherwise
3076 * hmp is extracted from the supplied parent). The chain will be detached
3077 * from the topology. A later call with both parent and chain can be made
3078 * to attach it.
3079 *
3080 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (*parentp) TO THE INSERTION
3081 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING
3082 * FULL. This typically means that the caller is creating the chain after
3083 * doing a hammer2_chain_lookup().
3084 *
3085 * (*parentp) must be exclusive locked and may be replaced on return
3086 * depending on how much work the function had to do.
3087 *
3088 * (*parentp) must not be errored or this function will assert.
3089 *
3090 * (*chainp) usually starts out NULL and returns the newly created chain,
3091 * but if the caller desires the caller may allocate a disconnected chain
3092 * and pass it in instead.
3093 *
3094 * This function should NOT be used to insert INDIRECT blocks. It is
3095 * typically used to create/insert inodes and data blocks.
3096 *
3097 * Caller must pass-in an exclusively locked parent the new chain is to
3098 * be inserted under, and optionally pass-in a disconnected, exclusively
3099 * locked chain to insert (else we create a new chain). The function will
3100 * adjust (*parentp) as necessary, create or connect the chain, and
3101 * return an exclusively locked chain in *chainp.
3102 *
3103 * When creating a PFSROOT inode under the super-root, pmp is typically NULL
3104 * and will be reassigned.
3105 *
 * NOTE: returns HAMMER2_ERROR_* flags
3107 */
3108 int
hammer2_chain_create(hammer2_chain_t ** parentp,hammer2_chain_t ** chainp,hammer2_dev_t * hmp,hammer2_pfs_t * pmp,int methods,hammer2_key_t key,int keybits,int type,size_t bytes,hammer2_tid_t mtid,hammer2_off_t dedup_off,int flags)3109 hammer2_chain_create(hammer2_chain_t **parentp, hammer2_chain_t **chainp,
3110 hammer2_dev_t *hmp, hammer2_pfs_t *pmp, int methods,
3111 hammer2_key_t key, int keybits, int type, size_t bytes,
3112 hammer2_tid_t mtid, hammer2_off_t dedup_off, int flags)
3113 {
3114 hammer2_chain_t *chain;
3115 hammer2_chain_t *parent;
3116 hammer2_blockref_t *base;
3117 hammer2_blockref_t dummy;
3118 int allocated = 0;
3119 int error = 0;
3120 int count;
3121 int maxloops = 300000;
3122
3123 /*
3124 * Topology may be crossing a PFS boundary.
3125 */
3126 parent = *parentp;
3127 if (parent) {
3128 KKASSERT(hammer2_mtx_owned(&parent->lock));
3129 KKASSERT(parent->error == 0);
3130 hmp = parent->hmp;
3131 }
3132 chain = *chainp;
3133
3134 if (chain == NULL) {
3135 /*
3136 * First allocate media space and construct the dummy bref,
3137 * then allocate the in-memory chain structure. Set the
3138 * INITIAL flag for fresh chains which do not have embedded
3139 * data.
3140 */
3141 bzero(&dummy, sizeof(dummy));
3142 dummy.type = type;
3143 dummy.key = key;
3144 dummy.keybits = keybits;
3145 dummy.data_off = hammer2_getradix(bytes);
3146
3147 /*
3148 * Inherit methods from parent by default. Primarily used
3149 * for BREF_TYPE_DATA. Non-data types *must* be set to
3150 * a non-NONE check algorithm.
3151 */
3152 if (methods == HAMMER2_METH_DEFAULT)
3153 dummy.methods = parent->bref.methods;
3154 else
3155 dummy.methods = (uint8_t)methods;
3156
3157 if (type != HAMMER2_BREF_TYPE_DATA &&
3158 HAMMER2_DEC_CHECK(dummy.methods) == HAMMER2_CHECK_NONE) {
3159 dummy.methods |=
3160 HAMMER2_ENC_CHECK(HAMMER2_CHECK_DEFAULT);
3161 }
3162
3163 chain = hammer2_chain_alloc(hmp, pmp, &dummy);
3164
3165 /*
3166 * Lock the chain manually, chain_lock will load the chain
3167 * which we do NOT want to do. (note: chain->refs is set
3168 * to 1 by chain_alloc() for us, but lockcnt is not).
3169 */
3170 chain->lockcnt = 1;
3171 hammer2_mtx_ex(&chain->lock);
3172 allocated = 1;
3173
3174 /*
3175 * Set INITIAL to optimize I/O. The flag will generally be
3176 * processed when we call hammer2_chain_modify().
3177 */
3178 switch(type) {
3179 case HAMMER2_BREF_TYPE_VOLUME:
3180 case HAMMER2_BREF_TYPE_FREEMAP:
3181 panic("hammer2_chain_create: called with volume type");
3182 break;
3183 case HAMMER2_BREF_TYPE_INDIRECT:
3184 panic("hammer2_chain_create: cannot be used to"
3185 "create indirect block");
3186 break;
3187 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
3188 panic("hammer2_chain_create: cannot be used to"
3189 "create freemap root or node");
3190 break;
3191 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
3192 KKASSERT(bytes == sizeof(chain->data->bmdata));
3193 /* fall through */
3194 case HAMMER2_BREF_TYPE_DIRENT:
3195 case HAMMER2_BREF_TYPE_INODE:
3196 case HAMMER2_BREF_TYPE_DATA:
3197 default:
3198 /*
3199 * leave chain->data NULL, set INITIAL
3200 */
3201 KKASSERT(chain->data == NULL);
3202 atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
3203 break;
3204 }
3205 } else {
3206 /*
3207 * We are reattaching a previously deleted chain, possibly
3208 * under a new parent and possibly with a new key/keybits.
3209 * The chain does not have to be in a modified state. The
3210 * UPDATE flag will be set later on in this routine.
3211 *
3212 * Do NOT mess with the current state of the INITIAL flag.
3213 */
3214 chain->bref.key = key;
3215 chain->bref.keybits = keybits;
3216 if (chain->flags & HAMMER2_CHAIN_DELETED)
3217 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED);
3218 KKASSERT(chain->parent == NULL);
3219 }
3220
3221 /*
3222 * Set the appropriate bref flag if requested.
3223 *
3224 * NOTE! Callers can call this function to move chains without
3225 * knowing about special flags, so don't clear bref flags
3226 * here!
3227 */
3228 if (flags & HAMMER2_INSERT_PFSROOT)
3229 chain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
3230
3231 if (parent == NULL)
3232 goto skip;
3233
3234 /*
3235 * Calculate how many entries we have in the blockref array and
3236 * determine if an indirect block is required when inserting into
3237 * the parent.
3238 */
3239 again:
3240 if (--maxloops == 0)
3241 panic("hammer2_chain_create: maxloops");
3242
3243 switch(parent->bref.type) {
3244 case HAMMER2_BREF_TYPE_INODE:
3245 if ((parent->data->ipdata.meta.op_flags &
3246 HAMMER2_OPFLAG_DIRECTDATA) != 0) {
3247 kprintf("hammer2: parent set for direct-data! "
3248 "pkey=%016jx ckey=%016jx\n",
3249 parent->bref.key,
3250 chain->bref.key);
3251 }
3252 KKASSERT((parent->data->ipdata.meta.op_flags &
3253 HAMMER2_OPFLAG_DIRECTDATA) == 0);
3254 KKASSERT(parent->data != NULL);
3255 base = &parent->data->ipdata.u.blockset.blockref[0];
3256 count = HAMMER2_SET_COUNT;
3257 break;
3258 case HAMMER2_BREF_TYPE_INDIRECT:
3259 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
3260 if (parent->flags & HAMMER2_CHAIN_INITIAL)
3261 base = NULL;
3262 else
3263 base = &parent->data->npdata[0];
3264 count = parent->bytes / sizeof(hammer2_blockref_t);
3265 break;
3266 case HAMMER2_BREF_TYPE_VOLUME:
3267 KKASSERT(parent->data != NULL);
3268 base = &parent->data->voldata.sroot_blockset.blockref[0];
3269 count = HAMMER2_SET_COUNT;
3270 break;
3271 case HAMMER2_BREF_TYPE_FREEMAP:
3272 KKASSERT(parent->data != NULL);
3273 base = &parent->data->blkset.blockref[0];
3274 count = HAMMER2_SET_COUNT;
3275 break;
3276 default:
3277 panic("hammer2_chain_create: unrecognized blockref type: %d",
3278 parent->bref.type);
3279 base = NULL;
3280 count = 0;
3281 break;
3282 }
3283
3284 /*
3285 * Make sure we've counted the brefs
3286 */
3287 if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
3288 hammer2_chain_countbrefs(parent, base, count);
3289
3290 KASSERT(parent->core.live_count >= 0 &&
3291 parent->core.live_count <= count,
3292 ("bad live_count %d/%d (%02x, %d)",
3293 parent->core.live_count, count,
3294 parent->bref.type, parent->bytes));
3295
3296 /*
3297 * If no free blockref could be found we must create an indirect
3298 * block and move a number of blockrefs into it. With the parent
3299 * locked we can safely lock each child in order to delete+duplicate
3300 * it without causing a deadlock.
3301 *
3302 * This may return the new indirect block or the old parent depending
3303 * on where the key falls. NULL is returned on error.
3304 */
3305 if (parent->core.live_count == count) {
3306 hammer2_chain_t *nparent;
3307
3308 KKASSERT((flags & HAMMER2_INSERT_SAMEPARENT) == 0);
3309
3310 nparent = hammer2_chain_create_indirect(parent, key, keybits,
3311 mtid, type, &error);
3312 if (nparent == NULL) {
3313 if (allocated)
3314 hammer2_chain_drop(chain);
3315 chain = NULL;
3316 goto done;
3317 }
3318 if (parent != nparent) {
3319 hammer2_chain_unlock(parent);
3320 hammer2_chain_drop(parent);
3321 parent = *parentp = nparent;
3322 }
3323 goto again;
3324 }
3325
3326 /*
3327 * fall through if parent, or skip to here if no parent.
3328 */
3329 skip:
3330 if (chain->flags & HAMMER2_CHAIN_DELETED)
3331 kprintf("Inserting deleted chain @%016jx\n",
3332 chain->bref.key);
3333
3334 /*
3335 * Link the chain into its parent.
3336 */
3337 if (chain->parent != NULL)
3338 panic("hammer2: hammer2_chain_create: chain already connected");
3339 KKASSERT(chain->parent == NULL);
3340 if (parent) {
3341 KKASSERT(parent->core.live_count < count);
3342 hammer2_chain_insert(parent, chain,
3343 HAMMER2_CHAIN_INSERT_SPIN |
3344 HAMMER2_CHAIN_INSERT_LIVE,
3345 0);
3346 }
3347
3348 if (allocated) {
3349 /*
3350 * Mark the newly created chain modified. This will cause
3351 * UPDATE to be set and process the INITIAL flag.
3352 *
3353 * Device buffers are not instantiated for DATA elements
3354 * as these are handled by logical buffers.
3355 *
3356 * Indirect and freemap node indirect blocks are handled
3357 * by hammer2_chain_create_indirect() and not by this
3358 * function.
3359 *
3360 * Data for all other bref types is expected to be
3361 * instantiated (INODE, LEAF).
3362 */
3363 switch(chain->bref.type) {
3364 case HAMMER2_BREF_TYPE_DATA:
3365 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
3366 case HAMMER2_BREF_TYPE_DIRENT:
3367 case HAMMER2_BREF_TYPE_INODE:
3368 error = hammer2_chain_modify(chain, mtid, dedup_off,
3369 HAMMER2_MODIFY_OPTDATA);
3370 break;
3371 default:
3372 /*
3373 * Remaining types are not supported by this function.
3374 * In particular, INDIRECT and LEAF_NODE types are
3375 * handled by create_indirect().
3376 */
3377 panic("hammer2_chain_create: bad type: %d",
3378 chain->bref.type);
3379 /* NOT REACHED */
3380 break;
3381 }
3382 } else {
3383 /*
3384 * When reconnecting a chain we must set UPDATE and
3385 * setflush so the flush recognizes that it must update
3386 * the bref in the parent.
3387 */
3388 if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0)
3389 atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
3390 }
3391
3392 /*
3393 * We must setflush(parent) to ensure that it recurses through to
3394 * chain. setflush(chain) might not work because ONFLUSH is possibly
3395 * already set in the chain (so it won't recurse up to set it in the
3396 * parent).
3397 */
3398 if (parent)
3399 hammer2_chain_setflush(parent);
3400
3401 done:
3402 *chainp = chain;
3403
3404 return (error);
3405 }
3406
3407 /*
3408 * Move the chain from its old parent to a new parent. The chain must have
3409 * already been deleted or already disconnected (or never associated) with
3410 * a parent. The chain is reassociated with the new parent and the deleted
3411 * flag will be cleared (no longer deleted). The chain's modification state
3412 * is not altered.
3413 *
3414 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (parent) TO THE INSERTION
3415 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING
3416 * FULL. This typically means that the caller is creating the chain after
3417 * doing a hammer2_chain_lookup().
3418 *
3419 * Neither (parent) or (chain) can be errored.
3420 *
3421 * If (parent) is non-NULL then the chain is inserted under the parent.
3422 *
3423 * If (parent) is NULL then the newly duplicated chain is not inserted
3424 * anywhere, similar to if it had just been chain_alloc()'d (suitable for
3425 * passing into hammer2_chain_create() after this function returns).
3426 *
3427 * WARNING! This function calls create which means it can insert indirect
3428 * blocks. This can cause other unrelated chains in the parent to
3429 * be moved to a newly inserted indirect block in addition to the
3430 * specific chain.
3431 */
void
hammer2_chain_rename(hammer2_chain_t **parentp, hammer2_chain_t *chain,
		     hammer2_tid_t mtid, int flags)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t *parent;

	/*
	 * WARNING!  We should never resolve DATA to device buffers
	 *	     (XXX allow it if the caller did?), and since
	 *	     we currently do not have the logical buffer cache
	 *	     buffer in-hand to fix its cached physical offset
	 *	     we also force the modify code to not COW it. XXX
	 *
	 * NOTE!     We allow error'd chains to be renamed.  The bref itself
	 *	     is good and can be renamed.  The content, however, may
	 *	     be inaccessible.
	 */
	KKASSERT(chain->parent == NULL);
	/*KKASSERT(chain->error == 0); allow */
	bref = &chain->bref;

	/*
	 * If parent is not NULL the duplicated chain will be entered under
	 * the parent and the UPDATE bit set to tell flush to update
	 * the blockref.
	 *
	 * We must setflush(parent) to ensure that it recurses through to
	 * chain.  setflush(chain) might not work because ONFLUSH is possibly
	 * already set in the chain (so it won't recurse up to set it in the
	 * parent).
	 *
	 * Having both chains locked is extremely important for atomicity.
	 */
	if (parentp && (parent = *parentp) != NULL) {
		KKASSERT(hammer2_mtx_owned(&parent->lock));
		KKASSERT(parent->refs > 0);
		KKASSERT(parent->error == 0);

		/*
		 * Re-insert the chain under the (possibly new) parent using
		 * its existing bref key/keybits/type.
		 *
		 * NOTE(review): the HAMMER2_ERROR_* result of
		 * hammer2_chain_create() is discarded here — presumably the
		 * subsequent UPDATE assertion is considered sufficient;
		 * confirm against callers.
		 */
		hammer2_chain_create(parentp, &chain, NULL, chain->pmp,
				     HAMMER2_METH_DEFAULT,
				     bref->key, bref->keybits, bref->type,
				     chain->bytes, mtid, 0, flags);
		KKASSERT(chain->flags & HAMMER2_CHAIN_UPDATE);
		hammer2_chain_setflush(*parentp);
	}
}
3479
3480 /*
3481 * This works in tandem with delete_obref() to install a blockref in
3482 * (typically) an indirect block that is associated with the chain being
3483 * moved to *parentp.
3484 *
3485 * The reason we need this function is that the caller needs to maintain
3486 * the blockref as it was, and not generate a new blockref for what might
3487 * be a modified chain. Otherwise stuff will leak into the flush that
3488 * the flush code's FLUSH_INODE_STOP flag is unable to catch.
3489 *
3490 * It is EXTREMELY important that we properly set CHAIN_BLKMAPUPD and
3491 * CHAIN_UPDATE. We must set BLKMAPUPD if the bref does not match, and
3492 * we must clear CHAIN_UPDATE (that was likely set by the chain_rename) if
3493 * it does. Otherwise we can end up in a situation where H2 is unable to
3494 * clean up the in-memory chain topology.
3495 *
3496 * The reason for this is that flushes do not generally flush through
3497 * BREF_TYPE_INODE chains and depend on a hammer2_inode_t queued to syncq
3498 * or sideq to properly flush and dispose of the related inode chain's flags.
3499 * Situations where the inode is not actually modified by the frontend,
3500 * but where we have to move the related chains around as we insert or cleanup
3501 * indirect blocks, can leave us with a 'dirty' (non-disposable) in-memory
3502 * inode chain that does not have a hammer2_inode_t associated with it.
3503 */
static void
hammer2_chain_rename_obref(hammer2_chain_t **parentp, hammer2_chain_t *chain,
			   hammer2_tid_t mtid, int flags,
			   hammer2_blockref_t *obref)
{
	/* Perform the normal rename first (inserts chain under *parentp). */
	hammer2_chain_rename(parentp, chain, mtid, flags);

	/*
	 * If the caller supplied a valid old blockref, install it verbatim
	 * into the (modified) parent's blockmap instead of letting the
	 * flush generate a new one.
	 */
	if (obref->type != HAMMER2_BREF_TYPE_EMPTY) {
		hammer2_blockref_t *tbase;
		int tcount;

		KKASSERT((chain->flags & HAMMER2_CHAIN_BLKMAPPED) == 0);
		hammer2_chain_modify(*parentp, mtid, 0, 0);
		tbase = hammer2_chain_base_and_count(*parentp, &tcount);
		hammer2_base_insert(*parentp, tbase, tcount, chain, obref);
		/*
		 * Set BLKMAPUPD|UPDATE when the installed bref no longer
		 * matches the chain's live bref (flush must rewrite it);
		 * otherwise clear UPDATE so the chain remains disposable.
		 * See the function comment above for why this matters.
		 */
		if (bcmp(obref, &chain->bref, sizeof(chain->bref))) {
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_BLKMAPUPD |
						      HAMMER2_CHAIN_UPDATE);
		} else {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		}
	}
}
3527
3528 /*
3529 * Helper function for deleting chains.
3530 *
3531 * The chain is removed from the live view (the RBTREE) as well as the parent's
3532 * blockmap. Both chain and its parent must be locked.
3533 *
3534 * parent may not be errored. chain can be errored.
3535 */
static int
_hammer2_chain_delete_helper(hammer2_chain_t *parent, hammer2_chain_t *chain,
			     hammer2_tid_t mtid, int flags,
			     hammer2_blockref_t *obref)
{
	int error = 0;

	KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0);
	KKASSERT(chain->parent == parent);

	if (chain->flags & HAMMER2_CHAIN_BLKMAPPED) {
		/*
		 * Chain is blockmapped, so there must be a parent.
		 * Atomically remove the chain from the parent and remove
		 * the blockmap entry.  The parent must be set modified
		 * to remove the blockmap entry.
		 */
		hammer2_blockref_t *base;
		int count;

		KKASSERT(parent != NULL);
		KKASSERT(parent->error == 0);
		KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0);
		error = hammer2_chain_modify(parent, mtid, 0, 0);
		if (error)
			goto done;

		/*
		 * Calculate blockmap pointer
		 *
		 * NOTE: both core spinlocks are held, chain before parent,
		 *	 while the RB tree and live_count are adjusted.
		 */
		KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
		hammer2_spin_ex(&chain->core.spin);
		hammer2_spin_ex(&parent->core.spin);

		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
		atomic_add_int(&parent->core.live_count, -1);
		++parent->core.generation;
		RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain);
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
		--parent->core.chain_count;
		chain->parent = NULL;

		switch(parent->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			/*
			 * Access the inode's block array.  However, there
			 * is no block array if the inode is flagged
			 * DIRECTDATA.
			 */
			if (parent->data &&
			    (parent->data->ipdata.meta.op_flags &
			     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
				base =
				   &parent->data->ipdata.u.blockset.blockref[0];
			} else {
				base = NULL;
			}
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
			if (parent->data)
				base = &parent->data->npdata[0];
			else
				base = NULL;
			count = parent->bytes / sizeof(hammer2_blockref_t);
			break;
		case HAMMER2_BREF_TYPE_VOLUME:
			base = &parent->data->voldata.
					sroot_blockset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_FREEMAP:
			base = &parent->data->blkset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		default:
			base = NULL;
			count = 0;
			panic("_hammer2_chain_delete_helper: "
			      "unrecognized blockref type: %d",
			      parent->bref.type);
			break;
		}

		/*
		 * delete blockmapped chain from its parent.
		 *
		 * The parent is not affected by any statistics in chain
		 * which are pending synchronization.  That is, there is
		 * nothing to undo in the parent since they have not yet
		 * been incorporated into the parent.
		 *
		 * The parent is affected by statistics stored in inodes.
		 * Those have already been synchronized, so they must be
		 * undone.  XXX split update possible w/delete in middle?
		 */
		if (base) {
			hammer2_base_delete(parent, base, count, chain, obref);
		}
		hammer2_spin_unex(&parent->core.spin);
		hammer2_spin_unex(&chain->core.spin);
	} else if (chain->flags & HAMMER2_CHAIN_ONRBTREE) {
		/*
		 * Chain is not blockmapped but a parent is present.
		 * Atomically remove the chain from the parent.  There is
		 * no blockmap entry to remove.
		 *
		 * Because chain was associated with a parent but not
		 * synchronized, the chain's *_count_up fields contain
		 * inode adjustment statistics which must be undone.
		 */
		hammer2_spin_ex(&chain->core.spin);
		hammer2_spin_ex(&parent->core.spin);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
		atomic_add_int(&parent->core.live_count, -1);
		++parent->core.generation;
		RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain);
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
		--parent->core.chain_count;
		chain->parent = NULL;
		hammer2_spin_unex(&parent->core.spin);
		hammer2_spin_unex(&chain->core.spin);
	} else {
		/*
		 * Chain is not blockmapped and has no parent.  This
		 * is a degenerate case.
		 */
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
	}
done:
	return error;
}
3669
3670 /*
3671 * Create an indirect block that covers one or more of the elements in the
3672 * current parent. Either returns the existing parent with no locking or
3673 * ref changes or returns the new indirect block locked and referenced
3674 * and leaving the original parent lock/ref intact as well.
3675 *
3676 * If an error occurs, NULL is returned and *errorp is set to the H2 error.
3677 *
3678 * The returned chain depends on where the specified key falls.
3679 *
 * The key/keybits for the indirect mode only needs to follow four rules:
3681 *
3682 * (1) That all elements underneath it fit within its key space and
3683 *
3684 * (2) That all elements outside it are outside its key space.
3685 *
3686 * (3) When creating the new indirect block any elements in the current
3687 * parent that fit within the new indirect block's keyspace must be
3688 * moved into the new indirect block.
3689 *
3690 * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
 *	keyspace than the current parent, but lookup/iteration rules will
3692 * ensure (and must ensure) that rule (2) for all parents leading up
3693 * to the nearest inode or the root volume header is adhered to. This
3694 * is accomplished by always recursing through matching keyspaces in
3695 * the hammer2_chain_lookup() and hammer2_chain_next() API.
3696 *
3697 * The current implementation calculates the current worst-case keyspace by
3698 * iterating the current parent and then divides it into two halves, choosing
3699 * whichever half has the most elements (not necessarily the half containing
3700 * the requested key).
3701 *
3702 * We can also opt to use the half with the least number of elements. This
3703 * causes lower-numbered keys (aka logical file offsets) to recurse through
3704 * fewer indirect blocks and higher-numbered keys to recurse through more.
3705 * This also has the risk of not moving enough elements to the new indirect
3706 * block and being forced to create several indirect blocks before the element
3707 * can be inserted.
3708 *
3709 * Must be called with an exclusively locked parent.
3710 *
 * NOTE: *errorp set to HAMMER2_ERROR_* flags
3712 */
/*
 * Forward declarations for the helpers used by
 * hammer2_chain_create_indirect() to calculate the key/keybits of a new
 * indirect block (freemap, file-data, and directory/inode flavors).
 */
static int hammer2_chain_indkey_freemap(hammer2_chain_t *parent,
				hammer2_key_t *keyp, int keybits,
				hammer2_blockref_t *base, int count);
static int hammer2_chain_indkey_file(hammer2_chain_t *parent,
				hammer2_key_t *keyp, int keybits,
				hammer2_blockref_t *base, int count,
				int ncount);
static int hammer2_chain_indkey_dir(hammer2_chain_t *parent,
				hammer2_key_t *keyp, int keybits,
				hammer2_blockref_t *base, int count,
				int ncount);
3724 static
3725 hammer2_chain_t *
hammer2_chain_create_indirect(hammer2_chain_t * parent,hammer2_key_t create_key,int create_bits,hammer2_tid_t mtid,int for_type,int * errorp)3726 hammer2_chain_create_indirect(hammer2_chain_t *parent,
3727 hammer2_key_t create_key, int create_bits,
3728 hammer2_tid_t mtid, int for_type, int *errorp)
3729 {
3730 hammer2_dev_t *hmp;
3731 hammer2_blockref_t *base;
3732 hammer2_blockref_t *bref;
3733 hammer2_blockref_t bsave;
3734 hammer2_blockref_t dummy;
3735 hammer2_chain_t *chain;
3736 hammer2_chain_t *ichain;
3737 hammer2_key_t key = create_key;
3738 hammer2_key_t key_beg;
3739 hammer2_key_t key_end;
3740 hammer2_key_t key_next;
3741 int keybits = create_bits;
3742 int count;
3743 int ncount;
3744 int nbytes;
3745 int loops;
3746 int error;
3747 int reason;
3748 int generation;
3749 int maxloops = 300000;
3750
3751 /*
3752 * Calculate the base blockref pointer or NULL if the chain
3753 * is known to be empty. We need to calculate the array count
3754 * for RB lookups either way.
3755 */
3756 hmp = parent->hmp;
3757 KKASSERT(hammer2_mtx_owned(&parent->lock));
3758
3759 /*
3760 * Pre-modify the parent now to avoid having to deal with error
3761 * processing if we tried to later (in the middle of our loop).
3762 *
3763 * We are going to be moving bref's around, the indirect blocks
3764 * cannot be in an initial state. Do not pass MODIFY_OPTDATA.
3765 */
3766 *errorp = hammer2_chain_modify(parent, mtid, 0, 0);
3767 if (*errorp) {
3768 kprintf("hammer2_chain_create_indirect: error %08x %s\n",
3769 *errorp, hammer2_error_str(*errorp));
3770 return NULL;
3771 }
3772 KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0);
3773
3774 /*hammer2_chain_modify(&parent, HAMMER2_MODIFY_OPTDATA);*/
3775 base = hammer2_chain_base_and_count(parent, &count);
3776
3777 /*
3778 * How big should our new indirect block be? It has to be at least
3779 * as large as its parent for splits to work properly.
3780 *
3781 * The freemap uses a specific indirect block size. The number of
3782 * levels are built dynamically and ultimately depend on the size
3783 * volume. Because freemap blocks are taken from the reserved areas
3784 * of the volume our goal is efficiency (fewer levels) and not so
3785 * much to save disk space.
3786 *
3787 * The first indirect block level for a directory usually uses
3788 * HAMMER2_IND_BYTES_MIN (4KB = 32 directory entries). Due to
3789 * the hash mechanism, this typically gives us a nominal
3790 * 32 * 4 entries with one level of indirection.
3791 *
3792 * We use HAMMER2_IND_BYTES_NOM (16KB = 128 blockrefs) for FILE
3793 * indirect blocks. The initial 4 entries in the inode gives us
3794 * 256KB. Up to 4 indirect blocks gives us 32MB. Three levels
3795 * of indirection gives us 137GB, and so forth. H2 can support
3796 * huge file sizes but they are not typical, so we try to stick
3797 * with compactness and do not use a larger indirect block size.
3798 *
3799 * We could use 64KB (PBUFSIZE), giving us 512 blockrefs, but
3800 * due to the way indirect blocks are created this usually winds
3801 * up being extremely inefficient for small files. Even though
3802 * 16KB requires more levels of indirection for very large files,
3803 * the 16KB records can be ganged together into 64KB DIOs.
3804 */
3805 if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
3806 for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
3807 nbytes = HAMMER2_FREEMAP_LEVELN_PSIZE;
3808 } else if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
3809 if (parent->data->ipdata.meta.type ==
3810 HAMMER2_OBJTYPE_DIRECTORY)
3811 nbytes = HAMMER2_IND_BYTES_MIN; /* 4KB = 32 entries */
3812 else
3813 nbytes = HAMMER2_IND_BYTES_NOM; /* 16KB = ~8MB file */
3814
3815 } else {
3816 nbytes = HAMMER2_IND_BYTES_NOM;
3817 }
3818 if (nbytes < count * sizeof(hammer2_blockref_t)) {
3819 KKASSERT(for_type != HAMMER2_BREF_TYPE_FREEMAP_NODE &&
3820 for_type != HAMMER2_BREF_TYPE_FREEMAP_LEAF);
3821 nbytes = count * sizeof(hammer2_blockref_t);
3822 }
3823 ncount = nbytes / sizeof(hammer2_blockref_t);
3824
3825 /*
3826 * When creating an indirect block for a freemap node or leaf
3827 * the key/keybits must be fitted to static radix levels because
3828 * particular radix levels use particular reserved blocks in the
3829 * related zone.
3830 *
3831 * This routine calculates the key/radix of the indirect block
3832 * we need to create, and whether it is on the high-side or the
3833 * low-side.
3834 */
3835 switch(for_type) {
3836 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
3837 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
3838 keybits = hammer2_chain_indkey_freemap(parent, &key, keybits,
3839 base, count);
3840 break;
3841 case HAMMER2_BREF_TYPE_DATA:
3842 keybits = hammer2_chain_indkey_file(parent, &key, keybits,
3843 base, count, ncount);
3844 break;
3845 case HAMMER2_BREF_TYPE_DIRENT:
3846 case HAMMER2_BREF_TYPE_INODE:
3847 keybits = hammer2_chain_indkey_dir(parent, &key, keybits,
3848 base, count, ncount);
3849 break;
3850 default:
3851 panic("illegal indirect block for bref type %d", for_type);
3852 break;
3853 }
3854
3855 /*
3856 * Normalize the key for the radix being represented, keeping the
3857 * high bits and throwing away the low bits.
3858 */
3859 key &= ~(((hammer2_key_t)1 << keybits) - 1);
3860
3861 /*
3862 * Ok, create our new indirect block
3863 */
3864 bzero(&dummy, sizeof(dummy));
3865 if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
3866 for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
3867 dummy.type = HAMMER2_BREF_TYPE_FREEMAP_NODE;
3868 } else {
3869 dummy.type = HAMMER2_BREF_TYPE_INDIRECT;
3870 }
3871 dummy.key = key;
3872 dummy.keybits = keybits;
3873 dummy.data_off = hammer2_getradix(nbytes);
3874 dummy.methods =
3875 HAMMER2_ENC_CHECK(HAMMER2_DEC_CHECK(parent->bref.methods)) |
3876 HAMMER2_ENC_COMP(HAMMER2_COMP_NONE);
3877
3878 ichain = hammer2_chain_alloc(hmp, parent->pmp, &dummy);
3879 atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
3880 hammer2_chain_lock(ichain, HAMMER2_RESOLVE_MAYBE);
3881 /* ichain has one ref at this point */
3882
3883 /*
3884 * We have to mark it modified to allocate its block, but use
3885 * OPTDATA to allow it to remain in the INITIAL state. Otherwise
3886 * it won't be acted upon by the flush code.
3887 *
3888 * XXX remove OPTDATA, we need a fully initialized indirect block to
3889 * be able to move the original blockref.
3890 */
3891 *errorp = hammer2_chain_modify(ichain, mtid, 0, 0);
3892 if (*errorp) {
3893 kprintf("hammer2_chain_create_indirect: error %08x %s\n",
3894 *errorp, hammer2_error_str(*errorp));
3895 hammer2_chain_unlock(ichain);
3896 hammer2_chain_drop(ichain);
3897 return NULL;
3898 }
3899 KKASSERT((ichain->flags & HAMMER2_CHAIN_INITIAL) == 0);
3900
3901 /*
3902 * Iterate the original parent and move the matching brefs into
3903 * the new indirect block.
3904 *
3905 * XXX handle flushes.
3906 */
3907 key_beg = 0;
3908 key_end = HAMMER2_KEY_MAX;
3909 key_next = 0; /* avoid gcc warnings */
3910 hammer2_spin_ex(&parent->core.spin);
3911 loops = 0;
3912 reason = 0;
3913
3914 for (;;) {
3915 /*
3916 * Parent may have been modified, relocating its block array.
3917 * Reload the base pointer.
3918 */
3919 base = hammer2_chain_base_and_count(parent, &count);
3920
3921 if (++loops > 100000) {
3922 hammer2_spin_unex(&parent->core.spin);
3923 panic("excessive loops r=%d p=%p base/count %p:%d %016jx\n",
3924 reason, parent, base, count, key_next);
3925 }
3926
3927 /*
3928 * NOTE: spinlock stays intact, returned chain (if not NULL)
3929 * is not referenced or locked which means that we
3930 * cannot safely check its flagged / deletion status
3931 * until we lock it.
3932 */
3933 chain = hammer2_combined_find(parent, base, count,
3934 &key_next,
3935 key_beg, key_end,
3936 &bref);
3937 generation = parent->core.generation;
3938 if (bref == NULL)
3939 break;
3940 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);
3941
3942 /*
3943 * Skip keys that are not within the key/radix of the new
3944 * indirect block. They stay in the parent.
3945 */
3946 if (rounddown2(key ^ bref->key, (hammer2_key_t)1 << keybits) != 0) {
3947 goto next_key_spinlocked;
3948 }
3949
3950 /*
3951 * Load the new indirect block by acquiring the related
3952 * chains (potentially from media as it might not be
3953 * in-memory). Then move it to the new parent (ichain).
3954 *
3955 * chain is referenced but not locked. We must lock the
3956 * chain to obtain definitive state.
3957 */
3958 bsave = *bref;
3959 if (chain) {
3960 /*
3961 * Use chain already present in the RBTREE
3962 */
3963 hammer2_chain_ref(chain);
3964 hammer2_spin_unex(&parent->core.spin);
3965 hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
3966 } else {
3967 /*
3968 * Get chain for blockref element. _get returns NULL
3969 * on insertion race.
3970 */
3971 hammer2_spin_unex(&parent->core.spin);
3972 chain = hammer2_chain_get(parent, generation, &bsave,
3973 HAMMER2_RESOLVE_NEVER);
3974 if (chain == NULL) {
3975 reason = 1;
3976 hammer2_spin_ex(&parent->core.spin);
3977 continue;
3978 }
3979 }
3980
3981 /*
3982 * This is always live so if the chain has been deleted
3983 * we raced someone and we have to retry.
3984 *
3985 * NOTE: Lookups can race delete-duplicate because
3986 * delete-duplicate does not lock the parent's core
3987 * (they just use the spinlock on the core).
3988 *
3989 * (note reversed logic for this one)
3990 */
3991 if (bcmp(&bsave, &chain->bref, sizeof(bsave)) ||
3992 chain->parent != parent ||
3993 (chain->flags & HAMMER2_CHAIN_DELETED)) {
3994 hammer2_chain_unlock(chain);
3995 hammer2_chain_drop(chain);
3996 if (hammer2_debug & 0x0040) {
3997 kprintf("LOST PARENT RETRY "
3998 "RETRY (%p,%p)->%p %08x\n",
3999 parent, chain->parent, chain, chain->flags);
4000 }
4001 hammer2_spin_ex(&parent->core.spin);
4002 continue;
4003 }
4004
4005 /*
4006 * Shift the chain to the indirect block.
4007 *
4008 * WARNING! No reason for us to load chain data, pass NOSTATS
4009 * to prevent delete/insert from trying to access
4010 * inode stats (and thus asserting if there is no
4011 * chain->data loaded).
4012 *
4013 * WARNING! The (parent, chain) deletion may modify the parent
4014 * and invalidate the base pointer.
4015 *
4016 * WARNING! Parent must already be marked modified, so we
4017 * can assume that chain_delete always suceeds.
4018 *
4019 * WARNING! hammer2_chain_repchange() does not have to be
4020 * called (and doesn't work anyway because we are
4021 * only doing a partial shift). A recursion that is
4022 * in-progress can continue at the current parent
4023 * and will be able to properly find its next key.
4024 */
4025 error = hammer2_chain_delete_obref(parent, chain, mtid, 0,
4026 &bsave);
4027 KKASSERT(error == 0);
4028 hammer2_chain_rename_obref(&ichain, chain, mtid, 0, &bsave);
4029 hammer2_chain_unlock(chain);
4030 hammer2_chain_drop(chain);
4031 KKASSERT(parent->refs > 0);
4032 chain = NULL;
4033 base = NULL; /* safety */
4034 hammer2_spin_ex(&parent->core.spin);
4035 next_key_spinlocked:
4036 if (--maxloops == 0)
4037 panic("hammer2_chain_create_indirect: maxloops");
4038 reason = 4;
4039 if (key_next == 0 || key_next > key_end)
4040 break;
4041 key_beg = key_next;
4042 /* loop */
4043 }
4044 hammer2_spin_unex(&parent->core.spin);
4045
4046 /*
4047 * Insert the new indirect block into the parent now that we've
4048 * cleared out some entries in the parent. We calculated a good
4049 * insertion index in the loop above (ichain->index).
4050 *
4051 * We don't have to set UPDATE here because we mark ichain
4052 * modified down below (so the normal modified -> flush -> set-moved
4053 * sequence applies).
4054 *
4055 * The insertion shouldn't race as this is a completely new block
4056 * and the parent is locked.
4057 */
4058 base = NULL; /* safety, parent modify may change address */
4059 KKASSERT((ichain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
4060 KKASSERT(parent->core.live_count < count);
4061 hammer2_chain_insert(parent, ichain,
4062 HAMMER2_CHAIN_INSERT_SPIN |
4063 HAMMER2_CHAIN_INSERT_LIVE,
4064 0);
4065
4066 /*
4067 * Make sure flushes propogate after our manual insertion.
4068 */
4069 hammer2_chain_setflush(ichain);
4070 hammer2_chain_setflush(parent);
4071
4072 /*
4073 * Figure out what to return.
4074 */
4075 if (rounddown2(create_key ^ key, (hammer2_key_t)1 << keybits) != 0) {
4076 /*
4077 * Key being created is outside the key range,
4078 * return the original parent.
4079 */
4080 hammer2_chain_unlock(ichain);
4081 hammer2_chain_drop(ichain);
4082 } else {
4083 /*
4084 * Otherwise its in the range, return the new parent.
4085 * (leave both the new and old parent locked).
4086 */
4087 parent = ichain;
4088 }
4089
4090 return(parent);
4091 }
4092
4093 /*
4094 * Do maintenance on an indirect chain. Both parent and chain are locked.
4095 *
4096 * Returns non-zero if (chain) is deleted, either due to being empty or
4097 * because its children were safely moved into the parent.
4098 */
int
hammer2_chain_indirect_maintenance(hammer2_chain_t *parent,
				   hammer2_chain_t *chain)
{
	hammer2_blockref_t *chain_base;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bsave;
	hammer2_key_t key_next;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_chain_t *sub;
	int chain_count;
	int count;
	int error;
	int generation;

	/*
	 * Make sure we have an accurate live_count
	 */
	if ((chain->flags & (HAMMER2_CHAIN_INITIAL |
			     HAMMER2_CHAIN_COUNTEDBREFS)) == 0) {
		base = &chain->data->npdata[0];
		count = chain->bytes / sizeof(hammer2_blockref_t);
		hammer2_chain_countbrefs(chain, base, count);
	}

	/*
	 * If the indirect block is empty we can delete it.
	 * (ignore deletion error)
	 */
	if (chain->core.live_count == 0 && RB_EMPTY(&chain->core.rbtree)) {
		hammer2_chain_delete(parent, chain,
				     chain->bref.modify_tid,
				     HAMMER2_DELETE_PERMANENT);
		hammer2_chain_repchange(parent, chain);
		return 1;
	}

	base = hammer2_chain_base_and_count(parent, &count);

	if ((parent->flags & (HAMMER2_CHAIN_INITIAL |
			      HAMMER2_CHAIN_COUNTEDBREFS)) == 0) {
		hammer2_chain_countbrefs(parent, base, count);
	}

	/*
	 * Determine if we can collapse chain into parent, calculate
	 * hysteresis for chain emptiness.
	 *
	 * The first test accounts for chain's own blockref slot being
	 * freed by the collapse (hence the -1).  The second test only
	 * collapses when chain is at most 3/4 full so the collapse is
	 * worthwhile and does not immediately re-split.
	 */
	if (parent->core.live_count + chain->core.live_count - 1 > count)
		return 0;
	chain_count = chain->bytes / sizeof(hammer2_blockref_t);
	if (chain->core.live_count > chain_count * 3 / 4)
		return 0;

	/*
	 * Ok, theoretically we can collapse chain's contents into
	 * parent.  chain is locked, but any in-memory children of chain
	 * are not.  For this to work, we must be able to dispose of any
	 * in-memory children of chain.
	 *
	 * For now require that there are no in-memory children of chain.
	 *
	 * WARNING! Both chain and parent must remain locked across this
	 *	    entire operation.
	 */

	/*
	 * Parent must be marked modified.  Don't try to collapse it if we
	 * can't mark it modified.  Once modified, destroy chain to make room
	 * and to get rid of what will be a conflicting key (this is included
	 * in the calculation above).  Finally, move the children of chain
	 * into chain's parent.
	 *
	 * This order creates an accounting problem for bref.embed.stats
	 * because we destroy chain before we remove its children.  Any
	 * elements whos blockref is already synchronized will be counted
	 * twice.  To deal with the problem we clean out chain's stats prior
	 * to deleting it.
	 */
	error = hammer2_chain_modify(parent, 0, 0, 0);
	if (error) {
		krateprintf(&krate_h2me, "hammer2: indirect_maint: %s\n",
			    hammer2_error_str(error));
		return 0;
	}
	error = hammer2_chain_modify(chain, chain->bref.modify_tid, 0, 0);
	if (error) {
		krateprintf(&krate_h2me, "hammer2: indirect_maint: %s\n",
			    hammer2_error_str(error));
		return 0;
	}

	/* zero stats before delete to avoid double-counting (see above) */
	chain->bref.embed.stats.inode_count = 0;
	chain->bref.embed.stats.data_count = 0;
	error = hammer2_chain_delete(parent, chain,
				     chain->bref.modify_tid,
				     HAMMER2_DELETE_PERMANENT);
	KKASSERT(error == 0);

	/*
	 * The combined_find call requires core.spin to be held.  One would
	 * think there wouldn't be any conflicts since we hold chain
	 * exclusively locked, but the caching mechanism for 0-ref children
	 * does not require a chain lock.
	 */
	hammer2_spin_ex(&chain->core.spin);

	key_next = 0;
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	for (;;) {
		/*
		 * Reload base/count each pass; chain may have been
		 * modified while the spinlock was dropped below.
		 */
		chain_base = &chain->data->npdata[0];
		chain_count = chain->bytes / sizeof(hammer2_blockref_t);
		sub = hammer2_combined_find(chain, chain_base, chain_count,
					    &key_next,
					    key_beg, key_end,
					    &bref);
		generation = chain->core.generation;
		if (bref == NULL)
			break;
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * sub (if not NULL) is unreferenced/unlocked.  We must
		 * drop the spinlock to ref+lock it (or instantiate it
		 * from the media blockref), then re-validate below.
		 */
		bsave = *bref;
		if (sub) {
			hammer2_chain_ref(sub);
			hammer2_spin_unex(&chain->core.spin);
			hammer2_chain_lock(sub, HAMMER2_RESOLVE_NEVER);
		} else {
			hammer2_spin_unex(&chain->core.spin);
			sub = hammer2_chain_get(chain, generation, &bsave,
						HAMMER2_RESOLVE_NEVER);
			if (sub == NULL) {
				/* lost an insertion race, retry the key */
				hammer2_spin_ex(&chain->core.spin);
				continue;
			}
		}
		/*
		 * Re-validate after locking; retry if the element was
		 * ripped out, reparented, or deleted while unlocked.
		 */
		if (bcmp(&bsave, &sub->bref, sizeof(bsave)) ||
		    sub->parent != chain ||
		    (sub->flags & HAMMER2_CHAIN_DELETED)) {
			hammer2_chain_unlock(sub);
			hammer2_chain_drop(sub);
			hammer2_spin_ex(&chain->core.spin);
			sub = NULL;	/* safety */
			continue;
		}
		/* move the element from chain up into parent */
		error = hammer2_chain_delete_obref(chain, sub,
						   sub->bref.modify_tid, 0,
						   &bsave);
		KKASSERT(error == 0);
		hammer2_chain_rename_obref(&parent, sub,
					   sub->bref.modify_tid,
					   HAMMER2_INSERT_SAMEPARENT, &bsave);
		hammer2_chain_unlock(sub);
		hammer2_chain_drop(sub);
		hammer2_spin_ex(&chain->core.spin);

		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&chain->core.spin);

	hammer2_chain_repchange(parent, chain);

	return 1;
}
4267
4268 /*
4269 * Freemap indirect blocks
4270 *
4271 * Calculate the keybits and highside/lowside of the freemap node the
4272 * caller is creating.
4273 *
4274 * This routine will specify the next higher-level freemap key/radix
4275 * representing the lowest-ordered set. By doing so, eventually all
4276 * low-ordered sets will be moved one level down.
4277 *
4278 * We have to be careful here because the freemap reserves a limited
4279 * number of blocks for a limited number of levels. So we can't just
4280 * push indiscriminately.
4281 */
4282 int
hammer2_chain_indkey_freemap(hammer2_chain_t * parent,hammer2_key_t * keyp,int keybits,hammer2_blockref_t * base,int count)4283 hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp,
4284 int keybits, hammer2_blockref_t *base, int count)
4285 {
4286 hammer2_chain_t *chain;
4287 hammer2_blockref_t *bref;
4288 hammer2_key_t key;
4289 hammer2_key_t key_beg;
4290 hammer2_key_t key_end;
4291 hammer2_key_t key_next;
4292 int maxloops = 300000;
4293
4294 key = *keyp;
4295 keybits = 64;
4296
4297 /*
4298 * Calculate the range of keys in the array being careful to skip
4299 * slots which are overridden with a deletion.
4300 */
4301 key_beg = 0;
4302 key_end = HAMMER2_KEY_MAX;
4303 hammer2_spin_ex(&parent->core.spin);
4304
4305 for (;;) {
4306 if (--maxloops == 0) {
4307 panic("indkey_freemap shit %p %p:%d\n",
4308 parent, base, count);
4309 }
4310 chain = hammer2_combined_find(parent, base, count,
4311 &key_next,
4312 key_beg, key_end,
4313 &bref);
4314
4315 /*
4316 * Exhausted search
4317 */
4318 if (bref == NULL)
4319 break;
4320
4321 /*
4322 * Skip deleted chains.
4323 */
4324 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
4325 if (key_next == 0 || key_next > key_end)
4326 break;
4327 key_beg = key_next;
4328 continue;
4329 }
4330
4331 /*
4332 * Use the full live (not deleted) element for the scan
4333 * iteration. HAMMER2 does not allow partial replacements.
4334 *
4335 * XXX should be built into hammer2_combined_find().
4336 */
4337 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);
4338
4339 if (keybits > bref->keybits) {
4340 key = bref->key;
4341 keybits = bref->keybits;
4342 } else if (keybits == bref->keybits && bref->key < key) {
4343 key = bref->key;
4344 }
4345 if (key_next == 0)
4346 break;
4347 key_beg = key_next;
4348 }
4349 hammer2_spin_unex(&parent->core.spin);
4350
4351 /*
4352 * Return the keybits for a higher-level FREEMAP_NODE covering
4353 * this node.
4354 */
4355 switch(keybits) {
4356 case HAMMER2_FREEMAP_LEVEL0_RADIX:
4357 keybits = HAMMER2_FREEMAP_LEVEL1_RADIX;
4358 break;
4359 case HAMMER2_FREEMAP_LEVEL1_RADIX:
4360 keybits = HAMMER2_FREEMAP_LEVEL2_RADIX;
4361 break;
4362 case HAMMER2_FREEMAP_LEVEL2_RADIX:
4363 keybits = HAMMER2_FREEMAP_LEVEL3_RADIX;
4364 break;
4365 case HAMMER2_FREEMAP_LEVEL3_RADIX:
4366 keybits = HAMMER2_FREEMAP_LEVEL4_RADIX;
4367 break;
4368 case HAMMER2_FREEMAP_LEVEL4_RADIX:
4369 keybits = HAMMER2_FREEMAP_LEVEL5_RADIX;
4370 break;
4371 case HAMMER2_FREEMAP_LEVEL5_RADIX:
4372 panic("hammer2_chain_indkey_freemap: level too high");
4373 break;
4374 default:
4375 panic("hammer2_chain_indkey_freemap: bad radix");
4376 break;
4377 }
4378 *keyp = key;
4379
4380 return (keybits);
4381 }
4382
4383 /*
4384 * File indirect blocks
4385 *
4386 * Calculate the key/keybits for the indirect block to create by scanning
4387 * existing keys. The key being created is also passed in *keyp and can be
4388 * inside or outside the indirect block. Regardless, the indirect block
4389 * must hold at least two keys in order to guarantee sufficient space.
4390 *
4391 * We use a modified version of the freemap's fixed radix tree, but taylored
4392 * for file data. Basically we configure an indirect block encompassing the
4393 * smallest key.
4394 */
4395 static int
hammer2_chain_indkey_file(hammer2_chain_t * parent,hammer2_key_t * keyp,int keybits,hammer2_blockref_t * base,int count,int ncount)4396 hammer2_chain_indkey_file(hammer2_chain_t *parent, hammer2_key_t *keyp,
4397 int keybits, hammer2_blockref_t *base, int count,
4398 int ncount)
4399 {
4400 hammer2_chain_t *chain;
4401 hammer2_blockref_t *bref;
4402 hammer2_key_t key;
4403 hammer2_key_t key_beg;
4404 hammer2_key_t key_end;
4405 hammer2_key_t key_next;
4406 int nradix;
4407 int maxloops = 300000;
4408
4409 key = *keyp;
4410 keybits = 64;
4411
4412 /*
4413 * Calculate the range of keys in the array being careful to skip
4414 * slots which are overridden with a deletion.
4415 *
4416 * Locate the smallest key.
4417 */
4418 key_beg = 0;
4419 key_end = HAMMER2_KEY_MAX;
4420 hammer2_spin_ex(&parent->core.spin);
4421
4422 for (;;) {
4423 if (--maxloops == 0) {
4424 panic("indkey_freemap shit %p %p:%d\n",
4425 parent, base, count);
4426 }
4427 chain = hammer2_combined_find(parent, base, count,
4428 &key_next,
4429 key_beg, key_end,
4430 &bref);
4431
4432 /*
4433 * Exhausted search
4434 */
4435 if (bref == NULL)
4436 break;
4437
4438 /*
4439 * Skip deleted chains.
4440 */
4441 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
4442 if (key_next == 0 || key_next > key_end)
4443 break;
4444 key_beg = key_next;
4445 continue;
4446 }
4447
4448 /*
4449 * Use the full live (not deleted) element for the scan
4450 * iteration. HAMMER2 does not allow partial replacements.
4451 *
4452 * XXX should be built into hammer2_combined_find().
4453 */
4454 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);
4455
4456 if (keybits > bref->keybits) {
4457 key = bref->key;
4458 keybits = bref->keybits;
4459 } else if (keybits == bref->keybits && bref->key < key) {
4460 key = bref->key;
4461 }
4462 if (key_next == 0)
4463 break;
4464 key_beg = key_next;
4465 }
4466 hammer2_spin_unex(&parent->core.spin);
4467
4468 /*
4469 * Calculate the static keybits for a higher-level indirect block
4470 * that contains the key.
4471 */
4472 *keyp = key;
4473
4474 switch(ncount) {
4475 case HAMMER2_IND_COUNT_MIN:
4476 nradix = HAMMER2_IND_RADIX_MIN - HAMMER2_BLOCKREF_RADIX;
4477 break;
4478 case HAMMER2_IND_COUNT_NOM:
4479 nradix = HAMMER2_IND_RADIX_NOM - HAMMER2_BLOCKREF_RADIX;
4480 break;
4481 case HAMMER2_IND_COUNT_MAX:
4482 nradix = HAMMER2_IND_RADIX_MAX - HAMMER2_BLOCKREF_RADIX;
4483 break;
4484 default:
4485 panic("bad ncount %d\n", ncount);
4486 nradix = 0;
4487 break;
4488 }
4489
4490 /*
4491 * The largest radix that can be returned for an indirect block is
4492 * 63 bits. (The largest practical indirect block radix is actually
4493 * 62 bits because the top-level inode or volume root contains four
4494 * entries, but allow 63 to be returned).
4495 */
4496 if (nradix >= 64)
4497 nradix = 63;
4498
4499 return keybits + nradix;
4500 }
4501
4502 #if 1
4503
4504 /*
4505 * Directory indirect blocks.
4506 *
4507 * Covers both the inode index (directory of inodes), and directory contents
4508 * (filenames hardlinked to inodes).
4509 *
4510 * Because directory keys are hashed we generally try to cut the space in
4511 * half. We accomodate the inode index (which tends to have linearly
4512 * increasing inode numbers) by ensuring that the keyspace is at least large
4513 * enough to fill up the indirect block being created.
4514 */
4515 static int
hammer2_chain_indkey_dir(hammer2_chain_t * parent,hammer2_key_t * keyp,int keybits,hammer2_blockref_t * base,int count,int ncount)4516 hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp,
4517 int keybits, hammer2_blockref_t *base, int count,
4518 int ncount)
4519 {
4520 hammer2_blockref_t *bref;
4521 hammer2_chain_t *chain;
4522 hammer2_key_t key_beg;
4523 hammer2_key_t key_end;
4524 hammer2_key_t key_next;
4525 hammer2_key_t key;
4526 int nkeybits;
4527 int locount;
4528 int hicount;
4529 int maxloops = 300000;
4530
4531 /*
4532 * NOTE: We can't take a shortcut here anymore for inodes because
4533 * the root directory can contain a mix of inodes and directory
4534 * entries (we used to just return 63 if parent->bref.type was
4535 * HAMMER2_BREF_TYPE_INODE.
4536 */
4537 key = *keyp;
4538 locount = 0;
4539 hicount = 0;
4540
4541 /*
4542 * Calculate the range of keys in the array being careful to skip
4543 * slots which are overridden with a deletion.
4544 */
4545 key_beg = 0;
4546 key_end = HAMMER2_KEY_MAX;
4547 hammer2_spin_ex(&parent->core.spin);
4548
4549 for (;;) {
4550 if (--maxloops == 0) {
4551 panic("indkey_freemap shit %p %p:%d\n",
4552 parent, base, count);
4553 }
4554 chain = hammer2_combined_find(parent, base, count,
4555 &key_next,
4556 key_beg, key_end,
4557 &bref);
4558
4559 /*
4560 * Exhausted search
4561 */
4562 if (bref == NULL)
4563 break;
4564
4565 /*
4566 * Deleted object
4567 */
4568 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
4569 if (key_next == 0 || key_next > key_end)
4570 break;
4571 key_beg = key_next;
4572 continue;
4573 }
4574
4575 /*
4576 * Use the full live (not deleted) element for the scan
4577 * iteration. HAMMER2 does not allow partial replacements.
4578 *
4579 * XXX should be built into hammer2_combined_find().
4580 */
4581 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);
4582
4583 /*
4584 * Expand our calculated key range (key, keybits) to fit
4585 * the scanned key. nkeybits represents the full range
4586 * that we will later cut in half (two halves @ nkeybits - 1).
4587 */
4588 nkeybits = keybits;
4589 if (nkeybits < bref->keybits) {
4590 if (bref->keybits > 64) {
4591 kprintf("bad bref chain %p bref %p\n",
4592 chain, bref);
4593 Debugger("fubar");
4594 }
4595 nkeybits = bref->keybits;
4596 }
4597 while (nkeybits < 64 &&
4598 rounddown2(key ^ bref->key, (hammer2_key_t)1 << nkeybits) != 0) {
4599 ++nkeybits;
4600 }
4601
4602 /*
4603 * If the new key range is larger we have to determine
4604 * which side of the new key range the existing keys fall
4605 * under by checking the high bit, then collapsing the
4606 * locount into the hicount or vise-versa.
4607 */
4608 if (keybits != nkeybits) {
4609 if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
4610 hicount += locount;
4611 locount = 0;
4612 } else {
4613 locount += hicount;
4614 hicount = 0;
4615 }
4616 keybits = nkeybits;
4617 }
4618
4619 /*
4620 * The newly scanned key will be in the lower half or the
4621 * upper half of the (new) key range.
4622 */
4623 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
4624 ++hicount;
4625 else
4626 ++locount;
4627
4628 if (key_next == 0)
4629 break;
4630 key_beg = key_next;
4631 }
4632 hammer2_spin_unex(&parent->core.spin);
4633 bref = NULL; /* now invalid (safety) */
4634
4635 /*
4636 * Adjust keybits to represent half of the full range calculated
4637 * above (radix 63 max) for our new indirect block.
4638 */
4639 --keybits;
4640
4641 /*
4642 * Expand keybits to hold at least ncount elements. ncount will be
4643 * a power of 2. This is to try to completely fill leaf nodes (at
4644 * least for keys which are not hashes).
4645 *
4646 * We aren't counting 'in' or 'out', we are counting 'high side'
4647 * and 'low side' based on the bit at (1LL << keybits). We want
4648 * everything to be inside in these cases so shift it all to
4649 * the low or high side depending on the new high bit.
4650 */
4651 while (((hammer2_key_t)1 << keybits) < ncount) {
4652 ++keybits;
4653 if (key & ((hammer2_key_t)1 << keybits)) {
4654 hicount += locount;
4655 locount = 0;
4656 } else {
4657 locount += hicount;
4658 hicount = 0;
4659 }
4660 }
4661
4662 if (hicount > locount)
4663 key |= (hammer2_key_t)1 << keybits;
4664 else
4665 key &= ~(hammer2_key_t)1 << keybits;
4666
4667 *keyp = key;
4668
4669 return (keybits);
4670 }
4671
4672 #else
4673
4674 /*
4675 * Directory indirect blocks.
4676 *
4677 * Covers both the inode index (directory of inodes), and directory contents
4678 * (filenames hardlinked to inodes).
4679 *
4680 * Because directory keys are hashed we generally try to cut the space in
4681 * half. We accomodate the inode index (which tends to have linearly
4682 * increasing inode numbers) by ensuring that the keyspace is at least large
4683 * enough to fill up the indirect block being created.
4684 */
static int
hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp,
			 int keybits, hammer2_blockref_t *base, int count,
			 int ncount)
{
	/*
	 * NOTE: This is the compiled-out (#else) variant of
	 *	 hammer2_chain_indkey_dir, retained for reference.  It
	 *	 differs from the active variant by short-cutting when the
	 *	 parent is an inode.
	 */
	hammer2_blockref_t *bref;
	hammer2_chain_t	*chain;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	hammer2_key_t key;
	int nkeybits;
	int locount;
	int hicount;
	int maxloops = 300000;

	/*
	 * Shortcut if the parent is the inode.  In this situation the
	 * parent has 4+1 directory entries and we are creating an indirect
	 * block capable of holding many more.
	 */
	if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
		return 63;
	}

	key = *keyp;
	locount = 0;
	hicount = 0;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Deleted object
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Expand our calculated key range (key, keybits) to fit
		 * the scanned key.  nkeybits represents the full range
		 * that we will later cut in half (two halves @ nkeybits - 1).
		 */
		nkeybits = keybits;
		if (nkeybits < bref->keybits) {
			if (bref->keybits > 64) {
				kprintf("bad bref chain %p bref %p\n",
					chain, bref);
				Debugger("fubar");
			}
			nkeybits = bref->keybits;
		}
		while (nkeybits < 64 &&
		       (~(((hammer2_key_t)1 << nkeybits) - 1) &
		        (key ^ bref->key)) != 0) {
			++nkeybits;
		}

		/*
		 * If the new key range is larger we have to determine
		 * which side of the new key range the existing keys fall
		 * under by checking the high bit, then collapsing the
		 * locount into the hicount or vise-versa.
		 */
		if (keybits != nkeybits) {
			if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
				hicount += locount;
				locount = 0;
			} else {
				locount += hicount;
				hicount = 0;
			}
			keybits = nkeybits;
		}

		/*
		 * The newly scanned key will be in the lower half or the
		 * upper half of the (new) key range.
		 */
		if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
			++hicount;
		else
			++locount;

		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);
	bref = NULL;	/* now invalid (safety) */

	/*
	 * Adjust keybits to represent half of the full range calculated
	 * above (radix 63 max) for our new indirect block.
	 */
	--keybits;

	/*
	 * Expand keybits to hold at least ncount elements.  ncount will be
	 * a power of 2.  This is to try to completely fill leaf nodes (at
	 * least for keys which are not hashes).
	 *
	 * We aren't counting 'in' or 'out', we are counting 'high side'
	 * and 'low side' based on the bit at (1LL << keybits).  We want
	 * everything to be inside in these cases so shift it all to
	 * the low or high side depending on the new high bit.
	 */
	while (((hammer2_key_t)1 << keybits) < ncount) {
		++keybits;
		if (key & ((hammer2_key_t)1 << keybits)) {
			hicount += locount;
			locount = 0;
		} else {
			locount += hicount;
			hicount = 0;
		}
	}

	if (hicount > locount)
		key |= (hammer2_key_t)1 << keybits;
	else
		key &= ~(hammer2_key_t)1 << keybits;

	*keyp = key;

	return (keybits);
}
4845
4846 #endif
4847
4848 /*
4849 * Sets CHAIN_DELETED and remove the chain's blockref from the parent if
4850 * it exists.
4851 *
4852 * Both parent and chain must be locked exclusively.
4853 *
4854 * This function will modify the parent if the blockref requires removal
4855 * from the parent's block table.
4856 *
4857 * This function is NOT recursive. Any entity already pushed into the
4858 * chain (such as an inode) may still need visibility into its contents,
4859 * as well as the ability to read and modify the contents. For example,
4860 * for an unlinked file which is still open.
4861 *
4862 * Also note that the flusher is responsible for cleaning up empty
4863 * indirect blocks.
4864 */
4865 int
hammer2_chain_delete(hammer2_chain_t * parent,hammer2_chain_t * chain,hammer2_tid_t mtid,int flags)4866 hammer2_chain_delete(hammer2_chain_t *parent, hammer2_chain_t *chain,
4867 hammer2_tid_t mtid, int flags)
4868 {
4869 int error = 0;
4870
4871 KKASSERT(hammer2_mtx_owned(&chain->lock));
4872
4873 /*
4874 * Nothing to do if already marked.
4875 *
4876 * We need the spinlock on the core whos RBTREE contains chain
4877 * to protect against races.
4878 */
4879 if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) {
4880 KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 &&
4881 chain->parent == parent);
4882 error = _hammer2_chain_delete_helper(parent, chain,
4883 mtid, flags, NULL);
4884 }
4885
4886 /*
4887 * Permanent deletions mark the chain as destroyed.
4888 *
4889 * NOTE: We do not setflush the chain unless the deletion is
4890 * permanent, since the deletion of a chain does not actually
4891 * require it to be flushed.
4892 */
4893 if (error == 0) {
4894 if (flags & HAMMER2_DELETE_PERMANENT) {
4895 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);
4896 hammer2_chain_setflush(chain);
4897 }
4898 }
4899
4900 return error;
4901 }
4902
4903 static int
hammer2_chain_delete_obref(hammer2_chain_t * parent,hammer2_chain_t * chain,hammer2_tid_t mtid,int flags,hammer2_blockref_t * obref)4904 hammer2_chain_delete_obref(hammer2_chain_t *parent, hammer2_chain_t *chain,
4905 hammer2_tid_t mtid, int flags,
4906 hammer2_blockref_t *obref)
4907 {
4908 int error = 0;
4909
4910 KKASSERT(hammer2_mtx_owned(&chain->lock));
4911
4912 /*
4913 * Nothing to do if already marked.
4914 *
4915 * We need the spinlock on the core whos RBTREE contains chain
4916 * to protect against races.
4917 */
4918 obref->type = HAMMER2_BREF_TYPE_EMPTY;
4919 if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) {
4920 KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 &&
4921 chain->parent == parent);
4922 error = _hammer2_chain_delete_helper(parent, chain,
4923 mtid, flags, obref);
4924 }
4925
4926 /*
4927 * Permanent deletions mark the chain as destroyed.
4928 *
4929 * NOTE: We do not setflush the chain unless the deletion is
4930 * permanent, since the deletion of a chain does not actually
4931 * require it to be flushed.
4932 */
4933 if (error == 0) {
4934 if (flags & HAMMER2_DELETE_PERMANENT) {
4935 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);
4936 hammer2_chain_setflush(chain);
4937 }
4938 }
4939
4940 return error;
4941 }
4942
/*
 * Returns the index of the nearest element in the blockref array >= elm.
 * Returns (count) if no element could be found.
 *
 * Sets *key_nextp to the next key for loop purposes but does not modify
 * it if the next key would be higher than the current value of *key_nextp.
 * Note that *key_nextp can overflow to 0, which should be tested by the
 * caller.
 *
 * WARNING! Must be called with parent's spinlock held.  Spinlock remains
 *	    held through the operation.
 */
static int
hammer2_base_find(hammer2_chain_t *parent,
		  hammer2_blockref_t *base, int count,
		  hammer2_key_t *key_nextp,
		  hammer2_key_t key_beg, hammer2_key_t key_end)
{
	hammer2_blockref_t *scan;
	hammer2_key_t scan_end;
	int i;
	int limit;

	/*
	 * Require the live chain's already have their core's counted
	 * so we can optimize operations.
	 */
	KKASSERT(parent->flags & HAMMER2_CHAIN_COUNTEDBREFS);

	/*
	 * Degenerate case
	 */
	if (count == 0 || base == NULL)
		return(count);

	/*
	 * Sequential optimization using parent->cache_index.  This is
	 * the most likely scenario.
	 *
	 * We can avoid trailing empty entries on live chains, otherwise
	 * we might have to check the whole block array.
	 *
	 * cache_index is read unlocked; any stale value is clamped into
	 * [0, live_zero) below, so a race only costs a longer scan.
	 */
	i = parent->cache_index;	/* SMP RACE OK */
	cpu_ccfence();
	limit = parent->core.live_zero;
	if (i >= limit)
		i = limit - 1;
	if (i < 0)
		i = 0;
	KKASSERT(i < count);

	/*
	 * Search backwards until we land on a non-empty element whose
	 * key does not exceed key_beg (or hit index 0).
	 */
	scan = &base[i];
	while (i > 0 && (scan->type == HAMMER2_BREF_TYPE_EMPTY ||
	    scan->key > key_beg)) {
		--scan;
		--i;
	}
	parent->cache_index = i;

	/*
	 * Search forwards, stop when we find a scan element which
	 * encloses the key or until we know that there are no further
	 * elements.
	 */
	while (i < count) {
		if (scan->type != HAMMER2_BREF_TYPE_EMPTY) {
			scan_end = scan->key +
				   ((hammer2_key_t)1 << scan->keybits) - 1;
			/*
			 * Element begins beyond key_beg, or its inclusive
			 * end reaches key_beg (i.e. it overlaps key_beg).
			 */
			if (scan->key > key_beg || scan_end >= key_beg)
				break;
		}
		if (i >= limit)
			return (count);
		++scan;
		++i;
	}
	if (i != count) {
		parent->cache_index = i;
		if (i >= limit) {
			i = count;
		} else {
			/*
			 * scan_end is the key one past this element; only
			 * shrink *key_nextp (0 means "unset/overflowed").
			 */
			scan_end = scan->key +
				   ((hammer2_key_t)1 << scan->keybits);
			if (scan_end && (*key_nextp > scan_end ||
					 *key_nextp == 0)) {
				*key_nextp = scan_end;
			}
		}
	}
	return (i);
}
5037
/*
 * Do a combined search and return the next match either from the blockref
 * array or from the in-memory chain.  Sets *brefp to the returned bref in
 * both cases, or sets it to NULL if the search exhausted.  Only returns
 * a non-NULL chain if the search matched from the in-memory chain.
 *
 * When no in-memory chain matched but a blockref did, NULL is returned
 * for the chain and the matching blockref (pointing into the parent's
 * block array) is returned in *brefp.
 *
 * The returned chain is not locked or referenced.  Use the returned bref
 * to determine if the search exhausted or not.  Iterate if the base find
 * is chosen but matches a deleted chain.
 *
 * WARNING! Must be called with parent's spinlock held.  Spinlock remains
 *	    held through the operation.
 */
static hammer2_chain_t *
hammer2_combined_find(hammer2_chain_t *parent,
		      hammer2_blockref_t *base, int count,
		      hammer2_key_t *key_nextp,
		      hammer2_key_t key_beg, hammer2_key_t key_end,
		      hammer2_blockref_t **brefp)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t *chain;
	int i;

	/*
	 * Lookup in block array and in rbtree.
	 */
	*key_nextp = key_end + 1;
	i = hammer2_base_find(parent, base, count, key_nextp,
			      key_beg, key_end);
	chain = hammer2_chain_find(parent, key_nextp, key_beg, key_end);

	/*
	 * Neither matched
	 */
	if (i == count && chain == NULL) {
		*brefp = NULL;
		return(NULL);
	}

	/*
	 * Only chain matched.
	 */
	if (i == count) {
		bref = &chain->bref;
		goto found;
	}

	/*
	 * Only blockref matched.
	 */
	if (chain == NULL) {
		bref = &base[i];
		goto found;
	}

	/*
	 * Both in-memory and blockref matched, select the nearer element.
	 *
	 * If both are flush with the left-hand side or both are the
	 * same distance away, select the chain.  In this situation the
	 * chain must have been loaded from the matching blockmap.
	 */
	if ((chain->bref.key <= key_beg && base[i].key <= key_beg) ||
	    chain->bref.key == base[i].key) {
		KKASSERT(chain->bref.key == base[i].key);
		bref = &chain->bref;
		goto found;
	}

	/*
	 * Select the nearer key.  Clearing chain signals the caller that
	 * the blockref array won.
	 */
	if (chain->bref.key < base[i].key) {
		bref = &chain->bref;
	} else {
		bref = &base[i];
		chain = NULL;
	}

	/*
	 * If the bref is out of bounds we've exhausted our search.
	 */
found:
	if (bref->key > key_end) {
		*brefp = NULL;
		chain = NULL;
	} else {
		*brefp = bref;
	}
	return(chain);
}
5134
5135 /*
5136 * Locate the specified block array element and delete it. The element
5137 * must exist.
5138 *
5139 * The spin lock on the related chain must be held.
5140 *
5141 * NOTE: live_count was adjusted when the chain was deleted, so it does not
5142 * need to be adjusted when we commit the media change.
5143 */
5144 void
hammer2_base_delete(hammer2_chain_t * parent,hammer2_blockref_t * base,int count,hammer2_chain_t * chain,hammer2_blockref_t * obref)5145 hammer2_base_delete(hammer2_chain_t *parent,
5146 hammer2_blockref_t *base, int count,
5147 hammer2_chain_t *chain,
5148 hammer2_blockref_t *obref)
5149 {
5150 hammer2_blockref_t *elm = &chain->bref;
5151 hammer2_blockref_t *scan;
5152 hammer2_key_t key_next;
5153 int i;
5154
5155 /*
5156 * Delete element. Expect the element to exist.
5157 *
5158 * XXX see caller, flush code not yet sophisticated enough to prevent
5159 * re-flushed in some cases.
5160 */
5161 key_next = 0; /* max range */
5162 i = hammer2_base_find(parent, base, count, &key_next,
5163 elm->key, elm->key);
5164 scan = &base[i];
5165 if (i == count || scan->type == HAMMER2_BREF_TYPE_EMPTY ||
5166 scan->key != elm->key ||
5167 ((chain->flags & HAMMER2_CHAIN_BLKMAPUPD) == 0 &&
5168 scan->keybits != elm->keybits)) {
5169 hammer2_spin_unex(&parent->core.spin);
5170 panic("delete base %p element not found at %d/%d elm %p\n",
5171 base, i, count, elm);
5172 return;
5173 }
5174
5175 /*
5176 * Update stats and zero the entry.
5177 *
5178 * NOTE: Handle radix == 0 (0 bytes) case.
5179 */
5180 if ((int)(scan->data_off & HAMMER2_OFF_MASK_RADIX)) {
5181 parent->bref.embed.stats.data_count -= (hammer2_off_t)1 <<
5182 (int)(scan->data_off & HAMMER2_OFF_MASK_RADIX);
5183 }
5184 switch(scan->type) {
5185 case HAMMER2_BREF_TYPE_INODE:
5186 --parent->bref.embed.stats.inode_count;
5187 /* fall through */
5188 case HAMMER2_BREF_TYPE_DATA:
5189 if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
5190 atomic_set_int(&chain->flags,
5191 HAMMER2_CHAIN_HINT_LEAF_COUNT);
5192 } else {
5193 if (parent->bref.leaf_count)
5194 --parent->bref.leaf_count;
5195 }
5196 /* fall through */
5197 case HAMMER2_BREF_TYPE_INDIRECT:
5198 if (scan->type != HAMMER2_BREF_TYPE_DATA) {
5199 parent->bref.embed.stats.data_count -=
5200 scan->embed.stats.data_count;
5201 parent->bref.embed.stats.inode_count -=
5202 scan->embed.stats.inode_count;
5203 }
5204 if (scan->type == HAMMER2_BREF_TYPE_INODE)
5205 break;
5206 if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
5207 atomic_set_int(&chain->flags,
5208 HAMMER2_CHAIN_HINT_LEAF_COUNT);
5209 } else {
5210 if (parent->bref.leaf_count <= scan->leaf_count)
5211 parent->bref.leaf_count = 0;
5212 else
5213 parent->bref.leaf_count -= scan->leaf_count;
5214 }
5215 break;
5216 case HAMMER2_BREF_TYPE_DIRENT:
5217 if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
5218 atomic_set_int(&chain->flags,
5219 HAMMER2_CHAIN_HINT_LEAF_COUNT);
5220 } else {
5221 if (parent->bref.leaf_count)
5222 --parent->bref.leaf_count;
5223 }
5224 default:
5225 break;
5226 }
5227
5228 if (obref)
5229 *obref = *scan;
5230 bzero(scan, sizeof(*scan));
5231
5232 /*
5233 * We can only optimize parent->core.live_zero for live chains.
5234 */
5235 if (parent->core.live_zero == i + 1) {
5236 while (--i >= 0 && base[i].type == HAMMER2_BREF_TYPE_EMPTY)
5237 ;
5238 parent->core.live_zero = i + 1;
5239 }
5240
5241 /*
5242 * Clear appropriate blockmap flags in chain.
5243 */
5244 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_BLKMAPPED |
5245 HAMMER2_CHAIN_BLKMAPUPD);
5246 }
5247
/*
 * Insert the specified element.  The block array must not already have the
 * element and must have space available for the insertion.
 *
 * The spin lock on the related chain must be held.
 *
 * NOTE: live_count was adjusted when the chain was deleted, so it does not
 *	 need to be adjusted when we commit the media change.
 */
void
hammer2_base_insert(hammer2_chain_t *parent,
		    hammer2_blockref_t *base, int count,
		    hammer2_chain_t *chain, hammer2_blockref_t *elm)
{
	hammer2_key_t key_next;
	hammer2_key_t xkey;
	int i;		/* insertion point returned by base_find */
	int j;		/* scans left of i for an empty slot */
	int k;		/* scans right of i for an empty slot */
	int l;		/* iterator for the validation pass */
	int u = 1;	/* which shift path was taken (debug panic only) */

	/*
	 * Insert new element.  Expect the element to not already exist
	 * unless we are replacing it.
	 *
	 * XXX see caller, flush code not yet sophisticated enough to prevent
	 *     re-flushed in some cases.
	 */
	key_next = 0; /* max range */
	i = hammer2_base_find(parent, base, count, &key_next,
			      elm->key, elm->key);

	/*
	 * Shortcut fill optimization, typical ordered insertion(s) may not
	 * require a search.
	 */
	KKASSERT(i >= 0 && i <= count);

	/*
	 * Set appropriate blockmap flags in chain (if not NULL)
	 */
	if (chain)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_BLKMAPPED);

	/*
	 * Update stats and zero the entry
	 */
	if ((int)(elm->data_off & HAMMER2_OFF_MASK_RADIX)) {
		parent->bref.embed.stats.data_count += (hammer2_off_t)1 <<
			(int)(elm->data_off & HAMMER2_OFF_MASK_RADIX);
	}
	/*
	 * Per-type bookkeeping, mirror image of hammer2_base_delete().
	 * leaf_count saturates at HAMMER2_BLOCKREF_LEAF_MAX.
	 */
	switch(elm->type) {
	case HAMMER2_BREF_TYPE_INODE:
		++parent->bref.embed.stats.inode_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_DATA:
		if (parent->bref.leaf_count != HAMMER2_BLOCKREF_LEAF_MAX)
			++parent->bref.leaf_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_INDIRECT:
		if (elm->type != HAMMER2_BREF_TYPE_DATA) {
			parent->bref.embed.stats.data_count +=
				elm->embed.stats.data_count;
			parent->bref.embed.stats.inode_count +=
				elm->embed.stats.inode_count;
		}
		if (elm->type == HAMMER2_BREF_TYPE_INODE)
			break;
		if (parent->bref.leaf_count + elm->leaf_count <
		    HAMMER2_BLOCKREF_LEAF_MAX) {
			parent->bref.leaf_count += elm->leaf_count;
		} else {
			parent->bref.leaf_count = HAMMER2_BLOCKREF_LEAF_MAX;
		}
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		if (parent->bref.leaf_count != HAMMER2_BLOCKREF_LEAF_MAX)
			++parent->bref.leaf_count;
		break;
	default:
		break;
	}

	/*
	 * We can only optimize parent->core.live_zero for live chains.
	 * Appending past the last live element avoids any shifting.
	 */
	if (i == count && parent->core.live_zero < count) {
		i = parent->core.live_zero++;
		base[i] = *elm;
		return;
	}

	/*
	 * Sanity check: the element at the insertion point must not
	 * overlap [elm->key, xkey] (xkey is elm's inclusive end key).
	 */
	xkey = elm->key + ((hammer2_key_t)1 << elm->keybits) - 1;
	if (i != count && (base[i].key < elm->key || xkey >= base[i].key)) {
		hammer2_spin_unex(&parent->core.spin);
		panic("insert base %p overlapping elements at %d elm %p\n",
		      base, i, elm);
	}

	/*
	 * Try to find an empty slot before or after.  On the left we
	 * shift [j+1, i) down by one; on the right we shift [i, k) up
	 * by one.
	 */
	j = i;
	k = i;
	while (j > 0 || k < count) {
		--j;
		if (j >= 0 && base[j].type == HAMMER2_BREF_TYPE_EMPTY) {
			if (j == i - 1) {
				/* empty slot immediately to the left */
				base[j] = *elm;
			} else {
				bcopy(&base[j+1], &base[j],
				      (i - j - 1) * sizeof(*base));
				base[i - 1] = *elm;
			}
			goto validate;
		}
		++k;
		if (k < count && base[k].type == HAMMER2_BREF_TYPE_EMPTY) {
			bcopy(&base[i], &base[i+1],
			      (k - i) * sizeof(hammer2_blockref_t));
			base[i] = *elm;

			/*
			 * We can only update parent->core.live_zero for live
			 * chains.
			 */
			if (parent->core.live_zero <= k)
				parent->core.live_zero = k + 1;
			u = 2;
			goto validate;
		}
	}
	panic("hammer2_base_insert: no room!");

	/*
	 * Debugging: verify the array is still sorted with no key
	 * overlaps after the shift.
	 */
validate:
	key_next = 0;
	for (l = 0; l < count; ++l) {
		if (base[l].type != HAMMER2_BREF_TYPE_EMPTY) {
			key_next = base[l].key +
				   ((hammer2_key_t)1 << base[l].keybits) - 1;
			break;
		}
	}
	while (++l < count) {
		if (base[l].type != HAMMER2_BREF_TYPE_EMPTY) {
			if (base[l].key <= key_next)
				panic("base_insert %d %d,%d,%d fail %p:%d", u, i, j, k, base, l);
			key_next = base[l].key +
				   ((hammer2_key_t)1 << base[l].keybits) - 1;
		}
	}
}
5407
5408 #if 0
5409
5410 /*
5411 * Sort the blockref array for the chain. Used by the flush code to
5412 * sort the blockref[] array.
5413 *
5414 * The chain must be exclusively locked AND spin-locked.
5415 */
5416 typedef hammer2_blockref_t *hammer2_blockref_p;
5417
5418 static
5419 int
5420 hammer2_base_sort_callback(const void *v1, const void *v2)
5421 {
5422 hammer2_blockref_p bref1 = *(const hammer2_blockref_p *)v1;
5423 hammer2_blockref_p bref2 = *(const hammer2_blockref_p *)v2;
5424
5425 /*
5426 * Make sure empty elements are placed at the end of the array
5427 */
5428 if (bref1->type == HAMMER2_BREF_TYPE_EMPTY) {
5429 if (bref2->type == HAMMER2_BREF_TYPE_EMPTY)
5430 return(0);
5431 return(1);
5432 } else if (bref2->type == HAMMER2_BREF_TYPE_EMPTY) {
5433 return(-1);
5434 }
5435
5436 /*
5437 * Sort by key
5438 */
5439 if (bref1->key < bref2->key)
5440 return(-1);
5441 if (bref1->key > bref2->key)
5442 return(1);
5443 return(0);
5444 }
5445
/*
 * Sort the blockref array for the chain.  Used by the flush code to
 * sort the blockref[] array.
 *
 * The chain must be exclusively locked AND spin-locked.
 */
void
hammer2_base_sort(hammer2_chain_t *chain)
{
	hammer2_blockref_t *base;	/* start of the embedded blockref array */
	int count;			/* number of elements in the array */

	/*
	 * Locate the blockref array for this chain type.
	 */
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Special shortcut for embedded data returns the inode
		 * itself.  Callers must detect this condition and access
		 * the embedded data (the strategy code does this for us).
		 *
		 * This is only applicable to regular files and softlinks.
		 */
		if (chain->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			return;
		}
		base = &chain->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 */
		KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) == 0);
		base = &chain->data->npdata[0];
		count = chain->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &chain->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &chain->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_base_sort: unrecognized "
		      "blockref(A) type: %d",
		      chain->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
		break;
	}
	kqsort(base, count, sizeof(*base), hammer2_base_sort_callback);
}
5496
5497 #endif
5498
5499 /*
5500 * Set the check data for a chain. This can be a heavy-weight operation
5501 * and typically only runs on-flush. For file data check data is calculated
5502 * when the logical buffers are flushed.
5503 */
5504 void
hammer2_chain_setcheck(hammer2_chain_t * chain,void * bdata)5505 hammer2_chain_setcheck(hammer2_chain_t *chain, void *bdata)
5506 {
5507 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_NOTTESTED);
5508
5509 switch(HAMMER2_DEC_CHECK(chain->bref.methods)) {
5510 case HAMMER2_CHECK_NONE:
5511 break;
5512 case HAMMER2_CHECK_DISABLED:
5513 break;
5514 case HAMMER2_CHECK_ISCSI32:
5515 chain->bref.check.iscsi32.value =
5516 hammer2_icrc32(bdata, chain->bytes);
5517 break;
5518 case HAMMER2_CHECK_XXHASH64:
5519 chain->bref.check.xxhash64.value =
5520 XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED);
5521 break;
5522 case HAMMER2_CHECK_SHA192:
5523 assert(0); /* XXX unsupported */
5524 /*
5525 {
5526 SHA256_CTX hash_ctx;
5527 union {
5528 uint8_t digest[SHA256_DIGEST_LENGTH];
5529 uint64_t digest64[SHA256_DIGEST_LENGTH/8];
5530 } u;
5531
5532 SHA256_Init(&hash_ctx);
5533 SHA256_Update(&hash_ctx, bdata, chain->bytes);
5534 SHA256_Final(u.digest, &hash_ctx);
5535 u.digest64[2] ^= u.digest64[3];
5536 bcopy(u.digest,
5537 chain->bref.check.sha192.data,
5538 sizeof(chain->bref.check.sha192.data));
5539 }
5540 */
5541 break;
5542 case HAMMER2_CHECK_FREEMAP:
5543 chain->bref.check.freemap.icrc32 =
5544 hammer2_icrc32(bdata, chain->bytes);
5545 break;
5546 default:
5547 kprintf("hammer2_chain_setcheck: unknown check type %02x\n",
5548 chain->bref.methods);
5549 break;
5550 }
5551 }
5552
/*
 * Characterize a failed check code and try to trace back to the inode.
 *
 * 'check' is the value computed from the on-media data; 'bits' selects the
 * 32-bit (iscsi32) or 64-bit (xxhash64) formatting of the mismatch report.
 * Output is rate-limited via krate_h2chk; when suppressed, nothing further
 * is printed.
 */
static void
hammer2_characterize_failed_chain(hammer2_chain_t *chain, uint64_t check,
				  int bits)
{
	hammer2_chain_t *lchain;
	hammer2_chain_t *ochain;
	int did;

	did = krateprintf(&krate_h2chk,
			  "chain %016jx.%02x (%s) meth=%02x CHECK FAIL "
			  "(flags=%08x, bref/data ",
			  chain->bref.data_off,
			  chain->bref.type,
			  hammer2_bref_type_str(chain->bref.type),
			  chain->bref.methods,
			  chain->flags);
	if (did == 0)
		return;

	/* stored value vs freshly computed value */
	if (bits == 32) {
		kprintf("%08x/%08x)\n",
			chain->bref.check.iscsi32.value,
			(uint32_t)check);
	} else {
		kprintf("%016jx/%016jx)\n",
			chain->bref.check.xxhash64.value,
			check);
	}

	/*
	 * Run up the chains to try to find the governing inode so we
	 * can report it.  lchain trails one level behind chain so the
	 * PFSROOT/DIRHASH test below can look at the child of the inode.
	 *
	 * XXX This error reporting is not really MPSAFE
	 */
	ochain = chain;
	lchain = chain;
	while (chain && chain->bref.type != HAMMER2_BREF_TYPE_INODE) {
		lchain = chain;
		chain = chain->parent;
	}

	if (chain && chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
	    ((chain->bref.flags & HAMMER2_BREF_FLAG_PFSROOT) == 0 ||
	     (lchain->bref.key & HAMMER2_DIRHASH_VISIBLE))) {
		kprintf("   Resides at/in inode %ld\n",
			(long)chain->bref.key);
	} else if (chain && chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
		kprintf("   Resides in inode index - CRITICAL!!!\n");
	} else {
		kprintf("   Resides in root index - CRITICAL!!!\n");
	}

	/* identify the PFS and device the original chain belongs to */
	if (ochain->hmp) {
		const char *pfsname = "UNKNOWN";
		int i;

		if (ochain->pmp) {
			for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
				if (ochain->pmp->pfs_hmps[i] == ochain->hmp &&
				    ochain->pmp->pfs_names[i]) {
					pfsname = ochain->pmp->pfs_names[i];
					break;
				}
			}
		}
		kprintf("   In pfs %s on device %s\n",
			pfsname, ochain->hmp->devrepname);
	}
}
5625
/*
 * Verify the check code stored in the blockref against the supplied data.
 *
 * Returns non-zero on success, 0 on failure.  A chain still flagged
 * NOTTESTED has no valid check code yet and always passes.
 */
int
hammer2_chain_testcheck(hammer2_chain_t *chain, void *bdata)
{
	uint32_t check32;
	uint64_t check64;
	int r;

	/* no check code has been generated yet, nothing to compare */
	if (chain->flags & HAMMER2_CHAIN_NOTTESTED)
		return 1;

	switch(HAMMER2_DEC_CHECK(chain->bref.methods)) {
	case HAMMER2_CHECK_NONE:
		r = 1;
		break;
	case HAMMER2_CHECK_DISABLED:
		r = 1;
		break;
	case HAMMER2_CHECK_ISCSI32:
		check32 = hammer2_icrc32(bdata, chain->bytes);
		r = (chain->bref.check.iscsi32.value == check32);
		if (r == 0) {
			hammer2_characterize_failed_chain(chain, check32, 32);
		}
		/* global statistic: bytes processed by this check type */
		hammer2_process_icrc32 += chain->bytes;
		break;
	case HAMMER2_CHECK_XXHASH64:
		check64 = XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED);
		r = (chain->bref.check.xxhash64.value == check64);
		if (r == 0) {
			hammer2_characterize_failed_chain(chain, check64, 64);
		}
		hammer2_process_xxhash64 += chain->bytes;
		break;
	case HAMMER2_CHECK_SHA192:
		assert(0);	/* XXX unsupported */
		/*
		{
			SHA256_CTX hash_ctx;
			union {
				uint8_t digest[SHA256_DIGEST_LENGTH];
				uint64_t digest64[SHA256_DIGEST_LENGTH/8];
			} u;

			SHA256_Init(&hash_ctx);
			SHA256_Update(&hash_ctx, bdata, chain->bytes);
			SHA256_Final(u.digest, &hash_ctx);
			u.digest64[2] ^= u.digest64[3];
			if (bcmp(u.digest,
				 chain->bref.check.sha192.data,
				 sizeof(chain->bref.check.sha192.data)) == 0) {
				r = 1;
			} else {
				r = 0;
				krateprintf(&krate_h2chk,
					"chain %016jx.%02x meth=%02x "
					"CHECK FAIL\n",
					chain->bref.data_off,
					chain->bref.type,
					chain->bref.methods);
			}
		}
		*/
		break;
	case HAMMER2_CHECK_FREEMAP:
		r = (chain->bref.check.freemap.icrc32 ==
		     hammer2_icrc32(bdata, chain->bytes));
		if (r == 0) {
			int did;

			/* rate-limited header; detail only when not suppressed */
			did = krateprintf(&krate_h2chk,
					  "chain %016jx.%02x meth=%02x "
					  "CHECK FAIL\n",
					  chain->bref.data_off,
					  chain->bref.type,
					  chain->bref.methods);
			if (did) {
				kprintf("freemap.icrc %08x icrc32 %08x (%d)\n",
					chain->bref.check.freemap.icrc32,
					hammer2_icrc32(bdata, chain->bytes),
					chain->bytes);
				if (chain->dio) {
					kprintf("dio %p buf %016jx,%ld "
						"bdata %p/%p\n",
						chain->dio,
						(intmax_t)chain->dio->bp->b_loffset,
						chain->dio->bp->b_bufsize,
						bdata,
						chain->dio->bp->b_data);
				}
			}
		}
		break;
	default:
		/* unknown check types are not treated as corruption */
		kprintf("hammer2_chain_testcheck: unknown check type %02x\n",
			chain->bref.methods);
		r = 1;
		break;
	}
	return r;
}
5729
5730 /*
5731 * Acquire the chain and parent representing the specified inode for the
5732 * device at the specified cluster index.
5733 *
5734 * The flags passed in are LOOKUP flags, not RESOLVE flags.
5735 *
5736 * If we are unable to locate the inode, HAMMER2_ERROR_EIO or HAMMER2_ERROR_CHECK
5737 * is returned. In case of error, *chainp and/or *parentp may still be returned
5738 * non-NULL.
5739 *
5740 * The caller may pass-in a locked *parentp and/or *chainp, or neither.
5741 * They will be unlocked and released by this function. The *parentp and
5742 * *chainp representing the located inode are returned locked.
5743 *
5744 * The returned error includes any error on the returned chain in addition to
5745 * errors incurred while trying to lookup the inode. However, a chain->error
5746 * might not be recognized if HAMMER2_LOOKUP_NODATA is passed. This flag may
5747 * not be passed to this function.
5748 */
5749 int
hammer2_chain_inode_find(hammer2_pfs_t * pmp,hammer2_key_t inum,int clindex,int flags,hammer2_chain_t ** parentp,hammer2_chain_t ** chainp)5750 hammer2_chain_inode_find(hammer2_pfs_t *pmp, hammer2_key_t inum,
5751 int clindex, int flags,
5752 hammer2_chain_t **parentp, hammer2_chain_t **chainp)
5753 {
5754 hammer2_chain_t *parent;
5755 hammer2_chain_t *rchain;
5756 hammer2_key_t key_dummy;
5757 hammer2_inode_t *ip;
5758 int resolve_flags;
5759 int error;
5760
5761 KKASSERT((flags & HAMMER2_LOOKUP_NODATA) == 0);
5762
5763 resolve_flags = (flags & HAMMER2_LOOKUP_SHARED) ?
5764 HAMMER2_RESOLVE_SHARED : 0;
5765
5766 /*
5767 * Caller expects us to replace these.
5768 */
5769 if (*chainp) {
5770 hammer2_chain_unlock(*chainp);
5771 hammer2_chain_drop(*chainp);
5772 *chainp = NULL;
5773 }
5774 if (*parentp) {
5775 hammer2_chain_unlock(*parentp);
5776 hammer2_chain_drop(*parentp);
5777 *parentp = NULL;
5778 }
5779
5780 /*
5781 * Be very careful, this is a backend function and we CANNOT
5782 * lock any frontend inode structure we find. But we have to
5783 * look the inode up this way first in case it exists but is
5784 * detached from the radix tree.
5785 */
5786 ip = hammer2_inode_lookup(pmp, inum);
5787 if (ip) {
5788 *chainp = hammer2_inode_chain_and_parent(ip, clindex,
5789 parentp,
5790 resolve_flags);
5791 hammer2_inode_drop(ip);
5792 if (*chainp)
5793 return (*chainp)->error;
5794 hammer2_chain_unlock(*chainp);
5795 hammer2_chain_drop(*chainp);
5796 *chainp = NULL;
5797 if (*parentp) {
5798 hammer2_chain_unlock(*parentp);
5799 hammer2_chain_drop(*parentp);
5800 *parentp = NULL;
5801 }
5802 }
5803
5804 /*
5805 * Inodes hang off of the iroot (bit 63 is clear, differentiating
5806 * inodes from root directory entries in the key lookup).
5807 */
5808 parent = hammer2_inode_chain(pmp->iroot, clindex, resolve_flags);
5809 rchain = NULL;
5810 if (parent) {
5811 /*
5812 * NOTE: rchain can be returned as NULL even if error == 0
5813 * (i.e. not found)
5814 */
5815 rchain = hammer2_chain_lookup(&parent, &key_dummy,
5816 inum, inum,
5817 &error, flags);
5818 /*
5819 * Propagate a chain-specific error to caller.
5820 *
5821 * If the chain is not errored, we must still validate that the inode
5822 * number is correct, because all hell will break loose if it isn't
5823 * correct. It should always be correct so print to the console and
5824 * simulate a CHECK error if it is not.
5825 */
5826 if (error == 0 && rchain) {
5827 error = rchain->error;
5828 if (error == 0 && rchain->data) {
5829 if (inum != rchain->data->ipdata.meta.inum) {
5830 kprintf("hammer2_chain_inode_find: lookup inum %ld, "
5831 "got valid inode but with inum %ld\n",
5832 (long)inum, (long)rchain->data->ipdata.meta.inum);
5833 error = HAMMER2_ERROR_CHECK;
5834 rchain->error = error;
5835 }
5836 }
5837 }
5838 } else {
5839 error = HAMMER2_ERROR_EIO;
5840 }
5841 *parentp = parent;
5842 *chainp = rchain;
5843
5844 return error;
5845 }
5846
5847 /*
5848 * Used by the bulkscan code to snapshot the synchronized storage for
5849 * a volume, allowing it to be scanned concurrently against normal
5850 * operation.
5851 */
5852 hammer2_chain_t *
hammer2_chain_bulksnap(hammer2_dev_t * hmp)5853 hammer2_chain_bulksnap(hammer2_dev_t *hmp)
5854 {
5855 hammer2_chain_t *copy;
5856
5857 copy = hammer2_chain_alloc(hmp, hmp->spmp, &hmp->vchain.bref);
5858 copy->data = kmalloc(sizeof(copy->data->voldata),
5859 hmp->mmsg, M_WAITOK | M_ZERO);
5860 hammer2_voldata_lock(hmp);
5861 copy->data->voldata = hmp->volsync;
5862 hammer2_voldata_unlock(hmp);
5863
5864 return copy;
5865 }
5866
5867 void
hammer2_chain_bulkdrop(hammer2_chain_t * copy)5868 hammer2_chain_bulkdrop(hammer2_chain_t *copy)
5869 {
5870 KKASSERT(copy->bref.type == HAMMER2_BREF_TYPE_VOLUME);
5871 KKASSERT(copy->data);
5872 kfree(copy->data, copy->hmp->mmsg);
5873 copy->data = NULL;
5874 hammer2_chain_drop(copy);
5875 }
5876
5877 /*
5878 * Returns non-zero if the chain (INODE or DIRENT) matches the
5879 * filename.
5880 */
5881 int
hammer2_chain_dirent_test(hammer2_chain_t * chain,const char * name,size_t name_len)5882 hammer2_chain_dirent_test(hammer2_chain_t *chain, const char *name,
5883 size_t name_len)
5884 {
5885 const hammer2_inode_data_t *ripdata;
5886
5887 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
5888 ripdata = &chain->data->ipdata;
5889 if (ripdata->meta.name_len == name_len &&
5890 bcmp(ripdata->filename, name, name_len) == 0) {
5891 return 1;
5892 }
5893 }
5894 if (chain->bref.type == HAMMER2_BREF_TYPE_DIRENT &&
5895 chain->bref.embed.dirent.namlen == name_len) {
5896 if (name_len > sizeof(chain->bref.check.buf) &&
5897 bcmp(chain->data->buf, name, name_len) == 0) {
5898 return 1;
5899 }
5900 if (name_len <= sizeof(chain->bref.check.buf) &&
5901 bcmp(chain->bref.check.buf, name, name_len) == 0) {
5902 return 1;
5903 }
5904 }
5905 return 0;
5906 }
5907
5908 /*
5909 * Debugging
5910 */
5911 void
hammer2_dump_chain(hammer2_chain_t * chain,int tab,int bi,int * countp,char pfx,u_int flags)5912 hammer2_dump_chain(hammer2_chain_t *chain, int tab, int bi, int *countp,
5913 char pfx, u_int flags)
5914 {
5915 hammer2_chain_t *scan;
5916 hammer2_chain_t *parent;
5917
5918 --*countp;
5919 if (*countp == 0) {
5920 kprintf("%*.*s...\n", tab, tab, "");
5921 return;
5922 }
5923 if (*countp < 0)
5924 return;
5925 kprintf("%*.*s%c-chain %p %s.%-3d %016jx %016jx/%-2d mir=%016jx\n",
5926 tab, tab, "", pfx, chain,
5927 hammer2_bref_type_str(chain->bref.type), bi,
5928 chain->bref.data_off, chain->bref.key, chain->bref.keybits,
5929 chain->bref.mirror_tid);
5930
5931 kprintf("%*.*s [%08x] (%s) refs=%d",
5932 tab, tab, "",
5933 chain->flags,
5934 ((chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
5935 chain->data) ? (char *)chain->data->ipdata.filename : "?"),
5936 chain->refs);
5937
5938 parent = chain->parent;
5939 if (parent)
5940 kprintf("\n%*.*s p=%p [pflags %08x prefs %d]",
5941 tab, tab, "",
5942 parent, parent->flags, parent->refs);
5943 if (RB_EMPTY(&chain->core.rbtree)) {
5944 kprintf("\n");
5945 } else {
5946 int bi = 0;
5947 kprintf(" {\n");
5948 RB_FOREACH(scan, hammer2_chain_tree, &chain->core.rbtree) {
5949 if ((scan->flags & flags) || flags == (u_int)-1) {
5950 hammer2_dump_chain(scan, tab + 4, bi, countp,
5951 'a', flags);
5952 }
5953 bi++;
5954 }
5955 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && chain->data)
5956 kprintf("%*.*s}(%s)\n", tab, tab, "",
5957 chain->data->ipdata.filename);
5958 else
5959 kprintf("%*.*s}\n", tab, tab, "");
5960 }
5961 }
5962
5963 void
hammer2_dump_chains(hammer2_dev_t * hmp,char vpfx,char fpfx)5964 hammer2_dump_chains(hammer2_dev_t *hmp, char vpfx, char fpfx)
5965 {
5966 int dumpcnt;
5967
5968 dumpcnt = 50;
5969 hammer2_dump_chain(&hmp->vchain, 0, 0, &dumpcnt, vpfx, (u_int)-1);
5970
5971 dumpcnt = 50;
5972 hammer2_dump_chain(&hmp->fchain, 0, 0, &dumpcnt, fpfx, (u_int)-1);
5973 }
5974