1// Copyright 2019 the Go-FUSE Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package fs
6
7import (
8	"context"
9	"fmt"
10	"log"
11	"math/rand"
12	"sort"
13	"strings"
14	"sync"
15	"syscall"
16	"unsafe"
17
18	"github.com/hanwen/go-fuse/v2/fuse"
19)
20
21type parentData struct {
22	name   string
23	parent *Inode
24}
25
26// StableAttr holds immutable attributes of a object in the filesystem.
27type StableAttr struct {
28	// Each Inode has a type, which does not change over the
29	// lifetime of the inode, for example fuse.S_IFDIR. The default (0)
30	// is interpreted as S_IFREG (regular file).
31	Mode uint32
32
33	// The inode number must be unique among the currently live
34	// objects in the file system. It is used to communicate to
35	// the kernel about this file object. The values uint64(-1),
36	// and 1 are reserved. When using Ino==0, a unique, sequential
37	// number is assigned (starting at 2^63 by default) on Inode creation.
38	Ino uint64
39
40	// When reusing a previously used inode number for a new
41	// object, the new object must have a different Gen
42	// number. This is irrelevant if the FS is not exported over
43	// NFS
44	Gen uint64
45}
46
47// Reserved returns if the StableAttr is using reserved Inode numbers.
48func (i *StableAttr) Reserved() bool {
49	return i.Ino == 1 || i.Ino == ^uint64(0)
50}
51
52// Inode is a node in VFS tree.  Inodes are one-to-one mapped to
53// Operations instances, which is the extension interface for file
54// systems.  One can create fully-formed trees of Inodes ahead of time
55// by creating "persistent" Inodes.
56//
57// The Inode struct contains a lock, so it should not be
58// copied. Inodes should be obtained by calling Inode.NewInode() or
59// Inode.NewPersistentInode().
60type Inode struct {
61	stableAttr StableAttr
62
63	ops    InodeEmbedder
64	bridge *rawBridge
65
66	// The *Node ID* is an arbitrary uint64 identifier chosen by the FUSE library.
67	// It is used the identify *nodes* (files/directories/symlinks/...) in the
68	// communication between the FUSE library and the Linux kernel.
69	nodeId uint64
70
71	// Following data is mutable.
72
73	// file handles.
74	// protected by bridge.mu
75	openFiles []uint32
76
77	// mu protects the following mutable fields. When locking
78	// multiple Inodes, locks must be acquired using
79	// lockNodes/unlockNodes
80	mu sync.Mutex
81
82	// persistent indicates that this node should not be removed
83	// from the tree, even if there are no live references. This
84	// must be set on creation, and can only be changed to false
85	// by calling removeRef.
86	// When you change this, you MUST increment changeCounter.
87	persistent bool
88
89	// changeCounter increments every time the mutable state
90	// (lookupCount, persistent, children, parents) protected by
91	// mu is modified.
92	//
93	// This is used in places where we have to relock inode into inode
94	// group lock, and after locking the group we have to check if inode
95	// did not changed, and if it changed - retry the operation.
96	changeCounter uint32
97
98	// Number of kernel refs to this node.
99	// When you change this, you MUST increment changeCounter.
100	lookupCount uint64
101
102	// Children of this Inode.
103	// When you change this, you MUST increment changeCounter.
104	children map[string]*Inode
105
106	// Parents of this Inode. Can be more than one due to hard links.
107	// When you change this, you MUST increment changeCounter.
108	parents map[parentData]struct{}
109}
110
111func (n *Inode) IsDir() bool {
112	return n.stableAttr.Mode&syscall.S_IFDIR != 0
113}
114
115func (n *Inode) embed() *Inode {
116	return n
117}
118
119func (n *Inode) EmbeddedInode() *Inode {
120	return n
121}
122
123func initInode(n *Inode, ops InodeEmbedder, attr StableAttr, bridge *rawBridge, persistent bool, nodeId uint64) {
124	n.ops = ops
125	n.stableAttr = attr
126	n.bridge = bridge
127	n.persistent = persistent
128	n.nodeId = nodeId
129	n.parents = make(map[parentData]struct{})
130	if attr.Mode == fuse.S_IFDIR {
131		n.children = make(map[string]*Inode)
132	}
133}
134
135// Set node ID and mode in EntryOut
136func (n *Inode) setEntryOut(out *fuse.EntryOut) {
137	out.NodeId = n.nodeId
138	out.Ino = n.stableAttr.Ino
139	out.Mode = (out.Attr.Mode & 07777) | n.stableAttr.Mode
140}
141
142// StableAttr returns the (Ino, Gen) tuple for this node.
143func (n *Inode) StableAttr() StableAttr {
144	return n.stableAttr
145}
146
147// Mode returns the filetype
148func (n *Inode) Mode() uint32 {
149	return n.stableAttr.Mode
150}
151
152// Returns the root of the tree
153func (n *Inode) Root() *Inode {
154	return n.bridge.root
155}
156
157// Returns whether this is the root of the tree
158func (n *Inode) IsRoot() bool {
159	return n.bridge.root == n
160}
161
162func modeStr(m uint32) string {
163	return map[uint32]string{
164		syscall.S_IFREG:  "reg",
165		syscall.S_IFLNK:  "lnk",
166		syscall.S_IFDIR:  "dir",
167		syscall.S_IFSOCK: "soc",
168		syscall.S_IFIFO:  "pip",
169		syscall.S_IFCHR:  "chr",
170		syscall.S_IFBLK:  "blk",
171	}[m]
172}
173
174// debugString is used for debugging. Racy.
175func (n *Inode) String() string {
176	n.mu.Lock()
177	defer n.mu.Unlock()
178	var ss []string
179	for nm, ch := range n.children {
180		ss = append(ss, fmt.Sprintf("%q=i%d[%s]", nm, ch.stableAttr.Ino, modeStr(ch.stableAttr.Mode)))
181	}
182
183	return fmt.Sprintf("i%d (%s): %s", n.stableAttr.Ino, modeStr(n.stableAttr.Mode), strings.Join(ss, ","))
184}
185
186// sortNodes rearranges inode group in consistent order.
187//
188// The nodes are ordered by their in-RAM address, which gives consistency
189// property: for any A and B inodes, sortNodes will either always order A < B,
190// or always order A > B.
191//
192// See lockNodes where this property is used to avoid deadlock when taking
193// locks on inode group.
194func sortNodes(ns []*Inode) {
195	sort.Slice(ns, func(i, j int) bool {
196		return nodeLess(ns[i], ns[j])
197	})
198}
199
200func nodeLess(a, b *Inode) bool {
201	return uintptr(unsafe.Pointer(a)) < uintptr(unsafe.Pointer(b))
202}
203
204// lockNodes locks group of inodes.
205//
206// It always lock the inodes in the same order - to avoid deadlocks.
207// It also avoids locking an inode more than once, if it was specified multiple times.
208// An example when an inode might be given multiple times is if dir/a and dir/b
209// are hardlinked to the same inode and the caller needs to take locks on dir children.
210func lockNodes(ns ...*Inode) {
211	sortNodes(ns)
212
213	// The default value nil prevents trying to lock nil nodes.
214	var nprev *Inode
215	for _, n := range ns {
216		if n != nprev {
217			n.mu.Lock()
218			nprev = n
219		}
220	}
221}
222
223// lockNode2 locks a and b in order consistent with lockNodes.
224func lockNode2(a, b *Inode) {
225	if a == b {
226		a.mu.Lock()
227	} else if nodeLess(a, b) {
228		a.mu.Lock()
229		b.mu.Lock()
230	} else {
231		b.mu.Lock()
232		a.mu.Lock()
233	}
234}
235
236// unlockNode2 unlocks a and b
237func unlockNode2(a, b *Inode) {
238	if a == b {
239		a.mu.Unlock()
240	} else {
241		a.mu.Unlock()
242		b.mu.Unlock()
243	}
244}
245
246// unlockNodes releases locks taken by lockNodes.
247func unlockNodes(ns ...*Inode) {
248	// we don't need to unlock in the same order that was used in lockNodes.
249	// however it still helps to have nodes sorted to avoid duplicates.
250	sortNodes(ns)
251
252	var nprev *Inode
253	for _, n := range ns {
254		if n != nprev {
255			n.mu.Unlock()
256			nprev = n
257		}
258	}
259}
260
261// Forgotten returns true if the kernel holds no references to this
262// inode.  This can be used for background cleanup tasks, since the
263// kernel has no way of reviving forgotten nodes by its own
264// initiative.
265func (n *Inode) Forgotten() bool {
266	n.mu.Lock()
267	defer n.mu.Unlock()
268	return n.lookupCount == 0 && len(n.parents) == 0 && !n.persistent
269}
270
271// Operations returns the object implementing the file system
272// operations.
273func (n *Inode) Operations() InodeEmbedder {
274	return n.ops
275}
276
277// Path returns a path string to the inode relative to `root`.
278// Pass nil to walk the hierarchy as far up as possible.
279//
280// If you set `root`, Path() warns if it finds an orphaned Inode, i.e.
281// if it does not end up at `root` after walking the hierarchy.
282func (n *Inode) Path(root *Inode) string {
283	var segments []string
284	p := n
285	for p != nil && p != root {
286		var pd parentData
287
288		// We don't try to take all locks at the same time, because
289		// the caller won't use the "path" string under lock anyway.
290		found := false
291		p.mu.Lock()
292		// Select an arbitrary parent
293		for pd = range p.parents {
294			found = true
295			break
296		}
297		p.mu.Unlock()
298		if found == false {
299			p = nil
300			break
301		}
302		if pd.parent == nil {
303			break
304		}
305
306		segments = append(segments, pd.name)
307		p = pd.parent
308	}
309
310	if root != nil && root != p {
311		deletedPlaceholder := fmt.Sprintf(".go-fuse.%d/deleted", rand.Uint64())
312		n.bridge.logf("warning: Inode.Path: n%d is orphaned, replacing segment with %q",
313			n.nodeId, deletedPlaceholder)
314		// NOSUBMIT - should replace rather than append?
315		segments = append(segments, deletedPlaceholder)
316	}
317
318	i := 0
319	j := len(segments) - 1
320
321	for i < j {
322		segments[i], segments[j] = segments[j], segments[i]
323		i++
324		j--
325	}
326
327	path := strings.Join(segments, "/")
328	return path
329}
330
331// setEntry does `iparent[name] = ichild` linking.
332//
333// setEntry must not be called simultaneously for any of iparent or ichild.
334// This, for example could be satisfied if both iparent and ichild are locked,
335// but it could be also valid if only iparent is locked and ichild was just
336// created and only one goroutine keeps referencing it.
337func (iparent *Inode) setEntry(name string, ichild *Inode) {
338	newParent := parentData{name, iparent}
339	if ichild.stableAttr.Mode == syscall.S_IFDIR {
340		// Directories cannot have more than one parent. Clear the map.
341		// This special-case is neccessary because ichild may still have a
342		// parent that was forgotten (i.e. removed from bridge.inoMap).
343		for i := range ichild.parents {
344			delete(ichild.parents, i)
345		}
346	}
347	ichild.parents[newParent] = struct{}{}
348	iparent.children[name] = ichild
349	ichild.changeCounter++
350	iparent.changeCounter++
351}
352
353// NewPersistentInode returns an Inode whose lifetime is not in
354// control of the kernel.
355//
356// When the kernel is short on memory, it will forget cached file
357// system information (directory entries and inode metadata). This is
358// announced with FORGET messages.  There are no guarantees if or when
359// this happens. When it happens, these are handled transparently by
360// go-fuse: all Inodes created with NewInode are released
361// automatically. NewPersistentInode creates inodes that go-fuse keeps
362// in memory, even if the kernel is not interested in them. This is
363// convenient for building static trees up-front.
364func (n *Inode) NewPersistentInode(ctx context.Context, node InodeEmbedder, id StableAttr) *Inode {
365	return n.newInode(ctx, node, id, true)
366}
367
368// ForgetPersistent manually marks the node as no longer important. If
369// it has no children, and if the kernel as no references, the nodes
370// gets removed from the tree.
371func (n *Inode) ForgetPersistent() {
372	n.removeRef(0, true)
373}
374
375// NewInode returns an inode for the given InodeEmbedder. The mode
376// should be standard mode argument (eg. S_IFDIR). The inode number in
377// id.Ino argument is used to implement hard-links.  If it is given,
378// and another node with the same ID is known, the new inode may be
379// ignored, and the old one used instead.
380func (n *Inode) NewInode(ctx context.Context, node InodeEmbedder, id StableAttr) *Inode {
381	return n.newInode(ctx, node, id, false)
382}
383
384func (n *Inode) newInode(ctx context.Context, ops InodeEmbedder, id StableAttr, persistent bool) *Inode {
385	return n.bridge.newInode(ctx, ops, id, persistent)
386}
387
388// removeRef decreases references. Returns if this operation caused
389// the node to be forgotten (for kernel references), and whether it is
390// live (ie. was not dropped from the tree)
391func (n *Inode) removeRef(nlookup uint64, dropPersistence bool) (forgotten bool, live bool) {
392	var lockme []*Inode
393	var parents []parentData
394
395	n.mu.Lock()
396	if nlookup > 0 && dropPersistence {
397		log.Panic("only one allowed")
398	} else if nlookup > n.lookupCount {
399		log.Panicf("n%d lookupCount underflow: lookupCount=%d, decrement=%d", n.nodeId, n.lookupCount, nlookup)
400	} else if nlookup > 0 {
401		n.lookupCount -= nlookup
402		n.changeCounter++
403	} else if dropPersistence && n.persistent {
404		n.persistent = false
405		n.changeCounter++
406	}
407
408	n.bridge.mu.Lock()
409	if n.lookupCount == 0 {
410		forgotten = true
411		// Dropping the node from inoMap guarantees that no new references to this node are
412		// handed out to the kernel, hence we can also safely delete it from nodeidMap.
413		delete(n.bridge.stableAttrs, n.stableAttr)
414		delete(n.bridge.kernelNodeIds, n.nodeId)
415	}
416	n.bridge.mu.Unlock()
417
418retry:
419	for {
420		lockme = append(lockme[:0], n)
421		parents = parents[:0]
422		nChange := n.changeCounter
423		live = n.lookupCount > 0 || len(n.children) > 0 || n.persistent
424		for p := range n.parents {
425			parents = append(parents, p)
426			lockme = append(lockme, p.parent)
427		}
428		n.mu.Unlock()
429
430		if live {
431			return forgotten, live
432		}
433
434		lockNodes(lockme...)
435		if n.changeCounter != nChange {
436			unlockNodes(lockme...)
437			// could avoid unlocking and relocking n here.
438			n.mu.Lock()
439			continue retry
440		}
441
442		for _, p := range parents {
443			if p.parent.children[p.name] != n {
444				// another node has replaced us already
445				continue
446			}
447			delete(p.parent.children, p.name)
448			p.parent.changeCounter++
449		}
450		n.parents = map[parentData]struct{}{}
451		n.changeCounter++
452
453		if n.lookupCount != 0 {
454			log.Panicf("n%d %p lookupCount changed: %d", n.nodeId, n, n.lookupCount)
455		}
456
457		unlockNodes(lockme...)
458		break
459	}
460
461	for _, p := range lockme {
462		if p != n {
463			p.removeRef(0, false)
464		}
465	}
466	return forgotten, false
467}
468
469// GetChild returns a child node with the given name, or nil if the
470// directory has no child by that name.
471func (n *Inode) GetChild(name string) *Inode {
472	n.mu.Lock()
473	defer n.mu.Unlock()
474	return n.children[name]
475}
476
477// AddChild adds a child to this node. If overwrite is false, fail if
478// the destination already exists.
479func (n *Inode) AddChild(name string, ch *Inode, overwrite bool) (success bool) {
480	if len(name) == 0 {
481		log.Panic("empty name for inode")
482	}
483
484retry:
485	for {
486		lockNode2(n, ch)
487		prev, ok := n.children[name]
488		parentCounter := n.changeCounter
489		if !ok {
490			n.children[name] = ch
491			ch.parents[parentData{name, n}] = struct{}{}
492			n.changeCounter++
493			ch.changeCounter++
494			unlockNode2(n, ch)
495			return true
496		}
497		unlockNode2(n, ch)
498		if !overwrite {
499			return false
500		}
501		lockme := [3]*Inode{n, ch, prev}
502
503		lockNodes(lockme[:]...)
504		if parentCounter != n.changeCounter {
505			unlockNodes(lockme[:]...)
506			continue retry
507		}
508
509		delete(prev.parents, parentData{name, n})
510		n.children[name] = ch
511		ch.parents[parentData{name, n}] = struct{}{}
512		n.changeCounter++
513		ch.changeCounter++
514		prev.changeCounter++
515		unlockNodes(lockme[:]...)
516
517		return true
518	}
519}
520
521// Children returns the list of children of this directory Inode.
522func (n *Inode) Children() map[string]*Inode {
523	n.mu.Lock()
524	defer n.mu.Unlock()
525	r := make(map[string]*Inode, len(n.children))
526	for k, v := range n.children {
527		r[k] = v
528	}
529	return r
530}
531
532// Parents returns a parent of this Inode, or nil if this Inode is
533// deleted or is the root
534func (n *Inode) Parent() (string, *Inode) {
535	n.mu.Lock()
536	defer n.mu.Unlock()
537	for k := range n.parents {
538		return k.name, k.parent
539	}
540	return "", nil
541}
542
543// RmAllChildren recursively drops a tree, forgetting all persistent
544// nodes.
545func (n *Inode) RmAllChildren() {
546	for {
547		chs := n.Children()
548		if len(chs) == 0 {
549			break
550		}
551		for nm, ch := range chs {
552			ch.RmAllChildren()
553			n.RmChild(nm)
554		}
555	}
556	n.removeRef(0, true)
557}
558
559// RmChild removes multiple children.  Returns whether the removal
560// succeeded and whether the node is still live afterward. The removal
561// is transactional: it only succeeds if all names are children, and
562// if they all were removed successfully.  If the removal was
563// successful, and there are no children left, the node may be removed
564// from the FS tree. In that case, RmChild returns live==false.
565func (n *Inode) RmChild(names ...string) (success, live bool) {
566	var lockme []*Inode
567
568retry:
569	for {
570		n.mu.Lock()
571		lockme = append(lockme[:0], n)
572		nChange := n.changeCounter
573		for _, nm := range names {
574			ch := n.children[nm]
575			if ch == nil {
576				n.mu.Unlock()
577				return false, true
578			}
579			lockme = append(lockme, ch)
580		}
581		n.mu.Unlock()
582
583		lockNodes(lockme...)
584		if n.changeCounter != nChange {
585			unlockNodes(lockme...)
586			continue retry
587		}
588
589		for _, nm := range names {
590			ch := n.children[nm]
591			delete(n.children, nm)
592			delete(ch.parents, parentData{nm, n})
593
594			ch.changeCounter++
595		}
596		n.changeCounter++
597
598		live = n.lookupCount > 0 || len(n.children) > 0 || n.persistent
599		unlockNodes(lockme...)
600
601		// removal successful
602		break
603	}
604
605	if !live {
606		_, live := n.removeRef(0, false)
607		return true, live
608	}
609
610	return true, true
611}
612
613// MvChild executes a rename. If overwrite is set, a child at the
614// destination will be overwritten, should it exist. It returns false
615// if 'overwrite' is false, and the destination exists.
616func (n *Inode) MvChild(old string, newParent *Inode, newName string, overwrite bool) bool {
617	if len(newName) == 0 {
618		log.Panicf("empty newName for MvChild")
619	}
620
621retry:
622	for {
623		lockNode2(n, newParent)
624		counter1 := n.changeCounter
625		counter2 := newParent.changeCounter
626
627		oldChild := n.children[old]
628		destChild := newParent.children[newName]
629		unlockNode2(n, newParent)
630
631		if destChild != nil && !overwrite {
632			return false
633		}
634
635		lockNodes(n, newParent, oldChild, destChild)
636		if counter2 != newParent.changeCounter || counter1 != n.changeCounter {
637			unlockNodes(n, newParent, oldChild, destChild)
638			continue retry
639		}
640
641		if oldChild != nil {
642			delete(n.children, old)
643			delete(oldChild.parents, parentData{old, n})
644			n.changeCounter++
645			oldChild.changeCounter++
646		}
647
648		if destChild != nil {
649			// This can cause the child to be slated for
650			// removal; see below
651			delete(newParent.children, newName)
652			delete(destChild.parents, parentData{newName, newParent})
653			destChild.changeCounter++
654			newParent.changeCounter++
655		}
656
657		if oldChild != nil {
658			newParent.children[newName] = oldChild
659			newParent.changeCounter++
660
661			oldChild.parents[parentData{newName, newParent}] = struct{}{}
662			oldChild.changeCounter++
663		}
664
665		unlockNodes(n, newParent, oldChild, destChild)
666
667		if destChild != nil {
668			destChild.removeRef(0, false)
669		}
670		return true
671	}
672}
673
674// ExchangeChild swaps the entries at (n, oldName) and (newParent,
675// newName).
676func (n *Inode) ExchangeChild(oldName string, newParent *Inode, newName string) {
677	oldParent := n
678retry:
679	for {
680		lockNode2(oldParent, newParent)
681		counter1 := oldParent.changeCounter
682		counter2 := newParent.changeCounter
683
684		oldChild := oldParent.children[oldName]
685		destChild := newParent.children[newName]
686		unlockNode2(oldParent, newParent)
687
688		if destChild == oldChild {
689			return
690		}
691
692		lockNodes(oldParent, newParent, oldChild, destChild)
693		if counter2 != newParent.changeCounter || counter1 != oldParent.changeCounter {
694			unlockNodes(oldParent, newParent, oldChild, destChild)
695			continue retry
696		}
697
698		// Detach
699		if oldChild != nil {
700			delete(oldParent.children, oldName)
701			delete(oldChild.parents, parentData{oldName, oldParent})
702			oldParent.changeCounter++
703			oldChild.changeCounter++
704		}
705
706		if destChild != nil {
707			delete(newParent.children, newName)
708			delete(destChild.parents, parentData{newName, newParent})
709			destChild.changeCounter++
710			newParent.changeCounter++
711		}
712
713		// Attach
714		if oldChild != nil {
715			newParent.children[newName] = oldChild
716			newParent.changeCounter++
717
718			oldChild.parents[parentData{newName, newParent}] = struct{}{}
719			oldChild.changeCounter++
720		}
721
722		if destChild != nil {
723			oldParent.children[oldName] = oldChild
724			oldParent.changeCounter++
725
726			destChild.parents[parentData{oldName, oldParent}] = struct{}{}
727			destChild.changeCounter++
728		}
729		unlockNodes(oldParent, newParent, oldChild, destChild)
730		return
731	}
732}
733
734// NotifyEntry notifies the kernel that data for a (directory, name)
735// tuple should be invalidated. On next access, a LOOKUP operation
736// will be started.
737func (n *Inode) NotifyEntry(name string) syscall.Errno {
738	status := n.bridge.server.EntryNotify(n.nodeId, name)
739	return syscall.Errno(status)
740}
741
742// NotifyDelete notifies the kernel that the given inode was removed
743// from this directory as entry under the given name. It is equivalent
744// to NotifyEntry, but also sends an event to inotify watchers.
745func (n *Inode) NotifyDelete(name string, child *Inode) syscall.Errno {
746	// XXX arg ordering?
747	return syscall.Errno(n.bridge.server.DeleteNotify(n.nodeId, child.nodeId, name))
748
749}
750
751// NotifyContent notifies the kernel that content under the given
752// inode should be flushed from buffers.
753func (n *Inode) NotifyContent(off, sz int64) syscall.Errno {
754	// XXX how does this work for directories?
755	return syscall.Errno(n.bridge.server.InodeNotify(n.nodeId, off, sz))
756}
757
758// WriteCache stores data in the kernel cache.
759func (n *Inode) WriteCache(offset int64, data []byte) syscall.Errno {
760	return syscall.Errno(n.bridge.server.InodeNotifyStoreCache(n.nodeId, offset, data))
761}
762
763// ReadCache reads data from the kernel cache.
764func (n *Inode) ReadCache(offset int64, dest []byte) (count int, errno syscall.Errno) {
765	c, s := n.bridge.server.InodeRetrieveCache(n.nodeId, offset, dest)
766	return c, syscall.Errno(s)
767}
768