1// Copyright 2019 the Go-FUSE Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5package fs 6 7import ( 8 "context" 9 "fmt" 10 "log" 11 "math/rand" 12 "sort" 13 "strings" 14 "sync" 15 "syscall" 16 "unsafe" 17 18 "github.com/hanwen/go-fuse/v2/fuse" 19) 20 21type parentData struct { 22 name string 23 parent *Inode 24} 25 26// StableAttr holds immutable attributes of a object in the filesystem. 27type StableAttr struct { 28 // Each Inode has a type, which does not change over the 29 // lifetime of the inode, for example fuse.S_IFDIR. The default (0) 30 // is interpreted as S_IFREG (regular file). 31 Mode uint32 32 33 // The inode number must be unique among the currently live 34 // objects in the file system. It is used to communicate to 35 // the kernel about this file object. The values uint64(-1), 36 // and 1 are reserved. When using Ino==0, a unique, sequential 37 // number is assigned (starting at 2^63 by default) on Inode creation. 38 Ino uint64 39 40 // When reusing a previously used inode number for a new 41 // object, the new object must have a different Gen 42 // number. This is irrelevant if the FS is not exported over 43 // NFS 44 Gen uint64 45} 46 47// Reserved returns if the StableAttr is using reserved Inode numbers. 48func (i *StableAttr) Reserved() bool { 49 return i.Ino == 1 || i.Ino == ^uint64(0) 50} 51 52// Inode is a node in VFS tree. Inodes are one-to-one mapped to 53// Operations instances, which is the extension interface for file 54// systems. One can create fully-formed trees of Inodes ahead of time 55// by creating "persistent" Inodes. 56// 57// The Inode struct contains a lock, so it should not be 58// copied. Inodes should be obtained by calling Inode.NewInode() or 59// Inode.NewPersistentInode(). 60type Inode struct { 61 stableAttr StableAttr 62 63 ops InodeEmbedder 64 bridge *rawBridge 65 66 // The *Node ID* is an arbitrary uint64 identifier chosen by the FUSE library. 67 // It is used the identify *nodes* (files/directories/symlinks/...) in the 68 // communication between the FUSE library and the Linux kernel. 69 nodeId uint64 70 71 // Following data is mutable. 72 73 // file handles. 74 // protected by bridge.mu 75 openFiles []uint32 76 77 // mu protects the following mutable fields. When locking 78 // multiple Inodes, locks must be acquired using 79 // lockNodes/unlockNodes 80 mu sync.Mutex 81 82 // persistent indicates that this node should not be removed 83 // from the tree, even if there are no live references. This 84 // must be set on creation, and can only be changed to false 85 // by calling removeRef. 86 // When you change this, you MUST increment changeCounter. 87 persistent bool 88 89 // changeCounter increments every time the mutable state 90 // (lookupCount, persistent, children, parents) protected by 91 // mu is modified. 92 // 93 // This is used in places where we have to relock inode into inode 94 // group lock, and after locking the group we have to check if inode 95 // did not changed, and if it changed - retry the operation. 96 changeCounter uint32 97 98 // Number of kernel refs to this node. 99 // When you change this, you MUST increment changeCounter. 100 lookupCount uint64 101 102 // Children of this Inode. 103 // When you change this, you MUST increment changeCounter. 104 children map[string]*Inode 105 106 // Parents of this Inode. Can be more than one due to hard links. 107 // When you change this, you MUST increment changeCounter. 108 parents map[parentData]struct{} 109} 110 111func (n *Inode) IsDir() bool { 112 return n.stableAttr.Mode&syscall.S_IFDIR != 0 113} 114 115func (n *Inode) embed() *Inode { 116 return n 117} 118 119func (n *Inode) EmbeddedInode() *Inode { 120 return n 121} 122 123func initInode(n *Inode, ops InodeEmbedder, attr StableAttr, bridge *rawBridge, persistent bool, nodeId uint64) { 124 n.ops = ops 125 n.stableAttr = attr 126 n.bridge = bridge 127 n.persistent = persistent 128 n.nodeId = nodeId 129 n.parents = make(map[parentData]struct{}) 130 if attr.Mode == fuse.S_IFDIR { 131 n.children = make(map[string]*Inode) 132 } 133} 134 135// Set node ID and mode in EntryOut 136func (n *Inode) setEntryOut(out *fuse.EntryOut) { 137 out.NodeId = n.nodeId 138 out.Ino = n.stableAttr.Ino 139 out.Mode = (out.Attr.Mode & 07777) | n.stableAttr.Mode 140} 141 142// StableAttr returns the (Ino, Gen) tuple for this node. 143func (n *Inode) StableAttr() StableAttr { 144 return n.stableAttr 145} 146 147// Mode returns the filetype 148func (n *Inode) Mode() uint32 { 149 return n.stableAttr.Mode 150} 151 152// Returns the root of the tree 153func (n *Inode) Root() *Inode { 154 return n.bridge.root 155} 156 157// Returns whether this is the root of the tree 158func (n *Inode) IsRoot() bool { 159 return n.bridge.root == n 160} 161 162func modeStr(m uint32) string { 163 return map[uint32]string{ 164 syscall.S_IFREG: "reg", 165 syscall.S_IFLNK: "lnk", 166 syscall.S_IFDIR: "dir", 167 syscall.S_IFSOCK: "soc", 168 syscall.S_IFIFO: "pip", 169 syscall.S_IFCHR: "chr", 170 syscall.S_IFBLK: "blk", 171 }[m] 172} 173 174// debugString is used for debugging. Racy. 175func (n *Inode) String() string { 176 n.mu.Lock() 177 defer n.mu.Unlock() 178 var ss []string 179 for nm, ch := range n.children { 180 ss = append(ss, fmt.Sprintf("%q=i%d[%s]", nm, ch.stableAttr.Ino, modeStr(ch.stableAttr.Mode))) 181 } 182 183 return fmt.Sprintf("i%d (%s): %s", n.stableAttr.Ino, modeStr(n.stableAttr.Mode), strings.Join(ss, ",")) 184} 185 186// sortNodes rearranges inode group in consistent order. 187// 188// The nodes are ordered by their in-RAM address, which gives consistency 189// property: for any A and B inodes, sortNodes will either always order A < B, 190// or always order A > B. 191// 192// See lockNodes where this property is used to avoid deadlock when taking 193// locks on inode group. 194func sortNodes(ns []*Inode) { 195 sort.Slice(ns, func(i, j int) bool { 196 return nodeLess(ns[i], ns[j]) 197 }) 198} 199 200func nodeLess(a, b *Inode) bool { 201 return uintptr(unsafe.Pointer(a)) < uintptr(unsafe.Pointer(b)) 202} 203 204// lockNodes locks group of inodes. 205// 206// It always lock the inodes in the same order - to avoid deadlocks. 207// It also avoids locking an inode more than once, if it was specified multiple times. 208// An example when an inode might be given multiple times is if dir/a and dir/b 209// are hardlinked to the same inode and the caller needs to take locks on dir children. 210func lockNodes(ns ...*Inode) { 211 sortNodes(ns) 212 213 // The default value nil prevents trying to lock nil nodes. 214 var nprev *Inode 215 for _, n := range ns { 216 if n != nprev { 217 n.mu.Lock() 218 nprev = n 219 } 220 } 221} 222 223// lockNode2 locks a and b in order consistent with lockNodes. 224func lockNode2(a, b *Inode) { 225 if a == b { 226 a.mu.Lock() 227 } else if nodeLess(a, b) { 228 a.mu.Lock() 229 b.mu.Lock() 230 } else { 231 b.mu.Lock() 232 a.mu.Lock() 233 } 234} 235 236// unlockNode2 unlocks a and b 237func unlockNode2(a, b *Inode) { 238 if a == b { 239 a.mu.Unlock() 240 } else { 241 a.mu.Unlock() 242 b.mu.Unlock() 243 } 244} 245 246// unlockNodes releases locks taken by lockNodes. 247func unlockNodes(ns ...*Inode) { 248 // we don't need to unlock in the same order that was used in lockNodes. 249 // however it still helps to have nodes sorted to avoid duplicates. 250 sortNodes(ns) 251 252 var nprev *Inode 253 for _, n := range ns { 254 if n != nprev { 255 n.mu.Unlock() 256 nprev = n 257 } 258 } 259} 260 261// Forgotten returns true if the kernel holds no references to this 262// inode. This can be used for background cleanup tasks, since the 263// kernel has no way of reviving forgotten nodes by its own 264// initiative. 265func (n *Inode) Forgotten() bool { 266 n.mu.Lock() 267 defer n.mu.Unlock() 268 return n.lookupCount == 0 && len(n.parents) == 0 && !n.persistent 269} 270 271// Operations returns the object implementing the file system 272// operations. 273func (n *Inode) Operations() InodeEmbedder { 274 return n.ops 275} 276 277// Path returns a path string to the inode relative to `root`. 278// Pass nil to walk the hierarchy as far up as possible. 279// 280// If you set `root`, Path() warns if it finds an orphaned Inode, i.e. 281// if it does not end up at `root` after walking the hierarchy. 282func (n *Inode) Path(root *Inode) string { 283 var segments []string 284 p := n 285 for p != nil && p != root { 286 var pd parentData 287 288 // We don't try to take all locks at the same time, because 289 // the caller won't use the "path" string under lock anyway. 290 found := false 291 p.mu.Lock() 292 // Select an arbitrary parent 293 for pd = range p.parents { 294 found = true 295 break 296 } 297 p.mu.Unlock() 298 if found == false { 299 p = nil 300 break 301 } 302 if pd.parent == nil { 303 break 304 } 305 306 segments = append(segments, pd.name) 307 p = pd.parent 308 } 309 310 if root != nil && root != p { 311 deletedPlaceholder := fmt.Sprintf(".go-fuse.%d/deleted", rand.Uint64()) 312 n.bridge.logf("warning: Inode.Path: n%d is orphaned, replacing segment with %q", 313 n.nodeId, deletedPlaceholder) 314 // NOSUBMIT - should replace rather than append? 315 segments = append(segments, deletedPlaceholder) 316 } 317 318 i := 0 319 j := len(segments) - 1 320 321 for i < j { 322 segments[i], segments[j] = segments[j], segments[i] 323 i++ 324 j-- 325 } 326 327 path := strings.Join(segments, "/") 328 return path 329} 330 331// setEntry does `iparent[name] = ichild` linking. 332// 333// setEntry must not be called simultaneously for any of iparent or ichild. 334// This, for example could be satisfied if both iparent and ichild are locked, 335// but it could be also valid if only iparent is locked and ichild was just 336// created and only one goroutine keeps referencing it. 337func (iparent *Inode) setEntry(name string, ichild *Inode) { 338 newParent := parentData{name, iparent} 339 if ichild.stableAttr.Mode == syscall.S_IFDIR { 340 // Directories cannot have more than one parent. Clear the map. 341 // This special-case is neccessary because ichild may still have a 342 // parent that was forgotten (i.e. removed from bridge.inoMap). 343 for i := range ichild.parents { 344 delete(ichild.parents, i) 345 } 346 } 347 ichild.parents[newParent] = struct{}{} 348 iparent.children[name] = ichild 349 ichild.changeCounter++ 350 iparent.changeCounter++ 351} 352 353// NewPersistentInode returns an Inode whose lifetime is not in 354// control of the kernel. 355// 356// When the kernel is short on memory, it will forget cached file 357// system information (directory entries and inode metadata). This is 358// announced with FORGET messages. There are no guarantees if or when 359// this happens. When it happens, these are handled transparently by 360// go-fuse: all Inodes created with NewInode are released 361// automatically. NewPersistentInode creates inodes that go-fuse keeps 362// in memory, even if the kernel is not interested in them. This is 363// convenient for building static trees up-front. 364func (n *Inode) NewPersistentInode(ctx context.Context, node InodeEmbedder, id StableAttr) *Inode { 365 return n.newInode(ctx, node, id, true) 366} 367 368// ForgetPersistent manually marks the node as no longer important. If 369// it has no children, and if the kernel as no references, the nodes 370// gets removed from the tree. 371func (n *Inode) ForgetPersistent() { 372 n.removeRef(0, true) 373} 374 375// NewInode returns an inode for the given InodeEmbedder. The mode 376// should be standard mode argument (eg. S_IFDIR). The inode number in 377// id.Ino argument is used to implement hard-links. If it is given, 378// and another node with the same ID is known, the new inode may be 379// ignored, and the old one used instead. 380func (n *Inode) NewInode(ctx context.Context, node InodeEmbedder, id StableAttr) *Inode { 381 return n.newInode(ctx, node, id, false) 382} 383 384func (n *Inode) newInode(ctx context.Context, ops InodeEmbedder, id StableAttr, persistent bool) *Inode { 385 return n.bridge.newInode(ctx, ops, id, persistent) 386} 387 388// removeRef decreases references. Returns if this operation caused 389// the node to be forgotten (for kernel references), and whether it is 390// live (ie. was not dropped from the tree) 391func (n *Inode) removeRef(nlookup uint64, dropPersistence bool) (forgotten bool, live bool) { 392 var lockme []*Inode 393 var parents []parentData 394 395 n.mu.Lock() 396 if nlookup > 0 && dropPersistence { 397 log.Panic("only one allowed") 398 } else if nlookup > n.lookupCount { 399 log.Panicf("n%d lookupCount underflow: lookupCount=%d, decrement=%d", n.nodeId, n.lookupCount, nlookup) 400 } else if nlookup > 0 { 401 n.lookupCount -= nlookup 402 n.changeCounter++ 403 } else if dropPersistence && n.persistent { 404 n.persistent = false 405 n.changeCounter++ 406 } 407 408 n.bridge.mu.Lock() 409 if n.lookupCount == 0 { 410 forgotten = true 411 // Dropping the node from inoMap guarantees that no new references to this node are 412 // handed out to the kernel, hence we can also safely delete it from nodeidMap. 413 delete(n.bridge.stableAttrs, n.stableAttr) 414 delete(n.bridge.kernelNodeIds, n.nodeId) 415 } 416 n.bridge.mu.Unlock() 417 418retry: 419 for { 420 lockme = append(lockme[:0], n) 421 parents = parents[:0] 422 nChange := n.changeCounter 423 live = n.lookupCount > 0 || len(n.children) > 0 || n.persistent 424 for p := range n.parents { 425 parents = append(parents, p) 426 lockme = append(lockme, p.parent) 427 } 428 n.mu.Unlock() 429 430 if live { 431 return forgotten, live 432 } 433 434 lockNodes(lockme...) 435 if n.changeCounter != nChange { 436 unlockNodes(lockme...) 437 // could avoid unlocking and relocking n here. 438 n.mu.Lock() 439 continue retry 440 } 441 442 for _, p := range parents { 443 if p.parent.children[p.name] != n { 444 // another node has replaced us already 445 continue 446 } 447 delete(p.parent.children, p.name) 448 p.parent.changeCounter++ 449 } 450 n.parents = map[parentData]struct{}{} 451 n.changeCounter++ 452 453 if n.lookupCount != 0 { 454 log.Panicf("n%d %p lookupCount changed: %d", n.nodeId, n, n.lookupCount) 455 } 456 457 unlockNodes(lockme...) 458 break 459 } 460 461 for _, p := range lockme { 462 if p != n { 463 p.removeRef(0, false) 464 } 465 } 466 return forgotten, false 467} 468 469// GetChild returns a child node with the given name, or nil if the 470// directory has no child by that name. 471func (n *Inode) GetChild(name string) *Inode { 472 n.mu.Lock() 473 defer n.mu.Unlock() 474 return n.children[name] 475} 476 477// AddChild adds a child to this node. If overwrite is false, fail if 478// the destination already exists. 479func (n *Inode) AddChild(name string, ch *Inode, overwrite bool) (success bool) { 480 if len(name) == 0 { 481 log.Panic("empty name for inode") 482 } 483 484retry: 485 for { 486 lockNode2(n, ch) 487 prev, ok := n.children[name] 488 parentCounter := n.changeCounter 489 if !ok { 490 n.children[name] = ch 491 ch.parents[parentData{name, n}] = struct{}{} 492 n.changeCounter++ 493 ch.changeCounter++ 494 unlockNode2(n, ch) 495 return true 496 } 497 unlockNode2(n, ch) 498 if !overwrite { 499 return false 500 } 501 lockme := [3]*Inode{n, ch, prev} 502 503 lockNodes(lockme[:]...) 504 if parentCounter != n.changeCounter { 505 unlockNodes(lockme[:]...) 506 continue retry 507 } 508 509 delete(prev.parents, parentData{name, n}) 510 n.children[name] = ch 511 ch.parents[parentData{name, n}] = struct{}{} 512 n.changeCounter++ 513 ch.changeCounter++ 514 prev.changeCounter++ 515 unlockNodes(lockme[:]...) 516 517 return true 518 } 519} 520 521// Children returns the list of children of this directory Inode. 522func (n *Inode) Children() map[string]*Inode { 523 n.mu.Lock() 524 defer n.mu.Unlock() 525 r := make(map[string]*Inode, len(n.children)) 526 for k, v := range n.children { 527 r[k] = v 528 } 529 return r 530} 531 532// Parents returns a parent of this Inode, or nil if this Inode is 533// deleted or is the root 534func (n *Inode) Parent() (string, *Inode) { 535 n.mu.Lock() 536 defer n.mu.Unlock() 537 for k := range n.parents { 538 return k.name, k.parent 539 } 540 return "", nil 541} 542 543// RmAllChildren recursively drops a tree, forgetting all persistent 544// nodes. 545func (n *Inode) RmAllChildren() { 546 for { 547 chs := n.Children() 548 if len(chs) == 0 { 549 break 550 } 551 for nm, ch := range chs { 552 ch.RmAllChildren() 553 n.RmChild(nm) 554 } 555 } 556 n.removeRef(0, true) 557} 558 559// RmChild removes multiple children. Returns whether the removal 560// succeeded and whether the node is still live afterward. The removal 561// is transactional: it only succeeds if all names are children, and 562// if they all were removed successfully. If the removal was 563// successful, and there are no children left, the node may be removed 564// from the FS tree. In that case, RmChild returns live==false. 565func (n *Inode) RmChild(names ...string) (success, live bool) { 566 var lockme []*Inode 567 568retry: 569 for { 570 n.mu.Lock() 571 lockme = append(lockme[:0], n) 572 nChange := n.changeCounter 573 for _, nm := range names { 574 ch := n.children[nm] 575 if ch == nil { 576 n.mu.Unlock() 577 return false, true 578 } 579 lockme = append(lockme, ch) 580 } 581 n.mu.Unlock() 582 583 lockNodes(lockme...) 584 if n.changeCounter != nChange { 585 unlockNodes(lockme...) 586 continue retry 587 } 588 589 for _, nm := range names { 590 ch := n.children[nm] 591 delete(n.children, nm) 592 delete(ch.parents, parentData{nm, n}) 593 594 ch.changeCounter++ 595 } 596 n.changeCounter++ 597 598 live = n.lookupCount > 0 || len(n.children) > 0 || n.persistent 599 unlockNodes(lockme...) 600 601 // removal successful 602 break 603 } 604 605 if !live { 606 _, live := n.removeRef(0, false) 607 return true, live 608 } 609 610 return true, true 611} 612 613// MvChild executes a rename. If overwrite is set, a child at the 614// destination will be overwritten, should it exist. It returns false 615// if 'overwrite' is false, and the destination exists. 616func (n *Inode) MvChild(old string, newParent *Inode, newName string, overwrite bool) bool { 617 if len(newName) == 0 { 618 log.Panicf("empty newName for MvChild") 619 } 620 621retry: 622 for { 623 lockNode2(n, newParent) 624 counter1 := n.changeCounter 625 counter2 := newParent.changeCounter 626 627 oldChild := n.children[old] 628 destChild := newParent.children[newName] 629 unlockNode2(n, newParent) 630 631 if destChild != nil && !overwrite { 632 return false 633 } 634 635 lockNodes(n, newParent, oldChild, destChild) 636 if counter2 != newParent.changeCounter || counter1 != n.changeCounter { 637 unlockNodes(n, newParent, oldChild, destChild) 638 continue retry 639 } 640 641 if oldChild != nil { 642 delete(n.children, old) 643 delete(oldChild.parents, parentData{old, n}) 644 n.changeCounter++ 645 oldChild.changeCounter++ 646 } 647 648 if destChild != nil { 649 // This can cause the child to be slated for 650 // removal; see below 651 delete(newParent.children, newName) 652 delete(destChild.parents, parentData{newName, newParent}) 653 destChild.changeCounter++ 654 newParent.changeCounter++ 655 } 656 657 if oldChild != nil { 658 newParent.children[newName] = oldChild 659 newParent.changeCounter++ 660 661 oldChild.parents[parentData{newName, newParent}] = struct{}{} 662 oldChild.changeCounter++ 663 } 664 665 unlockNodes(n, newParent, oldChild, destChild) 666 667 if destChild != nil { 668 destChild.removeRef(0, false) 669 } 670 return true 671 } 672} 673 674// ExchangeChild swaps the entries at (n, oldName) and (newParent, 675// newName). 676func (n *Inode) ExchangeChild(oldName string, newParent *Inode, newName string) { 677 oldParent := n 678retry: 679 for { 680 lockNode2(oldParent, newParent) 681 counter1 := oldParent.changeCounter 682 counter2 := newParent.changeCounter 683 684 oldChild := oldParent.children[oldName] 685 destChild := newParent.children[newName] 686 unlockNode2(oldParent, newParent) 687 688 if destChild == oldChild { 689 return 690 } 691 692 lockNodes(oldParent, newParent, oldChild, destChild) 693 if counter2 != newParent.changeCounter || counter1 != oldParent.changeCounter { 694 unlockNodes(oldParent, newParent, oldChild, destChild) 695 continue retry 696 } 697 698 // Detach 699 if oldChild != nil { 700 delete(oldParent.children, oldName) 701 delete(oldChild.parents, parentData{oldName, oldParent}) 702 oldParent.changeCounter++ 703 oldChild.changeCounter++ 704 } 705 706 if destChild != nil { 707 delete(newParent.children, newName) 708 delete(destChild.parents, parentData{newName, newParent}) 709 destChild.changeCounter++ 710 newParent.changeCounter++ 711 } 712 713 // Attach 714 if oldChild != nil { 715 newParent.children[newName] = oldChild 716 newParent.changeCounter++ 717 718 oldChild.parents[parentData{newName, newParent}] = struct{}{} 719 oldChild.changeCounter++ 720 } 721 722 if destChild != nil { 723 oldParent.children[oldName] = oldChild 724 oldParent.changeCounter++ 725 726 destChild.parents[parentData{oldName, oldParent}] = struct{}{} 727 destChild.changeCounter++ 728 } 729 unlockNodes(oldParent, newParent, oldChild, destChild) 730 return 731 } 732} 733 734// NotifyEntry notifies the kernel that data for a (directory, name) 735// tuple should be invalidated. On next access, a LOOKUP operation 736// will be started. 737func (n *Inode) NotifyEntry(name string) syscall.Errno { 738 status := n.bridge.server.EntryNotify(n.nodeId, name) 739 return syscall.Errno(status) 740} 741 742// NotifyDelete notifies the kernel that the given inode was removed 743// from this directory as entry under the given name. It is equivalent 744// to NotifyEntry, but also sends an event to inotify watchers. 745func (n *Inode) NotifyDelete(name string, child *Inode) syscall.Errno { 746 // XXX arg ordering? 747 return syscall.Errno(n.bridge.server.DeleteNotify(n.nodeId, child.nodeId, name)) 748 749} 750 751// NotifyContent notifies the kernel that content under the given 752// inode should be flushed from buffers. 753func (n *Inode) NotifyContent(off, sz int64) syscall.Errno { 754 // XXX how does this work for directories? 755 return syscall.Errno(n.bridge.server.InodeNotify(n.nodeId, off, sz)) 756} 757 758// WriteCache stores data in the kernel cache. 759func (n *Inode) WriteCache(offset int64, data []byte) syscall.Errno { 760 return syscall.Errno(n.bridge.server.InodeNotifyStoreCache(n.nodeId, offset, data)) 761} 762 763// ReadCache reads data from the kernel cache. 764func (n *Inode) ReadCache(offset int64, dest []byte) (count int, errno syscall.Errno) { 765 c, s := n.bridge.server.InodeRetrieveCache(n.nodeId, offset, dest) 766 return c, syscall.Errno(s) 767} 768