1 // Copyright (c) 2015 Sandstorm Development Group, Inc. and contributors
2 // Licensed under the MIT License:
3 //
4 // Permission is hereby granted, free of charge, to any person obtaining a copy
5 // of this software and associated documentation files (the "Software"), to deal
6 // in the Software without restriction, including without limitation the rights
7 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 // copies of the Software, and to permit persons to whom the Software is
9 // furnished to do so, subject to the following conditions:
10 //
11 // The above copyright notice and this permission notice shall be included in
12 // all copies or substantial portions of the Software.
13 //
14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 // THE SOFTWARE.
21 
22 #if !_WIN32
23 
24 #ifndef _GNU_SOURCE
25 #define _GNU_SOURCE
26 #endif
27 
28 #include "filesystem.h"
29 #include "debug.h"
30 #include <sys/types.h>
31 #include <sys/stat.h>
32 #include <sys/ioctl.h>
33 #include <fcntl.h>
34 #include <unistd.h>
35 #include <stdio.h>
36 #include <sys/mman.h>
37 #include <errno.h>
38 #include <dirent.h>
39 #include <stdlib.h>
40 #include "vector.h"
41 #include "miniposix.h"
42 #include <algorithm>
43 
44 #if __linux__
45 #include <syscall.h>
46 #include <linux/fs.h>
47 #include <sys/sendfile.h>
48 #endif
49 
50 namespace kj {
51 namespace {
52 
53 #define HIDDEN_PREFIX ".kj-tmp."
54 // Prefix for temp files which should be hidden when listing a directory.
55 //
56 // If you change this, make sure to update the unit test.
57 
58 #ifdef O_CLOEXEC
59 #define MAYBE_O_CLOEXEC O_CLOEXEC
60 #else
61 #define MAYBE_O_CLOEXEC 0
62 #endif
63 
64 #ifdef O_DIRECTORY
65 #define MAYBE_O_DIRECTORY O_DIRECTORY
66 #else
67 #define MAYBE_O_DIRECTORY 0
68 #endif
69 
70 #if __APPLE__
71 // Mac OSX defines SEEK_HOLE, but it doesn't work. ("Inappropriate ioctl for device", it says.)
72 #undef SEEK_HOLE
73 #endif
74 
75 #ifndef DTTOIF
76 #define DTTOIF(dirtype) ((dirtype) << 12)
77 #endif
78 
79 #if __BIONIC__
// Bionic provides no DTTOIF(); undefining DT_UNKNOWN disables the d_type-based code paths below.
81 #undef DT_UNKNOWN
82 #endif
83 
// Forward declaration carrying KJ_UNUSED (presumably to silence unused-function warnings in
// configurations where no caller gets compiled in).
static void setCloexec(int fd) KJ_UNUSED;
static void setCloexec(int fd) {
  // Set the O_CLOEXEC flag on the given fd.
  //
  // We try to avoid the need to call this by taking advantage of syscall flags that set it
  // atomically on new file descriptors. Unfortunately some platforms do not support such syscalls.
  //
  // Where FIOCLEX is defined, a single ioctl() is tried first. If it fails with EINVAL or
  // EOPNOTSUPP -- or with any other error, after that error has been reported -- control falls
  // through to the fcntl() read/modify/write sequence at the bottom.

#ifdef FIOCLEX
  // Yay, we can set the flag in one call.
  KJ_SYSCALL_HANDLE_ERRORS(ioctl(fd, FIOCLEX)) {
    case EINVAL:
    case EOPNOTSUPP:
      // ioctl not usable on this descriptor; use the fcntl() path instead.
      break;
    default:
      KJ_FAIL_SYSCALL("ioctl(fd, FIOCLEX)", error) { break; }
      break;
  } else {
    // success
    return;
  }
#endif

  // Sadness, we must resort to read/modify/write.
  //
  // (On many platforms, FD_CLOEXEC is the only flag modifiable via F_SETFD and therefore we could
  // skip the read... but it seems dangerous to assume that's true of all platforms, and anyway
  // most platforms support FIOCLEX.)
  int flags;
  KJ_SYSCALL(flags = fcntl(fd, F_GETFD));
  if (!(flags & FD_CLOEXEC)) {
    // Only issue the write when the flag is not already set.
    KJ_SYSCALL(fcntl(fd, F_SETFD, flags | FD_CLOEXEC));
  }
}
117 
toKjDate(struct timespec tv)118 static Date toKjDate(struct timespec tv) {
119   return tv.tv_sec * SECONDS + tv.tv_nsec * NANOSECONDS + UNIX_EPOCH;
120 }
121 
modeToType(mode_t mode)122 static FsNode::Type modeToType(mode_t mode) {
123   switch (mode & S_IFMT) {
124     case S_IFREG : return FsNode::Type::FILE;
125     case S_IFDIR : return FsNode::Type::DIRECTORY;
126     case S_IFLNK : return FsNode::Type::SYMLINK;
127     case S_IFBLK : return FsNode::Type::BLOCK_DEVICE;
128     case S_IFCHR : return FsNode::Type::CHARACTER_DEVICE;
129     case S_IFIFO : return FsNode::Type::NAMED_PIPE;
130     case S_IFSOCK: return FsNode::Type::SOCKET;
131     default: return FsNode::Type::OTHER;
132   }
133 }
134 
statToMetadata(struct stat & stats)135 static FsNode::Metadata statToMetadata(struct stat& stats) {
136   // Probably st_ino and st_dev are usually under 32 bits, so mix by rotating st_dev left 32 bits
137   // and XOR.
138   uint64_t d = stats.st_dev;
139   uint64_t hash = ((d << 32) | (d >> 32)) ^ stats.st_ino;
140 
141   return FsNode::Metadata {
142     modeToType(stats.st_mode),
143     implicitCast<uint64_t>(stats.st_size),
144     implicitCast<uint64_t>(stats.st_blocks * 512u),
145 #if __APPLE__
146     toKjDate(stats.st_mtimespec),
147 #else
148     toKjDate(stats.st_mtim),
149 #endif
150     implicitCast<uint>(stats.st_nlink),
151     hash
152   };
153 }
154 
155 static bool rmrf(int fd, StringPtr path);
156 
rmrfChildrenAndClose(int fd)157 static void rmrfChildrenAndClose(int fd) {
158   // Assumes fd is seeked to beginning.
159 
160   DIR* dir = fdopendir(fd);
161   if (dir == nullptr) {
162     close(fd);
163     KJ_FAIL_SYSCALL("fdopendir", errno);
164   };
165   KJ_DEFER(closedir(dir));
166 
167   for (;;) {
168     errno = 0;
169     struct dirent* entry = readdir(dir);
170     if (entry == nullptr) {
171       int error = errno;
172       if (error == 0) {
173         break;
174       } else {
175         KJ_FAIL_SYSCALL("readdir", error);
176       }
177     }
178 
179     if (entry->d_name[0] == '.' &&
180         (entry->d_name[1] == '\0' ||
181          (entry->d_name[1] == '.' &&
182           entry->d_name[2] == '\0'))) {
183       // ignore . and ..
184     } else {
185 #ifdef DT_UNKNOWN    // d_type is not available on all platforms.
186       if (entry->d_type == DT_DIR) {
187         int subdirFd;
188         KJ_SYSCALL(subdirFd = openat(
189             fd, entry->d_name, O_RDONLY | MAYBE_O_DIRECTORY | MAYBE_O_CLOEXEC));
190         rmrfChildrenAndClose(subdirFd);
191         KJ_SYSCALL(unlinkat(fd, entry->d_name, AT_REMOVEDIR));
192       } else if (entry->d_type != DT_UNKNOWN) {
193         KJ_SYSCALL(unlinkat(fd, entry->d_name, 0));
194       } else {
195 #endif
196         KJ_ASSERT(rmrf(fd, entry->d_name));
197 #ifdef DT_UNKNOWN
198       }
199 #endif
200     }
201   }
202 }
203 
rmrf(int fd,StringPtr path)204 static bool rmrf(int fd, StringPtr path) {
205   struct stat stats;
206   KJ_SYSCALL_HANDLE_ERRORS(fstatat(fd, path.cStr(), &stats, AT_SYMLINK_NOFOLLOW)) {
207     case ENOENT:
208     case ENOTDIR:
209       // Doesn't exist.
210       return false;
211     default:
212       KJ_FAIL_SYSCALL("lstat(path)", error, path) { return false; }
213   }
214 
215   if (S_ISDIR(stats.st_mode)) {
216     int subdirFd;
217     KJ_SYSCALL(subdirFd = openat(
218         fd, path.cStr(), O_RDONLY | MAYBE_O_DIRECTORY | MAYBE_O_CLOEXEC)) { return false; }
219     rmrfChildrenAndClose(subdirFd);
220     KJ_SYSCALL(unlinkat(fd, path.cStr(), AT_REMOVEDIR)) { return false; }
221   } else {
222     KJ_SYSCALL(unlinkat(fd, path.cStr(), 0)) { return false; }
223   }
224 
225   return true;
226 }
227 
struct MmapRange {
  // A region whose start lies on a page boundary, as computed by getMmapRange().
  uint64_t offset;
  uint64_t size;
};

static MmapRange getMmapRange(uint64_t offset, uint64_t size) {
  // Translates a caller-requested (offset, size) into the (offset, size) that should actually be
  // handed to mmap(), given that a mapping has to begin on a page boundary.
  //
  // The start is moved down to the enclosing page boundary and the size grows by the same
  // amount. The end of the mapping is deliberately *not* rounded up to a page boundary: mmap()
  // does not need that, and doing so causes trouble on some systems (notably Cygwin).

#ifndef _SC_PAGESIZE
#define _SC_PAGESIZE _SC_PAGE_SIZE
#endif
  static const uint64_t pageSize = sysconf(_SC_PAGESIZE);

  // How far `offset` sits past the preceding page boundary.
  uint64_t misalignment = offset & (pageSize - 1);

  MmapRange result;
  result.offset = offset - misalignment;
  result.size = size + misalignment;
  return result;
}
251 
class MmapDisposer: public ArrayDisposer {
  // ArrayDisposer that releases an array by munmap()ing the pages it lives in. It is attached to
  // the arrays returned by the mmap*() methods below.
protected:
  void disposeImpl(void* firstElement, size_t elementSize, size_t elementCount,
                   size_t capacity, void (*destroyElement)(void*)) const {
    // The array may start partway into a page, so recompute the page-aligned range that contains
    // it before unmapping. `capacity` and `destroyElement` are not used.
    auto range = getMmapRange(reinterpret_cast<uintptr_t>(firstElement),
                              elementSize * elementCount);
    KJ_SYSCALL(munmap(reinterpret_cast<byte*>(range.offset), range.size)) { break; }
  }
};

// Single shared disposer instance referenced by every mmap-backed array created in this file.
constexpr MmapDisposer mmapDisposer = MmapDisposer();
263 
264 class DiskHandle {
265   // We need to implement each of ReadableFile, AppendableFile, File, ReadableDirectory, and
266   // Directory for disk handles. There is a lot of implementation overlap between these, especially
267   // stat(), sync(), etc. We can't have everything inherit from a common DiskFsNode that implements
268   // these because then we get diamond inheritance which means we need to make all our inheritance
269   // virtual which means downcasting requires RTTI which violates our goal of supporting compiling
270   // with no RTTI. So instead we have the DiskHandle class which implements all the methods without
271   // inheriting anything, and then we have DiskFile, DiskDirectory, etc. hold this and delegate to
272   // it. Ugly, but works.
273 
274 public:
  // Takes ownership of `fd`, the descriptor that the methods of this class operate on.
  DiskHandle(AutoCloseFd&& fd): fd(kj::mv(fd)) {}
276 
277   // OsHandle ------------------------------------------------------------------
278 
clone() const279   AutoCloseFd clone() const {
280     int fd2;
281 #ifdef F_DUPFD_CLOEXEC
282     KJ_SYSCALL_HANDLE_ERRORS(fd2 = fcntl(fd, F_DUPFD_CLOEXEC, 3)) {
283       case EINVAL:
284       case EOPNOTSUPP:
285         // fall back
286         break;
287       default:
288         KJ_FAIL_SYSCALL("fnctl(fd, F_DUPFD_CLOEXEC, 3)", error) { break; }
289         break;
290     } else {
291       return AutoCloseFd(fd2);
292     }
293 #endif
294 
295     KJ_SYSCALL(fd2 = ::dup(fd));
296     AutoCloseFd result(fd2);
297     setCloexec(result);
298     return result;
299   }
300 
  int getFd() const {
    // Returns the raw descriptor number; the handle keeps holding the descriptor object.
    return fd.get();
  }
304 
305   // FsNode --------------------------------------------------------------------
306 
  FsNode::Metadata stat() const {
    // fstat()s the descriptor and converts the result with statToMetadata().
    struct stat stats;
    KJ_SYSCALL(::fstat(fd, &stats));
    return statToMetadata(stats);
  }
312 
  void sync() const {
    // Flushes the file to storage: F_FULLFSYNC on Apple platforms, fsync() everywhere else.
#if __APPLE__
    // For whatever reason, fsync() on OSX only flushes kernel buffers. It does not flush hardware
    // disk buffers. This makes it not very useful. But OSX documents fcntl F_FULLFSYNC which does
    // the right thing. Why they don't just make fsync() do the right thing, I do not know.
    KJ_SYSCALL(fcntl(fd, F_FULLFSYNC));
#else
    KJ_SYSCALL(fsync(fd));
#endif
  }
323 
  void datasync() const {
    // Uses fdatasync() where it is available; otherwise falls back to the full sync().
    //
    // The presence of the _POSIX_SYNCHRONIZED_IO define is supposed to tell us that fdatasync()
    // exists. But Apple defines this yet doesn't offer fdatasync(). Thanks, Apple.
#if _POSIX_SYNCHRONIZED_IO && !__APPLE__
    KJ_SYSCALL(fdatasync(fd));
#else
    this->sync();
#endif
  }
333 
334   // ReadableFile --------------------------------------------------------------
335 
read(uint64_t offset,ArrayPtr<byte> buffer) const336   size_t read(uint64_t offset, ArrayPtr<byte> buffer) const {
337     // pread() probably never returns short reads unless it hits EOF. Unfortunately, though, per
338     // spec we are not allowed to assume this.
339 
340     size_t total = 0;
341     while (buffer.size() > 0) {
342       ssize_t n;
343       KJ_SYSCALL(n = pread(fd, buffer.begin(), buffer.size(), offset));
344       if (n == 0) break;
345       total += n;
346       offset += n;
347       buffer = buffer.slice(n, buffer.size());
348     }
349     return total;
350   }
351 
mmap(uint64_t offset,uint64_t size) const352   Array<const byte> mmap(uint64_t offset, uint64_t size) const {
353     if (size == 0) return nullptr;  // zero-length mmap() returns EINVAL, so avoid it
354     auto range = getMmapRange(offset, size);
355     const void* mapping = ::mmap(NULL, range.size, PROT_READ, MAP_SHARED, fd, range.offset);
356     if (mapping == MAP_FAILED) {
357       KJ_FAIL_SYSCALL("mmap", errno);
358     }
359     return Array<const byte>(reinterpret_cast<const byte*>(mapping) + (offset - range.offset),
360                              size, mmapDisposer);
361   }
362 
mmapPrivate(uint64_t offset,uint64_t size) const363   Array<byte> mmapPrivate(uint64_t offset, uint64_t size) const {
364     if (size == 0) return nullptr;  // zero-length mmap() returns EINVAL, so avoid it
365     auto range = getMmapRange(offset, size);
366     void* mapping = ::mmap(NULL, range.size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, range.offset);
367     if (mapping == MAP_FAILED) {
368       KJ_FAIL_SYSCALL("mmap", errno);
369     }
370     return Array<byte>(reinterpret_cast<byte*>(mapping) + (offset - range.offset),
371                        size, mmapDisposer);
372   }
373 
374   // File ----------------------------------------------------------------------
375 
write(uint64_t offset,ArrayPtr<const byte> data) const376   void write(uint64_t offset, ArrayPtr<const byte> data) const {
377     // pwrite() probably never returns short writes unless there's no space left on disk.
378     // Unfortunately, though, per spec we are not allowed to assume this.
379 
380     while (data.size() > 0) {
381       ssize_t n;
382       KJ_SYSCALL(n = pwrite(fd, data.begin(), data.size(), offset));
383       KJ_ASSERT(n > 0, "pwrite() returned zero?");
384       offset += n;
385       data = data.slice(n, data.size());
386     }
387   }
388 
zero(uint64_t offset,uint64_t size) const389   void zero(uint64_t offset, uint64_t size) const {
390 #ifdef FALLOC_FL_PUNCH_HOLE
391     KJ_SYSCALL_HANDLE_ERRORS(
392         fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, size)) {
393       case EOPNOTSUPP:
394         // fall back to below
395         break;
396       default:
397         KJ_FAIL_SYSCALL("fallocate(FALLOC_FL_PUNCH_HOLE)", error) { return; }
398     } else {
399       return;
400     }
401 #endif
402 
403     static const byte ZEROS[4096] = { 0 };
404 
405 #if __APPLE__ || __CYGWIN__ || (defined(__ANDROID__) && __ANDROID_API__ < 24)
406     // Mac & Cygwin & Android API levels 23 and lower doesn't have pwritev().
407     while (size > sizeof(ZEROS)) {
408       write(offset, ZEROS);
409       size -= sizeof(ZEROS);
410       offset += sizeof(ZEROS);
411     }
412     write(offset, kj::arrayPtr(ZEROS, size));
413 #else
414     // Use a 4k buffer of zeros amplified by iov to write zeros with as few syscalls as possible.
415     size_t count = (size + sizeof(ZEROS) - 1) / sizeof(ZEROS);
416     const size_t iovmax = miniposix::iovMax(count);
417     KJ_STACK_ARRAY(struct iovec, iov, kj::min(iovmax, count), 16, 256);
418 
419     for (auto& item: iov) {
420       item.iov_base = const_cast<byte*>(ZEROS);
421       item.iov_len = sizeof(ZEROS);
422     }
423 
424     while (size > 0) {
425       size_t iovCount;
426       if (size >= iov.size() * sizeof(ZEROS)) {
427         iovCount = iov.size();
428       } else {
429         iovCount = size / sizeof(ZEROS);
430         size_t rem = size % sizeof(ZEROS);
431         if (rem > 0) {
432           iov[iovCount++].iov_len = rem;
433         }
434       }
435 
436       ssize_t n;
437       KJ_SYSCALL(n = pwritev(fd, iov.begin(), count, offset));
438       KJ_ASSERT(n > 0, "pwrite() returned zero?");
439 
440       offset += n;
441       size -= n;
442     }
443 #endif
444   }
445 
  void truncate(uint64_t size) const {
    // Sets the file's length to `size` via ftruncate().
    KJ_SYSCALL(ftruncate(fd, size));
  }
449 
  class WritableFileMappingImpl final: public WritableFileMapping {
    // WritableFileMapping backed by an mmap()ed byte array. The array is owned by this object;
    // mmapWritable() constructs it with a disposer that unmaps the memory.
  public:
    WritableFileMappingImpl(Array<byte> bytes): bytes(kj::mv(bytes)) {}

    ArrayPtr<byte> get() const override {
      // const_cast OK because WritableFileMapping does indeed provide a writable view despite
      // being const itself.
      return arrayPtr(const_cast<byte*>(bytes.begin()), bytes.size());
    }

    void changed(ArrayPtr<byte> slice) const override {
      // Issues an msync(MS_ASYNC) over the pages containing `slice`. `slice` must lie within
      // this mapping; an empty slice is a no-op.
      KJ_REQUIRE(slice.begin() >= bytes.begin() && slice.end() <= bytes.end(),
                 "byte range is not part of this mapping");
      if (slice.size() == 0) return;

      // msync() requires page-alignment, apparently, so use getMmapRange() to accomplish that.
      auto range = getMmapRange(reinterpret_cast<uintptr_t>(slice.begin()), slice.size());
      KJ_SYSCALL(msync(reinterpret_cast<void*>(range.offset), range.size, MS_ASYNC));
    }

    void sync(ArrayPtr<byte> slice) const override {
      // Same as changed(), but with MS_SYNC instead of MS_ASYNC.
      KJ_REQUIRE(slice.begin() >= bytes.begin() && slice.end() <= bytes.end(),
                 "byte range is not part of this mapping");
      if (slice.size() == 0) return;

      // msync() requires page-alignment, apparently, so use getMmapRange() to accomplish that.
      auto range = getMmapRange(reinterpret_cast<uintptr_t>(slice.begin()), slice.size());
      KJ_SYSCALL(msync(reinterpret_cast<void*>(range.offset), range.size, MS_SYNC));
    }

  private:
    // The mapped region exposed through get().
    Array<byte> bytes;
  };
483 
mmapWritable(uint64_t offset,uint64_t size) const484   Own<const WritableFileMapping> mmapWritable(uint64_t offset, uint64_t size) const {
485     if (size == 0) {
486       // zero-length mmap() returns EINVAL, so avoid it
487       return heap<WritableFileMappingImpl>(nullptr);
488     }
489     auto range = getMmapRange(offset, size);
490     void* mapping = ::mmap(NULL, range.size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, range.offset);
491     if (mapping == MAP_FAILED) {
492       KJ_FAIL_SYSCALL("mmap", errno);
493     }
494     auto array = Array<byte>(reinterpret_cast<byte*>(mapping) + (offset - range.offset),
495                              size, mmapDisposer);
496     return heap<WritableFileMappingImpl>(kj::mv(array));
497   }
498 
  size_t copyChunk(uint64_t offset, int fromFd, uint64_t fromOffset, uint64_t size) const {
    // Copies a range of bytes from `fromFd` to this file in the most efficient way possible for
    // the OS. Only returns less than `size` if EOF. Does not account for holes.
    //
    // On Linux, sendfile() is tried first; if it reports EINVAL or ENOSYS the copy is redone
    // from the start of the range with the pread()/write() loop at the bottom.

#if __linux__
    {
      // sendfile() writes at the destination's current file position, so this moves the
      // position of `fd` itself.
      // NOTE(review): concurrent users of the same descriptor would observe this seek -- confirm
      // that callers tolerate it.
      KJ_SYSCALL(lseek(fd, offset, SEEK_SET));
      off_t fromPos = fromOffset;
      off_t end = fromOffset + size;
      while (fromPos < end) {
        ssize_t n;
        // sendfile() advances `fromPos` by the amount transferred.
        KJ_SYSCALL_HANDLE_ERRORS(n = sendfile(fd, fromFd, &fromPos, end - fromPos)) {
          case EINVAL:
          case ENOSYS:
            goto sendfileNotAvailable;
          default:
            KJ_FAIL_SYSCALL("sendfile", error) { return fromPos - fromOffset; }
        }
        if (n == 0) break;  // source hit EOF
      }
      return fromPos - fromOffset;
    }

  sendfileNotAvailable:
#endif
    // Portable fallback: shuttle the data through a 4k stack buffer.
    uint64_t total = 0;
    while (size > 0) {
      byte buffer[4096];
      ssize_t n;
      KJ_SYSCALL(n = pread(fromFd, buffer, kj::min(sizeof(buffer), size), fromOffset));
      if (n == 0) break;  // source hit EOF
      write(offset, arrayPtr(buffer, n));
      fromOffset += n;
      offset += n;
      total += n;
      size -= n;
    }
    return total;
  }
538 
copy(uint64_t offset,const ReadableFile & from,uint64_t fromOffset,uint64_t size) const539   kj::Maybe<size_t> copy(uint64_t offset, const ReadableFile& from,
540                          uint64_t fromOffset, uint64_t size) const {
541     KJ_IF_MAYBE(otherFd, from.getFd()) {
542 #ifdef FICLONE
543       if (offset == 0 && fromOffset == 0 && size == kj::maxValue && stat().size == 0) {
544         if (ioctl(fd, FICLONE, *otherFd) >= 0) {
545           return stat().size;
546         }
547       } else if (size > 0) {    // src_length = 0 has special meaning for the syscall, so avoid.
548         struct file_clone_range range;
549         memset(&range, 0, sizeof(range));
550         range.src_fd = *otherFd;
551         range.dest_offset = offset;
552         range.src_offset = fromOffset;
553         range.src_length = size == kj::maxValue ? 0 : size;
554         if (ioctl(fd, FICLONERANGE, &range) >= 0) {
555           // TODO(someday): What does FICLONERANGE actually do if the range goes past EOF? The docs
556           //   don't say. Maybe it only copies the parts that exist. Maybe it punches holes for the
557           //   rest. Where does the destination file's EOF marker end up? Who knows?
558           return kj::min(from.stat().size - fromOffset, size);
559         }
560       } else {
561         // size == 0
562         return size_t(0);
563       }
564 
565       // ioctl failed. Almost all failures documented for these are of the form "the operation is
566       // not supported for the filesystem(s) specified", so fall back to other approaches.
567 #endif
568 
569       off_t toPos = offset;
570       off_t fromPos = fromOffset;
571       off_t end = size == kj::maxValue ? off_t(kj::maxValue) : off_t(fromOffset + size);
572 
573       for (;;) {
574         // Handle data.
575         {
576           // Find out how much data there is before the next hole.
577           off_t nextHole;
578 #ifdef SEEK_HOLE
579           KJ_SYSCALL_HANDLE_ERRORS(nextHole = lseek(*otherFd, fromPos, SEEK_HOLE)) {
580             case EINVAL:
581               // SEEK_HOLE probably not supported. Assume no holes.
582               nextHole = end;
583               break;
584             case ENXIO:
585               // Past EOF. Stop here.
586               return fromPos - fromOffset;
587             default:
588               KJ_FAIL_SYSCALL("lseek(fd, pos, SEEK_HOLE)", error) { return fromPos - fromOffset; }
589           }
590 #else
591           // SEEK_HOLE not supported. Assume no holes.
592           nextHole = end;
593 #endif
594 
595           // Copy the next chunk of data.
596           off_t copyTo = kj::min(end, nextHole);
597           size_t amount = copyTo - fromPos;
598           if (amount > 0) {
599             size_t n = copyChunk(toPos, *otherFd, fromPos, amount);
600             fromPos += n;
601             toPos += n;
602 
603             if (n < amount) {
604               return fromPos - fromOffset;
605             }
606           }
607 
608           if (fromPos == end) {
609             return fromPos - fromOffset;
610           }
611         }
612 
613 #ifdef SEEK_HOLE
614         // Handle hole.
615         {
616           // Find out how much hole there is before the next data.
617           off_t nextData;
618           KJ_SYSCALL_HANDLE_ERRORS(nextData = lseek(*otherFd, fromPos, SEEK_DATA)) {
619             case EINVAL:
620               // SEEK_DATA probably not supported. But we should only have gotten here if we
621               // were expecting a hole.
622               KJ_FAIL_ASSERT("can't determine hole size; SEEK_DATA not supported");
623               break;
624             case ENXIO:
625               // No more data. Set to EOF.
626               KJ_SYSCALL(nextData = lseek(*otherFd, 0, SEEK_END));
627               if (nextData > end) {
628                 end = nextData;
629               }
630               break;
631             default:
632               KJ_FAIL_SYSCALL("lseek(fd, pos, SEEK_HOLE)", error) { return fromPos - fromOffset; }
633           }
634 
635           // Write zeros.
636           off_t zeroTo = kj::min(end, nextData);
637           off_t amount = zeroTo - fromPos;
638           if (amount > 0) {
639             zero(toPos, amount);
640             toPos += amount;
641             fromPos = zeroTo;
642           }
643 
644           if (fromPos == end) {
645             return fromPos - fromOffset;
646           }
647         }
648 #endif
649       }
650     }
651 
652     // Indicates caller should call File::copy() default implementation.
653     return nullptr;
654   }
655 
656   // ReadableDirectory ---------------------------------------------------------
657 
658   template <typename Func>
list(bool needTypes,Func && func) const659   auto list(bool needTypes, Func&& func) const
660       -> Array<Decay<decltype(func(instance<StringPtr>(), instance<FsNode::Type>()))>> {
661     // Seek to start of directory.
662     KJ_SYSCALL(lseek(fd, 0, SEEK_SET));
663 
664     // Unfortunately, fdopendir() takes ownership of the file descriptor. Therefore we need to
665     // make a duplicate.
666     int duped;
667     KJ_SYSCALL(duped = dup(fd));
668     DIR* dir = fdopendir(duped);
669     if (dir == nullptr) {
670       close(duped);
671       KJ_FAIL_SYSCALL("fdopendir", errno);
672     }
673 
674     KJ_DEFER(closedir(dir));
675     typedef Decay<decltype(func(instance<StringPtr>(), instance<FsNode::Type>()))> Entry;
676     kj::Vector<Entry> entries;
677 
678     for (;;) {
679       errno = 0;
680       struct dirent* entry = readdir(dir);
681       if (entry == nullptr) {
682         int error = errno;
683         if (error == 0) {
684           break;
685         } else {
686           KJ_FAIL_SYSCALL("readdir", error);
687         }
688       }
689 
690       kj::StringPtr name = entry->d_name;
691       if (name != "." && name != ".." && !name.startsWith(HIDDEN_PREFIX)) {
692 #ifdef DT_UNKNOWN    // d_type is not available on all platforms.
693         if (entry->d_type != DT_UNKNOWN) {
694           entries.add(func(name, modeToType(DTTOIF(entry->d_type))));
695         } else {
696 #endif
697           if (needTypes) {
698             // Unknown type. Fall back to stat.
699             struct stat stats;
700             KJ_SYSCALL(fstatat(fd, name.cStr(), &stats, AT_SYMLINK_NOFOLLOW));
701             entries.add(func(name, modeToType(stats.st_mode)));
702           } else {
703             entries.add(func(name, FsNode::Type::OTHER));
704           }
705 #ifdef DT_UNKNOWN
706         }
707 #endif
708       }
709     }
710 
711     auto result = entries.releaseAsArray();
712     std::sort(result.begin(), result.end());
713     return result;
714   }
715 
  Array<String> listNames() const {
    // Sorted names of the directory's visible entries. Types are not requested, so list() does
    // not stat entries whose type readdir() did not report.
    return list(false, [](StringPtr name, FsNode::Type type) { return heapString(name); });
  }
719 
  Array<ReadableDirectory::Entry> listEntries() const {
    // Sorted (type, name) entries for the directory's visible contents. Types are requested, so
    // list() stats any entry whose type readdir() did not report.
    return list(true, [](StringPtr name, FsNode::Type type) {
      return ReadableDirectory::Entry { type, heapString(name), };
    });
  }
725 
exists(PathPtr path) const726   bool exists(PathPtr path) const {
727     KJ_SYSCALL_HANDLE_ERRORS(faccessat(fd, path.toString().cStr(), F_OK, 0)) {
728       case ENOENT:
729       case ENOTDIR:
730         return false;
731       default:
732         KJ_FAIL_SYSCALL("faccessat(fd, path)", error, path) { return false; }
733     }
734     return true;
735   }
736 
tryLstat(PathPtr path) const737   Maybe<FsNode::Metadata> tryLstat(PathPtr path) const {
738     struct stat stats;
739     KJ_SYSCALL_HANDLE_ERRORS(fstatat(fd, path.toString().cStr(), &stats, AT_SYMLINK_NOFOLLOW)) {
740       case ENOENT:
741       case ENOTDIR:
742         return nullptr;
743       default:
744         KJ_FAIL_SYSCALL("faccessat(fd, path)", error, path) { return nullptr; }
745     }
746     return statToMetadata(stats);
747   }
748 
tryOpenFile(PathPtr path) const749   Maybe<Own<const ReadableFile>> tryOpenFile(PathPtr path) const {
750     int newFd;
751     KJ_SYSCALL_HANDLE_ERRORS(newFd = openat(
752         fd, path.toString().cStr(), O_RDONLY | MAYBE_O_CLOEXEC)) {
753       case ENOENT:
754       case ENOTDIR:
755         return nullptr;
756       default:
757         KJ_FAIL_SYSCALL("openat(fd, path, O_RDONLY)", error, path) { return nullptr; }
758     }
759 
760     kj::AutoCloseFd result(newFd);
761 #ifndef O_CLOEXEC
762     setCloexec(result);
763 #endif
764 
765     return newDiskReadableFile(kj::mv(result));
766   }
767 
  Maybe<AutoCloseFd> tryOpenSubdirInternal(PathPtr path) const {
    // Opens the directory at `path` (resolved relative to this directory) and returns its
    // descriptor, or nullptr if nothing exists there. If the path exists but is not a
    // directory, the failure is reported rather than treated as "missing".
    int newFd;
    KJ_SYSCALL_HANDLE_ERRORS(newFd = openat(
        fd, path.toString().cStr(), O_RDONLY | MAYBE_O_CLOEXEC | MAYBE_O_DIRECTORY)) {
      case ENOENT:
        return nullptr;
      case ENOTDIR:
        // Could mean that a parent is not a directory, which we treat as "doesn't exist".
        // Could also mean that the specified file is not a directory, which should throw.
        // Check using exists().
        if (!exists(path)) {
          return nullptr;
        }
        // fallthrough (intentional: the path exists but is not a directory, so report it)
      default:
        KJ_FAIL_SYSCALL("openat(fd, path, O_DIRECTORY)", error, path) { return nullptr; }
    }

    kj::AutoCloseFd result(newFd);
#ifndef O_CLOEXEC
    // openat() could not set close-on-exec for us, so do it as a separate step.
    setCloexec(result);
#endif

    return kj::mv(result);
  }
793 
  Maybe<Own<const ReadableDirectory>> tryOpenSubdir(PathPtr path) const {
    // Opens the subdirectory via tryOpenSubdirInternal() and wraps the resulting descriptor
    // with newDiskReadableDirectory; a nullptr result is passed through.
    return tryOpenSubdirInternal(path).map(newDiskReadableDirectory);
  }
797 
tryReadlink(PathPtr path) const798   Maybe<String> tryReadlink(PathPtr path) const {
799     size_t trySize = 256;
800     for (;;) {
801       KJ_STACK_ARRAY(char, buf, trySize, 256, 4096);
802       ssize_t n = readlinkat(fd, path.toString().cStr(), buf.begin(), buf.size());
803       if (n < 0) {
804         int error = errno;
805         switch (error) {
806           case EINTR:
807             continue;
808           case ENOENT:
809           case ENOTDIR:
810           case EINVAL:    // not a link
811             return nullptr;
812           default:
813             KJ_FAIL_SYSCALL("readlinkat(fd, path)", error, path) { return nullptr; }
814         }
815       }
816 
817       if (n >= buf.size()) {
818         // Didn't give it enough space. Better retry with a bigger buffer.
819         trySize *= 2;
820         continue;
821       }
822 
823       return heapString(buf.begin(), n);
824     }
825   }
826 
827   // Directory -----------------------------------------------------------------
828 
  bool tryMkdir(PathPtr path, WriteMode mode, bool noThrow) const {
    // Internal function to make a directory.
    //
    // Returns true if the directory was created, or if it already existed as a directory and
    // `mode` includes MODIFY. Returns false if the path exists and MODIFY was not given, or if
    // it exists but is not a directory. Any other failure either returns false (`noThrow`) or
    // is reported via KJ_FAIL_SYSCALL.

    auto filename = path.toString();
    // PRIVATE restricts the new directory to its owner.
    mode_t acl = has(mode, WriteMode::PRIVATE) ? 0700 : 0777;

    KJ_SYSCALL_HANDLE_ERRORS(mkdirat(fd, filename.cStr(), acl)) {
      case EEXIST: {
        // Apparently this path exists.
        if (!has(mode, WriteMode::MODIFY)) {
          // Require exclusive create.
          return false;
        }

        // MODIFY is allowed, so we just need to check whether the existing entry is a directory.
        // (Flags of 0 mean a symlink at the path is followed.)
        struct stat stats;
        KJ_SYSCALL_HANDLE_ERRORS(fstatat(fd, filename.cStr(), &stats, 0)) {
          default:
            // mkdir() says EEXIST but we can't stat it. Maybe it's a dangling link, or maybe
            // we can't access it for some reason. Assume failure.
            //
            // TODO(someday): Maybe we should be creating the directory at the target of the
            //   link?
            goto failed;
        }
        return (stats.st_mode & S_IFMT) == S_IFDIR;
      }
      case ENOENT:
        // A parent is missing. If allowed, create the parents (never throwing from that
        // recursive step) and then try again.
        if (has(mode, WriteMode::CREATE_PARENT) && path.size() > 0 &&
            tryMkdir(path.parent(), WriteMode::CREATE | WriteMode::MODIFY |
                                    WriteMode::CREATE_PARENT, true)) {
          // Retry, but make sure we don't try to create the parent again.
          return tryMkdir(path, mode - WriteMode::CREATE_PARENT, noThrow);
        } else {
          goto failed;
        }
      default:
      failed:
        // Shared failure exit, also reached by the `goto failed` statements above.
        if (noThrow) {
          // Caller requested no throwing.
          return false;
        } else {
          KJ_FAIL_SYSCALL("mkdirat(fd, path)", error, path);
        }
    }

    return true;
  }
877 
  kj::Maybe<String> createNamedTemporary(
      PathPtr finalName, WriteMode mode, Function<int(StringPtr)> tryCreate) const {
    // Create a temporary file which will eventually replace `finalName`.
    //
    // Calls `tryCreate` to actually create the temporary, passing in the desired path. tryCreate()
    // is expected to behave like a syscall, returning a negative value and setting `errno` on
    // error. tryCreate() MUST fail with EEXIST if the path exists -- this is not checked in
    // advance, since it needs to be checked atomically. In the case of EEXIST, tryCreate() will
    // be called again with a new path.
    //
    // `mode` is consulted only for CREATE_PARENT: when set and creation fails with ENOENT, the
    // parent directory of `finalName` is created and the attempt is repeated once.
    //
    // Returns the temporary path that succeeded. Only returns nullptr if there was an exception
    // but we're compiled with -fno-exceptions.

    if (finalName.size() == 0) {
      KJ_FAIL_REQUIRE("can't replace self") { break; }
      return nullptr;
    }

    // NOTE(review): `counter` is a plain function-local static incremented without any
    // synchronization, and `pid` is captured once on first use. A collision is handled by the
    // EEXIST retry below, but confirm whether concurrent callers (or use after fork()) are
    // expected here.
    static uint counter = 0;
    static const pid_t pid = getpid();
    String pathPrefix;
    if (finalName.size() > 1) {
      // The temporary lives in the same directory as the final name.
      pathPrefix = kj::str(finalName.parent(), '/');
    }
    // Candidate name: <parent>/<HIDDEN_PREFIX><pid>.<counter>.<basename>.partial
    auto path = kj::str(pathPrefix, HIDDEN_PREFIX, pid, '.', counter++, '.',
                        finalName.basename()[0], ".partial");

    KJ_SYSCALL_HANDLE_ERRORS(tryCreate(path)) {
      case EEXIST:
        // Name already taken; recurse, which picks the next counter value.
        return createNamedTemporary(finalName, mode, kj::mv(tryCreate));
      case ENOENT:
        if (has(mode, WriteMode::CREATE_PARENT) && finalName.size() > 1 &&
            tryMkdir(finalName.parent(), WriteMode::CREATE | WriteMode::MODIFY |
                                         WriteMode::CREATE_PARENT, true)) {
          // Retry, but make sure we don't try to create the parent again.
          mode = mode - WriteMode::CREATE_PARENT;
          return createNamedTemporary(finalName, mode, kj::mv(tryCreate));
        }
        // fallthrough
      default:
        KJ_FAIL_SYSCALL("create(path)", error, path) { break; }
        return nullptr;
    }

    return kj::mv(path);
  }
924 
  bool tryReplaceNode(PathPtr path, WriteMode mode, Function<int(StringPtr)> tryCreate) const {
    // Replaces the given path with an object created by calling tryCreate().
    //
    // tryCreate() must behave like a syscall which creates the node at the path passed to it,
    // returning a negative value on error. If the path passed to tryCreate already exists, it
    // MUST fail with EEXIST.
    //
    // When `mode` includes MODIFY, replaceNode() reacts to EEXIST by creating the node in a
    // temporary location and then rename()ing it into place.
    //
    // Returns true if the node now exists at `path`, false if the WriteMode preconditions were
    // not met (or if an exception was raised while exceptions are disabled).

    if (path.size() == 0) {
      KJ_FAIL_REQUIRE("can't replace self") { return false; }
    }

    auto filename = path.toString();

    if (has(mode, WriteMode::CREATE)) {
      // First try just creating the node in-place.
      KJ_SYSCALL_HANDLE_ERRORS(tryCreate(filename)) {
        case EEXIST:
          // Target exists.
          if (has(mode, WriteMode::MODIFY)) {
            // Fall back to MODIFY path, below.
            break;
          } else {
            return false;
          }
        case ENOENT:
          if (has(mode, WriteMode::CREATE_PARENT) && path.size() > 0 &&
              tryMkdir(path.parent(), WriteMode::CREATE | WriteMode::MODIFY |
                                      WriteMode::CREATE_PARENT, true)) {
            // Retry, but make sure we don't try to create the parent again.
            return tryReplaceNode(path, mode - WriteMode::CREATE_PARENT, kj::mv(tryCreate));
          }
          // fallthrough
        default:
          KJ_FAIL_SYSCALL("create(path)", error, path) { return false; }
      } else {
        // Success.
        return true;
      }
    }

    // Either we don't have CREATE mode or the target already exists. We need to perform a
    // replacement instead.

    KJ_IF_MAYBE(tempPath, createNamedTemporary(path, mode, kj::mv(tryCreate))) {
      if (tryCommitReplacement(filename, fd, *tempPath, mode)) {
        return true;
      } else {
        // The commit was refused, so remove the temporary node we created. It is removed with
        // plain unlinkat() (flags 0), i.e. as a non-directory.
        KJ_SYSCALL_HANDLE_ERRORS(unlinkat(fd, tempPath->cStr(), 0)) {
          case ENOENT:
            // meh
            break;
          default:
            KJ_FAIL_SYSCALL("unlinkat(fd, tempPath, 0)", error, *tempPath);
        }
        return false;
      }
    } else {
      // threw, but exceptions are disabled
      return false;
    }
  }
989 
  Maybe<AutoCloseFd> tryOpenFileInternal(PathPtr path, WriteMode mode, bool append) const {
    // Opens (and, depending on `mode`, creates) the file at `path` relative to `fd` for
    // reading and writing, returning the new file descriptor.
    //
    // Returns nullptr when the preconditions expressed by `mode` are not met: neither CREATE nor
    // MODIFY was given, the file is missing in MODIFY-only mode, or it already exists (including
    // as a broken symlink) in CREATE-only mode. When `append` is true the descriptor is opened
    // with O_APPEND.
    //
    // Permission bits passed to openat(): 0666, or 0777 with EXECUTABLE, masked with 0700 when
    // PRIVATE is set.
    uint flags = O_RDWR | MAYBE_O_CLOEXEC;
    mode_t acl = 0666;
    if (has(mode, WriteMode::CREATE)) {
      flags |= O_CREAT;
    }
    if (!has(mode, WriteMode::MODIFY)) {
      if (!has(mode, WriteMode::CREATE)) {
        // Neither CREATE nor MODIFY -- impossible to satisfy preconditions.
        return nullptr;
      }
      // CREATE without MODIFY: the file must not exist yet.
      flags |= O_EXCL;
    }
    if (append) {
      flags |= O_APPEND;
    }
    if (has(mode, WriteMode::EXECUTABLE)) {
      acl = 0777;
    }
    if (has(mode, WriteMode::PRIVATE)) {
      acl &= 0700;
    }

    auto filename = path.toString();

    int newFd;
    KJ_SYSCALL_HANDLE_ERRORS(newFd = openat(fd, filename.cStr(), flags, acl)) {
      case ENOENT:
        if (has(mode, WriteMode::CREATE)) {
          // Either:
          // - The file is a broken symlink.
          // - A parent directory didn't exist.
          if (has(mode, WriteMode::CREATE_PARENT) && path.size() > 0 &&
              tryMkdir(path.parent(), WriteMode::CREATE | WriteMode::MODIFY |
                                      WriteMode::CREATE_PARENT, true)) {
            // Retry, but make sure we don't try to create the parent again.
            return tryOpenFileInternal(path, mode - WriteMode::CREATE_PARENT, append);
          }

          // Check for broken link.
          if (!has(mode, WriteMode::MODIFY) &&
              faccessat(fd, filename.cStr(), F_OK, AT_SYMLINK_NOFOLLOW) >= 0) {
            // Yep. We treat this as already-exists, which means in CREATE-only mode this is a
            // simple failure.
            return nullptr;
          }

          KJ_FAIL_REQUIRE("parent is not a directory", path) { return nullptr; }
        } else {
          // MODIFY-only mode. ENOENT = doesn't exist = return null.
          return nullptr;
        }
      case ENOTDIR:
        if (!has(mode, WriteMode::CREATE)) {
          // MODIFY-only mode. ENOTDIR = parent not a directory = doesn't exist = return null.
          return nullptr;
        }
        goto failed;
      case EEXIST:
        if (!has(mode, WriteMode::MODIFY)) {
          // CREATE-only mode. EEXIST = already exists = return null.
          return nullptr;
        }
        goto failed;
      default:
      failed:
        // Shared error exit for every errno that is not a plain precondition failure.
        KJ_FAIL_SYSCALL("openat(fd, path, O_RDWR | ...)", error, path) { return nullptr; }
    }

    kj::AutoCloseFd result(newFd);
#ifndef O_CLOEXEC
    // No O_CLOEXEC flag on this platform, so set close-on-exec after the fact.
    setCloexec(result);
#endif

    return kj::mv(result);
  }
1066 
tryCommitReplacement(StringPtr toPath,int fromDirFd,StringPtr fromPath,WriteMode mode,int * errorReason=nullptr) const1067   bool tryCommitReplacement(StringPtr toPath, int fromDirFd, StringPtr fromPath, WriteMode mode,
1068                             int* errorReason = nullptr) const {
1069     if (has(mode, WriteMode::CREATE) && has(mode, WriteMode::MODIFY)) {
1070       // Always clobber. Try it.
1071       KJ_SYSCALL_HANDLE_ERRORS(renameat(fromDirFd, fromPath.cStr(), fd.get(), toPath.cStr())) {
1072         case EISDIR:
1073         case ENOTDIR:
1074         case ENOTEMPTY:
1075         case EEXIST:
1076           // Failed because target exists and due to the various weird quirks of rename(), it
1077           // can't remove it for us. On Linux we can try an exchange instead. On others we have
1078           // to move the target out of the way.
1079           break;
1080         default:
1081           if (errorReason == nullptr) {
1082             KJ_FAIL_SYSCALL("rename(fromPath, toPath)", error, fromPath, toPath) { return false; }
1083           } else {
1084             *errorReason = error;
1085             return false;
1086           }
1087       } else {
1088         return true;
1089       }
1090     }
1091 
1092 #if __linux__ && defined(RENAME_EXCHANGE)
1093     // Try to use Linux's renameat2() to atomically check preconditions and apply.
1094 
1095     if (has(mode, WriteMode::MODIFY)) {
1096       // Use an exchange to implement modification.
1097       //
1098       // We reach this branch when performing a MODIFY-only, or when performing a CREATE | MODIFY
1099       // in which we determined above that there's a node of a different type blocking the
1100       // exchange.
1101 
1102       KJ_SYSCALL_HANDLE_ERRORS(syscall(SYS_renameat2,
1103           fromDirFd, fromPath.cStr(), fd.get(), toPath.cStr(), RENAME_EXCHANGE)) {
1104         case ENOSYS:
1105           break;  // fall back to traditional means
1106         case ENOENT:
1107           // Presumably because the target path doesn't exist.
1108           if (has(mode, WriteMode::CREATE)) {
1109             KJ_FAIL_ASSERT("rename(tmp, path) claimed path exists but "
1110                 "renameat2(fromPath, toPath, EXCAHNGE) said it doest; concurrent modification?",
1111                 fromPath, toPath) { return false; }
1112           } else {
1113             // Assume target doesn't exist.
1114             return false;
1115           }
1116         default:
1117           if (errorReason == nullptr) {
1118             KJ_FAIL_SYSCALL("renameat2(fromPath, toPath, EXCHANGE)", error, fromPath, toPath) {
1119               return false;
1120             }
1121           } else {
1122             *errorReason = error;
1123             return false;
1124           }
1125       } else {
1126         // Successful swap! Delete swapped-out content.
1127         rmrf(fromDirFd, fromPath);
1128         return true;
1129       }
1130     } else if (has(mode, WriteMode::CREATE)) {
1131       KJ_SYSCALL_HANDLE_ERRORS(syscall(SYS_renameat2,
1132           fromDirFd, fromPath.cStr(), fd.get(), toPath.cStr(), RENAME_NOREPLACE)) {
1133         case ENOSYS:
1134           break;  // fall back to traditional means
1135         case EEXIST:
1136           return false;
1137         default:
1138           if (errorReason == nullptr) {
1139             KJ_FAIL_SYSCALL("renameat2(fromPath, toPath, NOREPLACE)", error, fromPath, toPath) {
1140               return false;
1141             }
1142           } else {
1143             *errorReason = error;
1144             return false;
1145           }
1146       } else {
1147         return true;
1148       }
1149     }
1150 #endif
1151 
1152     // We're unable to do what we wanted atomically. :(
1153 
1154     if (has(mode, WriteMode::CREATE) && has(mode, WriteMode::MODIFY)) {
1155       // We failed to atomically delete the target previously. So now we need to do two calls in
1156       // rapid succession to move the old file away then move the new one into place.
1157 
1158       // Find out what kind of file exists at the target path.
1159       struct stat stats;
1160       KJ_SYSCALL(fstatat(fd, toPath.cStr(), &stats, AT_SYMLINK_NOFOLLOW)) { return false; }
1161 
1162       // Create a temporary location to move the existing object to. Note that rename() allows a
1163       // non-directory to replace a non-directory, and allows a directory to replace an empty
1164       // directory. So we have to create the right type.
1165       Path toPathParsed = Path::parse(toPath);
1166       String away;
1167       KJ_IF_MAYBE(awayPath, createNamedTemporary(toPathParsed, WriteMode::CREATE,
1168           [&](StringPtr candidatePath) {
1169         if (S_ISDIR(stats.st_mode)) {
1170           return mkdirat(fd, candidatePath.cStr(), 0700);
1171         } else {
1172 #if __APPLE__
1173           // No mknodat() on OSX, gotta open() a file, ugh.
1174           int newFd = openat(fd, candidatePath.cStr(),
1175                              O_RDWR | O_CREAT | O_EXCL | MAYBE_O_CLOEXEC, 0700);
1176           if (newFd >= 0) close(newFd);
1177           return newFd;
1178 #else
1179           return mknodat(fd, candidatePath.cStr(), S_IFREG | 0600, dev_t());
1180 #endif
1181         }
1182       })) {
1183         away = kj::mv(*awayPath);
1184       } else {
1185         // Already threw.
1186         return false;
1187       }
1188 
1189       // OK, now move the target object to replace the thing we just created.
1190       KJ_SYSCALL(renameat(fd, toPath.cStr(), fd, away.cStr())) {
1191         // Something went wrong. Remove the thing we just created.
1192         unlinkat(fd, away.cStr(), S_ISDIR(stats.st_mode) ? AT_REMOVEDIR : 0);
1193         return false;
1194       }
1195 
1196       // Now move the source object to the target location.
1197       KJ_SYSCALL_HANDLE_ERRORS(renameat(fromDirFd, fromPath.cStr(), fd, toPath.cStr())) {
1198         default:
1199           // Try to put things back where they were. If this fails, though, then we have little
1200           // choice but to leave things broken.
1201           KJ_SYSCALL_HANDLE_ERRORS(renameat(fd, away.cStr(), fd, toPath.cStr())) {
1202             default: break;
1203           }
1204 
1205           if (errorReason == nullptr) {
1206             KJ_FAIL_SYSCALL("rename(fromPath, toPath)", error, fromPath, toPath) {
1207               return false;
1208             }
1209           } else {
1210             *errorReason = error;
1211             return false;
1212           }
1213       }
1214 
1215       // OK, success. Delete the old content.
1216       rmrf(fd, away);
1217       return true;
1218     } else {
1219       // Only one of CREATE or MODIFY is specified, so we need to verify non-atomically that the
1220       // corresponding precondition (must-not-exist or must-exist, respectively) is held.
1221       if (has(mode, WriteMode::CREATE)) {
1222         struct stat stats;
1223         KJ_SYSCALL_HANDLE_ERRORS(fstatat(fd.get(), toPath.cStr(), &stats, AT_SYMLINK_NOFOLLOW)) {
1224           case ENOENT:
1225           case ENOTDIR:
1226             break;  // doesn't exist; continue
1227           default:
1228             KJ_FAIL_SYSCALL("fstatat(fd, toPath)", error, toPath) { return false; }
1229         } else {
1230           return false;  // already exists; fail
1231         }
1232       } else if (has(mode, WriteMode::MODIFY)) {
1233         struct stat stats;
1234         KJ_SYSCALL_HANDLE_ERRORS(fstatat(fd.get(), toPath.cStr(), &stats, AT_SYMLINK_NOFOLLOW)) {
1235           case ENOENT:
1236           case ENOTDIR:
1237             return false;  // doesn't exist; fail
1238           default:
1239             KJ_FAIL_SYSCALL("fstatat(fd, toPath)", error, toPath) { return false; }
1240         } else {
1241           // already exists; continue
1242         }
1243       } else {
1244         // Neither CREATE nor MODIFY.
1245         return false;
1246       }
1247 
1248       // Start over in create-and-modify mode.
1249       return tryCommitReplacement(toPath, fromDirFd, fromPath,
1250                                   WriteMode::CREATE | WriteMode::MODIFY,
1251                                   errorReason);
1252     }
1253   }
1254 
1255   template <typename T>
1256   class ReplacerImpl final: public Directory::Replacer<T> {
1257   public:
ReplacerImpl(Own<const T> && object,const DiskHandle & handle,String && tempPath,String && path,WriteMode mode)1258     ReplacerImpl(Own<const T>&& object, const DiskHandle& handle,
1259                  String&& tempPath, String&& path, WriteMode mode)
1260         : Directory::Replacer<T>(mode),
1261           object(kj::mv(object)), handle(handle),
1262           tempPath(kj::mv(tempPath)), path(kj::mv(path)) {}
1263 
~ReplacerImpl()1264     ~ReplacerImpl() noexcept(false) {
1265       if (!committed) {
1266         rmrf(handle.fd, tempPath);
1267       }
1268     }
1269 
get()1270     const T& get() override {
1271       return *object;
1272     }
1273 
tryCommit()1274     bool tryCommit() override {
1275       KJ_ASSERT(!committed, "already committed") { return false; }
1276       return committed = handle.tryCommitReplacement(path, handle.fd, tempPath,
1277                                                      Directory::Replacer<T>::mode);
1278     }
1279 
1280   private:
1281     Own<const T> object;
1282     const DiskHandle& handle;
1283     String tempPath;
1284     String path;
1285     bool committed = false;  // true if *successfully* committed (in which case tempPath is gone)
1286   };
1287 
1288   template <typename T>
1289   class BrokenReplacer final: public Directory::Replacer<T> {
1290     // For recovery path when exceptions are disabled.
1291 
1292   public:
BrokenReplacer(Own<const T> inner)1293     BrokenReplacer(Own<const T> inner)
1294         : Directory::Replacer<T>(WriteMode::CREATE | WriteMode::MODIFY),
1295           inner(kj::mv(inner)) {}
1296 
get()1297     const T& get() override { return *inner; }
tryCommit()1298     bool tryCommit() override { return false; }
1299 
1300   private:
1301     Own<const T> inner;
1302   };
1303 
  Maybe<Own<const File>> tryOpenFile(PathPtr path, WriteMode mode) const {
    // Opens the file at `path` for read/write (not in append mode) and wraps the resulting
    // descriptor with newDiskFile. Yields nullptr whenever tryOpenFileInternal() does.
    return tryOpenFileInternal(path, mode, false).map(newDiskFile);
  }
1307 
replaceFile(PathPtr path,WriteMode mode) const1308   Own<Directory::Replacer<File>> replaceFile(PathPtr path, WriteMode mode) const {
1309     mode_t acl = 0666;
1310     if (has(mode, WriteMode::EXECUTABLE)) {
1311       acl = 0777;
1312     }
1313     if (has(mode, WriteMode::PRIVATE)) {
1314       acl &= 0700;
1315     }
1316 
1317     int newFd_;
1318     KJ_IF_MAYBE(temp, createNamedTemporary(path, mode,
1319         [&](StringPtr candidatePath) {
1320       return newFd_ = openat(fd, candidatePath.cStr(),
1321                              O_RDWR | O_CREAT | O_EXCL | MAYBE_O_CLOEXEC, acl);
1322     })) {
1323       AutoCloseFd newFd(newFd_);
1324 #ifndef O_CLOEXEC
1325       setCloexec(newFd);
1326 #endif
1327       return heap<ReplacerImpl<File>>(newDiskFile(kj::mv(newFd)), *this, kj::mv(*temp),
1328                                       path.toString(), mode);
1329     } else {
1330       // threw, but exceptions are disabled
1331       return heap<BrokenReplacer<File>>(newInMemoryFile(nullClock()));
1332     }
1333   }
1334 
createTemporary() const1335   Own<const File> createTemporary() const {
1336     int newFd_;
1337 
1338 #if __linux__ && defined(O_TMPFILE)
1339     // Use syscall() to work around glibc bug with O_TMPFILE:
1340     //     https://sourceware.org/bugzilla/show_bug.cgi?id=17523
1341     KJ_SYSCALL_HANDLE_ERRORS(newFd_ = syscall(
1342         SYS_openat, fd.get(), ".", O_RDWR | O_TMPFILE, 0700)) {
1343       case EOPNOTSUPP:
1344       case EINVAL:
1345       case EISDIR:
1346         // Maybe not supported by this kernel / filesystem. Fall back to below.
1347         break;
1348       default:
1349         KJ_FAIL_SYSCALL("open(O_TMPFILE)", error) { break; }
1350         break;
1351     } else {
1352       AutoCloseFd newFd(newFd_);
1353 #ifndef O_CLOEXEC
1354       setCloexec(newFd);
1355 #endif
1356       return newDiskFile(kj::mv(newFd));
1357     }
1358 #endif
1359 
1360     KJ_IF_MAYBE(temp, createNamedTemporary(Path("unnamed"), WriteMode::CREATE,
1361         [&](StringPtr path) {
1362       return newFd_ = openat(fd, path.cStr(), O_RDWR | O_CREAT | O_EXCL | MAYBE_O_CLOEXEC, 0600);
1363     })) {
1364       AutoCloseFd newFd(newFd_);
1365 #ifndef O_CLOEXEC
1366       setCloexec(newFd);
1367 #endif
1368       auto result = newDiskFile(kj::mv(newFd));
1369       KJ_SYSCALL(unlinkat(fd, temp->cStr(), 0)) { break; }
1370       return kj::mv(result);
1371     } else {
1372       // threw, but exceptions are disabled
1373       return newInMemoryFile(nullClock());
1374     }
1375   }
1376 
  Maybe<Own<AppendableFile>> tryAppendFile(PathPtr path, WriteMode mode) const {
    // Same as tryOpenFile(path, mode) except that the file is opened in append mode and the
    // descriptor is wrapped with newDiskAppendableFile.
    return tryOpenFileInternal(path, mode, true).map(newDiskAppendableFile);
  }
1380 
tryOpenSubdir(PathPtr path,WriteMode mode) const1381   Maybe<Own<const Directory>> tryOpenSubdir(PathPtr path, WriteMode mode) const {
1382     // Must create before open.
1383     if (has(mode, WriteMode::CREATE)) {
1384       if (!tryMkdir(path, mode, false)) return nullptr;
1385     }
1386 
1387     return tryOpenSubdirInternal(path).map(newDiskDirectory);
1388   }
1389 
replaceSubdir(PathPtr path,WriteMode mode) const1390   Own<Directory::Replacer<Directory>> replaceSubdir(PathPtr path, WriteMode mode) const {
1391     mode_t acl = has(mode, WriteMode::PRIVATE) ? 0700 : 0777;
1392 
1393     KJ_IF_MAYBE(temp, createNamedTemporary(path, mode,
1394         [&](StringPtr candidatePath) {
1395       return mkdirat(fd, candidatePath.cStr(), acl);
1396     })) {
1397       int subdirFd_;
1398       KJ_SYSCALL_HANDLE_ERRORS(subdirFd_ = openat(
1399           fd, temp->cStr(), O_RDONLY | MAYBE_O_CLOEXEC | MAYBE_O_DIRECTORY)) {
1400         default:
1401           KJ_FAIL_SYSCALL("open(just-created-temporary)", error);
1402           return heap<BrokenReplacer<Directory>>(newInMemoryDirectory(nullClock()));
1403       }
1404 
1405       AutoCloseFd subdirFd(subdirFd_);
1406 #ifndef O_CLOEXEC
1407       setCloexec(subdirFd);
1408 #endif
1409       return heap<ReplacerImpl<Directory>>(
1410           newDiskDirectory(kj::mv(subdirFd)), *this, kj::mv(*temp), path.toString(), mode);
1411     } else {
1412       // threw, but exceptions are disabled
1413       return heap<BrokenReplacer<Directory>>(newInMemoryDirectory(nullClock()));
1414     }
1415   }
1416 
trySymlink(PathPtr linkpath,StringPtr content,WriteMode mode) const1417   bool trySymlink(PathPtr linkpath, StringPtr content, WriteMode mode) const {
1418     return tryReplaceNode(linkpath, mode, [&](StringPtr candidatePath) {
1419       return symlinkat(content.cStr(), fd, candidatePath.cStr());
1420     });
1421   }
1422 
  bool tryTransfer(PathPtr toPath, WriteMode toMode,
                   const Directory& fromDirectory, PathPtr fromPath,
                   TransferMode mode, const Directory& self) const {
    // Transfers `fromPath` in `fromDirectory` to `toPath` in this directory.
    //
    // When `fromDirectory` exposes a file descriptor, TransferMode::LINK is implemented with
    // linkat() and TransferMode::MOVE with a rename-based commit. Every other case (a different
    // transfer mode, a source directory without an fd, or a cross-device move) is delegated to
    // the generic Directory::tryTransfer() invoked on `self`.
    //
    // Returns false when the WriteMode preconditions in `toMode` are not met.
    KJ_REQUIRE(toPath.size() > 0, "can't replace self") { return false; }

    if (mode == TransferMode::LINK) {
      KJ_IF_MAYBE(fromFd, fromDirectory.getFd()) {
        // Other is a disk directory, so we can hopefully do an efficient move/link.
        return tryReplaceNode(toPath, toMode, [&](StringPtr candidatePath) {
          return linkat(*fromFd, fromPath.toString().cStr(), fd, candidatePath.cStr(), 0);
        });
      };
    } else if (mode == TransferMode::MOVE) {
      KJ_IF_MAYBE(fromFd, fromDirectory.getFd()) {
        KJ_ASSERT(mode == TransferMode::MOVE);

        // `error` stays 0 unless tryCommitReplacement() reports a rename failure through it.
        int error = 0;
        if (tryCommitReplacement(toPath.toString(), *fromFd, fromPath.toString(), toMode,
                                 &error)) {
          return true;
        } else switch (error) {
          case 0:
            // Plain old WriteMode precondition failure.
            return false;
          case EXDEV:
            // Can't move between devices. Fall back to default implementation, which does
            // copy/delete.
            break;
          case ENOENT:
            // Either the destination directory doesn't exist or the source path doesn't exist.
            // Unfortunately we don't really know. If CREATE_PARENT was provided, try creating
            // the parent directory. Otherwise, we don't actually need to distinguish between
            // these two errors; just return false.
            if (has(toMode, WriteMode::CREATE) && has(toMode, WriteMode::CREATE_PARENT) &&
                toPath.size() > 0 && tryMkdir(toPath.parent(),
                    WriteMode::CREATE | WriteMode::MODIFY | WriteMode::CREATE_PARENT, true)) {
              // Retry, but make sure we don't try to create the parent again.
              return tryTransfer(toPath, toMode - WriteMode::CREATE_PARENT,
                                 fromDirectory, fromPath, mode, self);
            }
            return false;
          default:
            KJ_FAIL_SYSCALL("rename(fromPath, toPath)", error, fromPath, toPath) {
              return false;
            }
        }
      }
    }

    // OK, we can't do anything efficient using the OS. Fall back to default implementation.
    return self.Directory::tryTransfer(toPath, toMode, fromDirectory, fromPath, mode);
  }
1475 
  bool tryRemove(PathPtr path) const {
    // Removes the node at `path` (relative to `fd`) by delegating to rmrf(), whose result is
    // returned unchanged.
    return rmrf(fd, path.toString());
  }
1479 
1480 protected:
1481   AutoCloseFd fd;
1482 };
1483 
// Expands, inside a class that also derives from DiskHandle, to the overrides of getFd(),
// cloneFsNode(), stat(), sync() and datasync(). Each one forwards to the DiskHandle method of the
// same name, except cloneFsNode(), which heap-allocates a new `classname` from
// DiskHandle::clone().
#define FSNODE_METHODS(classname)                                   \
  Maybe<int> getFd() const override { return DiskHandle::getFd(); } \
                                                                    \
  Own<const FsNode> cloneFsNode() const override {                  \
    return heap<classname>(DiskHandle::clone());                    \
  }                                                                 \
                                                                    \
  Metadata stat() const override { return DiskHandle::stat(); }     \
  void sync() const override { DiskHandle::sync(); }                \
  void datasync() const override { DiskHandle::datasync(); }
1494 
// ReadableFile implementation backed by a file descriptor. Every operation forwards to the
// DiskHandle method of the same name.
class DiskReadableFile final: public ReadableFile, public DiskHandle {
public:
  // Takes ownership of `fd`.
  DiskReadableFile(AutoCloseFd&& fd): DiskHandle(kj::mv(fd)) {}

  FSNODE_METHODS(DiskReadableFile);

  size_t read(uint64_t offset, ArrayPtr<byte> buffer) const override {
    return DiskHandle::read(offset, buffer);
  }
  Array<const byte> mmap(uint64_t offset, uint64_t size) const override {
    return DiskHandle::mmap(offset, size);
  }
  Array<byte> mmapPrivate(uint64_t offset, uint64_t size) const override {
    return DiskHandle::mmapPrivate(offset, size);
  }
};
1511 
// AppendableFile implementation backed by a file descriptor. Writes go through the
// FdOutputStream base; the FsNode methods forward to DiskHandle.
class DiskAppendableFile final: public AppendableFile, public DiskHandle, public FdOutputStream {
public:
  // Takes ownership of `fd`. DiskHandle is declared before FdOutputStream in the base list, so
  // DiskHandle::fd is already initialized when the stream is constructed from its raw value.
  DiskAppendableFile(AutoCloseFd&& fd)
      : DiskHandle(kj::mv(fd)),
        FdOutputStream(DiskHandle::fd.get()) {}

  FSNODE_METHODS(DiskAppendableFile);

  // Both write() overloads forward to FdOutputStream explicitly.
  void write(const void* buffer, size_t size) override {
    FdOutputStream::write(buffer, size);
  }
  void write(ArrayPtr<const ArrayPtr<const byte>> pieces) override {
    FdOutputStream::write(pieces);
  }
};
1527 
// File (read/write) implementation backed by a file descriptor. Every operation forwards to the
// DiskHandle method of the same name; copy() additionally has a generic fallback.
class DiskFile final: public File, public DiskHandle {
public:
  // Takes ownership of `fd`.
  DiskFile(AutoCloseFd&& fd): DiskHandle(kj::mv(fd)) {}

  FSNODE_METHODS(DiskFile);

  size_t read(uint64_t offset, ArrayPtr<byte> buffer) const override {
    return DiskHandle::read(offset, buffer);
  }
  Array<const byte> mmap(uint64_t offset, uint64_t size) const override {
    return DiskHandle::mmap(offset, size);
  }
  Array<byte> mmapPrivate(uint64_t offset, uint64_t size) const override {
    return DiskHandle::mmapPrivate(offset, size);
  }

  void write(uint64_t offset, ArrayPtr<const byte> data) const override {
    DiskHandle::write(offset, data);
  }
  void zero(uint64_t offset, uint64_t size) const override {
    DiskHandle::zero(offset, size);
  }
  void truncate(uint64_t size) const override {
    DiskHandle::truncate(size);
  }
  Own<const WritableFileMapping> mmapWritable(uint64_t offset, uint64_t size) const override {
    return DiskHandle::mmapWritable(offset, size);
  }
  size_t copy(uint64_t offset, const ReadableFile& from,
              uint64_t fromOffset, uint64_t size) const override {
    // Prefer DiskHandle::copy(); when it yields no result, fall back to the generic
    // File::copy() implementation.
    KJ_IF_MAYBE(result, DiskHandle::copy(offset, from, fromOffset, size)) {
      return *result;
    } else {
      return File::copy(offset, from, fromOffset, size);
    }
  }
};
1565 
// ReadableDirectory implementation backed by a directory file descriptor. Every operation
// forwards to the DiskHandle method of the same name.
class DiskReadableDirectory final: public ReadableDirectory, public DiskHandle {
public:
  // Takes ownership of `fd`.
  DiskReadableDirectory(AutoCloseFd&& fd): DiskHandle(kj::mv(fd)) {}

  FSNODE_METHODS(DiskReadableDirectory);

  Array<String> listNames() const override { return DiskHandle::listNames(); }
  Array<Entry> listEntries() const override { return DiskHandle::listEntries(); }
  bool exists(PathPtr path) const override { return DiskHandle::exists(path); }
  Maybe<FsNode::Metadata> tryLstat(PathPtr path) const override {
    return DiskHandle::tryLstat(path);
  }
  Maybe<Own<const ReadableFile>> tryOpenFile(PathPtr path) const override {
    return DiskHandle::tryOpenFile(path);
  }
  Maybe<Own<const ReadableDirectory>> tryOpenSubdir(PathPtr path) const override {
    return DiskHandle::tryOpenSubdir(path);
  }
  Maybe<String> tryReadlink(PathPtr path) const override { return DiskHandle::tryReadlink(path); }
};
1586 
1587 class DiskDirectory final: public Directory, public DiskHandle {
1588 public:
DiskDirectory(AutoCloseFd && fd)1589   DiskDirectory(AutoCloseFd&& fd): DiskHandle(kj::mv(fd)) {}
1590 
1591   FSNODE_METHODS(DiskDirectory);
1592 
listNames() const1593   Array<String> listNames() const override { return DiskHandle::listNames(); }
listEntries() const1594   Array<Entry> listEntries() const override { return DiskHandle::listEntries(); }
exists(PathPtr path) const1595   bool exists(PathPtr path) const override { return DiskHandle::exists(path); }
tryLstat(PathPtr path) const1596   Maybe<FsNode::Metadata> tryLstat(PathPtr path) const override {
1597     return DiskHandle::tryLstat(path);
1598   }
tryOpenFile(PathPtr path) const1599   Maybe<Own<const ReadableFile>> tryOpenFile(PathPtr path) const override {
1600     return DiskHandle::tryOpenFile(path);
1601   }
tryOpenSubdir(PathPtr path) const1602   Maybe<Own<const ReadableDirectory>> tryOpenSubdir(PathPtr path) const override {
1603     return DiskHandle::tryOpenSubdir(path);
1604   }
tryReadlink(PathPtr path) const1605   Maybe<String> tryReadlink(PathPtr path) const override { return DiskHandle::tryReadlink(path); }
1606 
tryOpenFile(PathPtr path,WriteMode mode) const1607   Maybe<Own<const File>> tryOpenFile(PathPtr path, WriteMode mode) const override {
1608     return DiskHandle::tryOpenFile(path, mode);
1609   }
replaceFile(PathPtr path,WriteMode mode) const1610   Own<Replacer<File>> replaceFile(PathPtr path, WriteMode mode) const override {
1611     return DiskHandle::replaceFile(path, mode);
1612   }
createTemporary() const1613   Own<const File> createTemporary() const override {
1614     return DiskHandle::createTemporary();
1615   }
tryAppendFile(PathPtr path,WriteMode mode) const1616   Maybe<Own<AppendableFile>> tryAppendFile(PathPtr path, WriteMode mode) const override {
1617     return DiskHandle::tryAppendFile(path, mode);
1618   }
tryOpenSubdir(PathPtr path,WriteMode mode) const1619   Maybe<Own<const Directory>> tryOpenSubdir(PathPtr path, WriteMode mode) const override {
1620     return DiskHandle::tryOpenSubdir(path, mode);
1621   }
replaceSubdir(PathPtr path,WriteMode mode) const1622   Own<Replacer<Directory>> replaceSubdir(PathPtr path, WriteMode mode) const override {
1623     return DiskHandle::replaceSubdir(path, mode);
1624   }
trySymlink(PathPtr linkpath,StringPtr content,WriteMode mode) const1625   bool trySymlink(PathPtr linkpath, StringPtr content, WriteMode mode) const override {
1626     return DiskHandle::trySymlink(linkpath, content, mode);
1627   }
tryTransfer(PathPtr toPath,WriteMode toMode,const Directory & fromDirectory,PathPtr fromPath,TransferMode mode) const1628   bool tryTransfer(PathPtr toPath, WriteMode toMode,
1629                    const Directory& fromDirectory, PathPtr fromPath,
1630                    TransferMode mode) const override {
1631     return DiskHandle::tryTransfer(toPath, toMode, fromDirectory, fromPath, mode, *this);
1632   }
1633   // tryTransferTo() not implemented because we have nothing special we can do.
tryRemove(PathPtr path) const1634   bool tryRemove(PathPtr path) const override {
1635     return DiskHandle::tryRemove(path);
1636   }
1637 };
1638 
1639 class DiskFilesystem final: public Filesystem {
1640 public:
DiskFilesystem()1641   DiskFilesystem()
1642       : root(openDir("/")),
1643         current(openDir(".")),
1644         currentPath(computeCurrentPath()) {}
1645 
getRoot() const1646   const Directory& getRoot() const override {
1647     return root;
1648   }
1649 
getCurrent() const1650   const Directory& getCurrent() const override {
1651     return current;
1652   }
1653 
getCurrentPath() const1654   PathPtr getCurrentPath() const override {
1655     return currentPath;
1656   }
1657 
1658 private:
1659   DiskDirectory root;
1660   DiskDirectory current;
1661   Path currentPath;
1662 
openDir(const char * dir)1663   static AutoCloseFd openDir(const char* dir) {
1664     int newFd;
1665     KJ_SYSCALL(newFd = open(dir, O_RDONLY | MAYBE_O_CLOEXEC | MAYBE_O_DIRECTORY));
1666     AutoCloseFd result(newFd);
1667 #ifndef O_CLOEXEC
1668     setCloexec(result);
1669 #endif
1670     return result;
1671   }
1672 
computeCurrentPath()1673   static Path computeCurrentPath() {
1674     // If env var PWD is set and points to the current directory, use it. This captures the current
1675     // path according to the user's shell, which may differ from the kernel's idea in the presence
1676     // of symlinks.
1677     const char* pwd = getenv("PWD");
1678     if (pwd != nullptr) {
1679       Path result = nullptr;
1680       struct stat pwdStat, dotStat;
1681       KJ_IF_MAYBE(e, kj::runCatchingExceptions([&]() {
1682         KJ_ASSERT(pwd[0] == '/') { return; }
1683         result = Path::parse(pwd + 1);
1684         KJ_SYSCALL(lstat(result.toString(true).cStr(), &pwdStat), result) { return; }
1685         KJ_SYSCALL(lstat(".", &dotStat)) { return; }
1686       })) {
1687         // failed, give up on PWD
1688         KJ_LOG(WARNING, "PWD environment variable seems invalid", pwd, *e);
1689       } else {
1690         if (pwdStat.st_ino == dotStat.st_ino &&
1691             pwdStat.st_dev == dotStat.st_dev) {
1692           return kj::mv(result);
1693         } else {
1694           KJ_LOG(WARNING, "PWD environment variable doesn't match current directory", pwd);
1695         }
1696       }
1697     }
1698 
1699     size_t size = 256;
1700   retry:
1701     KJ_STACK_ARRAY(char, buf, size, 256, 4096);
1702     if (getcwd(buf.begin(), size) == nullptr) {
1703       int error = errno;
1704       if (error == ENAMETOOLONG) {
1705         size *= 2;
1706         goto retry;
1707       } else {
1708         KJ_FAIL_SYSCALL("getcwd()", error);
1709       }
1710     }
1711 
1712     StringPtr path = buf.begin();
1713 
1714     // On Linux, the path will start with "(unreachable)" if the working directory is not a subdir
1715     // of the root directory, which is possible via chroot() or mount namespaces.
1716     KJ_ASSERT(!path.startsWith("(unreachable)"),
1717         "working directory is not reachable from root", path);
1718     KJ_ASSERT(path.startsWith("/"), "current directory is not absolute", path);
1719 
1720     return Path::parse(path.slice(1));
1721   }
1722 };
1723 
1724 } // namespace
1725 
newDiskReadableFile(kj::AutoCloseFd fd)1726 Own<ReadableFile> newDiskReadableFile(kj::AutoCloseFd fd) {
1727   return heap<DiskReadableFile>(kj::mv(fd));
1728 }
newDiskAppendableFile(kj::AutoCloseFd fd)1729 Own<AppendableFile> newDiskAppendableFile(kj::AutoCloseFd fd) {
1730   return heap<DiskAppendableFile>(kj::mv(fd));
1731 }
newDiskFile(kj::AutoCloseFd fd)1732 Own<File> newDiskFile(kj::AutoCloseFd fd) {
1733   return heap<DiskFile>(kj::mv(fd));
1734 }
newDiskReadableDirectory(kj::AutoCloseFd fd)1735 Own<ReadableDirectory> newDiskReadableDirectory(kj::AutoCloseFd fd) {
1736   return heap<DiskReadableDirectory>(kj::mv(fd));
1737 }
newDiskDirectory(kj::AutoCloseFd fd)1738 Own<Directory> newDiskDirectory(kj::AutoCloseFd fd) {
1739   return heap<DiskDirectory>(kj::mv(fd));
1740 }
1741 
newDiskFilesystem()1742 Own<Filesystem> newDiskFilesystem() {
1743   return heap<DiskFilesystem>();
1744 }
1745 
1746 } // namespace kj
1747 
1748 #endif  // !_WIN32
1749