1{- git-annex links to content 2 - 3 - On file systems that support them, symlinks are used. 4 - 5 - On other filesystems, git instead stores the symlink target in a regular 6 - file. 7 - 8 - Pointer files are used instead of symlinks for unlocked files. 9 - 10 - Copyright 2013-2021 Joey Hess <id@joeyh.name> 11 - 12 - Licensed under the GNU AGPL version 3 or higher. 13 -} 14 15{-# LANGUAGE CPP, BangPatterns #-} 16 17module Annex.Link where 18 19import Annex.Common 20import qualified Annex 21import qualified Annex.Queue 22import qualified Git.Queue 23import qualified Git.UpdateIndex 24import qualified Git.Index 25import qualified Git.LockFile 26import qualified Git.Env 27import qualified Git 28import Git.Types 29import Git.FilePath 30import Git.Config 31import Annex.HashObject 32import Annex.InodeSentinal 33import Annex.PidLock 34import Utility.FileMode 35import Utility.InodeCache 36import Utility.Tmp.Dir 37import Utility.CopyFile 38import qualified Database.Keys.Handle 39import qualified Utility.RawFilePath as R 40 41import qualified Data.ByteString as S 42import qualified Data.ByteString.Char8 as S8 43import qualified Data.ByteString.Lazy as L 44import qualified System.FilePath.ByteString as P 45 46type LinkTarget = S.ByteString 47 48{- Checks if a file is a link to a key. -} 49isAnnexLink :: RawFilePath -> Annex (Maybe Key) 50isAnnexLink file = maybe Nothing parseLinkTargetOrPointer <$> getAnnexLinkTarget file 51 52{- Gets the link target of a symlink. 53 - 54 - On a filesystem that does not support symlinks, fall back to getting the 55 - link target by looking inside the file. 56 - 57 - Returns Nothing if the file is not a symlink, or not a link to annex 58 - content. 59 -} 60getAnnexLinkTarget :: RawFilePath -> Annex (Maybe LinkTarget) 61getAnnexLinkTarget f = getAnnexLinkTarget' f 62 =<< (coreSymlinks <$> Annex.getGitConfig) 63 64{- Pass False to force looking inside file, for when git checks out 65 - symlinks as plain files. -} 66getAnnexLinkTarget' :: RawFilePath -> Bool -> Annex (Maybe S.ByteString) 67getAnnexLinkTarget' file coresymlinks = if coresymlinks 68 then check probesymlink $ 69 return Nothing 70 else check probesymlink $ 71 check probefilecontent $ 72 return Nothing 73 where 74 check getlinktarget fallback = 75 liftIO (catchMaybeIO getlinktarget) >>= \case 76 Just l 77 | isLinkToAnnex l -> return (Just l) 78 | otherwise -> return Nothing 79 Nothing -> fallback 80 81 probesymlink = R.readSymbolicLink file 82 83 probefilecontent = withFile (fromRawFilePath file) ReadMode $ \h -> do 84 s <- S.hGet h unpaddedMaxPointerSz 85 -- If we got the full amount, the file is too large 86 -- to be a symlink target. 87 return $ if S.length s == unpaddedMaxPointerSz 88 then mempty 89 else 90 -- If there are any NUL or newline 91 -- characters, or whitespace, we 92 -- certianly don't have a symlink to a 93 -- git-annex key. 94 if any (`S8.elem` s) "\0\n\r \t" 95 then mempty 96 else s 97 98makeAnnexLink :: LinkTarget -> RawFilePath -> Annex () 99makeAnnexLink = makeGitLink 100 101{- Creates a link on disk. 102 - 103 - On a filesystem that does not support symlinks, writes the link target 104 - to a file. Note that git will only treat the file as a symlink if 105 - it's staged as such, so use addAnnexLink when adding a new file or 106 - modified link to git. 107 -} 108makeGitLink :: LinkTarget -> RawFilePath -> Annex () 109makeGitLink linktarget file = ifM (coreSymlinks <$> Annex.getGitConfig) 110 ( liftIO $ do 111 void $ tryIO $ R.removeLink file 112 R.createSymbolicLink linktarget file 113 , liftIO $ S.writeFile (fromRawFilePath file) linktarget 114 ) 115 116{- Creates a link on disk, and additionally stages it in git. -} 117addAnnexLink :: LinkTarget -> RawFilePath -> Annex () 118addAnnexLink linktarget file = do 119 makeAnnexLink linktarget file 120 stageSymlink file =<< hashSymlink linktarget 121 122{- Injects a symlink target into git, returning its Sha. -} 123hashSymlink :: LinkTarget -> Annex Sha 124hashSymlink = hashBlob . toInternalGitPath 125 126{- Stages a symlink to an annexed object, using a Sha of its target. -} 127stageSymlink :: RawFilePath -> Sha -> Annex () 128stageSymlink file sha = 129 Annex.Queue.addUpdateIndex =<< 130 inRepo (Git.UpdateIndex.stageSymlink file sha) 131 132{- Injects a pointer file content into git, returning its Sha. -} 133hashPointerFile :: Key -> Annex Sha 134hashPointerFile key = hashBlob $ formatPointer key 135 136{- Stages a pointer file, using a Sha of its content -} 137stagePointerFile :: RawFilePath -> Maybe FileMode -> Sha -> Annex () 138stagePointerFile file mode sha = 139 Annex.Queue.addUpdateIndex =<< 140 inRepo (Git.UpdateIndex.stageFile sha treeitemtype $ fromRawFilePath file) 141 where 142 treeitemtype 143 | maybe False isExecutable mode = TreeExecutable 144 | otherwise = TreeFile 145 146writePointerFile :: RawFilePath -> Key -> Maybe FileMode -> IO () 147writePointerFile file k mode = do 148 S.writeFile (fromRawFilePath file) (formatPointer k) 149 maybe noop (setFileMode $ fromRawFilePath file) mode 150 151newtype Restage = Restage Bool 152 153{- Restage pointer file. This is used after updating a worktree file 154 - when content is added/removed, to prevent git status from showing 155 - it as modified. 156 - 157 - Asks git to refresh its index information for the file. 158 - That in turn runs the clean filter on the file; when the clean 159 - filter produces the same pointer that was in the index before, git 160 - realizes that the file has not actually been modified. 161 - 162 - Note that, if the pointer file is staged for deletion, or has different 163 - content than the current worktree content staged, this won't change 164 - that. So it's safe to call at any time and any situation. 165 - 166 - If the index is known to be locked (eg, git add has run git-annex), 167 - that would fail. Restage False will prevent the index being updated. 168 - Will display a message to help the user understand why 169 - the file will appear to be modified. 170 - 171 - This uses the git queue, so the update is not performed immediately, 172 - and this can be run multiple times cheaply. 173 - 174 - The InodeCache is for the worktree file. It is used to detect when 175 - the worktree file is changed by something else before git update-index 176 - gets to look at it. 177 -} 178restagePointerFile :: Restage -> RawFilePath -> InodeCache -> Annex () 179restagePointerFile (Restage False) f _ = 180 toplevelWarning True $ unableToRestage $ Just $ fromRawFilePath f 181restagePointerFile (Restage True) f orig = withTSDelta $ \tsd -> 182 -- Avoid refreshing the index if run by the 183 -- smudge clean filter, because git uses that when 184 -- it's already refreshing the index, probably because 185 -- this very action is running. Running it again would likely 186 -- deadlock. 187 unlessM (Annex.getState Annex.insmudgecleanfilter) $ do 188 -- update-index is documented as picky about "./file" and it 189 -- fails on "../../repo/path/file" when cwd is not in the repo 190 -- being acted on. Avoid these problems with an absolute path. 191 absf <- liftIO $ absPath f 192 Annex.Queue.addInternalAction runner [(absf, isunmodified tsd)] 193 where 194 isunmodified tsd = genInodeCache f tsd >>= return . \case 195 Nothing -> False 196 Just new -> compareStrong orig new 197 198 -- Other changes to the files may have been staged before this 199 -- gets a chance to run. To avoid a race with any staging of 200 -- changes, first lock the index file. Then run git update-index 201 -- on all still-unmodified files, using a copy of the index file, 202 -- to bypass the lock. Then replace the old index file with the new 203 -- updated index file. 204 runner :: Git.Queue.InternalActionRunner Annex 205 runner = Git.Queue.InternalActionRunner "restagePointerFile" $ \r l -> do 206 liftIO . Database.Keys.Handle.flushDbQueue 207 =<< Annex.getRead Annex.keysdbhandle 208 realindex <- liftIO $ Git.Index.currentIndexFile r 209 let lock = fromRawFilePath (Git.Index.indexFileLock realindex) 210 lockindex = liftIO $ catchMaybeIO $ Git.LockFile.openLock' lock 211 unlockindex = liftIO . maybe noop Git.LockFile.closeLock 212 showwarning = warning $ unableToRestage Nothing 213 go Nothing = showwarning 214 go (Just _) = withTmpDirIn (fromRawFilePath $ Git.localGitDir r) "annexindex" $ \tmpdir -> do 215 let tmpindex = tmpdir </> "index" 216 let updatetmpindex = do 217 r' <- liftIO $ Git.Env.addGitEnv r Git.Index.indexEnv 218 =<< Git.Index.indexEnvVal (toRawFilePath tmpindex) 219 -- Avoid git warning about CRLF munging. 220 let r'' = r' { gitGlobalOpts = gitGlobalOpts r' ++ 221 [ Param "-c" 222 , Param $ "core.safecrlf=" ++ boolConfig False 223 ] } 224 runsGitAnnexChildProcessViaGit' r'' $ \r''' -> 225 liftIO $ Git.UpdateIndex.refreshIndex r''' $ \feed -> 226 forM_ l $ \(f', checkunmodified) -> 227 whenM checkunmodified $ 228 feed f' 229 let replaceindex = catchBoolIO $ do 230 moveFile tmpindex (fromRawFilePath realindex) 231 return True 232 ok <- liftIO (createLinkOrCopy (fromRawFilePath realindex) tmpindex) 233 <&&> updatetmpindex 234 <&&> liftIO replaceindex 235 unless ok showwarning 236 bracket lockindex unlockindex go 237 238unableToRestage :: Maybe FilePath -> String 239unableToRestage mf = unwords 240 [ "git status will show " ++ fromMaybe "some files" mf 241 , "to be modified, since content availability has changed" 242 , "and git-annex was unable to update the index." 243 , "This is only a cosmetic problem affecting git status; git add," 244 , "git commit, etc won't be affected." 245 , "To fix the git status display, you can run:" 246 , "git update-index -q --refresh " ++ fromMaybe "<file>" mf 247 ] 248 249{- Parses a symlink target or a pointer file to a Key. -} 250parseLinkTargetOrPointer :: S.ByteString -> Maybe Key 251parseLinkTargetOrPointer = parseLinkTarget . S8.takeWhile (not . lineend) 252 where 253 lineend '\n' = True 254 lineend '\r' = True 255 lineend _ = False 256 257{- Avoid looking at more of the lazy ByteString than necessary since it 258 - could be reading from a large file that is not a pointer file. -} 259parseLinkTargetOrPointerLazy :: L.ByteString -> Maybe Key 260parseLinkTargetOrPointerLazy b = 261 let b' = L.take (fromIntegral maxPointerSz) b 262 in parseLinkTargetOrPointer (L.toStrict b') 263 264{- Parses a symlink target to a Key. -} 265parseLinkTarget :: S.ByteString -> Maybe Key 266parseLinkTarget l 267 | isLinkToAnnex l = fileKey $ snd $ S8.breakEnd pathsep l 268 | otherwise = Nothing 269 where 270 pathsep '/' = True 271#ifdef mingw32_HOST_OS 272 pathsep '\\' = True 273#endif 274 pathsep _ = False 275 276formatPointer :: Key -> S.ByteString 277formatPointer k = prefix <> keyFile k <> nl 278 where 279 prefix = toInternalGitPath $ P.pathSeparator `S.cons` objectDir' 280 nl = S8.singleton '\n' 281 282{- Maximum size of a file that could be a pointer to a key. 283 - Check to avoid buffering really big files in git into 284 - memory when reading files that may be pointers. 285 - 286 - 8192 bytes is plenty for a pointer to a key. This adds some additional 287 - padding to allow for any pointer files that might have 288 - lines after the key explaining what the file is used for. -} 289maxPointerSz :: Integer 290maxPointerSz = 81920 291 292unpaddedMaxPointerSz :: Int 293unpaddedMaxPointerSz = 8192 294 295{- Checks if a worktree file is a pointer to a key. 296 - 297 - Unlocked files whose content is present are not detected by this. 298 - 299 - It's possible, though unlikely, that an annex symlink points to 300 - an object that looks like a pointer file. Or that a non-annex 301 - symlink does. Avoids a false positive in those cases. 302 - -} 303isPointerFile :: RawFilePath -> IO (Maybe Key) 304isPointerFile f = catchDefaultIO Nothing $ 305#if defined(mingw32_HOST_OS) 306 checkcontentfollowssymlinks -- no symlinks supported on windows 307#else 308#if MIN_VERSION_unix(2,8,0) 309 bracket 310 (openFd (fromRawFilePath f) ReadOnly (defaultFileFlags { nofollow = True }) Nothing) 311 closeFd 312 (\fd -> readhandle =<< fdToHandle fd) 313#else 314 ifM (isSymbolicLink <$> R.getSymbolicLinkStatus f) 315 ( return Nothing 316 , checkcontentfollowssymlinks 317 ) 318#endif 319#endif 320 where 321 checkcontentfollowssymlinks = 322 withFile (fromRawFilePath f) ReadMode readhandle 323 readhandle h = parseLinkTargetOrPointer <$> S.hGet h unpaddedMaxPointerSz 324 325{- Checks a symlink target or pointer file first line to see if it 326 - appears to point to annexed content. 327 - 328 - We only look for paths inside the .git directory, and not at the .git 329 - directory itself, because GIT_DIR may cause a directory name other 330 - than .git to be used. 331 -} 332isLinkToAnnex :: S.ByteString -> Bool 333isLinkToAnnex s = p `S.isInfixOf` s 334#ifdef mingw32_HOST_OS 335 -- '/' is used inside pointer files on Windows, not the native '\' 336 || p' `S.isInfixOf` s 337#endif 338 where 339 p = P.pathSeparator `S.cons` objectDir' 340#ifdef mingw32_HOST_OS 341 p' = toInternalGitPath p 342#endif 343