1{- git-annex links to content
2 -
3 - On file systems that support them, symlinks are used.
4 -
5 - On other filesystems, git instead stores the symlink target in a regular
6 - file.
7 -
8 - Pointer files are used instead of symlinks for unlocked files.
9 -
10 - Copyright 2013-2021 Joey Hess <id@joeyh.name>
11 -
12 - Licensed under the GNU AGPL version 3 or higher.
13 -}
14
15{-# LANGUAGE CPP, BangPatterns #-}
16
17module Annex.Link where
18
19import Annex.Common
20import qualified Annex
21import qualified Annex.Queue
22import qualified Git.Queue
23import qualified Git.UpdateIndex
24import qualified Git.Index
25import qualified Git.LockFile
26import qualified Git.Env
27import qualified Git
28import Git.Types
29import Git.FilePath
30import Git.Config
31import Annex.HashObject
32import Annex.InodeSentinal
33import Annex.PidLock
34import Utility.FileMode
35import Utility.InodeCache
36import Utility.Tmp.Dir
37import Utility.CopyFile
38import qualified Database.Keys.Handle
39import qualified Utility.RawFilePath as R
40
41import qualified Data.ByteString as S
42import qualified Data.ByteString.Char8 as S8
43import qualified Data.ByteString.Lazy as L
44import qualified System.FilePath.ByteString as P
45
46type LinkTarget = S.ByteString
47
48{- Checks if a file is a link to a key. -}
49isAnnexLink :: RawFilePath -> Annex (Maybe Key)
50isAnnexLink file = maybe Nothing parseLinkTargetOrPointer <$> getAnnexLinkTarget file
51
52{- Gets the link target of a symlink.
53 -
54 - On a filesystem that does not support symlinks, fall back to getting the
55 - link target by looking inside the file.
56 -
57 - Returns Nothing if the file is not a symlink, or not a link to annex
58 - content.
59 -}
60getAnnexLinkTarget :: RawFilePath -> Annex (Maybe LinkTarget)
61getAnnexLinkTarget f = getAnnexLinkTarget' f
62	=<< (coreSymlinks <$> Annex.getGitConfig)
63
64{- Pass False to force looking inside file, for when git checks out
65 - symlinks as plain files. -}
66getAnnexLinkTarget' :: RawFilePath -> Bool -> Annex (Maybe S.ByteString)
67getAnnexLinkTarget' file coresymlinks = if coresymlinks
68	then check probesymlink $
69		return Nothing
70	else check probesymlink $
71		check probefilecontent $
72			return Nothing
73  where
74	check getlinktarget fallback =
75		liftIO (catchMaybeIO getlinktarget) >>= \case
76			Just l
77				| isLinkToAnnex l -> return (Just l)
78				| otherwise -> return Nothing
79			Nothing -> fallback
80
81	probesymlink = R.readSymbolicLink file
82
83	probefilecontent = withFile (fromRawFilePath file) ReadMode $ \h -> do
84		s <- S.hGet h unpaddedMaxPointerSz
85		-- If we got the full amount, the file is too large
86		-- to be a symlink target.
87		return $ if S.length s == unpaddedMaxPointerSz
88			then mempty
89			else
90				-- If there are any NUL or newline
91				-- characters, or whitespace, we
92				-- certianly don't have a symlink to a
93				-- git-annex key.
94				if any (`S8.elem` s) "\0\n\r \t"
95					then mempty
96					else s
97
98makeAnnexLink :: LinkTarget -> RawFilePath -> Annex ()
99makeAnnexLink = makeGitLink
100
101{- Creates a link on disk.
102 -
103 - On a filesystem that does not support symlinks, writes the link target
104 - to a file. Note that git will only treat the file as a symlink if
105 - it's staged as such, so use addAnnexLink when adding a new file or
106 - modified link to git.
107 -}
108makeGitLink :: LinkTarget -> RawFilePath -> Annex ()
109makeGitLink linktarget file = ifM (coreSymlinks <$> Annex.getGitConfig)
110	( liftIO $ do
111		void $ tryIO $ R.removeLink file
112		R.createSymbolicLink linktarget file
113	, liftIO $ S.writeFile (fromRawFilePath file) linktarget
114	)
115
116{- Creates a link on disk, and additionally stages it in git. -}
117addAnnexLink :: LinkTarget -> RawFilePath -> Annex ()
118addAnnexLink linktarget file = do
119	makeAnnexLink linktarget file
120	stageSymlink file =<< hashSymlink linktarget
121
122{- Injects a symlink target into git, returning its Sha. -}
123hashSymlink :: LinkTarget -> Annex Sha
124hashSymlink = hashBlob . toInternalGitPath
125
126{- Stages a symlink to an annexed object, using a Sha of its target. -}
127stageSymlink :: RawFilePath -> Sha -> Annex ()
128stageSymlink file sha =
129	Annex.Queue.addUpdateIndex =<<
130		inRepo (Git.UpdateIndex.stageSymlink file sha)
131
132{- Injects a pointer file content into git, returning its Sha. -}
133hashPointerFile :: Key -> Annex Sha
134hashPointerFile key = hashBlob $ formatPointer key
135
136{- Stages a pointer file, using a Sha of its content -}
137stagePointerFile :: RawFilePath -> Maybe FileMode -> Sha -> Annex ()
138stagePointerFile file mode sha =
139	Annex.Queue.addUpdateIndex =<<
140		inRepo (Git.UpdateIndex.stageFile sha treeitemtype $ fromRawFilePath file)
141  where
142	treeitemtype
143		| maybe False isExecutable mode = TreeExecutable
144		| otherwise = TreeFile
145
146writePointerFile :: RawFilePath -> Key -> Maybe FileMode -> IO ()
147writePointerFile file k mode = do
148	S.writeFile (fromRawFilePath file) (formatPointer k)
149	maybe noop (setFileMode $ fromRawFilePath file) mode
150
151newtype Restage = Restage Bool
152
153{- Restage pointer file. This is used after updating a worktree file
154 - when content is added/removed, to prevent git status from showing
155 - it as modified.
156 -
157 - Asks git to refresh its index information for the file.
158 - That in turn runs the clean filter on the file; when the clean
159 - filter produces the same pointer that was in the index before, git
160 - realizes that the file has not actually been modified.
161 -
162 - Note that, if the pointer file is staged for deletion, or has different
163 - content than the current worktree content staged, this won't change
164 - that. So it's safe to call at any time and any situation.
165 -
166 - If the index is known to be locked (eg, git add has run git-annex),
167 - that would fail. Restage False will prevent the index being updated.
168 - Will display a message to help the user understand why
169 - the file will appear to be modified.
170 -
171 - This uses the git queue, so the update is not performed immediately,
172 - and this can be run multiple times cheaply.
173 -
174 - The InodeCache is for the worktree file. It is used to detect when
175 - the worktree file is changed by something else before git update-index
176 - gets to look at it.
177 -}
178restagePointerFile :: Restage -> RawFilePath -> InodeCache -> Annex ()
179restagePointerFile (Restage False) f _ =
180	toplevelWarning True $ unableToRestage $ Just $ fromRawFilePath f
181restagePointerFile (Restage True) f orig = withTSDelta $ \tsd ->
182	-- Avoid refreshing the index if run by the
183	-- smudge clean filter, because git uses that when
184	-- it's already refreshing the index, probably because
185	-- this very action is running. Running it again would likely
186	-- deadlock.
187	unlessM (Annex.getState Annex.insmudgecleanfilter) $ do
188		-- update-index is documented as picky about "./file" and it
189		-- fails on "../../repo/path/file" when cwd is not in the repo
190		-- being acted on. Avoid these problems with an absolute path.
191		absf <- liftIO $ absPath f
192		Annex.Queue.addInternalAction runner [(absf, isunmodified tsd)]
193  where
194	isunmodified tsd = genInodeCache f tsd >>= return . \case
195		Nothing -> False
196		Just new -> compareStrong orig new
197
198	-- Other changes to the files may have been staged before this
199	-- gets a chance to run. To avoid a race with any staging of
200	-- changes, first lock the index file. Then run git update-index
201	-- on all still-unmodified files, using a copy of the index file,
202	-- to bypass the lock. Then replace the old index file with the new
203	-- updated index file.
204	runner :: Git.Queue.InternalActionRunner Annex
205	runner = Git.Queue.InternalActionRunner "restagePointerFile" $ \r l -> do
206		liftIO . Database.Keys.Handle.flushDbQueue
207			=<< Annex.getRead Annex.keysdbhandle
208		realindex <- liftIO $ Git.Index.currentIndexFile r
209		let lock = fromRawFilePath (Git.Index.indexFileLock realindex)
210		    lockindex = liftIO $ catchMaybeIO $ Git.LockFile.openLock' lock
211		    unlockindex = liftIO . maybe noop Git.LockFile.closeLock
212		    showwarning = warning $ unableToRestage Nothing
213		    go Nothing = showwarning
214		    go (Just _) = withTmpDirIn (fromRawFilePath $ Git.localGitDir r) "annexindex" $ \tmpdir -> do
215			let tmpindex = tmpdir </> "index"
216			let updatetmpindex = do
217				r' <- liftIO $ Git.Env.addGitEnv r Git.Index.indexEnv
218					=<< Git.Index.indexEnvVal (toRawFilePath tmpindex)
219				-- Avoid git warning about CRLF munging.
220				let r'' = r' { gitGlobalOpts = gitGlobalOpts r' ++
221					[ Param "-c"
222					, Param $ "core.safecrlf=" ++ boolConfig False
223					] }
224				runsGitAnnexChildProcessViaGit' r'' $ \r''' ->
225					liftIO $ Git.UpdateIndex.refreshIndex r''' $ \feed ->
226						forM_ l $ \(f', checkunmodified) ->
227							whenM checkunmodified $
228								feed f'
229			let replaceindex = catchBoolIO $ do
230				moveFile tmpindex (fromRawFilePath realindex)
231				return True
232			ok <- liftIO (createLinkOrCopy (fromRawFilePath realindex) tmpindex)
233				<&&> updatetmpindex
234				<&&> liftIO replaceindex
235			unless ok showwarning
236		bracket lockindex unlockindex go
237
238unableToRestage :: Maybe FilePath -> String
239unableToRestage mf = unwords
240	[ "git status will show " ++ fromMaybe "some files" mf
241	, "to be modified, since content availability has changed"
242	, "and git-annex was unable to update the index."
243	, "This is only a cosmetic problem affecting git status; git add,"
244	, "git commit, etc won't be affected."
245	, "To fix the git status display, you can run:"
246	, "git update-index -q --refresh " ++ fromMaybe "<file>" mf
247	]
248
249{- Parses a symlink target or a pointer file to a Key. -}
250parseLinkTargetOrPointer :: S.ByteString -> Maybe Key
251parseLinkTargetOrPointer = parseLinkTarget . S8.takeWhile (not . lineend)
252  where
253	lineend '\n' = True
254	lineend '\r' = True
255	lineend _ = False
256
257{- Avoid looking at more of the lazy ByteString than necessary since it
258 - could be reading from a large file that is not a pointer file. -}
259parseLinkTargetOrPointerLazy :: L.ByteString -> Maybe Key
260parseLinkTargetOrPointerLazy b =
261	let b' = L.take (fromIntegral maxPointerSz) b
262	in parseLinkTargetOrPointer (L.toStrict b')
263
264{- Parses a symlink target to a Key. -}
265parseLinkTarget :: S.ByteString -> Maybe Key
266parseLinkTarget l
267	| isLinkToAnnex l = fileKey $ snd $ S8.breakEnd pathsep l
268	| otherwise = Nothing
269  where
270	pathsep '/' = True
271#ifdef mingw32_HOST_OS
272	pathsep '\\' = True
273#endif
274	pathsep _ = False
275
276formatPointer :: Key -> S.ByteString
277formatPointer k = prefix <> keyFile k <> nl
278  where
279	prefix = toInternalGitPath $ P.pathSeparator `S.cons` objectDir'
280	nl = S8.singleton '\n'
281
282{- Maximum size of a file that could be a pointer to a key.
283 - Check to avoid buffering really big files in git into
284 - memory when reading files that may be pointers.
285 -
286 - 8192 bytes is plenty for a pointer to a key. This adds some additional
287 - padding to allow for any pointer files that might have
288 - lines after the key explaining what the file is used for. -}
289maxPointerSz :: Integer
290maxPointerSz = 81920
291
292unpaddedMaxPointerSz :: Int
293unpaddedMaxPointerSz = 8192
294
295{- Checks if a worktree file is a pointer to a key.
296 -
297 - Unlocked files whose content is present are not detected by this.
298 -
299 - It's possible, though unlikely, that an annex symlink points to
300 - an object that looks like a pointer file. Or that a non-annex
301 - symlink does. Avoids a false positive in those cases.
302 - -}
303isPointerFile :: RawFilePath -> IO (Maybe Key)
304isPointerFile f = catchDefaultIO Nothing $
305#if defined(mingw32_HOST_OS)
306	checkcontentfollowssymlinks -- no symlinks supported on windows
307#else
308#if MIN_VERSION_unix(2,8,0)
309	bracket
310		(openFd (fromRawFilePath f) ReadOnly (defaultFileFlags { nofollow = True }) Nothing)
311		closeFd
312		(\fd -> readhandle =<< fdToHandle fd)
313#else
314	ifM (isSymbolicLink <$> R.getSymbolicLinkStatus f)
315		( return Nothing
316		, checkcontentfollowssymlinks
317		)
318#endif
319#endif
320  where
321	checkcontentfollowssymlinks =
322		withFile (fromRawFilePath f) ReadMode readhandle
323	readhandle h = parseLinkTargetOrPointer <$> S.hGet h unpaddedMaxPointerSz
324
325{- Checks a symlink target or pointer file first line to see if it
326 - appears to point to annexed content.
327 -
328 - We only look for paths inside the .git directory, and not at the .git
329 - directory itself, because GIT_DIR may cause a directory name other
330 - than .git to be used.
331 -}
332isLinkToAnnex :: S.ByteString -> Bool
333isLinkToAnnex s = p `S.isInfixOf` s
334#ifdef mingw32_HOST_OS
335	-- '/' is used inside pointer files on Windows, not the native '\'
336	|| p' `S.isInfixOf` s
337#endif
338  where
339	p = P.pathSeparator `S.cons` objectDir'
340#ifdef mingw32_HOST_OS
341	p' = toInternalGitPath p
342#endif
343