1{- youtube-dl integration for git-annex
2 -
3 - Copyright 2017-2021 Joey Hess <id@joeyh.name>
4 -
5 - Licensed under the GNU AGPL version 3 or higher.
6 -}
7
8module Annex.YoutubeDl (
9	youtubeDl,
10	youtubeDlTo,
11	youtubeDlSupported,
12	youtubeDlCheck,
13	youtubeDlFileName,
14	youtubeDlFileNameHtmlOnly,
15) where
16
17import Annex.Common
18import qualified Annex
19import Annex.Content
20import Annex.Url
21import Utility.DiskFree
22import Utility.HtmlDetect
23import Utility.Process.Transcript
24import Utility.Metered
25import Utility.DataUnits
26import Messages.Progress
27import Logs.Transfer
28
29import Network.URI
30import Control.Concurrent.Async
31import Data.Char
32import Text.Read
33
34-- youtube-dl can follow redirects to anywhere, including potentially
35-- localhost or a private address. So, it's only allowed to download
36-- content if the user has allowed access to all addresses.
37youtubeDlAllowed :: Annex Bool
38youtubeDlAllowed = ipAddressesUnlimited
39
40youtubeDlNotAllowedMessage :: String
41youtubeDlNotAllowedMessage = unwords
42	[ "This url is supported by youtube-dl, but"
43	, "youtube-dl could potentially access any address, and the"
44	, "configuration of annex.security.allowed-ip-addresses"
45	, "does not allow that. Not using youtube-dl."
46	]
47
48-- Runs youtube-dl in a work directory, to download a single media file
49-- from the url. Reutrns the path to the media file in the work directory.
50--
51-- Displays a progress meter as youtube-dl downloads.
52--
53-- If youtube-dl fails without writing any files to the work directory,
54-- or is not installed, returns Right Nothing.
55--
56-- The work directory can contain files from a previous run of youtube-dl
57-- and it will resume. It should not contain any other files though,
58-- and youtube-dl needs to finish up with only one file in the directory
59-- so we know which one it downloaded.
60--
61-- (Note that we can't use --output to specifiy the file to download to,
62-- due to <https://github.com/rg3/youtube-dl/issues/14864>)
63youtubeDl :: URLString -> FilePath -> MeterUpdate -> Annex (Either String (Maybe FilePath))
64youtubeDl url workdir p = ifM ipAddressesUnlimited
65	( withUrlOptions $ youtubeDl' url workdir p
66	, return $ Left youtubeDlNotAllowedMessage
67	)
68
69youtubeDl' :: URLString -> FilePath -> MeterUpdate -> UrlOptions -> Annex (Either String (Maybe FilePath))
70youtubeDl' url workdir p uo
71	| supportedScheme uo url = ifM (liftIO . inSearchPath =<< youtubeDlCommand)
72		( runcmd >>= \case
73			Right True -> workdirfiles >>= \case
74				(f:[]) -> return (Right (Just f))
75				[] -> return nofiles
76				fs -> return (toomanyfiles fs)
77			Right False -> workdirfiles >>= \case
78				[] -> return (Right Nothing)
79				_ -> return (Left "youtube-dl download is incomplete. Run the command again to resume.")
80			Left msg -> return (Left msg)
81		, return (Right Nothing)
82		)
83	| otherwise = return (Right Nothing)
84  where
85	nofiles = Left "youtube-dl did not put any media in its work directory, perhaps it's been configured to store files somewhere else?"
86	toomanyfiles fs = Left $ "youtube-dl downloaded multiple media files; git-annex is only able to deal with one per url: " ++ show fs
87	workdirfiles = liftIO $ filterM (doesFileExist) =<< dirContents workdir
88	runcmd = youtubeDlMaxSize workdir >>= \case
89		Left msg -> return (Left msg)
90		Right maxsize -> do
91			cmd <- youtubeDlCommand
92			opts <- youtubeDlOpts (dlopts ++ maxsize)
93			oh <- mkOutputHandlerQuiet
94			-- The size is unknown to start. Once youtube-dl
95			-- outputs some progress, the meter will be updated
96			-- with the size, which is why it's important the
97			-- meter is passed into commandMeter'
98			let unknownsize = Nothing :: Maybe FileSize
99			ok <- metered (Just p) unknownsize $ \meter meterupdate ->
100				liftIO $ commandMeter'
101					parseYoutubeDlProgress oh (Just meter) meterupdate cmd opts
102					(\pr -> pr { cwd = Just workdir })
103			return (Right ok)
104	dlopts =
105		[ Param url
106		-- To make youtube-dl only download one file when given a
107		-- page with a video and a playlist, download only the video.
108		, Param "--no-playlist"
109		-- And when given a page with only a playlist, download only
110		-- the first video on the playlist. (Assumes the video is
111		-- somewhat stable, but this is the only way to prevent
112		-- youtube-dl from downloading the whole playlist.)
113		, Param "--playlist-items", Param "0"
114		]
115
116-- To honor annex.diskreserve, ask youtube-dl to not download too
117-- large a media file. Factors in other downloads that are in progress,
118-- and any files in the workdir that it may have partially downloaded
119-- before.
120youtubeDlMaxSize :: FilePath -> Annex (Either String [CommandParam])
121youtubeDlMaxSize workdir = ifM (Annex.getState Annex.force)
122	( return $ Right []
123	, liftIO (getDiskFree workdir) >>= \case
124		Just have -> do
125			inprogress <- sizeOfDownloadsInProgress (const True)
126			partial <- liftIO $ sum
127				<$> (mapM (getFileSize . toRawFilePath) =<< dirContents workdir)
128			reserve <- annexDiskReserve <$> Annex.getGitConfig
129			let maxsize = have - reserve - inprogress + partial
130			if maxsize > 0
131				then return $ Right
132					[ Param "--max-filesize"
133					, Param (show maxsize)
134					]
135				else return $ Left $
136					needMoreDiskSpace $
137						negate maxsize + 1024
138		Nothing -> return $ Right []
139	)
140
141-- Download a media file to a destination,
142youtubeDlTo :: Key -> URLString -> FilePath -> MeterUpdate -> Annex Bool
143youtubeDlTo key url dest p = do
144	res <- withTmpWorkDir key $ \workdir ->
145		youtubeDl url (fromRawFilePath workdir) p >>= \case
146			Right (Just mediafile) -> do
147				liftIO $ renameFile mediafile dest
148				return (Just True)
149			Right Nothing -> return (Just False)
150			Left msg -> do
151				warning msg
152				return Nothing
153	return (fromMaybe False res)
154
155-- youtube-dl supports downloading urls that are not html pages,
156-- but we don't want to use it for such urls, since they can be downloaded
157-- without it. So, this first downloads part of the content and checks
158-- if it's a html page; only then is youtube-dl used.
159htmlOnly :: URLString -> a -> Annex a -> Annex a
160htmlOnly url fallback a = withUrlOptions $ \uo ->
161	liftIO (downloadPartial url uo htmlPrefixLength) >>= \case
162		Just bs | isHtmlBs bs -> a
163		_ -> return fallback
164
165-- Check if youtube-dl supports downloading content from an url.
166youtubeDlSupported :: URLString -> Annex Bool
167youtubeDlSupported url = either (const False) id
168	<$> withUrlOptions (youtubeDlCheck' url)
169
170-- Check if youtube-dl can find media in an url.
171--
172-- While this does not download anything, it checks youtubeDlAllowed
173-- for symmetry with youtubeDl; the check should not succeed if the
174-- download won't succeed.
175youtubeDlCheck :: URLString -> Annex (Either String Bool)
176youtubeDlCheck url = ifM youtubeDlAllowed
177	( withUrlOptions $ youtubeDlCheck' url
178	, return $ Left youtubeDlNotAllowedMessage
179	)
180
181youtubeDlCheck' :: URLString -> UrlOptions -> Annex (Either String Bool)
182youtubeDlCheck' url uo
183	| supportedScheme uo url = catchMsgIO $ htmlOnly url False $ do
184		opts <- youtubeDlOpts [ Param url, Param "--simulate" ]
185		cmd <- youtubeDlCommand
186		liftIO $ snd <$> processTranscript cmd (toCommand opts) Nothing
187	| otherwise = return (Right False)
188
189-- Ask youtube-dl for the filename of media in an url.
190--
191-- (This is not always identical to the filename it uses when downloading.)
192youtubeDlFileName :: URLString -> Annex (Either String FilePath)
193youtubeDlFileName url = withUrlOptions go
194  where
195	go uo
196		| supportedScheme uo url = flip catchIO (pure . Left . show) $
197			htmlOnly url nomedia (youtubeDlFileNameHtmlOnly' url uo)
198		| otherwise = return nomedia
199	nomedia = Left "no media in url"
200
201-- Does not check if the url contains htmlOnly; use when that's already
202-- been verified.
203youtubeDlFileNameHtmlOnly :: URLString -> Annex (Either String FilePath)
204youtubeDlFileNameHtmlOnly = withUrlOptions . youtubeDlFileNameHtmlOnly'
205
206youtubeDlFileNameHtmlOnly' :: URLString -> UrlOptions -> Annex (Either String FilePath)
207youtubeDlFileNameHtmlOnly' url uo
208	| supportedScheme uo url = flip catchIO (pure . Left . show) go
209	| otherwise = return nomedia
210  where
211	go = do
212		-- Sometimes youtube-dl will fail with an ugly backtrace
213		-- (eg, http://bugs.debian.org/874321)
214		-- so catch stderr as well as stdout to avoid the user
215		-- seeing it. --no-warnings avoids warning messages that
216		-- are output to stdout.
217		opts <- youtubeDlOpts
218			[ Param url
219			, Param "--get-filename"
220			, Param "--no-warnings"
221			, Param "--no-playlist"
222			]
223		cmd <- youtubeDlCommand
224		let p = (proc cmd (toCommand opts))
225			{ std_out = CreatePipe
226			, std_err = CreatePipe
227			}
228		liftIO $ withCreateProcess p waitproc
229
230	waitproc Nothing (Just o) (Just e) pid = do
231		errt <- async $ discardstderr pid e
232		output <- hGetContentsStrict o
233		ok <- liftIO $ checkSuccessProcess pid
234		wait errt
235		return $ case (ok, lines output) of
236			(True, (f:_)) | not (null f) -> Right f
237			_ -> nomedia
238	waitproc _ _ _ _ = error "internal"
239
240	discardstderr pid e = hGetLineUntilExitOrEOF pid e >>= \case
241		Nothing -> return ()
242		Just _ -> discardstderr pid e
243
244	nomedia = Left "no media in url"
245
246youtubeDlOpts :: [CommandParam] -> Annex [CommandParam]
247youtubeDlOpts addopts = do
248	opts <- map Param . annexYoutubeDlOptions <$> Annex.getGitConfig
249	return (opts ++ addopts)
250
251youtubeDlCommand :: Annex String
252youtubeDlCommand = fromMaybe "yooutube-dl" . annexYoutubeDlCommand
253	<$> Annex.getGitConfig
254
255supportedScheme :: UrlOptions -> URLString -> Bool
256supportedScheme uo url = case parseURIRelaxed url of
257	Nothing -> False
258	Just u -> case uriScheme u of
259		-- avoid ugly message from youtube-dl about not supporting file:
260		"file:" -> False
261		-- ftp indexes may look like html pages, and there's no point
262		-- involving youtube-dl in a ftp download
263		"ftp:" -> False
264		_ -> allowedScheme uo u
265
266{- Strategy: Look for chunks prefixed with \r, which look approximately
267 - like this:
268 - "ESC[K[download]  26.6% of 60.22MiB at 254.69MiB/s ETA 00:00"
269 - Look at the number before "% of " and the number and unit after,
270 - to determine the number of bytes.
271 -}
272parseYoutubeDlProgress :: ProgressParser
273parseYoutubeDlProgress = go [] . reverse . progresschunks
274  where
275	delim = '\r'
276
277	progresschunks = drop 1 . splitc delim
278
279	go remainder [] = (Nothing, Nothing, remainder)
280	go remainder (x:xs) = case split "% of " x of
281		(p:r:[]) -> case (parsepercent p, parsebytes r) of
282			(Just percent, Just total) ->
283				( Just (toBytesProcessed (calc percent total))
284				, Just (TotalSize total)
285				, remainder
286				)
287			_ -> go (delim:x++remainder) xs
288		_ -> go (delim:x++remainder) xs
289
290	calc :: Double -> Integer -> Integer
291	calc percent total = round (percent * fromIntegral total / 100)
292
293	parsepercent :: String -> Maybe Double
294	parsepercent = readMaybe . reverse . takeWhile (not . isSpace) . reverse
295
296	parsebytes = readSize units . takeWhile (not . isSpace)
297
298	units = memoryUnits ++ storageUnits
299