1{- youtube-dl integration for git-annex 2 - 3 - Copyright 2017-2021 Joey Hess <id@joeyh.name> 4 - 5 - Licensed under the GNU AGPL version 3 or higher. 6 -} 7 8module Annex.YoutubeDl ( 9 youtubeDl, 10 youtubeDlTo, 11 youtubeDlSupported, 12 youtubeDlCheck, 13 youtubeDlFileName, 14 youtubeDlFileNameHtmlOnly, 15) where 16 17import Annex.Common 18import qualified Annex 19import Annex.Content 20import Annex.Url 21import Utility.DiskFree 22import Utility.HtmlDetect 23import Utility.Process.Transcript 24import Utility.Metered 25import Utility.DataUnits 26import Messages.Progress 27import Logs.Transfer 28 29import Network.URI 30import Control.Concurrent.Async 31import Data.Char 32import Text.Read 33 34-- youtube-dl can follow redirects to anywhere, including potentially 35-- localhost or a private address. So, it's only allowed to download 36-- content if the user has allowed access to all addresses. 37youtubeDlAllowed :: Annex Bool 38youtubeDlAllowed = ipAddressesUnlimited 39 40youtubeDlNotAllowedMessage :: String 41youtubeDlNotAllowedMessage = unwords 42 [ "This url is supported by youtube-dl, but" 43 , "youtube-dl could potentially access any address, and the" 44 , "configuration of annex.security.allowed-ip-addresses" 45 , "does not allow that. Not using youtube-dl." 46 ] 47 48-- Runs youtube-dl in a work directory, to download a single media file 49-- from the url. Reutrns the path to the media file in the work directory. 50-- 51-- Displays a progress meter as youtube-dl downloads. 52-- 53-- If youtube-dl fails without writing any files to the work directory, 54-- or is not installed, returns Right Nothing. 55-- 56-- The work directory can contain files from a previous run of youtube-dl 57-- and it will resume. It should not contain any other files though, 58-- and youtube-dl needs to finish up with only one file in the directory 59-- so we know which one it downloaded. 60-- 61-- (Note that we can't use --output to specifiy the file to download to, 62-- due to <https://github.com/rg3/youtube-dl/issues/14864>) 63youtubeDl :: URLString -> FilePath -> MeterUpdate -> Annex (Either String (Maybe FilePath)) 64youtubeDl url workdir p = ifM ipAddressesUnlimited 65 ( withUrlOptions $ youtubeDl' url workdir p 66 , return $ Left youtubeDlNotAllowedMessage 67 ) 68 69youtubeDl' :: URLString -> FilePath -> MeterUpdate -> UrlOptions -> Annex (Either String (Maybe FilePath)) 70youtubeDl' url workdir p uo 71 | supportedScheme uo url = ifM (liftIO . inSearchPath =<< youtubeDlCommand) 72 ( runcmd >>= \case 73 Right True -> workdirfiles >>= \case 74 (f:[]) -> return (Right (Just f)) 75 [] -> return nofiles 76 fs -> return (toomanyfiles fs) 77 Right False -> workdirfiles >>= \case 78 [] -> return (Right Nothing) 79 _ -> return (Left "youtube-dl download is incomplete. Run the command again to resume.") 80 Left msg -> return (Left msg) 81 , return (Right Nothing) 82 ) 83 | otherwise = return (Right Nothing) 84 where 85 nofiles = Left "youtube-dl did not put any media in its work directory, perhaps it's been configured to store files somewhere else?" 86 toomanyfiles fs = Left $ "youtube-dl downloaded multiple media files; git-annex is only able to deal with one per url: " ++ show fs 87 workdirfiles = liftIO $ filterM (doesFileExist) =<< dirContents workdir 88 runcmd = youtubeDlMaxSize workdir >>= \case 89 Left msg -> return (Left msg) 90 Right maxsize -> do 91 cmd <- youtubeDlCommand 92 opts <- youtubeDlOpts (dlopts ++ maxsize) 93 oh <- mkOutputHandlerQuiet 94 -- The size is unknown to start. Once youtube-dl 95 -- outputs some progress, the meter will be updated 96 -- with the size, which is why it's important the 97 -- meter is passed into commandMeter' 98 let unknownsize = Nothing :: Maybe FileSize 99 ok <- metered (Just p) unknownsize $ \meter meterupdate -> 100 liftIO $ commandMeter' 101 parseYoutubeDlProgress oh (Just meter) meterupdate cmd opts 102 (\pr -> pr { cwd = Just workdir }) 103 return (Right ok) 104 dlopts = 105 [ Param url 106 -- To make youtube-dl only download one file when given a 107 -- page with a video and a playlist, download only the video. 108 , Param "--no-playlist" 109 -- And when given a page with only a playlist, download only 110 -- the first video on the playlist. (Assumes the video is 111 -- somewhat stable, but this is the only way to prevent 112 -- youtube-dl from downloading the whole playlist.) 113 , Param "--playlist-items", Param "0" 114 ] 115 116-- To honor annex.diskreserve, ask youtube-dl to not download too 117-- large a media file. Factors in other downloads that are in progress, 118-- and any files in the workdir that it may have partially downloaded 119-- before. 120youtubeDlMaxSize :: FilePath -> Annex (Either String [CommandParam]) 121youtubeDlMaxSize workdir = ifM (Annex.getState Annex.force) 122 ( return $ Right [] 123 , liftIO (getDiskFree workdir) >>= \case 124 Just have -> do 125 inprogress <- sizeOfDownloadsInProgress (const True) 126 partial <- liftIO $ sum 127 <$> (mapM (getFileSize . toRawFilePath) =<< dirContents workdir) 128 reserve <- annexDiskReserve <$> Annex.getGitConfig 129 let maxsize = have - reserve - inprogress + partial 130 if maxsize > 0 131 then return $ Right 132 [ Param "--max-filesize" 133 , Param (show maxsize) 134 ] 135 else return $ Left $ 136 needMoreDiskSpace $ 137 negate maxsize + 1024 138 Nothing -> return $ Right [] 139 ) 140 141-- Download a media file to a destination, 142youtubeDlTo :: Key -> URLString -> FilePath -> MeterUpdate -> Annex Bool 143youtubeDlTo key url dest p = do 144 res <- withTmpWorkDir key $ \workdir -> 145 youtubeDl url (fromRawFilePath workdir) p >>= \case 146 Right (Just mediafile) -> do 147 liftIO $ renameFile mediafile dest 148 return (Just True) 149 Right Nothing -> return (Just False) 150 Left msg -> do 151 warning msg 152 return Nothing 153 return (fromMaybe False res) 154 155-- youtube-dl supports downloading urls that are not html pages, 156-- but we don't want to use it for such urls, since they can be downloaded 157-- without it. So, this first downloads part of the content and checks 158-- if it's a html page; only then is youtube-dl used. 159htmlOnly :: URLString -> a -> Annex a -> Annex a 160htmlOnly url fallback a = withUrlOptions $ \uo -> 161 liftIO (downloadPartial url uo htmlPrefixLength) >>= \case 162 Just bs | isHtmlBs bs -> a 163 _ -> return fallback 164 165-- Check if youtube-dl supports downloading content from an url. 166youtubeDlSupported :: URLString -> Annex Bool 167youtubeDlSupported url = either (const False) id 168 <$> withUrlOptions (youtubeDlCheck' url) 169 170-- Check if youtube-dl can find media in an url. 171-- 172-- While this does not download anything, it checks youtubeDlAllowed 173-- for symmetry with youtubeDl; the check should not succeed if the 174-- download won't succeed. 175youtubeDlCheck :: URLString -> Annex (Either String Bool) 176youtubeDlCheck url = ifM youtubeDlAllowed 177 ( withUrlOptions $ youtubeDlCheck' url 178 , return $ Left youtubeDlNotAllowedMessage 179 ) 180 181youtubeDlCheck' :: URLString -> UrlOptions -> Annex (Either String Bool) 182youtubeDlCheck' url uo 183 | supportedScheme uo url = catchMsgIO $ htmlOnly url False $ do 184 opts <- youtubeDlOpts [ Param url, Param "--simulate" ] 185 cmd <- youtubeDlCommand 186 liftIO $ snd <$> processTranscript cmd (toCommand opts) Nothing 187 | otherwise = return (Right False) 188 189-- Ask youtube-dl for the filename of media in an url. 190-- 191-- (This is not always identical to the filename it uses when downloading.) 192youtubeDlFileName :: URLString -> Annex (Either String FilePath) 193youtubeDlFileName url = withUrlOptions go 194 where 195 go uo 196 | supportedScheme uo url = flip catchIO (pure . Left . show) $ 197 htmlOnly url nomedia (youtubeDlFileNameHtmlOnly' url uo) 198 | otherwise = return nomedia 199 nomedia = Left "no media in url" 200 201-- Does not check if the url contains htmlOnly; use when that's already 202-- been verified. 203youtubeDlFileNameHtmlOnly :: URLString -> Annex (Either String FilePath) 204youtubeDlFileNameHtmlOnly = withUrlOptions . youtubeDlFileNameHtmlOnly' 205 206youtubeDlFileNameHtmlOnly' :: URLString -> UrlOptions -> Annex (Either String FilePath) 207youtubeDlFileNameHtmlOnly' url uo 208 | supportedScheme uo url = flip catchIO (pure . Left . show) go 209 | otherwise = return nomedia 210 where 211 go = do 212 -- Sometimes youtube-dl will fail with an ugly backtrace 213 -- (eg, http://bugs.debian.org/874321) 214 -- so catch stderr as well as stdout to avoid the user 215 -- seeing it. --no-warnings avoids warning messages that 216 -- are output to stdout. 217 opts <- youtubeDlOpts 218 [ Param url 219 , Param "--get-filename" 220 , Param "--no-warnings" 221 , Param "--no-playlist" 222 ] 223 cmd <- youtubeDlCommand 224 let p = (proc cmd (toCommand opts)) 225 { std_out = CreatePipe 226 , std_err = CreatePipe 227 } 228 liftIO $ withCreateProcess p waitproc 229 230 waitproc Nothing (Just o) (Just e) pid = do 231 errt <- async $ discardstderr pid e 232 output <- hGetContentsStrict o 233 ok <- liftIO $ checkSuccessProcess pid 234 wait errt 235 return $ case (ok, lines output) of 236 (True, (f:_)) | not (null f) -> Right f 237 _ -> nomedia 238 waitproc _ _ _ _ = error "internal" 239 240 discardstderr pid e = hGetLineUntilExitOrEOF pid e >>= \case 241 Nothing -> return () 242 Just _ -> discardstderr pid e 243 244 nomedia = Left "no media in url" 245 246youtubeDlOpts :: [CommandParam] -> Annex [CommandParam] 247youtubeDlOpts addopts = do 248 opts <- map Param . annexYoutubeDlOptions <$> Annex.getGitConfig 249 return (opts ++ addopts) 250 251youtubeDlCommand :: Annex String 252youtubeDlCommand = fromMaybe "yooutube-dl" . annexYoutubeDlCommand 253 <$> Annex.getGitConfig 254 255supportedScheme :: UrlOptions -> URLString -> Bool 256supportedScheme uo url = case parseURIRelaxed url of 257 Nothing -> False 258 Just u -> case uriScheme u of 259 -- avoid ugly message from youtube-dl about not supporting file: 260 "file:" -> False 261 -- ftp indexes may look like html pages, and there's no point 262 -- involving youtube-dl in a ftp download 263 "ftp:" -> False 264 _ -> allowedScheme uo u 265 266{- Strategy: Look for chunks prefixed with \r, which look approximately 267 - like this: 268 - "ESC[K[download] 26.6% of 60.22MiB at 254.69MiB/s ETA 00:00" 269 - Look at the number before "% of " and the number and unit after, 270 - to determine the number of bytes. 271 -} 272parseYoutubeDlProgress :: ProgressParser 273parseYoutubeDlProgress = go [] . reverse . progresschunks 274 where 275 delim = '\r' 276 277 progresschunks = drop 1 . splitc delim 278 279 go remainder [] = (Nothing, Nothing, remainder) 280 go remainder (x:xs) = case split "% of " x of 281 (p:r:[]) -> case (parsepercent p, parsebytes r) of 282 (Just percent, Just total) -> 283 ( Just (toBytesProcessed (calc percent total)) 284 , Just (TotalSize total) 285 , remainder 286 ) 287 _ -> go (delim:x++remainder) xs 288 _ -> go (delim:x++remainder) xs 289 290 calc :: Double -> Integer -> Integer 291 calc percent total = round (percent * fromIntegral total / 100) 292 293 parsepercent :: String -> Maybe Double 294 parsepercent = readMaybe . reverse . takeWhile (not . isSpace) . reverse 295 296 parsebytes = readSize units . takeWhile (not . isSpace) 297 298 units = memoryUnits ++ storageUnits 299