{- html detection
 -
 - Copyright 2017-2021 Joey Hess <id@joeyh.name>
 -
 - License: BSD-2-clause
 -}

module Utility.HtmlDetect (
    isHtml,
    isHtmlBs,
    isHtmlFile,
    htmlPrefixLength,
) where

import Text.HTML.TagSoup
import System.IO
import Data.Char
import qualified Data.ByteString.Lazy as B
import qualified Data.ByteString.Lazy.Char8 as B8

-- | Decide whether a String holds a html document.
--
-- Only the first 'htmlPrefixLength' characters are examined, so a
-- document that is invalid or truncated still counts as html, provided
-- it begins with a "<html>" or "<!DOCTYPE html>" tag.
--
-- A bare fragment such as "<p>this</p>" is not treated as html, even
-- though some browsers may choose to render it that way.
isHtml :: String -> Bool
isHtml doc = startsWithHtml leadingTags
  where
    leadingTags = canonicalizeTags (parseTags (take htmlPrefixLength doc))

    startsWithHtml [] = False
    startsWithHtml (tag:others) = case tag of
        TagOpen "html" _ -> True
        -- The doctype name is compared case-insensitively.
        TagOpen "!DOCTYPE" ((name, _):_) -> map toLower name == "html"
        -- Leading whitespace ahead of the tag is tolerated.
        TagText txt -> all isSpace txt && startsWithHtml others
        -- A html comment ahead of the html tag would be unusual, but
        -- skipping over it costs nothing.
        TagComment _ -> startsWithHtml others
        _ -> False

-- | Decide whether a ByteString holds a html document.
--
-- The encoding of the ByteString is unknown; that is acceptable because
-- 'isHtml' only looks for ascii strings.
isHtmlBs :: B.ByteString -> Bool
isHtmlBs bytes = isHtml (B8.unpack bytes)

-- | Check whether the file is html.
--
-- Using isHtml <$> readFile file would be equivalent, but since that
-- would not read all of the file, the handle would stay open until it
-- was garbage collected some time later.
isHtmlFile :: FilePath -> IO Bool
isHtmlFile file = withFile file ReadMode checkPrefix
  where
    -- Read at most htmlPrefixLength bytes from the handle and test them.
    checkPrefix hdl = fmap isHtmlBs (B.hGet hdl htmlPrefixLength)

-- | How much of the beginning of a html document is needed to detect it.
-- (This is a conservative estimate.)
htmlPrefixLength :: Int
htmlPrefixLength = 8 * 1024