1{- html detection
2 -
3 - Copyright 2017-2021 Joey Hess <id@joeyh.name>
4 -
5 - License: BSD-2-clause
6 -}
7
8module Utility.HtmlDetect (
9	isHtml,
10	isHtmlBs,
11	isHtmlFile,
12	htmlPrefixLength,
13) where
14
15import Text.HTML.TagSoup
16import System.IO
17import Data.Char
18import qualified Data.ByteString.Lazy as B
19import qualified Data.ByteString.Lazy.Char8 as B8
20
-- | Decide whether a String looks like the start of a html document.
--
-- Only the first 'htmlPrefixLength' characters are examined. The
-- document does not need to be valid, and may be truncated; it is
-- still detected as html as long as the first significant tag is
-- "<html>" or "<!DOCTYPE html>".
--
-- Html fragments like "<p>this</p>" are not detected as being html,
-- although some browsers may choose to render them as html.
isHtml :: String -> Bool
isHtml doc = startsWithHtml (canonicalizeTags (parseTags (take htmlPrefixLength doc)))
  where
	-- Running out of tags without finding a html tag means not html.
	startsWithHtml [] = False
	startsWithHtml (tag:more) = case tag of
		TagOpen "html" _ -> True
		-- The doctype name is compared case-insensitively.
		TagOpen "!DOCTYPE" ((name, _):_) -> map toLower name == "html"
		-- Leading whitespace before the tag is skipped; any other
		-- text means this is not a html document.
		TagText txt -> all isSpace txt && startsWithHtml more
		-- A html comment before the html tag would be unusual,
		-- but is easy to allow for.
		TagComment _ -> startsWithHtml more
		_ -> False
42
-- | Decide whether a lazy ByteString looks like a html document.
--
-- The encoding of the ByteString is not known, but that does not
-- matter here, since 'isHtml' only looks for ascii strings.
isHtmlBs :: B.ByteString -> Bool
isHtmlBs bytes = isHtml (B8.unpack bytes)
48
-- | Check whether the content of a file looks like a html document.
--
-- At most 'htmlPrefixLength' bytes are read from the start of the file.
--
-- Using isHtml <$> readFile file would give an equivalent answer,
-- but since that would not read all of the file, the handle
-- would remain open until it got garbage collected sometime later.
isHtmlFile :: FilePath -> IO Bool
isHtmlFile file = withFile file ReadMode checkPrefix
  where
	-- Read just the prefix from the handle and test it.
	checkPrefix h = do
		prefix <- B.hGet h htmlPrefixLength
		return (isHtmlBs prefix)
57
-- | Number of characters (or bytes) from the beginning of a document
-- that are examined when detecting html. This is a conservative amount.
htmlPrefixLength :: Int
htmlPrefixLength = 8 * 1024
62