1{-# LANGUAGE RecordWildCards, PatternGuards, ScopedTypeVariables #-}
2
3module Text.HTML.TagSoup.Implementation where
4
5import Text.HTML.TagSoup.Type
6import Text.HTML.TagSoup.Options
7import Text.StringLike as Str
8import Numeric (readHex)
9import Data.Char (chr, ord)
10import Data.Ix
11import Control.Exception(assert)
12import Control.Arrow
13
14---------------------------------------------------------------------
15-- BOTTOM LAYER
16
17data Out
18    = Char Char
19    | Tag             -- <
20    | TagShut         -- </
21    | AttName
22    | AttVal
23    | TagEnd          -- >
24    | TagEndClose     -- />
25    | Comment         -- <!--
26    | CommentEnd      -- -->
27    | EntityName      -- &
28    | EntityNum       -- &#
29    | EntityHex       -- &#x
30    | EntityEnd Bool  -- Attributed followed by ; for True, missing ; for False
31    | Warn String
32    | Pos Position
33      deriving (Show,Eq)
34
35errSeen x = Warn $ "Unexpected " ++ show x
36errWant x = Warn $ "Expected " ++ show x
37
38data S = S
39    {s :: S
40    ,tl :: S
41    ,hd :: Char
42    ,eof :: Bool
43    ,next :: String -> Maybe S
44    ,pos :: [Out] -> [Out]
45    }
46
47
48expand :: Position -> String -> S
49expand p text = p `seq` res
50    where res = S{s = res
51                 ,tl = expand (positionChar p (head text)) (tail text)
52                 ,hd = if null text then '\0' else head text
53                 ,eof = null text
54                 ,next = next p text
55                 ,pos = (Pos p:)
56                 }
57
58          next p (t:ext) (s:tr) | t == s = next (positionChar p t) ext tr
59          next p text [] = Just $ expand p text
60          next _ _ _ = Nothing
61
62
63infixr &
64
65class Outable a where (&) :: a -> [Out] -> [Out]
66instance Outable Char where (&) = ampChar
67instance Outable Out where (&) = ampOut
68ampChar x y = Char x : y
69ampOut x y = x : y
70
71
72state :: String -> S
73state s = expand nullPosition s
74
75---------------------------------------------------------------------
76-- TOP LAYER
77
78
79output :: forall str . StringLike str => ParseOptions str -> [Out] -> [Tag str]
80output ParseOptions{..} x = (if optTagTextMerge then tagTextMerge else id) $ go ((nullPosition,[]),x)
81    where
82        -- main choice loop
83        go :: ((Position,[Tag str]),[Out]) -> [Tag str]
84        go ((p,ws),xs) | p `seq` False = [] -- otherwise p is a space leak when optTagPosition == False
85        go ((p,ws),xs) | not $ null ws = (if optTagWarning then (reverse ws++) else id) $ go ((p,[]),xs)
86        go ((p,ws),Pos p2:xs) = go ((p2,ws),xs)
87
88        go x | isChar x = pos x $ TagText a : go y
89            where (y,a) = charsStr x
90        go x | isTag x = pos x $ TagOpen a b : (if isTagEndClose z then pos x $ TagClose a : go (next z) else go (skip isTagEnd z))
91            where (y,a) = charsStr $ next x
92                  (z,b) = atts y
93        go x | isTagShut x = pos x $ (TagClose a:) $
94                (if not (null b) then warn x "Unexpected attributes in close tag" else id) $
95                if isTagEndClose z then warn x "Unexpected self-closing in close tag" $ go (next z) else go (skip isTagEnd z)
96            where (y,a) = charsStr $ next x
97                  (z,b) = atts y
98        go x | isComment x = pos x $ TagComment a : go (skip isCommentEnd y)
99            where (y,a) = charsStr $ next x
100        go x | isEntityName x = poss x ((if optTagWarning then id else filter (not . isTagWarning)) $ optEntityData (a, getEntityEnd y)) ++ go (skip isEntityEnd y)
101            where (y,a) = charsStr $ next x
102        go x | isEntityNumHex x = pos x $ TagText (fromChar $ entityChr x a) : go (skip isEntityEnd y)
103            where (y,a) = chars $ next x
104        go x | Just a <- fromWarn x = if optTagWarning then pos x $ TagWarning (fromString a) : go (next x) else go (next x)
105        go x | isEof x = []
106
107        atts :: ((Position,[Tag str]),[Out]) -> ( ((Position,[Tag str]),[Out]) , [(str,str)] )
108        atts x | isAttName x = second ((a,b):) $ atts z
109            where (y,a) = charsStr (next x)
110                  (z,b) = if isAttVal y then charsEntsStr (next y) else (y, empty)
111        atts x | isAttVal x = second ((empty,a):) $ atts y
112            where (y,a) = charsEntsStr (next x)
113        atts x = (x, [])
114
115        -- chars
116        chars x = charss False x
117        charsStr x = (id *** fromString) $ chars x
118        charsEntsStr x = (id *** fromString) $ charss True x
119
120        -- loop round collecting characters, if the b is set including entity
121        charss :: Bool -> ((Position,[Tag str]),[Out]) -> ( ((Position,[Tag str]),[Out]) , String)
122        charss t x | Just a <- fromChr x = (y, a:b)
123            where (y,b) = charss t (next x)
124        charss t x | t, isEntityName x = second (toString n ++) $ charss t $ addWarns m z
125            where (y,a) = charsStr $ next x
126                  b = getEntityEnd y
127                  z = skip isEntityEnd y
128                  (n,m) = optEntityAttrib (a,b)
129        charss t x | t, isEntityNumHex x = second (entityChr x a:) $ charss t z
130            where (y,a) = chars $ next x
131                  z = skip isEntityEnd y
132        charss t ((_,w),Pos p:xs) = charss t ((p,w),xs)
133        charss t x | Just a <- fromWarn x = charss t $ (if optTagWarning then addWarns [TagWarning $ fromString a] else id) $ next x
134        charss t x = (x, [])
135
136        -- utility functions
137        next x = second (drop 1) x
138        skip f x = assert (isEof x || f x) (next x)
139        addWarns ws x@((p,w),y) = ((p, reverse (poss x ws) ++ w), y)
140        pos ((p,_),_) rest = if optTagPosition then tagPosition p : rest else rest
141        warn x s rest = if optTagWarning then pos x $ TagWarning (fromString s) : rest else rest
142        poss x = concatMap (\w -> pos x [w])
143
144
145entityChr x s | isEntityNum x = chr_ $ read s
146              | isEntityHex x = chr_ $ fst $ head $ readHex s
147    where chr_ x | inRange (toInteger $ ord minBound, toInteger $ ord maxBound) x = chr $ fromInteger x
148                 | otherwise = '?'
149
150
151isEof (_,[]) = True; isEof _ = False
152isChar (_,Char{}:_) = True; isChar _ = False
153isTag (_,Tag{}:_) = True; isTag _ = False
154isTagShut (_,TagShut{}:_) = True; isTagShut _ = False
155isAttName (_,AttName{}:_) = True; isAttName _ = False
156isAttVal (_,AttVal{}:_) = True; isAttVal _ = False
157isTagEnd (_,TagEnd{}:_) = True; isTagEnd _ = False
158isTagEndClose (_,TagEndClose{}:_) = True; isTagEndClose _ = False
159isComment (_,Comment{}:_) = True; isComment _ = False
160isCommentEnd (_,CommentEnd{}:_) = True; isCommentEnd _ = False
161isEntityName (_,EntityName{}:_) = True; isEntityName _ = False
162isEntityNumHex (_,EntityNum{}:_) = True; isEntityNumHex (_,EntityHex{}:_) = True; isEntityNumHex _ = False
163isEntityNum (_,EntityNum{}:_) = True; isEntityNum _ = False
164isEntityHex (_,EntityHex{}:_) = True; isEntityHex _ = False
165isEntityEnd (_,EntityEnd{}:_) = True; isEntityEnd _ = False
166isWarn (_,Warn{}:_) = True; isWarn _ = False
167
168fromChr (_,Char x:_) = Just x ; fromChr _ = Nothing
169fromWarn (_,Warn x:_) = Just x ; fromWarn _ = Nothing
170
171getEntityEnd (_,EntityEnd b:_) = b
172
173
174-- Merge all adjacent TagText bits
175tagTextMerge :: StringLike str => [Tag str] -> [Tag str]
176tagTextMerge (TagText x:xs) = TagText (strConcat (x:a)) : tagTextMerge b
177    where
178        (a,b) = f xs
179
180        -- additional brackets on 3 lines to work around HSE 1.3.2 bugs with pattern fixities
181        f (TagText x:xs) = (x:a,b)
182            where (a,b) = f xs
183        f (TagPosition{}:(x@TagText{}:xs)) = f $ x : xs
184        f x = g x id x
185
186        g o op (p@TagPosition{}:(w@TagWarning{}:xs)) = g o (op . (p:) . (w:)) xs
187        g o op (w@TagWarning{}:xs) = g o (op . (w:)) xs
188        g o op (p@TagPosition{}:(x@TagText{}:xs)) = f $ p : x : op xs
189        g o op (x@TagText{}:xs) = f $ x : op xs
190        g o op _ = ([], o)
191
192tagTextMerge (x:xs) = x : tagTextMerge xs
193tagTextMerge [] = []
194