1{-# LANGUAGE RecordWildCards, PatternGuards, ScopedTypeVariables #-} 2 3module Text.HTML.TagSoup.Implementation where 4 5import Text.HTML.TagSoup.Type 6import Text.HTML.TagSoup.Options 7import Text.StringLike as Str 8import Numeric (readHex) 9import Data.Char (chr, ord) 10import Data.Ix 11import Control.Exception(assert) 12import Control.Arrow 13 14--------------------------------------------------------------------- 15-- BOTTOM LAYER 16 17data Out 18 = Char Char 19 | Tag -- < 20 | TagShut -- </ 21 | AttName 22 | AttVal 23 | TagEnd -- > 24 | TagEndClose -- /> 25 | Comment -- <!-- 26 | CommentEnd -- --> 27 | EntityName -- & 28 | EntityNum -- &# 29 | EntityHex -- &#x 30 | EntityEnd Bool -- Attributed followed by ; for True, missing ; for False 31 | Warn String 32 | Pos Position 33 deriving (Show,Eq) 34 35errSeen x = Warn $ "Unexpected " ++ show x 36errWant x = Warn $ "Expected " ++ show x 37 38data S = S 39 {s :: S 40 ,tl :: S 41 ,hd :: Char 42 ,eof :: Bool 43 ,next :: String -> Maybe S 44 ,pos :: [Out] -> [Out] 45 } 46 47 48expand :: Position -> String -> S 49expand p text = p `seq` res 50 where res = S{s = res 51 ,tl = expand (positionChar p (head text)) (tail text) 52 ,hd = if null text then '\0' else head text 53 ,eof = null text 54 ,next = next p text 55 ,pos = (Pos p:) 56 } 57 58 next p (t:ext) (s:tr) | t == s = next (positionChar p t) ext tr 59 next p text [] = Just $ expand p text 60 next _ _ _ = Nothing 61 62 63infixr & 64 65class Outable a where (&) :: a -> [Out] -> [Out] 66instance Outable Char where (&) = ampChar 67instance Outable Out where (&) = ampOut 68ampChar x y = Char x : y 69ampOut x y = x : y 70 71 72state :: String -> S 73state s = expand nullPosition s 74 75--------------------------------------------------------------------- 76-- TOP LAYER 77 78 79output :: forall str . StringLike str => ParseOptions str -> [Out] -> [Tag str] 80output ParseOptions{..} x = (if optTagTextMerge then tagTextMerge else id) $ go ((nullPosition,[]),x) 81 where 82 -- main choice loop 83 go :: ((Position,[Tag str]),[Out]) -> [Tag str] 84 go ((p,ws),xs) | p `seq` False = [] -- otherwise p is a space leak when optTagPosition == False 85 go ((p,ws),xs) | not $ null ws = (if optTagWarning then (reverse ws++) else id) $ go ((p,[]),xs) 86 go ((p,ws),Pos p2:xs) = go ((p2,ws),xs) 87 88 go x | isChar x = pos x $ TagText a : go y 89 where (y,a) = charsStr x 90 go x | isTag x = pos x $ TagOpen a b : (if isTagEndClose z then pos x $ TagClose a : go (next z) else go (skip isTagEnd z)) 91 where (y,a) = charsStr $ next x 92 (z,b) = atts y 93 go x | isTagShut x = pos x $ (TagClose a:) $ 94 (if not (null b) then warn x "Unexpected attributes in close tag" else id) $ 95 if isTagEndClose z then warn x "Unexpected self-closing in close tag" $ go (next z) else go (skip isTagEnd z) 96 where (y,a) = charsStr $ next x 97 (z,b) = atts y 98 go x | isComment x = pos x $ TagComment a : go (skip isCommentEnd y) 99 where (y,a) = charsStr $ next x 100 go x | isEntityName x = poss x ((if optTagWarning then id else filter (not . isTagWarning)) $ optEntityData (a, getEntityEnd y)) ++ go (skip isEntityEnd y) 101 where (y,a) = charsStr $ next x 102 go x | isEntityNumHex x = pos x $ TagText (fromChar $ entityChr x a) : go (skip isEntityEnd y) 103 where (y,a) = chars $ next x 104 go x | Just a <- fromWarn x = if optTagWarning then pos x $ TagWarning (fromString a) : go (next x) else go (next x) 105 go x | isEof x = [] 106 107 atts :: ((Position,[Tag str]),[Out]) -> ( ((Position,[Tag str]),[Out]) , [(str,str)] ) 108 atts x | isAttName x = second ((a,b):) $ atts z 109 where (y,a) = charsStr (next x) 110 (z,b) = if isAttVal y then charsEntsStr (next y) else (y, empty) 111 atts x | isAttVal x = second ((empty,a):) $ atts y 112 where (y,a) = charsEntsStr (next x) 113 atts x = (x, []) 114 115 -- chars 116 chars x = charss False x 117 charsStr x = (id *** fromString) $ chars x 118 charsEntsStr x = (id *** fromString) $ charss True x 119 120 -- loop round collecting characters, if the b is set including entity 121 charss :: Bool -> ((Position,[Tag str]),[Out]) -> ( ((Position,[Tag str]),[Out]) , String) 122 charss t x | Just a <- fromChr x = (y, a:b) 123 where (y,b) = charss t (next x) 124 charss t x | t, isEntityName x = second (toString n ++) $ charss t $ addWarns m z 125 where (y,a) = charsStr $ next x 126 b = getEntityEnd y 127 z = skip isEntityEnd y 128 (n,m) = optEntityAttrib (a,b) 129 charss t x | t, isEntityNumHex x = second (entityChr x a:) $ charss t z 130 where (y,a) = chars $ next x 131 z = skip isEntityEnd y 132 charss t ((_,w),Pos p:xs) = charss t ((p,w),xs) 133 charss t x | Just a <- fromWarn x = charss t $ (if optTagWarning then addWarns [TagWarning $ fromString a] else id) $ next x 134 charss t x = (x, []) 135 136 -- utility functions 137 next x = second (drop 1) x 138 skip f x = assert (isEof x || f x) (next x) 139 addWarns ws x@((p,w),y) = ((p, reverse (poss x ws) ++ w), y) 140 pos ((p,_),_) rest = if optTagPosition then tagPosition p : rest else rest 141 warn x s rest = if optTagWarning then pos x $ TagWarning (fromString s) : rest else rest 142 poss x = concatMap (\w -> pos x [w]) 143 144 145entityChr x s | isEntityNum x = chr_ $ read s 146 | isEntityHex x = chr_ $ fst $ head $ readHex s 147 where chr_ x | inRange (toInteger $ ord minBound, toInteger $ ord maxBound) x = chr $ fromInteger x 148 | otherwise = '?' 149 150 151isEof (_,[]) = True; isEof _ = False 152isChar (_,Char{}:_) = True; isChar _ = False 153isTag (_,Tag{}:_) = True; isTag _ = False 154isTagShut (_,TagShut{}:_) = True; isTagShut _ = False 155isAttName (_,AttName{}:_) = True; isAttName _ = False 156isAttVal (_,AttVal{}:_) = True; isAttVal _ = False 157isTagEnd (_,TagEnd{}:_) = True; isTagEnd _ = False 158isTagEndClose (_,TagEndClose{}:_) = True; isTagEndClose _ = False 159isComment (_,Comment{}:_) = True; isComment _ = False 160isCommentEnd (_,CommentEnd{}:_) = True; isCommentEnd _ = False 161isEntityName (_,EntityName{}:_) = True; isEntityName _ = False 162isEntityNumHex (_,EntityNum{}:_) = True; isEntityNumHex (_,EntityHex{}:_) = True; isEntityNumHex _ = False 163isEntityNum (_,EntityNum{}:_) = True; isEntityNum _ = False 164isEntityHex (_,EntityHex{}:_) = True; isEntityHex _ = False 165isEntityEnd (_,EntityEnd{}:_) = True; isEntityEnd _ = False 166isWarn (_,Warn{}:_) = True; isWarn _ = False 167 168fromChr (_,Char x:_) = Just x ; fromChr _ = Nothing 169fromWarn (_,Warn x:_) = Just x ; fromWarn _ = Nothing 170 171getEntityEnd (_,EntityEnd b:_) = b 172 173 174-- Merge all adjacent TagText bits 175tagTextMerge :: StringLike str => [Tag str] -> [Tag str] 176tagTextMerge (TagText x:xs) = TagText (strConcat (x:a)) : tagTextMerge b 177 where 178 (a,b) = f xs 179 180 -- additional brackets on 3 lines to work around HSE 1.3.2 bugs with pattern fixities 181 f (TagText x:xs) = (x:a,b) 182 where (a,b) = f xs 183 f (TagPosition{}:(x@TagText{}:xs)) = f $ x : xs 184 f x = g x id x 185 186 g o op (p@TagPosition{}:(w@TagWarning{}:xs)) = g o (op . (p:) . (w:)) xs 187 g o op (w@TagWarning{}:xs) = g o (op . (w:)) xs 188 g o op (p@TagPosition{}:(x@TagText{}:xs)) = f $ p : x : op xs 189 g o op (x@TagText{}:xs) = f $ x : op xs 190 g o op _ = ([], o) 191 192tagTextMerge (x:xs) = x : tagTextMerge xs 193tagTextMerge [] = [] 194