1{-# LANGUAGE RecordWildCards, PatternGuards #-} 2 3module Text.HTML.TagSoup.Specification(parse) where 4 5import Text.HTML.TagSoup.Implementation 6import Data.Char (isAlpha, isAlphaNum, isDigit, toLower) 7 8-- We make some generalisations: 9-- <!name is a valid tag start closed by > 10-- <?name is a valid tag start closed by ?> 11-- </!name> is a valid closing tag 12-- </?name> is a valid closing tag 13-- <a "foo"> is a valid tag attibute in ! and ?, i.e missing an attribute name 14-- We also don't do lowercase conversion 15-- Entities are handled without a list of known entity names 16-- We don't have RCData, CData or Escape modes (only effects dat and tagOpen) 17 18 19data TypeTag = TypeNormal -- <foo 20 | TypeXml -- <?foo 21 | TypeDecl -- <!foo 22 | TypeScript -- <script 23 deriving Eq 24 25 26-- 2.4.1 Common parser idioms 27white x = x `elem` " \t\n\f\r" 28 29 30-- 8.2.4 Tokenization 31 32type Parser = S -> [Out] 33 34parse :: String -> [Out] 35parse = dat . state 36 37-- 8.2.4.1 Data state 38dat :: Parser 39dat S{..} = pos $ case hd of 40 '&' -> charReference tl 41 '<' -> tagOpen tl 42 _ | eof -> [] 43 _ -> hd & dat tl 44 45 46-- 8.2.4.2 Character reference data state 47charReference s = charRef dat False Nothing s 48 49 50-- 8.2.4.3 Tag open state 51tagOpen S{..} = case hd of 52 '!' -> markupDeclOpen tl 53 '/' -> closeTagOpen tl 54 _ | isAlpha hd -> Tag & hd & tagName (if isScript s then TypeScript else TypeNormal) tl 55 '>' -> errSeen "<>" & '<' & '>' & dat tl 56 '?' -> neilXmlTagOpen tl -- NEIL 57 _ -> errSeen "<" & '<' & dat s 58 59isScript = f "script" 60 where 61 f (c:cs) S{..} = toLower hd == c && f cs tl 62 f [] S{..} = white hd || hd == '/' || hd == '>' || hd == '?' || eof 63 64 65-- seen "<?", emitted [] 66neilXmlTagOpen S{..} = case hd of 67 _ | isAlpha hd -> Tag & '?' & hd & tagName TypeXml tl 68 _ -> errSeen "<?" & '<' & '?' & dat s 69 70-- seen "?", expecting ">" 71neilXmlTagClose S{..} = pos $ case hd of 72 '>' -> TagEnd & dat tl 73 _ -> errSeen "?" & beforeAttName TypeXml s 74 75 76-- just seen ">" at the end, am given tl 77neilTagEnd typ S{..} 78 | typ == TypeXml = pos $ errWant "?>" & TagEnd & dat s 79 | typ == TypeScript = pos $ TagEnd & neilScriptBody s 80 | otherwise = pos $ TagEnd & dat s 81 82-- Inside a <script> tag, only break on </script 83neilScriptBody o@S{..} 84 | hd == '<', S{..} <- tl 85 , hd == '/', S{..} <- tl 86 , isScript s 87 = dat o 88 | eof = [] 89 | otherwise = pos $ hd & neilScriptBody tl 90 91 92-- 8.2.4.4 Close tag open state 93-- Deviation: We ignore the if CDATA/RCDATA bits and tag matching 94-- Deviation: On </> we output </> to the text 95-- Deviation: </!name> is a closing tag, not a bogus comment 96closeTagOpen S{..} = case hd of 97 _ | isAlpha hd || hd `elem` "?!" -> TagShut & hd & tagName TypeNormal tl 98 '>' -> errSeen "</>" & '<' & '/' & '>' & dat tl 99 _ | eof -> '<' & '/' & dat s 100 _ -> errWant "tag name" & bogusComment s 101 102 103-- 8.2.4.5 Tag name state 104tagName typ S{..} = pos $ case hd of 105 _ | white hd -> beforeAttName typ tl 106 '/' -> selfClosingStartTag typ tl 107 '>' -> neilTagEnd typ tl 108 '?' | typ == TypeXml -> neilXmlTagClose tl 109 _ | isAlpha hd -> hd & tagName typ tl 110 _ | eof -> errWant (if typ == TypeXml then "?>" else ">") & dat s 111 _ -> hd & tagName typ tl 112 113 114-- 8.2.4.6 Before attribute name state 115beforeAttName typ S{..} = pos $ case hd of 116 _ | white hd -> beforeAttName typ tl 117 '/' -> selfClosingStartTag typ tl 118 '>' -> neilTagEnd typ tl 119 '?' | typ == TypeXml -> neilXmlTagClose tl 120 _ | typ /= TypeNormal && hd `elem` "\'\"" -> beforeAttValue typ s -- NEIL 121 _ | hd `elem` "\"'<=" -> errSeen [hd] & AttName & hd & attName typ tl 122 _ | eof -> errWant (if typ == TypeXml then "?>" else ">") & dat s 123 _ -> AttName & hd & attName typ tl 124 125 126-- 8.2.4.7 Attribute name state 127attName typ S{..} = pos $ case hd of 128 _ | white hd -> afterAttName typ tl 129 '/' -> selfClosingStartTag typ tl 130 '=' -> beforeAttValue typ tl 131 '>' -> neilTagEnd typ tl 132 '?' | typ == TypeXml -> neilXmlTagClose tl 133 _ | hd `elem` "\"'<" -> errSeen [hd] & def 134 _ | eof -> errWant (if typ == TypeXml then "?>" else ">") & dat s 135 _ -> def 136 where def = hd & attName typ tl 137 138 139-- 8.2.4.8 After attribute name state 140afterAttName typ S{..} = pos $ case hd of 141 _ | white hd -> afterAttName typ tl 142 '/' -> selfClosingStartTag typ tl 143 '=' -> beforeAttValue typ tl 144 '>' -> neilTagEnd typ tl 145 '?' | typ == TypeXml -> neilXmlTagClose tl 146 _ | typ /= TypeNormal && hd `elem` "\"'" -> AttVal & beforeAttValue typ s -- NEIL 147 _ | hd `elem` "\"'<" -> errSeen [hd] & def 148 _ | eof -> errWant (if typ == TypeXml then "?>" else ">") & dat s 149 _ -> def 150 where def = AttName & hd & attName typ tl 151 152-- 8.2.4.9 Before attribute value state 153beforeAttValue typ S{..} = pos $ case hd of 154 _ | white hd -> beforeAttValue typ tl 155 '\"' -> AttVal & attValueDQuoted typ tl 156 '&' -> AttVal & attValueUnquoted typ s 157 '\'' -> AttVal & attValueSQuoted typ tl 158 '>' -> errSeen "=" & neilTagEnd typ tl 159 '?' | typ == TypeXml -> neilXmlTagClose tl 160 _ | hd `elem` "<=" -> errSeen [hd] & def 161 _ | eof -> errWant (if typ == TypeXml then "?>" else ">") & dat s 162 _ -> def 163 where def = AttVal & hd & attValueUnquoted typ tl 164 165 166-- 8.2.4.10 Attribute value (double-quoted) state 167attValueDQuoted typ S{..} = pos $ case hd of 168 '\"' -> afterAttValueQuoted typ tl 169 '&' -> charRefAttValue (attValueDQuoted typ) (Just '\"') tl 170 _ | eof -> errWant "\"" & dat s 171 _ -> hd & attValueDQuoted typ tl 172 173 174-- 8.2.4.11 Attribute value (single-quoted) state 175attValueSQuoted typ S{..} = pos $ case hd of 176 '\'' -> afterAttValueQuoted typ tl 177 '&' -> charRefAttValue (attValueSQuoted typ) (Just '\'') tl 178 _ | eof -> errWant "\'" & dat s 179 _ -> hd & attValueSQuoted typ tl 180 181 182-- 8.2.4.12 Attribute value (unquoted) state 183attValueUnquoted typ S{..} = pos $ case hd of 184 _ | white hd -> beforeAttName typ tl 185 '&' -> charRefAttValue (attValueUnquoted typ) Nothing tl 186 '>' -> neilTagEnd typ tl 187 '?' | typ == TypeXml -> neilXmlTagClose tl 188 _ | hd `elem` "\"'<=" -> errSeen [hd] & def 189 _ | eof -> errWant (if typ == TypeXml then "?>" else ">") & dat s 190 _ -> def 191 where def = hd & attValueUnquoted typ tl 192 193 194-- 8.2.4.13 Character reference in attribute value state 195charRefAttValue :: Parser -> Maybe Char -> Parser 196charRefAttValue resume c s = charRef resume True c s 197 198 199-- 8.2.4.14 After attribute value (quoted) state 200afterAttValueQuoted typ S{..} = pos $ case hd of 201 _ | white hd -> beforeAttName typ tl 202 '/' -> selfClosingStartTag typ tl 203 '>' -> neilTagEnd typ tl 204 '?' | typ == TypeXml -> neilXmlTagClose tl 205 _ | eof -> dat s 206 _ -> errSeen [hd] & beforeAttName typ s 207 208 209-- 8.2.4.15 Self-closing start tag state 210selfClosingStartTag typ S{..} = pos $ case hd of 211 _ | typ == TypeXml -> errSeen "/" & beforeAttName typ s 212 '>' -> TagEndClose & dat tl 213 _ | eof -> errWant ">" & dat s 214 _ -> errSeen "/" & beforeAttName typ s 215 216 217-- 8.2.4.16 Bogus comment state 218bogusComment S{..} = Comment & bogusComment1 s 219bogusComment1 S{..} = pos $ case hd of 220 '>' -> CommentEnd & dat tl 221 _ | eof -> CommentEnd & dat s 222 _ -> hd & bogusComment1 tl 223 224 225-- 8.2.4.17 Markup declaration open state 226markupDeclOpen S{..} = case hd of 227 _ | Just s <- next "--" -> Comment & commentStart s 228 _ | isAlpha hd -> Tag & '!' & hd & tagName TypeDecl tl -- NEIL 229 _ | Just s <- next "[CDATA[" -> cdataSection s 230 _ -> errWant "tag name" & bogusComment s 231 232 233-- 8.2.4.18 Comment start state 234commentStart S{..} = pos $ case hd of 235 '-' -> commentStartDash tl 236 '>' -> errSeen "<!-->" & CommentEnd & dat tl 237 _ | eof -> errWant "-->" & CommentEnd & dat s 238 _ -> hd & comment tl 239 240 241-- 8.2.4.19 Comment start dash state 242commentStartDash S{..} = pos $ case hd of 243 '-' -> commentEnd tl 244 '>' -> errSeen "<!--->" & CommentEnd & dat tl 245 _ | eof -> errWant "-->" & CommentEnd & dat s 246 _ -> '-' & hd & comment tl 247 248 249-- 8.2.4.20 Comment state 250comment S{..} = pos $ case hd of 251 '-' -> commentEndDash tl 252 _ | eof -> errWant "-->" & CommentEnd & dat s 253 _ -> hd & comment tl 254 255 256-- 8.2.4.21 Comment end dash state 257commentEndDash S{..} = pos $ case hd of 258 '-' -> commentEnd tl 259 _ | eof -> errWant "-->" & CommentEnd & dat s 260 _ -> '-' & hd & comment tl 261 262 263-- 8.2.4.22 Comment end state 264commentEnd S{..} = pos $ case hd of 265 '>' -> CommentEnd & dat tl 266 '-' -> errWant "-->" & '-' & commentEnd tl 267 _ | white hd -> errSeen "--" & '-' & '-' & hd & commentEndSpace tl 268 '!' -> errSeen "!" & commentEndBang tl 269 _ | eof -> errWant "-->" & CommentEnd & dat s 270 _ -> errSeen "--" & '-' & '-' & hd & comment tl 271 272 273-- 8.2.4.23 Comment end bang state 274commentEndBang S{..} = pos $ case hd of 275 '>' -> CommentEnd & dat tl 276 '-' -> '-' & '-' & '!' & commentEndDash tl 277 _ | eof -> errWant "-->" & CommentEnd & dat s 278 _ -> '-' & '-' & '!' & hd & comment tl 279 280 281-- 8.2.4.24 Comment end space state 282commentEndSpace S{..} = pos $ case hd of 283 '>' -> CommentEnd & dat tl 284 '-' -> commentEndDash tl 285 _ | white hd -> hd & commentEndSpace tl 286 _ | eof -> errWant "-->" & CommentEnd & dat s 287 _ -> hd & comment tl 288 289 290-- 8.2.4.38 CDATA section state 291cdataSection S{..} = pos $ case hd of 292 _ | Just s <- next "]]>" -> dat s 293 _ | eof -> dat s 294 _ | otherwise -> hd & cdataSection tl 295 296 297-- 8.2.4.39 Tokenizing character references 298-- Change from spec: this is reponsible for writing '&' if nothing is to be written 299charRef :: Parser -> Bool -> Maybe Char -> S -> [Out] 300charRef resume att end S{..} = case hd of 301 _ | eof || hd `elem` "\t\n\f <&" || maybe False (== hd) end -> '&' & resume s 302 '#' -> charRefNum resume s tl 303 _ -> charRefAlpha resume att s 304 305charRefNum resume o S{..} = case hd of 306 _ | hd `elem` "xX" -> charRefNum2 resume o True tl 307 _ -> charRefNum2 resume o False s 308 309charRefNum2 resume o hex S{..} = case hd of 310 _ | hexChar hex hd -> (if hex then EntityHex else EntityNum) & hd & charRefNum3 resume hex tl 311 _ -> errSeen "&" & '&' & resume o 312 313charRefNum3 resume hex S{..} = case hd of 314 _ | hexChar hex hd -> hd & charRefNum3 resume hex tl 315 ';' -> EntityEnd True & resume tl 316 _ -> EntityEnd False & errWant ";" & resume s 317 318charRefAlpha resume att S{..} = case hd of 319 _ | isAlpha hd -> EntityName & hd & charRefAlpha2 resume att tl 320 _ -> errSeen "&" & '&' & resume s 321 322charRefAlpha2 resume att S{..} = case hd of 323 _ | alphaChar hd -> hd & charRefAlpha2 resume att tl 324 ';' -> EntityEnd True & resume tl 325 _ | att -> EntityEnd False & resume s 326 _ -> EntityEnd False & errWant ";" & resume s 327 328 329alphaChar x = isAlphaNum x || x `elem` ":-_" 330 331hexChar False x = isDigit x 332hexChar True x = isDigit x || (x >= 'a' && x <= 'f') || (x >= 'A' && x <= 'F') 333