1{-# LANGUAGE RecordWildCards, PatternGuards #-}
2
3module Text.HTML.TagSoup.Specification(parse) where
4
5import Text.HTML.TagSoup.Implementation
6import Data.Char (isAlpha, isAlphaNum, isDigit, toLower)
7
8-- We make some generalisations:
9-- <!name is a valid tag start closed by >
10-- <?name is a valid tag start closed by ?>
11-- </!name> is a valid closing tag
12-- </?name> is a valid closing tag
13-- <a "foo"> is a valid tag attibute in ! and ?, i.e missing an attribute name
14-- We also don't do lowercase conversion
15-- Entities are handled without a list of known entity names
16-- We don't have RCData, CData or Escape modes (only effects dat and tagOpen)
17
18
19data TypeTag = TypeNormal -- <foo
20             | TypeXml    -- <?foo
21             | TypeDecl   -- <!foo
22             | TypeScript -- <script
23               deriving Eq
24
25
26-- 2.4.1 Common parser idioms
27white x = x `elem` " \t\n\f\r"
28
29
30-- 8.2.4 Tokenization
31
32type Parser = S -> [Out]
33
34parse :: String -> [Out]
35parse = dat . state
36
37-- 8.2.4.1 Data state
38dat :: Parser
39dat S{..} = pos $ case hd of
40    '&' -> charReference tl
41    '<' -> tagOpen tl
42    _ | eof -> []
43    _ -> hd & dat tl
44
45
46-- 8.2.4.2 Character reference data state
47charReference s = charRef dat False Nothing s
48
49
50-- 8.2.4.3 Tag open state
51tagOpen S{..} = case hd of
52    '!' -> markupDeclOpen tl
53    '/' -> closeTagOpen tl
54    _ | isAlpha hd -> Tag & hd & tagName (if isScript s then TypeScript else TypeNormal) tl
55    '>' -> errSeen "<>" & '<' & '>' & dat tl
56    '?' -> neilXmlTagOpen tl -- NEIL
57    _ -> errSeen  "<" & '<' & dat s
58
59isScript = f "script"
60    where
61        f (c:cs) S{..} = toLower hd == c && f cs tl
62        f [] S{..} = white hd || hd == '/' || hd == '>' || hd == '?' || eof
63
64
65-- seen "<?", emitted []
66neilXmlTagOpen S{..} = case hd of
67    _ | isAlpha hd -> Tag & '?' & hd & tagName TypeXml tl
68    _ -> errSeen "<?" & '<' & '?' & dat s
69
70-- seen "?", expecting ">"
71neilXmlTagClose S{..} = pos $ case hd of
72    '>' -> TagEnd & dat tl
73    _ -> errSeen "?" & beforeAttName TypeXml s
74
75
76-- just seen ">" at the end, am given tl
77neilTagEnd typ S{..}
78    | typ == TypeXml = pos $ errWant "?>" & TagEnd & dat s
79    | typ == TypeScript = pos $ TagEnd & neilScriptBody s
80    | otherwise = pos $ TagEnd & dat s
81
82-- Inside a <script> tag, only break on </script
83neilScriptBody o@S{..}
84    | hd == '<', S{..} <- tl
85    , hd == '/', S{..} <- tl
86    , isScript s
87    = dat o
88    | eof = []
89    | otherwise =  pos $ hd & neilScriptBody tl
90
91
92-- 8.2.4.4 Close tag open state
93-- Deviation: We ignore the if CDATA/RCDATA bits and tag matching
94-- Deviation: On </> we output </> to the text
95-- Deviation: </!name> is a closing tag, not a bogus comment
96closeTagOpen S{..} = case hd of
97    _ | isAlpha hd || hd `elem` "?!" -> TagShut & hd & tagName TypeNormal tl
98    '>' -> errSeen "</>" & '<' & '/' & '>' & dat tl
99    _ | eof -> '<' & '/' & dat s
100    _ -> errWant "tag name" & bogusComment s
101
102
103-- 8.2.4.5 Tag name state
104tagName typ S{..} = pos $ case hd of
105    _ | white hd -> beforeAttName typ tl
106    '/' -> selfClosingStartTag typ tl
107    '>' -> neilTagEnd typ tl
108    '?' | typ == TypeXml -> neilXmlTagClose tl
109    _ | isAlpha hd -> hd & tagName typ tl
110    _ | eof -> errWant (if typ == TypeXml then "?>" else ">") & dat s
111    _ -> hd & tagName typ tl
112
113
114-- 8.2.4.6 Before attribute name state
115beforeAttName typ S{..} = pos $ case hd of
116    _ | white hd -> beforeAttName typ tl
117    '/' -> selfClosingStartTag typ tl
118    '>' -> neilTagEnd typ tl
119    '?' | typ == TypeXml -> neilXmlTagClose tl
120    _ | typ /= TypeNormal && hd `elem` "\'\"" -> beforeAttValue typ s -- NEIL
121    _ | hd `elem` "\"'<=" -> errSeen [hd] & AttName & hd & attName typ tl
122    _ | eof -> errWant (if typ == TypeXml then "?>" else ">") & dat s
123    _ -> AttName & hd & attName typ tl
124
125
126-- 8.2.4.7 Attribute name state
127attName typ S{..} = pos $ case hd of
128    _ | white hd -> afterAttName typ tl
129    '/' -> selfClosingStartTag typ tl
130    '=' -> beforeAttValue typ tl
131    '>' -> neilTagEnd typ tl
132    '?' | typ == TypeXml -> neilXmlTagClose tl
133    _ | hd `elem` "\"'<" -> errSeen [hd] & def
134    _ | eof -> errWant (if typ == TypeXml then "?>" else ">") & dat s
135    _ -> def
136    where def = hd & attName typ tl
137
138
139-- 8.2.4.8 After attribute name state
140afterAttName typ S{..} = pos $ case hd of
141    _ | white hd -> afterAttName typ tl
142    '/' -> selfClosingStartTag typ tl
143    '=' -> beforeAttValue typ tl
144    '>' -> neilTagEnd typ tl
145    '?' | typ == TypeXml -> neilXmlTagClose tl
146    _ | typ /= TypeNormal && hd `elem` "\"'" -> AttVal & beforeAttValue typ s -- NEIL
147    _ | hd `elem` "\"'<" -> errSeen [hd] & def
148    _ | eof -> errWant (if typ == TypeXml then "?>" else ">") & dat s
149    _ -> def
150    where def = AttName & hd & attName typ tl
151
152-- 8.2.4.9 Before attribute value state
153beforeAttValue typ S{..} = pos $ case hd of
154    _ | white hd -> beforeAttValue typ tl
155    '\"' -> AttVal & attValueDQuoted typ tl
156    '&' -> AttVal & attValueUnquoted typ s
157    '\'' -> AttVal & attValueSQuoted typ tl
158    '>' -> errSeen "=" & neilTagEnd typ tl
159    '?' | typ == TypeXml -> neilXmlTagClose tl
160    _ | hd `elem` "<=" -> errSeen [hd] & def
161    _ | eof -> errWant (if typ == TypeXml then "?>" else ">") & dat s
162    _ -> def
163    where def = AttVal & hd & attValueUnquoted typ tl
164
165
166-- 8.2.4.10 Attribute value (double-quoted) state
167attValueDQuoted typ S{..} = pos $ case hd of
168    '\"' -> afterAttValueQuoted typ tl
169    '&' -> charRefAttValue (attValueDQuoted typ) (Just '\"') tl
170    _ | eof -> errWant "\"" & dat s
171    _ -> hd & attValueDQuoted typ tl
172
173
174-- 8.2.4.11 Attribute value (single-quoted) state
175attValueSQuoted typ S{..} = pos $ case hd of
176    '\'' -> afterAttValueQuoted typ tl
177    '&' -> charRefAttValue (attValueSQuoted typ) (Just '\'') tl
178    _ | eof -> errWant "\'" & dat s
179    _ -> hd & attValueSQuoted typ tl
180
181
182-- 8.2.4.12 Attribute value (unquoted) state
183attValueUnquoted typ S{..} = pos $ case hd of
184    _ | white hd -> beforeAttName typ tl
185    '&' -> charRefAttValue (attValueUnquoted typ) Nothing tl
186    '>' -> neilTagEnd typ tl
187    '?' | typ == TypeXml -> neilXmlTagClose tl
188    _ | hd `elem` "\"'<=" -> errSeen [hd] & def
189    _ | eof -> errWant (if typ == TypeXml then "?>" else ">") & dat s
190    _ -> def
191    where def = hd & attValueUnquoted typ tl
192
193
194-- 8.2.4.13 Character reference in attribute value state
195charRefAttValue :: Parser -> Maybe Char -> Parser
196charRefAttValue resume c s = charRef resume True c s
197
198
199-- 8.2.4.14 After attribute value (quoted) state
200afterAttValueQuoted typ S{..} = pos $ case hd of
201    _ | white hd -> beforeAttName typ tl
202    '/' -> selfClosingStartTag typ tl
203    '>' -> neilTagEnd typ tl
204    '?' | typ == TypeXml -> neilXmlTagClose tl
205    _ | eof -> dat s
206    _ -> errSeen [hd] & beforeAttName typ s
207
208
209-- 8.2.4.15 Self-closing start tag state
210selfClosingStartTag typ S{..} = pos $ case hd of
211    _ | typ == TypeXml -> errSeen "/" & beforeAttName typ s
212    '>' -> TagEndClose & dat tl
213    _ | eof -> errWant ">" & dat s
214    _ -> errSeen "/" & beforeAttName typ s
215
216
217-- 8.2.4.16 Bogus comment state
218bogusComment S{..} = Comment & bogusComment1 s
219bogusComment1 S{..} = pos $ case hd of
220    '>' -> CommentEnd & dat tl
221    _ | eof -> CommentEnd & dat s
222    _ -> hd & bogusComment1 tl
223
224
225-- 8.2.4.17 Markup declaration open state
226markupDeclOpen S{..} = case hd of
227    _ | Just s <- next "--" -> Comment & commentStart s
228    _ | isAlpha hd -> Tag & '!' & hd & tagName TypeDecl tl -- NEIL
229    _ | Just s <- next "[CDATA[" -> cdataSection s
230    _ -> errWant "tag name" & bogusComment s
231
232
233-- 8.2.4.18 Comment start state
234commentStart S{..} = pos $ case hd of
235    '-' -> commentStartDash tl
236    '>' -> errSeen "<!-->" & CommentEnd & dat tl
237    _ | eof -> errWant "-->" & CommentEnd & dat s
238    _ -> hd & comment tl
239
240
241-- 8.2.4.19 Comment start dash state
242commentStartDash S{..} = pos $ case hd of
243    '-' -> commentEnd tl
244    '>' -> errSeen "<!--->" & CommentEnd & dat tl
245    _ | eof -> errWant "-->" & CommentEnd & dat s
246    _ -> '-' & hd & comment tl
247
248
249-- 8.2.4.20 Comment state
250comment S{..} = pos $ case hd of
251    '-' -> commentEndDash tl
252    _ | eof -> errWant "-->" & CommentEnd & dat s
253    _ -> hd & comment tl
254
255
256-- 8.2.4.21 Comment end dash state
257commentEndDash S{..} = pos $ case hd of
258    '-' -> commentEnd tl
259    _ | eof -> errWant "-->" & CommentEnd & dat s
260    _ -> '-' & hd & comment tl
261
262
263-- 8.2.4.22 Comment end state
264commentEnd S{..} = pos $ case hd of
265    '>' -> CommentEnd & dat tl
266    '-' -> errWant "-->" & '-' & commentEnd tl
267    _ | white hd -> errSeen "--" & '-' & '-' & hd & commentEndSpace tl
268    '!' -> errSeen "!" & commentEndBang tl
269    _ | eof -> errWant "-->" & CommentEnd & dat s
270    _ -> errSeen "--" & '-' & '-' & hd & comment tl
271
272
273-- 8.2.4.23 Comment end bang state
274commentEndBang S{..} = pos $ case hd of
275    '>' -> CommentEnd & dat tl
276    '-' -> '-' & '-' & '!' & commentEndDash tl
277    _ | eof -> errWant "-->" & CommentEnd & dat s
278    _ -> '-' & '-' & '!' & hd & comment tl
279
280
281-- 8.2.4.24 Comment end space state
282commentEndSpace S{..} = pos $ case hd of
283    '>' -> CommentEnd & dat tl
284    '-' -> commentEndDash tl
285    _ | white hd -> hd & commentEndSpace tl
286    _ | eof -> errWant "-->" & CommentEnd & dat s
287    _ -> hd & comment tl
288
289
290-- 8.2.4.38 CDATA section state
291cdataSection S{..} = pos $ case hd of
292    _ | Just s <- next "]]>" -> dat s
293    _ | eof -> dat s
294    _ | otherwise -> hd & cdataSection tl
295
296
297-- 8.2.4.39 Tokenizing character references
298-- Change from spec: this is reponsible for writing '&' if nothing is to be written
299charRef :: Parser -> Bool -> Maybe Char -> S -> [Out]
300charRef resume att end S{..} = case hd of
301    _ | eof || hd `elem` "\t\n\f <&" || maybe False (== hd) end -> '&' & resume s
302    '#' -> charRefNum resume s tl
303    _ -> charRefAlpha resume att s
304
305charRefNum resume o S{..} = case hd of
306    _ | hd `elem` "xX" -> charRefNum2 resume o True tl
307    _ -> charRefNum2 resume o False s
308
309charRefNum2 resume o hex S{..} = case hd of
310    _ | hexChar hex hd -> (if hex then EntityHex else EntityNum) & hd & charRefNum3 resume hex tl
311    _ -> errSeen "&" & '&' & resume o
312
313charRefNum3 resume hex S{..} = case hd of
314    _ | hexChar hex hd -> hd & charRefNum3 resume hex tl
315    ';' -> EntityEnd True & resume tl
316    _ -> EntityEnd False & errWant ";" & resume s
317
318charRefAlpha resume att S{..} = case hd of
319    _ | isAlpha hd -> EntityName & hd & charRefAlpha2 resume att tl
320    _ -> errSeen "&" & '&' & resume s
321
322charRefAlpha2 resume att S{..} = case hd of
323    _ | alphaChar hd -> hd & charRefAlpha2 resume att tl
324    ';' -> EntityEnd True & resume tl
325    _ | att -> EntityEnd False & resume s
326    _ -> EntityEnd False & errWant ";" & resume s
327
328
329alphaChar x = isAlphaNum x || x `elem` ":-_"
330
331hexChar False x = isDigit x
332hexChar True  x = isDigit x || (x >= 'a' && x <= 'f') || (x >= 'A' && x <= 'F')
333