1{-# OPTIONS_HADDOCK hide #-}
2-----------------------------------------------------------------------------
3-- |
4-- Module      :  Language.Haskell.Exts.Annotated.InternalLexer
5-- Copyright   :  (c) The GHC Team, 1997-2000
6--                (c) Niklas Broberg, 2004-2009
7-- License     :  BSD-style (see the file LICENSE.txt)
8--
9-- Maintainer  :  Niklas Broberg, d00nibro@chalmers.se
10-- Stability   :  stable
11-- Portability :  portable
12--
13-- Lexer for Haskell, with some extensions.
14--
15-----------------------------------------------------------------------------
16
17-- ToDo: Introduce different tokens for decimal, octal and hexadecimal (?)
18-- ToDo: FloatTok should have three parts (integer part, fraction, exponent) (?)
19-- ToDo: Use a lexical analyser generator (lx?)
20
21module Language.Haskell.Exts.InternalLexer (Token(..), showToken, lexer, topLexer) where
22
23import Language.Haskell.Exts.ParseMonad
24import Language.Haskell.Exts.SrcLoc hiding (loc)
25import Language.Haskell.Exts.Comments
26import Language.Haskell.Exts.Extension
27import Language.Haskell.Exts.ExtScheme
28
29import Prelude hiding (id, exponent)
30import Data.Char
31import Data.Ratio
32import Data.List (intercalate, isPrefixOf)
33import Control.Monad (when)
34
35-- import Debug.Trace (trace)
36
37data Token
38        = VarId String
39        | LabelVarId String
40        | QVarId (String,String)
41        | IDupVarId (String)        -- duplicable implicit parameter
42        | ILinVarId (String)        -- linear implicit parameter
43        | ConId String
44        | QConId (String,String)
45        | DVarId [String]       -- to enable varid's with '-' in them
46        | VarSym String
47        | ConSym String
48        | QVarSym (String,String)
49        | QConSym (String,String)
50        | IntTok (Integer, String)
51        | FloatTok (Rational, String)
52        | Character (Char, String)
53        | StringTok (String, String)
54        | IntTokHash (Integer, String)        -- 1#
55        | WordTokHash (Integer, String)       -- 1##
56        | FloatTokHash (Rational, String)     -- 1.0#
57        | DoubleTokHash (Rational, String)    -- 1.0##
58        | CharacterHash (Char, String)        -- c#
59        | StringHash (String, String)         -- "Hello world!"#
60
61-- Symbols
62
63        | LeftParen
64        | RightParen
65        | LeftHashParen
66        | RightHashParen
67        | SemiColon
68        | LeftCurly
69        | RightCurly
70        | VRightCurly           -- a virtual close brace
71        | LeftSquare
72        | RightSquare
73        | ParArrayLeftSquare -- [:
74        | ParArrayRightSquare -- :]
75        | Comma
76        | Underscore
77        | BackQuote
78
79-- Reserved operators
80
81        | Dot           -- reserved for use with 'forall x . x'
82        | DotDot
83        | Colon
84        | QuoteColon
85        | DoubleColon
86        | Equals
87        | Backslash
88        | Bar
89        | LeftArrow
90        | RightArrow
91        | At
92        | TApp -- '@' but have to check for preceeding whitespace
93        | Tilde
94        | DoubleArrow
95        | Minus
96        | Exclamation
97        | Star
98        | LeftArrowTail         -- -<
99        | RightArrowTail        -- >-
100        | LeftDblArrowTail      -- -<<
101        | RightDblArrowTail     -- >>-
102        | OpenArrowBracket      -- (|
103        | CloseArrowBracket     -- |)
104
105-- Template Haskell
106        | THExpQuote            -- [| or [e|
107        | THTExpQuote           -- [|| or [e||
108        | THPatQuote            -- [p|
109        | THDecQuote            -- [d|
110        | THTypQuote            -- [t|
111        | THCloseQuote          -- |]
112        | THTCloseQuote         -- ||]
113        | THIdEscape (String)   -- dollar x
114        | THParenEscape         -- dollar (
115        | THTIdEscape String    -- dollar dollar x
116        | THTParenEscape        -- double dollar (
117        | THVarQuote            -- 'x (but without the x)
118        | THTyQuote             -- ''T (but without the T)
119        | THQuasiQuote (String,String)  -- [$...|...]
120
121-- HaRP
122        | RPGuardOpen       -- (|
123        | RPGuardClose      -- |)
124        | RPCAt             -- @:
125
126-- Hsx
127        | XCodeTagOpen      -- <%
128        | XCodeTagClose     -- %>
129        | XStdTagOpen       -- <
130        | XStdTagClose      -- >
131        | XCloseTagOpen     -- </
132        | XEmptyTagClose    -- />
133        | XChildTagOpen     -- <%> (note that close doesn't exist, it's XCloseTagOpen followed by XCodeTagClose)
134        | XPCDATA String
135        | XRPatOpen             -- <[
136        | XRPatClose            -- ]>
137
138-- Pragmas
139
140        | PragmaEnd                     -- #-}
141        | RULES
142        | INLINE Bool
143        | INLINE_CONLIKE
144        | SPECIALISE
145        | SPECIALISE_INLINE Bool
146        | SOURCE
147        | DEPRECATED
148        | WARNING
149        | SCC
150        | GENERATED
151        | CORE
152        | UNPACK
153        | NOUNPACK
154        | OPTIONS (Maybe String,String)
155--        | CFILES  String
156--        | INCLUDE String
157        | LANGUAGE
158        | ANN
159        | MINIMAL
160        | NO_OVERLAP
161        | OVERLAP
162        | OVERLAPPING
163        | OVERLAPPABLE
164        | OVERLAPS
165        | INCOHERENT
166        | COMPLETE
167
168-- Reserved Ids
169
170        | KW_As
171        | KW_By         -- transform list comprehensions
172        | KW_Case
173        | KW_Class
174        | KW_Data
175        | KW_Default
176        | KW_Deriving
177        | KW_Do
178        | KW_MDo
179        | KW_Else
180        | KW_Family     -- indexed type families
181        | KW_Forall     -- universal/existential types
182        | KW_Group      -- transform list comprehensions
183        | KW_Hiding
184        | KW_If
185        | KW_Import
186        | KW_In
187        | KW_Infix
188        | KW_InfixL
189        | KW_InfixR
190        | KW_Instance
191        | KW_Let
192        | KW_Module
193        | KW_NewType
194        | KW_Of
195        | KW_Proc       -- arrows
196        | KW_Rec        -- arrows
197        | KW_Role
198        | KW_Then
199        | KW_Type
200        | KW_Using      -- transform list comprehensions
201        | KW_Where
202        | KW_Qualified
203        | KW_Pattern
204        | KW_Stock
205        | KW_Anyclass
206        | KW_Via
207
208                -- FFI
209        | KW_Foreign
210        | KW_Export
211        | KW_Safe
212        | KW_Unsafe
213        | KW_Threadsafe
214        | KW_Interruptible
215        | KW_StdCall
216        | KW_CCall
217        | KW_CPlusPlus
218        | KW_DotNet
219        | KW_Jvm
220        | KW_Js
221        | KW_JavaScript
222        | KW_CApi
223
224        | EOF
225        deriving (Eq,Show)
226
227reserved_ops :: [(String,(Token, Maybe ExtScheme))]
228reserved_ops = [
229 ( "..", (DotDot,       Nothing) ),
230 ( ":",  (Colon,        Nothing) ),
231 ( "::", (DoubleColon,  Nothing) ),
232 ( "=",  (Equals,       Nothing) ),
233 ( "\\", (Backslash,    Nothing) ),
234 ( "|",  (Bar,          Nothing) ),
235 ( "<-", (LeftArrow,    Nothing) ),
236 ( "->", (RightArrow,   Nothing) ),
237 ( "@",  (At,           Nothing) ),
238 ( "@:", (RPCAt,        Just (Any [RegularPatterns])) ),
239 ( "~",  (Tilde,        Nothing) ),
240 ( "=>", (DoubleArrow,  Nothing) ),
241 ( "*",  (Star,         Just (Any [KindSignatures])) ),
242 -- Parallel arrays
243 ( "[:", (ParArrayLeftSquare,   Just (Any [ParallelArrays])) ),
244 ( ":]", (ParArrayRightSquare,  Just (Any [ParallelArrays])) ),
245 -- Arrows notation
246 ( "-<",  (LeftArrowTail,       Just (Any [Arrows])) ),
247 ( ">-",  (RightArrowTail,      Just (Any [Arrows])) ),
248 ( "-<<", (LeftDblArrowTail,    Just (Any [Arrows])) ),
249 ( ">>-", (RightDblArrowTail,   Just (Any [Arrows])) ),
250 -- Unicode notation
251 ( "\x2190",    (LeftArrow,     Just (Any  [UnicodeSyntax])) ),
252 ( "\x2192",    (RightArrow,    Just (Any  [UnicodeSyntax])) ),
253 ( "\x21d2",    (DoubleArrow,   Just (Any  [UnicodeSyntax])) ),
254 ( "\x2237",    (DoubleColon,   Just (Any  [UnicodeSyntax])) ),
255 ( "\x2919",    (LeftArrowTail,     Just (All [UnicodeSyntax, Arrows])) ),
256 ( "\x291a",    (RightArrowTail,    Just (All [UnicodeSyntax, Arrows])) ),
257 ( "\x291b",    (LeftDblArrowTail,  Just (All [UnicodeSyntax, Arrows])) ),
258 ( "\x291c",    (RightDblArrowTail, Just (All [UnicodeSyntax, Arrows])) ),
259 ( "\x2605",    (Star,              Just (All [UnicodeSyntax, KindSignatures])) ),
260 ( "\x2200",    (KW_Forall,         Just (All [UnicodeSyntax, ExplicitForAll])) )
261 ]
262
263special_varops :: [(String,(Token, Maybe ExtScheme))]
264special_varops = [
265 -- the dot is only a special symbol together with forall, but can still be used as function composition
266 ( ".",  (Dot,          Just (Any [ExplicitForAll, ExistentialQuantification])) ),
267 ( "-",  (Minus,        Nothing) ),
268 ( "!",  (Exclamation,  Nothing) )
269 ]
270
271reserved_ids :: [(String,(Token, Maybe ExtScheme))]
272reserved_ids = [
273 ( "_",         (Underscore,    Nothing) ),
274 ( "by",        (KW_By,         Just (Any [TransformListComp])) ),
275 ( "case",      (KW_Case,       Nothing) ),
276 ( "class",     (KW_Class,      Nothing) ),
277 ( "data",      (KW_Data,       Nothing) ),
278 ( "default",   (KW_Default,    Nothing) ),
279 ( "deriving",  (KW_Deriving,   Nothing) ),
280 ( "do",        (KW_Do,         Nothing) ),
281 ( "else",      (KW_Else,       Nothing) ),
282 ( "family",    (KW_Family,     Just (Any [TypeFamilies])) ),        -- indexed type families
283 ( "forall",    (KW_Forall,     Just (Any [ExplicitForAll, ExistentialQuantification])) ),    -- universal/existential quantification
284 ( "group",     (KW_Group,      Just (Any [TransformListComp])) ),
285 ( "if",        (KW_If,         Nothing) ),
286 ( "import",    (KW_Import,     Nothing) ),
287 ( "in",        (KW_In,         Nothing) ),
288 ( "infix",     (KW_Infix,      Nothing) ),
289 ( "infixl",    (KW_InfixL,     Nothing) ),
290 ( "infixr",    (KW_InfixR,     Nothing) ),
291 ( "instance",  (KW_Instance,   Nothing) ),
292 ( "let",       (KW_Let,        Nothing) ),
293 ( "mdo",       (KW_MDo,        Just (Any [RecursiveDo])) ),
294 ( "module",    (KW_Module,     Nothing) ),
295 ( "newtype",   (KW_NewType,    Nothing) ),
296 ( "of",        (KW_Of,         Nothing) ),
297 ( "proc",      (KW_Proc,       Just (Any [Arrows])) ),
298 ( "rec",       (KW_Rec,        Just (Any [Arrows, RecursiveDo, DoRec])) ),
299 ( "then",      (KW_Then,       Nothing) ),
300 ( "type",      (KW_Type,       Nothing) ),
301 ( "using",     (KW_Using,      Just (Any [TransformListComp])) ),
302 ( "where",     (KW_Where,      Nothing) ),
303 ( "role",      (KW_Role,       Just (Any [RoleAnnotations]))),
304 ( "pattern",   (KW_Pattern,    Just (Any [PatternSynonyms]))),
305 ( "stock",     (KW_Stock,      Just (Any [DerivingStrategies]))),
306 ( "anyclass",  (KW_Anyclass,   Just (Any [DerivingStrategies]))),
307 ( "via",       (KW_Via,        Just (Any [DerivingVia]))),
308
309-- FFI
310 ( "foreign",   (KW_Foreign,    Just (Any [ForeignFunctionInterface])) )
311 ]
312
313
314special_varids :: [(String,(Token, Maybe ExtScheme))]
315special_varids = [
316 ( "as",        (KW_As,         Nothing) ),
317 ( "qualified", (KW_Qualified,  Nothing) ),
318 ( "hiding",    (KW_Hiding,     Nothing) ),
319
320-- FFI
321 ( "export",        (KW_Export,        Just (Any [ForeignFunctionInterface])) ),
322 ( "safe",          (KW_Safe,          Just (Any [ForeignFunctionInterface, SafeImports, Safe, Trustworthy])) ),
323 ( "unsafe",        (KW_Unsafe,        Just (Any [ForeignFunctionInterface])) ),
324 ( "threadsafe",    (KW_Threadsafe,    Just (Any [ForeignFunctionInterface])) ),
325 ( "interruptible", (KW_Interruptible, Just (Any [InterruptibleFFI])) ),
326 ( "stdcall",       (KW_StdCall,       Just (Any [ForeignFunctionInterface])) ),
327 ( "ccall",         (KW_CCall,         Just (Any [ForeignFunctionInterface])) ),
328 ( "cplusplus",     (KW_CPlusPlus,     Just (Any [ForeignFunctionInterface])) ),
329 ( "dotnet",        (KW_DotNet,        Just (Any [ForeignFunctionInterface])) ),
330 ( "jvm",           (KW_Jvm,           Just (Any [ForeignFunctionInterface])) ),
331 ( "js",            (KW_Js,            Just (Any [ForeignFunctionInterface])) ),
332 ( "javascript",    (KW_JavaScript,    Just (Any [ForeignFunctionInterface])) ),
333 ( "capi",          (KW_CApi,          Just (Any [CApiFFI])) )
334 ]
335
336pragmas :: [(String,Token)]
337pragmas = [
338 ( "rules",             RULES           ),
339 ( "inline",            INLINE True     ),
340 ( "noinline",          INLINE False    ),
341 ( "notinline",         INLINE False    ),
342 ( "specialise",        SPECIALISE      ),
343 ( "specialize",        SPECIALISE      ),
344 ( "source",            SOURCE          ),
345 ( "deprecated",        DEPRECATED      ),
346 ( "warning",           WARNING         ),
347 ( "ann",               ANN             ),
348 ( "scc",               SCC             ),
349 ( "generated",         GENERATED       ),
350 ( "core",              CORE            ),
351 ( "unpack",            UNPACK          ),
352 ( "nounpack",          NOUNPACK        ),
353 ( "language",          LANGUAGE        ),
354 ( "minimal",           MINIMAL         ),
355 ( "no_overlap",        NO_OVERLAP      ),
356 ( "overlap",           OVERLAP         ),
357 ( "overlaps",          OVERLAPS        ),
358 ( "overlapping",       OVERLAPPING     ),
359 ( "overlappable",      OVERLAPPABLE    ),
360 ( "incoherent",        INCOHERENT      ),
361 ( "complete",          COMPLETE      ),
362 ( "options",           OPTIONS undefined ) -- we'll tweak it before use - promise!
363-- ( "cfiles",            CFILES  undefined ), -- same here...
364-- ( "include",           INCLUDE undefined )  -- ...and here!
365 ]
366
367isIdent, isHSymbol, isPragmaChar :: Char -> Bool
368isIdent   c = isAlphaNum c || c == '\'' || c == '_'
369
370isHSymbol c = c `elem` ":!#%&*./?@\\-" || ((isSymbol c || isPunctuation c) && not (c `elem` "(),;[]`{}_\"'"))
371
372isPragmaChar c = isAlphaNum c || c == '_'
373
374isIdentStart :: Char -> Bool
375isIdentStart c = isAlpha c && not (isUpper c) || c == '_'
376
377
378-- Used in the lexing of type applications
379-- Why is it like this? I don't know exactly but this is how it is in
380-- GHC's parser.
381isOpSymbol :: Char -> Bool
382isOpSymbol c = c `elem` "!#$%&*+./<=>?@\\^|-~"
383
384-- | Checks whether the character would be legal in some position of a qvar.
385--   Means that '..' and "AAA" will pass the test.
386isPossiblyQvar :: Char -> Bool
387isPossiblyQvar c = isIdent (toLower c) || c == '.'
388
389matchChar :: Char -> String -> Lex a ()
390matchChar c msg = do
391    s <- getInput
392    if null s || head s /= c then fail msg else discard 1
393
394-- The top-level lexer.
395-- We need to know whether we are at the beginning of the line to decide
396-- whether to insert layout tokens.
397
398lexer :: (Loc Token -> P a) -> P a
399lexer = runL topLexer
400
401topLexer :: Lex a (Loc Token)
402topLexer = do
403    b <- pullCtxtFlag
404    if b then -- trace (show cf ++ ": " ++ show VRightCurly) $
405              -- the lex context state flags that we must do an empty {} - UGLY
406              setBOL >> getSrcLocL >>= \l -> return (Loc (mkSrcSpan l l) VRightCurly)
407     else do
408        bol <- checkBOL
409        (bol', ws) <- lexWhiteSpace bol
410        -- take care of whitespace in PCDATA
411        ec <- getExtContext
412        case ec of
413         -- if there was no linebreak, and we are lexing PCDATA,
414         -- then we want to care about the whitespace.
415         -- We don't bother to test for XmlSyntax, since we
416         -- couldn't end up in ChildCtxt otherwise.
417         Just ChildCtxt | not bol' && ws -> getSrcLocL >>= \l -> return $ Loc (mkSrcSpan l l) $ XPCDATA " "
418         _ -> do startToken
419                 sl <- getSrcLocL
420                 t <- if bol' then lexBOL    -- >>= \t -> trace ("BOL: " ++ show t) (return t)
421                              else lexToken  -- >>= \t -> trace (show t) (return t)
422                 el <- getSrcLocL
423                 return $ Loc (mkSrcSpan sl el) t
424
425lexWhiteSpace :: Bool -> Lex a (Bool, Bool)
426lexWhiteSpace bol = do
427    s <- getInput
428    ignL <- ignoreLinePragmasL
429    case s of
430        -- If we find a recognised pragma, we don't want to treat it as a comment.
431        '{':'-':'#':rest | isRecognisedPragma rest -> return (bol, False)
432                         | isLinePragma rest && not ignL -> do
433                            (l, fn) <- lexLinePragma
434                            setSrcLineL l
435                            setLineFilenameL fn
436                            lexWhiteSpace True
437        '{':'-':_ -> do
438            loc <- getSrcLocL
439            discard 2
440            (bol1, c) <- lexNestedComment bol ""
441            loc2 <- getSrcLocL
442            pushComment $ Comment True (mkSrcSpan loc loc2) (reverse c)
443            (bol2, _) <- lexWhiteSpace bol1
444            return (bol2, True)
445        '-':'-':s1 | all (== '-') (takeWhile isHSymbol s1) -> do
446            loc    <- getSrcLocL
447            discard 2
448            dashes <- lexWhile (== '-')
449            rest   <- lexWhile (/= '\n')
450            s' <- getInput
451            loc2 <- getSrcLocL
452            let com = Comment False (mkSrcSpan loc loc2) $ dashes ++ rest
453            case s' of
454                [] -> pushComment com >> return (False, True)
455                _ -> do
456                    pushComment com
457                    lexNewline
458                    lexWhiteSpace_ True
459                    return (True, True)
460        '\n':_ -> do
461            lexNewline
462            lexWhiteSpace_ True
463            return (True, True)
464        '\t':_ -> do
465            lexTab
466            (bol', _) <- lexWhiteSpace bol
467            return (bol', True)
468        c:_ | isSpace c -> do
469            discard 1
470            (bol', _) <- lexWhiteSpace bol
471            return (bol', True)
472        _ -> return (bol, False)
473
474-- | lexWhiteSpace without the return value.
475lexWhiteSpace_ :: Bool -> Lex a ()
476lexWhiteSpace_ bol =  do _ <- lexWhiteSpace bol
477                         return ()
478
479isRecognisedPragma, isLinePragma :: String -> Bool
480isRecognisedPragma str = let pragma = takeWhile isPragmaChar . dropWhile isSpace $ str
481                          in case lookupKnownPragma pragma of
482                              Nothing -> False
483                              _       -> True
484
485isLinePragma str = let pragma = map toLower . takeWhile isAlphaNum . dropWhile isSpace $ str
486                    in case pragma of
487                        "line"  -> True
488                        _       -> False
489
490lexLinePragma :: Lex a (Int, String)
491lexLinePragma = do
492    discard 3   -- {-#
493    lexWhile_ isSpace
494    discard 4   -- LINE
495    lexWhile_ isSpace
496    i <- lexWhile isDigit
497    when (null i) $ fail "Improperly formatted LINE pragma"
498    lexWhile_ isSpace
499    matchChar '"' "Improperly formatted LINE pragma"
500    fn <- lexWhile (/= '"')
501    matchChar '"' "Impossible - lexLinePragma"
502    lexWhile_ isSpace
503    mapM_ (flip matchChar "Improperly formatted LINE pragma") "#-}"
504    lexNewline
505    return (read i, fn)
506
507lexNestedComment :: Bool -> String -> Lex a (Bool, String)
508lexNestedComment bol str = do
509    s <- getInput
510    case s of
511        '-':'}':_ -> discard 2 >> return (bol, str)
512        '{':'-':_ -> do
513            discard 2
514            (bol', c) <- lexNestedComment bol ("-{" ++ str) -- rest of the subcomment
515            lexNestedComment bol' ("}-" ++ c  ) -- rest of this comment
516        '\t':_    -> lexTab >> lexNestedComment bol ('\t':str)
517        '\n':_    -> lexNewline >> lexNestedComment True ('\n':str)
518        c:_       -> discard 1 >> lexNestedComment bol (c:str)
519        []        -> fail "Unterminated nested comment"
520
521-- When we are lexing the first token of a line, check whether we need to
522-- insert virtual semicolons or close braces due to layout.
523
524lexBOL :: Lex a Token
525lexBOL = do
526    pos <- getOffside
527    -- trace ("Off: " ++ (show pos)) $ do
528    case pos of
529        LT -> do
530                -- trace "layout: inserting '}'\n" $
531            -- Set col to 0, indicating that we're still at the
532            -- beginning of the line, in case we need a semi-colon too.
533            -- Also pop the context here, so that we don't insert
534            -- another close brace before the parser can pop it.
535            setBOL
536            popContextL "lexBOL"
537            return VRightCurly
538        EQ ->
539            -- trace "layout: inserting ';'\n" $
540            return SemiColon
541        GT -> lexToken
542
543lexToken :: Lex a Token
544lexToken = do
545    ec <- getExtContext
546    -- we don't bother to check XmlSyntax since we couldn't
547    -- have ended up in a non-Nothing context if it wasn't
548    -- enabled.
549    case ec of
550     Just HarpCtxt     -> lexHarpToken
551     Just TagCtxt      -> lexTagCtxt
552     Just CloseTagCtxt -> lexCloseTagCtxt
553     Just ChildCtxt    -> lexChildCtxt
554     Just CodeTagCtxt  -> lexCodeTagCtxt
555     _         -> lexStdToken
556
557
558lexChildCtxt :: Lex a Token
559lexChildCtxt = do
560    -- if we ever end up here, then XmlSyntax must be on.
561    s <- getInput
562    case s of
563        '<':'%':'>':_ -> do discard 3
564                            pushExtContextL ChildCtxt
565                            return XChildTagOpen
566        '<':'%':_ -> do discard 2
567                        pushExtContextL CodeTagCtxt
568                        return XCodeTagOpen
569        '<':'/':_ -> do discard 2
570                        popExtContextL "lexChildCtxt"
571                        pushExtContextL CloseTagCtxt
572                        return XCloseTagOpen
573        '<':'[':_ -> do discard 2
574                        pushExtContextL HarpCtxt
575                        return XRPatOpen
576        '<':_     -> do discard 1
577                        pushExtContextL TagCtxt
578                        return XStdTagOpen
579        _     -> lexPCDATA
580
581
582lexPCDATA :: Lex a Token
583lexPCDATA = do
584    -- if we ever end up here, then XmlSyntax must be on.
585    s <- getInput
586    case s of
587        [] -> return EOF
588        _  -> case s of
589            '\n':_ -> do
590                x <- lexNewline >> lexPCDATA
591                case x of
592                 XPCDATA p -> return $ XPCDATA $ '\n':p
593                 EOF -> return EOF
594                 _ -> fail $ "lexPCDATA: unexpected token: " ++ show x
595            '<':_ -> return $ XPCDATA ""
596            _ -> do let pcd = takeWhile (\c -> c `notElem` "<\n") s
597                        l = length pcd
598                    discard l
599                    x <- lexPCDATA
600                    case x of
601                     XPCDATA pcd' -> return $ XPCDATA $ pcd ++ pcd'
602                     EOF -> return EOF
603                     _ -> fail $ "lexPCDATA: unexpected token: " ++ show x
604
605
606lexCodeTagCtxt :: Lex a Token
607lexCodeTagCtxt = do
608    -- if we ever end up here, then XmlSyntax must be on.
609    s <- getInput
610    case s of
611        '%':'>':_ -> do discard 2
612                        popExtContextL "lexCodeTagContext"
613                        return XCodeTagClose
614        _     -> lexStdToken
615
616lexCloseTagCtxt :: Lex a Token
617lexCloseTagCtxt = do
618    -- if we ever end up here, then XmlSyntax must be on.
619    s <- getInput
620    case s of
621        '%':'>':_ -> do discard 2
622                        popExtContextL "lexCloseTagCtxt"
623                        return XCodeTagClose
624        '>':_     -> do discard 1
625                        popExtContextL "lexCloseTagCtxt"
626                        return XStdTagClose
627        _     -> lexStdToken
628
629lexTagCtxt :: Lex a Token
630lexTagCtxt = do
631    -- if we ever end up here, then XmlSyntax must be on.
632    s <- getInput
633    case s of
634        '/':'>':_ -> do discard 2
635                        popExtContextL "lexTagCtxt: Empty tag"
636                        return XEmptyTagClose
637        '>':_     -> do discard 1
638                        popExtContextL "lexTagCtxt: Standard tag"
639                        pushExtContextL ChildCtxt
640                        return XStdTagClose
641        _     -> lexStdToken
642
643lexHarpToken :: Lex a Token
644lexHarpToken = do
645    -- if we ever end up here, then RegularPatterns must be on.
646    s <- getInput
647    case s of
648        ']':'>':_ -> do discard 2
649                        popExtContextL "lexHarpToken"
650                        return XRPatClose
651        _     -> lexStdToken
652
653lexStdToken :: Lex a Token
654lexStdToken = do
655    s <- getInput
656    exts <- getExtensionsL
657    let intHash = lexHash IntTok IntTokHash (Right WordTokHash)
658    case s of
659        [] -> return EOF
660
661        '0':c:d:_ | toLower c == 'o' && isOctDigit d -> do
662                        discard 2
663                        (n, str) <- lexOctal
664                        con <- intHash
665                        return (con (n, '0':c:str))
666                  | toLower c == 'b' && isBinDigit d && BinaryLiterals `elem` exts -> do
667                        discard 2
668                        (n, str) <- lexBinary
669                        con <- intHash
670                        return (con (n, '0':c:str))
671                  | toLower c == 'x' && isHexDigit d -> do
672                        discard 2
673                        (n, str) <- lexHexadecimal
674                        con <- intHash
675                        return (con (n, '0':c:str))
676
677        -- implicit parameters
678        '?':c:_ | isIdentStart c && ImplicitParams `elem` exts -> do
679                        discard 1
680                        id <- lexWhile isIdent
681                        return $ IDupVarId id
682
683        '%':c:_ | isIdentStart c && ImplicitParams `elem` exts -> do
684                        discard 1
685                        id <- lexWhile isIdent
686                        return $ ILinVarId id
687        -- end implicit parameters
688
689        -- harp
690        '(':'|':c:_ | RegularPatterns `elem` exts && not (isHSymbol c) ->
691                        discard 2 >> return RPGuardOpen
692                    | Arrows `elem` exts && not (isHSymbol c) ->
693                        discard 2 >> return OpenArrowBracket
694        '|':')':_ | RegularPatterns `elem` exts -> discard 2 >> return RPGuardClose
695                  | Arrows `elem` exts -> discard 2 >> return CloseArrowBracket
696        {- This is handled by the reserved_ops above.
697        '@':':':_ | RegularPatterns `elem` exts ->
698                     do discard 2
699                        return RPCAt -}
700
701
702        -- template haskell
703        '[':'|':'|':_ | TemplateHaskell `elem` exts -> do
704                discard 3
705                return THTExpQuote
706
707        '[':'e':'|':'|':_ | TemplateHaskell `elem` exts -> do
708                discard 4
709                return THTExpQuote
710
711        '[':'|':_ | TemplateHaskell `elem` exts -> do
712                discard 2
713                return THExpQuote
714
715        '[':c:'|':_ | c == 'e' && TemplateHaskell `elem` exts -> do
716                        discard 3
717                        return THExpQuote
718                    | c == 'p' && TemplateHaskell `elem` exts -> do
719                        discard 3
720                        return THPatQuote
721                    | c == 'd' && TemplateHaskell `elem` exts -> do
722                        discard 3
723                        return THDecQuote
724                    | c == 't' && TemplateHaskell `elem` exts -> do
725                        discard 3
726                        return THTypQuote
727        '[':'$':c:_ | isIdentStart c && QuasiQuotes `elem` exts ->
728                        discard 2 >> lexQuasiQuote c
729
730        '[':c:s' | isIdentStart c && QuasiQuotes `elem` exts && case dropWhile isIdent s' of { '|':_ -> True;_->False} ->
731                        discard 1 >> lexQuasiQuote c
732                 | isUpper c && QuasiQuotes `elem` exts && case dropWhile isPossiblyQvar s' of { '|':_ -> True;_->False} ->
733                        discard 1 >> lexQuasiQuote c
734
735        '|':'|':']':_ | TemplateHaskell `elem` exts -> do
736                        discard 3
737                        return THTCloseQuote
738        '|':']':_ | TemplateHaskell `elem` exts -> do
739                        discard 2
740                        return THCloseQuote
741
742        '$':c1:c2:_ | isIdentStart c1 && TemplateHaskell `elem` exts -> do
743                        discard 1
744                        id <- lexWhile isIdent
745                        return $ THIdEscape id
746                    | c1 == '(' && TemplateHaskell `elem` exts -> do
747                        discard 2
748                        return THParenEscape
749                    | c1 == '$' && isIdentStart c2 && TemplateHaskell `elem` exts -> do
750                        discard 2
751                        id <- lexWhile isIdent
752                        return $ THTIdEscape id
753                    | c1 == '$' && c2 == '(' && TemplateHaskell `elem` exts -> do
754                        discard 3
755                        return THTParenEscape
756        -- end template haskell
757
758        -- hsx
759        '<':'%':c:_ | XmlSyntax `elem` exts ->
760                        case c of
761                         '>' -> do discard 3
762                                   pushExtContextL ChildCtxt
763                                   return XChildTagOpen
764                         _   -> do discard 2
765                                   pushExtContextL CodeTagCtxt
766                                   return XCodeTagOpen
767        '<':c:_ | isAlpha c && XmlSyntax `elem` exts -> do
768                        discard 1
769                        pushExtContextL TagCtxt
770                        return XStdTagOpen
771        -- end hsx
772
773        '(':'#':c:_ | unboxed exts && not (isHSymbol c) -> discard 2 >> return LeftHashParen
774
775        '#':')':_   | unboxed exts -> discard 2 >> return RightHashParen
776
777        -- pragmas
778
779        '{':'-':'#':_ -> saveExtensionsL >> discard 3 >> lexPragmaStart
780
781        '#':'-':'}':_ -> restoreExtensionsL >> discard 3 >> return PragmaEnd
782
783        -- Parallel arrays
784
785        '[':':':_ | ParallelArrays `elem` exts -> discard 2 >> return ParArrayLeftSquare
786
787        ':':']':_ | ParallelArrays `elem` exts -> discard 2 >> return ParArrayRightSquare
788
789        -- Lexed seperately to deal with visible type applciation
790
791        '@':c:_ | TypeApplications `elem` exts
792                   -- Operator starting with an '@'
793                   && not (isOpSymbol c) -> do
794                                                lc <- getLastChar
795                                                if isIdent lc
796                                                  then discard 1 >> return At
797                                                  else discard 1 >> return TApp
798
799        '#':c:_ | OverloadedLabels `elem` exts
800                   && isIdentStart c -> do
801                                                  discard 1
802                                                  [ident] <- lexIdents
803                                                  return $ LabelVarId ident
804
805
806        c:_ | isDigit c -> lexDecimalOrFloat
807
808            | isUpper c -> lexConIdOrQual ""
809
810            | isIdentStart c -> do
811                    idents <- lexIdents
812                    case idents of
813                     [ident] -> case lookup ident (reserved_ids ++ special_varids) of
814                                 Just (keyword, scheme) ->
815                                    -- check if an extension keyword is enabled
816                                    if isEnabled scheme exts
817                                     then flagKW keyword >> return keyword
818                                     else return $ VarId ident
819                                 Nothing -> return $ VarId ident
820                     _ -> return $ DVarId idents
821
822            | isHSymbol c -> do
823                    sym <- lexWhile isHSymbol
824                    return $ case lookup sym (reserved_ops ++ special_varops) of
825                              Just (t , scheme) ->
826                                -- check if an extension op is enabled
827                                if isEnabled scheme exts
828                                 then t
829                                 else case c of
830                                        ':' -> ConSym sym
831                                        _   -> VarSym sym
832                              Nothing -> case c of
833                                          ':' -> ConSym sym
834                                          _   -> VarSym sym
835
836            | otherwise -> do
837                    discard 1
838                    case c of
839
840                        -- First the special symbols
841                        '(' ->  return LeftParen
842                        ')' ->  return RightParen
843                        ',' ->  return Comma
844                        ';' ->  return SemiColon
845                        '[' ->  return LeftSquare
846                        ']' ->  return RightSquare
847                        '`' ->  return BackQuote
848                        '{' -> do
849                            pushContextL NoLayout
850                            return LeftCurly
851                        '}' -> do
852                            popContextL "lexStdToken"
853                            return RightCurly
854
855                        '\'' -> lexCharacter
856                        '"' ->  lexString
857
858                        _ ->    fail ("Illegal character \'" ++ show c ++ "\'\n")
859
860      where lexIdents :: Lex a [String]
861            lexIdents = do
862                ident <- lexWhile isIdent
863                s <- getInput
864                exts <- getExtensionsL
865                case s of
866                 -- This is the only way we can get more than one ident in the list
867                 -- and it requires XmlSyntax to be on.
868                 '-':c:_ | XmlSyntax `elem` exts && isAlpha c -> do
869                        discard 1
870                        idents <- lexIdents
871                        return $ ident : idents
872                 '#':_ | MagicHash `elem` exts -> do
873                        hashes <- lexWhile (== '#')
874                        return [ident ++ hashes]
875                 _ -> return [ident]
876
877            lexQuasiQuote :: Char -> Lex a Token
878            lexQuasiQuote c = do
879                -- We've seen and dropped [$ already
880                ident <- lexQuoter
881                matchChar '|' "Malformed quasi-quote quoter"
882                body <- lexQQBody
883                return $ THQuasiQuote (ident, body)
884                  where lexQuoter
885                         | isIdentStart c = lexWhile isIdent
886                         | otherwise = do
887                            qualThing <- lexConIdOrQual ""
888                            case qualThing of
889                                QVarId (s1,s2) -> return $ s1 ++ '.':s2
890                                QVarSym (s1, s2) -> return $ s1 ++ '.':s2
891                                _                -> fail "Malformed quasi-quote quoter"
892
893            lexQQBody :: Lex a String
894            lexQQBody = do
895                s <- getInput
896                case s of
897                  '\\':']':_ -> do discard 2
898                                   str <- lexQQBody
899                                   return (']':str)
900                  '\\':'|':_ -> do discard 2
901                                   str <- lexQQBody
902                                   return ('|':str)
903                  '|':']':_  -> discard 2 >> return ""
904                  '|':_ -> do discard 1
905                              str <- lexQQBody
906                              return ('|':str)
907                  ']':_ -> do discard 1
908                              str <- lexQQBody
909                              return (']':str)
910                  '\\':_ -> do discard 1
911                               str <- lexQQBody
912                               return ('\\':str)
913                  '\n':_ -> do lexNewline
914                               str <- lexQQBody
915                               return ('\n':str)
916                  []     -> fail "Unexpected end of input while lexing quasi-quoter"
917                  _ -> do str <- lexWhile (not . (`elem` "\\|\n"))
918                          rest <- lexQQBody
919                          return (str++rest)
920
921unboxed :: [KnownExtension] -> Bool
922unboxed exts = UnboxedSums `elem` exts || UnboxedTuples `elem` exts
923
924-- Underscores are used in some pragmas. Options pragmas are a special case
925-- with our representation: the thing after the underscore is a parameter.
926-- Strip off the parameters to option pragmas by hand here, everything else
927-- sits in the pragmas map.
928lookupKnownPragma :: String -> Maybe Token
929lookupKnownPragma s =
930    case map toLower s of
931      x | "options_" `isPrefixOf` x -> Just $ OPTIONS (Just $ drop 8 s, undefined)
932        | "options" == x            -> Just $ OPTIONS (Nothing, undefined)
933        | otherwise                 -> lookup x pragmas
934
935lexPragmaStart :: Lex a Token
936lexPragmaStart = do
937    lexWhile_ isSpace
938    pr <- lexWhile isPragmaChar
939    case lookupKnownPragma pr of
940     Just (INLINE True) -> do
941            s <- getInput
942            case map toLower s of
943             ' ':'c':'o':'n':'l':'i':'k':'e':_  -> do
944                      discard 8
945                      return INLINE_CONLIKE
946             _ -> return $ INLINE True
947     Just SPECIALISE -> do
948            s <- getInput
949            case dropWhile isSpace $ map toLower s of
950             'i':'n':'l':'i':'n':'e':_ -> do
951                      lexWhile_ isSpace
952                      discard 6
953                      return $ SPECIALISE_INLINE True
954             'n':'o':'i':'n':'l':'i':'n':'e':_ -> do
955                        lexWhile_ isSpace
956                        discard 8
957                        return $ SPECIALISE_INLINE False
958             'n':'o':'t':'i':'n':'l':'i':'n':'e':_ -> do
959                        lexWhile_ isSpace
960                        discard 9
961                        return $ SPECIALISE_INLINE False
962             _ -> return SPECIALISE
963
964     Just (OPTIONS opt) ->     -- see, I promised we'd mask out the 'undefined'
965            -- We do not want to store necessary whitespace in the datatype
966            -- but if the pragma starts with a newline then we must keep
967            -- it to differentiate the two cases.
968            let dropIfSpace (' ':xs) = xs
969                dropIfSpace xs       = xs
970             in
971              case fst opt of
972                Just opt' -> do
973                  rest <- lexRawPragma
974                  return $ OPTIONS (Just opt', dropIfSpace rest)
975                Nothing -> do
976                  s <- getInput
977                  case s of
978                    x:_ | isSpace x -> do
979                      rest <- lexRawPragma
980                      return $ OPTIONS (Nothing, dropIfSpace rest)
981                    _  -> fail "Malformed Options pragma"
982     Just RULES -> do -- Rules enable ScopedTypeVariables locally.
983            addExtensionL ScopedTypeVariables
984            return RULES
985{-     Just (CFILES _) -> do
986            rest <- lexRawPragma
987            return $ CFILES rest
988     Just (INCLUDE _) -> do
989            rest <- lexRawPragma
990            return $ INCLUDE rest -}
991     Just p ->  return p
992
993     _      -> fail "Internal error: Unrecognised recognised pragma"
994                  -- do rawStr <- lexRawPragma
995                  -- return $ PragmaUnknown (pr, rawStr) -- no support for unrecognized pragmas, treat as comment
996                  -- discard 3 -- #-}
997                  -- topLexer -- we just discard it as a comment for now and restart -}
998
999lexRawPragma :: Lex a String
1000lexRawPragma = lexRawPragmaAux
1001 where lexRawPragmaAux = do
1002        rpr <- lexWhile (/='#')
1003        s <- getInput
1004        case s of
1005         '#':'-':'}':_  -> return rpr
1006         "" -> fail "End-of-file inside pragma"
1007         _ -> do
1008            discard 1
1009            rpr' <- lexRawPragma
1010            return $ rpr ++ '#':rpr'
1011
1012lexDecimalOrFloat :: Lex a Token
1013lexDecimalOrFloat = do
1014    ds <- lexWhile isDigit
1015    rest <- getInput
1016    exts <- getExtensionsL
1017    case rest of
1018        ('.':d:_) | isDigit d -> do
1019                discard 1
1020                frac <- lexWhile isDigit
1021                let num = parseInteger 10 (ds ++ frac)
1022                    decimals = toInteger (length frac)
1023                (exponent, estr) <- do
1024                    rest2 <- getInput
1025                    case rest2 of
1026                        'e':_ -> lexExponent
1027                        'E':_ -> lexExponent
1028                        _     -> return (0,"")
1029                con <- lexHash FloatTok FloatTokHash (Right DoubleTokHash)
1030                return $ con ((num%1) * 10^^(exponent - decimals), ds ++ '.':frac ++ estr)
1031        e:_ | toLower e == 'e' -> do
1032                (exponent, estr) <- lexExponent
1033                con <- lexHash FloatTok FloatTokHash (Right DoubleTokHash)
1034                return $ con ((parseInteger 10 ds%1) * 10^^exponent, ds ++ estr)
1035        '#':'#':_ | MagicHash `elem` exts -> discard 2 >> return (WordTokHash (parseInteger 10 ds, ds))
1036        '#':_     | MagicHash `elem` exts -> discard 1 >> return (IntTokHash  (parseInteger 10 ds, ds))
1037        _         ->              return (IntTok      (parseInteger 10 ds, ds))
1038
1039    where
1040    lexExponent :: Lex a (Integer, String)
1041    lexExponent = do
1042        (e:r) <- getInput
1043        discard 1   -- 'e' or 'E'
1044        case r of
1045         '+':d:_ | isDigit d -> do
1046            discard 1
1047            (n, str) <- lexDecimal
1048            return (n, e:'+':str)
1049         '-':d:_ | isDigit d -> do
1050            discard 1
1051            (n, str) <- lexDecimal
1052            return (negate n, e:'-':str)
1053         d:_ | isDigit d -> lexDecimal >>= \(n,str) -> return (n, e:str)
1054         _ -> fail "Float with missing exponent"
1055
1056lexHash :: (b -> Token) -> (b -> Token) -> Either String (b -> Token) -> Lex a (b -> Token)
1057lexHash a b c = do
1058    exts <- getExtensionsL
1059    if MagicHash `elem` exts
1060     then do
1061        r <- getInput
1062        case r of
1063         '#':'#':_ -> case c of
1064                       Right c' -> discard 2 >> return c'
1065                       Left s  -> fail s
1066         '#':_     -> discard 1 >> return b
1067         _         ->              return a
1068     else return a
1069
1070lexConIdOrQual :: String -> Lex a Token
1071lexConIdOrQual qual = do
1072        con <- lexWhile isIdent
1073        let conid | null qual = ConId con
1074                  | otherwise = QConId (qual,con)
1075            qual' | null qual = con
1076                  | otherwise = qual ++ '.':con
1077        just_a_conid <- alternative (return conid)
1078        rest <- getInput
1079        exts <- getExtensionsL
1080        case rest of
1081          '.':c:_
1082             | isIdentStart c -> do  -- qualified varid?
1083                    discard 1
1084                    ident <- lexWhile isIdent
1085                    s <- getInput
1086                    exts' <- getExtensionsL
1087                    ident' <- case s of
1088                               '#':_ | MagicHash `elem` exts' -> discard 1 >> return (ident ++ "#")
1089                               _ -> return ident
1090                    case lookup ident' reserved_ids of
1091                       -- cannot qualify a reserved word
1092                       Just (_,scheme) | isEnabled scheme exts'  -> just_a_conid
1093                       _ -> return (QVarId (qual', ident'))
1094
1095             | isUpper c -> do      -- qualified conid?
1096                    discard 1
1097                    lexConIdOrQual qual'
1098
1099             | isHSymbol c -> do    -- qualified symbol?
1100                    discard 1
1101                    sym <- lexWhile isHSymbol
1102                    exts' <- getExtensionsL
1103                    case lookup sym reserved_ops of
1104                        -- cannot qualify a reserved operator
1105                        Just (_,scheme) | isEnabled scheme exts' -> just_a_conid
1106                        _        -> return $ case c of
1107                                              ':' -> QConSym (qual', sym)
1108                                              _   -> QVarSym (qual', sym)
1109
1110          '#':cs
1111            | null cs ||
1112              not (isHSymbol $ head cs) &&
1113              not (isIdent $ head cs) && MagicHash `elem` exts -> do
1114                discard 1
1115                case conid of
1116                 ConId con' -> return $ ConId $ con' ++ "#"
1117                 QConId (q,con') -> return $ QConId (q,con' ++ "#")
1118                 _ -> fail $ "lexConIdOrQual: unexpected token: " ++ show conid
1119          _ ->  return conid -- not a qualified thing
1120
1121lexCharacter :: Lex a Token
1122lexCharacter = do   -- We need to keep track of not only character constants but also TH 'x and ''T
1123        -- We've seen ' so far
1124        s <- getInput
1125        exts <- getExtensionsL
1126        case s of
1127         '\'':_ | TemplateHaskell `elem` exts -> discard 1 >> return THTyQuote
1128         '\\':_ -> do
1129                    (c,raw) <- lexEscape
1130                    matchQuote
1131                    con <- lexHash Character CharacterHash
1132                            (Left "Double hash not available for character literals")
1133                    return (con (c, '\\':raw))
1134         c:'\'':_ -> do
1135                    discard 2
1136                    con <- lexHash Character CharacterHash
1137                            (Left "Double hash not available for character literals")
1138                    return (con (c, [c]))
1139         _ | any (`elem` exts) [TemplateHaskell, DataKinds] -> return THVarQuote
1140         _ -> fail "Improper character constant or misplaced \'"
1141
1142    where matchQuote = matchChar '\'' "Improperly terminated character constant"
1143
1144
1145lexString :: Lex a Token
1146lexString = loop ("","")
1147    where
1148    loop (s,raw) = do
1149        r <- getInput
1150        exts <- getExtensionsL
1151        case r of
1152            '\\':'&':_ -> do
1153                    discard 2
1154                    loop (s, '&':'\\':raw)
1155            '\\':c:_ | isSpace c -> do
1156                        discard 1
1157                        wcs <- lexWhiteChars
1158                        matchChar '\\' "Illegal character in string gap"
1159                        loop (s, '\\':reverse wcs ++ '\\':raw)
1160                     | otherwise -> do
1161                        (ce, str) <- lexEscape
1162                        loop (ce:s, reverse str ++ '\\':raw)
1163            '"':'#':_ | MagicHash `elem` exts -> do
1164                        discard 2
1165                        return (StringHash (reverse s, reverse raw))
1166            '"':_ -> do
1167                discard 1
1168                return (StringTok (reverse s, reverse raw))
1169            c:_ | c /= '\n' -> do
1170                discard 1
1171                loop (c:s, c:raw)
1172            _ ->   fail "Improperly terminated string"
1173
1174    lexWhiteChars :: Lex a String
1175    lexWhiteChars = do
1176        s <- getInput
1177        case s of
1178            '\n':_ -> do
1179                    lexNewline
1180                    wcs <- lexWhiteChars
1181                    return $ '\n':wcs
1182            '\t':_ -> do
1183                    lexTab
1184                    wcs <- lexWhiteChars
1185                    return $ '\t':wcs
1186            c:_ | isSpace c -> do
1187                    discard 1
1188                    wcs <- lexWhiteChars
1189                    return $ c:wcs
1190            _ -> return ""
1191
1192lexEscape :: Lex a (Char, String)
1193lexEscape = do
1194    discard 1
1195    r <- getInput
1196    case r of
1197
1198-- Production charesc from section B.2 (Note: \& is handled by caller)
1199
1200        'a':_           -> discard 1 >> return ('\a', "a")
1201        'b':_           -> discard 1 >> return ('\b', "b")
1202        'f':_           -> discard 1 >> return ('\f', "f")
1203        'n':_           -> discard 1 >> return ('\n', "n")
1204        'r':_           -> discard 1 >> return ('\r', "r")
1205        't':_           -> discard 1 >> return ('\t', "t")
1206        'v':_           -> discard 1 >> return ('\v', "v")
1207        '\\':_          -> discard 1 >> return ('\\', "\\")
1208        '"':_           -> discard 1 >> return ('\"', "\"")
1209        '\'':_          -> discard 1 >> return ('\'', "\'")
1210
1211-- Production ascii from section B.2
1212
1213        '^':c:_         -> discard 2 >> cntrl c
1214        'N':'U':'L':_   -> discard 3 >> return ('\NUL', "NUL")
1215        'S':'O':'H':_   -> discard 3 >> return ('\SOH', "SOH")
1216        'S':'T':'X':_   -> discard 3 >> return ('\STX', "STX")
1217        'E':'T':'X':_   -> discard 3 >> return ('\ETX', "ETX")
1218        'E':'O':'T':_   -> discard 3 >> return ('\EOT', "EOT")
1219        'E':'N':'Q':_   -> discard 3 >> return ('\ENQ', "ENQ")
1220        'A':'C':'K':_   -> discard 3 >> return ('\ACK', "ACK")
1221        'B':'E':'L':_   -> discard 3 >> return ('\BEL', "BEL")
1222        'B':'S':_       -> discard 2 >> return ('\BS',  "BS")
1223        'H':'T':_       -> discard 2 >> return ('\HT',  "HT")
1224        'L':'F':_       -> discard 2 >> return ('\LF',  "LF")
1225        'V':'T':_       -> discard 2 >> return ('\VT',  "VT")
1226        'F':'F':_       -> discard 2 >> return ('\FF',  "FF")
1227        'C':'R':_       -> discard 2 >> return ('\CR',  "CR")
1228        'S':'O':_       -> discard 2 >> return ('\SO',  "SO")
1229        'S':'I':_       -> discard 2 >> return ('\SI',  "SI")
1230        'D':'L':'E':_   -> discard 3 >> return ('\DLE', "DLE")
1231        'D':'C':'1':_   -> discard 3 >> return ('\DC1', "DC1")
1232        'D':'C':'2':_   -> discard 3 >> return ('\DC2', "DC2")
1233        'D':'C':'3':_   -> discard 3 >> return ('\DC3', "DC3")
1234        'D':'C':'4':_   -> discard 3 >> return ('\DC4', "DC4")
1235        'N':'A':'K':_   -> discard 3 >> return ('\NAK', "NAK")
1236        'S':'Y':'N':_   -> discard 3 >> return ('\SYN', "SYN")
1237        'E':'T':'B':_   -> discard 3 >> return ('\ETB', "ETB")
1238        'C':'A':'N':_   -> discard 3 >> return ('\CAN', "CAN")
1239        'E':'M':_       -> discard 2 >> return ('\EM',  "EM")
1240        'S':'U':'B':_   -> discard 3 >> return ('\SUB', "SUB")
1241        'E':'S':'C':_   -> discard 3 >> return ('\ESC', "ESC")
1242        'F':'S':_       -> discard 2 >> return ('\FS',  "FS")
1243        'G':'S':_       -> discard 2 >> return ('\GS',  "GS")
1244        'R':'S':_       -> discard 2 >> return ('\RS',  "RS")
1245        'U':'S':_       -> discard 2 >> return ('\US',  "US")
1246        'S':'P':_       -> discard 2 >> return ('\SP',  "SP")
1247        'D':'E':'L':_   -> discard 3 >> return ('\DEL', "DEL")
1248
1249-- Escaped numbers
1250
1251        'o':c:_ | isOctDigit c -> do
1252                    discard 1
1253                    (n, raw) <- lexOctal
1254                    n' <- checkChar n
1255                    return (n', 'o':raw)
1256        'x':c:_ | isHexDigit c -> do
1257                    discard 1
1258                    (n, raw) <- lexHexadecimal
1259                    n' <- checkChar n
1260                    return (n', 'x':raw)
1261        c:_ | isDigit c -> do
1262                    (n, raw) <- lexDecimal
1263                    n' <- checkChar n
1264                    return (n', raw)
1265
1266        _       -> fail "Illegal escape sequence"
1267
1268    where
1269    checkChar n | n <= 0x10FFFF = return (chr (fromInteger n))
1270    checkChar _                 = fail "Character constant out of range"
1271
1272-- Production cntrl from section B.2
1273
1274    cntrl :: Char -> Lex a (Char, String)
1275    cntrl c | c >= '@' && c <= '_' = return (chr (ord c - ord '@'), '^':c:[])
1276    cntrl _                        = fail "Illegal control character"
1277
1278-- assumes at least one octal digit
1279lexOctal :: Lex a (Integer, String)
1280lexOctal = do
1281    ds <- lexWhile isOctDigit
1282    return (parseInteger 8 ds, ds)
1283
1284-- assumes at least one binary digit
1285lexBinary :: Lex a (Integer, String)
1286lexBinary = do
1287    ds <- lexWhile isBinDigit
1288    return (parseInteger 2 ds, ds)
1289
1290-- assumes at least one hexadecimal digit
1291lexHexadecimal :: Lex a (Integer, String)
1292lexHexadecimal = do
1293    ds <- lexWhile isHexDigit
1294    return (parseInteger 16 ds, ds)
1295
1296-- assumes at least one decimal digit
1297lexDecimal :: Lex a (Integer, String)
1298lexDecimal = do
1299    ds <- lexWhile isDigit
1300    return (parseInteger 10 ds, ds)
1301
1302-- Stolen from Hugs's Prelude
1303parseInteger :: Integer -> String -> Integer
1304parseInteger radix ds =
1305    foldl1 (\n d -> n * radix + d) (map (toInteger . digitToInt) ds)
1306
1307flagKW :: Token -> Lex a ()
1308flagKW t =
1309  when (t `elem` [KW_Do, KW_MDo]) $ do
1310       exts <- getExtensionsL
1311       when (NondecreasingIndentation `elem` exts) flagDo
1312
1313-- | Selects ASCII binary digits, i.e. @\'0\'@..@\'1\'@.
1314isBinDigit :: Char -> Bool
1315isBinDigit c =  c >= '0' && c <= '1'
1316------------------------------------------------------------------
1317-- "Pretty" printing for tokens
1318
1319showToken :: Token -> String
1320showToken t = case t of
1321  VarId s           -> s
1322  LabelVarId s      -> '#':s
1323  QVarId (q,s)      -> q ++ '.':s
1324  IDupVarId s       -> '?':s
1325  ILinVarId s       -> '%':s
1326  ConId s           -> s
1327  QConId (q,s)      -> q ++ '.':s
1328  DVarId ss         -> intercalate "-" ss
1329  VarSym s          -> s
1330  ConSym s          -> s
1331  QVarSym (q,s)     -> q ++ '.':s
1332  QConSym (q,s)     -> q ++ '.':s
1333  IntTok (_, s)         -> s
1334  FloatTok (_, s)       -> s
1335  Character (_, s)      -> '\'':s ++ "'"
1336  StringTok (_, s)      -> '"':s ++ "\""
1337  IntTokHash (_, s)     -> s ++ "#"
1338  WordTokHash (_, s)    -> s ++ "##"
1339  FloatTokHash (_, s)   -> s ++ "#"
1340  DoubleTokHash (_, s)  -> s ++ "##"
1341  CharacterHash (_, s)  -> '\'':s ++ "'#"
1342  StringHash (_, s)     -> '"':s ++ "\"#"
1343  LeftParen         -> "("
1344  RightParen        -> ")"
1345  LeftHashParen     -> "(#"
1346  RightHashParen    -> "#)"
1347  SemiColon         -> ";"
1348  LeftCurly         -> "{"
1349  RightCurly        -> "}"
1350  VRightCurly       -> "virtual }"
1351  LeftSquare        -> "["
1352  RightSquare       -> "]"
1353  ParArrayLeftSquare -> "[:"
1354  ParArrayRightSquare -> ":]"
1355  Comma             -> ","
1356  Underscore        -> "_"
1357  BackQuote         -> "`"
1358  QuoteColon        -> "':"
1359  Dot               -> "."
1360  DotDot            -> ".."
1361  Colon             -> ":"
1362  DoubleColon       -> "::"
1363  Equals            -> "="
1364  Backslash         -> "\\"
1365  Bar               -> "|"
1366  LeftArrow         -> "<-"
1367  RightArrow        -> "->"
1368  At                -> "@"
1369  TApp              -> "@"
1370  Tilde             -> "~"
1371  DoubleArrow       -> "=>"
1372  Minus             -> "-"
1373  Exclamation       -> "!"
1374  Star              -> "*"
1375  LeftArrowTail     -> "-<"
1376  RightArrowTail    -> ">-"
1377  LeftDblArrowTail  -> "-<<"
1378  RightDblArrowTail -> ">>-"
1379  OpenArrowBracket  -> "(|"
1380  CloseArrowBracket -> "|)"
1381  THExpQuote        -> "[|"
1382  THTExpQuote       -> "[||"
1383  THPatQuote        -> "[p|"
1384  THDecQuote        -> "[d|"
1385  THTypQuote        -> "[t|"
1386  THCloseQuote      -> "|]"
1387  THTCloseQuote     -> "||]"
1388  THIdEscape s      -> '$':s
1389  THParenEscape     -> "$("
1390  THTIdEscape s     -> "$$" ++ s
1391  THTParenEscape    -> "$$("
1392  THVarQuote        -> "'"
1393  THTyQuote         -> "''"
1394  THQuasiQuote (n,q) -> "[$" ++ n ++ "|" ++ q ++ "]"
1395  RPGuardOpen       -> "(|"
1396  RPGuardClose      -> "|)"
1397  RPCAt             -> "@:"
1398  XCodeTagOpen      -> "<%"
1399  XCodeTagClose     -> "%>"
1400  XStdTagOpen       -> "<"
1401  XStdTagClose      -> ">"
1402  XCloseTagOpen     -> "</"
1403  XEmptyTagClose    -> "/>"
1404  XPCDATA s         -> "PCDATA " ++ s
1405  XRPatOpen         -> "<["
1406  XRPatClose        -> "]>"
1407  PragmaEnd         -> "#-}"
1408  RULES             -> "{-# RULES"
1409  INLINE b          -> "{-# " ++ if b then "INLINE" else "NOINLINE"
1410  INLINE_CONLIKE    -> "{-# " ++ "INLINE CONLIKE"
1411  SPECIALISE        -> "{-# SPECIALISE"
1412  SPECIALISE_INLINE b -> "{-# SPECIALISE " ++ if b then "INLINE" else "NOINLINE"
1413  SOURCE            -> "{-# SOURCE"
1414  DEPRECATED        -> "{-# DEPRECATED"
1415  WARNING           -> "{-# WARNING"
1416  SCC               -> "{-# SCC"
1417  GENERATED         -> "{-# GENERATED"
1418  CORE              -> "{-# CORE"
1419  UNPACK            -> "{-# UNPACK"
1420  NOUNPACK          -> "{-# NOUNPACK"
1421  OPTIONS (mt,_)    -> "{-# OPTIONS" ++ maybe "" (':':) mt ++ " ..."
1422--  CFILES  s         -> "{-# CFILES ..."
1423--  INCLUDE s         -> "{-# INCLUDE ..."
1424  LANGUAGE          -> "{-# LANGUAGE"
1425  ANN               -> "{-# ANN"
1426  MINIMAL           -> "{-# MINIMAL"
1427  NO_OVERLAP        -> "{-# NO_OVERLAP"
1428  OVERLAP           -> "{-# OVERLAP"
1429  OVERLAPPING       -> "{-# OVERLAPPING"
1430  OVERLAPPABLE      -> "{-# OVERLAPPABLE"
1431  OVERLAPS          -> "{-# OVERLAPS"
1432  INCOHERENT        -> "{-# INCOHERENT"
1433  COMPLETE          -> "{-# COMPLETE"
1434  KW_As         -> "as"
1435  KW_By         -> "by"
1436  KW_Case       -> "case"
1437  KW_Class      -> "class"
1438  KW_Data       -> "data"
1439  KW_Default    -> "default"
1440  KW_Deriving   -> "deriving"
1441  KW_Do         -> "do"
1442  KW_MDo        -> "mdo"
1443  KW_Else       -> "else"
1444  KW_Family     -> "family"
1445  KW_Forall     -> "forall"
1446  KW_Group      -> "group"
1447  KW_Hiding     -> "hiding"
1448  KW_If         -> "if"
1449  KW_Import     -> "import"
1450  KW_In         -> "in"
1451  KW_Infix      -> "infix"
1452  KW_InfixL     -> "infixl"
1453  KW_InfixR     -> "infixr"
1454  KW_Instance   -> "instance"
1455  KW_Let        -> "let"
1456  KW_Module     -> "module"
1457  KW_NewType    -> "newtype"
1458  KW_Of         -> "of"
1459  KW_Proc       -> "proc"
1460  KW_Rec        -> "rec"
1461  KW_Then       -> "then"
1462  KW_Type       -> "type"
1463  KW_Using      -> "using"
1464  KW_Where      -> "where"
1465  KW_Qualified  -> "qualified"
1466  KW_Foreign    -> "foreign"
1467  KW_Export     -> "export"
1468  KW_Safe       -> "safe"
1469  KW_Unsafe     -> "unsafe"
1470  KW_Threadsafe -> "threadsafe"
1471  KW_Interruptible -> "interruptible"
1472  KW_StdCall    -> "stdcall"
1473  KW_CCall      -> "ccall"
1474  XChildTagOpen -> "<%>"
1475  KW_CPlusPlus  -> "cplusplus"
1476  KW_DotNet     -> "dotnet"
1477  KW_Jvm        -> "jvm"
1478  KW_Js         -> "js"
1479  KW_JavaScript -> "javascript"
1480  KW_CApi       -> "capi"
1481  KW_Role       -> "role"
1482  KW_Pattern    -> "pattern"
1483  KW_Stock      -> "stock"
1484  KW_Anyclass   -> "anyclass"
1485  KW_Via        -> "via"
1486
1487  EOF           -> "EOF"
1488