1-------------------------------------------------------------------------------
2--                  ALEX SCANNER AND LITERATE PREPROCESSOR
3--
4-- This Script defines the grammar used to generate the Alex scanner and a
5-- preprocessing scanner for dealing with literate scripts.  The actions for
6-- the Alex scanner are given separately in the Alex module.
7--
8-- See the Alex manual for a discussion of the scanners defined here.
9--
10-- Chris Dornan, Aug-95, 4-Jun-96, 10-Jul-96, 29-Sep-97
11-------------------------------------------------------------------------------
12
13{
14module Scan (lexer, AlexPosn(..), Token(..), Tkn(..), tokPosn) where
15
16import Data.Char
17import ParseMonad
18--import Debug.Trace
19}
20
21$digit    = 0-9
22$hexdig   = [0-9 A-F a-f]
23$octal    = 0-7
24$lower    = a-z
25$upper    = A-Z
26$alpha    = [$upper $lower]
27$alphanum = [$alpha $digit]
28$idchar   = [$alphanum \_ \']
29
30$special    = [\.\;\,\$\|\*\+\?\#\~\-\{\}\(\)\[\]\^\/]
31$graphic    = $printable # $white
32$nonspecial = $graphic # [$special \%]
33
34@id     = $alpha $idchar*
35@smac   = \$ @id | \$ \{ @id \}
36@rmac   = \@ @id | \@ \{ @id \}
37
38@comment = "--".*
39@ws      = $white+ | @comment
40
41alex :-
42
43@ws                             { skip }     -- white space; ignore
44
45<0> \" [^\"]* \"                { string }
46<0> (@id @ws?)? \:\-            { bind }
47<0> \{ / (\n | [^$digit])       { code }
48<0> $special                    { special }  -- note: matches {
49<0> \% "wrapper"                { wrapper }
50<0> \% "encoding"               { encoding }
51<0> \% "action"                 { actionty }
52<0> \% "token"                  { tokenty }
53<0> \% "typeclass"              { typeclass }
54
55<0> \\ $digit+                  { decch }
56<0> \\ x $hexdig+               { hexch }
57<0> \\ o $octal+                { octch }
58<0> \\ $printable               { escape }
59<0> $nonspecial # [\<]          { char }
60<0> @smac                       { smac }
61<0> @rmac                       { rmac }
62
63<0> @smac @ws? \=               { smacdef }
64<0> @rmac @ws? \=               { rmacdef }
65
66-- identifiers are allowed to be unquoted in startcode lists
67<0>             \<              { special `andBegin` startcodes }
68<startcodes>    0               { zero }
69<startcodes>    @id             { startcode }
70<startcodes>    \,              { special }
71<startcodes>    \>              { special `andBegin` afterstartcodes }
72
73-- After a <..> startcode sequence, we can have a {...} grouping of rules,
74-- so don't try to interpret the opening { as a code block.
75<afterstartcodes> \{ (\n | [^$digit ])  { special `andBegin` 0 }
76<afterstartcodes> ()            { skip `andBegin` 0 }  -- note: empty pattern
77{
78
79-- -----------------------------------------------------------------------------
80-- Token type
81
82data Token = T AlexPosn Tkn
83  deriving Show
84
85tokPosn (T p _) = p
86
87data Tkn
88  = SpecialT Char
89  | CodeT String
90  | ZeroT
91  | IdT String
92  | StringT String
93  | BindT String
94  | CharT Char
95  | SMacT String
96  | RMacT String
97  | SMacDefT String
98  | RMacDefT String
99  | NumT Int
100  | WrapperT
101  | EncodingT
102  | ActionTypeT
103  | TokenTypeT
104  | TypeClassT
105  | EOFT
106  deriving Show
107
108-- -----------------------------------------------------------------------------
109-- Token functions
110
111special, zero, string, bind, escape, decch, hexch, octch, char :: Action
112smac, rmac, smacdef, rmacdef, startcode, wrapper, encoding :: Action
113actionty, tokenty, typeclass :: Action
114special   (p,_,str) _  = return $ T p (SpecialT  (head str))
115zero      (p,_,_)   _  = return $ T p ZeroT
116string    (p,_,str) ln = return $ T p (StringT (extract ln str))
117bind      (p,_,str) _  = return $ T p (BindT (takeWhile isIdChar str))
118escape    (p,_,str) _  = return $ T p (CharT (esc str))
119decch     (p,_,str) ln = return $ T p (CharT (do_ech 10 ln (take (ln-1) (tail str))))
120hexch     (p,_,str) ln = return $ T p (CharT (do_ech 16 ln (take (ln-2) (drop 2 str))))
121octch     (p,_,str) ln = return $ T p (CharT (do_ech 8  ln (take (ln-2) (drop 2 str))))
122char      (p,_,str) _  = return $ T p (CharT (head str))
123smac      (p,_,str) ln = return $ T p (SMacT (mac ln str))
124rmac      (p,_,str) ln = return $ T p (RMacT (mac ln str))
125smacdef   (p,_,str) ln = return $ T p (SMacDefT (macdef ln str))
126rmacdef   (p,_,str) ln = return $ T p (RMacDefT (macdef ln str))
127startcode (p,_,str) ln = return $ T p (IdT (take ln str))
128wrapper   (p,_,_)   _  = return $ T p WrapperT
129encoding  (p,_,_)   _  = return $ T p EncodingT
130actionty  (p,_,_)   _  = return $ T p ActionTypeT
131tokenty   (p,_,_)   _  = return $ T p TokenTypeT
132typeclass (p,_,_)   _  = return $ T p TypeClassT
133
134isIdChar :: Char -> Bool
135isIdChar c = isAlphaNum c || c `elem` "_'"
136
137extract :: Int -> String -> String
138extract ln str = take (ln-2) (tail str)
139
140do_ech :: Int -> Int -> String -> Char
141do_ech radix _ln str = chr (parseInt radix str)
142
143mac :: Int -> String -> String
144mac ln str = take (ln-1) $ tail str
145
146-- TODO : replace not . isSpace with (\c -> not (isSpace c) && c /= '=')
147macdef :: Int -> String -> String
148macdef _ln str = takeWhile (\c -> not (isSpace c) && c /= '=') $ tail str
149
150esc :: String -> Char
151esc str =
152  case head $ tail str of
153    'a' -> '\a'
154    'b' -> '\b'
155    'f' -> '\f'
156    'n' -> '\n'
157    'r' -> '\r'
158    't' -> '\t'
159    'v' -> '\v'
160    c   ->  c
161
162parseInt :: Int -> String -> Int
163parseInt radix ds = foldl1 (\n d -> n * radix + d) (map digitToInt ds)
164
165-- In brace-delimited code, we have to be careful to match braces
166-- within the code, but ignore braces inside strings and character
167-- literals.  We do an approximate job (doing it properly requires
168-- implementing a large chunk of the Haskell lexical syntax).
169
170code :: Action
171code (p,_,_inp) _ = do
172  currentInput <- getInput
173  go currentInput 1 ""
174  where
175    go :: AlexInput -> Int -> String -> P Token
176    go inp 0 cs = do
177      setInput inp
178      return (T p (CodeT (reverse (tail cs))))
179    go inp n cs = do
180      case alexGetChar inp of
181        Nothing       -> err inp
182        Just (c,inp2) ->
183          case c of
184            '{'  -> go inp2 (n+1) (c:cs)
185            '}'  -> go inp2 (n-1) (c:cs)
186            '\'' -> go_char inp2 n (c:cs)
187            '\"' -> go_str inp2 n (c:cs) '\"'
188            c2   -> go inp2 n (c2:cs)
189
190    go_char :: AlexInput -> Int -> String -> P Token
191    -- try to catch multiple occurrences of ' at identifier end
192    go_char inp n cs@('\'':'\'':_) = go inp n cs
193    -- try to catch occurrences of ' within an identifier
194    go_char inp n cs@('\'':c2:_)
195      | isAlphaNum c2              = go inp n cs
196    go_char inp n cs               = go_str inp n cs '\''
197
198    go_str :: AlexInput -> Int -> String -> Char -> P Token
199    go_str inp n cs end = do
200      case alexGetChar inp of
201          Nothing -> err inp
202          Just (c,inp2)
203            | c == end  -> go inp2 n (c:cs)
204            | otherwise ->
205              case c of
206                '\\' -> case alexGetChar inp2 of
207                          Nothing       -> err inp2
208                          Just (d,inp3) -> go_str inp3 n (d:c:cs) end
209                c2   -> go_str inp2 n (c2:cs) end
210
211    err inp = do setInput inp; lexError "lexical error in code fragment"
212
213lexError :: String -> P a
214lexError s = do
215  (_,_,_,input) <- getInput
216  failP (s ++ (if (not (null input))
217                  then " at " ++ show (head input)
218                  else " at end of file"))
219
220lexer :: (Token -> P a) -> P a
221lexer cont = lexToken >>= cont
222
223lexToken :: P Token
224lexToken = do
225  inp@(p,c,_,s) <- getInput
226  sc <- getStartCode
227  case alexScan inp sc of
228    AlexEOF -> return (T p EOFT)
229    AlexError _ -> lexError "lexical error"
230    AlexSkip inp1 _ -> do
231      setInput inp1
232      lexToken
233    AlexToken inp1 len t -> do
234      setInput inp1
235      t (p,c,s) len
236
237type Action = (AlexPosn,Char,String) -> Int -> P Token
238
239skip :: Action
240skip _ _ = lexToken
241
242andBegin :: Action -> StartCode -> Action
243andBegin act sc inp len = setStartCode sc >> act inp len
244}
245