1-- -----------------------------------------------------------------------------
2--
3-- AbsSyn.hs, part of Alex
4--
5-- (c) Chris Dornan 1995-2000, Simon Marlow 2003
6--
7-- This module provides a concrete representation for regular expressions and
8-- scanners.  Scanners are used for tokenising files in preparation for parsing.
9--
10-- ----------------------------------------------------------------------------}
11
12module AbsSyn (
13  Code, Directive(..), Scheme(..),
14  wrapperName,
15  Scanner(..),
16  RECtx(..),
17  RExp(..),
18  DFA(..), State(..), SNum, StartCode, Accept(..),
19  RightContext(..), showRCtx, strtype,
20  encodeStartCodes, extractActions,
21  Target(..),
22  UsesPreds(..), usesPreds,
23  StrType(..)
24  ) where
25
26import CharSet ( CharSet, Encoding )
27import Map ( Map )
28import qualified Map hiding ( Map )
29import Data.IntMap (IntMap)
30import Sort ( nub' )
31import Util ( str, nl )
32
33import Data.Maybe ( fromJust )
34
35infixl 4 :|
36infixl 5 :%%
37
38-- -----------------------------------------------------------------------------
39-- Abstract Syntax for Alex scripts
40
41type Code = String
42
43data Directive
44   = WrapperDirective String            -- use this wrapper
45   | EncodingDirective Encoding         -- use this encoding
46   | ActionType String                  -- Type signature of actions,
47                                        -- with optional typeclasses
48   | TypeClass String
49   | TokenType String
50   deriving Show
51
52data StrType = Str | Lazy | Strict
53
54instance Show StrType where
55  show Str = "String"
56  show Lazy = "ByteString.ByteString"
57  show Strict = "ByteString.ByteString"
58
59data Scheme
60  = Default { defaultTypeInfo :: Maybe (Maybe String, String) }
61  | GScan { gscanTypeInfo :: Maybe (Maybe String, String) }
62  | Basic { basicStrType :: StrType,
63            basicTypeInfo :: Maybe (Maybe String, String) }
64  | Posn { posnByteString :: Bool,
65           posnTypeInfo :: Maybe (Maybe String, String) }
66  | Monad { monadByteString :: Bool, monadUserState :: Bool,
67            monadTypeInfo :: Maybe (Maybe String, String) }
68
69strtype :: Bool -> String
70strtype True = "ByteString.ByteString"
71strtype False = "String"
72
73wrapperName :: Scheme -> Maybe String
74wrapperName Default {} = Nothing
75wrapperName GScan {} = Just "gscan"
76wrapperName Basic { basicStrType = Str } = Just "basic"
77wrapperName Basic { basicStrType = Lazy } = Just "basic-bytestring"
78wrapperName Basic { basicStrType = Strict } = Just "strict-bytestring"
79wrapperName Posn { posnByteString = False } = Just "posn"
80wrapperName Posn { posnByteString = True } = Just "posn-bytestring"
81wrapperName Monad { monadByteString = False,
82                    monadUserState = False } = Just "monad"
83wrapperName Monad { monadByteString = True,
84                    monadUserState = False } = Just "monad-bytestring"
85wrapperName Monad { monadByteString = False,
86                    monadUserState = True } = Just "monadUserState"
87wrapperName Monad { monadByteString = True,
88                    monadUserState = True } = Just "monadUserState-bytestring"
89
90-- TODO: update this comment
91--
92-- A `Scanner' consists of an association list associating token names with
93-- regular expressions with context.  The context may include a list of start
94-- codes, some leading context to test the character immediately preceding the
95-- token and trailing context to test the residual input after the token.
96--
97-- The start codes consist of the names and numbers of the start codes;
98-- initially the names only will be generated by the parser, the numbers being
99-- allocated at a later stage.  Start codes become meaningful when scanners are
100-- converted to DFAs; see the DFA section of the Scan module for details.
101
102data Scanner = Scanner { scannerName   :: String,
103                         scannerTokens :: [RECtx] }
104  deriving Show
105
106data RECtx = RECtx { reCtxStartCodes :: [(String,StartCode)],
107                     reCtxPreCtx     :: Maybe CharSet,
108                     reCtxRE         :: RExp,
109                     reCtxPostCtx    :: RightContext RExp,
110                     reCtxCode       :: Maybe Code
111                   }
112
113data RightContext r
114  = NoRightContext
115  | RightContextRExp r
116  | RightContextCode Code
117  deriving (Eq,Ord)
118
119instance Show RECtx where
120  showsPrec _ (RECtx scs _ r rctx code) =
121        showStarts scs . shows r . showRCtx rctx . showMaybeCode code
122
123showMaybeCode :: Maybe String -> String -> String
124showMaybeCode Nothing = id
125showMaybeCode (Just code) = showCode code
126
127showCode :: String -> String -> String
128showCode code = showString " { " . showString code . showString " }"
129
130showStarts :: [(String, StartCode)] -> String -> String
131showStarts [] = id
132showStarts scs = shows scs
133
134showRCtx :: Show r => RightContext r -> String -> String
135showRCtx NoRightContext = id
136showRCtx (RightContextRExp r) = ('\\':) . shows r
137showRCtx (RightContextCode code) = showString "\\ " . showCode code
138
139-- -----------------------------------------------------------------------------
140-- DFAs
141
142data DFA s a = DFA
143  { dfa_start_states :: [s],
144    dfa_states       :: Map s (State s a)
145  }
146
147data State s a = State { state_acc :: [Accept a],
148                         state_out :: IntMap s -- 0..255 only
149                       }
150
151type SNum = Int
152
153data Accept a
154  = Acc { accPrio       :: Int,
155          accAction     :: Maybe a,
156          accLeftCtx    :: Maybe CharSet, -- cannot be converted to byteset at this point.
157          accRightCtx   :: RightContext SNum
158    }
159    deriving (Eq,Ord)
160
161-- debug stuff
162instance Show (Accept a) where
163  showsPrec _ (Acc p _act _lctx _rctx) = shows p --TODO
164
165type StartCode = Int
166
167-- -----------------------------------------------------------------------------
168-- Predicates / contexts
169
170-- we can generate somewhat faster code in the case that
171-- the lexer doesn't use predicates
172data UsesPreds = UsesPreds | DoesntUsePreds
173
174usesPreds :: DFA s a -> UsesPreds
175usesPreds dfa
176    | any acceptHasCtx [ acc | st  <- Map.elems (dfa_states dfa)
177                             , acc <- state_acc st ]
178    = UsesPreds
179    | otherwise
180    = DoesntUsePreds
181  where
182    acceptHasCtx Acc { accLeftCtx  = Nothing
183                     , accRightCtx = NoRightContext } = False
184    acceptHasCtx _                                    = True
185
186-- -----------------------------------------------------------------------------
187-- Regular expressions
188
189-- `RExp' provides an abstract syntax for regular expressions.  `Eps' will
190-- match empty strings; `Ch p' matches strings containinng a single character
191-- `c' if `p c' is true; `re1 :%% re2' matches a string if `re1' matches one of
192-- its prefixes and `re2' matches the rest; `re1 :| re2' matches a string if
193-- `re1' or `re2' matches it; `Star re', `Plus re' and `Ques re' can be
194-- expressed in terms of the other operators.  See the definitions of `ARexp'
195-- for a formal definition of the semantics of these operators.
196
197data RExp
198  = Eps
199  | Ch CharSet
200  | RExp :%% RExp
201  | RExp :| RExp
202  | Star RExp
203  | Plus RExp
204  | Ques RExp
205
206instance Show RExp where
207  showsPrec _ Eps = showString "()"
208  showsPrec _ (Ch _) = showString "[..]"
209  showsPrec _ (l :%% r)  = shows l . shows r
210  showsPrec _ (l :| r)  = shows l . ('|':) . shows r
211  showsPrec _ (Star r) = shows r . ('*':)
212  showsPrec _ (Plus r) = shows r . ('+':)
213  showsPrec _ (Ques r) = shows r . ('?':)
214
215{------------------------------------------------------------------------------
216                          Abstract Regular Expression
217------------------------------------------------------------------------------}
218
219
220-- This section contains demonstrations; it is not part of Alex.
221
222{-
223-- This function illustrates `ARexp'. It returns true if the string in its
224-- argument is matched by the regular expression.
225
226recognise:: RExp -> String -> Bool
227recognise re inp = any (==len) (ap_ar (arexp re) inp)
228        where
229        len = length inp
230
231
232-- `ARexp' provides an regular expressions in abstract format.  Here regular
233-- expressions are represented by a function that takes the string to be
234-- matched and returns the sizes of all the prefixes matched by the regular
235-- expression (the list may contain duplicates).  Each of the `RExp' operators
236-- are represented by similarly named functions over ARexp.  The `ap' function
237-- takes an `ARExp', a string and returns the sizes of all the prefixes
238-- matching that regular expression.  `arexp' converts an `RExp' to an `ARexp'.
239
240
241arexp:: RExp -> ARexp
242arexp Eps = eps_ar
243arexp (Ch p) = ch_ar p
244arexp (re :%% re') = arexp re `seq_ar` arexp re'
245arexp (re :| re') = arexp re `bar_ar` arexp re'
246arexp (Star re) = star_ar (arexp re)
247arexp (Plus re) = plus_ar (arexp re)
248arexp (Ques re) = ques_ar (arexp re)
249
250
251star_ar:: ARexp -> ARexp
252star_ar sc =  eps_ar `bar_ar` plus_ar sc
253
254plus_ar:: ARexp -> ARexp
255plus_ar sc = sc `seq_ar` star_ar sc
256
257ques_ar:: ARexp -> ARexp
258ques_ar sc = eps_ar `bar_ar` sc
259
260
261-- Hugs abstract type definition -- not for GHC.
262
263type ARexp = String -> [Int]
264--      in ap_ar, eps_ar, ch_ar, seq_ar, bar_ar
265
266ap_ar:: ARexp -> String -> [Int]
267ap_ar sc = sc
268
269eps_ar:: ARexp
270eps_ar inp = [0]
271
272ch_ar:: (Char->Bool) -> ARexp
273ch_ar p "" = []
274ch_ar p (c:rst) = if p c then [1] else []
275
276seq_ar:: ARexp -> ARexp -> ARexp
277seq_ar sc sc' inp = [n+m| n<-sc inp, m<-sc' (drop n inp)]
278
279bar_ar:: ARexp -> ARexp -> ARexp
280bar_ar sc sc' inp = sc inp ++ sc' inp
281-}
282
283-- -----------------------------------------------------------------------------
284-- Utils
285
286-- Map the available start codes onto [1..]
287
288encodeStartCodes:: Scanner -> (Scanner,[StartCode],ShowS)
289encodeStartCodes scan = (scan', 0 : map snd name_code_pairs, sc_hdr)
290        where
291        scan' = scan{ scannerTokens = map mk_re_ctx (scannerTokens scan) }
292
293        mk_re_ctx (RECtx scs lc re rc code)
294          = RECtx (map mk_sc scs) lc re rc code
295
296        mk_sc (nm,_) = (nm, if nm=="0" then 0
297                                       else fromJust (Map.lookup nm code_map))
298
299        sc_hdr tl =
300                case name_code_pairs of
301                  [] -> tl
302                  (nm,_):rst -> "\n" ++ nm ++ foldr f t rst
303                        where
304                        f (nm', _) t' = "," ++ nm' ++ t'
305                        t = " :: Int\n" ++ foldr fmt_sc tl name_code_pairs
306                where
307                fmt_sc (nm,sc) t = nm ++ " = " ++ show sc ++ "\n" ++ t
308
309        code_map = Map.fromList name_code_pairs
310
311        name_code_pairs = zip (nub' (<=) nms) [1..]
312
313        nms = [nm | RECtx{reCtxStartCodes = scs} <- scannerTokens scan,
314                    (nm,_) <- scs, nm /= "0"]
315
316
317-- Grab the code fragments for the token actions, and replace them
318-- with function names of the form alex_action_$n$.  We do this
319-- because the actual action fragments might be duplicated in the
320-- generated file.
321
322extractActions :: Scheme -> Scanner -> (Scanner,ShowS)
323extractActions scheme scanner = (scanner{scannerTokens = new_tokens}, decl_str)
324 where
325  (new_tokens, decls) = unzip (zipWith f (scannerTokens scanner) act_names)
326
327  f r@(RECtx{ reCtxCode = Just code }) name
328        = (r{reCtxCode = Just name}, Just (mkDecl name code))
329  f r@(RECtx{ reCtxCode = Nothing }) _
330        = (r{reCtxCode = Nothing}, Nothing)
331
332  gscanActionType res =
333      str "AlexPosn -> Char -> String -> Int -> ((Int, state) -> "
334    . str res . str ") -> (Int, state) -> " . str res
335
336  mkDecl fun code = case scheme of
337    Default { defaultTypeInfo = Just (Nothing, actionty) } ->
338        str fun . str " :: " . str actionty . str "\n"
339      . str fun . str " = " . str code . nl
340    Default { defaultTypeInfo = Just (Just tyclasses, actionty) } ->
341      str fun . str " :: (" . str tyclasses . str ") => " .
342      str actionty . str "\n" .
343      str fun . str " = " . str code . nl
344    GScan { gscanTypeInfo = Just (Nothing, tokenty) } ->
345        str fun . str " :: " . gscanActionType tokenty . str "\n"
346      . str fun . str " = " . str code . nl
347    GScan { gscanTypeInfo = Just (Just tyclasses, tokenty) } ->
348      str fun . str " :: (" . str tyclasses . str ") => " .
349      gscanActionType tokenty . str "\n" .
350      str fun . str " = " . str code . nl
351    Basic { basicStrType = strty, basicTypeInfo = Just (Nothing, tokenty) } ->
352      str fun . str " :: " . str (show strty) . str " -> "
353      . str tokenty . str "\n"
354      . str fun . str " = " . str code . nl
355    Basic { basicStrType = strty,
356            basicTypeInfo = Just (Just tyclasses, tokenty) } ->
357      str fun . str " :: (" . str tyclasses . str ") => " .
358      str (show strty) . str " -> " . str tokenty . str "\n" .
359      str fun . str " = " . str code . nl
360    Posn { posnByteString = isByteString,
361           posnTypeInfo = Just (Nothing, tokenty) } ->
362      str fun . str " :: AlexPosn -> " . str (strtype isByteString) . str " -> "
363      . str tokenty . str "\n"
364      . str fun . str " = " . str code . nl
365    Posn { posnByteString = isByteString,
366           posnTypeInfo = Just (Just tyclasses, tokenty) } ->
367      str fun . str " :: (" . str tyclasses . str ") => AlexPosn -> " .
368      str (strtype isByteString) . str " -> " . str tokenty . str "\n" .
369      str fun . str " = " . str code . nl
370    Monad { monadByteString = isByteString,
371            monadTypeInfo = Just (Nothing, tokenty) } ->
372      let
373        actintty = if isByteString then "Int64" else "Int"
374      in
375        str fun . str " :: AlexInput -> " . str actintty . str " -> Alex ("
376      . str tokenty . str ")\n"
377      . str fun . str " = " . str code . nl
378    Monad { monadByteString = isByteString,
379            monadTypeInfo = Just (Just tyclasses, tokenty) } ->
380      let
381        actintty = if isByteString then "Int64" else "Int"
382      in
383        str fun . str " :: (" . str tyclasses . str ") => "
384      . str " AlexInput -> " . str actintty
385      . str " -> Alex (" . str tokenty . str ")\n"
386      . str fun . str " = " . str code . nl
387    _ -> str fun . str " = " . str code . nl
388
389  act_names = map (\n -> "alex_action_" ++ show (n::Int)) [0..]
390
391  decl_str = foldr (.) id [ decl | Just decl <- decls ]
392
393-- -----------------------------------------------------------------------------
394-- Code generation targets
395
396data Target = GhcTarget | HaskellTarget
397