1--  C->Haskell Compiler: Lexer for CHS Files
2--
3--  Author : Manuel M T Chakravarty
4--  Created: 13 August 99
5--
6--  Version $Revision: 1.1 $ from $Date: 2004/11/21 21:05:35 $
7--
8--  Copyright (c) [1999..2004] Manuel M T Chakravarty
9--
10--  This file is free software; you can redistribute it and/or modify
11--  it under the terms of the GNU General Public License as published by
12--  the Free Software Foundation; either version 2 of the License, or
13--  (at your option) any later version.
14--
15--  This file is distributed in the hope that it will be useful,
16--  but WITHOUT ANY WARRANTY; without even the implied warranty of
17--  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18--  GNU General Public License for more details.
19--
20--- DESCRIPTION ---------------------------------------------------------------
21--
22--  Lexer for CHS files; the tokens are only partially recognised.
23--
24--- DOCU ----------------------------------------------------------------------
25--
26--  language: Haskell 98
27--
28--  * CHS files are assumed to be Haskell 98 files that include C2HS binding
29--    hooks.
30--
31--  * Haskell code is not tokenised, but binding hooks (delimited by `{#'and
32--    `#}') are analysed.  Therefore the lexer operates in two states
33--    (realised as two lexer coupled by meta actions) depending on whether
34--    Haskell code or a binding hook is currently read.  The lexer reading
35--    Haskell code is called `base lexer'; the other one, `binding-hook
36--    lexer'.  In addition, there is a inline-c lexer, which, as the
37--    binding-hook lexer, can be triggered from the base lexer.
38--
39--  * Base lexer:
40--
41--      haskell -> (inline \\ special)*
42--               | special \\ `"'
43--               | comment
44--               | nested
45--               | hstring
46--               | '{#'
47--               | cpp
48--      special -> `(' | `{' | `-' | `"'
49--      ctrl    -> `\n' | `\f' | `\r' | `\t' | `\v'
50--
51--      inline  -> any \\ ctrl
52--      any     -> '\0'..'\255'
53--
54--    Within the base lexer control codes appear as separate tokens in the
55--    token list.
56--
57--    NOTE: It is important that `{' is an extra lexeme and not added as an
58--          optional component at the end of the first alternative for
59--          `haskell'.  Otherwise, the principle of the longest match will
60--          divide `foo {#' into the tokens `foo {' and `#' instead of `foo '
61--          and `{#'.
62--
63--    One line comments are handled by
64--
65--      comment -> `--' (any \\ `\n')* `\n'
66--
67--    and nested comments by
68--
69--      nested -> `{-' any* `-}'
70--
71--    where `any*' may contain _balanced_ occurrences of `{-' and `-}'.
72--
73--      hstring -> `"' inhstr* `"'
74--      inhstr  -> ` '..`\127' \\ `"'
75--               | `\"'
76--
77--    Pre-precessor directives as well as the switch to inline-C code are
78--    formed as follows:
79--
80--      cpp     -> `\n#' (inline | `\t')* `\n'
81--               | `\n#c' (' ' | '\t')* `\n'
82--
83--    We allow whitespace between the `#' and the actual directive, but in `#c'
84--    and `#endc' the directive must immediately follow the `#'.  This might
85--    be regarded as a not entirely orthogonal design, but simplifies matters
86--    especially for `#endc'.
87--
88--  * On encountering the lexeme `{#', a meta action in the base lexer
89--    transfers control to the following binding-hook lexer:
90--
91--      ident       -> letter (letter | digit | `\'')*
92--                   | `\'' letter (letter | digit)* `\''
93--      reservedid  -> `as' | `call' | `class' | `context' | `deriving'
94--                   | `enum' | `foreign' | `fun' | `get' | `lib'
95--                   | `newtype' | `pointer' | `prefix' | `pure' | `set'
96--                   | `sizeof' | `stable' | `type' | `underscoreToCase'
97--                   | `unsafe' | `with' | 'lock' | 'unlock'
98--      reservedsym -> `{#' | `#}' | `{' | `}' | `,' | `.' | `->' | `='
99--                   | `=>' | '-' | `*' | `&' | `^'
100--      string      -> `"' instr* `"'
101--      verbhs      -> `\`' instr* `\''
102--      instr       -> ` '..`\127' \\ `"'
103--      comment     -> `--' (any \\ `\n')* `\n'
104--
105--    Control characters, white space, and comments are discarded in the
106--    binding-hook lexer.  Nested comments are not allowed in a binding hook.
107--    Identifiers can be enclosed in single quotes to avoid collision with
108--    C->Haskell keywords.
109--
110--  * In the binding-hook lexer, the lexeme `#}' transfers control back to the
111--    base lexer.  An occurence of the lexeme `{#' inside the binding-hook
112--    lexer triggers an error.  The symbol `{#' is not explcitly represented
113--    in the resulting token stream.  However, the occurrence of a token
114--    representing one of the reserved identifiers `call', `context', `enum',
115--    and `field' marks the start of a binding hook.  Strictly speaking, `#}'
116--    need also not occur in the token stream, as the next `haskell' token
117--    marks a hook's end.  It is, however, useful for producing accurate error
118--    messages (in case an hook is closed to early) to have a token
119--    representing `#}'.
120--
121--  * The rule `ident' describes Haskell identifiers, but without
122--    distinguishing between variable and constructor identifers (ie, those
123--    starting with a lowercase and those starting with an uppercase letter).
124--    However, we use it also to scan C identifiers; although, strictly
125--    speaking, it is too general for them.  In the case of C identifiers,
126--    this should not have any impact on the range of descriptions accepted by
127--    the tool, as illegal identifier will never occur in a C header file that
128--    is accepted by the C lexer.  In the case of Haskell identifiers, a
129--    confusion between variable and constructor identifiers will be noted by
130--    the Haskell compiler translating the code generated by c2hs.  Moreover,
131--    identifiers can be enclosed in single quotes to avoid collision with
132--    C->Haskell keywords, but those may not contain apostrophes.
133--
134--  * Any line starting with the character `#' is regarded to be a C
135--    preprocessor directive.  With the exception of `#c' and `#endc', which
136--    delimit a set of lines containing inline C code.  Hence, in the base
137--    lexer, the lexeme `#c' triggers a meta action transferring control to the
138--    following inline-C lexer:
139--
140--      c  -> inline* \\ `\n#endc'
141--
142--    We do neither treat C strings nor C comments specially.  Hence, if the
143--    string "\n#endc" occurs in a comment, we will mistakenly regard it as
144--    the end of the inline C code.  Note that the problem cannot happen with
145--    strings, as C does not permit strings that extend over multiple lines.
146--    At the moment, it just seems not to be worth the effort required to
147--    treat this situation more accurately.
148--
149--    The inline-C lexer also doesn't handle pre-processor directives
150--    specially.  Hence, structural pre-processor directives (namely,
151--    conditionals) may occur within inline-C code only properly nested.
152--
153--  Shortcomings
154--  ~~~~~~~~~~~~
155--  Some lexemes that include single and double quote characters are not lexed
156--  correctly.  See the implementation comment at `haskell' for details.
157--
158--
159--- TODO ----------------------------------------------------------------------
160--
161--  * In `haskell', the case of a single `"' (without a matching second one)
162--    is caught by an eplicit error raising rule.  This shouldn't be
163--    necessary, but for some strange reason, the lexer otherwise hangs when a
164--    single `"' appears in the input.
165--
166--  * Comments in the "gap" of a string are not yet supported.
167--
168
169module CHSLexer (CHSToken(..), lexCHS)
170where
171
172import Data.List         ((\\))
173import Data.Char         (isDigit)
174import Control.Monad     (liftM)
175import Numeric   (readDec, readOct, readHex)
176
177import Position  (Position(..), Pos(posOf), incPos, retPos, tabPos)
178import Errors    (ErrorLvl(..), Error, makeError)
179import UNames    (NameSupply, Name, names)
180import Idents    (Ident, lexemeToIdent, identToLexeme)
181import Lexers    (Regexp, Lexer, Action, epsilon, char, (+>), lexaction,
182                  lexactionErr, lexmeta, (>|<), (>||<), ctrlLexer, star, plus,
183                  quest, alt, string, LexerState, execLexer)
184
185import C2HSState (CST, raise, raiseError, nop, getNameSupply)
186
187
188-- token definition
189-- ----------------
190
191-- possible tokens (EXPORTED)
192--
193data CHSToken = CHSTokArrow   Position          -- `->'
194              | CHSTokDArrow  Position          -- `=>'
195              | CHSTokDot     Position          -- `.'
196              | CHSTokComma   Position          -- `,'
197              | CHSTokEqual   Position          -- `='
198              | CHSTokMinus   Position          -- `-'
199              | CHSTokStar    Position          -- `*'
200              | CHSTokAmp     Position          -- `&'
201              | CHSTokHat     Position          -- `^'
202              | CHSTokLBrace  Position          -- `{'
203              | CHSTokRBrace  Position          -- `}'
204              | CHSTokLParen  Position          -- `('
205              | CHSTokRParen  Position          -- `)'
206              | CHSTokEndHook Position          -- `#}'
207              | CHSTokAs      Position          -- `as'
208              | CHSTokCall    Position          -- `call'
209              | CHSTokClass   Position          -- `class'
210              | CHSTokContext Position          -- `context'
211              | CHSTokDerive  Position          -- `deriving'
212              | CHSTokEnum    Position          -- `enum'
213              | CHSTokForeign Position          -- `foreign'
214              | CHSTokFun     Position          -- `fun'
215              | CHSTokGet     Position          -- `get'
216              | CHSTokImport  Position          -- `import'
217              | CHSTokLib     Position          -- `lib'
218              | CHSTokNewtype Position          -- `newtype'
219              | CHSTokPointer Position          -- `pointer'
220              | CHSTokPrefix  Position          -- `prefix'
221              | CHSTokPure    Position          -- `pure'
222              | CHSTokQualif  Position          -- `qualified'
223              | CHSTokSet     Position          -- `set'
224              | CHSTokSizeof  Position          -- `sizeof'
225              | CHSTokStable  Position          -- `stable'
226              | CHSTokType    Position          -- `type'
227              | CHSTok_2Case  Position          -- `underscoreToCase'
228              | CHSTokUnsafe  Position          -- `unsafe'
229              | CHSTokWith    Position          -- `with'
230              | CHSTokLock    Position          -- `lock'
231              | CHSTokNolock  Position          -- `nolock'
232              | CHSTokString  Position String   -- string
233              | CHSTokHSVerb  Position String   -- verbatim Haskell (`...')
234              | CHSTokIdent   Position Ident    -- identifier
235              | CHSTokHaskell Position String   -- verbatim Haskell code
236              | CHSTokCPP     Position String   -- pre-processor directive
237              | CHSTokLine    Position          -- line pragma
238              | CHSTokC       Position String   -- verbatim C code
239              | CHSTokCtrl    Position Char     -- control code
240              | CHSTokPragma  Position          -- '{-# LANGUAGE' language pragma begin
241              | CHSTokPragEnd Position          -- '#-}' language pragma end
242
243instance Pos CHSToken where
244  posOf (CHSTokArrow   pos  ) = pos
245  posOf (CHSTokDArrow  pos  ) = pos
246  posOf (CHSTokDot     pos  ) = pos
247  posOf (CHSTokComma   pos  ) = pos
248  posOf (CHSTokEqual   pos  ) = pos
249  posOf (CHSTokMinus   pos  ) = pos
250  posOf (CHSTokStar    pos  ) = pos
251  posOf (CHSTokAmp     pos  ) = pos
252  posOf (CHSTokHat     pos  ) = pos
253  posOf (CHSTokLBrace  pos  ) = pos
254  posOf (CHSTokRBrace  pos  ) = pos
255  posOf (CHSTokLParen  pos  ) = pos
256  posOf (CHSTokRParen  pos  ) = pos
257  posOf (CHSTokEndHook pos  ) = pos
258  posOf (CHSTokAs      pos  ) = pos
259  posOf (CHSTokCall    pos  ) = pos
260  posOf (CHSTokClass   pos  ) = pos
261  posOf (CHSTokContext pos  ) = pos
262  posOf (CHSTokDerive  pos  ) = pos
263  posOf (CHSTokEnum    pos  ) = pos
264  posOf (CHSTokForeign pos  ) = pos
265  posOf (CHSTokFun     pos  ) = pos
266  posOf (CHSTokGet     pos  ) = pos
267  posOf (CHSTokImport  pos  ) = pos
268  posOf (CHSTokLib     pos  ) = pos
269  posOf (CHSTokNewtype pos  ) = pos
270  posOf (CHSTokPointer pos  ) = pos
271  posOf (CHSTokPrefix  pos  ) = pos
272  posOf (CHSTokPure    pos  ) = pos
273  posOf (CHSTokQualif  pos  ) = pos
274  posOf (CHSTokSet     pos  ) = pos
275  posOf (CHSTokSizeof  pos  ) = pos
276  posOf (CHSTokStable  pos  ) = pos
277  posOf (CHSTokType    pos  ) = pos
278  posOf (CHSTok_2Case  pos  ) = pos
279  posOf (CHSTokUnsafe  pos  ) = pos
280  posOf (CHSTokWith    pos  ) = pos
281  posOf (CHSTokLock    pos  ) = pos
282  posOf (CHSTokNolock  pos  ) = pos
283  posOf (CHSTokString  pos _) = pos
284  posOf (CHSTokHSVerb  pos _) = pos
285  posOf (CHSTokIdent   pos _) = pos
286  posOf (CHSTokHaskell pos _) = pos
287  posOf (CHSTokCPP     pos _) = pos
288  posOf (CHSTokC       pos _) = pos
289  posOf (CHSTokCtrl    pos _) = pos
290  posOf (CHSTokPragma  pos  ) = pos
291  posOf (CHSTokPragEnd pos  ) = pos
292
293instance Eq CHSToken where
294  (CHSTokArrow    _  ) == (CHSTokArrow    _  ) = True
295  (CHSTokDArrow   _  ) == (CHSTokDArrow   _  ) = True
296  (CHSTokDot      _  ) == (CHSTokDot      _  ) = True
297  (CHSTokComma    _  ) == (CHSTokComma    _  ) = True
298  (CHSTokEqual    _  ) == (CHSTokEqual    _  ) = True
299  (CHSTokMinus    _  ) == (CHSTokMinus    _  ) = True
300  (CHSTokStar     _  ) == (CHSTokStar     _  ) = True
301  (CHSTokAmp      _  ) == (CHSTokAmp      _  ) = True
302  (CHSTokHat      _  ) == (CHSTokHat      _  ) = True
303  (CHSTokLBrace   _  ) == (CHSTokLBrace   _  ) = True
304  (CHSTokRBrace   _  ) == (CHSTokRBrace   _  ) = True
305  (CHSTokLParen   _  ) == (CHSTokLParen   _  ) = True
306  (CHSTokRParen   _  ) == (CHSTokRParen   _  ) = True
307  (CHSTokEndHook  _  ) == (CHSTokEndHook  _  ) = True
308  (CHSTokAs       _  ) == (CHSTokAs       _  ) = True
309  (CHSTokCall     _  ) == (CHSTokCall     _  ) = True
310  (CHSTokClass    _  ) == (CHSTokClass    _  ) = True
311  (CHSTokContext  _  ) == (CHSTokContext  _  ) = True
312  (CHSTokDerive   _  ) == (CHSTokDerive   _  ) = True
313  (CHSTokEnum     _  ) == (CHSTokEnum     _  ) = True
314  (CHSTokForeign  _  ) == (CHSTokForeign  _  ) = True
315  (CHSTokFun      _  ) == (CHSTokFun      _  ) = True
316  (CHSTokGet      _  ) == (CHSTokGet      _  ) = True
317  (CHSTokImport   _  ) == (CHSTokImport   _  ) = True
318  (CHSTokLib      _  ) == (CHSTokLib      _  ) = True
319  (CHSTokNewtype  _  ) == (CHSTokNewtype  _  ) = True
320  (CHSTokPointer  _  ) == (CHSTokPointer  _  ) = True
321  (CHSTokPrefix   _  ) == (CHSTokPrefix   _  ) = True
322  (CHSTokPure     _  ) == (CHSTokPure     _  ) = True
323  (CHSTokQualif   _  ) == (CHSTokQualif   _  ) = True
324  (CHSTokSet      _  ) == (CHSTokSet      _  ) = True
325  (CHSTokSizeof   _  ) == (CHSTokSizeof   _  ) = True
326  (CHSTokStable   _  ) == (CHSTokStable   _  ) = True
327  (CHSTokType     _  ) == (CHSTokType     _  ) = True
328  (CHSTok_2Case   _  ) == (CHSTok_2Case   _  ) = True
329  (CHSTokUnsafe   _  ) == (CHSTokUnsafe   _  ) = True
330  (CHSTokWith     _  ) == (CHSTokWith     _  ) = True
331  (CHSTokLock     _  ) == (CHSTokLock     _  ) = True
332  (CHSTokNolock   _  ) == (CHSTokNolock   _  ) = True
333  (CHSTokString   _ _) == (CHSTokString   _ _) = True
334  (CHSTokHSVerb   _ _) == (CHSTokHSVerb   _ _) = True
335  (CHSTokIdent    _ _) == (CHSTokIdent    _ _) = True
336  (CHSTokHaskell  _ _) == (CHSTokHaskell  _ _) = True
337  (CHSTokCPP      _ _) == (CHSTokCPP      _ _) = True
338  (CHSTokC        _ _) == (CHSTokC        _ _) = True
339  (CHSTokCtrl     _ _) == (CHSTokCtrl     _ _) = True
340  (CHSTokPragma   _  ) == (CHSTokPragma   _  ) = True
341  (CHSTokPragEnd  _  ) == (CHSTokPragEnd  _  ) = True
342  _                    == _                    = False
343
344instance Show CHSToken where
345  showsPrec _ (CHSTokArrow   _  ) = showString "->"
346  showsPrec _ (CHSTokDArrow  _  ) = showString "=>"
347  showsPrec _ (CHSTokDot     _  ) = showString "."
348  showsPrec _ (CHSTokComma   _  ) = showString ","
349  showsPrec _ (CHSTokEqual   _  ) = showString "="
350  showsPrec _ (CHSTokMinus   _  ) = showString "-"
351  showsPrec _ (CHSTokStar    _  ) = showString "*"
352  showsPrec _ (CHSTokAmp     _  ) = showString "&"
353  showsPrec _ (CHSTokHat     _  ) = showString "^"
354  showsPrec _ (CHSTokLBrace  _  ) = showString "{"
355  showsPrec _ (CHSTokRBrace  _  ) = showString "}"
356  showsPrec _ (CHSTokLParen  _  ) = showString "("
357  showsPrec _ (CHSTokRParen  _  ) = showString ")"
358  showsPrec _ (CHSTokEndHook _  ) = showString "#}"
359  showsPrec _ (CHSTokAs      _  ) = showString "as"
360  showsPrec _ (CHSTokCall    _  ) = showString "call"
361  showsPrec _ (CHSTokClass   _  ) = showString "class"
362  showsPrec _ (CHSTokContext _  ) = showString "context"
363  showsPrec _ (CHSTokDerive  _  ) = showString "deriving"
364  showsPrec _ (CHSTokEnum    _  ) = showString "enum"
365  showsPrec _ (CHSTokForeign _  ) = showString "foreign"
366  showsPrec _ (CHSTokFun     _  ) = showString "fun"
367  showsPrec _ (CHSTokGet     _  ) = showString "get"
368  showsPrec _ (CHSTokImport  _  ) = showString "import"
369  showsPrec _ (CHSTokLib     _  ) = showString "lib"
370  showsPrec _ (CHSTokNewtype _  ) = showString "newtype"
371  showsPrec _ (CHSTokPointer _  ) = showString "pointer"
372  showsPrec _ (CHSTokPrefix  _  ) = showString "prefix"
373  showsPrec _ (CHSTokPure    _  ) = showString "pure"
374  showsPrec _ (CHSTokQualif  _  ) = showString "qualified"
375  showsPrec _ (CHSTokSet     _  ) = showString "set"
376  showsPrec _ (CHSTokSizeof  _  ) = showString "sizeof"
377  showsPrec _ (CHSTokStable  _  ) = showString "stable"
378  showsPrec _ (CHSTokType    _  ) = showString "type"
379  showsPrec _ (CHSTok_2Case  _  ) = showString "underscoreToCase"
380  showsPrec _ (CHSTokUnsafe  _  ) = showString "unsafe"
381  showsPrec _ (CHSTokWith    _  ) = showString "with"
382  showsPrec _ (CHSTokLock    _  ) = showString "lock"
383  showsPrec _ (CHSTokNolock  _  ) = showString "nolock"
384  showsPrec _ (CHSTokString  _ s) = showString ("\"" ++ s ++ "\"")
385  showsPrec _ (CHSTokHSVerb  _ s) = showString ("`" ++ s ++ "'")
386  showsPrec _ (CHSTokIdent   _ i) = (showString . identToLexeme) i
387  showsPrec _ (CHSTokHaskell _ s) = showString s
388  showsPrec _ (CHSTokCPP     _ s) = showString s
389  showsPrec _ (CHSTokC       _ s) = showString s
390  showsPrec _ (CHSTokCtrl    _ c) = showChar c
391  showsPrec _ (CHSTokPragma  _  ) = showString "{-# LANGUAGE"
392  showsPrec _ (CHSTokPragEnd _  ) = showString "#-}"
393
394
395-- lexer state
396-- -----------
397
398-- state threaded through the lexer
399--
400data CHSLexerState = CHSLS {
401                       nestLvl :: Int,   -- nesting depth of nested comments
402                       inHook  :: Bool,  -- within a binding hook?
403                       namesup :: [Name] -- supply of unique names
404                     }
405
406-- initial state
407--
408initialState :: CST s CHSLexerState
409initialState  = do
410                  namesup <- liftM names getNameSupply
411                  return $ CHSLS {
412                             nestLvl = 0,
413                             inHook  = False,
414                             namesup = namesup
415                           }
416
417-- raise an error if the given state is not a final state
418--
419assertFinalState :: Position -> CHSLexerState -> CST s ()
420assertFinalState pos CHSLS {nestLvl = nestLvl, inHook = inHook}
421  | nestLvl > 0 = raiseError pos ["Unexpected end of file!",
422                                  "Unclosed nested comment."]
423  | inHook      = raiseError pos ["Unexpected end of file!",
424                                  "Unclosed binding hook."]
425  | otherwise   = nop
426
427-- lexer and action type used throughout this specification
428--
429type CHSLexer  = Lexer  CHSLexerState CHSToken
430type CHSAction = Action               CHSToken
431type CHSRegexp = Regexp CHSLexerState CHSToken
432
433-- for actions that need a new unique name
434--
435infixl 3 `lexactionName`
436lexactionName :: CHSRegexp
437              -> (String -> Position -> Name -> CHSToken)
438              -> CHSLexer
439re `lexactionName` action = re `lexmeta` action'
440  where
441    action' str pos state = let name:ns = namesup state
442                            in
443                            (Just $ Right (action str pos name),
444                             incPos pos (length str),
445                             state {namesup = ns},
446                             Nothing)
447
448
449-- lexical specification
450-- ---------------------
451
452-- the lexical definition of the tokens (the base lexer)
453--
454--
455chslexer :: CHSLexer
456chslexer  =      pragma         -- LANGUAGE pragma
457            >||< haskell        -- Haskell code
458            >||< nested         -- nested comments
459            >||< ctrl           -- control code (that has to be preserved)
460            >||< hook           -- start of a binding hook
461            >||< cpp            -- a pre-processor directive (or `#c')
462
463-- the LANGUAGE pragma
464pragma :: CHSLexer
465pragma = string "{-# LANGUAGE" `lexmeta` \_ pos s ->
466         (Just $ Right (CHSTokPragma pos), incPos pos 12, s, Just langLexer)
467
468langLexer :: CHSLexer
469langLexer = whitespace >||< identOrKW >||< symbol >||<
470            (string "#-}" `lexmeta` \_ pos s ->
471            (Just $ Right (CHSTokPragEnd pos), incPos pos 3, s, Just chslexer))
472
473-- stream of Haskell code (terminated by a control character or binding hook)
474--
475haskell :: CHSLexer
476--
477-- NB: We need to make sure that '"' is not regarded as the beginning of a
478--     string; however, we cannot really lex character literals properly
479--     without lexing identifiers (as the latter may containing single quotes
480--     as part of their lexeme).  Thus, we special case '"'.  This is still a
481--     kludge, as a program fragment, such as
482--
483--       foo'"'strange string"
484--
485--     will not be handled correctly.
486--
487haskell  = (    anyButSpecial`star` epsilon
488            >|< specialButQuotes
489            >|< char '"'  +> inhstr`star` char '"'
490            >|< string "'\"'"                           -- special case of "
491            >|< string "--" +> anyButNL`star` epsilon   -- comment
492           )
493           `lexaction` copyVerbatim
494           >||< char '"'                                -- this is a bad kludge
495                `lexactionErr`
496                  \_ pos -> (Left $ makeError ErrorErr pos
497                                              ["Lexical error!",
498                                              "Unclosed string."])
499           where
500             anyButSpecial    = alt (inlineSet \\ specialSet)
501             specialButQuotes = alt (specialSet \\ ['"'])
502             anyButNL         = alt (anySet \\ ['\n'])
503             inhstr           = instr >|< char '\\' >|< string "\\\"" >|< gap
504             gap              = char '\\' +> alt (' ':ctrlSet)`plus` char '\\'
505
506-- action copying the input verbatim to `CHSTokHaskell' tokens
507--
508copyVerbatim        :: CHSAction
509copyVerbatim cs pos  = Just $ CHSTokHaskell pos cs
510
511-- nested comments
512--
513nested :: CHSLexer
514nested  =
515       string "{-"              {- for Haskell emacs mode :-( -}
516       `lexmeta` enterComment
517  >||<
518       string "-}"
519       `lexmeta` leaveComment
520  where
521    enterComment cs pos s =
522      (copyVerbatim' cs pos,                    -- collect the lexeme
523       incPos pos 2,                            -- advance current position
524       s {nestLvl = nestLvl s + 1},             -- increase nesting level
525       Just $ inNestedComment)                  -- continue in comment lexer
526    --
527    leaveComment cs pos s =
528      case nestLvl s of
529        0 -> (commentCloseErr pos,              -- 0: -} outside comment => err
530              incPos pos 2,                     -- advance current position
531              s,
532              Nothing)
533        1 -> (copyVerbatim' cs pos,             -- collect the lexeme
534              incPos pos 2,                     -- advance current position
535              s {nestLvl = nestLvl s - 1},      -- decrease nesting level
536              Just chslexer)                    -- 1: continue with root lexer
537        _ -> (copyVerbatim' cs pos,             -- collect the lexeme
538              incPos pos 2,                     -- advance current position
539              s {nestLvl = nestLvl s - 1},      -- decrease nesting level
540              Nothing)                          -- _: cont with comment lexer
541    --
542    copyVerbatim' cs pos  = Just $ Right (CHSTokHaskell pos cs)
543    --
544    commentCloseErr pos =
545      Just $ Left (makeError ErrorErr pos
546                             ["Lexical error!",
547                             "`-}' not preceded by a matching `{-'."])
548                             {- for Haskell emacs mode :-( -}
549
550
551-- lexer processing the inner of a comment
552--
553inNestedComment :: CHSLexer
554inNestedComment  =      commentInterior         -- inside a comment
555                   >||< nested                  -- nested comments
556                   >||< ctrl                    -- control code (preserved)
557
558-- standard characters in a nested comment
559--
560commentInterior :: CHSLexer
561commentInterior  = (    anyButSpecial`star` epsilon
562                    >|< special
563                   )
564                   `lexaction` copyVerbatim
565                   where
566                     anyButSpecial = alt (inlineSet \\ commentSpecialSet)
567                     special       = alt commentSpecialSet
568
569-- control code in the base lexer (is turned into a token)
570--
571--  * this covers exactly the same set of characters as contained in `ctrlSet'
572--   and `Lexers.ctrlLexer' and advances positions also like the `ctrlLexer'
573--
574ctrl :: CHSLexer
575ctrl  =
576       char '\n' `lexmeta` newline
577  >||< char '\r' `lexmeta` newline
578  >||< char '\v' `lexmeta` newline
579  >||< char '\f' `lexmeta` formfeed
580  >||< char '\t' `lexmeta` tab
581  where
582    newline  [c] pos = ctrlResult pos c (retPos pos)
583    formfeed [c] pos = ctrlResult pos c (incPos pos 1)
584    tab      [c] pos = ctrlResult pos c (tabPos pos)
585
586    ctrlResult pos c pos' s =
587      (Just $ Right (CHSTokCtrl pos c), pos', s, Nothing)
588
589-- start of a binding hook (ie, enter the binding hook lexer)
590--
591hook :: CHSLexer
592hook  = string "{#"
593        `lexmeta` \_ pos s -> (Nothing, incPos pos 2, s, Just bhLexer)
594
595-- pre-processor directives and `#c'
596--
597--  * we lex `#c' as a directive and special case it in the action
598--
599--  * we lex C line number pragmas and special case it in the action
600--
601cpp :: CHSLexer
602cpp = directive
603      where
604        directive =
605          string "\n#" +> alt ('\t':inlineSet)`star` epsilon
606          `lexmeta`
607             \(_:_:dir) pos s ->        -- strip off the "\n#"
608               case dir of
609                 ['c']                      ->          -- #c
610                   (Nothing, retPos pos, s, Just cLexer)
611                 -- a #c may be followed by whitespace
612                 'c':sp:_ | sp `elem` " \t" ->          -- #c
613                   (Nothing, retPos pos, s, Just cLexer)
614                 ' ':line@(n:_) | isDigit n ->                 -- C line pragma
615                   let pos' = adjustPosByCLinePragma line pos
616                    in (Just $ Right (CHSTokLine pos'), pos', s, Nothing)
617                 _                            ->        -- CPP directive
618                   (Just $ Right (CHSTokCPP pos dir),
619                    retPos pos, s, Nothing)
620
621adjustPosByCLinePragma :: String -> Position -> Position
622adjustPosByCLinePragma str (Position fname _ _) =
623  (Position fname' row' 0)
624  where
625    str'            = dropWhite str
626    (rowStr, str'') = span isDigit str'
627    row'            = read rowStr
628    str'''          = dropWhite str''
629    fnameStr        = takeWhile (/= '"') . drop 1 $ str'''
630    fname'          | null str''' || head str''' /= '"' = fname
631                    -- try and get more sharing of file name strings
632                    | fnameStr == fname                 = fname
633                    | otherwise                         = fnameStr
634    --
635    dropWhite = dropWhile (\c -> c == ' ' || c == '\t')
636
637-- the binding hook lexer
638--
639bhLexer :: CHSLexer
640bhLexer  =      identOrKW
641           >||< symbol
642           >||< strlit
643           >||< hsverb
644           >||< whitespace
645           >||< endOfHook
646           >||< string "--" +> anyButNL`star` char '\n'   -- comment
647                `lexmeta` \_ pos s -> (Nothing, retPos pos, s, Nothing)
648           where
649             anyButNL  = alt (anySet \\ ['\n'])
650             endOfHook = string "#}"
651                         `lexmeta`
652                          \_ pos s -> (Just $ Right (CHSTokEndHook pos),
653                                       incPos pos 2, s, Just chslexer)
654
655-- the inline-C lexer
656--
657cLexer :: CHSLexer
658cLexer =      inlineC                     -- inline C code
659         >||< ctrl                        -- control code (preserved)
660         >||< string "\n#endc"            -- end of inline C code...
661              `lexmeta`                   -- ...preserve '\n' as control token
662              \_ pos s -> (Just $ Right (CHSTokCtrl pos '\n'), retPos pos, s,
663                           Just chslexer)
664         where
665           inlineC = alt inlineSet `lexaction` copyVerbatimC
666           --
667           copyVerbatimC :: CHSAction
668           copyVerbatimC cs pos = Just $ CHSTokC pos cs
669
670-- whitespace
671--
672--  * horizontal and vertical tabs, newlines, and form feeds are filter out by
673--   `Lexers.ctrlLexer'
674--
675whitespace :: CHSLexer
676whitespace  =      (char ' ' `lexaction` \_ _ -> Nothing)
677              >||< ctrlLexer
678
679-- identifiers and keywords
680--
681identOrKW :: CHSLexer
682--
683-- the strictness annotations seem to help a bit
684--
685identOrKW  =
686       -- identifier or keyword
687       (letter +> (letter >|< digit >|< char '\'')`star` epsilon
688       `lexactionName` \cs pos name -> (idkwtok $!pos) cs name)
689  >||< -- identifier in single quotes
690       (char '\'' +> letter +> (letter >|< digit)`star` char '\''
691       `lexactionName` \cs pos name -> (mkid $!pos) cs name)
692       -- NB: quotes are removed by lexemeToIdent
693  where
694    idkwtok pos "as"               _    = CHSTokAs      pos
695    idkwtok pos "call"             _    = CHSTokCall    pos
696    idkwtok pos "class"            _    = CHSTokClass   pos
697    idkwtok pos "context"          _    = CHSTokContext pos
698    idkwtok pos "deriving"         _    = CHSTokDerive  pos
699    idkwtok pos "enum"             _    = CHSTokEnum    pos
700    idkwtok pos "foreign"          _    = CHSTokForeign pos
701    idkwtok pos "fun"              _    = CHSTokFun     pos
702    idkwtok pos "get"              _    = CHSTokGet     pos
703    idkwtok pos "import"           _    = CHSTokImport  pos
704    idkwtok pos "lib"              _    = CHSTokLib     pos
705    idkwtok pos "newtype"          _    = CHSTokNewtype pos
706    idkwtok pos "pointer"          _    = CHSTokPointer pos
707    idkwtok pos "prefix"           _    = CHSTokPrefix  pos
708    idkwtok pos "pure"             _    = CHSTokPure    pos
709    idkwtok pos "qualified"        _    = CHSTokQualif  pos
710    idkwtok pos "set"              _    = CHSTokSet     pos
711    idkwtok pos "sizeof"           _    = CHSTokSizeof  pos
712    idkwtok pos "stable"           _    = CHSTokStable  pos
713    idkwtok pos "type"             _    = CHSTokType    pos
714    idkwtok pos "underscoreToCase" _    = CHSTok_2Case  pos
715    idkwtok pos "unsafe"           _    = CHSTokUnsafe  pos
716    idkwtok pos "with"             _    = CHSTokWith    pos
717    idkwtok pos "lock"             _    = CHSTokLock    pos
718    idkwtok pos "nolock"           _    = CHSTokNolock  pos
719    idkwtok pos cs                 name = mkid pos cs name
720    --
721    mkid pos cs name = CHSTokIdent pos (lexemeToIdent pos cs name)
722
723-- reserved symbols
724--
725symbol :: CHSLexer
726symbol  =      sym "->" CHSTokArrow
727          >||< sym "=>" CHSTokDArrow
728          >||< sym "."  CHSTokDot
729          >||< sym ","  CHSTokComma
730          >||< sym "="  CHSTokEqual
731          >||< sym "-"  CHSTokMinus
732          >||< sym "*"  CHSTokStar
733          >||< sym "&"  CHSTokAmp
734          >||< sym "^"  CHSTokHat
735          >||< sym "{"  CHSTokLBrace
736          >||< sym "}"  CHSTokRBrace
737          >||< sym "("  CHSTokLParen
738          >||< sym ")"  CHSTokRParen
739          where
740            sym cs con = string cs `lexaction` \_ pos -> Just (con pos)
741
742-- string
743--
744strlit :: CHSLexer
745strlit  = char '"' +> (instr >|< char '\\')`star` char '"'
746          `lexaction` \cs pos -> Just (CHSTokString pos (init . tail $ cs))
747
748-- verbatim code
749--
750hsverb :: CHSLexer
751hsverb  = char '`' +> inhsverb`star` char '\''
752          `lexaction` \cs pos -> Just (CHSTokHSVerb pos (init . tail $ cs))
753
754
755-- regular expressions
756--
757letter, digit, instr, inchar, inhsverb :: Regexp s t
758letter   = alt ['a'..'z'] >|< alt ['A'..'Z'] >|< char '_'
759digit    = alt ['0'..'9']
760instr    = alt ([' '..'\127'] \\ "\"\\")
761inchar   = alt ([' '..'\127'] \\ "\'")
762inhsverb = alt ([' '..'\127'] \\ "\'")
763
764-- character sets
765--
766anySet, inlineSet, specialSet, commentSpecialSet, ctrlSet :: [Char]
767anySet            = ['\0'..'\255']
768inlineSet         = anySet \\ ctrlSet
769specialSet        = ['{', '-', '"', '\'']
770commentSpecialSet = ['{', '-']
771ctrlSet           = ['\n', '\f', '\r', '\t', '\v']
772
773
774-- main lexing routine
775-- -------------------
776
777-- generate a token sequence out of a string denoting a CHS file
778-- (EXPORTED)
779--
780--  * the given position is attributed to the first character in the string
781--
782--  * errors are entered into the compiler state
783--
784lexCHS        :: String -> Position -> CST s [CHSToken]
785lexCHS cs pos  =
786  do
787    state <- initialState
788    let (ts, lstate, errs) = execLexer chslexer (cs, pos, state)
789        (_, pos', state')  = lstate
790    mapM raise errs
791    assertFinalState pos' state'
792    return ts
793