c2hs/chs/CHSLexer.hs

--  C->Haskell Compiler: Lexer for CHS Files
--
--  Author : Manuel M T Chakravarty
--  Created: 13 August 99
--
--  Version $Revision: 1.1 $ from $Date: 2004/11/21 21:05:35 $
--
--  Copyright (c) [1999..2004] Manuel M T Chakravarty
--
--  This file is free software; you can redistribute it and/or modify
--  it under the terms of the GNU General Public License as published by
--  the Free Software Foundation; either version 2 of the License, or
--  (at your option) any later version.
--
--  This file is distributed in the hope that it will be useful,
--  but WITHOUT ANY WARRANTY; without even the implied warranty of
--  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
--  GNU General Public License for more details.
--
--- DESCRIPTION ---------------------------------------------------------------
--
--  Lexer for CHS files; the tokens are only partially recognised.
--
--- DOCU ----------------------------------------------------------------------
--
--  language: Haskell 98
--
--  * CHS files are assumed to be Haskell 98 files that include C2HS binding
--    hooks.
--
--  * Haskell code is not tokenised, but binding hooks (delimited by `{#'and
--    `#}') are analysed.  Therefore the lexer operates in two states
--    (realised as two lexer coupled by meta actions) depending on whether
--    Haskell code or a binding hook is currently read.  The lexer reading
--    Haskell code is called `base lexer'; the other one, `binding-hook
--    lexer'.  In addition, there is a inline-c lexer, which, as the
--    binding-hook lexer, can be triggered from the base lexer.
--
--  * Base lexer:
--
--      haskell -> (inline \\ special)*
--               | special \\ `"'
--               | comment
--               | nested
--               | hstring
--               | '{#'
--               | cpp
--      special -> `(' | `{' | `-' | `"'
--      ctrl    -> `\n' | `\f' | `\r' | `\t' | `\v'
--
--      inline  -> any \\ ctrl
--      any     -> '\0'..'\255'
--
--    Within the base lexer control codes appear as separate tokens in the
--    token list.
--
--    NOTE: It is important that `{' is an extra lexeme and not added as an
--          optional component at the end of the first alternative for
--          `haskell'.  Otherwise, the principle of the longest match will
--          divide `foo {#' into the tokens `foo {' and `#' instead of `foo '
--          and `{#'.
--
--    One line comments are handled by
--
--      comment -> `--' (any \\ `\n')* `\n'
--
--    and nested comments by
--
--      nested -> `{-' any* `-}'
--
--    where `any*' may contain _balanced_ occurrences of `{-' and `-}'.
--
--      hstring -> `"' inhstr* `"'
--      inhstr  -> ` '..`\127' \\ `"'
--               | `\"'
--
--    Pre-precessor directives as well as the switch to inline-C code are
--    formed as follows:
--
--      cpp     -> `\n#' (inline | `\t')* `\n'
--               | `\n#c' (' ' | '\t')* `\n'
--
--    We allow whitespace between the `#' and the actual directive, but in `#c'
--    and `#endc' the directive must immediately follow the `#'.  This might
--    be regarded as a not entirely orthogonal design, but simplifies matters
--    especially for `#endc'.
--
--  * On encountering the lexeme `{#', a meta action in the base lexer
--    transfers control to the following binding-hook lexer:
--
--      ident       -> letter (letter | digit | `\'')*
--                   | `\'' letter (letter | digit)* `\''
--      reservedid  -> `as' | `call' | `class' | `context' | `deriving'
--                   | `enum' | `foreign' | `fun' | `get' | `lib'
--                   | `newtype' | `pointer' | `prefix' | `pure' | `set'
--                   | `sizeof' | `stable' | `type' | `underscoreToCase'
--                   | `unsafe' | `with' | 'lock' | 'unlock'
--      reservedsym -> `{#' | `#}' | `{' | `}' | `,' | `.' | `->' | `='
--                   | `=>' | '-' | `*' | `&' | `^'
--      string      -> `"' instr* `"'
--      verbhs      -> `\`' instr* `\''
--      instr       -> ` '..`\127' \\ `"'
--      comment     -> `--' (any \\ `\n')* `\n'
--
--    Control characters, white space, and comments are discarded in the
--    binding-hook lexer.  Nested comments are not allowed in a binding hook.
--    Identifiers can be enclosed in single quotes to avoid collision with
--    C->Haskell keywords.
--
--  * In the binding-hook lexer, the lexeme `#}' transfers control back to the
--    base lexer.  An occurence of the lexeme `{#' inside the binding-hook
--    lexer triggers an error.  The symbol `{#' is not explcitly represented
--    in the resulting token stream.  However, the occurrence of a token
--    representing one of the reserved identifiers `call', `context', `enum',
--    and `field' marks the start of a binding hook.  Strictly speaking, `#}'
--    need also not occur in the token stream, as the next `haskell' token
--    marks a hook's end.  It is, however, useful for producing accurate error
--    messages (in case an hook is closed to early) to have a token
--    representing `#}'.
--
--  * The rule `ident' describes Haskell identifiers, but without
--    distinguishing between variable and constructor identifers (ie, those
--    starting with a lowercase and those starting with an uppercase letter).
--    However, we use it also to scan C identifiers; although, strictly
--    speaking, it is too general for them.  In the case of C identifiers,
--    this should not have any impact on the range of descriptions accepted by
--    the tool, as illegal identifier will never occur in a C header file that
--    is accepted by the C lexer.  In the case of Haskell identifiers, a
--    confusion between variable and constructor identifiers will be noted by
--    the Haskell compiler translating the code generated by c2hs.  Moreover,
--    identifiers can be enclosed in single quotes to avoid collision with
--    C->Haskell keywords, but those may not contain apostrophes.
--
--  * Any line starting with the character `#' is regarded to be a C
--    preprocessor directive.  With the exception of `#c' and `#endc', which
--    delimit a set of lines containing inline C code.  Hence, in the base
--    lexer, the lexeme `#c' triggers a meta action transferring control to the
--    following inline-C lexer:
--
--      c  -> inline* \\ `\n#endc'
--
--    We do neither treat C strings nor C comments specially.  Hence, if the
--    string "\n#endc" occurs in a comment, we will mistakenly regard it as
--    the end of the inline C code.  Note that the problem cannot happen with
--    strings, as C does not permit strings that extend over multiple lines.
--    At the moment, it just seems not to be worth the effort required to
--    treat this situation more accurately.
--
--    The inline-C lexer also doesn't handle pre-processor directives
--    specially.  Hence, structural pre-processor directives (namely,
--    conditionals) may occur within inline-C code only properly nested.
--
--  Shortcomings
--  ~~~~~~~~~~~~
--  Some lexemes that include single and double quote characters are not lexed
--  correctly.  See the implementation comment at `haskell' for details.
--
--
--- TODO ----------------------------------------------------------------------
--
--  * In `haskell', the case of a single `"' (without a matching second one)
--    is caught by an eplicit error raising rule.  This shouldn't be
--    necessary, but for some strange reason, the lexer otherwise hangs when a
--    single `"' appears in the input.
--
--  * Comments in the "gap" of a string are not yet supported.
--

module CHSLexer (CHSToken(..), lexCHS)
where

import Data.List         ((\\))
import Data.Char         (isDigit)
import Control.Monad     (liftM)
import Numeric   (readDec, readOct, readHex)

import Position  (Position(..), Pos(posOf), incPos, retPos, tabPos)
import Errors    (ErrorLvl(..), Error, makeError)
import UNames    (NameSupply, Name, names)
import Idents    (Ident, lexemeToIdent, identToLexeme)
import Lexers    (Regexp, Lexer, Action, epsilon, char, (+>), lexaction,
                  lexactionErr, lexmeta, (>|<), (>||<), ctrlLexer, star, plus,
                  quest, alt, string, LexerState, execLexer)

import C2HSState (CST, raise, raiseError, nop, getNameSupply)


-- token definition
-- ----------------

-- possible tokens (EXPORTED)
--
data CHSToken = CHSTokArrow   Position          -- `->'
              | CHSTokDArrow  Position          -- `=>'
              | CHSTokDot     Position          -- `.'
              | CHSTokComma   Position          -- `,'
              | CHSTokEqual   Position          -- `='
              | CHSTokMinus   Position          -- `-'
              | CHSTokStar    Position          -- `*'
              | CHSTokAmp     Position          -- `&'
              | CHSTokHat     Position          -- `^'
              | CHSTokLBrace  Position          -- `{'
              | CHSTokRBrace  Position          -- `}'
              | CHSTokLParen  Position          -- `('
              | CHSTokRParen  Position          -- `)'
              | CHSTokEndHook Position          -- `#}'
              | CHSTokAs      Position          -- `as'
              | CHSTokCall    Position          -- `call'
              | CHSTokClass   Position          -- `class'
              | CHSTokContext Position          -- `context'
              | CHSTokDerive  Position          -- `deriving'
              | CHSTokEnum    Position          -- `enum'
              | CHSTokForeign Position          -- `foreign'
              | CHSTokFun     Position          -- `fun'
              | CHSTokGet     Position          -- `get'
              | CHSTokImport  Position          -- `import'
              | CHSTokLib     Position          -- `lib'
              | CHSTokNewtype Position          -- `newtype'
              | CHSTokPointer Position          -- `pointer'
              | CHSTokPrefix  Position          -- `prefix'
              | CHSTokPure    Position          -- `pure'
              | CHSTokQualif  Position          -- `qualified'
              | CHSTokSet     Position          -- `set'
              | CHSTokSizeof  Position          -- `sizeof'
              | CHSTokStable  Position          -- `stable'
              | CHSTokType    Position          -- `type'
              | CHSTok_2Case  Position          -- `underscoreToCase'
              | CHSTokUnsafe  Position          -- `unsafe'
              | CHSTokWith    Position          -- `with'
              | CHSTokLock    Position          -- `lock'
              | CHSTokNolock  Position          -- `nolock'
              | CHSTokString  Position String   -- string
              | CHSTokHSVerb  Position String   -- verbatim Haskell (`...')
              | CHSTokIdent   Position Ident    -- identifier
              | CHSTokHaskell Position String   -- verbatim Haskell code
              | CHSTokCPP     Position String   -- pre-processor directive
              | CHSTokLine    Position          -- line pragma
              | CHSTokC       Position String   -- verbatim C code
              | CHSTokCtrl    Position Char     -- control code
              | CHSTokPragma  Position          -- '{-# LANGUAGE' language pragma begin
              | CHSTokPragEnd Position          -- '#-}' language pragma end

instance Pos CHSToken where
  posOf (CHSTokArrow   pos  ) = pos
  posOf (CHSTokDArrow  pos  ) = pos
  posOf (CHSTokDot     pos  ) = pos
  posOf (CHSTokComma   pos  ) = pos
  posOf (CHSTokEqual   pos  ) = pos
  posOf (CHSTokMinus   pos  ) = pos
  posOf (CHSTokStar    pos  ) = pos
  posOf (CHSTokAmp     pos  ) = pos
  posOf (CHSTokHat     pos  ) = pos
  posOf (CHSTokLBrace  pos  ) = pos
  posOf (CHSTokRBrace  pos  ) = pos
  posOf (CHSTokLParen  pos  ) = pos
  posOf (CHSTokRParen  pos  ) = pos
  posOf (CHSTokEndHook pos  ) = pos
  posOf (CHSTokAs      pos  ) = pos
  posOf (CHSTokCall    pos  ) = pos
  posOf (CHSTokClass   pos  ) = pos
  posOf (CHSTokContext pos  ) = pos
  posOf (CHSTokDerive  pos  ) = pos
  posOf (CHSTokEnum    pos  ) = pos
  posOf (CHSTokForeign pos  ) = pos
  posOf (CHSTokFun     pos  ) = pos
  posOf (CHSTokGet     pos  ) = pos
  posOf (CHSTokImport  pos  ) = pos
  posOf (CHSTokLib     pos  ) = pos
  posOf (CHSTokNewtype pos  ) = pos
  posOf (CHSTokPointer pos  ) = pos
  posOf (CHSTokPrefix  pos  ) = pos
  posOf (CHSTokPure    pos  ) = pos
  posOf (CHSTokQualif  pos  ) = pos
  posOf (CHSTokSet     pos  ) = pos
  posOf (CHSTokSizeof  pos  ) = pos
  posOf (CHSTokStable  pos  ) = pos
  posOf (CHSTokType    pos  ) = pos
  posOf (CHSTok_2Case  pos  ) = pos
  posOf (CHSTokUnsafe  pos  ) = pos
  posOf (CHSTokWith    pos  ) = pos
  posOf (CHSTokLock    pos  ) = pos
  posOf (CHSTokNolock  pos  ) = pos
  posOf (CHSTokString  pos _) = pos
  posOf (CHSTokHSVerb  pos _) = pos
  posOf (CHSTokIdent   pos _) = pos
  posOf (CHSTokHaskell pos _) = pos
  posOf (CHSTokCPP     pos _) = pos
  posOf (CHSTokC       pos _) = pos
  posOf (CHSTokCtrl    pos _) = pos
  posOf (CHSTokPragma  pos  ) = pos
  posOf (CHSTokPragEnd pos  ) = pos

instance Eq CHSToken where
  (CHSTokArrow    _  ) == (CHSTokArrow    _  ) = True
  (CHSTokDArrow   _  ) == (CHSTokDArrow   _  ) = True
  (CHSTokDot      _  ) == (CHSTokDot      _  ) = True
  (CHSTokComma    _  ) == (CHSTokComma    _  ) = True
  (CHSTokEqual    _  ) == (CHSTokEqual    _  ) = True
  (CHSTokMinus    _  ) == (CHSTokMinus    _  ) = True
  (CHSTokStar     _  ) == (CHSTokStar     _  ) = True
  (CHSTokAmp      _  ) == (CHSTokAmp      _  ) = True
  (CHSTokHat      _  ) == (CHSTokHat      _  ) = True
  (CHSTokLBrace   _  ) == (CHSTokLBrace   _  ) = True
  (CHSTokRBrace   _  ) == (CHSTokRBrace   _  ) = True
  (CHSTokLParen   _  ) == (CHSTokLParen   _  ) = True
  (CHSTokRParen   _  ) == (CHSTokRParen   _  ) = True
  (CHSTokEndHook  _  ) == (CHSTokEndHook  _  ) = True
  (CHSTokAs       _  ) == (CHSTokAs       _  ) = True
  (CHSTokCall     _  ) == (CHSTokCall     _  ) = True
  (CHSTokClass    _  ) == (CHSTokClass    _  ) = True
  (CHSTokContext  _  ) == (CHSTokContext  _  ) = True
  (CHSTokDerive   _  ) == (CHSTokDerive   _  ) = True
  (CHSTokEnum     _  ) == (CHSTokEnum     _  ) = True
  (CHSTokForeign  _  ) == (CHSTokForeign  _  ) = True
  (CHSTokFun      _  ) == (CHSTokFun      _  ) = True
  (CHSTokGet      _  ) == (CHSTokGet      _  ) = True
  (CHSTokImport   _  ) == (CHSTokImport   _  ) = True
  (CHSTokLib      _  ) == (CHSTokLib      _  ) = True
  (CHSTokNewtype  _  ) == (CHSTokNewtype  _  ) = True
  (CHSTokPointer  _  ) == (CHSTokPointer  _  ) = True
  (CHSTokPrefix   _  ) == (CHSTokPrefix   _  ) = True
  (CHSTokPure     _  ) == (CHSTokPure     _  ) = True
  (CHSTokQualif   _  ) == (CHSTokQualif   _  ) = True
  (CHSTokSet      _  ) == (CHSTokSet      _  ) = True
  (CHSTokSizeof   _  ) == (CHSTokSizeof   _  ) = True
  (CHSTokStable   _  ) == (CHSTokStable   _  ) = True
  (CHSTokType     _  ) == (CHSTokType     _  ) = True
  (CHSTok_2Case   _  ) == (CHSTok_2Case   _  ) = True
  (CHSTokUnsafe   _  ) == (CHSTokUnsafe   _  ) = True
  (CHSTokWith     _  ) == (CHSTokWith     _  ) = True
  (CHSTokLock     _  ) == (CHSTokLock     _  ) = True
  (CHSTokNolock   _  ) == (CHSTokNolock   _  ) = True
  (CHSTokString   _ _) == (CHSTokString   _ _) = True
  (CHSTokHSVerb   _ _) == (CHSTokHSVerb   _ _) = True
  (CHSTokIdent    _ _) == (CHSTokIdent    _ _) = True
  (CHSTokHaskell  _ _) == (CHSTokHaskell  _ _) = True
  (CHSTokCPP      _ _) == (CHSTokCPP      _ _) = True
  (CHSTokC        _ _) == (CHSTokC        _ _) = True
  (CHSTokCtrl     _ _) == (CHSTokCtrl     _ _) = True
  (CHSTokPragma   _  ) == (CHSTokPragma   _  ) = True
  (CHSTokPragEnd  _  ) == (CHSTokPragEnd  _  ) = True
  _                    == _                    = False

instance Show CHSToken where
  showsPrec _ (CHSTokArrow   _  ) = showString "->"
  showsPrec _ (CHSTokDArrow  _  ) = showString "=>"
  showsPrec _ (CHSTokDot     _  ) = showString "."
  showsPrec _ (CHSTokComma   _  ) = showString ","
  showsPrec _ (CHSTokEqual   _  ) = showString "="
  showsPrec _ (CHSTokMinus   _  ) = showString "-"
  showsPrec _ (CHSTokStar    _  ) = showString "*"
  showsPrec _ (CHSTokAmp     _  ) = showString "&"
  showsPrec _ (CHSTokHat     _  ) = showString "^"
  showsPrec _ (CHSTokLBrace  _  ) = showString "{"
  showsPrec _ (CHSTokRBrace  _  ) = showString "}"
  showsPrec _ (CHSTokLParen  _  ) = showString "("
  showsPrec _ (CHSTokRParen  _  ) = showString ")"
  showsPrec _ (CHSTokEndHook _  ) = showString "#}"
  showsPrec _ (CHSTokAs      _  ) = showString "as"
  showsPrec _ (CHSTokCall    _  ) = showString "call"
  showsPrec _ (CHSTokClass   _  ) = showString "class"
  showsPrec _ (CHSTokContext _  ) = showString "context"
  showsPrec _ (CHSTokDerive  _  ) = showString "deriving"
  showsPrec _ (CHSTokEnum    _  ) = showString "enum"
  showsPrec _ (CHSTokForeign _  ) = showString "foreign"
  showsPrec _ (CHSTokFun     _  ) = showString "fun"
  showsPrec _ (CHSTokGet     _  ) = showString "get"
  showsPrec _ (CHSTokImport  _  ) = showString "import"
  showsPrec _ (CHSTokLib     _  ) = showString "lib"
  showsPrec _ (CHSTokNewtype _  ) = showString "newtype"
  showsPrec _ (CHSTokPointer _  ) = showString "pointer"
  showsPrec _ (CHSTokPrefix  _  ) = showString "prefix"
  showsPrec _ (CHSTokPure    _  ) = showString "pure"
  showsPrec _ (CHSTokQualif  _  ) = showString "qualified"
  showsPrec _ (CHSTokSet     _  ) = showString "set"
  showsPrec _ (CHSTokSizeof  _  ) = showString "sizeof"
  showsPrec _ (CHSTokStable  _  ) = showString "stable"
  showsPrec _ (CHSTokType    _  ) = showString "type"
  showsPrec _ (CHSTok_2Case  _  ) = showString "underscoreToCase"
  showsPrec _ (CHSTokUnsafe  _  ) = showString "unsafe"
  showsPrec _ (CHSTokWith    _  ) = showString "with"
  showsPrec _ (CHSTokLock    _  ) = showString "lock"
  showsPrec _ (CHSTokNolock  _  ) = showString "nolock"
  showsPrec _ (CHSTokString  _ s) = showString ("\"" ++ s ++ "\"")
  showsPrec _ (CHSTokHSVerb  _ s) = showString ("`" ++ s ++ "'")
  showsPrec _ (CHSTokIdent   _ i) = (showString . identToLexeme) i
  showsPrec _ (CHSTokHaskell _ s) = showString s
  showsPrec _ (CHSTokCPP     _ s) = showString s
  showsPrec _ (CHSTokC       _ s) = showString s
  showsPrec _ (CHSTokCtrl    _ c) = showChar c
  showsPrec _ (CHSTokPragma  _  ) = showString "{-# LANGUAGE"
  showsPrec _ (CHSTokPragEnd _  ) = showString "#-}"


-- lexer state
-- -----------

-- state threaded through the lexer
--
data CHSLexerState = CHSLS {
                       nestLvl :: Int,   -- nesting depth of nested comments
                       inHook  :: Bool,  -- within a binding hook?
                       namesup :: [Name] -- supply of unique names
                     }

-- initial state
--
initialState :: CST s CHSLexerState
initialState  = do
                  namesup <- liftM names getNameSupply
                  return $ CHSLS {
                             nestLvl = 0,
                             inHook  = False,
                             namesup = namesup
                           }

-- raise an error if the given state is not a final state
--
assertFinalState :: Position -> CHSLexerState -> CST s ()
assertFinalState pos CHSLS {nestLvl = nestLvl, inHook = inHook}
  | nestLvl > 0 = raiseError pos ["Unexpected end of file!",
                                  "Unclosed nested comment."]
  | inHook      = raiseError pos ["Unexpected end of file!",
                                  "Unclosed binding hook."]
  | otherwise   = nop

-- lexer and action type used throughout this specification
--
type CHSLexer  = Lexer  CHSLexerState CHSToken
type CHSAction = Action               CHSToken
type CHSRegexp = Regexp CHSLexerState CHSToken

-- for actions that need a new unique name
--
infixl 3 `lexactionName`
lexactionName :: CHSRegexp
              -> (String -> Position -> Name -> CHSToken)
              -> CHSLexer
re `lexactionName` action = re `lexmeta` action'
  where
    action' str pos state = let name:ns = namesup state
                            in
                            (Just $ Right (action str pos name),
                             incPos pos (length str),
                             state {namesup = ns},
                             Nothing)


-- lexical specification
-- ---------------------

-- the lexical definition of the tokens (the base lexer)
--
--
chslexer :: CHSLexer
chslexer  =      pragma         -- LANGUAGE pragma
            >||< haskell        -- Haskell code
            >||< nested         -- nested comments
            >||< ctrl           -- control code (that has to be preserved)
            >||< hook           -- start of a binding hook
            >||< cpp            -- a pre-processor directive (or `#c')

-- the LANGUAGE pragma
pragma :: CHSLexer
pragma = string "{-# LANGUAGE" `lexmeta` \_ pos s ->
         (Just $ Right (CHSTokPragma pos), incPos pos 12, s, Just langLexer)

langLexer :: CHSLexer
langLexer = whitespace >||< identOrKW >||< symbol >||<
            (string "#-}" `lexmeta` \_ pos s ->
            (Just $ Right (CHSTokPragEnd pos), incPos pos 3, s, Just chslexer))

-- stream of Haskell code (terminated by a control character or binding hook)
--
haskell :: CHSLexer
--
-- NB: We need to make sure that '"' is not regarded as the beginning of a
--     string; however, we cannot really lex character literals properly
--     without lexing identifiers (as the latter may containing single quotes
--     as part of their lexeme).  Thus, we special case '"'.  This is still a
--     kludge, as a program fragment, such as
--
--       foo'"'strange string"
--
--     will not be handled correctly.
--
haskell  = (    anyButSpecial`star` epsilon
            >|< specialButQuotes
            >|< char '"'  +> inhstr`star` char '"'
            >|< string "'\"'"                           -- special case of "
            >|< string "--" +> anyButNL`star` epsilon   -- comment
           )
           `lexaction` copyVerbatim
           >||< char '"'                                -- this is a bad kludge
                `lexactionErr`
                  \_ pos -> (Left $ makeError ErrorErr pos
                                              ["Lexical error!",
                                              "Unclosed string."])
           where
             anyButSpecial    = alt (inlineSet \\ specialSet)
             specialButQuotes = alt (specialSet \\ ['"'])
             anyButNL         = alt (anySet \\ ['\n'])
             inhstr           = instr >|< char '\\' >|< string "\\\"" >|< gap
             gap              = char '\\' +> alt (' ':ctrlSet)`plus` char '\\'

-- action copying the input verbatim to `CHSTokHaskell' tokens
--
copyVerbatim        :: CHSAction
copyVerbatim cs pos  = Just $ CHSTokHaskell pos cs

-- nested comments
--
nested :: CHSLexer
nested  =
       string "{-"              {- for Haskell emacs mode :-( -}
       `lexmeta` enterComment
  >||<
       string "-}"
       `lexmeta` leaveComment
  where
    enterComment cs pos s =
      (copyVerbatim' cs pos,                    -- collect the lexeme
       incPos pos 2,                            -- advance current position
       s {nestLvl = nestLvl s + 1},             -- increase nesting level
       Just $ inNestedComment)                  -- continue in comment lexer
    --
    leaveComment cs pos s =
      case nestLvl s of
        0 -> (commentCloseErr pos,              -- 0: -} outside comment => err
              incPos pos 2,                     -- advance current position
              s,
              Nothing)
        1 -> (copyVerbatim' cs pos,             -- collect the lexeme
              incPos pos 2,                     -- advance current position
              s {nestLvl = nestLvl s - 1},      -- decrease nesting level
              Just chslexer)                    -- 1: continue with root lexer
        _ -> (copyVerbatim' cs pos,             -- collect the lexeme
              incPos pos 2,                     -- advance current position
              s {nestLvl = nestLvl s - 1},      -- decrease nesting level
              Nothing)                          -- _: cont with comment lexer
    --
    copyVerbatim' cs pos  = Just $ Right (CHSTokHaskell pos cs)
    --
    commentCloseErr pos =
      Just $ Left (makeError ErrorErr pos
                             ["Lexical error!",
                             "`-}' not preceded by a matching `{-'."])
                             {- for Haskell emacs mode :-( -}


-- lexer processing the inner of a comment
--
inNestedComment :: CHSLexer
inNestedComment  =      commentInterior         -- inside a comment
                   >||< nested                  -- nested comments
                   >||< ctrl                    -- control code (preserved)

-- standard characters in a nested comment
--
commentInterior :: CHSLexer
commentInterior  = (    anyButSpecial`star` epsilon
                    >|< special
                   )
                   `lexaction` copyVerbatim
                   where
                     anyButSpecial = alt (inlineSet \\ commentSpecialSet)
                     special       = alt commentSpecialSet

-- control code in the base lexer (is turned into a token)
--
--  * this covers exactly the same set of characters as contained in `ctrlSet'
--   and `Lexers.ctrlLexer' and advances positions also like the `ctrlLexer'
--
ctrl :: CHSLexer
ctrl  =
       char '\n' `lexmeta` newline
  >||< char '\r' `lexmeta` newline
  >||< char '\v' `lexmeta` newline
  >||< char '\f' `lexmeta` formfeed
  >||< char '\t' `lexmeta` tab
  where
    newline  [c] pos = ctrlResult pos c (retPos pos)
    formfeed [c] pos = ctrlResult pos c (incPos pos 1)
    tab      [c] pos = ctrlResult pos c (tabPos pos)

    ctrlResult pos c pos' s =
      (Just $ Right (CHSTokCtrl pos c), pos', s, Nothing)

-- start of a binding hook (ie, enter the binding hook lexer)
--
hook :: CHSLexer
hook  = string "{#"
        `lexmeta` \_ pos s -> (Nothing, incPos pos 2, s, Just bhLexer)

-- pre-processor directives and `#c'
--
--  * we lex `#c' as a directive and special case it in the action
--
--  * we lex C line number pragmas and special case it in the action
--
cpp :: CHSLexer
cpp = directive
      where
        directive =
          string "\n#" +> alt ('\t':inlineSet)`star` epsilon
          `lexmeta`
             \(_:_:dir) pos s ->        -- strip off the "\n#"
               case dir of
                 ['c']                      ->          -- #c
                   (Nothing, retPos pos, s, Just cLexer)
                 -- a #c may be followed by whitespace
                 'c':sp:_ | sp `elem` " \t" ->          -- #c
                   (Nothing, retPos pos, s, Just cLexer)
                 ' ':line@(n:_) | isDigit n ->                 -- C line pragma
                   let pos' = adjustPosByCLinePragma line pos
                    in (Just $ Right (CHSTokLine pos'), pos', s, Nothing)
                 _                            ->        -- CPP directive
                   (Just $ Right (CHSTokCPP pos dir),
                    retPos pos, s, Nothing)

adjustPosByCLinePragma :: String -> Position -> Position
adjustPosByCLinePragma str (Position fname _ _) =
  (Position fname' row' 0)
  where
    str'            = dropWhite str
    (rowStr, str'') = span isDigit str'
    row'            = read rowStr
    str'''          = dropWhite str''
    fnameStr        = takeWhile (/= '"') . drop 1 $ str'''
    fname'          | null str''' || head str''' /= '"' = fname
                    -- try and get more sharing of file name strings
                    | fnameStr == fname                 = fname
                    | otherwise                         = fnameStr
    --
    dropWhite = dropWhile (\c -> c == ' ' || c == '\t')

-- the binding hook lexer
--
bhLexer :: CHSLexer
bhLexer  =      identOrKW
           >||< symbol
           >||< strlit
           >||< hsverb
           >||< whitespace
           >||< endOfHook
           >||< string "--" +> anyButNL`star` char '\n'   -- comment
                `lexmeta` \_ pos s -> (Nothing, retPos pos, s, Nothing)
           where
             anyButNL  = alt (anySet \\ ['\n'])
             endOfHook = string "#}"
                         `lexmeta`
                          \_ pos s -> (Just $ Right (CHSTokEndHook pos),
                                       incPos pos 2, s, Just chslexer)

-- the inline-C lexer
--
cLexer :: CHSLexer
cLexer =      inlineC                     -- inline C code
         >||< ctrl                        -- control code (preserved)
         >||< string "\n#endc"            -- end of inline C code...
              `lexmeta`                   -- ...preserve '\n' as control token
              \_ pos s -> (Just $ Right (CHSTokCtrl pos '\n'), retPos pos, s,
                           Just chslexer)
         where
           inlineC = alt inlineSet `lexaction` copyVerbatimC
           --
           copyVerbatimC :: CHSAction
           copyVerbatimC cs pos = Just $ CHSTokC pos cs

-- whitespace
--
--  * horizontal and vertical tabs, newlines, and form feeds are filter out by
--   `Lexers.ctrlLexer'
--
whitespace :: CHSLexer
whitespace  =      (char ' ' `lexaction` \_ _ -> Nothing)
              >||< ctrlLexer

-- identifiers and keywords
--
identOrKW :: CHSLexer
--
-- the strictness annotations seem to help a bit
--
identOrKW  =
       -- identifier or keyword
       (letter +> (letter >|< digit >|< char '\'')`star` epsilon
       `lexactionName` \cs pos name -> (idkwtok $!pos) cs name)
  >||< -- identifier in single quotes
       (char '\'' +> letter +> (letter >|< digit)`star` char '\''
       `lexactionName` \cs pos name -> (mkid $!pos) cs name)
       -- NB: quotes are removed by lexemeToIdent
  where
    idkwtok pos "as"               _    = CHSTokAs      pos
    idkwtok pos "call"             _    = CHSTokCall    pos
    idkwtok pos "class"            _    = CHSTokClass   pos
    idkwtok pos "context"          _    = CHSTokContext pos
    idkwtok pos "deriving"         _    = CHSTokDerive  pos
    idkwtok pos "enum"             _    = CHSTokEnum    pos
    idkwtok pos "foreign"          _    = CHSTokForeign pos
    idkwtok pos "fun"              _    = CHSTokFun     pos
    idkwtok pos "get"              _    = CHSTokGet     pos
    idkwtok pos "import"           _    = CHSTokImport  pos
    idkwtok pos "lib"              _    = CHSTokLib     pos
    idkwtok pos "newtype"          _    = CHSTokNewtype pos
    idkwtok pos "pointer"          _    = CHSTokPointer pos
    idkwtok pos "prefix"           _    = CHSTokPrefix  pos
    idkwtok pos "pure"             _    = CHSTokPure    pos
    idkwtok pos "qualified"        _    = CHSTokQualif  pos
    idkwtok pos "set"              _    = CHSTokSet     pos
    idkwtok pos "sizeof"           _    = CHSTokSizeof  pos
    idkwtok pos "stable"           _    = CHSTokStable  pos
    idkwtok pos "type"             _    = CHSTokType    pos
    idkwtok pos "underscoreToCase" _    = CHSTok_2Case  pos
    idkwtok pos "unsafe"           _    = CHSTokUnsafe  pos
    idkwtok pos "with"             _    = CHSTokWith    pos
    idkwtok pos "lock"             _    = CHSTokLock    pos
    idkwtok pos "nolock"           _    = CHSTokNolock  pos
    idkwtok pos cs                 name = mkid pos cs name
    --
    mkid pos cs name = CHSTokIdent pos (lexemeToIdent pos cs name)

-- reserved symbols
--
symbol :: CHSLexer
symbol  =      sym "->" CHSTokArrow
          >||< sym "=>" CHSTokDArrow
          >||< sym "."  CHSTokDot
          >||< sym ","  CHSTokComma
          >||< sym "="  CHSTokEqual
          >||< sym "-"  CHSTokMinus
          >||< sym "*"  CHSTokStar
          >||< sym "&"  CHSTokAmp
          >||< sym "^"  CHSTokHat
          >||< sym "{"  CHSTokLBrace
          >||< sym "}"  CHSTokRBrace
          >||< sym "("  CHSTokLParen
          >||< sym ")"  CHSTokRParen
          where
            sym cs con = string cs `lexaction` \_ pos -> Just (con pos)

-- string
--
strlit :: CHSLexer
strlit  = char '"' +> (instr >|< char '\\')`star` char '"'
          `lexaction` \cs pos -> Just (CHSTokString pos (init . tail $ cs))

-- verbatim code
--
hsverb :: CHSLexer
hsverb  = char '`' +> inhsverb`star` char '\''
          `lexaction` \cs pos -> Just (CHSTokHSVerb pos (init . tail $ cs))


-- regular expressions
--
letter, digit, instr, inchar, inhsverb :: Regexp s t
letter   = alt ['a'..'z'] >|< alt ['A'..'Z'] >|< char '_'
digit    = alt ['0'..'9']
instr    = alt ([' '..'\127'] \\ "\"\\")
inchar   = alt ([' '..'\127'] \\ "\'")
inhsverb = alt ([' '..'\127'] \\ "\'")

-- character sets
--
anySet, inlineSet, specialSet, commentSpecialSet, ctrlSet :: [Char]
anySet            = ['\0'..'\255']
inlineSet         = anySet \\ ctrlSet
specialSet        = ['{', '-', '"', '\'']
commentSpecialSet = ['{', '-']
ctrlSet           = ['\n', '\f', '\r', '\t', '\v']


-- main lexing routine
-- -------------------

-- generate a token sequence out of a string denoting a CHS file
-- (EXPORTED)
--
--  * the given position is attributed to the first character in the string
--
--  * errors are entered into the compiler state
--
lexCHS        :: String -> Position -> CST s [CHSToken]
lexCHS cs pos  =
  do
    state <- initialState
    let (ts, lstate, errs) = execLexer chslexer (cs, pos, state)
        (_, pos', state')  = lstate
    mapM raise errs
    assertFinalState pos' state'
    return ts