1--  C->Haskell Compiler: Lexer for CHS Files
2--
3--  Author : Manuel M T Chakravarty
4--  Created: 13 August 99
5--
6--  Copyright (c) [1999..2005] Manuel M T Chakravarty
7--
8--  This file is free software; you can redistribute it and/or modify
9--  it under the terms of the GNU General Public License as published by
10--  the Free Software Foundation; either version 2 of the License, or
11--  (at your option) any later version.
12--
13--  This file is distributed in the hope that it will be useful,
14--  but WITHOUT ANY WARRANTY; without even the implied warranty of
15--  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16--  GNU General Public License for more details.
17--
18--- DESCRIPTION ---------------------------------------------------------------
19--
20--  Lexer for CHS files; the tokens are only partially recognised.
21--
22--- DOCU ----------------------------------------------------------------------
23--
24--  language: Haskell 98
25--
26--  * CHS files are assumed to be Haskell 98 files that include C2HS binding
27--    hooks.
28--
29--  * Haskell code is not tokenised, but binding hooks (delimited by `{#'and
30--    `#}') are analysed.  Therefore the lexer operates in two states
31--    (realised as two lexer coupled by meta actions) depending on whether
32--    Haskell code or a binding hook is currently read.  The lexer reading
33--    Haskell code is called `base lexer'; the other one, `binding-hook
34--    lexer'.  In addition, there is a inline-c lexer, which, as the
35--    binding-hook lexer, can be triggered from the base lexer.
36--
37--  * Base lexer:
38--
39--      haskell -> (inline \\ special)*
40--               | special \\ `"'
41--               | comment
42--               | nested
43--               | hstring
44--               | '{#'
45--               | cpp
46--      special -> `(' | `{' | `-' | `"'
47--      ctrl    -> `\n' | `\f' | `\r' | `\t' | `\v'
48--
49--      inline  -> any \\ ctrl
50--      any     -> '\0'..'\255'
51--
52--    Within the base lexer control codes appear as separate tokens in the
53--    token list.
54--
55--    NOTE: It is important that `{' is an extra lexeme and not added as an
56--          optional component at the end of the first alternative for
57--          `haskell'.  Otherwise, the principle of the longest match will
58--          divide `foo {#' into the tokens `foo {' and `#' instead of `foo '
59--          and `{#'.
60--
61--    One line comments are handled by
62--
63--      comment -> `--' (any \\ `\n')* `\n'
64--
65--    and nested comments by
66--
67--      nested -> `{-' any* `-}'
68--
69--    where `any*' may contain _balanced_ occurrences of `{-' and `-}'.
70--
71--      hstring -> `"' inhstr* `"'
72--      inhstr  -> ` '..`\127' \\ `"'
73--               | `\"'
74--
75--    Pre-precessor directives as well as the switch to inline-C code are
76--    formed as follows:
77--
78--      cpp     -> `\n#' (inline | `\t')* `\n'
79--               | `\n#c' (' ' | '\t')* `\n'
80--
81--    We allow whitespace between the `#' and the actual directive, but in `#c'
82--    and `#endc' the directive must immediately follow the `#'.  This might
83--    be regarded as a not entirely orthogonal design, but simplifies matters
84--    especially for `#endc'.
85--
86--  * On encountering the lexeme `{#', a meta action in the base lexer
87--    transfers control to the following binding-hook lexer:
88--
89--      ident       -> letter (letter | digit | `\'')*
90--      cidenttail  -> digit (letter | digit)*
91--      reservedid  -> `add' | `as' | `call' | `class' | `context' | `deriving'
92--                   | `enum' | `foreign' | `fun' | `get' | `lib'
93--                   | `downcaseFirstLetter' | `finalizer' | `interruptible'
94--                   | `newtype' | `nocode' | `pointer' | `prefix' | `pure'
95--                   | `set' | `sizeof' | `stable' | `struct' | `type'
96--                   | `underscoreToCase' | `upcaseFirstLetter' | `unsafe' |
97--                   | `with' | `const' | `omit'
98--      reservedsym -> `{#' | `#}' | `{' | `}' | `,' | `.' | `->' | `='
99--                   | `=>' | '-' | `*' | `&' | `^' | `+' | `%'
100--      string      -> `"' instr* `"'
101--      verbhs      -> `\`' inhsverb* `\''
102--      quoths      -> `\'' inhsverb* `\''
103--      instr       -> ` '..`\127' \\ `"'
104--      inhsverb    -> ` '..`\127' \\ `\''
105--      comment     -> `--' (any \\ `\n')* `\n'
106--
107--    Control characters, white space, and comments are discarded in the
108--    binding-hook lexer.  Nested comments are not allowed in a binding hook.
109--    Identifiers can be enclosed in single quotes to avoid collision with
110--    C->Haskell keywords, and in this case quoted part could also be followed by
111--    what is assumed to be a valid Haskell code, which would be transferred in the
112--    output file verbatim.
113--
114--  * In the binding-hook lexer, the lexeme `#}' transfers control back to the
115--    base lexer.  An occurence of the lexeme `{#' inside the binding-hook
116--    lexer triggers an error.  The symbol `{#' is not explcitly represented
117--    in the resulting token stream.  However, the occurrence of a token
118--    representing one of the reserved identifiers `call', `context', `enum',
119--    and `field' marks the start of a binding hook.  Strictly speaking, `#}'
120--    need also not occur in the token stream, as the next `haskell' token
121--    marks a hook's end.  It is, however, useful for producing accurate error
122--    messages (in case an hook is closed to early) to have a token
123--    representing `#}'.
124--
125--  * The rule `ident' describes Haskell identifiers, but without
126--    distinguishing between variable and constructor identifers (ie, those
127--    starting with a lowercase and those starting with an uppercase letter).
128--    However, we use it also to scan C identifiers; although, strictly
129--    speaking, it is too general for them.  In the case of C identifiers,
130--    this should not have any impact on the range of descriptions accepted by
131--    the tool, as illegal identifier will never occur in a C header file that
132--    is accepted by the C lexer.  In the case of Haskell identifiers, a
133--    confusion between variable and constructor identifiers will be noted by
134--    the Haskell compiler translating the code generated by c2hs.  Moreover,
135--    identifiers that are inside quoted parts (see above) may not contain apostrophes.
136--
137--  * Any line starting with the character `#' is regarded to be a C
138--    preprocessor directive.  With the exception of `#c' and `#endc', which
139--    delimit a set of lines containing inline C code.  Hence, in the base
140--    lexer, the lexeme `#c' triggers a meta action transferring control to the
141--    following inline-C lexer:
142--
143--      c  -> inline* \\ `\n#endc'
144--
145--    We do neither treat C strings nor C comments specially.  Hence, if the
146--    string "\n#endc" occurs in a comment, we will mistakenly regard it as
147--    the end of the inline C code.  Note that the problem cannot happen with
148--    strings, as C does not permit strings that extend over multiple lines.
149--    At the moment, it just seems not to be worth the effort required to
150--    treat this situation more accurately.
151--
152--    The inline-C lexer also doesn't handle pre-processor directives
153--    specially.  Hence, structural pre-processor directives (namely,
154--    conditionals) may occur within inline-C code only properly nested.
155--
156--  Shortcomings
157--  ~~~~~~~~~~~~
158--  Some lexemes that include single and double quote characters are not lexed
159--  correctly.  See the implementation comment at `haskell' for details.
160--
161--
162--- TODO ----------------------------------------------------------------------
163--
164--  * In `haskell', the case of a single `"' (without a matching second one)
165--    is caught by an eplicit error raising rule.  This shouldn't be
166--    necessary, but for some strange reason, the lexer otherwise hangs when a
167--    single `"' appears in the input.
168--
169--  * Comments in the "gap" of a string are not yet supported.
170--
171
172module C2HS.CHS.Lexer (CHSToken(..), lexCHS, keywordToIdent)
173where
174
175import Data.List     ((\\))
176import Data.Char     (isDigit, isSpace)
177
178import Language.C.Data.Ident
179import Language.C.Data.Name
180import Language.C.Data.Position
181
182import Data.Errors    (ErrorLevel(..), makeError)
183import Text.Lexers    (Regexp, Lexer, Action, epsilon, char, (+>), lexaction,
184                  lexactionErr, lexmeta, (>|<), (>||<), ctrlLexer, star, plus,
185                  alt, string, execLexer)
186import Control.State (getNameSupply, setNameSupply)
187import C2HS.State (CST, raise, raiseError)
188
189
190-- token definition
191-- ----------------
192
193-- | possible tokens
194--
195data CHSToken = CHSTokArrow   Position          -- `->'
196              | CHSTokDArrow  Position          -- `=>'
197              | CHSTokDot     Position          -- `.'
198              | CHSTokComma   Position          -- `,'
199              | CHSTokEqual   Position          -- `='
200              | CHSTokMinus   Position          -- `-'
201              | CHSTokStar    Position          -- `*'
202              | CHSTokAmp     Position          -- `&'
203              | CHSTokHat     Position          -- `^'
204              | CHSTokPercent Position          -- `%'
205              | CHSTokPlus    Position          -- `+'
206              | CHSTokPlusS   Position          -- `+S'
207              | CHSTokPlusNum Position Int      -- `+<num>'
208              | CHSTokLBrace  Position          -- `{'
209              | CHSTokRBrace  Position          -- `}'
210              | CHSTokLParen  Position          -- `('
211              | CHSTokRParen  Position          -- `)'
212              | CHSTokLBrack  Position          -- `['
213              | CHSTokRBrack  Position          -- `]'
214              | CHSTokHook    Position          -- `{#'
215              | CHSTokEndHook Position          -- `#}'
216              | CHSTokAdd     Position          -- `add'
217              | CHSTokAs      Position          -- `as'
218              | CHSTokCall    Position          -- `call'
219              | CHSTokClass   Position          -- `class'
220              | CHSTokConst   Position          -- `const'
221              | CHSTokContext Position          -- `context'
222              | CHSTokNonGNU  Position          -- `nonGNU'
223              | CHSTokTypedef Position          -- `typedef'
224              | CHSTokDefault Position          -- `default'
225              | CHSTokDerive  Position          -- `deriving'
226              | CHSTokDown    Position          -- `downcaseFirstLetter'
227              | CHSTokEnum    Position          -- `enum'
228              | CHSTokFinal   Position          -- `finalizer'
229              | CHSTokForeign Position          -- `foreign'
230              | CHSTokFun     Position          -- `fun'
231              | CHSTokGet     Position          -- `get'
232              | CHSTokImport  Position          -- `import'
233              | CHSTokIntr    Position          -- `interruptible'
234              | CHSTokLib     Position          -- `lib'
235              | CHSTokNewtype Position          -- `newtype'
236              | CHSTokNocode  Position          -- `nocode'
237              | CHSTokOffsetof Position         -- `offsetof'
238              | CHSTokOmit    Position          -- `omit'
239              | CHSTokPointer Position          -- `pointer'
240              | CHSTokPrefix  Position          -- `prefix'
241              | CHSTokPure    Position          -- `pure'
242              | CHSTokQualif  Position          -- `qualified'
243              | CHSTokSet     Position          -- `set'
244              | CHSTokSizeof  Position          -- `sizeof'
245              | CHSTokAlignof Position          -- `alignof'
246              | CHSTokStable  Position          -- `stable'
247              | CHSTokStruct  Position          -- `struct'
248              | CHSTokType    Position          -- `type'
249              | CHSTok_2Case  Position          -- `underscoreToCase'
250              | CHSTokUnsafe  Position          -- `unsafe'
251              | CHSTokUpper   Position          -- `upcaseFirstLetter'
252              | CHSTokVariadic Position          -- `variadic'
253              | CHSTokWith    Position Ident    -- `with'
254              | CHSTokIn      Position          -- `in'
255              | CHSTokOut     Position          -- `out'
256              | CHSTokString  Position String   -- string
257              | CHSTokHSVerb  Position String   -- verbatim Haskell (`...')
258              | CHSTokHSQuot  Position String   -- quoted Haskell ('...')
259              | CHSTokIdent   Position Ident    -- identifier
260              | CHSTokHaskell Position String   -- verbatim Haskell code
261              | CHSTokCPP     Position String Bool -- pre-processor directive
262              | CHSTokLine    Position          -- line pragma
263              | CHSTokC       Position String   -- verbatim C code
264              | CHSTokCtrl    Position Char     -- control code
265              | CHSTokComment Position String   -- comment
266              | CHSTokCIdentTail Position Ident -- C identifier without prefix
267              | CHSTokCArg Position String      -- C type argument
268
269instance Pos CHSToken where
270  posOf (CHSTokArrow   pos  ) = pos
271  posOf (CHSTokDArrow  pos  ) = pos
272  posOf (CHSTokDot     pos  ) = pos
273  posOf (CHSTokComma   pos  ) = pos
274  posOf (CHSTokEqual   pos  ) = pos
275  posOf (CHSTokMinus   pos  ) = pos
276  posOf (CHSTokStar    pos  ) = pos
277  posOf (CHSTokAmp     pos  ) = pos
278  posOf (CHSTokHat     pos  ) = pos
279  posOf (CHSTokLBrace  pos  ) = pos
280  posOf (CHSTokRBrace  pos  ) = pos
281  posOf (CHSTokLParen  pos  ) = pos
282  posOf (CHSTokRParen  pos  ) = pos
283  posOf (CHSTokLBrack  pos  ) = pos
284  posOf (CHSTokRBrack  pos  ) = pos
285  posOf (CHSTokHook    pos  ) = pos
286  posOf (CHSTokEndHook pos  ) = pos
287  posOf (CHSTokAdd     pos  ) = pos
288  posOf (CHSTokAs      pos  ) = pos
289  posOf (CHSTokCall    pos  ) = pos
290  posOf (CHSTokClass   pos  ) = pos
291  posOf (CHSTokConst   pos  ) = pos
292  posOf (CHSTokContext pos  ) = pos
293  posOf (CHSTokNonGNU  pos  ) = pos
294  posOf (CHSTokDerive  pos  ) = pos
295  posOf (CHSTokTypedef pos  ) = pos
296  posOf (CHSTokDefault pos  ) = pos
297  posOf (CHSTokDown    pos  ) = pos
298  posOf (CHSTokEnum    pos  ) = pos
299  posOf (CHSTokFinal   pos  ) = pos
300  posOf (CHSTokForeign pos  ) = pos
301  posOf (CHSTokFun     pos  ) = pos
302  posOf (CHSTokGet     pos  ) = pos
303  posOf (CHSTokImport  pos  ) = pos
304  posOf (CHSTokLib     pos  ) = pos
305  posOf (CHSTokNewtype pos  ) = pos
306  posOf (CHSTokNocode  pos  ) = pos
307  posOf (CHSTokOffsetof pos ) = pos
308  posOf (CHSTokOmit    pos  ) = pos
309  posOf (CHSTokPointer pos  ) = pos
310  posOf (CHSTokPrefix  pos  ) = pos
311  posOf (CHSTokPure    pos  ) = pos
312  posOf (CHSTokQualif  pos  ) = pos
313  posOf (CHSTokSet     pos  ) = pos
314  posOf (CHSTokSizeof  pos  ) = pos
315  posOf (CHSTokAlignof pos  ) = pos
316  posOf (CHSTokStable  pos  ) = pos
317  posOf (CHSTokStruct  pos  ) = pos
318  posOf (CHSTokType    pos  ) = pos
319  posOf (CHSTok_2Case  pos  ) = pos
320  posOf (CHSTokUnsafe  pos  ) = pos
321  posOf (CHSTokUpper   pos  ) = pos
322  posOf (CHSTokVariadic pos  ) = pos
323  posOf (CHSTokWith    pos _) = pos
324  posOf (CHSTokIn      pos  ) = pos
325  posOf (CHSTokOut     pos  ) = pos
326  posOf (CHSTokString  pos _) = pos
327  posOf (CHSTokHSVerb  pos _) = pos
328  posOf (CHSTokHSQuot  pos _) = pos
329  posOf (CHSTokIdent   pos _) = pos
330  posOf (CHSTokHaskell pos _) = pos
331  posOf (CHSTokCPP     pos _ _) = pos
332  posOf (CHSTokLine    pos  ) = pos
333  posOf (CHSTokC       pos _) = pos
334  posOf (CHSTokCtrl    pos _) = pos
335  posOf (CHSTokComment pos _) = pos
336  posOf (CHSTokCIdentTail pos _) = pos
337  posOf (CHSTokCArg    pos _) = pos
338  posOf (CHSTokPercent pos) = pos
339
340instance Eq CHSToken where
341  (CHSTokArrow    _  ) == (CHSTokArrow    _  ) = True
342  (CHSTokDArrow   _  ) == (CHSTokDArrow   _  ) = True
343  (CHSTokDot      _  ) == (CHSTokDot      _  ) = True
344  (CHSTokComma    _  ) == (CHSTokComma    _  ) = True
345  (CHSTokEqual    _  ) == (CHSTokEqual    _  ) = True
346  (CHSTokMinus    _  ) == (CHSTokMinus    _  ) = True
347  (CHSTokStar     _  ) == (CHSTokStar     _  ) = True
348  (CHSTokAmp      _  ) == (CHSTokAmp      _  ) = True
349  (CHSTokHat      _  ) == (CHSTokHat      _  ) = True
350  (CHSTokLBrace   _  ) == (CHSTokLBrace   _  ) = True
351  (CHSTokRBrace   _  ) == (CHSTokRBrace   _  ) = True
352  (CHSTokLParen   _  ) == (CHSTokLParen   _  ) = True
353  (CHSTokRParen   _  ) == (CHSTokRParen   _  ) = True
354  (CHSTokLBrack   _  ) == (CHSTokLBrack   _  ) = True
355  (CHSTokRBrack   _  ) == (CHSTokRBrack   _  ) = True
356  (CHSTokHook     _  ) == (CHSTokHook     _  ) = True
357  (CHSTokEndHook  _  ) == (CHSTokEndHook  _  ) = True
358  (CHSTokAdd      _  ) == (CHSTokAdd      _  ) = True
359  (CHSTokAs       _  ) == (CHSTokAs       _  ) = True
360  (CHSTokCall     _  ) == (CHSTokCall     _  ) = True
361  (CHSTokClass    _  ) == (CHSTokClass    _  ) = True
362  (CHSTokConst    _  ) == (CHSTokConst    _  ) = True
363  (CHSTokContext  _  ) == (CHSTokContext  _  ) = True
364  (CHSTokNonGNU   _  ) == (CHSTokNonGNU   _  ) = True
365  (CHSTokTypedef  _  ) == (CHSTokTypedef  _  ) = True
366  (CHSTokDefault  _  ) == (CHSTokDefault  _  ) = True
367  (CHSTokDerive   _  ) == (CHSTokDerive   _  ) = True
368  (CHSTokDown     _  ) == (CHSTokDown     _  ) = True
369  (CHSTokEnum     _  ) == (CHSTokEnum     _  ) = True
370  (CHSTokFinal    _  ) == (CHSTokFinal    _  ) = True
371  (CHSTokForeign  _  ) == (CHSTokForeign  _  ) = True
372  (CHSTokFun      _  ) == (CHSTokFun      _  ) = True
373  (CHSTokGet      _  ) == (CHSTokGet      _  ) = True
374  (CHSTokImport   _  ) == (CHSTokImport   _  ) = True
375  (CHSTokLib      _  ) == (CHSTokLib      _  ) = True
376  (CHSTokNewtype  _  ) == (CHSTokNewtype  _  ) = True
377  (CHSTokNocode   _  ) == (CHSTokNocode   _  ) = True
378  (CHSTokOffsetof _  ) == (CHSTokOffsetof _  ) = True
379  (CHSTokOmit     _  ) == (CHSTokOmit     _  ) = True
380  (CHSTokPointer  _  ) == (CHSTokPointer  _  ) = True
381  (CHSTokPrefix   _  ) == (CHSTokPrefix   _  ) = True
382  (CHSTokPure     _  ) == (CHSTokPure     _  ) = True
383  (CHSTokQualif   _  ) == (CHSTokQualif   _  ) = True
384  (CHSTokSet      _  ) == (CHSTokSet      _  ) = True
385  (CHSTokSizeof   _  ) == (CHSTokSizeof   _  ) = True
386  (CHSTokAlignof  _  ) == (CHSTokAlignof  _  ) = True
387  (CHSTokStable   _  ) == (CHSTokStable   _  ) = True
388  (CHSTokStruct   _  ) == (CHSTokStruct   _  ) = True
389  (CHSTokType     _  ) == (CHSTokType     _  ) = True
390  (CHSTok_2Case   _  ) == (CHSTok_2Case   _  ) = True
391  (CHSTokUnsafe   _  ) == (CHSTokUnsafe   _  ) = True
392  (CHSTokUpper    _  ) == (CHSTokUpper    _  ) = True
393  (CHSTokVariadic _  ) == (CHSTokVariadic _  ) = True
394  (CHSTokWith     _ _) == (CHSTokWith     _ _) = True
395  (CHSTokIn       _  ) == (CHSTokIn       _  ) = True
396  (CHSTokOut      _  ) == (CHSTokOut      _  ) = True
397  (CHSTokString   _ _) == (CHSTokString   _ _) = True
398  (CHSTokHSVerb   _ _) == (CHSTokHSVerb   _ _) = True
399  (CHSTokHSQuot   _ _) == (CHSTokHSQuot   _ _) = True
400  (CHSTokIdent    _ _) == (CHSTokIdent    _ _) = True
401  (CHSTokHaskell  _ _) == (CHSTokHaskell  _ _) = True
402  (CHSTokCPP    _ _ _) == (CHSTokCPP    _ _ _) = True
403  (CHSTokLine     _  ) == (CHSTokLine     _  ) = True
404  (CHSTokC        _ _) == (CHSTokC        _ _) = True
405  (CHSTokCtrl     _ _) == (CHSTokCtrl     _ _) = True
406  (CHSTokComment  _ _) == (CHSTokComment  _ _) = True
407  (CHSTokCIdentTail _ _) == (CHSTokCIdentTail _ _) = True
408  (CHSTokCArg     _ _) == (CHSTokCArg     _ _) = True
409  _                    == _                    = False
410
411instance Show CHSToken where
412  showsPrec _ (CHSTokArrow   _  ) = showString "->"
413  showsPrec _ (CHSTokDArrow  _  ) = showString "=>"
414  showsPrec _ (CHSTokDot     _  ) = showString "."
415  showsPrec _ (CHSTokComma   _  ) = showString ","
416  showsPrec _ (CHSTokEqual   _  ) = showString "="
417  showsPrec _ (CHSTokMinus   _  ) = showString "-"
418  showsPrec _ (CHSTokStar    _  ) = showString "*"
419  showsPrec _ (CHSTokAmp     _  ) = showString "&"
420  showsPrec _ (CHSTokHat     _  ) = showString "^"
421  showsPrec _ (CHSTokPercent _  ) = showString "%"
422  showsPrec _ (CHSTokPlus    _  ) = showString "+"
423  showsPrec _ (CHSTokPlusS   _  ) = showString "+S"
424  showsPrec _ (CHSTokPlusNum _  sz) = showString ("+" ++ show sz)
425  showsPrec _ (CHSTokLBrace  _  ) = showString "{"
426  showsPrec _ (CHSTokRBrace  _  ) = showString "}"
427  showsPrec _ (CHSTokLParen  _  ) = showString "("
428  showsPrec _ (CHSTokRParen  _  ) = showString ")"
429  showsPrec _ (CHSTokLBrack  _  ) = showString "["
430  showsPrec _ (CHSTokRBrack  _  ) = showString "]"
431  showsPrec _ (CHSTokHook    _  ) = showString "{#"
432  showsPrec _ (CHSTokEndHook _  ) = showString "#}"
433  showsPrec _ (CHSTokAdd     _  ) = showString "add"
434  showsPrec _ (CHSTokAs      _  ) = showString "as"
435  showsPrec _ (CHSTokCall    _  ) = showString "call"
436  showsPrec _ (CHSTokClass   _  ) = showString "class"
437  showsPrec _ (CHSTokConst   _  ) = showString "const"
438  showsPrec _ (CHSTokContext _  ) = showString "context"
439  showsPrec _ (CHSTokNonGNU  _  ) = showString "nonGNU"
440  showsPrec _ (CHSTokTypedef _  ) = showString "typedef"
441  showsPrec _ (CHSTokDefault _  ) = showString "default"
442  showsPrec _ (CHSTokDerive  _  ) = showString "deriving"
443  showsPrec _ (CHSTokDown    _  ) = showString "downcaseFirstLetter"
444  showsPrec _ (CHSTokEnum    _  ) = showString "enum"
445  showsPrec _ (CHSTokFinal   _  ) = showString "finalizer"
446  showsPrec _ (CHSTokForeign _  ) = showString "foreign"
447  showsPrec _ (CHSTokFun     _  ) = showString "fun"
448  showsPrec _ (CHSTokGet     _  ) = showString "get"
449  showsPrec _ (CHSTokImport  _  ) = showString "import"
450  showsPrec _ (CHSTokIntr    _  ) = showString "interruptible"
451  showsPrec _ (CHSTokLib     _  ) = showString "lib"
452  showsPrec _ (CHSTokNewtype _  ) = showString "newtype"
453  showsPrec _ (CHSTokNocode  _  ) = showString "nocode"
454  showsPrec _ (CHSTokOffsetof _ ) = showString "offsetof"
455  showsPrec _ (CHSTokOmit    _  ) = showString "omit"
456  showsPrec _ (CHSTokPointer _  ) = showString "pointer"
457  showsPrec _ (CHSTokPrefix  _  ) = showString "prefix"
458  showsPrec _ (CHSTokPure    _  ) = showString "pure"
459  showsPrec _ (CHSTokQualif  _  ) = showString "qualified"
460  showsPrec _ (CHSTokSet     _  ) = showString "set"
461  showsPrec _ (CHSTokSizeof  _  ) = showString "sizeof"
462  showsPrec _ (CHSTokAlignof _  ) = showString "alignof"
463  showsPrec _ (CHSTokStable  _  ) = showString "stable"
464  showsPrec _ (CHSTokStruct  _  ) = showString "struct"
465  showsPrec _ (CHSTokType    _  ) = showString "type"
466  showsPrec _ (CHSTok_2Case  _  ) = showString "underscoreToCase"
467  showsPrec _ (CHSTokUnsafe  _  ) = showString "unsafe"
468  showsPrec _ (CHSTokUpper   _  ) = showString "upcaseFirstLetter"
469  showsPrec _ (CHSTokVariadic _  ) = showString "variadic"
470  showsPrec _ (CHSTokWith    _ _) = showString "with"
471  showsPrec _ (CHSTokIn      _  ) = showString "in"
472  showsPrec _ (CHSTokOut     _  ) = showString "out"
473  showsPrec _ (CHSTokString  _ s) = showString ("\"" ++ s ++ "\"")
474  showsPrec _ (CHSTokHSVerb  _ s) = showString ("`" ++ s ++ "'")
475  showsPrec _ (CHSTokHSQuot  _ s) = showString ("'" ++ s ++ "'")
476  showsPrec _ (CHSTokIdent   _ i) = (showString . identToString) i
477  showsPrec _ (CHSTokHaskell _ s) = showString s
478  showsPrec _ (CHSTokCPP  _ s nl) = showString (if nl then "\n" else "" ++ s)
479  showsPrec _ (CHSTokLine    _  ) = id            --TODO show line num?
480  showsPrec _ (CHSTokC       _ s) = showString s
481  showsPrec _ (CHSTokCtrl    _ c) = showChar c
482  showsPrec _ (CHSTokComment _ s) = showString (if null s
483                                                then ""
484                                                else " --" ++ s ++ "\n")
485  showsPrec _ (CHSTokCIdentTail _ i) = (showString . identToString) i
486  showsPrec _ (CHSTokCArg    _ s) = showString s
487
488-- lexer state
489-- -----------
490
491-- | state threaded through the lexer
492--
493data CHSLexerState = CHSLS {
494                       nestLvl :: Int,   -- nesting depth of nested comments
495                       inHook  :: Bool,  -- within a binding hook?
496                       namesup :: [Name] -- supply of unique names
497                     }
498
499-- | initial state
500--
501initialState :: [Name] -> CST s CHSLexerState
502initialState nameSupply =
503  do
504    return CHSLS {
505                         nestLvl = 0,
506                         inHook  = False,
507                         namesup = nameSupply
508                }
509
510-- | raise an error if the given state is not a final state
511--
512assertFinalState :: Position -> CHSLexerState -> CST s ()
513assertFinalState pos CHSLS {nestLvl = nestLvl', inHook = inHook'}
514  | nestLvl' > 0 = raiseError pos ["Unexpected end of file!",
515                                   "Unclosed nested comment."]
516  | inHook'      = raiseError pos ["Unexpected end of file!",
517                                   "Unclosed binding hook."]
518  | otherwise    = return ()
519
520-- | lexer and action type used throughout this specification
521--
522type CHSLexer  = Lexer  CHSLexerState CHSToken
523type CHSAction = Action               CHSToken
524type CHSRegexp = Regexp CHSLexerState CHSToken
525
526-- | for actions that need a new unique name
527--
528infixl 3 `lexactionName`
529lexactionName :: CHSRegexp
530              -> (String -> Position -> Name -> CHSToken)
531              -> CHSLexer
532re `lexactionName` action = re `lexmeta` action'
533  where
534    action' str pos state = let name:ns = namesup state
535                            in
536                            (Just $ Right (action str pos name),
537                             incPos pos (length str),
538                             state {namesup = ns},
539                             Nothing)
540
541
542-- lexical specification
543-- ---------------------
544
545-- | the lexical definition of the tokens (the base lexer)
546--
547chslexer :: CHSLexer
548chslexer  =      haskell        -- Haskell code
549            >||< nested         -- nested comments
550            >||< ctrl           -- control code (that has to be preserved)
551            >||< hook           -- start of a binding hook
552            >||< cpp            -- a pre-processor directive (or `#c')
553            >||< startmarker    -- marks beginning of input
554
555-- | stream of Haskell code (terminated by a control character or binding hook)
556--
557haskell :: CHSLexer
558--
559-- NB: We need to make sure that '"' is not regarded as the beginning of a
560--     string; however, we cannot really lex character literals properly
561--     without lexing identifiers (as the latter may containing single quotes
562--     as part of their lexeme).  Thus, we special case '"'.  This is still a
563--     kludge, as a program fragment, such as
564--
565--       foo'"'strange string"
566--
567--     will not be handled correctly.
568--
569haskell  = (    anyButSpecial`star` epsilon
570            >|< specialButQuotes
571            >|< char '"'  +> inhstr`star` char '"'
572            >|< string "'\"'"                           -- special case of "
573            >|< string "--" +> anyButNL`star` epsilon   -- comment
574           )
575           `lexaction` copyVerbatim
576           >||< char '"'                                -- this is a bad kludge
577                `lexactionErr`
578                  \_ pos -> (Left $ makeError LevelError pos
579                                              ["Lexical error!",
580                                              "Unclosed string."])
581           where
582             anyButSpecial    = alt (inlineSet \\ specialSet)
583             specialButQuotes = alt (specialSet \\ ['"'])
584             anyButNL         = alt (anySet \\ ['\n'])
585             inhstr           = instr >|< char '\\' >|< string "\\\"" >|< gap
586             gap              = char '\\' +> alt (' ':ctrlSet)`plus` char '\\'
587
588-- | action copying the input verbatim to `CHSTokHaskell' tokens
589--
590copyVerbatim        :: CHSAction
591copyVerbatim cs pos  = Just $ CHSTokHaskell pos cs
592
593-- | nested comments
594--
595nested :: CHSLexer
596nested  =
597       string "{-"              {- for Haskell emacs mode :-( -}
598       `lexmeta` enterComment
599  >||<
600       string "-}"
601       `lexmeta` leaveComment
602  where
603    enterComment cs pos s =
604      (copyVerbatim' cs pos,                    -- collect the lexeme
605       incPos pos 2,                            -- advance current position
606       s {nestLvl = nestLvl s + 1},             -- increase nesting level
607       Just $ inNestedComment)                  -- continue in comment lexer
608    --
609    leaveComment cs pos s =
610      case nestLvl s of
611        0 -> (commentCloseErr pos,              -- 0: -} outside comment => err
612              incPos pos 2,                     -- advance current position
613              s,
614              Nothing)
615        1 -> (copyVerbatim' cs pos,             -- collect the lexeme
616              incPos pos 2,                     -- advance current position
617              s {nestLvl = nestLvl s - 1},      -- decrease nesting level
618              Just chslexer)                    -- 1: continue with root lexer
619        _ -> (copyVerbatim' cs pos,             -- collect the lexeme
620              incPos pos 2,                     -- advance current position
621              s {nestLvl = nestLvl s - 1},      -- decrease nesting level
622              Nothing)                          -- _: cont with comment lexer
623    --
624    copyVerbatim' cs pos  = Just $ Right (CHSTokHaskell pos cs)
625    --
626    commentCloseErr pos =
627      Just $ Left (makeError LevelError pos
628                             ["Lexical error!",
629                             "`-}' not preceded by a matching `{-'."])
630                             {- for Haskell emacs mode :-( -}
631
632
633-- | lexer processing the inner of a comment
634--
635inNestedComment :: CHSLexer
636inNestedComment  =      commentInterior         -- inside a comment
637                   >||< nested                  -- nested comments
638                   >||< ctrl                    -- control code (preserved)
639
640-- | standard characters in a nested comment
641--
642commentInterior :: CHSLexer
643commentInterior  = (    anyButSpecial`star` epsilon
644                    >|< special
645                   )
646                   `lexaction` copyVerbatim
647                   where
648                     anyButSpecial = alt (inlineSet \\ commentSpecialSet)
649                     special       = alt commentSpecialSet
650
651-- | control code in the base lexer (is turned into a token)
652--
653-- * this covers exactly the same set of characters as contained in `ctrlSet'
654--   and `Lexers.ctrlLexer' and advances positions also like the `ctrlLexer'
655--
656ctrl :: CHSLexer
657ctrl  =
658       char '\n' `lexmeta` newline
659  >||< char '\r' `lexmeta` newline
660  >||< char '\v' `lexmeta` newline
661  >||< char '\f' `lexmeta` formfeed
662  >||< char '\t' `lexmeta` tab
663  where
664    newline  [c] pos = ctrlResult pos c (retPos pos)
665    formfeed [c] pos = ctrlResult pos c (incPos pos 1)
666    tab      [c] pos = ctrlResult pos c (incPos pos 8)
667
668    ctrlResult pos c pos' s =
669      (Just $ Right (CHSTokCtrl pos c), pos', s, Nothing)
670
671-- | start of a binding hook (ie, enter the binding hook lexer)
672--
673hook :: CHSLexer
674hook  = string "{#"
675        `lexmeta` \_ pos s -> (Just $ Right (CHSTokHook pos),
676                               incPos pos 2, s, Just bhLexer)
677
678-- | start marker: used to identify pre-processor directive at
679-- beginning of input -- this lexer just drops the start marker if it
680-- hasn't been used to handle a pre-processor directive
681--
682startmarker :: CHSLexer
683startmarker = char '\000' `lexmeta`
684              \_ pos s -> (Nothing, incPos pos 1, s, Just chslexer)
685
686-- | pre-processor directives and `#c'
687--
688-- * we lex `#c' as a directive and special case it in the action
689--
690-- * we lex C line number pragmas and special case it in the action
691--
692cpp :: CHSLexer
693cpp = directive
694      where
695        directive =
696          --(string "\n#" >|< string "\0#") +>
697          alt "\n\0" +> alt " \t" `star` string "#" +>
698          alt ('\t':inlineSet)`star` epsilon
699          `lexmeta`
700             \t@(ld:spdir) pos s ->      -- strip off the "\n" or "\0"
701             let dir = drop 1 $ dropWhile (`elem` " \t") spdir
702             in case dir of
703                 ['c']                      ->          -- #c
704                   (Nothing, incPos pos (length t), s, Just cLexer)
705                 -- a #c may be followed by whitespace
706                 'c':sp:_ | sp `elem` " \t" ->          -- #c
707                   (Nothing, incPos pos (length t), s, Just cLexer)
708                 ' ':line@(n:_) | isDigit n ->                 -- C line pragma
709                   let pos' = adjustPosByCLinePragma line pos
710                    in (Just $ Right (CHSTokLine pos'), pos', s, Nothing)
711                 _                            ->        -- CPP directive
712                   (Just $ Right (CHSTokCPP pos dir (ld == '\n')),
713                    if ld == '\n' then retPos pos else incPos pos (length t),
714                    s, Nothing)
715
716adjustPosByCLinePragma :: String -> Position -> Position
717adjustPosByCLinePragma str pos =
718  if isSourcePos pos
719  then position (posOffset pos) fname' row' 1 (posParent pos)
720  else pos
721  where
722    fname           = posFile pos
723    str'            = dropWhite str
724    (rowStr, str'') = span isDigit str'
725    row'            = read rowStr
726    str'''          = dropWhite str''
727    fnameStr        = takeWhile (/= '"') . drop 1 $ str'''
728    fname'          | null str''' || head str''' /= '"' = fname
729                    -- try and get more sharing of file name strings
730                    | fnameStr == fname                 = fname
731                    | otherwise                         = fnameStr
732    --
733    dropWhite = dropWhile (\c -> c == ' ' || c == '\t')
734
735-- | the binding hook lexer
736--
737bhLexer :: CHSLexer
738bhLexer  =      identOrKW
739           >||< symbol
740           >||< strlit
741           >||< hsverb
742           >||< hsquot
743           >||< whitespace
744           >||< arglist
745           >||< endOfHook
746           >||< string "--" +> anyButNL`star` epsilon  -- comment
747                `lexaction` \cs pos -> Just (CHSTokComment pos (drop 2 cs))
748           where
749             anyButNL  = alt (anySet \\ ['\n'])
750             endOfHook = string "#}"
751                         `lexmeta`
752                          \_ pos s -> (Just $ Right (CHSTokEndHook pos),
753                                       incPos pos 2, s, Just chslexer)
754             arglist = string "["
755                       `lexmeta` \_ pos s -> (Just $ Right (CHSTokLBrack pos),
756                                              incPos pos 1, s, Just alLexer)
757
758-- | lexer for C function types for variadic functions
759--
760alLexer :: CHSLexer
761alLexer =      sym ","  CHSTokComma
762          >||< endOfArgList
763          >||< cArg
764  where sym cs con = string cs `lexaction` \_ pos -> Just (con pos)
765        endOfArgList = string "]"
766                       `lexmeta`
767                       \_ pos s -> (Just $ Right (CHSTokRBrack pos),
768                                    incPos pos 1, s, Just bhLexer)
769        cArg = ((alt (anySet \\ ",]")) `star` epsilon) `lexaction`
770               \cs pos -> Just (CHSTokCArg pos $ trim cs)
771        trim = reverse . dropWhile isSpace . reverse . dropWhile isSpace
772
773-- | the inline-C lexer
774--
775cLexer :: CHSLexer
776cLexer =      inlineC                     -- inline C code
777         >||< ctrl                        -- control code (preserved)
778         >||< string "\n#endc"            -- end of inline C code...
779              `lexmeta`                   -- ...preserve '\n' as control token
780              \_ pos s -> (Just $ Right (CHSTokCtrl pos '\n'), retPos pos, s,
781                           Just chslexer)
782         where
783           inlineC = alt inlineSet `lexaction` copyVerbatimC
784           --
785           copyVerbatimC :: CHSAction
786           copyVerbatimC cs pos = Just $ CHSTokC pos cs
787
788-- | whitespace
789--
790-- * horizontal and vertical tabs, newlines, and form feeds are filter out by
791--   `Lexers.ctrlLexer'
792--
793whitespace :: CHSLexer
794whitespace  =      (char ' ' `lexaction` \_ _ -> Nothing)
795              >||< ctrlLexer
796
797-- | identifiers and keywords
798--
799identOrKW :: CHSLexer
800--
801-- the strictness annotations seem to help a bit
802--
803identOrKW  =
804       -- identifier or keyword
805       (letter +> (letter >|< digit >|< char '\'')`star` epsilon
806       `lexactionName` \cs pos name -> (idkwtok $!pos) cs name)
807       >||<
808       (digit +> (letter >|< digit)`star` epsilon
809        `lexactionName` \cs pos name ->
810        CHSTokCIdentTail pos (mkIdent pos cs name))
811  where
812    idkwtok pos "add"              _    = CHSTokAdd     pos
813    idkwtok pos "as"               _    = CHSTokAs      pos
814    idkwtok pos "call"             _    = CHSTokCall    pos
815    idkwtok pos "class"            _    = CHSTokClass   pos
816    idkwtok pos "const"            _    = CHSTokConst   pos
817    idkwtok pos "context"          _    = CHSTokContext pos
818    idkwtok pos "nonGNU"           _    = CHSTokNonGNU  pos
819    idkwtok pos "typedef"          _    = CHSTokTypedef pos
820    idkwtok pos "default"          _    = CHSTokDefault pos
821    idkwtok pos "deriving"         _    = CHSTokDerive  pos
822    idkwtok pos "downcaseFirstLetter" _ = CHSTokDown    pos
823    idkwtok pos "enum"             _    = CHSTokEnum    pos
824    idkwtok pos "finalizer"        _    = CHSTokFinal   pos
825    idkwtok pos "foreign"          _    = CHSTokForeign pos
826    idkwtok pos "fun"              _    = CHSTokFun     pos
827    idkwtok pos "get"              _    = CHSTokGet     pos
828    idkwtok pos "import"           _    = CHSTokImport  pos
829    idkwtok pos "interruptible"    _    = CHSTokIntr    pos
830    idkwtok pos "lib"              _    = CHSTokLib     pos
831    idkwtok pos "newtype"          _    = CHSTokNewtype pos
832    idkwtok pos "nocode"           _    = CHSTokNocode  pos
833    idkwtok pos "offsetof"         _    = CHSTokOffsetof pos
834    idkwtok pos "omit"             _    = CHSTokOmit    pos
835    idkwtok pos "pointer"          _    = CHSTokPointer pos
836    idkwtok pos "prefix"           _    = CHSTokPrefix  pos
837    idkwtok pos "pure"             _    = CHSTokPure    pos
838    idkwtok pos "qualified"        _    = CHSTokQualif  pos
839    idkwtok pos "set"              _    = CHSTokSet     pos
840    idkwtok pos "sizeof"           _    = CHSTokSizeof  pos
841    idkwtok pos "alignof"          _    = CHSTokAlignof pos
842    idkwtok pos "stable"           _    = CHSTokStable  pos
843    idkwtok pos "struct"           _    = CHSTokStruct  pos
844    idkwtok pos "type"             _    = CHSTokType    pos
845    idkwtok pos "underscoreToCase" _    = CHSTok_2Case  pos
846    idkwtok pos "unsafe"           _    = CHSTokUnsafe  pos
847    idkwtok pos "upcaseFirstLetter"_    = CHSTokUpper   pos
848    idkwtok pos "variadic"         _    = CHSTokVariadic pos
849    idkwtok pos "with"             name = mkwith pos name
850    idkwtok pos "in"               _    = CHSTokIn      pos
851    idkwtok pos "out"              _    = CHSTokOut     pos
852    idkwtok pos cs                 name = mkid pos cs name
853    --
854    mkid pos cs name = CHSTokIdent pos (mkIdent pos cs name)
855    mkwith pos name = CHSTokWith pos (mkIdent pos "with" name)
856
857keywordToIdent :: CHSToken -> CHSToken
858keywordToIdent tok =
859  case tok of
860    CHSTokAdd     pos -> mkid pos "add"
861    CHSTokAs      pos -> mkid pos "as"
862    CHSTokCall    pos -> mkid pos "call"
863    CHSTokClass   pos -> mkid pos "class"
864    CHSTokConst   pos -> mkid pos "const"
865    CHSTokContext pos -> mkid pos "context"
866    CHSTokNonGNU  pos -> mkid pos "nonGNU"
867    CHSTokTypedef pos -> mkid pos "typedef"
868    CHSTokDefault pos -> mkid pos "default"
869    CHSTokDerive  pos -> mkid pos "deriving"
870    CHSTokDown    pos -> mkid pos "downcaseFirstLetter"
871    CHSTokEnum    pos -> mkid pos "enum"
872    CHSTokFinal   pos -> mkid pos "finalizer"
873    CHSTokForeign pos -> mkid pos "foreign"
874    CHSTokFun     pos -> mkid pos "fun"
875    CHSTokGet     pos -> mkid pos "get"
876    CHSTokImport  pos -> mkid pos "import"
877    CHSTokIntr    pos -> mkid pos "interruptible"
878    CHSTokLib     pos -> mkid pos "lib"
879    CHSTokNewtype pos -> mkid pos "newtype"
880    CHSTokNocode  pos -> mkid pos "nocode"
881    CHSTokOffsetof pos -> mkid pos "offsetof"
882    CHSTokOmit    pos -> mkid pos "omit"
883    CHSTokPointer pos -> mkid pos "pointer"
884    CHSTokPrefix  pos -> mkid pos "prefix"
885    CHSTokPure    pos -> mkid pos "pure"
886    CHSTokQualif  pos -> mkid pos "qualified"
887    CHSTokSet     pos -> mkid pos "set"
888    CHSTokSizeof  pos -> mkid pos "sizeof"
889    CHSTokAlignof pos -> mkid pos "alignof"
890    CHSTokStable  pos -> mkid pos "stable"
891    CHSTokStruct  pos -> mkid pos "struct"
892    CHSTokType    pos -> mkid pos "type"
893    CHSTok_2Case  pos -> mkid pos "underscoreToCase"
894    CHSTokUnsafe  pos -> mkid pos "unsafe"
895    CHSTokUpper   pos -> mkid pos "upcaseFirstLetter"
896    CHSTokVariadic pos -> mkid pos "variadic"
897    CHSTokWith    pos ide -> CHSTokIdent pos ide
898    CHSTokIn      pos -> mkid pos "in"
899    CHSTokOut     pos -> mkid pos "out"
900    _ -> tok
901    where mkid pos str = CHSTokIdent pos (internalIdent str)
902
903-- | reserved symbols
904--
905symbol :: CHSLexer
906symbol  =      sym "->" CHSTokArrow
907          >||< sym "=>" CHSTokDArrow
908          >||< sym "."  CHSTokDot
909          >||< sym ","  CHSTokComma
910          >||< sym "="  CHSTokEqual
911          >||< sym "-"  CHSTokMinus
912          >||< sym "*"  CHSTokStar
913          >||< sym "&"  CHSTokAmp
914          >||< sym "^"  CHSTokHat
915          >||< sym "%"  CHSTokPercent
916          >||< sym "+"  CHSTokPlus
917          >||< sym "+S" CHSTokPlusS
918          >||< sym_with_num "+" CHSTokPlusNum
919          >||< sym "{"  CHSTokLBrace
920          >||< sym "}"  CHSTokRBrace
921          >||< sym "("  CHSTokLParen
922          >||< sym ")"  CHSTokRParen
923          where
924            sym cs con = string cs `lexaction` \_ pos -> Just (con pos)
925            sym_with_num cs con =
926              string cs +> digit +> digit`star` epsilon
927              `lexaction` \(_:ds) pos -> Just (con pos (read ds))
928
929-- | string
930--
931strlit :: CHSLexer
932strlit  = char '"' +> (instr >|< char '\\')`star` char '"'
933          `lexaction` \cs pos -> Just (CHSTokString pos (init . tail $ cs))
934
935-- | verbatim code
936--
937hsverb :: CHSLexer
938hsverb  = char '`' +> inhsverb`star` char '\''
939          `lexaction` \cs pos -> Just (CHSTokHSVerb pos (init . tail $ cs))
940
941-- | quoted code
942--
943hsquot :: CHSLexer
944hsquot  = char '\'' +> inhsverb`star` char '\''
945          `lexaction` \cs pos -> Just (CHSTokHSQuot pos (init . tail $ cs))
946
947
948-- | regular expressions
949--
950letter, digit, instr, inhsverb :: Regexp s t
951letter    = alt ['a'..'z'] >|< alt ['A'..'Z'] >|< char '_'
952digit     = alt ['0'..'9']
953instr     = alt ([' '..'\255'] \\ "\"\\")
954inhsverb  = alt ([' '..'\127'] \\ "'")
955
956-- | character sets
957--
958anySet, inlineSet, specialSet, commentSpecialSet, ctrlSet :: [Char]
959anySet            = ['\1'..'\255']
960inlineSet         = anySet \\ ctrlSet
961specialSet        = ['{', '-', '"', '\'']
962commentSpecialSet = ['{', '-']
963ctrlSet           = ['\n', '\f', '\r', '\t', '\v']
964
965
966-- main lexing routine
967-- -------------------
968
969-- | generate a token sequence out of a string denoting a CHS file
970--
971-- * the given position is attributed to the first character in the string
972--
973-- * errors are entered into the compiler state
974--
975-- * on a successfull parse, the name supply is updated
976lexCHS        :: String -> Position -> CST s [CHSToken]
977lexCHS cs pos  =
978  do
979    nameSupply <- getNameSupply
980    state <- initialState nameSupply
981    let (ts, lstate, errs) = execLexer chslexer ('\0':cs, pos, state)
982        (_, pos', state')  = lstate
983    mapM_ raise errs
984    assertFinalState pos' state'
985    setNameSupply $ namesup state'
986    return ts
987