1-----------------------------------------------------------------------------
2-- (c) The University of Glasgow, 2006
3--
4-- GHC's lexer for Haskell 2010 [1].
5--
6-- This is a combination of an Alex-generated lexer [2] from a regex
7-- definition, with some hand-coded bits. [3]
8--
9-- Completely accurate information about token-spans within the source
10-- file is maintained.  Every token has a start and end RealSrcLoc
11-- attached to it.
12--
13-- References:
14-- [1] https://www.haskell.org/onlinereport/haskell2010/haskellch2.html
15-- [2] http://www.haskell.org/alex/
16-- [3] https://gitlab.haskell.org/ghc/ghc/wikis/commentary/compiler/parser
17--
18-----------------------------------------------------------------------------
19
20--   ToDo / known bugs:
21--    - parsing integers is a bit slow
22--    - readRational is a bit slow
23--
24--   Known bugs, that were also in the previous version:
25--    - M... should be 3 tokens, not 1.
26--    - pragma-end should be only valid in a pragma
27
28--   qualified operator NOTES.
29--
30--   - If M.(+) is a single lexeme, then..
31--     - Probably (+) should be a single lexeme too, for consistency.
32--       Otherwise ( + ) would be a prefix operator, but M.( + ) would not be.
33--     - But we have to rule out reserved operators, otherwise (..) becomes
34--       a different lexeme.
35--     - Should we therefore also rule out reserved operators in the qualified
36--       form?  This is quite difficult to achieve.  We don't do it for
37--       qualified varids.
38
39
40-- -----------------------------------------------------------------------------
41-- Alex "Haskell code fragment top"
42
43{
44{-# LANGUAGE CPP #-}
45{-# LANGUAGE BangPatterns #-}
46{-# LANGUAGE LambdaCase #-}
47
48{-# OPTIONS_GHC -funbox-strict-fields #-}
49
50module Lexer (
51   Token(..), lexer, pragState, mkPState, mkPStatePure, PState(..),
52   P(..), ParseResult(..), mkParserFlags, mkParserFlags', ParserFlags(..),
53   appendWarning,
54   appendError,
55   allocateComments,
56   MonadP(..),
57   getRealSrcLoc, getPState, withThisPackage,
58   failLocMsgP, srcParseFail,
59   getErrorMessages, getMessages,
60   popContext, pushModuleContext, setLastToken, setSrcLoc,
61   activeContext, nextIsEOF,
62   getLexState, popLexState, pushLexState,
63   ExtBits(..),
64   xtest,
65   lexTokenStream,
66   AddAnn(..),mkParensApiAnn,
67   addAnnsAt,
68   commentToAnnotation
69  ) where
70
71import GhcPrelude
72
73-- base
74import Control.Monad
75import Control.Monad.Fail as MonadFail
76import Data.Bits
77import Data.Char
78import Data.List
79import Data.Maybe
80import Data.Word
81
82import EnumSet (EnumSet)
83import qualified EnumSet
84
85-- ghc-boot
86import qualified GHC.LanguageExtensions as LangExt
87
88-- bytestring
89import Data.ByteString (ByteString)
90
91-- containers
92import Data.Map (Map)
93import qualified Data.Map as Map
94
95-- compiler/utils
96import Bag
97import Outputable
98import StringBuffer
99import FastString
100import UniqFM
101import Util             ( readRational, readHexRational )
102
103-- compiler/main
104import ErrUtils
105import DynFlags
106
107-- compiler/basicTypes
108import SrcLoc
109import Module
110import BasicTypes     ( InlineSpec(..), RuleMatchInfo(..),
111                        IntegralLit(..), FractionalLit(..),
112                        SourceText(..) )
113
114-- compiler/parser
115import Ctype
116
117import ApiAnnotation
118}
119
120-- -----------------------------------------------------------------------------
121-- Alex "Character set macros"
122
123-- NB: The logic behind these definitions is also reflected in basicTypes/Lexeme.hs
124-- Any changes here should likely be reflected there.
125$unispace    = \x05 -- Trick Alex into handling Unicode. See [Unicode in Alex].
126$nl          = [\n\r\f]
127$whitechar   = [$nl\v\ $unispace]
128$white_no_nl = $whitechar # \n -- TODO #8424
129$tab         = \t
130
131$ascdigit  = 0-9
132$unidigit  = \x03 -- Trick Alex into handling Unicode. See [Unicode in Alex].
133$decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
134$digit     = [$ascdigit $unidigit]
135
136$special   = [\(\)\,\;\[\]\`\{\}]
137$ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~\:]
138$unisymbol = \x04 -- Trick Alex into handling Unicode. See [Unicode in Alex].
139$symbol    = [$ascsymbol $unisymbol] # [$special \_\"\']
140
141$unilarge  = \x01 -- Trick Alex into handling Unicode. See [Unicode in Alex].
142$asclarge  = [A-Z]
143$large     = [$asclarge $unilarge]
144
145$unismall  = \x02 -- Trick Alex into handling Unicode. See [Unicode in Alex].
146$ascsmall  = [a-z]
147$small     = [$ascsmall $unismall \_]
148
149$unigraphic = \x06 -- Trick Alex into handling Unicode. See [Unicode in Alex].
150$graphic   = [$small $large $symbol $digit $special $unigraphic \"\']
151
152$binit     = 0-1
153$octit     = 0-7
154$hexit     = [$decdigit A-F a-f]
155
156$uniidchar = \x07 -- Trick Alex into handling Unicode. See [Unicode in Alex].
157$idchar    = [$small $large $digit $uniidchar \']
158
159$pragmachar = [$small $large $digit]
160
161$docsym    = [\| \^ \* \$]
162
163
164-- -----------------------------------------------------------------------------
165-- Alex "Regular expression macros"
166
167@varid     = $small $idchar*          -- variable identifiers
168@conid     = $large $idchar*          -- constructor identifiers
169
170@varsym    = ($symbol # \:) $symbol*  -- variable (operator) symbol
171@consym    = \: $symbol*              -- constructor (operator) symbol
172
173-- See Note [Lexing NumericUnderscores extension] and #14473
174@numspc       = _*                   -- numeric spacer (#14473)
175@decimal      = $decdigit(@numspc $decdigit)*
176@binary       = $binit(@numspc $binit)*
177@octal        = $octit(@numspc $octit)*
178@hexadecimal  = $hexit(@numspc $hexit)*
179@exponent     = @numspc [eE] [\-\+]? @decimal
180@bin_exponent = @numspc [pP] [\-\+]? @decimal
181
182@qual = (@conid \.)+
183@qvarid = @qual @varid
184@qconid = @qual @conid
185@qvarsym = @qual @varsym
186@qconsym = @qual @consym
187
188@floating_point = @numspc @decimal \. @decimal @exponent? | @numspc @decimal @exponent
189@hex_floating_point = @numspc @hexadecimal \. @hexadecimal @bin_exponent? | @numspc @hexadecimal @bin_exponent
190
191-- normal signed numerical literals can only be explicitly negative,
192-- not explicitly positive (contrast @exponent)
193@negative = \-
194@signed = @negative ?
195
196
197-- -----------------------------------------------------------------------------
198-- Alex "Identifier"
199
200haskell :-
201
202
203-- -----------------------------------------------------------------------------
204-- Alex "Rules"
205
206-- everywhere: skip whitespace
207$white_no_nl+ ;
208$tab          { warnTab }
209
210-- Everywhere: deal with nested comments.  We explicitly rule out
211-- pragmas, "{-#", so that we don't accidentally treat them as comments.
212-- (this can happen even though pragmas will normally take precedence due to
213-- longest-match, because pragmas aren't valid in every state, but comments
214-- are). We also rule out nested Haddock comments, if the -haddock flag is
215-- set.
216
217"{-" / { isNormalComment } { nested_comment lexToken }
218
219-- Single-line comments are a bit tricky.  Haskell 98 says that two or
220-- more dashes followed by a symbol should be parsed as a varsym, so we
221-- have to exclude those.
222
223-- Since Haddock comments aren't valid in every state, we need to rule them
224-- out here.
225
226-- The following two rules match comments that begin with two dashes, but
227-- continue with a different character. The rules test that this character
228-- is not a symbol (in which case we'd have a varsym), and that it's not a
229-- space followed by a Haddock comment symbol (docsym) (in which case we'd
230-- have a Haddock comment). The rules then munch the rest of the line.
231
232"-- " ~$docsym .* { lineCommentToken }
233"--" [^$symbol \ ] .* { lineCommentToken }
234
235-- Next, match Haddock comments if no -haddock flag
236
237"-- " $docsym .* / { alexNotPred (ifExtension HaddockBit) } { lineCommentToken }
238
239-- Now, when we've matched comments that begin with 2 dashes and continue
240-- with a different character, we need to match comments that begin with three
241-- or more dashes (which clearly can't be Haddock comments). We only need to
242-- make sure that the first non-dash character isn't a symbol, and munch the
243-- rest of the line.
244
245"---"\-* ~$symbol .* { lineCommentToken }
246
247-- Since the previous rules all match dashes followed by at least one
248-- character, we also need to match a whole line filled with just dashes.
249
250"--"\-* / { atEOL } { lineCommentToken }
251
252-- We need this rule since none of the other single line comment rules
253-- actually match this case.
254
255"-- " / { atEOL } { lineCommentToken }
256
257-- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
258-- blank lines) until we find a non-whitespace character, then do layout
259-- processing.
260--
261-- One slight wibble here: what if the line begins with {-#? In
262-- theory, we have to lex the pragma to see if it's one we recognise,
263-- and if it is, then we backtrack and do_bol, otherwise we treat it
264-- as a nested comment.  We don't bother with this: if the line begins
265-- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
266<bol> {
267  \n                                    ;
268  ^\# line                              { begin line_prag1 }
269  ^\# / { followedByDigit }             { begin line_prag1 }
270  ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
271  ^\# \! .* \n                          ; -- #!, for scripts
272  ()                                    { do_bol }
273}
274
275-- after a layout keyword (let, where, do, of), we begin a new layout
276-- context if the curly brace is missing.
277-- Careful! This stuff is quite delicate.
278<layout, layout_do, layout_if> {
279  \{ / { notFollowedBy '-' }            { hopefully_open_brace }
280        -- we might encounter {-# here, but {- has been handled already
281  \n                                    ;
282  ^\# (line)?                           { begin line_prag1 }
283}
284
285-- after an 'if', a vertical bar starts a layout context for MultiWayIf
286<layout_if> {
287  \| / { notFollowedBySymbol }          { new_layout_context True dontGenerateSemic ITvbar }
288  ()                                    { pop }
289}
290
291-- do is treated in a subtly different way, see new_layout_context
292<layout>    ()                          { new_layout_context True  generateSemic ITvocurly }
293<layout_do> ()                          { new_layout_context False generateSemic ITvocurly }
294
295-- after a new layout context which was found to be to the left of the
296-- previous context, we have generated a '{' token, and we now need to
297-- generate a matching '}' token.
298<layout_left>  ()                       { do_layout_left }
299
300<0,option_prags> \n                     { begin bol }
301
302"{-#" $whitechar* $pragmachar+ / { known_pragma linePrags }
303                                { dispatch_pragmas linePrags }
304
305-- single-line line pragmas, of the form
306--    # <line> "<file>" <extra-stuff> \n
307<line_prag1> {
308  @decimal $white_no_nl+ \" [$graphic \ ]* \"  { setLineAndFile line_prag1a }
309  ()                                           { failLinePrag1 }
310}
311<line_prag1a> .*                               { popLinePrag1 }
312
313-- Haskell-style line pragmas, of the form
314--    {-# LINE <line> "<file>" #-}
315<line_prag2> {
316  @decimal $white_no_nl+ \" [$graphic \ ]* \"  { setLineAndFile line_prag2a }
317}
318<line_prag2a> "#-}"|"-}"                       { pop }
319   -- NOTE: accept -} at the end of a LINE pragma, for compatibility
320   -- with older versions of GHC which generated these.
321
322-- Haskell-style column pragmas, of the form
323--    {-# COLUMN <column> #-}
324<column_prag> @decimal $whitechar* "#-}" { setColumn }
325
326<0,option_prags> {
327  "{-#" $whitechar* $pragmachar+
328        $whitechar+ $pragmachar+ / { known_pragma twoWordPrags }
329                                 { dispatch_pragmas twoWordPrags }
330
331  "{-#" $whitechar* $pragmachar+ / { known_pragma oneWordPrags }
332                                 { dispatch_pragmas oneWordPrags }
333
334  -- We ignore all these pragmas, but don't generate a warning for them
335  "{-#" $whitechar* $pragmachar+ / { known_pragma ignoredPrags }
336                                 { dispatch_pragmas ignoredPrags }
337
338  -- ToDo: should only be valid inside a pragma:
339  "#-}"                          { endPrag }
340}
341
342<option_prags> {
343  "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
344                                   { dispatch_pragmas fileHeaderPrags }
345}
346
347<0> {
348  -- In the "0" mode we ignore these pragmas
349  "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
350                     { nested_comment lexToken }
351}
352
353<0,option_prags> {
354  "{-#"  { warnThen Opt_WarnUnrecognisedPragmas (text "Unrecognised pragma")
355                    (nested_comment lexToken) }
356}
357
358-- '0' state: ordinary lexemes
359
360-- Haddock comments
361
362<0,option_prags> {
363  "-- " $docsym      / { ifExtension HaddockBit } { multiline_doc_comment }
364  "{-" \ ? $docsym   / { ifExtension HaddockBit } { nested_doc_comment }
365}
366
367-- "special" symbols
368
369<0> {
370  "[|"        / { ifExtension ThQuotesBit } { token (ITopenExpQuote NoE NormalSyntax) }
371  "[||"       / { ifExtension ThQuotesBit } { token (ITopenTExpQuote NoE) }
372  "[e|"       / { ifExtension ThQuotesBit } { token (ITopenExpQuote HasE NormalSyntax) }
373  "[e||"      / { ifExtension ThQuotesBit } { token (ITopenTExpQuote HasE) }
374  "[p|"       / { ifExtension ThQuotesBit } { token ITopenPatQuote }
375  "[d|"       / { ifExtension ThQuotesBit } { layout_token ITopenDecQuote }
376  "[t|"       / { ifExtension ThQuotesBit } { token ITopenTypQuote }
377  "|]"        / { ifExtension ThQuotesBit } { token (ITcloseQuote NormalSyntax) }
378  "||]"       / { ifExtension ThQuotesBit } { token ITcloseTExpQuote }
379  \$ @varid   / { ifExtension ThBit }       { skip_one_varid ITidEscape }
380  "$$" @varid / { ifExtension ThBit }       { skip_two_varid ITidTyEscape }
381  "$("        / { ifExtension ThBit }       { token ITparenEscape }
382  "$$("       / { ifExtension ThBit }       { token ITparenTyEscape }
383
384  "[" @varid "|"  / { ifExtension QqBit }   { lex_quasiquote_tok }
385
386  -- qualified quasi-quote (#5555)
387  "[" @qvarid "|"  / { ifExtension QqBit }  { lex_qquasiquote_tok }
388
389  $unigraphic -- ⟦
390    / { ifCurrentChar '⟦' `alexAndPred`
391        ifExtension UnicodeSyntaxBit `alexAndPred`
392        ifExtension ThQuotesBit }
393    { token (ITopenExpQuote NoE UnicodeSyntax) }
394  $unigraphic -- ⟧
395    / { ifCurrentChar '⟧' `alexAndPred`
396        ifExtension UnicodeSyntaxBit `alexAndPred`
397        ifExtension ThQuotesBit }
398    { token (ITcloseQuote UnicodeSyntax) }
399}
400
401  -- See Note [Lexing type applications]
402<0> {
403    [^ $idchar \) ] ^
404  "@"
405    / { ifExtension TypeApplicationsBit `alexAndPred` notFollowedBySymbol }
406    { token ITtypeApp }
407}
408
409<0> {
410  "(|"
411    / { ifExtension ArrowsBit `alexAndPred`
412        notFollowedBySymbol }
413    { special (IToparenbar NormalSyntax) }
414  "|)"
415    / { ifExtension ArrowsBit }
416    { special (ITcparenbar NormalSyntax) }
417
418  $unigraphic -- ⦇
419    / { ifCurrentChar '⦇' `alexAndPred`
420        ifExtension UnicodeSyntaxBit `alexAndPred`
421        ifExtension ArrowsBit }
422    { special (IToparenbar UnicodeSyntax) }
423  $unigraphic -- ⦈
424    / { ifCurrentChar '⦈' `alexAndPred`
425        ifExtension UnicodeSyntaxBit `alexAndPred`
426        ifExtension ArrowsBit }
427    { special (ITcparenbar UnicodeSyntax) }
428}
429
430<0> {
431  \? @varid / { ifExtension IpBit } { skip_one_varid ITdupipvarid }
432}
433
434<0> {
435  "#" @varid / { ifExtension OverloadedLabelsBit } { skip_one_varid ITlabelvarid }
436}
437
438<0> {
439  "(#" / { ifExtension UnboxedTuplesBit `alexOrPred`
440           ifExtension UnboxedSumsBit }
441         { token IToubxparen }
442  "#)" / { ifExtension UnboxedTuplesBit `alexOrPred`
443           ifExtension UnboxedSumsBit }
444         { token ITcubxparen }
445}
446
447<0,option_prags> {
448  \(                                    { special IToparen }
449  \)                                    { special ITcparen }
450  \[                                    { special ITobrack }
451  \]                                    { special ITcbrack }
452  \,                                    { special ITcomma }
453  \;                                    { special ITsemi }
454  \`                                    { special ITbackquote }
455
456  \{                                    { open_brace }
457  \}                                    { close_brace }
458}
459
460<0,option_prags> {
461  @qvarid                       { idtoken qvarid }
462  @qconid                       { idtoken qconid }
463  @varid                        { varid }
464  @conid                        { idtoken conid }
465}
466
467<0> {
468  @qvarid "#"+      / { ifExtension MagicHashBit } { idtoken qvarid }
469  @qconid "#"+      / { ifExtension MagicHashBit } { idtoken qconid }
470  @varid "#"+       / { ifExtension MagicHashBit } { varid }
471  @conid "#"+       / { ifExtension MagicHashBit } { idtoken conid }
472}
473
474-- ToDo: - move `var` and (sym) into lexical syntax?
475--       - remove backquote from $special?
476<0> {
477  @qvarsym                                         { idtoken qvarsym }
478  @qconsym                                         { idtoken qconsym }
479  @varsym                                          { varsym }
480  @consym                                          { consym }
481}
482
483-- For the normal boxed literals we need to be careful
484-- when trying to be close to Haskell98
485
486-- Note [Lexing NumericUnderscores extension] (#14473)
487--
488-- NumericUnderscores extension allows underscores in numeric literals.
489-- Multiple underscores are represented with @numspc macro.
490-- To be simpler, we have only the definitions with underscores.
491-- And then we have a separate function (tok_integral and tok_frac)
492-- that validates the literals.
493-- If extensions are not enabled, check that there are no underscores.
494--
495<0> {
496  -- Normal integral literals (:: Num a => a, from Integer)
497  @decimal                                                                   { tok_num positive 0 0 decimal }
498  0[bB] @numspc @binary                / { ifExtension BinaryLiteralsBit }   { tok_num positive 2 2 binary }
499  0[oO] @numspc @octal                                                       { tok_num positive 2 2 octal }
500  0[xX] @numspc @hexadecimal                                                 { tok_num positive 2 2 hexadecimal }
501  @negative @decimal                   / { ifExtension NegativeLiteralsBit } { tok_num negative 1 1 decimal }
502  @negative 0[bB] @numspc @binary      / { ifExtension NegativeLiteralsBit `alexAndPred`
503                                           ifExtension BinaryLiteralsBit }   { tok_num negative 3 3 binary }
504  @negative 0[oO] @numspc @octal       / { ifExtension NegativeLiteralsBit } { tok_num negative 3 3 octal }
505  @negative 0[xX] @numspc @hexadecimal / { ifExtension NegativeLiteralsBit } { tok_num negative 3 3 hexadecimal }
506
507  -- Normal rational literals (:: Fractional a => a, from Rational)
508  @floating_point                                                            { tok_frac 0 tok_float }
509  @negative @floating_point            / { ifExtension NegativeLiteralsBit } { tok_frac 0 tok_float }
510  0[xX] @numspc @hex_floating_point    / { ifExtension HexFloatLiteralsBit } { tok_frac 0 tok_hex_float }
511  @negative 0[xX] @numspc @hex_floating_point
512                                       / { ifExtension HexFloatLiteralsBit `alexAndPred`
513                                           ifExtension NegativeLiteralsBit } { tok_frac 0 tok_hex_float }
514}
515
516<0> {
517  -- Unboxed ints (:: Int#) and words (:: Word#)
518  -- It's simpler (and faster?) to give separate cases to the negatives,
519  -- especially considering octal/hexadecimal prefixes.
520  @decimal                          \# / { ifExtension MagicHashBit }        { tok_primint positive 0 1 decimal }
521  0[bB] @numspc @binary             \# / { ifExtension MagicHashBit `alexAndPred`
522                                           ifExtension BinaryLiteralsBit }   { tok_primint positive 2 3 binary }
523  0[oO] @numspc @octal              \# / { ifExtension MagicHashBit }        { tok_primint positive 2 3 octal }
524  0[xX] @numspc @hexadecimal        \# / { ifExtension MagicHashBit }        { tok_primint positive 2 3 hexadecimal }
525  @negative @decimal                \# / { ifExtension MagicHashBit }        { tok_primint negative 1 2 decimal }
526  @negative 0[bB] @numspc @binary   \# / { ifExtension MagicHashBit `alexAndPred`
527                                           ifExtension BinaryLiteralsBit }   { tok_primint negative 3 4 binary }
528  @negative 0[oO] @numspc @octal    \# / { ifExtension MagicHashBit }        { tok_primint negative 3 4 octal }
529  @negative 0[xX] @numspc @hexadecimal \#
530                                       / { ifExtension MagicHashBit }        { tok_primint negative 3 4 hexadecimal }
531
532  @decimal                       \# \# / { ifExtension MagicHashBit }        { tok_primword 0 2 decimal }
533  0[bB] @numspc @binary          \# \# / { ifExtension MagicHashBit `alexAndPred`
534                                           ifExtension BinaryLiteralsBit }   { tok_primword 2 4 binary }
535  0[oO] @numspc @octal           \# \# / { ifExtension MagicHashBit }        { tok_primword 2 4 octal }
536  0[xX] @numspc @hexadecimal     \# \# / { ifExtension MagicHashBit }        { tok_primword 2 4 hexadecimal }
537
538  -- Unboxed floats and doubles (:: Float#, :: Double#)
539  -- prim_{float,double} work with signed literals
540  @signed @floating_point           \# / { ifExtension MagicHashBit }        { tok_frac 1 tok_primfloat }
541  @signed @floating_point        \# \# / { ifExtension MagicHashBit }        { tok_frac 2 tok_primdouble }
542}
543
544-- Strings and chars are lexed by hand-written code.  The reason is
545-- that even if we recognise the string or char here in the regex
546-- lexer, we would still have to parse the string afterward in order
547-- to convert it to a String.
548<0> {
549  \'                            { lex_char_tok }
550  \"                            { lex_string_tok }
551}
552
553-- Note [Lexing type applications]
554-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
555-- The desired syntax for type applications is to prefix the type application
556-- with '@', like this:
557--
558--   foo @Int @Bool baz bum
559--
560-- This, of course, conflicts with as-patterns. The conflict arises because
561-- expressions and patterns use the same parser, and also because we want
562-- to allow type patterns within expression patterns.
563--
564-- Disambiguation is accomplished by requiring *something* to appear between
565-- type application and the preceding token. This something must end with
566-- a character that cannot be the end of the variable bound in an as-pattern.
567-- Currently (June 2015), this means that the something cannot end with a
568-- $idchar or a close-paren. (The close-paren is necessary if the as-bound
569-- identifier is symbolic.)
570--
571-- Note that looking for whitespace before the '@' is insufficient, because
572-- of this pathological case:
573--
574--   foo {- hi -}@Int
575--
576-- This design is predicated on the fact that as-patterns are generally
577-- whitespace-free, and also that this whole thing is opt-in, with the
578-- TypeApplications extension.
579
580-- -----------------------------------------------------------------------------
581-- Alex "Haskell code fragment bottom"
582
583{
584
585-- -----------------------------------------------------------------------------
586-- The token type
587
588data Token
589  = ITas                        -- Haskell keywords
590  | ITcase
591  | ITclass
592  | ITdata
593  | ITdefault
594  | ITderiving
595  | ITdo
596  | ITelse
597  | IThiding
598  | ITforeign
599  | ITif
600  | ITimport
601  | ITin
602  | ITinfix
603  | ITinfixl
604  | ITinfixr
605  | ITinstance
606  | ITlet
607  | ITmodule
608  | ITnewtype
609  | ITof
610  | ITqualified
611  | ITthen
612  | ITtype
613  | ITwhere
614
615  | ITforall            IsUnicodeSyntax -- GHC extension keywords
616  | ITexport
617  | ITlabel
618  | ITdynamic
619  | ITsafe
620  | ITinterruptible
621  | ITunsafe
622  | ITstdcallconv
623  | ITccallconv
624  | ITcapiconv
625  | ITprimcallconv
626  | ITjavascriptcallconv
627  | ITmdo
628  | ITfamily
629  | ITrole
630  | ITgroup
631  | ITby
632  | ITusing
633  | ITpattern
634  | ITstatic
635  | ITstock
636  | ITanyclass
637  | ITvia
638
639  -- Backpack tokens
640  | ITunit
641  | ITsignature
642  | ITdependency
643  | ITrequires
644
645  -- Pragmas, see  note [Pragma source text] in BasicTypes
646  | ITinline_prag       SourceText InlineSpec RuleMatchInfo
647  | ITspec_prag         SourceText                -- SPECIALISE
648  | ITspec_inline_prag  SourceText Bool    -- SPECIALISE INLINE (or NOINLINE)
649  | ITsource_prag       SourceText
650  | ITrules_prag        SourceText
651  | ITwarning_prag      SourceText
652  | ITdeprecated_prag   SourceText
653  | ITline_prag         SourceText  -- not usually produced, see 'UsePosPragsBit'
654  | ITcolumn_prag       SourceText  -- not usually produced, see 'UsePosPragsBit'
655  | ITscc_prag          SourceText
656  | ITgenerated_prag    SourceText
657  | ITcore_prag         SourceText         -- hdaume: core annotations
658  | ITunpack_prag       SourceText
659  | ITnounpack_prag     SourceText
660  | ITann_prag          SourceText
661  | ITcomplete_prag     SourceText
662  | ITclose_prag
663  | IToptions_prag String
664  | ITinclude_prag String
665  | ITlanguage_prag
666  | ITminimal_prag      SourceText
667  | IToverlappable_prag SourceText  -- instance overlap mode
668  | IToverlapping_prag  SourceText  -- instance overlap mode
669  | IToverlaps_prag     SourceText  -- instance overlap mode
670  | ITincoherent_prag   SourceText  -- instance overlap mode
671  | ITctype             SourceText
672  | ITcomment_line_prag         -- See Note [Nested comment line pragmas]
673
674  | ITdotdot                    -- reserved symbols
675  | ITcolon
676  | ITdcolon            IsUnicodeSyntax
677  | ITequal
678  | ITlam
679  | ITlcase
680  | ITvbar
681  | ITlarrow            IsUnicodeSyntax
682  | ITrarrow            IsUnicodeSyntax
683  | ITat
684  | ITtilde
685  | ITdarrow            IsUnicodeSyntax
686  | ITminus
687  | ITbang
688  | ITstar              IsUnicodeSyntax
689  | ITdot
690
691  | ITbiglam                    -- GHC-extension symbols
692
693  | ITocurly                    -- special symbols
694  | ITccurly
695  | ITvocurly
696  | ITvccurly
697  | ITobrack
698  | ITopabrack                  -- [:, for parallel arrays with -XParallelArrays
699  | ITcpabrack                  -- :], for parallel arrays with -XParallelArrays
700  | ITcbrack
701  | IToparen
702  | ITcparen
703  | IToubxparen
704  | ITcubxparen
705  | ITsemi
706  | ITcomma
707  | ITunderscore
708  | ITbackquote
709  | ITsimpleQuote               --  '
710
711  | ITvarid   FastString        -- identifiers
712  | ITconid   FastString
713  | ITvarsym  FastString
714  | ITconsym  FastString
715  | ITqvarid  (FastString,FastString)
716  | ITqconid  (FastString,FastString)
717  | ITqvarsym (FastString,FastString)
718  | ITqconsym (FastString,FastString)
719
720  | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
721  | ITlabelvarid   FastString   -- Overloaded label: #x
722
723  | ITchar     SourceText Char       -- Note [Literal source text] in BasicTypes
724  | ITstring   SourceText FastString -- Note [Literal source text] in BasicTypes
725  | ITinteger  IntegralLit           -- Note [Literal source text] in BasicTypes
726  | ITrational FractionalLit
727
728  | ITprimchar   SourceText Char     -- Note [Literal source text] in BasicTypes
729  | ITprimstring SourceText ByteString -- Note [Literal source text] @BasicTypes
730  | ITprimint    SourceText Integer  -- Note [Literal source text] in BasicTypes
731  | ITprimword   SourceText Integer  -- Note [Literal source text] in BasicTypes
732  | ITprimfloat  FractionalLit
733  | ITprimdouble FractionalLit
734
735  -- Template Haskell extension tokens
736  | ITopenExpQuote HasE IsUnicodeSyntax --  [| or [e|
737  | ITopenPatQuote                      --  [p|
738  | ITopenDecQuote                      --  [d|
739  | ITopenTypQuote                      --  [t|
740  | ITcloseQuote IsUnicodeSyntax        --  |]
741  | ITopenTExpQuote HasE                --  [|| or [e||
742  | ITcloseTExpQuote                    --  ||]
743  | ITidEscape   FastString             --  $x
744  | ITparenEscape                       --  $(
745  | ITidTyEscape   FastString           --  $$x
746  | ITparenTyEscape                     --  $$(
747  | ITtyQuote                           --  ''
748  | ITquasiQuote (FastString,FastString,RealSrcSpan)
749    -- ITquasiQuote(quoter, quote, loc)
750    -- represents a quasi-quote of the form
751    -- [quoter| quote |]
752  | ITqQuasiQuote (FastString,FastString,FastString,RealSrcSpan)
753    -- ITqQuasiQuote(Qual, quoter, quote, loc)
754    -- represents a qualified quasi-quote of the form
755    -- [Qual.quoter| quote |]
756
757  -- Arrow notation extension
758  | ITproc
759  | ITrec
760  | IToparenbar  IsUnicodeSyntax -- ^ @(|@
761  | ITcparenbar  IsUnicodeSyntax -- ^ @|)@
762  | ITlarrowtail IsUnicodeSyntax -- ^ @-<@
763  | ITrarrowtail IsUnicodeSyntax -- ^ @>-@
764  | ITLarrowtail IsUnicodeSyntax -- ^ @-<<@
765  | ITRarrowtail IsUnicodeSyntax -- ^ @>>-@
766
767  -- | Type application '@' (lexed differently than as-pattern '@',
768  -- due to checking for preceding whitespace)
769  | ITtypeApp
770
771
772  | ITunknown String             -- ^ Used when the lexer can't make sense of it
773  | ITeof                        -- ^ end of file token
774
775  -- Documentation annotations
776  | ITdocCommentNext  String     -- ^ something beginning @-- |@
777  | ITdocCommentPrev  String     -- ^ something beginning @-- ^@
778  | ITdocCommentNamed String     -- ^ something beginning @-- $@
779  | ITdocSection      Int String -- ^ a section heading
780  | ITdocOptions      String     -- ^ doc options (prune, ignore-exports, etc)
781  | ITlineComment     String     -- ^ comment starting by "--"
782  | ITblockComment    String     -- ^ comment in {- -}
783
784  deriving Show
785
786instance Outputable Token where
787  ppr x = text (show x)
788
789
790-- the bitmap provided as the third component indicates whether the
791-- corresponding extension keyword is valid under the extension options
792-- provided to the compiler; if the extension corresponding to *any* of the
793-- bits set in the bitmap is enabled, the keyword is valid (this setup
794-- facilitates using a keyword in two different extensions that can be
795-- activated independently)
796--
797reservedWordsFM :: UniqFM (Token, ExtsBitmap)
798reservedWordsFM = listToUFM $
799    map (\(x, y, z) -> (mkFastString x, (y, z)))
800        [( "_",              ITunderscore,    0 ),
801         ( "as",             ITas,            0 ),
802         ( "case",           ITcase,          0 ),
803         ( "class",          ITclass,         0 ),
804         ( "data",           ITdata,          0 ),
805         ( "default",        ITdefault,       0 ),
806         ( "deriving",       ITderiving,      0 ),
807         ( "do",             ITdo,            0 ),
808         ( "else",           ITelse,          0 ),
809         ( "hiding",         IThiding,        0 ),
810         ( "if",             ITif,            0 ),
811         ( "import",         ITimport,        0 ),
812         ( "in",             ITin,            0 ),
813         ( "infix",          ITinfix,         0 ),
814         ( "infixl",         ITinfixl,        0 ),
815         ( "infixr",         ITinfixr,        0 ),
816         ( "instance",       ITinstance,      0 ),
817         ( "let",            ITlet,           0 ),
818         ( "module",         ITmodule,        0 ),
819         ( "newtype",        ITnewtype,       0 ),
820         ( "of",             ITof,            0 ),
821         ( "qualified",      ITqualified,     0 ),
822         ( "then",           ITthen,          0 ),
823         ( "type",           ITtype,          0 ),
824         ( "where",          ITwhere,         0 ),
825
826         ( "forall",         ITforall NormalSyntax, 0),
827         ( "mdo",            ITmdo,           xbit RecursiveDoBit),
828             -- See Note [Lexing type pseudo-keywords]
829         ( "family",         ITfamily,        0 ),
830         ( "role",           ITrole,          0 ),
831         ( "pattern",        ITpattern,       xbit PatternSynonymsBit),
832         ( "static",         ITstatic,        xbit StaticPointersBit ),
833         ( "stock",          ITstock,         0 ),
834         ( "anyclass",       ITanyclass,      0 ),
835         ( "via",            ITvia,           0 ),
836         ( "group",          ITgroup,         xbit TransformComprehensionsBit),
837         ( "by",             ITby,            xbit TransformComprehensionsBit),
838         ( "using",          ITusing,         xbit TransformComprehensionsBit),
839
840         ( "foreign",        ITforeign,       xbit FfiBit),
841         ( "export",         ITexport,        xbit FfiBit),
842         ( "label",          ITlabel,         xbit FfiBit),
843         ( "dynamic",        ITdynamic,       xbit FfiBit),
844         ( "safe",           ITsafe,          xbit FfiBit .|.
845                                              xbit SafeHaskellBit),
846         ( "interruptible",  ITinterruptible, xbit InterruptibleFfiBit),
847         ( "unsafe",         ITunsafe,        xbit FfiBit),
848         ( "stdcall",        ITstdcallconv,   xbit FfiBit),
849         ( "ccall",          ITccallconv,     xbit FfiBit),
850         ( "capi",           ITcapiconv,      xbit CApiFfiBit),
851         ( "prim",           ITprimcallconv,  xbit FfiBit),
852         ( "javascript",     ITjavascriptcallconv, xbit FfiBit),
853
854         ( "unit",           ITunit,          0 ),
855         ( "dependency",     ITdependency,       0 ),
856         ( "signature",      ITsignature,     0 ),
857
858         ( "rec",            ITrec,           xbit ArrowsBit .|.
859                                              xbit RecursiveDoBit),
860         ( "proc",           ITproc,          xbit ArrowsBit)
861     ]
862
863{-----------------------------------
864Note [Lexing type pseudo-keywords]
865~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
866
867One might think that we wish to treat 'family' and 'role' as regular old
868varids whenever -XTypeFamilies and -XRoleAnnotations are off, respectively.
869But, there is no need to do so. These pseudo-keywords are not stolen syntax:
870they are only used after the keyword 'type' at the top-level, where varids are
871not allowed. Furthermore, checks further downstream (TcTyClsDecls) ensure that
872type families and role annotations are never declared without their extensions
873on. In fact, by unconditionally lexing these pseudo-keywords as special, we
874can get better error messages.
875
876Also, note that these are included in the `varid` production in the parser --
877a key detail to make all this work.
878-------------------------------------}
879
880reservedSymsFM :: UniqFM (Token, IsUnicodeSyntax, ExtsBitmap)
881reservedSymsFM = listToUFM $
882    map (\ (x,w,y,z) -> (mkFastString x,(w,y,z)))
883      [ ("..",  ITdotdot,                   NormalSyntax,  0 )
884        -- (:) is a reserved op, meaning only list cons
885       ,(":",   ITcolon,                    NormalSyntax,  0 )
886       ,("::",  ITdcolon NormalSyntax,      NormalSyntax,  0 )
887       ,("=",   ITequal,                    NormalSyntax,  0 )
888       ,("\\",  ITlam,                      NormalSyntax,  0 )
889       ,("|",   ITvbar,                     NormalSyntax,  0 )
890       ,("<-",  ITlarrow NormalSyntax,      NormalSyntax,  0 )
891       ,("->",  ITrarrow NormalSyntax,      NormalSyntax,  0 )
892       ,("@",   ITat,                       NormalSyntax,  0 )
893       ,("~",   ITtilde,                    NormalSyntax,  0 )
894       ,("=>",  ITdarrow NormalSyntax,      NormalSyntax,  0 )
895       ,("-",   ITminus,                    NormalSyntax,  0 )
896       ,("!",   ITbang,                     NormalSyntax,  0 )
897
898       ,("*",   ITstar NormalSyntax,        NormalSyntax,  xbit StarIsTypeBit)
899
900        -- For 'forall a . t'
901       ,(".",   ITdot,                      NormalSyntax,  0 )
902
903       ,("-<",  ITlarrowtail NormalSyntax,  NormalSyntax,  xbit ArrowsBit)
904       ,(">-",  ITrarrowtail NormalSyntax,  NormalSyntax,  xbit ArrowsBit)
905       ,("-<<", ITLarrowtail NormalSyntax,  NormalSyntax,  xbit ArrowsBit)
906       ,(">>-", ITRarrowtail NormalSyntax,  NormalSyntax,  xbit ArrowsBit)
907
908       ,("∷",   ITdcolon UnicodeSyntax,     UnicodeSyntax, 0 )
909       ,("⇒",   ITdarrow UnicodeSyntax,     UnicodeSyntax, 0 )
910       ,("∀",   ITforall UnicodeSyntax,     UnicodeSyntax, 0 )
911       ,("→",   ITrarrow UnicodeSyntax,     UnicodeSyntax, 0 )
912       ,("←",   ITlarrow UnicodeSyntax,     UnicodeSyntax, 0 )
913
914       ,("⤙",   ITlarrowtail UnicodeSyntax, UnicodeSyntax, xbit ArrowsBit)
915       ,("⤚",   ITrarrowtail UnicodeSyntax, UnicodeSyntax, xbit ArrowsBit)
916       ,("⤛",   ITLarrowtail UnicodeSyntax, UnicodeSyntax, xbit ArrowsBit)
917       ,("⤜",   ITRarrowtail UnicodeSyntax, UnicodeSyntax, xbit ArrowsBit)
918
919       ,("★",   ITstar UnicodeSyntax,       UnicodeSyntax, xbit StarIsTypeBit)
920
921        -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
922        -- form part of a large operator.  This would let us have a better
923        -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
924       ]
925
926-- -----------------------------------------------------------------------------
927-- Lexer actions
928
929type Action = RealSrcSpan -> StringBuffer -> Int -> P (RealLocated Token)
930
931special :: Token -> Action
932special tok span _buf _len = return (L span tok)
933
934token, layout_token :: Token -> Action
935token t span _buf _len = return (L span t)
936layout_token t span _buf _len = pushLexState layout >> return (L span t)
937
938idtoken :: (StringBuffer -> Int -> Token) -> Action
939idtoken f span buf len = return (L span $! (f buf len))
940
941skip_one_varid :: (FastString -> Token) -> Action
942skip_one_varid f span buf len
943  = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
944
945skip_two_varid :: (FastString -> Token) -> Action
946skip_two_varid f span buf len
947  = return (L span $! f (lexemeToFastString (stepOn (stepOn buf)) (len-2)))
948
949strtoken :: (String -> Token) -> Action
950strtoken f span buf len =
951  return (L span $! (f $! lexemeToString buf len))
952
953begin :: Int -> Action
954begin code _span _str _len = do pushLexState code; lexToken
955
956pop :: Action
957pop _span _buf _len = do _ <- popLexState
958                         lexToken
959-- See Note [Nested comment line pragmas]
960failLinePrag1 :: Action
961failLinePrag1 span _buf _len = do
962  b <- getBit InNestedCommentBit
963  if b then return (L span ITcomment_line_prag)
964       else lexError "lexical error in pragma"
965
966-- See Note [Nested comment line pragmas]
967popLinePrag1 :: Action
968popLinePrag1 span _buf _len = do
969  b <- getBit InNestedCommentBit
970  if b then return (L span ITcomment_line_prag) else do
971    _ <- popLexState
972    lexToken
973
974hopefully_open_brace :: Action
975hopefully_open_brace span buf len
976 = do relaxed <- getBit RelaxedLayoutBit
977      ctx <- getContext
978      (AI l _) <- getInput
979      let offset = srcLocCol l
980          isOK = relaxed ||
981                 case ctx of
982                 Layout prev_off _ : _ -> prev_off < offset
983                 _                     -> True
984      if isOK then pop_and open_brace span buf len
985              else addFatalError (RealSrcSpan span) (text "Missing block")
986
987pop_and :: Action -> Action
988pop_and act span buf len = do _ <- popLexState
989                              act span buf len
990
991{-# INLINE nextCharIs #-}
992nextCharIs :: StringBuffer -> (Char -> Bool) -> Bool
993nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
994
995{-# INLINE nextCharIsNot #-}
996nextCharIsNot :: StringBuffer -> (Char -> Bool) -> Bool
997nextCharIsNot buf p = not (nextCharIs buf p)
998
999notFollowedBy :: Char -> AlexAccPred ExtsBitmap
1000notFollowedBy char _ _ _ (AI _ buf)
1001  = nextCharIsNot buf (== char)
1002
1003notFollowedBySymbol :: AlexAccPred ExtsBitmap
1004notFollowedBySymbol _ _ _ (AI _ buf)
1005  = nextCharIsNot buf (`elem` "!#$%&*+./<=>?@\\^|-~")
1006
1007followedByDigit :: AlexAccPred ExtsBitmap
1008followedByDigit _ _ _ (AI _ buf)
1009  = afterOptionalSpace buf (\b -> nextCharIs b (`elem` ['0'..'9']))
1010
1011ifCurrentChar :: Char -> AlexAccPred ExtsBitmap
1012ifCurrentChar char _ (AI _ buf) _ _
1013  = nextCharIs buf (== char)
1014
1015-- We must reject doc comments as being ordinary comments everywhere.
1016-- In some cases the doc comment will be selected as the lexeme due to
1017-- maximal munch, but not always, because the nested comment rule is
1018-- valid in all states, but the doc-comment rules are only valid in
1019-- the non-layout states.
1020isNormalComment :: AlexAccPred ExtsBitmap
1021isNormalComment bits _ _ (AI _ buf)
1022  | HaddockBit `xtest` bits = notFollowedByDocOrPragma
1023  | otherwise               = nextCharIsNot buf (== '#')
1024  where
1025    notFollowedByDocOrPragma
1026       = afterOptionalSpace buf (\b -> nextCharIsNot b (`elem` "|^*$#"))
1027
1028afterOptionalSpace :: StringBuffer -> (StringBuffer -> Bool) -> Bool
1029afterOptionalSpace buf p
1030    = if nextCharIs buf (== ' ')
1031      then p (snd (nextChar buf))
1032      else p buf
1033
1034atEOL :: AlexAccPred ExtsBitmap
1035atEOL _ _ _ (AI _ buf) = atEnd buf || currentChar buf == '\n'
1036
1037ifExtension :: ExtBits -> AlexAccPred ExtsBitmap
1038ifExtension extBits bits _ _ _ = extBits `xtest` bits
1039
1040alexNotPred p userState in1 len in2
1041  = not (p userState in1 len in2)
1042
1043alexOrPred p1 p2 userState in1 len in2
1044  = p1 userState in1 len in2 || p2 userState in1 len in2
1045
1046multiline_doc_comment :: Action
1047multiline_doc_comment span buf _len = withLexedDocType (worker "")
1048  where
1049    worker commentAcc input docType checkNextLine = case alexGetChar' input of
1050      Just ('\n', input')
1051        | checkNextLine -> case checkIfCommentLine input' of
1052          Just input -> worker ('\n':commentAcc) input docType checkNextLine
1053          Nothing -> docCommentEnd input commentAcc docType buf span
1054        | otherwise -> docCommentEnd input commentAcc docType buf span
1055      Just (c, input) -> worker (c:commentAcc) input docType checkNextLine
1056      Nothing -> docCommentEnd input commentAcc docType buf span
1057
1058    -- Check if the next line of input belongs to this doc comment as well.
1059    -- A doc comment continues onto the next line when the following
1060    -- conditions are met:
1061    --   * The line starts with "--"
1062    --   * The line doesn't start with "---".
1063    --   * The line doesn't start with "-- $", because that would be the
1064    --     start of a /new/ named haddock chunk (#10398).
1065    checkIfCommentLine :: AlexInput -> Maybe AlexInput
1066    checkIfCommentLine input = check (dropNonNewlineSpace input)
1067      where
1068        check input = do
1069          ('-', input) <- alexGetChar' input
1070          ('-', input) <- alexGetChar' input
1071          (c, after_c) <- alexGetChar' input
1072          case c of
1073            '-' -> Nothing
1074            ' ' -> case alexGetChar' after_c of
1075                     Just ('$', _) -> Nothing
1076                     _ -> Just input
1077            _   -> Just input
1078
1079        dropNonNewlineSpace input = case alexGetChar' input of
1080          Just (c, input')
1081            | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
1082            | otherwise -> input
1083          Nothing -> input
1084
1085lineCommentToken :: Action
1086lineCommentToken span buf len = do
1087  b <- getBit RawTokenStreamBit
1088  if b then strtoken ITlineComment span buf len else lexToken
1089
1090{-
1091  nested comments require traversing by hand, they can't be parsed
1092  using regular expressions.
1093-}
1094nested_comment :: P (RealLocated Token) -> Action
1095nested_comment cont span buf len = do
1096  input <- getInput
1097  go (reverse $ lexemeToString buf len) (1::Int) input
1098  where
1099    go commentAcc 0 input = do
1100      setInput input
1101      b <- getBit RawTokenStreamBit
1102      if b
1103        then docCommentEnd input commentAcc ITblockComment buf span
1104        else cont
1105    go commentAcc n input = case alexGetChar' input of
1106      Nothing -> errBrace input span
1107      Just ('-',input) -> case alexGetChar' input of
1108        Nothing  -> errBrace input span
1109        Just ('\125',input) -> go ('\125':'-':commentAcc) (n-1) input -- '}'
1110        Just (_,_)          -> go ('-':commentAcc) n input
1111      Just ('\123',input) -> case alexGetChar' input of  -- '{' char
1112        Nothing  -> errBrace input span
1113        Just ('-',input) -> go ('-':'\123':commentAcc) (n+1) input
1114        Just (_,_)       -> go ('\123':commentAcc) n input
1115      -- See Note [Nested comment line pragmas]
1116      Just ('\n',input) -> case alexGetChar' input of
1117        Nothing  -> errBrace input span
1118        Just ('#',_) -> do (parsedAcc,input) <- parseNestedPragma input
1119                           go (parsedAcc ++ '\n':commentAcc) n input
1120        Just (_,_)   -> go ('\n':commentAcc) n input
1121      Just (c,input) -> go (c:commentAcc) n input
1122
1123nested_doc_comment :: Action
1124nested_doc_comment span buf _len = withLexedDocType (go "")
1125  where
1126    go commentAcc input docType _ = case alexGetChar' input of
1127      Nothing -> errBrace input span
1128      Just ('-',input) -> case alexGetChar' input of
1129        Nothing -> errBrace input span
1130        Just ('\125',input) ->
1131          docCommentEnd input commentAcc docType buf span
1132        Just (_,_) -> go ('-':commentAcc) input docType False
1133      Just ('\123', input) -> case alexGetChar' input of
1134        Nothing  -> errBrace input span
1135        Just ('-',input) -> do
1136          setInput input
1137          let cont = do input <- getInput; go commentAcc input docType False
1138          nested_comment cont span buf _len
1139        Just (_,_) -> go ('\123':commentAcc) input docType False
1140      -- See Note [Nested comment line pragmas]
1141      Just ('\n',input) -> case alexGetChar' input of
1142        Nothing  -> errBrace input span
1143        Just ('#',_) -> do (parsedAcc,input) <- parseNestedPragma input
1144                           go (parsedAcc ++ '\n':commentAcc) input docType False
1145        Just (_,_)   -> go ('\n':commentAcc) input docType False
1146      Just (c,input) -> go (c:commentAcc) input docType False
1147
1148-- See Note [Nested comment line pragmas]
1149parseNestedPragma :: AlexInput -> P (String,AlexInput)
1150parseNestedPragma input@(AI _ buf) = do
1151  origInput <- getInput
1152  setInput input
1153  setExts (.|. xbit InNestedCommentBit)
1154  pushLexState bol
1155  lt <- lexToken
1156  _ <- popLexState
1157  setExts (.&. complement (xbit InNestedCommentBit))
1158  postInput@(AI _ postBuf) <- getInput
1159  setInput origInput
1160  case unRealSrcSpan lt of
1161    ITcomment_line_prag -> do
1162      let bytes = byteDiff buf postBuf
1163          diff  = lexemeToString buf bytes
1164      return (reverse diff, postInput)
1165    lt' -> panic ("parseNestedPragma: unexpected token" ++ (show lt'))
1166
1167{-
1168Note [Nested comment line pragmas]
1169~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1170We used to ignore cpp-preprocessor-generated #line pragmas if they were inside
1171nested comments.
1172
1173Now, when parsing a nested comment, if we encounter a line starting with '#' we
1174call parseNestedPragma, which executes the following:
11751. Save the current lexer input (loc, buf) for later
11762. Set the current lexer input to the beginning of the line starting with '#'
11773. Turn the 'InNestedComment' extension on
11784. Push the 'bol' lexer state
11795. Lex a token. Due to (2), (3), and (4), this should always lex a single line
1180   or less and return the ITcomment_line_prag token. This may set source line
1181   and file location if a #line pragma is successfully parsed
11826. Restore lexer input and state to what they were before we did all this
11837. Return control to the function parsing a nested comment, informing it of
1184   what the lexer parsed
1185
1186Regarding (5) above:
1187Every exit from the 'bol' lexer state (do_bol, popLinePrag1, failLinePrag1)
1188checks if the 'InNestedComment' extension is set. If it is, that function will
1189return control to parseNestedPragma by returning the ITcomment_line_prag token.
1190
1191See #314 for more background on the bug this fixes.
1192-}
1193
1194withLexedDocType :: (AlexInput -> (String -> Token) -> Bool -> P (RealLocated Token))
1195                 -> P (RealLocated Token)
1196withLexedDocType lexDocComment = do
1197  input@(AI _ buf) <- getInput
1198  case prevChar buf ' ' of
1199    -- The `Bool` argument to lexDocComment signals whether or not the next
1200    -- line of input might also belong to this doc comment.
1201    '|' -> lexDocComment input ITdocCommentNext True
1202    '^' -> lexDocComment input ITdocCommentPrev True
1203    '$' -> lexDocComment input ITdocCommentNamed True
1204    '*' -> lexDocSection 1 input
1205    _ -> panic "withLexedDocType: Bad doc type"
1206 where
1207    lexDocSection n input = case alexGetChar' input of
1208      Just ('*', input) -> lexDocSection (n+1) input
1209      Just (_,   _)     -> lexDocComment input (ITdocSection n) False
1210      Nothing -> do setInput input; lexToken -- eof reached, lex it normally
1211
1212-- RULES pragmas turn on the forall and '.' keywords, and we turn them
1213-- off again at the end of the pragma.
1214rulePrag :: Action
1215rulePrag span buf len = do
1216  setExts (.|. xbit InRulePragBit)
1217  let !src = lexemeToString buf len
1218  return (L span (ITrules_prag (SourceText src)))
1219
1220-- When 'UsePosPragsBit' is not set, it is expected that we emit a token instead
1221-- of updating the position in 'PState'
1222linePrag :: Action
1223linePrag span buf len = do
1224  usePosPrags <- getBit UsePosPragsBit
1225  if usePosPrags
1226    then begin line_prag2 span buf len
1227    else let !src = lexemeToString buf len
1228         in return (L span (ITline_prag (SourceText src)))
1229
1230-- When 'UsePosPragsBit' is not set, it is expected that we emit a token instead
1231-- of updating the position in 'PState'
1232columnPrag :: Action
1233columnPrag span buf len = do
1234  usePosPrags <- getBit UsePosPragsBit
1235  let !src = lexemeToString buf len
1236  if usePosPrags
1237    then begin column_prag span buf len
1238    else let !src = lexemeToString buf len
1239         in return (L span (ITcolumn_prag (SourceText src)))
1240
1241endPrag :: Action
1242endPrag span _buf _len = do
1243  setExts (.&. complement (xbit InRulePragBit))
1244  return (L span ITclose_prag)
1245
1246-- docCommentEnd
1247-------------------------------------------------------------------------------
1248-- This function is quite tricky. We can't just return a new token, we also
1249-- need to update the state of the parser. Why? Because the token is longer
1250-- than what was lexed by Alex, and the lexToken function doesn't know this, so
1251-- it writes the wrong token length to the parser state. This function is
1252-- called afterwards, so it can just update the state.
1253
1254docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
1255                 RealSrcSpan -> P (RealLocated Token)
1256docCommentEnd input commentAcc docType buf span = do
1257  setInput input
1258  let (AI loc nextBuf) = input
1259      comment = reverse commentAcc
1260      span' = mkRealSrcSpan (realSrcSpanStart span) loc
1261      last_len = byteDiff buf nextBuf
1262
1263  span `seq` setLastToken span' last_len
1264  return (L span' (docType comment))
1265
1266errBrace :: AlexInput -> RealSrcSpan -> P a
1267errBrace (AI end _) span = failLocMsgP (realSrcSpanStart span) end "unterminated `{-'"
1268
1269open_brace, close_brace :: Action
1270open_brace span _str _len = do
1271  ctx <- getContext
1272  setContext (NoLayout:ctx)
1273  return (L span ITocurly)
1274close_brace span _str _len = do
1275  popContext
1276  return (L span ITccurly)
1277
1278qvarid, qconid :: StringBuffer -> Int -> Token
1279qvarid buf len = ITqvarid $! splitQualName buf len False
1280qconid buf len = ITqconid $! splitQualName buf len False
1281
1282splitQualName :: StringBuffer -> Int -> Bool -> (FastString,FastString)
1283-- takes a StringBuffer and a length, and returns the module name
1284-- and identifier parts of a qualified name.  Splits at the *last* dot,
1285-- because of hierarchical module names.
1286splitQualName orig_buf len parens = split orig_buf orig_buf
1287  where
1288    split buf dot_buf
1289        | orig_buf `byteDiff` buf >= len  = done dot_buf
1290        | c == '.'                        = found_dot buf'
1291        | otherwise                       = split buf' dot_buf
1292      where
1293       (c,buf') = nextChar buf
1294
1295    -- careful, we might get names like M....
1296    -- so, if the character after the dot is not upper-case, this is
1297    -- the end of the qualifier part.
1298    found_dot buf -- buf points after the '.'
1299        | isUpper c    = split buf' buf
1300        | otherwise    = done buf
1301      where
1302       (c,buf') = nextChar buf
1303
1304    done dot_buf =
1305        (lexemeToFastString orig_buf (qual_size - 1),
1306         if parens -- Prelude.(+)
1307            then lexemeToFastString (stepOn dot_buf) (len - qual_size - 2)
1308            else lexemeToFastString dot_buf (len - qual_size))
1309      where
1310        qual_size = orig_buf `byteDiff` dot_buf
1311
1312varid :: Action
1313varid span buf len =
1314  case lookupUFM reservedWordsFM fs of
1315    Just (ITcase, _) -> do
1316      lastTk <- getLastTk
1317      keyword <- case lastTk of
1318        Just ITlam -> do
1319          lambdaCase <- getBit LambdaCaseBit
1320          unless lambdaCase $ do
1321            pState <- getPState
1322            addError (RealSrcSpan (last_loc pState)) $ text
1323                     "Illegal lambda-case (use LambdaCase)"
1324          return ITlcase
1325        _ -> return ITcase
1326      maybe_layout keyword
1327      return $ L span keyword
1328    Just (keyword, 0) -> do
1329      maybe_layout keyword
1330      return $ L span keyword
1331    Just (keyword, i) -> do
1332      exts <- getExts
1333      if exts .&. i /= 0
1334        then do
1335          maybe_layout keyword
1336          return $ L span keyword
1337        else
1338          return $ L span $ ITvarid fs
1339    Nothing ->
1340      return $ L span $ ITvarid fs
1341  where
1342    !fs = lexemeToFastString buf len
1343
1344conid :: StringBuffer -> Int -> Token
1345conid buf len = ITconid $! lexemeToFastString buf len
1346
1347qvarsym, qconsym :: StringBuffer -> Int -> Token
1348qvarsym buf len = ITqvarsym $! splitQualName buf len False
1349qconsym buf len = ITqconsym $! splitQualName buf len False
1350
1351varsym, consym :: Action
1352varsym = sym ITvarsym
1353consym = sym ITconsym
1354
1355sym :: (FastString -> Token) -> Action
1356sym con span buf len =
1357  case lookupUFM reservedSymsFM fs of
1358    Just (keyword, NormalSyntax, 0) ->
1359      return $ L span keyword
1360    Just (keyword, NormalSyntax, i) -> do
1361      exts <- getExts
1362      if exts .&. i /= 0
1363        then return $ L span keyword
1364        else return $ L span (con fs)
1365    Just (keyword, UnicodeSyntax, 0) -> do
1366      exts <- getExts
1367      if xtest UnicodeSyntaxBit exts
1368        then return $ L span keyword
1369        else return $ L span (con fs)
1370    Just (keyword, UnicodeSyntax, i) -> do
1371      exts <- getExts
1372      if exts .&. i /= 0 && xtest UnicodeSyntaxBit exts
1373        then return $ L span keyword
1374        else return $ L span (con fs)
1375    Nothing ->
1376      return $ L span $! con fs
1377  where
1378    !fs = lexemeToFastString buf len
1379
1380-- Variations on the integral numeric literal.
1381tok_integral :: (SourceText -> Integer -> Token)
1382             -> (Integer -> Integer)
1383             -> Int -> Int
1384             -> (Integer, (Char -> Int))
1385             -> Action
1386tok_integral itint transint transbuf translen (radix,char_to_int) span buf len = do
1387  numericUnderscores <- getBit NumericUnderscoresBit  -- #14473
1388  let src = lexemeToString buf len
1389  when ((not numericUnderscores) && ('_' `elem` src)) $ do
1390    pState <- getPState
1391    addError (RealSrcSpan (last_loc pState)) $ text
1392             "Use NumericUnderscores to allow underscores in integer literals"
1393  return $ L span $ itint (SourceText src)
1394       $! transint $ parseUnsignedInteger
1395       (offsetBytes transbuf buf) (subtract translen len) radix char_to_int
1396
1397tok_num :: (Integer -> Integer)
1398        -> Int -> Int
1399        -> (Integer, (Char->Int)) -> Action
1400tok_num = tok_integral $ \case
1401    st@(SourceText ('-':_)) -> itint st (const True)
1402    st@(SourceText _)       -> itint st (const False)
1403    st@NoSourceText         -> itint st (< 0)
1404  where
1405    itint :: SourceText -> (Integer -> Bool) -> Integer -> Token
1406    itint !st is_negative !val = ITinteger ((IL st $! is_negative val) val)
1407
1408tok_primint :: (Integer -> Integer)
1409            -> Int -> Int
1410            -> (Integer, (Char->Int)) -> Action
1411tok_primint = tok_integral ITprimint
1412
1413
1414tok_primword :: Int -> Int
1415             -> (Integer, (Char->Int)) -> Action
1416tok_primword = tok_integral ITprimword positive
1417positive, negative :: (Integer -> Integer)
1418positive = id
1419negative = negate
1420decimal, octal, hexadecimal :: (Integer, Char -> Int)
1421decimal = (10,octDecDigit)
1422binary = (2,octDecDigit)
1423octal = (8,octDecDigit)
1424hexadecimal = (16,hexDigit)
1425
1426-- readRational can understand negative rationals, exponents, everything.
1427tok_frac :: Int -> (String -> Token) -> Action
1428tok_frac drop f span buf len = do
1429  numericUnderscores <- getBit NumericUnderscoresBit  -- #14473
1430  let src = lexemeToString buf (len-drop)
1431  when ((not numericUnderscores) && ('_' `elem` src)) $ do
1432    pState <- getPState
1433    addError (RealSrcSpan (last_loc pState)) $ text
1434             "Use NumericUnderscores to allow underscores in floating literals"
1435  return (L span $! (f $! src))
1436
1437tok_float, tok_primfloat, tok_primdouble :: String -> Token
1438tok_float        str = ITrational   $! readFractionalLit str
1439tok_hex_float    str = ITrational   $! readHexFractionalLit str
1440tok_primfloat    str = ITprimfloat  $! readFractionalLit str
1441tok_primdouble   str = ITprimdouble $! readFractionalLit str
1442
1443readFractionalLit :: String -> FractionalLit
1444readFractionalLit str = ((FL $! (SourceText str)) $! is_neg) $! readRational str
1445                        where is_neg = case str of ('-':_) -> True
1446                                                   _       -> False
1447readHexFractionalLit :: String -> FractionalLit
1448readHexFractionalLit str =
1449  FL { fl_text  = SourceText str
1450     , fl_neg   = case str of
1451                    '-' : _ -> True
1452                    _       -> False
1453     , fl_value = readHexRational str
1454     }
1455
1456-- -----------------------------------------------------------------------------
1457-- Layout processing
1458
1459-- we're at the first token on a line, insert layout tokens if necessary
1460do_bol :: Action
1461do_bol span _str _len = do
1462        -- See Note [Nested comment line pragmas]
1463        b <- getBit InNestedCommentBit
1464        if b then return (L span ITcomment_line_prag) else do
1465          (pos, gen_semic) <- getOffside
1466          case pos of
1467              LT -> do
1468                  --trace "layout: inserting '}'" $ do
1469                  popContext
1470                  -- do NOT pop the lex state, we might have a ';' to insert
1471                  return (L span ITvccurly)
1472              EQ | gen_semic -> do
1473                  --trace "layout: inserting ';'" $ do
1474                  _ <- popLexState
1475                  return (L span ITsemi)
1476              _ -> do
1477                  _ <- popLexState
1478                  lexToken
1479
1480-- certain keywords put us in the "layout" state, where we might
1481-- add an opening curly brace.
1482maybe_layout :: Token -> P ()
1483maybe_layout t = do -- If the alternative layout rule is enabled then
1484                    -- we never create an implicit layout context here.
1485                    -- Layout is handled XXX instead.
1486                    -- The code for closing implicit contexts, or
1487                    -- inserting implicit semi-colons, is therefore
1488                    -- irrelevant as it only applies in an implicit
1489                    -- context.
1490                    alr <- getBit AlternativeLayoutRuleBit
1491                    unless alr $ f t
1492    where f ITdo    = pushLexState layout_do
1493          f ITmdo   = pushLexState layout_do
1494          f ITof    = pushLexState layout
1495          f ITlcase = pushLexState layout
1496          f ITlet   = pushLexState layout
1497          f ITwhere = pushLexState layout
1498          f ITrec   = pushLexState layout
1499          f ITif    = pushLexState layout_if
1500          f _       = return ()
1501
1502-- Pushing a new implicit layout context.  If the indentation of the
1503-- next token is not greater than the previous layout context, then
1504-- Haskell 98 says that the new layout context should be empty; that is
1505-- the lexer must generate {}.
1506--
1507-- We are slightly more lenient than this: when the new context is started
1508-- by a 'do', then we allow the new context to be at the same indentation as
1509-- the previous context.  This is what the 'strict' argument is for.
1510new_layout_context :: Bool -> Bool -> Token -> Action
1511new_layout_context strict gen_semic tok span _buf len = do
1512    _ <- popLexState
1513    (AI l _) <- getInput
1514    let offset = srcLocCol l - len
1515    ctx <- getContext
1516    nondecreasing <- getBit NondecreasingIndentationBit
1517    let strict' = strict || not nondecreasing
1518    case ctx of
1519        Layout prev_off _ : _  |
1520           (strict'     && prev_off >= offset  ||
1521            not strict' && prev_off > offset) -> do
1522                -- token is indented to the left of the previous context.
1523                -- we must generate a {} sequence now.
1524                pushLexState layout_left
1525                return (L span tok)
1526        _ -> do setContext (Layout offset gen_semic : ctx)
1527                return (L span tok)
1528
1529do_layout_left :: Action
1530do_layout_left span _buf _len = do
1531    _ <- popLexState
1532    pushLexState bol  -- we must be at the start of a line
1533    return (L span ITvccurly)
1534
1535-- -----------------------------------------------------------------------------
1536-- LINE pragmas
1537
1538setLineAndFile :: Int -> Action
1539setLineAndFile code span buf len = do
1540  let src = lexemeToString buf (len - 1)  -- drop trailing quotation mark
1541      linenumLen = length $ head $ words src
1542      linenum = parseUnsignedInteger buf linenumLen 10 octDecDigit
1543      file = mkFastString $ go $ drop 1 $ dropWhile (/= '"') src
1544          -- skip everything through first quotation mark to get to the filename
1545        where go ('\\':c:cs) = c : go cs
1546              go (c:cs)      = c : go cs
1547              go []          = []
1548              -- decode escapes in the filename.  e.g. on Windows
1549              -- when our filenames have backslashes in, gcc seems to
1550              -- escape the backslashes.  One symptom of not doing this
1551              -- is that filenames in error messages look a bit strange:
1552              --   C:\\foo\bar.hs
1553              -- only the first backslash is doubled, because we apply
1554              -- System.FilePath.normalise before printing out
1555              -- filenames and it does not remove duplicate
1556              -- backslashes after the drive letter (should it?).
1557  setAlrLastLoc $ alrInitialLoc file
1558  setSrcLoc (mkRealSrcLoc file (fromIntegral linenum - 1) (srcSpanEndCol span))
1559      -- subtract one: the line number refers to the *following* line
1560  addSrcFile file
1561  _ <- popLexState
1562  pushLexState code
1563  lexToken
1564
1565setColumn :: Action
1566setColumn span buf len = do
1567  let column =
1568        case reads (lexemeToString buf len) of
1569          [(column, _)] -> column
1570          _ -> error "setColumn: expected integer" -- shouldn't happen
1571  setSrcLoc (mkRealSrcLoc (srcSpanFile span) (srcSpanEndLine span)
1572                          (fromIntegral (column :: Integer)))
1573  _ <- popLexState
1574  lexToken
1575
1576alrInitialLoc :: FastString -> RealSrcSpan
1577alrInitialLoc file = mkRealSrcSpan loc loc
1578    where -- This is a hack to ensure that the first line in a file
1579          -- looks like it is after the initial location:
1580          loc = mkRealSrcLoc file (-1) (-1)
1581
1582-- -----------------------------------------------------------------------------
1583-- Options, includes and language pragmas.
1584
1585lex_string_prag :: (String -> Token) -> Action
1586lex_string_prag mkTok span _buf _len
1587    = do input <- getInput
1588         start <- getRealSrcLoc
1589         tok <- go [] input
1590         end <- getRealSrcLoc
1591         return (L (mkRealSrcSpan start end) tok)
1592    where go acc input
1593              = if isString input "#-}"
1594                   then do setInput input
1595                           return (mkTok (reverse acc))
1596                   else case alexGetChar input of
1597                          Just (c,i) -> go (c:acc) i
1598                          Nothing -> err input
1599          isString _ [] = True
1600          isString i (x:xs)
1601              = case alexGetChar i of
1602                  Just (c,i') | c == x    -> isString i' xs
1603                  _other -> False
1604          err (AI end _) = failLocMsgP (realSrcSpanStart span) end "unterminated options pragma"
1605
1606
1607-- -----------------------------------------------------------------------------
1608-- Strings & Chars
1609
1610-- This stuff is horrible.  I hates it.
1611
1612lex_string_tok :: Action
1613lex_string_tok span buf _len = do
1614  tok <- lex_string ""
1615  (AI end bufEnd) <- getInput
1616  let
1617    tok' = case tok of
1618            ITprimstring _ bs -> ITprimstring (SourceText src) bs
1619            ITstring _ s -> ITstring (SourceText src) s
1620            _ -> panic "lex_string_tok"
1621    src = lexemeToString buf (cur bufEnd - cur buf)
1622  return (L (mkRealSrcSpan (realSrcSpanStart span) end) tok')
1623
1624lex_string :: String -> P Token
1625lex_string s = do
1626  i <- getInput
1627  case alexGetChar' i of
1628    Nothing -> lit_error i
1629
1630    Just ('"',i)  -> do
1631        setInput i
1632        let s' = reverse s
1633        magicHash <- getBit MagicHashBit
1634        if magicHash
1635          then do
1636            i <- getInput
1637            case alexGetChar' i of
1638              Just ('#',i) -> do
1639                setInput i
1640                when (any (> '\xFF') s') $ do
1641                  pState <- getPState
1642                  addError (RealSrcSpan (last_loc pState)) $ text
1643                     "primitive string literal must contain only characters <= \'\\xFF\'"
1644                return (ITprimstring (SourceText s') (unsafeMkByteString s'))
1645              _other ->
1646                return (ITstring (SourceText s') (mkFastString s'))
1647          else
1648                return (ITstring (SourceText s') (mkFastString s'))
1649
1650    Just ('\\',i)
1651        | Just ('&',i) <- next -> do
1652                setInput i; lex_string s
1653        | Just (c,i) <- next, c <= '\x7f' && is_space c -> do
1654                           -- is_space only works for <= '\x7f' (#3751, #5425)
1655                setInput i; lex_stringgap s
1656        where next = alexGetChar' i
1657
1658    Just (c, i1) -> do
1659        case c of
1660          '\\' -> do setInput i1; c' <- lex_escape; lex_string (c':s)
1661          c | isAny c -> do setInput i1; lex_string (c:s)
1662          _other -> lit_error i
1663
1664lex_stringgap :: String -> P Token
1665lex_stringgap s = do
1666  i <- getInput
1667  c <- getCharOrFail i
1668  case c of
1669    '\\' -> lex_string s
1670    c | c <= '\x7f' && is_space c -> lex_stringgap s
1671                           -- is_space only works for <= '\x7f' (#3751, #5425)
1672    _other -> lit_error i
1673
1674
1675lex_char_tok :: Action
1676-- Here we are basically parsing character literals, such as 'x' or '\n'
1677-- but we additionally spot 'x and ''T, returning ITsimpleQuote and
1678-- ITtyQuote respectively, but WITHOUT CONSUMING the x or T part
1679-- (the parser does that).
1680-- So we have to do two characters of lookahead: when we see 'x we need to
1681-- see if there's a trailing quote
1682lex_char_tok span buf _len = do        -- We've seen '
1683   i1 <- getInput       -- Look ahead to first character
1684   let loc = realSrcSpanStart span
1685   case alexGetChar' i1 of
1686        Nothing -> lit_error  i1
1687
1688        Just ('\'', i2@(AI end2 _)) -> do       -- We've seen ''
1689                   setInput i2
1690                   return (L (mkRealSrcSpan loc end2)  ITtyQuote)
1691
1692        Just ('\\', i2@(AI _end2 _)) -> do      -- We've seen 'backslash
1693                  setInput i2
1694                  lit_ch <- lex_escape
1695                  i3 <- getInput
1696                  mc <- getCharOrFail i3 -- Trailing quote
1697                  if mc == '\'' then finish_char_tok buf loc lit_ch
1698                                else lit_error i3
1699
1700        Just (c, i2@(AI _end2 _))
1701                | not (isAny c) -> lit_error i1
1702                | otherwise ->
1703
1704                -- We've seen 'x, where x is a valid character
1705                --  (i.e. not newline etc) but not a quote or backslash
1706           case alexGetChar' i2 of      -- Look ahead one more character
1707                Just ('\'', i3) -> do   -- We've seen 'x'
1708                        setInput i3
1709                        finish_char_tok buf loc c
1710                _other -> do            -- We've seen 'x not followed by quote
1711                                        -- (including the possibility of EOF)
1712                                        -- Just parse the quote only
1713                        let (AI end _) = i1
1714                        return (L (mkRealSrcSpan loc end) ITsimpleQuote)
1715
1716finish_char_tok :: StringBuffer -> RealSrcLoc -> Char -> P (RealLocated Token)
1717finish_char_tok buf loc ch  -- We've already seen the closing quote
1718                        -- Just need to check for trailing #
1719  = do  magicHash <- getBit MagicHashBit
1720        i@(AI end bufEnd) <- getInput
1721        let src = lexemeToString buf (cur bufEnd - cur buf)
1722        if magicHash then do
1723            case alexGetChar' i of
1724              Just ('#',i@(AI end _)) -> do
1725                setInput i
1726                return (L (mkRealSrcSpan loc end)
1727                          (ITprimchar (SourceText src) ch))
1728              _other ->
1729                return (L (mkRealSrcSpan loc end)
1730                          (ITchar (SourceText src) ch))
1731            else do
1732              return (L (mkRealSrcSpan loc end) (ITchar (SourceText src) ch))
1733
1734isAny :: Char -> Bool
1735isAny c | c > '\x7f' = isPrint c
1736        | otherwise  = is_any c
1737
1738lex_escape :: P Char
1739lex_escape = do
1740  i0 <- getInput
1741  c <- getCharOrFail i0
1742  case c of
1743        'a'   -> return '\a'
1744        'b'   -> return '\b'
1745        'f'   -> return '\f'
1746        'n'   -> return '\n'
1747        'r'   -> return '\r'
1748        't'   -> return '\t'
1749        'v'   -> return '\v'
1750        '\\'  -> return '\\'
1751        '"'   -> return '\"'
1752        '\''  -> return '\''
1753        '^'   -> do i1 <- getInput
1754                    c <- getCharOrFail i1
1755                    if c >= '@' && c <= '_'
1756                        then return (chr (ord c - ord '@'))
1757                        else lit_error i1
1758
1759        'x'   -> readNum is_hexdigit 16 hexDigit
1760        'o'   -> readNum is_octdigit  8 octDecDigit
1761        x | is_decdigit x -> readNum2 is_decdigit 10 octDecDigit (octDecDigit x)
1762
1763        c1 ->  do
1764           i <- getInput
1765           case alexGetChar' i of
1766            Nothing -> lit_error i0
1767            Just (c2,i2) ->
1768              case alexGetChar' i2 of
1769                Nothing -> do lit_error i0
1770                Just (c3,i3) ->
1771                   let str = [c1,c2,c3] in
1772                   case [ (c,rest) | (p,c) <- silly_escape_chars,
1773                                     Just rest <- [stripPrefix p str] ] of
1774                          (escape_char,[]):_ -> do
1775                                setInput i3
1776                                return escape_char
1777                          (escape_char,_:_):_ -> do
1778                                setInput i2
1779                                return escape_char
1780                          [] -> lit_error i0
1781
1782readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1783readNum is_digit base conv = do
1784  i <- getInput
1785  c <- getCharOrFail i
1786  if is_digit c
1787        then readNum2 is_digit base conv (conv c)
1788        else lit_error i
1789
1790readNum2 :: (Char -> Bool) -> Int -> (Char -> Int) -> Int -> P Char
1791readNum2 is_digit base conv i = do
1792  input <- getInput
1793  read i input
1794  where read i input = do
1795          case alexGetChar' input of
1796            Just (c,input') | is_digit c -> do
1797               let i' = i*base + conv c
1798               if i' > 0x10ffff
1799                  then setInput input >> lexError "numeric escape sequence out of range"
1800                  else read i' input'
1801            _other -> do
1802              setInput input; return (chr i)
1803
1804
1805silly_escape_chars :: [(String, Char)]
1806silly_escape_chars = [
1807        ("NUL", '\NUL'),
1808        ("SOH", '\SOH'),
1809        ("STX", '\STX'),
1810        ("ETX", '\ETX'),
1811        ("EOT", '\EOT'),
1812        ("ENQ", '\ENQ'),
1813        ("ACK", '\ACK'),
1814        ("BEL", '\BEL'),
1815        ("BS", '\BS'),
1816        ("HT", '\HT'),
1817        ("LF", '\LF'),
1818        ("VT", '\VT'),
1819        ("FF", '\FF'),
1820        ("CR", '\CR'),
1821        ("SO", '\SO'),
1822        ("SI", '\SI'),
1823        ("DLE", '\DLE'),
1824        ("DC1", '\DC1'),
1825        ("DC2", '\DC2'),
1826        ("DC3", '\DC3'),
1827        ("DC4", '\DC4'),
1828        ("NAK", '\NAK'),
1829        ("SYN", '\SYN'),
1830        ("ETB", '\ETB'),
1831        ("CAN", '\CAN'),
1832        ("EM", '\EM'),
1833        ("SUB", '\SUB'),
1834        ("ESC", '\ESC'),
1835        ("FS", '\FS'),
1836        ("GS", '\GS'),
1837        ("RS", '\RS'),
1838        ("US", '\US'),
1839        ("SP", '\SP'),
1840        ("DEL", '\DEL')
1841        ]
1842
1843-- before calling lit_error, ensure that the current input is pointing to
1844-- the position of the error in the buffer.  This is so that we can report
1845-- a correct location to the user, but also so we can detect UTF-8 decoding
1846-- errors if they occur.
1847lit_error :: AlexInput -> P a
1848lit_error i = do setInput i; lexError "lexical error in string/character literal"
1849
1850getCharOrFail :: AlexInput -> P Char
1851getCharOrFail i =  do
1852  case alexGetChar' i of
1853        Nothing -> lexError "unexpected end-of-file in string/character literal"
1854        Just (c,i)  -> do setInput i; return c
1855
1856-- -----------------------------------------------------------------------------
1857-- QuasiQuote
1858
1859lex_qquasiquote_tok :: Action
1860lex_qquasiquote_tok span buf len = do
1861  let (qual, quoter) = splitQualName (stepOn buf) (len - 2) False
1862  quoteStart <- getRealSrcLoc
1863  quote <- lex_quasiquote quoteStart ""
1864  end <- getRealSrcLoc
1865  return (L (mkRealSrcSpan (realSrcSpanStart span) end)
1866           (ITqQuasiQuote (qual,
1867                           quoter,
1868                           mkFastString (reverse quote),
1869                           mkRealSrcSpan quoteStart end)))
1870
1871lex_quasiquote_tok :: Action
1872lex_quasiquote_tok span buf len = do
1873  let quoter = tail (lexemeToString buf (len - 1))
1874                -- 'tail' drops the initial '[',
1875                -- while the -1 drops the trailing '|'
1876  quoteStart <- getRealSrcLoc
1877  quote <- lex_quasiquote quoteStart ""
1878  end <- getRealSrcLoc
1879  return (L (mkRealSrcSpan (realSrcSpanStart span) end)
1880           (ITquasiQuote (mkFastString quoter,
1881                          mkFastString (reverse quote),
1882                          mkRealSrcSpan quoteStart end)))
1883
1884lex_quasiquote :: RealSrcLoc -> String -> P String
1885lex_quasiquote start s = do
1886  i <- getInput
1887  case alexGetChar' i of
1888    Nothing -> quasiquote_error start
1889
1890    -- NB: The string "|]" terminates the quasiquote,
1891    -- with absolutely no escaping. See the extensive
1892    -- discussion on #5348 for why there is no
1893    -- escape handling.
1894    Just ('|',i)
1895        | Just (']',i) <- alexGetChar' i
1896        -> do { setInput i; return s }
1897
1898    Just (c, i) -> do
1899         setInput i; lex_quasiquote start (c : s)
1900
1901quasiquote_error :: RealSrcLoc -> P a
1902quasiquote_error start = do
1903  (AI end buf) <- getInput
1904  reportLexError start end buf "unterminated quasiquotation"
1905
1906-- -----------------------------------------------------------------------------
1907-- Warnings
1908
1909warnTab :: Action
1910warnTab srcspan _buf _len = do
1911    addTabWarning srcspan
1912    lexToken
1913
1914warnThen :: WarningFlag -> SDoc -> Action -> Action
1915warnThen option warning action srcspan buf len = do
1916    addWarning option (RealSrcSpan srcspan) warning
1917    action srcspan buf len
1918
1919-- -----------------------------------------------------------------------------
1920-- The Parse Monad
1921
1922-- | Do we want to generate ';' layout tokens? In some cases we just want to
1923-- generate '}', e.g. in MultiWayIf we don't need ';'s because '|' separates
1924-- alternatives (unlike a `case` expression where we need ';' to as a separator
1925-- between alternatives).
1926type GenSemic = Bool
1927
1928generateSemic, dontGenerateSemic :: GenSemic
1929generateSemic     = True
1930dontGenerateSemic = False
1931
1932data LayoutContext
1933  = NoLayout
1934  | Layout !Int !GenSemic
1935  deriving Show
1936
1937-- | The result of running a parser.
1938data ParseResult a
1939  = POk      -- ^ The parser has consumed a (possibly empty) prefix
1940             --   of the input and produced a result. Use 'getMessages'
1941             --   to check for accumulated warnings and non-fatal errors.
1942      PState -- ^ The resulting parsing state. Can be used to resume parsing.
1943      a      -- ^ The resulting value.
1944  | PFailed  -- ^ The parser has consumed a (possibly empty) prefix
1945             --   of the input and failed.
1946      PState -- ^ The parsing state right before failure, including the fatal
1947             --   parse error. 'getMessages' and 'getErrorMessages' must return
1948             --   a non-empty bag of errors.
1949
1950-- | Test whether a 'WarningFlag' is set
1951warnopt :: WarningFlag -> ParserFlags -> Bool
1952warnopt f options = f `EnumSet.member` pWarningFlags options
1953
1954-- | The subset of the 'DynFlags' used by the parser.
1955-- See 'mkParserFlags' or 'mkParserFlags'' for ways to construct this.
1956data ParserFlags = ParserFlags {
1957    pWarningFlags   :: EnumSet WarningFlag
1958  , pThisPackage    :: UnitId      -- ^ key of package currently being compiled
1959  , pExtsBitmap     :: !ExtsBitmap -- ^ bitmap of permitted extensions
1960  }
1961
1962data PState = PState {
1963        buffer     :: StringBuffer,
1964        options    :: ParserFlags,
1965        -- This needs to take DynFlags as an argument until
1966        -- we have a fix for #10143
1967        messages   :: DynFlags -> Messages,
1968        tab_first  :: Maybe RealSrcSpan, -- pos of first tab warning in the file
1969        tab_count  :: !Int,              -- number of tab warnings in the file
1970        last_tk    :: Maybe Token,
1971        last_loc   :: RealSrcSpan, -- pos of previous token
1972        last_len   :: !Int,        -- len of previous token
1973        loc        :: RealSrcLoc,  -- current loc (end of prev token + 1)
1974        context    :: [LayoutContext],
1975        lex_state  :: [Int],
1976        srcfiles   :: [FastString],
1977        -- Used in the alternative layout rule:
1978        -- These tokens are the next ones to be sent out. They are
1979        -- just blindly emitted, without the rule looking at them again:
1980        alr_pending_implicit_tokens :: [RealLocated Token],
1981        -- This is the next token to be considered or, if it is Nothing,
1982        -- we need to get the next token from the input stream:
1983        alr_next_token :: Maybe (RealLocated Token),
1984        -- This is what we consider to be the location of the last token
1985        -- emitted:
1986        alr_last_loc :: RealSrcSpan,
1987        -- The stack of layout contexts:
1988        alr_context :: [ALRContext],
1989        -- Are we expecting a '{'? If it's Just, then the ALRLayout tells
1990        -- us what sort of layout the '{' will open:
1991        alr_expecting_ocurly :: Maybe ALRLayout,
1992        -- Have we just had the '}' for a let block? If so, than an 'in'
1993        -- token doesn't need to close anything:
1994        alr_justClosedExplicitLetBlock :: Bool,
1995
1996        -- The next three are used to implement Annotations giving the
1997        -- locations of 'noise' tokens in the source, so that users of
1998        -- the GHC API can do source to source conversions.
1999        -- See note [Api annotations] in ApiAnnotation.hs
2000        annotations :: [(ApiAnnKey,[SrcSpan])],
2001        comment_q :: [Located AnnotationComment],
2002        annotations_comments :: [(SrcSpan,[Located AnnotationComment])]
2003     }
2004        -- last_loc and last_len are used when generating error messages,
2005        -- and in pushCurrentContext only.  Sigh, if only Happy passed the
2006        -- current token to happyError, we could at least get rid of last_len.
2007        -- Getting rid of last_loc would require finding another way to
2008        -- implement pushCurrentContext (which is only called from one place).
2009
2010data ALRContext = ALRNoLayout Bool{- does it contain commas? -}
2011                              Bool{- is it a 'let' block? -}
2012                | ALRLayout ALRLayout Int
2013data ALRLayout = ALRLayoutLet
2014               | ALRLayoutWhere
2015               | ALRLayoutOf
2016               | ALRLayoutDo
2017
2018-- | The parsing monad, isomorphic to @StateT PState Maybe@.
2019newtype P a = P { unP :: PState -> ParseResult a }
2020
2021instance Functor P where
2022  fmap = liftM
2023
2024instance Applicative P where
2025  pure = returnP
2026  (<*>) = ap
2027
2028instance Monad P where
2029  (>>=) = thenP
2030#if !MIN_VERSION_base(4,13,0)
2031  fail = MonadFail.fail
2032#endif
2033
2034instance MonadFail.MonadFail P where
2035  fail = failMsgP
2036
2037returnP :: a -> P a
2038returnP a = a `seq` (P $ \s -> POk s a)
2039
2040thenP :: P a -> (a -> P b) -> P b
2041(P m) `thenP` k = P $ \ s ->
2042        case m s of
2043                POk s1 a         -> (unP (k a)) s1
2044                PFailed s1 -> PFailed s1
2045
2046failMsgP :: String -> P a
2047failMsgP msg = do
2048  pState <- getPState
2049  addFatalError (RealSrcSpan (last_loc pState)) (text msg)
2050
2051failLocMsgP :: RealSrcLoc -> RealSrcLoc -> String -> P a
2052failLocMsgP loc1 loc2 str =
2053  addFatalError (RealSrcSpan (mkRealSrcSpan loc1 loc2)) (text str)
2054
2055getPState :: P PState
2056getPState = P $ \s -> POk s s
2057
2058withThisPackage :: (UnitId -> a) -> P a
2059withThisPackage f = P $ \s@(PState{options = o}) -> POk s (f (pThisPackage o))
2060
2061getExts :: P ExtsBitmap
2062getExts = P $ \s -> POk s (pExtsBitmap . options $ s)
2063
2064setExts :: (ExtsBitmap -> ExtsBitmap) -> P ()
2065setExts f = P $ \s -> POk s {
2066  options =
2067    let p = options s
2068    in  p { pExtsBitmap = f (pExtsBitmap p) }
2069  } ()
2070
2071setSrcLoc :: RealSrcLoc -> P ()
2072setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
2073
2074getRealSrcLoc :: P RealSrcLoc
2075getRealSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
2076
2077addSrcFile :: FastString -> P ()
2078addSrcFile f = P $ \s -> POk s{ srcfiles = f : srcfiles s } ()
2079
2080setLastToken :: RealSrcSpan -> Int -> P ()
2081setLastToken loc len = P $ \s -> POk s {
2082  last_loc=loc,
2083  last_len=len
2084  } ()
2085
2086setLastTk :: Token -> P ()
2087setLastTk tk = P $ \s -> POk s { last_tk = Just tk } ()
2088
2089getLastTk :: P (Maybe Token)
2090getLastTk = P $ \s@(PState { last_tk = last_tk }) -> POk s last_tk
2091
2092data AlexInput = AI RealSrcLoc StringBuffer
2093
2094{-
2095Note [Unicode in Alex]
2096~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2097Although newer versions of Alex support unicode, this grammar is processed with
2098the old style '--latin1' behaviour. This means that when implementing the
2099functions
2100
2101    alexGetByte       :: AlexInput -> Maybe (Word8,AlexInput)
2102    alexInputPrevChar :: AlexInput -> Char
2103
2104which Alex uses to take apart our 'AlexInput', we must
2105
2106  * return a latin1 character in the 'Word8' that 'alexGetByte' expects
2107  * return a latin1 character in 'alexInputPrevChar'.
2108
2109We handle this in 'adjustChar' by squishing entire classes of unicode
2110characters into single bytes.
2111-}
2112
2113{-# INLINE adjustChar #-}
2114adjustChar :: Char -> Word8
2115adjustChar c = fromIntegral $ ord adj_c
2116  where non_graphic     = '\x00'
2117        upper           = '\x01'
2118        lower           = '\x02'
2119        digit           = '\x03'
2120        symbol          = '\x04'
2121        space           = '\x05'
2122        other_graphic   = '\x06'
2123        uniidchar       = '\x07'
2124
2125        adj_c
2126          | c <= '\x07' = non_graphic
2127          | c <= '\x7f' = c
2128          -- Alex doesn't handle Unicode, so when Unicode
2129          -- character is encountered we output these values
2130          -- with the actual character value hidden in the state.
2131          | otherwise =
2132                -- NB: The logic behind these definitions is also reflected
2133                -- in basicTypes/Lexeme.hs
2134                -- Any changes here should likely be reflected there.
2135
2136                case generalCategory c of
2137                  UppercaseLetter       -> upper
2138                  LowercaseLetter       -> lower
2139                  TitlecaseLetter       -> upper
2140                  ModifierLetter        -> uniidchar -- see #10196
2141                  OtherLetter           -> lower -- see #1103
2142                  NonSpacingMark        -> uniidchar -- see #7650
2143                  SpacingCombiningMark  -> other_graphic
2144                  EnclosingMark         -> other_graphic
2145                  DecimalNumber         -> digit
2146                  LetterNumber          -> other_graphic
2147                  OtherNumber           -> digit -- see #4373
2148                  ConnectorPunctuation  -> symbol
2149                  DashPunctuation       -> symbol
2150                  OpenPunctuation       -> other_graphic
2151                  ClosePunctuation      -> other_graphic
2152                  InitialQuote          -> other_graphic
2153                  FinalQuote            -> other_graphic
2154                  OtherPunctuation      -> symbol
2155                  MathSymbol            -> symbol
2156                  CurrencySymbol        -> symbol
2157                  ModifierSymbol        -> symbol
2158                  OtherSymbol           -> symbol
2159                  Space                 -> space
2160                  _other                -> non_graphic
2161
2162-- Getting the previous 'Char' isn't enough here - we need to convert it into
2163-- the same format that 'alexGetByte' would have produced.
2164--
2165-- See Note [Unicode in Alex] and #13986.
2166alexInputPrevChar :: AlexInput -> Char
2167alexInputPrevChar (AI _ buf) = chr (fromIntegral (adjustChar pc))
2168  where pc = prevChar buf '\n'
2169
2170-- backwards compatibility for Alex 2.x
2171alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
2172alexGetChar inp = case alexGetByte inp of
2173                    Nothing    -> Nothing
2174                    Just (b,i) -> c `seq` Just (c,i)
2175                       where c = chr $ fromIntegral b
2176
2177-- See Note [Unicode in Alex]
2178alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
2179alexGetByte (AI loc s)
2180  | atEnd s   = Nothing
2181  | otherwise = byte `seq` loc' `seq` s' `seq`
2182                --trace (show (ord c)) $
2183                Just (byte, (AI loc' s'))
2184  where (c,s') = nextChar s
2185        loc'   = advanceSrcLoc loc c
2186        byte   = adjustChar c
2187
2188-- This version does not squash unicode characters, it is used when
2189-- lexing strings.
2190alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
2191alexGetChar' (AI loc s)
2192  | atEnd s   = Nothing
2193  | otherwise = c `seq` loc' `seq` s' `seq`
2194                --trace (show (ord c)) $
2195                Just (c, (AI loc' s'))
2196  where (c,s') = nextChar s
2197        loc'   = advanceSrcLoc loc c
2198
2199getInput :: P AlexInput
2200getInput = P $ \s@PState{ loc=l, buffer=b } -> POk s (AI l b)
2201
2202setInput :: AlexInput -> P ()
2203setInput (AI l b) = P $ \s -> POk s{ loc=l, buffer=b } ()
2204
2205nextIsEOF :: P Bool
2206nextIsEOF = do
2207  AI _ s <- getInput
2208  return $ atEnd s
2209
2210pushLexState :: Int -> P ()
2211pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
2212
2213popLexState :: P Int
2214popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
2215
2216getLexState :: P Int
2217getLexState = P $ \s@PState{ lex_state=ls:_ } -> POk s ls
2218
2219popNextToken :: P (Maybe (RealLocated Token))
2220popNextToken
2221    = P $ \s@PState{ alr_next_token = m } ->
2222              POk (s {alr_next_token = Nothing}) m
2223
2224activeContext :: P Bool
2225activeContext = do
2226  ctxt <- getALRContext
2227  expc <- getAlrExpectingOCurly
2228  impt <- implicitTokenPending
2229  case (ctxt,expc) of
2230    ([],Nothing) -> return impt
2231    _other       -> return True
2232
2233setAlrLastLoc :: RealSrcSpan -> P ()
2234setAlrLastLoc l = P $ \s -> POk (s {alr_last_loc = l}) ()
2235
2236getAlrLastLoc :: P RealSrcSpan
2237getAlrLastLoc = P $ \s@(PState {alr_last_loc = l}) -> POk s l
2238
2239getALRContext :: P [ALRContext]
2240getALRContext = P $ \s@(PState {alr_context = cs}) -> POk s cs
2241
2242setALRContext :: [ALRContext] -> P ()
2243setALRContext cs = P $ \s -> POk (s {alr_context = cs}) ()
2244
2245getJustClosedExplicitLetBlock :: P Bool
2246getJustClosedExplicitLetBlock
2247 = P $ \s@(PState {alr_justClosedExplicitLetBlock = b}) -> POk s b
2248
2249setJustClosedExplicitLetBlock :: Bool -> P ()
2250setJustClosedExplicitLetBlock b
2251 = P $ \s -> POk (s {alr_justClosedExplicitLetBlock = b}) ()
2252
2253setNextToken :: RealLocated Token -> P ()
2254setNextToken t = P $ \s -> POk (s {alr_next_token = Just t}) ()
2255
2256implicitTokenPending :: P Bool
2257implicitTokenPending
2258    = P $ \s@PState{ alr_pending_implicit_tokens = ts } ->
2259              case ts of
2260              [] -> POk s False
2261              _  -> POk s True
2262
2263popPendingImplicitToken :: P (Maybe (RealLocated Token))
2264popPendingImplicitToken
2265    = P $ \s@PState{ alr_pending_implicit_tokens = ts } ->
2266              case ts of
2267              [] -> POk s Nothing
2268              (t : ts') -> POk (s {alr_pending_implicit_tokens = ts'}) (Just t)
2269
2270setPendingImplicitTokens :: [RealLocated Token] -> P ()
2271setPendingImplicitTokens ts = P $ \s -> POk (s {alr_pending_implicit_tokens = ts}) ()
2272
2273getAlrExpectingOCurly :: P (Maybe ALRLayout)
2274getAlrExpectingOCurly = P $ \s@(PState {alr_expecting_ocurly = b}) -> POk s b
2275
2276setAlrExpectingOCurly :: Maybe ALRLayout -> P ()
2277setAlrExpectingOCurly b = P $ \s -> POk (s {alr_expecting_ocurly = b}) ()
2278
2279-- | For reasons of efficiency, boolean parsing flags (eg, language extensions
2280-- or whether we are currently in a @RULE@ pragma) are represented by a bitmap
2281-- stored in a @Word64@.
2282type ExtsBitmap = Word64
2283
2284xbit :: ExtBits -> ExtsBitmap
2285xbit = bit . fromEnum
2286
2287xtest :: ExtBits -> ExtsBitmap -> Bool
2288xtest ext xmap = testBit xmap (fromEnum ext)
2289
2290-- | Various boolean flags, mostly language extensions, that impact lexing and
2291-- parsing. Note that a handful of these can change during lexing/parsing.
2292data ExtBits
2293  -- Flags that are constant once parsing starts
2294  = FfiBit
2295  | InterruptibleFfiBit
2296  | CApiFfiBit
2297  | ArrowsBit
2298  | ThBit
2299  | ThQuotesBit
2300  | IpBit
2301  | OverloadedLabelsBit -- #x overloaded labels
2302  | ExplicitForallBit -- the 'forall' keyword
2303  | BangPatBit -- Tells the parser to understand bang-patterns
2304               -- (doesn't affect the lexer)
2305  | PatternSynonymsBit -- pattern synonyms
2306  | HaddockBit-- Lex and parse Haddock comments
2307  | MagicHashBit -- "#" in both functions and operators
2308  | RecursiveDoBit -- mdo
2309  | UnicodeSyntaxBit -- the forall symbol, arrow symbols, etc
2310  | UnboxedTuplesBit -- (# and #)
2311  | UnboxedSumsBit -- (# and #)
2312  | DatatypeContextsBit
2313  | MonadComprehensionsBit
2314  | TransformComprehensionsBit
2315  | QqBit -- enable quasiquoting
2316  | RawTokenStreamBit -- producing a token stream with all comments included
2317  | AlternativeLayoutRuleBit
2318  | ALRTransitionalBit
2319  | RelaxedLayoutBit
2320  | NondecreasingIndentationBit
2321  | SafeHaskellBit
2322  | TraditionalRecordSyntaxBit
2323  | ExplicitNamespacesBit
2324  | LambdaCaseBit
2325  | BinaryLiteralsBit
2326  | NegativeLiteralsBit
2327  | HexFloatLiteralsBit
2328  | TypeApplicationsBit
2329  | StaticPointersBit
2330  | NumericUnderscoresBit
2331  | StarIsTypeBit
2332  | BlockArgumentsBit
2333  | NPlusKPatternsBit
2334  | DoAndIfThenElseBit
2335  | MultiWayIfBit
2336  | GadtSyntaxBit
2337  | ImportQualifiedPostBit
2338
2339  -- Flags that are updated once parsing starts
2340  | InRulePragBit
2341  | InNestedCommentBit -- See Note [Nested comment line pragmas]
2342  | UsePosPragsBit
2343    -- ^ If this is enabled, '{-# LINE ... -#}' and '{-# COLUMN ... #-}'
2344    -- update the internal position. Otherwise, those pragmas are lexed as
2345    -- tokens of their own.
2346  deriving Enum
2347
2348
2349
2350
2351
2352-- PState for parsing options pragmas
2353--
2354pragState :: DynFlags -> StringBuffer -> RealSrcLoc -> PState
2355pragState dynflags buf loc = (mkPState dynflags buf loc) {
2356                                 lex_state = [bol, option_prags, 0]
2357                             }
2358
2359{-# INLINE mkParserFlags' #-}
2360mkParserFlags'
2361  :: EnumSet WarningFlag        -- ^ warnings flags enabled
2362  -> EnumSet LangExt.Extension  -- ^ permitted language extensions enabled
2363  -> UnitId                     -- ^ key of package currently being compiled
2364  -> Bool                       -- ^ are safe imports on?
2365  -> Bool                       -- ^ keeping Haddock comment tokens
2366  -> Bool                       -- ^ keep regular comment tokens
2367
2368  -> Bool
2369  -- ^ If this is enabled, '{-# LINE ... -#}' and '{-# COLUMN ... #-}' update
2370  -- the internal position kept by the parser. Otherwise, those pragmas are
2371  -- lexed as 'ITline_prag' and 'ITcolumn_prag' tokens.
2372
2373  -> ParserFlags
2374-- ^ Given exactly the information needed, set up the 'ParserFlags'
2375mkParserFlags' warningFlags extensionFlags thisPackage
2376  safeImports isHaddock rawTokStream usePosPrags =
2377    ParserFlags {
2378      pWarningFlags = warningFlags
2379    , pThisPackage = thisPackage
2380    , pExtsBitmap = safeHaskellBit .|. langExtBits .|. optBits
2381    }
2382  where
2383    safeHaskellBit = SafeHaskellBit `setBitIf` safeImports
2384    langExtBits =
2385          FfiBit                      `xoptBit` LangExt.ForeignFunctionInterface
2386      .|. InterruptibleFfiBit         `xoptBit` LangExt.InterruptibleFFI
2387      .|. CApiFfiBit                  `xoptBit` LangExt.CApiFFI
2388      .|. ArrowsBit                   `xoptBit` LangExt.Arrows
2389      .|. ThBit                       `xoptBit` LangExt.TemplateHaskell
2390      .|. ThQuotesBit                 `xoptBit` LangExt.TemplateHaskellQuotes
2391      .|. QqBit                       `xoptBit` LangExt.QuasiQuotes
2392      .|. IpBit                       `xoptBit` LangExt.ImplicitParams
2393      .|. OverloadedLabelsBit         `xoptBit` LangExt.OverloadedLabels
2394      .|. ExplicitForallBit           `xoptBit` LangExt.ExplicitForAll
2395      .|. BangPatBit                  `xoptBit` LangExt.BangPatterns
2396      .|. MagicHashBit                `xoptBit` LangExt.MagicHash
2397      .|. RecursiveDoBit              `xoptBit` LangExt.RecursiveDo
2398      .|. UnicodeSyntaxBit            `xoptBit` LangExt.UnicodeSyntax
2399      .|. UnboxedTuplesBit            `xoptBit` LangExt.UnboxedTuples
2400      .|. UnboxedSumsBit              `xoptBit` LangExt.UnboxedSums
2401      .|. DatatypeContextsBit         `xoptBit` LangExt.DatatypeContexts
2402      .|. TransformComprehensionsBit  `xoptBit` LangExt.TransformListComp
2403      .|. MonadComprehensionsBit      `xoptBit` LangExt.MonadComprehensions
2404      .|. AlternativeLayoutRuleBit    `xoptBit` LangExt.AlternativeLayoutRule
2405      .|. ALRTransitionalBit          `xoptBit` LangExt.AlternativeLayoutRuleTransitional
2406      .|. RelaxedLayoutBit            `xoptBit` LangExt.RelaxedLayout
2407      .|. NondecreasingIndentationBit `xoptBit` LangExt.NondecreasingIndentation
2408      .|. TraditionalRecordSyntaxBit  `xoptBit` LangExt.TraditionalRecordSyntax
2409      .|. ExplicitNamespacesBit       `xoptBit` LangExt.ExplicitNamespaces
2410      .|. LambdaCaseBit               `xoptBit` LangExt.LambdaCase
2411      .|. BinaryLiteralsBit           `xoptBit` LangExt.BinaryLiterals
2412      .|. NegativeLiteralsBit         `xoptBit` LangExt.NegativeLiterals
2413      .|. HexFloatLiteralsBit         `xoptBit` LangExt.HexFloatLiterals
2414      .|. PatternSynonymsBit          `xoptBit` LangExt.PatternSynonyms
2415      .|. TypeApplicationsBit         `xoptBit` LangExt.TypeApplications
2416      .|. StaticPointersBit           `xoptBit` LangExt.StaticPointers
2417      .|. NumericUnderscoresBit       `xoptBit` LangExt.NumericUnderscores
2418      .|. StarIsTypeBit               `xoptBit` LangExt.StarIsType
2419      .|. BlockArgumentsBit           `xoptBit` LangExt.BlockArguments
2420      .|. NPlusKPatternsBit           `xoptBit` LangExt.NPlusKPatterns
2421      .|. DoAndIfThenElseBit          `xoptBit` LangExt.DoAndIfThenElse
2422      .|. MultiWayIfBit               `xoptBit` LangExt.MultiWayIf
2423      .|. GadtSyntaxBit               `xoptBit` LangExt.GADTSyntax
2424      .|. ImportQualifiedPostBit      `xoptBit` LangExt.ImportQualifiedPost
2425    optBits =
2426          HaddockBit        `setBitIf` isHaddock
2427      .|. RawTokenStreamBit `setBitIf` rawTokStream
2428      .|. UsePosPragsBit    `setBitIf` usePosPrags
2429
2430    xoptBit bit ext = bit `setBitIf` EnumSet.member ext extensionFlags
2431
2432    setBitIf :: ExtBits -> Bool -> ExtsBitmap
2433    b `setBitIf` cond | cond      = xbit b
2434                      | otherwise = 0
2435
2436-- | Extracts the flag information needed for parsing
2437mkParserFlags :: DynFlags -> ParserFlags
2438mkParserFlags =
2439  mkParserFlags'
2440    <$> DynFlags.warningFlags
2441    <*> DynFlags.extensionFlags
2442    <*> DynFlags.thisPackage
2443    <*> safeImportsOn
2444    <*> gopt Opt_Haddock
2445    <*> gopt Opt_KeepRawTokenStream
2446    <*> const True
2447
2448-- | Creates a parse state from a 'DynFlags' value
2449mkPState :: DynFlags -> StringBuffer -> RealSrcLoc -> PState
2450mkPState flags = mkPStatePure (mkParserFlags flags)
2451
2452-- | Creates a parse state from a 'ParserFlags' value
2453mkPStatePure :: ParserFlags -> StringBuffer -> RealSrcLoc -> PState
2454mkPStatePure options buf loc =
2455  PState {
2456      buffer        = buf,
2457      options       = options,
2458      messages      = const emptyMessages,
2459      tab_first     = Nothing,
2460      tab_count     = 0,
2461      last_tk       = Nothing,
2462      last_loc      = mkRealSrcSpan loc loc,
2463      last_len      = 0,
2464      loc           = loc,
2465      context       = [],
2466      lex_state     = [bol, 0],
2467      srcfiles      = [],
2468      alr_pending_implicit_tokens = [],
2469      alr_next_token = Nothing,
2470      alr_last_loc = alrInitialLoc (fsLit "<no file>"),
2471      alr_context = [],
2472      alr_expecting_ocurly = Nothing,
2473      alr_justClosedExplicitLetBlock = False,
2474      annotations = [],
2475      comment_q = [],
2476      annotations_comments = []
2477    }
2478
2479-- | An mtl-style class for monads that support parsing-related operations.
2480-- For example, sometimes we make a second pass over the parsing results to validate,
2481-- disambiguate, or rearrange them, and we do so in the PV monad which cannot consume
2482-- input but can report parsing errors, check for extension bits, and accumulate
2483-- parsing annotations. Both P and PV are instances of MonadP.
2484--
2485-- MonadP grants us convenient overloading. The other option is to have separate operations
2486-- for each monad: addErrorP vs addErrorPV, getBitP vs getBitPV, and so on.
2487--
2488class Monad m => MonadP m where
2489  -- | Add a non-fatal error. Use this when the parser can produce a result
2490  --   despite the error.
2491  --
2492  --   For example, when GHC encounters a @forall@ in a type,
2493  --   but @-XExplicitForAll@ is disabled, the parser constructs @ForAllTy@
2494  --   as if @-XExplicitForAll@ was enabled, adding a non-fatal error to
2495  --   the accumulator.
2496  --
2497  --   Control flow wise, non-fatal errors act like warnings: they are added
2498  --   to the accumulator and parsing continues. This allows GHC to report
2499  --   more than one parse error per file.
2500  --
2501  addError :: SrcSpan -> SDoc -> m ()
2502  -- | Add a warning to the accumulator.
2503  --   Use 'getMessages' to get the accumulated warnings.
2504  addWarning :: WarningFlag -> SrcSpan -> SDoc -> m ()
2505  -- | Add a fatal error. This will be the last error reported by the parser, and
2506  --   the parser will not produce any result, ending in a 'PFailed' state.
2507  addFatalError :: SrcSpan -> SDoc -> m a
2508  -- | Check if a given flag is currently set in the bitmap.
2509  getBit :: ExtBits -> m Bool
2510  -- | Given a location and a list of AddAnn, apply them all to the location.
2511  addAnnotation :: SrcSpan          -- SrcSpan of enclosing AST construct
2512                -> AnnKeywordId     -- The first two parameters are the key
2513                -> SrcSpan          -- The location of the keyword itself
2514                -> m ()
2515
2516appendError
2517  :: SrcSpan
2518  -> SDoc
2519  -> (DynFlags -> Messages)
2520  -> (DynFlags -> Messages)
2521appendError srcspan msg m =
2522  \d ->
2523    let (ws, es) = m d
2524        errormsg = mkErrMsg d srcspan alwaysQualify msg
2525        es' = es `snocBag` errormsg
2526    in (ws, es')
2527
2528appendWarning
2529  :: ParserFlags
2530  -> WarningFlag
2531  -> SrcSpan
2532  -> SDoc
2533  -> (DynFlags -> Messages)
2534  -> (DynFlags -> Messages)
2535appendWarning o option srcspan warning m =
2536  \d ->
2537    let (ws, es) = m d
2538        warning' = makeIntoWarning (Reason option) $
2539           mkWarnMsg d srcspan alwaysQualify warning
2540        ws' = if warnopt option o then ws `snocBag` warning' else ws
2541    in (ws', es)
2542
2543instance MonadP P where
2544  addError srcspan msg
2545   = P $ \s@PState{messages=m} ->
2546             POk s{messages=appendError srcspan msg m} ()
2547  addWarning option srcspan warning
2548   = P $ \s@PState{messages=m, options=o} ->
2549             POk s{messages=appendWarning o option srcspan warning m} ()
2550  addFatalError span msg =
2551    addError span msg >> P PFailed
2552  getBit ext = P $ \s -> let b =  ext `xtest` pExtsBitmap (options s)
2553                         in b `seq` POk s b
2554  addAnnotation l a v = do
2555    addAnnotationOnly l a v
2556    allocateCommentsP l
2557
2558addAnnsAt :: MonadP m => SrcSpan -> [AddAnn] -> m ()
2559addAnnsAt l = mapM_ (\(AddAnn a v) -> addAnnotation l a v)
2560
2561addTabWarning :: RealSrcSpan -> P ()
2562addTabWarning srcspan
2563 = P $ \s@PState{tab_first=tf, tab_count=tc, options=o} ->
2564       let tf' = if isJust tf then tf else Just srcspan
2565           tc' = tc + 1
2566           s' = if warnopt Opt_WarnTabs o
2567                then s{tab_first = tf', tab_count = tc'}
2568                else s
2569       in POk s' ()
2570
2571mkTabWarning :: PState -> DynFlags -> Maybe ErrMsg
2572mkTabWarning PState{tab_first=tf, tab_count=tc} d =
2573  let middle = if tc == 1
2574        then text ""
2575        else text ", and in" <+> speakNOf (tc - 1) (text "further location")
2576      message = text "Tab character found here"
2577                <> middle
2578                <> text "."
2579                $+$ text "Please use spaces instead."
2580  in fmap (\s -> makeIntoWarning (Reason Opt_WarnTabs) $
2581                 mkWarnMsg d (RealSrcSpan s) alwaysQualify message) tf
2582
2583-- | Get a bag of the errors that have been accumulated so far.
2584--   Does not take -Werror into account.
2585getErrorMessages :: PState -> DynFlags -> ErrorMessages
2586getErrorMessages PState{messages=m} d =
2587  let (_, es) = m d in es
2588
2589-- | Get the warnings and errors accumulated so far.
2590--   Does not take -Werror into account.
2591getMessages :: PState -> DynFlags -> Messages
2592getMessages p@PState{messages=m} d =
2593  let (ws, es) = m d
2594      tabwarning = mkTabWarning p d
2595      ws' = maybe ws (`consBag` ws) tabwarning
2596  in (ws', es)
2597
2598getContext :: P [LayoutContext]
2599getContext = P $ \s@PState{context=ctx} -> POk s ctx
2600
2601setContext :: [LayoutContext] -> P ()
2602setContext ctx = P $ \s -> POk s{context=ctx} ()
2603
2604popContext :: P ()
2605popContext = P $ \ s@(PState{ buffer = buf, options = o, context = ctx,
2606                              last_len = len, last_loc = last_loc }) ->
2607  case ctx of
2608        (_:tl) ->
2609          POk s{ context = tl } ()
2610        []     ->
2611          unP (addFatalError (RealSrcSpan last_loc) (srcParseErr o buf len)) s
2612
2613-- Push a new layout context at the indentation of the last token read.
2614pushCurrentContext :: GenSemic -> P ()
2615pushCurrentContext gen_semic = P $ \ s@PState{ last_loc=loc, context=ctx } ->
2616    POk s{context = Layout (srcSpanStartCol loc) gen_semic : ctx} ()
2617
2618-- This is only used at the outer level of a module when the 'module' keyword is
2619-- missing.
2620pushModuleContext :: P ()
2621pushModuleContext = pushCurrentContext generateSemic
2622
2623getOffside :: P (Ordering, Bool)
2624getOffside = P $ \s@PState{last_loc=loc, context=stk} ->
2625                let offs = srcSpanStartCol loc in
2626                let ord = case stk of
2627                            Layout n gen_semic : _ ->
2628                              --trace ("layout: " ++ show n ++ ", offs: " ++ show offs) $
2629                              (compare offs n, gen_semic)
2630                            _ ->
2631                              (GT, dontGenerateSemic)
2632                in POk s ord
2633
2634-- ---------------------------------------------------------------------------
2635-- Construct a parse error
2636
2637srcParseErr
2638  :: ParserFlags
2639  -> StringBuffer       -- current buffer (placed just after the last token)
2640  -> Int                -- length of the previous token
2641  -> MsgDoc
2642srcParseErr options buf len
2643  = if null token
2644         then text "parse error (possibly incorrect indentation or mismatched brackets)"
2645         else text "parse error on input" <+> quotes (text token)
2646              $$ ppWhen (not th_enabled && token == "$") -- #7396
2647                        (text "Perhaps you intended to use TemplateHaskell")
2648              $$ ppWhen (token == "<-")
2649                        (if mdoInLast100
2650                           then text "Perhaps you intended to use RecursiveDo"
2651                           else text "Perhaps this statement should be within a 'do' block?")
2652              $$ ppWhen (token == "=" && doInLast100) -- #15849
2653                        (text "Perhaps you need a 'let' in a 'do' block?"
2654                         $$ text "e.g. 'let x = 5' instead of 'x = 5'")
2655              $$ ppWhen (not ps_enabled && pattern == "pattern ") -- #12429
2656                        (text "Perhaps you intended to use PatternSynonyms")
2657  where token = lexemeToString (offsetBytes (-len) buf) len
2658        pattern = decodePrevNChars 8 buf
2659        last100 = decodePrevNChars 100 buf
2660        doInLast100 = "do" `isInfixOf` last100
2661        mdoInLast100 = "mdo" `isInfixOf` last100
2662        th_enabled = ThBit `xtest` pExtsBitmap options
2663        ps_enabled = PatternSynonymsBit `xtest` pExtsBitmap options
2664
2665-- Report a parse failure, giving the span of the previous token as
2666-- the location of the error.  This is the entry point for errors
2667-- detected during parsing.
2668srcParseFail :: P a
2669srcParseFail = P $ \s@PState{ buffer = buf, options = o, last_len = len,
2670                            last_loc = last_loc } ->
2671    unP (addFatalError (RealSrcSpan last_loc) (srcParseErr o buf len)) s
2672
2673-- A lexical error is reported at a particular position in the source file,
2674-- not over a token range.
2675lexError :: String -> P a
2676lexError str = do
2677  loc <- getRealSrcLoc
2678  (AI end buf) <- getInput
2679  reportLexError loc end buf str
2680
2681-- -----------------------------------------------------------------------------
2682-- This is the top-level function: called from the parser each time a
2683-- new token is to be read from the input.
2684
2685lexer :: Bool -> (Located Token -> P a) -> P a
2686lexer queueComments cont = do
2687  alr <- getBit AlternativeLayoutRuleBit
2688  let lexTokenFun = if alr then lexTokenAlr else lexToken
2689  (L span tok) <- lexTokenFun
2690  --trace ("token: " ++ show tok) $ do
2691
2692  case tok of
2693    ITeof -> addAnnotationOnly noSrcSpan AnnEofPos (RealSrcSpan span)
2694    _ -> return ()
2695
2696  if (queueComments && isDocComment tok)
2697    then queueComment (L (RealSrcSpan span) tok)
2698    else return ()
2699
2700  if (queueComments && isComment tok)
2701    then queueComment (L (RealSrcSpan span) tok) >> lexer queueComments cont
2702    else cont (L (RealSrcSpan span) tok)
2703
2704lexTokenAlr :: P (RealLocated Token)
2705lexTokenAlr = do mPending <- popPendingImplicitToken
2706                 t <- case mPending of
2707                      Nothing ->
2708                          do mNext <- popNextToken
2709                             t <- case mNext of
2710                                  Nothing -> lexToken
2711                                  Just next -> return next
2712                             alternativeLayoutRuleToken t
2713                      Just t ->
2714                          return t
2715                 setAlrLastLoc (getRealSrcSpan t)
2716                 case unRealSrcSpan t of
2717                     ITwhere -> setAlrExpectingOCurly (Just ALRLayoutWhere)
2718                     ITlet   -> setAlrExpectingOCurly (Just ALRLayoutLet)
2719                     ITof    -> setAlrExpectingOCurly (Just ALRLayoutOf)
2720                     ITlcase -> setAlrExpectingOCurly (Just ALRLayoutOf)
2721                     ITdo    -> setAlrExpectingOCurly (Just ALRLayoutDo)
2722                     ITmdo   -> setAlrExpectingOCurly (Just ALRLayoutDo)
2723                     ITrec   -> setAlrExpectingOCurly (Just ALRLayoutDo)
2724                     _       -> return ()
2725                 return t
2726
2727alternativeLayoutRuleToken :: RealLocated Token -> P (RealLocated Token)
2728alternativeLayoutRuleToken t
2729    = do context <- getALRContext
2730         lastLoc <- getAlrLastLoc
2731         mExpectingOCurly <- getAlrExpectingOCurly
2732         transitional <- getBit ALRTransitionalBit
2733         justClosedExplicitLetBlock <- getJustClosedExplicitLetBlock
2734         setJustClosedExplicitLetBlock False
2735         let thisLoc = getRealSrcSpan t
2736             thisCol = srcSpanStartCol thisLoc
2737             newLine = srcSpanStartLine thisLoc > srcSpanEndLine lastLoc
2738         case (unRealSrcSpan t, context, mExpectingOCurly) of
2739             -- This case handles a GHC extension to the original H98
2740             -- layout rule...
2741             (ITocurly, _, Just alrLayout) ->
2742                 do setAlrExpectingOCurly Nothing
2743                    let isLet = case alrLayout of
2744                                ALRLayoutLet -> True
2745                                _ -> False
2746                    setALRContext (ALRNoLayout (containsCommas ITocurly) isLet : context)
2747                    return t
2748             -- ...and makes this case unnecessary
2749             {-
2750             -- I think our implicit open-curly handling is slightly
2751             -- different to John's, in how it interacts with newlines
2752             -- and "in"
2753             (ITocurly, _, Just _) ->
2754                 do setAlrExpectingOCurly Nothing
2755                    setNextToken t
2756                    lexTokenAlr
2757             -}
2758             (_, ALRLayout _ col : _ls, Just expectingOCurly)
2759              | (thisCol > col) ||
2760                (thisCol == col &&
2761                 isNonDecreasingIndentation expectingOCurly) ->
2762                 do setAlrExpectingOCurly Nothing
2763                    setALRContext (ALRLayout expectingOCurly thisCol : context)
2764                    setNextToken t
2765                    return (L thisLoc ITvocurly)
2766              | otherwise ->
2767                 do setAlrExpectingOCurly Nothing
2768                    setPendingImplicitTokens [L lastLoc ITvccurly]
2769                    setNextToken t
2770                    return (L lastLoc ITvocurly)
2771             (_, _, Just expectingOCurly) ->
2772                 do setAlrExpectingOCurly Nothing
2773                    setALRContext (ALRLayout expectingOCurly thisCol : context)
2774                    setNextToken t
2775                    return (L thisLoc ITvocurly)
2776             -- We do the [] cases earlier than in the spec, as we
2777             -- have an actual EOF token
2778             (ITeof, ALRLayout _ _ : ls, _) ->
2779                 do setALRContext ls
2780                    setNextToken t
2781                    return (L thisLoc ITvccurly)
2782             (ITeof, _, _) ->
2783                 return t
2784             -- the other ITeof case omitted; general case below covers it
2785             (ITin, _, _)
2786              | justClosedExplicitLetBlock ->
2787                 return t
2788             (ITin, ALRLayout ALRLayoutLet _ : ls, _)
2789              | newLine ->
2790                 do setPendingImplicitTokens [t]
2791                    setALRContext ls
2792                    return (L thisLoc ITvccurly)
2793             -- This next case is to handle a transitional issue:
2794             (ITwhere, ALRLayout _ col : ls, _)
2795              | newLine && thisCol == col && transitional ->
2796                 do addWarning Opt_WarnAlternativeLayoutRuleTransitional
2797                               (RealSrcSpan thisLoc)
2798                               (transitionalAlternativeLayoutWarning
2799                                    "`where' clause at the same depth as implicit layout block")
2800                    setALRContext ls
2801                    setNextToken t
2802                    -- Note that we use lastLoc, as we may need to close
2803                    -- more layouts, or give a semicolon
2804                    return (L lastLoc ITvccurly)
2805             -- This next case is to handle a transitional issue:
2806             (ITvbar, ALRLayout _ col : ls, _)
2807              | newLine && thisCol == col && transitional ->
2808                 do addWarning Opt_WarnAlternativeLayoutRuleTransitional
2809                               (RealSrcSpan thisLoc)
2810                               (transitionalAlternativeLayoutWarning
2811                                    "`|' at the same depth as implicit layout block")
2812                    setALRContext ls
2813                    setNextToken t
2814                    -- Note that we use lastLoc, as we may need to close
2815                    -- more layouts, or give a semicolon
2816                    return (L lastLoc ITvccurly)
2817             (_, ALRLayout _ col : ls, _)
2818              | newLine && thisCol == col ->
2819                 do setNextToken t
2820                    let loc = realSrcSpanStart thisLoc
2821                        zeroWidthLoc = mkRealSrcSpan loc loc
2822                    return (L zeroWidthLoc ITsemi)
2823              | newLine && thisCol < col ->
2824                 do setALRContext ls
2825                    setNextToken t
2826                    -- Note that we use lastLoc, as we may need to close
2827                    -- more layouts, or give a semicolon
2828                    return (L lastLoc ITvccurly)
2829             -- We need to handle close before open, as 'then' is both
2830             -- an open and a close
2831             (u, _, _)
2832              | isALRclose u ->
2833                 case context of
2834                 ALRLayout _ _ : ls ->
2835                     do setALRContext ls
2836                        setNextToken t
2837                        return (L thisLoc ITvccurly)
2838                 ALRNoLayout _ isLet : ls ->
2839                     do let ls' = if isALRopen u
2840                                     then ALRNoLayout (containsCommas u) False : ls
2841                                     else ls
2842                        setALRContext ls'
2843                        when isLet $ setJustClosedExplicitLetBlock True
2844                        return t
2845                 [] ->
2846                     do let ls = if isALRopen u
2847                                    then [ALRNoLayout (containsCommas u) False]
2848                                    else []
2849                        setALRContext ls
2850                        -- XXX This is an error in John's code, but
2851                        -- it looks reachable to me at first glance
2852                        return t
2853             (u, _, _)
2854              | isALRopen u ->
2855                 do setALRContext (ALRNoLayout (containsCommas u) False : context)
2856                    return t
2857             (ITin, ALRLayout ALRLayoutLet _ : ls, _) ->
2858                 do setALRContext ls
2859                    setPendingImplicitTokens [t]
2860                    return (L thisLoc ITvccurly)
2861             (ITin, ALRLayout _ _ : ls, _) ->
2862                 do setALRContext ls
2863                    setNextToken t
2864                    return (L thisLoc ITvccurly)
2865             -- the other ITin case omitted; general case below covers it
2866             (ITcomma, ALRLayout _ _ : ls, _)
2867              | topNoLayoutContainsCommas ls ->
2868                 do setALRContext ls
2869                    setNextToken t
2870                    return (L thisLoc ITvccurly)
2871             (ITwhere, ALRLayout ALRLayoutDo _ : ls, _) ->
2872                 do setALRContext ls
2873                    setPendingImplicitTokens [t]
2874                    return (L thisLoc ITvccurly)
2875             -- the other ITwhere case omitted; general case below covers it
2876             (_, _, _) -> return t
2877
2878transitionalAlternativeLayoutWarning :: String -> SDoc
2879transitionalAlternativeLayoutWarning msg
2880    = text "transitional layout will not be accepted in the future:"
2881   $$ text msg
2882
2883isALRopen :: Token -> Bool
2884isALRopen ITcase          = True
2885isALRopen ITif            = True
2886isALRopen ITthen          = True
2887isALRopen IToparen        = True
2888isALRopen ITobrack        = True
2889isALRopen ITocurly        = True
2890-- GHC Extensions:
2891isALRopen IToubxparen     = True
2892isALRopen ITparenEscape   = True
2893isALRopen ITparenTyEscape = True
2894isALRopen _               = False
2895
2896isALRclose :: Token -> Bool
2897isALRclose ITof     = True
2898isALRclose ITthen   = True
2899isALRclose ITelse   = True
2900isALRclose ITcparen = True
2901isALRclose ITcbrack = True
2902isALRclose ITccurly = True
2903-- GHC Extensions:
2904isALRclose ITcubxparen = True
2905isALRclose _        = False
2906
2907isNonDecreasingIndentation :: ALRLayout -> Bool
2908isNonDecreasingIndentation ALRLayoutDo = True
2909isNonDecreasingIndentation _           = False
2910
2911containsCommas :: Token -> Bool
2912containsCommas IToparen = True
2913containsCommas ITobrack = True
2914-- John doesn't have {} as containing commas, but records contain them,
2915-- which caused a problem parsing Cabal's Distribution.Simple.InstallDirs
2916-- (defaultInstallDirs).
2917containsCommas ITocurly = True
2918-- GHC Extensions:
2919containsCommas IToubxparen = True
2920containsCommas _        = False
2921
2922topNoLayoutContainsCommas :: [ALRContext] -> Bool
2923topNoLayoutContainsCommas [] = False
2924topNoLayoutContainsCommas (ALRLayout _ _ : ls) = topNoLayoutContainsCommas ls
2925topNoLayoutContainsCommas (ALRNoLayout b _ : _) = b
2926
2927lexToken :: P (RealLocated Token)
2928lexToken = do
2929  inp@(AI loc1 buf) <- getInput
2930  sc <- getLexState
2931  exts <- getExts
2932  case alexScanUser exts inp sc of
2933    AlexEOF -> do
2934        let span = mkRealSrcSpan loc1 loc1
2935        setLastToken span 0
2936        return (L span ITeof)
2937    AlexError (AI loc2 buf) ->
2938        reportLexError loc1 loc2 buf "lexical error"
2939    AlexSkip inp2 _ -> do
2940        setInput inp2
2941        lexToken
2942    AlexToken inp2@(AI end buf2) _ t -> do
2943        setInput inp2
2944        let span = mkRealSrcSpan loc1 end
2945        let bytes = byteDiff buf buf2
2946        span `seq` setLastToken span bytes
2947        lt <- t span buf bytes
2948        case unRealSrcSpan lt of
2949          ITlineComment _  -> return lt
2950          ITblockComment _ -> return lt
2951          lt' -> do
2952            setLastTk lt'
2953            return lt
2954
2955reportLexError :: RealSrcLoc -> RealSrcLoc -> StringBuffer -> [Char] -> P a
2956reportLexError loc1 loc2 buf str
2957  | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
2958  | otherwise =
2959  let c = fst (nextChar buf)
2960  in if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
2961     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
2962     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
2963
2964lexTokenStream :: StringBuffer -> RealSrcLoc -> DynFlags -> ParseResult [Located Token]
2965lexTokenStream buf loc dflags = unP go initState{ options = opts' }
2966    where dflags' = gopt_set (gopt_unset dflags Opt_Haddock) Opt_KeepRawTokenStream
2967          initState@PState{ options = opts } = mkPState dflags' buf loc
2968          opts' = opts{ pExtsBitmap = complement (xbit UsePosPragsBit) .&. pExtsBitmap opts }
2969          go = do
2970            ltok <- lexer False return
2971            case ltok of
2972              L _ ITeof -> return []
2973              _ -> liftM (ltok:) go
2974
2975linePrags = Map.singleton "line" linePrag
2976
2977fileHeaderPrags = Map.fromList([("options", lex_string_prag IToptions_prag),
2978                                 ("options_ghc", lex_string_prag IToptions_prag),
2979                                 ("options_haddock", lex_string_prag ITdocOptions),
2980                                 ("language", token ITlanguage_prag),
2981                                 ("include", lex_string_prag ITinclude_prag)])
2982
2983ignoredPrags = Map.fromList (map ignored pragmas)
2984               where ignored opt = (opt, nested_comment lexToken)
2985                     impls = ["hugs", "nhc98", "jhc", "yhc", "catch", "derive"]
2986                     options_pragmas = map ("options_" ++) impls
2987                     -- CFILES is a hugs-only thing.
2988                     pragmas = options_pragmas ++ ["cfiles", "contract"]
2989
2990oneWordPrags = Map.fromList [
2991     ("rules", rulePrag),
2992     ("inline",
2993         strtoken (\s -> (ITinline_prag (SourceText s) Inline FunLike))),
2994     ("inlinable",
2995         strtoken (\s -> (ITinline_prag (SourceText s) Inlinable FunLike))),
2996     ("inlineable",
2997         strtoken (\s -> (ITinline_prag (SourceText s) Inlinable FunLike))),
2998                                    -- Spelling variant
2999     ("notinline",
3000         strtoken (\s -> (ITinline_prag (SourceText s) NoInline FunLike))),
3001     ("specialize", strtoken (\s -> ITspec_prag (SourceText s))),
3002     ("source", strtoken (\s -> ITsource_prag (SourceText s))),
3003     ("warning", strtoken (\s -> ITwarning_prag (SourceText s))),
3004     ("deprecated", strtoken (\s -> ITdeprecated_prag (SourceText s))),
3005     ("scc", strtoken (\s -> ITscc_prag (SourceText s))),
3006     ("generated", strtoken (\s -> ITgenerated_prag (SourceText s))),
3007     ("core", strtoken (\s -> ITcore_prag (SourceText s))),
3008     ("unpack", strtoken (\s -> ITunpack_prag (SourceText s))),
3009     ("nounpack", strtoken (\s -> ITnounpack_prag (SourceText s))),
3010     ("ann", strtoken (\s -> ITann_prag (SourceText s))),
3011     ("minimal", strtoken (\s -> ITminimal_prag (SourceText s))),
3012     ("overlaps", strtoken (\s -> IToverlaps_prag (SourceText s))),
3013     ("overlappable", strtoken (\s -> IToverlappable_prag (SourceText s))),
3014     ("overlapping", strtoken (\s -> IToverlapping_prag (SourceText s))),
3015     ("incoherent", strtoken (\s -> ITincoherent_prag (SourceText s))),
3016     ("ctype", strtoken (\s -> ITctype (SourceText s))),
3017     ("complete", strtoken (\s -> ITcomplete_prag (SourceText s))),
3018     ("column", columnPrag)
3019     ]
3020
3021twoWordPrags = Map.fromList [
3022     ("inline conlike",
3023         strtoken (\s -> (ITinline_prag (SourceText s) Inline ConLike))),
3024     ("notinline conlike",
3025         strtoken (\s -> (ITinline_prag (SourceText s) NoInline ConLike))),
3026     ("specialize inline",
3027         strtoken (\s -> (ITspec_inline_prag (SourceText s) True))),
3028     ("specialize notinline",
3029         strtoken (\s -> (ITspec_inline_prag (SourceText s) False)))
3030     ]
3031
3032dispatch_pragmas :: Map String Action -> Action
3033dispatch_pragmas prags span buf len = case Map.lookup (clean_pragma (lexemeToString buf len)) prags of
3034                                       Just found -> found span buf len
3035                                       Nothing -> lexError "unknown pragma"
3036
3037known_pragma :: Map String Action -> AlexAccPred ExtsBitmap
3038known_pragma prags _ (AI _ startbuf) _ (AI _ curbuf)
3039 = isKnown && nextCharIsNot curbuf pragmaNameChar
3040    where l = lexemeToString startbuf (byteDiff startbuf curbuf)
3041          isKnown = isJust $ Map.lookup (clean_pragma l) prags
3042          pragmaNameChar c = isAlphaNum c || c == '_'
3043
3044clean_pragma :: String -> String
3045clean_pragma prag = canon_ws (map toLower (unprefix prag))
3046                    where unprefix prag' = case stripPrefix "{-#" prag' of
3047                                             Just rest -> rest
3048                                             Nothing -> prag'
3049                          canonical prag' = case prag' of
3050                                              "noinline" -> "notinline"
3051                                              "specialise" -> "specialize"
3052                                              "constructorlike" -> "conlike"
3053                                              _ -> prag'
3054                          canon_ws s = unwords (map canonical (words s))
3055
3056
3057
3058{-
3059%************************************************************************
3060%*                                                                      *
3061        Helper functions for generating annotations in the parser
3062%*                                                                      *
3063%************************************************************************
3064-}
3065
3066-- | Encapsulated call to addAnnotation, requiring only the SrcSpan of
3067--   the AST construct the annotation belongs to; together with the
3068--   AnnKeywordId, this is the key of the annotation map.
3069--
3070--   This type is useful for places in the parser where it is not yet
3071--   known what SrcSpan an annotation should be added to.  The most
3072--   common situation is when we are parsing a list: the annotations
3073--   need to be associated with the AST element that *contains* the
3074--   list, not the list itself.  'AddAnn' lets us defer adding the
3075--   annotations until we finish parsing the list and are now parsing
3076--   the enclosing element; we then apply the 'AddAnn' to associate
3077--   the annotations.  Another common situation is where a common fragment of
3078--   the AST has been factored out but there is no separate AST node for
3079--   this fragment (this occurs in class and data declarations). In this
3080--   case, the annotation belongs to the parent data declaration.
3081--
3082--   The usual way an 'AddAnn' is created is using the 'mj' ("make jump")
3083--   function, and then it can be discharged using the 'ams' function.
3084data AddAnn = AddAnn AnnKeywordId SrcSpan
3085
3086addAnnotationOnly :: SrcSpan -> AnnKeywordId -> SrcSpan -> P ()
3087addAnnotationOnly l a v = P $ \s -> POk s {
3088  annotations = ((l,a), [v]) : annotations s
3089  } ()
3090
3091-- |Given a 'SrcSpan' that surrounds a 'HsPar' or 'HsParTy', generate
3092-- 'AddAnn' values for the opening and closing bordering on the start
3093-- and end of the span
3094mkParensApiAnn :: SrcSpan -> [AddAnn]
3095mkParensApiAnn (UnhelpfulSpan _)  = []
3096mkParensApiAnn s@(RealSrcSpan ss) = [AddAnn AnnOpenP lo,AddAnn AnnCloseP lc]
3097  where
3098    f = srcSpanFile ss
3099    sl = srcSpanStartLine ss
3100    sc = srcSpanStartCol ss
3101    el = srcSpanEndLine ss
3102    ec = srcSpanEndCol ss
3103    lo = mkSrcSpan (srcSpanStart s)         (mkSrcLoc f sl (sc+1))
3104    lc = mkSrcSpan (mkSrcLoc f el (ec - 1)) (srcSpanEnd s)
3105
3106queueComment :: Located Token -> P()
3107queueComment c = P $ \s -> POk s {
3108  comment_q = commentToAnnotation c : comment_q s
3109  } ()
3110
3111-- | Go through the @comment_q@ in @PState@ and remove all comments
3112-- that belong within the given span
3113allocateCommentsP :: SrcSpan -> P ()
3114allocateCommentsP ss = P $ \s ->
3115  let (comment_q', newAnns) = allocateComments ss (comment_q s) in
3116    POk s {
3117       comment_q = comment_q'
3118     , annotations_comments = newAnns ++ (annotations_comments s)
3119     } ()
3120
3121allocateComments
3122  :: SrcSpan
3123  -> [Located AnnotationComment]
3124  -> ([Located AnnotationComment], [(SrcSpan,[Located AnnotationComment])])
3125allocateComments ss comment_q =
3126  let
3127    (before,rest)  = break (\(L l _) -> isSubspanOf l ss) comment_q
3128    (middle,after) = break (\(L l _) -> not (isSubspanOf l ss)) rest
3129    comment_q' = before ++ after
3130    newAnns = if null middle then []
3131                             else [(ss,middle)]
3132  in
3133    (comment_q', newAnns)
3134
3135
3136commentToAnnotation :: Located Token -> Located AnnotationComment
3137commentToAnnotation (L l (ITdocCommentNext s))  = L l (AnnDocCommentNext s)
3138commentToAnnotation (L l (ITdocCommentPrev s))  = L l (AnnDocCommentPrev s)
3139commentToAnnotation (L l (ITdocCommentNamed s)) = L l (AnnDocCommentNamed s)
3140commentToAnnotation (L l (ITdocSection n s))    = L l (AnnDocSection n s)
3141commentToAnnotation (L l (ITdocOptions s))      = L l (AnnDocOptions s)
3142commentToAnnotation (L l (ITlineComment s))     = L l (AnnLineComment s)
3143commentToAnnotation (L l (ITblockComment s))    = L l (AnnBlockComment s)
3144commentToAnnotation _                           = panic "commentToAnnotation"
3145
3146-- ---------------------------------------------------------------------
3147
3148isComment :: Token -> Bool
3149isComment (ITlineComment     _)   = True
3150isComment (ITblockComment    _)   = True
3151isComment _ = False
3152
3153isDocComment :: Token -> Bool
3154isDocComment (ITdocCommentNext  _)   = True
3155isDocComment (ITdocCommentPrev  _)   = True
3156isDocComment (ITdocCommentNamed _)   = True
3157isDocComment (ITdocSection      _ _) = True
3158isDocComment (ITdocOptions      _)   = True
3159isDocComment _ = False
3160}
3161