1{-# LANGUAGE CPP                   #-}
2{-# LANGUAGE MultiParamTypeClasses #-}
3{-# LANGUAGE OverloadedStrings     #-}
4{-# LANGUAGE PatternGuards         #-}
5-----------------------------------------------------------------------------
6-- |
7-- Module      :  Distribution.Fields.Parser
8-- License     :  BSD3
9--
10-- Maintainer  :  cabal-devel@haskell.org
11-- Portability :  portable
12module Distribution.Fields.Parser (
13    -- * Types
14    Field(..),
15    Name(..),
16    FieldLine(..),
17    SectionArg(..),
18    -- * Grammar and parsing
19    -- $grammar
20    readFields,
21    readFields',
22#ifdef CABAL_PARSEC_DEBUG
23    -- * Internal
24    parseFile,
25    parseStr,
26    parseBS,
27#endif
28    ) where
29
30import           Control.Monad                  (guard)
31import qualified Data.ByteString.Char8          as B8
32import           Data.Functor.Identity
33import           Distribution.Compat.Prelude
34import           Distribution.Fields.Field
35import           Distribution.Fields.Lexer
36import           Distribution.Fields.LexerMonad
37                 (LexResult (..), LexState (..), LexWarning (..), unLex)
38import           Distribution.Parsec.Position   (Position (..))
39import           Prelude ()
40import           Text.Parsec.Combinator         hiding (eof, notFollowedBy)
41import           Text.Parsec.Error
42import           Text.Parsec.Pos
43import           Text.Parsec.Prim               hiding (many, (<|>))
44
45#ifdef CABAL_PARSEC_DEBUG
46import qualified Data.Text                as T
47import qualified Data.Text.Encoding       as T
48import qualified Data.Text.Encoding.Error as T
49#endif
50
51-- | The 'LexState'' (with a prime) is an instance of parsec's 'Stream'
52-- wrapped around lexer's 'LexState' (without a prime)
53data LexState' = LexState' !LexState (LToken, LexState')
54
55mkLexState' :: LexState -> LexState'
56mkLexState' st = LexState' st
57                   (case unLex lexToken st of LexResult st' tok -> (tok, mkLexState' st'))
58
59type Parser a = ParsecT LexState' () Identity a
60
61instance Stream LexState' Identity LToken where
62  uncons (LexState' _ (tok, st')) =
63    case tok of
64      L _ EOF -> return Nothing
65      _       -> return (Just (tok, st'))
66
67-- | Get lexer warnings accumulated so far
68getLexerWarnings :: Parser [LexWarning]
69getLexerWarnings = do
70  LexState' (LexState { warnings = ws }) _ <- getInput
71  return ws
72
73-- | Set Alex code i.e. the mode "state" lexer is in.
74setLexerMode :: Int -> Parser ()
75setLexerMode code = do
76  LexState' ls _ <- getInput
77  setInput $! mkLexState' ls { curCode = code }
78
79getToken :: (Token -> Maybe a) -> Parser a
80getToken getTok = getTokenWithPos (\(L _ t) -> getTok t)
81
82getTokenWithPos :: (LToken -> Maybe a) -> Parser a
83getTokenWithPos getTok = tokenPrim (\(L _ t) -> describeToken t) updatePos getTok
84  where
85    updatePos :: SourcePos -> LToken -> LexState' -> SourcePos
86    updatePos pos (L (Position col line) _) _ = newPos (sourceName pos) col line
87
88describeToken :: Token -> String
89describeToken t = case t of
90  TokSym   s      -> "symbol "   ++ show s
91  TokStr   s      -> "string "   ++ show s
92  TokOther s      -> "operator " ++ show s
93  Indent _        -> "new line"
94  TokFieldLine _  -> "field content"
95  Colon           -> "\":\""
96  OpenBrace       -> "\"{\""
97  CloseBrace      -> "\"}\""
98--  SemiColon       -> "\";\""
99  EOF             -> "end of file"
100  LexicalError is -> "character in input " ++ show (B8.head is)
101
102tokSym :: Parser (Name Position)
103tokSym', tokStr, tokOther :: Parser (SectionArg Position)
104tokIndent :: Parser Int
105tokColon, tokOpenBrace, tokCloseBrace :: Parser ()
106tokFieldLine :: Parser (FieldLine Position)
107
108tokSym        = getTokenWithPos $ \t -> case t of L pos (TokSym   x) -> Just (mkName pos x);  _ -> Nothing
109tokSym'       = getTokenWithPos $ \t -> case t of L pos (TokSym   x) -> Just (SecArgName pos x);  _ -> Nothing
110tokStr        = getTokenWithPos $ \t -> case t of L pos (TokStr   x) -> Just (SecArgStr pos x);  _ -> Nothing
111tokOther      = getTokenWithPos $ \t -> case t of L pos (TokOther x) -> Just (SecArgOther pos x);  _ -> Nothing
112tokIndent     = getToken $ \t -> case t of Indent   x -> Just x;  _ -> Nothing
113tokColon      = getToken $ \t -> case t of Colon      -> Just (); _ -> Nothing
114tokOpenBrace  = getToken $ \t -> case t of OpenBrace  -> Just (); _ -> Nothing
115tokCloseBrace = getToken $ \t -> case t of CloseBrace -> Just (); _ -> Nothing
116tokFieldLine  = getTokenWithPos $ \t -> case t of L pos (TokFieldLine s) -> Just (FieldLine pos s); _ -> Nothing
117
118colon, openBrace, closeBrace :: Parser ()
119
120sectionArg :: Parser (SectionArg Position)
121sectionArg   = tokSym' <|> tokStr <|> tokOther <?> "section parameter"
122
123fieldSecName :: Parser (Name Position)
124fieldSecName = tokSym              <?> "field or section name"
125
126colon        = tokColon      <?> "\":\""
127openBrace    = tokOpenBrace  <?> "\"{\""
128closeBrace   = tokCloseBrace <?> "\"}\""
129
130fieldContent :: Parser (FieldLine Position)
131fieldContent = tokFieldLine <?> "field contents"
132
133newtype IndentLevel = IndentLevel Int
134
135zeroIndentLevel :: IndentLevel
136zeroIndentLevel = IndentLevel 0
137
138incIndentLevel :: IndentLevel -> IndentLevel
139incIndentLevel (IndentLevel i) = IndentLevel (succ i)
140
141indentOfAtLeast :: IndentLevel -> Parser IndentLevel
142indentOfAtLeast (IndentLevel i) = try $ do
143  j <- tokIndent
144  guard (j >= i) <?> "indentation of at least " ++ show i
145  return (IndentLevel j)
146
147
148newtype LexerMode = LexerMode Int
149
150inLexerMode :: LexerMode -> Parser p -> Parser p
151inLexerMode (LexerMode mode) p =
152  do setLexerMode mode; x <- p; setLexerMode in_section; return x
153
154
155-----------------------
156-- Cabal file grammar
157--
158
159-- $grammar
160--
161-- @
162-- CabalStyleFile ::= SecElems
163--
164-- SecElems       ::= SecElem* '\\n'?
165-- SecElem        ::= '\\n' SecElemLayout | SecElemBraces
166-- SecElemLayout  ::= FieldLayout | FieldBraces | SectionLayout | SectionBraces
167-- SecElemBraces  ::= FieldInline | FieldBraces |                 SectionBraces
168-- FieldLayout    ::= name ':' line? ('\\n' line)*
169-- FieldBraces    ::= name ':' '\\n'? '{' content '}'
170-- FieldInline    ::= name ':' content
171-- SectionLayout  ::= name arg* SecElems
172-- SectionBraces  ::= name arg* '\\n'? '{' SecElems '}'
173-- @
174--
175-- and the same thing but left factored...
176--
177-- @
178-- SecElems              ::= SecElem*
179-- SecElem               ::= '\\n' name SecElemLayout
180--                         |      name SecElemBraces
181-- SecElemLayout         ::= ':'   FieldLayoutOrBraces
182--                         | arg*  SectionLayoutOrBraces
183-- FieldLayoutOrBraces   ::= '\\n'? '{' content '}'
184--                         | line? ('\\n' line)*
185-- SectionLayoutOrBraces ::= '\\n'? '{' SecElems '\\n'? '}'
186--                         | SecElems
187-- SecElemBraces         ::= ':' FieldInlineOrBraces
188--                         | arg* '\\n'? '{' SecElems '\\n'? '}'
189-- FieldInlineOrBraces   ::= '\\n'? '{' content '}'
190--                         | content
191-- @
192--
193-- Note how we have several productions with the sequence:
194--
195-- > '\\n'? '{'
196--
197-- That is, an optional newline (and indent) followed by a @{@ token.
198-- In the @SectionLayoutOrBraces@ case you can see that this makes it
199-- not fully left factored (because @SecElems@ can start with a @\\n@).
200-- Fully left factoring here would be ugly, and though we could use a
201-- lookahead of two tokens to resolve the alternatives, we can't
202-- conveniently use Parsec's 'try' here to get a lookahead of only two.
203-- So instead we deal with this case in the lexer by making a line
204-- where the first non-space is @{@ lex as just the @{@ token, without
205-- the usual indent token. Then in the parser we can resolve everything
206-- with just one token of lookahead and so without using 'try'.
207
208-- Top level of a file using cabal syntax
209--
210cabalStyleFile :: Parser [Field Position]
211cabalStyleFile = do es <- elements zeroIndentLevel
212                    eof
213                    return es
214
215-- Elements that live at the top level or inside a section, ie fields
216-- and sectionscontent
217--
218-- elements ::= element*
219elements :: IndentLevel -> Parser [Field Position]
220elements ilevel = many (element ilevel)
221
222-- An individual element, ie a field or a section. These can either use
223-- layout style or braces style. For layout style then it must start on
224-- a line on its own (so that we know its indentation level).
225--
226-- element ::= '\\n' name elementInLayoutContext
227--           |      name elementInNonLayoutContext
228element :: IndentLevel -> Parser (Field Position)
229element ilevel =
230      (do ilevel' <- indentOfAtLeast ilevel
231          name    <- fieldSecName
232          elementInLayoutContext (incIndentLevel ilevel') name)
233  <|> (do name    <- fieldSecName
234          elementInNonLayoutContext name)
235
236-- An element (field or section) that is valid in a layout context.
237-- In a layout context we can have fields and sections that themselves
238-- either use layout style or that use braces style.
239--
240-- elementInLayoutContext ::= ':'  fieldLayoutOrBraces
241--                          | arg* sectionLayoutOrBraces
242elementInLayoutContext :: IndentLevel -> Name Position -> Parser (Field Position)
243elementInLayoutContext ilevel name =
244      (do colon; fieldLayoutOrBraces ilevel name)
245  <|> (do args  <- many sectionArg
246          elems <- sectionLayoutOrBraces ilevel
247          return (Section name args elems))
248
249-- An element (field or section) that is valid in a non-layout context.
250-- In a non-layout context we can have only have fields and sections that
251-- themselves use braces style, or inline style fields.
252--
253-- elementInNonLayoutContext ::= ':' FieldInlineOrBraces
254--                             | arg* '\\n'? '{' elements '\\n'? '}'
255elementInNonLayoutContext :: Name Position -> Parser (Field Position)
256elementInNonLayoutContext name =
257      (do colon; fieldInlineOrBraces name)
258  <|> (do args <- many sectionArg
259          openBrace
260          elems <- elements zeroIndentLevel
261          optional tokIndent
262          closeBrace
263          return (Section name args elems))
264
265-- The body of a field, using either layout style or braces style.
266--
267-- fieldLayoutOrBraces   ::= '\\n'? '{' content '}'
268--                         | line? ('\\n' line)*
269fieldLayoutOrBraces :: IndentLevel -> Name Position -> Parser (Field Position)
270fieldLayoutOrBraces ilevel name = braces <|> fieldLayout
271  where
272    braces = do
273          openBrace
274          ls <- inLexerMode (LexerMode in_field_braces) (many fieldContent)
275          closeBrace
276          return (Field name ls)
277    fieldLayout = inLexerMode (LexerMode in_field_layout) $ do
278          l  <- optionMaybe fieldContent
279          ls <- many (do _ <- indentOfAtLeast ilevel; fieldContent)
280          return $ case l of
281              Nothing -> Field name ls
282              Just l' -> Field name (l' : ls)
283
284-- The body of a section, using either layout style or braces style.
285--
286-- sectionLayoutOrBraces ::= '\\n'? '{' elements \\n? '}'
287--                         | elements
288sectionLayoutOrBraces :: IndentLevel -> Parser [Field Position]
289sectionLayoutOrBraces ilevel =
290      (do openBrace
291          elems <- elements zeroIndentLevel
292          optional tokIndent
293          closeBrace
294          return elems)
295  <|> (elements ilevel)
296
297-- The body of a field, using either inline style or braces.
298--
299-- fieldInlineOrBraces   ::= '\\n'? '{' content '}'
300--                         | content
301fieldInlineOrBraces :: Name Position -> Parser (Field Position)
302fieldInlineOrBraces name =
303      (do openBrace
304          ls <- inLexerMode (LexerMode in_field_braces) (many fieldContent)
305          closeBrace
306          return (Field name ls))
307  <|> (do ls <- inLexerMode (LexerMode in_field_braces) (option [] (fmap (\l -> [l]) fieldContent))
308          return (Field name ls))
309
310
311-- | Parse cabal style 'B8.ByteString' into list of 'Field's, i.e. the cabal AST.
312readFields :: B8.ByteString -> Either ParseError [Field Position]
313readFields s = fmap fst (readFields' s)
314
315-- | Like 'readFields' but also return lexer warnings
316readFields' :: B8.ByteString -> Either ParseError ([Field Position], [LexWarning])
317readFields' s = do
318    parse parser "the input" lexSt
319  where
320    parser = do
321        fields <- cabalStyleFile
322        ws     <- getLexerWarnings
323        pure (fields, ws)
324
325    lexSt = mkLexState' (mkLexState s)
326
327#ifdef CABAL_PARSEC_DEBUG
328parseTest' :: Show a => Parsec LexState' () a -> SourceName -> B8.ByteString -> IO ()
329parseTest' p fname s =
330    case parse p fname (lexSt s) of
331      Left err -> putStrLn (formatError s err)
332
333      Right x  -> print x
334  where
335    lexSt = mkLexState' . mkLexState
336
337parseFile :: Show a => Parser a -> FilePath -> IO ()
338parseFile p f = B8.readFile f >>= \s -> parseTest' p f s
339
340parseStr  :: Show a => Parser a -> String -> IO ()
341parseStr p = parseBS p . B8.pack
342
343parseBS  :: Show a => Parser a -> B8.ByteString -> IO ()
344parseBS p = parseTest' p "<input string>"
345
346formatError :: B8.ByteString -> ParseError -> String
347formatError input perr =
348    unlines
349      [ "Parse error "++ show (errorPos perr) ++ ":"
350      , errLine
351      , indicator ++ errmsg ]
352  where
353    pos       = errorPos perr
354    ls        = lines' (T.decodeUtf8With T.lenientDecode input)
355    errLine   = T.unpack (ls !! (sourceLine pos - 1))
356    indicator = replicate (sourceColumn pos) ' ' ++ "^"
357    errmsg    = showErrorMessages "or" "unknown parse error"
358                                  "expecting" "unexpected" "end of file"
359                                  (errorMessages perr)
360
361-- | Handles windows/osx/unix line breaks uniformly
362lines' :: T.Text -> [T.Text]
363lines' s1
364  | T.null s1 = []
365  | otherwise = case T.break (\c -> c == '\r' || c == '\n') s1 of
366                  (l, s2) | Just (c,s3) <- T.uncons s2
367                         -> case T.uncons s3 of
368                              Just ('\n', s4) | c == '\r' -> l : lines' s4
369                              _               -> l : lines' s3
370                          | otherwise -> [l]
371#endif
372
373eof :: Parser ()
374eof = notFollowedBy anyToken <?> "end of file"
375  where
376    notFollowedBy :: Parser LToken -> Parser ()
377    notFollowedBy p = try (    (do L _ t <- try p; unexpected (describeToken t))
378                           <|> return ())
379