1{-# LANGUAGE CPP                   #-}
2{-# LANGUAGE MultiParamTypeClasses #-}
3{-# LANGUAGE OverloadedStrings     #-}
4{-# LANGUAGE PatternGuards         #-}
5-----------------------------------------------------------------------------
6-- |
7-- Module      :  Distribution.Fields.Parser
8-- License     :  BSD3
9--
10-- Maintainer  :  cabal-devel@haskell.org
11-- Portability :  portable
12module Distribution.Fields.Parser (
13    -- * Types
14    Field(..),
15    Name(..),
16    FieldLine(..),
17    SectionArg(..),
18    -- * Grammar and parsing
19    -- $grammar
20    readFields,
21    readFields',
22#ifdef CABAL_PARSEC_DEBUG
23    -- * Internal
24    parseFile,
25    parseStr,
26    parseBS,
27#endif
28    ) where
29
30import qualified Data.ByteString.Char8          as B8
31import           Data.Functor.Identity
32import           Distribution.Compat.Prelude
33import           Distribution.Fields.Field
34import           Distribution.Fields.Lexer
35import           Distribution.Fields.LexerMonad
36                 (LexResult (..), LexState (..), LexWarning (..), unLex)
37import           Distribution.Parsec.Position   (Position (..))
38import           Prelude ()
39import           Text.Parsec.Combinator         hiding (eof, notFollowedBy)
40import           Text.Parsec.Error
41import           Text.Parsec.Pos
42import           Text.Parsec.Prim               hiding (many, (<|>))
43
44#ifdef CABAL_PARSEC_DEBUG
45import qualified Data.Text                as T
46import qualified Data.Text.Encoding       as T
47import qualified Data.Text.Encoding.Error as T
48#endif
49
50-- | The 'LexState'' (with a prime) is an instance of parsec's 'Stream'
51-- wrapped around lexer's 'LexState' (without a prime)
52data LexState' = LexState' !LexState (LToken, LexState')
53
54mkLexState' :: LexState -> LexState'
55mkLexState' st = LexState' st
56                   (case unLex lexToken st of LexResult st' tok -> (tok, mkLexState' st'))
57
58type Parser a = ParsecT LexState' () Identity a
59
60instance Stream LexState' Identity LToken where
61  uncons (LexState' _ (tok, st')) =
62    case tok of
63      L _ EOF -> return Nothing
64      _       -> return (Just (tok, st'))
65
66-- | Get lexer warnings accumulated so far
67getLexerWarnings :: Parser [LexWarning]
68getLexerWarnings = do
69  LexState' (LexState { warnings = ws }) _ <- getInput
70  return ws
71
72-- | Set Alex code i.e. the mode "state" lexer is in.
73setLexerMode :: Int -> Parser ()
74setLexerMode code = do
75  LexState' ls _ <- getInput
76  setInput $! mkLexState' ls { curCode = code }
77
78getToken :: (Token -> Maybe a) -> Parser a
79getToken getTok = getTokenWithPos (\(L _ t) -> getTok t)
80
81getTokenWithPos :: (LToken -> Maybe a) -> Parser a
82getTokenWithPos getTok = tokenPrim (\(L _ t) -> describeToken t) updatePos getTok
83  where
84    updatePos :: SourcePos -> LToken -> LexState' -> SourcePos
85    updatePos pos (L (Position col line) _) _ = newPos (sourceName pos) col line
86
87describeToken :: Token -> String
88describeToken t = case t of
89  TokSym   s      -> "symbol "   ++ show s
90  TokStr   s      -> "string "   ++ show s
91  TokOther s      -> "operator " ++ show s
92  Indent _        -> "new line"
93  TokFieldLine _  -> "field content"
94  Colon           -> "\":\""
95  OpenBrace       -> "\"{\""
96  CloseBrace      -> "\"}\""
97--  SemiColon       -> "\";\""
98  EOF             -> "end of file"
99  LexicalError is -> "character in input " ++ show (B8.head is)
100
101tokSym :: Parser (Name Position)
102tokSym', tokStr, tokOther :: Parser (SectionArg Position)
103tokIndent :: Parser Int
104tokColon, tokOpenBrace, tokCloseBrace :: Parser ()
105tokFieldLine :: Parser (FieldLine Position)
106
107tokSym        = getTokenWithPos $ \t -> case t of L pos (TokSym   x) -> Just (mkName pos x);  _ -> Nothing
108tokSym'       = getTokenWithPos $ \t -> case t of L pos (TokSym   x) -> Just (SecArgName pos x);  _ -> Nothing
109tokStr        = getTokenWithPos $ \t -> case t of L pos (TokStr   x) -> Just (SecArgStr pos x);  _ -> Nothing
110tokOther      = getTokenWithPos $ \t -> case t of L pos (TokOther x) -> Just (SecArgOther pos x);  _ -> Nothing
111tokIndent     = getToken $ \t -> case t of Indent   x -> Just x;  _ -> Nothing
112tokColon      = getToken $ \t -> case t of Colon      -> Just (); _ -> Nothing
113tokOpenBrace  = getToken $ \t -> case t of OpenBrace  -> Just (); _ -> Nothing
114tokCloseBrace = getToken $ \t -> case t of CloseBrace -> Just (); _ -> Nothing
115tokFieldLine  = getTokenWithPos $ \t -> case t of L pos (TokFieldLine s) -> Just (FieldLine pos s); _ -> Nothing
116
117colon, openBrace, closeBrace :: Parser ()
118
119sectionArg :: Parser (SectionArg Position)
120sectionArg   = tokSym' <|> tokStr <|> tokOther <?> "section parameter"
121
122fieldSecName :: Parser (Name Position)
123fieldSecName = tokSym              <?> "field or section name"
124
125colon        = tokColon      <?> "\":\""
126openBrace    = tokOpenBrace  <?> "\"{\""
127closeBrace   = tokCloseBrace <?> "\"}\""
128
129fieldContent :: Parser (FieldLine Position)
130fieldContent = tokFieldLine <?> "field contents"
131
132newtype IndentLevel = IndentLevel Int
133
134zeroIndentLevel :: IndentLevel
135zeroIndentLevel = IndentLevel 0
136
137incIndentLevel :: IndentLevel -> IndentLevel
138incIndentLevel (IndentLevel i) = IndentLevel (succ i)
139
140indentOfAtLeast :: IndentLevel -> Parser IndentLevel
141indentOfAtLeast (IndentLevel i) = try $ do
142  j <- tokIndent
143  guard (j >= i) <?> "indentation of at least " ++ show i
144  return (IndentLevel j)
145
146
147newtype LexerMode = LexerMode Int
148
149inLexerMode :: LexerMode -> Parser p -> Parser p
150inLexerMode (LexerMode mode) p =
151  do setLexerMode mode; x <- p; setLexerMode in_section; return x
152
153
154-----------------------
155-- Cabal file grammar
156--
157
158-- $grammar
159--
160-- @
161-- CabalStyleFile ::= SecElems
162--
163-- SecElems       ::= SecElem* '\\n'?
164-- SecElem        ::= '\\n' SecElemLayout | SecElemBraces
165-- SecElemLayout  ::= FieldLayout | FieldBraces | SectionLayout | SectionBraces
166-- SecElemBraces  ::= FieldInline | FieldBraces |                 SectionBraces
167-- FieldLayout    ::= name ':' line? ('\\n' line)*
168-- FieldBraces    ::= name ':' '\\n'? '{' content '}'
169-- FieldInline    ::= name ':' content
170-- SectionLayout  ::= name arg* SecElems
171-- SectionBraces  ::= name arg* '\\n'? '{' SecElems '}'
172-- @
173--
174-- and the same thing but left factored...
175--
176-- @
177-- SecElems              ::= SecElem*
178-- SecElem               ::= '\\n' name SecElemLayout
179--                         |      name SecElemBraces
180-- SecElemLayout         ::= ':'   FieldLayoutOrBraces
181--                         | arg*  SectionLayoutOrBraces
182-- FieldLayoutOrBraces   ::= '\\n'? '{' content '}'
183--                         | line? ('\\n' line)*
184-- SectionLayoutOrBraces ::= '\\n'? '{' SecElems '\\n'? '}'
185--                         | SecElems
186-- SecElemBraces         ::= ':' FieldInlineOrBraces
187--                         | arg* '\\n'? '{' SecElems '\\n'? '}'
188-- FieldInlineOrBraces   ::= '\\n'? '{' content '}'
189--                         | content
190-- @
191--
192-- Note how we have several productions with the sequence:
193--
194-- > '\\n'? '{'
195--
196-- That is, an optional newline (and indent) followed by a @{@ token.
197-- In the @SectionLayoutOrBraces@ case you can see that this makes it
198-- not fully left factored (because @SecElems@ can start with a @\\n@).
199-- Fully left factoring here would be ugly, and though we could use a
200-- lookahead of two tokens to resolve the alternatives, we can't
201-- conveniently use Parsec's 'try' here to get a lookahead of only two.
202-- So instead we deal with this case in the lexer by making a line
203-- where the first non-space is @{@ lex as just the @{@ token, without
204-- the usual indent token. Then in the parser we can resolve everything
205-- with just one token of lookahead and so without using 'try'.
206
207-- Top level of a file using cabal syntax
208--
209cabalStyleFile :: Parser [Field Position]
210cabalStyleFile = do es <- elements zeroIndentLevel
211                    eof
212                    return es
213
214-- Elements that live at the top level or inside a section, ie fields
215-- and sectionscontent
216--
217-- elements ::= element*
218elements :: IndentLevel -> Parser [Field Position]
219elements ilevel = many (element ilevel)
220
221-- An individual element, ie a field or a section. These can either use
222-- layout style or braces style. For layout style then it must start on
223-- a line on its own (so that we know its indentation level).
224--
225-- element ::= '\\n' name elementInLayoutContext
226--           |      name elementInNonLayoutContext
227element :: IndentLevel -> Parser (Field Position)
228element ilevel =
229      (do ilevel' <- indentOfAtLeast ilevel
230          name    <- fieldSecName
231          elementInLayoutContext (incIndentLevel ilevel') name)
232  <|> (do name    <- fieldSecName
233          elementInNonLayoutContext name)
234
235-- An element (field or section) that is valid in a layout context.
236-- In a layout context we can have fields and sections that themselves
237-- either use layout style or that use braces style.
238--
239-- elementInLayoutContext ::= ':'  fieldLayoutOrBraces
240--                          | arg* sectionLayoutOrBraces
241elementInLayoutContext :: IndentLevel -> Name Position -> Parser (Field Position)
242elementInLayoutContext ilevel name =
243      (do colon; fieldLayoutOrBraces ilevel name)
244  <|> (do args  <- many sectionArg
245          elems <- sectionLayoutOrBraces ilevel
246          return (Section name args elems))
247
248-- An element (field or section) that is valid in a non-layout context.
249-- In a non-layout context we can have only have fields and sections that
250-- themselves use braces style, or inline style fields.
251--
252-- elementInNonLayoutContext ::= ':' FieldInlineOrBraces
253--                             | arg* '\\n'? '{' elements '\\n'? '}'
254elementInNonLayoutContext :: Name Position -> Parser (Field Position)
255elementInNonLayoutContext name =
256      (do colon; fieldInlineOrBraces name)
257  <|> (do args <- many sectionArg
258          openBrace
259          elems <- elements zeroIndentLevel
260          optional tokIndent
261          closeBrace
262          return (Section name args elems))
263
264-- The body of a field, using either layout style or braces style.
265--
266-- fieldLayoutOrBraces   ::= '\\n'? '{' content '}'
267--                         | line? ('\\n' line)*
268fieldLayoutOrBraces :: IndentLevel -> Name Position -> Parser (Field Position)
269fieldLayoutOrBraces ilevel name = braces <|> fieldLayout
270  where
271    braces = do
272          openBrace
273          ls <- inLexerMode (LexerMode in_field_braces) (many fieldContent)
274          closeBrace
275          return (Field name ls)
276    fieldLayout = inLexerMode (LexerMode in_field_layout) $ do
277          l  <- optionMaybe fieldContent
278          ls <- many (do _ <- indentOfAtLeast ilevel; fieldContent)
279          return $ case l of
280              Nothing -> Field name ls
281              Just l' -> Field name (l' : ls)
282
283-- The body of a section, using either layout style or braces style.
284--
285-- sectionLayoutOrBraces ::= '\\n'? '{' elements \\n? '}'
286--                         | elements
287sectionLayoutOrBraces :: IndentLevel -> Parser [Field Position]
288sectionLayoutOrBraces ilevel =
289      (do openBrace
290          elems <- elements zeroIndentLevel
291          optional tokIndent
292          closeBrace
293          return elems)
294  <|> (elements ilevel)
295
296-- The body of a field, using either inline style or braces.
297--
298-- fieldInlineOrBraces   ::= '\\n'? '{' content '}'
299--                         | content
300fieldInlineOrBraces :: Name Position -> Parser (Field Position)
301fieldInlineOrBraces name =
302      (do openBrace
303          ls <- inLexerMode (LexerMode in_field_braces) (many fieldContent)
304          closeBrace
305          return (Field name ls))
306  <|> (do ls <- inLexerMode (LexerMode in_field_braces) (option [] (fmap (\l -> [l]) fieldContent))
307          return (Field name ls))
308
309
310-- | Parse cabal style 'B8.ByteString' into list of 'Field's, i.e. the cabal AST.
311readFields :: B8.ByteString -> Either ParseError [Field Position]
312readFields s = fmap fst (readFields' s)
313
314-- | Like 'readFields' but also return lexer warnings
315readFields' :: B8.ByteString -> Either ParseError ([Field Position], [LexWarning])
316readFields' s = do
317    parse parser "the input" lexSt
318  where
319    parser = do
320        fields <- cabalStyleFile
321        ws     <- getLexerWarnings
322        pure (fields, ws)
323
324    lexSt = mkLexState' (mkLexState s)
325
326#ifdef CABAL_PARSEC_DEBUG
327parseTest' :: Show a => Parsec LexState' () a -> SourceName -> B8.ByteString -> IO ()
328parseTest' p fname s =
329    case parse p fname (lexSt s) of
330      Left err -> putStrLn (formatError s err)
331
332      Right x  -> print x
333  where
334    lexSt = mkLexState' . mkLexState
335
336parseFile :: Show a => Parser a -> FilePath -> IO ()
337parseFile p f = B8.readFile f >>= \s -> parseTest' p f s
338
339parseStr  :: Show a => Parser a -> String -> IO ()
340parseStr p = parseBS p . B8.pack
341
342parseBS  :: Show a => Parser a -> B8.ByteString -> IO ()
343parseBS p = parseTest' p "<input string>"
344
345formatError :: B8.ByteString -> ParseError -> String
346formatError input perr =
347    unlines
348      [ "Parse error "++ show (errorPos perr) ++ ":"
349      , errLine
350      , indicator ++ errmsg ]
351  where
352    pos       = errorPos perr
353    ls        = lines' (T.decodeUtf8With T.lenientDecode input)
354    errLine   = T.unpack (ls !! (sourceLine pos - 1))
355    indicator = replicate (sourceColumn pos) ' ' ++ "^"
356    errmsg    = showErrorMessages "or" "unknown parse error"
357                                  "expecting" "unexpected" "end of file"
358                                  (errorMessages perr)
359
360-- | Handles windows/osx/unix line breaks uniformly
361lines' :: T.Text -> [T.Text]
362lines' s1
363  | T.null s1 = []
364  | otherwise = case T.break (\c -> c == '\r' || c == '\n') s1 of
365                  (l, s2) | Just (c,s3) <- T.uncons s2
366                         -> case T.uncons s3 of
367                              Just ('\n', s4) | c == '\r' -> l : lines' s4
368                              _               -> l : lines' s3
369                          | otherwise -> [l]
370#endif
371
372eof :: Parser ()
373eof = notFollowedBy anyToken <?> "end of file"
374  where
375    notFollowedBy :: Parser LToken -> Parser ()
376    notFollowedBy p = try (    (do L _ t <- try p; unexpected (describeToken t))
377                           <|> return ())
378