Distribution/Fields/Parser.hs

{-# LANGUAGE CPP                   #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE OverloadedStrings     #-}
{-# LANGUAGE PatternGuards         #-}
-----------------------------------------------------------------------------
-- |
-- Module      :  Distribution.Fields.Parser
-- License     :  BSD3
--
-- Maintainer  :  cabal-devel@haskell.org
-- Portability :  portable
module Distribution.Fields.Parser (
    -- * Types
    Field(..),
    Name(..),
    FieldLine(..),
    SectionArg(..),
    -- * Grammar and parsing
    -- $grammar
    readFields,
    readFields',
#ifdef CABAL_PARSEC_DEBUG
    -- * Internal
    parseFile,
    parseStr,
    parseBS,
#endif
    ) where

import qualified Data.ByteString.Char8          as B8
import           Data.Functor.Identity
import           Distribution.Compat.Prelude
import           Distribution.Fields.Field
import           Distribution.Fields.Lexer
import           Distribution.Fields.LexerMonad
                 (LexResult (..), LexState (..), LexWarning (..), unLex)
import           Distribution.Parsec.Position   (Position (..))
import           Prelude ()
import           Text.Parsec.Combinator         hiding (eof, notFollowedBy)
import           Text.Parsec.Error
import           Text.Parsec.Pos
import           Text.Parsec.Prim               hiding (many, (<|>))

#ifdef CABAL_PARSEC_DEBUG
import qualified Data.Text                as T
import qualified Data.Text.Encoding       as T
import qualified Data.Text.Encoding.Error as T
#endif

-- | The 'LexState'' (with a prime) is an instance of parsec's 'Stream'
-- wrapped around lexer's 'LexState' (without a prime)
data LexState' = LexState' !LexState (LToken, LexState')

mkLexState' :: LexState -> LexState'
mkLexState' st = LexState' st
                   (case unLex lexToken st of LexResult st' tok -> (tok, mkLexState' st'))

type Parser a = ParsecT LexState' () Identity a

instance Stream LexState' Identity LToken where
  uncons (LexState' _ (tok, st')) =
    case tok of
      L _ EOF -> return Nothing
      _       -> return (Just (tok, st'))

-- | Get lexer warnings accumulated so far
getLexerWarnings :: Parser [LexWarning]
getLexerWarnings = do
  LexState' (LexState { warnings = ws }) _ <- getInput
  return ws

-- | Set Alex code i.e. the mode "state" lexer is in.
setLexerMode :: Int -> Parser ()
setLexerMode code = do
  LexState' ls _ <- getInput
  setInput $! mkLexState' ls { curCode = code }

getToken :: (Token -> Maybe a) -> Parser a
getToken getTok = getTokenWithPos (\(L _ t) -> getTok t)

getTokenWithPos :: (LToken -> Maybe a) -> Parser a
getTokenWithPos getTok = tokenPrim (\(L _ t) -> describeToken t) updatePos getTok
  where
    updatePos :: SourcePos -> LToken -> LexState' -> SourcePos
    updatePos pos (L (Position col line) _) _ = newPos (sourceName pos) col line

describeToken :: Token -> String
describeToken t = case t of
  TokSym   s      -> "symbol "   ++ show s
  TokStr   s      -> "string "   ++ show s
  TokOther s      -> "operator " ++ show s
  Indent _        -> "new line"
  TokFieldLine _  -> "field content"
  Colon           -> "\":\""
  OpenBrace       -> "\"{\""
  CloseBrace      -> "\"}\""
--  SemiColon       -> "\";\""
  EOF             -> "end of file"
  LexicalError is -> "character in input " ++ show (B8.head is)

tokSym :: Parser (Name Position)
tokSym', tokStr, tokOther :: Parser (SectionArg Position)
tokIndent :: Parser Int
tokColon, tokOpenBrace, tokCloseBrace :: Parser ()
tokFieldLine :: Parser (FieldLine Position)

tokSym        = getTokenWithPos $ \t -> case t of L pos (TokSym   x) -> Just (mkName pos x);  _ -> Nothing
tokSym'       = getTokenWithPos $ \t -> case t of L pos (TokSym   x) -> Just (SecArgName pos x);  _ -> Nothing
tokStr        = getTokenWithPos $ \t -> case t of L pos (TokStr   x) -> Just (SecArgStr pos x);  _ -> Nothing
tokOther      = getTokenWithPos $ \t -> case t of L pos (TokOther x) -> Just (SecArgOther pos x);  _ -> Nothing
tokIndent     = getToken $ \t -> case t of Indent   x -> Just x;  _ -> Nothing
tokColon      = getToken $ \t -> case t of Colon      -> Just (); _ -> Nothing
tokOpenBrace  = getToken $ \t -> case t of OpenBrace  -> Just (); _ -> Nothing
tokCloseBrace = getToken $ \t -> case t of CloseBrace -> Just (); _ -> Nothing
tokFieldLine  = getTokenWithPos $ \t -> case t of L pos (TokFieldLine s) -> Just (FieldLine pos s); _ -> Nothing

colon, openBrace, closeBrace :: Parser ()

sectionArg :: Parser (SectionArg Position)
sectionArg   = tokSym' <|> tokStr <|> tokOther <?> "section parameter"

fieldSecName :: Parser (Name Position)
fieldSecName = tokSym              <?> "field or section name"

colon        = tokColon      <?> "\":\""
openBrace    = tokOpenBrace  <?> "\"{\""
closeBrace   = tokCloseBrace <?> "\"}\""

fieldContent :: Parser (FieldLine Position)
fieldContent = tokFieldLine <?> "field contents"

newtype IndentLevel = IndentLevel Int

zeroIndentLevel :: IndentLevel
zeroIndentLevel = IndentLevel 0

incIndentLevel :: IndentLevel -> IndentLevel
incIndentLevel (IndentLevel i) = IndentLevel (succ i)

indentOfAtLeast :: IndentLevel -> Parser IndentLevel
indentOfAtLeast (IndentLevel i) = try $ do
  j <- tokIndent
  guard (j >= i) <?> "indentation of at least " ++ show i
  return (IndentLevel j)


newtype LexerMode = LexerMode Int

inLexerMode :: LexerMode -> Parser p -> Parser p
inLexerMode (LexerMode mode) p =
  do setLexerMode mode; x <- p; setLexerMode in_section; return x


-----------------------
-- Cabal file grammar
--

-- $grammar
--
-- @
-- CabalStyleFile ::= SecElems
--
-- SecElems       ::= SecElem* '\\n'?
-- SecElem        ::= '\\n' SecElemLayout | SecElemBraces
-- SecElemLayout  ::= FieldLayout | FieldBraces | SectionLayout | SectionBraces
-- SecElemBraces  ::= FieldInline | FieldBraces |                 SectionBraces
-- FieldLayout    ::= name ':' line? ('\\n' line)*
-- FieldBraces    ::= name ':' '\\n'? '{' content '}'
-- FieldInline    ::= name ':' content
-- SectionLayout  ::= name arg* SecElems
-- SectionBraces  ::= name arg* '\\n'? '{' SecElems '}'
-- @
--
-- and the same thing but left factored...
--
-- @
-- SecElems              ::= SecElem*
-- SecElem               ::= '\\n' name SecElemLayout
--                         |      name SecElemBraces
-- SecElemLayout         ::= ':'   FieldLayoutOrBraces
--                         | arg*  SectionLayoutOrBraces
-- FieldLayoutOrBraces   ::= '\\n'? '{' content '}'
--                         | line? ('\\n' line)*
-- SectionLayoutOrBraces ::= '\\n'? '{' SecElems '\\n'? '}'
--                         | SecElems
-- SecElemBraces         ::= ':' FieldInlineOrBraces
--                         | arg* '\\n'? '{' SecElems '\\n'? '}'
-- FieldInlineOrBraces   ::= '\\n'? '{' content '}'
--                         | content
-- @
--
-- Note how we have several productions with the sequence:
--
-- > '\\n'? '{'
--
-- That is, an optional newline (and indent) followed by a @{@ token.
-- In the @SectionLayoutOrBraces@ case you can see that this makes it
-- not fully left factored (because @SecElems@ can start with a @\\n@).
-- Fully left factoring here would be ugly, and though we could use a
-- lookahead of two tokens to resolve the alternatives, we can't
-- conveniently use Parsec's 'try' here to get a lookahead of only two.
-- So instead we deal with this case in the lexer by making a line
-- where the first non-space is @{@ lex as just the @{@ token, without
-- the usual indent token. Then in the parser we can resolve everything
-- with just one token of lookahead and so without using 'try'.

-- Top level of a file using cabal syntax
--
cabalStyleFile :: Parser [Field Position]
cabalStyleFile = do es <- elements zeroIndentLevel
                    eof
                    return es

-- Elements that live at the top level or inside a section, ie fields
-- and sectionscontent
--
-- elements ::= element*
elements :: IndentLevel -> Parser [Field Position]
elements ilevel = many (element ilevel)

-- An individual element, ie a field or a section. These can either use
-- layout style or braces style. For layout style then it must start on
-- a line on its own (so that we know its indentation level).
--
-- element ::= '\\n' name elementInLayoutContext
--           |      name elementInNonLayoutContext
element :: IndentLevel -> Parser (Field Position)
element ilevel =
      (do ilevel' <- indentOfAtLeast ilevel
          name    <- fieldSecName
          elementInLayoutContext (incIndentLevel ilevel') name)
  <|> (do name    <- fieldSecName
          elementInNonLayoutContext name)

-- An element (field or section) that is valid in a layout context.
-- In a layout context we can have fields and sections that themselves
-- either use layout style or that use braces style.
--
-- elementInLayoutContext ::= ':'  fieldLayoutOrBraces
--                          | arg* sectionLayoutOrBraces
elementInLayoutContext :: IndentLevel -> Name Position -> Parser (Field Position)
elementInLayoutContext ilevel name =
      (do colon; fieldLayoutOrBraces ilevel name)
  <|> (do args  <- many sectionArg
          elems <- sectionLayoutOrBraces ilevel
          return (Section name args elems))

-- An element (field or section) that is valid in a non-layout context.
-- In a non-layout context we can have only have fields and sections that
-- themselves use braces style, or inline style fields.
--
-- elementInNonLayoutContext ::= ':' FieldInlineOrBraces
--                             | arg* '\\n'? '{' elements '\\n'? '}'
elementInNonLayoutContext :: Name Position -> Parser (Field Position)
elementInNonLayoutContext name =
      (do colon; fieldInlineOrBraces name)
  <|> (do args <- many sectionArg
          openBrace
          elems <- elements zeroIndentLevel
          optional tokIndent
          closeBrace
          return (Section name args elems))

-- The body of a field, using either layout style or braces style.
--
-- fieldLayoutOrBraces   ::= '\\n'? '{' content '}'
--                         | line? ('\\n' line)*
fieldLayoutOrBraces :: IndentLevel -> Name Position -> Parser (Field Position)
fieldLayoutOrBraces ilevel name = braces <|> fieldLayout
  where
    braces = do
          openBrace
          ls <- inLexerMode (LexerMode in_field_braces) (many fieldContent)
          closeBrace
          return (Field name ls)
    fieldLayout = inLexerMode (LexerMode in_field_layout) $ do
          l  <- optionMaybe fieldContent
          ls <- many (do _ <- indentOfAtLeast ilevel; fieldContent)
          return $ case l of
              Nothing -> Field name ls
              Just l' -> Field name (l' : ls)

-- The body of a section, using either layout style or braces style.
--
-- sectionLayoutOrBraces ::= '\\n'? '{' elements \\n? '}'
--                         | elements
sectionLayoutOrBraces :: IndentLevel -> Parser [Field Position]
sectionLayoutOrBraces ilevel =
      (do openBrace
          elems <- elements zeroIndentLevel
          optional tokIndent
          closeBrace
          return elems)
  <|> (elements ilevel)

-- The body of a field, using either inline style or braces.
--
-- fieldInlineOrBraces   ::= '\\n'? '{' content '}'
--                         | content
fieldInlineOrBraces :: Name Position -> Parser (Field Position)
fieldInlineOrBraces name =
      (do openBrace
          ls <- inLexerMode (LexerMode in_field_braces) (many fieldContent)
          closeBrace
          return (Field name ls))
  <|> (do ls <- inLexerMode (LexerMode in_field_braces) (option [] (fmap (\l -> [l]) fieldContent))
          return (Field name ls))


-- | Parse cabal style 'B8.ByteString' into list of 'Field's, i.e. the cabal AST.
readFields :: B8.ByteString -> Either ParseError [Field Position]
readFields s = fmap fst (readFields' s)

-- | Like 'readFields' but also return lexer warnings
readFields' :: B8.ByteString -> Either ParseError ([Field Position], [LexWarning])
readFields' s = do
    parse parser "the input" lexSt
  where
    parser = do
        fields <- cabalStyleFile
        ws     <- getLexerWarnings
        pure (fields, ws)

    lexSt = mkLexState' (mkLexState s)

#ifdef CABAL_PARSEC_DEBUG
parseTest' :: Show a => Parsec LexState' () a -> SourceName -> B8.ByteString -> IO ()
parseTest' p fname s =
    case parse p fname (lexSt s) of
      Left err -> putStrLn (formatError s err)

      Right x  -> print x
  where
    lexSt = mkLexState' . mkLexState

parseFile :: Show a => Parser a -> FilePath -> IO ()
parseFile p f = B8.readFile f >>= \s -> parseTest' p f s

parseStr  :: Show a => Parser a -> String -> IO ()
parseStr p = parseBS p . B8.pack

parseBS  :: Show a => Parser a -> B8.ByteString -> IO ()
parseBS p = parseTest' p "<input string>"

formatError :: B8.ByteString -> ParseError -> String
formatError input perr =
    unlines
      [ "Parse error "++ show (errorPos perr) ++ ":"
      , errLine
      , indicator ++ errmsg ]
  where
    pos       = errorPos perr
    ls        = lines' (T.decodeUtf8With T.lenientDecode input)
    errLine   = T.unpack (ls !! (sourceLine pos - 1))
    indicator = replicate (sourceColumn pos) ' ' ++ "^"
    errmsg    = showErrorMessages "or" "unknown parse error"
                                  "expecting" "unexpected" "end of file"
                                  (errorMessages perr)

-- | Handles windows/osx/unix line breaks uniformly
lines' :: T.Text -> [T.Text]
lines' s1
  | T.null s1 = []
  | otherwise = case T.break (\c -> c == '\r' || c == '\n') s1 of
                  (l, s2) | Just (c,s3) <- T.uncons s2
                         -> case T.uncons s3 of
                              Just ('\n', s4) | c == '\r' -> l : lines' s4
                              _               -> l : lines' s3
                          | otherwise -> [l]
#endif

eof :: Parser ()
eof = notFollowedBy anyToken <?> "end of file"
  where
    notFollowedBy :: Parser LToken -> Parser ()
    notFollowedBy p = try (    (do L _ t <- try p; unexpected (describeToken t))
                           <|> return ())