1{-# LANGUAGE CPP #-} 2{-# LANGUAGE MultiParamTypeClasses #-} 3{-# LANGUAGE OverloadedStrings #-} 4{-# LANGUAGE PatternGuards #-} 5----------------------------------------------------------------------------- 6-- | 7-- Module : Distribution.Fields.Parser 8-- License : BSD3 9-- 10-- Maintainer : cabal-devel@haskell.org 11-- Portability : portable 12module Distribution.Fields.Parser ( 13 -- * Types 14 Field(..), 15 Name(..), 16 FieldLine(..), 17 SectionArg(..), 18 -- * Grammar and parsing 19 -- $grammar 20 readFields, 21 readFields', 22#ifdef CABAL_PARSEC_DEBUG 23 -- * Internal 24 parseFile, 25 parseStr, 26 parseBS, 27#endif 28 ) where 29 30import Control.Monad (guard) 31import qualified Data.ByteString.Char8 as B8 32import Data.Functor.Identity 33import Distribution.Compat.Prelude 34import Distribution.Fields.Field 35import Distribution.Fields.Lexer 36import Distribution.Fields.LexerMonad 37 (LexResult (..), LexState (..), LexWarning (..), unLex) 38import Distribution.Parsec.Position (Position (..)) 39import Prelude () 40import Text.Parsec.Combinator hiding (eof, notFollowedBy) 41import Text.Parsec.Error 42import Text.Parsec.Pos 43import Text.Parsec.Prim hiding (many, (<|>)) 44 45#ifdef CABAL_PARSEC_DEBUG 46import qualified Data.Text as T 47import qualified Data.Text.Encoding as T 48import qualified Data.Text.Encoding.Error as T 49#endif 50 51-- | The 'LexState'' (with a prime) is an instance of parsec's 'Stream' 52-- wrapped around lexer's 'LexState' (without a prime) 53data LexState' = LexState' !LexState (LToken, LexState') 54 55mkLexState' :: LexState -> LexState' 56mkLexState' st = LexState' st 57 (case unLex lexToken st of LexResult st' tok -> (tok, mkLexState' st')) 58 59type Parser a = ParsecT LexState' () Identity a 60 61instance Stream LexState' Identity LToken where 62 uncons (LexState' _ (tok, st')) = 63 case tok of 64 L _ EOF -> return Nothing 65 _ -> return (Just (tok, st')) 66 67-- | Get lexer warnings accumulated so far 68getLexerWarnings :: Parser [LexWarning] 69getLexerWarnings = do 70 LexState' (LexState { warnings = ws }) _ <- getInput 71 return ws 72 73-- | Set Alex code i.e. the mode "state" lexer is in. 74setLexerMode :: Int -> Parser () 75setLexerMode code = do 76 LexState' ls _ <- getInput 77 setInput $! mkLexState' ls { curCode = code } 78 79getToken :: (Token -> Maybe a) -> Parser a 80getToken getTok = getTokenWithPos (\(L _ t) -> getTok t) 81 82getTokenWithPos :: (LToken -> Maybe a) -> Parser a 83getTokenWithPos getTok = tokenPrim (\(L _ t) -> describeToken t) updatePos getTok 84 where 85 updatePos :: SourcePos -> LToken -> LexState' -> SourcePos 86 updatePos pos (L (Position col line) _) _ = newPos (sourceName pos) col line 87 88describeToken :: Token -> String 89describeToken t = case t of 90 TokSym s -> "symbol " ++ show s 91 TokStr s -> "string " ++ show s 92 TokOther s -> "operator " ++ show s 93 Indent _ -> "new line" 94 TokFieldLine _ -> "field content" 95 Colon -> "\":\"" 96 OpenBrace -> "\"{\"" 97 CloseBrace -> "\"}\"" 98-- SemiColon -> "\";\"" 99 EOF -> "end of file" 100 LexicalError is -> "character in input " ++ show (B8.head is) 101 102tokSym :: Parser (Name Position) 103tokSym', tokStr, tokOther :: Parser (SectionArg Position) 104tokIndent :: Parser Int 105tokColon, tokOpenBrace, tokCloseBrace :: Parser () 106tokFieldLine :: Parser (FieldLine Position) 107 108tokSym = getTokenWithPos $ \t -> case t of L pos (TokSym x) -> Just (mkName pos x); _ -> Nothing 109tokSym' = getTokenWithPos $ \t -> case t of L pos (TokSym x) -> Just (SecArgName pos x); _ -> Nothing 110tokStr = getTokenWithPos $ \t -> case t of L pos (TokStr x) -> Just (SecArgStr pos x); _ -> Nothing 111tokOther = getTokenWithPos $ \t -> case t of L pos (TokOther x) -> Just (SecArgOther pos x); _ -> Nothing 112tokIndent = getToken $ \t -> case t of Indent x -> Just x; _ -> Nothing 113tokColon = getToken $ \t -> case t of Colon -> Just (); _ -> Nothing 114tokOpenBrace = getToken $ \t -> case t of OpenBrace -> Just (); _ -> Nothing 115tokCloseBrace = getToken $ \t -> case t of CloseBrace -> Just (); _ -> Nothing 116tokFieldLine = getTokenWithPos $ \t -> case t of L pos (TokFieldLine s) -> Just (FieldLine pos s); _ -> Nothing 117 118colon, openBrace, closeBrace :: Parser () 119 120sectionArg :: Parser (SectionArg Position) 121sectionArg = tokSym' <|> tokStr <|> tokOther <?> "section parameter" 122 123fieldSecName :: Parser (Name Position) 124fieldSecName = tokSym <?> "field or section name" 125 126colon = tokColon <?> "\":\"" 127openBrace = tokOpenBrace <?> "\"{\"" 128closeBrace = tokCloseBrace <?> "\"}\"" 129 130fieldContent :: Parser (FieldLine Position) 131fieldContent = tokFieldLine <?> "field contents" 132 133newtype IndentLevel = IndentLevel Int 134 135zeroIndentLevel :: IndentLevel 136zeroIndentLevel = IndentLevel 0 137 138incIndentLevel :: IndentLevel -> IndentLevel 139incIndentLevel (IndentLevel i) = IndentLevel (succ i) 140 141indentOfAtLeast :: IndentLevel -> Parser IndentLevel 142indentOfAtLeast (IndentLevel i) = try $ do 143 j <- tokIndent 144 guard (j >= i) <?> "indentation of at least " ++ show i 145 return (IndentLevel j) 146 147 148newtype LexerMode = LexerMode Int 149 150inLexerMode :: LexerMode -> Parser p -> Parser p 151inLexerMode (LexerMode mode) p = 152 do setLexerMode mode; x <- p; setLexerMode in_section; return x 153 154 155----------------------- 156-- Cabal file grammar 157-- 158 159-- $grammar 160-- 161-- @ 162-- CabalStyleFile ::= SecElems 163-- 164-- SecElems ::= SecElem* '\\n'? 165-- SecElem ::= '\\n' SecElemLayout | SecElemBraces 166-- SecElemLayout ::= FieldLayout | FieldBraces | SectionLayout | SectionBraces 167-- SecElemBraces ::= FieldInline | FieldBraces | SectionBraces 168-- FieldLayout ::= name ':' line? ('\\n' line)* 169-- FieldBraces ::= name ':' '\\n'? '{' content '}' 170-- FieldInline ::= name ':' content 171-- SectionLayout ::= name arg* SecElems 172-- SectionBraces ::= name arg* '\\n'? '{' SecElems '}' 173-- @ 174-- 175-- and the same thing but left factored... 176-- 177-- @ 178-- SecElems ::= SecElem* 179-- SecElem ::= '\\n' name SecElemLayout 180-- | name SecElemBraces 181-- SecElemLayout ::= ':' FieldLayoutOrBraces 182-- | arg* SectionLayoutOrBraces 183-- FieldLayoutOrBraces ::= '\\n'? '{' content '}' 184-- | line? ('\\n' line)* 185-- SectionLayoutOrBraces ::= '\\n'? '{' SecElems '\\n'? '}' 186-- | SecElems 187-- SecElemBraces ::= ':' FieldInlineOrBraces 188-- | arg* '\\n'? '{' SecElems '\\n'? '}' 189-- FieldInlineOrBraces ::= '\\n'? '{' content '}' 190-- | content 191-- @ 192-- 193-- Note how we have several productions with the sequence: 194-- 195-- > '\\n'? '{' 196-- 197-- That is, an optional newline (and indent) followed by a @{@ token. 198-- In the @SectionLayoutOrBraces@ case you can see that this makes it 199-- not fully left factored (because @SecElems@ can start with a @\\n@). 200-- Fully left factoring here would be ugly, and though we could use a 201-- lookahead of two tokens to resolve the alternatives, we can't 202-- conveniently use Parsec's 'try' here to get a lookahead of only two. 203-- So instead we deal with this case in the lexer by making a line 204-- where the first non-space is @{@ lex as just the @{@ token, without 205-- the usual indent token. Then in the parser we can resolve everything 206-- with just one token of lookahead and so without using 'try'. 207 208-- Top level of a file using cabal syntax 209-- 210cabalStyleFile :: Parser [Field Position] 211cabalStyleFile = do es <- elements zeroIndentLevel 212 eof 213 return es 214 215-- Elements that live at the top level or inside a section, ie fields 216-- and sectionscontent 217-- 218-- elements ::= element* 219elements :: IndentLevel -> Parser [Field Position] 220elements ilevel = many (element ilevel) 221 222-- An individual element, ie a field or a section. These can either use 223-- layout style or braces style. For layout style then it must start on 224-- a line on its own (so that we know its indentation level). 225-- 226-- element ::= '\\n' name elementInLayoutContext 227-- | name elementInNonLayoutContext 228element :: IndentLevel -> Parser (Field Position) 229element ilevel = 230 (do ilevel' <- indentOfAtLeast ilevel 231 name <- fieldSecName 232 elementInLayoutContext (incIndentLevel ilevel') name) 233 <|> (do name <- fieldSecName 234 elementInNonLayoutContext name) 235 236-- An element (field or section) that is valid in a layout context. 237-- In a layout context we can have fields and sections that themselves 238-- either use layout style or that use braces style. 239-- 240-- elementInLayoutContext ::= ':' fieldLayoutOrBraces 241-- | arg* sectionLayoutOrBraces 242elementInLayoutContext :: IndentLevel -> Name Position -> Parser (Field Position) 243elementInLayoutContext ilevel name = 244 (do colon; fieldLayoutOrBraces ilevel name) 245 <|> (do args <- many sectionArg 246 elems <- sectionLayoutOrBraces ilevel 247 return (Section name args elems)) 248 249-- An element (field or section) that is valid in a non-layout context. 250-- In a non-layout context we can have only have fields and sections that 251-- themselves use braces style, or inline style fields. 252-- 253-- elementInNonLayoutContext ::= ':' FieldInlineOrBraces 254-- | arg* '\\n'? '{' elements '\\n'? '}' 255elementInNonLayoutContext :: Name Position -> Parser (Field Position) 256elementInNonLayoutContext name = 257 (do colon; fieldInlineOrBraces name) 258 <|> (do args <- many sectionArg 259 openBrace 260 elems <- elements zeroIndentLevel 261 optional tokIndent 262 closeBrace 263 return (Section name args elems)) 264 265-- The body of a field, using either layout style or braces style. 266-- 267-- fieldLayoutOrBraces ::= '\\n'? '{' content '}' 268-- | line? ('\\n' line)* 269fieldLayoutOrBraces :: IndentLevel -> Name Position -> Parser (Field Position) 270fieldLayoutOrBraces ilevel name = braces <|> fieldLayout 271 where 272 braces = do 273 openBrace 274 ls <- inLexerMode (LexerMode in_field_braces) (many fieldContent) 275 closeBrace 276 return (Field name ls) 277 fieldLayout = inLexerMode (LexerMode in_field_layout) $ do 278 l <- optionMaybe fieldContent 279 ls <- many (do _ <- indentOfAtLeast ilevel; fieldContent) 280 return $ case l of 281 Nothing -> Field name ls 282 Just l' -> Field name (l' : ls) 283 284-- The body of a section, using either layout style or braces style. 285-- 286-- sectionLayoutOrBraces ::= '\\n'? '{' elements \\n? '}' 287-- | elements 288sectionLayoutOrBraces :: IndentLevel -> Parser [Field Position] 289sectionLayoutOrBraces ilevel = 290 (do openBrace 291 elems <- elements zeroIndentLevel 292 optional tokIndent 293 closeBrace 294 return elems) 295 <|> (elements ilevel) 296 297-- The body of a field, using either inline style or braces. 298-- 299-- fieldInlineOrBraces ::= '\\n'? '{' content '}' 300-- | content 301fieldInlineOrBraces :: Name Position -> Parser (Field Position) 302fieldInlineOrBraces name = 303 (do openBrace 304 ls <- inLexerMode (LexerMode in_field_braces) (many fieldContent) 305 closeBrace 306 return (Field name ls)) 307 <|> (do ls <- inLexerMode (LexerMode in_field_braces) (option [] (fmap (\l -> [l]) fieldContent)) 308 return (Field name ls)) 309 310 311-- | Parse cabal style 'B8.ByteString' into list of 'Field's, i.e. the cabal AST. 312readFields :: B8.ByteString -> Either ParseError [Field Position] 313readFields s = fmap fst (readFields' s) 314 315-- | Like 'readFields' but also return lexer warnings 316readFields' :: B8.ByteString -> Either ParseError ([Field Position], [LexWarning]) 317readFields' s = do 318 parse parser "the input" lexSt 319 where 320 parser = do 321 fields <- cabalStyleFile 322 ws <- getLexerWarnings 323 pure (fields, ws) 324 325 lexSt = mkLexState' (mkLexState s) 326 327#ifdef CABAL_PARSEC_DEBUG 328parseTest' :: Show a => Parsec LexState' () a -> SourceName -> B8.ByteString -> IO () 329parseTest' p fname s = 330 case parse p fname (lexSt s) of 331 Left err -> putStrLn (formatError s err) 332 333 Right x -> print x 334 where 335 lexSt = mkLexState' . mkLexState 336 337parseFile :: Show a => Parser a -> FilePath -> IO () 338parseFile p f = B8.readFile f >>= \s -> parseTest' p f s 339 340parseStr :: Show a => Parser a -> String -> IO () 341parseStr p = parseBS p . B8.pack 342 343parseBS :: Show a => Parser a -> B8.ByteString -> IO () 344parseBS p = parseTest' p "<input string>" 345 346formatError :: B8.ByteString -> ParseError -> String 347formatError input perr = 348 unlines 349 [ "Parse error "++ show (errorPos perr) ++ ":" 350 , errLine 351 , indicator ++ errmsg ] 352 where 353 pos = errorPos perr 354 ls = lines' (T.decodeUtf8With T.lenientDecode input) 355 errLine = T.unpack (ls !! (sourceLine pos - 1)) 356 indicator = replicate (sourceColumn pos) ' ' ++ "^" 357 errmsg = showErrorMessages "or" "unknown parse error" 358 "expecting" "unexpected" "end of file" 359 (errorMessages perr) 360 361-- | Handles windows/osx/unix line breaks uniformly 362lines' :: T.Text -> [T.Text] 363lines' s1 364 | T.null s1 = [] 365 | otherwise = case T.break (\c -> c == '\r' || c == '\n') s1 of 366 (l, s2) | Just (c,s3) <- T.uncons s2 367 -> case T.uncons s3 of 368 Just ('\n', s4) | c == '\r' -> l : lines' s4 369 _ -> l : lines' s3 370 | otherwise -> [l] 371#endif 372 373eof :: Parser () 374eof = notFollowedBy anyToken <?> "end of file" 375 where 376 notFollowedBy :: Parser LToken -> Parser () 377 notFollowedBy p = try ( (do L _ t <- try p; unexpected (describeToken t)) 378 <|> return ()) 379