1{-# LANGUAGE CPP #-} 2{-# LANGUAGE MultiParamTypeClasses #-} 3{-# LANGUAGE OverloadedStrings #-} 4{-# LANGUAGE PatternGuards #-} 5----------------------------------------------------------------------------- 6-- | 7-- Module : Distribution.Fields.Parser 8-- License : BSD3 9-- 10-- Maintainer : cabal-devel@haskell.org 11-- Portability : portable 12module Distribution.Fields.Parser ( 13 -- * Types 14 Field(..), 15 Name(..), 16 FieldLine(..), 17 SectionArg(..), 18 -- * Grammar and parsing 19 -- $grammar 20 readFields, 21 readFields', 22#ifdef CABAL_PARSEC_DEBUG 23 -- * Internal 24 parseFile, 25 parseStr, 26 parseBS, 27#endif 28 ) where 29 30import qualified Data.ByteString.Char8 as B8 31import Data.Functor.Identity 32import Distribution.Compat.Prelude 33import Distribution.Fields.Field 34import Distribution.Fields.Lexer 35import Distribution.Fields.LexerMonad 36 (LexResult (..), LexState (..), LexWarning (..), unLex) 37import Distribution.Parsec.Position (Position (..)) 38import Prelude () 39import Text.Parsec.Combinator hiding (eof, notFollowedBy) 40import Text.Parsec.Error 41import Text.Parsec.Pos 42import Text.Parsec.Prim hiding (many, (<|>)) 43 44#ifdef CABAL_PARSEC_DEBUG 45import qualified Data.Text as T 46import qualified Data.Text.Encoding as T 47import qualified Data.Text.Encoding.Error as T 48#endif 49 50-- | The 'LexState'' (with a prime) is an instance of parsec's 'Stream' 51-- wrapped around lexer's 'LexState' (without a prime) 52data LexState' = LexState' !LexState (LToken, LexState') 53 54mkLexState' :: LexState -> LexState' 55mkLexState' st = LexState' st 56 (case unLex lexToken st of LexResult st' tok -> (tok, mkLexState' st')) 57 58type Parser a = ParsecT LexState' () Identity a 59 60instance Stream LexState' Identity LToken where 61 uncons (LexState' _ (tok, st')) = 62 case tok of 63 L _ EOF -> return Nothing 64 _ -> return (Just (tok, st')) 65 66-- | Get lexer warnings accumulated so far 67getLexerWarnings :: Parser [LexWarning] 68getLexerWarnings = do 69 LexState' (LexState { warnings = ws }) _ <- getInput 70 return ws 71 72-- | Set Alex code i.e. the mode "state" lexer is in. 73setLexerMode :: Int -> Parser () 74setLexerMode code = do 75 LexState' ls _ <- getInput 76 setInput $! mkLexState' ls { curCode = code } 77 78getToken :: (Token -> Maybe a) -> Parser a 79getToken getTok = getTokenWithPos (\(L _ t) -> getTok t) 80 81getTokenWithPos :: (LToken -> Maybe a) -> Parser a 82getTokenWithPos getTok = tokenPrim (\(L _ t) -> describeToken t) updatePos getTok 83 where 84 updatePos :: SourcePos -> LToken -> LexState' -> SourcePos 85 updatePos pos (L (Position col line) _) _ = newPos (sourceName pos) col line 86 87describeToken :: Token -> String 88describeToken t = case t of 89 TokSym s -> "symbol " ++ show s 90 TokStr s -> "string " ++ show s 91 TokOther s -> "operator " ++ show s 92 Indent _ -> "new line" 93 TokFieldLine _ -> "field content" 94 Colon -> "\":\"" 95 OpenBrace -> "\"{\"" 96 CloseBrace -> "\"}\"" 97-- SemiColon -> "\";\"" 98 EOF -> "end of file" 99 LexicalError is -> "character in input " ++ show (B8.head is) 100 101tokSym :: Parser (Name Position) 102tokSym', tokStr, tokOther :: Parser (SectionArg Position) 103tokIndent :: Parser Int 104tokColon, tokOpenBrace, tokCloseBrace :: Parser () 105tokFieldLine :: Parser (FieldLine Position) 106 107tokSym = getTokenWithPos $ \t -> case t of L pos (TokSym x) -> Just (mkName pos x); _ -> Nothing 108tokSym' = getTokenWithPos $ \t -> case t of L pos (TokSym x) -> Just (SecArgName pos x); _ -> Nothing 109tokStr = getTokenWithPos $ \t -> case t of L pos (TokStr x) -> Just (SecArgStr pos x); _ -> Nothing 110tokOther = getTokenWithPos $ \t -> case t of L pos (TokOther x) -> Just (SecArgOther pos x); _ -> Nothing 111tokIndent = getToken $ \t -> case t of Indent x -> Just x; _ -> Nothing 112tokColon = getToken $ \t -> case t of Colon -> Just (); _ -> Nothing 113tokOpenBrace = getToken $ \t -> case t of OpenBrace -> Just (); _ -> Nothing 114tokCloseBrace = getToken $ \t -> case t of CloseBrace -> Just (); _ -> Nothing 115tokFieldLine = getTokenWithPos $ \t -> case t of L pos (TokFieldLine s) -> Just (FieldLine pos s); _ -> Nothing 116 117colon, openBrace, closeBrace :: Parser () 118 119sectionArg :: Parser (SectionArg Position) 120sectionArg = tokSym' <|> tokStr <|> tokOther <?> "section parameter" 121 122fieldSecName :: Parser (Name Position) 123fieldSecName = tokSym <?> "field or section name" 124 125colon = tokColon <?> "\":\"" 126openBrace = tokOpenBrace <?> "\"{\"" 127closeBrace = tokCloseBrace <?> "\"}\"" 128 129fieldContent :: Parser (FieldLine Position) 130fieldContent = tokFieldLine <?> "field contents" 131 132newtype IndentLevel = IndentLevel Int 133 134zeroIndentLevel :: IndentLevel 135zeroIndentLevel = IndentLevel 0 136 137incIndentLevel :: IndentLevel -> IndentLevel 138incIndentLevel (IndentLevel i) = IndentLevel (succ i) 139 140indentOfAtLeast :: IndentLevel -> Parser IndentLevel 141indentOfAtLeast (IndentLevel i) = try $ do 142 j <- tokIndent 143 guard (j >= i) <?> "indentation of at least " ++ show i 144 return (IndentLevel j) 145 146 147newtype LexerMode = LexerMode Int 148 149inLexerMode :: LexerMode -> Parser p -> Parser p 150inLexerMode (LexerMode mode) p = 151 do setLexerMode mode; x <- p; setLexerMode in_section; return x 152 153 154----------------------- 155-- Cabal file grammar 156-- 157 158-- $grammar 159-- 160-- @ 161-- CabalStyleFile ::= SecElems 162-- 163-- SecElems ::= SecElem* '\\n'? 164-- SecElem ::= '\\n' SecElemLayout | SecElemBraces 165-- SecElemLayout ::= FieldLayout | FieldBraces | SectionLayout | SectionBraces 166-- SecElemBraces ::= FieldInline | FieldBraces | SectionBraces 167-- FieldLayout ::= name ':' line? ('\\n' line)* 168-- FieldBraces ::= name ':' '\\n'? '{' content '}' 169-- FieldInline ::= name ':' content 170-- SectionLayout ::= name arg* SecElems 171-- SectionBraces ::= name arg* '\\n'? '{' SecElems '}' 172-- @ 173-- 174-- and the same thing but left factored... 175-- 176-- @ 177-- SecElems ::= SecElem* 178-- SecElem ::= '\\n' name SecElemLayout 179-- | name SecElemBraces 180-- SecElemLayout ::= ':' FieldLayoutOrBraces 181-- | arg* SectionLayoutOrBraces 182-- FieldLayoutOrBraces ::= '\\n'? '{' content '}' 183-- | line? ('\\n' line)* 184-- SectionLayoutOrBraces ::= '\\n'? '{' SecElems '\\n'? '}' 185-- | SecElems 186-- SecElemBraces ::= ':' FieldInlineOrBraces 187-- | arg* '\\n'? '{' SecElems '\\n'? '}' 188-- FieldInlineOrBraces ::= '\\n'? '{' content '}' 189-- | content 190-- @ 191-- 192-- Note how we have several productions with the sequence: 193-- 194-- > '\\n'? '{' 195-- 196-- That is, an optional newline (and indent) followed by a @{@ token. 197-- In the @SectionLayoutOrBraces@ case you can see that this makes it 198-- not fully left factored (because @SecElems@ can start with a @\\n@). 199-- Fully left factoring here would be ugly, and though we could use a 200-- lookahead of two tokens to resolve the alternatives, we can't 201-- conveniently use Parsec's 'try' here to get a lookahead of only two. 202-- So instead we deal with this case in the lexer by making a line 203-- where the first non-space is @{@ lex as just the @{@ token, without 204-- the usual indent token. Then in the parser we can resolve everything 205-- with just one token of lookahead and so without using 'try'. 206 207-- Top level of a file using cabal syntax 208-- 209cabalStyleFile :: Parser [Field Position] 210cabalStyleFile = do es <- elements zeroIndentLevel 211 eof 212 return es 213 214-- Elements that live at the top level or inside a section, ie fields 215-- and sectionscontent 216-- 217-- elements ::= element* 218elements :: IndentLevel -> Parser [Field Position] 219elements ilevel = many (element ilevel) 220 221-- An individual element, ie a field or a section. These can either use 222-- layout style or braces style. For layout style then it must start on 223-- a line on its own (so that we know its indentation level). 224-- 225-- element ::= '\\n' name elementInLayoutContext 226-- | name elementInNonLayoutContext 227element :: IndentLevel -> Parser (Field Position) 228element ilevel = 229 (do ilevel' <- indentOfAtLeast ilevel 230 name <- fieldSecName 231 elementInLayoutContext (incIndentLevel ilevel') name) 232 <|> (do name <- fieldSecName 233 elementInNonLayoutContext name) 234 235-- An element (field or section) that is valid in a layout context. 236-- In a layout context we can have fields and sections that themselves 237-- either use layout style or that use braces style. 238-- 239-- elementInLayoutContext ::= ':' fieldLayoutOrBraces 240-- | arg* sectionLayoutOrBraces 241elementInLayoutContext :: IndentLevel -> Name Position -> Parser (Field Position) 242elementInLayoutContext ilevel name = 243 (do colon; fieldLayoutOrBraces ilevel name) 244 <|> (do args <- many sectionArg 245 elems <- sectionLayoutOrBraces ilevel 246 return (Section name args elems)) 247 248-- An element (field or section) that is valid in a non-layout context. 249-- In a non-layout context we can have only have fields and sections that 250-- themselves use braces style, or inline style fields. 251-- 252-- elementInNonLayoutContext ::= ':' FieldInlineOrBraces 253-- | arg* '\\n'? '{' elements '\\n'? '}' 254elementInNonLayoutContext :: Name Position -> Parser (Field Position) 255elementInNonLayoutContext name = 256 (do colon; fieldInlineOrBraces name) 257 <|> (do args <- many sectionArg 258 openBrace 259 elems <- elements zeroIndentLevel 260 optional tokIndent 261 closeBrace 262 return (Section name args elems)) 263 264-- The body of a field, using either layout style or braces style. 265-- 266-- fieldLayoutOrBraces ::= '\\n'? '{' content '}' 267-- | line? ('\\n' line)* 268fieldLayoutOrBraces :: IndentLevel -> Name Position -> Parser (Field Position) 269fieldLayoutOrBraces ilevel name = braces <|> fieldLayout 270 where 271 braces = do 272 openBrace 273 ls <- inLexerMode (LexerMode in_field_braces) (many fieldContent) 274 closeBrace 275 return (Field name ls) 276 fieldLayout = inLexerMode (LexerMode in_field_layout) $ do 277 l <- optionMaybe fieldContent 278 ls <- many (do _ <- indentOfAtLeast ilevel; fieldContent) 279 return $ case l of 280 Nothing -> Field name ls 281 Just l' -> Field name (l' : ls) 282 283-- The body of a section, using either layout style or braces style. 284-- 285-- sectionLayoutOrBraces ::= '\\n'? '{' elements \\n? '}' 286-- | elements 287sectionLayoutOrBraces :: IndentLevel -> Parser [Field Position] 288sectionLayoutOrBraces ilevel = 289 (do openBrace 290 elems <- elements zeroIndentLevel 291 optional tokIndent 292 closeBrace 293 return elems) 294 <|> (elements ilevel) 295 296-- The body of a field, using either inline style or braces. 297-- 298-- fieldInlineOrBraces ::= '\\n'? '{' content '}' 299-- | content 300fieldInlineOrBraces :: Name Position -> Parser (Field Position) 301fieldInlineOrBraces name = 302 (do openBrace 303 ls <- inLexerMode (LexerMode in_field_braces) (many fieldContent) 304 closeBrace 305 return (Field name ls)) 306 <|> (do ls <- inLexerMode (LexerMode in_field_braces) (option [] (fmap (\l -> [l]) fieldContent)) 307 return (Field name ls)) 308 309 310-- | Parse cabal style 'B8.ByteString' into list of 'Field's, i.e. the cabal AST. 311readFields :: B8.ByteString -> Either ParseError [Field Position] 312readFields s = fmap fst (readFields' s) 313 314-- | Like 'readFields' but also return lexer warnings 315readFields' :: B8.ByteString -> Either ParseError ([Field Position], [LexWarning]) 316readFields' s = do 317 parse parser "the input" lexSt 318 where 319 parser = do 320 fields <- cabalStyleFile 321 ws <- getLexerWarnings 322 pure (fields, ws) 323 324 lexSt = mkLexState' (mkLexState s) 325 326#ifdef CABAL_PARSEC_DEBUG 327parseTest' :: Show a => Parsec LexState' () a -> SourceName -> B8.ByteString -> IO () 328parseTest' p fname s = 329 case parse p fname (lexSt s) of 330 Left err -> putStrLn (formatError s err) 331 332 Right x -> print x 333 where 334 lexSt = mkLexState' . mkLexState 335 336parseFile :: Show a => Parser a -> FilePath -> IO () 337parseFile p f = B8.readFile f >>= \s -> parseTest' p f s 338 339parseStr :: Show a => Parser a -> String -> IO () 340parseStr p = parseBS p . B8.pack 341 342parseBS :: Show a => Parser a -> B8.ByteString -> IO () 343parseBS p = parseTest' p "<input string>" 344 345formatError :: B8.ByteString -> ParseError -> String 346formatError input perr = 347 unlines 348 [ "Parse error "++ show (errorPos perr) ++ ":" 349 , errLine 350 , indicator ++ errmsg ] 351 where 352 pos = errorPos perr 353 ls = lines' (T.decodeUtf8With T.lenientDecode input) 354 errLine = T.unpack (ls !! (sourceLine pos - 1)) 355 indicator = replicate (sourceColumn pos) ' ' ++ "^" 356 errmsg = showErrorMessages "or" "unknown parse error" 357 "expecting" "unexpected" "end of file" 358 (errorMessages perr) 359 360-- | Handles windows/osx/unix line breaks uniformly 361lines' :: T.Text -> [T.Text] 362lines' s1 363 | T.null s1 = [] 364 | otherwise = case T.break (\c -> c == '\r' || c == '\n') s1 of 365 (l, s2) | Just (c,s3) <- T.uncons s2 366 -> case T.uncons s3 of 367 Just ('\n', s4) | c == '\r' -> l : lines' s4 368 _ -> l : lines' s3 369 | otherwise -> [l] 370#endif 371 372eof :: Parser () 373eof = notFollowedBy anyToken <?> "end of file" 374 where 375 notFollowedBy :: Parser LToken -> Parser () 376 notFollowedBy p = try ( (do L _ t <- try p; unexpected (describeToken t)) 377 <|> return ()) 378