1 { 2 ----------------------------------------------------------------------------- 3 -- | 4 -- Module : Distribution.Fields.Lexer 5 -- License : BSD3 6 -- 7 -- Maintainer : cabal-devel@haskell.org 8 -- Portability : portable 9 -- 10 -- Lexer for the cabal files. 11 {-# LANGUAGE CPP #-} 12 {-# LANGUAGE BangPatterns #-} 13 #ifdef CABAL_PARSEC_DEBUG 14 {-# LANGUAGE PatternGuards #-} 15 #endif 16 {-# OPTIONS_GHC -fno-warn-unused-imports #-} 17 module Distribution.Fields.Lexer 18 (ltest, lexToken, Token(..), LToken(..) 19 ,bol_section, in_section, in_field_layout, in_field_braces 20 ,mkLexState) where 21 22 -- [Note: boostrapping parsec parser] 23 -- 24 -- We manually produce the `Lexer.hs` file from `boot/Lexer.x` (make lexer) 25 -- because boostrapping cabal-install would be otherwise tricky. 26 -- Alex is (atm) tricky package to build, cabal-install has some magic 27 -- to move bundled generated files in place, so rather we don't depend 28 -- on it before we can build it ourselves. 29 -- Therefore there is one thing less to worry in bootstrap.sh, which is a win. 30 -- 31 -- See also https://github.com/haskell/cabal/issues/4633 32 -- 33 34 import Prelude () 35 import qualified Prelude as Prelude 36 import Distribution.Compat.Prelude 37 38 import Distribution.Fields.LexerMonad 39 import Distribution.Parsec.Position (Position (..), incPos, retPos) 40 import Data.ByteString (ByteString) 41 import qualified Data.ByteString as B 42 import qualified Data.ByteString.Char8 as B.Char8 43 import qualified Data.Word as Word 44 45 #ifdef CABAL_PARSEC_DEBUG 46 import Debug.Trace 47 import qualified Data.Vector as V 48 import qualified Data.Text as T 49 import qualified Data.Text.Encoding as T 50 import qualified Data.Text.Encoding.Error as T 51 #endif 52 53 } 54 -- Various character classes 55 56 $space = \ -- single space char 57 $ctlchar = [\x0-\x1f \x7f] 58 $printable = \x0-\xff # $ctlchar -- so no \n \r 59 $symbol' = [ \, \= \< \> \+ \* \& \| \! \$ \% \^ \@ \# \? \/ \\ \~ ] 60 $symbol = [$symbol' \- \.] 61 $spacetab = [$space \t] 62 63 $paren = [ \( \) \[ \] ] 64 $field_layout = [$printable \t] 65 $field_layout' = [$printable] # [$space] 66 $field_braces = [$printable \t] # [\{ \}] 67 $field_braces' = [$printable] # [\{ \} $space] 68 $comment = [$printable \t] 69 $namecore = [$printable] # [$space \: \" \{ \} $paren $symbol'] 70 $instr = [$printable $space] # [\"] 71 $instresc = $printable 72 73 @bom = \xef \xbb \xbf 74 @nbsp = \xc2 \xa0 75 @nbspspacetab = ($spacetab | @nbsp) 76 @nbspspace = ($space | @nbsp) 77 @nl = \n | \r\n | \r 78 @name = $namecore+ 79 @string = \" ( $instr | \\ $instresc )* \" 80 @oplike = $symbol+ 81 82 83 tokens :- 84 85 <0> { 86 @bom? { \_ len _ -> do 87 when (len /= 0) $ addWarning LexWarningBOM 88 setStartCode bol_section 89 lexToken 90 } 91 } 92 93 <bol_section, bol_field_layout, bol_field_braces> { 94 @nbspspacetab* @nl { \_pos len inp -> checkWhitespace len inp >> adjustPos retPos >> lexToken } 95 -- no @nl here to allow for comments on last line of the file with no trailing \n 96 $spacetab* "--" $comment* ; -- TODO: check the lack of @nl works here 97 -- including counting line numbers 98 } 99 100 <bol_section> { 101 @nbspspacetab* { \pos len inp -> checkLeadingWhitespace len inp >> 102 if B.length inp == len 103 then return (L pos EOF) 104 else setStartCode in_section 105 >> return (L pos (Indent len)) } 106 $spacetab* \{ { tok OpenBrace } 107 $spacetab* \} { tok CloseBrace } 108 } 109 110 <in_section> { 111 $spacetab+ ; --TODO: don't allow tab as leading space 112 113 "--" $comment* ; 114 115 @name { toki TokSym } 116 @string { \pos len inp -> return $! L pos (TokStr (B.take (len - 2) (B.tail inp))) } 117 @oplike { toki TokOther } 118 $paren { toki TokOther } 119 \: { tok Colon } 120 \{ { tok OpenBrace } 121 \} { tok CloseBrace } 122 @nl { \_ _ _ -> adjustPos retPos >> setStartCode bol_section >> lexToken } 123 } 124 125 <bol_field_layout> { 126 @nbspspacetab* { \pos len inp -> checkLeadingWhitespace len inp >>= \len' -> 127 if B.length inp == len 128 then return (L pos EOF) 129 else setStartCode in_field_layout 130 >> return (L pos (Indent len')) } 131 } 132 133 <in_field_layout> { 134 $spacetab+; 135 $field_layout' $field_layout* { toki TokFieldLine } 136 @nl { \_ _ _ -> adjustPos retPos >> setStartCode bol_field_layout >> lexToken } 137 } 138 139 <bol_field_braces> { 140 () { \_ _ _ -> setStartCode in_field_braces >> lexToken } 141 } 142 143 <in_field_braces> { 144 $spacetab+; 145 $field_braces' $field_braces* { toki TokFieldLine } 146 \{ { tok OpenBrace } 147 \} { tok CloseBrace } 148 @nl { \_ _ _ -> adjustPos retPos >> setStartCode bol_field_braces >> lexToken } 149 } 150 151 { 152 153 -- | Tokens of outer cabal file structure. Field values are treated opaquely. 154 data Token = TokSym !ByteString -- ^ Haskell-like identifier, number or operator 155 | TokStr !ByteString -- ^ String in quotes 156 | TokOther !ByteString -- ^ Operators and parens 157 | Indent !Int -- ^ Indentation token 158 | TokFieldLine !ByteString -- ^ Lines after @:@ 159 | Colon 160 | OpenBrace 161 | CloseBrace 162 | EOF 163 | LexicalError InputStream --TODO: add separate string lexical error 164 deriving Show 165 166 data LToken = L !Position !Token 167 deriving Show 168 169 toki :: (ByteString -> Token) -> Position -> Int -> ByteString -> Lex LToken 170 toki t pos len input = return $! L pos (t (B.take len input)) 171 172 tok :: Token -> Position -> Int -> ByteString -> Lex LToken 173 tok t pos _len _input = return $! L pos t 174 175 checkLeadingWhitespace :: Int -> ByteString -> Lex Int 176 checkLeadingWhitespace len bs 177 | B.any (== 9) (B.take len bs) = do 178 addWarning LexWarningTab 179 checkWhitespace len bs 180 | otherwise = checkWhitespace len bs 181 182 checkWhitespace :: Int -> ByteString -> Lex Int 183 checkWhitespace len bs 184 | B.any (== 194) (B.take len bs) = do 185 addWarning LexWarningNBSP 186 return $ len - B.count 194 (B.take len bs) 187 | otherwise = return len 188 189 -- ----------------------------------------------------------------------------- 190 -- The input type 191 192 type AlexInput = InputStream 193 194 alexInputPrevChar :: AlexInput -> Char 195 alexInputPrevChar _ = error "alexInputPrevChar not used" 196 197 alexGetByte :: AlexInput -> Maybe (Word.Word8,AlexInput) 198 alexGetByte = B.uncons 199 200 lexicalError :: Position -> InputStream -> Lex LToken 201 lexicalError pos inp = do 202 setInput B.empty 203 return $! L pos (LexicalError inp) 204 205 lexToken :: Lex LToken 206 lexToken = do 207 pos <- getPos 208 inp <- getInput 209 st <- getStartCode 210 case alexScan inp st of 211 AlexEOF -> return (L pos EOF) 212 AlexError inp' -> 213 let !len_bytes = B.length inp - B.length inp' in 214 --FIXME: we want len_chars here really 215 -- need to decode utf8 up to this point 216 lexicalError (incPos len_bytes pos) inp' 217 AlexSkip inp' len_chars -> do 218 checkPosition pos inp inp' len_chars 219 adjustPos (incPos len_chars) 220 setInput inp' 221 lexToken 222 AlexToken inp' len_chars action -> do 223 checkPosition pos inp inp' len_chars 224 adjustPos (incPos len_chars) 225 setInput inp' 226 let !len_bytes = B.length inp - B.length inp' 227 t <- action pos len_bytes inp 228 --traceShow t $ return tok 229 return t 230 231 232 checkPosition :: Position -> ByteString -> ByteString -> Int -> Lex () 233 #ifdef CABAL_PARSEC_DEBUG 234 checkPosition pos@(Position lineno colno) inp inp' len_chars = do 235 text_lines <- getDbgText 236 let len_bytes = B.length inp - B.length inp' 237 pos_txt | lineno-1 < V.length text_lines = T.take len_chars (T.drop (colno-1) (text_lines V.! (lineno-1))) 238 | otherwise = T.empty 239 real_txt = B.take len_bytes inp 240 when (pos_txt /= T.decodeUtf8 real_txt) $ 241 traceShow (pos, pos_txt, T.decodeUtf8 real_txt) $ 242 traceShow (take 3 (V.toList text_lines)) $ return () 243 where 244 getDbgText = Lex $ \s@LexState{ dbgText = txt } -> LexResult s txt 245 #else 246 checkPosition _ _ _ _ = return () 247 #endif 248 249 lexAll :: Lex [LToken] 250 lexAll = do 251 t <- lexToken 252 case t of 253 L _ EOF -> return [t] 254 _ -> do ts <- lexAll 255 return (t : ts) 256 257 ltest :: Int -> String -> Prelude.IO () 258 ltest code s = 259 let (ws, xs) = execLexer (setStartCode code >> lexAll) (B.Char8.pack s) 260 in traverse_ print ws >> traverse_ print xs 261 262 263 mkLexState :: ByteString -> LexState 264 mkLexState input = LexState 265 { curPos = Position 1 1 266 , curInput = input 267 , curCode = 0 268 , warnings = [] 269 #ifdef CABAL_PARSEC_DEBUG 270 , dbgText = V.fromList . lines' . T.decodeUtf8With T.lenientDecode $ input 271 #endif 272 } 273 274 #ifdef CABAL_PARSEC_DEBUG 275 lines' :: T.Text -> [T.Text] 276 lines' s1 277 | T.null s1 = [] 278 | otherwise = case T.break (\c -> c == '\r' || c == '\n') s1 of 279 (l, s2) | Just (c,s3) <- T.uncons s2 280 -> case T.uncons s3 of 281 Just ('\n', s4) | c == '\r' -> l `T.snoc` '\r' `T.snoc` '\n' : lines' s4 282 _ -> l `T.snoc` c : lines' s3 283 284 | otherwise 285 -> [l] 286 #endif 287 } 288