1 {
2 -----------------------------------------------------------------------------
3 -- |
4 -- Module      :  Distribution.Fields.Lexer
5 -- License     :  BSD3
6 --
7 -- Maintainer  :  cabal-devel@haskell.org
8 -- Portability :  portable
9 --
10 -- Lexer for the cabal files.
11 {-# LANGUAGE CPP #-}
12 {-# LANGUAGE BangPatterns #-}
13 #ifdef CABAL_PARSEC_DEBUG
14 {-# LANGUAGE PatternGuards #-}
15 #endif
16 {-# OPTIONS_GHC -fno-warn-unused-imports #-}
17 module Distribution.Fields.Lexer
18   (ltest, lexToken, Token(..), LToken(..)
19   ,bol_section, in_section, in_field_layout, in_field_braces
20   ,mkLexState) where
21 
22 -- [Note: boostrapping parsec parser]
23 --
24 -- We manually produce the `Lexer.hs` file from `boot/Lexer.x` (make lexer)
25 -- because boostrapping cabal-install would be otherwise tricky.
26 -- Alex is (atm) tricky package to build, cabal-install has some magic
27 -- to move bundled generated files in place, so rather we don't depend
28 -- on it before we can build it ourselves.
29 -- Therefore there is one thing less to worry in bootstrap.sh, which is a win.
30 --
31 -- See also https://github.com/haskell/cabal/issues/4633
32 --
33 
34 import Prelude ()
35 import qualified Prelude as Prelude
36 import Distribution.Compat.Prelude
37 
38 import Distribution.Fields.LexerMonad
39 import Distribution.Parsec.Position (Position (..), incPos, retPos)
40 import Data.ByteString (ByteString)
41 import qualified Data.ByteString as B
42 import qualified Data.ByteString.Char8 as B.Char8
43 import qualified Data.Word as Word
44 
45 #ifdef CABAL_PARSEC_DEBUG
46 import Debug.Trace
47 import qualified Data.Vector as V
48 import qualified Data.Text   as T
49 import qualified Data.Text.Encoding as T
50 import qualified Data.Text.Encoding.Error as T
51 #endif
52 
53 }
54 -- Various character classes
55 
56 $space           = \          -- single space char
57 $ctlchar         = [\x0-\x1f \x7f]
58 $printable       = \x0-\xff # $ctlchar   -- so no \n \r
59 $symbol'         = [ \, \= \< \> \+ \* \& \| \! \$ \% \^ \@ \# \? \/ \\ \~ ]
60 $symbol          = [$symbol' \- \.]
61 $spacetab        = [$space \t]
62 
63 $paren           = [ \( \) \[ \] ]
64 $field_layout    = [$printable \t]
65 $field_layout'   = [$printable] # [$space]
66 $field_braces    = [$printable \t] # [\{ \}]
67 $field_braces'   = [$printable] # [\{ \} $space]
68 $comment         = [$printable \t]
69 $namecore        = [$printable] # [$space \: \" \{ \} $paren $symbol']
70 $instr           = [$printable $space] # [\"]
71 $instresc        = $printable
72 
73 @bom          = \xef \xbb \xbf
74 @nbsp         = \xc2 \xa0
75 @nbspspacetab = ($spacetab | @nbsp)
76 @nbspspace    = ($space | @nbsp)
77 @nl           = \n | \r\n | \r
78 @name         = $namecore+
79 @string       = \" ( $instr | \\ $instresc )* \"
80 @oplike       = $symbol+
81 
82 
83 tokens :-
84 
85 <0> {
86   @bom?  { \_ len _ -> do
87               when (len /= 0) $ addWarning LexWarningBOM
88               setStartCode bol_section
89               lexToken
90          }
91 }
92 
93 <bol_section, bol_field_layout, bol_field_braces> {
94   @nbspspacetab* @nl         { \_pos len inp -> checkWhitespace len inp >> adjustPos retPos >> lexToken }
95   -- no @nl here to allow for comments on last line of the file with no trailing \n
96   $spacetab* "--" $comment*  ;  -- TODO: check the lack of @nl works here
97                                 -- including counting line numbers
98 }
99 
100 <bol_section> {
101   @nbspspacetab*   { \pos len inp -> checkLeadingWhitespace len inp >>
102                                      if B.length inp == len
103                                        then return (L pos EOF)
104                                        else setStartCode in_section
105                                          >> return (L pos (Indent len)) }
106   $spacetab* \{    { tok  OpenBrace }
107   $spacetab* \}    { tok  CloseBrace }
108 }
109 
110 <in_section> {
111   $spacetab+   ; --TODO: don't allow tab as leading space
112 
113   "--" $comment* ;
114 
115   @name        { toki TokSym }
116   @string      { \pos len inp -> return $! L pos (TokStr (B.take (len - 2) (B.tail inp))) }
117   @oplike      { toki TokOther }
118   $paren       { toki TokOther }
119   \:           { tok  Colon }
120   \{           { tok  OpenBrace }
121   \}           { tok  CloseBrace }
122   @nl          { \_ _ _ -> adjustPos retPos >> setStartCode bol_section >> lexToken }
123 }
124 
125 <bol_field_layout> {
126   @nbspspacetab* { \pos len inp -> checkLeadingWhitespace len inp >>= \len' ->
127                                   if B.length inp == len
128                                     then return (L pos EOF)
129                                     else setStartCode in_field_layout
130                                       >> return (L pos (Indent len')) }
131 }
132 
133 <in_field_layout> {
134   $spacetab+;
135   $field_layout' $field_layout*  { toki TokFieldLine }
136   @nl             { \_ _ _ -> adjustPos retPos >> setStartCode bol_field_layout >> lexToken }
137 }
138 
139 <bol_field_braces> {
140    ()                { \_ _ _ -> setStartCode in_field_braces >> lexToken }
141 }
142 
143 <in_field_braces> {
144   $spacetab+;
145   $field_braces' $field_braces*    { toki TokFieldLine }
146   \{                { tok  OpenBrace  }
147   \}                { tok  CloseBrace }
148   @nl               { \_ _ _ -> adjustPos retPos >> setStartCode bol_field_braces >> lexToken }
149 }
150 
151 {
152 
153 -- | Tokens of outer cabal file structure. Field values are treated opaquely.
154 data Token = TokSym   !ByteString       -- ^ Haskell-like identifier, number or operator
155            | TokStr   !ByteString       -- ^ String in quotes
156            | TokOther !ByteString       -- ^ Operators and parens
157            | Indent   !Int              -- ^ Indentation token
158            | TokFieldLine !ByteString   -- ^ Lines after @:@
159            | Colon
160            | OpenBrace
161            | CloseBrace
162            | EOF
163            | LexicalError InputStream --TODO: add separate string lexical error
164   deriving Show
165 
166 data LToken = L !Position !Token
167   deriving Show
168 
169 toki :: (ByteString -> Token) -> Position -> Int -> ByteString -> Lex LToken
170 toki t pos  len  input = return $! L pos (t (B.take len input))
171 
172 tok :: Token -> Position -> Int -> ByteString -> Lex LToken
173 tok  t pos _len _input = return $! L pos t
174 
175 checkLeadingWhitespace :: Int -> ByteString -> Lex Int
176 checkLeadingWhitespace len bs
177     | B.any (== 9) (B.take len bs) = do
178         addWarning LexWarningTab
179         checkWhitespace len bs
180     | otherwise = checkWhitespace len bs
181 
182 checkWhitespace :: Int -> ByteString -> Lex Int
183 checkWhitespace len bs
184     | B.any (== 194) (B.take len bs) = do
185         addWarning LexWarningNBSP
186         return $ len - B.count 194 (B.take len bs)
187     | otherwise = return len
188 
189 -- -----------------------------------------------------------------------------
190 -- The input type
191 
192 type AlexInput = InputStream
193 
194 alexInputPrevChar :: AlexInput -> Char
195 alexInputPrevChar _ = error "alexInputPrevChar not used"
196 
197 alexGetByte :: AlexInput -> Maybe (Word.Word8,AlexInput)
198 alexGetByte = B.uncons
199 
200 lexicalError :: Position -> InputStream -> Lex LToken
201 lexicalError pos inp = do
202   setInput B.empty
203   return $! L pos (LexicalError inp)
204 
205 lexToken :: Lex LToken
206 lexToken = do
207   pos <- getPos
208   inp <- getInput
209   st  <- getStartCode
210   case alexScan inp st of
211     AlexEOF -> return (L pos EOF)
212     AlexError inp' ->
213         let !len_bytes = B.length inp - B.length inp' in
214             --FIXME: we want len_chars here really
215             -- need to decode utf8 up to this point
216         lexicalError (incPos len_bytes pos) inp'
217     AlexSkip  inp' len_chars -> do
218         checkPosition pos inp inp' len_chars
219         adjustPos (incPos len_chars)
220         setInput inp'
221         lexToken
222     AlexToken inp' len_chars action -> do
223         checkPosition pos inp inp' len_chars
224         adjustPos (incPos len_chars)
225         setInput inp'
226         let !len_bytes = B.length inp - B.length inp'
227         t <- action pos len_bytes inp
228         --traceShow t $ return tok
229         return t
230 
231 
232 checkPosition :: Position -> ByteString -> ByteString -> Int -> Lex ()
233 #ifdef CABAL_PARSEC_DEBUG
234 checkPosition pos@(Position lineno colno) inp inp' len_chars = do
235     text_lines <- getDbgText
236     let len_bytes = B.length inp - B.length inp'
237         pos_txt   | lineno-1 < V.length text_lines = T.take len_chars (T.drop (colno-1) (text_lines V.! (lineno-1)))
238                   | otherwise = T.empty
239         real_txt  = B.take len_bytes inp
240     when (pos_txt /= T.decodeUtf8 real_txt) $
241       traceShow (pos, pos_txt, T.decodeUtf8 real_txt) $
242       traceShow (take 3 (V.toList text_lines)) $ return ()
243   where
244     getDbgText = Lex $ \s@LexState{ dbgText = txt } -> LexResult s txt
245 #else
246 checkPosition _ _ _ _ = return ()
247 #endif
248 
249 lexAll :: Lex [LToken]
250 lexAll = do
251   t <- lexToken
252   case t of
253     L _ EOF -> return [t]
254     _       -> do ts <- lexAll
255                   return (t : ts)
256 
257 ltest :: Int -> String -> Prelude.IO ()
258 ltest code s =
259   let (ws, xs) = execLexer (setStartCode code >> lexAll) (B.Char8.pack s)
260    in traverse_ print ws >> traverse_ print xs
261 
262 
263 mkLexState :: ByteString -> LexState
264 mkLexState input = LexState
265   { curPos   = Position 1 1
266   , curInput = input
267   , curCode  = 0
268   , warnings = []
269 #ifdef CABAL_PARSEC_DEBUG
270   , dbgText  = V.fromList . lines' . T.decodeUtf8With T.lenientDecode $ input
271 #endif
272   }
273 
274 #ifdef CABAL_PARSEC_DEBUG
275 lines' :: T.Text -> [T.Text]
276 lines' s1
277   | T.null s1 = []
278   | otherwise = case T.break (\c -> c == '\r' || c == '\n') s1 of
279                   (l, s2) | Just (c,s3) <- T.uncons s2
280                          -> case T.uncons s3 of
281                               Just ('\n', s4) | c == '\r' -> l `T.snoc` '\r' `T.snoc` '\n' : lines' s4
282                               _                           -> l `T.snoc` c : lines' s3
283 
284                           | otherwise
285                          -> [l]
286 #endif
287 }
288