1----------------------------------------------------------------------------- 2-- | 3-- Module : Tokenise 4-- Copyright : 2004 Malcolm Wallace 5-- Licence : LGPL 6-- 7-- Maintainer : Malcolm Wallace <Malcolm.Wallace@cs.york.ac.uk> 8-- Stability : experimental 9-- Portability : All 10-- 11-- The purpose of this module is to lex a source file (language 12-- unspecified) into tokens such that cpp can recognise a replaceable 13-- symbol or macro-use, and do the right thing. 14----------------------------------------------------------------------------- 15 16module Language.Preprocessor.Cpphs.Tokenise 17 ( linesCpp 18 , reslash 19 , tokenise 20 , WordStyle(..) 21 , deWordStyle 22 , parseMacroCall 23 ) where 24 25import Data.Char 26import Language.Preprocessor.Cpphs.HashDefine 27import Language.Preprocessor.Cpphs.Position 28 29-- | A Mode value describes whether to tokenise a la Haskell, or a la Cpp. 30-- The main difference is that in Cpp mode we should recognise line 31-- continuation characters. 32data Mode = Haskell | Cpp 33 34-- | linesCpp is, broadly speaking, Prelude.lines, except that 35-- on a line beginning with a \#, line continuation characters are 36-- recognised. In a line continuation, the newline character is 37-- preserved, but the backslash is not. 38linesCpp :: String -> [String] 39linesCpp [] = [] 40linesCpp (x:xs) | x=='#' = tok Cpp ['#'] xs 41 | otherwise = tok Haskell [] (x:xs) 42 where 43 tok Cpp acc ('\\':'\n':ys) = tok Cpp ('\n':acc) ys 44 tok _ acc ('\n':'#':ys) = reverse acc: tok Cpp ['#'] ys 45 tok _ acc ('\n':ys) = reverse acc: tok Haskell [] ys 46 tok _ acc [] = reverse acc: [] 47 tok mode acc (y:ys) = tok mode (y:acc) ys 48 49-- | Put back the line-continuation characters. 50reslash :: String -> String 51reslash ('\n':xs) = '\\':'\n':reslash xs 52reslash (x:xs) = x: reslash xs 53reslash [] = [] 54 55---- 56-- | Submodes are required to deal correctly with nesting of lexical 57-- structures. 58data SubMode = Any | Pred (Char->Bool) (Posn->String->WordStyle) 59 | String Char | LineComment | NestComment Int 60 | CComment | CLineComment 61 62-- | Each token is classified as one of Ident, Other, or Cmd: 63-- * Ident is a word that could potentially match a macro name. 64-- * Cmd is a complete cpp directive (\#define etc). 65-- * Other is anything else. 66data WordStyle = Ident Posn String | Other String | Cmd (Maybe HashDefine) 67 deriving (Eq,Show) 68other :: Posn -> String -> WordStyle 69other _ s = Other s 70 71deWordStyle :: WordStyle -> String 72deWordStyle (Ident _ i) = i 73deWordStyle (Other i) = i 74deWordStyle (Cmd _) = "\n" 75 76-- | tokenise is, broadly-speaking, Prelude.words, except that: 77-- * the input is already divided into lines 78-- * each word-like "token" is categorised as one of {Ident,Other,Cmd} 79-- * \#define's are parsed and returned out-of-band using the Cmd variant 80-- * All whitespace is preserved intact as tokens. 81-- * C-comments are converted to white-space (depending on first param) 82-- * Parens and commas are tokens in their own right. 83-- * Any cpp line continuations are respected. 84-- No errors can be raised. 85-- The inverse of tokenise is (concatMap deWordStyle). 86tokenise :: Bool -> Bool -> Bool -> Bool -> [(Posn,String)] -> [WordStyle] 87tokenise _ _ _ _ [] = [] 88tokenise stripEol stripComments ansi lang ((pos,str):pos_strs) = 89 (if lang then haskell else plaintext) Any [] pos pos_strs str 90 where 91 -- rules to lex Haskell 92 haskell :: SubMode -> String -> Posn -> [(Posn,String)] 93 -> String -> [WordStyle] 94 haskell Any acc p ls ('\n':'#':xs) = emit acc $ -- emit "\n" $ 95 cpp Any haskell [] [] p ls xs 96 -- warning: non-maximal munch on comment 97 haskell Any acc p ls ('-':'-':xs) = emit acc $ 98 haskell LineComment "--" p ls xs 99 haskell Any acc p ls ('{':'-':xs) = emit acc $ 100 haskell (NestComment 0) "-{" p ls xs 101 haskell Any acc p ls ('/':'*':xs) 102 | stripComments = emit acc $ 103 haskell CComment " " p ls xs 104 haskell Any acc p ls ('/':'/':xs) 105 | stripEol = emit acc $ 106 haskell CLineComment " " p ls xs 107 haskell Any acc p ls ('"':xs) = emit acc $ 108 haskell (String '"') ['"'] p ls xs 109 haskell Any acc p ls ('\'':'\'':xs) = emit acc $ -- TH type quote 110 haskell Any "''" p ls xs 111 haskell Any acc p ls ('\'':xs@('\\':_)) = emit acc $ -- escaped char literal 112 haskell (String '\'') "'" p ls xs 113 haskell Any acc p ls ('\'':x:'\'':xs) = emit acc $ -- character literal 114 emit ['\'', x, '\''] $ 115 haskell Any [] p ls xs 116 haskell Any acc p ls ('\'':xs) = emit acc $ -- TH name quote 117 haskell Any "'" p ls xs 118 haskell Any acc p ls (x:xs) | single x = emit acc $ emit [x] $ 119 haskell Any [] p ls xs 120 haskell Any acc p ls (x:xs) | space x = emit acc $ 121 haskell (Pred space other) [x] 122 p ls xs 123 haskell Any acc p ls (x:xs) | symbol x = emit acc $ 124 haskell (Pred symbol other) [x] 125 p ls xs 126 -- haskell Any [] p ls (x:xs) | ident0 x = id $ 127 haskell Any acc p ls (x:xs) | ident0 x = emit acc $ 128 haskell (Pred ident1 Ident) [x] 129 p ls xs 130 haskell Any acc p ls (x:xs) = haskell Any (x:acc) p ls xs 131 132 haskell pre@(Pred pred ws) acc p ls (x:xs) 133 | pred x = haskell pre (x:acc) p ls xs 134 haskell (Pred _ ws) acc p ls xs = ws p (reverse acc): 135 haskell Any [] p ls xs 136 haskell (String c) acc p ls ('\\':x:xs) 137 | x=='\\' = haskell (String c) ('\\':'\\':acc) p ls xs 138 | x==c = haskell (String c) (c:'\\':acc) p ls xs 139 haskell (String c) acc p ls (x:xs) 140 | x==c = emit (c:acc) $ haskell Any [] p ls xs 141 | otherwise = haskell (String c) (x:acc) p ls xs 142 haskell LineComment acc p ls xs@('\n':_) = emit acc $ haskell Any [] p ls xs 143 haskell LineComment acc p ls (x:xs) = haskell LineComment (x:acc) p ls xs 144 haskell (NestComment n) acc p ls ('{':'-':xs) 145 = haskell (NestComment (n+1)) 146 ("-{"++acc) p ls xs 147 haskell (NestComment 0) acc p ls ('-':'}':xs) 148 = emit ("}-"++acc) $ haskell Any [] p ls xs 149 haskell (NestComment n) acc p ls ('-':'}':xs) 150 = haskell (NestComment (n-1)) 151 ("}-"++acc) p ls xs 152 haskell (NestComment n) acc p ls (x:xs) = haskell (NestComment n) (x:acc) 153 p ls xs 154 haskell CComment acc p ls ('*':'/':xs) = emit (" "++acc) $ 155 haskell Any [] p ls xs 156 haskell CComment acc p ls (x:xs) = haskell CComment (white x:acc) p ls xs 157 haskell CLineComment acc p ls xs@('\n':_)= emit acc $ haskell Any [] p ls xs 158 haskell CLineComment acc p ls (_:xs) = haskell CLineComment (' ':acc) 159 p ls xs 160 haskell mode acc _ ((p,l):ls) [] = haskell mode acc p ls ('\n':l) 161 haskell _ acc _ [] [] = emit acc $ [] 162 163 -- rules to lex Cpp 164 cpp :: SubMode -> (SubMode -> String -> Posn -> [(Posn,String)] 165 -> String -> [WordStyle]) 166 -> String -> [String] -> Posn -> [(Posn,String)] 167 -> String -> [WordStyle] 168 cpp mode next word line pos remaining input = 169 lexcpp mode word line remaining input 170 where 171 lexcpp Any w l ls ('/':'*':xs) = lexcpp (NestComment 0) "" (w*/*l) ls xs 172 lexcpp Any w l ls ('/':'/':xs) = lexcpp LineComment " " (w*/*l) ls xs 173 lexcpp Any w l ((p,l'):ls) ('\\':[]) = cpp Any next [] ("\n":w*/*l) p ls l' 174 lexcpp Any w l ls ('\\':'\n':xs) = lexcpp Any [] ("\n":w*/*l) ls xs 175 lexcpp Any w l ls xs@('\n':_) = Cmd (parseHashDefine ansi 176 (reverse (w*/*l))): 177 next Any [] pos ls xs 178 -- lexcpp Any w l ls ('"':xs) = lexcpp (String '"') ['"'] (w*/*l) ls xs 179 -- lexcpp Any w l ls ('\'':xs) = lexcpp (String '\'') "'" (w*/*l) ls xs 180 lexcpp Any w l ls ('"':xs) = lexcpp Any [] ("\"":(w*/*l)) ls xs 181 lexcpp Any w l ls ('\'':xs) = lexcpp Any [] ("'": (w*/*l)) ls xs 182 lexcpp Any [] l ls (x:xs) 183 | ident0 x = lexcpp (Pred ident1 Ident) [x] l ls xs 184 -- lexcpp Any w l ls (x:xs) | ident0 x = lexcpp (Pred ident1 Ident) [x] (w*/*l) ls xs 185 lexcpp Any w l ls (x:xs) 186 | single x = lexcpp Any [] ([x]:w*/*l) ls xs 187 | space x = lexcpp (Pred space other) [x] (w*/*l) ls xs 188 | symbol x = lexcpp (Pred symbol other) [x] (w*/*l) ls xs 189 | otherwise = lexcpp Any (x:w) l ls xs 190 lexcpp pre@(Pred pred _) w l ls (x:xs) 191 | pred x = lexcpp pre (x:w) l ls xs 192 lexcpp (Pred _ _) w l ls xs = lexcpp Any [] (w*/*l) ls xs 193 lexcpp (String c) w l ls ('\\':x:xs) 194 | x=='\\' = lexcpp (String c) ('\\':'\\':w) l ls xs 195 | x==c = lexcpp (String c) (c:'\\':w) l ls xs 196 lexcpp (String c) w l ls (x:xs) 197 | x==c = lexcpp Any [] ((c:w)*/*l) ls xs 198 | otherwise = lexcpp (String c) (x:w) l ls xs 199 lexcpp LineComment w l ((p,l'):ls) ('\\':[]) 200 = cpp LineComment next [] (('\n':w)*/*l) pos ls l' 201 lexcpp LineComment w l ls ('\\':'\n':xs) 202 = lexcpp LineComment [] (('\n':w)*/*l) ls xs 203 lexcpp LineComment w l ls xs@('\n':_) = lexcpp Any w l ls xs 204 lexcpp LineComment w l ls (_:xs) = lexcpp LineComment (' ':w) l ls xs 205 lexcpp (NestComment _) w l ls ('*':'/':xs) 206 = lexcpp Any [] (w*/*l) ls xs 207 lexcpp (NestComment n) w l ls (x:xs) = lexcpp (NestComment n) (white x:w) l 208 ls xs 209 lexcpp mode w l ((p,l'):ls) [] = cpp mode next w l p ls ('\n':l') 210 lexcpp _ _ _ [] [] = [] 211 212 -- rules to lex non-Haskell, non-cpp text 213 plaintext :: SubMode -> String -> Posn -> [(Posn,String)] 214 -> String -> [WordStyle] 215 plaintext Any acc p ls ('\n':'#':xs) = emit acc $ -- emit "\n" $ 216 cpp Any plaintext [] [] p ls xs 217 plaintext Any acc p ls ('/':'*':xs) 218 | stripComments = emit acc $ 219 plaintext CComment " " p ls xs 220 plaintext Any acc p ls ('/':'/':xs) 221 | stripEol = emit acc $ 222 plaintext CLineComment " " p ls xs 223 plaintext Any acc p ls (x:xs) | single x = emit acc $ emit [x] $ 224 plaintext Any [] p ls xs 225 plaintext Any acc p ls (x:xs) | space x = emit acc $ 226 plaintext (Pred space other) [x] 227 p ls xs 228 plaintext Any acc p ls (x:xs) | ident0 x = emit acc $ 229 plaintext (Pred ident1 Ident) [x] 230 p ls xs 231 plaintext Any acc p ls (x:xs) = plaintext Any (x:acc) p ls xs 232 plaintext pre@(Pred pred ws) acc p ls (x:xs) 233 | pred x = plaintext pre (x:acc) p ls xs 234 plaintext (Pred _ ws) acc p ls xs = ws p (reverse acc): 235 plaintext Any [] p ls xs 236 plaintext CComment acc p ls ('*':'/':xs) = emit (" "++acc) $ 237 plaintext Any [] p ls xs 238 plaintext CComment acc p ls (x:xs) = plaintext CComment (white x:acc) p ls xs 239 plaintext CLineComment acc p ls xs@('\n':_) 240 = emit acc $ plaintext Any [] p ls xs 241 plaintext CLineComment acc p ls (_:xs)= plaintext CLineComment (' ':acc) 242 p ls xs 243 plaintext mode acc _ ((p,l):ls) [] = plaintext mode acc p ls ('\n':l) 244 plaintext _ acc _ [] [] = emit acc $ [] 245 246 -- predicates for lexing Haskell. 247 ident0 x = isAlpha x || x `elem` "_`" 248 ident1 x = isAlphaNum x || x `elem` "'_`" 249 symbol x = x `elem` ":!#$%&*+./<=>?@\\^|-~" 250 single x = x `elem` "(),[];{}" 251 space x = x `elem` " \t" 252 -- conversion of comment text to whitespace 253 white '\n' = '\n' 254 white '\r' = '\r' 255 white _ = ' ' 256 -- emit a token (if there is one) from the accumulator 257 emit "" = id 258 emit xs = (Other (reverse xs):) 259 -- add a reversed word to the accumulator 260 "" */* l = l 261 w */* l = reverse w : l 262 -- help out broken Haskell compilers which need balanced numbers of C 263 -- comments in order to do import chasing :-) -----> */* 264 265 266-- | Parse a possible macro call, returning argument list and remaining input 267parseMacroCall :: Posn -> [WordStyle] -> Maybe ([[WordStyle]],[WordStyle]) 268parseMacroCall p = call . skip 269 where 270 skip (Other x:xs) | all isSpace x = skip xs 271 skip xss = xss 272 call (Other "(":xs) = (args (0::Int) [] [] . skip) xs 273 call _ = Nothing 274 args 0 w acc ( Other ")" :xs) = Just (reverse (addone w acc), xs) 275 args 0 w acc ( Other "," :xs) = args 0 [] (addone w acc) (skip xs) 276 args n w acc (x@(Other "("):xs) = args (n+1) (x:w) acc xs 277 args n w acc (x@(Other ")"):xs) = args (n-1) (x:w) acc xs 278 args n w acc ( Ident _ v :xs) = args n (Ident p v:w) acc xs 279 args n w acc (x@(Other _) :xs) = args n (x:w) acc xs 280 args _ _ _ _ = Nothing 281 addone w acc = reverse (skip w): acc 282