1-----------------------------------------------------------------------------
2-- |
3-- Module      :  Tokenise
4-- Copyright   :  2004 Malcolm Wallace
5-- Licence     :  LGPL
6--
7-- Maintainer  :  Malcolm Wallace <Malcolm.Wallace@cs.york.ac.uk>
8-- Stability   :  experimental
9-- Portability :  All
10--
11-- The purpose of this module is to lex a source file (language
12-- unspecified) into tokens such that cpp can recognise a replaceable
13-- symbol or macro-use, and do the right thing.
14-----------------------------------------------------------------------------
15
16module Language.Preprocessor.Cpphs.Tokenise
17  ( linesCpp
18  , reslash
19  , tokenise
20  , WordStyle(..)
21  , deWordStyle
22  , parseMacroCall
23  ) where
24
25import Data.Char
26import Language.Preprocessor.Cpphs.HashDefine
27import Language.Preprocessor.Cpphs.Position
28
29-- | A Mode value describes whether to tokenise a la Haskell, or a la Cpp.
30--   The main difference is that in Cpp mode we should recognise line
31--   continuation characters.
32data Mode = Haskell | Cpp
33
34-- | linesCpp is, broadly speaking, Prelude.lines, except that
35--   on a line beginning with a \#, line continuation characters are
36--   recognised.  In a line continuation, the newline character is
37--   preserved, but the backslash is not.
38linesCpp :: String -> [String]
39linesCpp  []                 = []
40linesCpp (x:xs) | x=='#'     = tok Cpp     ['#'] xs
41                | otherwise  = tok Haskell [] (x:xs)
42  where
43    tok Cpp   acc ('\\':'\n':ys)   = tok Cpp ('\n':acc) ys
44    tok _     acc ('\n':'#':ys)    = reverse acc: tok Cpp ['#'] ys
45    tok _     acc ('\n':ys)        = reverse acc: tok Haskell [] ys
46    tok _     acc []               = reverse acc: []
47    tok mode  acc (y:ys)           = tok mode (y:acc) ys
48
49-- | Put back the line-continuation characters.
50reslash :: String -> String
51reslash ('\n':xs) = '\\':'\n':reslash xs
52reslash (x:xs)    = x: reslash xs
53reslash   []      = []
54
55----
56-- | Submodes are required to deal correctly with nesting of lexical
57--   structures.
58data SubMode = Any | Pred (Char->Bool) (Posn->String->WordStyle)
59             | String Char | LineComment | NestComment Int
60             | CComment | CLineComment
61
62-- | Each token is classified as one of Ident, Other, or Cmd:
63--   * Ident is a word that could potentially match a macro name.
64--   * Cmd is a complete cpp directive (\#define etc).
65--   * Other is anything else.
66data WordStyle = Ident Posn String | Other String | Cmd (Maybe HashDefine)
67  deriving (Eq,Show)
68other :: Posn -> String -> WordStyle
69other _ s = Other s
70
71deWordStyle :: WordStyle -> String
72deWordStyle (Ident _ i) = i
73deWordStyle (Other i)   = i
74deWordStyle (Cmd _)     = "\n"
75
76-- | tokenise is, broadly-speaking, Prelude.words, except that:
77--    * the input is already divided into lines
78--    * each word-like "token" is categorised as one of {Ident,Other,Cmd}
79--    * \#define's are parsed and returned out-of-band using the Cmd variant
80--    * All whitespace is preserved intact as tokens.
81--    * C-comments are converted to white-space (depending on first param)
82--    * Parens and commas are tokens in their own right.
83--    * Any cpp line continuations are respected.
84--   No errors can be raised.
85--   The inverse of tokenise is (concatMap deWordStyle).
86tokenise :: Bool -> Bool -> Bool -> Bool -> [(Posn,String)] -> [WordStyle]
87tokenise _        _             _    _     [] = []
88tokenise stripEol stripComments ansi lang ((pos,str):pos_strs) =
89    (if lang then haskell else plaintext) Any [] pos pos_strs str
90 where
91    -- rules to lex Haskell
92  haskell :: SubMode -> String -> Posn -> [(Posn,String)]
93             -> String -> [WordStyle]
94  haskell Any acc p ls ('\n':'#':xs)      = emit acc $  -- emit "\n" $
95                                            cpp Any haskell [] [] p ls xs
96    -- warning: non-maximal munch on comment
97  haskell Any acc p ls ('-':'-':xs)       = emit acc $
98                                            haskell LineComment "--" p ls xs
99  haskell Any acc p ls ('{':'-':xs)       = emit acc $
100                                            haskell (NestComment 0) "-{" p ls xs
101  haskell Any acc p ls ('/':'*':xs)
102                          | stripComments = emit acc $
103                                            haskell CComment "  " p ls xs
104  haskell Any acc p ls ('/':'/':xs)
105                          | stripEol      = emit acc $
106                                            haskell CLineComment "  " p ls xs
107  haskell Any acc p ls ('"':xs)           = emit acc $
108                                            haskell (String '"') ['"'] p ls xs
109  haskell Any acc p ls ('\'':'\'':xs)     = emit acc $ -- TH type quote
110                                            haskell Any "''" p ls xs
111  haskell Any acc p ls ('\'':xs@('\\':_)) = emit acc $ -- escaped char literal
112                                            haskell (String '\'') "'" p ls xs
113  haskell Any acc p ls ('\'':x:'\'':xs)   = emit acc $ -- character literal
114                                            emit ['\'', x, '\''] $
115                                            haskell Any [] p ls xs
116  haskell Any acc p ls ('\'':xs)          = emit acc $ -- TH name quote
117                                            haskell Any "'" p ls xs
118  haskell Any acc p ls (x:xs) | single x  = emit acc $ emit [x] $
119                                            haskell Any [] p ls xs
120  haskell Any acc p ls (x:xs) | space x   = emit acc $
121                                            haskell (Pred space other) [x]
122                                                                        p ls xs
123  haskell Any acc p ls (x:xs) | symbol x  = emit acc $
124                                            haskell (Pred symbol other) [x]
125                                                                        p ls xs
126 -- haskell Any [] p ls (x:xs) | ident0 x  = id $
127  haskell Any acc p ls (x:xs) | ident0 x  = emit acc $
128                                            haskell (Pred ident1 Ident) [x]
129                                                                        p ls xs
130  haskell Any acc p ls (x:xs)             = haskell Any (x:acc) p ls xs
131
132  haskell pre@(Pred pred ws) acc p ls (x:xs)
133                        | pred x    = haskell pre (x:acc) p ls xs
134  haskell (Pred _ ws) acc p ls xs   = ws p (reverse acc):
135                                      haskell Any [] p ls xs
136  haskell (String c) acc p ls ('\\':x:xs)
137                        | x=='\\'   = haskell (String c) ('\\':'\\':acc) p ls xs
138                        | x==c      = haskell (String c) (c:'\\':acc) p ls xs
139  haskell (String c) acc p ls (x:xs)
140                        | x==c      = emit (c:acc) $ haskell Any [] p ls xs
141                        | otherwise = haskell (String c) (x:acc) p ls xs
142  haskell LineComment acc p ls xs@('\n':_) = emit acc $ haskell Any [] p ls xs
143  haskell LineComment acc p ls (x:xs)      = haskell LineComment (x:acc) p ls xs
144  haskell (NestComment n) acc p ls ('{':'-':xs)
145                                    = haskell (NestComment (n+1))
146                                                            ("-{"++acc) p ls xs
147  haskell (NestComment 0) acc p ls ('-':'}':xs)
148                                    = emit ("}-"++acc) $ haskell Any [] p ls xs
149  haskell (NestComment n) acc p ls ('-':'}':xs)
150                                    = haskell (NestComment (n-1))
151                                                            ("}-"++acc) p ls xs
152  haskell (NestComment n) acc p ls (x:xs) = haskell (NestComment n) (x:acc)
153                                                                        p ls xs
154  haskell CComment acc p ls ('*':'/':xs)  = emit ("  "++acc) $
155                                            haskell Any [] p ls xs
156  haskell CComment acc p ls (x:xs)        = haskell CComment (white x:acc) p ls xs
157  haskell CLineComment acc p ls xs@('\n':_)= emit acc $ haskell Any [] p ls xs
158  haskell CLineComment acc p ls (_:xs)    = haskell CLineComment (' ':acc)
159                                                                       p ls xs
160  haskell mode acc _ ((p,l):ls) []        = haskell mode acc p ls ('\n':l)
161  haskell _    acc _ [] []                = emit acc $ []
162
163  -- rules to lex Cpp
164  cpp :: SubMode -> (SubMode -> String -> Posn -> [(Posn,String)]
165                     -> String -> [WordStyle])
166         -> String -> [String] -> Posn -> [(Posn,String)]
167         -> String -> [WordStyle]
168  cpp mode next word line pos remaining input =
169    lexcpp mode word line remaining input
170   where
171    lexcpp Any w l ls ('/':'*':xs)   = lexcpp (NestComment 0) "" (w*/*l) ls xs
172    lexcpp Any w l ls ('/':'/':xs)   = lexcpp LineComment "  " (w*/*l) ls xs
173    lexcpp Any w l ((p,l'):ls) ('\\':[])  = cpp Any next [] ("\n":w*/*l) p ls l'
174    lexcpp Any w l ls ('\\':'\n':xs) = lexcpp Any [] ("\n":w*/*l) ls xs
175    lexcpp Any w l ls xs@('\n':_)    = Cmd (parseHashDefine ansi
176                                                           (reverse (w*/*l))):
177                                       next Any [] pos ls xs
178 -- lexcpp Any w l ls ('"':xs)     = lexcpp (String '"') ['"'] (w*/*l) ls xs
179 -- lexcpp Any w l ls ('\'':xs)    = lexcpp (String '\'') "'"  (w*/*l) ls xs
180    lexcpp Any w l ls ('"':xs)       = lexcpp Any [] ("\"":(w*/*l)) ls xs
181    lexcpp Any w l ls ('\'':xs)      = lexcpp Any [] ("'": (w*/*l)) ls xs
182    lexcpp Any [] l ls (x:xs)
183                    | ident0 x  = lexcpp (Pred ident1 Ident) [x] l ls xs
184 -- lexcpp Any w l ls (x:xs) | ident0 x  = lexcpp (Pred ident1 Ident) [x] (w*/*l) ls xs
185    lexcpp Any w l ls (x:xs)
186                    | single x  = lexcpp Any [] ([x]:w*/*l) ls xs
187                    | space x   = lexcpp (Pred space other) [x] (w*/*l) ls xs
188                    | symbol x  = lexcpp (Pred symbol other) [x] (w*/*l) ls xs
189                    | otherwise = lexcpp Any (x:w) l ls xs
190    lexcpp pre@(Pred pred _) w l ls (x:xs)
191                    | pred x    = lexcpp pre (x:w) l ls xs
192    lexcpp (Pred _ _) w l ls xs = lexcpp Any [] (w*/*l) ls xs
193    lexcpp (String c) w l ls ('\\':x:xs)
194                    | x=='\\'   = lexcpp (String c) ('\\':'\\':w) l ls xs
195                    | x==c      = lexcpp (String c) (c:'\\':w) l ls xs
196    lexcpp (String c) w l ls (x:xs)
197                    | x==c      = lexcpp Any [] ((c:w)*/*l) ls xs
198                    | otherwise = lexcpp (String c) (x:w) l ls xs
199    lexcpp LineComment w l ((p,l'):ls) ('\\':[])
200                             = cpp LineComment next [] (('\n':w)*/*l) pos ls l'
201    lexcpp LineComment w l ls ('\\':'\n':xs)
202                                = lexcpp LineComment [] (('\n':w)*/*l) ls xs
203    lexcpp LineComment w l ls xs@('\n':_) = lexcpp Any w l ls xs
204    lexcpp LineComment w l ls (_:xs)      = lexcpp LineComment (' ':w) l ls xs
205    lexcpp (NestComment _) w l ls ('*':'/':xs)
206                                          = lexcpp Any [] (w*/*l) ls xs
207    lexcpp (NestComment n) w l ls (x:xs)  = lexcpp (NestComment n) (white x:w) l
208                                                                        ls xs
209    lexcpp mode w l ((p,l'):ls) []        = cpp mode next w l p ls ('\n':l')
210    lexcpp _    _ _ []          []        = []
211
212    -- rules to lex non-Haskell, non-cpp text
213  plaintext :: SubMode -> String -> Posn -> [(Posn,String)]
214            -> String -> [WordStyle]
215  plaintext Any acc p ls ('\n':'#':xs)  = emit acc $  -- emit "\n" $
216                                          cpp Any plaintext [] [] p ls xs
217  plaintext Any acc p ls ('/':'*':xs)
218                           | stripComments = emit acc $
219                                             plaintext CComment "  " p ls xs
220  plaintext Any acc p ls ('/':'/':xs)
221                                | stripEol = emit acc $
222                                             plaintext CLineComment "  " p ls xs
223  plaintext Any acc p ls (x:xs) | single x = emit acc $ emit [x] $
224                                             plaintext Any [] p ls xs
225  plaintext Any acc p ls (x:xs) | space x  = emit acc $
226                                             plaintext (Pred space other) [x]
227                                                                        p ls xs
228  plaintext Any acc p ls (x:xs) | ident0 x = emit acc $
229                                             plaintext (Pred ident1 Ident) [x]
230                                                                        p ls xs
231  plaintext Any acc p ls (x:xs)            = plaintext Any (x:acc) p ls xs
232  plaintext pre@(Pred pred ws) acc p ls (x:xs)
233                                | pred x   = plaintext pre (x:acc) p ls xs
234  plaintext (Pred _ ws) acc p ls xs        = ws p (reverse acc):
235                                             plaintext Any [] p ls xs
236  plaintext CComment acc p ls ('*':'/':xs) = emit ("  "++acc) $
237                                             plaintext Any [] p ls xs
238  plaintext CComment acc p ls (x:xs)       = plaintext CComment (white x:acc) p ls xs
239  plaintext CLineComment acc p ls xs@('\n':_)
240                                        = emit acc $ plaintext Any [] p ls xs
241  plaintext CLineComment acc p ls (_:xs)= plaintext CLineComment (' ':acc)
242                                                                       p ls xs
243  plaintext mode acc _ ((p,l):ls) []    = plaintext mode acc p ls ('\n':l)
244  plaintext _    acc _ [] []            = emit acc $ []
245
246  -- predicates for lexing Haskell.
247  ident0 x = isAlpha x    || x `elem` "_`"
248  ident1 x = isAlphaNum x || x `elem` "'_`"
249  symbol x = x `elem` ":!#$%&*+./<=>?@\\^|-~"
250  single x = x `elem` "(),[];{}"
251  space  x = x `elem` " \t"
252  -- conversion of comment text to whitespace
253  white '\n' = '\n'
254  white '\r' = '\r'
255  white _    = ' '
256  -- emit a token (if there is one) from the accumulator
257  emit ""  = id
258  emit xs  = (Other (reverse xs):)
259  -- add a reversed word to the accumulator
260  "" */* l = l
261  w */* l  = reverse w : l
262  -- help out broken Haskell compilers which need balanced numbers of C
263  -- comments in order to do import chasing :-)  ----->   */*
264
265
266-- | Parse a possible macro call, returning argument list and remaining input
267parseMacroCall :: Posn -> [WordStyle] -> Maybe ([[WordStyle]],[WordStyle])
268parseMacroCall p = call . skip
269  where
270    skip (Other x:xs) | all isSpace x = skip xs
271    skip xss                          = xss
272    call (Other "(":xs)   = (args (0::Int) [] [] . skip) xs
273    call _                = Nothing
274    args 0 w acc (   Other ")" :xs)  = Just (reverse (addone w acc), xs)
275    args 0 w acc (   Other "," :xs)  = args 0     []   (addone w acc) (skip xs)
276    args n w acc (x@(Other "("):xs)  = args (n+1) (x:w)         acc    xs
277    args n w acc (x@(Other ")"):xs)  = args (n-1) (x:w)         acc    xs
278    args n w acc (   Ident _ v :xs)  = args n     (Ident p v:w) acc    xs
279    args n w acc (x@(Other _)  :xs)  = args n     (x:w)         acc    xs
280    args _ _ _   _                   = Nothing
281    addone w acc = reverse (skip w): acc
282