1{ 2-- ----------------------------------------------------------------------------- 3-- 4-- Parser.y, part of Alex 5-- 6-- (c) Simon Marlow 2003 7-- 8-- ----------------------------------------------------------------------------- 9 10{-# OPTIONS_GHC -w #-} 11 12module Parser ( parse, P ) where 13import AbsSyn 14import Scan 15import CharSet 16import ParseMonad hiding ( StartCode ) 17 18import Data.Char 19--import Debug.Trace 20} 21 22%tokentype { Token } 23 24%name parse 25 26%monad { P } { (>>=) } { return } 27%lexer { lexer } { T _ EOFT } 28 29%token 30 '.' { T _ (SpecialT '.') } 31 ';' { T _ (SpecialT ';') } 32 '<' { T _ (SpecialT '<') } 33 '>' { T _ (SpecialT '>') } 34 ',' { T _ (SpecialT ',') } 35 '$' { T _ (SpecialT '$') } 36 '|' { T _ (SpecialT '|') } 37 '*' { T _ (SpecialT '*') } 38 '+' { T _ (SpecialT '+') } 39 '?' { T _ (SpecialT '?') } 40 '{' { T _ (SpecialT '{') } 41 '}' { T _ (SpecialT '}') } 42 '(' { T _ (SpecialT '(') } 43 ')' { T _ (SpecialT ')') } 44 '#' { T _ (SpecialT '#') } 45 '~' { T _ (SpecialT '~') } 46 '-' { T _ (SpecialT '-') } 47 '[' { T _ (SpecialT '[') } 48 ']' { T _ (SpecialT ']') } 49 '^' { T _ (SpecialT '^') } 50 '/' { T _ (SpecialT '/') } 51 ZERO { T _ ZeroT } 52 STRING { T _ (StringT $$) } 53 BIND { T _ (BindT $$) } 54 ID { T _ (IdT $$) } 55 CODE { T _ (CodeT _) } 56 CHAR { T _ (CharT $$) } 57 SMAC { T _ (SMacT _) } 58 RMAC { T _ (RMacT $$) } 59 SMAC_DEF { T _ (SMacDefT $$) } 60 RMAC_DEF { T _ (RMacDefT $$) } 61 WRAPPER { T _ WrapperT } 62 ENCODING { T _ EncodingT } 63 ACTIONTYPE { T _ ActionTypeT } 64 TOKENTYPE { T _ TokenTypeT } 65 TYPECLASS { T _ TypeClassT } 66%% 67 68alex :: { (Maybe (AlexPosn,Code), [Directive], Scanner, Maybe (AlexPosn,Code)) } 69 : maybe_code directives macdefs scanner maybe_code { ($1,$2,$4,$5) } 70 71maybe_code :: { Maybe (AlexPosn,Code) } 72 : CODE { case $1 of T pos (CodeT code) -> 73 Just (pos,code) } 74 | {- empty -} { Nothing } 75 76directives :: { [Directive] } 77 : directive directives { $1 : $2 } 78 | {- empty -} { [] } 79 80directive :: { Directive } 81 : WRAPPER STRING { WrapperDirective $2 } 82 | ENCODING encoding { EncodingDirective $2 } 83 | ACTIONTYPE STRING { ActionType $2 } 84 | TOKENTYPE STRING { TokenType $2 } 85 | TYPECLASS STRING { TypeClass $2 } 86 87encoding :: { Encoding } 88 : STRING {% lookupEncoding $1 } 89 90macdefs :: { () } 91 : macdef macdefs { () } 92 | {- empty -} { () } 93 94-- hack: the lexer looks for the '=' in a macro definition, because there 95-- doesn't seem to be a way to formulate the grammar here to avoid a 96-- conflict (it needs LR(2) rather than LR(1) to find the '=' and distinguish 97-- an SMAC/RMAC at the beginning of a definition from an SMAC/RMAC that is 98-- part of a regexp in the previous definition). 99macdef :: { () } 100 : SMAC_DEF set {% newSMac $1 $2 } 101 | RMAC_DEF rexp {% newRMac $1 $2 } 102 103scanner :: { Scanner } 104 : BIND tokendefs { Scanner $1 $2 } 105 106tokendefs :: { [RECtx] } 107 : tokendef tokendefs { $1 ++ $2 } 108 | {- empty -} { [] } 109 110tokendef :: { [RECtx] } 111 : startcodes rule { [ replaceCodes $1 $2 ] } 112 | startcodes '{' rules '}' { map (replaceCodes $1) $3 } 113 | rule { [ $1 ] } 114 115rule :: { RECtx } 116 : context rhs { let (l,e,r) = $1 in 117 RECtx [] l e r $2 } 118 119rules :: { [RECtx] } 120 : rule rules { $1 : $2 } 121 | {- empty -} { [] } 122 123startcodes :: { [(String,StartCode)] } 124 : '<' startcodes0 '>' { $2 } 125 126startcodes0 :: { [(String,StartCode)] } 127 : startcode ',' startcodes0 { ($1,0) : $3 } 128 | startcode { [($1,0)] } 129 130startcode :: { String } 131 : ZERO { "0" } 132 | ID { $1 } 133 134rhs :: { Maybe Code } 135 : CODE { case $1 of T _ (CodeT code) -> Just code } 136 | ';' { Nothing } 137 138context :: { Maybe CharSet, RExp, RightContext RExp } 139 : left_ctx rexp right_ctx { (Just $1,$2,$3) } 140 | rexp right_ctx { (Nothing,$1,$2) } 141 142left_ctx :: { CharSet } 143 : '^' { charSetSingleton '\n' } 144 | set '^' { $1 } 145 146right_ctx :: { RightContext RExp } 147 : '$' { RightContextRExp (Ch (charSetSingleton '\n')) } 148 | '/' rexp { RightContextRExp $2 } 149 | '/' CODE { RightContextCode (case $2 of 150 T _ (CodeT code) -> code) } 151 | {- empty -} { NoRightContext } 152 153rexp :: { RExp } 154 : alt '|' rexp { $1 :| $3 } 155 | alt { $1 } 156 157alt :: { RExp } 158 : alt term { $1 :%% $2 } 159 | term { $1 } 160 161term :: { RExp } 162 : rexp0 rep { $2 $1 } 163 | rexp0 { $1 } 164 165rep :: { RExp -> RExp } 166 : '*' { Star } 167 | '+' { Plus } 168 | '?' { Ques } 169 -- TODO: these don't check for digits 170 -- properly. 171 | '{' CHAR '}' { repeat_rng (digit $2) Nothing } 172 | '{' CHAR ',' '}' { repeat_rng (digit $2) (Just Nothing) } 173 | '{' CHAR ',' CHAR '}' { repeat_rng (digit $2) (Just (Just (digit $4))) } 174 175rexp0 :: { RExp } 176 : '(' ')' { Eps } 177 | STRING { foldr (:%%) Eps 178 (map (Ch . charSetSingleton) $1) } 179 | RMAC {% lookupRMac $1 } 180 | set { Ch $1 } 181 | '(' rexp ')' { $2 } 182 183set :: { CharSet } 184 : set '#' set0 { $1 `charSetMinus` $3 } 185 | set0 { $1 } 186 187set0 :: { CharSet } 188 : CHAR { charSetSingleton $1 } 189 | CHAR '-' CHAR { charSetRange $1 $3 } 190 | smac {% lookupSMac $1 } 191 | '[' sets ']' { foldr charSetUnion emptyCharSet $2 } 192 193 -- [^sets] is the same as '. # [sets]' 194 -- The upshot is that [^set] does *not* match a newline character, 195 -- which seems much more useful than just taking the complement. 196 | '[' '^' sets ']' 197 {% do { dot <- lookupSMac (tokPosn $1, "."); 198 return (dot `charSetMinus` 199 foldr charSetUnion emptyCharSet $3) }} 200 201 -- ~set is the same as '. # set' 202 | '~' set0 {% do { dot <- lookupSMac (tokPosn $1, "."); 203 return (dot `charSetMinus` $2) } } 204 205sets :: { [CharSet] } 206 : set sets { $1 : $2 } 207 | {- empty -} { [] } 208 209smac :: { (AlexPosn,String) } 210 : '.' { (tokPosn $1, ".") } 211 | SMAC { case $1 of T p (SMacT s) -> (p, s) } 212 213{ 214happyError :: P a 215happyError = failP "parse error" 216 217-- ----------------------------------------------------------------------------- 218-- Utils 219 220digit c = ord c - ord '0' 221 222repeat_rng :: Int -> Maybe (Maybe Int) -> (RExp->RExp) 223repeat_rng n (Nothing) re = foldr (:%%) Eps (replicate n re) 224repeat_rng n (Just Nothing) re = foldr (:%%) (Star re) (replicate n re) 225repeat_rng n (Just (Just m)) re = intl :%% rst 226 where 227 intl = repeat_rng n Nothing re 228 rst = foldr (\re re'->Ques(re :%% re')) Eps (replicate (m-n) re) 229 230replaceCodes codes rectx = rectx{ reCtxStartCodes = codes } 231 232lookupEncoding :: String -> P Encoding 233lookupEncoding s = case map toLower s of 234 "iso-8859-1" -> return Latin1 235 "latin1" -> return Latin1 236 "utf-8" -> return UTF8 237 "utf8" -> return UTF8 238 _ -> failP ("encoding " ++ show s ++ " not supported") 239 240} 241