1{
2-- -----------------------------------------------------------------------------
3--
4-- Parser.y, part of Alex
5--
6-- (c) Simon Marlow 2003
7--
8-- -----------------------------------------------------------------------------
9
10{-# OPTIONS_GHC -w #-}
11
12module Parser ( parse, P ) where
13import AbsSyn
14import Scan
15import CharSet
16import ParseMonad hiding ( StartCode )
17
18import Data.Char
19--import Debug.Trace
20}
21
22%tokentype { Token }
23
24%name parse
25
26%monad { P } { (>>=) } { return }
27%lexer { lexer } { T _ EOFT }
28
29%token
30	'.'		{ T _ (SpecialT '.') }
31	';'		{ T _ (SpecialT ';') }
32	'<'		{ T _ (SpecialT '<') }
33	'>'		{ T _ (SpecialT '>') }
34	','		{ T _ (SpecialT ',') }
35	'$'		{ T _ (SpecialT '$') }
36	'|'		{ T _ (SpecialT '|') }
37	'*'		{ T _ (SpecialT '*') }
38	'+'		{ T _ (SpecialT '+') }
39	'?'		{ T _ (SpecialT '?') }
40	'{'		{ T _ (SpecialT '{') }
41	'}'		{ T _ (SpecialT '}') }
42	'('		{ T _ (SpecialT '(') }
43	')'		{ T _ (SpecialT ')') }
44	'#'		{ T _ (SpecialT '#') }
45	'~'		{ T _ (SpecialT '~') }
46	'-'		{ T _ (SpecialT '-') }
47	'['		{ T _ (SpecialT '[') }
48	']'		{ T _ (SpecialT ']') }
49	'^'		{ T _ (SpecialT '^') }
50	'/'		{ T _ (SpecialT '/') }
51	ZERO		{ T _ ZeroT }
52	STRING		{ T _ (StringT $$) }
53	BIND		{ T _ (BindT $$) }
54	ID		{ T _ (IdT $$) }
55	CODE		{ T _ (CodeT _) }
56	CHAR		{ T _ (CharT $$) }
57	SMAC		{ T _ (SMacT _) }
58	RMAC		{ T _ (RMacT $$) }
59	SMAC_DEF	{ T _ (SMacDefT $$) }
60	RMAC_DEF	{ T _ (RMacDefT $$) }
61	WRAPPER		{ T _ WrapperT }
62	ENCODING	{ T _ EncodingT }
63        ACTIONTYPE      { T _ ActionTypeT }
64        TOKENTYPE       { T _ TokenTypeT }
65        TYPECLASS       { T _ TypeClassT }
66%%
67
68alex	:: { (Maybe (AlexPosn,Code), [Directive], Scanner, Maybe (AlexPosn,Code)) }
69	: maybe_code directives macdefs scanner maybe_code { ($1,$2,$4,$5) }
70
71maybe_code :: { Maybe (AlexPosn,Code) }
72	: CODE				{ case $1 of T pos (CodeT code) ->
73						Just (pos,code) }
74	| {- empty -}			{ Nothing }
75
76directives :: { [Directive] }
77	: directive directives		{ $1 : $2 }
78	| {- empty -}			{ [] }
79
80directive  :: { Directive }
81	: WRAPPER STRING		{ WrapperDirective $2 }
82	| ENCODING encoding		{ EncodingDirective $2 }
83        | ACTIONTYPE STRING             { ActionType $2 }
84        | TOKENTYPE STRING              { TokenType $2 }
85        | TYPECLASS STRING              { TypeClass $2 }
86
87encoding :: { Encoding }
88        : STRING         		{% lookupEncoding $1 }
89
90macdefs :: { () }
91	: macdef macdefs		{ () }
92	| {- empty -}			{ () }
93
94-- hack: the lexer looks for the '=' in a macro definition, because there
95-- doesn't seem to be a way to formulate the grammar here to avoid a
96-- conflict (it needs LR(2) rather than LR(1) to find the '=' and distinguish
97-- an SMAC/RMAC at the beginning of a definition from an SMAC/RMAC that is
98-- part of a regexp in the previous definition).
99macdef	:: { () }
100	: SMAC_DEF set			{% newSMac $1 $2 }
101	| RMAC_DEF rexp			{% newRMac $1 $2 }
102
103scanner	:: { Scanner }
104	: BIND tokendefs	 	{ Scanner $1 $2 }
105
106tokendefs :: { [RECtx] }
107	: tokendef tokendefs		{ $1 ++ $2 }
108	| {- empty -}			{ [] }
109
110tokendef :: { [RECtx] }
111	: startcodes rule		{ [ replaceCodes $1 $2 ] }
112	| startcodes '{' rules '}'	{ map (replaceCodes $1) $3 }
113	| rule				{ [ $1 ] }
114
115rule    :: { RECtx }
116	: context rhs			{ let (l,e,r) = $1 in
117					  RECtx [] l e r $2 }
118
119rules	:: { [RECtx] }
120	: rule rules			{ $1 : $2 }
121	| {- empty -}			{ [] }
122
123startcodes :: { [(String,StartCode)] }
124	: '<' startcodes0 '>' 		{ $2 }
125
126startcodes0 :: { [(String,StartCode)] }
127	: startcode ',' startcodes0 	{ ($1,0) : $3 }
128	| startcode 			{ [($1,0)] }
129
130startcode :: { String }
131	: ZERO 				{ "0" }
132	| ID	 			{ $1 }
133
134rhs	:: { Maybe Code }
135	: CODE 				{ case $1 of T _ (CodeT code) -> Just code }
136	| ';'	 			{ Nothing }
137
138context :: { Maybe CharSet, RExp, RightContext RExp }
139	: left_ctx rexp right_ctx	{ (Just $1,$2,$3) }
140	| rexp right_ctx		{ (Nothing,$1,$2) }
141
142left_ctx :: { CharSet }
143	: '^'				{ charSetSingleton '\n' }
144	| set '^' 			{ $1 }
145
146right_ctx :: { RightContext RExp }
147	: '$'		{ RightContextRExp (Ch (charSetSingleton '\n')) }
148	| '/' rexp	{ RightContextRExp $2 }
149        | '/' CODE	{ RightContextCode (case $2 of
150						T _ (CodeT code) -> code) }
151	| {- empty -}	{ NoRightContext }
152
153rexp	:: { RExp }
154	: alt '|' rexp 			{ $1 :| $3 }
155	| alt		 		{ $1 }
156
157alt	:: { RExp }
158	: alt term  			{ $1 :%% $2 }
159	| term 				{ $1 }
160
161term	:: { RExp }
162	: rexp0 rep 			{ $2 $1 }
163	| rexp0 			{ $1 }
164
165rep	:: { RExp -> RExp }
166	: '*' 				{ Star }
167	| '+' 				{ Plus }
168	| '?' 				{ Ques }
169					-- TODO: these don't check for digits
170					-- properly.
171	| '{' CHAR '}'			{ repeat_rng (digit $2) Nothing }
172	| '{' CHAR ',' '}'		{ repeat_rng (digit $2) (Just Nothing) }
173	| '{' CHAR ',' CHAR '}' 	{ repeat_rng (digit $2) (Just (Just (digit $4))) }
174
175rexp0	:: { RExp }
176	: '(' ')'  			{ Eps }
177	| STRING			{ foldr (:%%) Eps
178					    (map (Ch . charSetSingleton) $1) }
179	| RMAC 				{% lookupRMac $1 }
180	| set 				{ Ch $1 }
181	| '(' rexp ')' 			{ $2 }
182
183set	:: { CharSet }
184 	: set '#' set0 			{ $1 `charSetMinus` $3 }
185	| set0 				{ $1 }
186
187set0	:: { CharSet }
188	: CHAR 				{ charSetSingleton $1 }
189	| CHAR '-' CHAR			{ charSetRange $1 $3 }
190	| smac 				{% lookupSMac $1 }
191	| '[' sets ']' 			{ foldr charSetUnion emptyCharSet $2 }
192
193	-- [^sets] is the same as  '. # [sets]'
194	-- The upshot is that [^set] does *not* match a newline character,
195	-- which seems much more useful than just taking the complement.
196	| '[' '^' sets ']'
197			{% do { dot <- lookupSMac (tokPosn $1, ".");
198		      	        return (dot `charSetMinus`
199			      		  foldr charSetUnion emptyCharSet $3) }}
200
201	-- ~set is the same as '. # set'
202	| '~' set0	{% do { dot <- lookupSMac (tokPosn $1, ".");
203		      	        return (dot `charSetMinus` $2) } }
204
205sets	:: { [CharSet] }
206	: set sets			{ $1 : $2 }
207	| {- empty -}			{ [] }
208
209smac	:: { (AlexPosn,String) }
210 	: '.'				{ (tokPosn $1, ".") }
211	| SMAC				{ case $1 of T p (SMacT s) -> (p, s) }
212
213{
214happyError :: P a
215happyError = failP "parse error"
216
217-- -----------------------------------------------------------------------------
218-- Utils
219
220digit c = ord c - ord '0'
221
222repeat_rng :: Int -> Maybe (Maybe Int) -> (RExp->RExp)
223repeat_rng n (Nothing) re = foldr (:%%) Eps (replicate n re)
224repeat_rng n (Just Nothing) re = foldr (:%%) (Star re) (replicate n re)
225repeat_rng n (Just (Just m)) re = intl :%% rst
226	where
227	intl = repeat_rng n Nothing re
228	rst = foldr (\re re'->Ques(re :%% re')) Eps (replicate (m-n) re)
229
230replaceCodes codes rectx = rectx{ reCtxStartCodes = codes }
231
232lookupEncoding :: String -> P Encoding
233lookupEncoding s = case map toLower s of
234  "iso-8859-1" -> return Latin1
235  "latin1"     -> return Latin1
236  "utf-8"      -> return UTF8
237  "utf8"       -> return UTF8
238  _            -> failP ("encoding " ++ show s ++ " not supported")
239
240}
241