1{-# OPTIONS_GHC -fno-warn-missing-signatures #-}
2-- | This is a POSIX version of parseRegex that allows NUL characters.
3-- Lazy\/Possessive\/Backrefs are not recognized.  Anchors \^ and \$ are
4-- recognized.
5--
6-- The PGroup returned always have (Maybe GroupIndex) set to (Just _)
7-- and never to Nothing.
8module Text.Regex.TDFA.ReadRegex (parseRegex) where
9
10{- By Chris Kuklewicz, 2007. BSD License, see the LICENSE file. -}
11
12import Text.Regex.TDFA.Pattern {- all -}
13import Text.ParserCombinators.Parsec((<|>), (<?>),
14  unexpected, try, runParser, many, getState, setState, CharParser, ParseError,
15  sepBy1, option, notFollowedBy, many1, lookAhead, eof, between,
16  string, noneOf, digit, char, anyChar)
17import Control.Monad(liftM, when, guard)
18import qualified Data.Set as Set(fromList)
19
20-- | BracketElement is internal to this module
21data BracketElement = BEChar Char | BEChars String | BEColl String | BEEquiv String | BEClass String
22
23-- | Return either an error message or a tuple of the Pattern and the
24-- largest group index and the largest DoPa index (both have smallest
25-- index of 1).  Since the regular expression is supplied as [Char] it
26-- automatically supports unicode and @\\NUL@ characters.
27parseRegex :: String -> Either ParseError (Pattern,(GroupIndex,DoPa))
28parseRegex x = runParser (do pat <- p_regex
29                             eof
30                             (lastGroupIndex,lastDopa) <- getState
31                             return (pat,(lastGroupIndex,DoPa lastDopa))) (0,0) x x
32
33p_regex :: CharParser (GroupIndex,Int) Pattern
34p_regex = liftM POr $ sepBy1 p_branch (char '|')
35
36-- man re_format helps alot, it says one-or-more pieces so this is
37-- many1 not many.  Use "()" to indicate an empty piece.
38p_branch = liftM PConcat $ many1 p_piece
39
40p_piece = (p_anchor <|> p_atom) >>= p_post_atom -- correct specification
41
42p_atom =  p_group <|> p_bracket <|> p_char <?> "an atom"
43
44group_index :: CharParser (GroupIndex,Int) (Maybe GroupIndex)
45group_index = do
46  (gi,ci) <- getState
47  let index = succ gi
48  setState (index,ci)
49  return (Just index)
50
51p_group = lookAhead (char '(') >> do
52  index <- group_index
53  liftM (PGroup index) $ between (char '(') (char ')') p_regex
54
55-- p_post_atom takes the previous atom as a parameter
56p_post_atom atom = (char '?' >> return (PQuest atom))
57               <|> (char '+' >> return (PPlus atom))
58               <|> (char '*' >> return (PStar True atom))
59               <|> p_bound atom
60               <|> return atom
61
62p_bound atom = try $ between (char '{') (char '}') (p_bound_spec atom)
63
64p_bound_spec atom = do lowS <- many1 digit
65                       let lowI = read lowS
66                       highMI <- option (Just lowI) $ try $ do
67                                   _ <- char ','
68  -- parsec note: if 'many digits' fails below then the 'try' ensures
69  -- that the ',' will not match the closing '}' in p_bound, same goes
70  -- for any non '}' garbage after the 'many digits'.
71                                   highS <- many digit
72                                   if null highS then return Nothing -- no upper bound
73                                     else do let highI = read highS
74                                             guard (lowI <= highI)
75                                             return (Just (read highS))
76                       return (PBound lowI highMI atom)
77
78-- An anchor cannot be modified by a repetition specifier
79p_anchor = (char '^' >> liftM PCarat char_index)
80       <|> (char '$' >> liftM PDollar char_index)
81       <|> try (do _ <- string "()"
82                   index <- group_index
83                   return $ PGroup index PEmpty)
84       <?> "empty () or anchor ^ or $"
85
86char_index = do (gi,ci) <- getState
87                let ci' = succ ci
88                setState (gi,ci')
89                return (DoPa ci')
90
91p_char = p_dot <|> p_left_brace <|> p_escaped <|> p_other_char where
92  p_dot = char '.' >> char_index >>= return . PDot
93  p_left_brace = try $ (char '{' >> notFollowedBy digit >> char_index >>= return . (`PChar` '{'))
94  p_escaped = char '\\' >> anyChar >>= \c -> char_index >>= return . (`PEscape` c)
95  p_other_char = noneOf specials >>= \c -> char_index >>= return . (`PChar` c)
96    where specials  = "^.[$()|*+?{\\"
97
98-- parse [bar] and [^bar] sets of characters
99p_bracket = (char '[') >> ( (char '^' >> p_set True) <|> (p_set False) )
100
101-- p_set :: Bool -> GenParser Char st Pattern
102p_set invert = do initial <- (option "" ((char ']' >> return "]") <|> (char '-' >> return "-")))
103                  values <- if null initial then many1 p_set_elem else many p_set_elem
104                  _ <- char ']'
105                  ci <- char_index
106                  let chars = maybe'set $ initial
107                                          ++ [c | BEChar c <- values ]
108                                          ++ concat [s | BEChars s <- values ]
109                      colls = maybe'set [PatternSetCollatingElement coll | BEColl coll <- values ]
110                      equivs = maybe'set [PatternSetEquivalenceClass equiv | BEEquiv equiv <- values]
111                      class's = maybe'set [PatternSetCharacterClass a'class | BEClass a'class <- values]
112                      maybe'set x = if null x then Nothing else Just (Set.fromList x)
113                      sets = PatternSet chars class's colls equivs
114                  sets `seq` return $ if invert then PAnyNot ci sets else PAny ci sets
115
116-- From here down the code is the parser and functions for pattern [ ] set things
117
118p_set_elem = p_set_elem_class <|> p_set_elem_equiv <|> p_set_elem_coll
119         <|> p_set_elem_range <|> p_set_elem_char <?> "Failed to parse bracketed string"
120
121p_set_elem_class = liftM BEClass $
122  try (between (string "[:") (string ":]") (many1 $ noneOf ":]"))
123
124p_set_elem_equiv = liftM BEEquiv $
125  try (between (string "[=") (string "=]") (many1 $ noneOf "=]"))
126
127p_set_elem_coll =  liftM BEColl $
128  try (between (string "[.") (string ".]") (many1 $ noneOf ".]"))
129
130p_set_elem_range = try $ do
131  start <- noneOf "]-"
132  _  <- char '-'
133  end <- noneOf "]"
134  -- bug fix: check start <= end before "return (BEChars [start..end])"
135  if start <= end
136    then return (BEChars [start..end])
137    else unexpected "End point of dashed character range is less than starting point"
138
139p_set_elem_char = do
140  c <- noneOf "]"
141  when (c == '-') $ do
142    atEnd <- (lookAhead (char ']') >> return True) <|> (return False)
143    when (not atEnd) (unexpected "A dash is in the wrong place in a bracket")
144  return (BEChar c)
145
146