1{-# OPTIONS_GHC -fno-warn-missing-signatures #-} 2-- | This is a POSIX version of parseRegex that allows NUL characters. 3-- Lazy\/Possessive\/Backrefs are not recognized. Anchors \^ and \$ are 4-- recognized. 5-- 6-- The PGroup returned always have (Maybe GroupIndex) set to (Just _) 7-- and never to Nothing. 8module Text.Regex.TDFA.ReadRegex (parseRegex) where 9 10{- By Chris Kuklewicz, 2007. BSD License, see the LICENSE file. -} 11 12import Text.Regex.TDFA.Pattern {- all -} 13import Text.ParserCombinators.Parsec((<|>), (<?>), 14 unexpected, try, runParser, many, getState, setState, CharParser, ParseError, 15 sepBy1, option, notFollowedBy, many1, lookAhead, eof, between, 16 string, noneOf, digit, char, anyChar) 17import Control.Monad(liftM, when, guard) 18import qualified Data.Set as Set(fromList) 19 20-- | BracketElement is internal to this module 21data BracketElement = BEChar Char | BEChars String | BEColl String | BEEquiv String | BEClass String 22 23-- | Return either an error message or a tuple of the Pattern and the 24-- largest group index and the largest DoPa index (both have smallest 25-- index of 1). Since the regular expression is supplied as [Char] it 26-- automatically supports unicode and @\\NUL@ characters. 27parseRegex :: String -> Either ParseError (Pattern,(GroupIndex,DoPa)) 28parseRegex x = runParser (do pat <- p_regex 29 eof 30 (lastGroupIndex,lastDopa) <- getState 31 return (pat,(lastGroupIndex,DoPa lastDopa))) (0,0) x x 32 33p_regex :: CharParser (GroupIndex,Int) Pattern 34p_regex = liftM POr $ sepBy1 p_branch (char '|') 35 36-- man re_format helps alot, it says one-or-more pieces so this is 37-- many1 not many. Use "()" to indicate an empty piece. 38p_branch = liftM PConcat $ many1 p_piece 39 40p_piece = (p_anchor <|> p_atom) >>= p_post_atom -- correct specification 41 42p_atom = p_group <|> p_bracket <|> p_char <?> "an atom" 43 44group_index :: CharParser (GroupIndex,Int) (Maybe GroupIndex) 45group_index = do 46 (gi,ci) <- getState 47 let index = succ gi 48 setState (index,ci) 49 return (Just index) 50 51p_group = lookAhead (char '(') >> do 52 index <- group_index 53 liftM (PGroup index) $ between (char '(') (char ')') p_regex 54 55-- p_post_atom takes the previous atom as a parameter 56p_post_atom atom = (char '?' >> return (PQuest atom)) 57 <|> (char '+' >> return (PPlus atom)) 58 <|> (char '*' >> return (PStar True atom)) 59 <|> p_bound atom 60 <|> return atom 61 62p_bound atom = try $ between (char '{') (char '}') (p_bound_spec atom) 63 64p_bound_spec atom = do lowS <- many1 digit 65 let lowI = read lowS 66 highMI <- option (Just lowI) $ try $ do 67 _ <- char ',' 68 -- parsec note: if 'many digits' fails below then the 'try' ensures 69 -- that the ',' will not match the closing '}' in p_bound, same goes 70 -- for any non '}' garbage after the 'many digits'. 71 highS <- many digit 72 if null highS then return Nothing -- no upper bound 73 else do let highI = read highS 74 guard (lowI <= highI) 75 return (Just (read highS)) 76 return (PBound lowI highMI atom) 77 78-- An anchor cannot be modified by a repetition specifier 79p_anchor = (char '^' >> liftM PCarat char_index) 80 <|> (char '$' >> liftM PDollar char_index) 81 <|> try (do _ <- string "()" 82 index <- group_index 83 return $ PGroup index PEmpty) 84 <?> "empty () or anchor ^ or $" 85 86char_index = do (gi,ci) <- getState 87 let ci' = succ ci 88 setState (gi,ci') 89 return (DoPa ci') 90 91p_char = p_dot <|> p_left_brace <|> p_escaped <|> p_other_char where 92 p_dot = char '.' >> char_index >>= return . PDot 93 p_left_brace = try $ (char '{' >> notFollowedBy digit >> char_index >>= return . (`PChar` '{')) 94 p_escaped = char '\\' >> anyChar >>= \c -> char_index >>= return . (`PEscape` c) 95 p_other_char = noneOf specials >>= \c -> char_index >>= return . (`PChar` c) 96 where specials = "^.[$()|*+?{\\" 97 98-- parse [bar] and [^bar] sets of characters 99p_bracket = (char '[') >> ( (char '^' >> p_set True) <|> (p_set False) ) 100 101-- p_set :: Bool -> GenParser Char st Pattern 102p_set invert = do initial <- (option "" ((char ']' >> return "]") <|> (char '-' >> return "-"))) 103 values <- if null initial then many1 p_set_elem else many p_set_elem 104 _ <- char ']' 105 ci <- char_index 106 let chars = maybe'set $ initial 107 ++ [c | BEChar c <- values ] 108 ++ concat [s | BEChars s <- values ] 109 colls = maybe'set [PatternSetCollatingElement coll | BEColl coll <- values ] 110 equivs = maybe'set [PatternSetEquivalenceClass equiv | BEEquiv equiv <- values] 111 class's = maybe'set [PatternSetCharacterClass a'class | BEClass a'class <- values] 112 maybe'set x = if null x then Nothing else Just (Set.fromList x) 113 sets = PatternSet chars class's colls equivs 114 sets `seq` return $ if invert then PAnyNot ci sets else PAny ci sets 115 116-- From here down the code is the parser and functions for pattern [ ] set things 117 118p_set_elem = p_set_elem_class <|> p_set_elem_equiv <|> p_set_elem_coll 119 <|> p_set_elem_range <|> p_set_elem_char <?> "Failed to parse bracketed string" 120 121p_set_elem_class = liftM BEClass $ 122 try (between (string "[:") (string ":]") (many1 $ noneOf ":]")) 123 124p_set_elem_equiv = liftM BEEquiv $ 125 try (between (string "[=") (string "=]") (many1 $ noneOf "=]")) 126 127p_set_elem_coll = liftM BEColl $ 128 try (between (string "[.") (string ".]") (many1 $ noneOf ".]")) 129 130p_set_elem_range = try $ do 131 start <- noneOf "]-" 132 _ <- char '-' 133 end <- noneOf "]" 134 -- bug fix: check start <= end before "return (BEChars [start..end])" 135 if start <= end 136 then return (BEChars [start..end]) 137 else unexpected "End point of dashed character range is less than starting point" 138 139p_set_elem_char = do 140 c <- noneOf "]" 141 when (c == '-') $ do 142 atEnd <- (lookAhead (char ']') >> return True) <|> (return False) 143 when (not atEnd) (unexpected "A dash is in the wrong place in a bracket") 144 return (BEChar c) 145 146