1{-# LANGUAGE CPP #-} 2{-# LANGUAGE LambdaCase #-} 3{-# LANGUAGE BangPatterns #-} 4{-# LANGUAGE OverloadedStrings #-} 5module Regex.KDE.Compile 6 (compileRegex) 7 where 8 9import Data.Word (Word8) 10import qualified Data.ByteString as B 11import Data.ByteString (ByteString) 12import qualified Data.ByteString.UTF8 as U 13import Safe 14import Data.Attoparsec.ByteString as A hiding (match) 15import Data.Char 16import Control.Applicative 17import Regex.KDE.Regex 18import Control.Monad.State.Strict 19#if !MIN_VERSION_base(4,11,0) 20import Data.Semigroup ((<>)) 21#endif 22 23-- | Compile a UTF-8 encoded ByteString as a Regex. If the first 24-- parameter is True, then the Regex will be case sensitive. 25compileRegex :: Bool -> ByteString -> Either String Regex 26compileRegex caseSensitive bs = 27 let !res = parseOnly (evalStateT parser 0) bs 28 in res 29 where 30 parser = do 31 !re <- pRegex caseSensitive 32 (re <$ lift A.endOfInput) <|> 33 do rest <- lift A.takeByteString 34 fail $ "parse error at byte position " ++ 35 show (B.length bs - B.length rest) 36 37type RParser = StateT Int Parser 38 39pRegex :: Bool -> RParser Regex 40pRegex caseSensitive = 41 option MatchNull $ 42 foldr MatchAlt 43 <$> (pAltPart caseSensitive) 44 <*> (many $ lift (char '|') *> (pAltPart caseSensitive <|> pure mempty)) 45 46pAltPart :: Bool -> RParser Regex 47pAltPart caseSensitive = mconcat <$> many1 (pRegexPart caseSensitive) 48 49char :: Char -> Parser Char 50char c = 51 c <$ satisfy (== fromIntegral (ord c)) 52 53pRegexPart :: Bool -> RParser Regex 54pRegexPart caseSensitive = 55 (lift (pRegexChar caseSensitive) <|> pParenthesized caseSensitive) >>= 56 lift . pSuffix 57 58pParenthesized :: Bool -> RParser Regex 59pParenthesized caseSensitive = do 60 _ <- lift (satisfy (== 40)) 61 -- pcrepattern says: A group that starts with (?| resets the capturing 62 -- parentheses numbers in each alternative. 63 resetCaptureNumbers <- option False (True <$ lift (string "?|")) 64 modifier <- if resetCaptureNumbers 65 then return id 66 else lift (satisfy (== 63) *> pGroupModifiers) 67 <|> (MatchCapture <$> (modify (+ 1) *> get)) 68 currentCaptureNumber <- get 69 contents <- option MatchNull $ 70 foldr MatchAlt 71 <$> (pAltPart caseSensitive) 72 <*> (many $ lift (char '|') *> 73 (((if resetCaptureNumbers 74 then put currentCaptureNumber 75 else return ()) >> pAltPart caseSensitive) <|> pure mempty)) 76 _ <- lift (satisfy (== 41)) 77 return $ modifier contents 78 79pGroupModifiers :: Parser (Regex -> Regex) 80pGroupModifiers = 81 (id <$ char ':') 82 <|> 83 do dir <- option Forward $ Backward <$ char '<' 84 (AssertPositive dir <$ char '=') <|> (AssertNegative dir <$ char '!') 85 <|> 86 do n <- satisfy (\d -> d >= 48 && d <= 57) 87 return (\_ -> Subroutine (fromIntegral n - 48)) 88 <|> 89 do _ <- satisfy (== 82) -- R 90 return (\_ -> Subroutine 0) 91 92pSuffix :: Regex -> Parser Regex 93pSuffix re = option re $ do 94 w <- satisfy (\x -> x == 42 || x == 43 || x == 63 || x == 123) 95 (case w of 96 42 -> return $ MatchAlt (MatchSome re) MatchNull 97 43 -> return $ MatchSome re 98 63 -> return $ MatchAlt re MatchNull 99 123 -> do 100 let isDig x = x >= 48 && x < 58 101 minn <- option Nothing $ readMay . U.toString <$> A.takeWhile isDig 102 maxn <- option Nothing $ char ',' *> 103 (readMay . U.toString <$> A.takeWhile isDig) 104 _ <- char '}' 105 case (minn, maxn) of 106 (Nothing, Nothing) -> mzero 107 (Just n, Nothing) -> return $! atleast n re 108 (Nothing, Just n) -> return $! atmost n re 109 (Just m, Just n) -> return $! between m n re 110 _ -> fail "pSuffix encountered impossible byte") >>= pQuantifierModifier 111 where 112 atmost 0 _ = MatchNull 113 atmost n r = MatchAlt (mconcat (replicate n r)) (atmost (n-1) r) 114 115 between 0 n r = atmost n r 116 between m n r = mconcat (replicate m r) <> atmost (n - m) r 117 118 atleast n r = mconcat (replicate n r) <> MatchAlt (MatchSome r) MatchNull 119 120pQuantifierModifier :: Regex -> Parser Regex 121pQuantifierModifier re = option re $ 122 (Possessive re <$ satisfy (== 43)) <|> 123 (Lazy re <$ satisfy (==63)) 124 125pRegexChar :: Bool -> Parser Regex 126pRegexChar caseSensitive = do 127 w <- satisfy $ const True 128 case w of 129 46 -> return MatchAnyChar 130 37 -> (do -- dynamic %1 %2 131 ds <- A.takeWhile1 (\x -> x >= 48 && x <= 57) 132 case readMay (U.toString ds) of 133 Just !n -> return $ MatchDynamic n 134 Nothing -> fail "not a number") 135 <|> return (MatchChar (== '%')) 136 92 -> pRegexEscapedChar 137 36 -> return AssertEnd 138 94 -> return AssertBeginning 139 91 -> pRegexCharClass 140 _ | w < 128 141 , not (isSpecial w) 142 -> do let c = chr $ fromIntegral w 143 return $! MatchChar $ 144 if caseSensitive 145 then (== c) 146 else (\d -> toLower d == toLower c) 147 | w >= 0xc0 -> do 148 rest <- case w of 149 _ | w >= 0xf0 -> A.take 3 150 | w >= 0xe0 -> A.take 2 151 | otherwise -> A.take 1 152 case U.uncons (B.cons w rest) of 153 Just (d, _) -> return $! MatchChar $ 154 if caseSensitive 155 then (== d) 156 else (\e -> toLower e == toLower d) 157 Nothing -> fail "could not decode as UTF8" 158 | otherwise -> mzero 159 160pRegexEscapedChar :: Parser Regex 161pRegexEscapedChar = do 162 c <- anyChar 163 (case c of 164 'b' -> return AssertWordBoundary 165 '{' -> do -- captured pattern: \1 \2 \{12} 166 ds <- A.takeWhile1 (\x -> x >= 48 && x <= 57) 167 _ <- char '}' 168 case readMay (U.toString ds) of 169 Just !n -> return $ MatchCaptured $ n 170 Nothing -> fail "not a number" 171 'd' -> return $ MatchChar isDigit 172 'D' -> return $ MatchChar (not . isDigit) 173 's' -> return $ MatchChar isSpace 174 'S' -> return $ MatchChar (not . isSpace) 175 'w' -> return $ MatchChar isWordChar 176 'W' -> return $ MatchChar (not . isWordChar) 177 _ | c >= '0' && c <= '9' -> 178 return $! MatchCaptured (ord c - ord '0') 179 | otherwise -> mzero) <|> (MatchChar . (==) <$> pEscaped c) 180 181pEscaped :: Char -> Parser Char 182pEscaped c = 183 case c of 184 '\\' -> return c 185 'a' -> return '\a' 186 'f' -> return '\f' 187 'n' -> return '\n' 188 'r' -> return '\r' 189 't' -> return '\t' 190 'v' -> return '\v' 191 '0' -> do -- \0ooo matches octal ooo 192 ds <- A.take 3 193 case readMay ("'\\o" ++ U.toString ds ++ "'") of 194 Just x -> return x 195 Nothing -> fail "invalid octal character escape" 196 _ | c >= '1' && c <= '7' -> do 197 -- \123 matches octal 123, \1 matches octal 1 198 let octalDigitScanner s w 199 | s < 3, w >= 48 && w <= 55 200 = Just (s + 1) -- digits 0-7 201 | otherwise = Nothing 202 ds <- A.scan (1 :: Int) octalDigitScanner 203 case readMay ("'\\o" ++ [c] ++ U.toString ds ++ "'") of 204 Just x -> return x 205 Nothing -> fail "invalid octal character escape" 206 'z' -> do -- \zhhhh matches unicode hex char hhhh 207 ds <- A.take 4 208 case readMay ("'\\x" ++ U.toString ds ++ "'") of 209 Just x -> return x 210 Nothing -> fail "invalid hex character escape" 211 _ | c >= '1' && c <= '7' -> do -- \ooo octal undocument form but works 212 ds <- A.take 2 213 case readMay ("'\\o" ++ c : U.toString ds ++ "'") of 214 Just x -> return x 215 Nothing -> fail "invalid octal character escape" 216 | otherwise -> return c 217 218pRegexCharClass :: Parser Regex 219pRegexCharClass = do 220 negated <- option False $ True <$ satisfy (== 94) -- '^' 221 let getEscapedClass = do 222 _ <- satisfy (== 92) -- backslash 223 (isDigit <$ char 'd') 224 <|> (not . isDigit <$ char 'D') 225 <|> (isSpace <$ char 's') 226 <|> (not . isSpace <$ char 'S') 227 <|> (isWordChar <$ char 'w') 228 <|> (not . isWordChar <$ char 'W') 229 let getPosixClass = do 230 _ <- string "[:" 231 localNegated <- option False $ True <$ satisfy (== 94) -- '^' 232 res <- (isAlphaNum <$ string "alnum") 233 <|> (isAlpha <$ string "alpha") 234 <|> (isAscii <$ string "ascii") 235 <|> ((\c -> isSpace c && c `notElem` ['\n','\r','\f','\v']) <$ 236 string "blank") 237 <|> (isControl <$ string "cntrl") 238 <|> ((\c -> isPrint c || isSpace c) <$ string "graph:") 239 <|> (isLower <$ string "lower") 240 <|> (isUpper <$ string "upper") 241 <|> (isPrint <$ string "print") 242 <|> (isPunctuation <$ string "punct") 243 <|> (isSpace <$ string "space") 244 <|> ((\c -> isAlphaNum c || 245 generalCategory c == ConnectorPunctuation) 246 <$ string "word:") 247 <|> (isHexDigit <$ string "xdigit") 248 _ <- string ":]" 249 return $! if localNegated then not . res else res 250 let getC = (satisfy (== 92) *> anyChar >>= pEscaped) <|> 251 (chr . fromIntegral <$> satisfy (\x -> x /= 92 && x /= 93)) -- \ ] 252 let getCRange = do 253 c <- getC 254 (\d -> (\x -> x >= c && x <= d)) <$> (char '-' *> getC) <|> 255 return (== c) 256 brack <- option [] $ [(==']')] <$ char ']' 257 fs <- many (getEscapedClass <|> getPosixClass <|> getCRange) 258 _ <- satisfy (== 93) -- ] 259 let f c = any ($ c) $ brack ++ fs 260 return $! MatchChar (if negated then (not . f) else f) 261 262anyChar :: Parser Char 263anyChar = do 264 w <- satisfy (const True) 265 return $! chr $ fromIntegral w 266 267isSpecial :: Word8 -> Bool 268isSpecial 92 = True -- '\\' 269isSpecial 63 = True -- '?' 270isSpecial 42 = True -- '*' 271isSpecial 43 = True -- '+' 272-- isSpecial 123 = True -- '{' -- this is okay except in suffixes 273isSpecial 91 = True -- '[' 274isSpecial 93 = True -- ']' 275isSpecial 37 = True -- '%' 276isSpecial 40 = True -- '(' 277isSpecial 41 = True -- ')' 278isSpecial 124 = True -- '|' 279isSpecial 46 = True -- '.' 280isSpecial 36 = True -- '$' 281isSpecial 94 = True -- '^' 282isSpecial _ = False 283 284