1{-# LANGUAGE CPP #-}
2{-# LANGUAGE LambdaCase #-}
3{-# LANGUAGE BangPatterns #-}
4{-# LANGUAGE OverloadedStrings #-}
5module Regex.KDE.Compile
6  (compileRegex)
7  where
8
9import Data.Word (Word8)
10import qualified Data.ByteString as B
11import Data.ByteString (ByteString)
12import qualified Data.ByteString.UTF8 as U
13import Safe
14import Data.Attoparsec.ByteString as A hiding (match)
15import Data.Char
16import Control.Applicative
17import Regex.KDE.Regex
18import Control.Monad.State.Strict
19#if !MIN_VERSION_base(4,11,0)
20import Data.Semigroup ((<>))
21#endif
22
23-- | Compile a UTF-8 encoded ByteString as a Regex.  If the first
24-- parameter is True, then the Regex will be case sensitive.
25compileRegex :: Bool -> ByteString -> Either String Regex
26compileRegex caseSensitive bs =
27  let !res = parseOnly (evalStateT parser 0) bs
28   in res
29 where
30   parser = do
31     !re <- pRegex caseSensitive
32     (re <$ lift A.endOfInput) <|>
33       do rest <- lift A.takeByteString
34          fail $ "parse error at byte position " ++
35                 show (B.length bs - B.length rest)
36
37type RParser = StateT Int Parser
38
39pRegex :: Bool -> RParser Regex
40pRegex caseSensitive =
41  option MatchNull $
42  foldr MatchAlt
43    <$> (pAltPart caseSensitive)
44    <*> (many $ lift (char '|') *> (pAltPart caseSensitive <|> pure mempty))
45
46pAltPart :: Bool -> RParser Regex
47pAltPart caseSensitive = mconcat <$> many1 (pRegexPart caseSensitive)
48
49char :: Char -> Parser Char
50char c =
51  c <$ satisfy (== fromIntegral (ord c))
52
53pRegexPart :: Bool -> RParser Regex
54pRegexPart caseSensitive =
55  (lift (pRegexChar caseSensitive) <|> pParenthesized caseSensitive) >>=
56     lift . pSuffix
57
58pParenthesized :: Bool -> RParser Regex
59pParenthesized caseSensitive = do
60  _ <- lift (satisfy (== 40))
61  -- pcrepattern says: A group that starts with (?| resets the capturing
62  -- parentheses numbers in each alternative.
63  resetCaptureNumbers <- option False (True <$ lift (string "?|"))
64  modifier <- if resetCaptureNumbers
65                 then return id
66                 else lift (satisfy (== 63) *> pGroupModifiers)
67                    <|> (MatchCapture <$> (modify (+ 1) *> get))
68  currentCaptureNumber <- get
69  contents <- option MatchNull $
70    foldr MatchAlt
71      <$> (pAltPart caseSensitive)
72      <*> (many $ lift (char '|') *>
73            (((if resetCaptureNumbers
74                  then put currentCaptureNumber
75                  else return ()) >> pAltPart caseSensitive) <|> pure mempty))
76  _ <- lift (satisfy (== 41))
77  return $ modifier contents
78
79pGroupModifiers :: Parser (Regex -> Regex)
80pGroupModifiers =
81  (id <$ char ':')
82   <|>
83     do dir <- option Forward $ Backward <$ char '<'
84        (AssertPositive dir <$ char '=') <|> (AssertNegative dir <$ char '!')
85   <|>
86     do n <- satisfy (\d -> d >= 48 && d <= 57)
87        return (\_ -> Subroutine (fromIntegral n - 48))
88   <|>
89     do _ <- satisfy (== 82) -- R
90        return  (\_ -> Subroutine 0)
91
92pSuffix :: Regex -> Parser Regex
93pSuffix re = option re $ do
94  w <- satisfy (\x -> x == 42 || x == 43 || x == 63 || x == 123)
95  (case w of
96    42  -> return $ MatchAlt (MatchSome re) MatchNull
97    43  -> return $ MatchSome re
98    63  -> return $ MatchAlt re MatchNull
99    123 -> do
100      let isDig x = x >= 48 && x < 58
101      minn <- option Nothing $ readMay . U.toString <$> A.takeWhile isDig
102      maxn <- option Nothing $ char ',' *>
103                       (readMay . U.toString <$> A.takeWhile isDig)
104      _ <- char '}'
105      case (minn, maxn) of
106          (Nothing, Nothing) -> mzero
107          (Just n, Nothing)  -> return $! atleast n re
108          (Nothing, Just n)  -> return $! atmost n re
109          (Just m, Just n)   -> return $! between m n re
110    _   -> fail "pSuffix encountered impossible byte") >>= pQuantifierModifier
111 where
112   atmost 0 _ = MatchNull
113   atmost n r = MatchAlt (mconcat (replicate n r)) (atmost (n-1) r)
114
115   between 0 n r = atmost n r
116   between m n r = mconcat (replicate m r) <> atmost (n - m) r
117
118   atleast n r = mconcat (replicate n r) <> MatchAlt (MatchSome r) MatchNull
119
120pQuantifierModifier :: Regex -> Parser Regex
121pQuantifierModifier re = option re $
122  (Possessive re <$ satisfy (== 43)) <|>
123  (Lazy re <$ satisfy (==63))
124
125pRegexChar :: Bool -> Parser Regex
126pRegexChar caseSensitive = do
127  w <- satisfy $ const True
128  case w of
129    46  -> return MatchAnyChar
130    37 -> (do -- dynamic %1 %2
131              ds <- A.takeWhile1 (\x -> x >= 48 && x <= 57)
132              case readMay (U.toString ds) of
133                Just !n -> return $ MatchDynamic n
134                Nothing -> fail "not a number")
135            <|> return (MatchChar (== '%'))
136    92  -> pRegexEscapedChar
137    36  -> return AssertEnd
138    94  -> return AssertBeginning
139    91  -> pRegexCharClass
140    _ | w < 128
141      , not (isSpecial w)
142         -> do let c = chr $ fromIntegral w
143               return $! MatchChar $
144                        if caseSensitive
145                           then (== c)
146                           else (\d -> toLower d == toLower c)
147      | w >= 0xc0 -> do
148          rest <- case w of
149                    _ | w >= 0xf0 -> A.take 3
150                      | w >= 0xe0 -> A.take 2
151                      | otherwise -> A.take 1
152          case U.uncons (B.cons w rest) of
153            Just (d, _) -> return $! MatchChar $
154                             if caseSensitive
155                                then (== d)
156                                else (\e -> toLower e == toLower d)
157            Nothing     -> fail "could not decode as UTF8"
158      | otherwise -> mzero
159
160pRegexEscapedChar :: Parser Regex
161pRegexEscapedChar = do
162  c <- anyChar
163  (case c of
164    'b' -> return AssertWordBoundary
165    '{' -> do -- captured pattern: \1 \2 \{12}
166              ds <- A.takeWhile1 (\x -> x >= 48 && x <= 57)
167              _ <- char '}'
168              case readMay (U.toString ds) of
169                Just !n -> return $ MatchCaptured $ n
170                Nothing -> fail "not a number"
171    'd' -> return $ MatchChar isDigit
172    'D' -> return $ MatchChar (not . isDigit)
173    's' -> return $ MatchChar isSpace
174    'S' -> return $ MatchChar (not . isSpace)
175    'w' -> return $ MatchChar isWordChar
176    'W' -> return $ MatchChar (not . isWordChar)
177    _ | c >= '0' && c <= '9' ->
178       return $! MatchCaptured (ord c - ord '0')
179      | otherwise -> mzero) <|> (MatchChar . (==) <$> pEscaped c)
180
181pEscaped :: Char -> Parser Char
182pEscaped c =
183  case c of
184    '\\' -> return c
185    'a' -> return '\a'
186    'f' -> return '\f'
187    'n' -> return '\n'
188    'r' -> return '\r'
189    't' -> return '\t'
190    'v' -> return '\v'
191    '0' -> do -- \0ooo matches octal ooo
192      ds <- A.take 3
193      case readMay ("'\\o" ++ U.toString ds ++ "'") of
194        Just x  -> return x
195        Nothing -> fail "invalid octal character escape"
196    _ | c >= '1' && c <= '7' -> do
197      -- \123 matches octal 123, \1 matches octal 1
198      let octalDigitScanner s w
199            | s < 3, w >= 48 && w <= 55
200                        = Just (s + 1) -- digits 0-7
201            | otherwise = Nothing
202      ds <- A.scan (1 :: Int) octalDigitScanner
203      case readMay ("'\\o" ++ [c] ++ U.toString ds ++ "'") of
204        Just x  -> return x
205        Nothing -> fail "invalid octal character escape"
206    'z' -> do -- \zhhhh matches unicode hex char hhhh
207      ds <- A.take 4
208      case readMay ("'\\x" ++ U.toString ds ++ "'") of
209        Just x  -> return x
210        Nothing -> fail "invalid hex character escape"
211    _ | c >= '1' && c <= '7' -> do -- \ooo octal undocument form but works
212         ds <- A.take 2
213         case readMay ("'\\o" ++ c : U.toString ds ++ "'") of
214           Just x  -> return x
215           Nothing -> fail "invalid octal character escape"
216      | otherwise -> return c
217
218pRegexCharClass :: Parser Regex
219pRegexCharClass = do
220  negated <- option False $ True <$ satisfy (== 94) -- '^'
221  let getEscapedClass = do
222        _ <- satisfy (== 92) -- backslash
223        (isDigit <$ char 'd')
224         <|> (not . isDigit <$ char 'D')
225         <|> (isSpace <$ char 's')
226         <|> (not . isSpace <$ char 'S')
227         <|> (isWordChar <$ char 'w')
228         <|> (not . isWordChar <$ char 'W')
229  let getPosixClass = do
230        _ <- string "[:"
231        localNegated <- option False $ True <$ satisfy (== 94) -- '^'
232        res <- (isAlphaNum <$ string "alnum")
233             <|> (isAlpha <$ string "alpha")
234             <|> (isAscii <$ string "ascii")
235             <|> ((\c -> isSpace c && c `notElem` ['\n','\r','\f','\v']) <$
236                   string "blank")
237             <|> (isControl <$ string "cntrl")
238             <|> ((\c -> isPrint c || isSpace c) <$ string "graph:")
239             <|> (isLower <$ string "lower")
240             <|> (isUpper <$ string "upper")
241             <|> (isPrint <$ string "print")
242             <|> (isPunctuation <$ string "punct")
243             <|> (isSpace <$ string "space")
244             <|> ((\c -> isAlphaNum c ||
245                         generalCategory c == ConnectorPunctuation)
246                   <$ string "word:")
247             <|> (isHexDigit <$ string "xdigit")
248        _ <- string ":]"
249        return $! if localNegated then not . res else res
250  let getC = (satisfy (== 92) *> anyChar >>= pEscaped) <|>
251       (chr . fromIntegral <$> satisfy (\x -> x /= 92 && x /= 93)) -- \ ]
252  let getCRange = do
253        c <- getC
254        (\d -> (\x -> x >= c && x <= d)) <$> (char '-' *> getC) <|>
255          return (== c)
256  brack <- option [] $ [(==']')] <$ char ']'
257  fs <- many (getEscapedClass <|> getPosixClass <|> getCRange)
258  _ <- satisfy (== 93) -- ]
259  let f c = any ($ c) $ brack ++ fs
260  return $! MatchChar (if negated then (not . f) else f)
261
262anyChar :: Parser Char
263anyChar = do
264  w <- satisfy (const True)
265  return $! chr $ fromIntegral w
266
267isSpecial :: Word8 -> Bool
268isSpecial 92 = True -- '\\'
269isSpecial 63 = True -- '?'
270isSpecial 42 = True -- '*'
271isSpecial 43 = True -- '+'
272-- isSpecial 123 = True -- '{'  -- this is okay except in suffixes
273isSpecial 91 = True -- '['
274isSpecial 93 = True -- ']'
275isSpecial 37 = True -- '%'
276isSpecial 40 = True -- '('
277isSpecial 41 = True -- ')'
278isSpecial 124 = True -- '|'
279isSpecial 46 = True -- '.'
280isSpecial 36 = True -- '$'
281isSpecial 94 = True -- '^'
282isSpecial _  = False
283
284