1{-# LANGUAGE MultiParamTypeClasses, FunctionalDependencies, FlexibleInstances, TypeSynonymInstances #-}
2-----------------------------------------------------------------------------
3-- |
4-- Module      :  Text.Regex.Base.RegexLike
5-- Copyright   :  (c) Chris Kuklewicz 2006
6-- SPDX-License-Identifier: BSD-3-Clause
7--
8-- Maintainer  :  hvr@gnu.org
9-- Stability   :  experimental
10-- Portability :  non-portable (MPTC+FD)
11--
12-- Classes and instances for Regex matching.
13--
14-- All the /classes/ are declared here, and some common type aliases, and
15-- the 'MatchResult' data type.
16--
-- The only /instances/ here are the 'Extract' instances for 'String',
-- strict and lazy @ByteString@, @Seq@, and strict and lazy @Text@.
-- There are no data values.  The 'RegexContext'
-- instances are in "Text.Regex.Base.Context", except for ones which
-- run afoul of a repeated variable (@'RegexContext' regex a a@), which
-- are defined in each backend's String and ByteString modules.
22-----------------------------------------------------------------------------
23
24module Text.Regex.Base.RegexLike (
25  -- ** Type aliases
26  MatchOffset,
27  MatchLength,
28  MatchArray,
29  MatchText,
30  -- ** Data types
31  MatchResult(..),
32  -- ** Classes
33  RegexOptions(..),
34  RegexMaker(..),
35  RegexLike(..),
36  RegexContext(..),
37  Extract(..),
38  AllSubmatches(..),AllTextSubmatches(..),AllMatches(..),AllTextMatches(..)
39  ) where
40
41import Prelude hiding (fail)
42import Control.Monad.Fail as Fail (MonadFail)
43
44import Data.Array(Array,(!))
45import Data.Maybe(isJust)
46import qualified Data.ByteString as SB (take,drop,empty,ByteString)
47import qualified Data.ByteString.Lazy as LB (take,drop,empty,ByteString)
48import qualified Data.Sequence as S(take,drop,empty,Seq)
49import qualified Data.Text as ST (take,drop,empty,Text)
50import qualified Data.Text.Lazy as LT (take,drop,empty,Text)
51
-- | Position within the source, counted from 0.  The value (-1) marks
-- an unused entry.
type MatchOffset = Int

-- | Length of a match; never negative.
type MatchLength = Int

-- | Array of (offset, length) pairs, indexed from 0.  Index 0 describes
-- the full match; when the location of the full match is not available
-- it is represented as (0,0).
type MatchArray = Array Int (MatchOffset, MatchLength)

-- | Like 'MatchArray', but every entry additionally carries a value of
-- the @source@ type next to its (offset, length) pair.
type MatchText source = Array Int (source, (MatchOffset, MatchLength))
62
-- | Record describing one match.  This is the same as the type from
-- JRegex.
--
-- NOTE(review): the per-field descriptions below are inferred from the
-- field names only; confirm against the code that builds 'MR' values.
data MatchResult a = MR
  { mrBefore  :: a            -- ^ presumably the source preceding the match
  , mrMatch   :: a            -- ^ presumably the matched portion itself
  , mrAfter   :: a            -- ^ presumably the source following the match
  , mrSubList :: [a]          -- ^ presumably the submatches, as a list
  , mrSubs    :: Array Int a  -- ^ presumably the submatches, as an array
  }
71
72
-- | Rather than carry them around separately, the options for how to
-- execute a regex are kept as part of the regex.  There are two kinds
-- of options: @compOpt@ holds those that can only be specified at
-- compilation time and never changed afterwards, while @execOpt@ holds
-- those that can be changed later and affect how matching is
-- performed.  The actual types for these depend on the backend.
--
-- The functional dependencies make each of the three types determine
-- the other two.
class RegexOptions regex compOpt execOpt
  | regex -> compOpt execOpt, compOpt -> regex execOpt, execOpt -> regex compOpt where

  -- | Compile-time options with nothing set at all in the backend.
  blankCompOpt :: compOpt

  -- | Execution options with nothing set at all in the backend.
  blankExecOpt :: execOpt

  -- | Reasonable options (extended, caseSensitive, multiline regex).
  defaultCompOpt :: compOpt

  -- | Reasonable options (extended, caseSensitive, multiline regex).
  defaultExecOpt :: execOpt

  -- | Forget the old execution flags of the regex and use the new ones.
  setExecOpts :: execOpt -> regex -> regex

  -- | Retrieve the execution flags currently stored in the regex.
  getExecOpts :: regex -> execOpt
103
104
-- | @RegexMaker@ captures the creation of the compiled regular
-- expression from a source type and an option type.  Methods 'makeRegexM'
-- and 'makeRegexOptsM' report parse errors with 'fail', through the
-- 'MonadFail' constraint on the caller's monad (for example 'Maybe').
--
-- The 'makeRegex' function has a default implementation that depends
-- on 'makeRegexOpts' and uses 'defaultCompOpt' and 'defaultExecOpt'.
-- Similarly for 'makeRegexM' and 'makeRegexOptsM'.
--
-- There are also default implementations for 'makeRegexOpts' and
-- 'makeRegexOptsM' in terms of each other.  Because those two defaults
-- are mutually recursive, an instance that defines neither would loop
-- forever; a minimal instance definition must define at least one of
-- them, preferably 'makeRegexOptsM'.  The @MINIMAL@ pragma below lets
-- the compiler warn when an instance defines neither.
--
class (RegexOptions regex compOpt execOpt) => RegexMaker regex compOpt execOpt source
  | regex -> compOpt execOpt, compOpt -> regex execOpt, execOpt -> regex compOpt where

  {-# MINIMAL makeRegexOpts | makeRegexOptsM #-}

  -- | Use the 'defaultCompOpt' and 'defaultExecOpt'.
  makeRegex :: source -> regex

  -- | Specify your own options.
  makeRegexOpts :: compOpt -> execOpt -> source -> regex

  -- | Use the 'defaultCompOpt' and 'defaultExecOpt', reporting errors with 'fail'.
  makeRegexM :: (MonadFail m) => source -> m regex

  -- | Specify your own options, reporting errors with 'fail'.
  makeRegexOptsM :: (MonadFail m) => compOpt -> execOpt -> source -> m regex

  makeRegex = makeRegexOpts defaultCompOpt defaultExecOpt
  makeRegexM = makeRegexOptsM defaultCompOpt defaultExecOpt

  -- The monadic variant is run in 'Maybe' here, so whatever message it
  -- passed to 'fail' is discarded and replaced by the fixed error below.
  makeRegexOpts c e s =
    case makeRegexOptsM c e s of
      Just regex -> regex
      Nothing    -> error "makeRegexOpts failed"

  -- The pure variant cannot signal failure through the monad, so this
  -- default never calls 'fail'.
  makeRegexOptsM c e s = return (makeRegexOpts c e s)
138
139
-- | RegexLike is parametrized on a regular expression type and a
-- source type to run the matching on.
--
-- There are default implementations: 'matchTest' and 'matchOnceText' use
-- 'matchOnce'; 'matchCount' and 'matchAllText' use 'matchAll'.
-- Conversely, 'matchOnce' uses 'matchOnceText' and 'matchAll' uses
-- 'matchAllText'.  Because these defaults are mutually recursive, an
-- instance that defines neither member of a pair would loop forever.  So a
-- minimal complete instance needs to provide at least ('matchOnce' or
-- 'matchOnceText') and ('matchAll' or 'matchAllText'); the @MINIMAL@
-- pragma below lets the compiler warn when that is not the case.
-- Additional definitions are often provided where they will increase
-- efficiency.
--
-- > [ c | let notVowel = makeRegex "[^aeiou]" :: Regex, c <- ['a'..'z'], matchTest notVowel [c]  ]
-- >
-- > "bcdfghjklmnpqrstvwxyz"
--
-- The strictness of these functions is instance dependent.
--
class (Extract source) => RegexLike regex source where

  {-# MINIMAL (matchOnce | matchOnceText), (matchAll | matchAllText) #-}

  -- | This returns the first match in the source (it checks the whole
  -- source, not just at the start). This returns an array of
  -- (offset,length) index pairs for the match and captured
  -- substrings.  The offset is 0-based.  A (-1) for an offset means a
  -- failure to match.  The lower bound of the array is 0, and the 0th
  -- element is the (offset,length) for the whole match.
  matchOnce  :: regex -> source -> Maybe MatchArray

  -- | @matchAll@ returns a list of matches.  The matches are in order
  -- and do not overlap. If any match succeeds but has 0 length then
  -- this will be the last match in the list.
  matchAll   :: regex -> source -> [MatchArray]

  -- | @matchCount@ returns the number of non-overlapping matches
  -- returned by @matchAll@.
  matchCount :: regex -> source -> Int

  -- | @matchTest@ returns @True@ if there is a match somewhere in the
  -- source (it checks the whole source not just at the start).
  matchTest  :: regex -> source -> Bool

  -- | This is @matchAll@ with the actual subsections of the source
  -- instead of just the (offset,length) information.
  matchAllText  :: regex -> source -> [MatchText source]

  -- | This can return a tuple of three items: the source before the
  -- match, an array of the match and captured substrings (with their
  -- indices), and the source after the match.
  matchOnceText :: regex -> source -> Maybe (source, MatchText source, source)

  -- Drop the extracted text from every entry, keeping (offset,length).
  matchAll regex source = map (fmap snd) (matchAllText regex source)
  matchOnce regex source = fmap (\(_,mt,_) -> fmap snd mt) (matchOnceText regex source)

  matchTest regex source = isJust (matchOnce regex source)
  matchCount regex source = length (matchAll regex source)

  -- Entry 0 of the match array is the whole match: everything before its
  -- offset is the prefix, everything from offset+length on is the suffix.
  matchOnceText regex source =
    fmap (\ma -> let (o,l) = ma ! 0
                 in (before o source
                    ,fmap (\ol -> (extract ol source,ol)) ma
                    ,after (o+l) source))
         (matchOnce regex source)

  -- Pair every (offset,length) with the piece of source it denotes.
  matchAllText regex source =
    map (fmap (\ol -> (extract ol source,ol)))
        (matchAll regex source)
202
203
-- | @RegexContext@ is the polymorphic interface to do matching.  Since
-- @target@ is polymorphic you may need to supply the type explicitly
-- in contexts where it cannot be inferred.
--
-- The monadic 'matchM' version uses 'fail' to report when the @regex@
-- has no match in @source@.  Two examples:
--
-- Here the context 'Bool' is inferred:
--
-- > [ c | let notVowel = makeRegex "[^aeiou]" :: Regex, c <- ['a'..'z'], match notVowel [c]  ]
-- >
-- > "bcdfghjklmnpqrstvwxyz"
--
-- Here the context @[String]@ must be supplied:
--
-- > let notVowel = (makeRegex "[^aeiou]" :: Regex )
-- > in do { c <- ['a'..'z'] ; matchM notVowel [c] } :: [String]
-- >
-- > ["b","c","d","f","g","h","j","k","l","m","n","p","q","r","s","t","v","w","x","y","z"]
--
class (RegexLike regex source) => RegexContext regex source target where

  -- | Match the regex against the source and produce a result of the
  -- requested @target@ type.
  match  :: regex -> source -> target

  -- | Monadic variant of 'match'; a regex with no match in the source
  -- is reported with 'fail'.
  matchM :: (MonadFail m) => regex -> source -> m target
227
228
-- | @Extract@ supplies the indexing operations needed on a source type
-- such as 'String' or @ByteString@: keep a prefix, drop a prefix, and
-- cut out a span.
--
class Extract source where

  -- | @before@ is a renamed 'take'.
  before :: Int -> source -> source

  -- | @after@ is a renamed 'drop'.
  after :: Int -> source -> source

  -- | An empty data value, which can be constructed when there is no match.
  empty :: source

  -- | @extract@ takes an (offset, length) pair and a source.  Its
  -- default implementation first drops @off@ with 'after' and then keeps
  -- @len@ with 'before':
  --
  -- @
  --   extract (off, len) source = before len (after off source)
  -- @
  extract :: (Int, Int) -> source -> source
  extract (off, len) source = before len remainder
    where
      remainder = after off source
249
-- | 'String' sources use the list functions from the Prelude; 'extract'
-- keeps its class default.
instance Extract String where
  before = take
  after  = drop
  empty  = ""
252
-- | Strict @ByteString@ sources; 'extract' keeps its class default.
instance Extract SB.ByteString where
  before = SB.take
  after  = SB.drop
  empty  = SB.empty
255
-- | Lazy @ByteString@ sources; 'extract' keeps its class default.
instance Extract LB.ByteString where
  -- The class works with 'Int' counts, so the count is converted to the
  -- argument type of 'LB.take' / 'LB.drop' first.
  before count = LB.take (fromIntegral count)
  after count  = LB.drop (fromIntegral count)
  empty        = LB.empty
258
-- | @Seq@ sources with any element type; 'extract' keeps its class
-- default.
instance Extract (S.Seq a) where
  before = S.take
  after  = S.drop
  empty  = S.empty
261
-- | Strict @Text@ sources; 'extract' keeps its class default.
--
-- @since 0.94.0.0
instance Extract ST.Text where
  before = ST.take
  after  = ST.drop
  empty  = ST.empty
265
-- | Lazy @Text@ sources; 'extract' keeps its class default.
--
-- @since 0.94.0.0
instance Extract LT.Text where
  -- The class works with 'Int' counts, so the count is converted to the
  -- argument type of 'LT.take' / 'LT.drop' first.
  before count = LT.take (fromIntegral count)
  after count  = LT.drop (fromIntegral count)
  empty        = LT.empty
269
-- | Newtype wrapper around @f b@, used in results of 'RegexContext'
-- instances.  Unwrap it with 'getAllSubmatches'.
newtype AllSubmatches f b = AllSubmatches
  { getAllSubmatches :: f b
  }
272
-- | Newtype wrapper around @f b@, used in results of 'RegexContext'
-- instances.  Unwrap it with 'getAllTextSubmatches'.
newtype AllTextSubmatches f b = AllTextSubmatches
  { getAllTextSubmatches :: f b
  }
275
-- | Newtype wrapper around @f b@, used in results of 'RegexContext'
-- instances.  Unwrap it with 'getAllMatches'.
newtype AllMatches f b = AllMatches
  { getAllMatches :: f b
  }
278
-- | Newtype wrapper around @f b@, used in results of 'RegexContext'
-- instances.  Unwrap it with 'getAllTextMatches'.
newtype AllTextMatches f b = AllTextMatches
  { getAllTextMatches :: f b
  }
281