1{-# LANGUAGE MultiParamTypeClasses, FunctionalDependencies, FlexibleInstances, TypeSynonymInstances #-}
2-----------------------------------------------------------------------------
3-- |
4-- Module      :  Text.Regex.Base.RegexLike
5-- Copyright   :  (c) Chris Kuklewicz 2006
6-- SPDX-License-Identifier: BSD-3-Clause
7--
8-- Maintainer  :  hvr@gnu.org
9-- Stability   :  experimental
10-- Portability :  non-portable (MPTC+FD)
11--
12-- Classes and instances for Regex matching.
13--
14-- All the classes are declared here, and some common type aliases, and
15-- the MatchResult data type.
16--
-- The only instances here are the 'Extract' instances for 'String',
-- 'SB.ByteString', 'LB.ByteString', 'S.Seq', 'ST.Text', and 'LT.Text'.
-- There are no data values.  The 'RegexContext'
-- instances are in "Text.Regex.Base.Context", except for ones which
-- run afoul of a repeated variable ('RegexContext' regex a a), which
-- are defined in each backend's String and ByteString modules.
22-----------------------------------------------------------------------------
23
24module Text.Regex.Base.RegexLike (
25  -- ** Type aliases
26  MatchOffset,
27  MatchLength,
28  MatchArray,
29  MatchText,
30  -- ** Data types
31  MatchResult(..),
32  -- ** Classes
33  RegexOptions(..),
34  RegexMaker(..),
35  RegexLike(..),
36  RegexContext(..),
37  Extract(..),
38  AllSubmatches(..),AllTextSubmatches(..),AllMatches(..),AllTextMatches(..)
39  ) where
40
41import Prelude hiding (fail)
42import Control.Monad.Fail as Fail (MonadFail)
43
44import Data.Array(Array,(!))
45import Data.Maybe(isJust)
46import qualified Data.ByteString as SB (take,drop,empty,ByteString)
47import qualified Data.ByteString.Lazy as LB (take,drop,empty,ByteString)
48import qualified Data.Sequence as S(take,drop,empty,Seq)
49import qualified Data.Text as ST (take,drop,empty,Text)
50import qualified Data.Text.Lazy as LT (take,drop,empty,Text)
51
-- | Position of a match within the source, counted from 0.  The value
-- (-1) marks an unused entry.
type MatchOffset = Int

-- | Length of a match; never negative.
type MatchLength = Int

-- | Array indexed from 0 whose 0th element describes the full match.
-- When the full match location is not available it is represented as
-- (0,0).
type MatchArray = Array Int (MatchOffset, MatchLength)

-- | Like 'MatchArray', but every element additionally carries a value
-- of the @source@ type next to its (offset,length) pair.
type MatchText source = Array Int (source, (MatchOffset, MatchLength))
60
61-- | This is the same as the type from JRegex.
data MatchResult a = MR
  { -- NOTE(review): the field meanings are not established in this
    -- module (nothing here builds or inspects an 'MR').  The names
    -- suggest the text before, of, and after a match, plus the
    -- submatches as both a list and an array -- confirm against the
    -- 'RegexContext' instances that produce this type.
    mrBefore  :: a
  , mrMatch   :: a
  , mrAfter   :: a
  , mrSubList :: [a]
  , mrSubs    :: Array Int a
  }
69
70----------------
-- | Rather than carry them around separately, the options for how to
-- execute a regex are kept as part of the regex.  There are two types
-- of options.  Those that can only be specified at compilation time
-- and never changed are CompOpt.  Those that can be changed later and
-- affect how matching is performed are ExecOpt.  The actual types
-- for these depend on the backend.
class RegexOptions regex compOpt execOpt
  | regex -> compOpt execOpt
  , compOpt -> regex execOpt
  , execOpt -> regex compOpt
  where
  -- | Compile-time options with nothing set at all in the backend.
  blankCompOpt :: compOpt
  -- | Execution-time options with nothing set at all in the backend.
  blankExecOpt :: execOpt
  -- | Reasonable compile-time options (extended,caseSensitive,multiline regex).
  defaultCompOpt :: compOpt
  -- | Reasonable execution-time options (extended,caseSensitive,multiline regex).
  defaultExecOpt :: execOpt
  -- | Forget the flags stored in the regex and use the new ones instead.
  setExecOpts :: execOpt -> regex -> regex
  -- | Retrieve the flags currently stored in the regex.
  getExecOpts :: regex -> execOpt
87
88----------------
-- | RegexMaker captures the creation of the compiled regular
-- expression from a source type and an option type.  'makeRegexM' and
-- 'makeRegexOptsM' report parse errors using 'MonadFail' (for example
-- in the 'Maybe' monad).
--
-- The 'makeRegex' function has a default implementation that depends
-- on 'makeRegexOpts' and uses 'defaultCompOpt' and 'defaultExecOpt'.
-- Similarly for 'makeRegexM' and 'makeRegexOptsM'.
--
-- There are also default implementations for 'makeRegexOpts' and
-- 'makeRegexOptsM' in terms of each other.  So a minimal instance
-- definition needs to only define one of these, hopefully
-- 'makeRegexOptsM'.
class (RegexOptions regex compOpt execOpt) => RegexMaker regex compOpt execOpt source
  | regex -> compOpt execOpt, compOpt -> regex execOpt, execOpt -> regex compOpt where
  -- 'makeRegexOpts' and 'makeRegexOptsM' default to each other, so an
  -- instance that defines neither would loop forever when used.  The
  -- pragma makes GHC warn about such an instance at compile time.
  {-# MINIMAL makeRegexOpts | makeRegexOptsM #-}

  -- | Build a regex from @source@ using 'defaultCompOpt' and
  -- 'defaultExecOpt'.
  makeRegex :: source -> regex
  -- | Build a regex from @source@ with caller-supplied compile and
  -- execution options.
  makeRegexOpts :: compOpt -> execOpt -> source -> regex
  -- | Like 'makeRegex' (default options), but errors are reported with
  -- 'fail' in the caller's 'MonadFail' monad.
  makeRegexM :: (MonadFail m) => source -> m regex
  -- | Like 'makeRegexOpts' (caller-supplied options), but errors are
  -- reported with 'fail' in the caller's 'MonadFail' monad.
  makeRegexOptsM :: (MonadFail m) => compOpt -> execOpt -> source -> m regex

  makeRegex = makeRegexOpts defaultCompOpt defaultExecOpt
  makeRegexM = makeRegexOptsM defaultCompOpt defaultExecOpt
  -- Run the monadic builder in 'Maybe'; a failed build becomes a call
  -- to 'error' because this variant has no way to report it.
  makeRegexOpts c e s =
    case makeRegexOptsM c e s of
      Just regex -> regex
      Nothing    -> error "makeRegexOpts failed"
  -- The pure builder cannot signal failure, so its result is simply
  -- lifted into the monad.
  makeRegexOptsM c e s = return (makeRegexOpts c e s)
117
118----------------
119-- | RegexLike is parametrized on a regular expression type and a
120-- source type to run the matching on.
121--
122-- There are default implementations: matchTest and matchOnceText use
123-- matchOnce; matchCount and matchAllText use matchAll. matchOnce uses
124-- matchOnceText and matchAll uses matchAllText. So a minimal complete
-- instance needs to provide at least (matchOnce or matchOnceText) and
126-- (matchAll or matchAllText).  Additional definitions are often
127-- provided where they will increase efficiency.
128--
129-- > [ c | let notVowel = makeRegex "[^aeiou]" :: Regex, c <- ['a'..'z'], matchTest notVowel [c]  ]
130-- >
131-- > "bcdfghjklmnpqrstvwxyz"
132--
133-- The strictness of these functions is instance dependent.
class (Extract source) => RegexLike regex source where
  -- The defaults below are mutually recursive in pairs ('matchOnce'
  -- with 'matchOnceText', 'matchAll' with 'matchAllText').  An instance
  -- that defines neither member of a pair would loop forever, so the
  -- pragma makes GHC warn about such an instance at compile time.
  {-# MINIMAL (matchOnce | matchOnceText), (matchAll | matchAllText) #-}

  -- | This returns the first match in the source (it checks the whole
  -- source, not just at the start).  The result is an array of
  -- (offset,length) index pairs for the match and captured
  -- substrings.  The offset is 0-based.  A (-1) for an offset means a
  -- failure to match.  The lower bound of the array is 0, and the 0th
  -- element is the (offset,length) for the whole match.
  matchOnce :: regex -> source -> Maybe MatchArray
  -- | matchAll returns a list of matches.  The matches are in order
  -- and do not overlap.  If any match succeeds but has 0 length then
  -- this will be the last match in the list.
  matchAll :: regex -> source -> [MatchArray]
  -- | matchCount returns the number of non-overlapping matches
  -- returned by 'matchAll'.
  matchCount :: regex -> source -> Int
  -- | matchTest returns True if there is a match somewhere in the
  -- source (it checks the whole source, not just at the start).
  matchTest :: regex -> source -> Bool
  -- | This is 'matchAll' with the actual subsections of the source
  -- instead of just the (offset,length) information.
  matchAllText :: regex -> source -> [MatchText source]
  -- | This can return a tuple of three items: the source before the
  -- match, an array of the match and captured substrings (with their
  -- indices), and the source after the match.
  matchOnceText :: regex -> source -> Maybe (source, MatchText source, source)

  -- Drop the extracted text from every element, keeping (offset,length).
  matchAll regex source = map (fmap snd) (matchAllText regex source)
  matchOnce regex source = fmap toOffsets (matchOnceText regex source)
    where
      toOffsets (_, mt, _) = fmap snd mt
  matchTest regex source = isJust (matchOnce regex source)
  matchCount regex source = length (matchAll regex source)
  -- Element 0 of the match array is the whole match; its offset and
  -- length determine where the "before" and "after" pieces are cut.
  matchOnceText regex source = fmap slice (matchOnce regex source)
    where
      slice ma =
        let (o, l) = ma ! 0
        in ( before o source
           , fmap (\ol -> (extract ol source, ol)) ma
           , after (o + l) source
           )
  -- Pair every (offset,length) with the text it selects from the source.
  matchAllText regex source =
    map (fmap (\ol -> (extract ol source, ol)))
        (matchAll regex source)
173
174----------------
-- | RegexContext is the polymorphic interface to do matching.  Since
-- 'target' is polymorphic you may need to supply the type explicitly
-- in contexts where it cannot be inferred.
--
-- The monadic 'matchM' version uses 'fail' to report when the 'regex'
-- has no match in 'source'.  Two examples:
--
-- Here the context 'Bool' is inferred:
183--
184-- > [ c | let notVowel = makeRegex "[^aeiou]" :: Regex, c <- ['a'..'z'], match notVowel [c]  ]
185-- >
186-- > "bcdfghjklmnpqrstvwxyz"
187--
188-- Here the context '[String]' must be supplied:
189--
190-- > let notVowel = (makeRegex "[^aeiou]" :: Regex )
191-- > in do { c <- ['a'..'z'] ; matchM notVowel [c] } :: [String]
192-- >
193-- > ["b","c","d","f","g","h","j","k","l","m","n","p","q","r","s","t","v","w","x","y","z"]
class RegexLike regex source => RegexContext regex source target where
  -- | Match @regex@ against @source@ and deliver the outcome as
  -- whatever @target@ type the caller asks for.
  match :: regex -> source -> target
  -- | Monadic variant of 'match' that runs in any 'MonadFail' monad.
  matchM :: MonadFail m => regex -> source -> m target
197
198----------------
-- | Extract allows for indexing operations on String, ByteString, Seq, or Text.
class Extract source where
  -- | Keep a prefix of the source; this is a renamed "take".
  before :: Int -> source -> source
  -- | Discard a prefix of the source; this is a renamed "drop".
  after :: Int -> source -> source
  -- | An empty value of the source type, for when there is no match.
  empty :: source
  -- | Pull out the piece described by an (offset,length) pair.  The
  -- default implementation first applies 'after' with the offset and
  -- then 'before' with the length.
  extract :: (Int, Int) -> source -> source
  extract (off, len) source = before len remainder
    where
      remainder = after off source
212
-- | 'String' is sliced with the list functions 'take' and 'drop'.
instance Extract String where
  before = take
  after  = drop
  empty  = []
215
-- | Strict 'SB.ByteString' is sliced with 'SB.take' and 'SB.drop'.
instance Extract SB.ByteString where
  before = SB.take
  after  = SB.drop
  empty  = SB.empty
218
-- | Lazy 'LB.ByteString' is sliced with 'LB.take' and 'LB.drop'; the
-- 'Int' count is converted with 'toEnum' before being handed to them.
instance Extract LB.ByteString where
  before n = LB.take (toEnum n)
  after n  = LB.drop (toEnum n)
  empty    = LB.empty
221
-- | 'S.Seq' is sliced with 'S.take' and 'S.drop'.
instance Extract (S.Seq a) where
  before = S.take
  after  = S.drop
  empty  = S.empty
224
-- | Strict 'ST.Text' is sliced with 'ST.take' and 'ST.drop'.
--
-- @since 0.94.0.0
instance Extract ST.Text where
  before = ST.take
  after  = ST.drop
  empty  = ST.empty
228
-- | Lazy 'LT.Text' is sliced with 'LT.take' and 'LT.drop'; the 'Int'
-- count is converted with 'toEnum' before being handed to them.
--
-- @since 0.94.0.0
instance Extract LT.Text where
  before n = LT.take (toEnum n)
  after n  = LT.drop (toEnum n)
  empty    = LT.empty
232
-- | Wrapper around an @f b@ value, used in results of RegexContext
-- instances; unwrap it with 'getAllSubmatches'.
newtype AllSubmatches f b = AllSubmatches
  { getAllSubmatches :: f b
  }
-- | Wrapper around an @f b@ value, used in results of RegexContext
-- instances; unwrap it with 'getAllTextSubmatches'.
newtype AllTextSubmatches f b = AllTextSubmatches
  { getAllTextSubmatches :: f b
  }
-- | Wrapper around an @f b@ value, used in results of RegexContext
-- instances; unwrap it with 'getAllMatches'.
newtype AllMatches f b = AllMatches
  { getAllMatches :: f b
  }
-- | Wrapper around an @f b@ value, used in results of RegexContext
-- instances; unwrap it with 'getAllTextMatches'.
newtype AllTextMatches f b = AllTextMatches
  { getAllTextMatches :: f b
  }
241