{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE FunctionalDependencies #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE TypeSynonymInstances #-}
-----------------------------------------------------------------------------
-- |
-- Module      :  Text.Regex.Base.RegexLike
-- Copyright   :  (c) Chris Kuklewicz 2006
-- SPDX-License-Identifier: BSD-3-Clause
--
-- Maintainer  :  hvr@gnu.org
-- Stability   :  experimental
-- Portability :  non-portable (MPTC+FD)
--
-- Classes and instances for Regex matching.
--
-- All the classes are declared here, and some common type aliases, and
-- the MatchResult data type.
--
-- The only instances here are the 'Extract' instances for 'String',
-- strict and lazy @ByteString@, @Seq@, and strict and lazy @Text@.
-- There are no data values.  The 'RegexContext' instances are in
-- "Text.Regex.Base.Context", except for ones which run afoul of a
-- repeated variable ('RegexContext' regex a a), which are defined in
-- each module's String and ByteString modules.
-----------------------------------------------------------------------------

module Text.Regex.Base.RegexLike
  ( -- ** Type aliases
    MatchOffset
  , MatchLength
  , MatchArray
  , MatchText
    -- ** Data types
  , MatchResult(..)
    -- ** Classes
  , RegexOptions(..)
  , RegexMaker(..)
  , RegexLike(..)
  , RegexContext(..)
  , Extract(..)
  , AllSubmatches(..)
  , AllTextSubmatches(..)
  , AllMatches(..)
  , AllTextMatches(..)
  ) where

import Prelude hiding (fail)
import Control.Monad.Fail as Fail (MonadFail)

import Data.Array (Array, (!))
import Data.Maybe (isJust)
import qualified Data.ByteString as SB (take, drop, empty, ByteString)
import qualified Data.ByteString.Lazy as LB (take, drop, empty, ByteString)
import qualified Data.Sequence as S (take, drop, empty, Seq)
import qualified Data.Text as ST (take, drop, empty, Text)
import qualified Data.Text.Lazy as LT (take, drop, empty, Text)

-- | 0 based index from start of source, or (-1) for unused
type MatchOffset = Int

-- | non-negative length of a match
type MatchLength = Int

-- | 0 based array, with 0th index indicating the full match.  If the
-- full match location is not available, represent as (0,0).
type MatchArray = Array Int (MatchOffset, MatchLength)

-- | Same indexing as 'MatchArray', but every entry pairs a value of the
-- source type with its (offset,length).
type MatchText source = Array Int (source, (MatchOffset, MatchLength))

-- | This is the same as the type from JRegex.
data MatchResult a = MR
  { mrBefore  :: a
  , mrMatch   :: a
  , mrAfter   :: a
  , mrSubList :: [a]
  , mrSubs    :: Array Int a
  }

----------------
-- | Rather than carry them around separately, the options for how to
-- execute a regex are kept as part of the regex.  There are two types
-- of options.  Those that can only be specified at compilation time
-- and never changed are CompOpt.  Those that can be changed later and
-- affect how matching is performed are ExecOpt.  The actual types
-- for these depend on the backend.
class RegexOptions regex compOpt execOpt
  | regex -> compOpt execOpt
  , compOpt -> regex execOpt
  , execOpt -> regex compOpt
  where
  -- | no options set at all in the backend
  blankCompOpt :: compOpt
  -- | no options set at all in the backend
  blankExecOpt :: execOpt
  -- | reasonable options (extended,caseSensitive,multiline regex)
  defaultCompOpt :: compOpt
  -- | reasonable options (extended,caseSensitive,multiline regex)
  defaultExecOpt :: execOpt
  -- | forget old flags and use new ones
  setExecOpts :: execOpt -> regex -> regex
  -- | retrieve the current flags
  getExecOpts :: regex -> execOpt

----------------
-- | RegexMaker captures the creation of the compiled regular
-- expression from a source type and an option type.  'makeRegexM' and
-- 'makeRegexOptsM' report parse errors with 'fail', in any 'MonadFail'
-- monad.
--
-- The 'makeRegex' function has a default implementation that depends
-- on 'makeRegexOpts' and uses 'defaultCompOpt' and 'defaultExecOpt'.
-- Similarly for 'makeRegexM' and 'makeRegexOptsM'.
--
-- There are also default implementations for 'makeRegexOpts' and
-- 'makeRegexOptsM' in terms of each other.  So a minimal instance
-- definition needs to define only one of these, hopefully
-- 'makeRegexOptsM'.
class (RegexOptions regex compOpt execOpt) => RegexMaker regex compOpt execOpt source
  | regex -> compOpt execOpt
  , compOpt -> regex execOpt
  , execOpt -> regex compOpt
  where
  -- The defaults of 'makeRegexOpts' and 'makeRegexOptsM' are written in
  -- terms of each other, so an instance that defines neither would loop.
  -- The pragma makes the compiler warn about such an instance.
  {-# MINIMAL makeRegexOpts | makeRegexOptsM #-}

  -- | make using the defaultCompOpt and defaultExecOpt
  makeRegex :: source -> regex
  -- | Specify your own options
  makeRegexOpts :: compOpt -> execOpt -> source -> regex
  -- | make using the defaultCompOpt and defaultExecOpt, reporting errors with fail
  makeRegexM :: (MonadFail m) => source -> m regex
  -- | Specify your own options, reporting errors with fail
  makeRegexOptsM :: (MonadFail m) => compOpt -> execOpt -> source -> m regex

  makeRegex = makeRegexOpts defaultCompOpt defaultExecOpt
  makeRegexM = makeRegexOptsM defaultCompOpt defaultExecOpt
  -- Runs the monadic variant in 'Maybe'; a failed compilation becomes a
  -- call to 'error'.
  makeRegexOpts comp exec src =
    case makeRegexOptsM comp exec src of
      Just regex -> regex
      Nothing    -> error "makeRegexOpts failed"
  makeRegexOptsM comp exec src = return (makeRegexOpts comp exec src)

----------------
-- | RegexLike is parametrized on a regular expression type and a
-- source type to run the matching on.
--
-- There are default implementations: matchTest and matchOnceText use
-- matchOnce; matchCount and matchAllText use matchAll. matchOnce uses
-- matchOnceText and matchAll uses matchAllText. So a minimal complete
-- instance needs to provide at least (matchOnce or matchOnceText) and
-- (matchAll or matchAllText).  Additional definitions are often
-- provided where they will increase efficiency.
--
-- > [ c | let notVowel = makeRegex "[^aeiou]" :: Regex, c <- ['a'..'z'], matchTest notVowel [c] ]
-- >
-- > "bcdfghjklmnpqrstvwxyz"
--
-- The strictness of these functions is instance dependent.
class (Extract source) => RegexLike regex source where
  -- Each pair below has defaults defined in terms of the other member,
  -- so at least one member of each pair must be supplied by an instance.
  {-# MINIMAL (matchOnce | matchOnceText), (matchAll | matchAllText) #-}

  -- | This returns the first match in the source (it checks the whole
  -- source, not just at the start). This returns an array of
  -- (offset,length) index pairs for the match and captured
  -- substrings.  The offset is 0-based.  A (-1) for an offset means a
  -- failure to match.  The lower bound of the array is 0, and the 0th
  -- element is the (offset,length) for the whole match.
  matchOnce :: regex -> source -> Maybe MatchArray
  -- | matchAll returns a list of matches.  The matches are in order
  -- and do not overlap. If any match succeeds but has 0 length then
  -- this will be the last match in the list.
  matchAll :: regex -> source -> [MatchArray]
  -- | matchCount returns the number of non-overlapping matches
  -- returned by matchAll.
  matchCount :: regex -> source -> Int
  -- | matchTest returns True if there is a match somewhere in the
  -- source (it checks the whole source not just at the start).
  matchTest :: regex -> source -> Bool
  -- | This is matchAll with the actual subsections of the source
  -- instead of just the (offset,length) information.
  matchAllText :: regex -> source -> [MatchText source]
  -- | This can return a tuple of three items: the source before the
  -- match, an array of the match and captured substrings (with their
  -- indices), and the source after the match.
  matchOnceText :: regex -> source -> Maybe (source, MatchText source, source)

  -- Drop the extracted text and keep only the (offset,length) pairs.
  matchAll regex source = map (fmap snd) (matchAllText regex source)
  matchOnce regex source = fmap offsetsOnly (matchOnceText regex source)
    where
      offsetsOnly (_, matchText, _) = fmap snd matchText
  matchTest regex source = isJust (matchOnce regex source)
  matchCount regex source = length (matchAll regex source)
  -- Entry 0 of the match array is the whole match: the text before it is
  -- the prefix of that offset, the text after it starts at offset+length.
  matchOnceText regex source = fmap withText (matchOnce regex source)
    where
      withText matchArray =
        let (off, len) = matchArray ! 0
        in ( before off source
           , fmap (\ol -> (extract ol source, ol)) matchArray
           , after (off + len) source
           )
  matchAllText regex source = map (fmap attachText) (matchAll regex source)
    where
      attachText ol = (extract ol source, ol)

----------------
-- | RegexContext is the polymorphic interface to do matching.
-- Since 'target' is polymorphic you may need to supply the type explicitly
-- in contexts where it cannot be inferred.
--
-- The monadic 'matchM' version uses 'fail' to report when the 'regex'
-- has no match in 'source'.  Two examples:
--
-- Here the context 'Bool' is inferred:
--
-- > [ c | let notVowel = makeRegex "[^aeiou]" :: Regex, c <- ['a'..'z'], match notVowel [c] ]
-- >
-- > "bcdfghjklmnpqrstvwxyz"
--
-- Here the context '[String]' must be supplied:
--
-- > let notVowel = (makeRegex "[^aeiou]" :: Regex )
-- > in do { c <- ['a'..'z'] ; matchM notVowel [c] } :: [String]
-- >
-- > ["b","c","d","f","g","h","j","k","l","m","n","p","q","r","s","t","v","w","x","y","z"]
class (RegexLike regex source) => RegexContext regex source target where
  match :: regex -> source -> target
  matchM :: (MonadFail m) => regex -> source -> m target

----------------
-- | Extract allows for indexing operations on the source types that
-- have instances below: String, strict and lazy ByteString, Seq, and
-- strict and lazy Text.
class Extract source where
  -- | before is a renamed "take"
  before :: Int -> source -> source
  -- | after is a renamed "drop"
  after :: Int -> source -> source
  -- | For when there is no match, this can construct an empty data value
  empty :: source
  -- | extract takes an offset and length and has a default
  -- implementation of @extract (off,len) source = before len (after
  -- off source)@
  extract :: (Int, Int) -> source -> source
  extract (off, len) source = before len (after off source)

instance Extract String where
  before = take
  after  = drop
  empty  = []

instance Extract SB.ByteString where
  before = SB.take
  after  = SB.drop
  empty  = SB.empty

-- The lazy ByteString API does not take 'Int' counts, so the count is
-- converted numerically first.
instance Extract LB.ByteString where
  before = LB.take . fromIntegral
  after  = LB.drop . fromIntegral
  empty  = LB.empty

instance Extract (S.Seq a) where
  before = S.take
  after  = S.drop
  empty  = S.empty

-- | @since 0.94.0.0
instance Extract ST.Text where
  before = ST.take
  after  = ST.drop
  empty  = ST.empty

-- | @since 0.94.0.0
instance Extract LT.Text where
  -- Same numeric conversion of the count as for lazy ByteString.
  before = LT.take . fromIntegral
  after  = LT.drop . fromIntegral
  empty  = LT.empty

-- | Used in results of RegexContext instances
newtype AllSubmatches f b = AllSubmatches { getAllSubmatches :: f b }

-- | Used in results of RegexContext instances
newtype AllTextSubmatches f b = AllTextSubmatches { getAllTextSubmatches :: f b }

-- | Used in results of RegexContext instances
newtype AllMatches f b = AllMatches { getAllMatches :: f b }

-- | Used in results of RegexContext instances
newtype AllTextMatches f b = AllTextMatches { getAllTextMatches :: f b }