{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE FunctionalDependencies #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE TypeSynonymInstances #-}
-----------------------------------------------------------------------------
-- |
-- Module      :  Text.Regex.Base.RegexLike
-- Copyright   :  (c) Chris Kuklewicz 2006
-- SPDX-License-Identifier: BSD-3-Clause
--
-- Maintainer  :  hvr@gnu.org
-- Stability   :  experimental
-- Portability :  non-portable (MPTC+FD)
--
-- Classes and instances for Regex matching.
--
-- All the /classes/ are declared here, and some common type aliases, and
-- the 'MatchResult' data type.
--
-- The only /instances/ here are the 'Extract' instances for 'String',
-- strict and lazy @ByteString@, 'S.Seq', and strict and lazy @Text@.
-- There are no data values.  The 'RegexContext'
-- instances are in "Text.Regex.Base.Context", except for ones which
-- run afoul of a repeated variable (@'RegexContext' regex a a@), which
-- are defined in each modules' String and ByteString modules.
-----------------------------------------------------------------------------

module Text.Regex.Base.RegexLike (
  -- ** Type aliases
  MatchOffset,
  MatchLength,
  MatchArray,
  MatchText,
  -- ** Data types
  MatchResult(..),
  -- ** Classes
  RegexOptions(..),
  RegexMaker(..),
  RegexLike(..),
  RegexContext(..),
  Extract(..),
  AllSubmatches(..),
  AllTextSubmatches(..),
  AllMatches(..),
  AllTextMatches(..)
  ) where

import Prelude hiding (fail)
import Control.Monad.Fail as Fail (MonadFail)

import Data.Array (Array, (!))
import Data.Maybe (fromMaybe, isJust)
import qualified Data.ByteString as SB (take, drop, empty, ByteString)
import qualified Data.ByteString.Lazy as LB (take, drop, empty, ByteString)
import qualified Data.Sequence as S (take, drop, empty, Seq)
import qualified Data.Text as ST (take, drop, empty, Text)
import qualified Data.Text.Lazy as LT (take, drop, empty, Text)

-- | 0 based index from start of source, or (-1) for unused
type MatchOffset = Int

-- | non-negative length of a match
type MatchLength = Int

-- | 0 based array, with 0th index indicating the full match.  If the
-- full match location is not available, represent as (0,0).
type MatchArray = Array Int (MatchOffset, MatchLength)

-- | Same indexing as 'MatchArray', but every element additionally
-- carries a value of the @source@ type next to its (offset,length) pair.
type MatchText source = Array Int (source, (MatchOffset, MatchLength))

-- | This is the same as the type from JRegex.
data MatchResult a = MR {
    mrBefore :: a,
    mrMatch :: a,
    mrAfter :: a,
    mrSubList :: [a],
    mrSubs :: Array Int a
}


-- | Rather than carry them around separately, the options for how to
-- execute a regex are kept as part of the regex.  There are two types
-- of options.  Those that can only be specified at compilation time
-- and never changed are @compOpt@.  Those that can be changed later and
-- affect how matching is performed are @execOpt@.  The actual types
-- for these depend on the backend.
--
class RegexOptions regex compOpt execOpt
  | regex -> compOpt execOpt
  , compOpt -> regex execOpt
  , execOpt -> regex compOpt
  where

  -- | No options set at all in the backend.
  blankCompOpt :: compOpt

  -- | No options set at all in the backend.
  blankExecOpt :: execOpt

  -- | Reasonable options (extended, caseSensitive, multiline regex).
  defaultCompOpt :: compOpt

  -- | Reasonable options (extended, caseSensitive, multiline regex).
  defaultExecOpt :: execOpt

  -- | Forget old flags and use new ones.
  setExecOpts :: execOpt -> regex -> regex

  -- | Retrieve the current flags.
  getExecOpts :: regex -> execOpt


-- | @RegexMaker@ captures the creation of the compiled regular
-- expression from a source type and an option type.  Methods 'makeRegexM'
-- and 'makeRegexOptsM' report parse errors using 'MonadFail' (for
-- example in @Maybe regex@).
--
-- The 'makeRegex' function has a default implementation that depends
-- on 'makeRegexOpts' and uses 'defaultCompOpt' and 'defaultExecOpt'.
-- Similarly for 'makeRegexM' and 'makeRegexOptsM'.
--
-- There are also default implementations for 'makeRegexOpts' and
-- 'makeRegexOptsM' in terms of each other.  So a minimal instance
-- definition needs to only define one of these, hopefully
-- 'makeRegexOptsM'.  Defining neither leaves the two defaults calling
-- each other forever, which is why the class carries a @MINIMAL@ pragma.
--
class (RegexOptions regex compOpt execOpt) => RegexMaker regex compOpt execOpt source
      | regex -> compOpt execOpt, compOpt -> regex execOpt, execOpt -> regex compOpt where

  -- | Use the 'defaultCompOpt' and 'defaultExecOpt'.
  makeRegex :: source -> regex

  -- | Specify your own options.  The default implementation runs
  -- 'makeRegexOptsM' in 'Maybe' and calls 'error' when that yields
  -- 'Nothing'.
  makeRegexOpts :: compOpt -> execOpt -> source -> regex

  -- | Use the 'defaultCompOpt' and 'defaultExecOpt', reporting errors with 'fail'.
  makeRegexM :: (MonadFail m) => source -> m regex

  -- | Specify your own options, reporting errors with 'fail'.  The
  -- default implementation wraps 'makeRegexOpts' with 'return' and so
  -- never reports a failure through the monad itself.
  makeRegexOptsM :: (MonadFail m) => compOpt -> execOpt -> source -> m regex

  makeRegex = makeRegexOpts defaultCompOpt defaultExecOpt
  makeRegexM = makeRegexOptsM defaultCompOpt defaultExecOpt
  makeRegexOpts c e s = fromMaybe (error "makeRegexOpts failed") (makeRegexOptsM c e s)
  makeRegexOptsM c e s = return (makeRegexOpts c e s)

  -- The last two defaults are mutually recursive, so at least one of
  -- them has to be overridden by every instance.
  {-# MINIMAL makeRegexOpts | makeRegexOptsM #-}


-- | RegexLike is parametrized on a regular expression type and a
-- source type to run the matching on.
--
-- There are default implementations: 'matchTest' and 'matchOnceText' use
-- 'matchOnce'; 'matchCount' and 'matchAllText' use 'matchAll'.
-- Conversely, 'matchOnce' uses
-- 'matchOnceText' and 'matchAll' uses 'matchAllText'.  So a minimal complete
-- instance needs to provide at least ('matchOnce' or 'matchOnceText') and
-- ('matchAll' or 'matchAllText').  Additional definitions are often
-- provided where they will increase efficiency.
--
-- > [ c | let notVowel = makeRegex "[^aeiou]" :: Regex, c <- ['a'..'z'], matchTest notVowel [c]  ]
-- >
-- > "bcdfghjklmnpqrstvwxyz"
--
-- The strictness of these functions is instance dependent.
--
class (Extract source) => RegexLike regex source where

  -- | This returns the first match in the source (it checks the whole
  -- source, not just at the start).  This returns an array of
  -- (offset,length) index pairs for the match and captured
  -- substrings.  The offset is 0-based.  A (-1) for an offset means a
  -- failure to match.  The lower bound of the array is 0, and the 0th
  -- element is the (offset,length) for the whole match.
  matchOnce :: regex -> source -> Maybe MatchArray

  -- | @matchAll@ returns a list of matches.  The matches are in order
  -- and do not overlap.  If any match succeeds but has 0 length then
  -- this will be the last match in the list.
  matchAll :: regex -> source -> [MatchArray]

  -- | @matchCount@ returns the number of non-overlapping matches
  -- returned by @matchAll@.
  matchCount :: regex -> source -> Int

  -- | @matchTest@ returns @True@ if there is a match somewhere in the
  -- source (it checks the whole source not just at the start).
  matchTest :: regex -> source -> Bool

  -- | This is @matchAll@ with the actual subsections of the source
  -- instead of just the (offset,length) information.
  matchAllText :: regex -> source -> [MatchText source]

  -- | This can return a tuple of three items: the source before the
  -- match, an array of the match and captured substrings (with their
  -- indices), and the source after the match.
  matchOnceText :: regex -> source -> Maybe (source, MatchText source, source)

  -- Offsets-only results are obtained by dropping the text component
  -- of the corresponding *Text method.
  matchAll regex source = map (fmap snd) (matchAllText regex source)
  matchOnce regex source = fmap (\(_,mt,_) -> fmap snd mt) (matchOnceText regex source)
  matchTest regex source = isJust (matchOnce regex source)
  matchCount regex source = length (matchAll regex source)
  -- Element 0 of the match array is the whole match; its (offset,length)
  -- decides where the "before" and "after" pieces are cut.
  matchOnceText regex source =
    fmap (\ma -> let (o,l) = ma ! 0
                 in (before o source
                    ,fmap (\ol -> (extract ol source,ol)) ma
                    ,after (o+l) source))
         (matchOnce regex source)
  matchAllText regex source =
    map (fmap (\ol -> (extract ol source,ol)))
        (matchAll regex source)

  -- The offset-based and text-based defaults are written in terms of
  -- each other, so one method of each pair must be overridden or every
  -- method in that pair loops.
  {-# MINIMAL (matchOnce | matchOnceText), (matchAll | matchAllText) #-}


-- | @RegexContext@ is the polymorphic interface to do matching.  Since
-- 'target' is polymorphic you may need to supply the type explicitly
-- in contexts where it cannot be inferred.
--
-- The monadic 'matchM' version uses 'fail' to report when the 'regex'
-- has no match in 'source'.  Two examples:
--
-- Here the context 'Bool' is inferred:
--
-- > [ c | let notVowel = makeRegex "[^aeiou]" :: Regex, c <- ['a'..'z'], match notVowel [c]  ]
-- >
-- > "bcdfghjklmnpqrstvwxyz"
--
-- Here the context @[String]@ must be supplied:
--
-- > let notVowel = (makeRegex "[^aeiou]" :: Regex )
-- > in do { c <- ['a'..'z'] ; matchM notVowel [c] } :: [String]
-- >
-- > ["b","c","d","f","g","h","j","k","l","m","n","p","q","r","s","t","v","w","x","y","z"]
--
class (RegexLike regex source) => RegexContext regex source target where
  -- | Match @regex@ against @source@ and produce a result of the
  -- requested @target@ type.
  match :: regex -> source -> target
  -- | Like 'match', but a missing match is reported with 'fail' in the
  -- chosen 'MonadFail'.
  matchM :: (MonadFail m) => regex -> source -> m target


-- | Extract allows for indexing operations on 'String' or 'ByteString'.
--
class Extract source where

  -- | @before@ is a renamed 'take'.
  before :: Int -> source -> source

  -- | @after@ is a renamed 'drop'.
  after :: Int -> source -> source

  -- | When there is no match, this can construct an empty data value.
  empty :: source

  -- | @extract@ takes an (offset, length) pair and a source.  Unless an
  -- instance overrides it, the result is computed as:
  --
  -- @
  -- extract (off, len) source = before len (after off source)
  -- @
  extract :: (Int, Int) -> source -> source
  extract (start, size) src = before size $ after start src

instance Extract String where
  before = take
  after = drop
  empty = []

instance Extract SB.ByteString where
  before = SB.take
  after = SB.drop
  empty = SB.empty

-- The lazy ByteString functions take a wider count type, so the 'Int'
-- argument is converted first.
instance Extract LB.ByteString where
  before = LB.take . fromIntegral
  after = LB.drop . fromIntegral
  empty = LB.empty

instance Extract (S.Seq a) where
  before = S.take
  after = S.drop
  empty = S.empty

-- | @since 0.94.0.0
instance Extract ST.Text where
  before = ST.take
  after = ST.drop
  empty = ST.empty

-- | @since 0.94.0.0
instance Extract LT.Text where
  before = LT.take . fromIntegral
  after = LT.drop . fromIntegral
  empty = LT.empty

-- | Used in results of 'RegexContext' instances.
newtype AllSubmatches f b = AllSubmatches
  { getAllSubmatches :: f b
  }

-- | Used in results of 'RegexContext' instances.
newtype AllTextSubmatches f b = AllTextSubmatches
  { getAllTextSubmatches :: f b
  }

-- | Used in results of 'RegexContext' instances.
newtype AllMatches f b = AllMatches
  { getAllMatches :: f b
  }

-- | Used in results of 'RegexContext' instances.
newtype AllTextMatches f b = AllTextMatches
  { getAllTextMatches :: f b
  }