1{-# LANGUAGE BangPatterns #-} 2-- | 3-- Module : Data.ByteString.Lazy.Search.DFA 4-- Copyright : Daniel Fischer 5-- Licence : BSD3 6-- Maintainer : Daniel Fischer <daniel.is.fischer@googlemail.com> 7-- Stability : Provisional 8-- Portability : non-portable (BangPatterns) 9-- 10-- Fast search of lazy 'L.ByteString' values. Breaking, 11-- splitting and replacing using a deterministic finite automaton. 12 13module Data.ByteString.Lazy.Search.DFA ( -- * Overview 14 -- $overview 15 16 -- ** Complexity and performance 17 -- $complexity 18 19 -- ** Partial application 20 -- $partial 21 22 -- * Finding substrings 23 indices 24 , nonOverlappingIndices 25 -- * Breaking on substrings 26 , breakOn 27 , breakAfter 28 , breakFindAfter 29 -- * Replacing 30 , replace 31 -- * Splitting 32 , split 33 , splitKeepEnd 34 , splitKeepFront 35 ) where 36 37import Data.ByteString.Search.Internal.Utils (automaton, keep, ldrop, lsplit) 38import Data.ByteString.Search.Substitution 39 40import qualified Data.ByteString as S 41import qualified Data.ByteString.Lazy as L 42import qualified Data.ByteString.Lazy.Internal as LI 43import Data.ByteString.Unsafe (unsafeIndex) 44 45import Data.Array.Base (unsafeAt) 46--import Data.Array.Unboxed (UArray) 47 48import Data.Bits 49import Data.Int (Int64) 50 51-- $overview 52-- 53-- This module provides functions related to searching a substring within 54-- a string. The searching algorithm uses a deterministic finite automaton 55-- based on the Knuth-Morris-Pratt algorithm. 56-- The automaton is implemented as an array of @(patternLength + 1) * σ@ 57-- state transitions, where σ is the alphabet size (256), so it is only 58-- suitable for short enough patterns, therefore the patterns in this module 59-- are required to be strict 'S.ByteString's. 60-- 61-- When searching a pattern in a UTF-8-encoded 'L.ByteString', be aware that 62-- these functions work on bytes, not characters, so the indices are 63-- byte-offsets, not character offsets. 64 65-- $complexity 66-- 67-- The time and space complexity of the preprocessing phase is 68-- /O/(@patternLength * σ@). 69-- The searching phase is /O/(@targetLength@), each target character is 70-- inspected only once. 71-- 72-- In general the functions in this module have about the same performance as 73-- the corresponding functions using the Knuth-Morris-Pratt algorithm but 74-- are considerably slower than the Boyer-Moore functions. For very short 75-- patterns or, in the case of 'indices', patterns with a short period 76-- which occur often, however, times are close to or even below the 77-- Boyer-Moore times. 78 79-- $partial 80-- 81-- All functions can usefully be partially applied. Given only a pattern, 82-- the automaton is constructed only once, allowing efficient re-use. 83 84------------------------------------------------------------------------------ 85-- Exported Functions -- 86------------------------------------------------------------------------------ 87 88-- | @'indices'@ finds the starting indices of all possibly overlapping 89-- occurrences of the pattern in the target string. 90-- If the pattern is empty, the result is @[0 .. 'length' target]@. 91{-# INLINE indices #-} 92indices :: S.ByteString -- ^ Strict pattern to find 93 -> L.ByteString -- ^ Lazy string to search 94 -> [Int64] -- ^ Offsets of matches 95indices !pat = lazySearcher True pat . L.toChunks 96 97-- | @'nonOverlappingIndices'@ finds the starting indices of all 98-- non-overlapping occurrences of the pattern in the target string. 99-- It is more efficient than removing indices from the list produced 100-- by 'indices'. 101{-# INLINE nonOverlappingIndices #-} 102nonOverlappingIndices :: S.ByteString -- ^ Strict pattern to find 103 -> L.ByteString -- ^ Lazy string to search 104 -> [Int64] -- ^ Offsets of matches 105nonOverlappingIndices !pat = lazySearcher False pat . L.toChunks 106 107-- | @'breakOn' pattern target@ splits @target@ at the first occurrence 108-- of @pattern@. If the pattern does not occur in the target, the 109-- second component of the result is empty, otherwise it starts with 110-- @pattern@. If the pattern is empty, the first component is empty. 111-- For a non-empty pattern, the first component is generated lazily, 112-- thus the first parts of it can be available before the pattern has 113-- been found or determined to be absent. 114-- 115-- @ 116-- 'uncurry' 'L.append' . 'breakOn' pattern = 'id' 117-- @ 118breakOn :: S.ByteString -- ^ Strict pattern to search for 119 -> L.ByteString -- ^ Lazy string to search in 120 -> (L.ByteString, L.ByteString) 121 -- ^ Head and tail of string broken at substring 122breakOn pat = breaker . L.toChunks 123 where 124 lbrk = lazyBreaker True pat 125 breaker strs = let (f, b) = lbrk strs 126 in (L.fromChunks f, L.fromChunks b) 127 128-- | @'breakAfter' pattern target@ splits @target@ behind the first occurrence 129-- of @pattern@. An empty second component means that either the pattern 130-- does not occur in the target or the first occurrence of pattern is at 131-- the very end of target. If you need to discriminate between those cases, 132-- use breakFindAfter. 133-- If the pattern is empty, the first component is empty. 134-- For a non-empty pattern, the first component is generated lazily, 135-- thus the first parts of it can be available before the pattern has 136-- been found or determined to be absent. 137-- @ 138-- 'uncurry' 'L.append' . 'breakAfter' pattern = 'id' 139-- @ 140breakAfter :: S.ByteString -- ^ Strict pattern to search for 141 -> L.ByteString -- ^ Lazy string to search in 142 -> (L.ByteString, L.ByteString) 143 -- ^ Head and tail of string broken after substring 144breakAfter pat = breaker . L.toChunks 145 where 146 lbrk = lazyBreaker False pat 147 breaker strs = let (f, b) = lbrk strs 148 in (L.fromChunks f, L.fromChunks b) 149 150-- | @'breakFindAfter'@ does the same as 'breakAfter' but additionally indicates 151-- whether the pattern is present in the target. 152-- 153-- @ 154-- 'fst' . 'breakFindAfter' pat = 'breakAfter' pat 155-- @ 156breakFindAfter :: S.ByteString -- ^ Strict pattern to search for 157 -> L.ByteString -- ^ Lazy string to search in 158 -> ((L.ByteString, L.ByteString), Bool) 159 -- ^ Head and tail of string broken after substring 160 -- and presence of pattern 161breakFindAfter pat 162 | S.null pat = \str -> ((L.empty, str), True) 163breakFindAfter pat = breaker . L.toChunks 164 where 165 !patLen = S.length pat 166 lbrk = lazyBreaker True pat 167 breaker strs = let (f, b) = lbrk strs 168 (f1, b1) = lsplit patLen b 169 mbpat = L.fromChunks f1 170 in ((foldr LI.chunk mbpat f, L.fromChunks b1), not (null b)) 171 172-- | @'replace' pat sub text@ replaces all (non-overlapping) occurrences of 173-- @pat@ in @text@ with @sub@. If occurrences of @pat@ overlap, the first 174-- occurrence that does not overlap with a replaced previous occurrence 175-- is substituted. Occurrences of @pat@ arising from a substitution 176-- will not be substituted. For example: 177-- 178-- @ 179-- 'replace' \"ana\" \"olog\" \"banana\" = \"bologna\" 180-- 'replace' \"ana\" \"o\" \"bananana\" = \"bono\" 181-- 'replace' \"aab\" \"abaa\" \"aaabb\" = \"aabaab\" 182-- @ 183-- 184-- The result is a lazy 'L.ByteString', 185-- which is lazily produced, without copying. 186-- Equality of pattern and substitution is not checked, but 187-- 188-- @ 189-- 'replace' pat pat text == text 190-- @ 191-- 192-- holds (the internal structure is generally different). 193-- If the pattern is empty but not the substitution, the result 194-- is equivalent to (were they 'String's) @cycle sub@. 195-- 196-- For non-empty @pat@ and @sub@ a lazy 'L.ByteString', 197-- 198-- @ 199-- 'L.concat' . 'Data.List.intersperse' sub . 'split' pat = 'replace' pat sub 200-- @ 201-- 202-- and analogous relations hold for other types of @sub@. 203replace :: Substitution rep 204 => S.ByteString -- ^ Strict pattern to replace 205 -> rep -- ^ Replacement string 206 -> L.ByteString -- ^ Lazy string to modify 207 -> L.ByteString -- ^ Lazy result 208replace pat 209 | S.null pat = \sub -> prependCycle sub 210 | otherwise = 211 let !patLen = S.length pat 212 breaker = lazyBreaker True pat 213 repl subst strs 214 | null strs = [] 215 | otherwise = 216 let (pre, mtch) = breaker strs 217 in pre ++ case mtch of 218 [] -> [] 219 _ -> subst (repl subst (ldrop patLen mtch)) 220 in \sub -> let {-# NOINLINE subst #-} 221 !subst = substitution sub 222 repl1 = repl subst 223 in L.fromChunks . repl1 . L.toChunks 224 225 226-- | @'split' pattern target@ splits @target@ at each (non-overlapping) 227-- occurrence of @pattern@, removing @pattern@. If @pattern@ is empty, 228-- the result is an infinite list of empty 'L.ByteString's, if @target@ 229-- is empty but not @pattern@, the result is an empty list, otherwise 230-- the following relations hold (where @patL@ is the lazy 'L.ByteString' 231-- corresponding to @pat@): 232-- 233-- @ 234-- 'L.concat' . 'Data.List.intersperse' patL . 'split' pat = 'id', 235-- 'length' ('split' pattern target) == 236-- 'length' ('nonOverlappingIndices' pattern target) + 1, 237-- @ 238-- 239-- no fragment in the result contains an occurrence of @pattern@. 240split :: S.ByteString -- ^ Strict pattern to split on 241 -> L.ByteString -- ^ Lazy string to split 242 -> [L.ByteString] -- ^ Fragments of string 243split pat 244 | S.null pat = const (repeat L.empty) 245split pat = map L.fromChunks . splitter . L.toChunks 246 where 247 !patLen = S.length pat 248 breaker = lazyBreaker True pat 249 splitter strs 250 | null strs = [] 251 | otherwise = splitter' strs 252 splitter' strs 253 | null strs = [[]] 254 | otherwise = 255 case breaker strs of 256 (pre, mtch) -> 257 pre : case mtch of 258 [] -> [] 259 _ -> splitter' (ldrop patLen mtch) 260 261-- | @'splitKeepEnd' pattern target@ splits @target@ after each (non-overlapping) 262-- occurrence of @pattern@. If @pattern@ is empty, the result is an 263-- infinite list of empty 'L.ByteString's, otherwise the following 264-- relations hold: 265-- 266-- @ 267-- 'L.concat' . 'splitKeepEnd' pattern = 'id,' 268-- @ 269-- 270-- all fragments in the result except possibly the last end with 271-- @pattern@, no fragment contains more than one occurrence of @pattern@. 272splitKeepEnd :: S.ByteString -- ^ Strict pattern to split on 273 -> L.ByteString -- ^ Lazy string to split 274 -> [L.ByteString] -- ^ Fragments of string 275splitKeepEnd pat 276 | S.null pat = const (repeat L.empty) 277splitKeepEnd pat = map L.fromChunks . splitter . L.toChunks 278 where 279 breaker = lazyBreaker False pat 280 splitter [] = [] 281 splitter strs = 282 case breaker strs of 283 (pre, mtch) -> pre : splitter mtch 284 285-- | @'splitKeepFront'@ is like 'splitKeepEnd', except that @target@ is split 286-- before each occurrence of @pattern@ and hence all fragments 287-- with the possible exception of the first begin with @pattern@. 288-- No fragment contains more than one non-overlapping occurrence 289-- of @pattern@. 290splitKeepFront :: S.ByteString -- ^ Strict pattern to split on 291 -> L.ByteString -- ^ Lazy string to split 292 -> [L.ByteString] -- ^ Fragments of string 293splitKeepFront pat 294 | S.null pat = const (repeat L.empty) 295splitKeepFront pat = map L.fromChunks . splitter . L.toChunks 296 where 297 !patLen = S.length pat 298 breaker = lazyBreaker True pat 299 splitter strs = case splitter' strs of 300 ([] : rst) -> rst 301 other -> other 302 splitter' [] = [] 303 splitter' strs = 304 case breaker strs of 305 (pre, mtch) -> 306 pre : case mtch of 307 [] -> [] 308 _ -> case lsplit patLen mtch of 309 (pt, rst) -> 310 if null rst 311 then [pt] 312 else let (h : t) = splitter' rst 313 in (pt ++ h) : t 314 315------------------------------------------------------------------------------ 316-- Searching Function -- 317------------------------------------------------------------------------------ 318 319lazySearcher :: Bool -> S.ByteString -> [S.ByteString] -> [Int64] 320lazySearcher _ !pat 321 | S.null pat = 322 let zgo _ [] = [] 323 zgo !prior (!str : rest) = 324 let !l = S.length str 325 !prior' = prior + fromIntegral l 326 in [prior + fromIntegral i | i <- [1 .. l]] ++ zgo prior' rest 327 in (0:) . zgo 0 328 | S.length pat == 1 = 329 let !w = S.head pat 330 ixes = S.elemIndices w 331 go _ [] = [] 332 go !prior (!str : rest) 333 = let !prior' = prior + fromIntegral (S.length str) 334 in map ((+ prior) . fromIntegral) (ixes str) ++ go prior' rest 335 in go 0 336lazySearcher !overlap pat = search 0 0 337 where 338 !patLen = S.length pat 339 !auto = automaton pat 340 !p0 = unsafeIndex pat 0 341 !ams = if overlap then patLen else 0 342 search _ _ [] = [] 343 search !prior st (!str:rest) = match st 0 344 where 345 !strLen = S.length str 346 {-# INLINE strAt #-} 347 strAt :: Int -> Int 348 strAt i = fromIntegral (str `unsafeIndex` i) 349 match 0 !idx 350 | idx == strLen = search (prior + fromIntegral strLen) 0 rest 351 | unsafeIndex str idx == p0 = match 1 (idx + 1) 352 | otherwise = match 0 (idx + 1) 353 match state idx 354 | idx == strLen = search (prior + fromIntegral strLen) state rest 355 | otherwise = 356 let nstate = unsafeAt auto ((state `shiftL` 8) + strAt idx) 357 !nxtIdx = idx + 1 358 in if nstate == patLen 359 then (prior + fromIntegral (nxtIdx - patLen)) : 360 match ams nxtIdx 361 else match nstate nxtIdx 362 363------------------------------------------------------------------------------ 364-- Breaking -- 365------------------------------------------------------------------------------ 366 367-- Code duplication :( 368-- Needed for reasonable performance. 369lazyBreaker :: Bool -> S.ByteString -> [S.ByteString] 370 -> ([S.ByteString], [S.ByteString]) 371lazyBreaker before pat 372 | S.null pat = \strs -> ([], strs) 373 | S.length pat == 1 = 374 let !w = S.head pat 375 !a = if before then 0 else 1 376 ixes = S.elemIndices w 377 scan [] = ([], []) 378 scan (!str:rest) = 379 let !strLen = S.length str 380 in case ixes str of 381 [] -> let (fr, bk) = scan rest in (str : fr, bk) 382 (i:_) -> let !j = i + a 383 in if j == strLen 384 then ([str],rest) 385 else ([S.take j str], S.drop j str : rest) 386 in scan 387lazyBreaker !before pat = bscan [] 0 388 where 389 !patLen = S.length pat 390 !auto = automaton pat 391 !p0 = unsafeIndex pat 0 392 bscan _ _ [] = ([], []) 393 bscan !past !sta (!str:rest) = match sta 0 394 where 395 !strLen = S.length str 396 {-# INLINE strAt #-} 397 strAt :: Int -> Int 398 strAt i = fromIntegral (str `unsafeIndex` i) 399 match 0 idx 400 | idx == strLen = 401 let (fr, bk) = bscan [] 0 rest 402 in (foldr (flip (.) . (:)) id past (str:fr), bk) 403 | unsafeIndex str idx == p0 = match 1 (idx + 1) 404 | otherwise = match 0 (idx + 1) 405 match state idx 406 | idx == strLen = 407 let (kp, !rl) = if before 408 then keep state (str:past) 409 else ([], str:past) 410 (fr, bk) = bscan kp state rest 411 in (foldr (flip (.) . (:)) id rl fr, bk) 412 | otherwise = 413 let !nstate = unsafeAt auto ((state `shiftL` 8) + strAt idx) 414 !nxtIdx = idx + 1 415 in if nstate == patLen 416 then case if before then nxtIdx - patLen else nxtIdx of 417 0 -> (foldr (flip (.) . (:)) id past [], str:rest) 418 stIx | stIx < 0 -> rgo (-stIx) (str:rest) past 419 | stIx == strLen -> 420 (foldr (flip (.) . (:)) id past [str],rest) 421 | otherwise -> 422 (foldr (flip (.) . (:)) id past 423 [S.take stIx str], S.drop stIx str : rest) 424 else match nstate nxtIdx 425 426 427-- Did I already mention that I suck at finding names? 428{-# INLINE rgo #-} 429rgo :: Int -> [S.ByteString] -> [S.ByteString] 430 -> ([S.ByteString], [S.ByteString]) 431rgo !kp acc (!str:more) 432 | sl == kp = (reverse more, str:acc) 433 | sl < kp = rgo (kp - sl) (str:acc) more 434 | otherwise = case S.splitAt (sl - kp) str of 435 (fr, bk) -> 436 (foldr (flip (.) . (:)) id more [fr], bk:acc) 437 where 438 !sl = S.length str 439rgo _ _ [] = error "Not enough past!" 440-- If that error is ever encountered, I screwed up badly. 441