1{-# LANGUAGE BangPatterns #-} 2{-# OPTIONS_HADDOCK hide, prune #-} 3-- | 4-- Module : Data.ByteString.Lazy.Search.Internal.BoyerMoore 5-- Copyright : Daniel Fischer 6-- Chris Kuklewicz 7-- Licence : BSD3 8-- Maintainer : Daniel Fischer <daniel.is.fischer@googlemail.com> 9-- Stability : Provisional 10-- Portability : non-portable (BangPatterns) 11-- 12-- Fast overlapping Boyer-Moore search of both strict and lazy 13-- 'S.ByteString' values. Breaking, splitting and replacing 14-- using the Boyer-Moore algorithm. 15-- 16-- Descriptions of the algorithm can be found at 17-- <http://www-igm.univ-mlv.fr/~lecroq/string/node14.html#SECTION00140> 18-- and 19-- <http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm> 20-- 21-- Original authors: Daniel Fischer (daniel.is.fischer at googlemail.com) and 22-- Chris Kuklewicz (haskell at list.mightyreason.com). 23 24module Data.ByteString.Lazy.Search.Internal.BoyerMoore ( 25 matchLL 26 , matchSL 27 28 -- Non-overlapping 29 , matchNOL 30 31 -- Replacing substrings 32 -- replacing 33 , replaceAllL 34 -- Breaking on substrings 35 -- breaking 36 , breakSubstringL 37 , breakAfterL 38 , breakFindAfterL 39 -- Splitting on substrings 40 -- splitting 41 , splitKeepEndL 42 , splitKeepFrontL 43 , splitDropL 44 ) where 45 46 47import Data.ByteString.Search.Internal.Utils 48 (occurs, suffShifts, ldrop, lsplit, keep, release, strictify) 49import Data.ByteString.Search.Substitution 50 51import qualified Data.ByteString as S 52import qualified Data.ByteString.Lazy as L 53import Data.ByteString.Unsafe (unsafeIndex) 54 55import Data.Array.Base (unsafeAt) 56 57import Data.Word (Word8) 58import Data.Int (Int64) 59 60-- overview 61-- 62-- This module exports three search functions for searching in lazy 63-- ByteSrings, one for searching non-overlapping occurrences of a strict 64-- pattern, and one each for searchin overlapping occurrences of a strict 65-- resp. lazy pattern. The common base name is @match@, the suffix 66-- indicates the type of search. These functions 67-- return (for a non-empty pattern) a list of all the indices of the target 68-- string where an occurrence of the pattern begins, if some occurrences 69-- overlap, all starting indices are reported. The list is produced lazily, 70-- so not necessarily the entire target string is searched. 71-- 72-- The behaviour of these functions when given an empty pattern has changed. 73-- Formerly, the @matchXY@ functions returned an empty list then, now it's 74-- @[0 .. 'length' target]@. 75-- 76-- Newly added are functions to replace all (non-overlapping) occurrences 77-- of a pattern within a string, functions to break ByteStrings at the first 78-- occurrence of a pattern and functions to split ByteStrings at each 79-- occurrence of a pattern. None of these functions does copying, so they 80-- don't introduce large memory overhead. 81-- 82-- Internally, a lazy pattern is always converted to a strict ByteString, 83-- which is necessary for an efficient implementation of the algorithm. 84-- The limit this imposes on the length of the pattern is probably 85-- irrelevant in practice, but perhaps it should be mentioned. 86-- This also means that the @matchL*@ functions are mere convenience wrappers. 87-- Except for the initial 'strictify'ing, there's no difference between lazy 88-- and strict patterns, they call the same workers. There is, however, a 89-- difference between strict and lazy target strings. 90-- For the new functions, no such wrappers are provided, you have to 91-- 'strictify' lazy patterns yourself. 92 93-- caution 94-- 95-- When working with a lazy target string, the relation between the pattern 96-- length and the chunk size can play a big rôle. 97-- Crossing chunk boundaries is relatively expensive, so when that becomes 98-- a frequent occurrence, as may happen when the pattern length is close 99-- to or larger than the chunk size, performance is likely to degrade. 100-- If it is needed, steps can be taken to ameliorate that effect, but unless 101-- entirely separate functions are introduced, that would hurt the 102-- performance for the more common case of patterns much shorter than 103-- the default chunk size. 104 105-- performance 106-- 107-- In general, the Boyer-Moore algorithm is the most efficient method to 108-- search for a pattern inside a string, so most of the time, you'll want 109-- to use the functions of this module, hence this is where the most work 110-- has gone. Very short patterns are an exception to this, for those you 111-- should consider using a finite automaton 112-- ("Data.ByteString.Search.DFA.Array"). That is also often the better 113-- choice for searching longer periodic patterns in a lazy ByteString 114-- with many matches. 115-- 116-- Operating on a strict target string is mostly faster than on a lazy 117-- target string, but the difference is usually small (according to my 118-- tests). 119-- 120-- The known exceptions to this rule of thumb are 121-- 122-- [long targets] Then the smaller memory footprint of a lazy target often 123-- gives (much) better performance. 124-- 125-- [high number of matches] When there are very many matches, strict target 126-- strings are much faster, especially if the pattern is periodic. 127-- 128-- If both conditions hold, either may outweigh the other. 129 130-- complexity 131-- 132-- Preprocessing the pattern is /O/(@patternLength@ + σ) in time and 133-- space (σ is the alphabet size, 256 here) for all functions. 134-- The time complexity of the searching phase for @matchXY@ 135-- is /O/(@targetLength@ \/ @patternLength@) in the best case. 136-- For non-periodic patterns, the worst case complexity is 137-- /O/(@targetLength@), but for periodic patterns, the worst case complexity 138-- is /O/(@targetLength@ * @patternLength@) for the original Boyer-Moore 139-- algorithm. 140-- 141-- The searching functions in this module now contain a modification which 142-- drastically improves the performance for periodic patterns. 143-- I believe that for strict target strings, the worst case is now 144-- /O/(@targetLength@) also for periodic patterns and for lazy target strings, 145-- my semi-educated guess is 146-- /O/(@targetLength@ * (1 + @patternLength@ \/ @chunkSize@)). 147-- I may be very wrong, though. 148-- 149-- The other functions don't have to deal with possible overlapping 150-- patterns, hence the worst case complexity for the processing phase 151-- is /O/(@targetLength@) (respectively /O/(@firstIndex + patternLength@) 152-- for the breaking functions if the pattern occurs). 153 154-- currying 155-- 156-- These functions can all be usefully curried. Given only a pattern 157-- the curried version will compute the supporting lookup tables only 158-- once, allowing for efficient re-use. Similarly, the curried 159-- 'matchLL' and 'matchLS' will compute the concatenated pattern only 160-- once. 161 162-- overflow 163-- 164-- The current code uses @Int@ to keep track of the locations in the 165-- target string. If the length of the pattern plus the length of any 166-- strict chunk of the target string is greater than 167-- @'maxBound' :: 'Int'@ then this will overflow causing an error. We 168-- try to detect this and call 'error' before a segfault occurs. 169 170------------------------------------------------------------------------------ 171-- Wrappers -- 172------------------------------------------------------------------------------ 173 174-- matching 175-- 176-- These functions find the indices of all (possibly overlapping) 177-- occurrences of a pattern in a target string. 178-- If the pattern is empty, the result is @[0 .. length target]@. 179-- If the pattern is much shorter than the target string 180-- and the pattern does not occur very near the beginning of the target, 181-- 182-- > not . null $ matchSS pattern target 183-- 184-- is a much more efficient version of 'S.isInfixOf'. 185 186-- | @'matchLL'@ finds the starting indices of all possibly overlapping 187-- occurrences of the pattern in the target string. 188-- It is a simple wrapper for 'Data.ByteString.Lazy.Search.indices'. 189-- If the pattern is empty, the result is @[0 .. 'length' target]@. 190{-# INLINE matchLL #-} 191matchLL :: L.ByteString -- ^ Lazy pattern 192 -> L.ByteString -- ^ Lazy target string 193 -> [Int64] -- ^ Offsets of matches 194matchLL pat = search . L.toChunks 195 where 196 search = lazySearcher True (strictify pat) 197 198-- | @'matchSL'@ finds the starting indices of all possibly overlapping 199-- occurrences of the pattern in the target string. 200-- It is an alias for 'Data.ByteString.Lazy.Search.indices'. 201-- If the pattern is empty, the result is @[0 .. 'length' target]@. 202{-# INLINE matchSL #-} 203matchSL :: S.ByteString -- ^ Strict pattern 204 -> L.ByteString -- ^ Lazy target string 205 -> [Int64] -- ^ Offsets of matches 206matchSL pat = search . L.toChunks 207 where 208 search = lazySearcher True pat 209 210-- | @'matchNOL'@ finds the indices of all non-overlapping occurrences 211-- of the pattern in the lazy target string. 212{-# INLINE matchNOL #-} 213matchNOL :: S.ByteString -- ^ Strict pattern 214 -> L.ByteString -- ^ Lazy target string 215 -> [Int64] -- ^ Offsets of matches 216matchNOL pat = search . L.toChunks 217 where 218 search = lazySearcher False pat 219 220-- replacing 221-- 222-- These functions replace all (non-overlapping) occurrences of a pattern 223-- in the target string. If some occurrences overlap, the earliest is 224-- replaced and replacing continues at the index after the replaced 225-- occurrence, for example 226-- 227-- > replaceAllL \"ana\" \"olog\" \"banana\" == \"bologna\", 228-- > replaceAllS \"abacab\" \"u\" \"abacabacabacab\" == \"uacu\", 229-- > replaceAllS \"aa\" \"aaa\" \"aaaa\" == \"aaaaaa\". 230-- 231-- Equality of pattern and substitution is not checked, but 232-- 233-- > pat == sub => 'strictify' (replaceAllS pat sub str) == str, 234-- > pat == sub => replaceAllL pat sub str == str. 235-- 236-- The result is a lazily generated lazy ByteString, the first chunks will 237-- generally be available before the entire target has been scanned. 238-- If the pattern is empty, but not the substitution, the result is 239-- equivalent to @'cycle' sub@. 240 241{-# INLINE replaceAllL #-} 242replaceAllL :: Substitution rep 243 => S.ByteString -- ^ Pattern to replace 244 -> rep -- ^ Substitution string 245 -> L.ByteString -- ^ Target string 246 -> L.ByteString -- ^ Lazy result 247replaceAllL pat 248 | S.null pat = \sub -> prependCycle sub 249 | S.length pat == 1 = 250 let breaker = lazyBreak pat 251 repl subst strs 252 | null strs = [] 253 | otherwise = 254 case breaker strs of 255 (pre, mtch) -> 256 pre ++ case mtch of 257 [] -> [] 258 _ -> subst (repl subst (ldrop 1 mtch)) 259 in \sub -> let repl1 = repl (substitution sub) 260 in L.fromChunks . repl1 . L.toChunks 261 | otherwise = 262 let repl = lazyRepl pat 263 in \sub -> let repl1 = repl (substitution sub) 264 in L.fromChunks . repl1 . L.toChunks 265 266-- breaking 267-- 268-- Break a string on a pattern. The first component of the result 269-- contains the prefix of the string before the first occurrence of the 270-- pattern, the second component contains the remainder. 271-- The following relations hold: 272-- 273-- > breakSubstringX \"\" str = (\"\", str) 274-- > not (pat `isInfixOf` str) == null (snd $ breakSunbstringX pat str) 275-- > True == case breakSubstringX pat str of 276-- > (x, y) -> not (pat `isInfixOf` x) 277-- > && (null y || pat `isPrefixOf` y) 278 279-- | The analogous function for a lazy target string. 280-- The first component is generated lazily, so parts of it can be 281-- available before the pattern is detected (or found to be absent). 282{-# INLINE breakSubstringL #-} 283breakSubstringL :: S.ByteString -- ^ Pattern to break on 284 -> L.ByteString -- ^ String to break up 285 -> (L.ByteString, L.ByteString) 286 -- ^ Prefix and remainder of broken string 287breakSubstringL pat = breaker . L.toChunks 288 where 289 lbrk = lazyBreak pat 290 breaker strs = let (f, b) = lbrk strs 291 in (L.fromChunks f, L.fromChunks b) 292 293breakAfterL :: S.ByteString 294 -> L.ByteString 295 -> (L.ByteString, L.ByteString) 296breakAfterL pat 297 | S.null pat = \str -> (L.empty, str) 298breakAfterL pat = breaker' . L.toChunks 299 where 300 !patLen = S.length pat 301 breaker = lazyBreak pat 302 breaker' strs = 303 let (pre, mtch) = breaker strs 304 (pl, a) = if null mtch then ([],[]) else lsplit patLen mtch 305 in (L.fromChunks (pre ++ pl), L.fromChunks a) 306 307breakFindAfterL :: S.ByteString 308 -> L.ByteString 309 -> ((L.ByteString, L.ByteString), Bool) 310breakFindAfterL pat 311 | S.null pat = \str -> ((L.empty, str), True) 312breakFindAfterL pat = breaker' . L.toChunks 313 where 314 !patLen = S.length pat 315 breaker = lazyBreak pat 316 breaker' strs = 317 let (pre, mtch) = breaker strs 318 (pl, a) = if null mtch then ([],[]) else lsplit patLen mtch 319 in ((L.fromChunks (pre ++ pl), L.fromChunks a), not (null mtch)) 320 321-- splitting 322-- 323-- These functions implement various splitting strategies. 324-- 325-- If the pattern to split on is empty, all functions return an 326-- infinite list of empty ByteStrings. 327-- Otherwise, the names are rather self-explanatory. 328-- 329-- For nonempty patterns, the following relations hold: 330-- 331-- > concat (splitKeepXY pat str) == str 332-- > concat ('Data.List.intersperse' pat (splitDropX pat str)) == str. 333-- 334-- All fragments except possibly the last in the result of 335-- @splitKeepEndX pat@ end with @pat@, none of the fragments contains 336-- more than one occurrence of @pat@ or is empty. 337-- 338-- All fragments except possibly the first in the result of 339-- @splitKeepFrontX pat@ begin with @pat@, none of the fragments 340-- contains more than one occurrence of @patq or is empty. 341-- 342-- > splitDropX pat str == map dropPat (splitKeepFrontX pat str) 343-- > where 344-- > patLen = length pat 345-- > dropPat frag 346-- > | pat `isPrefixOf` frag = drop patLen frag 347-- > | otherwise = frag 348-- 349-- but @splitDropX@ is a little more efficient than that. 350 351{-# INLINE splitKeepEndL #-} 352splitKeepEndL :: S.ByteString -- ^ Pattern to split on 353 -> L.ByteString -- ^ String to split 354 -> [L.ByteString] -- ^ List of fragments 355splitKeepEndL pat 356 | S.null pat = const (repeat L.empty) 357 | otherwise = 358 let splitter = lazySplitKeepEnd pat 359 in map L.fromChunks . splitter . L.toChunks 360 361{-# INLINE splitKeepFrontL #-} 362splitKeepFrontL :: S.ByteString -- ^ Pattern to split on 363 -> L.ByteString -- ^ String to split 364 -> [L.ByteString] -- ^ List of fragments 365splitKeepFrontL pat 366 | S.null pat = const (repeat L.empty) 367 | otherwise = 368 let splitter = lazySplitKeepFront pat 369 in map L.fromChunks . splitter . L.toChunks 370 371 372{-# INLINE splitDropL #-} 373splitDropL :: S.ByteString -- ^ Pattern to split on 374 -> L.ByteString -- ^ String to split 375 -> [L.ByteString] -- ^ List of fragments 376splitDropL pat 377 | S.null pat = const (repeat L.empty) 378 | otherwise = 379 let splitter = lazySplitDrop pat 380 in map L.fromChunks . splitter . L.toChunks 381 382------------------------------------------------------------------------------ 383-- Search Functions -- 384------------------------------------------------------------------------------ 385 386lazySearcher :: Bool -> S.ByteString -> [S.ByteString] -> [Int64] 387lazySearcher _ !pat 388 | S.null pat = 389 let zgo !prior [] = [prior] 390 zgo prior (!str : rest) = 391 let !l = S.length str 392 !prior' = prior + fromIntegral l 393 in [prior + fromIntegral i | i <- [0 .. l-1]] ++ zgo prior' rest 394 in zgo 0 395 | S.length pat == 1 = 396 let !w = S.head pat 397 ixes = S.elemIndices w 398 go _ [] = [] 399 go !prior (!str : rest) 400 = let !prior' = prior + fromIntegral (S.length str) 401 in map ((+ prior) . fromIntegral) (ixes str) ++ go prior' rest 402 in go 0 403lazySearcher !overlap pat = searcher 404 where 405 {-# INLINE patAt #-} 406 patAt :: Int -> Word8 407 patAt !i = unsafeIndex pat i 408 409 !patLen = S.length pat 410 !patEnd = patLen - 1 411 {-# INLINE preEnd #-} 412 preEnd = patEnd - 1 413 !maxLen = maxBound - patLen 414 !occT = occurs pat -- for bad-character-shift 415 !suffT = suffShifts pat -- for good-suffix-shift 416 !skip = if overlap then unsafeAt suffT 0 else patLen 417 -- shift after a complete match 418 !kept = patLen - skip -- length of known prefix after full match 419 !pe = patAt patEnd -- last pattern byte for fast comparison 420 421 {-# INLINE occ #-} 422 occ !w = unsafeAt occT (fromIntegral w) 423 424 {-# INLINE suff #-} 425 suff !i = unsafeAt suffT i 426 427 searcher lst = case lst of 428 [] -> [] 429 (h : t) -> 430 if maxLen < S.length h 431 then error "Overflow in BoyerMoore.lazySearcher" 432 else seek 0 [] h t 0 patEnd 433 434 -- seek is used to position the "zipper" of (past, str, future) to the 435 -- correct S.ByteString to search. This is done by ensuring that 436 -- 0 <= strPos < strLen, where strPos = diffPos + patPos. 437 -- Note that future is not a strict parameter. The bytes being compared 438 -- will then be (strAt strPos) and (patAt patPos). 439 -- Splitting this into specialised versions is possible, but it would 440 -- only be useful if the pattern length is close to (or larger than) 441 -- the chunk size. For ordinary patterns of at most a few hundred bytes, 442 -- the overhead of yet more code-paths and larger code size will probably 443 -- outweigh the small gains in the relatively rare calls to seek. 444 seek :: Int64 -> [S.ByteString] -> S.ByteString 445 -> [S.ByteString] -> Int -> Int -> [Int64] 446 seek !prior !past !str future !diffPos !patPos 447 | strPos < 0 = -- need to look at previous chunk 448 case past of 449 (h : t) -> 450 let !hLen = S.length h 451 in seek (prior - fromIntegral hLen) t h (str : future) 452 (diffPos + hLen) patPos 453 [] -> error "seek back too far!" 454 | strEnd < strPos = -- need to look at next chunk if there is 455 case future of 456 (h : t) -> 457 let {-# INLINE prior' #-} 458 prior' = prior + fromIntegral strLen 459 !diffPos' = diffPos - strLen 460 {-# INLINE past' #-} 461 past' = release (-diffPos') (str : past) 462 in if maxLen < S.length h 463 then error "Overflow in BoyerMoore.lazySearcher" 464 else seek prior' past' h t diffPos' patPos 465 [] -> [] 466 | patPos == patEnd = checkEnd strPos 467 | diffPos < 0 = matcherN diffPos patPos 468 | otherwise = matcherP diffPos patPos 469 where 470 !strPos = diffPos + patPos 471 !strLen = S.length str 472 !strEnd = strLen - 1 473 !maxDiff = strLen - patLen 474 475 {-# INLINE strAt #-} 476 strAt !i = unsafeIndex str i 477 478 -- While comparing the last byte of the pattern, the bad- 479 -- character-shift is always at least as large as the good- 480 -- suffix-shift. Eliminating the unnecessary memory reads and 481 -- comparison speeds things up noticeably. 482 checkEnd !sI -- index in string to compare to last of pattern 483 | strEnd < sI = seek prior past str future (sI - patEnd) patEnd 484 | otherwise = 485 case strAt sI of 486 !c | c == pe -> 487 if sI < patEnd 488 then case sI of 489 0 -> seek prior past str future (-patEnd) preEnd 490 _ -> matcherN (sI - patEnd) preEnd 491 else matcherP (sI - patEnd) preEnd 492 | otherwise -> checkEnd (sI + patEnd + occ c) 493 494 -- Once the last byte has matched, we enter the full matcher 495 -- diff is the offset of the window, patI the index of the 496 -- pattern byte to compare next. 497 498 -- matcherN is the tight loop that walks backwards from the end 499 -- of the pattern checking for matching bytes. The offset is 500 -- always negative, so no complete match can occur here. 501 -- When a byte matches, we need to check whether we've reached 502 -- the front of this chunk, otherwise whether we need the next. 503 matcherN !diff !patI = 504 case strAt (diff + patI) of 505 !c | c == patAt patI -> 506 if diff + patI == 0 507 then seek prior past str future diff (patI - 1) 508 else matcherN diff (patI - 1) 509 | otherwise -> 510 let {-# INLINE badShift #-} 511 badShift = patI + occ c 512 {-# INLINE goodShift #-} 513 goodShift = suff patI 514 !diff' = diff + max badShift goodShift 515 in if maxDiff < diff' 516 then seek prior past str future diff' patEnd 517 else checkEnd (diff' + patEnd) 518 519 -- matcherP is the tight loop for non-negative offsets. 520 -- When the pattern is shifted, we must check whether we leave 521 -- the current chunk, otherwise we only need to check for a 522 -- complete match. 523 matcherP !diff !patI = 524 case strAt (diff + patI) of 525 !c | c == patAt patI -> 526 if patI == 0 527 then prior + fromIntegral diff : 528 let !diff' = diff + skip 529 in if maxDiff < diff' 530 then seek prior past str future diff' patEnd 531 else 532 if skip == patLen 533 then 534 checkEnd (diff' + patEnd) 535 else 536 afterMatch diff' patEnd 537 else matcherP diff (patI - 1) 538 | otherwise -> 539 let {-# INLINE badShift #-} 540 badShift = patI + occ c 541 {-# INLINE goodShift #-} 542 goodShift = suff patI 543 !diff' = diff + max badShift goodShift 544 in if maxDiff < diff' 545 then seek prior past str future diff' patEnd 546 else checkEnd (diff' + patEnd) 547 548 -- After a full match, we know how long a prefix of the pattern 549 -- still matches. Do not re-compare the prefix to prevent O(m*n) 550 -- behaviour for periodic patterns. 551 -- This breaks down at chunk boundaries, but except for long 552 -- patterns with a short period, that shouldn't matter much. 553 afterMatch !diff !patI = 554 case strAt (diff + patI) of 555 !c | c == patAt patI -> 556 if patI == kept 557 then prior + fromIntegral diff : 558 let !diff' = diff + skip 559 in if maxDiff < diff' 560 then seek prior past str future diff' patEnd 561 else afterMatch diff' patEnd 562 else afterMatch diff (patI - 1) 563 | patI == patEnd -> 564 checkEnd (diff + (2*patEnd) + occ c) 565 | otherwise -> 566 let {-# INLINE badShift #-} 567 badShift = patI + occ c 568 {-# INLINE goodShift #-} 569 goodShift = suff patI 570 !diff' = diff + max badShift goodShift 571 in if maxDiff < diff' 572 then seek prior past str future diff' patEnd 573 else checkEnd (diff' + patEnd) 574 575------------------------------------------------------------------------------ 576-- Breaking Functions -- 577------------------------------------------------------------------------------ 578 579-- Ugh! Code duplication ahead! 580-- But we want to get the first component lazily, so it's no good to find 581-- the first index (if any) and then split. 582-- Therefore bite the bullet and copy most of the code of lazySearcher. 583-- No need for afterMatch here, fortunately. 584lazyBreak ::S.ByteString -> [S.ByteString] -> ([S.ByteString], [S.ByteString]) 585lazyBreak !pat 586 | S.null pat = \lst -> ([],lst) 587 | S.length pat == 1 = 588 let !w = S.head pat 589 go [] = ([], []) 590 go (!str : rest) = 591 case S.elemIndices w str of 592 [] -> let (pre, post) = go rest in (str : pre, post) 593 (i:_) -> if i == 0 594 then ([], str : rest) 595 else ([S.take i str], S.drop i str : rest) 596 in go 597lazyBreak pat = breaker 598 where 599 !patLen = S.length pat 600 !patEnd = patLen - 1 601 !occT = occurs pat 602 !suffT = suffShifts pat 603 !maxLen = maxBound - patLen 604 !pe = patAt patEnd 605 606 {-# INLINE patAt #-} 607 patAt !i = unsafeIndex pat i 608 609 {-# INLINE occ #-} 610 occ !w = unsafeAt occT (fromIntegral w) 611 612 {-# INLINE suff #-} 613 suff !i = unsafeAt suffT i 614 615 breaker lst = 616 case lst of 617 [] -> ([],[]) 618 (h:t) -> 619 if maxLen < S.length h 620 then error "Overflow in BoyerMoore.lazyBreak" 621 else seek [] h t 0 patEnd 622 623 seek :: [S.ByteString] -> S.ByteString -> [S.ByteString] 624 -> Int -> Int -> ([S.ByteString], [S.ByteString]) 625 seek !past !str future !offset !patPos 626 | strPos < 0 = 627 case past of 628 [] -> error "not enough past!" 629 (h : t) -> seek t h (str : future) (offset + S.length h) patPos 630 | strEnd < strPos = 631 case future of 632 [] -> (foldr (flip (.) . (:)) id past [str], []) 633 (h : t) -> 634 let !off' = offset - strLen 635 (past', !discharge) = keep (-off') (str : past) 636 in if maxLen < S.length h 637 then error "Overflow in BoyerMoore.lazyBreak (future)" 638 else let (pre,post) = seek past' h t off' patPos 639 in (foldr (flip (.) . (:)) id discharge pre, post) 640 | patPos == patEnd = checkEnd strPos 641 | offset < 0 = matcherN offset patPos 642 | otherwise = matcherP offset patPos 643 where 644 {-# INLINE strAt #-} 645 strAt !i = unsafeIndex str i 646 647 !strLen = S.length str 648 !strEnd = strLen - 1 649 !maxOff = strLen - patLen 650 !strPos = offset + patPos 651 652 checkEnd !sI 653 | strEnd < sI = seek past str future (sI - patEnd) patEnd 654 | otherwise = 655 case strAt sI of 656 !c | c == pe -> 657 if sI < patEnd 658 then (if sI == 0 659 then seek past str future (-patEnd) (patEnd - 1) 660 else matcherN (sI - patEnd) (patEnd - 1)) 661 else matcherP (sI - patEnd) (patEnd - 1) 662 | otherwise -> checkEnd (sI + patEnd + occ c) 663 664 matcherN !off !patI = 665 case strAt (off + patI) of 666 !c | c == patAt patI -> 667 if off + patI == 0 668 then seek past str future off (patI - 1) 669 else matcherN off (patI - 1) 670 | otherwise -> 671 let !off' = off + max (suff patI) (patI + occ c) 672 in if maxOff < off' 673 then seek past str future off' patEnd 674 else checkEnd (off' + patEnd) 675 676 matcherP !off !patI = 677 case strAt (off + patI) of 678 !c | c == patAt patI -> 679 if patI == 0 680 then let !pre = if off == 0 then [] else [S.take off str] 681 !post = S.drop off str 682 in (foldr (flip (.) . (:)) id past pre, post:future) 683 else matcherP off (patI - 1) 684 | otherwise -> 685 let !off' = off + max (suff patI) (patI + occ c) 686 in if maxOff < off' 687 then seek past str future off' patEnd 688 else checkEnd (off' + patEnd) 689 690 691------------------------------------------------------------------------------ 692-- Splitting Functions -- 693------------------------------------------------------------------------------ 694 695-- non-empty pattern 696lazySplitKeepFront :: S.ByteString -> [S.ByteString] -> [[S.ByteString]] 697lazySplitKeepFront pat = splitter' 698 where 699 !patLen = S.length pat 700 breaker = lazyBreak pat 701 splitter' strs = case splitter strs of 702 ([]:rest) -> rest 703 other -> other 704 splitter [] = [] 705 splitter strs = 706 case breaker strs of 707 (pre, mtch) -> 708 pre : case mtch of 709 [] -> [] 710 _ -> case lsplit patLen mtch of 711 (pt, rst) -> 712 if null rst 713 then [pt] 714 else let (h : t) = splitter rst 715 in (pt ++ h) : t 716 717-- non-empty pattern 718lazySplitKeepEnd :: S.ByteString -> [S.ByteString] -> [[S.ByteString]] 719lazySplitKeepEnd pat = splitter 720 where 721 !patLen = S.length pat 722 breaker = lazyBreak pat 723 splitter [] = [] 724 splitter strs = 725 case breaker strs of 726 (pre, mtch) -> 727 let (h : t) = if null mtch 728 then [[]] 729 else case lsplit patLen mtch of 730 (pt, rst) -> pt : splitter rst 731 in (pre ++ h) : t 732 733lazySplitDrop :: S.ByteString -> [S.ByteString] -> [[S.ByteString]] 734lazySplitDrop pat = splitter 735 where 736 !patLen = S.length pat 737 breaker = lazyBreak pat 738 splitter [] = [] 739 splitter strs = splitter' strs 740 splitter' [] = [[]] 741 splitter' strs = case breaker strs of 742 (pre,mtch) -> 743 pre : case mtch of 744 [] -> [] 745 _ -> splitter' (ldrop patLen mtch) 746 747------------------------------------------------------------------------------ 748-- Replacing Functions -- 749------------------------------------------------------------------------------ 750 751{- 752 753These would be really nice. 754Unfortunately they're too slow, so instead, there's another instance of 755almost the same code as in lazySearcher below. 756 757-- variant of below 758lazyFRepl :: S.ByteString -> ([S.ByteString] -> [S.ByteString]) 759 -> [S.ByteString] -> [S.ByteString] 760lazyFRepl pat = repl 761 where 762 !patLen = S.length pat 763 breaker = lazyBreak pat 764 repl sub = replacer 765 where 766 replacer [] = [] 767 replacer strs = 768 let (pre, mtch) = breaker strs 769 in pre ++ case mtch of 770 [] -> [] 771 _ -> sub (replacer (ldrop patLen mtch)) 772 773-- This is nice and short. I really hope it's performing well! 774lazyBRepl :: S.ByteString -> S.ByteString -> [S.ByteString] -> [S.ByteString] 775lazyBRepl pat !sub = replacer 776 where 777 !patLen = S.length pat 778 breaker = lazyBreak pat 779 replacer [] = [] 780 replacer strs = let (pre, mtch) = breaker strs 781 in pre ++ case mtch of 782 [] -> [] 783 _ -> sub : replacer (ldrop patLen mtch) 784-} 785 786-- Yet more code duplication. 787-- 788-- Benchmark it against an implementation using lazyBreak and, 789-- unless it's significantly faster, NUKE IT!! 790-- 791-- Sigh, it is significantly faster. 10 - 25 %. 792-- I could live with the 10, but 25 is too much. 793-- 794-- Hmm, maybe an implementation via 795-- replace pat sub = L.intercalate sub . split pat 796-- would be competitive now. 797-- TODO: test speed and space usage. 798-- 799-- replacing loop for lazy ByteStrings as list of chunks, 800-- called only for non-empty patterns 801lazyRepl :: S.ByteString -> ([S.ByteString] -> [S.ByteString]) 802 -> [S.ByteString] -> [S.ByteString] 803lazyRepl pat = replacer 804 where 805 !patLen = S.length pat 806 !patEnd = patLen - 1 807 !occT = occurs pat 808 !suffT = suffShifts pat 809 !maxLen = maxBound - patLen 810 !pe = patAt patEnd 811 812 {-# INLINE patAt #-} 813 patAt !i = unsafeIndex pat i 814 815 {-# INLINE occ #-} 816 occ !w = unsafeAt occT (fromIntegral w) 817 818 {-# INLINE suff #-} 819 suff !i = unsafeAt suffT i 820 821 replacer sub lst = 822 case lst of 823 [] -> [] 824 (h:t) -> 825 if maxLen < S.length h 826 then error "Overflow in BoyerMoore.lazyRepl" 827 else seek [] h t 0 patEnd 828 where 829 chop _ [] = [] 830 chop !k (!str : rest) 831 | k < s = 832 if maxLen < (s - k) 833 then error "Overflow in BoyerMoore.lazyRepl (chop)" 834 else seek [] (S.drop k str) rest 0 patEnd 835 | otherwise = chop (k-s) rest 836 where 837 !s = S.length str 838 839 seek :: [S.ByteString] -> S.ByteString -> [S.ByteString] 840 -> Int -> Int -> [S.ByteString] 841 seek !past !str fut !offset !patPos 842 | strPos < 0 = 843 case past of 844 [] -> error "not enough past!" 845 (h : t) -> seek t h (str : fut) (offset + S.length h) patPos 846 | strEnd < strPos = 847 case fut of 848 [] -> foldr (flip (.) . (:)) id past [str] 849 (h : t) -> 850 let !off' = offset - strLen 851 (past', !discharge) = keep (-off') (str : past) 852 in if maxLen < S.length h 853 then error "Overflow in BoyerMoore.lazyRepl (future)" 854 else foldr (flip (.) . (:)) id discharge $ 855 seek past' h t off' patPos 856 | patPos == patEnd = checkEnd strPos 857 | offset < 0 = matcherN offset patPos 858 | otherwise = matcherP offset patPos 859 where 860 {-# INLINE strAt #-} 861 strAt !i = unsafeIndex str i 862 863 !strLen = S.length str 864 !strEnd = strLen - 1 865 !maxOff = strLen - patLen 866 !strPos = offset + patPos 867 868 checkEnd !sI 869 | strEnd < sI = seek past str fut (sI - patEnd) patEnd 870 | otherwise = 871 case strAt sI of 872 !c | c == pe -> 873 if sI < patEnd 874 then (if sI == 0 875 then seek past str fut (-patEnd) (patEnd - 1) 876 else matcherN (sI - patEnd) (patEnd - 1)) 877 else matcherP (sI - patEnd) (patEnd - 1) 878 | otherwise -> checkEnd (sI + patEnd + occ c) 879 880 matcherN !off !patI = 881 case strAt (off + patI) of 882 !c | c == patAt patI -> 883 if off + patI == 0 884 then seek past str fut off (patI - 1) 885 else matcherN off (patI - 1) 886 | otherwise -> 887 let !off' = off + max (suff patI) (patI + occ c) 888 in if maxOff < off' 889 then seek past str fut off' patEnd 890 else checkEnd (off' + patEnd) 891 892 matcherP !off !patI = 893 case strAt (off + patI) of 894 !c | c == patAt patI -> 895 if patI == 0 896 then foldr (flip (.) . (:)) id past $ 897 let pre = if off == 0 898 then id 899 else (S.take off str :) 900 in pre . sub $ 901 let !p = off + patLen 902 in if p < strLen 903 then seek [] (S.drop p str) fut 0 patEnd 904 else chop (p - strLen) fut 905 else matcherP off (patI - 1) 906 | otherwise -> 907 let !off' = off + max (suff patI) (patI + occ c) 908 in if maxOff < off' 909 then seek past str fut off' patEnd 910 else checkEnd (off' + patEnd) 911