1{-# LANGUAGE BangPatterns #-}
2-- |
3-- Module         : Data.ByteString.Lazy.Search.DFA
4-- Copyright      : Daniel Fischer
5-- Licence        : BSD3
6-- Maintainer     : Daniel Fischer <daniel.is.fischer@googlemail.com>
7-- Stability      : Provisional
8-- Portability    : non-portable (BangPatterns)
9--
10-- Fast search of lazy 'L.ByteString' values. Breaking,
11-- splitting and replacing using a deterministic finite automaton.
12
13module Data.ByteString.Lazy.Search.DFA ( -- * Overview
14                                         -- $overview
15
16                                         -- ** Complexity and performance
17                                         -- $complexity
18
19                                         -- ** Partial application
20                                         -- $partial
21
22                                         -- * Finding substrings
23                                         indices
24                                       , nonOverlappingIndices
25                                         -- * Breaking on substrings
26                                       , breakOn
27                                       , breakAfter
28                                       , breakFindAfter
29                                         -- * Replacing
30                                       , replace
31                                         -- * Splitting
32                                       , split
33                                       , splitKeepEnd
34                                       , splitKeepFront
35                                       ) where
36
37import Data.ByteString.Search.Internal.Utils (automaton, keep, ldrop, lsplit)
38import Data.ByteString.Search.Substitution
39
40import qualified Data.ByteString as S
41import qualified Data.ByteString.Lazy as L
42import qualified Data.ByteString.Lazy.Internal as LI
43import Data.ByteString.Unsafe (unsafeIndex)
44
45import Data.Array.Base (unsafeAt)
46--import Data.Array.Unboxed (UArray)
47
48import Data.Bits
49import Data.Int (Int64)
50
51-- $overview
52--
53-- This module provides functions related to searching a substring within
54-- a string. The searching algorithm uses a deterministic finite automaton
55-- based on the Knuth-Morris-Pratt algorithm.
56-- The automaton is implemented as an array of @(patternLength + 1) * &#963;@
57-- state transitions, where &#963; is the alphabet size (256), so it is only
58-- suitable for short enough patterns, therefore the patterns in this module
59-- are required to be strict 'S.ByteString's.
60--
61-- When searching a pattern in a UTF-8-encoded 'L.ByteString', be aware that
62-- these functions work on bytes, not characters, so the indices are
63-- byte-offsets, not character offsets.
64
65-- $complexity
66--
67-- The time and space complexity of the preprocessing phase is
68-- /O/(@patternLength * &#963;@).
69-- The searching phase is /O/(@targetLength@), each target character is
70-- inspected only once.
71--
72-- In general the functions in this module have about the same performance as
73-- the corresponding functions using the Knuth-Morris-Pratt algorithm but
74-- are considerably slower than the Boyer-Moore functions. For very short
75-- patterns or, in the case of 'indices', patterns with a short period
76-- which occur often, however, times are close to or even below the
77-- Boyer-Moore times.
78
79-- $partial
80--
81-- All functions can usefully be partially applied. Given only a pattern,
82-- the automaton is constructed only once, allowing efficient re-use.
83
84------------------------------------------------------------------------------
85--                            Exported Functions                            --
86------------------------------------------------------------------------------
87
88-- | @'indices'@ finds the starting indices of all possibly overlapping
89--   occurrences of the pattern in the target string.
90--   If the pattern is empty, the result is @[0 .. 'length' target]@.
91{-# INLINE indices #-}
92indices :: S.ByteString     -- ^ Strict pattern to find
93        -> L.ByteString     -- ^ Lazy string to search
94        -> [Int64]          -- ^ Offsets of matches
95indices !pat = lazySearcher True pat . L.toChunks
96
97-- | @'nonOverlappingIndices'@ finds the starting indices of all
98--   non-overlapping occurrences of the pattern in the target string.
99--   It is more efficient than removing indices from the list produced
100--   by 'indices'.
101{-# INLINE nonOverlappingIndices #-}
102nonOverlappingIndices :: S.ByteString   -- ^ Strict pattern to find
103                      -> L.ByteString   -- ^ Lazy string to search
104                      -> [Int64]        -- ^ Offsets of matches
105nonOverlappingIndices !pat = lazySearcher False pat . L.toChunks
106
107-- | @'breakOn' pattern target@ splits @target@ at the first occurrence
108--   of @pattern@. If the pattern does not occur in the target, the
109--   second component of the result is empty, otherwise it starts with
110--   @pattern@. If the pattern is empty, the first component is empty.
111--   For a non-empty pattern, the first component is generated lazily,
112--   thus the first parts of it can be available before the pattern has
113--   been found or determined to be absent.
114--
115-- @
116--   'uncurry' 'L.append' . 'breakOn' pattern = 'id'
117-- @
118breakOn :: S.ByteString  -- ^ Strict pattern to search for
119        -> L.ByteString  -- ^ Lazy string to search in
120        -> (L.ByteString, L.ByteString)
121                         -- ^ Head and tail of string broken at substring
122breakOn pat = breaker . L.toChunks
123  where
124    lbrk = lazyBreaker True pat
125    breaker strs = let (f, b) = lbrk strs
126                   in (L.fromChunks f, L.fromChunks b)
127
128-- | @'breakAfter' pattern target@ splits @target@ behind the first occurrence
129--   of @pattern@. An empty second component means that either the pattern
130--   does not occur in the target or the first occurrence of pattern is at
131--   the very end of target. If you need to discriminate between those cases,
132--   use breakFindAfter.
133--   If the pattern is empty, the first component is empty.
134--   For a non-empty pattern, the first component is generated lazily,
135--   thus the first parts of it can be available before the pattern has
136--   been found or determined to be absent.
137-- @
138--   'uncurry' 'L.append' . 'breakAfter' pattern = 'id'
139-- @
140breakAfter :: S.ByteString  -- ^ Strict pattern to search for
141           -> L.ByteString  -- ^ Lazy string to search in
142           -> (L.ByteString, L.ByteString)
143                            -- ^ Head and tail of string broken after substring
144breakAfter pat = breaker . L.toChunks
145  where
146    lbrk = lazyBreaker False pat
147    breaker strs = let (f, b) = lbrk strs
148                   in (L.fromChunks f, L.fromChunks b)
149
150-- | @'breakFindAfter'@ does the same as 'breakAfter' but additionally indicates
151--   whether the pattern is present in the target.
152--
153-- @
154--   'fst' . 'breakFindAfter' pat = 'breakAfter' pat
155-- @
156breakFindAfter :: S.ByteString  -- ^ Strict pattern to search for
157               -> L.ByteString  -- ^ Lazy string to search in
158               -> ((L.ByteString, L.ByteString), Bool)
159                            -- ^ Head and tail of string broken after substring
160                            --   and presence of pattern
161breakFindAfter pat
162  | S.null pat  = \str -> ((L.empty, str), True)
163breakFindAfter pat = breaker . L.toChunks
164  where
165    !patLen = S.length pat
166    lbrk = lazyBreaker True pat
167    breaker strs = let (f, b) = lbrk strs
168                       (f1, b1) = lsplit patLen b
169                       mbpat = L.fromChunks f1
170                   in ((foldr LI.chunk mbpat f, L.fromChunks b1), not (null b))
171
172-- | @'replace' pat sub text@ replaces all (non-overlapping) occurrences of
173--   @pat@ in @text@ with @sub@. If occurrences of @pat@ overlap, the first
174--   occurrence that does not overlap with a replaced previous occurrence
175--   is substituted. Occurrences of @pat@ arising from a substitution
176--   will not be substituted. For example:
177--
178-- @
179--   'replace' \"ana\" \"olog\" \"banana\" = \"bologna\"
180--   'replace' \"ana\" \"o\" \"bananana\" = \"bono\"
181--   'replace' \"aab\" \"abaa\" \"aaabb\" = \"aabaab\"
182-- @
183--
184--   The result is a lazy 'L.ByteString',
185--   which is lazily produced, without copying.
186--   Equality of pattern and substitution is not checked, but
187--
188-- @
189--   'replace' pat pat text == text
190-- @
191--
192--   holds (the internal structure is generally different).
193--   If the pattern is empty but not the substitution, the result
194--   is equivalent to (were they 'String's) @cycle sub@.
195--
196--   For non-empty @pat@ and @sub@ a lazy 'L.ByteString',
197--
198-- @
199--   'L.concat' . 'Data.List.intersperse' sub . 'split' pat = 'replace' pat sub
200-- @
201--
202--   and analogous relations hold for other types of @sub@.
203replace :: Substitution rep
204        => S.ByteString     -- ^ Strict pattern to replace
205        -> rep              -- ^ Replacement string
206        -> L.ByteString     -- ^ Lazy string to modify
207        -> L.ByteString     -- ^ Lazy result
208replace pat
209  | S.null pat = \sub -> prependCycle sub
210  | otherwise =
211    let !patLen = S.length pat
212        breaker = lazyBreaker True pat
213        repl subst strs
214          | null strs   = []
215          | otherwise   =
216            let (pre, mtch) = breaker strs
217            in pre ++ case mtch of
218                        [] -> []
219                        _  -> subst (repl subst (ldrop patLen mtch))
220    in \sub -> let {-# NOINLINE subst #-}
221                   !subst = substitution sub
222                   repl1 = repl subst
223               in L.fromChunks . repl1 . L.toChunks
224
225
226-- | @'split' pattern target@ splits @target@ at each (non-overlapping)
227--   occurrence of @pattern@, removing @pattern@. If @pattern@ is empty,
228--   the result is an infinite list of empty 'L.ByteString's, if @target@
229--   is empty but not @pattern@, the result is an empty list, otherwise
230--   the following relations hold (where @patL@ is the lazy 'L.ByteString'
231--   corresponding to @pat@):
232--
233-- @
234--   'L.concat' . 'Data.List.intersperse' patL . 'split' pat = 'id',
235--   'length' ('split' pattern target) ==
236--               'length' ('nonOverlappingIndices' pattern target) + 1,
237-- @
238--
239--   no fragment in the result contains an occurrence of @pattern@.
240split :: S.ByteString   -- ^ Strict pattern to split on
241      -> L.ByteString   -- ^ Lazy string to split
242      -> [L.ByteString] -- ^ Fragments of string
243split pat
244  | S.null pat  = const (repeat L.empty)
245split pat = map L.fromChunks . splitter . L.toChunks
246  where
247    !patLen = S.length pat
248    breaker = lazyBreaker True pat
249    splitter strs
250      | null strs  = []
251      | otherwise  = splitter' strs
252    splitter' strs
253      | null strs  = [[]]
254      | otherwise  =
255        case breaker strs of
256          (pre, mtch) ->
257            pre : case mtch of
258                    [] -> []
259                    _  -> splitter' (ldrop patLen mtch)
260
261-- | @'splitKeepEnd' pattern target@ splits @target@ after each (non-overlapping)
262--   occurrence of @pattern@. If @pattern@ is empty, the result is an
263--   infinite list of empty 'L.ByteString's, otherwise the following
264--   relations hold:
265--
266-- @
267--   'L.concat' . 'splitKeepEnd' pattern = 'id,'
268-- @
269--
270--   all fragments in the result except possibly the last end with
271--   @pattern@, no fragment contains more than one occurrence of @pattern@.
272splitKeepEnd :: S.ByteString    -- ^ Strict pattern to split on
273             -> L.ByteString    -- ^ Lazy string to split
274             -> [L.ByteString]  -- ^ Fragments of string
275splitKeepEnd pat
276  | S.null pat = const (repeat L.empty)
277splitKeepEnd pat = map L.fromChunks . splitter . L.toChunks
278  where
279    breaker = lazyBreaker False pat
280    splitter [] = []
281    splitter strs =
282      case breaker strs of
283        (pre, mtch) -> pre : splitter mtch
284
285-- | @'splitKeepFront'@ is like 'splitKeepEnd', except that @target@ is split
286--   before each occurrence of @pattern@ and hence all fragments
287--   with the possible exception of the first begin with @pattern@.
288--   No fragment contains more than one non-overlapping occurrence
289--   of @pattern@.
290splitKeepFront :: S.ByteString    -- ^ Strict pattern to split on
291               -> L.ByteString    -- ^ Lazy string to split
292               -> [L.ByteString]  -- ^ Fragments of string
293splitKeepFront pat
294  | S.null pat  = const (repeat L.empty)
295splitKeepFront pat = map L.fromChunks . splitter . L.toChunks
296  where
297    !patLen = S.length pat
298    breaker = lazyBreaker True pat
299    splitter strs = case splitter' strs of
300                      ([] : rst) -> rst
301                      other -> other
302    splitter' []    = []
303    splitter' strs  =
304      case breaker strs of
305        (pre, mtch) ->
306          pre : case mtch of
307                  [] -> []
308                  _  -> case lsplit patLen mtch of
309                          (pt, rst) ->
310                            if null rst
311                              then [pt]
312                              else let (h : t) = splitter' rst
313                                   in (pt ++ h) : t
314
315------------------------------------------------------------------------------
316--                            Searching Function                            --
317------------------------------------------------------------------------------
318
319lazySearcher :: Bool -> S.ByteString -> [S.ByteString] -> [Int64]
320lazySearcher _ !pat
321    | S.null pat        =
322      let zgo _ [] = []
323          zgo !prior (!str : rest) =
324              let !l = S.length str
325                  !prior' = prior + fromIntegral l
326              in [prior + fromIntegral i | i <- [1 .. l]] ++ zgo prior' rest
327      in (0:) . zgo 0
328    | S.length pat == 1 =
329      let !w = S.head pat
330          ixes = S.elemIndices w
331          go _ [] = []
332          go !prior (!str : rest)
333            = let !prior' = prior + fromIntegral (S.length str)
334              in map ((+ prior) . fromIntegral) (ixes str) ++ go prior' rest
335      in go 0
336lazySearcher !overlap pat = search 0 0
337  where
338    !patLen = S.length pat
339    !auto   = automaton pat
340    !p0     = unsafeIndex pat 0
341    !ams    = if overlap then patLen else 0
342    search _ _ [] = []
343    search !prior st (!str:rest) = match st 0
344      where
345        !strLen = S.length str
346        {-# INLINE strAt #-}
347        strAt :: Int -> Int
348        strAt i = fromIntegral (str `unsafeIndex` i)
349        match 0 !idx
350          | idx == strLen = search (prior + fromIntegral strLen) 0 rest
351          | unsafeIndex str idx == p0   = match 1 (idx + 1)
352          | otherwise     = match 0 (idx + 1)
353        match state idx
354          | idx == strLen = search (prior + fromIntegral strLen) state rest
355          | otherwise     =
356            let nstate = unsafeAt auto ((state `shiftL` 8) + strAt idx)
357                !nxtIdx = idx + 1
358            in if nstate == patLen
359                then (prior + fromIntegral (nxtIdx - patLen)) :
360                            match ams nxtIdx
361                else match nstate nxtIdx
362
363------------------------------------------------------------------------------
364--                                 Breaking                                 --
365------------------------------------------------------------------------------
366
367-- Code duplication :(
368-- Needed for reasonable performance.
369lazyBreaker :: Bool -> S.ByteString -> [S.ByteString]
370                    -> ([S.ByteString], [S.ByteString])
371lazyBreaker before pat
372  | S.null pat  = \strs -> ([], strs)
373  | S.length pat == 1 =
374    let !w = S.head pat
375        !a = if before then 0 else 1
376        ixes = S.elemIndices w
377        scan [] = ([], [])
378        scan (!str:rest) =
379            let !strLen = S.length str
380            in case ixes str of
381                []  -> let (fr, bk) = scan rest in (str : fr, bk)
382                (i:_) -> let !j = i + a
383                         in if j == strLen
384                              then ([str],rest)
385                              else ([S.take j str], S.drop j str : rest)
386    in scan
387lazyBreaker !before pat = bscan [] 0
388  where
389    !patLen = S.length pat
390    !auto   = automaton pat
391    !p0     = unsafeIndex pat 0
392    bscan _ _ [] = ([], [])
393    bscan !past !sta (!str:rest) = match sta 0
394      where
395        !strLen = S.length str
396        {-# INLINE strAt #-}
397        strAt :: Int -> Int
398        strAt i = fromIntegral (str `unsafeIndex` i)
399        match 0 idx
400          | idx == strLen =
401            let (fr, bk) = bscan [] 0 rest
402            in (foldr (flip (.) . (:)) id past (str:fr), bk)
403          | unsafeIndex str idx == p0 = match 1 (idx + 1)
404          | otherwise = match 0 (idx + 1)
405        match state idx
406          | idx == strLen =
407            let (kp, !rl) = if before
408                                then keep state (str:past)
409                                else ([], str:past)
410                (fr, bk) = bscan kp state rest
411            in (foldr (flip (.) . (:)) id rl fr, bk)
412          | otherwise =
413            let !nstate = unsafeAt auto ((state `shiftL` 8) + strAt idx)
414                !nxtIdx = idx + 1
415            in if nstate == patLen
416                then case if before then nxtIdx - patLen else nxtIdx of
417                       0 -> (foldr (flip (.) . (:)) id past [], str:rest)
418                       stIx | stIx < 0 -> rgo (-stIx) (str:rest) past
419                            | stIx == strLen ->
420                              (foldr (flip (.) . (:)) id past [str],rest)
421                            | otherwise ->
422                              (foldr (flip (.) . (:)) id past
423                                    [S.take stIx str], S.drop stIx str : rest)
424                else match nstate nxtIdx
425
426
427-- Did I already mention that I suck at finding names?
428{-# INLINE rgo #-}
429rgo :: Int -> [S.ByteString] -> [S.ByteString]
430    -> ([S.ByteString], [S.ByteString])
431rgo !kp acc (!str:more)
432  | sl == kp    = (reverse more, str:acc)
433  | sl < kp     = rgo (kp - sl) (str:acc) more
434  | otherwise   = case S.splitAt (sl - kp) str of
435                    (fr, bk) ->
436                      (foldr (flip (.) . (:)) id more [fr], bk:acc)
437    where
438      !sl = S.length str
439rgo _ _ [] = error "Not enough past!"
440-- If that error is ever encountered, I screwed up badly.
441