1{-# LANGUAGE BangPatterns, ScopedTypeVariables #-}
2
3-- |
4-- Module      : Data.Text.Internal.Search
5-- Copyright   : (c) Bryan O'Sullivan 2009
6--
7-- License     : BSD-style
8-- Maintainer  : bos@serpentine.com
9-- Stability   : experimental
10-- Portability : GHC
11--
12-- Fast substring search for 'Text', based on work by Boyer, Moore,
13-- Horspool, Sunday, and Lundh.
14--
15-- References:
16--
17-- * R. S. Boyer, J. S. Moore: A Fast String Searching Algorithm.
18--   Communications of the ACM, 20, 10, 762-772 (1977)
19--
20-- * R. N. Horspool: Practical Fast Searching in Strings.  Software -
21--   Practice and Experience 10, 501-506 (1980)
22--
23-- * D. M. Sunday: A Very Fast Substring Search Algorithm.
24--   Communications of the ACM, 33, 8, 132-142 (1990)
25--
26-- * F. Lundh: The Fast Search Algorithm.
27--   <http://effbot.org/zone/stringlib.htm> (2006)
28
29module Data.Text.Internal.Search
30    (
31      indices
32    ) where
33
34import qualified Data.Text.Array as A
35import Data.Word (Word64)
36import Data.Text.Internal (Text(..))
37import Data.Bits ((.|.), (.&.))
38import Data.Text.Internal.Unsafe.Shift (shiftL)
39
40data T = {-# UNPACK #-} !Word64 :* {-# UNPACK #-} !Int
41
42-- | /O(n+m)/ Find the offsets of all non-overlapping indices of
43-- @needle@ within @haystack@.  The offsets returned represent
44-- uncorrected indices in the low-level \"needle\" array, to which its
45-- offset must be added.
46--
47-- In (unlikely) bad cases, this algorithm's complexity degrades
48-- towards /O(n*m)/.
49indices :: Text                -- ^ Substring to search for (@needle@)
50        -> Text                -- ^ Text to search in (@haystack@)
51        -> [Int]
52indices _needle@(Text narr noff nlen) _haystack@(Text harr hoff hlen)
53    | nlen == 1              = scanOne (nindex 0)
54    | nlen <= 0 || ldiff < 0 = []
55    | otherwise              = scan 0
56  where
57    ldiff    = hlen - nlen
58    nlast    = nlen - 1
59    z        = nindex nlast
60    nindex k = A.unsafeIndex narr (noff+k)
61    hindex k = A.unsafeIndex harr (hoff+k)
62    hindex' k | k == hlen  = 0
63              | otherwise = A.unsafeIndex harr (hoff+k)
64    buildTable !i !msk !skp
65        | i >= nlast           = (msk .|. swizzle z) :* skp
66        | otherwise            = buildTable (i+1) (msk .|. swizzle c) skp'
67        where c                = nindex i
68              skp' | c == z    = nlen - i - 2
69                   | otherwise = skp
70    swizzle k = 1 `shiftL` (fromIntegral k .&. 0x3f)
71    scan !i
72        | i > ldiff                  = []
73        | c == z && candidateMatch 0 = i : scan (i + nlen)
74        | otherwise                  = scan (i + delta)
75        where c = hindex (i + nlast)
76              candidateMatch !j
77                    | j >= nlast               = True
78                    | hindex (i+j) /= nindex j = False
79                    | otherwise                = candidateMatch (j+1)
80              delta | nextInPattern = nlen + 1
81                    | c == z        = skip + 1
82                    | otherwise     = 1
83                where nextInPattern = mask .&. swizzle (hindex' (i+nlen)) == 0
84              !(mask :* skip)       = buildTable 0 0 (nlen-2)
85    scanOne c = loop 0
86        where loop !i | i >= hlen     = []
87                      | hindex i == c = i : loop (i+1)
88                      | otherwise     = loop (i+1)
89{-# INLINE indices #-}
90