1{-# LANGUAGE BangPatterns, ScopedTypeVariables #-} 2 3-- | 4-- Module : Data.Text.Internal.Search 5-- Copyright : (c) Bryan O'Sullivan 2009 6-- 7-- License : BSD-style 8-- Maintainer : bos@serpentine.com 9-- Stability : experimental 10-- Portability : GHC 11-- 12-- Fast substring search for 'Text', based on work by Boyer, Moore, 13-- Horspool, Sunday, and Lundh. 14-- 15-- References: 16-- 17-- * R. S. Boyer, J. S. Moore: A Fast String Searching Algorithm. 18-- Communications of the ACM, 20, 10, 762-772 (1977) 19-- 20-- * R. N. Horspool: Practical Fast Searching in Strings. Software - 21-- Practice and Experience 10, 501-506 (1980) 22-- 23-- * D. M. Sunday: A Very Fast Substring Search Algorithm. 24-- Communications of the ACM, 33, 8, 132-142 (1990) 25-- 26-- * F. Lundh: The Fast Search Algorithm. 27-- <http://effbot.org/zone/stringlib.htm> (2006) 28 29module Data.Text.Internal.Search 30 ( 31 indices 32 ) where 33 34import qualified Data.Text.Array as A 35import Data.Word (Word64) 36import Data.Text.Internal (Text(..)) 37import Data.Bits ((.|.), (.&.)) 38import Data.Text.Internal.Unsafe.Shift (shiftL) 39 40data T = {-# UNPACK #-} !Word64 :* {-# UNPACK #-} !Int 41 42-- | /O(n+m)/ Find the offsets of all non-overlapping indices of 43-- @needle@ within @haystack@. The offsets returned represent 44-- uncorrected indices in the low-level \"needle\" array, to which its 45-- offset must be added. 46-- 47-- In (unlikely) bad cases, this algorithm's complexity degrades 48-- towards /O(n*m)/. 49indices :: Text -- ^ Substring to search for (@needle@) 50 -> Text -- ^ Text to search in (@haystack@) 51 -> [Int] 52indices _needle@(Text narr noff nlen) _haystack@(Text harr hoff hlen) 53 | nlen == 1 = scanOne (nindex 0) 54 | nlen <= 0 || ldiff < 0 = [] 55 | otherwise = scan 0 56 where 57 ldiff = hlen - nlen 58 nlast = nlen - 1 59 z = nindex nlast 60 nindex k = A.unsafeIndex narr (noff+k) 61 hindex k = A.unsafeIndex harr (hoff+k) 62 hindex' k | k == hlen = 0 63 | otherwise = A.unsafeIndex harr (hoff+k) 64 buildTable !i !msk !skp 65 | i >= nlast = (msk .|. swizzle z) :* skp 66 | otherwise = buildTable (i+1) (msk .|. swizzle c) skp' 67 where c = nindex i 68 skp' | c == z = nlen - i - 2 69 | otherwise = skp 70 swizzle k = 1 `shiftL` (fromIntegral k .&. 0x3f) 71 scan !i 72 | i > ldiff = [] 73 | c == z && candidateMatch 0 = i : scan (i + nlen) 74 | otherwise = scan (i + delta) 75 where c = hindex (i + nlast) 76 candidateMatch !j 77 | j >= nlast = True 78 | hindex (i+j) /= nindex j = False 79 | otherwise = candidateMatch (j+1) 80 delta | nextInPattern = nlen + 1 81 | c == z = skip + 1 82 | otherwise = 1 83 where nextInPattern = mask .&. swizzle (hindex' (i+nlen)) == 0 84 !(mask :* skip) = buildTable 0 0 (nlen-2) 85 scanOne c = loop 0 86 where loop !i | i >= hlen = [] 87 | hindex i == c = i : loop (i+1) 88 | otherwise = loop (i+1) 89{-# INLINE indices #-} 90