1{-# LANGUAGE BangPatterns #-}
2{-# LANGUAGE CPP          #-}
3{-# LANGUAGE MagicHash    #-}
4{-# LANGUAGE RankNTypes   #-}
5
6{-| An implementation of linear hash tables. (See
7<http://en.wikipedia.org/wiki/Linear_hashing>). Use this hash table if you...
8
9  * don't care that inserts and lookups are slower than the other hash table
10    implementations in this collection (this one is slightly faster than
11    @Data.HashTable@ from the base library in most cases)
12
13  * have a soft real-time or interactive application for which the risk of
14    introducing a long pause on insert while all of the keys are rehashed is
15    unacceptable.
16
17
18/Details:/
19
20Linear hashing allows for the expansion of the hash table one slot at a time,
21by moving a \"split\" pointer across an array of pointers to buckets. The
22number of buckets is always a power of two, and the bucket to look in is
23defined as:
24
25@
26bucket(level,key) = hash(key) mod (2^level)
27@
28
29The \"split pointer\" controls the expansion of the hash table. If the hash
table is at level @k@ (i.e. @2^k@ buckets have been allocated), we first
calculate @b=bucket(k-1,key)@. If @b < splitptr@, the destination bucket is
calculated as @b'=bucket(k,key)@, otherwise the original value @b@ is used.
33
34The split pointer is incremented once an insert causes some bucket to become
35fuller than some predetermined threshold; the bucket at the split pointer
36(*not* the bucket which triggered the split!) is then rehashed, and half of its
37keys can be expected to be rehashed into the upper half of the table.
38
39When the split pointer reaches the middle of the bucket array, the size of the
40bucket array is doubled, the level increases, and the split pointer is reset to
41zero.
42
43Linear hashing, although not quite as fast for inserts or lookups as the
44implementation of linear probing included in this package, is well suited for
45interactive applications because it has much better worst case behaviour on
46inserts. Other hash table implementations can suffer from long pauses, because
47it is occasionally necessary to rehash all of the keys when the table grows.
48Linear hashing, on the other hand, only ever rehashes a bounded (effectively
49constant) number of keys when an insert forces a bucket split.
50
51/Space overhead: experimental results/
52
53In randomized testing (see @test\/compute-overhead\/ComputeOverhead.hs@ in the
54source distribution), mean overhead is approximately 1.51 machine words per
55key-value mapping with a very low standard deviation of about 0.06 words, 1.60
56words per mapping at the 95th percentile.
57
58/Unsafe tricks/
59
If the @unsafe-tricks@ flag is on when this package is built (and it is on by
61default), we use some unsafe tricks (namely 'unsafeCoerce#' and
62'reallyUnsafePtrEquality#') to save indirections in this table. These
63techniques rely on assumptions about the behaviour of the GHC runtime system
64and, although they've been tested and should be safe under normal conditions,
65are slightly dangerous. Caveat emptor. In particular, these techniques are
66incompatible with HPC code coverage reports.
67
68
69References:
70
71  * W. Litwin. Linear hashing: a new tool for file and table addressing. In
72    /Proc. 6th International Conference on Very Large Data Bases, Volume 6/,
73    pp. 212-223, 1980.
74
75  * P-A. Larson. Dynamic hash tables. /Communications of the ACM/ 31:
76    446-457, 1988.
77-}
78
79module Data.HashTable.ST.Linear
80  ( HashTable
81  , new
82  , newSized
83  , delete
84  , lookup
85  , insert
86  , mutate
87  , mutateST
88  , mapM_
89  , foldM
90  , computeOverhead
91  ) where
92
93------------------------------------------------------------------------------
94#if !MIN_VERSION_base(4,8,0)
95import           Control.Applicative
96import           Data.Word
97#endif
98import           Control.Monad                         hiding (foldM, mapM_)
99import           Control.Monad.ST
100import           Data.Bits
101import           Data.Hashable
102import           Data.STRef
103import           Prelude                               hiding (lookup, mapM_)
104------------------------------------------------------------------------------
105import qualified Data.HashTable.Class                  as C
106import           Data.HashTable.Internal.Array
107import           Data.HashTable.Internal.Linear.Bucket (Bucket)
108import qualified Data.HashTable.Internal.Linear.Bucket as Bucket
109import           Data.HashTable.Internal.Utils
110
111#ifdef DEBUG
112import           System.IO
113#endif
114
115
116------------------------------------------------------------------------------
117-- | A linear hash table.
118newtype HashTable s k v = HT (STRef s (HashTable_ s k v))
119
120data HashTable_ s k v = HashTable
121    { _level    :: {-# UNPACK #-} !Int
122    , _splitptr :: {-# UNPACK #-} !Int
123    , _buckets  :: {-# UNPACK #-} !(MutableArray s (Bucket s k v))
124    }
125
126
127------------------------------------------------------------------------------
128instance C.HashTable HashTable where
129    new             = new
130    newSized        = newSized
131    insert          = insert
132    delete          = delete
133    lookup          = lookup
134    foldM           = foldM
135    mapM_           = mapM_
136    lookupIndex     = lookupIndex
137    nextByIndex     = nextByIndex
138    computeOverhead = computeOverhead
139    mutate          = mutate
140    mutateST        = mutateST
141
142
143------------------------------------------------------------------------------
144instance Show (HashTable s k v) where
145    show _ = "<HashTable>"
146
147
148------------------------------------------------------------------------------
149-- | See the documentation for this function in
150-- "Data.HashTable.Class#v:new".
151new :: ST s (HashTable s k v)
152new = do
153    v <- Bucket.newBucketArray 2
154    newRef $ HashTable 1 0 v
155
156
157------------------------------------------------------------------------------
158-- | See the documentation for this function in
159-- "Data.HashTable.Class#v:newSized".
160newSized :: Int -> ST s (HashTable s k v)
161newSized n = do
162    v <- Bucket.newBucketArray sz
163    newRef $ HashTable lvl 0 v
164
165  where
166    k   = ceiling (fromIntegral n * fillFactor / fromIntegral bucketSplitSize)
167    lvl = max 1 (fromEnum $ log2 k)
168    sz  = power2 lvl
169
170
171
172------------------------------------------------------------------------------
-- | See the documentation for this function in
-- "Data.HashTable.Class#v:delete".
delete :: (Hashable k, Eq k) =>
          (HashTable s k v)
       -> k
       -> ST s ()
delete htRef !k = readRef htRef >>= work
  where
    work (HashTable lvl splitptr buckets) = do
        let !h0 = hashKey lvl splitptr k
        -- Fix: the original trace ran the "h0=<n>" and "splitptr: <n>"
        -- fields together with no separator; add ", " so logs are readable.
        debug $ "delete: size=" ++ show (power2 lvl) ++ ", h0=" ++ show h0
                  ++ ", splitptr: " ++ show splitptr
        delete' buckets h0 k
{-# INLINE delete #-}
187
188
189------------------------------------------------------------------------------
-- | See the documentation for this function in
-- "Data.HashTable.Class#v:lookup".
lookup :: (Eq k, Hashable k) => (HashTable s k v) -> k -> ST s (Maybe v)
lookup htRef !k = readRef htRef >>= search
  where
    -- Address the bucket for the key, then search within it.
    search (HashTable lvl splitptr buckets) = do
        bucket <- readArray buckets (hashKey lvl splitptr k)
        Bucket.lookup bucket k
{-# INLINE lookup #-}
200
201
202------------------------------------------------------------------------------
-- | See the documentation for this function in
-- "Data.HashTable.Class#v:insert".
insert :: (Eq k, Hashable k) =>
          (HashTable s k v)
       -> k
       -> v
       -> ST s ()
insert htRef k v = readRef htRef >>= step >>= writeRef htRef
  where
    -- Replace any existing mapping, then split a bucket if the destination
    -- bucket has grown past the threshold.
    step ht@(HashTable lvl splitptr buckets) = do
        let !h0 = hashKey lvl splitptr k
        delete' buckets h0 k
        newSize <- primitiveInsert' buckets h0 k v

        if checkOverflow newSize
          then do
            debug "insert: splitting"
            ht' <- split ht
            debug "insert: done splitting"
            return ht'
          else do
            debug "insert: done"
            return ht
{-# INLINE insert #-}
229
230
231------------------------------------------------------------------------------
-- | See the documentation for this function in
-- "Data.HashTable.Class#v:mutate".
mutate :: (Eq k, Hashable k) =>
          (HashTable s k v)
       -> k
       -> (Maybe v -> (Maybe v, a))
       -> ST s a
mutate htRef k f = mutateST htRef k (return . f)
{-# INLINE mutate #-}
239
240
241------------------------------------------------------------------------------
-- | See the documentation for this function in
-- "Data.HashTable.Class#v:mutateST".
mutateST :: (Eq k, Hashable k) =>
            (HashTable s k v)
         -> k
         -> (Maybe v -> ST s (Maybe v, a))
         -> ST s a
mutateST htRef k f = do
    ht0 <- readRef htRef
    (ht1, result) <- step ht0
    writeRef htRef ht1
    return result
  where
    step ht@(HashTable lvl splitptr buckets) = do
        let !slot = hashKey lvl splitptr k
        bucket <- readArray buckets slot
        (!newSize, newBucket, result) <- Bucket.mutateST bucket k f
        -- The bucket may have been reallocated; store it back if so.
        case newBucket of
          Nothing -> return ()
          Just b  -> writeArray buckets slot b
        if checkOverflow newSize
          then do
            ht' <- split ht
            return (ht', result)
          else return (ht, result)
264
265
266------------------------------------------------------------------------------
-- | See the documentation for this function in
-- "Data.HashTable.Class#v:mapM_".
mapM_ :: ((k,v) -> ST s b) -> HashTable s k v -> ST s ()
mapM_ f htRef = readRef htRef >>= start
  where
    start (HashTable lvl _ buckets) = visit 0
      where
        !numBuckets = power2 lvl

        -- Apply the action to every bucket in index order.
        visit !i
          | i >= numBuckets = return ()
          | otherwise = do
              readArray buckets i >>= Bucket.mapM_ f
              visit (i+1)
281
282
283------------------------------------------------------------------------------
-- | See the documentation for this function in
-- "Data.HashTable.Class#v:foldM".
foldM :: (a -> (k,v) -> ST s a)
      -> a -> HashTable s k v
      -> ST s a
foldM f seed0 htRef = readRef htRef >>= start
  where
    start (HashTable lvl _ buckets) = loop 0 seed0
      where
        !numBuckets = power2 lvl

        -- Strict left fold over every bucket in index order.
        loop !i !acc
          | i >= numBuckets = return acc
          | otherwise = do
              b <- readArray buckets i
              !acc' <- Bucket.foldM f acc b
              loop (i+1) acc'
300
301
302------------------------------------------------------------------------------
-- | See the documentation for this function in
-- "Data.HashTable.Class#v:computeOverhead".
computeOverhead :: HashTable s k v -> ST s Double
computeOverhead htRef = readRef htRef >>= work
  where
    work (HashTable lvl _ buckets) = do
        (total, extra) <- tally 0 0 0
        return $ (fromIntegral numBuckets + fixedCost + fromIntegral extra)
                   / fromIntegral total
      where
        -- Flat per-table word cost added to the per-bucket overhead.
        fixedCost = 5.0

        !numBuckets = power2 lvl

        -- Accumulate element count and overhead words across all buckets.
        tally !n !o !i
          | i >= numBuckets = return (n, o)
          | otherwise = do
              b <- readArray buckets i
              (!bn, !bo) <- Bucket.nelemsAndOverheadInWords b
              tally (n + bn) (o + bo) (i+1)
329
330
331------------------------------
332-- Private functions follow --
333------------------------------
334
335------------------------------------------------------------------------------
-- | Remove any existing mapping for the key from the bucket at the given
-- index, discarding the bucket's result value.
delete' :: Eq k =>
           MutableArray s (Bucket s k v)
        -> Int
        -> k
        -> ST s ()
delete' buckets h0 k = do
    bucket <- readArray buckets h0
    void $ Bucket.delete bucket k
345
346
347------------------------------------------------------------------------------
-- | Advance the split pointer by one step: empty the bucket at the split
-- pointer, rehash its contents (each key either stays put or moves to the
-- corresponding bucket in the upper half of the table), and double the
-- bucket array when the pointer has swept the whole lower half. Returns
-- the updated table record; the caller writes it back into the 'STRef'.
split :: (Hashable k) =>
         (HashTable_ s k v)
      -> ST s (HashTable_ s k v)
split ht@(HashTable lvl splitptr buckets) = do
    debug $ "split: start: nbuck=" ++ show (power2 lvl)
              ++ ", splitptr=" ++ show splitptr

    -- grab bucket at splitPtr
    oldBucket <- readArray buckets splitptr

    nelems <- Bucket.size oldBucket
    -- Size each destination bucket at 5/8 of the old population (but no
    -- smaller than the minimum bucket size): roughly half the keys are
    -- expected to land in each half, plus some slack.
    let !bsz = max Bucket.newBucketSize $
                   ceiling $ (0.625 :: Double) * fromIntegral nelems

    -- write an empty bucket there
    dbucket1 <- Bucket.emptyWithSize bsz
    writeArray buckets splitptr dbucket1

    -- grow the buckets?
    let lvl2 = power2 lvl
    let lvl1 = power2 $ lvl-1

    -- When the split pointer reaches the middle of the bucket array,
    -- double the array, bump the level, and reset the pointer to zero;
    -- otherwise just advance the pointer.
    (!buckets',!lvl',!sp') <-
        if splitptr+1 >= lvl1
          then do
            debug $ "split: resizing bucket array"
            let lvl3 = 2*lvl2
            b <- Bucket.expandBucketArray lvl3 lvl2 buckets
            debug $ "split: resizing bucket array: done"
            return (b,lvl+1,0)
          else return (buckets,lvl,splitptr+1)

    let ht' = HashTable lvl' sp' buckets'

    -- make sure the other split bucket has enough room in it also
    let splitOffs = splitptr + lvl1
    db2   <- readArray buckets' splitOffs
    db2sz <- Bucket.size db2
    let db2sz' = db2sz + bsz
    db2'  <- Bucket.growBucketTo db2sz' db2
    debug $ "growing bucket at " ++ show splitOffs ++ " to size "
              ++ show db2sz'
    writeArray buckets' splitOffs db2'

    -- rehash old bucket
    debug $ "split: rehashing bucket"
    let f = uncurry $ primitiveInsert ht'
    -- NOTE(review): 'forceSameType' looks like a pure type-inference aid
    -- (it unifies f's type with that of an insert into the old table) with
    -- no runtime effect -- confirm against
    -- "Data.HashTable.Internal.Utils".
    forceSameType f (uncurry $ primitiveInsert ht)

    Bucket.mapM_ f oldBucket
    debug $ "split: done"
    return ht'
400
401
402------------------------------------------------------------------------------
-- | True when a bucket's size has exceeded 'bucketSplitSize', meaning the
-- table should split the bucket at the current split pointer.
checkOverflow :: Int -> Bool
checkOverflow = (> bucketSplitSize)
405
406
407------------------------------------------------------------------------------
-- | Insert without checking for overflow or splitting; used by 'split' to
-- rehash the contents of the old bucket. Returns the destination bucket's
-- new size.
primitiveInsert :: (Hashable k) =>
                   (HashTable_ s k v)
                -> k
                -> v
                -> ST s Int
primitiveInsert (HashTable lvl splitptr buckets) k v = do
    debug $ "primitiveInsert start: nbuckets=" ++ show (power2 lvl)
    primitiveInsert' buckets (hashKey lvl splitptr k) k v
418
419
420------------------------------------------------------------------------------
-- | Append a key/value pair to the bucket at the given index, storing the
-- (possibly reallocated) bucket back into the array. Returns the bucket's
-- new size.
primitiveInsert' :: MutableArray s (Bucket s k v)
                 -> Int
                 -> k
                 -> v
                 -> ST s Int
primitiveInsert' buckets !h0 !k !v = do
    debug $ "primitiveInsert': bucket number=" ++ show h0
    bucket <- readArray buckets h0
    debug $ "primitiveInsert': snoccing bucket"
    (!newSize, resized) <- Bucket.snoc bucket k v
    debug $ "primitiveInsert': bucket snoc'd"
    -- 'snoc' may reallocate the bucket; store the new one back if so.
    case resized of
      Nothing -> return ()
      Just b' -> writeArray buckets h0 b'
    return newSize
436
437
438
439
440------------------------------------------------------------------------------
-- | Load-factor constant used by 'newSized' to estimate how many buckets
-- are needed for a requested capacity.
fillFactor :: Double
fillFactor = 1.3
443
444
445------------------------------------------------------------------------------
-- | Bucket-size threshold (re-exported from the bucket module); a bucket
-- growing past this triggers a split (see 'checkOverflow').
bucketSplitSize :: Int
bucketSplitSize = Bucket.bucketSplitSize
448
449
450------------------------------------------------------------------------------
-- | @power2 i == 2^i@, computed by shifting.
{-# INLINE power2 #-}
power2 :: Int -> Int
power2 i = iShiftL 1 i
454
455
456------------------------------------------------------------------------------
-- | Bucket index for a key, per the linear-hashing addressing rule: hash
-- at level @lvl-1@ first; if the result falls below the split pointer that
-- bucket has already been split, so rehash at level @lvl@.
{-# INLINE hashKey #-}
hashKey :: (Hashable k) => Int -> Int -> k -> Int
hashKey !lvl !splitptr !k
    | lowHash < splitptr = hashAtLvl lvl k
    | otherwise          = lowHash
  where
    !lowHash = hashAtLvl (lvl-1) k
465
466
467------------------------------------------------------------------------------
-- | Hash a key and keep its low @lvl@ bits, i.e. @hash k `mod` 2^lvl@.
{-# INLINE hashAtLvl #-}
hashAtLvl :: (Hashable k) => Int -> k -> Int
hashAtLvl !lvl !k = hash k .&. (power2 lvl - 1)
475
476
477------------------------------------------------------------------------------
-- | Allocate the reference cell wrapping the mutable table state.
newRef :: HashTable_ s k v -> ST s (HashTable s k v)
newRef = fmap HT . newSTRef
480
-- | Replace the table state held in the reference cell.
writeRef :: HashTable s k v -> HashTable_ s k v -> ST s ()
writeRef (HT ref) = writeSTRef ref
483
-- | Read the current table state out of the reference cell.
readRef :: HashTable s k v -> ST s (HashTable_ s k v)
readRef (HT ref) = readSTRef ref
486
487
488------------------------------------------------------------------------------
-- | Trace a message when compiled with @-DDEBUG@; under @-DTESTSUITE@ the
-- message string is forced but nothing is printed; otherwise a no-op.
{-# INLINE debug #-}
debug :: String -> ST s ()

#ifdef DEBUG
-- Debug build: print and flush immediately so traces survive a crash.
debug s = unsafeIOToST $ do
              putStrLn s
              hFlush stdout
#else
#ifdef TESTSUITE
-- Test build: force the message (via its length) but emit nothing.
debug !s = do
    let !_ = length s
    return $! ()
#else
-- Normal build: do nothing.
debug _ = return ()
#endif
#endif
505
506
507------------------------------------------------------------------------------
-- | See the documentation for this function in
-- "Data.HashTable.Class#v:lookupIndex".
lookupIndex :: (Eq k, Hashable k) => HashTable s k v -> k -> ST s (Maybe Word)
lookupIndex htRef !k = readRef htRef >>= go
  where
    go (HashTable lvl splitptr buckets) = do
        let h0 = hashKey lvl splitptr k
        bucket <- readArray buckets h0
        found  <- Bucket.lookupIndex bucket k
        -- Pack (bucket, slot) into a single Word, forcing the result.
        case found of
          Nothing -> return Nothing
          Just ix -> let !w = encodeIndex lvl h0 ix
                     in return (Just w)
{-# INLINE lookupIndex #-}
520
-- | Pack a bucket index and an in-bucket element index into one 'Word':
-- the bucket number in the high bits, the element index in the low bits
-- (see 'indexOffset').
encodeIndex :: Int -> Int -> Int -> Word
encodeIndex lvl bucketIx elemIx =
    (fromIntegral bucketIx `shiftL` off) .|. fromIntegral elemIx
  where
    off = indexOffset lvl
{-# INLINE encodeIndex #-}
526
-- | Inverse of 'encodeIndex': split a packed 'Word' back into the bucket
-- index and the in-bucket element index.
decodeIndex :: Int -> Word -> (Int, Int)
decodeIndex lvl ix = (fromIntegral bucketPart, fromIntegral elemPart)
  where
    off        = indexOffset lvl
    bucketPart = ix `shiftR` off
    elemPart   = (bit off - 1) .&. ix
{-# INLINE decodeIndex #-}
534
-- | Number of low bits reserved for the element index inside a packed
-- index 'Word'; the remaining @lvl@ high bits hold the bucket number.
indexOffset :: Int -> Int
indexOffset lvl = wordBits - lvl
  where
    wordBits = finiteBitSize (0 :: Word)
{-# INLINE indexOffset #-}
538
-- | See the documentation for this function in
-- "Data.HashTable.Class#v:nextByIndex".
nextByIndex :: HashTable s k v -> Word -> ST s (Maybe (Word,k,v))
nextByIndex htRef !k = readRef htRef >>= work
  where
    work (HashTable lvl _ buckets) = uncurry scan (decodeIndex lvl k)
      where
        numBuckets = power2 lvl

        -- Walk forward from (bucket, slot) until an element is found or we
        -- run off the end of the bucket array.
        scan h ix
          | h < 0 || h >= numBuckets = return Nothing
          | otherwise = do
              bucket <- readArray buckets h
              found  <- Bucket.elemAt bucket ix
              case found of
                Just (k', v) ->
                  let !encoded = encodeIndex lvl h ix
                  in return (Just (encoded, k', v))
                Nothing -> scan (h+1) 0

{-# INLINE nextByIndex #-}
560