1{-# LANGUAGE BangPatterns #-} 2{-# LANGUAGE CPP #-} 3{-# LANGUAGE MagicHash #-} 4{-# LANGUAGE RankNTypes #-} 5 6{-| An implementation of linear hash tables. (See 7<http://en.wikipedia.org/wiki/Linear_hashing>). Use this hash table if you... 8 9 * don't care that inserts and lookups are slower than the other hash table 10 implementations in this collection (this one is slightly faster than 11 @Data.HashTable@ from the base library in most cases) 12 13 * have a soft real-time or interactive application for which the risk of 14 introducing a long pause on insert while all of the keys are rehashed is 15 unacceptable. 16 17 18/Details:/ 19 20Linear hashing allows for the expansion of the hash table one slot at a time, 21by moving a \"split\" pointer across an array of pointers to buckets. The 22number of buckets is always a power of two, and the bucket to look in is 23defined as: 24 25@ 26bucket(level,key) = hash(key) mod (2^level) 27@ 28 29The \"split pointer\" controls the expansion of the hash table. If the hash 30table is at level @k@ (i.e. @2^k@ buckets have been allocated), we first 31calculate @b=bucket(level-1,key)@. If @b < splitptr@, the destination bucket is 32calculated as @b'=bucket(level,key)@, otherwise the original value @b@ is used. 33 34The split pointer is incremented once an insert causes some bucket to become 35fuller than some predetermined threshold; the bucket at the split pointer 36(*not* the bucket which triggered the split!) is then rehashed, and half of its 37keys can be expected to be rehashed into the upper half of the table. 38 39When the split pointer reaches the middle of the bucket array, the size of the 40bucket array is doubled, the level increases, and the split pointer is reset to 41zero. 42 43Linear hashing, although not quite as fast for inserts or lookups as the 44implementation of linear probing included in this package, is well suited for 45interactive applications because it has much better worst case behaviour on 46inserts. 
Other hash table implementations can suffer from long pauses, because it is
occasionally necessary to rehash all of the keys when the table grows. Linear
hashing, on the other hand, only ever rehashes a bounded (effectively
constant) number of keys when an insert forces a bucket split.

/Space overhead: experimental results/

In randomized testing (see @test\/compute-overhead\/ComputeOverhead.hs@ in the
source distribution), mean overhead is approximately 1.51 machine words per
key-value mapping with a very low standard deviation of about 0.06 words, 1.60
words per mapping at the 95th percentile.

/Unsafe tricks/

If the @unsafe-tricks@ flag is on when this package is built (and it is on by
default), we use some unsafe tricks (namely 'unsafeCoerce#' and
'reallyUnsafePtrEquality#') to save indirections in this table. These
techniques rely on assumptions about the behaviour of the GHC runtime system
and, although they've been tested and should be safe under normal conditions,
are slightly dangerous. Caveat emptor. In particular, these techniques are
incompatible with HPC code coverage reports.


References:

  * W. Litwin. Linear hashing: a new tool for file and table addressing. In
    /Proc. 6th International Conference on Very Large Data Bases, Volume 6/,
    pp. 212-223, 1980.

  * P-A. Larson. Dynamic hash tables. /Communications of the ACM/ 31:
    446-457, 1988.
77-} 78 79module Data.HashTable.ST.Linear 80 ( HashTable 81 , new 82 , newSized 83 , delete 84 , lookup 85 , insert 86 , mutate 87 , mutateST 88 , mapM_ 89 , foldM 90 , computeOverhead 91 ) where 92 93------------------------------------------------------------------------------ 94#if !MIN_VERSION_base(4,8,0) 95import Control.Applicative 96import Data.Word 97#endif 98import Control.Monad hiding (foldM, mapM_) 99import Control.Monad.ST 100import Data.Bits 101import Data.Hashable 102import Data.STRef 103import Prelude hiding (lookup, mapM_) 104------------------------------------------------------------------------------ 105import qualified Data.HashTable.Class as C 106import Data.HashTable.Internal.Array 107import Data.HashTable.Internal.Linear.Bucket (Bucket) 108import qualified Data.HashTable.Internal.Linear.Bucket as Bucket 109import Data.HashTable.Internal.Utils 110 111#ifdef DEBUG 112import System.IO 113#endif 114 115 116------------------------------------------------------------------------------ 117-- | A linear hash table. 
newtype HashTable s k v = HT (STRef s (HashTable_ s k v))

-- Internal table state. @2^_level@ buckets are currently allocated, and
-- @_splitptr@ marks the next bucket due to be split (see the module header
-- for the linear-hashing scheme).
data HashTable_ s k v = HashTable
    { _level    :: {-# UNPACK #-} !Int
    , _splitptr :: {-# UNPACK #-} !Int
    , _buckets  :: {-# UNPACK #-} !(MutableArray s (Bucket s k v))
    }


------------------------------------------------------------------------------
instance C.HashTable HashTable where
    new             = new
    newSized        = newSized
    insert          = insert
    delete          = delete
    lookup          = lookup
    foldM           = foldM
    mapM_           = mapM_
    lookupIndex     = lookupIndex
    nextByIndex     = nextByIndex
    computeOverhead = computeOverhead
    mutate          = mutate
    mutateST        = mutateST


------------------------------------------------------------------------------
instance Show (HashTable s k v) where
    show _ = "<HashTable>"


------------------------------------------------------------------------------
-- | See the documentation for this function in
-- "Data.HashTable.Class#v:new".
new :: ST s (HashTable s k v)
new = do
    -- Start at level 1: two buckets, split pointer at zero.
    v <- Bucket.newBucketArray 2
    newRef $ HashTable 1 0 v


------------------------------------------------------------------------------
-- | See the documentation for this function in
-- "Data.HashTable.Class#v:newSized".
newSized :: Int -> ST s (HashTable s k v)
newSized n = do
    v <- Bucket.newBucketArray sz
    newRef $ HashTable lvl 0 v

  where
    -- Number of buckets needed to hold @n@ mappings without exceeding the
    -- fill factor; the level is the base-2 log of that, clamped to >= 1.
    k   = ceiling (fromIntegral n * fillFactor / fromIntegral bucketSplitSize)
    lvl = max 1 (fromEnum $ log2 k)
    sz  = power2 lvl


------------------------------------------------------------------------------
-- | See the documentation for this function in
-- "Data.HashTable.Class#v:delete".
delete :: (Hashable k, Eq k) =>
          (HashTable s k v)
       -> k
       -> ST s ()
delete htRef !k = readRef htRef >>= work
  where
    work (HashTable lvl splitptr buckets) = do
        let !h0 = hashKey lvl splitptr k
        -- Fixed debug output: previously "h0=" ++ show h0 ++ "splitptr: "
        -- ran the two values together with no separator.
        debug $ "delete: size=" ++ show (power2 lvl) ++ ", h0=" ++ show h0
                ++ ", splitptr: " ++ show splitptr
        delete' buckets h0 k
{-# INLINE delete #-}


------------------------------------------------------------------------------
-- | See the documentation for this function in
-- "Data.HashTable.Class#v:lookup".
lookup :: (Eq k, Hashable k) => (HashTable s k v) -> k -> ST s (Maybe v)
lookup htRef !k = readRef htRef >>= work
  where
    work (HashTable lvl splitptr buckets) = do
        let h0 = hashKey lvl splitptr k
        bucket <- readArray buckets h0
        Bucket.lookup bucket k
{-# INLINE lookup #-}


------------------------------------------------------------------------------
-- | See the documentation for this function in
-- "Data.HashTable.Class#v:insert".
insert :: (Eq k, Hashable k) =>
          (HashTable s k v)
       -> k
       -> v
       -> ST s ()
insert htRef k v = do
    ht' <- readRef htRef >>= work
    writeRef htRef ht'
  where
    work ht@(HashTable lvl splitptr buckets) = do
        let !h0 = hashKey lvl splitptr k
        -- Remove any existing mapping for the key before the raw insert,
        -- preserving insert-replaces semantics.
        delete' buckets h0 k
        bsz <- primitiveInsert' buckets h0 k v

        -- If the destination bucket grew past the split threshold, advance
        -- the split pointer (rehashing the bucket *at* the pointer).
        if checkOverflow bsz
          then do
            debug $ "insert: splitting"
            h <- split ht
            debug $ "insert: done splitting"
            return h
          else do
            debug $ "insert: done"
            return ht
{-# INLINE insert #-}


------------------------------------------------------------------------------
-- | See the documentation for this function in
-- "Data.HashTable.Class#v:mutate".
mutate :: (Eq k, Hashable k) =>
          (HashTable s k v)
       -> k
       -> (Maybe v -> (Maybe v, a))
       -> ST s a
mutate htRef k f = mutateST htRef k (pure . f)
{-# INLINE mutate #-}


------------------------------------------------------------------------------
-- | See the documentation for this function in
-- "Data.HashTable.Class#v:mutateST".
mutateST :: (Eq k, Hashable k) =>
            (HashTable s k v)
         -> k
         -> (Maybe v -> ST s (Maybe v, a))
         -> ST s a
mutateST htRef k f = do
    (ht, a) <- readRef htRef >>= work
    writeRef htRef ht
    return a
  where
    work ht@(HashTable lvl splitptr buckets) = do
        let !h0 = hashKey lvl splitptr k
        bucket <- readArray buckets h0
        (!bsz, mbk, a) <- Bucket.mutateST bucket k f
        -- Bucket.mutateST may have reallocated the bucket; store the new one.
        maybe (return ())
              (writeArray buckets h0)
              mbk
        if checkOverflow bsz
          then do
            ht' <- split ht
            return (ht', a)
          else return (ht, a)


------------------------------------------------------------------------------
-- | See the documentation for this function in
-- "Data.HashTable.Class#v:mapM_".
mapM_ :: ((k,v) -> ST s b) -> HashTable s k v -> ST s ()
mapM_ f htRef = readRef htRef >>= work
  where
    work (HashTable lvl _ buckets) = go 0
      where
        !sz = power2 lvl

        go !i | i >= sz   = return ()
              | otherwise = do
                    b <- readArray buckets i
                    Bucket.mapM_ f b
                    go $ i+1


------------------------------------------------------------------------------
-- | See the documentation for this function in
-- "Data.HashTable.Class#v:foldM".
foldM :: (a -> (k,v) -> ST s a)
      -> a -> HashTable s k v
      -> ST s a
foldM f seed0 htRef = readRef htRef >>= work
  where
    work (HashTable lvl _ buckets) = go seed0 0
      where
        !sz = power2 lvl

        -- Strict accumulator avoids a thunk chain across buckets.
        go !seed !i | i >= sz   = return seed
                    | otherwise = do
                          b <- readArray buckets i
                          !seed' <- Bucket.foldM f seed b
                          go seed' $ i+1


------------------------------------------------------------------------------
-- | See the documentation for this function in
-- "Data.HashTable.Class#v:computeOverhead".
computeOverhead :: HashTable s k v -> ST s Double
computeOverhead htRef = readRef htRef >>= work
  where
    work (HashTable lvl _ buckets) = do
        (totElems, overhead) <- go 0 0 0

        let n = fromIntegral totElems
        let o = fromIntegral overhead

        -- NOTE(review): for an empty table n == 0, so this divides by zero
        -- and yields NaN/Infinity — presumably acceptable for a diagnostic.
        return $ (fromIntegral sz + constOverhead + o) / n

      where
        -- Fixed per-table cost in words (header + references), an estimate.
        constOverhead = 5.0

        !sz = power2 lvl

        go !nelems !overhead !i
            | i >= sz = return (nelems, overhead)
            | otherwise = do
                b <- readArray buckets i
                (!n,!o) <- Bucket.nelemsAndOverheadInWords b
                let !n' = n + nelems
                let !o' = o + overhead

                go n' o' (i+1)


------------------------------
-- Private functions follow --
------------------------------

------------------------------------------------------------------------------
-- Remove any mapping for @k@ from the bucket at index @h0@; a no-op when the
-- key is absent.
delete' :: Eq k =>
           MutableArray s (Bucket s k v)
        -> Int
        -> k
        -> ST s ()
delete' buckets h0 k = do
    bucket <- readArray buckets h0
    _ <- Bucket.delete bucket k
    return ()


------------------------------------------------------------------------------
-- Advance the split pointer: rehash the bucket at the pointer, growing the
-- bucket array (and bumping the level) when the pointer crosses the midpoint.
split :: (Hashable k) =>
         (HashTable_ s k v)
      -> ST s (HashTable_ s k v)
split ht@(HashTable lvl splitptr buckets) = do
    debug $ "split: start: nbuck=" ++ show (power2 lvl)
            ++ ", splitptr=" ++ show splitptr

    -- grab bucket at splitPtr
    oldBucket <- readArray buckets splitptr

    nelems <- Bucket.size oldBucket
    -- Roughly 5/8 of the old bucket's keys are expected to stay put.
    let !bsz = max Bucket.newBucketSize $
               ceiling $ (0.625 :: Double) * fromIntegral nelems

    -- write an empty bucket there
    dbucket1 <- Bucket.emptyWithSize bsz
    writeArray buckets splitptr dbucket1

    -- grow the buckets?
    let lvl2 = power2 lvl
    let lvl1 = power2 $ lvl-1

    (!buckets',!lvl',!sp') <-
        if splitptr+1 >= lvl1
          then do
            debug $ "split: resizing bucket array"
            let lvl3 = 2*lvl2
            b <- Bucket.expandBucketArray lvl3 lvl2 buckets
            debug $ "split: resizing bucket array: done"
            return (b,lvl+1,0)
          else return (buckets,lvl,splitptr+1)

    let ht' = HashTable lvl' sp' buckets'

    -- make sure the other split bucket has enough room in it also
    let splitOffs = splitptr + lvl1
    db2 <- readArray buckets' splitOffs
    db2sz <- Bucket.size db2
    let db2sz' = db2sz + bsz
    db2' <- Bucket.growBucketTo db2sz' db2
    debug $ "growing bucket at " ++ show splitOffs ++ " to size "
            ++ show db2sz'
    writeArray buckets' splitOffs db2'

    -- rehash old bucket
    debug $ "split: rehashing bucket"
    let f = uncurry $ primitiveInsert ht'
    -- Pins f's type without ever running the second argument.
    forceSameType f (uncurry $ primitiveInsert ht)

    Bucket.mapM_ f oldBucket
    debug $ "split: done"
    return ht'


------------------------------------------------------------------------------
-- Has a bucket grown past the threshold that triggers a split?
checkOverflow :: Int -> Bool
checkOverflow sz = sz > bucketSplitSize


------------------------------------------------------------------------------
-- insert w/o splitting
primitiveInsert :: (Hashable k) =>
                   (HashTable_ s k v)
                -> k
                -> v
                -> ST s Int
primitiveInsert (HashTable lvl splitptr buckets) k v = do
    debug $ "primitiveInsert start: nbuckets=" ++ show (power2 lvl)
    let h0 = hashKey lvl splitptr k
    primitiveInsert' buckets h0 k v


------------------------------------------------------------------------------
-- Append (k,v) to the bucket at @h0@, returning the bucket's new size.
primitiveInsert' :: MutableArray s (Bucket s k v)
                 -> Int
                 -> k
                 -> v
                 -> ST s Int
primitiveInsert' buckets !h0 !k !v = do
    debug $ "primitiveInsert': bucket number=" ++ show h0
    bucket <- readArray buckets h0
    debug $ "primitiveInsert': snoccing bucket"
    (!hw,m) <- Bucket.snoc bucket k v
    debug $ "primitiveInsert': bucket snoc'd"
    -- snoc may have reallocated the bucket; write the replacement back.
    maybe (return ())
          (writeArray buckets h0)
          m
    return hw


------------------------------------------------------------------------------
fillFactor :: Double
fillFactor = 1.3


------------------------------------------------------------------------------
bucketSplitSize :: Int
bucketSplitSize = Bucket.bucketSplitSize


------------------------------------------------------------------------------
{-# INLINE power2 #-}
power2 :: Int -> Int
power2 i = 1 `iShiftL` i


------------------------------------------------------------------------------
-- Bucket index for a key: hash at @lvl-1@, and rehash at @lvl@ when the
-- result lies below the split pointer (that half has already been split).
{-# INLINE hashKey #-}
hashKey :: (Hashable k) => Int -> Int -> k -> Int
hashKey !lvl !splitptr !k = h1
  where
    !h0 = hashAtLvl (lvl-1) k
    !h1 = if (h0 < splitptr)
            then hashAtLvl lvl k
            else h0


------------------------------------------------------------------------------
-- hash(k) mod 2^lvl, via masking the low @lvl@ bits.
{-# INLINE hashAtLvl #-}
hashAtLvl :: (Hashable k) => Int -> k -> Int
hashAtLvl !lvl !k = h
  where
    !h        = hashcode .&. mask
    !hashcode = hash k
    !mask     = power2 lvl - 1


------------------------------------------------------------------------------
newRef :: HashTable_ s k v -> ST s (HashTable s k v)
newRef = liftM HT . newSTRef

writeRef :: HashTable s k v -> HashTable_ s k v -> ST s ()
writeRef (HT ref) ht = writeSTRef ref ht

readRef :: HashTable s k v -> ST s (HashTable_ s k v)
readRef (HT ref) = readSTRef ref


------------------------------------------------------------------------------
{-# INLINE debug #-}
debug :: String -> ST s ()

#ifdef DEBUG
debug s = unsafeIOToST $ do
    putStrLn s
    hFlush stdout
#else
#ifdef TESTSUITE
debug !s = do
    let !_ = length s
    return $! ()
#else
debug _ = return ()
#endif
#endif


------------------------------------------------------------------------------
-- | See the documentation for this function in
-- "Data.HashTable.Class#v:lookupIndex".
lookupIndex :: (Eq k, Hashable k) => HashTable s k v -> k -> ST s (Maybe Word)
lookupIndex htRef !k = readRef htRef >>= work
  where
    work (HashTable lvl splitptr buckets) = do
        let h0 = hashKey lvl splitptr k
        bucket <- readArray buckets h0
        mbIx <- Bucket.lookupIndex bucket k
        return $! do ix <- mbIx
                     Just $! encodeIndex lvl h0 ix
{-# INLINE lookupIndex #-}


-- Pack (bucket index, element index) into one Word: the bucket index goes in
-- the high bits, the in-bucket index in the low @indexOffset lvl@ bits.
encodeIndex :: Int -> Int -> Int -> Word
encodeIndex lvl bucketIx elemIx =
    fromIntegral bucketIx `Data.Bits.shiftL` indexOffset lvl .|.
    fromIntegral elemIx
{-# INLINE encodeIndex #-}


-- Inverse of 'encodeIndex'.
decodeIndex :: Int -> Word -> (Int, Int)
decodeIndex lvl ix =
    ( fromIntegral (ix `Data.Bits.shiftR` offset)
    , fromIntegral ( (bit offset - 1) .&. ix )
    )
  where offset = indexOffset lvl
{-# INLINE decodeIndex #-}


indexOffset :: Int -> Int
indexOffset lvl = finiteBitSize (0 :: Word) - lvl
{-# INLINE indexOffset #-}


nextByIndex :: HashTable s k v -> Word -> ST s (Maybe (Word,k,v))
nextByIndex htRef !k = readRef htRef >>= work
  where
    work (HashTable lvl _ buckets) = do
        let (h0,ix) = decodeIndex lvl k
        go h0 ix

      where
        bucketN = power2 lvl
        -- Scan forward from (bucket h, element ix); on bucket exhaustion,
        -- continue at element 0 of the next bucket.
        go h ix
          | h < 0 || bucketN <= h = return Nothing
          | otherwise = do
              bucket <- readArray buckets h
              mb <- Bucket.elemAt bucket ix
              case mb of
                Just (k',v) ->
                  let !ix' = encodeIndex lvl h ix
                  in return (Just (ix', k', v))
                Nothing -> go (h+1) 0

{-# INLINE nextByIndex #-}