1{- git-annex bloom filter 2 - 3 - Copyright 2010-2015 Joey Hess <id@joeyh.name> 4 - 5 - Licensed under the GNU AGPL version 3 or higher. 6 -} 7 8module Annex.BloomFilter where 9 10import Annex.Common 11import qualified Annex 12import Utility.Bloom 13 14import Control.Monad.ST 15 16{- A bloom filter capable of holding half a million keys with a 17 - false positive rate of 1 in 10000000 uses around 16 mb of memory, 18 - so will easily fit on even my lowest memory systems. 19 -} 20bloomCapacity :: Annex Int 21bloomCapacity = fromMaybe 500000 . annexBloomCapacity <$> Annex.getGitConfig 22bloomAccuracy :: Annex Int 23bloomAccuracy = fromMaybe 10000000 . annexBloomAccuracy <$> Annex.getGitConfig 24bloomBitsHashes :: Annex (Int, Int) 25bloomBitsHashes = do 26 capacity <- bloomCapacity 27 accuracy <- bloomAccuracy 28 case safeSuggestSizing capacity (1 / fromIntegral accuracy) of 29 Left e -> do 30 warning $ "bloomfilter " ++ e ++ "; falling back to sane value" 31 -- precaulculated value for 500000 (1/10000000) 32 return (16777216,23) 33 Right v -> return v 34 35{- Creates a bloom filter, and runs an action to populate it. 36 - 37 - The action is passed a callback that it can use to feed values into the 38 - bloom filter. 39 - 40 - Once the action completes, the mutable filter is frozen 41 - for later use. 42 -} 43genBloomFilter :: Hashable v => ((v -> Annex ()) -> Annex ()) -> Annex (Bloom v) 44genBloomFilter populate = do 45 (numbits, numhashes) <- bloomBitsHashes 46 bloom <- lift $ newMB (cheapHashes numhashes) numbits 47 populate $ \v -> lift $ insertMB bloom v 48 lift $ unsafeFreezeMB bloom 49 where 50 lift = liftIO . stToIO 51 52bloomFilter :: [v] -> Bloom v -> [v] 53bloomFilter l bloom = filter (\v -> v `notElemB` bloom) l 54