1{- git-annex bloom filter
2 -
3 - Copyright 2010-2015 Joey Hess <id@joeyh.name>
4 -
5 - Licensed under the GNU AGPL version 3 or higher.
6 -}
7
8module Annex.BloomFilter where
9
10import Annex.Common
11import qualified Annex
12import Utility.Bloom
13
14import Control.Monad.ST
15
16{- A bloom filter capable of holding half a million keys with a
17 - false positive rate of 1 in 10000000 uses around 16 mb of memory,
18 - so will easily fit on even my lowest memory systems.
19 -}
20bloomCapacity :: Annex Int
21bloomCapacity = fromMaybe 500000 . annexBloomCapacity <$> Annex.getGitConfig
22bloomAccuracy :: Annex Int
23bloomAccuracy = fromMaybe 10000000 . annexBloomAccuracy <$> Annex.getGitConfig
24bloomBitsHashes :: Annex (Int, Int)
25bloomBitsHashes = do
26	capacity <- bloomCapacity
27	accuracy <- bloomAccuracy
28	case safeSuggestSizing capacity (1 / fromIntegral accuracy) of
29		Left e -> do
30			warning $ "bloomfilter " ++ e ++ "; falling back to sane value"
31			-- precaulculated value for 500000 (1/10000000)
32			return (16777216,23)
33		Right v -> return v
34
35{- Creates a bloom filter, and runs an action to populate it.
36 -
37 - The action is passed a callback that it can use to feed values into the
38 - bloom filter.
39 -
40 - Once the action completes, the mutable filter is frozen
41 - for later use.
42 -}
43genBloomFilter :: Hashable v => ((v -> Annex ()) -> Annex ()) -> Annex (Bloom v)
44genBloomFilter populate = do
45	(numbits, numhashes) <- bloomBitsHashes
46	bloom <- lift $ newMB (cheapHashes numhashes) numbits
47	populate $ \v -> lift $ insertMB bloom v
48	lift $ unsafeFreezeMB bloom
49  where
50	lift = liftIO . stToIO
51
52bloomFilter :: [v] -> Bloom v -> [v]
53bloomFilter l bloom = filter (\v -> v `notElemB` bloom) l
54