1 #ifndef BLOOM_H
2 #define BLOOM_H
3 
4 struct commit;
5 struct repository;
6 
7 struct bloom_filter_settings {
8 	/*
9 	 * The version of the hashing technique being used.
10 	 * We currently only support version = 1 which is
11 	 * the seeded murmur3 hashing technique implemented
12 	 * in bloom.c.
13 	 */
14 	uint32_t hash_version;
15 
16 	/*
17 	 * The number of times a path is hashed, i.e. the
18 	 * number of bit positions tht cumulatively
19 	 * determine whether a path is present in the
20 	 * Bloom filter.
21 	 */
22 	uint32_t num_hashes;
23 
24 	/*
25 	 * The minimum number of bits per entry in the Bloom
26 	 * filter. If the filter contains 'n' entries, then
27 	 * filter size is the minimum number of 8-bit words
28 	 * that contain n*b bits.
29 	 */
30 	uint32_t bits_per_entry;
31 
32 	/*
33 	 * The maximum number of changed paths per commit
34 	 * before declaring a Bloom filter to be too-large.
35 	 *
36 	 * Not written to the commit-graph file.
37 	 */
38 	uint32_t max_changed_paths;
39 };
40 
41 #define DEFAULT_BLOOM_MAX_CHANGES 512
42 #define DEFAULT_BLOOM_FILTER_SETTINGS { 1, 7, 10, DEFAULT_BLOOM_MAX_CHANGES }
43 #define BITS_PER_WORD 8
44 #define BLOOMDATA_CHUNK_HEADER_SIZE 3 * sizeof(uint32_t)
45 
46 /*
47  * A bloom_filter struct represents a data segment to
48  * use when testing hash values. The 'len' member
49  * dictates how many entries are stored in
50  * 'data'.
51  */
52 struct bloom_filter {
53 	unsigned char *data;
54 	size_t len;
55 };
56 
57 /*
58  * A bloom_key represents the k hash values for a
59  * given string. These can be precomputed and
60  * stored in a bloom_key for re-use when testing
61  * against a bloom_filter. The number of hashes is
62  * given by the Bloom filter settings and is the same
63  * for all Bloom filters and keys interacting with
64  * the loaded version of the commit graph file and
65  * the Bloom data chunks.
66  */
67 struct bloom_key {
68 	uint32_t *hashes;
69 };
70 
71 /*
72  * Calculate the murmur3 32-bit hash value for the given data
73  * using the given seed.
74  * Produces a uniformly distributed hash value.
75  * Not considered to be cryptographically secure.
76  * Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
77  */
78 uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len);
79 
80 void fill_bloom_key(const char *data,
81 		    size_t len,
82 		    struct bloom_key *key,
83 		    const struct bloom_filter_settings *settings);
84 void clear_bloom_key(struct bloom_key *key);
85 
86 void add_key_to_filter(const struct bloom_key *key,
87 		       struct bloom_filter *filter,
88 		       const struct bloom_filter_settings *settings);
89 
90 void init_bloom_filters(void);
91 
92 enum bloom_filter_computed {
93 	BLOOM_NOT_COMPUTED = (1 << 0),
94 	BLOOM_COMPUTED     = (1 << 1),
95 	BLOOM_TRUNC_LARGE  = (1 << 2),
96 	BLOOM_TRUNC_EMPTY  = (1 << 3),
97 };
98 
99 struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
100 						 struct commit *c,
101 						 int compute_if_not_present,
102 						 const struct bloom_filter_settings *settings,
103 						 enum bloom_filter_computed *computed);
104 
105 #define get_bloom_filter(r, c) get_or_compute_bloom_filter( \
106 	(r), (c), 0, NULL, NULL)
107 
108 int bloom_filter_contains(const struct bloom_filter *filter,
109 			  const struct bloom_key *key,
110 			  const struct bloom_filter_settings *settings);
111 
112 #endif
113