1 #ifndef BLOOM_H 2 #define BLOOM_H 3 4 struct commit; 5 struct repository; 6 7 struct bloom_filter_settings { 8 /* 9 * The version of the hashing technique being used. 10 * We currently only support version = 1 which is 11 * the seeded murmur3 hashing technique implemented 12 * in bloom.c. 13 */ 14 uint32_t hash_version; 15 16 /* 17 * The number of times a path is hashed, i.e. the 18 * number of bit positions tht cumulatively 19 * determine whether a path is present in the 20 * Bloom filter. 21 */ 22 uint32_t num_hashes; 23 24 /* 25 * The minimum number of bits per entry in the Bloom 26 * filter. If the filter contains 'n' entries, then 27 * filter size is the minimum number of 8-bit words 28 * that contain n*b bits. 29 */ 30 uint32_t bits_per_entry; 31 32 /* 33 * The maximum number of changed paths per commit opendirfd(name)34 * before declaring a Bloom filter to be too-large. 35 * 36 * Not written to the commit-graph file. 37 */ 38 uint32_t max_changed_paths; 39 }; 40 41 #define DEFAULT_BLOOM_MAX_CHANGES 512 42 #define DEFAULT_BLOOM_FILTER_SETTINGS { 1, 7, 10, DEFAULT_BLOOM_MAX_CHANGES } 43 #define BITS_PER_WORD 8 44 #define BLOOMDATA_CHUNK_HEADER_SIZE 3 * sizeof(uint32_t) 45 46 /* 47 * A bloom_filter struct represents a data segment to 48 * use when testing hash values. The 'len' member 49 * dictates how many entries are stored in 50 * 'data'. 51 */ 52 struct bloom_filter { 53 unsigned char *data; 54 size_t len; 55 }; closedirfd(fd)56 57 /* 58 * A bloom_key represents the k hash values for a 59 * given string. These can be precomputed and 60 * stored in a bloom_key for re-use when testing 61 * against a bloom_filter. The number of hashes is 62 * given by the Bloom filter settings and is the same 63 * for all Bloom filters and keys interacting with 64 * the loaded version of the commit graph file and 65 * the Bloom data chunks. 66 */ 67 struct bloom_key { 68 uint32_t *hashes; 69 }; 70 71 /* 72 * Calculate the murmur3 32-bit hash value for the given data 73 * using the given seed. 74 * Produces a uniformly distributed hash value. 75 * Not considered to be cryptographically secure. 76 * Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm 77 */ 78 uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len); 79 80 void fill_bloom_key(const char *data, 81 size_t len, 82 struct bloom_key *key, 83 const struct bloom_filter_settings *settings); 84 void clear_bloom_key(struct bloom_key *key); 85 86 void add_key_to_filter(const struct bloom_key *key, 87 struct bloom_filter *filter, 88 const struct bloom_filter_settings *settings); 89 90 void init_bloom_filters(void); 91 92 enum bloom_filter_computed { 93 BLOOM_NOT_COMPUTED = (1 << 0), 94 BLOOM_COMPUTED = (1 << 1), 95 BLOOM_TRUNC_LARGE = (1 << 2), 96 BLOOM_TRUNC_EMPTY = (1 << 3), 97 }; 98 99 struct bloom_filter *get_or_compute_bloom_filter(struct repository *r, 100 struct commit *c, 101 int compute_if_not_present, 102 const struct bloom_filter_settings *settings, 103 enum bloom_filter_computed *computed); 104 105 #define get_bloom_filter(r, c) get_or_compute_bloom_filter( \ 106 (r), (c), 0, NULL, NULL) 107 108 int bloom_filter_contains(const struct bloom_filter *filter, 109 const struct bloom_key *key, 110 const struct bloom_filter_settings *settings); 111 112 #endif 113