1 #include "git-compat-util.h"
2 #include "bloom.h"
3 #include "diff.h"
4 #include "diffcore.h"
5 #include "revision.h"
6 #include "hashmap.h"
7 #include "commit-graph.h"
8 #include "commit.h"
9 
10 define_commit_slab(bloom_filter_slab, struct bloom_filter);
11 
12 static struct bloom_filter_slab bloom_filters;
13 
14 struct pathmap_hash_entry {
15     struct hashmap_entry entry;
16     const char path[FLEX_ARRAY];
17 };
18 
rotate_left(uint32_t value,int32_t count)19 static uint32_t rotate_left(uint32_t value, int32_t count)
20 {
21 	uint32_t mask = 8 * sizeof(uint32_t) - 1;
22 	count &= mask;
23 	return ((value << count) | (value >> ((-count) & mask)));
24 }
25 
get_bitmask(uint32_t pos)26 static inline unsigned char get_bitmask(uint32_t pos)
27 {
28 	return ((unsigned char)1) << (pos & (BITS_PER_WORD - 1));
29 }
30 
load_bloom_filter_from_graph(struct commit_graph * g,struct bloom_filter * filter,struct commit * c)31 static int load_bloom_filter_from_graph(struct commit_graph *g,
32 					struct bloom_filter *filter,
33 					struct commit *c)
34 {
35 	uint32_t lex_pos, start_index, end_index;
36 	uint32_t graph_pos = commit_graph_position(c);
37 
38 	while (graph_pos < g->num_commits_in_base)
39 		g = g->base_graph;
40 
41 	/* The commit graph commit 'c' lives in doesn't carry Bloom filters. */
42 	if (!g->chunk_bloom_indexes)
43 		return 0;
44 
45 	lex_pos = graph_pos - g->num_commits_in_base;
46 
47 	end_index = get_be32(g->chunk_bloom_indexes + 4 * lex_pos);
48 
49 	if (lex_pos > 0)
50 		start_index = get_be32(g->chunk_bloom_indexes + 4 * (lex_pos - 1));
51 	else
52 		start_index = 0;
53 
54 	filter->len = end_index - start_index;
55 	filter->data = (unsigned char *)(g->chunk_bloom_data +
56 					sizeof(unsigned char) * start_index +
57 					BLOOMDATA_CHUNK_HEADER_SIZE);
58 
59 	return 1;
60 }
61 
62 /*
63  * Calculate the murmur3 32-bit hash value for the given data
64  * using the given seed.
65  * Produces a uniformly distributed hash value.
66  * Not considered to be cryptographically secure.
67  * Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
68  */
murmur3_seeded(uint32_t seed,const char * data,size_t len)69 uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len)
70 {
71 	const uint32_t c1 = 0xcc9e2d51;
72 	const uint32_t c2 = 0x1b873593;
73 	const uint32_t r1 = 15;
74 	const uint32_t r2 = 13;
75 	const uint32_t m = 5;
76 	const uint32_t n = 0xe6546b64;
77 	int i;
78 	uint32_t k1 = 0;
79 	const char *tail;
80 
81 	int len4 = len / sizeof(uint32_t);
82 
83 	uint32_t k;
84 	for (i = 0; i < len4; i++) {
85 		uint32_t byte1 = (uint32_t)data[4*i];
86 		uint32_t byte2 = ((uint32_t)data[4*i + 1]) << 8;
87 		uint32_t byte3 = ((uint32_t)data[4*i + 2]) << 16;
88 		uint32_t byte4 = ((uint32_t)data[4*i + 3]) << 24;
89 		k = byte1 | byte2 | byte3 | byte4;
90 		k *= c1;
91 		k = rotate_left(k, r1);
92 		k *= c2;
93 
94 		seed ^= k;
95 		seed = rotate_left(seed, r2) * m + n;
96 	}
97 
98 	tail = (data + len4 * sizeof(uint32_t));
99 
100 	switch (len & (sizeof(uint32_t) - 1)) {
101 	case 3:
102 		k1 ^= ((uint32_t)tail[2]) << 16;
103 		/*-fallthrough*/
104 	case 2:
105 		k1 ^= ((uint32_t)tail[1]) << 8;
106 		/*-fallthrough*/
107 	case 1:
108 		k1 ^= ((uint32_t)tail[0]) << 0;
109 		k1 *= c1;
110 		k1 = rotate_left(k1, r1);
111 		k1 *= c2;
112 		seed ^= k1;
113 		break;
114 	}
115 
116 	seed ^= (uint32_t)len;
117 	seed ^= (seed >> 16);
118 	seed *= 0x85ebca6b;
119 	seed ^= (seed >> 13);
120 	seed *= 0xc2b2ae35;
121 	seed ^= (seed >> 16);
122 
123 	return seed;
124 }
125 
fill_bloom_key(const char * data,size_t len,struct bloom_key * key,const struct bloom_filter_settings * settings)126 void fill_bloom_key(const char *data,
127 		    size_t len,
128 		    struct bloom_key *key,
129 		    const struct bloom_filter_settings *settings)
130 {
131 	int i;
132 	const uint32_t seed0 = 0x293ae76f;
133 	const uint32_t seed1 = 0x7e646e2c;
134 	const uint32_t hash0 = murmur3_seeded(seed0, data, len);
135 	const uint32_t hash1 = murmur3_seeded(seed1, data, len);
136 
137 	key->hashes = (uint32_t *)xcalloc(settings->num_hashes, sizeof(uint32_t));
138 	for (i = 0; i < settings->num_hashes; i++)
139 		key->hashes[i] = hash0 + i * hash1;
140 }
141 
clear_bloom_key(struct bloom_key * key)142 void clear_bloom_key(struct bloom_key *key)
143 {
144 	FREE_AND_NULL(key->hashes);
145 }
146 
add_key_to_filter(const struct bloom_key * key,struct bloom_filter * filter,const struct bloom_filter_settings * settings)147 void add_key_to_filter(const struct bloom_key *key,
148 		       struct bloom_filter *filter,
149 		       const struct bloom_filter_settings *settings)
150 {
151 	int i;
152 	uint64_t mod = filter->len * BITS_PER_WORD;
153 
154 	for (i = 0; i < settings->num_hashes; i++) {
155 		uint64_t hash_mod = key->hashes[i] % mod;
156 		uint64_t block_pos = hash_mod / BITS_PER_WORD;
157 
158 		filter->data[block_pos] |= get_bitmask(hash_mod);
159 	}
160 }
161 
init_bloom_filters(void)162 void init_bloom_filters(void)
163 {
164 	init_bloom_filter_slab(&bloom_filters);
165 }
166 
pathmap_cmp(const void * hashmap_cmp_fn_data,const struct hashmap_entry * eptr,const struct hashmap_entry * entry_or_key,const void * keydata)167 static int pathmap_cmp(const void *hashmap_cmp_fn_data,
168 		       const struct hashmap_entry *eptr,
169 		       const struct hashmap_entry *entry_or_key,
170 		       const void *keydata)
171 {
172 	const struct pathmap_hash_entry *e1, *e2;
173 
174 	e1 = container_of(eptr, const struct pathmap_hash_entry, entry);
175 	e2 = container_of(entry_or_key, const struct pathmap_hash_entry, entry);
176 
177 	return strcmp(e1->path, e2->path);
178 }
179 
init_truncated_large_filter(struct bloom_filter * filter)180 static void init_truncated_large_filter(struct bloom_filter *filter)
181 {
182 	filter->data = xmalloc(1);
183 	filter->data[0] = 0xFF;
184 	filter->len = 1;
185 }
186 
get_or_compute_bloom_filter(struct repository * r,struct commit * c,int compute_if_not_present,const struct bloom_filter_settings * settings,enum bloom_filter_computed * computed)187 struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
188 						 struct commit *c,
189 						 int compute_if_not_present,
190 						 const struct bloom_filter_settings *settings,
191 						 enum bloom_filter_computed *computed)
192 {
193 	struct bloom_filter *filter;
194 	int i;
195 	struct diff_options diffopt;
196 
197 	if (computed)
198 		*computed = BLOOM_NOT_COMPUTED;
199 
200 	if (!bloom_filters.slab_size)
201 		return NULL;
202 
203 	filter = bloom_filter_slab_at(&bloom_filters, c);
204 
205 	if (!filter->data) {
206 		load_commit_graph_info(r, c);
207 		if (commit_graph_position(c) != COMMIT_NOT_FROM_GRAPH)
208 			load_bloom_filter_from_graph(r->objects->commit_graph, filter, c);
209 	}
210 
211 	if (filter->data && filter->len)
212 		return filter;
213 	if (!compute_if_not_present)
214 		return NULL;
215 
216 	repo_diff_setup(r, &diffopt);
217 	diffopt.flags.recursive = 1;
218 	diffopt.detect_rename = 0;
219 	diffopt.max_changes = settings->max_changed_paths;
220 	diff_setup_done(&diffopt);
221 
222 	/* ensure commit is parsed so we have parent information */
223 	repo_parse_commit(r, c);
224 
225 	if (c->parents)
226 		diff_tree_oid(&c->parents->item->object.oid, &c->object.oid, "", &diffopt);
227 	else
228 		diff_tree_oid(NULL, &c->object.oid, "", &diffopt);
229 	diffcore_std(&diffopt);
230 
231 	if (diff_queued_diff.nr <= settings->max_changed_paths) {
232 		struct hashmap pathmap = HASHMAP_INIT(pathmap_cmp, NULL);
233 		struct pathmap_hash_entry *e;
234 		struct hashmap_iter iter;
235 
236 		for (i = 0; i < diff_queued_diff.nr; i++) {
237 			const char *path = diff_queued_diff.queue[i]->two->path;
238 
239 			/*
240 			 * Add each leading directory of the changed file, i.e. for
241 			 * 'dir/subdir/file' add 'dir' and 'dir/subdir' as well, so
242 			 * the Bloom filter could be used to speed up commands like
243 			 * 'git log dir/subdir', too.
244 			 *
245 			 * Note that directories are added without the trailing '/'.
246 			 */
247 			do {
248 				char *last_slash = strrchr(path, '/');
249 
250 				FLEX_ALLOC_STR(e, path, path);
251 				hashmap_entry_init(&e->entry, strhash(path));
252 
253 				if (!hashmap_get(&pathmap, &e->entry, NULL))
254 					hashmap_add(&pathmap, &e->entry);
255 				else
256 					free(e);
257 
258 				if (!last_slash)
259 					last_slash = (char*)path;
260 				*last_slash = '\0';
261 
262 			} while (*path);
263 
264 			diff_free_filepair(diff_queued_diff.queue[i]);
265 		}
266 
267 		if (hashmap_get_size(&pathmap) > settings->max_changed_paths) {
268 			init_truncated_large_filter(filter);
269 			if (computed)
270 				*computed |= BLOOM_TRUNC_LARGE;
271 			goto cleanup;
272 		}
273 
274 		filter->len = (hashmap_get_size(&pathmap) * settings->bits_per_entry + BITS_PER_WORD - 1) / BITS_PER_WORD;
275 		if (!filter->len) {
276 			if (computed)
277 				*computed |= BLOOM_TRUNC_EMPTY;
278 			filter->len = 1;
279 		}
280 		CALLOC_ARRAY(filter->data, filter->len);
281 
282 		hashmap_for_each_entry(&pathmap, &iter, e, entry) {
283 			struct bloom_key key;
284 			fill_bloom_key(e->path, strlen(e->path), &key, settings);
285 			add_key_to_filter(&key, filter, settings);
286 			clear_bloom_key(&key);
287 		}
288 
289 	cleanup:
290 		hashmap_clear_and_free(&pathmap, struct pathmap_hash_entry, entry);
291 	} else {
292 		for (i = 0; i < diff_queued_diff.nr; i++)
293 			diff_free_filepair(diff_queued_diff.queue[i]);
294 		init_truncated_large_filter(filter);
295 
296 		if (computed)
297 			*computed |= BLOOM_TRUNC_LARGE;
298 	}
299 
300 	if (computed)
301 		*computed |= BLOOM_COMPUTED;
302 
303 	free(diff_queued_diff.queue);
304 	DIFF_QUEUE_CLEAR(&diff_queued_diff);
305 
306 	return filter;
307 }
308 
bloom_filter_contains(const struct bloom_filter * filter,const struct bloom_key * key,const struct bloom_filter_settings * settings)309 int bloom_filter_contains(const struct bloom_filter *filter,
310 			  const struct bloom_key *key,
311 			  const struct bloom_filter_settings *settings)
312 {
313 	int i;
314 	uint64_t mod = filter->len * BITS_PER_WORD;
315 
316 	if (!mod)
317 		return -1;
318 
319 	for (i = 0; i < settings->num_hashes; i++) {
320 		uint64_t hash_mod = key->hashes[i] % mod;
321 		uint64_t block_pos = hash_mod / BITS_PER_WORD;
322 		if (!(filter->data[block_pos] & get_bitmask(hash_mod)))
323 			return 0;
324 	}
325 
326 	return 1;
327 }
328