1 /*
2  * Copyright (C) the libgit2 contributors. All rights reserved.
3  *
4  * This file is part of libgit2, distributed under the GNU GPL v2 with
5  * a Linking Exception. For full terms see the included COPYING file.
6  */
7 
8 #include "pack-objects.h"
9 
10 #include "zstream.h"
11 #include "delta.h"
12 #include "iterator.h"
13 #include "netops.h"
14 #include "pack.h"
15 #include "thread.h"
16 #include "tree.h"
17 #include "util.h"
18 #include "revwalk.h"
19 #include "commit_list.h"
20 
21 #include "git2/pack.h"
22 #include "git2/commit.h"
23 #include "git2/tag.h"
24 #include "git2/indexer.h"
25 #include "git2/config.h"
26 
27 struct unpacked {
28 	git_pobject *object;
29 	void *data;
30 	struct git_delta_index *index;
31 	size_t depth;
32 };
33 
34 struct tree_walk_context {
35 	git_packbuilder *pb;
36 	git_buf buf;
37 };
38 
39 struct pack_write_context {
40 	git_indexer *indexer;
41 	git_indexer_progress *stats;
42 };
43 
44 struct walk_object {
45 	git_oid id;
46 	unsigned int uninteresting:1,
47 		seen:1;
48 };
49 
50 #ifdef GIT_THREADS
51 # define GIT_PACKBUILDER__MUTEX_OP(pb, mtx, op) git_mutex_##op(&(pb)->mtx)
52 #else
53 # define GIT_PACKBUILDER__MUTEX_OP(pb, mtx, op) git__noop()
54 #endif
55 
56 #define git_packbuilder__cache_lock(pb) GIT_PACKBUILDER__MUTEX_OP(pb, cache_mutex, lock)
57 #define git_packbuilder__cache_unlock(pb) GIT_PACKBUILDER__MUTEX_OP(pb, cache_mutex, unlock)
58 #define git_packbuilder__progress_lock(pb) GIT_PACKBUILDER__MUTEX_OP(pb, progress_mutex, lock)
59 #define git_packbuilder__progress_unlock(pb) GIT_PACKBUILDER__MUTEX_OP(pb, progress_mutex, unlock)
60 
61 /* The minimal interval between progress updates (in seconds). */
62 #define MIN_PROGRESS_UPDATE_INTERVAL 0.5
63 
64 /* Size of the buffer to feed to zlib */
65 #define COMPRESS_BUFLEN (1024 * 1024)
66 
name_hash(const char * name)67 static unsigned name_hash(const char *name)
68 {
69 	unsigned c, hash = 0;
70 
71 	if (!name)
72 		return 0;
73 
74 	/*
75 	 * This effectively just creates a sortable number from the
76 	 * last sixteen non-whitespace characters. Last characters
77 	 * count "most", so things that end in ".c" sort together.
78 	 */
79 	while ((c = *name++) != 0) {
80 		if (git__isspace(c))
81 			continue;
82 		hash = (hash >> 2) + (c << 24);
83 	}
84 	return hash;
85 }
86 
packbuilder_config(git_packbuilder * pb)87 static int packbuilder_config(git_packbuilder *pb)
88 {
89 	git_config *config;
90 	int ret = 0;
91 	int64_t val;
92 
93 	if ((ret = git_repository_config_snapshot(&config, pb->repo)) < 0)
94 		return ret;
95 
96 #define config_get(KEY,DST,DFLT) do { \
97 	ret = git_config_get_int64(&val, config, KEY); \
98 	if (!ret) { \
99 		if (!git__is_sizet(val)) { \
100 			git_error_set(GIT_ERROR_CONFIG, \
101 				"configuration value '%s' is too large", KEY); \
102 			ret = -1; \
103 			goto out; \
104 		} \
105 		(DST) = (size_t)val; \
106 	} else if (ret == GIT_ENOTFOUND) { \
107 	    (DST) = (DFLT); \
108 	    ret = 0; \
109 	} else if (ret < 0) goto out; } while (0)
110 
111 	config_get("pack.deltaCacheSize", pb->max_delta_cache_size,
112 		   GIT_PACK_DELTA_CACHE_SIZE);
113 	config_get("pack.deltaCacheLimit", pb->cache_max_small_delta_size,
114 		   GIT_PACK_DELTA_CACHE_LIMIT);
115 	config_get("pack.deltaCacheSize", pb->big_file_threshold,
116 		   GIT_PACK_BIG_FILE_THRESHOLD);
117 	config_get("pack.windowMemory", pb->window_memory_limit, 0);
118 
119 #undef config_get
120 
121 out:
122 	git_config_free(config);
123 
124 	return ret;
125 }
126 
git_packbuilder_new(git_packbuilder ** out,git_repository * repo)127 int git_packbuilder_new(git_packbuilder **out, git_repository *repo)
128 {
129 	git_packbuilder *pb;
130 
131 	*out = NULL;
132 
133 	pb = git__calloc(1, sizeof(*pb));
134 	GIT_ERROR_CHECK_ALLOC(pb);
135 
136 	if (git_oidmap_new(&pb->object_ix) < 0 ||
137 	    git_oidmap_new(&pb->walk_objects) < 0 ||
138 	    git_pool_init(&pb->object_pool, sizeof(struct walk_object)) < 0)
139 		goto on_error;
140 
141 	pb->repo = repo;
142 	pb->nr_threads = 1; /* do not spawn any thread by default */
143 
144 	if (git_hash_ctx_init(&pb->ctx) < 0 ||
145 		git_zstream_init(&pb->zstream, GIT_ZSTREAM_DEFLATE) < 0 ||
146 		git_repository_odb(&pb->odb, repo) < 0 ||
147 		packbuilder_config(pb) < 0)
148 		goto on_error;
149 
150 #ifdef GIT_THREADS
151 
152 	if (git_mutex_init(&pb->cache_mutex) ||
153 		git_mutex_init(&pb->progress_mutex) ||
154 		git_cond_init(&pb->progress_cond))
155 	{
156 		git_error_set(GIT_ERROR_OS, "failed to initialize packbuilder mutex");
157 		goto on_error;
158 	}
159 
160 #endif
161 
162 	*out = pb;
163 	return 0;
164 
165 on_error:
166 	git_packbuilder_free(pb);
167 	return -1;
168 }
169 
git_packbuilder_set_threads(git_packbuilder * pb,unsigned int n)170 unsigned int git_packbuilder_set_threads(git_packbuilder *pb, unsigned int n)
171 {
172 	GIT_ASSERT_ARG(pb);
173 
174 #ifdef GIT_THREADS
175 	pb->nr_threads = n;
176 #else
177 	GIT_UNUSED(n);
178 	GIT_ASSERT(pb->nr_threads == 1);
179 #endif
180 
181 	return pb->nr_threads;
182 }
183 
rehash(git_packbuilder * pb)184 static int rehash(git_packbuilder *pb)
185 {
186 	git_pobject *po;
187 	size_t i;
188 
189 	git_oidmap_clear(pb->object_ix);
190 
191 	for (i = 0, po = pb->object_list; i < pb->nr_objects; i++, po++) {
192 		if (git_oidmap_set(pb->object_ix, &po->id, po) < 0)
193 			return -1;
194 	}
195 
196 	return 0;
197 }
198 
git_packbuilder_insert(git_packbuilder * pb,const git_oid * oid,const char * name)199 int git_packbuilder_insert(git_packbuilder *pb, const git_oid *oid,
200 			   const char *name)
201 {
202 	git_pobject *po;
203 	size_t newsize;
204 	int ret;
205 
206 	GIT_ASSERT_ARG(pb);
207 	GIT_ASSERT_ARG(oid);
208 
209 	/* If the object already exists in the hash table, then we don't
210 	 * have any work to do */
211 	if (git_oidmap_exists(pb->object_ix, oid))
212 		return 0;
213 
214 	if (pb->nr_objects >= pb->nr_alloc) {
215 		GIT_ERROR_CHECK_ALLOC_ADD(&newsize, pb->nr_alloc, 1024);
216 		GIT_ERROR_CHECK_ALLOC_MULTIPLY(&newsize, newsize / 2, 3);
217 
218 		if (!git__is_uint32(newsize)) {
219 			git_error_set(GIT_ERROR_NOMEMORY, "packfile too large to fit in memory.");
220 			return -1;
221 		}
222 
223 		pb->nr_alloc = newsize;
224 
225 		pb->object_list = git__reallocarray(pb->object_list,
226 			pb->nr_alloc, sizeof(*po));
227 		GIT_ERROR_CHECK_ALLOC(pb->object_list);
228 
229 		if (rehash(pb) < 0)
230 			return -1;
231 	}
232 
233 	po = pb->object_list + pb->nr_objects;
234 	memset(po, 0x0, sizeof(*po));
235 
236 	if ((ret = git_odb_read_header(&po->size, &po->type, pb->odb, oid)) < 0)
237 		return ret;
238 
239 	pb->nr_objects++;
240 	git_oid_cpy(&po->id, oid);
241 	po->hash = name_hash(name);
242 
243 	if (git_oidmap_set(pb->object_ix, &po->id, po) < 0) {
244 		git_error_set_oom();
245 		return -1;
246 	}
247 
248 	pb->done = false;
249 
250 	if (pb->progress_cb) {
251 		double current_time = git__timer();
252 		double elapsed = current_time - pb->last_progress_report_time;
253 
254 		if (elapsed >= MIN_PROGRESS_UPDATE_INTERVAL) {
255 			pb->last_progress_report_time = current_time;
256 
257 			ret = pb->progress_cb(
258 				GIT_PACKBUILDER_ADDING_OBJECTS,
259 				pb->nr_objects, 0, pb->progress_cb_payload);
260 
261 			if (ret)
262 				return git_error_set_after_callback(ret);
263 		}
264 	}
265 
266 	return 0;
267 }
268 
get_delta(void ** out,git_odb * odb,git_pobject * po)269 static int get_delta(void **out, git_odb *odb, git_pobject *po)
270 {
271 	git_odb_object *src = NULL, *trg = NULL;
272 	size_t delta_size;
273 	void *delta_buf;
274 	int error;
275 
276 	*out = NULL;
277 
278 	if (git_odb_read(&src, odb, &po->delta->id) < 0 ||
279 	    git_odb_read(&trg, odb, &po->id) < 0)
280 		goto on_error;
281 
282 	error = git_delta(&delta_buf, &delta_size,
283 		git_odb_object_data(src), git_odb_object_size(src),
284 		git_odb_object_data(trg), git_odb_object_size(trg),
285 		0);
286 
287 	if (error < 0 && error != GIT_EBUFS)
288 		goto on_error;
289 
290 	if (error == GIT_EBUFS || delta_size != po->delta_size) {
291 		git_error_set(GIT_ERROR_INVALID, "delta size changed");
292 		goto on_error;
293 	}
294 
295 	*out = delta_buf;
296 
297 	git_odb_object_free(src);
298 	git_odb_object_free(trg);
299 	return 0;
300 
301 on_error:
302 	git_odb_object_free(src);
303 	git_odb_object_free(trg);
304 	return -1;
305 }
306 
write_object(git_packbuilder * pb,git_pobject * po,int (* write_cb)(void * buf,size_t size,void * cb_data),void * cb_data)307 static int write_object(
308 	git_packbuilder *pb,
309 	git_pobject *po,
310 	int (*write_cb)(void *buf, size_t size, void *cb_data),
311 	void *cb_data)
312 {
313 	git_odb_object *obj = NULL;
314 	git_object_t type;
315 	unsigned char hdr[10], *zbuf = NULL;
316 	void *data = NULL;
317 	size_t hdr_len, zbuf_len = COMPRESS_BUFLEN, data_len;
318 	int error;
319 
320 	/*
321 	 * If we have a delta base, let's use the delta to save space.
322 	 * Otherwise load the whole object. 'data' ends up pointing to
323 	 * whatever data we want to put into the packfile.
324 	 */
325 	if (po->delta) {
326 		if (po->delta_data)
327 			data = po->delta_data;
328 		else if ((error = get_delta(&data, pb->odb, po)) < 0)
329 				goto done;
330 
331 		data_len = po->delta_size;
332 		type = GIT_OBJECT_REF_DELTA;
333 	} else {
334 		if ((error = git_odb_read(&obj, pb->odb, &po->id)) < 0)
335 			goto done;
336 
337 		data = (void *)git_odb_object_data(obj);
338 		data_len = git_odb_object_size(obj);
339 		type = git_odb_object_type(obj);
340 	}
341 
342 	/* Write header */
343 	if ((error = git_packfile__object_header(&hdr_len, hdr, data_len, type)) < 0 ||
344 	    (error = write_cb(hdr, hdr_len, cb_data)) < 0 ||
345 	    (error = git_hash_update(&pb->ctx, hdr, hdr_len)) < 0)
346 		goto done;
347 
348 	if (type == GIT_OBJECT_REF_DELTA) {
349 		if ((error = write_cb(po->delta->id.id, GIT_OID_RAWSZ, cb_data)) < 0 ||
350 			(error = git_hash_update(&pb->ctx, po->delta->id.id, GIT_OID_RAWSZ)) < 0)
351 			goto done;
352 	}
353 
354 	/* Write data */
355 	if (po->z_delta_size) {
356 		data_len = po->z_delta_size;
357 
358 		if ((error = write_cb(data, data_len, cb_data)) < 0 ||
359 			(error = git_hash_update(&pb->ctx, data, data_len)) < 0)
360 			goto done;
361 	} else {
362 		zbuf = git__malloc(zbuf_len);
363 		GIT_ERROR_CHECK_ALLOC(zbuf);
364 
365 		git_zstream_reset(&pb->zstream);
366 
367 		if ((error = git_zstream_set_input(&pb->zstream, data, data_len)) < 0)
368 			goto done;
369 
370 		while (!git_zstream_done(&pb->zstream)) {
371 			if ((error = git_zstream_get_output(zbuf, &zbuf_len, &pb->zstream)) < 0 ||
372 				(error = write_cb(zbuf, zbuf_len, cb_data)) < 0 ||
373 				(error = git_hash_update(&pb->ctx, zbuf, zbuf_len)) < 0)
374 				goto done;
375 
376 			zbuf_len = COMPRESS_BUFLEN; /* reuse buffer */
377 		}
378 	}
379 
380 	/*
381 	 * If po->delta is true, data is a delta and it is our
382 	 * responsibility to free it (otherwise it's a git_object's
383 	 * data). We set po->delta_data to NULL in case we got the
384 	 * data from there instead of get_delta(). If we didn't,
385 	 * there's no harm.
386 	 */
387 	if (po->delta) {
388 		git__free(data);
389 		po->delta_data = NULL;
390 	}
391 
392 	pb->nr_written++;
393 
394 done:
395 	git__free(zbuf);
396 	git_odb_object_free(obj);
397 	return error;
398 }
399 
400 enum write_one_status {
401 	WRITE_ONE_SKIP = -1, /* already written */
402 	WRITE_ONE_BREAK = 0, /* writing this will bust the limit; not written */
403 	WRITE_ONE_WRITTEN = 1, /* normal */
404 	WRITE_ONE_RECURSIVE = 2 /* already scheduled to be written */
405 };
406 
write_one(enum write_one_status * status,git_packbuilder * pb,git_pobject * po,int (* write_cb)(void * buf,size_t size,void * cb_data),void * cb_data)407 static int write_one(
408 	enum write_one_status *status,
409 	git_packbuilder *pb,
410 	git_pobject *po,
411 	int (*write_cb)(void *buf, size_t size, void *cb_data),
412 	void *cb_data)
413 {
414 	int error;
415 
416 	if (po->recursing) {
417 		*status = WRITE_ONE_RECURSIVE;
418 		return 0;
419 	} else if (po->written) {
420 		*status = WRITE_ONE_SKIP;
421 		return 0;
422 	}
423 
424 	if (po->delta) {
425 		po->recursing = 1;
426 
427 		if ((error = write_one(status, pb, po->delta, write_cb, cb_data)) < 0)
428 			return error;
429 
430 		/* we cannot depend on this one */
431 		if (*status == WRITE_ONE_RECURSIVE)
432 			po->delta = NULL;
433 	}
434 
435 	*status = WRITE_ONE_WRITTEN;
436 	po->written = 1;
437 	po->recursing = 0;
438 
439 	return write_object(pb, po, write_cb, cb_data);
440 }
441 
add_to_write_order(git_pobject ** wo,size_t * endp,git_pobject * po)442 GIT_INLINE(void) add_to_write_order(git_pobject **wo, size_t *endp,
443 	git_pobject *po)
444 {
445 	if (po->filled)
446 		return;
447 	wo[(*endp)++] = po;
448 	po->filled = 1;
449 }
450 
add_descendants_to_write_order(git_pobject ** wo,size_t * endp,git_pobject * po)451 static void add_descendants_to_write_order(git_pobject **wo, size_t *endp,
452 	git_pobject *po)
453 {
454 	int add_to_order = 1;
455 	while (po) {
456 		if (add_to_order) {
457 			git_pobject *s;
458 			/* add this node... */
459 			add_to_write_order(wo, endp, po);
460 			/* all its siblings... */
461 			for (s = po->delta_sibling; s; s = s->delta_sibling) {
462 				add_to_write_order(wo, endp, s);
463 			}
464 		}
465 		/* drop down a level to add left subtree nodes if possible */
466 		if (po->delta_child) {
467 			add_to_order = 1;
468 			po = po->delta_child;
469 		} else {
470 			add_to_order = 0;
471 			/* our sibling might have some children, it is next */
472 			if (po->delta_sibling) {
473 				po = po->delta_sibling;
474 				continue;
475 			}
476 			/* go back to our parent node */
477 			po = po->delta;
478 			while (po && !po->delta_sibling) {
479 				/* we're on the right side of a subtree, keep
480 				 * going up until we can go right again */
481 				po = po->delta;
482 			}
483 			if (!po) {
484 				/* done- we hit our original root node */
485 				return;
486 			}
487 			/* pass it off to sibling at this level */
488 			po = po->delta_sibling;
489 		}
490 	};
491 }
492 
add_family_to_write_order(git_pobject ** wo,size_t * endp,git_pobject * po)493 static void add_family_to_write_order(git_pobject **wo, size_t *endp,
494 	git_pobject *po)
495 {
496 	git_pobject *root;
497 
498 	for (root = po; root->delta; root = root->delta)
499 		; /* nothing */
500 	add_descendants_to_write_order(wo, endp, root);
501 }
502 
cb_tag_foreach(const char * name,git_oid * oid,void * data)503 static int cb_tag_foreach(const char *name, git_oid *oid, void *data)
504 {
505 	git_packbuilder *pb = data;
506 	git_pobject *po;
507 
508 	GIT_UNUSED(name);
509 
510 	if ((po = git_oidmap_get(pb->object_ix, oid)) == NULL)
511 		return 0;
512 
513 	po->tagged = 1;
514 
515 	/* TODO: peel objects */
516 
517 	return 0;
518 }
519 
compute_write_order(git_packbuilder * pb)520 static git_pobject **compute_write_order(git_packbuilder *pb)
521 {
522 	size_t i, wo_end, last_untagged;
523 	git_pobject **wo;
524 
525 	if ((wo = git__mallocarray(pb->nr_objects, sizeof(*wo))) == NULL)
526 		return NULL;
527 
528 	for (i = 0; i < pb->nr_objects; i++) {
529 		git_pobject *po = pb->object_list + i;
530 		po->tagged = 0;
531 		po->filled = 0;
532 		po->delta_child = NULL;
533 		po->delta_sibling = NULL;
534 	}
535 
536 	/*
537 	 * Fully connect delta_child/delta_sibling network.
538 	 * Make sure delta_sibling is sorted in the original
539 	 * recency order.
540 	 */
541 	for (i = pb->nr_objects; i > 0;) {
542 		git_pobject *po = &pb->object_list[--i];
543 		if (!po->delta)
544 			continue;
545 		/* Mark me as the first child */
546 		po->delta_sibling = po->delta->delta_child;
547 		po->delta->delta_child = po;
548 	}
549 
550 	/*
551 	 * Mark objects that are at the tip of tags.
552 	 */
553 	if (git_tag_foreach(pb->repo, &cb_tag_foreach, pb) < 0) {
554 		git__free(wo);
555 		return NULL;
556 	}
557 
558 	/*
559 	 * Give the objects in the original recency order until
560 	 * we see a tagged tip.
561 	 */
562 	for (i = wo_end = 0; i < pb->nr_objects; i++) {
563 		git_pobject *po = pb->object_list + i;
564 		if (po->tagged)
565 			break;
566 		add_to_write_order(wo, &wo_end, po);
567 	}
568 	last_untagged = i;
569 
570 	/*
571 	 * Then fill all the tagged tips.
572 	 */
573 	for (; i < pb->nr_objects; i++) {
574 		git_pobject *po = pb->object_list + i;
575 		if (po->tagged)
576 			add_to_write_order(wo, &wo_end, po);
577 	}
578 
579 	/*
580 	 * And then all remaining commits and tags.
581 	 */
582 	for (i = last_untagged; i < pb->nr_objects; i++) {
583 		git_pobject *po = pb->object_list + i;
584 		if (po->type != GIT_OBJECT_COMMIT &&
585 		    po->type != GIT_OBJECT_TAG)
586 			continue;
587 		add_to_write_order(wo, &wo_end, po);
588 	}
589 
590 	/*
591 	 * And then all the trees.
592 	 */
593 	for (i = last_untagged; i < pb->nr_objects; i++) {
594 		git_pobject *po = pb->object_list + i;
595 		if (po->type != GIT_OBJECT_TREE)
596 			continue;
597 		add_to_write_order(wo, &wo_end, po);
598 	}
599 
600 	/*
601 	 * Finally all the rest in really tight order
602 	 */
603 	for (i = last_untagged; i < pb->nr_objects; i++) {
604 		git_pobject *po = pb->object_list + i;
605 		if (!po->filled)
606 			add_family_to_write_order(wo, &wo_end, po);
607 	}
608 
609 	if (wo_end != pb->nr_objects) {
610 		git__free(wo);
611 		git_error_set(GIT_ERROR_INVALID, "invalid write order");
612 		return NULL;
613 	}
614 
615 	return wo;
616 }
617 
write_pack(git_packbuilder * pb,int (* write_cb)(void * buf,size_t size,void * cb_data),void * cb_data)618 static int write_pack(git_packbuilder *pb,
619 	int (*write_cb)(void *buf, size_t size, void *cb_data),
620 	void *cb_data)
621 {
622 	git_pobject **write_order;
623 	git_pobject *po;
624 	enum write_one_status status;
625 	struct git_pack_header ph;
626 	git_oid entry_oid;
627 	size_t i = 0;
628 	int error = 0;
629 
630 	write_order = compute_write_order(pb);
631 	if (write_order == NULL)
632 		return -1;
633 
634 	if (!git__is_uint32(pb->nr_objects)) {
635 		git_error_set(GIT_ERROR_INVALID, "too many objects");
636 		return -1;
637 	}
638 
639 	/* Write pack header */
640 	ph.hdr_signature = htonl(PACK_SIGNATURE);
641 	ph.hdr_version = htonl(PACK_VERSION);
642 	ph.hdr_entries = htonl(pb->nr_objects);
643 
644 	if ((error = write_cb(&ph, sizeof(ph), cb_data)) < 0 ||
645 		(error = git_hash_update(&pb->ctx, &ph, sizeof(ph))) < 0)
646 		goto done;
647 
648 	pb->nr_remaining = pb->nr_objects;
649 	do {
650 		pb->nr_written = 0;
651 		for ( ; i < pb->nr_objects; ++i) {
652 			po = write_order[i];
653 
654 			if ((error = write_one(&status, pb, po, write_cb, cb_data)) < 0)
655 				goto done;
656 		}
657 
658 		pb->nr_remaining -= pb->nr_written;
659 	} while (pb->nr_remaining && i < pb->nr_objects);
660 
661 	if ((error = git_hash_final(&entry_oid, &pb->ctx)) < 0)
662 		goto done;
663 
664 	error = write_cb(entry_oid.id, GIT_OID_RAWSZ, cb_data);
665 
666 done:
667 	/* if callback cancelled writing, we must still free delta_data */
668 	for ( ; i < pb->nr_objects; ++i) {
669 		po = write_order[i];
670 		if (po->delta_data) {
671 			git__free(po->delta_data);
672 			po->delta_data = NULL;
673 		}
674 	}
675 
676 	git__free(write_order);
677 	return error;
678 }
679 
write_pack_buf(void * buf,size_t size,void * data)680 static int write_pack_buf(void *buf, size_t size, void *data)
681 {
682 	git_buf *b = (git_buf *)data;
683 	return git_buf_put(b, buf, size);
684 }
685 
type_size_sort(const void * _a,const void * _b)686 static int type_size_sort(const void *_a, const void *_b)
687 {
688 	const git_pobject *a = (git_pobject *)_a;
689 	const git_pobject *b = (git_pobject *)_b;
690 
691 	if (a->type > b->type)
692 		return -1;
693 	if (a->type < b->type)
694 		return 1;
695 	if (a->hash > b->hash)
696 		return -1;
697 	if (a->hash < b->hash)
698 		return 1;
699 	/*
700 	 * TODO
701 	 *
702 	if (a->preferred_base > b->preferred_base)
703 		return -1;
704 	if (a->preferred_base < b->preferred_base)
705 		return 1;
706 	*/
707 	if (a->size > b->size)
708 		return -1;
709 	if (a->size < b->size)
710 		return 1;
711 	return a < b ? -1 : (a > b); /* newest first */
712 }
713 
delta_cacheable(git_packbuilder * pb,size_t src_size,size_t trg_size,size_t delta_size)714 static int delta_cacheable(
715 	git_packbuilder *pb,
716 	size_t src_size,
717 	size_t trg_size,
718 	size_t delta_size)
719 {
720 	size_t new_size;
721 
722 	if (git__add_sizet_overflow(&new_size, pb->delta_cache_size, delta_size))
723 		return 0;
724 
725 	if (pb->max_delta_cache_size && new_size > pb->max_delta_cache_size)
726 		return 0;
727 
728 	if (delta_size < pb->cache_max_small_delta_size)
729 		return 1;
730 
731 	/* cache delta, if objects are large enough compared to delta size */
732 	if ((src_size >> 20) + (trg_size >> 21) > (delta_size >> 10))
733 		return 1;
734 
735 	return 0;
736 }
737 
try_delta(git_packbuilder * pb,struct unpacked * trg,struct unpacked * src,size_t max_depth,size_t * mem_usage,int * ret)738 static int try_delta(git_packbuilder *pb, struct unpacked *trg,
739 		     struct unpacked *src, size_t max_depth,
740 			 size_t *mem_usage, int *ret)
741 {
742 	git_pobject *trg_object = trg->object;
743 	git_pobject *src_object = src->object;
744 	git_odb_object *obj;
745 	size_t trg_size, src_size, delta_size, sizediff, max_size, sz;
746 	size_t ref_depth;
747 	void *delta_buf;
748 
749 	/* Don't bother doing diffs between different types */
750 	if (trg_object->type != src_object->type) {
751 		*ret = -1;
752 		return 0;
753 	}
754 
755 	*ret = 0;
756 
757 	/* TODO: support reuse-delta */
758 
759 	/* Let's not bust the allowed depth. */
760 	if (src->depth >= max_depth)
761 		return 0;
762 
763 	/* Now some size filtering heuristics. */
764 	trg_size = trg_object->size;
765 	if (!trg_object->delta) {
766 		max_size = trg_size/2 - 20;
767 		ref_depth = 1;
768 	} else {
769 		max_size = trg_object->delta_size;
770 		ref_depth = trg->depth;
771 	}
772 
773 	max_size = (uint64_t)max_size * (max_depth - src->depth) /
774 					(max_depth - ref_depth + 1);
775 	if (max_size == 0)
776 		return 0;
777 
778 	src_size = src_object->size;
779 	sizediff = src_size < trg_size ? trg_size - src_size : 0;
780 	if (sizediff >= max_size)
781 		return 0;
782 	if (trg_size < src_size / 32)
783 		return 0;
784 
785 	/* Load data if not already done */
786 	if (!trg->data) {
787 		if (git_odb_read(&obj, pb->odb, &trg_object->id) < 0)
788 			return -1;
789 
790 		sz = git_odb_object_size(obj);
791 		trg->data = git__malloc(sz);
792 		GIT_ERROR_CHECK_ALLOC(trg->data);
793 		memcpy(trg->data, git_odb_object_data(obj), sz);
794 
795 		git_odb_object_free(obj);
796 
797 		if (sz != trg_size) {
798 			git_error_set(GIT_ERROR_INVALID,
799 				   "inconsistent target object length");
800 			return -1;
801 		}
802 
803 		*mem_usage += sz;
804 	}
805 	if (!src->data) {
806 		size_t obj_sz;
807 
808 		if (git_odb_read(&obj, pb->odb, &src_object->id) < 0 ||
809 			!git__is_ulong(obj_sz = git_odb_object_size(obj)))
810 			return -1;
811 
812 		sz = obj_sz;
813 		src->data = git__malloc(sz);
814 		GIT_ERROR_CHECK_ALLOC(src->data);
815 		memcpy(src->data, git_odb_object_data(obj), sz);
816 
817 		git_odb_object_free(obj);
818 
819 		if (sz != src_size) {
820 			git_error_set(GIT_ERROR_INVALID,
821 				   "inconsistent source object length");
822 			return -1;
823 		}
824 
825 		*mem_usage += sz;
826 	}
827 	if (!src->index) {
828 		if (git_delta_index_init(&src->index, src->data, src_size) < 0)
829 			return 0; /* suboptimal pack - out of memory */
830 
831 		*mem_usage += git_delta_index_size(src->index);
832 	}
833 
834 	if (git_delta_create_from_index(&delta_buf, &delta_size, src->index, trg->data, trg_size,
835 		max_size) < 0)
836 		return 0;
837 
838 	if (trg_object->delta) {
839 		/* Prefer only shallower same-sized deltas. */
840 		if (delta_size == trg_object->delta_size &&
841 		    src->depth + 1 >= trg->depth) {
842 			git__free(delta_buf);
843 			return 0;
844 		}
845 	}
846 
847 	GIT_ASSERT(git_packbuilder__cache_lock(pb) == 0);
848 
849 	if (trg_object->delta_data) {
850 		git__free(trg_object->delta_data);
851 		GIT_ASSERT(pb->delta_cache_size >= trg_object->delta_size);
852 		pb->delta_cache_size -= trg_object->delta_size;
853 		trg_object->delta_data = NULL;
854 	}
855 	if (delta_cacheable(pb, src_size, trg_size, delta_size)) {
856 		bool overflow = git__add_sizet_overflow(
857 			&pb->delta_cache_size, pb->delta_cache_size, delta_size);
858 
859 		GIT_ASSERT(git_packbuilder__cache_unlock(pb) == 0);
860 
861 		if (overflow) {
862 			git__free(delta_buf);
863 			return -1;
864 		}
865 
866 		trg_object->delta_data = git__realloc(delta_buf, delta_size);
867 		GIT_ERROR_CHECK_ALLOC(trg_object->delta_data);
868 	} else {
869 		/* create delta when writing the pack */
870 		GIT_ASSERT(git_packbuilder__cache_unlock(pb) == 0);
871 		git__free(delta_buf);
872 	}
873 
874 	trg_object->delta = src_object;
875 	trg_object->delta_size = delta_size;
876 	trg->depth = src->depth + 1;
877 
878 	*ret = 1;
879 	return 0;
880 }
881 
check_delta_limit(git_pobject * me,size_t n)882 static size_t check_delta_limit(git_pobject *me, size_t n)
883 {
884 	git_pobject *child = me->delta_child;
885 	size_t m = n;
886 
887 	while (child) {
888 		size_t c = check_delta_limit(child, n + 1);
889 		if (m < c)
890 			m = c;
891 		child = child->delta_sibling;
892 	}
893 	return m;
894 }
895 
free_unpacked(struct unpacked * n)896 static size_t free_unpacked(struct unpacked *n)
897 {
898 	size_t freed_mem = 0;
899 
900 	if (n->index) {
901 		freed_mem += git_delta_index_size(n->index);
902 		git_delta_index_free(n->index);
903 	}
904 	n->index = NULL;
905 
906 	if (n->data) {
907 		freed_mem += n->object->size;
908 		git__free(n->data);
909 		n->data = NULL;
910 	}
911 	n->object = NULL;
912 	n->depth = 0;
913 	return freed_mem;
914 }
915 
report_delta_progress(git_packbuilder * pb,uint32_t count,bool force)916 static int report_delta_progress(
917 	git_packbuilder *pb, uint32_t count, bool force)
918 {
919 	int ret;
920 
921 	if (pb->progress_cb) {
922 		double current_time = git__timer();
923 		double elapsed = current_time - pb->last_progress_report_time;
924 
925 		if (force || elapsed >= MIN_PROGRESS_UPDATE_INTERVAL) {
926 			pb->last_progress_report_time = current_time;
927 
928 			ret = pb->progress_cb(
929 				GIT_PACKBUILDER_DELTAFICATION,
930 				count, pb->nr_objects, pb->progress_cb_payload);
931 
932 			if (ret)
933 				return git_error_set_after_callback(ret);
934 		}
935 	}
936 
937 	return 0;
938 }
939 
find_deltas(git_packbuilder * pb,git_pobject ** list,size_t * list_size,size_t window,size_t depth)940 static int find_deltas(git_packbuilder *pb, git_pobject **list,
941 	size_t *list_size, size_t window, size_t depth)
942 {
943 	git_pobject *po;
944 	git_buf zbuf = GIT_BUF_INIT;
945 	struct unpacked *array;
946 	size_t idx = 0, count = 0;
947 	size_t mem_usage = 0;
948 	size_t i;
949 	int error = -1;
950 
951 	array = git__calloc(window, sizeof(struct unpacked));
952 	GIT_ERROR_CHECK_ALLOC(array);
953 
954 	for (;;) {
955 		struct unpacked *n = array + idx;
956 		size_t max_depth, j, best_base = SIZE_MAX;
957 
958 		GIT_ASSERT(git_packbuilder__progress_lock(pb) == 0);
959 		if (!*list_size) {
960 			GIT_ASSERT(git_packbuilder__progress_unlock(pb) == 0);
961 			break;
962 		}
963 
964 		pb->nr_deltified += 1;
965 		report_delta_progress(pb, pb->nr_deltified, false);
966 
967 		po = *list++;
968 		(*list_size)--;
969 		GIT_ASSERT(git_packbuilder__progress_unlock(pb) == 0);
970 
971 		mem_usage -= free_unpacked(n);
972 		n->object = po;
973 
974 		while (pb->window_memory_limit &&
975 		       mem_usage > pb->window_memory_limit &&
976 		       count > 1) {
977 			size_t tail = (idx + window - count) % window;
978 			mem_usage -= free_unpacked(array + tail);
979 			count--;
980 		}
981 
982 		/*
983 		 * If the current object is at pack edge, take the depth the
984 		 * objects that depend on the current object into account
985 		 * otherwise they would become too deep.
986 		 */
987 		max_depth = depth;
988 		if (po->delta_child) {
989 			size_t delta_limit = check_delta_limit(po, 0);
990 
991 			if (delta_limit > max_depth)
992 				goto next;
993 
994 			max_depth -= delta_limit;
995 		}
996 
997 		j = window;
998 		while (--j > 0) {
999 			int ret;
1000 			size_t other_idx = idx + j;
1001 			struct unpacked *m;
1002 
1003 			if (other_idx >= window)
1004 				other_idx -= window;
1005 
1006 			m = array + other_idx;
1007 			if (!m->object)
1008 				break;
1009 
1010 			if (try_delta(pb, n, m, max_depth, &mem_usage, &ret) < 0)
1011 				goto on_error;
1012 			if (ret < 0)
1013 				break;
1014 			else if (ret > 0)
1015 				best_base = other_idx;
1016 		}
1017 
1018 		/*
1019 		 * If we decided to cache the delta data, then it is best
1020 		 * to compress it right away.  First because we have to do
1021 		 * it anyway, and doing it here while we're threaded will
1022 		 * save a lot of time in the non threaded write phase,
1023 		 * as well as allow for caching more deltas within
1024 		 * the same cache size limit.
1025 		 * ...
1026 		 * But only if not writing to stdout, since in that case
1027 		 * the network is most likely throttling writes anyway,
1028 		 * and therefore it is best to go to the write phase ASAP
1029 		 * instead, as we can afford spending more time compressing
1030 		 * between writes at that moment.
1031 		 */
1032 		if (po->delta_data) {
1033 			if (git_zstream_deflatebuf(&zbuf, po->delta_data, po->delta_size) < 0)
1034 				goto on_error;
1035 
1036 			git__free(po->delta_data);
1037 			po->delta_data = git__malloc(zbuf.size);
1038 			GIT_ERROR_CHECK_ALLOC(po->delta_data);
1039 
1040 			memcpy(po->delta_data, zbuf.ptr, zbuf.size);
1041 			po->z_delta_size = zbuf.size;
1042 			git_buf_clear(&zbuf);
1043 
1044 			GIT_ASSERT(git_packbuilder__cache_lock(pb) == 0);
1045 			pb->delta_cache_size -= po->delta_size;
1046 			pb->delta_cache_size += po->z_delta_size;
1047 			GIT_ASSERT(git_packbuilder__cache_unlock(pb) == 0);
1048 		}
1049 
1050 		/*
1051 		 * If we made n a delta, and if n is already at max
1052 		 * depth, leaving it in the window is pointless.  we
1053 		 * should evict it first.
1054 		 */
1055 		if (po->delta && max_depth <= n->depth)
1056 			continue;
1057 
1058 		/*
1059 		 * Move the best delta base up in the window, after the
1060 		 * currently deltified object, to keep it longer.  It will
1061 		 * be the first base object to be attempted next.
1062 		 */
1063 		if (po->delta) {
1064 			struct unpacked swap = array[best_base];
1065 			size_t dist = (window + idx - best_base) % window;
1066 			size_t dst = best_base;
1067 			while (dist--) {
1068 				size_t src = (dst + 1) % window;
1069 				array[dst] = array[src];
1070 				dst = src;
1071 			}
1072 			array[dst] = swap;
1073 		}
1074 
1075 		next:
1076 		idx++;
1077 		if (count + 1 < window)
1078 			count++;
1079 		if (idx >= window)
1080 			idx = 0;
1081 	}
1082 	error = 0;
1083 
1084 on_error:
1085 	for (i = 0; i < window; ++i) {
1086 		git__free(array[i].index);
1087 		git__free(array[i].data);
1088 	}
1089 	git__free(array);
1090 	git_buf_dispose(&zbuf);
1091 
1092 	return error;
1093 }
1094 
1095 #ifdef GIT_THREADS
1096 
1097 struct thread_params {
1098 	git_thread thread;
1099 	git_packbuilder *pb;
1100 
1101 	git_pobject **list;
1102 
1103 	git_cond cond;
1104 	git_mutex mutex;
1105 
1106 	size_t list_size;
1107 	size_t remaining;
1108 
1109 	size_t window;
1110 	size_t depth;
1111 	size_t working;
1112 	size_t data_ready;
1113 };
1114 
threaded_find_deltas(void * arg)1115 static void *threaded_find_deltas(void *arg)
1116 {
1117 	struct thread_params *me = arg;
1118 
1119 	while (me->remaining) {
1120 		if (find_deltas(me->pb, me->list, &me->remaining,
1121 				me->window, me->depth) < 0) {
1122 			; /* TODO */
1123 		}
1124 
1125 		GIT_ASSERT_WITH_RETVAL(git_packbuilder__progress_lock(me->pb) == 0, NULL);
1126 		me->working = 0;
1127 		git_cond_signal(&me->pb->progress_cond);
1128 		GIT_ASSERT_WITH_RETVAL(git_packbuilder__progress_unlock(me->pb) == 0, NULL);
1129 
1130 		if (git_mutex_lock(&me->mutex)) {
1131 			git_error_set(GIT_ERROR_THREAD, "unable to lock packfile condition mutex");
1132 			return NULL;
1133 		}
1134 
1135 		while (!me->data_ready)
1136 			git_cond_wait(&me->cond, &me->mutex);
1137 
1138 		/*
1139 		 * We must not set ->data_ready before we wait on the
1140 		 * condition because the main thread may have set it to 1
1141 		 * before we get here. In order to be sure that new
1142 		 * work is available if we see 1 in ->data_ready, it
1143 		 * was initialized to 0 before this thread was spawned
1144 		 * and we reset it to 0 right away.
1145 		 */
1146 		me->data_ready = 0;
1147 		git_mutex_unlock(&me->mutex);
1148 	}
1149 	/* leave ->working 1 so that this doesn't get more work assigned */
1150 	return NULL;
1151 }
1152 
ll_find_deltas(git_packbuilder * pb,git_pobject ** list,size_t list_size,size_t window,size_t depth)1153 static int ll_find_deltas(git_packbuilder *pb, git_pobject **list,
1154 			  size_t list_size, size_t window, size_t depth)
1155 {
1156 	struct thread_params *p;
1157 	size_t i;
1158 	int ret, active_threads = 0;
1159 
1160 	if (!pb->nr_threads)
1161 		pb->nr_threads = git__online_cpus();
1162 
1163 	if (pb->nr_threads <= 1) {
1164 		find_deltas(pb, list, &list_size, window, depth);
1165 		return 0;
1166 	}
1167 
1168 	p = git__mallocarray(pb->nr_threads, sizeof(*p));
1169 	GIT_ERROR_CHECK_ALLOC(p);
1170 
1171 	/* Partition the work among the threads */
1172 	for (i = 0; i < pb->nr_threads; ++i) {
1173 		size_t sub_size = list_size / (pb->nr_threads - i);
1174 
1175 		/* don't use too small segments or no deltas will be found */
1176 		if (sub_size < 2*window && i+1 < pb->nr_threads)
1177 			sub_size = 0;
1178 
1179 		p[i].pb = pb;
1180 		p[i].window = window;
1181 		p[i].depth = depth;
1182 		p[i].working = 1;
1183 		p[i].data_ready = 0;
1184 
1185 		/* try to split chunks on "path" boundaries */
1186 		while (sub_size && sub_size < list_size &&
1187 		       list[sub_size]->hash &&
1188 		       list[sub_size]->hash == list[sub_size-1]->hash)
1189 			sub_size++;
1190 
1191 		p[i].list = list;
1192 		p[i].list_size = sub_size;
1193 		p[i].remaining = sub_size;
1194 
1195 		list += sub_size;
1196 		list_size -= sub_size;
1197 	}
1198 
1199 	/* Start work threads */
1200 	for (i = 0; i < pb->nr_threads; ++i) {
1201 		if (!p[i].list_size)
1202 			continue;
1203 
1204 		git_mutex_init(&p[i].mutex);
1205 		git_cond_init(&p[i].cond);
1206 
1207 		ret = git_thread_create(&p[i].thread,
1208 					threaded_find_deltas, &p[i]);
1209 		if (ret) {
1210 			git_error_set(GIT_ERROR_THREAD, "unable to create thread");
1211 			return -1;
1212 		}
1213 		active_threads++;
1214 	}
1215 
1216 	/*
1217 	 * Now let's wait for work completion.  Each time a thread is done
1218 	 * with its work, we steal half of the remaining work from the
1219 	 * thread with the largest number of unprocessed objects and give
1220 	 * it to that newly idle thread.  This ensure good load balancing
1221 	 * until the remaining object list segments are simply too short
1222 	 * to be worth splitting anymore.
1223 	 */
1224 	while (active_threads) {
1225 		struct thread_params *target = NULL;
1226 		struct thread_params *victim = NULL;
1227 		size_t sub_size = 0;
1228 
1229 		/* Start by locating a thread that has transitioned its
1230 		 * 'working' flag from 1 -> 0. This indicates that it is
1231 		 * ready to receive more work using our work-stealing
1232 		 * algorithm. */
1233 		GIT_ASSERT(git_packbuilder__progress_lock(pb) == 0);
1234 		for (;;) {
1235 			for (i = 0; !target && i < pb->nr_threads; i++)
1236 				if (!p[i].working)
1237 					target = &p[i];
1238 			if (target)
1239 				break;
1240 			git_cond_wait(&pb->progress_cond, &pb->progress_mutex);
1241 		}
1242 
1243 		/* At this point we hold the progress lock and have located
1244 		 * a thread to receive more work. We still need to locate a
1245 		 * thread from which to steal work (the victim). */
1246 		for (i = 0; i < pb->nr_threads; i++)
1247 			if (p[i].remaining > 2*window &&
1248 			    (!victim || victim->remaining < p[i].remaining))
1249 				victim = &p[i];
1250 
1251 		if (victim) {
1252 			sub_size = victim->remaining / 2;
1253 			list = victim->list + victim->list_size - sub_size;
1254 			while (sub_size && list[0]->hash &&
1255 			       list[0]->hash == list[-1]->hash) {
1256 				list++;
1257 				sub_size--;
1258 			}
1259 			if (!sub_size) {
1260 				/*
1261 				 * It is possible for some "paths" to have
1262 				 * so many objects that no hash boundary
1263 				 * might be found.  Let's just steal the
1264 				 * exact half in that case.
1265 				 */
1266 				sub_size = victim->remaining / 2;
1267 				list -= sub_size;
1268 			}
1269 			target->list = list;
1270 			victim->list_size -= sub_size;
1271 			victim->remaining -= sub_size;
1272 		}
1273 		target->list_size = sub_size;
1274 		target->remaining = sub_size;
1275 		target->working = 1;
1276 		GIT_ASSERT(git_packbuilder__progress_unlock(pb) == 0);
1277 
1278 		if (git_mutex_lock(&target->mutex)) {
1279 			git_error_set(GIT_ERROR_THREAD, "unable to lock packfile condition mutex");
1280 			git__free(p);
1281 			return -1;
1282 		}
1283 
1284 		target->data_ready = 1;
1285 		git_cond_signal(&target->cond);
1286 		git_mutex_unlock(&target->mutex);
1287 
1288 		if (!sub_size) {
1289 			git_thread_join(&target->thread, NULL);
1290 			git_cond_free(&target->cond);
1291 			git_mutex_free(&target->mutex);
1292 			active_threads--;
1293 		}
1294 	}
1295 
1296 	git__free(p);
1297 	return 0;
1298 }
1299 
1300 #else
1301 #define ll_find_deltas(pb, l, ls, w, d) find_deltas(pb, l, &ls, w, d)
1302 #endif
1303 
prepare_pack(git_packbuilder * pb)1304 static int prepare_pack(git_packbuilder *pb)
1305 {
1306 	git_pobject **delta_list;
1307 	size_t i, n = 0;
1308 
1309 	if (pb->nr_objects == 0 || pb->done)
1310 		return 0; /* nothing to do */
1311 
1312 	/*
1313 	 * Although we do not report progress during deltafication, we
1314 	 * at least report that we are in the deltafication stage
1315 	 */
1316 	if (pb->progress_cb)
1317 			pb->progress_cb(GIT_PACKBUILDER_DELTAFICATION, 0, pb->nr_objects, pb->progress_cb_payload);
1318 
1319 	delta_list = git__mallocarray(pb->nr_objects, sizeof(*delta_list));
1320 	GIT_ERROR_CHECK_ALLOC(delta_list);
1321 
1322 	for (i = 0; i < pb->nr_objects; ++i) {
1323 		git_pobject *po = pb->object_list + i;
1324 
1325 		/* Make sure the item is within our size limits */
1326 		if (po->size < 50 || po->size > pb->big_file_threshold)
1327 			continue;
1328 
1329 		delta_list[n++] = po;
1330 	}
1331 
1332 	if (n > 1) {
1333 		git__tsort((void **)delta_list, n, type_size_sort);
1334 		if (ll_find_deltas(pb, delta_list, n,
1335 				   GIT_PACK_WINDOW + 1,
1336 				   GIT_PACK_DEPTH) < 0) {
1337 			git__free(delta_list);
1338 			return -1;
1339 		}
1340 	}
1341 
1342 	report_delta_progress(pb, pb->nr_objects, true);
1343 
1344 	pb->done = true;
1345 	git__free(delta_list);
1346 	return 0;
1347 }
1348 
1349 #define PREPARE_PACK if (prepare_pack(pb) < 0) { return -1; }
1350 
git_packbuilder_foreach(git_packbuilder * pb,int (* cb)(void * buf,size_t size,void * payload),void * payload)1351 int git_packbuilder_foreach(git_packbuilder *pb, int (*cb)(void *buf, size_t size, void *payload), void *payload)
1352 {
1353 	PREPARE_PACK;
1354 	return write_pack(pb, cb, payload);
1355 }
1356 
git_packbuilder_write_buf(git_buf * buf,git_packbuilder * pb)1357 int git_packbuilder_write_buf(git_buf *buf, git_packbuilder *pb)
1358 {
1359 	int error;
1360 
1361 	if ((error = git_buf_sanitize(buf)) < 0)
1362 		return error;
1363 
1364 	PREPARE_PACK;
1365 
1366 	return write_pack(pb, &write_pack_buf, buf);
1367 }
1368 
write_cb(void * buf,size_t len,void * payload)1369 static int write_cb(void *buf, size_t len, void *payload)
1370 {
1371 	struct pack_write_context *ctx = payload;
1372 	return git_indexer_append(ctx->indexer, buf, len, ctx->stats);
1373 }
1374 
git_packbuilder_write(git_packbuilder * pb,const char * path,unsigned int mode,git_indexer_progress_cb progress_cb,void * progress_cb_payload)1375 int git_packbuilder_write(
1376 	git_packbuilder *pb,
1377 	const char *path,
1378 	unsigned int mode,
1379 	git_indexer_progress_cb progress_cb,
1380 	void *progress_cb_payload)
1381 {
1382 	int error = -1;
1383 	git_buf object_path = GIT_BUF_INIT;
1384 	git_indexer_options opts = GIT_INDEXER_OPTIONS_INIT;
1385 	git_indexer *indexer = NULL;
1386 	git_indexer_progress stats;
1387 	struct pack_write_context ctx;
1388 	int t;
1389 
1390 	PREPARE_PACK;
1391 
1392 	if (path == NULL) {
1393 		if ((error = git_repository_item_path(&object_path, pb->repo, GIT_REPOSITORY_ITEM_OBJECTS)) < 0)
1394 			goto cleanup;
1395 		if ((error = git_buf_joinpath(&object_path, git_buf_cstr(&object_path), "pack")) < 0)
1396 			goto cleanup;
1397 		path = git_buf_cstr(&object_path);
1398 	}
1399 
1400 	opts.progress_cb = progress_cb;
1401 	opts.progress_cb_payload = progress_cb_payload;
1402 
1403 	if ((error = git_indexer_new(&indexer, path, mode, pb->odb, &opts)) < 0)
1404 		goto cleanup;
1405 
1406 	if (!git_repository__configmap_lookup(&t, pb->repo, GIT_CONFIGMAP_FSYNCOBJECTFILES) && t)
1407 		git_indexer__set_fsync(indexer, 1);
1408 
1409 	ctx.indexer = indexer;
1410 	ctx.stats = &stats;
1411 
1412 	if ((error = git_packbuilder_foreach(pb, write_cb, &ctx)) < 0)
1413 		goto cleanup;
1414 
1415 	if ((error = git_indexer_commit(indexer, &stats)) < 0)
1416 		goto cleanup;
1417 
1418 	git_oid_cpy(&pb->pack_oid, git_indexer_hash(indexer));
1419 
1420 cleanup:
1421 	git_indexer_free(indexer);
1422 	git_buf_dispose(&object_path);
1423 	return error;
1424 }
1425 
1426 #undef PREPARE_PACK
1427 
git_packbuilder_hash(git_packbuilder * pb)1428 const git_oid *git_packbuilder_hash(git_packbuilder *pb)
1429 {
1430 	return &pb->pack_oid;
1431 }
1432 
1433 
cb_tree_walk(const char * root,const git_tree_entry * entry,void * payload)1434 static int cb_tree_walk(
1435 	const char *root, const git_tree_entry *entry, void *payload)
1436 {
1437 	int error;
1438 	struct tree_walk_context *ctx = payload;
1439 
1440 	/* A commit inside a tree represents a submodule commit and should be skipped. */
1441 	if (git_tree_entry_type(entry) == GIT_OBJECT_COMMIT)
1442 		return 0;
1443 
1444 	if (!(error = git_buf_sets(&ctx->buf, root)) &&
1445 		!(error = git_buf_puts(&ctx->buf, git_tree_entry_name(entry))))
1446 		error = git_packbuilder_insert(
1447 			ctx->pb, git_tree_entry_id(entry), git_buf_cstr(&ctx->buf));
1448 
1449 	return error;
1450 }
1451 
git_packbuilder_insert_commit(git_packbuilder * pb,const git_oid * oid)1452 int git_packbuilder_insert_commit(git_packbuilder *pb, const git_oid *oid)
1453 {
1454 	git_commit *commit;
1455 
1456 	if (git_commit_lookup(&commit, pb->repo, oid) < 0 ||
1457 		git_packbuilder_insert(pb, oid, NULL) < 0)
1458 		return -1;
1459 
1460 	if (git_packbuilder_insert_tree(pb, git_commit_tree_id(commit)) < 0)
1461 		return -1;
1462 
1463 	git_commit_free(commit);
1464 	return 0;
1465 }
1466 
git_packbuilder_insert_tree(git_packbuilder * pb,const git_oid * oid)1467 int git_packbuilder_insert_tree(git_packbuilder *pb, const git_oid *oid)
1468 {
1469 	int error;
1470 	git_tree *tree = NULL;
1471 	struct tree_walk_context context = { pb, GIT_BUF_INIT };
1472 
1473 	if (!(error = git_tree_lookup(&tree, pb->repo, oid)) &&
1474 	    !(error = git_packbuilder_insert(pb, oid, NULL)))
1475 		error = git_tree_walk(tree, GIT_TREEWALK_PRE, cb_tree_walk, &context);
1476 
1477 	git_tree_free(tree);
1478 	git_buf_dispose(&context.buf);
1479 	return error;
1480 }
1481 
git_packbuilder_insert_recur(git_packbuilder * pb,const git_oid * id,const char * name)1482 int git_packbuilder_insert_recur(git_packbuilder *pb, const git_oid *id, const char *name)
1483 {
1484 	git_object *obj;
1485 	int error;
1486 
1487 	GIT_ASSERT_ARG(pb);
1488 	GIT_ASSERT_ARG(id);
1489 
1490 	if ((error = git_object_lookup(&obj, pb->repo, id, GIT_OBJECT_ANY)) < 0)
1491 		return error;
1492 
1493 	switch (git_object_type(obj)) {
1494 	case GIT_OBJECT_BLOB:
1495 		error = git_packbuilder_insert(pb, id, name);
1496 		break;
1497 	case GIT_OBJECT_TREE:
1498 		error = git_packbuilder_insert_tree(pb, id);
1499 		break;
1500 	case GIT_OBJECT_COMMIT:
1501 		error = git_packbuilder_insert_commit(pb, id);
1502 		break;
1503 	case GIT_OBJECT_TAG:
1504 		if ((error = git_packbuilder_insert(pb, id, name)) < 0)
1505 			goto cleanup;
1506 		error = git_packbuilder_insert_recur(pb, git_tag_target_id((git_tag *) obj), NULL);
1507 		break;
1508 
1509 	default:
1510 		git_error_set(GIT_ERROR_INVALID, "unknown object type");
1511 		error = -1;
1512 	}
1513 
1514 cleanup:
1515 	git_object_free(obj);
1516 	return error;
1517 }
1518 
git_packbuilder_object_count(git_packbuilder * pb)1519 size_t git_packbuilder_object_count(git_packbuilder *pb)
1520 {
1521 	return pb->nr_objects;
1522 }
1523 
git_packbuilder_written(git_packbuilder * pb)1524 size_t git_packbuilder_written(git_packbuilder *pb)
1525 {
1526 	return pb->nr_written;
1527 }
1528 
lookup_walk_object(struct walk_object ** out,git_packbuilder * pb,const git_oid * id)1529 static int lookup_walk_object(struct walk_object **out, git_packbuilder *pb, const git_oid *id)
1530 {
1531 	struct walk_object *obj;
1532 
1533 	obj = git_pool_mallocz(&pb->object_pool, 1);
1534 	if (!obj) {
1535 		git_error_set_oom();
1536 		return -1;
1537 	}
1538 
1539 	git_oid_cpy(&obj->id, id);
1540 
1541 	*out = obj;
1542 	return 0;
1543 }
1544 
retrieve_object(struct walk_object ** out,git_packbuilder * pb,const git_oid * id)1545 static int retrieve_object(struct walk_object **out, git_packbuilder *pb, const git_oid *id)
1546 {
1547 	struct walk_object *obj;
1548 	int error;
1549 
1550 	if ((obj = git_oidmap_get(pb->walk_objects, id)) == NULL) {
1551 		if ((error = lookup_walk_object(&obj, pb, id)) < 0)
1552 			return error;
1553 
1554 		if ((error = git_oidmap_set(pb->walk_objects, &obj->id, obj)) < 0)
1555 			return error;
1556 	}
1557 
1558 	*out = obj;
1559 	return 0;
1560 }
1561 
mark_blob_uninteresting(git_packbuilder * pb,const git_oid * id)1562 static int mark_blob_uninteresting(git_packbuilder *pb, const git_oid *id)
1563 {
1564 	int error;
1565 	struct walk_object *obj;
1566 
1567 	if ((error = retrieve_object(&obj, pb, id)) < 0)
1568 		return error;
1569 
1570 	obj->uninteresting = 1;
1571 
1572 	return 0;
1573 }
1574 
mark_tree_uninteresting(git_packbuilder * pb,const git_oid * id)1575 static int mark_tree_uninteresting(git_packbuilder *pb, const git_oid *id)
1576 {
1577 	struct walk_object *obj;
1578 	git_tree *tree;
1579 	int error;
1580 	size_t i;
1581 
1582 	if ((error = retrieve_object(&obj, pb, id)) < 0)
1583 		return error;
1584 
1585 	if (obj->uninteresting)
1586 		return 0;
1587 
1588 	obj->uninteresting = 1;
1589 
1590 	if ((error = git_tree_lookup(&tree, pb->repo, id)) < 0)
1591 		return error;
1592 
1593 	for (i = 0; i < git_tree_entrycount(tree); i++) {
1594 		const git_tree_entry *entry = git_tree_entry_byindex(tree, i);
1595 		const git_oid *entry_id = git_tree_entry_id(entry);
1596 		switch (git_tree_entry_type(entry)) {
1597 		case GIT_OBJECT_TREE:
1598 			if ((error = mark_tree_uninteresting(pb, entry_id)) < 0)
1599 				goto cleanup;
1600 			break;
1601 		case GIT_OBJECT_BLOB:
1602 			if ((error = mark_blob_uninteresting(pb, entry_id)) < 0)
1603 				goto cleanup;
1604 			break;
1605 		default:
1606 			/* it's a submodule or something unknown, we don't want it */
1607 			;
1608 		}
1609 	}
1610 
1611 cleanup:
1612 	git_tree_free(tree);
1613 	return error;
1614 }
1615 
1616 /*
1617  * Mark the edges of the graph uninteresting. Since we start from a
1618  * git_revwalk, the commits are already uninteresting, but we need to
1619  * mark the trees and blobs.
1620  */
mark_edges_uninteresting(git_packbuilder * pb,git_commit_list * commits)1621 static int mark_edges_uninteresting(git_packbuilder *pb, git_commit_list *commits)
1622 {
1623 	int error;
1624 	git_commit_list *list;
1625 	git_commit *commit;
1626 
1627 	for (list = commits; list; list = list->next) {
1628 		if (!list->item->uninteresting)
1629 			continue;
1630 
1631 		if ((error = git_commit_lookup(&commit, pb->repo, &list->item->oid)) < 0)
1632 			return error;
1633 
1634 		error = mark_tree_uninteresting(pb, git_commit_tree_id(commit));
1635 		git_commit_free(commit);
1636 
1637 		if (error < 0)
1638 			return error;
1639 	}
1640 
1641 	return 0;
1642 }
1643 
pack_objects_insert_tree(git_packbuilder * pb,git_tree * tree)1644 static int pack_objects_insert_tree(git_packbuilder *pb, git_tree *tree)
1645 {
1646 	size_t i;
1647 	int error;
1648 	git_tree *subtree;
1649 	struct walk_object *obj;
1650 	const char *name;
1651 
1652 	if ((error = retrieve_object(&obj, pb, git_tree_id(tree))) < 0)
1653 		return error;
1654 
1655 	if (obj->seen || obj->uninteresting)
1656 		return 0;
1657 
1658 	obj->seen = 1;
1659 
1660 	if ((error = git_packbuilder_insert(pb, &obj->id, NULL)))
1661 		return error;
1662 
1663 	for (i = 0; i < git_tree_entrycount(tree); i++) {
1664 		const git_tree_entry *entry = git_tree_entry_byindex(tree, i);
1665 		const git_oid *entry_id = git_tree_entry_id(entry);
1666 		switch (git_tree_entry_type(entry)) {
1667 		case GIT_OBJECT_TREE:
1668 			if ((error = git_tree_lookup(&subtree, pb->repo, entry_id)) < 0)
1669 				return error;
1670 
1671 			error = pack_objects_insert_tree(pb, subtree);
1672 			git_tree_free(subtree);
1673 
1674 			if (error < 0)
1675 				return error;
1676 
1677 			break;
1678 		case GIT_OBJECT_BLOB:
1679 			if ((error = retrieve_object(&obj, pb, entry_id)) < 0)
1680 				return error;
1681 			if (obj->uninteresting)
1682 				continue;
1683 			name = git_tree_entry_name(entry);
1684 			if ((error = git_packbuilder_insert(pb, entry_id, name)) < 0)
1685 				return error;
1686 			break;
1687 		default:
1688 			/* it's a submodule or something unknown, we don't want it */
1689 			;
1690 		}
1691 	}
1692 
1693 
1694 	return error;
1695 }
1696 
pack_objects_insert_commit(git_packbuilder * pb,struct walk_object * obj)1697 static int pack_objects_insert_commit(git_packbuilder *pb, struct walk_object *obj)
1698 {
1699 	int error;
1700 	git_commit *commit = NULL;
1701 	git_tree *tree = NULL;
1702 
1703 	obj->seen = 1;
1704 
1705 	if ((error = git_packbuilder_insert(pb, &obj->id, NULL)) < 0)
1706 		return error;
1707 
1708 	if ((error = git_commit_lookup(&commit, pb->repo, &obj->id)) < 0)
1709 		return error;
1710 
1711 	if ((error = git_tree_lookup(&tree, pb->repo, git_commit_tree_id(commit))) < 0)
1712 		goto cleanup;
1713 
1714 	if ((error = pack_objects_insert_tree(pb, tree)) < 0)
1715 		goto cleanup;
1716 
1717 cleanup:
1718 	git_commit_free(commit);
1719 	git_tree_free(tree);
1720 	return error;
1721 }
1722 
git_packbuilder_insert_walk(git_packbuilder * pb,git_revwalk * walk)1723 int git_packbuilder_insert_walk(git_packbuilder *pb, git_revwalk *walk)
1724 {
1725 	int error;
1726 	git_oid id;
1727 	struct walk_object *obj;
1728 
1729 	GIT_ASSERT_ARG(pb);
1730 	GIT_ASSERT_ARG(walk);
1731 
1732 	if ((error = mark_edges_uninteresting(pb, walk->user_input)) < 0)
1733 		return error;
1734 
1735 	/*
1736 	 * TODO: git marks the parents of the edges
1737 	 * uninteresting. This may provide a speed advantage, but does
1738 	 * seem to assume the remote does not have a single-commit
1739 	 * history on the other end.
1740 	 */
1741 
1742 	/* walk down each tree up to the blobs and insert them, stopping when uninteresting */
1743 	while ((error = git_revwalk_next(&id, walk)) == 0) {
1744 		if ((error = retrieve_object(&obj, pb, &id)) < 0)
1745 			return error;
1746 
1747 		if (obj->seen || obj->uninteresting)
1748 			continue;
1749 
1750 		if ((error = pack_objects_insert_commit(pb, obj)) < 0)
1751 			return error;
1752 	}
1753 
1754 	if (error == GIT_ITEROVER)
1755 		error = 0;
1756 
1757 	return error;
1758 }
1759 
git_packbuilder_set_callbacks(git_packbuilder * pb,git_packbuilder_progress progress_cb,void * progress_cb_payload)1760 int git_packbuilder_set_callbacks(git_packbuilder *pb, git_packbuilder_progress progress_cb, void *progress_cb_payload)
1761 {
1762 	if (!pb)
1763 		return -1;
1764 
1765 	pb->progress_cb = progress_cb;
1766 	pb->progress_cb_payload = progress_cb_payload;
1767 
1768 	return 0;
1769 }
1770 
git_packbuilder_free(git_packbuilder * pb)1771 void git_packbuilder_free(git_packbuilder *pb)
1772 {
1773 	if (pb == NULL)
1774 		return;
1775 
1776 #ifdef GIT_THREADS
1777 
1778 	git_mutex_free(&pb->cache_mutex);
1779 	git_mutex_free(&pb->progress_mutex);
1780 	git_cond_free(&pb->progress_cond);
1781 
1782 #endif
1783 
1784 	if (pb->odb)
1785 		git_odb_free(pb->odb);
1786 
1787 	if (pb->object_ix)
1788 		git_oidmap_free(pb->object_ix);
1789 
1790 	if (pb->object_list)
1791 		git__free(pb->object_list);
1792 
1793 	git_oidmap_free(pb->walk_objects);
1794 	git_pool_clear(&pb->object_pool);
1795 
1796 	git_hash_ctx_cleanup(&pb->ctx);
1797 	git_zstream_free(&pb->zstream);
1798 
1799 	git__free(pb);
1800 }
1801