1 /*
2  * Copyright (C) the libgit2 contributors. All rights reserved.
3  *
4  * This file is part of libgit2, distributed under the GNU GPL v2 with
5  * a Linking Exception. For full terms see the included COPYING file.
6  */
7 
8 #include "indexer.h"
9 
10 #include "git2/indexer.h"
11 #include "git2/object.h"
12 
13 #include "commit.h"
14 #include "tree.h"
15 #include "tag.h"
16 #include "pack.h"
17 #include "mwindow.h"
18 #include "posix.h"
19 #include "pack.h"
20 #include "filebuf.h"
21 #include "oid.h"
22 #include "oidarray.h"
23 #include "oidmap.h"
24 #include "zstream.h"
25 #include "object.h"
26 
/* Defined in another translation unit (presumably the mwindow code -- confirm). */
extern git_mutex git__mwindow_mutex;

/*
 * Upper bound on the object count a pack header may announce.
 * git_indexer_append() rejects packs declaring more with "too many objects".
 */
size_t git_indexer__max_objects = UINT32_MAX;

/*
 * Largest pack offset that is stored directly in the 32-bit `offset`
 * member of struct entry; larger offsets go to `offset_long`.
 */
#define UINT31_MAX (0x7FFFFFFF)
32 
/*
 * One record per indexed object. Records are collected in idx->objects,
 * whose comparator (objects_cmp) orders them by OID.
 */
struct entry {
	/* id of the object */
	git_oid oid;
	/* CRC32 of the object's bytes in the pack, kept in network byte order */
	uint32_t crc;
	/* pack offset, or UINT32_MAX when the offset does not fit (see offset_long) */
	uint32_t offset;
	/* full offset; only assigned when the offset exceeds UINT31_MAX */
	uint64_t offset_long;
};
39 
/* State kept by an indexer while a packfile is streamed in and indexed. */
struct git_indexer {
	/*
	 * parsed_header:  the pack header has been read (git_indexer_append).
	 * pack_committed: not touched by the code reviewed here -- presumably
	 *                 set once the pack is committed; confirm.
	 * have_stream:    a packfile stream is open for the current entry.
	 * have_delta:     the current entry is a REF_DELTA or OFS_DELTA.
	 * do_fsync:       set from git_repository__fsync_gitdir or through
	 *                 git_indexer__set_fsync().
	 * do_verify:      copied from the `verify` option; enables the
	 *                 connectivity checks.
	 */
	unsigned int parsed_header :1,
		pack_committed :1,
		have_stream :1,
		have_delta :1,
		do_fsync :1,
		do_verify :1;
	/* Copy of the pack header as read from the file */
	struct git_pack_header hdr;
	/* The packfile being written and parsed */
	struct git_pack_file *pack;
	/* File mode used when creating the temporary packfile */
	unsigned int mode;
	/* Current parse position inside the packfile */
	off64_t off;
	/* Offset at which the entry currently being parsed starts */
	off64_t entry_start;
	/* Type of the entry currently being parsed */
	git_object_t entry_type;
	/* Inflated data of the current entry; only filled when do_verify is set */
	git_buf entry_data;
	/* Stream used to inflate the current entry */
	git_packfile_stream stream;
	/* Number of objects announced by the pack header */
	size_t nr_objects;
	/* Vector of `struct entry *`, one per indexed object */
	git_vector objects;
	/* Vector of `struct delta_info *` still waiting to be resolved */
	git_vector deltas;
	/* fanout[n] counts the indexed objects whose first OID byte is <= n */
	unsigned int fanout[256];
	/* Hash context for the object currently being parsed */
	git_hash_ctx hash_ctx;
	/* Hash returned by git_indexer_hash() and used to name the index file */
	git_oid hash;
	/* Optional progress callback and its payload */
	git_indexer_progress_cb progress_cb;
	void *progress_payload;
	/* Scratch buffer for reads from the packfile stream */
	char objbuf[8*1024];

	/* OIDs referenced from pack objects. Used for verification. */
	git_oidmap *expected_oids;

	/* Needed to look up objects which we want to inject to fix a thin pack */
	git_odb *odb;

	/* Fields for calculating the packfile trailer (hash of everything before it) */
	char inbuf[GIT_OID_RAWSZ];
	size_t inbuf_len;
	git_hash_ctx trailer;
};
76 
/* Bookkeeping for a delta entry whose resolution is deferred. */
struct delta_info {
	/* Offset in the packfile at which the delta entry starts */
	off64_t delta_off;
};
80 
git_indexer_hash(const git_indexer * idx)81 const git_oid *git_indexer_hash(const git_indexer *idx)
82 {
83 	return &idx->hash;
84 }
85 
parse_header(struct git_pack_header * hdr,struct git_pack_file * pack)86 static int parse_header(struct git_pack_header *hdr, struct git_pack_file *pack)
87 {
88 	int error;
89 	git_map map;
90 
91 	if ((error = p_mmap(&map, sizeof(*hdr), GIT_PROT_READ, GIT_MAP_SHARED, pack->mwf.fd, 0)) < 0)
92 		return error;
93 
94 	memcpy(hdr, map.data, sizeof(*hdr));
95 	p_munmap(&map);
96 
97 	/* Verify we recognize this pack file format. */
98 	if (hdr->hdr_signature != ntohl(PACK_SIGNATURE)) {
99 		git_error_set(GIT_ERROR_INDEXER, "wrong pack signature");
100 		return -1;
101 	}
102 
103 	if (!pack_version_ok(hdr->hdr_version)) {
104 		git_error_set(GIT_ERROR_INDEXER, "wrong pack version");
105 		return -1;
106 	}
107 
108 	return 0;
109 }
110 
objects_cmp(const void * a,const void * b)111 static int objects_cmp(const void *a, const void *b)
112 {
113 	const struct entry *entrya = a;
114 	const struct entry *entryb = b;
115 
116 	return git_oid__cmp(&entrya->oid, &entryb->oid);
117 }
118 
/*
 * Initialize `opts` from the GIT_INDEXER_OPTIONS_INIT template for the
 * requested structure `version`.
 *
 * Returns 0 once the macro below has run to completion.
 * NOTE(review): GIT_INIT_STRUCTURE_FROM_TEMPLATE presumably validates
 * `version` and may return early with an error -- confirm against its
 * definition.
 */
int git_indexer_options_init(git_indexer_options *opts, unsigned int version)
{
	GIT_INIT_STRUCTURE_FROM_TEMPLATE(
		opts, version, git_indexer_options, GIT_INDEXER_OPTIONS_INIT);
	return 0;
}
125 
126 #ifndef GIT_DEPRECATE_HARD
git_indexer_init_options(git_indexer_options * opts,unsigned int version)127 int git_indexer_init_options(git_indexer_options *opts, unsigned int version)
128 {
129 	return git_indexer_options_init(opts, version);
130 }
131 #endif
132 
/*
 * Allocate a new indexer that writes the incoming pack to a temporary file.
 *
 * out:     receives the new indexer on success.
 * prefix:  directory joined with "/pack" to build the template handed to
 *          git_futils_mktmp().
 * mode:    file mode for the temporary pack; 0 selects GIT_PACK_FILE_MODE.
 * odb:     optional object database, stored for thin-pack fixing and
 *          connectivity checks.
 * in_opts: optional options; defaults are used when NULL.
 *
 * Returns 0 on success and -1 on failure. On failure the temporary file is
 * unlinked and the resources acquired so far (hash contexts, OID map) are
 * released together with the indexer itself.
 */
int git_indexer_new(
		git_indexer **out,
		const char *prefix,
		unsigned int mode,
		git_odb *odb,
		git_indexer_options *in_opts)
{
	git_indexer_options opts = GIT_INDEXER_OPTIONS_INIT;
	git_indexer *idx;
	git_buf path = GIT_BUF_INIT, tmp_path = GIT_BUF_INIT;
	static const char suff[] = "/pack";
	int error, fd = -1;
	/* Track which hash contexts were set up so that only those get cleaned */
	int hash_ctx_ready = 0, trailer_ready = 0;

	if (in_opts)
		memcpy(&opts, in_opts, sizeof(opts));

	idx = git__calloc(1, sizeof(git_indexer));
	GIT_ERROR_CHECK_ALLOC(idx);
	idx->odb = odb;
	idx->progress_cb = opts.progress_cb;
	idx->progress_payload = opts.progress_cb_payload;
	idx->mode = mode ? mode : GIT_PACK_FILE_MODE;
	git_buf_init(&idx->entry_data, 0);

	if ((error = git_hash_ctx_init(&idx->hash_ctx)) < 0)
		goto cleanup;
	hash_ctx_ready = 1;

	if ((error = git_hash_ctx_init(&idx->trailer)) < 0)
		goto cleanup;
	trailer_ready = 1;

	if ((error = git_oidmap_new(&idx->expected_oids)) < 0)
		goto cleanup;

	idx->do_verify = opts.verify;

	if (git_repository__fsync_gitdir)
		idx->do_fsync = 1;

	error = git_buf_joinpath(&path, prefix, suff);
	if (error < 0)
		goto cleanup;

	fd = git_futils_mktmp(&tmp_path, git_buf_cstr(&path), idx->mode);
	git_buf_dispose(&path);
	if (fd < 0)
		goto cleanup;

	error = git_packfile_alloc(&idx->pack, git_buf_cstr(&tmp_path));
	git_buf_dispose(&tmp_path);

	if (error < 0)
		goto cleanup;

	idx->pack->mwf.fd = fd;
	if ((error = git_mwindow_file_register(&idx->pack->mwf)) < 0)
		goto cleanup;

	*out = idx;
	return 0;

cleanup:
	if (fd != -1)
		p_close(fd);

	if (git_buf_len(&tmp_path) > 0)
		p_unlink(git_buf_cstr(&tmp_path));

	if (idx->pack != NULL)
		p_unlink(idx->pack->pack_name);

	/* Release what was acquired above; the original leaked these. */
	if (hash_ctx_ready)
		git_hash_ctx_cleanup(&idx->hash_ctx);
	if (trailer_ready)
		git_hash_ctx_cleanup(&idx->trailer);
	if (idx->expected_oids != NULL)
		git_oidmap_free(idx->expected_oids);
	git_buf_dispose(&idx->entry_data);

	/*
	 * NOTE(review): idx->pack itself is still not released here; its
	 * destructor also manages mwf.fd, which was already closed above, so
	 * releasing it needs a closer look at the packfile API.
	 */

	git_buf_dispose(&path);
	git_buf_dispose(&tmp_path);
	git__free(idx);
	return -1;
}
204 
/* Switch the indexer's fsync flag on (any non-zero value) or off (zero). */
void git_indexer__set_fsync(git_indexer *idx, int do_fsync)
{
	if (do_fsync != 0)
		idx->do_fsync = 1;
	else
		idx->do_fsync = 0;
}
209 
210 /* Try to store the delta so we can try to resolve it later */
store_delta(git_indexer * idx)211 static int store_delta(git_indexer *idx)
212 {
213 	struct delta_info *delta;
214 
215 	delta = git__calloc(1, sizeof(struct delta_info));
216 	GIT_ERROR_CHECK_ALLOC(delta);
217 	delta->delta_off = idx->entry_start;
218 
219 	if (git_vector_insert(&idx->deltas, delta) < 0)
220 		return -1;
221 
222 	return 0;
223 }
224 
/*
 * Feed the loose-object header for an object of the given `type` and `len`
 * into `ctx`.
 *
 * Returns the error from formatting the header, otherwise the result of
 * git_hash_update().
 */
static int hash_header(git_hash_ctx *ctx, off64_t len, git_object_t type)
{
	char header[64];
	size_t header_len;
	int error;

	error = git_odb__format_object_header(&header_len,
		header, sizeof(header), (size_t)len, type);
	if (error < 0)
		return error;

	return git_hash_update(ctx, header, header_len);
}
237 
/*
 * Drain `stream`, feeding everything read into the indexer's object hash
 * and, when verification is enabled, into idx->entry_data as well.
 *
 * Returns 0 once the stream reports end of data, the (negative) result of
 * git_packfile_stream_read() when reading fails, or the error from
 * buffering/hashing the data. The latter two were previously ignored,
 * which could silently produce a truncated buffer or a wrong hash.
 */
static int hash_object_stream(git_indexer*idx, git_packfile_stream *stream)
{
	ssize_t read;
	int error;

	assert(idx && stream);

	do {
		read = git_packfile_stream_read(stream, idx->objbuf, sizeof(idx->objbuf));
		if (read < 0)
			return (int)read;

		if (idx->do_verify &&
		    (error = git_buf_put(&idx->entry_data, idx->objbuf, (size_t)read)) < 0)
			return error;

		if ((error = git_hash_update(&idx->hash_ctx, idx->objbuf, (size_t)read)) < 0)
			return error;
	} while (read > 0);

	return 0;
}
259 
260 /* In order to create the packfile stream, we need to skip over the delta base description */
advance_delta_offset(git_indexer * idx,git_object_t type)261 static int advance_delta_offset(git_indexer *idx, git_object_t type)
262 {
263 	git_mwindow *w = NULL;
264 
265 	assert(type == GIT_OBJECT_REF_DELTA || type == GIT_OBJECT_OFS_DELTA);
266 
267 	if (type == GIT_OBJECT_REF_DELTA) {
268 		idx->off += GIT_OID_RAWSZ;
269 	} else {
270 		off64_t base_off;
271 		int error = get_delta_base(&base_off, idx->pack, &w, &idx->off, type, idx->entry_start);
272 		git_mwindow_close(&w);
273 		if (error < 0)
274 			return error;
275 	}
276 
277 	return 0;
278 }
279 
280 /* Read from the stream and discard any output */
read_object_stream(git_indexer * idx,git_packfile_stream * stream)281 static int read_object_stream(git_indexer *idx, git_packfile_stream *stream)
282 {
283 	ssize_t read;
284 
285 	assert(stream);
286 
287 	do {
288 		read = git_packfile_stream_read(stream, idx->objbuf, sizeof(idx->objbuf));
289 	} while (read > 0);
290 
291 	if (read < 0)
292 		return (int)read;
293 
294 	return 0;
295 }
296 
/*
 * Compute the CRC32 of `size` bytes of the pack starting at `start`.
 *
 * crc_out: receives the checksum in network byte order.
 * mwf:     window file of the pack to read from.
 *
 * Returns 0 on success, -1 when the data cannot be mapped.
 */
static int crc_object(uint32_t *crc_out, git_mwindow_file *mwf, off64_t start, off64_t size)
{
	void *ptr;
	uint32_t crc;
	unsigned int left, len;
	git_mwindow *w = NULL;

	crc = crc32(0L, Z_NULL, 0);
	while (size > 0) {
		ptr = git_mwindow_open(mwf, &w, start, (size_t)size, &left);
		if (ptr == NULL)
			return -1;

		/*
		 * Clamp using the full 64-bit remainder: truncating `size` to
		 * 32 bits first could yield 0 and keep this loop spinning.
		 */
		len = ((off64_t)left < size) ? left : (unsigned int)size;
		if (len == 0) {
			/* No bytes available at this position: bail out instead of looping */
			git_mwindow_close(&w);
			git_error_set(GIT_ERROR_INDEXER, "failed to read pack entry for checksum");
			return -1;
		}

		crc = crc32(crc, ptr, len);
		size -= len;
		start += len;
		git_mwindow_close(&w);
	}

	*crc_out = htonl(crc);
	return 0;
}
320 
/*
 * Record `oid` as an object we still expect to see.
 *
 * Nothing is recorded when the object is already known: it exists in the
 * ODB, it was already indexed from this pack, or it is already expected.
 *
 * Returns 0 on success or when nothing had to be done, a negative value
 * when allocating or storing the copy of the OID fails.
 */
static int add_expected_oid(git_indexer *idx, const git_oid *oid)
{
	git_oid *dup;
	int error;

	if (idx->odb && git_odb_exists(idx->odb, oid))
		return 0;

	if (git_oidmap_exists(idx->pack->idx_cache, oid) ||
	    git_oidmap_exists(idx->expected_oids, oid))
		return 0;

	dup = git__malloc(sizeof(*oid));
	GIT_ERROR_CHECK_ALLOC(dup);
	git_oid_cpy(dup, oid);

	/* The copy serves as both key and value; free it if the map rejects it. */
	if ((error = git_oidmap_set(idx->expected_oids, dup, dup)) < 0)
		git__free(dup);

	return error;
}
339 
/*
 * Parse `obj` and update the set of expected OIDs.
 *
 * The object itself is removed from the expected set. Unless the ODB
 * already has it, every object it references (tree entries, commit parents
 * and tree, tag target) is added through add_expected_oid().
 *
 * Objects that are not blobs, trees, commits or tags are ignored.
 *
 * Returns 0 on success or a negative value when the object cannot be
 * parsed or a referenced OID cannot be recorded.
 */
static int check_object_connectivity(git_indexer *idx, const git_rawobj *obj)
{
	/* NULL so that the shared exit path is safe when parsing fails */
	git_object *object = NULL;
	git_oid *expected;
	int error;

	if (obj->type != GIT_OBJECT_BLOB &&
	    obj->type != GIT_OBJECT_TREE &&
	    obj->type != GIT_OBJECT_COMMIT &&
	    obj->type != GIT_OBJECT_TAG)
		return 0;

	if ((error = git_object__from_raw(&object, obj->data, obj->len, obj->type)) < 0)
		goto out;

	if ((expected = git_oidmap_get(idx->expected_oids, &object->cached.oid)) != NULL) {
		git_oidmap_delete(idx->expected_oids, &object->cached.oid);
		git__free(expected);
	}

	/*
	 * Check whether this is a known object. If so, we can just continue as
	 * we assume that the ODB has a complete graph.
	 */
	if (idx->odb && git_odb_exists(idx->odb, &object->cached.oid)) {
		/* Leave through `out` so the parsed object is released */
		error = 0;
		goto out;
	}

	switch (obj->type) {
		case GIT_OBJECT_TREE:
		{
			git_tree *tree = (git_tree *) object;
			git_tree_entry *entry;
			size_t i;

			git_array_foreach(tree->entries, i, entry) {
				if ((error = add_expected_oid(idx, entry->oid)) < 0)
					goto out;
			}

			break;
		}
		case GIT_OBJECT_COMMIT:
		{
			git_commit *commit = (git_commit *) object;
			git_oid *parent_oid;
			size_t i;

			git_array_foreach(commit->parent_ids, i, parent_oid) {
				if ((error = add_expected_oid(idx, parent_oid)) < 0)
					goto out;
			}

			if ((error = add_expected_oid(idx, &commit->tree_id)) < 0)
				goto out;

			break;
		}
		case GIT_OBJECT_TAG:
		{
			git_tag *tag = (git_tag *) object;

			if ((error = add_expected_oid(idx, &tag->target)) < 0)
				goto out;

			break;
		}
		case GIT_OBJECT_BLOB:
		default:
			break;
	}

out:
	git_object_free(object);

	return error;
}
414 
/*
 * Finish the non-delta object that was just streamed: finalize its hash,
 * optionally check its connectivity, register it in the pack's index
 * cache, compute its CRC and append it to the list of indexed objects.
 *
 * Returns 0 on success and -1 on failure. On failure nothing allocated
 * here is leaked; a pack entry that already made it into the index cache
 * stays owned by the cache.
 */
static int store_object(git_indexer *idx)
{
	int i;
	git_oid oid;
	struct entry *entry;
	off64_t entry_size;
	struct git_pack_entry *pentry = NULL;
	off64_t entry_start = idx->entry_start;

	entry = git__calloc(1, sizeof(*entry));
	GIT_ERROR_CHECK_ALLOC(entry);

	pentry = git__calloc(1, sizeof(struct git_pack_entry));
	if (pentry == NULL) {
		git_error_set_oom();
		goto on_error;
	}

	if (git_hash_final(&oid, &idx->hash_ctx))
		goto on_error;

	entry_size = idx->off - entry_start;
	if (entry_start > UINT31_MAX) {
		entry->offset = UINT32_MAX;
		entry->offset_long = entry_start;
	} else {
		entry->offset = (uint32_t)entry_start;
	}

	if (idx->do_verify) {
		git_rawobj rawobj = {
		    idx->entry_data.ptr,
		    idx->entry_data.size,
		    idx->entry_type
		};

		if (check_object_connectivity(idx, &rawobj) < 0)
			goto on_error;
	}

	git_oid_cpy(&pentry->sha1, &oid);
	pentry->offset = entry_start;

	if (git_oidmap_exists(idx->pack->idx_cache, &pentry->sha1)) {
		git_error_set(GIT_ERROR_INDEXER, "duplicate object %s found in pack", git_oid_tostr_s(&pentry->sha1));
		goto on_error;
	}

	if (git_oidmap_set(idx->pack->idx_cache, &pentry->sha1, pentry) < 0) {
		git_error_set_oom();
		goto on_error;
	}

	/* The index cache owns the pack entry from here on */
	pentry = NULL;

	git_oid_cpy(&entry->oid, &oid);

	if (crc_object(&entry->crc, &idx->pack->mwf, entry_start, entry_size) < 0)
		goto on_error;

	/* Add the object to the list */
	if (git_vector_insert(&idx->objects, entry) < 0)
		goto on_error;

	/* Every fanout slot at or above the first OID byte counts this object */
	for (i = oid.id[0]; i < 256; ++i) {
		idx->fanout[i]++;
	}

	return 0;

on_error:
	git__free(pentry);
	git__free(entry);

	return -1;
}
488 
has_entry(git_indexer * idx,git_oid * id)489 GIT_INLINE(bool) has_entry(git_indexer *idx, git_oid *id)
490 {
491 	return git_oidmap_exists(idx->pack->idx_cache, id);
492 }
493 
/*
 * Register an already hashed object: store its offset in both records,
 * insert `pentry` into the pack's index cache, append `entry` to the
 * object list and update the fanout table.
 *
 * Returns 0 on success, -1 when the object is already cached, cannot be
 * cached, or cannot be appended to the list.
 */
static int save_entry(git_indexer *idx, struct entry *entry, struct git_pack_entry *pentry, off64_t entry_start)
{
	int slot;

	/* Offsets beyond 31 bits are flagged and kept in the 64-bit member. */
	if (entry_start <= UINT31_MAX) {
		entry->offset = (uint32_t)entry_start;
	} else {
		entry->offset = UINT32_MAX;
		entry->offset_long = entry_start;
	}

	pentry->offset = entry_start;

	if (git_oidmap_exists(idx->pack->idx_cache, &pentry->sha1)) {
		git_error_set(GIT_ERROR_INDEXER, "cannot insert object into pack");
		return -1;
	}

	if (git_oidmap_set(idx->pack->idx_cache, &pentry->sha1, pentry) < 0) {
		git_error_set(GIT_ERROR_INDEXER, "cannot insert object into pack");
		return -1;
	}

	/* Append to the list of indexed objects */
	if (git_vector_insert(&idx->objects, entry) < 0)
		return -1;

	slot = entry->oid.id[0];
	while (slot < 256) {
		idx->fanout[slot]++;
		slot++;
	}

	return 0;
}
523 
/*
 * Hash the fully resolved object `obj`, compute the CRC of its pack entry
 * (from `entry_start` up to idx->off) and register it via save_entry().
 *
 * Returns 0 on success; `obj->data` then remains owned by the caller.
 * Returns -1 on failure; in that case `obj->data` has been freed here on
 * every failure path, so the caller must not free it again.
 */
static int hash_and_save(git_indexer *idx, git_rawobj *obj, off64_t entry_start)
{
	git_oid oid;
	size_t entry_size;
	struct entry *entry;
	struct git_pack_entry *pentry = NULL;

	entry = git__calloc(1, sizeof(*entry));
	if (entry == NULL)
		goto on_error;

	if (git_odb__hashobj(&oid, obj) < 0) {
		git_error_set(GIT_ERROR_INDEXER, "failed to hash object");
		goto on_error;
	}

	pentry = git__calloc(1, sizeof(struct git_pack_entry));
	if (pentry == NULL)
		goto on_error;

	git_oid_cpy(&pentry->sha1, &oid);
	git_oid_cpy(&entry->oid, &oid);
	entry->crc = crc32(0L, Z_NULL, 0);

	entry_size = (size_t)(idx->off - entry_start);
	if (crc_object(&entry->crc, &idx->pack->mwf, entry_start, entry_size) < 0)
		goto on_error;

	if (save_entry(idx, entry, pentry, entry_start) < 0) {
		/*
		 * save_entry() may fail after the cache took the pack entry;
		 * only free it if the cache does not hold it. `entry` is never
		 * kept by save_entry() on failure.
		 */
		if (git_oidmap_get(idx->pack->idx_cache, &pentry->sha1) == pentry)
			pentry = NULL;
		goto on_error;
	}

	return 0;

on_error:
	git__free(pentry);
	git__free(entry);
	git__free(obj->data);
	return -1;
}
558 
/*
 * Invoke the user's progress callback, if one was configured.
 *
 * Returns 0 when there is no callback; otherwise the callback's result as
 * passed through git_error_set_after_callback_function().
 */
static int do_progress_callback(git_indexer *idx, git_indexer_progress *stats)
{
	if (!idx->progress_cb)
		return 0;

	return git_error_set_after_callback_function(
		idx->progress_cb(stats, idx->progress_payload),
		"indexer progress");
}
567 
568 /* Hash everything but the last 20B of input */
hash_partially(git_indexer * idx,const uint8_t * data,size_t size)569 static void hash_partially(git_indexer *idx, const uint8_t *data, size_t size)
570 {
571 	size_t to_expell, to_keep;
572 
573 	if (size == 0)
574 		return;
575 
576 	/* Easy case, dump the buffer and the data minus the last 20 bytes */
577 	if (size >= GIT_OID_RAWSZ) {
578 		git_hash_update(&idx->trailer, idx->inbuf, idx->inbuf_len);
579 		git_hash_update(&idx->trailer, data, size - GIT_OID_RAWSZ);
580 
581 		data += size - GIT_OID_RAWSZ;
582 		memcpy(idx->inbuf, data, GIT_OID_RAWSZ);
583 		idx->inbuf_len = GIT_OID_RAWSZ;
584 		return;
585 	}
586 
587 	/* We can just append */
588 	if (idx->inbuf_len + size <= GIT_OID_RAWSZ) {
589 		memcpy(idx->inbuf + idx->inbuf_len, data, size);
590 		idx->inbuf_len += size;
591 		return;
592 	}
593 
594 	/* We need to partially drain the buffer and then append */
595 	to_keep   = GIT_OID_RAWSZ - size;
596 	to_expell = idx->inbuf_len - to_keep;
597 
598 	git_hash_update(&idx->trailer, idx->inbuf, to_expell);
599 
600 	memmove(idx->inbuf, idx->inbuf + to_expell, to_keep);
601 	memcpy(idx->inbuf + to_keep, data, size);
602 	idx->inbuf_len += size - to_expell;
603 }
604 
/*
 * Copy `size` bytes from `data` into the packfile at `offset` through a
 * writable shared mapping.
 *
 * Returns 0 on success or the error from querying the mmap alignment or
 * from creating the mapping.
 */
static int write_at(git_indexer *idx, const void *data, off64_t offset, size_t size)
{
	git_map map;
	size_t alignment;
	size_t lead;
	off64_t aligned_start;
	int error;

	assert(data && size);

	error = git__mmap_alignment(&alignment);
	if (error < 0)
		return error;

	/* Mappings have to begin on the platform's mmap boundary */
	lead = offset % alignment;
	aligned_start = offset - lead;

	error = p_mmap(&map, lead + size, GIT_PROT_WRITE, GIT_MAP_SHARED, idx->pack->mwf.fd, aligned_start);
	if (error < 0)
		return error;

	memcpy((unsigned char *)map.data + lead, data, size);
	p_munmap(&map);

	return 0;
}
633 
/*
 * Append `size` bytes to the end of the packfile. The caller is
 * responsible for growing idx->pack->mwf.size afterwards.
 *
 * Returns 0 on success (or when `size` is 0), -1 when the file cannot be
 * extended, or the error reported by the mmap helpers.
 */
static int append_to_pack(git_indexer *idx, const void *data, size_t size)
{
	git_mwindow_file *mwf = &idx->pack->mwf;
	size_t alignment, tail;
	off64_t grown_size, last_page;
	int error;

	if (size == 0)
		return 0;

	error = git__mmap_alignment(&alignment);
	if (error < 0)
		return error;

	/*
	 * Errors cannot be reported while writing through mmap, so touch one
	 * byte with regular file I/O to make the file system allocate the
	 * space now or fail. The target is the last byte of the page holding
	 * the new end of file, so file I/O is only needed when a page is added
	 * rather than on every write.
	 */
	grown_size = mwf->size + size;
	tail = grown_size % alignment;
	last_page = grown_size - tail;

	if (p_lseek(mwf->fd, last_page + alignment - 1, SEEK_SET) < 0 ||
	    p_write(mwf->fd, data, 1) < 0) {
		git_error_set(GIT_ERROR_OS, "cannot extend packfile '%s'", idx->pack->pack_name);
		return -1;
	}

	return write_at(idx, data, mwf->size, size);
}
666 
/*
 * Parse the next entry of the pack, resuming a partially read entry when
 * idx->have_stream is set.
 *
 * Returns 0 when an entry was completely read and stored, GIT_EBUFS when
 * the data appended so far is not enough to continue, or another non-zero
 * value on error (including a non-zero result from the progress callback).
 */
static int read_stream_object(git_indexer *idx, git_indexer_progress *stats)
{
	git_packfile_stream *stream = &idx->stream;
	off64_t entry_start = idx->off;
	size_t entry_size;
	git_object_t type;
	git_mwindow *w = NULL;
	int error;

	/* Do not start on the last 20 bytes: they are checked as the trailer at commit */
	if (idx->pack->mwf.size <= idx->off + 20)
		return GIT_EBUFS;

	if (!idx->have_stream) {
		/* Starting a new entry: parse its header first */
		error = git_packfile_unpack_header(&entry_size, &type, &idx->pack->mwf, &w, &idx->off);
		if (error == GIT_EBUFS) {
			/* Rewind so the header is parsed again once more data arrived */
			idx->off = entry_start;
			return error;
		}
		if (error < 0)
			return error;

		git_mwindow_close(&w);
		idx->entry_start = entry_start;
		git_hash_init(&idx->hash_ctx);
		git_buf_clear(&idx->entry_data);

		if (type == GIT_OBJECT_REF_DELTA || type == GIT_OBJECT_OFS_DELTA) {
			/* Skip the base description; deltas are hashed when resolved */
			error = advance_delta_offset(idx, type);
			if (error == GIT_EBUFS) {
				idx->off = entry_start;
				return error;
			}
			if (error < 0)
				return error;

			idx->have_delta = 1;
		} else {
			idx->have_delta = 0;

			/* The object hash covers the object header followed by the data */
			error = hash_header(&idx->hash_ctx, entry_size, type);
			if (error < 0)
				return error;
		}

		/* NOTE(review): have_stream is set before the stream is known to have opened */
		idx->have_stream = 1;
		idx->entry_type = type;

		error = git_packfile_stream_open(stream, idx->pack, idx->off);
		if (error < 0)
			return error;
	}

	if (idx->have_delta) {
		error = read_object_stream(idx, stream);
	} else {
		error = hash_object_stream(idx, stream);
	}

	/* Remember how far the stream got, even when it ran out of data */
	idx->off = stream->curpos;
	if (error == GIT_EBUFS)
		return error;

	/* We want to free the stream resources no matter what here */
	idx->have_stream = 0;
	git_packfile_stream_dispose(stream);

	if (error < 0)
		return error;

	if (idx->have_delta) {
		error = store_delta(idx);
	} else {
		error = store_object(idx);
	}

	if (error < 0)
		return error;

	/* Deltas only count as indexed once they have been resolved */
	if (!idx->have_delta) {
		stats->indexed_objects++;
	}
	stats->received_objects++;

	if ((error = do_progress_callback(idx, stats)) != 0)
		return error;

	return 0;
}
755 
/*
 * Add `size` bytes of pack data to the indexer and parse as many complete
 * entries as the data received so far allows.
 *
 * idx:   the indexer.
 * data:  the bytes to append to the packfile.
 * size:  number of bytes in `data`.
 * stats: progress counters; reset when the pack header is parsed and
 *        updated as entries are read.
 *
 * Returns 0 on success (including when more data is needed), or a
 * non-zero error code, which may come from the progress callback.
 */
int git_indexer_append(git_indexer *idx, const void *data, size_t size, git_indexer_progress *stats)
{
	int error = -1;
	struct git_pack_header *hdr;
	git_mwindow_file *mwf;

	assert(idx && data && stats);

	/* Only look inside idx once it has been validated */
	hdr = &idx->hdr;
	mwf = &idx->pack->mwf;

	if ((error = append_to_pack(idx, data, size)) < 0)
		return error;

	/* Pass the length as size_t; narrowing it to int corrupts large sizes */
	hash_partially(idx, data, size);

	/* Make sure we set the new size of the pack */
	idx->pack->mwf.size += size;

	if (!idx->parsed_header) {
		unsigned int total_objects;

		/* Compare in 64 bits so that a large pack size is not truncated */
		if (idx->pack->mwf.size < (off64_t)sizeof(struct git_pack_header))
			return 0;

		if ((error = parse_header(&idx->hdr, idx->pack)) < 0)
			return error;

		idx->parsed_header = 1;
		idx->nr_objects = ntohl(hdr->hdr_entries);
		idx->off = sizeof(struct git_pack_header);

		if (idx->nr_objects <= git_indexer__max_objects) {
			total_objects = (unsigned int)idx->nr_objects;
		} else {
			git_error_set(GIT_ERROR_INDEXER, "too many objects");
			return -1;
		}

		if (git_oidmap_new(&idx->pack->idx_cache) < 0)
			return -1;

		idx->pack->has_cache = 1;
		if (git_vector_init(&idx->objects, total_objects, objects_cmp) < 0)
			return -1;

		if (git_vector_init(&idx->deltas, total_objects / 2, NULL) < 0)
			return -1;

		stats->received_objects = 0;
		stats->local_objects = 0;
		stats->total_deltas = 0;
		stats->indexed_deltas = 0;
		stats->indexed_objects = 0;
		stats->total_objects = total_objects;

		if ((error = do_progress_callback(idx, stats)) != 0)
			return error;
	}

	/* Now that we have data in the pack, let's try to parse it */

	/* As the file grows any windows we try to use will be out of date */
	git_mwindow_free_all(mwf);

	while (stats->indexed_objects < idx->nr_objects) {
		if ((error = read_stream_object(idx, stats)) != 0) {
			/* Running out of data is expected; wait for the next call */
			if (error == GIT_EBUFS)
				break;
			else
				goto on_error;
		}
	}

	return 0;

on_error:
	git_mwindow_free_all(mwf);
	return error;
}
833 
/*
 * Replace the last path component of `path` with
 * "pack-<hex of idx->hash><suffix>".
 *
 * Returns 0 on success, -1 when the buffer cannot be grown.
 */
static int index_path(git_buf *path, git_indexer *idx, const char *suffix)
{
	const char prefix[] = "pack-";
	size_t dir_len;
	size_t needed;

	/* Find the end of the directory part: one past the last '/' (0 if none) */
	for (dir_len = (size_t)path->size; dir_len > 0; dir_len--) {
		if (path->ptr[dir_len - 1] == '/')
			break;
	}

	needed = dir_len + 1 + strlen(prefix) + GIT_OID_HEXSZ + strlen(suffix) + 1;
	if (git_buf_grow(path, needed) < 0)
		return -1;

	git_buf_truncate(path, dir_len);
	git_buf_puts(path, prefix);

	/* The hex digits are written straight into the reserved space */
	git_oid_fmt(path->ptr + git_buf_len(path), &idx->hash);
	path->size += GIT_OID_HEXSZ;

	git_buf_puts(path, suffix);

	if (git_buf_oom(path))
		return -1;

	return 0;
}
855 
856 /**
857  * Rewind the packfile by the trailer, as we might need to fix the
858  * packfile by injecting objects at the tail and must overwrite it.
859  */
seek_back_trailer(git_indexer * idx)860 static void seek_back_trailer(git_indexer *idx)
861 {
862 	idx->pack->mwf.size -= GIT_OID_RAWSZ;
863 	git_mwindow_free_all(&idx->pack->mwf);
864 }
865 
/*
 * Fix a thin pack by appending the object `id`, read from the ODB, to the
 * packfile: the old trailer is overwritten with the object header and its
 * deflated data, followed by a placeholder trailer, and the object is
 * registered through save_entry().
 *
 * Returns 0 on success and a non-zero error code otherwise. Everything
 * allocated here is released on failure, except a pack entry that the
 * index cache already took over.
 */
static int inject_object(git_indexer *idx, git_oid *id)
{
	git_odb_object *obj;
	struct entry *entry = NULL;
	struct git_pack_entry *pentry = NULL;
	git_oid foo = {{0}};
	unsigned char hdr[64];
	git_buf buf = GIT_BUF_INIT;
	off64_t entry_start;
	const void *data;
	size_t len, hdr_len;
	int error;

	seek_back_trailer(idx);
	entry_start = idx->pack->mwf.size;

	if (git_odb_read(&obj, idx->odb, id) < 0) {
		git_error_set(GIT_ERROR_INDEXER, "missing delta bases");
		return -1;
	}

	data = git_odb_object_data(obj);
	len = git_odb_object_size(obj);

	entry = git__calloc(1, sizeof(*entry));
	if (entry == NULL) {
		/* Leave through cleanup so that `obj` is released */
		error = -1;
		goto cleanup;
	}

	entry->crc = crc32(0L, Z_NULL, 0);

	/* Write out the object header */
	hdr_len = git_packfile__object_header(hdr, len, git_odb_object_type(obj));
	if ((error = append_to_pack(idx, hdr, hdr_len)) < 0)
		goto cleanup;

	idx->pack->mwf.size += hdr_len;
	entry->crc = crc32(entry->crc, hdr, (uInt)hdr_len);

	if ((error = git_zstream_deflatebuf(&buf, data, len)) < 0)
		goto cleanup;

	/* And then the compressed object */
	if ((error = append_to_pack(idx, buf.ptr, buf.size)) < 0)
		goto cleanup;

	idx->pack->mwf.size += buf.size;
	entry->crc = htonl(crc32(entry->crc, (unsigned char *)buf.ptr, (uInt)buf.size));

	/* Write a fake trailer so the pack functions play ball */

	if ((error = append_to_pack(idx, &foo, GIT_OID_RAWSZ)) < 0)
		goto cleanup;

	idx->pack->mwf.size += GIT_OID_RAWSZ;

	pentry = git__calloc(1, sizeof(struct git_pack_entry));
	if (pentry == NULL) {
		error = -1;
		goto cleanup;
	}

	git_oid_cpy(&pentry->sha1, id);
	git_oid_cpy(&entry->oid, id);
	/* NOTE(review): uses the inflated length, not buf.size -- confirm intent */
	idx->off = entry_start + hdr_len + len;

	error = save_entry(idx, entry, pentry, entry_start);

	/* save_entry() may fail after the cache took `pentry`; do not free it then */
	if (error && git_oidmap_get(idx->pack->idx_cache, &pentry->sha1) == pentry)
		pentry = NULL;

cleanup:
	if (error) {
		git__free(entry);
		git__free(pentry);
	}

	/* Release the deflate buffer on every path, not just on success */
	git_buf_dispose(&buf);
	git_odb_object_free(obj);
	return error;
}
939 
/*
 * Try to make progress on a thin pack: locate the first stored delta that
 * is a REF_DELTA, read the OID of its base and, unless that object is
 * already part of the pack, inject it from the ODB.
 *
 * Returns 0 on success (including when the base was already present), a
 * negative value when no ODB is available, no REF_DELTA exists, the base
 * information cannot be read, or the injection fails.
 */
static int fix_thin_pack(git_indexer *idx, git_indexer_progress *stats)
{
	struct delta_info *delta;
	git_mwindow *w = NULL;
	unsigned char *raw_base;
	git_object_t type;
	git_oid base;
	off64_t pos = 0;
	size_t obj_size;
	unsigned int i;
	unsigned int avail = 0;
	int error, have_ref_delta = 0;

	assert(git_vector_length(&idx->deltas) > 0);

	if (!idx->odb) {
		git_error_set(GIT_ERROR_INDEXER, "cannot fix a thin pack without an ODB");
		return -1;
	}

	/* Walk the pending deltas until the first REF delta shows up */
	git_vector_foreach(&idx->deltas, i, delta) {
		if (delta == NULL)
			continue;

		pos = delta->delta_off;
		error = git_packfile_unpack_header(&obj_size, &type, &idx->pack->mwf, &w, &pos);
		if (error < 0)
			return error;

		if (type != GIT_OBJECT_REF_DELTA)
			continue;

		have_ref_delta = 1;
		break;
	}

	if (!have_ref_delta) {
		git_error_set(GIT_ERROR_INDEXER, "no REF_DELTA found, cannot inject object");
		return -1;
	}

	/* The header is followed by the base information, which is a raw OID */
	raw_base = git_mwindow_open(&idx->pack->mwf, &w, pos, GIT_OID_RAWSZ, &avail);
	if (raw_base == NULL) {
		git_error_set(GIT_ERROR_INDEXER, "failed to map delta information");
		return -1;
	}

	git_oid_fromraw(&base, raw_base);
	git_mwindow_close(&w);

	/* Nothing to inject when the pack already contains the base */
	if (has_entry(idx, &base))
		return 0;

	if (inject_object(idx, &base) < 0)
		return -1;

	stats->local_objects++;

	return 0;
}
1001 
/*
 * Resolve the deltas stored while reading the pack. The list is walked
 * repeatedly; each delta whose base is available gets unpacked, hashed and
 * saved, then removed from the list. When a full pass makes no progress,
 * fix_thin_pack() is asked to inject a missing base from the ODB.
 *
 * Returns 0 when no unresolved delta is left, -1 on unpack or thin-pack
 * errors, or the negative result of the progress callback.
 */
static int resolve_deltas(git_indexer *idx, git_indexer_progress *stats)
{
	unsigned int i;
	int error;
	struct delta_info *delta;
	int progressed = 0, non_null = 0, progress_cb_result;

	while (idx->deltas.length > 0) {
		progressed = 0;
		non_null = 0;
		git_vector_foreach(&idx->deltas, i, delta) {
			git_rawobj obj = {0};

			/* Slots of resolved deltas are set to NULL further down */
			if (!delta)
				continue;

			non_null = 1;
			idx->off = delta->delta_off;
			if ((error = git_packfile_unpack(&obj, idx->pack, &idx->off)) < 0) {
				if (error == GIT_PASSTHROUGH) {
					/* We have not seen the base object, we'll try again later. */
					continue;
				}
				return -1;
			}

			if (idx->do_verify && check_object_connectivity(idx, &obj) < 0) {
				/* TODO: error? continue? */
				/* Either way, the unpacked data must not be leaked */
				git__free(obj.data);
				continue;
			}

			/* hash_and_save() frees obj.data on its on_error path */
			if (hash_and_save(idx, &obj, delta->delta_off) < 0)
				continue;

			git__free(obj.data);
			stats->indexed_objects++;
			stats->indexed_deltas++;
			progressed = 1;
			if ((progress_cb_result = do_progress_callback(idx, stats)) < 0)
				return progress_cb_result;

			/* remove from the list */
			git_vector_set(NULL, &idx->deltas, i, NULL);
			git__free(delta);
		}

		/* if none were actually set, we're done */
		if (!non_null)
			break;

		if (!progressed && (fix_thin_pack(idx, stats) < 0)) {
			return -1;
		}
	}

	return 0;
}
1058 
/*
 * After objects were injected into a thin pack: rewrite the pack header
 * with the new object count and recompute the trailer hash over the whole
 * file.
 *
 * Returns 0 on success, -1 when the header cannot be written or the file
 * cannot be mapped.
 */
static int update_header_and_rehash(git_indexer *idx, git_indexer_progress *stats)
{
	git_mwindow_file *mwf = &idx->pack->mwf;
	git_mwindow *window = NULL;
	size_t chunk = 1024*1024;
	off64_t done;
	unsigned int avail;
	void *mapped;

	git_hash_init(&idx->trailer);

	/* The header has to account for the number of local objects we injected */
	idx->hdr.hdr_entries = htonl(stats->total_objects + stats->local_objects);
	if (write_at(idx, &idx->hdr, 0, sizeof(struct git_pack_header)) < 0)
		return -1;

	/*
	 * Hash the file the same way it was hashed while being received:
	 * read through to the end and rely on hash_partially() to hold back
	 * the existing trailer so it stays out of the calculation.
	 */
	git_mwindow_free_all(mwf);
	idx->inbuf_len = 0;

	for (done = 0; done < mwf->size; done += avail) {
		mapped = git_mwindow_open(mwf, &window, done, chunk, &avail);
		if (mapped == NULL)
			return -1;

		hash_partially(idx, mapped, avail);
		git_mwindow_close(&window);
	}

	return 0;
}
1099 
/*
 * Finalize the pack that has been fed to the indexer.
 *
 * The steps, in order:
 *  - check that the pack header was parsed and that exactly one trailer
 *    (GIT_OID_RAWSZ bytes) follows the last object;
 *  - compare the trailer stored in the pack with the hash we calculated;
 *  - resolve the outstanding deltas;
 *  - if local objects were injected, rewrite the pack header and trailer;
 *  - fail if any expected object is still missing;
 *  - write the version 2 index file next to the pack and commit it;
 *  - truncate, optionally fsync, close and rename the pack file.
 *
 * `stats` is updated along the way (total_deltas is frozen here).
 *
 * Returns 0 on success. On failure a negative value is returned; an
 * error from resolve_deltas() is passed through unchanged, every other
 * failure yields -1.
 */
int git_indexer_commit(git_indexer *idx, git_indexer_progress *stats)
{
	git_mwindow *w = NULL;
	unsigned int i, long_offsets = 0, left;
	int error;
	struct git_pack_idx_header hdr;
	git_buf filename = GIT_BUF_INIT;
	struct entry *entry;
	git_oid trailer_hash, file_hash;
	git_filebuf index_file = {0};
	void *packfile_trailer;

	if (!idx->parsed_header) {
		git_error_set(GIT_ERROR_INDEXER, "incomplete pack header");
		return -1;
	}

	/* Test for this before resolve_deltas(), as it plays with idx->off */
	if (idx->off + GIT_OID_RAWSZ < idx->pack->mwf.size) {
		git_error_set(GIT_ERROR_INDEXER, "unexpected data at the end of the pack");
		return -1;
	}
	if (idx->off + GIT_OID_RAWSZ > idx->pack->mwf.size) {
		git_error_set(GIT_ERROR_INDEXER, "missing trailer at the end of the pack");
		return -1;
	}

	packfile_trailer = git_mwindow_open(&idx->pack->mwf, &w, idx->pack->mwf.size - GIT_OID_RAWSZ, GIT_OID_RAWSZ, &left);
	if (packfile_trailer == NULL) {
		git_mwindow_close(&w);
		goto on_error;
	}

	/* Compare the packfile trailer as it was sent to us and what we calculated */
	git_oid_fromraw(&file_hash, packfile_trailer);
	git_mwindow_close(&w);

	git_hash_final(&trailer_hash, &idx->trailer);
	if (git_oid_cmp(&file_hash, &trailer_hash)) {
		git_error_set(GIT_ERROR_INDEXER, "packfile trailer mismatch");
		return -1;
	}

	/* Freeze the number of deltas */
	stats->total_deltas = stats->total_objects - stats->indexed_objects;

	if ((error = resolve_deltas(idx, stats)) < 0)
		return error;

	if (stats->indexed_objects != stats->total_objects) {
		git_error_set(GIT_ERROR_INDEXER, "early EOF");
		return -1;
	}

	if (stats->local_objects > 0) {
		if (update_header_and_rehash(idx, stats) < 0)
			return -1;

		git_hash_final(&trailer_hash, &idx->trailer);

		/*
		 * The pack contents changed, so the old trailer is stale. If
		 * the new one cannot be written, the pack on disk would not
		 * match its trailer; do not go on to index it.
		 */
		if (write_at(idx, &trailer_hash, idx->pack->mwf.size - GIT_OID_RAWSZ, GIT_OID_RAWSZ) < 0)
			return -1;
	}

	/*
	 * Is the resulting graph fully connected or are we still
	 * missing some objects? In the second case, we can
	 * bail out due to an incomplete and thus corrupt
	 * packfile.
	 */
	if (git_oidmap_size(idx->expected_oids) > 0) {
		git_error_set(GIT_ERROR_INDEXER, "packfile is missing %"PRIuZ" objects",
			git_oidmap_size(idx->expected_oids));
		return -1;
	}

	git_vector_sort(&idx->objects);

	/* Use the trailer hash as the pack file name to ensure
	 * files with different contents have different names */
	git_oid_cpy(&idx->hash, &trailer_hash);

	/* Derive the temporary index name from the pack name: "...pack" -> "...idx" */
	git_buf_sets(&filename, idx->pack->pack_name);
	git_buf_shorten(&filename, strlen("pack"));
	git_buf_puts(&filename, "idx");
	if (git_buf_oom(&filename))
		goto on_error;

	if (git_filebuf_open(&index_file, filename.ptr,
		GIT_FILEBUF_HASH_CONTENTS |
		(idx->do_fsync ? GIT_FILEBUF_FSYNC : 0),
		idx->mode) < 0)
		goto on_error;

	/*
	 * NOTE(review): the results of the intermediate writes below are
	 * not checked individually; presumably git_filebuf remembers a
	 * failed write and the checked calls further down report it.
	 * Confirm against the filebuf implementation.
	 */

	/* Write out the header */
	hdr.idx_signature = htonl(PACK_IDX_SIGNATURE);
	hdr.idx_version = htonl(2);
	git_filebuf_write(&index_file, &hdr, sizeof(hdr));

	/* Write out the fanout table */
	for (i = 0; i < 256; ++i) {
		uint32_t n = htonl(idx->fanout[i]);
		git_filebuf_write(&index_file, &n, sizeof(n));
	}

	/* Write out the object names (SHA-1 hashes) */
	git_vector_foreach(&idx->objects, i, entry) {
		git_filebuf_write(&index_file, &entry->oid, sizeof(git_oid));
	}

	/* Write out the CRC32 values */
	git_vector_foreach(&idx->objects, i, entry) {
		git_filebuf_write(&index_file, &entry->crc, sizeof(uint32_t));
	}

	/*
	 * Write out the offsets. An entry whose 32-bit offset is
	 * UINT32_MAX keeps its real offset in offset_long; for those we
	 * store the high bit plus the entry's position in the long
	 * offset table that follows.
	 */
	git_vector_foreach(&idx->objects, i, entry) {
		uint32_t n;

		if (entry->offset == UINT32_MAX)
			n = htonl(0x80000000 | long_offsets++);
		else
			n = htonl(entry->offset);

		git_filebuf_write(&index_file, &n, sizeof(uint32_t));
	}

	/* Write out the long offsets, high 32 bits first */
	git_vector_foreach(&idx->objects, i, entry) {
		uint32_t split[2];

		if (entry->offset != UINT32_MAX)
			continue;

		split[0] = htonl(entry->offset_long >> 32);
		split[1] = htonl(entry->offset_long & 0xffffffff);

		git_filebuf_write(&index_file, &split, sizeof(uint32_t) * 2);
	}

	/* Write out the packfile trailer to the index */
	if (git_filebuf_write(&index_file, &trailer_hash, GIT_OID_RAWSZ) < 0)
		goto on_error;

	/* Write out the hash of the idx (trailer_hash is reused as scratch space) */
	if (git_filebuf_hash(&trailer_hash, &index_file) < 0)
		goto on_error;

	if (git_filebuf_write(&index_file, &trailer_hash, sizeof(git_oid)) < 0)
		goto on_error;

	/* Figure out what the final name should be */
	if (index_path(&filename, idx, ".idx") < 0)
		goto on_error;

	/* Commit file */
	if (git_filebuf_commit_at(&index_file, filename.ptr) < 0)
		goto on_error;

	git_mwindow_free_all(&idx->pack->mwf);

	/* Truncate file to undo rounding up to next page_size in append_to_pack */
	if (p_ftruncate(idx->pack->mwf.fd, idx->pack->mwf.size) < 0) {
		git_error_set(GIT_ERROR_OS, "failed to truncate pack file '%s'", idx->pack->pack_name);
		/* Go through the common cleanup so `filename` is released */
		goto on_error;
	}

	if (idx->do_fsync && p_fsync(idx->pack->mwf.fd) < 0) {
		git_error_set(GIT_ERROR_OS, "failed to fsync packfile");
		goto on_error;
	}

	/* We need to close the descriptor here so Windows doesn't choke on commit_at */
	if (p_close(idx->pack->mwf.fd) < 0) {
		git_error_set(GIT_ERROR_OS, "failed to close packfile");
		goto on_error;
	}

	idx->pack->mwf.fd = -1;

	if (index_path(&filename, idx, ".pack") < 0)
		goto on_error;

	/* And don't forget to rename the packfile to its new place. */
	if (p_rename(idx->pack->pack_name, git_buf_cstr(&filename)) < 0)
		goto on_error;

	/* And fsync the parent directory if we're asked to. */
	if (idx->do_fsync &&
		git_futils_fsync_parent(git_buf_cstr(&filename)) < 0)
		goto on_error;

	idx->pack_committed = 1;

	git_buf_dispose(&filename);
	return 0;

on_error:
	git_mwindow_free_all(&idx->pack->mwf);
	git_filebuf_cleanup(&index_file);
	git_buf_dispose(&filename);
	return -1;
}
1300 
/*
 * Release an indexer together with everything it owns: the packfile
 * stream (if one is active), the object and delta lists, the pack and
 * its idx cache entries, the expected-OID map and its values, the hash
 * contexts and the entry buffer.
 *
 * Passing NULL is a no-op.
 */
void git_indexer_free(git_indexer *idx)
{
	const git_oid *oid;
	git_oid *expected;
	size_t pos = 0;

	if (!idx)
		return;

	if (idx->have_stream)
		git_packfile_stream_dispose(&idx->stream);

	git_vector_free_deep(&idx->objects);

	/* The cached pack entries are freed here, followed by the map holding them */
	if (idx->pack->idx_cache) {
		struct git_pack_entry *cached;

		git_oidmap_foreach_value(idx->pack->idx_cache, cached, {
			git__free(cached);
		});

		git_oidmap_free(idx->pack->idx_cache);
	}

	git_vector_free_deep(&idx->deltas);

	/* The pack is only closed and freed while holding the mwindow mutex */
	if (git_mutex_lock(&git__mwindow_mutex) == 0) {
		/* A pack that was never committed is closed before it is freed */
		if (!idx->pack_committed)
			git_packfile_close(idx->pack, true);

		git_packfile_free(idx->pack);
		git_mutex_unlock(&git__mwindow_mutex);
	}

	/* Free every value stored in the expected-OID map, then the map itself */
	while (git_oidmap_iterate((void **) &expected, idx->expected_oids, &pos, &oid) == 0)
		git__free(expected);
	git_oidmap_free(idx->expected_oids);

	git_buf_dispose(&idx->entry_data);
	git_hash_ctx_cleanup(&idx->hash_ctx);
	git_hash_ctx_cleanup(&idx->trailer);

	git__free(idx);
}
1344