1 /*
2 * Copyright (C) the libgit2 contributors. All rights reserved.
3 *
4 * This file is part of libgit2, distributed under the GNU GPL v2 with
5 * a Linking Exception. For full terms see the included COPYING file.
6 */
7
8 #include "indexer.h"
9
10 #include "git2/indexer.h"
11 #include "git2/object.h"
12
13 #include "commit.h"
14 #include "tree.h"
15 #include "tag.h"
16 #include "pack.h"
17 #include "mwindow.h"
18 #include "posix.h"
19 #include "pack.h"
20 #include "filebuf.h"
21 #include "oid.h"
22 #include "oidarray.h"
23 #include "oidmap.h"
24 #include "zstream.h"
25 #include "object.h"
26
/*
 * Upper bound on the object count a pack header may announce;
 * git_indexer_append() rejects packs that claim more objects than this.
 */
size_t git_indexer__max_objects = UINT32_MAX;

/*
 * Largest value that fits in 31 bits. Entry offsets above this are recorded
 * in `offset_long`, with `offset` set to UINT32_MAX as a marker.
 */
#define UINT31_MAX (0x7FFFFFFF)
30
/* One indexed object; instances are collected in git_indexer.objects. */
struct entry {
	git_oid oid;          /* id of the object */
	uint32_t crc;         /* CRC-32 of the packed entry bytes, stored after htonl() */
	uint32_t offset;      /* offset in the pack, or UINT32_MAX if offset_long is used */
	uint64_t offset_long; /* offset in the pack when it exceeds UINT31_MAX */
};
37
/* State carried across git_indexer_append() calls while a pack is indexed. */
struct git_indexer {
	unsigned int parsed_header :1, /* set once the pack header was read and accepted */
		pack_committed :1,
		have_stream :1,        /* `stream` is currently open on an entry */
		have_delta :1,         /* the entry being read is a REF/OFS delta */
		do_fsync :1,           /* request fsync when writing output files */
		do_verify :1;          /* buffer object data and check connectivity */
	struct git_pack_header hdr;    /* copy of the pack header */
	struct git_pack_file *pack;    /* the (temporary) pack being written */
	unsigned int mode;             /* file mode used for files we create */
	off64_t off;                   /* current parse position in the pack */
	off64_t entry_start;           /* offset where the current entry begins */
	git_object_t entry_type;       /* type of the current entry */
	git_buf entry_data;            /* inflated data of the current entry (do_verify only) */
	git_packfile_stream stream;    /* stream used to inflate the current entry */
	size_t nr_objects;             /* object count announced by the pack header */
	git_vector objects;            /* struct entry *, one per indexed object */
	git_vector deltas;             /* struct delta_info *, deltas still to resolve */
	unsigned int fanout[256];      /* cumulative object counts by first OID byte */
	git_hash_ctx hash_ctx;         /* hashes the object currently being streamed */
	git_oid hash;                  /* returned by git_indexer_hash() */
	git_indexer_progress_cb progress_cb;
	void *progress_payload;        /* passed back to progress_cb */
	char objbuf[8*1024];           /* scratch buffer for stream reads */

	/* OIDs referenced from pack objects. Used for verification. */
	git_oidmap *expected_oids;

	/* Needed to look up objects which we want to inject to fix a thin pack */
	git_odb *odb;

	/* Fields for calculating the packfile trailer (hash of everything before it) */
	char inbuf[GIT_OID_RAWSZ];     /* most recent input bytes, held back from the hash */
	size_t inbuf_len;              /* number of valid bytes in inbuf */
	git_hash_ctx trailer;          /* running hash of everything except inbuf */
};
74
/* A delta entry whose resolution is deferred; stored in git_indexer.deltas. */
struct delta_info {
	off64_t delta_off; /* offset in the pack where the delta entry starts */
};
78
git_indexer_hash(const git_indexer * idx)79 const git_oid *git_indexer_hash(const git_indexer *idx)
80 {
81 return &idx->hash;
82 }
83
parse_header(struct git_pack_header * hdr,struct git_pack_file * pack)84 static int parse_header(struct git_pack_header *hdr, struct git_pack_file *pack)
85 {
86 int error;
87 git_map map;
88
89 if ((error = p_mmap(&map, sizeof(*hdr), GIT_PROT_READ, GIT_MAP_SHARED, pack->mwf.fd, 0)) < 0)
90 return error;
91
92 memcpy(hdr, map.data, sizeof(*hdr));
93 p_munmap(&map);
94
95 /* Verify we recognize this pack file format. */
96 if (hdr->hdr_signature != ntohl(PACK_SIGNATURE)) {
97 git_error_set(GIT_ERROR_INDEXER, "wrong pack signature");
98 return -1;
99 }
100
101 if (!pack_version_ok(hdr->hdr_version)) {
102 git_error_set(GIT_ERROR_INDEXER, "wrong pack version");
103 return -1;
104 }
105
106 return 0;
107 }
108
objects_cmp(const void * a,const void * b)109 static int objects_cmp(const void *a, const void *b)
110 {
111 const struct entry *entrya = a;
112 const struct entry *entryb = b;
113
114 return git_oid__cmp(&entrya->oid, &entryb->oid);
115 }
116
/*
 * Initialize `opts` for the given structure `version`, using
 * GIT_INDEXER_OPTIONS_INIT as the template.
 *
 * NOTE(review): GIT_INIT_STRUCTURE_FROM_TEMPLATE is defined elsewhere; it
 * presumably validates `version` and may return early on mismatch — confirm.
 * Returns 0 when the macro falls through.
 */
int git_indexer_options_init(git_indexer_options *opts, unsigned int version)
{
	GIT_INIT_STRUCTURE_FROM_TEMPLATE(
		opts, version, git_indexer_options, GIT_INDEXER_OPTIONS_INIT);
	return 0;
}
123
124 #ifndef GIT_DEPRECATE_HARD
git_indexer_init_options(git_indexer_options * opts,unsigned int version)125 int git_indexer_init_options(git_indexer_options *opts, unsigned int version)
126 {
127 return git_indexer_options_init(opts, version);
128 }
129 #endif
130
/*
 * Create a new indexer that writes the incoming pack to a temporary file
 * under `prefix`.
 *
 * out     - receives the new indexer on success
 * prefix  - directory in which the temporary pack file is created
 * mode    - file mode for created files; 0 selects GIT_PACK_FILE_MODE
 * odb     - object database used to look up objects (may be NULL)
 * in_opts - optional options; defaults are used when NULL
 *
 * Returns 0 on success, -1 on failure. On failure the temporary file is
 * closed and unlinked.
 */
int git_indexer_new(
	git_indexer **out,
	const char *prefix,
	unsigned int mode,
	git_odb *odb,
	git_indexer_options *in_opts)
{
	git_indexer_options opts = GIT_INDEXER_OPTIONS_INIT;
	git_indexer *idx;
	git_buf path = GIT_BUF_INIT, tmp_path = GIT_BUF_INIT;
	static const char suff[] = "/pack";
	int error, fd = -1;

	if (in_opts)
		memcpy(&opts, in_opts, sizeof(opts));

	idx = git__calloc(1, sizeof(git_indexer));
	GIT_ERROR_CHECK_ALLOC(idx);
	idx->odb = odb;
	idx->progress_cb = opts.progress_cb;
	idx->progress_payload = opts.progress_cb_payload;
	idx->mode = mode ? mode : GIT_PACK_FILE_MODE;
	git_buf_init(&idx->entry_data, 0);

	if ((error = git_hash_ctx_init(&idx->hash_ctx)) < 0 ||
	    (error = git_hash_ctx_init(&idx->trailer)) < 0 ||
	    (error = git_oidmap_new(&idx->expected_oids)) < 0)
		goto cleanup;

	idx->do_verify = opts.verify;

	if (git_repository__fsync_gitdir)
		idx->do_fsync = 1;

	error = git_buf_joinpath(&path, prefix, suff);
	if (error < 0)
		goto cleanup;

	fd = git_futils_mktmp(&tmp_path, git_buf_cstr(&path), idx->mode);
	git_buf_dispose(&path);
	if (fd < 0)
		goto cleanup;

	/*
	 * Keep tmp_path alive until the packfile structure exists: if the
	 * allocation fails, the cleanup path needs the name to unlink the
	 * temporary file we just created.
	 */
	error = git_packfile_alloc(&idx->pack, git_buf_cstr(&tmp_path));
	if (error < 0)
		goto cleanup;

	/* From here on the name is available as idx->pack->pack_name. */
	git_buf_dispose(&tmp_path);

	idx->pack->mwf.fd = fd;
	if ((error = git_mwindow_file_register(&idx->pack->mwf)) < 0)
		goto cleanup;

	*out = idx;
	return 0;

cleanup:
	if (fd != -1)
		p_close(fd);

	if (git_buf_len(&tmp_path) > 0)
		p_unlink(git_buf_cstr(&tmp_path));

	if (idx->pack != NULL)
		p_unlink(idx->pack->pack_name);

	/*
	 * NOTE(review): the hash contexts, expected_oids and idx->pack are not
	 * released here; their destructors are not visible in this file section.
	 */
	git_buf_dispose(&path);
	git_buf_dispose(&tmp_path);
	git_buf_dispose(&idx->entry_data);
	git__free(idx);
	return -1;
}
202
/* Enable (non-zero) or disable (zero) fsync for this indexer. */
void git_indexer__set_fsync(git_indexer *idx, int do_fsync)
{
	idx->do_fsync = (do_fsync != 0) ? 1 : 0;
}
207
208 /* Try to store the delta so we can try to resolve it later */
store_delta(git_indexer * idx)209 static int store_delta(git_indexer *idx)
210 {
211 struct delta_info *delta;
212
213 delta = git__calloc(1, sizeof(struct delta_info));
214 GIT_ERROR_CHECK_ALLOC(delta);
215 delta->delta_off = idx->entry_start;
216
217 if (git_vector_insert(&idx->deltas, delta) < 0)
218 return -1;
219
220 return 0;
221 }
222
/*
 * Feed the object header for an object of `type` and length `len` into
 * `ctx`. Returns the formatting error if the header cannot be built,
 * otherwise the result of git_hash_update().
 */
static int hash_header(git_hash_ctx *ctx, off64_t len, git_object_t type)
{
	char header[64];
	size_t header_len;
	int error = git_odb__format_object_header(&header_len,
		header, sizeof(header), (size_t)len, type);

	if (error < 0)
		return error;

	return git_hash_update(ctx, header, header_len);
}
235
/*
 * Read the current object from `stream` until it is exhausted, feeding the
 * inflated bytes into idx->hash_ctx. When verification is enabled the bytes
 * are also appended to idx->entry_data.
 *
 * Returns 0 once the stream reports end of data, or a negative error code
 * from the stream read, the buffer append or the hash update.
 */
static int hash_object_stream(git_indexer*idx, git_packfile_stream *stream)
{
	ssize_t read;
	int error;

	GIT_ASSERT_ARG(idx);
	GIT_ASSERT_ARG(stream);

	do {
		read = git_packfile_stream_read(stream, idx->objbuf, sizeof(idx->objbuf));
		if (read < 0)
			return (int)read;

		/* Don't silently drop data: a failed append or hash update
		 * would otherwise yield a wrong object id. */
		if (idx->do_verify &&
		    (error = git_buf_put(&idx->entry_data, idx->objbuf, read)) < 0)
			return error;

		if ((error = git_hash_update(&idx->hash_ctx, idx->objbuf, read)) < 0)
			return error;
	} while (read > 0);

	return 0;
}
258
259 /* In order to create the packfile stream, we need to skip over the delta base description */
advance_delta_offset(git_indexer * idx,git_object_t type)260 static int advance_delta_offset(git_indexer *idx, git_object_t type)
261 {
262 git_mwindow *w = NULL;
263
264 GIT_ASSERT_ARG(type == GIT_OBJECT_REF_DELTA || type == GIT_OBJECT_OFS_DELTA);
265
266 if (type == GIT_OBJECT_REF_DELTA) {
267 idx->off += GIT_OID_RAWSZ;
268 } else {
269 off64_t base_off;
270 int error = get_delta_base(&base_off, idx->pack, &w, &idx->off, type, idx->entry_start);
271 git_mwindow_close(&w);
272 if (error < 0)
273 return error;
274 }
275
276 return 0;
277 }
278
279 /* Read from the stream and discard any output */
read_object_stream(git_indexer * idx,git_packfile_stream * stream)280 static int read_object_stream(git_indexer *idx, git_packfile_stream *stream)
281 {
282 ssize_t read;
283
284 GIT_ASSERT_ARG(stream);
285
286 do {
287 read = git_packfile_stream_read(stream, idx->objbuf, sizeof(idx->objbuf));
288 } while (read > 0);
289
290 if (read < 0)
291 return (int)read;
292
293 return 0;
294 }
295
/*
 * Compute the CRC-32 of `size` bytes of the pack starting at `start`.
 *
 * crc_out - receives the checksum, converted with htonl()
 * mwf     - the memory-window file backing the pack
 *
 * Returns 0 on success, -1 if a window cannot be opened or yields no data.
 */
static int crc_object(uint32_t *crc_out, git_mwindow_file *mwf, off64_t start, off64_t size)
{
	void *ptr;
	uint32_t crc;
	unsigned int left, len;
	git_mwindow *w = NULL;

	crc = crc32(0L, Z_NULL, 0);
	while (size > 0) {
		ptr = git_mwindow_open(mwf, &w, start, (size_t)size, &left);
		if (ptr == NULL)
			return -1;

		/*
		 * Compare in 64 bits. Truncating `size` to unsigned int first
		 * would yield 0 for sizes that are a multiple of 4GiB and the
		 * loop would never make progress.
		 */
		len = (size < (off64_t)left) ? (unsigned int)size : left;
		if (len == 0) {
			/* An empty window would make this loop spin forever. */
			git_mwindow_close(&w);
			git_error_set(GIT_ERROR_INDEXER, "failed to map pack entry for checksum");
			return -1;
		}

		crc = crc32(crc, ptr, len);
		size -= len;
		start += len;
		git_mwindow_close(&w);
	}

	*crc_out = htonl(crc);
	return 0;
}
319
/*
 * Record `oid` as an object the pack is expected to provide, unless it is
 * already known. Returns 0 when nothing needed to be added, otherwise the
 * result of inserting into idx->expected_oids (negative on failure).
 */
static int add_expected_oid(git_indexer *idx, const git_oid *oid)
{
	/*
	 * If we know about that object because it is stored in our ODB or
	 * because we have already processed it as part of our pack file, we do
	 * not have to expect it.
	 */
	if ((!idx->odb || !git_odb_exists(idx->odb, oid)) &&
	    !git_oidmap_exists(idx->pack->idx_cache, oid) &&
	    !git_oidmap_exists(idx->expected_oids, oid)) {
		git_oid *dup = git__malloc(sizeof(*oid));
		int error;

		GIT_ERROR_CHECK_ALLOC(dup);
		git_oid_cpy(dup, oid);

		/* The copy serves as both key and value; free it if the map
		 * did not accept it. */
		if ((error = git_oidmap_set(idx->expected_oids, dup, dup)) < 0)
			git__free(dup);

		return error;
	}

	return 0;
}
338
/*
 * Parse `obj` and update idx->expected_oids: the object itself is no longer
 * expected, and every object it references (tree entries, commit parents and
 * tree, tag target) is added unless already known.
 *
 * Objects of other types are ignored. If the object already exists in the
 * ODB its references are not followed.
 *
 * Returns 0 on success or a negative error code if the object cannot be
 * parsed or a referenced id cannot be recorded.
 */
static int check_object_connectivity(git_indexer *idx, const git_rawobj *obj)
{
	git_object *object = NULL;
	git_oid *expected;
	int error;

	if (obj->type != GIT_OBJECT_BLOB &&
	    obj->type != GIT_OBJECT_TREE &&
	    obj->type != GIT_OBJECT_COMMIT &&
	    obj->type != GIT_OBJECT_TAG)
		return 0;

	/* Nothing was created on failure, so there is nothing to free. */
	if ((error = git_object__from_raw(&object, obj->data, obj->len, obj->type)) < 0)
		return error;

	if ((expected = git_oidmap_get(idx->expected_oids, &object->cached.oid)) != NULL) {
		git_oidmap_delete(idx->expected_oids, &object->cached.oid);
		git__free(expected);
	}

	/*
	 * Check whether this is a known object. If so, we can just continue as
	 * we assume that the ODB has a complete graph.
	 */
	if (idx->odb && git_odb_exists(idx->odb, &object->cached.oid))
		goto out;

	switch (obj->type) {
		case GIT_OBJECT_TREE:
		{
			git_tree *tree = (git_tree *) object;
			git_tree_entry *entry;
			size_t i;

			git_array_foreach(tree->entries, i, entry)
				if ((error = add_expected_oid(idx, entry->oid)) < 0)
					goto out;

			break;
		}
		case GIT_OBJECT_COMMIT:
		{
			git_commit *commit = (git_commit *) object;
			git_oid *parent_oid;
			size_t i;

			git_array_foreach(commit->parent_ids, i, parent_oid)
				if ((error = add_expected_oid(idx, parent_oid)) < 0)
					goto out;

			if ((error = add_expected_oid(idx, &commit->tree_id)) < 0)
				goto out;

			break;
		}
		case GIT_OBJECT_TAG:
		{
			git_tag *tag = (git_tag *) object;

			if ((error = add_expected_oid(idx, &tag->target)) < 0)
				goto out;

			break;
		}
		case GIT_OBJECT_BLOB:
		default:
			break;
	}

out:
	git_object_free(object);

	return error;
}
413
/*
 * Finish the non-delta object that was just streamed: finalize its hash,
 * optionally check its connectivity, register it in the pack's id cache,
 * compute its CRC and append it to idx->objects, updating the fanout table.
 *
 * Returns 0 on success and -1 on failure.
 */
static int store_object(git_indexer *idx)
{
	int i;
	git_oid oid;
	struct entry *entry;
	off64_t entry_size;
	struct git_pack_entry *pentry;
	off64_t entry_start = idx->entry_start;

	entry = git__calloc(1, sizeof(*entry));
	GIT_ERROR_CHECK_ALLOC(entry);

	pentry = git__calloc(1, sizeof(struct git_pack_entry));
	if (pentry == NULL)
		goto on_error;

	if (git_hash_final(&oid, &idx->hash_ctx))
		goto on_error;

	entry_size = idx->off - entry_start;
	if (entry_start > UINT31_MAX) {
		entry->offset = UINT32_MAX;
		entry->offset_long = entry_start;
	} else {
		entry->offset = (uint32_t)entry_start;
	}

	if (idx->do_verify) {
		git_rawobj rawobj = {
			idx->entry_data.ptr,
			idx->entry_data.size,
			idx->entry_type
		};

		if (check_object_connectivity(idx, &rawobj) < 0)
			goto on_error;
	}

	git_oid_cpy(&pentry->sha1, &oid);
	pentry->offset = entry_start;

	if (git_oidmap_exists(idx->pack->idx_cache, &pentry->sha1)) {
		git_error_set(GIT_ERROR_INDEXER, "duplicate object %s found in pack", git_oid_tostr_s(&pentry->sha1));
		goto on_error;
	}

	if (git_oidmap_set(idx->pack->idx_cache, &pentry->sha1, pentry) < 0) {
		git_error_set_oom();
		goto on_error;
	}

	/* The cache references pentry from now on; it must not be freed here. */
	pentry = NULL;

	git_oid_cpy(&entry->oid, &oid);

	if (crc_object(&entry->crc, &idx->pack->mwf, entry_start, entry_size) < 0)
		goto on_error;

	/* Add the object to the list */
	if (git_vector_insert(&idx->objects, entry) < 0)
		goto on_error;

	/* Every fanout bucket at or after the first OID byte counts this object. */
	for (i = oid.id[0]; i < 256; ++i) {
		idx->fanout[i]++;
	}

	return 0;

on_error:
	git__free(pentry);
	git__free(entry);

	return -1;
}
487
has_entry(git_indexer * idx,git_oid * id)488 GIT_INLINE(bool) has_entry(git_indexer *idx, git_oid *id)
489 {
490 return git_oidmap_exists(idx->pack->idx_cache, id);
491 }
492
/*
 * Register a fully described object with the indexer.
 *
 * entry       - index record; its `oid` must already be set
 * pentry      - pack cache record; its `sha1` must already be set
 * entry_start - offset of the object in the pack
 *
 * On success the cache references `pentry` and idx->objects references
 * `entry`. On failure (-1) neither structure references them, so the caller
 * keeps ownership of both.
 */
static int save_entry(git_indexer *idx, struct entry *entry, struct git_pack_entry *pentry, off64_t entry_start)
{
	int i;

	if (entry_start > UINT31_MAX) {
		entry->offset = UINT32_MAX;
		entry->offset_long = entry_start;
	} else {
		entry->offset = (uint32_t)entry_start;
	}

	pentry->offset = entry_start;

	if (git_oidmap_exists(idx->pack->idx_cache, &pentry->sha1) ||
	    git_oidmap_set(idx->pack->idx_cache, &pentry->sha1, pentry) < 0) {
		git_error_set(GIT_ERROR_INDEXER, "cannot insert object into pack");
		return -1;
	}

	/* Add the object to the list */
	if (git_vector_insert(&idx->objects, entry) < 0) {
		/*
		 * Undo the cache insertion: callers free pentry when we
		 * fail, which would otherwise leave a dangling pointer in
		 * the cache.
		 */
		git_oidmap_delete(idx->pack->idx_cache, &pentry->sha1);
		return -1;
	}

	for (i = entry->oid.id[0]; i < 256; ++i) {
		idx->fanout[i]++;
	}

	return 0;
}
522
/*
 * Hash the resolved object `obj`, compute the CRC of its packed form (which
 * spans entry_start .. idx->off) and register it via save_entry().
 *
 * Returns 0 on success. On any failure returns -1, releases the records
 * allocated here and frees obj->data.
 */
static int hash_and_save(git_indexer *idx, git_rawobj *obj, off64_t entry_start)
{
	git_oid oid;
	size_t entry_size;
	struct entry *entry;
	struct git_pack_entry *pentry = NULL;

	entry = git__calloc(1, sizeof(*entry));
	if (entry == NULL)
		goto on_error;

	if (git_odb__hashobj(&oid, obj) < 0) {
		git_error_set(GIT_ERROR_INDEXER, "failed to hash object");
		goto on_error;
	}

	pentry = git__calloc(1, sizeof(struct git_pack_entry));
	if (pentry == NULL)
		goto on_error;

	git_oid_cpy(&pentry->sha1, &oid);
	git_oid_cpy(&entry->oid, &oid);

	entry_size = (size_t)(idx->off - entry_start);
	if (crc_object(&entry->crc, &idx->pack->mwf, entry_start, entry_size) < 0)
		goto on_error;

	if (save_entry(idx, entry, pentry, entry_start) < 0) {
		/*
		 * If the cache still references pentry it owns it; freeing it
		 * here would leave a dangling pointer behind.
		 */
		if (git_oidmap_get(idx->pack->idx_cache, &pentry->sha1) == pentry)
			pentry = NULL;
		goto on_error;
	}

	return 0;

on_error:
	git__free(pentry);
	git__free(entry);
	git__free(obj->data);
	return -1;
}
557
/*
 * Invoke the user's progress callback, if one was configured, and pass its
 * return value through git_error_set_after_callback_function(). Returns 0
 * when no callback is set.
 */
static int do_progress_callback(git_indexer *idx, git_indexer_progress *stats)
{
	if (!idx->progress_cb)
		return 0;

	return git_error_set_after_callback_function(
		idx->progress_cb(stats, idx->progress_payload),
		"indexer progress");
}
566
567 /* Hash everything but the last 20B of input */
hash_partially(git_indexer * idx,const uint8_t * data,size_t size)568 static void hash_partially(git_indexer *idx, const uint8_t *data, size_t size)
569 {
570 size_t to_expell, to_keep;
571
572 if (size == 0)
573 return;
574
575 /* Easy case, dump the buffer and the data minus the last 20 bytes */
576 if (size >= GIT_OID_RAWSZ) {
577 git_hash_update(&idx->trailer, idx->inbuf, idx->inbuf_len);
578 git_hash_update(&idx->trailer, data, size - GIT_OID_RAWSZ);
579
580 data += size - GIT_OID_RAWSZ;
581 memcpy(idx->inbuf, data, GIT_OID_RAWSZ);
582 idx->inbuf_len = GIT_OID_RAWSZ;
583 return;
584 }
585
586 /* We can just append */
587 if (idx->inbuf_len + size <= GIT_OID_RAWSZ) {
588 memcpy(idx->inbuf + idx->inbuf_len, data, size);
589 idx->inbuf_len += size;
590 return;
591 }
592
593 /* We need to partially drain the buffer and then append */
594 to_keep = GIT_OID_RAWSZ - size;
595 to_expell = idx->inbuf_len - to_keep;
596
597 git_hash_update(&idx->trailer, idx->inbuf, to_expell);
598
599 memmove(idx->inbuf, idx->inbuf + to_expell, to_keep);
600 memcpy(idx->inbuf + to_keep, data, size);
601 idx->inbuf_len += size - to_expell;
602 }
603
/*
 * Write `size` bytes from `data` into the pack file at `offset`.
 *
 * With NO_MMAP defined the data is written with p_pwrite() in a loop;
 * otherwise the affected region is mapped, filled with memcpy() and unmapped.
 *
 * Returns 0 on success, -1 or the failing call's error code otherwise.
 */
static int write_at(git_indexer *idx, const void *data, off64_t offset, size_t size)
{
#ifdef NO_MMAP
	size_t remaining_size = size;
	const char *ptr = (const char *)data;

	/* Handle data size larger than ssize_t */
	while (remaining_size > 0) {
		ssize_t nb;
		HANDLE_EINTR(nb, p_pwrite(idx->pack->mwf.fd, (void *)ptr,
					  remaining_size, offset));
		/* a zero-length write is treated as a failure as well */
		if (nb <= 0)
			return -1;

		ptr += nb;
		offset += nb;
		remaining_size -= nb;
	}
#else
	git_file fd = idx->pack->mwf.fd;
	size_t mmap_alignment;
	size_t page_offset;
	off64_t page_start;
	unsigned char *map_data;
	git_map map;
	int error;

	GIT_ASSERT_ARG(data);
	GIT_ASSERT_ARG(size);

	if ((error = git__mmap_alignment(&mmap_alignment)) < 0)
		return error;

	/* the offset needs to be at the mmap boundary for the platform */
	page_offset = offset % mmap_alignment;
	page_start = offset - page_offset;

	/* map from the aligned start up to the end of the region being written */
	if ((error = p_mmap(&map, page_offset + size, GIT_PROT_WRITE, GIT_MAP_SHARED, fd, page_start)) < 0)
		return error;

	map_data = (unsigned char *)map.data;
	memcpy(map_data + page_offset, data, size);
	p_munmap(&map);
#endif

	return 0;
}
651
/*
 * Write `size` bytes from `data` at the current end of the pack file. The
 * recorded pack size (idx->pack->mwf.size) is NOT updated here; callers do
 * that themselves.
 *
 * Returns 0 on success (or when `size` is 0), a negative value on failure.
 */
static int append_to_pack(git_indexer *idx, const void *data, size_t size)
{
	size_t alignment;
	size_t page_remainder;
	off64_t end_offset;
	off64_t last_page_start;
	int error;

	if (size == 0)
		return 0;

	if ((error = git__mmap_alignment(&alignment)) < 0)
		return error;

	/* Write a single byte to force the file system to allocate space now or
	 * report an error, since we can't report errors when writing using mmap.
	 * Round the size up to the nearest page so that we only need to perform file
	 * I/O when we add a page, instead of whenever we write even a single byte. */
	end_offset = idx->pack->mwf.size + size;
	page_remainder = end_offset % alignment;
	last_page_start = end_offset - page_remainder;

	if (p_pwrite(idx->pack->mwf.fd, data, 1, last_page_start + alignment - 1) < 0) {
		git_error_set(GIT_ERROR_OS, "cannot extend packfile '%s'", idx->pack->pack_name);
		return -1;
	}

	return write_at(idx, data, idx->pack->mwf.size, size);
}
682
/*
 * Try to consume one entry from the pack at idx->off.
 *
 * The function is resumable: when not enough data has arrived yet it returns
 * GIT_EBUFS and can be called again once more data was appended. Completed
 * non-delta objects are stored via store_object(), deltas are queued via
 * store_delta(), `stats` is updated and the progress callback is invoked.
 *
 * Returns 0 when an entry was fully processed, GIT_EBUFS when more data is
 * needed, or another non-zero value on error or callback abort.
 */
static int read_stream_object(git_indexer *idx, git_indexer_progress *stats)
{
	git_packfile_stream *stream = &idx->stream;
	off64_t entry_start = idx->off;
	size_t entry_size;
	git_object_t type;
	git_mwindow *w = NULL;
	int error;

	/*
	 * Require more than 20 bytes beyond the current offset.
	 * NOTE(review): 20 presumably is the trailer size (GIT_OID_RAWSZ) — confirm.
	 */
	if (idx->pack->mwf.size <= idx->off + 20)
		return GIT_EBUFS;

	/* No stream open: we are at the start of a new entry. */
	if (!idx->have_stream) {
		error = git_packfile_unpack_header(&entry_size, &type, idx->pack, &w, &idx->off);
		if (error == GIT_EBUFS) {
			/* rewind so the header is parsed again on the next call */
			idx->off = entry_start;
			return error;
		}
		if (error < 0)
			return error;

		git_mwindow_close(&w);
		idx->entry_start = entry_start;
		git_hash_init(&idx->hash_ctx);
		git_buf_clear(&idx->entry_data);

		if (type == GIT_OBJECT_REF_DELTA || type == GIT_OBJECT_OFS_DELTA) {
			error = advance_delta_offset(idx, type);
			if (error == GIT_EBUFS) {
				idx->off = entry_start;
				return error;
			}
			if (error < 0)
				return error;

			idx->have_delta = 1;
		} else {
			idx->have_delta = 0;

			/* only non-delta objects are hashed while streaming */
			error = hash_header(&idx->hash_ctx, entry_size, type);
			if (error < 0)
				return error;
		}

		idx->have_stream = 1;
		idx->entry_type = type;

		error = git_packfile_stream_open(stream, idx->pack, idx->off);
		if (error < 0)
			return error;
	}

	/* Deltas are only skipped over; plain objects are hashed as they inflate. */
	if (idx->have_delta) {
		error = read_object_stream(idx, stream);
	} else {
		error = hash_object_stream(idx, stream);
	}

	/* Remember how far the stream got; on GIT_EBUFS it stays open for the next call. */
	idx->off = stream->curpos;
	if (error == GIT_EBUFS)
		return error;

	/* We want to free the stream resources no matter what here */
	idx->have_stream = 0;
	git_packfile_stream_dispose(stream);

	if (error < 0)
		return error;

	if (idx->have_delta) {
		error = store_delta(idx);
	} else {
		error = store_object(idx);
	}

	if (error < 0)
		return error;

	/* deltas count as received but are only indexed once resolved */
	if (!idx->have_delta) {
		stats->indexed_objects++;
	}
	stats->received_objects++;

	if ((error = do_progress_callback(idx, stats)) != 0)
		return error;

	return 0;
}
771
/*
 * Append `size` bytes of pack data to the indexer and process as many
 * complete entries as the data received so far allows.
 *
 * idx   - the indexer
 * data  - the new chunk of pack data
 * size  - number of bytes in `data`
 * stats - progress counters; reset when the pack header is first parsed
 *
 * Returns 0 on success (including when more data is needed), a negative
 * value on error, or the progress callback's non-zero return value.
 */
int git_indexer_append(git_indexer *idx, const void *data, size_t size, git_indexer_progress *stats)
{
	int error = -1;
	struct git_pack_header *hdr;
	git_mwindow_file *mwf;

	GIT_ASSERT_ARG(idx);
	GIT_ASSERT_ARG(data);
	GIT_ASSERT_ARG(stats);

	/* Only touch idx after it has been validated above. */
	hdr = &idx->hdr;
	mwf = &idx->pack->mwf;

	if ((error = append_to_pack(idx, data, size)) < 0)
		return error;

	/* Pass the size through untruncated; it is a size_t on both sides. */
	hash_partially(idx, data, size);

	/* Make sure we set the new size of the pack */
	idx->pack->mwf.size += size;

	if (!idx->parsed_header) {
		unsigned int total_objects;

		/* Wait until the whole header has arrived. */
		if (idx->pack->mwf.size < (off64_t)sizeof(struct git_pack_header))
			return 0;

		if ((error = parse_header(&idx->hdr, idx->pack)) < 0)
			return error;

		idx->parsed_header = 1;
		idx->nr_objects = ntohl(hdr->hdr_entries);
		idx->off = sizeof(struct git_pack_header);

		if (idx->nr_objects <= git_indexer__max_objects) {
			total_objects = (unsigned int)idx->nr_objects;
		} else {
			git_error_set(GIT_ERROR_INDEXER, "too many objects");
			return -1;
		}

		if (git_oidmap_new(&idx->pack->idx_cache) < 0)
			return -1;

		idx->pack->has_cache = 1;
		if (git_vector_init(&idx->objects, total_objects, objects_cmp) < 0)
			return -1;

		if (git_vector_init(&idx->deltas, total_objects / 2, NULL) < 0)
			return -1;

		stats->received_objects = 0;
		stats->local_objects = 0;
		stats->total_deltas = 0;
		stats->indexed_deltas = 0;
		stats->indexed_objects = 0;
		stats->total_objects = total_objects;

		if ((error = do_progress_callback(idx, stats)) != 0)
			return error;
	}

	/* Now that we have data in the pack, let's try to parse it */

	/* As the file grows any windows we try to use will be out of date */
	if ((error = git_mwindow_free_all(mwf)) < 0)
		goto on_error;

	while (stats->indexed_objects < idx->nr_objects) {
		if ((error = read_stream_object(idx, stats)) != 0) {
			/* GIT_EBUFS only means "wait for more data" */
			if (error == GIT_EBUFS)
				break;
			else
				goto on_error;
		}
	}

	return 0;

on_error:
	git_mwindow_free_all(mwf);
	return error;
}
852
/*
 * Replace the last path component of `path` with
 * "pack-<hex of idx->hash><suffix>". Returns 0 on success, -1 if the buffer
 * could not be grown or is in an out-of-memory state.
 */
static int index_path(git_buf *path, git_indexer *idx, const char *suffix)
{
	const char prefix[] = "pack-";
	size_t dir_len;
	size_t needed;

	/* Find the length of the directory part, including the final '/'. */
	for (dir_len = (size_t)path->size; dir_len > 0; dir_len--) {
		if (path->ptr[dir_len - 1] == '/')
			break;
	}

	needed = dir_len + 1 + strlen(prefix) + GIT_OID_HEXSZ + strlen(suffix) + 1;
	if (git_buf_grow(path, needed) < 0)
		return -1;

	git_buf_truncate(path, dir_len);
	git_buf_puts(path, prefix);

	/* The hex id is formatted straight into the buffer, so account for
	 * its length by hand. */
	git_oid_fmt(path->ptr + git_buf_len(path), &idx->hash);
	path->size += GIT_OID_HEXSZ;

	git_buf_puts(path, suffix);

	if (git_buf_oom(path))
		return -1;

	return 0;
}
874
875 /**
876 * Rewind the packfile by the trailer, as we might need to fix the
877 * packfile by injecting objects at the tail and must overwrite it.
878 */
seek_back_trailer(git_indexer * idx)879 static int seek_back_trailer(git_indexer *idx)
880 {
881 idx->pack->mwf.size -= GIT_OID_RAWSZ;
882 return git_mwindow_free_all(&idx->pack->mwf);
883 }
884
/*
 * Copy object `id` from the ODB to the end of the pack: the existing
 * trailer is overwritten with the object header and its deflated data, a
 * placeholder trailer is appended, and the object is registered through
 * save_entry().
 *
 * Returns 0 on success or a negative error code; all temporary resources
 * are released on every path.
 */
static int inject_object(git_indexer *idx, git_oid *id)
{
	git_odb_object *obj = NULL;
	struct entry *entry = NULL;
	struct git_pack_entry *pentry = NULL;
	git_oid foo = {{0}};
	unsigned char hdr[64];
	git_buf buf = GIT_BUF_INIT;
	off64_t entry_start;
	const void *data;
	size_t len, hdr_len;
	int error;

	if ((error = seek_back_trailer(idx)) < 0)
		goto cleanup;

	entry_start = idx->pack->mwf.size;

	if ((error = git_odb_read(&obj, idx->odb, id)) < 0) {
		git_error_set(GIT_ERROR_INDEXER, "missing delta bases");
		goto cleanup;
	}

	data = git_odb_object_data(obj);
	len = git_odb_object_size(obj);

	entry = git__calloc(1, sizeof(*entry));
	if (entry == NULL) {
		/* go through cleanup so the ODB object is released */
		error = -1;
		goto cleanup;
	}

	entry->crc = crc32(0L, Z_NULL, 0);

	/* Write out the object header */
	if ((error = git_packfile__object_header(&hdr_len, hdr, len, git_odb_object_type(obj))) < 0 ||
	    (error = append_to_pack(idx, hdr, hdr_len)) < 0)
		goto cleanup;

	idx->pack->mwf.size += hdr_len;
	entry->crc = crc32(entry->crc, hdr, (uInt)hdr_len);

	if ((error = git_zstream_deflatebuf(&buf, data, len)) < 0)
		goto cleanup;

	/* And then the compressed object */
	if ((error = append_to_pack(idx, buf.ptr, buf.size)) < 0)
		goto cleanup;

	idx->pack->mwf.size += buf.size;
	entry->crc = htonl(crc32(entry->crc, (unsigned char *)buf.ptr, (uInt)buf.size));
	git_buf_dispose(&buf);

	/* Write a fake trailer so the pack functions play ball */

	if ((error = append_to_pack(idx, &foo, GIT_OID_RAWSZ)) < 0)
		goto cleanup;

	idx->pack->mwf.size += GIT_OID_RAWSZ;

	pentry = git__calloc(1, sizeof(struct git_pack_entry));
	if (pentry == NULL) {
		error = -1;
		goto cleanup;
	}

	git_oid_cpy(&pentry->sha1, id);
	git_oid_cpy(&entry->oid, id);
	/* NOTE(review): uses the inflated length `len`, not the deflated size — confirm intent. */
	idx->off = entry_start + hdr_len + len;

	error = save_entry(idx, entry, pentry, entry_start);

	/*
	 * If the cache still references pentry after a failure, it owns it;
	 * freeing it below would leave a dangling pointer in the cache.
	 */
	if (error < 0 &&
	    git_oidmap_get(idx->pack->idx_cache, &pentry->sha1) == pentry)
		pentry = NULL;

cleanup:
	if (error) {
		git__free(entry);
		git__free(pentry);
	}

	git_buf_dispose(&buf);
	git_odb_object_free(obj);
	return error;
}
960
/*
 * Called when no pending delta could be resolved: find the first pending
 * REF delta, read the OID of its base and, if that object is not yet in the
 * pack, inject it from the ODB (counting it in stats->local_objects).
 *
 * Returns 0 on success (including when the base is already present), or a
 * negative value if there is no ODB, no REF delta is pending, the base
 * information cannot be read, or the injection fails.
 */
static int fix_thin_pack(git_indexer *idx, git_indexer_progress *stats)
{
	int error, found_ref_delta = 0;
	unsigned int i;
	struct delta_info *delta;
	size_t size;
	git_object_t type;
	git_mwindow *w = NULL;
	off64_t curpos = 0;
	unsigned char *base_info;
	unsigned int left = 0;
	git_oid base;

	GIT_ASSERT(git_vector_length(&idx->deltas) > 0);

	if (idx->odb == NULL) {
		git_error_set(GIT_ERROR_INDEXER, "cannot fix a thin pack without an ODB");
		return -1;
	}

	/* Loop until we find the first REF delta */
	git_vector_foreach(&idx->deltas, i, delta) {
		/* slots of already resolved deltas are set to NULL */
		if (!delta)
			continue;

		curpos = delta->delta_off;
		error = git_packfile_unpack_header(&size, &type, idx->pack, &w, &curpos);
		if (error < 0)
			return error;

		if (type == GIT_OBJECT_REF_DELTA) {
			found_ref_delta = 1;
			break;
		}
	}

	if (!found_ref_delta) {
		git_error_set(GIT_ERROR_INDEXER, "no REF_DELTA found, cannot inject object");
		return -1;
	}

	/* curpos now points to the base information, which is an OID */
	base_info = git_mwindow_open(&idx->pack->mwf, &w, curpos, GIT_OID_RAWSZ, &left);
	if (base_info == NULL) {
		git_error_set(GIT_ERROR_INDEXER, "failed to map delta information");
		return -1;
	}

	git_oid_fromraw(&base, base_info);
	git_mwindow_close(&w);

	/* nothing to inject if the base is already part of the pack */
	if (has_entry(idx, &base))
		return 0;

	if (inject_object(idx, &base) < 0)
		return -1;

	stats->local_objects++;

	return 0;
}
1022
/*
 * Resolve all pending deltas. The list is walked repeatedly: every delta
 * whose base is available is unpacked, hashed, saved and removed from the
 * list. When a full pass makes no progress, fix_thin_pack() is asked to
 * inject a missing base from the ODB.
 *
 * Returns 0 when no deltas remain, a negative value on error, or the
 * progress callback's negative return value.
 */
static int resolve_deltas(git_indexer *idx, git_indexer_progress *stats)
{
	unsigned int i;
	int error;
	struct delta_info *delta;
	int progressed = 0, non_null = 0, progress_cb_result;

	while (idx->deltas.length > 0) {
		progressed = 0;
		non_null = 0;
		git_vector_foreach(&idx->deltas, i, delta) {
			git_rawobj obj = {0};

			/* resolved deltas leave a NULL slot behind */
			if (!delta)
				continue;

			non_null = 1;
			idx->off = delta->delta_off;
			if ((error = git_packfile_unpack(&obj, idx->pack, &idx->off)) < 0) {
				if (error == GIT_PASSTHROUGH) {
					/* We have not seen the base object, we'll try again later. */
					continue;
				}
				return -1;
			}

			if (idx->do_verify && check_object_connectivity(idx, &obj) < 0) {
				/* TODO: error? continue? */
				/* The unpacked data is ours; don't leak it on retry. */
				git__free(obj.data);
				continue;
			}

			/* hash_and_save() releases obj.data itself when it fails */
			if (hash_and_save(idx, &obj, delta->delta_off) < 0)
				continue;

			git__free(obj.data);
			stats->indexed_objects++;
			stats->indexed_deltas++;
			progressed = 1;
			if ((progress_cb_result = do_progress_callback(idx, stats)) < 0)
				return progress_cb_result;

			/* remove from the list */
			git_vector_set(NULL, &idx->deltas, i, NULL);
			git__free(delta);
		}

		/* if none were actually set, we're done */
		if (!non_null)
			break;

		if (!progressed && (fix_thin_pack(idx, stats) < 0)) {
			return -1;
		}
	}

	return 0;
}
1079
/*
 * Rewrite the pack header so its entry count includes the locally injected
 * objects, then recompute the trailer hash over the whole pack file.
 *
 * Returns 0 on success, -1 on failure.
 */
static int update_header_and_rehash(git_indexer *idx, git_indexer_progress *stats)
{
	const size_t window_hint = 1024*1024;
	git_mwindow_file *mwf = &idx->pack->mwf;
	git_mwindow *window = NULL;
	unsigned int avail;
	off64_t pos;
	void *chunk;

	git_hash_init(&idx->trailer);

	/* Update the header to include the number of local objects we injected */
	idx->hdr.hdr_entries = htonl(stats->total_objects + stats->local_objects);
	if (write_at(idx, &idx->hdr, 0, sizeof(struct git_pack_header)) < 0)
		return -1;

	/*
	 * Same approach as while receiving the pack: walk the file to its
	 * end and rely on hash_partially() to hold the current trailer back
	 * from the hash.
	 */
	if (git_mwindow_free_all(mwf) < 0)
		return -1;

	idx->inbuf_len = 0;
	for (pos = 0; pos < mwf->size; pos += avail) {
		chunk = git_mwindow_open(mwf, &window, pos, window_hint, &avail);
		if (chunk == NULL)
			return -1;

		hash_partially(idx, chunk, avail);
		git_mwindow_close(&window);
	}

	return 0;
}
1122
/*
 * Finalize the pack that has been fed to the indexer.
 *
 * The steps, in order:
 *   1. check that exactly one trailer-sized chunk follows the last parsed
 *      object and that it matches the hash computed while receiving;
 *   2. resolve the outstanding deltas;
 *   3. if local objects were injected, rewrite the pack header and trailer;
 *   4. fail if referenced objects are still missing;
 *   5. write the version 2 `.idx` file and commit it under its final name;
 *   6. truncate, optionally fsync, close and rename the pack file.
 *
 * idx:   the indexer; its pack header must already have been parsed.
 * stats: progress counters; `total_deltas` is updated here and the
 *        structure is handed on to resolve_deltas().
 *
 * Returns 0 on success. On failure returns -1, or the negative value
 * reported by resolve_deltas().
 */
int git_indexer_commit(git_indexer *idx, git_indexer_progress *stats)
{
	git_mwindow *w = NULL;
	unsigned int i, long_offsets = 0, left;
	int error;
	struct git_pack_idx_header hdr;
	git_buf filename = GIT_BUF_INIT;
	struct entry *entry;
	git_oid trailer_hash, file_hash;
	git_filebuf index_file = {0};
	void *packfile_trailer;

	if (!idx->parsed_header) {
		git_error_set(GIT_ERROR_INDEXER, "incomplete pack header");
		return -1;
	}

	/* Test for this before resolve_deltas(), as it plays with idx->off */
	if (idx->off + GIT_OID_RAWSZ < idx->pack->mwf.size) {
		git_error_set(GIT_ERROR_INDEXER, "unexpected data at the end of the pack");
		return -1;
	}
	if (idx->off + GIT_OID_RAWSZ > idx->pack->mwf.size) {
		git_error_set(GIT_ERROR_INDEXER, "missing trailer at the end of the pack");
		return -1;
	}

	packfile_trailer = git_mwindow_open(&idx->pack->mwf, &w,
		idx->pack->mwf.size - GIT_OID_RAWSZ, GIT_OID_RAWSZ, &left);
	if (packfile_trailer == NULL) {
		git_mwindow_close(&w);
		goto on_error;
	}

	/* Compare the packfile trailer as it was sent to us and what we calculated */
	git_oid_fromraw(&file_hash, packfile_trailer);
	git_mwindow_close(&w);

	if (git_hash_final(&trailer_hash, &idx->trailer) < 0)
		return -1;

	if (git_oid_cmp(&file_hash, &trailer_hash)) {
		git_error_set(GIT_ERROR_INDEXER, "packfile trailer mismatch");
		return -1;
	}

	/* Freeze the number of deltas */
	stats->total_deltas = stats->total_objects - stats->indexed_objects;

	if ((error = resolve_deltas(idx, stats)) < 0)
		return error;

	if (stats->indexed_objects != stats->total_objects) {
		git_error_set(GIT_ERROR_INDEXER, "early EOF");
		return -1;
	}

	if (stats->local_objects > 0) {
		/*
		 * Objects were appended to the pack, so the header's object
		 * count and the trailing hash both have to be rewritten.
		 */
		if (update_header_and_rehash(idx, stats) < 0)
			return -1;

		if (git_hash_final(&trailer_hash, &idx->trailer) < 0)
			return -1;

		/* A failed write here would leave a stale trailer in the pack */
		if (write_at(idx, &trailer_hash,
			idx->pack->mwf.size - GIT_OID_RAWSZ, GIT_OID_RAWSZ) < 0)
			return -1;
	}

	/*
	 * Is the resulting graph fully connected or are we still
	 * missing some objects? In the second case, we can
	 * bail out due to an incomplete and thus corrupt
	 * packfile.
	 */
	if (git_oidmap_size(idx->expected_oids) > 0) {
		git_error_set(GIT_ERROR_INDEXER, "packfile is missing %"PRIuZ" objects",
			git_oidmap_size(idx->expected_oids));
		return -1;
	}

	git_vector_sort(&idx->objects);

	/* Use the trailer hash as the pack file name to ensure
	 * files with different contents have different names */
	git_oid_cpy(&idx->hash, &trailer_hash);

	/* Derive the temporary index name: swap the "pack" suffix for "idx" */
	git_buf_sets(&filename, idx->pack->pack_name);
	git_buf_shorten(&filename, strlen("pack"));
	git_buf_puts(&filename, "idx");
	if (git_buf_oom(&filename))
		goto on_error;

	if (git_filebuf_open(&index_file, filename.ptr,
		GIT_FILEBUF_HASH_CONTENTS |
		(idx->do_fsync ? GIT_FILEBUF_FSYNC : 0),
		idx->mode) < 0)
		goto on_error;

	/* Write out the header */
	hdr.idx_signature = htonl(PACK_IDX_SIGNATURE);
	hdr.idx_version = htonl(2);
	if (git_filebuf_write(&index_file, &hdr, sizeof(hdr)) < 0)
		goto on_error;

	/* Write out the fanout table */
	for (i = 0; i < 256; ++i) {
		uint32_t n = htonl(idx->fanout[i]);

		if (git_filebuf_write(&index_file, &n, sizeof(n)) < 0)
			goto on_error;
	}

	/* Write out the object names (SHA-1 hashes) */
	git_vector_foreach(&idx->objects, i, entry) {
		if (git_filebuf_write(&index_file, &entry->oid, sizeof(git_oid)) < 0)
			goto on_error;
	}

	/* Write out the CRC32 values */
	git_vector_foreach(&idx->objects, i, entry) {
		if (git_filebuf_write(&index_file, &entry->crc, sizeof(uint32_t)) < 0)
			goto on_error;
	}

	/* Write out the offsets */
	git_vector_foreach(&idx->objects, i, entry) {
		uint32_t n;

		/*
		 * An offset of UINT32_MAX marks an entry whose real offset
		 * lives in offset_long; store the high bit plus its position
		 * in the long-offset table instead.
		 */
		if (entry->offset == UINT32_MAX)
			n = htonl(0x80000000 | long_offsets++);
		else
			n = htonl(entry->offset);

		if (git_filebuf_write(&index_file, &n, sizeof(uint32_t)) < 0)
			goto on_error;
	}

	/* Write out the long offsets */
	git_vector_foreach(&idx->objects, i, entry) {
		uint32_t split[2];

		if (entry->offset != UINT32_MAX)
			continue;

		split[0] = htonl(entry->offset_long >> 32);
		split[1] = htonl(entry->offset_long & 0xffffffff);

		if (git_filebuf_write(&index_file, &split, sizeof(uint32_t) * 2) < 0)
			goto on_error;
	}

	/* Write out the packfile trailer to the index */
	if (git_filebuf_write(&index_file, &trailer_hash, GIT_OID_RAWSZ) < 0)
		goto on_error;

	/* Write out the hash of the idx */
	if (git_filebuf_hash(&trailer_hash, &index_file) < 0)
		goto on_error;

	if (git_filebuf_write(&index_file, &trailer_hash, sizeof(git_oid)) < 0)
		goto on_error;

	/* Figure out what the final name should be */
	if (index_path(&filename, idx, ".idx") < 0)
		goto on_error;

	/* Commit file */
	if (git_filebuf_commit_at(&index_file, filename.ptr) < 0)
		goto on_error;

	if (git_mwindow_free_all(&idx->pack->mwf) < 0)
		goto on_error;

	/* Truncate file to undo rounding up to next page_size in append_to_pack */
	if (p_ftruncate(idx->pack->mwf.fd, idx->pack->mwf.size) < 0) {
		git_error_set(GIT_ERROR_OS, "failed to truncate pack file '%s'", idx->pack->pack_name);
		/* Go through the shared cleanup so `filename` is released */
		goto on_error;
	}

	if (idx->do_fsync && p_fsync(idx->pack->mwf.fd) < 0) {
		git_error_set(GIT_ERROR_OS, "failed to fsync packfile");
		goto on_error;
	}

	/* We need to close the descriptor here so Windows doesn't choke on commit_at */
	if (p_close(idx->pack->mwf.fd) < 0) {
		git_error_set(GIT_ERROR_OS, "failed to close packfile");
		goto on_error;
	}

	idx->pack->mwf.fd = -1;

	if (index_path(&filename, idx, ".pack") < 0)
		goto on_error;

	/* And don't forget to rename the packfile to its new place. */
	if (p_rename(idx->pack->pack_name, git_buf_cstr(&filename)) < 0)
		goto on_error;

	/* And fsync the parent directory if we're asked to. */
	if (idx->do_fsync &&
		git_futils_fsync_parent(git_buf_cstr(&filename)) < 0)
		goto on_error;

	idx->pack_committed = 1;

	git_buf_dispose(&filename);
	return 0;

on_error:
	/* Shared failure path: drop mapped windows, the index filebuf and the name buffer */
	git_mwindow_free_all(&idx->pack->mwf);
	git_filebuf_cleanup(&index_file);
	git_buf_dispose(&filename);
	return -1;
}
1324
/*
 * Tear down an indexer and release everything it holds: the packfile
 * stream (if one is active), the object and delta vectors, the entries
 * of the pack's index cache, the pack itself, the values stored in the
 * expected-OID map, both hash contexts, the entry buffer and finally the
 * indexer structure.
 *
 * Passing NULL is a no-op.
 */
void git_indexer_free(git_indexer *idx)
{
	const git_oid *oid;
	void *expected;
	size_t cursor = 0;

	if (!idx)
		return;

	if (idx->have_stream)
		git_packfile_stream_dispose(&idx->stream);

	git_vector_free_deep(&idx->objects);

	/* The cache's values are freed one by one before the map itself */
	if (idx->pack->idx_cache) {
		struct git_pack_entry *cached;

		git_oidmap_foreach_value(idx->pack->idx_cache, cached, {
			git__free(cached);
		});

		git_oidmap_free(idx->pack->idx_cache);
	}

	git_vector_free_deep(&idx->deltas);

	/*
	 * The flag is set when the pack was never committed.
	 * NOTE(review): presumably this asks git_packfile_free() to remove
	 * the temporary pack file -- confirm against its definition.
	 */
	git_packfile_free(idx->pack, !idx->pack_committed);

	/* Drain the values of the expected-OID map, then drop the map */
	for (;;) {
		if (git_oidmap_iterate(&expected, idx->expected_oids, &cursor, &oid) != 0)
			break;

		git__free(expected);
	}
	git_oidmap_free(idx->expected_oids);

	git_hash_ctx_cleanup(&idx->trailer);
	git_hash_ctx_cleanup(&idx->hash_ctx);
	git_buf_dispose(&idx->entry_data);

	git__free(idx);
}
1362