1 /*
2 * Copyright (C) the libgit2 contributors. All rights reserved.
3 *
4 * This file is part of libgit2, distributed under the GNU GPL v2 with
5 * a Linking Exception. For full terms see the included COPYING file.
6 */
7
8 #include "indexer.h"
9
10 #include "git2/indexer.h"
11 #include "git2/object.h"
12
13 #include "commit.h"
14 #include "tree.h"
15 #include "tag.h"
16 #include "pack.h"
17 #include "mwindow.h"
18 #include "posix.h"
19 #include "pack.h"
20 #include "filebuf.h"
21 #include "oid.h"
22 #include "oidarray.h"
23 #include "oidmap.h"
24 #include "zstream.h"
25 #include "object.h"
26
27 extern git_mutex git__mwindow_mutex;
28
29 size_t git_indexer__max_objects = UINT32_MAX;
30
31 #define UINT31_MAX (0x7FFFFFFF)
32
33 struct entry {
34 git_oid oid;
35 uint32_t crc;
36 uint32_t offset;
37 uint64_t offset_long;
38 };
39
40 struct git_indexer {
41 unsigned int parsed_header :1,
42 pack_committed :1,
43 have_stream :1,
44 have_delta :1,
45 do_fsync :1,
46 do_verify :1;
47 struct git_pack_header hdr;
48 struct git_pack_file *pack;
49 unsigned int mode;
50 off64_t off;
51 off64_t entry_start;
52 git_object_t entry_type;
53 git_buf entry_data;
54 git_packfile_stream stream;
55 size_t nr_objects;
56 git_vector objects;
57 git_vector deltas;
58 unsigned int fanout[256];
59 git_hash_ctx hash_ctx;
60 git_oid hash;
61 git_indexer_progress_cb progress_cb;
62 void *progress_payload;
63 char objbuf[8*1024];
64
65 /* OIDs referenced from pack objects. Used for verification. */
66 git_oidmap *expected_oids;
67
68 /* Needed to look up objects which we want to inject to fix a thin pack */
69 git_odb *odb;
70
71 /* Fields for calculating the packfile trailer (hash of everything before it) */
72 char inbuf[GIT_OID_RAWSZ];
73 size_t inbuf_len;
74 git_hash_ctx trailer;
75 };
76
77 struct delta_info {
78 off64_t delta_off;
79 };
80
git_indexer_hash(const git_indexer * idx)81 const git_oid *git_indexer_hash(const git_indexer *idx)
82 {
83 return &idx->hash;
84 }
85
parse_header(struct git_pack_header * hdr,struct git_pack_file * pack)86 static int parse_header(struct git_pack_header *hdr, struct git_pack_file *pack)
87 {
88 int error;
89 git_map map;
90
91 if ((error = p_mmap(&map, sizeof(*hdr), GIT_PROT_READ, GIT_MAP_SHARED, pack->mwf.fd, 0)) < 0)
92 return error;
93
94 memcpy(hdr, map.data, sizeof(*hdr));
95 p_munmap(&map);
96
97 /* Verify we recognize this pack file format. */
98 if (hdr->hdr_signature != ntohl(PACK_SIGNATURE)) {
99 git_error_set(GIT_ERROR_INDEXER, "wrong pack signature");
100 return -1;
101 }
102
103 if (!pack_version_ok(hdr->hdr_version)) {
104 git_error_set(GIT_ERROR_INDEXER, "wrong pack version");
105 return -1;
106 }
107
108 return 0;
109 }
110
objects_cmp(const void * a,const void * b)111 static int objects_cmp(const void *a, const void *b)
112 {
113 const struct entry *entrya = a;
114 const struct entry *entryb = b;
115
116 return git_oid__cmp(&entrya->oid, &entryb->oid);
117 }
118
/**
 * Initialize a git_indexer_options structure from the
 * GIT_INDEXER_OPTIONS_INIT template.
 *
 * @param opts the structure to initialize
 * @param version the structure version, forwarded to the init macro
 * @return 0 once the macro has completed (the macro is defined outside
 *         this file and may return early on its own)
 */
int git_indexer_options_init(git_indexer_options *opts, unsigned int version)
{
	GIT_INIT_STRUCTURE_FROM_TEMPLATE(
		opts,
		version,
		git_indexer_options,
		GIT_INDEXER_OPTIONS_INIT);

	return 0;
}
125
126 #ifndef GIT_DEPRECATE_HARD
git_indexer_init_options(git_indexer_options * opts,unsigned int version)127 int git_indexer_init_options(git_indexer_options *opts, unsigned int version)
128 {
129 return git_indexer_options_init(opts, version);
130 }
131 #endif
132
/**
 * Create a new indexer that writes the incoming pack to a temporary
 * file below `prefix`.
 *
 * @param out receives the new indexer on success
 * @param prefix directory in which the temporary pack file is created
 * @param mode file mode for created files; 0 selects GIT_PACK_FILE_MODE
 * @param odb object database used for verification and for fixing thin
 *            packs; stored as-is and may be NULL
 * @param in_opts optional options; NULL selects GIT_INDEXER_OPTIONS_INIT
 * @return 0 on success, -1 on failure
 */
int git_indexer_new(
	git_indexer **out,
	const char *prefix,
	unsigned int mode,
	git_odb *odb,
	git_indexer_options *in_opts)
{
	git_indexer_options opts = GIT_INDEXER_OPTIONS_INIT;
	git_indexer *idx;
	git_buf path = GIT_BUF_INIT, tmp_path = GIT_BUF_INIT;
	static const char suff[] = "/pack";
	int error, fd = -1;

	if (in_opts)
		memcpy(&opts, in_opts, sizeof(opts));

	idx = git__calloc(1, sizeof(git_indexer));
	GIT_ERROR_CHECK_ALLOC(idx);
	idx->odb = odb;
	idx->progress_cb = opts.progress_cb;
	idx->progress_payload = opts.progress_cb_payload;
	idx->mode = mode ? mode : GIT_PACK_FILE_MODE;
	git_buf_init(&idx->entry_data, 0);

	if ((error = git_hash_ctx_init(&idx->hash_ctx)) < 0 ||
	    (error = git_hash_ctx_init(&idx->trailer)) < 0 ||
	    (error = git_oidmap_new(&idx->expected_oids)) < 0)
		goto cleanup;

	idx->do_verify = opts.verify;

	if (git_repository__fsync_gitdir)
		idx->do_fsync = 1;

	error = git_buf_joinpath(&path, prefix, suff);
	if (error < 0)
		goto cleanup;

	fd = git_futils_mktmp(&tmp_path, git_buf_cstr(&path), idx->mode);
	git_buf_dispose(&path);
	if (fd < 0)
		goto cleanup;

	/*
	 * Keep tmp_path alive until we know the packfile structure was
	 * allocated: on failure the cleanup code needs the name to unlink
	 * the temporary file we just created.
	 */
	error = git_packfile_alloc(&idx->pack, git_buf_cstr(&tmp_path));
	if (error < 0)
		goto cleanup;

	/* From here on the file name is available as idx->pack->pack_name. */
	git_buf_dispose(&tmp_path);

	idx->pack->mwf.fd = fd;
	if ((error = git_mwindow_file_register(&idx->pack->mwf)) < 0)
		goto cleanup;

	*out = idx;
	return 0;

cleanup:
	/*
	 * NOTE(review): the hash contexts, the expected_oids map and
	 * idx->pack itself are not released on this path.
	 */
	if (fd != -1)
		p_close(fd);

	if (git_buf_len(&tmp_path) > 0)
		p_unlink(git_buf_cstr(&tmp_path));

	if (idx->pack != NULL)
		p_unlink(idx->pack->pack_name);

	git_buf_dispose(&path);
	git_buf_dispose(&tmp_path);
	git__free(idx);
	return -1;
}
204
/* Turn the indexer's fsync flag on (non-zero `do_fsync`) or off (zero). */
void git_indexer__set_fsync(git_indexer *idx, int do_fsync)
{
	idx->do_fsync = (do_fsync != 0);
}
209
210 /* Try to store the delta so we can try to resolve it later */
store_delta(git_indexer * idx)211 static int store_delta(git_indexer *idx)
212 {
213 struct delta_info *delta;
214
215 delta = git__calloc(1, sizeof(struct delta_info));
216 GIT_ERROR_CHECK_ALLOC(delta);
217 delta->delta_off = idx->entry_start;
218
219 if (git_vector_insert(&idx->deltas, delta) < 0)
220 return -1;
221
222 return 0;
223 }
224
/*
 * Feed the object header for an object of the given length and type
 * into `ctx`.
 *
 * Returns the error from formatting the header, otherwise the result
 * of git_hash_update().
 */
static int hash_header(git_hash_ctx *ctx, off64_t len, git_object_t type)
{
	char header[64];
	size_t header_len;
	int error = git_odb__format_object_header(
		&header_len, header, sizeof(header), (size_t)len, type);

	if (error < 0)
		return error;

	return git_hash_update(ctx, header, header_len);
}
237
/*
 * Drain `stream`, feeding everything that is read into idx->hash_ctx
 * and, when verification is enabled, also into idx->entry_data.
 *
 * Returns 0 once the stream is exhausted, or a negative error code if
 * reading, buffering or hashing fails.
 */
static int hash_object_stream(git_indexer*idx, git_packfile_stream *stream)
{
	ssize_t read;
	int error;

	assert(idx && stream);

	do {
		if ((read = git_packfile_stream_read(stream, idx->objbuf, sizeof(idx->objbuf))) < 0)
			break;

		/*
		 * A failure here must not go unnoticed: a short entry_data
		 * would be handed to the connectivity check, and a failed
		 * hash update would produce a wrong object id.
		 */
		if (idx->do_verify &&
		    (error = git_buf_put(&idx->entry_data, idx->objbuf, read)) < 0)
			return error;

		if ((error = git_hash_update(&idx->hash_ctx, idx->objbuf, read)) < 0)
			return error;
	} while (read > 0);

	if (read < 0)
		return (int)read;

	return 0;
}
259
260 /* In order to create the packfile stream, we need to skip over the delta base description */
advance_delta_offset(git_indexer * idx,git_object_t type)261 static int advance_delta_offset(git_indexer *idx, git_object_t type)
262 {
263 git_mwindow *w = NULL;
264
265 assert(type == GIT_OBJECT_REF_DELTA || type == GIT_OBJECT_OFS_DELTA);
266
267 if (type == GIT_OBJECT_REF_DELTA) {
268 idx->off += GIT_OID_RAWSZ;
269 } else {
270 off64_t base_off;
271 int error = get_delta_base(&base_off, idx->pack, &w, &idx->off, type, idx->entry_start);
272 git_mwindow_close(&w);
273 if (error < 0)
274 return error;
275 }
276
277 return 0;
278 }
279
280 /* Read from the stream and discard any output */
read_object_stream(git_indexer * idx,git_packfile_stream * stream)281 static int read_object_stream(git_indexer *idx, git_packfile_stream *stream)
282 {
283 ssize_t read;
284
285 assert(stream);
286
287 do {
288 read = git_packfile_stream_read(stream, idx->objbuf, sizeof(idx->objbuf));
289 } while (read > 0);
290
291 if (read < 0)
292 return (int)read;
293
294 return 0;
295 }
296
/*
 * Compute the CRC32 over `size` bytes of the pack starting at `start`
 * and store it in `crc_out` after passing it through htonl().
 *
 * Returns 0 on success, -1 if a window on the file cannot be opened.
 */
static int crc_object(uint32_t *crc_out, git_mwindow_file *mwf, off64_t start, off64_t size)
{
	git_mwindow *window = NULL;
	unsigned int avail, take;
	uint32_t running;
	void *chunk;

	running = crc32(0L, Z_NULL, 0);

	for (; size; size -= take, start += take) {
		chunk = git_mwindow_open(mwf, &window, start, (size_t)size, &avail);
		if (chunk == NULL)
			return -1;

		/* Never read past the object, even if the window is larger. */
		take = min(avail, (unsigned int)size);
		running = crc32(running, chunk, take);
		git_mwindow_close(&window);
	}

	*crc_out = htonl(running);
	return 0;
}
320
/*
 * Record `oid` as an object that still has to show up before the pack
 * can be considered complete.
 *
 * Returns 0 when nothing had to be recorded or the OID was stored, and
 * a negative value on allocation or map failure.
 */
static int add_expected_oid(git_indexer *idx, const git_oid *oid)
{
	git_oid *dup;
	int error;

	/*
	 * If we know about that object because it is stored in our ODB or
	 * because we have already processed it as part of our pack file, we do
	 * not have to expect it.
	 */
	if ((idx->odb && git_odb_exists(idx->odb, oid)) ||
	    git_oidmap_exists(idx->pack->idx_cache, oid) ||
	    git_oidmap_exists(idx->expected_oids, oid))
		return 0;

	dup = git__malloc(sizeof(*oid));
	GIT_ERROR_CHECK_ALLOC(dup);
	git_oid_cpy(dup, oid);

	/* The copy serves as both key and value; free it if the map rejects it. */
	if ((error = git_oidmap_set(idx->expected_oids, dup, dup)) < 0)
		git__free(dup);

	return error;
}
339
/*
 * Parse a raw object and update the set of expected OIDs: the object
 * itself is no longer expected, and every object it references (tree
 * entries, commit parents and tree, tag target) is added unless it is
 * already known.
 *
 * Objects of any type other than blob, tree, commit or tag are ignored.
 * Returns 0 on success and a negative value if the object cannot be
 * parsed or a referenced OID cannot be recorded.
 */
static int check_object_connectivity(git_indexer *idx, const git_rawobj *obj)
{
	/* NULL so that the shared exit path is safe when parsing fails. */
	git_object *object = NULL;
	git_oid *expected;
	int error;

	if (obj->type != GIT_OBJECT_BLOB &&
	    obj->type != GIT_OBJECT_TREE &&
	    obj->type != GIT_OBJECT_COMMIT &&
	    obj->type != GIT_OBJECT_TAG)
		return 0;

	if ((error = git_object__from_raw(&object, obj->data, obj->len, obj->type)) < 0)
		goto out;

	if ((expected = git_oidmap_get(idx->expected_oids, &object->cached.oid)) != NULL) {
		git_oidmap_delete(idx->expected_oids, &object->cached.oid);
		git__free(expected);
	}

	/*
	 * Check whether this is a known object. If so, we can just continue as
	 * we assume that the ODB has a complete graph. Leave through `out`
	 * so that the parsed object is released.
	 */
	if (idx->odb && git_odb_exists(idx->odb, &object->cached.oid))
		goto out;

	switch (obj->type) {
		case GIT_OBJECT_TREE:
		{
			git_tree *tree = (git_tree *) object;
			git_tree_entry *entry;
			size_t i;

			git_array_foreach(tree->entries, i, entry) {
				if ((error = add_expected_oid(idx, entry->oid)) < 0)
					goto out;
			}

			break;
		}
		case GIT_OBJECT_COMMIT:
		{
			git_commit *commit = (git_commit *) object;
			git_oid *parent_oid;
			size_t i;

			git_array_foreach(commit->parent_ids, i, parent_oid) {
				if ((error = add_expected_oid(idx, parent_oid)) < 0)
					goto out;
			}

			if ((error = add_expected_oid(idx, &commit->tree_id)) < 0)
				goto out;

			break;
		}
		case GIT_OBJECT_TAG:
		{
			git_tag *tag = (git_tag *) object;

			if ((error = add_expected_oid(idx, &tag->target)) < 0)
				goto out;

			break;
		}
		case GIT_OBJECT_BLOB:
		default:
			break;
	}

out:
	git_object_free(object);

	return error;
}
414
/*
 * Finish the non-delta object that was just streamed: finalize its
 * hash, optionally check its connectivity, register it in the pack's
 * idx_cache, compute its CRC and append it to idx->objects.
 *
 * Returns 0 on success and -1 on any failure (including a duplicate
 * object in the pack).
 */
static int store_object(git_indexer *idx)
{
	int i;
	git_oid oid;
	struct entry *entry;
	off64_t entry_size;
	struct git_pack_entry *pentry = NULL;
	off64_t entry_start = idx->entry_start;

	entry = git__calloc(1, sizeof(*entry));
	GIT_ERROR_CHECK_ALLOC(entry);

	pentry = git__calloc(1, sizeof(struct git_pack_entry));
	if (pentry == NULL)
		goto on_error;

	if (git_hash_final(&oid, &idx->hash_ctx))
		goto on_error;

	entry_size = idx->off - entry_start;
	if (entry_start > UINT31_MAX) {
		entry->offset = UINT32_MAX;
		entry->offset_long = entry_start;
	} else {
		entry->offset = (uint32_t)entry_start;
	}

	if (idx->do_verify) {
		git_rawobj rawobj = {
			idx->entry_data.ptr,
			idx->entry_data.size,
			idx->entry_type
		};

		if (check_object_connectivity(idx, &rawobj) < 0)
			goto on_error;
	}

	git_oid_cpy(&pentry->sha1, &oid);
	pentry->offset = entry_start;

	if (git_oidmap_exists(idx->pack->idx_cache, &pentry->sha1)) {
		git_error_set(GIT_ERROR_INDEXER, "duplicate object %s found in pack", git_oid_tostr_s(&pentry->sha1));
		goto on_error;
	}

	if (git_oidmap_set(idx->pack->idx_cache, &pentry->sha1, pentry) < 0) {
		git_error_set_oom();
		goto on_error;
	}

	/* The cache now references pentry; it must not be freed below. */
	pentry = NULL;

	git_oid_cpy(&entry->oid, &oid);

	if (crc_object(&entry->crc, &idx->pack->mwf, entry_start, entry_size) < 0)
		goto on_error;

	/* Add the object to the list */
	if (git_vector_insert(&idx->objects, entry) < 0)
		goto on_error;

	/* Every fanout bucket at or above the first oid byte counts this object. */
	for (i = oid.id[0]; i < 256; ++i) {
		idx->fanout[i]++;
	}

	return 0;

on_error:
	git__free(pentry);
	git__free(entry);

	return -1;
}
488
has_entry(git_indexer * idx,git_oid * id)489 GIT_INLINE(bool) has_entry(git_indexer *idx, git_oid *id)
490 {
491 return git_oidmap_exists(idx->pack->idx_cache, id);
492 }
493
/*
 * Register an already hashed object: record its offset in `entry` and
 * `pentry`, add `pentry` to the pack's idx_cache, append `entry` to
 * idx->objects and bump the fanout table.
 *
 * Returns 0 on success and -1 if the object is already cached, the
 * cache insert fails, or the vector insert fails. Nothing is freed
 * here on failure.
 */
static int save_entry(git_indexer *idx, struct entry *entry, struct git_pack_entry *pentry, off64_t entry_start)
{
	int bucket;

	/* Offsets that do not fit in 31 bits go to the 64-bit field. */
	if (entry_start <= UINT31_MAX) {
		entry->offset = (uint32_t)entry_start;
	} else {
		entry->offset = UINT32_MAX;
		entry->offset_long = entry_start;
	}

	pentry->offset = entry_start;

	if (git_oidmap_exists(idx->pack->idx_cache, &pentry->sha1) ||
	    git_oidmap_set(idx->pack->idx_cache, &pentry->sha1, pentry) < 0) {
		git_error_set(GIT_ERROR_INDEXER, "cannot insert object into pack");
		return -1;
	}

	/* Add the object to the list */
	if (git_vector_insert(&idx->objects, entry) < 0)
		return -1;

	bucket = entry->oid.id[0];
	while (bucket < 256) {
		idx->fanout[bucket]++;
		++bucket;
	}

	return 0;
}
523
/*
 * Hash a fully resolved object, compute the CRC of its packed entry
 * (from `entry_start` up to idx->off) and register it via save_entry().
 *
 * Returns 0 on success. On every failure -1 is returned and obj->data
 * has been freed, so the caller must not free it again.
 */
static int hash_and_save(git_indexer *idx, git_rawobj *obj, off64_t entry_start)
{
	git_oid oid;
	size_t entry_size;
	struct entry *entry;
	struct git_pack_entry *pentry = NULL;

	entry = git__calloc(1, sizeof(*entry));
	if (entry == NULL)
		goto on_error;

	if (git_odb__hashobj(&oid, obj) < 0) {
		git_error_set(GIT_ERROR_INDEXER, "failed to hash object");
		goto on_error;
	}

	pentry = git__calloc(1, sizeof(struct git_pack_entry));
	if (pentry == NULL)
		goto on_error;

	git_oid_cpy(&pentry->sha1, &oid);
	git_oid_cpy(&entry->oid, &oid);
	entry->crc = crc32(0L, Z_NULL, 0);

	entry_size = (size_t)(idx->off - entry_start);
	if (crc_object(&entry->crc, &idx->pack->mwf, entry_start, entry_size) < 0)
		goto on_error;

	if (save_entry(idx, entry, pentry, entry_start) < 0) {
		/*
		 * save_entry() may have failed after pentry was placed in
		 * the cache; only free it when the cache does not hold it.
		 */
		if (git_oidmap_get(idx->pack->idx_cache, &pentry->sha1) == pentry)
			pentry = NULL;

		/* The vector insert is save_entry()'s last step, so entry is still ours. */
		goto on_error;
	}

	return 0;

on_error:
	git__free(pentry);
	git__free(entry);
	git__free(obj->data);
	return -1;
}
558
/*
 * Invoke the user's progress callback, if one is set, and pass its
 * return value through git_error_set_after_callback_function().
 * Returns 0 when no callback is configured.
 */
static int do_progress_callback(git_indexer *idx, git_indexer_progress *stats)
{
	int cb_result;

	if (!idx->progress_cb)
		return 0;

	cb_result = idx->progress_cb(stats, idx->progress_payload);

	return git_error_set_after_callback_function(cb_result, "indexer progress");
}
567
568 /* Hash everything but the last 20B of input */
hash_partially(git_indexer * idx,const uint8_t * data,size_t size)569 static void hash_partially(git_indexer *idx, const uint8_t *data, size_t size)
570 {
571 size_t to_expell, to_keep;
572
573 if (size == 0)
574 return;
575
576 /* Easy case, dump the buffer and the data minus the last 20 bytes */
577 if (size >= GIT_OID_RAWSZ) {
578 git_hash_update(&idx->trailer, idx->inbuf, idx->inbuf_len);
579 git_hash_update(&idx->trailer, data, size - GIT_OID_RAWSZ);
580
581 data += size - GIT_OID_RAWSZ;
582 memcpy(idx->inbuf, data, GIT_OID_RAWSZ);
583 idx->inbuf_len = GIT_OID_RAWSZ;
584 return;
585 }
586
587 /* We can just append */
588 if (idx->inbuf_len + size <= GIT_OID_RAWSZ) {
589 memcpy(idx->inbuf + idx->inbuf_len, data, size);
590 idx->inbuf_len += size;
591 return;
592 }
593
594 /* We need to partially drain the buffer and then append */
595 to_keep = GIT_OID_RAWSZ - size;
596 to_expell = idx->inbuf_len - to_keep;
597
598 git_hash_update(&idx->trailer, idx->inbuf, to_expell);
599
600 memmove(idx->inbuf, idx->inbuf + to_expell, to_keep);
601 memcpy(idx->inbuf + to_keep, data, size);
602 idx->inbuf_len += size - to_expell;
603 }
604
/*
 * Copy `size` bytes from `data` into the pack file at `offset` through
 * a temporary shared, writable mapping.
 *
 * Returns 0 on success or the error from git__mmap_alignment() or
 * p_mmap().
 */
static int write_at(git_indexer *idx, const void *data, off64_t offset, size_t size)
{
	git_file fd = idx->pack->mwf.fd;
	size_t alignment;
	size_t in_page;
	off64_t map_start;
	git_map map;
	int error;

	assert(data && size);

	error = git__mmap_alignment(&alignment);
	if (error < 0)
		return error;

	/* the offset needs to be at the mmap boundary for the platform */
	in_page = offset % alignment;
	map_start = offset - in_page;

	error = p_mmap(&map, in_page + size, GIT_PROT_WRITE, GIT_MAP_SHARED, fd, map_start);
	if (error < 0)
		return error;

	memcpy((unsigned char *)map.data + in_page, data, size);
	p_munmap(&map);

	return 0;
}
633
/*
 * Append `size` bytes from `data` at the current end of the pack
 * (idx->pack->mwf.size). The recorded size itself is not updated here;
 * callers do that after a successful return.
 *
 * Returns 0 on success (or when `size` is 0), -1 if the file cannot be
 * extended, or the error from the alignment lookup or write_at().
 */
static int append_to_pack(git_indexer *idx, const void *data, size_t size)
{
	off64_t current_size = idx->pack->mwf.size;
	int fd = idx->pack->mwf.fd;
	off64_t grown_size, last_page;
	size_t alignment, in_page;
	int error;

	if (!size)
		return 0;

	error = git__mmap_alignment(&alignment);
	if (error < 0)
		return error;

	/* Write a single byte to force the file system to allocate space now or
	 * report an error, since we can't report errors when writing using mmap.
	 * Round the size up to the nearest page so that we only need to perform file
	 * I/O when we add a page, instead of whenever we write even a single byte. */
	grown_size = current_size + size;
	in_page = grown_size % alignment;
	last_page = grown_size - in_page;

	if (p_lseek(fd, last_page + alignment - 1, SEEK_SET) < 0 ||
	    p_write(fd, data, 1) < 0) {
		git_error_set(GIT_ERROR_OS, "cannot extend packfile '%s'", idx->pack->pack_name);
		return -1;
	}

	return write_at(idx, data, idx->pack->mwf.size, size);
}
666
read_stream_object(git_indexer * idx,git_indexer_progress * stats)667 static int read_stream_object(git_indexer *idx, git_indexer_progress *stats)
668 {
669 git_packfile_stream *stream = &idx->stream;
670 off64_t entry_start = idx->off;
671 size_t entry_size;
672 git_object_t type;
673 git_mwindow *w = NULL;
674 int error;
675
676 if (idx->pack->mwf.size <= idx->off + 20)
677 return GIT_EBUFS;
678
679 if (!idx->have_stream) {
680 error = git_packfile_unpack_header(&entry_size, &type, &idx->pack->mwf, &w, &idx->off);
681 if (error == GIT_EBUFS) {
682 idx->off = entry_start;
683 return error;
684 }
685 if (error < 0)
686 return error;
687
688 git_mwindow_close(&w);
689 idx->entry_start = entry_start;
690 git_hash_init(&idx->hash_ctx);
691 git_buf_clear(&idx->entry_data);
692
693 if (type == GIT_OBJECT_REF_DELTA || type == GIT_OBJECT_OFS_DELTA) {
694 error = advance_delta_offset(idx, type);
695 if (error == GIT_EBUFS) {
696 idx->off = entry_start;
697 return error;
698 }
699 if (error < 0)
700 return error;
701
702 idx->have_delta = 1;
703 } else {
704 idx->have_delta = 0;
705
706 error = hash_header(&idx->hash_ctx, entry_size, type);
707 if (error < 0)
708 return error;
709 }
710
711 idx->have_stream = 1;
712 idx->entry_type = type;
713
714 error = git_packfile_stream_open(stream, idx->pack, idx->off);
715 if (error < 0)
716 return error;
717 }
718
719 if (idx->have_delta) {
720 error = read_object_stream(idx, stream);
721 } else {
722 error = hash_object_stream(idx, stream);
723 }
724
725 idx->off = stream->curpos;
726 if (error == GIT_EBUFS)
727 return error;
728
729 /* We want to free the stream reasorces no matter what here */
730 idx->have_stream = 0;
731 git_packfile_stream_dispose(stream);
732
733 if (error < 0)
734 return error;
735
736 if (idx->have_delta) {
737 error = store_delta(idx);
738 } else {
739 error = store_object(idx);
740 }
741
742 if (error < 0)
743 return error;
744
745 if (!idx->have_delta) {
746 stats->indexed_objects++;
747 }
748 stats->received_objects++;
749
750 if ((error = do_progress_callback(idx, stats)) != 0)
751 return error;
752
753 return 0;
754 }
755
/**
 * Add data to the indexer.
 *
 * The data is appended to the pack file on disk and fed into the
 * trailer hash. Once the pack header is complete it is parsed, and as
 * many whole entries as the data received so far allows are indexed.
 *
 * @param idx the indexer
 * @param data the data to add
 * @param size the size of the data in bytes
 * @param stats progress statistics, updated as objects are processed
 * @return 0 on success (including when more data is needed), or a
 *         non-zero error code
 */
int git_indexer_append(git_indexer *idx, const void *data, size_t size, git_indexer_progress *stats)
{
	int error = -1;
	struct git_pack_header *hdr;
	git_mwindow_file *mwf;

	assert(idx && data && stats);

	/* Only touch idx after the assertion above has vetted it. */
	hdr = &idx->hdr;
	mwf = &idx->pack->mwf;

	if ((error = append_to_pack(idx, data, size)) < 0)
		return error;

	/* Pass the size through untouched; narrowing it to int would corrupt large lengths. */
	hash_partially(idx, data, size);

	/* Make sure we set the new size of the pack */
	idx->pack->mwf.size += size;

	if (!idx->parsed_header) {
		unsigned int total_objects;

		/* Wait until the whole header has arrived. */
		if (idx->pack->mwf.size < (off64_t)sizeof(struct git_pack_header))
			return 0;

		if ((error = parse_header(&idx->hdr, idx->pack)) < 0)
			return error;

		idx->parsed_header = 1;
		idx->nr_objects = ntohl(hdr->hdr_entries);
		idx->off = sizeof(struct git_pack_header);

		if (idx->nr_objects <= git_indexer__max_objects) {
			total_objects = (unsigned int)idx->nr_objects;
		} else {
			git_error_set(GIT_ERROR_INDEXER, "too many objects");
			return -1;
		}

		if (git_oidmap_new(&idx->pack->idx_cache) < 0)
			return -1;

		idx->pack->has_cache = 1;
		if (git_vector_init(&idx->objects, total_objects, objects_cmp) < 0)
			return -1;

		if (git_vector_init(&idx->deltas, total_objects / 2, NULL) < 0)
			return -1;

		stats->received_objects = 0;
		stats->local_objects = 0;
		stats->total_deltas = 0;
		stats->indexed_deltas = 0;
		stats->indexed_objects = 0;
		stats->total_objects = total_objects;

		if ((error = do_progress_callback(idx, stats)) != 0)
			return error;
	}

	/* Now that we have data in the pack, let's try to parse it */

	/* As the file grows any windows we try to use will be out of date */
	git_mwindow_free_all(mwf);

	while (stats->indexed_objects < idx->nr_objects) {
		if ((error = read_stream_object(idx, stats)) != 0) {
			/* GIT_EBUFS just means we ran out of input for now. */
			if (error == GIT_EBUFS)
				break;
			else
				goto on_error;
		}
	}

	return 0;

on_error:
	git_mwindow_free_all(mwf);
	return error;
}
833
/*
 * Replace the last path component of `path` with
 * "pack-<hex of idx->hash><suffix>".
 *
 * Returns 0 on success, -1 if the buffer cannot be grown or ends up in
 * an out-of-memory state.
 */
static int index_path(git_buf *path, git_indexer *idx, const char *suffix)
{
	const char prefix[] = "pack-";
	size_t dir_len;
	size_t needed;

	/* search backwards for '/' */
	for (dir_len = (size_t)path->size; dir_len > 0; dir_len--) {
		if (path->ptr[dir_len - 1] == '/')
			break;
	}

	needed = dir_len + 1 + strlen(prefix) + GIT_OID_HEXSZ + strlen(suffix) + 1;
	if (git_buf_grow(path, needed) < 0)
		return -1;

	git_buf_truncate(path, dir_len);
	git_buf_puts(path, prefix);

	/* The hex id is written straight into the buffer, so fix up the size by hand. */
	git_oid_fmt(path->ptr + git_buf_len(path), &idx->hash);
	path->size += GIT_OID_HEXSZ;

	git_buf_puts(path, suffix);

	if (git_buf_oom(path))
		return -1;

	return 0;
}
855
856 /**
857 * Rewind the packfile by the trailer, as we might need to fix the
858 * packfile by injecting objects at the tail and must overwrite it.
859 */
seek_back_trailer(git_indexer * idx)860 static void seek_back_trailer(git_indexer *idx)
861 {
862 idx->pack->mwf.size -= GIT_OID_RAWSZ;
863 git_mwindow_free_all(&idx->pack->mwf);
864 }
865
/*
 * Append the object `id`, read from the indexer's ODB, to the end of
 * the pack (overwriting the old trailer and writing a placeholder
 * trailer after it) and register it as an indexed object.
 *
 * Returns 0 on success and a negative value on failure.
 */
static int inject_object(git_indexer *idx, git_oid *id)
{
	git_odb_object *obj;
	struct entry *entry = NULL;
	struct git_pack_entry *pentry = NULL;
	git_oid foo = {{0}};
	unsigned char hdr[64];
	git_buf buf = GIT_BUF_INIT;
	off64_t entry_start;
	const void *data;
	size_t len, hdr_len;
	int error;

	seek_back_trailer(idx);
	entry_start = idx->pack->mwf.size;

	if (git_odb_read(&obj, idx->odb, id) < 0) {
		git_error_set(GIT_ERROR_INDEXER, "missing delta bases");
		return -1;
	}

	data = git_odb_object_data(obj);
	len = git_odb_object_size(obj);

	/* Allocation failures go through cleanup so that obj is released. */
	entry = git__calloc(1, sizeof(*entry));
	if (entry == NULL) {
		error = -1;
		goto cleanup;
	}

	entry->crc = crc32(0L, Z_NULL, 0);

	/* Write out the object header */
	hdr_len = git_packfile__object_header(hdr, len, git_odb_object_type(obj));
	if ((error = append_to_pack(idx, hdr, hdr_len)) < 0)
		goto cleanup;

	idx->pack->mwf.size += hdr_len;
	entry->crc = crc32(entry->crc, hdr, (uInt)hdr_len);

	if ((error = git_zstream_deflatebuf(&buf, data, len)) < 0)
		goto cleanup;

	/* And then the compressed object */
	if ((error = append_to_pack(idx, buf.ptr, buf.size)) < 0)
		goto cleanup;

	idx->pack->mwf.size += buf.size;
	entry->crc = htonl(crc32(entry->crc, (unsigned char *)buf.ptr, (uInt)buf.size));
	git_buf_dispose(&buf);

	/* Write a fake trailer so the pack functions play ball */

	if ((error = append_to_pack(idx, &foo, GIT_OID_RAWSZ)) < 0)
		goto cleanup;

	idx->pack->mwf.size += GIT_OID_RAWSZ;

	pentry = git__calloc(1, sizeof(struct git_pack_entry));
	if (pentry == NULL) {
		error = -1;
		goto cleanup;
	}

	git_oid_cpy(&pentry->sha1, id);
	git_oid_cpy(&entry->oid, id);
	idx->off = entry_start + hdr_len + len;

	error = save_entry(idx, entry, pentry, entry_start);

cleanup:
	if (error) {
		git__free(entry);
		git__free(pentry);
	}

	/* Releases the deflated data when we bailed out after compressing. */
	git_buf_dispose(&buf);
	git_odb_object_free(obj);
	return error;
}
939
/*
 * Try to make progress on a thin pack: find the first pending REF
 * delta, and if its base is not yet part of the pack, inject the base
 * from the ODB.
 *
 * Returns 0 if the base was already present or has been injected
 * (stats->local_objects is bumped in the latter case), and a negative
 * value on failure.
 */
static int fix_thin_pack(git_indexer *idx, git_indexer_progress *stats)
{
	int error, found_ref_delta = 0;
	unsigned int i;
	struct delta_info *delta;
	size_t size;
	git_object_t type;
	git_mwindow *w = NULL;
	off64_t curpos = 0;
	unsigned char *base_info;
	unsigned int left = 0;
	git_oid base;

	assert(git_vector_length(&idx->deltas) > 0);

	if (idx->odb == NULL) {
		git_error_set(GIT_ERROR_INDEXER, "cannot fix a thin pack without an ODB");
		return -1;
	}

	/* Loop until we find the first REF delta */
	git_vector_foreach(&idx->deltas, i, delta) {
		/* Resolved deltas leave NULL slots behind. */
		if (!delta)
			continue;

		curpos = delta->delta_off;
		error = git_packfile_unpack_header(&size, &type, &idx->pack->mwf, &w, &curpos);
		if (error < 0)
			return error;

		if (type == GIT_OBJECT_REF_DELTA) {
			found_ref_delta = 1;
			break;
		}
	}

	if (!found_ref_delta) {
		/* Do not leave the window used for the header scan marked in use. */
		git_mwindow_close(&w);
		git_error_set(GIT_ERROR_INDEXER, "no REF_DELTA found, cannot inject object");
		return -1;
	}

	/* curpos now points to the base information, which is an OID */
	base_info = git_mwindow_open(&idx->pack->mwf, &w, curpos, GIT_OID_RAWSZ, &left);
	if (base_info == NULL) {
		git_error_set(GIT_ERROR_INDEXER, "failed to map delta information");
		return -1;
	}

	git_oid_fromraw(&base, base_info);
	git_mwindow_close(&w);

	if (has_entry(idx, &base))
		return 0;

	if (inject_object(idx, &base) < 0)
		return -1;

	stats->local_objects++;

	return 0;
}
1001
/*
 * Resolve the pending deltas. Each pass tries to unpack every
 * remaining delta; deltas whose base is not available yet are retried
 * on the next pass. When a pass makes no progress, fix_thin_pack() is
 * asked to inject a missing base.
 *
 * Returns 0 on success, -1 on failure, or the negative result of the
 * progress callback.
 */
static int resolve_deltas(git_indexer *idx, git_indexer_progress *stats)
{
	unsigned int i;
	int error;
	struct delta_info *delta;
	int progressed = 0, non_null = 0, progress_cb_result;

	while (idx->deltas.length > 0) {
		progressed = 0;
		non_null = 0;
		git_vector_foreach(&idx->deltas, i, delta) {
			git_rawobj obj = {0};

			if (!delta)
				continue;

			non_null = 1;
			idx->off = delta->delta_off;
			if ((error = git_packfile_unpack(&obj, idx->pack, &idx->off)) < 0) {
				if (error == GIT_PASSTHROUGH) {
					/* We have not seen the base object, we'll try again later. */
					continue;
				}
				return -1;
			}

			if (idx->do_verify && check_object_connectivity(idx, &obj) < 0) {
				/* TODO: error? continue? */
				/* The unpacked data is ours; release it before moving on. */
				git__free(obj.data);
				continue;
			}

			/* On failure hash_and_save() has already freed obj.data. */
			if (hash_and_save(idx, &obj, delta->delta_off) < 0)
				continue;

			git__free(obj.data);
			stats->indexed_objects++;
			stats->indexed_deltas++;
			progressed = 1;
			if ((progress_cb_result = do_progress_callback(idx, stats)) < 0)
				return progress_cb_result;

			/* remove from the list */
			git_vector_set(NULL, &idx->deltas, i, NULL);
			git__free(delta);
		}

		/* if none were actually set, we're done */
		if (!non_null)
			break;

		if (!progressed && (fix_thin_pack(idx, stats) < 0)) {
			return -1;
		}
	}

	return 0;
}
1058
/*
 * Rewrite the pack header so its entry count includes the objects
 * injected locally, then recompute the trailer hash over the whole
 * file.
 *
 * Returns 0 on success, -1 if the header cannot be written or a window
 * on the file cannot be opened.
 */
static int update_header_and_rehash(git_indexer *idx, git_indexer_progress *stats)
{
	git_mwindow_file *mwf = &idx->pack->mwf;
	git_mwindow *window = NULL;
	size_t chunk_size = 1024*1024;
	off64_t done = 0;
	unsigned int avail;
	void *mapped;

	git_hash_init(&idx->trailer);

	/* Update the header to include the number of local objects we injected */
	idx->hdr.hdr_entries = htonl(stats->total_objects + stats->local_objects);
	if (write_at(idx, &idx->hdr, 0, sizeof(struct git_pack_header)) < 0)
		return -1;

	/*
	 * We now use the same technique as before to determine the
	 * hash. We keep reading up to the end and let
	 * hash_partially() keep the existing trailer out of the
	 * calculation.
	 */
	git_mwindow_free_all(mwf);
	idx->inbuf_len = 0;

	for (; done < mwf->size; done += avail) {
		mapped = git_mwindow_open(mwf, &window, done, chunk_size, &avail);
		if (mapped == NULL)
			return -1;

		hash_partially(idx, mapped, avail);
		git_mwindow_close(&window);
	}

	return 0;
}
1099
/**
 * Finalize the pack that has been fed to the indexer.
 *
 * Verifies the trailer stored at the end of the pack against the hash
 * computed while receiving it, resolves the outstanding deltas, writes
 * the pack index next to the pack and finally renames the pack itself
 * to its final name.
 *
 * @param idx the indexer to finalize; its header must already be parsed
 * @param stats progress counters; `total_deltas` is set here and the
 *        object counters are checked once the deltas are resolved
 * @return 0 on success, a negative value on failure
 */
int git_indexer_commit(git_indexer *idx, git_indexer_progress *stats)
{
	git_mwindow *w = NULL;
	unsigned int i, long_offsets = 0, left;
	int error;
	struct git_pack_idx_header hdr;
	git_buf filename = GIT_BUF_INIT;
	struct entry *entry;
	git_oid trailer_hash, file_hash;
	git_filebuf index_file = {0};
	void *packfile_trailer;

	if (!idx->parsed_header) {
		git_error_set(GIT_ERROR_INDEXER, "incomplete pack header");
		return -1;
	}

	/*
	 * Exactly one trailer hash must follow the last parsed object.
	 * Test for this before resolve_deltas(), as it plays with idx->off.
	 */
	if (idx->off + GIT_OID_RAWSZ < idx->pack->mwf.size) {
		git_error_set(GIT_ERROR_INDEXER, "unexpected data at the end of the pack");
		return -1;
	}
	if (idx->off + GIT_OID_RAWSZ > idx->pack->mwf.size) {
		git_error_set(GIT_ERROR_INDEXER, "missing trailer at the end of the pack");
		return -1;
	}

	packfile_trailer = git_mwindow_open(&idx->pack->mwf, &w, idx->pack->mwf.size - GIT_OID_RAWSZ, GIT_OID_RAWSZ, &left);
	if (packfile_trailer == NULL) {
		git_mwindow_close(&w);
		goto on_error;
	}

	/* Compare the packfile trailer as it was sent to us and what we calculated */
	git_oid_fromraw(&file_hash, packfile_trailer);
	git_mwindow_close(&w);

	git_hash_final(&trailer_hash, &idx->trailer);
	if (git_oid_cmp(&file_hash, &trailer_hash)) {
		git_error_set(GIT_ERROR_INDEXER, "packfile trailer mismatch");
		return -1;
	}

	/* Freeze the number of deltas */
	stats->total_deltas = stats->total_objects - stats->indexed_objects;

	if ((error = resolve_deltas(idx, stats)) < 0)
		return error;

	if (stats->indexed_objects != stats->total_objects) {
		git_error_set(GIT_ERROR_INDEXER, "early EOF");
		return -1;
	}

	/*
	 * Objects were added locally, so the trailer we received no longer
	 * matches the pack contents: recompute it and store the new value
	 * in the last GIT_OID_RAWSZ bytes of the pack.
	 */
	if (stats->local_objects > 0) {
		if (update_header_and_rehash(idx, stats) < 0)
			return -1;

		git_hash_final(&trailer_hash, &idx->trailer);

		/*
		 * A failed write would leave the old trailer in the pack while
		 * the index records the new one, so do not carry on.
		 * NOTE(review): assumes write_at() reports failure with a
		 * negative return value - confirm against its definition.
		 */
		if (write_at(idx, &trailer_hash, idx->pack->mwf.size - GIT_OID_RAWSZ, GIT_OID_RAWSZ) < 0)
			return -1;
	}

	/*
	 * Is the resulting graph fully connected or are we still
	 * missing some objects? In the second case, we can
	 * bail out due to an incomplete and thus corrupt
	 * packfile.
	 */
	if (git_oidmap_size(idx->expected_oids) > 0) {
		git_error_set(GIT_ERROR_INDEXER, "packfile is missing %"PRIuZ" objects",
			git_oidmap_size(idx->expected_oids));
		return -1;
	}

	git_vector_sort(&idx->objects);

	/* Use the trailer hash as the pack file name to ensure
	 * files with different contents have different names */
	git_oid_cpy(&idx->hash, &trailer_hash);

	/* Derive the index path from the pack path: swap "pack" for "idx" */
	git_buf_sets(&filename, idx->pack->pack_name);
	git_buf_shorten(&filename, strlen("pack"));
	git_buf_puts(&filename, "idx");
	if (git_buf_oom(&filename))
		return -1;

	if (git_filebuf_open(&index_file, filename.ptr,
		GIT_FILEBUF_HASH_CONTENTS |
		(idx->do_fsync ? GIT_FILEBUF_FSYNC : 0),
		idx->mode) < 0)
		goto on_error;

	/*
	 * NOTE(review): the results of the intermediate git_filebuf_write()
	 * calls below are not checked; presumably the filebuf remembers a
	 * failure and reports it from a later checked call - confirm.
	 */

	/* Write out the header */
	hdr.idx_signature = htonl(PACK_IDX_SIGNATURE);
	hdr.idx_version = htonl(2);
	git_filebuf_write(&index_file, &hdr, sizeof(hdr));

	/* Write out the fanout table */
	for (i = 0; i < 256; ++i) {
		uint32_t n = htonl(idx->fanout[i]);
		git_filebuf_write(&index_file, &n, sizeof(n));
	}

	/* Write out the object names (SHA-1 hashes) */
	git_vector_foreach(&idx->objects, i, entry) {
		git_filebuf_write(&index_file, &entry->oid, sizeof(git_oid));
	}

	/* Write out the CRC32 values */
	git_vector_foreach(&idx->objects, i, entry) {
		git_filebuf_write(&index_file, &entry->crc, sizeof(uint32_t));
	}

	/*
	 * Write out the offsets. An entry whose 32-bit offset is UINT32_MAX
	 * keeps its real position in offset_long; for those the high bit is
	 * set and the remaining bits index the long offset table below.
	 */
	git_vector_foreach(&idx->objects, i, entry) {
		uint32_t n;

		if (entry->offset == UINT32_MAX)
			n = htonl(0x80000000 | long_offsets++);
		else
			n = htonl(entry->offset);

		git_filebuf_write(&index_file, &n, sizeof(uint32_t));
	}

	/* Write out the long offsets, high word first */
	git_vector_foreach(&idx->objects, i, entry) {
		uint32_t split[2];

		if (entry->offset != UINT32_MAX)
			continue;

		split[0] = htonl(entry->offset_long >> 32);
		split[1] = htonl(entry->offset_long & 0xffffffff);

		git_filebuf_write(&index_file, &split, sizeof(uint32_t) * 2);
	}

	/* Write out the packfile trailer to the index */
	if (git_filebuf_write(&index_file, &trailer_hash, GIT_OID_RAWSZ) < 0)
		goto on_error;

	/* Write out the hash of the idx */
	if (git_filebuf_hash(&trailer_hash, &index_file) < 0)
		goto on_error;

	git_filebuf_write(&index_file, &trailer_hash, sizeof(git_oid));

	/* Figure out what the final name should be */
	if (index_path(&filename, idx, ".idx") < 0)
		goto on_error;

	/* Commit file */
	if (git_filebuf_commit_at(&index_file, filename.ptr) < 0)
		goto on_error;

	git_mwindow_free_all(&idx->pack->mwf);

	/*
	 * Truncate file to undo rounding up to next page_size in append_to_pack.
	 * Leave through on_error so that `filename` is released.
	 */
	if (p_ftruncate(idx->pack->mwf.fd, idx->pack->mwf.size) < 0) {
		git_error_set(GIT_ERROR_OS, "failed to truncate pack file '%s'", idx->pack->pack_name);
		goto on_error;
	}

	if (idx->do_fsync && p_fsync(idx->pack->mwf.fd) < 0) {
		git_error_set(GIT_ERROR_OS, "failed to fsync packfile");
		goto on_error;
	}

	/* We need to close the descriptor here so Windows doesn't choke on commit_at */
	if (p_close(idx->pack->mwf.fd) < 0) {
		git_error_set(GIT_ERROR_OS, "failed to close packfile");
		goto on_error;
	}

	idx->pack->mwf.fd = -1;

	if (index_path(&filename, idx, ".pack") < 0)
		goto on_error;

	/* And don't forget to rename the packfile to its new place. */
	if (p_rename(idx->pack->pack_name, git_buf_cstr(&filename)) < 0)
		goto on_error;

	/* And fsync the parent directory if we're asked to. */
	if (idx->do_fsync &&
		git_futils_fsync_parent(git_buf_cstr(&filename)) < 0)
		goto on_error;

	idx->pack_committed = 1;

	git_buf_dispose(&filename);
	return 0;

on_error:
	git_mwindow_free_all(&idx->pack->mwf);
	git_filebuf_cleanup(&index_file);
	git_buf_dispose(&filename);
	return -1;
}
1300
/**
 * Tear down an indexer and release what it holds: the pack stream, the
 * object and delta vectors, the pack's index cache, the pack itself, the
 * set of expected object ids, both hash contexts and the entry buffer.
 *
 * @param idx the indexer to free; NULL is accepted and ignored
 */
void git_indexer_free(git_indexer *idx)
{
	const git_oid *expected_key;
	git_oid *expected;
	size_t pos;

	if (!idx)
		return;

	/* The stream only needs disposing while one is active */
	if (idx->have_stream)
		git_packfile_stream_dispose(&idx->stream);

	git_vector_free_deep(&idx->objects);

	/* Free each cached entry first, then the map that held them */
	if (idx->pack->idx_cache) {
		struct git_pack_entry *cached;

		git_oidmap_foreach_value(idx->pack->idx_cache, cached, {
			git__free(cached);
		});

		git_oidmap_free(idx->pack->idx_cache);
	}

	git_vector_free_deep(&idx->deltas);

	/*
	 * The pack is closed and freed while holding the mwindow mutex;
	 * when the lock cannot be taken the pack is left untouched.
	 */
	if (git_mutex_lock(&git__mwindow_mutex) == 0) {
		/* NOTE(review): the `true` flag presumably discards the
		 * uncommitted pack file - confirm in git_packfile_close() */
		if (!idx->pack_committed)
			git_packfile_close(idx->pack, true);

		git_packfile_free(idx->pack);
		git_mutex_unlock(&git__mwindow_mutex);
	}

	/* Free every value stored in the expected-oids map */
	for (pos = 0;
	     git_oidmap_iterate((void **) &expected, idx->expected_oids, &pos, &expected_key) == 0;
	     )
		git__free(expected);

	git_hash_ctx_cleanup(&idx->trailer);
	git_hash_ctx_cleanup(&idx->hash_ctx);
	git_buf_dispose(&idx->entry_data);
	git_oidmap_free(idx->expected_oids);
	git__free(idx);
}
1344